Commit ce3886b2 authored by Alexander Lercher

Major code cleanup

Removed all old and unused source code, including code left over from earlier approaches
parent a58cc362
layers:
user:
properties:
starting-point:
properties:
- Latitude_StartingPoint
- Longitude_StartingPoint
\ No newline at end of file
...@@ -29,59 +29,7 @@ paths:
200:
description: "Successful echo of request data"
# Locations
# TODO remove
/locations:
post:
operationId: "routes.location.post"
tags:
- "Locations"
summary: "Add new location data"
parameters:
- in: body
name: "Location"
description: "The location data to be added"
required: true
schema:
$ref: "#/definitions/Location"
responses:
201:
description: "Successful operation"
400:
description: "Invalid input"
get:
operationId: "routes.location.get"
tags:
- "Locations"
summary: "Get location data"
parameters: []
responses:
200:
description: "Successful operation"
schema:
$ref: "#/definitions/LocationCollection"
/location-collections:
post:
operationId: "routes.location.post_many"
tags:
- "Locations"
summary: "Add new location data collection"
parameters:
- in: body
name: "Locations"
description: "The location data collection to be added"
required: true
schema:
$ref: "#/definitions/LocationCollection"
responses:
201:
description: "Successful operation"
400:
description: "Invalid input"
#region Layers
/layers:
post:
operationId: "routes.layers.post"
...@@ -176,7 +124,7 @@ paths:
/layers/{name}/clusters:
get:
operationId: "routes.clustersets.get_by_name2"
operationId: "routes.clustersets.get_by_name"
tags:
- "Layers"
summary: "Get all clusters for the layer"
...@@ -196,7 +144,7 @@ paths:
/layers/{name}/timeslices:
get:
operationId: "routes.timeslices.get_by_name2"
operationId: "routes.timeslices.get_by_name"
tags:
- "Layers"
summary: "Get all timeslices for the layer"
...@@ -213,162 +161,10 @@ paths:
$ref: "#/definitions/TimeSliceCollection"
404:
description: "Layer not found"
#endregion
# Clusters
# TODO remove partially
/location-clusters:
get:
operationId: "routes.cluster.get_locations"
tags:
- "Clusters"
summary: "Get user communities clustered by location"
parameters: []
responses:
200:
description: "Successful operation"
schema:
$ref: "#/definitions/LocationClusterCollection"
# /clusters/cluster.png:
# get:
# operationId: "routes.cluster.get_image"
# tags:
# - "Clusters"
# summary: "Get user communities per date per hour as image"
# parameters: []
# produces:
# - "image/png"
# responses:
# 200:
# description: "Successful operation"
/time-clusters:
get:
operationId: "routes.cluster.get_times"
tags:
- "Clusters"
summary: "Get user communities clustered by time per hour"
parameters: []
responses:
200:
description: "Successful operation"
schema:
$ref: "#/definitions/TimeClusterCollection"
# /agi/clusters/cluster.png:
# get:
# operationId: "routes.agi_cluster.get_image"
# tags:
# - "Clusters"
# summary: "Get user communities per date per hour from agi data as image"
# parameters: []
# produces:
# - "image/png"
# responses:
# 200:
# description: "Successful operation"
# TODO remove
/clustersets:
get:
operationId: "routes.clustersets.get"
tags:
- "Clusters"
summary: "Get clustersets for all layers"
parameters: []
responses:
200:
description: "Successful operation"
schema:
$ref: "#/definitions/ClusterSetCollection"
/clustersets/names:
get:
operationId: "routes.clustersets.get_names"
tags:
- "Clusters"
summary: "Get clusterset names for all layers"
parameters: []
responses:
200:
description: "Successful operation"
schema:
type: array
items:
type: string
/clustersets/{layername}:
get:
operationId: "routes.clustersets.get_by_name"
tags:
- "Clusters"
summary: "Get clusterset for layer-name"
parameters:
- name: "layername"
in: "path"
description: "Name of the layer to return the clusterset for"
required: true
type: "string"
responses:
200:
description: "Successful operation"
schema:
$ref: "#/definitions/ClusterSet"
404:
description: "Clusterset not found"
# TODO remove
/user-cluster-graphs:
get:
operationId: "routes.user_cluster.get"
tags:
- "User Graphs"
summary: "Get user graphs per layer per cluster"
parameters: []
responses:
200:
description: "Successful operation"
schema:
$ref: "#/definitions/UserClusterGraphCollection"
# Time slices
#endregion
/timeslices:
get:
operationId: "routes.timeslices.get"
tags:
- "Time Slices"
summary: "Get all time slices based on individual layers containing clusters with nodes for that time"
parameters: []
responses:
200:
description: "Successful operation"
schema:
$ref: "#/definitions/TimeSliceCollection"
/timeslices/{layername}:
get:
operationId: "routes.timeslices.get_by_name"
tags:
- "Time Slices"
summary: "Get all time slices for one layer"
parameters:
- name: "layername"
in: "path"
description: "Name of the layer to return the time slices for"
required: true
type: "string"
responses:
200:
description: "Successful operation"
schema:
$ref: "#/definitions/TimeSliceCollection"
404:
description: "No time slices found for layername"
# Function Calls
#region Function Calls
/rfc/run:
post:
operationId: "routes.functions.run_agi_clustering_and_graph_creation"
...@@ -379,30 +175,11 @@ paths:
responses:
204:
description: "Successful operation"
#endregion
definitions:
Location:
type: "object"
properties:
id:
type: string
user:
type: "string"
latitude:
type: "number"
format: float
longitude:
type: "number"
format: float
timestamp:
type: "number"
LocationCollection:
type: array
items:
$ref: "#/definitions/Location"
Cluster:
type: object
properties:
...@@ -414,83 +191,17 @@ definitions:
type: array
items:
$ref: "#/definitions/Node"
ClusterCollection:
type: array
items:
$ref: "#/definitions/Cluster"
LocationCluster:
type: object
properties:
id:
type: string
cluster_label:
type: number
nodes:
type: array
items:
$ref: "#/definitions/Location"
LocationClusterCollection:
type: array
items:
$ref: "#/definitions/LocationCluster"
TimeCluster:
type: object
properties:
id:
type: string
date:
type: string
hour:
type: number
cluster_label:
type: number
nodes:
type: array
items:
$ref: "#/definitions/Location"
TimeClusterCollection:
type: array
items:
$ref: "#/definitions/TimeCluster"
UserClusterGraph:
type: object
properties:
nodes:
type: array
items:
type: string
edges:
type: array
items:
type: array
items:
type: string
example:
- user1
- user2
- weight
UserClusterGraphCollection:
type: array
items:
$ref: "#/definitions/UserClusterGraph"
Layer-UpperCase:
type: object
properties:
LayerName:
type: string
# Nodes:
# type: array
# items:
# type: object
Properties:
type: array
items:
...@@ -501,10 +212,6 @@ definitions:
properties:
layer_name:
type: string
# nodes:
# type: array
# items:
# type: object
properties:
type: array
items:
...@@ -531,21 +238,6 @@ definitions:
items:
$ref: "#/definitions/Node"
ClusterSet:
type: object
properties:
layer_name:
type: string
clusters:
type: array
items:
$ref: "#/definitions/Cluster"
ClusterSetCollection:
type: array
items:
$ref: "#/definitions/ClusterSet"
TimeSlice:
type: object
properties:
...
import sys
import os
modules_paths = ['../../../modules/']
for modules_path in modules_paths:
if os.path.exists(modules_path):
sys.path.insert(1, modules_path)
from typing import List, Tuple, Any
from networkx import Graph
from db.entities import LocationCluster, UserClusterGraph, Cluster
from db.repository import Repository
from processing.user_graph_generator import UserGraphGenerator
repo = Repository()
def get_edges_with_weights(g: Graph) -> List[Tuple[Any, Any, int]]:
res = []
for e in g.edges:
res.append((*e, g.edges[e]['weight']))
return res
def create_graphs_for_location_clusters():
graphs_for_clusters = []
ug = UserGraphGenerator()
clusters: List[LocationCluster] = repo.get_location_clusters()
for cluster in clusters:
user_ids = [n['user'] for n in cluster.nodes]
graph: Graph = ug.create_graph_from_nodes(user_ids)
vertices = list(graph.nodes)
edges = get_edges_with_weights(graph)
cluster_graph = UserClusterGraph(vertices, edges)
graphs_for_clusters.append(cluster_graph)
store_graphs(graphs_for_clusters)
def store_graphs(graphs: List):
for g in graphs:
repo.add_user_cluster_graph(g)
if __name__ == "__main__":
create_graphs_for_location_clusters()
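
A quick sketch of what get_edges_with_weights returns (standalone illustration with made-up users, not part of the script above):

# Build a tiny weighted graph and read its edges back as (node, node, weight) tuples.
g = Graph()
g.add_edge('user1', 'user2', weight=3)
g.add_edge('user2', 'user3', weight=1)
print(get_edges_with_weights(g))
# [('user1', 'user2', 3), ('user2', 'user3', 1)]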
import json
from typing import List, Dict
import hashlib
class AgiRepository:
def getLocations(self) -> List[Dict]:
locations = []
travels = self.readDataFromFile()
# only take started travels
travels = [t for t in travels if t['status'] >= 2]
for travel in travels:
num_complete_travels = min(len(travel['startedBy']), len(travel['users']))
for i in range(num_complete_travels):
cur_location = travel['startedBy'][i]
cur_user = travel['users'][i]
locations.append(
self.location(f'{travel["id"]}-{cur_location["moment"]}',
cur_location['coordinate']['latitude'],
cur_location['coordinate']['longitude'],
cur_location['moment'],
# todo user in travel startedBy not available from dataset - currently using user list
hashlib.sha1(cur_user['userId'].encode()).hexdigest() # not showing generated username
))
return locations
def getLocationsBasedOnNewDataSchema(self):
'''Creates the new data generic schema to be used beginning on 24.03.2020'''
data = {
'layer_name': 'Destination',
'nodes': self.getLocations(),
'properties': ['latitude', 'longitude']
}
return data
def getTimesBasedOnNewDataSchema(self):
'''Creates the new data generic schema to be used beginning on 24.03.2020'''
data = {
'layer_name': 'Starting_Time',
'nodes': self.getLocations(),
'properties': ['timestamp']
}
return data
def readDataFromFile(self) -> List[Dict]:
with open('./db/agi/travels.json', 'r') as f_travels:
travels = json.loads(f_travels.read())
return travels
def location(self, id_, lat, long_, timestamp, username) -> dict:
return {
"id": id_,
'latitude': lat,
'longitude': long_,
"timestamp": timestamp,
"user": username
}
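
Illustrative use of the repository above; it requires ./db/agi/travels.json to exist, and the values in the comment are made up — only the shape follows from the code:

repo = AgiRepository()
data = repo.getLocationsBasedOnNewDataSchema()
# data looks like:
# {'layer_name': 'Destination',
#  'nodes': [{'id': '<travelId>-<moment>', 'latitude': 48.30, 'longitude': 14.28,
#             'timestamp': 1584976800, 'user': '<sha1 hash of userId>'}, ...],
#  'properties': ['latitude', 'longitude']}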
from db.entities.location import Location
from db.entities.popular_location import PopularLocation
from db.entities.cluster import Cluster, LocationCluster, TimeCluster
from db.entities.cluster import Cluster
from db.entities.clusterset import ClusterSet
from db.entities.user_cluster_graph import UserClusterGraph
from db.entities.layer import Layer
...
...@@ -39,67 +39,3 @@ class Cluster:
def __str__(self):
return f"Cluster({self.__repr__()})"
class LocationCluster(Cluster):
def __init__(self, cluster_label: int = None, nodes: List = None,
location_dict: Dict = None, from_db=False):
super().__init__(cluster_label, nodes)
self.id = f'{self.cluster_label}'
if location_dict is not None:
self.from_serializable_dict(location_dict, from_db)
def to_serializable_dict(self, for_db=False) -> Dict:
return {
"id": self.id,
"cluster_label": self.cluster_label,
"nodes": json.dumps(self.nodes) if for_db else self.nodes
}
def from_serializable_dict(self, location_dict: Dict, from_db=False):
self.id = location_dict["id"]
self.cluster_label = location_dict["cluster_label"]
self.nodes = json.loads(location_dict["nodes"]) \
if from_db else location_dict["nodes"]
def __repr__(self):
return json.dumps(self.to_serializable_dict())
def __str__(self):
return f"LocationCluster({self.__repr__()})"
class TimeCluster(Cluster):
def __init__(self, date: date = None, hour: int = None, cluster_label: int = None, nodes: List = None,
time_dict: Dict = None, from_db=False):
super().__init__(cluster_label, nodes)
self.date = date
self.hour = hour
self.id = f'{self.date}-{self.hour}-{self.cluster_label}'
if time_dict is not None:
self.from_serializable_dict(time_dict, from_db)
def to_serializable_dict(self, for_db=False) -> Dict:
return {
"id": self.id,
"date": str(self.date),
"hour": self.hour,
"cluster_label": self.cluster_label,
"nodes": json.dumps(self.nodes) if for_db else self.nodes
}
def from_serializable_dict(self, time_dict: Dict, from_db=False):
self.id = time_dict["id"]
self.date = datetime.strptime(time_dict["date"], '%Y-%m-%d').date()
self.hour = time_dict["hour"]
self.cluster_label = time_dict["cluster_label"]
self.nodes = json.loads(time_dict["nodes"]) \
if from_db else time_dict["nodes"]
def __repr__(self):
return json.dumps(self.to_serializable_dict(True))
def __str__(self):
return f"TimeCluster({self.__repr__()})"
...@@ -3,8 +3,6 @@ import network_constants as netconst
from database.MongoRepositoryBase import MongoRepositoryBase
import json
from db.agi.agi_repository import AgiRepository
from db.entities import *
from typing import List
...@@ -13,61 +11,14 @@ class Repository(MongoRepositoryBase):
'''This is a repository for MongoDb.'''
def __init__(self):
super().__init__(netconst.COMMUNITY_DETECTION_DB_HOSTNAME,
netconst.COMMUNITY_DETECTION_DB_PORT,
'communityDetectionDb')
super().__init__(netconst.ROLESTAGE_DISCOVERY_DB_HOSTNAME,
netconst.ROLESTAGE_DISCOVERY_DB_PORT,
'roleStageDb')
self._location_collection = 'location'
self._location_cluster_collection = 'location_cluster'
self._time_cluster_collection = 'time_cluster'
self._user_cluster_graph_collection = 'user_cluster_graph'
self._layer_collection = 'layer-new'
self._layer_nodes_collection = 'layer_nodes-new'
self._clusterset_collection = 'cluster_set-new'
self._time_slice_collection = 'time_slice-new'
self._layer_collection = 'layers'
self._layer_nodes_collection = 'layer_nodes'
self._clusters_collection = 'clusters'
self._time_slice_collection = 'time_slices'
self.agi_repo = AgiRepository()
#region Location
def add_location(self, location: Location):
super().insert_entry(self._location_collection, location.to_serializable_dict())
def get_locations(self) -> List[Location]:
locations = super().get_entries(self._location_collection)
return [Location(l) for l in locations]
def get_agi_locations(self) -> List[Location]:
agi_locations = self.agi_repo.getLocations()
return [Location(agi_loc) for agi_loc in agi_locations]
#endregion
#region Specific Clusters
def add_location_cluster(self, cluster: LocationCluster):
super().insert_entry(self._location_cluster_collection,
cluster.to_serializable_dict(for_db=True))
def get_location_clusters(self) -> List[LocationCluster]:
clusters = super().get_entries(self._location_cluster_collection)
return [LocationCluster(location_dict=c, from_db=True) for c in clusters]
def add_time_cluster(self, cluster: TimeCluster):
super().insert_entry(self._time_cluster_collection,
cluster.to_serializable_dict(for_db=True))
def get_time_clusters(self) -> List[TimeCluster]:
clusters = super().get_entries(self._time_cluster_collection)
return [TimeCluster(time_dict=c, from_db=True) for c in clusters]
#endregion
#region Cluster Graph
def add_user_cluster_graph(self, user_graph: UserClusterGraph):
super().insert_entry(self._user_cluster_graph_collection,
user_graph.to_serializable_dict(for_db=True))
def get_user_cluster_graphs(self) -> List[UserClusterGraph]:
user_graphs = super().get_entries(self._user_cluster_graph_collection)
return [UserClusterGraph(dict_=u, from_db=True) for u in user_graphs]
#endregion
#region Layers
def add_layer(self, layer: Layer):
...@@ -77,10 +28,6 @@ class Repository(MongoRepositoryBase):
entries = super().get_entries(self._layer_collection)
return [Layer(e) for e in entries]
def get_layer_names(self) -> List[str]:
entries = super().get_entries(self._layer_collection, projection={'layer_name': 1})
return [e['layer_name'] for e in entries]
def get_layer(self, layer_name) -> Layer:
entries = super().get_entries(self._layer_collection, selection={'layer_name': layer_name})
entries = [Layer(e) for e in entries]
...@@ -103,38 +50,13 @@ class Repository(MongoRepositoryBase):
#endregion
#region ClusterSet
#region Clusters
# TODO cleanup
def add_clusterset(self, cluster_set: ClusterSet):
super().insert_entry(self._clusterset_collection, cluster_set.to_serializable_dict())
def get_clustersets(self) -> List[ClusterSet]:
'''Returns all clustersets.'''
entries = super().get_entries(self._clusterset_collection)
return [ClusterSet(cluster_set_dict=e) for e in entries]
def get_clusterset_names(self) -> List[str]:
'''Returns the names of all clustersets.'''
entries = super().get_entries(self._clusterset_collection, projection={'layer_name': 1})
return [e['layer_name'] for e in entries]
def get_clusterset(self, layer_name) -> ClusterSet:
'''Returns a single clusterset with the given name or None otherwise.'''
entries = super().get_entries(self._clusterset_collection, selection={'layer_name': layer_name})
entries = [ClusterSet(cluster_set_dict=e) for e in entries]
if entries is not None and len(entries) > 0:
return entries[0]
else:
return None
def add_clusters(self, clusters: List[Cluster]):
cluster_dicts = [c.to_serializable_dict(for_db=True) for c in clusters]
super().insert_many(self._clusterset_collection, cluster_dicts)
super().insert_many(self._clusters_collection, cluster_dicts)
def get_clusters_for_layer(self, layer_name: str) -> List[Cluster]:
entries = super().get_entries(self._clusterset_collection, selection={'layer_name': layer_name}, projection={'_id': 0})
entries = super().get_entries(self._clusters_collection, selection={'layer_name': layer_name}, projection={'_id': 0})
return [Cluster(cluster_dict=e, from_db=True) for e in entries]
#endregion
...@@ -155,4 +77,5 @@ class Repository(MongoRepositoryBase):
def remove_all_time_slices(self):
super().drop_collection(self._time_slice_collection)
#endregion
\ No newline at end of file
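
A usage sketch for the renamed collections; it assumes a reachable MongoDB configured via network_constants, and the layer name is illustrative:

from db.repository import Repository

repo = Repository()
# add_clusters writes to the 'clusters' collection; get_clusters_for_layer reads it back.
clusters = repo.get_clusters_for_layer('Destination')
print(len(clusters))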
import sys
import os
modules_path = '../../../modules/'
if os.path.exists(modules_path):
sys.path.insert(1, modules_path)
from db.repository import Repository
def insert_locations():
repo = Repository()
locs = repo.get_agi_locations()
for l in locs:
repo.add_location(l)
if __name__ == "__main__":
insert_locations()
...@@ -3,7 +3,6 @@ import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import OPTICS
from typing import List, Dict, Any, TypeVar
from deprecated import deprecated
T = TypeVar('T')
ClusterGroup = Dict[Any, List[Dict]]
...@@ -16,63 +15,20 @@ class Clusterer:
:param epsilon: Eps used in OPTICS
:param min_points: MinPts used in OPTICS
'''
def __init__(self, epsilon=11, min_points=5):
def __init__(self, min_points=5):
self.epsilon = epsilon
self.min_points = min_points
def draw_locations(self, locations:List, labels:List=None) -> plt.Figure:
if locations is None or len(locations) == 0:
return self._draw_locations()
if labels is None or len(locations) != len(labels):
labels = self.create_labels(locations)
return self._draw_locations(
locations = self.extract_location_data(locations),
partition_info = labels
)
def _draw_locations(self, locations:np.ndarray=None, centroids:np.ndarray=None, partition_info:List=None) -> plt.Figure:
fig = plt.Figure()
axis = fig.add_subplot(1, 1, 1)
if locations is not None:
colors = plt.cm.rainbow(np.linspace(0, 1, len(locations)))
if partition_info is not None:
distinct_colors = plt.cm.rainbow(np.linspace(0, 1, len(set(partition_info))))
colors = [distinct_colors[pi] for pi in partition_info]
# draw locations with random colors
axis.scatter(locations[:,0],
locations[:,1],
c=colors)
if centroids is not None:
# draw black centroids
axis.scatter(centroids[:,0], centroids[:,1], c='k', marker='x', s=80)
return fig
def create_labels(self, features:np.ndarray) -> List[int]:
'''Creates labels for the items based on DBSCAN.'''
'''Creates labels for the items based on OPTICS.'''
if features is None or len(features) == 0:
return features # trash in trash out
dbsc = OPTICS(min_samples=self.min_points) # DBSCAN(eps = self.epsilon, min_samples = self.min_points)
dbsc = OPTICS(min_samples=self.min_points)
dbsc = dbsc.fit(features)
labels = dbsc.labels_
return labels.tolist()
@deprecated(reason="Use generic version instead")
def extract_location_features(self, locations: List[dict]) -> np.ndarray:
return np.asarray([(float(l['latitude']), float(l['longitude'])) for l in locations])
@deprecated(reason="Use generic version instead")
def extract_time_features(self, times: List[Dict]) -> np.ndarray:
return np.asarray([[float(t['timestamp'])] for t in times])
def _extract_features(self, dataset: List[Dict], features:List[str]) -> np.ndarray:
'''Extracts the feature values from the dataset into a np array with same order as original dataset.'''
extracted_features = []
...@@ -104,30 +60,6 @@ class Clusterer:
return clusters
@deprecated(reason="Use generic version instead")
def cluster_locations(self, locations:List[Dict]) -> ClusterGroup:
'''Returns a dictionary with identified clusters and their locations copied from the input'''
if locations is None or len(locations) == 0:
# raise Exception("locations has to contain something")
return {}
features = self.extract_location_features(locations)
labels = self.create_labels(features)
self.label_dataset(locations, labels)
return self.group_by_clusters(locations, labels)
@deprecated(reason="Use generic version instead")
def cluster_times(self, times:List[Dict]) -> ClusterGroup:
'''Returns a dictionary with identified clusters and their times copied from the input'''
features = self.extract_time_features(times)
labels = self.create_labels(features)
self.label_dataset(times, labels)
return self.group_by_clusters(times, labels)
def cluster_dataset(self, dataset:List[Dict], features:List[str]) -> ClusterGroup:
'''
Returns the identified clusters containing a subset of nodes from the dataset.
...
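
A usage sketch for the generic clustering API that replaces the deprecated per-type methods; the input data is made up:

clusterer = Clusterer(min_points=2)
dataset = [
    {'user': 'u1', 'timestamp': 100},
    {'user': 'u2', 'timestamp': 101},
    {'user': 'u3', 'timestamp': 5000},
]
# Groups nodes by their OPTICS label; noise points, if any, end up under label -1.
groups = clusterer.cluster_dataset(dataset, features=['timestamp'])
print(groups)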
import yaml
from typing import Generator
from pathlib import Path
### init logging ###
import logging
LOG_FORMAT = (
'%(levelname) -5s %(asctime)s %(name)s:%(funcName) -35s %(lineno) -5d: %(message)s')
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
LOGGER = logging.getLogger(__name__)
PROJECT_ROOT = Path(__file__).parent.parent.parent
class ClusteringConfig:
'''Contains the configuration for the clustering algorithm defined in configs/clustering.yaml.'''
config_path = f'{PROJECT_ROOT}/configs/clustering.yaml'
config: dict = None
def __init__(self):
self.config = self._load_config()
def _load_config(self) -> dict:
'''Loads the whole configuration from file.'''
config = None
with open(self.config_path, 'r') as stream:
try:
config = yaml.safe_load(stream)
except yaml.YAMLError as exc:
LOGGER.error(exc)
config = {}
return config
def get_config(self):
return self.config
def get_layer_configs(self) -> Generator[dict, None, None]:
"""
Returns a generator for the individual layer configs.
Layer configs are dicts including a layer-name.
"""
for key, layer in self.config['layers'].items():
layer['layer-name'] = key
yield layer
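
A usage sketch for ClusteringConfig, assuming the configs/clustering.yaml shown at the top of this commit:

config = ClusteringConfig()
for layer_config in config.get_layer_configs():
    # for the yaml above this prints: user
    print(layer_config['layer-name'])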
import itertools
from typing import List, Dict, Tuple, Any
from networkx import Graph
class UserGraphGenerator:
def __init__(self):
pass
def count_edges(self, nodes: List) -> Dict[Tuple, int]:
edge_counts = {}
coms = itertools.combinations(nodes, 2)
for first, second in coms:
if first == second: # don't process reflexive connections
continue
if (first, second) in edge_counts:
edge_counts[first, second] += 1
else:
edge_counts[first, second] = 1
return edge_counts
def create_edges_with_weights(self, edge_counts: Dict[Tuple[Any, Any], int]) -> List[Tuple[Any, Any, Dict]]:
edges = []
for (key1, key2), value in edge_counts.items():
edge = (key1, key2, {'weight': value})
edges.append(edge)
return edges
def create_fully_connected_edges_for_nodes(self, nodes: List) -> List[Tuple[Any, Any, Dict]]:
return self.create_edges_with_weights(self.count_edges(nodes))
def create_graph_from_nodes(self, nodes: List) -> Graph:
'''Creates a networkx.Graph with distinct nodes and weighted edges between these nodes'''
g = Graph()
g.add_nodes_from(nodes)
g.add_edges_from(self.create_fully_connected_edges_for_nodes(nodes))
return g
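
A small example of how duplicate users in a node list become edge weights (expected output shown in comments):

ug = UserGraphGenerator()
g = ug.create_graph_from_nodes(['u1', 'u1', 'u2'])
print(list(g.nodes))         # ['u1', 'u2'] - nodes are deduplicated
print(g.edges['u1', 'u2'])   # {'weight': 2} - the pair ('u1', 'u2') occurred twice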
import io
from flask import request, Response
from db.repository import Repository
from processing.clustering.clusterer import Clusterer
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
repo = Repository()
clusterer = Clusterer()
def get_locations():
clusters = repo.get_location_clusters()
return [c.to_serializable_dict() for c in clusters]
def get_times():
clusters = repo.get_time_clusters()
return [c.to_serializable_dict() for c in clusters]
def get_image_1():
return Response(status=501)
# todo
locations = repo.getLocations()
fig = clusterer.draw_locations(locations)
output = io.BytesIO()
FigureCanvas(fig).print_png(output)
return Response(output.getvalue(), mimetype="image/png")
def get_image_2():
return Response(status=501)
# todo
locations = repo.getLocations()
fig = clusterer.draw_locations(locations)
output = io.BytesIO()
FigureCanvas(fig).print_png(output)
return Response(output.getvalue(), mimetype="image/png")
\ No newline at end of file
...@@ -4,23 +4,9 @@ from db.entities import ClusterSet
repo = Repository()
def get():
return [c.to_serializable_dict() for c in repo.get_clustersets()]
def get_names():
return repo.get_clusterset_names()
def get_by_name2(name):
def get_by_name(name):
res = repo.get_clusters_for_layer(name)
if res is None or len(res) == 0:
return Response(status=404)
else:
return [c.to_serializable_dict() for c in res]
def get_by_name(name):
res = repo.get_clusterset(name)
if res is not None:
return res.to_serializable_dict()
else:
return Response(status=404)
\ No newline at end of file
import insert_agi_locations
import run_clustering
import create_user_graphs
def run_agi_clustering_and_graph_creation():
insert_agi_locations.insert_locations()
run_clustering.run_location_clustering()
run_clustering.run_time_clustering()
create_user_graphs.create_graphs_for_location_clusters()
pass
...@@ -15,7 +15,6 @@ def post():
def _insert_layer(layer_data: dict):
'''Converts object keys from external source and inserts into database.'''
layer_data['layer_name'] = layer_data.pop('LayerName')
# layer_data['nodes'] = layer_data.pop('Nodes')
layer_data['properties'] = layer_data.pop('Properties')
repo.add_layer(Layer(layer_data))
...
from flask import request, Response
from db.repository import Repository
from db.entities import Location
repo = Repository()
def post():
body = request.json
_insert_location(body)
return Response(status=201)
def post_many():
body = request.json
for location in body:
_insert_location(location)
return Response(status=201)
def get():
return [l.to_serializable_dict() for l in repo.get_locations()]
def _insert_location(location_data: dict):
repo.add_location(Location(location_data))
...@@ -4,23 +4,8 @@ from db.entities import TimeSlice
repo = Repository()
def get():
return [e.to_serializable_dict() for e in repo.get_time_slices()]
def get_by_name(layername):
res = repo.get_time_slices_by_name(layername)
# print(len(res))
if res is not None and len(res) != 0:
return [e.to_serializable_dict() for e in res]
else:
return Response(status=404)
def get_by_name2(name):
def get_by_name(name):
res = repo.get_time_slices_by_name(name)
# print(len(res))
if res is not None and len(res) != 0:
return [e.to_serializable_dict() for e in res]
...
from flask import request, Response
from db.repository import Repository
repo = Repository()
def get():
data = repo.get_user_cluster_graphs()
return [d.to_serializable_dict() for d in data]
...@@ -5,16 +5,13 @@ if os.path.exists(modules_path):
sys.path.insert(1, modules_path)
import json
from db.entities import *
from db.entities import Layer, Cluster
from typing import List, Dict, Tuple
from db.repository import Repository, AgiRepository
from db.repository import Repository
from processing.clustering.clusterer import Clusterer
DEBUG = False
repo = Repository()
test_repo = AgiRepository()
def run_generic_clustering():
...@@ -29,7 +26,6 @@ def run_generic_clustering():
continue
clusters = run_clustering_for_layer(layer)
# cluster_set = ClusterSet(layer.layer_name, clusters)
store_generic_clusters(clusters)
...@@ -44,74 +40,10 @@ def run_clustering_for_layer(layer: Layer) -> List[Cluster]:
return [Cluster(layer.layer_name, key, value) for key, value in res.items()]
def store_generic_clusters(clusters: List[Cluster]):
repo.add_clusters(clusters)
# with open(f'clusterset_{cluster_set.layer_name}.txt', 'w') as file:
# file.write(json.dumps(cluster_set.to_serializable_dict()))
def run_location_clustering():
user_clusterer = Clusterer()
all_location_traces = repo.get_locations()
cluster_result = user_clusterer.cluster_locations(
[l.to_serializable_dict() for l in all_location_traces])
clusters = [LocationCluster(key, value)
for key, value in cluster_result.items()]
store_clusters('locations', clusters)
def run_time_clustering():
clusters: List[TimeCluster] = []
user_clusterer = Clusterer(epsilon=600) # clustered within 10 minutes
all_location_traces = repo.get_locations()
# for each date in timestamp list
dates = {trace.timestamp.date() for trace in all_location_traces}
for cur_date in dates:
traces_for_cur_date = [
trace for trace in all_location_traces if trace.timestamp.date() == cur_date]
# for each hour of that day
for cur_hour in list(range(24)):
traces_for_time_slice = [
trace for trace in traces_for_cur_date if trace.timestamp.hour == cur_hour]
if len(traces_for_time_slice) == 0:
continue
# clustering per hour
cluster_result = user_clusterer.cluster_times(
[t.to_serializable_dict() for t in traces_for_time_slice])
cur_clusters = [TimeCluster(cur_date, cur_hour, key, value)
for key, value in cluster_result.items()]
clusters.extend(cur_clusters)
store_clusters('times', clusters)
def store_clusters(type: str, clusters: List):
if DEBUG:
print(clusters)
return
if type == 'locations':
for c in clusters:
repo.add_location_cluster(c)
if type == 'times':
for c in clusters:
repo.add_time_cluster(c)
if __name__ == "__main__": if __name__ == "__main__":
run_generic_clustering() run_generic_clustering()
# TODO cleanup
# run_location_clustering()
# run_time_clustering()
import sys
import os
for path in ['../', './', '../../../modules/']:
if os.path.exists(path):
sys.path.insert(1, path)
import matplotlib.pyplot as plt
from db.repository import Repository
from db.entities import TimeSlice
from typing import List
def plt_show_circles(time_slices: List[TimeSlice], cluster_no):
cluster_no = str(cluster_no)
for slice_ in time_slices:
nodes = slice_.get_nodes_for_cluster(cluster_no)
# print(f"{slice_.time} number elements for cluster {cluster_no}: {len(nodes)}")
plt.title(str(slice_.time))
plt.scatter([n['Longitude_Destination'] if 'Longitude_Destination' in n else 0
for n in nodes],
[n['Latitude_Destination'] if 'Latitude_Destination' in n else 0
for n in nodes],
s=[len(nodes)*100]*len(nodes))
plt.pause(0.5)
def plt_show_bars(time_slices: List[TimeSlice], cluster_no):
cluster_no = str(cluster_no)
labels = [ts.time for ts in time_slices]
x_axis_label_stepsize = 10
nodes_per_slice_for_single_cluster = \
[len(time_slice.get_nodes_for_cluster(cluster_no))
for time_slice
in time_slices]
fig, ax = plt.subplots()
ax.bar(x=range(len(labels)),
height=nodes_per_slice_for_single_cluster)
ax.set_ylabel('Size')
ax.set_title(f'Cluster-{cluster_no} size over time')
ax.set_xticks(range(len(labels))[::x_axis_label_stepsize])
ax.set_xticklabels(labels[::x_axis_label_stepsize])
plt.show()
if __name__ == "__main__":
repo = Repository()
time_slices = repo.get_time_slices_by_name("Destination_Layer")
# chronological order
time_slices.sort(key=lambda ts: eval(ts.time))
print(len(time_slices))
plt_show_bars(time_slices, cluster_no = 0)
\ No newline at end of file
...@@ -16,8 +16,8 @@ SEMANTIC_LINKING_REST_PORT = 80
SEMANTIC_LINKING_DB_HOSTNAME = f'{SEMANTIC_LINKING_HOSTNAME}-db'
SEMANTIC_LINKING_DB_PORT = 27017
## Community Detection
COMMUNITY_DETECTION_HOSTNAME = 'role-stage-discovery'
COMMUNITY_DETECTION_REST_PORT = 80
COMMUNITY_DETECTION_DB_HOSTNAME = f'{COMMUNITY_DETECTION_HOSTNAME}-db'
COMMUNITY_DETECTION_DB_PORT = 27017
## Role Stage Discovery
ROLESTAGE_DISCOVERY_HOSTNAME = 'role-stage-discovery'
ROLESTAGE_DISCOVERY_REST_PORT = 80
ROLESTAGE_DISCOVERY_DB_HOSTNAME = f'{ROLESTAGE_DISCOVERY_HOSTNAME}-db'
ROLESTAGE_DISCOVERY_DB_PORT = 27017
\ No newline at end of file