Commit 71008e36 authored by Alexander Lercher's avatar Alexander Lercher

Merge branch 'feature/generic-clustering' into develop

parents 6f8b2f9c cc03bae0
layers:
user:
properties:
starting-point:
properties:
- Latitude_StartingPoint
- Longitude_StartingPoint
\ No newline at end of file
...@@ -14,7 +14,7 @@ basePath: "/api" ...@@ -14,7 +14,7 @@ basePath: "/api"
paths: paths:
/debug: /debug:
post: post:
operationId: "rest.debug.echo" operationId: "routes.debug.echo"
tags: tags:
- "Echo" - "Echo"
summary: "Echo function for debugging purposes" summary: "Echo function for debugging purposes"
...@@ -29,9 +29,11 @@ paths: ...@@ -29,9 +29,11 @@ paths:
200: 200:
description: "Successful echo of request data" description: "Successful echo of request data"
# Locations
# TODO remove
/locations: /locations:
post: post:
operationId: "rest.location.post" operationId: "routes.location.post"
tags: tags:
- "Locations" - "Locations"
summary: "Add new location data" summary: "Add new location data"
...@@ -48,7 +50,7 @@ paths: ...@@ -48,7 +50,7 @@ paths:
400: 400:
description: "Invalid input" description: "Invalid input"
get: get:
operationId: "rest.location.get" operationId: "routes.location.get"
tags: tags:
- "Locations" - "Locations"
summary: "Get location data" summary: "Get location data"
...@@ -61,7 +63,7 @@ paths: ...@@ -61,7 +63,7 @@ paths:
/location-collections: /location-collections:
post: post:
operationId: "rest.location.post_many" operationId: "routes.location.post_many"
tags: tags:
- "Locations" - "Locations"
summary: "Add new location data collection" summary: "Add new location data collection"
...@@ -78,9 +80,77 @@ paths: ...@@ -78,9 +80,77 @@ paths:
400: 400:
description: "Invalid input" description: "Invalid input"
# Layers
/layers:
  post:
    operationId: "routes.layers.post"
    tags:
      - "Layers"
    summary: "Add a new layer or overwrite an existing one"
    parameters:
      - in: body
        name: "Layer"
        description: "The layer data to be added"
        required: true
        schema:
          $ref: "#/definitions/Layer-UpperCase"
    responses:
      # Swagger 2.0 requires response-status keys to be strings, so they are quoted.
      "201":
        description: "Successful operation"
      "400":
        description: "Invalid input"
  get:
    operationId: "routes.layers.get"
    tags:
      - "Layers"
    summary: "Get all layer data"
    parameters: []
    responses:
      "200":
        description: "Successful operation"
        schema:
          $ref: "#/definitions/LayerCollection"

/layers/names:
  get:
    operationId: "routes.layers.get_names"
    tags:
      - "Layers"
    summary: "Get all layer names"
    parameters: []
    responses:
      "200":
        description: "Successful operation"
        schema:
          type: array
          items:
            type: string

/layers/{name}:
  get:
    operationId: "routes.layers.get_by_name"
    tags:
      - "Layers"
    summary: "Get layer data for layer-name"
    parameters:
      - name: "name"
        in: "path"
        description: "Name of the layer to return"
        required: true
        type: "string"
    responses:
      "200":
        description: "Successful operation"
        schema:
          $ref: "#/definitions/Layer"
      "404":
        description: "Layer not found"
/location-clusters: /location-clusters:
get: get:
operationId: "rest.cluster.get_locations" operationId: "routes.cluster.get_locations"
tags: tags:
- "Clusters" - "Clusters"
summary: "Get user communities clustered by location" summary: "Get user communities clustered by location"
...@@ -93,7 +163,7 @@ paths: ...@@ -93,7 +163,7 @@ paths:
# /clusters/cluster.png: # /clusters/cluster.png:
# get: # get:
# operationId: "rest.cluster.get_image" # operationId: "routes.cluster.get_image"
# tags: # tags:
# - "Clusters" # - "Clusters"
# summary: "Get user communities per date per hour as image" # summary: "Get user communities per date per hour as image"
...@@ -106,7 +176,7 @@ paths: ...@@ -106,7 +176,7 @@ paths:
/time-clusters: /time-clusters:
get: get:
operationId: "rest.cluster.get_times" operationId: "routes.cluster.get_times"
tags: tags:
- "Clusters" - "Clusters"
summary: "Get user communities clustered by time per hour" summary: "Get user communities clustered by time per hour"
...@@ -119,7 +189,7 @@ paths: ...@@ -119,7 +189,7 @@ paths:
# /agi/clusters/cluster.png: # /agi/clusters/cluster.png:
# get: # get:
# operationId: "rest.agi_cluster.get_image" # operationId: "routes.agi_cluster.get_image"
# tags: # tags:
# - "Clusters" # - "Clusters"
# summary: "Get user communities per date per hour from agi data as image" # summary: "Get user communities per date per hour from agi data as image"
...@@ -130,9 +200,58 @@ paths: ...@@ -130,9 +200,58 @@ paths:
# 200: # 200:
# description: "Successful operation" # description: "Successful operation"
/clustersets:
  get:
    operationId: "routes.clustersets.get"
    tags:
      - "Clusters"
    summary: "Get clustersets for all layers"
    parameters: []
    responses:
      # Swagger 2.0 requires response-status keys to be strings, so they are quoted.
      "200":
        description: "Successful operation"
        schema:
          $ref: "#/definitions/ClusterSetCollection"

/clustersets/names:
  get:
    operationId: "routes.clustersets.get_names"
    tags:
      - "Clusters"
    summary: "Get clusterset names for all layers"
    parameters: []
    responses:
      "200":
        description: "Successful operation"
        schema:
          type: array
          items:
            type: string

/clustersets/{name}:
  get:
    operationId: "routes.clustersets.get_by_name"
    tags:
      - "Clusters"
    summary: "Get clusterset for layer-name"
    parameters:
      - name: "name"
        in: "path"
        description: "Name of the layer to return the clusterset for"
        required: true
        type: "string"
    responses:
      "200":
        description: "Successful operation"
        schema:
          $ref: "#/definitions/ClusterSet"
      "404":
        description: "Clusterset not found"
# TODO remove
/user-cluster-graphs: /user-cluster-graphs:
get: get:
operationId: "rest.user_cluster.get" operationId: "routes.user_cluster.get"
tags: tags:
- "User Graphs" - "User Graphs"
summary: "Get user graphs per layer per cluster" summary: "Get user graphs per layer per cluster"
...@@ -142,10 +261,11 @@ paths: ...@@ -142,10 +261,11 @@ paths:
description: "Successful operation" description: "Successful operation"
schema: schema:
$ref: "#/definitions/UserClusterGraphCollection" $ref: "#/definitions/UserClusterGraphCollection"
# Function Calls
/rfc/run: /rfc/run:
post: post:
operationId: "rest.functions.run_agi_clustering_and_graph_creation" operationId: "routes.functions.run_agi_clustering_and_graph_creation"
tags: tags:
- "Remote function calls" - "Remote function calls"
summary: "Insert locations from AGI, create clusters for starting time and location layers, create graphs for the location clusters" summary: "Insert locations from AGI, create clusters for starting time and location layers, create graphs for the location clusters"
...@@ -154,6 +274,7 @@ paths: ...@@ -154,6 +274,7 @@ paths:
204: 204:
description: "Successful operation" description: "Successful operation"
definitions: definitions:
Location: Location:
type: "object" type: "object"
...@@ -176,6 +297,24 @@ definitions: ...@@ -176,6 +297,24 @@ definitions:
items: items:
$ref: "#/definitions/Location" $ref: "#/definitions/Location"
# A generic cluster: a numeric label plus the (schema-free) nodes it contains.
Cluster:
  type: object
  properties:
    cluster_label:
      type: number
    nodes:
      type: array
      items:
        type: object
        example:
          "Finished_time": 1576631193265951
          "Latitude_Destination": -5.973257
          "Longitude_Destination": 37.416316
          "TravelID": "5e57ec9159bc0668543f156a"
          "TravelPrice": 15
          "UniqueID": "a95075f5042b1b27060080156d87fe34ec7e712c5e57ec9159bc0668543f156a"
          "UserID": "a95075f5042b1b27060080156d87fe34ec7e712c"
LocationCluster: LocationCluster:
type: object type: object
properties: properties:
...@@ -235,4 +374,52 @@ definitions: ...@@ -235,4 +374,52 @@ definitions:
UserClusterGraphCollection: UserClusterGraphCollection:
type: array type: array
items: items:
$ref: "#/definitions/UserClusterGraph" $ref: "#/definitions/UserClusterGraph"
\ No newline at end of file
# External (upper-case) layer schema as delivered by the data source.
Layer-UpperCase:
  type: object
  properties:
    LayerName:
      type: string
    Nodes:
      type: array
      items:
        type: object
    Properties:
      type: array
      items:
        type: string

# Internal layer schema as stored in the repository.
Layer:
  type: object
  properties:
    layer_name:
      type: string
    nodes:
      type: array
      items:
        type: object
    properties:
      type: array
      items:
        type: string

LayerCollection:
  type: array
  items:
    $ref: "#/definitions/Layer"

# All clusters computed for a single layer.
ClusterSet:
  type: object
  properties:
    layer_name:
      type: string
    clusters:
      type: array
      items:
        $ref: "#/definitions/Cluster"

ClusterSetCollection:
  type: array
  items:
    $ref: "#/definitions/ClusterSet"
\ No newline at end of file
...@@ -29,6 +29,24 @@ class AgiRepository: ...@@ -29,6 +29,24 @@ class AgiRepository:
return locations return locations
def getLocationsBasedOnNewDataSchema(self):
    '''
    Returns the stored locations wrapped as a 'Destination' layer in the
    generic data schema in use beginning on 24.03.2020.
    '''
    return {
        'layer_name': 'Destination',
        'nodes': self.getLocations(),
        'properties': ['latitude', 'longitude']
    }
def getTimesBasedOnNewDataSchema(self):
    '''
    Returns the stored locations wrapped as a 'Starting_Time' layer in the
    generic data schema in use beginning on 24.03.2020.
    '''
    return {
        'layer_name': 'Starting_Time',
        'nodes': self.getLocations(),
        'properties': ['timestamp']
    }
def readDataFromFile(self) -> List[Dict]: def readDataFromFile(self) -> List[Dict]:
with open('./db/agi/travels.json', 'r') as f_travels: with open('./db/agi/travels.json', 'r') as f_travels:
travels = json.loads(f_travels.read()) travels = json.loads(f_travels.read())
......
from db.entities.location import Location from db.entities.location import Location
from db.entities.popular_location import PopularLocation from db.entities.popular_location import PopularLocation
from db.entities.cluster import Cluster, LocationCluster, TimeCluster from db.entities.cluster import Cluster, LocationCluster, TimeCluster
from db.entities.user_cluster_graph import UserClusterGraph from db.entities.clusterset import ClusterSet
\ No newline at end of file from db.entities.user_cluster_graph import UserClusterGraph
from db.entities.layer import Layer
...@@ -4,10 +4,38 @@ from datetime import date, datetime ...@@ -4,10 +4,38 @@ from datetime import date, datetime
class Cluster: class Cluster:
def __init__(self, cluster_label: int = None, nodes: List = None): '''
A cluster for an arbitrary layer containing some nodes.
:param cluster_label: The label of the cluster unique for the layer
:param nodes: The individual nodes of the cluster
'''
def __init__(self, cluster_label: int = None, nodes: List = None,
cluster_dict: Dict = None, from_db=False):
self.cluster_label = cluster_label self.cluster_label = cluster_label
self.nodes = nodes self.nodes = nodes
if cluster_dict is not None:
self.from_serializable_dict(cluster_dict, from_db)
def to_serializable_dict(self, for_db=False) -> Dict:
return {
"cluster_label": self.cluster_label,
"nodes": json.dumps(self.nodes) if for_db else self.nodes
}
def from_serializable_dict(self, cluster_dict: Dict, from_db=False):
self.cluster_label = cluster_dict["cluster_label"]
self.nodes = json.loads(cluster_dict["nodes"]) \
if from_db else cluster_dict["nodes"]
def __repr__(self):
return json.dumps(self.to_serializable_dict())
def __str__(self):
return f"Cluster({self.__repr__()})"
class LocationCluster(Cluster): class LocationCluster(Cluster):
def __init__(self, cluster_label: int = None, nodes: List = None, def __init__(self, cluster_label: int = None, nodes: List = None,
...@@ -67,7 +95,7 @@ class TimeCluster(Cluster): ...@@ -67,7 +95,7 @@ class TimeCluster(Cluster):
if from_db else time_dict["nodes"] if from_db else time_dict["nodes"]
def __repr__(self): def __repr__(self):
return json.dumps(self.to_serializable_dict()) return json.dumps(self.to_serializable_dict(True))
def __str__(self): def __str__(self):
return f"TimeCluster({self.__repr__()})" return f"TimeCluster({self.__repr__()})"
import json
from db.entities.cluster import Cluster
from typing import List, Dict
from datetime import date, datetime
class ClusterSet:
    '''
    All clusters computed for one (arbitrary) layer.

    :param layer_name: Name of the layer the clusters belong to
    :param clusters: The individual clusters of that layer
    :param cluster_set_dict: Serialized representation used to restore the object
    :param from_db: True if cluster_set_dict originates from the database
    '''

    def __init__(self, layer_name: str = None, clusters: List[Cluster] = None,
                 cluster_set_dict: Dict = None, from_db=False):
        self.layer_name = layer_name
        self.clusters = clusters

        # A serialized dict takes precedence over the individual arguments.
        if cluster_set_dict is not None:
            self.from_serializable_dict(cluster_set_dict, from_db)

    def to_serializable_dict(self, for_db=False) -> Dict:
        '''Serializes the clusterset; the cluster list is JSON-encoded for db storage.'''
        clusters_serialized = [c.to_serializable_dict(for_db) for c in self.clusters]
        if for_db:
            clusters_serialized = json.dumps(clusters_serialized)
        return {
            "layer_name": self.layer_name,
            "clusters": clusters_serialized
        }

    def from_serializable_dict(self, cluster_set_dict: Dict, from_db=False):
        '''Restores the clusterset from a dict produced by to_serializable_dict.'''
        self.layer_name = cluster_set_dict["layer_name"]
        raw_clusters = cluster_set_dict["clusters"]
        if from_db:
            raw_clusters = json.loads(raw_clusters)
        self.clusters = [Cluster(cluster_dict=c, from_db=from_db)
                         for c in raw_clusters]

    def __repr__(self):
        return json.dumps(self.to_serializable_dict())

    def __str__(self):
        return f"ClusterSet({self.__repr__()})"
import json
from datetime import datetime
from typing import Dict
class Layer:
    '''
    A single layer of the Multilayer Graph.

    :param layer_info: Serialized representation used to restore the layer
    :param from_db: True if layer_info originates from the database
    '''

    def __init__(self, layer_info: Dict = None, from_db=False):
        if layer_info is not None:
            self.from_serializable_dict(layer_info, from_db)

    def to_serializable_dict(self, for_db=False) -> Dict:
        '''Serializes the layer; the node list is JSON-encoded for db storage.'''
        nodes = json.dumps(self.nodes) if for_db else self.nodes
        return {
            "layer_name": self.layer_name,
            "properties": self.properties,
            "nodes": nodes
        }

    def from_serializable_dict(self, layer_info: Dict, from_db=False):
        '''Restores the layer from a dict produced by to_serializable_dict.'''
        self.layer_name = layer_info['layer_name']
        self.properties = layer_info['properties']
        if from_db:
            self.nodes = json.loads(layer_info["nodes"])
        else:
            self.nodes = layer_info["nodes"]

    def __repr__(self):
        return json.dumps(self.to_serializable_dict())

    def __str__(self):
        return f"Layer({self.__repr__()})"
...@@ -5,12 +5,12 @@ import json ...@@ -5,12 +5,12 @@ import json
from db.agi.agi_repository import AgiRepository from db.agi.agi_repository import AgiRepository
from db.entities import Location, TimeCluster, PopularLocation, LocationCluster, UserClusterGraph from db.entities import *
from typing import List from typing import List
class Repository(MongoRepositoryBase): class Repository(MongoRepositoryBase):
'''This repository stores and loads locations and clusters with MongoDb.''' '''This is a repository for MongoDb.'''
def __init__(self): def __init__(self):
super().__init__(netconst.COMMUNITY_DETECTION_DB_HOSTNAME, super().__init__(netconst.COMMUNITY_DETECTION_DB_HOSTNAME,
...@@ -21,9 +21,12 @@ class Repository(MongoRepositoryBase): ...@@ -21,9 +21,12 @@ class Repository(MongoRepositoryBase):
self._location_cluster_collection = 'location_cluster' self._location_cluster_collection = 'location_cluster'
self._time_cluster_collection = 'time_cluster' self._time_cluster_collection = 'time_cluster'
self._user_cluster_graph_collection = 'user_cluster_graph' self._user_cluster_graph_collection = 'user_cluster_graph'
self._layer_collection = 'layer'
self._clusterset_collection = 'cluster_set'
self.agi_repo = AgiRepository() self.agi_repo = AgiRepository()
#region Location
def add_location(self, location: Location): def add_location(self, location: Location):
super().insert_entry(self._location_collection, location.to_serializable_dict()) super().insert_entry(self._location_collection, location.to_serializable_dict())
...@@ -34,7 +37,9 @@ class Repository(MongoRepositoryBase): ...@@ -34,7 +37,9 @@ class Repository(MongoRepositoryBase):
def get_agi_locations(self) -> List[Location]: def get_agi_locations(self) -> List[Location]:
agi_locations = self.agi_repo.getLocations() agi_locations = self.agi_repo.getLocations()
return [Location(agi_loc) for agi_loc in agi_locations] return [Location(agi_loc) for agi_loc in agi_locations]
#endregion
#region Specific Clusters
def add_location_cluster(self, cluster: LocationCluster): def add_location_cluster(self, cluster: LocationCluster):
super().insert_entry(self._location_cluster_collection, super().insert_entry(self._location_cluster_collection,
cluster.to_serializable_dict(for_db=True)) cluster.to_serializable_dict(for_db=True))
...@@ -50,7 +55,9 @@ class Repository(MongoRepositoryBase): ...@@ -50,7 +55,9 @@ class Repository(MongoRepositoryBase):
def get_time_clusters(self) -> List[TimeCluster]: def get_time_clusters(self) -> List[TimeCluster]:
clusters = super().get_entries(self._time_cluster_collection) clusters = super().get_entries(self._time_cluster_collection)
return [TimeCluster(time_dict=c, from_db=True) for c in clusters] return [TimeCluster(time_dict=c, from_db=True) for c in clusters]
#endregion
#region Cluster Graph
def add_user_cluster_graph(self, user_graph: UserClusterGraph): def add_user_cluster_graph(self, user_graph: UserClusterGraph):
super().insert_entry(self._user_cluster_graph_collection, super().insert_entry(self._user_cluster_graph_collection,
user_graph.to_serializable_dict(for_db=True)) user_graph.to_serializable_dict(for_db=True))
...@@ -58,3 +65,48 @@ class Repository(MongoRepositoryBase): ...@@ -58,3 +65,48 @@ class Repository(MongoRepositoryBase):
def get_user_cluster_graphs(self) -> List[UserClusterGraph]: def get_user_cluster_graphs(self) -> List[UserClusterGraph]:
user_graphs = super().get_entries(self._user_cluster_graph_collection) user_graphs = super().get_entries(self._user_cluster_graph_collection)
return [UserClusterGraph(dict_=u, from_db=True) for u in user_graphs] return [UserClusterGraph(dict_=u, from_db=True) for u in user_graphs]
#endregion
#region Layers
def add_layer(self, layer: Layer):
    '''Stores the given layer in the layer collection.'''
    serialized = layer.to_serializable_dict()
    super().insert_entry(self._layer_collection, serialized)
def get_layers(self) -> List[Layer]:
    '''Loads and deserializes all stored layers.'''
    layers = []
    for entry in super().get_entries(self._layer_collection):
        layers.append(Layer(entry))
    return layers
def get_layer_names(self) -> List[str]:
    '''Returns only the names of all stored layers.'''
    name_entries = super().get_entries(self._layer_collection,
                                       projection={'layer_name': 1})
    return [entry['layer_name'] for entry in name_entries]
def get_layer(self, layer_name) -> Layer:
    '''
    Returns the stored layer with the given name, or None if no such layer exists.

    :param layer_name: Name of the layer to load
    '''
    entries = super().get_entries(self._layer_collection,
                                  selection={'layer_name': layer_name})
    layers = [Layer(e) for e in entries]
    # A list comprehension never yields None, so only emptiness must be checked.
    return layers[0] if layers else None
#endregion
#region ClusterSet
def add_clusterset(self, cluster_set: ClusterSet):
    '''Stores the given clusterset in the clusterset collection.'''
    serialized = cluster_set.to_serializable_dict()
    super().insert_entry(self._clusterset_collection, serialized)
def get_clustersets(self) -> List[ClusterSet]:
    '''Loads and deserializes all stored clustersets.'''
    clustersets = []
    for entry in super().get_entries(self._clusterset_collection):
        clustersets.append(ClusterSet(cluster_set_dict=entry))
    return clustersets
def get_clusterset_names(self) -> List[str]:
    '''Returns only the layer names of all stored clustersets.'''
    name_entries = super().get_entries(self._clusterset_collection,
                                       projection={'layer_name': 1})
    return [entry['layer_name'] for entry in name_entries]
def get_clusterset(self, layer_name) -> ClusterSet:
    '''
    Returns the stored clusterset for the given layer name, or None if none exists.

    :param layer_name: Name of the layer whose clusterset should be loaded
    '''
    entries = super().get_entries(self._clusterset_collection,
                                  selection={'layer_name': layer_name})
    clustersets = [ClusterSet(cluster_set_dict=e) for e in entries]
    # A list comprehension never yields None, so only emptiness must be checked.
    return clustersets[0] if clustersets else None
#endregion
...@@ -2,9 +2,20 @@ import json ...@@ -2,9 +2,20 @@ import json
import numpy as np import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN from sklearn.cluster import DBSCAN
from typing import List, Dict from typing import List, Dict, Any, TypeVar
from deprecated import deprecated
T = TypeVar('T')
ClusterGroup = Dict[Any, List[Dict]]
class Clusterer: class Clusterer:
'''
Clusterer for applying density-based clustering on datasets.
The clustering is done with DBSCAN.
:param epsilon: Epsilon used in DBSCAN
:param min_points: Min_points used in DBSCAN
'''
def __init__(self, epsilon=11, min_points=2): def __init__(self, epsilon=11, min_points=2):
self.epsilon = epsilon self.epsilon = epsilon
self.min_points = min_points self.min_points = min_points
...@@ -43,7 +54,8 @@ class Clusterer: ...@@ -43,7 +54,8 @@ class Clusterer:
return fig return fig
def create_labels(self, features:np.ndarray) -> List: def create_labels(self, features:np.ndarray) -> List[int]:
'''Creates labels for the items based on DBSCAN.'''
if features is None or len(features) == 0: if features is None or len(features) == 0:
return features # trash in trash out return features # trash in trash out
...@@ -53,13 +65,25 @@ class Clusterer: ...@@ -53,13 +65,25 @@ class Clusterer:
return labels.tolist() return labels.tolist()
@deprecated(reason="Use generic version instead")
def extract_location_features(self, locations: List[dict]) -> np.ndarray: def extract_location_features(self, locations: List[dict]) -> np.ndarray:
return np.asarray([(float(l['latitude']), float(l['longitude'])) for l in locations]) return np.asarray([(float(l['latitude']), float(l['longitude'])) for l in locations])
@deprecated(reason="Use generic version instead")
def extract_time_features(self, times: List[Dict]) -> np.ndarray: def extract_time_features(self, times: List[Dict]) -> np.ndarray:
return np.asarray([((t['timestamp']), 0) for t in times]) return np.asarray([[float(t['timestamp'])] for t in times])
def _extract_features(self, dataset: List[Dict], features:List[str]) -> np.ndarray:
'''Extracts the feature values from the dataset into a np array with same order as original dataset.'''
extracted_features = []
for data in dataset:
entry = [float(data[feature]) for feature in features]
extracted_features.append(entry)
def label_dataset(self, dataset:List[Dict], labels:List) -> List: return np.asarray(extracted_features)
def label_dataset(self, dataset:List[Dict], labels:List[Any]) -> List:
'''Adds the labels to the elements of the dataset at the same position. The new key is called cluster_label.'''
if dataset is None or labels is None: if dataset is None or labels is None:
return return
...@@ -67,16 +91,21 @@ class Clusterer: ...@@ -67,16 +91,21 @@ class Clusterer:
raise ValueError("dataset and labels has to have same length") raise ValueError("dataset and labels has to have same length")
for i in range(len(dataset)): for i in range(len(dataset)):
if 'cluster_label' in dataset[i]:
continue
dataset[i]['cluster_label'] = labels[i] dataset[i]['cluster_label'] = labels[i]
def group_by_clusters(self, dataset:List[Dict], labels:List) -> Dict[int, List[Dict]]: def group_by_clusters(self, dataset:List[Dict], labels:List[Any]) -> ClusterGroup:
self.label_dataset(dataset, labels)
clusters = {} clusters = {}
for label in labels: for label in labels:
clusters[label] = [ds for ds in dataset if ds['cluster_label'] == label] clusters[label] = [ds for ds in dataset if ds['cluster_label'] == label]
return clusters return clusters
def cluster_locations(self, locations:List[Dict]) -> Dict[int, List[Dict]]: @deprecated(reason="Use generic version instead")
def cluster_locations(self, locations:List[Dict]) -> ClusterGroup:
'''Returns a dictionary with identified clusters and their locations copied from the input''' '''Returns a dictionary with identified clusters and their locations copied from the input'''
if locations is None or len(locations) == 0: if locations is None or len(locations) == 0:
# raise Exception("locations has to contain something") # raise Exception("locations has to contain something")
...@@ -88,12 +117,29 @@ class Clusterer: ...@@ -88,12 +117,29 @@ class Clusterer:
self.label_dataset(locations, labels) self.label_dataset(locations, labels)
return self.group_by_clusters(locations, labels) return self.group_by_clusters(locations, labels)
def cluster_times(self, times:List[Dict]) -> Dict[int, List[Dict]]: @deprecated(reason="Use generic version instead")
def cluster_times(self, times:List[Dict]) -> ClusterGroup:
'''Returns a dictionary with identified clusters and their times copied from the input''' '''Returns a dictionary with identified clusters and their times copied from the input'''
features = self.extract_time_features(times) features = self.extract_time_features(times)
labels = self.create_labels(features) labels = self.create_labels(features)
self.label_dataset(times, labels) self.label_dataset(times, labels)
return self.group_by_clusters(times, labels) return self.group_by_clusters(times, labels)
\ No newline at end of file
def cluster_dataset(self, dataset:List[Dict], features:List[str]) -> ClusterGroup:
    '''
    Clusters the dataset based on the named feature values of its nodes.

    :param dataset: The nodes to assign to clusters
    :param features: The feature names of the nodes to use for clustering
    :returns: A dictionary of clusters, where each value is a non-empty subset of dataset if dataset was not empty
    '''
    feature_matrix = self._extract_features(dataset, features)
    cluster_labels = self.create_labels(feature_matrix)
    return self.group_by_clusters(dataset, cluster_labels)
import yaml
from typing import Generator
### init logging ###
import logging
LOG_FORMAT = (
'%(levelname) -5s %(asctime)s %(name)s:%(funcName) -35s %(lineno) -5d: %(message)s')
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
LOGGER = logging.getLogger(__name__)
class ClusteringConfig:
    '''Wraps the clustering configuration defined in configs/clustering.yaml.'''

    config_path = 'configs/clustering.yaml'
    # Parsed configuration; {} if the file could not be parsed.
    config: dict = None

    def __init__(self):
        self.config = self._load_config()

    def _load_config(self) -> dict:
        '''Reads and parses the whole configuration file.'''
        with open(self.config_path, 'r') as stream:
            try:
                return yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                LOGGER.error(exc)
                return {}

    def get_config(self):
        return self.config

    def get_layer_configs(self) -> Generator[dict, None, None]:
        """
        Yields the individual layer configs.

        Each yielded dict additionally carries its config key under 'layer-name'.
        """
        for layer_name, layer_config in self.config['layers'].items():
            layer_config['layer-name'] = layer_name
            yield layer_config
...@@ -5,9 +5,11 @@ certifi==2019.11.28 ...@@ -5,9 +5,11 @@ certifi==2019.11.28
chardet==3.0.4 chardet==3.0.4
Click==7.0 Click==7.0
clickclick==1.2.2 clickclick==1.2.2
colorama==0.4.3
connexion==2.6.0 connexion==2.6.0
cycler==0.10.0 cycler==0.10.0
decorator==4.4.1 decorator==4.4.1
Deprecated==1.2.7
Flask==1.1.1 Flask==1.1.1
idna==2.8 idna==2.8
importlib-metadata==1.5.0 importlib-metadata==1.5.0
......
import io import io
from flask import request, Response from flask import request, Response
from db.repository import Repository from db.repository import Repository
from processing.clusterer import Clusterer from processing.clustering.clusterer import Clusterer
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
repo = Repository() repo = Repository()
......
from flask import request, Response
from db.repository import Repository
from db.entities import ClusterSet
repo = Repository()
def get():
    '''Returns all stored clustersets as serializable dicts.'''
    clustersets = repo.get_clustersets()
    return [cluster_set.to_serializable_dict() for cluster_set in clustersets]
def get_names():
    '''Returns the layer names of all stored clustersets.'''
    names = repo.get_clusterset_names()
    return names
def get_by_name(name):
    '''Returns the clusterset for the given layer name, or 404 if it is unknown.'''
    cluster_set = repo.get_clusterset(name)
    if cluster_set is None:
        return Response(status=404)
    return cluster_set.to_serializable_dict()
\ No newline at end of file
from flask import request, Response
from db.repository import Repository
from db.entities import Layer
repo = Repository()
def post():
    '''
    Adds a new layer or overwrites an existing one from the request body.

    :returns: 201 on success, 400 if the body misses any of the required
        external keys (LayerName, Nodes, Properties)
    '''
    body = request.json
    try:
        _insert_layer(body)
    except (KeyError, TypeError):
        # A malformed body previously surfaced as a 500; the API spec
        # documents 400 "Invalid input" for this case.
        return Response(status=400)
    return Response(status=201)
def _insert_layer(layer_data: dict):
    '''Renames the external (upper-case) keys to internal ones and stores the layer.'''
    key_mapping = [('LayerName', 'layer_name'),
                   ('Nodes', 'nodes'),
                   ('Properties', 'properties')]
    for external_key, internal_key in key_mapping:
        layer_data[internal_key] = layer_data.pop(external_key)

    repo.add_layer(Layer(layer_data))
def get():
    '''Returns all stored layers as serializable dicts.'''
    layers = repo.get_layers()
    return [layer.to_serializable_dict() for layer in layers]
def get_names():
    '''Returns the names of all stored layers.'''
    names = repo.get_layer_names()
    return names
def get_by_name(name):
    '''Returns the layer with the given name, or 404 if it is unknown.'''
    layer = repo.get_layer(name)
    if layer is None:
        return Response(status=404)
    return layer.to_serializable_dict()
\ No newline at end of file
...@@ -4,15 +4,40 @@ modules_path = '../../../modules/' ...@@ -4,15 +4,40 @@ modules_path = '../../../modules/'
if os.path.exists(modules_path): if os.path.exists(modules_path):
sys.path.insert(1, modules_path) sys.path.insert(1, modules_path)
from db.entities import Location, PopularLocation, LocationCluster, TimeCluster from db.entities import *
from typing import List, Dict, Tuple from typing import List, Dict, Tuple
from db.repository import Repository from db.repository import Repository, AgiRepository
from processing.clusterer import Clusterer from processing.clustering.clusterer import Clusterer
DEBUG = False DEBUG = False
repo = Repository() repo = Repository()
test_repo = AgiRepository()
def run_generic_clustering():
    '''Runs the clustering for all layers found in the repository and stores the resulting clustersets.'''
    for layer in repo.get_layers():
        print(f"Clustering {layer.layer_name}")
        layer_clusters = run_clustering_for_layer(layer)
        repo.add_clusterset(ClusterSet(layer.layer_name, layer_clusters))
def run_clustering_for_layer(layer: Layer) -> List[Cluster]:
    '''Clusters the nodes of a single layer based on its configured properties.'''
    cluster_groups = Clusterer().cluster_dataset(layer.nodes, layer.properties)
    return [Cluster(label, nodes) for label, nodes in cluster_groups.items()]
def run_location_clustering(): def run_location_clustering():
user_clusterer = Clusterer() user_clusterer = Clusterer()
...@@ -74,5 +99,7 @@ def store_clusters(type: str, clusters: List): ...@@ -74,5 +99,7 @@ def store_clusters(type: str, clusters: List):
if __name__ == "__main__": if __name__ == "__main__":
run_location_clustering() run_generic_clustering()
run_time_clustering()
# run_location_clustering()
# run_time_clustering()
import unittest import unittest
import sys import sys
sys.path.insert(1, '../') for path in ['../', './']:
sys.path.insert(1, path)
# python -m unittest discover # python -m unittest discover
from processing.clusterer import Clusterer from processing.clustering.clusterer import Clusterer
class TestClusterer(unittest.TestCase): class TestClusterer(unittest.TestCase):
clusterer:Clusterer = None clusterer:Clusterer = None
...@@ -71,11 +72,50 @@ class TestClusterer(unittest.TestCase): ...@@ -71,11 +72,50 @@ class TestClusterer(unittest.TestCase):
self.assertEqual(3, len(locations)) self.assertEqual(3, len(locations))
self.assertHaveLabelsAsNewKey(locations, labels) self.assertHaveLabelsAsNewKey(locations, labels)
def test_cluster_locations_multInput_correctlyLabeled(self):
    '''Two nearby locations share label 0; the far-away one becomes noise (-1).'''
    locations = [self.location(1, 2), self.location(2, 2), self.location(20, 20)]
    expected_labels = [0, 0, -1]

    clusters = self.clusterer.cluster_locations(locations)

    self.assertHaveLabelsAsNewKey(locations, expected_labels)
    expected_clusters = {
        0: [{'latitude': 1, 'longitude': 2, 'cluster_label': 0},
            {'latitude': 2, 'longitude': 2, 'cluster_label': 0}],
        -1: [{'latitude': 20, 'longitude': 20, 'cluster_label': -1}],
    }
    self.assertDictEqual(clusters, expected_clusters)
def test_cluster_times_multInput_correctlyLabeled(self):
    '''Two close timestamps share label 0; the distant one becomes noise (-1).'''
    times = [self.time(123), self.time(128), self.time(223)]
    expected_labels = [0, 0, -1]

    clusters = self.clusterer.cluster_times(times)

    self.assertHaveLabelsAsNewKey(times, expected_labels)
    expected_clusters = {
        0: [{'timestamp': 123, 'cluster_label': 0},
            {'timestamp': 128, 'cluster_label': 0}],
        -1: [{'timestamp': 223, 'cluster_label': -1}],
    }
    self.assertDictEqual(clusters, expected_clusters)
def test_cluster_dataset_locationsMultInput_correctlyLabeled(self):
    '''The generic cluster_dataset labels locations like cluster_locations does.'''
    locations = [self.location(1, 2), self.location(2, 2), self.location(20, 20)]
    expected_labels = [0, 0, -1]

    clusters = self.clusterer.cluster_dataset(locations, ['latitude', 'longitude'])

    self.assertHaveLabelsAsNewKey(locations, expected_labels)
    expected_clusters = {
        0: [{'latitude': 1, 'longitude': 2, 'cluster_label': 0},
            {'latitude': 2, 'longitude': 2, 'cluster_label': 0}],
        -1: [{'latitude': 20, 'longitude': 20, 'cluster_label': -1}],
    }
    self.assertDictEqual(clusters, expected_clusters)
def test_cluster_dataset_timesMultInput_correctlyLabeled(self):
    '''The generic cluster_dataset labels times like cluster_times does.'''
    times = [self.time(123), self.time(128), self.time(223)]
    expected_labels = [0, 0, -1]

    clusters = self.clusterer.cluster_dataset(times, ['timestamp'])

    self.assertHaveLabelsAsNewKey(times, expected_labels)
    expected_clusters = {
        0: [{'timestamp': 123, 'cluster_label': 0},
            {'timestamp': 128, 'cluster_label': 0}],
        -1: [{'timestamp': 223, 'cluster_label': -1}],
    }
    self.assertDictEqual(clusters, expected_clusters)
# helper methods: # helper methods:
def location(self, lat, long_) -> dict: def location(self, lat, long_) -> dict:
return {'latitude': lat, 'longitude':long_} return {'latitude': lat, 'longitude':long_}
def time(self, ts) -> dict:
    # Builds a minimal time node as consumed by the clusterer under test.
    return {'timestamp': ts}
def assertHaveLabelsAsNewKey(self, locations, labels): def assertHaveLabelsAsNewKey(self, locations, labels):
for i in range(len(locations)): for i in range(len(locations)):
self.assertEqual(labels[i], locations[i]['cluster_label']) self.assertEqual(labels[i], locations[i]['cluster_label'])
......
import unittest
import sys
for path in ['../', './']:
sys.path.insert(1, path)
# python -m unittest discover
from processing.clustering.clustering_config import ClusteringConfig
class TestClusteringConfig(unittest.TestCase):
    '''Tests loading the clustering configuration from configs/clustering.yaml.'''

    def setUp(self):
        self.clustering_config = ClusteringConfig()

    def test_get_layer_configs_noneInput_noneOutput(self):
        '''Every yielded layer config must carry its key under 'layer-name'.'''
        for config in self.clustering_config.get_layer_configs():
            self.assertIn('layer-name', config)
if __name__ == '__main__':
unittest.main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment