Commit 31f80acb authored by Alexander Lercher

Merge branch 'feature/handle-large-datasets' into develop

parents bd4aa55b cf555c9f
...@@ -80,16 +80,17 @@ paths: ...@@ -80,16 +80,17 @@ paths:
400: 400:
description: "Invalid input" description: "Invalid input"
# Layers #region Layers
/layers: /layers:
post: post:
operationId: "routes.layers.post" operationId: "routes.layers.post"
tags: tags:
- "Layers" - "Layers"
summary: "Add a new layer or overwrite an existing one" summary: "Add a new layer [TODO: or overwrite an existing one]"
parameters: parameters:
- in: body - in: body
name: "Layer" name: "layer"
description: "The layer data to be added" description: "The layer data to be added"
required: true required: true
schema: schema:
...@@ -111,41 +112,109 @@ paths: ...@@ -111,41 +112,109 @@ paths:
schema: schema:
$ref: "#/definitions/LayerCollection" $ref: "#/definitions/LayerCollection"
/layers/names: /layers/{name}:
get: get:
operationId: "routes.layers.get_names" operationId: "routes.layers.get_by_name"
tags: tags:
- "Layers" - "Layers"
summary: "Get all layer names" summary: "Get single layer data"
parameters: [] parameters:
- name: "name"
in: "path"
description: "Name of the requested layer"
required: true
type: "string"
responses: responses:
200: 200:
description: "Successful operation" description: "Successful operation"
schema: schema:
type: array $ref: "#/definitions/Layer"
items: 404:
type: string description: "Layer not found"
/layers/{name}: /layers/{name}/nodes:
get: get:
operationId: "routes.layers.get_by_name" operationId: "routes.layers.get_nodes"
tags: tags:
- "Layers" - "Layers"
summary: "Get layer data for layer-name" summary: "Get all individual nodes for the layer"
parameters: parameters:
- name: "name" - name: "name"
in: "path" in: "path"
description: "Name of the layer to return" description: "Name of the layer"
required: true required: true
type: "string" type: "string"
responses: responses:
200: 200:
description: "Successful operation" description: "Successful operation"
schema: schema:
$ref: "#/definitions/Layer" $ref: "#/definitions/NodeCollection"
404: 404:
description: "Layer not found" description: "Layer not found"
post:
operationId: "routes.layers.post_nodes"
tags:
- "Layers"
summary: "Adds a single or multiple nodes to the layer"
parameters:
- name: "name"
in: "path"
description: "Name of the layer"
required: true
type: "string"
- name: "node"
in: body
description: "The node(s) to be added"
required: true
schema:
$ref: "#/definitions/NodeCollection"
responses:
201:
description: "Successful operation"
400:
description: "Invalid input"
/layers/{name}/clusters:
get:
operationId: "routes.clustersets.get_by_name2"
tags:
- "Layers"
summary: "Get all clusters for the layer"
parameters:
- name: "name"
in: "path"
description: "Name of the layer"
required: true
type: "string"
responses:
200:
description: "Successful operation"
schema:
$ref: "#/definitions/ClusterCollection"
404:
description: "Layer not found"
/layers/{name}/timeslices:
get:
operationId: "routes.timeslices.get_by_name2"
tags:
- "Layers"
summary: "Get all timeslices for the layer"
parameters:
- name: "name"
in: "path"
description: "Name of the layer"
required: true
type: "string"
responses:
200:
description: "Successful operation"
schema:
$ref: "#/definitions/TimeSliceCollection"
404:
description: "Layer not found"
#endregion
# Clusters # Clusters
# TODO remove partially # TODO remove partially
/location-clusters: /location-clusters:
...@@ -200,6 +269,7 @@ paths: ...@@ -200,6 +269,7 @@ paths:
# 200: # 200:
# description: "Successful operation" # description: "Successful operation"
# TODO remove
/clustersets: /clustersets:
get: get:
operationId: "routes.clustersets.get" operationId: "routes.clustersets.get"
...@@ -248,6 +318,7 @@ paths: ...@@ -248,6 +318,7 @@ paths:
404: 404:
description: "Clusterset not found" description: "Clusterset not found"
# TODO remove # TODO remove
/user-cluster-graphs: /user-cluster-graphs:
get: get:
...@@ -335,20 +406,20 @@ definitions: ...@@ -335,20 +406,20 @@ definitions:
Cluster: Cluster:
type: object type: object
properties: properties:
layer_name:
type: string
cluster_label: cluster_label:
type: number type: number
nodes: nodes:
type: array type: array
items: items:
type: object $ref: "#/definitions/Node"
example:
"Finished_time": 1576631193265951
"Latitude_Destination": -5.973257 ClusterCollection:
"Longitude_Destination": 37.416316 type: array
"TravelID": "5e57ec9159bc0668543f156a" items:
"TravelPrice": 15 $ref: "#/definitions/Cluster"
"UniqueID": "a95075f5042b1b27060080156d87fe34ec7e712c5e57ec9159bc0668543f156a"
"UserID": "a95075f5042b1b27060080156d87fe34ec7e712c"
LocationCluster: LocationCluster:
type: object type: object
...@@ -416,10 +487,10 @@ definitions: ...@@ -416,10 +487,10 @@ definitions:
properties: properties:
LayerName: LayerName:
type: string type: string
Nodes: # Nodes:
type: array # type: array
items: # items:
type: object # type: object
Properties: Properties:
type: array type: array
items: items:
...@@ -430,10 +501,10 @@ definitions: ...@@ -430,10 +501,10 @@ definitions:
properties: properties:
layer_name: layer_name:
type: string type: string
nodes: # nodes:
type: array # type: array
items: # items:
type: object # type: object
properties: properties:
type: array type: array
items: items:
...@@ -444,6 +515,22 @@ definitions: ...@@ -444,6 +515,22 @@ definitions:
items: items:
$ref: "#/definitions/Layer" $ref: "#/definitions/Layer"
Node:
type: object
example:
"Finished_time": 1576631193265951
"Latitude_Destination": -5.973257
"Longitude_Destination": 37.416316
"TravelID": "5e57ec9159bc0668543f156a"
"TravelPrice": 15
"UniqueID": "a95075f5042b1b27060080156d87fe34ec7e712c5e57ec9159bc0668543f156a"
"UserID": "a95075f5042b1b27060080156d87fe34ec7e712c"
NodeCollection:
type: array
items:
$ref: "#/definitions/Node"
ClusterSet: ClusterSet:
type: object type: object
properties: properties:
......
...@@ -7,12 +7,14 @@ class Cluster: ...@@ -7,12 +7,14 @@ class Cluster:
''' '''
A cluster for an arbitrary layer containing some nodes. A cluster for an arbitrary layer containing some nodes.
:param layer_name: The name of the layer in which the cluster is located
:param cluster_label: The label of the cluster unique for the layer :param cluster_label: The label of the cluster unique for the layer
:param nodes: The individual nodes of the cluster :param nodes: The individual nodes of the cluster
''' '''
def __init__(self, cluster_label: int = None, nodes: List = None, def __init__(self, layer_name: str = None, cluster_label: int = None, nodes: List[Dict] = None,
cluster_dict: Dict = None, from_db=False): cluster_dict: Dict = None, from_db=False):
self.layer_name = layer_name
self.cluster_label = cluster_label self.cluster_label = cluster_label
self.nodes = nodes self.nodes = nodes
...@@ -21,11 +23,13 @@ class Cluster: ...@@ -21,11 +23,13 @@ class Cluster:
def to_serializable_dict(self, for_db=False) -> Dict: def to_serializable_dict(self, for_db=False) -> Dict:
return { return {
"layer_name": self.layer_name,
"cluster_label": self.cluster_label, "cluster_label": self.cluster_label,
"nodes": json.dumps(self.nodes) if for_db else self.nodes "nodes": json.dumps(self.nodes) if for_db else self.nodes
} }
def from_serializable_dict(self, cluster_dict: Dict, from_db=False): def from_serializable_dict(self, cluster_dict: Dict, from_db=False):
self.layer_name = cluster_dict["layer_name"]
self.cluster_label = cluster_dict["cluster_label"] self.cluster_label = cluster_dict["cluster_label"]
self.nodes = json.loads(cluster_dict["nodes"]) \ self.nodes = json.loads(cluster_dict["nodes"]) \
if from_db else cluster_dict["nodes"] if from_db else cluster_dict["nodes"]
......
...@@ -17,15 +17,12 @@ class Layer: ...@@ -17,15 +17,12 @@ class Layer:
def to_serializable_dict(self, for_db=False) -> Dict: def to_serializable_dict(self, for_db=False) -> Dict:
return { return {
"layer_name": self.layer_name, "layer_name": self.layer_name,
"properties": self.properties, "properties": self.properties
"nodes": json.dumps(self.nodes) if for_db else self.nodes
} }
def from_serializable_dict(self, layer_info: Dict, from_db=False): def from_serializable_dict(self, layer_info: Dict, from_db=False):
self.layer_name = layer_info['layer_name'] self.layer_name = layer_info['layer_name']
self.properties = layer_info['properties'] self.properties = layer_info['properties']
self.nodes = json.loads(layer_info["nodes"]) \
if from_db else layer_info["nodes"]
def __repr__(self): def __repr__(self):
return json.dumps(self.to_serializable_dict()) return json.dumps(self.to_serializable_dict())
......
...@@ -21,9 +21,10 @@ class Repository(MongoRepositoryBase): ...@@ -21,9 +21,10 @@ class Repository(MongoRepositoryBase):
self._location_cluster_collection = 'location_cluster' self._location_cluster_collection = 'location_cluster'
self._time_cluster_collection = 'time_cluster' self._time_cluster_collection = 'time_cluster'
self._user_cluster_graph_collection = 'user_cluster_graph' self._user_cluster_graph_collection = 'user_cluster_graph'
self._layer_collection = 'layer' self._layer_collection = 'layer-new'
self._clusterset_collection = 'cluster_set' self._layer_nodes_collection = 'layer_nodes-new'
self._time_slice_collection = 'time_slice' self._clusterset_collection = 'cluster_set-new'
self._time_slice_collection = 'time_slice-new'
self.agi_repo = AgiRepository() self.agi_repo = AgiRepository()
...@@ -88,9 +89,22 @@ class Repository(MongoRepositoryBase): ...@@ -88,9 +89,22 @@ class Repository(MongoRepositoryBase):
return entries[0] return entries[0]
else: else:
return None return None
def add_layer_node(self, node: dict):
    '''Stores a single node document in the layer-nodes collection.'''
    target_collection = self._layer_nodes_collection
    super().insert_entry(target_collection, node)
def add_layer_nodes(self, nodes: List[dict]):
    '''Stores several node documents in the layer-nodes collection with one bulk insert.'''
    target_collection = self._layer_nodes_collection
    super().insert_many(target_collection, nodes)
def get_layer_nodes(self, layer_name: str) -> List[dict]:
    '''
    Returns all nodes stored for the layer.

    :param layer_name: The name of the layer whose nodes are requested
    :returns: A list with one dict per stored node; empty if nothing matches
    '''
    entries = super().get_entries(
        self._layer_nodes_collection,
        selection={'layer_name': layer_name},
        # drop the database id so the node dicts only carry their own fields
        projection={'_id': 0}
    )
    # materialize the lazy result so callers get a real list they can len() and re-iterate
    return list(entries)
#endregion #endregion
#region ClusterSet #region ClusterSet
# TODO cleanup
def add_clusterset(self, cluster_set: ClusterSet): def add_clusterset(self, cluster_set: ClusterSet):
super().insert_entry(self._clusterset_collection, cluster_set.to_serializable_dict()) super().insert_entry(self._clusterset_collection, cluster_set.to_serializable_dict())
...@@ -113,6 +127,16 @@ class Repository(MongoRepositoryBase): ...@@ -113,6 +127,16 @@ class Repository(MongoRepositoryBase):
return entries[0] return entries[0]
else: else:
return None return None
def add_clusters(self, clusters: List[Cluster]):
    '''Serializes the clusters in their database form and bulk-inserts them into the clusterset collection.'''
    serialized = []
    for cluster in clusters:
        serialized.append(cluster.to_serializable_dict(for_db=True))
    super().insert_many(self._clusterset_collection, serialized)
def get_clusters_for_layer(self, layer_name: str) -> List[Cluster]:
    '''Loads every stored cluster whose layer_name matches and rebuilds it as a Cluster object.'''
    stored = super().get_entries(
        self._clusterset_collection,
        selection={'layer_name': layer_name},
        projection={'_id': 0}
    )
    clusters = []
    for entry in stored:
        clusters.append(Cluster(cluster_dict=entry, from_db=True))
    return clusters
#endregion #endregion
#region TimeSlice #region TimeSlice
......
...@@ -10,8 +10,16 @@ def get(): ...@@ -10,8 +10,16 @@ def get():
def get_names(): def get_names():
return repo.get_clusterset_names() return repo.get_clusterset_names()
def get_by_name(layername):
res = repo.get_clusterset(layername) def get_by_name2(name):
res = repo.get_clusters_for_layer(name)
if res is None or len(res) == 0:
return Response(status=404)
else:
return [c.to_serializable_dict() for c in res]
def get_by_name(name):
res = repo.get_clusterset(name)
if res is not None: if res is not None:
return res.to_serializable_dict() return res.to_serializable_dict()
else: else:
......
...@@ -4,15 +4,18 @@ from db.entities import Layer ...@@ -4,15 +4,18 @@ from db.entities import Layer
repo = Repository() repo = Repository()
#region layers
def post(): def post():
'''Insert a new layer or overwrite an existing one.'''
# TODO overwrite
body = request.json body = request.json
_insert_layer(body) _insert_layer(body)
return Response(status=201) return Response(status=201)
def _insert_layer(layer_data: dict): def _insert_layer(layer_data: dict):
# convert object keys from ext source '''Converts object keys from external source and inserts into database.'''
layer_data['layer_name'] = layer_data.pop('LayerName') layer_data['layer_name'] = layer_data.pop('LayerName')
layer_data['nodes'] = layer_data.pop('Nodes') # layer_data['nodes'] = layer_data.pop('Nodes')
layer_data['properties'] = layer_data.pop('Properties') layer_data['properties'] = layer_data.pop('Properties')
repo.add_layer(Layer(layer_data)) repo.add_layer(Layer(layer_data))
...@@ -20,12 +23,27 @@ def _insert_layer(layer_data: dict): ...@@ -20,12 +23,27 @@ def _insert_layer(layer_data: dict):
def get(): def get():
return [l.to_serializable_dict() for l in repo.get_layers()] return [l.to_serializable_dict() for l in repo.get_layers()]
def get_names():
return repo.get_layer_names()
def get_by_name(name): def get_by_name(name):
res = repo.get_layer(name) res = repo.get_layer(name)
if res is not None: if res is not None:
return res.to_serializable_dict() return res.to_serializable_dict()
else: else:
return Response(status=404) return Response(status=404)
\ No newline at end of file
#endregion
#region nodes
def get_nodes(name):
    '''Returns whatever the repository holds as nodes for the layer with the given name.'''
    return repo.get_layer_nodes(name)
def post_nodes(name):
    '''
    Adds a single node or multiple nodes to the layer.

    The request body may be one node object or a list of node objects.
    Every node is tagged with the layer name before it is stored.

    :param name: The name of the layer the nodes belong to
    :returns: Response 201 on success, Response 400 if the body is empty or not made of node objects
    '''
    body = request.json

    # a lone node object would otherwise be iterated key by key, so wrap it
    nodes = [body] if isinstance(body, dict) else body

    # reject empty or malformed payloads up front instead of failing inside the bulk insert
    if not isinstance(nodes, list) or not nodes \
            or not all(isinstance(node, dict) for node in nodes):
        return Response(status=400)

    for node in nodes:
        node['layer_name'] = name

    repo.add_layer_nodes(nodes)
    return Response(status=201)
#endregion nodes
\ No newline at end of file
...@@ -11,9 +11,18 @@ def get(): ...@@ -11,9 +11,18 @@ def get():
def get_by_name(layername): def get_by_name(layername):
res = repo.get_time_slices_by_name(layername) res = repo.get_time_slices_by_name(layername)
print(len(res)) # print(len(res))
if res is not None and len(res) != 0: if res is not None and len(res) != 0:
return [e.to_serializable_dict() for e in res] return [e.to_serializable_dict() for e in res]
else: else:
return Response(status=404) return Response(status=404)
def get_by_name2(name):
    '''Returns the serialized time slices for the layer, or a 404 response if there are none.'''
    time_slices = repo.get_time_slices_by_name(name)

    if time_slices is None or len(time_slices) == 0:
        return Response(status=404)

    serialized = []
    for time_slice in time_slices:
        serialized.append(time_slice.to_serializable_dict())
    return serialized
\ No newline at end of file
...@@ -22,27 +22,30 @@ def run_generic_clustering(): ...@@ -22,27 +22,30 @@ def run_generic_clustering():
all_layers:List[Layer] = repo.get_layers() all_layers:List[Layer] = repo.get_layers()
for layer in all_layers: for layer in all_layers:
print(f"Clustering {layer.layer_name}")
if layer.properties is None or len(layer.properties) == 0: if layer.properties is None or len(layer.properties) == 0:
print("skipping")
continue continue
print(f"Clustering {layer.layer_name}")
clusters = run_clustering_for_layer(layer) clusters = run_clustering_for_layer(layer)
cluster_set = ClusterSet(layer.layer_name, clusters) # cluster_set = ClusterSet(layer.layer_name, clusters)
store_clusterset(cluster_set) store_generic_clusters(clusters)
def run_clustering_for_layer(layer: Layer) -> List[Cluster]: def run_clustering_for_layer(layer: Layer) -> List[Cluster]:
clusterer = Clusterer() nodes = repo.get_layer_nodes(layer.layer_name)
clusterer = Clusterer()
res = clusterer.cluster_dataset( res = clusterer.cluster_dataset(
layer.nodes, nodes,
layer.properties layer.properties
) )
return [Cluster(key, value) for key, value in res.items()] return [Cluster(layer.layer_name, key, value) for key, value in res.items()]
def store_clusterset(cluster_set: ClusterSet): def store_generic_clusters(clusters: List[Cluster]):
repo.add_clusterset(cluster_set) repo.add_clusters(clusters)
# with open(f'clusterset_{cluster_set.layer_name}.txt', 'w') as file: # with open(f'clusterset_{cluster_set.layer_name}.txt', 'w') as file:
# file.write(json.dumps(cluster_set.to_serializable_dict())) # file.write(json.dumps(cluster_set.to_serializable_dict()))
...@@ -109,5 +112,6 @@ def store_clusters(type: str, clusters: List): ...@@ -109,5 +112,6 @@ def store_clusters(type: str, clusters: List):
if __name__ == "__main__": if __name__ == "__main__":
run_generic_clustering() run_generic_clustering()
# TODO cleanup
# run_location_clustering() # run_location_clustering()
# run_time_clustering() # run_time_clustering()
...@@ -8,8 +8,8 @@ import json ...@@ -8,8 +8,8 @@ import json
from datetime import datetime, date from datetime import datetime, date
from db.repository import Repository from db.repository import Repository
from db.entities.timeslice import TimeSlice from db.entities.timeslice import TimeSlice
from db.entities import ClusterSet from db.entities import ClusterSet, Cluster
from typing import Tuple, Dict, Any from typing import Tuple, Dict, Any, List
TimeSliceKey = Tuple[int, int] TimeSliceKey = Tuple[int, int]
...@@ -20,28 +20,30 @@ def convert_to_time_slice_key(timestamp: str) -> TimeSliceKey: ...@@ -20,28 +20,30 @@ def convert_to_time_slice_key(timestamp: str) -> TimeSliceKey:
return (y, w) return (y, w)
def split_clusterset_by_time(clustersets) -> Dict[TimeSliceKey, TimeSlice]: def split_clusterset_by_time(layer_name: str, clusters: List[Cluster]) -> Dict[TimeSliceKey, TimeSlice]:
''' '''
Distributes all nodes of a single clusterset into individual time slices based on their timestamps. Distributes all nodes in clusters of a single layer into individual time slices based on their timestamps.
If a node spans over multiple slices it will be added to all of them. If a node spans over multiple slices it will be added to all of them.
Information about clusters and the nodes in the clusters will not be changed. Information about clusters and the nodes in the clusters will not be changed.
:params clustersets: The clusterset whichs nodes are split :params clusters: The clusters whichs nodes are split
:returns: A dict of time slices where the key is the time info and value is the information about the time slice :returns: A dict of time slices where the key is the time info and value is the information about the time slice
''' '''
time_property_names = ['Finished_time', 'Starting_time']
time_slices: Dict[Any, TimeSlice] = {} time_slices: Dict[Any, TimeSlice] = {}
for cluster_no in clusterset.clusters: for cluster_no in clusters:
for node in cluster_no.nodes: for node in cluster_no.nodes:
time_keys = { # retrieve times the node is located in based on the defined time properties in the schema
convert_to_time_slice_key(str(node['Finished_time'])), time_keys = set()
convert_to_time_slice_key(str(node['Starting_time'])) for time_property in time_property_names:
} if time_property in node:
time_keys.add(convert_to_time_slice_key(str(node[time_property])))
for time_key in time_keys: for time_key in time_keys:
if time_key not in time_slices: if time_key not in time_slices:
time_slices[time_key] = TimeSlice(time_key, clusterset.layer_name) time_slices[time_key] = TimeSlice(time_key, layer_name)
time_slices[time_key].add_node_to_cluster(cluster_no.cluster_label, node) time_slices[time_key].add_node_to_cluster(cluster_no.cluster_label, node)
...@@ -53,9 +55,17 @@ if __name__ == "__main__": ...@@ -53,9 +55,17 @@ if __name__ == "__main__":
repo.remove_all_time_slices() repo.remove_all_time_slices()
clustersets = repo.get_clustersets() layers = repo.get_layers()
for clusterset in clustersets: for layer in layers:
time_slices = split_clusterset_by_time(clusterset) layer_name = layer.layer_name
print(f"Working on {layer_name}")
clusters_for_layer = repo.get_clusters_for_layer(layer_name)
# if no clusters were generated use one large cluster instead of skipping the layer
if clusters_for_layer is None or len(clusters_for_layer) == 0:
clusters_for_layer = [Cluster(layer_name, -1, repo.get_layer_nodes(layer_name))]
time_slices = split_clusterset_by_time(layer_name, clusters_for_layer)
for k,v in time_slices.items(): for k,v in time_slices.items():
repo.add_time_slice(v) repo.add_time_slice(v)
...@@ -19,6 +19,10 @@ class MongoRepositoryBase: ...@@ -19,6 +19,10 @@ class MongoRepositoryBase:
collection = self._database[collection_name] collection = self._database[collection_name]
collection.insert_one(content) collection.insert_one(content)
def insert_many(self, collection_name, content: list):
    '''
    Inserts all documents in content into the named collection with one bulk call.

    :param collection_name: The name of the collection to insert into
    :param content: The documents to insert; an empty list is a no-op
    '''
    # the driver's bulk insert rejects an empty document list (PyMongo raises),
    # so "nothing to store" is treated as success instead of an error
    if not content:
        return

    collection = self._database[collection_name]
    collection.insert_many(content)
def get_entries(self, collection_name, selection: dict = {}, projection: dict = {'_': 0}) -> cursor.Cursor: def get_entries(self, collection_name, selection: dict = {}, projection: dict = {'_': 0}) -> cursor.Cursor:
collection = self._database[collection_name] collection = self._database[collection_name]
return collection.find(selection, projection) return collection.find(selection, projection)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment