Merge branch 'feature/network-stages' into develop

bd4aa55b · Alexander Lercher · 0e20ca32 · c66bd0dd · bd4aa55b · bd4aa55b
Commit bd4aa55b authored Apr 22, 2020 by Alexander Lercher
9 changed files
--- a/src/data-hub/community-detection-microservice/app/configs/swagger.yml
+++ b/src/data-hub/community-detection-microservice/app/configs/swagger.yml
@@ -228,14 +228,14 @@ paths:
                    items:
                        type: string

-  /clustersets/{name}:
+  /clustersets/{layername}:
    get:
        operationId: "routes.clustersets.get_by_name"
        tags:
            - "Clusters"
        summary: "Get clusterset for layer-name"
        parameters: 
-          - name: "name"
+          - name: "layername"
            in: "path"
            description: "Name of the layer to return the clusterset for"
            required: true
@@ -262,6 +262,41 @@ paths:
                schema:
                  $ref: "#/definitions/UserClusterGraphCollection"

+# Time slices
+  /timeslices:
+    get:
+        operationId: "routes.timeslices.get"
+        tags:
+            - "Time Slices"
+        summary: "Get all time slices based on individual layers containing clusters with nodes for that time"
+        parameters: []
+        responses:
+            200:
+                description: "Successful operation"
+                schema:
+                    $ref: "#/definitions/TimeSliceCollection"
+                
+                
+  /timeslices/{layername}:
+    get:
+        operationId: "routes.timeslices.get_by_name"
+        tags:
+            - "Time Slices"
+        summary: "Get all time slices for one layer"
+        parameters:
+          - name: "layername"
+            in: "path"
+            description: "Name of the layer to return the time slices for"
+            required: true
+            type: "string"
+        responses:
+            200:
+                description: "Successful operation"
+                schema:
+                    $ref: "#/definitions/TimeSliceCollection"
+            404:
+                description: "No time slices found for layername"
+
 # Function Calls
  /rfc/run:
    post:
@@ -422,4 +457,31 @@ definitions:
  ClusterSetCollection:
    type: array
    items:
-      $ref: "#/definitions/ClusterSet"
\ No newline at end of file
+      $ref: "#/definitions/ClusterSet"
+
+  TimeSlice:
+    type: object
+    properties:
+        time: 
+            type: string  # str() of the (year, week) slice key, not a nested object
+            example: "(2020, 52)"
+        layer_name:
+            type: string
+        clusters:
+            type: object
+            additionalProperties:
+                type: array
+                items:
+                    type: object
+                    properties:
+                        UniqueID:
+                            type: string
+            example:
+                "0": 
+                    - UniqueID: abc
+                    - UniqueID: def
+  
+  TimeSliceCollection:
+    type: array
+    items:
+        $ref: "#/definitions/TimeSlice"
\ No newline at end of file
--- a/src/data-hub/community-detection-microservice/app/db/entities/__init__.py
+++ b/src/data-hub/community-detection-microservice/app/db/entities/__init__.py
@@ -4,3 +4,4 @@ from db.entities.cluster import Cluster, LocationCluster, TimeCluster
 from db.entities.clusterset import ClusterSet
 from db.entities.user_cluster_graph import UserClusterGraph
 from db.entities.layer import Layer
+from db.entities.timeslice import TimeSlice
\ No newline at end of file
--- a/src/data-hub/community-detection-microservice/app/db/entities/timeslice.py
+++ b/src/data-hub/community-detection-microservice/app/db/entities/timeslice.py
 import json
-from typing import List, Dict, TypeVar, Any
+from typing import List, Dict, NewType, Any
 from datetime import date, datetime

-Node = TypeVar('Node')
-
+Node = NewType('Node', dict)

 class TimeSlice:
-
-    def __init__(self, time, nodes = None,
-                 cluster_set_dict: Dict = None, from_db = False):
-        self.time = time
-        self.nodes: Dict[int, List[Node]] = {}
-
-        # if cluster_set_dict is not None:
-        #     self.from_serializable_dict(cluster_set_dict, from_db)
-
-    def add_node_to_cluster(self, cluster_label, node):
-        if cluster_label not in self.nodes:
-            self.nodes[cluster_label] = []
-
-        self.nodes[cluster_label].append(node)
-
-    # todo
-
-    # def to_serializable_dict(self, for_db=False) -> Dict:
-    #     serialized_dict_clusters = [cluster.to_serializable_dict(for_db)
-    #                                 for cluster in self.clusters]
-    #     return {
-    #         "layer_name": self.layer_name,
-    #         "clusters": json.dumps(serialized_dict_clusters) if for_db else serialized_dict_clusters
-    #     }
-
-    # def from_serializable_dict(self, cluster_set_dict: Dict, from_db=False):
-    #     self.layer_name = cluster_set_dict["layer_name"]
-
-    #     serialized_dict_clusters = json.loads(cluster_set_dict["clusters"]) \
-    #         if from_db else cluster_set_dict["clusters"]
-    #     self.clusters = [Cluster(cluster_dict=cluster_dict, from_db=from_db)
-    #                      for cluster_dict in serialized_dict_clusters]
+    '''
+    A time slice for a single layer containing all nodes for that time.
+
+    :param time: The tag indicating the time
+    :param layer_name: The name of the layer the nodes belong to
+    '''
+
+    def __init__(self, time: Any, layer_name: str,
+                 time_slice_dict: Dict = None, from_db = False):
+        self.time = str(time)
+        self.layer_name = layer_name
+        self.clusters: Dict[str, List[Node]] = {}
+
+        if time_slice_dict is not None:
+            self.from_serializable_dict(time_slice_dict, from_db)
+
+    def add_node_to_cluster(self, cluster_label: str, node):
+        # only string keys can be stored in json
+        cluster_label = str(cluster_label)
+
+        if cluster_label not in self.clusters:
+            self.clusters[cluster_label] = []
+
+        node = self._get_unique_id(node)
+        self.clusters[cluster_label].append(node)
+
+    def get_nodes_for_cluster(self, cluster_label: str):
+        if cluster_label in self.clusters:
+            return self.clusters[cluster_label]
+        else:
+            return []
+        
+    def _get_unique_id(self, node : Dict) -> Dict:
+        '''Returns a new dict with the unique id only.'''
+        uid_key = 'UniqueID'
+        if uid_key in node:
+            return {uid_key: node[uid_key]}
+
+
+    def to_serializable_dict(self, for_db=False) -> Dict:
+        return {
+            "time": self.time,
+            'layer_name': self.layer_name,
+            "clusters": json.dumps(self.clusters) if for_db else self.clusters
+        }
+
+    def from_serializable_dict(self, dict: Dict, from_db=False):
+        self.time = dict["time"]
+        self.layer_name = dict['layer_name']
+        self.clusters = json.loads(dict['clusters']) if from_db else dict['clusters']

    def __repr__(self):
-        return self.__str__()
-        # return {'time': self.time, "#nodes": len(self.nodes)}
-        # json.dumps(self.to_serializable_dict())
+        return json.dumps(self.to_serializable_dict())

    def __str__(self):
-        return f"TimeSlice({self.time}, {[len(v) for k, v in self.nodes.items()]})"
+        return f"TimeSlice({self.__repr__()})"
--- a/src/data-hub/community-detection-microservice/app/db/repository.py
+++ b/src/data-hub/community-detection-microservice/app/db/repository.py
@@ -23,6 +23,7 @@ class Repository(MongoRepositoryBase):
        self._user_cluster_graph_collection = 'user_cluster_graph'
        self._layer_collection = 'layer'
        self._clusterset_collection = 'cluster_set'
+        self._time_slice_collection = 'time_slice'

        self.agi_repo = AgiRepository()

@@ -113,3 +114,21 @@ class Repository(MongoRepositoryBase):
        else:
            return None
 #endregion
+
+#region TimeSlice
+    def add_time_slice(self, timeslice: TimeSlice):
+        super().insert_entry(self._time_slice_collection, timeslice.to_serializable_dict(for_db=True))
+
+    def get_time_slices(self) -> List[TimeSlice]:
+        '''Returns all time slices.'''
+        entries = super().get_entries(self._time_slice_collection)
+        return [TimeSlice(None, None, time_slice_dict=e, from_db=True) for e in entries]
+
+    def get_time_slices_by_name(self, layer_name) -> List[TimeSlice]:
+        '''Returns all time slices with the given layer_name.'''
+        entries = super().get_entries(self._time_slice_collection, selection={'layer_name': layer_name})
+        return [TimeSlice(None, None, time_slice_dict=e, from_db=True) for e in entries]
+
+    def remove_all_time_slices(self):
+        super().drop_collection(self._time_slice_collection)
+#endregion
\ No newline at end of file
--- a/src/data-hub/community-detection-microservice/app/routes/clustersets.py
+++ b/src/data-hub/community-detection-microservice/app/routes/clustersets.py
@@ -10,8 +10,8 @@ def get():
 def get_names():
    return repo.get_clusterset_names()

-def get_by_name(name):
-    res = repo.get_clusterset(name)
+def get_by_name(layername):
+    res = repo.get_clusterset(layername)
    if res is not None:
        return res.to_serializable_dict() 
    else:

--- a/src/data-hub/community-detection-microservice/app/routes/timeslices.py
+++ b/src/data-hub/community-detection-microservice/app/routes/timeslices.py
+from flask import request, Response
+from db.repository import Repository
+from db.entities import TimeSlice
+
+repo = Repository()
+
+
+def get():
+    return [e.to_serializable_dict() for e in repo.get_time_slices()]
+
+
+def get_by_name(layername):
+    res = repo.get_time_slices_by_name(layername)
+    print(len(res))
+    
+    if res is not None and len(res) != 0:
+        return [e.to_serializable_dict() for e in res]
+    else:
+        return Response(status=404)
--- a/src/data-hub/community-detection-microservice/app/run_time_slicing.py
+++ b/src/data-hub/community-detection-microservice/app/run_time_slicing.py
@@ -6,132 +6,56 @@ if os.path.exists(modules_path):

 import json
 from datetime import datetime, date
-import matplotlib.pyplot as plt
 from db.repository import Repository
 from db.entities.timeslice import TimeSlice
 from db.entities import ClusterSet
-from typing import Tuple
+from typing import Tuple, Dict, Any

-# repo = Repository()
+TimeSliceKey = Tuple[int, int]

-
-def convert_to_time_slice_key(timestamp: str) -> Tuple[int, int]:
-    '''Returns the tuple (year, week_of_year) from a timestamp.'''
-    timestamp = datetime.fromtimestamp(float(timestamp[0:10]))
-    (y, w, _) = timestamp.isocalendar()
+def convert_to_time_slice_key(timestamp: str) -> TimeSliceKey:
+    '''Returns the tuple (year, week_of_year) from a timestamp. This is used as the key for the slicing.'''
+    time = datetime.utcfromtimestamp(float(timestamp[0:10]))
+    (y, w, _) = time.isocalendar()
    return (y, w)


-def get_clusterset():
-    # clusterset = repo.get_clusterset('Destination_Layer')
-    with open('clustering_results/optics/clusterset_Destination_Layer.txt') as file:
-        clusterset = ClusterSet(cluster_set_dict=json.loads(file.read()))
-    return clusterset
-
-    clusterset = ClusterSet(cluster_set_dict={
-        "clusters": [{
-            "cluster_label": 0,
-            "nodes": [{
-                    "Finished_time": 1579143634812589,
-                    "Latitude_Destination": -5.95081,
-                    "Longitude_Destination": 37.415281,
-                    "TravelID": "5e57ec9159bc0668543f1568",
-                    "TravelPrice": 19,
-                    "UniqueID": "2696718d7a33ab3dbf28e9c88411afcfe9a933a45e57ec9159bc0668543f1568",
-                    "UserID": "2696718d7a33ab3dbf28e9c88411afcfe9a933a4",
-                    "cluster_label": 0
-                }, {
-                    "Finished_time": 1582709512112368,
-                    "Latitude_Destination": -5.95081,
-                    "Longitude_Destination": 37.415281,
-                    "TravelID": "5e57ec9159bc0668543f15cf",
-                    "TravelPrice": 16,
-                    "UniqueID": "98dcb2717ddae152d5b359c6ea97e4fe34a29d4c5e57ec9159bc0668543f15cf",
-                    "UserID": "98dcb2717ddae152d5b359c6ea97e4fe34a29d4c",
-                    "cluster_label": 0
-                }, {
-                    "Finished_time": 1582709512112367,
-                    "Latitude_Destination": -5.95081,
-                    "Longitude_Destination": 37.415281,
-                    "TravelID": "5e57ec9159bc0668543f15cf",
-                    "TravelPrice": 16,
-                    "UniqueID": "98dcb2717ddae152d5b359c6ea97e4fe34a29d4c5e57ec9159bc0668543f15cd",
-                    "UserID": "98dcb2717ddae152d5b359c6ea97e4fe34a29d4c",
-                    "cluster_label": 0
-                }]
-            }],
-        "layer_name": "Destination_Layer"
-    })
-
-    return clusterset
-
-
-def plt_show_circles(keys, time_slices, cluster_no):
-    for k in keys:
-        slice_ = time_slices[k]
-
-        if cluster_no in slice_.nodes:
-            nodes = slice_.nodes[cluster_no] 
-        else:
-            nodes = []    
-            
-        # print(f"{slice_.time} number elements for cluster {cluster_no}: {len(nodes)}")
-
-        plt.title(str(k))
-
-        plt.scatter([n['Longitude_Destination'] for n in nodes],
-                    [n['Latitude_Destination'] for n in nodes],
-                    s=[len(nodes)*100]*len(nodes))
-
-        plt.pause(0.5)
+def split_clusterset_by_time(clustersets) -> Dict[TimeSliceKey, TimeSlice]:
+    '''
+    Distributes all nodes of a single clusterset into individual time slices based on their timestamps. 
+    If a node spans over multiple slices it will be added to all of them.
+    Information about clusters and the nodes in the clusters will not be changed.

+    :params clustersets: The clusterset whichs nodes are split
+    :returns: A dict of time slices where the key is the time info and value is the information about the time slice
+    '''

-def plt_show_bars(keys, time_slices, cluster_no):
-    x_axis_label_stepsize = 10
+    time_slices: Dict[Any, TimeSlice] = {} 
+    for cluster_no in clusterset.clusters:
+        for node in cluster_no.nodes:

-    nodes_per_slice_for_single_cluster = \
-            [len(time_slices[k].nodes[cluster_no])
-                if cluster_no in time_slices[k].nodes
-                else 0
-            for k 
-            in keys]
+            time_keys = {
+                convert_to_time_slice_key(str(node['Finished_time'])), 
+                convert_to_time_slice_key(str(node['Starting_time']))
+            }

-    fig, ax = plt.subplots()
-    ax.bar(x=range(len(keys)),
-        height=nodes_per_slice_for_single_cluster)
+            for time_key in time_keys:
+                if time_key not in time_slices:
+                    time_slices[time_key] = TimeSlice(time_key, clusterset.layer_name)

-    ax.set_ylabel('Size')
-    ax.set_title(f'Cluster-{cluster_no} size over time')
-    ax.set_xticks(range(len(keys))[::x_axis_label_stepsize])
-    ax.set_xticklabels(keys[::x_axis_label_stepsize])
+                time_slices[time_key].add_node_to_cluster(cluster_no.cluster_label, node)

-    plt.show()
-
-
-
-clusterset = get_clusterset()
-# print(clusterset.layer_name)
-
-cnt = 0
-time_slices = {} 
-# for clusterset in clustersets:
-for cluster_no in clusterset.clusters:
-    for node in cluster_no.nodes:
-        # assign the nodes to time slices and recreate the clusters there
-        time_key = convert_to_time_slice_key(str(node['Finished_time']))
+    return time_slices
        
-        if time_key not in time_slices:
-            time_slices[time_key] = TimeSlice(time_key)
-
-        time_slices[time_key].add_node_to_cluster(cluster_no.cluster_label, node)
-        
-
-# sort chronologically
-keys = list(time_slices.keys())
-keys.sort()
-

+if __name__ == "__main__":
+    repo = Repository()

-plt_show_bars(keys, time_slices, cluster_no = 20)
+    repo.remove_all_time_slices()

+    clustersets = repo.get_clustersets()
+    for clusterset in clustersets:
+        time_slices = split_clusterset_by_time(clusterset)

+        for k,v in time_slices.items():
+            repo.add_time_slice(v)
--- a/src/data-hub/community-detection-microservice/app/visualization/visualize_time_slices.py
+++ b/src/data-hub/community-detection-microservice/app/visualization/visualize_time_slices.py
+import sys
+import os
+for path in ['../', './', '../../../modules/']:
+    if os.path.exists(path):
+        sys.path.insert(1, path)
+
+import matplotlib.pyplot as plt
+from db.repository import Repository
+from db.entities import TimeSlice
+from typing import List
+
+
+def plt_show_circles(time_slices: List[TimeSlice], cluster_no):
+    cluster_no = str(cluster_no)
+
+    for slice_ in time_slices:
+
+        nodes = slice_.get_nodes_for_cluster(cluster_no)
+            
+        # print(f"{slice_.time} number elements for cluster {cluster_no}: {len(nodes)}")
+
+        plt.title(str(slice_.time))
+
+        plt.scatter([n['Longitude_Destination'] if 'Longitude_Destination' in n else 0
+                        for n in nodes],
+                    [n['Latitude_Destination'] if 'Latitude_Destination' in n else 0
+                        for n in nodes],
+                    s=[len(nodes)*100]*len(nodes))
+
+        plt.pause(0.5)
+
+
+def plt_show_bars(time_slices: List[TimeSlice], cluster_no):
+    cluster_no = str(cluster_no)
+    
+    labels = [ts.time for ts in time_slices]  
+    x_axis_label_stepsize = 10  
+
+    nodes_per_slice_for_single_cluster = \
+            [len(time_slice.get_nodes_for_cluster(cluster_no))
+            for time_slice
+            in time_slices]
+
+    fig, ax = plt.subplots()
+    ax.bar(x=range(len(labels)),
+        height=nodes_per_slice_for_single_cluster)
+
+    ax.set_ylabel('Size')
+    ax.set_title(f'Cluster-{cluster_no} size over time')
+    ax.set_xticks(range(len(labels))[::x_axis_label_stepsize])
+    ax.set_xticklabels(labels[::x_axis_label_stepsize])
+
+    plt.show()
+
+
+if __name__ == "__main__":
+    repo = Repository()
+    time_slices = repo.get_time_slices_by_name("Destination_Layer")
+
+    # chronological order
+    time_slices.sort(key=lambda ts: eval(ts.time))
+
+    print(len(time_slices))
+    plt_show_bars(time_slices, cluster_no = 0)
\ No newline at end of file
--- a/src/modules/database/MongoRepositoryBase.py
+++ b/src/modules/database/MongoRepositoryBase.py
@@ -12,6 +12,9 @@ class MongoRepositoryBase:
        self._mongo_client = MongoClient(f"mongodb://{username}:{password}@{hostname}:{port}/")
        self._database = self._mongo_client[database_name]

+    def drop_collection(self, collection_name):
+        self._database[collection_name].drop()
+
    def insert_entry(self, collection_name, content: dict):
        collection = self._database[collection_name]
        collection.insert_one(content)