Splitting clusters based on stages

currently weeks of the year

Splitting clusters based on stages
currently weeks of the year
0e20ca32 · Alexander Lercher · 0beaf5bb · 0e20ca32 · 0e20ca32 · 0e20ca32
Commit 0e20ca32 authored Apr 16, 2020 by Alexander Lercher
3 changed files
--- a/src/data-hub/community-detection-microservice/app/db/entities/timeslice.py
+++ b/src/data-hub/community-detection-microservice/app/db/entities/timeslice.py
+import json
+from typing import List, Dict, TypeVar, Any
+from datetime import date, datetime
+
+Node = TypeVar('Node')
+
+
+class TimeSlice:
+    '''Collects the nodes of one time window, grouped by their cluster label.
+
+    time is stored unchanged in self.time. The parameters nodes,
+    cluster_set_dict and from_db are accepted but not evaluated at the moment
+    (the deserialization below is still commented out).
+    '''
+
+    def __init__(self, time, nodes = None,
+                 cluster_set_dict: Dict = None, from_db = False):
+        # cluster label -> all nodes of that cluster which belong to this slice
+        self.nodes: Dict[int, List[Node]] = {}
+        self.time = time
+
+        # if cluster_set_dict is not None:
+        #     self.from_serializable_dict(cluster_set_dict, from_db)
+
+    def add_node_to_cluster(self, cluster_label, node):
+        '''Appends node to the list kept for cluster_label; the list is created on first use.'''
+        self.nodes.setdefault(cluster_label, []).append(node)
+
+    # todo: serialization, kept as a template until it is adapted for time slices
+
+    # def to_serializable_dict(self, for_db=False) -> Dict:
+    #     serialized_dict_clusters = [cluster.to_serializable_dict(for_db)
+    #                                 for cluster in self.clusters]
+    #     return {
+    #         "layer_name": self.layer_name,
+    #         "clusters": json.dumps(serialized_dict_clusters) if for_db else serialized_dict_clusters
+    #     }
+
+    # def from_serializable_dict(self, cluster_set_dict: Dict, from_db=False):
+    #     self.layer_name = cluster_set_dict["layer_name"]
+
+    #     serialized_dict_clusters = json.loads(cluster_set_dict["clusters"]) \
+    #         if from_db else cluster_set_dict["clusters"]
+    #     self.clusters = [Cluster(cluster_dict=cluster_dict, from_db=from_db)
+    #                      for cluster_dict in serialized_dict_clusters]
+
+    def __str__(self):
+        '''Shows the time key followed by the number of nodes per cluster.'''
+        cluster_sizes = [len(members) for members in self.nodes.values()]
+        return f"TimeSlice({self.time}, {cluster_sizes})"
+
+    def __repr__(self):
+        '''Same text as __str__.'''
+        return str(self)
+        # return {'time': self.time, "#nodes": len(self.nodes)}
+        # json.dumps(self.to_serializable_dict())
--- a/src/data-hub/community-detection-microservice/app/db/repository.py
+++ b/src/data-hub/community-detection-microservice/app/db/repository.py
@@ -94,14 +94,17 @@ class Repository(MongoRepositoryBase):
        super().insert_entry(self._clusterset_collection, cluster_set.to_serializable_dict())

    def get_clustersets(self) -> List[ClusterSet]:
+        '''Returns all clustersets.
+
+        Every entry read from the clusterset collection is wrapped in a ClusterSet.
+        '''
        entries = super().get_entries(self._clusterset_collection)
        return [ClusterSet(cluster_set_dict=e) for e in entries]

    def get_clusterset_names(self) -> List[str]:
+        '''Returns the names of all clustersets.
+
+        The query passes a projection on 'layer_name'; the value of that
+        field is returned for every entry.
+        '''
        entries = super().get_entries(self._clusterset_collection, projection={'layer_name': 1})
        return [e['layer_name'] for e in entries]

    def get_clusterset(self, layer_name) -> ClusterSet:
+        '''Returns a single clusterset with the given name or None otherwise.'''
        entries = super().get_entries(self._clusterset_collection, selection={'layer_name': layer_name})
        entries = [ClusterSet(cluster_set_dict=e) for e in entries]


--- a/src/data-hub/community-detection-microservice/app/run_time_slicing.py
+++ b/src/data-hub/community-detection-microservice/app/run_time_slicing.py
+import sys
+import os
+modules_path = '../../../modules/'
+if os.path.exists(modules_path):
+    sys.path.insert(1, modules_path)
+
+import json
+from datetime import datetime, date
+import matplotlib.pyplot as plt
+from db.repository import Repository
+from db.entities.timeslice import TimeSlice
+from db.entities import ClusterSet
+from typing import Tuple
+
+# repo = Repository()
+
+
+def convert_to_time_slice_key(timestamp: str) -> Tuple[int, int]:
+    '''Returns the tuple (year, week_of_year) from a timestamp.
+
+    Only the first 10 characters of the timestamp are used; they are read as
+    seconds since the epoch and converted to local time. Year and week follow
+    the ISO calendar.
+    '''
+    epoch_seconds = float(timestamp[:10])
+    iso_year, iso_week, _weekday = datetime.fromtimestamp(epoch_seconds).isocalendar()
+    return (iso_year, iso_week)
+
+
+def get_clusterset():
+    '''Loads a clusterset from the locally stored clustering results.
+
+    The JSON content of the result file is handed to ClusterSet as
+    cluster_set_dict. Judging by the file name this is the clusterset of the
+    'Destination_Layer' - verify against the file content.
+
+    Raises whatever open() or json.load() raise if the file is missing or
+    does not contain valid JSON.
+    '''
+    # alternative source once the repository is used:
+    # clusterset = repo.get_clusterset('Destination_Layer')
+    with open('clustering_results/optics/clusterset_Destination_Layer.txt',
+              encoding='utf-8') as file:
+        return ClusterSet(cluster_set_dict=json.load(file))
+
+
+def plt_show_circles(keys, time_slices, cluster_no):
+    '''Draws the nodes of one cluster slice by slice as a scatter plot.
+
+    For every key the plot title is set to the key and the destinations of
+    the cluster's nodes in that slice are added, with a marker size that
+    grows with the number of nodes. Slices without the cluster add nothing.
+    After every slice the plot pauses for half a second.
+    '''
+    for key in keys:
+        members = time_slices[key].nodes.get(cluster_no, [])
+
+        plt.title(str(key))
+
+        longitudes = [member['Longitude_Destination'] for member in members]
+        latitudes = [member['Latitude_Destination'] for member in members]
+        # all markers of one slice share the same size
+        marker_sizes = [len(members)*100]*len(members)
+        plt.scatter(longitudes, latitudes, s=marker_sizes)
+
+        plt.pause(0.5)
+
+
+def plt_show_bars(keys, time_slices, cluster_no):
+    '''Shows a bar chart with the size of one cluster in every time slice.
+
+    keys gives the order of the bars; a slice that does not contain the
+    cluster is drawn with height 0. Only every 10th bar gets an x-axis label.
+    '''
+    tick_step = 10
+
+    cluster_sizes = []
+    for key in keys:
+        members = time_slices[key].nodes.get(cluster_no, [])
+        cluster_sizes.append(len(members))
+
+    bar_positions = range(len(keys))
+
+    _, ax = plt.subplots()
+    ax.bar(x=bar_positions, height=cluster_sizes)
+
+    ax.set_ylabel('Size')
+    ax.set_title(f'Cluster-{cluster_no} size over time')
+    ax.set_xticks(bar_positions[::tick_step])
+    ax.set_xticklabels(keys[::tick_step])
+
+    plt.show()
+
+
+
+# Script part: split the clusters of the loaded clusterset into time slices
+# and plot the size of a single cluster over time.
+clusterset = get_clusterset()
+# print(clusterset.layer_name)
+
+# time slice key -> TimeSlice holding the nodes that fall into that slice
+time_slices = {}
+# for clusterset in clustersets:
+for cluster in clusterset.clusters:
+    for node in cluster.nodes:
+        # assign the nodes to time slices and recreate the clusters there
+        time_key = convert_to_time_slice_key(str(node['Finished_time']))
+
+        if time_key not in time_slices:
+            time_slices[time_key] = TimeSlice(time_key)
+
+        time_slices[time_key].add_node_to_cluster(cluster.cluster_label, node)
+
+# sort chronologically
+keys = sorted(time_slices)
+
+plt_show_bars(keys, time_slices, cluster_no=20)
+
+