Slicing for all clusters using start- and end-timestamp

c66bd0dd · Alexander Lercher · 9885bbf7 · c66bd0dd
Commit c66bd0dd authored Apr 22, 2020 by Alexander Lercher
Hide whitespace changes
Inline Side-by-side

Showing with 26 additions and 22 deletions

run_time_slicing.py .../community-detection-microservice/app/run_time_slicing.py +26 -22

No files found.
--- a/src/data-hub/community-detection-microservice/app/run_time_slicing.py
+++ b/src/data-hub/community-detection-microservice/app/run_time_slicing.py
@@ -11,47 +11,51 @@ from db.entities.timeslice import TimeSlice
 from db.entities import ClusterSet
 from typing import Tuple, Dict, Any

+TimeSliceKey = Tuple[int, int]

-def convert_to_time_slice_key(timestamp: str) -> Tuple[int, int]:
+def convert_to_time_slice_key(timestamp: str) -> TimeSliceKey:
    '''Returns the tuple (year, week_of_year) from a timestamp. This is used as the key for the slicing.'''
-    timestamp = datetime.fromtimestamp(float(timestamp[0:10]))
-    (y, w, _) = timestamp.isocalendar()
+    time = datetime.utcfromtimestamp(float(timestamp[0:10]))
+    (y, w, _) = time.isocalendar()
    return (y, w)


-def split_clustersets_by_time(clustersets) -> Dict[Any, TimeSlice]:
+def split_clusterset_by_time(clustersets) -> Dict[TimeSliceKey, TimeSlice]:
    '''
-    Partitions all nodes of each clusterset into idividual time slices based on their timestamp. The information about the cluster is kept.
+    Distributes all nodes of a single clusterset into individual time slices based on their timestamps. 
+    If a node's start and end timestamps fall into different slices, it is added to both of them (slices strictly in between are not filled in).
+    Information about clusters and the nodes in the clusters will not be changed.

-    :params clustersets: The clustersets whichs nodes are split
+    :params clustersets: The clusterset whose nodes are split
    :returns: A dict of time slices where the key is the time info and value is the information about the time slice
    '''
-    cnt = 0
+
    time_slices: Dict[Any, TimeSlice] = {} 
-    for clusterset in clustersets:
-        for cluster_no in clusterset.clusters:
-            for node in cluster_no.nodes:
-                # assign the nodes to time slices and recreate the clusters there
-                # TODO use start and end time for assignment
-                time_key = convert_to_time_slice_key(str(node['Finished_time']))
-                
+    for cluster_no in clusterset.clusters:
+        for node in cluster_no.nodes:
+
+            time_keys = {
+                convert_to_time_slice_key(str(node['Finished_time'])), 
+                convert_to_time_slice_key(str(node['Starting_time']))
+            }
+
+            for time_key in time_keys:
                if time_key not in time_slices:
                    time_slices[time_key] = TimeSlice(time_key, clusterset.layer_name)

                time_slices[time_key].add_node_to_cluster(cluster_no.cluster_label, node)
+
    return time_slices
        

 if __name__ == "__main__":
    repo = Repository()

-    clustersets = [repo.get_clusterset('Destination_Layer')]
-    time_slices = split_clustersets_by_time(clustersets)
+    repo.remove_all_time_slices()

-    # sort chronologically
-    keys = list(time_slices.keys())
-    keys.sort()
+    clustersets = repo.get_clustersets()
+    for clusterset in clustersets:
+        time_slices = split_clusterset_by_time(clusterset)

-    repo.remove_all_time_slices()
-    for k,v in time_slices.items():
-        repo.add_time_slice(v)
+        for k,v in time_slices.items():
+            repo.add_time_slice(v)