Commit 0e20ca32 authored by Alexander Lercher

Splitting clusters based on stages

currently weeks of the year
parent 0beaf5bb
import json
from typing import List, Dict, TypeVar, Any
from datetime import date, datetime
Node = TypeVar('Node')
class TimeSlice:
    '''Nodes of one time window, grouped by the label of the cluster they belong to.'''

    def __init__(self, time, nodes = None,
                 cluster_set_dict: Dict = None, from_db = False):
        '''
        :param time: key identifying the time window covered by this slice
        :param nodes: optional initial mapping of cluster label to a list of nodes;
            the mapping and its lists are copied, so nodes added later do not
            modify the caller's containers
        :param cluster_set_dict: currently unused, reserved for deserialization
        :param from_db: currently unused, reserved for deserialization
        '''
        self.time = time
        self.nodes: Dict[int, List[Node]] = {}
        if nodes is not None:
            for label, members in nodes.items():
                self.nodes[label] = list(members)

        # TODO: implement to_serializable_dict / from_serializable_dict
        # and use cluster_set_dict and from_db here once they exist.

    def add_node_to_cluster(self, cluster_label, node):
        '''Appends the node to the cluster with the given label, creating the cluster if needed.'''
        self.nodes.setdefault(cluster_label, []).append(node)

    def __repr__(self):
        return self.__str__()

    def __str__(self):
        '''Returns the time key followed by the number of nodes in each cluster.'''
        return f"TimeSlice({self.time}, {[len(members) for members in self.nodes.values()]})"
...@@ -94,14 +94,17 @@ class Repository(MongoRepositoryBase): ...@@ -94,14 +94,17 @@ class Repository(MongoRepositoryBase):
super().insert_entry(self._clusterset_collection, cluster_set.to_serializable_dict()) super().insert_entry(self._clusterset_collection, cluster_set.to_serializable_dict())
def get_clustersets(self) -> List[ClusterSet]: def get_clustersets(self) -> List[ClusterSet]:
'''Returns all clustersets.'''
entries = super().get_entries(self._clusterset_collection) entries = super().get_entries(self._clusterset_collection)
return [ClusterSet(cluster_set_dict=e) for e in entries] return [ClusterSet(cluster_set_dict=e) for e in entries]
def get_clusterset_names(self) -> List[str]: def get_clusterset_names(self) -> List[str]:
'''Returns the names of all clustersets.'''
entries = super().get_entries(self._clusterset_collection, projection={'layer_name': 1}) entries = super().get_entries(self._clusterset_collection, projection={'layer_name': 1})
return [e['layer_name'] for e in entries] return [e['layer_name'] for e in entries]
def get_clusterset(self, layer_name) -> ClusterSet: def get_clusterset(self, layer_name) -> ClusterSet:
'''Returns a single clusterset with the given name or None otherwise.'''
entries = super().get_entries(self._clusterset_collection, selection={'layer_name': layer_name}) entries = super().get_entries(self._clusterset_collection, selection={'layer_name': layer_name})
entries = [ClusterSet(cluster_set_dict=e) for e in entries] entries = [ClusterSet(cluster_set_dict=e) for e in entries]
......
import sys
import os
modules_path = '../../../modules/'
if os.path.exists(modules_path):
sys.path.insert(1, modules_path)
import json
from datetime import datetime, date
import matplotlib.pyplot as plt
from db.repository import Repository
from db.entities.timeslice import TimeSlice
from db.entities import ClusterSet
from typing import Tuple
# repo = Repository()
def convert_to_time_slice_key(timestamp: str) -> Tuple[int, int]:
    '''Returns the tuple (year, week_of_year) from a timestamp.

    The first ten characters of the timestamp are read as whole seconds
    since the Unix epoch; any further digits are ignored. The value is
    converted with datetime.fromtimestamp, i.e. in the local time zone of
    the machine running the script, and mapped to its ISO calendar week.

    :param timestamp: epoch timestamp as a string of digits; a numeric value
        is accepted as well and is converted to its string form first
    :returns: (ISO year, ISO week number)
    '''
    # str() lets callers pass the raw numeric value without converting it.
    epoch_seconds = float(str(timestamp)[0:10])
    iso_year, iso_week, _ = datetime.fromtimestamp(epoch_seconds).isocalendar()
    return (iso_year, iso_week)
def get_clusterset():
    '''Loads the Destination_Layer clusterset from its JSON result file.

    The path 'clustering_results/optics/clusterset_Destination_Layer.txt' is
    relative, so it is resolved against the current working directory.

    :returns: the ClusterSet built from the parsed file content
    '''
    # clusterset = repo.get_clusterset('Destination_Layer')
    with open('clustering_results/optics/clusterset_Destination_Layer.txt') as file:
        return ClusterSet(cluster_set_dict=json.load(file))


def _get_sample_clusterset():
    '''Returns a small hard-coded clusterset with one cluster of three nodes.

    This data used to sit unreachable behind the return statement of
    get_clusterset; it is kept as a fixture for trying out the script
    without the result file.
    '''
    return ClusterSet(cluster_set_dict={
        "clusters": [{
            "cluster_label": 0,
            "nodes": [{
                "Finished_time": 1579143634812589,
                "Latitude_Destination": -5.95081,
                "Longitude_Destination": 37.415281,
                "TravelID": "5e57ec9159bc0668543f1568",
                "TravelPrice": 19,
                "UniqueID": "2696718d7a33ab3dbf28e9c88411afcfe9a933a45e57ec9159bc0668543f1568",
                "UserID": "2696718d7a33ab3dbf28e9c88411afcfe9a933a4",
                "cluster_label": 0
            }, {
                "Finished_time": 1582709512112368,
                "Latitude_Destination": -5.95081,
                "Longitude_Destination": 37.415281,
                "TravelID": "5e57ec9159bc0668543f15cf",
                "TravelPrice": 16,
                "UniqueID": "98dcb2717ddae152d5b359c6ea97e4fe34a29d4c5e57ec9159bc0668543f15cf",
                "UserID": "98dcb2717ddae152d5b359c6ea97e4fe34a29d4c",
                "cluster_label": 0
            }, {
                "Finished_time": 1582709512112367,
                "Latitude_Destination": -5.95081,
                "Longitude_Destination": 37.415281,
                "TravelID": "5e57ec9159bc0668543f15cf",
                "TravelPrice": 16,
                "UniqueID": "98dcb2717ddae152d5b359c6ea97e4fe34a29d4c5e57ec9159bc0668543f15cd",
                "UserID": "98dcb2717ddae152d5b359c6ea97e4fe34a29d4c",
                "cluster_label": 0
            }]
        }],
        "layer_name": "Destination_Layer"
    })
def plt_show_circles(keys, time_slices, cluster_no):
    '''Plots the destinations of one cluster, one time slice after the other.

    For every key a scatter plot of the nodes' longitude/latitude is drawn,
    titled with the key, and the figure is paused for half a second.
    The marker size grows with the number of nodes in the slice.

    :param keys: time slice keys in the order in which they are shown
    :param time_slices: mapping of key to time slice
    :param cluster_no: label of the cluster to show
    '''
    for key in keys:
        # a slice without this cluster contributes an empty plot
        members = time_slices[key].nodes.get(cluster_no, [])
        # print(f"{slice_.time} number elements for cluster {cluster_no}: {len(nodes)}")

        plt.title(str(key))

        longitudes = [member['Longitude_Destination'] for member in members]
        latitudes = [member['Latitude_Destination'] for member in members]
        marker_sizes = [len(members)*100]*len(members)
        plt.scatter(longitudes, latitudes, s=marker_sizes)

        plt.pause(0.5)
def plt_show_bars(keys, time_slices, cluster_no, x_axis_label_stepsize=10):
    '''Shows the size of one cluster per time slice as a bar chart.

    :param keys: time slice keys in the order in which the bars are drawn
    :param time_slices: mapping of key to time slice
    :param cluster_no: label of the cluster whose size is plotted
    :param x_axis_label_stepsize: only every n-th bar gets an x-axis label,
        which keeps the labels readable when there are many slices
    '''
    # slices that do not contain the cluster count as size 0
    nodes_per_slice_for_single_cluster = [
        len(time_slices[k].nodes.get(cluster_no, []))
        for k in keys]

    positions = range(len(keys))

    fig, ax = plt.subplots()
    ax.bar(x=positions,
           height=nodes_per_slice_for_single_cluster)
    ax.set_ylabel('Size')
    ax.set_title(f'Cluster-{cluster_no} size over time')
    # ticks and labels use the same step so they stay aligned
    ax.set_xticks(positions[::x_axis_label_stepsize])
    ax.set_xticklabels(keys[::x_axis_label_stepsize])
    plt.show()
clusterset = get_clusterset()
# print(clusterset.layer_name)

cnt = 0
time_slices = {}

# Distribute every node of every cluster to the time slice of its
# 'Finished_time', keeping the node's cluster label inside the slice.
# for clusterset in clustersets:
for cluster in clusterset.clusters:
    for node in cluster.nodes:
        time_key = convert_to_time_slice_key(str(node['Finished_time']))

        # create the slice the first time its key shows up
        if time_key not in time_slices:
            time_slices[time_key] = TimeSlice(time_key)

        time_slices[time_key].add_node_to_cluster(cluster.cluster_label, node)

# the (year, week) keys sort chronologically
keys = sorted(time_slices.keys())

plt_show_bars(keys, time_slices, cluster_no = 20)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment