Commit 1d90011e authored by Alexander

time clustering for each hour

parent e4993f0c
from db.entities.location import Location from db.entities.location import Location
from db.entities.popular_location import PopularLocation from db.entities.popular_location import PopularLocation
from db.entities.user_cluster import UserCluster from db.entities.cluster import LocationCluster, TimeCluster
\ No newline at end of file \ No newline at end of file
import json
from typing import List, Dict
from datetime import date, datetime
class Cluster:
    """Base class holding a cluster label together with its ``clusters`` list."""

    def __init__(self, cluster_label: int = None, clusters: List = None):
        # Both values are stored exactly as passed in (no copy, no validation).
        self.cluster_label, self.clusters = cluster_label, clusters
class LocationCluster(Cluster):
    """Cluster that can be converted to and from a plain dictionary.

    An instance is populated from ``cluster_label``/``clusters`` and, when
    ``location_dict`` is supplied, subsequently overwritten with the values
    found in that dictionary.
    """

    def __init__(self, cluster_label: int = None, clusters: List = None,
                 location_dict: Dict = None, from_db=False):
        super().__init__(cluster_label, clusters)
        # Default id is the textual form of the label.
        self.id = f'{self.cluster_label}'

        if location_dict is None:
            return
        # A given dict takes precedence over the individual arguments.
        self.from_serializable_dict(location_dict, from_db)

    def to_serializable_dict(self, for_db=False) -> Dict:
        """Return the cluster as a dict.

        With ``for_db`` set, ``clusters`` is emitted as a JSON string instead
        of the raw value.
        """
        members = self.clusters
        if for_db:
            members = json.dumps(members)
        return {
            "id": self.id,
            "cluster_label": self.cluster_label,
            "clusters": members,
        }

    def from_serializable_dict(self, location_dict: Dict, from_db=False):
        """Overwrite ``id``, ``cluster_label`` and ``clusters`` from a dict.

        With ``from_db`` set, the ``clusters`` entry is decoded from JSON.
        """
        self.id = location_dict["id"]
        self.cluster_label = location_dict["cluster_label"]
        raw_members = location_dict["clusters"]
        self.clusters = json.loads(raw_members) if from_db else raw_members

    def __repr__(self):
        # JSON text of the non-DB dictionary form.
        as_dict = self.to_serializable_dict()
        return json.dumps(as_dict)

    def __str__(self):
        inner = self.__repr__()
        return f"LocationCluster({inner})"
class TimeCluster(Cluster):
    """Cluster tagged with a date and an hour, convertible to/from a dict.

    An instance is populated from the individual arguments and, when
    ``time_dict`` is supplied, subsequently overwritten with the values found
    in that dictionary.
    """

    def __init__(self, date: date = None, hour: int = None, cluster_label: int = None, clusters: List = None,
                 time_dict: Dict = None, from_db=False):
        super().__init__(cluster_label, clusters)
        self.date = date
        self.hour = hour
        # Default id combines date, hour and label.
        self.id = f'{self.date}-{self.hour}-{self.cluster_label}'

        if time_dict is not None:
            # A given dict takes precedence over the individual arguments.
            self.from_serializable_dict(time_dict, from_db)

    def to_serializable_dict(self, for_db=False) -> Dict:
        """Return the cluster as a dict.

        ``date`` is always emitted as ``str(self.date)``; with ``for_db`` set,
        ``clusters`` is emitted as a JSON string instead of the raw value.
        """
        return {
            "id": self.id,
            "date": str(self.date),
            "hour": self.hour,
            "cluster_label": self.cluster_label,
            "clusters": json.dumps(self.clusters) if for_db else self.clusters
        }

    def from_serializable_dict(self, time_dict: Dict, from_db=False):
        """Overwrite all fields from a dict produced by ``to_serializable_dict``.

        With ``from_db`` set, the ``clusters`` entry is decoded from JSON.

        Raises:
            KeyError: if a required key is missing from ``time_dict``.
            ValueError: if the date string is not in ``%Y-%m-%d`` format.
        """
        self.id = time_dict["id"]
        self.date = self._parse_date(time_dict["date"])
        self.hour = time_dict["hour"]
        self.cluster_label = time_dict["cluster_label"]
        self.clusters = json.loads(time_dict["clusters"]) \
            if from_db else time_dict["clusters"]

    @staticmethod
    def _parse_date(value):
        """Convert a serialized date value back into a ``date`` (or ``None``)."""
        # to_serializable_dict() writes str(None) == 'None' for a cluster
        # without a date; accept it so such a cluster survives a round trip.
        if value is None or value == 'None':
            return None
        # datetime is a subclass of date, so it has to be checked first.
        if isinstance(value, datetime):
            return value.date()
        if isinstance(value, date):
            return value
        return datetime.strptime(value, '%Y-%m-%d').date()

    def __repr__(self):
        return json.dumps(self.to_serializable_dict())

    def __str__(self):
        return f"TimeCluster({self.__repr__()})"
import json
from typing import List
class UserCluster:
    """Labelled cluster that can be rendered as a plain dictionary."""

    def __init__(self, cluster_label: int, clusters: List):
        super().__init__()
        self.cluster_label = cluster_label
        self.clusters = clusters
        # The id is the textual form of the label.
        self.id = f'{cluster_label}'

    def to_serializable_dict(self, for_db=False):
        """Return the cluster as a dict.

        With ``for_db`` set, ``clusters`` is emitted as a JSON string instead
        of the raw value.
        """
        if for_db:
            members = json.dumps(self.clusters)
        else:
            members = self.clusters
        return {
            "id": self.id,
            "cluster_label": self.cluster_label,
            "clusters": members,
        }

    def __repr__(self):
        # JSON text of the non-DB dictionary form.
        as_dict = self.to_serializable_dict()
        return json.dumps(as_dict)

    def __str__(self):
        inner = self.__repr__()
        return f"UserCluster({inner})"
...@@ -5,7 +5,7 @@ import json ...@@ -5,7 +5,7 @@ import json
from db.agi.agi_repository import AgiRepository from db.agi.agi_repository import AgiRepository
from db.entities import Location, UserCluster, PopularLocation from db.entities import Location, TimeCluster, PopularLocation, LocationCluster
from typing import List from typing import List
...@@ -32,19 +32,17 @@ class Repository(MongoRepositoryBase): ...@@ -32,19 +32,17 @@ class Repository(MongoRepositoryBase):
agi_locations = self.agi_repo.getLocations() agi_locations = self.agi_repo.getLocations()
return [Location(agi_loc) for agi_loc in agi_locations] return [Location(agi_loc) for agi_loc in agi_locations]
def add_location_cluster(self, cluster: LocationCluster):
    """Insert ``cluster`` into the location-cluster collection in its DB dict form."""
    db_entry = cluster.to_serializable_dict(for_db=True)
    super().insert_entry(self._location_cluster_collection, db_entry)
def get_location_clusters(self) -> List[LocationCluster]:
    """Load every entry of the location-cluster collection as a ``LocationCluster``."""
    stored_entries = super().get_entries(self._location_cluster_collection)

    location_clusters = []
    for entry in stored_entries:
        # 'clusters' is decoded from JSON text before being handed over.
        members = json.loads(entry['clusters'])
        location_clusters.append(LocationCluster(entry['cluster_label'], members))
    return location_clusters
def add_time_cluster(self, cluster: TimeCluster):
    """Insert ``cluster`` into the time-cluster collection in its DB dict form."""
    db_entry = cluster.to_serializable_dict(for_db=True)
    super().insert_entry(self._time_cluster_collection, db_entry)
def get_time_clusters(self) -> List[TimeCluster]:
    """Load every entry of the time-cluster collection as a ``TimeCluster``.

    Entries are rebuilt through ``TimeCluster``'s dict constructor so that
    ``id``, ``date``, ``hour``, ``cluster_label`` and ``clusters`` are all
    restored.
    """
    entries = super().get_entries(self._time_cluster_collection)
    # TimeCluster's leading positional parameters are (date, hour, ...), so
    # passing label and clusters positionally would put them into the wrong
    # fields. Rebuild from the stored dict instead.
    # NOTE(review): assumes every stored entry carries 'id', 'date' and
    # 'hour' (i.e. was written via TimeCluster.to_serializable_dict) - confirm
    # for data inserted before this change.
    return [TimeCluster(time_dict=entry, from_db=True) for entry in entries]
def add_popular_location(self, popular_location: PopularLocation):
    """Placeholder: currently does nothing with ``popular_location``."""
    return None
...@@ -4,7 +4,7 @@ modules_path = '../../../modules/' ...@@ -4,7 +4,7 @@ modules_path = '../../../modules/'
if os.path.exists(modules_path): if os.path.exists(modules_path):
sys.path.insert(1, modules_path) sys.path.insert(1, modules_path)
from db.entities import Location, PopularLocation, UserCluster from db.entities import Location, PopularLocation, LocationCluster, TimeCluster
from typing import List, Dict, Tuple from typing import List, Dict, Tuple
from db.repository import Repository from db.repository import Repository
from processing.clusterer import Clusterer from processing.clusterer import Clusterer
...@@ -20,6 +20,7 @@ repo = Repository() ...@@ -20,6 +20,7 @@ repo = Repository()
# exit() # exit()
def run_location_clustering(): def run_location_clustering():
user_clusterer = Clusterer() user_clusterer = Clusterer()
...@@ -28,36 +29,57 @@ def run_location_clustering(): ...@@ -28,36 +29,57 @@ def run_location_clustering():
cluster_result = user_clusterer.cluster_locations( cluster_result = user_clusterer.cluster_locations(
[l.to_serializable_dict() for l in all_location_traces]) [l.to_serializable_dict() for l in all_location_traces])
clusters = [UserCluster(key, value) clusters = [LocationCluster(key, value)
for key, value in cluster_result.items()] for key, value in cluster_result.items()]
store_user_clusters(clusters) store_clusters('locations', clusters)
def run_time_clustering():
    """Cluster all location traces separately for every (date, hour) slice.

    Traces are fetched from the repository, grouped by the date and hour of
    their timestamp, clustered per group with ``Clusterer.cluster_times`` and
    the resulting ``TimeCluster`` objects are handed to ``store_clusters``.
    """
    from collections import defaultdict

    user_clusterer = Clusterer(epsilon=600)  # clustered within 10 minutes

    all_location_traces = repo.get_locations()

    # Group once by (date, hour) instead of rescanning the full trace list
    # for every date and every hour of the day.
    traces_by_slice = defaultdict(list)
    for trace in all_location_traces:
        slice_key = (trace.timestamp.date(), trace.timestamp.hour)
        traces_by_slice[slice_key].append(trace)

    clusters: List[TimeCluster] = []
    # Only non-empty slices exist as keys; sorting gives a deterministic,
    # chronological processing order.
    for cur_date, cur_hour in sorted(traces_by_slice):
        traces_for_time_slice = traces_by_slice[(cur_date, cur_hour)]

        # clustering per hour
        cluster_result = user_clusterer.cluster_times(
            [t.to_serializable_dict() for t in traces_for_time_slice])

        clusters.extend(TimeCluster(cur_date, cur_hour, key, value)
                        for key, value in cluster_result.items())

    store_clusters('times', clusters)
# TODO make abstract for other features
def store_clusters(type: str, clusters: List):
    """Persist ``clusters`` through the repository method matching ``type``.

    ``'locations'`` and ``'times'`` are the recognised types; any other value
    stores nothing. When ``DEBUG`` is set the clusters are only printed.
    """
    if DEBUG:
        print(clusters)
        return

    # Pick the repository method for this kind of cluster.
    if type == 'locations':
        add_cluster = repo.add_location_cluster
    elif type == 'times':
        add_cluster = repo.add_time_cluster
    else:
        return

    for cluster in clusters:
        add_cluster(cluster)
if __name__ == "__main__": if __name__ == "__main__":
run_time_clustering()
run_location_clustering() run_location_clustering()
run_time_clustering()
import unittest
import sys
sys.path.insert(1, './')
# python -m unittest discover -v tests
from db.entities.cluster import Cluster
from db.entities import TimeCluster, LocationCluster
from datetime import date, datetime
import json
class TestCluster(unittest.TestCase):
    """Checks for the plain ``Cluster`` base class."""

    def test_init_Cluster(self):
        # The constructor has to keep both arguments unchanged.
        cluster = Cluster(1, [1, 2, 3])

        self.assertEqual(1, cluster.cluster_label)
        self.assertEqual([1, 2, 3], cluster.clusters)
class TestLocationCluster(unittest.TestCase):
    """Checks construction and (de)serialization of ``LocationCluster``."""

    def setUp(self):
        # Shared fixture: label 1 with three members.
        self.c = LocationCluster(1, [1, 2, 3])

    def test_init_individualArguments(self):
        cluster = LocationCluster(1, [1, 2, 3])

        self.assertEqual('1', cluster.id)
        self.assertEqual(1, cluster.cluster_label)
        self.assertEqual([1, 2, 3], cluster.clusters)

    def test_init_dictArgument(self):
        # The id from the dict wins over the label-derived default id.
        source = {'id': '123', 'cluster_label': 1, 'clusters': [1, 2, 3]}
        cluster = LocationCluster(location_dict=source)

        self.assertEqual('123', cluster.id)
        self.assertEqual(1, cluster.cluster_label)
        self.assertEqual([1, 2, 3], cluster.clusters)

    def test_init_dictArgument_fromDb(self):
        # In DB form the members arrive as a JSON string.
        source = {'id': '123', 'cluster_label': 1, 'clusters': '[1, 2, 3]'}
        cluster = LocationCluster(location_dict=source, from_db=True)

        self.assertEqual('123', cluster.id)
        self.assertEqual(1, cluster.cluster_label)
        self.assertEqual([1, 2, 3], cluster.clusters)

    def test_to_serializable_dict_noDb(self):
        serialized = self.c.to_serializable_dict()

        self.assertEqual(self.c.id, serialized['id'])
        self.assertEqual(self.c.cluster_label, serialized['cluster_label'])
        self.assertEqual(self.c.clusters, serialized['clusters'])

    def test_from_serializable_dict_noDb(self):
        # Round trip through the plain dictionary form.
        serialized = self.c.to_serializable_dict()
        restored = LocationCluster()
        restored.from_serializable_dict(serialized)

        self.assertEqual(self.c.id, restored.id)
        self.assertEqual(str(self.c), str(restored))

    def test_to_serializable_dict_db_jsonClusters(self):
        serialized = self.c.to_serializable_dict(for_db=True)
        decoded_members = json.loads(serialized['clusters'])

        self.assertEqual(self.c.id, serialized['id'])
        self.assertEqual(self.c.cluster_label, serialized['cluster_label'])
        self.assertEqual(self.c.clusters, decoded_members)

    def test_from_serializable_dict_fromDb(self):
        # Round trip through the DB dictionary form.
        serialized = self.c.to_serializable_dict(for_db=True)
        restored = LocationCluster()
        restored.from_serializable_dict(serialized, from_db=True)

        self.assertEqual(self.c.id, restored.id)
        self.assertEqual(str(self.c), str(restored))
class TestTimeCluster(unittest.TestCase):
    """Checks construction and (de)serialization of ``TimeCluster``."""

    def setUp(self):
        # Shared fixture: label 1 for 2020-01-01, hour 14, three members.
        self.date_ = date(2020, 1, 1)
        self.c = TimeCluster(self.date_, 14, 1, [1, 2, 3])

    def test_init_individualArguments(self):
        cluster = TimeCluster(self.date_, 14, 1, [1, 2, 3])

        self.assertEqual(f'{self.date_}-14-1', cluster.id)
        self.assertEqual(self.date_, cluster.date)
        self.assertEqual(14, cluster.hour)
        self.assertEqual(1, cluster.cluster_label)
        self.assertEqual([1, 2, 3], cluster.clusters)

    def test_init_dictArgument(self):
        # The id from the dict wins over the default id built in __init__.
        source = {'id': '123', 'cluster_label': 1, 'clusters': [1, 2, 3],
                  'date': str(self.date_), 'hour': 14}
        cluster = TimeCluster(time_dict=source)

        self.assertEqual('123', cluster.id)
        self.assertEqual(self.date_, cluster.date)
        self.assertEqual(14, cluster.hour)
        self.assertEqual(1, cluster.cluster_label)
        self.assertEqual([1, 2, 3], cluster.clusters)

    def test_init_dictArgument_fromDb(self):
        # In DB form the members arrive as a JSON string.
        source = {'id': '123', 'cluster_label': 1, 'clusters': '[1, 2, 3]',
                  'date': str(self.date_), 'hour': 14}
        cluster = TimeCluster(time_dict=source, from_db=True)

        self.assertEqual('123', cluster.id)
        self.assertEqual(self.date_, cluster.date)
        self.assertEqual(14, cluster.hour)
        self.assertEqual(1, cluster.cluster_label)
        self.assertEqual([1, 2, 3], cluster.clusters)

    def test_to_serializable_dict_noDb(self):
        serialized = self.c.to_serializable_dict()

        self.assertEqual(self.c.id, serialized['id'])
        self.assertEqual(self.c.cluster_label, serialized['cluster_label'])
        self.assertEqual(self.c.clusters, serialized['clusters'])
        parsed_date = datetime.strptime(serialized['date'], '%Y-%m-%d').date()
        self.assertEqual(self.c.date, parsed_date)
        self.assertEqual(self.c.hour, serialized['hour'])

    def test_from_serializable_dict_noDb(self):
        # Round trip through the plain dictionary form.
        serialized = self.c.to_serializable_dict()
        restored = TimeCluster()
        restored.from_serializable_dict(serialized)

        self.assertEqual(self.c.id, restored.id)
        self.assertEqual(str(self.c), str(restored))

    def test_to_serializable_dict_fromDb_jsonClusters(self):
        serialized = self.c.to_serializable_dict(for_db=True)

        self.assertEqual(self.c.id, serialized['id'])
        self.assertEqual(self.c.cluster_label, serialized['cluster_label'])
        decoded_members = json.loads(serialized['clusters'])
        self.assertEqual(self.c.clusters, decoded_members)
        parsed_date = datetime.strptime(serialized['date'], '%Y-%m-%d').date()
        self.assertEqual(self.c.date, parsed_date)
        self.assertEqual(self.c.hour, serialized['hour'])

    def test_from_serializable_dict_fromDb(self):
        # Round trip through the DB dictionary form.
        serialized = self.c.to_serializable_dict(for_db=True)
        restored = TimeCluster()
        restored.from_serializable_dict(serialized, from_db=True)

        self.assertEqual(self.c.id, restored.id)
        self.assertEqual(str(self.c), str(restored))
if __name__ == '__main__':
unittest.main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment