Commit 1d90011e authored by Alexander

time clustering for each hour

parent e4993f0c
from db.entities.location import Location
from db.entities.popular_location import PopularLocation
from db.entities.user_cluster import UserCluster
\ No newline at end of file
from db.entities.cluster import LocationCluster, TimeCluster
\ No newline at end of file
import json
from typing import List, Dict
from datetime import date, datetime
class Cluster:
    """Plain container pairing a cluster label with its member list."""

    def __init__(self, cluster_label: int = None, clusters: List = None):
        """Store the label and the members exactly as they were passed in.

        :param cluster_label: label of the cluster (``None`` when omitted)
        :param clusters: members of the cluster (``None`` when omitted)
        """
        self.cluster_label, self.clusters = cluster_label, clusters
class LocationCluster(Cluster):
    """Cluster whose id is derived from its label alone.

    An instance is built either from individual values or from a dictionary
    shaped like the output of :meth:`to_serializable_dict`.
    """

    def __init__(self, cluster_label: int = None, clusters: List = None,
                 location_dict: Dict = None, from_db=False):
        """Initialise from plain values, then optionally overwrite from a dict.

        :param cluster_label: label of the cluster
        :param clusters: members of the cluster
        :param location_dict: when given, every field is re-read from it
        :param from_db: forwarded to :meth:`from_serializable_dict`
        """
        super().__init__(cluster_label, clusters)
        self.id = f'{self.cluster_label}'
        if location_dict is None:
            return
        self.from_serializable_dict(location_dict, from_db)

    def to_serializable_dict(self, for_db=False) -> Dict:
        """Return the fields as a dict; members are JSON-encoded if ``for_db``."""
        payload = {"id": self.id, "cluster_label": self.cluster_label}
        payload["clusters"] = json.dumps(self.clusters) if for_db else self.clusters
        return payload

    def from_serializable_dict(self, location_dict: Dict, from_db=False):
        """Overwrite id, label and members with the values in ``location_dict``.

        :param location_dict: dict holding "id", "cluster_label" and "clusters"
        :param from_db: when true, "clusters" is decoded with ``json.loads``
        """
        self.id = location_dict["id"]
        self.cluster_label = location_dict["cluster_label"]
        raw_members = location_dict["clusters"]
        self.clusters = json.loads(raw_members) if from_db else raw_members

    def __repr__(self):
        """Return the non-database dict form dumped as a JSON string."""
        as_dict = self.to_serializable_dict()
        return json.dumps(as_dict)

    def __str__(self):
        """Return the JSON representation wrapped in the class name."""
        return f"LocationCluster({self.__repr__()})"
class TimeCluster(Cluster):
    """Cluster tagged with a date and an hour in addition to its label.

    An instance is built either from individual values or from a dictionary
    shaped like the output of :meth:`to_serializable_dict`.
    """

    def __init__(self, date: date = None, hour: int = None, cluster_label: int = None, clusters: List = None,
                 time_dict: Dict = None, from_db=False):
        """Initialise from plain values, then optionally overwrite from a dict.

        :param date: date the cluster belongs to
        :param hour: hour the cluster belongs to
        :param cluster_label: label of the cluster
        :param clusters: members of the cluster
        :param time_dict: when given, every field is re-read from it
        :param from_db: forwarded to :meth:`from_serializable_dict`
        """
        super().__init__(cluster_label, clusters)
        self.date, self.hour = date, hour
        # The id combines date, hour and label.
        self.id = f'{self.date}-{self.hour}-{self.cluster_label}'
        if time_dict is None:
            return
        self.from_serializable_dict(time_dict, from_db)

    def to_serializable_dict(self, for_db=False) -> Dict:
        """Return the fields as a dict; the date becomes a string and the
        members are JSON-encoded if ``for_db``."""
        payload = {
            "id": self.id,
            "date": str(self.date),
            "hour": self.hour,
            "cluster_label": self.cluster_label,
        }
        payload["clusters"] = json.dumps(self.clusters) if for_db else self.clusters
        return payload

    def from_serializable_dict(self, time_dict: Dict, from_db=False):
        """Overwrite all fields with the values in ``time_dict``.

        :param time_dict: dict holding "id", "date" (``%Y-%m-%d`` string),
            "hour", "cluster_label" and "clusters"
        :param from_db: when true, "clusters" is decoded with ``json.loads``
        """
        self.id = time_dict["id"]
        parsed_day = datetime.strptime(time_dict["date"], '%Y-%m-%d')
        self.date = parsed_day.date()
        self.hour = time_dict["hour"]
        self.cluster_label = time_dict["cluster_label"]
        raw_members = time_dict["clusters"]
        self.clusters = json.loads(raw_members) if from_db else raw_members

    def __repr__(self):
        """Return the non-database dict form dumped as a JSON string."""
        as_dict = self.to_serializable_dict()
        return json.dumps(as_dict)

    def __str__(self):
        """Return the JSON representation wrapped in the class name."""
        return f"TimeCluster({self.__repr__()})"
import json
from typing import List
class UserCluster:
    """Cluster identified by its label, serialisable to a plain dict."""

    def __init__(self, cluster_label: int, clusters: List):
        """Store label and members and derive the id from the label.

        :param cluster_label: label of the cluster
        :param clusters: members of the cluster
        """
        super().__init__()
        self.cluster_label, self.clusters = cluster_label, clusters
        self.id = f'{self.cluster_label}'

    def to_serializable_dict(self, for_db=False):
        """Return the fields as a dict; members are JSON-encoded if ``for_db``."""
        payload = {"id": self.id, "cluster_label": self.cluster_label}
        payload["clusters"] = json.dumps(self.clusters) if for_db else self.clusters
        return payload

    def __repr__(self):
        """Return the non-database dict form dumped as a JSON string."""
        as_dict = self.to_serializable_dict()
        return json.dumps(as_dict)

    def __str__(self):
        """Return the JSON representation wrapped in the class name."""
        return f"UserCluster({self.__repr__()})"
......@@ -5,7 +5,7 @@ import json
from db.agi.agi_repository import AgiRepository
from db.entities import Location, UserCluster, PopularLocation
from db.entities import Location, TimeCluster, PopularLocation, LocationCluster
from typing import List
......@@ -32,19 +32,17 @@ class Repository(MongoRepositoryBase):
agi_locations = self.agi_repo.getLocations()
return [Location(agi_loc) for agi_loc in agi_locations]
def add_location_cluster(self, cluster: LocationCluster):
    """Insert one location cluster into the location-cluster collection.

    The cluster is converted with ``to_serializable_dict(for_db=True)`` and
    handed to the base class' ``insert_entry``.

    :param cluster: the cluster to store
    """
    super().insert_entry(self._location_cluster_collection,
                         cluster.to_serializable_dict(for_db=True))
def get_location_clusters(self) -> List[LocationCluster]:
    """Load every entry of the location-cluster collection.

    Each entry's "clusters" field is decoded with ``json.loads`` and wrapped,
    together with its "cluster_label", in a ``LocationCluster``.

    :return: one ``LocationCluster`` per stored entry
    """
    entries = super().get_entries(self._location_cluster_collection)
    return [LocationCluster(entry['cluster_label'], json.loads(entry['clusters']))
            for entry in entries]
def add_time_cluster(self, cluster: TimeCluster):
    """Insert one time cluster into the time-cluster collection.

    The cluster is converted with ``to_serializable_dict(for_db=True)`` and
    handed to the base class' ``insert_entry``.

    :param cluster: the cluster to store
    """
    super().insert_entry(self._time_cluster_collection,
                         cluster.to_serializable_dict(for_db=True))
def get_time_clusters(self) -> List[TimeCluster]:
    """Load every entry of the time-cluster collection.

    ``TimeCluster`` takes ``(date, hour, cluster_label, clusters)``
    positionally, so passing only label and members would store the label as
    the date and the member list as the hour. Each entry is therefore rebuilt
    through ``time_dict`` with ``from_db=True``, which reads "id", "date",
    "hour", "cluster_label" and the JSON-encoded "clusters" -- the fields
    written by ``to_serializable_dict(for_db=True)``.

    :return: one ``TimeCluster`` per stored entry
    """
    entries = super().get_entries(self._time_cluster_collection)
    return [TimeCluster(time_dict=entry, from_db=True) for entry in entries]
def add_popular_location(self, popular_location: PopularLocation):
pass
......@@ -4,7 +4,7 @@ modules_path = '../../../modules/'
if os.path.exists(modules_path):
sys.path.insert(1, modules_path)
from db.entities import Location, PopularLocation, UserCluster
from db.entities import Location, PopularLocation, LocationCluster, TimeCluster
from typing import List, Dict, Tuple
from db.repository import Repository
from processing.clusterer import Clusterer
......@@ -20,6 +20,7 @@ repo = Repository()
# exit()
def run_location_clustering():
user_clusterer = Clusterer()
......@@ -28,36 +29,57 @@ def run_location_clustering():
cluster_result = user_clusterer.cluster_locations(
[l.to_serializable_dict() for l in all_location_traces])
clusters = [UserCluster(key, value)
clusters = [LocationCluster(key, value)
for key, value in cluster_result.items()]
store_user_clusters(clusters)
store_clusters('locations', clusters)
def run_time_clustering():
    """Cluster the location traces separately for every (date, hour) slice.

    All traces are fetched from the repository, grouped by the date and hour
    of their ``timestamp`` and clustered slice by slice with
    ``Clusterer.cluster_times``. Every resulting label/members pair becomes a
    ``TimeCluster`` for that date and hour; the collected clusters are then
    passed to ``store_clusters('times', ...)``.
    """
    user_clusterer = Clusterer(epsilon=600)  # clustered within 10 minutes

    all_location_traces = repo.get_locations()

    # Bucket the traces in a single pass instead of rescanning the full list
    # for every date and every hour. Slices without traces never get a bucket.
    traces_by_slice: Dict[Tuple, List] = {}
    for trace in all_location_traces:
        slice_key = (trace.timestamp.date(), trace.timestamp.hour)
        traces_by_slice.setdefault(slice_key, []).append(trace)

    clusters: List[TimeCluster] = []
    # Sorted keys give a reproducible date/hour processing order.
    for cur_date, cur_hour in sorted(traces_by_slice):
        traces_for_time_slice = traces_by_slice[(cur_date, cur_hour)]

        # clustering per hour
        cluster_result = user_clusterer.cluster_times(
            [t.to_serializable_dict() for t in traces_for_time_slice])

        clusters.extend(TimeCluster(cur_date, cur_hour, key, value)
                        for key, value in cluster_result.items())

    store_clusters('times', clusters)
# TODO make abstract for other features
def store_clusters(type: str, clusters: List):
    """Store the clusters through the repository method matching ``type``.

    When ``DEBUG`` is set the clusters are only printed and nothing is stored.

    :param type: ``'locations'`` or ``'times'``
    :param clusters: the clusters to store
    :raises ValueError: if ``type`` is neither ``'locations'`` nor ``'times'``
        (previously such a call stored nothing without any notice)
    """
    if DEBUG:
        print(clusters)
        return

    if type == 'locations':
        persist = repo.add_location_cluster
    elif type == 'times':
        persist = repo.add_time_cluster
    else:
        raise ValueError(f"unknown cluster type: {type!r}")

    for c in clusters:
        persist(c)
if __name__ == "__main__":
    # Run each clustering step exactly once: locations first, then times.
    run_location_clustering()
    run_time_clustering()
import unittest
import sys
sys.path.insert(1, './')
# python -m unittest discover -v tests
from db.entities.cluster import Cluster
from db.entities import TimeCluster, LocationCluster
from datetime import date, datetime
import json
class TestCluster(unittest.TestCase):
    """Checks the plain ``Cluster`` container."""

    def test_init_Cluster(self):
        """Positional arguments end up in the matching attributes."""
        cluster = Cluster(1, [1, 2, 3])

        for expected, actual in ((1, cluster.cluster_label),
                                 ([1, 2, 3], cluster.clusters)):
            self.assertEqual(expected, actual)
class TestLocationCluster(unittest.TestCase):
    """Exercises construction and (de)serialisation of ``LocationCluster``."""

    def setUp(self):
        self.c = LocationCluster(1, [1, 2, 3])

    # -- shared assertions -------------------------------------------------

    def _assert_fields(self, cluster, expected_id):
        """Check id plus the label/members every fixture in this class uses."""
        self.assertEqual(expected_id, cluster.id)
        self.assertEqual(1, cluster.cluster_label)
        self.assertEqual([1, 2, 3], cluster.clusters)

    def _assert_matches_fixture(self, restored):
        """Check that a restored cluster has the fixture's id and string form."""
        self.assertEqual(self.c.id, restored.id)
        self.assertEqual(str(self.c), str(restored))

    # -- construction ------------------------------------------------------

    def test_init_individualArguments(self):
        self._assert_fields(LocationCluster(1, [1, 2, 3]), '1')

    def test_init_dictArgument(self):
        source = {'id': '123', 'cluster_label': 1, 'clusters': [1, 2, 3]}
        self._assert_fields(LocationCluster(location_dict=source), '123')

    def test_init_dictArgument_fromDb(self):
        source = {'id': '123', 'cluster_label': 1, 'clusters': '[1, 2, 3]'}
        self._assert_fields(
            LocationCluster(location_dict=source, from_db=True), '123')

    # -- serialisation -----------------------------------------------------

    def test_to_serializable_dict_noDb(self):
        serialized = self.c.to_serializable_dict()

        self.assertEqual(self.c.id, serialized['id'])
        self.assertEqual(self.c.cluster_label, serialized['cluster_label'])
        self.assertEqual(self.c.clusters, serialized['clusters'])

    def test_from_serializable_dict_noDb(self):
        restored = LocationCluster()
        restored.from_serializable_dict(self.c.to_serializable_dict())

        self._assert_matches_fixture(restored)

    def test_to_serializable_dict_db_jsonClusters(self):
        serialized = self.c.to_serializable_dict(for_db=True)

        self.assertEqual(self.c.id, serialized['id'])
        self.assertEqual(self.c.cluster_label, serialized['cluster_label'])
        # In database form the members are a JSON string.
        self.assertEqual(self.c.clusters, json.loads(serialized['clusters']))

    def test_from_serializable_dict_fromDb(self):
        db_form = self.c.to_serializable_dict(for_db=True)
        restored = LocationCluster()
        restored.from_serializable_dict(db_form, from_db=True)

        self._assert_matches_fixture(restored)
class TestTimeCluster(unittest.TestCase):
    """Exercises construction and (de)serialisation of ``TimeCluster``."""

    def setUp(self):
        self.date_ = date(2020, 1, 1)
        self.c = TimeCluster(self.date_, 14, 1, [1, 2, 3])

    # -- shared assertions -------------------------------------------------

    def _assert_fields(self, cluster, expected_id):
        """Check id plus the date/hour/label/members used by every fixture."""
        self.assertEqual(expected_id, cluster.id)
        self.assertEqual(self.date_, cluster.date)
        self.assertEqual(14, cluster.hour)
        self.assertEqual(1, cluster.cluster_label)
        self.assertEqual([1, 2, 3], cluster.clusters)

    def _assert_matches_fixture(self, restored):
        """Check that a restored cluster has the fixture's id and string form."""
        self.assertEqual(self.c.id, restored.id)
        self.assertEqual(str(self.c), str(restored))

    def _assert_date_and_hour(self, serialized):
        """Check that the serialised date parses back and the hour matches."""
        parsed = datetime.strptime(serialized['date'], '%Y-%m-%d')
        self.assertEqual(self.c.date, parsed.date())
        self.assertEqual(self.c.hour, serialized['hour'])

    # -- construction ------------------------------------------------------

    def test_init_individualArguments(self):
        cluster = TimeCluster(self.date_, 14, 1, [1, 2, 3])
        self._assert_fields(cluster, f'{self.date_}-14-1')

    def test_init_dictArgument(self):
        source = {'id': '123', 'cluster_label': 1, 'clusters': [1, 2, 3],
                  'date': str(self.date_), 'hour': 14}
        self._assert_fields(TimeCluster(time_dict=source), '123')

    def test_init_dictArgument_fromDb(self):
        source = {'id': '123', 'cluster_label': 1, 'clusters': '[1, 2, 3]',
                  'date': str(self.date_), 'hour': 14}
        self._assert_fields(TimeCluster(time_dict=source, from_db=True), '123')

    # -- serialisation -----------------------------------------------------

    def test_to_serializable_dict_noDb(self):
        serialized = self.c.to_serializable_dict()

        self.assertEqual(self.c.id, serialized['id'])
        self.assertEqual(self.c.cluster_label, serialized['cluster_label'])
        self.assertEqual(self.c.clusters, serialized['clusters'])
        self._assert_date_and_hour(serialized)

    def test_from_serializable_dict_noDb(self):
        restored = TimeCluster()
        restored.from_serializable_dict(self.c.to_serializable_dict())

        self._assert_matches_fixture(restored)

    def test_to_serializable_dict_fromDb_jsonClusters(self):
        serialized = self.c.to_serializable_dict(for_db=True)

        self.assertEqual(self.c.id, serialized['id'])
        self.assertEqual(self.c.cluster_label, serialized['cluster_label'])
        # In database form the members are a JSON string.
        self.assertEqual(self.c.clusters, json.loads(serialized['clusters']))
        self._assert_date_and_hour(serialized)

    def test_from_serializable_dict_fromDb(self):
        db_form = self.c.to_serializable_dict(for_db=True)
        restored = TimeCluster()
        restored.from_serializable_dict(db_form, from_db=True)

        self._assert_matches_fixture(restored)
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment