Commit e4993f0c authored by Alexander

generalized clusterer methods

parent 8287b767
...@@ -43,32 +43,38 @@ class Clusterer: ...@@ -43,32 +43,38 @@ class Clusterer:
return fig return fig
# TODO refactor for other input def create_labels(self, features:np.ndarray) -> List:
def create_labels(self, locations:List) -> List: if features is None or len(features) == 0:
if locations is None or len(locations) == 0: return features # trash in trash out
return locations # trash in trash out
locations = self.extract_location_data(locations)
dbsc = DBSCAN(eps = self.epsilon, min_samples = self.min_points) dbsc = DBSCAN(eps = self.epsilon, min_samples = self.min_points)
dbsc = dbsc.fit(locations) dbsc = dbsc.fit(features)
labels = dbsc.labels_ labels = dbsc.labels_
return labels.tolist() return labels.tolist()
def extract_location_data(self, locations: List[dict]) -> np.ndarray: def extract_location_features(self, locations: List[dict]) -> np.ndarray:
return np.asarray([(float(l['latitude']), float(l['longitude'])) for l in locations]) return np.asarray([(float(l['latitude']), float(l['longitude'])) for l in locations])
# TODO refactor for other input def extract_time_features(self, times: List[Dict]) -> np.ndarray:
def label_locations(self, locations:List[Dict], labels:List) -> List: return np.asarray([((t['timestamp']), 0) for t in times])
if locations is None or labels is None:
def label_dataset(self, dataset:List[Dict], labels:List) -> List:
if dataset is None or labels is None:
return return
if len(locations) != len(labels): if len(dataset) != len(labels):
raise ValueError("locations and labels has to have same length") raise ValueError("dataset and labels has to have same length")
for i in range(len(locations)): for i in range(len(dataset)):
locations[i]['cluster_label'] = labels[i] dataset[i]['cluster_label'] = labels[i]
def group_by_clusters(self, dataset:List[Dict], labels:List) -> Dict[int, List[Dict]]:
    '''Group already-labelled dataset entries by their cluster label.

    Every distinct value in ``labels`` becomes a key of the result, in order
    of first appearance. Each key maps to the entries of ``dataset`` whose
    ``'cluster_label'`` value equals that key, in dataset order. Entries whose
    label does not occur in ``labels`` are left out. The entries themselves
    are not copied; the returned lists reference the same dict objects.

    :param dataset: entries that each carry a ``'cluster_label'`` key
    :param labels: the cluster labels to build groups for (may repeat)
    :returns: mapping of label to the list of entries carrying that label
    :raises KeyError: if ``labels`` is non-empty and an entry has no
        ``'cluster_label'`` key
    '''
    # One bucket per distinct label; a repeated label keeps its first position.
    clusters = {label: [] for label in labels}
    if not clusters:
        # Nothing to group by, so the dataset is not inspected at all.
        return clusters

    # Single pass over the dataset instead of one full scan per label.
    for ds in dataset:
        bucket = clusters.get(ds['cluster_label'])
        if bucket is not None:
            bucket.append(ds)
    return clusters
def cluster_locations(self, locations:List[Dict]) -> Dict[int, List[Dict]]: def cluster_locations(self, locations:List[Dict]) -> Dict[int, List[Dict]]:
'''Returns a dictionary with identified clusters and their locations copied from the input''' '''Returns a dictionary with identified clusters and their locations copied from the input'''
...@@ -76,29 +82,18 @@ class Clusterer: ...@@ -76,29 +82,18 @@ class Clusterer:
# raise Exception("locations has to contain something") # raise Exception("locations has to contain something")
return {} return {}
labels = self.create_labels(locations) features = self.extract_location_features(locations)
self.label_locations(locations, labels)
clusters = {} labels = self.create_labels(features)
for label in labels: self.label_dataset(locations, labels)
clusters[label] = [l for l in locations if l['cluster_label'] == label]
return clusters return self.group_by_clusters(locations, labels)
def cluster_times(self, times:List[Dict]) -> Dict[int, List[Dict]]: def cluster_times(self, times:List[Dict]) -> Dict[int, List[Dict]]:
times1 = np.asarray([((t['timestamp']), 0) for t in times]) '''Returns a dictionary with identified clusters and their times copied from the input'''
features = self.extract_time_features(times)
# TODO refactor for other input
dbsc = DBSCAN(eps = self.epsilon, min_samples = self.min_points)
dbsc = dbsc.fit(times1)
labels = dbsc.labels_.tolist()
self.label_locations(times, labels) labels = self.create_labels(features)
self.label_dataset(times, labels)
clusters = {} return self.group_by_clusters(times, labels)
for label in labels: \ No newline at end of file
clusters[label] = [l for l in times if l['cluster_label'] == label]
# fig = self._draw_locations(locations=times1, partition_info=labels)
# fig.savefig('img.png')
return clusters
\ No newline at end of file
...@@ -10,15 +10,20 @@ from db.repository import Repository ...@@ -10,15 +10,20 @@ from db.repository import Repository
from processing.clusterer import Clusterer from processing.clusterer import Clusterer
DEBUG = True DEBUG = False
repo = Repository() repo = Repository()
# locs = repo.get_agi_locations()
# for l in locs:
# repo.add_location(l)
# exit()
def run_location_clustering(): def run_location_clustering():
user_clusterer = Clusterer() user_clusterer = Clusterer()
all_location_traces = repo.get_agi_locations() all_location_traces = repo.get_locations()
cluster_result = user_clusterer.cluster_locations( cluster_result = user_clusterer.cluster_locations(
[l.to_serializable_dict() for l in all_location_traces]) [l.to_serializable_dict() for l in all_location_traces])
...@@ -32,7 +37,7 @@ def run_location_clustering(): ...@@ -32,7 +37,7 @@ def run_location_clustering():
def run_time_clustering(): def run_time_clustering():
user_clusterer = Clusterer(epsilon=10**5.8) user_clusterer = Clusterer(epsilon=10**5.8)
all_location_traces = repo.get_agi_locations() all_location_traces = repo.get_locations()
cluster_result = user_clusterer.cluster_times([l.to_serializable_dict() for l in all_location_traces]) cluster_result = user_clusterer.cluster_times([l.to_serializable_dict() for l in all_location_traces])
...@@ -43,6 +48,7 @@ def run_time_clustering(): ...@@ -43,6 +48,7 @@ def run_time_clustering():
repo.add_time_cluster(c) repo.add_time_cluster(c)
# TODO make abstract for other features
def store_user_clusters(user_clusters: List[UserCluster]): def store_user_clusters(user_clusters: List[UserCluster]):
if DEBUG: if DEBUG:
print(user_clusters) print(user_clusters)
...@@ -54,3 +60,4 @@ def store_user_clusters(user_clusters: List[UserCluster]): ...@@ -54,3 +60,4 @@ def store_user_clusters(user_clusters: List[UserCluster]):
if __name__ == "__main__": if __name__ == "__main__":
run_time_clustering() run_time_clustering()
run_location_clustering()
...@@ -20,13 +20,15 @@ class TestClusterer(unittest.TestCase): ...@@ -20,13 +20,15 @@ class TestClusterer(unittest.TestCase):
self.assertEqual([], labels) self.assertEqual([], labels)
def test_create_labels_singleInput_singleCluster(self): def test_create_labels_singleInput_singleCluster(self):
labels = self.clusterer.create_labels([self.location(1,2)]) features = self.clusterer.extract_location_features([self.location(1,2)])
labels = self.clusterer.create_labels(features)
self.assertEqual(1, len(labels)) self.assertEqual(1, len(labels))
def test_create_labels_nearInputs_singleCluster(self): def test_create_labels_nearInputs_singleCluster(self):
locations = [self.location(1,2), self.location(2,2)] locations = [self.location(1,2), self.location(2,2)]
labels = self.clusterer.create_labels(locations) features = self.clusterer.extract_location_features(locations)
labels = self.clusterer.create_labels(features)
self.assertEqual(2, len(labels)) self.assertEqual(2, len(labels))
self.assertEqual(labels[0], labels[1]) self.assertEqual(labels[0], labels[1])
...@@ -34,36 +36,37 @@ class TestClusterer(unittest.TestCase): ...@@ -34,36 +36,37 @@ class TestClusterer(unittest.TestCase):
def test_create_labels_nearInputs_twoClusters(self): def test_create_labels_nearInputs_twoClusters(self):
locations = [self.location(1,2), self.location(2,2), self.location(20,20)] locations = [self.location(1,2), self.location(2,2), self.location(20,20)]
labels = self.clusterer.create_labels(locations) features = self.clusterer.extract_location_features(locations)
labels = self.clusterer.create_labels(features)
self.assertEqual(3, len(labels)) self.assertEqual(3, len(labels))
self.assertEqual(labels[0], labels[1]) self.assertEqual(labels[0], labels[1])
self.assertNotEqual(labels[0], labels[2]) self.assertNotEqual(labels[0], labels[2])
def test_label_locations_NoneLocations_NoException(self): def test_label_locations_NoneLocations_NoException(self):
self.clusterer.label_locations(None, []) self.clusterer.label_dataset(None, [])
def test_label_locations_NoneLabels_NoException(self): def test_label_locations_NoneLabels_NoException(self):
self.clusterer.label_locations([], None) self.clusterer.label_dataset([], None)
def test_label_locations_emptyInput_emptyOutput(self): def test_label_locations_emptyInput_emptyOutput(self):
locations = [] locations = []
self.clusterer.label_locations(locations, []) self.clusterer.label_dataset(locations, [])
self.assertEqual(0, len(locations)) self.assertEqual(0, len(locations))
def test_label_locations_diffInputLengths_ValueError_1(self): def test_label_locations_diffInputLengths_ValueError_1(self):
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
self.clusterer.label_locations([], [1]) self.clusterer.label_dataset([], [1])
def test_label_locations_diffInputLengths_ValueError_2(self): def test_label_locations_diffInputLengths_ValueError_2(self):
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
self.clusterer.label_locations([self.location(1,2)], []) self.clusterer.label_dataset([self.location(1,2)], [])
def test_label_locations_multInput_correctlyLabeled(self): def test_label_locations_multInput_correctlyLabeled(self):
locations = [self.location(1,2), self.location(2,2), self.location(20,20)] locations = [self.location(1,2), self.location(2,2), self.location(20,20)]
labels = [17,2,20] labels = [17,2,20]
self.clusterer.label_locations(locations, labels) self.clusterer.label_dataset(locations, labels)
self.assertEqual(3, len(locations)) self.assertEqual(3, len(locations))
self.assertHaveLabelsAsNewKey(locations, labels) self.assertHaveLabelsAsNewKey(locations, labels)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment