Commit e4993f0c authored by Alexander

generalized clusterer methods

parent 8287b767
@@ -43,32 +43,38 @@ class Clusterer:
return fig
# TODO refactor for other input
def create_labels(self, locations:List) -> List:
if locations is None or len(locations) == 0:
return locations # trash in trash out
locations = self.extract_location_data(locations)
def create_labels(self, features:np.ndarray) -> List:
if features is None or len(features) == 0:
return features # trash in trash out
dbsc = DBSCAN(eps = self.epsilon, min_samples = self.min_points)
dbsc = dbsc.fit(locations)
dbsc = dbsc.fit(features)
labels = dbsc.labels_
return labels.tolist()
def extract_location_data(self, locations: List[dict]) -> np.ndarray:
def extract_location_features(self, locations: List[dict]) -> np.ndarray:
return np.asarray([(float(l['latitude']), float(l['longitude'])) for l in locations])
# TODO refactor for other input
def label_locations(self, locations:List[Dict], labels:List) -> List:
if locations is None or labels is None:
def extract_time_features(self, times: List[Dict]) -> np.ndarray:
return np.asarray([((t['timestamp']), 0) for t in times])
def label_dataset(self, dataset:List[Dict], labels:List) -> List:
if dataset is None or labels is None:
return
if len(locations) != len(labels):
raise ValueError("locations and labels has to have same length")
if len(dataset) != len(labels):
raise ValueError("dataset and labels has to have same length")
for i in range(len(locations)):
locations[i]['cluster_label'] = labels[i]
for i in range(len(dataset)):
dataset[i]['cluster_label'] = labels[i]
def group_by_clusters(self, dataset:List[Dict], labels:List) -> Dict[int, List[Dict]]:
clusters = {}
for label in labels:
clusters[label] = [ds for ds in dataset if ds['cluster_label'] == label]
return clusters
def cluster_locations(self, locations:List[Dict]) -> Dict[int, List[Dict]]:
'''Returns a dictionary with identified clusters and their locations copied from the input'''
@@ -76,29 +82,18 @@ class Clusterer:
# raise Exception("locations has to contain something")
return {}
labels = self.create_labels(locations)
self.label_locations(locations, labels)
clusters = {}
for label in labels:
clusters[label] = [l for l in locations if l['cluster_label'] == label]
features = self.extract_location_features(locations)
return clusters
labels = self.create_labels(features)
self.label_dataset(locations, labels)
return self.group_by_clusters(locations, labels)
def cluster_times(self, times:List[Dict]) -> Dict[int, List[Dict]]:
times1 = np.asarray([((t['timestamp']), 0) for t in times])
'''Returns a dictionary with identified clusters and their times copied from the input'''
features = self.extract_time_features(times)
# TODO refactor for other input
dbsc = DBSCAN(eps = self.epsilon, min_samples = self.min_points)
dbsc = dbsc.fit(times1)
labels = dbsc.labels_.tolist()
labels = self.create_labels(features)
self.label_dataset(times, labels)
self.label_locations(times, labels)
clusters = {}
for label in labels:
clusters[label] = [l for l in times if l['cluster_label'] == label]
# fig = self._draw_locations(locations=times1, partition_info=labels)
# fig.savefig('img.png')
return clusters
\ No newline at end of file
return self.group_by_clusters(times, labels)
\ No newline at end of file
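Taken together, the clusterer changes split the old location-specific flow into reusable steps: extract features, create DBSCAN labels, attach the labels to the input dicts, and group by cluster. A minimal usage sketch of the generalized API, assuming the Clusterer constructor accepts the epsilon and min_points it reads as self.epsilon / self.min_points, with sample coordinates invented for illustration:

    from processing.clusterer import Clusterer

    # Hypothetical sample input; extract_location_features only needs
    # 'latitude' and 'longitude' keys.
    locations = [
        {'latitude': 48.137, 'longitude': 11.575},
        {'latitude': 48.138, 'longitude': 11.576},
        {'latitude': 52.520, 'longitude': 13.405},
    ]

    clusterer = Clusterer()  # assumed default epsilon / min_points

    # High-level call: extract features, label and group in one step.
    clusters = clusterer.cluster_locations(locations)

    # Equivalent low-level steps using the generalized methods:
    features = clusterer.extract_location_features(locations)  # ndarray of (lat, lon)
    labels = clusterer.create_labels(features)                  # one DBSCAN label per row
    clusterer.label_dataset(locations, labels)                  # adds 'cluster_label' in place
    clusters = clusterer.group_by_clusters(locations, labels)   # {label: [location, ...]}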
@@ -10,15 +10,20 @@ from db.repository import Repository
from processing.clusterer import Clusterer
DEBUG = True
DEBUG = False
repo = Repository()
# locs = repo.get_agi_locations()
# for l in locs:
# repo.add_location(l)
# exit()
def run_location_clustering():
user_clusterer = Clusterer()
all_location_traces = repo.get_agi_locations()
all_location_traces = repo.get_locations()
cluster_result = user_clusterer.cluster_locations(
[l.to_serializable_dict() for l in all_location_traces])
@@ -32,7 +37,7 @@ def run_location_clustering():
def run_time_clustering():
user_clusterer = Clusterer(epsilon=10**5.8)
all_location_traces = repo.get_agi_locations()
all_location_traces = repo.get_locations()
cluster_result = user_clusterer.cluster_times([l.to_serializable_dict() for l in all_location_traces])
@@ -43,6 +48,7 @@ def run_time_clustering():
repo.add_time_cluster(c)
# TODO make abstract for other features
def store_user_clusters(user_clusters: List[UserCluster]):
if DEBUG:
print(user_clusters)
@@ -54,3 +60,4 @@ def store_user_clusters(user_clusters: List[UserCluster]):
if __name__ == "__main__":
run_time_clustering()
run_location_clustering()
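The time-clustering path works the same way, except that extract_time_features maps each entry to a (timestamp, 0) pair, so the timestamps must already be numeric; the epsilon of 10**5.8 in run_time_clustering suggests they are clustered on their raw scale. A small illustrative sketch with made-up epoch timestamps:

    from processing.clusterer import Clusterer

    # Hypothetical traces; only a numeric 'timestamp' key is needed here.
    traces = [
        {'timestamp': 1579254000},
        {'timestamp': 1579254060},
        {'timestamp': 1579340400},
    ]

    time_clusterer = Clusterer(epsilon=10**5.8)
    time_clusters = time_clusterer.cluster_times(traces)

    for label, entries in time_clusters.items():
        print(label, [e['timestamp'] for e in entries])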
@@ -20,13 +20,15 @@ class TestClusterer(unittest.TestCase):
self.assertEqual([], labels)
def test_create_labels_singleInput_singleCluster(self):
labels = self.clusterer.create_labels([self.location(1,2)])
features = self.clusterer.extract_location_features([self.location(1,2)])
labels = self.clusterer.create_labels(features)
self.assertEqual(1, len(labels))
def test_create_labels_nearInputs_singleCluster(self):
locations = [self.location(1,2), self.location(2,2)]
labels = self.clusterer.create_labels(locations)
features = self.clusterer.extract_location_features(locations)
labels = self.clusterer.create_labels(features)
self.assertEqual(2, len(labels))
self.assertEqual(labels[0], labels[1])
@@ -34,36 +36,37 @@ class TestClusterer(unittest.TestCase):
def test_create_labels_nearInputs_twoClusters(self):
locations = [self.location(1,2), self.location(2,2), self.location(20,20)]
labels = self.clusterer.create_labels(locations)
features = self.clusterer.extract_location_features(locations)
labels = self.clusterer.create_labels(features)
self.assertEqual(3, len(labels))
self.assertEqual(labels[0], labels[1])
self.assertNotEqual(labels[0], labels[2])
def test_label_locations_NoneLocations_NoException(self):
self.clusterer.label_locations(None, [])
self.clusterer.label_dataset(None, [])
def test_label_locations_NoneLabels_NoException(self):
self.clusterer.label_locations([], None)
self.clusterer.label_dataset([], None)
def test_label_locations_emptyInput_emptyOutput(self):
locations = []
self.clusterer.label_locations(locations, [])
self.clusterer.label_dataset(locations, [])
self.assertEqual(0, len(locations))
def test_label_locations_diffInputLengths_ValueError_1(self):
with self.assertRaises(ValueError):
self.clusterer.label_locations([], [1])
self.clusterer.label_dataset([], [1])
def test_label_locations_diffInputLengths_ValueError_2(self):
with self.assertRaises(ValueError):
self.clusterer.label_locations([self.location(1,2)], [])
self.clusterer.label_dataset([self.location(1,2)], [])
def test_label_locations_multInput_correctlyLabeled(self):
locations = [self.location(1,2), self.location(2,2), self.location(20,20)]
labels = [17,2,20]
self.clusterer.label_locations(locations, labels)
self.clusterer.label_dataset(locations, labels)
self.assertEqual(3, len(locations))
self.assertHaveLabelsAsNewKey(locations, labels)
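The new group_by_clusters helper is not covered by the tests shown here. A possible follow-up test in the same style (hypothetical, meant to sit inside the TestClusterer class above and relying on its existing self.location helper) could look like this:

    def test_group_by_clusters_multInput_groupedByLabel(self):
        locations = [self.location(1,2), self.location(2,2), self.location(20,20)]
        labels = [17, 17, 20]
        self.clusterer.label_dataset(locations, labels)
        clusters = self.clusterer.group_by_clusters(locations, labels)

        self.assertEqual({17, 20}, set(clusters.keys()))
        self.assertEqual(2, len(clusters[17]))
        self.assertEqual(1, len(clusters[20]))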