Commit e4993f0c authored by Alexander

generalized clusterer methods

parent 8287b767
@@ -43,32 +43,38 @@ class Clusterer:
return fig
# TODO refactor for other input
def create_labels(self, locations:List) -> List:
if locations is None or len(locations) == 0:
return locations # trash in trash out
locations = self.extract_location_data(locations)
def create_labels(self, features:np.ndarray) -> List:
if features is None or len(features) == 0:
return features # trash in trash out
dbsc = DBSCAN(eps = self.epsilon, min_samples = self.min_points)
dbsc = dbsc.fit(locations)
dbsc = dbsc.fit(features)
labels = dbsc.labels_
return labels.tolist()
def extract_location_data(self, locations: List[dict]) -> np.ndarray:
def extract_location_features(self, locations: List[dict]) -> np.ndarray:
return np.asarray([(float(l['latitude']), float(l['longitude'])) for l in locations])
# TODO refactor for other input
def label_locations(self, locations:List[Dict], labels:List) -> List:
if locations is None or labels is None:
def extract_time_features(self, times: List[Dict]) -> np.ndarray:
return np.asarray([((t['timestamp']), 0) for t in times])
def label_dataset(self, dataset:List[Dict], labels:List) -> List:
if dataset is None or labels is None:
return
if len(locations) != len(labels):
raise ValueError("locations and labels has to have same length")
if len(dataset) != len(labels):
raise ValueError("dataset and labels has to have same length")
for i in range(len(locations)):
locations[i]['cluster_label'] = labels[i]
for i in range(len(dataset)):
dataset[i]['cluster_label'] = labels[i]
def group_by_clusters(self, dataset:List[Dict], labels:List) -> Dict[int, List[Dict]]:
clusters = {}
for label in labels:
clusters[label] = [ds for ds in dataset if ds['cluster_label'] == label]
return clusters
def cluster_locations(self, locations:List[Dict]) -> Dict[int, List[Dict]]:
'''Returns a dictionary with identified clusters and their locations copied from the input'''
@@ -76,29 +82,18 @@ class Clusterer:
# raise Exception("locations has to contain something")
return {}
labels = self.create_labels(locations)
self.label_locations(locations, labels)
clusters = {}
for label in labels:
clusters[label] = [l for l in locations if l['cluster_label'] == label]
features = self.extract_location_features(locations)
return clusters
labels = self.create_labels(features)
self.label_dataset(locations, labels)
return self.group_by_clusters(locations, labels)
def cluster_times(self, times:List[Dict]) -> Dict[int, List[Dict]]:
times1 = np.asarray([((t['timestamp']), 0) for t in times])
'''Returns a dictionary with identified clusters and their times copied from the input'''
features = self.extract_time_features(times)
# TODO refactor for other input
dbsc = DBSCAN(eps = self.epsilon, min_samples = self.min_points)
dbsc = dbsc.fit(times1)
labels = dbsc.labels_.tolist()
labels = self.create_labels(features)
self.label_dataset(times, labels)
self.label_locations(times, labels)
clusters = {}
for label in labels:
clusters[label] = [l for l in times if l['cluster_label'] == label]
# fig = self._draw_locations(locations=times1, partition_info=labels)
# fig.savefig('img.png')
return clusters
\ No newline at end of file
return self.group_by_clusters(times, labels)
\ No newline at end of file
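Taken together, the clusterer changes split the old location-specific flow into reusable steps: extract features, create DBSCAN labels, attach the labels to the input dicts, and group by cluster. A minimal usage sketch of the generalized API, assuming the Clusterer constructor accepts the epsilon and min_points it reads as self.epsilon / self.min_points, with sample coordinates invented for illustration:

    from processing.clusterer import Clusterer

    # Hypothetical sample input; extract_location_features only needs
    # 'latitude' and 'longitude' keys.
    locations = [
        {'latitude': 48.137, 'longitude': 11.575},
        {'latitude': 48.138, 'longitude': 11.576},
        {'latitude': 52.520, 'longitude': 13.405},
    ]

    clusterer = Clusterer()  # assumed default epsilon / min_points

    # High-level call: extract features, label and group in one step.
    clusters = clusterer.cluster_locations(locations)

    # Equivalent low-level steps using the generalized methods:
    features = clusterer.extract_location_features(locations)  # ndarray of (lat, lon)
    labels = clusterer.create_labels(features)                  # one DBSCAN label per row
    clusterer.label_dataset(locations, labels)                  # adds 'cluster_label' in place
    clusters = clusterer.group_by_clusters(locations, labels)   # {label: [location, ...]}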
@@ -10,15 +10,20 @@ from db.repository import Repository
from processing.clusterer import Clusterer
DEBUG = True
DEBUG = False
repo = Repository()
# locs = repo.get_agi_locations()
# for l in locs:
# repo.add_location(l)
# exit()
def run_location_clustering():
user_clusterer = Clusterer()
all_location_traces = repo.get_agi_locations()
all_location_traces = repo.get_locations()
cluster_result = user_clusterer.cluster_locations(
[l.to_serializable_dict() for l in all_location_traces])
@@ -32,7 +37,7 @@ def run_location_clustering():
def run_time_clustering():
user_clusterer = Clusterer(epsilon=10**5.8)
all_location_traces = repo.get_agi_locations()
all_location_traces = repo.get_locations()
cluster_result = user_clusterer.cluster_times([l.to_serializable_dict() for l in all_location_traces])
@@ -43,6 +48,7 @@ def run_time_clustering():
repo.add_time_cluster(c)
# TODO make abstract for other features
def store_user_clusters(user_clusters: List[UserCluster]):
if DEBUG:
print(user_clusters)
@@ -54,3 +60,4 @@ def store_user_clusters(user_clusters: List[UserCluster]):
if __name__ == "__main__":
run_time_clustering()
run_location_clustering()
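The time-clustering path works the same way, except that extract_time_features maps each entry to a (timestamp, 0) pair, so the timestamps must already be numeric; the epsilon of 10**5.8 in run_time_clustering suggests they are clustered on their raw scale. A small illustrative sketch with made-up epoch timestamps:

    from processing.clusterer import Clusterer

    # Hypothetical traces; only a numeric 'timestamp' key is needed here.
    traces = [
        {'timestamp': 1579254000},
        {'timestamp': 1579254060},
        {'timestamp': 1579340400},
    ]

    time_clusterer = Clusterer(epsilon=10**5.8)
    time_clusters = time_clusterer.cluster_times(traces)

    for label, entries in time_clusters.items():
        print(label, [e['timestamp'] for e in entries])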
@@ -20,13 +20,15 @@ class TestClusterer(unittest.TestCase):
self.assertEqual([], labels)
def test_create_labels_singleInput_singleCluster(self):
labels = self.clusterer.create_labels([self.location(1,2)])
features = self.clusterer.extract_location_features([self.location(1,2)])
labels = self.clusterer.create_labels(features)
self.assertEqual(1, len(labels))
def test_create_labels_nearInputs_singleCluster(self):
locations = [self.location(1,2), self.location(2,2)]
labels = self.clusterer.create_labels(locations)
features = self.clusterer.extract_location_features(locations)
labels = self.clusterer.create_labels(features)
self.assertEqual(2, len(labels))
self.assertEqual(labels[0], labels[1])
@@ -34,36 +36,37 @@ class TestClusterer(unittest.TestCase):
def test_create_labels_nearInputs_twoClusters(self):
locations = [self.location(1,2), self.location(2,2), self.location(20,20)]
labels = self.clusterer.create_labels(locations)
features = self.clusterer.extract_location_features(locations)
labels = self.clusterer.create_labels(features)
self.assertEqual(3, len(labels))
self.assertEqual(labels[0], labels[1])
self.assertNotEqual(labels[0], labels[2])
def test_label_locations_NoneLocations_NoException(self):
self.clusterer.label_locations(None, [])
self.clusterer.label_dataset(None, [])
def test_label_locations_NoneLabels_NoException(self):
self.clusterer.label_locations([], None)
self.clusterer.label_dataset([], None)
def test_label_locations_emptyInput_emptyOutput(self):
locations = []
self.clusterer.label_locations(locations, [])
self.clusterer.label_dataset(locations, [])
self.assertEqual(0, len(locations))
def test_label_locations_diffInputLengths_ValueError_1(self):
with self.assertRaises(ValueError):
self.clusterer.label_locations([], [1])
self.clusterer.label_dataset([], [1])
def test_label_locations_diffInputLengths_ValueError_2(self):
with self.assertRaises(ValueError):
self.clusterer.label_locations([self.location(1,2)], [])
self.clusterer.label_dataset([self.location(1,2)], [])
def test_label_locations_multInput_correctlyLabeled(self):
locations = [self.location(1,2), self.location(2,2), self.location(20,20)]
labels = [17,2,20]
self.clusterer.label_locations(locations, labels)
self.clusterer.label_dataset(locations, labels)
self.assertEqual(3, len(locations))
self.assertHaveLabelsAsNewKey(locations, labels)
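The new group_by_clusters helper is not covered by the tests shown here. A possible follow-up test in the same style (hypothetical, meant to sit inside the TestClusterer class above and relying on its existing self.location helper) could look like this:

    def test_group_by_clusters_multInput_groupedByLabel(self):
        locations = [self.location(1,2), self.location(2,2), self.location(20,20)]
        labels = [17, 17, 20]
        self.clusterer.label_dataset(locations, labels)
        clusters = self.clusterer.group_by_clusters(locations, labels)

        self.assertEqual({17, 20}, set(clusters.keys()))
        self.assertEqual(2, len(clusters[17]))
        self.assertEqual(1, len(clusters[20]))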