Improved tests for clustering

d728e14e · Alexander Lercher · ad4c67ae · d728e14e · d728e14e · d728e14e
Commit d728e14e authored Jun 16, 2020 by Alexander Lercher
3 changed files
--- a/src/data-hub/role-stage-discovery-microservice/app/processing/clustering/clusterer.py
+++ b/src/data-hub/role-stage-discovery-microservice/app/processing/clustering/clusterer.py
@@ -22,6 +22,8 @@ class Clusterer:
        '''Creates labels for the items based on OPTICS.'''
        if features is None or len(features) == 0:
            return features # trash in trash out
+        if len(features) == 1:
+            return [-1]
    
        optics = OPTICS(min_samples=self.min_points)
        optics = optics.fit(features)
@@ -31,7 +33,6 @@ class Clusterer:

    def _extract_features(self, dataset: List[Dict], features:List[str]) -> np.ndarray:
        '''Extracts the feature values from the dataset into a np array with same order as original dataset.'''
-        # TODO single input
        extracted_features = []
        for data in dataset:
            entry = [float(data[feature]) for feature in features]

--- a/src/data-hub/role-stage-discovery-microservice/app/tests/test_clusterer.py
+++ b/src/data-hub/role-stage-discovery-microservice/app/tests/test_clusterer.py
@@ -46,19 +46,22 @@ class TestClusterer(unittest.TestCase):
    def test_create_labels_emptyInput_emptyOutput(self):
        labels = self.clusterer.create_labels([])
        self.assertEqual([], labels)
-    
-    def test_create_labels_singleInput_error(self):
-        clusterer = Clusterer(min_points=2) 
-        features =  clusterer._extract_features(dataset=[self.location(1,2)], features=self.get_location_features())
-        with self.assertRaises(ValueError):
-            # Fails because (min_pts > |input elements|)
-            clusterer.create_labels(features)
-            
-    def test_create_labels_singleInput_error_2(self):
+                
+    def test_create_labels_singleInput_noise(self):
        clusterer = Clusterer(min_points=1)
+
        features =  clusterer._extract_features(dataset=[self.location(1,2)], features=self.get_location_features())
+        labels = clusterer.create_labels(features)
+
+        self.assertEqual(1, len(labels))
+        self.assertEqual(-1, labels[0])
+
+    def test_create_labels_tooSmallInputForMinPtsHyperparameter_error(self):
+        clusterer = Clusterer(min_points=3) 
+
+        features =  clusterer._extract_features(dataset=[self.location(1,2), self.location(1,2)], features=self.get_location_features())
        with self.assertRaises(ValueError):
-            # Fails because fitting does not work internally 
+            # Fails because (min_pts > |input elements|)
            clusterer.create_labels(features)

    def test_create_labels_nearInputs_singleCluster(self):
@@ -138,15 +141,17 @@ class TestClusterer(unittest.TestCase):
        self.assertClusteringResult(exp_res, res)

    def test_cluster_dataset_locationsMultInput_correctlyLabeled_2(self):
-        return
-        # TODO why is the single location added to the last cluster?
        clusterer = Clusterer(3)
-        locations = [self.location(1,2), self.location(2,2), self.location(2,2), self.location(20,20), self.location(20,21),  self.location(20,20), self.location(400,1000), self.location(200,1), self.location(200,2), self.location(201,-1)]
-        labels = [0,0,1,1]
-        exp_res = {0:locations[0:2], 1:locations[2:4]}
+        locations = [
+                    self.location(1,2), self.location(2,2), self.location(2,2), 
+                    self.location(20,20), self.location(20,21),  self.location(20,20), 
+                    self.location(50,50), 
+                    self.location(50,1), self.location(50,2), self.location(50,-1)
+                    ]
+        labels = [0,0,0,1,1,1,-1,2,2,2]
+        exp_res = {0:locations[0:3], 1:locations[3:6], -1:locations[6:7], 2:locations[7:10]}

        res = clusterer.cluster_dataset(locations, self.get_location_features())
-        print(res)

        self.assertHaveLabelsAsNewKey(locations, labels)
        self.assertClusteringResult(exp_res, res)

--- a/src/data-hub/role-stage-discovery-microservice/app/visualizations/vis_cluster_results.py
+++ b/src/data-hub/role-stage-discovery-microservice/app/visualizations/vis_cluster_results.py
+# Visualizes generated nodes next to the clustering results that Clusterer produces for them with different MinPts values
+import sys
+import os
+modules_path = './'
+if os.path.exists(modules_path):
+    sys.path.insert(1, modules_path)
+
+import matplotlib.pyplot as plt
+import sklearn.datasets
+import numpy as np
+from processing.clustering.clusterer import Clusterer
+
+# parameters for data generation
+# NOTE(review): only N_FEATURES is referenced in the visible part of this script,
+# the make_blobs calls in the main section pass literal values instead of
+# N_SAMPLES, N_CENTERS and STD_DEVIATION - confirm whether these should be used there
+N_SAMPLES = 20
+N_FEATURES = 2
+N_CENTERS = 3
+STD_DEVIATION = 1.0
+
+def show_generated_data(ax, nodes, labels):
+    '''Plots the generated nodes on ax, colored by their generated label.
+
+    nodes is read as nodes[:,0] and nodes[:,1] for the two plotted features,
+    labels holds one label per node. The labels do not have to be 0..k-1,
+    every distinct label value gets its own rainbow color.
+    '''
+    # translate each label into its position among the sorted distinct labels,
+    # so the color lookup can neither run out of range nor wrap around for negative labels
+    distinct_labels, color_indices = np.unique(labels, return_inverse=True)
+    distinct_colors = plt.cm.rainbow(np.linspace(0, 1, len(distinct_labels)))
+    colors = distinct_colors[color_indices]
+
+    ax.set_title('Generated Dataset')
+    ax.set_xlabel('Feature 1')
+    ax.set_ylabel('Feature 2')
+    ax.scatter(nodes[:,0], nodes[:,1], c=colors)
+
+def show_clustering_result(ax, min_pts, clusters: dict):
+    '''Plots all nodes of clusters on ax, one rainbow color per cluster and noise (label -1) in black.
+
+    clusters maps a cluster label to the nodes of that cluster, each node is a dict
+    with the entries '1', '2' and 'cluster_label'. min_pts is only shown in the title.
+    '''
+    cluster_labels = clusters.keys()
+    # collect the nodes of all clusters in one list
+    all_nodes = []
+    for members in clusters.values():
+        all_nodes.extend(members)
+
+    has_noise = -1 in cluster_labels
+    # dict keys are distinct already; noise gets no rainbow color
+    n_rainbow_colors = len(cluster_labels) - 1 if has_noise else len(cluster_labels)
+    palette = plt.cm.rainbow(np.linspace(0, 1, n_rainbow_colors))
+    if has_noise:
+        # black goes last, so that indexing with the noise label -1 selects it
+        palette = np.append(palette, [[0,0,0,1]], axis=0)
+
+    xs, ys, colors = [], [], []
+    for node in all_nodes:
+        xs.append(node['1'])
+        ys.append(node['2'])
+        colors.append(palette[node['cluster_label']])
+
+    ax.set_title(f'Clustering Result with MinPts={min_pts}')
+    ax.set_xlabel('Feature 1')
+    ax.set_ylabel('Feature 2')
+    ax.scatter(xs, ys, c=colors)
+
+def run_clustering(min_points, dataset):
+    '''Runs cluster_dataset of a new Clusterer(min_points) on dataset with the features '1' and '2' and returns its result.'''
+    feature_names = ['1','2']
+    clusterer = Clusterer(min_points=min_points)
+    clusters = clusterer.cluster_dataset(dataset=dataset, features=feature_names)
+    return clusters
+
+if __name__ == '__main__':
+    # 2x2 grid: generated data in the first cell, one clustering result in each of the other cells
+    fig, axes = plt.subplots(2, 2)
+    (ax_generated, ax_min5), (ax_min10, ax_min15) = axes
+    fig.tight_layout(pad=3.0)
+
+    # two blobs of 20 nodes each: std 1 around (5,5) and std 5 around (30,5)
+    dense_nodes, dense_labels = sklearn.datasets.make_blobs(n_samples=20, n_features=N_FEATURES, centers=[[5,5]], cluster_std=1)
+    wide_nodes, wide_labels = sklearn.datasets.make_blobs(n_samples=20, n_features=N_FEATURES, centers=[[30,5]], cluster_std=5)
+
+    nodes = np.append(dense_nodes, wide_nodes, axis=0)
+    # labels of the second blob are offset by one before merging
+    labels = np.append(dense_labels, wide_labels+1)
+
+    show_generated_data(ax_generated, nodes, labels)
+
+    result_axes = [ax_min5, ax_min10, ax_min15]
+    for idx, min_pts in enumerate([5, 10, 15]):
+        # new dicts for every run; presumably cluster_dataset stores 'cluster_label' in them - TODO confirm
+        dataset = [{'1':node[0], '2':node[1]} for node in nodes]
+        clusters = run_clustering(min_pts, dataset)
+        show_clustering_result(result_axes[idx], min_pts, clusters)
+
+    plt.show()
\ No newline at end of file