Visualized clustering results based on small dataset

b6e044be · Alexander Lercher · d032fb36 · b6e044be
Commit b6e044be authored May 13, 2020 by Alexander Lercher
Hide whitespace changes
Inline Side-by-side

Showing with 76 additions and 0 deletions

vis_cluster_results.py ...ion-microservice/app/visualization/vis_cluster_results.py +76 -0

No files found.
--- a/src/data-hub/community-detection-microservice/app/visualization/vis_cluster_results.py
+++ b/src/data-hub/community-detection-microservice/app/visualization/vis_cluster_results.py
+# Visualizes the clustering of generated nodes for several MinPts values
+import sys
+import os
+# Put the current working directory on the module search path (at index 1)
+# before the project import below runs.
+# NOTE(review): presumably needed so `processing.clustering.clusterer`
+# resolves when the script is started from the app directory - confirm.
+modules_path = './'
+if os.path.exists(modules_path):
+    sys.path.insert(1, modules_path)
+
+import matplotlib.pyplot as plt
+import sklearn.datasets
+import numpy as np
+from processing.clustering.clusterer import Clusterer
+
+# parameters for data generation (all four are passed to sklearn's make_blobs
+# in the __main__ section)
+N_SAMPLES = 1000  # n_samples argument of make_blobs
+N_FEATURES = 2  # n_features argument; the plots read columns 0 and 1 of the nodes
+N_CENTERS = 3  # centers argument; also the palette size in show_generated_data
+STD_DEVIATION = 1.0  # cluster_std argument of make_blobs
+
+def show_generated_data(ax, nodes, labels):
+    """Scatter-plot the generated nodes on `ax`, colored by their label.
+
+    :param ax: axes object the plot is drawn on
+    :param nodes: array read as nodes[:, 0] (x values) and nodes[:, 1] (y values)
+    :param labels: one label per node; each label is used as an index into a
+        rainbow palette with N_CENTERS entries
+    """
+    palette = plt.cm.rainbow(np.linspace(0, 1, N_CENTERS))
+    point_colors = []
+    for label in labels:
+        point_colors.append(palette[label])
+
+    ax.set_title('Generated Dataset')
+    ax.set_xlabel('Feature 1')
+    ax.set_ylabel('Feature 2')
+
+    x_values = nodes[:, 0]
+    y_values = nodes[:, 1]
+    ax.scatter(x_values, y_values, c=point_colors)
+
+def show_clustering_result(ax, min_pts, clusters: dict):
+    """Scatter-plot all clustered nodes on `ax`, colored by their cluster label.
+
+    :param ax: axes object the plot is drawn on
+    :param min_pts: value shown in the plot title as MinPts
+    :param clusters: dict mapping a cluster label to its nodes; every node is
+        read through the keys '1', '2' and 'cluster_label'
+    """
+    cluster_labels = clusters.keys()
+    # collect the nodes of all clusters in one flat list
+    all_nodes = []
+    for members in clusters.values():
+        all_nodes.extend(members)
+
+    has_noise = -1 in cluster_labels
+    if has_noise:
+        n_rainbow_colors = len(set(cluster_labels)) - 1
+    else:
+        n_rainbow_colors = len(set(cluster_labels))
+
+    palette = plt.cm.rainbow(np.linspace(0, 1, n_rainbow_colors))
+    if has_noise:
+        # clustering contains noise: black goes last in the palette, so the
+        # noise label -1 selects it through negative indexing
+        palette = np.append(palette, [[0, 0, 0, 1]], axis=0)
+
+    point_colors = [palette[node['cluster_label']] for node in all_nodes]
+
+    ax.set_title(f'Clustering Result with MinPts={min_pts}')
+    ax.set_xlabel('Feature 1')
+    ax.set_ylabel('Feature 2')
+
+    x_values = [node['1'] for node in all_nodes]
+    y_values = [node['2'] for node in all_nodes]
+    ax.scatter(x_values, y_values, c=point_colors)
+
+def run_clustering(min_points, dataset):
+    """Cluster `dataset` on the features '1' and '2'.
+
+    :param min_points: forwarded to the Clusterer constructor as `min_points`
+    :param dataset: forwarded unchanged to Clusterer.cluster_dataset
+    :return: whatever Clusterer.cluster_dataset returns
+    """
+    feature_names = ['1', '2']
+    clusterer = Clusterer(min_points=min_points)
+    result = clusterer.cluster_dataset(dataset=dataset, features=feature_names)
+    return result
+
+
+if __name__ == '__main__':
+    # 2x2 grid: generated data top-left, one clustering result in each other cell
+    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
+    fig.tight_layout(pad=3.0)
+
+    nodes, labels = sklearn.datasets.make_blobs(
+        n_samples=N_SAMPLES,
+        n_features=N_FEATURES,
+        centers=N_CENTERS,
+        cluster_std=STD_DEVIATION,
+    )
+    # nodes = np.multiply(nodes, .1)
+    show_generated_data(ax1, nodes, labels)
+
+    for result_ax, min_pts in zip((ax2, ax3, ax4), (5, 10, 15)):
+        # A fresh list of node dicts is built for every run.
+        # NOTE(review): presumably because the clusterer annotates the dicts
+        # it receives ('cluster_label' is read from them when plotting) - confirm.
+        dataset = [{'1': n[0], '2': n[1]} for n in nodes]
+        clusters = run_clustering(min_pts, dataset)
+        show_clustering_result(result_ax, min_pts, clusters)
+
+    plt.show()
\ No newline at end of file