Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
S
SMART
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
3
Issues
3
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Registry
Registry
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
UNI-KLU
SMART
Commits
d728e14e
Commit
d728e14e
authored
Jun 16, 2020
by
Alexander Lercher
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Improved tests for clustering
parent
ad4c67ae
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
95 additions
and
17 deletions
+95
-17
clusterer.py
...overy-microservice/app/processing/clustering/clusterer.py
+2
-1
test_clusterer.py
...-stage-discovery-microservice/app/tests/test_clusterer.py
+21
-16
vis_cluster_results.py
...ry-microservice/app/visualizations/vis_cluster_results.py
+72
-0
No files found.
src/data-hub/role-stage-discovery-microservice/app/processing/clustering/clusterer.py
View file @
d728e14e
...
...
@@ -22,6 +22,8 @@ class Clusterer:
'''Creates labels for the items based on OPTICS.'''
if
features
is
None
or
len
(
features
)
==
0
:
return
features
# trash in trash out
if
len
(
features
)
==
1
:
return
[
-
1
]
optics
=
OPTICS
(
min_samples
=
self
.
min_points
)
optics
=
optics
.
fit
(
features
)
...
...
@@ -31,7 +33,6 @@ class Clusterer:
def
_extract_features
(
self
,
dataset
:
List
[
Dict
],
features
:
List
[
str
])
->
np
.
ndarray
:
'''Extracts the feature values from the dataset into a np array with same order as original dataset.'''
# TODO single input
extracted_features
=
[]
for
data
in
dataset
:
entry
=
[
float
(
data
[
feature
])
for
feature
in
features
]
...
...
src/data-hub/role-stage-discovery-microservice/app/tests/test_clusterer.py
View file @
d728e14e
...
...
@@ -46,19 +46,22 @@ class TestClusterer(unittest.TestCase):
def test_create_labels_emptyInput_emptyOutput(self):
    # Feeding an empty feature list to create_labels must hand back an
    # empty result (compared against [] with assertEqual).
    self.assertEqual([], self.clusterer.create_labels([]))
def
test_create_labels_singleInput_error
(
self
):
clusterer
=
Clusterer
(
min_points
=
2
)
features
=
clusterer
.
_extract_features
(
dataset
=
[
self
.
location
(
1
,
2
)],
features
=
self
.
get_location_features
())
with
self
.
assertRaises
(
ValueError
):
# Fails because (min_pts > |input elements|)
clusterer
.
create_labels
(
features
)
def
test_create_labels_singleInput_error_2
(
self
):
def test_create_labels_singleInput_noise(self):
    # A dataset holding exactly one location must produce exactly one
    # label, and that label must be -1.
    single_point_clusterer = Clusterer(min_points=1)
    feature_matrix = single_point_clusterer._extract_features(
        dataset=[self.location(1, 2)],
        features=self.get_location_features())

    result = single_point_clusterer.create_labels(feature_matrix)

    self.assertEqual(1, len(result))
    self.assertEqual(-1, result[0])
def
test_create_labels_tooSmallInputForMinPtsHyperparameter_error
(
self
):
clusterer
=
Clusterer
(
min_points
=
3
)
features
=
clusterer
.
_extract_features
(
dataset
=
[
self
.
location
(
1
,
2
),
self
.
location
(
1
,
2
)],
features
=
self
.
get_location_features
())
with
self
.
assertRaises
(
ValueError
):
# Fails because
fitting does not work internally
# Fails because
(min_pts > |input elements|)
clusterer
.
create_labels
(
features
)
def
test_create_labels_nearInputs_singleCluster
(
self
):
...
...
@@ -138,15 +141,17 @@ class TestClusterer(unittest.TestCase):
self
.
assertClusteringResult
(
exp_res
,
res
)
def
test_cluster_dataset_locationsMultInput_correctlyLabeled_2
(
self
):
return
# TODO why is the single location added to the last cluster?
clusterer
=
Clusterer
(
3
)
locations
=
[
self
.
location
(
1
,
2
),
self
.
location
(
2
,
2
),
self
.
location
(
2
,
2
),
self
.
location
(
20
,
20
),
self
.
location
(
20
,
21
),
self
.
location
(
20
,
20
),
self
.
location
(
400
,
1000
),
self
.
location
(
200
,
1
),
self
.
location
(
200
,
2
),
self
.
location
(
201
,
-
1
)]
labels
=
[
0
,
0
,
1
,
1
]
exp_res
=
{
0
:
locations
[
0
:
2
],
1
:
locations
[
2
:
4
]}
locations
=
[
self
.
location
(
1
,
2
),
self
.
location
(
2
,
2
),
self
.
location
(
2
,
2
),
self
.
location
(
20
,
20
),
self
.
location
(
20
,
21
),
self
.
location
(
20
,
20
),
self
.
location
(
50
,
50
),
self
.
location
(
50
,
1
),
self
.
location
(
50
,
2
),
self
.
location
(
50
,
-
1
)
]
labels
=
[
0
,
0
,
0
,
1
,
1
,
1
,
-
1
,
2
,
2
,
2
]
exp_res
=
{
0
:
locations
[
0
:
3
],
1
:
locations
[
3
:
6
],
-
1
:
locations
[
6
:
7
],
2
:
locations
[
7
:
10
]}
res
=
clusterer
.
cluster_dataset
(
locations
,
self
.
get_location_features
())
print
(
res
)
self
.
assertHaveLabelsAsNewKey
(
locations
,
labels
)
self
.
assertClusteringResult
(
exp_res
,
res
)
...
...
src/data-hub/role-stage-discovery-microservice/app/visualizations/vis_cluster_results.py
0 → 100644
View file @
d728e14e
# clustering of generated nodes
import
sys
import
os
# Put './' on the module search path (at index 1, i.e. right after the first
# entry) when that path exists relative to the current working directory, so
# the `processing` package imported further down can be resolved.
# NOTE(review): presumably the script is meant to be started from the app
# directory -- confirm.
modules_path = './'
if os.path.exists(modules_path):
    sys.path.insert(1, modules_path)
import
matplotlib.pyplot
as
plt
import
sklearn.datasets
import
numpy
as
np
from
processing.clustering.clusterer
import
Clusterer
# Parameters for data generation.
# Sample count for a generated blob (presumably per blob -- confirm).
N_SAMPLES = 20
# Dimensionality of every generated point; handed to make_blobs as n_features.
N_FEATURES = 2
# NOTE(review): N_CENTERS and STD_DEVIATION are not referenced in the visible
# part of this script; the blob centers and spreads are spelled out as
# literals in the main section instead.
N_CENTERS = 3
STD_DEVIATION = 1.0
def show_generated_data(ax, nodes, labels):
    '''Draws the generated points on ax, one colour per distinct label.

    ax: matplotlib axes the scatter plot is drawn on.
    nodes: 2D array indexed as nodes[:, 0] (x values) and nodes[:, 1] (y values).
    labels: iterable with one hashable, sortable label per row of nodes.
    '''
    # Map every distinct label to its own colour explicitly instead of using
    # the label value as an index into the colour table: that only worked for
    # labels 0..k-1 (gaps raised IndexError, negative labels silently wrapped).
    # For labels 0..k-1 the resulting colours are the same as before.
    unique_labels = sorted(set(labels))
    distinct_colors = plt.cm.rainbow(np.linspace(0, 1, len(unique_labels)))
    color_of = dict(zip(unique_labels, distinct_colors))
    colors = [color_of[label] for label in labels]

    ax.set_title('Generated Dataset')
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
    ax.scatter(nodes[:, 0], nodes[:, 1], c=colors)
def show_clustering_result(ax, min_pts, clusters: dict):
    '''Draws a clustering result on ax, one colour per cluster and noise in black.

    ax: matplotlib axes the scatter plot is drawn on.
    min_pts: MinPts value used for the clustering; only shown in the title.
    clusters: maps a cluster label to the list of nodes in that cluster. Every
        node is a dict holding its coordinates under the keys '1' and '2'.
        The label -1 is treated as noise.
    '''
    # One rainbow colour per real cluster. Sorting keeps the assignment
    # identical to plain index lookup when the labels are 0..k-1.
    cluster_labels = sorted(label for label in clusters if label != -1)
    palette = plt.cm.rainbow(np.linspace(0, 1, len(cluster_labels)))
    color_of = dict(zip(cluster_labels, palette))
    if -1 in clusters:
        # Noise is always opaque black. This is explicit now instead of
        # relying on index -1 hitting a colour appended to the table's end.
        color_of[-1] = np.array([0.0, 0.0, 0.0, 1.0])

    # Flatten the dict values and pick each node's colour from the key it is
    # stored under, so the plot does not depend on a per-node label entry.
    nodes = []
    colors = []
    for label, subset in clusters.items():
        for node in subset:
            nodes.append(node)
            colors.append(color_of[label])

    ax.set_title(f'Clustering Result with MinPts={min_pts}')
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
    ax.scatter([n['1'] for n in nodes],
               [n['2'] for n in nodes],
               c=colors)
def run_clustering(min_points, dataset):
    '''Clusters dataset on its '1' and '2' entries and returns the result.

    A new Clusterer is built with the given min_points for every call; the
    return value is whatever its cluster_dataset method returns.
    '''
    feature_names = ['1', '2']
    worker = Clusterer(min_points=min_points)
    return worker.cluster_dataset(dataset=dataset, features=feature_names)
def main():
    '''Plots two generated blobs next to their clustering for several MinPts values.

    The figure is a 2x2 grid: the generated data in the first cell and one
    clustering result for each of MinPts = 5, 10 and 15 in the other three.
    '''
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)

    # Two blobs with different spread: std 1 around (5, 5), std 5 around (30, 5).
    # The sample count comes from N_SAMPLES rather than a repeated literal.
    nodes, labels = sklearn.datasets.make_blobs(
        n_samples=N_SAMPLES, n_features=N_FEATURES,
        centers=[[5, 5]], cluster_std=1)
    nodes2, labels2 = sklearn.datasets.make_blobs(
        n_samples=N_SAMPLES, n_features=N_FEATURES,
        centers=[[30, 5]], cluster_std=5)

    nodes = np.append(nodes, nodes2, axis=0)
    # Offset the second blob's labels by one to keep them apart from the first.
    labels = np.append(labels, labels2 + 1)

    show_generated_data(ax1, nodes, labels)

    for min_pts, ax in zip([5, 10, 15], [ax2, ax3, ax4]):
        # Build fresh dicts for every run.
        # NOTE(review): cluster_dataset may annotate the dicts it is given,
        # so they are deliberately not shared between runs -- confirm.
        dataset = [{'1': n[0], '2': n[1]} for n in nodes]
        clusters = run_clustering(min_pts, dataset)
        show_clustering_result(ax, min_pts, clusters)

    # Lay out the figure only after all titles and axis labels exist, so the
    # spacing accounts for them.
    fig.tight_layout(pad=3.0)
    plt.show()


if __name__ == '__main__':
    main()
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment