Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
S
SMART
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
3
Issues
3
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Registry
Registry
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
UNI-KLU
SMART
Commits
71008e36
Commit
71008e36
authored
Mar 31, 2020
by
Alexander Lercher
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'feature/generic-clustering' into develop
parents
6f8b2f9c
cc03bae0
Changes
21
Hide whitespace changes
Inline
Side-by-side
Showing
21 changed files
with
634 additions
and
34 deletions
+634
-34
clustering.yaml
...munity-detection-microservice/app/configs/clustering.yaml
+8
-0
swagger.yml
.../community-detection-microservice/app/configs/swagger.yml
+199
-12
agi_repository.py
...unity-detection-microservice/app/db/agi/agi_repository.py
+18
-0
__init__.py
...munity-detection-microservice/app/db/entities/__init__.py
+3
-1
cluster.py
...mmunity-detection-microservice/app/db/entities/cluster.py
+30
-2
clusterset.py
...nity-detection-microservice/app/db/entities/clusterset.py
+43
-0
layer.py
...community-detection-microservice/app/db/entities/layer.py
+34
-0
repository.py
...hub/community-detection-microservice/app/db/repository.py
+54
-2
clusterer.py
...ction-microservice/app/processing/clustering/clusterer.py
+55
-9
clustering_config.py
...croservice/app/processing/clustering/clustering_config.py
+45
-0
requirements.txt
...hub/community-detection-microservice/app/requirements.txt
+2
-0
cluster.py
...ub/community-detection-microservice/app/routes/cluster.py
+1
-1
clustersets.py
...ommunity-detection-microservice/app/routes/clustersets.py
+18
-0
debug.py
...-hub/community-detection-microservice/app/routes/debug.py
+0
-0
functions.py
.../community-detection-microservice/app/routes/functions.py
+0
-0
layers.py
...hub/community-detection-microservice/app/routes/layers.py
+31
-0
location.py
...b/community-detection-microservice/app/routes/location.py
+0
-0
user_cluster.py
...mmunity-detection-microservice/app/routes/user_cluster.py
+0
-0
run_clustering.py
...ub/community-detection-microservice/app/run_clustering.py
+32
-5
test_clusterer.py
...munity-detection-microservice/app/tests/test_clusterer.py
+42
-2
test_clustering_config.py
...etection-microservice/app/tests/test_clustering_config.py
+19
-0
No files found.
src/data-hub/community-detection-microservice/app/configs/clustering.yaml
0 → 100644
View file @
71008e36
layers
:
user
:
properties
:
starting-point
:
properties
:
-
Latitude_StartingPoint
-
Longitude_StartingPoint
\ No newline at end of file
src/data-hub/community-detection-microservice/app/configs/swagger.yml
View file @
71008e36
...
...
@@ -14,7 +14,7 @@ basePath: "/api"
paths
:
/debug
:
post
:
operationId
:
"
r
est
.debug.echo"
operationId
:
"
r
outes
.debug.echo"
tags
:
-
"
Echo"
summary
:
"
Echo
function
for
debugging
purposes"
...
...
@@ -29,9 +29,11 @@ paths:
200
:
description
:
"
Successful
echo
of
request
data"
# Locations
# TODO remove
/locations
:
post
:
operationId
:
"
r
est
.location.post"
operationId
:
"
r
outes
.location.post"
tags
:
-
"
Locations"
summary
:
"
Add
new
location
data"
...
...
@@ -48,7 +50,7 @@ paths:
400
:
description
:
"
Invalid
input"
get
:
operationId
:
"
r
est
.location.get"
operationId
:
"
r
outes
.location.get"
tags
:
-
"
Locations"
summary
:
"
Get
location
data"
...
...
@@ -61,7 +63,7 @@ paths:
/location-collections
:
post
:
operationId
:
"
r
est
.location.post_many"
operationId
:
"
r
outes
.location.post_many"
tags
:
-
"
Locations"
summary
:
"
Add
new
location
data
collection"
...
...
@@ -78,9 +80,77 @@ paths:
400
:
description
:
"
Invalid
input"
# Layers
/layers
:
post
:
operationId
:
"
routes.layers.post"
tags
:
-
"
Layers"
summary
:
"
Add
a
new
layer
or
overwrite
an
existing
one"
parameters
:
-
in
:
body
name
:
"
Layer"
description
:
"
The
layer
data
to
be
added"
required
:
true
schema
:
$ref
:
"
#/definitions/Layer-UpperCase"
responses
:
201
:
description
:
"
Successful
operation"
400
:
description
:
"
Invalid
input"
get
:
operationId
:
"
routes.layers.get"
tags
:
-
"
Layers"
summary
:
"
Get
all
layer
data"
parameters
:
[]
responses
:
200
:
description
:
"
Successful
operation"
schema
:
$ref
:
"
#/definitions/LayerCollection"
/layers/names
:
get
:
operationId
:
"
routes.layers.get_names"
tags
:
-
"
Layers"
summary
:
"
Get
all
layer
names"
parameters
:
[]
responses
:
200
:
description
:
"
Successful
operation"
schema
:
type
:
array
items
:
type
:
string
/layers/{name}
:
get
:
operationId
:
"
routes.layers.get_by_name"
tags
:
-
"
Layers"
summary
:
"
Get
layer
data
for
layer-name"
parameters
:
-
name
:
"
name"
in
:
"
path"
description
:
"
Name
of
the
layer
to
return"
required
:
true
type
:
"
string"
responses
:
200
:
description
:
"
Successful
operation"
schema
:
$ref
:
"
#/definitions/Layer"
404
:
description
:
"
Layer
not
found"
# Clusters
# TODO remove partially
/location-clusters
:
get
:
operationId
:
"
r
est
.cluster.get_locations"
operationId
:
"
r
outes
.cluster.get_locations"
tags
:
-
"
Clusters"
summary
:
"
Get
user
communities
clustered
by
location"
...
...
@@ -93,7 +163,7 @@ paths:
# /clusters/cluster.png:
# get:
# operationId: "r
est
.cluster.get_image"
# operationId: "r
outes
.cluster.get_image"
# tags:
# - "Clusters"
# summary: "Get user communities per date per hour as image"
...
...
@@ -106,7 +176,7 @@ paths:
/time-clusters
:
get
:
operationId
:
"
r
est
.cluster.get_times"
operationId
:
"
r
outes
.cluster.get_times"
tags
:
-
"
Clusters"
summary
:
"
Get
user
communities
clustered
by
time
per
hour"
...
...
@@ -119,7 +189,7 @@ paths:
# /agi/clusters/cluster.png:
# get:
# operationId: "r
est
.agi_cluster.get_image"
# operationId: "r
outes
.agi_cluster.get_image"
# tags:
# - "Clusters"
# summary: "Get user communities per date per hour from agi data as image"
...
...
@@ -130,9 +200,58 @@ paths:
# 200:
# description: "Successful operation"
/clustersets
:
get
:
operationId
:
"
routes.clustersets.get"
tags
:
-
"
Clusters"
summary
:
"
Get
clustersets
for
all
layers"
parameters
:
[]
responses
:
200
:
description
:
"
Successful
operation"
schema
:
$ref
:
"
#/definitions/ClusterSetCollection"
/clustersets/names
:
get
:
operationId
:
"
routes.clustersets.get_names"
tags
:
-
"
Clusters"
summary
:
"
Get
clusterset
names
for
all
layers"
parameters
:
[]
responses
:
200
:
description
:
"
Successful
operation"
schema
:
type
:
array
items
:
type
:
string
/clustersets/{name}
:
get
:
operationId
:
"
routes.clustersets.get_by_name"
tags
:
-
"
Clusters"
summary
:
"
Get
clusterset
for
layer-name"
parameters
:
-
name
:
"
name"
in
:
"
path"
description
:
"
Name
of
the
layer
to
return
the
clusterset
for"
required
:
true
type
:
"
string"
responses
:
200
:
description
:
"
Successful
operation"
schema
:
$ref
:
"
#/definitions/ClusterSet"
404
:
description
:
"
Clusterset
not
found"
# TODO remove
/user-cluster-graphs
:
get
:
operationId
:
"
r
est
.user_cluster.get"
operationId
:
"
r
outes
.user_cluster.get"
tags
:
-
"
User
Graphs"
summary
:
"
Get
user
graphs
per
layer
per
cluster"
...
...
@@ -142,10 +261,11 @@ paths:
description
:
"
Successful
operation"
schema
:
$ref
:
"
#/definitions/UserClusterGraphCollection"
# Function Calls
/rfc/run
:
post
:
operationId
:
"
r
est
.functions.run_agi_clustering_and_graph_creation"
operationId
:
"
r
outes
.functions.run_agi_clustering_and_graph_creation"
tags
:
-
"
Remote
function
calls"
summary
:
"
Insert
locations
from
AGI,
create
clusters
for
starting
time
and
location
layers,
create
graphs
for
the
location
clusters"
...
...
@@ -154,6 +274,7 @@ paths:
204
:
description
:
"
Successful
operation"
definitions
:
Location
:
type
:
"
object"
...
...
@@ -176,6 +297,24 @@ definitions:
items
:
$ref
:
"
#/definitions/Location"
Cluster
:
type
:
object
properties
:
cluster_label
:
type
:
number
nodes
:
type
:
array
items
:
type
:
object
example
:
"
Finished_time"
:
1576631193265951
"
Latitude_Destination"
:
-5.973257
"
Longitude_Destination"
:
37.416316
"
TravelID"
:
"
5e57ec9159bc0668543f156a"
"
TravelPrice"
:
15
"
UniqueID"
:
"
a95075f5042b1b27060080156d87fe34ec7e712c5e57ec9159bc0668543f156a"
"
UserID"
:
"
a95075f5042b1b27060080156d87fe34ec7e712c"
LocationCluster
:
type
:
object
properties
:
...
...
@@ -235,4 +374,52 @@ definitions:
UserClusterGraphCollection
:
type
:
array
items
:
$ref
:
"
#/definitions/UserClusterGraph"
\ No newline at end of file
$ref
:
"
#/definitions/UserClusterGraph"
Layer-UpperCase
:
type
:
object
properties
:
LayerName
:
type
:
string
Nodes
:
type
:
array
items
:
type
:
object
Properties
:
type
:
array
items
:
type
:
string
Layer
:
type
:
object
properties
:
layer_name
:
type
:
string
nodes
:
type
:
array
items
:
type
:
object
properties
:
type
:
array
items
:
type
:
string
LayerCollection
:
type
:
array
items
:
$ref
:
"
#/definitions/Layer"
ClusterSet
:
type
:
object
properties
:
layer_name
:
type
:
string
clusters
:
type
:
array
items
:
$ref
:
"
#/definitions/Cluster"
ClusterSetCollection
:
type
:
array
items
:
$ref
:
"
#/definitions/ClusterSet"
\ No newline at end of file
src/data-hub/community-detection-microservice/app/db/agi/agi_repository.py
View file @
71008e36
...
...
@@ -29,6 +29,24 @@ class AgiRepository:
return
locations
def
getLocationsBasedOnNewDataSchema
(
self
):
'''Creates the new data generic schema to be used beginning on 24.03.2020'''
data
=
{
'layer_name'
:
'Destination'
,
'nodes'
:
self
.
getLocations
(),
'properties'
:
[
'latitude'
,
'longitude'
]
}
return
data
def
getTimesBasedOnNewDataSchema
(
self
):
'''Creates the new data generic schema to be used beginning on 24.03.2020'''
data
=
{
'layer_name'
:
'Starting_Time'
,
'nodes'
:
self
.
getLocations
(),
'properties'
:
[
'timestamp'
]
}
return
data
def
readDataFromFile
(
self
)
->
List
[
Dict
]:
with
open
(
'./db/agi/travels.json'
,
'r'
)
as
f_travels
:
travels
=
json
.
loads
(
f_travels
.
read
())
...
...
src/data-hub/community-detection-microservice/app/db/entities/__init__.py
View file @
71008e36
from
db.entities.location
import
Location
from
db.entities.popular_location
import
PopularLocation
from
db.entities.cluster
import
Cluster
,
LocationCluster
,
TimeCluster
from
db.entities.user_cluster_graph
import
UserClusterGraph
\ No newline at end of file
from
db.entities.clusterset
import
ClusterSet
from
db.entities.user_cluster_graph
import
UserClusterGraph
from
db.entities.layer
import
Layer
src/data-hub/community-detection-microservice/app/db/entities/cluster.py
View file @
71008e36
...
...
@@ -4,10 +4,38 @@ from datetime import date, datetime
class
Cluster
:
def
__init__
(
self
,
cluster_label
:
int
=
None
,
nodes
:
List
=
None
):
'''
A cluster for an arbitrary layer containing some nodes.
:param cluster_label: The label of the cluster unique for the layer
:param nodes: The individual nodes of the cluster
'''
def
__init__
(
self
,
cluster_label
:
int
=
None
,
nodes
:
List
=
None
,
cluster_dict
:
Dict
=
None
,
from_db
=
False
):
self
.
cluster_label
=
cluster_label
self
.
nodes
=
nodes
if
cluster_dict
is
not
None
:
self
.
from_serializable_dict
(
cluster_dict
,
from_db
)
def
to_serializable_dict
(
self
,
for_db
=
False
)
->
Dict
:
return
{
"cluster_label"
:
self
.
cluster_label
,
"nodes"
:
json
.
dumps
(
self
.
nodes
)
if
for_db
else
self
.
nodes
}
def
from_serializable_dict
(
self
,
cluster_dict
:
Dict
,
from_db
=
False
):
self
.
cluster_label
=
cluster_dict
[
"cluster_label"
]
self
.
nodes
=
json
.
loads
(
cluster_dict
[
"nodes"
])
\
if
from_db
else
cluster_dict
[
"nodes"
]
def
__repr__
(
self
):
return
json
.
dumps
(
self
.
to_serializable_dict
())
def
__str__
(
self
):
return
f
"Cluster({self.__repr__()})"
class
LocationCluster
(
Cluster
):
def
__init__
(
self
,
cluster_label
:
int
=
None
,
nodes
:
List
=
None
,
...
...
@@ -67,7 +95,7 @@ class TimeCluster(Cluster):
if
from_db
else
time_dict
[
"nodes"
]
def
__repr__
(
self
):
return
json
.
dumps
(
self
.
to_serializable_dict
())
return
json
.
dumps
(
self
.
to_serializable_dict
(
True
))
def
__str__
(
self
):
return
f
"TimeCluster({self.__repr__()})"
src/data-hub/community-detection-microservice/app/db/entities/clusterset.py
0 → 100644
View file @
71008e36
import
json
from
db.entities.cluster
import
Cluster
from
typing
import
List
,
Dict
from
datetime
import
date
,
datetime
class ClusterSet:
    '''
    A clusterset for an arbitrary layer containing all clusters.

    :param layer_name: The name of the layer
    :param clusters: The individual clusters
    :param cluster_set_dict: Optional serialized clusterset; if given, its values replace layer_name and clusters
    :param from_db: Set to True if cluster_set_dict uses the JSON-string encoding produced with for_db=True
    '''

    def __init__(self, layer_name: str = None, clusters: List['Cluster'] = None,
                 cluster_set_dict: Dict = None, from_db=False):
        self.layer_name = layer_name
        self.clusters = clusters

        if cluster_set_dict is not None:
            self.from_serializable_dict(cluster_set_dict, from_db)

    def to_serializable_dict(self, for_db=False) -> Dict:
        '''
        Converts the clusterset into a dictionary.

        :param for_db: If True, every cluster is serialized with for_db=True and the list of clusters is encoded as one JSON string
        :returns: A dictionary with the keys "layer_name" and "clusters"
        '''
        # clusters defaults to None in the constructor; serialize that as an empty list instead of failing
        clusters = self.clusters if self.clusters is not None else []
        serialized_dict_clusters = [cluster.to_serializable_dict(for_db) for cluster in clusters]

        return {
            "layer_name": self.layer_name,
            "clusters": json.dumps(serialized_dict_clusters) if for_db else serialized_dict_clusters
        }

    def from_serializable_dict(self, cluster_set_dict: Dict, from_db=False):
        '''
        Restores layer_name and clusters from a dictionary created by to_serializable_dict.

        :param cluster_set_dict: Dictionary with the keys "layer_name" and "clusters"
        :param from_db: Set to True if "clusters" is a JSON string, i.e. the dictionary was created with for_db=True
        '''
        self.layer_name = cluster_set_dict["layer_name"]

        serialized_dict_clusters = json.loads(cluster_set_dict["clusters"]) \
            if from_db else cluster_set_dict["clusters"]
        # the same flag is forwarded so each cluster decodes its own payload accordingly
        self.clusters = [Cluster(cluster_dict=cluster_dict, from_db=from_db)
                         for cluster_dict in serialized_dict_clusters]

    def __repr__(self):
        return json.dumps(self.to_serializable_dict())

    def __str__(self):
        return f"ClusterSet({self.__repr__()})"
src/data-hub/community-detection-microservice/app/db/entities/layer.py
0 → 100644
View file @
71008e36
import
json
from
datetime
import
datetime
from
typing
import
Dict
class Layer:
    '''
    This class represents a single layer of the Multilayer Graph.

    :param layer_info: Information as dictionary to restore the layer object.
    :param from_db: Set to True if the nodes in layer_info are encoded as a JSON string
    '''

    def __init__(self, layer_info: Dict = None, from_db=False):
        # Defaults keep a Layer created without layer_info serializable and printable.
        self.layer_name = None
        self.properties = None
        self.nodes = None

        if layer_info is not None:
            self.from_serializable_dict(layer_info, from_db)

    def to_serializable_dict(self, for_db=False) -> Dict:
        '''
        Converts the layer into a dictionary.

        :param for_db: If True, the nodes are encoded as a JSON string
        :returns: A dictionary with the keys "layer_name", "properties" and "nodes"
        '''
        return {
            "layer_name": self.layer_name,
            "properties": self.properties,
            "nodes": json.dumps(self.nodes) if for_db else self.nodes
        }

    def from_serializable_dict(self, layer_info: Dict, from_db=False):
        '''
        Restores the layer from a dictionary created by to_serializable_dict.

        :param layer_info: Dictionary with the keys 'layer_name', 'properties' and 'nodes'
        :param from_db: Set to True if 'nodes' is a JSON string
        :raises KeyError: if one of the keys is missing
        '''
        self.layer_name = layer_info['layer_name']
        self.properties = layer_info['properties']
        self.nodes = json.loads(layer_info["nodes"]) \
            if from_db else layer_info["nodes"]

    def __repr__(self):
        return json.dumps(self.to_serializable_dict())

    def __str__(self):
        return f"Layer({self.__repr__()})"
src/data-hub/community-detection-microservice/app/db/repository.py
View file @
71008e36
...
...
@@ -5,12 +5,12 @@ import json
from
db.agi.agi_repository
import
AgiRepository
from
db.entities
import
Location
,
TimeCluster
,
PopularLocation
,
LocationCluster
,
UserClusterGraph
from
db.entities
import
*
from
typing
import
List
class
Repository
(
MongoRepositoryBase
):
'''This
repository stores and loads locations and clusters with
MongoDb.'''
'''This
is a repository for
MongoDb.'''
def
__init__
(
self
):
super
()
.
__init__
(
netconst
.
COMMUNITY_DETECTION_DB_HOSTNAME
,
...
...
@@ -21,9 +21,12 @@ class Repository(MongoRepositoryBase):
self
.
_location_cluster_collection
=
'location_cluster'
self
.
_time_cluster_collection
=
'time_cluster'
self
.
_user_cluster_graph_collection
=
'user_cluster_graph'
self
.
_layer_collection
=
'layer'
self
.
_clusterset_collection
=
'cluster_set'
self
.
agi_repo
=
AgiRepository
()
#region Location
def
add_location
(
self
,
location
:
Location
):
super
()
.
insert_entry
(
self
.
_location_collection
,
location
.
to_serializable_dict
())
...
...
@@ -34,7 +37,9 @@ class Repository(MongoRepositoryBase):
def
get_agi_locations
(
self
)
->
List
[
Location
]:
agi_locations
=
self
.
agi_repo
.
getLocations
()
return
[
Location
(
agi_loc
)
for
agi_loc
in
agi_locations
]
#endregion
#region Specific Clusters
def
add_location_cluster
(
self
,
cluster
:
LocationCluster
):
super
()
.
insert_entry
(
self
.
_location_cluster_collection
,
cluster
.
to_serializable_dict
(
for_db
=
True
))
...
...
@@ -50,7 +55,9 @@ class Repository(MongoRepositoryBase):
def
get_time_clusters
(
self
)
->
List
[
TimeCluster
]:
clusters
=
super
()
.
get_entries
(
self
.
_time_cluster_collection
)
return
[
TimeCluster
(
time_dict
=
c
,
from_db
=
True
)
for
c
in
clusters
]
#endregion
#region Cluster Graph
def
add_user_cluster_graph
(
self
,
user_graph
:
UserClusterGraph
):
super
()
.
insert_entry
(
self
.
_user_cluster_graph_collection
,
user_graph
.
to_serializable_dict
(
for_db
=
True
))
...
...
@@ -58,3 +65,48 @@ class Repository(MongoRepositoryBase):
def
get_user_cluster_graphs
(
self
)
->
List
[
UserClusterGraph
]:
user_graphs
=
super
()
.
get_entries
(
self
.
_user_cluster_graph_collection
)
return
[
UserClusterGraph
(
dict_
=
u
,
from_db
=
True
)
for
u
in
user_graphs
]
#endregion
#region Layers
def
add_layer
(
self
,
layer
:
Layer
):
super
()
.
insert_entry
(
self
.
_layer_collection
,
layer
.
to_serializable_dict
())
def
get_layers
(
self
)
->
List
[
Layer
]:
entries
=
super
()
.
get_entries
(
self
.
_layer_collection
)
return
[
Layer
(
e
)
for
e
in
entries
]
def
get_layer_names
(
self
)
->
List
[
str
]:
entries
=
super
()
.
get_entries
(
self
.
_layer_collection
,
projection
=
{
'layer_name'
:
1
})
return
[
e
[
'layer_name'
]
for
e
in
entries
]
def
get_layer
(
self
,
layer_name
)
->
Layer
:
entries
=
super
()
.
get_entries
(
self
.
_layer_collection
,
selection
=
{
'layer_name'
:
layer_name
})
entries
=
[
Layer
(
e
)
for
e
in
entries
]
if
entries
is
not
None
and
len
(
entries
)
>
0
:
return
entries
[
0
]
else
:
return
None
#endregion
#region ClusterSet
def
add_clusterset
(
self
,
cluster_set
:
ClusterSet
):
super
()
.
insert_entry
(
self
.
_clusterset_collection
,
cluster_set
.
to_serializable_dict
())
def
get_clustersets
(
self
)
->
List
[
ClusterSet
]:
entries
=
super
()
.
get_entries
(
self
.
_clusterset_collection
)
return
[
ClusterSet
(
cluster_set_dict
=
e
)
for
e
in
entries
]
def
get_clusterset_names
(
self
)
->
List
[
str
]:
entries
=
super
()
.
get_entries
(
self
.
_clusterset_collection
,
projection
=
{
'layer_name'
:
1
})
return
[
e
[
'layer_name'
]
for
e
in
entries
]
def
get_clusterset
(
self
,
layer_name
)
->
ClusterSet
:
entries
=
super
()
.
get_entries
(
self
.
_clusterset_collection
,
selection
=
{
'layer_name'
:
layer_name
})
entries
=
[
ClusterSet
(
cluster_set_dict
=
e
)
for
e
in
entries
]
if
entries
is
not
None
and
len
(
entries
)
>
0
:
return
entries
[
0
]
else
:
return
None
#endregion
src/data-hub/community-detection-microservice/app/processing/clusterer.py
→
src/data-hub/community-detection-microservice/app/processing/cluster
ing/cluster
er.py
View file @
71008e36
...
...
@@ -2,9 +2,20 @@ import json
import
numpy
as
np
import
matplotlib.pyplot
as
plt
from
sklearn.cluster
import
DBSCAN
from
typing
import
List
,
Dict
from
typing
import
List
,
Dict
,
Any
,
TypeVar
from
deprecated
import
deprecated
T
=
TypeVar
(
'T'
)
ClusterGroup
=
Dict
[
Any
,
List
[
Dict
]]
class
Clusterer
:
'''
Clusterer for applying density-based clustering on datasets.
The clustering is done with DBSCAN.
:param epsilon: Epsilon used in DBSCAN
:param min_points: Min_points used in DBSCAN
'''
def
__init__
(
self
,
epsilon
=
11
,
min_points
=
2
):
self
.
epsilon
=
epsilon
self
.
min_points
=
min_points
...
...
@@ -43,7 +54,8 @@ class Clusterer:
return
fig
def
create_labels
(
self
,
features
:
np
.
ndarray
)
->
List
:
def
create_labels
(
self
,
features
:
np
.
ndarray
)
->
List
[
int
]:
'''Creates labels for the items based on DBSCAN.'''
if
features
is
None
or
len
(
features
)
==
0
:
return
features
# trash in trash out
...
...
@@ -53,13 +65,25 @@ class Clusterer:
return
labels
.
tolist
()
@
deprecated
(
reason
=
"Use generic version instead"
)
def
extract_location_features
(
self
,
locations
:
List
[
dict
])
->
np
.
ndarray
:
return
np
.
asarray
([(
float
(
l
[
'latitude'
]),
float
(
l
[
'longitude'
]))
for
l
in
locations
])
@
deprecated
(
reason
=
"Use generic version instead"
)
def
extract_time_features
(
self
,
times
:
List
[
Dict
])
->
np
.
ndarray
:
return
np
.
asarray
([((
t
[
'timestamp'
]),
0
)
for
t
in
times
])
return
np
.
asarray
([[
float
(
t
[
'timestamp'
])]
for
t
in
times
])
def
_extract_features
(
self
,
dataset
:
List
[
Dict
],
features
:
List
[
str
])
->
np
.
ndarray
:
'''Extracts the feature values from the dataset into a np array with same order as original dataset.'''
extracted_features
=
[]
for
data
in
dataset
:
entry
=
[
float
(
data
[
feature
])
for
feature
in
features
]
extracted_features
.
append
(
entry
)
def
label_dataset
(
self
,
dataset
:
List
[
Dict
],
labels
:
List
)
->
List
:
return
np
.
asarray
(
extracted_features
)
def
label_dataset
(
self
,
dataset
:
List
[
Dict
],
labels
:
List
[
Any
])
->
List
:
'''Adds the labels to the elements of the dataset at the same position. The new key is called cluster_label.'''
if
dataset
is
None
or
labels
is
None
:
return
...
...
@@ -67,16 +91,21 @@ class Clusterer:
raise
ValueError
(
"dataset and labels has to have same length"
)
for
i
in
range
(
len
(
dataset
)):
if
'cluster_label'
in
dataset
[
i
]:
continue
dataset
[
i
][
'cluster_label'
]
=
labels
[
i
]
def
group_by_clusters
(
self
,
dataset
:
List
[
Dict
],
labels
:
List
)
->
Dict
[
int
,
List
[
Dict
]]:
def
group_by_clusters
(
self
,
dataset
:
List
[
Dict
],
labels
:
List
[
Any
])
->
ClusterGroup
:
self
.
label_dataset
(
dataset
,
labels
)
clusters
=
{}
for
label
in
labels
:
clusters
[
label
]
=
[
ds
for
ds
in
dataset
if
ds
[
'cluster_label'
]
==
label
]
return
clusters
def
cluster_locations
(
self
,
locations
:
List
[
Dict
])
->
Dict
[
int
,
List
[
Dict
]]:
@
deprecated
(
reason
=
"Use generic version instead"
)
def
cluster_locations
(
self
,
locations
:
List
[
Dict
])
->
ClusterGroup
:
'''Returns a dictionary with identified clusters and their locations copied from the input'''
if
locations
is
None
or
len
(
locations
)
==
0
:
# raise Exception("locations has to contain something")
...
...
@@ -88,12 +117,29 @@ class Clusterer:
self
.
label_dataset
(
locations
,
labels
)
return
self
.
group_by_clusters
(
locations
,
labels
)
def
cluster_times
(
self
,
times
:
List
[
Dict
])
->
Dict
[
int
,
List
[
Dict
]]:
@
deprecated
(
reason
=
"Use generic version instead"
)
def
cluster_times
(
self
,
times
:
List
[
Dict
])
->
ClusterGroup
:
'''Returns a dictionary with identified clusters and their times copied from the input'''
features
=
self
.
extract_time_features
(
times
)
labels
=
self
.
create_labels
(
features
)
self
.
label_dataset
(
times
,
labels
)
return
self
.
group_by_clusters
(
times
,
labels
)
\ No newline at end of file
return
self
.
group_by_clusters
(
times
,
labels
)
def
cluster_dataset
(
self
,
dataset
:
List
[
Dict
],
features
:
List
[
str
])
->
ClusterGroup
:
'''
Returns the identified clusters containing a subset of nodes from the dataset.
:param dataset: The nodes to assign to clusters
:param features: The feature names of the nodes to use for clustering
:returns: A dictionary of clusters, where each value is a non-empty subset of dataset if dataset was not empty
'''
arr
=
self
.
_extract_features
(
dataset
,
features
)
labels
=
self
.
create_labels
(
arr
)
return
self
.
group_by_clusters
(
dataset
,
labels
)
src/data-hub/community-detection-microservice/app/processing/clustering/clustering_config.py
0 → 100644
View file @
71008e36
import
yaml
from
typing
import
Generator
### init logging ###
import
logging
LOG_FORMAT
=
(
'
%(levelname) -5
s
%(asctime)
s
%(name)
s:
%(funcName) -35
s
%(lineno) -5
d:
%(message)
s'
)
logging
.
basicConfig
(
level
=
logging
.
INFO
,
format
=
LOG_FORMAT
)
LOGGER
=
logging
.
getLogger
(
__name__
)
class ClusteringConfig:
    '''Contains the configuration for the clustering algorithm defined in configs/clustering.yaml.'''

    # Path of the YAML file; it is opened as given, i.e. relative to the current working directory.
    config_path = 'configs/clustering.yaml'
    config: dict = None

    def __init__(self):
        self.config = self._load_config()

    def _load_config(self) -> dict:
        '''
        Loads the whole configuration from file.

        :returns: The parsed configuration, or an empty dict if the file is empty or contains invalid YAML
        '''
        config = None
        with open(self.config_path, 'r') as stream:
            try:
                config = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                LOGGER.error(exc)
                config = {}

        # safe_load yields None for an empty document; normalize so callers always get a dict
        return config if config is not None else {}

    def get_config(self):
        '''Returns the whole configuration as loaded from file.'''
        return self.config

    def get_layer_configs(self) -> Generator[dict, None, None]:
        """
        Returns a generator for the individual layer configs.
        Layer configs are dicts including a layer-name.
        The yielded dicts are copies, so the stored configuration is not modified.
        """
        layers = (self.config or {}).get('layers') or {}
        for key, layer in layers.items():
            # a layer without any settings is parsed as None
            layer_config = dict(layer) if layer is not None else {}
            layer_config['layer-name'] = key
            yield layer_config
src/data-hub/community-detection-microservice/app/requirements.txt
View file @
71008e36
...
...
@@ -5,9 +5,11 @@ certifi==2019.11.28
chardet==3.0.4
Click==7.0
clickclick==1.2.2
colorama==0.4.3
connexion==2.6.0
cycler==0.10.0
decorator==4.4.1
Deprecated==1.2.7
Flask==1.1.1
idna==2.8
importlib-metadata==1.5.0
...
...
src/data-hub/community-detection-microservice/app/r
est
/cluster.py
→
src/data-hub/community-detection-microservice/app/r
outes
/cluster.py
View file @
71008e36
import
io
from
flask
import
request
,
Response
from
db.repository
import
Repository
from
processing.clusterer
import
Clusterer
from
processing.cluster
ing.cluster
er
import
Clusterer
from
matplotlib.backends.backend_agg
import
FigureCanvasAgg
as
FigureCanvas
repo
=
Repository
()
...
...
src/data-hub/community-detection-microservice/app/routes/clustersets.py
0 → 100644
View file @
71008e36
from
flask
import
request
,
Response
from
db.repository
import
Repository
from
db.entities
import
ClusterSet
repo
=
Repository
()
def get():
    '''Returns the serializable dicts of all clustersets found in the repository.'''
    serialized_clustersets = []
    for cluster_set in repo.get_clustersets():
        serialized_clustersets.append(cluster_set.to_serializable_dict())
    return serialized_clustersets
def get_names():
    '''Returns the clusterset names exactly as delivered by the repository.'''
    clusterset_names = repo.get_clusterset_names()
    return clusterset_names
def get_by_name(name):
    '''
    Returns the clusterset the repository holds for the given name.

    :param name: Passed to the repository to look up the clusterset
    :returns: The serializable dict of the clusterset, or an empty 404 response if the repository returns None
    '''
    cluster_set = repo.get_clusterset(name)
    if cluster_set is None:
        return Response(status=404)
    return cluster_set.to_serializable_dict()
\ No newline at end of file
src/data-hub/community-detection-microservice/app/r
est
/debug.py
→
src/data-hub/community-detection-microservice/app/r
outes
/debug.py
View file @
71008e36
File moved
src/data-hub/community-detection-microservice/app/r
est
/functions.py
→
src/data-hub/community-detection-microservice/app/r
outes
/functions.py
View file @
71008e36
File moved
src/data-hub/community-detection-microservice/app/routes/layers.py
0 → 100644
View file @
71008e36
from
flask
import
request
,
Response
from
db.repository
import
Repository
from
db.entities
import
Layer
repo
=
Repository
()
def post():
    '''
    Stores the layer sent as JSON request body.

    :returns: An empty 201 response on success, or an empty 400 response if the body is
        not a JSON object or lacks one of the keys 'LayerName', 'Nodes' and 'Properties'
    '''
    body = request.json

    # Reject incomplete input up front; otherwise the key conversion fails with an unhandled error.
    required_keys = ('LayerName', 'Nodes', 'Properties')
    if not isinstance(body, dict) or any(key not in body for key in required_keys):
        return Response(status=400)

    _insert_layer(body)
    return Response(status=201)
def _insert_layer(layer_data: dict):
    '''
    Renames the upper-case keys of the given layer data in place and adds the layer to the repository.

    :param layer_data: Dict with the keys 'LayerName', 'Nodes' and 'Properties'; it is modified by this call
    :raises KeyError: if one of the expected keys is missing
    '''
    # convert object keys from ext source
    key_mapping = (
        ('LayerName', 'layer_name'),
        ('Nodes', 'nodes'),
        ('Properties', 'properties'),
    )
    for external_key, internal_key in key_mapping:
        layer_data[internal_key] = layer_data.pop(external_key)

    repo.add_layer(Layer(layer_data))
def get():
    '''Returns the serializable dicts of all layers found in the repository.'''
    serialized_layers = []
    for layer in repo.get_layers():
        serialized_layers.append(layer.to_serializable_dict())
    return serialized_layers
def get_names():
    '''Returns the layer names exactly as delivered by the repository.'''
    layer_names = repo.get_layer_names()
    return layer_names
def get_by_name(name):
    '''
    Returns the layer the repository holds for the given name.

    :param name: Passed to the repository to look up the layer
    :returns: The serializable dict of the layer, or an empty 404 response if the repository returns None
    '''
    layer = repo.get_layer(name)
    if layer is None:
        return Response(status=404)
    return layer.to_serializable_dict()
\ No newline at end of file
src/data-hub/community-detection-microservice/app/r
est
/location.py
→
src/data-hub/community-detection-microservice/app/r
outes
/location.py
View file @
71008e36
File moved
src/data-hub/community-detection-microservice/app/r
est
/user_cluster.py
→
src/data-hub/community-detection-microservice/app/r
outes
/user_cluster.py
View file @
71008e36
File moved
src/data-hub/community-detection-microservice/app/run_clustering.py
View file @
71008e36
...
...
@@ -4,15 +4,40 @@ modules_path = '../../../modules/'
if
os
.
path
.
exists
(
modules_path
):
sys
.
path
.
insert
(
1
,
modules_path
)
from
db.entities
import
Location
,
PopularLocation
,
LocationCluster
,
TimeCluster
from
db.entities
import
*
from
typing
import
List
,
Dict
,
Tuple
from
db.repository
import
Repository
from
processing.clusterer
import
Clusterer
from
db.repository
import
Repository
,
AgiRepository
from
processing.cluster
ing.cluster
er
import
Clusterer
DEBUG
=
False
repo
=
Repository
()
test_repo
=
AgiRepository
()
def
run_generic_clustering
():
'''Runs the clustering for all layers found in the repository.'''
all_layers
:
List
[
Layer
]
=
repo
.
get_layers
()
for
layer
in
all_layers
:
print
(
f
"Clustering {layer.layer_name}"
)
clusters
=
run_clustering_for_layer
(
layer
)
cluster_set
=
ClusterSet
(
layer
.
layer_name
,
clusters
)
repo
.
add_clusterset
(
cluster_set
)
def
run_clustering_for_layer
(
layer
:
Layer
)
->
List
[
Cluster
]:
clusterer
=
Clusterer
()
res
=
clusterer
.
cluster_dataset
(
layer
.
nodes
,
layer
.
properties
)
return
[
Cluster
(
key
,
value
)
for
key
,
value
in
res
.
items
()]
def
run_location_clustering
():
user_clusterer
=
Clusterer
()
...
...
@@ -74,5 +99,7 @@ def store_clusters(type: str, clusters: List):
if
__name__
==
"__main__"
:
run_location_clustering
()
run_time_clustering
()
run_generic_clustering
()
# run_location_clustering()
# run_time_clustering()
src/data-hub/community-detection-microservice/app/tests/test_clusterer.py
View file @
71008e36
import
unittest
import
sys
sys
.
path
.
insert
(
1
,
'../'
)
for
path
in
[
'../'
,
'./'
]:
sys
.
path
.
insert
(
1
,
path
)
# python -m unittest discover
from
processing.clusterer
import
Clusterer
from
processing.cluster
ing.cluster
er
import
Clusterer
class
TestClusterer
(
unittest
.
TestCase
):
clusterer
:
Clusterer
=
None
...
...
@@ -71,11 +72,50 @@ class TestClusterer(unittest.TestCase):
self
.
assertEqual
(
3
,
len
(
locations
))
self
.
assertHaveLabelsAsNewKey
(
locations
,
labels
)
def
test_cluster_locations_multInput_correctlyLabeled
(
self
):
locations
=
[
self
.
location
(
1
,
2
),
self
.
location
(
2
,
2
),
self
.
location
(
20
,
20
)]
labels
=
[
0
,
0
,
-
1
]
res
=
self
.
clusterer
.
cluster_locations
(
locations
)
self
.
assertHaveLabelsAsNewKey
(
locations
,
labels
)
self
.
assertDictEqual
(
res
,
{
0
:
[{
'latitude'
:
1
,
'longitude'
:
2
,
'cluster_label'
:
0
},
{
'latitude'
:
2
,
'longitude'
:
2
,
'cluster_label'
:
0
}],
-
1
:
[{
'latitude'
:
20
,
'longitude'
:
20
,
'cluster_label'
:
-
1
}]})
def
test_cluster_times_multInput_correctlyLabeled
(
self
):
times
=
[
self
.
time
(
123
),
self
.
time
(
128
),
self
.
time
(
223
)]
labels
=
[
0
,
0
,
-
1
]
res
=
self
.
clusterer
.
cluster_times
(
times
)
self
.
assertHaveLabelsAsNewKey
(
times
,
labels
)
self
.
assertDictEqual
(
res
,
{
0
:
[{
'timestamp'
:
123
,
'cluster_label'
:
0
},
{
'timestamp'
:
128
,
'cluster_label'
:
0
}],
-
1
:
[{
'timestamp'
:
223
,
'cluster_label'
:
-
1
}]})
def
test_cluster_dataset_locationsMultInput_correctlyLabeled
(
self
):
locations
=
[
self
.
location
(
1
,
2
),
self
.
location
(
2
,
2
),
self
.
location
(
20
,
20
)]
labels
=
[
0
,
0
,
-
1
]
res
=
self
.
clusterer
.
cluster_dataset
(
locations
,
[
'latitude'
,
'longitude'
])
self
.
assertHaveLabelsAsNewKey
(
locations
,
labels
)
self
.
assertDictEqual
(
res
,
{
0
:
[{
'latitude'
:
1
,
'longitude'
:
2
,
'cluster_label'
:
0
},
{
'latitude'
:
2
,
'longitude'
:
2
,
'cluster_label'
:
0
}],
-
1
:
[{
'latitude'
:
20
,
'longitude'
:
20
,
'cluster_label'
:
-
1
}]})
def
test_cluster_dataset_timesMultInput_correctlyLabeled
(
self
):
times
=
[
self
.
time
(
123
),
self
.
time
(
128
),
self
.
time
(
223
)]
labels
=
[
0
,
0
,
-
1
]
res
=
self
.
clusterer
.
cluster_dataset
(
times
,
[
'timestamp'
])
self
.
assertHaveLabelsAsNewKey
(
times
,
labels
)
self
.
assertDictEqual
(
res
,
{
0
:
[{
'timestamp'
:
123
,
'cluster_label'
:
0
},
{
'timestamp'
:
128
,
'cluster_label'
:
0
}],
-
1
:
[{
'timestamp'
:
223
,
'cluster_label'
:
-
1
}]})
# helper methods:
def
location
(
self
,
lat
,
long_
)
->
dict
:
return
{
'latitude'
:
lat
,
'longitude'
:
long_
}
def
time
(
self
,
ts
)
->
dict
:
return
{
'timestamp'
:
ts
}
def
assertHaveLabelsAsNewKey
(
self
,
locations
,
labels
):
for
i
in
range
(
len
(
locations
)):
self
.
assertEqual
(
labels
[
i
],
locations
[
i
][
'cluster_label'
])
...
...
src/data-hub/community-detection-microservice/app/tests/test_clustering_config.py
0 → 100644
View file @
71008e36
import
unittest
import
sys
for
path
in
[
'../'
,
'./'
]:
sys
.
path
.
insert
(
1
,
path
)
# python -m unittest discover
from
processing.clustering.clustering_config
import
ClusteringConfig
class TestClusteringConfig(unittest.TestCase):
    '''Tests for ClusteringConfig; every test works on a freshly constructed instance.'''

    def setUp(self):
        self.clustering_config = ClusteringConfig()

    def test_get_layer_configs_noneInput_noneOutput(self):
        '''Every layer config yielded by get_layer_configs has to contain the key 'layer-name'.'''
        layer_configs = self.clustering_config.get_layer_configs()
        for layer_config in layer_configs:
            self.assertIn('layer-name', layer_config)
if
__name__
==
'__main__'
:
unittest
.
main
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment