Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
S
SMART
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
3
Issues
3
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Registry
Registry
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
UNI-KLU
SMART
Commits
1d90011e
Commit
1d90011e
authored
Feb 10, 2020
by
Alexander
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
time clustering for each hour
parent
e4993f0c
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
271 additions
and
48 deletions
+271
-48
__init__.py
...munity-detection-microservice/app/db/entities/__init__.py
+1
-1
cluster.py
...mmunity-detection-microservice/app/db/entities/cluster.py
+73
-0
user_cluster.py
...ty-detection-microservice/app/db/entities/user_cluster.py
+0
-22
repository.py
...hub/community-detection-microservice/app/db/repository.py
+7
-9
run_clustering.py
...ub/community-detection-microservice/app/run_clustering.py
+38
-16
test_cluster.py
...ommunity-detection-microservice/app/tests/test_cluster.py
+152
-0
No files found.
src/data-hub/community-detection-microservice/app/db/entities/__init__.py
View file @
1d90011e
from
db.entities.location
import
Location
from
db.entities.popular_location
import
PopularLocation
from
db.entities.user_cluster
import
UserCluster
\ No newline at end of file
from
db.entities.cluster
import
LocationCluster
,
TimeCluster
\ No newline at end of file
src/data-hub/community-detection-microservice/app/db/entities/cluster.py
0 → 100644
View file @
1d90011e
import
json
from
typing
import
List
,
Dict
from
datetime
import
date
,
datetime
class Cluster:
    """Base container pairing a cluster label with the members assigned to it."""

    def __init__(self, cluster_label: int = None, clusters: List = None):
        """Store the label and the member list exactly as they are passed in.

        Both arguments default to None; no copying or validation is performed.
        """
        self.clusters = clusters
        self.cluster_label = cluster_label
class LocationCluster(Cluster):
    """Cluster of locations whose id is the string form of its cluster label."""

    def __init__(self, cluster_label: int = None, clusters: List = None,
                 location_dict: Dict = None, from_db=False):
        """Build the cluster from individual values or from a dictionary.

        When ``location_dict`` is given, its content overrides the individual
        arguments; ``from_db`` is forwarded to ``from_serializable_dict``.
        """
        super().__init__(cluster_label, clusters)
        self.id = f'{self.cluster_label}'

        if location_dict is None:
            return
        self.from_serializable_dict(location_dict, from_db)

    def to_serializable_dict(self, for_db=False) -> Dict:
        """Return the cluster as a dictionary.

        With ``for_db`` set, the member list is encoded as a JSON string;
        otherwise it is returned as is.
        """
        members = self.clusters
        if for_db:
            members = json.dumps(members)

        return {
            "id": self.id,
            "cluster_label": self.cluster_label,
            "clusters": members,
        }

    def from_serializable_dict(self, location_dict: Dict, from_db=False):
        """Overwrite id, label and members with the values in ``location_dict``.

        With ``from_db`` set, the "clusters" entry is decoded from a JSON string.
        """
        self.id = location_dict["id"]
        self.cluster_label = location_dict["cluster_label"]

        raw_members = location_dict["clusters"]
        self.clusters = json.loads(raw_members) if from_db else raw_members

    def __repr__(self):
        """Return the non-db dictionary form encoded as JSON."""
        as_dict = self.to_serializable_dict()
        return json.dumps(as_dict)

    def __str__(self):
        """Return the JSON representation wrapped in the class name."""
        inner = self.__repr__()
        return f"LocationCluster({inner})"
class TimeCluster(Cluster):
    """Cluster computed for one hour of one day.

    The id is built as ``'{date}-{hour}-{cluster_label}'`` from the constructor
    arguments, unless a dictionary is supplied, in which case the id stored in
    that dictionary is used.
    """

    def __init__(self, date: date = None, hour: int = None, cluster_label: int = None,
                 clusters: List = None, time_dict: Dict = None, from_db=False):
        """Build the cluster from individual values or from a dictionary.

        When ``time_dict`` is given, its content overrides the individual
        arguments; ``from_db`` is forwarded to ``from_serializable_dict``.
        """
        super().__init__(cluster_label, clusters)
        self.date = date
        self.hour = hour
        self.id = f'{self.date}-{self.hour}-{self.cluster_label}'

        if time_dict is not None:
            self.from_serializable_dict(time_dict, from_db)

    def to_serializable_dict(self, for_db=False) -> Dict:
        """Return the cluster as a dictionary.

        The date is always written as ``str(self.date)`` (so an unset date is
        written as the string "None"). With ``for_db`` set, the member list is
        encoded as a JSON string; otherwise it is returned as is.
        """
        return {
            "id": self.id,
            "date": str(self.date),
            "hour": self.hour,
            "cluster_label": self.cluster_label,
            "clusters": json.dumps(self.clusters) if for_db else self.clusters
        }

    def from_serializable_dict(self, time_dict: Dict, from_db=False):
        """Overwrite all fields with the values in ``time_dict``.

        The "date" entry is parsed with the format ``%Y-%m-%d``. An unset date
        (``None`` or the string "None" that ``to_serializable_dict`` writes for
        it) is restored as ``None`` instead of raising ``ValueError``. With
        ``from_db`` set, the "clusters" entry is decoded from a JSON string.
        """
        self.id = time_dict["id"]

        raw_date = time_dict["date"]
        if raw_date is None or raw_date == "None":
            # Round trip of a cluster that was created without a date.
            self.date = None
        else:
            self.date = datetime.strptime(raw_date, '%Y-%m-%d').date()

        self.hour = time_dict["hour"]
        self.cluster_label = time_dict["cluster_label"]
        self.clusters = json.loads(time_dict["clusters"]) \
            if from_db else time_dict["clusters"]

    def __repr__(self):
        """Return the non-db dictionary form encoded as JSON."""
        return json.dumps(self.to_serializable_dict())

    def __str__(self):
        """Return the JSON representation wrapped in the class name."""
        return f"TimeCluster({self.__repr__()})"
src/data-hub/community-detection-microservice/app/db/entities/user_cluster.py
deleted
100644 → 0
View file @
e4993f0c
import
json
from
typing
import
List
class UserCluster:
    """Cluster of users whose id is the string form of its cluster label."""

    def __init__(self, cluster_label: int, clusters: List):
        """Store the label and members and derive the id from the label."""
        super().__init__()
        self.cluster_label = cluster_label
        self.clusters = clusters
        self.id = f'{self.cluster_label}'

    def to_serializable_dict(self, for_db=False):
        """Return the cluster as a dictionary.

        With ``for_db`` set, the member list is encoded as a JSON string;
        otherwise it is returned as is.
        """
        members = self.clusters
        if for_db:
            members = json.dumps(members)

        return {
            "id": self.id,
            "cluster_label": self.cluster_label,
            "clusters": members,
        }

    def __repr__(self):
        """Return the non-db dictionary form encoded as JSON."""
        as_dict = self.to_serializable_dict()
        return json.dumps(as_dict)

    def __str__(self):
        """Return the JSON representation wrapped in the class name."""
        inner = self.__repr__()
        return f"UserCluster({inner})"
src/data-hub/community-detection-microservice/app/db/repository.py
View file @
1d90011e
...
...
@@ -5,7 +5,7 @@ import json
from
db.agi.agi_repository
import
AgiRepository
from
db.entities
import
Location
,
UserCluster
,
PopularLocation
from
db.entities
import
Location
,
TimeCluster
,
PopularLocation
,
LocationCluster
from
typing
import
List
...
...
@@ -32,19 +32,17 @@ class Repository(MongoRepositoryBase):
agi_locations
=
self
.
agi_repo
.
getLocations
()
return
[
Location
(
agi_loc
)
for
agi_loc
in
agi_locations
]
def
add_location_cluster
(
self
,
cluster
:
User
Cluster
):
def
add_location_cluster
(
self
,
cluster
:
Location
Cluster
):
super
()
.
insert_entry
(
self
.
_location_cluster_collection
,
cluster
.
to_serializable_dict
(
for_db
=
True
))
def
get_location_clusters
(
self
)
->
List
[
User
Cluster
]:
def
get_location_clusters
(
self
)
->
List
[
Location
Cluster
]:
clusters
=
super
()
.
get_entries
(
self
.
_location_cluster_collection
)
return
[
User
Cluster
(
c
[
'cluster_label'
],
json
.
loads
(
c
[
'clusters'
]))
for
c
in
clusters
]
return
[
Location
Cluster
(
c
[
'cluster_label'
],
json
.
loads
(
c
[
'clusters'
]))
for
c
in
clusters
]
def
add_time_cluster
(
self
,
cluster
:
User
Cluster
):
def
add_time_cluster
(
self
,
cluster
:
Time
Cluster
):
super
()
.
insert_entry
(
self
.
_time_cluster_collection
,
cluster
.
to_serializable_dict
(
for_db
=
True
))
def get_time_clusters(self) -> List[TimeCluster]:
    """Return all entries of the time-cluster collection as TimeCluster objects.

    Each entry is rebuilt through ``time_dict`` with ``from_db=True`` so that
    id, date, hour, label and the JSON-encoded member list are all restored.
    Passing the label and members positionally would bind them to the
    ``date`` and ``hour`` parameters, which come first in TimeCluster's
    constructor.
    """
    stored_entries = super().get_entries(self._time_cluster_collection)
    # NOTE(review): assumes every stored entry has the keys written by
    # TimeCluster.to_serializable_dict(for_db=True), as add_time_cluster
    # does - confirm no older entries without "date"/"hour" exist.
    return [TimeCluster(time_dict=entry, from_db=True) for entry in stored_entries]
def
add_popular_location
(
self
,
popular_location
:
PopularLocation
):
pass
src/data-hub/community-detection-microservice/app/run_clustering.py
View file @
1d90011e
...
...
@@ -4,7 +4,7 @@ modules_path = '../../../modules/'
if
os
.
path
.
exists
(
modules_path
):
sys
.
path
.
insert
(
1
,
modules_path
)
from
db.entities
import
Location
,
PopularLocation
,
User
Cluster
from
db.entities
import
Location
,
PopularLocation
,
LocationCluster
,
Time
Cluster
from
typing
import
List
,
Dict
,
Tuple
from
db.repository
import
Repository
from
processing.clusterer
import
Clusterer
...
...
@@ -20,6 +20,7 @@ repo = Repository()
# exit()
def
run_location_clustering
():
user_clusterer
=
Clusterer
()
...
...
@@ -28,36 +29,57 @@ def run_location_clustering():
cluster_result
=
user_clusterer
.
cluster_locations
(
[
l
.
to_serializable_dict
()
for
l
in
all_location_traces
])
clusters
=
[
User
Cluster
(
key
,
value
)
clusters
=
[
Location
Cluster
(
key
,
value
)
for
key
,
value
in
cluster_result
.
items
()]
store_
user_clusters
(
clusters
)
store_
clusters
(
'locations'
,
clusters
)
def
run_time_clustering
():
user_clusterer
=
Clusterer
(
epsilon
=
10
**
5.8
)
clusters
:
List
[
TimeCluster
]
=
[]
user_clusterer
=
Clusterer
(
epsilon
=
600
)
# clustered within 10 minutes
all_location_traces
=
repo
.
get_locations
()
cluster_result
=
user_clusterer
.
cluster_times
([
l
.
to_serializable_dict
()
for
l
in
all_location_traces
])
# for each date in timestamp list
dates
=
{
trace
.
timestamp
.
date
()
for
trace
in
all_location_traces
}
for
cur_date
in
dates
:
traces_for_cur_date
=
[
trace
for
trace
in
all_location_traces
if
trace
.
timestamp
.
date
()
==
cur_date
]
# for each hour of that day
for
cur_hour
in
list
(
range
(
24
)):
traces_for_time_slice
=
[
trace
for
trace
in
traces_for_cur_date
if
trace
.
timestamp
.
hour
==
cur_hour
]
if
len
(
traces_for_time_slice
)
==
0
:
continue
clusters
=
[
UserCluster
(
key
,
value
)
# clustering per hour
cluster_result
=
user_clusterer
.
cluster_times
(
[
t
.
to_serializable_dict
()
for
t
in
traces_for_time_slice
])
cur_clusters
=
[
TimeCluster
(
cur_date
,
cur_hour
,
key
,
value
)
for
key
,
value
in
cluster_result
.
items
()]
for
c
in
clusters
:
repo
.
add_time_cluster
(
c
)
clusters
.
extend
(
cur_clusters
)
store_clusters
(
'times'
,
clusters
)
# TODO make abstract for other features
def
store_user_clusters
(
user_clusters
:
List
[
UserCluster
]):
def
store_clusters
(
type
:
str
,
clusters
:
List
):
if
DEBUG
:
print
(
user_
clusters
)
print
(
clusters
)
return
for
c
in
user_clusters
:
if
type
==
'locations'
:
for
c
in
clusters
:
repo
.
add_location_cluster
(
c
)
if
type
==
'times'
:
for
c
in
clusters
:
repo
.
add_time_cluster
(
c
)
if
__name__
==
"__main__"
:
run_time_clustering
()
run_location_clustering
()
run_time_clustering
()
src/data-hub/community-detection-microservice/app/tests/test_cluster.py
0 → 100644
View file @
1d90011e
import
unittest
import
sys
sys
.
path
.
insert
(
1
,
'./'
)
# python -m unittest discover -v tests
from
db.entities.cluster
import
Cluster
from
db.entities
import
TimeCluster
,
LocationCluster
from
datetime
import
date
,
datetime
import
json
class TestCluster(unittest.TestCase):
    """Checks for the plain Cluster base class."""

    def test_init_Cluster(self):
        """The constructor keeps the label and the member list it is given."""
        cluster = Cluster(1, [1, 2, 3])

        self.assertEqual(1, cluster.cluster_label)
        self.assertEqual([1, 2, 3], cluster.clusters)
class TestLocationCluster(unittest.TestCase):
    """Checks construction and dictionary round trips of LocationCluster."""

    def setUp(self):
        """Create the reference cluster shared by the serialization tests."""
        self.c = LocationCluster(1, [1, 2, 3])

    def test_init_individualArguments(self):
        """Label and members are stored; the id is the label as a string."""
        cluster = LocationCluster(1, [1, 2, 3])

        self.assertEqual('1', cluster.id)
        self.assertEqual(1, cluster.cluster_label)
        self.assertEqual([1, 2, 3], cluster.clusters)

    def test_init_dictArgument(self):
        """A dictionary passed to the constructor supplies all fields."""
        source = {'id': '123', 'cluster_label': 1, 'clusters': [1, 2, 3]}
        cluster = LocationCluster(location_dict=source)

        self.assertEqual('123', cluster.id)
        self.assertEqual(1, cluster.cluster_label)
        self.assertEqual([1, 2, 3], cluster.clusters)

    def test_init_dictArgument_fromDb(self):
        """With from_db the members are decoded from their JSON string."""
        source = {'id': '123', 'cluster_label': 1, 'clusters': '[1, 2, 3]'}
        cluster = LocationCluster(location_dict=source, from_db=True)

        self.assertEqual('123', cluster.id)
        self.assertEqual(1, cluster.cluster_label)
        self.assertEqual([1, 2, 3], cluster.clusters)

    def test_to_serializable_dict_noDb(self):
        """The plain dictionary mirrors the attributes one to one."""
        serialized = self.c.to_serializable_dict()

        self.assertEqual(self.c.id, serialized['id'])
        self.assertEqual(self.c.cluster_label, serialized['cluster_label'])
        self.assertEqual(self.c.clusters, serialized['clusters'])

    def test_from_serializable_dict_noDb(self):
        """Loading the plain dictionary reproduces the reference cluster."""
        restored = LocationCluster()
        restored.from_serializable_dict(self.c.to_serializable_dict())

        self.assertEqual(self.c.id, restored.id)
        self.assertEqual(str(self.c), str(restored))

    def test_to_serializable_dict_db_jsonClusters(self):
        """The db dictionary carries the members as a JSON string."""
        serialized = self.c.to_serializable_dict(for_db=True)

        self.assertEqual(self.c.id, serialized['id'])
        self.assertEqual(self.c.cluster_label, serialized['cluster_label'])
        self.assertEqual(self.c.clusters, json.loads(serialized['clusters']))

    def test_from_serializable_dict_fromDb(self):
        """Loading the db dictionary reproduces the reference cluster."""
        restored = LocationCluster()
        db_form = self.c.to_serializable_dict(for_db=True)
        restored.from_serializable_dict(db_form, from_db=True)

        self.assertEqual(self.c.id, restored.id)
        self.assertEqual(str(self.c), str(restored))
class TestTimeCluster(unittest.TestCase):
    """Checks construction and dictionary round trips of TimeCluster."""

    def setUp(self):
        """Create the reference date and cluster shared by the tests."""
        self.date_ = date(2020, 1, 1)
        self.c = TimeCluster(self.date_, 14, 1, [1, 2, 3])

    def test_init_individualArguments(self):
        """All values are stored; the id joins date, hour and label."""
        cluster = TimeCluster(self.date_, 14, 1, [1, 2, 3])

        self.assertEqual(f'{self.date_}-14-1', cluster.id)
        self.assertEqual(self.date_, cluster.date)
        self.assertEqual(14, cluster.hour)
        self.assertEqual(1, cluster.cluster_label)
        self.assertEqual([1, 2, 3], cluster.clusters)

    def test_init_dictArgument(self):
        """A dictionary passed to the constructor supplies all fields."""
        source = {
            'id': '123',
            'cluster_label': 1,
            'clusters': [1, 2, 3],
            'date': str(self.date_),
            'hour': 14,
        }
        cluster = TimeCluster(time_dict=source)

        self.assertEqual('123', cluster.id)
        self.assertEqual(self.date_, cluster.date)
        self.assertEqual(14, cluster.hour)
        self.assertEqual(1, cluster.cluster_label)
        self.assertEqual([1, 2, 3], cluster.clusters)

    def test_init_dictArgument_fromDb(self):
        """With from_db the members are decoded from their JSON string."""
        source = {
            'id': '123',
            'cluster_label': 1,
            'clusters': '[1, 2, 3]',
            'date': str(self.date_),
            'hour': 14,
        }
        cluster = TimeCluster(time_dict=source, from_db=True)

        self.assertEqual('123', cluster.id)
        self.assertEqual(self.date_, cluster.date)
        self.assertEqual(14, cluster.hour)
        self.assertEqual(1, cluster.cluster_label)
        self.assertEqual([1, 2, 3], cluster.clusters)

    def test_to_serializable_dict_noDb(self):
        """The plain dictionary mirrors the attributes; the date is a string."""
        serialized = self.c.to_serializable_dict()
        parsed_date = datetime.strptime(serialized['date'], '%Y-%m-%d').date()

        self.assertEqual(self.c.id, serialized['id'])
        self.assertEqual(self.c.cluster_label, serialized['cluster_label'])
        self.assertEqual(self.c.clusters, serialized['clusters'])
        self.assertEqual(self.c.date, parsed_date)
        self.assertEqual(self.c.hour, serialized['hour'])

    def test_from_serializable_dict_noDb(self):
        """Loading the plain dictionary reproduces the reference cluster."""
        restored = TimeCluster()
        restored.from_serializable_dict(self.c.to_serializable_dict())

        self.assertEqual(self.c.id, restored.id)
        self.assertEqual(str(self.c), str(restored))

    def test_to_serializable_dict_fromDb_jsonClusters(self):
        """The db dictionary carries the members as a JSON string."""
        serialized = self.c.to_serializable_dict(for_db=True)
        parsed_date = datetime.strptime(serialized['date'], '%Y-%m-%d').date()

        self.assertEqual(self.c.id, serialized['id'])
        self.assertEqual(self.c.cluster_label, serialized['cluster_label'])
        self.assertEqual(self.c.clusters, json.loads(serialized['clusters']))
        self.assertEqual(self.c.date, parsed_date)
        self.assertEqual(self.c.hour, serialized['hour'])

    def test_from_serializable_dict_fromDb(self):
        """Loading the db dictionary reproduces the reference cluster."""
        restored = TimeCluster()
        db_form = self.c.to_serializable_dict(for_db=True)
        restored.from_serializable_dict(db_form, from_db=True)

        self.assertEqual(self.c.id, restored.id)
        self.assertEqual(str(self.c), str(restored))
if
__name__
==
'__main__'
:
unittest
.
main
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment