Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
S
SMART
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
3
Issues
3
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Registry
Registry
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
UNI-KLU
SMART
Commits
9bca8889
Commit
9bca8889
authored
Feb 07, 2020
by
Alexander
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
location clustering independent of time
parent
ad1ef89c
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
17 additions
and
118 deletions
+17
-118
user_cluster.py
...ty-detection-microservice/app/db/entities/user_cluster.py
+5
-6
clusterer.py
...munity-detection-microservice/app/processing/clusterer.py
+1
-0
run_clustering.py
...ub/community-detection-microservice/app/run_clustering.py
+11
-112
No files found.
src/data-hub/community-detection-microservice/app/db/entities/user_cluster.py
View file @
9bca8889
import
json
import
json
from
typing
import
List
class
UserCluster
:
class
UserCluster
:
def
__init__
(
self
,
date
,
hour
,
clusters
):
def
__init__
(
self
,
cluster_label
:
int
,
clusters
:
List
):
super
()
.
__init__
()
super
()
.
__init__
()
self
.
date
=
date
self
.
cluster_label
=
cluster_label
self
.
hour
=
hour
self
.
clusters
=
clusters
self
.
clusters
=
clusters
self
.
id
=
f
'{self.
date}-{self.hour
}'
self
.
id
=
f
'{self.
cluster_label
}'
def
to_serializable_dict
(
self
,
for_db
=
False
):
def
to_serializable_dict
(
self
,
for_db
=
False
):
return
{
return
{
"id"
:
self
.
id
,
"id"
:
self
.
id
,
"date"
:
str
(
self
.
date
),
"cluster_label"
:
self
.
cluster_label
,
"hour"
:
self
.
hour
,
"clusters"
:
json
.
dumps
(
self
.
clusters
)
if
for_db
else
self
.
clusters
"clusters"
:
json
.
dumps
(
self
.
clusters
)
if
for_db
else
self
.
clusters
}
}
...
...
src/data-hub/community-detection-microservice/app/processing/clusterer.py
View file @
9bca8889
...
@@ -66,6 +66,7 @@ class Clusterer:
...
@@ -66,6 +66,7 @@ class Clusterer:
locations
[
i
][
'cluster_label'
]
=
labels
[
i
]
locations
[
i
][
'cluster_label'
]
=
labels
[
i
]
def
run
(
self
,
locations
:
List
[
Dict
])
->
Dict
[
int
,
List
[
Dict
]]:
def
run
(
self
,
locations
:
List
[
Dict
])
->
Dict
[
int
,
List
[
Dict
]]:
'''Returns a dictionary with identified clusters and their locations copied from the input'''
if
locations
is
None
or
len
(
locations
)
==
0
:
if
locations
is
None
or
len
(
locations
)
==
0
:
# raise Exception("locations has to contain something")
# raise Exception("locations has to contain something")
return
{}
return
{}
...
...
src/data-hub/community-detection-microservice/app/run_clustering.py
View file @
9bca8889
...
@@ -4,122 +4,30 @@ modules_path = '../../../modules/'
...
@@ -4,122 +4,30 @@ modules_path = '../../../modules/'
if
os
.
path
.
exists
(
modules_path
):
if
os
.
path
.
exists
(
modules_path
):
sys
.
path
.
insert
(
1
,
modules_path
)
sys
.
path
.
insert
(
1
,
modules_path
)
from
processing.clusterer
import
Clusterer
from
db.repository
import
Repository
from
datetime
import
datetime
,
timedelta
from
typing
import
List
,
Dict
,
Tuple
from
db.entities
import
Location
,
PopularLocation
,
UserCluster
from
db.entities
import
Location
,
PopularLocation
,
UserCluster
import
statistics
from
typing
import
List
,
Dict
,
Tuple
from
collections
import
Counter
from
db.repository
import
Repository
import
json
from
processing.clusterer
import
Clusterer
DEBUG
=
False
DEBUG
=
False
NR_DECIMAL_FOR_BEST_LOCATIONS
=
4
# used to cluster locations of a single user to detect main location per time slice
main_loc_clusterer
=
Clusterer
()
# used to cluster the users based on their main location
# used to cluster the users based on their main location
user_clusterer
=
Clusterer
()
user_clusterer
=
Clusterer
()
time_slices
=
list
(
range
(
24
))
repo
=
Repository
(
agi_data
=
True
)
repo
=
Repository
()
def
run_location_clustering
():
def
run_location_clustering
():
user_clusters
:
List
[
UserCluster
]
=
[]
popular_locations
:
List
[
PopularLocation
]
=
[]
all_location_traces
=
repo
.
get_locations
()
all_location_traces
=
repo
.
get_locations
()
# for each date in timestamp list
cluster_result
=
user_clusterer
.
run
(
dates
=
{
trace
.
timestamp
.
date
()
for
trace
in
all_location_traces
}
[
l
.
to_serializable_dict
()
for
l
in
all_location_traces
])
for
cur_date
in
dates
:
traces_for_cur_date
=
[
trace
for
trace
in
all_location_traces
if
trace
.
timestamp
.
date
()
==
cur_date
]
location_counter
:
Dict
[
str
,
int
]
=
{}
# for each hour of that day
for
cur_hour
in
time_slices
:
traces_for_time_slice
=
[
trace
for
trace
in
traces_for_cur_date
if
trace
.
timestamp
.
hour
-
cur_hour
==
0
]
if
len
(
traces_for_time_slice
)
==
0
:
continue
main_locations
=
[]
# store the main location for each user
users
=
{
trace
.
user
for
trace
in
traces_for_time_slice
}
for
user
in
users
:
main_loc
=
get_main_location_for_user
(
traces_for_time_slice
,
user
)
main_loc
[
'user'
]
=
user
main_locations
.
append
(
main_loc
)
# cluster the main locations for all users
cluster_result
=
user_clusterer
.
run
(
main_locations
)
clusters
=
{}
for
key
,
vals
in
cluster_result
.
items
():
clusters
[
key
]
=
[
v
[
'user'
]
for
v
in
vals
]
# print(f"{cur_date} @ {cur_hour}h-{cur_hour+1}h (Group #{key}): {[v['user'] for v in vals]}")
# add the clusters for the cur_hour to the global cluster list
clusters
=
[
UserCluster
(
key
,
value
)
user_clusters
.
append
(
UserCluster
(
cur_date
,
cur_hour
,
clusters
))
for
key
,
value
in
cluster_result
.
items
()]
# add locations for cur_hour to location counter
store_user_clusters
(
clusters
)
for
main_l
in
main_locations
:
key
=
json
.
dumps
({
'lat'
:
round
(
main_l
[
'latitude'
],
NR_DECIMAL_FOR_BEST_LOCATIONS
),
'long'
:
round
(
main_l
[
'longitude'
],
NR_DECIMAL_FOR_BEST_LOCATIONS
)})
if
key
not
in
location_counter
:
location_counter
[
key
]
=
0
location_counter
[
key
]
+=
1
# print(f"{cur_date} @ {cur_hour}h-{cur_hour+1}h: {main_locations}")
# add the top three locations to the global popular location list
top_locations
=
get_top_three_locations
(
location_counter
)
top_locations
=
[
json
.
loads
(
l
[
0
])
for
l
in
top_locations
]
popular_locations
.
append
(
PopularLocation
(
cur_date
,
top_locations
))
store_user_clusters
(
user_clusters
)
store_popular_locations
(
popular_locations
)
def
get_main_location_for_user
(
location_traces
:
List
[
Location
],
user
:
str
)
->
dict
:
# cluster based on locations
locations_for_user
=
[
t
for
t
in
location_traces
if
t
.
user
==
user
]
clusters
=
main_loc_clusterer
.
run
([
l
.
__dict__
for
l
in
locations_for_user
])
# largest cluster has most locations
max_c
=
{
'id'
:
-
1
,
'size'
:
0
}
for
cluster_key
,
cluster_vals
in
clusters
.
items
():
if
len
(
cluster_vals
)
>
max_c
[
'size'
]:
max_c
[
'id'
]
=
cluster_key
max_c
[
'size'
]
=
len
(
cluster_vals
)
# calculate center of the location from the largest cluster
locations_of_largest_cluster
=
clusters
[
max_c
[
'id'
]]
center
=
get_center_of_2d_points
(
locations_of_largest_cluster
)
return
center
def
get_center_of_2d_points
(
points
,
nr_decimal_places
=
5
)
->
dict
:
center
=
{}
center
[
'latitude'
]
=
round
(
statistics
.
mean
(
[
p
[
'latitude'
]
for
p
in
points
]),
nr_decimal_places
)
center
[
'longitude'
]
=
round
(
statistics
.
mean
(
[
p
[
'longitude'
]
for
p
in
points
]),
nr_decimal_places
)
return
center
def
get_top_three_locations
(
location_counts
:
Dict
[
str
,
int
])
->
List
[
Tuple
[
str
,
int
]]:
cnter
=
Counter
(
location_counts
)
max_three
=
cnter
.
most_common
(
3
)
return
max_three
def
store_user_clusters
(
user_clusters
:
List
[
UserCluster
]):
def
store_user_clusters
(
user_clusters
:
List
[
UserCluster
]):
...
@@ -131,14 +39,5 @@ def store_user_clusters(user_clusters: List[UserCluster]):
...
@@ -131,14 +39,5 @@ def store_user_clusters(user_clusters: List[UserCluster]):
repo
.
add_user_cluster
(
c
)
repo
.
add_user_cluster
(
c
)
def
store_popular_locations
(
popular_locations
:
List
[
PopularLocation
]):
if
DEBUG
:
print
(
popular_locations
)
return
for
l
in
popular_locations
:
repo
.
add_popular_location
(
l
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
run_location_clustering
()
run_location_clustering
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment