UNI-KLU / SMART · Commits

Commit 495390ac, authored Jul 27, 2021 by Alexander Lercher
Cross context prediction
Parent: a878064e

Showing 12 changed files with 399 additions and 196 deletions (+399 −196)
...ive-community-detection-microservice/app/db/repository.py                    +3   −0
...-community-detection-microservice/app/entities/cluster.py                    +1   −2
...ve-community-detection-microservice/app/entities/layer.py                    +4   −0
...active-community-detection-microservice/app/predict.ipynb                  +218 −146
...oservice/app/processing/data_prep/cluster_metrics_calc.py                    +2   −2
...croservice/app/processing/data_prep/layer_metrics_calc.py                    +1   −12
...ion-microservice/app/processing/data_prep/metrics_base.py                   +19   −1
...-detection-microservice/app/processing/ml/predict_base.py                   +32   −0
...n-microservice/app/processing/ml/predict_cross_context.py                   +98   −0
...-microservice/app/processing/ml/predict_single_context.py                    +7  −27
...ion-microservice/app/processing/ml/train_cross_context.py                    +6   −4
...ve-community-detection-microservice/app/run_prediction.py                    +8   −2
src/data-hub/proactive-community-detection-microservice/app/db/repository.py  (view file @ 495390ac)
...
@@ -131,4 +131,7 @@ class Repository(MongoRepositoryBase):
    def get_prediction_results(self, use_case: str) -> List[PredictionResult]:
        entries = super().get_entries(self._prediction_result_collection, selection={'use_case': use_case}, projection={'_id': 0})
        return [PredictionResult.create_from_dict(e) for e in entries]

+   def delete_all_prediction_results(self):
+       super().drop_collection(self._prediction_result_collection)
+
    #endregion
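A minimal usage sketch for the two repository methods, assuming a reachable MongoDB behind MongoRepositoryBase (names as in the commit):

    from db.repository import Repository

    repo = Repository()
    # Fetch all stored predictions for one use case; '_id' is projected away,
    # so the DAOs can be rebuilt straight from the documents.
    results = repo.get_prediction_results('community-prediction-youtube-n')
    print(len(results))

    # Drops the whole collection -- run_prediction.py calls this before re-predicting.
    repo.delete_all_prediction_results()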
src/data-hub/proactive-community-detection-microservice/app/entities/cluster.py  (view file @ 495390ac)
...
@@ -33,8 +33,7 @@ class Cluster:
    def get_time_info(self) -> int:
        '''Returns the week of the time tuple str, eg. 25 for "(2014, 25)".'''
-       str_tuple = self.time_window_id
-       return int(str_tuple.split(',')[1].strip()[:-1])
+       return eval(self.time_window_id)[1]

    def __repr__(self):
        return str(self.__dict__)
...
...
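Both the old string parsing and the new eval-based lookup return the same week number; eval is just shorter. If the time_window_id strings ever come from an untrusted source, ast.literal_eval is a drop-in that only accepts Python literals — a sketch, not part of this commit:

    import ast

    def get_time_info(self) -> int:
        '''Returns the week of the time tuple str, eg. 25 for "(2014, 25)".'''
        return ast.literal_eval(self.time_window_id)[1]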
src/data-hub/proactive-community-detection-microservice/app/entities/layer.py  (view file @ 495390ac)
...
@@ -53,6 +53,10 @@ class Layer:
        self.distances_from_global_centers = self.get_distances_from_global_center(active_clusters)
        self.cluster_center_distance_agg_metrics = self.get_center_distance_min_max_avg_sum(active_clusters)

+   def get_time_info(self) -> int:
+       '''Returns the week of the time tuple str, eg. 25 for "(2014, 25)".'''
+       return eval(self.time_window_id)[1]
+
    def get_size_min_max_avg_sum(self, clusters: List[InternalCluster]) -> dict:
        '''Returns min, max, avg, and sum of the cluster's absolute sizes.'''
        if len(clusters) == 0:
...
...
src/data-hub/proactive-community-detection-microservice/app/predict.ipynb  (view file @ 495390ac)
...
...
@@ -2,23 +2,24 @@
The first cell now also names the reference layer, and the typing imports are extended:

    use_case = 'community-prediction-youtube-n'
    layer_name = 'LikesLayer'
    reference_layer_name = 'ViewsLayer'

    import json
    from entities import Cluster
    import collections
    import numpy as np
    from typing import Iterable, Tuple, List, Dict, Any
...
...
@@ -27,150 +28,164 @@
N drops from 3 to 2, and the per-cluster loading (with its data[-1] scratch cell) is replaced by a combined cluster/reference-layer load keyed by time window:

    N=2

    from entities import Layer, Cluster

    with open(f'data/{use_case}/cluster_metrics/{layer_name}.json') as file:
        cluster_metrics: List[Cluster] = [Cluster.create_from_dict(e) for e in json.loads(file.read())]
        cluster_ids = {c.cluster_id for c in cluster_metrics}
        cluster_metrics: Dict[Any, Cluster] = {(c.time_window_id, c.cluster_id): c for c in cluster_metrics}

    with open(f'data/{use_case}/layer_metrics/{reference_layer_name}.json') as file:
        layer_metrics: List[Layer] = [Layer.create_from_dict(e) for e in json.loads(file.read())]
        layer_metrics: Dict[Any, Layer] = {l.time_window_id: l for l in layer_metrics}

The old cluster_map grouping (and its sanity checks on cluster_map.keys()) is gone; instead the time keys are ordered chronologically and only the last N are kept:

    # load the time keys chronologically
    ordered_time_keys = list(layer_metrics.keys())
    ordered_time_keys.sort(key=lambda x: eval(x))

    ordered_time_keys = ordered_time_keys[-N:]
    ordered_time_keys
    # -> ['(2018, 23)', '(2018, 24)']

Feature extraction now works on whole reference-layer snapshots rather than on single clusters:

    import numpy as np

    def get_cyclic_time_feature(time: int, max_time_value: int = 52) -> Tuple[float, float]:
        return (np.sin(2*np.pi*time/max_time_value),
                np.cos(2*np.pi*time/max_time_value))

    def get_cyclic_time_feature_from_time_window(time: str) -> Tuple[float, float]:
        return get_cyclic_time_feature(int(time.replace('(', '').replace(')', '').split(',')[1]))

    def get_layer_metrics(layer: Layer) -> Iterable:
        res = [layer.n_nodes, layer.n_clusters, layer.entropy]
        res += [layer.cluster_size_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]
        res += [layer.cluster_relative_size_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]
        res += [layer.cluster_center_distance_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]
        res.append(get_cyclic_time_feature_from_time_window(layer.time_window_id))
        return res

The raw prediction inputs pair the two reference-layer snapshots with every cluster id:

    prediction_metrics_raw = []

    current_layer_metric = layer_metrics[ordered_time_keys[1]]
    prev_layer_metric = layer_metrics[ordered_time_keys[0]]

    current_layer_metric_tuple = get_layer_metrics(current_layer_metric)
    prev_layer_metric_tuple = get_layer_metrics(prev_layer_metric)

    for cluster_id in cluster_ids:
        # yield each combination of reference layer metrics to clusters
        prediction_metrics_raw.append([prev_layer_metric_tuple, current_layer_metric_tuple, int(cluster_id)])

Model loading switches from the single-context artifacts to the cross-context model and its scaler:

    method = 'cross_context'

    import pickle

    with open(f'data/{use_case}/ml_output/{method}/{layer_name}_{reference_layer_name}.model', 'rb') as file:
        svc = pickle.load(file)

    with open(f'data/{use_case}/ml_output/{method}/{layer_name}_{reference_layer_name}_scaler.model', 'rb') as file:
        scaler = pickle.load(file)

A follow-up cell re-imports numpy and redefines get_cyclic_time_feature (a leftover duplicate of the definition above), and flatten_metrics_datapoint becomes flatten_layer_metrics_datapoint:

    def flatten_layer_metrics_datapoint(datapoint: list) -> Tuple['X', np.array]:
        '''
        Flattens a single layer metrics data point in the form:
        [(n_nodes, n_clusters, entropy,
          (relative_cluster_size)^M, (distance_from_global_centers)^M,
          (time1, time2))^N,
         cluster_number, evolution_label]
        to:
        (X, y: np.array)
        '''
        flat_list = []
        for layer_metric_tuple in datapoint[:-1]:  # for all x
            flat_list.extend(layer_metric_tuple[0:-1])  # everything before time
            flat_list.extend(layer_metric_tuple[-1])  # time1/2

        flat_list.append(datapoint[-1])  # cluster num

        # flat_list.append(datapoint[-1])  # y
        return np.asarray(flat_list)
...
...
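The cyclic encoding exists because week numbers wrap: weeks 52 and 1 are neighbors in time but 51 apart as integers. A quick check of the mapping onto the unit circle (values rounded):

    import numpy as np

    def get_cyclic_time_feature(time: int, max_time_value: int = 52):
        return (np.sin(2*np.pi*time/max_time_value),
                np.cos(2*np.pi*time/max_time_value))

    print(get_cyclic_time_feature(52))  # (-0.0,  1.0)   -- same point as week 0
    print(get_cyclic_time_feature(1))   # ( 0.12, 0.99)  -- right next to week 52
    print(get_cyclic_time_feature(26))  # ( 0.0, -1.0)   -- opposite side, mid-year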
@@ -178,7 +193,7 @@
increase_time_window is unchanged here apart from its execution count:

    def increase_time_window(time_window_id: str):
        tuple_ = eval(time_window_id)
...
...
@@ -195,168 +210,225 @@
A scratch cell (now with an emptied source) still shows one flattened datapoint on stdout: cluster id 895, then its 35-value feature vector:

    # 895
    # [ 1.01800000e+04  6.94600000e+03  1.25669044e+01  1.00000000e+00
    #   1.20000000e+01  1.46559171e+00  1.01800000e+04  9.82318271e-05
    #   1.17878193e-03  1.43967751e-04  1.00000000e+00  0.00000000e+00
    #   2.37254283e+06  1.14923227e+03  7.98256735e+06  3.54604887e-01
    #  -9.35016243e-01  4.35300000e+03  3.25600000e+03  1.15021768e+01
    #   1.00000000e+00  1.00000000e+01  1.33691646e+00  4.35300000e+03
    #   2.29726625e-04  2.29726625e-03  3.07125307e-04  1.00000000e+00
    #   0.00000000e+00  2.36405615e+05  3.69147185e+02  1.20194323e+06
    #   2.39315664e-01 -9.70941817e-01  8.95000000e+02]

The prediction loop no longer walks cluster_map; it flattens the raw metrics, and all clusters share a single future time window:

    from db.dao import PredictionResult

    prediction_cluster_ids = []
    prediction_time_window = increase_time_window(ordered_time_keys[1])
    prediction_metrics = []

    for pred in prediction_metrics_raw:
        cluster_id = pred[-1]
        prediction_cluster_ids.append(cluster_id)

        flat_ = flatten_layer_metrics_datapoint(pred)
        prediction_metrics.append(flat_)

All rows are scaled and classified in one call:

    prediction_results = svc.predict(scaler.transform(np.array(prediction_metrics)))

    prediction_metrics[15]
    # -> array([ 1.01800000e+04,  6.94600000e+03,  1.25669044e+01,  1.00000000e+00,
    #            1.20000000e+01,  1.46559171e+00,  1.01800000e+04,  9.82318271e-05,
    #            1.17878193e-03,  1.43967751e-04,  1.00000000e+00,  0.00000000e+00,
    #            2.37254283e+06,  1.14923227e+03,  7.98256735e+06,  3.54604887e-01,
    #           -9.35016243e-01,  4.35300000e+03,  3.25600000e+03,  1.15021768e+01,
    #            1.00000000e+00,  1.00000000e+01,  1.33691646e+00,  4.35300000e+03,
    #            2.29726625e-04,  2.29726625e-03,  3.07125307e-04,  1.00000000e+00,
    #            0.00000000e+00,  2.36405615e+05,  3.69147185e+02,  1.20194323e+06,
    #            2.39315664e-01, -9.70941817e-01,  4.36000000e+03])

A stale scratch cell (execution count 29, older than its neighbors) still reshapes to the 27-column single-context width:

    dataa = np.array(prediction_metrics)
    svc.predict(dataa[3].reshape(1, 27))
    # -> array([3.])

The PredictionResult objects are then assembled, replacing the earlier single-context scratch output (five classes with counts 2740, 596, 1429, 1324, 14119):

    predictions = []
    for i in range(len(prediction_cluster_ids)):
        predictions.append(
            PredictionResult(use_case, use_case, method, layer_name, None, prediction_cluster_ids[i], prediction_time_window, prediction_results[i])
        )

    list(zip(np.unique(prediction_results, return_counts=True)))
    # -> [(array([0., 1., 2., 3.]),),
    #     (array([ 5335,  1511,   355, 13007], dtype=int64),)]

    prediction_results
    # -> array([3., 0., 0., ..., 0., 3., 3.])

Three scratch cells confirm that the week-extraction variants agree:

    time = '(2019, 45)'
    int(time.replace('(', '').replace(')', '').split(',')[1])
    # -> 45

    eval(time)[1]
    # -> 45

    int(time.split(',')[1].strip()[:-1])
    # -> 45

@@ -367,88 +439,88 @@
The sample output of [r.__dict__ for r in predictions[:10]] now carries method 'cross_context' and integer cluster ids (previously 'single_context' with string ids '0'..'9'):

    [{'use_case': 'community-prediction-youtube-n',
      'table': 'community-prediction-youtube-n',
      'method': 'cross_context',
      'layer': 'LikesLayer',
      'reference_layer': None,
      'cluster_id': 895,
      'time_window': '(2018, 25)',
      'prediction': 3.0},
     ...]
    # Remaining nine entries differ only in cluster_id and prediction:
    # 8947 -> 0.0, 10464 -> 0.0, 14671 -> 3.0, 18000 -> 3.0, 17895 -> 2.0,
    # 1234 -> 3.0, 16236 -> 3.0, 1995 -> 3.0, 5161 -> 0.0
...
...
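Feature-vector arithmetic behind those outputs: get_layer_metrics yields 15 scalars plus a (sin, cos) time pair per window, so two windows flatten to 34 values, and the appended cluster id makes 35 — exactly the length of the arrays printed above. The 27 in the stale reshape cell is the single-context layout instead. A quick check:

    per_window = 15 + 2                       # layer scalars + cyclic time pair
    cross_context_width = 2 * per_window + 1  # two windows + cluster id
    assert cross_context_width == 35

    single_context_width = 3 * (7 + 2)        # 3 windows x (cluster metrics + time pair)
    assert single_context_width == 27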
src/data-hub/proactive-community-detection-microservice/app/processing/data_prep/cluster_metrics_calc.py  (view file @ 495390ac)

-from processing.data_prep.metrics_base import calculate_center, get_cyclic_time_feature, get_evolution_label, convert_metrics_data_to_dataframe
+from processing.data_prep.metrics_base import calculate_center, get_cyclic_time_feature, get_evolution_label, convert_metrics_data_to_dataframe, get_cluster_metrics
 from pathlib import Path

 #############################
...
@@ -86,7 +86,7 @@ def create_metrics_training_data(use_case: str, layer_name: str, N: int = 3) ->
                tuples = []
                continue

-           cur_metrics = (cur_cluster.size, cur_cluster.std_dev, cur_cluster.scarcity, cur_cluster.importance1, cur_cluster.importance2, cur_cluster.range_, cur_cluster.global_center_distance, get_cyclic_time_feature(cur_cluster.get_time_info()))
+           cur_metrics = get_cluster_metrics(cur_cluster)

            # deque function: adding N+1st element will remove oldest one
            if len(tuples) == N:
...
...
src/data-hub/proactive-community-detection-microservice/app/processing/data_prep/layer_metrics_calc.py  (view file @ 495390ac)

-from processing.data_prep.metrics_base import calculate_center, get_cyclic_time_feature, get_evolution_label, convert_metrics_data_to_dataframe
+from processing.data_prep.metrics_base import calculate_center, get_cyclic_time_feature, get_evolution_label, convert_metrics_data_to_dataframe, get_layer_metrics
 from pathlib import Path

 #################
...
@@ -59,21 +59,10 @@ def get_columns(N) -> List[str]:
    cols = cols * N
    return cols + ['cluster_id'] + ['evolution_label']

 ######################
-def get_cyclic_time_feature_from_time_window(time: str) -> Tuple[float, float]:
-    return get_cyclic_time_feature(int(time.replace('(', '').replace(')', '').split(',')[1]))
-#######################
 from typing import Iterable, List, Dict, Any
 import json
 from entities import Layer, Cluster

-def get_layer_metrics(layer: Layer) -> Iterable:
-    res = [layer.n_nodes, layer.n_clusters, layer.entropy]
-    res += [layer.cluster_size_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]
-    res += [layer.cluster_relative_size_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]
-    res += [layer.cluster_center_distance_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]
-    res.append(get_cyclic_time_feature_from_time_window(layer.time_window_id))
-    return res

 def create_layer_metrics_training_data(use_case: str, layer_name: str, reference_layer: str, N: int = 2) -> Iterable:
    """
    Loads the metrics training data for an individual layer from disk.
...
...
src/data-hub/proactive-community-detection-microservice/app/processing/data_prep/metrics_base.py  (view file @ 495390ac)
...
@@ -46,3 +46,21 @@ def convert_metrics_data_to_dataframe(data: Iterable, columns: list, flattening_
        training_data.append(xy)

    return pd.DataFrame(data=training_data, columns=columns)

+####################
+from entities import Cluster, Layer
+from typing import Dict, Tuple
+
+def get_cluster_metrics(cur_cluster: Cluster) -> Tuple:
+    return (cur_cluster.size, cur_cluster.std_dev, cur_cluster.scarcity,
+            cur_cluster.importance1, cur_cluster.importance2,
+            cur_cluster.range_, cur_cluster.global_center_distance,
+            get_cyclic_time_feature(cur_cluster.get_time_info()))
+
+####################
+def get_layer_metrics(layer: Layer) -> Iterable:
+    res = [layer.n_nodes, layer.n_clusters, layer.entropy]
+    res += [layer.cluster_size_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]
+    res += [layer.cluster_relative_size_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]
+    res += [layer.cluster_center_distance_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]
+    res.append(get_cyclic_time_feature(layer.get_time_info()))
+    return res
+###################
\ No newline at end of file
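For reference, the shapes these shared helpers produce (a sketch; some_cluster and some_layer are placeholder instances, in practice rebuilt from the JSON dumps):

    # get_cluster_metrics: 7 scalars + one (sin, cos) pair -> 9 values once flattened
    cluster_tuple = get_cluster_metrics(some_cluster)
    assert len(cluster_tuple) == 8 and len(cluster_tuple[-1]) == 2

    # get_layer_metrics: 3 + 4 + 4 + 4 = 15 scalars + one (sin, cos) pair -> 17 values
    layer_list = get_layer_metrics(some_layer)
    assert len(layer_list) == 16 and len(layer_list[-1]) == 2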
src/data-hub/proactive-community-detection-microservice/app/processing/ml/predict_base.py  (new file, 0 → 100644; view file @ 495390ac)

def increase_time_window(time_window_id: str) -> str:
    tuple_ = eval(time_window_id)

    if tuple_[1] == 52:
        # 1st week next year (str() added here for consistency with the annotated
        # return type; the committed code returns the bare tuple in this branch)
        return str((tuple_[0]+1, 1))
    else:
        # next week
        return str((tuple_[0], tuple_[1]+1))

######################
from typing import Tuple
import pickle

def load_ml_models(use_case, method, layer_name, reference_layer_name=None) -> Tuple['scaler', 'clf']:
    path_ = f'data/{use_case}/ml_output/{method}/{layer_name}'

    if method == 'single_context':
        with open(f'{path_}.model', 'rb') as file:
            svc = pickle.load(file)
        with open(f'{path_}_scaler.model', 'rb') as file:
            scaler = pickle.load(file)

    elif method == 'cross_context':
        with open(f'{path_}_{reference_layer_name}.model', 'rb') as file:
            svc = pickle.load(file)
        with open(f'{path_}_{reference_layer_name}_scaler.model', 'rb') as file:
            scaler = pickle.load(file)

    else:
        raise NotImplementedError('Prediction method is not implemented')

    return scaler, svc
\ No newline at end of file
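A small usage sketch of both helpers (model files assumed to exist as written by the training scripts):

    from processing.ml.predict_base import increase_time_window, load_ml_models

    print(increase_time_window('(2018, 24)'))  # '(2018, 25)'
    print(increase_time_window('(2018, 52)'))  # '(2019, 1)' -- year rollover

    # single-context artifacts: .../single_context/{layer}.model (+ _scaler)
    scaler, svc = load_ml_models('community-prediction-youtube-n', 'single_context', 'LikesLayer')

    # cross-context filenames additionally carry the reference layer name
    scaler, svc = load_ml_models('community-prediction-youtube-n', 'cross_context', 'LikesLayer', 'ViewsLayer')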
src/data-hub/proactive-community-detection-microservice/app/processing/ml/predict_cross_context.py  (new file, 0 → 100644; view file @ 495390ac)

from processing.data_prep.metrics_base import get_cyclic_time_feature, get_layer_metrics
from processing.ml.predict_base import increase_time_window, load_ml_models

method = 'cross_context'
N = 2  # Currently N is fixed to 2

####################
import pandas as pd
from pandas import DataFrame
#####################
import json
from entities import Layer, Cluster
import collections
import numpy as np
from typing import Iterable, Tuple, List, Dict, Any
####################
import pickle
#####################

def flatten_layer_metrics_datapoint(datapoint: list) -> np.array:
    '''
    Flattens a single layer metrics data point in the form:
    [(n_nodes, n_clusters, entropy,
      (relative_cluster_size)^M, (distance_from_global_centers)^M,
      (time1, time2))^N,
     cluster_number]
    to:
    (X)
    '''
    flat_list = []
    for layer_metric_tuple in datapoint[:-1]:  # for all x
        flat_list.extend(layer_metric_tuple[0:-1])  # everything before time
        flat_list.extend(layer_metric_tuple[-1])  # time1/2

    flat_list.append(datapoint[-1])  # cluster num

    return np.asarray(flat_list)

#########################
from db.repository import Repository
from db.dao import PredictionResult

repo = Repository()

def run_prediction(use_case: str):
    for layerpair in repo.get_layer_pairs(use_case):
        layer_name = layerpair.layer
        reference_layer_name = layerpair.reference_layer
        print(f"Predicting {method} for {use_case}//{layer_name} based on {reference_layer_name}")
        ##########################
        with open(f'data/{use_case}/cluster_metrics/{layer_name}.json') as file:
            cluster_metrics: List[Cluster] = [Cluster.create_from_dict(e) for e in json.loads(file.read())]
            cluster_ids = {c.cluster_id for c in cluster_metrics}
            cluster_metrics: Dict[Any, Cluster] = {(c.time_window_id, c.cluster_id): c for c in cluster_metrics}

        with open(f'data/{use_case}/layer_metrics/{reference_layer_name}.json') as file:
            layer_metrics: List[Layer] = [Layer.create_from_dict(e) for e in json.loads(file.read())]
            layer_metrics: Dict[Any, Layer] = {l.time_window_id: l for l in layer_metrics}
        ######################
        # load the time keys chronologically
        ordered_time_keys = list(layer_metrics.keys())
        ordered_time_keys.sort(key=lambda x: eval(x))
        ######################
        ordered_time_keys = ordered_time_keys[-N:]
        #################
        prediction_metrics_raw = []

        current_layer_metric = layer_metrics[ordered_time_keys[1]]
        prev_layer_metric = layer_metrics[ordered_time_keys[0]]

        current_layer_metric_tuple = get_layer_metrics(current_layer_metric)
        prev_layer_metric_tuple = get_layer_metrics(prev_layer_metric)

        for cluster_id in cluster_ids:
            # yield each combination of reference layer metrics to clusters
            prediction_metrics_raw.append([prev_layer_metric_tuple, current_layer_metric_tuple, int(cluster_id)])
        #######################
        scaler, svc = load_ml_models(use_case, method, layer_name, reference_layer_name)
        ################
        prediction_cluster_ids = []
        prediction_time_window = increase_time_window(ordered_time_keys[1])
        prediction_metrics = []

        for pred in prediction_metrics_raw:
            cluster_id = pred[-1]
            prediction_cluster_ids.append(cluster_id)

            flat_ = flatten_layer_metrics_datapoint(pred)
            prediction_metrics.append(flat_)

        prediction_results = svc.predict(scaler.transform(np.array(prediction_metrics)))
        print(np.unique(prediction_results, return_counts=True))

        for i in range(len(prediction_cluster_ids)):
            res = PredictionResult(use_case, use_case, method, layer_name, reference_layer_name, prediction_cluster_ids[i], prediction_time_window, prediction_results[i])
            repo.add_prediction_result(res)
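Note the shape of the resulting design matrix: every cluster id contributes one row, and all rows share the same two reference-layer feature tuples — only the trailing cluster number differs. For the notebook's YouTube run the predicted class counts (5335 + 1511 + 355 + 13007) sum to 20208, one prediction per cluster id. A sketch of the invariant:

    # One row per cluster: 2 windows x 17 reference-layer values + cluster id = 35 columns.
    X = np.array(prediction_metrics)
    assert X.shape == (len(cluster_ids), 35)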
src/data-hub/proactive-community-detection-microservice/app/processing/ml/predict_single_context.py  (view file @ 495390ac)

-from processing.data_prep.metrics_base import get_cyclic_time_feature
+from processing.data_prep.metrics_base import get_cyclic_time_feature, get_cluster_metrics
+from processing.ml.predict_base import increase_time_window, load_ml_models

 N = 3  # Currently N is fixed to 3
 method = 'single_context'
...
@@ -11,18 +12,11 @@ import json
 from entities import Cluster
 import collections
 import numpy as np
-from typing import Iterable, Tuple
+from typing import Iterable, Tuple, Dict, List
 ######################
-from typing import Dict
-from typing import Tuple
-
-def get_metrics(cur_cluster: Cluster) -> Tuple:
-    return (cur_cluster.size, cur_cluster.std_dev, cur_cluster.scarcity,
-            cur_cluster.importance1, cur_cluster.importance2,
-            cur_cluster.range_, cur_cluster.global_center_distance,
-            get_cyclic_time_feature(cur_cluster.get_time_info()))
 ####################
 import pickle
 #####################
-def flatten_metrics_datapoint(datapoint: list) -> Tuple['X', np.array]:
+def flatten_metrics_datapoint(datapoint: list) -> np.array:
    '''
    Flattens a single metrics data point in the form:
    [(cluster_size, cluster_variance, cluster_density, cluster_import1, cluster_import2, cluster_range, cluster_center, (time_f1, time_f2))^N]
...
@@ -35,16 +29,6 @@ def flatten_metrics_datapoint(datapoint: list) -> np.array:
        flat_list.extend(entry[-1])  # add time tuple

    return np.asarray(flat_list)

 ######################
-def increase_time_window(time_window_id: str):
-    tuple_ = eval(time_window_id)
-
-    if tuple_[1] == 52:
-        # 1st week next year
-        return (tuple_[0]+1, 1)
-    else:
-        # next week
-        return str((tuple_[0], tuple_[1]+1))
-#########################
 from db.repository import Repository
 from db.dao import PredictionResult
...
@@ -73,11 +57,7 @@ def run_prediction(use_case: str):
            cluster_map[id_].append(cluster)

        ####################
-       with open(f'data/{use_case}/ml_output/{method}/{layer_name}.model', 'rb') as file:
-           svc = pickle.load(file)
-       ####################
-       with open(f'data/{use_case}/ml_output/{method}/{layer_name}_scaler.model', 'rb') as file:
-           scaler = pickle.load(file)
+       scaler, svc = load_ml_models(use_case, method, layer_name)
        #####################

        # store id, future time window, and flattened metrics to combine the latter during prediction
        prediction_cluster_ids = []
...
@@ -85,7 +65,7 @@ def run_prediction(use_case: str):
        prediction_metrics = []

        for cluster_id, time_windows in cluster_map.items():
-           v = [get_metrics(c) for c in time_windows[-N:]]  # metrics for last N time windows
+           v = [get_cluster_metrics(c) for c in time_windows[-N:]]  # metrics for last N time windows
            v_flattened = flatten_metrics_datapoint(v)

            prediction_cluster_ids.append(cluster_id)
...
...
src/data-hub/proactive-community-detection-microservice/app/processing/ml/train_cross_context.py  (view file @ 495390ac)
...
@@ -8,10 +8,10 @@ approach = 'cross_context'
 import pickle
 from pathlib import Path

-def export_model(model, use_case, layer_name, reference_layer_name):
+def export_model(model, use_case, layer_name, reference_layer_name, scaler=False):
    fpath = f'data/{use_case}/ml_output/{approach}'
    Path(fpath).mkdir(parents=True, exist_ok=True)

-   with open(f'{fpath}/{layer_name}_{reference_layer_name}.model', 'wb') as f:
+   with open(f'{fpath}/{layer_name}_{reference_layer_name}{"_scaler" if scaler else ""}.model', 'wb') as f:
        pickle.dump(model, f)

 ###################
 from sklearn.ensemble import RandomForestClassifier
...
@@ -46,11 +46,13 @@ def run_training(use_case):
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()

-   train_X = scaler.fit_transform(training)[:,:-1]  # all except y
+   train_X = scaler.fit_transform(training[training.columns[:-1]])  # all except y
    train_Y = training[training.columns[-1]]

-   test_X = scaler.transform(testing)[:,:-1]  # all except y
+   test_X = scaler.transform(testing[testing.columns[:-1]])  # all except y
    test_Y = testing[testing.columns[-1]]

+   export_model(scaler, use_case, layer_name, reference_layer_name, scaler=True)
+
 ########################
 from processing import DataSampler
...
...
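The scaling change matters for prediction-time compatibility: the old code fit the StandardScaler on all columns including the label and only sliced y off afterwards, so the persisted scaler expected one feature too many. Fitting on the feature columns alone keeps scaler.transform() usable on unlabeled prediction rows. A minimal sketch of the failure mode (hypothetical toy frame):

    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    df = pd.DataFrame({'f1': [1., 2., 3.], 'f2': [10., 20., 30.], 'y': [0., 1., 0.]})

    # Old: fit on features *and* y, slice y off afterwards.
    old = StandardScaler().fit(df)               # old.n_features_in_ == 3
    # old.transform(df[['f1', 'f2']])            # -> ValueError: 2 features, expected 3

    # New: fit on features only -- matches prediction-time input.
    new = StandardScaler().fit(df[df.columns[:-1]])   # new.n_features_in_ == 2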
src/data-hub/proactive-community-detection-microservice/app/run_prediction.py  (view file @ 495390ac)
...
@@ -5,12 +5,18 @@ if os.path.exists(modules_path):
    sys.path.insert(1, modules_path)

 from db.repository import Repository
 from processing.ml.predict_single_context import run_prediction as run_single_prediction
-# from processing.ml.predict_cross_context import run_prediction as run_cross_prediction
+from processing.ml.predict_cross_context import run_prediction as run_cross_prediction

 if __name__ == '__main__':
    '''Executes the predictions.'''
    use_case = 'community-prediction-youtube-n'

    repo = Repository()
    repo.delete_all_prediction_results()

    run_single_prediction(use_case)
-   # run_cross_prediction(use_case)
+   run_cross_prediction(use_case)
\ No newline at end of file