UNI-KLU / SMART · Commits

Commit a878064e, authored Jul 26, 2021 by Alexander Lercher

    Correctly predicting with scaled metrics data

parent d94b70d7

Changes: 3 changed files, with 268 additions and 368 deletions (+268 -368)
predict.ipynb (src/data-hub/proactive-community-detection-microservice/app/predict.ipynb): +240 -357
predict_single_context.py (src/data-hub/proactive-community-detection-microservice/app/processing/ml/predict_single_context.py): +22 -7
train_single_context.py (src/data-hub/proactive-community-detection-microservice/app/processing/ml/train_single_context.py): +6 -4
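The substance of the change: training standardizes the flattened metrics with a StandardScaler, but the old prediction path called svc.predict on raw, unscaled metrics, so the model saw features on a different scale than it was trained on. A minimal sketch of the corrected flow, assuming the pickled model and scaler files this commit introduces (paths and names follow the diffs below; this is an illustration, not project API):

import pickle
import numpy as np

use_case = 'community-prediction-youtube-n'
method = 'single_context'
layer_name = 'LikesLayer'

# the classifier and the scaler fitted during training, persisted side by side
with open(f'data/{use_case}/ml_output/{method}/{layer_name}.model', 'rb') as f:
    svc = pickle.load(f)
with open(f'data/{use_case}/ml_output/{method}/{layer_name}_scaler.model', 'rb') as f:
    scaler = pickle.load(f)

X = np.zeros((1, 27))  # one flattened datapoint: N=3 time windows x 9 features each
# before: svc.predict(X) on raw metrics, a different scale than training
# after: standardize with the training-time scaler first
y = svc.predict(scaler.transform(X))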
src/data-hub/proactive-community-detection-microservice/app/predict.ipynb

...
@@ -2,316 +2,61 @@
"cells": [
"cells": [
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count": 3,
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"source": [
"use_case = 'community-prediction-youtube-n'\r\n",
"use_case = 'community-prediction-youtube-n'\r\n",
"layer_name = 'LikesLayer'"
"layer_name = 'LikesLayer'"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\r\n",
"from pandas import DataFrame\r\n",
"\r\n",
"df: DataFrame = pd.read_csv(f'data/{use_case}/ml_input/single_context/{layer_name}.csv', index_col=0)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>cluster_size</th>\n",
" <th>cluster_variance</th>\n",
" <th>cluster_density</th>\n",
" <th>cluster_import1</th>\n",
" <th>cluster_import2</th>\n",
" <th>cluster_area</th>\n",
" <th>cluster_center_distance</th>\n",
" <th>time_f1</th>\n",
" <th>time_f2</th>\n",
" <th>cluster_size.1</th>\n",
" <th>...</th>\n",
" <th>cluster_size.2</th>\n",
" <th>cluster_variance.2</th>\n",
" <th>cluster_density.2</th>\n",
" <th>cluster_import1.2</th>\n",
" <th>cluster_import2.2</th>\n",
" <th>cluster_area.2</th>\n",
" <th>cluster_center_distance.2</th>\n",
" <th>time_f1.2</th>\n",
" <th>time_f2.2</th>\n",
" <th>evolution_label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>565819</th>\n",
" <td>4.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>0.000336</td>\n",
" <td>0.000168</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.992709</td>\n",
" <td>0.120537</td>\n",
" <td>1.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.992709</td>\n",
" <td>-0.120537</td>\n",
" <td>-1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>565820</th>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.935016</td>\n",
" <td>-0.354605</td>\n",
" <td>1.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.822984</td>\n",
" <td>-0.568065</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>565821</th>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.970942</td>\n",
" <td>-0.239316</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.885456</td>\n",
" <td>-0.464723</td>\n",
" <td>-1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>565822</th>\n",
" <td>4.0</td>\n",
" <td>1.089725</td>\n",
" <td>0.75</td>\n",
" <td>0.000334</td>\n",
" <td>0.000166</td>\n",
" <td>3.0</td>\n",
" <td>6.0</td>\n",
" <td>0.885456</td>\n",
" <td>-0.464723</td>\n",
" <td>1.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.748511</td>\n",
" <td>-0.663123</td>\n",
" <td>-1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>565823</th>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.748511</td>\n",
" <td>-0.663123</td>\n",
" <td>1.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.663123</td>\n",
" <td>-0.748511</td>\n",
" <td>-1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 28 columns</p>\n",
"</div>"
],
"text/plain": [
" cluster_size cluster_variance cluster_density cluster_import1 \\\n",
"565819 4.0 0.000000 0.00 0.000336 \n",
"565820 0.0 0.000000 0.00 0.000000 \n",
"565821 0.0 0.000000 0.00 0.000000 \n",
"565822 4.0 1.089725 0.75 0.000334 \n",
"565823 0.0 0.000000 0.00 0.000000 \n",
"\n",
" cluster_import2 cluster_area cluster_center_distance time_f1 \\\n",
"565819 0.000168 0.0 0.0 0.992709 \n",
"565820 0.000000 0.0 0.0 0.935016 \n",
"565821 0.000000 0.0 0.0 0.970942 \n",
"565822 0.000166 3.0 6.0 0.885456 \n",
"565823 0.000000 0.0 0.0 0.748511 \n",
"\n",
" time_f2 cluster_size.1 ... cluster_size.2 cluster_variance.2 \\\n",
"565819 0.120537 1.0 ... 0.0 0.0 \n",
"565820 -0.354605 1.0 ... 0.0 0.0 \n",
"565821 -0.239316 0.0 ... 0.0 0.0 \n",
"565822 -0.464723 1.0 ... 0.0 0.0 \n",
"565823 -0.663123 1.0 ... 0.0 0.0 \n",
"\n",
" cluster_density.2 cluster_import1.2 cluster_import2.2 \\\n",
"565819 0.0 0.0 0.0 \n",
"565820 0.0 0.0 0.0 \n",
"565821 0.0 0.0 0.0 \n",
"565822 0.0 0.0 0.0 \n",
"565823 0.0 0.0 0.0 \n",
"\n",
" cluster_area.2 cluster_center_distance.2 time_f1.2 time_f2.2 \\\n",
"565819 0.0 0.0 0.992709 -0.120537 \n",
"565820 0.0 0.0 0.822984 -0.568065 \n",
"565821 0.0 0.0 0.885456 -0.464723 \n",
"565822 0.0 0.0 0.748511 -0.663123 \n",
"565823 0.0 0.0 0.663123 -0.748511 \n",
"\n",
" evolution_label \n",
"565819 -1.0 \n",
"565820 4.0 \n",
"565821 -1.0 \n",
"565822 -1.0 \n",
"565823 -1.0 \n",
"\n",
"[5 rows x 28 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
],
"source": [
"outputs": [],
"df.tail()"
"metadata": {}
]
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count": 5,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"source": [
"import json\r\n",
"import json\r\n",
"from entities import Cluster\r\n",
"from entities import Cluster\r\n",
"import collections\r\n",
"import collections\r\n",
"import numpy as np\r\n",
"import numpy as np\r\n",
"from typing import Iterable, Tuple"
"from typing import Iterable, Tuple"
]
],
"outputs": [],
"metadata": {}
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count": 6,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"source": [
"N=3"
"N=3"
]
],
"outputs": [],
"metadata": {}
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count": 7,
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"source": [
"path_in = f\"data/{use_case}/cluster_metrics/{layer_name}.json\"\r\n",
"path_in = f\"data/{use_case}/cluster_metrics/{layer_name}.json\"\r\n",
"with open(path_in, 'r') as file:\r\n",
"with open(path_in, 'r') as file:\r\n",
" data = [Cluster.create_from_dict(cl_d) for cl_d in json.loads(file.read())]\r\n",
" data = [Cluster.create_from_dict(cl_d) for cl_d in json.loads(file.read())]\r\n",
"\r\n",
"\r\n",
"data.sort(key=lambda cl: (eval(cl.cluster_id), eval(cl.time_window_id)))"
"data.sort(key=lambda cl: (eval(cl.cluster_id), eval(cl.time_window_id)))"
]
],
"outputs": [],
"metadata": {}
},
},
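One remark on the sort key in that cell: eval executes arbitrary strings from the input file. Since cluster_id and time_window_id are just stringified ints and tuples, ast.literal_eval parses the same values without the code-execution risk. A drop-in alternative for that line (a suggestion, not part of this commit):

import ast

# '20207' -> 20207 and '(2018, 24)' -> (2018, 24), but refuses anything that is not a literal
data.sort(key=lambda cl: (ast.literal_eval(cl.cluster_id), ast.literal_eval(cl.time_window_id)))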
  {
   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'time_window_id': '(2018, 24)', 'cluster_id': '20207', 'size': 0, 'std_dev': 0, 'scarcity': 0, 'importance1': 0, 'importance2': 0, 'range_': 0.0, 'center': [0, 0], 'global_center_distance': 0}"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "execution_count": null,
   "source": [
    "data[-1]"
-   ]
+   ],
+   "outputs": [],
+   "metadata": {}
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 54,
   "source": [
    "cluster_map = {}\r\n",
    "\r\n",
...
@@ -325,78 +70,67 @@
    "        cluster_map[id_] = []\r\n",
    "\r\n",
    "    cluster_map[id_].append(cluster)\r\n"
-   ]
+   ],
+   "outputs": [],
+   "metadata": {}
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
+   "execution_count": 55,
+   "source": [
+    "{c.cluster_id for c in data} == cluster_map.keys()"
+   ],
   "outputs": [
    {
+     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "True"
      ]
     },
-     "execution_count": 9,
     "metadata": {},
-     "output_type": "execute_result"
+     "execution_count": 55
    }
   ],
-   "source": [
-    "{c.cluster_id for c in data} == cluster_map.keys()"
-   ]
+   "metadata": {}
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "20208"
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "execution_count": null,
   "source": [
    "len(cluster_map.keys())"
-   ]
+   ],
+   "outputs": [],
+   "metadata": {}
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 38,
   "source": [
    "import numpy as np\r\n",
    "\r\n",
    "def get_cyclic_time_feature(time: int, max_time_value: int = 52) -> Tuple[float, float]:\r\n",
    "    return (np.sin(2*np.pi*time/max_time_value),\r\n",
    "            np.cos(2*np.pi*time/max_time_value))"
-   ]
+   ],
+   "outputs": [],
+   "metadata": {}
  },
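The sin/cos pair above places week numbers on the unit circle, so week 52 and week 1 come out adjacent instead of 51 apart. A quick standalone check of that property:

import numpy as np

def get_cyclic_time_feature(time: int, max_time_value: int = 52):
    # same definition as the cell above
    return (np.sin(2*np.pi*time/max_time_value),
            np.cos(2*np.pi*time/max_time_value))

w1  = np.array(get_cyclic_time_feature(1))
w52 = np.array(get_cyclic_time_feature(52))
w26 = np.array(get_cyclic_time_feature(26))
print(np.linalg.norm(w1 - w52))  # ~0.12: the year boundary stays close
print(np.linalg.norm(w1 - w26))  # ~2.0: mid-year is maximally far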
  {
   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 8,
   "source": [
    "from typing import Tuple\r\n",
    "\r\n",
    "def get_metrics(cur_cluster: Cluster) -> Tuple:\r\n",
    "    return (cur_cluster.size, cur_cluster.std_dev, cur_cluster.scarcity, cur_cluster.importance1, cur_cluster.importance2, cur_cluster.range_, cur_cluster.global_center_distance, get_cyclic_time_feature(cur_cluster.get_time_info()))"
-   ]
+   ],
+   "outputs": [],
+   "metadata": {}
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 56,
   "source": [
    "import pickle \r\n",
    "\r\n",
...
@@ -404,13 +138,25 @@
    "\r\n",
    "with open(f'data/{use_case}/ml_output/{method}/{layer_name}.model', 'rb') as file:\r\n",
    "    svc = pickle.load(file)"
-   ]
+   ],
+   "outputs": [],
+   "metadata": {}
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 63,
+   "source": [
+    "import pickle \r\n",
+    "\r\n",
+    "with open(f'data/{use_case}/ml_output/{method}/{layer_name}_scaler.model', 'rb') as file:\r\n",
+    "    scaler = pickle.load(file)"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
   "source": [
    "def flatten_metrics_datapoint(datapoint: list) -> Tuple['X', np.array]:\r\n",
    "    '''\r\n",
...
@@ -426,13 +172,13 @@
    "\r\n",
    "    # flat_list.append(datapoint[-1]) # y\r\n",
    "    return np.asarray(flat_list)"
-   ]
+   ],
+   "outputs": [],
+   "metadata": {}
  },
  {
   "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 11,
   "source": [
    "def increase_time_window(time_window_id: str):\r\n",
    "    tuple_ = eval(time_window_id)\r\n",
...
@@ -443,32 +189,180 @@
    "    else:\r\n",
    "        # next week\r\n",
    "        return str((tuple_[0], tuple_[1]+1))\r\n"
-   ]
+   ],
+   "outputs": [],
+   "metadata": {}
  },
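increase_time_window advances a stringified (year, week) tuple; the diff collapses the branch before the else, so the rollover case is not shown. A hedged reconstruction for illustration (the week-52 branch here is an assumption):

def increase_time_window(time_window_id: str):
    tuple_ = eval(time_window_id)
    if tuple_[1] == 52:
        # assumed: wrap to week 1 of the next year
        return str((tuple_[0]+1, 1))
    else:
        # next week
        return str((tuple_[0], tuple_[1]+1))

print(increase_time_window('(2018, 24)'))  # '(2018, 25)', matching the time_window values below
print(increase_time_window('(2018, 52)'))  # '(2019, 1)'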
  {
   "cell_type": "code",
-   "execution_count": 20,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 58,
   "source": [
-    "from entities import PredictionResult\r\n",
+    "from db.dao import PredictionResult\r\n",
    "\r\n",
-    "prediction_results = []\r\n",
+    "# prediction_results = []\r\n",
+    "prediction_cluster_ids = []\r\n",
+    "prediction_time_windows = []\r\n",
+    "prediction_metrics = []\r\n",
    "\r\n",
    "for cluster_id, time_windows in cluster_map.items():\r\n",
    "    v = [get_metrics(c) for c in time_windows[-N:]] # metrics for last N time windows\r\n",
    "    v_flattened = flatten_metrics_datapoint(v)\r\n",
-    "    v_flattened = v_flattened.reshape(1, v_flattened.shape[0]) # reshape for ML with only 1 pred value\r\n",
-    "\r\n",
-    "    res = PredictionResult(use_case, use_case, method, layer_name, None, cluster_id, increase_time_window(time_windows[-1].time_window_id), svc.predict(v_flattened)[0])\r\n",
-    "    prediction_results.append(res)"
+    "    prediction_cluster_ids.append(cluster_id)\r\n",
+    "    prediction_time_windows.append(increase_time_window(time_windows[-1].time_window_id))\r\n",
+    "    prediction_metrics.append(v_flattened)\r\n",
+    "\r\n",
+    "    # v_flattened = v_flattened.reshape(1, v_flattened.shape[0]) # reshape for ML with only 1 pred value\r\n",
+    "    # res = PredictionResult(use_case, use_case, method, layer_name, None, cluster_id, increase_time_window(time_windows[-1].time_window_id), svc.predict(v_flattened)[0])\r\n",
+    "    # prediction_results.append(res)"
-   ]
+   ],
+   "outputs": [],
+   "metadata": {}
  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "source": [
+    "scaler.transform(prediction_metrics[0].reshape(1,27))"
+   ],
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "array([[-0.2525847 , -0.00725354, -0.00748744, -0.26150883, -0.61179695,\n",
+       "        -0.00699078, -0.0156031 ,  0.10230883, -1.49959068, -0.25198809,\n",
+       "        -0.00721248, -0.00740694, -0.2559145 , -0.6125857 , -0.0069614 ,\n",
+       "        -0.01582086, -0.22871208, -1.567934  , -0.25144835, -0.00729236,\n",
+       "        -0.00753175, -0.25448947, -0.6134931 , -0.00698498, -0.01589221,\n",
+       "        -0.63013244, -1.62002196]])"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 64
+    }
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "source": [
+    "prediction_results = svc.predict(scaler.transform(np.array(prediction_metrics)))"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
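These two cells are the heart of the fix: every flattened metrics vector is standardized with the persisted scaler, and the SVM then predicts the whole (n_clusters, 27) matrix in one call instead of one row at a time. The same pattern as a self-contained sketch (synthetic data; LinearSVC stands in for whatever classifier the .model pickle actually holds):

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

rng = np.random.default_rng(0)
train_X = rng.normal(size=(100, 27))
train_y = rng.integers(0, 5, size=100)

scaler = StandardScaler()
svc = LinearSVC().fit(scaler.fit_transform(train_X), train_y)   # train on scaled features

prediction_metrics = [rng.normal(size=27) for _ in range(10)]   # one vector per cluster
X = scaler.transform(np.array(prediction_metrics))              # same scaling at predict time
prediction_results = svc.predict(X)
print(prediction_results.shape)                                 # (10,)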
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "source": [
+    "prediction_metrics[15]"
+   ],
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
+       "        0.        ,  0.        ,  0.46472317, -0.88545603,  0.        ,\n",
+       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
+       "        0.        ,  0.35460489, -0.93501624,  0.        ,  0.        ,\n",
+       "        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
+       "        0.23931566, -0.97094182])"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 67
+    }
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "source": [
+    "dataa = np.array(prediction_metrics)\r\n",
+    "svc.predict(dataa[3].reshape(1, 27))"
+   ],
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "array([3.])"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 29
+    }
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "source": [
+    "predictions = []\r\n",
+    "for i in range(len(prediction_cluster_ids)):\r\n",
+    "    predictions.append(\r\n",
+    "        PredictionResult(use_case, use_case, method, layer_name, None, prediction_cluster_ids[i], prediction_time_windows[i], prediction_results[i])\r\n",
+    "    )"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 74,
+   "source": [
+    "list(zip(np.unique(prediction_results, return_counts=True)))"
+   ],
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "[(array([0., 1., 2., 3., 4.]),),\n",
+       " (array([ 2740,   596,  1429,  1324, 14119], dtype=int64),)]"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 74
+    }
+   ],
+   "metadata": {}
+  },
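Aside: list(zip(...)) around the single tuple returned by np.unique merely wraps each array in a 1-tuple, as the output above shows. If the intent is a label-to-count table, unpacking reads more directly:

import numpy as np

prediction_results = np.array([4., 4., 0., 3., 0.])                   # stand-in values
print(dict(zip(*np.unique(prediction_results, return_counts=True))))  # {0.0: 2, 3.0: 1, 4.0: 2}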
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "source": [
+    "prediction_results"
+   ],
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "array([4., 4., 0., ..., 0., 0., 0.])"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 70
+    }
+   ],
+   "metadata": {}
+  },
  {
   "cell_type": "code",
-   "execution_count": 22,
-   "metadata": {},
+   "execution_count": 15,
+   "source": [
+    "[r.__dict__ for r in predictions[:10]]"
+   ],
   "outputs": [
    {
+     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "[{'use_case': 'community-prediction-youtube-n',\n",
...
@@ -478,7 +372,7 @@
       "  'reference_layer': None,\n",
       "  'cluster_id': '0',\n",
       "  'time_window': '(2018, 25)',\n",
-       "  'prediction': 2.0},\n",
+       "  'prediction': 3.0},\n",
       " {'use_case': 'community-prediction-youtube-n',\n",
       "  'table': 'community-prediction-youtube-n',\n",
       "  'method': 'single_context',\n",
...
@@ -486,7 +380,7 @@
       "  'reference_layer': None,\n",
       "  'cluster_id': '1',\n",
       "  'time_window': '(2018, 25)',\n",
-       "  'prediction': 2.0},\n",
+       "  'prediction': 3.0},\n",
       " {'use_case': 'community-prediction-youtube-n',\n",
       "  'table': 'community-prediction-youtube-n',\n",
       "  'method': 'single_context',\n",
...
@@ -502,7 +396,7 @@
       "  'reference_layer': None,\n",
       "  'cluster_id': '3',\n",
       "  'time_window': '(2018, 25)',\n",
-       "  'prediction': 2.0},\n",
+       "  'prediction': 3.0},\n",
       " {'use_case': 'community-prediction-youtube-n',\n",
       "  'table': 'community-prediction-youtube-n',\n",
       "  'method': 'single_context',\n",
...
@@ -510,7 +404,7 @@
       "  'reference_layer': None,\n",
       "  'cluster_id': '4',\n",
       "  'time_window': '(2018, 25)',\n",
-       "  'prediction': 2.0},\n",
+       "  'prediction': 3.0},\n",
       " {'use_case': 'community-prediction-youtube-n',\n",
       "  'table': 'community-prediction-youtube-n',\n",
       "  'method': 'single_context',\n",
...
@@ -518,7 +412,7 @@
       "  'reference_layer': None,\n",
       "  'cluster_id': '5',\n",
       "  'time_window': '(2018, 25)',\n",
-       "  'prediction': 2.0},\n",
+       "  'prediction': 3.0},\n",
       " {'use_case': 'community-prediction-youtube-n',\n",
       "  'table': 'community-prediction-youtube-n',\n",
       "  'method': 'single_context',\n",
...
@@ -526,7 +420,7 @@
       "  'reference_layer': None,\n",
       "  'cluster_id': '6',\n",
       "  'time_window': '(2018, 25)',\n",
-       "  'prediction': 2.0},\n",
+       "  'prediction': 3.0},\n",
       " {'use_case': 'community-prediction-youtube-n',\n",
       "  'table': 'community-prediction-youtube-n',\n",
       "  'method': 'single_context',\n",
...
@@ -542,7 +436,7 @@
       "  'reference_layer': None,\n",
       "  'cluster_id': '8',\n",
       "  'time_window': '(2018, 25)',\n",
-       "  'prediction': 2.0},\n",
+       "  'prediction': 3.0},\n",
       " {'use_case': 'community-prediction-youtube-n',\n",
       "  'table': 'community-prediction-youtube-n',\n",
       "  'method': 'single_context',\n",
...
@@ -550,41 +444,30 @@
       "  'reference_layer': None,\n",
       "  'cluster_id': '9',\n",
       "  'time_window': '(2018, 25)',\n",
-       "  'prediction': 2.0}]"
+       "  'prediction': 3.0}]"
      ]
     },
-     "execution_count": 22,
     "metadata": {},
-     "output_type": "execute_result"
+     "execution_count": 15
    }
   ],
-   "source": [
-    "[r.__dict__ for r in prediction_results[:10]]"
-   ]
+   "metadata": {}
  },
  {
   "cell_type": "code",
-   "execution_count": 79,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "0\n"
-     ]
-    }
-   ],
-   "source": []
+   "execution_count": null,
+   "source": [],
+   "outputs": [],
+   "metadata": {}
  }
 ],
 "metadata": {
  "interpreter": {
-   "hash": "6f758d9e9b2866087a1d464f700475727f47c3870deef6e7815ca445f120e6ad"
+   "hash": "f4b37965f8116f61e214526431d03f7da6e57badb249bab76499e8551fed5453"
  },
  "kernelspec": {
-   "display_name": "Python 3.7.6 64-bit ('venv': venv)",
-   "name": "python3"
+   "name": "python3",
+   "display_name": "Python 3.7.8 64-bit ('venv': venv)"
  },
  "language_info": {
   "codemirror_mode": {
...
@@ -596,7 +479,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.7.6"
+   "version": "3.7.8"
  },
  "orig_nbformat": 4
 },
...
src/data-hub/proactive-community-detection-microservice/app/processing/ml/predict_single_context.py

...
@@ -17,7 +17,8 @@ from typing import Dict
 from typing import Tuple
 def get_metrics(cur_cluster: Cluster) -> Tuple:
     return (cur_cluster.size, cur_cluster.std_dev, cur_cluster.scarcity, cur_cluster.importance1, cur_cluster.importance2, cur_cluster.range_, cur_cluster.global_center_distance, get_cyclic_time_feature(cur_cluster.get_time_info()))
 ####################
 import pickle
 #####################
...
@@ -53,9 +54,8 @@ repo = Repository()
 def run_prediction(use_case: str):
     for layer in repo.get_layers_for_use_case(use_case):
         layer_name = layer.layer_name
-################
-        df: DataFrame = pd.read_csv(f'data/{use_case}/ml_input/single_context/{layer_name}.csv', index_col=0)
+        print(f"Predicting {method} for {use_case}//{layer_name}")
 #################
         path_in = f"data/{use_case}/cluster_metrics/{layer_name}.json"
         with open(path_in, 'r') as file:
...
@@ -75,12 +75,27 @@ def run_prediction(use_case: str):
 ####################
         with open(f'data/{use_case}/ml_output/{method}/{layer_name}.model', 'rb') as file:
             svc = pickle.load(file)
+####################
+        with open(f'data/{use_case}/ml_output/{method}/{layer_name}_scaler.model', 'rb') as file:
+            scaler = pickle.load(file)
 #####################
+        # store id, future time window, and flattened metrics to combine the latter during prediction
+        prediction_cluster_ids = []
+        prediction_time_windows = []
+        prediction_metrics = []
         for cluster_id, time_windows in cluster_map.items():
             v = [get_metrics(c) for c in time_windows[-N:]] # metrics for last N time windows
             v_flattened = flatten_metrics_datapoint(v)
-            v_flattened = v_flattened.reshape(1, v_flattened.shape[0]) # reshape for ML with only 1 pred value
-            res = PredictionResult(use_case, use_case, method, layer_name, None, cluster_id, increase_time_window(time_windows[-1].time_window_id), svc.predict(v_flattened)[0])
-            repo.add_prediction_result(res)
+            prediction_cluster_ids.append(cluster_id)
+            prediction_time_windows.append(increase_time_window(time_windows[-1].time_window_id))
+            prediction_metrics.append(v_flattened)
 #####################
+        # predict all at once for speedup
+        prediction_results = svc.predict(scaler.transform(np.array(prediction_metrics)))
+        print(np.unique(prediction_results, return_counts=True))
+        for i in range(len(prediction_cluster_ids)):
+            res = PredictionResult(use_case, use_case, method, layer_name, None, prediction_cluster_ids[i], prediction_time_windows[i], prediction_results[i])
+            repo.add_prediction_result(res)
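The rewrite relies on three parallel lists built in a single pass, plus the fact that svc.predict returns one label per input row in row order, so prediction_results[i] still belongs to prediction_cluster_ids[i] and prediction_time_windows[i]. A toy check of that alignment (stand-in data and classifier, not the project's model):

import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.array([[0.0], [1.0], [0.1], [0.9]])
y = np.array([0, 1, 0, 1])
clf = LogisticRegression().fit(X, y)

ids = ['a', 'b', 'c']                       # parallel list, like prediction_cluster_ids
batch = np.array([[0.05], [0.95], [0.0]])   # parallel matrix of flattened metrics
labels = clf.predict(batch)                 # one label per row, in row order
print(dict(zip(ids, labels)))               # {'a': 0, 'b': 1, 'c': 0}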
src/data-hub/proactive-community-detection-microservice/app/processing/ml/train_single_context.py

...
@@ -8,10 +8,10 @@ approach = 'single_context'
 import pickle
 from pathlib import Path
-def export_model(model, use_case, layer_name):
+def export_model(model, use_case, layer_name, scaler=False):
     fpath = f'data/{use_case}/ml_output/{approach}'
     Path(fpath).mkdir(parents=True, exist_ok=True)
-    with open(f'{fpath}/{layer_name}.model', 'wb') as f:
+    with open(f'{fpath}/{layer_name}{"_scaler" if scaler else ""}.model', 'wb') as f:
         pickle.dump(model, f)
 #####################
 from sklearn.ensemble import RandomForestClassifier
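With the new scaler flag, export_model writes the fitted scaler next to the classifier under the same layer name, which is exactly the pair the prediction side now loads. Usage as implied by the diff (stand-in objects; in the real pipeline both come out of run_training):

import pickle
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

approach = 'single_context'

def export_model(model, use_case, layer_name, scaler=False):
    # as defined in the hunk above
    fpath = f'data/{use_case}/ml_output/{approach}'
    Path(fpath).mkdir(parents=True, exist_ok=True)
    with open(f'{fpath}/{layer_name}{"_scaler" if scaler else ""}.model', 'wb') as f:
        pickle.dump(model, f)

use_case, layer_name = 'community-prediction-youtube-n', 'LikesLayer'
export_model(RandomForestClassifier(), use_case, layer_name)        # writes LikesLayer.model
export_model(StandardScaler(), use_case, layer_name, scaler=True)   # writes LikesLayer_scaler.model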
...
@@ -45,11 +45,13 @@ def run_training(use_case):
     from sklearn.preprocessing import StandardScaler
     scaler = StandardScaler()
-    train_X = scaler.fit_transform(training)[:,:-1] # all except y
+    train_X = scaler.fit_transform(training[training.columns[:-1]]) # all except y
     train_Y = training[training.columns[-1]]
-    test_X = scaler.transform(testing)[:,:-1] # all except y
+    test_X = scaler.transform(testing[testing.columns[:-1]]) # all except y
     test_Y = testing[testing.columns[-1]]
+    export_model(scaler, use_case, layer_name, scaler=True)
 ########################
     from processing import DataSampler
...
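The train_X change matters beyond style: the old code fitted the scaler on all 28 columns of the training frame, evolution_label included, and only sliced the label off after transforming. A scaler fitted that way expects 28 inputs, so it could never be applied to the 27-feature vectors assembled at prediction time, and its statistics were polluted by the label column. Fitting on training.columns[:-1] keeps the scaler's input layout identical to what predict_single_context.py now feeds it. A small demonstration of the mismatch (synthetic data):

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

cols = [f'f{i}' for i in range(27)] + ['evolution_label']
training = pd.DataFrame(np.random.rand(10, 28), columns=cols)

old = StandardScaler().fit(training)                          # fitted on 28 columns, label included
new = StandardScaler().fit(training[training.columns[:-1]])   # fitted on the 27 features only

x = np.random.rand(1, 27)            # a flattened prediction-time datapoint
print(new.transform(x).shape)        # (1, 27): works
try:
    old.transform(x)
except ValueError as e:
    print(e)                         # feature-count mismatch (exact message varies by sklearn version)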