Commit b7097db8 authored by Alexander Lercher's avatar Alexander Lercher

Cleanup

parent 495390ac
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"source": [
"use_case = 'community-prediction-youtube-n'\r\n",
"layer_name = 'LikesLayer'\r\n",
"reference_layer_name = 'ViewsLayer'"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 5,
"source": [
"import json\r\n",
"from entities import Cluster\r\n",
"import collections\r\n",
"import numpy as np\r\n",
"from typing import Iterable, Tuple, List, Dict, Any"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 3,
"source": [
"N=2"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 6,
"source": [
"from entities import Layer, Cluster\r\n",
"\r\n",
"with open(f'data/{use_case}/cluster_metrics/{layer_name}.json') as file:\r\n",
" cluster_metrics: List[Cluster] = [Cluster.create_from_dict(e) for e in json.loads(file.read())]\r\n",
" cluster_ids = {c.cluster_id for c in cluster_metrics}\r\n",
" cluster_metrics: Dict[Any, Cluster] = {(c.time_window_id, c.cluster_id): c for c in cluster_metrics}\r\n",
" \r\n",
"with open(f'data/{use_case}/layer_metrics/{reference_layer_name}.json') as file:\r\n",
" layer_metrics: List[Layer] = [Layer.create_from_dict(e) for e in json.loads(file.read())]\r\n",
" layer_metrics: Dict[Any, Layer] = {l.time_window_id: l for l in layer_metrics}\r\n"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 11,
"source": [
"# load the time keys chronologically\r\n",
"ordered_time_keys = list(layer_metrics.keys())\r\n",
"ordered_time_keys.sort(key=lambda x: eval(x))"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 13,
"source": [
"ordered_time_keys = ordered_time_keys[-N:]\r\n",
"ordered_time_keys"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['(2018, 23)', '(2018, 24)']"
]
},
"metadata": {},
"execution_count": 13
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 19,
"source": [
"import numpy as np\r\n",
"\r\n",
"def get_cyclic_time_feature(time: int, max_time_value: int = 52) -> Tuple[float, float]:\r\n",
" return (np.sin(2*np.pi*time/max_time_value),\r\n",
" np.cos(2*np.pi*time/max_time_value))\r\n",
"\r\n",
"def get_cyclic_time_feature_from_time_window(time: str) -> Tuple[float, float]:\r\n",
" return get_cyclic_time_feature(int(time.replace('(', '').replace(')', '').split(',')[1]))\r\n",
"\r\n",
"def get_layer_metrics(layer: Layer) -> Iterable:\r\n",
" res = [layer.n_nodes, layer.n_clusters, layer.entropy]\r\n",
" res += [layer.cluster_size_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]\r\n",
" res += [layer.cluster_relative_size_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]\r\n",
" res += [layer.cluster_center_distance_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]\r\n",
" res.append(get_cyclic_time_feature_from_time_window(layer.time_window_id))\r\n",
" return res"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 25,
"source": [
"prediction_metrics_raw = []"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 26,
"source": [
"current_layer_metric = layer_metrics[ordered_time_keys[1]]\r\n",
"prev_layer_metric = layer_metrics[ordered_time_keys[0]]\r\n",
"\r\n",
"current_layer_metric_tuple = get_layer_metrics(current_layer_metric)\r\n",
"prev_layer_metric_tuple = get_layer_metrics(prev_layer_metric)\r\n",
"\r\n",
"for cluster_id in cluster_ids:\r\n",
" # yield each combination of reference layer metrics to clusters\r\n",
" prediction_metrics_raw.append([prev_layer_metric_tuple, current_layer_metric_tuple, int(cluster_id)])"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 38,
"source": [
"method = 'cross_context'\r\n",
"\r\n",
"import pickle \r\n",
"\r\n",
"with open(f'data/{use_case}/ml_output/{method}/{layer_name}_{reference_layer_name}.model', 'rb') as file:\r\n",
" svc = pickle.load(file)\r\n",
"\r\n",
"with open(f'data/{use_case}/ml_output/{method}/{layer_name}_{reference_layer_name}_scaler.model', 'rb') as file:\r\n",
" scaler = pickle.load(file)"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 38,
"source": [
"import numpy as np\r\n",
"\r\n",
"def get_cyclic_time_feature(time: int, max_time_value: int = 52) -> Tuple[float, float]:\r\n",
" return (np.sin(2*np.pi*time/max_time_value),\r\n",
" np.cos(2*np.pi*time/max_time_value))"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 30,
"source": [
"import numpy as np\r\n",
"\r\n",
"def flatten_layer_metrics_datapoint(datapoint: list) -> np.ndarray:\r\n",
"    '''\r\n",
"    Flattens a single layer-metrics data point of the form:\r\n",
"    [(n_nodes, n_clusters, entropy,\r\n",
"      (cluster-size aggregates), (relative-size aggregates), (center-distance aggregates),\r\n",
"      (time1, time2))^N,\r\n",
"     cluster_number]\r\n",
"    into one flat 1-d feature vector (no label is present or returned here,\r\n",
"    unlike the training-time variant this was copied from).\r\n",
"    '''\r\n",
"    flat_list = []\r\n",
"    for layer_metric_tuple in datapoint[:-1]:  # for all x\r\n",
"        flat_list.extend(layer_metric_tuple[0:-1])  # everything before the time feature\r\n",
"        flat_list.extend(layer_metric_tuple[-1])  # time1/2 (the cyclic-time pair is itself iterable)\r\n",
"\r\n",
"    flat_list.append(datapoint[-1])  # cluster num\r\n",
"\r\n",
"    return np.asarray(flat_list)"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 31,
"source": [
"def increase_time_window(time_window_id: str) -> str:\r\n",
"    '''\r\n",
"    Return the id of the week following *time_window_id* ('(year, week)' string),\r\n",
"    rolling over to week 1 of the next year after week 52.\r\n",
"    Always returns a string (the 52-week branch previously leaked a raw tuple).\r\n",
"    '''\r\n",
"    # NOTE(review): eval on a data-derived string; safe only because the ids come\r\n",
"    # from our own metric files -- consider ast.literal_eval for untrusted input.\r\n",
"    tuple_ = eval(time_window_id)\r\n",
"    \r\n",
"    if tuple_[1] == 52:\r\n",
"        # 1st week next year (bug fix: str() was missing, returning a tuple)\r\n",
"        return str((tuple_[0]+1, 1))\r\n",
"    else:\r\n",
"        # next week\r\n",
"        return str((tuple_[0], tuple_[1]+1))\r\n"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 33,
"source": [],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"895\n",
"[ 1.01800000e+04 6.94600000e+03 1.25669044e+01 1.00000000e+00\n",
" 1.20000000e+01 1.46559171e+00 1.01800000e+04 9.82318271e-05\n",
" 1.17878193e-03 1.43967751e-04 1.00000000e+00 0.00000000e+00\n",
" 2.37254283e+06 1.14923227e+03 7.98256735e+06 3.54604887e-01\n",
" -9.35016243e-01 4.35300000e+03 3.25600000e+03 1.15021768e+01\n",
" 1.00000000e+00 1.00000000e+01 1.33691646e+00 4.35300000e+03\n",
" 2.29726625e-04 2.29726625e-03 3.07125307e-04 1.00000000e+00\n",
" 0.00000000e+00 2.36405615e+05 3.69147185e+02 1.20194323e+06\n",
" 2.39315664e-01 -9.70941817e-01 8.95000000e+02]\n"
]
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 34,
"source": [
"from db.dao import PredictionResult\r\n",
"\r\n",
"prediction_cluster_ids = []\r\n",
"prediction_time_window = increase_time_window(ordered_time_keys[1])\r\n",
"prediction_metrics = []\r\n",
" \r\n",
"for pred in prediction_metrics_raw:\r\n",
" cluster_id = pred[-1]\r\n",
" prediction_cluster_ids.append(cluster_id)\r\n",
"\r\n",
" flat_ = flatten_layer_metrics_datapoint(pred)\r\n",
" prediction_metrics.append(flat_)\r\n",
" "
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 41,
"source": [
"prediction_results = svc.predict(scaler.transform(np.array(prediction_metrics)))"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 42,
"source": [
"prediction_metrics[15]"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([ 1.01800000e+04, 6.94600000e+03, 1.25669044e+01, 1.00000000e+00,\n",
" 1.20000000e+01, 1.46559171e+00, 1.01800000e+04, 9.82318271e-05,\n",
" 1.17878193e-03, 1.43967751e-04, 1.00000000e+00, 0.00000000e+00,\n",
" 2.37254283e+06, 1.14923227e+03, 7.98256735e+06, 3.54604887e-01,\n",
" -9.35016243e-01, 4.35300000e+03, 3.25600000e+03, 1.15021768e+01,\n",
" 1.00000000e+00, 1.00000000e+01, 1.33691646e+00, 4.35300000e+03,\n",
" 2.29726625e-04, 2.29726625e-03, 3.07125307e-04, 1.00000000e+00,\n",
" 0.00000000e+00, 2.36405615e+05, 3.69147185e+02, 1.20194323e+06,\n",
" 2.39315664e-01, -9.70941817e-01, 4.36000000e+03])"
]
},
"metadata": {},
"execution_count": 42
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 29,
"source": [
"dataa = np.array(prediction_metrics)\r\n",
"svc.predict(dataa[3].reshape(1, 27))"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([3.])"
]
},
"metadata": {},
"execution_count": 29
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 43,
"source": [
"predictions = []\r\n",
"for i in range(len(prediction_cluster_ids)):\r\n",
" predictions.append(\r\n",
" PredictionResult(use_case, use_case, method, layer_name, None, prediction_cluster_ids[i], prediction_time_window, prediction_results[i])\r\n",
" )"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 45,
"source": [
"list(zip(np.unique(prediction_results, return_counts=True)))"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[(array([0., 1., 2., 3.]),),\n",
" (array([ 5335, 1511, 355, 13007], dtype=int64),)]"
]
},
"metadata": {},
"execution_count": 45
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 46,
"source": [
"prediction_results"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([3., 0., 0., ..., 0., 3., 3.])"
]
},
"metadata": {},
"execution_count": 46
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 51,
"source": [
"time = '(2019, 45)'\r\n",
"int(time.replace('(', '').replace(')', '').split(',')[1])"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"45"
]
},
"metadata": {},
"execution_count": 51
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 52,
"source": [
"eval(time)[1]"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"45"
]
},
"metadata": {},
"execution_count": 52
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 53,
"source": [
"int(time.split(',')[1].strip()[:-1])"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"45"
]
},
"metadata": {},
"execution_count": 53
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 47,
"source": [
"[r.__dict__ for r in predictions[:10]]"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[{'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': 895,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': 8947,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 0.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': 10464,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 0.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': 14671,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': 18000,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': 17895,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 2.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': 1234,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': 16236,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': 1995,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': 5161,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 0.0}]"
]
},
"metadata": {},
"execution_count": 47
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [],
"outputs": [],
"metadata": {}
}
],
"metadata": {
"interpreter": {
"hash": "f4b37965f8116f61e214526431d03f7da6e57badb249bab76499e8551fed5453"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3.7.8 64-bit ('venv': venv)"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.8"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
\ No newline at end of file
attrs==21.2.0
backcall==0.2.0
beautifulsoup4==4.9.3
certifi==2021.5.30
chardet==4.0.0
charset-normalizer==2.0.3
click==7.1.2
clickclick==20.10.2
colorama==0.4.4
connexion==2.9.0
cycler==0.10.0
debugpy==1.4.0
decorator==5.0.9
Flask==1.1.4
Flask-Cors==3.0.10
idna==3.2
importlib-metadata==4.6.1
imbalanced-learn==0.8.0
imblearn==0.0
importlib-metadata==3.10.1
inflection==0.5.1
ipykernel==6.0.3
ipython==7.25.0
ipython-genutils==0.2.0
isodate==0.6.0
itsdangerous==1.1.0
jedi==0.18.0
Jinja2==2.11.3
joblib==1.0.1
jsonschema==3.2.0
jupyter-client==6.1.12
jupyter-core==4.7.1
kiwisolver==1.3.1
libpysal==4.5.1
MarkupSafe==2.0.1
matplotlib==3.4.2
matplotlib-inline==0.1.2
numpy==1.21.1
openapi-schema-validator==0.1.5
openapi-spec-validator==0.3.1
opencv-contrib-python==4.5.3.56
pandas==1.3.0
parso==0.8.2
pickleshare==0.7.5
Pillow==8.3.1
pointpats==2.2.0
prance==0.21.2
prompt-toolkit==3.0.19
Pygments==2.9.0
pymongo==3.12.0
pyparsing==2.4.7
pyrsistent==0.18.0
python-dateutil==2.8.2
pytz==2021.1
pywin32==301
PyYAML==5.4.1
pyzmq==22.1.0
requests==2.26.0
scikit-learn==0.24.2
scipy==1.7.0
semver==2.13.0
six==1.16.0
soupsieve==2.2.1
swagger-ui-bundle==0.0.8
threadpoolctl==2.2.0
tornado==6.1
traitlets==5.0.5
typing-extensions==3.10.0.0
urllib3==1.26.6
wcwidth==0.2.5
Werkzeug==1.0.1
zipp==3.5.0
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment