Commit b7097db8 authored by Alexander Lercher's avatar Alexander Lercher

Cleanup

parent 495390ac
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"source": [
"use_case = 'community-prediction-youtube-n'\r\n",
"layer_name = 'LikesLayer'\r\n",
"reference_layer_name = 'ViewsLayer'"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 5,
"source": [
"import json\r\n",
"from entities import Cluster\r\n",
"import collections\r\n",
"import numpy as np\r\n",
"from typing import Iterable, Tuple, List, Dict, Any"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 3,
"source": [
"N=2"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 6,
"source": [
"from entities import Layer, Cluster\r\n",
"\r\n",
"with open(f'data/{use_case}/cluster_metrics/{layer_name}.json') as file:\r\n",
" cluster_metrics: List[Cluster] = [Cluster.create_from_dict(e) for e in json.loads(file.read())]\r\n",
" cluster_ids = {c.cluster_id for c in cluster_metrics}\r\n",
" cluster_metrics: Dict[Any, Cluster] = {(c.time_window_id, c.cluster_id): c for c in cluster_metrics}\r\n",
" \r\n",
"with open(f'data/{use_case}/layer_metrics/{reference_layer_name}.json') as file:\r\n",
" layer_metrics: List[Layer] = [Layer.create_from_dict(e) for e in json.loads(file.read())]\r\n",
" layer_metrics: Dict[Any, Layer] = {l.time_window_id: l for l in layer_metrics}\r\n"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 11,
"source": [
"# load the time keys chronologically\r\n",
"ordered_time_keys = list(layer_metrics.keys())\r\n",
"ordered_time_keys.sort(key=lambda x: eval(x))"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 13,
"source": [
"ordered_time_keys = ordered_time_keys[-N:]\r\n",
"ordered_time_keys"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['(2018, 23)', '(2018, 24)']"
]
},
"metadata": {},
"execution_count": 13
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 19,
"source": [
"import numpy as np\r\n",
"\r\n",
"def get_cyclic_time_feature(time: int, max_time_value: int = 52) -> Tuple[float, float]:\r\n",
" return (np.sin(2*np.pi*time/max_time_value),\r\n",
" np.cos(2*np.pi*time/max_time_value))\r\n",
"\r\n",
"def get_cyclic_time_feature_from_time_window(time: str) -> Tuple[float, float]:\r\n",
" return get_cyclic_time_feature(int(time.replace('(', '').replace(')', '').split(',')[1]))\r\n",
"\r\n",
"def get_layer_metrics(layer: Layer) -> Iterable:\r\n",
" res = [layer.n_nodes, layer.n_clusters, layer.entropy]\r\n",
" res += [layer.cluster_size_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]\r\n",
" res += [layer.cluster_relative_size_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]\r\n",
" res += [layer.cluster_center_distance_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]\r\n",
" res.append(get_cyclic_time_feature_from_time_window(layer.time_window_id))\r\n",
" return res"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 25,
"source": [
"prediction_metrics_raw = []"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 26,
"source": [
"current_layer_metric = layer_metrics[ordered_time_keys[1]]\r\n",
"prev_layer_metric = layer_metrics[ordered_time_keys[0]]\r\n",
"\r\n",
"current_layer_metric_tuple = get_layer_metrics(current_layer_metric)\r\n",
"prev_layer_metric_tuple = get_layer_metrics(prev_layer_metric)\r\n",
"\r\n",
"for cluster_id in cluster_ids:\r\n",
" # yield each combination of reference layer metrics to clusters\r\n",
" prediction_metrics_raw.append([prev_layer_metric_tuple, current_layer_metric_tuple, int(cluster_id)])"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 38,
"source": [
"method = 'cross_context'\r\n",
"\r\n",
"import pickle \r\n",
"\r\n",
"with open(f'data/{use_case}/ml_output/{method}/{layer_name}_{reference_layer_name}.model', 'rb') as file:\r\n",
" svc = pickle.load(file)\r\n",
"\r\n",
"with open(f'data/{use_case}/ml_output/{method}/{layer_name}_{reference_layer_name}_scaler.model', 'rb') as file:\r\n",
" scaler = pickle.load(file)"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 38,
"source": [
"import numpy as np\r\n",
"\r\n",
"def get_cyclic_time_feature(time: int, max_time_value: int = 52) -> Tuple[float, float]:\r\n",
" return (np.sin(2*np.pi*time/max_time_value),\r\n",
" np.cos(2*np.pi*time/max_time_value))"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 30,
"source": [
"import numpy as np\r\n",
"\r\n",
"def flatten_layer_metrics_datapoint(datapoint: list) -> np.ndarray:\r\n",
"    '''\r\n",
"    Flattens a single layer-metrics data point of the form:\r\n",
"    [(n_nodes, n_clusters, entropy,\r\n",
"      (cluster-size aggregates), (relative-size aggregates), (center-distance aggregates),\r\n",
"      (time1, time2))^N,\r\n",
"     cluster_number]\r\n",
"    into one flat 1-d feature vector (no label is present or returned here,\r\n",
"    unlike the training-time variant this was copied from).\r\n",
"    '''\r\n",
"    flat_list = []\r\n",
"    for layer_metric_tuple in datapoint[:-1]:  # for all x\r\n",
"        flat_list.extend(layer_metric_tuple[0:-1])  # everything before the time feature\r\n",
"        flat_list.extend(layer_metric_tuple[-1])  # time1/2 (the cyclic-time pair is itself iterable)\r\n",
"\r\n",
"    flat_list.append(datapoint[-1])  # cluster num\r\n",
"\r\n",
"    return np.asarray(flat_list)"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 31,
"source": [
"def increase_time_window(time_window_id: str) -> str:\r\n",
"    '''\r\n",
"    Return the id of the week following *time_window_id* ('(year, week)' string),\r\n",
"    rolling over to week 1 of the next year after week 52.\r\n",
"    Always returns a string (the 52-week branch previously leaked a raw tuple).\r\n",
"    '''\r\n",
"    # NOTE(review): eval on a data-derived string; safe only because the ids come\r\n",
"    # from our own metric files -- consider ast.literal_eval for untrusted input.\r\n",
"    tuple_ = eval(time_window_id)\r\n",
"    \r\n",
"    if tuple_[1] == 52:\r\n",
"        # 1st week next year (bug fix: str() was missing, returning a tuple)\r\n",
"        return str((tuple_[0]+1, 1))\r\n",
"    else:\r\n",
"        # next week\r\n",
"        return str((tuple_[0], tuple_[1]+1))\r\n"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 33,
"source": [],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"895\n",
"[ 1.01800000e+04 6.94600000e+03 1.25669044e+01 1.00000000e+00\n",
" 1.20000000e+01 1.46559171e+00 1.01800000e+04 9.82318271e-05\n",
" 1.17878193e-03 1.43967751e-04 1.00000000e+00 0.00000000e+00\n",
" 2.37254283e+06 1.14923227e+03 7.98256735e+06 3.54604887e-01\n",
" -9.35016243e-01 4.35300000e+03 3.25600000e+03 1.15021768e+01\n",
" 1.00000000e+00 1.00000000e+01 1.33691646e+00 4.35300000e+03\n",
" 2.29726625e-04 2.29726625e-03 3.07125307e-04 1.00000000e+00\n",
" 0.00000000e+00 2.36405615e+05 3.69147185e+02 1.20194323e+06\n",
" 2.39315664e-01 -9.70941817e-01 8.95000000e+02]\n"
]
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 34,
"source": [
"from db.dao import PredictionResult\r\n",
"\r\n",
"prediction_cluster_ids = []\r\n",
"prediction_time_window = increase_time_window(ordered_time_keys[1])\r\n",
"prediction_metrics = []\r\n",
" \r\n",
"for pred in prediction_metrics_raw:\r\n",
" cluster_id = pred[-1]\r\n",
" prediction_cluster_ids.append(cluster_id)\r\n",
"\r\n",
" flat_ = flatten_layer_metrics_datapoint(pred)\r\n",
" prediction_metrics.append(flat_)\r\n",
" "
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 41,
"source": [
"prediction_results = svc.predict(scaler.transform(np.array(prediction_metrics)))"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 42,
"source": [
"prediction_metrics[15]"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([ 1.01800000e+04, 6.94600000e+03, 1.25669044e+01, 1.00000000e+00,\n",
" 1.20000000e+01, 1.46559171e+00, 1.01800000e+04, 9.82318271e-05,\n",
" 1.17878193e-03, 1.43967751e-04, 1.00000000e+00, 0.00000000e+00,\n",
" 2.37254283e+06, 1.14923227e+03, 7.98256735e+06, 3.54604887e-01,\n",
" -9.35016243e-01, 4.35300000e+03, 3.25600000e+03, 1.15021768e+01,\n",
" 1.00000000e+00, 1.00000000e+01, 1.33691646e+00, 4.35300000e+03,\n",
" 2.29726625e-04, 2.29726625e-03, 3.07125307e-04, 1.00000000e+00,\n",
" 0.00000000e+00, 2.36405615e+05, 3.69147185e+02, 1.20194323e+06,\n",
" 2.39315664e-01, -9.70941817e-01, 4.36000000e+03])"
]
},
"metadata": {},
"execution_count": 42
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 29,
"source": [
"dataa = np.array(prediction_metrics)\r\n",
"svc.predict(dataa[3].reshape(1, 27))"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([3.])"
]
},
"metadata": {},
"execution_count": 29
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 43,
"source": [
"predictions = []\r\n",
"for i in range(len(prediction_cluster_ids)):\r\n",
" predictions.append(\r\n",
" PredictionResult(use_case, use_case, method, layer_name, None, prediction_cluster_ids[i], prediction_time_window, prediction_results[i])\r\n",
" )"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 45,
"source": [
"list(zip(np.unique(prediction_results, return_counts=True)))"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[(array([0., 1., 2., 3.]),),\n",
" (array([ 5335, 1511, 355, 13007], dtype=int64),)]"
]
},
"metadata": {},
"execution_count": 45
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 46,
"source": [
"prediction_results"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([3., 0., 0., ..., 0., 3., 3.])"
]
},
"metadata": {},
"execution_count": 46
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 51,
"source": [
"time = '(2019, 45)'\r\n",
"int(time.replace('(', '').replace(')', '').split(',')[1])"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"45"
]
},
"metadata": {},
"execution_count": 51
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 52,
"source": [
"eval(time)[1]"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"45"
]
},
"metadata": {},
"execution_count": 52
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 53,
"source": [
"int(time.split(',')[1].strip()[:-1])"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"45"
]
},
"metadata": {},
"execution_count": 53
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 47,
"source": [
"[r.__dict__ for r in predictions[:10]]"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[{'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': 895,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': 8947,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 0.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': 10464,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 0.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': 14671,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': 18000,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': 17895,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 2.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': 1234,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': 16236,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': 1995,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': 5161,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 0.0}]"
]
},
"metadata": {},
"execution_count": 47
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [],
"outputs": [],
"metadata": {}
}
],
"metadata": {
"interpreter": {
"hash": "f4b37965f8116f61e214526431d03f7da6e57badb249bab76499e8551fed5453"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3.7.8 64-bit ('venv': venv)"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.8"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
\ No newline at end of file
attrs==21.2.0
backcall==0.2.0
beautifulsoup4==4.9.3
certifi==2021.5.30
chardet==4.0.0
charset-normalizer==2.0.3
click==7.1.2
clickclick==20.10.2
colorama==0.4.4
connexion==2.9.0
cycler==0.10.0
debugpy==1.4.0
decorator==5.0.9
Flask==1.1.4
Flask-Cors==3.0.10
idna==3.2
importlib-metadata==4.6.1
imbalanced-learn==0.8.0
imblearn==0.0
importlib-metadata==3.10.1
inflection==0.5.1
ipykernel==6.0.3
ipython==7.25.0
ipython-genutils==0.2.0
isodate==0.6.0
itsdangerous==1.1.0
jedi==0.18.0
Jinja2==2.11.3
joblib==1.0.1
jsonschema==3.2.0
jupyter-client==6.1.12
jupyter-core==4.7.1
kiwisolver==1.3.1
libpysal==4.5.1
MarkupSafe==2.0.1
matplotlib==3.4.2
matplotlib-inline==0.1.2
numpy==1.21.1
openapi-schema-validator==0.1.5
openapi-spec-validator==0.3.1
opencv-contrib-python==4.5.3.56
pandas==1.3.0
parso==0.8.2
pickleshare==0.7.5
Pillow==8.3.1
pointpats==2.2.0
prance==0.21.2
prompt-toolkit==3.0.19
Pygments==2.9.0
pymongo==3.12.0
pyparsing==2.4.7
pyrsistent==0.18.0
python-dateutil==2.8.2
pytz==2021.1
pywin32==301
PyYAML==5.4.1
pyzmq==22.1.0
requests==2.26.0
scikit-learn==0.24.2
scipy==1.7.0
semver==2.13.0
six==1.16.0
soupsieve==2.2.1
swagger-ui-bundle==0.0.8
threadpoolctl==2.2.0
tornado==6.1
traitlets==5.0.5
typing-extensions==3.10.0.0
urllib3==1.26.6
wcwidth==0.2.5
Werkzeug==1.0.1
zipp==3.5.0
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment