Commit a878064e authored by Alexander Lercher

Correctly predicting with scaled metrics data

parent d94b70d7
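The gist of the change: the StandardScaler fitted during training is now pickled next to the classifier and re-applied to the flattened cluster metrics before predicting, instead of feeding raw metric values to a model that was trained on scaled data. A minimal sketch of that round trip, assuming the artifact names used in the diff below (`{layer_name}.model`, `{layer_name}_scaler.model`); the helper functions here are illustrative, not the service code itself:

```python
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler

# training side: fit the scaler on the 27 feature columns only, then persist it
def fit_and_export_scaler(train_features: np.ndarray, model_dir: str, layer_name: str) -> StandardScaler:
    scaler = StandardScaler().fit(train_features)       # shape (n_samples, 27), label excluded
    with open(f'{model_dir}/{layer_name}_scaler.model', 'wb') as f:
        pickle.dump(scaler, f)
    return scaler

# prediction side: load classifier and scaler, scale first, then predict in one batched call
def predict_scaled(metrics_rows: np.ndarray, model_dir: str, layer_name: str) -> np.ndarray:
    with open(f'{model_dir}/{layer_name}.model', 'rb') as f:
        svc = pickle.load(f)
    with open(f'{model_dir}/{layer_name}_scaler.model', 'rb') as f:
        scaler = pickle.load(f)
    return svc.predict(scaler.transform(metrics_rows))   # metrics_rows: (n_clusters, 27)
```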
@@ -2,316 +2,61 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 52,
"metadata": {},
"outputs": [],
"source": [ "source": [
"use_case = 'community-prediction-youtube-n'\r\n", "use_case = 'community-prediction-youtube-n'\r\n",
"layer_name = 'LikesLayer'" "layer_name = 'LikesLayer'"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\r\n",
"from pandas import DataFrame\r\n",
"\r\n",
"df: DataFrame = pd.read_csv(f'data/{use_case}/ml_input/single_context/{layer_name}.csv', index_col=0)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>cluster_size</th>\n",
" <th>cluster_variance</th>\n",
" <th>cluster_density</th>\n",
" <th>cluster_import1</th>\n",
" <th>cluster_import2</th>\n",
" <th>cluster_area</th>\n",
" <th>cluster_center_distance</th>\n",
" <th>time_f1</th>\n",
" <th>time_f2</th>\n",
" <th>cluster_size.1</th>\n",
" <th>...</th>\n",
" <th>cluster_size.2</th>\n",
" <th>cluster_variance.2</th>\n",
" <th>cluster_density.2</th>\n",
" <th>cluster_import1.2</th>\n",
" <th>cluster_import2.2</th>\n",
" <th>cluster_area.2</th>\n",
" <th>cluster_center_distance.2</th>\n",
" <th>time_f1.2</th>\n",
" <th>time_f2.2</th>\n",
" <th>evolution_label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>565819</th>\n",
" <td>4.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>0.000336</td>\n",
" <td>0.000168</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.992709</td>\n",
" <td>0.120537</td>\n",
" <td>1.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.992709</td>\n",
" <td>-0.120537</td>\n",
" <td>-1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>565820</th>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.935016</td>\n",
" <td>-0.354605</td>\n",
" <td>1.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.822984</td>\n",
" <td>-0.568065</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>565821</th>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.970942</td>\n",
" <td>-0.239316</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.885456</td>\n",
" <td>-0.464723</td>\n",
" <td>-1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>565822</th>\n",
" <td>4.0</td>\n",
" <td>1.089725</td>\n",
" <td>0.75</td>\n",
" <td>0.000334</td>\n",
" <td>0.000166</td>\n",
" <td>3.0</td>\n",
" <td>6.0</td>\n",
" <td>0.885456</td>\n",
" <td>-0.464723</td>\n",
" <td>1.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.748511</td>\n",
" <td>-0.663123</td>\n",
" <td>-1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>565823</th>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.00</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.748511</td>\n",
" <td>-0.663123</td>\n",
" <td>1.0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.663123</td>\n",
" <td>-0.748511</td>\n",
" <td>-1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 28 columns</p>\n",
"</div>"
],
"text/plain": [
" cluster_size cluster_variance cluster_density cluster_import1 \\\n",
"565819 4.0 0.000000 0.00 0.000336 \n",
"565820 0.0 0.000000 0.00 0.000000 \n",
"565821 0.0 0.000000 0.00 0.000000 \n",
"565822 4.0 1.089725 0.75 0.000334 \n",
"565823 0.0 0.000000 0.00 0.000000 \n",
"\n",
" cluster_import2 cluster_area cluster_center_distance time_f1 \\\n",
"565819 0.000168 0.0 0.0 0.992709 \n",
"565820 0.000000 0.0 0.0 0.935016 \n",
"565821 0.000000 0.0 0.0 0.970942 \n",
"565822 0.000166 3.0 6.0 0.885456 \n",
"565823 0.000000 0.0 0.0 0.748511 \n",
"\n",
" time_f2 cluster_size.1 ... cluster_size.2 cluster_variance.2 \\\n",
"565819 0.120537 1.0 ... 0.0 0.0 \n",
"565820 -0.354605 1.0 ... 0.0 0.0 \n",
"565821 -0.239316 0.0 ... 0.0 0.0 \n",
"565822 -0.464723 1.0 ... 0.0 0.0 \n",
"565823 -0.663123 1.0 ... 0.0 0.0 \n",
"\n",
" cluster_density.2 cluster_import1.2 cluster_import2.2 \\\n",
"565819 0.0 0.0 0.0 \n",
"565820 0.0 0.0 0.0 \n",
"565821 0.0 0.0 0.0 \n",
"565822 0.0 0.0 0.0 \n",
"565823 0.0 0.0 0.0 \n",
"\n",
" cluster_area.2 cluster_center_distance.2 time_f1.2 time_f2.2 \\\n",
"565819 0.0 0.0 0.992709 -0.120537 \n",
"565820 0.0 0.0 0.822984 -0.568065 \n",
"565821 0.0 0.0 0.885456 -0.464723 \n",
"565822 0.0 0.0 0.748511 -0.663123 \n",
"565823 0.0 0.0 0.663123 -0.748511 \n",
"\n",
" evolution_label \n",
"565819 -1.0 \n",
"565820 4.0 \n",
"565821 -1.0 \n",
"565822 -1.0 \n",
"565823 -1.0 \n",
"\n",
"[5 rows x 28 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
], ],
"source": [ "outputs": [],
"df.tail()" "metadata": {}
]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [ "source": [
"import json\r\n", "import json\r\n",
"from entities import Cluster\r\n", "from entities import Cluster\r\n",
"import collections\r\n", "import collections\r\n",
"import numpy as np\r\n", "import numpy as np\r\n",
"from typing import Iterable, Tuple" "from typing import Iterable, Tuple"
] ],
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [ "source": [
"N=3" "N=3"
] ],
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 53,
"metadata": {},
"outputs": [],
"source": [ "source": [
"path_in = f\"data/{use_case}/cluster_metrics/{layer_name}.json\"\r\n", "path_in = f\"data/{use_case}/cluster_metrics/{layer_name}.json\"\r\n",
"with open(path_in, 'r') as file:\r\n", "with open(path_in, 'r') as file:\r\n",
" data = [Cluster.create_from_dict(cl_d) for cl_d in json.loads(file.read())]\r\n", " data = [Cluster.create_from_dict(cl_d) for cl_d in json.loads(file.read())]\r\n",
"\r\n", "\r\n",
"data.sort(key=lambda cl: (eval(cl.cluster_id), eval(cl.time_window_id)))" "data.sort(key=lambda cl: (eval(cl.cluster_id), eval(cl.time_window_id)))"
] ],
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'time_window_id': '(2018, 24)', 'cluster_id': '20207', 'size': 0, 'std_dev': 0, 'scarcity': 0, 'importance1': 0, 'importance2': 0, 'range_': 0.0, 'center': [0, 0], 'global_center_distance': 0}"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"data[-1]" "data[-1]"
] ],
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 54,
"metadata": {},
"outputs": [],
"source": [ "source": [
"cluster_map = {}\r\n", "cluster_map = {}\r\n",
"\r\n", "\r\n",
@@ -325,78 +70,67 @@
" cluster_map[id_] = []\r\n", " cluster_map[id_] = []\r\n",
"\r\n", "\r\n",
" cluster_map[id_].append(cluster)\r\n" " cluster_map[id_].append(cluster)\r\n"
] ],
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 55,
"metadata": {}, "source": [
"{c.cluster_id for c in data} == cluster_map.keys()"
],
"outputs": [ "outputs": [
{ {
"output_type": "execute_result",
"data": { "data": {
"text/plain": [ "text/plain": [
"True" "True"
] ]
}, },
"execution_count": 9,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "execution_count": 55
} }
], ],
"source": [ "metadata": {}
"{c.cluster_id for c in data} == cluster_map.keys()"
]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"20208"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"len(cluster_map.keys())" "len(cluster_map.keys())"
] ],
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 38,
"metadata": {},
"outputs": [],
"source": [ "source": [
"import numpy as np\r\n", "import numpy as np\r\n",
"\r\n", "\r\n",
"def get_cyclic_time_feature(time: int, max_time_value: int = 52) -> Tuple[float, float]:\r\n", "def get_cyclic_time_feature(time: int, max_time_value: int = 52) -> Tuple[float, float]:\r\n",
" return (np.sin(2*np.pi*time/max_time_value),\r\n", " return (np.sin(2*np.pi*time/max_time_value),\r\n",
" np.cos(2*np.pi*time/max_time_value))" " np.cos(2*np.pi*time/max_time_value))"
] ],
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 8,
"metadata": {},
"outputs": [],
"source": [ "source": [
"from typing import Tuple\r\n", "from typing import Tuple\r\n",
"\r\n", "\r\n",
"def get_metrics(cur_cluster: Cluster) -> Tuple:\r\n", "def get_metrics(cur_cluster: Cluster) -> Tuple:\r\n",
" return (cur_cluster.size, cur_cluster.std_dev, cur_cluster.scarcity, cur_cluster.importance1, cur_cluster.importance2, cur_cluster.range_, cur_cluster.global_center_distance, get_cyclic_time_feature(cur_cluster.get_time_info()))" " return (cur_cluster.size, cur_cluster.std_dev, cur_cluster.scarcity, cur_cluster.importance1, cur_cluster.importance2, cur_cluster.range_, cur_cluster.global_center_distance, get_cyclic_time_feature(cur_cluster.get_time_info()))"
] ],
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 12, "execution_count": 56,
"metadata": {},
"outputs": [],
"source": [ "source": [
"import pickle \r\n", "import pickle \r\n",
"\r\n", "\r\n",
@@ -404,13 +138,25 @@
"\r\n", "\r\n",
"with open(f'data/{use_case}/ml_output/{method}/{layer_name}.model', 'rb') as file:\r\n", "with open(f'data/{use_case}/ml_output/{method}/{layer_name}.model', 'rb') as file:\r\n",
" svc = pickle.load(file)" " svc = pickle.load(file)"
] ],
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 13, "execution_count": 63,
"metadata": {}, "source": [
"import pickle \r\n",
"\r\n",
"with open(f'data/{use_case}/ml_output/{method}/{layer_name}_scaler.model', 'rb') as file:\r\n",
" scaler = pickle.load(file)"
],
"outputs": [], "outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 10,
"source": [ "source": [
"def flatten_metrics_datapoint(datapoint: list) -> Tuple['X', np.array]:\r\n", "def flatten_metrics_datapoint(datapoint: list) -> Tuple['X', np.array]:\r\n",
" '''\r\n", " '''\r\n",
@@ -426,13 +172,13 @@
"\r\n", "\r\n",
" # flat_list.append(datapoint[-1]) # y\r\n", " # flat_list.append(datapoint[-1]) # y\r\n",
" return np.asarray(flat_list)" " return np.asarray(flat_list)"
] ],
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 18, "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [ "source": [
"def increase_time_window(time_window_id: str):\r\n", "def increase_time_window(time_window_id: str):\r\n",
" tuple_ = eval(time_window_id)\r\n", " tuple_ = eval(time_window_id)\r\n",
@@ -443,32 +189,180 @@
" else:\r\n", " else:\r\n",
" # next week\r\n", " # next week\r\n",
" return str((tuple_[0], tuple_[1]+1))\r\n" " return str((tuple_[0], tuple_[1]+1))\r\n"
] ],
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 20, "execution_count": 58,
"metadata": {},
"outputs": [],
"source": [ "source": [
"from entities import PredictionResult\r\n", "from db.dao import PredictionResult\r\n",
"\r\n", "\r\n",
"prediction_results = []\r\n", "# prediction_results = []\r\n",
"prediction_cluster_ids = []\r\n",
"prediction_time_windows = []\r\n",
"prediction_metrics = []\r\n",
"\r\n", "\r\n",
"for cluster_id, time_windows in cluster_map.items():\r\n", "for cluster_id, time_windows in cluster_map.items():\r\n",
" v = [get_metrics(c) for c in time_windows[-N:]] # metrics for last N time windows\r\n", " v = [get_metrics(c) for c in time_windows[-N:]] # metrics for last N time windows\r\n",
" v_flattened = flatten_metrics_datapoint(v)\r\n", " v_flattened = flatten_metrics_datapoint(v)\r\n",
" v_flattened = v_flattened.reshape(1, v_flattened.shape[0]) # reshape for ML with only 1 pred value\r\n", "\r\n",
" res = PredictionResult(use_case, use_case, method, layer_name, None, cluster_id, increase_time_window(time_windows[-1].time_window_id), svc.predict(v_flattened)[0])\r\n", " prediction_cluster_ids.append(cluster_id)\r\n",
" prediction_results.append(res)" " prediction_time_windows.append(increase_time_window(time_windows[-1].time_window_id))\r\n",
] " prediction_metrics.append(v_flattened)\r\n",
"\r\n",
"\r\n",
" # v_flattened = v_flattened.reshape(1, v_flattened.shape[0]) # reshape for ML with only 1 pred value\r\n",
" # res = PredictionResult(use_case, use_case, method, layer_name, None, cluster_id, increase_time_window(time_windows[-1].time_window_id), svc.predict(v_flattened)[0])\r\n",
" # prediction_results.append(res)"
],
"outputs": [],
"metadata": {}
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 22, "execution_count": 64,
"metadata": {}, "source": [
"scaler.transform(prediction_metrics[0].reshape(1,27))"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([[-0.2525847 , -0.00725354, -0.00748744, -0.26150883, -0.61179695,\n",
" -0.00699078, -0.0156031 , 0.10230883, -1.49959068, -0.25198809,\n",
" -0.00721248, -0.00740694, -0.2559145 , -0.6125857 , -0.0069614 ,\n",
" -0.01582086, -0.22871208, -1.567934 , -0.25144835, -0.00729236,\n",
" -0.00753175, -0.25448947, -0.6134931 , -0.00698498, -0.01589221,\n",
" -0.63013244, -1.62002196]])"
]
},
"metadata": {},
"execution_count": 64
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 65,
"source": [
"prediction_results = svc.predict(scaler.transform(np.array(prediction_metrics)))"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 67,
"source": [
"prediction_metrics[15]"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([ 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0.46472317, -0.88545603, 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0.35460489, -0.93501624, 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0.23931566, -0.97094182])"
]
},
"metadata": {},
"execution_count": 67
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 29,
"source": [
"dataa = np.array(prediction_metrics)\r\n",
"svc.predict(dataa[3].reshape(1, 27))"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([3.])"
]
},
"metadata": {},
"execution_count": 29
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 68,
"source": [
"predictions = []\r\n",
"for i in range(len(prediction_cluster_ids)):\r\n",
" predictions.append(\r\n",
" PredictionResult(use_case, use_case, method, layer_name, None, prediction_cluster_ids[i], prediction_time_windows[i], prediction_results[i])\r\n",
" )"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 74,
"source": [
"list(zip(np.unique(prediction_results, return_counts=True)))"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[(array([0., 1., 2., 3., 4.]),),\n",
" (array([ 2740, 596, 1429, 1324, 14119], dtype=int64),)]"
]
},
"metadata": {},
"execution_count": 74
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 70,
"source": [
"prediction_results"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([4., 4., 0., ..., 0., 0., 0.])"
]
},
"metadata": {},
"execution_count": 70
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 15,
"source": [
"[r.__dict__ for r in predictions[:10]]"
],
"outputs": [ "outputs": [
{ {
"output_type": "execute_result",
"data": { "data": {
"text/plain": [ "text/plain": [
"[{'use_case': 'community-prediction-youtube-n',\n", "[{'use_case': 'community-prediction-youtube-n',\n",
@@ -478,7 +372,7 @@
" 'reference_layer': None,\n", " 'reference_layer': None,\n",
" 'cluster_id': '0',\n", " 'cluster_id': '0',\n",
" 'time_window': '(2018, 25)',\n", " 'time_window': '(2018, 25)',\n",
" 'prediction': 2.0},\n", " 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n", " {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n", " 'table': 'community-prediction-youtube-n',\n",
" 'method': 'single_context',\n", " 'method': 'single_context',\n",
@@ -486,7 +380,7 @@
" 'reference_layer': None,\n", " 'reference_layer': None,\n",
" 'cluster_id': '1',\n", " 'cluster_id': '1',\n",
" 'time_window': '(2018, 25)',\n", " 'time_window': '(2018, 25)',\n",
" 'prediction': 2.0},\n", " 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n", " {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n", " 'table': 'community-prediction-youtube-n',\n",
" 'method': 'single_context',\n", " 'method': 'single_context',\n",
@@ -502,7 +396,7 @@
" 'reference_layer': None,\n", " 'reference_layer': None,\n",
" 'cluster_id': '3',\n", " 'cluster_id': '3',\n",
" 'time_window': '(2018, 25)',\n", " 'time_window': '(2018, 25)',\n",
" 'prediction': 2.0},\n", " 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n", " {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n", " 'table': 'community-prediction-youtube-n',\n",
" 'method': 'single_context',\n", " 'method': 'single_context',\n",
@@ -510,7 +404,7 @@
" 'reference_layer': None,\n", " 'reference_layer': None,\n",
" 'cluster_id': '4',\n", " 'cluster_id': '4',\n",
" 'time_window': '(2018, 25)',\n", " 'time_window': '(2018, 25)',\n",
" 'prediction': 2.0},\n", " 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n", " {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n", " 'table': 'community-prediction-youtube-n',\n",
" 'method': 'single_context',\n", " 'method': 'single_context',\n",
@@ -518,7 +412,7 @@
" 'reference_layer': None,\n", " 'reference_layer': None,\n",
" 'cluster_id': '5',\n", " 'cluster_id': '5',\n",
" 'time_window': '(2018, 25)',\n", " 'time_window': '(2018, 25)',\n",
" 'prediction': 2.0},\n", " 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n", " {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n", " 'table': 'community-prediction-youtube-n',\n",
" 'method': 'single_context',\n", " 'method': 'single_context',\n",
@@ -526,7 +420,7 @@
" 'reference_layer': None,\n", " 'reference_layer': None,\n",
" 'cluster_id': '6',\n", " 'cluster_id': '6',\n",
" 'time_window': '(2018, 25)',\n", " 'time_window': '(2018, 25)',\n",
" 'prediction': 2.0},\n", " 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n", " {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n", " 'table': 'community-prediction-youtube-n',\n",
" 'method': 'single_context',\n", " 'method': 'single_context',\n",
@@ -542,7 +436,7 @@
" 'reference_layer': None,\n", " 'reference_layer': None,\n",
" 'cluster_id': '8',\n", " 'cluster_id': '8',\n",
" 'time_window': '(2018, 25)',\n", " 'time_window': '(2018, 25)',\n",
" 'prediction': 2.0},\n", " 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n", " {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n", " 'table': 'community-prediction-youtube-n',\n",
" 'method': 'single_context',\n", " 'method': 'single_context',\n",
@@ -550,41 +444,30 @@
" 'reference_layer': None,\n", " 'reference_layer': None,\n",
" 'cluster_id': '9',\n", " 'cluster_id': '9',\n",
" 'time_window': '(2018, 25)',\n", " 'time_window': '(2018, 25)',\n",
" 'prediction': 2.0}]" " 'prediction': 3.0}]"
] ]
}, },
"execution_count": 22,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "execution_count": 15
} }
], ],
"source": [ "metadata": {}
"[r.__dict__ for r in prediction_results[:10]]"
]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 79, "execution_count": null,
"metadata": {}, "source": [],
"outputs": [ "outputs": [],
{ "metadata": {}
"name": "stdout",
"output_type": "stream",
"text": [
"0\n"
]
}
],
"source": []
} }
], ],
"metadata": { "metadata": {
"interpreter": { "interpreter": {
"hash": "6f758d9e9b2866087a1d464f700475727f47c3870deef6e7815ca445f120e6ad" "hash": "f4b37965f8116f61e214526431d03f7da6e57badb249bab76499e8551fed5453"
}, },
"kernelspec": { "kernelspec": {
"display_name": "Python 3.7.6 64-bit ('venv': venv)", "name": "python3",
"name": "python3" "display_name": "Python 3.7.8 64-bit ('venv': venv)"
}, },
"language_info": { "language_info": {
"codemirror_mode": { "codemirror_mode": {
@@ -596,7 +479,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.7.6" "version": "3.7.8"
}, },
"orig_nbformat": 4 "orig_nbformat": 4
}, },
@@ -17,7 +17,8 @@ from typing import Dict
 from typing import Tuple
 def get_metrics(cur_cluster: Cluster) -> Tuple:
-    return (cur_cluster.size, cur_cluster.std_dev, cur_cluster.scarcity, cur_cluster.importance1, cur_cluster.importance2, cur_cluster.range_, cur_cluster.global_center_distance, get_cyclic_time_feature(cur_cluster.get_time_info()))
+    return (cur_cluster.size, cur_cluster.std_dev, cur_cluster.scarcity, cur_cluster.importance1, cur_cluster.importance2,
+            cur_cluster.range_, cur_cluster.global_center_distance, get_cyclic_time_feature(cur_cluster.get_time_info()))
 ####################
 import pickle
 #####################
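`get_metrics` ends with the cyclic time feature, i.e. the `(sin, cos)` encoding of the week number defined in the notebook above. A small self-contained check of why that encoding is used; the function is copied from the notebook and the printed distances are approximate (the values match the `time_f1`/`time_f2` columns shown earlier):

```python
import numpy as np

def get_cyclic_time_feature(time: int, max_time_value: int = 52):
    # same encoding as in the notebook: map the week number onto the unit circle
    return (np.sin(2 * np.pi * time / max_time_value),
            np.cos(2 * np.pi * time / max_time_value))

w1  = np.array(get_cyclic_time_feature(1))    # (~0.12, ~0.99)
w26 = np.array(get_cyclic_time_feature(26))   # (~0.00, -1.00)
w52 = np.array(get_cyclic_time_feature(52))   # (~0.00,  1.00)

# weeks 52 and 1 are adjacent in calendar time and stay adjacent in feature space,
# whereas a plain integer week feature would put them 51 apart
print(np.linalg.norm(w52 - w1))   # ~0.12
print(np.linalg.norm(w26 - w1))   # ~2.0
```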
@@ -53,9 +54,8 @@ repo = Repository()
 def run_prediction(use_case: str):
     for layer in repo.get_layers_for_use_case(use_case):
         layer_name = layer.layer_name
-        ################
-        df: DataFrame = pd.read_csv(f'data/{use_case}/ml_input/single_context/{layer_name}.csv', index_col=0)
+        print(f"Predicting {method} for {use_case}//{layer_name}")
         #################
         path_in = f"data/{use_case}/cluster_metrics/{layer_name}.json"
         with open(path_in, 'r') as file:
@@ -75,12 +75,27 @@ def run_prediction(use_case: str):
 ####################
 with open(f'data/{use_case}/ml_output/{method}/{layer_name}.model', 'rb') as file:
     svc = pickle.load(file)
+####################
+with open(f'data/{use_case}/ml_output/{method}/{layer_name}_scaler.model', 'rb') as file:
+    scaler = pickle.load(file)
 #####################
+# store id, future time window, and flattened metrics to combine the latter during prediction
+prediction_cluster_ids = []
+prediction_time_windows = []
+prediction_metrics = []
 for cluster_id, time_windows in cluster_map.items():
     v = [get_metrics(c) for c in time_windows[-N:]] # metrics for last N time windows
     v_flattened = flatten_metrics_datapoint(v)
-    v_flattened = v_flattened.reshape(1, v_flattened.shape[0]) # reshape for ML with only 1 pred value
-    res = PredictionResult(use_case, use_case, method, layer_name, None, cluster_id, increase_time_window(time_windows[-1].time_window_id), svc.predict(v_flattened)[0])
-    repo.add_prediction_result(res)
-#####################
+    prediction_cluster_ids.append(cluster_id)
+    prediction_time_windows.append(increase_time_window(time_windows[-1].time_window_id))
+    prediction_metrics.append(v_flattened)
+# predict all at once for speedup
+prediction_results = svc.predict(scaler.transform(np.array(prediction_metrics)))
+print(np.unique(prediction_results, return_counts=True))
+for i in range(len(prediction_cluster_ids)):
+    res = PredictionResult(use_case, use_case, method, layer_name, None, prediction_cluster_ids[i], prediction_time_windows[i], prediction_results[i])
+    repo.add_prediction_result(res)
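The rewritten loop only collects one flattened 27-value row per cluster; scaling and classification then happen in a single batched call, which is much cheaper than the old per-cluster `reshape(1, ...)` plus `predict`. A self-contained sketch of the equivalence, with random dummy data and a throwaway SVC standing in for the real pickled artifacts (27 features and labels 0..4 as in the notebook):

```python
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

rng = np.random.default_rng(0)
X_train = rng.normal(size=(200, 27))         # 3 time windows x 9 metrics, flattened
y_train = rng.integers(0, 5, size=200)       # evolution labels 0..4

scaler = StandardScaler().fit(X_train)
svc = SVC().fit(scaler.transform(X_train), y_train)

prediction_metrics = [rng.normal(size=27) for _ in range(1000)]   # one row per cluster

# batched: scale and predict every cluster at once (new behaviour)
batched = svc.predict(scaler.transform(np.array(prediction_metrics)))

# looped: per-row reshape + predict (old behaviour, which also skipped the scaler entirely)
looped = np.array([svc.predict(scaler.transform(row.reshape(1, 27)))[0]
                   for row in prediction_metrics])

assert (batched == looped).all()
```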
@@ -8,10 +8,10 @@ approach = 'single_context'
 import pickle
 from pathlib import Path
-def export_model(model, use_case, layer_name):
+def export_model(model, use_case, layer_name, scaler=False):
     fpath = f'data/{use_case}/ml_output/{approach}'
     Path(fpath).mkdir(parents=True, exist_ok=True)
-    with open(f'{fpath}/{layer_name}.model', 'wb') as f:
+    with open(f'{fpath}/{layer_name}{"_scaler" if scaler else ""}.model', 'wb') as f:
         pickle.dump(model, f)
 #####################
 from sklearn.ensemble import RandomForestClassifier
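With the new `scaler` flag, the same exporter writes both artifacts side by side, so prediction can always find the scaler that belongs to a given classifier. Hypothetical usage of the function above, assuming a fitted `svc` and `scaler` are in scope:

```python
# -> data/<use_case>/ml_output/single_context/<layer_name>.model
export_model(svc, use_case, layer_name)

# -> data/<use_case>/ml_output/single_context/<layer_name>_scaler.model
export_model(scaler, use_case, layer_name, scaler=True)
```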
@@ -45,11 +45,13 @@ def run_training(use_case):
 from sklearn.preprocessing import StandardScaler
 scaler = StandardScaler()
-train_X = scaler.fit_transform(training)[:,:-1] # all except y
+train_X = scaler.fit_transform(training[training.columns[:-1]]) # all except y
 train_Y = training[training.columns[-1]]
-test_X = scaler.transform(testing)[:,:-1] # all except y
+test_X = scaler.transform(testing[testing.columns[:-1]]) # all except y
 test_Y = testing[testing.columns[-1]]
+export_model(scaler, use_case, layer_name, scaler=True)
 ########################
 from processing import DataSampler
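The training change matters for reuse of the scaler: the old call fitted it on the full frame (27 features plus `evolution_label`) and only dropped the label column after transforming, so a pickled copy of that scaler would expect 28 inputs and fail on the 27-feature rows built at prediction time. Fitting on `training.columns[:-1]` keeps the widths in line. A small sketch of the mismatch, using random stand-in data with 28 columns as in this layer's CSV:

```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

frame = pd.DataFrame(np.random.rand(100, 28))                 # 27 features + label, as in the CSV above

old_scaler = StandardScaler().fit(frame)                       # fitted on all 28 columns
new_scaler = StandardScaler().fit(frame[frame.columns[:-1]])   # fitted on the 27 feature columns

row = np.random.rand(1, 27)                                    # a flattened prediction-time row
new_scaler.transform(row)                                      # fine: 27 columns in, 27 expected
try:
    old_scaler.transform(row)                                  # ValueError: expects 28 features
except ValueError as err:
    print(err)
```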