Commit 495390ac authored by Alexander Lercher

Cross-context prediction

parent a878064e
......@@ -131,4 +131,7 @@ class Repository(MongoRepositoryBase):
def get_prediction_results(self, use_case: str) -> List[PredictionResult]:
entries = super().get_entries(self._prediction_result_collection, selection={'use_case': use_case}, projection={'_id': 0})
return [PredictionResult.create_from_dict(e) for e in entries]
def delete_all_prediction_results(self):
super().drop_collection(self._prediction_result_collection)
#endregion
......@@ -33,8 +33,7 @@ class Cluster:
def get_time_info(self) -> int:
'''Returns the week of the time tuple str, e.g. 25 for "(2014, 25)".'''
str_tuple = self.time_window_id
return int(str_tuple.split(',')[1].strip()[:-1])
return eval(self.time_window_id)[1]
def __repr__(self):
return str(self.__dict__)
......
......@@ -53,6 +53,10 @@ class Layer:
self.distances_from_global_centers = self.get_distances_from_global_center(active_clusters)
self.cluster_center_distance_agg_metrics = self.get_center_distance_min_max_avg_sum(active_clusters)
def get_time_info(self) -> int:
'''Returns the week of the time tuple str, e.g. 25 for "(2014, 25)".'''
return eval(self.time_window_id)[1]
def get_size_min_max_avg_sum(self, clusters: List[InternalCluster]) -> dict:
'''Returns min, max, avg, and sum of the cluster's absolute sizes.'''
if len(clusters) == 0:
......
......@@ -2,23 +2,24 @@
"cells": [
{
"cell_type": "code",
"execution_count": 52,
"execution_count": 1,
"source": [
"use_case = 'community-prediction-youtube-n'\r\n",
"layer_name = 'LikesLayer'"
"layer_name = 'LikesLayer'\r\n",
"reference_layer_name = 'ViewsLayer'"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 5,
"source": [
"import json\r\n",
"from entities import Cluster\r\n",
"import collections\r\n",
"import numpy as np\r\n",
"from typing import Iterable, Tuple"
"from typing import Iterable, Tuple, List, Dict, Any"
],
"outputs": [],
"metadata": {}
......@@ -27,150 +28,164 @@
"cell_type": "code",
"execution_count": 3,
"source": [
"N=3"
"N=2"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 53,
"execution_count": 6,
"source": [
"path_in = f\"data/{use_case}/cluster_metrics/{layer_name}.json\"\r\n",
"with open(path_in, 'r') as file:\r\n",
" data = [Cluster.create_from_dict(cl_d) for cl_d in json.loads(file.read())]\r\n",
"from entities import Layer, Cluster\r\n",
"\r\n",
"data.sort(key=lambda cl: (eval(cl.cluster_id), eval(cl.time_window_id)))"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"data[-1]"
"with open(f'data/{use_case}/cluster_metrics/{layer_name}.json') as file:\r\n",
" cluster_metrics: List[Cluster] = [Cluster.create_from_dict(e) for e in json.loads(file.read())]\r\n",
" cluster_ids = {c.cluster_id for c in cluster_metrics}\r\n",
" cluster_metrics: Dict[Any, Cluster] = {(c.time_window_id, c.cluster_id): c for c in cluster_metrics}\r\n",
" \r\n",
"with open(f'data/{use_case}/layer_metrics/{reference_layer_name}.json') as file:\r\n",
" layer_metrics: List[Layer] = [Layer.create_from_dict(e) for e in json.loads(file.read())]\r\n",
" layer_metrics: Dict[Any, Layer] = {l.time_window_id: l for l in layer_metrics}\r\n"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 54,
"execution_count": 11,
"source": [
"cluster_map = {}\r\n",
"\r\n",
"# for cluster in {c.cluster_id for c in data}:\r\n",
"# data_map[cluster] = [c for c in data if c.cluster_id == cluster]\r\n",
"\r\n",
"for cluster in data:\r\n",
" id_ = cluster.cluster_id\r\n",
"\r\n",
" if id_ not in cluster_map:\r\n",
" cluster_map[id_] = []\r\n",
"\r\n",
" cluster_map[id_].append(cluster)\r\n"
"# load the time keys chronologically\r\n",
"ordered_time_keys = list(layer_metrics.keys())\r\n",
"ordered_time_keys.sort(key=lambda x: eval(x))"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 55,
"execution_count": 13,
"source": [
"{c.cluster_id for c in data} == cluster_map.keys()"
"ordered_time_keys = ordered_time_keys[-N:]\r\n",
"ordered_time_keys"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"True"
"['(2018, 23)', '(2018, 24)']"
]
},
"metadata": {},
"execution_count": 55
"execution_count": 13
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 19,
"source": [
"len(cluster_map.keys())"
"import numpy as np\r\n",
"\r\n",
"def get_cyclic_time_feature(time: int, max_time_value: int = 52) -> Tuple[float, float]:\r\n",
" return (np.sin(2*np.pi*time/max_time_value),\r\n",
" np.cos(2*np.pi*time/max_time_value))\r\n",
"\r\n",
"def get_cyclic_time_feature_from_time_window(time: str) -> Tuple[float, float]:\r\n",
" return get_cyclic_time_feature(int(time.replace('(', '').replace(')', '').split(',')[1]))\r\n",
"\r\n",
"def get_layer_metrics(layer: Layer) -> Iterable:\r\n",
" res = [layer.n_nodes, layer.n_clusters, layer.entropy]\r\n",
" res += [layer.cluster_size_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]\r\n",
" res += [layer.cluster_relative_size_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]\r\n",
" res += [layer.cluster_center_distance_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]\r\n",
" res.append(get_cyclic_time_feature_from_time_window(layer.time_window_id))\r\n",
" return res"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": 25,
"source": [
"import numpy as np\r\n",
"\r\n",
"def get_cyclic_time_feature(time: int, max_time_value: int = 52) -> Tuple[float, float]:\r\n",
" return (np.sin(2*np.pi*time/max_time_value),\r\n",
" np.cos(2*np.pi*time/max_time_value))"
"prediction_metrics_raw = []"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 26,
"source": [
"from typing import Tuple\r\n",
"current_layer_metric = layer_metrics[ordered_time_keys[1]]\r\n",
"prev_layer_metric = layer_metrics[ordered_time_keys[0]]\r\n",
"\r\n",
"def get_metrics(cur_cluster: Cluster) -> Tuple:\r\n",
" return (cur_cluster.size, cur_cluster.std_dev, cur_cluster.scarcity, cur_cluster.importance1, cur_cluster.importance2, cur_cluster.range_, cur_cluster.global_center_distance, get_cyclic_time_feature(cur_cluster.get_time_info()))"
"current_layer_metric_tuple = get_layer_metrics(current_layer_metric)\r\n",
"prev_layer_metric_tuple = get_layer_metrics(prev_layer_metric)\r\n",
"\r\n",
"for cluster_id in cluster_ids:\r\n",
" # yield each combination of reference layer metrics to clusters\r\n",
" prediction_metrics_raw.append([prev_layer_metric_tuple, current_layer_metric_tuple, int(cluster_id)])"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 56,
"execution_count": 38,
"source": [
"method = 'cross_context'\r\n",
"\r\n",
"import pickle \r\n",
"\r\n",
"method = 'single_context'\r\n",
"with open(f'data/{use_case}/ml_output/{method}/{layer_name}_{reference_layer_name}.model', 'rb') as file:\r\n",
" svc = pickle.load(file)\r\n",
"\r\n",
"with open(f'data/{use_case}/ml_output/{method}/{layer_name}.model', 'rb') as file:\r\n",
" svc = pickle.load(file)"
"with open(f'data/{use_case}/ml_output/{method}/{layer_name}_{reference_layer_name}_scaler.model', 'rb') as file:\r\n",
" scaler = pickle.load(file)"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 63,
"execution_count": 38,
"source": [
"import pickle \r\n",
"import numpy as np\r\n",
"\r\n",
"with open(f'data/{use_case}/ml_output/{method}/{layer_name}_scaler.model', 'rb') as file:\r\n",
" scaler = pickle.load(file)"
"def get_cyclic_time_feature(time: int, max_time_value: int = 52) -> Tuple[float, float]:\r\n",
" return (np.sin(2*np.pi*time/max_time_value),\r\n",
" np.cos(2*np.pi*time/max_time_value))"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 30,
"source": [
"def flatten_metrics_datapoint(datapoint: list) -> Tuple['X', np.array]:\r\n",
"import numpy as np\r\n",
"\r\n",
"def flatten_layer_metrics_datapoint(datapoint: list) -> Tuple['X', np.array]:\r\n",
" '''\r\n",
" Flattens a single metrics data point in the form:\r\n",
" [(cluster_size, cluster_variance, cluster_density, cluster_import1, cluster_import2, cluster_range, cluster_center, (time_f1, time_f2))^N, evolution_label]\r\n",
" Flattens a single layer metrics data point in the form:\r\n",
" [(n_nodes, n_clusters, entropy,\r\n",
" (relative_cluster_size)^M, (distance_from_global_centers)^M, \r\n",
" (time1, time2))^N, \r\n",
" cluster_number, evolution_label]\r\n",
" to:\r\n",
" (X, y: np.array)\r\n",
" '''\r\n",
" flat_list = []\r\n",
" for entry in datapoint: # for all x\r\n",
" flat_list.extend(entry[:-1]) # add all number features except the time tuple\r\n",
" flat_list.extend(entry[-1]) # add time tuple\r\n",
" for layer_metric_tuple in datapoint[:-1]: # for all x\r\n",
" flat_list.extend(layer_metric_tuple[0:-1]) # everything before time\r\n",
" flat_list.extend(layer_metric_tuple[-1]) # time1/2\r\n",
"\r\n",
" flat_list.append(datapoint[-1]) # cluster num\r\n",
"\r\n",
" # flat_list.append(datapoint[-1]) # y\r\n",
" return np.asarray(flat_list)"
],
"outputs": [],
......@@ -178,7 +193,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 31,
"source": [
"def increase_time_window(time_window_id: str):\r\n",
" tuple_ = eval(time_window_id)\r\n",
......@@ -195,168 +210,225 @@
},
{
"cell_type": "code",
"execution_count": 58,
"execution_count": 33,
"source": [],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"895\n",
"[ 1.01800000e+04 6.94600000e+03 1.25669044e+01 1.00000000e+00\n",
" 1.20000000e+01 1.46559171e+00 1.01800000e+04 9.82318271e-05\n",
" 1.17878193e-03 1.43967751e-04 1.00000000e+00 0.00000000e+00\n",
" 2.37254283e+06 1.14923227e+03 7.98256735e+06 3.54604887e-01\n",
" -9.35016243e-01 4.35300000e+03 3.25600000e+03 1.15021768e+01\n",
" 1.00000000e+00 1.00000000e+01 1.33691646e+00 4.35300000e+03\n",
" 2.29726625e-04 2.29726625e-03 3.07125307e-04 1.00000000e+00\n",
" 0.00000000e+00 2.36405615e+05 3.69147185e+02 1.20194323e+06\n",
" 2.39315664e-01 -9.70941817e-01 8.95000000e+02]\n"
]
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 34,
"source": [
"from db.dao import PredictionResult\r\n",
"\r\n",
"# prediction_results = []\r\n",
"prediction_cluster_ids = []\r\n",
"prediction_time_windows = []\r\n",
"prediction_time_window = increase_time_window(ordered_time_keys[1])\r\n",
"prediction_metrics = []\r\n",
"\r\n",
"for cluster_id, time_windows in cluster_map.items():\r\n",
" v = [get_metrics(c) for c in time_windows[-N:]] # metrics for last N time windows\r\n",
" v_flattened = flatten_metrics_datapoint(v)\r\n",
"\r\n",
" \r\n",
"for pred in prediction_metrics_raw:\r\n",
" cluster_id = pred[-1]\r\n",
" prediction_cluster_ids.append(cluster_id)\r\n",
" prediction_time_windows.append(increase_time_window(time_windows[-1].time_window_id))\r\n",
" prediction_metrics.append(v_flattened)\r\n",
"\r\n",
"\r\n",
" # v_flattened = v_flattened.reshape(1, v_flattened.shape[0]) # reshape for ML with only 1 pred value\r\n",
" # res = PredictionResult(use_case, use_case, method, layer_name, None, cluster_id, increase_time_window(time_windows[-1].time_window_id), svc.predict(v_flattened)[0])\r\n",
" # prediction_results.append(res)"
" flat_ = flatten_layer_metrics_datapoint(pred)\r\n",
" prediction_metrics.append(flat_)\r\n",
" "
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 64,
"execution_count": 41,
"source": [
"scaler.transform(prediction_metrics[0].reshape(1,27))"
"prediction_results = svc.predict(scaler.transform(np.array(prediction_metrics)))"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 42,
"source": [
"prediction_metrics[15]"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([[-0.2525847 , -0.00725354, -0.00748744, -0.26150883, -0.61179695,\n",
" -0.00699078, -0.0156031 , 0.10230883, -1.49959068, -0.25198809,\n",
" -0.00721248, -0.00740694, -0.2559145 , -0.6125857 , -0.0069614 ,\n",
" -0.01582086, -0.22871208, -1.567934 , -0.25144835, -0.00729236,\n",
" -0.00753175, -0.25448947, -0.6134931 , -0.00698498, -0.01589221,\n",
" -0.63013244, -1.62002196]])"
"array([ 1.01800000e+04, 6.94600000e+03, 1.25669044e+01, 1.00000000e+00,\n",
" 1.20000000e+01, 1.46559171e+00, 1.01800000e+04, 9.82318271e-05,\n",
" 1.17878193e-03, 1.43967751e-04, 1.00000000e+00, 0.00000000e+00,\n",
" 2.37254283e+06, 1.14923227e+03, 7.98256735e+06, 3.54604887e-01,\n",
" -9.35016243e-01, 4.35300000e+03, 3.25600000e+03, 1.15021768e+01,\n",
" 1.00000000e+00, 1.00000000e+01, 1.33691646e+00, 4.35300000e+03,\n",
" 2.29726625e-04, 2.29726625e-03, 3.07125307e-04, 1.00000000e+00,\n",
" 0.00000000e+00, 2.36405615e+05, 3.69147185e+02, 1.20194323e+06,\n",
" 2.39315664e-01, -9.70941817e-01, 4.36000000e+03])"
]
},
"metadata": {},
"execution_count": 64
"execution_count": 42
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 65,
"execution_count": 29,
"source": [
"prediction_results = svc.predict(scaler.transform(np.array(prediction_metrics)))"
"dataa = np.array(prediction_metrics)\r\n",
"svc.predict(dataa[3].reshape(1, 27))"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([3.])"
]
},
"metadata": {},
"execution_count": 29
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 43,
"source": [
"predictions = []\r\n",
"for i in range(len(prediction_cluster_ids)):\r\n",
" predictions.append(\r\n",
" PredictionResult(use_case, use_case, method, layer_name, None, prediction_cluster_ids[i], prediction_time_window, prediction_results[i])\r\n",
" )"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 67,
"execution_count": 45,
"source": [
"prediction_metrics[15]"
"list(zip(np.unique(prediction_results, return_counts=True)))"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([ 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0.46472317, -0.88545603, 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0.35460489, -0.93501624, 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0.23931566, -0.97094182])"
"[(array([0., 1., 2., 3.]),),\n",
" (array([ 5335, 1511, 355, 13007], dtype=int64),)]"
]
},
"metadata": {},
"execution_count": 67
"execution_count": 45
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 46,
"source": [
"dataa = np.array(prediction_metrics)\r\n",
"svc.predict(dataa[3].reshape(1, 27))"
"prediction_results"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([3.])"
"array([3., 0., 0., ..., 0., 3., 3.])"
]
},
"metadata": {},
"execution_count": 29
"execution_count": 46
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 68,
"execution_count": 51,
"source": [
"predictions = []\r\n",
"for i in range(len(prediction_cluster_ids)):\r\n",
" predictions.append(\r\n",
" PredictionResult(use_case, use_case, method, layer_name, None, prediction_cluster_ids[i], prediction_time_windows[i], prediction_results[i])\r\n",
" )"
"time = '(2019, 45)'\r\n",
"int(time.replace('(', '').replace(')', '').split(',')[1])"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"45"
]
},
"metadata": {},
"execution_count": 51
}
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 74,
"execution_count": 52,
"source": [
"list(zip(np.unique(prediction_results, return_counts=True)))"
"eval(time)[1]"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[(array([0., 1., 2., 3., 4.]),),\n",
" (array([ 2740, 596, 1429, 1324, 14119], dtype=int64),)]"
"45"
]
},
"metadata": {},
"execution_count": 74
"execution_count": 52
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 70,
"execution_count": 53,
"source": [
"prediction_results"
"int(time.split(',')[1].strip()[:-1])"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([4., 4., 0., ..., 0., 0., 0.])"
"45"
]
},
"metadata": {},
"execution_count": 70
"execution_count": 53
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 47,
"source": [
"[r.__dict__ for r in predictions[:10]]"
],
......@@ -367,88 +439,88 @@
"text/plain": [
"[{'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'single_context',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': '0',\n",
" 'cluster_id': 895,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'single_context',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': '1',\n",
" 'cluster_id': 8947,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 3.0},\n",
" 'prediction': 0.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'single_context',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': '2',\n",
" 'cluster_id': 10464,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 3.0},\n",
" 'prediction': 0.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'single_context',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': '3',\n",
" 'cluster_id': 14671,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'single_context',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': '4',\n",
" 'cluster_id': 18000,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'single_context',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': '5',\n",
" 'cluster_id': 17895,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 3.0},\n",
" 'prediction': 2.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'single_context',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': '6',\n",
" 'cluster_id': 1234,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'single_context',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': '7',\n",
" 'cluster_id': 16236,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'single_context',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': '8',\n",
" 'cluster_id': 1995,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 3.0},\n",
" {'use_case': 'community-prediction-youtube-n',\n",
" 'table': 'community-prediction-youtube-n',\n",
" 'method': 'single_context',\n",
" 'method': 'cross_context',\n",
" 'layer': 'LikesLayer',\n",
" 'reference_layer': None,\n",
" 'cluster_id': '9',\n",
" 'cluster_id': 5161,\n",
" 'time_window': '(2018, 25)',\n",
" 'prediction': 3.0}]"
" 'prediction': 0.0}]"
]
},
"metadata": {},
"execution_count": 15
"execution_count": 47
}
],
"metadata": {}
......
from processing.data_prep.metrics_base import calculate_center, get_cyclic_time_feature, get_evolution_label, convert_metrics_data_to_dataframe
from processing.data_prep.metrics_base import calculate_center, get_cyclic_time_feature, get_evolution_label, convert_metrics_data_to_dataframe, get_cluster_metrics
from pathlib import Path
#############################
......@@ -86,7 +86,7 @@ def create_metrics_training_data(use_case: str, layer_name: str, N: int = 3) ->
tuples = []
continue
cur_metrics = (cur_cluster.size, cur_cluster.std_dev, cur_cluster.scarcity, cur_cluster.importance1, cur_cluster.importance2, cur_cluster.range_, cur_cluster.global_center_distance, get_cyclic_time_feature(cur_cluster.get_time_info()))
cur_metrics = get_cluster_metrics(cur_cluster)
# deque function: adding N+1st element will remove oldest one
if len(tuples) == N:
......
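The "deque" comment in the hunk above refers to a bounded sliding window over the last N cluster-metrics tuples; a minimal sketch of that idea (the maxlen value is an assumption, not taken from the training code):

from collections import deque

N = 3
tuples = deque(maxlen=N)                 # appending an (N+1)-st tuple silently drops the oldest
for window_metrics in ['w1', 'w2', 'w3', 'w4']:
    tuples.append(window_metrics)
list(tuples)                             # ['w2', 'w3', 'w4'] -- only the last N windows remain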
from processing.data_prep.metrics_base import calculate_center, get_cyclic_time_feature, get_evolution_label,convert_metrics_data_to_dataframe
from processing.data_prep.metrics_base import calculate_center, get_cyclic_time_feature, get_evolution_label,convert_metrics_data_to_dataframe, get_layer_metrics
from pathlib import Path
#################
......@@ -59,21 +59,10 @@ def get_columns(N) -> List[str]:
cols = cols * N
return cols + ['cluster_id'] + ['evolution_label']
######################
def get_cyclic_time_feature_from_time_window(time: str) -> Tuple[float, float]:
return get_cyclic_time_feature(int(time.replace('(', '').replace(')', '').split(',')[1]))
#######################
from typing import Iterable, List, Dict, Any
import json
from entities import Layer, Cluster
def get_layer_metrics(layer: Layer) -> Iterable:
res = [layer.n_nodes, layer.n_clusters, layer.entropy]
res += [layer.cluster_size_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]
res += [layer.cluster_relative_size_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]
res += [layer.cluster_center_distance_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]
res.append(get_cyclic_time_feature_from_time_window(layer.time_window_id))
return res
def create_layer_metrics_training_data(use_case: str, layer_name: str, reference_layer: str, N: int = 2) -> Iterable:
"""
Loads the metrics training data for an individual layer from disk.
......
......@@ -45,4 +45,22 @@ def convert_metrics_data_to_dataframe(data: Iterable, columns: list, flattening_
training_data.append(xy)
return pd.DataFrame(data=training_data, columns=columns)
\ No newline at end of file
return pd.DataFrame(data=training_data, columns=columns)
####################
from entities import Cluster, Layer
from typing import Dict, Tuple
def get_cluster_metrics(cur_cluster: Cluster) -> Tuple:
return (cur_cluster.size, cur_cluster.std_dev, cur_cluster.scarcity, cur_cluster.importance1, cur_cluster.importance2,
cur_cluster.range_, cur_cluster.global_center_distance, get_cyclic_time_feature(cur_cluster.get_time_info()))
####################
def get_layer_metrics(layer: Layer) -> Iterable:
res = [layer.n_nodes, layer.n_clusters, layer.entropy]
res += [layer.cluster_size_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]
res += [layer.cluster_relative_size_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]
res += [layer.cluster_center_distance_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]
res.append(get_cyclic_time_feature(layer.get_time_info()))
return res
###################
\ No newline at end of file
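get_cyclic_time_feature, imported from metrics_base and used by both metric helpers above, maps the week number onto the unit circle so the year boundary stays continuous; a small check of the wrap-around (definition copied from the notebook cells, values rounded):

import numpy as np

def get_cyclic_time_feature(time: int, max_time_value: int = 52):
    return (np.sin(2*np.pi*time/max_time_value), np.cos(2*np.pi*time/max_time_value))

get_cyclic_time_feature(52)   # ~ (0.000, 1.000)
get_cyclic_time_feature(1)    # ~ (0.121, 0.993), adjacent to week 52 on the circle although the raw week numbers 52 and 1 are far apart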
def increase_time_window(time_window_id: str) -> str:
tuple_ = eval(time_window_id)
if tuple_[1] == 52:
# 1st week next year
return str((tuple_[0]+1, 1))
else:
# next week
return str((tuple_[0], tuple_[1]+1))
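A quick usage sketch of increase_time_window (the year-rollover case assumes the str() fix above):

increase_time_window('(2018, 24)')   # -> '(2018, 25)'
increase_time_window('(2018, 52)')   # -> '(2019, 1)', first week of the next year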
######################
from typing import Tuple
import pickle
def load_ml_models(use_case, method, layer_name, reference_layer_name=None) -> Tuple['scaler', 'clf']:
path_ = f'data/{use_case}/ml_output/{method}/{layer_name}'
if method == 'single_context':
with open(f'{path_}.model', 'rb') as file:
svc = pickle.load(file)
with open(f'{path_}_scaler.model', 'rb') as file:
scaler = pickle.load(file)
elif method == 'cross_context':
with open(f'{path_}_{reference_layer_name}.model', 'rb') as file:
svc = pickle.load(file)
with open(f'{path_}_{reference_layer_name}_scaler.model', 'rb') as file:
scaler = pickle.load(file)
else:
raise NotImplementedError('Prediction method is not implemented')
return scaler, svc
\ No newline at end of file
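A hedged usage sketch of load_ml_models; the layer names mirror the notebook above and the model files are assumed to have been written by the corresponding training scripts:

scaler, svc = load_ml_models(use_case='community-prediction-youtube-n',
                             method='cross_context',
                             layer_name='LikesLayer',
                             reference_layer_name='ViewsLayer')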
from processing.data_prep.metrics_base import get_cyclic_time_feature, get_layer_metrics
from processing.ml.predict_base import increase_time_window, load_ml_models
method = 'cross_context'
N = 2 # Currently N is fixed to 2
####################
import pandas as pd
from pandas import DataFrame
#####################
import json
from entities import Layer, Cluster
import collections
import numpy as np
from typing import Iterable, Tuple, List, Dict, Any
####################
import pickle
#####################
import numpy as np
def flatten_layer_metrics_datapoint(datapoint: list) -> np.array:
'''
Flattens a single layer metrics data point in the form:
[(n_nodes, n_clusters, entropy,
(relative_cluster_size)^M, (distance_from_global_centers)^M,
(time1, time2))^N,
cluster_number]
to:
(X)
'''
flat_list = []
for layer_metric_tuple in datapoint[:-1]: # for all x
flat_list.extend(layer_metric_tuple[0:-1]) # everything before time
flat_list.extend(layer_metric_tuple[-1]) # time1/2
flat_list.append(datapoint[-1]) # cluster num
return np.asarray(flat_list)
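Each get_layer_metrics tuple holds 3 scalar metrics, 3 x 4 aggregates and one (sin, cos) time pair, i.e. 17 numbers once flattened; with N=2 reference-layer snapshots plus the cluster id this gives the 35-element vectors seen in the notebook output. A sanity check with placeholder values:

dummy_layer = [0.0] * 15 + [(0.0, 1.0)]           # 15 scalars + cyclic time tuple
datapoint = [dummy_layer, dummy_layer, 42]        # N=2 reference snapshots + cluster id
assert len(flatten_layer_metrics_datapoint(datapoint)) == 2 * 17 + 1   # 35 features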
#########################
from db.repository import Repository
from db.dao import PredictionResult
repo = Repository()
def run_prediction(use_case: str):
for layerpair in repo.get_layer_pairs(use_case):
layer_name = layerpair.layer
reference_layer_name = layerpair.reference_layer
print(f"Predicting {method} for {use_case}//{layer_name} based on {reference_layer_name}")
##########################
with open(f'data/{use_case}/cluster_metrics/{layer_name}.json') as file:
cluster_metrics: List[Cluster] = [Cluster.create_from_dict(e) for e in json.loads(file.read())]
cluster_ids = {c.cluster_id for c in cluster_metrics}
cluster_metrics: Dict[Any, Cluster] = {(c.time_window_id, c.cluster_id): c for c in cluster_metrics}
with open(f'data/{use_case}/layer_metrics/{reference_layer_name}.json') as file:
layer_metrics: List[Layer] = [Layer.create_from_dict(e) for e in json.loads(file.read())]
layer_metrics: Dict[Any, Layer] = {l.time_window_id: l for l in layer_metrics}
######################
# load the time keys chronologically
ordered_time_keys = list(layer_metrics.keys())
ordered_time_keys.sort(key=lambda x: eval(x))
######################
ordered_time_keys = ordered_time_keys[-N:]
#################
prediction_metrics_raw = []
current_layer_metric = layer_metrics[ordered_time_keys[1]]
prev_layer_metric = layer_metrics[ordered_time_keys[0]]
current_layer_metric_tuple = get_layer_metrics(current_layer_metric)
prev_layer_metric_tuple = get_layer_metrics(prev_layer_metric)
for cluster_id in cluster_ids:
# yield each combination of reference layer metrics to clusters
prediction_metrics_raw.append([prev_layer_metric_tuple, current_layer_metric_tuple, int(cluster_id)])
#######################
scaler, svc = load_ml_models(use_case, method, layer_name, reference_layer_name)
################
prediction_cluster_ids = []
prediction_time_window = increase_time_window(ordered_time_keys[1])
prediction_metrics = []
for pred in prediction_metrics_raw:
cluster_id = pred[-1]
prediction_cluster_ids.append(cluster_id)
flat_ = flatten_layer_metrics_datapoint(pred)
prediction_metrics.append(flat_)
prediction_results = svc.predict(scaler.transform(np.array(prediction_metrics)))
print(np.unique(prediction_results, return_counts=True))
for i in range(len(prediction_cluster_ids)):
res = PredictionResult(use_case, use_case, method, layer_name, reference_layer_name, prediction_cluster_ids[i], prediction_time_window, prediction_results[i])
repo.add_prediction_result(res)
from processing.data_prep.metrics_base import get_cyclic_time_feature
from processing.data_prep.metrics_base import get_cyclic_time_feature, get_cluster_metrics
from processing.ml.predict_base import increase_time_window, load_ml_models
N = 3 # Currently N is fixed to 3
method = 'single_context'
......@@ -11,18 +12,11 @@ import json
from entities import Cluster
import collections
import numpy as np
from typing import Iterable, Tuple
from typing import Iterable, Tuple, Dict, List
######################
from typing import Dict
from typing import Tuple
def get_metrics(cur_cluster: Cluster) -> Tuple:
return (cur_cluster.size, cur_cluster.std_dev, cur_cluster.scarcity, cur_cluster.importance1, cur_cluster.importance2,
cur_cluster.range_, cur_cluster.global_center_distance, get_cyclic_time_feature(cur_cluster.get_time_info()))
####################
import pickle
#####################
def flatten_metrics_datapoint(datapoint: list) -> Tuple['X', np.array]:
def flatten_metrics_datapoint(datapoint: list) -> np.array:
'''
Flattens a single metrics data point in the form:
[(cluster_size, cluster_variance, cluster_density, cluster_import1, cluster_import2, cluster_range, cluster_center, (time_f1, time_f2))^N]
......@@ -35,16 +29,6 @@ def flatten_metrics_datapoint(datapoint: list) -> Tuple['X', np.array]:
flat_list.extend(entry[-1]) # add time tuple
return np.asarray(flat_list)
######################
def increase_time_window(time_window_id: str):
tuple_ = eval(time_window_id)
if tuple_[1] == 52:
# 1st week next year
return (tuple_[0]+1 , 1)
else:
# next week
return str((tuple_[0], tuple_[1]+1))
#########################
from db.repository import Repository
from db.dao import PredictionResult
......@@ -72,12 +56,8 @@ def run_prediction(use_case: str):
cluster_map[id_] = []
cluster_map[id_].append(cluster)
####################
with open(f'data/{use_case}/ml_output/{method}/{layer_name}.model', 'rb') as file:
svc = pickle.load(file)
####################
with open(f'data/{use_case}/ml_output/{method}/{layer_name}_scaler.model', 'rb') as file:
scaler = pickle.load(file)
####################
scaler, svc = load_ml_models(use_case, method, layer_name)
#####################
# store id, future time window, and flattened metrics to combine the latter during prediction
prediction_cluster_ids = []
......@@ -85,7 +65,7 @@ def run_prediction(use_case: str):
prediction_metrics = []
for cluster_id, time_windows in cluster_map.items():
v = [get_metrics(c) for c in time_windows[-N:]] # metrics for last N time windows
v = [get_cluster_metrics(c) for c in time_windows[-N:]] # metrics for last N time windows
v_flattened = flatten_metrics_datapoint(v)
prediction_cluster_ids.append(cluster_id)
......
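For the single-context path each cluster contributes N=3 windows of 7 scalar cluster metrics plus the cyclic time pair, so the flattened vector has 3 x 9 = 27 entries, matching the reshape(1, 27) calls in the notebook. A hedged check with placeholder values (assuming flatten_metrics_datapoint keeps the loop shown in the notebook cell):

dummy_window = (1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, (0.0, 1.0))   # 7 metrics + (sin, cos)
v = flatten_metrics_datapoint([dummy_window] * 3)              # N = 3 time windows
assert len(v) == 27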
......@@ -8,10 +8,10 @@ approach = 'cross_context'
import pickle
from pathlib import Path
def export_model(model, use_case, layer_name, reference_layer_name):
def export_model(model, use_case, layer_name, reference_layer_name, scaler=False):
fpath = f'data/{use_case}/ml_output/{approach}'
Path(fpath).mkdir(parents=True, exist_ok=True)
with open(f'{fpath}/{layer_name}_{reference_layer_name}.model', 'wb') as f:
with open(f'{fpath}/{layer_name}_{reference_layer_name}{"_scaler" if scaler else ""}.model', 'wb') as f:
pickle.dump(model, f)
###################
from sklearn.ensemble import RandomForestClassifier
......@@ -46,11 +46,13 @@ def run_training(use_case):
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_X = scaler.fit_transform(training)[:,:-1] # all except y
train_X = scaler.fit_transform(training[training.columns[:-1]]) # all except y
train_Y = training[training.columns[-1]]
test_X = scaler.transform(testing)[:,:-1] # all except y
test_X = scaler.transform(testing[testing.columns[:-1]]) # all except y
test_Y = testing[testing.columns[-1]]
export_model(scaler, use_case, layer_name, reference_layer_name, scaler=True)
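The change above fits the scaler on the feature columns only, rather than fit-transforming the whole frame (label included) and slicing afterwards, which would make the scaler expect one column too many at prediction time. A minimal sketch with hypothetical column names:

import pandas as pd
from sklearn.preprocessing import StandardScaler

training = pd.DataFrame({'feature_1': [1.0, 2.0, 3.0],
                         'feature_2': [10.0, 20.0, 30.0],
                         'evolution_label': [0.0, 1.0, 3.0]})
scaler = StandardScaler()
train_X = scaler.fit_transform(training[training.columns[:-1]])   # scale features only
train_Y = training[training.columns[-1]]                          # label stays unscaled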
########################
from processing import DataSampler
......
......@@ -5,12 +5,18 @@ if os.path.exists(modules_path):
sys.path.insert(1, modules_path)
from db.repository import Repository
from processing.ml.predict_single_context import run_prediction as run_single_prediction
# from processing.ml.predict_cross_context import run_prediction as run_cross_prediction
from processing.ml.predict_cross_context import run_prediction as run_cross_prediction
if __name__ == '__main__':
'''Executes the predictions.'''
use_case='community-prediction-youtube-n'
repo = Repository()
repo.delete_all_prediction_results()
run_single_prediction(use_case)
# run_cross_prediction(use_case)
\ No newline at end of file
run_cross_prediction(use_case)
\ No newline at end of file