Commit 32ebd07f authored by Alexander Lercher's avatar Alexander Lercher

Merge branch 'feature/community-prediction' into 'develop'

CA-CEP

See merge request !44
parents 877c5e65 a31e702c
@@ -2,13 +2,15 @@ FROM python:3
LABEL maintainer="Alexander Lercher"
RUN apt-get update
RUN pip install flask
RUN pip install connexion[swagger-ui]
EXPOSE 5000
WORKDIR /app
COPY src/data-hub/role-stage-discovery-microservice/app/requirements.txt /app/
RUN pip install -r requirements.txt
COPY src/modules/ /app/
COPY src/data-hub/proactive-community-detection-microservice/app/ /app/
RUN chmod a+x main.py
paths:
/debug:
post:
operationId: "debug.echo"
tags:
- "Echo"
summary: "Echo function for debugging purposes"
description: "Echoes the input back to the caller."
parameters:
- in: body
name: "Object"
required: true
schema:
type: object
responses:
'200':
description: "Successful echo of request data"
/use-cases/{use_case}/tables/{table}/layers/{layer_name}/predictions:
get:
operationId: "routes.predictions.get"
security:
- JwtRegular: []
tags:
- "Predictions"
summary: "Get predictions"
parameters:
- name: "use_case"
in: "path"
description: "Name of the use-case"
required: true
type: "string"
- name: "table"
in: "path"
description: "Name of the table"
required: true
type: "string"
- name: "layer_name"
in: "path"
description: "Name of the layer"
required: true
type: "string"
responses:
'200':
description: "Successful operation"
schema:
$ref: "#/definitions/Prediction"
'404':
description: "Predictions not found"
definitions:
Prediction:
type: object
properties:
use_case:
type: string
table:
type: string
method:
type: string
layer:
type: string
reference_layer:
type: string
cluster_label:
type: string
time_window:
type: string
prediction:
type: integer
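# Illustrative only: a response from the predictions endpoint above might look
# roughly like this (all values are hypothetical, not taken from a real run):
#   {
#     "use_case": "my-use-case",
#     "table": "my-table",
#     "method": "single_context",
#     "layer": "DemandLayer",
#     "reference_layer": "PriceLayer",
#     "cluster_label": "3",
#     "time_window": "(2021, 14)",
#     "prediction": 2
#   }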
@@ -11,20 +11,9 @@ produces:
basePath: "/api"
# Import security definitions from global security definition
securityDefinitions:
$ref: '../security/security.yml#securityDefinitions'
paths:
/debug:
post:
operationId: "debug.echo"
tags:
- "Echo"
summary: "Echo function for debugging purposes"
description: "Echoes the input back to the caller."
parameters:
- in: body
name: "Object"
required: true
schema:
type: object
responses:
200:
description: "Successful echo of request data"
$ref: 'routes.yml#paths'
swagger: "2.0"
info:
title: Proactive Community Detection microservice
description: This is the documentation for the proactive community detection microservice.
version: "1.0.0"
consumes:
- "application/json"
produces:
- "application/json"
basePath: "/api"
# Import security definitions from global security definition
securityDefinitions:
$ref: '../../../../modules/security/security_local.yml#securityDefinitions'
paths:
$ref: 'routes.yml#paths'
from db.dao.cluster import Cluster as ClusterDao
from db.dao.layer import Layer as LayerDao
from db.dao.timeslice import TimeSlice as TimeSliceDao
from db.dao.layer_pair import LayerPair as LayerPairDao
from db.dao.prediction_result import PredictionResult
import json
from typing import List, Dict
from datetime import date, datetime
class Cluster:
'''
A cluster for an arbitrary layer containing some nodes.
:param use_case: The use-case of the layer
:param use_case_table: The use-case table of the layer
:param layer_name: The name of the layer in which the cluster is located
:param cluster_label: The label of the cluster unique for the layer
:param nodes: The individual nodes of the cluster
:param label: A human-readable label
'''
def __init__(self, use_case: str = None, use_case_table: str = None, layer_name: str = None,
cluster_label: int = None, nodes: List[Dict] = None, label: str = None,
cluster_dict: Dict = None, from_db=False):
self.use_case = use_case
self.use_case_table = use_case_table
self.layer_name = layer_name
self.cluster_label = cluster_label
self.nodes = nodes
self.label = label
if cluster_dict is not None:
self.from_serializable_dict(cluster_dict, from_db)
def to_serializable_dict(self, for_db=False) -> Dict:
return {
"use_case": self.use_case,
"use_case_table": self.use_case_table,
"layer_name": self.layer_name,
"cluster_label": self.cluster_label,
"nodes": json.dumps(self.nodes) if for_db else self.nodes,
"label": self.label,
}
def from_serializable_dict(self, cluster_dict: Dict, from_db=False):
self.use_case = cluster_dict["use_case"]
self.use_case_table = cluster_dict["use_case_table"]
self.layer_name = cluster_dict["layer_name"]
self.cluster_label = cluster_dict["cluster_label"]
self.nodes = json.loads(cluster_dict["nodes"]) \
if from_db else cluster_dict["nodes"]
self.label = cluster_dict["label"]
def __repr__(self):
return json.dumps(self.to_serializable_dict())
def __str__(self):
return f"Cluster({self.__repr__()})"
import json
from datetime import datetime
from typing import Dict
class Layer:
'''
This class represents a single layer of the Multilayer Graph.
:param layer_info: Information as dictionary to restore the layer object.
'''
def __init__(self, layer_info: Dict = None, from_db=False):
if layer_info is not None:
self.from_serializable_dict(layer_info, from_db)
def to_serializable_dict(self, for_db=False) -> Dict:
return {
"layer_name": self.layer_name,
"properties": self.properties,
"use_case": self.use_case,
"use_case_table": self.use_case_table,
"total_properties": self.total_properties,
}
@staticmethod
def from_business_logic_dict(layer_info: Dict):
layer = Layer()
layer.layer_name = layer_info["name"]
layer.properties = layer_info["cluster_properties"]
layer.total_properties = layer_info["properties"]
layer.use_case = layer_info["use_case"]
layer.use_case_table = layer_info["table"]
return layer
def from_serializable_dict(self, layer_info: Dict, from_db=False):
self.layer_name = layer_info['layer_name']
self.properties = layer_info['properties']
self.use_case = layer_info["use_case"]
self.use_case_table = layer_info["use_case_table"]
self.total_properties = layer_info["total_properties"] if "total_properties" in layer_info else None
def __repr__(self):
return json.dumps(self.to_serializable_dict())
def __str__(self):
return f"Layer({self.__repr__()})"
from typing import List, Dict
class LayerPair:
def __init__(self, use_case: str, table: str, layer: str, reference_layer: str):
self.use_case = use_case
self.table = table
self.layer = layer
self.reference_layer = reference_layer
@staticmethod
def create_from_dict(dict_) -> 'LayerPair':
lp = LayerPair(None, None, None, None)
lp.__dict__.update(dict_)
return lp
from typing import List, Dict
class PredictionResult:
def __init__(self, use_case: str, table: str, method: str,
layer: str, reference_layer: str, cluster_id: str,
time_window: str, prediction: int):
self.use_case = use_case
self.table = table
self.method = method
self.layer = layer
self.reference_layer = reference_layer
self.cluster_id = cluster_id
self.time_window = time_window
self.prediction = prediction
@staticmethod
def create_from_dict(dict_) -> 'PredictionResult':
obj = PredictionResult(None, None, None, None, None, None, None, None)
obj.__dict__.update(dict_)
return obj
import json
from typing import List, Dict, NewType, Any
from datetime import date, datetime
Node = NewType('Node', dict)
class TimeSlice:
'''
A time slice for a single layer containing all nodes for that time.
:param time: The tag indicating the time
:param layer_name: The name of the layer the nodes belong to
'''
def __init__(self, time: Any = None, use_case: str = None, use_case_table: str = None, layer_name: str = None,
time_slice_dict: Dict = None, from_db = False):
self.time = str(time)
self.use_case = use_case
self.use_case_table = use_case_table
self.layer_name = layer_name
self.clusters: Dict[str, List[Node]] = {}
if time_slice_dict is not None:
self.from_serializable_dict(time_slice_dict, from_db)
def init_all_clusters(self, cluster_labels: List[str]):
'''Initializes internal clusters for all labels with an empty list.'''
for cluster_label in cluster_labels:
# only string keys can be stored in json
cluster_label = str(cluster_label)
self.clusters[cluster_label] = []
def add_node_to_cluster(self, cluster_label: str, node):
# only string keys can be stored in json
cluster_label = str(cluster_label)
if cluster_label not in self.clusters:
# self.clusters[cluster_label] = []
raise KeyError(f"self::init_all_clusters must be used to add all global cluster labels beforehand (got {cluster_label})")
# node = self._get_unique_id(node)
self.clusters[cluster_label].append(node)
def get_nodes_for_cluster(self, cluster_label: str):
if cluster_label in self.clusters:
return self.clusters[cluster_label]
else:
return []
def _get_unique_id(self, node : Dict) -> Dict:
'''Returns a new dict with the unique id only.'''
uid_key = 'UniqueID'
if uid_key in node:
return {uid_key: node[uid_key]}
def to_serializable_dict(self, for_db=False) -> Dict:
return {
"time": self.time,
"use_case": self.use_case,
"use_case_table": self.use_case_table,
'layer_name': self.layer_name,
"clusters": json.dumps(self.clusters) if for_db else self.clusters
}
def from_serializable_dict(self, dict: Dict, from_db=False):
self.time = dict["time"]
self.use_case = dict["use_case"]
self.use_case_table = dict["use_case_table"]
self.layer_name = dict['layer_name']
self.clusters = json.loads(dict['clusters']) if from_db else dict['clusters']
def __repr__(self):
return json.dumps(self.to_serializable_dict())
def __str__(self):
return f"TimeSlice({self.__repr__()})"
import pymongo
import network_constants as netconst
from database.MongoRepositoryBase import MongoRepositoryBase
import json
from db.dao import *
from typing import List
import logging
LOGGER = logging.getLogger(__name__)
class Repository(MongoRepositoryBase):
'''This is a repository for MongoDb.'''
def __init__(self):
super().__init__(netconst.PROACTIVE_COMMUNITY_DETECTION_DB_HOSTNAME,
netconst.PROACTIVE_COMMUNITY_DETECTION_DB_PORT,
'proactiveCommunityDb')
self._use_case_collection = 'use_cases'
self._layer_collection = 'layers'
self._layer_pair_collection = 'layer_pairs'
self._clusters_collection = 'clusters'
self._time_slice_collection = 'time_slices'
self._prediction_result_collection = 'prediction_results'
def DROP(self, confirm:bool=False):
assert confirm, 'WONT DELETE WHOLE DB WITHOUT CONFIRMATION'
for collection_ in [self._use_case_collection, self._layer_collection, self._layer_pair_collection,
self._clusters_collection, self._time_slice_collection]:
super().drop_collection(collection_)
#region LayerPair
def add_use_case(self, use_case: str):
super().insert_entry(self._use_case_collection, {'name':use_case})
def get_use_cases(self) -> List[str]:
entries = super().get_entries(self._use_case_collection)
return [e['name'] for e in entries]
#endregion
#region Layers
def add_layer(self, layer: LayerDao):
super().insert_entry(self._layer_collection, layer.to_serializable_dict())
def get_layers(self) -> List[LayerDao]:
'''Retrieves all layers from the db, independent of use-case.'''
entries = super().get_entries(self._layer_collection, projection={'_id': 0})
return [LayerDao(e) for e in entries]
def get_layers_for_use_case(self, use_case: str) -> List[LayerDao]:
entries = super().get_entries(self._layer_collection, selection={'use_case': use_case})
return [LayerDao(e) for e in entries]
def get_layers_for_table(self, use_case: str, use_case_table: str) -> List[LayerDao]:
entries = super().get_entries(self._layer_collection, selection={'use_case': use_case, 'use_case_table': use_case_table})
return [LayerDao(e) for e in entries]
def get_layer_by_name(self, use_case:str, use_case_table:str, layer_name:str) -> LayerDao:
'''Returns a single layer for the given use-case, table, and layer name.'''
entries = super().get_entries(self._layer_collection, selection={'use_case': use_case, 'use_case_table': use_case_table, 'layer_name': layer_name})
entries = [LayerDao(e) for e in entries]
if entries is not None and len(entries) > 0:
if len(entries) > 1:
LOGGER.error(f"Layer Key {use_case}, {layer_name} is not unique.")
return entries[0]
else:
return None
def delete_all_layers(self):
super().drop_collection(self._layer_collection)
#endregion Layers
#region Clusters
def add_cluster(self, cluster: ClusterDao):
super().insert_entry(self._clusters_collection, cluster.to_serializable_dict(for_db=True))
def add_clusters(self, clusters: List[ClusterDao]):
cluster_dicts = [c.to_serializable_dict(for_db=True) for c in clusters]
super().insert_many(self._clusters_collection, cluster_dicts)
def get_clusters_for_layer(self, use_case: str, use_case_table: str, layer_name: str) -> List[ClusterDao]:
entries = super().get_entries(self._clusters_collection, selection={'use_case': use_case, 'use_case_table': use_case_table, 'layer_name': layer_name}, projection={'_id': 0})
return [ClusterDao(cluster_dict=e, from_db=True) for e in entries]
def delete_all_clusters(self):
super().drop_collection(self._clusters_collection)
#endregion
#region TimeSlice
def add_time_slice(self, timeslice: TimeSliceDao):
super().insert_entry(self._time_slice_collection, timeslice.to_serializable_dict(for_db=True))
def get_time_slices(self) -> List[TimeSliceDao]:
'''Returns all time slices.'''
entries = super().get_entries(self._time_slice_collection)
return [TimeSliceDao(None, None, time_slice_dict=e, from_db=True) for e in entries]
def get_time_slices_for_layer(self, use_case: str, use_case_table: str, layer_name: str) -> List[TimeSliceDao]:
'''Returns all time slices with the given layer_name.'''
entries = super().get_entries(self._time_slice_collection, selection={'use_case': use_case, 'use_case_table': use_case_table, 'layer_name': layer_name})
return [TimeSliceDao(time_slice_dict=e, from_db=True) for e in entries]
def remove_all_time_slices(self):
super().drop_collection(self._time_slice_collection)
#endregion
#region LayerPair
def add_layer_pair(self, layer_pair: LayerPairDao):
super().insert_entry(self._layer_pair_collection, layer_pair.__dict__)
def get_layer_pairs(self, use_case: str) -> List[LayerPairDao]:
entries = super().get_entries(self._layer_pair_collection, selection={'use_case': use_case})
return [LayerPairDao.create_from_dict(e) for e in entries]
#endregion
#region PredictionResult
def add_prediction_result(self, prediction_result: PredictionResult):
super().insert_entry(self._prediction_result_collection, prediction_result.__dict__)
def get_prediction_results(self, use_case: str) -> List[PredictionResult]:
entries = super().get_entries(self._prediction_result_collection, selection={'use_case': use_case}, projection={'_id': 0})
return [PredictionResult.create_from_dict(e) for e in entries]
def delete_all_prediction_results(self):
super().drop_collection(self._prediction_result_collection)
#endregion
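# Illustrative usage sketch: requires a reachable MongoDB as configured in
# network_constants; the use-case name is hypothetical.
if __name__ == '__main__':
    repo = Repository()
    repo.add_use_case('my-use-case')
    print(repo.get_use_cases())                         # ['my-use-case', ...]
    print(repo.get_layers_for_use_case('my-use-case'))  # all layers stored for that use-case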
from flask import request
def echo():
import processing.fetching.fetching as f
print(f._fetch_use_cases())
return request.json
\ No newline at end of file
from entities.timewindow import TimeWindow
from entities.cluster import Cluster
from entities.layer import Layer
\ No newline at end of file
# from __future__ import annotations
from typing import Dict, List, Iterable, Any, Tuple
from entities.timewindow import TimeWindow
import numpy as np
import scipy.spatial
from processing import ClusterMetricsCalculatorFactory
class Cluster:
'''A cluster from one time window containing all metrics used for machine learning.'''
def __init__(self, time_window_id: Any, cluster_id: Any, cluster_nodes: List[dict], cluster_feature_names: List[str], nr_layer_nodes: int, layer_diversity: int,
global_cluster_center, global_center_distance=None):
self.time_window_id = time_window_id
self.cluster_id = cluster_id
metrics_calculator = ClusterMetricsCalculatorFactory.create_metrics_calculator(cluster_nodes, cluster_feature_names, nr_layer_nodes, layer_diversity)
self.size = metrics_calculator.get_size()
self.std_dev = metrics_calculator.get_standard_deviation()
self.scarcity = metrics_calculator.get_scarcity()
self.importance1 = metrics_calculator.get_importance1()
self.importance2 = metrics_calculator.get_importance2()
self.range_ = metrics_calculator.get_range()
self.center = metrics_calculator.get_center()
self.global_center_distance = \
scipy.spatial.distance.euclidean(self.center, global_cluster_center) \
if self.size > 0 \
else 0
def get_time_info(self) -> int:
'''Returns the week from the time tuple string, e.g. 25 for "(2014, 25)".'''
return eval(self.time_window_id)[1]
def __repr__(self):
return str(self.__dict__)
def __str__(self):
return f"Cluster({self.time_window_id}, {self.cluster_id}, " \
f"{self.size}, {self.std_dev}, {self.scarcity}, " \
f"{self.importance1}, {self.importance2}, " \
f"{self.range_}, {self.center})"
@staticmethod
def create_multiple_from_time_window(time_window: TimeWindow, cluster_feature_names: List[str], global_cluster_centers: Dict[str, Tuple[float]]) -> Iterable['Cluster']:
total_layer_nodes = sum([len(nodes) for nodes in time_window.clusters.values()])
layer_diversity = len([nodes for nodes in time_window.clusters.values() if len(nodes) > 0])
for cluster_nr, cluster_nodes in time_window.clusters.items():
yield Cluster(time_window.time, cluster_nr, cluster_nodes, cluster_feature_names, total_layer_nodes, layer_diversity, global_cluster_centers[cluster_nr])
@staticmethod
def create_from_dict(dict_) -> 'Cluster':
cl = Cluster(0, 0, [], 'None', 0, 0, None)
cl.__dict__.update(dict_)
return cl
from typing import Dict, List, Tuple, Any
import scipy.spatial
import scipy.stats
from entities.timewindow import TimeWindow
from processing import ClusterMetricsCalculatorFactory
class InternalCluster:
def __init__(self, cluster_id, cluster_nodes: List[dict], feature_names:List[str], global_cluster_center: Tuple[float], n_layer_nodes: int):
self.cluster_id = cluster_id
metrics_calculator = ClusterMetricsCalculatorFactory.create_metrics_calculator(cluster_nodes, feature_names, n_layer_nodes, None)
self.size = metrics_calculator.get_size()
self.relative_size = metrics_calculator.get_importance1()
self.center = metrics_calculator.get_center()
if self.size > 0:
self.global_center_distance = scipy.spatial.distance.euclidean(self.center, global_cluster_center)
else:
self.global_center_distance = 0
@staticmethod
def create_many_from_cluster_nodes(clusters: Dict[str, List[dict]], feature_names: List[str], global_cluster_centers: Dict[str, Tuple[float]]) -> List['InternalCluster']:
res_clusters = []
total_layer_nodes = sum([len(nodes) for nodes in clusters.values()])
for key, value in clusters.items():
# ignore noise as it contains no meaningful cluster information
if key == '-1':
continue
res_clusters.append(InternalCluster(key, value, feature_names, global_cluster_centers[key], total_layer_nodes))
return res_clusters
class Layer:
'''Represents metrics for one layer for a single time window.'''
def __init__(self, time_window_id: Any, clusters: List[InternalCluster]):
self.time_window_id = time_window_id
active_clusters = [c for c in clusters if c.size > 0]
self.n_nodes = sum([c.size for c in clusters])
self.n_clusters = len(active_clusters)
self.relative_cluster_sizes = self.get_relative_cluster_sizes(active_clusters)
self.cluster_size_agg_metrics = self.get_size_min_max_avg_sum(active_clusters)
self.cluster_relative_size_agg_metrics = self.get_relative_size_min_max_avg_sum(active_clusters)
self.entropy = self.get_entropy(active_clusters)
self.centers = [c.center for c in active_clusters]
self.distances_from_global_centers = self.get_distances_from_global_center(active_clusters)
self.cluster_center_distance_agg_metrics = self.get_center_distance_min_max_avg_sum(active_clusters)
def get_time_info(self) -> int:
'''Returns the week from the time tuple string, e.g. 25 for "(2014, 25)".'''
return eval(self.time_window_id)[1]
def get_size_min_max_avg_sum(self, clusters: List[InternalCluster]) -> dict:
'''Returns min, max, avg, and sum of the cluster's absolute sizes.'''
if len(clusters) == 0:
return {'min':0, 'max':0, 'avg':0, 'sum':0}
min_ = clusters[0].size
max_ = clusters[0].size
sum_ = 0
for c in clusters:
value = c.size
min_ = min(min_, value)
max_ = max(max_, value)
sum_ += value
avg_ = sum_ / len(clusters)
return {'min': min_, 'max': max_, 'avg': avg_, 'sum': sum_}
def get_relative_size_min_max_avg_sum(self, clusters: List[InternalCluster]) -> dict:
'''Returns min, max, avg, and sum of the cluster's relative sizes.'''
if len(clusters) == 0:
return {'min':0, 'max':0, 'avg':0, 'sum':0}
min_ = clusters[0].relative_size
max_ = clusters[0].relative_size
sum_ = 0
for c in clusters:
value = c.relative_size
min_ = min(min_, value)
max_ = max(max_, value)
sum_ += value
avg_ = sum_ / len(clusters)
return {'min': min_, 'max': max_, 'avg': avg_, 'sum': sum_}
def get_center_distance_min_max_avg_sum(self, clusters: List[InternalCluster]) -> dict:
'''Returns min, max, avg, and sum of the cluster's center distances.'''
if len(clusters) == 0:
return {'min':0, 'max':0, 'avg':0, 'sum':0}
min_ = clusters[0].global_center_distance
max_ = clusters[0].global_center_distance
sum_ = 0
for c in clusters:
value = c.global_center_distance
min_ = min(min_, value)
max_ = max(max_, value)
sum_ += value
avg_ = sum_ / len(clusters)
return {'min': min_, 'max': max_, 'avg': avg_, 'sum': sum_}
def get_relative_cluster_sizes(self, clusters: List[InternalCluster]):
return [c.relative_size for c in clusters]
def get_entropy(self, clusters: List[InternalCluster]):
'''
Returns the entropy over all clusters C,
where P(c_i) is the probability that a node belongs to cluster c_i.
'''
return scipy.stats.entropy(self.get_relative_cluster_sizes(clusters), base=2)
def get_distances_from_global_center(self, clusters: List[InternalCluster]):
return [cluster.global_center_distance for cluster in clusters]
def __repr__(self):
return str(self.__dict__)
def __str__(self):
return f"Layer({self.time_window_id}, " \
f"{self.n_nodes}, {self.n_clusters}, {self.relative_cluster_sizes}, " \
f"{self.entropy}, {self.centers}, {self.distances_from_global_centers})"
@staticmethod
def create_from_time_window(time_window: TimeWindow, feature_names:List[str], global_cluster_centers: Dict[str, Tuple[float]]) -> 'Layer':
clusters: List[InternalCluster] = InternalCluster.create_many_from_cluster_nodes(time_window.clusters, feature_names, global_cluster_centers)
return Layer(time_window.time, clusters)
@staticmethod
def create_from_dict(dict_) -> 'Layer':
l = Layer(0, [])
l.__dict__.update(dict_)
return l
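# Worked example for Layer.get_entropy (illustrative, not part of the module): a layer
# split into two equally sized clusters has entropy 1 bit, while a 90/10 split is less diverse.
if __name__ == '__main__':
    print(scipy.stats.entropy([0.5, 0.5], base=2))  # 1.0
    print(scipy.stats.entropy([0.9, 0.1], base=2))  # ~0.47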
\ No newline at end of file
import json
from typing import List, Dict, NewType, Any
from datetime import date, datetime
class TimeWindow:
'''
A time slice for a single layer containing all nodes for that time.
:param time: The tag indicating the time
:param layer_name: The name of the layer the nodes belong to
'''
def __init__(self, time: Any = None, use_case: str = None, use_case_table: str = None, layer_name: str = None,
time_slice_dict: Dict = None, from_db = False):
self.time = str(time)
self.use_case = use_case
self.use_case_table = use_case_table
self.layer_name = layer_name
self.clusters: Dict[str, List[dict]] = {}
if time_slice_dict is not None:
self.from_serializable_dict(time_slice_dict, from_db)
def add_node_to_cluster(self, cluster_label: str, node):
# only string keys can be stored in json
cluster_label = str(cluster_label)
if cluster_label not in self.clusters:
self.clusters[cluster_label] = []
# node = self._get_unique_id(node)
self.clusters[cluster_label].append(node)
def get_nodes_for_cluster(self, cluster_label: str):
if cluster_label in self.clusters:
return self.clusters[cluster_label]
else:
return []
def _get_unique_id(self, node : Dict) -> Dict:
'''Returns a new dict with the unique id only.'''
uid_key = 'UniqueID'
if uid_key in node:
return {uid_key: node[uid_key]}
def to_serializable_dict(self, for_db=False) -> Dict:
return {
"time": self.time,
"use_case": self.use_case,
"use_case_table": self.use_case_table,
'layer_name': self.layer_name,
"clusters": json.dumps(self.clusters) if for_db else self.clusters
}
def from_serializable_dict(self, dict: Dict, from_db=False):
self.time = dict["time"]
self.use_case = dict["use_case"]
self.use_case_table = dict["use_case_table"]
self.layer_name = dict['layer_name']
self.clusters = json.loads(dict['clusters']) if from_db else dict['clusters']
@staticmethod
def create_from_serializable_dict(dict: Dict, from_db=False):
ts = TimeWindow()
ts.from_serializable_dict(dict, from_db)
return ts
def __repr__(self):
return json.dumps(self.to_serializable_dict())
def __str__(self):
return f"TimeWindow({self.__repr__()})"
# add modules folder to interpreter path
import sys
import os
modules_path = '../../../modules/'
if os.path.exists(modules_path):
sys.path.insert(1, modules_path)
### init logging ###
import logging
LOG_FORMAT = ('%(levelname) -5s %(asctime)s %(name)s:%(funcName) -35s %(lineno) -5d: %(message)s')
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
LOGGER = logging.getLogger(__name__)
#############################
import connexion
from security import swagger_util
from pathlib import Path
import env_info
from flask import request
from flask import redirect
from flask_cors import CORS
# load swagger config
app = connexion.App(__name__, specification_dir='configs/')
app.add_api('swagger.yml')
CORS(app.app)
@app.route('/', methods=['GET'])
def api_root():
# redirect to the swagger ui of the proactive-community-detection microservice
return redirect('/api/ui')
if not env_info.is_running_locally():
swagger_path = "configs/swagger.yml"
# SSL configuration
certificate_path = env_info.get_resources_path()
context = (os.path.normpath(f'{certificate_path}/articonf1.crt'), os.path.normpath(f'{certificate_path}/articonf1.key')) # certificate and key files
else:
print("Running locally...")
swagger_path = "configs/swagger_local.yml"
context = None
app.add_api(swagger_util.get_bundled_specs(Path(swagger_path)),
resolver = connexion.RestyResolver("cms_rest_api"))
# start app
if __name__ == '__main__':
app.run(host='0.0.0.0', port=5000, ssl_context=context, debug=True)
import warnings
from abc import ABC, abstractmethod
from typing import Dict, List, Any, Tuple
import numpy as np
from scipy.spatial import ConvexHull, qhull, distance
from math import sqrt
from statistics import mean
warnings.simplefilter(action='ignore', category=UserWarning)
# UserWarning: geopandas not available. Some functionality will be disabled.
from pointpats.centrography import std_distance
warnings.simplefilter(action='default', category=UserWarning)
class ClusterMetricsCalculator(ABC):
def __init__(self, cluster_nodes: List[dict], nr_layer_nodes: int, layer_diversity: int):
self.cluster_nodes = cluster_nodes
self.nr_layer_nodes = nr_layer_nodes
self.layer_diversity = layer_diversity
def get_size(self) -> int:
'''Returns the size of the cluster.'''
return len(self.cluster_nodes)
@abstractmethod
def get_standard_deviation(self) -> float:
'''Returns the std dev from the center of the distribution.'''
pass
@abstractmethod
def get_scarcity(self) -> float:
'''
Returns the scarcity of the data points regarding the complete range for possible points.
High scarcity indicates low density.
'''
pass
@abstractmethod
def get_range(self) -> float:
'''Returns the range or area of the cluster based on the edge nodes.'''
pass
@abstractmethod
def get_center(self) -> Tuple[float, float]:
'''Returns the center of the cluster, output is fixed 2d.'''
pass
def get_importance1(self) -> float:
'''Returns the ratio of cluster_nodes to layer_nodes.'''
return float(len(self.cluster_nodes)) / self.nr_layer_nodes if len(self.cluster_nodes) > 0 else 0
def get_importance2(self) -> float:
'''Returns the inverse of the layer_diversity, where layer_diversity = number of clusters with #nodes > 0.'''
return 1.0 / self.layer_diversity if len(self.cluster_nodes) > 0 else 0
def _convert_feature_to_float(self, feature_value) -> float:
return float(feature_value if feature_value != "" else 0)
class ClusterMetricsCalculator1D(ClusterMetricsCalculator):
'''Metrics calculator for clusters which were clustered based on 1 feature (1d clustering).'''
def __init__(self, cluster_nodes: List[dict], cluster_feature_name: str, nr_layer_nodes: int, layer_diversity: int):
super().__init__(cluster_nodes, nr_layer_nodes, layer_diversity)
self.feature_values: List[Any] = [self._convert_feature_to_float(node[cluster_feature_name])
for node in cluster_nodes]
if len(self.feature_values) > 0:
self.max_value = max(self.feature_values)
self.min_value = min(self.feature_values)
else:
self.max_value = self.min_value = 0
def get_standard_deviation(self):
return np.std(self.feature_values) if len(self.feature_values) > 0 else 0
def get_scarcity(self):
'''Returns the scarcity as cluster_range / cluster_size, or 0 if len(nodes)=0.'''
if len(self.feature_values) == 0:
return 0
return self.get_range() / self.get_size()
def get_range(self):
return float(self.max_value - self.min_value)
def get_center(self):
if len(self.feature_values) == 0:
return (0, 0)
return (sum(self.feature_values) / len(self.feature_values), 0)
class ClusterMetricsCalculator2D(ClusterMetricsCalculator):
'''Metrics calculator for clusters which were clustered based on 2 features (2d clustering).'''
def __init__(self, cluster_nodes: List[dict], cluster_feature_names: List[str], nr_layer_nodes: int, layer_diversity: int):
assert len(cluster_feature_names) == 2, "This class is for 2d cluster results only!"
super().__init__(cluster_nodes, nr_layer_nodes, layer_diversity)
self.feature_values: List[Tuple[Any]] = [
(self._convert_feature_to_float(node[cluster_feature_names[0]]), self._convert_feature_to_float(node[cluster_feature_names[1]]))
for node in cluster_nodes
]
def get_standard_deviation(self):
if len(self.feature_values) == 0:
return 0
warnings.simplefilter(action='ignore', category=RuntimeWarning)
std_dist = std_distance(self.feature_values)
warnings.simplefilter(action='default', category=RuntimeWarning)
if np.isnan(std_dist):
return 0 # std_dist can be nan when all feature values are (nearly) identical
return std_dist
def get_scarcity(self):
'''Returns the scarcity as cluster_range / cluster_size, or 0 if len(nodes)=0.'''
if len(self.feature_values) == 0:
return 0
if len(self.feature_values) == 1:
# exactly 1 element gives inf density
return 0
range_, twodim = self._get_range()
if twodim:
return sqrt(range_ / self.get_size())
else:
return range_ / self.get_size()
def _get_range(self):
twodim = False
if len(self.feature_values) == 0 or len(self.feature_values) == 1:
range_ = 0
elif len(self.feature_values) == 2:
# cannot calculate area with 2 points - just use 2d distance as range instead
range_ = float(distance.euclidean(self.feature_values[0], self.feature_values[1]))
else:
try:
# calculate range as 2d area
points = self._get_polygon_border_points(self.feature_values)
range_ = self._calc_polygon_area(points)
# twodim must be known when calculating scarcity
twodim = True
except qhull.QhullError as err:
# possible reasons that there is no hull with real area:
# 1. all points are at the same location
# 2. all points have the same x or y coordinates (lie on one hori/vert line)
points = np.asarray(self.feature_values)
same_x = len(set(points[:,0])) == 1
if same_x:
# use only y feature
features = points[:,1]
range_ = max(features) - min(features)
same_y = len(set(points[:,1])) == 1
if same_y:
# use only x feature
features = points[:,0]
range_ = max(features) - min(features)
if not same_x and not same_y:
# assume linear distribution of nodes
points = np.asarray(list(set(self.feature_values)))
min_ = min(points[:,0]), min(points[:,1])
max_ = max(points[:,0]), max(points[:,1])
range_ = float(distance.euclidean(min_, max_))
return (range_, twodim)
def get_range(self):
return self._get_range()[0]
def _get_polygon_border_points(self, points: List[List[float]]) -> 'np.array':
points = np.asarray(points)
hull = ConvexHull(points)
return points[hull.vertices]
def _calc_polygon_area(self, border_points: 'np.array') -> float:
x: 'np.array' = border_points[:,0]
y: 'np.array' = border_points[:,1]
# https://en.wikipedia.org/wiki/Shoelace_formula
area = 0.5 * np.abs(np.dot(x, np.roll(y,1)) - np.dot(y, np.roll(x,1)))
return float(area)
def get_center(self):
if len(self.feature_values) == 0:
return (0, 0)
x = [f[0] for f in self.feature_values]
y = [f[1] for f in self.feature_values]
centroid = (sum(x) / len(self.feature_values), sum(y) / len(self.feature_values))
return centroid
class ClusterMetricsCalculatorFactory:
@staticmethod
def create_metrics_calculator(cluster_nodes: List[dict], cluster_feature_names: List[str], nr_layer_nodes: int, layer_diversity: int) -> ClusterMetricsCalculator:
"""
This factory creates a class which contains metrics about a single cluster based on
its nodes, feature values, its layer total node number and its layer diversity.
:param cluster_nodes: all nodes from the cluster
:param cluster_feature_names: all field names which where used during clustering
:param nr_layer_nodes: the number of total layer nodes
:param layer_diversity: the diversity of the layer calculated as: number of clusters with nodes > 0
"""
if isinstance(cluster_feature_names, str):
return ClusterMetricsCalculator1D(cluster_nodes, cluster_feature_names, nr_layer_nodes, layer_diversity)
if len(cluster_feature_names) == 1:
return ClusterMetricsCalculator1D(cluster_nodes, cluster_feature_names[0], nr_layer_nodes, layer_diversity)
if len(cluster_feature_names) == 2:
return ClusterMetricsCalculator2D(cluster_nodes, cluster_feature_names, nr_layer_nodes, layer_diversity)
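# Illustrative usage of the factory above; the feature name and node values are hypothetical.
if __name__ == '__main__':
    nodes = [{'Finished_time': '2.0'}, {'Finished_time': '5.0'}]
    calc = ClusterMetricsCalculatorFactory.create_metrics_calculator(
        nodes, ['Finished_time'], nr_layer_nodes=4, layer_diversity=2)
    print(calc.get_size(), calc.get_range(), calc.get_center())  # 2 3.0 (3.5, 0)
    print(calc.get_importance1(), calc.get_importance2())        # 0.5 0.5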
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
class DataSampler:
def __init__(self):
pass
def undersample(self, X, y, strategy='not minority') -> ('X', 'y'):
'''Undersampling so all class sizes equal minority class size.'''
rus = RandomUnderSampler(random_state=42, sampling_strategy=strategy)
X_undersampled, y_undersampled = rus.fit_resample(X, y)
return X_undersampled, y_undersampled
def oversample(self, X, y) -> ('X', 'y'):
'''Oversample based on SMOTE so all class sizes equal majority class size.'''
sm = SMOTE(random_state=42)
X_oversampled, Y_oversampled = sm.fit_resample(X, y)
return X_oversampled, Y_oversampled
def sample_fixed_size(self, X, y, size: int) -> ('X', 'y'):
sampling_sizes = {k: min(size, v) for k, v in y.value_counts().items()}
# undersample the larger classes to size
X, y = self.undersample(X, y, strategy=sampling_sizes)
# oversample the smaller classes to size
X, y = self.oversample(X, y)
return X, y
def sample_median_size(self, X, y: pd.Series, max_size:int=None) -> ('X', 'y'):
'''Sample the median class size for all classes.'''
median = int(y.value_counts().median())
if max_size is not None:
median = min(median, max_size)
return self.sample_fixed_size(X, y, size=median)
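# Illustrative sketch of sample_median_size (class counts and feature names are made up):
# with counts {0: 100, 1: 40, 2: 10} the median is 40, so class 0 is undersampled to 40
# and class 2 is oversampled via SMOTE to 40.
if __name__ == '__main__':
    import numpy as np
    rng = np.random.default_rng(0)
    X = pd.DataFrame({'f1': rng.normal(size=150), 'f2': rng.normal(size=150)})
    y = pd.Series([0]*100 + [1]*40 + [2]*10)
    X_s, y_s = DataSampler().sample_median_size(X, y)
    print(y_s.value_counts())  # every class at the median size (40)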
from processing.ClusterMetricsCalculator import ClusterMetricsCalculator, ClusterMetricsCalculator1D, ClusterMetricsCalculator2D, ClusterMetricsCalculatorFactory
from processing.DataSampler import DataSampler
from processing.data_prep.metrics_base import calculate_center, get_cyclic_time_feature, get_evolution_label, convert_metrics_data_to_dataframe, get_cluster_metrics
from pathlib import Path
#############################
from typing import List, Dict
import json
import os
from entities import TimeWindow, Cluster
def store_metrics_for_clusters(use_case: str, layer_name: str, feature_names: List[str]):
'''
:param layer_name: Name of the layer for which multiple time windows exist
:param feature_names: Features of the layer
'''
print(f"Working on {layer_name} cluster metrics")
# load global cluster centers
path_in = f'data/{use_case}/raw/clusters/{layer_name}.json'
with open(path_in, 'r') as file:
clusters = json.loads(file.read())
cluster_centers: Dict[str, Tuple[float]] = {
str(cluster['cluster_label']): calculate_center(cluster, feature_names)
for cluster in clusters
if cluster['label'] != 'noise'
}
path_in = f'data/{use_case}/raw/timeslices/{layer_name}'
Path(f'data/{use_case}/cluster_metrics/').mkdir(parents=True, exist_ok=True)
path_out = f'data/{use_case}/cluster_metrics/{layer_name}.json'
complete_clusters: List[Cluster] = []
for root, _, files in os.walk(path_in):
for f in files:
with open(os.path.join(root, f), 'r') as file:
# for each time window json
json_slice = json.loads(file.read())
time_window = TimeWindow.create_from_serializable_dict(json_slice)
# create all clusters + metrics for one time window
clusters = Cluster.create_multiple_from_time_window(time_window, feature_names, cluster_centers)
complete_clusters.extend(clusters)
# store the cluster metrics
with open(path_out, 'w') as file:
file.write(json.dumps([cl.__dict__ for cl in complete_clusters]))
######################
COLUMNS = ['cluster_size', 'cluster_variance', 'cluster_density', 'cluster_import1', 'cluster_import2',
'cluster_area', 'cluster_center_distance', 'time_f1', 'time_f2']*3 + ['evolution_label']
######################
import json
from entities import Cluster
import collections
import numpy as np
from typing import Iterable, Tuple
def create_metrics_training_data(use_case: str, layer_name: str, N: int = 3) -> Iterable[list]:
"""
Loads the metrics training data for an individual layer from disk.
A single metrics training data point should look like this:
(cluster_size, cluster_std_dev, cluster_scarcity, cluster_import1, cluster_import2, cluster_range, cluster_center_x, cluster_center_y, time_info) ^ N, evolution_label
time_info ... the time as 2d cyclic feature, i.e. time_info := (time_f1, time_f2)
The first tuple represents metrics from the cluster in t_i-(N-1).
The Nth tuple represents metrics from the cluster in t_i.
The label is one of {continuing, shrinking, growing, dissolving, forming} \ {splitting, merging} and identifies the change for t_i+1.
:param N: number of cluster metric tuples
:param layer_name: the name of the layer metrics json file
"""
path_in = f"data/{use_case}/cluster_metrics/{layer_name}.json"
with open(path_in, 'r') as file:
data = [Cluster.create_from_dict(cl_d) for cl_d in json.loads(file.read())]
data.sort(key=lambda cl: (cl.cluster_id, cl.time_window_id))
# manually prepare deque with N metric_tuples + evolution label
tuples = []
for i, cur_cluster in enumerate(data[:-1]):
if cur_cluster.cluster_id != data[i+1].cluster_id:
# next cluster slice in list will be another cluster id -> restart deque and skip adding the current (last) cluster slice
tuples = []
continue
cur_metrics = get_cluster_metrics(cur_cluster)
# deque function: adding N+1st element will remove oldest one
if len(tuples) == N:
tuples.pop(0)
tuples.append(cur_metrics)
if len(tuples) == N:
label = get_evolution_label(cur_cluster.size, data[i+1].size)
yield list(tuples) + [label]
############################
def flatten_metrics_datapoint(datapoint: list) -> np.array:
'''
Flattens a single metrics data point in the form:
[(cluster_size, cluster_variance, cluster_density, cluster_import1, cluster_import2, cluster_range, cluster_center, (time_f1, time_f2))^N, evolution_label]
to a flat np.array whose last element is the evolution label y.
'''
flat_list = []
for entry in datapoint[:-1]: # for all x
flat_list.extend(entry[:-1]) # add all number features except the time tuple
flat_list.extend(entry[-1]) # add time tuple
flat_list.append(datapoint[-1]) # y
return np.asarray(flat_list)
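# Worked example (hypothetical numbers): one metrics tuple contributes 7 scalar features
# plus the 2-element cyclic time feature; with N=3 tuples and the trailing evolution label
# a flattened datapoint therefore has 3*9 + 1 = 28 values, matching COLUMNS above.
#   t = (5, 0.2, 0.1, 0.5, 0.25, 1.0, 0.3, (0.97, 0.24))
#   flatten_metrics_datapoint([t, t, t, 2])  # -> np.array of length 28 == len(COLUMNS)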
################################
import numpy as np
import pandas as pd
from pandas import DataFrame
import collections
import statistics as stat
def balance_dataset(df: DataFrame) -> DataFrame:
# nothing happening here, balance only on real training, not during prep
return df
def store_training_data(use_case: str, layer_name: str):
# load metrics data from disk
data: Iterable = create_metrics_training_data(use_case=use_case, layer_name=layer_name)
# flatten and convert to df
df = convert_metrics_data_to_dataframe(data, columns=COLUMNS, flattening_method=flatten_metrics_datapoint)
# balance df
df = balance_dataset(df)
# shuffle
df = df.sample(frac=1).reset_index(drop=True)
Path(f'data/{use_case}/ml_input/single_context/').mkdir(parents=True, exist_ok=True)
df.to_csv(f'data/{use_case}/ml_input/single_context/{layer_name}.csv')
#######################
from db.repository import Repository
from db.dao import LayerDao
from pathlib import Path
repo = Repository()
def run(use_case=None):
'''
Requires raw jsons for clusters and time slices.
Working directory: data/
'''
if use_case is not None:
use_cases = [use_case]
else:
use_cases = repo.get_use_cases()
for use_case in use_cases:
print(f"Executing cluster metrics calc for use case {use_case}")
layers = [[l.layer_name, l.properties] for l in repo.get_layers_for_use_case(use_case)]
##################
for layer in layers:
store_metrics_for_clusters(use_case, layer[0], layer[1])
###################
for name, _ in layers:
print(f"Storing training data for {name}")
store_training_data(use_case, layer_name=name)
\ No newline at end of file
from processing.data_prep.metrics_base import calculate_center, get_cyclic_time_feature, get_evolution_label, convert_metrics_data_to_dataframe, get_layer_metrics
from pathlib import Path
#################
from typing import List, Tuple
import statistics as stat
import json
import os
from entities import TimeWindow, Layer
from processing import ClusterMetricsCalculatorFactory
def store_metrics_for_layers(use_case: str, layer_name: str, feature_names: List[str]):
print(f"Working on {layer_name} layer metrics")
# load global cluster centers
path_in = f'data/{use_case}/raw/clusters/{layer_name}.json'
with open(path_in, 'r') as file:
clusters = json.loads(file.read())
cluster_centers: Dict[str, Tuple[float]] = {
str(cluster['cluster_label']): calculate_center(cluster, feature_names)
for cluster in clusters
if cluster['label'] != 'noise'
}
# load time windows
all_layers: List[Layer] = []
path_in = f'data/{use_case}/raw/timeslices/{layer_name}'
for root, _, files in os.walk(path_in):
for f in files:
with open(os.path.join(root, f), 'r') as file:
json_time_slice = json.loads(file.read())
time_window = TimeWindow.create_from_serializable_dict(json_time_slice)
layer = Layer.create_from_time_window(time_window, feature_names, cluster_centers)
all_layers.append(layer)
# store the layer metrics
Path(f'data/{use_case}/layer_metrics/').mkdir(parents=True, exist_ok=True)
path_out = f'data/{use_case}/layer_metrics/{layer_name}.json'
with open(path_out, 'w') as file:
file.write(json.dumps([l.__dict__ for l in all_layers]))
#########################
from typing import List
def get_columns(N) -> List[str]:
'''Returns the column names for the data depending on N (the number of time windows), independent of M (the number of clusters in L_R).'''
cols = ['n_nodes', 'n_clusters', 'entropy']
for v in ['sizes', 'relative_sizes', 'center_dist']:
cols += [f'{v}_min', f'{v}_max', f'{v}_avg', f'{v}_sum']
# cols.extend(['relative_cluster_sizes']*M)
# cols.extend(['cluster_centers']*M)
# cols.extend(['distance_from_global_centers']*M)
cols.extend(['time_f1', 'time_f2'])
cols = cols * N
return cols + ['cluster_id'] + ['evolution_label']
######################
from typing import Iterable, List, Dict, Any
import json
from entities import Layer, Cluster
def create_layer_metrics_training_data(use_case: str, layer_name: str, reference_layer: str, N: int = 2) -> Iterable:
"""
Loads the metrics training data for an individual layer from disk.
A single metrics training data point should look like this:
[(n_nodes, n_clusters, entropy,
(relative_cluster_size)^M, (cluster_centers)^M, (distance_from_global_centers)^M,
(time1, time2))^N,
cluster_number, evolution_label]
The first tuple represents metrics from the reference layer in t_i-(N-1).
The Nth tuple represents metrics from the reference layer in t_i.
The reference_layer has M clusters in total, this might differ from the number of clusters in layer_name.
The cluster number identifies the cluster for which the evolution_label holds.
The label is one of {continuing, shrinking, growing, dissolving, forming} \ {splitting, merging} and identifies the change for a cluster in the layer layer_name for t_i.
"""
if N != 2:
raise NotImplementedError("N is not implemented and fixed to 2!")
with open(f'data/{use_case}/cluster_metrics/{layer_name}.json') as file:
cluster_metrics: List[Cluster] = [Cluster.create_from_dict(e) for e in json.loads(file.read())]
cluster_ids = {c.cluster_id for c in cluster_metrics}
cluster_metrics: Dict[Any, Cluster] = {(c.time_window_id, c.cluster_id): c for c in cluster_metrics}
with open(f'data/{use_case}/layer_metrics/{reference_layer}.json') as file:
layer_metrics: List[Layer] = [Layer.create_from_dict(e) for e in json.loads(file.read())]
layer_metrics: Dict[Any, Layer] = {l.time_window_id: l for l in layer_metrics}
# load the time keys chronologically
ordered_time_keys = list(layer_metrics.keys())
ordered_time_keys.sort(key=lambda x: [int(v) for v in x.replace('(', '').replace(')', '').split(',')])
# go through all time windows once...
prev_time_key = ordered_time_keys[0]
for current_time_key in ordered_time_keys[1:]:
# ...and load the current and previous layer metrics in the reference_layer
current_layer_metric = layer_metrics[current_time_key]
prev_layer_metric = layer_metrics[prev_time_key]
current_layer_metric_tuple = get_layer_metrics(current_layer_metric)
prev_layer_metric_tuple = get_layer_metrics(prev_layer_metric)
# ...then load the current and previous cluster metrics for all clusters in the layer_name
for cluster_id in cluster_ids:
current_cluster_metric = cluster_metrics[(current_time_key, cluster_id)]
prev_cluster_metric = cluster_metrics[(prev_time_key, cluster_id)]
evolution_label = get_evolution_label(prev_cluster_metric.size, current_cluster_metric.size)
# yield each combination of reference layer metrics to clusters
yield [prev_layer_metric_tuple, current_layer_metric_tuple, int(cluster_id), evolution_label]
prev_time_key = current_time_key
#####################
import numpy as np
def flatten_layer_metrics_datapoint(datapoint: list) -> np.array:
'''
Flattens a single layer metrics data point in the form:
[(n_nodes, n_clusters, entropy,
(relative_cluster_size)^M, (distance_from_global_centers)^M,
(time1, time2))^N,
cluster_number, evolution_label]
to a flat np.array whose last two elements are the cluster number and the evolution label y.
'''
flat_list = []
for layer_metric_tuple in datapoint[:-2]: # for all x
flat_list.extend(layer_metric_tuple[0:-1]) # everything before time
flat_list.extend(layer_metric_tuple[-1]) # time1/2
flat_list.append(datapoint[-2]) # cluster num
flat_list.append(datapoint[-1]) # y
return np.asarray(flat_list)
#########################
import numpy as np
import pandas as pd
from pandas import DataFrame
import collections
import statistics as stat
def balance_dataset(df: DataFrame) -> DataFrame:
# nothing happening here, balance only on real training, not during prep
return df
def store_training_data(use_case: str, layer_name: str, reference_layer_name: str):
# load metrics data from disk
data: Iterable = create_layer_metrics_training_data(use_case=use_case, layer_name=layer_name, reference_layer=reference_layer_name)
# convert to X and Y
df = convert_metrics_data_to_dataframe(data, columns=get_columns(N=2), flattening_method=flatten_layer_metrics_datapoint)
# balance df
df = balance_dataset(df)
# shuffle
df = df.sample(frac=1).reset_index(drop=True)
Path(f'data/{use_case}/ml_input/cross_context/').mkdir(parents=True, exist_ok=True)
df.to_csv(f'data/{use_case}/ml_input/cross_context/{layer_name}_{reference_layer_name}.csv')
#########################
from db.repository import Repository
from db.dao import LayerDao
from pathlib import Path
repo = Repository()
def run(use_case=None):
'''
Requires raw jsons and cluster metrics.
Working directory: data/
'''
if use_case is not None:
use_cases = [use_case]
else:
use_cases = repo.get_use_cases()
for use_case in use_cases:
print(f"Executing layer metrics calc for use case {use_case}")
layers = [[l.layer_name, l.properties] for l in repo.get_layers_for_use_case(use_case)]
layer_pairs = repo.get_layer_pairs(use_case)
################
for layer in layers:
try:
store_metrics_for_layers(use_case, layer[0], layer[1])
except FileNotFoundError:
pass
###############
for ld in layer_pairs:
print(f"Storing training data for {ld.layer} with L_R={ld.reference_layer}")
store_training_data(use_case, layer_name=ld.layer, reference_layer_name=ld.reference_layer)
\ No newline at end of file
from processing.data_prep.cluster_metrics_calc import run as crun
from processing.data_prep.layer_metrics_calc import run as lrun
from pathlib import Path
import json
import os
from db.repository import Repository
repo = Repository()
def store_clusters_as_files(use_case):
path_ = f'data/{use_case}/raw/clusters/'
Path(path_).mkdir(parents=True, exist_ok=True)
layers = repo.get_layers_for_use_case(use_case)
for l in layers:
clusters = repo.get_clusters_for_layer(use_case, l.use_case_table, l.layer_name)
with open(os.path.join(path_, f'{l.layer_name}.json'), 'w') as file_:
file_.write(json.dumps([c.to_serializable_dict() for c in clusters]))
def store_time_slices_as_files(use_case):
path_ = f'data/{use_case}/raw/timeslices/'
layers = repo.get_layers_for_use_case(use_case)
for l in layers:
Path(os.path.join(path_, l.layer_name)).mkdir(parents=True, exist_ok=True)
time_slices = repo.get_time_slices_for_layer(use_case, l.use_case_table, l.layer_name)
for ts in time_slices:
with open(os.path.join(path_, l.layer_name, f'{ts.time}.json'), 'w') as file_:
file_.write(json.dumps(ts.to_serializable_dict()))
def run(use_case=None):
'''Prepares training data for single and cross-context using the file system (data/)'''
if use_case is not None:
use_cases = [use_case]
else:
use_cases = repo.get_use_cases()
for use_case in use_cases:
store_clusters_as_files(use_case)
store_time_slices_as_files(use_case)
crun(use_case)
lrun(use_case)
'''
These functions are utilized for both single and cross-context data preparation, i.e.
- cluster_metrics_calc.py
- layer_metrics_calc.py
'''
#############################
from typing import Tuple
from processing import ClusterMetricsCalculatorFactory
def calculate_center(cluster: dict, features: list) -> Tuple[float]:
calc = ClusterMetricsCalculatorFactory.create_metrics_calculator(cluster['nodes'], features, 1, 1)
return calc.get_center()
#####################
import numpy as np
def get_cyclic_time_feature(time: int, max_time_value: int = 52) -> Tuple[float, float]:
return (np.sin(2*np.pi*time/max_time_value),
np.cos(2*np.pi*time/max_time_value))
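# Worked example (illustrative): week 13 lies a quarter of the way around the unit circle,
# and week 52 wraps around next to week 1, which is the point of the cyclic encoding.
#   get_cyclic_time_feature(13)  # -> (~1.0, ~0.0)
#   get_cyclic_time_feature(52)  # -> (~0.0, 1.0)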
####################
def get_evolution_label(old_size: int, new_size: int) -> int:
'''Returns the evolution label as int by mapping 0..4 to {continuing, shrinking, growing, dissolving, forming}.'''
if old_size == 0 and new_size == 0:
return -1 # STILL EMPTY
if old_size == new_size:
return 0 # continuing
if old_size == 0 and new_size > 0:
return 4 # forming
if old_size > 0 and new_size == 0:
return 3 # dissolving
if old_size > new_size:
return 1 # shrinking
if old_size < new_size:
return 2 # growing
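# Illustrative mapping (sizes made up):
#   get_evolution_label(10, 10) -> 0 (continuing)
#   get_evolution_label(10, 4)  -> 1 (shrinking)
#   get_evolution_label(4, 10)  -> 2 (growing)
#   get_evolution_label(4, 0)   -> 3 (dissolving)
#   get_evolution_label(0, 4)   -> 4 (forming)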
#########################
from typing import Iterable
import pandas as pd
def convert_metrics_data_to_dataframe(data: Iterable, columns: list, flattening_method: 'callable') -> pd.DataFrame:
'''Flattens and splits metrics data to match ML conventions.'''
training_data = []
for element in data:
xy: 'np.array' = flattening_method(element)
training_data.append(xy)
return pd.DataFrame(data=training_data, columns=columns)
####################
from entities import Cluster, Layer
from typing import Dict, Tuple
def get_cluster_metrics(cur_cluster: Cluster) -> Tuple:
return (cur_cluster.size, cur_cluster.std_dev, cur_cluster.scarcity, cur_cluster.importance1, cur_cluster.importance2,
cur_cluster.range_, cur_cluster.global_center_distance, get_cyclic_time_feature(cur_cluster.get_time_info()))
####################
def get_layer_metrics(layer: Layer) -> Iterable:
res = [layer.n_nodes, layer.n_clusters, layer.entropy]
res += [layer.cluster_size_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]
res += [layer.cluster_relative_size_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]
res += [layer.cluster_center_distance_agg_metrics[k] for k in ['min', 'max', 'avg', 'sum']]
res.append(get_cyclic_time_feature(layer.get_time_info()))
return res
###################
\ No newline at end of file
from security.token_manager import TokenManager
import network_constants
from db.repository import Repository
from db.dao import *
from typing import List, Dict
import requests
import json
def _fetch_use_cases() -> List[str]:
jwt = TokenManager.getInstance().getToken()
url = f'https://{network_constants.BUSINESS_LOGIC_HOSTNAME}:{network_constants.BUSINESS_LOGIC_REST_PORT}/api/use-cases'
response = requests.get(
url,
verify = False,
proxies = { "http":None, "https":None },
headers = {"Authorization": f"Bearer {jwt}"}
)
if response.status_code != 200:
raise ConnectionError(f"Could not fetch use-cases from business-logic microservice, statuscode: {response.status_code}!")
return [row["name"] for row in response.json()]
def _fetch_tables(use_case: str) -> List[str]:
jwt = TokenManager.getInstance().getToken()
url = f'https://{network_constants.BUSINESS_LOGIC_HOSTNAME}:{network_constants.BUSINESS_LOGIC_REST_PORT}/api/use-cases/{use_case}/tables'
response = requests.get(
url,
verify = False,
proxies = { "http":None, "https":None },
headers = {"Authorization": f"Bearer {jwt}"}
)
if response.status_code != 200:
raise ConnectionError(f"Could not fetch use-cases from business-logic microservice, statuscode: {response.status_code}!")
return [row["name"] for row in response.json()]
def _fetch_layers(use_case: str, table: str) -> List[LayerDao]:
jwt = TokenManager.getInstance().getToken()
url = f'https://{network_constants.BUSINESS_LOGIC_HOSTNAME}:{network_constants.BUSINESS_LOGIC_REST_PORT}/api/use-cases/{use_case}/tables/{table}/layers'
response = requests.get(
url,
verify = False,
proxies = { "http":None, "https":None },
headers = {"Authorization": f"Bearer {jwt}"}
)
if response.status_code != 200:
raise ConnectionError(f"Could not fetch layers for {use_case} from business-logic microservice, statuscode: {response.status_code}!")
return [LayerDao.from_business_logic_dict(row) for row in response.json()]
def _fetch_clusters(use_case: str, table: str, layer_name: str) -> List[str]:
jwt = TokenManager.getInstance().getToken()
url = f'https://{network_constants.ROLESTAGE_DISCOVERY_HOSTNAME}:{network_constants.ROLESTAGE_DISCOVERY_REST_PORT}/api/use-cases/{use_case}/tables/{table}/layers/{layer_name}/clusters'
response = requests.get(
url,
verify = False,
proxies = { "http":None, "https":None },
headers = {"Authorization": f"Bearer {jwt}"}
)
if response.status_code != 200:
raise ConnectionError(f"Could not fetch clusters for {use_case}//{table}//{layer_name}, statuscode: {response.status_code}!")
return [ClusterDao(cluster_dict=row) for row in response.json()]
def _fetch_timeslices(use_case: str, table: str, layer_name: str) -> List[Dict]:
jwt = TokenManager.getInstance().getToken()
url = f'https://{network_constants.ROLESTAGE_DISCOVERY_HOSTNAME}:{network_constants.ROLESTAGE_DISCOVERY_REST_PORT}/api/use-cases/{use_case}/tables/{table}/layers/{layer_name}/timeslices'
response = requests.get(
url,
verify = False,
proxies = { "http":None, "https":None },
headers = {"Authorization": f"Bearer {jwt}"}
)
if response.status_code != 200:
raise ConnectionError(f"Could not fetch time slices for {use_case}//{table}//{layer_name}, statuscode: {response.status_code}!")
return [TimeSliceDao(time_slice_dict=row) for row in response.json()]
def _fetch_layerpairs(use_case: str, table: str) -> List[Dict]:
jwt = TokenManager.getInstance().getToken()
url = f'https://{network_constants.BUSINESS_LOGIC_HOSTNAME}:{network_constants.BUSINESS_LOGIC_REST_PORT}/api/use-cases/{use_case}/tables/{table}/layer-pairs'
response = requests.get(
url,
verify = False,
proxies = { "http":None, "https":None },
headers = {"Authorization": f"Bearer {jwt}"}
)
if response.status_code != 200:
raise ConnectionError(f"Could not fetch layer pairs for {use_case}//{table}, statuscode: {response.status_code}!")
return [LayerPairDao.create_from_dict(row) for row in response.json()]
def fetch(selected_use_cases: List[str] = None, selected_use_case_tables: List[str] = None):
'''Fetches all the required data.'''
repo = Repository()
for use_case in _fetch_use_cases():
if selected_use_cases is not None and use_case not in selected_use_cases:
continue
repo.add_use_case(use_case)
for table in _fetch_tables(use_case):
if selected_use_case_tables is not None and table not in selected_use_case_tables:
continue
print(f"Fetching for {use_case}//{table}")
try:
# copy all layer pairs
layer_pairs: List[LayerPairDao] = _fetch_layerpairs(use_case, table)
for lp in layer_pairs:
repo.add_layer_pair(lp)
except ConnectionError as e:
print(str(e))
# copy all layers
for layer in _fetch_layers(use_case, table):
db_layer = repo.get_layer_by_name(use_case, layer.use_case_table, layer.layer_name)
if db_layer is None:
repo.add_layer(layer)
else:
print(f"Layer already exists, skipping cluster and timeslice fetching: {db_layer.layer_name}")
continue
try:
# copy all clusters
clusters = _fetch_clusters(use_case, layer.use_case_table, layer.layer_name)
for cl in clusters:
repo.add_cluster(cl)
except ConnectionError as e:
print(str(e))
try:
# copy all timeslices
timeslices = _fetch_timeslices(use_case, layer.use_case_table, layer.layer_name)
for ts in timeslices:
repo.add_time_slice(ts)
except ConnectionError as e:
print(str(e))
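# Usage sketch (illustrative; mirrors the run_fetching entry point further below):
#   fetch(selected_use_cases=['community-prediction-youtube-n'], selected_use_case_tables=None)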
\ No newline at end of file
def increase_time_window(time_window_id: str) -> str:
tuple_ = eval(time_window_id)
if tuple_[1] == 52:
# 1st week of the next year
return str((tuple_[0]+1, 1))
else:
# next week
return str((tuple_[0], tuple_[1]+1))
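# Worked example (illustrative comment): time window ids are string-encoded
# (year, calendar week) tuples, so
#   increase_time_window('(2018, 51)')  ->  '(2018, 52)'
#   increase_time_window('(2018, 52)')  ->  '(2019, 1)'   # rolls over to week 1 of the next year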
######################
from typing import Tuple
import pickle
def load_ml_models(use_case, method, layer_name, reference_layer_name=None) -> Tuple['scaler', 'clf']:
path_ = f'data/{use_case}/ml_output/{method}/{layer_name}'
if method == 'single_context':
with open(f'{path_}.model', 'rb') as file:
svc = pickle.load(file)
with open(f'{path_}_scaler.model', 'rb') as file:
scaler = pickle.load(file)
elif method == 'cross_context':
with open(f'{path_}_{reference_layer_name}.model', 'rb') as file:
svc = pickle.load(file)
with open(f'{path_}_{reference_layer_name}_scaler.model', 'rb') as file:
scaler = pickle.load(file)
else:
raise NotImplementedError('Prediction method is not implemented')
return scaler, svc
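# Usage sketch (illustrative; 'UseCase', 'LayerA' and 'LayerB' are placeholder names and the
# pickled models are assumed to exist, e.g. produced by the training scripts below):
#   scaler, clf = load_ml_models('UseCase', 'single_context', 'LayerA')
#   scaler, clf = load_ml_models('UseCase', 'cross_context', 'LayerA', reference_layer_name='LayerB')
#   y_pred = clf.predict(scaler.transform(X))   # X: 2d array of flattened metric rows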
\ No newline at end of file
from processing.data_prep.metrics_base import get_cyclic_time_feature, get_layer_metrics
from processing.ml.predict_base import increase_time_window, load_ml_models
method = 'cross_context'
N = 2 # Currently N is fixed to 2
####################
import pandas as pd
from pandas import DataFrame
#####################
import json
from entities import Layer, Cluster
import collections
import numpy as np
from typing import Iterable, Tuple, List, Dict, Any
####################
import pickle
#####################
import numpy as np
def flatten_layer_metrics_datapoint(datapoint: list) -> np.array:
'''
Flattens a single layer metrics data point in the form:
[(n_nodes, n_clusters, entropy,
(relative_cluster_size)^M, (distance_from_global_centers)^M,
(time1, time2))^N,
cluster_number]
to:
(X)
'''
flat_list = []
for layer_metric_tuple in datapoint[:-1]: # for all x
flat_list.extend(layer_metric_tuple[0:-1]) # everything before time
flat_list.extend(layer_metric_tuple[-1]) # time1/2
flat_list.append(datapoint[-1]) # cluster num
return np.asarray(flat_list)
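# Worked example (illustrative comment, with M=1 and N=2 and made-up metric values):
# the nested datapoint
#   [(3, 2, 0.5, 0.7, 1.2, (0.1, 0.9)),
#    (4, 2, 0.4, 0.6, 1.1, (0.2, 0.8)),
#    7]
# is flattened to the feature vector
#   [3, 2, 0.5, 0.7, 1.2, 0.1, 0.9,  4, 2, 0.4, 0.6, 1.1, 0.2, 0.8,  7]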
#########################
from db.repository import Repository
from db.dao import PredictionResult
repo = Repository()
def run_prediction(use_case: str):
for layerpair in repo.get_layer_pairs(use_case):
layer_name = layerpair.layer
reference_layer_name = layerpair.reference_layer
print(f"Predicting {method} for {use_case}//{layer_name} based on {reference_layer_name}")
##########################
with open(f'data/{use_case}/cluster_metrics/{layer_name}.json') as file:
cluster_metrics: List[Cluster] = [Cluster.create_from_dict(e) for e in json.loads(file.read())]
cluster_ids = {c.cluster_id for c in cluster_metrics}
cluster_metrics: Dict[Any, Cluster] = {(c.time_window_id, c.cluster_id): c for c in cluster_metrics}
with open(f'data/{use_case}/layer_metrics/{reference_layer_name}.json') as file:
layer_metrics: List[Layer] = [Layer.create_from_dict(e) for e in json.loads(file.read())]
layer_metrics: Dict[Any, Layer] = {l.time_window_id: l for l in layer_metrics}
######################
# load the time keys chronologically
ordered_time_keys = list(layer_metrics.keys())
ordered_time_keys.sort(key=lambda x: eval(x))
######################
ordered_time_keys = ordered_time_keys[-N:]
#################
prediction_metrics_raw = []
current_layer_metric = layer_metrics[ordered_time_keys[1]]
prev_layer_metric = layer_metrics[ordered_time_keys[0]]
current_layer_metric_tuple = get_layer_metrics(current_layer_metric)
prev_layer_metric_tuple = get_layer_metrics(prev_layer_metric)
for cluster_id in cluster_ids:
# yield each combination of reference layer metrics to clusters
prediction_metrics_raw.append([prev_layer_metric_tuple, current_layer_metric_tuple, int(cluster_id)])
#######################
scaler, svc = load_ml_models(use_case, method, layer_name, reference_layer_name)
################
prediction_cluster_ids = []
prediction_time_window = increase_time_window(ordered_time_keys[1])
prediction_metrics = []
for pred in prediction_metrics_raw:
cluster_id = pred[-1]
prediction_cluster_ids.append(cluster_id)
flat_ = flatten_layer_metrics_datapoint(pred)
prediction_metrics.append(flat_)
prediction_results = svc.predict(scaler.transform(np.array(prediction_metrics)))
print(np.unique(prediction_results, return_counts=True))
for i in range(len(prediction_cluster_ids)):
res = PredictionResult(use_case, use_case, method, layer_name, reference_layer_name, prediction_cluster_ids[i], prediction_time_window, prediction_results[i])
repo.add_prediction_result(res)
from processing.data_prep.metrics_base import get_cyclic_time_feature, get_cluster_metrics
from processing.ml.predict_base import increase_time_window, load_ml_models
N = 3 # Currently N is fixed to 3
method = 'single_context'
####################
import pandas as pd
from pandas import DataFrame
#####################
import json
from entities import Cluster
import collections
import numpy as np
from typing import Iterable, Tuple, Dict, List
######################
import pickle
#####################
def flatten_metrics_datapoint(datapoint: list) -> np.array:
'''
Flattens a single metrics data point in the form:
[(cluster_size, cluster_variance, cluster_density, cluster_import1, cluster_import2, cluster_range, cluster_center, (time_f1, time_f2))^N]
to:
(X)
'''
flat_list = []
for entry in datapoint: # for all x
flat_list.extend(entry[:-1]) # add all number features except the time tuple
flat_list.extend(entry[-1]) # add time tuple
return np.asarray(flat_list)
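# Worked example (illustrative comment, with N=2 and made-up metric values): the datapoint
#   [(5, 0.1, 0.8, 0.2, 0.3, 1.5, 4.0, (0.1, 0.9)),
#    (6, 0.2, 0.7, 0.3, 0.4, 1.6, 4.2, (0.2, 0.8))]
# is flattened to
#   [5, 0.1, 0.8, 0.2, 0.3, 1.5, 4.0, 0.1, 0.9,  6, 0.2, 0.7, 0.3, 0.4, 1.6, 4.2, 0.2, 0.8]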
#########################
from db.repository import Repository
from db.dao import PredictionResult
repo = Repository()
def run_prediction(use_case: str):
for layer in repo.get_layers_for_use_case(use_case):
layer_name = layer.layer_name
print(f"Predicting {method} for {use_case}//{layer_name}")
#################
path_in = f"data/{use_case}/cluster_metrics/{layer_name}.json"
with open(path_in, 'r') as file:
data = [Cluster.create_from_dict(cl_d) for cl_d in json.loads(file.read())]
data.sort(key=lambda cl: (eval(cl.cluster_id), eval(cl.time_window_id)))
#####################
cluster_map: Dict['cluster_id', 'time_windows'] = {}
for cluster in data:
id_ = cluster.cluster_id
if id_ not in cluster_map:
cluster_map[id_] = []
cluster_map[id_].append(cluster)
####################
scaler, svc = load_ml_models(use_case, method, layer_name)
#####################
# store cluster id, future time window, and flattened metrics so the batch predictions can be matched back to their clusters afterwards
prediction_cluster_ids = []
prediction_time_windows = []
prediction_metrics = []
for cluster_id, time_windows in cluster_map.items():
v = [get_cluster_metrics(c) for c in time_windows[-N:]] # metrics for last N time windows
v_flattened = flatten_metrics_datapoint(v)
prediction_cluster_ids.append(cluster_id)
prediction_time_windows.append(increase_time_window(time_windows[-1].time_window_id))
prediction_metrics.append(v_flattened)
# predict all at once for speedup
prediction_results = svc.predict(scaler.transform(np.array(prediction_metrics)))
print(np.unique(prediction_results, return_counts=True))
for i in range(len(prediction_cluster_ids)):
res = PredictionResult(use_case, use_case, method, layer_name, None, prediction_cluster_ids[i], prediction_time_windows[i], prediction_results[i])
repo.add_prediction_result(res)
import numpy as np
import collections
from typing import Tuple
def split_data(dataframe, test_dataset_frac=.2, shuffle=False) -> Tuple['training_data', 'test_data']:
'''Splits the dataframe row-wise into a training and a test partition (no shuffling by default, so the last rows end up in the test set).'''
if shuffle:
dataframe = dataframe.sample(frac=1).reset_index(drop=True)
training_size = int(len(dataframe) * (1-test_dataset_frac))
train = dataframe[:training_size].reset_index(drop=True)
test = dataframe[training_size:].reset_index(drop=True)
return train, test
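# Usage sketch (illustrative): an 80/20 split without shuffling keeps the row order,
# so the test partition simply contains the last rows of the input dataframe.
#   training, testing = split_data(df, test_dataset_frac=.2, shuffle=False)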
#######################
import pandas as pd
from pandas import DataFrame
def remove_empty_community_class(df):
'''Merges evolution_label -1.0 (the community stays empty) into class 0 instead of dropping those rows.'''
# res = df.loc[df['evolution_label'] != -1.0]
# res = res.reset_index(drop=True)
# return res
df['evolution_label'] = df['evolution_label'].replace(-1.0, 0)
return df
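# Example (illustrative comment): a row with evolution_label == -1.0 is kept but
# relabelled as class 0, instead of being removed as in the commented-out variant above.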
########################
import sklearn.metrics
def print_report(clfs: list, test_Xs: list, test_Y: 'y', titles: list):
"""
Prints all reports.
:param clfs: list of classifiers to evaluate
:param test_Xs: list of test_X for the corresponding classifier at idx
:param test_Y: true classes
:param titles: list of titles for the classifiers at idx
"""
for clf, test_X, title in zip(clfs, test_Xs, titles):
pred_Y = clf.predict(test_X)
print(f"### {title} ###\n", sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))
########################
\ No newline at end of file
import pandas as pd
from pandas import DataFrame
from processing.ml.train_base import split_data, remove_empty_community_class, print_report
approach = 'cross_context'
#######################
import pickle
from pathlib import Path
def export_model(model, use_case, layer_name, reference_layer_name, scaler=False):
fpath = f'data/{use_case}/ml_output/{approach}'
Path(fpath).mkdir(parents=True, exist_ok=True)
with open(f'{fpath}/{layer_name}_{reference_layer_name}{"_scaler" if scaler else ""}.model', 'wb') as f:
pickle.dump(model, f)
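# Illustrative file layout ('UseCase', 'LayerA', 'LayerB' are placeholder names):
#   export_model(rfc, 'UseCase', 'LayerA', 'LayerB')          -> data/UseCase/ml_output/cross_context/LayerA_LayerB.model
#   export_model(scaler, 'UseCase', 'LayerA', 'LayerB', True) -> data/UseCase/ml_output/cross_context/LayerA_LayerB_scaler.model
# which matches the paths read back by load_ml_models for method 'cross_context'.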
###################
from sklearn.ensemble import RandomForestClassifier
n_estimators = 50
criterion = 'gini'
max_depth = None
min_samples_leaf = 2
min_impurity_decrease= 1E-5
bootstrap=True
###############
from db.repository import Repository
repo = Repository()
def run_training(use_case):
for layerpair in repo.get_layer_pairs(use_case):
layer_name = layerpair.layer
reference_layer_name = layerpair.reference_layer
df: DataFrame = pd.read_csv(f'data/{use_case}/ml_input/cross_context/{layer_name}_{reference_layer_name}.csv', index_col=0)
#######################
training, testing = split_data(df, shuffle=False)
#####################
training = remove_empty_community_class(training)
testing = remove_empty_community_class(testing)
#####################
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_X = scaler.fit_transform(training[training.columns[:-1]]) # all except y
train_Y = training[training.columns[-1]]
test_X = scaler.transform(testing[testing.columns[:-1]]) # all except y
test_Y = testing[testing.columns[-1]]
export_model(scaler, use_case, layer_name, reference_layer_name, scaler=True)
########################
from processing import DataSampler
sampler = DataSampler()
try:
train_X, train_Y = sampler.sample_median_size(train_X, train_Y, max_size=100000)
except ValueError as e: # not enough points for oversampling
print(f"Could not sample training data, using original distribution: {e}")
####################
rfc = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease,
bootstrap=bootstrap)
rfc.fit(train_X, train_Y)
print_report([rfc], [test_X], test_Y, ["X"])
export_model(rfc, use_case, layer_name, reference_layer_name)
\ No newline at end of file
import pandas as pd
from pandas import DataFrame
from processing.ml.train_base import split_data, remove_empty_community_class, print_report
approach = 'single_context'
#######################
import pickle
from pathlib import Path
def export_model(model, use_case, layer_name, scaler=False):
fpath = f'data/{use_case}/ml_output/{approach}'
Path(fpath).mkdir(parents=True, exist_ok=True)
with open(f'{fpath}/{layer_name}{"_scaler" if scaler else ""}.model', 'wb') as f:
pickle.dump(model, f)
#####################
from sklearn.ensemble import RandomForestClassifier
n_estimators = 100
criterion = 'gini'
max_depth = None
min_samples_leaf = 2
min_impurity_decrease = 1E-5
bootstrap=True
###############
from db.repository import Repository
repo = Repository()
def run_training(use_case):
for layer in repo.get_layers_for_use_case(use_case):
layer_name = layer.layer_name
df: DataFrame = pd.read_csv(f'data/{use_case}/ml_input/single_context/{layer_name}.csv', index_col=0)
#######################
training, testing = split_data(df, shuffle=False)
#####################
training = remove_empty_community_class(training)
testing = remove_empty_community_class(testing)
#####################
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_X = scaler.fit_transform(training[training.columns[:-1]]) # all except y
train_Y = training[training.columns[-1]]
test_X = scaler.transform(testing[testing.columns[:-1]]) # all except y
test_Y = testing[testing.columns[-1]]
export_model(scaler, use_case, layer_name, scaler=True)
########################
from processing import DataSampler
sampler = DataSampler()
try:
train_X, train_Y = sampler.sample_median_size(train_X, train_Y, max_size=100000)
except ValueError as e: # not enough points for oversampling
print(f"Could not sample training data, using original distribution: {e}")
####################
rfc = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease,
bootstrap=bootstrap)
rfc.fit(train_X, train_Y)
####################
print_report([rfc], [test_X], test_Y, ["X"])
####################
export_model(rfc, use_case, layer_name)
\ No newline at end of file
attrs==21.2.0
backcall==0.2.0
beautifulsoup4==4.9.3
certifi==2021.5.30
chardet==4.0.0
charset-normalizer==2.0.3
click==7.1.2
clickclick==20.10.2
colorama==0.4.4
connexion==2.9.0
cycler==0.10.0
debugpy==1.4.0
decorator==5.0.9
Flask==1.1.4
Flask-Cors==3.0.10
idna==3.2
imbalanced-learn==0.8.0
imblearn==0.0
importlib-metadata==3.10.1
inflection==0.5.1
ipykernel==6.0.3
ipython==7.25.0
ipython-genutils==0.2.0
isodate==0.6.0
itsdangerous==1.1.0
jedi==0.18.0
Jinja2==2.11.3
joblib==1.0.1
jsonschema==3.2.0
jupyter-client==6.1.12
jupyter-core==4.7.1
kiwisolver==1.3.1
libpysal==4.5.1
MarkupSafe==2.0.1
matplotlib==3.4.2
matplotlib-inline==0.1.2
numpy==1.21.1
openapi-schema-validator==0.1.5
openapi-spec-validator==0.3.1
opencv-contrib-python==4.5.3.56
pandas==1.3.0
parso==0.8.2
pickleshare==0.7.5
Pillow==8.3.1
pointpats==2.2.0
prance==0.21.2
prompt-toolkit==3.0.19
Pygments==2.9.0
pymongo==3.12.0
pyparsing==2.4.7
pyrsistent==0.18.0
python-dateutil==2.8.2
pytz==2021.1
pywin32==301
PyYAML==5.4.1
pyzmq==22.1.0
requests==2.26.0
scikit-learn==0.24.2
scipy==1.7.0
semver==2.13.0
six==1.16.0
soupsieve==2.2.1
swagger-ui-bundle==0.0.8
threadpoolctl==2.2.0
tornado==6.1
traitlets==5.0.5
typing-extensions==3.10.0.0
urllib3==1.26.6
wcwidth==0.2.5
Werkzeug==1.0.1
zipp==3.5.0
from flask import request, Response
from db.repository import Repository
from db.dao import PredictionResult
repo = Repository()
def get(use_case, table, layer_name):
res = repo.get_prediction_results(use_case)
if res is None or len(res) == 0:
return Response(status=404)
else:
return [c.__dict__ for c in res]
import sys
import os
modules_path = '../../../modules/'
if os.path.exists(modules_path):
sys.path.insert(1, modules_path)
from processing.data_prep.main import run
if __name__ == '__main__':
'''Creates data/raw files'''
run(use_case='community-prediction-youtube-n')
\ No newline at end of file
import sys
import os
modules_path = '../../../modules/'
if os.path.exists(modules_path):
sys.path.insert(1, modules_path)
from security.token_manager import TokenManager
import network_constants
from db.repository import Repository
from db.dao import LayerPairDao
from typing import List, Dict
import requests
import json
def get_youtube_dependencies() -> List[LayerPairDao]:
uc = 'community-prediction-youtube-n'
layer_dependencies = [
('CategoryLayer', 'CountryLayer'),
('ViewsLayer', 'CountryLayer'),
('ViewsLayer', 'CategoryLayer'),
('LikesLayer', 'ViewsLayer'),
('DislikesLayer', 'ViewsLayer'),
('CommentCountLayer', 'ViewsLayer'),
('TrendDelayLayer', 'ViewsLayer'),
]
return [LayerPairDao(uc, uc, ld[0], ld[1]) for ld in layer_dependencies]
def get_taxi_dependencies() -> List[LayerPairDao]:
uc = 'community-prediction-taxi'
layer_dependencies = [
('CallTypeLayer', 'DayTypeLayer'),
('OriginCallLayer', 'CallTypeLayer'),
('OriginStandLayer', 'CallTypeLayer'),
('TaxiIdLayer', 'OriginStandLayer'),
('StartLocationLayer', 'OriginStandLayer'),
('EndLocationLayer', 'OriginStandLayer'),
('StartLocationLayer', 'DayTypeLayer'),
('EndLocationLayer', 'DayTypeLayer'),
]
return [LayerPairDao(uc, uc, ld[0], ld[1]) for ld in layer_dependencies]
def upload_layerpair(layerpair:LayerPairDao):
jwt = TokenManager.getInstance().getToken()
url = f'https://{network_constants.BUSINESS_LOGIC_HOSTNAME}:{network_constants.BUSINESS_LOGIC_REST_PORT}/api/use-cases/{layerpair.use_case}/tables/{layerpair.table}/layer-pairs'
response = requests.post(
url,
verify = False,
proxies = { "http":None, "https":None },
headers = {"Authorization": f"Bearer {jwt}"},
json = layerpair.__dict__
)
if response.status_code != 200:
raise ConnectionError(f"Could not upload layer pair, statuscode: {response.status_code}!")
if __name__ == '__main__':
'''Uploads the cross-context dependencies for all use-cases.'''
assert False, 'safety guard: remove this assert to actually upload the layer pairs'
for lp in get_youtube_dependencies():
upload_layerpair(lp)
for lp in get_taxi_dependencies():
upload_layerpair(lp)
\ No newline at end of file
import sys
import os
modules_path = '../../../modules/'
if os.path.exists(modules_path):
sys.path.insert(1, modules_path)
import json
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from processing.fetching import fetching
if __name__ == "__main__":
'''Fetches all required data from business-logic and role-stage-discovery.'''
fetching.fetch(selected_use_cases=['community-prediction-youtube-n'], selected_use_case_tables=None)
\ No newline at end of file
import sys
import os
modules_path = '../../../modules/'
if os.path.exists(modules_path):
sys.path.insert(1, modules_path)
from db.repository import Repository
from processing.ml.predict_single_context import run_prediction as run_single_prediction
from processing.ml.predict_cross_context import run_prediction as run_cross_prediction
if __name__ == '__main__':
'''Executes the predictions.'''
use_case='community-prediction-youtube-n'
repo = Repository()
repo.delete_all_prediction_results()
run_single_prediction(use_case)
run_cross_prediction(use_case)
\ No newline at end of file
import sys
import os
modules_path = '../../../modules/'
if os.path.exists(modules_path):
sys.path.insert(1, modules_path)
from processing.ml.train_single_context import run_training as run_single_training
from processing.ml.train_cross_context import run_training as run_cross_training
if __name__ == '__main__':
'''Executes the training.'''
use_case='community-prediction-youtube-n'
run_single_training(use_case)
run_cross_training(use_case)
\ No newline at end of file
import unittest
import sys
for path in ['../', './']:
sys.path.insert(1, path)
# python -m unittest discover
from processing import ClusterMetricsCalculator2D
class TestClusterMetricsCalculator(unittest.TestCase):
def test__get_standard_deviation__same_points_many_decimals__zero_and_not_nan(self):
nodes = [{'f1': -8.58564, 'f2': 41.148567},
{'f1': -8.58564, 'f2': 41.148567},
{'f1': -8.58564, 'f2': 41.148567},
{'f1': -8.58564, 'f2': 41.148567},
{'f1': -8.58564, 'f2': 41.148567},
{'f1': -8.58564, 'f2': 41.148567},
{'f1': -8.58564, 'f2': 41.148567},
{'f1': -8.58564, 'f2': 41.148567},
{'f1': -8.58564, 'f2': 41.148567}]
calc = ClusterMetricsCalculator2D(nodes, ['f1','f2'], len(nodes), 1)
self.assertAlmostEqual(0, calc.get_standard_deviation())
def test__get_range__almost_linear_distribution_in_2d__euclidean_distance(self):
l = [(-8.657802, 41.160978), (-8.65782, 41.160969), (-8.657838, 41.16096)]
nodes = [{'f1': e[0], 'f2': e[1]} for e in l]
calc = ClusterMetricsCalculator2D(nodes, ['f1','f2'], len(nodes), 1)
# https://www.calculatorsoup.com/calculators/geometry-plane/distance-two-points.php
self.assertAlmostEqual(4.0E-5, calc.get_range(), 5)
if __name__ == '__main__':
unittest.main()
# ignore this folder because it contains raw jsons for the community-prediction task
**
!.gitignore
\ No newline at end of file
......@@ -162,7 +162,7 @@ paths:
'404':
description: "Layer not found"
/use-cases/{use_case}/tables/{table}/layers/{layer_name}/timeslices:
get:
operationId: "routes.timeslices.get_by_name"
security:
......
......@@ -23,14 +23,22 @@ class TimeSlice:
if time_slice_dict is not None:
self.from_serializable_dict(time_slice_dict, from_db)
def init_all_clusters(self, cluster_labels: List[str]):
'''Initializes internal clusters for all labels with an empty list.'''
for cluster_label in cluster_labels:
# only string keys can be stored in json
cluster_label = str(cluster_label)
self.clusters[cluster_label] = []
def add_node_to_cluster(self, cluster_label: str, node):
# only string keys can be stored in json
cluster_label = str(cluster_label)
if cluster_label not in self.clusters:
# self.clusters[cluster_label] = []
raise KeyError(f"self::init_all_clusters must be used to add all global cluster labels beforehand (got {cluster_label})")
node = self._get_unique_id(node)
# node = self._get_unique_id(node)
self.clusters[cluster_label].append(node)
def get_nodes_for_cluster(self, cluster_label: str):
......
......@@ -73,7 +73,7 @@ class Repository(MongoRepositoryBase):
def add_layer_nodes(self, nodes:List[dict]):
super().insert_many(self._layer_nodes_collection, nodes)
def get_layer_nodes(self, use_case: str, use_case_table: str, layer_name: str) -> dict:
def get_layer_nodes(self, use_case: str, use_case_table: str, layer_name: str) -> List[dict]:
'''Returns all nodes for the use-case and layer.'''
entries = super().get_entries(self._layer_nodes_collection, selection={'use_case': use_case, 'use_case_table': use_case_table, 'layer_name': layer_name}, projection={'_id': 0})
return [e for e in entries]
......
......@@ -13,12 +13,16 @@ from typing import Tuple, Dict, Any, List
TimeSliceKey = Tuple[int, int]
# TODO extract information about time features (maybe from table mapping)
TIME_PROPERTY_NAMES = ['timestamp']
repo = Repository()
def convert_to_time_slice_key(timestamp: str) -> TimeSliceKey:
'''Returns the tuple (year, week_of_year) from a timestamp. This is used as the key for the slicing.'''
# time = datetime.utcfromtimestamp(float(timestamp[0:10]))
# time = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
time = datetime.utcfromtimestamp(float(timestamp))
(y, w, _) = time.isocalendar()
return (y, w)
......@@ -34,10 +38,16 @@ def split_clusterset_by_time(layer: Layer, clusters: List[Cluster]) -> Dict[Time
'''
time_slices: Dict[Any, TimeSlice] = {}
all_cluster_labels = [cluster.cluster_label for cluster in clusters if cluster.cluster_label != -1]
for cluster in clusters:
if cluster.cluster_label == -1:
print("Noise cluster was ignored.")
continue
for node in cluster.nodes:
# retrieve the (year, week) time keys the node belongs to, based on the defined timestamp fields in the schema
time_keys = set()
for time_property in TIME_PROPERTY_NAMES:
if time_property in node:
......@@ -46,28 +56,66 @@ def split_clusterset_by_time(layer: Layer, clusters: List[Cluster]) -> Dict[Time
for time_key in time_keys:
if time_key not in time_slices:
time_slices[time_key] = TimeSlice(time_key, layer.use_case, layer.use_case_table, layer.layer_name)
time_slices[time_key].init_all_clusters(all_cluster_labels)
time_slices[time_key].add_node_to_cluster(cluster.cluster_label, node)
return time_slices
if __name__ == "__main__":
repo = Repository()
def get_layers() -> List[Layer]:
return repo.get_layers()
def get_clusters_for_layer(use_case, use_case_table, layer_name)-> List[Cluster]:
# return repo.get_clusters_for_layer(use_case, use_case_table, layer_name)
json_path = f'_predictions/clusters/{layer_name}.json'
if os.path.exists(json_path):
with open(json_path, 'r') as file:
return [Cluster(cluster_dict=e, from_db=False) for e in json.loads(file.read())]
return []
def get_layer_nodes(use_case, use_case_table, layer_name)-> List[dict]:
# return repo.get_layer_nodes(use_case, use_case_table, layer_name)
return []
def add_time_slice(timeslice):
try:
repo.add_time_slice(timeslice)
pass
except:
print(f"Error while storing time slice in db for {timeslice.layer_name}")
# try:
# json_path = f'_predictions/timeslices/{timeslice.layer_name}/{timeslice.time}.json'.replace(', ', '_').replace('(', '').replace(')', '')
# if not os.path.exists(os.path.dirname(json_path)):
# os.makedirs(os.path.dirname(json_path))
# with open(json_path, 'w') as file:
# file.write(json.dumps(timeslice.to_serializable_dict(for_db=False)))
# except Exception as e:
# print(f"Error while writing json for {timeslice.layer_name}: {e}")
def run_time_slicing(selected_use_cases: List[str] = None, selected_use_case_tables: List[str] = None, selected_layer_names: List[str] = None):
layers = get_layers()
for layer in layers:
layer_name = layer.layer_name
use_case = layer.use_case
use_case_table = layer.use_case_table
# skip layers not in the params
if selected_use_cases is not None and use_case not in selected_use_cases \
or selected_use_case_tables is not None and use_case_table not in selected_use_case_tables \
or selected_layer_names is not None and layer_name not in selected_layer_names:
continue
print(f"Working on {use_case}//{use_case_table}//{layer_name}.")
clusters_for_layer = get_clusters_for_layer(use_case, use_case_table, layer_name)
# if no clusters were generated use one large cluster instead of skipping the layer
if clusters_for_layer is None or len(clusters_for_layer) == 0:
nodes = get_layer_nodes(use_case, use_case_table, layer_name)
if nodes is None or len(nodes) == 0:
print("Skipping, because there are no clusters and no nodes for the layer.")
continue
......@@ -76,4 +124,9 @@ if __name__ == "__main__":
time_slices = split_clusterset_by_time(layer, clusters_for_layer)
for k,v in time_slices.items():
add_time_slice(v)
if __name__ == "__main__":
# repo.remove_all_time_slices()
run_time_slicing(selected_use_cases=['community-prediction-youtube-n'])
\ No newline at end of file
# this file contains all the training data but is around 1.8 GB.
train.csv
\ No newline at end of file
import csv
import hashlib
import sys
import os
modules_paths = ['.', '../../../modules/']
for modules_path in modules_paths:
if os.path.exists(modules_path):
sys.path.insert(1, modules_path)
from messaging.MessageHandler import MessageHandler
from db.repository import Repository
# file to read the data from
CSV_FILE = r'dummy_upload/community-prediction-taxi/train.csv'
handler = MessageHandler(Repository())
import csv
import json
from datetime import datetime
from typing import Iterator
enum_mapping = {'A': 1, 'B': 2, 'C': 3}
def load_csv_content() -> Iterator:
'''Returns a generator for all lines in the csv file with correct field types.'''
with open(CSV_FILE) as csv_file:
reader = csv.reader(csv_file)
headers = [h.lower() for h in next(reader)]
for line in reader:
# convert line fields to correct type
for i in range(len(headers)):
# trip_id AS string
if i == 0:
continue
# call_type, day_type
if i in [1, 6]:
line[i] = enum_mapping[line[i]]
# origin_call, origin_stand, taxi_id AS int
elif i in [2, 3, 4]:
line[i] = int(line[i]) if line[i] != "" else ""
# timestamp AS timestamp
elif i == 5:
# datetime is not serializable
# line[i] = datetime.fromtimestamp(int(line[i]))
line[i] = int(line[i])
# missing_data AS bool
elif i == 7:
line[i] = line[i].lower() == 'true'
# polyline AS List[List[float]]
elif i == 8:
line[i] = json.loads(line[i])
entry = dict(zip(headers, line))
yield entry
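# Illustrative yield (values are made up; keys are the lower-cased CSV headers referenced
# by the index comments above):
#   {'trip_id': '1372636858620000589', 'call_type': 3, 'origin_call': '', 'origin_stand': 7,
#    'taxi_id': 20000589, 'timestamp': 1372636858, 'day_type': 1, 'missing_data': False,
#    'polyline': [[-8.618643, 41.141412], [-8.618499, 41.141376]]}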
def upload_transaction(transaction):
# manually flatten based on table mapping
uid = transaction['trip_id']
transaction['UniqueID'] = uid
if len(transaction['polyline']) == 0:
print(f"skipping transaction: {transaction}")
return
transaction['start_location_lat'] = transaction['polyline'][0][0]
transaction['start_location_long'] = transaction['polyline'][0][1]
transaction['end_location_lat'] = transaction['polyline'][-1][0]
transaction['end_location_long'] = transaction['polyline'][-1][1]
del transaction['trip_id']
del transaction['polyline']
t = {
'use_case': 'community-prediction-taxi',
'table': 'community-prediction-taxi',
'id': uid,
'properties': transaction,
}
handler.handle_new_trace(t)
if __name__ == '__main__':
entries = load_csv_content()
for idx, transaction in enumerate(entries):
upload_transaction(transaction)
if idx % 1000 == 0:
print(f"Progress: {str(float(idx) / 1710671)} %")
\ No newline at end of file
# this file contains all the training data but is large.
videos.csv
\ No newline at end of file
import csv
import hashlib
import sys
import os
modules_paths = ['.', '../../../modules/']
for modules_path in modules_paths:
if os.path.exists(modules_path):
sys.path.insert(1, modules_path)
from messaging.MessageHandler import MessageHandler
from db.repository import Repository
# file to read the data from
CSV_FILE = r'dummy_upload/community-prediction-youtube/videos.csv'
handler = MessageHandler(Repository())
import csv
import json
from datetime import datetime
from typing import Iterator
import pandas as pd
from pandas import DataFrame
def load_csv_content() -> Iterator:
'''Returns a generator for all lines in the csv file with correct field types.'''
dfs: DataFrame = pd.read_csv(CSV_FILE)
return dfs.iterrows()
def upload_transaction(transaction):
# manually flatten based on table mapping
uid = transaction['video_id']
transaction['UniqueID'] = uid
transaction['trend_delay'] = transaction['trend_duration']
transaction['timestamp'] = transaction['trending_timestamp']
del transaction['trend_duration']
del transaction['trending_timestamp']
t = {
'use_case': 'community-prediction-youtube-n',
'table': 'community-prediction-youtube-n',
'id': uid,
'properties': transaction,
}
handler.handle_new_trace(t)
if __name__ == '__main__':
entries = load_csv_content()
for idx, transaction in entries:
transaction = transaction.to_dict()
upload_transaction(transaction)
if idx % 1000 == 0:
print(f"Progress: {str(float(idx) / 375942)} %")
\ No newline at end of file
......@@ -7,9 +7,9 @@ def get_resources_path():
try:
return os.environ['ARTICONF_RESOURCES_PATH']
except:
return '/srv/articonf'
else:
return '/srv/articonf'
pass
return '/srv/articonf'
def is_running_locally():
'''Set env var ARTICONF_LOCAL=1 to run locally.'''
......
......@@ -55,6 +55,17 @@ else:
ROLESTAGE_DISCOVERY_DB_HOSTNAME = 'articonf1.itec.aau.at'
ROLESTAGE_DISCOVERY_DB_PORT = 30104
## Proactive Community Detection
if server:
PROACTIVE_COMMUNITY_DETECTION_HOSTNAME = 'proactive-community-detection'
PROACTIVE_COMMUNITY_DETECTION_REST_PORT = 80
PROACTIVE_COMMUNITY_DETECTION_DB_HOSTNAME = f'{ROLESTAGE_DISCOVERY_HOSTNAME}-db'
PROACTIVE_COMMUNITY_DETECTION_DB_PORT = 27017
else:
PROACTIVE_COMMUNITY_DETECTION_HOSTNAME = 'articonf1.itec.aau.at'
PROACTIVE_COMMUNITY_DETECTION_REST_PORT = 30105
PROACTIVE_COMMUNITY_DETECTION_DB_HOSTNAME = 'articonf1.itec.aau.at'
PROACTIVE_COMMUNITY_DETECTION_DB_PORT = 30106
#endregion Data Hub
#region Rest Gateway
......
import sys
import os
from pathlib import Path
from typing import Dict, Any
import requests
modules_path = '../../../modules/'
if os.path.exists(modules_path):
sys.path.insert(1, modules_path)
import network_constants as nc
from security.token_manager import TokenManager
import tables.add_table as add_table
def add_use_case(use_case: str):
jwt = TokenManager.getInstance().getToken()
url = f"https://articonf1.itec.aau.at:30420/api/use-cases"
response = requests.post(
url,
verify=False,
proxies = { "http":None, "https":None },
headers = { "Authorization": f"Bearer {jwt}"},
json = {"name": use_case}
)
print(url+": "+str(response.status_code))
if __name__ == "__main__":
# TODO strategy pattern for all the use cases where add_use_case() etc. is clear duplicate code
# The same for add table, where almost everything is the same (UniqueId, Timestamp for every Layer)
# Only need to get jwt token once in the strategy impl
use_case = "community-prediction-taxi"
# disable ssl warnings :)
requests.packages.urllib3.disable_warnings()
add_use_case(use_case)
add_table.main(use_case=use_case)
\ No newline at end of file
import network_constants as nc
from security.token_manager import TokenManager
import requests
def add_table(use_case: str, table_name: str):
''' Adds the use-case table with all the mappings as dict Internal -> External. '''
jwt = TokenManager.getInstance().getToken()
mapping = { c : c for c in [
# mapping does not change any of the names for these properties
'call_type',
'origin_call',
'origin_stand',
'taxi_id',
'timestamp',
'day_type',
'missing_data',
] }
mapping['start_location_lat'] = 'polyline[0][0]'
mapping['start_location_long'] = 'polyline[0][1]'
mapping['end_location_lat'] = 'polyline[-1][0]'
mapping['end_location_long'] = 'polyline[-1][1]'
mapping["UniqueID"] = "trip_id"
url = f"https://articonf1.itec.aau.at:30420/api/use-cases/{use_case}/tables"
table = {
"name": table_name,
"mappings": mapping
}
response = requests.post(
url,
verify=False,
proxies = { "http":None, "https":None },
headers = { "Authorization": f"Bearer {jwt}"},
json = table
)
print(url+": "+str(response.status_code))
def add_layers(use_case:str, table_name: str):
jwt = TokenManager.getInstance().getToken()
layers = [
{
"name": "CallTypeLayer",
"properties": [
"UniqueID",
"call_type",
],
"cluster_properties": [
"call_type"
]
},
{
"name": "DayTypeLayer",
"properties": [
"UniqueID",
"day_type",
],
"cluster_properties": [
"day_type"
]
},
{
"name": "OriginCallLayer",
"properties": [
"UniqueID",
"call_type",
"origin_call",
],
"cluster_properties": [
"call_type",
"origin_call",
]
},
{
"name": "OriginStandLayer",
"properties": [
"UniqueID",
"call_type",
"origin_stand",
],
"cluster_properties": [
"call_type",
"origin_stand",
]
},
{
"name": "TaxiIdLayer",
"properties": [
"UniqueID",
"taxi_id",
],
"cluster_properties": [
"taxi_id",
]
},
{
"name": "StartLocationLayer",
"properties": [
"UniqueID",
"start_location_lat",
"start_location_long",
],
"cluster_properties": [
"start_location_lat",
"start_location_long",
]
},
{
"name": "EndLocationLayer",
"properties": [
"UniqueID",
"end_location_lat",
"end_location_long",
],
"cluster_properties": [
"end_location_lat",
"end_location_long",
]
},
]
for layer in layers:
# add basic info to each layer
layer["use_case"] = use_case
layer["table"] = table_name
layer["properties"].append("timestamp")
url = f"https://articonf1.itec.aau.at:30420/api/layers"
response = requests.post(
url,
verify=False,
proxies = { "http":None, "https":None },
headers = { "Authorization": f"Bearer {jwt}"},
json = layer
)
print(url+": "+str(response.status_code))
def main(use_case: str, table_name: str = "community-prediction-taxi"):
add_table(use_case, table_name)
add_layers(use_case, table_name)
\ No newline at end of file
import sys
import os
from pathlib import Path
from typing import Dict, Any
import requests
modules_path = '../../../modules/'
if os.path.exists(modules_path):
sys.path.insert(1, modules_path)
import network_constants as nc
from security.token_manager import TokenManager
import tables.add_table as add_table
def add_use_case(use_case: str):
jwt = TokenManager.getInstance().getToken()
url = f"https://articonf1.itec.aau.at:30420/api/use-cases"
response = requests.post(
url,
verify=False,
proxies = { "http":None, "https":None },
headers = { "Authorization": f"Bearer {jwt}"},
json = {"name": use_case}
)
print(url+": "+str(response.status_code))
if __name__ == "__main__":
# TODO strategy pattern for all the use cases where add_use_case() etc. is clear duplicate code
# The same for add table, where almost everything is the same (UniqueId, Timestamp for every Layer)
# Only need to get jwt token once in the strategy impl
use_case = "community-prediction-youtube"
# disable ssl warnings :)
requests.packages.urllib3.disable_warnings()
add_use_case(use_case)
add_table.main(use_case=use_case)
\ No newline at end of file
import network_constants as nc
from security.token_manager import TokenManager
import requests
def add_table(use_case: str, table_name: str):
''' Adds the use-case table with all the mappings as dict Internal -> External. '''
jwt = TokenManager.getInstance().getToken()
mapping = { c : c for c in [
# mapping does not change any of the names for these properties
'trending_date',
'title',
'channel_title',
'category_id',
'category_name',
'publish_time',
'tags',
'views',
'likes',
'dislikes',
'comment_count',
'comments_disabled',
'ratings_disabled',
'video_error_or_removed',
'country_code',
'country_id',
'publish_timestamp',
] }
mapping["UniqueID"] = "video_id"
mapping["trend_delay"] = "trend_duration"
mapping["timestamp"] = "trending_timestamp"
url = f"https://articonf1.itec.aau.at:30420/api/use-cases/{use_case}/tables"
table = {
"name": table_name,
"mappings": mapping
}
response = requests.post(
url,
verify=False,
proxies = { "http":None, "https":None },
headers = { "Authorization": f"Bearer {jwt}"},
json = table
)
print(url+": "+str(response.status_code))
def add_layers(use_case:str, table_name: str):
jwt = TokenManager.getInstance().getToken()
layers = [
{
"name": "CategoryLayer",
"properties": [
"UniqueID",
"category_id",
"category_name",
],
"cluster_properties": [
"category_id"
]
},
{
"name": "ViewsLayer",
"properties": [
"UniqueID",
"views",
],
"cluster_properties": [
"views"
]
},
{
"name": "LikesLayer",
"properties": [
"UniqueID",
"likes",
],
"cluster_properties": [
"likes"
]
},
{
"name": "DislikesLayer",
"properties": [
"UniqueID",
"dislikes",
],
"cluster_properties": [
"dislikes"
]
},
{
"name": "CommentCountLayer",
"properties": [
"UniqueID",
"comment_count",
],
"cluster_properties": [
"comment_count"
]
},
{
"name": "CountryLayer",
"properties": [
"UniqueID",
"country_code",
"country_id",
],
"cluster_properties": [
"country_id"
]
},
{
"name": "TrendDelayLayer",
"properties": [
"UniqueID",
"trend_delay",
],
"cluster_properties": [
"trend_delay"
]
},
]
for layer in layers:
# add basic info to each layer
layer["use_case"] = use_case
layer["table"] = table_name
layer["properties"].append("timestamp")
url = f"https://articonf1.itec.aau.at:30420/api/layers"
response = requests.post(
url,
verify=False,
proxies = { "http":None, "https":None },
headers = { "Authorization": f"Bearer {jwt}"},
json = layer
)
print(url+": "+str(response.status_code))
def main(use_case: str, table_name: str = "community-prediction-youtube"):
add_table(use_case, table_name)
add_layers(use_case, table_name)
\ No newline at end of file
paths:
#####
# USE-CASES
#####
/use-cases:
post:
security:
......@@ -57,9 +55,8 @@ paths:
description: "Successful Request"
'403':
description: "Confirmation required"
# region tables
/tables:
get:
security:
......@@ -187,13 +184,9 @@ paths:
responses:
'200':
description: "Successful Request"
# endregion tables
# region layers
/layers:
get:
security:
......@@ -515,13 +508,9 @@ paths:
description: "Field in request is missing"
'403':
description: "Confirmation required"
# endregion layers
# region enums
/enums:
get:
security:
......@@ -618,6 +607,66 @@ paths:
description: "Successful Request"
'404':
description: "Enum does not exist"
# endregion enums
# region context dependencies for community prediction
/use-cases/{use_case}/tables/{table}/layer-pairs:
get:
security:
- JwtRegular: []
operationId: "routes.context_pairs.get_all"
tags:
- "LayerPairs"
summary: "Retrieve all layer pairs for cluster prediction"
description: "Retrieve all layer pairs for cluster prediction"
parameters:
- name: "use_case"
in: "path"
description: "Name of the use-case"
required: true
type: "string"
- name: "table"
in: "path"
description: "Name of the table"
required: true
type: "string"
responses:
'200':
description: "Successful Request"
schema:
$ref: '#/definitions/LayerPair'
'404':
description: "pairs do not exist"
post:
security:
- JwtRegular: []
operationId: "routes.context_pairs.insert"
tags:
- "LayerPairs"
summary: "Add a new layer pair for cluster prediction"
description: "Add a new layer pair for cluster prediction"
parameters:
- name: "use_case"
in: "path"
description: "Name of the use-case"
required: true
type: "string"
- name: "table"
in: "path"
description: "Name of the table"
required: true
type: "string"
- name: "layer_pair"
in: "body"
required: true
schema:
$ref: '#/definitions/LayerPair'
responses:
'200':
description: "Successful Request"
'400':
description: "incorrect format etc"
# endregion context dependencies
definitions:
LayerMapping:
......@@ -684,4 +733,16 @@ definitions:
type: array
items:
type: string
example: "internal_property_1"
\ No newline at end of file
example: "internal_property_1"
LayerPair:
type: "object"
properties:
use_case:
type: string
table:
type: string
layer:
type: string
reference_layer:
type: string
\ No newline at end of file
from typing import List, Dict
class LayerPair:
def __init__(self, use_case: str, table: str, layer: str, reference_layer: str):
self.use_case = use_case
self.table = table
self.layer = layer
self.reference_layer = reference_layer
@staticmethod
def create_from_dict(dict_) -> 'LayerPair':
lp = LayerPair(None, None, None, None)
lp.__dict__.update(dict_)
return lp
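# Usage sketch (illustrative; the names are placeholders): round-trip between the entity
# and its dict representation.
#   lp = LayerPair('my-use-case', 'my-table', 'LikesLayer', 'ViewsLayer')
#   assert LayerPair.create_from_dict(lp.__dict__).layer == 'LikesLayer'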
......@@ -3,13 +3,14 @@ import network_constants as netconst
from database.MongoRepositoryBase import MongoRepositoryBase
from db.entities.layer_adapter import LayerAdapter
from db.entities.use_case import UseCase
from db.entities.layer_pair import LayerPair
import pymongo
import json
from typing import List, Dict
class Repository(MongoRepositoryBase):
'''This is a LAYER repository for MongoDb.'''
def __init__(self):
super().__init__(netconst.BUSINESS_LOGIC_DB_HOSTNAME,
......@@ -18,6 +19,7 @@ class Repository(MongoRepositoryBase):
self._adapter_collection = 'layer_adapters'
self._use_case_collection = 'use_cases'
self._layer_pair_collection = 'contextpairs'
def all(self) -> List[Dict]:
result = super().get_entries(self._adapter_collection, projection={'_id': False})
......@@ -59,4 +61,17 @@ class Repository(MongoRepositoryBase):
def delete(self, adapter : LayerAdapter):
collection = self._database[self._adapter_collection]
collection.delete_many({"name": adapter.name, "use_case": adapter.use_case, "table": adapter.table})
\ No newline at end of file
collection.delete_many({"name": adapter.name, "use_case": adapter.use_case, "table": adapter.table})
# region context pairs
def get_layer_pairs(self, use_case, table) -> List[LayerPair]:
return \
[LayerPair.create_from_dict(entry) for entry in
super().get_entries(self._layer_pair_collection, projection={'_id': False}, selection={"use_case": use_case, "table": table})
]
def insert_layer_pair(self, uc, table, pair: LayerPair):
super().insert_entry(self._layer_pair_collection, pair.__dict__)
# endregion context pairs
\ No newline at end of file
import json
from flask import Response, request
from db.repository import Repository
from db.entities.layer_pair import LayerPair
repo = Repository()
def get_all(use_case: str, table: str):
return [e.__dict__ for e in repo.get_layer_pairs(use_case, table)]
def insert(use_case: str, table: str, layer_pair: dict):
repo.insert_layer_pair(use_case, table, LayerPair.create_from_dict(layer_pair))
return Response(status=200)
\ No newline at end of file
......@@ -4,7 +4,7 @@ from icecream import ic
def httpget(url):
token = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1c2VybmFtZSI6InJlZ3VsYXJAaXRlYy5hYXUuYXQiLCJjcmVhdGVkX2F0IjoiMjAyMS0wNC0wNyAwODo0MDo0NS4yODEwOTYiLCJ2YWxpZF91bnRpbCI6IjIwMjEtMDQtMDggMDg6NDA6NDUuMjgxMDk2In0.oIDOEYy8bmIR3AHDRU-T0upYU0Wcz7V4FYzO5tSaSzk'
res = requests.get(url,
verify=False,
headers = { "Authorization": f"Bearer {token}"})
......@@ -32,6 +32,9 @@ count_data(res.json())
res_f = httpget(url = 'https://articonf1.itec.aau.at:30001/api/use_cases/crowd-journalism/transactions-failed')
count_data(res_f.json(), 'docType')
res_d = httpget(url = 'https://articonf1.itec.aau.at:30001/api/use_cases/crowd-journalism/transactions-duplicated')
count_data(res_d.json())
# failed tags: the "tag" is missing, but is called name
# failed purchases: duplicate keys generated from (userid, videoid, ownerid)
# failed classifications: impact is missing
......
import requests
requests.packages.urllib3.disable_warnings()
from icecream import ic
uc = 'community-prediction-youtube'
def httpget(url):
token = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1c2VybmFtZSI6InJlZ3VsYXJAaXRlYy5hYXUuYXQiLCJjcmVhdGVkX2F0IjoiMjAyMS0wNS0wNSAxMTozNjozOC4yMzAxODEiLCJ2YWxpZF91bnRpbCI6IjIwMjEtMDUtMDYgMTE6MzY6MzguMjMwMTgxIn0.Fz6iPpA0CnrXlOCj-VuCHFzc58H9Of2cBYHOb_RqvzI'
res = requests.get(url,
verify=False,
headers = { "Authorization": f"Bearer {token}"})
return res
# list tables
res = httpget(url = f'https://articonf1.itec.aau.at:30420/api/use-cases/{uc}/tables')
print("Tables: ", [entry['name'] for entry in res.json()])
# count pushed data
def count_data(json_res, table_identifier='table'):
tables = {}
for entry in json_res:
key = entry[table_identifier]
if key not in tables:
tables[key] = 0
tables[key] += 1
ic(tables)
res = httpget(url = f'https://articonf1.itec.aau.at:30001/api/use_cases/{uc}/transactions')
count_data(res.json())
res_f = httpget(url = f'https://articonf1.itec.aau.at:30001/api/use_cases/{uc}/transactions-failed')
count_data(res_f.json(), 'docType')
res_d = httpget(url = f'https://articonf1.itec.aau.at:30001/api/use_cases/{uc}/transactions-duplicated')
count_data(res_d.json())
\ No newline at end of file