Commit e0a467cf authored by Alexander Lercher's avatar Alexander Lercher

Fetching data for community prediction

Use-cases, layers, clusters, timeslices, layerpairs
parent b1a8e730
@@ -2,13 +2,15 @@ FROM python:3
LABEL maintainer="Alexander Lercher"
RUN apt-get update
RUN pip install flask
RUN pip install connexion[swagger-ui]
EXPOSE 5000
WORKDIR /app
COPY src/data-hub/role-stage-discovery-microservice/app/requirements.txt /app/
RUN pip install -r requirements.txt
COPY src/modules/ /app/
COPY src/data-hub/proactive-community-detection-microservice/app/ /app/
RUN chmod a+x main.py
from db.dao.cluster import Cluster as ClusterDao
from db.dao.layer import Layer as LayerDao
from db.dao.timeslice import TimeSlice as TimeSliceDao
from db.dao.layer_pair import LayerPair as LayerPairDao
import json
from typing import List, Dict
from datetime import date, datetime
class Cluster:
'''
A cluster for an arbitrary layer containing some nodes.
:param use_case: The use-case of the layer
:param use_case_table: The use-case table of the layer
:param layer_name: The name of the layer in which the cluster is located
:param cluster_label: The label of the cluster unique for the layer
:param nodes: The individual nodes of the cluster
:param label: A human readable label
'''
def __init__(self, use_case: str = None, use_case_table: str = None, layer_name: str = None,
cluster_label: int = None, nodes: List[Dict] = None, label: str = None,
cluster_dict: Dict = None, from_db=False):
self.use_case = use_case
self.use_case_table = use_case_table
self.layer_name = layer_name
self.cluster_label = cluster_label
self.nodes = nodes
self.label = label
if cluster_dict is not None:
self.from_serializable_dict(cluster_dict, from_db)
def to_serializable_dict(self, for_db=False) -> Dict:
return {
"use_case": self.use_case,
"use_case_table": self.use_case_table,
"layer_name": self.layer_name,
"cluster_label": self.cluster_label,
"nodes": json.dumps(self.nodes) if for_db else self.nodes,
"label": self.label,
}
def from_serializable_dict(self, cluster_dict: Dict, from_db=False):
self.use_case = cluster_dict["use_case"]
self.use_case_table = cluster_dict["use_case_table"]
self.layer_name = cluster_dict["layer_name"]
self.cluster_label = cluster_dict["cluster_label"]
self.nodes = json.loads(cluster_dict["nodes"]) \
if from_db else cluster_dict["nodes"]
self.label = cluster_dict["label"]
def __repr__(self):
return json.dumps(self.to_serializable_dict())
def __str__(self):
return f"Cluster({self.__repr__()})"
import json
from datetime import datetime
from typing import Dict
class Layer:
'''
This class represents a single layer of the Multilayer Graph.
:param layer_info: Information as dictionary to restore the layer object.
'''
def __init__(self, layer_info: Dict = None, from_db=False):
if layer_info is not None:
self.from_serializable_dict(layer_info, from_db)
def to_serializable_dict(self, for_db=False) -> Dict:
return {
"layer_name": self.layer_name,
"properties": self.properties,
"use_case": self.use_case,
"use_case_table": self.use_case_table,
"total_properties": self.total_properties,
}
@staticmethod
def from_business_logic_dict(layer_info: Dict):
layer = Layer()
layer.layer_name = layer_info["name"]
layer.properties = layer_info["cluster_properties"]
layer.total_properties = layer_info["properties"]
layer.use_case = layer_info["use_case"]
layer.use_case_table = layer_info["table"]
return layer
def from_serializable_dict(self, layer_info: Dict, from_db=False):
self.layer_name = layer_info['layer_name']
self.properties = layer_info['properties']
self.use_case = layer_info["use_case"]
self.use_case_table = layer_info["use_case_table"]
self.total_properties = layer_info["total_properties"] if "total_properties" in layer_info else None
def __repr__(self):
return json.dumps(self.to_serializable_dict())
def __str__(self):
return f"Layer({self.__repr__()})"
from typing import List, Dict
class LayerPair:
def __init__(self, use_case: str, table: str, layer: str, reference_layer: str):
self.use_case = use_case
self.table = table
self.layer = layer
self.reference_layer = reference_layer
@staticmethod
def create_from_dict(dict_) -> 'LayerPair':
lp = LayerPair(None, None, None, None)
lp.__dict__.update(dict_)
return lp
import json
from typing import List, Dict, NewType, Any
from datetime import date, datetime
Node = NewType('Node', dict)
class TimeSlice:
'''
A time slice for a single layer containing all nodes for that time.
:param time: The tag indicating the time
:param layer_name: The name of the layer the nodes belong to
'''
def __init__(self, time: Any = None, use_case: str = None, use_case_table: str = None, layer_name: str = None,
time_slice_dict: Dict = None, from_db = False):
self.time = str(time)
self.use_case = use_case
self.use_case_table = use_case_table
self.layer_name = layer_name
self.clusters: Dict[str, List[Node]] = {}
if time_slice_dict is not None:
self.from_serializable_dict(time_slice_dict, from_db)
def init_all_clusters(self, cluster_labels: List[str]):
'''Initializes internal clusters for all labels with an empty list.'''
for cluster_label in cluster_labels:
# only string keys can be stored in json
cluster_label = str(cluster_label)
self.clusters[cluster_label] = []
def add_node_to_cluster(self, cluster_label: str, node):
# only string keys can be stored in json
cluster_label = str(cluster_label)
if cluster_label not in self.clusters:
# self.clusters[cluster_label] = []
raise KeyError(f"self::init_all_clusters must be used to add all global cluster labels beforehand (got {cluster_label})")
# node = self._get_unique_id(node)
self.clusters[cluster_label].append(node)
def get_nodes_for_cluster(self, cluster_label: str):
if cluster_label in self.clusters:
return self.clusters[cluster_label]
else:
return []
def _get_unique_id(self, node : Dict) -> Dict:
'''Returns a new dict with the unique id only.'''
uid_key = 'UniqueID'
if uid_key in node:
return {uid_key: node[uid_key]}
def to_serializable_dict(self, for_db=False) -> Dict:
return {
"time": self.time,
"use_case": self.use_case,
"use_case_table": self.use_case_table,
'layer_name': self.layer_name,
"clusters": json.dumps(self.clusters) if for_db else self.clusters
}
def from_serializable_dict(self, dict: Dict, from_db=False):
self.time = dict["time"]
self.use_case = dict["use_case"]
self.use_case_table = dict["use_case_table"]
self.layer_name = dict['layer_name']
self.clusters = json.loads(dict['clusters']) if from_db else dict['clusters']
def __repr__(self):
return json.dumps(self.to_serializable_dict())
def __str__(self):
return f"TimeSlice({self.__repr__()})"
import pymongo
import network_constants as netconst
from database.MongoRepositoryBase import MongoRepositoryBase
import json
from db.dao import *
from typing import List
import logging
LOGGER = logging.getLogger(__name__)
class Repository(MongoRepositoryBase):
'''This is a repository for MongoDb.'''
def __init__(self):
super().__init__(netconst.PROACTIVE_COMMUNITY_DETECTION_DB_HOSTNAME,
netconst.PROACTIVE_COMMUNITY_DETECTION_DB_PORT,
'proactiveCommunityDb')
self._layer_collection = 'layers'
self._layer_pair_collection = 'layer_pairs'
self._clusters_collection = 'clusters'
self._time_slice_collection = 'time_slices'
#region Layers
def add_layer(self, layer: LayerDao):
super().insert_entry(self._layer_collection, layer.to_serializable_dict())
def get_layers(self) -> List[LayerDao]:
'''Retrieves all layers from the db, independent of use-case.'''
entries = super().get_entries(self._layer_collection, projection={'_id': 0})
return [LayerDao(e) for e in entries]
def get_layers_for_use_case(self, use_case: str) -> List[LayerDao]:
entries = super().get_entries(self._layer_collection, selection={'use_case': use_case})
return [LayerDao(e) for e in entries]
def get_layers_for_table(self, use_case: str, use_case_table: str) -> List[LayerDao]:
entries = super().get_entries(self._layer_collection, selection={'use_case': use_case, 'use_case_table': use_case_table})
return [LayerDao(e) for e in entries]
def get_layer_by_name(self, use_case:str, use_case_table:str, layer_name:str) -> LayerDao:
'''Returns a single layer for the given use-case, table, and layer name.'''
entries = super().get_entries(self._layer_collection, selection={'use_case': use_case, 'use_case_table': use_case_table, 'layer_name': layer_name})
entries = [LayerDao(e) for e in entries]
if entries is not None and len(entries) > 0:
if len(entries) > 1:
LOGGER.error(f"Layer Key {use_case}, {layer_name} is not unique.")
return entries[0]
else:
return None
def delete_all_layers(self):
super().drop_collection(self._layer_collection)
#endregion Layers
#region Clusters
def add_cluster(self, cluster: ClusterDao):
super().insert_entry(self._clusters_collection, cluster.to_serializable_dict(for_db=True))
def add_clusters(self, clusters: List[ClusterDao]):
cluster_dicts = [c.to_serializable_dict(for_db=True) for c in clusters]
super().insert_many(self._clusters_collection, cluster_dicts)
def get_clusters_for_layer(self, use_case: str, use_case_table: str, layer_name: str) -> List[ClusterDao]:
entries = super().get_entries(self._clusters_collection, selection={'use_case': use_case, 'use_case_table': use_case_table, 'layer_name': layer_name}, projection={'_id': 0})
return [ClusterDao(cluster_dict=e, from_db=True) for e in entries]
def delete_all_clusters(self):
super().drop_collection(self._clusters_collection)
#endregion
#region TimeSlice
def add_time_slice(self, timeslice: TimeSliceDao):
super().insert_entry(self._time_slice_collection, timeslice.to_serializable_dict(for_db=True))
def get_time_slices(self) -> List[TimeSliceDao]:
'''Returns all time slices.'''
entries = super().get_entries(self._time_slice_collection)
return [TimeSliceDao(None, None, time_slice_dict=e, from_db=True) for e in entries]
def get_time_slices_by_name(self, use_case: str, use_case_table: str, layer_name: str) -> List[TimeSliceDao]:
'''Returns all time slices with the given layer_name.'''
entries = super().get_entries(self._time_slice_collection, selection={'use_case': use_case, 'use_case_table': use_case_table, 'layer_name': layer_name})
return [TimeSliceDao(time_slice_dict=e, from_db=True) for e in entries]
def remove_all_time_slices(self):
super().drop_collection(self._time_slice_collection)
#endregion
#region LayerPair
def add_layer_pair(self, layer_pair: LayerPairDao):
super().insert_entry(self._layer_pair_collection, layer_pair.__dict__)
def get_layer_pairs(self, use_case: str, use_case_table: str) -> List[LayerPairDao]:
entries = super().get_entries(self._layer_pair_collection, selection={'use_case': use_case, 'table': use_case_table})
return [LayerPairDao.create_from_dict(e) for e in entries]
#endregion
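# Illustrative usage sketch (not part of this commit), assuming the MongoDB instance
# configured in network_constants is reachable. The layer values are made up.
repo = Repository()
repo.add_layer(LayerDao(layer_info={'layer_name': 'DemoLayer', 'properties': ['Lat', 'Long'],
'use_case': 'demo-use-case', 'use_case_table': 'demo-table',
'total_properties': ['Lat', 'Long', 'UniqueID']}))
demo_layer = repo.get_layer_by_name('demo-use-case', 'demo-table', 'DemoLayer')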
from flask import request
def echo():
import processing.fetching.fetching as f
# print(f._fetch_use_cases())
print(f._fetch_use_cases())
return request.json
\ No newline at end of file
from entities.timewindow import TimeWindow
from entities.cluster import Cluster
from entities.layer import Layer
\ No newline at end of file
# from __future__ import annotations
from typing import Dict, List, Iterable, Any, Tuple
from entities.timewindow import TimeWindow
import numpy as np
import scipy.spatial
from processing import ClusterMetricsCalculatorFactory
class Cluster:
'''A cluster from one time window containing all metrics used for machine learning.'''
def __init__(self, time_window_id: Any, cluster_id: Any, cluster_nodes: List[dict], cluster_feature_names: List[str], nr_layer_nodes: int, layer_diversity: int,
global_cluster_center, global_center_distance=None):
self.time_window_id = time_window_id
self.cluster_id = cluster_id
metrics_calculator = ClusterMetricsCalculatorFactory.create_metrics_calculator(cluster_nodes, cluster_feature_names, nr_layer_nodes, layer_diversity)
self.size = metrics_calculator.get_size()
self.std_dev = metrics_calculator.get_standard_deviation()
self.scarcity = metrics_calculator.get_scarcity()
self.importance1 = metrics_calculator.get_importance1()
self.importance2 = metrics_calculator.get_importance2()
self.range_ = metrics_calculator.get_range()
self.center = metrics_calculator.get_center()
self.global_center_distance = \
scipy.spatial.distance.euclidean(self.center, global_cluster_center) \
if self.size > 0 \
else 0
def get_time_info(self) -> int:
'''Returns the week of the time tuple str, e.g. 25 for "(2014, 25)".'''
str_tuple = self.time_window_id
return int(str_tuple.split(',')[1].strip()[:-1])
def __repr__(self):
return str(self.__dict__)
def __str__(self):
return f"Cluster({self.time_window_id}, {self.cluster_id}, " \
f"{self.size}, {self.std_dev}, {self.scarcity}, " \
f"{self.importance1}, {self.importance2}, " \
f"{self.range_}, {self.center})"
@staticmethod
def create_multiple_from_time_window(time_window: TimeWindow, cluster_feature_names: List[str], global_cluster_centers: Dict[str, Tuple[float]]) -> Iterable['Cluster']:
total_layer_nodes = sum([len(nodes) for nodes in time_window.clusters.values()])
layer_diversity = len([nodes for nodes in time_window.clusters.values() if len(nodes) > 0])
for cluster_nr, cluster_nodes in time_window.clusters.items():
yield Cluster(time_window.time, cluster_nr, cluster_nodes, cluster_feature_names, total_layer_nodes, layer_diversity, global_cluster_centers[cluster_nr])
@staticmethod
def create_from_dict(dict_) -> 'Cluster':
cl = Cluster(0, 0, [], 'None', 0, 0, None)
cl.__dict__.update(dict_)
return cl
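# Illustrative usage sketch (not part of this commit): deriving per-cluster metrics from a
# single TimeWindow. The feature name, nodes, and global centers are made-up 1d examples.
tw = TimeWindow(time=(2014, 25), use_case='demo-use-case', use_case_table='demo-table', layer_name='DemoLayer')
tw.add_node_to_cluster('0', {'Price': '10'})
tw.add_node_to_cluster('0', {'Price': '12'})
tw.add_node_to_cluster('1', {'Price': '100'})
global_centers = {'0': (11.0, 0), '1': (100.0, 0)}
ml_clusters = list(Cluster.create_multiple_from_time_window(tw, 'Price', global_centers))
assert len(ml_clusters) == 2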
from typing import Dict, List, Tuple, Any
import scipy.spatial
import scipy.stats
from entities.timewindow import TimeWindow
from processing import ClusterMetricsCalculatorFactory
class InternalCluster:
def __init__(self, cluster_id, cluster_nodes: List[dict], feature_names:List[str], global_cluster_center: Tuple[float], n_layer_nodes: int):
self.cluster_id = cluster_id
metrics_calculator = ClusterMetricsCalculatorFactory.create_metrics_calculator(cluster_nodes, feature_names, n_layer_nodes, None)
self.size = metrics_calculator.get_size()
self.relative_size = metrics_calculator.get_importance1()
self.center = metrics_calculator.get_center()
if self.size > 0:
self.global_center_distance = scipy.spatial.distance.euclidean(self.center, global_cluster_center)
else:
self.global_center_distance = 0
@staticmethod
def create_many_from_cluster_nodes(clusters: Dict[str, List[dict]], feature_names: List[str], global_cluster_centers: Dict[str, Tuple[float]]) -> List['InternalCluster']:
res_clusters = []
total_layer_nodes = sum([len(nodes) for nodes in clusters.values()])
for key, value in clusters.items():
# ignore noise as it contains no meaningful cluster information
if key == '-1':
continue
res_clusters.append(InternalCluster(key, value, feature_names, global_cluster_centers[key], total_layer_nodes))
return res_clusters
class Layer:
'''Represents metrics for one layer for a single time window.'''
def __init__(self, time_window_id: Any, clusters: List[InternalCluster]):
self.time_window_id = time_window_id
active_clusters = [c for c in clusters if c.size > 0]
self.n_nodes = sum([c.size for c in clusters])
self.n_clusters = len(active_clusters)
self.relative_cluster_sizes = self.get_relative_cluster_sizes(active_clusters)
self.cluster_size_agg_metrics = self.get_size_min_max_avg_sum(active_clusters)
self.cluster_relative_size_agg_metrics = self.get_relative_size_min_max_avg_sum(active_clusters)
self.entropy = self.get_entropy(active_clusters)
self.centers = [c.center for c in active_clusters]
self.distances_from_global_centers = self.get_distances_from_global_center(active_clusters)
self.cluster_center_distance_agg_metrics = self.get_center_distance_min_max_avg_sum(active_clusters)
def get_size_min_max_avg_sum(self, clusters: List[InternalCluster]) -> dict:
'''Returns min, max, avg, and sum of the cluster's absolute sizes.'''
if len(clusters) == 0:
return {'min':0, 'max':0, 'avg':0, 'sum':0}
min_ = clusters[0].size
max_ = clusters[0].size
sum_ = 0
for c in clusters:
value = c.size
min_ = min(min_, value)
max_ = max(max_, value)
sum_ += value
avg_ = sum_ / len(clusters)
return {'min': min_, 'max': max_, 'avg': avg_, 'sum': sum_}
def get_relative_size_min_max_avg_sum(self, clusters: List[InternalCluster]) -> dict:
'''Returns min, max, avg, and sum of the cluster's relative sizes.'''
if len(clusters) == 0:
return {'min':0, 'max':0, 'avg':0, 'sum':0}
min_ = clusters[0].relative_size
max_ = clusters[0].relative_size
sum_ = 0
for c in clusters:
value = c.relative_size
min_ = min(min_, value)
max_ = max(max_, value)
sum_ += value
avg_ = sum_ / len(clusters)
return {'min': min_, 'max': max_, 'avg': avg_, 'sum': sum_}
def get_center_distance_min_max_avg_sum(self, clusters: List[InternalCluster]) -> dict:
'''Returns min, max, avg, and sum of the cluster's center distances.'''
if len(clusters) == 0:
return {'min':0, 'max':0, 'avg':0, 'sum':0}
min_ = clusters[0].global_center_distance
max_ = clusters[0].global_center_distance
sum_ = 0
for c in clusters:
value = c.global_center_distance
min_ = min(min_, value)
max_ = max(max_, value)
sum_ += value
avg_ = sum_ / len(clusters)
return {'min': min_, 'max': max_, 'avg': avg_, 'sum': sum_}
def get_relative_cluster_sizes(self, clusters: List[InternalCluster]):
return [c.relative_size for c in clusters]
def get_entropy(self, clusters: List[InternalCluster]):
'''
Returns the entropy over all clusters C,
where P(c_i) is the probability that a node belongs to cluster c_i.
'''
return scipy.stats.entropy(self.get_relative_cluster_sizes(clusters), base=2)
def get_distances_from_global_center(self, clusters: List[InternalCluster]):
return [cluster.global_center_distance for cluster in clusters]
def __repr__(self):
return str(self.__dict__)
def __str__(self):
return f"Layer({self.time_window_id}, " \
f"{self.n_nodes}, {self.n_clusters}, {self.relative_cluster_sizes}, " \
f"{self.entropy}, {self.centers}, {self.distances_from_global_centers})"
@staticmethod
def create_from_time_window(time_window: TimeWindow, feature_names:List[str], global_cluster_centers: Dict[str, Tuple[float]]) -> 'Layer':
clusters: List[InternalCluster] = InternalCluster.create_many_from_cluster_nodes(time_window.clusters, feature_names, global_cluster_centers)
return Layer(time_window.time, clusters)
@staticmethod
def create_from_dict(dict_) -> 'Layer':
l = Layer(0, [])
l.__dict__.update(dict_)
return l
\ No newline at end of file
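# Worked example (not part of this commit): with two active clusters holding 3 of 4 and
# 1 of 4 layer nodes, the relative sizes are [0.75, 0.25] and the layer entropy is
# -(0.75*log2(0.75) + 0.25*log2(0.25)) ≈ 0.811 bits.
import scipy.stats
print(scipy.stats.entropy([0.75, 0.25], base=2))  # ~0.8113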
import json
from typing import List, Dict, NewType, Any
from datetime import date, datetime
class TimeWindow:
'''
A time window for a single layer containing all nodes for that time.
:param time: The tag indicating the time
:param layer_name: The name of the layer the nodes belong to
'''
def __init__(self, time: Any = None, use_case: str = None, use_case_table: str = None, layer_name: str = None,
time_slice_dict: Dict = None, from_db = False):
self.time = str(time)
self.use_case = use_case
self.use_case_table = use_case_table
self.layer_name = layer_name
self.clusters: Dict[str, List[dict]] = {}
if time_slice_dict is not None:
self.from_serializable_dict(time_slice_dict, from_db)
def add_node_to_cluster(self, cluster_label: str, node):
# only string keys can be stored in json
cluster_label = str(cluster_label)
if cluster_label not in self.clusters:
self.clusters[cluster_label] = []
# node = self._get_unique_id(node)
self.clusters[cluster_label].append(node)
def get_nodes_for_cluster(self, cluster_label: str):
if cluster_label in self.clusters:
return self.clusters[cluster_label]
else:
return []
def _get_unique_id(self, node : Dict) -> Dict:
'''Returns a new dict with the unique id only.'''
uid_key = 'UniqueID'
if uid_key in node:
return {uid_key: node[uid_key]}
def to_serializable_dict(self, for_db=False) -> Dict:
return {
"time": self.time,
"use_case": self.use_case,
"use_case_table": self.use_case_table,
'layer_name': self.layer_name,
"clusters": json.dumps(self.clusters) if for_db else self.clusters
}
def from_serializable_dict(self, dict: Dict, from_db=False):
self.time = dict["time"]
self.use_case = dict["use_case"]
self.use_case_table = dict["use_case_table"]
self.layer_name = dict['layer_name']
self.clusters = json.loads(dict['clusters']) if from_db else dict['clusters']
@staticmethod
def create_from_serializable_dict(dict: Dict, from_db=False):
ts = TimeWindow()
ts.from_serializable_dict(dict, from_db)
return ts
def __repr__(self):
return json.dumps(self.to_serializable_dict())
def __str__(self):
return f"TimeWindow({self.__repr__()})"
@@ -51,4 +51,4 @@ app.add_api(swagger_util.get_bundled_specs(Path(swagger_path)),
# start app
if __name__ == '__main__':
app.run(host='0.0.0.0', port=5000, ssl_context=context)
app.run(host='0.0.0.0', port=5000, ssl_context=context, debug=True)
import warnings
from abc import ABC, abstractmethod
from typing import Dict, List, Any, Tuple
import numpy as np
from scipy.spatial import ConvexHull, qhull, distance
from math import sqrt
from statistics import mean
warnings.simplefilter(action='ignore', category=UserWarning)
# UserWarning: geopandas not available. Some functionality will be disabled.
from pointpats.centrography import std_distance
warnings.simplefilter(action='default', category=UserWarning)
class ClusterMetricsCalculator(ABC):
def __init__(self, cluster_nodes: List[dict], nr_layer_nodes: int, layer_diversity: int):
self.cluster_nodes = cluster_nodes
self.nr_layer_nodes = nr_layer_nodes
self.layer_diversity = layer_diversity
def get_size(self) -> int:
'''Returns the size of the cluster.'''
return len(self.cluster_nodes)
@abstractmethod
def get_standard_deviation(self) -> float:
'''Returns the std dev from the center of the distribution.'''
pass
@abstractmethod
def get_scarcity(self) -> float:
'''
Returns the scarcity of the data points regarding the complete range for possible points.
High scarcity indicates low density.
'''
pass
@abstractmethod
def get_range(self) -> float:
'''Returns the range or area of the cluster based on the edge nodes.'''
pass
@abstractmethod
def get_center(self) -> Tuple[float, float]:
'''Returns the center of the cluster, output is fixed 2d.'''
pass
def get_importance1(self) -> float:
'''Returns the ratio of cluster_nodes to layer_nodes.'''
return float(len(self.cluster_nodes)) / self.nr_layer_nodes if len(self.cluster_nodes) > 0 else 0
def get_importance2(self) -> float:
'''Returns the inverse of the layer_diversity, where layer_diversity = number of clusters with #nodes > 0.'''
return 1.0 / self.layer_diversity if len(self.cluster_nodes) > 0 else 0
def _convert_feature_to_float(self, feature_value) -> float:
return float(feature_value if feature_value != "" else 0)
class ClusterMetricsCalculator1D(ClusterMetricsCalculator):
'''Metrics calculator for clusters which were clustered based on 1 feature (1d clustering).'''
def __init__(self, cluster_nodes: List[dict], cluster_feature_name: str, nr_layer_nodes: int, layer_diversity: int):
super().__init__(cluster_nodes, nr_layer_nodes, layer_diversity)
self.feature_values: List[Any] = [self._convert_feature_to_float(node[cluster_feature_name])
for node in cluster_nodes]
if len(self.feature_values) > 0:
self.max_value = max(self.feature_values)
self.min_value = min(self.feature_values)
else:
self.max_value = self.min_value = 0
def get_standard_deviation(self):
return np.std(self.feature_values) if len(self.feature_values) > 0 else 0
def get_scarcity(self):
'''Returns the scarcity as cluster_range / cluster_size, or 0 if len(nodes)=0.'''
if len(self.feature_values) == 0:
return 0
return self.get_range() / self.get_size()
def get_range(self):
return float(self.max_value - self.min_value)
def get_center(self):
if len(self.feature_values) == 0:
return (0, 0)
return (sum(self.feature_values) / len(self.feature_values), 0)
class ClusterMetricsCalculator2D(ClusterMetricsCalculator):
'''Metrics calculator for clusters which were clustered based on 2 features (2d clustering).'''
def __init__(self, cluster_nodes: List[dict], cluster_feature_names: List[str], nr_layer_nodes: int, layer_diversity: int):
assert len(cluster_feature_names) == 2, "This class is for 2d cluster results only!"
super().__init__(cluster_nodes, nr_layer_nodes, layer_diversity)
self.feature_values: List[Tuple[Any]] = [
(self._convert_feature_to_float(node[cluster_feature_names[0]]), self._convert_feature_to_float(node[cluster_feature_names[1]]))
for node in cluster_nodes
]
def get_standard_deviation(self):
if len(self.feature_values) == 0:
return 0
warnings.simplefilter(action='ignore', category=RuntimeWarning)
std_dist = std_distance(self.feature_values)
warnings.simplefilter(action='default', category=RuntimeWarning)
if np.isnan(std_dist):
return 0 # std_distance can return nan if all feature values are practically identical
return std_dist
def get_scarcity(self):
'''Returns the scarcity as cluster_range / cluster_size, or 0 if len(nodes)=0.'''
if len(self.feature_values) == 0:
return 0
if len(self.feature_values) == 1:
# exactly 1 element gives inf density
return 0
range_, twodim = self._get_range()
if twodim:
return sqrt(range_ / self.get_size())
else:
return range_ / self.get_size()
def _get_range(self):
twodim = False
if len(self.feature_values) == 0 or len(self.feature_values) == 1:
range_ = 0
elif len(self.feature_values) == 2:
# cannot calculate area with 2 points - just use 2d distance as range instead
range_ = float(distance.euclidean(self.feature_values[0], self.feature_values[1]))
else:
try:
# calculate range as 2d area
points = self._get_polygon_border_points(self.feature_values)
range_ = self._calc_polygon_area(points)
# twodim must be known when calculating scarcity
twodim = True
except qhull.QhullError as err:
# possible reasons that there is no hull with real area:
# 1. all points are at the same location
# 2. all points have the same x or y coordinates (lie on one horizontal/vertical line)
points = np.asarray(self.feature_values)
same_x = len(set(points[:,0])) == 1
if same_x:
# use only y feature
features = points[:,1]
range_ = max(features) - min(features)
same_y = len(set(points[:,1])) == 1
if same_y:
# use only x feature
features = points[:,0]
range_ = max(features) - min(features)
if not same_x and not same_y:
# assume linear distribution of nodes
points = np.asarray(list(set(self.feature_values)))
min_ = min(points[:,0]), min(points[:,1])
max_ = max(points[:,0]), max(points[:,1])
range_ = float(distance.euclidean(min_, max_))
return (range_, twodim)
def get_range(self):
return self._get_range()[0]
def _get_polygon_border_points(self, points: List[List[float]]) -> 'np.array':
points = np.asarray(points)
hull = ConvexHull(points)
return points[hull.vertices]
def _calc_polygon_area(self, border_points: 'np.array') -> float:
x: 'np.array' = border_points[:,0]
y: 'np.array' = border_points[:,1]
# https://en.wikipedia.org/wiki/Shoelace_formula
area = 0.5 * np.abs(np.dot(x, np.roll(y,1)) - np.dot(y, np.roll(x,1)))
return float(area)
def get_center(self):
if len(self.feature_values) == 0:
return (0, 0)
x = [f[0] for f in self.feature_values]
y = [f[1] for f in self.feature_values]
centroid = (sum(x) / len(self.feature_values), sum(y) / len(self.feature_values))
return centroid
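# Worked example (not part of this commit): the shoelace formula applied to the unit square
# (0,0), (1,0), (1,1), (0,1) yields an area of 1.0, so a 4-node cluster at these coordinates
# has range 1.0 and scarcity sqrt(1.0 / 4) = 0.5. The feature names 'x' and 'y' are made up.
square = [{'x': 0, 'y': 0}, {'x': 1, 'y': 0}, {'x': 1, 'y': 1}, {'x': 0, 'y': 1}]
calc2d = ClusterMetricsCalculator2D(square, ['x', 'y'], nr_layer_nodes=4, layer_diversity=1)
assert calc2d.get_range() == 1.0
assert calc2d.get_scarcity() == 0.5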
class ClusterMetricsCalculatorFactory:
@staticmethod
def create_metrics_calculator(cluster_nodes: List[dict], cluster_feature_names: List[str], nr_layer_nodes: int, layer_diversity: int) -> ClusterMetricsCalculator:
"""
This factory creates a metrics calculator for a single cluster based on
its nodes, feature values, the layer's total node count, and the layer diversity.
:param cluster_nodes: all nodes from the cluster
:param cluster_feature_names: all field names which were used during clustering
:param nr_layer_nodes: the number of total layer nodes
:param layer_diversity: the diversity of the layer calculated as: number of clusters with nodes > 0
"""
if isinstance(cluster_feature_names, str):
return ClusterMetricsCalculator1D(cluster_nodes, cluster_feature_names, nr_layer_nodes, layer_diversity)
if len(cluster_feature_names) == 1:
return ClusterMetricsCalculator1D(cluster_nodes, cluster_feature_names[0], nr_layer_nodes, layer_diversity)
if len(cluster_feature_names) == 2:
return ClusterMetricsCalculator2D(cluster_nodes, cluster_feature_names, nr_layer_nodes, layer_diversity)
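# Illustrative usage sketch (not part of this commit): the factory dispatches on the number
# of clustering features. A single feature name (or a 1-element list) yields the 1d
# calculator, two names yield the 2d calculator. The nodes below are made up.
nodes = [{'Price': '10'}, {'Price': '14'}]
calc1d = ClusterMetricsCalculatorFactory.create_metrics_calculator(nodes, ['Price'], nr_layer_nodes=2, layer_diversity=1)
assert isinstance(calc1d, ClusterMetricsCalculator1D)
assert calc1d.get_range() == 4.0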
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
class DataSampler:
def __init__(self):
pass
def undersample(self, X, y, strategy='not minority') -> ('X', 'y'):
'''Undersampling so all class sizes equal minority class size.'''
rus = RandomUnderSampler(random_state=42, sampling_strategy=strategy)
X_undersampled, y_undersampled = rus.fit_resample(X, y)
return X_undersampled, y_undersampled
def oversample(self, X, y) -> ('X', 'y'):
'''Oversample based on SMOTE so all class sizes equal majority class size.'''
sm = SMOTE(random_state=42)
X_oversampled, Y_oversampled = sm.fit_resample(X, y)
return X_oversampled, Y_oversampled
def sample_fixed_size(self, X, y, size: int) -> ('X', 'y'):
sampling_sizes = {k: min(size, v) for k, v in y.value_counts().items()}
# undersample the larger classes to size
X, y = self.undersample(X, y, strategy=sampling_sizes)
# oversample the smaller classes to size
X, y = self.oversample(X, y)
return X, y
def sample_median_size(self, X, y: pd.Series, max_size:int=None) -> ('X', 'y'):
'''Sample the median class size for all classes.'''
median = int(y.value_counts().median())
if max_size is not None:
median = min(median, max_size)
return self.sample_fixed_size(X, y, size=median)
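# Illustrative usage sketch (not part of this commit): balancing a made-up label distribution.
# sample_median_size undersamples classes above the median count (here 8) and SMOTE-oversamples
# the smaller ones up to it; note that SMOTE needs more samples per class than its k_neighbors.
X_demo = pd.DataFrame({'f1': range(27), 'f2': range(27)})
y_demo = pd.Series([0] * 12 + [1] * 8 + [2] * 7)
X_bal, y_bal = DataSampler().sample_median_size(X_demo, y_demo)
print(pd.Series(y_bal).value_counts())  # all three classes end up with 8 samples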
from processing.ClusterMetricsCalculator import ClusterMetricsCalculator, ClusterMetricsCalculator1D, ClusterMetricsCalculator2D, ClusterMetricsCalculatorFactory
from processing.DataSampler import DataSampler
from processing.fetching import fetching
\ No newline at end of file
from security.token_manager import TokenManager
import network_constants
from db.entities.layer import Layer
from db.repository import Repository
from db.dao import *
from typing import List, Dict
import requests
@@ -23,9 +23,7 @@ def _fetch_use_cases() -> List[str]:
if response.status_code != 200:
raise ConnectionError(f"Could not fetch use-cases from business-logic microservice, statuscode: {response.status_code}!")
data = json.loads(response.text)
return [row["name"] for row in data]
return [row["name"] for row in response.json()]
def _fetch_tables(use_case: str) -> List[str]:
@@ -41,14 +39,12 @@ def _fetch_tables(use_case: str) -> List[str]:
)
if response.status_code != 200:
raise ConnectionError(f"Could not fetch tables for {use_case} from business-logic microservice, statuscode: {response.status_code}!")
data = json.loads(response.text)
raise ConnectionError(f"Could not fetch use-cases from business-logic microservice, statuscode: {response.status_code}!")
return [row["name"] for row in data]
return [row["name"] for row in response.json()]
def _fetch_layers(use_case: str, table: str) -> List[Layer]:
def _fetch_layers(use_case: str, table: str) -> List[LayerDao]:
jwt = TokenManager.getInstance().getToken()
url = f'https://{network_constants.BUSINESS_LOGIC_HOSTNAME}:{network_constants.BUSINESS_LOGIC_REST_PORT}/api/use-cases/{use_case}/tables/{table}/layers'
@@ -61,17 +57,15 @@ def _fetch_layers(use_case: str, table: str) -> List[Layer]:
)
if response.status_code != 200:
raise ConnectionError(f"Could not fetch layers for {use_case}//{table} from business-logic microservice, statuscode: {response.status_code}!")
raise ConnectionError(f"Could not fetch layers for {use_case} from business-logic microservice, statuscode: {response.status_code}!")
data = json.loads(response.text)
return [Layer.from_business_logic_dict(row) for row in data]
return [LayerDao.from_business_logic_dict(row) for row in response.json()]
def _fetch_nodes(use_case: str, table: str, layer_name: str) -> List[Dict]:
def _fetch_clusters(use_case: str, table: str, layer_name: str) -> List[str]:
jwt = TokenManager.getInstance().getToken()
url = f'https://{network_constants.SEMANTIC_LINKING_HOSTNAME}:{network_constants.SEMANTIC_LINKING_REST_PORT}/api/use-cases/{use_case}/tables/{table}/layers/{layer_name}/nodes'
url = f'https://{network_constants.ROLESTAGE_DISCOVERY_HOSTNAME}:{network_constants.ROLESTAGE_DISCOVERY_REST_PORT}/api/use-cases/{use_case}/tables/{table}/layers/{layer_name}/clusters'
response = requests.get(
url,
@@ -81,52 +75,95 @@ def _fetch_nodes(use_case: str, table: str, layer_name: str) -> List[Dict]:
)
if response.status_code != 200:
raise ConnectionError(f"Could not fetch nodes for {use_case}//{table}//{layer_name} from semantic-linking microservice, statuscode: {response.status_code}!")
raise ConnectionError(f"Could not fetch clusters for {use_case}//{table}//{layer_name}, statuscode: {response.status_code}!")
return [ClusterDao(cluster_dict=row) for row in response.json()]
return response.json()
def _fetch_timeslices(use_case: str, table: str, layer_name: str) -> List[Dict]:
jwt = TokenManager.getInstance().getToken()
url = f'https://{network_constants.ROLESTAGE_DISCOVERY_HOSTNAME}:{network_constants.ROLESTAGE_DISCOVERY_REST_PORT}/api/use-cases/{use_case}/tables/{table}/layers/{layer_name}/timeslices'
response = requests.get(
url,
verify = False,
proxies = { "http":None, "https":None },
headers = {"Authorization": f"Bearer {jwt}"}
)
if response.status_code != 200:
raise ConnectionError(f"Could not fetch time slices for {use_case}//{table}//{layer_name}, statuscode: {response.status_code}!")
return [TimeSliceDao(time_slice_dict=row) for row in response.json()]
def _fetch_layerpairs(use_case: str, table: str) -> List[Dict]:
jwt = TokenManager.getInstance().getToken()
def fetch_nodes_from_semantic_linking(selected_use_cases: List[str] = None, selected_use_case_tables: List[str] = None):
url = f'https://{network_constants.BUSINESS_LOGIC_HOSTNAME}:{network_constants.BUSINESS_LOGIC_REST_PORT}/api/use-cases/{use_case}/tables/{table}/layer-pairs'
response = requests.get(
url,
verify = False,
proxies = { "http":None, "https":None },
headers = {"Authorization": f"Bearer {jwt}"}
)
if response.status_code != 200:
raise ConnectionError(f"Could not fetch layer pairs for {use_case}//{table}, statuscode: {response.status_code}!")
return [LayerPairDao.create_from_dict(row) for row in response.json()]
def fetch(selected_use_cases: List[str] = None, selected_use_case_tables: List[str] = None):
'''Empties the db and inserts layers and nodes from BusinessLogic and SemanticLinking'''
repository = Repository()
repo = Repository()
# please don't delete all layers/nodes anymore @10.11.2020
# repository.delete_all_layers()
# repository.delete_all_nodes()
use_cases = _fetch_use_cases()
for use_case in use_cases:
for use_case in _fetch_use_cases():
if selected_use_cases is not None and use_case not in selected_use_cases:
continue
print(f"Fetching for use-case {use_case}")
tables = _fetch_tables(use_case)
for table in tables:
for table in _fetch_tables(use_case):
if selected_use_case_tables is not None and table not in selected_use_case_tables:
continue
layers = _fetch_layers(use_case, table)
for layer in layers:
print(f"Fetching for {use_case}//{table}")
try:
# copy all layer pairs
layer_pairs: List[LayerPairDao] = _fetch_layerpairs(use_case, table)
for lp in layer_pairs:
repo.add_layer_pair(lp)
except ConnectionError as e:
print(str(e))
# copy all layers
for layer in _fetch_layers(use_case, table):
db_layer = repo.get_layer_by_name(use_case, layer.use_case_table, layer.layer_name)
if db_layer is None:
repo.add_layer(layer)
else:
print(f"Layer already exists, skipping cluster and timeslice fetching: {db_layer}")
continue
try:
print(f"Fetching nodes for layer {use_case}//{table}//{layer.layer_name}.")
# check if layer already exists in DB, add it if not
reference_layer = repository.get_layer_by_name(use_case, table, layer.layer_name)
if reference_layer == None:
repository.add_layer(layer)
else:
raise Exception(f"Layer should be unique, but was not: {reference_layer}")
nodes = _fetch_nodes(use_case, table, layer.layer_name)
for node in nodes:
node['use_case_table'] = node['table']
del node['table']
for node in nodes:
repository.add_layer_node(node)
# copy all clusters
clusters = _fetch_clusters(use_case, layer.use_case_table, layer.layer_name)
for cl in clusters:
repo.add_cluster(cl)
except ConnectionError as e:
print(str(e))
try:
# copy all timeslices
timeslices = _fetch_timeslices(use_case, layer.use_case_table, layer.layer_name)
for ts in timeslices:
repo.add_time_slice(ts)
except ConnectionError as e:
print(str(e))
continue
\ No newline at end of file
import sys
import os
modules_path = '../../../modules/'
if os.path.exists(modules_path):
sys.path.insert(1, modules_path)
import json
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from processing.fetching import fetching
if __name__ == "__main__":
fetching.fetch(selected_use_cases=['community-prediction-youtube-n'], selected_use_case_tables=None)
\ No newline at end of file
@@ -162,7 +162,7 @@ paths:
'404':
description: "Layer not found"
/use-cases/{use_case}/tables{table}/layers/{layer_name}/timeslices:
/use-cases/{use_case}/tables/{table}/layers/{layer_name}/timeslices:
get:
operationId: "routes.timeslices.get_by_name"
security:
@@ -82,20 +82,20 @@ def get_layer_nodes(use_case, use_case_table, layer_name)-> List[dict]:
def add_time_slice(timeslice):
try:
# repo.add_time_slice(timeslice)
repo.add_time_slice(timeslice)
pass
except:
print(f"Error while storing time slice in db for {timeslice.layer_name}")
try:
json_path = f'_predictions/timeslices/{timeslice.layer_name}/{timeslice.time}.json'.replace(', ', '_').replace('(', '').replace(')', '')
if not os.path.exists(os.path.dirname(json_path)):
os.makedirs(os.path.dirname(json_path))
with open(json_path, 'w') as file:
file.write(json.dumps(timeslice.to_serializable_dict(for_db=False)))
except Exception as e:
print(f"Error while writing json for {timeslice.layer_name}: {e}")
# try:
# json_path = f'_predictions/timeslices/{timeslice.layer_name}/{timeslice.time}.json'.replace(', ', '_').replace('(', '').replace(')', '')
# if not os.path.exists(os.path.dirname(json_path)):
# os.makedirs(os.path.dirname(json_path))
# with open(json_path, 'w') as file:
# file.write(json.dumps(timeslice.to_serializable_dict(for_db=False)))
# except Exception as e:
# print(f"Error while writing json for {timeslice.layer_name}: {e}")
def run_time_slicing(selected_use_cases: List[str] = None, selected_use_case_tables: List[str] = None, selected_layer_names: List[str] = None):
layers = get_layers()
@@ -55,6 +55,17 @@ else:
ROLESTAGE_DISCOVERY_DB_HOSTNAME = 'articonf1.itec.aau.at'
ROLESTAGE_DISCOVERY_DB_PORT = 30104
## Proactive Community Detection
if server:
PROACTIVE_COMMUNITY_DETECTION_HOSTNAME = 'proactive-community-detection'
PROACTIVE_COMMUNITY_DETECTION_REST_PORT = 80
PROACTIVE_COMMUNITY_DETECTION_DB_HOSTNAME = f'{ROLESTAGE_DISCOVERY_HOSTNAME}-db'
PROACTIVE_COMMUNITY_DETECTION_DB_PORT = 27017
else:
PROACTIVE_COMMUNITY_DETECTION_HOSTNAME = 'articonf1.itec.aau.at'
PROACTIVE_COMMUNITY_DETECTION_REST_PORT = 30105
PROACTIVE_COMMUNITY_DETECTION_DB_HOSTNAME = 'articonf1.itec.aau.at'
PROACTIVE_COMMUNITY_DETECTION_DB_PORT = 30106
#endregion Data Hub
#region Rest Gateway
paths:
#####
# USE-CASES
#####
/use-cases:
post:
security:
@@ -57,9 +55,8 @@ paths:
description: "Successful Request"
'403':
description: "Confirmation required"
#####
# TABLES
#####
# region tables
/tables:
get:
security:
@@ -187,13 +184,9 @@ paths:
responses:
'200':
description: "Successful Request"
#####
# END-TABLES
#####
# endregion tables
#####
# LAYERS
#####
# region layers
/layers:
get:
security:
@@ -515,13 +508,9 @@ paths:
description: "Field in request is missing"
'403':
description: "Confirmation required"
#####
# END LAYERS
#####
# endregion layers
#####
# ENUMS
#####
# region enums
/enums:
get:
security:
@@ -618,6 +607,66 @@ paths:
description: "Successful Request"
'404':
description: "Enum does not exist"
# endregion enums
# region context dependencies for community prediction
/use-cases/{use_case}/tables/{table}/layer-pairs:
get:
security:
- JwtRegular: []
operationId: "routes.context_pairs.get_all"
tags:
- "LayerPairs"
summary: "Retrieve all layer pairs for cluster prediction"
description: "Retrieve all layer pairs for cluster prediction"
parameters:
- name: "use_case"
in: "path"
description: "Name of the use-case"
required: true
type: "string"
- name: "table"
in: "path"
description: "Name of the table"
required: true
type: "string"
responses:
'200':
description: "Successful Request"
schema:
$ref: '#/definitions/LayerPair'
'404':
description: "pairs do not exist"
post:
security:
- JwtRegular: []
operationId: "routes.context_pairs.insert"
tags:
- "LayerPairs"
summary: "Add a new layer pair for cluster prediction"
description: "Add a new layer pair for cluster prediction"
parameters:
- name: "use_case"
in: "path"
description: "Name of the use-case"
required: true
type: "string"
- name: "table"
in: "path"
description: "Name of the table"
required: true
type: "string"
- name: "layer_pair"
in: "body"
required: true
schema:
$ref: '#/definitions/LayerPair'
responses:
'200':
description: "Successful Request"
'400':
description: "incorrect format etc"
# endregion context dependencies
definitions:
LayerMapping:
@@ -684,4 +733,16 @@ definitions:
type: array
items:
type: string
example: "internal_property_1"
\ No newline at end of file
example: "internal_property_1"
LayerPair:
type: "object"
properties:
use_case:
type: string
table:
type: string
layer:
type: string
reference_layer:
type: string
\ No newline at end of file
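# Illustrative usage sketch (not part of this commit), exercising the new layer-pairs endpoints
# declared above. It reuses TokenManager and network_constants in the same way as the fetching
# code; the use-case, table, and layer names are made up.
import requests
import network_constants as nc
from security.token_manager import TokenManager
jwt = TokenManager.getInstance().getToken()
base_url = f'https://{nc.BUSINESS_LOGIC_HOSTNAME}:{nc.BUSINESS_LOGIC_REST_PORT}/api'
headers = {"Authorization": f"Bearer {jwt}"}
pair = {'use_case': 'demo-use-case', 'table': 'demo-table', 'layer': 'DemoLayer', 'reference_layer': 'ReferenceLayer'}
requests.post(f'{base_url}/use-cases/demo-use-case/tables/demo-table/layer-pairs',
json=pair, headers=headers, verify=False, proxies={"http": None, "https": None})
print(requests.get(f'{base_url}/use-cases/demo-use-case/tables/demo-table/layer-pairs',
headers=headers, verify=False, proxies={"http": None, "https": None}).json())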
from typing import List, Dict
class LayerPair:
def __init__(self, use_case: str, table: str, layer: str, reference_layer: str):
self.use_case = use_case
self.table = table
self.layer = layer
self.reference_layer = reference_layer
@staticmethod
def create_from_dict(dict_) -> 'LayerPair':
lp = LayerPair(None, None, None, None)
lp.__dict__.update(dict_)
return lp
@@ -3,13 +3,14 @@ import network_constants as netconst
from database.MongoRepositoryBase import MongoRepositoryBase
from db.entities.layer_adapter import LayerAdapter
from db.entities.use_case import UseCase
from db.entities.layer_pair import LayerPair
import pymongo
import json
from typing import List, Dict
class Repository(MongoRepositoryBase):
'''This is a repository for MongoDb.'''
'''This is a LAYER repository for MongoDb.'''
def __init__(self):
super().__init__(netconst.BUSINESS_LOGIC_DB_HOSTNAME,
@@ -18,6 +19,7 @@ class Repository(MongoRepositoryBase):
self._adapter_collection = 'layer_adapters'
self._use_case_collection = 'use_cases'
self._layer_pair_collection = 'contextpairs'
def all(self) -> List[Dict]:
result = super().get_entries(self._adapter_collection, projection={'_id': False})
@@ -59,4 +61,17 @@ class Repository(MongoRepositoryBase):
def delete(self, adapter : LayerAdapter):
collection = self._database[self._adapter_collection]
collection.delete_many({"name": adapter.name, "use_case": adapter.use_case, "table": adapter.table})
\ No newline at end of file
collection.delete_many({"name": adapter.name, "use_case": adapter.use_case, "table": adapter.table})
# region context pairs
def get_layer_pairs(self, use_case, table) -> List[LayerPair]:
return \
[LayerPair.create_from_dict(entry) for entry in
super().get_entries(self._layer_pair_collection, projection={'_id': False}, selection={"use_case": use_case, "table": table})
]
def insert_layer_pair(self, uc, table, pair: LayerPair):
super().insert_entry(self._layer_pair_collection, pair.__dict__)
# endregion context pairs
\ No newline at end of file
import json
from flask import Response, request
from db.repository import Repository
from db.entities.layer_pair import LayerPair
repo = Repository()
def get_all(use_case: str, table: str):
return [e.__dict__ for e in repo.get_layer_pairs(use_case, table)]
def insert(use_case: str, table: str, layer_pair: dict):
repo.insert_layer_pair(use_case, table, LayerPair.create_from_dict(layer_pair))
return Response(status=200)
\ No newline at end of file