Commit 7ca4a448 authored by Alexander Lercher's avatar Alexander Lercher

Merge branch 'developBogdan' into 'develop'

Connected Cluster and Similarity functionalities

See merge request !12
parents d728e14e 8f1f8dc7
...@@ -178,6 +178,58 @@ paths:
#endregion
################################################################################
/connectedClusters:
get:
operationId: "routes.connClusters.get_conn_clusters"
tags:
- "Connected"
summary: "Get connected Clusters data"
description: "Returns a dictionary of cluster. The clusters contain the associated connected clusters and connected nodes data."
responses:
200:
description: "Successful operation"
schema:
$ref: "#/definitions/ConnectedDict"
/clusterSimilarity:
get:
operationId: "routes.similarity.get_similarity"
tags:
- "Similarity"
summary: "Get data of the similarity between clusters."
parameters:
- name: "layer_name"
in: "query"
description: "Name of the layer"
required: true
type: "string"
- name: "batchNr"
in: "query"
description: "Batch number (starting from 0)"
required: true
type: "integer"
description: "Data is returned in batches of size 1000. Returns a dictionary where the key is a tuple of cluster_labels (i.e. [0,319]) and the value is the computed similarity between 2 clusters in the tuple, in regard to each layer in the input. \n Note: the tuple clusters have the same layer and the computed similarity is in regard to clusters from OTHER layers."
responses:
200:
description: "Successful operation"
schema:
$ref: "#/definitions/ClusterSimilarityArray"
/clusterRunArray:
get:
operationId: "routes.connRun.get_connected_run"
tags:
- "RunId"
summary: "Get RunId"
description: "Returns the RunId and the associated datetime when a connection of clusters/simillarity of clusters was computed."
responses:
200:
description: "Successful operation"
schema:
$ref: "#/definitions/ClusterRunArray"
definitions:
Cluster:
...@@ -264,3 +316,124 @@ definitions:
type: array
items:
$ref: "#/definitions/TimeSlice"
##################################################################
ConnectedDict:
type: array
items:
$ref: "#/definitions/ConnectedCluster"
ConnectedCluster:
type: object
properties:
cluster_label:
type: string
example: "6"
cluster_layer:
type: string
example: "Price_Layer"
cluster_runId:
type: string
example: "5efdc04ac43add0aba567d76"
cluster_containedNodesDict:
type: array
items:
type: string
example: "2696718d7a33ab3dbf28e9c88411afcfe9a933a45e57ec9159bc0668543f1568"
cluster_connNodesDict:
type: array
items:
$ref: "#/definitions/ConnectedNode"
cluster_connClustDict:
type: array
items:
$ref: "#/definitions/ConnectedClusterAux"
ConnectedClusterAux:
type: object
properties:
cluster_label:
type: string
example: "-1"
cluster_layer:
type: string
example: "FinishedTime_Layer"
connectionWeight:
type: number
example: 42
ConnectedNode:
type: object
properties:
node_id:
type: string
node_cluster:
type: string
node_layer:
type: string
example:
"node_id": "27a08ed0facc7d68a0818c7695dad391cf48d6095e57ec9159bc0668543f159b"
"node_cluster": "2230"
"node_layer": "Destination_Layer"
#not used, should be removed?
#finished_time
#latitude_Destination
#longitude_Destination
#travelID
#travelPrice
#userID
ClusterSimilarityArray:
type: array
items:
$ref: "#/definitions/ClusterSimilarityDictionary"
ClusterSimilarityDictionary:
properties:
cluster1_label:
type: string
example: "0"
cluster2_label:
type: string
example: "1"
cluster_layer:
type: string
example: "Price_Layer"
similarityValues:
type: object
additionalProperties:
type: number
example:
"layer_name": similarityValue
"StartingPoint_Layer": 39.0,
"StartingTime_Layer": 99.0101004948485
runId:
type: string
example: "5efdc04ac43add0aba567d76"
ClusterRunArray:
type: array
items:
$ref: "#/definitions/ClusterRun"
ClusterRun:
type: object
properties:
_id:
type: string
example: "5efdc04ac43add0aba567d76"
Datetime:
type: string
example: "2020-07-02 14:19:51.651764"
\ No newline at end of file
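For orientation, a hedged sketch of how a client might call the three new endpoints with the requests library; the base URL is a placeholder and depends on the deployment, the layer name is taken from the examples above.

```python
import requests

BASE = "http://localhost:5000/api"  # hypothetical base URL, adjust to the deployment

# all connected clusters and the list of runs
clusters = requests.get(f"{BASE}/connectedClusters").json()
runs = requests.get(f"{BASE}/clusterRunArray").json()

# similarities are returned in batches of 1000; batchNr starts at 0
sim_batch = requests.get(
    f"{BASE}/clusterSimilarity",
    params={"layer_name": "Price_Layer", "batchNr": 0},
).json()
```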
class ClusterC:
'''
This class represents connected clusters between layers where weights are the number of shared nodes.
Connections are stored with one 'source' cluster which is connected to n 'destination' clusters, with n >= 0.
:param cluster_label: The cluster label of the 'source' cluster
:param cluster_layer: The layer name of the 'source' cluster
:param cluster_runId: The run which calculated the connections
:param cluster_containedNodesDict: Ids of nodes contained in the 'source' cluster
:param cluster_connNodesDict: Node Objects contained in 'dest' clusters, where the 'dest' cluster is uniquely identifiable by layer name and cluster label
:param cluster_connClustDict: Layer name, cluster label and weight for each 'dest' cluster
'''
def __init__(self,cluster_label,cluster_layer,cluster_runId,cluster_containedNodesDict,cluster_connNodesDict,cluster_connClustDict):
self.cluster_label = cluster_label
self.cluster_layer = cluster_layer
self.cluster_runId = cluster_runId
self.cluster_containedNodesDict = cluster_containedNodesDict #Keys are frozensets of (uniqueID, cluster, layer) tuples; values are the NodeC objects contained in this cluster
self.cluster_connNodesDict = cluster_connNodesDict #Keys are frozensets of (uniqueID:str, node_cluster:str, node_layer:str) tuples; values are NodeC objects
self.cluster_connClustDict = cluster_connClustDict #dictionary: dict[(cluster_label, cluster_layer)] -> nrOfConnections/weightOfTheConnection
class LayerC:
def __init__(self,layer_name:str,cluster_Dict):
'''
This class represents the Layer which contains the connected clusters.
:param layer_name: The layer name which contains the clusters
:param cluster_Dict: The connected_clusters contained in this layer.
'''
self.layer_name = layer_name
self.cluster_Dict = cluster_Dict # Dict[cluster_label] --> ClusterC object
\ No newline at end of file
class NodeC:
'''
This class represents the Node data contained in a Cluster.
:param node_layer: The layer name which contains the node
:param node_cluster: The cluster_label of the connected_cluster which contains this node in the given layer.
:param uniqueID: Id of the node. Only unique inside a single cluster, NOT unique across multiple clusters/layers.
'''
def __init__(self, node_cluster, node_layer, uniqueID):
self.node_cluster = node_cluster # str
self.node_layer = node_layer # str
self.uniqueID = uniqueID # str
\ No newline at end of file
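A minimal sketch of how these three entities nest, assuming the dictionary key conventions described in the comments above; the label, layer, run id and node id values mirror the Swagger examples and are illustrative only.

```python
from db.entities.connected_cluster import ClusterC
from db.entities.connected_layer import LayerC
from db.entities.connected_node import NodeC

node_id = "2696718d7a33ab3dbf28e9c88411afcfe9a933a45e57ec9159bc0668543f1568"
node = NodeC(node_cluster="6", node_layer="Price_Layer", uniqueID=node_id)

cluster = ClusterC(
    cluster_label="6",
    cluster_layer="Price_Layer",
    cluster_runId="5efdc04ac43add0aba567d76",
    cluster_containedNodesDict={frozenset((node_id, "6", "Price_Layer")): node},
    cluster_connNodesDict={},   # filled by calculateWeights
    cluster_connClustDict={},   # (cluster_label, cluster_layer) -> weight, filled by calculateWeights
)

layer = LayerC(layer_name="Price_Layer", cluster_Dict={"6": cluster})
```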
from datetime import datetime
class ConnectedRun:
'''
This class represents the RunId and the time at which the connecting of the clusters and the calculation of the similarity between clusters was executed.
:param run_id: The MongoDB _id of the Run to uniquely identify it.
:param timeOfExec: Datetime object containing info when the run was finished.
'''
def __init__(self,run_id,timeOfExec):
self.run_id = run_id
self.timeOfExec = timeOfExec
\ No newline at end of file
...@@ -4,6 +4,7 @@ from database.MongoRepositoryBase import MongoRepositoryBase
import json
from db.entities import *
# from processing.similarityFiles.miscFunctions import *
from typing import List
...@@ -19,6 +20,9 @@ class Repository(MongoRepositoryBase):
self._layer_nodes_collection = 'layer_nodes'
self._clusters_collection = 'clusters'
self._time_slice_collection = 'time_slices'
self._connected_clusters_collection = 'connected_clusters'
self._similarity_collection = 'similarity'
self._connected_run = 'connected_run'
#region Layers
def add_layer(self, layer: Layer):
...@@ -79,3 +83,87 @@ class Repository(MongoRepositoryBase):
super().drop_collection(self._time_slice_collection)
#endregion
#region clusterConnected
def add_connected_clusters(self, clusterDictArray):
''' Add Connected Clusters Data to DB '''
result = super().insert_many(self._connected_clusters_collection, clusterDictArray)
return result
def get_connected_clusters(self, run_id: str = None):
''' Get Connected Clusters Data from DB '''
if run_id is None:
entries = super().get_entries(self._connected_clusters_collection, projection={'_id': 0})
else:
# note: this must query the connected_clusters collection, not the similarity collection
entries = super().get_entries(self._connected_clusters_collection, selection={'cluster_runId' : run_id}, projection={'_id': 0})
return list(entries)
#endregion
#region similarity
def add_similarity(self, inputDict):
''' Add Similarity Data to DB '''
#checkIfConnClustDictIsSerializable(outputJSON)
result = super().insert_many(self._similarity_collection, inputDict)
#print(str(result))
#super().insert_entry(self._connected_clusters_collection, outputJSON)
return result
def get_similarity(self, skipNr, batchSize, cluster_layer: str = None, run_id: str = None):
''' Get Similarity Data from DB '''
if run_id is None:
if cluster_layer is None:
entries = super().get_entries(self._similarity_collection, projection={'_id': 0})
else:
entries = super().get_entries(self._similarity_collection, selection={'cluster_layer' : cluster_layer}, projection={'_id': 0})
else:
if cluster_layer is None:
entries = super().get_entries(self._similarity_collection, selection={'runId' : run_id}, projection={'_id': 0})
else:
entries = super().get_entries(self._similarity_collection, selection={'cluster_layer' : cluster_layer, 'runId' : run_id}, projection={'_id': 0})
return list(entries.sort([('_id', -1)]).skip(skipNr).limit(batchSize))
#endregion
#region connected_run
def convert_run_to_json_support(self, run_from_db: dict) -> dict:
''' Converts the ObjectId from MongoDb to its string repr. '''
run_from_db['_id'] = str(run_from_db['_id'])
return run_from_db
def add_connected_run(self, conRunTimestamp):
''' Add Connected Run Data to DB '''
result = super().insert_entry(self._connected_run, conRunTimestamp)
return result
def get_connected_run(self, run_id: str = None):
''' Get Connected Run Data from DB '''
if run_id is None:
entries = super().get_entries(self._connected_run)
else:
entries = super().get_entries(self._connected_run, selection={'_id' : run_id}, projection={'_id': 1, 'Datetime': 1})
return [self.convert_run_to_json_support(e) for e in entries]
#endregion
\ No newline at end of file
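A hedged usage sketch of the new repository methods; it assumes a reachable MongoDB behind MongoRepositoryBase, and the Datetime value mirrors the Swagger example.

```python
from db.repository import Repository

repo = Repository()

# register a run and keep its id for later inserts
run = repo.add_connected_run({"Datetime": "2020-07-02 14:19:51.651764"})
run_id = str(run.inserted_id)

# fetch the first batch (up to 1000 documents) of similarities for one layer
batch = repo.get_similarity(skipNr=0, batchSize=1000, cluster_layer="Price_Layer")
print(len(batch), "similarity entries")
```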
# __init__.py
from similarityFiles.calculateSimilarity import *
from similarityFiles.calculateWeights import *
from similarityFiles.populateWithNewNodes import *
from similarityFiles.miscFunctions import *
from similarityFiles.test import *
from db.entities.connected_cluster import *
from db.entities.connected_layer import *
from db.entities.connected_node import *
#This file contains the methods for calculating the similarity between clusters
import math
from db.entities.connected_node import NodeC
from db.entities.connected_cluster import ClusterC
from db.entities.connected_layer import LayerC
from typing import Dict
def minMaxFunction(iIndex,jIndex,clusterList) -> Dict[str,int]:
''' minMax Metric for calculating similarity between 2 clusters.
Clusters must be from the same layer, and will be compared to clusters from different layers (cluster_layer attribute).
:param int iIndex: The index of the first Cluster in the "clusterList"
:param int jIndex: The index of the second Cluster in the "clusterList"
:param List[ClusterC] clusterList: A list of clusters against which the 2 clusters will be compared
:returns: Dictionary with the layer name as KEY, and the computed similarity value between the 2 clusters with respect to that layer as VALUE.
:rtype: Dict{str,int}
'''
iCluster= clusterList[iIndex]
jCluster= clusterList[jIndex]
outputDict = dict()
#calculate the min over shared connections, then keep the max per layer
for curCluster in clusterList: #jCluster.cluster_layer == iCluster.cluster_layer, so comparing against iCluster's layer is sufficient
iClusterTuple = (iCluster.cluster_label,iCluster.cluster_layer)
jClusterTuple = (jCluster.cluster_label,jCluster.cluster_layer)
#iClusterKey = frozenset(iClusterTuple)
#jClusterKey = frozenset(jClusterTuple)
curLayer = curCluster.cluster_layer
if (( curLayer != iCluster.cluster_layer)
and (iClusterTuple in curCluster.cluster_connClustDict)
and (jClusterTuple in curCluster.cluster_connClustDict)):
# min part
curMin = min(curCluster.cluster_connClustDict[iClusterTuple], curCluster.cluster_connClustDict[jClusterTuple])
if curLayer not in outputDict:
outputDict[curLayer] = curMin
else: # max part
if outputDict[curLayer] < curMin:
outputDict[curLayer] = curMin
return outputDict
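A self-contained sketch of the minMax idea on stand-in objects (SimpleNamespace instead of the project classes, values invented for illustration): for every other-layer cluster connected to both clusters, take the minimum of the two weights, and keep the maximum of these minima per layer.

```python
from types import SimpleNamespace

def min_max_sketch(i, j, clusters):
    out = {}
    for cur in clusters:
        if cur.cluster_layer == i.cluster_layer:
            continue
        ki = (i.cluster_label, i.cluster_layer)
        kj = (j.cluster_label, j.cluster_layer)
        if ki in cur.cluster_connClustDict and kj in cur.cluster_connClustDict:
            cur_min = min(cur.cluster_connClustDict[ki], cur.cluster_connClustDict[kj])
            out[cur.cluster_layer] = max(out.get(cur.cluster_layer, cur_min), cur_min)
    return out

i = SimpleNamespace(cluster_label="0", cluster_layer="Price_Layer", cluster_connClustDict={})
j = SimpleNamespace(cluster_label="1", cluster_layer="Price_Layer", cluster_connClustDict={})
c = SimpleNamespace(cluster_label="7", cluster_layer="Destination_Layer",
                    cluster_connClustDict={("0", "Price_Layer"): 3, ("1", "Price_Layer"): 5})
d = SimpleNamespace(cluster_label="8", cluster_layer="Destination_Layer",
                    cluster_connClustDict={("0", "Price_Layer"): 7, ("1", "Price_Layer"): 2})

print(min_max_sketch(i, j, [i, j, c, d]))  # {'Destination_Layer': 3}: min(3,5)=3, min(7,2)=2, max=3
```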
def calcEuclideanDist(iIndex,jIndex,clusterList) -> Dict[str,float]:
''' Euclidean Distance Metric for calculating similarity between 2 clusters.
Clusters must be from the same layer, and will be compared to clusters from different layers (cluster_layer attribute).
:param int iIndex: The index of the first Cluster in the "clusterList"
:param int jIndex: The index of the second Cluster in the "clusterList"
:param List[ClusterC] clusterList: A list of clusters against which the 2 clusters will be compared
:returns: Dictionary with the layer name as KEY, and the computed similarity value between the 2 clusters with respect to that layer as VALUE.
:rtype: Dict{str,float}
'''
iCluster= clusterList[iIndex]
jCluster= clusterList[jIndex]
outputDict = dict()
#calculate the distance (parallelizable)
for curCluster in clusterList: #jCluster.cluster_layer == iCluster.cluster_layer, so comparing against iCluster's layer is sufficient
iClusterTuple = (iCluster.cluster_label,iCluster.cluster_layer)
jClusterTuple = (jCluster.cluster_label,jCluster.cluster_layer)
#iClusterKey = frozenset(iClusterTuple)
#jClusterKey = frozenset(jClusterTuple)
curLayer = curCluster.cluster_layer
#considering only clusters from other layers for the distance calculation
if( curLayer != iCluster.cluster_layer):
###### if in a layer neither cluster has a connection --> distance of 0, i.e. identical with respect to that layer. Correct or false?
iVal = 0
jVal = 0
connectedClusters = False
if iClusterTuple in curCluster.cluster_connClustDict:
iVal = curCluster.cluster_connClustDict[iClusterTuple]
connectedClusters = True
if jClusterTuple in curCluster.cluster_connClustDict:
jVal = curCluster.cluster_connClustDict[jClusterTuple]
connectedClusters = True
if not connectedClusters:
#clusters aren't connected => assign the max int value if there is no prior entry for this layer
if curLayer not in outputDict:
outputDict[curLayer] = 2147483647 #not connected to that particular layer at all
else:
#clusters ARE connected => add the squared term of the euclidean distance to the similarity value
if curLayer not in outputDict:
#first element
outputDict[curLayer] = (iVal - jVal)**2
else:
#further elements
outputDict[curLayer] += (iVal - jVal)**2
for layer in outputDict:
outputDict[layer] = math.sqrt(outputDict[layer])
return outputDict
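Worked example of the Euclidean variant, reusing the invented weights from the sketch above: towards Destination_Layer, cluster 0 has weights 3 and 7 and cluster 1 has weights 5 and 2, so the per-layer value is sqrt((3-5)^2 + (7-2)^2) = sqrt(29) ≈ 5.39.

```python
import math

print(math.sqrt((3 - 5) ** 2 + (7 - 2) ** 2))  # 5.385... for Destination_Layer
```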
# frozenset(tuple) dict
# [(iClusterLabel,jClusterLabel), (layer,similarity)]
#def calculateSimilarity(inputLayerDict) -> Dict[frozenset((str,str)),Dict[str,int]]:
def calculateSimilarity(inputLayerDict):
''' Calculates the similarity between clusters contained in the "inputLayerDict". Similarity is calculated for each combination of 2 clusters from the SAME layer.
:param Dict{layername: LayerC} inputLayerDict: Contains the associated Layer and Cluster objects. The dictionary KEY is the layer name, the VALUE is a LayerC object. The LayerC object has an attribute cluster_Dict which stores the clusters in that LayerC.
:returns: Dict{(cluster_label1, cluster_label2, cluster_layer): Dict{layername: similarityValue}}. Returns a dictionary with a tuple of the 2 cluster labels and their shared layer as KEY, and a dictionary with the computed similarity of the clusters with respect to each other layer as VALUE
:rtype: Dict{(string,string,string): Dict{str:float}}
'''
print("Entered calculateSimilarity")
similarityDict = dict() #the key is a tuple (clusterLabel1, clusterLabel2, clusterLayer)
clusterList = list()
for curLayerC in inputLayerDict.values():
for curCluster in curLayerC.cluster_Dict.values():
clusterList.append(curCluster)
#print(" Nr. of clusters: "+str(len(clusterList)))
#go through every combination of 2 clusters and calculate the similarity between them with respect to each layer
i=0
while( i < len(clusterList) ):
iCluster = clusterList[i]
j=i+1
while ( j<len(clusterList)):
jCluster = clusterList[j]
if (iCluster.cluster_layer == jCluster.cluster_layer): #calculate similarity only from the same layer
tuplekey = (clusterList[i].cluster_label,clusterList[j].cluster_label,iCluster.cluster_layer)
#### EUCLIDEAN DISTANCE /minMax
similarityDict[tuplekey]=calcEuclideanDist(i,j,clusterList)
#print("#### similarityDict i:"+str(i)+" j:"+str(j))
#print("#### "+str(similarityDict))
else:
j = len(clusterList) #clusterList is grouped by layer, so once the layers differ there are no further same-layer partners for iCluster
j+=1
i+=1
print("Finished calculateSimilarity")
return similarityDict
\ No newline at end of file
from db.entities.connected_node import NodeC
from db.entities.connected_cluster import ClusterC
from db.entities.connected_layer import LayerC
from typing import Dict
def sortFunctByNode(node):
try :
return node.uniqueID
except:
print(node.node_cluster)
print(node.node_layer)
print(node.uniqueID)
def calculateWeights(inputLayerDict) -> Dict[str,LayerC]:
''' Calculates the number of connections/weights between the clusters contained in the "inputLayerDict". Connections are made between clusters from DIFFERENT layers.
:param Dict{string: LayerC} inputLayerDict: Contains the associated LayerC and connected cluster objects. The dictionary KEY is the layer name, the VALUE is a LayerC object. The LayerC object has an attribute cluster_Dict which stores the clusters in that LayerC.
:returns: Dict{layername: LayerC}. Returns the inputLayerDict with the added connections in the attributes cluster_connClustDict and cluster_connNodesDict
:rtype: Dict{string: LayerC}
'''
#The input dictates which clusters are updated; however, all clusters included in it will be updated.
#If only a single cluster should be updated without considering the rest, a separate method would be needed.
print("Entered calculateWeights")
nodeList = []
for curLayer in inputLayerDict.values():
for curCluster in curLayer.cluster_Dict.values():
for curNode in curCluster.cluster_containedNodesDict.values():
nodeList.append(curNode)
#if curNode != None:
#if(curNode.uniqueID!= None):
#print(" Nr. of nodes: " + str(len(nodeList)))
nodeList.sort(key=sortFunctByNode)
i=0
while( i < len(nodeList) ):
iNode = nodeList[i]
j=i+1
while ( j<len(nodeList)):
jNode = nodeList[j]
#if there is a connection
#Compute a connection
if (iNode.node_layer != jNode.node_layer) and (iNode.uniqueID == jNode.uniqueID):
iNodeTuple = (iNode.uniqueID,iNode.node_cluster,iNode.node_layer)
jNodeTuple= (jNode.uniqueID,jNode.node_cluster,jNode.node_layer)
iNodeKey = frozenset(iNodeTuple)
jNodeKey = frozenset(jNodeTuple)
iClusterTuple = (iNode.node_cluster,iNode.node_layer)
jClusterTuple = (jNode.node_cluster,jNode.node_layer)
#iClusterKey = frozenset(iClusterTuple)
#jClusterKey = frozenset(jClusterTuple)
#Check if the old-nodes dict already has this node: if not, add it to the connection dictionary and to the old-nodes dict
# Layer . Cluster . OldNodesDict . does not contain the OTHER node
if (inputLayerDict[iNode.node_layer].cluster_Dict[iNode.node_cluster].cluster_connNodesDict.__contains__(jNodeKey) == False):
#add node j at cluster i
if (inputLayerDict[iNode.node_layer].cluster_Dict[iNode.node_cluster].cluster_connClustDict.__contains__(jClusterTuple)):
inputLayerDict[iNode.node_layer].cluster_Dict[iNode.node_cluster].cluster_connClustDict[jClusterTuple]+=1
else:
inputLayerDict[iNode.node_layer].cluster_Dict[iNode.node_cluster].cluster_connClustDict[jClusterTuple]=1
#add node to old nodes
inputLayerDict[iNode.node_layer].cluster_Dict[iNode.node_cluster].cluster_connNodesDict[jNodeKey]=jNode
if (inputLayerDict[jNode.node_layer].cluster_Dict[jNode.node_cluster].cluster_connNodesDict.__contains__(iNodeKey) == False):
#add node i at cluster j
if (inputLayerDict[jNode.node_layer].cluster_Dict[jNode.node_cluster].cluster_connClustDict.__contains__(iClusterTuple)):
inputLayerDict[jNode.node_layer].cluster_Dict[jNode.node_cluster].cluster_connClustDict[iClusterTuple]+=1
else:
inputLayerDict[jNode.node_layer].cluster_Dict[jNode.node_cluster].cluster_connClustDict[iClusterTuple]=1
#add node to old nodes
inputLayerDict[jNode.node_layer].cluster_Dict[jNode.node_cluster].cluster_connNodesDict[iNodeKey]=iNode
j+=1
i+=1
print("Finished calculateWeights")
#store weights in database?
return inputLayerDict
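The pairwise scan above accumulates connection counts node by node. As a cross-check, a minimal sketch of the intended result, assuming node ids are unique within a cluster: the weight between two clusters from different layers is the number of node ids they share.

```python
def weight_sketch(clusters):
    # clusters: list of (cluster_label, layer_name, set_of_node_ids)
    weights = {}
    for label_a, layer_a, nodes_a in clusters:
        for label_b, layer_b, nodes_b in clusters:
            if layer_a != layer_b:
                weights[((label_a, layer_a), (label_b, layer_b))] = len(nodes_a & nodes_b)
    return weights

clusters = [
    ("0", "Price_Layer", {"n1", "n2", "n3"}),
    ("5", "Destination_Layer", {"n2", "n3", "n4"}),
]
print(weight_sketch(clusters))  # both directions get weight 2 (shared ids n2 and n3)
```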
#Output functions for files and MongoDB
import json
import requests
import datetime
from processing.similarityFiles.miscFunctions import *
from db.repository import Repository
repo = Repository()
def outputFileLayerFunction(layerDict,limitNrNodes,limitNrCluster,runId):
''' Writes the layerDict data to a JSON file.
:param Dict{string: Layer} layerDict: Object which contains Data about the Layers, Clusters and Nodes
:param int limitNrNodes: How many nodes are contained in layerDict. Used in creating the name of the File
:param int limitNrCluster: How many clusters are contained in layerDict. Used in creating the name of the File
'''
layerJSON = convertLayerDictToJSON(layerDict,runId)
outputJSON = json.dumps(layerJSON, default=lambda o: o.__dict__, indent=4)
try:
with open('resultLayerDictN'+str(limitNrNodes)+'C'+str(limitNrCluster)+'.json', 'w') as outfile:
outfile.write(outputJSON)
except OSError:
print("Error occurred when writing the resultLayerDict file")
def outputFileSimilFunction(similarityDict,limitNrNodes,limitNrCluster,runId):
''' Writes the similarityDict data to a JSON file.
:param Dict{(cluster_label1, cluster_label2): Dict{layername: value}} similarityDict: Object which contains the computed similarity data between the clusters
:param int limitNrNodes: How many nodes are contained in layerDict. Used in creating the name of the File
:param int limitNrCluster: How many clusters are contained in layerDict. Used in creating the name of the File
'''
similJSON = convertSimilarityDictToJSON(similarityDict,runId)
outputJSON = json.dumps(similJSON, default=lambda o: o.__dict__, indent=4)
try:
with open('resultSimilarityDictN'+str(limitNrNodes)+'C'+str(limitNrCluster)+'.json', 'w') as outfile2:
outfile2.write(outputJSON)
except OSError:
print("Error occurred when writing the resultSimilarityDict file")
def outputFileTimeFunction(timelist,limitNrNodes,limitNrCluster,runId):
''' Writes execution time to a file.
:param List[datetime] timelist: Contains timestamps about the execution time of functions and the program.
:param int limitNrNodes: How many nodes are considered. Used in creating the name of the File
:param int limitNrCluster: How many clusters are considered. Used in creating the name of the File
'''
stringToWrite = "StartTime: "+ str(timelist[0])
stringToWrite += "\nFinishTime: " + str((timelist[3])) +"\n"
stringToWrite += "\nPopulateWithNewNodes: " + str((timelist[1]-timelist[0]).total_seconds())
stringToWrite += "\nCalculateWeights: " + str((timelist[2]-timelist[1]).total_seconds())
stringToWrite += "\nCalculateSimilarity: " + str((timelist[3]-timelist[2]).total_seconds())
stringToWrite += "\nTotalTime: " + str((timelist[3]-timelist[0]).total_seconds())
stringToWrite += "\nRunId: " +str(runId)
#aux = str(timelist[0]) + " :PopulateWithNewNodes\n"+ str(timelist[1]) + " :CalculateWeights\n" + str(timelist[2]) + " :CalculateSimilarity\n"+ str(timelist[3]) + " :Finish"
try:
with open('resultTimeExecN'+str(limitNrNodes)+'C'+str(limitNrCluster)+'.txt', 'w') as outfile3:
outfile3.write(stringToWrite)
except OSError:
print("Error occurred when writing the resultTimeExec file")
def outputMongoConnClustDict(inputDict,runId):
''' Stores connected_clusters in the database.
:param Dict() inputDict: Contains the data to insert
:param string runId: Id of the Run
'''
#inputDict["Timestamp"] = str(datetime.datetime.now())
add_conn_clusters(inputDict,runId)
def outputMongoSimilarity(inputDict,runId):
''' Stores cluster_similarity in the database.
:param Dict() inputDict: Contains the data to insert
:param string runId: Id of the Run
'''
add_similarity(inputDict,runId)
def add_connected_run():
'''
Inserts Run with current Time into the DB
:returns: Returns the _id of the connected_run entry in the DB
:rtype: string
'''
currentTime = datetime.datetime.now()
runDict = {"Datetime" : str(currentTime)}
inserted_result = repo.add_connected_run(runDict)
return str(inserted_result.inserted_id)
def add_conn_clusters(inputDict,runId):
''' Stores connected_clusters in the database.
:param Dict() inputDict: Contains the data to insert
:param string runId: Id of the Run
'''
outputJSON = convertLayerDictToJSON(inputDict,runId)
repo.add_connected_clusters(outputJSON)
def add_similarity(inputDict,runId):
''' Stores cluster_similarity in the database.
:param Dict() inputDict: Contains the data to insert
:param string runId: Id of the Run
'''
outputJSON = convertSimilarityDictToJSON(inputDict,runId)
repo.add_similarity(outputJSON)
\ No newline at end of file
#Misc util functions
import json
import requests
import datetime
def currentTime():
ts = datetime.datetime.now()
print(ts)
return ts
def totalNumberOfNodes(inputLayerDict):
''' Computes total number of nodes in the inputLayerDict.
:param Dict{string: Layer} inputLayerDict: Layer in which the Clusters in which the Nodes are stored
:returns: Returns nr of Nodes
:rtype: int
'''
nodeCount = 0
for curLayer in inputLayerDict.values():
for curCluster in curLayer.cluster_Dict.values():
nodeCount+=len(curCluster.cluster_containedNodesDict.values())
return nodeCount
def totalNumberOfClusters(inputLayerDict):
''' Computes total number of clusters in the inputLayerDict.
:param Dict{string: Layer} inputLayerDict: Layer in which the Clusters are stored
:returns: Returns nr of Clusters
:rtype: int
'''
clustCount = 0
for curLayer in inputLayerDict.values():
clustCount+= len(curLayer.cluster_Dict.values())
return clustCount
def convertLayerDictToJSON(layerDict, runId):
''' Converts a Layer dictionary to JSON format.
:param Dict{string: LayerC} layerDict: Object which contains data about the Layers, Clusters and Nodes
:param string runId: Id of the Run
:rtype: List[Dict], one entry per cluster
'''
outputJSON = []
for curLayer in layerDict.values():
for curCluster in curLayer.cluster_Dict.values():
outputJSON.append({
"cluster_label" : curCluster.cluster_label,
"cluster_layer" : curCluster.cluster_layer,
"cluster_runId" : runId,
"cluster_connClustDict" : changeTupleDictToDictList(curCluster.cluster_connClustDict),
"cluster_connNodesDict" : getFrozensetFromConnNodesDict(curCluster.cluster_connNodesDict), #Don
"cluster_containedNodesDict" : getNodeIdListFromContainedNodesDict(curCluster.cluster_containedNodesDict)
})
#outputJSON = json.dumps(outputJSON, default=lambda o: o.__dict__, indent=4)
return outputJSON
def changeTupleDictToDictList(inputDict):
'''
Helper function used to convert the connected-cluster weight dictionary ((cluster_label, cluster_layer) -> weight) into a JSON-serializable list
'''
outputList = []
for tupleKey in inputDict:
auxDict = dict()
auxDict["cluster_label"]= tupleKey[0]
auxDict["cluster_layer"]= tupleKey[1]
auxDict["connectionWeight"] = inputDict[tupleKey]
outputList.append(auxDict)
return outputList
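A short usage sketch of the helper just defined, using the values from the ConnectedClusterAux example in the Swagger spec:

```python
sample = {("-1", "FinishedTime_Layer"): 42}
print(changeTupleDictToDictList(sample))
# [{'cluster_label': '-1', 'cluster_layer': 'FinishedTime_Layer', 'connectionWeight': 42}]
```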
def getNodeIdListFromContainedNodesDict(inputDict):
'''
Helper function used to extract the node ids from the contained-nodes dictionary for JSON output
'''
output = []
for curNode in inputDict.values():
output.append(curNode.uniqueID)
return output
def getFrozensetFromConnNodesDict(inputDict):
'''
Helper function used to convert the connected-nodes dictionary into a JSON-serializable list of node dicts
'''
output = []
for curNode in inputDict.values():
auxDict = {}
auxDict["node_id"]= curNode.uniqueID
auxDict["node_cluster"] = curNode.node_cluster
auxDict["node_layer"] = curNode.node_layer
output.append(auxDict)
return output
def convertSimilarityDictToJSON(inputDict,runId):
''' Converts a Similarity Dictionary to JSON format for outputting to the DB.
:param Dict{} inputDict: Object which contains data about the computed similarities between Clusters
:param string runId: Id of the Run
:rtype: List[Dict]
'''
similList = []
for tupleKey in inputDict:
auxDict = dict()
auxDict["cluster1_label"]= tupleKey[0]
auxDict["cluster2_label"]= tupleKey[1]
auxDict["cluster_layer"] = tupleKey[2]
auxDict["similarityValues"] = inputDict[tupleKey]
auxDict["runId"] = runId
similList.append(auxDict)
similToJSON = similList
#outputJSON = json.dumps(similToJSON, default=lambda o: o.__dict__, indent=4)
return similToJSON
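For reference, a hedged sketch of what a single converted similarity entry looks like; the values mirror the Swagger ClusterSimilarityDictionary example.

```python
entry = {
    "cluster1_label": "0",
    "cluster2_label": "1",
    "cluster_layer": "Price_Layer",
    "similarityValues": {"StartingPoint_Layer": 39.0, "StartingTime_Layer": 99.0101004948485},
    "runId": "5efdc04ac43add0aba567d76",
}
```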
from flask import request, Response
from db.repository import Repository
repo = Repository()
def get_conn_clusters():
''' Gets connected_clusters from the database.
:returns: Returns connected cluster objects from the DB
:rtype: List[Dict]
'''
result = repo.get_connected_clusters()
if result is None or len(result) == 0:
print("MongoDb Get Error: Response 404")
return Response(status=404)
else:
return result
from flask import request, Response
from db.repository import Repository
from db.entities import connected_run
import datetime
#from db.entities import clusterset #REMOVE?
repo = Repository()
def get_connected_run():
''' Gets Run from the database.
:returns: Returns Run objects from the DB
:rtype: List[Dict{_id, Datetime}]
'''
result = repo.get_connected_run()
if result is None or len(result) == 0:
print("#### Response 404")
return Response(status=404)
else:
return result
from flask import request, Response
from db.repository import Repository
from processing.similarityFiles.miscFunctions import convertSimilarityDictToJSON
#from db.entities import clusterset #REMOVE?
repo = Repository()
def get_similarity(layer_name,batchNr):
''' Gets cluster_similarity from the database, in batches of 1000 entries.
:returns: Returns similarity objects from the DB
:rtype: List[Dict]
'''
batchSize = 1000
if int(batchNr) < 0:
print("Batch number needs to be a non-negative integer")
return Response(status=404)
skipNr = batchSize*int(batchNr)
#get_similarity(self,skipNr,batchSize, cluster_layer: str= None, run_id: str=None)
result = repo.get_similarity(skipNr, batchSize, layer_name)
if result is None or len(result) == 0:
print("MongoDb Get Error: Response 404")
return Response(status=404)
else:
return result
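The batching contract in numbers: batchSize is fixed at 1000, so batchNr=2 maps to documents 2000..2999 (skip = 2000, limit = 1000). A tiny illustration:

```python
batchSize = 1000
batchNr = "2"                      # query parameters arrive as strings
skipNr = batchSize * int(batchNr)
assert skipNr == 2000
```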
"""
for each cluster in the layer
for each other cluster from all the other layers
find the number of connexions
save them into a dictionary (ClusterID(from other layer) -> Nr of connections)
save all the dictionaries in a map? ( ClusterID1 -> dictionary1, ClusterID2 -> dicitonary2 )
have a map per layer? (Nr of maps = nr of layers)
Each cluster has a dictionary of connCluster-> nrConections
Each layer has a dictionary of clusters -> dictionaries of nodes/connections
"""
import os
import sys
import math
import datetime
from typing import Dict
##################AUX
modules_path = '../../../modules/'
if os.path.exists(modules_path):
sys.path.insert(1, modules_path)
#### TO BE DELETED #### ^
from db.entities.connected_node import NodeC
from db.entities.connected_cluster import ClusterC
from db.entities.connected_layer import LayerC
from processing.similarityFiles.dataInput import *
from processing.similarityFiles.calculateWeights import *
from processing.similarityFiles.calculateSimilarity import *
from processing.similarityFiles.miscFunctions import *
from processing.similarityFiles.dataOutput import *
def main():
print("\nEntered Main")
outputToFileFLAG = False
timelist = []
timelist.append(currentTime())#starting time
"""
Current Layers
Price_Layer
FinishedTime_Layer
Destination_Layer
StartingPoint_Layer
Reputation_Layer
StartingTime_Layer
User_Layer
"""
layerNameList = ["Price_Layer","FinishedTime_Layer","Destination_Layer"] #Get it from somewhere else?
limitNrCluster = -1 #per Layer; values < 0 mean no limit
limitNrNodes = -1 #per Layer; values < 0 mean no limit
layerDict = getClusterDataFromMongo(layerNameList,limitNrCluster,limitNrNodes)
#URLlist = None
#layerDict = getClusterDataFromSwagger(limitNrCluster,limitNrNodes, URLlist) #for Swagger, Change URLs inside the function for different input Data or provide a list with URLS
totalNodes = totalNumberOfNodes(layerDict)
print("Nr. of nodes: " +str(totalNodes))
totalClusters = totalNumberOfClusters(layerDict)
print("Nr. of clusters: " + str(totalClusters))
timelist.append(currentTime())
#calculates the weights between the clusters (weight == number of connections) #return is displayed in outputLayerFunction
layerDict = calculateWeights(layerDict)
timelist.append(currentTime())
#calculates the similarity between the clusters #returns dictionary[ tuple(cluster_label1,cluster_label2),
# listOfSimilarity(layer1,layer2,layer3) ]
similarityDict = calculateSimilarity(layerDict)
timelist.append(currentTime()) #Finishing time
#Write to files
runId = add_connected_run()
if (outputToFileFLAG == True):
print("Outputing data")
outputFileLayerFunction(layerDict,totalNodes,totalClusters,runId)
outputFileSimilFunction(similarityDict,totalNodes,totalClusters,runId)
outputFileTimeFunction(timelist,totalNodes,totalClusters,runId)
#Output to DB
outputMongoConnClustDict(layerDict,runId)
outputMongoSimilarity(similarityDict,runId)
#Currently not used in the calculation of connections/similarity, developed for possible future uses
#connClustersFromMongo = getConnClusterDataFromMongo()
#similarityDictFromMongo = calculateSimilarity(connClustersFromMongo)
#similarityArrFromMongo = getSimilarityDataFromMongo("Price_Layer") # only 220 similarities, but there are about 20 clusters total
#similarityArrFromMongo = getSimilarityDataFromMongo("Destination_Layer") # ~2.500k similarities
#similarityArrFromMongo = getSimilarityDataFromMongo("FinishedTime_Layer")# should have the rest of similarities => 15.000k
#connectedRunFromMongo = getConnectedRunDataFromMongo()
print("FINISHED")
return
##########START##########
main()
#########FINISH##########
import unittest
import sys
-sys.path.insert(1, '../')
+for path in ['../', './']:
+    sys.path.insert(1, path)
# python -m unittest discover
from db.entities import Cluster
...
...@@ -17,7 +17,7 @@ class MongoRepositoryBase:
def insert_entry(self, collection_name, content: dict):
collection = self._database[collection_name]
-collection.insert_one(content)
+return collection.insert_one(content)
def insert_many(self, collection_name, content: list):
collection = self._database[collection_name]
...
...@@ -18,6 +18,6 @@ SEMANTIC_LINKING_DB_PORT = 27017
## Role Stage Discovery
ROLESTAGE_DISCOVERY_HOSTNAME = 'role-stage-discovery'
-ROLESTAGE_DISCOVERY_REST_PORT = 80
+ROLESTAGE_DISCOVERY_REST_PORT = 30103
-ROLESTAGE_DISCOVERY_DB_HOSTNAME = f'{ROLESTAGE_DISCOVERY_HOSTNAME}-db'
+ROLESTAGE_DISCOVERY_DB_HOSTNAME = f'articonf1.itec.aau.at'
-ROLESTAGE_DISCOVERY_DB_PORT = 27017
+ROLESTAGE_DISCOVERY_DB_PORT = 30104
\ No newline at end of file