Commit eab8c906 authored by Bogdan

Created Test (Assertion) + other minor changes

parent 3b390ffb
......@@ -4,3 +4,5 @@
*.log
**/env
**/venv
src/data-hub/role-stage-discovery-microservice/app/resultSimilarityDictN14992C221.json
......@@ -328,36 +328,51 @@ definitions:
type: string
example: "5efdc04ac43add0aba567d76"
cluster_containedNodesDict:
$ref: "#/definitions/ConnectedNode"
type: array
items:
type: string
example: "2696718d7a33ab3dbf28e9c88411afcfe9a933a45e57ec9159bc0668543f1568"
cluster_connNodesDict:
$ref: "#/definitions/ConnectedNode"
type: array
items:
$ref: "#/definitions/ConnectedNode"
cluster_connClustDict:
type: object
additionalProperties:
type: number
example:
"cluster_label": nrOfConnectedNodes
#"-1": 42
"0": 39
"6969": 1
type: array
items:
$ref: "#/definitions/ConnectedClusterAux"
#not used, should be removed?
#cluster_connectionsNr
ConnectedClusterAux:
type: object
properties:
cluster_label:
type: string
example: "-1"
cluster_layer:
type: string
example: "FinishedTime_Layer"
connectionWeight:
type: number
example: 42
ConnectedNode:
type: object
properties:
cluster_label:
node_id:
type: string
node_layer:
node_cluster:
type: string
uniqueID:
node_layer:
type: string
example:
"cluster_label": "2230"
"node_id": "27a08ed0facc7d68a0818c7695dad391cf48d6095e57ec9159bc0668543f159b"
"node_cluster": "2230"
"node_layer": "Destination_Layer"
"uniqueID": "a95075f5042b1b27060080156d87"
#not used, should be removed?
#finished_time
#latitude_Destination
......@@ -373,18 +388,16 @@ definitions:
ClusterSimilarityDictionary:
properties:
clusterTuple:
type: array
items:
type: string
minItems: 2
maxItems: 2
example: [
#cluster_label1
0,
#cluster_label2
319
]
cluster1_label:
type: string
example: "0"
cluster2_label:
type: string
example: "1"
cluster_layer:
type: string
example: "Price_layer"
similarityValues:
type: object
additionalProperties:
......
......@@ -7,7 +7,7 @@ class ClusterC:
:param cluster_layer: The layer name of the 'source' cluster
:param cluster_runId: The run which calculated the connections
:param cluster_containedNodesDict: Ids of nodes contained in the 'source' cluster
:param cluster_connNodesDict: Ids of nodes contained in 'dest' clusters, where the 'dest' cluster is uniquely identifiable by layer name and cluster label
:param cluster_connNodesDict: Node Objects contained in 'dest' clusters, where the 'dest' cluster is uniquely identifiable by layer name and cluster label
:param cluster_connClustDict: Layer name, cluster label and weight for each 'dest' cluster
'''
......@@ -16,7 +16,6 @@ class ClusterC:
self.cluster_label = cluster_label
self.cluster_layer = cluster_layer
self.cluster_runId = cluster_runId
self.cluster_containedNodesDict = cluster_containedNodesDict ###RENAME TO curClNodesDict #Keys are frozensets(touples) uniqueID and cluster #
self.cluster_connNodesDict = cluster_connNodesDict #Keys are frozensets(touples) uniqueID and cluster #problem if you remove newNodes and oldNodes lists.. there may be duplicates
self.cluster_connClustDict = cluster_connClustDict #dictionary: layer -> (dict2: cluster_label -> nrOfConnections ) OR dictionary: cluster_label -> nrOfConnections
#cluster_connClustDict ------> look at both newNodes and oldNodes
self.cluster_containedNodesDict = cluster_containedNodesDict #Keys are frozensets(touples) == frozenset(uniqueID, cluster and layer) Value is the Node UniqueId
self.cluster_connNodesDict = cluster_connNodesDict #Keys are frozensets(touples) (uniqueID:str,node_cluster:str,node_layer:str) Values are NodeC Objects
self.cluster_connClustDict = cluster_connClustDict #dictionary: (dict[(cluster_label,clusterlayer)] -> nrOfConnections/weightOfTheConnection )
class LayerC:
    def __init__(self, layer_name: str, cluster_Dict):
        '''
        This class represents the Layer which contains the connected clusters.

        :param layer_name: The name of the layer which contains the clusters
        :param cluster_Dict: The connected clusters contained in this layer
        '''
        # Only one constructor is kept: the earlier, undocumented duplicate was
        # shadowed by this definition and therefore never executed.
        self.layer_name = layer_name
        self.cluster_Dict = cluster_Dict  # Dict[cluster_label] --> ClusterC object
\ No newline at end of file
class NodeC:
    '''
    This class represents the Node data contained in a Cluster.

    :param node_cluster: The cluster_label of the connected cluster which contains the node.
    :param node_layer: The layer name which contains the node.
    :param uniqueID: Id of the node. Only unique inside a single cluster. NOT unique between multiple clusters/layers.
    '''

    def __init__(self, node_cluster, node_layer, uniqueID):
        # The attribute is named node_cluster (not cluster_label); the stale
        # constructor that still set cluster_label was shadowed and is removed.
        self.node_cluster = node_cluster  # str
        self.node_layer = node_layer  # str
        self.uniqueID = uniqueID  # str
\ No newline at end of file
from datetime import datetime
class ConnectedRun:
    '''
    Record of one execution of the cluster-connecting and similarity calculation.

    :param run_id: The MongoDB _id of the Run, used to uniquely identify it.
    :param timeOfExec: Datetime object holding the moment the run was finished.
    '''

    def __init__(self, run_id, timeOfExec):
        # Store both values unchanged; no validation or conversion is performed.
        self.run_id, self.timeOfExec = run_id, timeOfExec
\ No newline at end of file
......@@ -14,6 +14,7 @@ LOGGER = logging.getLogger(__name__)
#############################
import connexion
# load swagger config
app = connexion.App(__name__, specification_dir='configs/')
app.add_api('swagger.yml')
......
......@@ -29,13 +29,18 @@ def minMaxFunction(iIndex,jIndex,clusterList) -> Dict[str,int]:
#calculate th
for curCluster in clusterList: #jCluster.cluster_layer == iCluster.cluster_layer, so i only compare to one
iClusterTuple = (iCluster.cluster_label,iCluster.cluster_layer)
jClusterTuple = (jCluster.cluster_label,jCluster.cluster_layer)
#iClusterKey = frozenset(iClusterTuple)
#jClusterKey = frozenset(jClusterTuple)
curLayer = curCluster.cluster_layer
if(( curLayer != iCluster.cluster_layer)
and ( curCluster.cluster_connClustDict.__contains__(iCluster.cluster_label))
and ( curCluster.cluster_connClustDict.__contains__(jCluster.cluster_label))):
and ( curCluster.cluster_connClustDict.__contains__(iClusterTuple))
and ( curCluster.cluster_connClustDict.__contains__(jClusterTuple))):
# min part
curMin = min(curCluster.cluster_connClustDict[iCluster.cluster_label],curCluster.cluster_connClustDict[jCluster.cluster_label])
curMin = min(curCluster.cluster_connClustDict[iClusterTuple],curCluster.cluster_connClustDict[jClusterTuple])
if(outputDict.__contains__(curLayer) == False):
outputDict[curLayer]= curMin
else: # max part
......@@ -68,28 +73,32 @@ def calcEuclideanDist(iIndex,jIndex,clusterList) -> Dict[str,float]:
#calculate the distance //paralelizable
for curCluster in clusterList: #jCluster.cluster_layer == iCluster.cluster_layer, so i only compare to one
iClusterTuple = (iCluster.cluster_label,iCluster.cluster_layer)
jClusterTuple = (jCluster.cluster_label,jCluster.cluster_layer)
#iClusterKey = frozenset(iClusterTuple)
#jClusterKey = frozenset(jClusterTuple)
curLayer = curCluster.cluster_layer
#considering only clusters from other layers for distance calc
if( curLayer != iCluster.cluster_layer):
######BUUUG, WHAT IF THEY DON'T SHARE A CONNECTION?
###### if in a layer both cluster don't have a connection --> distance of 0. Identical in regard to that layer. correct or false?
iVal = 0
jVal = 0
connectedClusters = False
if(curCluster.cluster_connClustDict.__contains__(iCluster.cluster_label)):
iVal = curCluster.cluster_connClustDict[iCluster.cluster_label]
if(curCluster.cluster_connClustDict.__contains__(iClusterTuple)):
iVal = curCluster.cluster_connClustDict[iClusterTuple]
connectedClusters = True
if(curCluster.cluster_connClustDict.__contains__(jCluster.cluster_label)):
jVal = curCluster.cluster_connClustDict[jCluster.cluster_label]
if(curCluster.cluster_connClustDict.__contains__(jClusterTuple)):
jVal = curCluster.cluster_connClustDict[jClusterTuple]
connectedClusters = True
if (connectedClusters == False):
#clusters aren't connected => assign the max int value if there are no prior elements in list
if(outputDict.__contains__(curLayer) == False):
outputDict[curLayer]= 2147483647 #notConnected
outputDict[curLayer]= 2147483647 #notConnected to that particular layer at all
else:
#clusters ARE connected => add the squares part of the euclid distance to the value of the similarity
if(outputDict.__contains__(curLayer) == False):
......@@ -140,10 +149,10 @@ def calculateSimilarity(inputLayerDict):
while ( j<len(clusterList)):
jCluster = clusterList[j]
if (iCluster.cluster_layer == jCluster.cluster_layer): #calculate similarity only from the same layer
tuplekey = (clusterList[i].cluster_label,clusterList[j].cluster_label)
key = frozenset(tuplekey)
tuplekey = (clusterList[i].cluster_label,clusterList[j].cluster_label,iCluster.cluster_layer)
#### EUCLIDEAN DISTANCE /minMax
similarityDict[key]=calcEuclideanDist(i,j,clusterList)
similarityDict[tuplekey]=calcEuclideanDist(i,j,clusterList)
#print("#### similarityDict i:"+str(i)+" j:"+str(j))
#print("#### "+str(similarityDict))
......
......@@ -7,7 +7,7 @@ def sortFunctByNode(node):
try :
return node.uniqueID
except:
print(node.cluster_label)
print(node.node_cluster)
print(node.node_layer)
print(node.uniqueID)
......@@ -55,35 +55,37 @@ def calculateWeights(inputLayerDict) -> Dict[str,LayerC]:
#Compute a connection
if (iNode.node_layer != jNode.node_layer) and (iNode.uniqueID == jNode.uniqueID):
iOldTuple = (iNode.uniqueID,iNode.cluster_label)
jOldTuple= (jNode.uniqueID,jNode.cluster_label)
iOldKey = frozenset(iOldTuple)
jOldKey = frozenset(jOldTuple)
#iForeignKey =
#jForeignKey =
iNodeTuple = (iNode.uniqueID,iNode.node_cluster,iNode.node_layer)
jNodeTuple= (jNode.uniqueID,jNode.node_cluster,jNode.node_layer)
iNodeKey = frozenset(iNodeTuple)
jNodeKey = frozenset(jNodeTuple)
iClusterTuple = (iNode.node_cluster,iNode.node_layer)
jClusterTuple = (jNode.node_cluster,jNode.node_layer)
#iClusterKey = frozenset(iClusterTuple)
#jClusterKey = frozenset(jClusterTuple)
#Check if old node dicts has this node: if not add to ConnDictionary and to OldNodesDict
# Layer . Cluster . OldNodesDict . Does not contain the OTHER node
if (inputLayerDict[iNode.node_layer].cluster_Dict[iNode.cluster_label].cluster_connNodesDict.__contains__(jOldKey) == False):
if (inputLayerDict[iNode.node_layer].cluster_Dict[iNode.node_cluster].cluster_connNodesDict.__contains__(jNodeKey) == False):
#add node j at cluster i
if (inputLayerDict[iNode.node_layer].cluster_Dict[iNode.cluster_label].cluster_connClustDict.__contains__(jNode.cluster_label)):
inputLayerDict[iNode.node_layer].cluster_Dict[iNode.cluster_label].cluster_connClustDict[jNode.cluster_label]+=1
if (inputLayerDict[iNode.node_layer].cluster_Dict[iNode.node_cluster].cluster_connClustDict.__contains__(jClusterTuple)):
inputLayerDict[iNode.node_layer].cluster_Dict[iNode.node_cluster].cluster_connClustDict[jClusterTuple]+=1
else:
inputLayerDict[iNode.node_layer].cluster_Dict[iNode.cluster_label].cluster_connClustDict[jNode.cluster_label]=1
inputLayerDict[iNode.node_layer].cluster_Dict[iNode.node_cluster].cluster_connClustDict[jClusterTuple]=1
#add node to old nodes
inputLayerDict[iNode.node_layer].cluster_Dict[iNode.cluster_label].cluster_connNodesDict[jOldKey]=jNode
inputLayerDict[iNode.node_layer].cluster_Dict[iNode.node_cluster].cluster_connNodesDict[jNodeKey]=jNode
if (inputLayerDict[jNode.node_layer].cluster_Dict[jNode.cluster_label].cluster_connNodesDict.__contains__(iOldKey) == False):
if (inputLayerDict[jNode.node_layer].cluster_Dict[jNode.node_cluster].cluster_connNodesDict.__contains__(iNodeKey) == False):
#add node i at cluster j
if (inputLayerDict[jNode.node_layer].cluster_Dict[jNode.cluster_label].cluster_connClustDict.__contains__(iNode.cluster_label)):
inputLayerDict[jNode.node_layer].cluster_Dict[jNode.cluster_label].cluster_connClustDict[iNode.cluster_label]+=1
if (inputLayerDict[jNode.node_layer].cluster_Dict[jNode.node_cluster].cluster_connClustDict.__contains__(iClusterTuple)):
inputLayerDict[jNode.node_layer].cluster_Dict[jNode.node_cluster].cluster_connClustDict[iClusterTuple]+=1
else:
inputLayerDict[jNode.node_layer].cluster_Dict[jNode.cluster_label].cluster_connClustDict[iNode.cluster_label]=1
inputLayerDict[jNode.node_layer].cluster_Dict[jNode.node_cluster].cluster_connClustDict[iClusterTuple]=1
#add node to old nodes
inputLayerDict[jNode.node_layer].cluster_Dict[jNode.cluster_label].cluster_connNodesDict[iOldKey]=iNode
inputLayerDict[jNode.node_layer].cluster_Dict[jNode.node_cluster].cluster_connNodesDict[iNodeKey]=iNode
j+=1
i+=1
......
......@@ -50,42 +50,6 @@ def convertLayerDictToJSON(layerDict, runId):
:rtype: Dict{string: [Cluster1, Cluster2, ...]}
'''
'''
{
layer1 : {
[
{
cluster_label1 : 0123400,
cluster_layer: layer1,
"cluster_connClustDict": {
"0123456": 98
"1234567": 12
},
cluster_containedNodesDict : {
[
abcd,
sgre,
dgre,
ddhr,
yyrh
]
}
},
{
},
{
}
]
},
layer2 : {
}
}
'''
outputJSON = []
for curLayer in layerDict.values():
......@@ -94,52 +58,74 @@ def convertLayerDictToJSON(layerDict, runId):
"cluster_label" : curCluster.cluster_label,
"cluster_layer" : curCluster.cluster_layer,
"cluster_runId" : runId,
"cluster_connClustDict" : changeDictKeysToString(curCluster.cluster_connClustDict),
"cluster_connNodesDict" : getFrozensetFromConnNodesDict(curCluster.cluster_connNodesDict),
"cluster_containedNodesDict" : getNodeIdListFromContainedNodesDict(curCluster.cluster_containedNodesDict),
"cluster_connClustDict" : changeTupleDictToDictList(curCluster.cluster_connClustDict),
"cluster_connNodesDict" : getFrozensetFromConnNodesDict(curCluster.cluster_connNodesDict), #Don
"cluster_containedNodesDict" : getNodeIdListFromContainedNodesDict(curCluster.cluster_containedNodesDict)
})
#outputJSON = json.dumps(outputJSON, default=lambda o: o.__dict__, indent=4)
return outputJSON
def changeDictKeysToString(inputDict):
    '''
    Helper function used to convert a dictionary into JSON format by turning every key into a string.

    Kept for callers that still use the older string-keyed output.

    :param inputDict: Dictionary with arbitrary (possibly non-string) keys
    :return: New dictionary with str(key) -> value
    :rtype: Dict[str, Any]
    '''
    return {str(key): value for key, value in inputDict.items()}


def changeTupleDictToDictList(inputDict):
    '''
    Helper function used to convert a tuple-keyed connection dictionary into JSON format.

    :param inputDict: Dictionary keyed by (cluster_label, cluster_layer) tuples, with the connection weight as value
    :return: One dict per entry with the keys "cluster_label", "cluster_layer" and "connectionWeight"
    :rtype: List[Dict]
    '''
    # Return the list form only; the previous str-keyed dict conversion lives
    # in changeDictKeysToString and must not pre-empt this return value.
    return [
        {
            "cluster_label": tupleKey[0],
            "cluster_layer": tupleKey[1],
            "connectionWeight": weight,
        }
        for tupleKey, weight in inputDict.items()
    ]
def getNodeIdListFromContainedNodesDict(inputDict):
    '''
    Helper function used to convert the contained-nodes dictionary into JSON format.

    :param inputDict: Dictionary whose values expose a uniqueID attribute
    :return: The uniqueID of every value, in the dictionary's iteration order
    :rtype: List
    '''
    return [containedNode.uniqueID for containedNode in inputDict.values()]
def getFrozensetFromConnNodesDict(inputDict):
    '''
    Helper function used to convert the connected-nodes dictionary into JSON format.

    :param inputDict: Dictionary whose values are node objects exposing uniqueID, node_cluster and node_layer
    :return: One dict per node with the keys "node_id", "node_cluster" and "node_layer"
    :rtype: List[Dict]
    '''
    # Read node_cluster only: nodes no longer carry a cluster_label attribute,
    # so touching it would raise AttributeError before the dict is built.
    return [
        {
            "node_id": curNode.uniqueID,
            "node_cluster": curNode.node_cluster,
            "node_layer": curNode.node_layer,
        }
        for curNode in inputDict.values()
    ]
def convertSimilarityDictToJSON(inputDict,runId):
''' Converts a Similarity Dictionary to JSON format. For outputting to DB
:param Dict{} similarityDict: Object which contains Data about the Computed similarities between Clusters
:rtype: List[Dicts]
'''
similList = []
for compositeKey in inputDict:
frozensetString =list()
#key is a tuple of cluster_labels
for key in compositeKey:
frozensetString.append(key)
similList.append({
"clusterTuple" : frozensetString,
"similarityValues" : inputDict[compositeKey],
"runId": runId
})
for tupleKey in inputDict:
auxDict = dict()
auxDict["cluster1_label"]= tupleKey[0]
auxDict["cluster2_label"]= tupleKey[1]
auxDict["cluster_layer"] = tupleKey[2]
auxDict["similarityValues"] = inputDict[tupleKey]
auxDict["runId"] = runId
similList.append(auxDict)
similToJSON = similList
#outputJSON = json.dumps(similToJSON, default=lambda o: o.__dict__, indent=4)
......
StartTime: 2020-07-02 12:05:47.067975
FinishTime: 2020-07-02 12:05:54.561853
StartTime: 2020-07-07 16:55:42.418309
FinishTime: 2020-07-07 16:55:49.746628
PopulateWithNewNodes: 2.495718
CalculateWeights: 4.590413
CalculateSimilarity: 0.407747
TotalTime: 7.493878
\ No newline at end of file
PopulateWithNewNodes: 2.321926
CalculateWeights: 4.499367
CalculateSimilarity: 0.507026
TotalTime: 7.328319
RunId: 5f048cf587e0ee319fa894ed
\ No newline at end of file
StartTime: 2020-07-06 16:16:11.525479
FinishTime: 2020-07-06 16:16:18.213974
StartTime: 2020-07-07 14:57:32.942331
FinishTime: 2020-07-07 14:57:39.489324
PopulateWithNewNodes: 2.206513
CalculateWeights: 4.435216
CalculateSimilarity: 0.046766
TotalTime: 6.688495
RunId: 5f033232366be85ec1afca7b
\ No newline at end of file
PopulateWithNewNodes: 2.102823
CalculateWeights: 4.382948
CalculateSimilarity: 0.061222
TotalTime: 6.546993
RunId: 5f0471438b27390711e31c70
\ No newline at end of file
......@@ -30,12 +30,17 @@ from processing.similarityFiles.calculateWeights import *
from processing.similarityFiles.calculateSimilarity import *
from processing.similarityFiles.miscFunctions import *
from processing.similarityFiles.dataOutput import *
from routes.connRun import connected_run
#####TEST ONLY#####
from processing.similarityFiles.testSimilarity import *
def main():
print("\nEntered Main")
outputToFileFLAG = False
timelist = []
timelist.append(currentTime())#starting time
......@@ -54,8 +59,9 @@ def main():
limitNrNodes = 1000 #per Layer
layerDict = getClusterDataFromMongo(layerNameList,limitNrCluster,limitNrNodes)
#layerDict = getClusterDataFromSwagger(limitNrCluster,limitNrNodes) #for Swagger, Change URLs inside the function for different input Data
#URLlist = None
#layerDict = getClusterDataFromSwagger(limitNrCluster,limitNrNodes, URLlist) #for Swagger, Change URLs inside the function for different input Data or provide a list with URLS
totalNodes = totalNumberOfNodes(layerDict)
print("Nr. of nodes: " +str(totalNodes))
totalClusters = totalNumberOfClusters(layerDict)
......@@ -76,25 +82,34 @@ def main():
#Write to files
runId = add_connected_run()
print("Outputing data")
outputFileLayerFunction(layerDict,totalNodes,totalClusters,runId)
outputFileSimilFunction(similarityDict,totalNodes,totalClusters,runId)
outputFileTimeFunction(timelist,totalNodes,totalClusters,runId)
if (outputToFileFLAG == True):
print("Outputing data")
outputFileLayerFunction(layerDict,totalNodes,totalClusters,runId)
outputFileSimilFunction(similarityDict,totalNodes,totalClusters,runId)
outputFileTimeFunction(timelist,totalNodes,totalClusters,runId)
#Output to DB
outputMongoConnClustDict(layerDict,runId)
outputMongoSimilarity(similarityDict,runId)
#Currently not used, developed for possible future uses
#Currently not used in the calculation of connections/similarity, developed for possible future uses
connClustersFromMongo = getConnClusterDataFromMongo()
similarityDictFromMongo = calculateSimilarity(connClustersFromMongo)
similarityArrFromMongo = getSimilarityDataFromMongo()
connectedRunFromMongo = getConnectedRunDataFromMongo()
print("FINISHED")
return
def test():
    '''
    Entry point for the test run: delegates to testInputData().

    NOTE(review): testInputData is presumably supplied by the
    processing.similarityFiles.testSimilarity star import - verify.
    '''
    testInputData()
##########START##########
# The full pipeline call is disabled; only the test entry point runs.
#main()
test()
#########FINISH##########
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment