Added Batch feature for Similarity GET

a039212b · Bogdan · eab8c906 · a039212b · a039212b · a039212b
Commit a039212b authored Jul 14, 2020 by Bogdan
11 changed files
--- a/src/data-hub/role-stage-discovery-microservice/app/configs/swagger.yml
+++ b/src/data-hub/role-stage-discovery-microservice/app/configs/swagger.yml
@@ -197,8 +197,20 @@ paths:
      operationId: "routes.similarity.get_similarity"
      tags:
          - "Similarity"
-      summary: "Get data of the similarity between clusters"
-      description: "Returns a dictionary where the key is a tuple of cluster_labels (i.e. [0,319]) and the value is the computed similarity between 2 clusters in the tuple, in regard to each layer in the input. \n Note: the tuple clusters have the same layer and the computed similarity is in regard to clusters from OTHER layers."
+      summary: "Get data of the similarity between clusters."
+      parameters: 
+          - name: "layer_name"
+            in: "query"
+            description: "Name of the layer"
+            required: true
+            type: "string"
+          - name: "batchNr"
+            in: "query"
+            description: "Batch number (starting from 0)"
+            required: true
+            type: "integer"
+          
+      description: "Data is returned in batches of size 1000. Returns a dictionary where the key is a tuple of cluster_labels (i.e. [0,319]) and the value is the computed similarity between 2 clusters in the tuple, in regard to each layer in the input. \n Note: the tuple clusters have the same layer and the computed similarity is in regard to clusters from OTHER layers."
      responses:
        200:
          description: "Successful operation"
@@ -396,7 +408,7 @@ definitions:
        example: "1"
      cluster_layer:
        type: string
-        example: "Price_layer"
+        example: "Price_Layer"
      
      similarityValues:
        type: object

--- a/src/data-hub/role-stage-discovery-microservice/app/db/repository.py
+++ b/src/data-hub/role-stage-discovery-microservice/app/db/repository.py
@@ -115,17 +115,28 @@ class Repository(MongoRepositoryBase):
        #super().insert_entry(self._connected_clusters_collection, outputJSON)
        return result
    
-    def get_similarity(self, run_id: str=None):
+    def get_similarity(self,skipNr,batchSize, cluster_layer: str= None, run_id: str=None):
        ''' Get Similarity Data from DB '''
+        
        if (run_id == None):
-            entries = super().get_entries(self._similarity_collection, projection={'_id': 0})
+            if(cluster_layer == None):
+                entries = super().get_entries(self._similarity_collection, projection={'_id': 0})
+            else:
+                entries = super().get_entries(self._similarity_collection, selection={'cluster_layer' : cluster_layer}, projection={'_id': 0})
        else:
-            entries = super().get_entries(self._similarity_collection, selection={'runId' : run_id}, projection={'_id': 0})
+            if(cluster_layer == None):
+                entries = super().get_entries(self._similarity_collection, selection={'runId' : run_id}, projection={'_id': 0})
+            else:
+                entries = super().get_entries(self._similarity_collection, selection={'cluster_layer' : cluster_layer, 'runId' : run_id}, projection={'_id': 0})
            
+        # Sort by descending _id, then return only the requested window (skip/limit)
+        return list(entries.sort([('_id', -1)]).skip(skipNr).limit(batchSize))
+        """
        output = []
        for e in entries:
            output.append(e)
        return output
+        """


 #endregion

--- a/src/data-hub/role-stage-discovery-microservice/app/processing/similarityFiles/dataInput.py
+++ b/src/data-hub/role-stage-discovery-microservice/app/processing/similarityFiles/dataInput.py
@@ -186,7 +186,7 @@ def getConnClusterDataFromMongo():

    return outputDict

-def getSimilarityDataFromMongo():
+def getSimilarityDataFromMongo(cluster_layer: str= None, batchSize: int=1000, run_id: str=None):
    '''
    Gets the computed Similarity between clusters data from the MongoDB database. The data is found in the collection "similarity"

@@ -194,7 +194,15 @@ def getSimilarityDataFromMongo():

    :rtype: Dict
    '''  
-    result = repo.get_similarity()
+    skipNr = 0
+    result = []
+    batchResult = repo.get_similarity(skipNr,batchSize,cluster_layer,run_id)    
+    result.extend(batchResult)
+    
+    while len(batchResult) == batchSize: 
+        skipNr += batchSize
+        batchResult = repo.get_similarity(skipNr,batchSize,cluster_layer,run_id)
+        result.extend(batchResult)
    return result

 def getConnectedRunDataFromMongo():

--- a/src/data-hub/role-stage-discovery-microservice/app/resultLayerDictN2999C121.json
+++ b/src/data-hub/role-stage-discovery-microservice/app/resultLayerDictN2999C121.json
--- a/src/data-hub/role-stage-discovery-microservice/app/resultLayerDictN2999C60.json
+++ b/src/data-hub/role-stage-discovery-microservice/app/resultLayerDictN2999C60.json
--- a/src/data-hub/role-stage-discovery-microservice/app/resultSimilarityDictN2999C121.json
+++ b/src/data-hub/role-stage-discovery-microservice/app/resultSimilarityDictN2999C121.json
--- a/src/data-hub/role-stage-discovery-microservice/app/resultSimilarityDictN2999C60.json
+++ b/src/data-hub/role-stage-discovery-microservice/app/resultSimilarityDictN2999C60.json
--- a/src/data-hub/role-stage-discovery-microservice/app/resultTimeExecN2999C121.txt
+++ b/src/data-hub/role-stage-discovery-microservice/app/resultTimeExecN2999C121.txt
-StartTime: 2020-07-07 16:55:42.418309
-FinishTime: 2020-07-07 16:55:49.746628
-
-PopulateWithNewNodes: 2.321926
-CalculateWeights: 4.499367
-CalculateSimilarity: 0.507026
-TotalTime: 7.328319
-RunId: 5f048cf587e0ee319fa894ed
\ No newline at end of file
--- a/src/data-hub/role-stage-discovery-microservice/app/resultTimeExecN2999C60.txt
+++ b/src/data-hub/role-stage-discovery-microservice/app/resultTimeExecN2999C60.txt
-StartTime: 2020-07-07 14:57:32.942331
-FinishTime: 2020-07-07 14:57:39.489324
-
-PopulateWithNewNodes: 2.102823
-CalculateWeights: 4.382948
-CalculateSimilarity: 0.061222
-TotalTime: 6.546993
-RunId: 5f0471438b27390711e31c70
\ No newline at end of file
--- a/src/data-hub/role-stage-discovery-microservice/app/routes/similarity.py
+++ b/src/data-hub/role-stage-discovery-microservice/app/routes/similarity.py
@@ -5,13 +5,19 @@ from processing.similarityFiles.miscFunctions import convertSimilarityDictToJSON

 repo = Repository()

-def get_similarity(): 
+def get_similarity(layer_name,batchNr): 
    ''' Gets cluster_similarity from the database.

        :returns: Returns similarity objects from the DB
        :rtype: Dict
    '''  
-    result = repo.get_similarity()
+    batchSize = 1000
+    if int(batchNr)<0:
+        print("Batch number needs to be a positive integer")
+        return Response(status=404)
+    skipNr = batchSize*int(batchNr)
+    # Repository signature for reference: get_similarity(skipNr, batchSize, cluster_layer=None, run_id=None)
+    result = repo.get_similarity(skipNr, batchSize, layer_name)
    if result is None or len(result) == 0:        
        print("MongoDb Get Error: Response 404")
        return Response(status=404)

--- a/src/data-hub/role-stage-discovery-microservice/app/similarityMain.py
+++ b/src/data-hub/role-stage-discovery-microservice/app/similarityMain.py
@@ -55,9 +55,10 @@ def main():
        User_Layer
    """
    layerNameList = ["Price_Layer","FinishedTime_Layer","Destination_Layer"] #Get it from somewhere else?
-    limitNrCluster = 20 #per Layer
-    limitNrNodes = 1000 #per Layer
+    limitNrCluster = -1 #per Layer; a value < 0 means no limit
+    limitNrNodes = -1 #per Layer

+    """
    layerDict = getClusterDataFromMongo(layerNameList,limitNrCluster,limitNrNodes)

    #URLlist = None
@@ -93,13 +94,15 @@ def main():
    #Output to DB
    outputMongoConnClustDict(layerDict,runId)
    outputMongoSimilarity(similarityDict,runId)
-
+    """

    #Currently not used in the calculation of connections/similarity, developed for possible future uses
-    connClustersFromMongo = getConnClusterDataFromMongo()    
-    similarityDictFromMongo = calculateSimilarity(connClustersFromMongo)
+    #connClustersFromMongo = getConnClusterDataFromMongo()    
+    #similarityDictFromMongo = calculateSimilarity(connClustersFromMongo)

-    similarityArrFromMongo = getSimilarityDataFromMongo()
+    #similarityArrFromMongo = getSimilarityDataFromMongo("Price_Layer") # only 220 similarities, but there are about 20 clusters total
+    #similarityArrFromMongo = getSimilarityDataFromMongo("Destination_Layer") # ~2.500k similarities    
+    similarityArrFromMongo = getSimilarityDataFromMongo("FinishedTime_Layer")# should have the rest of similarities => 15.000k
    connectedRunFromMongo = getConnectedRunDataFromMongo()
    

@@ -110,6 +113,6 @@ def main():
 def test():
    testInputData()
 ##########START##########
-#main()
-test()
+main()
+#test()
 #########FINISH##########