[SemanticLinking] Added youtube dummy upload

b0499784 · Alexander Lercher · 271733ea · b0499784 · b0499784 · b0499784
Commit b0499784 authored May 05, 2021 by Alexander Lercher
3 changed files
--- a/src/data-hub/semantic-linking-microservice/app/dummy_upload/community-prediction-youtube/.gitignore
+++ b/src/data-hub/semantic-linking-microservice/app/dummy_upload/community-prediction-youtube/.gitignore
+# videos.csv contains all the training data but is large, so it is not tracked by git.
+videos.csv
\ No newline at end of file
--- a/src/data-hub/semantic-linking-microservice/app/dummy_upload/community-prediction-youtube/dummy_upload.py
+++ b/src/data-hub/semantic-linking-microservice/app/dummy_upload/community-prediction-youtube/dummy_upload.py
+import csv
+import hashlib
+
+import sys
+import os
+modules_paths = ['.', '../../../modules/']
+for modules_path in modules_paths:
+    if os.path.exists(modules_path):
+        sys.path.insert(1, modules_path)
+
+from messaging.MessageHandler import MessageHandler
+from db.repository import Repository
+
+# file to read the data from
+CSV_FILE = r'dummy_upload/community-prediction-youtube/videos.csv'
+handler = MessageHandler(Repository())
+
+
+import csv
+import json
+from datetime import datetime
+from typing import Iterator
+import pandas as pd
+from pandas import DataFrame
+
+
+def load_csv_content() -> Iterator:
+    '''Returns a generator for all lines in the csv file with correct field types.'''
+    dfs: DataFrame = pd.read_csv(CSV_FILE)
+    return dfs.iterrows()
+
+def upload_transaction(transaction):
+    # manually flatten based on table mapping
+    uid = transaction['video_id']
+
+    transaction['UniqueID'] = uid
+    transaction['trend_delay'] = transaction['trend_duration']
+    transaction['timestamp'] = transaction['trending_timestamp']
+
+    del transaction['trend_duration']
+    del transaction['trending_timestamp']
+
+    t = {        
+        'use_case':  'community-prediction-youtube-n',
+        'table':  'community-prediction-youtube-n',
+        'id': uid,
+        'properties': transaction,
+    }
+    handler.handle_new_trace(t)
+
+
+if __name__ == '__main__':    
+    entries = load_csv_content()
+
+    for idx, transaction in entries:
+        transaction = transaction.to_dict()
+        upload_transaction(transaction)
+
+        if idx % 1000 == 0:
+            print(f"Progress: {str(float(idx) / 375942)} %")
\ No newline at end of file
--- a/tools/check-use-case-data/check_uc.py
+++ b/tools/check-use-case-data/check_uc.py
+import requests
+requests.packages.urllib3.disable_warnings() 
+from icecream import ic
+
+uc = 'community-prediction-youtube'
+
+def httpget(url):
+    token = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1c2VybmFtZSI6InJlZ3VsYXJAaXRlYy5hYXUuYXQiLCJjcmVhdGVkX2F0IjoiMjAyMS0wNS0wNSAxMTozNjozOC4yMzAxODEiLCJ2YWxpZF91bnRpbCI6IjIwMjEtMDUtMDYgMTE6MzY6MzguMjMwMTgxIn0.Fz6iPpA0CnrXlOCj-VuCHFzc58H9Of2cBYHOb_RqvzI'
+    res = requests.get(url, 
+                    verify=False, 
+                    headers = { "Authorization": f"Bearer {token}"})
+    return res
+
+# list tables
+res = httpget(url = f'https://articonf1.itec.aau.at:30420/api/use-cases/{uc}/tables')
+print("Tables: ", [entry['name'] for entry in res.json()])
+
+# count pushed data
+def count_data(json_res, table_identifier='table'):
+    tables = {}
+    for entry in json_res:
+        key = entry[table_identifier]
+        if key not in tables:
+            tables[key] = 0
+        tables[key] += 1
+    ic(tables)
+
+res = httpget(url = f'https://articonf1.itec.aau.at:30001/api/use_cases/{uc}/transactions')
+count_data(res.json())
+
+res_f = httpget(url = f'https://articonf1.itec.aau.at:30001/api/use_cases/{uc}/transactions-failed')
+count_data(res_f.json(), 'docType')
+
+res_d = httpget(url = f'https://articonf1.itec.aau.at:30001/api/use_cases/{uc}/transactions-duplicated')
+count_data(res_d.json())
\ No newline at end of file