Commit 688215a0 authored by Alexander Lercher

Upload of data set to semantic linking

parent b1b2337a
# this file contains all the training data but is around 1.8 GB.
train.csv
\ No newline at end of file
import csv
import hashlib
import sys
import os
# Extend the import search path: every candidate directory that exists
# (relative to the current working directory) is inserted into sys.path at
# index 1. Presumably this lets the messaging/db imports that follow resolve
# both when run from the service root and from this sub-folder - TODO confirm.
modules_paths = ['.', '../../../modules/']
for modules_path in modules_paths:
    if os.path.exists(modules_path):
        sys.path.insert(1, modules_path)
from messaging.MessageHandler import MessageHandler
from db.repository import Repository
# file to read the data from
CSV_FILE = r'dummy_upload/community-prediction-taxi/train.csv'
# handler that receives every flattened trace via handle_new_trace
# (see upload_transaction); it is created once at import time
handler = MessageHandler(Repository())
import csv
import json
from datetime import datetime
from typing import Iterator
# numeric codes for the single-letter categories in call_type and day_type
enum_mapping = {'A': 1, 'B': 2, 'C': 3}


def load_csv_content(csv_path=None) -> Iterator:
    '''Returns a generator for all lines in the csv file with correct field types.

    The first line of the file is treated as the header; its lower-cased
    names become the keys of every yielded dict. Fields are converted by
    column position:

    - 0 (trip_id): kept as string
    - 1, 6 (call_type, day_type): looked up in enum_mapping
    - 2, 3, 4 (origin_call, origin_stand, taxi_id): int, or "" if empty
    - 5 (timestamp): int (kept numeric because datetime is not serializable)
    - 7 (missing_data): bool, True only for a case-insensitive 'true'
    - 8 (polyline): parsed from JSON

    Blank lines are skipped and an empty file yields nothing.

    :param csv_path: file to read; defaults to the module-level CSV_FILE
    :returns: generator yielding one dict per non-empty data row
    :raises KeyError: if a call_type/day_type value is not in enum_mapping
    :raises ValueError: if a numeric or JSON field cannot be parsed
    '''
    # resolve the default lazily so CSV_FILE is only needed when no path is given
    path = CSV_FILE if csv_path is None else csv_path

    def to_optional_int(value):
        # empty fields stay empty strings instead of failing int()
        return int(value) if value != "" else ""

    def to_enum(value):
        return enum_mapping[value]

    def to_bool(value):
        return value.lower() == 'true'

    # column index -> converter; index 0 (trip_id) intentionally has none
    converters = {
        1: to_enum,
        2: to_optional_int,
        3: to_optional_int,
        4: to_optional_int,
        5: int,
        6: to_enum,
        7: to_bool,
        8: json.loads,
    }

    # newline='' leaves line-ending handling to the csv module
    with open(path, newline='') as csv_file:
        reader = csv.reader(csv_file)

        header_row = next(reader, None)
        if header_row is None:
            # empty file: a bare next() would surface as RuntimeError here
            return
        headers = [h.lower() for h in header_row]

        # only convert columns that are actually named in the header
        active = [(i, convert) for i, convert in converters.items()
                  if i < len(headers)]

        for line in reader:
            if not line:
                # csv.reader yields [] for blank lines; nothing to convert
                continue

            # convert line fields to correct type
            for i, convert in active:
                line[i] = convert(line[i])

            yield dict(zip(headers, line))
def upload_transaction(transaction):
    '''Flattens a single entry and submits it as a new trace.

    'trip_id' is forwarded as 'UniqueID', the polyline is reduced to its
    first and last point, and the result is passed to
    handler.handle_new_trace. Entries with an empty polyline are printed
    and skipped.

    The passed dict is left untouched, so the same entry can safely be
    uploaded again (e.g. on retry).

    :param transaction: dict with at least 'trip_id' and 'polyline'; every
        other key is forwarded unchanged as a property
    :returns: None
    :raises KeyError: if 'trip_id' or 'polyline' is missing
    '''
    uid = transaction['trip_id']
    polyline = transaction['polyline']

    if not polyline:
        print(f"skipping transaction: {transaction}")
        return

    # manually flatten based on table mapping; work on a copy instead of
    # adding/deleting keys on the caller's dict
    properties = {
        key: value for key, value in transaction.items()
        if key not in ('trip_id', 'polyline')
    }
    properties['UniqueID'] = uid

    # NOTE(review): index 0 is stored as *_lat and index 1 as *_long. If the
    # CSV lists points as [longitude, latitude] these names are swapped -
    # TODO confirm against the data set description before changing.
    start, end = polyline[0], polyline[-1]
    properties['start_location_lat'] = start[0]
    properties['start_location_long'] = start[1]
    properties['end_location_lat'] = end[0]
    properties['end_location_long'] = end[1]

    t = {
        'use_case': 'community-prediction-taxi',
        'table': 'community-prediction-taxi',
        'id': uid,
        'properties': properties,
    }
    handler.handle_new_trace(t)
if __name__ == '__main__':
    # presumably the number of rows in train.csv; only used to scale the
    # progress output - TODO confirm
    total_rows = 1710671

    entries = load_csv_content()
    for idx, transaction in enumerate(entries):
        upload_transaction(transaction)

        # report every 1000th entry; scale the fraction to an actual percentage
        if idx % 1000 == 0:
            print(f"Progress: {100 * idx / total_rows:.2f} %")
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment