Create schema and import data - IMDb non-commercial datasets¶
imports could be faster with some changes
In [2]:
import mariadb
import json
from pathlib import Path
from time import perf_counter
from pprint import pprint
from itertools import chain, repeat
In [3]:
# connect database
conn = mariadb.connect(
user="root",
password="root",
host="localhost",
port=3306,
)
c = conn.cursor()
Create schema¶
In [4]:
#c.execute('DROP SCHEMA IF EXISTS imdb')
c.execute('CREATE SCHEMA IF NOT EXISTS imdb')
c.execute('USE imdb')
In [5]:
with open('create_schema.sql', encoding='utf-8') as create_f:
create_sql = [statemet.replace('InnoDB', 'Aria') for statemet in create_f.read().split(';')]
for sql in create_sql:
if len(sql.strip()): # empty query at the end
c.execute(sql)
Load and insert enumerations¶
In [6]:
with open('enums.json', encoding='utf-8') as enums_f:
enums = json.load(enums_f)
for key in enums: # transform to lookup table for ids: enums['genre']['Comedy'] => genre.id
enums[key] = {name: i for i, name in enumerate(enums[key], 1)}
# pprint(enums)
print(*((k, len(l)) for k, l in enums.items()))
('profession', 45) ('title_type', 11) ('genre', 28) ('t_aka_type', 8) ('region', 248) ('language', 107) ('category', 12) ('character', 2735521)
In [6]:
for table_name, e in enums.items():
if table_name != 'character': # is imported in tmp_imports6.py
for name, id_ in e.items():
c.execute(f'INSERT INTO `{table_name}` (id, name) VALUES (?, ?)', (id_, name))
helper functions¶
In [7]:
# (used in scripts)
def idstrip(idstr, prefix_len=2):
return int(idstr.strip()[prefix_len:])
def id_helper(parts, column, col_name, prefix_len):
return idstrip(parts[column[col_name]], prefix_len)
def value_helper(parts, column, col_name, cls):
cv = parts[column[col_name]].strip()
return cls(cv) if cv != r'\N' else None
def enum_helper(parts, column, col_name, enum_name):
v = value_helper(parts, column, col_name, str)
return enums[enum_name][v] if v is not None else None
# valhelp and idhelp uses the parts and column directly from current scope
valhelp = lambda col_name, cls=str: value_helper(parts, column, col_name, cls)
idhelp = lambda col_name, prefix_len=2: id_helper(parts, column, col_name, prefix_len)
enmhelp = lambda col_name, enum_name: enum_helper(parts, column, col_name, enum_name)
In [13]:
# all imported using scripts:
# tmp_imports0.py,
# tmp_imports1.py,
# tmp_imports2.py,
# tmp_imports3.py,
# tmp_imports4.py,
# tmp_imports5.py,
# tmp_imports6.py,
# tmp_imports7.py