In [2]:
# NOTE: using pandas proved not practical
from pathlib import Path
from time import perf_counter
from collections import Counter
from itertools import chain
import json
from pprint import pprint
In [2]:
profession = Counter();
t1 = perf_counter()
with open(Path('data') / 'name.basics.tsv', encoding='utf-8') as f:
column = {}
for i, p in enumerate(f.readline().strip().split('\t')):
column[p.strip()] = i
for line in f:
parts = line.strip().split('\t')
glist = parts[column['primaryProfession']].strip().split(',')
if len(glist) and glist[0] != '\\N':
profession.update(glist)
del profession[''] # ?trailing comma?
print(perf_counter() - t1, 's')
18.6012343679904 s
In [3]:
pprint(profession)
Counter({'actor': 3080545, 'actress': 1864307, 'miscellaneous': 1386144, 'producer': 1177175, 'writer': 876570, 'camera_department': 788093, 'director': 716912, 'art_department': 471987, 'sound_department': 386120, 'cinematographer': 382510, 'editor': 354594, 'composer': 330296, 'music_department': 267988, 'assistant_director': 258027, 'visual_effects': 238331, 'make_up_department': 225327, 'animation_department': 216384, 'production_manager': 208340, 'editorial_department': 188765, 'soundtrack': 171205, 'costume_department': 156641, 'transportation_department': 101682, 'art_director': 92793, 'stunts': 88323, 'script_department': 86774, 'location_management': 83672, 'production_designer': 80765, 'costume_designer': 75823, 'special_effects': 67313, 'casting_department': 56794, 'set_decorator': 55477, 'executive': 38339, 'casting_director': 31650, 'manager': 14349, 'talent_agent': 13743, 'legal': 4325, 'publicist': 4183, 'music_artist': 3649, 'assistant': 3593, 'podcaster': 190, 'production_department': 39, 'electrical_department': 7, 'music_supervisor': 4, 'choreographer': 2, 'intimacy_coordinator': 1})
In [4]:
title_types = Counter(); genres = Counter(); # title.basics
t1 = perf_counter()
with open(Path('data') / 'title.basics.tsv', encoding='utf-8') as f:
column = {}
for i, p in enumerate(f.readline().strip().split('\t')):
column[p.strip()] = i
for line in f:
parts = line.strip().split('\t')
title_type = parts[column['titleType']]
if title_type != '\\N':
title_types[title_type] += 1
glist = parts[column['genres']].strip().split(',')
if len(glist) and glist[0] != '\\N':
genres.update(glist)
print(perf_counter() - t1, 's')
21.897611982014496 s
In [5]:
pprint(title_types)
pprint(genres)
Counter({'tvEpisode': 8115351, 'short': 982948, 'movie': 674011, 'video': 289054, 'tvSeries': 258546, 'tvMovie': 144651, 'tvMiniSeries': 52999, 'tvSpecial': 46249, 'videoGame': 37687, 'tvShort': 10275, 'tvPilot': 1}) Counter({'Drama': 3017091, 'Comedy': 2099840, 'Talk-Show': 1303202, 'Short': 1157127, 'Documentary': 1011720, 'Romance': 988474, 'News': 947675, 'Family': 785825, 'Reality-TV': 598622, 'Animation': 534719, 'Crime': 443291, 'Action': 436663, 'Adventure': 416227, 'Music': 400045, 'Game-Show': 387113, 'Adult': 333307, 'Sport': 253523, 'Fantasy': 215414, 'Mystery': 215052, 'Horror': 190430, 'Thriller': 175689, 'History': 156752, 'Biography': 114750, 'Sci-Fi': 114416, 'Musical': 89482, 'War': 40951, 'Western': 30433, 'Film-Noir': 882})
In [6]:
taka_types = Counter(); taka_region = Counter(); taka_lang = Counter(); # title.akas
taka_attributes = Counter() # not enum, but we are curious about the possible values
t1 = perf_counter()
with open(Path('data') / 'title.akas.tsv', encoding='utf-8') as f:
column = {}
for i, p in enumerate(f.readline().strip().split('\t')):
column[p.strip()] = i
for line in f:
parts = line.strip().split('\t')
region = parts[column['region']]
if region != '\\N':
taka_region[region] += 1
lang = parts[column['language']]
if lang != '\\N':
taka_lang[lang] += 1
# there is a quirk in data, some include \u0002 instead of comma
phase1 = parts[column['types']].strip().split(',')
phase2 = [p.split('\x02') for p in phase1]
glist = list(chain.from_iterable(phase2))
if len(glist) and glist[0] != '\\N':
taka_types.update(glist)
glist = parts[column['attributes']].strip().split(',')
if len(glist) and glist[0] != '\\N':
taka_attributes.update(glist)
print(perf_counter() - t1, 's')
103.84395928200684 s
In [7]:
pprint(taka_types)
pprint(taka_region)
pprint(taka_lang)
pprint(taka_attributes)
Counter({'imdbDisplay': 3759642, 'original': 1849964, 'alternative': 123541, 'working': 56975, 'dvd': 22456, 'video': 21711, 'festival': 20579, 'tv': 19114}) Counter({'DE': 4684314, 'JP': 4682243, 'FR': 4674751, 'IN': 4619888, 'ES': 4591991, 'IT': 4571384, 'PT': 4491127, 'US': 1511997, 'GB': 476165, 'CA': 261353, 'XWW': 184904, 'AU': 161742, 'BR': 125841, 'MX': 105547, 'RU': 105119, 'GR': 95004, 'PL': 93841, 'FI': 89115, 'SE': 80852, 'HU': 78171, 'NL': 65062, 'PH': 62498, 'AR': 62390, 'NO': 60236, 'DK': 57630, 'TR': 55223, 'XWG': 52547, 'CN': 43547, 'SUHH': 39593, 'TW': 37274, 'EC': 36940, 'BE': 36572, 'HK': 36395, 'KR': 34367, 'SG': 33180, 'ZA': 33051, 'AT': 31050, 'UA': 30266, 'RO': 29666, 'BG': 28226, 'CZ': 24637, 'RS': 23508, 'IL': 20654, 'ID': 20200, 'IE': 17880, 'IR': 16907, 'AE': 16559, 'XYU': 16470, 'HR': 15570, 'EG': 15269, 'CL': 14965, 'CH': 14372, 'NZ': 14291, 'VN': 13423, 'VE': 13324, 'TH': 12824, 'LT': 11836, 'SK': 11334, 'CSHH': 10344, 'DDDE': 9968, 'SI': 9763, 'CO': 9648, 'NG': 9637, 'PE': 8716, 'EE': 7876, 'UY': 7707, 'XEU': 6777, 'PK': 5627, 'LV': 5380, 'MY': 4952, 'BD': 4563, 'DO': 2988, 'IS': 2807, 'CU': 2509, 'AL': 2286, 'BA': 2275, 'XAS': 2128, 'UZ': 1951, 'LB': 1890, 'PR': 1850, 'LK': 1840, 'KZ': 1621, 'TN': 1557, 'GE': 1552, 'NP': 1457, 'MA': 1315, 'LU': 1230, 'CY': 1189, 'AZ': 1166, 'SA': 1081, 'BY': 1063, 'DZ': 1060, 'CM': 1040, 'MK': 997, 'KE': 978, 'GH': 906, 'AM': 902, 'CR': 889, 'PA': 863, 'KP': 849, 'JM': 810, 'HT': 799, 'YUCS': 787, 'BO': 758, 'UG': 709, 'LI': 674, 'IQ': 645, 'SY': 616, 'QA': 578, 'XKO': 534, 'MT': 511, 'BF': 498, 'PY': 449, 'KW': 437, 'MN': 436, 'GT': 422, 'TZ': 407, 'JO': 395, 'KH': 389, 'SN': 388, 'MD': 357, 'TT': 348, 'XKV': 306, 'MM': 290, 'ME': 282, 'PS': 272, 'AF': 269, 'CI': 265, 'CG': 249, 'XSA': 238, 'SV': 234, 'KG': 217, 'SM': 209, 'CD': 209, 'BJ': 203, 'TG': 199, 'MZ': 197, 'ZW': 197, 'MC': 187, 'BH': 183, 'HN': 169, 'ET': 168, 'BS': 150, 'ZM': 150, 'GI': 142, 'CSXX': 138, 'AO': 136, 'MO': 133, 'NI': 126, 'GA': 121, 'SL': 120, 'NA': 120, 'BW': 113, 'RW': 112, 'VI': 108, 'GP': 105, 'GL': 103, 'NE': 100, 'MG': 100, 'MU': 97, 'MV': 95, 'YE': 91, 'PF': 91, 'TJ': 89, 'LA': 88, 'ML': 85, 'SD': 82, 'FO': 80, 'TM': 77, 'LY': 69, 'BT': 69, 'GU': 65, 'BB': 62, 'GM': 61, 'AD': 57, 'LR': 56, 'MW': 55, 'GN': 54, 'BI': 54, 'VDVN': 52, 'MQ': 49, 'OM': 49, 'KY': 48, 'BUMM': 46, 'XPI': 46, 'IM': 43, 'AN': 43, 'AW': 40, 'FJ': 39, 'BM': 38, 'SC': 37, 'SO': 35, 'BZ': 34, 'SZ': 32, 'GY': 28, 'SR': 28, 'RE': 28, 'TD': 27, 'PG': 27, 'BN': 27, 'CV': 26, 'CF': 25, 'AG': 23, 'GF': 22, 'NC': 21, 'GW': 20, 'TL': 20, 'DM': 19, 'TO': 18, 'MR': 18, 'LS': 15, 'ER': 15, 'LC': 14, 'VG': 13, 'GQ': 12, 'VA': 11, 'KM': 11, 'AQ': 11, 'MH': 10, 'ST': 8, 'SB': 8, 'VC': 8, 'XSI': 7, 'ZRCD': 7, 'VU': 7, 'EH': 6, 'GD': 6, 'AS': 6, 'KN': 6, 'AI': 5, 'WS': 5, 'XAU': 4, 'MP': 4, 'CW': 4, 'KI': 4, 'CK': 4, 'DJ': 3, 'MS': 3, 'TC': 3, 'FM': 2, 'XNA': 2, 'SH': 2, 'CC': 2, 'JE': 2, 'NU': 1, 'TV': 1, 'PW': 1, 'NR': 1}) Counter({'ja': 4540461, 'fr': 4497427, 'hi': 4464630, 'es': 4430180, 'de': 4429755, 'it': 4428000, 'pt': 4427843, 'en': 519582, 'tr': 37189, 'cmn': 37002, 'ru': 36753, 'bg': 26691, 'sv': 10337, 'yue': 8478, 'he': 8292, 'qbn': 7250, 'sr': 6141, 'ca': 5469, 'fa': 4082, 'hr': 1706, 'nl': 1675, 'ar': 1383, 'cs': 1314, 'tl': 1132, 'id': 935, 'ta': 927, 'ml': 904, 'sl': 839, 'te': 716, 'uk': 622, 'qbp': 561, 'sk': 544, 'th': 504, 'ms': 491, 'bn': 470, 'bs': 439, 'ur': 336, 'kn': 208, 'mr': 202, 'gl': 183, 'af': 164, 'la': 161, 'ga': 159, 'eu': 129, 'cy': 106, 'gu': 89, 'gsw': 86, 'az': 83, 'lv': 82, 'kk': 78, 'mk': 76, 'lt': 73, 'yi': 70, 'pa': 67, 'ka': 63, 'mi': 57, 'gd': 55, 'qal': 47, 'et': 42, 'el': 39, 'uz': 36, 'hy': 32, 'ro': 32, 'lb': 27, 'hu': 26, 'be': 24, 'zu': 20, 'fi': 17, 'qbo': 17, 'tg': 14, 'da': 13, 'ko': 13, 'zh': 12, 'xh': 11, 'pl': 10, 'tk': 10, 'ps': 10, 'sd': 8, 'no': 7, 'st': 6, 'ky': 5, 'wo': 5, 'vi': 4, 'tn': 4, 'mn': 4, 'ku': 3, 'is': 2, 'rn': 1, 'eka': 1, 'sq': 1, 'roa': 1, 'rm': 1, 'su': 1, 'jv': 1, 'prs': 1, 'jsl': 1, 'fro': 1, 'haw': 1, 'lo': 1, 'my': 1, 'am': 1, 'qac': 1, 'ne': 1, 'myv': 1, 'br': 1, 'iu': 1, 'cr': 1}) Counter({'transliterated title': 28655, 'alternative spelling': 21057, 'new title': 18577, 'literal English title': 16525, 'complete title': 15810, 'literal title': 13631, 'video box title': 11674, 'reissue title': 11341, 'short title': 10500, 'series title': 9835, 'alternative transliteration': 9754, 'DVD box title': 8224, 'informal English title': 6695, 'informal title': 6500, 'dubbed version': 6344, 'original subtitled version': 5913, 'informal literal title': 5274, 'subtitle': 5099, 'poster title': 4927, 'video catalogue title': 4672, 'long title': 4479, 'informal alternative title': 4193, 'cable TV title': 3627, 'original script title': 3307, 'promotional title': 2967, 'informal short title': 2751, 'informal literal English title': 2520, 'segment title': 2483, 'theatrical title': 2439, 'second season title': 2281, 'review title': 1956, 'DVD menu title': 1547, 'rerun title': 1039, 'pre-release title': 1026, 'first season title': 1013, 'third season title': 933, 'copyright title': 929, 'premiere title': 820, 'TV listings title': 715, 'bootleg title': 619, 'anthology series': 602, 'literal translation of working title': 593, 'second part title': 577, 'expansion title': 575, 'closing credits title': 542, '3-D version': 535, 'fourth season title': 524, 'first part title': 501, 'recut version': 458, 'Berlin film festival title': 442, 'bowdlerized title': 386, 'censored version': 376, 'promotional abbreviation': 372, 'première title': 342, 'cut version': 318, 'fifth season title': 302, 'fake working title': 295, 'short version': 268, 'trailer title': 267, 'orthographically correct title': 258, 'longer version': 216, 'game box title': 199, '16mm release title': 196, 'syndication title': 196, '8mm release title': 187, 'Cannes festival title': 172, 'sixth season title': 165, "director's cut": 164, 'restored version': 161, 'third part title': 155, 'IMAX version': 133, 'seventh season title': 132, 'video CD title': 130, 'soft porn version': 115, 'X-rated version': 112, 'eighth season title': 104, 'long new title': 100, 'unauthorized video title': 85, 'ninth season title': 78, 'second copyright title': 77, 'first episode title': 75, 'teaser title': 73, 'tenth season title': 68, 'last season title': 65, 'Pay-TV title': 65, 'uncensored intended title': 62, 'YIVO translation': 58, 'first episodes title': 56, 'summer title': 52, 'Venice film festival title': 50, 'eleventh season title': 50, 'MIFED title': 47, 'weekend title': 43, 'racier version': 40, 'videogame episode': 35, 'silent version': 29, 'new syndication title': 27, 'twelfth season title': 27, 'Los Angeles premiere title': 24, 'fourteenth season title': 24, 'thirteenth season title': 21, 'sixteenth season title': 18, 'fifteenth season title': 16, 'eighteenth season title': 15, 'nineteenth season title': 15, 'correct transliteration': 14, 'twentieth season title': 14, 'twentyfirst season title': 14, 'seventeenth season title': 14, 'LD title': 12, 'english transliteration': 11, 'twentysecond season title': 11, 'twentyfifth season title': 10, 'non-modified Hepburn romanization': 10, 'twentythird season title': 9, 'thirtythird season title': 8, 'twentyfourth season title': 8, 'PC version': 8, 'thirtieth season title': 7, 'twentysixth season title': 7, 'daytime version title': 6, 'English translation of working title': 6, 'thirtysecond season title': 6, 'twentyseventh season title': 6, 'recut version\x02reissue title': 5, 'modern translation': 5, 'thirtyfirst season title': 5, 'rumored': 5, 'cut version\x02reissue title': 4, 'redubbed comic version': 4, '3-D video title': 4, 'fourth part title': 4, 'thirtysixth season title': 4, 'original pilot title': 4, 'incorrect title': 4, 'Los Angeles première title': 3, 'Yiddish dubbed\x02reissue title': 3, 'armed forces circuit title': 3, 'third and fourth season title': 3, 'informal title\x02literal title': 3, 'title for episodes with guest hosts': 3, 'thirtyfourth season title': 3, 'American Mutoscope & Biograph catalog title': 3, 'thirtyninth season title': 3, 'thirtyeighth season title': 3, 'thirtyseventh season title': 3, 'reissue title\x02short version': 2, 'cut version\x02video box title': 2, 'IMAX version\x02promotional title': 2, 'twentyninth season title': 2, 'thirtyfifth season title': 2, 'twentyeighth season title': 2, 'R-rated version': 2, 'GameCube version': 2, 'first segment title': 2, 'second segment title': 2, '8mm release title\x02short version': 1, 'added framing sequences and narration in Yiddish\x02reissue title': 1, 'Bable dialect title': 1, 'Yiddish dubbed': 1, 'POLart\x02video box title': 1, 'POLart': 1, 'YIVO translation\x02reissue title': 1, 'first two episodes title': 1, 'first three episodes title': 1, 'literal French title': 1, '16mm rental title': 1, 'racier version\x02reissue title': 1, '8mm release title\x02second part title': 1, 'cable TV title\x02cut version': 1, 'closing credits title\x02pre-release title': 1, 'Locarno film festival title': 1, 'longer version\x02rerun title': 1, 'approximation of original mirrored title': 1, 'X-rated version\x02bootleg title': 1, 'poster title\x02video box title': 1, 'dubbed version\x02recut version': 1, 'fortieth season title': 1, 'Hakka dialect title': 1, 'Bilbao festival title': 1, 'promotional title\x02thirteenth season title': 1, 'orthographically correct title\x02video box title': 1, 'late Sunday edition': 1, 'fourth season title\x02recut version': 1, 'third segment title': 1, 'fifth part title': 1})
In [3]:
tp_categories = Counter(); # title.principals
chars = Counter()
t1 = perf_counter()
job_max_len = 0
with open(Path('data') / 'title.principals.tsv', encoding='utf-8') as f:
column = {}
for i, p in enumerate(f.readline().strip().split('\t')):
column[p.strip()] = i
for li, line in enumerate(f):
parts = line.strip().split('\t', len(column))
category = parts[column['category']]
job_max_len = max(job_max_len, len(parts[column['job']]))
if category != '\\N':
tp_categories[category] += 1
if parts[column['characters']] != r'\N':
chars.update(json.loads(parts[column['characters']]))
print(perf_counter() - t1, 's')
130.6693641100428 s
In [4]:
print('job_max_len:', job_max_len)
pprint(tp_categories)
job_max_len: 286 Counter({'actor': 13441424, 'self': 10560091, 'actress': 10489927, 'writer': 8494665, 'director': 7005873, 'producer': 3943935, 'cinematographer': 2067729, 'composer': 2013765, 'editor': 2012579, 'archive_footage': 404370, 'production_designer': 383677, 'archive_sound': 4793})
In [10]:
print(len(chars), chars.total())
pprint(chars.most_common(100))
2735521 30382085 [('Self', 4546843), ('Self - Host', 1878474), ('Self - Presenter', 327377), ('Self - Guest', 325307), ('Self - Panelist', 250371), ('Self - Co-Host', 240752), ('Self - Contestant', 237169), ('Narrator', 170928), ('Themselves', 123295), ('Self - Judge', 116250), ('Self - Announcer', 111500), ('Self - Reporter', 95146), ('Self - Hostess', 94778), ('Self - Correspondent', 79159), ('Various', 74727), ('Self - Newsreader', 65513), ('Self - Narrator', 58859), ('Host', 53434), ('Self - Housemate', 46139), ('Self - Co-Hostess', 45536), ('Presenter', 44458), ('Self - Co-Anchor', 42368), ('Self - Anchor', 39588), ('Self - Model', 37613), ('Presented by', 36132), ('Additional Voices', 35325), ('Various Characters', 34330), ('Self - Interviewee', 32986), ('Self - Analyst', 29627), ('Self - Commentator', 28864), ('Self - Musical Guest', 22758), ('Self - Panellist', 22155), ('Self - Performer', 21038), ('Self - Celebrity Contestant', 18933), ('Self - Team Captain', 18551), ('Self - Programledare', 18360), ('Self - Singer', 17714), ('Alex', 17126), ('Various Roles', 16028), ('Self - Musician', 15965), ('Announcer', 15932), ('Mother', 15794), ('Self - Participant', 15469), ('Laura', 15339), ('Sam', 15078), ('Self - Guest Co-Host', 14795), ('David', 14581), ('James', 13762), ('Self - Coach', 13617), ('John', 13598), ('Self - Weather Forecaster', 13516), ('Self - Guest Host', 13493), ('Sarah', 13359), ('Dancer', 13175), ('Self - Meteorologist', 13100), ('Self - Comedian', 13062), ('Man', 12965), ('Anna', 12355), ('Tom', 12297), ('Jack', 11803), ('Father', 11618), ('Maria', 11568), ('Lisa', 11378), ('Mike', 11339), ('Doctor', 11204), ('Daniel', 11199), ('Self - Bandleader', 11150), ('Self - Emcee', 10860), ('Woman', 10786), ('Dad', 10725), ('Self - Jury', 10675), ('Self - Sports Newsreader', 10662), ('Max', 10575), ('Self - Guest Panelist', 10518), ('Michael', 10484), ('Alice', 10370), ('Guest', 10270), ('Ben', 10210), ('Singer', 10086), ('Paul', 10021), ('Mom', 9942), ('Sara', 9932), ('Girl', 9749), ('Mark', 9606), ('Self - Musical Director', 9553), ('Self - Interviewer', 9366), ('Self - Contributor', 9309), ('Self - Color Commentator', 9137), ('Chris', 9075), ('Charlie', 9070), ('Self - Chef', 8970), ('Self - Dancer', 8910), ('Self - Jury Member', 8861), ('Self - Lexicographer', 8808), ('Self - Play-by-Play Announcer', 8778), ('Self - News Anchor', 8735), ('Susan', 8698), ('Emma', 8596), ('Thomas', 8465), ('Themselves - Musical Guest', 8460)]
In [12]:
with open('enums.json', 'w', encoding='utf-8') as f:
json.dump({'profession': list(sorted(profession.keys())),
'title_type': list(sorted(title_types.keys())),
'genre': list(sorted(genres.keys())),
't_aka_type': list(sorted(taka_types.keys())),
'region': list(sorted(taka_region.keys())),
'language': list(sorted(taka_lang.keys())),
'category': list(sorted(tp_categories.keys())),
'character': list(sorted(chars.keys()))
},
f,
indent=2)
In [ ]: