# NOTE: using pandas proved not practical
from pathlib import Path
from time import perf_counter
from collections import Counter
from itertools import chain
import json
from pprint import pprint

profession = Counter();
t1 = perf_counter()
with open(Path('data') / 'name.basics.tsv', encoding='utf-8') as f:
    column = {}
    for i, p in enumerate(f.readline().strip().split('\t')):
        column[p.strip()] = i
    for line in f:
        parts = line.strip().split('\t')
        glist = parts[column['primaryProfession']].strip().split(',')
        if len(glist) and glist[0] != '\\N':
            profession.update(glist)
del profession['']  # ?trailing comma?
print(perf_counter() - t1, 's')

18.6012343679904 s

pprint(profession)

Counter({'actor': 3080545,
         'actress': 1864307,
         'miscellaneous': 1386144,
         'producer': 1177175,
         'writer': 876570,
         'camera_department': 788093,
         'director': 716912,
         'art_department': 471987,
         'sound_department': 386120,
         'cinematographer': 382510,
         'editor': 354594,
         'composer': 330296,
         'music_department': 267988,
         'assistant_director': 258027,
         'visual_effects': 238331,
         'make_up_department': 225327,
         'animation_department': 216384,
         'production_manager': 208340,
         'editorial_department': 188765,
         'soundtrack': 171205,
         'costume_department': 156641,
         'transportation_department': 101682,
         'art_director': 92793,
         'stunts': 88323,
         'script_department': 86774,
         'location_management': 83672,
         'production_designer': 80765,
         'costume_designer': 75823,
         'special_effects': 67313,
         'casting_department': 56794,
         'set_decorator': 55477,
         'executive': 38339,
         'casting_director': 31650,
         'manager': 14349,
         'talent_agent': 13743,
         'legal': 4325,
         'publicist': 4183,
         'music_artist': 3649,
         'assistant': 3593,
         'podcaster': 190,
         'production_department': 39,
         'electrical_department': 7,
         'music_supervisor': 4,
         'choreographer': 2,
         'intimacy_coordinator': 1})

title_types = Counter();  genres = Counter();  #  title.basics
t1 = perf_counter()
with open(Path('data') / 'title.basics.tsv', encoding='utf-8') as f:
    column = {}
    for i, p in enumerate(f.readline().strip().split('\t')):
        column[p.strip()] = i
    for line in f:
        parts = line.strip().split('\t')
        title_type = parts[column['titleType']]
        if title_type != '\\N':
            title_types[title_type] += 1
        glist = parts[column['genres']].strip().split(',')
        if len(glist) and glist[0] != '\\N':
            genres.update(glist)
print(perf_counter() - t1, 's')

21.897611982014496 s

pprint(title_types)
pprint(genres)

Counter({'tvEpisode': 8115351,
         'short': 982948,
         'movie': 674011,
         'video': 289054,
         'tvSeries': 258546,
         'tvMovie': 144651,
         'tvMiniSeries': 52999,
         'tvSpecial': 46249,
         'videoGame': 37687,
         'tvShort': 10275,
         'tvPilot': 1})
Counter({'Drama': 3017091,
         'Comedy': 2099840,
         'Talk-Show': 1303202,
         'Short': 1157127,
         'Documentary': 1011720,
         'Romance': 988474,
         'News': 947675,
         'Family': 785825,
         'Reality-TV': 598622,
         'Animation': 534719,
         'Crime': 443291,
         'Action': 436663,
         'Adventure': 416227,
         'Music': 400045,
         'Game-Show': 387113,
         'Adult': 333307,
         'Sport': 253523,
         'Fantasy': 215414,
         'Mystery': 215052,
         'Horror': 190430,
         'Thriller': 175689,
         'History': 156752,
         'Biography': 114750,
         'Sci-Fi': 114416,
         'Musical': 89482,
         'War': 40951,
         'Western': 30433,
         'Film-Noir': 882})

taka_types = Counter(); taka_region = Counter(); taka_lang = Counter(); # title.akas
taka_attributes = Counter() # not enum, but we are curious about the possible values
t1 = perf_counter()
with open(Path('data') / 'title.akas.tsv', encoding='utf-8') as f:
    column = {}
    for i, p in enumerate(f.readline().strip().split('\t')):
        column[p.strip()] = i
    for line in f:
        parts = line.strip().split('\t')
        region = parts[column['region']]
        if region != '\\N':
            taka_region[region] += 1
        lang = parts[column['language']]
        if lang != '\\N':
            taka_lang[lang] += 1
        # there is a quirk in data, some include \u0002 instead of comma
        phase1 = parts[column['types']].strip().split(',')
        phase2 = [p.split('\x02') for p in phase1]
        glist = list(chain.from_iterable(phase2))
        if len(glist) and glist[0] != '\\N':
            taka_types.update(glist)
        glist = parts[column['attributes']].strip().split(',')
        if len(glist) and glist[0] != '\\N':
            taka_attributes.update(glist)            
print(perf_counter() - t1, 's')

103.84395928200684 s

pprint(taka_types)
pprint(taka_region)
pprint(taka_lang)
pprint(taka_attributes)

Counter({'imdbDisplay': 3759642,
         'original': 1849964,
         'alternative': 123541,
         'working': 56975,
         'dvd': 22456,
         'video': 21711,
         'festival': 20579,
         'tv': 19114})
Counter({'DE': 4684314,
         'JP': 4682243,
         'FR': 4674751,
         'IN': 4619888,
         'ES': 4591991,
         'IT': 4571384,
         'PT': 4491127,
         'US': 1511997,
         'GB': 476165,
         'CA': 261353,
         'XWW': 184904,
         'AU': 161742,
         'BR': 125841,
         'MX': 105547,
         'RU': 105119,
         'GR': 95004,
         'PL': 93841,
         'FI': 89115,
         'SE': 80852,
         'HU': 78171,
         'NL': 65062,
         'PH': 62498,
         'AR': 62390,
         'NO': 60236,
         'DK': 57630,
         'TR': 55223,
         'XWG': 52547,
         'CN': 43547,
         'SUHH': 39593,
         'TW': 37274,
         'EC': 36940,
         'BE': 36572,
         'HK': 36395,
         'KR': 34367,
         'SG': 33180,
         'ZA': 33051,
         'AT': 31050,
         'UA': 30266,
         'RO': 29666,
         'BG': 28226,
         'CZ': 24637,
         'RS': 23508,
         'IL': 20654,
         'ID': 20200,
         'IE': 17880,
         'IR': 16907,
         'AE': 16559,
         'XYU': 16470,
         'HR': 15570,
         'EG': 15269,
         'CL': 14965,
         'CH': 14372,
         'NZ': 14291,
         'VN': 13423,
         'VE': 13324,
         'TH': 12824,
         'LT': 11836,
         'SK': 11334,
         'CSHH': 10344,
         'DDDE': 9968,
         'SI': 9763,
         'CO': 9648,
         'NG': 9637,
         'PE': 8716,
         'EE': 7876,
         'UY': 7707,
         'XEU': 6777,
         'PK': 5627,
         'LV': 5380,
         'MY': 4952,
         'BD': 4563,
         'DO': 2988,
         'IS': 2807,
         'CU': 2509,
         'AL': 2286,
         'BA': 2275,
         'XAS': 2128,
         'UZ': 1951,
         'LB': 1890,
         'PR': 1850,
         'LK': 1840,
         'KZ': 1621,
         'TN': 1557,
         'GE': 1552,
         'NP': 1457,
         'MA': 1315,
         'LU': 1230,
         'CY': 1189,
         'AZ': 1166,
         'SA': 1081,
         'BY': 1063,
         'DZ': 1060,
         'CM': 1040,
         'MK': 997,
         'KE': 978,
         'GH': 906,
         'AM': 902,
         'CR': 889,
         'PA': 863,
         'KP': 849,
         'JM': 810,
         'HT': 799,
         'YUCS': 787,
         'BO': 758,
         'UG': 709,
         'LI': 674,
         'IQ': 645,
         'SY': 616,
         'QA': 578,
         'XKO': 534,
         'MT': 511,
         'BF': 498,
         'PY': 449,
         'KW': 437,
         'MN': 436,
         'GT': 422,
         'TZ': 407,
         'JO': 395,
         'KH': 389,
         'SN': 388,
         'MD': 357,
         'TT': 348,
         'XKV': 306,
         'MM': 290,
         'ME': 282,
         'PS': 272,
         'AF': 269,
         'CI': 265,
         'CG': 249,
         'XSA': 238,
         'SV': 234,
         'KG': 217,
         'SM': 209,
         'CD': 209,
         'BJ': 203,
         'TG': 199,
         'MZ': 197,
         'ZW': 197,
         'MC': 187,
         'BH': 183,
         'HN': 169,
         'ET': 168,
         'BS': 150,
         'ZM': 150,
         'GI': 142,
         'CSXX': 138,
         'AO': 136,
         'MO': 133,
         'NI': 126,
         'GA': 121,
         'SL': 120,
         'NA': 120,
         'BW': 113,
         'RW': 112,
         'VI': 108,
         'GP': 105,
         'GL': 103,
         'NE': 100,
         'MG': 100,
         'MU': 97,
         'MV': 95,
         'YE': 91,
         'PF': 91,
         'TJ': 89,
         'LA': 88,
         'ML': 85,
         'SD': 82,
         'FO': 80,
         'TM': 77,
         'LY': 69,
         'BT': 69,
         'GU': 65,
         'BB': 62,
         'GM': 61,
         'AD': 57,
         'LR': 56,
         'MW': 55,
         'GN': 54,
         'BI': 54,
         'VDVN': 52,
         'MQ': 49,
         'OM': 49,
         'KY': 48,
         'BUMM': 46,
         'XPI': 46,
         'IM': 43,
         'AN': 43,
         'AW': 40,
         'FJ': 39,
         'BM': 38,
         'SC': 37,
         'SO': 35,
         'BZ': 34,
         'SZ': 32,
         'GY': 28,
         'SR': 28,
         'RE': 28,
         'TD': 27,
         'PG': 27,
         'BN': 27,
         'CV': 26,
         'CF': 25,
         'AG': 23,
         'GF': 22,
         'NC': 21,
         'GW': 20,
         'TL': 20,
         'DM': 19,
         'TO': 18,
         'MR': 18,
         'LS': 15,
         'ER': 15,
         'LC': 14,
         'VG': 13,
         'GQ': 12,
         'VA': 11,
         'KM': 11,
         'AQ': 11,
         'MH': 10,
         'ST': 8,
         'SB': 8,
         'VC': 8,
         'XSI': 7,
         'ZRCD': 7,
         'VU': 7,
         'EH': 6,
         'GD': 6,
         'AS': 6,
         'KN': 6,
         'AI': 5,
         'WS': 5,
         'XAU': 4,
         'MP': 4,
         'CW': 4,
         'KI': 4,
         'CK': 4,
         'DJ': 3,
         'MS': 3,
         'TC': 3,
         'FM': 2,
         'XNA': 2,
         'SH': 2,
         'CC': 2,
         'JE': 2,
         'NU': 1,
         'TV': 1,
         'PW': 1,
         'NR': 1})
Counter({'ja': 4540461,
         'fr': 4497427,
         'hi': 4464630,
         'es': 4430180,
         'de': 4429755,
         'it': 4428000,
         'pt': 4427843,
         'en': 519582,
         'tr': 37189,
         'cmn': 37002,
         'ru': 36753,
         'bg': 26691,
         'sv': 10337,
         'yue': 8478,
         'he': 8292,
         'qbn': 7250,
         'sr': 6141,
         'ca': 5469,
         'fa': 4082,
         'hr': 1706,
         'nl': 1675,
         'ar': 1383,
         'cs': 1314,
         'tl': 1132,
         'id': 935,
         'ta': 927,
         'ml': 904,
         'sl': 839,
         'te': 716,
         'uk': 622,
         'qbp': 561,
         'sk': 544,
         'th': 504,
         'ms': 491,
         'bn': 470,
         'bs': 439,
         'ur': 336,
         'kn': 208,
         'mr': 202,
         'gl': 183,
         'af': 164,
         'la': 161,
         'ga': 159,
         'eu': 129,
         'cy': 106,
         'gu': 89,
         'gsw': 86,
         'az': 83,
         'lv': 82,
         'kk': 78,
         'mk': 76,
         'lt': 73,
         'yi': 70,
         'pa': 67,
         'ka': 63,
         'mi': 57,
         'gd': 55,
         'qal': 47,
         'et': 42,
         'el': 39,
         'uz': 36,
         'hy': 32,
         'ro': 32,
         'lb': 27,
         'hu': 26,
         'be': 24,
         'zu': 20,
         'fi': 17,
         'qbo': 17,
         'tg': 14,
         'da': 13,
         'ko': 13,
         'zh': 12,
         'xh': 11,
         'pl': 10,
         'tk': 10,
         'ps': 10,
         'sd': 8,
         'no': 7,
         'st': 6,
         'ky': 5,
         'wo': 5,
         'vi': 4,
         'tn': 4,
         'mn': 4,
         'ku': 3,
         'is': 2,
         'rn': 1,
         'eka': 1,
         'sq': 1,
         'roa': 1,
         'rm': 1,
         'su': 1,
         'jv': 1,
         'prs': 1,
         'jsl': 1,
         'fro': 1,
         'haw': 1,
         'lo': 1,
         'my': 1,
         'am': 1,
         'qac': 1,
         'ne': 1,
         'myv': 1,
         'br': 1,
         'iu': 1,
         'cr': 1})
Counter({'transliterated title': 28655,
         'alternative spelling': 21057,
         'new title': 18577,
         'literal English title': 16525,
         'complete title': 15810,
         'literal title': 13631,
         'video box title': 11674,
         'reissue title': 11341,
         'short title': 10500,
         'series title': 9835,
         'alternative transliteration': 9754,
         'DVD box title': 8224,
         'informal English title': 6695,
         'informal title': 6500,
         'dubbed version': 6344,
         'original subtitled version': 5913,
         'informal literal title': 5274,
         'subtitle': 5099,
         'poster title': 4927,
         'video catalogue title': 4672,
         'long title': 4479,
         'informal alternative title': 4193,
         'cable TV title': 3627,
         'original script title': 3307,
         'promotional title': 2967,
         'informal short title': 2751,
         'informal literal English title': 2520,
         'segment title': 2483,
         'theatrical title': 2439,
         'second season title': 2281,
         'review title': 1956,
         'DVD menu title': 1547,
         'rerun title': 1039,
         'pre-release title': 1026,
         'first season title': 1013,
         'third season title': 933,
         'copyright title': 929,
         'premiere title': 820,
         'TV listings title': 715,
         'bootleg title': 619,
         'anthology series': 602,
         'literal translation of working title': 593,
         'second part title': 577,
         'expansion title': 575,
         'closing credits title': 542,
         '3-D version': 535,
         'fourth season title': 524,
         'first part title': 501,
         'recut version': 458,
         'Berlin film festival title': 442,
         'bowdlerized title': 386,
         'censored version': 376,
         'promotional abbreviation': 372,
         'première title': 342,
         'cut version': 318,
         'fifth season title': 302,
         'fake working title': 295,
         'short version': 268,
         'trailer title': 267,
         'orthographically correct title': 258,
         'longer version': 216,
         'game box title': 199,
         '16mm release title': 196,
         'syndication title': 196,
         '8mm release title': 187,
         'Cannes festival title': 172,
         'sixth season title': 165,
         "director's cut": 164,
         'restored version': 161,
         'third part title': 155,
         'IMAX version': 133,
         'seventh season title': 132,
         'video CD title': 130,
         'soft porn version': 115,
         'X-rated version': 112,
         'eighth season title': 104,
         'long new title': 100,
         'unauthorized video title': 85,
         'ninth season title': 78,
         'second copyright title': 77,
         'first episode title': 75,
         'teaser title': 73,
         'tenth season title': 68,
         'last season title': 65,
         'Pay-TV title': 65,
         'uncensored intended title': 62,
         'YIVO translation': 58,
         'first episodes title': 56,
         'summer title': 52,
         'Venice film festival title': 50,
         'eleventh season title': 50,
         'MIFED title': 47,
         'weekend title': 43,
         'racier version': 40,
         'videogame episode': 35,
         'silent version': 29,
         'new syndication title': 27,
         'twelfth season title': 27,
         'Los Angeles premiere title': 24,
         'fourteenth season title': 24,
         'thirteenth season title': 21,
         'sixteenth season title': 18,
         'fifteenth season title': 16,
         'eighteenth season title': 15,
         'nineteenth season title': 15,
         'correct transliteration': 14,
         'twentieth season title': 14,
         'twentyfirst season title': 14,
         'seventeenth season title': 14,
         'LD title': 12,
         'english transliteration': 11,
         'twentysecond season title': 11,
         'twentyfifth season title': 10,
         'non-modified Hepburn romanization': 10,
         'twentythird season title': 9,
         'thirtythird season title': 8,
         'twentyfourth season title': 8,
         'PC version': 8,
         'thirtieth season title': 7,
         'twentysixth season title': 7,
         'daytime version title': 6,
         'English translation of working title': 6,
         'thirtysecond season title': 6,
         'twentyseventh season title': 6,
         'recut version\x02reissue title': 5,
         'modern translation': 5,
         'thirtyfirst season title': 5,
         'rumored': 5,
         'cut version\x02reissue title': 4,
         'redubbed comic version': 4,
         '3-D video title': 4,
         'fourth part title': 4,
         'thirtysixth season title': 4,
         'original pilot title': 4,
         'incorrect title': 4,
         'Los Angeles première title': 3,
         'Yiddish dubbed\x02reissue title': 3,
         'armed forces circuit title': 3,
         'third and fourth season title': 3,
         'informal title\x02literal title': 3,
         'title for episodes with guest hosts': 3,
         'thirtyfourth season title': 3,
         'American Mutoscope & Biograph catalog title': 3,
         'thirtyninth season title': 3,
         'thirtyeighth season title': 3,
         'thirtyseventh season title': 3,
         'reissue title\x02short version': 2,
         'cut version\x02video box title': 2,
         'IMAX version\x02promotional title': 2,
         'twentyninth season title': 2,
         'thirtyfifth season title': 2,
         'twentyeighth season title': 2,
         'R-rated version': 2,
         'GameCube version': 2,
         'first segment title': 2,
         'second segment title': 2,
         '8mm release title\x02short version': 1,
         'added framing sequences and narration in Yiddish\x02reissue title': 1,
         'Bable dialect title': 1,
         'Yiddish dubbed': 1,
         'POLart\x02video box title': 1,
         'POLart': 1,
         'YIVO translation\x02reissue title': 1,
         'first two episodes title': 1,
         'first three episodes title': 1,
         'literal French title': 1,
         '16mm rental title': 1,
         'racier version\x02reissue title': 1,
         '8mm release title\x02second part title': 1,
         'cable TV title\x02cut version': 1,
         'closing credits title\x02pre-release title': 1,
         'Locarno film festival title': 1,
         'longer version\x02rerun title': 1,
         'approximation of original mirrored title': 1,
         'X-rated version\x02bootleg title': 1,
         'poster title\x02video box title': 1,
         'dubbed version\x02recut version': 1,
         'fortieth season title': 1,
         'Hakka dialect title': 1,
         'Bilbao festival title': 1,
         'promotional title\x02thirteenth season title': 1,
         'orthographically correct title\x02video box title': 1,
         'late Sunday edition': 1,
         'fourth season title\x02recut version': 1,
         'third segment title': 1,
         'fifth part title': 1})

tp_categories = Counter(); # title.principals
chars = Counter()
t1 = perf_counter()
job_max_len = 0
with open(Path('data') / 'title.principals.tsv', encoding='utf-8') as f:
    column = {}
    for i, p in enumerate(f.readline().strip().split('\t')):
        column[p.strip()] = i
    for li, line in enumerate(f):
        parts = line.strip().split('\t', len(column))
        category = parts[column['category']]
        job_max_len = max(job_max_len, len(parts[column['job']]))
        if category != '\\N':
            tp_categories[category] += 1
        if parts[column['characters']] != r'\N':
            chars.update(json.loads(parts[column['characters']]))
print(perf_counter() - t1, 's')

130.6693641100428 s

print('job_max_len:', job_max_len)
pprint(tp_categories)

job_max_len: 286
Counter({'actor': 13441424,
         'self': 10560091,
         'actress': 10489927,
         'writer': 8494665,
         'director': 7005873,
         'producer': 3943935,
         'cinematographer': 2067729,
         'composer': 2013765,
         'editor': 2012579,
         'archive_footage': 404370,
         'production_designer': 383677,
         'archive_sound': 4793})

print(len(chars), chars.total())
pprint(chars.most_common(100))

2735521 30382085
[('Self', 4546843),
 ('Self - Host', 1878474),
 ('Self - Presenter', 327377),
 ('Self - Guest', 325307),
 ('Self - Panelist', 250371),
 ('Self - Co-Host', 240752),
 ('Self - Contestant', 237169),
 ('Narrator', 170928),
 ('Themselves', 123295),
 ('Self - Judge', 116250),
 ('Self - Announcer', 111500),
 ('Self - Reporter', 95146),
 ('Self - Hostess', 94778),
 ('Self - Correspondent', 79159),
 ('Various', 74727),
 ('Self - Newsreader', 65513),
 ('Self - Narrator', 58859),
 ('Host', 53434),
 ('Self - Housemate', 46139),
 ('Self - Co-Hostess', 45536),
 ('Presenter', 44458),
 ('Self - Co-Anchor', 42368),
 ('Self - Anchor', 39588),
 ('Self - Model', 37613),
 ('Presented by', 36132),
 ('Additional Voices', 35325),
 ('Various Characters', 34330),
 ('Self - Interviewee', 32986),
 ('Self - Analyst', 29627),
 ('Self - Commentator', 28864),
 ('Self - Musical Guest', 22758),
 ('Self - Panellist', 22155),
 ('Self - Performer', 21038),
 ('Self - Celebrity Contestant', 18933),
 ('Self - Team Captain', 18551),
 ('Self - Programledare', 18360),
 ('Self - Singer', 17714),
 ('Alex', 17126),
 ('Various Roles', 16028),
 ('Self - Musician', 15965),
 ('Announcer', 15932),
 ('Mother', 15794),
 ('Self - Participant', 15469),
 ('Laura', 15339),
 ('Sam', 15078),
 ('Self - Guest Co-Host', 14795),
 ('David', 14581),
 ('James', 13762),
 ('Self - Coach', 13617),
 ('John', 13598),
 ('Self - Weather Forecaster', 13516),
 ('Self - Guest Host', 13493),
 ('Sarah', 13359),
 ('Dancer', 13175),
 ('Self - Meteorologist', 13100),
 ('Self - Comedian', 13062),
 ('Man', 12965),
 ('Anna', 12355),
 ('Tom', 12297),
 ('Jack', 11803),
 ('Father', 11618),
 ('Maria', 11568),
 ('Lisa', 11378),
 ('Mike', 11339),
 ('Doctor', 11204),
 ('Daniel', 11199),
 ('Self - Bandleader', 11150),
 ('Self - Emcee', 10860),
 ('Woman', 10786),
 ('Dad', 10725),
 ('Self - Jury', 10675),
 ('Self - Sports Newsreader', 10662),
 ('Max', 10575),
 ('Self - Guest Panelist', 10518),
 ('Michael', 10484),
 ('Alice', 10370),
 ('Guest', 10270),
 ('Ben', 10210),
 ('Singer', 10086),
 ('Paul', 10021),
 ('Mom', 9942),
 ('Sara', 9932),
 ('Girl', 9749),
 ('Mark', 9606),
 ('Self - Musical Director', 9553),
 ('Self - Interviewer', 9366),
 ('Self - Contributor', 9309),
 ('Self - Color Commentator', 9137),
 ('Chris', 9075),
 ('Charlie', 9070),
 ('Self - Chef', 8970),
 ('Self - Dancer', 8910),
 ('Self - Jury Member', 8861),
 ('Self - Lexicographer', 8808),
 ('Self - Play-by-Play Announcer', 8778),
 ('Self - News Anchor', 8735),
 ('Susan', 8698),
 ('Emma', 8596),
 ('Thomas', 8465),
 ('Themselves - Musical Guest', 8460)]

with open('enums.json', 'w', encoding='utf-8') as f:
    json.dump({'profession': list(sorted(profession.keys())),
               'title_type': list(sorted(title_types.keys())),
               'genre': list(sorted(genres.keys())), 
               't_aka_type': list(sorted(taka_types.keys())), 
               'region': list(sorted(taka_region.keys())), 
               'language': list(sorted(taka_lang.keys())),
               'category': list(sorted(tp_categories.keys())),
               'character': list(sorted(chars.keys()))
              },
             f,
             indent=2)