In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Skip because data is already merged
# iterate over all .csv files in directory and append them into one large file
#i=0
#for file in os.listdir(os.getcwd()):
 #   if file.endswith(".csv"):
  #      if i==0:
   #         train_data = pd.read_csv(file, sep='|')
    #        i+=1
     #   else:
      #      temp = pd.read_csv(file, sep='|')
       #     train_data = train_data.append(temp, ignore_index=True)

In [2]:
train_data = pd.read_csv('TMDB_raw_data_merge.csv', encoding = 'utf8',  sep = '|', engine = 'python')
train_data.shape


Out[2]:
(159212, 21)

In [3]:
# remove rows that have data in the wrong column or other idiosyncrasies 
train_data = train_data[train_data['id'].str.contains('~')==False]
train_data = train_data[train_data['budget'].str.contains('~')==False]
train_data = train_data[train_data['revenue'].str.contains('~')==False]
train_data = train_data[train_data['popularity'].str.contains('~')==False]
train_data = train_data[train_data['vote_count'].str.contains('~')==False]
train_data = train_data[train_data['vote_average'].str.contains('~')==False]
train_data = train_data[train_data['adult'].str.contains('~')==False]
train_data.shape


Out[3]:
(156327, 21)

In [4]:
train_data = train_data.reset_index(drop=True)

In [5]:
# for each column that stores multiple values as a string separated by '~'
# split that string apart and store the name of that column to be broken into dummies
dummy_names = []
for name in list(train_data):
    if name != 'title' and name != 'original_title' and name != 'overview' and name != 'tagline':
        if train_data[name].dtype == 'O' and np.sum(train_data[name].str.contains('~'))!= 0 or name == 'original_language':
            train_data[name] = train_data[name].str.split('~')
            print name
            dummy_names.append(name)


genres
original_language
production_companies
production_countries
spoken_languages

In [6]:
# for each column that was split on '~' except for genre make dummy variables for each
# unique value and concatenate to original data frame
for name in dummy_names:
    if name == 'genres':
        temp = train_data['genres'].str.join(sep='*').str.get_dummies(sep='*')
        labels = ['Unlabeled']*temp.shape[0]
        for i in range(temp.shape[0]):
            if np.sum(temp.loc[i,['28','12','14','878','53','37']]) != 0:
                labels[i] = 'Action'
            if np.sum(temp.loc[i,['80','18','9648','10402','36','10752']]) != 0:
                labels[i] = 'Drama'
            if np.sum(temp.loc[i,['35']]) != 0:
                labels[i] = 'Comedy'
            if np.sum(temp.loc[i,['10751','16']]) != 0:
                labels[i] = 'Family'
            if np.sum(temp.loc[i,['10749']]) != 0:
                labels[i] = 'Romance'
            if np.sum(temp.loc[i,['99']]) != 0:
                labels[i] = 'Documentary'
            if np.sum(temp.loc[i,['27']]) != 0:
                labels[i] = 'Horror'
            if i%15000 == 0:
                print i
        
        labels = pd.DataFrame(labels)
        labels.columns = ['Labels']


0
15000
30000
45000
60000
75000
90000
105000
120000
135000
150000

In [7]:
# put all companies in an array to see how many unique companies there are
companies = []
for i in range(train_data.shape[0]):
    companies = np.append(companies, np.array(train_data['production_companies'][i]))
    if i%15000 == 0:
        print i


0
15000
30000
45000
60000
75000
90000
105000
120000
135000
150000

In [30]:
# find unique companies and counts
unique, counts = np.unique(companies, return_counts = True)
len(unique)


Out[30]:
46133

In [29]:
# sort by most frequent company
sorted(zip(counts,unique), reverse=True)


Out[29]:
[(67438, u'nan'),
 (1476, u'6194'),
 (1325, u'8411'),
 (1074, u'33'),
 (1012, u'4'),
 (894, u'306'),
 (867, u'441'),
 (594, u'5358'),
 (581, u'5798'),
 (569, u'5996'),
 (524, u'5'),
 (501, u'5120'),
 (395, u'3166'),
 (369, u'8520'),
 (366, u'6'),
 (338, u'882'),
 (331, u'3324'),
 (325, u'60'),
 (314, u'1432'),
 (312, u'4395'),
 (295, u'12'),
 (294, u'602'),
 (294, u'2'),
 (260, u'10417'),
 (258, u'4606'),
 (254, u'14317'),
 (244, u'8356'),
 (236, u'9255'),
 (229, u'9195'),
 (229, u'8402'),
 (222, u'18367'),
 (219, u'10330'),
 (216, u'17027'),
 (211, u'955'),
 (206, u'559'),
 (190, u'19788'),
 (188, u'1931'),
 (183, u'7025'),
 (183, u'14'),
 (183, u'10845'),
 (181, u'16466'),
 (179, u'9'),
 (177, u'25'),
 (174, u'235'),
 (169, u'22221'),
 (169, u'18659'),
 (165, u'3823'),
 (163, u'5940'),
 (159, u'6181'),
 (151, u'61'),
 (151, u'288'),
 (149, u'694'),
 (144, u'83'),
 (144, u'5542'),
 (141, u'5766'),
 (137, u'21404'),
 (136, u'328'),
 (133, u'2683'),
 (133, u'1314'),
 (129, u'168'),
 (128, u'4676'),
 (127, u'9266'),
 (127, u'3094'),
 (123, u'7036'),
 (122, u'881'),
 (122, u'174'),
 (121, u'5070'),
 (120, u'364'),
 (120, u'3391'),
 (120, u'201'),
 (119, u'41'),
 (118, u'7295'),
 (118, u'6111'),
 (113, u'1704'),
 (111, u'4928'),
 (110, u'6639'),
 (110, u'5928'),
 (110, u'3268'),
 (110, u'12364'),
 (109, u'4581'),
 (108, u'2521'),
 (107, u'79'),
 (107, u'7333'),
 (106, u'508'),
 (103, u'7201'),
 (103, u'591'),
 (102, u'3221'),
 (102, u'14599'),
 (101, u'5906'),
 (101, u'181'),
 (100, u'7405'),
 (100, u'3681'),
 (100, u'1632'),
 (99, u'856'),
 (98, u'21978'),
 (94, u'7429'),
 (94, u'4207'),
 (93, u'9335'),
 (93, u'6916'),
 (93, u'27'),
 (91, u'1311'),
 (90, u'43'),
 (90, u'1950'),
 (89, u'48266'),
 (89, u'35'),
 (88, u'5175'),
 (88, u'2785'),
 (88, u'10163'),
 (87, u'4056'),
 (86, u'2650'),
 (86, u'19781'),
 (86, u'1251'),
 (85, u'6301'),
 (84, u'915'),
 (84, u'6068'),
 (83, u'56'),
 (83, u'3'),
 (82, u'925'),
 (82, u'5897'),
 (81, u'97'),
 (80, u'76'),
 (79, u'9349'),
 (79, u'850'),
 (79, u'7446'),
 (79, u'1115'),
 (77, u'641'),
 (76, u'806'),
 (76, u'5755'),
 (75, u'8273'),
 (75, u'6896'),
 (75, u'3052'),
 (75, u'10947'),
 (74, u'1382'),
 (74, u'10254'),
 (73, u'23'),
 (73, u'11795'),
 (72, u'5164'),
 (71, u'3282'),
 (70, u'7981'),
 (70, u'491'),
 (70, u'3341'),
 (70, u'15671'),
 (70, u'13'),
 (69, u'9181'),
 (69, u'5094'),
 (69, u'29587'),
 (67, u'7521'),
 (67, u'7254'),
 (67, u'429'),
 (67, u'3214'),
 (67, u'3153'),
 (67, u'13852'),
 (66, u'4867'),
 (66, u'16804'),
 (65, u'7281'),
 (64, u'537'),
 (64, u'2452'),
 (64, u'12874'),
 (63, u'7466'),
 (63, u'5870'),
 (63, u'4288'),
 (62, u'711'),
 (62, u'119'),
 (62, u'11613'),
 (62, u'10611'),
 (61, u'9027'),
 (61, u'308'),
 (61, u'1957'),
 (61, u'18595'),
 (61, u'1242'),
 (61, u'10399'),
 (60, u'7680'),
 (60, u'7575'),
 (60, u'6246'),
 (60, u'3287'),
 (60, u'2320'),
 (60, u'1701'),
 (60, u'1679'),
 (60, u'1429'),
 (59, u'4641'),
 (59, u'3653'),
 (59, u'18908'),
 (59, u'10473'),
 (58, u'875'),
 (58, u'521'),
 (58, u'444'),
 (58, u'223'),
 (58, u'20091'),
 (58, u'12113'),
 (57, u'5724'),
 (57, u'4688'),
 (57, u'17513'),
 (57, u'1728'),
 (57, u'12530'),
 (57, u'10210'),
 (56, u'7961'),
 (56, u'7584'),
 (56, u'4063'),
 (55, u'8861'),
 (55, u'528'),
 (55, u'1645'),
 (55, u'157'),
 (54, u'3172'),
 (54, u'126'),
 (54, u'1'),
 (53, u'8302'),
 (53, u'8237'),
 (53, u'6452'),
 (53, u'6018'),
 (53, u'5975'),
 (53, u'4343'),
 (53, u'42019'),
 (53, u'356'),
 (53, u'311'),
 (53, u'2865'),
 (52, u'1885'),
 (52, u'17980'),
 (52, u'1216'),
 (52, u'11907'),
 (51, u'6790'),
 (51, u'15913'),
 (51, u'1302'),
 (51, u'104'),
 (50, u'7819'),
 (50, u'6438'),
 (50, u'310'),
 (49, u'6586'),
 (49, u'47'),
 (49, u'3033'),
 (49, u'15808'),
 (49, u'15567'),
 (49, u'14620'),
 (49, u'12617'),
 (49, u'118'),
 (49, u'11672'),
 (49, u'10146'),
 (49, u'10104'),
 (48, u'7508'),
 (48, u'6793'),
 (48, u'622'),
 (48, u'5822'),
 (48, u'2073'),
 (48, u'15165'),
 (48, u'1478'),
 (48, u'12372'),
 (48, u'11240'),
 (48, u'10708'),
 (48, u'10625'),
 (47, u'82'),
 (47, u'66848'),
 (47, u'6292'),
 (47, u'3602'),
 (47, u'3213'),
 (47, u'26727'),
 (47, u'264'),
 (47, u'191'),
 (47, u'1569'),
 (47, u'1393'),
 (47, u'10932'),
 (47, u'101'),
 (46, u'6735'),
 (46, u'497'),
 (46, u'3065'),
 (46, u'258'),
 (46, u'20915'),
 (46, u'14171'),
 (46, u'1370'),
 (46, u'12782'),
 (46, u'11246'),
 (46, u'11199'),
 (45, u'7454'),
 (45, u'5826'),
 (45, u'27434'),
 (45, u'27155'),
 (45, u'14723'),
 (45, u'1444'),
 (44, u'9342'),
 (44, u'7110'),
 (44, u'659'),
 (44, u'4300'),
 (44, u'29165'),
 (44, u'28208'),
 (44, u'2441'),
 (44, u'1647'),
 (44, u'16110'),
 (44, u'16'),
 (44, u'15868'),
 (44, u'15276'),
 (44, u'12093'),
 (44, u'10342'),
 (44, u'10339'),
 (43, u'8582'),
 (43, u'74327'),
 (43, u'729'),
 (43, u'60390'),
 (43, u'46'),
 (43, u'41077'),
 (43, u'3958'),
 (43, u'35795'),
 (43, u'333'),
 (43, u'2953'),
 (43, u'25894'),
 (43, u'14063'),
 (42, u'94'),
 (42, u'9300'),
 (42, u'923'),
 (42, u'9223'),
 (42, u'64'),
 (42, u'635'),
 (42, u'15980'),
 (42, u'1221'),
 (42, u'12077'),
 (41, u'6530'),
 (41, u'59811'),
 (41, u'4952'),
 (41, u'4933'),
 (41, u'3449'),
 (41, u'158'),
 (41, u'1556'),
 (41, u'1353'),
 (41, u'1331'),
 (41, u'12657'),
 (40, u'9974'),
 (40, u'6417'),
 (40, u'5267'),
 (40, u'3448'),
 (40, u'2608'),
 (40, u'22123'),
 (40, u'1501'),
 (40, u'12778'),
 (40, u'11244'),
 (39, u'5779'),
 (39, u'4294'),
 (39, u'3507'),
 (39, u'3475'),
 (39, u'171'),
 (39, u'1666'),
 (39, u'13479'),
 (39, u'12984'),
 (39, u'1171'),
 (39, u'11143'),
 (39, u'10039'),
 (38, u'5686'),
 (38, u'4881'),
 (38, u'420'),
 (38, u'17516'),
 (38, u'16785'),
 (38, u'1538'),
 (38, u'1377'),
 (38, u'10622'),
 (37, u'8724'),
 (37, u'829'),
 (37, u'8'),
 (37, u'7260'),
 (37, u'70'),
 (37, u'6849'),
 (37, u'6187'),
 (37, u'4799'),
 (37, u'3614'),
 (37, u'2883'),
 (37, u'2786'),
 (37, u'22213'),
 (37, u'1848'),
 (37, u'16366'),
 (37, u'1422'),
 (37, u'11745'),
 (37, u'10617'),
 (37, u'1009'),
 (36, u'9155'),
 (36, u'8298'),
 (36, u'6425'),
 (36, u'516'),
 (36, u'4006'),
 (36, u'2726'),
 (36, u'1971'),
 (36, u'1460'),
 (36, u'12991'),
 (36, u'11921'),
 (35, u'986'),
 (35, u'86225'),
 (35, u'7872'),
 (35, u'670'),
 (35, u'6458'),
 (35, u'5458'),
 (35, u'4946'),
 (35, u'29223'),
 (35, u'289'),
 (35, u'27787'),
 (35, u'17369'),
 (35, u'13549'),
 (35, u'10405'),
 (35, u'10201'),
 (34, u'9993'),
 (34, u'8676'),
 (34, u'8530'),
 (34, u'6679'),
 (34, u'660'),
 (34, u'6125'),
 (34, u'5844'),
 (34, u'3991'),
 (34, u'371'),
 (34, u'294'),
 (34, u'254'),
 (34, u'219'),
 (34, u'16630'),
 (34, u'11359'),
 (34, u'11237'),
 (34, u'10771'),
 (34, u'10000'),
 (33, u'9292'),
 (33, u'9209'),
 (33, u'8165'),
 (33, u'6379'),
 (33, u'6147'),
 (33, u'6116'),
 (33, u'5388'),
 (33, u'4784'),
 (33, u'2490'),
 (33, u'2348'),
 (33, u'22155'),
 (33, u'2159'),
 (33, u'20662'),
 (33, u'11773'),
 (33, u'11671'),
 (32, u'7483'),
 (32, u'726'),
 (32, u'65192'),
 (32, u'5730'),
 (32, u'5104'),
 (32, u'4898'),
 (32, u'47208'),
 (32, u'3544'),
 (32, u'29716'),
 (32, u'20358'),
 (32, u'20192'),
 (32, u'1972'),
 (32, u'16636'),
 (32, u'14966'),
 (32, u'1360'),
 (32, u'12062'),
 (32, u'11061'),
 (32, u'10823'),
 (32, u'10707'),
 (32, u'10621'),
 (31, u'9987'),
 (31, u'819'),
 (31, u'769'),
 (31, u'763'),
 (31, u'688'),
 (31, u'5462'),
 (31, u'5373'),
 (31, u'534'),
 (31, u'5125'),
 (31, u'3034'),
 (31, u'2260'),
 (31, u'2230'),
 (31, u'2166'),
 (31, u'2064'),
 (31, u'192'),
 (31, u'15278'),
 (31, u'1403'),
 (31, u'13969'),
 (31, u'1212'),
 (31, u'11661'),
 (31, u'10580'),
 (31, u'10308'),
 (30, u'8930'),
 (30, u'8492'),
 (30, u'843'),
 (30, u'6082'),
 (30, u'608'),
 (30, u'5552'),
 (30, u'5186'),
 (30, u'3458'),
 (30, u'208'),
 (30, u'18880'),
 (30, u'16037'),
 (30, u'15645'),
 (30, u'1398'),
 (30, u'12994'),
 (30, u'12111'),
 (30, u'1138'),
 (30, u'10255'),
 (30, u'10102'),
 (29, u'893'),
 (29, u'8263'),
 (29, u'7799'),
 (29, u'746'),
 (29, u'7248'),
 (29, u'6519'),
 (29, u'5846'),
 (29, u'5369'),
 (29, u'53009'),
 (29, u'5253'),
 (29, u'4530'),
 (29, u'3790'),
 (29, u'3693'),
 (29, u'3468'),
 (29, u'278'),
 (29, u'2269'),
 (29, u'2188'),
 (29, u'1763'),
 (29, u'11846'),
 (29, u'1088'),
 (28, u'9301'),
 (28, u'8358'),
 (28, u'8355'),
 (28, u'826'),
 (28, u'803'),
 (28, u'6999'),
 (28, u'69770'),
 (28, u'6755'),
 (28, u'6584'),
 (28, u'588'),
 (28, u'58'),
 (28, u'5754'),
 (28, u'53'),
 (28, u'5056'),
 (28, u'3902'),
 (28, u'3086'),
 (28, u'2674'),
 (28, u'2328'),
 (28, u'2268'),
 (28, u'19150'),
 (28, u'14809'),
 (28, u'14714'),
 (28, u'10919'),
 (27, u'876'),
 (27, u'8670'),
 (27, u'8478'),
 (27, u'8171'),
 (27, u'7937'),
 (27, u'7493'),
 (27, u'7437'),
 (27, u'7294'),
 (27, u'718'),
 (27, u'50054'),
 (27, u'4811'),
 (27, u'461'),
 (27, u'4375'),
 (27, u'42242'),
 (27, u'4110'),
 (27, u'4051'),
 (27, u'39985'),
 (27, u'3656'),
 (27, u'357'),
 (27, u'297'),
 (27, u'28367'),
 (27, u'28205'),
 (27, u'2499'),
 (27, u'2185'),
 (27, u'21206'),
 (27, u'19037'),
 (27, u'18903'),
 (27, u'175'),
 (27, u'1742'),
 (27, u'16973'),
 (27, u'1587'),
 (27, u'1548'),
 (27, u'1491'),
 (27, u'13591'),
 (27, u'130'),
 (26, u'9221'),
 (26, u'9149'),
 (26, u'898'),
 (26, u'851'),
 (26, u'8212'),
 (26, u'8100'),
 (26, u'7899'),
 (26, u'6677'),
 (26, u'6339'),
 (26, u'55913'),
 (26, u'4630'),
 (26, u'3675'),
 (26, u'3631'),
 (26, u'3608'),
 (26, u'3546'),
 (26, u'3055'),
 (26, u'3029'),
 (26, u'2527'),
 (26, u'2251'),
 (26, u'1516'),
 (26, u'1477'),
 (26, u'14159'),
 (26, u'12745'),
 (26, u'1205'),
 (26, u'11840'),
 (26, u'11073'),
 (26, u'10221'),
 (25, u'888'),
 (25, u'868'),
 (25, u'7561'),
 (25, u'75314'),
 (25, u'7448'),
 (25, u'6332'),
 (25, u'5632'),
 (25, u'5490'),
 (25, u'4753'),
 (25, u'4433'),
 (25, u'4400'),
 (25, u'435'),
 (25, u'37732'),
 (25, u'3635'),
 (25, u'33629'),
 (25, u'309'),
 (25, u'3012'),
 (25, u'224'),
 (25, u'18368'),
 (25, u'17873'),
 (25, u'172'),
 (25, u'16670'),
 (25, u'163'),
 (25, u'1428'),
 (25, u'1406'),
 (25, u'12142'),
 (25, u'12026'),
 (25, u'1183'),
 (25, u'1083'),
 (25, u'10522'),
 (25, u'1038'),
 (25, u'10105'),
 (24, u'9387'),
 (24, u'9383'),
 (24, u'932'),
 (24, u'8555'),
 (24, u'7929'),
 (24, u'737'),
 (24, u'7197'),
 (24, u'7013'),
 (24, u'6861'),
 (24, u'6704'),
 (24, u'6538'),
 (24, u'6197'),
 (24, u'5888'),
 (24, u'5381'),
 (24, u'5225'),
 (24, u'5026'),
 (24, u'4899'),
 (24, u'4715'),
 (24, u'45970'),
 (24, u'4178'),
 (24, u'3623'),
 (24, u'3618'),
 (24, u'34'),
 (24, u'3362'),
 (24, u'3229'),
 (24, u'296'),
 (24, u'2788'),
 (24, u'2502'),
 (24, u'24955'),
 (24, u'238'),
 (24, u'2345'),
 (24, u'22060'),
 (24, u'16418'),
 (24, u'15505'),
 (24, u'1502'),
 (24, u'1497'),
 (24, u'14623'),
 (24, u'144'),
 (24, u'13404'),
 (24, u'12514'),
 (24, u'12154'),
 (24, u'12096'),
 (24, u'11858'),
 (24, u'11317'),
 (24, u'10569'),
 (24, u'10565'),
 (23, u'8453'),
 (23, u'8277'),
 (23, u'8138'),
 (23, u'790'),
 (23, u'7576'),
 (23, u'75592'),
 (23, u'738'),
 (23, u'5391'),
 (23, u'512'),
 (23, u'5073'),
 (23, u'507'),
 (23, u'4792'),
 (23, u'4748'),
 (23, u'4255'),
 (23, u'3964'),
 (23, u'3447'),
 (23, u'290'),
 (23, u'28567'),
 (23, u'280'),
 (23, u'27706'),
 (23, u'2669'),
 (23, u'266'),
 (23, u'2514'),
 (23, u'1992'),
 (23, u'18477'),
 (23, u'1829'),
 (23, u'1786'),
 (23, u'1639'),
 (23, u'162'),
 (23, u'1341'),
 (23, u'12630'),
 (23, u'12360'),
 (23, u'12240'),
 (23, u'11737'),
 (23, u'11620'),
 (23, u'11561'),
 (23, u'10393'),
 (23, u'10031'),
 (22, u'9980'),
 (22, u'9068'),
 (22, u'8858'),
 (22, u'8659'),
 (22, u'8595'),
 (22, u'8354'),
 (22, u'7965'),
 (22, u'7928'),
 (22, u'7671'),
 (22, u'7395'),
 (22, u'7320'),
 (22, u'7173'),
 (22, u'7024'),
 (22, u'6652'),
 (22, u'5671'),
 (22, u'4977'),
 (22, u'494'),
 (22, u'4745'),
 (22, u'4683'),
 (22, u'4564'),
 (22, u'3952'),
 (22, u'3759'),
 (22, u'3712'),
 (22, u'3632'),
 (22, u'3491'),
 (22, u'342'),
 (22, u'3393'),
 (22, u'3363'),
 (22, u'318'),
 (22, u'31059'),
 (22, u'27504'),
 (22, u'2448'),
 (22, u'23504'),
 (22, u'2302'),
 (22, u'2013'),
 (22, u'1869'),
 (22, u'1787'),
 (22, u'17161'),
 (22, u'16132'),
 (22, u'1496'),
 (22, u'14040'),
 (22, u'1379'),
 (22, u'13337'),
 (22, u'13121'),
 (22, u'12838'),
 (22, u'11792'),
 (22, u'11341'),
 (22, u'1080'),
 (22, u'1066'),
 (22, u'10481'),
 (22, u'1018'),
 (21, u'9372'),
 (21, u'9340'),
 (21, u'9240'),
 (21, u'9152'),
 (21, u'9024'),
 (21, u'8924'),
 (21, u'859'),
 (21, u'8409'),
 (21, u'8106'),
 (21, u'6938'),
 (21, u'6739'),
 (21, u'6736'),
 (21, u'5998'),
 (21, u'5781'),
 (21, u'5426'),
 (21, u'5003'),
 (21, u'4948'),
 (21, u'4775'),
 (21, u'47131'),
 (21, u'4681'),
 (21, u'4679'),
 (21, u'4079'),
 (21, u'3929'),
 (21, u'3760'),
 (21, u'3466'),
 (21, u'321'),
 (21, u'3179'),
 (21, u'307'),
 (21, u'3061'),
 (21, u'3040'),
 (21, u'2927'),
 (21, u'2908'),
 (21, u'2846'),
 (21, u'2612'),
 (21, u'2532'),
 (21, u'21972'),
 (21, u'1926'),
 (21, u'18718'),
 (21, u'16675'),
 (21, u'16527'),
 (21, u'1524'),
 (21, u'14197'),
 (21, u'13930'),
 (21, u'1268'),
 (21, u'12389'),
 (21, u'12081'),
 (21, u'11391'),
 (21, u'10843'),
 (21, u'1063'),
 (21, u'1030'),
 (20, u'9015'),
 (20, u'846'),
 (20, u'8147'),
 (20, u'7343'),
 (20, u'6941'),
 (20, u'680'),
 (20, u'6451'),
 (20, u'5778'),
 (20, u'506'),
 (20, u'500'),
 (20, u'4740'),
 (20, u'4141'),
 (20, u'4009'),
 (20, u'393'),
 (20, u'3685'),
 (20, u'3309'),
 (20, u'3237'),
 (20, u'31080'),
 (20, u'3070'),
 (20, u'285'),
 (20, u'2783'),
 (20, u'2577'),
 (20, u'25115'),
 (20, u'249'),
 (20, u'2365'),
 (20, u'2112'),
 (20, u'2061'),
 (20, u'20542'),
 (20, u'2050'),
 (20, u'19237'),
 (20, u'1804'),
 (20, u'15488'),
 (20, u'14718'),
 (20, u'14104'),
 (20, u'14035'),
 (20, u'13845'),
 (20, u'13682'),
 (20, u'13660'),
 (20, u'13433'),
 (20, u'12268'),
 (20, u'11917'),
 (20, u'11735'),
 (20, u'11448'),
 (20, u'10954'),
 (19, u'860'),
 (19, u'81'),
 (19, u'79568'),
 (19, u'768'),
 (19, u'744'),
 (19, u'6962'),
 (19, u'6778'),
 (19, u'6363'),
 (19, u'5982'),
 (19, u'5674'),
 (19, u'52372'),
 (19, u'4701'),
 (19, u'4376'),
 (19, u'436'),
 (19, u'4206'),
 (19, u'4107'),
 (19, u'3984'),
 (19, u'3633'),
 (19, u'34034'),
 (19, u'29566'),
 (19, u'2902'),
 (19, u'248'),
 (19, u'2329'),
 (19, u'216'),
 (19, u'2070'),
 (19, u'1898'),
 (19, u'189'),
 (19, u'1766'),
 (19, u'16934'),
 (19, u'16772'),
 (19, u'16430'),
 (19, u'16401'),
 (19, u'16323'),
 (19, u'1583'),
 (19, u'15730'),
 (19, u'1533'),
 (19, u'1527'),
 (19, u'15175'),
 (19, u'1500'),
 (19, u'134'),
 (19, u'11427'),
 (19, u'11308'),
 (19, u'10657'),
 (19, u'10348'),
 (19, u'10345'),
 (19, u'103'),
 (18, u'9303'),
 (18, u'9177'),
 (18, u'9020'),
 (18, u'8962'),
 (18, u'870'),
 (18, u'836'),
 (18, u'8335'),
 (18, u'8333'),
 (18, u'813'),
 (18, u'771'),
 (18, u'76043'),
 (18, u'748'),
 (18, u'7364'),
 (18, u'7255'),
 (18, u'6480'),
 (18, u'6427'),
 (18, u'6254'),
 (18, u'619'),
 (18, u'5747'),
 (18, u'5484'),
 (18, u'4913'),
 (18, u'48859'),
 (18, u'4875'),
 (18, u'48320'),
 (18, u'4599'),
 (18, u'4598'),
 (18, u'4527'),
 (18, u'4462'),
 (18, u'4247'),
 (18, u'3920'),
 (18, u'3906'),
 (18, u'37'),
 (18, u'3676'),
 (18, u'3619'),
 (18, u'3604'),
 (18, u'3451'),
 (18, u'3307'),
 (18, u'3202'),
 (18, u'31547'),
 (18, u'3041'),
 (18, u'29695'),
 (18, u'2918'),
 (18, u'28514'),
 (18, u'284'),
 (18, u'2481'),
 (18, u'2395'),
 (18, u'2370'),
 (18, u'236'),
 (18, u'22959'),
 (18, u'22792'),
 (18, u'22622'),
 (18, u'20580'),
 (18, u'20235'),
 (18, u'19551'),
 (18, u'19246'),
 (18, u'1812'),
 (18, u'1807'),
 (18, u'18010'),
 (18, u'17392'),
 (18, u'15433'),
 (18, u'15005'),
 (18, u'14680'),
 (18, u'1445'),
 (18, u'1441'),
 (18, u'1371'),
 (18, u'13587'),
 (18, u'13282'),
 (18, u'13001'),
 (18, u'1249'),
 (18, u'11842'),
 (18, u'11788'),
 (18, u'1176'),
 (18, u'11407'),
 (18, u'11332'),
 (18, u'1124'),
 (18, u'11152'),
 (18, u'10592'),
 (18, u'1048'),
 (18, u'10284'),
 (17, u'9210'),
 (17, u'9168'),
 (17, u'8955'),
 (17, u'8471'),
 (17, u'8350'),
 (17, u'8210'),
 (17, u'8146'),
 (17, u'8047'),
 (17, u'7956'),
 (17, u'7710'),
 (17, u'758'),
 (17, u'7431'),
 (17, u'7419'),
 (17, u'7396'),
 (17, u'7237'),
 (17, u'708'),
 (17, u'7008'),
 (17, u'6416'),
 (17, u'6397'),
 (17, u'6104'),
 (17, u'5915'),
 (17, u'58596'),
 (17, u'5752'),
 (17, u'570'),
 (17, u'56689'),
 ...]

In [37]:
# dummy variables for categories excpet genre and companies
for name in dummy_names:
    if name != 'production_companies' and name != 'genres':
        temp = train_data[name].str.join(sep='*').str.get_dummies(sep='*')
        temp.columns = [name + '_' + str(col) for col in temp.columns]
        train_data = pd.concat([train_data.reset_index(drop=True), temp], axis=1)
        print name


original_language
production_countries
spoken_languages

In [38]:
train_data.shape


Out[38]:
(156327, 525)

In [39]:
# remove from features data
train_data = train_data.drop('genres', axis=1)
train_data = train_data.drop('original_language', axis=1)
train_data = train_data.drop('production_countries', axis=1)
train_data = train_data.drop('spoken_languages', axis=1)

In [40]:
train_data.shape


Out[40]:
(156327, 521)

In [42]:
features = train_data

In [43]:
features.to_csv('features_V1.csv', index=False, encoding='utf-8')
labels.to_csv('labels.csv', index=False, encoding='utf-8')

In [ ]: