In [1]:
import json
import json_helper as jhlp
import matplotlib.pyplot as plt
import os
%matplotlib inline
import random as rnd
import numpy as np
from rake_topic_extraction import Rake
topic_extractor = Rake('SmartStoplist.txt')

In [2]:
plt.style.use('ggplot')

In [3]:
scripts_path = '/home/vishi/imsdb2'
txt_names = [scripts_path + '/' + name for name in os.listdir(scripts_path)]
script_texts = []
movie_names = []
for txt in txt_names:
    with open(txt) as f:
        lines = f.readlines()
        data = []
        for i,l in enumerate(lines):
            temp = l.replace('<b>','').replace('</b>','').replace('\n','')
            if temp != '':
                data.append(temp)
        script_texts.append(data)
        name = txt.split('/')[-1].split('.')[0]
        movie_names.append(name)

In [4]:
movie_text_all = []
movie_text_by_scene = []

num_bins = 100
for script_lines in script_texts:
    movie_text_all.append(' '.join([l.strip() for l in script_lines]))
    txt_by_sc = []
    for i in range(num_bins):
        bin_size = len(script_lines)/num_bins
        txt_by_sc.append(' '.join([l.strip() for l in script_lines[i*bin_size:(i+1)*bin_size]]))
    movie_text_by_scene.append(txt_by_sc)

In [5]:
print len(movie_text_all)
print len(movie_text_by_scene)
print len(movie_names)


946
946
946

In [6]:
#print movie_text_all[0]
print movie_text_by_scene[0][0]


STIR OF ECHOES Written by David Koepp Based on the novel  Richard Matheson March 26, 1998   Sometimes within the brain's old ghostly house, I hear, far off, at some forgotten door, A music and an eerie faint carouse And stir of echoes down the creaking floor. "Chambers of Imagery" Archibald MacLeish   In the black, a child HUMS. Gentle WATER sounds. FADE IN: INT A BATHROOM NIGHT JAKE, a four year old boy, sits in a bathtub. The door to thi bathroom is open and his mother is visible in the background, walking back and forth in the bedroom, getting dressed to go out. The STEREO is blaring in the bedroom, the music echoes off the tile in the tiny bathroom. Jake is playing with a plastic airplane. He answers a question. JAKE YES (PAUSE) Sometimes. (PAUSE) With my toys. He looks up, but we're close in on him and can't see who he's looking at. JAKE (cont'd) My... blue sword. No! The one with the gray tape around the middle. That one. He lands the airplane on the water. He giggles. JAKE (coast' d) That's silly. MAGGIE (o.s.) (from the bedroom) Jake? You all right in there? JAKE (calling out to the door) YES!

In [7]:
def get_scene_weight(movie_keywords, movie_id, scene_id):
    keyword_extractor = Rake('SmartStoplist.txt')
    scene_keywords = keyword_extractor.run(movie_text_by_scene[movie_id][scene_id])
    topic_overlap = len([1 for w in movie_keywords if w in scene_keywords])
    return topic_overlap

In [8]:
topic_overlaps_for_movie = []

width = 0.8
for i in range(len(movie_names)):
    movie_keywords = topic_extractor.run(movie_text_all[i])
    weights = [get_scene_weight(movie_keywords, i, j) for j in range(num_bins)]
    topic_overlaps_for_movie.append(weights)

In [14]:
#plt.figure(figsize=(10,5))
mname = 'Pirates of the Caribbean: Dead Man\'s Chest'
bplot = plt.bar(range(num_bins), topic_overlaps_for_movie[movie_names.index(mname)])
plt.xlabel('scene index')
plt.ylabel('topic ovelap')
plt.title(mname)
plt.show()



In [21]:
stds = [np.std(wt) for wt in topic_overlaps_for_movie]
plt.plot(range(len(stds)), stds)
plt.title('deviation for topic overlaps')
plt.show()

plt.plot(range(len(stds)), sorted(stds))
plt.title('sorted deviations')
plt.show()



In [15]:
distributions_for_movie = []
for overlap in topic_overlaps_for_movie:
    dist = []
    for i in range(num_bins/10):
        dist.append(np.mean(overlap[i*10:(i+1)*10]))
    distributions_for_movie.append(dist)

In [42]:
plt.style.use('ggplot')

plt.plot(range(10), distributions_for_movie[50]); plt.show()
plt.plot(range(10), distributions_for_movie[100]); plt.show()
plt.plot(range(10), distributions_for_movie[150]); plt.show()
plt.plot(range(10), distributions_for_movie[250]); plt.show()
plt.plot(range(10), distributions_for_movie[500]); plt.show()



In [18]:
plt.plot(range(10), np.mean(distributions_for_movie, 0)); plt.show()
plt.plot(range(10), np.var(distributions_for_movie, 0)); plt.show()



In [54]:
import pandas as pd
dat = [[movie_names[i]]+distributions_for_movie[i] for i in range(len(movie_names))]
df = pd.DataFrame(dat)
df


Out[54]:
0 1 2 3 4 5 6 7 8 9 10
0 Stir of Echoes 21.3 22.2 21.5 24.1 21.9 22.7 23.8 18.9 23.2 22.7
1 Rachel Getting Married 20.4 21.1 20.4 22.6 19.3 19.3 19.7 22.3 27.8 20.6
2 Brick 16.7 17.6 18.7 16.2 19.7 18.4 17.3 13.0 12.7 15.9
3 Avatar 32.1 27.3 24.2 24.1 26.7 23.9 22.4 24.3 24.0 22.5
4 Leviathan 21.0 18.5 18.0 15.3 15.9 13.0 18.6 13.1 13.6 18.5
5 Man Trouble 24.7 27.4 28.8 23.6 24.9 28.9 25.2 25.5 26.2 27.7
6 Grand Theft Parsons 22.8 18.8 17.1 18.1 14.8 19.6 17.6 18.2 18.2 19.0
7 Inventing the Abbotts 26.6 19.1 20.0 18.0 20.5 17.6 13.0 18.5 15.7 19.4
8 Donnie Brasco 28.6 23.4 21.9 21.5 23.9 21.1 21.6 18.9 24.0 25.1
9 Get Low 20.3 13.9 12.1 15.6 14.6 12.1 12.7 12.5 15.2 16.3
10 Life As A House 24.3 27.8 20.9 21.3 22.0 19.2 18.5 20.2 20.3 21.4
11 Taxi Driver 29.1 23.1 18.2 23.7 22.1 32.3 22.0 17.6 20.7 24.4
12 Girl with the Dragon Tattoo, The 27.1 32.0 28.3 24.6 31.8 25.0 29.7 27.8 29.6 30.7
13 Meet John Doe 24.4 23.2 25.3 23.6 28.8 29.0 25.8 25.3 29.9 30.7
14 Miller's Crossing 22.9 17.5 21.8 22.4 19.2 18.8 21.1 24.5 19.2 18.4
15 Saving Private Ryan 20.3 13.8 18.3 21.5 19.2 17.6 16.0 15.3 12.4 16.3
16 Gran Torino 24.7 18.6 21.6 21.7 17.1 18.2 14.2 17.0 17.8 18.6
17 Airplane 2: The Sequel 21.6 19.1 20.3 15.9 14.6 16.8 16.4 19.9 15.5 14.1
18 Battle: Los Angeles 21.2 19.6 15.9 14.8 16.6 13.8 16.9 14.5 12.5 16.4
19 Taking Lives 23.2 20.9 20.4 19.1 21.6 20.0 24.0 21.1 21.2 22.9
20 They 36.9 26.3 25.4 28.6 28.3 26.3 29.5 27.0 25.7 30.8
21 Rock, The 25.8 25.3 18.8 22.9 15.7 20.2 16.1 15.1 16.9 17.0
22 Amadeus 35.2 21.7 25.4 25.7 22.2 25.8 24.5 26.4 29.8 24.6
23 Die Hard 25.5 20.3 26.8 22.9 19.5 18.9 20.1 23.1 17.7 24.7
24 Eastern Promises 20.9 17.0 20.8 17.3 15.6 15.5 19.1 19.7 18.5 17.1
25 Wrestler, The 24.8 22.4 24.9 18.5 19.7 19.2 18.3 23.5 20.2 20.9
26 Soldier 30.3 18.6 22.4 21.0 21.4 18.0 20.0 18.9 21.3 20.8
27 Perfect Creature 18.6 18.3 14.6 15.3 14.6 16.8 19.0 15.5 16.8 16.4
28 Bottle Rocket 16.6 16.1 15.5 14.0 15.3 15.9 15.6 14.0 15.4 12.5
29 Shame 18.5 10.3 11.7 12.8 10.8 15.7 10.3 17.9 11.8 21.4
... ... ... ... ... ... ... ... ... ... ... ...
916 Dumb and Dumber 25.4 21.8 19.4 24.6 26.2 22.2 26.6 22.0 21.6 20.9
917 Alien Nation 21.5 18.4 19.0 16.4 17.6 22.1 15.7 16.6 14.8 21.8
918 Bad Santa 19.0 18.4 13.2 12.2 12.9 14.3 11.1 14.9 18.4 20.7
919 Zerophilia 22.2 19.9 16.7 17.0 13.3 15.4 15.0 16.9 16.0 13.3
920 Gremlins 16.9 17.7 16.6 16.7 14.1 12.6 11.7 16.5 14.8 14.1
921 Time Machine, The 20.2 18.4 21.7 25.5 25.1 20.4 18.3 21.3 18.7 17.4
922 Lost in Space 21.2 19.8 13.1 13.3 15.8 16.0 17.9 19.3 15.5 16.0
923 Supergirl 18.3 23.7 21.3 18.6 20.7 20.5 19.4 27.1 27.2 18.0
924 Nurse Betty 29.7 28.7 25.5 23.9 28.2 24.5 23.0 24.2 24.3 26.4
925 Miracle Worker, The 5.6 6.3 8.8 8.2 10.0 4.7 4.8 6.7 4.6 5.0
926 Apocalypse Now 24.4 19.1 11.0 14.4 13.7 15.6 17.7 21.3 20.4 19.2
927 Bonfire of the Vanities 24.5 22.8 20.1 20.7 18.5 20.6 22.2 23.4 25.0 23.7
928 Robin Hood: Prince of Thieves 28.0 25.8 21.9 27.2 26.7 27.8 27.8 23.9 26.1 28.7
929 RocknRolla 16.5 20.5 17.6 18.1 18.1 18.5 18.7 21.4 19.5 19.9
930 Disturbia 28.2 30.0 20.5 16.1 18.4 19.8 17.6 15.8 14.7 18.6
931 King Kong 27.4 22.0 21.7 19.1 17.7 22.7 23.1 23.0 20.6 22.0
932 Titanic 37.7 35.4 28.3 33.2 29.5 24.9 22.7 23.9 27.8 28.7
933 Assassins 17.4 21.8 17.8 20.8 15.7 15.8 13.8 14.2 16.0 13.8
934 American Werewolf in London 15.4 13.3 12.3 18.4 20.0 13.8 16.9 17.4 13.8 16.5
935 Losers, The 19.6 19.0 17.1 20.9 18.7 18.4 14.8 17.7 16.7 16.5
936 Cellular 23.4 17.2 15.0 20.5 14.0 16.1 20.0 17.4 17.8 20.8
937 Gang Related 22.3 18.8 14.8 19.0 16.1 17.7 16.1 16.1 13.4 19.0
938 Program, The 25.0 21.6 23.6 24.7 20.5 21.7 21.2 21.7 20.1 25.5
939 Three Men and a Baby 19.1 19.1 20.2 21.4 23.6 23.6 19.6 19.5 24.0 23.3
940 Mini's First Time 29.9 25.2 22.6 26.3 23.0 21.8 22.4 24.2 22.7 25.4
941 White Christmas 20.9 13.5 18.4 15.7 16.7 19.0 16.6 18.0 13.5 19.5
942 Six Degrees of Separation 18.5 17.9 32.7 18.0 15.3 21.0 21.8 19.3 16.1 14.0
943 Starman 25.9 28.9 21.7 16.9 21.0 17.2 26.7 22.0 19.4 22.6
944 Nightbreed 25.9 24.8 26.3 26.8 22.6 25.7 31.8 24.2 24.0 22.0
945 Lord of the Rings: Fellowship of the Ring, The 25.0 20.8 20.1 18.7 20.4 20.0 21.0 23.0 23.4 17.3

946 rows × 11 columns


In [55]:
import cPickle as pickle
with open('topic_overlap.pkl', 'w') as fp:
    pickle.dump(df, fp)

In [25]:
from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, random_state=0)
tsne_op = tsne_model.fit_transform(distributions_for_movie)
plt.figure(figsize=(10,10))
plt.scatter(tsne_op[:,0], tsne_op[:,1])
plt.show()



In [ ]:
data = []
names = emotion_dict.keys()
for i in range(len(emotion_dict)):
    data.append([names[i]] + [k for k in X[i]])