notebook.community

Edit and run



In [1]:

    
import json
import json_helper as jhlp
import matplotlib.pyplot as plt
import os
%matplotlib inline
import random as rnd
import numpy as np
from rake_topic_extraction import Rake
topic_extractor = Rake('SmartStoplist.txt')



In [2]:

    
plt.style.use('ggplot')



In [3]:

    
scripts_path = '/home/vishi/imsdb2'
txt_names = [scripts_path + '/' + name for name in os.listdir(scripts_path)]
script_texts = []
movie_names = []
for txt in txt_names:
    with open(txt) as f:
        lines = f.readlines()
        data = []
        for i,l in enumerate(lines):
            temp = l.replace('<b>','').replace('</b>','').replace('\n','')
            if temp != '':
                data.append(temp)
        script_texts.append(data)
        name = txt.split('/')[-1].split('.')[0]
        movie_names.append(name)



In [4]:

    
movie_text_all = []
movie_text_by_scene = []

num_bins = 100
for script_lines in script_texts:
    movie_text_all.append(' '.join([l.strip() for l in script_lines]))
    txt_by_sc = []
    for i in range(num_bins):
        bin_size = len(script_lines)/num_bins
        txt_by_sc.append(' '.join([l.strip() for l in script_lines[i*bin_size:(i+1)*bin_size]]))
    movie_text_by_scene.append(txt_by_sc)



In [5]:

    
print len(movie_text_all)
print len(movie_text_by_scene)
print len(movie_names)



In [6]:

    
#print movie_text_all[0]
print movie_text_by_scene[0][0]









    



STIR OF ECHOES Written by David Koepp Based on the novel  Richard Matheson March 26, 1998   Sometimes within the brain's old ghostly house, I hear, far off, at some forgotten door, A music and an eerie faint carouse And stir of echoes down the creaking floor. "Chambers of Imagery" Archibald MacLeish   In the black, a child HUMS. Gentle WATER sounds. FADE IN: INT A BATHROOM NIGHT JAKE, a four year old boy, sits in a bathtub. The door to thi bathroom is open and his mother is visible in the background, walking back and forth in the bedroom, getting dressed to go out. The STEREO is blaring in the bedroom, the music echoes off the tile in the tiny bathroom. Jake is playing with a plastic airplane. He answers a question. JAKE YES (PAUSE) Sometimes. (PAUSE) With my toys. He looks up, but we're close in on him and can't see who he's looking at. JAKE (cont'd) My... blue sword. No! The one with the gray tape around the middle. That one. He lands the airplane on the water. He giggles. JAKE (coast' d) That's silly. MAGGIE (o.s.) (from the bedroom) Jake? You all right in there? JAKE (calling out to the door) YES!



In [7]:

    
def get_scene_weight(movie_keywords, movie_id, scene_id):
    keyword_extractor = Rake('SmartStoplist.txt')
    scene_keywords = keyword_extractor.run(movie_text_by_scene[movie_id][scene_id])
    topic_overlap = len([1 for w in movie_keywords if w in scene_keywords])
    return topic_overlap



In [8]:

    
topic_overlaps_for_movie = []

width = 0.8
for i in range(len(movie_names)):
    movie_keywords = topic_extractor.run(movie_text_all[i])
    weights = [get_scene_weight(movie_keywords, i, j) for j in range(num_bins)]
    topic_overlaps_for_movie.append(weights)



In [14]:

    
#plt.figure(figsize=(10,5))
mname = 'Pirates of the Caribbean: Dead Man\'s Chest'
bplot = plt.bar(range(num_bins), topic_overlaps_for_movie[movie_names.index(mname)])
plt.xlabel('scene index')
plt.ylabel('topic ovelap')
plt.title(mname)
plt.show()



In [21]:

    
stds = [np.std(wt) for wt in topic_overlaps_for_movie]
plt.plot(range(len(stds)), stds)
plt.title('deviation for topic overlaps')
plt.show()

plt.plot(range(len(stds)), sorted(stds))
plt.title('sorted deviations')
plt.show()



In [15]:

    
distributions_for_movie = []
for overlap in topic_overlaps_for_movie:
    dist = []
    for i in range(num_bins/10):
        dist.append(np.mean(overlap[i*10:(i+1)*10]))
    distributions_for_movie.append(dist)



In [42]:

    
plt.style.use('ggplot')

plt.plot(range(10), distributions_for_movie[50]); plt.show()
plt.plot(range(10), distributions_for_movie[100]); plt.show()
plt.plot(range(10), distributions_for_movie[150]); plt.show()
plt.plot(range(10), distributions_for_movie[250]); plt.show()
plt.plot(range(10), distributions_for_movie[500]); plt.show()



In [18]:

    
plt.plot(range(10), np.mean(distributions_for_movie, 0)); plt.show()
plt.plot(range(10), np.var(distributions_for_movie, 0)); plt.show()



In [54]:

    
import pandas as pd
dat = [[movie_names[i]]+distributions_for_movie[i] for i in range(len(movie_names))]
df = pd.DataFrame(dat)
df









    Out[54]:






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      10
    
  
  
    
      0
      Stir of Echoes
      21.3
      22.2
      21.5
      24.1
      21.9
      22.7
      23.8
      18.9
      23.2
      22.7
    
    
      1
      Rachel Getting Married
      20.4
      21.1
      20.4
      22.6
      19.3
      19.3
      19.7
      22.3
      27.8
      20.6
    
    
      2
      Brick
      16.7
      17.6
      18.7
      16.2
      19.7
      18.4
      17.3
      13.0
      12.7
      15.9
    
    
      3
      Avatar
      32.1
      27.3
      24.2
      24.1
      26.7
      23.9
      22.4
      24.3
      24.0
      22.5
    
    
      4
      Leviathan
      21.0
      18.5
      18.0
      15.3
      15.9
      13.0
      18.6
      13.1
      13.6
      18.5
    
    
      5
      Man Trouble
      24.7
      27.4
      28.8
      23.6
      24.9
      28.9
      25.2
      25.5
      26.2
      27.7
    
    
      6
      Grand Theft Parsons
      22.8
      18.8
      17.1
      18.1
      14.8
      19.6
      17.6
      18.2
      18.2
      19.0
    
    
      7
      Inventing the Abbotts
      26.6
      19.1
      20.0
      18.0
      20.5
      17.6
      13.0
      18.5
      15.7
      19.4
    
    
      8
      Donnie Brasco
      28.6
      23.4
      21.9
      21.5
      23.9
      21.1
      21.6
      18.9
      24.0
      25.1
    
    
      9
      Get Low
      20.3
      13.9
      12.1
      15.6
      14.6
      12.1
      12.7
      12.5
      15.2
      16.3
    
    
      10
      Life As A House
      24.3
      27.8
      20.9
      21.3
      22.0
      19.2
      18.5
      20.2
      20.3
      21.4
    
    
      11
      Taxi Driver
      29.1
      23.1
      18.2
      23.7
      22.1
      32.3
      22.0
      17.6
      20.7
      24.4
    
    
      12
      Girl with the Dragon Tattoo, The
      27.1
      32.0
      28.3
      24.6
      31.8
      25.0
      29.7
      27.8
      29.6
      30.7
    
    
      13
      Meet John Doe
      24.4
      23.2
      25.3
      23.6
      28.8
      29.0
      25.8
      25.3
      29.9
      30.7
    
    
      14
      Miller's Crossing
      22.9
      17.5
      21.8
      22.4
      19.2
      18.8
      21.1
      24.5
      19.2
      18.4
    
    
      15
      Saving Private Ryan
      20.3
      13.8
      18.3
      21.5
      19.2
      17.6
      16.0
      15.3
      12.4
      16.3
    
    
      16
      Gran Torino
      24.7
      18.6
      21.6
      21.7
      17.1
      18.2
      14.2
      17.0
      17.8
      18.6
    
    
      17
      Airplane 2: The Sequel
      21.6
      19.1
      20.3
      15.9
      14.6
      16.8
      16.4
      19.9
      15.5
      14.1
    
    
      18
      Battle: Los Angeles
      21.2
      19.6
      15.9
      14.8
      16.6
      13.8
      16.9
      14.5
      12.5
      16.4
    
    
      19
      Taking Lives
      23.2
      20.9
      20.4
      19.1
      21.6
      20.0
      24.0
      21.1
      21.2
      22.9
    
    
      20
      They
      36.9
      26.3
      25.4
      28.6
      28.3
      26.3
      29.5
      27.0
      25.7
      30.8
    
    
      21
      Rock, The
      25.8
      25.3
      18.8
      22.9
      15.7
      20.2
      16.1
      15.1
      16.9
      17.0
    
    
      22
      Amadeus
      35.2
      21.7
      25.4
      25.7
      22.2
      25.8
      24.5
      26.4
      29.8
      24.6
    
    
      23
      Die Hard
      25.5
      20.3
      26.8
      22.9
      19.5
      18.9
      20.1
      23.1
      17.7
      24.7
    
    
      24
      Eastern Promises
      20.9
      17.0
      20.8
      17.3
      15.6
      15.5
      19.1
      19.7
      18.5
      17.1
    
    
      25
      Wrestler, The
      24.8
      22.4
      24.9
      18.5
      19.7
      19.2
      18.3
      23.5
      20.2
      20.9
    
    
      26
      Soldier
      30.3
      18.6
      22.4
      21.0
      21.4
      18.0
      20.0
      18.9
      21.3
      20.8
    
    
      27
      Perfect Creature
      18.6
      18.3
      14.6
      15.3
      14.6
      16.8
      19.0
      15.5
      16.8
      16.4
    
    
      28
      Bottle Rocket
      16.6
      16.1
      15.5
      14.0
      15.3
      15.9
      15.6
      14.0
      15.4
      12.5
    
    
      29
      Shame
      18.5
      10.3
      11.7
      12.8
      10.8
      15.7
      10.3
      17.9
      11.8
      21.4
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      916
      Dumb and Dumber
      25.4
      21.8
      19.4
      24.6
      26.2
      22.2
      26.6
      22.0
      21.6
      20.9
    
    
      917
      Alien Nation
      21.5
      18.4
      19.0
      16.4
      17.6
      22.1
      15.7
      16.6
      14.8
      21.8
    
    
      918
      Bad Santa
      19.0
      18.4
      13.2
      12.2
      12.9
      14.3
      11.1
      14.9
      18.4
      20.7
    
    
      919
      Zerophilia
      22.2
      19.9
      16.7
      17.0
      13.3
      15.4
      15.0
      16.9
      16.0
      13.3
    
    
      920
      Gremlins
      16.9
      17.7
      16.6
      16.7
      14.1
      12.6
      11.7
      16.5
      14.8
      14.1
    
    
      921
      Time Machine, The
      20.2
      18.4
      21.7
      25.5
      25.1
      20.4
      18.3
      21.3
      18.7
      17.4
    
    
      922
      Lost in Space
      21.2
      19.8
      13.1
      13.3
      15.8
      16.0
      17.9
      19.3
      15.5
      16.0
    
    
      923
      Supergirl
      18.3
      23.7
      21.3
      18.6
      20.7
      20.5
      19.4
      27.1
      27.2
      18.0
    
    
      924
      Nurse Betty
      29.7
      28.7
      25.5
      23.9
      28.2
      24.5
      23.0
      24.2
      24.3
      26.4
    
    
      925
      Miracle Worker, The
      5.6
      6.3
      8.8
      8.2
      10.0
      4.7
      4.8
      6.7
      4.6
      5.0
    
    
      926
      Apocalypse Now
      24.4
      19.1
      11.0
      14.4
      13.7
      15.6
      17.7
      21.3
      20.4
      19.2
    
    
      927
      Bonfire of the Vanities
      24.5
      22.8
      20.1
      20.7
      18.5
      20.6
      22.2
      23.4
      25.0
      23.7
    
    
      928
      Robin Hood: Prince of Thieves
      28.0
      25.8
      21.9
      27.2
      26.7
      27.8
      27.8
      23.9
      26.1
      28.7
    
    
      929
      RocknRolla
      16.5
      20.5
      17.6
      18.1
      18.1
      18.5
      18.7
      21.4
      19.5
      19.9
    
    
      930
      Disturbia
      28.2
      30.0
      20.5
      16.1
      18.4
      19.8
      17.6
      15.8
      14.7
      18.6
    
    
      931
      King Kong
      27.4
      22.0
      21.7
      19.1
      17.7
      22.7
      23.1
      23.0
      20.6
      22.0
    
    
      932
      Titanic
      37.7
      35.4
      28.3
      33.2
      29.5
      24.9
      22.7
      23.9
      27.8
      28.7
    
    
      933
      Assassins
      17.4
      21.8
      17.8
      20.8
      15.7
      15.8
      13.8
      14.2
      16.0
      13.8
    
    
      934
      American Werewolf in London
      15.4
      13.3
      12.3
      18.4
      20.0
      13.8
      16.9
      17.4
      13.8
      16.5
    
    
      935
      Losers, The
      19.6
      19.0
      17.1
      20.9
      18.7
      18.4
      14.8
      17.7
      16.7
      16.5
    
    
      936
      Cellular
      23.4
      17.2
      15.0
      20.5
      14.0
      16.1
      20.0
      17.4
      17.8
      20.8
    
    
      937
      Gang Related
      22.3
      18.8
      14.8
      19.0
      16.1
      17.7
      16.1
      16.1
      13.4
      19.0
    
    
      938
      Program, The
      25.0
      21.6
      23.6
      24.7
      20.5
      21.7
      21.2
      21.7
      20.1
      25.5
    
    
      939
      Three Men and a Baby
      19.1
      19.1
      20.2
      21.4
      23.6
      23.6
      19.6
      19.5
      24.0
      23.3
    
    
      940
      Mini's First Time
      29.9
      25.2
      22.6
      26.3
      23.0
      21.8
      22.4
      24.2
      22.7
      25.4
    
    
      941
      White Christmas
      20.9
      13.5
      18.4
      15.7
      16.7
      19.0
      16.6
      18.0
      13.5
      19.5
    
    
      942
      Six Degrees of Separation
      18.5
      17.9
      32.7
      18.0
      15.3
      21.0
      21.8
      19.3
      16.1
      14.0
    
    
      943
      Starman
      25.9
      28.9
      21.7
      16.9
      21.0
      17.2
      26.7
      22.0
      19.4
      22.6
    
    
      944
      Nightbreed
      25.9
      24.8
      26.3
      26.8
      22.6
      25.7
      31.8
      24.2
      24.0
      22.0
    
    
      945
      Lord of the Rings: Fellowship of the Ring, The
      25.0
      20.8
      20.1
      18.7
      20.4
      20.0
      21.0
      23.0
      23.4
      17.3
    
  

946 rows × 11 columns



In [55]:

    
import cPickle as pickle
with open('topic_overlap.pkl', 'w') as fp:
    pickle.dump(df, fp)



In [25]:

    
from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, random_state=0)
tsne_op = tsne_model.fit_transform(distributions_for_movie)
plt.figure(figsize=(10,10))
plt.scatter(tsne_op[:,0], tsne_op[:,1])
plt.show()



In [ ]:

    
data = []
names = emotion_dict.keys()
for i in range(len(emotion_dict)):
    data.append([names[i]] + [k for k in X[i]])

	0	1	2	3	4	5	6	7	8	9	10
0	Stir of Echoes	21.3	22.2	21.5	24.1	21.9	22.7	23.8	18.9	23.2	22.7
1	Rachel Getting Married	20.4	21.1	20.4	22.6	19.3	19.3	19.7	22.3	27.8	20.6
2	Brick	16.7	17.6	18.7	16.2	19.7	18.4	17.3	13.0	12.7	15.9
3	Avatar	32.1	27.3	24.2	24.1	26.7	23.9	22.4	24.3	24.0	22.5
4	Leviathan	21.0	18.5	18.0	15.3	15.9	13.0	18.6	13.1	13.6	18.5
5	Man Trouble	24.7	27.4	28.8	23.6	24.9	28.9	25.2	25.5	26.2	27.7
6	Grand Theft Parsons	22.8	18.8	17.1	18.1	14.8	19.6	17.6	18.2	18.2	19.0
7	Inventing the Abbotts	26.6	19.1	20.0	18.0	20.5	17.6	13.0	18.5	15.7	19.4
8	Donnie Brasco	28.6	23.4	21.9	21.5	23.9	21.1	21.6	18.9	24.0	25.1
9	Get Low	20.3	13.9	12.1	15.6	14.6	12.1	12.7	12.5	15.2	16.3
10	Life As A House	24.3	27.8	20.9	21.3	22.0	19.2	18.5	20.2	20.3	21.4
11	Taxi Driver	29.1	23.1	18.2	23.7	22.1	32.3	22.0	17.6	20.7	24.4
12	Girl with the Dragon Tattoo, The	27.1	32.0	28.3	24.6	31.8	25.0	29.7	27.8	29.6	30.7
13	Meet John Doe	24.4	23.2	25.3	23.6	28.8	29.0	25.8	25.3	29.9	30.7
14	Miller's Crossing	22.9	17.5	21.8	22.4	19.2	18.8	21.1	24.5	19.2	18.4
15	Saving Private Ryan	20.3	13.8	18.3	21.5	19.2	17.6	16.0	15.3	12.4	16.3
16	Gran Torino	24.7	18.6	21.6	21.7	17.1	18.2	14.2	17.0	17.8	18.6
17	Airplane 2: The Sequel	21.6	19.1	20.3	15.9	14.6	16.8	16.4	19.9	15.5	14.1
18	Battle: Los Angeles	21.2	19.6	15.9	14.8	16.6	13.8	16.9	14.5	12.5	16.4
19	Taking Lives	23.2	20.9	20.4	19.1	21.6	20.0	24.0	21.1	21.2	22.9
20	They	36.9	26.3	25.4	28.6	28.3	26.3	29.5	27.0	25.7	30.8
21	Rock, The	25.8	25.3	18.8	22.9	15.7	20.2	16.1	15.1	16.9	17.0
22	Amadeus	35.2	21.7	25.4	25.7	22.2	25.8	24.5	26.4	29.8	24.6
23	Die Hard	25.5	20.3	26.8	22.9	19.5	18.9	20.1	23.1	17.7	24.7
24	Eastern Promises	20.9	17.0	20.8	17.3	15.6	15.5	19.1	19.7	18.5	17.1
25	Wrestler, The	24.8	22.4	24.9	18.5	19.7	19.2	18.3	23.5	20.2	20.9
26	Soldier	30.3	18.6	22.4	21.0	21.4	18.0	20.0	18.9	21.3	20.8
27	Perfect Creature	18.6	18.3	14.6	15.3	14.6	16.8	19.0	15.5	16.8	16.4
28	Bottle Rocket	16.6	16.1	15.5	14.0	15.3	15.9	15.6	14.0	15.4	12.5
29	Shame	18.5	10.3	11.7	12.8	10.8	15.7	10.3	17.9	11.8	21.4
...	...	...	...	...	...	...	...	...	...	...	...
916	Dumb and Dumber	25.4	21.8	19.4	24.6	26.2	22.2	26.6	22.0	21.6	20.9
917	Alien Nation	21.5	18.4	19.0	16.4	17.6	22.1	15.7	16.6	14.8	21.8
918	Bad Santa	19.0	18.4	13.2	12.2	12.9	14.3	11.1	14.9	18.4	20.7
919	Zerophilia	22.2	19.9	16.7	17.0	13.3	15.4	15.0	16.9	16.0	13.3
920	Gremlins	16.9	17.7	16.6	16.7	14.1	12.6	11.7	16.5	14.8	14.1
921	Time Machine, The	20.2	18.4	21.7	25.5	25.1	20.4	18.3	21.3	18.7	17.4
922	Lost in Space	21.2	19.8	13.1	13.3	15.8	16.0	17.9	19.3	15.5	16.0
923	Supergirl	18.3	23.7	21.3	18.6	20.7	20.5	19.4	27.1	27.2	18.0
924	Nurse Betty	29.7	28.7	25.5	23.9	28.2	24.5	23.0	24.2	24.3	26.4
925	Miracle Worker, The	5.6	6.3	8.8	8.2	10.0	4.7	4.8	6.7	4.6	5.0
926	Apocalypse Now	24.4	19.1	11.0	14.4	13.7	15.6	17.7	21.3	20.4	19.2
927	Bonfire of the Vanities	24.5	22.8	20.1	20.7	18.5	20.6	22.2	23.4	25.0	23.7
928	Robin Hood: Prince of Thieves	28.0	25.8	21.9	27.2	26.7	27.8	27.8	23.9	26.1	28.7
929	RocknRolla	16.5	20.5	17.6	18.1	18.1	18.5	18.7	21.4	19.5	19.9
930	Disturbia	28.2	30.0	20.5	16.1	18.4	19.8	17.6	15.8	14.7	18.6
931	King Kong	27.4	22.0	21.7	19.1	17.7	22.7	23.1	23.0	20.6	22.0
932	Titanic	37.7	35.4	28.3	33.2	29.5	24.9	22.7	23.9	27.8	28.7
933	Assassins	17.4	21.8	17.8	20.8	15.7	15.8	13.8	14.2	16.0	13.8
934	American Werewolf in London	15.4	13.3	12.3	18.4	20.0	13.8	16.9	17.4	13.8	16.5
935	Losers, The	19.6	19.0	17.1	20.9	18.7	18.4	14.8	17.7	16.7	16.5
936	Cellular	23.4	17.2	15.0	20.5	14.0	16.1	20.0	17.4	17.8	20.8
937	Gang Related	22.3	18.8	14.8	19.0	16.1	17.7	16.1	16.1	13.4	19.0
938	Program, The	25.0	21.6	23.6	24.7	20.5	21.7	21.2	21.7	20.1	25.5
939	Three Men and a Baby	19.1	19.1	20.2	21.4	23.6	23.6	19.6	19.5	24.0	23.3
940	Mini's First Time	29.9	25.2	22.6	26.3	23.0	21.8	22.4	24.2	22.7	25.4
941	White Christmas	20.9	13.5	18.4	15.7	16.7	19.0	16.6	18.0	13.5	19.5
942	Six Degrees of Separation	18.5	17.9	32.7	18.0	15.3	21.0	21.8	19.3	16.1	14.0
943	Starman	25.9	28.9	21.7	16.9	21.0	17.2	26.7	22.0	19.4	22.6
944	Nightbreed	25.9	24.8	26.3	26.8	22.6	25.7	31.8	24.2	24.0	22.0
945	Lord of the Rings: Fellowship of the Ring, The	25.0	20.8	20.1	18.7	20.4	20.0	21.0	23.0	23.4	17.3