In [1]:
import json
import json_helper as jhlp
import os
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn.cluster import KMeans

In [2]:
plt.style.use('ggplot')

In [3]:
json_path = os.getcwd() + '/database_characterinteractions'
movie_jsons = [os.getcwd() + '/database_characterinteractions/' + json_name for json_name in os.listdir(json_path)]
script_dicts_with_name = {}
for movie_json in movie_jsons:
    with open(movie_json) as f:
        script_dicts_with_name[movie_json.split('/')[-1].split('.')[0]] = json.loads(f.read())

In [4]:
len(movie_jsons)


Out[4]:
953

In [5]:
def get_chars_for_scene2(script_dict, scene_id):
    return script_dict[str(scene_id)]

def get_all_char_from_script2(script_dict):
    chars = set()
    for scene in script_dict:
        for char in script_dict[scene]:
            chars.add(char)
    return list(chars)

In [6]:
chars = get_all_char_from_script2(script_dicts_with_name['A Few Good Men'])
print chars
print "\n"
print get_chars_for_scene2(script_dicts_with_name['A Few Good Men'], 10)


[u'LUTHER', u'MCGUIRE', u'SAM', u'HOWARD', u'KAFFEE', u'WEST', u'SPRADLING', u'STONE', u'SERGEANT AT ARMS', u'HAMMAKER', u'JO', u'DOWNEY', u'MARKINSON', u'GIBBS', u'KENDRICK', u'JESSEP', u'DAWSON', u'WHITAKER', u'RANDOLPH', u'ROSS']


[u'KAFFEE', u'SPRADLING', u'KAFFEE', u'SPRADLING', u'KAFFEE', u'SPRADLING', u'KAFFEE']

In [7]:
len(script_dicts_with_name)


Out[7]:
952

In [8]:
#graph formation method I
#register a character interaction
#if two characters co-occur in a scen
graphs_I = []
graph_names_I = []
for name in script_dicts_with_name:
    script_dict = script_dicts_with_name[name]
    all_char_list = get_all_char_from_script2(script_dict)
    graph = nx.Graph()
    for scene_id in script_dict:
        scene_chars = get_chars_for_scene2(script_dict, scene_id)
        for i in range(len(scene_chars)-1):
            for j in range(i+1,len(scene_chars)):
                char1 = scene_chars[i]
                char2 = scene_chars[j]
                if char1 == char2:
                    continue
                if not graph.has_edge(char1, char2):
                    graph.add_edge(char1, char2, weight=1)
                else:
                    graph[char1][char2]['weight'] += 1
    graphs_I.append(graph)
    graph_names_I.append(name)

In [9]:
#for graph formation method II
def get_char_interactions_for_scene_new(script_dict, scene_id):
    interactions = []
    chars_in_scene = get_chars_for_scene2(script_dict, scene_id)
    if len(chars_in_scene) < 2: #if a scene has less than two characters
        return interactions
    for i in range(len(chars_in_scene)-1):
        for j in range(i+1, len(chars_in_scene)):
            if j-i > 3:
                continue
            m = j-i
            weight = 4-m
            char1 = chars_in_scene[i]
            char2 = chars_in_scene[j]
            if char1 == char2:
                continue
            interactions.append((char1, char2, weight))
    return interactions

In [10]:
#graph formation method II
#register a character interaction if two characters
#speak within a certain distance of each other
graphs_II = []
graph_names_II = []
for name in script_dicts_with_name:
    script_dict = script_dicts_with_name[name]
    graph = nx.Graph()
    for scene_id in script_dict:
        char_interactions = get_char_interactions_for_scene_new(script_dict, scene_id)
        for interaction in char_interactions:
            char1 = interaction[0]
            char2 = interaction[1]
            wt = interaction[2]
            if not graph.has_edge(char1, char2):
                    graph.add_edge(char1, char2, weight=wt)
            else:
                graph[char1][char2]['weight'] += wt

    graphs_II.append(graph)
    graph_names_II.append(name)

In [11]:
print len(graphs_I)
print len(graph_names_I)
print len(graphs_II)
print len(graph_names_II)


952
952
952
952

In [12]:
def get_graph_metrics(graph):
    try:
        num_chars = len(graph.nodes()) #1
        num_relations = len(graph.edges()) #2
        avg_clustering = nx.average_clustering(graph) #3
        per_node_centrality = sorted(np.array(nx.degree_centrality(graph).values()), reverse=True)
        centralization = sum(per_node_centrality[0] - per_node_centrality)/((num_chars-1)*(num_chars-2)) #4
        sing_char_centrality = (per_node_centrality[0] - per_node_centrality[1])/sum(per_node_centrality) #5
        edge_weights = sorted([float(graph[u][v]['weight'])/5 for u,v in graph.edges()], reverse=True)
        single_relationship_centrality = (edge_weights[0]-edge_weights[1])/sum(edge_weights) #6
        degrees = graph.degree(graph.nodes(), weight='weight').values()
        degrees = np.array(degrees)/sum(edge_weights)
        k = min(10, num_chars)
        top_char_degree_var = np.var(sorted(degrees, reverse=True)[:k]) #7
        k = min(10, len(edge_weights))
        top_relationship_var = np.var(edge_weights[:k]) #8
    except Exception as e:
        return None
    metrics = np.array([num_chars, num_relations, avg_clustering, centralization, sing_char_centrality, 
                       single_relationship_centrality, top_char_degree_var, top_relationship_var])
    if float('inf') in metrics:
        return None
    else:
        return metrics

In [ ]:
movie_names_I = []
movie_features_I = []

for i,graph in enumerate(graphs_I):
    metrics = get_graph_metrics(graph)
    if metrics != None:
        movie_names_I.append(graph_names_I[i])
        movie_features_I.append(metrics)
        
movie_names_II = []
movie_features_II = []

for i,graph in enumerate(graphs_II):
    metrics = get_graph_metrics(graph)
    if metrics != None:
        movie_names_II.append(graph_names_II[i])
        movie_features_II.append(metrics)

In [14]:
print len(movie_names_I)
print len(movie_features_I)

print len(movie_names_II)
print len(movie_features_II)


925
925
925
925

In [15]:
import pandas as pd
feats = [[movie_names_I[i]] + [f for f in mf] for i,mf in enumerate(movie_features_I)]
df = pd.DataFrame(feats)
df


Out[15]:
0 1 2 3 4 5 6 7 8
0 Funny People 18.0 38.0 0.627066 0.038062 0.013158 0.228325 1.114880 7568.6256
1 Road, The 11.0 28.0 0.832035 0.060000 0.000000 0.236688 1.372372 6195.4324
2 Limitless 19.0 42.0 0.859649 0.046841 0.011905 0.014257 1.287618 709.2756
3 Yes Man 27.0 70.0 0.651552 0.031657 0.107143 0.004606 1.213875 1003.2880
4 English Patient, The 20.0 49.0 0.601429 0.018775 0.020408 0.104373 0.269430 604.4000
5 Gremlins 23.0 76.0 0.644488 0.032566 0.065789 0.002179 0.763040 18.8484
6 Crow, The 20.0 56.0 0.500198 0.028932 0.053571 0.045455 0.374572 19.8116
7 Moon 9.0 19.0 0.544444 0.035714 0.000000 0.194133 1.177243 1997.0080
8 Smokin' Aces 26.0 71.0 0.492703 0.013067 0.014085 0.015005 0.021428 192.3796
9 Human Nature 12.0 34.0 0.699074 0.033058 0.000000 0.191882 1.451974 3996.6304
10 Cradle 2 the Grave 17.0 51.0 0.564180 0.022135 0.019608 0.057832 0.417705 235.8836
11 Snatch 26.0 81.0 0.586736 0.013467 0.000000 0.001347 0.120621 124.5284
12 Sherlock Holmes 17.0 41.0 0.718009 0.040625 0.024390 0.350877 1.655823 808.8096
13 Star Trek II: The Wrath of Khan 21.0 101.0 0.775082 0.023158 0.004950 0.016935 0.646164 1029.2916
14 Ginger Snaps 13.0 38.0 0.705883 0.050505 0.026316 0.363497 1.409181 2102.7216
15 Harold and Kumar Go to White Castle 28.0 64.0 0.782262 0.030178 0.000000 0.420779 1.680751 1563.3876
16 Sideways 16.0 33.0 0.743479 0.055238 0.045455 0.308030 1.973953 16684.0036
17 Big White, The 16.0 34.0 0.676515 0.034286 0.000000 0.086073 0.474138 1278.0276
18 Cirque du Freak: The Vampire's Assistan 15.0 52.0 0.766040 0.041601 0.038462 0.223978 1.121962 3925.1504
19 Eight Legged Freaks 14.0 61.0 0.872982 0.029586 0.000000 0.024307 0.292936 114.1380
20 Citizen Kane 29.0 102.0 0.697335 0.015023 0.000000 0.053893 0.585699 2216.3924
21 Ghost Rider 20.0 65.0 0.759872 0.038473 0.053846 0.006575 0.362362 121.7316
22 Newsies 25.0 123.0 0.757743 0.021060 0.004065 0.114693 0.562832 114.2564
23 Wall-E 14.0 48.0 0.758761 0.028600 0.000000 0.022670 0.602018 152.1284
24 Precious 17.0 39.0 0.726144 0.046094 0.064103 0.133663 1.368561 206.0036
25 Hudson Hawk 17.0 60.0 0.699223 0.035156 0.025000 0.033471 0.824970 344.6340
26 Rush Hour 2 13.0 28.0 0.794872 0.063131 0.017857 0.480677 2.083454 7429.1796
27 Dances with Wolves 18.0 44.0 0.681393 0.027682 0.022727 0.064723 0.323648 163.7716
28 Win Win 17.0 71.0 0.807427 0.033854 0.000000 0.019219 0.714111 654.5776
29 1492: Conquest of Paradise 20.0 48.0 0.729789 0.043706 0.114583 0.082028 1.184544 112.4160
... ... ... ... ... ... ... ... ... ...
895 Metro 17.0 30.0 0.460912 0.033073 0.033333 0.111736 1.456276 2026.3584
896 Hanna 20.0 50.0 0.589084 0.030779 0.010000 0.014151 0.723225 829.3200
897 Twilight: New Moon 25.0 85.0 0.778115 0.030571 0.047059 0.151158 1.333289 10729.8416
898 Land of the Dead 27.0 173.0 0.791709 0.017870 0.005780 0.009871 0.280822 22.2500
899 Drive 13.0 30.0 0.653175 0.044192 0.033333 0.044248 0.668038 95.3604
900 Game, The 19.0 37.0 0.504280 0.048656 0.148649 0.248891 2.001254 9948.7540
901 Tourist, The 11.0 32.0 0.843434 0.038889 0.015625 0.210048 1.204128 1566.9056
902 Larry Crowne 28.0 83.0 0.608488 0.029651 0.072289 0.081390 0.534309 925.7664
903 Planet of the Apes, The 13.0 40.0 0.833794 0.039773 0.012500 0.082212 0.548466 382.8324
904 Beach, The 11.0 20.0 0.736652 0.077778 0.075000 0.157210 1.638903 1007.3124
905 500 Days of Summer 16.0 39.0 0.773214 0.051429 0.076923 0.274842 1.614581 10024.4464
906 Deep Cover 16.0 38.0 0.700832 0.041905 0.039474 0.074451 0.612710 1134.0016
907 American Shaolin: King of Kickboxers II 20.0 76.0 0.810174 0.035088 0.039474 0.024289 0.834218 165.9876
908 Duck Soup 11.0 44.0 0.858009 0.024444 0.011364 0.116906 1.009203 8639.0144
909 Disturbia 10.0 19.0 0.697778 0.080247 0.078947 0.062784 1.797648 1085.4624
910 Boogie Nights 33.0 87.0 0.582368 0.015310 0.011494 0.051095 0.434025 2863.2884
911 Sugar and Spice 25.0 95.0 0.728497 0.021513 0.015789 0.006047 0.508536 142.3296
912 Do The Right Thing 37.0 198.0 0.682062 0.011662 0.002525 0.069682 0.245499 594.7184
913 Starman 19.0 35.0 0.694069 0.021786 0.000000 0.469358 1.349386 7747.3284
914 Good Girl, The 9.0 36.0 1.000000 0.000000 0.000000 0.021994 0.928856 79099.6336
915 Year One 24.0 88.0 0.729840 0.028184 0.017045 0.141398 0.934894 1316.5716
916 Pi 9.0 16.0 0.709524 0.089286 0.093750 0.086321 1.761079 1783.8144
917 Blood Simple 23.0 78.0 0.673792 0.014069 0.012821 0.044146 0.311160 801.5024
918 Mariachi, El 10.0 25.0 0.678730 0.061728 0.040000 0.230825 0.899861 522.0916
919 Other Boleyn Girl, The 30.0 225.0 0.781593 0.015288 0.000000 0.017032 0.305388 343.3296
920 Mimic 15.0 38.0 0.722496 0.040816 0.000000 0.092096 0.935956 338.7984
921 Buried 13.0 25.0 0.766434 0.066919 0.140000 0.188379 1.572420 144699.8544
922 Seventh Seal, The 14.0 75.0 0.903152 0.015779 0.000000 0.001217 0.355084 12541.7504
923 Two For The Money 18.0 38.0 0.696898 0.049740 0.052632 0.194619 1.651640 7729.4564
924 Black Rain 14.0 36.0 0.700031 0.054241 0.055556 0.085296 1.289786 519.6560

925 rows × 9 columns


In [16]:
import cPickle as pickle
with open('char_net_final_I.pkl', 'w') as fp:
    pickle.dump(df, fp)

In [17]:
feats = [[movie_names_II[i]] + [f for f in mf] for i,mf in enumerate(movie_features_II)]
df = pd.DataFrame(feats)
df

with open('char_net_final_II.pkl', 'w') as fp:
    pickle.dump(df, fp)

In [18]:
from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, random_state=0)
tsne_op = tsne_model.fit_transform(movie_features_I)
plt.figure(figsize=(10,10))
plt.scatter(tsne_op[:,0], tsne_op[:,1], alpha=0.5)
plt.show()



In [19]:
tsne_model = TSNE(n_components=2, random_state=0)
tsne_op = tsne_model.fit_transform(movie_features_II)
plt.figure(figsize=(10,10))
plt.scatter(tsne_op[:,0], tsne_op[:,1], alpha=0.5)
plt.show()



In [20]:
#clustering in tsne reduced dimensions
kmeans = KMeans(n_clusters=7, random_state=0)
tsne_clusters = kmeans.fit_predict(tsne_op)
#c = ['r', 'g', 'b', 'y', 'k', 'c', 'm', 'w']
c = ['r', 'g', 'b', 'y', 'k', 'w', 'c']
#c = np.linspace(0,0.6,7)
colors = [c[t] for t in tsne_clusters]
plt.figure(figsize=(10,10))
plt.scatter(tsne_op[:,0], tsne_op[:,1], c=colors, alpha=0.5)
plt.show()



In [21]:
#lists of movies falling in different clusters
movie_clusters = [[] for _ in range(7)]
for i,tc in enumerate(tsne_clusters):
    movie_clusters[tc].append(movie_names_II[i])
    
for cluster in movie_clusters:
    print cluster[:10]


['Limitless', 'Precious', 'Hudson Hawk', 'Blade Runner', 'Panic Room', 'Watchmen', 'Trainspotting', 'Orphan', "One Flew Over the Cuckoo's Nes", 'Bachelor Party, The']
['Star Trek II: The Wrath of Khan', 'Big White, The', 'Citizen Kane', 'Twiligh', 'Erin Brockovich', 'Matrix, The', 'Family Man, The', "Ocean's Eleven", 'Broken Arrow', 'Angels & Demons']
['Funny People', 'Road, The', 'Human Nature', 'Harold and Kumar Go to White Castle', 'Sideways', 'Rush Hour 2', 'Three Kings (Spoils of War)', 'Paul', 'Up in the Air', 'Fabulous Baker Boys, The']
['Gremlins', 'Crow, The', "Smokin' Aces", 'Eight Legged Freaks', 'Wall-E', 'Dances with Wolves', '1492: Conquest of Paradise', 'Hannibal', 'Pacifier, The', 'Princess Bride, The']
['Yes Man', 'English Patient, The', 'Cold Mountain', 'Crazy, Stupid, Love', "All the King's Men", 'Prophecy, The', 'Men in Black', "All the President's Men", 'Austin Powers - The Spy Who Shagged Me', 'Midnight Cowboy']
['Moon', 'Sherlock Holmes', 'Ginger Snaps', "Cirque du Freak: The Vampire's Assistan", 'Win Win', 'An Education', 'Descendants, The', 'Jacket, The', 'Wag the Dog', 'Lake Placid']
['Cradle 2 the Grave', 'Snatch', 'Ghost Rider', 'Newsies', 'Fair Game', 'Total Recall', 'Marty', 'John Q', 'Adventures of Buckaroo Banzai Across the Eighth Dimension, The', 'Island, The']

In [22]:
#find out which cluster the given movie falls in
def get_movie_cluster(name):
    for i,cluster in enumerate(movie_clusters):
        if name in cluster:
            return i

In [23]:
print get_movie_cluster('Terminator')
print get_movie_cluster('Terminator 2: Judgement Day')
print get_movie_cluster('Terminator Salvation')

print get_movie_cluster('Godfather')
print get_movie_cluster('Godfather Part II')
print get_movie_cluster('Godfather Part III, The')

print get_movie_cluster('Kill Bill Volume 1 & 2')
print get_movie_cluster('Reservoir Dogs')


4
1
3
6
5
3
6
1

In [24]:
import cPickle as pickle
with open('movie_data.pkl', 'r') as fp:
    movie_data = pickle.load(fp)

In [25]:
movie_data['ratings'][movie_clusters[0][0]]


Out[25]:
7.9

In [26]:
cluster_score_means = []
cluster_score_deviations = []
cluster_genres = []

for cluster in movie_clusters:
    cluster_scores = [movie_data['ratings'][name] for name in cluster if name in movie_data['ratings']]
    cluster_score_means.append(np.mean(cluster_scores))
    cluster_score_deviations.append(np.std(cluster_scores))
    genres = []
    for name in cluster:
        if name in movie_data['genres']:
            genres += movie_data['genres'][name]
    g = set(genres)
    cluster_genres.append(sorted(g, key=lambda m: genres.count(m)))

In [27]:
for i in range(len(movie_clusters)):
    print [g for g in cluster_genres[i] if g != 'genres'][:5]
    print cluster_score_means[i]
    print cluster_score_deviations[i]


[u'Musical', u'Animation', u'Music', u'Sport', u'War']
6.8902173913
1.1401811243
[u'Film-Noir', u'Talk-Show', u'Animation', u'Western', u'Musical']
7.06022099448
0.904969677341
[u'Film-Noir', u'Musical', u'War', u'Sport', u'Music']
7.04344262295
0.865834923967
[u'Short', u'Musical', u'Music', u'Western', u'Sport']
6.74110429448
1.13411840096
[u'Western', u'Documentary', u'Musical', u'Film-Noir', u'Animation']
7.10275229358
0.88757568129
[u'Animation', u'War', u'Film-Noir', u'Short', u'Documentary']
7.01052631579
0.865223093304
[u'War', u'Western', u'Short', u'Sport', u'Music']
6.99736842105
1.01682862037

In [28]:
lls = [[] for _ in range(5)]
lls[0].append(5)
lls[3].append(66)
print lls


[[5], [], [], [66], []]

In [29]:
print graph_names_I.index('Girl with the Dragon Tattoo, The')
print graph_names_II.index('Girl with the Dragon Tattoo, The')


591
591

In [30]:
graph_new = graphs_II[graph_names_II.index('Lord of the Rings: Fellowship of the Ring, The')]
graph_new.remove_node('BUTTERBUR')
graph_new.remove_node('FADE TO BLACK')
edge_weights = [float(graph_new[u][v]['weight'])/5 for u,v in graph_new.edges()]
pos = nx.circular_layout(graph_new)
plt.figure(figsize=(15,15))
nx.draw_networkx(graph_new, with_labels=True, width=[e/4 for e in edge_weights], alpha=0.5); plt.show()