In [ ]:
import json
import json_helper as jhlp
import os
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn.cluster import KMeans
plt.style.use('ggplot')
In [ ]:
json_path = "path to json folder"
movie_jsons = [os.path.join(json_path, json_name) for json_name in os.listdir(json_path)]
script_dicts = []
for movie_json in movie_jsons:
    with open(movie_json) as f:
        script_dicts.append(json.load(f))
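In [ ]:
#json_helper is not included in this notebook; the cell below is a minimal, hypothetical sketch of the two
#helpers the later cells rely on, assuming each script JSON maps scene keys to dicts carrying a description
#string under 'scene_description' and a 'char_dialogues' list whose entries hold the speaker name at index 1
#(both field names are assumptions, not the actual json_helper implementation)
def _sketch_get_description_for_scene(scene):
    #return whatever description text the scene carries
    return scene.get('scene_description', '')

def _sketch_get_all_char_from_script(script_dict):
    #collect every speaker name that appears in any dialogue of the script
    chars = set()
    for scene in script_dict.values():
        for d in scene.get('char_dialogues', []):
            chars.add(d[1])
    return sorted(chars)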
In [ ]:
#find all characters present in a scene, from the scene description and from the dialogues
def get_chars_for_scene(scene, all_char_list):
    chars_from_desc = set()
    chars_from_dialogue = set()
    desc = jhlp.get_description_for_scene(scene)
    for c in all_char_list:
        if c in desc:
            chars_from_desc.add(c)
    for d in scene['char_dialogues']:
        if d[1] in all_char_list:
            chars_from_dialogue.add(d[1])
    return list(chars_from_desc), list(chars_from_dialogue)
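In [ ]:
#quick sanity check of the helper on a single scene; the movie and scene picked here are arbitrary
example_script = script_dicts[0]
example_scene = example_script[list(example_script.keys())[0]]
desc_chars, dialogue_chars = get_chars_for_scene(example_scene, jhlp.get_all_char_from_script(example_script))
print("mentioned in description:", desc_chars)
print("speaking in dialogues:", dialogue_chars)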
In [ ]:
#graph formation, method I
#an interaction is registered whenever two characters both speak in the same scene
#a better approach is used later, but this simple version already gives good results
from itertools import combinations
graphs_I = []
for script_dict in script_dicts:
    all_char_list = jhlp.get_all_char_from_script(script_dict)
    graph = nx.Graph()
    for key in script_dict:
        scene = script_dict[key]
        desc_chars, dialogue_chars = get_chars_for_scene(scene, all_char_list)
        #count each co-occurring pair of speakers once per scene
        for char1, char2 in combinations(dialogue_chars, 2):
            if not graph.has_edge(char1, char2):
                graph.add_edge(char1, char2, weight=1)
            else:
                graph[char1][char2]['weight'] += 1
    graphs_I.append(graph)
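In [ ]:
#sanity check on one network: the heaviest edges should correspond to the movie's central relationships
#(the index 0 below is arbitrary; pick any movie)
g = graphs_I[0]
top_edges = sorted(g.edges(data=True), key=lambda e: e[2]['weight'], reverse=True)[:10]
for u, v, data in top_edges:
    print(u, '--', v, 'shared scenes:', data['weight'])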
In [ ]:
def get_graph_metrics(graph):
    try:
        num_chars = len(graph.nodes())                 #1 number of characters
        num_relations = len(graph.edges())             #2 number of relationships
        avg_clustering = nx.average_clustering(graph)  #3 average clustering coefficient
        per_node_centrality = np.array(sorted(nx.degree_centrality(graph).values(), reverse=True))
        centralization = sum(per_node_centrality[0] - per_node_centrality) / ((num_chars - 1) * (num_chars - 2))  #4 degree centralization
        sing_char_centrality = (per_node_centrality[0] - per_node_centrality[1]) / sum(per_node_centrality)  #5 dominance of the most central character
        edge_weights = sorted([float(graph[u][v]['weight']) / 5 for u, v in graph.edges()], reverse=True)
        single_relationship_centrality = (edge_weights[0] - edge_weights[1]) / sum(edge_weights)  #6 dominance of the strongest relationship
        degrees = np.array(list(dict(graph.degree(weight='weight')).values())) / sum(edge_weights)
        k = min(10, num_chars)
        top_char_degree_var = np.var(sorted(degrees, reverse=True)[:k])  #7 variance of the top character degrees
        k = min(10, len(edge_weights))
        top_relationship_var = np.var(edge_weights[:k])  #8 variance of the top relationship weights
    except Exception:
        return None
    metrics = np.array([num_chars, num_relations, avg_clustering, centralization, sing_char_centrality,
                        single_relationship_centrality, top_char_degree_var, top_relationship_var])
    if np.isinf(metrics).any() or np.isnan(metrics).any():
        return None
    return metrics
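In [ ]:
#the eight features, in the order the function returns them; printing them for one (arbitrary) movie
#makes the numbers easier to eyeball; feature #4 follows the usual degree-centralization idea of how
#much the most central character dominates the rest, normalised by graph size
feature_names = ['num_chars', 'num_relations', 'avg_clustering', 'centralization',
                 'sing_char_centrality', 'single_relationship_centrality',
                 'top_char_degree_var', 'top_relationship_var']
example_metrics = get_graph_metrics(graphs_I[0])
if example_metrics is not None:
    for name, value in zip(feature_names, example_metrics):
        print(name, '=', value)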
In [ ]:
#keep only movies for which all metrics could be computed, so indices stay aligned with the movie list
metric_list = [get_graph_metrics(graph) for graph in graphs_I]
valid_movie_jsons = [m for m, g in zip(movie_jsons, metric_list) if g is not None]
graph_metrics_I = np.array([g for g in metric_list if g is not None])
In [ ]:
print(graph_metrics_I.shape)
import pickle
with open('char_net_features_using_json_I.pkl', 'wb') as fp:
    pickle.dump(graph_metrics_I, fp)
In [ ]:
from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, random_state=0)
tsne_op = tsne_model.fit_transform(graph_metrics_I)
plt.figure(figsize=(10,10))
plt.scatter(tsne_op[:,0], tsne_op[:,1], alpha=0.5)
plt.show()
In [ ]:
#clustering in tsne reduced dimensions
kmeans = KMeans(n_clusters=8, random_state=0)
tsne_clusters = kmeans.fit_predict(tsne_op)
c = ['r', 'g', 'b', 'y', 'k', 'c', 'm', 'w']
colors = [c[t] for t in tsne_clusters]
plt.figure(figsize=(10,10))
plt.scatter(tsne_op[:,0], tsne_op[:,1], c=colors, alpha=0.5)
plt.show()
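In [ ]:
#k=8 above is an arbitrary choice; a quick silhouette sweep (a rough sketch, not a tuned model-selection
#step) gives a sense of whether another k separates the t-SNE embedding better
from sklearn.metrics import silhouette_score
for k in range(2, 13):
    labels = KMeans(n_clusters=k, random_state=0).fit_predict(tsne_op)
    print(k, silhouette_score(tsne_op, labels))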
In [ ]:
#find the movies that fall in the red cluster (cluster 0)
#validate this manually: would you expect these movies to fall in the same cluster?
red_movies = []
for i, t in enumerate(tsne_clusters):
    if t == 0:
        red_movies.append(valid_movie_jsons[i])
for r in red_movies:
    print(r)
In [ ]:
#clustering in the original (un-reduced) feature space
#compare this with the clustering in the t-SNE-reduced space above:
#the resulting clusters are nearly the same
kmeans = KMeans(n_clusters=8, random_state=0)
actual_clusters = kmeans.fit_predict(graph_metrics_I)
tsne_model = TSNE(n_components=2, random_state=0)
tsne_op = tsne_model.fit_transform(graph_metrics_I)
c = ['r', 'g', 'b', 'y', 'k', 'c', 'm', 'w']
colors = [c[t] for t in actual_clusters]
plt.figure(figsize=(15,15))
plt.scatter(tsne_op[:,0], tsne_op[:,1], c=colors, alpha=0.5)
plt.show()
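In [ ]:
#the "nearly the same" observation can be quantified with the adjusted Rand index: 1.0 means identical
#partitions, values near 0 mean chance-level agreement (only the groupings matter, not the label ids)
from sklearn.metrics import adjusted_rand_score
print(adjusted_rand_score(tsne_clusters, actual_clusters))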