In [ ]:
import json
import json_helper as jhlp
import os
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn.cluster import KMeans
plt.style.use('ggplot')
In [ ]:
json_path = "path to json folder"
movie_jsons = [os.path.join(json_path, json_name) for json_name in os.listdir(json_path)]
script_dicts = []
for movie_json in movie_jsons:
    with open(movie_json) as f:
        script_dicts.append(json.load(f))
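In [ ]:
#json_helper is not included in this notebook; the cell below is a minimal, hypothetical sketch of the two
#helpers the later cells rely on, assuming each script JSON maps scene keys to dicts carrying a description
#string under 'scene_description' and a 'char_dialogues' list whose entries hold the speaker name at index 1
#(both field names are assumptions, not the actual json_helper implementation)
def _sketch_get_description_for_scene(scene):
    #return whatever description text the scene carries
    return scene.get('scene_description', '')

def _sketch_get_all_char_from_script(script_dict):
    #collect every speaker name that appears in any dialogue of the script
    chars = set()
    for scene in script_dict.values():
        for d in scene.get('char_dialogues', []):
            chars.add(d[1])
    return sorted(chars)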
In [ ]:
#find all characters present in a scene, from the scene description and from the dialogues
def get_chars_for_scene(scene, all_char_list):
    chars_from_desc = set()
    chars_from_dialogue = set()
    desc = jhlp.get_description_for_scene(scene)
    for c in all_char_list:
        if c in desc:
            chars_from_desc.add(c)
    for d in scene['char_dialogues']:
        if d[1] in all_char_list:
            chars_from_dialogue.add(d[1])
    return list(chars_from_desc), list(chars_from_dialogue)
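In [ ]:
#quick sanity check of the helper on a single scene; the movie and scene picked here are arbitrary
example_script = script_dicts[0]
example_scene = example_script[list(example_script.keys())[0]]
desc_chars, dialogue_chars = get_chars_for_scene(example_scene, jhlp.get_all_char_from_script(example_script))
print("mentioned in description:", desc_chars)
print("speaking in dialogues:", dialogue_chars)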
In [ ]:
#graph formation, method I
#an interaction is registered whenever two characters both speak in the same scene
#a better approach is used later, but this simple version already gives good results
from itertools import combinations
graphs_I = []
for script_dict in script_dicts:
    all_char_list = jhlp.get_all_char_from_script(script_dict)
    graph = nx.Graph()
    for key in script_dict:
        scene = script_dict[key]
        desc_chars, dialogue_chars = get_chars_for_scene(scene, all_char_list)
        #count each co-occurring pair of speakers once per scene
        for char1, char2 in combinations(dialogue_chars, 2):
            if not graph.has_edge(char1, char2):
                graph.add_edge(char1, char2, weight=1)
            else:
                graph[char1][char2]['weight'] += 1
    graphs_I.append(graph)
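In [ ]:
#sanity check on one network: the heaviest edges should correspond to the movie's central relationships
#(the index 0 below is arbitrary; pick any movie)
g = graphs_I[0]
top_edges = sorted(g.edges(data=True), key=lambda e: e[2]['weight'], reverse=True)[:10]
for u, v, data in top_edges:
    print(u, '--', v, 'shared scenes:', data['weight'])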
In [ ]:
def get_graph_metrics(graph):
    try:
        num_chars = len(graph.nodes())                 #1 number of characters
        num_relations = len(graph.edges())             #2 number of relationships
        avg_clustering = nx.average_clustering(graph)  #3 average clustering coefficient
        per_node_centrality = np.array(sorted(nx.degree_centrality(graph).values(), reverse=True))
        centralization = sum(per_node_centrality[0] - per_node_centrality) / ((num_chars - 1) * (num_chars - 2))  #4 degree centralization
        sing_char_centrality = (per_node_centrality[0] - per_node_centrality[1]) / sum(per_node_centrality)  #5 dominance of the most central character
        edge_weights = sorted([float(graph[u][v]['weight']) / 5 for u, v in graph.edges()], reverse=True)
        single_relationship_centrality = (edge_weights[0] - edge_weights[1]) / sum(edge_weights)  #6 dominance of the strongest relationship
        degrees = np.array(list(dict(graph.degree(weight='weight')).values())) / sum(edge_weights)
        k = min(10, num_chars)
        top_char_degree_var = np.var(sorted(degrees, reverse=True)[:k])  #7 variance of the top character degrees
        k = min(10, len(edge_weights))
        top_relationship_var = np.var(edge_weights[:k])  #8 variance of the top relationship weights
    except Exception:
        return None
    metrics = np.array([num_chars, num_relations, avg_clustering, centralization, sing_char_centrality,
                        single_relationship_centrality, top_char_degree_var, top_relationship_var])
    if np.isinf(metrics).any() or np.isnan(metrics).any():
        return None
    return metrics
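In [ ]:
#the eight features, in the order the function returns them; printing them for one (arbitrary) movie
#makes the numbers easier to eyeball; feature #4 follows the usual degree-centralization idea of how
#much the most central character dominates the rest, normalised by graph size
feature_names = ['num_chars', 'num_relations', 'avg_clustering', 'centralization',
                 'sing_char_centrality', 'single_relationship_centrality',
                 'top_char_degree_var', 'top_relationship_var']
example_metrics = get_graph_metrics(graphs_I[0])
if example_metrics is not None:
    for name, value in zip(feature_names, example_metrics):
        print(name, '=', value)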
In [ ]:
#keep only movies for which all metrics could be computed, so indices stay aligned with the movie list
metric_list = [get_graph_metrics(graph) for graph in graphs_I]
valid_movie_jsons = [m for m, g in zip(movie_jsons, metric_list) if g is not None]
graph_metrics_I = np.array([g for g in metric_list if g is not None])
In [ ]:
print(graph_metrics_I.shape)
import pickle
with open('char_net_features_using_json_I.pkl', 'wb') as fp:
    pickle.dump(graph_metrics_I, fp)
In [ ]:
from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, random_state=0)
tsne_op = tsne_model.fit_transform(graph_metrics_I)
plt.figure(figsize=(10,10))
plt.scatter(tsne_op[:,0], tsne_op[:,1], alpha=0.5)
plt.show()
In [ ]:
#clustering in tsne reduced dimensions
kmeans = KMeans(n_clusters=8, random_state=0)
tsne_clusters = kmeans.fit_predict(tsne_op)
c = ['r', 'g', 'b', 'y', 'k', 'c', 'm', 'w']
colors = [c[t] for t in tsne_clusters]
plt.figure(figsize=(10,10))
plt.scatter(tsne_op[:,0], tsne_op[:,1], c=colors, alpha=0.5)
plt.show()
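In [ ]:
#k=8 above is an arbitrary choice; a quick silhouette sweep (a rough sketch, not a tuned model-selection
#step) gives a sense of whether another k separates the t-SNE embedding better
from sklearn.metrics import silhouette_score
for k in range(2, 13):
    labels = KMeans(n_clusters=k, random_state=0).fit_predict(tsne_op)
    print(k, silhouette_score(tsne_op, labels))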
In [ ]:
#find the movies that fall in the red cluster (cluster 0)
#validate this manually: would you expect these movies to fall in the same cluster?
red_movies = []
for i, t in enumerate(tsne_clusters):
    if t == 0:
        red_movies.append(valid_movie_jsons[i])
for r in red_movies:
    print(r)
In [ ]:
#clustering in the original (un-reduced) feature space
#compare this with the clustering in the t-SNE-reduced space above:
#the resulting clusters are nearly the same
kmeans = KMeans(n_clusters=8, random_state=0)
actual_clusters = kmeans.fit_predict(graph_metrics_I)
tsne_model = TSNE(n_components=2, random_state=0)
tsne_op = tsne_model.fit_transform(graph_metrics_I)
c = ['r', 'g', 'b', 'y', 'k', 'c', 'm', 'w']
colors = [c[t] for t in actual_clusters]
plt.figure(figsize=(15,15))
plt.scatter(tsne_op[:,0], tsne_op[:,1], c=colors, alpha=0.5)
plt.show()
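In [ ]:
#the "nearly the same" observation can be quantified with the adjusted Rand index: 1.0 means identical
#partitions, values near 0 mean chance-level agreement (only the groupings matter, not the label ids)
from sklearn.metrics import adjusted_rand_score
print(adjusted_rand_score(tsne_clusters, actual_clusters))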