In [2]:
#!/usr/bin/env python
# coding=utf-8
# Detects and creates the collaboration graph based on the clustering results
# Evaluates content creator assignments, collaborations between channels, networks, categories and popularities
# NOTE(review): Python 2 notebook export — print statements and the networkx 1.x
# API (degree().items(), connected_component_subgraphs) are used throughout.
import pandas as pa
import numpy as np
import json
import os
import networkx as nx
import pygraphviz as gz
from networkx.drawing.nx_pydot import write_dot
import math
from sklearn.preprocessing import MinMaxScaler
import matplotlib
import matplotlib.pyplot as plt
%matplotlib notebook
import itertools
import csv
from sqlalchemy import exists, func
# NOTE(review): star import pulls YTDatabase and Channel (used below) into the
# namespace; consider explicit imports so their origin is visible.
from database import *
from matplotlib import pylab, pyplot
from matplotlib import dates
import seaborn as sns
sns.set(color_codes=True)
from scipy import stats, integrate
from datetime import datetime, timedelta, date
# Timestamp formats: ISO-8601 with milliseconds/Z suffix, and plain datetime.
date_format = '%Y-%m-%dT%H:%M:%S.%fZ'
date_format2 = '%Y-%m-%d %H:%M:%S'
plt.style.use(['seaborn-paper'])
sns.set_style("whitegrid")
#plt.rc('font', family='serif', serif='Charter')
plt.rc('font', family='serif', serif='DejaVu Serif')
# Font sizes used for publication-quality figures.
SMALL_SIZE = 8
MEDIUM_SIZE = 9
BIGGER_SIZE = 13
plt.rc('font', size=MEDIUM_SIZE) # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE) # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE) # fontsize of the tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE) # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE) # legend fontsize
plt.rc('figure', titlesize=MEDIUM_SIZE) # fontsize of the figure title
# Figure dimensions (inches): full text width and single-column width,
# heights chosen via the golden ratio (1.618).
x_width = 6.8898
x_height = x_width / 1.618
s_width = 3.4449
s_height = s_width / 1.618
def save_plot(name, fig, width, height):
    """Resize a figure and save it into the chart output directory.

    Parameters
    ----------
    name : str
        Output filename (including extension), placed under the
        module-level ``CDIR`` chart directory.
    fig : matplotlib.figure.Figure
        Figure to resize and save.
    width, height : float
        Target figure size in inches.
    """
    fig.tight_layout()
    fig.set_size_inches(width, height)
    # os.path.join instead of manual '/' concatenation; bbox_inches="tight"
    # trims surrounding whitespace in the saved file.
    fig.savefig(os.path.join(CDIR, name), bbox_inches="tight")
In [3]:
# Input data directory for the 3-month evaluation dataset.
DIR = '../../data/data_evaluation_3months'
# Output directory for generated charts (used by save_plot).
CDIR = '../../data/data_evaluation_3months/charts'
# Project database wrapper from the `database` module; exposes the
# _session_scope() context manager used for queries further down.
db = YTDatabase()
In [24]:
# Load the collaboration graphs produced by the detection pipeline (GML files).
G = nx.read_gml(DIR+"/collab_detections_graph.gml")
# Filtered variant of the detection graph; all metrics below use this one.
Gf = nx.read_gml(DIR+"/filtered_collab_detections_graph.gml")
# Undirected copy, needed for connected components / clustering / diameter.
Gfu = Gf.to_undirected()
In [25]:
# Apply networkx metrics on the graph.
# Gf is directed (DiGraph) — strongly/weakly connected components are computed below.
In [117]:
print nx.info(Gf)
print "Strongly Connected Components: ", nx.number_strongly_connected_components(Gf)
print "Weakly Conncted Components: ", nx.number_weakly_connected_components(Gf)
print 'Average Degree:', pa.DataFrame(Gf.degree().items())[1].mean()
In [118]:
# Mean weighted degree (total / incoming / outgoing) per channel.
# Edge weights presumably count detected collaborations — verify against the
# graph-construction step.
print 'Average Weighted Degree:', pa.DataFrame(Gf.degree(weight='weight').items())[1].mean()
print 'Average Weighted In-Degree:', pa.DataFrame(Gf.in_degree(weight='weight').items())[1].mean()
print 'Average Weighted Out-Degree:',pa.DataFrame(Gf.out_degree(weight='weight').items())[1].mean()
In [112]:
# One subgraph per connected component of the undirected graph.
# NOTE(review): connected_component_subgraphs() was removed in networkx 2.4;
# newer versions need (Gfu.subgraph(c) for c in nx.connected_components(Gfu)).
Ggs = [x for x in nx.connected_component_subgraphs(Gfu)]
# Distribution of per-component diameters (count/mean/quartiles via describe()).
print pa.DataFrame([nx.diameter(g) for g in Ggs]).describe()
In [56]:
# Summary of the undirected filtered graph.
print nx.info(Gfu)
In [114]:
# Compute the average clustering coefficient of the undirected graph Gfu
nx.average_clustering(Gfu)
Out[114]:
In [87]:
def get_top_keys(dictionary, top):
    """Return the keys of the `top` highest-valued entries, best first.

    Parameters
    ----------
    dictionary : dict
        Mapping of key -> numeric score (e.g. node -> centrality).
    top : int
        Number of keys to return; fewer if the dict is smaller.

    Returns
    -------
    list
        Keys sorted by descending value, truncated to `top` entries.
    """
    # sorted() + list comprehension instead of in-place list.sort() and map():
    # the original relied on Python 2's dict.items() returning a list and
    # breaks on Python 3; this form behaves identically on both.
    ranked = sorted(dictionary.items(), key=lambda kv: kv[1], reverse=True)
    return [key for key, _ in ranked[:top]]
# Largest connected component of the undirected graph (by node count).
Gc = max(nx.connected_component_subgraphs(Gfu), key=len)
print nx.info(Gc)
# Betweenness centrality
bet_cen = nx.betweenness_centrality(Gc)
# Closeness centrality
clo_cen = nx.closeness_centrality(Gc)
# Eigenvector centrality (numpy-based solver)
eig_cen = nx.eigenvector_centrality_numpy(Gc)
# Degree centrality
deg_cen = nx.degree_centrality(Gc)
# Top-10 channel ids for each centrality measure (resolved to titles below).
top_bet_cen = get_top_keys(bet_cen,10)
top_clo_cen = get_top_keys(clo_cen,10)
top_eig_cen = get_top_keys(eig_cen,10)
top_deg_cen = get_top_keys(deg_cen,10)
In [91]:
# Summary statistics (mean/std/quartiles) of each centrality distribution
# over the largest connected component.
print pa.DataFrame(bet_cen.items()).describe()
print
print pa.DataFrame(clo_cen.items()).describe()
print
print pa.DataFrame(eig_cen.items()).describe()
print
print pa.DataFrame(deg_cen.items()).describe()
In [88]:
with db._session_scope(False) as session:
print '\nTop 10 Betweenes centrality:'
for ch in top_bet_cen:
title = session.query(Channel.title).filter(Channel.id == ch).first()[0]
print title, bet_cen[ch]
print '\nTop 10 Closeness centrality:'
for ch in top_clo_cen:
title = session.query(Channel.title).filter(Channel.id == ch).first()[0]
print title, clo_cen[ch]
print '\nTop 10 Eigenvector centrality:'
for ch in top_eig_cen:
title = session.query(Channel.title).filter(Channel.id == ch).first()[0]
print title, eig_cen[ch]
print '\nTop 10 Degree centrality:'
for ch in top_deg_cen:
title = session.query(Channel.title).filter(Channel.id == ch).first()[0]
print title, deg_cen[ch]
In [ ]: