In [2]:
#!/usr/bin/env python
# coding=utf-8
# Detects and creates the collaboration graph based on the clustering results
# Evaluates content creator assignments, collaborations between channels, networks, categories and popularities
# NOTE(review): Python 2 notebook export — print statements and the networkx 1.x
# API (degree().items(), connected_component_subgraphs) are used throughout.
import pandas as pa
import numpy as np
import json
import os
import networkx as nx
import pygraphviz as gz
from networkx.drawing.nx_pydot import write_dot
import math
from sklearn.preprocessing import MinMaxScaler
import matplotlib
import matplotlib.pyplot as plt
%matplotlib notebook
import itertools
import csv
from sqlalchemy import exists, func
# NOTE(review): star import pulls YTDatabase and Channel (used below) into the
# namespace; consider explicit imports so their origin is visible.
from database import *
from matplotlib import pylab, pyplot
from matplotlib import dates
import seaborn as sns
sns.set(color_codes=True)
from scipy import stats, integrate
from datetime import datetime, timedelta, date
# Timestamp formats: ISO-8601 with milliseconds/Z suffix, and plain datetime.
date_format = '%Y-%m-%dT%H:%M:%S.%fZ'
date_format2 = '%Y-%m-%d %H:%M:%S'
plt.style.use(['seaborn-paper'])
sns.set_style("whitegrid")
#plt.rc('font', family='serif', serif='Charter')
plt.rc('font', family='serif', serif='DejaVu Serif')
# Font sizes used for publication-quality figures.
SMALL_SIZE = 8
MEDIUM_SIZE = 9
BIGGER_SIZE = 13
plt.rc('font', size=MEDIUM_SIZE) # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE) # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE) # fontsize of the tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE) # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE) # legend fontsize
plt.rc('figure', titlesize=MEDIUM_SIZE) # fontsize of the figure title
# Figure dimensions (inches): full text width and single-column width,
# heights chosen via the golden ratio (1.618).
x_width = 6.8898
x_height = x_width / 1.618
s_width = 3.4449
s_height = s_width / 1.618
def save_plot(name, fig, width, height):
    """Resize a figure and save it into the chart output directory.

    Parameters
    ----------
    name : str
        Output filename (including extension), placed under the
        module-level ``CDIR`` chart directory.
    fig : matplotlib.figure.Figure
        Figure to resize and save.
    width, height : float
        Target figure size in inches.
    """
    fig.tight_layout()
    fig.set_size_inches(width, height)
    # os.path.join instead of manual '/' concatenation; bbox_inches="tight"
    # trims surrounding whitespace in the saved file.
    fig.savefig(os.path.join(CDIR, name), bbox_inches="tight")
In [3]:
# Input data directory for the 3-month evaluation dataset.
DIR = '../../data/data_evaluation_3months'
# Output directory for generated charts (used by save_plot).
CDIR = '../../data/data_evaluation_3months/charts'
# Project database wrapper from the `database` module; exposes the
# _session_scope() context manager used for queries further down.
db = YTDatabase()
In [24]:
# Load the collaboration graphs produced by the detection pipeline (GML files).
G = nx.read_gml(DIR+"/collab_detections_graph.gml")
# Filtered variant of the detection graph; all metrics below use this one.
Gf = nx.read_gml(DIR+"/filtered_collab_detections_graph.gml")
# Undirected copy, needed for connected components / clustering / diameter.
Gfu = Gf.to_undirected()
In [25]:
# Apply networkx metrics on the graph.
# Gf is directed (DiGraph) — strongly/weakly connected components are computed below.
In [117]:
print nx.info(Gf)
print "Strongly Connected Components: ", nx.number_strongly_connected_components(Gf)
print "Weakly Conncted Components: ", nx.number_weakly_connected_components(Gf)
print 'Average Degree:', pa.DataFrame(Gf.degree().items())[1].mean()
In [118]:
# Mean weighted degree (total / incoming / outgoing) per channel.
# Edge weights presumably count detected collaborations — verify against the
# graph-construction step.
print 'Average Weighted Degree:', pa.DataFrame(Gf.degree(weight='weight').items())[1].mean()
print 'Average Weighted In-Degree:', pa.DataFrame(Gf.in_degree(weight='weight').items())[1].mean()
print 'Average Weighted Out-Degree:',pa.DataFrame(Gf.out_degree(weight='weight').items())[1].mean()
In [112]:
# One subgraph per connected component of the undirected graph.
# NOTE(review): connected_component_subgraphs() was removed in networkx 2.4;
# newer versions need (Gfu.subgraph(c) for c in nx.connected_components(Gfu)).
Ggs = [x for x in nx.connected_component_subgraphs(Gfu)]
# Distribution of per-component diameters (count/mean/quartiles via describe()).
print pa.DataFrame([nx.diameter(g) for g in Ggs]).describe()
In [56]:
# Summary of the undirected filtered graph.
print nx.info(Gfu)
In [114]:
# Compute the average clustering coefficient of the undirected graph Gfu
nx.average_clustering(Gfu)
Out[114]:
In [87]:
def get_top_keys(dictionary, top):
    """Return the keys of the `top` highest-valued entries, best first.

    Parameters
    ----------
    dictionary : dict
        Mapping of key -> numeric score (e.g. node -> centrality).
    top : int
        Number of keys to return; fewer if the dict is smaller.

    Returns
    -------
    list
        Keys sorted by descending value, truncated to `top` entries.
    """
    # sorted() + list comprehension instead of in-place list.sort() and map():
    # the original relied on Python 2's dict.items() returning a list and
    # breaks on Python 3; this form behaves identically on both.
    ranked = sorted(dictionary.items(), key=lambda kv: kv[1], reverse=True)
    return [key for key, _ in ranked[:top]]
# Largest connected component of the undirected graph (by node count).
Gc = max(nx.connected_component_subgraphs(Gfu), key=len)
print nx.info(Gc)
# Betweenness centrality
bet_cen = nx.betweenness_centrality(Gc)
# Closeness centrality
clo_cen = nx.closeness_centrality(Gc)
# Eigenvector centrality (numpy-based solver)
eig_cen = nx.eigenvector_centrality_numpy(Gc)
# Degree centrality
deg_cen = nx.degree_centrality(Gc)
# Top-10 channel ids for each centrality measure (resolved to titles below).
top_bet_cen = get_top_keys(bet_cen,10)
top_clo_cen = get_top_keys(clo_cen,10)
top_eig_cen = get_top_keys(eig_cen,10)
top_deg_cen = get_top_keys(deg_cen,10)
In [91]:
# Summary statistics (mean/std/quartiles) of each centrality distribution
# over the largest connected component.
print pa.DataFrame(bet_cen.items()).describe()
print
print pa.DataFrame(clo_cen.items()).describe()
print
print pa.DataFrame(eig_cen.items()).describe()
print
print pa.DataFrame(deg_cen.items()).describe()
In [88]:
with db._session_scope(False) as session:
print '\nTop 10 Betweenes centrality:'
for ch in top_bet_cen:
title = session.query(Channel.title).filter(Channel.id == ch).first()[0]
print title, bet_cen[ch]
print '\nTop 10 Closeness centrality:'
for ch in top_clo_cen:
title = session.query(Channel.title).filter(Channel.id == ch).first()[0]
print title, clo_cen[ch]
print '\nTop 10 Eigenvector centrality:'
for ch in top_eig_cen:
title = session.query(Channel.title).filter(Channel.id == ch).first()[0]
print title, eig_cen[ch]
print '\nTop 10 Degree centrality:'
for ch in top_deg_cen:
title = session.query(Channel.title).filter(Channel.id == ch).first()[0]
print title, deg_cen[ch]
In [ ]: