In [156]:
#!/usr/bin/env python
# coding=utf-8

# Detects and creates the collaboration graph based on the clustering results
# Evaluates content creator assignments, collaborations between channel, networks, categories and popularities

import pandas as pa 
import numpy as np

import json
import os
import networkx as nx
import pygraphviz as gz
from networkx.drawing.nx_pydot import write_dot
import math

from sklearn.preprocessing import MinMaxScaler

import matplotlib
import matplotlib.pyplot as plt
%matplotlib notebook

import itertools

import csv
from sqlalchemy import exists, func

from database import *

from matplotlib import pylab, pyplot
from matplotlib import dates

import seaborn as sns
sns.set(color_codes=True)

from scipy import stats, integrate

from datetime import datetime, timedelta, date

date_format = '%Y-%m-%dT%H:%M:%S.%fZ'
date_format2 = '%Y-%m-%d %H:%M:%S'

plt.style.use(['seaborn-paper'])
sns.set_style("whitegrid")
#plt.rc('font', family='serif', serif='Charter')
plt.rc('font', family='serif', serif='DejaVu Serif')

SMALL_SIZE = 8
MEDIUM_SIZE = 9
BIGGER_SIZE = 13

plt.rc('font', size=MEDIUM_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)    # legend fontsize
plt.rc('figure', titlesize=MEDIUM_SIZE)  # fontsize of the figure title

x_width  = 6.8898
x_height = x_width / 1.618

s_width  = 3.4449
s_height = s_width / 1.618

def save_plot(name, fig, width, height):
    fig.tight_layout()
    fig.set_size_inches(width, height)
    #f.subplots_adjust(top=0.86)

    fig.savefig(CDIR+'/'+name, bbox_inches="tight")
    #plt.savefig(CDIR+'/video_view_percentages.pdf', bbox_inches="tight")

In [157]:
DIR = '../../data/data_evaluation_3MONTHS_filtered'
CDIR = '../../data/data_evaluation_3MONTHS_filtered/charts'

db = YTDatabase()

In [3]:
df_channel = pa.read_csv(DIR+r'/df_channel_statistics_first_day.txt', sep=str('\t'), encoding='utf-8')

In [4]:
df_channel= df_channel.set_index(['id'])

df_channel.loc[df_channel['network'] == 'Maker_Studios', 'network'] = 'Maker Studios'
#Fullscreen managed
df_channel.loc[df_channel['network'] == 'Fullscreen managed', 'network'] = 'Fullscreen'
df_channel.head()


Out[4]:
topicIds network viewCount subscriberCount videoCount commentCount category popularity
id
UC__Pj66OeDibNZNN__L913g Music None 3253022 23029 967 0 Entertainment 2
UC__PZLSRGtUQiTtvm3hPoEQ Movies BroadbandTV 310896 5878 144 0 Entertainment 1
UC__rmdgxs3ZF0zK_he7Tmig Lifestyle None 1291254 8146 294 121 How-to & Style 1
UC_-CxgsxX0tpnm24WO-797Q Lifestyle Maker Studios 625545 18990 67 101 How-to & Style 2
UC_1FUFB6TlGeGOyDI4ikkzg Movies BroadbandTV 89020205 106760 288 0 Entertainment 3

In [5]:
# read collabs from database table
# session.query(VideoFaceCluster)
with db._session_scope(False) as session:

    df_feature_cluster = pa.read_sql(session.query(VideoFeatures.id, VideoFeatures.videoID, VideoFeatures.duration, Video.channelID, VideoFaceCluster.cluster).filter( (VideoFaceCluster.featureID == VideoFeatures.id) & (VideoFeatures.videoID == Video.id) ).statement, db.engine)

print df_feature_cluster.head()
print '\nnumber of feature', len(df_feature_cluster)
print 'number of channels:', df_feature_cluster['channelID'].nunique()
print 'number of videos', df_feature_cluster['videoID'].nunique()
print 'number of clusters:', df_feature_cluster['cluster'].nunique()


       id      videoID  duration                 channelID  cluster
0  159389  gjce2lOMhXs   2223.10  UC--BMyA2X4a9PGAo3lTuopg     9510
1  159391  gjce2lOMhXs   1633.47  UC--BMyA2X4a9PGAo3lTuopg     9510
2  159392  gjce2lOMhXs   1643.81  UC--BMyA2X4a9PGAo3lTuopg     9510
3  159395  gjce2lOMhXs   1350.18  UC--BMyA2X4a9PGAo3lTuopg    12551
4  159396  gjce2lOMhXs   1128.79  UC--BMyA2X4a9PGAo3lTuopg    12551

number of feature 145404
number of channels: 3740
number of videos 80491
number of clusters: 14010

In [15]:
# create df_feature_cluster with react videos filtered

# sql like 'react', reaction?

# read collabs from database table
# session.query(VideoFaceCluster)
with db._session_scope(False) as session:

    df_filtered_cluster = pa.read_sql(session.query(VideoFeatures.id, VideoFeatures.videoID, VideoFeatures.duration, Video.channelID, VideoFaceCluster.cluster).filter( (VideoFaceCluster.featureID == VideoFeatures.id) & (VideoFeatures.videoID == Video.id) & (~Video.title.like(u'%react%')) & (Video.category!=20) ).statement, db.engine)

print df_filtered_cluster.head()
print '\nnumber of feature', len(df_filtered_cluster)
print 'number of channels:', df_filtered_cluster['channelID'].nunique()
print 'number of videos', df_filtered_cluster['videoID'].nunique()
print 'number of clusters:', df_filtered_cluster['cluster'].nunique()


       id      videoID  duration                 channelID  cluster
0  159389  gjce2lOMhXs   2223.10  UC--BMyA2X4a9PGAo3lTuopg     9510
1  159391  gjce2lOMhXs   1633.47  UC--BMyA2X4a9PGAo3lTuopg     9510
2  159392  gjce2lOMhXs   1643.81  UC--BMyA2X4a9PGAo3lTuopg     9510
3  159395  gjce2lOMhXs   1350.18  UC--BMyA2X4a9PGAo3lTuopg    12551
4  159396  gjce2lOMhXs   1128.79  UC--BMyA2X4a9PGAo3lTuopg    12551

number of feature 77268
number of channels: 2822
number of videos 45893
number of clusters: 8703

In [16]:
# testing filtering with 10% mark

from sklearn.preprocessing import normalize

channel_groups = df_feature_cluster.groupby('channelID')
filtered_channel_groups = df_filtered_cluster.groupby('channelID')

df_test = pa.DataFrame()
df_test_filtered = pa.DataFrame()

print 'number of videos for channel:', df_feature_cluster[df_feature_cluster.channelID == 'UC-lHJZR3Gqxm24_Vd_AJ5Yw']['videoID'].nunique()
f = df_feature_cluster[df_feature_cluster.channelID == 'UC-lHJZR3Gqxm24_Vd_AJ5Yw']['videoID'].nunique() * 0.1
counts = channel_groups.get_group('UC-lHJZR3Gqxm24_Vd_AJ5Yw')['cluster'].value_counts(normalize=True)
df_test['relative_counts'] = counts
df_test['channelID'] = 'UC-lHJZR3Gqxm24_Vd_AJ5Yw'
print 'cluster in >10% videos:\n', counts[counts >= f]
fig = plt.figure()
ax = counts.plot(kind='bar')
ax.axhline(0.1, color='red')
print df_test

print 'number of videos for channel, filtered:', df_filtered_cluster[df_filtered_cluster.channelID == 'UC-lHJZR3Gqxm24_Vd_AJ5Yw']['videoID'].nunique()
f = df_filtered_cluster[df_filtered_cluster.channelID == 'UC-lHJZR3Gqxm24_Vd_AJ5Yw']['videoID'].nunique() * 0.1
counts = filtered_channel_groups.get_group('UC-lHJZR3Gqxm24_Vd_AJ5Yw')['cluster'].value_counts(normalize=True)
df_test_filtered['relative_counts'] = counts
df_test_filtered['channelID'] = 'UC-lHJZR3Gqxm24_Vd_AJ5Yw'
print 'cluster in >10% videos, filtered:\n', counts[counts >= f]
fig = plt.figure()
ax = counts.plot(kind='bar')
ax.axhline(0.1, color='red')
ax.axhline(0.05, color='orange')
print df_test_filtered


number of videos for channel: 57
cluster in >10% videos:
Series([], Name: cluster, dtype: float64)
       relative_counts                 channelID
5367          0.395062  UC-lHJZR3Gqxm24_Vd_AJ5Yw
1267          0.111111  UC-lHJZR3Gqxm24_Vd_AJ5Yw
8259          0.086420  UC-lHJZR3Gqxm24_Vd_AJ5Yw
11929         0.074074  UC-lHJZR3Gqxm24_Vd_AJ5Yw
3490          0.049383  UC-lHJZR3Gqxm24_Vd_AJ5Yw
2656          0.037037  UC-lHJZR3Gqxm24_Vd_AJ5Yw
3449          0.024691  UC-lHJZR3Gqxm24_Vd_AJ5Yw
5351          0.024691  UC-lHJZR3Gqxm24_Vd_AJ5Yw
5710          0.024691  UC-lHJZR3Gqxm24_Vd_AJ5Yw
4129          0.012346  UC-lHJZR3Gqxm24_Vd_AJ5Yw
590           0.012346  UC-lHJZR3Gqxm24_Vd_AJ5Yw
3338          0.012346  UC-lHJZR3Gqxm24_Vd_AJ5Yw
4882          0.012346  UC-lHJZR3Gqxm24_Vd_AJ5Yw
2328          0.012346  UC-lHJZR3Gqxm24_Vd_AJ5Yw
13344         0.012346  UC-lHJZR3Gqxm24_Vd_AJ5Yw
7598          0.012346  UC-lHJZR3Gqxm24_Vd_AJ5Yw
1191          0.012346  UC-lHJZR3Gqxm24_Vd_AJ5Yw
1068          0.012346  UC-lHJZR3Gqxm24_Vd_AJ5Yw
5368          0.012346  UC-lHJZR3Gqxm24_Vd_AJ5Yw
1713          0.012346  UC-lHJZR3Gqxm24_Vd_AJ5Yw
7638          0.012346  UC-lHJZR3Gqxm24_Vd_AJ5Yw
603           0.012346  UC-lHJZR3Gqxm24_Vd_AJ5Yw
8198          0.012346  UC-lHJZR3Gqxm24_Vd_AJ5Yw
number of videos for channel, filtered: 39
cluster in >10% videos, filtered:
Series([], Name: cluster, dtype: float64)
       relative_counts                 channelID
5367          0.438596  UC-lHJZR3Gqxm24_Vd_AJ5Yw
1267          0.105263  UC-lHJZR3Gqxm24_Vd_AJ5Yw
11929         0.105263  UC-lHJZR3Gqxm24_Vd_AJ5Yw
8259          0.105263  UC-lHJZR3Gqxm24_Vd_AJ5Yw
2656          0.052632  UC-lHJZR3Gqxm24_Vd_AJ5Yw
3490          0.035088  UC-lHJZR3Gqxm24_Vd_AJ5Yw
5710          0.035088  UC-lHJZR3Gqxm24_Vd_AJ5Yw
1713          0.017544  UC-lHJZR3Gqxm24_Vd_AJ5Yw
7598          0.017544  UC-lHJZR3Gqxm24_Vd_AJ5Yw
1068          0.017544  UC-lHJZR3Gqxm24_Vd_AJ5Yw
2328          0.017544  UC-lHJZR3Gqxm24_Vd_AJ5Yw
7638          0.017544  UC-lHJZR3Gqxm24_Vd_AJ5Yw
4882          0.017544  UC-lHJZR3Gqxm24_Vd_AJ5Yw
8198          0.017544  UC-lHJZR3Gqxm24_Vd_AJ5Yw

In [17]:
# create table with relative cluster counts per channel, plot dist

df_relative_counts = pa.DataFrame()
df_number_videos = []

for name, group in filtered_channel_groups:
    
    df_test = pa.DataFrame()

    df_number_videos.append(df_filtered_cluster[df_filtered_cluster.channelID == name]['videoID'].nunique())
    counts = group['cluster'].value_counts(normalize=True)
    df_test['relative_counts'] = counts
    df_test['channelID'] = name
    df_relative_counts = df_relative_counts.append(df_test)

df_number_videos = pa.DataFrame(df_number_videos)
df_relative_counts.head()


Out[17]:
relative_counts channelID
9511 0.36 UC--BMyA2X4a9PGAo3lTuopg
9512 0.24 UC--BMyA2X4a9PGAo3lTuopg
12552 0.16 UC--BMyA2X4a9PGAo3lTuopg
12551 0.12 UC--BMyA2X4a9PGAo3lTuopg
9510 0.12 UC--BMyA2X4a9PGAo3lTuopg

In [18]:
fig = plt.figure()
ax = sns.distplot(df_relative_counts['relative_counts'], kde=False, bins=100)
#ax.axvline(0.25, color='yellow', label='25%', linewidth=0.5)
ax.axvline(0.1, color='orange', label='10%', linewidth=1.0)
#ax.axvline(0.05, color='red', label='5%', linewidth=0.8)
ax.set_xlabel('% of Videos from Channel')
ax.set_ylabel('Cluster')
#ax.set_xscale('log')
ax.set_yscale('log')
ax.legend()
plt.title('Cluster Percentages of Videos per Channel')

save_plot('channel_video_cluster_percentages.pdf', fig, s_width, s_height)

fig = plt.figure()
ax = sns.distplot(df_number_videos, kde=False)
ax.set_xlabel('Number of Videos')
ax.set_ylabel('Channel')
ax.set_xscale('log')
ax.set_yscale('log')
plt.title('Number of Videos per Channel')


save_plot('channel_nof_videos.pdf', fig, s_width, s_height)



In [21]:
'''
cluster = {}

label = df_feature_cluster['cluster'].unique()

for l in label:
    ftl = df_feature_cluster[df_feature_cluster.cluster == l]
    groups = ftl.groupby(['channelID'])
    vcounts = groups.videoID.nunique()
    vtcounts = groups.videoID.unique()
    vcounts = vcounts.sort_values(ascending=False)
    vs = vcounts.index
    vtcounts = vtcounts.reindex(vs)
    # TODO get average(?) duration for collabs too
    cluster[l] = [(cid, nof, list(vids)) for (cid, nof), (cid2, vids) in zip(vcounts.iteritems(), vtcounts.iteritems())]
    #print l, cluster[l]
    #print vtcounts
'''
    
filtered_cluster = {}

label = df_filtered_cluster['cluster'].unique()

for l in label:
    ftl = df_filtered_cluster[df_filtered_cluster.cluster == l]
    groups = ftl.groupby(['channelID'])
    vcounts = groups.videoID.nunique()
    vtcounts = groups.videoID.unique()
    vcounts = vcounts.sort_values(ascending=False)
    vs = vcounts.index
    vtcounts = vtcounts.reindex(vs)
    # TODO get average(?) duration for collabs too
    filtered_cluster[l] = [(cid, nof, list(vids)) for (cid, nof), (cid2, vids) in zip(vcounts.iteritems(), vtcounts.iteritems())]
    #print l, filtered_cluster[l]
    #print vtcounts

In [26]:
print cluster[3]
print filtered_cluster[8]


[(u'UCIjquHL0-e5Z5YmmVOPIHpg', 2, [u'reojjQ6XsM8', u'vuSTBd6oLsQ']), (u'UC9RRAHA3RuA0N0Genbekgrg', 2, [u'MAnOFA2OqCw', u'pSr-upnC4t4'])]
[(u'UCpTgltu3Ap618ySFFdWaHjg', 1, [u'5tYdW3Aii7c'])]

In [27]:
# create graph from collabs table

def create_graph(df, cluster):

    G = nx.DiGraph() # directed graph


    for l, cls in cluster.iteritems():
        if cls[0][1] >= math.ceil(df[df.channelID == cls[0][0]]['videoID'].nunique() * 0.1):
            mainc = cls[0][0]
            if G.has_node(mainc):
                if 'cluster' in G.node[mainc]:
                    G.node[mainc]['cluster'].append(str(l))
                else:
                    G.node[mainc]['cluster'] = [str(l)]

                    with db._session_scope(False) as session:
                        G.node[mainc]['network'] = session.query(Channel.network).filter(Channel.id == mainc).first()[0]

            else:
                with db._session_scope(False) as session:
                    network = session.query(Channel.network).filter(Channel.id == mainc).first()[0]
                G.add_node(mainc, cluster=[str(l)], network=network)

            for (c, n, v) in cls[1:]:
                G.add_edge(mainc, c, weight=int(n), cluster=str(l), videos=v)


    print '\nNodes:',len(G.nodes())
    print 'Edges:',len(G.edges())

    print 'Collabs (weight sum)', G.size(weight='weight')

    for x in G.nodes():
        if not 'network' in G.node[x]:
            with db._session_scope(False) as session:
                    G.node[x]['network'] = session.query(Channel.network).filter(Channel.id == x).first()[0]
        if not 'cluster' in G.node[x]:
            G.node[x]['cluster'] = ''
        #print G.node[x]['network']
        
    return G

G = create_graph(df_feature_cluster, cluster)

Gf = create_graph(df_filtered_cluster, filtered_cluster)


nx.write_gml(G, DIR+"/collab_detections_graph.gml")
nx.write_gml(Gf, DIR+"/filtered_collab_detections_graph.gml")


Nodes: 3625
Edges: 3475
Collabs (weight sum) 7614.0

Nodes: 2775
Edges: 1728
Collabs (weight sum) 3925.0

In [189]:
df_collab_channels = pa.Series(Gf.nodes())

print len(df_collab_channels.unique())

df_collab_channels_deg = pa.Series(Gf.degree())

print df_collab_channels_deg.value_counts()


2775
0     1176
1      827
2      340
3      175
4      101
5       64
6       35
7       23
8       14
10       8
9        5
11       2
16       2
12       2
14       1
dtype: int64

In [ ]:
for c in nx.isolates(Gf):

In [15]:
#G = nx.read_gml(DIR+"/collab_detections_graph.gml")
#Gf = nx.read_gml(DIR+"/filtered_collab_detections_graph.gml")

In [28]:
# compare graphs


# first compare nodes, which are present, which missing
Gf_missing = set()
for node in G.nodes():
    if not Gf.has_node(node):
        Gf_missing.add(node)

Gf_adds = set()
for node in Gf.nodes():
    if not G.has_node(node):
        Gf_adds.add(node)
        
with db._session_scope(False) as session:
    df_missing = pa.read_sql(session.query(Channel.title).filter((Channel.id.in_(Gf_missing))).statement, db.engine)
    df_adds = pa.read_sql(session.query(Channel.title).filter((Channel.id.in_(Gf_adds))).statement, db.engine)

print 'Missing:\n', df_missing
print 'Added:\n', df_adds

# then add all for edge comparision
Gfc = Gf.copy()
Gfc.add_nodes_from(list(Gf_missing))
   
Gc = G.copy()
Gc.add_nodes_from(list(Gf_adds))

Gc_diff = nx.difference(Gc, Gfc)
print '\nMissing edges:', len(Gc_diff.edges())
weight_sum = 0
for e in Gc_diff.edges():
        weight_sum += Gc.get_edge_data(e[0], e[1])['weight'] 
print 'Missing collabs:', weight_sum

Gfc_diff = nx.difference(Gfc, Gc)
print 'Added edges:',len(Gfc_diff.edges())
weight_sum = 0
for e in Gfc_diff.edges():
        weight_sum += Gfc.get_edge_data(e[0], e[1])['weight']
print 'Added collabs:', weight_sum


#for e in Gc_diff.edges():
#    with db._session_scope(False) as session:
#        fm = session.query(Channel.title).filter(Channel.id == e[0]).first()[0]
#        to = session.query(Channel.title).filter(Channel.id == e[1]).first()[0]
#        print fm,'->', to, ':', Gc.get_edge_data(e[0], e[1])


Missing:
                                      title
0                                  VeerDosE
1                            Dante D'Angelo
2                                 Антишнапс
3                            Gameradioativo
4                                  Japaaa23
5                                  LuiJusto
6                                WDTechToys
7                               ShamamGames
8     FIFALIZE - SÉRIES E TUTORIAIS DE FIFA
9                                  Sashuani
10                               dookieshed
11                                   Reaper
12                                Utiplayer
13                     DarkElfX oXo Emblems
14                                 itsRucka
15                                  TRASHER
16                             Na`Vi.Dota 2
17                             MegaCapitalG
18                               FarsAttack
19                             Spikey Mikey
20                            NicolasTRibBR
21                              ZeldaMaster
22   AVITHDGAMER | VIRTUAL PRO LOOK A LIKES
23                           CoreDasAntigas
24                            Zero Gravity™
25                  Narimitkung Gamechannel
26                            Arekkz Gaming
27                                SnakeMC96
28                                Falcon BR
29                    Реакции Летсплейщиков
..                                      ...
841                                JP Plays
842                                   Ali-A
843                                   Gento
844                        Ω VictorKratos Ω
845                                 ZeratoR
846                              Thamriyell
847                        Hadouken de Fogo
848                                    Veki
849                         Superbrioche666
850                                 33Plays
851                   Soy Yoh, un Chibi :'D
852                                Sr Pedro
853                                AzzyLand
854                              Fifasticos
855                            MissPinguina
856                              Bolek 2106
857                        Survoris Channel
858                            Winter_Killz
859                              andreyp0et
860         Mudinho - Cobertura Bo3 Zombies
861             Erupt - Destiny & Overwatch
862                            JT Machinima
863                           Maikindred Tv
864                              Noni Gamer
865                            Teylor Clash
866                         G18SprayAndPray
867                               MARK JOGA
868             Project Phoenix Productions
869                 Memb - Age Of Empires 2
870            DangerousThing Mobile Gaming

[871 rows x 1 columns]
Added:
                                                title
0                                               Diddy
1                                              Tiny23
2                                           Proximity
3   TheLearningStation - Kids Songs and Nursery Rh...
4      FUNKER530 - Veteran Community & Combat Footage
5                                     Lazy Masquerade
6                                             RapBits
7                                          arronlee33
8                                          Tío Kv2000
9                                              Maskey
10                                       Peter Armado
11                           How It Should Have Ended
12                                          Firecrack
13                                        Tonia Moore
14                                      MrCreepyPasta
15                                      Yu-Gi-Oh-TUBE
16                                          evilsephi
17                                             Domics
18                                  Coisas De Android
19                                            Magnata
20                                    BLH Productions

Missing edges: 1825
Missing collabs: 3726
Added edges: 78
Added collabs: 124

In [29]:
# get the number of clusters per channel -> number of content creator (not directly)
df_channel_cluster = pa.DataFrame(index=Gf.nodes())
df_channel_cluster['n_cluster'] = np.nan
#print df_channel_cluster

for node in Gf.nodes():
    #att = nx.get_node_attributes(Gf, 'cluster')
    #print att
    df_channel_cluster.ix[node,'n_cluster'] = len(Gf.node[node]['cluster'])
    
df_channel_cluster.describe()


Out[29]:
n_cluster
count 2775.000000
mean 1.824505
std 2.279144
min 0.000000
25% 1.000000
50% 1.000000
75% 2.000000
max 65.000000

In [30]:
fig = plt.figure()
ax = sns.distplot(df_channel_cluster['n_cluster'], kde=False, bins=100)
ax.set_xlabel('Cluster')
ax.set_ylabel('Channel')
ax.set_xscale('symlog')
ax.set_yscale('symlog')

plt.title('Channel Face Cluster')
save_plot('channel_nof_face_cluster.pdf', fig, s_width, s_height)

fig = plt.figure()
sns.boxplot(data=df_channel_cluster['n_cluster'])

fig = plt.figure()
sns.barplot(data=df_channel_cluster['n_cluster'], ci=99)


Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6c18b59cd0>

In [32]:
degree_sequence=sorted(nx.degree(Gf).values(),reverse=True) # degree sequence
out_degree_sequence=sorted(Gf.out_degree().values(),reverse=True) # degree sequence
in_degree_sequence=sorted(Gf.in_degree().values(),reverse=True) # degree sequence

In [33]:
fig = plt.figure()
#plt.loglog(degree_sequence, label='Sum Degree')
plt.loglog(out_degree_sequence, label='Out Degree')
plt.loglog(in_degree_sequence, label='In Degree')
plt.title("Degree rank plot")
plt.ylabel("Degree")
plt.xlabel("Number of channel")
plt.legend()


Out[33]:
<matplotlib.legend.Legend at 0x7f6c18d5ca50>

In [34]:
#ax.set_xscale('log') #, basex=2)
#ax.set_yscale('log')

fig = plt.figure()
ax1 = sns.distplot(out_degree_sequence, kde=False, label='Out Degree')

ax1 = sns.distplot(in_degree_sequence, kde=False, label='In Degree')
ax1.set_xlabel('Degree')
ax1.set_ylabel('Number of Channel')
#ax1.set_xscale('log')
ax1.set_yscale('symlog')
ax1.legend()
plt.title('Collaboration Degrees')

save_plot('collab_in_out_degrees.pdf', fig, s_width, s_height)

print 'Out degree:', pa.DataFrame(out_degree_sequence).describe(), pa.DataFrame(out_degree_sequence).median()
print 'In degree:', pa.DataFrame(in_degree_sequence).describe(), pa.DataFrame(in_degree_sequence).median()

#fig.set_size_inches(width, height)
#fig = plt.figure()
#ax2 = sns.distplot(degree_sequence, hist=False, label='Sum Degree')
#ax22 = sns.distplot(out_degree_sequence, hist=False, label='Out Degree')
#ax23 = sns.distplot(in_degree_sequence, hist=False, label='In Degree')
#ax23.legend()

#fig = plt.figure()
#ax3 = sns.distplot(degree_sequence)


Out degree:                  0
count  2775.000000
mean      0.622703
std       1.153655
min       0.000000
25%       0.000000
50%       0.000000
75%       1.000000
max      16.000000 0    0.0
dtype: float64
In degree:                  0
count  2775.000000
mean      0.622703
std       1.050624
min       0.000000
25%       0.000000
50%       0.000000
75%       1.000000
max      13.000000 0    0.0
dtype: float64

In [35]:
# actual collabs list, edges list

def func(graph):
    data = []

    for e in graph.edges(data=True):
        item = {}
        item['from'] = e[0]
        item['to'] = e[1]
        item['cluster'] = e[2]['cluster']
        item['weight'] = e[2]['weight']
        if isinstance(e[2]['videos'], list):
            item['videos'] = e[2]['videos']
        else:
           item['videos'] = [e[2]['videos']]
        
        item['from_network'] = graph.node[e[0]]['network']
        if isinstance(graph.node[e[0]]['cluster'], list):
            item['from_cluster'] = graph.node[e[0]]['cluster'] 
        else:
            item['from_cluster'] = [graph.node[e[0]]['cluster']]

        item['from_popularity'] = df_channel.ix[e[0]].popularity
        item['from_category'] = df_channel.ix[e[0]].category
        item['from_topic'] =  df_channel.ix[e[0]].topicIds

        item['to_network'] = graph.node[e[1]]['network']
        if isinstance(graph.node[e[0]]['cluster'], list):
            item['to_cluster'] = graph.node[e[1]]['cluster'] 
        else:
            item['to_cluster'] = [graph.node[e[1]]['cluster']]
            
        item['to_popularity'] = df_channel.ix[e[1]].popularity
        item['to_category'] = df_channel.ix[e[1]].category
        item['to_topic'] = df_channel.ix[e[1]].topicIds
        
        # get list of video ids with collabs somehow here
        data.append(item)

    
    return data


df_graph = pa.DataFrame(func(G))
df_graph.loc[df_graph['from_network'] == 'Maker_Studios', 'from_network'] = 'Maker Studios'
df_graph.loc[df_graph['to_network'] == 'Maker_Studios', 'to_network'] = 'Maker Studios'

df_graph['from_cluster'] = df_graph['from_cluster'].apply(json.dumps)
df_graph['to_cluster'] = df_graph['to_cluster'].apply(json.dumps)
df_graph['videos'] = df_graph['videos'].apply(json.dumps)
df_graph.to_csv(DIR+r'/df_collabs.txt', sep=str('\t'), encoding='utf-8')



df_graph_filtered = pa.DataFrame(func(Gf))
df_graph_filtered.loc[df_graph_filtered['from_network'] == 'Maker_Studios', 'from_network'] = 'Maker Studios'
df_graph_filtered.loc[df_graph_filtered['to_network'] == 'Maker_Studios', 'to_network'] = 'Maker Studios'


df_graph_filtered['from_cluster'] = df_graph_filtered['from_cluster'].apply(json.dumps)
df_graph_filtered['to_cluster'] = df_graph_filtered['to_cluster'].apply(json.dumps)
df_graph_filtered['videos'] = df_graph_filtered['videos'].apply(json.dumps)
df_graph_filtered.to_csv(DIR+r'/df_filtered_collabs.txt', sep=str('\t'), encoding='utf-8')


print len(df_graph)
print len(df_graph_filtered)

df_graph_filtered.head()


#df_graph_filtered[df_graph_filtered.to == 'UC-lHJZR3Gqxm24_Vd_AJ5Yw']


3475
1728
Out[35]:
cluster from from_category from_cluster from_network from_popularity from_topic to to_category to_cluster to_network to_popularity to_topic videos weight
0 4806 UCw-hc7ZJummS0AvWyjUX56A Sports ["3102", "4806", "12796"] None 3 Fitness UCWYtZYH4kcbMm29liIOSGQQ Sports ["2996", "3143", "5232", "7762"] BroadbandTV 3 Sports ["Q-Z7QYkqn2M"] 1
1 5556 UClpEE-Led9ZK0GJQKvU--3Q Entertainment ["5556", "5884", "7050"] Maker Studios 2 Football UCIKF1msqN7lW9gplsifOPkQ Entertainment ["12627"] BroadbandTV 3 Music ["On4GE5hAU8s"] 1
2 5556 UClpEE-Led9ZK0GJQKvU--3Q Entertainment ["5556", "5884", "7050"] Maker Studios 2 Football UC3M4u8_WwqY-2xDbJXxo5eQ Gaming ["4139", "7049"] OmniaMediaCo 3 Sports game ["4cEkXTIcvH4"] 1
3 6135 UCMDz09-3zO1hm1pqRA-Er0A Entertainment ["6135"] BroadbandTV 3 Lifestyle UC3fxB7rF6T7wqymKUJxwmXA Entertainment "" BroadbandTV 3 Lifestyle ["C7Waq5ZgqGU"] 1
4 1367 UCUcBFGAfOzut1x4GSWa6Akg Entertainment ["1367", "4731", "5365", "6760"] Maker Studios 3 Music UCCk_JV7ar8HlxWZRXQKgS7w People & Blogs "" BroadbandTV 2 Lifestyle ["RJir4ghyTDo"] 1

In [179]:
df_collab_channels = pa.Series([])

df_collab_channels = df_collab_channels.append(df_graph_filtered['to'])
df_collab_channels = df_collab_channels.append(df_graph_filtered['from'])

print len(df_collab_channels.unique())


1599

In [36]:
df_graph_filtered[df_graph_filtered['from'] == 'UC2FfW6_YHXUSfsndM2R37cQ']

df_graph_filtered[df_graph_filtered['from'] == 'UC45zwHCRNT5jA8sC5DUW7oQ']


Out[36]:
cluster from from_category from_cluster from_network from_popularity from_topic to to_category to_cluster to_network to_popularity to_topic videos weight
1113 2157 UC45zwHCRNT5jA8sC5DUW7oQ Comedy ["2157"] Fullscreen 3 Lifestyle UC2FfW6_YHXUSfsndM2R37cQ Entertainment ["1409"] None 3 Music of Latin America ["F76ntKET30k"] 1

In [150]:
# Statistics about collabs (per channel pair (edges))
# counting collabs between channel, channel can have multiple collabs with different channel etc.

print 'Number of collaborations complete:', df_graph['weight'].sum()
print 'Number of collaborations complete, filtered:', df_graph_filtered['weight'].sum()


print 'Collaborations between channels:', df_graph['weight'].describe()
print 'Collaborations between channels, filtered:', df_graph_filtered['weight'].describe()

print 'Median number of collaborations between channels:', df_graph['weight'].median()
print 'Median number of collaborations between channels, filtered:', df_graph_filtered['weight'].median()


# Number of collaborations per channel pair

fig = plt.figure()
#ax1 = sns.distplot(df_graph['weight'], kde=False)
ax1 = sns.distplot(df_graph_filtered['weight'], kde=False)
ax1.set_xlabel('Number of Collaborations')
ax1.set_ylabel('Frequency')
#ax1.set_xscale('log')
ax1.set_yscale('symlog')
ax1.legend()
plt.title('Collaborations per Channel Pair')
save_plot('collab_channel_pairs.pdf', fig, s_width, s_height)
#fig = plt.figure()
#ax2 = sns.distplot(df_graph['weight'], hist=False)
#ax21 = sns.distplot(df_graph_filtered['weight'], hist=False, )

fig = plt.figure()
#ax1 = sns.distplot(df_graph['weight'], kde=False)
ax1 = sns.distplot(df_graph_filtered['weight'], kde=False)
ax1.set_xlabel('Number of Collaborations')
ax1.set_ylabel('Frequency')
#ax1.set_xscale('log')
ax1.set_yscale('symlog')
ax1.legend()
plt.title('Collaborations per Channel Pair')
save_plot('collab_channel_pairs_s.pdf', fig, s_width, 0.75*s_height)


fig = plt.figure()
ax = sns.barplot(data=df_graph_filtered['weight'], ci=99, errwidth=1., capsize=.05)
plt.legend(["{}% CI".format(99)])
ax.set_xlabel('')
ax.set_xticklabels('')
ax.set_ylabel('mean(Collaborations)')
plt.title('Number of Collaborations per Channel Pair')
ax.set_ylim(0.0, 4.0)
for bar in ax.patches:
    x = bar.get_x()
    width = bar.get_width()
    centre = x+width/2.
    newwidth = width/2
    bar.set_x(centre-newwidth/2.)
    bar.set_width(newwidth)

save_plot('collab_channel_pairs_bar_box.pdf', fig, s_width, s_height)


Number of collaborations complete: 7614
Number of collaborations complete, filtered: 3925
Collaborations between channels: count    3475.000000
mean        2.191079
std         4.610916
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max       117.000000
Name: weight, dtype: float64
Collaborations between channels, filtered: count    1728.000000
mean        2.271412
std         4.786327
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max       117.000000
Name: weight, dtype: float64
Median number of collaborations between channels: 1.0
Median number of collaborations between channels, filtered: 1.0

In [38]:
top_pairs = df_graph_filtered.sort_values(by=['weight'], ascending=False)


def get_name(id):
    with db._session_scope(False) as session:
        return session.query(Channel.title).filter(Channel.id == id).first()[0]
    
top_pairs['from_title'] = top_pairs['from'].apply(get_name)
top_pairs['to_title'] = top_pairs['to'].apply(get_name)

top_pairs[:20].to_csv(DIR+r'/df_most_collabs_top_pairs.txt', sep=str('\t'), encoding='utf-8')

top_pairs[['from_title', 'from_popularity', 'to_title', 'to_popularity', 'weight']][:30]


Out[38]:
from_title from_popularity to_title to_popularity weight
652 InformOverload 3 iO Trendz 3 117
1252 LandonProduction Vlogs 2 MostAmazingTop10 4 51
1321 FilmRise 2 FilmRise Documentaries 2 49
574 Good Mythical MORE 4 Good Mythical Morning 5 49
1322 FilmRise 2 FilmRise True Crime 2 48
443 Good Mythical Morning 5 Good Mythical MORE 4 42
1490 ChocolaTV 4 Freak TV 3 32
158 Ashley & Nate 3 NatesLife 3 29
797 WengieVlogs 3 Maxmellow 2 28
912 PontiacMadeDDG VLOGS 3 PontiacMadeDDG 3 27
1251 LandonProduction Vlogs 2 LandonProduction 3 27
567 AndroTube 3 CuriosYTube 3 25
356 Living Out Loud Vlog 2 FionaFrills 3 25
1528 Matsura Vlog 3 Thais e Thalita Matsura 4 21
456 JustKiddingNews 4 Bart & Geo 3 21
1307 EU FICO LOKO 4 Christian Figueiredo 4 21
155 Nikol CrazyFamily 3 Я - Alisa 3 20
420 Thalita Vlog 3 Thalita Ferraz 4 20
861 DisneyCarToys 4 AllToyCollector 4 20
925 babyteeth4 4 Babyteeth More 1 19
476 TheBeauty2go 3 Kathi2go 3 19
1560 CuriosYTube 3 SisiThings 0 18
261 Murmullo Latino ASMR 2 Whisper Latina ASMR 2 17
404 Mayden y Natalia Vlogs 3 ExpCaseros 4 17
1429 VIDA DE CASAL 3 Luan Novitt TV 3 17
1243 Penteados para meninas 4 Bel para meninas 4 16
1298 Ümidi HD 3 FattyPillowTV 3 16
518 askhodgetwins 4 TwinMuscle 4 16
458 JustKiddingNews 4 JustKiddingParty 3 15
395 Maxmellow 2 WengieVlogs 3 15

In [39]:
top_pairs['popularity_diff'] = top_pairs.apply(lambda row: abs(row['to_popularity'] - row['from_popularity']), axis=1)
top_pairs.sort_values(by=['popularity_diff', 'weight'], ascending=True)[['from_title', 'from_popularity', 'to_title', 'to_popularity', 'weight']][:30]


Out[39]:
from_title from_popularity to_title to_popularity weight
1017 TheRealAlexBertie 3 Ash Hardell 3 1
1256 Welcome Jules 3 NAMOR* 3 1
1258 Victor Goes 3 #100 Noção 3 1
1253 Kyo And Ruka 2 Zaunstar 2 1
1226 Diário de Princesas 1 Youtubers Kids BR 1 1
1225 WarLeaks - Daily Military Defense Videos & Com... 3 kamikadzedead 3 1
1255 Welcome Jules 3 SuperDimmix 3 1
1205 MusicNeverSleeps 4 Tiffany Alvord 4 1
1206 SanFrezco 3 Nick SanFrezco 3 1
1240 Sabrina Iorio 3 uJoãozinho Vine 3 1
1217 GQ 4 ALONZO LERONE 4 1
1244 BULL1TRC 3 ToNYD2WiLD 3 1
1231 Heroes TEAM 3 AviveHD 3 1
1107 The Lindquists 3 Mitchell Davis 3 1
1108 Amanda Pontes 3 Jaqueline Guerreiro 3 1
1062 João Ricardo 3 Sabrina Iorio 3 1
1109 Amanda Pontes 3 Tati Nunes 3 1
1061 João Ricardo 3 uJoãozinho Vine 3 1
1058 Pira Não 3 Patrícia Suguino 3 1
1113 uJoãozinho Vine 3 Sabrina Iorio 3 1
1114 iBlali 4 Jarow 4 1
1118 HeyKayli 3 CarlieStylez 3 1
1119 Rampage Anomaly 3 Chumino YT 3 1
1051 Internet Comment Etiquette with Erik 3 Zador Nightmares 3 1
1050 SuperDimmix 3 BIRDYY 3 1
1123 YourMovieSucksDOTorg 3 КИНОНАХ 3 1
1124 YourMovieSucksDOTorg 3 ZEPfilms 3 1
1126 YourMovieSucksDOTorg 3 Brendaniel 3 1
1128 BigBrudda 3 Larry's Lounge 3 1
1104 Para Tudo 3 Maicon Santini 3 1

In [40]:
# check how the names of collab channels differ, channel name in other channel occuring?
for i, row in top_pairs[:20].iterrows():
    if row['from'] in row['to']:
        print row[['from', 'to', 'weight']]
    elif row['to'] in row['from']:
        print row[['from', 'to', 'weight']]

In [41]:
fig = plt.figure()
ax = sns.boxplot(data=df_graph_filtered['weight'])

fig = plt.figure()
ax = sns.barplot(data=df_graph_filtered['weight'], ci=90, errwidth=1., capsize=.05)
plt.legend(["{}% CI".format(99)])
ax.set_xlabel('')
ax.set_xticklabels('')
ax.set_ylabel('mean(Collaborations)')
plt.title('Number of Collaborations per Channel Pair')
for bar in ax.patches:
    x = bar.get_x()
    width = bar.get_width()
    centre = x+width/2.
    newwidth = width/2
    bar.set_x(centre-newwidth/2.)
    bar.set_width(newwidth)



In [153]:
# Statistics about channel and their overall collabs (channel wise)
# counting in and outgoing overall collabs per channel (every channel has single value)

# trenne out and ingoing?
# sum ingoing weights of every node in the graph -> in, out going weight -> overall collabs

df_channel_collab_weights = pa.DataFrame(G.nodes(),columns=['channel']) # use all channel list not only from collab graph?
df_channel_collab_weights = df_channel_collab_weights.set_index(['channel'])

df_channel_collab_weights_filtered = pa.DataFrame(Gf.nodes(),columns=['channel'])
df_channel_collab_weights_filtered = df_channel_collab_weights_filtered.set_index(['channel'])

df_channel_collab_weights_filtered['in_weight'] = np.nan
df_channel_collab_weights_filtered['out_weight'] = np.nan
df_channel_collab_weights_filtered['sum_weight'] = np.nan

for node in Gf.nodes():
    outs = Gf.out_edges(node)
    out_weight = 0
    for e in outs:
        out_weight += Gf.get_edge_data(e[0], e[1])['weight']

    ins = Gf.in_edges(node)
    in_weight = 0
    for e in ins:
        in_weight += Gf.get_edge_data(e[0], e[1])['weight']

    df_channel_collab_weights_filtered.loc[node, 'in_weight'] = in_weight
    df_channel_collab_weights_filtered.loc[node, 'out_weight'] = out_weight
    df_channel_collab_weights_filtered.loc[node, 'sum_weight'] = in_weight+out_weight
    

print df_channel_collab_weights_filtered.head()

fig = plt.figure()
ax1 = sns.distplot(df_channel_collab_weights_filtered['out_weight'], kde=False, label='Outgoing')

# Define some hatches
hatches = ['-', '+', 'x', '\\', '*', 'o']

# Loop over the bars
for i,thisbar in enumerate(ax1.patches):
    # Set a different hatch for each bar
    thisbar.set_hatch('//')
    
ax1 = sns.distplot(df_channel_collab_weights_filtered['in_weight'], kde=False, label='Ingoing')
ax1.set_xlabel('Number of Collaborations')
ax1.set_ylabel('Channel')
#ax1.set_xscale('log')
ax1.set_yscale('symlog')
ax1.legend()
plt.title('Number of Collaborations per Channel')
save_plot('collab_in_out_collabs.pdf', fig, s_width, s_height)

fig = plt.figure()
ax1 = sns.distplot(df_channel_collab_weights_filtered['sum_weight'], kde=False)
ax1.set_xlabel('Number of Collaborations')
ax1.set_ylabel('Frequency')
#ax1.set_xscale('log')
ax1.set_yscale('symlog')
ax1.legend()
plt.title('Collaborations per Channel')
save_plot('collab_nof_collabs_per_channel.pdf', fig, s_width, s_height)

fig = plt.figure()
ax1 = sns.distplot(df_channel_collab_weights_filtered['sum_weight'], kde=False)
ax1.set_xlabel('Number of Collaborations')
ax1.set_ylabel('Frequency')
#ax1.set_xscale('log')
ax1.set_yscale('symlog')
ax1.legend()
plt.title('Collaborations per Channel')
save_plot('collab_nof_collabs_per_channel_s.pdf', fig, s_width, 0.75*s_height)

print df_channel_collab_weights_filtered['sum_weight'].describe()

df_test_weights = []
for i, row in df_channel_collab_weights_filtered.iterrows():
    df_test_weights.append((row['in_weight'], 'in_weight'))
    df_test_weights.append((row['out_weight'], 'out_weight'))
df_test_weights = pa.DataFrame(df_test_weights, columns=['weight', 'direction'])
df_test_weights.head()



fig = plt.figure()
ax = sns.violinplot(x='direction', y='weight', data=df_test_weights)
ax.set_xticklabels(['Internal', 'External'])
ax.set_ylabel('Number of Collaborations')
ax.set_xlabel('')
plt.title('Collaborations Internal/External')
ax.set_ylim([-4, 16])
save_plot('collab_in_out_collabs_box_violin.pdf', fig, s_width, s_height)

fig = plt.figure()
ax = sns.boxplot(x='direction', y='weight', data=df_test_weights)
print df_channel_collab_weights_filtered['in_weight'].describe()
print df_channel_collab_weights_filtered['out_weight'].describe()

ax.set_xticklabels(['Internal', 'External'])
ax.set_ylabel('Number of Collaborations')
ax.set_xlabel('')
plt.title('Collaborations Internal/External')
ax.set_ylim([-0.1, 3.4])
save_plot('collab_in_out_collabs_box_s.pdf', fig, s_width, 0.75*s_height)

fig = plt.figure()
ax = sns.barplot(data=df_channel_collab_weights_filtered['sum_weight'], ci=99, errwidth=1., capsize=.05)
plt.legend(["{}% CI".format(99)])
ax.set_xlabel('')
ax.set_xticklabels('')
ax.set_ylabel('mean(Collaborations)')
plt.title('Number of Collaborations per Channel')

for bar in ax.patches:
    x = bar.get_x()
    width = bar.get_width()
    centre = x+width/2.
    newwidth = width/2
    bar.set_x(centre-newwidth/2.)
    bar.set_width(newwidth)

save_plot('collab_nof_collabs_per_channel_bar_box.pdf', fig, s_width, s_height)

print df_channel_collab_weights_filtered['sum_weight'].median()
df_channel_collab_weights_filtered['sum_weight'].describe()


                          in_weight  out_weight  sum_weight
channel                                                    
UCm1LjO7mpzb68Q71PjGleWQ        0.0         0.0         0.0
UCw-hc7ZJummS0AvWyjUX56A        0.0         1.0         1.0
UCJ3_30nK_biup2t7g_3C61g        1.0         0.0         1.0
UCc_JXQMtWfJmQk3XPQ8JVJQ        0.0         0.0         0.0
UC3Nr-TlH0dQD-R7JBdWU00w        0.0         0.0         0.0
count    2775.000000
mean        2.828829
std         7.234004
min         0.000000
25%         0.000000
50%         1.000000
75%         3.000000
max       134.000000
Name: sum_weight, dtype: float64
count    2775.000000
mean        1.414414
std         4.486117
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max       134.000000
Name: in_weight, dtype: float64
count    2775.000000
mean        1.414414
std         4.941453
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max       120.000000
Name: out_weight, dtype: float64
1.0
Out[153]:
count    2775.000000
mean        2.828829
std         7.234004
min         0.000000
25%         0.000000
50%         1.000000
75%         3.000000
max       134.000000
Name: sum_weight, dtype: float64

In [43]:
sorted_collabs =  df_channel_collab_weights_filtered['sum_weight'].sort_values(ascending=False)

sorted_collabs = sorted_collabs[:10]
print sorted_collabs
#most_collabs = pa.DataFrame(['channel0', 'channel1', 'nof'])
most_collabs = []

for index1, sumw in sorted_collabs.iteritems():
    #print index1, sumw
    froms = df_graph_filtered[df_graph_filtered['from'] == index1][['to', 'weight', 'cluster']]
    tos = df_graph_filtered[df_graph_filtered.to == index1][['from', 'weight', 'cluster']]
    
    for index2, row in froms.iterrows():
        most_collabs.append( (index1, sumw,  row['to'], row['weight'], row['cluster']) )
    
    for index2, row in tos.iterrows():
        most_collabs.append( (index1, sumw, row['from'], row['weight'], row['cluster']) )
    
most_collabs = pa.DataFrame(most_collabs)

#most_collabs.set_index([0, 1], inplace=True)

#most_collabs = most_collabs.reset_index()


def get_name(id):
    with db._session_scope(False) as session:
        return session.query(Channel.title).filter(Channel.id == id).first()[0]
    
most_collabs[0] = most_collabs[0].apply(get_name)
most_collabs[2] = most_collabs[2].apply(get_name)


most_collabs.columns = ['channel0', 'total', 'channel1', 'count', 'cluster']

most_collabs.set_index(['channel0', 'total', ], inplace=True)

most_collabs.to_csv(DIR+r'/df_most_collabs_top.txt', sep=str('\t'), encoding='utf-8')

most_collabs


channel
UCIZSqPHF-R9m3X5o0VPKFdg    134.0
UC-NINtNMPM75eaqh07RCy_Q    133.0
UCz_BJGTYrM4ntr7vsrsjfLw     97.0
UC4PooiX37Pld1T8J5SYT-SQ     93.0
UCzpCc5n9hqiVC7HhPwcIKEg     91.0
UCoKGanMmrKj8RAqAwS5vRug     82.0
UCBINYCmwE29fBXCpUI8DgTA     74.0
UCfPhyExfcaqJBKc3HO3cNBw     63.0
UCNRfqSkBqZR8Ge_QnfwqgFQ     57.0
UCU4BHh9Dwfd7-I_xTZ5037Q     50.0
Name: sum_weight, dtype: float64
Out[43]:
channel1 count cluster
channel0 total
iO Trendz 134.0 MyCupcakeAddiction 1 4776
134.0 SuperwomanVlogs 1 7489
134.0 Michael McCrudden 1 7598
134.0 Scarce 1 9356
134.0 InformOverload 117 7556
134.0 PewDiePie 1 5367
134.0 SHAYTARDS 1 5257
134.0 Pabllo Vittar 1 3053
134.0 DJ Akademiks 1 4716
134.0 MostAmazingTop10 9 5264
InformOverload 133.0 InformOverload 2 2 7556
133.0 LandonProduction Vlogs 1 5975
133.0 iO Trendz 117 7556
133.0 HES19Motivation 1 108
133.0 Bando de Quadrados - Nerd Cristão 1 3447
133.0 Be Amazed 1 3493
133.0 Monster Energy 1 5247
133.0 SHAYTARDS 2 5257
133.0 MostAmazingTop10 6 8709
133.0 dangmattsmith 1 5508
FilmRise 97.0 FilmRise Documentaries 49 252
97.0 FilmRise True Crime 48 252
Good Mythical Morning 93.0 Isa Marcial 1 4933
93.0 Good Mythical MORE 42 13327
93.0 Terror Tube 1 104
93.0 Good Mythical MORE 49 8617
Good Mythical MORE 91.0 Good Mythical Morning 49 8617
91.0 Good Mythical Morning 42 13327
LandonProduction Vlogs 82.0 LandonProduction 27 4811
82.0 MostAmazingTop10 51 4811
82.0 InformOverload 1 5975
82.0 MostAmazingTop10 2 8709
82.0 LandonProduction 1 4958
MostAmazingTop10 74.0 LandonProduction Vlogs 2 8709
74.0 InformOverload 6 8709
74.0 iO Trendz 9 5264
74.0 Leandro Osti 1 1166
74.0 CHEF MAMA ROSA 1 2188
74.0 Black Pigeon Speaks 1 1087
74.0 Just For Laughs Gags 1 668
74.0 Monster Energy 1 5247
74.0 CBR 1 1996
74.0 LandonProduction Vlogs 51 4811
JustKiddingNews 63.0 JustKiddingFilms 2 11464
63.0 Joe Jo 1 10663
63.0 James Rodolfo 13 12825
63.0 Bart & Geo 21 11464
63.0 Tiff & Case 6 5428
63.0 JustKiddingParty 15 11464
63.0 Barbell Brigade 4 11464
63.0 Nikki Limo 1 2742
WengieVlogs 57.0 Wengie 13 6626
57.0 Clicknetwork 1 6626
57.0 Maxmellow 28 6626
57.0 Maxmellow 15 5566
FilmRise True Crime 50.0 Just For Laughs Gags 1 105
50.0 FilmRise 48 252
50.0 Everyday Estée 1 160

In [158]:
# number of collabs/ Number of videos ratio

# number of collabs -> channel weights (count only in-edges? because in-edges are own videos, 
# out would include videos from othe channel)
# number of videos -> DB, other DF

df_channel_collab_weights_filtered['video/collab ratio'] = np.nan
df_channel_collab_weights_filtered.loc[:, 'nof_videos'] = np.nan

for node in Gf.nodes():
    in_weight = df_channel_collab_weights_filtered.loc[node, 'in_weight']
    with db._session_scope(False) as session:
        nof_videos = session.query(Video.id).filter(Video.channelID == node).count()
    df_channel_collab_weights_filtered.loc[node, 'nof_videos'] = nof_videos
    #print nof_videos, in_weight, in_weight/nof_videos
    df_channel_collab_weights_filtered.loc[node, 'video/collab ratio'] = in_weight / nof_videos

    
#print df_channel_collab_weights[['in_weight', 'out_weight', 'nof_videos', 'video/collab ratio']].sort_values(by='video/collab ratio', ascending=False)

In [159]:
#number of collabs/ Number of videos ratio

fig = plt.figure()
ax1 = sns.distplot(df_channel_collab_weights_filtered['video/collab ratio'], kde=False)
ax1.set_xlabel('Collaboration/Video Ratio')
ax1.set_ylabel('Number of Channel')
#ax1.set_xscale('log')
ax1.set_yscale('symlog')
ax1.legend()
plt.title('Channel Collaborations/Videos Ratio')
save_plot('collab_video_ratios.pdf', fig, s_width, s_height)


fig = plt.figure()
ax1 = sns.distplot(df_channel_collab_weights_filtered['video/collab ratio'], kde=False)
ax1.set_xlabel('Collaboration/Video Ratio')
ax1.set_ylabel('Number of Channel')
#ax1.set_xscale('log')
ax1.set_yscale('symlog')
ax1.legend()
plt.title('Channel Collaborations/Videos Ratio')
save_plot('collab_video_ratios_s.pdf', fig, s_width, 0.75*s_height)

#fig = plt.figure()
#ax2 = sns.distplot(df_channel_collab_weights_filtered['video/collab ratio'], hist=False)
#ax2.set_yscale('log')
#ax2.set_xscale('log')

df_channel_collab_weights_filtered['video/collab ratio'].describe()


Out[159]:
count    2775.000000
mean        0.063742
std         0.175167
min         0.000000
25%         0.000000
50%         0.000000
75%         0.043478
max         2.000000
Name: video/collab ratio, dtype: float64

In [46]:
# assigns network and popularity in channel list df_channel_collab_weights_filtered


df_channel_collab_weights_filtered['network'] = np.nan
df_channel_collab_weights_filtered['popularity'] = np.nan

for index, row in df_channel_collab_weights_filtered.iterrows():
    df_channel_collab_weights_filtered.loc[index, 'network'] = df_channel.loc[index, 'network']
    df_channel_collab_weights_filtered.loc[index, 'popularity'] = df_channel.loc[index, 'popularity']


# Collabs by network
# number of collabs complete (all channels) per network, dist.
networks = []
for name, group in df_channel_collab_weights_filtered.groupby(['network']):
    #print name, group['sum_weight'].sum()
    networks.append((name, group['in_weight'].sum(), group['out_weight'].sum()))
df_network_collabs = pa.DataFrame(networks, columns=['network', 'in_collabs', 'out_collabs'])
df_network_collabs = df_network_collabs.set_index(['network'])

# number of collabs by popularity, per popularity class -> in or outgoing collabs? both?
popularitys = []
for name, group in df_channel_collab_weights_filtered.groupby(['popularity']):
    #print name, group['sum_weight'].sum()
    popularitys.append((name, group['in_weight'].sum(), group['out_weight'].sum()))
df_popularitys_collabs = pa.DataFrame(popularitys, columns=['popularity', 'in_collabs', 'out_collabs'])
df_popularitys_collabs = df_popularitys_collabs.set_index(['popularity'])

print df_network_collabs.head()
print len(df_network_collabs)
print df_popularitys_collabs.head()
# Number of collabs between channel in their own network, and collabs over networks


                        in_collabs  out_collabs
network                                        
1l1lv2tvbn7LsHWoygrwgA         0.0          0.0
2btube                         3.0          1.0
63vOrQKSnHPhvqmEekGPOQ         0.0          0.0
AGE Network                    0.0          0.0
AIR                           30.0          0.0
203
            in_collabs  out_collabs
popularity                         
0.0               71.0         53.0
1.0              187.0        142.0
2.0              847.0        828.0
3.0             1914.0       1929.0
4.0              827.0        902.0

In [47]:
print df_network_collabs.describe()

print df_popularitys_collabs.describe()


        in_collabs  out_collabs
count   203.000000   203.000000
mean     19.334975    19.334975
std     112.487474   115.571615
min       0.000000     0.000000
25%       0.000000     0.000000
50%       1.000000     0.000000
75%       3.000000     2.000000
max    1106.000000  1037.000000
        in_collabs  out_collabs
count     7.000000     7.000000
mean    560.714286   560.714286
std     696.386558   712.598464
min       3.000000     5.000000
25%      73.500000    59.500000
50%     187.000000   142.000000
75%     837.000000   865.000000
max    1914.000000  1929.000000

In [48]:
fig, axes = plt.subplots()
ax = df_network_collabs[ (df_network_collabs.in_collabs + df_network_collabs.out_collabs) > 100].sort_values(by=['in_collabs', 'out_collabs'], ascending=True).plot(ax=axes, kind='barh')
ax.set_xlabel('Number of Collaborations')
ax.set_ylabel('Network')
ax.set_xscale('log')
#ax.set_yscale('log')
ax.legend(loc=0, labels=['Internal', 'External'])
plt.title('Network Collaborations')
plt.tight_layout()
save_plot('collab_in_out_networks.pdf', fig, s_width, s_height)

fig, axes = plt.subplots()
ax = df_network_collabs[ (df_network_collabs.in_collabs + df_network_collabs.out_collabs) > 40].sort_values(by=['in_collabs', 'out_collabs'], ascending=True).plot(ax=axes, kind='barh')
ax.set_xlabel('Number of Collaborations')
ax.set_ylabel('Network')
ax.set_xscale('log')
#ax.set_yscale('log')
ax.legend(loc=0, labels=['Internal', 'External'])
plt.title('Network Collaborations')
plt.tight_layout()
save_plot('collab_in_out_networks_big.pdf', fig, x_width, x_height)

fig, axes = plt.subplots()
ax1 = df_popularitys_collabs.sort_index(ascending=False).plot(ax=axes,kind='barh')
ax1.set_xlabel('Number of Collaborations')
ax1.set_ylabel('Popularity')
ax1.set_xscale('log')
#ax1.set_yscale('log')
ax1.legend(loc=0, labels=['Internal', 'External'])
plt.title('Popularity Collaborations')
plt.tight_layout()
save_plot('collab_in_out_popularity.pdf', fig, s_width, s_height)


/home/mlode/intel/intelpython27/lib/python2.7/site-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)

In [174]:
# Collabs between popularitys
pops = df_graph_filtered[['from_popularity', 'to_popularity']]
y = pops['from_popularity'].nunique()
x = pops['to_popularity'].nunique()
arr = np.zeros((y, x))

for row in pops.iterrows():
    #print row[0], row[1][0], row[1][1]
    arr[row[1][0], row[1][1]] += 1

print arr
# divide every arr field through the number of videos in this pair?
df_arr = pa.DataFrame(arr)

fig = plt.figure()
ax = sns.heatmap(df_arr, annot=True, fmt='g', cbar=False, mask=df_arr<=0)
ax.set_xlabel('To')
ax.set_ylabel('From')
#ax.set_xscale('log')
#ax1.set_yscale('log')
plt.title('Collaborations between Popularities')
plt.tight_layout()
save_plot('collab_popularity_heatmap.pdf', fig, s_width, s_height)


[[   1.    4.   12.   15.    4.    0.    0.]
 [   6.   11.   24.   38.   10.    0.    0.]
 [  10.   22.  109.  191.   53.    1.    1.]
 [  12.   41.  196.  430.  157.    3.    0.]
 [   3.   11.   48.  170.  115.    7.    2.]
 [   0.    1.    3.    6.    7.    0.    0.]
 [   0.    0.    1.    3.    0.    0.    0.]]

In [190]:
# Collabs between popularitys
pops = df_graph_filtered[['from_popularity', 'to_popularity']]
y = pops['from_popularity'].nunique()
x = pops['to_popularity'].nunique()
arr = np.zeros((y, x))

sum = 0.0

for row in pops.iterrows():
    #print row[0], row[1][0], row[1][1]
    arr[row[1][0], row[1][1]] += 1
    sum += 1

print arr
print sum
arr_p = arr / sum * 100.0
# divide every arr field through the number of videos in this pair?
df_arr = pa.DataFrame(arr_p)

fig = plt.figure()
ax = sns.heatmap(df_arr, annot=True, fmt='.1f', cbar=False, mask=df_arr<=0)
ax.set_xlabel('To')
ax.set_ylabel('From')
#ax.set_xscale('log')
#ax1.set_yscale('log')
plt.title('% Collaborations between Popularities')
plt.tight_layout()
save_plot('collab_popularity_heatmap_perc.pdf', fig, s_width, s_height)


[[   1.    4.   12.   15.    4.    0.    0.]
 [   6.   11.   24.   38.   10.    0.    0.]
 [  10.   22.  109.  191.   53.    1.    1.]
 [  12.   41.  196.  430.  157.    3.    0.]
 [   3.   11.   48.  170.  115.    7.    2.]
 [   0.    1.    3.    6.    7.    0.    0.]
 [   0.    0.    1.    3.    0.    0.    0.]]
1728.0

In [50]:
# Collabs between popularitys
pops = df_graph_filtered[['from_popularity', 'to_popularity', 'videos']]
y = pops['from_popularity'].nunique()
x = pops['to_popularity'].nunique()
#vids = pops['videos'].apply(len)

arr = np.zeros((y, x)) # number of channel-pairs collaborating
arr_div = np.zeros((y, x)) # number of videos

for row in pops.iterrows():
    #print row[0], row[1][0], row[1][1]
    arr[row[1][0], row[1][1]] += 1
    arr_div[row[1][0], row[1][1]] += len(row[1][2])

#print arr


df_arr1 = pa.DataFrame(arr)

fig, (ax1, ax2) = plt.subplots(ncols=2)

sns.heatmap(df_arr1, annot=True, fmt='.0f', cbar=False, mask=df_arr<=0, ax=ax1)
ax1.set_xlabel('To')
ax1.set_ylabel('From')
#ax.set_xscale('log')
#ax1.set_yscale('log')
ax1.set_title('Channel-Collaborations between Popularities')
plt.tight_layout()

#arr = (arr - np.mean(arr)) / np.std(arr)
#arr = (arr - np.min(arr)) / (np.max(arr) - np.min(arr))

#arr_div = (arr_div - np.mean(arr_div)) / np.std(arr_div)
#arr_div = (arr_div-np.min(arr_div))/(np.max(arr_div)-np.min(arr_div))

arr[arr == 0] = 0.000000001

arr_test = arr_div / arr

#print arr_div

#print arr_test


df_arr = pa.DataFrame(arr_test)

sns.heatmap(df_arr, annot=True, fmt='.0f', cbar=False, mask=df_arr<=0, ax=ax2)
ax2.set_xlabel('To')
ax2.set_ylabel('From')
#ax.set_xscale('log')
#ax1.set_yscale('log')
ax2.set_title('Videos/Channel-Collaborations Ratio')
plt.tight_layout()
save_plot('collab_popularity_heatmap_normalized_combine.pdf', fig, 2*s_width, s_height)

fig = plt.figure()
ax =sns.heatmap(df_arr, annot=True, fmt='.0f', cbar=False, mask=df_arr<=0)
ax.set_xlabel('To')
ax.set_ylabel('From')
#ax.set_xscale('log')
#ax1.set_yscale('log')
ax.set_title('Videos/Channel-Collaborations Ratio')
plt.tight_layout()
save_plot('collab_popularity_heatmap_normalized.pdf', fig, s_width, s_height)



In [51]:
bins = [0, 1.0e+3, 1.0e+4, 1.0e+5, 1.0e+6, 1.0e+7, 5.0e+7, 1.0e+8]
def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = itertools.tee(iterable)
    next(b, None)
    return zip(a, b)


for i, (a, b) in enumerate(pairwise(bins)):
    print i,':', a, b


0 : 0 1000.0
1 : 1000.0 10000.0
2 : 10000.0 100000.0
3 : 100000.0 1000000.0
4 : 1000000.0 10000000.0
5 : 10000000.0 50000000.0
6 : 50000000.0 100000000.0

In [202]:
# network collabs
    
network_pairs = []
for name, group in df_graph_filtered.groupby(['from_network', 'to_network']):
    network_pairs.append((name[0], name[1], group['weight'].sum()))
    
df_network_pairs_collabs = pa.DataFrame(network_pairs, columns=['from_network','to_network', 'nof_collabs'])

#df_network_pairs_collabs.plot()

fig = plt.figure()
ax = sns.distplot(df_network_pairs_collabs['nof_collabs'], kde=False, bins=200)
ax.set_xlabel('Number of Collaborations')
ax.set_ylabel('Network Pairs')
ax.set_xscale('log')
ax.set_yscale('log')
plt.title('Number of pairwise Network Collaborations')
plt.tight_layout()
plt.axvline(x=25.0)
save_plot('collab_network_pairs.pdf', fig, s_width, s_height)

#print df_network_pairs_collabs.sort_values(by='nof_collabs', ascending=False).head()
print len(df_network_pairs_collabs)

df_network_collabs_mask = df_network_pairs_collabs[df_network_pairs_collabs.nof_collabs > 12]
#print df_network_collabs_mask.sort_values(by='nof_collabs', ascending=False).head()
print len(df_network_collabs_mask)

#df_network_pairs_collabs[df_network_pairs_collabs.from_network=='AIR']


404
36

In [208]:
# Collabs between networks

def f(x1, x2):
    return ((df_network_collabs_mask['from_network'] == x1) & (df_network_collabs_mask['to_network'] == x2)).any()

df_graph_collabs_vis = df_graph_filtered[df_graph_filtered[['from_network','to_network']].apply(lambda x: f(*x), axis=1)]


network_test = []
for name, group in df_graph_collabs_vis.groupby(['from_network', 'to_network']):
    network_test.append((name[0], name[1], group['weight'].sum()))

    
df_network_test_collabs = pa.DataFrame(network_test, columns=['from_network','to_network', 'nof_collabs'])

froms = df_network_test_collabs['from_network'].values
tos = df_network_test_collabs['to_network'].values

network_set = []
network_set.extend(froms)
network_set.extend(tos)
network_set = list(set(network_set))

new_index = pa.MultiIndex.from_product([network_set, network_set], names=['from_network', 'to_network'])

df_network_test_collabs = df_network_test_collabs.set_index(['from_network', 'to_network']).reindex(new_index).reset_index()

df_network_test_collabs = df_network_test_collabs.set_index(['from_network', 'to_network'])
df_network_test_collabs = df_network_test_collabs.sort_values(by='nof_collabs', ascending=False)


def unique(array):
    uniq, index = np.unique(array, return_index=True)
    return uniq[index.argsort()]

#print df_graph_collabs_vis.head()
print len(df_graph_collabs_vis)
print df_graph_collabs_vis['from_network'].nunique()
print df_graph_collabs_vis['to_network'].nunique()


index_test = df_network_test_collabs.index.values
index_test1, index_test2 = zip(*index_test)

print len(index_test1), len(index_test2)

index_test1 = unique(index_test1)
index_test2 = unique(index_test2)

#print index_test1
#print index_test2

#print len(index_test1), len(index_test2)

test = df_network_test_collabs.unstack(0)
test = test.reindex(index_test1)
#print test.index
new_test = []

for lvl in index_test1:
    vals = test[('nof_collabs', lvl)]

    new_test.append(vals)
    #new_test.append(test[('nof_collabs', lvl)])

new_test = pa.DataFrame(new_test).T

new_test_2 = new_test.copy()

sum = np.nansum(new_test.values)

#print new_test.index
#new_test['nof_collabs'] = new_test['nof_collabs'].apply(lambda x: x / sum * 100.0)

for i, row in new_test_2.iterrows():
    for j, item in row.iteritems():
        #v = 0
        #for l in df_graph_filtered[(df_graph_filtered.from_network==i) & (df_graph_filtered.to_network==j[1])]['videos']:
        #    v += len(l)
        #print item, v
        #if item and v:
        #    new_test_2.loc[(i, j)] = v/item #item/v
        #else:
        #    new_test_2.loc[(i, j)] = np.nan
        new_test_2.loc[(i, j)] = item / sum * 100.0
    
fig = plt.figure()
ax = sns.heatmap(new_test, annot=True, yticklabels=index_test1, xticklabels=index_test1, fmt='g', cbar=False)
ax.set_xticklabels(ax.get_xticklabels(),rotation=90)
ax.set_yticklabels(ax.get_yticklabels(),rotation=0)
ax.set_xlabel('To')
ax.set_ylabel('From')
plt.title('Network Collaborations')
plt.tight_layout()

save_plot('collab_network_heatmap.pdf', fig, 1*x_width, 1.5*x_height)


fig = plt.figure()
ax = sns.heatmap(new_test_2, annot=True, yticklabels=index_test1, xticklabels=index_test1, fmt='.1f', cbar=False)
ax.set_xticklabels(ax.get_xticklabels(),rotation=90)
ax.set_yticklabels(ax.get_yticklabels(),rotation=0)
ax.set_xlabel('To')
ax.set_ylabel('From')
plt.title('% Network Collaborations')
plt.tight_layout()
save_plot('collab_network_heatmap_perc.pdf', fig, 1*x_width, 1.5*x_height)

#print index_test1
#print index_test2


1192
10
14
225 225

In [58]:
fig = plt.figure()
ax = sns.heatmap(new_test, annot=True, yticklabels=index_test1, xticklabels=index_test1, fmt='g', cbar=False)
ax.set_xticklabels(ax.get_xticklabels(),rotation=90)
ax.set_yticklabels(ax.get_yticklabels(),rotation=0)
ax.set_xlabel('To')
ax.set_ylabel('From')
plt.title('Network Collaborations')
plt.tight_layout()

save_plot('collab_network_heatmap_s.pdf', fig, 1.5*s_width, 2.5*s_height)


fig = plt.figure()
ax = sns.heatmap(new_test_2, annot=True, yticklabels=index_test1, xticklabels=index_test1, fmt='.2g', cbar=False)
ax.set_xticklabels(ax.get_xticklabels(),rotation=90)
ax.set_yticklabels(ax.get_yticklabels(),rotation=0)
ax.set_xlabel('To')
ax.set_ylabel('From')
plt.title('Videos/Network Collaborations')
plt.tight_layout()
save_plot('collab_network_heatmap_normalized_s.pdf', fig, 1.5*s_width, 2.5*s_height)

new_test


Out[58]:
(nof_collabs, BroadbandTV) (nof_collabs, Studio71) (nof_collabs, Maker Studios) (nof_collabs, None) (nof_collabs, StyleHaul) (nof_collabs, Fullscreen) (nof_collabs, PranksNetwork) (nof_collabs, AIR) (nof_collabs, Mixicom) (nof_collabs, lacapula)
to_network
BroadbandTV 664.0 28.0 57.0 228.0 NaN 20.0 NaN NaN NaN NaN
Studio71 NaN 411.0 NaN 110.0 NaN NaN NaN NaN NaN NaN
Maker Studios 47.0 25.0 364.0 107.0 20.0 28.0 NaN NaN NaN NaN
None 168.0 150.0 93.0 258.0 NaN 25.0 NaN NaN NaN NaN
StyleHaul NaN NaN NaN NaN 32.0 NaN NaN NaN NaN NaN
Fullscreen 29.0 NaN 34.0 22.0 NaN NaN NaN NaN NaN NaN
PranksNetwork NaN NaN NaN NaN NaN NaN 25.0 NaN NaN NaN
AIR 22.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
Mixicom NaN NaN NaN 27.0 NaN NaN NaN NaN NaN NaN
lacapula NaN 27.0 NaN NaN NaN NaN NaN NaN NaN NaN

In [83]:
df_network_pairs_collabs.sort_values(by='nof_collabs', ascending=False).to_csv(DIR+r'/df_network_collabs_pairs.txt', sep=str('\t'), encoding='utf-8')

In [67]:
test = df_network_pairs_collabs.groupby(by='from_network')
for name, group in test:
    print group


  from_network to_network  nof_collabs
0       2btube   Freedom!            1
   from_network     to_network  nof_collabs
1  Age of Media    BroadbandTV            1
2  Age of Media  Maker Studios            1
  from_network  to_network  nof_collabs
3        Alloy       Break            2
4        Alloy        None            2
5        Alloy  TimeWarner            1
  from_network  to_network  nof_collabs
6       Arcane  Fullscreen            1
    from_network     to_network  nof_collabs
7  AwesomenessTV  Maker Studios            1
8  AwesomenessTV           None            3
9  AwesomenessTV       Studio71           12
   from_network to_network  nof_collabs
10       Base79  DevLounge            1
   from_network to_network  nof_collabs
11   BentPixels  Machinima            3
   from_network     to_network  nof_collabs
12     BigFrame    BroadbandTV            1
13     BigFrame  Maker Studios            3
14     BigFrame           None            1
15     BigFrame       Studio71            7
    from_network          to_network  nof_collabs
16  BodyMindZone        BodyMindZone            1
17  BodyMindZone         BroadbandTV            2
18  BodyMindZone          Fullscreen            1
19  BodyMindZone           Machinima            1
20  BodyMindZone       Maker Studios            1
21  BodyMindZone                None            3
22  BodyMindZone           QuizGroup            1
23  BodyMindZone  vivacommunications            1
   from_network   to_network  nof_collabs
24        Break  BroadbandTV            1
   from_network              to_network  nof_collabs
25  BroadbandTV                  2btube            1
26  BroadbandTV                     AIR           22
27  BroadbandTV            Age of Media            4
28  BroadbandTV            BodyMindZone            2
29  BroadbandTV                   Break            2
30  BroadbandTV             BroadbandTV          664
31  BroadbandTV          ClassicClipsTV            2
32  BroadbandTV           Complex Media            1
33  BroadbandTV     Creators Revolution            1
34  BroadbandTV                   Curse            2
35  BroadbandTV  DigiSayLimited Managed            3
36  BroadbandTV     DigitalDynastyMedia            1
37  BroadbandTV                Divimove            1
38  BroadbandTV                   Exmge            1
39  BroadbandTV          FinderStudio A            1
40  BroadbandTV       FranceTelevisions            1
41  BroadbandTV                Freedom!            4
42  BroadbandTV              Fullscreen           29
43  BroadbandTV               GTchannel            1
44  BroadbandTV           GeekandSundry            1
45  BroadbandTV               Machinima            2
46  BroadbandTV           Maker Studios           47
47  BroadbandTV                    Mitu            5
48  BroadbandTV                 Mixicom            1
49  BroadbandTV                    None          168
50  BroadbandTV            OmniaMediaCo            1
51  BroadbandTV                 Paragon            1
52  BroadbandTV           PranksNetwork            2
53  BroadbandTV          Ritual Network            1
54  BroadbandTV                ScaleLab           16
55  BroadbandTV       SiriusStarNetwork            5
56  BroadbandTV            Social Blade            2
57  BroadbandTV     Social Blade Legacy            1
58  BroadbandTV                Studio71            5
59  BroadbandTV               StyleHaul           14
60  BroadbandTV           TopBeautyBlog            3
61  BroadbandTV                 TubeOne            2
62  BroadbandTV           YouPartnerVSP            2
63  BroadbandTV               Zoomin TV            1
64  BroadbandTV                     nfl            1
65  BroadbandTV     onerpmweb affiliate           11
66  BroadbandTV                  wizdeo            1
67  BroadbandTV  wjoPtSoNLAoX2sLBaKLYng            1
    from_network to_network  nof_collabs
68  BuzzMyVideos  DevLounge            2
             from_network     to_network  nof_collabs
69  ChannelFlip Affiliate  Kin Community            1
70  ChannelFlip Affiliate  Maker Studios            1
71  ChannelFlip Affiliate           None            1
         from_network   to_network  nof_collabs
72  ChannelFrederator  BroadbandTV            1
73  ChannelFrederator       Collab            1
          from_network to_network  nof_collabs
74  Citizine Affiliate       None            1
      from_network   to_network  nof_collabs
75  ClassicClipsTV  BroadbandTV            4
76  ClassicClipsTV     ScaleLab            1
   from_network     to_network  nof_collabs
77       Collab    BroadbandTV            1
78       Collab    Jukin Media            1
79       Collab  Maker Studios            1
80       Collab       Studio71            1
81       Collab      StyleHaul            2
     from_network to_network  nof_collabs
82  Complex Media       None            4
       from_network to_network  nof_collabs
83  CondeNastMaster   Studio71            1
           from_network     to_network  nof_collabs
84  Creators Revolution  PranksNetwork            4
   from_network     to_network  nof_collabs
85        Curse    BroadbandTV           17
86        Curse     Fullscreen            1
87        Curse  Maker Studios            3
88        Curse       Studio71            1
89        Curse  YouPartnerVSP            1
   from_network     to_network  nof_collabs
90  DDNPartners  Maker Studios            1
91  DDNPartners           None            4
              from_network to_network  nof_collabs
92  DaIW2zPRWhzQ9Hj7a0QP1w        AIR            1
   from_network    to_network  nof_collabs
93    DevLounge  BuzzMyVideos            3
           from_network to_network  nof_collabs
94  DigitalDynastyMedia       None            1
   from_network     to_network  nof_collabs
95     Divimove          Alloy            1
96     Divimove  Maker Studios            1
97     Divimove       Studio71            1
              from_network     to_network  nof_collabs
98  Divimove IT affiliated  Maker Studios            1
   from_network   to_network  nof_collabs
99        Emaze  BroadbandTV            2
              from_network     to_network  nof_collabs
100  Endemol beyond Brasil    BroadbandTV            2
101  Endemol beyond Brasil  Maker Studios            1
      from_network     to_network  nof_collabs
102  EndemolEspana  EndemolEspana            1
103  EndemolEspana  Maker Studios            1
104  EndemolEspana       ScaleLab            1
      from_network to_network  nof_collabs
105  EpidemicSound       None            2
           from_network   to_network  nof_collabs
106  FamilyVideoNetwork  BroadbandTV            1
       from_network    to_network  nof_collabs
107  FinderStudio A  The District            1
    from_network     to_network  nof_collabs
108     Freedom!          Break            1
109     Freedom!    BroadbandTV            2
110     Freedom!  Maker Studios            1
111     Freedom!           Mitu            1
112     Freedom!           None            2
113     Freedom!            WWE            1
    from_network                  to_network  nof_collabs
114   Fullscreen                         AIR            1
115   Fullscreen                    AgoraPro            1
116   Fullscreen                       Alloy            3
117   Fullscreen           Awestruck Network            1
118   Fullscreen                      Base79            1
119   Fullscreen                       Break            2
120   Fullscreen                 BroadbandTV           20
121   Fullscreen                DigitalMinds            1
122   Fullscreen       Endemol beyond Brasil            1
123   Fullscreen                    Freedom!            9
124   Fullscreen                  Fullscreen           10
125   Fullscreen                   Machinima            3
126   Fullscreen               Maker Studios           28
127   Fullscreen                     Nerdist            1
128   Fullscreen  NewHavenAssetManagementLLC            2
129   Fullscreen                        None           25
130   Fullscreen         Social Blade Legacy            1
131   Fullscreen                    Studio71           11
132   Fullscreen                   StyleHaul            3
133   Fullscreen                      emipub            2
134   Fullscreen                   ingrooves            1
135   Fullscreen             theorchardmusic            1
           from_network     to_network  nof_collabs
136  Fullscreen managed  Kin Community            3
137  Fullscreen managed       Studio71            3
      from_network to_network  nof_collabs
138  GeekandSundry      Alloy            3
139  GeekandSundry    Nerdist            1
140  GeekandSundry   ScaleLab            1
    from_network   to_network  nof_collabs
141  Jukin Media  BroadbandTV            1
142  Jukin Media         None            1
              from_network to_network  nof_collabs
143  Justforlaughsfestival   Studio71            1
      from_network          to_network  nof_collabs
144  Kin Community         BroadbandTV            2
145  Kin Community          Fullscreen            1
146  Kin Community  Fullscreen managed            2
147  Kin Community       Kin Community            2
148  Kin Community       Maker Studios            1
149  Kin Community                None            2
150  Kin Community            Studio71            2
    from_network             to_network  nof_collabs
151    Machinima            BroadbandTV            3
152    Machinima  Endemol beyond Brasil            1
153    Machinima              Machinima            1
154    Machinima                   None            2
155    Machinima               Studio71            3
     from_network     to_network  nof_collabs
156  MagnoliaNetz  Maker Studios            1
      from_network              to_network  nof_collabs
157  Maker Studios                  2btube            1
158  Maker Studios            Age of Media            1
159  Maker Studios              BelieveSAS            1
160  Maker Studios                   Break            4
161  Maker Studios             BroadbandTV           57
162  Maker Studios       ChannelFrederator            1
163  Maker Studios                   Curse            5
164  Maker Studios             DDNPartners            2
165  Maker Studios            DigitalMinds            1
166  Maker Studios  Divimove IT affiliated            4
167  Maker Studios            DreamWorksTV            1
168  Maker Studios           EndemolEspana            2
169  Maker Studios           EpidemicSound            2
170  Maker Studios              Fullscreen           34
171  Maker Studios           Kin Community            1
172  Maker Studios           Maker Studios          364
173  Maker Studios                    Mitu            1
174  Maker Studios                 Mixicom            1
175  Maker Studios                    None           93
176  Maker Studios            OmniaMediaCo            1
177  Maker Studios           RepostNetwork            2
178  Maker Studios                ScaleLab            1
179  Maker Studios     Social Blade Legacy           11
180  Maker Studios                 SonyBMG            1
181  Maker Studios                Studio71           17
182  Maker Studios               StyleHaul            2
183  Maker Studios           Toms Hardware            3
184  Maker Studios           TopBeautyBlog            1
185  Maker Studios                 TubeOne            1
186  Maker Studios           UCI Affiliate            1
187  Maker Studios    WhistleSportsFitness            3
188  Maker Studios      disney interactive            1
189  Maker Studios  id0iPcHVVMh4ASTXBUYkOA            1
190  Maker Studios                     nfl            1
191  Maker Studios     onerpmweb affiliate            6
192  Maker Studios      vivacommunications            1
    from_network   to_network  nof_collabs
193         Mitu       2btube            1
194         Mitu  BroadbandTV            4
195         Mitu         None            2
    from_network to_network  nof_collabs
196      Mixicom    Mixicom            6
197      Mixicom   Studio71            1
           from_network to_network  nof_collabs
198  NSTV Entertainment       None            1
    from_network     to_network  nof_collabs
199      Nerdist  GeekandSundry            1
200      Nerdist     TimeWarner            8
                   from_network to_network  nof_collabs
201  NewHavenAssetManagementLLC     emipub            1
    from_network              to_network  nof_collabs
202         None                     AIR            3
203         None                   Alloy            1
204         None               Amuse Inc            1
205         None                BigFrame            1
206         None                   Break            2
207         None             BroadbandTV          228
208         None       ChannelFrederator            1
209         None          ClassicClipsTV            1
210         None                  Collab            1
211         None     Creators Revolution            1
212         None                   Curse            1
213         None             DDNPartners           11
214         None  Divimove IT affiliated            1
215         None               EndemolDE            1
216         None           EndemolEspana            1
217         None      FamilyVideoNetwork            2
218         None                FamousID            1
219         None                Freedom!            1
220         None              Fullscreen           22
221         None              Hasbro UGC            1
222         None             Jukin Media            1
223         None           Kin Community            2
224         None               Machinima            1
225         None           Maker Studios          107
226         None                    Mitu            2
227         None                 Mixicom           27
228         None                    None          258
229         None                  ONErpm            1
230         None            OmniaMediaCo            2
231         None           PranksNetwork           12
232         None             RazorAndTie            1
233         None              RecStudios            1
234         None                ScaleLab            6
235         None            Social Blade            1
236         None     Social Blade Legacy            3
237         None                Studio71          110
238         None               StyleHaul           10
239         None                TheVerge            1
240         None           TopBeautyBlog            1
241         None                 TubeOne            3
242         None           UCI Affiliate            1
243         None                    UEFA            1
244         None             UniversalMC            1
245         None      disney interactive            7
246         None            id1o1f%2Bcvp            1
247         None                lacapula            1
248         None     onerpmweb affiliate            5
249         None      vivacommunications            1
    from_network to_network  nof_collabs
250       ONErpm    SonyBMG            2
     from_network   to_network  nof_collabs
251  OmniaMediaCo  BroadbandTV            1
252  OmniaMediaCo        Exmge            1
253  OmniaMediaCo     ScaleLab            1
      from_network          to_network  nof_collabs
254  PranksNetwork       PranksNetwork           25
255  PranksNetwork  vivacommunications            1
    from_network   to_network  nof_collabs
256      Qanawat  BroadbandTV            1
257      Qanawat   Fullscreen            1
258      Qanawat      Qanawat            1
    from_network     to_network  nof_collabs
259     ScaleLab    BroadbandTV           16
260     ScaleLab       Freedom!            1
261     ScaleLab     Fullscreen            1
262     ScaleLab      Machinima            1
263     ScaleLab  Maker Studios            1
264     ScaleLab           Mitu            1
265     ScaleLab           None            5
266     ScaleLab     Rumblefish            1
267     ScaleLab       Studio71            1
     from_network   to_network  nof_collabs
268  Social Blade  BroadbandTV            2
            from_network              to_network  nof_collabs
269  Social Blade Legacy             BroadbandTV            1
270  Social Blade Legacy           Maker Studios           19
271  Social Blade Legacy      NSTV Entertainment            1
272  Social Blade Legacy                    None            3
273  Social Blade Legacy     Social Blade Legacy           14
274  Social Blade Legacy                Studio71            5
275  Social Blade Legacy                    avex            1
276  Social Blade Legacy          seedwellcomedy            1
277  Social Blade Legacy                  wizdeo            4
278  Social Blade Legacy  wjoPtSoNLAoX2sLBaKLYng            1
    from_network   to_network  nof_collabs
279      SonyBMG  BroadbandTV            2
280      SonyBMG       ONErpm            2
    from_network to_network  nof_collabs
281        Splay       None            1
    from_network               to_network  nof_collabs
282     Studio71                      AIR            2
283     Studio71                    Alloy            2
284     Studio71            AwesomenessTV            2
285     Studio71                 BigFrame            4
286     Studio71              BroadbandTV           28
287     Studio71             BuzzMyVideos            3
288     Studio71                    Curse            1
289     Studio71      DigitalDynastyMedia            1
290     Studio71                 Divimove            2
291     Studio71            EpidemicSound            2
292     Studio71                    Exmge            1
293     Studio71                 Freedom!            7
294     Studio71               Fullscreen           12
295     Studio71       Fullscreen managed            2
296     Studio71              IDG Germany            1
297     Studio71            Kin Community            2
298     Studio71             MagnoliaNetz            1
299     Studio71            Maker Studios           25
300     Studio71             MusicNations            1
301     Studio71                     None          150
302     Studio71            PranksNetwork            4
303     Studio71      Social Blade Legacy            1
304     Studio71                 Studio71          411
305     Studio71                StyleHaul            4
306     Studio71         TEDtalksDirector            1
307     Studio71                 TheVerge            1
308     Studio71               TimeWarner           14
309     Studio71                  TubeOne           18
310     Studio71  Very Us Network Hamburg            3
311     Studio71            alldefdigital            1
312     Studio71   e vXdMrHHseZ esYUskSBw            3
313     Studio71                   emipub            1
314     Studio71                 lacapula           27
315     Studio71                     sylo            1
    from_network            to_network  nof_collabs
316    StyleHaul           BroadbandTV           13
317    StyleHaul            Fullscreen            2
318    StyleHaul           KVZMusicLtd            1
319    StyleHaul         Kin Community            2
320    StyleHaul             M6 wizdeo            1
321    StyleHaul         Maker Studios           20
322    StyleHaul                  Mitu            1
323    StyleHaul                  None            8
324    StyleHaul          OmniaMediaCo            1
325    StyleHaul               SonyBMG            1
326    StyleHaul              Studio71            6
327    StyleHaul             StyleHaul           32
328    StyleHaul             Tastemade            1
329    StyleHaul  WhistleSportsFitness            1
    from_network     to_network  nof_collabs
330          TIN  Maker Studios            2
331          TIN        TubeOne            1
    from_network     to_network  nof_collabs
332     TheVerge  Maker Studios            1
333     TheVerge           None            1
    from_network to_network  nof_collabs
334   TimeWarner      Alloy            1
335   TimeWarner    Nerdist            1
      from_network   to_network  nof_collabs
336  Toms Hardware  BroadbandTV            1
      from_network         to_network  nof_collabs
337  TopBeautyBlog                AIR            1
338  TopBeautyBlog        BroadbandTV            1
339  TopBeautyBlog         Fullscreen            1
340  TopBeautyBlog               None            1
341  TopBeautyBlog  Zefr SonyPictures            1
342  TopBeautyBlog    theorchardmusic            1
    from_network             to_network  nof_collabs
343      TubeOne            BroadbandTV            2
344      TubeOne  ChannelFlip Affiliate            1
345      TubeOne          Maker Studios            1
346      TubeOne                   None            3
347      TubeOne               Studio71            6
348      TubeOne                TubeOne            1
      from_network          to_network  nof_collabs
349  UCI Affiliate  Fullscreen managed            1
350  UCI Affiliate       Maker Studios            1
351  UCI Affiliate                None            1
352  UCI Affiliate           StyleHaul            1
353  UCI Affiliate       UCI Affiliate            3
    from_network    to_network  nof_collabs
354         UEFA   BroadbandTV            1
355         UEFA  audionetwork            1
                from_network to_network  nof_collabs
356  Very Us Network Hamburg   Studio71            1
             from_network   to_network  nof_collabs
357  WhistleSportsFitness  BroadbandTV            1
    from_network     to_network  nof_collabs
358    WhiteCast  Maker Studios            1
359    WhiteCast           None            1
                 from_network   to_network  nof_collabs
360  Xtreme video sas managed  BroadbandTV            2
      from_network     to_network  nof_collabs
361  YouPartnerVSP    BroadbandTV            1
362  YouPartnerVSP           None            5
363  YouPartnerVSP  YouPartnerVSP            3
    from_network     to_network  nof_collabs
364    Zoomin TV  Maker Studios            1
365    Zoomin TV       Studio71            1
           from_network     to_network  nof_collabs
366  disney interactive    BroadbandTV            1
367  disney interactive  Maker Studios            2
368  disney interactive           None            1
               from_network         to_network  nof_collabs
369  e vXdMrHHseZ esYUskSBw  ARD JungesAngebot            1
370  e vXdMrHHseZ esYUskSBw           Studio71            2
    from_network                  to_network  nof_collabs
371       emipub                 BroadbandTV            1
372       emipub  NewHavenAssetManagementLLC            1
    from_network  to_network  nof_collabs
373    ingrooves  Fullscreen            2
374    ingrooves     TubeOne            1
    from_network to_network  nof_collabs
375          itn       None            1
    from_network   to_network  nof_collabs
376   melberries  BroadbandTV            1
    from_network   to_network  nof_collabs
377        neotv  BroadbandTV            2
    from_network   to_network  nof_collabs
378          nfl  BroadbandTV            1
            from_network          to_network  nof_collabs
379  onerpmweb affiliate    Base79 Ent Spain            1
380  onerpmweb affiliate         BroadbandTV            5
381  onerpmweb affiliate          Fullscreen            2
382  onerpmweb affiliate        MagnoliaNetz            1
383  onerpmweb affiliate       Maker Studios            2
384  onerpmweb affiliate  NSTV Entertainment            1
385  onerpmweb affiliate                None            5
386  onerpmweb affiliate            Studio71            1
    from_network to_network  nof_collabs
387     postgame  GTchannel            1
       from_network   to_network  nof_collabs
388  sandboxnetwork  BroadbandTV            1
    from_network     to_network  nof_collabs
389     theonion     Fullscreen            1
390     theonion  Maker Studios            1
        from_network     to_network  nof_collabs
391  theorchardmusic  Maker Studios            1
392  theorchardmusic           None            1
      from_network to_network  nof_collabs
393  tunesat label       None            1
    from_network          to_network  nof_collabs
394  vivarecords  vivacommunications            1
    from_network           to_network  nof_collabs
395       wizdeo        Maker Studios            2
396       wizdeo  Social Blade Legacy           12
397       wizdeo       wizdeo managed            2
       from_network      to_network  nof_collabs
398  wizdeo managed     BroadbandTV            1
399  wizdeo managed  wizdeo managed            8
               from_network     to_network  nof_collabs
400  wjoPtSoNLAoX2sLBaKLYng    BroadbandTV            6
401  wjoPtSoNLAoX2sLBaKLYng     Fullscreen            1
402  wjoPtSoNLAoX2sLBaKLYng  Maker Studios            1
403  wjoPtSoNLAoX2sLBaKLYng           None            2

In [143]:
# Show the numbers of collabs in their own network in difference to outside of networks
# (from==to and from!=to)

in_out = []
test = df_network_pairs_collabs.groupby(by='from_network')
for name, group in test:
    #print name, ':'
    in_out.append( (name, group[group.to_network == name]['nof_collabs'].sum(), group[group.to_network != name]['nof_collabs'].sum()) )


df_collabs_in_out = pa.DataFrame(in_out, columns=['network', 'within', 'outreaching'])
df_collabs_in_out_vis = df_collabs_in_out[ (df_collabs_in_out.within+df_collabs_in_out.outreaching > 25)].sort_values(by='within', ascending=True)
ax = df_collabs_in_out_vis.plot(kind='barh')
ax.set_yticklabels(df_collabs_in_out_vis['network'].tolist())
ax.set_xlabel('Number of Collaborations')
ax.set_ylabel('Network')
ax.set_xscale('log')
ax.legend(loc=0, labels=['Internal', 'External'])
plt.title('Network Collaborations')
plt.tight_layout()
save_plot('collab_network_within_out.pdf', ax.get_figure(), 1.5*s_width,  1.5*s_height)



In [145]:
def compute_percentage(x):
    #print x['sum']
    pct = float(x['within'])/float(x['sum']) * 100
    return round(pct, 2)

df_collabs_in_out['sum'] = df_collabs_in_out['within'] + df_collabs_in_out['outreaching']
df_collabs_in_out['within_p'] = df_collabs_in_out.apply(compute_percentage, axis=1)

def compute_percentage(x):
    #print x
    pct = float(x['outreaching'])/float(x['sum']) * 100
    return round(pct, 2)

df_collabs_in_out['outreaching_p'] = df_collabs_in_out.apply(compute_percentage, axis=1)

df_collabs_in_out


Out[145]:
network within outreaching sum within_p outreaching_p
0 2btube 0 1 1 0.00 100.00
1 Age of Media 0 2 2 0.00 100.00
2 Alloy 0 5 5 0.00 100.00
3 Arcane 0 1 1 0.00 100.00
4 AwesomenessTV 0 16 16 0.00 100.00
5 Base79 0 1 1 0.00 100.00
6 BentPixels 0 3 3 0.00 100.00
7 BigFrame 0 12 12 0.00 100.00
8 BodyMindZone 1 10 11 9.09 90.91
9 Break 0 1 1 0.00 100.00
10 BroadbandTV 664 373 1037 64.03 35.97
11 BuzzMyVideos 0 2 2 0.00 100.00
12 ChannelFlip Affiliate 0 3 3 0.00 100.00
13 ChannelFrederator 0 2 2 0.00 100.00
14 Citizine Affiliate 0 1 1 0.00 100.00
15 ClassicClipsTV 0 5 5 0.00 100.00
16 Collab 0 6 6 0.00 100.00
17 Complex Media 0 4 4 0.00 100.00
18 CondeNastMaster 0 1 1 0.00 100.00
19 Creators Revolution 0 4 4 0.00 100.00
20 Curse 0 23 23 0.00 100.00
21 DDNPartners 0 5 5 0.00 100.00
22 DaIW2zPRWhzQ9Hj7a0QP1w 0 1 1 0.00 100.00
23 DevLounge 0 3 3 0.00 100.00
24 DigitalDynastyMedia 0 1 1 0.00 100.00
25 Divimove 0 3 3 0.00 100.00
26 Divimove IT affiliated 0 1 1 0.00 100.00
27 Emaze 0 2 2 0.00 100.00
28 Endemol beyond Brasil 0 3 3 0.00 100.00
29 EndemolEspana 1 2 3 33.33 66.67
... ... ... ... ... ... ...
62 TimeWarner 0 2 2 0.00 100.00
63 Toms Hardware 0 1 1 0.00 100.00
64 TopBeautyBlog 0 6 6 0.00 100.00
65 TubeOne 1 13 14 7.14 92.86
66 UCI Affiliate 3 4 7 42.86 57.14
67 UEFA 0 2 2 0.00 100.00
68 Very Us Network Hamburg 0 1 1 0.00 100.00
69 WhistleSportsFitness 0 1 1 0.00 100.00
70 WhiteCast 0 2 2 0.00 100.00
71 Xtreme video sas managed 0 2 2 0.00 100.00
72 YouPartnerVSP 3 6 9 33.33 66.67
73 Zoomin TV 0 2 2 0.00 100.00
74 disney interactive 0 4 4 0.00 100.00
75 e vXdMrHHseZ esYUskSBw 0 3 3 0.00 100.00
76 emipub 0 2 2 0.00 100.00
77 ingrooves 0 3 3 0.00 100.00
78 itn 0 1 1 0.00 100.00
79 melberries 0 1 1 0.00 100.00
80 neotv 0 2 2 0.00 100.00
81 nfl 0 1 1 0.00 100.00
82 onerpmweb affiliate 0 18 18 0.00 100.00
83 postgame 0 1 1 0.00 100.00
84 sandboxnetwork 0 1 1 0.00 100.00
85 theonion 0 2 2 0.00 100.00
86 theorchardmusic 0 2 2 0.00 100.00
87 tunesat label 0 1 1 0.00 100.00
88 vivarecords 0 1 1 0.00 100.00
89 wizdeo 0 16 16 0.00 100.00
90 wizdeo managed 8 1 9 88.89 11.11
91 wjoPtSoNLAoX2sLBaKLYng 0 10 10 0.00 100.00

92 rows × 6 columns


In [146]:
df_collabs_in_out[df_collabs_in_out['sum'] > 25].sort_values(by='within')


Out[146]:
network within outreaching sum within_p outreaching_p
53 ScaleLab 0 28 28 0.00 100.00
34 Fullscreen 10 118 128 7.81 92.19
55 Social Blade Legacy 14 36 50 28.00 72.00
51 PranksNetwork 25 1 26 96.15 3.85
59 StyleHaul 32 58 90 35.56 64.44
48 None 258 592 850 30.35 69.65
42 Maker Studios 364 266 630 57.78 42.22
58 Studio71 411 328 739 55.62 44.38
10 BroadbandTV 664 373 1037 64.03 35.97

In [149]:
# Show the numbers of collabs in their own network in difference to outside of networks
# (from==to and from!=to)
import matplotlib.ticker as mtick


df_collabs_in_out_vis = df_collabs_in_out[df_collabs_in_out['sum'] > 25].sort_values(by='within', ascending=True)
df_collabs_in_out_vis = df_collabs_in_out_vis[['network', 'within_p', 'outreaching_p']]
ax = df_collabs_in_out_vis.plot(kind='barh')
ax.set_yticklabels(df_collabs_in_out_vis['network'].tolist())
ax.set_xlabel('% share of collaborations')
ax.set_ylabel('Network')
ax.legend(loc=0, labels=['Internal', 'External'])
plt.title('Network Collaborations')
plt.tight_layout()
save_plot('collab_network_within_out_perc.pdf', ax.get_figure(), 1.5*s_width,  1.5*s_height)



In [77]:
df_collabs_in_out.sort_values(by=['within', 'outreaching'], ascending=False).to_csv(DIR+r'/df_network_collabs_within_out.txt', sep=str('\t'), encoding='utf-8')

In [61]:
# Collabs between channel categories

print 'number of collabs:', len(df_graph_filtered)

print df_graph_filtered[df_graph_filtered.from_category.isnull()]
print df_graph_filtered[df_graph_filtered.to_category.isnull()]

Xuniques, X = np.unique(df_graph_filtered['from_category'], return_inverse=True)
Yuniques, Y = np.unique(df_graph_filtered['to_category'], return_inverse=True)

#cats = df_graph[['to_category', 'from_category']]
cats = pa.DataFrame([X, Y]).T
#print cats
print len(Xuniques), Xuniques #, X
print len(Yuniques), Yuniques #, Y

arrc = np.zeros((len(Yuniques), len(Xuniques)))

print arrc.shape

sum = 0.0

for row in cats.iterrows():
    #print row[0], row[1][0], row[1][1]
    arrc[row[1][0], row[1][1]] += 1
    sum += 1

arr_p = arrc / sum * 100
    
fig = plt.figure()
ax = sns.heatmap(arrc, annot=True, yticklabels=Yuniques, xticklabels=Xuniques, fmt='g', mask=arrc <= 0.0)

ax.set_xticklabels(ax.get_xticklabels(),rotation=90)
ax.set_yticklabels(ax.get_yticklabels(),rotation=0)
#ax.invert_yaxis()
fig.tight_layout()
#fig.subplots_adjust(bottom=0.2)



cat_test = []
for name, group in df_graph_filtered.groupby(['from_category', 'to_category']):
    #print group.index
    cat_test.append((name[0], name[1], len(group)))

df_cat = pa.DataFrame(cat_test, columns=['from_category','to_category', 'nof_collabs'])
df_cat = df_cat.set_index(['from_category', 'to_category'])
df_cat = df_cat.sort_values(by='nof_collabs', ascending=False)

fig.tight_layout()
#fig.subplots_adjust(bottom=0.2)

index_test = df_cat.index.values
index_test1, index_test2 = zip(*index_test)

index_test1 = unique(index_test1)
index_test2 = unique(index_test2)

test = df_cat.unstack(0)
test = test.reindex(index_test1)

new_test = []
for lvl in index_test1:
    new_test.append(test[('nof_collabs', lvl)])
new_test = pa.DataFrame(new_test).T


new_test_2 = new_test.copy()

for i, row in new_test_2.iterrows():
    for j, item in row.iteritems():
        v = 0
        for l in df_graph_filtered[(df_graph_filtered.from_category==i) & (df_graph_filtered.to_category==j[1])]['videos']:
            v += len(l)
        #print i, j, item, v
        if item and v:
            new_test_2.loc[(i, j)] = v/item #item/v
        else:
            new_test_2.loc[(i, j)] = np.nan


fig = plt.figure()
ax = sns.heatmap(new_test, annot=True, yticklabels=index_test1, xticklabels=index_test1, fmt='g', cbar=False)
ax.set_xticklabels(ax.get_xticklabels(),rotation=90)
ax.set_yticklabels(ax.get_yticklabels(),rotation=0)
ax.set_xlabel('To')
ax.set_ylabel('From')
plt.title('Category Collaborations')
plt.tight_layout()
save_plot('collab_category_heatmap.pdf', fig, x_width, 1.5*x_height)


fig = plt.figure()
ax = sns.heatmap(new_test_2, annot=True, yticklabels=index_test1, xticklabels=index_test1, fmt='.0f', cbar=False)
ax.set_xticklabels(ax.get_xticklabels(),rotation=90)
ax.set_yticklabels(ax.get_yticklabels(),rotation=0)
ax.set_xlabel('To')
ax.set_ylabel('From')
plt.title('Category Collaborations')
save_plot('collab_category_heatmap_normalized.pdf', fig, x_width, 1.5*x_height)


number of collabs: 1728
Empty DataFrame
Columns: [cluster, from, from_category, from_cluster, from_network, from_popularity, from_topic, to, to_category, to_cluster, to_network, to_popularity, to_topic, videos, weight]
Index: []
Empty DataFrame
Columns: [cluster, from, from_category, from_cluster, from_network, from_popularity, from_topic, to, to_category, to_cluster, to_network, to_popularity, to_topic, videos, weight]
Index: []
15 [u'Cars & Vehicles' u'Comedy' u'Education' u'Entertainment'
 u'Film & Animation' u'Gaming' u'How-to & Style' u'Music'
 u'News & Politics' u'Non-profits & Activism' u'People & Blogs'
 u'Pets & Animals' u'Science & Technology' u'Sports' u'Travel & Events']
15 [u'Cars & Vehicles' u'Comedy' u'Education' u'Entertainment'
 u'Film & Animation' u'Gaming' u'How-to & Style' u'Music'
 u'News & Politics' u'Non-profits & Activism' u'People & Blogs'
 u'Pets & Animals' u'Science & Technology' u'Sports' u'Travel & Events']
(15, 15)

In [219]:
# Collabs between channel categories

print 'number of collabs:', len(df_graph_filtered)

print df_graph_filtered[df_graph_filtered.from_category.isnull()]
print df_graph_filtered[df_graph_filtered.to_category.isnull()]

Xuniques, X = np.unique(df_graph_filtered['from_category'], return_inverse=True)
Yuniques, Y = np.unique(df_graph_filtered['to_category'], return_inverse=True)

#cats = df_graph[['to_category', 'from_category']]
cats = pa.DataFrame([X, Y]).T
#print cats
print len(Xuniques), Xuniques #, X
print len(Yuniques), Yuniques #, Y

arrc = np.zeros((len(Yuniques), len(Xuniques)))

print arrc.shape

sum = 0.0

for row in cats.iterrows():
    #print row[0], row[1][0], row[1][1]
    arrc[row[1][0], row[1][1]] += 1
    sum += 1

arrc = arrc / sum * 100
    
print sum

sum = 0.0

def shorter(x):
    try:
        return {'Film & Animation': 'Film',
               'Cars & Vehicles': 'Cars',
               'Pets & Animals': 'Animals',
               'Travel & Events': 'Travel',
               'People & Blogs': 'People',
               'Entertainment': 'Entert.',
               'Education': 'Educ.',
               'News & Politics': 'News',
               'How-to & Style': 'How-to',
               'Science & Technology': 'Science',
               'Non-profits & Activism': 'Non-profits'}[x]
    except:
        return x

cat_test = []
for name, group in df_graph_filtered.groupby(['from_category', 'to_category']):
    #print group.index
    cat_test.append((name[0], name[1], len(group)))
    sum += len(group)

df_cat = pa.DataFrame(cat_test, columns=['from_category','to_category', 'nof_collabs'])
df_cat['nof_collabs'] = df_cat['nof_collabs'].apply(lambda x: x / sum * 100.0)
df_cat = df_cat.set_index(['from_category', 'to_category'])
df_cat = df_cat.sort_values(by='nof_collabs', ascending=False)

fig.tight_layout()
#fig.subplots_adjust(bottom=0.2)

index_test = df_cat.index.values
index_test1, index_test2 = zip(*index_test)

index_test1 = unique(index_test1)
index_test2 = unique(index_test2)

print index_test1.dtype

test = df_cat.unstack(0)
test = test.reindex(index_test1)

new_test = []
for lvl in index_test1:
    new_test.append(test[('nof_collabs', lvl)])
new_test = pa.DataFrame(new_test).T



func = np.vectorize(shorter)
index_test1 = func(index_test1)

fig = plt.figure()
ax = sns.heatmap(new_test, annot=True, yticklabels=index_test1, xticklabels=index_test1, fmt='.1f', cbar=False)
ax.set_xticklabels(ax.get_xticklabels(),rotation=90)
ax.set_yticklabels(ax.get_yticklabels(),rotation=0)
ax.set_xlabel('To')
ax.set_ylabel('From')
plt.title('% Category Collaborations')
plt.tight_layout()
save_plot('collab_category_heatmap_perc_s.pdf', fig, 1.6*s_width, 2.6*s_height)


number of collabs: 1728
Empty DataFrame
Columns: [cluster, from, from_category, from_cluster, from_network, from_popularity, from_topic, to, to_category, to_cluster, to_network, to_popularity, to_topic, videos, weight]
Index: []
Empty DataFrame
Columns: [cluster, from, from_category, from_cluster, from_network, from_popularity, from_topic, to, to_category, to_cluster, to_network, to_popularity, to_topic, videos, weight]
Index: []
15 [u'Cars & Vehicles' u'Comedy' u'Education' u'Entertainment'
 u'Film & Animation' u'Gaming' u'How-to & Style' u'Music'
 u'News & Politics' u'Non-profits & Activism' u'People & Blogs'
 u'Pets & Animals' u'Science & Technology' u'Sports' u'Travel & Events']
15 [u'Cars & Vehicles' u'Comedy' u'Education' u'Entertainment'
 u'Film & Animation' u'Gaming' u'How-to & Style' u'Music'
 u'News & Politics' u'Non-profits & Activism' u'People & Blogs'
 u'Pets & Animals' u'Science & Technology' u'Sports' u'Travel & Events']
(15, 15)
1728.0
<U22

In [63]:
fig = plt.figure()
ax = sns.heatmap(new_test, annot=True, yticklabels=index_test1, xticklabels=index_test1, fmt='g', cbar=False, annot_kws={"size": 8})
ax.set_xticklabels(ax.get_xticklabels(),rotation=90)
ax.set_yticklabels(ax.get_yticklabels(),rotation=0)
ax.set_xlabel('To')
ax.set_ylabel('From')
plt.title('Category Collaborations')
plt.tight_layout()
save_plot('collab_category_heatmap_s.pdf', fig, 1.5*s_width, 2.5*s_height)


fig = plt.figure()
ax = sns.heatmap(new_test_2, annot=True, yticklabels=index_test1, xticklabels=index_test1, fmt='.0f', cbar=False, annot_kws={"size": 8})
ax.set_xticklabels(ax.get_xticklabels(),rotation=90)
ax.set_yticklabels(ax.get_yticklabels(),rotation=0)
ax.set_xlabel('To')
ax.set_ylabel('From')
plt.title('Category Collaborations')
save_plot('collab_category_heatmap_normalized_s.pdf', fig, 1.5*s_width, 2.5*s_height)



In [64]:
# make list for gaming collabs

filtered = df_graph_filtered[df_graph_filtered['to_category']=='Gaming']


def get_name(id):
    with db._session_scope(False) as session:
        return session.query(Channel.title).filter(Channel.id == id).first()[0]
    
filtered['from_channel'] = filtered['from'].apply(get_name)
filtered['to_channel'] = filtered['to'].apply(get_name)

filtered[['from_channel', 'from_category', 'from_cluster', 'from_popularity', 'to_channel', 'to_category', 'to_cluster', 'to_popularity', 'videos', 'weight']].to_csv(DIR+r'/gaming_collabs.csv', sep=str('\t'), encoding='utf-8')


/home/mlode/intel/intelpython27/lib/python2.7/site-packages/ipykernel/__main__.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/mlode/intel/intelpython27/lib/python2.7/site-packages/ipykernel/__main__.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [49]:
# prefiltered cluster analyse atm not interesting as proba and persistence of hdbscan is not very diverse
# because we only used the first 20 cluster instead of 100

#df_collabs_prefiltered = pa.read_csv(DIR+r'/hdb_collab_prefiltered_cluster.txt', sep=str('\t'), encoding='utf-8')
#df_collabs_prefiltered.head()

pers = np.load(DIR+'/hdbscan_pers.npy')
proba = np.load(DIR+'/hdbscan_proba.npy')


---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-49-8eb74a24d71f> in <module>()
      5 #df_collabs_prefiltered.head()
      6 
----> 7 pers = np.load(DIR+'/hdbscan_pers.npy')
      8 proba = np.load(DIR+'/hdbscan_proba.npy')

/home/mlode/intel/intelpython27/lib/python2.7/site-packages/numpy/lib/npyio.pyc in load(file, mmap_mode, allow_pickle, fix_imports, encoding)
    368     own_fid = False
    369     if isinstance(file, basestring):
--> 370         fid = open(file, "rb")
    371         own_fid = True
    372     elif is_pathlib_path(file):

IOError: [Errno 2] No such file or directory: '../../data/data_evaluation_3MONTHS/hdbscan_pers.npy'

In [55]:
fig = plt.figure()
ax1 = sns.distplot(proba, kde=False)

fig = plt.figure()
ax2 = sns.distplot(proba, hist=False)

fig = plt.figure()
ax3 = sns.distplot(proba)



In [56]:
fig = plt.figure()
ax1 = sns.distplot(pers, kde=False)

fig = plt.figure()
ax2 = sns.distplot(pers, hist=False)

fig = plt.figure()
ax3 = sns.distplot(pers)


/home/mlode/intel/intelpython27/lib/python2.7/site-packages/statsmodels/nonparametric/kde.py:494: RuntimeWarning: invalid value encountered in divide
  binned = fast_linbin(X,a,b,gridsize)/(delta*nobs)
/home/mlode/intel/intelpython27/lib/python2.7/site-packages/statsmodels/nonparametric/kde.py:494: RuntimeWarning: invalid value encountered in true_divide
  binned = fast_linbin(X,a,b,gridsize)/(delta*nobs)
/home/mlode/intel/intelpython27/lib/python2.7/site-packages/statsmodels/nonparametric/kdetools.py:34: RuntimeWarning: invalid value encountered in double_scalars
  FAC1 = 2*(np.pi*bw/RANGE)**2

In [57]:
sns.jointplot(x=pers, y=proba)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-57-957c5c74bffe> in <module>()
----> 1 sns.jointplot(x=pers, y=proba)

/home/mlode/intel/intelpython27/lib/python2.7/site-packages/seaborn/distributions.pyc in jointplot(x, y, data, kind, stat_func, color, size, ratio, space, dropna, xlim, ylim, joint_kws, marginal_kws, annot_kws, **kwargs)
    796     grid = JointGrid(x, y, data, dropna=dropna,
    797                      size=size, ratio=ratio, space=space,
--> 798                      xlim=xlim, ylim=ylim)
    799 
    800     # Plot the data using the grid

/home/mlode/intel/intelpython27/lib/python2.7/site-packages/seaborn/axisgrid.pyc in __init__(self, x, y, data, size, ratio, space, dropna, xlim, ylim)
   1657         # Possibly drop NA
   1658         if dropna:
-> 1659             not_na = pd.notnull(x) & pd.notnull(y)
   1660             x = x[not_na]
   1661             y = y[not_na]

ValueError: operands could not be broadcast together with shapes (14010,) (258726,) 

In [ ]: