In [120]:
#!/usr/bin/env python
# coding=utf-8

import pandas as pa 
import numpy as np

import json
import os
import networkx as nx
import pygraphviz as gz
from networkx.drawing.nx_pydot import write_dot
import math

import matplotlib
import matplotlib.pyplot as plt
%matplotlib notebook

import itertools

import csv
from sqlalchemy import exists, func

from database import *

from matplotlib import pylab, pyplot
from matplotlib import dates

import statsmodels
import seaborn as sns
sns.set(color_codes=True)

current_palette = sns.color_palette()
first = current_palette[0]
second = current_palette[1]
sns.set_palette(
    [second, first] + current_palette[2:]
)

from scipy import stats
from scipy.stats import norm, normaltest, mannwhitneyu, ranksums

from datetime import datetime, timedelta, date

date_format = '%Y-%m-%dT%H:%M:%S.%fZ'
date_format2 = '%Y-%m-%d %H:%M:%S'

plt.style.use(['seaborn-paper'])
sns.set_style("whitegrid")
#plt.rc('font', family='serif', serif='Charter')
plt.rc('font', family='serif', serif='DejaVu Serif')

SMALL_SIZE = 8
MEDIUM_SIZE = 9
BIGGER_SIZE = 13

plt.rc('font', size=MEDIUM_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)    # legend fontsize
plt.rc('figure', titlesize=MEDIUM_SIZE)  # fontsize of the figure title

x_width  = 6.8898
x_height = x_width / 1.618

s_width  = 3.4449
s_height = s_width / 1.618

def save_plot(name, fig, width, height):
    #fig.tight_layout()
    fig.set_size_inches(width, height)
    #f.subplots_adjust(top=0.86)

    fig.savefig(CDIR+'/'+name, bbox_inches="tight")
    #plt.savefig(CDIR+'/video_view_percentages.pdf', bbox_inches="tight")

In [121]:
DIR = '../../data/data_evaluation_3MONTHS_filtered'
CDIR = '../../data/data_evaluation_3MONTHS_filtered/charts'

db = YTDatabase()

In [122]:
# read data from channel statistics notebook
df_channel = pa.read_csv(DIR+r'/df_channel_statistics_first_day.txt', sep=str('\t'), encoding='utf-8')

In [123]:
# read data from collaboration notebook
#df_graph = pa.read_csv(DIR+r'/df_collabs.txt', sep=str('\t'), encoding='utf-8', index_col=0)
df_graph_filtered = pa.read_csv(DIR+r'/df_filtered_collabs.txt', sep=str('\t'), encoding='utf-8', index_col=0)


df_graph_filtered['from_cluster'] = df_graph_filtered['from_cluster'].apply(json.loads)
df_graph_filtered['to_cluster'] = df_graph_filtered['to_cluster'].apply(json.loads)
df_graph_filtered['videos'] = df_graph_filtered['videos'].apply(json.loads)

#print len(df_graph)
print len(df_graph_filtered)


df_graph_filtered.head()


1728
Out[123]:
cluster from from_category from_cluster from_network from_popularity from_topic to to_category to_cluster to_network to_popularity to_topic videos weight
0 4806 UCw-hc7ZJummS0AvWyjUX56A Sports [3102, 4806, 12796] None 3 Fitness UCWYtZYH4kcbMm29liIOSGQQ Sports [2996, 3143, 5232, 7762] BroadbandTV 3 Sports [Q-Z7QYkqn2M] 1
1 5556 UClpEE-Led9ZK0GJQKvU--3Q Entertainment [5556, 5884, 7050] Maker Studios 2 Football UCIKF1msqN7lW9gplsifOPkQ Entertainment [12627] BroadbandTV 3 Music [On4GE5hAU8s] 1
2 5556 UClpEE-Led9ZK0GJQKvU--3Q Entertainment [5556, 5884, 7050] Maker Studios 2 Football UC3M4u8_WwqY-2xDbJXxo5eQ Gaming [4139, 7049] OmniaMediaCo 3 Sports game [4cEkXTIcvH4] 1
3 6135 UCMDz09-3zO1hm1pqRA-Er0A Entertainment [6135] BroadbandTV 3 Lifestyle UC3fxB7rF6T7wqymKUJxwmXA Entertainment BroadbandTV 3 Lifestyle [C7Waq5ZgqGU] 1
4 1367 UCUcBFGAfOzut1x4GSWa6Akg Entertainment [1367, 4731, 5365, 6760] Maker Studios 3 Music UCCk_JV7ar8HlxWZRXQKgS7w People & Blogs BroadbandTV 2 Lifestyle [RJir4ghyTDo] 1

In [ ]:


In [124]:
# Number of actual collab videos
videos = []
for i, row in df_graph_filtered.iterrows():
    videos.extend(row['videos'])
    
print len(videos)


3925

In [6]:
# get history statistscs from db of videos

with db._session_scope(False) as session:

    df_video_history = pa.read_sql(session.query(VideoHistory).statement, db.engine)

In [125]:
len(df_graph_filtered['to'].unique())


Out[125]:
1114

In [10]:
df_video_history = df_video_history.set_index(['id'])

df_video_history['crawlTimestamp'] = df_video_history['crawlTimestamp'].apply(lambda x: datetime.strptime(x, date_format2))

df_video_history.head()


Out[10]:
videoID viewCount commentCount likeCount dislikeCount crawlTimestamp
id
1 qBZKeIbNDcE 9 0 1 0 2016-12-28 03:03:22
2 2A6NSm9aSVQ 139 5 14 2 2016-12-28 03:03:23
3 0hAym2XYn9w 60 3 9 0 2016-12-28 03:03:48
4 VGHx-lcdJps 258 21 64 2 2016-12-28 03:03:49
5 xYi6uCXmCFo 337 8 94 1 2016-12-28 03:03:52

In [12]:
# Video history grouping
video_groups = df_video_history.groupby(['videoID'])

print len(df_video_history), len(video_groups)

# filtering too young videos for half life study of 6 days
vcounts = video_groups['viewCount'].count()
mask = vcounts[vcounts > 11].index

df_video_history_filtered = df_video_history[df_video_history.videoID.isin(mask)]

video_groups_filtered = df_video_history_filtered.groupby(['videoID'])

print len(df_video_history_filtered), len(video_groups_filtered)


13035916 232879
12899364 207380

In [13]:
# video sample statistics

from sklearn import preprocessing

name = 'ISLuff63OlI'
group = video_groups.get_group(name)

print name
fig, axs = plt.subplots(ncols=4)
plt.title(name)
axs[0].plot_date(group['crawlTimestamp'], group['viewCount'], 'b-')
axs[0].axvline(group.iloc[0, 5] + timedelta(days=6), color='red', linewidth=.5)
axs[1].plot_date(group['crawlTimestamp'], group['commentCount'], 'b-')
axs[2].plot_date(group['crawlTimestamp'], group['likeCount'], 'b-')
axs[3].plot_date(group['crawlTimestamp'], group['dislikeCount'], 'r-')

    
print group['crawlTimestamp'].min()
print group['crawlTimestamp'].max()
print 'Age:', (group['crawlTimestamp'].max() - group['crawlTimestamp'].min()).days

# views percentage
# Create x, where x the scores columns values as floats
x = group['viewCount'].values.astype(float)

# Create a minimum and maximum processor object
min_max_scaler = preprocessing.MinMaxScaler()

# Create an object to transform the data to fit minmax processor
x_scaled = min_max_scaler.fit_transform(x)

# Run the normalizer on the dataframe
group['viewCount_norm'] = x_scaled

print 'First 6 days made', group[(group.crawlTimestamp <= group.iloc[0, 5] + timedelta(days=6))]['viewCount_norm'].max(), '%'

#group


ISLuff63OlI
2017-01-08 00:07:01
2017-04-25 11:06:00
Age: 107
First 6 days made 0.745408929895 %
/home/mlode/intel/intelpython27/lib/python2.7/site-packages/sklearn/preprocessing/data.py:321: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
/home/mlode/intel/intelpython27/lib/python2.7/site-packages/sklearn/preprocessing/data.py:356: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
/home/mlode/intel/intelpython27/lib/python2.7/site-packages/ipykernel/__main__.py:33: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [14]:
# multi sample channel statistics
import random


sampled_df_i  = random.sample(video_groups.indices, 10)
df_list  = map(lambda df_i: (df_i, video_groups.get_group(df_i)), sampled_df_i)


for name, group in df_list:
    print name
    fig, axs = plt.subplots(ncols=4)
    plt.title(name)
    axs[0].plot_date(group['crawlTimestamp'], group['viewCount'], 'b-')
    axs[0].axvline(group.iloc[0, 5] + timedelta(days=6), color='red', linewidth=.5)
    axs[1].plot_date(group['crawlTimestamp'], group['commentCount'], 'b-')
    axs[2].plot_date(group['crawlTimestamp'], group['likeCount'], 'b-')
    axs[3].plot_date(group['crawlTimestamp'], group['dislikeCount'], 'r-')


zyRcHomiM5c
99_k3EC7sVM
01RW7wZbT8U
TmMZY5NUNGM
E57PBLnJjIA
anT-y3Px4QM
c6c33WN-pMs
ThliEAOvVqc
BCwGKfmGpXc
HHWf12L-JcM

In [15]:
# difference between video stats of collab and non_collab videos of same channel
# are collab videos more popular than normal for channel?

In [16]:
# create boxplot per channel, collab noncollab videos compared
# comparing what?
# viewcount gradient?
# viewcount max, mean? -> max mean of collab, non_collab videos
# consider age of video? take only first 6 days into account

In [14]:
# age dist, for all videos

df_video_age = []

for name, group in video_groups:

    age = (group['crawlTimestamp'].max() - group['crawlTimestamp'].min()).days
    df_video_age.append((name, age))
    
df_video_age = pa.DataFrame(df_video_age, columns=['id', 'age'])


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-14-2b334d46f8ad> in <module>()
      5 for name, group in video_groups:
      6 
----> 7     age = (group['crawlTimestamp'].max() - group['crawlTimestamp'].min()).days
      8     df_video_age.append((name, age))
      9 

/home/mlode/intel/intelpython27/lib/python2.7/site-packages/pandas/core/generic.pyc in stat_func(self, axis, skipna, level, numeric_only, **kwargs)
   5611                                       skipna=skipna)
   5612         return self._reduce(f, name, axis=axis, skipna=skipna,
-> 5613                             numeric_only=numeric_only)
   5614 
   5615     return set_function_name(stat_func, name, cls)

/home/mlode/intel/intelpython27/lib/python2.7/site-packages/pandas/core/series.pyc in _reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
   2316                                           'numeric_only.'.format(name))
   2317             with np.errstate(all='ignore'):
-> 2318                 return op(delegate, skipna=skipna, **kwds)
   2319 
   2320         return delegate._reduce(op=op, name=name, axis=axis, skipna=skipna,

/home/mlode/intel/intelpython27/lib/python2.7/site-packages/pandas/core/nanops.pyc in f(values, axis, skipna, **kwds)
    103                         result = alt(values, axis=axis, skipna=skipna, **kwds)
    104                 else:
--> 105                     result = alt(values, axis=axis, skipna=skipna, **kwds)
    106             except Exception:
    107                 try:

/home/mlode/intel/intelpython27/lib/python2.7/site-packages/pandas/core/nanops.pyc in reduction(values, axis, skipna)
    433     def reduction(values, axis=None, skipna=True):
    434         values, mask, dtype, dtype_max = _get_values(
--> 435             values, skipna, fill_value_typ=fill_value_typ, )
    436 
    437         if ((axis is not None and values.shape[axis] == 0) or

/home/mlode/intel/intelpython27/lib/python2.7/site-packages/pandas/core/nanops.pyc in _get_values(values, skipna, fill_value, fill_value_typ, isfinite, copy)
    182         mask = _isfinite(values)
    183     else:
--> 184         mask = isnull(values)
    185 
    186     dtype = values.dtype

/home/mlode/intel/intelpython27/lib/python2.7/site-packages/pandas/types/missing.pyc in isnull(obj)
     43     pandas.notnull: boolean inverse of pandas.isnull
     44     """
---> 45     return _isnull(obj)
     46 
     47 

/home/mlode/intel/intelpython27/lib/python2.7/site-packages/pandas/types/missing.pyc in _isnull_new(obj)
     53         raise NotImplementedError("isnull is not defined for MultiIndex")
     54     elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass)):
---> 55         return _isnull_ndarraylike(obj)
     56     elif isinstance(obj, ABCGeneric):
     57         return obj._constructor(obj._data.isnull(func=isnull))

/home/mlode/intel/intelpython27/lib/python2.7/site-packages/pandas/types/missing.pyc in _isnull_ndarraylike(obj)
    143     elif needs_i8_conversion(obj):
    144         # this is the NaT pattern
--> 145         result = values.view('i8') == iNaT
    146     else:
    147         result = np.isnan(values)

KeyboardInterrupt: 

In [118]:
df_video_age['date'] = df_video_age['age'].apply(lambda x: date(2016, 12, 28) + timedelta(days=x))
df_video_age['dayn'] = df_video_age['date'].apply(lambda x: x.strftime("%A"))

print df_video_age['age'].min()

#ax = df_video_age[['date']].groupby(df_video_age['date']).count().plot(kind='bar')
test = df_video_age[['dayn']].groupby(df_video_age['dayn']).count()
order = [u'Monday',u'Tuesday',u'Wednesday',u'Thursday',u'Friday',u'Saturday',u'Sunday']


fig, axs = plt.subplots(1, 2)

#fig, ax = plt.subplots()
ax = test.ix[order].plot(kind='bar',legend=False, ax=axs[1])
ax.set_xlabel('Weekday')
ax.set_ylabel('Videos')
fig.tight_layout()
axs[1].set_title('Videos per Weekday')
#save_plot('video_weekday_counts.pdf', fig, s_width, s_height)

#ax.set_xticks(ax.get_xticks()[::2])
#fig = plt.figure()
ax = sns.distplot(df_video_age['age'], bins=100, ax=axs[0])
ax.set_xlabel('Age (Days)')
ax.set_ylabel('Density')
fig.tight_layout()
axs[0].set_title('Video Age')

save_plot('video_age_and_days.pdf', fig, 2*s_width, s_height)
#ax.set_xticklabels(df_video_age['date'])
#age_groups = range(0, 70, 5)
#print age_groups
#for age in age_groups:
#    ax.axvline(age, color='red', linewidth=.5)



df_video_age.head()


0
Out[118]:
id age date dayn
0 --0s03JYIww 27 2017-01-24 Tuesday
1 --1lgNZ9cvY 14 2017-01-11 Wednesday
2 --4kDK_RBjg 48 2017-02-14 Tuesday
3 --4n0eZ8XTQ 1 2016-12-29 Thursday
4 --5mUkeFBmI 56 2017-02-22 Wednesday

In [126]:
# get all videos
with db._session_scope(False) as session:

    df_videos = pa.read_sql(session.query(Video).statement, db.engine)
    

df_videos = df_videos.set_index(['id'])

df_videos['dateAdded'] = df_videos['dateAdded'].apply(lambda x: datetime.strptime(x, date_format))
df_videos['crawlTimestamp'] = df_videos['crawlTimestamp'].apply(lambda x: datetime.strptime(x, date_format2))


# filter videos only related to channel with collabs

df_videos_collabs = df_videos[ (df_videos.channelID.isin(df_graph_filtered['from'].tolist())) | (df_videos.channelID.isin(df_graph_filtered['to'].tolist()))]
df_videos_collabs_without_gaming = df_videos[ (df_videos.channelID.isin(df_graph_filtered['from'].tolist())) | (df_videos.channelID.isin(df_graph_filtered['to'].tolist()))]

#df_videos_collabs.head()

print 'All videos:', len(df_videos)
print 'Collab related videos:', len(df_videos_collabs)
print 'Actual collab videos:', len(videos)

df_videos_collabs.head()


All videos: 250420
Collab related videos: 81942
Actual collab videos: 3925
Out[126]:
channelID title description category dateAdded tags topicIds attribution duration crawlTimestamp deleted
id
__3e1lrV5c8 UCajyfb5EUPd3f3deUk0dBEQ NEUE DM ZAHNBÜRSTEN PINSEL im LIVE TEST | Mami... Neu bei dm gibt es die gehypten Zahnbürsten Pi... 26 2017-02-19 11:30:00 ["mamiseelen","dm zahnb\u00fcrsten pinsel","li... ["/m/0yxzwny","/m/012xff"] None PT7M41S 2017-02-20 00:10:55 None
__3Pz5Hmm4c UCw6Ou-fRcPa23GvYq-VHEbA IS LONZO BALL'S $495 SIGNATURE SHOE THE ZO2 WO... ENJOY AND LEAVE A LIKE!\nPodcast Link - https:... 26 2017-05-06 04:12:59 ["lonzo","ball","zo2","lonzo ball sneaker","bi... [] None PT11M16S 2017-05-07 00:12:42 None
__6aGbnZHXQ UCFdIIKy9ZQuIuRnpXLVabfQ ​Provando Doce de 1 Dolar - Dollar Tree ? Vit... ​ ? Clique Gostei, Comente, INSCR... 22 2017-04-05 23:00:01 ["vitoriademattia diario8765redfwhgy24","TAG V... [] None PT21M56S 2017-04-07 00:12:54 None
__9yzCbryRw UCBkZWl8s1Cg22Yw11O6A-PQ A Futile Effort!/Hope Incarnate! | Made By Son... READ THIS :D\nI really Love this Song a lot. I... 23 2017-02-12 19:32:47 ["Hip Hop","Sonic The Ghetto-Hog productions",... ["/m/08b26_"] None PT4M10S 2017-02-13 00:10:02 None
__a6tnmzoU4 UCxL7KXCV5YW0KdPRV583VJQ NEW YEAR NEW BABY!? WHATS IN MY BAG? → http://bit.ly/2iO7QLz\nWATC... 22 2017-01-13 20:00:30 ["new year","new years","baby","pregnant","new... [] None PT7M32S 2017-01-14 00:13:27 None

In [127]:
with db._session_scope(False) as session:

    df_channel_history_collabs = pa.read_sql(session.query(ChannelHistory).filter((ChannelHistory.channelID.in_(df_graph_filtered['from'].tolist())) | (ChannelHistory.channelID.in_(df_graph_filtered['to'].tolist()))).statement, db.engine)

channel_collabs_groups = df_channel_history_collabs.groupby(['channelID'])

print len(df_channel_history_collabs), len(channel_collabs_groups)


191320 1599

In [261]:
print len(df_graph_filtered)

df_collab_channels = pa.Series([])

df_collab_channels = df_collab_channels.append(df_graph_filtered['to'])
df_collab_channels = df_collab_channels.append(df_graph_filtered['from'])

print len(df_collab_channels.unique())
df_collab_channels.head()


1728
1599
Out[261]:
0    UCWYtZYH4kcbMm29liIOSGQQ
1    UCIKF1msqN7lW9gplsifOPkQ
2    UC3M4u8_WwqY-2xDbJXxo5eQ
3    UC3fxB7rF6T7wqymKUJxwmXA
4    UCCk_JV7ar8HlxWZRXQKgS7w
dtype: object

In [ ]:
# mark videos collab or non_collab
# create video collab list

df_videos_collabs['collab_window'] = 'Non_Collab'

for name, group in channel_collabs_groups:
    videos = []
    # own videos
    for index, row in df_graph_filtered[df_graph_filtered['to'] == name].iterrows():
        #print row.videos
        for vid in row['videos']:
            df_videos_collabs.ix[vid, 'collab_window'] = 'Collab'

    # external videos (hes in)
    for index, row in df_graph_filtered[df_graph_filtered['from'] == name].iterrows():
        #print row.videos
        for vid in row['videos']:
            df_videos_collabs.ix[vid, 'collab_window'] = 'Collab'
            
print len(df_videos_collabs)
df_videos_collabs.head()

In [133]:
# FIX read old collabs list, filter with new
# df_videos_collabs_without_gaming

df_videos_collabs = pa.read_csv(DIR+r'/df_video_collabs_stats.txt', sep=str('\t'), encoding='utf-8')
#df_videos_collabs=df_videos_collabs.set_index(['id'])
print len(df_videos_collabs)

print len(df_videos_collabs_without_gaming)
mask = df_videos_collabs_without_gaming.index

df_videos_collabs = df_videos_collabs[df_videos_collabs.id.isin(mask)]
df_videos_collabs=df_videos_collabs.set_index(['id'])
print len(df_videos_collabs)


167389
81942
81486

In [134]:
print len(df_videos_collabs)

print len(df_videos_collabs[df_videos_collabs.collab_window=='Collab'])

print len(df_videos_collabs[df_videos_collabs.collab_window=='Non_Collab'])
#df_videos_collabs.to_csv(DIR+r'/df_videos_collab.txt', sep=str('\t'), encoding='utf-8')


81486
4234
77252

In [ ]:
# calculate video view gradient like for channel, see if difference for collabs

#video_groups_filtered = df_video_history_filtered

print len(video_groups_filtered)
print len(df_videos_collabs)
mask = df_videos_collabs.index

df_video_history_collabs = df_video_history_filtered[df_video_history_filtered.videoID.isin(mask)]
video_history_collab_group = df_video_history_collabs.groupby('videoID')
print len(video_history_collab_group)

name = 'ISLuff63OlI'
group = video_groups_filtered.get_group(name)
group

In [ ]:
# fix zero entries

for name, group in video_history_collab_group:
    if 0 in group['viewCount'].values:
        print name
        subs = group['viewCount'].values.astype(float)
        subs[subs==0] = 1.0
        print subs
        df_video_history_collabs.ix[group.index, 'viewCount'] = subs

In [ ]:
# label every video history point either collab or non_collab
# calc gradient and p_gradient of viewcounts
# compare gradients then

df_video_history_collabs['gradient'] = np.nan
df_video_history_collabs['p_gradient'] = np.nan
df_video_history_collabs['collab_window'] = 'Non_Collab'

cnt=0

for name, group in video_history_collab_group:
    
    if len(group['viewCount']) < 2:
        cnt += 1
        continue
    
    subs = group['viewCount'].values
    group.drop(group.index[:1], inplace=True)
    df_video_history_collabs.ix[group.index, 'gradient'] = np.diff(subs)
    df_video_history_collabs.ix[group.index, 'p_gradient'] = [100.0 * a1 / a2 - 100 for a1, a2 in zip(subs[1:], subs)]

    # set collab or non collab
    df_video_history_collabs.ix[group.index, 'collab_window'] = df_videos_collabs.ix[name, 'collab_window']


print cnt

print 'Non_Collabs:'
print df_video_history_collabs[df_video_history_collabs.collab_window=='Non_Collab']['gradient'].describe()
print '\nCollabs:'
print df_video_history_collabs[df_video_history_collabs.collab_window=='Collab']['gradient'].describe()

print '\nNon_Collabs:'
print df_video_history_collabs[df_video_history_collabs.collab_window=='Non_Collab']['p_gradient'].describe()
print '\nCollabs:'
print df_video_history_collabs[df_video_history_collabs.collab_window=='Collab']['p_gradient'].describe()

In [135]:
print 'Non_Collabs:'
print df_video_history_collabs[df_video_history_collabs.collab_window=='Non_Collab']['gradient'].describe()
print '\nCollabs:'
print df_video_history_collabs[df_video_history_collabs.collab_window=='Collab']['gradient'].describe()

print '\nNon_Collabs:'
print df_video_history_collabs[df_video_history_collabs.collab_window=='Non_Collab']['p_gradient'].describe()
print '\nCollabs:'
print df_video_history_collabs[df_video_history_collabs.collab_window=='Collab']['p_gradient'].describe()


Non_Collabs:
count    8.048123e+06
mean     2.236156e+03
std      2.439116e+04
min     -9.052200e+04
25%      6.000000e+00
50%      5.300000e+01
75%      4.050000e+02
max      7.837219e+06
Name: gradient, dtype: float64

Collabs:
count    4.840720e+05
mean     3.034038e+03
std      2.935988e+04
min     -3.096500e+04
25%      9.000000e+00
50%      8.500000e+01
75%      6.900000e+02
max      5.809283e+06
Name: gradient, dtype: float64

Non_Collabs:
count    8.048123e+06
mean     4.044399e+01
std      2.812795e+04
min     -6.056647e+01
25%      6.521876e-02
50%      2.594559e-01
75%      8.672448e-01
max      5.090080e+07
Name: p_gradient, dtype: float64

Collabs:
count    4.840720e+05
mean     3.857606e+02
std      2.580275e+05
min     -8.309859e+01
25%      7.127940e-02
50%      2.492748e-01
75%      7.865941e-01
max      1.794914e+08
Name: p_gradient, dtype: float64

In [136]:
print len(df_video_history_collabs)

print len(df_video_history_collabs[df_video_history_collabs.gradient.notnull()])

df_video_history_collabs.to_csv(DIR+r'/df_video_history_collabs.txt', sep=str('\t'), encoding='utf-8')


8671487
8532195

In [137]:
# FIX
# Filter out the gaming videos

print len(df_video_history_collabs)
print len(df_videos_collabs)
mask = df_videos_collabs.index

df_video_history_collabs = df_video_history_collabs[df_video_history_collabs.videoID.isin(mask)]
video_history_collab_group = df_video_history_collabs.groupby('videoID')
print len(df_video_history_collabs)


8671487
81486
67648

In [138]:
print len(df_video_history_collabs)


4180959

In [141]:
df_video_history_collabs.head()


Out[141]:
videoID viewCount commentCount likeCount dislikeCount crawlTimestamp gradient p_gradient collab_window
id
1 qBZKeIbNDcE 9.0 0 1 0 2016-12-28 03:03:22 NaN NaN Non_Collab
2 2A6NSm9aSVQ 139.0 5 14 2 2016-12-28 03:03:23 NaN NaN Non_Collab
7 H_PpwvS8hA8 127.0 5 9 0 2016-12-28 03:04:08 NaN NaN Non_Collab
9 lsN0rGSfXWw 122.0 4 37 0 2016-12-28 03:04:36 NaN NaN Non_Collab
18 Azqy_VVj1PI 47536.0 2072 13325 60 2016-12-29 00:06:15 NaN NaN Non_Collab

In [ ]:
#df_video_history_collabs = pa.read_csv(DIR+r'/df_video_history_collabs.txt', sep=str('\t'), encoding='utf-8')

#df_video_history_collabs['crawlTimestamp'] = df_video_history_collabs['crawlTimestamp'].apply(lambda x: datetime.strptime(x, date_format2))
#video_history_collab_group = df_video_history_collabs.groupby('videoID')

In [249]:
df_video_history_collabs_test = df_video_history_collabs[df_video_history_collabs.p_gradient<10000]
df_video_history_collabs_test.head()


Out[249]:
videoID viewCount commentCount likeCount dislikeCount crawlTimestamp gradient p_gradient collab_window day_after
id
1864 qBZKeIbNDcE 422.0 27 51 2 2016-12-29 00:11:56 413.0 4588.888889 Non_Collab -1
1865 2A6NSm9aSVQ 8883.0 138 409 18 2016-12-29 00:11:56 8744.0 6290.647482 Non_Collab -1
1870 H_PpwvS8hA8 10086.0 107 374 12 2016-12-29 00:11:56 9959.0 7841.732283 Non_Collab -1
1872 lsN0rGSfXWw 1477.0 21 437 0 2016-12-29 00:11:56 1355.0 1110.655738 Non_Collab -1
3932 Azqy_VVj1PI 115837.0 3484 30545 135 2016-12-30 00:12:13 68301.0 143.682683 Non_Collab -1

In [58]:
df_video_history_collabs_test.crawlTimestamp.dtype


Out[58]:
dtype('<M8[ns]')

In [250]:
len(df_video_history_collabs[df_video_history_collabs.p_gradient>10000])


Out[250]:
645

In [144]:
fig, (ax1, ax2) = plt.subplots(ncols=2)
sns.violinplot(x="collab_window", y="gradient", data=df_video_history_collabs_test, ax=ax1, inner='box')
sns.barplot(x="collab_window", y="gradient", data=df_video_history_collabs_test, ci=99,errwidth=1., capsize=.1, ax=ax2)
plt.legend(["{}% CI".format(99)])
ax1.set_xlabel('')
ax2.set_xlabel('')
ax1.set_ylabel('Views')
ax2.set_ylabel('mean(Views)')
fig.suptitle('Video view-count')
fig.tight_layout()
fig.subplots_adjust(top=0.86)

fig, (ax1, ax2) = plt.subplots(ncols=2)
sns.violinplot(x="collab_window", y="p_gradient", data=df_video_history_collabs_test, ax=ax1, inner='box')
sns.barplot(x="collab_window", y="p_gradient", data=df_video_history_collabs_test, ci=99,errwidth=1., capsize=.1, ax=ax2)
plt.legend(["{}% CI".format(99)])
ax1.set_xlabel('')
ax2.set_xlabel('')
ax1.set_ylabel('Views %')
ax2.set_ylabel('mean(Views %)')
fig.suptitle('Video view-count %')
fig.tight_layout()
fig.subplots_adjust(top=0.86)


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-144-0a5c1bc7d899> in <module>()
      1 fig, (ax1, ax2) = plt.subplots(ncols=2)
      2 sns.violinplot(x="collab_window", y="gradient", data=df_video_history_collabs_test, ax=ax1, inner='box')
----> 3 sns.barplot(x="collab_window", y="gradient", data=df_video_history_collabs_test, ci=99,errwidth=1., capsize=.1, ax=ax2)
      4 plt.legend(["{}% CI".format(99)])
      5 ax1.set_xlabel('')

/home/mlode/intel/intelpython27/lib/python2.7/site-packages/seaborn/categorical.pyc in barplot(x, y, hue, data, order, hue_order, estimator, ci, n_boot, units, orient, color, palette, saturation, errcolor, errwidth, capsize, ax, **kwargs)
   2897                           estimator, ci, n_boot, units,
   2898                           orient, color, palette, saturation,
-> 2899                           errcolor, errwidth, capsize)
   2900 
   2901     if ax is None:

/home/mlode/intel/intelpython27/lib/python2.7/site-packages/seaborn/categorical.pyc in __init__(self, x, y, hue, data, order, hue_order, estimator, ci, n_boot, units, orient, color, palette, saturation, errcolor, errwidth, capsize)
   1543                                  order, hue_order, units)
   1544         self.establish_colors(color, palette, saturation)
-> 1545         self.estimate_statistic(estimator, ci, n_boot)
   1546 
   1547         self.errcolor = errcolor

/home/mlode/intel/intelpython27/lib/python2.7/site-packages/seaborn/categorical.pyc in estimate_statistic(self, estimator, ci, n_boot)
   1449                     boots = bootstrap(stat_data, func=estimator,
   1450                                       n_boot=n_boot,
-> 1451                                       units=unit_data)
   1452                     confint.append(utils.ci(boots, ci))
   1453 

/home/mlode/intel/intelpython27/lib/python2.7/site-packages/seaborn/algorithms.pyc in bootstrap(*args, **kwargs)
     73     for i in range(int(n_boot)):
     74         resampler = rs.randint(0, n, n)
---> 75         sample = [a.take(resampler, axis=0) for a in args]
     76         boot_dist.append(func(*sample, **func_kwargs))
     77     return np.array(boot_dist)

KeyboardInterrupt: 

In [274]:
# Boxplot viewcount gradient for collab videos 

fig, (ax1, ax2) = plt.subplots(ncols=2)
sns.boxplot(x="collab_window", y="gradient", data=df_video_history_collabs_test, ax=ax1, order=['Collab', 'Non_Collab'])
sns.barplot(x="collab_window", y="gradient", data=df_video_history_collabs_test, ci=99, errwidth=1., capsize=.1, ax=ax2, order=['Collab', 'Non_Collab'])
plt.legend(["{}% CI".format(99)])
ax1.set_xlabel('')
ax2.set_xlabel('')
ax1.set_ylabel('Views Gradient')
ax2.set_ylabel('mean(Views Gradient)')
ax1.set_xticklabels(['Collaboration', 'Non-Collaboration'])
ax2.set_xticklabels(['Collaboration', 'Non-Collaboration'])
fig.suptitle('Video Views Gradient')
fig.tight_layout()
fig.subplots_adjust(top=0.86)
ax1.set_ylim([-1600.0, 3500.0])
ax2.set_ylim([3100.0, 4400.0])
save_plot('collab_video_views_box_ci.pdf', fig, 2*s_width, 0.75*s_height)

# test, norm fitting
test = df_video_history_collabs_test[df_video_history_collabs_test["gradient"].notnull()]

# Distribution fitting
# norm.fit(data) returns a list of two parameters 
# (mean, parameters[0] and std, parameters[1]) via a MLE(Maximum-Likelihood) approach 
# to data, which should be in array form.
(mu, sigma) = stats.norm.fit(test['gradient'])

fig = plt.figure()
ax = sns.distplot(test['gradient'], fit=norm, kde=False, bins=1000, norm_hist=True)
#plt.legend(["$\mu=${0:.2g}, $\sigma=${1:.2f}".format(mu, sigma)])
#plt.legend(["Fitted normal dist"])
ax.set_xlabel('Views')
ax.set_ylabel('Density')
plt.title('Video Views')
ax.set_ylim([0.0, 1.5e-05])
ax.set_xlim([-260000.0, 260000.0])
ax.set_xticks(ax.get_xticks()[::2])
ax.set_yticks(ax.get_yticks()[::2])
plt.tight_layout()
plt.legend(["Fitted Gaussian"],prop={'size':8})
save_plot('collab_video_views_norm_fit.pdf', fig, s_width, s_height)
print 'Normal-dist-test:', normaltest(test['gradient'])

collabs_grads = test[test.collab_window == 'Collab']['gradient']
non_collabs_grads = test[test.collab_window == 'Non_Collab']['gradient']

manu = mannwhitneyu(x=collabs_grads, y=non_collabs_grads, alternative='two-sided')
ranks = ranksums(x=collabs_grads, y=non_collabs_grads)

print '\nMann-Whitney-test:', manu, '\np*2:', manu.pvalue*2
print 'Wilcoxon rank-sum-test:', ranks, '\np*2:', ranks.pvalue*2


print 'Non_Collabs:'
print df_video_history_collabs_test[df_video_history_collabs_test.collab_window=='Non_Collab']['gradient'].describe()
print '\nCollabs:'
print df_video_history_collabs_test[df_video_history_collabs_test.collab_window=='Collab']['gradient'].describe()


Normal-dist-test: NormaltestResult(statistic=13192493.868126784, pvalue=0.0)

Mann-Whitney-test: MannwhitneyuResult(statistic=609040441102.5, pvalue=0.0) 
p*2: 0.0
Wilcoxon rank-sum-test: RanksumsResult(statistic=67.132698369255181, pvalue=0.0) 
p*2: 0.0
Non_Collabs:
count    3.815349e+06
mean     3.305907e+03
std      3.059019e+04
min     -8.462700e+04
25%      1.300000e+01
50%      1.040000e+02
75%      7.620000e+02
max      7.837219e+06
Name: gradient, dtype: float64

Collabs:
count    2.973160e+05
mean     4.144535e+03
std      3.616439e+04
min     -3.096500e+04
25%      1.900000e+01
50%      1.580000e+02
75%      1.212000e+03
max      5.809283e+06
Name: gradient, dtype: float64

In [181]:
# Boxplot view count percental gradient for collab videos 

fig, (ax1, ax2) = plt.subplots(ncols=2)
sns.violinplot(x="collab_window", y="p_gradient", data=df_video_history_collabs_test, ax=ax1, order=['Collab', 'Non_Collab'])
sns.barplot(x="collab_window", y="p_gradient", data=df_video_history_collabs_test, ci=99, errwidth=1., capsize=.1, ax=ax2, order=['Collab', 'Non_Collab'])
plt.legend(["{}% CI".format(99)])
ax1.set_xlabel('')
ax2.set_xlabel('')
ax1.set_ylabel('Views Growth %')
ax2.set_ylabel('mean(Views Growth %)')
ax1.set_xticklabels(['Collaboration', 'Non-Collaboration'])
ax2.set_xticklabels(['Collaboration', 'Non-Collaboration'])
fig.suptitle('Video Views Growth %')
fig.tight_layout()
fig.subplots_adjust(top=0.86)
ax1.set_ylim([-120.0, 150.0])
ax2.set_ylim([0.0, 8.5])
save_plot('collab_video_views_box_ci_perc_violin.pdf', fig, 2*s_width, s_height)

# test, norm fitting
test = df_video_history_collabs_test[df_video_history_collabs_test["p_gradient"].notnull()]

# Distribution fitting
# norm.fit(data) returns a list of two parameters 
# (mean, parameters[0] and std, parameters[1]) via a MLE(Maximum-Likelihood) approach 
# to data, which should be in array form.
(mu, sigma) = stats.norm.fit(test['p_gradient'])

fig = plt.figure()
ax = sns.distplot(test['p_gradient'], fit=norm, kde=False, bins=1000, norm_hist=True)
#plt.legend(["$\mu=${0:.2g}, $\sigma=${1:.2f}".format(mu, sigma)])
#plt.legend(["Fitted normal dist"])
ax.set_xlabel('Views Growth %')
ax.set_ylabel('Density')
plt.title('Video Views Growth %')
ax.set_ylim([0.0, 0.006])
ax.set_xlim([-700.0, 700.0])
ax.set_xticks(ax.get_xticks()[::2])
ax.set_yticks(ax.get_yticks()[::2])
plt.tight_layout()
plt.legend(["Fitted Gaussian"],prop={'size':8})
#save_plot('collab_video_views_norm_fit.pdf', fig, s_width, s_height)
print 'Normal-dist-test:', normaltest(test['p_gradient'])

collabs_grads = test[test.collab_window == 'Collab']['p_gradient']
non_collabs_grads = test[test.collab_window == 'Non_Collab']['p_gradient']

manu = mannwhitneyu(x=collabs_grads, y=non_collabs_grads, alternative='two-sided')
ranks = ranksums(x=collabs_grads, y=non_collabs_grads)

print '\nMann-Whitney-test:', manu, '\np*2:', manu.pvalue*2
print 'Wilcoxon rank-sum-test:', ranks, '\np*2:', ranks.pvalue*2

print test[test.collab_window == 'Collab']['p_gradient'].describe()
print test[test.collab_window == 'Non_Collab']['p_gradient'].describe()

print test[test.collab_window == 'Collab']['p_gradient'].median()
print test[test.collab_window == 'Non_Collab']['p_gradient'].median()


Normal-dist-test: NormaltestResult(statistic=12255295.288119387, pvalue=0.0)

Mann-Whitney-test: MannwhitneyuResult(statistic=559694078142.5, pvalue=3.085456597251137e-33) 
p*2: 6.1709131945e-33
Wilcoxon rank-sum-test: RanksumsResult(statistic=-12.009439135899697, pvalue=3.1698755695281311e-33) 
p*2: 6.33975113906e-33
count    297316.000000
mean          6.310792
std         109.123101
min         -83.098592
25%           0.104556
50%           0.319975
75%           0.952141
max        9961.585366
Name: p_gradient, dtype: float64
count    3.815349e+06
mean     7.796696e+00
std      1.230656e+02
min     -5.923076e+01
25%      1.008007e-01
50%      3.407290e-01
75%      1.078746e+00
max      9.978182e+03
Name: p_gradient, dtype: float64
0.319975005511
0.340729001585

In [ ]:


In [ ]:
# get for all videos in df_videos_collabs the max popaurlty statiscs vlaue of the 12 days data


df_videos_collabs['12day_viewmax'] = np.nan
df_videos_collabs['12day_likemax'] = np.nan
df_videos_collabs['12day_dislikemax'] = np.nan
df_videos_collabs['12day_commentmax'] = np.nan

for index, group in video_history_collab_group:
    #print group.head()
    df_videos_collabs.ix[index, '12day_viewmax'] = group[(group.crawlTimestamp <= group.iloc[0, 5] + timedelta(days=12))]['viewCount'].max()
    df_videos_collabs.ix[index, '12day_likemax'] = group[(group.crawlTimestamp <= group.iloc[0, 5] + timedelta(days=12))]['likeCount'].max()
    df_videos_collabs.ix[index, '12day_dislikemax'] = group[(group.crawlTimestamp <= group.iloc[0, 5] + timedelta(days=12))]['dislikeCount'].max()
    df_videos_collabs.ix[index, '12day_commentmax'] = group[(group.crawlTimestamp <= group.iloc[0, 5] + timedelta(days=12))]['commentCount'].max()


/home/mlode/intel/intelpython27/lib/python2.7/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/mlode/intel/intelpython27/lib/python2.7/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/mlode/intel/intelpython27/lib/python2.7/site-packages/ipykernel/__main__.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/mlode/intel/intelpython27/lib/python2.7/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [148]:
print len(df_videos_collabs)
print df_videos_collabs['12day_viewmax'].describe()
print len(df_videos_collabs[df_videos_collabs['12day_viewmax'].isnull()])
print df_videos_collabs['12day_likemax'].describe()
print len(df_videos_collabs[df_videos_collabs['12day_likemax'].isnull()])
print df_videos_collabs['12day_dislikemax'].describe()
print len(df_videos_collabs[df_videos_collabs['12day_dislikemax'].isnull()])
print df_videos_collabs['12day_commentmax'].describe()
print len(df_videos_collabs[df_videos_collabs['12day_commentmax'].isnull()])


81486
count    6.764800e+04
mean     1.728373e+05
std      5.010130e+05
min      1.000000e+00
25%      4.940000e+03
50%      2.772850e+04
75%      1.266482e+05
max      1.706475e+07
Name: 12day_viewmax, dtype: float64
13838
count    6.764800e+04
mean     9.233844e+03
std      3.951147e+04
min      0.000000e+00
25%      1.510000e+02
50%      9.080000e+02
75%      4.824250e+03
max      1.956001e+06
Name: 12day_likemax, dtype: float64
13838
count     67648.000000
mean        307.286084
std        3184.341032
min           0.000000
25%           6.000000
50%          32.000000
75%         149.000000
max      639683.000000
Name: 12day_dislikemax, dtype: float64
13838
count    6.764800e+04
mean     1.201489e+03
std      2.144439e+04
min      0.000000e+00
25%      2.900000e+01
50%      1.520000e+02
75%      6.570000e+02
max      5.423785e+06
Name: 12day_commentmax, dtype: float64
13838

In [149]:
# save to file
df_videos_collabs.to_csv(DIR+r'/df_video_collabs_stats.txt', sep=str('\t'), encoding='utf-8')
#df_videos_collabs = pa.read_csv(DIR+r'/df_video_collabs_stats.txt', sep=str('\t'), encoding='utf-8')
#df_videos_collabs=df_videos_collabs.set_index(['id'])
#df_videos_collabs.head()

In [182]:
# all viewcounts complete
# for all channel mixed not very good/accurate, as channel have different popularities

from scipy.stats import norm, normaltest, mannwhitneyu, ranksums

print len(df_videos_collabs[df_videos_collabs['collab_window']=='Collab'])

fig, (ax1, ax2) = plt.subplots(ncols=2)
sns.violinplot(x="collab_window", y="12day_viewmax", data=df_videos_collabs, ax=ax1, inner='box',order=['Collab', 'Non_Collab'])
sns.barplot(x="collab_window", y="12day_viewmax", data=df_videos_collabs, ci=95,errwidth=1., capsize=.1, ax=ax2,order=['Collab', 'Non_Collab'])
plt.legend(["{}% CI".format(95)])
ax1.set_xlabel('')
ax2.set_xlabel('')
ax1.set_ylabel('Views')
ax2.set_ylabel('mean(Views)')
fig.suptitle('Video Views')
ax1.set_xticklabels(['Collaboration', 'Non-Collaboration'])
ax2.set_xticklabels(['Collaboration', 'Non-Collaboration'])
fig.tight_layout()
fig.subplots_adjust(top=0.86)
ax1.set_ylim([-650000.0, 2000000.0])
save_plot('collab_video_12day_view_box_ci_violin.pdf', fig, 2*s_width, s_height)

# test, norm fitting
test = df_videos_collabs[df_videos_collabs["12day_viewmax"].notnull()]

# Distribution fitting
# norm.fit(data) returns a list of two parameters 
# (mean, parameters[0] and std, parameters[1]) via a MLE(Maximum-Likelihood) approach 
# to data, which should be in array form.
(mu, sigma) = stats.norm.fit(test['12day_viewmax'])

fig = plt.figure()
ax = sns.distplot(test['12day_viewmax'], fit=norm, kde=False, bins=1000, norm_hist=True)
#plt.legend(["normal dist. fit ($\mu=${0:.2g}, $\sigma=${1:.2f})".format(mu, sigma)])
plt.legend(["Fitted normal dist"])
ax.set_xlabel('Views')
ax.set_ylabel('Density')
plt.title('Video views')
plt.tight_layout()

print 'Normal-dist-test:', normaltest(test['12day_viewmax'])

collabs_grads = test[test.collab_window == 'Collab']['12day_viewmax']
non_collabs_grads = test[test.collab_window == 'Non_Collab']['12day_viewmax']

manu = mannwhitneyu(x=collabs_grads, y=non_collabs_grads, alternative='two-sided')
ranks = ranksums(x=collabs_grads, y=non_collabs_grads)

print '\nMann-Whitney-test:', manu, '\np*2:', manu.pvalue*2
print 'Wilcoxon rank-sum-test:', ranks, '\np*2:', ranks.pvalue*2


fig, (ax1, ax2) = plt.subplots(ncols=2)
sns.violinplot(x="collab_window", y="12day_likemax", data=df_videos_collabs, ax=ax1, inner='box',order=['Collab', 'Non_Collab'])
sns.barplot(x="collab_window", y="12day_likemax", data=df_videos_collabs, ci=95,errwidth=1., capsize=.1, ax=ax2,order=['Collab', 'Non_Collab'])
plt.legend(["{}% CI".format(95)])
ax1.set_xlabel('')
ax2.set_xlabel('')
ax1.set_ylabel('Likes')
ax2.set_ylabel('mean(Likes)')
fig.suptitle('Video Likes')
ax1.set_xticklabels(['Collaboration', 'Non-Collaboration'])
ax2.set_xticklabels(['Collaboration', 'Non-Collaboration'])
fig.tight_layout()
fig.subplots_adjust(top=0.86)
ax1.set_ylim([-44000.0, 123000.0])
ax2.set_ylim([0.0, 19000.0])
save_plot('collab_video_12day_like_box_ci_violin.pdf', fig, 2*s_width, s_height)

# test, norm fitting
test = df_videos_collabs[df_videos_collabs["12day_likemax"].notnull()]

# Distribution fitting
# norm.fit(data) returns a list of two parameters 
# (mean, parameters[0] and std, parameters[1]) via a MLE(Maximum-Likelihood) approach 
# to data, which should be in array form.
(mu, sigma) = stats.norm.fit(test['12day_likemax'])

fig = plt.figure()
ax = sns.distplot(test['12day_likemax'], fit=norm, kde=False, bins=1000, norm_hist=True)
#plt.legend(["normal dist. fit ($\mu=${0:.2g}, $\sigma=${1:.2f})".format(mu, sigma)])
plt.legend(["Fitted normal dist"])
ax.set_xlabel('Likes')
ax.set_ylabel('Density')
plt.title('Video likes')
plt.tight_layout()

print 'Normal-dist-test:', normaltest(test['12day_likemax'])

collabs_grads = test[test.collab_window == 'Collab']['12day_likemax']
non_collabs_grads = test[test.collab_window == 'Non_Collab']['12day_likemax']

manu = mannwhitneyu(x=collabs_grads, y=non_collabs_grads, alternative='two-sided')
ranks = ranksums(x=collabs_grads, y=non_collabs_grads)

print '\nMann-Whitney-test:', manu, '\np*2:', manu.pvalue*2
print 'Wilcoxon rank-sum-test:', ranks, '\np*2:', ranks.pvalue*2


fig, (ax1, ax2) = plt.subplots(ncols=2)
sns.violinplot(x="collab_window", y="12day_dislikemax", data=df_videos_collabs, ax=ax1, inner='box',order=['Collab', 'Non_Collab'])
sns.barplot(x="collab_window", y="12day_dislikemax", data=df_videos_collabs, ci=95,errwidth=1., capsize=.1, ax=ax2,order=['Collab', 'Non_Collab'])
plt.legend(["{}% CI".format(95)])
ax1.set_xlabel('')
ax2.set_xlabel('')
ax1.set_ylabel('Dislikes')
ax2.set_ylabel('mean(Dislikes)')
fig.suptitle('Video Dislikes')
ax1.set_xticklabels(['Collaboration', 'Non-Collaboration'])
ax2.set_xticklabels(['Collaboration', 'Non-Collaboration'])
fig.tight_layout()
fig.subplots_adjust(top=0.86)
ax1.set_ylim([-7000.0, 9000.0])
save_plot('collab_video_12day_dislike_box_ci_violin.pdf', fig, 2*s_width, s_height)

# test, norm fitting
test = df_videos_collabs[df_videos_collabs["12day_dislikemax"].notnull()]

# Distribution fitting
# norm.fit(data) returns a list of two parameters 
# (mean, parameters[0] and std, parameters[1]) via a MLE(Maximum-Likelihood) approach 
# to data, which should be in array form.
(mu, sigma) = stats.norm.fit(test['12day_dislikemax'])

fig = plt.figure()
ax = sns.distplot(test['12day_dislikemax'], fit=norm, kde=False, bins=1000, norm_hist=True)
#plt.legend(["normal dist. fit ($\mu=${0:.2g}, $\sigma=${1:.2f})".format(mu, sigma)])
plt.legend(["Fitted normal dist"])
ax.set_xlabel('Dislikes')
ax.set_ylabel('Density')
plt.title('Video dislikes')
plt.tight_layout()

print 'Normal-dist-test:', normaltest(test['12day_dislikemax'])

collabs_grads = test[test.collab_window == 'Collab']['12day_dislikemax']
non_collabs_grads = test[test.collab_window == 'Non_Collab']['12day_dislikemax']

manu = mannwhitneyu(x=collabs_grads, y=non_collabs_grads, alternative='two-sided')
ranks = ranksums(x=collabs_grads, y=non_collabs_grads)

print '\nMann-Whitney-test:', manu, '\np*2:', manu.pvalue*2
print 'Wilcoxon rank-sum-test:', ranks, '\np*2:', ranks.pvalue*2


fig, (ax1, ax2) = plt.subplots(ncols=2)
sns.violinplot(x="collab_window", y="12day_commentmax", data=df_videos_collabs, ax=ax1, inner='box',order=['Collab', 'Non_Collab'])
sns.barplot(x="collab_window", y="12day_commentmax", data=df_videos_collabs, ci=95,errwidth=1., capsize=.1, ax=ax2,order=['Collab', 'Non_Collab'])
plt.legend(["{}% CI".format(95)], loc=0)
ax1.set_xlabel('')
ax2.set_xlabel('')
ax1.set_ylabel('Comments')
ax2.set_ylabel('mean(Comments)')
fig.suptitle('Video Comments')
ax1.set_xticklabels(['Collaboration', 'Non-Collaboration'])
ax2.set_xticklabels(['Collaboration', 'Non-Collaboration'])
fig.tight_layout()
fig.subplots_adjust(top=0.86)
ax1.set_ylim([-12000.0, 14000.0])
save_plot('collab_video_12day_comment_box_ci_violin.pdf', fig, 2*s_width, s_height)

# test, norm fitting
test = df_videos_collabs[df_videos_collabs["12day_commentmax"].notnull()]

# Distribution fitting
# norm.fit(data) returns a list of two parameters 
# (mean, parameters[0] and std, parameters[1]) via a MLE(Maximum-Likelihood) approach 
# to data, which should be in array form.
(mu, sigma) = stats.norm.fit(test['12day_commentmax'])

fig = plt.figure()
ax = sns.distplot(test['12day_commentmax'], fit=norm, kde=False, bins=1000, norm_hist=True)
#plt.legend(["normal dist. fit ($\mu=${0:.2g}, $\sigma=${1:.2f})".format(mu, sigma)])
plt.legend(["Fitted normal dist"])
ax.set_xlabel('Comments')
ax.set_ylabel('Density')
plt.title('Video comments')
plt.tight_layout()

print 'Normal-dist-test:', normaltest(test['12day_commentmax'])

collabs_grads = test[test.collab_window == 'Collab']['12day_commentmax']
non_collabs_grads = test[test.collab_window == 'Non_Collab']['12day_commentmax']

manu = mannwhitneyu(x=collabs_grads, y=non_collabs_grads, alternative='two-sided')
ranks = ranksums(x=collabs_grads, y=non_collabs_grads)

print '\nMann-Whitney-test:', manu, '\np*2:', manu.pvalue*2
print 'Wilcoxon rank-sum-test:', ranks, '\np*2:', ranks.pvalue*2

#print 'Collab:\n', df_videos_collabs.ix[df_videos_collabs.collab_window=='Collab',:].describe()
#print '\nNon_Collab:\n',df_videos_collabs.ix[df_videos_collabs.collab_window=='Non_Collab',:].describe()


4234
Normal-dist-test: NormaltestResult(statistic=101917.2419282568, pvalue=0.0)

Mann-Whitney-test: MannwhitneyuResult(statistic=147421662.0, pvalue=9.3143387005482232e-27) 
p*2: 1.86286774011e-26
Wilcoxon rank-sum-test: RanksumsResult(statistic=10.70821234960807, pvalue=9.3142992794214324e-27) 
p*2: 1.86285985588e-26
Normal-dist-test: NormaltestResult(statistic=140872.45728523246, pvalue=0.0)

Mann-Whitney-test: MannwhitneyuResult(statistic=153109791.5, pvalue=4.695549889606518e-53) 
p*2: 9.39109977921e-53
Wilcoxon rank-sum-test: RanksumsResult(statistic=15.331611690477786, pvalue=4.7017430583714563e-53) 
p*2: 9.40348611674e-53
Normal-dist-test: NormaltestResult(statistic=294539.16440824792, pvalue=0.0)

Mann-Whitney-test: MannwhitneyuResult(statistic=148802470.5, pvalue=2.5642100414263072e-32) 
p*2: 5.12842008285e-32
Wilcoxon rank-sum-test: RanksumsResult(statistic=11.830554783754005, pvalue=2.7134241110749869e-32) 
p*2: 5.42684822215e-32
Normal-dist-test: NormaltestResult(statistic=348146.15695784736, pvalue=0.0)

Mann-Whitney-test: MannwhitneyuResult(statistic=149519667.0, pvalue=2.1979927274611458e-35) 
p*2: 4.39598545492e-35
Wilcoxon rank-sum-test: RanksumsResult(statistic=12.413503154633055, pvalue=2.2077730402106398e-35) 
p*2: 4.41554608042e-35

In [395]:
fig, (ax1, ax2) = plt.subplots(ncols=2)
sns.boxplot(x="collab_window", y="12day_viewmax", data=df_videos_collabs, ax=ax1,order=['Collab', 'Non_Collab'])
sns.barplot(x="collab_window", y="12day_viewmax", data=df_videos_collabs, ci=95,errwidth=1., capsize=.1, ax=ax2,order=['Collab', 'Non_Collab'])
plt.legend(["{}% CI".format(95)])
ax1.set_xlabel('')
ax2.set_xlabel('')
ax1.set_ylabel('Views')
ax2.set_ylabel('mean(Views)')
fig.suptitle('Video Views')
ax1.set_xticklabels(['Collaboration', 'Non-Collaboration'])
ax2.set_xticklabels(['Collaboration', 'Non-Collaboration'])
fig.tight_layout()
fig.subplots_adjust(top=0.86)
#ax1.set_ylim([-12000.0, 500000.0])
ax2.set_ylim([150000.0, 280000.0])
ax1.set_yscale('log')
ax2.set_yscale('log')
from matplotlib.ticker import FormatStrFormatter
from matplotlib.ticker import AutoMinorLocator
plt.tick_params(axis='y', which='minor')
ax2.yaxis.set_minor_formatter(FormatStrFormatter('%g'))
yloc = plt.MaxNLocator(5)
ylog = plt.LinearLocator(numticks=6)
ylog2 = plt.LinearLocator(numticks=6)
ax2.yaxis.set_major_locator(ylog)
ax2.yaxis.set_minor_locator(ylog2)
#ax2.set_yticks(ax2.get_yticks()[::2])
#ax2.set_yticklabels(ax2.get_yticklabels()[::8])
save_plot('collab_video_12day_view_box_ci_log.pdf', fig, 2*s_width, 0.75*s_height)

print 'Collab:\n', df_videos_collabs.ix[df_videos_collabs.collab_window=='Collab','12day_viewmax'].describe()
print '\nNon_Collab:\n',df_videos_collabs.ix[df_videos_collabs.collab_window=='Non_Collab','12day_viewmax'].describe()

print 'Collab:\n', df_videos_collabs.ix[df_videos_collabs.collab_window=='Collab','12day_viewmax'].median()
print '\nNon_Collab:\n',df_videos_collabs.ix[df_videos_collabs.collab_window=='Non_Collab','12day_viewmax'].median()


Collab:
count    4.234000e+03
mean     2.454193e+05
std      6.825585e+05
min      3.000000e+00
25%      6.969250e+03
50%      4.560500e+04
75%      1.874062e+05
max      1.407286e+07
Name: 12day_viewmax, dtype: float64

Non_Collab:
count    6.341400e+04
mean     1.679912e+05
std      4.861064e+05
min      1.000000e+00
25%      4.865000e+03
50%      2.675550e+04
75%      1.233325e+05
max      1.706475e+07
Name: 12day_viewmax, dtype: float64
Collab:
45605.0

Non_Collab:
26755.5

In [183]:
test = df_videos_collabs[df_videos_collabs["12day_viewmax"].notnull()]

# df_channel_history['viewCount_gradient']
fig = plt.figure()
ax = sns.barplot(x="collab_window", y="12day_viewmax", data=test, ci=99,errwidth=1., capsize=.1,order=['Collab', 'Non_Collab'])
plt.legend(["{}% CI".format(99)])
ax.set_xticklabels(['Collaboration', 'Non-Collaboration'])
ax.set_xlabel('')
ax.set_ylabel('mean(Views)')
plt.title('Video views comparision')
plt.tight_layout()


#fig = plt.figure()
#ax = sns.distplot(test['6day_viewmax'], kde=False)
#ax.set_yscale('log')
#ax.set_xscale('log')

from scipy.stats import norm, normaltest, mannwhitneyu, ranksums

# Distribution fitting
# norm.fit(data) returns a list of two parameters 
# (mean, parameters[0] and std, parameters[1]) via a MLE(Maximum-Likelihood) approach 
# to data, which should be in array form.
(mu, sigma) = stats.norm.fit(test['12day_viewmax'])

fig = plt.figure()
ax = sns.distplot(test['12day_viewmax'], fit=norm, kde=False, bins=100, norm_hist=True)
#plt.legend(["normal dist. fit ($\mu=${0:.2g}, $\sigma=${1:.2f})".format(mu, sigma)])
plt.legend(["Fitted normal dist"])
ax.set_xlabel('Views')
ax.set_ylabel('Density')
plt.title('Video views')
plt.tight_layout()


print 'Normal-dist-test:', normaltest(test['12day_viewmax'])

collabs_grads = test[test.collab_window == 'Collab']['12day_viewmax']
non_collabs_grads = test[test.collab_window == 'Non_Collab']['12day_viewmax']

print '\nMann-Whitney-test:', mannwhitneyu(x=collabs_grads, y=non_collabs_grads, alternative='two-sided')
print 'Wilcoxon rank-sum-test:', ranksums(x=collabs_grads, y=non_collabs_grads)


Normal-dist-test: NormaltestResult(statistic=101917.2419282568, pvalue=0.0)

Mann-Whitney-test: MannwhitneyuResult(statistic=147421662.0, pvalue=9.3143387005482232e-27)
Wilcoxon rank-sum-test: RanksumsResult(statistic=10.70821234960807, pvalue=9.3142992794214324e-27)

In [260]:
print len(df_videos_collabs)
print len(df_videos_collabs.groupby(['channelID']))


81486
1589

In [265]:
print len(df_videos_collabs)
print len(df_videos_collabs.groupby(['channelID']))
df_video_channel_gr = df_videos_collabs.groupby(['channelID', 'collab_window'], as_index=False)


81486
1589
2705

In [264]:
# take every channel and get max/mean of collab, non_collab groups for views etc.
# create list with two entrys for every channel, collab, non_collab with mean value of above

# TODO check why number of channels with both types is so low

print len(df_videos_collabs)
print len(df_videos_collabs.groupby(['channelID']))
df_video_channel_mean = df_videos_collabs.groupby(['channelID', 'collab_window'], as_index=False).mean()
print len(df_video_channel_mean)
#print df_video_channel_mean.head()
video_channel_group = df_video_channel_mean.groupby('channelID')
print len(video_channel_group)

print df_videos_collabs['collab_window'].value_counts()
# calc growth of collab to non_collab per channel, only the two of same channel is comparable directly
df_collab_growth = []

#print video_channel_group.count()

for name, group in video_channel_group:
    if len(group) > 1:
        #print name
        #print group[['collab_window', '6day_viewmax']]
        c_v = group.iloc[0]['12day_viewmax']
        nc_v = group.iloc[1]['12day_viewmax']
        g = 100.0 * c_v / nc_v - 100
        #print g
        df_collab_growth.append((name, g))
    else:
        print name, group['collab_window']

df_collab_growth = pa.DataFrame(df_collab_growth)
print len(df_collab_growth)
#print df_collab_growth.head()
#df_collab_growth[1].mean()


 81486
1589
2705
1589
Non_Collab    77252
Collab         4234
Name: collab_window, dtype: int64
UC-PjJT0RVURj_L6D8D3WIww 12    Non_Collab
Name: collab_window, dtype: object
UC-SH7ZCar_m5Udkthm3j0Uw 15    Non_Collab
Name: collab_window, dtype: object
UC-SmWrDkILvnmfzE-43CdtA 18    Non_Collab
Name: collab_window, dtype: object
UC-V2ze2c4s7rV4neMd7ZS-w 21    Non_Collab
Name: collab_window, dtype: object
UC-q_5nCwFN3cUTW_wXlc7kg 34    Non_Collab
Name: collab_window, dtype: object
UC04ZT2MN2qF7tn2_JahRQRw 47    Non_Collab
Name: collab_window, dtype: object
UC0Juf8RX49w9_7FdvcjtNdA 50    Non_Collab
Name: collab_window, dtype: object
UC0PXqiud6dbwOAk8RvslgpQ 51    Non_Collab
Name: collab_window, dtype: object
UC0x8XBfvJoDzSqMzE56g2BQ 76    Non_Collab
Name: collab_window, dtype: object
UC19hDT4twVO-2553FiQmcXQ 83    Non_Collab
Name: collab_window, dtype: object
UC1C0x6WPqAavZoja1Qga-uw 86    Non_Collab
Name: collab_window, dtype: object
UC1H-sZk-cj-tDpfTCZ1f1YQ 89    Non_Collab
Name: collab_window, dtype: object
UC1J5R-2zsrrICAYmkt5g_6Q 92    Non_Collab
Name: collab_window, dtype: object
UC1NB1xUQ8ItM5VoA4101lUQ 93    Non_Collab
Name: collab_window, dtype: object
UC1Xga1RR7NpowZH40Yn48tQ 104    Non_Collab
Name: collab_window, dtype: object
UC1bM_rWa49vRd-aPKo1kUCQ 107    Non_Collab
Name: collab_window, dtype: object
UC1ga45T4B7LX4llKa3khq4g 112    Non_Collab
Name: collab_window, dtype: object
UC1mc5_VnbSnFTZQygWu2nqQ 113    Collab
Name: collab_window, dtype: object
UC1sVnwXAjct1g2oaFojIpnA 122    Non_Collab
Name: collab_window, dtype: object
UC2NyUeUIru1fwBEnZBTD3UQ 153    Non_Collab
Name: collab_window, dtype: object
UC2bYhAHyaqfWlPXWBVk4BcA 156    Non_Collab
Name: collab_window, dtype: object
UC2k2x0gsgTYxqIkxQGe8hmQ 161    Non_Collab
Name: collab_window, dtype: object
UC2wQ3publKlYOLZ2Ho4xv0Q 166    Non_Collab
Name: collab_window, dtype: object
UC3BIknatL1GFynLtV9l7pZg 171    Non_Collab
Name: collab_window, dtype: object
UC3gaQduurXoDOFMSNSKnJVA 210    Non_Collab
Name: collab_window, dtype: object
UC4-bcER4WWrAqLSoq50dmzw 221    Non_Collab
Name: collab_window, dtype: object
UC47Xf99jxhD5gs4D2fHUawg 230    Collab
Name: collab_window, dtype: object
UC4hSMRoJQit1eu8nhlM-7rQ 251    Non_Collab
Name: collab_window, dtype: object
UC4zS1wbO81p59CxKL7CQAcA 260    Non_Collab
Name: collab_window, dtype: object
UC54gPkG3GEwFNDxxueq5__A 263    Non_Collab
Name: collab_window, dtype: object
UC5U4r2JU9HqCHWqS5wT_moA 272    Non_Collab
Name: collab_window, dtype: object
UC5XbGYYAazZi02btPCwsemA 275    Non_Collab
Name: collab_window, dtype: object
UC5ZAkiI3NFIpbflatylVHSw 276    Non_Collab
Name: collab_window, dtype: object
UC5pmjxKYia6W0kr9Q4vZT7w 285    Non_Collab
Name: collab_window, dtype: object
UC61YzFVCvX9Nw0bE2gM7MUA 292    Non_Collab
Name: collab_window, dtype: object
UC6gX9qVCdYFwhyMECWWGozw 319    Non_Collab
Name: collab_window, dtype: object
UC6paX0kLA8D81aYogceMufw 324    Non_Collab
Name: collab_window, dtype: object
UC6uVHEgl379D_2a1kq0s6Kw 327    Non_Collab
Name: collab_window, dtype: object
UC6ySmwN_K8fGGD2EKLQdEGA 332    Non_Collab
Name: collab_window, dtype: object
UC70WKHZ1X9qIXWmNr0qHGmw 335    Non_Collab
Name: collab_window, dtype: object
UC74BOEe8x_IVy8HET5S5NOg 340    Non_Collab
Name: collab_window, dtype: object
UC7LRYRh6ZK12Ll65sv31C1A 355    Non_Collab
Name: collab_window, dtype: object
UC7MRc6sSOwTW7Zyd_Oy0BYQ 360    Non_Collab
Name: collab_window, dtype: object
UC7V9_rZ_bZQTy9gVnsW78hw 367    Non_Collab
Name: collab_window, dtype: object
UC7fMcs5gAGgu_ZcT2VBBX2Q 372    Non_Collab
Name: collab_window, dtype: object
UC7u7O4kGomeLwRxh0lhkDaQ 383    Non_Collab
Name: collab_window, dtype: object
UC8CWuKdtXz9yFMsI-QtAVKg 390    Non_Collab
Name: collab_window, dtype: object
UC8HbphXFqQy38TCPn3hLHkw 393    Non_Collab
Name: collab_window, dtype: object
UC8U6Bl5RMIF50aKnFjz1UEw 394    Non_Collab
Name: collab_window, dtype: object
UC8XwvZnpcKcyZlMdeqgugCQ 397    Collab
Name: collab_window, dtype: object
UC8ootkKAbJC9bABoRcQf0Kw 406    Non_Collab
Name: collab_window, dtype: object
UC98Bm49Y0WmF3cu0O6ukv0g 415    Non_Collab
Name: collab_window, dtype: object
UC9Zl_UOLc2F5Aq45G6DxEaQ 430    Non_Collab
Name: collab_window, dtype: object
UC9aonWctty5lhRbG8SsxNDg 433    Non_Collab
Name: collab_window, dtype: object
UC9gFih9rw0zNCK3ZtoKQQyA 438    Non_Collab
Name: collab_window, dtype: object
UC9oOvCsrSGpX1Hl2cIJXYLw 441    Non_Collab
Name: collab_window, dtype: object
UC9qZ8spyUKXjuS3eKHaECPg 444    Non_Collab
Name: collab_window, dtype: object
UCA8fzHNNFcutvoKaO4bQ96Q 445    Non_Collab
Name: collab_window, dtype: object
UCAkP51BEzkKimJh7KDflx_g 474    Non_Collab
Name: collab_window, dtype: object
UCAqdBCnxifh5IxrX7_YzLXQ 477    Collab
Name: collab_window, dtype: object
UCB44AXu-8l2mueFxTC4K_Ig 488    Non_Collab
Name: collab_window, dtype: object
UCBB4HUmgkIagmY56KpkBdvg 493    Non_Collab
Name: collab_window, dtype: object
UCBDWo0g9N8pL8c9QhE8Lxxw 494    Non_Collab
Name: collab_window, dtype: object
UCBDuHZ-6IASDQtq9YnW6Mtw 495    Non_Collab
Name: collab_window, dtype: object
UCBHskmWYlJzR7Q-FcbwM4Gg 502    Non_Collab
Name: collab_window, dtype: object
UCBQ8NG_fqzmAmGEriQW1XZQ 511    Non_Collab
Name: collab_window, dtype: object
UCBbeDje34deoh6ELVL9seAg 522    Non_Collab
Name: collab_window, dtype: object
UCBvc7pmUp9wiZIFOXEp1sCg 547    Non_Collab
Name: collab_window, dtype: object
UCC5qx-6DlQfAmlgPLsAxX-g 552    Non_Collab
Name: collab_window, dtype: object
UCC9uqoIkY8Nd7J9Gnk98W1w 557    Non_Collab
Name: collab_window, dtype: object
UCCBRMzkG8uIWHUAmSRmIcBA 560    Non_Collab
Name: collab_window, dtype: object
UCCCysu6ecuB4TFqzkyxLJaA 561    Non_Collab
Name: collab_window, dtype: object
UCCOgDIMQ5nazNBQgellJ2tg 570    Collab
Name: collab_window, dtype: object
UCCgt8cP0H5T2M-JnPPIF93w 577    Non_Collab
Name: collab_window, dtype: object
UCCs9lF-XXPv7F1sMYy_d7Eg 592    Non_Collab
Name: collab_window, dtype: object
UCD3EMFelc4s3TmDm4EX9Xvg 603    Collab
Name: collab_window, dtype: object
UCD7eAFZw7jk1eoqInkU2PbA 608    Non_Collab
Name: collab_window, dtype: object
UCDAhDUmNuxVaH3N5yHOnL3g 615    Non_Collab
Name: collab_window, dtype: object
UCDHeWTf_ftBNkb4TeGDAFsA 626    Non_Collab
Name: collab_window, dtype: object
UCDRhgfREdHD-rx_QuV_W5GQ 629    Non_Collab
Name: collab_window, dtype: object
UCDUydlERBRBngdkQHP2ELbg 632    Non_Collab
Name: collab_window, dtype: object
UCDZ-irzyMKKSskgTEM2H0dg 635    Non_Collab
Name: collab_window, dtype: object
UCD_CzFZrcJcT2HR9CbdPLyA 636    Non_Collab
Name: collab_window, dtype: object
UCDaIW2zPRWhzQ9Hj7a0QP1w 637    Non_Collab
Name: collab_window, dtype: object
UCDwkLUpwRSLinbia0Zwuv8w 656    Non_Collab
Name: collab_window, dtype: object
UCDy44r0s7VxvrYjOJjeRefQ 657    Non_Collab
Name: collab_window, dtype: object
UCEEVeH_OhEZ3XOswJbWjrbQ 674    Non_Collab
Name: collab_window, dtype: object
UCEG_84q4DDappW2ZgLoMOuw 675    Non_Collab
Name: collab_window, dtype: object
UCEHjiMXxYtlLieMmXlEwrcg 676    Collab
Name: collab_window, dtype: object
UCEP7m60Yfi9JevGzGdD03mw 679    Non_Collab
Name: collab_window, dtype: object
UCEYy1KBuTts141VQvscsdKw 688    Non_Collab
Name: collab_window, dtype: object
UCEZ3VND-4K6fX9ow0yd8Paw 689    Collab
Name: collab_window, dtype: object
UCEdwaDsW3Qig5pkGV8_c3sg 694    Non_Collab
Name: collab_window, dtype: object
UCEnYqp1DciUPH-HgzNKS4Bg 701    Non_Collab
Name: collab_window, dtype: object
UCEoUHRdUWfYwXbDuD5HU8nA 702    Non_Collab
Name: collab_window, dtype: object
UCEqQT_28xTTxQtwTQDd4xcw 707    Non_Collab
Name: collab_window, dtype: object
UCErKUCncCyBgEdxWAtrj5hg 712    Non_Collab
Name: collab_window, dtype: object
UCF2BD7I7BBphXgCpMBrVL0g 715    Non_Collab
Name: collab_window, dtype: object
UCFG4tHCRnCDRfDM9sGjr83Q 718    Non_Collab
Name: collab_window, dtype: object
UCFOEjz0jo07W8bpEHomEgHg 725    Non_Collab
Name: collab_window, dtype: object
UCFUsBdbrNe2a8tnVsxBwoZw 730    Non_Collab
Name: collab_window, dtype: object
UCFxR1_cJI4nEmJ1nMIsQAgQ 751    Non_Collab
Name: collab_window, dtype: object
UCG9eNeLwgczW2cI1X51y1fw 764    Non_Collab
Name: collab_window, dtype: object
UCGBRkLIAAjQPzBGZpzXMY0Q 765    Non_Collab
Name: collab_window, dtype: object
UCGNOLQLMrmlVEb2wWkVUTkQ 766    Non_Collab
Name: collab_window, dtype: object
UCGXHiIMcPZ9IQNwmJOv12dQ 771    Non_Collab
Name: collab_window, dtype: object
UCGgvKW19IWVvUUblrr8EgHA 778    Non_Collab
Name: collab_window, dtype: object
UCGjRlQ3fT35a8z015A71yTA 779    Non_Collab
Name: collab_window, dtype: object
UCHBpllL68EGFECmmYGpZK4Q 806    Collab
Name: collab_window, dtype: object
UCHMT0_A5LivdSvxZHw4vdhQ 813    Non_Collab
Name: collab_window, dtype: object
UCHXg7HhdTj_OfKFp_jxtj-A 818    Non_Collab
Name: collab_window, dtype: object
UCHdra1JoQoMS8kuj2zcPWFg 825    Non_Collab
Name: collab_window, dtype: object
UCHfKkFbUKbBi8wth8GjcWNg 828    Non_Collab
Name: collab_window, dtype: object
UCHfPdH_yrwLVstxYiyT5Gvw 829    Non_Collab
Name: collab_window, dtype: object
UCHi0PjvDEqIHNt_5Hytj4tw 832    Non_Collab
Name: collab_window, dtype: object
UCI1priYQZzsch3TsTHO6OTg 849    Non_Collab
Name: collab_window, dtype: object
UCI4cauukefiomHidcqmyOKQ 852    Non_Collab
Name: collab_window, dtype: object
UCIEgk9WNDkIpDLFZkU2yN3g 871    Non_Collab
Name: collab_window, dtype: object
UCISkC3KkwYtXWDp91SXeE4A 884    Non_Collab
Name: collab_window, dtype: object
UCIVFEaKHBomhznMRX2aiQEQ 887    Non_Collab
Name: collab_window, dtype: object
UCIceHSt-cWAH6x7ob7_-R2A 896    Collab
Name: collab_window, dtype: object
UCIhmXuqtOQVJCPbh_tXns1g 897    Non_Collab
Name: collab_window, dtype: object
UCInHeDAFj1Se3_SX9tk_Erw 904    Non_Collab
Name: collab_window, dtype: object
UCIq1GmbGVP30y8RGvjoHg9g 907    Non_Collab
Name: collab_window, dtype: object
UCIzHmYtHCM4w9RQ2LS6CAuQ 912    Non_Collab
Name: collab_window, dtype: object
UCJ-gg-0xws8J6PKfY6DBLzg 915    Non_Collab
Name: collab_window, dtype: object
UCJ2cGU-CskWXRmzql5RgjKg 918    Non_Collab
Name: collab_window, dtype: object
UCJ7v_GfFQaVoaPak9_SwR5A 921    Non_Collab
Name: collab_window, dtype: object
UCJIDRMDo-fb2y_zykKIlJ3Q 926    Collab
Name: collab_window, dtype: object
UCJPan88gtFeexk9FO0dj-eQ 933    Non_Collab
Name: collab_window, dtype: object
UCJTAUPyJsdpyDrM038plNZQ 938    Non_Collab
Name: collab_window, dtype: object
UCJZ7f6NQzGKZnFXzFW9y9UQ 945    Non_Collab
Name: collab_window, dtype: object
UCJaVo8-l6lQk7mYekBYOnXQ 948    Collab
Name: collab_window, dtype: object
UCJb7_Qsz9pNQDcOA4YA6TxA 949    Non_Collab
Name: collab_window, dtype: object
UCJi_befq0YDYJ9fCeyHxJbg 952    Non_Collab
Name: collab_window, dtype: object
UCJliYtAT57qlJUuJKg_TpWw 953    Non_Collab
Name: collab_window, dtype: object
UCJogD0_PWsV3oamM4XW8BFg 958    Non_Collab
Name: collab_window, dtype: object
UCK9VElaz9OOJ7lnm1KPiHWQ 977    Non_Collab
Name: collab_window, dtype: object
UCKHGqejYspoS1KXkVJozi9A 982    Non_Collab
Name: collab_window, dtype: object
UCKHhA5hN2UohhFDfNXB_cvQ 983    Non_Collab
Name: collab_window, dtype: object
UCKMwNDvEUHMz4kCiNR11vjg 990    Non_Collab
Name: collab_window, dtype: object
UCKkzObNfUk-5I8ypoTnS30Q 1013    Non_Collab
Name: collab_window, dtype: object
UCKneDiDpM6a1dI-ulzRRIEQ 1014    Non_Collab
Name: collab_window, dtype: object
UCKpNY0xYeGNN-uOHmQIE7Pg 1015    Non_Collab
Name: collab_window, dtype: object
UCLKXaoU3BdPQasCj62kZ7Eg 1038    Non_Collab
Name: collab_window, dtype: object
UCLYcZGNLUYz_dA8TtGMUb7g 1053    Non_Collab
Name: collab_window, dtype: object
UCMDz09-3zO1hm1pqRA-Er0A 1082    Non_Collab
Name: collab_window, dtype: object
UCMTmEpo6V_CHJW2vX6xkzcg 1093    Non_Collab
Name: collab_window, dtype: object
UCM_neI2uMrv5211Y2HoHOPg 1104    Non_Collab
Name: collab_window, dtype: object
UCMjSk0J-G2WQNcQ1r1qQULw 1111    Non_Collab
Name: collab_window, dtype: object
UCMpOz2KEfkSdd5JeIJh_fxw 1116    Non_Collab
Name: collab_window, dtype: object
UCMry5jwn3fSxXqnqUUIcR0Q 1119    Non_Collab
Name: collab_window, dtype: object
UCN5Zf3Zyb9Bp9NeqtNwnc5w 1124    Non_Collab
Name: collab_window, dtype: object
UCNKcMBYP_-18FLgk4BYGtfw 1137    Non_Collab
Name: collab_window, dtype: object
UCNR3K4HA6LyO9tz0oZoSJIA 1142    Non_Collab
Name: collab_window, dtype: object
UCNVBYBxWj9dMHqKEl_V8HBQ 1145    Non_Collab
Name: collab_window, dtype: object
UCNVoXEtFpnn7i1CKzbnmlUw 1146    Non_Collab
Name: collab_window, dtype: object
UCNmXRwlQMlyXtLj-L9x3hLQ 1151    Non_Collab
Name: collab_window, dtype: object
UCO-MH1-2lG0OIPPVO-L2IMw 1152    Non_Collab
Name: collab_window, dtype: object
UCO2CPVD4KO6831z9nFnkrDQ 1153    Non_Collab
Name: collab_window, dtype: object
UCOY3VWW4NGQAue_rqTBtdhA 1174    Non_Collab
Name: collab_window, dtype: object
UCOxg9Xv1w4CKZlXB9KLDd9Q 1191    Non_Collab
Name: collab_window, dtype: object
UCP0GI-IbEZV59oExPioxeHw 1192    Non_Collab
Name: collab_window, dtype: object
UCP1qdRgYCu5bcyAky97cevQ 1193    Non_Collab
Name: collab_window, dtype: object
UCP1yGvHPtfNQfFEooUg3JSQ 1194    Non_Collab
Name: collab_window, dtype: object
UCPHXtOVmjvbP9OJihsd7gCg 1201    Non_Collab
Name: collab_window, dtype: object
UCPIqJ65Rbe7OMELyTwbb9Jw 1202    Collab
Name: collab_window, dtype: object
UCPd6iac_5S-XPz0qwZdV4rQ 1213    Non_Collab
Name: collab_window, dtype: object
UCPd7354sP1aEqgP-uPLwcGA 1214    Non_Collab
Name: collab_window, dtype: object
UCQ9jeV4pSsR1ouyEag-hA8Q 1229    Collab
Name: collab_window, dtype: object
UCQLf49IdqoFXuu3j9cKHkdw 1234    Non_Collab
Name: collab_window, dtype: object
UCQOQ3RxX_o-B-68wSKdcfMQ 1237    Non_Collab
Name: collab_window, dtype: object
UCQwXt_wbP8yzzNGrlxMKHew 1252    Non_Collab
Name: collab_window, dtype: object
UCR3vcUrhFUdzagng58ueI2w 1257    Non_Collab
Name: collab_window, dtype: object
UCR9xdWdy-idP9C9eelGnc3A 1258    Non_Collab
Name: collab_window, dtype: object
UCRQa3V1oI1aXX-b5XEzaH4w 1261    Non_Collab
Name: collab_window, dtype: object
UCRYy2VKwVqyON2tKijzx8kQ 1268    Collab
Name: collab_window, dtype: object
UCRfN_tonOGP0lPbtSBgG4fw 1271    Non_Collab
Name: collab_window, dtype: object
UCRkIIp1VnuIf3JupjfOWC3Q 1274    Non_Collab
Name: collab_window, dtype: object
UCRmqkUNRzjKtZSn33VxaVFw 1277    Non_Collab
Name: collab_window, dtype: object
UCRnh6-EGR6pPLRe61VLNYvg 1280    Non_Collab
Name: collab_window, dtype: object
UCS09CkK8fjPJ_nHD6fcGMSw 1287    Non_Collab
Name: collab_window, dtype: object
UCS3Wc-xJD-gee-HaAi8YEuQ 1290    Non_Collab
Name: collab_window, dtype: object
UCSLJACAStphgfgajq1ncWgw 1297    Non_Collab
Name: collab_window, dtype: object
UCSN8HEnbxusjfp9PSPHyXrQ 1300    Non_Collab
Name: collab_window, dtype: object
UCSWbsIOjo6w0IJdoTE-o56g 1303    Non_Collab
Name: collab_window, dtype: object
UCSfppeKifkcabhPJ0bdjzhw 1308    Non_Collab
Name: collab_window, dtype: object
UCSss9IQskc990Ach6Pcl0Cw 1319    Collab
Name: collab_window, dtype: object
UCT6yCzkSSGvnBLUHutSyB7Q 1328    Non_Collab
Name: collab_window, dtype: object
UCTD8DtdOyVQ8XqADuixvH_g 1339    Non_Collab
Name: collab_window, dtype: object
UCTRB_n77DYZwHUt6sx8vk7Q 1342    Non_Collab
Name: collab_window, dtype: object
UCT_FWaZuqgw5QOjqtgsoTxw 1347    Non_Collab
Name: collab_window, dtype: object
UCTafEJoRl5myC8A50plIrng 1348    Non_Collab
Name: collab_window, dtype: object
UCTdnSQQAx81nq7FY5joyQGA 1355    Non_Collab
Name: collab_window, dtype: object
UCTqyxaB74Cfedi9Ks7jrRKA 1358    Non_Collab
Name: collab_window, dtype: object
UCTtwqNrasC7AnTsl4YcB4Fw 1363    Non_Collab
Name: collab_window, dtype: object
UCUN57Wq9jlcIuJyrS6zfRGw 1390    Non_Collab
Name: collab_window, dtype: object
UCUbDcUPed50Y_7KmfCXKohA 1399    Non_Collab
Name: collab_window, dtype: object
UCV1WpxObFQrXod8ocSKtT-Q 1420    Collab
Name: collab_window, dtype: object
UCV2J1eTpFClb0-OPz8_QGQQ 1421    Non_Collab
Name: collab_window, dtype: object
UCV9OLReC5bMvuiCFGBdK7Mg 1428    Non_Collab
Name: collab_window, dtype: object
UCV9_KinVpV-snHe3C3n1hvA 1429    Non_Collab
Name: collab_window, dtype: object
UCV_WguHjxvr8sZQroQh4xMw 1440    Non_Collab
Name: collab_window, dtype: object
UCVl4jMRkjqbjidStrnUGtEg 1447    Non_Collab
Name: collab_window, dtype: object
UCVmt7LgDMApTn_rKl-yWFEw 1448    Non_Collab
Name: collab_window, dtype: object
UCVoFTBjoeC4q_gNo_zCnYXQ 1451    Non_Collab
Name: collab_window, dtype: object
UCW2G11dTUlkjZCW_0EaZK4Q 1462    Non_Collab
Name: collab_window, dtype: object
UCWBhNH9CmID3hqeccZyHotw 1469    Non_Collab
Name: collab_window, dtype: object
UCWEJt0-LJuRE0seaIMIm2jA 1472    Non_Collab
Name: collab_window, dtype: object
UCWMOdgs8EGoJ_lggHhNNv8Q 1475    Non_Collab
Name: collab_window, dtype: object
UCWP9sPHAkNy4WILl-UIYr-A 1480    Non_Collab
Name: collab_window, dtype: object
UCWhgYAMKQe174JTNXnIXnBQ 1487    Non_Collab
Name: collab_window, dtype: object
UCWrtsravWX0ANhHiJXNlyXw 1490    Non_Collab
Name: collab_window, dtype: object
UCXFx-GHOyrQiiGqxE-J9fYA 1499    Non_Collab
Name: collab_window, dtype: object
UCXJRJfOOMIi55vneRDpD3UA 1500    Non_Collab
Name: collab_window, dtype: object
UCXTrB8sagn_pDSKmO_ZQNQg 1507    Non_Collab
Name: collab_window, dtype: object
UCXk9Voo5cAfoYbnXjrNZhVw 1520    Non_Collab
Name: collab_window, dtype: object
UCXoPy1Q1G39fCemCMAmiorQ 1521    Non_Collab
Name: collab_window, dtype: object
UCXuqSBlHAE6Xw-yeJA0Tunw 1524    Non_Collab
Name: collab_window, dtype: object
UCY-4pT7awpmNacgbK56Odjg 1531    Non_Collab
Name: collab_window, dtype: object
UCY2JbItcod5aclsDCrgdo6g 1538    Non_Collab
Name: collab_window, dtype: object
UCY2ovfktLak0pE0zSzojlLw 1539    Non_Collab
Name: collab_window, dtype: object
UCY6QdjJKbaIUWxB3ciU-crw 1542    Non_Collab
Name: collab_window, dtype: object
UCYDx4LfNuQKI9aAcIa-BCKw 1551    Non_Collab
Name: collab_window, dtype: object
UCYrSJp6r-XHoxcdZKxU9wGg 1576    Non_Collab
Name: collab_window, dtype: object
UCZC45sBWNdkqSQ9Bwtt5lfA 1589    Non_Collab
Name: collab_window, dtype: object
UCZKEae_FmOPAWfuOiiVPTcQ 1596    Non_Collab
Name: collab_window, dtype: object
UCZMCU61JkQkqleoU7VyGncQ 1597    Non_Collab
Name: collab_window, dtype: object
UCZnvx1_cVFysbRKPz0n9hIA 1610    Non_Collab
Name: collab_window, dtype: object
UCZq_CYXRoRjKqidapMPujaQ 1613    Non_Collab
Name: collab_window, dtype: object
UCZyCposXwcyopaACep44maQ 1616    Non_Collab
Name: collab_window, dtype: object
UC_DVkYrg5uDDEjLHx4bStig 1631    Non_Collab
Name: collab_window, dtype: object
UC_MH4Sme5jV2nS24bDXJ15w 1636    Non_Collab
Name: collab_window, dtype: object
UC_U0-89tHy4mB1pafPKg_hQ 1639    Non_Collab
Name: collab_window, dtype: object
UC_Z7VumDEvbvey_IKHWjuCQ 1640    Non_Collab
Name: collab_window, dtype: object
UC__Pj66OeDibNZNN__L913g 1641    Non_Collab
Name: collab_window, dtype: object
UC_ayPEoDRThQY8I6vPQbXMw 1644    Non_Collab
Name: collab_window, dtype: object
UC_drtxXViugebE3bzqjD-Kg 1649    Non_Collab
Name: collab_window, dtype: object
UCaD2x4YrEJ5FN-qxNdMlgEQ 1678    Non_Collab
Name: collab_window, dtype: object
UCaI4YJ0eiwlDh3w_8ldMdRQ 1683    Non_Collab
Name: collab_window, dtype: object
UCaMQ6ci3NoQOrf1Av-BkCTQ 1690    Non_Collab
Name: collab_window, dtype: object
UCaWbo0qMCnwpNfNxDHnIJdQ 1701    Non_Collab
Name: collab_window, dtype: object
UCa_u2C35CjlKngwxLuIZfIQ 1704    Non_Collab
Name: collab_window, dtype: object
UCaiYqc8i5ojWZAgldxSmcqQ 1707    Non_Collab
Name: collab_window, dtype: object
UCajyfb5EUPd3f3deUk0dBEQ 1708    Non_Collab
Name: collab_window, dtype: object
UCamrFj_nsXOace8hBhTkjIA 1709    Non_Collab
Name: collab_window, dtype: object
UCavTVjugW1OejDAq0aDzdMw 1710    Non_Collab
Name: collab_window, dtype: object
UCax-4TdDh3oetlyh-GoL7UA 1713    Non_Collab
Name: collab_window, dtype: object
UCaz8OAIxslD0r23Ebfm5kxw 1714    Non_Collab
Name: collab_window, dtype: object
UCb0s0qC96lCcx2pO2za6mcw 1717    Non_Collab
Name: collab_window, dtype: object
UCb9ageBPhZYNBU93tnjs-QA 1720    Non_Collab
Name: collab_window, dtype: object
UCbI88iqwpseDR6MJC9oWPjg 1723    Non_Collab
Name: collab_window, dtype: object
UCbIjNubJ-HvmmSJHqq2yC0A 1726    Collab
Name: collab_window, dtype: object
UCb_ebrh8v67AW8tQHg_wzdw 1735    Non_Collab
Name: collab_window, dtype: object
UCcU9BRrknOMR4loeZNBCR6g 1760    Non_Collab
Name: collab_window, dtype: object
UCcm0RLFnEVk9uHlV9Qc2bfA 1761    Non_Collab
Name: collab_window, dtype: object
UCcpAse3ksgsJbF1tXgf7zUA 1762    Non_Collab
Name: collab_window, dtype: object
UCcseTnvS6CC-mkIcwKfi-1A 1767    Non_Collab
Name: collab_window, dtype: object
UCcvSMLuj0OcAASPsTyVlzBA 1770    Non_Collab
Name: collab_window, dtype: object
UCd4GEf2ZH90CADC_71vxCfA 1777    Non_Collab
Name: collab_window, dtype: object
UCdAt_KUKt0g9ZRQ7gwKaN3A 1784    Non_Collab
Name: collab_window, dtype: object
UCdHK6g8ddMEcMu9B3LgeKYg 1785    Non_Collab
Name: collab_window, dtype: object
UCdMrb-YQ_gTGylCFBAoJMnA 1796    Non_Collab
Name: collab_window, dtype: object
UCdTj7XMz3zG1Zksf6aunfpA 1805    Non_Collab
Name: collab_window, dtype: object
UCdhofAZeRhqxq0rk6pBBQaQ 1822    Collab
Name: collab_window, dtype: object
UCe01l0hvy1Q6PKRvZOgNSEw 1837    Non_Collab
Name: collab_window, dtype: object
UCe6pKugTDyOPT1Md2DTybbQ 1844    Non_Collab
Name: collab_window, dtype: object
UCe8BPWccI_ULJbtULb232zg 1847    Collab
Name: collab_window, dtype: object
UCeHZNJZ4t7kc5lgbhde6SGQ 1848    Non_Collab
Name: collab_window, dtype: object
UCeNgRHpH7OHZetYjC5JZXGw 1855    Non_Collab
Name: collab_window, dtype: object
UCeP8Z6D1EckBkN66UQGzMFw 1856    Collab
Name: collab_window, dtype: object
UCeWjRGjVPGRgrr4ePbSpmyw 1859    Non_Collab
Name: collab_window, dtype: object
UCecIwzVSKzV84VBHCwPAb6w 1862    Non_Collab
Name: collab_window, dtype: object
UCejZmwvT66xppqbanpGdAuw 1865    Non_Collab
Name: collab_window, dtype: object
UCelg4wUoZziBy4D04eXgflQ 1868    Non_Collab
Name: collab_window, dtype: object
UCeyUPcTTiAg_k5dURbP0wAQ 1871    Non_Collab
Name: collab_window, dtype: object
UCf5_y7nHWg-b6oFUD6vJSDw 1876    Collab
Name: collab_window, dtype: object
UCfAOh2t5DpxVrgS9NQKjC7A 1881    Non_Collab
Name: collab_window, dtype: object
UCfDoPEgiYTMmraXj8HS7s4g 1882    Non_Collab
Name: collab_window, dtype: object
UCfa352_VIp2AI7R9Euh0QfA 1899    Non_Collab
Name: collab_window, dtype: object
UCfeNmPbGRF6dkPt04HIl25A 1904    Non_Collab
Name: collab_window, dtype: object
UCflekwBtmeecY9bcAzPgPCQ 1907    Non_Collab
Name: collab_window, dtype: object
UCfnmup3aVAKmTj4LDCt0Zfw 1910    Non_Collab
Name: collab_window, dtype: object
UCfr55bMg3ILQB0Z4KyIUQOg 1911    Non_Collab
Name: collab_window, dtype: object
UCg-dJOAfCY0FYEG5eSRqC2Q 1916    Non_Collab
Name: collab_window, dtype: object
UCg1uYO329KcAEN-PQdoQMKQ 1917    Non_Collab
Name: collab_window, dtype: object
UCgBsZZD7aPqhmUC_ZmAAapw 1928    Non_Collab
Name: collab_window, dtype: object
UCgNY34evarGdcS9nqIutfsQ 1933    Non_Collab
Name: collab_window, dtype: object
UCgSvw7asvNq7wJWKMSW28Eg 1934    Non_Collab
Name: collab_window, dtype: object
UCgUCv1oTwlgJQ3JagaixYTQ 1935    Non_Collab
Name: collab_window, dtype: object
UCgbjLe4Zi403jVxc7du8xyA 1940    Non_Collab
Name: collab_window, dtype: object
UCgefQJC5UgbWJHDxBqB4qVg 1943    Non_Collab
Name: collab_window, dtype: object
UCgjp3GQePAhFGijH41k8GSQ 1944    Non_Collab
Name: collab_window, dtype: object
UCgoIIibTijEr0TBvTrLY3kg 1949    Non_Collab
Name: collab_window, dtype: object
UCh0WTtMUH-KlAh8qDKV1_Qw 1958    Non_Collab
Name: collab_window, dtype: object
UChPVoBpjfJngjoKjxrwGSTw 1971    Non_Collab
Name: collab_window, dtype: object
UChSsKVEQ7MWauk25HMOH_kA 1974    Non_Collab
Name: collab_window, dtype: object
UChWmYNTHQpLmJdkScwNrgcA 1977    Non_Collab
Name: collab_window, dtype: object
UChY2rr3U2xAfu1pW-x2fqcg 1980    Non_Collab
Name: collab_window, dtype: object
UChZUCgX-hBXDpnpB8O1XQbA 1981    Non_Collab
Name: collab_window, dtype: object
UChdPopq2IFHuw6ZWK-ZObkw 1982    Non_Collab
Name: collab_window, dtype: object
UChikza6yUVgrs7L879pzZyw 1987    Non_Collab
Name: collab_window, dtype: object
UChjOwTnWnBW8xtC8UKpdRMQ 1988    Non_Collab
Name: collab_window, dtype: object
UChl-lOZZY2vo_Zho6cfj0uQ 1989    Collab
Name: collab_window, dtype: object
UChr1sKJ27Fr6WZxAlusbo_w 1990    Non_Collab
Name: collab_window, dtype: object
UCi0Qq-WTd50vbH9zxS1FDRA 2003    Non_Collab
Name: collab_window, dtype: object
UCi1S0Cx-hfDV1w25N3l_0dw 2004    Non_Collab
Name: collab_window, dtype: object
UCiBKXnJnzhJSf8LwBMBxXJg 2009    Non_Collab
Name: collab_window, dtype: object
UCiL91vQG1oE0k9eBG-bn5uw 2014    Non_Collab
Name: collab_window, dtype: object
UCiSIOIVt2rBeEGeeID5-p_A 2015    Non_Collab
Name: collab_window, dtype: object
UCiTxWhcBqrRp3NZCEpKbXiA 2016    Non_Collab
Name: collab_window, dtype: object
UCiiQtn-wuAOeUp3Q_2A5z9Q 2023    Non_Collab
Name: collab_window, dtype: object
UCikc0PfL5b9vxWpQpS70woA 2026    Non_Collab
Name: collab_window, dtype: object
UCiqW78-mEj0HFldGn1_aEPw 2027    Non_Collab
Name: collab_window, dtype: object
UCj10EyPn4i619iC_DZmA46A 2036    Non_Collab
Name: collab_window, dtype: object
UCjIGNWqzubjIOrdTJLhgM2w 2047    Non_Collab
Name: collab_window, dtype: object
UCjN8PEcGmXzaS8ALAPA6RpQ 2048    Non_Collab
Name: collab_window, dtype: object
UCjSsGvNrQyGT1-jJqu8f7Tg 2055    Non_Collab
Name: collab_window, dtype: object
UCjSuc_OjQJR6157Jk6AEylA 2056    Non_Collab
Name: collab_window, dtype: object
UCjTVNkbx5megxlnl1spy2jg 2059    Non_Collab
Name: collab_window, dtype: object
UCjVkjd2VWCWO-9DHRQER3Yg 2060    Non_Collab
Name: collab_window, dtype: object
UCjjUiyADYhfVHl3peKG7V-w 2071    Collab
Name: collab_window, dtype: object
UCjmG9pnCGCeird0UU0pQhBA 2076    Non_Collab
Name: collab_window, dtype: object
UCjpgDLE5LGOpzX2igCKAdSQ 2077    Non_Collab
Name: collab_window, dtype: object
UCk-FDoGNRa9XfXqF4ujMv7A 2080    Non_Collab
Name: collab_window, dtype: object
UCk40qSGYnVdFFBNXRjrvdpQ 2081    Non_Collab
Name: collab_window, dtype: object
UCk4ESa-9ODg77xLnXwnIycg 2082    Non_Collab
Name: collab_window, dtype: object
UCk70TYJHmZ_tsBKo-_SjP1w 2083    Non_Collab
Name: collab_window, dtype: object
UCk7WS3CwOVcit18wa64TR_g 2084    Non_Collab
Name: collab_window, dtype: object
UCkmzmIABuw5EIqGBYXlX3oA 2105    Non_Collab
Name: collab_window, dtype: object
UCkrbi2bmw7DQuuC8k4TP-QA 2108    Non_Collab
Name: collab_window, dtype: object
UCkuug-Gcff-YaxE4j4czN6g 2111    Non_Collab
Name: collab_window, dtype: object
UCl0KdGiwyqLJCdu5XMIz_TQ 2112    Non_Collab
Name: collab_window, dtype: object
UCl4Hj2X1BuwUMB1VXflC0Dw 2113    Non_Collab
Name: collab_window, dtype: object
UClGALQRCCeQvPVFOtxVb4_w 2114    Collab
Name: collab_window, dtype: object
UClGLOG9SdGYzPUlL4ur0zyQ 2115    Non_Collab
Name: collab_window, dtype: object
UClO7jPUJTldmuf3hl1BMtUQ 2118    Non_Collab
Name: collab_window, dtype: object
UClRVvM7X7zX65HY9P7Zllvg 2119    Non_Collab
Name: collab_window, dtype: object
UClVuVeEmTDkdMLIf1BsfrEg 2120    Non_Collab
Name: collab_window, dtype: object
UClkPeexHGUmFQ-nmNgItKjg 2127    Non_Collab
Name: collab_window, dtype: object
UClpEE-Led9ZK0GJQKvU--3Q 2134    Non_Collab
Name: collab_window, dtype: object
UCm-Cw7o_mLB_m0HYalgmOww 2139    Non_Collab
Name: collab_window, dtype: object
UCmGLoCludYo5uAivmD9TFmA 2148    Non_Collab
Name: collab_window, dtype: object
UCmKDDaxY2SZA1roVCSLv_gw 2151    Non_Collab
Name: collab_window, dtype: object
UCmRY4NSGK52lP_Lz11CjdYw 2160    Non_Collab
Name: collab_window, dtype: object
UCmrLCXSDScliR7q8AxxjvXg 2175    Non_Collab
Name: collab_window, dtype: object
UCn1XB-jvmd9fXMzhiA6IR0w 2178    Non_Collab
Name: collab_window, dtype: object
UCn4e8grbA-BO6bQnyyHQnZw 2179    Non_Collab
Name: collab_window, dtype: object
UCne29a_nfGMJcSebL04Rp-A 2204    Non_Collab
Name: collab_window, dtype: object
UCnmaW_YVh9iJAFVOsLR7W2w 2219    Non_Collab
Name: collab_window, dtype: object
UCo82BnWbczpxVAJxHMZ7j-w 2236    Non_Collab
Name: collab_window, dtype: object
UCo8wl3bJjAZGcP-XbaKJlbA 2237    Non_Collab
Name: collab_window, dtype: object
UCoBC1oha0suBKYjB0YPcNgg 2240    Non_Collab
Name: collab_window, dtype: object
UCoBb8VNqTyUXBCjjd4nINpA 2241    Non_Collab
Name: collab_window, dtype: object
UCoEnMbTB4v2L2TcqE11Bk9w 2244    Non_Collab
Name: collab_window, dtype: object
UCoIARAMkTkdjhegQAQ7_Xwg 2245    Non_Collab
Name: collab_window, dtype: object
UCoMl1m28jTVlkWkl7i8glUA 2250    Non_Collab
Name: collab_window, dtype: object
UCoXamg6CYWAptD93iLai2Vw 2259    Non_Collab
Name: collab_window, dtype: object
UCoaWkNlP1bjnfBVe1P1t7FA 2260    Non_Collab
Name: collab_window, dtype: object
UCofORWTh9OYkrls8ruk2e0g 2263    Non_Collab
Name: collab_window, dtype: object
UCoo8qQwz4w_rZXlA6orVp9g 2264    Non_Collab
Name: collab_window, dtype: object
UCopU6KXnK517ycc4tZyzm-g 2265    Non_Collab
Name: collab_window, dtype: object
UCoziFm3M4sHDq1kkx0UwtRw 2268    Non_Collab
Name: collab_window, dtype: object
UCp8JxAi_YsrFl0B3uwTPaeQ 2273    Non_Collab
Name: collab_window, dtype: object
UCpDmAxyuLoNrP-cek8tEfrw 2280    Non_Collab
Name: collab_window, dtype: object
UCpQbXq-6bWQB-nkr0zrA7rQ 2283    Non_Collab
Name: collab_window, dtype: object
UCpWSYhAy698xPkfDXd87GwA 2288    Non_Collab
Name: collab_window, dtype: object
UCpm6vCgiehSKHtMFFS5ukiA 2297    Non_Collab
Name: collab_window, dtype: object
UCpsSadsgX_Qk9i6i_bJoUwQ 2300    Non_Collab
Name: collab_window, dtype: object
UCqBUFFJsAmrAfJIkLNQZ7vA 2307    Non_Collab
Name: collab_window, dtype: object
UCqLJ8JDDF_bX0-DjL1UPW0A 2316    Non_Collab
Name: collab_window, dtype: object
UCqPOHbRVKeBUSjE68aEpQJQ 2321    Non_Collab
Name: collab_window, dtype: object
UCqZoRT8idKeVUsix0FSufTg 2326    Non_Collab
Name: collab_window, dtype: object
UCqdYOBFUUSSnpuQAn4kSu8g 2329    Non_Collab
Name: collab_window, dtype: object
UCqeXB2WsPdzcdCdP-bUc52g 2330    Non_Collab
Name: collab_window, dtype: object
UCqfgJZJpOW97AyRg-GFLUrg 2331    Non_Collab
Name: collab_window, dtype: object
UCqhF09vIluTp0cRW-gD04Yg 2332    Non_Collab
Name: collab_window, dtype: object
UCqjALWYAB9mx9G51481hp9Q 2333    Non_Collab
Name: collab_window, dtype: object
UCqmPTVjx3WErzZZ-wQ9lGtg 2338    Non_Collab
Name: collab_window, dtype: object
UCqq3PZwp8Ob8_jN0esCunIw 2339    Non_Collab
Name: collab_window, dtype: object
UCqu3u6ODlieZq0rCOzwN7XQ 2342    Non_Collab
Name: collab_window, dtype: object
UCqu_M0ARO7Sanbf_hte0Olw 2343    Non_Collab
Name: collab_window, dtype: object
UCrAfFruaVFe9e5XQ71lnrhw 2354    Collab
Name: collab_window, dtype: object
UCrKr3ozWvpCvNxvE9cMyWGg 2357    Collab
Name: collab_window, dtype: object
UCrUP38Gv4spk2rHoIxZmG3A 2364    Non_Collab
Name: collab_window, dtype: object
UCr_n3xjYfYbla84jdhi9FXA 2365    Non_Collab
Name: collab_window, dtype: object
UCrc6LVD4Fu7EEl-3e7Zau_Q 2366    Non_Collab
Name: collab_window, dtype: object
UCriRvZSnEFPSMqSya2s2U6g 2369    Non_Collab
Name: collab_window, dtype: object
UCrjr6QsBBEczaf9uucTWTow 2370    Non_Collab
Name: collab_window, dtype: object
UCs3iz985Yi0ldESJiIiInMA 2383    Non_Collab
Name: collab_window, dtype: object
UCs3o1YULWTaCCLMLU7siEWg 2384    Non_Collab
Name: collab_window, dtype: object
UCsDdQUPa4NvPvf2f00E5zfw 2393    Non_Collab
Name: collab_window, dtype: object
UCsDl6A77CrbkSJLtxQFVDGw 2394    Non_Collab
Name: collab_window, dtype: object
UCsEukrAd64fqA7FjwkmZ_Dw 2395    Non_Collab
Name: collab_window, dtype: object
UCsExQ2T3ltBVH5W_9V_ZRxg 2396    Non_Collab
Name: collab_window, dtype: object
UCsFctXdFnbeoKpLefdEloEQ 2399    Non_Collab
Name: collab_window, dtype: object
UCssEeXJQUvzZl-rQI9okPVA 2416    Non_Collab
Name: collab_window, dtype: object
UCt8OnQ7ztuLrPrehlj8ZuuQ 2433    Non_Collab
Name: collab_window, dtype: object
UCtGZ6Ph-C56xZ1luEJAjsGw 2434    Non_Collab
Name: collab_window, dtype: object
UCtaNepQqDZc0a-dfRlt7UZg 2439    Non_Collab
Name: collab_window, dtype: object
UCtaykeSsGhtn2o2BsPm-rsw 2440    Non_Collab
Name: collab_window, dtype: object
UCtct1n-5LxMRDuBOIW9uQqA 2441    Collab
Name: collab_window, dtype: object
UCtinMvHylejGpNWO5MMI-rQ 2444    Non_Collab
Name: collab_window, dtype: object
UCtm8vkLl55Nzt6WmC3MJvdA 2447    Non_Collab
Name: collab_window, dtype: object
UCtxVFBcBiLzePR4lle1r2XQ 2452    Non_Collab
Name: collab_window, dtype: object
UCu3KOyas91ceUYvkNzNR1Zw 2457    Non_Collab
Name: collab_window, dtype: object
UCu5MM6dgH9IgvBr8WDrn4Sg 2458    Non_Collab
Name: collab_window, dtype: object
UCuErSr7xeR763BzTJL7yJ7A 2461    Non_Collab
Name: collab_window, dtype: object
UCuNfQHhJVLi1aXeA6rnnETg 2464    Non_Collab
Name: collab_window, dtype: object
UCuTaqIjsWBBYi6i88sPLcYA 2471    Non_Collab
Name: collab_window, dtype: object
UCucHkPnBa7pB-jITCGuSqKw 2472    Non_Collab
Name: collab_window, dtype: object
UCugD1HAP3INAiXo70S_sAFQ 2475    Non_Collab
Name: collab_window, dtype: object
UCuqdsYT5rFqw0L5oO_HwvgA 2482    Non_Collab
Name: collab_window, dtype: object
UCv1WtbrMDyW_hAefXNeUVkA 2487    Non_Collab
Name: collab_window, dtype: object
UCv6e81arqlPt1iskoAfOehA 2494    Non_Collab
Name: collab_window, dtype: object
UCvNqgdEB6B2GpLMvzqWHy9w 2509    Non_Collab
Name: collab_window, dtype: object
UCvOwKoCHcZKyHuGNAezw0LA 2512    Non_Collab
Name: collab_window, dtype: object
UCvQ4A5xeNnVh78OJmulc4Ug 2513    Non_Collab
Name: collab_window, dtype: object
UCvUO8JDs1eTW7kM55FQY_lw 2514    Non_Collab
Name: collab_window, dtype: object
UCvVWoVROH-nfAiXfr4nFuDg 2515    Non_Collab
Name: collab_window, dtype: object
UCvVuqRzGVqRlmZYlTf99M_w 2516    Non_Collab
Name: collab_window, dtype: object
UCvWv_Ucv7PokghPI_ZY7MvA 2517    Non_Collab
Name: collab_window, dtype: object
UCvsH-OTxTuZ1bo1Z4XHn1Zg 2536    Non_Collab
Name: collab_window, dtype: object
UCvt04k2SNeu2mJ6oQx_qUpQ 2537    Non_Collab
Name: collab_window, dtype: object
UCw-hc7ZJummS0AvWyjUX56A 2544    Non_Collab
Name: collab_window, dtype: object
UCw3UZn1tcVe7pH3R6C3Gcng 2549    Non_Collab
Name: collab_window, dtype: object
UCw3YBvqEFnleB2wR7SVNA2A 2550    Non_Collab
Name: collab_window, dtype: object
UCw4ccFtBN7dhQBcHmE0qylg 2553    Non_Collab
Name: collab_window, dtype: object
UCw6Ou-fRcPa23GvYq-VHEbA 2554    Non_Collab
Name: collab_window, dtype: object
UCw7SNYrYei7F5ttQO3o-rpA 2555    Non_Collab
Name: collab_window, dtype: object
UCwNzLpk33dblyY2Gg-SyPxw 2562    Non_Collab
Name: collab_window, dtype: object
UCwgURKfUA7e0Z7_qE3TvBFQ 2577    Non_Collab
Name: collab_window, dtype: object
UCwi7FK3XDDwhioq1hV8Zmqg 2578    Non_Collab
Name: collab_window, dtype: object
UCwrVZxzfJ2s3JycGqfNiV1A 2581    Non_Collab
Name: collab_window, dtype: object
UCwsa-MpLNx4pnsM1PiQwhyQ 2582    Non_Collab
Name: collab_window, dtype: object
UCwz6oWkwVQng955vNgM5nNg 2587    Non_Collab
Name: collab_window, dtype: object
UCx9J7x3AV50rcdncU7cPacw 2590    Non_Collab
Name: collab_window, dtype: object
UCxJlPlbhLmM3pKrPsnidRuA 2593    Non_Collab
Name: collab_window, dtype: object
UCxOQyRmgJqHji6ItvllZmYg 2598    Non_Collab
Name: collab_window, dtype: object
UCxfiU1Yr2qwbhrvibG03aBw 2609    Non_Collab
Name: collab_window, dtype: object
UCxjXK2c8Ksp8jowgAZrZPDg 2612    Non_Collab
Name: collab_window, dtype: object
UCxlNE1UStmCupXMn7SdY7xw 2613    Non_Collab
Name: collab_window, dtype: object
UCxx9D4IRDkjFrPdQ72AqBNQ 2616    Non_Collab
Name: collab_window, dtype: object
UCxytOEPP99jj8mqVGAO7haQ 2619    Non_Collab
Name: collab_window, dtype: object
UCy-YumnMOFdhzYk3mkppJUw 2622    Non_Collab
Name: collab_window, dtype: object
UCy8WPFE1WEsCJYcYuzZpeXg 2627    Non_Collab
Name: collab_window, dtype: object
UCy9UJBnAfkGQFWe3R2JenzA 2628    Collab
Name: collab_window, dtype: object
UCyBgBxEuMM2sXMEuHHu8X6Q 2629    Non_Collab
Name: collab_window, dtype: object
UCyCyTe_1bT2aIPwG_gxbyeg 2634    Non_Collab
Name: collab_window, dtype: object
UCyDE9oEVAZwBtIsqMV26SIQ 2635    Non_Collab
Name: collab_window, dtype: object
UCyE4KeQg54Bx34LxQB22xkw 2638    Non_Collab
Name: collab_window, dtype: object
UCyGFZ2SQkAyOQllKcDESYeg 2639    Non_Collab
Name: collab_window, dtype: object
UCyISjICIlDZQhEC-Y0jMViw 2644    Non_Collab
Name: collab_window, dtype: object
UCyP6Mnxv84diX2StFMSa_Gg 2645    Non_Collab
Name: collab_window, dtype: object
UCyb0T6J6nL5ibo0HLRMINKw 2652    Non_Collab
Name: collab_window, dtype: object
UCyf3pMjaBl5ola2zmkp5dWA 2655    Non_Collab
Name: collab_window, dtype: object
UCyiPo3XZZ2_XiC8y9mqAqig 2656    Non_Collab
Name: collab_window, dtype: object
UCysmiBQRJc97SbSuBuZeRUQ 2659    Non_Collab
Name: collab_window, dtype: object
UCz3KPXSncFWfDkWfdUe_79w 2672    Non_Collab
Name: collab_window, dtype: object
UCz6tAQuMTRij-zv-crzP8sQ 2675    Collab
Name: collab_window, dtype: object
UCzBd-289owXoR9jwcCau84Q 2678    Non_Collab
Name: collab_window, dtype: object
UCzJIliq68IHSn-Kwgjeg2AQ 2681    Non_Collab
Name: collab_window, dtype: object
UCzK_qU_rnQ_W9CTKq7XTe3A 2682    Non_Collab
Name: collab_window, dtype: object
UCzN_QwrNpAMWxVIjZKKSkPQ 2683    Non_Collab
Name: collab_window, dtype: object
UCzNq9i-DlDDBLjPerVzJW-A 2684    Non_Collab
Name: collab_window, dtype: object
UCzNtNcKsSe-IKyOtjUeoRNA 2685    Non_Collab
Name: collab_window, dtype: object
UCzPu_uL2lOoXfgYVhjMQfSw 2686    Collab
Name: collab_window, dtype: object
UCz_BJGTYrM4ntr7vsrsjfLw 2693    Non_Collab
Name: collab_window, dtype: object
UCz_cDc_2arKIb6SlJoqFT0w 2694    Non_Collab
Name: collab_window, dtype: object
UCzbsojogXsEBljxVAHC5YIg 2697    Non_Collab
Name: collab_window, dtype: object
UCzdX8HbZrbGRwBkikmvpAXw 2698    Non_Collab
Name: collab_window, dtype: object
UCzo4OXE8JxogJHWJ2SypiNg 2701    Non_Collab
Name: collab_window, dtype: object
UCzzCZoXYp5JH7QKhWadOQcA 2704    Non_Collab
Name: collab_window, dtype: object
1116

In [259]:
# average difference, or growth between collab and noncollab, in%
fig, (ax1, ax2) = plt.subplots(ncols=2)
sns.violinplot(data=df_collab_growth, ax=ax1,color=sns.color_palette()[1])
sns.barplot(data=df_collab_growth, ci=95,errwidth=1., capsize=.1, ax=ax2,color=sns.color_palette()[1])
plt.legend(["{}% CI".format(95)])
ax1.set_xlabel('')
ax2.set_xlabel('')
ax1.set_ylabel('Factor %')
ax2.set_ylabel('mean(Factor %)')
fig.suptitle('Video Views Growth Factor %')
ax1.set_xticklabels([''])
ax2.set_xticklabels([''])
fig.tight_layout()
fig.subplots_adjust(top=0.86)
ax1.set_ylim([-250.0, 600.0])
ax2.set_ylim([15.0, 55.0])

for bar in ax2.patches:
    x = bar.get_x()
    width = bar.get_width()
    centre = x+width/2.
    newwidth = width/2
    bar.set_x(centre-newwidth/2.)
    bar.set_width(newwidth)
    
save_plot('collab_video_12day_growth_factor_box_ci_violin.pdf', fig, 2*s_width, 0.75*s_height)

print df_collab_growth.describe()


                 1
count  1108.000000
mean     34.322166
std     272.959967
min     -99.701347
25%     -37.295373
50%      -6.731477
75%      32.477690
max    6376.281237

In [244]:
# average difference, or growth between collab and noncollab, in%
fig, (ax1, ax2) = plt.subplots(ncols=2)
sns.boxplot(data=df_collab_growth, ax=ax1,color=sns.color_palette()[1])
sns.barplot(data=df_collab_growth, ci=95,errwidth=1., capsize=.1, ax=ax2,color=sns.color_palette()[1])
plt.legend(["{}% CI".format(95)])
ax1.set_xlabel('')
ax2.set_xlabel('')
ax1.set_ylabel('Factor %')
ax2.set_ylabel('mean(Factor %)')
fig.suptitle('Video Views Growth Factor %')
ax1.set_xticklabels([''])
ax2.set_xticklabels([''])
fig.tight_layout()
fig.subplots_adjust(top=0.86)
ax1.set_ylim([-120.0, 160.0])
ax2.set_ylim([15.0, 55.0])

for bar in ax2.patches:
    x = bar.get_x()
    width = bar.get_width()
    centre = x+width/2.
    newwidth = width/2
    bar.set_x(centre-newwidth/2.)
    bar.set_width(newwidth)
    
save_plot('collab_video_12day_growth_factor_box_ci.pdf', fig, 2*s_width, 0.75*s_height)

print df_collab_growth.describe()


                 1
count  1108.000000
mean     34.322166
std     272.959967
min     -99.701347
25%     -37.295373
50%      -6.731477
75%      32.477690
max    6376.281237

In [154]:
df_channel = df_channel.set_index('id')
df_channel.head()


Out[154]:
topicIds network viewCount subscriberCount videoCount commentCount category popularity
id
UC__Pj66OeDibNZNN__L913g Music None 3253022 23029 967 0 Entertainment 2
UC__PZLSRGtUQiTtvm3hPoEQ Movies BroadbandTV 310896 5878 144 0 Entertainment 1
UC__rmdgxs3ZF0zK_he7Tmig Lifestyle None 1291254 8146 294 121 How-to & Style 1
UC_-CxgsxX0tpnm24WO-797Q Lifestyle Maker Studios 625545 18990 67 101 How-to & Style 2
UC_1FUFB6TlGeGOyDI4ikkzg Movies BroadbandTV 89020205 106760 288 0 Entertainment 3

In [155]:
# use above data and split data into groups further to compare
# popualrity classes
# category class

# todo add popualrity class to channel and category
# read from file from channel notebook?
#

df_collab_growth_ext = df_collab_growth.copy()
df_collab_growth_ext.columns= ['id', 'growth']
df_collab_growth_ext['category'] = 'NaN'
df_collab_growth_ext['popularity'] = np.nan
df_collab_growth_ext.set_index('id', inplace=True)

for i, row in df_collab_growth_ext.iterrows():
    df_collab_growth_ext.loc[i, 'category'] = df_channel.loc[i, 'category']
    df_collab_growth_ext.loc[i, 'popularity'] = df_channel.loc[i, 'popularity']
    
df_collab_growth_ext.head()


Out[155]:
growth category popularity
id
UC-4kjzuh4822B9yPSgpZQgA 10.813472 Entertainment 4.0
UC-63s9JLCZqIDlhXK6VHb7w -89.944618 Entertainment 2.0
UC-A4oZF4AlOEdlyZWBCI0cQ -4.630593 Entertainment 4.0
UC-CRXGRabkMD7x6V_Nk9F0A -89.551831 Film & Animation 1.0
UC-NINtNMPM75eaqh07RCy_Q 93.463303 Education 3.0

In [156]:
# investigate groups from classes
df_collab_growth_category_groups = df_collab_growth_ext.groupby(by='category')
df_collab_growth_popularity_groups = df_collab_growth_ext.groupby(by='popularity')

print df_collab_growth_category_groups.count()
print df_collab_growth_popularity_groups.count()


                        growth  popularity
category                                  
Cars & Vehicles              7           7
Comedy                     128         129
Education                   40          40
Entertainment              387         387
Film & Animation            54          54
Gaming                      33          33
How-to & Style              91          93
Music                       60          61
News & Politics             12          12
Non-profits & Activism       2           2
People & Blogs             218         222
Pets & Animals               3           3
Science & Technology        34          34
Sports                      33          33
Travel & Events              6           6
            growth  category
popularity                  
0.0             25        25
1.0             78        78
2.0            278       282
3.0            532       536
4.0            186       186
5.0              8         8
6.0              1         1

In [157]:
test = df_collab_growth_category_groups.filter(lambda x: len(x) > 30)
test = test.groupby(by='category')

test_0 = test.get_group('Entertainment')
test_1 = test.get_group('People & Blogs')

In [185]:
fig = plt.figure()
ax =sns.barplot(x='popularity', y='growth', data=df_collab_growth_ext, ci=95,errwidth=1., capsize=.1)
plt.legend(["{}% CI".format(95)])
ax.set_xlabel('Popularity')
ax.set_ylabel('Views Growth Factor %')
fig.suptitle('Views Growth Factor % per Popularity')
ax.set_xticklabels(range(7))
fig.tight_layout()
fig.subplots_adjust(top=0.92)
ax.set_ylim([-50.0, 450.0])
save_plot('collab_video_growth_factor_popularities.pdf', fig, s_width, 1.5*s_height)


fig = plt.figure()
ax = sns.barplot(x='category', y='growth', data=df_collab_growth_ext, ci=95,errwidth=1., capsize=.1)
plt.legend(["{}% CI".format(95)], loc=2)
ax.set_xlabel('Category')
ax.set_ylabel('Views Growth Factor %')
fig.suptitle('Views Growth Factor % per Category')
#ax1.set_xticklabels([''])
ax.set_xticklabels(ax.get_xticklabels(),rotation=45)
plt.setp(ax.xaxis.get_majorticklabels(), ha='right')
ax.set_ylim([-100.0, 650.0])
fig.tight_layout()
fig.subplots_adjust(top=0.92)

save_plot('collab_video_growth_factor_categories.pdf', fig, 2*s_width, 1.5*s_height)



In [216]:
fig, (ax1, ax2) = plt.subplots(ncols=2, gridspec_kw = {'width_ratios':[1, 2]})

sns.barplot(x='popularity', y='growth', data=df_collab_growth_ext, ci=95,errwidth=1., capsize=.1, ax=ax1)
plt.legend(["{}% CI".format(95)])
#ax1.set_xlabel('Popularity')
ax1.xaxis.label.set_visible(False)
ax1.set_ylabel('Views Growth Factor %')
ax1.set_title('Views Growth Factor % per Popularity')
ax1.set_xticklabels(range(7))
ax1.set_ylim([-50.0, 450.0])
#fig.tight_layout()
sns.barplot(x='category', y='growth', data=df_collab_growth_ext, ci=95,errwidth=1., capsize=.1, ax=ax2)
#plt.legend(["{}% CI".format(95)], loc=2)
#ax2.set_xlabel('Category')
ax2.xaxis.label.set_visible(False)
ax2.set_ylabel('Views Growth Factor %')
ax2.set_title('Views Growth Factor % per Category')
#ax1.set_xticklabels([''])
ax2.set_xticklabels(ax2.get_xticklabels(),rotation=45)
plt.setp(ax2.xaxis.get_majorticklabels(), ha='right')
ax2.set_ylim([-100.0, 650.0])

fig.subplots_adjust(top=0.92)
#fig.tight_layout()
save_plot('collab_video_growth_factor_combined.pdf', fig, 4*s_width, 1.5*s_height)



In [186]:
fig, (ax1, ax2) = plt.subplots(ncols=2)
sns.violinplot(x='popularity', y='growth', data=df_collab_growth_ext, ax=ax1)
sns.barplot(x='popularity', y='growth', data=df_collab_growth_ext, ci=95,errwidth=1., capsize=.1, ax=ax2)
plt.legend(["{}% CI".format(95)])
ax1.set_xlabel('Popularity')
ax2.set_xlabel('Popularity')
ax1.set_ylabel('Views Growth Factor %')
ax2.set_ylabel('mean(Views Growth Factor %)')
fig.suptitle('Views Growth Factor % per Popularity')
#ax1.set_xticklabels([''])
#ax2.set_xticklabels([''])
ax1.set_xticklabels(range(7))
ax2.set_xticklabels(range(7))
fig.tight_layout()
#fig.subplots_adjust(top=0.86)
#ax1.set_ylim([-250.0, 800.0])
save_plot('collab_video_growth_factor_popularities_violin_bar.pdf', fig, 2.5*s_width, 1.5*s_height)



In [187]:
fig, (ax1, ax2) = plt.subplots(ncols=2)
sns.violinplot(x='category', y='growth', data=df_collab_growth_ext, ax=ax1,scale='width')
sns.barplot(x='category', y='growth', data=df_collab_growth_ext, ci=95,errwidth=1., capsize=.1, ax=ax2)
plt.legend(["{}% CI".format(95)])
#ax1.set_xlabel('')
#ax2.set_xlabel('')
ax1.set_ylabel('Views Growth Factor %')
ax2.set_ylabel('mean(Views Growth Factor %)')
fig.suptitle('Views Growth Factor % per Category')
#ax1.set_xticklabels([''])
#ax2.set_xticklabels([''])
ax1.set_xticklabels(ax1.get_xticklabels(),rotation=90)
ax2.set_xticklabels(ax2.get_xticklabels(),rotation=90)
fig.tight_layout()
#fig.subplots_adjust(top=0.86)
#ax1.set_ylim([-250.0, 800.0])
save_plot('collab_video_growth_factor_categories_violin_bar.pdf', fig, 3.5*s_width, 1.5*s_height)



In [171]:
# make plots for views etc. x days after collab
# mark history data with x day after
print df_video_history_collabs['crawlTimestamp'].dtype
df_video_history_collabs.head()


datetime64[ns]
Out[171]:
videoID viewCount commentCount likeCount dislikeCount crawlTimestamp gradient p_gradient collab_window day_after
id
1 qBZKeIbNDcE 9.0 0 1 0 2016-12-28 03:03:22 NaN NaN Non_Collab -1
2 2A6NSm9aSVQ 139.0 5 14 2 2016-12-28 03:03:23 NaN NaN Non_Collab -1
7 H_PpwvS8hA8 127.0 5 9 0 2016-12-28 03:04:08 NaN NaN Non_Collab -1
9 lsN0rGSfXWw 122.0 4 37 0 2016-12-28 03:04:36 NaN NaN Non_Collab -1
18 Azqy_VVj1PI 47536.0 2072 13325 60 2016-12-29 00:06:15 NaN NaN Non_Collab -1

In [173]:
df_video_history_collabs['day_after'] = -1

cnt=0

for name, group in video_history_collab_group:
    
    if df_videos_collabs.ix[name, 'collab_window'] == 'Collab':
        date = group.iloc[0, 5]
        df_video_history_collabs.ix[group.index, 'day_after']  = group['crawlTimestamp'].apply(lambda x: (x-date).days)

df_video_history_collabs.head()


Out[173]:
videoID viewCount commentCount likeCount dislikeCount crawlTimestamp gradient p_gradient collab_window day_after
id
1 qBZKeIbNDcE 9.0 0 1 0 2016-12-28 03:03:22 NaN NaN Non_Collab -1
2 2A6NSm9aSVQ 139.0 5 14 2 2016-12-28 03:03:23 NaN NaN Non_Collab -1
7 H_PpwvS8hA8 127.0 5 9 0 2016-12-28 03:04:08 NaN NaN Non_Collab -1
9 lsN0rGSfXWw 122.0 4 37 0 2016-12-28 03:04:36 NaN NaN Non_Collab -1
18 Azqy_VVj1PI 47536.0 2072 13325 60 2016-12-29 00:06:15 NaN NaN Non_Collab -1

In [203]:
df_video_history_collabs_test = df_video_history_collabs[(df_video_history_collabs.day_after < 6)]
df_video_history_collabs_test = df_video_history_collabs_test[(df_video_history_collabs_test.p_gradient < 10000)]
print df_video_history_collabs.day_after.unique()
print df_video_history_collabs_test.day_after.unique()


[ -1   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16
  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34
  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52
  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70
  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88
  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106
 107 108 109 110 111 112 113 114 115 116 117]
[-1  1  2  3  4  5  0]

In [275]:
fig = plt.figure()
ax = sns.boxplot(x=df_video_history_collabs_test["day_after"], y=df_video_history_collabs_test["gradient"], order=[0, 1, 2, 3, 4, 5, -1])
ax.set_xticklabels(['Day 0', 'Day 1', 'Day 2', 'Day 3', 'Day 4','Day 5', 'Non-Collab'])
ax.set_xlabel('Days')
ax.set_ylabel('Views')
ax.set_ylim([-6000.0, 120000.0])
save_plot('collab_video_6days_sep_views_box_s_zero.pdf', fig, 1.5*s_width, 1.5*s_height)



In [204]:
fig = plt.figure()
ax = sns.boxplot(x=df_video_history_collabs_test["day_after"], y=df_video_history_collabs_test["gradient"], order=[1, 2, 3, 4, 5, -1])
ax.set_xticklabels(['Day 1', 'Day 2', 'Day 3', 'Day 4','Day 5', 'Non-Collab'])
ax.set_xlabel('Days')
ax.set_ylabel('Views')
ax.set_ylim([-6000.0, 120000.0])
save_plot('collab_video_6days_sep_views_box_s.pdf', fig, 1.5*s_width, 1.5*s_height)

fig = plt.figure()
ax = sns.violinplot(x=df_video_history_collabs_test["day_after"], y=df_video_history_collabs_test["gradient"], order=[1, 2, 3, 4, 5, -1])
ax.set_xticklabels(['Day 1', 'Day 2', 'Day 3', 'Day 4','Day 5', 'Non-Collab'])
ax.set_xlabel('Days')
ax.set_ylabel('Views')
ax.set_ylim([-6000.0, 190000.0])
save_plot('collab_video_6days_sep_views_box_violin_s.pdf', fig, 1.5*s_width, 1.5*s_height)

fig = plt.figure()
ax1 = sns.barplot(x="day_after", y="gradient", data=df_video_history_collabs_test, ci=99, errwidth=1., capsize=.1, order=[1, 2, 3, 4, 5, -1])
ax1.set_xticklabels(['Day 1', 'Day 2', 'Day 3', 'Day 4','Day 5', 'Non-Collab'])
plt.legend(["{}% CI".format(99)])
ax1.set_ylabel('View Gradient')
ax2.set_ylabel('mean(View Gradient)')
save_plot('collab_video_6days_sep_views_ci_s.pdf', fig, 1.5*s_width, 1.5*s_height)

fig, (ax1, ax2) = plt.subplots(ncols=2)
sns.violinplot(x="collab_window", y="gradient", data=df_video_history_collabs_test, ax=ax1, order=['Collab', 'Non_Collab'])
ax1 = sns.barplot(x="collab_window", y="gradient", data=df_video_history_collabs_test,ax=ax2, ci=99, errwidth=1., capsize=.1, order=[1, 2, 3, 4, 5, -1])

plt.legend(["{}% CI".format(99)])
ax1.set_xlabel('')
ax2.set_xlabel('')
ax1.set_ylabel('View Gradient')
ax2.set_ylabel('mean(View Gradient)')
ax1.set_xticklabels(['Collaboration', 'Non-Collaboration'])
ax2.set_xticklabels(['Collaboration', 'Non-Collaboration'])
fig.suptitle('Video-based Channel View Growth (12 Days Window)')
fig.subplots_adjust(top=0.91)
fig.tight_layout()
ax1.set_ylim([-50000.0, 250000.0])
save_plot('collab_video_6days_sep_views_box_ci_violin.pdf', fig, 2*s_width, s_height)

print 'Non_Collabs:'
print df_video_history_collabs[df_video_history_collabs.collab_window=='Non_Collab']['gradient'].describe()
print '\nCollabs:'
print df_video_history_collabs[df_video_history_collabs.collab_window=='Collab']['gradient'].describe()


Non_Collabs:
count    3.815970e+06
mean     3.328708e+03
std      3.080674e+04
min     -8.462700e+04
25%      1.300000e+01
50%      1.040000e+02
75%      7.630000e+02
max      7.837219e+06
Name: gradient, dtype: float64

Collabs:
count    2.973410e+05
mean     4.159084e+03
std      3.633765e+04
min     -3.096500e+04
25%      1.900000e+01
50%      1.580000e+02
75%      1.212000e+03
max      5.809283e+06
Name: gradient, dtype: float64

In [202]:
len(df_video_history_collabs_test[df_video_history_collabs_test.p_gradient > 10000])


Out[202]:
645

In [258]:
fig = plt.figure()
ax1 = sns.barplot(x="day_after", y="gradient", data=df_video_history_collabs_test, ci=99, errwidth=1., capsize=.1, order=[1, 2, 3, 4, 5, -1])
ax1.set_xticklabels(['Day 1', 'Day 2', 'Day 3', 'Day 4','Day 5', 'Non-Collab'])
plt.legend(["{}% CI".format(99)])
ax1.set_ylabel('View Gradient')
#ax1.set_ylim([-20.0, 750.0])
ax1.xaxis.label.set_visible(False)
ax1.set_yscale('log')
plt.title('Video Views Gradient')
save_plot('collab_video_6days_sep_views_ci_s.pdf', fig, 1.5*s_width, s_height)

fig = plt.figure()
ax1 = sns.barplot(x="day_after", y="p_gradient", data=df_video_history_collabs_test, ci=99, errwidth=1., capsize=.1, order=[1, 2, 3, 4, 5, -1])
ax1.set_xticklabels(['Day 1', 'Day 2', 'Day 3', 'Day 4','Day 5', 'Non-Collab'])
plt.legend(["{}% CI".format(99)])
ax1.set_ylabel('View Growth %')
ax1.xaxis.label.set_visible(False)
ax1.set_yscale('log')
plt.title('Video Views Growth %')
#ax.set_ylim([-20.0, 750.0])
save_plot('collab_video_6days_sep_views_ci_perc_s.pdf', fig, 1.5*s_width, s_height)



In [206]:
fig = plt.figure()
ax = sns.boxplot(x=df_video_history_collabs_test["day_after"], y=df_video_history_collabs_test["p_gradient"], order=[1, 2, 3, 4, 5, -1])
ax.set_xticklabels(['Day 1', 'Day 2', 'Day 3', 'Day 4','Day 5', 'Non-Collab'])
ax.set_xlabel('Days')
ax.set_ylabel('Views')
#ax.set_ylim([-20.0, 750.0])
save_plot('collab_video_6days_sep_views_box_perc_s.pdf', fig, 1.5*s_width, 1.5*s_height)

fig = plt.figure()
ax = sns.violinplot(x=df_video_history_collabs_test["day_after"], y=df_video_history_collabs_test["p_gradient"], order=[1, 2, 3, 4, 5, -1])
ax.set_xticklabels(['Day 1', 'Day 2', 'Day 3', 'Day 4','Day 5', 'Non-Collab'])
ax.set_xlabel('Days')
ax.set_ylabel('Views')
#ax.set_ylim([-20.0, 750.0])
save_plot('collab_video_6days_sep_views_box_perc_violin_s.pdf', fig, 1.5*s_width, 1.5*s_height)

fig = plt.figure()
ax1 = sns.barplot(x="day_after", y="p_gradient", data=df_video_history_collabs_test, ci=99, errwidth=1., capsize=.1, order=[1, 2, 3, 4, 5, -1])
ax1.set_xticklabels(['Day 1', 'Day 2', 'Day 3', 'Day 4','Day 5', 'Non-Collab'])
plt.legend(["{}% CI".format(99)])
ax.set_xlabel('Days')
ax.set_ylabel('Views')
save_plot('collab_video_6days_sep_views_ci_perc_s.pdf', fig, 1.5*s_width, 1.5*s_height)

fig, (ax1, ax2) = plt.subplots(ncols=2)
sns.violinplot(x="collab_window", y="p_gradient", data=df_video_history_collabs_test, ax=ax1, order=['Collab', 'Non_Collab'])
sns.barplot(x="collab_window", y="p_gradient", data=df_video_history_collabs_test, ci=99, errwidth=1., capsize=.1, ax=ax2, order=['Collab', 'Non_Collab'])
plt.legend(["{}% CI".format(99)])
ax1.set_xlabel('')
ax2.set_xlabel('')
ax1.set_ylabel('View Growth %')
ax2.set_ylabel('mean(View Growth %)')
ax1.set_xticklabels(['Collaboration', 'Non-Collaboration'])
ax2.set_xticklabels(['Collaboration', 'Non-Collaboration'])
fig.suptitle('Video-based Channel View Growth % (12 Days Window)')
fig.tight_layout()
fig.subplots_adjust(top=0.86)
#ax1.set_ylim([-400.0, 1000.0])
save_plot('collab_video_6days_sep_views_box_ci_perc_violin.pdf', fig, 2*s_width, s_height)

print 'Non_Collabs:'
print df_video_history_collabs[df_video_history_collabs.collab_window=='Non_Collab']['p_gradient'].describe()
print '\nCollabs:'
print df_video_history_collabs[df_video_history_collabs.collab_window=='Collab']['p_gradient'].describe()


Non_Collabs:
count    3.815970e+06
mean     5.027533e+01
std      3.373144e+04
min     -5.923076e+01
25%      1.008216e-01
50%      3.408586e-01
75%      1.079559e+00
max      5.090080e+07
Name: p_gradient, dtype: float64

Collabs:
count    2.973410e+05
mean     6.125236e+02
std      3.291674e+05
min     -8.309859e+01
25%      1.045746e-01
50%      3.200044e-01
75%      9.523810e-01
max      1.794914e+08
Name: p_gradient, dtype: float64

In [ ]: