In [1]:
#!/usr/bin/env python
# coding=utf-8
# Calculates channel statistics, subscriber counts, popularity classes, views, networks, categories
# No collaboration related data
import pandas as pa
import numpy as np
import json
import os
import networkx as nx
import pygraphviz as gz
from networkx.drawing.nx_pydot import write_dot
import math
import matplotlib
import matplotlib.pyplot as plt
%matplotlib notebook
import itertools
import csv
from sqlalchemy import exists, func
from database import *
from matplotlib import pylab, pyplot
from matplotlib import dates
import seaborn as sns
sns.set(color_codes=True)
from scipy import stats, integrate
from datetime import datetime, timedelta, date
# Timestamp formats seen in the data: ISO-8601 with milliseconds + 'Z' (API),
# and a plain MySQL-style datetime.
date_format = '%Y-%m-%dT%H:%M:%S.%fZ'
date_format2 = '%Y-%m-%d %H:%M:%S'
plt.style.use(['seaborn-paper'])
sns.set_style("whitegrid")
#plt.rc('font', family='serif', serif='Charter')
plt.rc('font', family='serif', serif='DejaVu Serif')
# Font sizes in points, applied uniformly to all chart elements below.
SMALL_SIZE = 8
MEDIUM_SIZE = 9
BIGGER_SIZE = 13
plt.rc('font', size=MEDIUM_SIZE) # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE) # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE) # fontsize of the tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE) # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE) # legend fontsize
plt.rc('figure', titlesize=MEDIUM_SIZE) # fontsize of the figure title
# Figure sizes in inches: x_* = full text width, s_* = single column;
# heights follow the golden ratio (width / 1.618).
x_width = 6.8898
x_height = x_width / 1.618
s_width = 3.4449
s_height = s_width / 1.618
def save_plot(name, fig, width, height):
    """Resize `fig` to (width, height) inches and write it to CDIR/name.

    Depends on the module-level CDIR output directory. bbox_inches="tight"
    trims remaining whitespace on top of tight_layout().
    """
    fig.tight_layout()
    fig.set_size_inches(width, height)
    #f.subplots_adjust(top=0.86)
    # os.path.join instead of string concatenation: robust to a trailing
    # slash in CDIR and portable across platforms.
    fig.savefig(os.path.join(CDIR, name), bbox_inches="tight")
#plt.savefig(CDIR+'/video_view_percentages.pdf', bbox_inches="tight")
In [2]:
# Input/output locations for the 3-month evaluation window; charts go to CDIR.
DIR = '../../data/data_evaluation_3MONTHS'
CDIR = '../../data/data_evaluation_3MONTHS/charts'
# Project database wrapper (see database.py); provides engine + session scopes.
db = YTDatabase()
In [3]:
# Load two channel-history snapshots: Dec 28 (first) and Feb 28 (last) of the
# crawl period, selected by day/month of the crawl timestamp.
with db._session_scope(False) as session:
#channel_stats_first = session.query(ChannelHistory).filter( (func.day(crawlTimestamp) == 28) & (func.month(crawlTimestamp) == 12) ).all()
#channel_stats_last = session.query(ChannelHistory).filter( (func.day(crawlTimestamp) == 28) & (func.month(crawlTimestamp) == 2) ).all()
df_channel_stats_first = pa.read_sql(session.query(ChannelHistory).filter( (func.day(ChannelHistory.crawlTimestamp) == 28) & (func.month(ChannelHistory.crawlTimestamp) == 12) ).statement, db.engine)
df_channel_stats_last = pa.read_sql(session.query(ChannelHistory).filter( (func.day(ChannelHistory.crawlTimestamp) == 28) & (func.month(ChannelHistory.crawlTimestamp) == 2) ).statement, db.engine)
In [4]:
# Sanity check: distinct channels per snapshot vs. total ChannelHistory rows.
print df_channel_stats_first['channelID'].nunique()
print df_channel_stats_last['channelID'].nunique()
with db._session_scope(False) as session:
lenc = session.query(ChannelHistory).count()
print lenc
# Peek at the first snapshot (rendered as cell output).
df_channel_stats_first.head()
Out[4]:
In [5]:
# Central tendency of subscriber counts; min is taken over channels with
# at least one subscriber to exclude the zero-subscriber mass.
print df_channel_stats_first["subscriberCount"].mean()
print df_channel_stats_first["subscriberCount"].median()
print df_channel_stats_first["subscriberCount"][df_channel_stats_first.subscriberCount > 0.0].min()
In [6]:
# Histogram of subscriber counts over all channels (first snapshot).
fig = plt.figure()
ax = sns.distplot(df_channel_stats_first['subscriberCount'], bins=100, kde=False)
# Subscriber counts span many orders of magnitude, so use a log x-axis;
# symlog on y keeps bins with zero channels visible.
ax.set(xlabel='Subscriber', ylabel='Channel', xscale='log', yscale='symlog')
plt.title('Channel Subscriber Counts')
save_plot('channel_subscriber.pdf', fig, s_width, s_height)
In [7]:
# plotting popularity classes on subscriber range
fig = plt.figure()
ax = sns.distplot(df_channel_stats_first['subscriberCount'], kde=False, bins=500)
ax.set_xlabel('Subscriber')
ax.set_ylabel('Channel')
ax.set_xscale('log')
ax.set_yscale('symlog')
#ax.legend()
plt.title('Channel Subscriber Counts')
# Popularity class boundaries in subscribers; also reused by later cells
# (e.g. the legend column count of the network barplot).
classes = [1.0e+3, 1.0e+4, 1.0e+5, 1.0e+6, 1.0e+7, 5.0e+7, 1.0e+8]
for xc in classes:
ax.axvline(x=xc, color='red', linewidth=.5)
save_plot('channel_subscriber_with_classes.pdf', fig, s_width, s_height)
In [8]:
# Cumulative subscriber histogram with class boundaries (interactive only,
# not written to disk).
fig = plt.figure()
ax = df_channel_stats_first['subscriberCount'].hist(cumulative=True, bins=1000)
ax.set_xlabel('Subscriber')
ax.set_ylabel('Channel')
ax.set_xscale('log')
ax.set_yscale('log')
#ax.legend()
plt.title('Channel Subscriber')
classes = [1.0e+3, 1.0e+4, 1.0e+5, 1.0e+6, 1.0e+7, 5.0e+7, 1.0e+8]
for xc in classes:
ax.axvline(x=xc, color='red', linewidth=.5)
In [9]:
# Popularity class bin edges; consecutive pairs form [lower, upper) ranges.
bins = [0, 1.0e+3, 1.0e+4, 1.0e+5, 1.0e+6, 1.0e+7, 5.0e+7, 1.0e+8]
def pairwise(iterable):
    """Return consecutive overlapping pairs: s -> (s0,s1), (s1,s2), (s2,s3), ..."""
    trail, ahead = itertools.tee(iterable)
    # Advance the second copy by one so zipping pairs each element with its successor.
    next(ahead, None)
    return zip(trail, ahead)
# Materialize the class ranges as (lower, upper) tuples; class index = list position.
popularity_classes = []
for i, (a, b) in enumerate(pairwise(bins)):
#print a, b
popularity_classes.append((a, b))
popularity_classes
Out[9]:
In [10]:
# Number of channels per popularity class ([lower, upper) subscriber ranges).
class_counts = [d.count() for d in [df_channel_stats_first["subscriberCount"][(df_channel_stats_first.subscriberCount >= ca) & (df_channel_stats_first.subscriberCount < cb)] for ca, cb in pairwise(bins)]]
print class_counts
for ca, cb in pairwise(bins):
print '({}-{}) {}'.format(ca, cb, df_channel_stats_first["subscriberCount"][(df_channel_stats_first.subscriberCount >= ca) & (df_channel_stats_first.subscriberCount < cb)].count())
# make barplot with counts and popularity classes named (0..) and legend with ranges
fig = plt.figure()
# NOTE(review): index=[range(len(class_counts))] wraps the range in a list,
# which pandas interprets as a one-level MultiIndex; a plain
# range(len(class_counts)) was probably intended -- confirm before reuse.
df_counts = pa.DataFrame(class_counts, index=[range(len(class_counts))])
ax = sns.barplot(x=df_counts.index, y=df_counts[0], palette=sns.color_palette("Blues_d"))
ax.set_xlabel('Subscriber Classes')
ax.set_ylabel('Channel')
#ax.set_yscale('symlog')
plt.title('Channel Subscriber Classes')
save_plot('channel_subscriber_classes_counts.pdf', fig, s_width, s_height)
#[0, 1.0e+3, 1.0e+4, 1.0e+5, 1.0e+6, 1.0e+7, 5.0e+7, 1.0e+8]
#ax.legend(ax.patches, [r'$0: 0-10^3$', r'$1: 10^3-10^4$', r'$0: 0-10^3$', r'$0: 0-10^3$', r'$0: 0-10^3$', r'$0: 0-10^3$', r'$0: 0-10^3$'])
df_counts
Out[10]:
In [11]:
# Same subscriber distribution, but using the class edges themselves as bins.
fig = plt.figure()
ax = sns.distplot(df_channel_stats_first['subscriberCount'], kde=False, bins=bins)
ax.set_xlabel('Subscriber')
ax.set_ylabel('Channel')
ax.set_xscale('symlog')
ax.set_yscale('symlog')
#ax.legend()
plt.title('Channel Subscriber Classes')
Out[11]:
In [12]:
# Distributions of per-channel view and comment totals (first snapshot):
# same histogram recipe for both columns, so drive it from a small table.
for column, xlabel, title, fname in [
        ('viewCount', 'Views', 'Channel View Counts', 'channel_views.pdf'),
        ('commentCount', 'Comments', 'Channel Comment Counts', 'channel_comments.pdf')]:
    fig = plt.figure()
    ax = sns.distplot(df_channel_stats_first[column], kde=False, bins=100)
    ax.set(xlabel=xlabel, ylabel='Channel', xscale='log', yscale='symlog')
    plt.title(title)
    save_plot(fname, fig, s_width, s_height)
In [13]:
# These are the per-channel video counts reported by the API (lifetime totals,
# not restricted to our crawl window).
fig = plt.figure()
ax = sns.distplot(df_channel_stats_first['videoCount'], kde=False, bins=100)
ax.set_xlabel('Number of Videos')
ax.set_ylabel('Channel')
ax.set_xscale('log')
ax.set_yscale('symlog')
#ax.legend()
plt.title('Channel Video Counts')
save_plot('channel_videos.pdf', fig, s_width, s_height)
In [14]:
# get numbers of videos actually uploaded in our time-span, in our dataset
with db._session_scope(False) as session:
uploadsChannelID = pa.read_sql(session.query(Video.channelID).statement, db.engine)
# Uploads per channel = how often each channelID occurs in the Video table.
counts = uploadsChannelID['channelID'].value_counts()
fig = plt.figure()
ax = sns.distplot(counts, kde=False, bins=100)
ax.set_xlabel('Number of Videos')
ax.set_ylabel('Channel')
ax.set_xscale('log')
ax.set_yscale('symlog')
#ax.legend()
plt.title('Channel Video Uploads')
save_plot('channel_uploads.pdf', fig, s_width, s_height)
In [15]:
#fig = plt.figure()
# Linear regression of views on subscribers with a 99% confidence band.
# NOTE: lmplot returns a FacetGrid (not an Axes) -- hence set_xlabels/ax.fig.
ax = sns.lmplot('subscriberCount', 'viewCount', data=df_channel_stats_first, ci=99)
ax.set_xlabels('Subscriber')
ax.set_ylabels('Views')
plt.legend(['99% CI'])
plt.title('Channel View-Subscriber')
save_plot('channel_view_subscriber_reg.pdf', ax.fig, s_width, s_height)
In [16]:
DDIR = '../../data/'
# NOTE(review): `session` here leaks from the last closed _session_scope
# block above; this works only if the scope leaves the session usable --
# better to open a fresh scope. Verify against database.py.
df_channel = pa.read_sql(session.query(Channel).statement, db.engine)
df_channel = df_channel.set_index('id')
#df_channel[df_channel.network == 'Maker_Studios']['network'] = 'Maker Studios'
# Normalize the one underscored network name to its display form.
df_channel.loc[df_channel['network'] == 'Maker_Studios', 'network'] = 'Maker Studios'
print df_channel['network'].nunique()
print df_channel.loc[df_channel['network'] == 'None', 'network'].count()
df_channel.head()
Out[16]:
In [17]:
# Encode network names as integer codes and plot members per network code.
Xuniques, X = np.unique(df_channel['network'], return_inverse=True)
print len(Xuniques), X
dfnn = pa.DataFrame({ 'Network' : np.sort(X)})
fig = plt.figure()
ax = sns.distplot(dfnn, kde=False, bins=200)
ax.set_xlabel('Networks')
ax.set_ylabel('Member')
#ax.set_xscale('log')
ax.set_yscale('symlog')
#ax.legend()
plt.title('Network Member')
# Codes are arbitrary, so hide the x tick labels.
ax.set_xticklabels('')
save_plot('network_nof_channel_hist.pdf', fig, s_width, s_height)
counts = dfnn['Network'].value_counts()
print counts.median()
counts.describe()
Out[17]:
In [18]:
# Channel networks distribution
dfn = pa.DataFrame({ 'Network' : df_channel['network']})
#ax = sns.countplot(x="Network", data=dfn, palette="Greens_d");
fig = plt.figure()
# Keep NaN as its own bucket so unaffiliated channels are visible.
networkcounts = dfn['Network'].value_counts(dropna=False)
print len(networkcounts)
# Plot only networks with more than 20 member channels.
ax = networkcounts[networkcounts > 20.0].sort_values().plot(kind='barh')
networkcounts.to_csv(DIR+'/network_channel_counts.csv')
# networks_list (networks with >20 channels) is reused by later cells.
networks_list = networkcounts[networkcounts > 20.0].index
print networkcounts.head()
print len(networks_list)
print networks_list
ax.set_xlabel('Channel')
ax.set_ylabel('Network')
ax.set_xscale('log')
#ax.set_yscale('log')
#ax.legend()
plt.title('Network Member')
fig.tight_layout()
save_plot('network_nof_channel.pdf', fig, x_width, x_height)
In [19]:
# Channel topics distribution
# Load the topic-ID -> title mapping (tab-separated: id <TAB> title).
# A with-statement closes the file deterministically; the original open()
# call was never closed (handle leak).
with open(DDIR + 'topics.txt', 'r') as topics_file:
    topics = [x for x in csv.reader(topics_file, delimiter='\t')]
topicIDs = []
topicTitles = {}
for t, tt in topics:
    topicIDs.append(t)
    # Strip the "(parent topic)" suffix so parent and child titles align.
    topicTitles[t] = str(tt).replace('(parent topic)', '')
# Sentinel entries: '/m/None' for channels without topics, '/m/NaT' for IDs
# missing from the mapping (Freebase deprecated in 2017).
topicIDs.append('/m/None')
topicTitles['/m/None'] = 'None'
topicIDs.append('/m/NaT')
topicTitles['/m/NaT'] = 'Unknown ID'
# Flatten every channel's topic-ID list into one list of titles; a channel
# contributes one entry per topic (so totals exceed the channel count).
topiclist = []
for ct in df_channel['topicIds']:
if len(json.loads(ct))==0:
topiclist.append('/m/None')
for t in json.loads(ct):
if t in topicIDs: # Filter not supported topics (as of 2017, Freebase deprecated)
topiclist.append(t)
else:
topiclist.append('/m/NaT')
dft = pa.DataFrame({ 'Topic' : [topicTitles[t] for t in topiclist]})
fig = plt.figure()
ax = dft['Topic'].value_counts().sort_values(ascending=True).plot(kind='barh')
ax.set_xlabel('Channel')
ax.set_ylabel('Topic')
# symlog with a linear region below 10 keeps small counts readable.
ax.set_xscale('symlog', linthreshx=10)
#ax.set_yscale('log')
#ax.legend()
plt.title('Channel Topics')
fig.tight_layout()
save_plot('channel_topics.pdf', fig, x_width, 2*x_height)
In [21]:
# How many channels had no topic at all, plus the full topic frequency table.
print len(dft[dft.Topic=='None'])
dft['Topic'].value_counts()
Out[21]:
In [20]:
categories = [x for x in csv.reader(open(DDIR+'categories.txt','r'), delimiter='\t')]
catIDs = []
catTitles = {}
for t, tt in categories:
print t, tt
catIDs.append(int(t))
catTitles[int(t)]=tt
print len(catTitles)
In [21]:
# Gets all videos from every channel and checks for category most occuring
df_videos = pa.read_sql(session.query(Video).statement, db.engine)
print len(df_videos['channelID'].unique())
counts = df_videos.groupby(['channelID'])
dfcc = pa.DataFrame(index=df_channel.index, columns = ['category'])
for ch, s in counts.__iter__():
#print s['category']
dfcc.ix[ch, 'category'] = catTitles[int(s['category'].value_counts().idxmax())]
if len(s) <= 0:
print ch, s
#print s['category'].value_counts().max(), s['category'].value_counts().idxmax()
dfcc.ix[dfcc.category.isnull(), 'category'] = 'None'
print len(dfcc[dfcc.category.isnull()] )
dfcc.head()
Out[21]:
In [22]:
# Category coverage: total channels, channels without a category, and counts.
print len(dfcc)
print len(dfcc[dfcc.category=='None'])
dfcc.category.value_counts()
Out[22]:
In [23]:
# Horizontal barplot of channels per dominant category.
fig = plt.figure()
ax = dfcc['category'].value_counts().sort_values(ascending=True).plot(kind='barh')
ax.set_xlabel('Channel')
ax.set_ylabel('Category')
ax.set_xscale('log')
#ax.set_yscale('log')
#ax.legend()
plt.title('Channel Categories')
fig.tight_layout()
save_plot('channel_categories.pdf', fig, 1.5*s_width, 1.5*s_height)
In [24]:
# Build the per-channel feature table: topic IDs, network, first-snapshot
# statistics, dominant category, and a subscriber-based popularity class.
df_test = pa.DataFrame({ 'topicIds' : df_channel['topicIds'].apply(json.loads)})
df_test['network'] = df_channel['network']
df_stats = df_channel_stats_first.set_index('channelID')
df_test[['viewCount', 'subscriberCount', 'videoCount', 'commentCount']] = df_stats[['viewCount', 'subscriberCount', 'videoCount', 'commentCount']]
df_test['category'] = dfcc['category']
def to_popularity_class(subscribers):
    """Map a subscriber count to its class index in popularity_classes.

    Returns None for values outside every [lower, upper) range.
    Renamed from `func`, which shadowed sqlalchemy's `func` imported at the
    top of the file and would break any later func.day()/func.month() query.
    """
    for i, (ca, cb) in enumerate(popularity_classes):
        if subscribers >= ca and subscribers < cb:
            return i
df_test['popularity'] = df_test['subscriberCount'].apply(to_popularity_class)
df_test.head()
#df_test[np.isnan(df_test.popularity)]
#df_test['topicIds'].sort_values()
Out[24]:
In [25]:
def func(x):
# Map a list of topic IDs to ALL supported topic titles; empty/unsupported
# lists yield ['None'].
topiclist = []
for t in x:
if t in topicIDs: # Filter not supported topics (as of 2017, Freebase deprecated)
topiclist.append(topicTitles[t])
if len(topiclist)<=0:
topiclist.append(topicTitles['/m/None'])
return topiclist
def func2(x):
# Map a list of topic IDs to the FIRST supported topic title only.
# NOTE(review): if x is non-empty but contains no supported ID, neither
# return is reached and the result is implicitly None -- confirm intended.
topiclist = []
for t in x:
if t in topicIDs:
return topicTitles[t]
if len(list(x))<=0:
return topicTitles['/m/None']
# Copy BEFORE mutating: df_test_copy keeps full topic lists, df_test keeps
# one representative topic per channel.
df_test_copy = df_test.copy()
df_test_copy['topicIds'] = df_test_copy['topicIds'].apply(func)
df_test['topicIds'] = df_test['topicIds'].apply(func2)
print df_test[['topicIds', 'category']].head(10)
print df_test_copy[['topicIds', 'category']].head(10)
In [26]:
# Bubble chart: co-occurrence of dominant category vs. representative topic;
# bubble area encodes the group size scaled into [10, 100].
df_test.ix[df_test.category.isnull(), 'category'] = 'None'
groups = df_test.groupby(by=['category', 'topicIds'])
#print df_test.head()
test = groups.size()
print test.describe()
from sklearn.preprocessing import normalize, MinMaxScaler
#area = normalize(test.values[:,np.newaxis], axis=0).ravel()
min_max_scaler = MinMaxScaler(feature_range=(10, 100))
# NOTE(review): fit_transform on a 1-D array relies on old scikit-learn;
# newer versions require a 2-D input (reshape(-1, 1)).
area = min_max_scaler.fit_transform(test.values)
Xuniques, X = np.unique(df_test['category'], return_inverse=True)
Yuniques, Y = np.unique(df_test['topicIds'], return_inverse=True)
fig = plt.figure()
ax = sns.regplot(x=X, y=Y, fit_reg=False, scatter_kws={"s":area})
ax.set(xticks=range(len(Xuniques)), xticklabels=Xuniques,
yticks=range(len(Yuniques)), yticklabels=Yuniques)
ax.set_xticklabels(Xuniques, rotation=90, rotation_mode="anchor")
ax.set_xlabel('Category')
ax.set_ylabel('Topic')
ax.tick_params(axis='x', direction='out', pad=52)
# Size legend: reference bubbles for min/mid/max group sizes
# (values taken from test.describe() above -- update if the data changes).
leg = [1.0, 250.0, 680.0]
area2 = min_max_scaler.fit_transform(leg)
gll = plt.scatter([],[], s=area2[0], marker='o')
gl = plt.scatter([],[], s=area2[1], marker='o')
ga = plt.scatter([],[], s=area2[2], marker='o')
plt.legend((gll,gl,ga),
('1 (min)', '250', '680 (max)'),
scatterpoints=1,
loc='upper center', bbox_to_anchor=(-0.15, -0.05),
ncol=1,
fontsize=9)
plt.title('Channel Topic-Category')
#fig.tight_layout()
save_plot('channel_topic_category_rel.pdf', fig, x_width, 2.8*x_height)
In [27]:
# Bubble chart: popularity classes per network, restricted to networks with
# more than 20 channels (networks_list from the distribution cell above).
df_test_filtered = df_test[df_test.network.isin(networks_list)]
df_sizes = pa.DataFrame(index=df_test_filtered['network'].unique())
groups = df_test_filtered.groupby(by=['network', 'popularity'])
#print df_test_filtered.head()
test = groups.size()
counts = df_test_filtered['network'].value_counts()
Xuniques, X = np.unique(df_test_filtered['network'], return_inverse=True)
#print pa.DataFrame(X)[0].value_counts()
from sklearn.preprocessing import normalize, MinMaxScaler
print test.describe()
min_max_scaler = MinMaxScaler(feature_range=(10, 100))
# NOTE(review): 1-D fit_transform relies on old scikit-learn (see above).
area = min_max_scaler.fit_transform(test.values)
#print area
fig = plt.figure()
ax = sns.regplot(y=X, x=df_test_filtered['popularity'], fit_reg=False, scatter_kws={"s":area})
ax.set(yticks=range(len(Xuniques)), yticklabels=Xuniques)
#ax.set_xticklabels(Xuniques, rotation=45, rotation_mode="anchor")
#ax.tick_params(axis='x', direction='out', pad=52)
# Size legend values hardcoded from test.describe() -- update with the data.
leg = [1.0, 300.0, 739.0]
area2 = min_max_scaler.fit_transform(leg)
gll = plt.scatter([],[], s=area2[0], marker='o')
gl = plt.scatter([],[], s=area2[1], marker='o')
ga = plt.scatter([],[], s=area2[2], marker='o')
plt.legend((gll,gl,ga),
('1 (min)', '300', '739 (max)'),
scatterpoints=1,
loc='upper center', bbox_to_anchor=(-0.15, -0.05),
ncol=1,
fontsize=9)
ax.set_xlabel('Popularity Class')
ax.set_ylabel('Network')
plt.title('Network Popularity Classes')
fig.tight_layout()
save_plot('channel_network_classes_rel.pdf', fig, x_width, 1.5*x_height)
In [28]:
# Grouped barplot of the same network x popularity-class counts.
df_test_filtered = df_test[df_test.network.isin(networks_list)]
df_sizes = pa.DataFrame(index=df_test_filtered['network'].unique())
groups = df_test_filtered.groupby(by=['network', 'popularity'])
test = groups.size()
test = test.sort_values(ascending=False)
print len(test)
print len(test.index.levels[0])
# Flatten the MultiIndex into columns for seaborn.
test2 = test.reset_index()
test2.columns = ['network', 'popularity', 'count']
print test2.head()
# number of channels per network
#counts = df_test_filtered['network'].value_counts()
fig = plt.figure()
#test2.plot(kind='bar')
ax = sns.barplot(x='network', y='count', data=test2, hue='popularity')
ax.set_xticklabels(ax.get_xticklabels(),rotation=45, ha="right")
ax.set_xlabel('Network')
ax.set_ylabel('Number of Channels')
#ax.set_yscale('symlog', linthreshx=10)
ax.set_yscale('log')
# `classes` comes from the subscriber-classes cell above: one legend column
# per popularity class boundary.
l = ax.legend(ncol=len(classes))
l.set_title('Popularity Class')
plt.title('Network Popularity Classes')
fig.tight_layout()
save_plot('channel_network_classes_rel_bar.pdf', fig, 2*x_width, x_height)
In [29]:
# Numeric encodings of the categorical columns (prepared for a pairplot
# experiment that stayed commented out).
df_uniq = df_test.copy()
Xuniques, X = np.unique(df_test['category'], return_inverse=True)
Yuniques, Y = np.unique(df_test['topicIds'], return_inverse=True)
# Fix: original read `Yuniques, Z = ...`, silently overwriting Yuniques
# with the network labels (copy-paste slip).
Zuniques, Z = np.unique(df_test['network'], return_inverse=True)
df_uniq['category'] = X
df_uniq['topicIds'] = Y
df_uniq['network'] = Z
#fig = plt.figure()
#sns.pairplot(df_uniq)
In [30]:
# Final per-channel feature table (cell output).
df_test.head()
Out[30]:
In [31]:
# Persist the feature table for downstream evaluation notebooks.
df_test.to_csv(DIR+r'/df_channel_statistics_first_day.txt', sep=str('\t'), encoding='utf-8')
In [32]:
# Highest popularity class index actually assigned.
df_test['popularity'].max()
Out[32]:
In [33]:
# number of cluster per channel -> number of persons per channel (content creator)
# average etc.
In [34]:
# Join videos with their face clusters: maps each channelID to the face
# cluster IDs seen in its videos (groundwork for counting persons per channel).
with db._session_scope(False) as session:
#channel_stats_first = session.query(ChannelHistory).filter( (func.day(crawlTimestamp) == 28) & (func.month(crawlTimestamp) == 12) ).all()
#channel_stats_last = session.query(ChannelHistory).filter( (func.day(crawlTimestamp) == 28) & (func.month(crawlTimestamp) == 2) ).all()
df_channel_cluster = pa.read_sql(session.query(Video.channelID, VideoFaceCluster.cluster).filter( (VideoFaceCluster.featureID==VideoFeatures.id) & (VideoFeatures.videoID==Video.id)).statement, db.engine)
In [35]:
df_channel_cluster.head()
Out[35]:
In [ ]: