In [1]:
# Calculates the average channel viewcount growth per day, saves csv

import pandas as pa 
import numpy as np

import json
import os
import networkx as nx
import pygraphviz as gz
from networkx.drawing.nx_pydot import write_dot

import matplotlib.pyplot as plt
#%matplotlib notebook

import itertools

import csv
from sqlalchemy import exists, func

from database import *

from matplotlib import pylab, pyplot

import seaborn as sns
sns.set(color_codes=True)

current_palette = sns.color_palette()
first = current_palette[0]
second = current_palette[1]
sns.set_palette(
    [second, first] + current_palette[2:]
)

from scipy import stats, integrate

In [2]:
DIR = '../../data/data_evaluation_2'

db = YTDatabase()


/home/mlode/intel/intelpython27/lib/python2.7/site-packages/sqlalchemy/engine/default.py:470: Warning: Can't create database 'mlode'; database exists
  cursor.execute(statement, parameters)

In [3]:
with db._session_scope(False) as session:

    df_channel_history = pa.read_sql(session.query(ChannelHistory).statement, db.engine)

In [4]:
df_channel_history[df_channel_history.channelID == 'UC-lHJZR3Gqxm24_Vd_AJ5Yw']


Out[4]:
id channelID viewCount subscriberCount commentCount videoCount crawlTimestamp
58 59 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14132206795 51598211 289181 3057 2016-12-28 02:57:16
8000 8011 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14145979497 51645606 289181 3058 2016-12-29 00:00:06
15942 15963 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14158774806 51692274 289181 3059 2016-12-30 00:00:05
23883 23914 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14171475075 51731668 289181 3060 2016-12-31 00:00:07
31824 31865 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14182683968 51774245 289181 3061 2017-01-01 00:00:04
39765 39816 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14193935233 51807431 289181 3062 2017-01-02 00:00:10
47706 47767 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14203784589 51840406 289181 3063 2017-01-03 00:00:05
55647 55718 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14215372493 51896204 289181 3064 2017-01-04 00:00:06
63588 63669 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14227166293 51946586 289181 3065 2017-01-05 00:00:05
71529 71620 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14241617250 51980954 289181 3066 2017-01-06 00:00:05
79469 79570 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14253269056 52026530 289181 3067 2017-01-07 00:00:05
87409 87520 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14264283557 52083285 289181 3068 2017-01-08 00:00:06
95348 95469 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14289672774 52143613 289181 3069 2017-01-09 00:00:08
103417 103548 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14304695125 52190347 289181 3070 2017-01-10 00:00:14
111357 111488 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14304695125 52192493 289181 3070 2017-01-10 01:38:51
119297 119428 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14319045407 52228058 289181 3071 2017-01-11 00:00:16
127236 127367 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14329657330 52271395 289181 3072 2017-01-12 00:00:14
135174 135305 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14340411572 52316867 289181 3073 2017-01-13 00:00:13
143113 143244 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14352162791 52366118 289181 3074 2017-01-14 00:00:12
151054 151185 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14425610123 52407628 289181 3075 2017-01-15 00:00:12
158990 159121 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14425610123 52440435 289181 3076 2017-01-16 00:00:32
166926 167057 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14560497168 52471855 289181 3077 2017-01-17 00:00:13
174860 174991 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14607679435 52496111 289181 3078 2017-01-18 00:00:13
182795 182926 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14642309986 52529243 289181 3079 2017-01-19 00:00:13
190731 190862 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14642526557 52559346 289181 3080 2017-01-20 00:00:14
198658 198789 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14652762481 52585356 289181 3081 2017-01-21 00:00:13
206588 206719 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14663068832 52610579 289181 3082 2017-01-22 00:00:13
214519 214650 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14672851674 52644148 289181 3083 2017-01-23 00:00:13
222451 222582 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14685572158 52667438 289181 3084 2017-01-24 00:00:18
230373 230504 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14469166045 52687945 289181 3086 2017-01-25 00:00:14
... ... ... ... ... ... ... ...
270009 270140 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14525527005 52804819 289181 3096 2017-01-30 00:00:22
277931 278062 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14540687031 52832083 289181 3097 2017-01-31 00:00:23
285855 285986 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14556966534 52865237 289181 3098 2017-02-01 00:00:39
293779 293910 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14568574240 52916409 289181 3099 2017-02-02 00:00:14
301703 301834 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14568574240 52946657 289181 3100 2017-02-03 00:00:20
309621 309752 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14580396577 52967242 289181 3101 2017-02-04 00:00:12
317543 317674 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14600189072 53001768 289181 3102 2017-02-05 00:00:12
325464 325595 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14611453836 53034878 289181 3103 2017-02-06 00:00:13
333387 333518 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14623571274 53054674 289181 3104 2017-02-07 00:00:23
341307 341438 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14633497298 53063197 289181 3105 2017-02-08 00:00:20
349230 349361 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14642030603 52749933 289181 3107 2017-02-09 00:00:20
357154 357285 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14642030603 53394491 289181 3107 2017-02-10 00:00:18
365076 365207 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14657485933 53312437 289181 3109 2017-02-11 00:00:35
372995 373126 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14666114013 53233889 289181 3110 2017-02-12 00:00:13
380921 381052 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14676534988 53146966 289181 3110 2017-02-13 00:01:29
388841 388972 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14676534988 53167390 289181 3111 2017-02-14 00:00:24
396762 396893 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14685511029 53195457 289181 3112 2017-02-15 00:00:31
404682 404813 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14694323042 53249565 289181 3111 2017-02-16 00:01:27
412603 412734 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14709814807 53314050 289181 3111 2017-02-17 00:00:14
420523 420654 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14709370995 53437836 289181 3112 2017-02-18 00:00:26
428441 428572 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14711456115 53519122 289181 3113 2017-02-19 00:00:26
436359 436490 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14723629799 53582988 289181 3113 2017-02-20 00:02:01
444276 444407 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14735225497 53635032 289181 3113 2017-02-21 00:00:13
452193 452324 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14743493368 53669525 289181 3114 2017-02-22 00:00:14
460108 460239 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14749942951 53694394 289181 3115 2017-02-23 00:00:14
468025 468156 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14757843312 53730804 289181 3116 2017-02-24 00:00:13
475942 476073 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14766470763 53780338 289181 3116 2017-02-25 00:01:30
483857 483988 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14775589755 53826526 289181 3118 2017-02-26 00:00:13
491770 491901 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14782568924 53866011 289181 3119 2017-02-27 00:00:13
499687 499818 UC-lHJZR3Gqxm24_Vd_AJ5Yw 14782568924 53890765 289181 3120 2017-02-28 00:00:13

64 rows × 7 columns


In [5]:
channel_groups = df_channel_history.groupby(['channelID'])
counts = channel_groups.count().sort_values(by='viewCount')

print counts['id'].count()
print counts[counts.id != 64]['id'].count()


7942
30

In [6]:
group = channel_groups.get_group('UCK1i2UviaXLUNrZlAFpw_jA')
group = group.sort_values(by='id')
group['diff'] = group['viewCount'] - group['viewCount'].shift(1)

group


Out[6]:
id channelID viewCount subscriberCount commentCount videoCount crawlTimestamp diff
3919 3925 UCK1i2UviaXLUNrZlAFpw_jA 6294348732 5017530 101 321 2016-12-28 03:00:14 NaN
11859 11875 UCK1i2UviaXLUNrZlAFpw_jA 6306806646 5025907 101 322 2016-12-29 00:03:08 12457914.0
19802 19828 UCK1i2UviaXLUNrZlAFpw_jA 6319562857 5036506 101 322 2016-12-30 00:03:07 12756211.0
27743 27779 UCK1i2UviaXLUNrZlAFpw_jA 6332654585 5047671 101 322 2016-12-31 00:03:04 13091728.0
35684 35730 UCK1i2UviaXLUNrZlAFpw_jA 6345655230 5058529 101 323 2017-01-01 00:03:07 13000645.0
43624 43680 UCK1i2UviaXLUNrZlAFpw_jA 6358705350 5071510 101 319 2017-01-02 00:03:09 13050120.0
51566 51632 UCK1i2UviaXLUNrZlAFpw_jA 6266832291 5082663 101 316 2017-01-03 00:03:07 -91873059.0
59507 59583 UCK1i2UviaXLUNrZlAFpw_jA 6278092542 5093290 101 316 2017-01-04 00:03:06 11260251.0
67448 67534 UCK1i2UviaXLUNrZlAFpw_jA 6253900053 5103742 101 317 2017-01-05 00:03:04 -24192489.0
75387 75483 UCK1i2UviaXLUNrZlAFpw_jA 6266468980 5114273 101 317 2017-01-06 00:02:50 12568927.0
83329 83435 UCK1i2UviaXLUNrZlAFpw_jA 6279259643 5125283 101 317 2017-01-07 00:02:49 12790663.0
91269 91385 UCK1i2UviaXLUNrZlAFpw_jA 6291982307 5136093 101 318 2017-01-08 00:02:58 12722664.0
99207 99333 UCK1i2UviaXLUNrZlAFpw_jA 6316604422 5148306 101 318 2017-01-09 00:03:43 24622115.0
107276 107407 UCK1i2UviaXLUNrZlAFpw_jA 6329725242 5159316 101 318 2017-01-10 00:03:47 13120820.0
115216 115347 UCK1i2UviaXLUNrZlAFpw_jA 6329725242 5160681 101 318 2017-01-10 01:42:26 0.0
123154 123285 UCK1i2UviaXLUNrZlAFpw_jA 6342209335 5170440 101 318 2017-01-11 00:03:38 12484093.0
131091 131222 UCK1i2UviaXLUNrZlAFpw_jA 6355424348 5180700 101 319 2017-01-12 00:03:39 13215013.0
139034 139165 UCK1i2UviaXLUNrZlAFpw_jA 6368409828 5191569 101 319 2017-01-13 00:03:38 12985480.0
146972 147103 UCK1i2UviaXLUNrZlAFpw_jA 6381602848 5203298 101 319 2017-01-14 00:03:38 13193020.0
154909 155040 UCK1i2UviaXLUNrZlAFpw_jA 6451807974 5214389 101 321 2017-01-15 00:03:37 70205126.0
162847 162978 UCK1i2UviaXLUNrZlAFpw_jA 6451807974 5226920 101 321 2017-01-16 00:04:08 0.0
170785 170916 UCK1i2UviaXLUNrZlAFpw_jA 6737444777 5237705 101 321 2017-01-17 00:03:54 285636803.0
178718 178849 UCK1i2UviaXLUNrZlAFpw_jA 6801836413 5248584 101 322 2017-01-18 00:03:51 64391636.0
186654 186785 UCK1i2UviaXLUNrZlAFpw_jA 6840388774 5259033 101 323 2017-01-19 00:03:45 38552361.0
194588 194719 UCK1i2UviaXLUNrZlAFpw_jA 6843212533 5269945 101 323 2017-01-20 00:03:59 2823759.0
202519 202650 UCK1i2UviaXLUNrZlAFpw_jA 6856931051 5280974 101 323 2017-01-21 00:03:56 13718518.0
210446 210577 UCK1i2UviaXLUNrZlAFpw_jA 6870906289 5291744 101 324 2017-01-22 00:04:07 13975238.0
218375 218506 UCK1i2UviaXLUNrZlAFpw_jA 6884597434 5303505 101 324 2017-01-23 00:04:08 13691145.0
226303 226434 UCK1i2UviaXLUNrZlAFpw_jA 6898367762 5313862 101 324 2017-01-24 00:03:36 13770328.0
234229 234360 UCK1i2UviaXLUNrZlAFpw_jA 6624656213 5324569 101 324 2017-01-25 00:04:07 -273711549.0
... ... ... ... ... ... ... ... ...
273862 273993 UCK1i2UviaXLUNrZlAFpw_jA 6692786929 5378296 101 326 2017-01-30 00:03:59 13833940.0
281785 281916 UCK1i2UviaXLUNrZlAFpw_jA 6706945237 5388828 101 326 2017-01-31 00:04:19 14158308.0
289710 289841 UCK1i2UviaXLUNrZlAFpw_jA 6720073990 5399574 101 327 2017-02-01 00:04:21 13128753.0
297631 297762 UCK1i2UviaXLUNrZlAFpw_jA 6733479120 5410727 101 327 2017-02-02 00:03:54 13405130.0
305551 305682 UCK1i2UviaXLUNrZlAFpw_jA 6733479120 5421484 101 327 2017-02-03 00:04:20 0.0
313467 313598 UCK1i2UviaXLUNrZlAFpw_jA 6790264674 5431790 101 327 2017-02-04 00:03:30 56785554.0
321394 321525 UCK1i2UviaXLUNrZlAFpw_jA 6803763339 5442407 101 328 2017-02-05 00:03:08 13498665.0
329317 329448 UCK1i2UviaXLUNrZlAFpw_jA 6817556756 5454245 101 328 2017-02-06 00:04:08 13793417.0
337237 337368 UCK1i2UviaXLUNrZlAFpw_jA 6831701922 5464491 101 328 2017-02-07 00:03:37 14145166.0
345162 345293 UCK1i2UviaXLUNrZlAFpw_jA 6844550076 5475105 101 328 2017-02-08 00:03:15 12848154.0
353083 353214 UCK1i2UviaXLUNrZlAFpw_jA 6857855691 5484462 101 328 2017-02-09 00:03:30 13305615.0
361004 361135 UCK1i2UviaXLUNrZlAFpw_jA 6857855691 5493898 101 328 2017-02-10 00:04:06 0.0
368928 369059 UCK1i2UviaXLUNrZlAFpw_jA 6884995129 5510239 101 328 2017-02-11 00:04:25 27139438.0
376848 376979 UCK1i2UviaXLUNrZlAFpw_jA 6899615642 5523990 101 329 2017-02-12 00:03:47 14620513.0
384772 384903 UCK1i2UviaXLUNrZlAFpw_jA 6914273359 5538658 101 329 2017-02-13 00:04:57 14657717.0
392694 392825 UCK1i2UviaXLUNrZlAFpw_jA 6914273359 5551816 101 329 2017-02-14 00:03:55 0.0
400613 400744 UCK1i2UviaXLUNrZlAFpw_jA 6942558695 5564732 101 329 2017-02-15 00:04:07 28285336.0
408536 408667 UCK1i2UviaXLUNrZlAFpw_jA 6957278426 5576172 101 330 2017-02-16 00:05:00 14719731.0
416453 416584 UCK1i2UviaXLUNrZlAFpw_jA 6970959469 5587323 101 330 2017-02-17 00:04:01 13681043.0
424372 424503 UCK1i2UviaXLUNrZlAFpw_jA 6984386046 5598240 101 330 2017-02-18 00:04:30 13426577.0
432291 432422 UCK1i2UviaXLUNrZlAFpw_jA 6997541002 5610310 101 331 2017-02-19 00:04:18 13154956.0
440208 440339 UCK1i2UviaXLUNrZlAFpw_jA 7011107614 5622978 101 331 2017-02-20 00:05:45 13566612.0
448123 448254 UCK1i2UviaXLUNrZlAFpw_jA 7024589993 5634352 101 331 2017-02-21 00:03:50 13482379.0
456041 456172 UCK1i2UviaXLUNrZlAFpw_jA 7037167627 5645777 101 331 2017-02-22 00:03:56 12577634.0
463955 464086 UCK1i2UviaXLUNrZlAFpw_jA 7050276769 5659027 101 332 2017-02-23 00:04:01 13109142.0
471873 472004 UCK1i2UviaXLUNrZlAFpw_jA 7063677215 5670555 101 332 2017-02-24 00:03:39 13400446.0
479791 479922 UCK1i2UviaXLUNrZlAFpw_jA 7078362058 5681969 101 332 2017-02-25 00:05:06 14684843.0
487704 487835 UCK1i2UviaXLUNrZlAFpw_jA 7092384134 5693290 101 333 2017-02-26 00:03:42 14022076.0
495620 495751 UCK1i2UviaXLUNrZlAFpw_jA 7106006470 5705418 101 333 2017-02-27 00:03:40 13622336.0
503534 503665 UCK1i2UviaXLUNrZlAFpw_jA 7106006470 5717241 101 333 2017-02-28 00:03:35 0.0

64 rows × 8 columns


In [7]:
fig = plt.figure()
group['viewCount'].plot()
plt.show()
fig1 = plt.figure()
group['diff'].plot()
plt.show()

print group['diff'].mean()


12883456.1587

In [8]:
with db._session_scope(False) as session:
    df_channel_avg_viewcounts = pa.read_sql(session.query(Channel.id, Channel.title).statement, db.engine)

df_channel_avg_viewcounts = df_channel_avg_viewcounts.set_index(['id'])
df_channel_avg_viewcounts['mean viewCount'] = np.nan


for name, group in channel_groups:
    ##print name
    group = channel_groups.get_group(name)
    group = group.sort_values(by='id')
    group['diff'] = group['viewCount'] - group['viewCount'].shift(1)
    df_channel_avg_viewcounts.loc[name, 'mean viewCount'] = group['diff'].mean()


df_channel_avg_viewcounts.head()


Out[8]:
title mean viewCount
id
UC__Pj66OeDibNZNN__L913g TRACKS - ARTE 3640.666667
UC__PZLSRGtUQiTtvm3hPoEQ Cripta dos Mistérios 1854.222222
UC__rmdgxs3ZF0zK_he7Tmig Contoured Living 770.365079
UC_-CxgsxX0tpnm24WO-797Q Mega Gumelar 4623.412698
UC_1FUFB6TlGeGOyDI4ikkzg Best Games For Kids TV 50151.746032

In [9]:
df_channel_avg_viewcounts.loc['UC-lHJZR3Gqxm24_Vd_AJ5Yw']


Out[9]:
title               PewDiePie
mean viewCount    1.03232e+07
Name: UC-lHJZR3Gqxm24_Vd_AJ5Yw, dtype: object

In [10]:
df_avg_viewcounts_sorted = df_channel_avg_viewcounts.sort_values(by='mean viewCount', ascending=False)

df_avg_viewcounts_sorted.to_csv(DIR+r'/df_channel_avg_viewcounts.txt', sep=str('\t'), encoding='utf-8')

In [ ]: