In [38]:
#!/usr/bin/env python
# coding=utf-8

import pandas as pa 
import numpy as np

import json
import os
import networkx as nx
import pygraphviz as gz
from networkx.drawing.nx_pydot import write_dot
import math


from scipy.stats import norm, normaltest, mannwhitneyu, ranksums

import matplotlib
import matplotlib.pyplot as plt
%matplotlib notebook

import itertools

import csv
from sqlalchemy import exists, func, and_

from database import *

from matplotlib import pylab, pyplot
from matplotlib import dates

import seaborn as sns
sns.set(color_codes=True)

from scipy import stats, integrate

from datetime import datetime, timedelta, date

# Timestamp formats produced by the crawler / YouTube API.
date_format = '%Y-%m-%dT%H:%M:%S.%fZ'   # ISO 8601 with milliseconds and literal 'Z'
date_format2 = '%Y-%m-%d %H:%M:%S'      # plain SQL-style timestamp

# Paper-ready plot defaults.
plt.style.use(['seaborn-paper'])
sns.set_style("whitegrid")
#plt.rc('font', family='serif', serif='Charter')   # alternative serif face
plt.rc('font', family='serif', serif='DejaVu Serif')

SMALL_SIZE = 8
MEDIUM_SIZE = 9
BIGGER_SIZE = 13

# Use one consistent font size for every text element of the figures.
plt.rc('font', size=MEDIUM_SIZE)
plt.rc('axes', titlesize=MEDIUM_SIZE, labelsize=MEDIUM_SIZE)
plt.rc('xtick', labelsize=MEDIUM_SIZE)
plt.rc('ytick', labelsize=MEDIUM_SIZE)
plt.rc('legend', fontsize=MEDIUM_SIZE)
plt.rc('figure', titlesize=MEDIUM_SIZE)

# Figure dimensions in inches: full text width and single column,
# both with a golden-ratio aspect.
x_width  = 6.8898
x_height = x_width / 1.618

s_width  = 3.4449
s_height = s_width / 1.618

def save_plot(name, fig, width, height):
    """Resize `fig` to (width, height) inches and write it to CDIR/name.

    Uses bbox_inches="tight" so labels outside the axes are not clipped.
    """
    fig.set_size_inches(width, height)
    fig.savefig(os.path.join(CDIR, name), bbox_inches="tight")

In [39]:
# Input/output locations for the 3-month evaluation data set.
DIR = '../../data/data_evaluation_3MONTHS'
CDIR = '../../data/data_evaluation_3MONTHS/charts'  # chart output dir used by save_plot()

# Project-local SQLAlchemy wrapper (imported via `from database import *`).
db = YTDatabase()

In [3]:
# get video infos

# Pull the entire Video table into a DataFrame (read-only session).
with db._session_scope(False) as session:
    video_stmt = session.query(Video).statement
    df_videos = pa.read_sql(video_stmt, db.engine)

In [4]:
# Parse the string timestamps into datetime values.
# Vectorized pa.to_datetime replaces the former per-row
# `.apply(lambda x: datetime.strptime(...))`, which is much slower;
# the resulting Timestamps support the same .weekday()/.time()/.minute/
# .strftime accessors used below.
df_videos['dateAdded'] = pa.to_datetime(df_videos['dateAdded'], format=date_format)
df_videos['crawlTimestamp'] = pa.to_datetime(df_videos['crawlTimestamp'], format=date_format2)

df_videos.head()


Out[4]:
id channelID title description category dateAdded tags topicIds attribution duration crawlTimestamp deleted
0 __00T6BTcLk UCVD-u8fo_n5EnUkKNrrjEGw CS:GO - Clutch or Kick! #82 Submit a Demo for an upcoming Episode: http://... 20 2017-03-01 19:26:17 ["clayman90","clayman","rechyyy","rech","cok",... [] None PT3M18S 2017-03-02 00:13:34 None
1 __3DpIJilQI UCeE3lj6pLX_gCd0Yvns517Q Ozzy Man Reviews: Cricket Nut Shots Me commentary on some of the most eye watering... 24 2017-01-10 07:00:30 ["ozzy man reviews","ozzy","man","ozzie","ozzy... [] None PT1M54S 2017-01-11 00:09:08 True
2 __3e1lrV5c8 UCajyfb5EUPd3f3deUk0dBEQ NEUE DM ZAHNBÜRSTEN PINSEL im LIVE TEST | Mami... Neu bei dm gibt es die gehypten Zahnbürsten Pi... 26 2017-02-19 11:30:00 ["mamiseelen","dm zahnb\u00fcrsten pinsel","li... ["/m/0yxzwny","/m/012xff"] None PT7M41S 2017-02-20 00:10:55 None
3 __3Pz5Hmm4c UCw6Ou-fRcPa23GvYq-VHEbA IS LONZO BALL'S $495 SIGNATURE SHOE THE ZO2 WO... ENJOY AND LEAVE A LIKE!\nPodcast Link - https:... 26 2017-05-06 04:12:59 ["lonzo","ball","zo2","lonzo ball sneaker","bi... [] None PT11M16S 2017-05-07 00:12:42 None
4 __40p5sII08 UCOt4EtXTO6dDFSfcpAoWJIQ ЭДВАРД РУКИ НОЖНИЦЫ. FTF 2017. Гомель Фрагмент трансляции с Фри Тайм Фест. Гомель 2017 20 2017-04-03 10:00:11 ["Anime","\u0424\u0422\u0424","FTF","2017","\u... [] None PT4M52S 2017-04-04 00:13:14 None

In [5]:
# duration calculation
# Total and per-video duration statistics over all crawled videos.
import isodate

# ISO 8601 durations (e.g. 'PT3M18S') -> seconds.
durations = [isodate.parse_duration(dt).total_seconds()
             for dt in df_videos['duration']]

df_duration = pa.DataFrame(durations)
# Print scalar totals (the old `print series` form dumped the Series repr);
# the second line is hours, not standard deviation -- the old label 'std'
# (German "Stunden") was misleading.
print('Sum: {0} seconds'.format(df_duration[0].sum()))
print('Sum: {0} hours'.format(df_duration[0].sum() / 3600))
print(df_duration.describe())


Sum: 0    280529577.0
dtype: float64 seconds
Sum: 0    77924.8825
dtype: float64 std
                   0
count  250420.000000
mean     1120.236311
std      2170.378819
min         1.000000
25%       279.000000
50%       596.000000
75%      1019.000000
max     86459.000000

In [40]:
# Videos that have extracted features, excluding category 20 (Gaming);
# fetch each video's ID together with its ISO 8601 duration string.
with db._session_scope(False) as session:
    feature_stmt = session.query(VideoFeatures.videoID, Video.duration).filter(
        and_(VideoFeatures.videoID == Video.id, Video.category != 20)).statement
    df_feature_videos = pa.read_sql(feature_stmt, db.engine)

In [41]:
df_feature_videos.drop_duplicates(inplace=True)

In [42]:
# Duration statistics restricted to the feature-extracted (non-gaming) videos.
durations_features = [isodate.parse_duration(dt).total_seconds()
                      for dt in df_feature_videos['duration']]

df_durations_features = pa.DataFrame(durations_features)
# Scalar totals; second line is hours (old label 'std' -- German "Stunden" --
# read as standard deviation and was misleading).
print('Sum: {0} seconds'.format(df_durations_features[0].sum()))
print('Sum: {0} hours'.format(df_durations_features[0].sum() / 3600))
print(df_durations_features.describe())


Sum: 0    75588164.0
dtype: float64 seconds
Sum: 0    20996.712222
dtype: float64 std
                   0
count  112093.000000
mean      674.334383
std      1359.598486
min         1.000000
25%       227.000000
50%       413.000000
75%       704.000000
max     86459.000000

In [6]:
# Distribution of video durations: log-binned histogram, saved as a
# single-column figure, followed by a violin plot (shown inline only).
fig = plt.figure()
ax = sns.distplot(df_duration, kde=False, bins=100)
ax.set_xlabel('Duration (seconds)')
ax.set_ylabel('Videos')
# Durations span 1 s .. ~86 ks, so both axes need log-like scaling;
# symlog on y keeps the zero-count bins representable.
ax.set_xscale('log')
ax.set_yscale('symlog')
#ax.legend()
plt.title('Video Durations')

save_plot('video_durations.pdf', fig, s_width, s_height)

# NOTE(review): passing the DataFrame positionally triggers the seaborn
# >=0.6 "violinplot API has changed" warning (see output below) -- still
# renders, but the call should eventually move to the keyword API.
fig = plt.figure()
ax = sns.violinplot(df_duration)
ax.set_xlabel('Duration (seconds)')
ax.set_ylabel('Videos')
#ax.set_xscale('log')
#ax.set_yscale('symlog')
#ax.legend()
plt.title('Video Durations')


/home/mlode/intel/intelpython27/lib/python2.7/site-packages/seaborn/categorical.py:2342: UserWarning: The violinplot API has been changed. Attempting to adjust your arguments for the new API (which might not work). Please update your code. See the version 0.6 release notes for more info.
  warnings.warn(msg, UserWarning)
Out[6]:
<matplotlib.text.Text at 0x7f1c38d99310>

In [7]:
# Video topics distribution
topics = [x for x in csv.reader(open('../../data/'+'topics.txt','r'), delimiter='\t')]


topicIDs = []
topicTitles = {}
for t, tt in topics:
    topicIDs.append(t)
    topicTitles[t]=tt
            
topicIDs.append('/m/None')    
topicTitles['/m/None'] = 'None'

topicIDs.append('/m/NaT')    
topicTitles['/m/NaT'] = 'Unknown ID'

topiclist = []
for ct in df_videos['topicIds']:
    if len(json.loads(ct))==0:
        topiclist.append('/m/None')
    for t in json.loads(ct):
        if t in topicIDs: # Filter not supported topics (as of 2017, Freebase deprecated)
            topiclist.append(t)
        else:
            topiclist.append('/m/NaT')

df_topics = pa.DataFrame({ 'Topic' : [topicTitles[t] for t in topiclist]})

fig = plt.figure()
ax = df_topics['Topic'].value_counts().sort_values(ascending=True).plot(kind='barh')
ax.set_xlabel('Videos')
ax.set_ylabel('Topic')
ax.set_xscale('symlog', linthreshx=10)
#ax.set_yscale('log')
#ax.legend()
plt.title('Video Topics')
fig.tight_layout()
save_plot('video_topics.pdf', fig, x_width, 1.4*x_height)



In [8]:
# Totals: all videos vs. videos with at least one (possibly unknown) topic.
# Parenthesized print works under both Python 2 and 3.
print(len(df_videos))

print(len(df_topics[df_topics.Topic != 'None']))
df_topics['Topic'].value_counts()


250420
79203
Out[8]:
None                       191814
Unknown ID                  78624
Football                      119
Basketball                     76
Professional wrestling         70
Mixed martial arts             67
Fashion                        45
American football              28
Food                           23
Boxing                         19
Golf                           15
Fitness                        14
Role-playing video game        13
Children's music               12
Music                          10
Pets                           10
Animated cartoon                8
Movies                          7
Ice hockey                      6
Hip hop music                   5
Motorsport                      5
Independent music               5
Sports                          4
Racing video game               3
Vehicles                        3
Humor                           3
Strategy video game             2
Cricket                         1
Entertainment                   1
Music of Latin America          1
Sports game                     1
Baseball                        1
Tennis                          1
Volleyball                      1
Name: Topic, dtype: int64

In [9]:
#categorys
categories = [x for x in csv.reader(open('../../data/'+'categories.txt','r'), delimiter='\t')]

catIDs = []
catTitles = {}
for t, tt in categories:
    #print t, tt
    catIDs.append(int(t))
    catTitles[int(t)]=tt
    
categorylist = []
for vt in df_videos['category']:
    if int(vt) in catIDs: # Filter not supported
        categorylist.append(int(vt))
        
df_cats = pa.DataFrame({ 'Category' : [catTitles[t] for t in categorylist]})
fig = plt.figure()
ax = df_cats['Category'].value_counts().sort_values(ascending=True).plot(kind='barh')
ax.set_xlabel('Videos')
ax.set_ylabel('Category')
ax.set_xscale('log')
#ax.set_yscale('log')
#ax.legend()
plt.title('Video Categories')
#fig.tight_layout()
save_plot('video_Categories.pdf', fig, x_width, x_height)

print len(df_videos)

df_cats['Category'].value_counts()


250420
Out[9]:
Gaming                    109985
Entertainment              51698
People & Blogs             21904
Music                      12071
How-to & Style              9831
Comedy                      8801
Film & Animation            8420
Science & Technology        8192
Education                   7047
Sports                      4946
News & Politics             3724
Cars & Vehicles             2298
Travel & Events              705
Pets & Animals               538
Non-profits & Activism       254
Name: Category, dtype: int64

In [10]:
channel_groups = df_videos.groupby(by='channelID')

In [11]:
# Videos per channel, most prolific first.
counts = channel_groups['id'].count().sort_values(ascending=False)
print(len(channel_groups))  # number of distinct channels
# (a mid-cell `counts.head()` was removed: its result was discarded,
# only the final expression of a cell is displayed)
counts.describe()


5891
Out[11]:
count    5891.000000
mean       42.508912
std        69.225110
min         1.000000
25%         7.000000
50%        20.000000
75%        50.000000
max      1052.000000
Name: id, dtype: float64

In [12]:
# Derive upload-time features from dateAdded.
df_videos['day_added'] = df_videos['dateAdded'].apply(lambda x: x.weekday())  # Mon=0 .. Sun=6
df_videos['time_added'] = df_videos['dateAdded'].apply(lambda x: x.time())
df_videos['min_added'] = df_videos['dateAdded'].apply(lambda x: x.minute)
# .hour gives the same value as int(x.strftime('%H')) without the
# string round-trip.
df_videos['hr_added'] = df_videos['dateAdded'].apply(lambda x: x.hour)
df_videos['dayn'] = df_videos['dateAdded'].apply(lambda x: x.strftime("%A"))
df_videos.head()


Out[12]:
id channelID title description category dateAdded tags topicIds attribution duration crawlTimestamp deleted day_added time_added min_added hr_added dayn
0 __00T6BTcLk UCVD-u8fo_n5EnUkKNrrjEGw CS:GO - Clutch or Kick! #82 Submit a Demo for an upcoming Episode: http://... 20 2017-03-01 19:26:17 ["clayman90","clayman","rechyyy","rech","cok",... [] None PT3M18S 2017-03-02 00:13:34 None 2 19:26:17 26 19 Wednesday
1 __3DpIJilQI UCeE3lj6pLX_gCd0Yvns517Q Ozzy Man Reviews: Cricket Nut Shots Me commentary on some of the most eye watering... 24 2017-01-10 07:00:30 ["ozzy man reviews","ozzy","man","ozzie","ozzy... [] None PT1M54S 2017-01-11 00:09:08 True 1 07:00:30 0 7 Tuesday
2 __3e1lrV5c8 UCajyfb5EUPd3f3deUk0dBEQ NEUE DM ZAHNBÜRSTEN PINSEL im LIVE TEST | Mami... Neu bei dm gibt es die gehypten Zahnbürsten Pi... 26 2017-02-19 11:30:00 ["mamiseelen","dm zahnb\u00fcrsten pinsel","li... ["/m/0yxzwny","/m/012xff"] None PT7M41S 2017-02-20 00:10:55 None 6 11:30:00 30 11 Sunday
3 __3Pz5Hmm4c UCw6Ou-fRcPa23GvYq-VHEbA IS LONZO BALL'S $495 SIGNATURE SHOE THE ZO2 WO... ENJOY AND LEAVE A LIKE!\nPodcast Link - https:... 26 2017-05-06 04:12:59 ["lonzo","ball","zo2","lonzo ball sneaker","bi... [] None PT11M16S 2017-05-07 00:12:42 None 5 04:12:59 12 4 Saturday
4 __40p5sII08 UCOt4EtXTO6dDFSfcpAoWJIQ ЭДВАРД РУКИ НОЖНИЦЫ. FTF 2017. Гомель Фрагмент трансляции с Фри Тайм Фест. Гомель 2017 20 2017-04-03 10:00:11 ["Anime","\u0424\u0422\u0424","FTF","2017","\u... [] None PT4M52S 2017-04-04 00:13:14 None 0 10:00:11 0 10 Monday

In [13]:
# datetime.weekday(): Monday == 0 ... Sunday == 6.
# BUG FIX: the mapping previously started at Sunday (0->'Sun'), as
# strftime('%w') would, so every axis label was shifted by one day.
# The data itself confirms weekday() semantics (e.g. 2017-03-01, a
# Wednesday, has day_added == 2 and dayn == 'Wednesday').
weekdays_dic = {-1.0: '', 0.0: 'Mon', 1.0: 'Tue', 2.0: 'Wed', 3.0: 'Thu',
                4.0: 'Fri', 5.0: 'Sat', 6.0: 'Sun', 7.0: ''}

counts = df_videos['day_added'].value_counts()
print(counts.index)

fig = plt.figure()

ax = sns.barplot(x=counts.index, y=counts.values, palette=sns.color_palette("Blues_d"))
ax.set_xlabel('Weekday')
ax.set_ylabel('Videos')
ax.set_title('Videos per Weekday')

# Force numeric tick labels first, then translate them to day names.
ax.set_xticklabels(ax.get_xticks())
labels = [item.get_text() for item in ax.get_xticklabels()]
print(labels)
ax.set_xticklabels([weekdays_dic[float(i)] for i in labels])

save_plot('video_uploads_weekdays.pdf', fig, s_width, s_height)


Int64Index([1, 4, 3, 2, 0, 5, 6], dtype='int64')
[u'0', u'1', u'2', u'3', u'4', u'5', u'6']

In [14]:
#  plotting upload minutes to hours and day to hour


# scatter plot the points to see the dist? heatplot?
#g = sns.lmplot(x="hr_added", y="min_added", hue='day_added', data=group, fit_reg=False)
from scipy.stats import gaussian_kde

# Upload-time coordinates per video.
x = df_videos['hr_added']
y = df_videos['min_added']
z = df_videos['day_added']

# Density of uploads over hour-of-day vs. minute.
f, ax = plt.subplots()
ax.set_aspect("auto")

ax = sns.kdeplot(x, y, cmap="Blues", shade=True, shade_lowest=False)
ax.set_xlabel('Hour')
ax.set_ylabel('Minute')
plt.title('Video Upload Times')
save_plot('video_uploads_time_map.pdf', f, x_width, x_height)

# datetime.weekday(): Monday == 0 ... Sunday == 6.
# BUG FIX: labels previously started at Sunday, shifting every day by one.
weekdays_dic = {-1.0: '', 0.0: 'Mon', 1.0: 'Tue', 2.0: 'Wed', 3.0: 'Thu',
                4.0: 'Fri', 5.0: 'Sat', 6.0: 'Sun', 7.0: ''}

# Density of uploads over hour-of-day vs. weekday.
f, ax = plt.subplots()
ax.set_aspect("auto")

ax = sns.kdeplot(x, z, cmap="Blues", shade=True, shade_lowest=False)
ax.set_xlabel('Hour')
ax.set_ylabel('Day')
ax.set_yticklabels(ax.get_yticks())
labels = [item.get_text() for item in ax.get_yticklabels()]
print(labels)
ax.set_yticklabels([weekdays_dic[float(i)] for i in labels])
plt.title('Video Upload Times')
save_plot('video_uploads_day_map.pdf', f, x_width, x_height)


[u'-1.0', u'0.0', u'1.0', u'2.0', u'3.0', u'4.0', u'5.0', u'6.0', u'7.0']

In [15]:
# same but small


# scatter plot the points to see the dist? heatplot?
#g = sns.lmplot(x="hr_added", y="min_added", hue='day_added', data=group, fit_reg=False)
# Single-column (small) versions of the upload-time density maps.
from scipy.stats import gaussian_kde

x = df_videos['hr_added']
y = df_videos['min_added']
z = df_videos['day_added']

# Density of uploads over hour-of-day vs. minute.
f, ax = plt.subplots()
ax.set_aspect("auto")

ax = sns.kdeplot(x, y, cmap="Blues", shade=True, shade_lowest=False)
ax.set_xlabel('Hour')
ax.set_ylabel('Minute')
plt.title('Video Upload Times')
save_plot('video_uploads_time_map_small.pdf', f, s_width, s_height)

# datetime.weekday(): Monday == 0 ... Sunday == 6.
# BUG FIX: labels previously started at Sunday, shifting every day by one.
weekdays_dic = {-1.0: '', 0.0: 'Mon', 1.0: 'Tue', 2.0: 'Wed', 3.0: 'Thu',
                4.0: 'Fri', 5.0: 'Sat', 6.0: 'Sun', 7.0: ''}

# Density of uploads over hour-of-day vs. weekday.
f, ax = plt.subplots()
ax.set_aspect("auto")

ax = sns.kdeplot(x, z, cmap="Blues", shade=True, shade_lowest=False)
ax.set_xlabel('Hour')
ax.set_ylabel('Day')
ax.set_yticklabels(ax.get_yticks())
labels = [item.get_text() for item in ax.get_yticklabels()]
print(labels)
ax.set_yticklabels([weekdays_dic[float(i)] for i in labels])
plt.title('Video Upload Times')
save_plot('video_uploads_day_map_small.pdf', f, s_width, s_height)


[u'-1.0', u'0.0', u'1.0', u'2.0', u'3.0', u'4.0', u'5.0', u'6.0', u'7.0']

In [16]:
# frame extraction rate plot

# Frame-extraction budget: a fixed floor of frames plus one frame per
# six seconds of footage, capped at a hard maximum.
min_frames = 600
max_frames = 8000

def index_spread(frames, duration):
    """Return (duration, number of frames to extract) for one video.

    Parameters:
        frames   -- total frame count of the video; accepted for
                    interface compatibility but not used by the formula.
        duration -- video length in seconds.
    """
    budget = int(((1.0/6.0) * duration) + min_frames)
    return (duration, min(budget, max_frames))

# Sample the extraction function from 0 to 63200*24 frames (at 24 fps,
# in 100-frame steps) and plot selected frames against duration in hours.
samples = [index_spread(i, int(i/24)) for i in range(0, 63200*24, 100)]

rate_df = pa.DataFrame(samples, columns=['time_s', 'sel_frames'])
rate_df['time_h'] = rate_df['time_s'].apply(lambda x: float(x)/60.0/60.0)
axis = rate_df[['time_h', 'sel_frames']].plot.line(x='time_h', y='sel_frames')
axis.set_ylim([0, 10000])
axis.set_xlabel('Duration (hour)')
axis.set_ylabel('Frames')
plt.legend(['f(n, r)'])
save_plot('video_extraction_rate.pdf', axis.get_figure(), s_width, s_height)

# Spot-check the rate at a few representative durations (seconds).
d = [0, 60, 600, 1800, 3600, 7200, 43200, 86400]

dl = [index_spread(int(i*24), i) for i in d]
dl


Out[16]:
[(0, 600),
 (60, 610),
 (600, 700),
 (1800, 900),
 (3600, 1200),
 (7200, 1800),
 (43200, 7800),
 (86400, 8000)]

In [ ]:


In [ ]: