In [38]:
#!/usr/bin/env python
# coding=utf-8
# Notebook setup: imports, matplotlib/seaborn defaults, and the figure-size
# constants shared by every plotting cell below.
import pandas as pa
import numpy as np
import json
import os
import networkx as nx
import pygraphviz as gz
from networkx.drawing.nx_pydot import write_dot
import math
from scipy.stats import norm, normaltest, mannwhitneyu, ranksums
import matplotlib
import matplotlib.pyplot as plt
%matplotlib notebook
import itertools
import csv
from sqlalchemy import exists, func, and_
from database import *
from matplotlib import pylab, pyplot
from matplotlib import dates
import seaborn as sns
sns.set(color_codes=True)
from scipy import stats, integrate
from datetime import datetime, timedelta, date
# Timestamp formats: ISO-8601 with milliseconds (YouTube API) and plain
# DB-style "YYYY-MM-DD HH:MM:SS".
date_format = '%Y-%m-%dT%H:%M:%S.%fZ'
date_format2 = '%Y-%m-%d %H:%M:%S'
plt.style.use(['seaborn-paper'])
sns.set_style("whitegrid")
#plt.rc('font', family='serif', serif='Charter')
plt.rc('font', family='serif', serif='DejaVu Serif')
# Font-size presets for paper figures (points).
SMALL_SIZE = 8
MEDIUM_SIZE = 9
BIGGER_SIZE = 13
plt.rc('font', size=MEDIUM_SIZE) # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE) # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE) # fontsize of the tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE) # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE) # legend fontsize
plt.rc('figure', titlesize=MEDIUM_SIZE) # fontsize of the figure title
# Figure sizes in inches: full-width (x_) and single-column (s_),
# both with a golden-ratio aspect.
x_width = 6.8898
x_height = x_width / 1.618
s_width = 3.4449
s_height = s_width / 1.618
def save_plot(name, fig, width, height):
    """Resize `fig` to (width, height) inches and write it to CDIR/name.

    Uses bbox_inches="tight" so labels outside the axes are not clipped.
    Relies on the module-level CDIR (chart output directory) defined in the
    configuration cell.
    """
    fig.set_size_inches(width, height)
    fig.savefig(CDIR + '/' + name, bbox_inches="tight")
In [39]:
# Data directory for the 3-month evaluation window and its chart subfolder.
DIR = '../../data/data_evaluation_3MONTHS'
CDIR = '../../data/data_evaluation_3MONTHS/charts'
# Project database wrapper (from `database import *` above).
db = YTDatabase()
In [3]:
# get video infos
# Load the full Video table into a DataFrame (read-only session).
with db._session_scope(False) as session:
df_videos = pa.read_sql(session.query(Video).statement, db.engine)
In [4]:
# Parse the raw timestamp strings into datetime objects:
# dateAdded uses the API's ISO format, crawlTimestamp the DB format.
for col, fmt in (('dateAdded', date_format), ('crawlTimestamp', date_format2)):
    df_videos[col] = df_videos[col].apply(lambda raw, f=fmt: datetime.strptime(raw, f))
df_videos.head()
Out[4]:
In [5]:
# duration calculation
import isodate
durations = []
for dt in df_videos['duration']:
dur = isodate.parse_duration(dt)
durations.append(dur.total_seconds())
df_duration = pa.DataFrame(durations)
print 'Sum:', df_duration.sum(),'seconds'
print 'Sum:', df_duration.sum()/3600,'std'
print df_duration.describe()
In [40]:
# Videos that have extracted features, joined with their duration;
# category 20 (Gaming) is excluded from this set.
with db._session_scope(False) as session:
df_feature_videos = pa.read_sql(session.query(VideoFeatures.videoID, Video.duration).filter(and_(VideoFeatures.videoID==Video.id, Video.category != 20)).statement, db.engine)
In [41]:
# The join can yield one row per feature record; keep each video once.
df_feature_videos.drop_duplicates(inplace=True)
In [42]:
durations_features = []
for dt in df_feature_videos['duration']:
dur = isodate.parse_duration(dt)
durations_features.append(dur.total_seconds())
df_durations_features = pa.DataFrame(durations_features)
print 'Sum:', df_durations_features.sum(),'seconds'
print 'Sum:', df_durations_features.sum()/3600,'std'
print df_durations_features.describe()
In [6]:
# Histogram of video durations, log x / symlog y to show the long tail.
fig = plt.figure()
ax = sns.distplot(df_duration, kde=False, bins=100)
ax.set_xlabel('Duration (seconds)')
ax.set_ylabel('Videos')
ax.set_xscale('log')
ax.set_yscale('symlog')
#ax.legend()
plt.title('Video Durations')
save_plot('video_durations.pdf', fig, s_width, s_height)
# Violin plot of the same distribution (linear scale, not saved to disk).
fig = plt.figure()
ax = sns.violinplot(df_duration)
ax.set_xlabel('Duration (seconds)')
ax.set_ylabel('Videos')
#ax.set_xscale('log')
#ax.set_yscale('symlog')
#ax.legend()
plt.title('Video Durations')
Out[6]:
In [7]:
# Video topics distribution
# FIX: open the topic list via a context manager so the file handle is closed
# deterministically (the original open() call leaked the handle).
with open('../../data/'+'topics.txt', 'r') as topics_file:
    topics = [x for x in csv.reader(topics_file, delimiter='\t')]
topicIDs = []
topicTitles = {}
for t, tt in topics:
    topicIDs.append(t)
    topicTitles[t] = tt
# Sentinel entries: '/m/None' for videos without topics, '/m/NaT' for
# topic IDs not in the list (Freebase was deprecated in 2017).
topicIDs.append('/m/None')
topicTitles['/m/None'] = 'None'
topicIDs.append('/m/NaT')
topicTitles['/m/NaT'] = 'Unknown ID'
topiclist = []
for ct in df_videos['topicIds']:
    if len(json.loads(ct)) == 0:
        topiclist.append('/m/None')
    for t in json.loads(ct):
        if t in topicIDs:  # Filter not supported topics (as of 2017, Freebase deprecated)
            topiclist.append(t)
        else:
            topiclist.append('/m/NaT')
df_topics = pa.DataFrame({'Topic': [topicTitles[t] for t in topiclist]})
# Horizontal bar chart, symlog x-axis so rare topics stay visible.
fig = plt.figure()
ax = df_topics['Topic'].value_counts().sort_values(ascending=True).plot(kind='barh')
ax.set_xlabel('Videos')
ax.set_ylabel('Topic')
ax.set_xscale('symlog', linthreshx=10)
plt.title('Video Topics')
fig.tight_layout()
save_plot('video_topics.pdf', fig, x_width, 1.4*x_height)
In [8]:
# Totals: all videos vs. topic assignments excluding the 'None' sentinel.
print len(df_videos)
print len(df_topics[df_topics.Topic!='None'])
df_topics['Topic'].value_counts()
Out[8]:
In [9]:
#categorys
categories = [x for x in csv.reader(open('../../data/'+'categories.txt','r'), delimiter='\t')]
catIDs = []
catTitles = {}
for t, tt in categories:
#print t, tt
catIDs.append(int(t))
catTitles[int(t)]=tt
categorylist = []
for vt in df_videos['category']:
if int(vt) in catIDs: # Filter not supported
categorylist.append(int(vt))
df_cats = pa.DataFrame({ 'Category' : [catTitles[t] for t in categorylist]})
fig = plt.figure()
ax = df_cats['Category'].value_counts().sort_values(ascending=True).plot(kind='barh')
ax.set_xlabel('Videos')
ax.set_ylabel('Category')
ax.set_xscale('log')
#ax.set_yscale('log')
#ax.legend()
plt.title('Video Categories')
#fig.tight_layout()
save_plot('video_Categories.pdf', fig, x_width, x_height)
print len(df_videos)
df_cats['Category'].value_counts()
Out[9]:
In [10]:
# Group videos by their uploading channel.
channel_groups = df_videos.groupby(by='channelID')
In [11]:
# Videos per channel, most prolific channels first.
counts = channel_groups['id'].count().sort_values(ascending=False)
print len(channel_groups)
# NOTE(review): counts.head() is evaluated but discarded — only the
# describe() below is displayed as the cell output.
counts.head()
counts.describe()
Out[11]:
In [12]:
# Derive upload-time features from the parsed dateAdded timestamps.
df_videos['day_added'] = df_videos['dateAdded'].apply(lambda ts: ts.weekday())   # Mon=0 .. Sun=6
df_videos['time_added'] = df_videos['dateAdded'].apply(lambda ts: ts.time())
df_videos['min_added'] = df_videos['dateAdded'].apply(lambda ts: ts.minute)
df_videos['hr_added'] = df_videos['dateAdded'].apply(lambda ts: ts.hour)  # equivalent to int(strftime('%H'))
df_videos['dayn'] = df_videos['dateAdded'].apply(lambda ts: ts.strftime("%A"))
df_videos.head()
Out[12]:
In [13]:
weekdays_dic ={-1.0:'',0.0:'Sun', 1.0:'Mon', 2.0:'Tue',3.0:'Wed',4.0:'Thu',5.0:'Fri',6.0:'Sat',7.0:''}
counts = df_videos['day_added'].value_counts()
print counts.index
fig = plt.figure()
#df_counts = pa.DataFrame(counts, index=[range(len(counts))])
ax = sns.barplot(x=counts.index, y=counts.values, palette=sns.color_palette("Blues_d"))
ax.set_xlabel('Weekday')
ax.set_ylabel('Videos')
ax.set_title('Videos per Weekday')
ax.set_xticklabels(ax.get_xticks())
labels = [item.get_text() for item in ax.get_xticklabels()]
print labels
ax.set_xticklabels([weekdays_dic[float(i)] for i in labels])
save_plot('video_uploads_weekdays.pdf', fig, s_width, s_height)
In [14]:
# plotting upload minutes to hours and day to hour
# scatter plot the points to see the dist? heatplot?
#g = sns.lmplot(x="hr_added", y="min_added", hue='day_added', data=group, fit_reg=False)
from scipy.stats import gaussian_kde
# Calculate the point density
x = df_videos['hr_added']
y = df_videos['min_added']
z = df_videos['day_added']
# Set up the figure
f, ax = plt.subplots()
ax.set_aspect("auto")
# Draw the two density plots
ax = sns.kdeplot(x, y, cmap="Blues", shade=True, shade_lowest=False)
ax.set_xlabel('Hour')
ax.set_ylabel('Minute')
plt.title('Video Upload Times')
save_plot('video_uploads_time_map.pdf', f, x_width, x_height)
weekdays_dic ={-1.0:'',0.0:'Sun', 1.0:'Mon', 2.0:'Tue',3.0:'Wed',4.0:'Thu',5.0:'Fri',6.0:'Sat',7.0:''}
# Set up the figure
f, ax = plt.subplots()
ax.set_aspect("auto")
# Draw the two density plots
ax = sns.kdeplot(x, z, cmap="Blues", shade=True, shade_lowest=False)
ax.set_xlabel('Hour')
ax.set_ylabel('Day')
ax.set_yticklabels(ax.get_yticks())
labels = [item.get_text() for item in ax.get_yticklabels()]
print labels
ax.set_yticklabels([weekdays_dic[float(i)] for i in labels])
plt.title('Video Upload Times')
save_plot('video_uploads_day_map.pdf', f, x_width, x_height)
In [15]:
# same but small
# scatter plot the points to see the dist? heatplot?
#g = sns.lmplot(x="hr_added", y="min_added", hue='day_added', data=group, fit_reg=False)
from scipy.stats import gaussian_kde
# Calculate the point density
x = df_videos['hr_added']
y = df_videos['min_added']
z = df_videos['day_added']
# Set up the figure
f, ax = plt.subplots()
ax.set_aspect("auto")
# Draw the two density plots
ax = sns.kdeplot(x, y, cmap="Blues", shade=True, shade_lowest=False)
ax.set_xlabel('Hour')
ax.set_ylabel('Minute')
plt.title('Video Upload Times')
save_plot('video_uploads_time_map_small.pdf', f, s_width, s_height)
weekdays_dic ={-1.0:'',0.0:'Sun', 1.0:'Mon', 2.0:'Tue',3.0:'Wed',4.0:'Thu',5.0:'Fri',6.0:'Sat',7.0:''}
# Set up the figure
f, ax = plt.subplots()
ax.set_aspect("auto")
# Draw the two density plots
ax = sns.kdeplot(x, z, cmap="Blues", shade=True, shade_lowest=False)
ax.set_xlabel('Hour')
ax.set_ylabel('Day')
ax.set_yticklabels(ax.get_yticks())
labels = [item.get_text() for item in ax.get_yticklabels()]
print labels
ax.set_yticklabels([weekdays_dic[float(i)] for i in labels])
plt.title('Video Upload Times')
save_plot('video_uploads_day_map_small.pdf', f, s_width, s_height)
In [16]:
# frame extraction rate plot
min_frames = 600   # baseline number of frames sampled from every video
max_frames = 8000  # hard cap on sampled frames per video

def index_spread(frames, duration, min_frames=min_frames, max_frames=max_frames):
    """Return (duration, num_frames) for a video of `duration` seconds.

    num_frames grows at 1/6 frame per second of footage on top of a
    `min_frames` baseline and is capped at `max_frames`.  The caps default to
    the module-level constants but are now parameters so the curve can be
    explored without editing globals (backward compatible).

    `frames` is unused; it is kept for interface compatibility with callers.
    """
    num_frames = int((1.0 / 6.0) * duration + min_frames)
    # Cap with min() instead of the original if/assign — same result.
    return (duration, min(num_frames, max_frames))
# Sample the selection function for durations up to 63200 s of 24 fps footage
# and plot selected-frame count against duration in hours.
samples = [index_spread(n, int(n / 24)) for n in range(0, 63200 * 24, 100)]
dat = pa.DataFrame(samples, columns=['time_s', 'sel_frames'])
dat['time_h'] = dat['time_s'].apply(lambda secs: float(secs) / 60.0 / 60.0)
ax = dat[['time_h', 'sel_frames']].plot.line(x='time_h', y='sel_frames')
ax.set_ylim([0, 10000])
ax.set_xlabel('Duration (hour)')
ax.set_ylabel('Frames')
plt.legend(['f(n, r)'])
save_plot('video_extraction_rate.pdf', ax.get_figure(), s_width, s_height)
# Spot-check a few representative durations (seconds).
d = [0, 60, 600, 1800, 3600, 7200, 43200, 86400]
dl = [index_spread(int(i * 24), i) for i in d]
dl
Out[16]:
In [ ]:
In [ ]: