Author: Atique Ur Rehman
I ended up with two datasets from two different spiders
I have conducted three different analyses purely as a proof of concept.
Note: I do not claim any of this data to be mine; it was scraped for academic purposes only. The Ytpak website has a robots.txt file which, as of 5 March 2017, reads:
User-agent: *
Allow: /
Sitemap: https://www.ytpak.com/sitemap_index.php
This means the website allows scraping of all of its content. The code is released under the MIT license; a copy of the license can be found in the root folder.
In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly
import networkx as nx
import matplotlib.dates
import ast
import re
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from datetime import datetime
from plotly.graph_objs import *
from collections import defaultdict
%matplotlib inline
plt.style.use('ggplot')
In [32]:
# Load the scraped results CSV into a DataFrame; later cells filter its rows by the 'type' column.
frame = pd.read_csv("./results.csv")
In [33]:
# Show summary statistics for the loaded frame as the cell output.
frame.describe()
Out[33]:
In [34]:
# Report how many rows are comments and how many are video metadata records.
# A parenthesised single argument prints the same text on Python 2 and
# Python 3, whereas the bare `print` statement only parses on Python 2.
print("Total Comments {}".format(np.sum(frame['type'] == 'comments')))
print("Total meta data items {}".format(np.sum(frame['type'] == 'meta')))
In [35]:
# Each 'data' cell holds a Python literal serialised as text; parse it back
# into the corresponding Python object.
frame['data'] = frame['data'].apply(ast.literal_eval)
In [36]:
# Peek at the first metadata record to see which fields the spider captured.
first_meta_item = frame[frame['type'] == 'meta'].iloc[0]
meta_data = first_meta_item['data']
# Parenthesised form keeps this cell valid on both Python 2 and Python 3.
print(meta_data.keys())
In [37]:
def get_clean_likes(st):
    """Return the like count of a metadata record as ``np.int64``.

    ``st`` is a mapping whose ``"likes"`` entry is a string; comma
    separators and surrounding whitespace are removed before conversion.
    """
    digits = st["likes"].replace(",", "").strip()
    return np.int64(digits)
def get_clean_views(st):
    """Return the view count of a metadata record as ``np.int64``.

    ``st`` is a mapping whose ``"views"`` entry is a string; commas are
    removed and only the text before the first space is converted.
    """
    without_commas = st["views"].replace(",", "")
    # Keep only the leading token, i.e. everything before the first space.
    leading_token = without_commas.partition(" ")[0]
    return np.int64(leading_token)
# Restrict to metadata rows and pull the parsed record out of each one.
meta_items = frame[frame['type'] == 'meta']
data = meta_items['data']

# Clean the textual counts and keep them as plain NumPy arrays for plotting.
likes = np.array(data.apply(get_clean_likes))
views = np.array(data.apply(get_clean_views))
In [38]:
# Scatter likes against views and overlay a first-degree least-squares fit.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(likes, views)
ax.set(title="Likes VS Views", xlabel="No. of Likes", ylabel="No. of Views")

# np.polyfit with degree 1 returns (slope, intercept).
a, b = np.polyfit(likes, views, 1)
ax.plot(likes, a * likes + b, label="Expected curve")
ax.legend()
Out[38]:
In [39]:
def get_clean_dates(st):
    """Return the publication date of a metadata record as a ``datetime``.

    ``st`` is a mapping whose ``"date"`` entry contains text of the form
    ``"Published on <day> <abbreviated month> <year> |..."``. The part
    between ``"Published on "`` and the last ``" |"`` is parsed with the
    ``'%d %b %Y'`` format.

    Raises:
        AttributeError: if the text does not match the expected pattern
            (``re.search`` returns ``None``).
        ValueError: if the captured text is not a ``'%d %b %Y'`` date.
    """
    val = st["date"]
    # Raw string: `\|` is a regex escape for a literal pipe, not a Python
    # string escape; the non-raw spelling triggers invalid-escape warnings.
    date = re.search(r"Published on (.*) \|", val).group(1)
    date = datetime.strptime(date, '%d %b %Y')
    return date
def is_kapils_video(st):
    """Return True when "kapil" occurs in the record's title or description.

    The match is case-insensitive. ``st`` must provide both ``'title'``
    and ``'description'`` entries.
    """
    needle = "kapil"
    # Fetch both fields up front so a missing key fails regardless of
    # which field would have matched.
    title_text, description_text = st['title'], st['description']
    if needle in title_text.lower():
        return True
    return needle in description_text.lower()
# Start again from the metadata records.
meta_items = frame[frame['type'] == 'meta']
data = meta_items['data']

# Keep only the records whose title or description mentions "kapil".
related_data_mask = data.apply(is_kapils_video)
filterted_data = data[related_data_mask]

# Likes and views become NumPy arrays; dates stay a pandas Series.
likes = np.array(filterted_data.apply(get_clean_likes))
views = np.array(filterted_data.apply(get_clean_views))
dates = filterted_data.apply(get_clean_dates)
In [40]:
# Two side-by-side panels: likes over time (left) and views over time (right).
fig, ax = plt.subplots(1, 2, figsize=(20, 10))
likes_axis, views_axis = ax

likes_axis.plot_date(dates, likes)
likes_axis.set_title("Date VS Likes")
likes_axis.set_xlabel("Date")
likes_axis.set_ylabel("Likes")

views_axis.plot_date(dates, views)
views_axis.set_title("Date VS Views")
views_axis.set_xlabel("Date")
views_axis.set_ylabel("Views")
Out[40]:
In [41]:
# Load the connections CSV; later cells read its "id", "refrer" and 'title' columns.
connection_frame = pd.read_csv("./connections.csv")
In [42]:
# Show summary statistics for the connections frame as the cell output.
connection_frame.describe()
Out[42]:
In [43]:
# Column views over the connections table.
ids = connection_frame["id"]
refrer_ids = connection_frame["refrer"]
titles = connection_frame['title']

# The id in the first row is used as the starting video.
start = str(ids[0])

# Map every referrer id to the list of ids recorded under it, skipping the
# first row.
sugessted = defaultdict(list)
for child_id, parent_id in zip(ids[1:], refrer_ids[1:]):
    sugessted[parent_id].append(child_id)
def distance_count(root, sugessted_tree, hop_count=None, current_hop_count=1):
    """Record how many hops each suggested video is from ``root``.

    Walks ``sugessted_tree`` depth-first starting at ``root``. Every id
    listed under a node is assigned the current hop count, then its own
    suggestions are visited with the count increased by one.

    Args:
        root: key whose suggestions are visited first.
        sugessted_tree: mapping from an id to the list of ids suggested
            from it. Ids without an entry are treated as leaves, and the
            mapping is not modified.
        hop_count: mapping to fill in. A fresh ``defaultdict(int)`` is
            created when omitted, so separate calls never share state.
        current_hop_count: hop count assigned to ``root``'s direct
            suggestions.

    Returns:
        ``hop_count``, mapping each reached id to its hop count. ``root``
        itself is not added.

    Note:
        Visited ids are not tracked: an id reached more than once keeps
        the hop count of its last visit, and a cyclic mapping recurses
        without bound.
    """
    if hop_count is None:
        hop_count = defaultdict(int)
    # .get() avoids inserting empty entries into a defaultdict input and
    # lets plain dicts without leaf entries work too.
    for s in sugessted_tree.get(root, ()):
        hop_count[s] = current_hop_count
        distance_count(s, sugessted_tree, hop_count, current_hop_count + 1)
    return hop_count
# Compute the hop count of every video reached from the starting video.
hop_count = distance_count(start,sugessted)
# distance_count does not record the root itself, so mark it as zero hops.
hop_count[start] = 0
In [44]:
# Empty undirected networkx graph; nodes and edges are added in the next cell.
G=nx.Graph()
In [45]:
# One node per video id.
Nodes = ids
G.add_nodes_from(Nodes)

# One edge per row, joining the video id to its referrer id.
Edges = list(zip(ids, refrer_ids))
G.add_edges_from(Edges)
In [46]:
# Draw the graph with matplotlib on a 20x20 figure, without node labels.
plt.figure(figsize=(20,20))
plt.title("Raw graph")
nx.draw(G, node_color='c',edge_color='k', with_labels=False)
These are some utility functions that use the plotly library to generate nodes and edges with colors, titles and other information. These functions can be skipped for now if you just want a general idea of what is happening.
In [47]:
def scatter_nodes(pos, keys, hop_count, labels, color=None, size=10, opacity=1):
    """Build the plotly marker trace for the graph nodes.

    Args:
        pos: dict of node positions; ``pos[k][0]`` and ``pos[k][1]`` are
            used as the x and y coordinates of node ``k``.
        keys: iterable of node keys, in the order the markers are emitted.
        hop_count: mapping from node key to the value used to colour the
            marker.
        labels: list of labels displayed when hovering over the nodes.
        color: accepted but not read by this function.
        size: size of the dots representing the nodes.
        opacity: accepted but not read by this function; the trace
            opacity is fixed at 0.9.

    Returns:
        A plain ``dict`` holding the trace attributes.
    """
    node_trace = Scatter(
        x=[],
        y=[],
        mode='markers',
        marker=Marker(
            size=[],
            colorscale='Hot',
            reversescale=False,
            color=[],
            colorbar=dict(
                thickness=15,
                title='Hop count from searched video',
                xanchor='left',
                titleside='right')))

    # One marker per key: position from the layout, colour from the hop count.
    for node in keys:
        x_coord, y_coord = pos[node][0], pos[node][1]
        node_trace['x'].append(x_coord)
        node_trace['y'].append(y_coord)
        node_trace['marker']['color'].append(hop_count[node])

    # Hover attributes are merged in, turning the trace into a plain dict.
    hover_attributes = dict(name='', text=labels, hoverinfo='text', opacity=0.9)
    node_trace = dict(node_trace, **hover_attributes)
    node_trace['marker']['size'] = size
    return node_trace
def scatter_edges(G, pos, line_color=None, line_width=1):
    """Build the plotly line trace for the graph edges.

    Args:
        G: graph whose ``edges()`` are drawn.
        pos: dict of node positions; index 0 is x and index 1 is y.
        line_color: line colour; when None the Plotly default is used.
        line_width: width of the edge lines.

    Returns:
        The ``Scatter`` trace. Each edge contributes its two endpoints
        followed by a ``None`` entry on both the x and y lists.
    """
    edge_trace = Scatter(x=[], y=[], mode='lines')
    for source, target in G.edges():
        start_point, end_point = pos[source], pos[target]
        edge_trace['x'] += [start_point[0], end_point[0], None]
        edge_trace['y'] += [start_point[1], end_point[1], None]

    edge_trace['hoverinfo'] = 'none'
    edge_trace['line']['width'] = line_width
    if line_color is not None:
        edge_trace['line']['color'] = line_color
    return edge_trace
In [48]:
# Compute a Fruchterman-Reingold layout position for every node of G.
pos=nx.fruchterman_reingold_layout(G)
# One hover label per row: the row's title plus the hop count recorded for its id.
labels = [ "Title : " + titles[i] + "<br> Hop count : " + str(hop_count[k]) for i, k in enumerate(ids) ]
# trace1 holds the edge lines, trace2 the node markers.
trace1=scatter_edges(G, pos)
trace2=scatter_nodes(pos, ids, hop_count, labels=labels)
In [49]:
# Width and height passed to the plotly layout below.
width=1000
height=1000
# Axis settings shared by the x and y axes.
axis=dict(showline=False, # hide axis line, grid, ticklabels and title
zeroline=False,
showgrid=False,
showticklabels=False,
title=''
)
# Layout for the suggestion-graph figure: fixed size, no legend, hidden axes.
layout=Layout(title= 'YTPAK videos suggession graph', #
font= Font(),
showlegend=False,
autosize=False,
width=width,
height=height,
xaxis=XAxis(axis),
yaxis=YAxis(axis),
margin=Margin(
l=40,
r=40,
b=85,
t=100,
pad=0,
),
hovermode='closest',
# plot_bgcolor='#EFECEA', #set background color
)
# Combine the edge trace and the node trace into a single figure.
data=Data([trace1, trace2])
fig = Figure(data=data, layout=layout)
In [52]:
# Initialise plotly's notebook mode, then display the figure.
# NOTE(review): init_notebook_mode comes from plotly.offline while `py` is plotly.plotly — confirm which rendering path is intended.
init_notebook_mode(connected=False)
# NOTE(review): this opens a new matplotlib figure that nothing in this cell draws on — confirm it is needed.
plt.figure(figsize=(10,10))
py.iplot(fig, filename='YtpakSugessions')
Out[52]:
In [54]:
# Display a saved PNG from disk as the cell output.
from IPython.display import Image
Image(filename='./YTPAKSugessions.png')
Out[54]:
In [ ]: