In [2]:
%matplotlib inline
import json
import math
import facebook
import re
import pytz
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nx
from collections import defaultdict
from matplotlib import rcParams
from collections import Counter
from scipy.sparse import coo_matrix, vstack, hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectPercentile, chi2, f_classif
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
from sklearn import tree
from sklearn import neighbors
import warnings
warnings.filterwarnings('ignore')
import matplotlib.cm as cm
import matplotlib as mpl
# set some nicer defaults for matplotlib
# these colors come from colorbrewer2.org. Each is an RGB triplet
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
(0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
(0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
(0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
(0.4, 0.6509803921568628, 0.11764705882352941),
(0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
(0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
(0.4, 0.4, 0.4)]
rcParams['figure.figsize'] = (17, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = False
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'
rcParams['legend.fancybox'] = True
def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
"""
Minimize chartjunk by stripping out unnecessary plot borders and axis ticks
The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
"""
ax = axes or plt.gca()
ax.spines['top'].set_visible(top)
ax.spines['right'].set_visible(right)
ax.spines['left'].set_visible(left)
ax.spines['bottom'].set_visible(bottom)
# turn off all ticks
ax.yaxis.set_ticks_position('none')
ax.xaxis.set_ticks_position('none')
# now re-enable visibles
if top:
ax.xaxis.tick_top()
if bottom:
ax.xaxis.tick_bottom()
if left:
ax.yaxis.tick_left()
if right:
ax.yaxis.tick_right()
Driven by strong business incentives and the rise of technology-empowered user tracking, recommendation systems have grown ever more sophisticated and elaborate.
However, many of the methods developed for product and content recommendations are ill-suited for live content where options are curated rather than chosen, like cable TV. Unlike on-demand streaming services (such as Netflix or Hulu), where the subscriber can choose which show or movie to watch next, the TV viewer is left with only one option: switching the channel. The set of alternatives available on TV is also dynamic: a show that is airing at this moment will likely be over in an hour, creating an ever-changing, time- and network-dependent patchwork of content.
Those characteristics call for a recommendation system built specifically for cable TV content. Here, we develop "stickiness" — a dynamic, behavior-dependent metric of TV show engagement. Our goal was to provide the building block upon which a dynamic recommender system for TV content could be built.
Our dataset comes from TV streaming company Philo, formerly Tivli (we have permission from the company to use this data for the purposes of this project).
The data set is a log from the CDN serving the video snippets, which the front-end video player requests in 5-second increments. It covers a 4-month period in early 2012 and only includes views from Harvard University students. Users had the option to sign in with Facebook.
In [2]:
# load data in
df = pd.read_csv("data/raw_data_hashed.tsv", sep="\t")
# remove unnecessary column
df = df.drop('match', 1)
# Remove fb_ prefix but keep as string, since facebook ids are often longer than maxint
df.fb_id = df.fb_id.apply(lambda fb_id: fb_id[3:] if (isinstance(fb_id, basestring) and fb_id.lower().startswith("fb_")) else fb_id)
# create new column that combines fb id and session id
nanarray = [0 if type(df.fb_id.values[i]) == type(0.0) else 1 for i in range(len(df.fb_id.values))]
df['combined_id'] = [df.fb_id.values[i] if nanarray[i] == 1 else df.uu_id.values[i] for i in range(len(df.values))]
df.combined_id = df.combined_id.dropna()
# index 35276 is a sole datapoint on Nov 28 (several days before the rest of the data starts)
df = df.drop(35276)
# sort by time - very important! - pandas indexing bug otherwise
df.sort(columns='date', inplace=True)
# remove rows where show name is 'NaN' (caused by server error)
df = df[~pd.isnull(df['show'])].reset_index()
del df['index']
In [3]:
# fetch genders from Facebook
APP_TOKEN = "..."  # some API token (value redacted)
graph = facebook.GraphAPI(APP_TOKEN)
gender_cache = {}
def fetch_gender(fb_id):
if pd.isnull(fb_id):
return fb_id
else:
if fb_id in gender_cache:
# we've already fetched the gender and it's in the cache, return that
return gender_cache[fb_id]
else:
result = np.nan
try:
response = graph.get_object(fb_id)
if 'gender' in response:
# a small minority of users mysteriously don't have genders
result = response['gender']
except facebook.GraphAPIError:
# user doesn't exist anymore
pass
gender_cache[fb_id] = result
return result
df['gender'] = df.fb_id.apply(fetch_gender)
In [26]:
# df.to_csv('data/processed.csv')
df = pd.read_csv('data/processed_hashed.csv')
# parse dates
df.date = pd.to_datetime(df.date, utc=True)
df.date = df.date.apply(lambda date: date.tz_localize('UTC').tz_convert('US/Eastern'))
del df['Unnamed: 0']
In [23]:
'''
import hashlib
salt # some salt we used
def hash_me(x):
if pd.isnull(x):
return x
hash_object = hashlib.md5(str(x)+salt)
digest = hash_object.hexdigest()
return digest
df['session_id'] = df['session_id'].map(hash_me)
df['uu_id'] = df['uu_id'].map(hash_me)
df['fb_id'] = df['fb_id'].map(hash_me)
df['combined_id'] = df['combined_id'].map(hash_me)
df.head(10)
'''
Out[23]:
In this section, we perform EDA to gain an intuitive sense of the characteristics of the data set.
In [27]:
df.tail()
Out[27]:
Let's try to grapple with the dimensions of the data set.
In [4]:
print "Total datapoints: " + str(len(df))
print "Number of shows: " + str(len(df.show.unique()))
print "Number of networks: " + str(len(df.network.unique()))
print "Number of Facebook ids: " + str(len(df.fb_id.unique()))
print "Number of session ids: " + str(len(df.uu_id.unique()))
# Make sure to update pandas and numpy before running, there's a bug in date comparisons in earlier versions
print "Date range: " + str(df.date.min()) + " to " + str(df.date.max()) + " (range: " + str(df.date.max() - df.date.min()) + ")"
genders = df.groupby('fb_id')['gender'].first()
print("Facebook user gender: " + '{:.2%}'.format(len(genders[genders == "male"]) / float(len(genders)))
+ " male (gender data missing for " + '{:.2%}'.format(1 - len(genders.dropna()) / float(len(genders))) + " of FB users)")
This is a lot of data — 130,000 data points spread over 2,400 shows. Splitting 2,400 shows across 27 networks seems a bit high (~100 shows per network over 3 months), but there are a lot of one-time shows. The data otherwise seems reasonable and passes the sniff test.
Most users chose not to log in with Facebook. We can also see that more men than women chose to log in with Facebook, though not by a large margin.
Armed with the data, the first question that comes to mind is what is the most viewed show? Let's answer that.
In [5]:
shows = df.groupby("show").timed.sum() / 60
shows.sort(ascending=False)
shows = shows[:25]
shows.plot(kind='bar', title="Shows ranked by minutes watched", grid=False)
plt.ylabel('Cumulative minutes spent watching')
remove_border()
It's eye-opening to see how popular sports are. Football is particularly striking, especially since the 2011 season ran from September 8 to February 5, which overlaps with the dataset for only about 2 months.
This highlights a surprising property of the data set: we expected TV series to be more popular, given their regular airings and dedicated audiences. The graph shows otherwise: the Grammy Awards — a one-day event — comes in fourth, ahead of many popular TV series (i.e. more man-minutes were spent watching the Grammys than the entire season of The Big Bang Theory). There is a similar dynamic with the Super Bowl.
Also, the TV series that do appear in the top 25 seem to confirm a college audience: The Big Bang Theory, Family Guy, etc.
Let's dive deeper and plot the above graph as a stacked bar graph of genders (only for Facebook users since gender data is only available for those users). In addition, let's look at network popularity.
In [6]:
fb_users = df[~pd.isnull(df['fb_id'])]
nonfb_users = df[~pd.isnull(df['uu_id'])]
for field in ['show', 'network']:
aggregate = fb_users.groupby([field, "gender"]).timed.sum().unstack().fillna(0)
aggregate['sum'] = aggregate['female'] + aggregate['male']
aggregate = aggregate.sort('sum', ascending=False)[:25]
aggregate = aggregate.drop('sum', 1)
aggregate.plot(kind='bar', title=field.capitalize() + " by popularity and gender (Facebook users only)", grid=False, stacked=True)
remove_border()
The first important observation is that the first graph is very similar to the previous one, i.e. Facebook users are broadly similar to other users, though there are some differences. It is difficult to say what dynamics are skewing the Facebook data, but it likely has partly to do with the fact that men are more likely to log in with Facebook than women.
In addition, it is clear that the top shows (mostly sports) are vastly dominated by male viewers. This seems to indicate that women's viewing habits are spread out over many more shows, particularly in the long tail of the distribution. (We found it surprising that the characteristically "nerdy" show The Big Bang Theory had more women viewers.) This lends credence to our ability to classify gender based on the shows watched.
Networks, however, are more evenly distributed across genders, likely because most networks air shows that appeal to broad audiences. Still, some networks (such as ESPN and Bravo) are highly polarized along gender lines.
In [7]:
hours_fb = fb_users.groupby(fb_users.set_index('date').index.hour).date.count()
hours_nonfb = nonfb_users.groupby(nonfb_users.set_index('date').index.hour).date.count()
hours_fb.plot(linewidth=4, label="Facebook users", color='#3B5998') # use official FB color
ax = hours_nonfb.plot(linewidth=4, title="Daily viewership pattern", label="Non-Facebook users", color="#FF3F19")
ax.get_xaxis().set_major_formatter(mpl.ticker.FuncFormatter(lambda x, p: str(int(x)) + ":00"))
plt.xlim(0, 23)
plt.legend(loc='best')
remove_border()
It is clear that primetime hours see by far the most traffic, which matches our intuition about the data. However, it is surprising to see a mostly smooth ramp up to primetime; we expected more distinct peaks, such as for early-morning news. From the graphs above, we can infer that Harvard students do not watch much TV news.
Also, there is a surprising amount of activity late into the night. For example, there are about half as many people watching TV at midnight as at the 8 pm peak. This probably has to do with the college audience.
This graph also confirms the previous observation — Facebook users are very similar to non-Facebook users. This supports our assumption that we will be able to classify genders of non-Facebook users from the labeled Facebook data.
Now that we have established the correspondence between Facebook and non-Facebook viewers, let's break down Facebook users by gender.
In [8]:
men = fb_users[~pd.isnull(fb_users['gender']) & (fb_users['gender'] == 'male')]
women = fb_users[~pd.isnull(fb_users['gender']) & (fb_users['gender'] == 'female')]
hours_men = men.groupby(men.set_index('date').index.hour).date.count()
hours_women = women.groupby(women.set_index('date').index.hour).date.count()
hours_men.plot(linewidth=4, label="Male Facebook users")
ax = hours_women.plot(linewidth=4, title="Daily viewership pattern", label="Female Facebook users")
ax.get_xaxis().set_major_formatter(mpl.ticker.FuncFormatter(lambda x, p: str(int(x)) + ":00"))
plt.xlim(0, 23)
plt.legend(loc='best')
remove_border()
In contrast to the show and network distributions, there is very little difference between the daily viewership patterns of the two genders, except for the small peak early in the morning for women. This gives us a useful insight — a feature based on show timestamp would not be useful for a gender classifier.
Let's now examine how viewership is spread over the whole period, rather than just a day.
In [9]:
days = df.groupby(pd.DatetimeIndex(df['date']).normalize()).date.count()
days.plot(linewidth=4, title="TV viewership over time")
# Highlight weekends; this implementation only works if the starting day is a weekday
sats = pd.date_range(start=days.index[0], end=days.index[-1], freq="W-SAT")
suns = pd.date_range(start=days.index[0], end=days.index[-1], freq="W-SUN")
halfday = pd.tseries.offsets.DateOffset(hours=12)
for sat, sun in zip(sats, suns):
plt.axvspan(sat - halfday, sun + halfday, facecolor='g', alpha=0.25)
remove_border()
plt.savefig('presentation/images/graphs/timeseries.png', transparent=True, bbox_inches="tight")
The first obvious observation from this graph is that viewership increases substantially over the weekend, only to decline fairly smoothly through the week (the green semi-transparent bars indicate weekends). The increase on the December 17 weekend is especially steep, likely coinciding with the end of finals.
The weekend of February 25 appears to be an outlier, but this is explained by a major Philo outage. This weakens our data, but such gaps are an unfortunate part of real-world data sets.
Also, it is clear that viewership drops substantially over the Christmas holidays as students return to their homes. (Philo can only be used from the Harvard network.)
In [10]:
times_fb = fb_users.groupby('combined_id')['timed'].sum() / 60
times_nonfb = nonfb_users.groupby('combined_id')['timed'].sum() / 60
x_range = (min(times_fb.min(), times_nonfb.min()), max(times_fb.max(), times_nonfb.max()))
plt.hist(times_nonfb, bins=50, range=x_range, log=True, alpha=0.5, label="Non-Facebook users", color="#FF3F19")
plt.hist(times_fb, bins=50, range=x_range, log=True, alpha=0.5, label="Facebook users", color="#3B5998")
plt.title("Viewing volume distribution")
plt.xlabel('Minutes spent watching TV')
plt.legend(loc='best')
remove_border()
plt.savefig('presentation/images/graphs/distribution.png', transparent=True, bbox_inches="tight")
It is clear that there is an enormous skew toward low volume viewers — note that the y axis has a logarithmic scale. We expected sparsity in the data, but it is stronger than we anticipated.
The graph also shows that non-Facebook users are less engaged. This is likely because of technical limitations: session cookies do not sync across devices and are sometimes erased by the user, leading to users being represented multiple times under different session IDs.
We leveraged the unique nature of our online TV viewership dataset to develop what we call "stickiness", a dynamic measure of viewer engagement. To create the metric, we used a Markov model in conjunction with the PageRank algorithm.
A user watching cable TV, whether through a cable box or an Internet cable TV provider such as Philo, will watch a variety of shows over his or her lifetime with the provider. However, cable TV is fundamentally different from on-demand content providers such as Netflix, Hulu, or YouTube. One important difference is the limited content available at any given time: content is curated by the provider rather than selected by the user. In addition, cable lacks content ratings and robust user and usage tracking. These characteristics make it very challenging to provide relevant content recommendations to cable TV viewers. From our own experience watching TV, we made a number of assumptions about TV viewers and TV-viewing habits, chief among them that viewers prefer the shows that keep them most engaged and are less likely to switch away from them.
TV content is not rated, either by a five-star system or approval/disapproval. Therefore, in order to rank TV shows based on user preferences, we must look at user behavior. In particular, taking into account the nature of content availability on TV, we can measure TV show stickiness by determining which TV shows capture a user's attention the most — what keeps them most engaged. We define 'stickiness' as the proportion of time that a user watches a given TV show relative to the length of the show available at that time. Stickiness helps us understand how likely a user is to watch the show instead of switching to a different one.
A naive way of ranking TV show preferences would be to look at the amount of time a user watched a given show. This approach is limited because TV content is made available in very different amounts: there are only a few hours of primetime programming, but news and sports are broadcast for much longer stretches. Because of these differences in airtime, looking solely at the amount of time a user watches certain shows may give misleading recommendations.
As we noted during exploratory data analysis, some of the most-watched shows are sports and news, while the shows that are typically most popular with viewers rank much lower in total time viewed. Therefore, we need a better model to rank these shows for the purposes of recommendation.
Taking into account the differences in content availability, we can re-rank TV show preferences by determining which TV shows keep viewers glued to their screens. We define 'stickiness' as the proportion of time that a user watches a given TV show relative to the length of the show available at that time:
$$E(Stickiness) = \dfrac{\text{time user watches show}}{\text{time show is on}}$$Stickiness helps us understand how likely a user is to watch the entire show. From our previous assumptions, more engaging shows are more highly preferred by TV viewers while less engaging shows are less preferred.
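As a toy illustration of the formula above, here is a minimal sketch with made-up numbers; only the 'show' and 'timed' column conventions are borrowed from df, and the airtimes are invented rather than taken from the real schedule (which is reconstructed further down).
# toy illustration of E(Stickiness) = seconds watched / seconds on air
# all numbers below are invented for illustration
import pandas as pd

fragments = pd.DataFrame({'show':  ['NFL Football', 'NFL Football', 'The Big Bang Theory'],
                          'timed': [3000.0, 1500.0, 1200.0]})   # seconds watched per fragment
airtime = pd.Series({'NFL Football': 5 * 3600.0,                # five hours on the air
                     'The Big Bang Theory': 30 * 60.0})         # one half-hour slot

naive_stickiness = fragments.groupby('show')['timed'].sum() / airtime
print naive_stickiness
# NFL Football: 4500/18000 = 0.25; The Big Bang Theory: 1200/1800 ~ 0.67 --
# the sitcom is "stickier" even though football accumulates more raw minutes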
While we can normalize TV viewership by the amount of time a show is on to determine stickiness, this still leaves out the effect of the TV schedule. Stickiness is not a static quantity, but conditional on what else is on TV at a given time. For instance, at 5:00 AM there isn't much on TV except local news, which makes local news relatively engaging: there is nothing else to switch to that the user would presumably prefer. Say the user watches "Fox 25 Morning News" because that is the most engaging option available. We can therefore redefine stickiness as a conditional probability:
$$P(\text{Fox 25} \mid \text{Show listings at 5 AM}) = 0.99$$This generalizes to the stickiness of every show at every time:
$$E(Stickiness \mid Lineup) = \dfrac{E(\text{time user watches show} \mid Lineup)}{\text{time show is on}}$$However, this poses a more difficult task. How do we determine the stickiness of a certain show independent of its schedule? Or, more precisely, how do we measure stickiness given the entire set of shows on TV? One way to transform stickiness is via Bayes' rule. Continuing our earlier example, one formulation is the following:
$$P(A) = \dfrac{P(A \mid B) \times P(B)}{P(B \mid A)}$$$$P(\text{Fox 25}) = \dfrac{P(\text{Fox 25} \mid \text{Show listings @ 5 AM}) \times P(\text{Show listings @ 5 AM})}{P(\text{Show listings @ 5 AM} \mid \text{Fox 25})}$$From the above, it would initially appear straightforward to calculate the stickiness of our show. However, many of these quantities are computationally and statistically difficult to obtain. For one, it is difficult to determine the probability of a certain set of show listings at a given time over the entire set of observed data: there is a huge number of permutations of show listings that can be on at a given time, and any update to our set of shows as time goes on would require us to recalculate all of the probabilities. It is also difficult to determine the probability of, say, Fox 25 given our show listings, since that distribution is not necessarily known or uniform (it would be unreasonable to assume it is simply 1 / the number of shows in the listing).
Therefore, we need a better statistical tool than Bayes' rule to assess stickiness. We propose a Markov chain model for TV show stickiness that can help determine the predicted stickiness both overall and conditional on a schedule.
Instead of directly calculating the stickiness of each show from the amount of time a user watches it, our model derives stickiness as the stationary probability of landing on that show in a Markov chain of the other shows on at that time. We have several options for assigning edge weights to calculate the transition probabilities between a given node (show A) and a neighboring node (show B). These include raw counts of channel switches between shows, and switch probabilities normalized by how long each show was on the air.
The first option is the simpler of the two, so we try it first. To implement it, we weight each edge by the number of crossings from one node to the other. A crossing from show A to show B occurs when a user switches the channel within the same viewing session, where a "viewing session" is defined as a set of temporally adjacent shows. We created a directed graph by determining show crossings for each user in our data set over the entire three-month period.
To determine the stationary probabilities, we run the PageRank algorithm on this network. PageRank is useful here because it roughly corresponds to a user starting from a random show and walking along edges, preferring higher-weighted edges, to see which show he or she ends up on. Accordingly, in the context of our network, a higher PageRank score is a good proxy for stickiness.
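As a minimal toy sketch of this idea, consider a three-show switching graph with invented switch counts; running PageRank on it gives the stationary probability of landing on each show.
# toy three-show switching graph; edge weights are invented switch counts
import networkx as nx

toy = nx.DiGraph()
toy.add_edge('Local News', 'NFL Football', weight=8)          # 8 observed switches news -> football
toy.add_edge('NFL Football', 'Local News', weight=1)
toy.add_edge('Local News', 'The Big Bang Theory', weight=5)
toy.add_edge('The Big Bang Theory', 'NFL Football', weight=2)

# stationary probability of ending up on each show; higher = "stickier" in this naive sense
print nx.pagerank(toy, alpha=0.9, weight='weight')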
In [11]:
"""
Function
--------
tivli_graph
Turn the tivli graph data into a NetworkX Digraph.
Edge weights are the number of transitions between shows.
Parameters
----------
data : joined show fragments
Returns
-------
graph : A NetworkX DiGraph, with the following properties
* Each node is a show. For a label, use the 'show' title with
network as a property
* Each edge from A to B is assigned a weight equal to how many
times a user jumped from A to B
"""
def tivli_graph(show_list):
show_list = show_list.copy()
graph_obj = nx.DiGraph()
show_comb = show_list.groupby('combined_id')
graph_obj.add_node("Offsite")
for user, shows in show_comb:
last_show = pd.to_datetime("")
last_show_name = "Offsite"
i = 0
end = len(shows.sort(columns='date'))
for _,show in shows.sort(columns='date').iterrows():
show_start = show[0]
show_name = show[1]
show_network = show[2]
show_time = show[3]
show_end = show[0] + pd.DateOffset(seconds=show_time)
if graph_obj.has_node(show_name):
graph_obj.node[show_name]['time_watched'] += show_time
else:
graph_obj.add_node(show_name, network=show_network, time_watched=show_time)
if last_show >= show_start - pd.DateOffset(seconds=30):
# case 1: user switches channels
newweight = 1
if graph_obj.has_edge(last_show_name,show_name):
newweight = graph_obj[last_show_name][show_name]['weight'] + 1
graph_obj.add_edge(last_show_name, show_name, weight=newweight)
elif i == 0:
# case 2: user uses tivli for first time
newweight = 1
if graph_obj.has_edge(last_show_name,show_name):
newweight = graph_obj[last_show_name][show_name]['weight'] + 1
graph_obj.add_edge(last_show_name, show_name, weight=newweight)
elif last_show <= show_start - pd.DateOffset(seconds=30):
# case 3: user leaves site, comes back
# need to make two edges
newweight = 1
if graph_obj.has_edge(last_show_name,"Offsite"):
newweight = graph_obj[last_show_name]["Offsite"]['weight'] + 1
graph_obj.add_edge(last_show_name, "Offsite", weight=newweight)
newweight = 1
if graph_obj.has_edge("Offsite",show_name):
newweight = graph_obj["Offsite"][show_name]['weight'] + 1
graph_obj.add_edge("Offsite", show_name, weight=newweight)
elif i+1 == end:
# case 4: we're at the end of our dataset, assume offsite
newweight = 1
if graph_obj.has_edge(show_name,"Offsite"):
newweight = graph_obj[show_name]["Offsite"]['weight'] + 1
graph_obj.add_edge(show_name, "Offsite", weight=newweight)
last_show = show_end
last_show_name = show_name
i += 1
return graph_obj
In [12]:
tivli_net = tivli_graph(df)
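As a quick sanity check, we can rank shows by PageRank directly on this raw switch-count graph; the parameters below (alpha, tol, max_iter) are assumptions, since the visualization itself was produced in Gephi with its own settings.
# rough ranking of shows by PageRank on the raw switch-count graph (parameters are assumptions)
# the artificial "Offsite" node will likely rank very highly and can be ignored
pr_naive = nx.pagerank(tivli_net, alpha=0.9, weight='weight', tol=0.001, max_iter=200)
for show, score in sorted(pr_naive.items(), key=lambda x: x[1], reverse=True)[:10]:
    print show, score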
While the above approach is powerful, in that it can quickly produce a rough picture of the viewership network, it does not paint a complete one. We notice that there are very few tightly connected nodes, despite the large number of shows in our data set. We also see that shows like "NFL Football" and other sports have a disproportionately high PageRank (indicated by node size), while other shows that we know are popular (such as "How I Met Your Mother") have a relatively small PageRank. This is likely a result of not accounting for the amount of time a certain show is on. In addition, the methodology introduces a dangerous bias: for shows that are very engaging but not on TV often, the number of edge crossings will be very low compared to shows that are aired frequently.
Accordingly, we need to normalize the edge crossings by the length of time the show was on TV in that viewing session. That is, we normalized the edge weights from show A to show B by setting them equal to the stickiness of show B when coming from show A. In other words, the stickiness of B after moving from A equals the transition probability from show A to show B. One can think of it as "the probability of being fully engaged at B given currently at show A." The higher the stickiness of B when coming from A, the higher the probability of staying at show B and watching it to the end. We perform these calculations for every user and show in the dataset to produce an array of edge weights for every edge, where each entry corresponds to a normalized transition a user made between the two shows. We calculate the mean and standard deviation for these arrays at each edge and use the mean of the edge weights as our initial edge weight parameter.
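To make the normalization concrete, here is a small sketch, with invented numbers, of what each observed transition contributes to an edge: the fraction of the available airtime actually watched and its complement, whose mean and spread become the edge's weight parameters in the graph built in the cells below.
# toy illustration of the per-transition contributions to a single A -> B edge
# all numbers are invented for illustration
import numpy as np

time_watched = np.array([600.0, 300.0, 1500.0])    # seconds watched around each observed switch
time_on      = np.array([1800.0, 1800.0, 1800.0])  # seconds the show was available in that session

stick_contrib  = time_watched / time_on            # per-transition stickiness
switch_contrib = 1.0 - stick_contrib               # per-transition switch probability
print stick_contrib.mean(), switch_contrib.mean(), switch_contrib.std()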
In [13]:
"""
Function
--------
recreate_schedule
Tries to recreate the show schedule for Tivli on a given date.
Assumes all shows are at least 30 minutes long. Allows overlaps due to some users
being behind the current stream by pausing their video feed.
Parameters
----------
df: Tivli joined fragment data
Returns
-------
schedule: a pandas DataFrame Object
columns: start time, end time, show, network
"""
def recreate_schedule(df):
sched = []
# loop through networks
for network,t in df.groupby('network'):
# loop through each show on the network
for show_name,show in t.groupby('show'):
test = show.set_index('date').sort().resample('30Min')
test2 = test[test['timed'] > 0]
test3 = [x[0] for i,x in enumerate(test2.iterrows())]
groups = []
if len(test3) == 1:
sched.append({'show':show_name, 'network':network, 'start':test3[0],
'end':test3[0] + pd.DateOffset(seconds=1800), 'time_on':1800})
for i in range(1,len(test3)):
if test3[i] == test3[i-1] + pd.DateOffset(seconds=1800):
groups.append(test3[i-1])
groups.append(test3[i])
else:
groups.append(test3[i-1])
groups.append(test3[i-1] + pd.DateOffset(seconds=1800))
#groups.sort()
sched.append({'show':show_name, 'network':network, 'start':groups[0], 'end':groups[-1],
'time_on':(groups[-1]-groups[0]).seconds})
groups = []
if groups != []:
groups.append(groups[-1] + pd.DateOffset(seconds=1800))
sched.append({'show':show_name, 'network':network, 'start':groups[0], 'end':groups[-1],
'time_on':(groups[-1]-groups[0]).seconds})
return pd.DataFrame(sched)
all_schedules = recreate_schedule(df)
all_schedules = all_schedules.set_index(['network', 'show'])
def show_length(sch, show_network, show_name, show_start):
return sch.ix[show_network, show_name][sch.ix[show_network, show_name].start <= show_start]['time_on'].irow(0)
def tto(sch, show_network, show_name):
return sch.ix[show_network, show_name]['time_on'].sum()
Using data from the TV schedule, we create a viewership network for the second-iteration Markov model.
In [14]:
def tivli_stickiness(show_list, sch):
log = []
show_list = show_list.copy().sort(columns='date')
graph_obj = nx.DiGraph()
show_comb = show_list.groupby('combined_id')
graph_obj.add_node("Offsite")
for user, shows in show_comb:
last_show_start = pd.to_datetime("")
last_show_end = pd.to_datetime("")
last_show_name = "Offsite"
last_network = "Offsite"
last_time_watched = 0
i = 0
end = len(shows)
for _,show in shows.iterrows():
time_on = 0
show_start = show[0]
show_name = show[1]
show_network = show[2]
show_time = time_watched = show[3]
show_end = show[0] + pd.DateOffset(seconds=show_time)
if graph_obj.has_node(show_name):
graph_obj.node[show_name]['time_watched'] += show_time
else:
graph_obj.add_node(show_name, network=show_network, time_watched=show_time)
if (last_show_end >= show_start - pd.DateOffset(seconds=30)) & (last_network != show_network):
# case 1: user switches channels
if i != 0:
last_time_on = show_length(sch, last_network, last_show_name, last_show_start)
graph_helper_2(graph_obj, last_show_name, show_name, last_time_watched, last_time_on)
elif i == 0:
# case 2: user uses tivli for first time
time_on = show_length(sch, show_network, show_name, show_start)
graph_helper_2(graph_obj, "Offsite", show_name, time_on, time_on)
elif last_show_end <= show_start - pd.DateOffset(seconds=30):
# case 3: user leaves site, comes back
# need to make two edges
if i != 0:
last_time_on = show_length(sch, last_network, last_show_name, last_show_start)
time_on = show_length(sch, show_network, show_name, show_start)
graph_helper_2(graph_obj, last_show_name, "Offsite", last_time_watched, last_time_on)
graph_helper_2(graph_obj, "Offsite", show_name, time_on, time_on)
elif i+1 == end:
# case 4: we're at the end of our dataset, assume offsite
time_on = show_length(sch, show_network, show_name, show_start)
graph_helper_2(graph_obj, show_name, "Offsite", time_watched, time_on)
last_show_start = show_start
last_show_end = show_end
last_show_name = show_name
last_time_watched = show_time
last_network = show_network
i += 1
#calculate total time on
for node in graph_obj.nodes():
show_name = node
if node == "Offsite":
continue
show_network = graph_obj.node[node]['network']
time_watched = float(graph_obj.node[node]['time_watched'])
total_time_on = float(tto(all_schedules, show_network, show_name))
graph_obj.add_node(node, network=show_network, time_watched=time_watched, time_on=total_time_on)
#finally: do edge math
for edgea,edgeb in graph_obj.edges():
if edgea == edgeb: # no self-nodes
graph_obj.remove_edge(edgea, edgeb)
continue
switch_mean = float(np.mean(graph_obj[edgea][edgeb]['switch_arr']))
switch_std = float(np.std(graph_obj[edgea][edgeb]['switch_arr']))
switch_arr = len(graph_obj[edgea][edgeb]['switch_arr'])
graph_obj.add_edge(edgea, edgeb, weight=(1-switch_mean), weight_std=switch_std, prob=switch_mean, switch_arr=switch_arr)
return graph_obj
def graph_helper_2(graph_obj, last_show_name, show_name, time_watched, time_on):
if time_watched <= time_on:
switch_prob = [1 - float(time_watched)/time_on]
else:
switch_prob = [0]
if graph_obj.has_edge(last_show_name,show_name):
switch_prob += graph_obj[last_show_name][show_name]['switch_arr']
graph_obj.add_edge(last_show_name, show_name, switch_arr=switch_prob)
In [15]:
stick_graph = tivli_stickiness(df, all_schedules)
We calculated the PageRank on the entire network and visualized the resulting network. (For the purposes of our visualization, we filtered out nodes with a degree below 10.) We observe that the network has become much larger and more detailed, with many more similar shows grouped together (grouping was determined by Gephi's modularity). However, we still see sports dominating the rest.
To correct this, and to get a better idea of how edge weight affects PageRank, we bootstrapped the PageRank calculation: we randomized the edge weights based on their means and standard deviations (i.e. added Gaussian noise) and recalculated the PageRank at each node, repeating this for 1000 iterations. We then assigned each node the mean PageRank over these iterations, yielding a bootstrapped version that accounts for large edge-weight uncertainty by reducing or increasing the PageRank at many nodes.
In [16]:
stick_graph2 = stick_graph.copy()
for edgea,edgeb in stick_graph2.edges():
if edgea == edgeb:
stick_graph2.remove_edge(edgea, edgeb)
else:
nums = stick_graph2[edgea][edgeb]["switch_arr"]
if nums<=10:
stick_graph2.remove_edge(edgea,edgeb)
stick_graph2.remove_node("Offsite")
pagerank_val_arr = []
iterations = 1000
i = 0
debug = False
while i < iterations:
for edgea,edgeb in stick_graph2.edges():
# step 1: randomize edge weights
nums = stick_graph2[edgea][edgeb]["switch_arr"]
weight_std = stick_graph2[edgea][edgeb]["weight_std"]
weight = stick_graph2[edgea][edgeb]["weight"]
if weight<0:
weight=0
if weight_std==0:
weight_std=0.001
newweight = np.mean(np.random.normal(weight, weight_std, size=nums))
stick_graph2.add_edge(edgea, edgeb, newweight=newweight)
try:
pr=nx.pagerank(stick_graph2,alpha=0.9,weight='newweight',tol=.001, max_iter=200)
except:
# pagerank not guaranteed to converge
continue
pagerank_val_arr.append([np.array(sorted(pr.items(), key=lambda x:x[0]))[:,1]])
if debug:
if i % 2 == 0:
print i
i += 1
means = []
newarr = np.vstack(pagerank_val_arr)
for i in range(0,2388):
t = list(newarr[:,i])
t = map(lambda x:float(x),t)
means.append(np.mean(t))
test = dict()
keys = np.array(sorted(pr.items(), key=lambda x:x[0]))[:,0]
values = means
for i in range(0,2388):
test[keys[i]] = float(values[i])
stickiness = sorted(test.items(), key=lambda x:x[1], reverse=True)
for edgea,edgeb in stick_graph2.edges():
stick_graph2.add_edge(edgea, edgeb, newweight=0)
for node in stick_graph2.nodes():
stick_graph2.add_node(node, pagerank=test[node])
In [17]:
df['global_stickiness'] = 0.0
# go through each show in stickiness calculation
for (show,value) in stickiness:
# add global_stickiness to each show in df
# ignoring time here for simplicity but can calculate it
# over a time range later
df.loc[df.show == show, 'global_stickiness'] = value
In [18]:
#df.to_csv('data/processed_with_stickiness.csv')
# added global stickiness for each show
df.set_index('show')['global_stickiness'].head(10)
Out[18]:
This new graph now corrects for the large discrepancies between sports and other shows in the PageRank, and brings up the importance of previously lower-pageranked shows in the network.
To determine the stickiness of a show at a certain time and date, we can filter the network and display only the relevant nodes. The code below recalculates the PageRank (i.e. predicted stickiness) for the specified time range.
The following code can be run by going to http://indynamics.org/cs109/ with the appropriate start and end times passed as GET parameters.
In [19]:
from pygments import highlight
from pygments.lexers import PythonLexer
from pygments.formatters import HtmlFormatter
from IPython.display import HTML
import urllib
thecode = open("nodes_active.py").read()
thehtml=highlight(thecode, PythonLexer(), HtmlFormatter())
HTML(thehtml)
Out[19]:
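Since nodes_active.py is hosted externally and only displayed above, here is a rough sketch of the kind of filtering it performs: keep only the shows that, according to the reconstructed schedule, were on the air in a given window, then rerun PageRank on the induced subgraph. The function name, parameters, and timestamps below are illustrative assumptions rather than the deployed code.
# illustrative sketch only -- not the deployed nodes_active.py
def stickiness_in_window(graph, schedule, start, end):
    # keep shows whose scheduled airing overlaps [start, end]
    sched = schedule.reset_index()
    on_air = set(sched[(sched.start < end) & (sched.end > start)]['show'])
    active = [n for n in graph.nodes() if n in on_air]
    sub = graph.subgraph(active)
    # recompute PageRank (i.e. predicted stickiness) on the induced subgraph
    return nx.pagerank(sub, alpha=0.9, weight='weight', tol=0.001, max_iter=200)

# example call (timestamps are arbitrary; use tz-aware times matching the US/Eastern data):
# stickiness_in_window(stick_graph2, all_schedules,
#                      pd.Timestamp('2012-02-05 17:00', tz='US/Eastern'),
#                      pd.Timestamp('2012-02-05 23:00', tz='US/Eastern'))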
In order to make even more robust recommendations, it may be useful to enrich our data set by using machine learning to infer relevant features or to fill in missing ones. In this section, we investigate whether we can use the available data to (1) infer show genres and (2) predict user genders.
The Philo dataset presents an interesting opportunity to use the machine learning classification techniques presented in class; namely, can we (1) infer a show's genre from its descriptions and network, and (2) predict a user's gender from his or her viewing history?
First, we acquire show description and genre data from the file shows_with_episodes_descriptions.txt. The labeled data has some inconsistencies in category names, which we address by relabeling genres. We also handle shows that lack a description by making the show name the description in those cases (and including each such case only once, so as not to repeat the show name for every episode). There are 302 entries without any episode or description information, covering 301 unique shows, meaning that most of these were one-time shows.
In [20]:
show_df = pd.read_table('data/shows_with_episodes_descriptions.txt', sep='\t', names = ['show','category','episode_name','description'])
# remove unnecessary quotes for category names
show_df['category'] = show_df['category'].str.replace("'", '')
print sorted(list(show_df.category.unique()))
print show_df.count()
# merging sports categories together
show_df = show_df.replace(to_replace = {'category': {'Basketball': 'Sports', 'Football': 'Sports', 'Tennis': 'Sports'}})
# merging sci-fi/fantasy, fantasy, and science fiction categories together
show_df = show_df.replace(to_replace = {'category': {'Fantasy': 'Sci-Fi/Fantasy', 'Science Fiction': 'Sci-Fi/Fantasy'}})
# merging Action/Adventure and Action and Adventure together
# merging Kids, Children, and Family/Children together
# add Musicals to Movies
# merging News and Current Events together
# merging Suspense and Mystery together
# merging Documentary and Documentary/Bio together
# adding soap operas to the drama category
show_df = show_df.replace(to_replace= {'category': {'Action and Adventure': 'Action/Adventure', 'Kids': 'Children',
'Family/Children': 'Children', 'Musical': 'Movies',
'Current Events': 'News/Current Events',
'News': 'News/Current Events', 'Suspense': 'Mystery/Suspense',
'Mystery': 'Mystery/Suspense', 'Documentary/Bio': 'Documentary',
'Soap Opera': 'Drama'}})
print show_df.count()
print sorted(list(show_df.category.unique()))
# we'll require at least a show description or an episode name
# In cases that we don't have a show description, or episode name, we'll make the show name the description (since it's probably a special).
# We'll only include these cases in the dataset once, so that we don't repeat show names in our concatenated descriptions
# shows_wo_desc = show_df[np.all([show_df.description.isnull(), shows_wo_desc.episode_name.isnull()])]
shows_wo_desc_or_name = show_df[np.all([show_df.description.isnull().values,
show_df.episode_name.isnull().values], axis = 0)].drop_duplicates()
shows_wo_desc_or_name['description'] = shows_wo_desc_or_name['show']
shows_w_desc_or_name = show_df[np.any([show_df.description.notnull().values,
show_df.episode_name.notnull().values], axis = 0)]
show_df = pd.concat([shows_wo_desc_or_name, shows_w_desc_or_name])
print show_df.count()
grouped = show_df.groupby(by = 'category')
grouped.count()
Out[20]:
We create a dataframe with all of the titles and descriptions of each show's episodes in a single field. This involves data cleaning: removing punctuation, merging lines together, and stripping beginning and ending quotation marks from our input strings. We will later use this dataframe to create a (very sparse) bag-of-words matrix, from which we will identify key features and build our classifier.
In [21]:
def clean_and_concatenate_description(show_df):
# remove quotes, concatenate episode name and description, lowercase all
show_df = show_df.fillna({'episode_name':'','description':''})
show_df['episode_name'] = show_df['episode_name'].str.replace('"', '')
show_df['description'] = show_df['description'].str.replace('"', '')
show_df[show_df.description.isnull()]['description'] = ''
show_df['episode_name_description'] = show_df.episode_name + " " + show_df.description
show_df['episode_name_description'] = show_df['episode_name_description'].str.lower()
# make sure episodes are unique
show_df = show_df[['show','episode_name_description']].drop_duplicates()
# cleaning further to remove punctuation
punctuation = '.,?;:!@#()[]^<>/'
for each in punctuation:
show_df['episode_name_description'] = show_df['episode_name_description'].str.replace(each, '')
# now we want one line for each show with ALL the words used to name and describe episodes
descriptions = show_df.groupby('show')['episode_name_description'].agg({'all_descriptions': lambda col: ''.join(col)})
return descriptions
descriptions = clean_and_concatenate_description(show_df)
# as a check, counting the length of the strings in the all_descriptions column
lengths = descriptions.all_descriptions.str.len().values
print "The maximum length of our merged descriptors is:", max(lengths)
print "The minimum length of our merged descriptors is:", min(lengths)
print descriptions.head()
print descriptions.count()
In [22]:
unique_show_category = show_df[['show','category']].drop_duplicates()
descriptions_and_cats = pd.merge(unique_show_category, descriptions, how='left', on=None, left_on=['show'], right_on=None,
left_index=False, right_index=True, sort=True,
suffixes=('_x', '_y'), copy=True)
print descriptions_and_cats.head()
Another variable to include in the model is the network on which a given show airs. Intuitively, we expect that TV network will be a useful feature in determining a show's genre. For instance, Lifetime usually plays drama and Comedy Central usually plays comedy. (If a show plays on more than one network, we assigned the first one.)
In [23]:
other_show_data = df[['show','network']].drop_duplicates(cols = 'show')
For the matching to work correctly, we needed to clean the show names.
In [24]:
def clean_show_names(show_df, shows_in='show', index=False):
# input dataframe with show column
# returns the df with an appended column, clean_show, that doesn't have '
# this is bad, but looping through to use a string operator to remove beginning and ending apostrophes from strings
templist = []
if index == False:
show_names = show_df[shows_in]
else:
show_names = show_df.index
for each in show_names:
if (each[0] == "'" and each[len(each)-1] == "'"):
# deprecated after discovery of apostrophe issues
#if each[1:len(each)-1][0:2] == "''": # special case of 3 apostrophes at beginning - e.g. '''Til Death'
#templist.append(each[2:len(each)-1])
#else:
newtitle = each[1:len(each)-1]
else:
newtitle = each
# correction for apostrophe issue
newtitle = newtitle.replace("''","'")
templist.append(newtitle)
show_df['clean_show'] = templist
return show_df
In [25]:
clean_show_df = clean_show_names(show_df)
clean_descriptions_df = clean_show_names(descriptions_and_cats)
print clean_descriptions_df.head()
In [26]:
merged_sets = pd.merge(clean_descriptions_df[['clean_show','category','all_descriptions']], other_show_data, how='left', on=None,
left_on=['clean_show'], right_on=['show'], left_index=False, right_index=False, sort=True,
suffixes=('_x', '_y'), copy=True)
In [27]:
grouped = merged_sets.groupby(by='category')
grouped.count()
Out[27]:
It seems that we only have network data for about half of the shows. This is an oddity of the dataset that we are unable to address: we only have network data for the shows that users actually finished watching. Also, now that we have one observation per show (rather than one observation per episode), we can see which categories have very, very few observations. Classifying data into the "Science" category is going to be impossible, for example, since we only have two shows in that category. Accordingly, we collapse the genres into broader labels: 'Educational', 'Fictional', 'Lifestyle', 'News/Current Events', and 'Sports'. (Shows labeled only 'Series', 'Special', 'Other', or 'General' carry no genre information and are treated as unlabeled below.)
In [28]:
merged_sets = merged_sets.replace(to_replace= {'category': {'Action/Adventure':'Fictional', 'Children': 'Fictional',
'Mystery/Suspense':'Fictional', 'Western': 'Fictional',
'Sci-Fi/Fantasy':'Fictional', 'Drama': 'Fictional',
'Comedy':'Fictional', 'Romance':'Fictional', 'Movies':'Fictional',
'Fiction':'Fictional','Weather':'News/Current Events',
'Public Affairs': 'News/Current Events', 'Health':'Lifestyle',
'Travel':'Lifestyle','Cooking':'Lifestyle','Reality':'Lifestyle',
'Religious':'Lifestyle','Talk Show':'Lifestyle',
'Science':'Educational','Business':'Educational',
'Documentary':'Educational'
}})
grouped = merged_sets.groupby(by='category')
grouped.count()
Out[28]:
The next step is to separate the labeled data from the unlabeled data. In this case, the unlabeled data actually has a few labels (that don't correspond to genres) - 'Series', 'Special', 'Other', and 'General'.
In [29]:
unlabeled_data = merged_sets[np.any([merged_sets.category == 'Special',
merged_sets.category == 'Series',
merged_sets.category == 'General',
merged_sets.category == 'Other',], axis = 0)]
labeled_data = merged_sets[np.any([merged_sets.category == 'Educational',
merged_sets.category == 'Fictional',
merged_sets.category == 'Lifestyle',
merged_sets.category == 'Sports',
merged_sets.category == 'News/Current Events'], axis = 0)]
print "merged data"
print merged_sets.count()
print ""
print "unlabeled data"
print unlabeled_data.count()
print ""
print "labeled data"
print labeled_data.count()
We create a bag of words from the description data, with category as the output. The network data is not included in the bag of words, since the network isn't part of the show description; instead, that variable is coded nominally and then merged in.
In [30]:
# helper function that maps genres to IDs
def make_nominal_numbers(alist):
items = list(set(alist))
mymap = {}
for i in xrange(1,len(items)+1):
mymap[items[i-1]] = i
output = []
for j in xrange(len(alist)):
output.append(mymap[alist[j]])
# flip the map, since we'll use it in the future to go from id to genre
mymap = dict((y,x) for x,y in mymap.iteritems())
return mymap, output
# function like in HW3 that returns a bag of words and y vector
# this y-vector is not just 0-1 - there are many categories/genres
def make_xy(dfl, dful, vectorizer=None):
text = list(dfl['all_descriptions']) + list(dful['all_descriptions'])
if (vectorizer == None):
vectorizer = CountVectorizer(min_df=0)
vectorizer.fit(text)
x = vectorizer.transform(text)
## append in the network data
network_map, networks = make_nominal_numbers(list(dfl['network'])+ list(dful['network']))
a = np.reshape(networks,(x.shape[0],1))
x = hstack((x,a))
x = x.tocsc()#to prevent memory errors
genre_map, y = np.array(make_nominal_numbers(list(dfl['category'])))
xu = x[len(y):x.shape[0],:]
x = x[0:len(y),:]
return x, y, genre_map, xu
# creating x,y groups, genre map, and x frame for unlabeled data
x,y,genre_map,xu = make_xy(labeled_data,unlabeled_data)
We split the data into training and test groups so that we can cross-validate our model. We test several models against one another and adjust the parameters; the validation set helps us to evaluate these choices.
In [31]:
# split the data into training set and validation set
xtrain, xtest, ytrain, ytest = train_test_split(x,y, test_size = 0.5, random_state=5)
There are far too many words to feed straight into a Support Vector Machine, so it makes sense to first determine which words matter most for predicting genre. This is feature selection, and scikit-learn has several tools to aid in the process. Which feature selection method we use depends on the classification model; scikit-learn recommends using a Pipeline to chain the two steps together. In the cell below, we create a generic model test that runs the pipeline of feature selection and classification and then scores the model.
In [32]:
# Let's score a few different models against one another - all of the models have to make intuitive sense, though
def score_model(feature_selection, classification):
clf = Pipeline([
('feature_selection', feature_selection),
('classification', classification)
])
clf.fit(xtrain, ytrain)
return clf.score(xtrain,ytrain), clf.score(xtest,ytest)
Since we are classifying into many genres (not just the fresh/rotten classification from Homework 3) and our bag-of-words space is high-dimensional even after feature selection, we will use an SVM as our classification model. Univariate feature selection also seems to be the best method for the preprocessing step, but it is not immediately clear which statistical test to use to select the best features. Here, I hold the model and the percentile of features steady and test chi-squared against ANOVA F-values. They yield approximately the same results on the training and test sets, so we will use chi-squared going forward.
In [33]:
model1 = (SelectPercentile(chi2, percentile=1), SVC())
model2 = (SelectPercentile(f_classif, percentile=1), SVC())
model1score = score_model(model1[0],model1[1])
print "model 1 scored: ",model1score
model2score = score_model(model2[0],model2[1])
print "model 2 scored: ",model2score
clf = Pipeline([
('feature_selection', model1[0]),
('classification', model1[1])
])
# I want to see what share of the predictions are correct for each genre
def chart_predictions_by_genre(clf, x, y = None):
predictions = clf.predict(x)
predictiondf = pd.DataFrame(data = predictions, columns=['prediction'])
predictiondf['count'] = 1
counts = pd.DataFrame(index = [1,2,3,4,5])
if y is not None:
predictiondf['actual'] = y
predictiondf['same'] = predictiondf.actual == predictiondf.prediction
counts['Total'] = predictiondf.groupby(by='actual')['count'].agg([np.sum])
counts['Actual'] = predictiondf[predictiondf.same].groupby(by='actual')['count'].agg([np.sum])
counts['Share Correctly Predicted'] = counts.Actual/counts.Total
plotvar = 'Share Correctly Predicted'
title = 'Share of Test Set Accurately Predicted, by Genre'
else:
counts['Total'] = predictiondf.groupby(by='prediction')['count'].agg([np.sum])
plotvar = 'Total'
title = 'Number of Predictions, by Genre'
counts['Genre'] = [genre_map[val] for val in counts.index]
counts.plot(y=plotvar, x='Genre', title = title, kind='bar')
return counts
# I think we'll see higher accuracy in the genres for which we have more data
chart_predictions_by_genre(clf, xtest, ytest)
# showing how much training data we had for each category
countsytrain = Counter(ytrain)
for key in range(1,6):
newkey = genre_map[key]
countsytrain[newkey] = countsytrain[key]
del countsytrain[key]
print countsytrain
Next, we test the default Radial Basis Function kernel against the linear, polynomial, and sigmoid kernels. Here we use the default parameters for each kernel, just to get a sense of how they measure up against one another out of the box. The linear option seems to overfit, and the sigmoid option does not do quite as well as the polynomial and RBF versions. Since the RBF version fits a little better and does not overfit any more than the polynomial version, we will use it going forward.
In [34]:
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
def test_kernel(kernel):
feature_selection = model1[0]
classification = SVC(kernel=kernel)
score = score_model(feature_selection, classification)
return score
for kernel in kernels:
score = test_kernel(kernel)
print kernel, score
How many words matter? This data is very noisy, and if we include words that are associated with particular shows within a genre but are not typical of the genre at large, we will overfit the model. That said, we need many words for our model to work at all; almost all the data we have about the shows is descriptions. Finding the right balance is challenging: initially we thought we might need 10-20% of the features, but it turns out that we need only about 1%.
In [35]:
percentiles = [1, 5, 10, 20]
def test_percentile(percentile):
feature_selection = SelectPercentile(chi2, percentile=percentile)
classification = SVC()
score = score_model(feature_selection, classification)
return score
for percentile in percentiles:
score = test_percentile(percentile)
print percentile, score
percentiles2 = [0.1, 0.5]
for percentile in percentiles2:
score = test_percentile(percentile)
print percentile,score
After these tests, we decided to use an SVM with an RBF kernel on the top 1% of features selected by chi-squared tests. Now we need to select the best values of $C$ and $\gamma$ for the RBF kernel. A low $C$ gives smoother boundaries between classified regions, while a higher $C$ might give more accurate results (or just overfit the model). A low $\gamma$ means that no single point has a lot of influence; a high $\gamma$ gives more influence to individual points. To choose the best values, we follow the method described in the scikit-learn documentation.
In [36]:
# the below block is almost directly from the link above - I've never done this before!
C_range = 10.0 ** np.arange(-2, 9)
gamma_range = 10.0 ** np.arange(-5, 4)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedKFold(y=ytrain, n_folds=3)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
grid.fit(xtrain, ytrain)
print("The best classifier is: ", grid.best_estimator_)
In [37]:
feature_selection = SelectPercentile(chi2, percentile=1)  # top 1% of features, as chosen above
classification = SVC(C=1000.0, cache_size=200, degree=3, gamma=0.0001, max_iter=-1, shrinking=True, tol=0.001, verbose=False)
clf = Pipeline([
('feature_selection', feature_selection),
('classification', classification)
])
clf.fit(xtrain, ytrain)
clf.score(xtrain,ytrain)
chart_predictions_by_genre(clf, xtest, ytest)
# Again, emphasizing that we see higher accuracy in the genres for which we have more data
countsytrain = Counter(ytrain)
for key in range(1,6):
newkey = genre_map[key]
countsytrain[newkey] = countsytrain[key]
del countsytrain[key]
print countsytrain
In [38]:
clf.predict(xu)
# I think we'll see higher accuracy in the genres for which we have more data
chart_predictions_by_genre(clf, xu)
# showing how much training data we had for each category
countsytrain = Counter(ytrain)
for key in range(1,6):
newkey = genre_map[key]
countsytrain[newkey] = countsytrain[key]
del countsytrain[key]
print countsytrain
The final model in this section shows improvement over the original predictions, but there is a tradeoff between more accurate Lifestyle predictions and more accurate Sports predictions, and neither our original model nor the grid-searched model could distinguish the signal for News/Current Events from the noise of the description data. The data set, it seems, was not rich enough to label the data accurately, despite the approaches we took: reclassifying the data into larger genre groups and making careful choices about the type of model and its parameters.
In [39]:
def male_ratio(df, var='show'):
# make a 1-0 encoded column for males
df['male'] = pd.get_dummies(df['gender'])['male']
# make a counter column
df['counter'] = 1
# drop all other columns
df = df[[var,'fb_id','male','counter']]
# create a unique entry for each show and Facebook id (since we want the number of distinct viewers that watched this show, irrespective of the number of times watched)
df = df.drop_duplicates()
# group by show and make aggregate counts
grouped = df.groupby(var)
counts = grouped['counter'].aggregate({'Viewers': np.sum})
counts['male'] = grouped['male'].aggregate({'male': np.sum})
counts['share_male'] = counts['male']/counts['Viewers']
return counts
gender_counts = male_ratio(df,'show')
# But if you look at the data, you see that we should drop groups that only have a few viewers...
gender_counts.head()
Out[39]:
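As the comment above suggests, shows with only a handful of viewers yield unreliable gender shares. A quick filter along these lines illustrates the point; the cutoff of 10 viewers is an arbitrary choice for illustration, not the threshold used later (where we instead restrict to the 100 most-watched shows).
# drop shows with very few viewers before trusting their male/female shares
# (the cutoff of 10 is arbitrary and for illustration only)
reliable = gender_counts[gender_counts['Viewers'] >= 10]
print len(gender_counts), "shows in total;", len(reliable), "with at least 10 Facebook viewers"
print reliable.sort('share_male', ascending=False).head()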
Before we select features, we split the Facebook data into training and test sets so that we can validate the model later. To obtain a reasonably rich training set, we chose an 80/20 split. Also, since we have not built our feature set yet, we split the Facebook IDs and genders rather than a feature matrix.
In [40]:
unique_user_genders = fb_users[['fb_id','gender']].drop_duplicates()
unique_user_genders = unique_user_genders[unique_user_genders.gender.notnull()]
unique_user_genders['gender'] = unique_user_genders['gender'] == 'male'
fb_ids_train, fb_ids_test, y_train, y_test = train_test_split(unique_user_genders['fb_id'],
unique_user_genders['gender'],
test_size = 0.2, random_state = 2)
print len(fb_ids_train)
print len(fb_ids_test)
fb_users_train = fb_users[fb_users['fb_id'].isin(fb_ids_train)]
We face an important question: of the most-watched shows, which have the greatest gender disparity in viewership? Since Facebook users had access to more channels during the period our dataset covers, we need to limit the domain to shows that both Facebook and non-Facebook users could watch. We then rank the shows by the share of their Facebook viewers (in our training set) who are male, and keep only the 10 shows with the highest male viewership and the 10 with the highest female viewership. We do the same for networks, with a limit of 5 networks in each group. These limits are somewhat arbitrary, but looking at the data, we tried to get at least 70% male share on every "male" feature we selected.
In [41]:
# Define function that returns the n most watched shows (or networks)
def most_watched(df,n,var='show'):
    df_ = df[[var, 'uu_id']].drop_duplicates()
    df_['counter'] = 1
    grouped = df_.groupby(var)
    counts = grouped['counter'].aggregate({'Viewers': np.sum})
    counts = counts.sort('Viewers', ascending=False)
    return counts.head(n)
# list most watched shows among non-fb users (since everybody can watch those)
most_watched_shows = most_watched(nonfb_users,100)
# find which of the most watched shows have highest and lowest male viewership in the training data
show_counts = male_ratio(fb_users_train,'show')
num_shows = 10
shows_more_f = show_counts.loc[list(most_watched_shows.index)].sort('share_male').head(num_shows)
shows_more_m = show_counts.loc[list(most_watched_shows.index)].sort('share_male', ascending=False).head(num_shows)
# list networks watched by non-fb-users
most_watched_networks = most_watched(nonfb_users,100,'network')
# find which of the networks have highest and lowest male viewership
network_counts = male_ratio(fb_users_train,'network')
num_networks = 5
networks_more_f = network_counts.loc[list(most_watched_networks.index)].sort('share_male').head(num_networks)
networks_more_m = network_counts.loc[list(most_watched_networks.index)].sort('share_male', ascending=False).head(num_networks)
print "Women tend to watch"
print networks_more_f
print shows_more_f
print ' '
print "Men tend to watch"
print networks_more_m
print shows_more_m
To incorporate these shows and networks into our model, we need two new metrics: the share of each user's total viewing time spent watching each selected network, and the share spent watching each selected show.
In [42]:
# create data frame with total time watched calculated for each user
times = fb_users[['fb_id','timed']].groupby('fb_id')['timed'].agg({'tot_time':np.sum})
times.head()
# this function creates a dataframe with the total time viewing the specific show/network for each fb_id
def calc_share_time(df,var,value,times=times):
    a = df[df[var] == value]
    totals = a[['fb_id','timed']].groupby('fb_id')['timed'].agg({value:np.sum})
    merged = pd.merge(totals,times, how='left', sort=True, left_index=True, right_index=True, copy=True)
    shares = pd.DataFrame(merged[value].astype('float64')/merged['tot_time'].astype('float64'), columns = [value])
    return shares
# get shares for all of our selected networks and shows and pull them into a single dataframe
def fb_id_shares_df(df,network_list,show_list,times=times):
    final_output = pd.DataFrame(index = df['fb_id'].drop_duplicates())
    var = 'show'
    for value in show_list:
        new_shares = calc_share_time(df,var,value,times)
        final_output = pd.merge(final_output,new_shares, how='left', sort=True, left_index=True, right_index=True,copy=True)
    var = 'network'
    for value in network_list:
        new_shares = calc_share_time(df,var,value,times)
        final_output = pd.merge(final_output,new_shares, how='left', sort=True, left_index=True, right_index=True,copy=True)
    final_output = final_output.fillna(0)
    return final_output
var = 'show'
value = shows_more_f.iloc[1].name
calc_share_time(fb_users,var,value)
show_list = list(shows_more_f.index) + list(shows_more_m.index)
network_list = list(networks_more_f.index) + list(networks_more_m.index)
fb_id_shares_df = fb_id_shares_df(fb_users,network_list,show_list)
fb_id_shares_df.head()
Out[42]:
We cannot assume that every Facebook user watched one of these popular shows or networks. In fact, we lose a fair amount of data to users who did not watch any of them. We have to balance adding more shows and networks against keeping only those that actually have strongly skewed male or female viewership.
We think the choices we made above for the shows and networks are reasonable, despite the fact that we lose the observations counted below.
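As a rough illustration of that tradeoff, here is a sketch (using the show_counts, most_watched_shows, and fb_users objects defined above; shows only, so it slightly understates coverage since networks are ignored) that varies how many shows we keep per gender and counts how many Facebook users would have at least one nonzero feature:
# Coverage check (sketch): vary the number of shows kept per gender and count
# how many Facebook users watched at least one of the selected shows.
popular_shows = show_counts.loc[list(most_watched_shows.index)]
total_users = len(fb_users['fb_id'].drop_duplicates())
for k in [5, 10, 20]:
    picks = set(popular_shows.sort('share_male').head(k).index) | \
            set(popular_shows.sort('share_male', ascending=False).head(k).index)
    covered = len(fb_users[fb_users['show'].isin(list(picks))]['fb_id'].drop_duplicates())
    print k, "shows per gender:", covered, "of", total_users, "Facebook users covered"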
In [43]:
print "Number of Facebook users who saw at least one of our shows/networks:", len(fb_id_shares_df[fb_id_shares_df.sum(axis = 1) > 0].index)
print "Number of Facebook users who didn't see at least one of our shows/networks:", len(fb_id_shares_df[fb_id_shares_df.sum(axis = 1) == 0].index)
Now, we are going to split the features into the training and test sets that we established earlier to create the X for the classifier.
In [44]:
x_train = fb_id_shares_df.loc[fb_ids_train]
x_test = fb_id_shares_df.loc[fb_ids_test]
### note: I confirmed that the order is the same in the output as it was in the fb_ids_train and fb_ids_test lists,
# which means that it's also the same in the genders
def cut_me_down_the_same_way(x,y):
    # pulling out the zero-sum rows is a bit tricky - we need to keep track of indices so that we can drop them from y, too
    id_dictionary = dict(enumerate(x.index))
    id_dictionary = dict((v,k) for k,v in id_dictionary.iteritems())
    # cut the x data
    x = x[x.sum(axis = 1) > 0]
    # map the fb_ids of the remaining x rows back to their original row positions
    x_rows = [id_dictionary[idx] for idx in x.index]
    # cut the y data the same way
    y = y[x_rows]
    return x,y
x_train_,y_train_ = cut_me_down_the_same_way(x_train, y_train)
x_test_,y_test_ = cut_me_down_the_same_way(x_test, y_test)
We face a choice about which classifier is best suited to our data. Of the candidates we considered, we chose to investigate the two that seemed most appropriate for our data set: decision trees and nearest neighbors.
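Before committing to either, a quick cross-validated comparison on the training features gives a rough sense of how the two families behave here. This is a sketch with default settings, not part of the model selection reported below:
# Cross-validated comparison (sketch) of the two candidate classifiers,
# using the training features and labels built above.
from sklearn.cross_validation import cross_val_score
from sklearn import tree, neighbors
for name, model in [('decision tree', tree.DecisionTreeClassifier()),
                    ('nearest neighbors', neighbors.KNeighborsClassifier())]:
    scores = cross_val_score(model, x_train_.values, y_train_, cv=5)
    print name, "mean accuracy: %.3f (+/- %.3f)" % (scores.mean(), scores.std())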
In [45]:
clf_tree = tree.DecisionTreeClassifier()
clf_tree = clf_tree.fit(x_train_, y_train_)
# Test for overfitting
print "score on training data:", clf_tree.score(x_train_,y_train_)
print "score on test data:",clf_tree.score(x_test_,y_test_)
The decision tree model badly overfits. Information gain in decision trees is biased in favor of attributes with more levels, so an unconstrained tree keeps splitting until it effectively memorizes the training data. Therefore, we will limit the depth of the tree.
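One way to choose the depth is to sweep max_depth and watch where training and test accuracy diverge; a quick sketch (not the procedure we originally used):
# Sweep the tree depth (sketch): training accuracy keeps climbing as the tree
# memorizes the data, while test accuracy flattens or drops.
from sklearn import tree
for depth in range(1, 11):
    clf_d = tree.DecisionTreeClassifier(max_depth=depth)
    clf_d.fit(x_train_, y_train_)
    print depth, clf_d.score(x_train_, y_train_), clf_d.score(x_test_, y_test_)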
In [46]:
clf_tree = tree.DecisionTreeClassifier(max_depth=3)
clf_tree = clf_tree.fit(x_train_, y_train_)
# Test for overfitting
print "score on training data:", clf_tree.score(x_train_,y_train_)
print "score on test data:", clf_tree.score(x_test_,y_test_)
Now we are not overfitting, but we only predict gender correctly about 67% of the time. That is not much better than the roughly 50% we would expect from a coin toss.
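A fairer yardstick than a 50/50 coin toss is the majority-class baseline, since the genders in our training labels need not be balanced. A quick check (sketch):
# Majority-class baseline (sketch): the accuracy of always predicting the more
# common gender in the training labels.
import numpy as np
share_male = np.mean(y_train_)
print "majority-class baseline accuracy:", max(share_male, 1 - share_male)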
Let's try nearest neighbors instead.
In [47]:
clf_knn = neighbors.KNeighborsClassifier(n_neighbors=10)
clf_knn = clf_knn.fit(x_train_, y_train_)
# Test for overfitting
print "score on training data:", clf_knn.score(x_train_,y_train_)
print "score on test data:",clf_knn.score(x_test_,y_test_)
With 5 nearest neighbors, the model fits the training data better than the decision tree, but its predictions on the test data are no better. With 10 nearest neighbors, the prediction is even less accurate than before.
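To see how sensitive the result is to the choice of k, we can sweep n_neighbors and compare training and test accuracy; a quick sketch with an illustrative grid:
# Sweep the number of neighbors (sketch) and compare training vs. test accuracy.
from sklearn import neighbors
for k in [1, 3, 5, 10, 20, 50]:
    clf_k = neighbors.KNeighborsClassifier(n_neighbors=k)
    clf_k.fit(x_train_, y_train_)
    print k, clf_k.score(x_train_, y_train_), clf_k.score(x_test_, y_test_)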
In [48]:
graphdf = pd.DataFrame(index = range(len(x_train_) + len(x_test_)))
graphdf['time_share'] = x_train_['NFL Football'].values.tolist() + x_test_['NFL Football'].values.tolist()
# predict_proba columns follow clf_knn.classes_ ([False, True]), so column 1 is the probability of being male
graphdf['male_probability'] = clf_knn.predict_proba(x_train_)[:,1].tolist() + clf_knn.predict_proba(x_test_)[:,1].tolist()
graphdf['is_training'] = np.ones(len(x_train_)).tolist() + np.zeros(len(x_test_)).tolist()
# absolute prediction error: |P(male) - actual label|
graphdf['accuracy'] = abs(clf_knn.predict_proba(x_train_)[:,1] - y_train_).tolist() + abs(clf_knn.predict_proba(x_test_)[:,1] - y_test_).tolist()
graphdf['is_male'] = y_train_.tolist() + y_test_.tolist()
graphdf.head()
Out[48]:
In [49]:
training = graphdf[graphdf.is_training == 1]
test = graphdf[graphdf.is_training != 1]
plt.scatter(training['time_share'], training['male_probability'], marker='*', s=100, label="Training data")
plt.scatter(test['time_share'], test['male_probability'], s=60, label="Test data")
plt.title("Predicted male probability as a function of time share watching NFL Football")
plt.xlim(0,1)
plt.ylim(0,1)
plt.xlabel("Share of Time Watching NFL Football")
plt.ylabel("Predicted Probability Male")
plt.legend()
remove_border()
Finally, let's regenerate the EDA show popularity chart with the full (predicted) dataset.
In [50]:
aggregate = df.groupby(["show", "male"]).timed.sum().unstack().fillna(0)
aggregate['sum'] = aggregate[0] + aggregate[1]
aggregate = aggregate.sort('sum', ascending=False)[:25]
aggregate = aggregate.drop('sum', 1)
aggregate.columns = ['Female', 'Male']
aggregate.plot(kind='bar', title="Show by popularity and gender", grid=False, stacked=True)
remove_border()
plt.savefig('presentation/images/graphs/shows.png', transparent=True, bbox_inches="tight")
In this classification section, we predicted genders with about 70% accuracy for users who watched at least one of our selected shows or networks. Our model has a number of limitations. First, it fails for users who did not watch any of the selected shows or networks. Second, since a non-Facebook viewer might appear under more than one user id, and we have no way of matching those ids to one another, our model could end up predicting the same person's gender as both male and female. With more continuous viewing data for each user, we would get more accurate predictions. That said, we successfully used machine learning to fill in some of the missing values in our data set, which is an encouraging result.
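For completeness, here is a minimal sketch of how the trained classifier could be used to fill in missing genders for non-Facebook viewers. It mirrors the share-of-time feature construction above but is keyed on uu_id, and it assumes nonfb_users carries uu_id and timed columns like the Facebook records; it is not the exact code that produced the chart above.
# Sketch: build share-of-time features per uu_id for non-Facebook viewers and
# predict their gender with the nearest-neighbors model trained above.
import numpy as np
import pandas as pd
def share_time_features(df, id_col, specs):
    # specs is a list of (column, values) pairs, e.g. [('show', show_list), ('network', network_list)]
    totals = df[[id_col, 'timed']].groupby(id_col)['timed'].agg({'tot_time': np.sum})
    out = pd.DataFrame(index=df[id_col].drop_duplicates())
    for var, values in specs:
        for value in values:
            watched = df[df[var] == value]
            if len(watched) == 0:
                # nobody in this dataframe watched it; keep a zero column so the
                # column order still matches the training features
                out[value] = 0.0
                continue
            t = watched[[id_col, 'timed']].groupby(id_col)['timed'].agg({value: np.sum})
            merged = pd.merge(t, totals, how='left', left_index=True, right_index=True)
            share = pd.DataFrame(merged[value].astype('float64') / merged['tot_time'].astype('float64'),
                                 columns=[value])
            out = pd.merge(out, share, how='left', left_index=True, right_index=True)
    return out.fillna(0)
# keep show_list before network_list so the column order matches the training features
nonfb_features = share_time_features(nonfb_users, 'uu_id', [('show', show_list), ('network', network_list)])
# the model is only meaningful for viewers who watched at least one selected show/network
predictable = nonfb_features[nonfb_features.sum(axis=1) > 0]
predicted_is_male = pd.Series(clf_knn.predict(predictable), index=predictable.index)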
Putting it all together, we export some snippets of the final data as JSON for visualization in the presentation website.
In [51]:
def export_json(df, filename):
    with open('presentation/data/' + filename + '.json', 'wb') as f:
        f.write('[')
        for i, (_, group) in enumerate(df.groupby('combined_id')):
            if i != 0:
                f.write(',')
            f.write('{"data":' + group[['show','network','timed', 'date','global_stickiness']].to_json(orient='records') +
                    ',"is_male":' + ("true" if group['male'].iloc[0] == 1.0 else "false") + '}')
        f.write(']')
start_time = pd.datetime(2011, 12, 4, 20, 58, 0, 0, pytz.timezone('US/Eastern'))
end_time = pd.datetime(2011, 12, 4, 22, 2, 0, 0, pytz.timezone('US/Eastern'))
sample = df[(df.date >= start_time) & (df.date < end_time)]
export_json(sample, "nfl")
start_time = pd.datetime(2012, 2, 12, 19, 58, 0, 0, pytz.timezone('US/Eastern'))
end_time = pd.datetime(2012, 2, 12, 23, 32, 0, 0, pytz.timezone('US/Eastern'))
sample = df[(df.date >= start_time) & (df.date < end_time)]
export_json(sample, "grammy")
Our work contributes to content recommendation systems by proposing a measure that infers user preferences from viewing behavior itself, rather than requiring explicit input from the user (e.g., a star rating). As proposed here, stickiness could serve as the metric optimized by a standard recommendation system, and it can act as a proxy for global user preference in the context of a dynamically changing set of options.
We likewise took a small step toward personalization by demonstrating that we can infer user and data characteristics with machine learning methods. We hope this study proves useful for developing user-specific recommenders for dynamic content, even beyond the kind found on cable TV.