In [1]:
import pandas as pd
from os import path
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
import sklearn
# Edit the paths if need be (shouldn't be necessary: we all share the same folder structure)
CSV_PATH_1 = '../Videos/all_data'
CSV_PATH_2 = '../Videos2/all_data2'
FILE_EXTENSION = '_all.csv'
GENRES = ['country', 'edm', 'pop', 'rap', 'rock']

# Containers for the data frames:
#   genre_dfs  - dictionary mapping each genre name to that genre's data frame
#   all_genres - one data frame that contains all of the data
genre_dfs = {}
all_genres = None

# Read in the CSVs for each of the 5 genres (one file per data directory).
for genre in GENRES:
    genre_frames = []
    for csv_dir in (CSV_PATH_1, CSV_PATH_2):
        genre_csv_path = path.join(csv_dir, genre) + FILE_EXTENSION
        # Drop the 'Unnamed: 0' column. The axis is passed by keyword because
        # newer pandas releases no longer accept it positionally.
        genre_frames.append(pd.read_csv(genre_csv_path).drop('Unnamed: 0', axis=1))
    genre_dfs[genre] = pd.concat(genre_frames, ignore_index=True)

# Concatenate in GENRES order so the row order does not depend on dict ordering.
all_genres = pd.concat([genre_dfs[genre] for genre in GENRES])
all_genres.head()
Out[1]:
In [2]:
def genre_to_ordinal(genre_in):
    """Translate a genre name into its ordinal code.

    country -> 0, pop -> 1, rock -> 2, edm -> 3, rap -> 4.
    Any other value is handed back unchanged.
    """
    ordinal_by_genre = (
        ("country", 0),
        ("pop", 1),
        ("rock", 2),
        ("edm", 3),
        ("rap", 4),
    )
    for known_genre, ordinal in ordinal_by_genre:
        if genre_in == known_genre:
            return ordinal
    # Not one of the five known genres: pass the input through untouched.
    return genre_in
# Derive the ordinal genre column from the textual genre column.
all_genres['genre_ordinal'] = all_genres['genre'].apply(genre_to_ordinal)
We add in some boolean genre classifiers to make our analysis more fine-grained. Rather than saying "we predict this video is country with 50% confidence", we could say "we predict this video is not edm with 90% confidence" and so on.
In [3]:
# Adding is_country flag
def is_country(genre_in):
    """Return 1 when genre_in equals "country", otherwise 0."""
    return 1 if genre_in == "country" else 0

all_genres['is_country'] = all_genres['genre'].apply(is_country)
# Adding is_rock flag
def is_rock(genre_in):
    """Return 1 when genre_in equals "rock", otherwise 0."""
    return 1 if genre_in == "rock" else 0

all_genres['is_rock'] = all_genres.genre.apply(is_rock)
# Adding is_edm flag
def is_edm(genre_in):
    """Return 1 when genre_in equals "edm", otherwise 0."""
    return 1 if genre_in == "edm" else 0

all_genres['is_edm'] = all_genres['genre'].apply(is_edm)
# Adding is_rap flag
def is_rap(genre_in):
    """Return 1 when genre_in equals "rap", otherwise 0."""
    return 1 if genre_in == "rap" else 0

all_genres['is_rap'] = all_genres['genre'].apply(is_rap)
# Adding is_pop flag
def is_pop(genre_in):
    """Return 1 when genre_in equals "pop", otherwise 0."""
    return 1 if genre_in == "pop" else 0

all_genres['is_pop'] = all_genres.genre.apply(is_pop)
In [4]:
# Subset all_genres to group by individual genres
country_records = all_genres[all_genres["genre"] == "country"]
rock_records = all_genres[all_genres["genre"] == "rock"]
pop_records = all_genres[all_genres["genre"] == "pop"]
edm_records = all_genres[all_genres["genre"] == "edm"]
rap_records = all_genres[all_genres["genre"] == "rap"]
# From the subsets above, create train and test sets from each
country_train = country_records.head(len(country_records) / 2)
country_test = country_records.tail(len(country_records) / 2)
rock_train = rock_records.head(len(rock_records) / 2)
rock_test = rock_records.tail(len(rock_records) / 2)
pop_train = pop_records.head(len(pop_records) / 2)
pop_test = pop_records.tail(len(pop_records) / 2)
edm_train = edm_records.head(len(edm_records) / 2)
edm_test = edm_records.tail(len(edm_records) / 2)
rap_train = rap_records.head(len(rap_records) / 2)
rap_test = rap_records.tail(len(rap_records) / 2)
# Create big training and big test set for analysis
training_set = pd.concat([country_train,rock_train,pop_train,edm_train,rap_train])
test_set = pd.concat([country_test,rock_test,pop_test,edm_test,rap_test])
training_set = training_set.fillna(0)
test_set = test_set.fillna(0)
print "Training Records:\t" , len(training_set)
print "Test Records:\t\t" , len(test_set)
# training_set.head()
We start generating our random forests, and output a relative accuracy and a confusion matrix. In this first one, we simply factor in non-color variables (rating, likes, dislikes, length and viewcount), and run it across all records to predict an ordinal genre value.
In [5]:
# Predicting based solely on non-color features, using RF
meta_data_features = ['rating', 'likes', 'dislikes', 'length', 'viewcount']
clf = RandomForestClassifier(n_estimators=11)

# Fit and score on genre_ordinal itself. pd.factorize numbers labels by order
# of first appearance (country, rock, pop, edm, rap in training_set), which is
# not the genre_ordinal numbering, so a model fit on factorized codes predicts
# values that do not line up with the "Actual" rows of the crosstab below.
clf = clf.fit(training_set[meta_data_features], training_set['genre_ordinal'])
print(clf.score(test_set[meta_data_features], test_set['genre_ordinal']))

# Rows are the true genre_ordinal, columns the predicted genre_ordinal.
pd.crosstab(test_set.genre_ordinal, clf.predict(test_set[meta_data_features]), rownames=["Actual"], colnames=["Predicted"])
Out[5]:
As shown above, this method yields relatively poor results. This is because there are no distinct clusters being created by our random forest, and simple viewer statistics tell us nothing about what kind of video we're watching. However, we see that country, rap and pop are initially somewhat distinct (the diagonal is the highest value), and rock and edm are getting mistaken for one another. Let's see if we can make something of this.
Below, we do the same random forest as above, but going strictly off of the video's frame-color data, which we derive as follows.
We found the most commonly appearing color in each frame and called it the 'frame mode'. We then took all of the frame modes and found the 10 most common of them. Those became the 'color data' we use to analyze videos.
In [6]:
def gen_new_headers(old_headers):
    """Return old_headers followed by the 30 color column names and 'genre'.

    The color columns are colors_<k>_red, colors_<k>_blue and colors_<k>_green
    for k = 1..10, in that order. old_headers itself is not modified.
    """
    color_columns = [
        'colors_' + str(rank) + '_' + channel
        for rank in range(1, 11)
        for channel in ('red', 'blue', 'green')
    ]
    return old_headers + color_columns + ['genre']
In [7]:
# Predicting based solely on colors
color_features = gen_new_headers([])[:-1]  # every generated header except the trailing 'genre'
clf = RandomForestClassifier(n_estimators=11)

# Fit and score on genre_ordinal itself. pd.factorize numbers labels by order
# of first appearance in the frame, which is not the genre_ordinal numbering,
# so factorized predictions would not line up with the "Actual" rows below.
clf = clf.fit(training_set[color_features], training_set['genre_ordinal'])
print(clf.score(test_set[color_features], test_set['genre_ordinal']))

# Rows are the true genre_ordinal, columns the predicted genre_ordinal.
pd.crosstab(test_set.genre_ordinal, clf.predict(test_set[color_features]), rownames=["Actual"], colnames=["Predicted"])
Out[7]:
This actually yields worse results than just the viewer statistics, because the color of a video by itself does not determine the genre. If rappers only had red in their videos and rockers only had black this might be somewhat accurate, but that's just not the case. But, what if we pair these findings with our initial viewer statistics?
In [8]:
# Predicting based on colors and non-color features
all_features = meta_data_features + color_features
clf = RandomForestClassifier(n_estimators=11)

# Fit and score on genre_ordinal itself. pd.factorize numbers labels by order
# of first appearance in the frame, which is not the genre_ordinal numbering,
# so factorized predictions would not line up with the "Actual" rows below.
clf = clf.fit(training_set[all_features], training_set['genre_ordinal'])
print(clf.score(test_set[all_features], test_set['genre_ordinal']))

# Rows are the true genre_ordinal, columns the predicted genre_ordinal.
pd.crosstab(test_set.genre_ordinal, clf.predict(test_set[all_features]), rownames=["Actual"], colnames=["Predicted"])
Out[8]:
Scores are expectedly low. It seems as if we're trying to make the classifier do way too much work, and are giving it very mediocre data to go off of. Recall that we're actually trying to determine WHICH genre a video is by the above code, not whether or not a video is of ONE specific genre. This brings back the binary classifiers that we created above; let's put those to use to see if we can improve these scores.
We try pop and rap first, since they seem to be the most distinct by what we've gathered above.
In [9]:
# Predicting based on colors and non-color features
all_features = meta_data_features + color_features
print(all_features)
clf = RandomForestClassifier(n_estimators=11)

# Use the is_pop column itself as the label. pd.factorize codes depend on
# which label value appears first in each frame, so they only match the 0/1
# flag when the rows happen to be ordered favourably.
clf = clf.fit(training_set[all_features], training_set['is_pop'])
print(clf.score(test_set[all_features], test_set['is_pop']))

# Rows are the true flag, columns the predicted flag (0 = not pop, 1 = pop).
pd.crosstab(test_set.is_pop, clf.predict(test_set[all_features]), rownames=["Actual"], colnames=["Predicted"])
Out[9]:
In [10]:
# Predicting based on colors and non-color features
all_features = meta_data_features + color_features
clf = RandomForestClassifier(n_estimators=11)

# Use the is_rap column itself as the label. pd.factorize codes depend on
# which label value appears first in each frame, so they only match the 0/1
# flag when the rows happen to be ordered favourably.
clf = clf.fit(training_set[all_features], training_set['is_rap'])
print(clf.score(test_set[all_features], test_set['is_rap']))

# Rows are the true flag, columns the predicted flag (0 = not rap, 1 = rap).
pd.crosstab(test_set.is_rap, clf.predict(test_set[all_features]), rownames=["Actual"], colnames=["Predicted"])
Out[10]:
What we're seeing in the is_pop confusion matrix above is how well a model fit on our training data predicts whether or not a video in the test set is a pop video. In the "Predicted" columns, 0 means the model predicts the video is not a pop video, and 1 means it predicts that it is. Likewise with the "Actual" rows: 0 shows that the video actually wasn't a pop video, and 1 shows that it was.
The confusion matrix above is our first effort at utilizing these binary classifiers. Most of our videos aren't pop videos, and the model did a good job of picking out those that aren't pop. However, we could use some improvement in the realm of "false negatives", where the model classified a video as not pop when it actually was.
We do these tests 50 times for sake of average score.
Rather than hard-coding each time we wanted to run something for average, we wrote a function that does it for us. All we have to do is pass in the boolean classifier in quotes ("is_rock", etc.), and the number of iterations that we want. Results are displayed below.
In [11]:
def multi_RF_averages(is_genre,num_iterations):
clf = RandomForestClassifier(n_estimators=11)
loop_indices = range(0,num_iterations)
cumsum = 0
for i in loop_indices:
y, _ = pd.factorize(training_set[is_genre])
clf = clf.fit(training_set[all_features], y)
z, _ = pd.factorize(test_set[is_genre])
cumsum = cumsum + clf.score(test_set[all_features],z)
print "Average Score for",len(loop_indices),is_genre,"iterations:", cumsum/len(loop_indices)
return clf
In [12]:
# Average 50 runs for each binary genre flag, in the order pop, rap, rock,
# edm, country, and keep the classifier each call returns.
pop_class, rap_class, rock_class, edm_class, country_class = [
    multi_RF_averages(genre_flag, 50)
    for genre_flag in ("is_pop", "is_rap", "is_rock", "is_edm", "is_country")
]
The following creates several pickle files that describe our classifiers. Our website will later load these files.
In [13]:
from sklearn.externals import joblib
# only use these to generate pickle files for website
# joblib.dump(pop_class, 'classifiers/pop_class.pkl')
# joblib.dump(rap_class, 'classifiers/rap_class.pkl')
# joblib.dump(rock_class, 'classifiers/rock_class.pkl')
# joblib.dump(edm_class, 'classifiers/edm_class.pkl')
# joblib.dump(country_class, 'classifiers/country_class.pkl')
We ran the above test with all genres, and as shown in the analysis above, our country and edm classifiers typically have very low accuracy. We've seen above that edm and rock videos are getting mixed up with one another, so we assume that something is characteristic of these 2 genres that's not characteristic of everything else. We take the edm values out of our training and test datasets, hoping to improve accuracy.
In [14]:
# Removing EDM for better analysis - makes is_pop and is_rap much more accurate
# The sets are rebuilt from the per-genre frames, so the fillna(0) applied when
# they were first built has to be applied again.
training_set = pd.concat([country_train, rock_train, pop_train, rap_train]).fillna(0)
test_set = pd.concat([country_test, rock_test, pop_test, rap_test]).fillna(0)

# With the edm rows removed every is_edm label in these sets is 0.
for genre_flag in ["is_pop", "is_rap", "is_rock", "is_edm", "is_country"]:
    multi_RF_averages(genre_flag, 50)
Out[14]:
So, what does this tell us? Based on our training data, we have the best chance of accurately classifying something as pop or not pop (under these conditions).
We want to find out which 2 are the most distinct, so we can build our model based on that classification.
In [15]:
# Train on all five genres, then score each binary flag against a test set
# made up of that genre's records only.
# The sets are rebuilt from the per-genre frames, so the fillna(0) applied when
# they were first built has to be applied again.
training_set = pd.concat([country_train, rock_train, edm_train, rap_train, pop_train]).fillna(0)

single_genre_runs = [
    (rock_test, "is_rock"),
    (rap_test, "is_rap"),
    (country_test, "is_country"),
    (pop_test, "is_pop"),
    (edm_test, "is_edm"),
]
for genre_test, genre_flag in single_genre_runs:
    # multi_RF_averages reads the notebook-level test_set, so rebind it before each run.
    test_set = genre_test.fillna(0)
    multi_RF_averages(genre_flag, 50)
Out[15]:
Rock and EDM have surprisingly distinct classifiers. We should dive into the videos and see what this means.
In [16]:
# Score the edm and rock flags against a test set holding only edm and rock records.
# The set is rebuilt from the per-genre frames, so the fillna(0) applied when
# test_set was first built has to be applied again.
test_set = pd.concat([edm_test, rock_test]).fillna(0)
for genre_flag in ["is_edm", "is_rock"]:
    multi_RF_averages(genre_flag, 50)
Out[16]:
In [17]:
model = ExtraTreesClassifier()
# The set is rebuilt from the per-genre frames, so the fillna(0) applied when
# training_set was first built has to be applied again.
training_set = pd.concat([country_train, pop_train, rap_train, rock_train, edm_train]).fillna(0)
y, _ = pd.factorize(training_set['is_rock'])
model.fit(training_set[all_features], y)
# display the relative importance of each attribute
print(model.feature_importances_)
In [18]:
# Build a table with one row per feature and one column per genre, holding the
# feature importances the model reports after being refit on that genre's flag.
df = pd.DataFrame()
df['index'] = all_features

genre_flag_columns = [
    ('rap', 'is_rap'),
    ('rock', 'is_rock'),
    ('country', 'is_country'),
    ('edm', 'is_edm'),
    ('pop', 'is_pop'),
]
for genre_column, flag_column in genre_flag_columns:
    y, _ = pd.factorize(training_set[flag_column])
    model.fit(training_set[all_features], y)
    df[genre_column] = model.feature_importances_
In [19]:
# Index by feature name, then flip the table so that each row is a genre and
# each column a feature.
df = df.set_index('index').T
df.head()
Out[19]:
In [28]:
# The assignment had no right-hand side (a syntax error). It now takes the
# same value the following cell assigns to this name.
lol = df.values.tolist()
In [48]:
# Row values of df as nested lists, and its column labels as a plain list.
lol = df.values.tolist()
cols = list(df.columns)
In [51]:
import plotly.offline as py # a little wordplay
import plotly.graph_objs as go
# plotly.offline is imported under the alias `py`; the bare name `offline` is
# never bound in this notebook.
py.init_notebook_mode()

title = 'Feature Importance By Genre'
labels = []

x_data = cols
y_data = df.values.tolist()

# One line per row of df. Iterating over the rows themselves plots all of
# them; a fixed range(0, 4) left out every row after the fourth.
traces = [
    go.Scatter(
        x=x_data,
        y=y_trace,
        mode='lines',
        connectgaps=True,
        name=str(row_label),
    )
    for row_label, y_trace in zip(df.index, y_data)
]

layout = go.Layout(
    yaxis=dict(
        showgrid=False,
        zeroline=False,
        showline=False,
        showticklabels=False,
    ),
    autosize=False,
    margin=dict(
        autoexpand=True,
        l=100,
        r=20,
        t=110,
    ),
    showlegend=False,
)

annotations = []

# Adding labels (labels is empty, so no per-trace annotations are produced)
for y_trace, label in zip(y_data, labels):
    # labeling the left_side of the plot
    annotations.append(dict(xref='paper', x=0.05, y=y_trace[0],
                            xanchor='right', yanchor='middle',
                            text=label + ' {}%'.format(y_trace[0]),
                            font=dict(family='Arial',
                                      size=16,
                                      ),
                            showarrow=False))
    # labeling the right_side of the plot with the last value of the trace
    # (a fixed position of 11 is not the right-hand end of the line)
    annotations.append(dict(xref='paper', x=0.95, y=y_trace[-1],
                            xanchor='left', yanchor='middle',
                            text='{}%'.format(y_trace[-1]),
                            font=dict(family='Arial',
                                      size=16,
                                      ),
                            showarrow=False))

# Title
annotations.append(dict(xref='paper', yref='paper', x=0.0, y=1.05,
                        xanchor='left', yanchor='bottom',
                        text=title,
                        font=dict(family='Arial',
                                  size=30,
                                  ),
                        showarrow=False))

layout['annotations'] = annotations

fig = go.Figure(data=traces, layout=layout)
py.iplot(fig, filename='news-source')
In [36]:
import seaborn as sns
# Point plot of the "rating" column against the "likes" column of df,
# drawn on seaborn's white-grid style.
sns.set_style("whitegrid")
ax = sns.pointplot(data=df, x="likes", y="rating")
sns.plt.show()
In [ ]:
import seaborn as sns
# Load the "tips" dataset via seaborn, print it, and draw a point plot of
# total_bill against time on the white-grid style.
sns.set_style("whitegrid")
tips = sns.load_dataset("tips")
print(tips)
ax = sns.pointplot(data=tips, x="time", y="total_bill")
sns.plt.show()
In [ ]:
In [ ]: