Data Collection


In [1]:
import pandas as pd
import numpy as np
from os import path
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

# Edit paths if need be (shouldn't be necessary because we all have the same folder structure)
CSV_PATH_1 = '../Videos/all_data'
CSV_PATH_2 = '../Videos2/all_data2'
FILE_EXTENSION = '_all.csv'
GENRES = ['country', 'edm', 'pop', 'rap', 'rock']

# Containers for the data frames
genre_dfs = {}
all_genres = None

# Read in the CSVs for each of the 5 genres
for genre in GENRES:
    genre_csv_path_1 = path.join(CSV_PATH_1, genre) + FILE_EXTENSION
    genre_csv_path_2 = path.join(CSV_PATH_2, genre) + FILE_EXTENSION
    df_1 = pd.read_csv(genre_csv_path_1)
    df_2 = pd.read_csv(genre_csv_path_2)
    df_1 = df_1.drop('Unnamed: 0', axis=1)
    df_2 = df_2.drop('Unnamed: 0', axis=1)
    df_combined = pd.concat([df_1,df_2],ignore_index=True)
    genre_dfs[genre] = df_combined

all_genres = pd.concat(genre_dfs.values())
all_genres.head()

# genre_dfs is now a dictionary that contains the 5 different data frames
# all_genres is a dataframe that contains all of the data


Out[1]:
filename author description viewcount rating likes dislikes duration length keywords published colors_1_red colors_1_blue colors_1_green colors_2_red colors_2_blue colors_2_green colors_3_red colors_3_blue colors_3_green
0 Luke Bryan - Roller Coaster.mp4 LukeBryanVEVO Luke Bryan - Crash My Party\nPurchase now on i... 28948653 4.840108 127866 5324 00:04:23 263 [Luke, Bryan, Roller, Coaster, Capitol, Record... 2014-06-21 11:00:03 250 240 230 80 70 50 210 200 190 ...
1 Dierks Bentley - Drunk On A Plane.mp4 DierksBentleyVEVO Purchase Dierks Bentley’s latest music: http:/... 41548786 4.763639 140682 8835 00:04:51 291 [Dierks, Bentley, Drunk, On, Plane, Capitol, R... 2014-05-13 20:16:23 10 20 30 0 10 20 20 30 40 ...
2 Thomas Rhett - Get Me Some Of That.mp4 ThomasRhettVEVO Music video by Thomas Rhett performing Get Me ... 43868160 4.826069 128488 5841 00:03:13 193 [Thomas, Rhett, Get, Me, Some, Of, That, The, ... 2013-12-10 08:00:01 50 60 40 60 70 50 70 80 60 ...
3 David Nail - Whatever She's Got.mp4 DavidNailVEVO Purchase David Nail’s latest music: http://umg... 48648247 4.826632 141108 6393 00:04:01 241 [David, Nail, Whatever, She's, Got, MCA, Nashv... 2013-07-16 07:00:15 30 20 10 40 30 20 50 40 30 ...
4 Joe Nichols - Yeah.mp4 JoeNicholsVEVO Joe Nichols - Yeah\n“Yeah” from Joe Nichol’s C... 11397694 4.815725 33255 1606 00:03:52 232 [Joe Nichols, Red Bow Records, Country, Yeah] 2014-05-27 07:00:01 250 250 230 20 30 40 10 20 30 ...

5 rows × 42 columns

Ordinal Genres

Below, we encode the genres as ordinal integers so they can serve as labels for the random forest classifiers. We add a new column to our dataframe, write a mapping function to populate it, and apply it across the genre column.


In [2]:
def genre_to_ordinal(genre_in):
    if(genre_in == "country"):
        return 0
    elif(genre_in == "pop"):
        return 1
    elif(genre_in == "rock"):
        return 2
    elif(genre_in == "edm"):
        return 3
    elif(genre_in == "rap"):
        return 4
    else:
        return genre_in
    
all_genres['genre_ordinal'] = all_genres.genre.apply(genre_to_ordinal)
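The same encoding can be written more compactly with a dict and Series.map (an equivalent sketch; note that map yields NaN for genres missing from the dict, whereas the function above passes them through unchanged):

GENRE_ORDINALS = {'country': 0, 'pop': 1, 'rock': 2, 'edm': 3, 'rap': 4}
all_genres['genre_ordinal'] = all_genres.genre.map(GENRE_ORDINALS)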

We also add boolean genre flags to make our analysis more fine-grained. Rather than saying "we predict this video is country with 50% confidence," we can say "we predict this video is not edm with 90% confidence," and so on.


In [3]:
# Adding is_country flag
def is_country(genre_in):
    if(genre_in == "country"):
        return 1
    else:
        return 0
    
all_genres['is_country'] = all_genres.genre.apply(is_country)

# Adding is_rock flag
def is_rock(genre_in):
    if(genre_in == "rock"):
        return 1
    else:
        return 0
    
all_genres['is_rock'] = all_genres.genre.apply(is_rock)

# Adding is_edm flag
def is_edm(genre_in):
    if(genre_in == "edm"):
        return 1
    else:
        return 0
    
all_genres['is_edm'] = all_genres.genre.apply(is_edm)

# Adding is_rap flag
def is_rap(genre_in):
    if(genre_in == "rap"):
        return 1
    else:
        return 0
    
all_genres['is_rap'] = all_genres.genre.apply(is_rap)

# Adding is_pop flag
def is_pop(genre_in):
    if(genre_in == "pop"):
        return 1
    else:
        return 0
    
all_genres['is_pop'] = all_genres.genre.apply(is_pop)
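The five nearly identical flag functions above can also be generated in one loop over GENRES (an equivalent sketch):

# One 0/1 flag column per genre: a boolean comparison cast to int
for g in GENRES:
    all_genres['is_' + g] = (all_genres.genre == g).astype(int)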

Test and Train Sets

We create our training and test sets by splitting all_genres by genre and dividing each genre's records in half: the first half of each genre becomes training data and the second half test data. Concatenating across genres gives the full training and test sets, each containing 405 records.


In [4]:
# Subset all_genres to group by individual genres
country_records  = all_genres[all_genres["genre"] == "country"]
rock_records     = all_genres[all_genres["genre"] == "rock"]
pop_records      = all_genres[all_genres["genre"] == "pop"]
edm_records      = all_genres[all_genres["genre"] == "edm"]
rap_records      = all_genres[all_genres["genre"] == "rap"]

# From the subsets above, create train and test sets from each
country_train = country_records.head(len(country_records) // 2)
country_test  = country_records.tail(len(country_records) // 2)
rock_train    = rock_records.head(len(rock_records) // 2)
rock_test     = rock_records.tail(len(rock_records) // 2)
pop_train     = pop_records.head(len(pop_records) // 2)
pop_test      = pop_records.tail(len(pop_records) // 2)
edm_train     = edm_records.head(len(edm_records) // 2)
edm_test      = edm_records.tail(len(edm_records) // 2)
rap_train     = rap_records.head(len(rap_records) // 2)
rap_test      = rap_records.tail(len(rap_records) // 2)

# Create big training and big test set for analysis
training_set = pd.concat([country_train,rock_train,pop_train,edm_train,rap_train])
test_set     = pd.concat([country_test,rock_test,pop_test,edm_test,rap_test])

training_set = training_set.fillna(0)
test_set = test_set.fillna(0)

print "Training Records:\t" , len(training_set)
print "Test Records:\t\t" , len(test_set)
# training_set.head()


Training Records:	405
Test Records:		405
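Note that this split is positional (the first half of each genre's rows train, the second half test) rather than random, so any ordering in the CSVs carries into the split. A stratified random alternative, assuming a newer scikit-learn than the one used here, would be:

from sklearn.model_selection import train_test_split  # sklearn >= 0.18

train_alt, test_alt = train_test_split(all_genres.fillna(0), test_size=0.5,
                                       stratify=all_genres['genre'],
                                       random_state=0)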

Generating Random Forest - Viewer Statistics

We now start generating our random forests, reporting an accuracy score and a confusion matrix for each. This first model uses only the non-color features (rating, likes, dislikes, length, and viewcount), and runs across all records to predict an ordinal genre value.


In [5]:
# Predicting based solely on non-color features, using RF
clf = RandomForestClassifier(n_estimators=11)
meta_data_features = ['rating', 'likes','dislikes','length','viewcount']
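# Note: pd.factorize assigns codes by order of first appearance; the training
# and test sets were concatenated in the same genre order, so y and z below
# use matching codes.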
y, _ = pd.factorize(training_set['genre_ordinal'])
clf = clf.fit(training_set[meta_data_features], y)

z, _ = pd.factorize(test_set['genre_ordinal'])
print clf.score(test_set[meta_data_features],z)
pd.crosstab(test_set.genre_ordinal, clf.predict(test_set[meta_data_features]),rownames=["Actual"], colnames=["Predicted"])


0.422222222222
Out[5]:
Predicted 0 1 2 3 4
Actual
0 48 1 1 27 4
1 0 27 41 6 7
2 27 8 2 34 4
3 7 16 22 26 12
4 7 17 4 9 48

5 rows × 5 columns

As shown above, this method yields relatively poor results: simple viewer statistics tell us little about what kind of video we're watching, so the random forest finds no distinct clusters. However, country, rap, and pop are initially somewhat distinct (the diagonal holds the highest value in those rows), while rock and edm are getting mistaken for one another. Let's see if we can't make something of this.

Random Forest - Only Color Statistics

Below, we train the same random forest as above, but strictly on the videos' color data.

We found the most commonly appearing color in each frame and called it the 'frame mode'. We then took all of a video's frame modes and kept the 10 most common of them. Those 10 colors became the 'color data' we use to analyze videos.
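The extraction itself happened upstream in our video-processing pipeline, but a rough sketch of the idea (hypothetical code assuming frames are read with OpenCV; the real implementation may differ):

import cv2
from collections import Counter

def frame_mode(frame):
    # Most common (B, G, R) color in one frame; quantizing reduces noise
    quantized = (frame // 10) * 10
    pixels = [tuple(p) for row in quantized for p in row]
    return Counter(pixels).most_common(1)[0][0]

def top_colors(video_path, n=10):
    cap = cv2.VideoCapture(video_path)
    modes = Counter()
    ok, frame = cap.read()
    while ok:
        modes[frame_mode(cv2.resize(frame, (32, 18)))] += 1
        ok, frame = cap.read()
    cap.release()
    return modes.most_common(n)  # the n most common frame modes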


In [6]:
def gen_new_headers(old_headers):
    headers = ['colors_' + str(x+1) + '_' for x in range(10)]
    h = []
    for x in headers:
        h.append(x + 'red')
        h.append(x + 'blue')
        h.append(x + 'green')
    return old_headers + h + ['genre']
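For reference, the headers this generates come in red/blue/green order, matching the CSV columns:

gen_new_headers([])[:3]   # ['colors_1_red', 'colors_1_blue', 'colors_1_green']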

In [7]:
clf = RandomForestClassifier(n_estimators=11)
color_features = gen_new_headers([])[:-1]

# Predicting based solely on colors
y, _ = pd.factorize(training_set['genre_ordinal'])
clf = clf.fit(training_set[color_features], y)

z, _ = pd.factorize(test_set['genre_ordinal'])
print clf.score(test_set[color_features],z)
pd.crosstab(test_set.genre_ordinal, clf.predict(test_set[color_features]),rownames=["Actual"], colnames=["Predicted"])


0.237037037037
Out[7]:
Predicted 0 1 2 3 4
Actual
0 30 23 9 12 7
1 23 14 13 15 16
2 30 20 8 10 7
3 16 12 29 19 7
4 22 12 22 15 14

5 rows × 5 columns

This actually yields worse results than the viewer statistics alone, because the colors of a video by themselves do not determine its genre. If rappers only had red in their videos and rockers only black, this might be somewhat accurate, but that's just not the case. But what if we pair these color features with our initial viewer statistics?

Random Forest - All Features


In [8]:
clf = RandomForestClassifier(n_estimators=11)
all_features = meta_data_features + color_features

# Predicting based on colors and non-color features
y, _ = pd.factorize(training_set['genre_ordinal'])
clf = clf.fit(training_set[all_features], y)

z, _ = pd.factorize(test_set['genre_ordinal'])
print clf.score(test_set[all_features],z)
pd.crosstab(test_set.genre_ordinal, clf.predict(test_set[all_features]),rownames=["Actual"], colnames=["Predicted"])


0.39012345679
Out[8]:
Predicted 0 1 2 3 4
Actual
0 48 4 0 14 15
1 13 17 33 13 5
2 21 27 3 14 10
3 7 9 24 27 16
4 28 10 6 18 23

5 rows × 5 columns

Singling Out Pop and Rap

Scores are expectedly low. We're asking the classifier to do too much work with mediocre data: recall that the code above tries to determine WHICH genre a video is, not whether a video belongs to ONE specific genre. This brings back the binary classifiers we created above; let's put those to use and see if we can improve these scores.

We try pop and rap first, since they seem to be the most distinct by what we've gathered above.


In [9]:
clf = RandomForestClassifier(n_estimators=11)
all_features = meta_data_features + color_features
print all_features

# Predicting based on colors and non-color features
y, _ = pd.factorize(training_set['is_pop'])
clf = clf.fit(training_set[all_features], y)

z, _ = pd.factorize(test_set['is_pop'])
print clf.score(test_set[all_features],z)
pd.crosstab(test_set.is_pop, clf.predict(test_set[all_features]),rownames=["Actual"], colnames=["Predicted"])


['rating', 'likes', 'dislikes', 'length', 'viewcount', 'colors_1_red', 'colors_1_blue', 'colors_1_green', 'colors_2_red', 'colors_2_blue', 'colors_2_green', 'colors_3_red', 'colors_3_blue', 'colors_3_green', 'colors_4_red', 'colors_4_blue', 'colors_4_green', 'colors_5_red', 'colors_5_blue', 'colors_5_green', 'colors_6_red', 'colors_6_blue', 'colors_6_green', 'colors_7_red', 'colors_7_blue', 'colors_7_green', 'colors_8_red', 'colors_8_blue', 'colors_8_green', 'colors_9_red', 'colors_9_blue', 'colors_9_green', 'colors_10_red', 'colors_10_blue', 'colors_10_green']
0.80987654321
Out[9]:
Predicted 0 1
Actual
0 311 13
1 64 17

2 rows × 2 columns


In [10]:
clf = RandomForestClassifier(n_estimators=11)
all_features = meta_data_features + color_features

# Predicting based on colors and non-color features
y, _ = pd.factorize(training_set['is_rap'])
clf = clf.fit(training_set[all_features], y)

z, _ = pd.factorize(test_set['is_rap'])
print clf.score(test_set[all_features],z)
pd.crosstab(test_set.is_rap, clf.predict(test_set[all_features]),rownames=["Actual"], colnames=["Predicted"])


0.767901234568
Out[10]:
Predicted 0 1
Actual
0 289 31
1 63 22

2 rows × 2 columns

What we're seeing above is a confusion matrix that, based on our training data, predicts whether or not each video in the test set is a pop video. In the "Predicted" columns, 0 means the model predicts the video is not pop and 1 means it predicts pop; likewise for "Actual", 0 means the video was not actually pop and 1 means it was.

The confusion matrix above is our first effort at utilizing these binary classifiers. Most of our videos aren't pop videos, and the model did a good job of picking out those that aren't pop. However, we could use some improvement in the realm of "false negatives", where the model classified a video as not pop when it actually was.
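Since most videos are not pop, raw accuracy flatters the model; a per-class precision/recall report makes the false-negative problem explicit (a sketch using sklearn.metrics, not part of the original run):

from sklearn.metrics import classification_report

clf = RandomForestClassifier(n_estimators=11)
clf.fit(training_set[all_features], training_set['is_pop'])
# Recall on the '1' (pop) row counts how many true pop videos we catch
print classification_report(test_set['is_pop'], clf.predict(test_set[all_features]))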

We run each of these tests 50 times and average the scores.

Rather than hard-coding each run, we wrote a function that does it for us: pass in the boolean classifier as a string ("is_rock", etc.) and the number of iterations to run. Results are displayed below.


In [11]:
def multi_RF_averages(is_genre, num_iterations):
    clf = RandomForestClassifier(n_estimators=11)
    cumsum = 0

    # Refit the forest num_iterations times; each fit re-randomizes the trees
    for i in range(num_iterations):
        y, _ = pd.factorize(training_set[is_genre])
        clf = clf.fit(training_set[all_features], y)

        z, _ = pd.factorize(test_set[is_genre])
        cumsum = cumsum + clf.score(test_set[all_features], z)

    print "Average Score for", num_iterations, is_genre, "iterations:", cumsum / num_iterations
    return clf

In [12]:
pop_class = multi_RF_averages("is_pop",50)
rap_class = multi_RF_averages("is_rap",50)
rock_class = multi_RF_averages("is_rock",50)
edm_class = multi_RF_averages("is_edm",50)
country_class = multi_RF_averages("is_country",50)


Average Score for 50 is_pop iterations: 0.810271604938
Average Score for 50 is_rap iterations: 0.78449382716
Average Score for 50 is_rock iterations: 0.814469135802
Average Score for 50 is_edm iterations: 0.756345679012
Average Score for 50 is_country iterations: 0.793037037037

The following cell, when uncommented, pickles our classifiers to disk. Our website will later load these pickled models to classify new videos.


In [13]:
from sklearn.externals import joblib
# only use these to generate pickle files for website
# joblib.dump(pop_class, 'classifiers/pop_class.pkl')
# joblib.dump(rap_class, 'classifiers/rap_class.pkl')
# joblib.dump(rock_class, 'classifiers/rock_class.pkl')
# joblib.dump(edm_class, 'classifiers/edm_class.pkl')
# joblib.dump(country_class, 'classifiers/country_class.pkl')
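On the website side, the mirror image would be (a sketch, assuming the site runs the same sklearn version that wrote the pickles; new_video_features is a hypothetical row with the same 35 feature columns):

pop_class = joblib.load('classifiers/pop_class.pkl')
print pop_class.predict(new_video_features)  # new_video_features is hypothetical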

We ran the above test with all genres and, as shown in the analysis above, our country and edm classifiers typically have the lowest accuracy. We've seen that edm and rock videos get mistaken for one another, so we assume something is characteristic of these two genres that isn't of the rest. We remove the edm values from our training and test datasets, hoping to improve accuracy.


In [14]:
# Removing EDM for better analysis - makes is_pop and is_rap much more accurate
training_set = pd.concat([country_train,rock_train,pop_train,rap_train])
test_set     = pd.concat([country_test,rock_test,pop_test,rap_test])

multi_RF_averages("is_pop",50)
multi_RF_averages("is_rap",50)
multi_RF_averages("is_rock",50)
multi_RF_averages("is_edm",50)
multi_RF_averages("is_country",50)


Average Score for 50 is_pop iterations: 0.83900621118
Average Score for 50 is_rap iterations: 0.744472049689
Average Score for 50 is_rock iterations: 0.771242236025
Average Score for 50 is_edm iterations: 1.0
Average Score for 50 is_country iterations: 0.737888198758
Out[14]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=11, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

So, what does this tell us? Based on our training data, we have the best chance of accurately classifying something as pop or not pop (under these conditions). Note that the perfect is_edm score is trivial: with edm removed, the test set contains no edm videos, so predicting "not edm" every time is always correct.

We want to find out which two genres are the most distinct, so we can build our model around that classification.


In [15]:
training_set = pd.concat([country_train,rock_train,edm_train,rap_train,pop_train])

test_set     = pd.concat([rock_test])
multi_RF_averages("is_rock",50)

test_set     = pd.concat([rap_test])
multi_RF_averages("is_rap",50)

test_set     = pd.concat([country_test])
multi_RF_averages("is_country",50)

test_set     = pd.concat([pop_test])
multi_RF_averages("is_pop",50)

test_set     = pd.concat([edm_test])
multi_RF_averages("is_edm",50)


Average Score for 50 is_rock iterations: 0.824
Average Score for 50 is_rap iterations: 0.745176470588
Average Score for 50 is_country iterations: 0.212098765432
Average Score for 50 is_pop iterations: 0.727654320988
Average Score for 50 is_edm iterations: 0.926265060241
Out[15]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=11, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

Rock and EDM have surprisingly distinct classifiers. We should dive into the videos and see what this means.


In [16]:
test_set     = pd.concat([edm_test,rock_test])
multi_RF_averages("is_edm",50)
multi_RF_averages("is_rock",50)


Average Score for 50 is_edm iterations: 0.523797468354
Average Score for 50 is_rock iterations: 0.589620253165
Out[16]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=11, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

Selecting Most Valuable Features per Genre - Rock


In [17]:
model = ExtraTreesClassifier()

training_set = pd.concat([country_train,pop_train,rap_train,rock_train,edm_train])
y, _ = pd.factorize(training_set['is_rock'])
model.fit(training_set[all_features], y)

# display the relative importance of each attribute
print model.feature_importances_


[ 0.03215376  0.03177851  0.03505766  0.04129051  0.02814354  0.02575379
  0.03171086  0.03654139  0.02574975  0.01578325  0.02653417  0.03130137
  0.02926545  0.02289587  0.02684631  0.03361007  0.02944086  0.02107275
  0.02752795  0.03850809  0.04428544  0.03098749  0.02239601  0.02863052
  0.03092083  0.024       0.02473154  0.02474276  0.02801491  0.02447993
  0.03047746  0.02094644  0.02267132  0.02309511  0.02865437]
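The raw array is hard to read on its own; pairing it with feature names shows which attributes drive the is_rock model (a small convenience sketch):

# Rank (feature, importance) pairs from the model fitted above
ranked = sorted(zip(all_features, model.feature_importances_),
                key=lambda pair: pair[1], reverse=True)
for name, score in ranked[:5]:
    print name, round(score, 4)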

In [18]:
df = pd.DataFrame()
df['index'] = all_features

# Fit one binary model per genre and record its feature importances
for genre in ['rap', 'rock', 'country', 'edm', 'pop']:
    y, _ = pd.factorize(training_set['is_' + genre])
    model.fit(training_set[all_features], y)
    df[genre] = model.feature_importances_

In [19]:
df = df.set_index('index')
df = df.transpose()
df.head()


Out[19]:
index rating likes dislikes length viewcount colors_1_red colors_1_blue colors_1_green colors_2_red colors_2_blue colors_2_green colors_3_red colors_3_blue colors_3_green colors_4_red colors_4_blue colors_4_green colors_5_red colors_5_blue colors_5_green
rap 0.047518 0.044509 0.026947 0.048549 0.068925 0.020245 0.029470 0.036669 0.022064 0.025914 0.032847 0.020178 0.019545 0.024083 0.027169 0.029987 0.030205 0.026613 0.033678 0.015384 ...
rock 0.038210 0.027530 0.034013 0.067918 0.026355 0.025508 0.035172 0.028821 0.024865 0.017377 0.025067 0.037129 0.032313 0.038196 0.023275 0.018912 0.029145 0.033083 0.025767 0.021179 ...
country 0.038492 0.045486 0.028221 0.018982 0.042292 0.026688 0.017241 0.031658 0.025009 0.026803 0.021720 0.025610 0.020599 0.026612 0.032661 0.028635 0.033870 0.024077 0.023897 0.029288 ...
edm 0.028083 0.029260 0.035402 0.029115 0.034032 0.026592 0.024570 0.017721 0.022563 0.022167 0.025107 0.028665 0.032543 0.023773 0.019332 0.031252 0.038824 0.030370 0.032247 0.029857 ...
pop 0.025182 0.143491 0.168979 0.021370 0.119519 0.019349 0.017362 0.010976 0.012727 0.024821 0.019371 0.015525 0.017323 0.014381 0.024512 0.015031 0.013566 0.015581 0.016733 0.013688 ...

5 rows × 35 columns



In [48]:
cols = list(df.columns)

In [51]:
import plotly.offline as py  # a little wordplay
import plotly.graph_objs as go

py.init_notebook_mode()

title = 'Feature Importance By Genre'

labels = ['rap', 'rock', 'country', 'edm', 'pop']  # row order of df / y_data

x_data = cols

y_data = df.values.tolist()
traces = []

for i in range(len(y_data)):
    traces.append(go.Scatter(
        x=x_data,
        y=y_data[i],
        mode='lines',
        connectgaps=True,
    ))



layout = go.Layout(
    yaxis=dict(
        showgrid=False,
        zeroline=False,
        showline=False,
        showticklabels=False,
    ),
    autosize=False,
    margin=dict(
        autoexpand=True,
        l=100,
        r=20,
        t=110,
    ),
    showlegend=False,
)

annotations = []

# Adding labels
for y_trace, label in zip(y_data, labels):
    # labeling the left side of the plot
    annotations.append(dict(xref='paper', x=0.05, y=y_trace[0],
                            xanchor='right', yanchor='middle',
                            text='{} {:.3f}'.format(label, y_trace[0]),
                            font=dict(family='Arial',
                                      size=16,
                                      ),
                            showarrow=False))
    # labeling the right side of the plot
    annotations.append(dict(xref='paper', x=0.95, y=y_trace[-1],
                            xanchor='left', yanchor='middle',
                            text='{:.3f}'.format(y_trace[-1]),
                            font=dict(family='Arial',
                                      size=16,
                                      ),
                            showarrow=False))
# Title
annotations.append(dict(xref='paper', yref='paper', x=0.0, y=1.05,
                              xanchor='left', yanchor='bottom',
                              text='Feature Importance By Genre',
                              font=dict(family='Arial',
                                        size=30,
                                        ),
                              showarrow=False))

layout['annotations'] = annotations

fig = go.Figure(data=traces, layout=layout)
py.iplot(fig, filename='feature-importance')



In [36]:
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.pointplot(x="likes", y="rating",data=df)
sns.plt.show()


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-36-c3532476d7df> in <module>()
      1 import seaborn as sns
      2 sns.set_style("whitegrid")
----> 3 ax = sns.pointplot(x="likes", y="rating",data=df)
      4 sns.plt.show()

/Users/wesm/anaconda/envs/openCV/lib/python2.7/site-packages/seaborn/categorical.pyc in pointplot(x, y, hue, data, order, hue_order, estimator, ci, n_boot, units, markers, linestyles, dodge, join, scale, orient, color, palette, ax, errwidth, capsize, **kwargs)
   3065                             estimator, ci, n_boot, units,
   3066                             markers, linestyles, dodge, join, scale,
-> 3067                             orient, color, palette, errwidth, capsize)
   3068 
   3069     if ax is None:

/Users/wesm/anaconda/envs/openCV/lib/python2.7/site-packages/seaborn/categorical.pyc in __init__(self, x, y, hue, data, order, hue_order, estimator, ci, n_boot, units, markers, linestyles, dodge, join, scale, orient, color, palette, errwidth, capsize)
   1607         """Initialize the plotter."""
   1608         self.establish_variables(x, y, hue, data, orient,
-> 1609                                  order, hue_order, units)
   1610         self.establish_colors(color, palette, 1)
   1611         self.estimate_statistic(estimator, ci, n_boot)

/Users/wesm/anaconda/envs/openCV/lib/python2.7/site-packages/seaborn/categorical.pyc in establish_variables(self, x, y, hue, data, orient, order, hue_order, units)
    142                 x = data.get(x, x)
    143                 y = data.get(y, y)
--> 144                 hue = data.get(hue, hue)
    145                 units = data.get(units, units)
    146 

/Users/wesm/anaconda/envs/openCV/lib/python2.7/site-packages/pandas/core/generic.pyc in get(self, key, default)
    970         """
    971         try:
--> 972             return self[key]
    973         except KeyError:
    974             return default

/Users/wesm/anaconda/envs/openCV/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   1633             return self._getitem_multilevel(key)
   1634         else:
-> 1635             return self._getitem_column(key)
   1636 
   1637     def _getitem_column(self, key):

/Users/wesm/anaconda/envs/openCV/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   1640         # get column
   1641         if self.columns.is_unique:
-> 1642             return self._get_item_cache(key)
   1643 
   1644         # duplicate columns & possible reduce dimensionaility

/Users/wesm/anaconda/envs/openCV/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
    981         res = cache.get(item)
    982         if res is None:
--> 983             values = self._data.get(item)
    984             res = self._box_item_values(item, values)
    985             cache[item] = res

/Users/wesm/anaconda/envs/openCV/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item)
   2750             if isnull(item):
   2751                 indexer = np.arange(len(self.items))[isnull(self.items)]
-> 2752                 return self.get_for_nan_indexer(indexer)
   2753 
   2754             _, block = self._find_block(item)

/Users/wesm/anaconda/envs/openCV/lib/python2.7/site-packages/pandas/core/internals.pyc in get_for_nan_indexer(self, indexer)
   2802                 indexer = indexer.item()
   2803             else:
-> 2804                 raise ValueError("cannot label index with a null key")
   2805 
   2806         # take a nan indexer and return the values

ValueError: cannot label index with a null key

The ValueError above stems from a version incompatibility between this seaborn and pandas build (seaborn passes hue=None into DataFrame.get, which this pandas version rejects as a null key), not from our column choice. The cell below sanity-checks seaborn against its built-in tips dataset.

In [ ]:
import seaborn as sns
sns.set_style("whitegrid")
tips = sns.load_dataset("tips")
print tips
ax = sns.pointplot(x="time", y="total_bill", data=tips)
sns.plt.show()

Future Improvements

  • Run the above analysis a number of times and take the average for each cell
  • Based on the heaviest-weighted parameters for each genre, run the random forest algorithm taking only those parameters into consideration (see the sketch below)
  • Generate a model that classifies videos dynamically
  • Make more values ordinal - perhaps use NLP or LDA to factor in descriptions, titles, and lyrics
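A sketch of the second item, assuming df still holds the transposed importance table from In [19] (rows are genres), training_set/test_set hold the full splits, and a pandas recent enough to have Series.sort_values:

# Hypothetical helper: retrain on each genre's top-k features only
def score_with_top_features(genre, k=10):
    feats = list(df.loc[genre].sort_values(ascending=False).head(k).index)
    clf = RandomForestClassifier(n_estimators=11)
    clf.fit(training_set[feats], training_set['is_' + genre])
    return clf.score(test_set[feats], test_set['is_' + genre])

print score_with_top_features('rock')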
