Data Collection



In [1]:

    
import pandas as pd
from os import path
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
import sklearn

# Edit path if need be (shouldn't need to b/c we all have the same folder structure)
CSV_PATH_1 = '../Videos/all_data'
CSV_PATH_2 = '../Videos2/all_data2'
FILE_EXTENSION = '_all.csv'
GENRES = ['country', 'edm', 'pop', 'rap', 'rock']

# Containers for the data frames
genre_dfs = {}
all_genres = None

# Read in the 5 genre's of CV's
for genre in GENRES:
    genre_csv_path_1 = path.join(CSV_PATH_1, genre) + FILE_EXTENSION
    genre_csv_path_2 = path.join(CSV_PATH_2, genre) + FILE_EXTENSION
    df_1 = pd.read_csv(genre_csv_path_1)
    df_2 = pd.read_csv(genre_csv_path_2)
    df_1 = df_1.drop('Unnamed: 0',1)
    df_2 = df_2.drop('Unnamed: 0',1)
    df_combined = pd.concat([df_1,df_2],ignore_index=True)
    genre_dfs[genre] = df_combined

all_genres = pd.concat(genre_dfs.values())
all_genres.head()

# genre_dfs is now a dictionary that contains the 5 different data frames
# all_genres is a dataframe that contains all of the data









    



/Users/wesm/anaconda/envs/openCV/lib/python2.7/site-packages/sklearn/utils/fixes.py:200: UserWarning: Using `sort` instead of partition.Upgrade numpy to 1.8 for better performace on large numberof clusters
  warnings.warn('Using `sort` instead of partition.'






    Out[1]:






  
    
      
      filename
      author
      description
      viewcount
      rating
      likes
      dislikes
      duration
      length
      keywords
      published
      colors_1_red
      colors_1_blue
      colors_1_green
      colors_2_red
      colors_2_blue
      colors_2_green
      colors_3_red
      colors_3_blue
      colors_3_green
      
    
  
  
    
      0
              Luke Bryan - Roller Coaster.mp4
           LukeBryanVEVO
       Luke Bryan - Crash My Party\nPurchase now on i...
       28948653
       4.840108
       127866
       5324
       00:04:23
       263
       [Luke, Bryan, Roller, Coaster, Capitol, Record...
       2014-06-21 11:00:03
       250
       240
       230
       80
       70
       50
       210
       200
       190
      ...
    
    
      1
        Dierks Bentley - Drunk On A Plane.mp4
       DierksBentleyVEVO
       Purchase Dierks Bentley’s latest music: http:/...
       41548786
       4.763639
       140682
       8835
       00:04:51
       291
       [Dierks, Bentley, Drunk, On, Plane, Capitol, R...
       2014-05-13 20:16:23
        10
        20
        30
        0
       10
       20
        20
        30
        40
      ...
    
    
      2
       Thomas Rhett - Get Me Some Of That.mp4
         ThomasRhettVEVO
       Music video by Thomas Rhett performing Get Me ...
       43868160
       4.826069
       128488
       5841
       00:03:13
       193
       [Thomas, Rhett, Get, Me, Some, Of, That, The, ...
       2013-12-10 08:00:01
        50
        60
        40
       60
       70
       50
        70
        80
        60
      ...
    
    
      3
          David Nail - Whatever She's Got.mp4
           DavidNailVEVO
       Purchase David Nail’s latest music: http://umg...
       48648247
       4.826632
       141108
       6393
       00:04:01
       241
       [David, Nail, Whatever, She's, Got, MCA, Nashv...
       2013-07-16 07:00:15
        30
        20
        10
       40
       30
       20
        50
        40
        30
      ...
    
    
      4
                       Joe Nichols - Yeah.mp4
          JoeNicholsVEVO
       Joe Nichols - Yeah\n“Yeah” from Joe Nichol’s C...
       11397694
       4.815725
        33255
       1606
       00:03:52
       232
           [Joe Nichols, Red Bow Records, Country, Yeah]
       2014-05-27 07:00:01
       250
       250
       230
       20
       30
       40
        10
        20
        30
      ...
    
  

5 rows × 42 columns

Ordinal Genres

Below, we make the genres ordinal to fit in the random forest classifiers. We add a new column to our dataframe to do so, write a function to populate it, and run it across the dataframe.



In [2]:

    
def genre_to_ordinal(genre_in):
    if(genre_in == "country"):
        return 0
    elif(genre_in == "pop"):
        return 1
    elif(genre_in == "rock"):
        return 2
    elif(genre_in == "edm"):
        return 3
    elif(genre_in == "rap"):
        return 4
    else:
        return genre_in
    
all_genres['genre_ordinal'] = all_genres.genre.apply(genre_to_ordinal)

We add in some boolean genre classifiers to make our analysis more fine-grained. Rather than saying "we predict this video is country with 50% confidence", we could say "we predict this video is not edm with 90% confidence" and so on.



In [3]:

    
# Adding is_country flag
def is_country(genre_in):
    if(genre_in == "country"):
        return 1
    else:
        return 0
    
all_genres['is_country'] = all_genres.genre.apply(is_country)

# Adding is_country flag
def is_rock(genre_in):
    if(genre_in == "rock"):
        return 1
    else:
        return 0
    
all_genres['is_rock'] = all_genres.genre.apply(is_rock)

# Adding is_edm flag
def is_edm(genre_in):
    if(genre_in == "edm"):
        return 1
    else:
        return 0
    
all_genres['is_edm'] = all_genres.genre.apply(is_edm)

# Adding is_rap flag
def is_rap(genre_in):
    if(genre_in == "rap"):
        return 1
    else:
        return 0
    
all_genres['is_rap'] = all_genres.genre.apply(is_rap)

# Adding is_country flag
def is_pop(genre_in):
    if(genre_in == "pop"):
        return 1
    else:
        return 0
    
all_genres['is_pop'] = all_genres.genre.apply(is_pop)

Test and Train Sets

We create our training and test sets by splitting all_genres by genre, and making 10 of each genre train and 10 test. We aggregate by genre to make our full train and full test sets, each containing 50 records of various genres.



In [4]:

    
# Subset all_genres to group by individual genres
country_records  = all_genres[all_genres["genre"] == "country"]
rock_records     = all_genres[all_genres["genre"] == "rock"]
pop_records      = all_genres[all_genres["genre"] == "pop"]
edm_records      = all_genres[all_genres["genre"] == "edm"]
rap_records      = all_genres[all_genres["genre"] == "rap"]

# From the subsets above, create train and test sets from each
country_train = country_records.head(len(country_records) / 2)
country_test  = country_records.tail(len(country_records) / 2)
rock_train    = rock_records.head(len(rock_records) / 2)
rock_test     = rock_records.tail(len(rock_records) / 2)
pop_train     = pop_records.head(len(pop_records) / 2)
pop_test      = pop_records.tail(len(pop_records) / 2)
edm_train     = edm_records.head(len(edm_records) / 2)
edm_test      = edm_records.tail(len(edm_records) / 2)
rap_train     = rap_records.head(len(rap_records) / 2)
rap_test      = rap_records.tail(len(rap_records) / 2)

# Create big training and big test set for analysis
training_set = pd.concat([country_train,rock_train,pop_train,edm_train,rap_train])
test_set     = pd.concat([country_test,rock_test,pop_test,edm_test,rap_test])

training_set = training_set.fillna(0)
test_set = test_set.fillna(0)

print "Training Records:\t" , len(training_set)
print "Test Records:\t\t" , len(test_set)
# training_set.head()









    



Training Records:	405
Test Records:		405

Generating Random Forest - Viewer Statistics

We start generating our random forests, and output a relative accuracy and a confusion matrix. In this first one, we simply factor in non-color variables (rating, likes, dislikes, length and viewcount), and run it across all records to predict an ordinal genre value.



In [5]:

    
# Predicting based solely on non-color features, using RF
clf = RandomForestClassifier(n_estimators=11)
meta_data_features = ['rating', 'likes','dislikes','length','viewcount']
y, _ = pd.factorize(training_set['genre_ordinal'])
clf = clf.fit(training_set[meta_data_features], y)

z, _ = pd.factorize(test_set['genre_ordinal'])
print clf.score(test_set[meta_data_features],z)
pd.crosstab(test_set.genre_ordinal, clf.predict(test_set[meta_data_features]),rownames=["Actual"], colnames=["Predicted"])









    



0.422222222222






    Out[5]:






  
    
      Predicted
      0
      1
      2
      3
      4
    
    
      Actual
      
      
      
      
      
    
  
  
    
      0
       48
        1
        1
       27
        4
    
    
      1
        0
       27
       41
        6
        7
    
    
      2
       27
        8
        2
       34
        4
    
    
      3
        7
       16
       22
       26
       12
    
    
      4
        7
       17
        4
        9
       48
    
  

5 rows × 5 columns

As shown above, this method yields relatively poor results. This is because there's no distinct clusters being created by our random forest, and simple viewer statistics tell us nothing about what kind of video we're watching. However, we see that country, rap and pop are initially somewhat distinct (diagonal is the highest value), and rock and edm are getting mistaken for one another. Let's see if we can't make something of this.

Random Forest - Only Color Statistics

Below, we do the same random forest as above, but going strictly off of average frame color for the video.

We found the most commonly appearing color in each frame and called it the 'frame mode'. We then took all of the frame modes and found the 10 most common of them. Those became the 'color data' we use to analyze videos.



In [6]:

    
def gen_new_headers(old_headers):
    headers = ['colors_' + str(x+1) + '_' for x in range(10)]
    h = []
    for x in headers:
        h.append(x + 'red')
        h.append(x + 'blue')
        h.append(x + 'green')
    return old_headers + h + ['genre']



In [7]:

    
clf = RandomForestClassifier(n_estimators=11)
color_features = gen_new_headers([])[:-1]

# Predicting based solely on colors
y, _ = pd.factorize(training_set['genre_ordinal'])
clf = clf.fit(training_set[color_features], y)

z, _ = pd.factorize(test_set['genre_ordinal'])
print clf.score(test_set[color_features],z)
pd.crosstab(test_set.genre_ordinal, clf.predict(test_set[color_features]),rownames=["Actual"], colnames=["Predicted"])









    



0.237037037037






    Out[7]:






  
    
      Predicted
      0
      1
      2
      3
      4
    
    
      Actual
      
      
      
      
      
    
  
  
    
      0
       30
       23
        9
       12
        7
    
    
      1
       23
       14
       13
       15
       16
    
    
      2
       30
       20
        8
       10
        7
    
    
      3
       16
       12
       29
       19
        7
    
    
      4
       22
       12
       22
       15
       14
    
  

5 rows × 5 columns

This actually yields worse results than just the viewer statistics, because the color of a video by itself does not determine the genre. If rappers only had red in their videos and rockers only had black this might be somewhat accurate, but that's just not the case. But, what if we pair these findings with our initial viewer statistics?

Random Forest - All Features



In [8]:

    
clf = RandomForestClassifier(n_estimators=11)
all_features = meta_data_features + color_features

# Predicting based on colors and non-color features
y, _ = pd.factorize(training_set['genre_ordinal'])
clf = clf.fit(training_set[all_features], y)

z, _ = pd.factorize(test_set['genre_ordinal'])
print clf.score(test_set[all_features],z)
pd.crosstab(test_set.genre_ordinal, clf.predict(test_set[all_features]),rownames=["Actual"], colnames=["Predicted"])









    



0.39012345679






    Out[8]:






  
    
      Predicted
      0
      1
      2
      3
      4
    
    
      Actual
      
      
      
      
      
    
  
  
    
      0
       48
        4
        0
       14
       15
    
    
      1
       13
       17
       33
       13
        5
    
    
      2
       21
       27
        3
       14
       10
    
    
      3
        7
        9
       24
       27
       16
    
    
      4
       28
       10
        6
       18
       23
    
  

5 rows × 5 columns

Singling Out Pop and Rap

Scores are expectedly low. It seems as if we're trying to make the classifier do way too much work, and are giving it very mediocre data to go off of. Recall that we're actually trying to determine WHICH genre a video is by the above code, not whether or not a video is of ONE specific genre. This brings back the binary classifiers that we created above, let's put those to use to see if we can improve these scores.

We try pop and rap first, since they seem to be the most distinct by what we've gathered above.



In [9]:

    
clf = RandomForestClassifier(n_estimators=11)
all_features = meta_data_features + color_features
print all_features

# Predicting based on colors and non-color features
y, _ = pd.factorize(training_set['is_pop'])
clf = clf.fit(training_set[all_features], y)

z, _ = pd.factorize(test_set['is_pop'])
print clf.score(test_set[all_features],z)
pd.crosstab(test_set.is_pop, clf.predict(test_set[all_features]),rownames=["Actual"], colnames=["Predicted"])









    



['rating', 'likes', 'dislikes', 'length', 'viewcount', 'colors_1_red', 'colors_1_blue', 'colors_1_green', 'colors_2_red', 'colors_2_blue', 'colors_2_green', 'colors_3_red', 'colors_3_blue', 'colors_3_green', 'colors_4_red', 'colors_4_blue', 'colors_4_green', 'colors_5_red', 'colors_5_blue', 'colors_5_green', 'colors_6_red', 'colors_6_blue', 'colors_6_green', 'colors_7_red', 'colors_7_blue', 'colors_7_green', 'colors_8_red', 'colors_8_blue', 'colors_8_green', 'colors_9_red', 'colors_9_blue', 'colors_9_green', 'colors_10_red', 'colors_10_blue', 'colors_10_green']
0.80987654321






    Out[9]:






  
    
      Predicted
      0
      1
    
    
      Actual
      
      
    
  
  
    
      0
       311
       13
    
    
      1
        64
       17
    
  

2 rows × 2 columns



In [10]:

    
clf = RandomForestClassifier(n_estimators=11)
all_features = meta_data_features + color_features

# Predicting based on colors and non-color features
y, _ = pd.factorize(training_set['is_rap'])
clf = clf.fit(training_set[all_features], y)

z, _ = pd.factorize(test_set['is_rap'])
print clf.score(test_set[all_features],z)
pd.crosstab(test_set.is_rap, clf.predict(test_set[all_features]),rownames=["Actual"], colnames=["Predicted"])









    



0.767901234568






    Out[10]:






  
    
      Predicted
      0
      1
    
    
      Actual
      
      
    
  
  
    
      0
       289
       31
    
    
      1
        63
       22
    
  

2 rows × 2 columns

What we're seeing above is a confusion matrix that, based on our training data, predicts whether or not a video in the test set is a pop video or not. In the "predicted" row, 0 means it predicts it's not a pop video, and that the 1 is. Likewise with the actual, 0 shows that the video actually wasn't a pop video, and the 1 shows that it was.

The confusion matrix above is our first effort at utilizing these binary classifiers. Most of our videos aren't pop videos, and the model did a good job of picking out those that aren't pop. However, we could use some improvement in the realm of "false negatives", where the model classified a video as not pop when it actually was.

We do these tests 50 times for sake of average score.

Rather than hard-coding each time we wanted to run something for average, we wrote a function that does it for us. All we have to do is pass in the boolean classifier in quotes ("is_rock", etc.), and the number of iterations that we want. Results are displayed below.



In [11]:

    
def multi_RF_averages(is_genre,num_iterations):
    clf = RandomForestClassifier(n_estimators=11)
    loop_indices = range(0,num_iterations)
    cumsum = 0

    for i in loop_indices:
        y, _ = pd.factorize(training_set[is_genre])
        clf = clf.fit(training_set[all_features], y)

        z, _ = pd.factorize(test_set[is_genre])
        cumsum = cumsum + clf.score(test_set[all_features],z)
    
    
    print "Average Score for",len(loop_indices),is_genre,"iterations:", cumsum/len(loop_indices)
    return clf



In [12]:

    
pop_class = multi_RF_averages("is_pop",50)
rap_class = multi_RF_averages("is_rap",50)
rock_class = multi_RF_averages("is_rock",50)
edm_class = multi_RF_averages("is_edm",50)
country_class = multi_RF_averages("is_country",50)









    



Average Score for 50 is_pop iterations: 0.810271604938
Average Score for 50 is_rap iterations: 0.78449382716
Average Score for 50 is_rock iterations: 0.814469135802
Average Score for 50 is_edm iterations: 0.756345679012
Average Score for 50 is_country iterations: 0.793037037037

The following creates several files that describe our classifiers. Our website will later



In [13]:

    
from sklearn.externals import joblib
# only use these to generate pickle files for website
# joblib.dump(pop_class, 'classifiers/pop_class.pkl')
# joblib.dump(rap_class, 'classifiers/rap_class.pkl')
# joblib.dump(rock_class, 'classifiers/rock_class.pkl')
# joblib.dump(edm_class, 'classifiers/edm_class.pkl')
# joblib.dump(country_class, 'classifiers/country_class.pkl')

We ran the above test with all genres, and as shown in above analysis, our country and edm typically have very low accuracy. We've seen above that edm and rock videos are getting mixed up with one another, so we assume that something is characteristic of these 2 genres that's not of everything else. We take out the edm values from our training and test datasets, hoping to improve accuracy.



In [14]:

    
# Removing EDM for better analysis - makes is_pop and is_rap much more accurate
training_set = pd.concat([country_train,rock_train,pop_train,rap_train])
test_set     = pd.concat([country_test,rock_test,pop_test,rap_test])

multi_RF_averages("is_pop",50)
multi_RF_averages("is_rap",50)
multi_RF_averages("is_rock",50)
multi_RF_averages("is_edm",50)
multi_RF_averages("is_country",50)









    



Average Score for 50 is_pop iterations: 0.83900621118
Average Score for 50 is_rap iterations: 0.744472049689
Average Score for 50 is_rock iterations: 0.771242236025
Average Score for 50 is_edm iterations: 1.0
Average Score for 50 is_country iterations: 0.737888198758






    Out[14]:





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=11, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

So, what does this tell us? Based on our training data, we have the best chance of accurately classifying something as pop or not pop (under these conditions).

We want to find out which 2 are the most distinct, so we can make build our model based on that classification.



In [15]:

    
training_set = pd.concat([country_train,rock_train,edm_train,rap_train,pop_train])

test_set     = pd.concat([rock_test])
multi_RF_averages("is_rock",50)

test_set     = pd.concat([rap_test])
multi_RF_averages("is_rap",50)

test_set     = pd.concat([country_test])
multi_RF_averages("is_country",50)

test_set     = pd.concat([pop_test])
multi_RF_averages("is_pop",50)

test_set     = pd.concat([edm_test])
multi_RF_averages("is_edm",50)









    



Average Score for 50 is_rock iterations: 0.824
Average Score for 50 is_rap iterations: 0.745176470588
Average Score for 50 is_country iterations: 0.212098765432
Average Score for 50 is_pop iterations: 0.727654320988
Average Score for 50 is_edm iterations: 0.926265060241






    Out[15]:





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=11, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

Rock and EDM have suprisingly distinct classifiers. We should dive into the videos and see what this means.



In [16]:

    
test_set     = pd.concat([edm_test,rock_test])
multi_RF_averages("is_edm",50)
multi_RF_averages("is_rock",50)









    



Average Score for 50 is_edm iterations: 0.523797468354
Average Score for 50 is_rock iterations: 0.589620253165






    Out[16]:





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=11, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

Selecting Most Valuable Features per Genre - Rock



In [17]:

    
model = ExtraTreesClassifier()

training_set = pd.concat([country_train,pop_train,rap_train,rock_train,edm_train])
y, _ = pd.factorize(training_set['is_rock'])
model.fit(training_set[all_features], y)

# display the relative importance of each attribute
print model.feature_importances_









    



[ 0.03215376  0.03177851  0.03505766  0.04129051  0.02814354  0.02575379
  0.03171086  0.03654139  0.02574975  0.01578325  0.02653417  0.03130137
  0.02926545  0.02289587  0.02684631  0.03361007  0.02944086  0.02107275
  0.02752795  0.03850809  0.04428544  0.03098749  0.02239601  0.02863052
  0.03092083  0.024       0.02473154  0.02474276  0.02801491  0.02447993
  0.03047746  0.02094644  0.02267132  0.02309511  0.02865437]



In [18]:

    
df = pd.DataFrame()
df['index'] = all_features

y, _ = pd.factorize(training_set['is_rap'])
model.fit(training_set[all_features], y)
        
df['rap'] = model.feature_importances_

y, _ = pd.factorize(training_set['is_rock'])
model.fit(training_set[all_features], y)

df['rock'] = model.feature_importances_

y, _ = pd.factorize(training_set['is_country'])
model.fit(training_set[all_features], y)

df['country'] = model.feature_importances_

y, _ = pd.factorize(training_set['is_edm'])
model.fit(training_set[all_features], y)

df['edm'] = model.feature_importances_

y, _ = pd.factorize(training_set['is_pop'])
model.fit(training_set[all_features], y)

df['pop'] = model.feature_importances_



In [19]:

    
df = df.set_index('index')
df = df.transpose()
df.head()









    Out[19]:






  
    
      index
      rating
      likes
      dislikes
      length
      viewcount
      colors_1_red
      colors_1_blue
      colors_1_green
      colors_2_red
      colors_2_blue
      colors_2_green
      colors_3_red
      colors_3_blue
      colors_3_green
      colors_4_red
      colors_4_blue
      colors_4_green
      colors_5_red
      colors_5_blue
      colors_5_green
      
    
  
  
    
      rap
       0.047518
       0.044509
       0.026947
       0.048549
       0.068925
       0.020245
       0.029470
       0.036669
       0.022064
       0.025914
       0.032847
       0.020178
       0.019545
       0.024083
       0.027169
       0.029987
       0.030205
       0.026613
       0.033678
       0.015384
      ...
    
    
      rock
       0.038210
       0.027530
       0.034013
       0.067918
       0.026355
       0.025508
       0.035172
       0.028821
       0.024865
       0.017377
       0.025067
       0.037129
       0.032313
       0.038196
       0.023275
       0.018912
       0.029145
       0.033083
       0.025767
       0.021179
      ...
    
    
      country
       0.038492
       0.045486
       0.028221
       0.018982
       0.042292
       0.026688
       0.017241
       0.031658
       0.025009
       0.026803
       0.021720
       0.025610
       0.020599
       0.026612
       0.032661
       0.028635
       0.033870
       0.024077
       0.023897
       0.029288
      ...
    
    
      edm
       0.028083
       0.029260
       0.035402
       0.029115
       0.034032
       0.026592
       0.024570
       0.017721
       0.022563
       0.022167
       0.025107
       0.028665
       0.032543
       0.023773
       0.019332
       0.031252
       0.038824
       0.030370
       0.032247
       0.029857
      ...
    
    
      pop
       0.025182
       0.143491
       0.168979
       0.021370
       0.119519
       0.019349
       0.017362
       0.010976
       0.012727
       0.024821
       0.019371
       0.015525
       0.017323
       0.014381
       0.024512
       0.015031
       0.013566
       0.015581
       0.016733
       0.013688
      ...
    
  

5 rows × 35 columns



In [28]:

    
lol =









    



[[0.04751846813757317, 0.04450860945678809, 0.026946872025639845, 0.04854879161636211, 0.06892461649124429, 0.020244662439165197, 0.029469591198911304, 0.03666895866275123, 0.022064296236677206, 0.025913540404628243, 0.0328473922409685, 0.020177816362440794, 0.01954470517035783, 0.024083395556533084, 0.02716897135714228, 0.0299871399410516, 0.030204914112677794, 0.026612903579196108, 0.033678100812857076, 0.015384066701009187, 0.0377606319761716, 0.023283097779122113, 0.026896605875738656, 0.02370724274142814, 0.022917492209296515, 0.03315687278213647, 0.022636611276494613, 0.020829799925699225, 0.024590609218013638, 0.019688600158474658, 0.02431423367747316, 0.030478725679808666, 0.019923824988711373, 0.012386350397526355, 0.026931488809929995], [0.03820965976479474, 0.02753037806867034, 0.0340131434468542, 0.06791847333860007, 0.026355397033801558, 0.02550808124853287, 0.03517179068395991, 0.028820656298402664, 0.024865336561169366, 0.017377287210609042, 0.025067429774877664, 0.03712925603631885, 0.032313431624130654, 0.038195928655306036, 0.023275307183976413, 0.01891160922372603, 0.02914479419186452, 0.03308264141281638, 0.025766988170473615, 0.02117862890471036, 0.03454471526651034, 0.02874953328420889, 0.02589834847364113, 0.023687853895087523, 0.020170353110942778, 0.03404435453090807, 0.02542704493728982, 0.023998457655911336, 0.020884384094487254, 0.023036103651904234, 0.03447665812922108, 0.0207688983761078, 0.02616763410506969, 0.01664348412858725, 0.03166595752652742], [0.038492322631438716, 0.045486078988124935, 0.028221428326983496, 0.018982051570874937, 0.042292196161463236, 0.026688175973264932, 0.017241090327986788, 0.03165804124107317, 0.025008722174609282, 0.02680277078775852, 0.021720306937796027, 0.025609712677840114, 0.020599273596548933, 0.026611742440018803, 0.032661331880473085, 0.028635346967783425, 0.033869672900653916, 0.02407678279615873, 0.02389680011782138, 0.029288064748095928, 0.03231456538677038, 0.025522108438835046, 0.032449707418120144, 0.023296733008336935, 0.034257095599808315, 0.027813891432831965, 0.028878412706529515, 0.023239697524282992, 0.03347701485537924, 0.03237786067825238, 0.028867002107612873, 0.02653381982789816, 0.015369669974456185, 0.03039838154303724, 0.03736212625108022], [0.028083091393088795, 0.02925982866606816, 0.035402135893162004, 0.029114766721899627, 0.03403230242158296, 0.026592498382255525, 0.02456989381731677, 0.017720930091760297, 0.022562942748568654, 0.022167296549449416, 0.025106889075752196, 0.028665284951930048, 0.032542996748968635, 0.0237725164793399, 0.019331805568237072, 0.031252171204955666, 0.03882406293867897, 0.03036989040829487, 0.03224732020959375, 0.029857107903605927, 0.026485112482593737, 0.02804476420296878, 0.033049262545111355, 0.02528004653713515, 0.03142009660753499, 0.026762062786309028, 0.021691531011036047, 0.021270665315099545, 0.035164016091217165, 0.02321047128941852, 0.03065668685222178, 0.03164276901326415, 0.03070841341208818, 0.030035340645568155, 0.04310302903392418], [0.025181960908518375, 0.14349123469243946, 0.16897911783629957, 0.02137023726023631, 0.11951859058975425, 0.019349123091656144, 0.017361887179769834, 0.010976307846218671, 0.012726752320135573, 0.024820713019392485, 0.019370968808225697, 0.01552530969200926, 0.017323456596456153, 0.014380825294978406, 0.02451225875710663, 0.015030724002407065, 0.01356619976144025, 0.015581235408741886, 0.016732737696410382, 0.013688399761101536, 0.017269472731588947, 0.012241376375142377, 0.01433871726646221, 0.015371294285551335, 0.017401094790585384, 0.013270932167514706, 0.022989235701943618, 0.011605946330226631, 0.02059259403476684, 0.0171798520184634, 0.021834919621225357, 0.01602093746796323, 0.022080114864940582, 0.024634912100715892, 0.023680559719611506]]



In [48]:

    
lol = df.values.tolist()

cols = []
for x in df.columns:
    cols.append(x)



In [51]:

    
import plotly.offline as py  # a little wordplay
import plotly.graph_objs as go

offline.init_notebook_mode()

title = 'Feature Importance By Genre'

labels = [ ]

mode_size = [8, 8, 12, 8]

line_size = [2, 2, 4, 2]

x_data = cols

y_data = df.values.tolist()
traces = []

for i in range(0, 4):
    traces.append(go.Scatter(
        x=x_data,
        y=y_data[i],
        mode='lines',
        connectgaps=True,
    ))



layout = go.Layout(
    yaxis=dict(
        showgrid=False,
        zeroline=False,
        showline=False,
        showticklabels=False,
    ),
    autosize=False,
    margin=dict(
        autoexpand=True,
        l=100,
        r=20,
        t=110,
    ),
    showlegend=False,
)

annotations = []

# Adding labels
for y_trace, label in zip(y_data, labels):
    # labeling the left_side of the plot
    annotations.append(dict(xref='paper', x=0.05, y=y_trace[0],
                                  xanchor='right', yanchor='middle',
                                  text=label + ' {}%'.format(y_trace[0]),
                                  font=dict(family='Arial',
                                            size=16,
                                            ),
                                  showarrow=False))
    # labeling the right_side of the plot
    annotations.append(dict(xref='paper', x=0.95, y=y_trace[11],
                                  xanchor='left', yanchor='middle',
                                  text='{}%'.format(y_trace[11]),
                                  font=dict(family='Arial',
                                            size=16,
                                            ),
                                  showarrow=False))
# Title
annotations.append(dict(xref='paper', yref='paper', x=0.0, y=1.05,
                              xanchor='left', yanchor='bottom',
                              text='Feature Importance By Genre',
                              font=dict(family='Arial',
                                        size=30,
                                        ),
                              showarrow=False))
# Source
# annotations.append(dict(xref='paper', yref='paper', x=0.5, y=-0.1,
#                               xanchor='center', yanchor='top',
#                               text='Source: PewResearch Center & ' +
#                                    'Storytelling with data',
#                               font=dict(family='Arial',
#                                         size=12,
#                                         ),
#                               showarrow=False))

layout['annotations'] = annotations

fig = go.Figure(data=traces, layout=layout)
py.iplot(fig, filename='news-source')









    



---------------------------------------------------------------------------
PlotlyError                               Traceback (most recent call last)
<ipython-input-51-b3ec171dfba6> in <module>()
     83 
     84 fig = go.Figure(data=traces, layout=layout)
---> 85 py.iplot(fig, filename='news-source')

/Users/wesm/anaconda/envs/openCV/lib/python2.7/site-packages/plotly/offline/offline.pyc in iplot(figure_or_data, show_link, link_text, validate, image, filename, image_width, image_height)
    283             '',
    284             'import plotly',
--> 285             'plotly.offline.init_notebook_mode() '
    286             '# run at the start of every ipython notebook',
    287         ]))

PlotlyError: Plotly Offline mode has not been initialized in this notebook. Run: 

import plotly
plotly.offline.init_notebook_mode() # run at the start of every ipython notebook



In [36]:

    
import seaborn as sns
sns.set_style("whitegrid")
ax = sns.pointplot(x="likes", y="rating",data=df)
sns.plt.show()









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-36-c3532476d7df> in <module>()
      1 import seaborn as sns
      2 sns.set_style("whitegrid")
----> 3 ax = sns.pointplot(x="likes", y="rating",data=df)
      4 sns.plt.show()

/Users/wesm/anaconda/envs/openCV/lib/python2.7/site-packages/seaborn/categorical.pyc in pointplot(x, y, hue, data, order, hue_order, estimator, ci, n_boot, units, markers, linestyles, dodge, join, scale, orient, color, palette, ax, errwidth, capsize, **kwargs)
   3065                             estimator, ci, n_boot, units,
   3066                             markers, linestyles, dodge, join, scale,
-> 3067                             orient, color, palette, errwidth, capsize)
   3068 
   3069     if ax is None:

/Users/wesm/anaconda/envs/openCV/lib/python2.7/site-packages/seaborn/categorical.pyc in __init__(self, x, y, hue, data, order, hue_order, estimator, ci, n_boot, units, markers, linestyles, dodge, join, scale, orient, color, palette, errwidth, capsize)
   1607         """Initialize the plotter."""
   1608         self.establish_variables(x, y, hue, data, orient,
-> 1609                                  order, hue_order, units)
   1610         self.establish_colors(color, palette, 1)
   1611         self.estimate_statistic(estimator, ci, n_boot)

/Users/wesm/anaconda/envs/openCV/lib/python2.7/site-packages/seaborn/categorical.pyc in establish_variables(self, x, y, hue, data, orient, order, hue_order, units)
    142                 x = data.get(x, x)
    143                 y = data.get(y, y)
--> 144                 hue = data.get(hue, hue)
    145                 units = data.get(units, units)
    146 

/Users/wesm/anaconda/envs/openCV/lib/python2.7/site-packages/pandas/core/generic.pyc in get(self, key, default)
    970         """
    971         try:
--> 972             return self[key]
    973         except KeyError:
    974             return default

/Users/wesm/anaconda/envs/openCV/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   1633             return self._getitem_multilevel(key)
   1634         else:
-> 1635             return self._getitem_column(key)
   1636 
   1637     def _getitem_column(self, key):

/Users/wesm/anaconda/envs/openCV/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   1640         # get column
   1641         if self.columns.is_unique:
-> 1642             return self._get_item_cache(key)
   1643 
   1644         # duplicate columns & possible reduce dimensionaility

/Users/wesm/anaconda/envs/openCV/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
    981         res = cache.get(item)
    982         if res is None:
--> 983             values = self._data.get(item)
    984             res = self._box_item_values(item, values)
    985             cache[item] = res

/Users/wesm/anaconda/envs/openCV/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item)
   2750             if isnull(item):
   2751                 indexer = np.arange(len(self.items))[isnull(self.items)]
-> 2752                 return self.get_for_nan_indexer(indexer)
   2753 
   2754             _, block = self._find_block(item)

/Users/wesm/anaconda/envs/openCV/lib/python2.7/site-packages/pandas/core/internals.pyc in get_for_nan_indexer(self, indexer)
   2802                 indexer = indexer.item()
   2803             else:
-> 2804                 raise ValueError("cannot label index with a null key")
   2805 
   2806         # take a nan indexer and return the values

ValueError: cannot label index with a null key



In [ ]:

    
import seaborn as sns
sns.set_style("whitegrid")
tips = sns.load_dataset("tips")
print tips
ax = sns.pointplot(x="time", y="total_bill", data=tips)
sns.plt.show()

Future Improvements

Run the above graph a number of times, take the average for each cell
Based on the heaviest weighted parameters for each, run the random forest algorithm only taking these given parameters into consideration
Generate a model that classifies videos dynamically
Make more values ordinal - maybe to NLP or LDA to factor in descriptions, titles and lyrics



In [ ]:



In [ ]:

	filename	author	description	viewcount	rating	likes	dislikes	duration	length	keywords	published	colors_1_red	colors_1_blue	colors_1_green	colors_2_red	colors_2_blue	colors_2_green	colors_3_red	colors_3_blue	colors_3_green
0	Luke Bryan - Roller Coaster.mp4	LukeBryanVEVO	Luke Bryan - Crash My Party\nPurchase now on i...	28948653	4.840108	127866	5324	00:04:23	263	[Luke, Bryan, Roller, Coaster, Capitol, Record...	2014-06-21 11:00:03	250	240	230	80	70	50	210	200	190	...
1	Dierks Bentley - Drunk On A Plane.mp4	DierksBentleyVEVO	Purchase Dierks Bentley’s latest music: http:/...	41548786	4.763639	140682	8835	00:04:51	291	[Dierks, Bentley, Drunk, On, Plane, Capitol, R...	2014-05-13 20:16:23	10	20	30	0	10	20	20	30	40	...
2	Thomas Rhett - Get Me Some Of That.mp4	ThomasRhettVEVO	Music video by Thomas Rhett performing Get Me ...	43868160	4.826069	128488	5841	00:03:13	193	[Thomas, Rhett, Get, Me, Some, Of, That, The, ...	2013-12-10 08:00:01	50	60	40	60	70	50	70	80	60	...
3	David Nail - Whatever She's Got.mp4	DavidNailVEVO	Purchase David Nail’s latest music: http://umg...	48648247	4.826632	141108	6393	00:04:01	241	[David, Nail, Whatever, She's, Got, MCA, Nashv...	2013-07-16 07:00:15	30	20	10	40	30	20	50	40	30	...
4	Joe Nichols - Yeah.mp4	JoeNicholsVEVO	Joe Nichols - Yeah\n“Yeah” from Joe Nichol’s C...	11397694	4.815725	33255	1606	00:03:52	232	[Joe Nichols, Red Bow Records, Country, Yeah]	2014-05-27 07:00:01	250	250	230	20	30	40	10	20	30	...

index	rating	likes	dislikes	length	viewcount	colors_1_red	colors_1_blue	colors_1_green	colors_2_red	colors_2_blue	colors_2_green	colors_3_red	colors_3_blue	colors_3_green	colors_4_red	colors_4_blue	colors_4_green	colors_5_red	colors_5_blue	colors_5_green
rap	0.047518	0.044509	0.026947	0.048549	0.068925	0.020245	0.029470	0.036669	0.022064	0.025914	0.032847	0.020178	0.019545	0.024083	0.027169	0.029987	0.030205	0.026613	0.033678	0.015384	...
rock	0.038210	0.027530	0.034013	0.067918	0.026355	0.025508	0.035172	0.028821	0.024865	0.017377	0.025067	0.037129	0.032313	0.038196	0.023275	0.018912	0.029145	0.033083	0.025767	0.021179	...
country	0.038492	0.045486	0.028221	0.018982	0.042292	0.026688	0.017241	0.031658	0.025009	0.026803	0.021720	0.025610	0.020599	0.026612	0.032661	0.028635	0.033870	0.024077	0.023897	0.029288	...
edm	0.028083	0.029260	0.035402	0.029115	0.034032	0.026592	0.024570	0.017721	0.022563	0.022167	0.025107	0.028665	0.032543	0.023773	0.019332	0.031252	0.038824	0.030370	0.032247	0.029857	...
pop	0.025182	0.143491	0.168979	0.021370	0.119519	0.019349	0.017362	0.010976	0.012727	0.024821	0.019371	0.015525	0.017323	0.014381	0.024512	0.015031	0.013566	0.015581	0.016733	0.013688	...