Exploratory Analysis



In [1]:

    
from pandas import concat, DataFrame
from sklearn.cross_validation import train_test_split
from sklearn import ensemble

from context import *
from util.dfmgmt import initSet, wrangle
from util.polarize import sentiment
from util.plots import boxplotify, pairplotify, meanLine

Plot characteristics and global variables



In [2]:

    
%matplotlib inline

Import dataset



In [3]:

    
df = initSet()
df = df[df['decade'] != 2010]



In [4]:

    
# Check head
df.head(10)









    Out[4]:






  
    
      
      year
      decade
      unique_words
      density
      unique_words_raw
      density_raw
      nouns
      verbs
      adjectives
      syllables
      most_used_term
      most_used_freq
      explicit
      total_curses
      reading_score
      sentiment
      charted
    
  
  
    
      0
      1961
      1960
      36
      72
      65
      158
      34
      30
      10
      70
      rose
      7
      0
      0
      2.367848
      0.9901
      0
    
    
      1
      1961
      1960
      45
      91
      74
      197
      45
      37
      19
      81
      name
      6
      0
      0
      2.771777
      0.9712
      1
    
    
      2
      1961
      1960
      54
      103
      88
      223
      45
      48
      17
      98
      babi
      10
      0
      0
      3.885650
      0.9974
      1
    
    
      3
      1961
      1960
      42
      148
      66
      263
      81
      61
      36
      76
      love
      24
      0
      0
      2.889886
      0.9993
      1
    
    
      4
      1961
      1960
      28
      131
      60
      354
      56
      77
      5
      57
      come
      38
      0
      0
      2.940000
      0.9812
      1
    
    
      5
      1961
      1960
      50
      105
      79
      181
      39
      40
      24
      80
      mari
      13
      0
      0
      3.135470
      0.9744
      1
    
    
      6
      1961
      1960
      39
      75
      70
      160
      28
      35
      21
      81
      take
      13
      0
      0
      3.503750
      0.9978
      1
    
    
      7
      1961
      1960
      50
      94
      78
      190
      62
      17
      8
      83
      never
      6
      0
      0
      3.074737
      -0.9791
      1
    
    
      8
      1961
      1960
      53
      63
      92
      166
      25
      41
      18
      105
      one
      4
      0
      0
      4.993855
      -0.2263
      1
    
    
      9
      1961
      1960
      48
      57
      79
      124
      29
      24
      7
      83
      river
      5
      0
      0
      4.648387
      -0.3400
      1

Basic Exploratory Analysis



In [5]:

    
# Basic descriptions of the distribution of the data
df.describe()









    Out[5]:






  
    
      
      year
      decade
      unique_words
      density
      unique_words_raw
      density_raw
      nouns
      verbs
      adjectives
      syllables
      most_used_freq
      explicit
      total_curses
      reading_score
      sentiment
      charted
    
  
  
    
      count
      3832.000000
      3832.000000
      3832.000000
      3832.000000
      3832.000000
      3832.000000
      3832.000000
      3832.000000
      3832.000000
      3832.000000
      3832.000000
      3832.000000
      3832.000000
      3832.000000
      3832.000000
      3832.000000
    
    
      mean
      1990.846555
      1986.111691
      61.002610
      135.960334
      93.400574
      290.055063
      65.162839
      63.651618
      21.690501
      105.182933
      15.267223
      0.172495
      1.165449
      4.848724
      0.448467
      0.576722
    
    
      std
      13.434079
      13.296060
      35.334491
      79.853909
      45.509019
      167.711712
      43.881733
      42.320593
      17.618236
      56.083971
      13.407186
      0.377859
      4.904281
      3.536340
      0.750677
      0.494143
    
    
      min
      1961.000000
      1960.000000
      1.000000
      1.000000
      2.000000
      2.000000
      0.000000
      0.000000
      0.000000
      3.000000
      1.000000
      0.000000
      0.000000
      -3.427959
      -0.999700
      0.000000
    
    
      25%
      1981.000000
      1980.000000
      40.000000
      83.000000
      66.000000
      180.000000
      36.000000
      34.000000
      10.000000
      72.000000
      7.000000
      0.000000
      0.000000
      2.945868
      -0.025800
      0.000000
    
    
      50%
      1994.000000
      1990.000000
      53.000000
      117.000000
      84.000000
      254.000000
      54.000000
      56.000000
      17.000000
      93.000000
      12.000000
      0.000000
      0.000000
      4.136718
      0.926000
      1.000000
    
    
      75%
      2002.000000
      2000.000000
      69.000000
      164.000000
      107.000000
      352.000000
      80.000000
      83.000000
      28.000000
      120.000000
      19.000000
      0.000000
      0.000000
      5.707733
      0.988100
      1.000000
    
    
      max
      2009.000000
      2000.000000
      307.000000
      638.000000
      388.000000
      1415.000000
      376.000000
      346.000000
      205.000000
      481.000000
      234.000000
      1.000000
      88.000000
      84.237399
      0.999900
      1.000000



In [6]:

    
# # Get pair plot for major features of the dataset
pairplotify(
    df[
        ['year',
         'density',
         'unique_words',
         'total_curses', 
         'sentiment']
    ]
)









    Out[6]:





<seaborn.axisgrid.PairGrid at 0x7f26f6b76210>



In [7]:

    
# Drop both year and decade
dropList = ['most_used_term']
removeList = ['decade', 'year', 'charted']

df, features = wrangle(df, dropList, removeList, True)

df['sentiment_polarity'] = df.apply(sentiment, axis=1)  # classify sentiment score
df.head(10)









    Out[7]:






  
    
      
      year
      decade
      unique_words
      density
      unique_words_raw
      density_raw
      nouns
      verbs
      adjectives
      syllables
      most_used_freq
      explicit
      total_curses
      reading_score
      sentiment
      charted
      sentiment_polarity
    
  
  
    
      0
      1961
      1960
      36
      72
      65
      158
      34
      30
      10
      70
      7
      0
      0
      2.367848
      0.9901
      0
      1
    
    
      1
      1961
      1960
      45
      91
      74
      197
      45
      37
      19
      81
      6
      0
      0
      2.771777
      0.9712
      1
      1
    
    
      2
      1961
      1960
      54
      103
      88
      223
      45
      48
      17
      98
      10
      0
      0
      3.885650
      0.9974
      1
      1
    
    
      3
      1961
      1960
      42
      148
      66
      263
      81
      61
      36
      76
      24
      0
      0
      2.889886
      0.9993
      1
      1
    
    
      4
      1961
      1960
      28
      131
      60
      354
      56
      77
      5
      57
      38
      0
      0
      2.940000
      0.9812
      1
      1
    
    
      5
      1961
      1960
      50
      105
      79
      181
      39
      40
      24
      80
      13
      0
      0
      3.135470
      0.9744
      1
      1
    
    
      6
      1961
      1960
      39
      75
      70
      160
      28
      35
      21
      81
      13
      0
      0
      3.503750
      0.9978
      1
      1
    
    
      7
      1961
      1960
      50
      94
      78
      190
      62
      17
      8
      83
      6
      0
      0
      3.074737
      -0.9791
      1
      -1
    
    
      8
      1961
      1960
      53
      63
      92
      166
      25
      41
      18
      105
      4
      0
      0
      4.993855
      -0.2263
      1
      0
    
    
      9
      1961
      1960
      48
      57
      79
      124
      29
      24
      7
      83
      5
      0
      0
      4.648387
      -0.3400
      1
      0



In [8]:

    
# Plot features against time
features = [
    'density', 'unique_words', 'most_used_freq', 'verbs', 'nouns', 'adjectives', 'sentiment'
]



In [9]:

    
boxplotify(
    df, feature='density', path='densityXtime', title="Density of lyrics over decades"
)



In [10]:

    
boxplotify(
    df, feature='unique_words', path='unique_wordsXtime', 
    title="Total number of unique words in lyrics over decades"
)



In [11]:

    
boxplotify(
    df, feature='verbs', path='verbsXtime', 
    title="Total number of verbs in lyrics over decades"
)



In [12]:

    
boxplotify(
    df, feature='nouns', path='nounsXtime', 
    title="Total number of nouns in lyrics over decades"
)



In [13]:

    
boxplotify(
    df, feature='adjectives', path='adjectivesXtime', 
    title="Total number of adjectives in lyrics over decades")



In [14]:

    
boxplotify(
    df, feature='sentiment', path='sentimentXtime', 
    title="Sentiment score in lyrics over decades"
)



In [15]:

    
boxplotify(
    df, feature='most_used_freq', path='most_used_freqXtime', 
    title="Most used word frequency in lyrics over decades"
)



In [16]:

    
meanLine(
    df, feature='total_curses', path='profanityXtime', 
    title="Profanity in lyrics over decades"
)

Use model to view the best feature over decades



In [17]:

    
df = initSet()



In [18]:

    
def bestFeat(decade, df):
    
    # Keep both year and decade
    df_byDecade = df[df['decade'] == decade]
    dropList = ['most_used_term', 'reading_score']
    removeList = ['charted', 'year', 'decade']

    df_byDecade, features = wrangle(df_byDecade, dropList, removeList, True)

    # set X and y and build model
    X = df_byDecade[features]
    y = df_byDecade['charted']

    # Create separate training and test sets with 60/40 train/test split
    X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.4, random_state=1000
    )

    # train model with classifier and parameter determined best
    clf = ensemble.RandomForestClassifier(
        n_estimators=256, min_samples_leaf=2, max_depth=8
    )
    clf.fit(X_train, y_train)
    
    # Plot importances for all features
    features = X.columns
    feature_importances = clf.feature_importances_

    features_df = DataFrame(
        {
            'Year': decade,
            'Features': features,
            'Importance Score': feature_importances
        }
    )

    features_df.sort_values(
        'Importance Score', inplace=True, ascending=False)

    return features_df.head(1)



In [19]:

    
newFrame = []

for decade in xrange(1960, 2010, 10):
    newFrame.append(bestFeat(decade, df))

newDf = concat(newFrame)









    



/home/sabbir/Desktop/Heat-Replay/src/assets/models/util/dfmgmt.py:18: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df.drop(dropList, axis=1, inplace=True)



In [20]:

    
newDf = newDf[['Year', 'Features', 'Importance Score']]
newDf.reset_index(drop=True)









    Out[20]:






  
    
      
      Year
      Features
      Importance Score
    
  
  
    
      0
      1960
      sentiment
      0.124276
    
    
      1
      1970
      sentiment
      0.139255
    
    
      2
      1980
      density_raw
      0.164725
    
    
      3
      1990
      verbs
      0.205179
    
    
      4
      2000
      verbs
      0.191528



In [22]:

    
import matplotlib.pyplot as plt
from pandas import Series

freq_series = Series.from_array(newDf['Importance Score'])
plt.figure(figsize=(12, 8))

ax = freq_series.plot(kind='bar', color='rrgbb')
ax.set_title("Best Features Over Decades")
ax.set_xlabel("Decade")
ax.set_ylabel("Importance Score")

ax.set_xticklabels(newDf['Year'])

rects = ax.patches

# Now make some labels
labels = newDf['Features']
for rect, label in zip(rects, labels):
    ax.text(
        rect.get_x() + rect.get_width()/2, 
        rect.get_height(), label, 
        ha='center', va='bottom'
    )



In [ ]:

	year	decade	unique_words	density	unique_words_raw	density_raw	nouns	verbs	adjectives	syllables	most_used_term	most_used_freq	reading_score	sentiment	charted
0	1961	1960	36	72	65	158	34	30	10	70	rose	7	2.367848	0.9901	0
1	1961	1960	45	91	74	197	45	37	19	81	name	6	2.771777	0.9712	1
2	1961	1960	54	103	88	223	45	48	17	98	babi	10	3.885650	0.9974	1
3	1961	1960	42	148	66	263	81	61	36	76	love	24	2.889886	0.9993	1
4	1961	1960	28	131	60	354	56	77	5	57	come	38	2.940000	0.9812	1
5	1961	1960	50	105	79	181	39	40	24	80	mari	13	3.135470	0.9744	1
6	1961	1960	39	75	70	160	28	35	21	81	take	13	3.503750	0.9978	1
7	1961	1960	50	94	78	190	62	17	8	83	never	6	3.074737	-0.9791	1
8	1961	1960	53	63	92	166	25	41	18	105	one	4	4.993855	-0.2263	1
9	1961	1960	48	57	79	124	29	24	7	83	river	5	4.648387	-0.3400	1

	year	decade	unique_words	density	unique_words_raw	density_raw	nouns	verbs	adjectives	syllables	most_used_freq	explicit	total_curses	reading_score	sentiment	charted
count	3832.000000	3832.000000	3832.000000	3832.000000	3832.000000	3832.000000	3832.000000	3832.000000	3832.000000	3832.000000	3832.000000	3832.000000	3832.000000	3832.000000	3832.000000	3832.000000
mean	1990.846555	1986.111691	61.002610	135.960334	93.400574	290.055063	65.162839	63.651618	21.690501	105.182933	15.267223	0.172495	1.165449	4.848724	0.448467	0.576722
std	13.434079	13.296060	35.334491	79.853909	45.509019	167.711712	43.881733	42.320593	17.618236	56.083971	13.407186	0.377859	4.904281	3.536340	0.750677	0.494143
min	1961.000000	1960.000000	1.000000	1.000000	2.000000	2.000000	0.000000	0.000000	0.000000	3.000000	1.000000	0.000000	0.000000	-3.427959	-0.999700	0.000000
25%	1981.000000	1980.000000	40.000000	83.000000	66.000000	180.000000	36.000000	34.000000	10.000000	72.000000	7.000000	0.000000	0.000000	2.945868	-0.025800	0.000000
50%	1994.000000	1990.000000	53.000000	117.000000	84.000000	254.000000	54.000000	56.000000	17.000000	93.000000	12.000000	0.000000	0.000000	4.136718	0.926000	1.000000
75%	2002.000000	2000.000000	69.000000	164.000000	107.000000	352.000000	80.000000	83.000000	28.000000	120.000000	19.000000	0.000000	0.000000	5.707733	0.988100	1.000000
max	2009.000000	2000.000000	307.000000	638.000000	388.000000	1415.000000	376.000000	346.000000	205.000000	481.000000	234.000000	1.000000	88.000000	84.237399	0.999900	1.000000

	Year	Features	Importance Score
0	1960	sentiment	0.124276
1	1970	sentiment	0.139255
2	1980	density_raw	0.164725
3	1990	verbs	0.205179
4	2000	verbs	0.191528