In [1]:
# Imports.
# NOTE: sklearn.cross_validation was deprecated in scikit-learn 0.18 and
# removed in 0.20; train_test_split now lives in sklearn.model_selection.
from pandas import concat, DataFrame
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from context import *  # NOTE(review): star import pollutes the namespace -- prefer explicit names
from util.dfmgmt import initSet, wrangle
from util.polarize import sentiment
from util.plots import boxplotify, pairplotify, meanLine
In [2]:
%matplotlib inline
In [3]:
# Load the prepared dataset and keep only tracks from before the 2010s
df = initSet()
pre_2010 = df['decade'] != 2010
df = df.loc[pre_2010]
In [4]:
# Peek at the first ten rows to sanity-check the load
df.head(n=10)
Out[4]:
In [5]:
# Summary statistics (count, mean, std, min/max, quartiles) for every
# numeric column -- a quick look at the distribution of the data
df.describe()
Out[5]:
In [6]:
# Pair plot across the major numeric features of the dataset
major_features = [
    'year',
    'density',
    'unique_words',
    'total_curses',
    'sentiment',
]
pairplotify(df[major_features])
Out[6]:
In [7]:
# Wrangle the frame: drop the raw most_used_term column and keep
# decade, year and charted out of the modelling feature list.
# (The original comment said only year/decade were removed; 'charted'
# is excluded from the features as well.)
drop_cols = ['most_used_term']
exclude_cols = ['decade', 'year', 'charted']
df, features = wrangle(df, drop_cols, exclude_cols, True)

# Classify each row's sentiment score into a polarity label
df['sentiment_polarity'] = df.apply(sentiment, axis=1)
df.head(10)
Out[7]:
In [8]:
# Features plotted against time in the cells below.
# NOTE(review): this list is not read by the plotting cells (they pass
# literal feature names) and it shadows the list returned by wrangle above.
features = [
    'density',
    'unique_words',
    'most_used_freq',
    'verbs',
    'nouns',
    'adjectives',
    'sentiment',
]
In [9]:
# Distribution of lyrical density per decade
boxplotify(df,
           feature='density',
           path='densityXtime',
           title="Density of lyrics over decades")
In [10]:
# Distribution of unique-word counts per decade
boxplotify(df,
           feature='unique_words',
           path='unique_wordsXtime',
           title="Total number of unique words in lyrics over decades")
In [11]:
# Distribution of verb counts per decade
boxplotify(df,
           feature='verbs',
           path='verbsXtime',
           title="Total number of verbs in lyrics over decades")
In [12]:
# Distribution of noun counts per decade
boxplotify(df,
           feature='nouns',
           path='nounsXtime',
           title="Total number of nouns in lyrics over decades")
In [13]:
# Distribution of adjective counts per decade
boxplotify(df,
           feature='adjectives',
           path='adjectivesXtime',
           title="Total number of adjectives in lyrics over decades")
In [14]:
# Distribution of sentiment scores per decade
boxplotify(df,
           feature='sentiment',
           path='sentimentXtime',
           title="Sentiment score in lyrics over decades")
In [15]:
# Distribution of the most-used-word frequency per decade
boxplotify(df,
           feature='most_used_freq',
           path='most_used_freqXtime',
           title="Most used word frequency in lyrics over decades")
In [16]:
# Mean curse-word count over time, as a line plot
meanLine(df,
         feature='total_curses',
         path='profanityXtime',
         title="Profanity in lyrics over decades")
In [17]:
df = initSet()
In [18]:
def bestFeat(decade, df):
    """Fit a random forest on one decade's tracks and return the single
    most important feature as a one-row DataFrame.

    decade -- the decade to subset on (e.g. 1960)
    df     -- the full dataset; must contain a 'decade' column
    """
    # Subset to the requested decade; keep year/decade (and the raw
    # most_used_term / reading_score columns) out of the feature list
    decade_df = df[df['decade'] == decade]
    decade_df, feats = wrangle(
        decade_df,
        ['most_used_term', 'reading_score'],
        ['charted', 'year', 'decade'],
        True
    )

    # Feature matrix and target, with a 60/40 train/test split
    X = decade_df[feats]
    y = decade_df['charted']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=1000
    )

    # Random forest with the classifier/parameters determined best earlier
    clf = ensemble.RandomForestClassifier(
        n_estimators=256, min_samples_leaf=2, max_depth=8
    )
    clf.fit(X_train, y_train)

    # Rank every feature by importance and keep only the top one
    scores = DataFrame(
        {
            'Year': decade,
            'Features': X.columns,
            'Importance Score': clf.feature_importances_
        }
    )
    return scores.sort_values('Importance Score', ascending=False).head(1)
In [19]:
# Collect the top feature for each decade from 1960 through 2000.
# range() replaces the Python-2-only xrange(): behavior is identical for
# this tiny sequence, and the cell stays runnable on Python 3.
newFrame = []
for decade in range(1960, 2010, 10):
    newFrame.append(bestFeat(decade, df))
# One concat at the end (not inside the loop) keeps this linear
newDf = concat(newFrame)
In [20]:
# Put the columns in reading order and display with a clean 0..n index
column_order = ['Year', 'Features', 'Importance Score']
newDf = newDf.loc[:, column_order]
newDf.reset_index(drop=True)
Out[20]:
In [22]:
# Bar chart of the best feature per decade, one labelled bar each.
import matplotlib.pyplot as plt
from pandas import Series

# Series.from_array was deprecated and removed (pandas 1.0); build the
# Series from the underlying values instead, which also gives the plain
# 0..n-1 index the bar positions rely on.
freq_series = Series(newDf['Importance Score'].values)

plt.figure(figsize=(12, 8))
# One color per bar. The old color='rrgbb' relied on matplotlib
# splitting a color string into characters, which newer versions reject;
# use an explicit list (assumes five bars, one per decade 1960-2000, as
# produced above -- adjust if the decade range changes).
ax = freq_series.plot(kind='bar', color=['r', 'r', 'g', 'b', 'b'])
ax.set_title("Best Features Over Decades")
ax.set_xlabel("Decade")
ax.set_ylabel("Importance Score")
ax.set_xticklabels(newDf['Year'])

# Annotate each bar with the name of its winning feature
rects = ax.patches
labels = newDf['Features']
for rect, label in zip(rects, labels):
    ax.text(
        rect.get_x() + rect.get_width() / 2,
        rect.get_height(), label,
        ha='center', va='bottom'
    )
In [ ]: