In [1]:
### Import the libraries used throughout the notebook.
### Load the cover-song data from CSV.
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
data = pd.read_csv('data/Influence_clean.csv', header=0, encoding='utf-8', delimiter='|')
data['minreleasedate'] = pd.to_datetime(data.minreleasedate)
data['times_covered'].fillna(0, inplace=True)
data['artistid'] = data.artist.map({'The Beatles':0, 'The Rolling Stones':1})
In [2]:
data['artist'] = data.artist.astype('category')
data['songname'] = data.songname.astype('category')
# Add column for year of release for simpler grouping.
data['year'] = data['minreleasedate'].apply(lambda x: x.year)
# Binary response: whether a song has been covered at all. This gives far better accuracy than predicting times_covered directly.
data['is_covered'] = data.times_covered.map(lambda x: 1 if x > 0 else 0)
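A quick sanity check on how balanced the new binary target is among original recordings (a minimal sketch):
In [ ]:
# Sketch: class balance of is_covered among original (non-cover) songs
data[data.is_cover == 0].is_covered.value_counts(normalize=True)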
In [3]:
# Fraction of original (non-cover) songs that are missing lyrics
data[(data.is_cover == 0) & (data.lyrics.isnull())].workid.count().astype(float) / data[(data.is_cover == 0)].workid.count()
Out[3]:
In [4]:
feature_cols = [ 'year','num_releases','lyric_sent','title_sent', 'countries', 'avg_rating']
X = data[data.is_cover == 0][feature_cols]
y = data[data.is_cover == 0].is_covered
y_regress = data[data.is_cover == 0].times_covered
print(X.shape)
print(y.shape)
print(y_regress.shape)
In [5]:
feature_range = range(1, len(feature_cols)+1)
# list to store the accuracy score for each value of max_features
Acc_scores = []
# use 10-fold cross-validation with each value of max_features (WARNING: SLOW!)
for feature in feature_range:
    rfclass = RandomForestClassifier(n_estimators=500, max_features=feature, random_state=50)
    acc_val_scores = cross_val_score(rfclass, X, y, cv=10, scoring='accuracy')
    Acc_scores.append(acc_val_scores.mean())
In [6]:
# plot max_features (x-axis) versus Accuracy score (y-axis)
plt.plot(feature_range, Acc_scores)
plt.xlabel('max_features')
plt.ylabel('Accuracy (higher is better)')
Out[6]:
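To read the best value off the curve rather than eyeballing it (a minimal sketch using the Acc_scores computed above):
In [ ]:
# Sketch: the max_features value with the highest mean cross-validated accuracy
best_idx = int(np.argmax(Acc_scores))
print(feature_range[best_idx], Acc_scores[best_idx])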
In [7]:
rfclass = RandomForestClassifier(n_estimators=175, max_features=6, oob_score=True, random_state=50)
rfclass.fit(X, y)
print(rfclass.oob_score_)
print(cross_val_score(rfclass, X, y, cv=10, scoring='accuracy').mean())
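The fitted forest also exposes per-feature importances; a minimal sketch listing them:
In [ ]:
# Sketch: feature importances from the fitted random forest classifier
pd.DataFrame({'feature': feature_cols, 'importance': rfclass.feature_importances_}).sort_values('importance', ascending=False)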
In [8]:
from sklearn.feature_selection import SelectFromModel
rfreg = RandomForestRegressor(n_estimators=100, max_features=6, random_state=111)
rfreg.fit(X,y_regress)
sfm = SelectFromModel(rfreg, threshold='mean', prefit=True)
X_important = sfm.transform(X)
print(X_important.shape[0],X_important.shape[1])
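To see which columns survived the mean-importance threshold, a minimal sketch using the selector's support mask:
In [ ]:
# Sketch: map the boolean support mask back to feature names
selected = np.array(feature_cols)[sfm.get_support()]
print(selected)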
In [9]:
rfreg = RandomForestRegressor(n_estimators=100, max_features=3, random_state=111)
scores = cross_val_score(rfreg, X_important, y_regress, cv=10, scoring='neg_mean_squared_error')
np.mean(np.sqrt(-scores))
Out[9]:
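For context, a minimal sketch of the null RMSE (always predicting the mean number of covers):
In [ ]:
# Sketch: null RMSE baseline for the regression target
np.sqrt(((y_regress - y_regress.mean()) ** 2).mean())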
90% vs. 28%
In [10]:
treeclf = DecisionTreeClassifier(max_depth=15, random_state=123)
treeclf.fit(X, y)
scores = cross_val_score(treeclf, X, y, cv=10)
scores.mean()
Out[10]:
In [243]:
pd.DataFrame({'feature': feature_cols, 'importance': treeclf.feature_importances_}).sort_values('importance').plot(kind='bar', x='feature', figsize=(16,5), fontsize=14, title="Feature Importance")
Out[243]:
While The Beatles disbanded in 1970, The Rolling Stones continue to this day. However, the Stones' early work appears far more influential than their later output, with the greatest body of influence more or less paralleling that of the more-covered Beatles. Let's put a number on that, shall we?
In [244]:
yticks = np.arange(100, 1000, 100)
data[data.times_covered > 0].groupby('year').times_covered.sum().plot(kind='bar',x='year',y='times_covered',figsize=(16,9))
plt.yticks(yticks)
plt.title('Total songs covered by year of original',size =28)
plt.ylabel('Times Covered (Sum)', size = 24)
Out[244]:
In [13]:
### First plot is The Beatles, second The Rolling Stones: sum of times_covered by year of the song's original release.
bar = data.sort_values(by='year').groupby(['year', 'artist'])['times_covered'].sum().unstack('artist')
yticks = np.arange(25, 1000, 50)
bar.plot(kind='bar', stacked=True, figsize=(16,12), subplots=True)
plt.yticks(yticks)
plt.title('Total songs covered per Artist, by year of original',size =24)
plt.ylabel('Times Covered', size = 24)
Out[13]:
In [246]:
# Throw out covers recorded by each band and see what percentage of their catalogs have been covered.
inf = data[(data.is_cover == 0) & (data.is_covered == 1)].groupby('artist').workid.count()/data[(data.is_cover == 0)].groupby('artist').workid.count()
inf.plot(kind='bar', figsize=(16,5),fontsize=18,rot=40)
Out[246]:
In [191]:
# Create new DataFrame with Top 10 most covered songs for each band.
top10 = data[(data.artistid == 1) & (data.is_cover == 0)][['artist','songname','minreleasedate', 'is_cover','times_covered']].sort_values(by='times_covered', ascending=False)[:10]
top10 = pd.concat([top10, data[(data.artistid == 0) & (data.is_cover == 0)][['artist','songname','minreleasedate', 'is_cover','times_covered']].sort_values(by='times_covered', ascending=False)[:10]])
In [194]:
top10['dateint'] = top10['minreleasedate'].apply(lambda x: x.year)
colors = np.where(top10.artist == 'The Beatles', 'b', 'g')
top10.groupby('artist').plot(kind='bar', x='songname', y='times_covered',color=colors,rot=30,fontsize=14,legend=True,figsize=(16,5))
#top10.plot(kind='scatter', x='dateint', y='times_covered', s=220,c=colors, legend='artist',figsize=(12,8))
plt.title('Top 10 Covered Per Artist',size=20)
plt.xlabel('Song')
plt.ylabel('Times Covered')
Out[194]:
In [247]:
print(top10.groupby('artist').times_covered.mean())
In [15]:
data[data.is_covered > 0].groupby(['artist', 'year']).songname.count().unstack('artist').plot(kind='bar', subplots=True, figsize=(16,9))
plt.title("Original Songs Released Per Year", size=20)
Out[15]:
In [204]:
bar2 = data[data.year < 2000].sort_values(by='year', ascending=True).groupby(['year', 'artist'])['times_covered'].sum().unstack('artist')
bar2.plot(kind='area', figsize=(16,5), fontsize=16)
plt.title("Total Covers by Year of Original Release, Per Artist", size=20)
Out[204]:
In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=101)
In [18]:
# Compute the null RMSE: a baseline that predicts the mean of y_test for every observation
y_null = np.zeros_like(y_test, dtype=float)
# fill the array with the mean value of y_test
y_null.fill(y_test.mean())
np.sqrt(metrics.mean_squared_error(y_test, y_null))
Out[18]:
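Since y is binary, the more natural classification baseline is null accuracy (always predicting the majority class) — a minimal sketch:
In [ ]:
# Sketch: null accuracy for the binary is_covered target
print(max(y_test.mean(), 1 - y_test.mean()))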
In [19]:
logreg = LogisticRegression(C=1e9)
#solver='newton-cg',multi_class='multinomial',max_iter=100
logreg.fit(X_train, y_train)
list(zip(feature_cols, logreg.coef_[0]))
Out[19]:
In [20]:
print(cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean())
In [21]:
y_pred_prob = logreg.predict_proba(X_test)[:, 1]
print(y_pred_prob.mean())
In [22]:
# plot ROC curve
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
Out[22]:
In [23]:
# calculate AUC
print(metrics.roc_auc_score(y_test, y_pred_prob))
print(cross_val_score(logreg, X, y, cv=10, scoring='roc_auc').mean())
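A minimal sketch of the confusion matrix at the default 0.5 threshold, to see where the errors fall:
In [ ]:
# Sketch: confusion matrix for logistic regression on the held-out test set
print(metrics.confusion_matrix(y_test, logreg.predict(X_test)))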
In [248]:
data[['artist','lyric_sent']].groupby('artist').boxplot(return_type='axes',figsize=(16,6),fontsize=16)
Out[248]:
In [ ]: