In [17]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
import plotly.plotly as py
import plotly.tools as tls
import os

tls.embed('https://plot.ly/~cufflinks/8')
import cufflinks as cf

# Never keep account credentials in the notebook source: read them from the
# environment instead. Sign-in is skipped when they are not provided, since
# cufflinks is configured for offline rendering just below.
plotly_username = os.environ.get('PLOTLY_USERNAME')
plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if plotly_username and plotly_api_key:
    py.sign_in(plotly_username, plotly_api_key)

cf.set_config_file(offline=True, world_readable=False, theme='ggplot')
# Alternative input, currently not loaded:
#df = pd.read_table('stonesbeatles_raw01.csv', encoding= 'utf-8',header=0)

# Both inputs are read with the same options: UTF-8 text, header on the
# first row, '|' as the field delimiter.
read_options = dict(encoding='utf-8', header=0, delimiter='|')
df = pd.read_table('data/Allversion_noorig.csv', **read_options)
bs = pd.read_table('data/allalbums.zip', **read_options)

# Convert the release date column to datetime values.
bs['releasedate'] = pd.to_datetime(bs['releasedate'])
#df.sort_values(['artist', 'date_year','releasename'], axis=0, inplace=True)

# Default matplotlib figure size for the plots that follow.
plt.rcParams['figure.figsize'] = (8, 6)

# Display the first row as a quick check of the loaded columns.
bs.head(1)
#df.groupby('recordname').hist()
Out[17]:
In [ ]:
# (rows, columns) of the frame. The call form of print gives the same output
# for a single argument under Python 2 and also runs under Python 3.
print(bs.shape)
# Smallest value in the releasedate column (shown as the cell output).
bs.releasedate.min()
In [18]:
# Aggregation spec: source column -> {output column name: aggregation}.
# NOTE(review): 'avg_rating' is computed with 'max', not a mean - confirm
# which one is intended.
aggregations = {
    'releasename': {'num_releases': 'count'},
    'releasedate': {'maxreleasedate': 'max'},
    'countryname': {'countries': 'count'},
    'rating': {'avg_rating': 'max'},
}

# Passing the nested dict straight to .agg() relies on the deprecated
# "renaming dict" form, and labelling the result by position depends on dict
# iteration order, which can attach names to the wrong columns. Split the
# spec into a flat {column: function} mapping plus an explicit rename keyed
# by the source column instead.
agg_funcs = {}
agg_names = {}
for source_col, spec in aggregations.items():
    (out_name, func), = spec.items()  # each spec holds exactly one entry
    agg_funcs[source_col] = func
    agg_names[source_col] = out_name

group_keys = ['workid', 'artist', 'songname']
# Finer-grained alternative grouping that was tried:
#   ['workid', 'artist', 'songname', 'releasename', 'release_group', 'releasedate']
bsagg = (
    bs.groupby(group_keys)
    .agg(agg_funcs)
    .reset_index()
    .rename(columns=agg_names)
)

# Fix the column order explicitly so it never depends on dict ordering.
bsagg = bsagg[group_keys + ['maxreleasedate', 'num_releases', 'countries', 'avg_rating']]
In [ ]:
# Song names of the aggregated rows whose avg_rating value is above zero.
bsagg.loc[bsagg.avg_rating > 0, 'songname']
In [ ]:
# Print the column dtypes and non-null counts of the aggregated frame.
bsagg.info()
In [ ]:
# Number of rows in `bs` for each song name.
vc = bs.songname.value_counts().reset_index()
vc.columns = ['songname', 'timesrec']

# Attach the per-song count to every row. Any 'timesrec' column left over
# from an earlier run of this cell is dropped first, so re-running the cell
# does not produce 'timesrec_x' / 'timesrec_y' duplicates.
bs = pd.merge(bs.drop('timesrec', axis=1, errors='ignore'), vc, on='songname')
bs.head()
In [ ]:
#df['srcId'] = df.sourceartist.map({'The Beatles':0,'The Rolling Stones':1})
#bs.boxplot(kind='box')
In [ ]:
import plotly

# Single trace: release date on the x axis, per-song count on the y axis.
release_trace = dict(x=bs.releasedate, y=bs.timesrec)
plotly.offline.iplot([release_trace])
In [ ]:
# Pairwise correlation matrix of the columns of `bs`, drawn as a heatmap.
corr_matrix = bs.corr()
sns.heatmap(corr_matrix)
In [ ]:
#bsagg[['artist','num_releases','maxreleasedate']].scatter_matrix()
# scatter_matrix lives in pandas.plotting in current pandas; fall back to the
# old pandas.tools.plotting location for installations that predate the move.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix

# Pairwise scatter plots of the columns of `bs`, with a kernel density
# estimate on the diagonal.
scatter_matrix(bs, alpha=0.5, figsize=(9, 9), diagonal='kde')
In [27]:
def detect_sentiment(text):
    """Return the TextBlob sentiment polarity computed for ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity
In [29]:
# Score every song name with detect_sentiment and store the result per row.
bsagg['sentiment'] = bsagg['songname'].map(detect_sentiment)
In [33]:
# Box plot of the sentiment column, with one box per distinct 'songname'.
# NOTE(review): grouping by song name yields a box for every title - confirm
# this is the intended grouping column.
bsagg.boxplot(column='sentiment', by='songname')
Out[33]:
In [39]:
# Keep the rows whose sentiment score is above 0.4, then count the non-null
# values per artist. The grouping key is the artist Series of the unfiltered
# frame, which pandas aligns to the filtered rows by index.
is_positive = bsagg.sentiment > 0.4
positive_songs = bsagg[is_positive]
positive_songs.groupby(bsagg.artist).count()
Out[39]:
In [ ]: