In [17]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer


import plotly.plotly as py
import plotly.tools as tls
tls.embed('https://plot.ly/~cufflinks/8')
import cufflinks as cf

import os

# Plotly credentials are read from the environment so that no secret is stored
# in the notebook. Sign-in is skipped when PLOTLY_API_KEY is not set; cufflinks
# is configured for offline use just below.
# NOTE(review): confirm no later cell needs an authenticated plotly session.
if os.environ.get('PLOTLY_API_KEY'):
    py.sign_in(os.environ.get('PLOTLY_USERNAME', 'maxrose'),
               os.environ['PLOTLY_API_KEY'])

cf.set_config_file(offline=True, world_readable=False, theme='ggplot')

# Both inputs are pipe-delimited text files with a header row.
df = pd.read_table('data/Allversion_noorig.csv', encoding='utf-8', header=0, delimiter='|')
bs = pd.read_table('data/allalbums.zip', encoding='utf-8', header=0, delimiter='|')

# Parse release dates so min/max and time-axis plots operate on datetimes.
bs['releasedate'] = pd.to_datetime(bs.releasedate)

# Default figure size for matplotlib-based plots in this notebook.
plt.rcParams['figure.figsize'] = 8, 6

bs.head(1)
#df.groupby('recordname').hist()


Out[17]:
artistid artist songname recordingid releasename release_group releasedate labelname countryid countryname workid rating
0 303 The Beatles And Your Bird Can Sing 15243 Revolver 212336 1987-04-30 Capitol Records 82 Ghana 15243 0

In [ ]:
# (rows, columns) of the release table, then the earliest release date in it.
# The parenthesised print produces the same output on Python 2 and Python 3.
print(bs.shape)
bs.releasedate.min()

In [18]:
# Collapse the per-release rows of `bs` into one row per
# (workid, artist, songname).
#
# Every output column is computed from an explicitly named source column and
# labelled through `keys`, so the label/data pairing does not depend on dict
# ordering and no nested-dict renamer is passed to `.agg` (that form was
# deprecated and later removed from pandas).
#
# NOTE(review): `avg_rating` is the per-song *maximum* rating, not a mean, and
# `countries` counts non-null `countryname` rows rather than distinct
# countries. Both are kept exactly as the original aggregation defined them;
# confirm that this is what is intended.
grouped = bs.groupby(['workid', 'artist', 'songname'])
bsagg = pd.concat(
    [
        grouped['releasedate'].max(),
        grouped['releasename'].count(),
        grouped['countryname'].count(),
        grouped['rating'].max(),
    ],
    axis=1,
    keys=['maxreleasedate', 'num_releases', 'countries', 'avg_rating'],
).reset_index()

In [ ]:
# Titles of the songs whose aggregated rating is above zero.
bsagg.loc[bsagg['avg_rating'] > 0, 'songname']

In [ ]:
# Print the column dtypes and non-null counts of the per-song aggregate.
bsagg.info()

In [ ]:
# Number of rows in `bs` carrying each song title. The two columns of the
# reset frame are relabelled positionally: title first, count second.
vc = bs.songname.value_counts().reset_index()
vc.columns = ['songname', 'timesrec']

# Attach the per-title count to every release row. Any `timesrec` column left
# over from an earlier run of this cell is dropped first; without that, a
# re-run would yield `timesrec_x` / `timesrec_y` and break `bs.timesrec`.
bs = pd.merge(bs.drop('timesrec', axis=1, errors='ignore'), vc, on='songname')
bs.head()

In [ ]:
#df['srcId'] = df.sourceartist.map({'The Beatles':0,'The Rolling Stones':1}) 
#bs.boxplot(kind='box')

In [ ]:
import plotly

# One offline trace: release date on the x axis, per-title row count on the y axis.
plotly.offline.iplot([dict(x=bs.releasedate, y=bs.timesrec)])

In [ ]:
# Correlate only the numeric/bool columns of `bs`. Selecting them explicitly
# avoids relying on corr() silently discarding text and datetime columns,
# which newer pandas releases no longer do.
sns.heatmap(bs.select_dtypes(include=[np.number, 'bool']).corr())

In [ ]:
# `scatter_matrix` lives in `pandas.plotting` in current pandas; the old
# `pandas.tools.plotting` location is kept as a fallback for older installs.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix

# Pairwise scatter plots of the columns of `bs`, with a KDE on the diagonal.
scatter_matrix(bs, alpha=0.5, figsize=(9, 9), diagonal='kde')

In [27]:
def detect_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``.

    The text is wrapped in a ``TextBlob`` and the ``polarity`` attribute of
    its ``sentiment`` is returned unchanged.
    """
    blob = TextBlob(text)
    return blob.sentiment.polarity

In [29]:
# Score every song title with detect_sentiment and store it per row.
bsagg['sentiment'] = bsagg['songname'].map(detect_sentiment)

In [33]:
# Box plot of title sentiment, one box per distinct `songname`.
# NOTE(review): `bsagg` is grouped on (workid, artist, songname), so most
# titles likely contribute a single point each; confirm whether `artist`
# was the intended grouping.
bsagg.boxplot(by='songname', column='sentiment')


Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x12130fa50>

In [39]:
# Per-artist row counts among songs whose title polarity exceeds 0.4.
# Grouping by the `bsagg['artist']` Series (aligned on the index) rather than
# by column name keeps `artist` among the counted columns in the output.
(
    bsagg.loc[bsagg['sentiment'] > 0.4]
    .groupby(bsagg['artist'])
    .count()
)


Out[39]:
workid artist songname maxreleasedate num_releases countries avg_rating sentiment
artist
The Beatles 30 30 30 30 30 30 30 30
The Rolling Stones 14 14 14 14 14 14 14 14

In [ ]: