notebook.community

Edit and run



In [17]:

    
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer


import plotly.plotly as py
import plotly.tools as tls
tls.embed('https://plot.ly/~cufflinks/8')
import cufflinks as cf

py.sign_in('maxrose', '••••••••••')

cf.set_config_file(offline=True, world_readable=False, theme='ggplot')

#df = pd.read_table('stonesbeatles_raw01.csv', encoding= 'utf-8',header=0)
# Both inputs are pipe-delimited UTF-8 text with a header row.
df = pd.read_csv('data/Allversion_noorig.csv', sep='|', encoding='utf-8', header=0)
bs = pd.read_csv('data/allalbums.zip', sep='|', encoding='utf-8', header=0)

# Parse the release dates so they can be compared and aggregated as timestamps.
bs['releasedate'] = pd.to_datetime(bs['releasedate'])

# Default size for matplotlib figures in this notebook.
plt.rcParams['figure.figsize'] = (8, 6)

bs.head(1)
#df.groupby('recordname').hist()









    











    Out[17]:






  
    
      
      artistid
      artist
      songname
      recordingid
      releasename
      release_group
      releasedate
      labelname
      countryid
      countryname
      workid
      rating
    
  
  
    
      0
      303
      The Beatles
      And Your Bird Can Sing
      15243
      Revolver
      212336
      1987-04-30
      Capitol Records
      82
      Ghana
      15243
      0



In [ ]:

    
# Show the dimensions of `bs`, then display its earliest release date.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(bs.shape)
bs.releasedate.min()



In [18]:

    
#bs.releasedate = bs.releasedate.str.replace('- ', '-')
#bs.releasedate = bs.releasedate.fillna(0)
#bs[bs.releasedate == 0].head()

#bsagg = bs.groupby(['workid', 'artist', 'songname', 'releasename'])['countryname'].value_counts()


# Collapse `bs` to one row per (workid, artist, songname).
#
# A flat {column: function} spec is used instead of the nested
# {column: {new_name: function}} form: pandas deprecated the nested renamer in
# 0.20 and raises SpecificationError for it from 1.0. Result columns are then
# renamed by label rather than by position, so the names cannot be attached to
# the wrong aggregate when dict ordering differs between interpreters.
aggregations = {
    'releasename': 'count',
    'releasedate': 'max',
    'countryname': 'count',
    'rating': 'max',
}
aggregated_names = {
    'releasename': 'num_releases',
    'releasedate': 'maxreleasedate',
    'countryname': 'countries',
    # NOTE(review): named "avg" but computed with 'max' -- confirm which is intended.
    'rating': 'avg_rating',
}
bsagg = (
    bs.groupby(['workid', 'artist', 'songname'])
    .agg(aggregations)
    .rename(columns=aggregated_names)
    .reset_index()
)
# Keep the column order the rest of the notebook was written against.
bsagg = bsagg[['workid', 'artist', 'songname', 'maxreleasedate', 'num_releases', 'countries', 'avg_rating']]



In [ ]:

    
# Song names of the aggregated rows whose avg_rating is above zero.
bsagg.loc[bsagg['avg_rating'] > 0, 'songname']



In [ ]:

    
# Print pandas' summary of the aggregated frame `bsagg`.
bsagg.info()



In [ ]:

    
# Count how many rows of `bs` carry each song name.
vc = bs.songname.value_counts().reset_index()
vc.columns = ['songname', 'timesrec']

# Attach the per-song count to every row of `bs`. Any 'timesrec' column left by
# a previous run of this cell is dropped first; otherwise re-running the cell
# would produce timesrec_x / timesrec_y and break later uses of bs.timesrec.
bs = pd.merge(bs.drop('timesrec', axis=1, errors='ignore'), vc, on='songname')
bs.head()



In [ ]:

    
#df['srcId'] = df.sourceartist.map({'The Beatles':0,'The Rolling Stones':1}) 
#bs.boxplot(kind='box')



In [ ]:

    
import plotly

# Single trace: release date on x, per-song row count (timesrec) on y.
release_trace = dict(x=bs.releasedate, y=bs.timesrec)
plotly.offline.iplot([release_trace])



In [ ]:

    
# Heatmap of pairwise correlations between the numeric columns of `bs`.
# The numeric columns are selected explicitly because DataFrame.corr() raises
# on string columns (artist, songname, ...) in pandas >= 2.0 instead of
# silently skipping them as older releases did.
sns.heatmap(bs.select_dtypes(include=[np.number]).corr())



In [ ]:

    
#bsagg[['artist','num_releases','maxreleasedate']].scatter_matrix()
try:
    # Location of scatter_matrix since pandas 0.20.
    from pandas.plotting import scatter_matrix
except ImportError:
    # Older pandas only ships the since-removed pandas.tools.plotting module.
    from pandas.tools.plotting import scatter_matrix

# Pairwise scatter plots of `bs`, with kernel-density estimates on the diagonal.
scatter_matrix(bs, alpha=0.5, figsize=(9, 9), diagonal='kde')



In [27]:

    
def detect_sentiment(text):
    """Return the TextBlob sentiment polarity of `text`."""
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return sentiment.polarity



In [29]:

    
# Score every song title with detect_sentiment and store the result on bsagg.
bsagg['sentiment'] = bsagg['songname'].map(detect_sentiment)



In [33]:

    
# Box plot of the 'sentiment' column, drawing one box per distinct songname.
# NOTE(review): songname is one of the keys bsagg was grouped on, so each box
# presumably holds very few values; by='artist' may be what was intended -- confirm.
bsagg.boxplot(column='sentiment', by='songname')









    Out[33]:





<matplotlib.axes._subplots.AxesSubplot at 0x12130fa50>



In [39]:

    
# Rows whose title sentiment exceeds 0.4, counted per artist.
# Grouping by the bsagg.artist Series (aligned on index) rather than by column
# name keeps 'artist' among the counted columns.
is_positive = bsagg['sentiment'] > 0.4
bsagg[is_positive].groupby(bsagg['artist']).count()









    Out[39]:






  
    
      
      workid
      artist
      songname
      maxreleasedate
      num_releases
      countries
      avg_rating
      sentiment
    
    
      artist
      
      
      
      
      
      
      
      
    
  
  
    
      The Beatles
      30
      30
      30
      30
      30
      30
      30
      30
    
    
      The Rolling Stones
      14
      14
      14
      14
      14
      14
      14
      14



In [ ]:

	workid	artist	songname	maxreleasedate	num_releases	countries	avg_rating	sentiment
artist
The Beatles	30	30	30	30	30	30	30	30
The Rolling Stones	14	14	14	14	14	14	14	14