Exploratory Data Analysis of Music Mood


In [1]:
%load_ext watermark
%watermark -a 'Sebastian Raschka' -d -v


Sebastian Raschka 07/12/2014 

CPython 3.4.2
IPython 2.3.0

Exploratory analysis based on the random 1000-song training data from the Million Song data set.



Reading in the Data


In [2]:
import pandas as pd
df = pd.read_csv('../../dataset/training/train_lyrics_1000.csv', usecols=range(7))
df.head()


Out[2]:
file artist title lyrics genre mood year
0 TRAAAAW128F429D538.h5 Casual I Didn't Mean To Verse One:\n\nAlright I might\nHave had a litt... Hip Hop/Rap sad 1994
1 TRAAAEF128F4273421.h5 Adam Ant Something Girls Adam Ant/Marco Pirroni\nEvery girl is a someth... Rock happy 1982
2 TRAAAFD128F92F423A.h5 Gob Face the Ashes I've just erased it's been a while, I've got a... Rock sad 2007
3 TRAABJV128F1460C49.h5 Lionel Richie Tonight Will Be Alright Little darling \nWhere you've been so long \nI... R&B happy 1986
4 TRAABLR128F423B7E3.h5 Blue Rodeo Floating Lead Vocal by Greg\n\nWell, these late night c... Rock sad 1987



Happy vs. Sad


In [3]:
from matplotlib import pyplot as plt
%matplotlib inline

In [4]:
blue = '#5A6FFA'
green = '#A3EB5B'

In [5]:
happy, sad = sum(df.loc[:, 'mood'] == 'happy'), sum(df.loc[:, 'mood'] == 'sad')
print(happy, sad)


446 554

In [6]:
from matplotlib import rcParams
rcParams['font.size'] = 18

piechart = plt.pie(
    (happy, sad),
    labels=('happy','sad'),
    shadow=True,
    colors=(green, blue),
    explode=(0,0.15), # space between slices 
    startangle=90,    # rotate conter-clockwise by 90 degrees
    autopct='%1.1f%%',# display fraction as percentages
)

plt.axis('equal')   
plt.tight_layout()
plt.savefig('./images/pie_happy_sad.eps', dpi=300)




Year Distribution


In [11]:
import numpy as np
import seaborn as sns

sns.set_style('whitegrid');

plt.hist(df['year'], bins=np.arange(1900, 2020,5))
plt.xlabel('year')
plt.ylabel('count')
plt.xlim([df['year'].min()-5, df['year'].max()+5])
plt.tight_layout()
plt.savefig('./images/histo_year.eps', dpi=300)




Genre Distribution


In [10]:
sns.set_style('whitegrid')
fp = sns.factorplot(x='genre', data=df)
fp.set_xticklabels(rotation=90)
plt.xlabel('')
plt.ylabel('Count')
plt.tight_layout()
plt.savefig('./images/bar_genre.eps', dpi=300)




Mood by Genre Distribution


In [13]:
gclass = df.groupby(['genre', 'mood']).size().unstack()

print(gclass)



fig = plt.figure(figsize=(10,4))

sns.set(style="white")

pos = np.arange(1,13)

# absolute values
plt.subplot(121)
plt.bar(pos, gclass.values[:,0], label='happy', color=green)
plt.bar(pos, gclass.values[:,1], bottom=gclass.values[:,0], label='sad', color=blue)
plt.xticks(pos+0.5, gclass.index, rotation='vertical')
plt.ylabel("Count")
plt.xlabel("")
plt.legend(loc='upper left')

plt.gca().yaxis.grid(True) 

# relative values

# normalize
gclass = (gclass.T / gclass.T.sum()).T 

plt.subplot(122)
plt.bar(pos, gclass.values[:,0], label='happy', color=green)
plt.bar(pos, gclass.values[:,1], bottom=gclass.values[:,0], label='sad', color=blue)
plt.xticks(pos+0.5, gclass.index, rotation='vertical')
plt.ylabel('Fraction')
plt.axhline(y=0.5, xmin=0, linewidth=2, color='black', alpha=0.5)
plt.xlabel('')
plt.tight_layout()
plt.savefig('./images/bar_genre_mood.eps', dpi=300)


mood         happy  sad
genre                  
Blues           17    7
Christian       27   17
Country         43   36
Dance          NaN    1
Electronic      19   21
Hip Hop/Rap     41   64
Jazz             2   12
Pop             47   35
R&B             30   23
Rock           205  328
Ska             10    7
Soul             5    3



Mood by Year Distribution


In [14]:
bins = np.arange(1960,2011,10)
happy_bins, b = np.histogram(df.loc[df.loc[:,'mood']=='happy', 'year'], bins=bins)
sad_bins, b = np.histogram(df.loc[df.loc[:,'mood']=='sad', 'year'], bins=bins)
year_bins, b = np.histogram(df.loc[:, 'year'], bins=bins)

fig = plt.figure(figsize=(10,4))

sns.set(style="white")

pos = np.arange(1,6)
labels = ['%s-%s' %(i, i+10) for i in np.arange(1960,2011,10)]

# absolute values
plt.subplot(121)
plt.bar(pos, happy_bins, label='happy', color=green)
plt.bar(pos, sad_bins, bottom=happy_bins, color=blue, label='sad')
plt.xticks(pos, labels, rotation=30)
plt.ylabel("Count")
plt.xlabel("")
plt.legend(loc='upper left')

plt.gca().yaxis.grid(True) 

# relative values

# normalize
happy_bins = happy_bins / year_bins
sad_bins = sad_bins / year_bins

plt.subplot(122)
plt.bar(pos, happy_bins, color=green)
plt.bar(pos, sad_bins, bottom=happy_bins, color=blue, label='sad')
plt.xticks(pos, labels, rotation='30')
plt.ylabel("Fraction")
plt.axhline(y=0.5, xmin=0, linewidth=2, color='black', alpha=0.5)
plt.xlabel("")
plt.tight_layout()
plt.savefig('./images/bar_year_mood.eps', dpi=300)






Word Clouds

Using the WordCloud package from https://github.com/amueller/word_cloud


In [3]:
%load_ext watermark
%watermark -a 'Sebastian Raschka' -d -v


The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Sebastian Raschka 24/11/2014 

CPython 2.7.8
IPython 2.3.0

In [4]:
%matplotlib inline



Word cloud of happy songs


In [74]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction import text 
from wordcloud import WordCloud, STOPWORDS


happy_words = ' '.join(df.loc[df['mood']=='happy', 'lyrics']).decode("utf-8", "replace")

happy_wordcloud = WordCloud( 
                      font_path='/Users/sebastian/Library/Fonts/ufonts.com_flux.ttf',
                      stopwords=STOPWORDS,
                      background_color='white',
                      width=800,
                      height=400
            ).generate(happy_words)

plt.imshow(happy_wordcloud)
plt.axis('off')
plt.savefig('./images/wordcloud_happy_all_w.png', dpi=300)
plt.show()




Wordcloud of sad songs


In [75]:
sad_words = ' '.join(df.loc[df['mood']=='sad', 'lyrics']).decode("utf-8", "replace")

sad_wordcloud = WordCloud( 
                      font_path='/Users/sebastian/Library/Fonts/ufonts.com_flux.ttf',
                      stopwords=STOPWORDS,
                      background_color='white',
                      width=800,
                      height=400
            ).generate(sad_words)

plt.imshow(sad_wordcloud)
plt.axis('off')
plt.savefig('./images/wordcloud_sad_all_w.png', dpi=300)
plt.show()




Wordcloud of all songs


In [77]:
words = ' '.join(df.loc[:, 'lyrics']).decode("utf-8", "replace")

wordcloud = WordCloud( 
                      font_path='/Users/sebastian/Library/Fonts/ufonts.com_flux.ttf',
                      stopwords=STOPWORDS,
                      background_color='white',
                      width=800,
                      height=400
            ).generate(words)

plt.imshow(wordcloud)
plt.axis('off')
plt.savefig('./images/wordcloud_all_w.png', dpi=300)
plt.show()