Exploratory Data Analysis of Music Mood



In [1]:

    
%load_ext watermark
%watermark -a 'Sebastian Raschka' -d -v









    



Sebastian Raschka 07/12/2014 

CPython 3.4.2
IPython 2.3.0

Exploratory analysis based on the random 1000-song training data from the Million Song data set.

Reading in the Data



In [2]:

    
import pandas as pd
df = pd.read_csv('../../dataset/training/train_lyrics_1000.csv', usecols=range(7))
df.head()









    Out[2]:






  
    
      
      file
      artist
      title
      lyrics
      genre
      mood
      year
    
  
  
    
      0
       TRAAAAW128F429D538.h5
              Casual
              I Didn't Mean To
       Verse One:\n\nAlright I might\nHave had a litt...
       Hip Hop/Rap
         sad
       1994
    
    
      1
       TRAAAEF128F4273421.h5
            Adam Ant
               Something Girls
       Adam Ant/Marco Pirroni\nEvery girl is a someth...
              Rock
       happy
       1982
    
    
      2
       TRAAAFD128F92F423A.h5
                 Gob
                Face the Ashes
       I've just erased it's been a while, I've got a...
              Rock
         sad
       2007
    
    
      3
       TRAABJV128F1460C49.h5
       Lionel Richie
       Tonight Will Be Alright
       Little darling \nWhere you've been so long \nI...
               R&B
       happy
       1986
    
    
      4
       TRAABLR128F423B7E3.h5
          Blue Rodeo
                      Floating
       Lead Vocal by Greg\n\nWell, these late night c...
              Rock
         sad
       1987

Happy vs. Sad



In [3]:

    
from matplotlib import pyplot as plt
%matplotlib inline



In [4]:

    
blue = '#5A6FFA'
green = '#A3EB5B'



In [5]:

    
happy, sad = sum(df.loc[:, 'mood'] == 'happy'), sum(df.loc[:, 'mood'] == 'sad')
print(happy, sad)



In [6]:

    
from matplotlib import rcParams
rcParams['font.size'] = 18

piechart = plt.pie(
    (happy, sad),
    labels=('happy','sad'),
    shadow=True,
    colors=(green, blue),
    explode=(0,0.15), # space between slices 
    startangle=90,    # rotate conter-clockwise by 90 degrees
    autopct='%1.1f%%',# display fraction as percentages
)

plt.axis('equal')   
plt.tight_layout()
plt.savefig('./images/pie_happy_sad.eps', dpi=300)

Year Distribution



In [11]:

    
import numpy as np
import seaborn as sns

sns.set_style('whitegrid');

plt.hist(df['year'], bins=np.arange(1900, 2020,5))
plt.xlabel('year')
plt.ylabel('count')
plt.xlim([df['year'].min()-5, df['year'].max()+5])
plt.tight_layout()
plt.savefig('./images/histo_year.eps', dpi=300)

Genre Distribution



In [10]:

    
sns.set_style('whitegrid')
fp = sns.factorplot(x='genre', data=df)
fp.set_xticklabels(rotation=90)
plt.xlabel('')
plt.ylabel('Count')
plt.tight_layout()
plt.savefig('./images/bar_genre.eps', dpi=300)

Mood by Genre Distribution



In [13]:

    
gclass = df.groupby(['genre', 'mood']).size().unstack()

print(gclass)



fig = plt.figure(figsize=(10,4))

sns.set(style="white")

pos = np.arange(1,13)

# absolute values
plt.subplot(121)
plt.bar(pos, gclass.values[:,0], label='happy', color=green)
plt.bar(pos, gclass.values[:,1], bottom=gclass.values[:,0], label='sad', color=blue)
plt.xticks(pos+0.5, gclass.index, rotation='vertical')
plt.ylabel("Count")
plt.xlabel("")
plt.legend(loc='upper left')

plt.gca().yaxis.grid(True) 

# relative values

# normalize
gclass = (gclass.T / gclass.T.sum()).T 

plt.subplot(122)
plt.bar(pos, gclass.values[:,0], label='happy', color=green)
plt.bar(pos, gclass.values[:,1], bottom=gclass.values[:,0], label='sad', color=blue)
plt.xticks(pos+0.5, gclass.index, rotation='vertical')
plt.ylabel('Fraction')
plt.axhline(y=0.5, xmin=0, linewidth=2, color='black', alpha=0.5)
plt.xlabel('')
plt.tight_layout()
plt.savefig('./images/bar_genre_mood.eps', dpi=300)









    



mood         happy  sad
genre                  
Blues           17    7
Christian       27   17
Country         43   36
Dance          NaN    1
Electronic      19   21
Hip Hop/Rap     41   64
Jazz             2   12
Pop             47   35
R&B             30   23
Rock           205  328
Ska             10    7
Soul             5    3

Mood by Year Distribution



In [14]:

    
bins = np.arange(1960,2011,10)
happy_bins, b = np.histogram(df.loc[df.loc[:,'mood']=='happy', 'year'], bins=bins)
sad_bins, b = np.histogram(df.loc[df.loc[:,'mood']=='sad', 'year'], bins=bins)
year_bins, b = np.histogram(df.loc[:, 'year'], bins=bins)

fig = plt.figure(figsize=(10,4))

sns.set(style="white")

pos = np.arange(1,6)
labels = ['%s-%s' %(i, i+10) for i in np.arange(1960,2011,10)]

# absolute values
plt.subplot(121)
plt.bar(pos, happy_bins, label='happy', color=green)
plt.bar(pos, sad_bins, bottom=happy_bins, color=blue, label='sad')
plt.xticks(pos, labels, rotation=30)
plt.ylabel("Count")
plt.xlabel("")
plt.legend(loc='upper left')

plt.gca().yaxis.grid(True) 

# relative values

# normalize
happy_bins = happy_bins / year_bins
sad_bins = sad_bins / year_bins

plt.subplot(122)
plt.bar(pos, happy_bins, color=green)
plt.bar(pos, sad_bins, bottom=happy_bins, color=blue, label='sad')
plt.xticks(pos, labels, rotation='30')
plt.ylabel("Fraction")
plt.axhline(y=0.5, xmin=0, linewidth=2, color='black', alpha=0.5)
plt.xlabel("")
plt.tight_layout()
plt.savefig('./images/bar_year_mood.eps', dpi=300)

Word Clouds

Using the WordCloud package from https://github.com/amueller/word_cloud



In [3]:

    
%load_ext watermark
%watermark -a 'Sebastian Raschka' -d -v









    



The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Sebastian Raschka 24/11/2014 

CPython 2.7.8
IPython 2.3.0



In [4]:

    
%matplotlib inline

Word cloud of happy songs



In [74]:

    
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction import text 
from wordcloud import WordCloud, STOPWORDS


happy_words = ' '.join(df.loc[df['mood']=='happy', 'lyrics']).decode("utf-8", "replace")

happy_wordcloud = WordCloud( 
                      font_path='/Users/sebastian/Library/Fonts/ufonts.com_flux.ttf',
                      stopwords=STOPWORDS,
                      background_color='white',
                      width=800,
                      height=400
            ).generate(happy_words)

plt.imshow(happy_wordcloud)
plt.axis('off')
plt.savefig('./images/wordcloud_happy_all_w.png', dpi=300)
plt.show()

Wordcloud of sad songs



In [75]:

    
sad_words = ' '.join(df.loc[df['mood']=='sad', 'lyrics']).decode("utf-8", "replace")

sad_wordcloud = WordCloud( 
                      font_path='/Users/sebastian/Library/Fonts/ufonts.com_flux.ttf',
                      stopwords=STOPWORDS,
                      background_color='white',
                      width=800,
                      height=400
            ).generate(sad_words)

plt.imshow(sad_wordcloud)
plt.axis('off')
plt.savefig('./images/wordcloud_sad_all_w.png', dpi=300)
plt.show()

Wordcloud of all songs



In [77]:

    
words = ' '.join(df.loc[:, 'lyrics']).decode("utf-8", "replace")

wordcloud = WordCloud( 
                      font_path='/Users/sebastian/Library/Fonts/ufonts.com_flux.ttf',
                      stopwords=STOPWORDS,
                      background_color='white',
                      width=800,
                      height=400
            ).generate(words)

plt.imshow(wordcloud)
plt.axis('off')
plt.savefig('./images/wordcloud_all_w.png', dpi=300)
plt.show()

	file	artist	title	lyrics	genre	mood	year
0	TRAAAAW128F429D538.h5	Casual	I Didn't Mean To	Verse One:\n\nAlright I might\nHave had a litt...	Hip Hop/Rap	sad	1994
1	TRAAAEF128F4273421.h5	Adam Ant	Something Girls	Adam Ant/Marco Pirroni\nEvery girl is a someth...	Rock	happy	1982
2	TRAAAFD128F92F423A.h5	Gob	Face the Ashes	I've just erased it's been a while, I've got a...	Rock	sad	2007
3	TRAABJV128F1460C49.h5	Lionel Richie	Tonight Will Be Alright	Little darling \nWhere you've been so long \nI...	R&B	happy	1986
4	TRAABLR128F423B7E3.h5	Blue Rodeo	Floating	Lead Vocal by Greg\n\nWell, these late night c...	Rock	sad	1987