In [1]:
# Load the watermark extension, then print this notebook's author/date/version stamp.
%load_ext watermark
%watermark -a 'Sebastian Raschka' -d -v
Exploratory analysis of the random 1000-song training sample drawn from the Million Song Dataset.
In [2]:
import pandas as pd

# Read only the first seven columns of the 1000-song training file.
train_csv = '../../dataset/training/train_lyrics_1000.csv'
first_seven_columns = range(7)
df = pd.read_csv(train_csv, usecols=first_seven_columns)

# Show the first rows as the cell output.
df.head()
Out[2]:
In [3]:
# Pyplot interface used by every plotting cell; the magic below renders figures inline.
from matplotlib import pyplot as plt
%matplotlib inline
In [4]:
# Hex colours shared by the charts below: green is used for 'happy', blue for 'sad'.
blue = '#5A6FFA'
green = '#A3EB5B'
In [5]:
# Count the songs carrying each mood label, then print both totals.
mood_column = df['mood']
happy = (mood_column == 'happy').sum()
sad = (mood_column == 'sad').sum()
print(happy, sad)
In [6]:
from matplotlib import rcParams

# Global matplotlib setting: larger font for this and all following figures.
rcParams['font.size'] = 18

# Share of happy vs. sad songs as a pie chart.
pie_options = dict(
    labels=('happy', 'sad'),
    colors=(green, blue),
    explode=(0, 0.15),   # pull the 'sad' wedge away from the centre
    startangle=90,       # rotate the start counter-clockwise by 90 degrees
    autopct='%1.1f%%',   # annotate each wedge with its percentage
    shadow=True,
)
piechart = plt.pie((happy, sad), **pie_options)

plt.axis('equal')  # equal axis scaling so the pie is drawn as a circle
plt.tight_layout()
plt.savefig('./images/pie_happy_sad.eps', dpi=300)
In [11]:
import numpy as np
import seaborn as sns

sns.set_style('whitegrid')

# Histogram of the 'year' column in 5-year bins (edges 1900, 1905, ..., 2015).
years = df['year']
five_year_edges = np.arange(1900, 2020, 5)
plt.hist(years, bins=five_year_edges)
plt.xlabel('year')
plt.ylabel('count')

# Restrict the x-axis to the observed years plus a 5-year margin on each side.
first_year, last_year = years.min(), years.max()
plt.xlim([first_year - 5, last_year + 5])

plt.tight_layout()
plt.savefig('./images/histo_year.eps', dpi=300)
In [10]:
sns.set_style('whitegrid')

# Number of songs per genre.
# countplot states the counting explicitly. The earlier factorplot call relied
# on an implicit default plot kind, and factorplot has since been deprecated
# and removed from seaborn.
ax = sns.countplot(x='genre', data=df)

# Genre names are long, so turn the tick labels vertical.
plt.xticks(rotation=90)
ax.set_xlabel('')
ax.set_ylabel('Count')
plt.tight_layout()
plt.savefig('./images/bar_genre.eps', dpi=300)
In [13]:
# Songs per (genre, mood): one row per genre, one column per mood.
# unstack() leaves NaN where a genre has no song of a given mood; replace it
# with 0 so the stacked bars below still get a valid height and bottom.
gclass = df.groupby(['genre', 'mood']).size().unstack().fillna(0)
print(gclass)

fig = plt.figure(figsize=(10,4))
sns.set(style="white")

# One bar slot per genre that is actually present (no hard-coded genre count).
pos = np.arange(1, len(gclass) + 1)

# absolute values
# Columns are selected by mood label rather than by position, and the bars are
# explicitly centred so the ticks at `pos` line up with them regardless of the
# matplotlib default for bar alignment.
plt.subplot(121)
happy_counts = gclass['happy'].values
sad_counts = gclass['sad'].values
plt.bar(pos, happy_counts, align='center', label='happy', color=green)
plt.bar(pos, sad_counts, bottom=happy_counts, align='center', label='sad', color=blue)
plt.xticks(pos, gclass.index, rotation='vertical')
plt.ylabel("Count")
plt.xlabel("")
plt.legend(loc='upper left')
plt.gca().yaxis.grid(True)

# relative values
# normalize: divide every row by its total so each genre sums to 1
gclass = gclass.div(gclass.sum(axis=1), axis=0)

plt.subplot(122)
happy_fraction = gclass['happy'].values
sad_fraction = gclass['sad'].values
plt.bar(pos, happy_fraction, align='center', label='happy', color=green)
plt.bar(pos, sad_fraction, bottom=happy_fraction, align='center', label='sad', color=blue)
plt.xticks(pos, gclass.index, rotation='vertical')
plt.ylabel('Fraction')
# Reference line at an even happy/sad split.
plt.axhline(y=0.5, xmin=0, linewidth=2, color='black', alpha=0.5)
plt.xlabel('')
plt.tight_layout()
plt.savefig('./images/bar_genre_mood.eps', dpi=300)
In [14]:
# Decade bin edges 1960, 1970, ..., 2010 -> five bins.
bins = np.arange(1960,2011,10)

# Per-decade song counts for each mood and for all songs together.
# Only the counts are needed; the bin edges returned by np.histogram are dropped.
is_happy = df.loc[:, 'mood'] == 'happy'
is_sad = df.loc[:, 'mood'] == 'sad'
happy_bins = np.histogram(df.loc[is_happy, 'year'], bins=bins)[0]
sad_bins = np.histogram(df.loc[is_sad, 'year'], bins=bins)[0]
year_bins = np.histogram(df.loc[:, 'year'], bins=bins)[0]

fig = plt.figure(figsize=(10,4))
sns.set(style="white")

# One bar position and one label per bin. Both are derived from `bins`, so the
# number of tick labels always equals the number of ticks.
pos = np.arange(1, len(bins))
labels = ['%s-%s' % (lo, hi) for lo, hi in zip(bins[:-1], bins[1:])]

# absolute values
plt.subplot(121)
plt.bar(pos, happy_bins, align='center', label='happy', color=green)
plt.bar(pos, sad_bins, bottom=happy_bins, align='center', color=blue, label='sad')
plt.xticks(pos, labels, rotation=30)
plt.ylabel("Count")
plt.xlabel("")
plt.legend(loc='upper left')
plt.gca().yaxis.grid(True)

# relative values
# normalize: float totals force true division even where `/` on integer
# arrays would floor (Python 2)
year_totals = year_bins.astype(float)
happy_bins = happy_bins / year_totals
sad_bins = sad_bins / year_totals

plt.subplot(122)
plt.bar(pos, happy_bins, align='center', color=green, label='happy')
plt.bar(pos, sad_bins, bottom=happy_bins, align='center', color=blue, label='sad')
# Numeric rotation, matching the left panel (a string such as '30' is not a valid angle).
plt.xticks(pos, labels, rotation=30)
plt.ylabel("Fraction")
# Reference line at an even happy/sad split.
plt.axhline(y=0.5, xmin=0, linewidth=2, color='black', alpha=0.5)
plt.xlabel("")
plt.tight_layout()
plt.savefig('./images/bar_year_mood.eps', dpi=300)
The word clouds below are generated with the WordCloud package: https://github.com/amueller/word_cloud
In [3]:
# Load the watermark extension, then print this notebook's author/date/version stamp.
%load_ext watermark
%watermark -a 'Sebastian Raschka' -d -v
In [4]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
In [74]:
import os

import matplotlib.pyplot as plt
from sklearn.feature_extraction import text
from wordcloud import WordCloud, STOPWORDS

# Preferred font for the cloud. This is an absolute, user-specific path, so
# fall back to WordCloud's default font (font_path=None) when it is missing.
font_path = '/Users/sebastian/Library/Fonts/ufonts.com_flux.ttf'
if not os.path.exists(font_path):
    font_path = None

# Lyrics of all happy songs joined into a single text.
happy_words = ' '.join(df.loc[df['mood']=='happy', 'lyrics'])
# Python 2 produces a byte string here that has to be decoded. On Python 3 the
# join already yields text, and str has no .decode() there.
if isinstance(happy_words, bytes):
    happy_words = happy_words.decode("utf-8", "replace")

happy_wordcloud = WordCloud(
    font_path=font_path,
    stopwords=STOPWORDS,
    background_color='white',
    width=800,
    height=400
).generate(happy_words)

plt.imshow(happy_wordcloud)
plt.axis('off')
plt.savefig('./images/wordcloud_happy_all_w.png', dpi=300)
plt.show()
In [75]:
import os

# Preferred font for the cloud. This is an absolute, user-specific path, so
# fall back to WordCloud's default font (font_path=None) when it is missing.
font_path = '/Users/sebastian/Library/Fonts/ufonts.com_flux.ttf'
if not os.path.exists(font_path):
    font_path = None

# Lyrics of all sad songs joined into a single text.
sad_words = ' '.join(df.loc[df['mood']=='sad', 'lyrics'])
# Python 2 produces a byte string here that has to be decoded. On Python 3 the
# join already yields text, and str has no .decode() there.
if isinstance(sad_words, bytes):
    sad_words = sad_words.decode("utf-8", "replace")

sad_wordcloud = WordCloud(
    font_path=font_path,
    stopwords=STOPWORDS,
    background_color='white',
    width=800,
    height=400
).generate(sad_words)

plt.imshow(sad_wordcloud)
plt.axis('off')
plt.savefig('./images/wordcloud_sad_all_w.png', dpi=300)
plt.show()
In [77]:
import os

# Preferred font for the cloud. This is an absolute, user-specific path, so
# fall back to WordCloud's default font (font_path=None) when it is missing.
font_path = '/Users/sebastian/Library/Fonts/ufonts.com_flux.ttf'
if not os.path.exists(font_path):
    font_path = None

# Lyrics of every song, regardless of mood, joined into a single text.
words = ' '.join(df.loc[:, 'lyrics'])
# Python 2 produces a byte string here that has to be decoded. On Python 3 the
# join already yields text, and str has no .decode() there.
if isinstance(words, bytes):
    words = words.decode("utf-8", "replace")

wordcloud = WordCloud(
    font_path=font_path,
    stopwords=STOPWORDS,
    background_color='white',
    width=800,
    height=400
).generate(words)

plt.imshow(wordcloud)
plt.axis('off')
plt.savefig('./images/wordcloud_all_w.png', dpi=300)
plt.show()