Working with the lyrics Bag of Words


In [1]:
# Imports: stdlib first, then third-party libraries.
import sqlite3
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Global plotting style for every figure in this notebook.
# NOTE(review): the stopwords corpus must be available locally
# (nltk.download('stopwords')) for the cells below to run.
sns.set_palette('Dark2')
sns.set_style('whitegrid')

In [2]:
# Connect to the musiXmatch bag-of-words SQLite database.
# NOTE(review): path is relative to the notebook location — adjust if moved.
con_mxm = sqlite3.connect('../mxm_dataset.db')
cur_mxm = con_mxm.cursor()

# List every table available in the database.
tables = con_mxm.execute("SELECT name FROM sqlite_master WHERE type='table'")
table_names = tables.fetchall()

print('Tables within the database :')
# Loop over the result instead of hard-coding table_names[0][0] and
# table_names[1][0]: the original raised IndexError whenever the
# database held fewer than two tables.
for (table_name,) in table_names:
    print(table_name)


Tables within the database :
words
lyrics

In [3]:
# import the "words" table in a pandas DataFrame
# (the 5000-entry vocabulary of the dataset; the head shows very common
# words first, so presumably it is ordered by corpus frequency —
# TODO confirm against the dataset description)
words = pd.read_sql_query("SELECT * FROM words",con_mxm)
print(words.info())
words.head(5)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 1 columns):
word    5000 non-null object
dtypes: object(1)
memory usage: 39.1+ KB
None
Out[3]:
word
0 i
1 the
2 you
3 to
4 and

In [4]:
# import the "lyrics" table in a pandas DataFrame
# limit to 10055 rows so every included song is complete (120 songs);
# rows are (track, word, count) triples — a long-format bag of words
lyrics = pd.read_sql_query("SELECT *\
                           FROM lyrics\
                           ORDER BY track_id ASC\
                           LIMIT 10055",con_mxm)

print(lyrics.info())
lyrics.head(5)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10055 entries, 0 to 10054
Data columns (total 5 columns):
track_id    10055 non-null object
mxm_tid     10055 non-null int64
word        10055 non-null object
count       10055 non-null int64
is_test     10055 non-null int64
dtypes: int64(3), object(2)
memory usage: 392.9+ KB
None
Out[4]:
track_id mxm_tid word count is_test
0 TRAAAAV128F421A322 4623710 i 6 0
1 TRAAAAV128F421A322 4623710 the 4 0
2 TRAAAAV128F421A322 4623710 you 2 0
3 TRAAAAV128F421A322 4623710 to 2 0
4 TRAAAAV128F421A322 4623710 and 5 0

Removing stopwords


In [5]:
# Stopword list from NLTK; with no language argument, words() returns
# the stopwords of every available language combined.
stp_wds = stopwords.words()

In [6]:
# Keep only the vocabulary entries that are not stopwords.
stopword_mask = words.word.isin(stp_wds)
words_no_stopwords = words[~stopword_mask]
words_no_stopwords.head(5)


Out[6]:
word
26 love
28 know
35 like
38 time
43 go

In [7]:
# Drop every (track, word, count) row whose word is a stopword.
stopword_rows = lyrics.word.isin(stp_wds)
lyrics_no_stopwords = lyrics[~stopword_rows]
print(lyrics_no_stopwords.info())
lyrics_no_stopwords.head(5)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 5993 entries, 22 to 10054
Data columns (total 5 columns):
track_id    5993 non-null object
mxm_tid     5993 non-null int64
word        5993 non-null object
count       5993 non-null int64
is_test     5993 non-null int64
dtypes: int64(3), object(2)
memory usage: 280.9+ KB
None
Out[7]:
track_id mxm_tid word count is_test
22 TRAAAAV128F421A322 4623710 like 2 0
27 TRAAAAV128F421A322 4623710 got 1 0
28 TRAAAAV128F421A322 4623710 would 1 0
31 TRAAAAV128F421A322 4623710 seem 1 0
32 TRAAAAV128F421A322 4623710 someon 1 0

Function to choose a given number of songs


In [76]:
def get_n_songs(lyrics_df, n_songs=1, random=False):
    """Return the rows of `lyrics_df` belonging to `n_songs` tracks.

    Parameters
    ----------
    lyrics_df : pd.DataFrame
        Long-format lyrics table with a 'track_id' column.
    n_songs : int, default 1
        Number of distinct tracks to keep.
    random : bool, default False
        If True, pick the tracks at random (without replacement);
        otherwise keep the first `n_songs` tracks in frame order.

    Returns
    -------
    pd.DataFrame
        Subset of `lyrics_df`, or the whole frame when `n_songs`
        exceeds the number of available tracks.
    """
    # BUG FIX: the original read the global `lyrics` here instead of the
    # `lyrics_df` argument, silently ignoring the frame passed in.
    track_ids = lyrics_df.track_id.unique()

    if n_songs > len(track_ids):
        print('n_songs greater than the number of tracks ({}) ...'.format(len(track_ids)))
        print('... return the whole dataset')
        return lyrics_df

    if random:
        track_to_keep = np.random.choice(track_ids, n_songs, replace=False)
    else:
        # BUG FIX: the original `elif random == False` left `track_to_keep`
        # undefined for any non-bool truthy/falsy value of `random`.
        track_to_keep = track_ids[:n_songs]

    return lyrics_df[lyrics_df['track_id'].isin(track_to_keep)]

In [80]:
# Smoke test: draw 30 random songs and list which tracks were selected.
lyrics_sub = get_n_songs(lyrics, n_songs=30, random=True)
lyrics_sub.track_id.unique()


Out[80]:
array(['TRAAAFD128F92F423A', 'TRAAAJG128F9308A25', 'TRAABIG128F9356C56',
       'TRAABJV128F1460C49', 'TRAABOA128F933684A', 'TRAABOG128F42955B1',
       'TRAACRY12903CAF2C2', 'TRAACZN128F93236B1', 'TRAADBN128F932D00A',
       'TRAADLH12903CA70EE', 'TRAADNA128F9331246', 'TRAADQW128F427CE68',
       'TRAADSV128F42BC36A', 'TRAADYI128E078FB38', 'TRAAEEQ128F42180B2',
       'TRAAEJH128E0785506', 'TRAAEJQ128F92C484E', 'TRAAENC128F1451DE9',
       'TRAAERZ128F1496921', 'TRAAFEU128E078581C', 'TRAAFOH128E078BD7E',
       'TRAAFOY128F146CC17', 'TRAAFRM128F9320F58', 'TRAAFTE128F429545F',
       'TRAAGAM128F428DCBE', 'TRAAGMJ12903CAD7D4', 'TRAAGOZ128F92EB3D4',
       'TRAAHFR12903CA2491', 'TRAAHRF128F92FE234', 'TRAAHSY128F147BB5C'], dtype=object)

Function to pivot by chunks

Take into consideration sparsity


In [81]:
def pivot_by_chunks(lyrics_df, n_chunks=3, sparse=True):
    """Pivot a long-format lyrics frame into a track x word count matrix,
    processing the tracks in `n_chunks` batches to limit peak memory.

    Parameters
    ----------
    lyrics_df : pd.DataFrame
        Long-format table with 'track_id', 'word' and 'count' columns.
    n_chunks : int, default 3
        Number of batches the unique track list is split into.
    sparse : bool, default True
        Convert the result to a sparse frame when the running pandas
        version still provides DataFrame.to_sparse (removed in 1.0).

    Returns
    -------
    pd.DataFrame
        One row per track, one column per word; missing counts are 0.
    """
    track_chunks = np.array_split(lyrics_df.track_id.unique(), n_chunks)

    pivoted_chunks = []
    for i, chunk in enumerate(track_chunks):
        print('Processing chunk number {}'.format(i))
        df_chunk = lyrics_df[lyrics_df['track_id'].isin(chunk)]
        # BUG FIX: the original passed the global `words` DataFrame as the
        # `columns` argument; pivot_table needs the column NAME 'word'.
        pivoted_chunks.append(
            df_chunk.pivot_table(index='track_id', columns='word',
                                 values='count', fill_value=0))

    # One concat instead of repeated .append in a loop (quadratic, and
    # DataFrame.append was removed from modern pandas). Words absent
    # from a chunk become NaN in the union of columns -> fill with 0.
    pivot_df = pd.concat(pivoted_chunks).fillna(0)

    # BUG FIX: the `sparse` flag was accepted but never used by the original.
    if sparse and hasattr(pivot_df, 'to_sparse'):
        pivot_df = pivot_df.to_sparse(fill_value=0)

    return pivot_df

In [82]:
# Smoke test: pivot a random 30-song subset in two chunks.
lyrics_sub = get_n_songs(lyrics, n_songs=30, random=True)
test_df = pivot_by_chunks(lyrics_df=lyrics_sub, n_chunks=2)


Processing chunk number 0
Processing chunk number 1

In [84]:
# Inspect the resulting track x word matrix (memory footprint and head).
print(test_df.info())
test_df.head(5)


<class 'pandas.core.sparse.frame.SparseDataFrame'>
Index: 30 entries, TRAAAEW128F42930C0 to TRAAHOW12903D00E51
Columns: 1154 entries, & to ça
dtypes: float64(874), int64(280)
memory usage: 70.0+ KB
None
Out[84]:
& 100 3 a abl abov abr achiev across action ... yet yo york you young your yourself youth à ça
track_id
TRAAAEW128F42930C0 0 0.0 0.0 2 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0 0.0 0 0.0 0 0.0 0.0 0.0 0.0
TRAAAJG128F9308A25 0 0.0 0.0 4 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0 0.0 0 1.0 0 0.0 0.0 0.0 0.0
TRAAAUC128F428716F 0 0.0 0.0 10 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0 0.0 6 0.0 2 0.0 0.0 0.0 0.0
TRAABOA128F933684A 0 0.0 0.0 2 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0 0.0 1 0.0 0 0.0 1.0 0.0 0.0
TRAABOG128F42955B1 0 0.0 0.0 14 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0 1.0 2 0.0 0 0.0 0.0 0.0 0.0

5 rows × 1154 columns

Pivoting the Tables


In [ ]:
# create a table with track_id as index and word as columns
# (full dataset, stopwords included)
lyrics_mat = lyrics.pivot_table(index='track_id', columns='word', values='count',
                                fill_value=0)
print(lyrics_mat.info())
lyrics_mat.head(5)

In [ ]:
# Sparse representation, to compare memory usage with the dense frame.
# NOTE(review): DataFrame.to_sparse was removed in pandas 1.0.
lyrics_mat.to_sparse(fill_value=0).info()

In [ ]:
# create a table with track_id as index and word as columns (no stopwords)
lyrics_no_stopwords_mat =lyrics_no_stopwords.pivot_table(index='track_id', columns='word', values='count',
                                                         fill_value=0)
lyrics_no_stopwords_mat.head(5)

In [ ]:
# Memory footprint of the stopword-free matrix in sparse form.
# NOTE(review): DataFrame.to_sparse was removed in pandas 1.0.
lyrics_no_stopwords_mat.to_sparse(fill_value=0).info()

Projections

Using some dimensionality reduction

PCA


In [ ]:
from sklearn.decomposition import PCA

In [ ]:
# Project the track x word matrix onto its first two principal components.
pca = PCA(n_components=2)
lyrics_no_pca = pca.fit_transform(lyrics_no_stopwords_mat)

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10,10))

# Same scatter plotted twice: full extent (left) and a zoomed view (right).
ax[1].scatter(lyrics_no_pca[:,0], lyrics_no_pca[:,1], marker='.')
ax[1].axis('scaled')
ax[0].scatter(lyrics_no_pca[:,0], lyrics_no_pca[:,1], marker='.')
ax[0].axis('scaled')

# Zoom the right-hand panel around the bulk of the points.
ax[1].set_xlim(-5.0,8.0)
ax[1].set_ylim(-5.0,8.0);

Isomap


In [ ]:
from sklearn.manifold import Isomap

In [ ]:
# Non-linear 2-D embedding with Isomap (geodesic-distance preserving).
iso = Isomap(n_components=2)
lyrics_no_iso = iso.fit_transform(lyrics_no_stopwords_mat)

plt.figure(figsize=(10,10))

plt.scatter(lyrics_no_iso[:,0], lyrics_no_iso[:,1], marker='.')
plt.axis('scaled');

LLE


In [ ]:
from sklearn.manifold import LocallyLinearEmbedding

In [ ]:
# 2-D embedding with Locally Linear Embedding.
lle = LocallyLinearEmbedding(n_components=2)
lyrics_no_lle = lle.fit_transform(lyrics_no_stopwords_mat)

plt.figure(figsize=(10,10))

plt.scatter(lyrics_no_lle[:,0], lyrics_no_lle[:,1], marker='.')
plt.axis('scaled');

TSNE


In [ ]:
from sklearn.manifold import TSNE

In [ ]:
# 2-D embedding with t-SNE.
# NOTE(review): t-SNE is stochastic and no random_state is set, so this
# figure will differ between runs.
tsne = TSNE(n_components=2)
lyrics_no_tsne = tsne.fit_transform(lyrics_no_stopwords_mat)

plt.figure(figsize=(10,10))

plt.scatter(lyrics_no_tsne[:,0], lyrics_no_tsne[:,1], marker='.')
plt.axis('scaled');

MDS


In [ ]:
from sklearn.manifold import MDS

In [ ]:
# 2-D embedding with metric Multi-Dimensional Scaling.
# NOTE(review): MDS is stochastic and no random_state is set.
mds = MDS(n_components=2)
lyrics_no_mds = mds.fit_transform(lyrics_no_stopwords_mat)

plt.figure(figsize=(10,10))

plt.scatter(lyrics_no_mds[:,0], lyrics_no_mds[:,1], marker='.')
plt.axis('scaled');

Clustering


In [ ]:
from sklearn.cluster import AgglomerativeClustering

In [ ]:
# Agglomerative (hierarchical) clustering into 5 clusters.
# NOTE(review): agg_preds is not used further in the visible notebook.
agg = AgglomerativeClustering(n_clusters=5)
agg_preds = agg.fit_predict(lyrics_no_stopwords_mat)

In [ ]:
from sklearn.cluster import AffinityPropagation

In [ ]:
# Affinity Propagation picks the number of clusters itself; a high
# damping factor (0.95) is used, presumably to help convergence.
afp = AffinityPropagation(damping=0.95)
afp_preds = afp.fit_predict(lyrics_no_stopwords_mat)

In [ ]:
# Distinct cluster labels found by Affinity Propagation.
np.unique(afp_preds)

In [ ]:
# t-SNE embedding colored by the Affinity Propagation cluster labels.
plt.figure(figsize=(10,10))
plt.scatter(lyrics_no_tsne[:,0], lyrics_no_tsne[:,1], marker='.', c=afp_preds, cmap='gist_rainbow')
plt.axis('scaled');

In [ ]:
# Persist the dense track x word matrix for reuse outside this notebook.
lyrics_mat.to_csv('lyrics_pivot.csv')

In [ ]: