Working with the lyrics Bag of Words


In [1]:
# Imports: stdlib first, then third-party libraries.
import sqlite3
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Global plotting style for every figure in this notebook.
# NOTE(review): the stopwords corpus must be available locally
# (nltk.download('stopwords')) for the cells below to run.
sns.set_palette('Dark2')
sns.set_style('whitegrid')

In [2]:
# Connect to the musiXmatch bag-of-words SQLite database.
# NOTE(review): path is relative to the notebook location — adjust if moved.
con_mxm = sqlite3.connect('../mxm_dataset.db')
cur_mxm = con_mxm.cursor()

# List every table available in the database.
tables = con_mxm.execute("SELECT name FROM sqlite_master WHERE type='table'")
table_names = tables.fetchall()

print('Tables within the database :')
# Loop over the result instead of hard-coding table_names[0][0] and
# table_names[1][0]: the original raised IndexError whenever the
# database held fewer than two tables.
for (table_name,) in table_names:
    print(table_name)


Tables within the database :
words
lyrics

In [3]:
# import the "words" table in a pandas DataFrame
# (the 5000-entry vocabulary of the dataset; the head shows very common
# words first, so presumably it is ordered by corpus frequency —
# TODO confirm against the dataset description)
words = pd.read_sql_query("SELECT * FROM words",con_mxm)
print(words.info())
words.head(5)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 1 columns):
word    5000 non-null object
dtypes: object(1)
memory usage: 39.1+ KB
None
Out[3]:
word
0 i
1 the
2 you
3 to
4 and

In [4]:
# import the "lyrics" table in a pandas DataFrame
# limit to 10055 rows so every included song is complete (120 songs);
# rows are (track, word, count) triples — a long-format bag of words
lyrics = pd.read_sql_query("SELECT *\
                           FROM lyrics\
                           ORDER BY track_id ASC\
                           LIMIT 10055",con_mxm)

print(lyrics.info())
lyrics.head(5)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10055 entries, 0 to 10054
Data columns (total 5 columns):
track_id    10055 non-null object
mxm_tid     10055 non-null int64
word        10055 non-null object
count       10055 non-null int64
is_test     10055 non-null int64
dtypes: int64(3), object(2)
memory usage: 392.9+ KB
None
Out[4]:
track_id mxm_tid word count is_test
0 TRAAAAV128F421A322 4623710 i 6 0
1 TRAAAAV128F421A322 4623710 the 4 0
2 TRAAAAV128F421A322 4623710 you 2 0
3 TRAAAAV128F421A322 4623710 to 2 0
4 TRAAAAV128F421A322 4623710 and 5 0

Removing stopwords


In [5]:
# Stopword list from NLTK; with no language argument, words() returns
# the stopwords of every available language combined.
stp_wds = stopwords.words()

In [6]:
# Keep only the vocabulary entries that are not stopwords.
stopword_mask = words.word.isin(stp_wds)
words_no_stopwords = words[~stopword_mask]
words_no_stopwords.head(5)


Out[6]:
word
26 love
28 know
35 like
38 time
43 go

In [7]:
# Drop every (track, word, count) row whose word is a stopword.
stopword_rows = lyrics.word.isin(stp_wds)
lyrics_no_stopwords = lyrics[~stopword_rows]
print(lyrics_no_stopwords.info())
lyrics_no_stopwords.head(5)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 5993 entries, 22 to 10054
Data columns (total 5 columns):
track_id    5993 non-null object
mxm_tid     5993 non-null int64
word        5993 non-null object
count       5993 non-null int64
is_test     5993 non-null int64
dtypes: int64(3), object(2)
memory usage: 280.9+ KB
None
Out[7]:
track_id mxm_tid word count is_test
22 TRAAAAV128F421A322 4623710 like 2 0
27 TRAAAAV128F421A322 4623710 got 1 0
28 TRAAAAV128F421A322 4623710 would 1 0
31 TRAAAAV128F421A322 4623710 seem 1 0
32 TRAAAAV128F421A322 4623710 someon 1 0

Function to choose a given number of songs


In [76]:
def get_n_songs(lyrics_df, n_songs=1, random=False):
    """Return the rows of `lyrics_df` belonging to `n_songs` tracks.

    Parameters
    ----------
    lyrics_df : pd.DataFrame
        Long-format lyrics table with a 'track_id' column.
    n_songs : int, default 1
        Number of distinct tracks to keep.
    random : bool, default False
        If True, pick the tracks at random (without replacement);
        otherwise keep the first `n_songs` tracks in frame order.

    Returns
    -------
    pd.DataFrame
        Subset of `lyrics_df`, or the whole frame when `n_songs`
        exceeds the number of available tracks.
    """
    # BUG FIX: the original read the global `lyrics` here instead of the
    # `lyrics_df` argument, silently ignoring the frame passed in.
    track_ids = lyrics_df.track_id.unique()

    if n_songs > len(track_ids):
        print('n_songs greater than the number of tracks ({}) ...'.format(len(track_ids)))
        print('... return the whole dataset')
        return lyrics_df

    if random:
        track_to_keep = np.random.choice(track_ids, n_songs, replace=False)
    else:
        # BUG FIX: the original `elif random == False` left `track_to_keep`
        # undefined for any non-bool truthy/falsy value of `random`.
        track_to_keep = track_ids[:n_songs]

    return lyrics_df[lyrics_df['track_id'].isin(track_to_keep)]

In [80]:
# Smoke test: draw 30 random songs and list which tracks were selected.
lyrics_sub = get_n_songs(lyrics, n_songs=30, random=True)
lyrics_sub.track_id.unique()


Out[80]:
array(['TRAAAFD128F92F423A', 'TRAAAJG128F9308A25', 'TRAABIG128F9356C56',
       'TRAABJV128F1460C49', 'TRAABOA128F933684A', 'TRAABOG128F42955B1',
       'TRAACRY12903CAF2C2', 'TRAACZN128F93236B1', 'TRAADBN128F932D00A',
       'TRAADLH12903CA70EE', 'TRAADNA128F9331246', 'TRAADQW128F427CE68',
       'TRAADSV128F42BC36A', 'TRAADYI128E078FB38', 'TRAAEEQ128F42180B2',
       'TRAAEJH128E0785506', 'TRAAEJQ128F92C484E', 'TRAAENC128F1451DE9',
       'TRAAERZ128F1496921', 'TRAAFEU128E078581C', 'TRAAFOH128E078BD7E',
       'TRAAFOY128F146CC17', 'TRAAFRM128F9320F58', 'TRAAFTE128F429545F',
       'TRAAGAM128F428DCBE', 'TRAAGMJ12903CAD7D4', 'TRAAGOZ128F92EB3D4',
       'TRAAHFR12903CA2491', 'TRAAHRF128F92FE234', 'TRAAHSY128F147BB5C'], dtype=object)

Function to pivot by chunks

Take into consideration sparsity


In [81]:
def pivot_by_chunks(lyrics_df, n_chunks=3, sparse=True):
    """Pivot a long-format lyrics frame into a track x word count matrix,
    processing the tracks in `n_chunks` batches to limit peak memory.

    Parameters
    ----------
    lyrics_df : pd.DataFrame
        Long-format table with 'track_id', 'word' and 'count' columns.
    n_chunks : int, default 3
        Number of batches the unique track list is split into.
    sparse : bool, default True
        Convert the result to a sparse frame when the running pandas
        version still provides DataFrame.to_sparse (removed in 1.0).

    Returns
    -------
    pd.DataFrame
        One row per track, one column per word; missing counts are 0.
    """
    track_chunks = np.array_split(lyrics_df.track_id.unique(), n_chunks)

    pivoted_chunks = []
    for i, chunk in enumerate(track_chunks):
        print('Processing chunk number {}'.format(i))
        df_chunk = lyrics_df[lyrics_df['track_id'].isin(chunk)]
        # BUG FIX: the original passed the global `words` DataFrame as the
        # `columns` argument; pivot_table needs the column NAME 'word'.
        pivoted_chunks.append(
            df_chunk.pivot_table(index='track_id', columns='word',
                                 values='count', fill_value=0))

    # One concat instead of repeated .append in a loop (quadratic, and
    # DataFrame.append was removed from modern pandas). Words absent
    # from a chunk become NaN in the union of columns -> fill with 0.
    pivot_df = pd.concat(pivoted_chunks).fillna(0)

    # BUG FIX: the `sparse` flag was accepted but never used by the original.
    if sparse and hasattr(pivot_df, 'to_sparse'):
        pivot_df = pivot_df.to_sparse(fill_value=0)

    return pivot_df

In [82]:
# Smoke test: pivot a random 30-song subset in two chunks.
lyrics_sub = get_n_songs(lyrics, n_songs=30, random=True)
test_df = pivot_by_chunks(lyrics_df=lyrics_sub, n_chunks=2)


Processing chunk number 0
Processing chunk number 1

In [84]:
# Inspect the resulting track x word matrix (memory footprint and head).
print(test_df.info())
test_df.head(5)


<class 'pandas.core.sparse.frame.SparseDataFrame'>
Index: 30 entries, TRAAAEW128F42930C0 to TRAAHOW12903D00E51
Columns: 1154 entries, & to ça
dtypes: float64(874), int64(280)
memory usage: 70.0+ KB
None
Out[84]:
& 100 3 a abl abov abr achiev across action ... yet yo york you young your yourself youth à ça
track_id
TRAAAEW128F42930C0 0 0.0 0.0 2 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0 0.0 0 0.0 0 0.0 0.0 0.0 0.0
TRAAAJG128F9308A25 0 0.0 0.0 4 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0 0.0 0 1.0 0 0.0 0.0 0.0 0.0
TRAAAUC128F428716F 0 0.0 0.0 10 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0 0.0 6 0.0 2 0.0 0.0 0.0 0.0
TRAABOA128F933684A 0 0.0 0.0 2 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0 0.0 1 0.0 0 0.0 1.0 0.0 0.0
TRAABOG128F42955B1 0 0.0 0.0 14 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0 1.0 2 0.0 0 0.0 0.0 0.0 0.0

5 rows × 1154 columns

Pivoting the Tables


In [ ]:
# create a table with track_id as index and word as columns
# (full dataset, stopwords included)
lyrics_mat = lyrics.pivot_table(index='track_id', columns='word', values='count',
                                fill_value=0)
print(lyrics_mat.info())
lyrics_mat.head(5)

In [ ]:
# Sparse representation, to compare memory usage with the dense frame.
# NOTE(review): DataFrame.to_sparse was removed in pandas 1.0.
lyrics_mat.to_sparse(fill_value=0).info()

In [ ]:
# create a table with track_id as index and word as columns (no stopwords)
lyrics_no_stopwords_mat =lyrics_no_stopwords.pivot_table(index='track_id', columns='word', values='count',
                                                         fill_value=0)
lyrics_no_stopwords_mat.head(5)

In [ ]:
# Memory footprint of the stopword-free matrix in sparse form.
# NOTE(review): DataFrame.to_sparse was removed in pandas 1.0.
lyrics_no_stopwords_mat.to_sparse(fill_value=0).info()

Projections

Using some dimensionality reduction

PCA


In [ ]:
from sklearn.decomposition import PCA

In [ ]:
# Project the track x word matrix onto its first two principal components.
pca = PCA(n_components=2)
lyrics_no_pca = pca.fit_transform(lyrics_no_stopwords_mat)

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10,10))

# Same scatter plotted twice: full extent (left) and a zoomed view (right).
ax[1].scatter(lyrics_no_pca[:,0], lyrics_no_pca[:,1], marker='.')
ax[1].axis('scaled')
ax[0].scatter(lyrics_no_pca[:,0], lyrics_no_pca[:,1], marker='.')
ax[0].axis('scaled')

# Zoom the right-hand panel around the bulk of the points.
ax[1].set_xlim(-5.0,8.0)
ax[1].set_ylim(-5.0,8.0);

Isomap


In [ ]:
from sklearn.manifold import Isomap

In [ ]:
# Non-linear 2-D embedding with Isomap (geodesic-distance preserving).
iso = Isomap(n_components=2)
lyrics_no_iso = iso.fit_transform(lyrics_no_stopwords_mat)

plt.figure(figsize=(10,10))

plt.scatter(lyrics_no_iso[:,0], lyrics_no_iso[:,1], marker='.')
plt.axis('scaled');

LLE


In [ ]:
from sklearn.manifold import LocallyLinearEmbedding

In [ ]:
# 2-D embedding with Locally Linear Embedding.
lle = LocallyLinearEmbedding(n_components=2)
lyrics_no_lle = lle.fit_transform(lyrics_no_stopwords_mat)

plt.figure(figsize=(10,10))

plt.scatter(lyrics_no_lle[:,0], lyrics_no_lle[:,1], marker='.')
plt.axis('scaled');

TSNE


In [ ]:
from sklearn.manifold import TSNE

In [ ]:
# 2-D embedding with t-SNE.
# NOTE(review): t-SNE is stochastic and no random_state is set, so this
# figure will differ between runs.
tsne = TSNE(n_components=2)
lyrics_no_tsne = tsne.fit_transform(lyrics_no_stopwords_mat)

plt.figure(figsize=(10,10))

plt.scatter(lyrics_no_tsne[:,0], lyrics_no_tsne[:,1], marker='.')
plt.axis('scaled');

MDS


In [ ]:
from sklearn.manifold import MDS

In [ ]:
# 2-D embedding with metric Multi-Dimensional Scaling.
# NOTE(review): MDS is stochastic and no random_state is set.
mds = MDS(n_components=2)
lyrics_no_mds = mds.fit_transform(lyrics_no_stopwords_mat)

plt.figure(figsize=(10,10))

plt.scatter(lyrics_no_mds[:,0], lyrics_no_mds[:,1], marker='.')
plt.axis('scaled');

Clustering


In [ ]:
from sklearn.cluster import AgglomerativeClustering

In [ ]:
# Agglomerative (hierarchical) clustering into 5 clusters.
# NOTE(review): agg_preds is not used further in the visible notebook.
agg = AgglomerativeClustering(n_clusters=5)
agg_preds = agg.fit_predict(lyrics_no_stopwords_mat)

In [ ]:
from sklearn.cluster import AffinityPropagation

In [ ]:
# Affinity Propagation picks the number of clusters itself; a high
# damping factor (0.95) is used, presumably to help convergence.
afp = AffinityPropagation(damping=0.95)
afp_preds = afp.fit_predict(lyrics_no_stopwords_mat)

In [ ]:
# Distinct cluster labels found by Affinity Propagation.
np.unique(afp_preds)

In [ ]:
# t-SNE embedding colored by the Affinity Propagation cluster labels.
plt.figure(figsize=(10,10))
plt.scatter(lyrics_no_tsne[:,0], lyrics_no_tsne[:,1], marker='.', c=afp_preds, cmap='gist_rainbow')
plt.axis('scaled');

In [ ]:
# Persist the dense track x word matrix for reuse outside this notebook.
lyrics_mat.to_csv('lyrics_pivot.csv')

In [ ]: