Working with the lyrics Bag of Word


In [1]:
import sqlite3
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
#import matplotlib.pyplot as plt
#import seaborn as sns
#%matplotlib inline

#sns.set_palette('Dark2')
#sns.set_style('whitegrid')

In [2]:
con_mxm = sqlite3.connect('../mxm_dataset.db')
cur_mxm = con_mxm.cursor()

# displaying the different table available
tables = con_mxm.execute("SELECT name FROM sqlite_master WHERE type='table'")
table_names = tables.fetchall()

print('Tables within the database :')
print('{}'.format(table_names[0][0]))
print('{}'.format(table_names[1][0]))


Tables within the database :
words
lyrics

In [3]:
# import the "words" table in a pandas DataFrame
words = pd.read_sql_query("SELECT * FROM words",con_mxm)
print(words.info())
words.head(5)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 1 columns):
word    5000 non-null object
dtypes: object(1)
memory usage: 39.1+ KB
None
Out[3]:
word
0 i
1 the
2 you
3 to
4 and

In [4]:
# import the "lyrics" table in a pandas DataFrame
# limit to 10055 to have the exact content of each song (120 songs)
lyrics = pd.read_sql_query("SELECT *\
                           FROM lyrics\
                           ORDER BY track_id ASC\
                           LIMIT 10055",con_mxm)
#lyrics = lyric.to_sparse(fill_value=0)

print(lyrics.info())
lyrics.head(5)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10055 entries, 0 to 10054
Data columns (total 5 columns):
track_id    10055 non-null object
mxm_tid     10055 non-null int64
word        10055 non-null object
count       10055 non-null int64
is_test     10055 non-null int64
dtypes: int64(3), object(2)
memory usage: 392.9+ KB
None
Out[4]:
track_id mxm_tid word count is_test
0 TRAAAAV128F421A322 4623710 i 6 0
1 TRAAAAV128F421A322 4623710 the 4 0
2 TRAAAAV128F421A322 4623710 you 2 0
3 TRAAAAV128F421A322 4623710 to 2 0
4 TRAAAAV128F421A322 4623710 and 5 0

Removing stopwords


In [5]:
stp_wds = stopwords.words()

In [6]:
words_no_stopwords = words[~np.isin(words.word, stp_wds)]
words_no_stopwords.head(5)


Out[6]:
word
26 love
28 know
35 like
38 time
43 go

In [7]:
lyrics_no_stopwords = lyrics[~np.isin(lyrics.word, stp_wds)]
print(lyrics_no_stopwords.info())
lyrics_no_stopwords.head(5)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 5993 entries, 22 to 10054
Data columns (total 5 columns):
track_id    5993 non-null object
mxm_tid     5993 non-null int64
word        5993 non-null object
count       5993 non-null int64
is_test     5993 non-null int64
dtypes: int64(3), object(2)
memory usage: 280.9+ KB
None
Out[7]:
track_id mxm_tid word count is_test
22 TRAAAAV128F421A322 4623710 like 2 0
27 TRAAAAV128F421A322 4623710 got 1 0
28 TRAAAAV128F421A322 4623710 would 1 0
31 TRAAAAV128F421A322 4623710 seem 1 0
32 TRAAAAV128F421A322 4623710 someon 1 0

Selecting the Subset Lyrics


In [107]:
track_ids = np.loadtxt('../MillionSongSubset/AdditionalFiles/subset_unique_tracks.txt',
                       delimiter=b'<SEP>', usecols=0, dtype = ('S20'))
lyrics_10Ksub = lyrics[lyrics['track_id'].isin(track_ids)]

Function to choose a given number of songs


In [76]:
from sklearn.utils import shuffle

def get_n_songs(lyrics_df, n_songs=1 ,random=False):
    
    track_ids = lyrics.track_id.unique()
    
    if n_songs > len(track_ids):
        print('n_songs greater than the number of tracks ({}) ...'.format(len(track_ids)))
        print('... return the whole dataset')
        return lyrics_df
    
    if random == True :
        track_to_keep = np.random.choice(track_ids, n_songs, replace=False)
    elif random == False :
        track_to_keep = track_ids[:n_songs]

    lyrics_subset = lyrics_df[lyrics_df['track_id'].isin(track_to_keep)]
    
    return lyrics_subset

In [80]:
lyrics_sub = get_n_songs(lyrics, n_songs=30, random=True)
lyrics_sub.track_id.unique()


Out[80]:
array(['TRAAAFD128F92F423A', 'TRAAAJG128F9308A25', 'TRAABIG128F9356C56',
       'TRAABJV128F1460C49', 'TRAABOA128F933684A', 'TRAABOG128F42955B1',
       'TRAACRY12903CAF2C2', 'TRAACZN128F93236B1', 'TRAADBN128F932D00A',
       'TRAADLH12903CA70EE', 'TRAADNA128F9331246', 'TRAADQW128F427CE68',
       'TRAADSV128F42BC36A', 'TRAADYI128E078FB38', 'TRAAEEQ128F42180B2',
       'TRAAEJH128E0785506', 'TRAAEJQ128F92C484E', 'TRAAENC128F1451DE9',
       'TRAAERZ128F1496921', 'TRAAFEU128E078581C', 'TRAAFOH128E078BD7E',
       'TRAAFOY128F146CC17', 'TRAAFRM128F9320F58', 'TRAAFTE128F429545F',
       'TRAAGAM128F428DCBE', 'TRAAGMJ12903CAD7D4', 'TRAAGOZ128F92EB3D4',
       'TRAAHFR12903CA2491', 'TRAAHRF128F92FE234', 'TRAAHSY128F147BB5C'], dtype=object)

Function to pivot by chunks

Take into consideration sparsity


In [10]:
def pivot_by_chunks(lyrics_df, n_chunks=3, sparse=True):
    
    print('Processing chunk number 0')
    track_list = np.array_split(lyrics_df.track_id.unique(), n_chunks)
    df0 = lyrics_df[lyrics_df['track_id'].isin(track_list[0])]
    pivot_df = df0.pivot_table(index='track_id', columns=words, values='count', fill_value=0)
    del df0
    pivot_df = pivot_df.to_sparse(fill_value=0)

    for i in range(1, n_chunks):
        print('Processing chunk number {}'.format(i))
        df_tmp = lyrics_df[lyrics_df['track_id'].isin(track_list[i])]
        pivot_df_tmp = df_tmp.pivot_table(index='track_id', columns=words, values='count', fill_value=0)
        pivot_df = pivot_df.append(pivot_df_tmp).fillna(0)
        del df_tmp
        pivot_df = pivot_df.to_sparse(fill_value=0)

    return pivot_df

In [11]:
lyrics_sub = get_n_songs(lyrics, n_songs=30, random=True)
test_df = pivot_by_chunks(lyrics_df=lyrics_sub, n_chunks=2)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-068f2f136511> in <module>()
----> 1 lyrics_sub = get_n_songs(lyrics, n_songs=30, random=True)
      2 test_df = pivot_by_chunks(lyrics_df=lyrics_sub, n_chunks=2)

NameError: name 'get_n_songs' is not defined

In [84]:
print(test_df.info())
test_df.head(5)


<class 'pandas.core.sparse.frame.SparseDataFrame'>
Index: 30 entries, TRAAAEW128F42930C0 to TRAAHOW12903D00E51
Columns: 1154 entries, & to ça
dtypes: float64(874), int64(280)
memory usage: 70.0+ KB
None
Out[84]:
& 100 3 a abl abov abr achiev across action ... yet yo york you young your yourself youth à ça
track_id
TRAAAEW128F42930C0 0 0.0 0.0 2 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0 0.0 0 0.0 0 0.0 0.0 0.0 0.0
TRAAAJG128F9308A25 0 0.0 0.0 4 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0 0.0 0 1.0 0 0.0 0.0 0.0 0.0
TRAAAUC128F428716F 0 0.0 0.0 10 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0 0.0 6 0.0 2 0.0 0.0 0.0 0.0
TRAABOA128F933684A 0 0.0 0.0 2 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0 0.0 1 0.0 0 0.0 1.0 0.0 0.0
TRAABOG128F42955B1 0 0.0 0.0 14 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0 1.0 2 0.0 0 0.0 0.0 0.0 0.0

5 rows × 1154 columns

Pivoting the Tables


In [12]:
# create a table with track_is as index and word as columns
lyrics_mat = lyrics.pivot_table(index='track_id', columns='word', values='count',
                                fill_value=0)
print(lyrics_mat.info())
lyrics_mat.head(50)


<class 'pandas.core.frame.DataFrame'>
Index: 120 entries, TRAAAAV128F421A322 to TRAAHWZ128F4269B1F
Columns: 2291 entries, & to è
dtypes: int64(2291)
memory usage: 2.1+ MB
None
Out[12]:
word & 1 10 100 2 3 30 6 a aaah ... yourself youth ze zeit zero zu à å ça è
track_id
TRAAAAV128F421A322 0 0 0 0 0 0 0 0 3 0 ... 0 0 0 0 0 0 0 0 0 0
TRAAABD128F429CF47 0 0 0 0 0 0 0 0 2 0 ... 0 0 0 0 0 0 0 0 0 0
TRAAAED128E0783FAB 0 0 0 0 0 0 0 0 2 0 ... 0 0 0 0 0 0 0 0 0 0
TRAAAEF128F4273421 0 0 0 0 0 0 0 0 11 0 ... 1 0 0 0 0 0 0 0 0 0
TRAAAEW128F42930C0 0 0 0 0 0 0 0 0 2 0 ... 0 0 0 0 0 0 0 0 0 0
TRAAAFD128F92F423A 0 0 0 0 0 0 0 0 5 0 ... 0 0 0 0 0 0 0 0 0 0
TRAAAGF12903CEC202 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAAAHJ128F931194C 0 0 0 0 0 0 0 0 5 0 ... 0 0 0 0 0 0 0 0 0 0
TRAAAHZ128E0799171 0 0 0 0 0 0 0 0 21 0 ... 0 0 0 0 0 0 0 0 0 0
TRAAAJG128F9308A25 0 0 0 0 0 0 0 0 4 0 ... 0 0 0 0 0 0 0 0 0 0
TRAAAOF128F429C156 0 0 0 0 0 0 0 0 2 1 ... 0 0 0 0 0 0 0 0 0 0
TRAAARJ128F9320760 0 0 0 0 0 0 0 0 10 0 ... 0 0 0 0 0 0 0 0 0 0
TRAAAUC128F428716F 0 0 0 0 0 0 0 0 10 0 ... 0 0 0 0 0 0 0 0 0 0
TRAAAZF12903CCCF6B 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
TRAABEV12903CC53A4 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
TRAABHB12903CAFC2F 0 0 0 0 0 0 0 0 8 0 ... 0 0 0 0 0 0 0 0 0 0
TRAABHC128F933A3F8 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAABIG128F9356C56 0 0 2 0 0 0 0 0 13 0 ... 0 0 0 0 0 0 0 0 0 0
TRAABJS128F9325C99 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAABJV128F1460C49 0 0 0 0 0 0 0 0 2 0 ... 0 0 0 0 0 0 0 0 0 0
TRAABLR128F423B7E3 0 0 0 0 0 0 0 0 6 0 ... 0 0 0 0 0 0 0 0 0 0
TRAABOA128F933684A 0 0 0 0 0 0 0 0 2 0 ... 0 1 0 0 0 0 0 0 0 0
TRAABOG128F42955B1 0 0 0 0 0 0 0 0 14 0 ... 0 0 0 0 0 0 0 0 0 0
TRAABPG128F14774DD 0 0 0 0 0 0 0 0 4 0 ... 0 0 0 0 0 0 0 0 0 0
TRAABRX12903CC4816 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAABVM128F92CA9DC 0 0 0 0 0 0 0 0 6 0 ... 0 0 0 0 0 0 0 0 0 0
TRAABXH128F42955D6 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
TRAACER128F4290F96 0 0 0 0 0 0 0 0 2 0 ... 0 0 0 0 0 0 0 0 0 0
TRAACFV128F935E50B 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
TRAACHN128F1489601 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
TRAACIE128F428495B 0 1 0 0 1 0 0 0 4 0 ... 0 2 0 0 0 0 0 0 0 0
TRAACIR128F42963AC 0 0 0 2 0 0 0 0 4 0 ... 0 0 0 0 0 0 0 0 0 0
TRAACJC128F934ABB5 0 0 0 0 0 0 0 0 4 0 ... 0 0 0 0 0 0 0 0 0 0
TRAACPH12903CF5F14 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
TRAACQW128F428854F 0 0 0 0 0 0 0 0 3 0 ... 0 0 0 0 0 0 0 0 0 0
TRAACRY12903CAF2C2 0 0 0 0 0 0 0 0 7 0 ... 0 0 1 0 0 0 0 0 0 0
TRAACUP128E0789C69 0 0 0 0 0 0 0 0 2 0 ... 0 0 0 0 0 0 0 0 0 0
TRAACZN128F93236B1 0 0 0 0 0 0 0 0 12 0 ... 0 0 0 0 0 0 0 0 0 0
TRAADAA128F92F7043 0 0 0 0 0 0 0 0 2 0 ... 0 0 0 0 0 0 0 0 0 0
TRAADBN128F932D00A 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 6 0 3 0
TRAADCQ128F93436C3 0 0 0 0 0 0 0 0 9 0 ... 0 0 0 0 0 0 0 0 0 0
TRAADFO128F92E1E91 0 0 0 0 0 0 0 0 9 0 ... 0 0 0 0 0 0 0 0 0 0
TRAADKA12903CD2511 0 0 0 0 0 0 0 0 4 0 ... 0 0 0 0 0 0 0 0 0 0
TRAADKW128E079503A 0 0 0 0 0 0 0 0 23 0 ... 0 0 0 0 1 0 0 0 0 0
TRAADKZ128F149BDFF 0 0 0 0 0 0 0 0 6 0 ... 0 0 0 0 0 0 0 0 0 0
TRAADLH12903CA70EE 0 0 0 0 0 0 0 0 3 0 ... 0 0 0 0 0 0 0 0 0 0
TRAADNA128F9331246 0 0 0 0 0 0 0 0 2 0 ... 0 0 0 0 0 0 0 0 0 0
TRAADNL128F14519DF 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAADNN128F42B1D94 0 0 0 0 0 0 0 0 4 0 ... 0 0 0 0 0 0 0 0 0 0
TRAADQL128F427D281 0 0 0 0 0 0 0 0 6 0 ... 0 0 0 0 0 0 0 0 0 0

50 rows × 2291 columns


In [ ]:
lyrics_mat.to_sparse(fill_value=0).info()

In [13]:
# create a table with track_is as index and word as columns (no stopwords)
lyrics_no_stopwords_mat =lyrics_no_stopwords.pivot_table(index='track_id', columns='word', values='count',
                                                         fill_value=0)
lyrics_no_stopwords_mat.head(50)


Out[13]:
word & 1 10 100 2 3 30 6 aaah aah ... yearn yellow yes yet york young youth zeit zero ça
track_id
TRAAAAV128F421A322 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAAABD128F429CF47 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAAAED128E0783FAB 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAAAEF128F4273421 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAAAEW128F42930C0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAAAFD128F92F423A 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAAAGF12903CEC202 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAAAHJ128F931194C 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAAAHZ128E0799171 0 0 0 0 0 0 0 0 0 0 ... 0 1 1 0 0 1 0 0 0 0
TRAAAJG128F9308A25 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 0 0
TRAAAOF128F429C156 0 0 0 0 0 0 0 0 1 12 ... 0 0 0 0 0 0 0 0 0 0
TRAAARJ128F9320760 0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 0 0 0 0
TRAAAUC128F428716F 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAAAZF12903CCCF6B 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAABEV12903CC53A4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAABHB12903CAFC2F 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAABHC128F933A3F8 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAABIG128F9356C56 0 0 2 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAABJS128F9325C99 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAABJV128F1460C49 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAABLR128F423B7E3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAABOA128F933684A 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
TRAABOG128F42955B1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 0
TRAABPG128F14774DD 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAABRX12903CC4816 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAABVM128F92CA9DC 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAABXH128F42955D6 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAACER128F4290F96 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAACFV128F935E50B 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAACHN128F1489601 0 0 0 0 0 0 0 0 0 0 ... 0 0 2 0 0 0 0 0 0 0
TRAACIE128F428495B 0 1 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 2 0 0 0
TRAACIR128F42963AC 0 0 0 2 0 0 0 0 0 0 ... 0 0 0 0 0 2 0 0 0 0
TRAACJC128F934ABB5 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAACPH12903CF5F14 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAACQW128F428854F 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAACRY12903CAF2C2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAACUP128E0789C69 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAACZN128F93236B1 0 0 0 0 0 0 0 0 0 0 ... 0 3 0 0 0 0 0 0 0 0
TRAADAA128F92F7043 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAADBN128F932D00A 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 3
TRAADCQ128F93436C3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAADFO128F92E1E91 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAADKA12903CD2511 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAADKW128E079503A 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 1 0
TRAADKZ128F149BDFF 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAADLH12903CA70EE 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAADNA128F9331246 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAADNL128F14519DF 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAADNN128F42B1D94 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
TRAADQL128F427D281 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

50 rows × 1944 columns


In [ ]:
lyrics_no_stopwords_mat.to_sparse(fill_value=0).info()

Projections

Using some dimensionality reduction

PCA


In [ ]:
from sklearn.decomposition import PCA

In [ ]:
pca = PCA(n_components=2)
lyrics_no_pca = pca.fit_transform(lyrics_no_stopwords_mat)

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10,10))

ax[1].scatter(lyrics_no_pca[:,0], lyrics_no_pca[:,1], marker='.')
ax[1].axis('scaled')
ax[0].scatter(lyrics_no_pca[:,0], lyrics_no_pca[:,1], marker='.')
ax[0].axis('scaled')

ax[1].set_xlim(-5.0,8.0)
ax[1].set_ylim(-5.0,8.0);

Isomap


In [ ]:
from sklearn.manifold import Isomap

In [ ]:
iso = Isomap(n_components=2)
lyrics_no_iso = iso.fit_transform(lyrics_no_stopwords_mat)

plt.figure(figsize=(10,10))

plt.scatter(lyrics_no_iso[:,0], lyrics_no_iso[:,1], marker='.')
plt.axis('scaled');

LLE


In [ ]:
from sklearn.manifold import LocallyLinearEmbedding

In [ ]:
lle = LocallyLinearEmbedding(n_components=2)
lyrics_no_lle = lle.fit_transform(lyrics_no_stopwords_mat)

plt.figure(figsize=(10,10))

plt.scatter(lyrics_no_lle[:,0], lyrics_no_lle[:,1], marker='.')
plt.axis('scaled');

TSNE


In [ ]:
from sklearn.manifold import TSNE

In [ ]:
tsne = TSNE(n_components=2)
lyrics_no_tsne = tsne.fit_transform(lyrics_no_stopwords_mat)

plt.figure(figsize=(10,10))

plt.scatter(lyrics_no_tsne[:,0], lyrics_no_tsne[:,1], marker='.')
plt.axis('scaled');

MDS


In [ ]:
from sklearn.manifold import MDS

In [ ]:
mds = MDS(n_components=2)
lyrics_no_mds = mds.fit_transform(lyrics_no_stopwords_mat)

plt.figure(figsize=(10,10))

plt.scatter(lyrics_no_mds[:,0], lyrics_no_mds[:,1], marker='.')
plt.axis('scaled');

Clustering


In [ ]:
from sklearn.cluster import AgglomerativeClustering

In [ ]:
agg = AgglomerativeClustering(n_clusters=5)
agg_preds = agg.fit_predict(lyrics_no_stopwords_mat)

In [ ]:
from sklearn.cluster import AffinityPropagation

In [ ]:
afp = AffinityPropagation(damping=0.95)
afp_preds = afp.fit_predict(lyrics_no_stopwords_mat)

In [ ]:
np.unique(afp_preds)

In [ ]:
plt.figure(figsize=(10,10))
plt.scatter(lyrics_no_tsne[:,0], lyrics_no_tsne[:,1], marker='.', c=afp_preds, cmap='gist_rainbow')
plt.axis('scaled');

In [ ]:
lyrics_mat.to_csv('lyrics_pivot.csv')

In [ ]: