In [1]:
import sqlite3
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt  # needed for the plots further down
%matplotlib inline
#import seaborn as sns
#sns.set_palette('Dark2')
#sns.set_style('whitegrid')
In [2]:
con_mxm = sqlite3.connect('../mxm_dataset.db')
cur_mxm = con_mxm.cursor()
# display the tables available in the database
tables = con_mxm.execute("SELECT name FROM sqlite_master WHERE type='table'")
table_names = tables.fetchall()
print('Tables within the database:')
for name in table_names:
    print(name[0])
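The column layout of each table can be read straight from sqlite_master (a small sketch reusing the connection opened above):
In [ ]:
# print the CREATE statement of each table to see its columns
for (name,) in table_names:
    sql = con_mxm.execute("SELECT sql FROM sqlite_master WHERE name=?", (name,)).fetchone()
    print(sql[0])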
In [3]:
# load the "words" table into a pandas DataFrame
words = pd.read_sql_query("SELECT * FROM words", con_mxm)
print(words.info())
words.head(5)
Out[3]:
In [4]:
# load the "lyrics" table into a pandas DataFrame
# LIMIT 10055 falls on a song boundary: the first 120 songs, with complete word counts
lyrics = pd.read_sql_query("""SELECT *
                              FROM lyrics
                              ORDER BY track_id ASC
                              LIMIT 10055""", con_mxm)
#lyrics = lyrics.to_sparse(fill_value=0)
print(lyrics.info())
lyrics.head(5)
Out[4]:
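A quick sanity check that the LIMIT above really corresponds to 120 complete songs:
In [ ]:
# the 10055-row cut should yield exactly 120 distinct tracks
print(lyrics.track_id.nunique())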
In [5]:
# NLTK stopword lists; called with no argument, words() pools all available languages
stp_wds = stopwords.words()
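If only English lyrics are of interest, the filter can be restricted to the English list (a sketch; the corpus must have been fetched once with nltk.download('stopwords')):
In [ ]:
# alternative: English-only stopword list (much shorter than the all-language list)
# nltk.download('stopwords')  # one-time fetch if the corpus is missing
stp_wds_en = stopwords.words('english')
print(len(stp_wds_en), 'English stopwords vs', len(stp_wds), 'across all languages')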
In [6]:
# drop stopword rows from the vocabulary table
words_no_stopwords = words[~np.isin(words.word, stp_wds)]
words_no_stopwords.head(5)
Out[6]:
In [7]:
# drop stopword rows from the per-track word counts
lyrics_no_stopwords = lyrics[~np.isin(lyrics.word, stp_wds)]
print(lyrics_no_stopwords.info())
lyrics_no_stopwords.head(5)
Out[7]:
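To quantify how aggressive the stopword filter is, a quick comparison of row counts (sketch):
In [ ]:
# proportion of (track, word) rows dropped by the stopword filter
removed = 1 - len(lyrics_no_stopwords) / len(lyrics)
print('{:.1%} of the lyrics rows were stopwords'.format(removed))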
In [107]:
# track IDs of the 10k Million Song Subset; read as str (not bytes) so they
# actually match the str-typed track_id column of the lyrics table
track_ids = np.loadtxt('../MillionSongSubset/AdditionalFiles/subset_unique_tracks.txt',
                       delimiter='<SEP>', usecols=0, dtype=str)
lyrics_10Ksub = lyrics[lyrics['track_id'].isin(track_ids)]
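Not every subset track has lyrics in the musiXmatch table, so the overlap is worth checking (sketch):
In [ ]:
# how many of the 10k-subset tracks appear in the lyrics table loaded above
print('{} of {} subset tracks found in the lyrics table'.format(
    lyrics_10Ksub.track_id.nunique(), len(track_ids)))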
In [76]:
def get_n_songs(lyrics_df, n_songs=1, random=False):
    """Return the rows of lyrics_df belonging to n_songs tracks."""
    track_ids = lyrics_df.track_id.unique()
    if n_songs > len(track_ids):
        print('n_songs greater than the number of tracks ({}) ...'.format(len(track_ids)))
        print('... returning the whole dataset')
        return lyrics_df
    if random:
        track_to_keep = np.random.choice(track_ids, n_songs, replace=False)
    else:
        track_to_keep = track_ids[:n_songs]
    return lyrics_df[lyrics_df['track_id'].isin(track_to_keep)]
In [80]:
lyrics_sub = get_n_songs(lyrics, n_songs=30, random=True)
lyrics_sub.track_id.unique()
Out[80]:
In [10]:
def pivot_by_chunks(lyrics_df, n_chunks=3, sparse=True):
    """Pivot lyrics_df into a (track x word) count matrix, one chunk of tracks at a time."""
    track_list = np.array_split(lyrics_df.track_id.unique(), n_chunks)
    print('Processing chunk number 0')
    df0 = lyrics_df[lyrics_df['track_id'].isin(track_list[0])]
    pivot_df = df0.pivot_table(index='track_id', columns='word', values='count', fill_value=0)
    del df0
    if sparse:
        pivot_df = pivot_df.to_sparse(fill_value=0)
    for i in range(1, n_chunks):
        print('Processing chunk number {}'.format(i))
        df_tmp = lyrics_df[lyrics_df['track_id'].isin(track_list[i])]
        pivot_df_tmp = df_tmp.pivot_table(index='track_id', columns='word', values='count', fill_value=0)
        pivot_df = pivot_df.append(pivot_df_tmp).fillna(0)
        del df_tmp
        if sparse:
            pivot_df = pivot_df.to_sparse(fill_value=0)
    return pivot_df
In [11]:
lyrics_sub = get_n_songs(lyrics, n_songs=30, random=True)
test_df = pivot_by_chunks(lyrics_df=lyrics_sub, n_chunks=2)
In [84]:
print(test_df.info())
test_df.head(5)
Out[84]:
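To check that chunking does not change the result, the chunked pivot can be compared against a direct pivot of the same subset (a quick sketch):
In [ ]:
# a direct pivot of the same subset should have the same tracks and vocabulary
direct_df = lyrics_sub.pivot_table(index='track_id', columns='word',
                                   values='count', fill_value=0)
print(test_df.shape == direct_df.shape,
      set(test_df.columns) == set(direct_df.columns))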
In [12]:
# create a table with track_id as index and word as columns
lyrics_mat = lyrics.pivot_table(index='track_id', columns='word', values='count',
                                fill_value=0)
print(lyrics_mat.info())
lyrics_mat.head(50)
Out[12]:
In [ ]:
lyrics_mat.to_sparse(fill_value=0).info()
In [13]:
# create a table with track_id as index and word as columns (no stopwords)
lyrics_no_stopwords_mat = lyrics_no_stopwords.pivot_table(index='track_id', columns='word',
                                                          values='count', fill_value=0)
lyrics_no_stopwords_mat.head(50)
Out[13]:
In [ ]:
lyrics_no_stopwords_mat.to_sparse(fill_value=0).info()
In [ ]:
from sklearn.decomposition import PCA
In [ ]:
pca = PCA(n_components=2)
lyrics_no_pca = pca.fit_transform(lyrics_no_stopwords_mat)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10,10))
# left panel: full view; right panel: zoom on the dense central region
ax[0].scatter(lyrics_no_pca[:,0], lyrics_no_pca[:,1], marker='.')
ax[0].axis('scaled')
ax[1].scatter(lyrics_no_pca[:,0], lyrics_no_pca[:,1], marker='.')
ax[1].axis('scaled')
ax[1].set_xlim(-5.0, 8.0)
ax[1].set_ylim(-5.0, 8.0);
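With thousands of word columns, two principal components usually capture only a small slice of the variance; printing the ratio makes that explicit (sketch):
In [ ]:
# variance explained by each of the 2 retained components, and their total
print(pca.explained_variance_ratio_, pca.explained_variance_ratio_.sum())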
In [ ]:
from sklearn.manifold import Isomap
In [ ]:
iso = Isomap(n_components=2)
lyrics_no_iso = iso.fit_transform(lyrics_no_stopwords_mat)
plt.figure(figsize=(10,10))
plt.scatter(lyrics_no_iso[:,0], lyrics_no_iso[:,1], marker='.')
plt.axis('scaled');
In [ ]:
from sklearn.manifold import LocallyLinearEmbedding
In [ ]:
lle = LocallyLinearEmbedding(n_components=2)
lyrics_no_lle = lle.fit_transform(lyrics_no_stopwords_mat)
plt.figure(figsize=(10,10))
plt.scatter(lyrics_no_lle[:,0], lyrics_no_lle[:,1], marker='.')
plt.axis('scaled');
In [ ]:
from sklearn.manifold import TSNE
In [ ]:
tsne = TSNE(n_components=2)
lyrics_no_tsne = tsne.fit_transform(lyrics_no_stopwords_mat)
plt.figure(figsize=(10,10))
plt.scatter(lyrics_no_tsne[:,0], lyrics_no_tsne[:,1], marker='.')
plt.axis('scaled');
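t-SNE is stochastic, so the map changes between runs; if it needs to be reproducible, the seed can be pinned (a sketch with the sklearn default perplexity, which must stay below the sample count of about 120 tracks here):
In [ ]:
# reproducible t-SNE embedding; perplexity=30 is the sklearn default
tsne_seeded = TSNE(n_components=2, perplexity=30, random_state=42)
lyrics_no_tsne_seeded = tsne_seeded.fit_transform(lyrics_no_stopwords_mat)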
In [ ]:
from sklearn.manifold import MDS
In [ ]:
mds = MDS(n_components=2)
lyrics_no_mds = mds.fit_transform(lyrics_no_stopwords_mat)
plt.figure(figsize=(10,10))
plt.scatter(lyrics_no_mds[:,0], lyrics_no_mds[:,1], marker='.')
plt.axis('scaled');
In [ ]:
from sklearn.cluster import AgglomerativeClustering
In [ ]:
agg = AgglomerativeClustering(n_clusters=5)
agg_preds = agg.fit_predict(lyrics_no_stopwords_mat)
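The size of each agglomerative cluster is a quick first diagnostic of how balanced the partition is (sketch):
In [ ]:
# tracks per cluster for the 5-cluster agglomerative partition
print(np.bincount(agg_preds))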
In [ ]:
from sklearn.cluster import AffinityPropagation
In [ ]:
afp = AffinityPropagation(damping=0.95)
afp_preds = afp.fit_predict(lyrics_no_stopwords_mat)
In [ ]:
np.unique(afp_preds)
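Unlike the agglomerative model, AffinityPropagation picks its own number of clusters; the exemplars it chose can be counted directly (sketch):
In [ ]:
# number of exemplar tracks (i.e. clusters) AffinityPropagation settled on
print(len(afp.cluster_centers_indices_))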
In [ ]:
plt.figure(figsize=(10,10))
plt.scatter(lyrics_no_tsne[:,0], lyrics_no_tsne[:,1], marker='.', c=afp_preds, cmap='gist_rainbow')
plt.axis('scaled');
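For a rough, model-agnostic comparison of the two partitions, the silhouette score can be computed in the original stopword-free word space (a sketch; scores near 0 are common for sparse bag-of-words data):
In [ ]:
from sklearn.metrics import silhouette_score
print('agglomerative :', silhouette_score(lyrics_no_stopwords_mat, agg_preds))
print('affinity prop.:', silhouette_score(lyrics_no_stopwords_mat, afp_preds))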
In [ ]:
lyrics_mat.to_csv('lyrics_pivot.csv')