In [ ]:
import os
import sys
import sqlite3
import numpy as np
import pandas as pd
import seaborn as sns
import cPickle as pickle
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.utils import shuffle
# Paths to the musiXmatch lyrics DB and the Million Song Subset metadata /
# artist-similarity DBs, relative to this notebook's working directory.
mxm_db_path = "../mxm_dataset.db"
msd_meta_db_path = "../MillionSongSubset/AdditionalFiles/subset_track_metadata.db"
msd_sim_db_path = "../MillionSongSubset/AdditionalFiles/subset_artist_similarity.db"
# Fail fast if any expected database file is missing before opening connections.
assert os.path.isfile(mxm_db_path) ,'wrong filepath'
assert os.path.isfile(msd_sim_db_path) ,'wrong filepath'
assert os.path.isfile(msd_meta_db_path) ,'wrong filepath'
In [ ]:
# Open SQLite connections to the lyrics and track-metadata databases.
con_mxm = sqlite3.connect(mxm_db_path)
cur_mxm = con_mxm.cursor()  # NOTE(review): cursor appears unused; pd.read_sql_query takes the connection directly
con_met = sqlite3.connect(msd_meta_db_path)
cur_met = con_met.cursor()  # NOTE(review): likewise appears unused here
# Full musiXmatch bag-of-words vocabulary table.
words = pd.read_sql_query("SELECT * FROM words" , con_mxm)
# Long-format (track_id, word, count) rows, capped at 50k rows to bound memory.
lyrics = pd.read_sql_query("SELECT * FROM lyrics ORDER BY track_id ASC LIMIT 50000", con_mxm)
# Track metadata; its artist_hotttnesss column is used later as the label.
songs = pd.read_sql_query("SELECT * FROM songs ORDER BY track_id ASC", con_met)
In [ ]:
def get_n_songs(lyrics_df, n_songs=1, random=False):
    """Return the rows of ``lyrics_df`` belonging to ``n_songs`` tracks.

    Parameters
    ----------
    lyrics_df : pd.DataFrame
        Long-format lyrics table with (at least) a 'track_id' column.
    n_songs : int, optional
        Number of distinct tracks to keep. If it exceeds the number of
        available tracks, the whole DataFrame is returned unchanged.
    random : bool, optional
        If True, sample the tracks uniformly without replacement;
        otherwise keep the first ``n_songs`` tracks in encounter order.

    Returns
    -------
    pd.DataFrame
        The subset of rows whose track_id is among the selected tracks.
    """
    # BUG FIX: the original read the module-level ``lyrics`` here instead of
    # the ``lyrics_df`` argument, silently ignoring the caller's data.
    track_ids = lyrics_df.track_id.unique()
    if n_songs > len(track_ids):
        print('n_songs greater than the number of tracks ({}) ...'.format(len(track_ids)))
        print('... return the whole dataset')
        return lyrics_df
    if random:
        track_to_keep = np.random.choice(track_ids, n_songs, replace=False)
    else:
        # BUG FIX: a plain else replaces ``elif random == False``, which left
        # ``track_to_keep`` undefined for any non-bool truthy/falsy value.
        track_to_keep = track_ids[:n_songs]
    return lyrics_df[lyrics_df['track_id'].isin(track_to_keep)]
def pivot_by_chunks(lyrics_df, n_chunks=3, sparse=True):
    """Pivot the long-format lyrics table into a track x word count matrix.

    The distinct track ids are split into ``n_chunks`` groups that are
    pivoted independently and then stacked, keeping peak memory lower than
    a single giant pivot.

    Parameters
    ----------
    lyrics_df : pd.DataFrame
        Long-format table with 'track_id', 'word' and 'count' columns.
    n_chunks : int, optional
        Number of track groups to pivot independently.
    sparse : bool, optional
        If True, the returned matrix uses a sparse dtype with fill value 0.

    Returns
    -------
    pd.DataFrame
        One row per track, one column per word; 0 where a word is absent.
    """
    track_chunks = np.array_split(lyrics_df.track_id.unique(), n_chunks)
    pieces = []
    for i, chunk in enumerate(track_chunks):
        print('Processing chunk number {}'.format(i))
        chunk_df = lyrics_df[lyrics_df['track_id'].isin(chunk)]
        # BUG FIX: the original passed the module-level ``words`` DataFrame
        # as ``columns=``; pivot_table needs the column *name* 'word'.
        pieces.append(
            chunk_df.pivot_table(index='track_id', columns='word',
                                 values='count', fill_value=0)
        )
    # pd.concat replaces the removed DataFrame.append; words missing from a
    # chunk come back as NaN and are zero-filled, as in the original.
    pivot_df = pd.concat(pieces).fillna(0)
    if sparse:
        # SparseDtype replaces the removed DataFrame.to_sparse(fill_value=0).
        # BUG FIX: the ``sparse`` flag was accepted but ignored before.
        pivot_df = pivot_df.astype(pd.SparseDtype("float64", 0.0))
    return pivot_df
def save_pandas(fname, data):
    '''Save DataFrame or Series as a .npy file with the pandas axis
    metadata pickled and appended after the array payload (the reader is
    ``load_pandas`` in this file).

    NOTE(review): this is Python 2-only code — ``open(fname, 'w')`` writes
    a binary .npy payload in text mode and the 'string_escape' codec does
    not exist on Python 3.

    Parameters
    ----------
    fname : str
        filename to use
    data: Pandas DataFrame or Series
    '''
    # Write the raw values as a standard .npy file (drops index/columns).
    np.save(open(fname, 'w'), data)
    # Collect the axis metadata that np.save cannot store.
    if len(data.shape) == 2:
        meta = data.index,data.columns
    elif len(data.shape) == 1:
        meta = (data.index,)
    else:
        raise ValueError('save_pandas: Cannot save this type')
    s = pickle.dumps(meta)
    # Escape the pickle bytes so they survive the text-mode append (Py2 only).
    s = s.encode('string_escape')
    # Append the escaped metadata after the array payload.
    with open(fname, 'a') as f:
        f.seek(0, 2)  # seek to end of file before writing
        f.write(s)
def load_pandas(fname, mmap_mode='r'):
    '''Load a DataFrame or Series previously written by ``save_pandas``.

    NOTE(review): Python 2-only — the 'string_escape' codec is gone on
    Python 3 and the file is reopened in text mode.

    Parameters
    ----------
    fname : str
        filename
    mmap_mode : str, optional
        Same as numpy.load option
    '''
    # Memory-map the array payload that sits at the start of the file.
    values = np.load(fname, mmap_mode=mmap_mode)
    with open(fname) as f:
        # Skip the .npy magic and header to reach the raw array bytes.
        np.lib.format.read_magic(f)
        np.lib.format.read_array_header_1_0(f)
        # Skip past the array payload itself.
        # NOTE(review): uses dtype.alignment as a stand-in for itemsize;
        # the two agree for simple dtypes but not in general — verify.
        f.seek(values.dtype.alignment*values.size, 1)
        # Remaining bytes are the escaped, pickled axis metadata.
        meta = pickle.loads(f.readline().decode('string_escape'))
    if len(meta) == 2:
        return pd.DataFrame(values, index=meta[0], columns=meta[1])
    elif len(meta) == 1:
        return pd.Series(values, index=meta[0])
    # NOTE(review): implicitly returns None for any other meta length,
    # though save_pandas only ever writes 1- or 2-tuples.
In [ ]:
# NOTE(review): stopwords.words() with no argument returns stopwords for
# *all* NLTK languages; stopwords.words('english') may be what was
# intended — verify.
stp_wds = stopwords.words()
words_no_stopwords = words[~np.isin(words.word, stp_wds)]
lyrics_no_stopwords = lyrics[~np.isin(lyrics.word, stp_wds)]
# Build the track x word matrix for the first 625 tracks, in 100 chunks.
# NOTE(review): this pivots the unfiltered ``lyrics``; the
# ``lyrics_no_stopwords`` computed just above is never used — confirm intent.
test_df = pivot_by_chunks(get_n_songs(lyrics, n_songs=625, random=False), n_chunks=100)
In [ ]:
# Persist the pivoted track x word matrix.
# NOTE(review): filename says 1000 songs but 625 tracks were requested
# above — the name may be stale.
save_pandas("pivoted_table_1000songs.npy", test_df)
In [ ]:
# Round-trip the pivoted matrix from disk and summarize its layout/memory.
test_df = load_pandas("pivoted_table_1000songs.npy")
test_df.info()
In [ ]:
# Attach the artist_hotttnesss label to every pivoted track row.
indexes = test_df.index.values
# Vectorized lookup replaces the original O(n_tracks * n_songs) double loop
# with a per-iteration debug print, which also relied on the long-removed
# DataFrame.ix indexer. keep='last' mirrors the original loop, where the
# last matching metadata row won.
# NOTE(review): assumes songs.track_id is (near-)unique — verify in the DB.
hotness_by_track = (
    songs.drop_duplicates("track_id", keep="last")
         .set_index("track_id")["artist_hotttnesss"]
)
# Tracks absent from the metadata keep the original default label of 0.
test_df["hotness_the_label"] = (
    pd.Series(indexes, index=test_df.index).map(hotness_by_track).fillna(0.0)
)
In [ ]:
# Keep only tracks that received a real (positive) hotness label.
# The original also created a throwaway empty DataFrame that was
# immediately overwritten; that dead assignment is dropped here.
new_lyrics = test_df[test_df["hotness_the_label"] > 0]
In [ ]:
# Summarize the labeled subset (row count, columns, memory usage).
new_lyrics.info()
In [ ]:
# Split into features (all word-count columns) and label (the last column,
# hotness_the_label added above), then persist both with save_pandas.
features = new_lyrics.iloc[:, 0:-1]
label = new_lyrics.iloc[:, -1]
save_pandas("features.npy", features)
save_pandas("label.npy", label)
In [ ]: