In [ ]:
import os
import sys
import sqlite3
import numpy as np
import pandas as pd
import seaborn as sns
import cPickle as pickle
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from sklearn.utils import shuffle

mxm_db_path      = "../mxm_dataset.db"
msd_meta_db_path = "../MillionSongSubset/AdditionalFiles/subset_track_metadata.db"
msd_sim_db_path  = "../MillionSongSubset/AdditionalFiles/subset_artist_similarity.db"

assert os.path.isfile(mxm_db_path)      ,'wrong filepath'
assert os.path.isfile(msd_sim_db_path)  ,'wrong filepath'
assert os.path.isfile(msd_meta_db_path) ,'wrong filepath'

In [ ]:
con_mxm = sqlite3.connect(mxm_db_path)
cur_mxm = con_mxm.cursor()

con_met = sqlite3.connect(msd_meta_db_path)
cur_met = con_met.cursor()

words      = pd.read_sql_query("SELECT * FROM words" , con_mxm)
lyrics     = pd.read_sql_query("SELECT * FROM lyrics ORDER BY track_id ASC LIMIT 50000", con_mxm)
songs      = pd.read_sql_query("SELECT * FROM songs ORDER BY track_id ASC", con_met)

In [ ]:
def get_n_songs(lyrics_df, n_songs=1 ,random=False):
    
    track_ids = lyrics.track_id.unique()
    
    if n_songs > len(track_ids):
        print('n_songs greater than the number of tracks ({}) ...'.format(len(track_ids)))
        print('... return the whole dataset')
        return lyrics_df
    
    if random == True :
        track_to_keep = np.random.choice(track_ids, n_songs, replace=False)
    elif random == False :
        track_to_keep = track_ids[:n_songs]

    lyrics_subset = lyrics_df[lyrics_df['track_id'].isin(track_to_keep)]
    
    return lyrics_subset

def pivot_by_chunks(lyrics_df, n_chunks=3, sparse=True):
    print('Processing chunk number 0')
    track_list = np.array_split(lyrics_df.track_id.unique(), n_chunks)
    df0 = lyrics_df[lyrics_df['track_id'].isin(track_list[0])]
    pivot_df = df0.pivot_table(index='track_id', columns=words, values='count', fill_value=0)
    del df0
    pivot_df = pivot_df.to_sparse(fill_value=0)

    for i in range(1, n_chunks):
        print('Processing chunk number {}'.format(i))
        df_tmp = lyrics_df[lyrics_df['track_id'].isin(track_list[i])]
        pivot_df_tmp = df_tmp.pivot_table(index='track_id', columns=words, values='count', fill_value=0)
        pivot_df = pivot_df.append(pivot_df_tmp).fillna(0)
        del df_tmp
        pivot_df = pivot_df.to_sparse(fill_value=0)

    return pivot_df

def save_pandas(fname, data):
    '''Save DataFrame or Series
    Parameters
    ----------
    fname : str
        filename to use
    data: Pandas DataFrame or Series
    '''
    np.save(open(fname, 'w'), data)
    if len(data.shape) == 2:
        meta = data.index,data.columns
    elif len(data.shape) == 1:
        meta = (data.index,)
    else:
        raise ValueError('save_pandas: Cannot save this type')
    s = pickle.dumps(meta)
    s = s.encode('string_escape')
    with open(fname, 'a') as f:
        f.seek(0, 2)
        f.write(s)
        
def load_pandas(fname, mmap_mode='r'):
    '''Load DataFrame or Series
    Parameters
    ----------
    fname : str
        filename
    mmap_mode : str, optional
        Same as numpy.load option
    '''
    values = np.load(fname, mmap_mode=mmap_mode)
    with open(fname) as f:
        np.lib.format.read_magic(f)
        np.lib.format.read_array_header_1_0(f)
        f.seek(values.dtype.alignment*values.size, 1)
        meta = pickle.loads(f.readline().decode('string_escape'))
    if len(meta) == 2:
        return pd.DataFrame(values, index=meta[0], columns=meta[1])
    elif len(meta) == 1:
        return pd.Series(values, index=meta[0])

In [ ]:
stp_wds             = stopwords.words()
words_no_stopwords  = words[~np.isin(words.word, stp_wds)]
lyrics_no_stopwords = lyrics[~np.isin(lyrics.word, stp_wds)]

test_df             = pivot_by_chunks(get_n_songs(lyrics, n_songs=625, random=False), n_chunks=100)

In [ ]:
save_pandas("pivoted_table_1000songs.npy", test_df)

In [ ]:
test_df = load_pandas("pivoted_table_1000songs.npy")

test_df.info()

In [ ]:
indexes = test_df.index.values

test_df["hotness_the_label"] = np.zeros(len(test_df))

for i in range(0, len(test_df)):
    print(i)
    for j in range(0, len(songs)):
        if (songs.iloc[j]["track_id"] == indexes[i]):
            test_df.ix[i, "hotness_the_label"] = songs.iloc[j]["artist_hotttnesss"]

In [ ]:
new_lyrics = pd.DataFrame()
new_lyrics = test_df[test_df["hotness_the_label"] > 0]

In [ ]:
new_lyrics.info()

In [ ]:
features = new_lyrics.iloc[:, 0:-1]
label    = new_lyrics.iloc[:, -1]

save_pandas("features.npy", features)
save_pandas("label.npy", label)

In [ ]: