""" Thierry Bertin-Mahieux (2010) Columbia University tb2332@columbia.edu

This code demo the use of the track_metadata.db It is almost the same as demo_track_metadata.py in the github repository

This is part of the Million Song Dataset project from LabROSA (Columbia University) and The Echo Nest.

This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/. """



In [91]:

    
import itertools
import os
import sys
import glob
import time
import datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns; sns.set(color_codes=True)
from pandas import DataFrame
from sklearn import mixture
from scipy import linalg
from sklearn.cluster import KMeans
from sklearn import datasets
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN

try:
    import sqlite3
except ImportError:
    print 'you need sqlite3 installed to use this program'
    sys.exit(0)
    
%matplotlib inline



In [2]:

    
def encode_string(s):
    """
    Simple utility function to make sure a string is proper
    to be used in a SQLite query
    (different than posgtresql, no N to specify unicode)
    EXAMPLE:
      That's my boy! -> 'That''s my boy!'
    """
    return "'"+s.replace("'","''")+"'"



In [3]:

    
# PATH TO track_metadat.db
# CHANGE THIS TO YOUR LOCAL CONFIGURATION
# IT SHOULD BE IN THE ADDITIONAL FILES
# (you can use 'subset_track_metadata.db')
dbfile = '/home/michael/Barista/Music/MillionSongSubset/AdditionalFiles/subset_track_metadata.db'



In [15]:

    
# connect to the SQLite database
conn = sqlite3.connect(dbfile)

# from that connection, get a cursor to do queries
c = conn.cursor()

# so there is no confusion, the table name is 'songs'
TABLENAME = 'songs'



In [17]:

    
print '*************** GENERAL SQLITE DEMO ***************************'

# list all tables in that dataset
# note that sqlite does the actual job when we call fetchall() or fetchone()
q = "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
res = c.execute(q)
print "* tables contained in that SQLite file/database (should be only 'songs'):"
print res.fetchall()









    



*************** GENERAL SQLITE DEMO ***************************
* tables contained in that SQLite file/database (should be only 'songs'):
[(u'songs',)]



In [18]:

    
# list all columns names from table 'songs'
q = "SELECT sql FROM sqlite_master WHERE tbl_name = 'songs' AND type = 'table'"
res = c.execute(q)
print '* get info on columns names (original table creation command):'
print res.fetchall()[0][0]









    



* get info on columns names (original table creation command):
CREATE TABLE songs (track_id text PRIMARY KEY, title text, song_id text, release text, artist_id text, artist_mbid text, artist_name text, duration real, artist_familiarity real, artist_hotttnesss real, year int)



In [ ]:

    
# list all indices
q = "SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='songs' ORDER BY name"
res = c.execute(q)
print '* one of the index we added to the table to make things faster:'
print res.fetchone()



In [ ]:

    
# find the PRIMARY KEY of a query
# by default it's called ROWID, it would have been redefined if our primary key
# was of type INTEGER
q = "SELECT ROWID FROM songs WHERE artist_name='Red Hot Chili Peppers'"
res = c.execute(q)
print '* get the primary key (row id) of one entry where the artist is Red Hot Chili Peppers:'
print res.fetchone()



In [ ]:

    
# find an entry with Red Hot Chili Peppers as artist_name
# returns all info (the full table row)
q = "SELECT * FROM songs WHERE artist_name='Red Hot Chili Peppers' LIMIT 1"
res = c.execute(q)
print '* get all we have about one track from Red Hot Chili Peppers:'
print res.fetchone()



In [ ]:

    
print '*************** DEMOS AROUND ARTIST_ID ************************'

# query for all the artists Echo Nest ID
# the column name is 'artist_id'
# DISTINCT makes sure you get each ID returned only once
q = "SELECT DISTINCT artist_id FROM " + TABLENAME
res = c.execute(q)
artists = res.fetchall() # does the actual job of searching the db
print '* found',len(artists),'unique artist IDs, response looks like:'
print artists[:3]



In [ ]:

    
# more cumbersome, get unique artist ID but with one track ID for each.
# very usefull, it gives you a HDF5 file to query if you want more
# information about this artist
q = "SELECT artist_id,track_id FROM songs GROUP BY artist_id"
res = c.execute(q)
artist_track_pair = res.fetchone()
print '* one unique artist with some track (chosen at random) associated with it:'
print artist_track_pair



In [ ]:

    
# get artists having only one track in the database
q = "SELECT artist_id,track_id FROM songs GROUP BY artist_id HAVING ( COUNT(artist_id) = 1 )"
q += " ORDER BY RANDOM()"
res = c.execute(q)
artist_track_pair = res.fetchone()
print '* one artist that has only one track in the dataset:'
print artist_track_pair



In [ ]:

    
# get artists with no musicbrainz ID
# of course, we want only once each artist
# for demo purpose, we ask for only two at RANDOM
q = "SELECT artist_id,artist_mbid FROM songs WHERE artist_mbid=''"
q += " GROUP BY artist_id ORDER BY RANDOM() LIMIT 2"
res = c.execute(q)
print '* two random unique artists with no musicbrainz ID:'
print res.fetchall()



In [ ]:

    
print '*************** DEMOS AROUND NAMES ****************************'

# get all tracks by artist The Beatles
# artist name must be exact!
# the encode_string function simply deals with ' (by doubling them)
# and add ' after and before the string.
q = "SELECT track_id FROM songs WHERE artist_name="
q += encode_string('The Beatles')
res = c.execute(q)
print "* two track id from 'The Beatles', found by looking up the artist by name:"
print res.fetchall()[:2]



In [ ]:

    
# we find all release starting by letter 'T'
# T != t, we're just looking at albums starting with capital T
# here we use DISTINCT instead of GROUP BY artist_id
# since its fine that we find twice the same artist, as long as it is not
# the same (artist,release) pair
q = "SELECT DISTINCT artist_name,release FROM songs WHERE SUBSTR(release,1,1)='T'"
res = c.execute(q)
print '* one unique artist/release pair where album starts with capital T:'
print res.fetchone()



In [ ]:

    
# get all artists whose artist familiarity is > .8
q = "SELECT DISTINCT artist_name, artist_familiarity FROM songs WHERE artist_familiarity>.8"
res = c.execute(q)
print '* one artist having familiaryt >0.8:'
print res.fetchone()



In [ ]:

    
# get one artist with the highest artist_familiarity but no artist_hotttnesss
# notice the alias af and ah, makes things more readable
q = "SELECT DISTINCT artist_name, artist_familiarity as af, artist_hotttnesss as ah"
q += " FROM songs WHERE ah<0 ORDER BY af"
res = c.execute(q)
print '* get the artist with the highest familiarity that has no computed hotttnesss:'
print res.fetchone()



In [ ]:

    
# query for all the Track_ids
q = "SELECT track_id FROM " + TABLENAME
res = c.execute(q)
tracks = res.fetchall() # does the actual job of searching the db
print '* found',len(tracks),'Track IDs, response looks like:'
print tracks[:3]



In [7]:

    
# close the cursor and the connection
# (if for some reason you added stuff to the db or alter
#  a table, you need to also do a conn.commit())
c.close()
conn.close()