In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import sqlite3
import pandas as pd
import numpy as np
import time
import apriori
To make this example easier to reproduce, I used the SQLite version of the dataset (`artist_term.db`) rather than the full Hive implementation. That database is not included in my repo; to retrieve it for your own study, download it via the link below and place it in the notebook's working directory.
http://labrosa.ee.columbia.edu/millionsong/pages/getting-dataset
In [2]:
# Build one transaction (a list of tags) per distinct id found in the
# `artist_mbtag` table. Column 0 of each row is treated as the grouping id
# (presumably the artist id) and column 1 as the tag.
# Result: `tagss`, a list of tag lists ordered by first appearance of each id.
conn = sqlite3.connect('artist_term.db')
try:
    tags_by_id = {}  # id -> tags collected for that id
    id_order = []    # ids in first-seen order (plain dicts are unordered on older Pythons)
    for row in conn.execute("SELECT * from artist_mbtag"):
        artist_id, tag = row[0], row[1]
        if artist_id not in tags_by_id:
            tags_by_id[artist_id] = []
            id_order.append(artist_id)
        # Grouping by key (rather than "id changed since the previous row")
        # stays correct even if rows for one id are not contiguous; the query
        # has no ORDER BY, so contiguity is not guaranteed.
        tags_by_id[artist_id].append(tag)
finally:
    # Release the database handle even if the query raises.
    conn.close()

# An empty table produces an empty list, not a single empty transaction.
tagss = [tags_by_id[artist_id] for artist_id in id_order]
In [3]:
# Thresholds to sweep: 50 evenly spaced values from 0.001 to 0.050.
sensitivities = 0.001 + 0.001 * np.arange(50)
sensitivities
Out[3]:
In [5]:
# Sweep the thresholds: time one full apriori run per value and record how
# many entries each run returns.
tlist = []  # wall-clock duration of each run, in seconds
ilist = []  # len() of each apriori result
for threshold in sensitivities:
    # The second argument is presumably the minimum-support threshold
    # (apriori is a project-local module) - TODO confirm.
    t0 = time.time()
    ap = apriori.apriori(tagss, threshold)
    elapsed = time.time() - t0
    tlist.append(elapsed)
    ilist.append(len(ap))
In [6]:
# Large figure: size of the apriori result as a function of the threshold.
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(sensitivities, ilist)
ax.set_xlabel('sensitivity', fontsize=22)
ax.set_ylabel('Number of itemsets found', fontsize=22)
plt.show()
In [7]:
# Run time of each apriori call as a function of the threshold.
fig, ax = plt.subplots()
ax.plot(sensitivities, tlist)
ax.set(xlabel='sensitivity', ylabel='duration (seconds)')
plt.show()
In [8]:
# Default-size plot of how many entries apriori returned at each threshold.
# `ilist` holds len() of each apriori result, and those results are itemsets
# (each entry carries 'items' and 'support'), not association rules, so the
# y-axis is labelled as itemsets.
fig, ax = plt.subplots()
ax.plot(sensitivities, ilist)
ax.set_xlabel('sensitivity')
ax.set_ylabel('# of itemsets')
plt.show()
In [10]:
# Mining run whose result (`ap`) the remaining cells analyse. The threshold
# is half the smallest value used in the sweep above.
threshold = 0.0005
ap = apriori.apriori(tagss, threshold, verbose=True)
In [11]:
# Flatten the apriori result into a table with one row per entry:
#   support   - the entry's 'support' value
#   name      - str() of the entry's 'items' collection
#   itemcount - number of items in the entry
supports = [entry['support'] for entry in ap]
names = [str(entry['items']) for entry in ap]
items = [len(entry['items']) for entry in ap]
df = pd.DataFrame({'support': supports, 'name': names, 'itemcount': items})
In [12]:
# Order itemsets from most to least supported.
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
df = df.sort_values('support', ascending=False)
In [13]:
# Print the first ten 6-item itemsets (in the current row order of `df`) as
# plain comma-separated text.
def _strip_set_repr(text):
    """Remove the ``set([u'...'])`` wrapper and quote characters from ``text``.

    The tokens match how Python 2 renders a set of unicode strings; other
    renderings are only partially stripped.
    """
    for token in ('set([', '])', "u'", "'"):
        text = text.replace(token, '')
    return text

top_names = df[df.itemcount == 6].head(10)['name']
itemsets = [_strip_set_repr(name) for name in top_names]
for itemset in itemsets:
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(itemset)
In [14]:
# For every mined itemset that strictly contains all tags in `choice`, list the
# remaining tags together with:
#   support    - the 'support' value of the full itemset
#   confidence - count(full itemset) / count(remaining tags), i.e. the
#                confidence of the rule "remaining tags => choice"
choice = ['trance']
chosen = set(choice)  # set semantics: duplicate tags in `choice` are harmless

# Build an itemset -> count lookup once, instead of rescanning `ap` for every
# candidate. If an itemset appears more than once, the last entry wins, as it
# did with the linear scan.
count_of = {frozenset(entry['items']): entry['count'] for entry in ap}

supports = []
itemsets = []
confidence = []
for entry in ap:
    members = entry['items']
    if chosen.issubset(members) and len(members) > len(chosen):
        itemset = [tag for tag in members if tag not in chosen]
        supports.append(entry['support'])
        itemsets.append(itemset)
        denom = count_of.get(frozenset(itemset))
        # A missing (or zero) antecedent count gives NaN rather than silently
        # reusing the denominator left over from a previous iteration.
        confidence.append(1.0 * entry['count'] / denom if denom else float('nan'))

df2 = pd.DataFrame({'support': supports, 'itemset': itemsets, 'confidence': confidence})
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
df2 = df2.sort_values('confidence', ascending=False)
df2 = df2[['itemset', 'support', 'confidence']]
df2
Out[14]: