In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from collections import defaultdict
import apriori

In [2]:
f = open('speakers.txt', 'r')
lines = f.readlines()
f.close()

In [3]:
names = []
skills = []
votes = []
for line in lines:
    if line.find('#') != 0:
        if line.find('\t') == 0:
            arr = line.strip().split('\t')
            skill = arr[0]
            vote = arr[1]
            names.append(name)
            skills.append(skill)
            votes.append(vote)
        else:
            name = line.strip().title()

df = pd.DataFrame({'name': names, 'skill': skills, 'votes': votes})

In [4]:
gb = df.groupby(['skill'])['name'].count()
gb.sort(ascending=0)
plt.figure(figsize=(15,5))
x = np.arange(len(gb))
plt.plot(x, gb)
plt.ylabel('Number of speakers', fontsize=14)
plt.xlabel('Order of popularity', fontsize=14)
plt.show()



In [5]:
gb = df.groupby(['skill'])['name'].count()
gb.sort(ascending=0)
plt.figure(figsize=(10,8))
gbs = gb[0:15]
x = np.arange(len(gbs))
plt.barh(x, gbs)
plt.yticks(x+0.4, gbs.index, fontsize=14)
plt.xlabel('Order of popularity', fontsize=14)
plt.show()



In [6]:
g1 = df[df.skill=='R']['name']
g2 = df[df.skill=='Machine Learning']['name']
set(g1).intersection(set(g2))


Out[6]:
{'Benjamin Uminsky',
 'Gian Gonzanga\tH',
 'Jim Mcguire',
 'Kyle Polich',
 'Szilard Pafka'}

In [7]:
len(set(df['name']))


Out[7]:
35

In [8]:
s = ''
for n in g1:
    s = s + n + ', '
print s
s = ''
for n in g2:
    s = s + n + ', '
print s


Amelia Mcnamara, Benjamin Uminsky, Gian Gonzanga	H, Jim Mcguire, Kyle Polich, Szilard Pafka, 
Benjamin Uminsky, Brian Kursar, Gian Gonzanga	H, Jim Mcguire, Kyle Polich, Szilard Pafka, Ulas Bardak, 

In [ ]:


In [9]:
skills = []
for name in set(df.name):
    speaker = df[df.name==name]['skill'].tolist()
    skills.append(speaker)

In [ ]:


In [14]:
ap = apriori.apriori(skills, 0.05)
ex = pd.DataFrame(ap)
ex


Out[14]:
count items set support
0 14 set([Hadoop]) set([1, 2, 4, 5, 6, 9, 11, 12, 34, 22, 23, 25,... 0.080460
1 11 set([Distributed Systems]) set([1, 2, 4, 6, 9, 11, 12, 20, 23, 25, 26]) 0.063218
2 12 set([Java]) set([1, 2, 4, 6, 9, 12, 13, 19, 20, 22, 23, 28]) 0.068966
3 9 set([Software Development]) set([34, 3, 11, 12, 18, 20, 23, 28, 30]) 0.051724
4 13 set([Big Data]) set([33, 2, 4, 9, 11, 12, 34, 14, 21, 23, 24, ... 0.074713
5 8 set([Hadoop, Big Data]) set([Hadoop, Big Data]) 0.228571
6 5 set([Java, Big Data]) set([Java, Big Data]) 0.142857
7 4 set([Software Development, Java]) set([Software Development, Java]) 0.114286
8 10 set([Hadoop, Distributed Systems]) set([Hadoop, Distributed Systems]) 0.285714
9 7 set([Big Data, Distributed Systems]) set([Big Data, Distributed Systems]) 0.200000
10 8 set([Java, Distributed Systems]) set([Java, Distributed Systems]) 0.228571
11 8 set([Java, Hadoop]) set([Java, Hadoop]) 0.228571
12 4 set([Software Development, Hadoop]) set([Software Development, Hadoop]) 0.114286
13 4 set([Software Development, Distributed Systems]) set([Software Development, Distributed Systems]) 0.114286
14 4 set([Software Development, Big Data]) set([Software Development, Big Data]) 0.114286
15 5 set([Java, Hadoop, Big Data]) set([Java, Hadoop, Big Data]) 0.142857
16 2 set([Software Development, Java, Big Data]) set([Software Development, Java, Big Data]) 0.057143
17 7 set([Java, Hadoop, Distributed Systems]) set([Java, Hadoop, Distributed Systems]) 0.200000
18 3 set([Software Development, Hadoop, Distributed... set([Software Development, Hadoop, Distributed... 0.085714
19 3 set([Software Development, Big Data, Distribut... set([Software Development, Big Data, Distribut... 0.085714
20 3 set([Software Development, Java, Distributed S... set([Software Development, Java, Distributed S... 0.085714
21 2 set([Software Development, Java, Hadoop]) set([Software Development, Java, Hadoop]) 0.057143
22 3 set([Software Development, Distributed Systems... set([Software Development, Distributed Systems... 0.085714
23 7 set([Big Data, Hadoop, Distributed Systems]) set([Big Data, Hadoop, Distributed Systems]) 0.200000
24 7 set([Distributed Systems, Hadoop, Big Data]) set([Distributed Systems, Hadoop, Big Data]) 0.200000
25 5 set([Java, Distributed Systems, Big Data]) set([Java, Distributed Systems, Big Data]) 0.142857
26 4 set([Software Development, Hadoop, Big Data]) set([Software Development, Hadoop, Big Data]) 0.114286
27 3 set([Software Development, Big Data, Hadoop, D... set([Software Development, Big Data, Hadoop, D... 0.085714
28 2 set([Software Development, Java, Hadoop, Distr... set([Software Development, Java, Hadoop, Distr... 0.057143
29 2 set([Software Development, Java, Big Data, Dis... set([Software Development, Java, Big Data, Dis... 0.057143
30 2 set([Software Development, Java, Big Data, Had... set([Software Development, Java, Big Data, Had... 0.057143
31 5 set([Java, Big Data, Hadoop, Distributed Syste... set([Java, Big Data, Hadoop, Distributed Syste... 0.142857
32 2 set([Software Development, Java, Big Data, Had... set([Software Development, Java, Big Data, Had... 0.057143

In [16]:
ex[0:8][['items', 'count', 'support']]


Out[16]:
items count support
0 set([Hadoop]) 14 0.080460
1 set([Distributed Systems]) 11 0.063218
2 set([Java]) 12 0.068966
3 set([Software Development]) 9 0.051724
4 set([Big Data]) 13 0.074713
5 set([Hadoop, Big Data]) 8 0.228571
6 set([Java, Big Data]) 5 0.142857
7 set([Software Development, Java]) 4 0.114286

In [23]:
g1 = df[df.skill=='Hadoop']['name']
g2 = df[df.skill=='Distributed Systems']['name']
g3 = df[df.skill=='Java']['name']
g4 = df[df.skill=='Big Data']['name']
(set(g1).intersection(set(g2)).intersection(set(g3)).intersection(set(g4)))


Out[23]:
{'Alan Gates',
 'Ashish Singh',
 'Jonathan Gray',
 'Michael Stack',
 'Vinayak Borkar'}

In [25]:
minimums = np.arange(50) * .01 + .01
times = []
for m in minimums:
    start = time.time()
    ap = apriori.apriori(skills, m)
    end = time.time()
    dur = end-start
    times.append((m, dur, len(ap)))

In [34]:
benchmark = pd.DataFrame(times)
benchmark.columns = ['min_sensitivity', 'duration', 'rules']

#plt.plot(benchmark.min_sensitivity, benchmark.rules)
plt.plot(benchmark.min_sensitivity, benchmark.duration, linewidth=2)
plt.xlim([0.05, .5])
plt.ylim([0, .001])
plt.xlabel('sensitivity')
plt.ylabel('duration (seconds)')
plt.show()



In [35]:
plt.plot(benchmark.min_sensitivity, benchmark.rules, linewidth=2)
plt.xlim([0.01, .5])
plt.ylim([0, 800])
plt.xlabel('sensitivity')
plt.ylabel('# of rules')
plt.show()