notebook.community

Edit and run



In [1]:

    
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from collections import defaultdict
import apriori



In [2]:

    
f = open('speakers.txt', 'r')
lines = f.readlines()
f.close()



In [3]:

    
names = []
skills = []
votes = []
for line in lines:
    if line.find('#') != 0:
        if line.find('\t') == 0:
            arr = line.strip().split('\t')
            skill = arr[0]
            vote = arr[1]
            names.append(name)
            skills.append(skill)
            votes.append(vote)
        else:
            name = line.strip().title()

df = pd.DataFrame({'name': names, 'skill': skills, 'votes': votes})



In [4]:

    
gb = df.groupby(['skill'])['name'].count()
gb.sort(ascending=0)
plt.figure(figsize=(15,5))
x = np.arange(len(gb))
plt.plot(x, gb)
plt.ylabel('Number of speakers', fontsize=14)
plt.xlabel('Order of popularity', fontsize=14)
plt.show()



In [5]:

    
gb = df.groupby(['skill'])['name'].count()
gb.sort(ascending=0)
plt.figure(figsize=(10,8))
gbs = gb[0:15]
x = np.arange(len(gbs))
plt.barh(x, gbs)
plt.yticks(x+0.4, gbs.index, fontsize=14)
plt.xlabel('Order of popularity', fontsize=14)
plt.show()



In [6]:

    
g1 = df[df.skill=='R']['name']
g2 = df[df.skill=='Machine Learning']['name']
set(g1).intersection(set(g2))









    Out[6]:





{'Benjamin Uminsky',
 'Gian Gonzanga\tH',
 'Jim Mcguire',
 'Kyle Polich',
 'Szilard Pafka'}



In [7]:

    
len(set(df['name']))









    Out[7]:





35



In [8]:

    
s = ''
for n in g1:
    s = s + n + ', '
print s
s = ''
for n in g2:
    s = s + n + ', '
print s









    



Amelia Mcnamara, Benjamin Uminsky, Gian Gonzanga	H, Jim Mcguire, Kyle Polich, Szilard Pafka, 
Benjamin Uminsky, Brian Kursar, Gian Gonzanga	H, Jim Mcguire, Kyle Polich, Szilard Pafka, Ulas Bardak,



In [ ]:



In [9]:

    
skills = []
for name in set(df.name):
    speaker = df[df.name==name]['skill'].tolist()
    skills.append(speaker)



In [ ]:



In [14]:

    
ap = apriori.apriori(skills, 0.05)
ex = pd.DataFrame(ap)
ex









    Out[14]:






  
    
      
      count
      items
      set
      support
    
  
  
    
      0
      14
      set([Hadoop])
      set([1, 2, 4, 5, 6, 9, 11, 12, 34, 22, 23, 25,...
      0.080460
    
    
      1
      11
      set([Distributed Systems])
      set([1, 2, 4, 6, 9, 11, 12, 20, 23, 25, 26])
      0.063218
    
    
      2
      12
      set([Java])
      set([1, 2, 4, 6, 9, 12, 13, 19, 20, 22, 23, 28])
      0.068966
    
    
      3
      9
      set([Software Development])
      set([34, 3, 11, 12, 18, 20, 23, 28, 30])
      0.051724
    
    
      4
      13
      set([Big Data])
      set([33, 2, 4, 9, 11, 12, 34, 14, 21, 23, 24, ...
      0.074713
    
    
      5
      8
      set([Hadoop, Big Data])
      set([Hadoop, Big Data])
      0.228571
    
    
      6
      5
      set([Java, Big Data])
      set([Java, Big Data])
      0.142857
    
    
      7
      4
      set([Software Development, Java])
      set([Software Development, Java])
      0.114286
    
    
      8
      10
      set([Hadoop, Distributed Systems])
      set([Hadoop, Distributed Systems])
      0.285714
    
    
      9
      7
      set([Big Data, Distributed Systems])
      set([Big Data, Distributed Systems])
      0.200000
    
    
      10
      8
      set([Java, Distributed Systems])
      set([Java, Distributed Systems])
      0.228571
    
    
      11
      8
      set([Java, Hadoop])
      set([Java, Hadoop])
      0.228571
    
    
      12
      4
      set([Software Development, Hadoop])
      set([Software Development, Hadoop])
      0.114286
    
    
      13
      4
      set([Software Development, Distributed Systems])
      set([Software Development, Distributed Systems])
      0.114286
    
    
      14
      4
      set([Software Development, Big Data])
      set([Software Development, Big Data])
      0.114286
    
    
      15
      5
      set([Java, Hadoop, Big Data])
      set([Java, Hadoop, Big Data])
      0.142857
    
    
      16
      2
      set([Software Development, Java, Big Data])
      set([Software Development, Java, Big Data])
      0.057143
    
    
      17
      7
      set([Java, Hadoop, Distributed Systems])
      set([Java, Hadoop, Distributed Systems])
      0.200000
    
    
      18
      3
      set([Software Development, Hadoop, Distributed...
      set([Software Development, Hadoop, Distributed...
      0.085714
    
    
      19
      3
      set([Software Development, Big Data, Distribut...
      set([Software Development, Big Data, Distribut...
      0.085714
    
    
      20
      3
      set([Software Development, Java, Distributed S...
      set([Software Development, Java, Distributed S...
      0.085714
    
    
      21
      2
      set([Software Development, Java, Hadoop])
      set([Software Development, Java, Hadoop])
      0.057143
    
    
      22
      3
      set([Software Development, Distributed Systems...
      set([Software Development, Distributed Systems...
      0.085714
    
    
      23
      7
      set([Big Data, Hadoop, Distributed Systems])
      set([Big Data, Hadoop, Distributed Systems])
      0.200000
    
    
      24
      7
      set([Distributed Systems, Hadoop, Big Data])
      set([Distributed Systems, Hadoop, Big Data])
      0.200000
    
    
      25
      5
      set([Java, Distributed Systems, Big Data])
      set([Java, Distributed Systems, Big Data])
      0.142857
    
    
      26
      4
      set([Software Development, Hadoop, Big Data])
      set([Software Development, Hadoop, Big Data])
      0.114286
    
    
      27
      3
      set([Software Development, Big Data, Hadoop, D...
      set([Software Development, Big Data, Hadoop, D...
      0.085714
    
    
      28
      2
      set([Software Development, Java, Hadoop, Distr...
      set([Software Development, Java, Hadoop, Distr...
      0.057143
    
    
      29
      2
      set([Software Development, Java, Big Data, Dis...
      set([Software Development, Java, Big Data, Dis...
      0.057143
    
    
      30
      2
      set([Software Development, Java, Big Data, Had...
      set([Software Development, Java, Big Data, Had...
      0.057143
    
    
      31
      5
      set([Java, Big Data, Hadoop, Distributed Syste...
      set([Java, Big Data, Hadoop, Distributed Syste...
      0.142857
    
    
      32
      2
      set([Software Development, Java, Big Data, Had...
      set([Software Development, Java, Big Data, Had...
      0.057143



In [16]:

    
ex[0:8][['items', 'count', 'support']]









    Out[16]:






  
    
      
      items
      count
      support
    
  
  
    
      0
      set([Hadoop])
      14
      0.080460
    
    
      1
      set([Distributed Systems])
      11
      0.063218
    
    
      2
      set([Java])
      12
      0.068966
    
    
      3
      set([Software Development])
      9
      0.051724
    
    
      4
      set([Big Data])
      13
      0.074713
    
    
      5
      set([Hadoop, Big Data])
      8
      0.228571
    
    
      6
      set([Java, Big Data])
      5
      0.142857
    
    
      7
      set([Software Development, Java])
      4
      0.114286



In [23]:

    
g1 = df[df.skill=='Hadoop']['name']
g2 = df[df.skill=='Distributed Systems']['name']
g3 = df[df.skill=='Java']['name']
g4 = df[df.skill=='Big Data']['name']
(set(g1).intersection(set(g2)).intersection(set(g3)).intersection(set(g4)))









    Out[23]:





{'Alan Gates',
 'Ashish Singh',
 'Jonathan Gray',
 'Michael Stack',
 'Vinayak Borkar'}



In [25]:

    
minimums = np.arange(50) * .01 + .01
times = []
for m in minimums:
    start = time.time()
    ap = apriori.apriori(skills, m)
    end = time.time()
    dur = end-start
    times.append((m, dur, len(ap)))



In [34]:

    
benchmark = pd.DataFrame(times)
benchmark.columns = ['min_sensitivity', 'duration', 'rules']

#plt.plot(benchmark.min_sensitivity, benchmark.rules)
plt.plot(benchmark.min_sensitivity, benchmark.duration, linewidth=2)
plt.xlim([0.05, .5])
plt.ylim([0, .001])
plt.xlabel('sensitivity')
plt.ylabel('duration (seconds)')
plt.show()



In [35]:

    
plt.plot(benchmark.min_sensitivity, benchmark.rules, linewidth=2)
plt.xlim([0.01, .5])
plt.ylim([0, 800])
plt.xlabel('sensitivity')
plt.ylabel('# of rules')
plt.show()

	count	items	set	support
0	14	set([Hadoop])	set([1, 2, 4, 5, 6, 9, 11, 12, 34, 22, 23, 25,...	0.080460
1	11	set([Distributed Systems])	set([1, 2, 4, 6, 9, 11, 12, 20, 23, 25, 26])	0.063218
2	12	set([Java])	set([1, 2, 4, 6, 9, 12, 13, 19, 20, 22, 23, 28])	0.068966
3	9	set([Software Development])	set([34, 3, 11, 12, 18, 20, 23, 28, 30])	0.051724
4	13	set([Big Data])	set([33, 2, 4, 9, 11, 12, 34, 14, 21, 23, 24, ...	0.074713
5	8	set([Hadoop, Big Data])	set([Hadoop, Big Data])	0.228571
6	5	set([Java, Big Data])	set([Java, Big Data])	0.142857
7	4	set([Software Development, Java])	set([Software Development, Java])	0.114286
8	10	set([Hadoop, Distributed Systems])	set([Hadoop, Distributed Systems])	0.285714
9	7	set([Big Data, Distributed Systems])	set([Big Data, Distributed Systems])	0.200000
10	8	set([Java, Distributed Systems])	set([Java, Distributed Systems])	0.228571
11	8	set([Java, Hadoop])	set([Java, Hadoop])	0.228571
12	4	set([Software Development, Hadoop])	set([Software Development, Hadoop])	0.114286
13	4	set([Software Development, Distributed Systems])	set([Software Development, Distributed Systems])	0.114286
14	4	set([Software Development, Big Data])	set([Software Development, Big Data])	0.114286
15	5	set([Java, Hadoop, Big Data])	set([Java, Hadoop, Big Data])	0.142857
16	2	set([Software Development, Java, Big Data])	set([Software Development, Java, Big Data])	0.057143
17	7	set([Java, Hadoop, Distributed Systems])	set([Java, Hadoop, Distributed Systems])	0.200000
18	3	set([Software Development, Hadoop, Distributed...	set([Software Development, Hadoop, Distributed...	0.085714
19	3	set([Software Development, Big Data, Distribut...	set([Software Development, Big Data, Distribut...	0.085714
20	3	set([Software Development, Java, Distributed S...	set([Software Development, Java, Distributed S...	0.085714
21	2	set([Software Development, Java, Hadoop])	set([Software Development, Java, Hadoop])	0.057143
22	3	set([Software Development, Distributed Systems...	set([Software Development, Distributed Systems...	0.085714
23	7	set([Big Data, Hadoop, Distributed Systems])	set([Big Data, Hadoop, Distributed Systems])	0.200000
24	7	set([Distributed Systems, Hadoop, Big Data])	set([Distributed Systems, Hadoop, Big Data])	0.200000
25	5	set([Java, Distributed Systems, Big Data])	set([Java, Distributed Systems, Big Data])	0.142857
26	4	set([Software Development, Hadoop, Big Data])	set([Software Development, Hadoop, Big Data])	0.114286
27	3	set([Software Development, Big Data, Hadoop, D...	set([Software Development, Big Data, Hadoop, D...	0.085714
28	2	set([Software Development, Java, Hadoop, Distr...	set([Software Development, Java, Hadoop, Distr...	0.057143
29	2	set([Software Development, Java, Big Data, Dis...	set([Software Development, Java, Big Data, Dis...	0.057143
30	2	set([Software Development, Java, Big Data, Had...	set([Software Development, Java, Big Data, Had...	0.057143
31	5	set([Java, Big Data, Hadoop, Distributed Syste...	set([Java, Big Data, Hadoop, Distributed Syste...	0.142857
32	2	set([Software Development, Java, Big Data, Had...	set([Software Development, Java, Big Data, Had...	0.057143