Use the online browsing behavior dataset "browsing.txt". Each line represents a browsing session of a customer. On each line, each string of 8 characters represents the id of an item browsed during that session. The items are separated by spaces.
List the top 15 rules with corresponding confidence scores in decreasing order of confidence score for itemsets of size 2. Then list the top 15 rules with corresponding confidence scores in decreasing order of confidence score for itemsets of size 3. A rule is of the form: (item1, item2) ⇒ item3.
In [1]:
from __future__ import division
import itertools
import operator
from sys import argv
In [2]:
support = 99
mappings = []
itemCounts = []
transactions = 0
Count all items
In [3]:
data = open("browsing.txt","r")
for basket in data:
#print basket
transactions += 1
for item in set(basket.split()):
#print item
if item not in mappings:
mappings.append(item)
itemCounts.append(1)
else:
indexItem = mappings.index(item)
counter = itemCounts[indexItem]
counter += 1
itemCounts[indexItem] = counter
data.close()
Get frequent items
In [4]:
frequentItems = [mappings.index(item) for item in mappings \
if itemCounts[mappings.index(item)] > support]
Get all candidate pairs (all combination pairs of frequent items).
In [5]:
candidatePairs = {}
for pair in itertools.combinations(sorted(frequentItems),2):
candidatePairs[pair] = 0
Get counts for all candidate pairs.
In [6]:
data = open("browsing.txt","r")
for basket in data:
fitems = sorted( [ mappings.index(item) for item in set(basket.split()) ] )
# Generate pairs for them and update counts
for pair in itertools.combinations(fitems,2):
if pair in candidatePairs:
count = candidatePairs[pair]
count += 1
candidatePairs[pair] = count
data.close()
Get all frequent pairs
In [7]:
frequentPairs = sorted([k for k,v in candidatePairs.iteritems() if v > support])
Generate candidate triples by frequentPairs JOIN frequentPairs
In [8]:
candidateTriples = {}
allCandidateTriples = []
for fcPair in frequentPairs:
for jp in [joinPair for joinPair in frequentPairs \
if joinPair[0] == fcPair[1]]:
allCandidateTriples.append( (fcPair[0],fcPair[1],jp[1]) )
Prune non frequent candidate triples
In [9]:
for candidate in allCandidateTriples:
whatAboutIt = True
for pair in itertools.combinations(candidate,2):
if pair not in frequentPairs:
whatAboutIt = False
break
if whatAboutIt:
candidateTriples[candidate] = 0
Get count for candidate triples
In [10]:
data = open("browsing.txt","r")
for basket in data:
items = sorted([mappings.index(item) for item in set(basket.split())])
fPair = []
for triple in itertools.combinations(items,3):
if triple in candidateTriples:
tripleCount = candidateTriples[triple]
tripleCount = tripleCount +1
candidateTriples[triple] = tripleCount
data.close()
Get frequent triples
In [11]:
frequentTriples = sorted ([k for k,v in candidateTriples.iteritems() if v > support])
Generating Rules for confidence
In [12]:
def confidence(I,J):
# Calculate P(IJ)
PIJ = 0
IJ = set(I).union(set(J))
if len(IJ) == 2:
PIJ = candidatePairs[tuple(sorted(IJ))]
elif len(IJ) == 3:
PIJ = candidateTriples[tuple(sorted(IJ))]
#Calculate P(I)
PI = 0
if len(I) == 1:
PI = itemCounts[I[0]]
elif len(I) == 2:
PI = candidatePairs[tuple(sorted(I))]
if PIJ > PI:
print I, J, IJ
print PIJ, PI, PIJ / PI
return PIJ / PI
Frequent pairs by confidence
In [13]:
pairRules = {}
for pair in frequentPairs:
pairRules[pair]=confidence( (pair[0],),(pair[1],) )
pairRules[(pair[1],pair[0])] = confidence( (pair[1],),(pair[0],) )
Frequent triples by confidence
In [14]:
tripleRules = {}
for triple in frequentTriples:
for pair in itertools.combinations(triple,2):
item2 = tuple(set(triple).difference(set(pair)))
tripleRules[(pair,item2)] = confidence(pair,item2)
Final sort rules and get top 15 desc
In [15]:
cp = sorted(pairRules.iteritems(), key = operator.itemgetter(1))
cp.reverse()
cp5 = [ "%s-->%s %s" % (mappings[rule[0][0]],mappings[rule[0][1]],rule[1])\
for rule in cp[0:15] ]
print 'Top 15 pairs by confidence:'
print "\n".join(cp5)
In [16]:
ct = sorted(tripleRules.iteritems(), key = operator.itemgetter(1))
ct.reverse()
ct5 = [ "{%s,%s}-->%s %s" % (mappings[rule[0][0][0]], \
mappings[rule[0][0][1]], \
mappings[rule[0][1][0]], \
rule[1])\
for rule in ct[0:15] ]
print 'Top 15 triples by confidence:'
print "\n".join(ct5)
Generating Rules for lift
In [17]:
def lift(J,conf):
if isinstance(J, tuple):
suppJ = itemCounts[J[0]]
else:
suppJ = itemCounts[J]
SJ = suppJ / transactions
return conf / SJ
In [18]:
liftedPairRules = { k:lift(k[1],v) for k,v in pairRules.iteritems()}
lp = sorted(liftedPairRules.iteritems(), key = operator.itemgetter(1))
lp.reverse()
lp5 = [ "%s-->%s %s" % (mappings[rule[0][0]],mappings[rule[0][1]],rule[1])\
for rule in lp[0:15] ]
print 'Top 15 pairs by lift:'
print "\n".join(lp5)
In [19]:
liftedTripleRules = { k:lift(k[1],v) for k,v in tripleRules.iteritems()}
lt = sorted(liftedTripleRules.iteritems(), key = operator.itemgetter(1))
lt.reverse()
lt5 = [ "{%s,%s}-->%s %s" % (mappings[rule[0][0][0]], \
mappings[rule[0][0][1]], \
mappings[rule[0][1][0]], \
rule[1])\
for rule in lt[0:15] ]
print 'Top 15 triples by lift:'
print "\n".join(lt5)
Generating Rules for conviction
In [20]:
def conv(J,conf):
if isinstance(J, tuple):
suppJ = itemCounts[J[0]]
else:
suppJ = itemCounts[J]
SJ = suppJ / transactions
conv = float('inf')
if not conf == 1:
conv = (1 - SJ)/(1 - conf)
return conv
In [21]:
convictedPairRules = { k:conv(k[1],v) for k,v in pairRules.iteritems()}
convp = sorted(convictedPairRules.iteritems(), key = operator.itemgetter(1))
convp.reverse()
convp5 = [ "%s-->%s %s" % (mappings[rule[0][0]],mappings[rule[0][1]],rule[1])\
for rule in convp[0:15] ]
print 'Top 15 pairs by conviction:'
print "\n".join(convp5)
In [22]:
convictedTripleRules = { k:conv(k[1],v) for k,v in tripleRules.iteritems()}
convt = sorted(convictedTripleRules.iteritems(), key = operator.itemgetter(1))
convt.reverse()
convt5 = [ "{%s,%s}-->%s %s" % (mappings[rule[0][0][0]], \
mappings[rule[0][0][1]], \
mappings[rule[0][1][0]], \
rule[1])\
for rule in convt[0:15] ]
print 'Top 15 triples by conviction:'
print "\n".join(convt5)
In [ ]: