In [2]:
import numpy as np
import pandas as pd
import datetime
from bokeh.charts import Scatter, Bar, show, output_notebook
output_notebook()
data = pd.read_csv('../input/3-Airplane_Crashes_Since_1908.txt',sep=',')
data.sample()

# Return a bunch of tuples with the Zodiac and its Start/End Dates
def chinese_zodaics():
    start_date = pd.to_datetime("2/2/1908")
    end_date = pd.to_datetime("7/1/2009")
    animals = ['Monkey', 'Rooster', 'Dog', 'Pig', 'Rat', 'Ox', 'Tiger', 'Rabbit', 'Dragon', 'Snake', 'Horse', 'Goat']
    zodiacs = []
    while start_date < end_date:
        for a in animals:    
            year_start = start_date
            year_end = year_start + pd.DateOffset(days=365)
            z = (a, start_date, year_end)
            zodiacs.append(z)
            start_date = year_end
    return zodiacs 

zodiacs = chinese_zodaics()

# Apply the zodiacs to the accident dates
def match_zodiac(date):
    for z in zodiacs: 
        animal, start, end, = z[0], z[1], z[2]
        if start <= date <= end:
            return animal
        
data.Date = pd.to_datetime(data.Date)
data['Zodiac'] = data.Date.apply(match_zodiac)
data['Year'] = pd.DatetimeIndex(data['Date']).year
data = data[['Zodiac', 'Year', 'Fatalities', 'Aboard']].dropna()
data = data[data.Fatalities > 1]
data.sample(5)
# Return a bunch of tuples with the Zodiac and its Start/End Dates
def chinese_zodaics():
    start_date = pd.to_datetime("2/2/1908")
    end_date = pd.to_datetime("7/1/2009")
    animals = ['Monkey', 'Rooster', 'Dog', 'Pig', 'Rat', 'Ox', 'Tiger', 'Rabbit', 'Dragon', 'Snake', 'Horse', 'Goat']
    zodiacs = []
    while start_date < end_date:
        for a in animals:    
            year_start = start_date
            year_end = year_start + pd.DateOffset(days=365)
            z = (a, start_date, year_end)
            zodiacs.append(z)
            start_date = year_end
    return zodiacs 

zodiacs = chinese_zodaics()

# Apply the zodiacs to the accident dates
def match_zodiac(date):
    for z in zodiacs: 
        animal, start, end, = z[0], z[1], z[2]
        if start <= date <= end:
            return animal
        
data.Date = pd.to_datetime(data.Date)
data['Zodiac'] = data.Date.apply(match_zodiac)
data['Year'] = pd.DatetimeIndex(data['Date']).year
data = data[['Zodiac', 'Year', 'Fatalities', 'Aboard']].dropna()
data = data[data.Fatalities > 1]
data.sample(5)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-7b6f69929b9e> in <module>()
     24             return animal
     25 
---> 26 data.Date = pd.to_datetime(data.Date)
     27 data['Zodiac'] = data.Date.apply(match_zodiac)
     28 data['Year'] = pd.DatetimeIndex(data['Date']).year

NameError: name 'data' is not defined

In [ ]:


In [13]:
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 04 22:25:57 2013

@author: Administrator
"""
import codecs

def loadDataSet():
 return [[1,3,4],[2,3,5],[1,2,3,5],[2,5]]
 f = codecs.open("d:\\exercise\\associationrules.txt", encoding='utf-16')
#content = f.readlines()
#print content
 result = list()  
 Array1=[]
 ArraySub=[]
 str_prevLine=''
 i=0
 for line in f.readlines():
#    line = line.strip()
    #print line
    
    
    if not len(line) or line.startswith('#'):       #判断是否是空行或注释行  
        continue                                    #是的话,跳过不处理  
    parts = line.split(',')
    #print parts
    if str_prevLine==parts[0] :
       ArraySub.append(parts[1])
    else:
       Array1.append(ArraySub)
       ArraySub=[]
       ArraySub.append(parts[1])
   
    #Array1.append(ArraySub)
    col1=parts[0]
    str_prevLine=parts[0]
    i=i+1
    
 Array1.append(ArraySub)
 return Array1
def createC1(dataSet):#产生单个item的集合
    C1=[]
    for transaction in dataSet:
        for item in transaction: 
            if not [item] in C1:
                C1.append([item])
    
    C1.sort()
    
    return map(frozenset,C1)#给C1.list每个元素执行函数
    
    
def scanD(D,ck,minSupport):#dataset,a list of candidate set,最小支持率
    ssCnt={}
    for tid in D:
        for can in ck:
            if can.issubset(tid):
                if not ssCnt.has_key(can):
                    ssCnt[can]=1
                else: ssCnt[can]+=1
    
    numItem=float(len(D))
    retList=[]
    supportData={}
    for key in ssCnt:
        support=ssCnt[key]/numItem
        if support>=minSupport:
            retList.insert(0,key)
            supportData[key]=support
            
    return retList,supportData#返回频繁k项集,相应支持度
        

def aprioriGen(Lk,k):#create ck(k项集)
    retList=[]
    lenLk=len(Lk)
    for i in range(lenLk):
        for j in range(i+1,lenLk):
            L1=list(Lk[i])[:k-2];L2=list(Lk[j])[:k-2]
            L1.sort();L2.sort()#排序
            if L1==L2:#比较i,j前k-1个项若相同,和合并它俩
                retList.append(Lk[i] | Lk[j])#加入新的k项集 | stanf for union
    return retList
    
    
def apriori(dataSet,minSupport=0.5):
    C1=createC1(dataSet)
    D=map(set,dataSet)
    L1,supportData=scanD(D,C1,minSupport)#利用k项集生成频繁k项集(即满足最小支持率的k项集)
    L=[L1]#L保存所有频繁项集
    
    k=2
    while(len(L[k-2])>0):#直到频繁k-1项集为空
        Ck=aprioriGen(L[k-2],k)#利用频繁k-1项集 生成k项集
        Lk,supK= scanD(D,Ck,minSupport)
        supportData.update(supK)#保存新的频繁项集与其支持度
        L.append(Lk)#保存频繁k项集
        k+=1
    return L,supportData#返回所有频繁项集,与其相应的支持率
        
    
def calcConf(freqSet,H,supportData,brl,minConf=0.7):
    prunedH=[]
    for conseq in H:#后件中的每个元素
        conf=supportData[freqSet]/supportData[freqSet-conseq]
        if conf>=minConf:
            print freqSet-conseq,',',conseq,'S1Upport',supportData[freqSet-conseq],'conf11:',conf
            brl.append((freqSet-conseq,conseq,conf))#添加入规则集中
            prunedH.append(conseq)#添加入被修剪过的H中
        #print freqSet-conseq
    return prunedH

def rulesFromConseq(freqSet,H,supportData,brl,minConf=0.7):
     
    m=len(H[0])#H是一系列后件长度相同的规则,所以取H0的长度即可
    if (len(freqSet)>m+1):
        Hmp1=aprioriGen(H,m+1)
        Hmp1=calcConf(freqSet,Hmp1,supportData,brl,minConf)
        if (len(Hmp1)>1):
            rulesFromConseq(freqSet,Hmp1,supportData,brl,minConf)
            
def generateRules(L,supportData,minConf=0.7):
  
    bigRuleList=[]#存储规则
    for i in range(1,len(L)):
        for freqSet in L[i]:
            H1=[frozenset([item]) for item in freqSet]
            if(i>1):
                rulesFromConseq(freqSet,H1,supportData,bigRuleList,minConf)
            else:
                calcConf(freqSet,H1,supportData,bigRuleList,minConf)
    return bigRuleList

dataSet=loadDataSet()

C1=createC1(dataSet)

D=map(set,dataSet)
L1,suppData0=scanD(D,C1,0.5)

L,suppData=apriori(dataSet)

print L
print suppData
rules=generateRules(L,suppData,minConf=0.5)


[[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])], [frozenset([1, 3]), frozenset([2, 5]), frozenset([2, 3]), frozenset([3, 5])], [frozenset([2, 3, 5])], []]
{frozenset([5]): 0.75, frozenset([3]): 0.75, frozenset([2, 3, 5]): 0.5, frozenset([3, 5]): 0.5, frozenset([2, 3]): 0.5, frozenset([2, 5]): 0.75, frozenset([1]): 0.5, frozenset([1, 3]): 0.5, frozenset([2]): 0.75}
frozenset([3]) , frozenset([1]) S1Upport 0.75 conf11: 0.666666666667
frozenset([1]) , frozenset([3]) S1Upport 0.5 conf11: 1.0
frozenset([5]) , frozenset([2]) S1Upport 0.75 conf11: 1.0
frozenset([2]) , frozenset([5]) S1Upport 0.75 conf11: 1.0
frozenset([3]) , frozenset([2]) S1Upport 0.75 conf11: 0.666666666667
frozenset([2]) , frozenset([3]) S1Upport 0.75 conf11: 0.666666666667
frozenset([5]) , frozenset([3]) S1Upport 0.75 conf11: 0.666666666667
frozenset([3]) , frozenset([5]) S1Upport 0.75 conf11: 0.666666666667
frozenset([5]) , frozenset([2, 3]) S1Upport 0.75 conf11: 0.666666666667
frozenset([3]) , frozenset([2, 5]) S1Upport 0.75 conf11: 0.666666666667
frozenset([2]) , frozenset([3, 5]) S1Upport 0.75 conf11: 0.666666666667

import apriori #meant to be stored as a module file


In [1]:
print 88


88

Below is code to load transction into numpy list

I need to get the output into a 2d array, with each person as a row, and the product as columns. The date will be dropped, and the values are summed.

person, product, date, val A, x, 1/1/2013, 10 A, x, 1/10/2013, 10 B, x, 1/2/2013, 20 B, y, 1/4/2013, 15 A, y, 1/8/2013, 20 C, z, 2/12/2013, 40


In [10]:
#coding=utf-8

#with open(infile, "d:\\exercise\alibba search log.txt") as f:
#    for line in f:
#        if "\n" in line:
#            line = line.replace("\n", " ")

import codecs
f = codecs.open("d:\\exercise\\associationrules.txt", encoding='utf-16')
#content = f.readlines()
#print content
result = list()  
Array1=[]
ArraySub=[]
str_prevLine=''
i=0
for line in f.readlines():
#    line = line.strip()
    #print line
    
    
    if not len(line) or line.startswith('#'):       #判断是否是空行或注释行  
        continue                                    #是的话,跳过不处理  
    parts = line.split(',')
    #print parts
    if str_prevLine==parts[0] :
       ArraySub.append(parts[1])
    else:
       Array1.append(ArraySub)
       ArraySub=[]
       ArraySub.append(parts[1])
   
    #Array1.append(ArraySub)
    col1=parts[0]
    str_prevLine=parts[0]
    i=i+1
    
Array1.append(ArraySub)
print Array1
print isinstance(u'中文',unicode)


[[], [u' item_id\r\n'], [u' \u83dc1\r\n', u'sku2meatB\r\n'], [u'Sku1vegB\r\n', u'Sku3vegC\r\n', u'Sku5vegC\r\n'], [u'Sku1\r\n'], [u'sku2\r\n', u'sku2\r\n'], [u'sku1\r\n', u'sku3']]
True

In [16]:
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import pandas as pd
from scipy import sparse
import os
import assoc


#read in data to a dict object - sums scripts by tuple (doc, drug)
dictObj = {}
rawData = 'subset.txt'
with open(rawData) as infile:
for line in infile:
    parts = line.split(',')
    key = (parts[0],parts[1])
    val = float(parts[3])
    if key in dictObj:
        dictObj[key] += val
    else:
        dictObj[key] = val
infile.close()

print "stage 1 done"
#get the number of doctors and the number of drugs
keys =  dictObj.keys()
docs = list(set([x[0] for x in keys]))
drugs = sorted(list(set([x[1] for x in keys])))

#read through the dict and build out a 2d numpy array 
docC = 0
mat = np.empty([len(docs),len(drugs)])
for doc in docs:
drugC = 0
for drug in drugs:
    key = (doc,drug)
    if key in dictObj:
        mat[(docC,drugC)] = dictObj[(key)]
            else:
        mat[(docC,drugC)] = 0
    drugC += 1
docC+=1


  File "<ipython-input-16-d100a57ef107>", line 17
    for line in infile:
      ^
IndentationError: expected an indented block

In [ ]:


In [ ]: