In [2]:
import numpy as np
import pandas as pd
import datetime
# NOTE(review): bokeh.charts was deprecated and removed from bokeh (~0.12.x);
# modern code would use bokeh.plotting — this import fails on current bokeh.
from bokeh.charts import Scatter, Bar, show, output_notebook
output_notebook()
# Load the crash records; the file is comma-separated despite the .txt extension
data = pd.read_csv('../input/3-Airplane_Crashes_Since_1908.txt',sep=',')
# Peek at one random row to sanity-check the parse
data.sample()
# Return a bunch of tuples with the Zodiac and its Start/End Dates
def chinese_zodaics():
    """Build a lookup table of (animal, start, end) zodiac-year spans.

    Approximates each Chinese zodiac year as a fixed 365-day span,
    starting 1908-02-02 and cycling through the 12 animals; spans are
    generated 12 at a time until the cursor passes 2009-07-01.
    Consecutive spans share their boundary date.
    """
    animals = ('Monkey', 'Rooster', 'Dog', 'Pig', 'Rat', 'Ox',
               'Tiger', 'Rabbit', 'Dragon', 'Snake', 'Horse', 'Goat')
    one_year = pd.DateOffset(days=365)
    cursor = pd.to_datetime("2/2/1908")
    cutoff = pd.to_datetime("7/1/2009")
    table = []
    while cursor < cutoff:
        # one full 12-animal cycle per pass; cutoff is only re-checked
        # after the cycle completes, matching the original behavior
        for animal in animals:
            span_end = cursor + one_year
            table.append((animal, cursor, span_end))
            cursor = span_end
    return table
# Build the zodiac lookup table once at module level
zodiacs = chinese_zodaics()
# Apply the zodiacs to the accident dates
def match_zodiac(date, zodiac_table=None):
    """Return the zodiac animal whose span contains `date`.

    Parameters
    ----------
    date : datetime-like
        The date to classify.
    zodiac_table : list of (animal, start, end) tuples, optional
        Table to search. Defaults to the module-level `zodiacs`, so
        existing callers that pass only `date` are unaffected.

    Returns
    -------
    str or None
        The first animal whose inclusive [start, end] range contains the
        date, or None when nothing matches. Consecutive spans share their
        boundary date, so a boundary date matches the earlier animal.
    """
    table = zodiacs if zodiac_table is None else zodiac_table
    for animal, start, end in table:
        if start <= date <= end:
            return animal
    return None  # explicit: no span matched (was implicit None)
# Parse the raw Date strings into proper datetimes
data.Date = pd.to_datetime(data.Date)
# Tag every crash with the zodiac year it happened in
data['Zodiac'] = data.Date.apply(match_zodiac)
data['Year'] = pd.DatetimeIndex(data['Date']).year
# Keep only the columns of interest and drop incomplete rows.
# NOTE(review): this rebinds `data`, discarding all other columns — any cell
# re-run after this point sees the reduced frame.
data = data[['Zodiac', 'Year', 'Fatalities', 'Aboard']].dropna()
# NOTE(review): silently discards crashes with 0 or 1 fatalities — confirm intended
data = data[data.Fatalities > 1]
data.sample(5)
# NOTE(review): this entire cell is an exact duplicate of the previous cell
# (same chinese_zodaics/match_zodiac definitions, shadowing the earlier ones,
# and the same transforms). Re-running it re-filters the already-reduced
# `data` frame; it should almost certainly be deleted.
# Return a bunch of tuples with the Zodiac and its Start/End Dates
def chinese_zodaics():
start_date = pd.to_datetime("2/2/1908")
end_date = pd.to_datetime("7/1/2009")
animals = ['Monkey', 'Rooster', 'Dog', 'Pig', 'Rat', 'Ox', 'Tiger', 'Rabbit', 'Dragon', 'Snake', 'Horse', 'Goat']
zodiacs = []
while start_date < end_date:
for a in animals:
year_start = start_date
year_end = year_start + pd.DateOffset(days=365)
z = (a, start_date, year_end)
zodiacs.append(z)
start_date = year_end
return zodiacs
zodiacs = chinese_zodaics()
# Apply the zodiacs to the accident dates
def match_zodiac(date):
for z in zodiacs:
animal, start, end, = z[0], z[1], z[2]
if start <= date <= end:
return animal
data.Date = pd.to_datetime(data.Date)
data['Zodiac'] = data.Date.apply(match_zodiac)
data['Year'] = pd.DatetimeIndex(data['Date']).year
data = data[['Zodiac', 'Year', 'Fatalities', 'Aboard']].dropna()
data = data[data.Fatalities > 1]
data.sample(5)
In [ ]:
In [13]:
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 04 22:25:57 2013
@author: Administrator
"""
import codecs
# NOTE(review): the notebook export stripped all indentation in this cell, so
# the block structure is ambiguous. Read literally, the function returns the
# hard-coded demo transactions on its first line and everything after that
# `return` is unreachable dead code (the same UTF-16 file-reading logic also
# appears later in this file as a standalone cell). Confirm which variant was
# actually intended before restructuring.
def loadDataSet():
return [[1,3,4],[2,3,5],[1,2,3,5],[2,5]]
# Read "person,item" CSV lines (UTF-16) and group consecutive lines that
# share the same first field into one transaction list.
f = codecs.open("d:\\exercise\\associationrules.txt", encoding='utf-16')
#content = f.readlines()
#print content
result = list()
Array1=[]
ArraySub=[]
str_prevLine=''
i=0
for line in f.readlines():
# line = line.strip()
#print line
if not len(line) or line.startswith('#'): # skip blank lines and comment lines
continue # if so, skip the line
parts = line.split(',')
#print parts
if str_prevLine==parts[0] :
ArraySub.append(parts[1])
else:
# NOTE(review): on the very first group change this appends the initial
# empty ArraySub list — presumably unintended; verify against the data.
Array1.append(ArraySub)
ArraySub=[]
ArraySub.append(parts[1])
#Array1.append(ArraySub)
col1=parts[0]
str_prevLine=parts[0]
i=i+1
# flush the final group
Array1.append(ArraySub)
return Array1
def createC1(dataSet):
    """Build the candidate 1-itemsets (C1) for Apriori.

    Parameters
    ----------
    dataSet : iterable of transactions, each an iterable of comparable items

    Returns
    -------
    list of frozenset
        One singleton frozenset per distinct item, sorted by item value.
        Frozensets are used so itemsets can serve as dict keys in scanD.
        A real list is returned because the original `map(frozenset, C1)`
        is a single-use iterator on Python 3, which breaks the repeated
        iteration scanD performs over the candidates (on Python 2 `map`
        returned a list, so this is behavior-identical there).
    """
    # collect distinct items in a set: O(n) instead of the original
    # O(n^2) `[item] in C1` list-membership scan
    items = set()
    for transaction in dataSet:
        items.update(transaction)
    return [frozenset([item]) for item in sorted(items)]
def scanD(D, ck, minSupport):
    """Count candidate itemsets over the transactions and filter by support.

    Parameters
    ----------
    D : sequence of sets
        The transaction database (each transaction as a set of items).
    ck : iterable of frozenset
        Candidate k-itemsets to evaluate.
    minSupport : float
        Minimum support threshold in [0, 1].

    Returns
    -------
    (retList, supportData)
        retList : list of frozenset — candidates meeting minSupport
        (most recently counted first, matching the original insert(0, ...)).
        supportData : dict frozenset -> float — support of every counted
        candidate (candidates appearing in no transaction get no entry).
    """
    # Materialize once: tolerates one-shot iterables (e.g. a Py3 map object
    # from createC1) being iterated for every transaction below.
    candidates = list(ck)
    ssCnt = {}
    for tid in D:
        for can in candidates:
            if can.issubset(tid):
                # dict.has_key was removed in Python 3; .get is equivalent
                ssCnt[can] = ssCnt.get(can, 0) + 1
    numItems = float(len(D))
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key] / numItems
        if support >= minSupport:
            retList.insert(0, key)
        supportData[key] = support
    return retList, supportData
def aprioriGen(Lk, k):
    """Generate candidate k-itemsets (Ck) from frequent (k-1)-itemsets.

    Two (k-1)-itemsets are merged only when their first k-2 items (after
    sorting) agree — the standard Apriori join step, which produces each
    k-itemset exactly once.
    """
    candidates = []
    for i, first in enumerate(Lk):
        prefix_first = sorted(list(first)[:k - 2])
        for second in Lk[i + 1:]:
            prefix_second = sorted(list(second)[:k - 2])
            if prefix_first == prefix_second:
                # prefixes agree: union yields a k-itemset candidate
                candidates.append(first | second)
    return candidates
def apriori(dataSet, minSupport=0.5):
    """Run the full Apriori algorithm over a transaction dataset.

    Parameters
    ----------
    dataSet : iterable of transactions (each an iterable of items)
    minSupport : float, default 0.5
        Minimum support threshold for an itemset to be kept.

    Returns
    -------
    (L, supportData)
        L : list of lists — L[k-1] holds the frequent k-itemsets
        (the final entry is an empty list, terminating the loop).
        supportData : dict frozenset -> float support for counted itemsets.
    """
    C1 = createC1(dataSet)
    # Materialize the transaction sets: a bare map() is a single-use
    # iterator on Python 3 (breaking the repeated scanD passes below) and
    # len(D) inside scanD requires a sequence. Identical on Python 2.
    D = list(map(set, dataSet))
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]  # L accumulates the frequent k-itemsets for every k
    k = 2
    while len(L[k - 2]) > 0:  # stop once no frequent (k-1)-itemsets remain
        Ck = aprioriGen(L[k - 2], k)        # join step: candidates of size k
        Lk, supK = scanD(D, Ck, minSupport)  # prune step: filter by support
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData
def calcConf(freqSet,H,supportData,brl,minConf=0.7):
# Evaluate candidate rules (freqSet - conseq) -> conseq and keep those whose
# confidence meets minConf. Appends each kept rule to `brl` (mutated in
# place) as (antecedent, consequent, confidence) and returns the surviving
# consequents for further rule growth.
# NOTE(review): Python 2 print statement; the 'S1Upport' / 'conf11:' labels
# look like leftover debug typos.
prunedH=[]
for conseq in H:# each candidate consequent
conf=supportData[freqSet]/supportData[freqSet-conseq]
if conf>=minConf:
print freqSet-conseq,',',conseq,'S1Upport',supportData[freqSet-conseq],'conf11:',conf
brl.append((freqSet-conseq,conseq,conf))# record the rule in the shared rule list
prunedH.append(conseq)# keep this consequent in the pruned H
#print freqSet-conseq
return prunedH
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """Recursively generate rules from freqSet with growing consequents.

    H holds candidate consequents that all share the same length; rules
    that pass the confidence threshold are appended to `brl` by calcConf.
    """
    # every consequent in H has the same size, so inspect the first
    consequent_size = len(H[0])
    if len(freqSet) > consequent_size + 1:
        # grow consequents by one item via the Apriori join step
        grown = aprioriGen(H, consequent_size + 1)
        survivors = calcConf(freqSet, grown, supportData, brl, minConf)
        if len(survivors) > 1:
            # enough survivors remain to merge again — recurse
            rulesFromConseq(freqSet, survivors, supportData, brl, minConf)
def generateRules(L, supportData, minConf=0.7):
    """Generate association rules from the frequent itemsets produced by apriori.

    Returns a list of (antecedent, consequent, confidence) tuples whose
    confidence is at least minConf.
    """
    bigRuleList = []  # accumulates every rule that passes the threshold
    # L[0] holds 1-itemsets, which cannot be split into a rule — skip them
    for level in L[1:]:
        for freqSet in level:
            # start from single-item consequents
            H1 = [frozenset([item]) for item in freqSet]
            if len(freqSet) > 2:
                # larger itemsets: recursively grow the consequents too
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                # 2-itemsets: only single-item consequents are possible
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList
# Demo run on the hard-coded transactions (Python 2 print statements)
dataSet=loadDataSet()
C1=createC1(dataSet)
# NOTE(review): on Python 3 this map() would be a one-shot iterator; fine on
# Python 2 where map returns a list
D=map(set,dataSet)
L1,suppData0=scanD(D,C1,0.5)
L,suppData=apriori(dataSet)
print L
print suppData
rules=generateRules(L,suppData,minConf=0.5)
In [1]:
# NOTE(review): leftover debug cell (Python 2 print) — safe to delete
print 88
Below is code to load transactions into a numpy array.
I need to get the output into a 2D array, with each person as a row and each product as a column. The date will be dropped, and the values summed.
Example input (person, product, date, val):
A, x, 1/1/2013, 10
A, x, 1/10/2013, 10
B, x, 1/2/2013, 20
B, y, 1/4/2013, 15
A, y, 1/8/2013, 20
C, z, 2/12/2013, 40
In [10]:
#coding=utf-8
# NOTE(review): the commented-out path below contains a single backslash
# ("\a" would be a bell character) — it needs escaping if ever re-enabled.
#with open(infile, "d:\\exercise\alibba search log.txt") as f:
# for line in f:
# if "\n" in line:
# line = line.replace("\n", " ")
import codecs
# Read "person,item" CSV lines (UTF-16) and group consecutive lines sharing
# the same first field into one transaction list. This is the same logic that
# appears inside loadDataSet earlier in the file; indentation was stripped by
# the notebook export, so block structure here is ambiguous.
f = codecs.open("d:\\exercise\\associationrules.txt", encoding='utf-16')
#content = f.readlines()
#print content
result = list()
Array1=[]
ArraySub=[]
str_prevLine=''
i=0
for line in f.readlines():
# line = line.strip()
#print line
if not len(line) or line.startswith('#'): # skip blank lines and comment lines
continue # if so, skip the line
parts = line.split(',')
#print parts
if str_prevLine==parts[0] :
ArraySub.append(parts[1])
else:
# NOTE(review): on the first group change this appends the initial empty
# ArraySub list — presumably unintended; verify against the data.
Array1.append(ArraySub)
ArraySub=[]
ArraySub.append(parts[1])
#Array1.append(ArraySub)
col1=parts[0]
str_prevLine=parts[0]
i=i+1
# flush the final group
Array1.append(ArraySub)
print Array1
# NOTE(review): Python 2 only — the `unicode` builtin does not exist in Python 3
print isinstance(u'中文',unicode)
In [16]:
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import pandas as pd
from scipy import sparse
import os
import assoc
# NOTE(review): defaultdict, DictVectorizer, TfidfTransformer, pd, sparse,
# os and assoc are imported but never used in this cell.
#read in data to a dict object - sums scripts by tuple (doc, drug)
dictObj = {}
rawData = 'subset.txt'
with open(rawData) as infile:
for line in infile:
parts = line.split(',')
# key: (doctor, drug); value: running sum of the 4th CSV field
key = (parts[0],parts[1])
val = float(parts[3])
if key in dictObj:
dictObj[key] += val
else:
dictObj[key] = val
# NOTE(review): redundant — the `with` block already closes the file
infile.close()
print "stage 1 done"
#get the number of doctors and the number of drugs
keys = dictObj.keys()
# NOTE(review): `docs` comes from an unsorted set, so row order of the matrix
# is arbitrary between runs; `drugs` (columns) is sorted and stable.
docs = list(set([x[0] for x in keys]))
drugs = sorted(list(set([x[1] for x in keys])))
#read through the dict and build out a 2d numpy array
docC = 0
# np.empty is safe only because every element is assigned in the loop below;
# np.zeros would make the intent clearer and drop the else-branch.
mat = np.empty([len(docs),len(drugs)])
for doc in docs:
drugC = 0
for drug in drugs:
key = (doc,drug)
if key in dictObj:
mat[(docC,drugC)] = dictObj[(key)]
else:
mat[(docC,drugC)] = 0
drugC += 1
docC+=1
In [ ]:
In [ ]: