In [ ]:
import pandas as pd
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import Imputer
%matplotlib inline
In [ ]:
!. ~/.bashrc
In [ ]:
from sklearn.cross_validation import train_test_split
np.random.seed(1337)
import theano
In [ ]:
import keras
In [ ]:
import math
In [ ]:
import time

def check_args(*types):
    def real_decorator(func):
        def wrapper(*args, **kwargs):
            for val, typ in zip(args, types):
                assert isinstance(val, typ), "Value {} is not of expected type {}".format(val, typ)
            return func(*args, **kwargs)
        return wrapper
    return real_decorator

def do_long_computation(name):
    """ dummy function """
    time.sleep(10)
    return "FruitMart"

@check_args(str, int, int)
def print_fruit(name, apples, oranges):
    pass
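A quick sanity check of the decorator above (a minimal sketch; the second call is expected to fail the type check and raise an AssertionError):
In [ ]:
print_fruit('market', 3, 5)            # passes: (str, int, int)
try:
    print_fruit('market', 'three', 5)  # fails: second argument is not an int
except AssertionError as e:
    print e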
In [ ]:
data_dir = './data/'
evt_name = 'Featurespace_events_output.csv'
auth_name = 'Featurespace_auths_output.csv'
In [ ]:
df = pd.read_csv(data_dir+evt_name)
In [ ]:
df
In [ ]:
df_pure = pd.read_csv(data_dir+auth_name,nrows=10000)
In [ ]:
df_pure.TSYS_DCLN_REAS_CD
In [ ]:
df.isnull().sum()
In [ ]:
df.dropna()
The CARD_VFCN_REJ_CD column contains only NaNs, so dropna() with the default how='any' drops every row; how='all' (next cell) keeps them.
In [ ]:
df.dropna(how='all')
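To drop the all-NaN column itself rather than rows, one option (a sketch, not used in the rest of the notebook) is to drop along the column axis:
In [ ]:
df.dropna(axis=1, how='all')   # drops columns that are entirely NaN, e.g. CARD_VFCN_REJ_CD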
In [ ]:
grouped = df.groupby('acct_id')
In [ ]:
grouped.groups
In [ ]:
grouped_lbl = df.groupby('FRD_IND')
In [ ]:
grouped_lbl.count()
In [ ]:
var=grouped_lbl.count().stack()
temp=var.unstack()
type(temp)
x_list = temp['acct_id']
label_list = temp.index
plt.axis("equal")  # the pie chart is oval by default; axis("equal") makes it a circle
# to show the percentage of each slice, pass an output format to the autopct parameter
plt.pie(x_list,labels=label_list,autopct="%1.1f%%")
plt.title("Transactions")
plt.show()
In [ ]:
col_names = list(df.columns.values)
In [ ]:
for c, col in enumerate(col_names):
    var = grouped_lbl.count().stack()
    temp = var.unstack()
    type(temp)
    x_list = temp[col]
    label_list = temp.index
    plt.axis("equal")  # the pie chart is oval by default; axis("equal") makes it a circle
    # to show the percentage of each slice, pass an output format to the autopct parameter
    # plt.subplot(12,4,c+1)
    plt.pie(x_list, labels=label_list, autopct="%1.1f%%")
    plt.title(col)
    plt.show()
    if c == 45:
        break
In [ ]:
def value_exist(val, col, df):
    keys = set(df.groupby(col).groups.keys())
    print keys
    return val in keys

col = 'AUTHZN_APPRL_CD'
value_exist(55555, col, df)
In [ ]:
val = 55555
df_rmna = df.fillna(value={'AUTHZN_APPRL_CD':val})
df_rmna
In [ ]:
df_auth = pd.read_csv(data_dir+auth_name,nrows=500000)
In [ ]:
df_auth
In [ ]:
df_auth[df_auth['MRCH_CITY_NM'].isnull() & (df_auth['MRCH_NM'] == 'FYP')]  # parentheses needed: & binds tighter than ==
In [ ]:
df_auth['MRCH_NM'].isnull()
In [ ]:
df_auth.isnull().sum()
In [255]:
import plotly.tools as tls
import pandas as pd
from sqlalchemy import create_engine # database connection
import datetime as dt
from IPython.display import display
import plotly.plotly as py # interactive graphing
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import Bar, Scatter, Marker, Layout, Figure
In [ ]:
init_notebook_mode()
In [ ]:
data_dir = './data/'
evt_name = 'Featurespace_events_output.csv'
auth_name = 'Featurespace_auths_output.csv'
db_name = 'c1_agg.db'
In [ ]:
disk_engine = create_engine('sqlite:///'+data_dir+db_name,convert_unicode=True)
disk_engine.raw_connection().connection.text_factory = str
In [ ]:
table = 'data_trim'
In [ ]:
df = pd.read_sql_query('select distinct FRD_IND,count(distinct acct_id) as num_usr '
'from {table} '
'group by FRD_IND'.format(table=table), disk_engine)
df
In [ ]:
df['num_usr'][0]
In [ ]:
title = 'Fraud by Distinct Users'
fig = {
'data': [{'labels': ['Fraud', 'Genuine'],
'values': [df['num_usr'][1], df['num_usr'][0]],
'type': 'pie'}],
'layout': {'title': title}
}
iplot(fig,filename='figures/'+title)
In [ ]:
usr_ratio = df['num_usr'][0]/ df['num_usr'][1]
usr_ratio
In [ ]:
usr_ratio= 80
In [ ]:
df_ds_u = pd.read_sql_query('select distinct acct_id, FRD_IND '
'from {table} '
'order by FRD_IND'.format(table=table), disk_engine)
df_ds_u
In [ ]:
df_t_u = pd.read_sql_query('select acct_id, count(*) as num_trans '
'from {table} '
'group by acct_id '
'order by -num_trans'.format(table=table), disk_engine)
df_t_u
In [ ]:
table
In [ ]:
num_trans = df_t_u['num_trans']
total_eg = 0
for t_len in num_trans:
    total_eg += math.ceil(t_len/50.0)
print total_eg*0.66
In [ ]:
math.ceil(152/50.0)
In [ ]:
### bar graphs: % fraud over all transactions
In [ ]:
pad_val = -1
In [ ]:
table= 'data_little'
In [ ]:
from sklearn import preprocessing
def encode_column(df_col):
    le = preprocessing.LabelEncoder()
    le.fit(df_col)
    return le
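A quick illustration of what encode_column returns (a sketch on toy values): LabelEncoder assigns each distinct value an integer code and can invert the mapping.
In [ ]:
le = encode_column(pd.Series(['VISA', 'MC', 'VISA', 'AMEX']))
print le.classes_                        # distinct values, sorted: ['AMEX' 'MC' 'VISA']
print le.transform(['VISA', 'AMEX'])     # integer codes: [2 0]
print le.inverse_transform([0, 1, 2])    # back to the labels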
In [ ]:
df = pd.read_sql_query('select * from {table}'.format(table=table),disk_engine)
df.head()
In [ ]:
encoders = {}
time_cols = ['AUTHZN_RQST_PROC_TM','PREV_ADR_CHNG_DT','PREV_PMT_DT','PREV_CARD_RQST_DT','FRD_IND_SWT_DT']
for c, r in enumerate(df):
    tp = df.dtypes[c]
    # print tp
    if tp == 'object':
        if r not in time_cols:
            encoders[r] = encode_column(df[r])
encoders
In [ ]:
def populate_encoders(table, disk_engine):
    df = pd.read_sql_query('select * from {table}'.format(table=table), disk_engine)
    df.head()
    encoders = {}
    time_cols = ['AUTHZN_RQST_PROC_TM','PREV_ADR_CHNG_DT','PREV_PMT_DT','PREV_CARD_RQST_DT','FRD_IND_SWT_DT']
    for c, r in enumerate(df):
        tp = df.dtypes[c]
        # print tp
        if tp == 'object':
            if r not in time_cols:
                encoders[r] = encode_column(df[r])
    return encoders
In [ ]:
# interleave users from the head and tail of the FRD_IND-ordered frame:
# roughly usr_ratio genuine users (head) for every fraudulent user (tail)
users = set()
cnt = 0
head = 0
tail = len(df_ds_u.acct_id)-1
batch_size = tail
for i in range(batch_size):
    if cnt < usr_ratio:
        users.add(df_ds_u.acct_id[head])
        cnt += 1
        head += 1
    else:
        users.add(df_ds_u.acct_id[tail])
        tail -= 1
        cnt = 0
In [ ]:
def encode_df(df, encoders):
    for col in encoders.keys():
        try:
            df[col] = encoders[col].transform(df[col])
        except:
            print 'EXCEPTION'
            display(df[col])
            print col
            raise
    # time_cols is taken from the surrounding notebook scope
    for col in time_cols:
        df[col] = pd.to_numeric(pd.to_datetime(df[col], errors='coerce'))
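encode_df turns the date columns into int64 nanoseconds since the epoch, so they can be fed to the model as plain numbers; a quick illustration of the conversion:
In [ ]:
pd.to_numeric(pd.Series(pd.to_datetime('2014-05-11')))[0]   # nanoseconds since 1970-01-01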
In [ ]:
def get_user_info(user, table):
    if user == '.':
        user = '"."'
    df_u = pd.read_sql_query('select * from {table} where acct_id = {user}'.format(table=table, user=user), disk_engine)
    return df_u
In [ ]:
def get_last_date(df_u, cuttoff_date):
    # print "Before Trim"
    # display(df_u)
    df_trim = df_u[df_u['FRD_IND_SWT_DT'] >= pd.to_numeric(pd.Series(pd.to_datetime(cuttoff_date)))[0]]
    # print "After Trim"
    # display(df_trim)
    ### a historically later transaction may have been confirmed earlier than a historically preceding one
    df_trim = df_trim.sort_values('AUTHZN_RQST_PROC_TM', ascending=True, inplace=False)
    df_trim = df_trim.reset_index(drop=True)
    # print "After Reorder"
    # display(df_trim)
    if not df_trim.empty:
        # print 'value to be returned', df_trim['AUTHZN_RQST_PROC_TM'][0]
        return df_trim['AUTHZN_RQST_PROC_TM'][0]
    else:
        return None
In [ ]:
query = ['select AUTHZN_RQST_PROC_TM '
'from {table} '
'where FRD_IND_SWT_DT >='
'"',
cutt_off_date,
'" '
'order by AUTHZN_RQST_PROC_TM limit 1 '
]
query = ''.join(query)
query = query.format(table=table)
dataFrame = pd.read_sql_query(query
.format(table=table), disk_engine)
dataFrame
In [ ]:
def get_col_id(col, df):
    col_list = list(df.columns.values)
    col_list.remove('index')
    return col_list.index(col)

def generate_sequence(user, table, encoders, cuttoff_date='2014-05-11'):
    df_u = get_user_info(user, table)
    unav_cols = ['AUTHZN_APPRL_CD','TSYS_DCLN_REAS_CD','AUTHZN_RESPNS_CD','AUTHZN_APPRD_AMT',]
    nan_rpl = ['AUTHZN_APPRL_CD',]
    # shift these columns down by one transaction so each row carries the previous transaction's value;
    # the first row gets a placeholder
    for col in unav_cols:
        df_u[col] = df_u[col].shift(1)
        loc = list(df_u.columns.values).index(col)
        if col in nan_rpl:
            df_u.iloc[0,loc] = 'nan'
        else:
            df_u.iloc[0,loc] = pad_val
    # print df_u.count()
    # display(df_u.head())
    # display(df_u.sort_values('AUTHZN_RQST_PROC_TM',ascending=True))
    encode_df(df_u, encoders)
    # print df_u.count()
    # display(df_u.head())
    # display(df_u.sort_values('AUTHZN_RQST_PROC_TM',ascending=True))
    df_u = df_u.sort_values('AUTHZN_RQST_PROC_TM', ascending=True)
    # display(df_u[df_u['FRD_IND_SWT_DT'].isnull()])
    df_u = df_u.drop('index', axis=1)
    # display(df_u[df_u['FRD_IND_SWT_DT'] < pd.to_numeric(pd.Series(pd.to_datetime(cuttoff_date)))[0]].head(8))
    ### This is the last date before which transactions will be used for training.
    ### It corresponds to the date when the last known fraudulent transaction was confirmed.
    last_date_num = get_last_date(df_u, cuttoff_date)
    if last_date_num == None:
        train = np.array(df_u)
        # print "No cut-offs"
        # print train[:,0:-2].shape
        # print "labels"
        # print train[:,-2].shape
        return train[:,0:-2], [], train[:,-2], []
    else:
        df_train = df_u[df_u['AUTHZN_RQST_PROC_TM'] < last_date_num]
        df_test = df_u[df_u['AUTHZN_RQST_PROC_TM'] >= last_date_num]
        print 'train/test split:', np.array(df_train).shape[0], np.array(df_test).shape[0]
        # display(df_train)
        # display(df_test)
        # print 'is this running at all?!', df_test
        train = np.array(df_train)
        test = np.array(df_test)
        # print train
        # print test
        # print "Shapes"
        # print train.shape
        # print test.shape
        # print "features"
        # print train[:,0:-2].shape
        # print test[:,0:-2].shape
        # print "labels"
        # print train[:,-2].shape
        # print test[:,-2].shape
        return train[:,0:-2], test[:,0:-2], train[:,-2], test[:,-2]
In [ ]:
user = '128237902'
table = 'data_trim'
encoders = encoders
train,test,y,y_tes = generate_sequence(user,table,encoders)
In [ ]:
loc = list(df.columns.values).index('AUTHZN_APPRL_CD')
df.iloc[0,loc] = 'nan'
In [ ]:
df.TSYS_DCLN_REAS_CD.dtype
In [ ]:
In [ ]:
train[:,1] = np.roll(train[:,1],1)
In [ ]:
def set_roll_values(array):
    pass
In [ ]:
display(col_list[35])
In [ ]:
np.array_split(X_train_S[0:2], 5)[0][0].shape
In [ ]:
map(lambda x: len(x),X_train_S[0:10])
In [ ]:
split_seq = map(lambda x: np.array_split(x,math.ceil(len(x)/50.0)) if len(x)>50 else [x],X_train_S[0:10])
In [ ]:
In [ ]:
172%50
In [ ]:
map(lambda x: len(x),split_seq)
In [ ]:
len(map(lambda x: reduce(lambda y,z: np.vstack([y,z]),x),split_seq))
In [ ]:
flattened = [sequence for user_seq in split_seq for sequence in user_seq]
In [ ]:
len(flattened)
In [ ]:
map(lambda x: len(x),flattened)
In [ ]:
chunks_lens = map(lambda x: len(x),flattened)
chunks_lens[5] = 55
In [ ]:
for cnk in chunks_lens:
assert cnk < 50, 'Sequence chunks are exceeding the max_len of {}'.format(seq_len_param)
In [ ]:
x_try = np.array([4])
In [ ]:
### seq_len_param has to be a float so the division below produces a float before math.ceil
seq_len_param = 60.0
def chunck_seq(seq_list, seq_len=seq_len_param):
    split_seq = map(lambda x: np.array_split(x, math.ceil(len(x)/seq_len)) if len(x) > seq_len else [x], seq_list)
    flattened = [sequence for user_seq in split_seq for sequence in user_seq]
    assert sum(map(lambda x: len(x), flattened)) == sum(map(lambda x: len(x), seq_list))
    chunks_lens = map(lambda x: len(x), flattened)
    for cnk in chunks_lens:
        assert cnk <= seq_len_param, 'Sequence chunks are exceeding the max_len of {} \n {}'.format(seq_len_param, chunks_lens)
    return flattened
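A toy check of chunck_seq (a sketch with two fake user sequences, one longer than seq_len_param and one shorter):
In [ ]:
toy = [np.zeros((130, 3)), np.zeros((20, 3))]
print map(lambda x: len(x), chunck_seq(toy))   # -> [44, 43, 43, 20]; no chunk exceeds 60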
In [ ]:
from keras.preprocessing.sequence import pad_sequences
In [ ]:
pad_chunk = keras.preprocessing.sequence.pad_sequences(chunck_seq(X_train_S[0:10]), maxlen=int(seq_len_param),dtype='float32')
In [ ]:
pad_chunk.shape
In [ ]:
X_train_pad = keras.preprocessing.sequence.pad_sequences(X_train_S, maxlen=None,dtype='float32')
In [ ]:
X_train_pad.shape
In [ ]:
X_train_S[0][-1]
In [ ]:
X_train_pad[0][-1]
In [ ]:
X_train_S = []
X_test_S = []
y_train_S = []
y_test_S = []
print "Number of users:", len(users)
for user in users:
    # if user != '337018623':
    #     continue
    X_train, X_test, y_train, y_test = generate_sequence(user, 'data_trim', encoders)
    X_train_S.append(X_train)
    X_test_S.append(X_test)
    y_train_S.append(y_train)
    y_test_S.append(y_test)
In [ ]:
def generate_sample_w(y_true, class_weight):
    shps = y_true.shape
    sample_w = []
    for i in range(shps[0]):
        sample_w.append([])
        for j in range(shps[1]):
            sample_w[i].append(class_weight[y_true[i,j,0]])
    return np.asarray(sample_w)

def sequence_generator(users, encoders, mode='train', table='data_trim', class_weight=None):
    X_train_S = []
    X_test_S = []
    y_train_S = []
    y_test_S = []
    print "Number of users:", len(users)
    for user in users:
        # if user != '337018623':
        #     continue
        X_train, X_test, y_train, y_test = generate_sequence(user, table, encoders)
        X_train_S.append(X_train)
        X_test_S.append(X_test)
        y_train_S.append(y_train)
        y_test_S.append(y_test)
        # break
    # users with no transactions after the cut-off return [] for the test split
    X_test_S = filter(lambda a: a != [], X_test_S)
    y_test_S = filter(lambda a: a != [], y_test_S)
    if mode == 'train':
        # chuncked = chunck_seq(X_train_S)
        # assert
        X_train_pad = keras.preprocessing.sequence.pad_sequences(chunck_seq(X_train_S), maxlen=int(seq_len_param), dtype='float32', value=pad_val)
        y_train_S = keras.preprocessing.sequence.pad_sequences(np.array(chunck_seq(y_train_S)), maxlen=int(seq_len_param), dtype='float32', value=lbl_pad_val)
        y_train_S = np.expand_dims(y_train_S, -1)
        if class_weight != None:
            sample_w = generate_sample_w(y_train_S, class_weight)
            return X_train_pad, y_train_S, sample_w
        # print y_train_S
        # print y_train_S.shape
        # y_train_S = to_categorical(y_train_S,3)
        return X_train_pad, y_train_S
    else:
        print 'len test', len(X_test_S)
        X_test_S_pad = keras.preprocessing.sequence.pad_sequences(chunck_seq(X_test_S), maxlen=int(seq_len_param), dtype='float32', value=pad_val)
        y_test_S = keras.preprocessing.sequence.pad_sequences(np.array(chunck_seq(y_test_S)), maxlen=int(seq_len_param), dtype='float32', value=lbl_pad_val)
        y_test_S = np.expand_dims(y_test_S, -1)
        if class_weight != None:
            sample_w = generate_sample_w(y_test_S, class_weight)
            return X_test_S_pad, y_test_S, sample_w
        return X_test_S_pad, y_test_S
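A toy check of generate_sample_w (a sketch with made-up labels): every timestep's weight is looked up from class_weight by its label, so padded timesteps labelled 2 get weight 0.
In [ ]:
toy_y = np.array([[[0], [1], [2]]])                        # one sequence, three timesteps
print generate_sample_w(toy_y, {0: 1., 1: 10., 2: 0.})     # -> [[  1.  10.   0.]]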
In [ ]:
a = [1,[],[],1,2,3,21,1]
filter(lambda x:x !=[],a)
In [ ]:
seq_len = 50
math.ceil(1.0*seq_len/seq_len_param)
In [ ]:
table = 'data_little'
dataFrame_count = pd.read_sql_query('select acct_id, count(*) as num_trans '
'from {table} '
'group by acct_id '
'order by -num_trans'
.format(table=table), disk_engine)
In [ ]:
def get_count_table(table, disk_engine, cutt_off_date, trans_mode):
    query = ['select acct_id,count(*) '
             'as num_trans from {table} '
             'where AUTHZN_RQST_PROC_TM <= '
             '(select AUTHZN_RQST_PROC_TM '
             'from {table} '
             'where FRD_IND_SWT_DT >='
             '"',
             cutt_off_date,
             '" '
             'order by AUTHZN_RQST_PROC_TM limit 1) '
             'group by acct_id order by num_trans']
    query = ''.join(query)
    query = query.format(table=table)
    print trans_mode
    if trans_mode == 'test':
        print 'replaced'
        query = query.replace('<=', '>')
    dataFrame = pd.read_sql_query(query.format(table=table), disk_engine)
    display(dataFrame)
    return dataFrame
In [ ]:
def trans_num_table(table, disk_engine, mode='train', cutt_off_date='2014-05-11', trans_mode='train'):
    # dataFrame_acc = pd.read_sql_query('select distinct acct_id, FRD_IND '
    #                                   'from {table} '
    #                                   'order by FRD_IND'.format(table=table), disk_engine)
    # dataFrame = pd.read_sql_query('select acct_id, count(*) as num_trans '
    #                               'from {table} '
    #                               'group by acct_id '
    #                               'order by num_trans'
    #                               .format(table=table), disk_engine)
    dataFrame = get_count_table(table, disk_engine, cutt_off_date, trans_mode)
    u_list = set(dataFrame.acct_id)
    user_tr, user_ts = train_test_split(list(u_list), test_size=0.33, random_state=42)
    total_t = 0
    if mode == 'train':
        users = user_tr
    else:
        users = user_ts
    total_t = total_trans_batch(users, dataFrame)
    return math.ceil(total_t)
In [ ]:
trans_num_table('data_trim',disk_engine,mode='train',cutt_off_date='2014-05-11',trans_mode='train')
In [ ]:
cutt_off_date='2014-05-11'
query = ['select acct_id,count(*) '
'as num_trans from {table} '
'where AUTHZN_RQST_PROC_TM < '
'(select AUTHZN_RQST_PROC_TM '
'from {table} '
'where FRD_IND_SWT_DT >='
'"',
cutt_off_date,
'" '
'order by AUTHZN_RQST_PROC_TM limit 1) '
'group by acct_id order by num_trans']
In [ ]:
query = ''.join(query)
In [ ]:
query.replace('<','>=')
In [ ]:
query.format(table='data_trim')
In [ ]:
trans_num_table('data_little',disk_engine)
In [ ]:
table = 'data_little'
dataFrame_count = pd.read_sql_query('select acct_id, count(*) as num_trans '
'from {table} '
'group by acct_id '
'order by -num_trans'
.format(table=table), disk_engine)
In [282]:
def total_trans_batch(users,dataFrame_count):
num_trans = 0
users = set(users)
for user in users:
num_trans+=get_num_trans(user,dataFrame_count)
return num_trans
def get_num_trans(user,dfc):
try:
df = dfc[dfc['acct_id']==user]
if df.empty:
            print "user not present in the count table:", user
seq_len = 0
else:
seq_len = dfc[dfc['acct_id']==user].values[0][1]
except:
display(dfc.head(5))
print dfc[dfc['acct_id']==user]
raise
return math.ceil(1.0*seq_len/seq_len_param)
def add_user(index,u_list,dataFrame_count,users):
cnt_trans = 0
user = u_list[index]
if user not in users:
users.add(user)
return get_num_trans(user,dataFrame_count)
else:
return 0
def user_generator(disk_engine,table='data_trim',batch_size=50,usr_ratio=80,
mode='train',cutt_off_date='2014-05-11',trans_mode='train',sub_sample=None):
dataFrame_count = get_count_table(table,disk_engine,cutt_off_date,trans_mode)
# display(dataFrame_count.head(5))
print "User List acquired"
u_list = list(dataFrame_count.acct_id)
# u_list.extend(list(dataFrame_Y.acct_id))
print 'total # users:',len(u_list)
u_set = set(u_list)
print 'total # unique users:',len(u_set)
user_tr,user_ts = train_test_split(list(u_set), test_size=0.33, random_state=42)
print 'total # sequences:',total_trans_batch(list(u_set),dataFrame_count)
if mode == 'train':
u_list = user_tr
else:
u_list = user_ts
if trans_mode == 'test':
print 'used # sequences: value is inaccurate, please implement'
print 'used # sequences:',total_trans_batch(u_list,dataFrame_count)
# display(dataFrame.acct_id)
u_list = list(set(u_list))
print 'return set cardinality:',len(u_list)
cnt = 0
head = 0
tail = len(u_list)-1
u_list_all = u_list
while True:
users = set()
cnt_trans = 0
if sub_sample != None:
assert sub_sample<len(u_list_all), 'sub_sample size select is {sub_sample}, but there are only {us} users'.format(sub_sample=sub_sample,us=len(u_list_all))
u_list = np.random.choice(u_list_all, sub_sample,replace=False)
            ### reset tail so that indexing into the smaller sub-sample does not go out of bounds
tail = len(u_list)-1
while cnt_trans<batch_size:
if cnt<usr_ratio:
cnt_trans+=add_user(head,u_list,dataFrame_count,users)
cnt+=1
head+=1
else:
cnt_trans+=add_user(tail,u_list,dataFrame_count,users)
tail-=1
cnt=0
# print 'head',head
# print 'tail',tail
# print 'cnt_trans',cnt_trans
if head == tail+1:
head = 0
tail = len(u_list)-1
cnt_trans = 0
cnt = 0
                # if we have gone through all the users, reset the counters so the epoch is not overfilled;
                # the same logic could have been achieved with a break and without the yield line below
print "##########ALL COVERED##########"
yield users
users = set()
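user_generator fills each batch by taking users alternately from the head and the tail of the ordered user list at roughly usr_ratio-to-1; a stripped-down sketch of that interleaving pattern (interleave is a hypothetical helper written only for illustration):
In [ ]:
def interleave(items, ratio):
    head, tail, cnt, out = 0, len(items)-1, 0, []
    while head <= tail:
        if cnt < ratio:
            out.append(items[head]); head += 1; cnt += 1
        else:
            out.append(items[tail]); tail -= 1; cnt = 0
    return out
print interleave(range(10), 3)   # three items from the head, then one from the tail, and so on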
In [276]:
pd.set_option("display.max_rows",60)
In [ ]:
user_gen = user_generator(disk_engine,table='data_trim')
In [ ]:
seq_len_param
In [ ]:
user_gen = user_generator(disk_engine,table='data_trim')
for i in range(4):
total_trans = 0
for i in range(int(math.floor(484/seq_len_param))-1):
t_num =total_trans_batch(next(user_gen),dataFrame_count)
print t_num
total_trans += t_num
print "###########TOTAL",total_trans
In [283]:
epoch_smpls = []
sample_num = 10
seq_len_param = 60.0
table = 'data_trim'
dataFrame_count = get_count_table(table,disk_engine,cutt_off_date,trans_mode)
# dataFrame_count = pd.read_sql_query('select acct_id, count(*) as num_trans '
# 'from {table} '
# 'group by acct_id '
# 'order by -num_trans'
# .format(table=table), disk_engine)
user_gen = user_generator(disk_engine,table=table,sub_sample=50)
for i in range(5):
total_trans = 0
while total_trans < sample_num:
t_num =total_trans_batch(next(user_gen),dataFrame_count)
print t_num
total_trans += t_num
epoch_smpls.append(total_trans)
In [284]:
epoch_smpls
Out[284]:
In [ ]:
np.array(dataFrame_count.sum())[1]
In [ ]:
dataFrame_Y = pd.read_sql_query('select acct_id, FRD_IND, count(*) as num_trans '
'from {table} '
'where FRD_IND="Y"'
'group by acct_id '
'order by -num_trans'
.format(table=table), disk_engine)
display(dataFrame_Y)
In [ ]:
dataFrame_count = pd.read_sql_query('select acct_id, count(*) as num_trans '
'from {table} '
'group by acct_id '
'order by -num_trans'
.format(table=table), disk_engine)
dataFrame_count[dataFrame_count['acct_id']=='70557011'].values[0][1]
In [350]:
def data_generator(user_mode,trans_mode,disk_engine,encoders,table,
batch_size=400,usr_ratio=80,class_weight=None,lbl_pad_val = 2,
pad_val = -1,cutt_off_date='2014-05-11',sub_sample=None,epoch_size=None):
user_gen = user_generator(disk_engine,usr_ratio=usr_ratio,batch_size=batch_size,table=table,mode=user_mode,trans_mode=trans_mode,sub_sample=sub_sample)
print "Users generator"
last_date = get_last_date(cutt_off_date,table,disk_engine)
print 'last_date calculated!'
x_acc = []
y_acc = []
sample_w = []
total_eg = 0
while True:
users = next(user_gen)
outs = sequence_generator(users,encoders,disk_engine,lbl_pad_val,pad_val,last_date,mode=trans_mode,table=table,class_weight=class_weight)
if not(epoch_size == None):
while True:
num_seq = outs[0].shape[0]
print 'num_Seq',num_seq
remain = epoch_size - (total_eg + num_seq)
print '{remain} = {epoch_size} - ({total_eg}+{num_seq})'.format(remain=remain,epoch_size=epoch_size,total_eg=total_eg,num_seq=num_seq)
print 'remain',remain
if remain >=0:
total_eg +=num_seq
yield outs
else:
                    ### remain < 0, so only the first num_seq + remain sequences still fit into this epoch
cutline = num_seq + remain
temp = []
for i in range(len(outs)):
temp.append(outs[i][0:cutline])
yield tuple(temp)
####end of epoch!
total_eg = 0
temp = []
for i in range(len(outs)):
                        temp.append(outs[i][cutline:])
outs = tuple(temp)
if remain >=0:
break
else:
yield outs
In [351]:
def inside(t):
    yield t+1
    yield t+2

def outside(t):
    inner = inside(t)
    while True:
        yield next(inner)
        t *= 10

gen_test = outside(5)
print next(gen_test)
print next(gen_test)
In [355]:
user_mode = 'train'
trans_mode = 'train'
data_gen = data_generator(user_mode,trans_mode,disk_engine,encoders,table='data_little',class_weight=class_weight,batch_size=400,sub_sample=80,epoch_size=30)
for i in range(15):
    X, y, s = next(data_gen)
    print 'X', X.shape
    print 'y', y.shape
    print 's', s.shape
In [ ]:
# %debug
X_train_pad,y_train_S,sample_w = next(data_gen)
print X_train_pad.shape
print y_train_S.shape
print sample_w.shape
In [ ]:
# %debug
X_train_pad,y_train_S,sample_w = next(data_gen)
print X_train_pad.shape
print y_train_S.shape
print sample_w.shape
In [ ]:
print X_train_pad[-1]
print y_train_S[-1]
In [ ]:
from keras.models import Model
from keras.layers import Input, Dense, GRU, LSTM, TimeDistributed, Masking
from keras.utils.np_utils import to_categorical
In [ ]:
hidden_dim = 200
num_layers = 1
optimizer=keras.optimizers.RMSprop(lr=0.01, rho=0.9, epsilon=1e-08)
table = 'data_little'
samples_per_epoch = math.ceil(trans_num_table(table,disk_engine)*0.67)
nb_epoch = 100
lbl_pad_val = 2
pad_val = -1
class_weight = {0 : 1.,
1: 10.,
2: 0.}
In [ ]:
encoders = populate_encoders(table,disk_engine)
In [ ]:
encoders
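The class_weight dictionary above assumes the alphabetical LabelEncoder mapping for FRD_IND ('N' -> 0, 'Y' -> 1), with 2 reserved for padded timesteps (hence weight 0); a quick check:
In [ ]:
print encoders['FRD_IND'].classes_   # expected ['N' 'Y']: index 0 = genuine, index 1 = fraud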
In [ ]:
input_layer = Input(shape=(seq_len_param, 44),name='main_input')
mask = Masking(mask_value=pad_val)(input_layer)
prev = GRU(hidden_dim,#input_length=50,
return_sequences=True,go_backwards=False,stateful=False,
unroll=False,consume_less='gpu',
init='glorot_uniform', inner_init='orthogonal', activation='tanh',
inner_activation='hard_sigmoid', W_regularizer=None, U_regularizer=None,
b_regularizer=None, dropout_W=0.0, dropout_U=0.0)(mask)
# for i in range(num_layers-1):
# prev = GRU(output_dim, init='glorot_uniform', inner_init='orthogonal', activation='tanh',
# inner_activation='hard_sigmoid', W_regularizer=None, U_regularizer=None,
# b_regularizer=None, dropout_W=0.0, dropout_U=0.0)
output_layer = TimeDistributed(Dense(3,activation='softmax'))(prev)
model = Model(input=[input_layer],output=[output_layer])
model.compile(optimizer=optimizer,
loss='sparse_categorical_crossentropy',
# metrics=['accuracy','hinge','squared_hinge','binary_accuracy','binary_crossentropy'])
metrics=['accuracy'],
sample_weight_mode=None)
data_gen = data_generator(disk_engine,encoders,table=table)
history = model.fit_generator(data_gen, samples_per_epoch, nb_epoch, verbose=1, callbacks=[],validation_data=None, nb_val_samples=None, max_q_size=10000)
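The model above is compiled with sample_weight_mode=None, so the per-timestep weights from generate_sample_w cannot be applied; if they are to be used, Keras needs the temporal mode at compile time. A sketch of that variant (same loss and metrics):
In [ ]:
model.compile(optimizer=optimizer,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'],
              sample_weight_mode='temporal')   # expects a 2D (nb_samples, timesteps) sample_weight array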
In [ ]:
history_2 = model.fit_generator(data_gen, samples_per_epoch, nb_epoch, verbose=1, callbacks=[],validation_data=None, nb_val_samples=None, max_q_size=10000)
In [ ]:
for i in range(X_train_pad.shape[0]):
    if 1 in (list(y_train_S[i])):
        print i
In [ ]:
prediction = model.predict(X_train_pad)
In [ ]:
prediction.shape
In [ ]:
prediction.shape
In [ ]:
y_train_S
In [ ]:
join_= np.dstack([prediction,y_train_S])
df_pred = pd.DataFrame(join_[391])
df_pred
In [ ]:
index = 23
d = {'target' : pd.Series(np.reshape(y_train_S[index],len(y_train_S[index]))),
'pred' : pd.Series(np.reshape(prediction[index][0],len(prediction[index][0]))),
'pred_2' : pd.Series(np.reshape(prediction[index][1],len(prediction[index][1]))),}
df_pred = pd.DataFrame(d)
df_pred
In [ ]:
input_layer = Input(shape=(50, 44),name='main_input')
mask = Masking(mask_value=0)(input_layer)
prev = GRU(hidden_dim,#input_length=50,
return_sequences=True,go_backwards=False,stateful=False,
unroll=False,consume_less='gpu',
init='glorot_uniform', inner_init='orthogonal', activation='tanh',
inner_activation='hard_sigmoid', W_regularizer=None, U_regularizer=None,
b_regularizer=None, dropout_W=0.0, dropout_U=0.0)(mask)
for i in range(num_layers-1):
    prev = GRU(hidden_dim,  # input_length=50,
               return_sequences=True, go_backwards=False, stateful=False,
               unroll=False, consume_less='gpu',
               init='glorot_uniform', inner_init='orthogonal', activation='tanh',
               inner_activation='hard_sigmoid', W_regularizer=None, U_regularizer=None,
               b_regularizer=None, dropout_W=0.0, dropout_U=0.0)(prev)
output_layer = TimeDistributed(Dense(1))(prev)
model = Model(input=[input_layer],output=[output_layer])
model.compile(optimizer=optimizer,
loss='binary_crossentropy',
metrics=['accuracy'])
data_gen = data_generator(disk_engine,encoders,table=table)
history = model.fit_generator(data_gen, samples_per_epoch, nb_epoch, verbose=1, callbacks=[],validation_data=None, nb_val_samples=None, class_weight=None, max_q_size=10)
In [ ]:
history = model.fit_generator(data_gen, samples_per_epoch, nb_epoch, verbose=1, callbacks=[],validation_data=None, nb_val_samples=None, class_weight=None, max_q_size=10)
In [ ]:
history.__dict__
In [ ]:
In [ ]:
print train_test_split(range(10), test_size=0.33, random_state=42)
In [ ]:
print len(X_train_S)
print len(X_test_S)
print len(y_train_S)
print len(y_test_S)
In [ ]:
y_test_S
In [ ]:
'.' in users
In [ ]:
users
In [ ]:
df = pd.read_sql_query('select * '
'from {table} '.format(table=table), disk_engine)
df.head()
In [ ]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(df.MRCH_NM)
In [ ]:
len(le.classes_)
In [ ]:
encoders['acct_id'].transform(['.'])
In [ ]:
df_u = get_user_info("'.'",'data')
In [ ]:
df_u= df
In [ ]:
df_u[df_u['FRD_IND']=='Y'].head()
In [ ]:
pd.Timestamp('20120101')
In [ ]:
df_u[df_u['FRD_IND_SWT_DT']>pd.Timestamp('20120101')]
In [ ]:
df_u.head()
In [ ]:
df_test = df_u['AUTHZN_AMT']>10
df_test.value_counts()
In [ ]:
df_u[df_u['FRD_IND']=='Y']
In [ ]:
df_u['FRD_IND'].value_counts()
In [ ]:
df_u[df_u['acct_id']=='337018623']['CAVV_CD'].head()
In [ ]:
pd.Timestamp('2013-05-11')
In [ ]:
le = preprocessing.LabelEncoder()
le.fit(df_u[df_u['acct_id']=='337018623']['CAVV_CD'])
In [ ]:
le.classes_
In [ ]:
le.transform([None])
In [ ]:
pd.to_numeric(pd.Series(pd.to_datetime('2013-05-11')))[0]
In [ ]:
pd.to_numeric(pd.Series(pd.to_datetime('2014-05-11')))[0]
In [ ]:
x = np.arange(10.0)
np.array_split(x, 30)
In [ ]:
import io
title = 'dsdssd'
with io.open('./data/gs_results.csv', 'a', encoding='utf-8') as output:
    title_csv = title.replace('_', ',') + ',' + str(history.history['acc'][-1]) + ',' + str(history.history['loss'][-1])
    print title_csv
    output.write(unicode(title_csv))
In [ ]:
title = 'Trainin_Loss'
title.replace('Loss','acc')
In [ ]:
print s
In [263]:
table = 'data_little'
encoders = populate_encoders(table,disk_engine)
In [ ]:
def eval_generator(user_mode, trans_mode, disk_engine, encoders, table='data_trim',
                   batch_size=400, usr_ratio=80, class_weight=None, lbl_pad_val=2, pad_val=-1):
    user_gen = user_generator(disk_engine, usr_ratio=usr_ratio, batch_size=batch_size, table=table, mode=user_mode)
    print "Users generator"
    while True:
        users = next(user_gen)
        yield sequence_generator(users, encoders, disk_engine, lbl_pad_val, pad_val, mode=trans_mode, table=table, class_weight=class_weight)
In [ ]:
def eval_trans_generator(disk_engine, encoders, table='data_trim', batch_size=400, usr_ratio=80, class_weight=None, lbl_pad_val=2, pad_val=-1):
    user_gen = user_generator(disk_engine, usr_ratio=usr_ratio, batch_size=batch_size, table=table)
    print "Users generator"
    while True:
        users = next(user_gen)
        yield sequence_generator(users, encoders, disk_engine, lbl_pad_val, pad_val, mode='test', table=table, class_weight=class_weight)
In [ ]:
def eval_users_generator(disk_engine, encoders, table='data_trim', batch_size=400, usr_ratio=80, class_weight=None, lbl_pad_val=2, pad_val=-1):
    user_gen = user_generator(disk_engine, usr_ratio=usr_ratio, batch_size=batch_size, table=table, mode='test')
    print "Users generator"
    while True:
        users = next(user_gen)
        yield sequence_generator(users, encoders, disk_engine, lbl_pad_val, pad_val, mode='train', table=table, class_weight=class_weight)
In [ ]:
def eval_usertrans_generator(disk_engine,encoders,table='data_trim',batch_size=400,usr_ratio=80,class_weight=None,lbl_pad_val = 2, pad_val = -1):
user_gen = user_generator(disk_engine,usr_ratio=usr_ratio,batch_size=batch_size,table=table,mode='test')
print "Users generator"
In [ ]:
test_gen = eval_trans_generator(disk_engine,encoders,table=table,batch_size=400,usr_ratio=80,class_weight=None)
X_test_pad,y_test_S = next(test_gen)
print X_test_pad.shape
print y_test_S.shape
In [ ]:
test_gen = eval_users_generator(disk_engine,encoders,table=table,batch_size=400,usr_ratio=80,class_weight=None)
X_test_pad,y_test_S = next(test_gen)
print X_test_pad.shape
print y_test_S.shape
In [ ]:
user_mode = 'test'
trans_mode = 'train'
table = 'data_more'
test_gen = eval_generator(user_mode,trans_mode,disk_engine,encoders,table=table,batch_size=400,usr_ratio=80,class_weight=None)
X_test_pad,y_test_S = next(test_gen)
print X_test_pad.shape
print y_test_S.shape
In [ ]:
rnn = 'lstm'
hidden_dim = 300
num_layers = 3
lr= 1e-3
nb_epoch = 13
optimizer = keras.optimizers.RMSprop(lr=lr, rho=0.9, epsilon=1e-08)
title = 'Training_Loss'+'_'+rnn.upper()+'_'+str(hidden_dim)+'_'+str(num_layers)+'_'+str(type(optimizer).__name__)+'_'+str(lr)
print title
input_layer = Input(shape=(int(seq_len_param), 44),name='main_input')
mask = Masking(mask_value=0)(input_layer)
if rnn == 'gru':
    prev = GRU(hidden_dim,  # input_length=50,
               return_sequences=True, go_backwards=False, stateful=False,
               unroll=False, consume_less='gpu',
               init='glorot_uniform', inner_init='orthogonal', activation='tanh',
               inner_activation='hard_sigmoid', W_regularizer=None, U_regularizer=None,
               b_regularizer=None, dropout_W=0.0, dropout_U=0.0)(mask)
else:
    prev = LSTM(hidden_dim, return_sequences=True, go_backwards=False, stateful=False,
                init='glorot_uniform', inner_init='orthogonal',
                forget_bias_init='one', activation='tanh', inner_activation='hard_sigmoid',
                W_regularizer=None, U_regularizer=None, b_regularizer=None, dropout_W=0.0, dropout_U=0.0)(mask)
for i in range(num_layers-1):
    if rnn == 'gru':
        prev = GRU(hidden_dim,
                   return_sequences=True, go_backwards=False, stateful=False,
                   unroll=False, consume_less='gpu',
                   init='glorot_uniform', inner_init='orthogonal', activation='tanh',
                   inner_activation='hard_sigmoid', W_regularizer=None, U_regularizer=None,
                   b_regularizer=None, dropout_W=0.0, dropout_U=0.0)(prev)
    else:
        prev = LSTM(hidden_dim, return_sequences=True, go_backwards=False, stateful=False,
                    init='glorot_uniform', inner_init='orthogonal',
                    forget_bias_init='one', activation='tanh', inner_activation='hard_sigmoid',
                    W_regularizer=None, U_regularizer=None, b_regularizer=None, dropout_W=0.0, dropout_U=0.0)(prev)
output_layer = TimeDistributed(Dense(3,activation='softmax'))(prev)
model = Model(input=[input_layer],output=[output_layer])
model.compile(optimizer=optimizer,
loss='sparse_categorical_crossentropy',
metrics=['accuracy'])
data_gen = data_generator(disk_engine,encoders,table=table)
history = model.fit_generator(data_gen, samples_per_epoch, nb_epoch, verbose=1, callbacks=[],validation_data=None, nb_val_samples=None, class_weight=None, max_q_size=10000)
In [ ]:
history = model.fit_generator(data_gen, samples_per_epoch, 3, verbose=1, callbacks=[],validation_data=None, nb_val_samples=None, class_weight=None, max_q_size=10000)
In [ ]:
%load_ext autoreload
%autoreload 2
from ccfd_dnn.proto import compile_seq2seq_RNN
from ccfd_dnn.model import *
In [ ]:
nb_epoch = 1
table = 'data_little'
samples_per_epoch = 1959
model = compile_seq2seq_RNN(rnn = 'gru', hidden_dim = 300, num_layers = 3, lbl_pad_val = 2, pad_val = -1, optimizer = keras.optimizers.RMSprop(lr=1e-3, rho=0.9, epsilon=1e-08))
data_gen = data_generator(disk_engine,encoders,table=table)
history = model.fit_generator(data_gen, samples_per_epoch, nb_epoch, verbose=1, callbacks=[],validation_data=None, nb_val_samples=None, class_weight=None, max_q_size=10000)
In [ ]:
pad_val
In [ ]:
val_samples = 400
eval_gen = eval_users_generator(disk_engine,encoders,table=table,
batch_size=400,usr_ratio=80,class_weight=None)
model.evaluate_generator(eval_gen, val_samples, max_q_size=10000)
In [ ]:
val_samples = 180
# model.predict_generator(eval_gen, val_samples, max_q_size=10000)
samples = 0
eval_gen = eval_users_generator(disk_engine,encoders,table=table,
batch_size=400,usr_ratio=80,class_weight=None)
for batch in eval_gen:
In [ ]:
val_samples = 1
outs = model.predict_generator(eval_gen, val_samples, max_q_size=10000)
In [ ]:
eval_gen = eval_users_generator(disk_engine,encoders,table=table,
batch_size=400,usr_ratio=80,class_weight=None)
x,y = next(eval_gen)
y_hat = model.predict(x)
In [ ]:
data_gen = data_generator(disk_engine,encoders,table=table,
batch_size=400,usr_ratio=80,class_weight=None)
x,y = next(eval_gen)
y_hat = model.predict(x)
In [ ]:
print y.shape
print y_hat.shape
print y_r.shape
print y_hat_r.shape
In [ ]:
from sklearn.metrics import roc_curve, auc
y_r = y.ravel()
y_hat_r = y_hat[:,:,1].ravel()
pad_ids = np.where(y_r!=2)
fpr,tpr,_ = roc_curve(y_r[pad_ids], y_hat_r[pad_ids])
trace = Scatter(x=fpr,y=tpr)
data = [trace]
title = 'ROC'
layout = Layout(title=title, width=800, height=640)
fig = Figure(data=data, layout=layout)
iplot(fig)
auc_val = auc(fpr, tpr)
auc_val
In [ ]:
x.shape
In [ ]:
def eval_auc(model, mode, num_sample):
    if mode == 'train':
        gen = data_gen = data_generator(disk_engine, encoders, table=table,
                                        batch_size=400, usr_ratio=80, class_weight=None)
In [ ]:
data_gen = data_generator(disk_engine,encoders,table=table,
batch_size=400,usr_ratio=80,class_weight=None)
model.eval_auc_generator(data_gen, 484, max_q_size=10000)
In [ ]:
from keras.engine import Model
In [ ]:
from keras.engine.training import *
In [ ]:
from ccfd_dnn.model import eval_auc_generator
In [ ]:
eval_gen = eval_users_generator(disk_engine,encoders,table=table,
batch_size=400,usr_ratio=80,class_weight=None)
aucs = eval_auc_generator(model, eval_gen, 978, max_q_size=10000,plt_filename=None)
In [ ]:
aucs
In [ ]:
eval_gen = eval_users_generator(disk_engine,encoders,table=table,
batch_size=400,usr_ratio=80,class_weight=None)
all_outs = eval_auc_generator(model, eval_gen, val_samples, max_q_size=10000,plt_filename=None)
print all_outs
In [ ]:
encoders['FRD_IND'].classes_
In [ ]:
from keras.engine.training import *
from sklearn.metrics import *
def eval_auc_generator(model, generator, val_samples, max_q_size=10000,plt_filename=None,acc=True):
'''Generates predictions for the input samples from a data generator.
The generator should return the same kind of data as accepted by
`predict_on_batch`.
# Arguments
generator: generator yielding batches of input samples.
val_samples: total number of samples to generate from `generator`
before returning.
max_q_size: maximum size for the generator queue
# Returns
Numpy array(s) of predictions.
'''
processed_samples = 0
wait_time = 0.01
all_outs = []
all_y_r = []
all_y_hat = []
data_gen_queue, _stop = generator_queue(generator, max_q_size=max_q_size)
while processed_samples < val_samples:
generator_output = None
while not _stop.is_set():
if not data_gen_queue.empty():
generator_output = data_gen_queue.get()
break
else:
time.sleep(wait_time)
if isinstance(generator_output, tuple):
if len(generator_output) == 2:
x, y = generator_output
sample_weight = None
elif len(generator_output) == 3:
x, y, sample_weight = generator_output
else:
_stop.set()
raise Exception('output of generator should be a tuple '
'(x, y, sample_weight) '
'or (x, y). Found: ' + str(generator_output))
else:
_stop.set()
raise Exception('output of generator should be a tuple '
'(x, y, sample_weight) '
'or (x, y). Found: ' + str(generator_output))
try:
y_hat = model.predict_on_batch(x)
y_r = y.ravel()
y_hat_r = y_hat[:,:,1].ravel()
pad_ids = np.where(y_r!=2)
all_y_r.extend(y_r[pad_ids])
all_y_hat.extend(y_hat_r[pad_ids])
except:
_stop.set()
raise
nb_samples = x.shape[0]
processed_samples += nb_samples
_stop.set()
all_y_r = np.array(all_y_r,dtype=np.dtype(float))
all_y_hat = np.array(all_y_hat,dtype=np.dtype(float))
print all_y_r.shape
print all_y_hat.shape
    print '#####################FRAUD TRANS##################'
    print '# fraud transactions', all_y_hat[np.where(all_y_hat==1)].shape
#######ROC CURVE
    fpr, tpr, thresholds = roc_curve(all_y_r, all_y_hat)
    print all_y_hat
    print thresholds
    print thresholds.shape
    auc_val = auc(fpr, tpr)
    print auc_val
    ############ CLASSIFICATION REPORT ########################
    target_names = ['Genuine', 'Fraud']
    ######### Need to determine the threshold
    all_y_hat[np.where(all_y_hat >= thresholds[1])] = 1
    all_y_hat[np.where(all_y_hat < thresholds[1])] = 0
clc_report = classification_report(all_y_r, all_y_hat, target_names=target_names)
############Accuracy
acc = accuracy_score(all_y_r,all_y_hat)
if plt_filename != None:
trace = Scatter(x=fpr,y=tpr)
data = [trace]
title = 'ROC'
layout = Layout(title=title, width=800, height=640)
fig = Figure(data=data, layout=layout)
py.image.save_as(fig,filename=plt_filename)
return [auc_val,clc_report,acc]
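The cut at thresholds[1] above is essentially arbitrary (one of the highest scores returned by roc_curve); a common alternative, sketched here and not what eval_auc_generator does, is to pick the ROC point that maximises tpr - fpr (Youden's J):
In [ ]:
# assumes fpr, tpr, thresholds as returned by sklearn.metrics.roc_curve
best = np.argmax(tpr - fpr)
print thresholds[best], tpr[best], fpr[best]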
In [ ]:
plt_filename= None
eval_gen = eval_users_generator(disk_engine,encoders,table=table,
batch_size=400,usr_ratio=80,class_weight=None,lbl_pad_val = lbl_pad_val, pad_val = pad_val)
eval_list = eval_auc_generator(model, eval_gen, val_samples, max_q_size=10000,plt_filename=plt_filename)
In [ ]:
filter(lambda a: a!=[],[1,2,[],2,2,[]])
In [264]:
from ccfd_dnn.model import *
data_gen = data_generator(user_mode,trans_mode,disk_engine,encoders,table=table,
batch_size=400,usr_ratio=80,class_weight=None,lbl_pad_val = 2, pad_val = -1)
next(data_gen)
Out[264]:
In [259]:
df
In [266]:
','.join([str(1),str(2),str(3)])
Out[266]:
In [267]:
model.__dict__
Out[267]:
In [269]:
model.history.__dict__
Out[269]:
In [ ]:
users = np.random.choice(aa_milne_arr, 5, replace=False)
In [271]:
next(data_gen)
Out[271]:
In [ ]:
model.train_on_batch(x, y, sample_weight=None, class_weight=None)
In [356]:
py.sign_in('bottydim', 'o1kuyms9zv')
In [357]:
help(keras.callbacks.RemoteMonitor())
In [359]:
remote_log = keras.callbacks.RemoteMonitor(root='root_url', path='/publish/epoch/end/')
In [ ]: