In [ ]:
import pandas as pd
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import Imputer
%matplotlib inline
In [ ]:
!. ~/.bashrc
In [ ]:
from sklearn.cross_validation import train_test_split
np.random.seed(1337)
import theano
In [ ]:
import keras
In [ ]:
import math
In [ ]:
%load_ext line_profiler
%lprun -f function_to_profile statement_that_invokes_the_function
# %load_ext cythonmagic is deprecated; use %load_ext Cython instead
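A minimal, hypothetical example of the same pattern, profiling sum_uncythonized (defined below):
In [ ]:
# %lprun -f sum_uncythonized sum_uncythonized()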
In [ ]:
%load_ext Cython
In [ ]:
%%cython
def sum_cythonized():
    cdef long a = 0  # this directive defines a C type for the variable
    cdef int i = 0
    for i in range(100000):
        a += i
    return a
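As a sanity check, the result should match the closed form sum(range(n)) = n*(n-1)/2:
In [ ]:
assert sum_cythonized() == 100000 * 99999 // 2   # 4999950000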
In [ ]:
def sum_uncythonized():
    a = 0
    for i in range(100000):
        a += i
    return a
In [ ]:
%timeit sum_cythonized()
In [ ]:
%timeit sum_uncythonized()
In [ ]:
import time

def check_args(*types):
    def real_decorator(func):
        def wrapper(*args, **kwargs):
            for val, typ in zip(args, types):
                assert isinstance(val, typ), "Value {} is not of expected type {}".format(val, typ)
            return func(*args, **kwargs)
        return wrapper
    return real_decorator

def do_long_computation(name):
    """ dummy function """
    time.sleep(10)
    return "FruitMart"

@check_args(str, int, int)
def print_fruit(name, apples, oranges):
    pass
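A quick usage sketch: the decorator validates each positional argument against the declared types at call time.
In [ ]:
print_fruit('apple', 1, 2)     # passes the isinstance checks
# print_fruit(1, 'apple', 2)   # would raise AssertionError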
In [ ]:
data_dir = './data/'
evt_name = 'Featurespace_events_output.csv'
auth_name = 'Featurespace_auths_output.csv'
In [ ]:
df = pd.read_csv(data_dir+evt_name)
In [ ]:
df
In [ ]:
df_pure = pd.read_csv(data_dir+auth_name,nrows=10000)
In [ ]:
df_pure.TSYS_DCLN_REAS_CD
In [ ]:
df_pure.columns.values
In [ ]:
df.isnull().sum()
In [ ]:
df.dropna()
CARD_VFCN_REJ_CD contains only NaNs, so the default dropna() (how='any') drops every row.
In [ ]:
df.dropna(how='all')
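A column-wise alternative (a sketch): dropping columns that are entirely NaN removes CARD_VFCN_REJ_CD without discarding any rows.
In [ ]:
df.dropna(axis=1, how='all')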
In [ ]:
grouped = df.groupby('acct_id')
In [ ]:
grouped.groups
In [ ]:
grouped_lbl = df.groupby('FRD_IND')
In [ ]:
grouped_lbl.count()
In [ ]:
var = grouped_lbl.count().stack()
temp = var.unstack()
type(temp)
x_list = temp['acct_id']
label_list = temp.index
plt.axis("equal")  # the pie chart is oval by default; "equal" makes it a circle
# To show the percentage of each slice, pass an output format to the autopct parameter
plt.pie(x_list, labels=label_list, autopct="%1.1f%%")
plt.title("Transactions")
plt.show()
In [ ]:
col_names = list(df.columns.values)
In [ ]:
for c, col in enumerate(col_names):
    var = grouped_lbl.count().stack()
    temp = var.unstack()
    x_list = temp[col]
    label_list = temp.index
    plt.axis("equal")  # the pie chart is oval by default; "equal" makes it a circle
    # To show the percentage of each slice, pass an output format to the autopct parameter
    # plt.subplot(12,4,c+1)
    plt.pie(x_list, labels=label_list, autopct="%1.1f%%")
    plt.title(col)
    plt.show()
    if c == 45:
        break
In [ ]:
def value_exist(val, col, df):
    keys = set(df.groupby(col).groups.keys())
    print keys
    return val in keys

col = 'AUTHZN_APPRL_CD'
value_exist(55555, col, df)
In [ ]:
val = 55555
df_rmna = df.fillna(value={'AUTHZN_APPRL_CD':val})
df_rmna
In [ ]:
df_auth = pd.read_csv(data_dir+auth_name,nrows=500000)
In [ ]:
df_auth
In [ ]:
# note: & binds tighter than ==, so the comparison needs its own parentheses
df_auth[df_auth['MRCH_CITY_NM'].isnull() & (df_auth['MRCH_NM'] == 'FYP')]
In [ ]:
df_auth['MRCH_NM'].isnull()
In [ ]:
df_auth.isnull().sum()
In [ ]:
num_seq = 1597.0                 # sequences processed per epoch
epoch_t = 638                    # seconds per epoch
time_per_seq = epoch_t/num_seq   # seconds per sequence
net_num_t = 2e6                  # total workload to process
total_time = net_num_t*time_per_seq
hours = total_time/3600
days = hours/24
print 'NET hours', hours
print 'NET days', days
In [ ]:
import plotly.tools as tls
import pandas as pd
from sqlalchemy import create_engine # database connection
import datetime as dt
from IPython.display import display
import plotly.plotly as py # interactive graphing
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import Bar, Scatter, Marker, Layout, Figure
In [ ]:
init_notebook_mode()
In [ ]:
data_dir = './data/'
evt_name = 'Featurespace_events_output.csv'
auth_name = 'Featurespace_auths_output.csv'
db_name = 'c1_agg.db'
In [3]:
5 if None else 10   # None is falsy, so the else branch is taken
Out[3]:
10
In [ ]:
import sqlite3
import time
In [ ]:
def init_sqlite3(dbname):
conn = sqlite3.connect(dbname)
return conn
In [ ]:
db_conn = init_sqlite3(data_dir+db_name)
In [ ]:
disk_engine = create_engine('sqlite:///'+data_dir+db_name,convert_unicode=True)
disk_engine.raw_connection().connection.text_factory = str
In [ ]:
table = 'auth'
In [ ]:
t0 = time.time()
df = pd.read_sql_query('select distinct FRD_IND,count(distinct acct_id) as num_usr '
'from {table} '
'group by FRD_IND'.format(table=table), disk_engine)
t1 = time.time()
print str(t1-t0)
df
In [ ]:
t0 = time.time()
df = pd.read_sql_query('select distinct FRD_IND,count(distinct acct_id) as num_usr '
'from {table} '
'group by FRD_IND'.format(table=table), db_conn)
t1 = time.time()
print str(t1-t0)
df
In [ ]:
df['num_usr'][0]
In [ ]:
title = 'Fraud by Distinct Users'
fig = {
'data': [{'labels': ['Fraud', 'Genuine'],
'values': [df['num_usr'][1], df['num_usr'][0]],
'type': 'pie'}],
'layout': {'title': title}
}
iplot(fig,filename='figures/'+title)
In [ ]:
usr_ratio = df['num_usr'][0]/ df['num_usr'][1]
usr_ratio
In [ ]:
usr_ratio= 80
In [ ]:
df_ds_u = pd.read_sql_query('select distinct acct_id, FRD_IND '
'from {table} '
'order by FRD_IND'.format(table=table), disk_engine)
df_ds_u
In [ ]:
df_t_u = pd.read_sql_query('select acct_id, count(*) as num_trans '
'from {table} '
'group by acct_id '
'order by -num_trans'.format(table=table), disk_engine)
df_t_u
In [ ]:
### graph bars: % fraud across all transactions
In [ ]:
pad_val = -1
In [ ]:
table= 'data_little'
In [ ]:
from sklearn import preprocessing
def encode_column(df_col):
    print df_col.shape
    le = preprocessing.LabelEncoder()
    le.fit(df_col)
    return le
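A usage sketch (label values are illustrative, based on FRD_IND's Y/N coding seen later in the notebook): the fitted LabelEncoder maps each distinct string to an integer code in sorted order.
In [ ]:
le = encode_column(df['FRD_IND'])
le.transform(['N', 'Y'])   # e.g. array([0, 1])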
In [ ]:
df = pd.read_sql_query('select * from {table}'.format(table=table),disk_engine)
df.head()
In [ ]:
encoders = {}
time_cols = ['AUTHZN_RQST_PROC_TM','PREV_ADR_CHNG_DT','PREV_PMT_DT','PREV_CARD_RQST_DT','FRD_IND_SWT_DT']
for c, r in enumerate(df):   # iterating a DataFrame yields column names
    tp = df.dtypes[c]
    # print tp
    if tp == 'object':
        if r not in time_cols:
            encoders[r] = encode_column(df[r])
encoders
In [ ]:
def populate_encoders(table, disk_engine):
    df = pd.read_sql_query('select * from {table}'.format(table=table), disk_engine)
    encoders = {}
    time_cols = ['AUTHZN_RQST_PROC_TM','PREV_ADR_CHNG_DT','PREV_PMT_DT','PREV_CARD_RQST_DT','FRD_IND_SWT_DT']
    for c, r in enumerate(df):
        tp = df.dtypes[c]
        if tp == 'object':
            if r not in time_cols:
                encoders[r] = encode_column(df[r])
    return encoders
In [ ]:
def populate_encoders_scale(table, disk_engine):
    df = pd.read_sql_query('select * from {table} limit 5'.format(table=table), disk_engine)
    col_names = df.columns.values
    encoders = {}
    time_cols = ['AUTHZN_RQST_PROC_TM','PREV_ADR_CHNG_DT','PREV_PMT_DT','PREV_CARD_RQST_DT','FRD_IND_SWT_DT']
    for c, name in enumerate(col_names):
        tp = df.dtypes[c]
        if tp == 'object':
            if name not in time_cols:
                df_col = pd.read_sql_query('select distinct {col_name} from {table}'.format(col_name=name, table=table), disk_engine)
                encoders[name] = encode_column(np.array(df_col).ravel())
    return encoders
In [ ]:
table = 'auth'
encoders = populate_encoders_scale(table,disk_engine)
# data_gen = data_generator(disk_engine,encoders,table=table)
# total_trans = 0
# sample_num=484
# while total_trans < sample_num:
# total_trains +=next(data_gen)[0].shape[0]
In [ ]:
# Interleave users: take usr_ratio genuine users from the head of the
# FRD_IND-ordered list for every fraud user taken from the tail.
users = set()
cnt = 0
head = 0
tail = len(df_ds_u.acct_id) - 1
sample_size = tail
for i in range(sample_size):
    if cnt < usr_ratio:
        users.add(df_ds_u.acct_id[head])
        cnt += 1
        head += 1
    else:
        users.add(df_ds_u.acct_id[tail])
        tail -= 1
        cnt = 0
In [ ]:
def encode_df(df, encoders):
    for col in encoders.keys():
        try:
            df[col] = encoders[col].transform(df[col])
        except:
            print 'EXCEPTION'
            display(df[col])
            print col
            raise
    for col in time_cols:   # time_cols is defined at notebook scope above
        df[col] = pd.to_numeric(pd.to_datetime(df[col], errors='coerce'))
In [ ]:
def get_user_info(user, table):
    if user == '.':
        user = '"."'
    df_u = pd.read_sql_query('select * from {table} where acct_id = {user}'.format(table=table, user=user), disk_engine)
    return df_u
In [ ]:
def get_last_date(df_u, cuttoff_date):
    df_trim = df_u[df_u['FRD_IND_SWT_DT'] >= pd.to_numeric(pd.Series(pd.to_datetime(cuttoff_date)))[0]]
    ### a historically later transaction may have been confirmed earlier than a historically preceding one
    df_trim = df_trim.sort_values('AUTHZN_RQST_PROC_TM', ascending=True, inplace=False)
    df_trim = df_trim.reset_index(drop=True)
    if not df_trim.empty:
        return df_trim['AUTHZN_RQST_PROC_TM'][0]
    else:
        return None
In [ ]:
cuttoff_date='2014-05-11'
pd.to_numeric(pd.Series(pd.to_datetime(cuttoff_date)))[0]
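Note: pd.to_numeric on a datetime64[ns] Series yields nanoseconds since the Unix epoch, which is why the timestamps above can be compared as plain integers.
In [ ]:
pd.to_numeric(pd.Series(pd.to_datetime('1970-01-01')))[0]   # 0 at the epoch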
In [ ]:
def get_col_id(col, df):
    col_list = list(df.columns.values)
    col_list.remove('index')
    return col_list.index(col)

def generate_sequence(user, table, encoders, cuttoff_date='2014-05-11'):
    df_u = get_user_info(user, table)
    # These columns are unavailable at authorization time, so shift them down
    # one row: each transaction sees only the previous transaction's values.
    unav_cols = ['AUTHZN_APPRL_CD','TSYS_DCLN_REAS_CD','AUTHZN_RESPNS_CD','AUTHZN_APPRD_AMT']
    nan_rpl = ['AUTHZN_APPRL_CD']
    for col in unav_cols:
        df_u[col] = df_u[col].shift(1)
        loc = list(df_u.columns.values).index(col)
        if col in nan_rpl:
            df_u.iloc[0, loc] = 'nan'
        else:
            df_u.iloc[0, loc] = pad_val
    encode_df(df_u, encoders)
    df_u = df_u.sort_values('AUTHZN_RQST_PROC_TM', ascending=True)
    df_u = df_u.drop('index', axis=1)
    ### This is the last date before which transactions will be used for training.
    ### It corresponds to the date when the last known fraudulent transaction was confirmed.
    last_date_num = get_last_date(df_u, cuttoff_date)
    if last_date_num is None:
        train = np.array(df_u)
        return train[:, 0:-2], [], train[:, -2], []
    else:
        df_train = df_u[df_u['AUTHZN_RQST_PROC_TM'] < last_date_num]
        df_test = df_u[df_u['AUTHZN_RQST_PROC_TM'] >= last_date_num]
        train = np.array(df_train)
        test = np.array(df_test)
        # features are all columns but the last two; the label is taken
        # from the second-to-last column
        return train[:, 0:-2], test[:, 0:-2], train[:, -2], test[:, -2]
In [ ]:
user = '128237902'
table = 'data_trim'
encoders = encoders
train,test,y,y_tes = generate_sequence(user,table,encoders)
In [ ]:
usr_ratio
In [ ]:
loc = list(df.columns.values).index('AUTHZN_APPRL_CD')
df.iloc[0,loc] = 'nan'
In [ ]:
df.TSYS_DCLN_REAS_CD.dtype
In [ ]:
train[:,1] = np.roll(train[:,1],1)
In [ ]:
def set_roll_values(array):
    pass   # stub; presumably meant to wrap the np.roll shift above
In [ ]:
display(col_list[35])
In [ ]:
np.array_split(X_train_S[0:2], 5)[0][0].shape
In [ ]:
map(lambda x: len(x),X_train_S[0:10])
In [ ]:
split_seq = map(lambda x: np.array_split(x,math.ceil(len(x)/50.0)) if len(x)>50 else [x],X_train_S[0:10])
In [ ]:
172%50
In [ ]:
map(lambda x: len(x),split_seq)
In [ ]:
len(map(lambda x: reduce(lambda y,z: np.vstack([y,z]),x),split_seq))
In [ ]:
flattened = [sequence for user_seq in split_seq for sequence in user_seq]
In [ ]:
len(flattened)
In [ ]:
map(lambda x: len(x),flattened)
In [ ]:
def chunck_seq(seq_list, seq_len=50.0):
    split_seq = map(lambda x: np.array_split(x, math.ceil(len(x)/seq_len)) if len(x) > seq_len else [x], seq_list)
    flattened = [sequence for user_seq in split_seq for sequence in user_seq]
    assert sum(map(lambda x: len(x), flattened)) == sum(map(lambda x: len(x), seq_list))
    return flattened
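A quick sketch of the chunking behaviour: a 120-step sequence splits into ceil(120/50) = 3 roughly equal chunks, while sequences of 50 steps or fewer pass through unchanged.
In [ ]:
chunks = chunck_seq([np.zeros((120, 4)), np.zeros((30, 4))])
map(lambda x: len(x), chunks)   # [40, 40, 40, 30]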
In [ ]:
from keras.preprocessing.sequence import pad_sequences
In [ ]:
pad_chunk = keras.preprocessing.sequence.pad_sequences(chunck_seq(X_train_S), maxlen=None,dtype='float32')
In [ ]:
pad_chunk.shape
In [ ]:
X_train_pad = keras.preprocessing.sequence.pad_sequences(X_train_S, maxlen=None,dtype='float32')
In [ ]:
X_train_pad.shape
In [ ]:
X_train_S[0][-1]
In [ ]:
X_train_pad[0][-1]
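pad_sequences pads at the start by default (padding='pre'), so the two cells above should print the same final timestep.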
In [ ]:
def sequence_generator(users, encoders, mode='train', table='data_trim', class_weight=None):
    X_train_S = []
    X_test_S = []
    y_train_S = []
    y_test_S = []
    print "Number of users:", len(users)
    for user in users:
        X_train, X_test, y_train, y_test = generate_sequence(user, table, encoders)
        X_train_S.append(X_train)
        X_test_S.append(X_test)
        y_train_S.append(y_train)
        y_test_S.append(y_test)
    # users with no post-cutoff transactions contribute empty test lists
    X_test_S = filter(lambda a: a != [], X_test_S)
    y_test_S = filter(lambda a: a != [], y_test_S)
    if mode == 'train':
        X_train_pad = keras.preprocessing.sequence.pad_sequences(chunck_seq(X_train_S), maxlen=None, dtype='float32', value=pad_val)
        y_train_S = keras.preprocessing.sequence.pad_sequences(np.array(chunck_seq(y_train_S)), maxlen=None, dtype='float32', value=lbl_pad_val)
        y_train_S = np.expand_dims(y_train_S, -1)
        if class_weight is not None:
            # per-timestep sample weights, so padded steps (label 2) get weight 0
            shps = y_train_S.shape
            sample_w = []
            for i in range(shps[0]):
                sample_w.append([])
                for j in range(shps[1]):
                    sample_w[i].append(class_weight[y_train_S[i, j, 0]])
            return X_train_pad, y_train_S, np.asarray(sample_w)
        return X_train_pad, y_train_S
    else:
        X_test_S_pad = keras.preprocessing.sequence.pad_sequences(chunck_seq(X_test_S), maxlen=None, dtype='float32', value=pad_val)
        y_test_S = keras.preprocessing.sequence.pad_sequences(np.array(chunck_seq(y_test_S)), value=lbl_pad_val)
        y_test_S = np.expand_dims(y_test_S, -1)
        return X_test_S_pad, y_test_S
In [ ]:
def user_generator(disk_engine, table='data_trim', sample_size=400, usr_ratio=80, mode='train'):
    dataFrame = pd.read_sql_query('select distinct acct_id, FRD_IND '
                                  'from {table} '
                                  'order by FRD_IND'.format(table=table), disk_engine)
    print "User List acquired"
    u_list = list(dataFrame.acct_id)
    print 'total # users:', len(u_list)
    user_tr, user_ts = train_test_split(u_list, test_size=0.33, random_state=42)
    if mode == 'train':
        u_list = user_tr
    else:
        u_list = user_ts
    print 'return set cardinality:', len(u_list)
    cnt = 0
    head = 0
    tail = len(u_list) - 1
    while True:
        users = set()
        for i in range(sample_size):
            if cnt < usr_ratio:
                users.add(u_list[head])
                cnt += 1
                head += 1
            else:
                users.add(u_list[tail])
                tail -= 1
                cnt = 0
            if head == tail:
                head = 0
                tail = len(u_list) - 1
        print 'return list length:', len(users)
        print '# users experiencing both', len(u_list) - len(users)
        yield users
In [ ]:
pd.set_option("display.max_rows",60)
In [ ]:
user_gen = user_generator(disk_engine)
In [ ]:
for i in range(3):
print(next(user_gen))
In [ ]:
def data_generator(disk_engine, encoders, table='data_trim', sample_size=400, usr_ratio=80, class_weight=None):
    user_gen = user_generator(disk_engine, usr_ratio=usr_ratio, sample_size=sample_size, table=table)
    print "Users generator"
    while True:
        users = next(user_gen)
        yield sequence_generator(users, encoders, mode='train', table=table, class_weight=class_weight)
In [ ]:
data_gen = data_generator(disk_engine,encoders,table='data_trim',class_weight=class_weight)
In [ ]:
# %debug
X_train_pad,y_train_S,sample_w = next(data_gen)
print X_train_pad.shape
print y_train_S.shape
print sample_w.shape
In [ ]:
print X_train_pad[-1]
print y_train_S[-1]
In [ ]:
from keras.models import Model
from keras.layers import Input, Dense, GRU, LSTM, TimeDistributed, Masking
from keras.utils.np_utils import to_categorical
In [ ]:
hidden_dim = 400
num_layers = 1
optimizer=keras.optimizers.RMSprop(lr=0.01, rho=0.9, epsilon=1e-08)
samples_per_epoch = 485
nb_epoch = 20
table = 'data_trim'
lbl_pad_val = 2
pad_val = -1
class_weight = {0: 1.,    # genuine
                1: 10.,   # fraud (upweighted)
                2: 0.}    # label padding (ignored in the loss)
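The feature pad value (-1) matches the Masking layer's mask_value in the model below, and sample_weight_mode='temporal' lets the per-timestep weights zero out the padded steps.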
In [ ]:
encoders = populate_encoders(table,disk_engine)
In [ ]:
encoders
In [ ]:
input_layer = Input(shape=(50, 44),name='main_input')
mask = Masking(mask_value=pad_val)(input_layer)
prev = GRU(hidden_dim,#input_length=50,
return_sequences=True,go_backwards=False,stateful=False,
unroll=False,consume_less='gpu',
init='glorot_uniform', inner_init='orthogonal', activation='tanh',
inner_activation='hard_sigmoid', W_regularizer=None, U_regularizer=None,
b_regularizer=None, dropout_W=0.0, dropout_U=0.0)(mask)
# for i in range(num_layers-1):
# prev = GRU(output_dim, init='glorot_uniform', inner_init='orthogonal', activation='tanh',
# inner_activation='hard_sigmoid', W_regularizer=None, U_regularizer=None,
# b_regularizer=None, dropout_W=0.0, dropout_U=0.0)
output_layer = TimeDistributed(Dense(3,activation='softmax'))(prev)
model = Model(input=[input_layer],output=[output_layer])
model.compile(optimizer=optimizer,
loss='sparse_categorical_crossentropy',
# metrics=['accuracy','hinge','squared_hinge','binary_accuracy','binary_crossentropy'])
metrics=['accuracy'],
sample_weight_mode="temporal")
data_gen = data_generator(disk_engine,encoders,table=table,class_weight=class_weight)
# data_gen = data_generator(disk_engine,encoders,table=table,class_weight=class_weight)
history = model.fit_generator(data_gen, samples_per_epoch, nb_epoch, verbose=1, callbacks=[],validation_data=None, nb_val_samples=None, class_weight=class_weight, max_q_size=10000)
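Note: in the Keras 1.x fit_generator API used here, samples_per_epoch counts individual (padded) sequences rather than batches, and each yield from data_gen supplies one batch together with its temporal sample weights.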
In [ ]:
for i in range(X_train_pad.shape[0]):
    if 1 in list(y_train_S[i]):
        print i
In [ ]:
prediction = model.predict(X_train_pad)
In [ ]:
prediction.shape
In [ ]:
prediction.shape
In [ ]:
y_train_S
In [ ]:
id_ = 391
join_ = np.dstack([prediction, y_train_S])
df_pred = pd.DataFrame(join_[id_])
# drop padded timesteps (label 2) before computing the ROC
df_trim = df_pred.drop(df_pred[df_pred[3] == 2.0].index)
roc_join = np.array(df_trim)
In [ ]:
roc_join
In [ ]:
from sklearn.metrics import roc_curve, auc
# Compute ROC curve and area under the curve
fpr, tpr, thresholds = roc_curve(roc_join[:, 3], roc_join[:, 1])
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, lw=1, label='ROC (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
In [ ]:
classes = np.unique(roc_join[:,3])
classes
In [ ]:
from scipy import interp

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []
join_ = np.dstack([prediction, y_train_S])
for i in range(prediction.shape[0]):
    df_pred = pd.DataFrame(join_[i])
    # drop padded timesteps (label 2)
    df_trim = df_pred.drop(df_pred[df_pred.iloc[:, 3] == 2.0].index)
    roc_join = np.array(df_trim)
    fpr, tpr, thresholds = roc_curve(roc_join[:, 3], roc_join[:, 1])
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
mean_tpr /= prediction.shape[0]   # average over all sequences
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 'k--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
In [ ]:
index = 23
d = {'target' : pd.Series(np.reshape(y_train_S[index],len(y_train_S[index]))),
'pred' : pd.Series(np.reshape(prediction[index][0],len(prediction[index][0]))),
'pred_2' : pd.Series(np.reshape(prediction[index][1],len(prediction[index][1]))),}
df_pred = pd.DataFrame(d)
df_pred
In [ ]:
hid_dim = [256,512,1024,2048]
num_l = [1,2,3,4,5,6,7,8,9,10]
lr_s = [1e-1,1e-2,1e-3]
opts = lambda x,lr:[keras.optimizers.RMSprop(lr=lr, rho=0.9, epsilon=1e-08),
keras.optimizers.Adam(lr=lr, beta_1=0.9, beta_2=0.999, epsilon=1e-08),
keras.optimizers.Nadam(lr=lr, beta_1=0.9, beta_2=0.999, epsilon=1e-08, schedule_decay=0.004)][x]
hidden_dim = 400
num_layers = 1
optimizer=keras.optimizers.RMSprop(lr=0.01, rho=0.9, epsilon=1e-08)
samples_per_epoch = 485
nb_epoch = 20
table = 'data_trim'
lbl_pad_val = 2
pad_val = -1
encoders = populate_encoders(table,disk_engine)
In [ ]:
opts(2,1)
In [ ]:
for hidden_dim in hid_dim:
    for opt_id in range(3):
        for lr in lr_s:
            optimizer = opts(opt_id, lr)
            for num_layers in num_l:
                for rnn in ['gru', 'lstm']:
                    input_layer = Input(shape=(50, 44), name='main_input')
                    mask = Masking(mask_value=0)(input_layer)  # note: differs from the generator's pad_val of -1
                    if rnn == 'gru':
                        prev = GRU(hidden_dim,
                                   return_sequences=True, go_backwards=False, stateful=False,
                                   unroll=False, consume_less='gpu',
                                   init='glorot_uniform', inner_init='orthogonal', activation='tanh',
                                   inner_activation='hard_sigmoid', W_regularizer=None, U_regularizer=None,
                                   b_regularizer=None, dropout_W=0.0, dropout_U=0.0)(mask)
                    else:
                        # LSTM branch (as in the grid-search cell further below);
                        # return_sequences=True is required for TimeDistributed output
                        prev = LSTM(hidden_dim, return_sequences=True,
                                    init='glorot_uniform', inner_init='orthogonal',
                                    forget_bias_init='one', activation='tanh',
                                    inner_activation='hard_sigmoid')(mask)
                    for i in range(num_layers - 1):
                        if rnn == 'gru':
                            prev = GRU(hidden_dim,
                                       return_sequences=True, go_backwards=False, stateful=False,
                                       unroll=False, consume_less='gpu',
                                       init='glorot_uniform', inner_init='orthogonal', activation='tanh',
                                       inner_activation='hard_sigmoid', W_regularizer=None, U_regularizer=None,
                                       b_regularizer=None, dropout_W=0.0, dropout_U=0.0)(prev)
                        else:
                            prev = LSTM(hidden_dim, return_sequences=True,
                                        init='glorot_uniform', inner_init='orthogonal',
                                        forget_bias_init='one', activation='tanh',
                                        inner_activation='hard_sigmoid')(prev)
                    output_layer = TimeDistributed(Dense(1))(prev)
                    model = Model(input=[input_layer], output=[output_layer])
                    model.compile(optimizer=optimizer,
                                  loss='binary_crossentropy',
                                  metrics=['accuracy'])
                    data_gen = data_generator(disk_engine, encoders, table=table)
                    history = model.fit_generator(data_gen, samples_per_epoch, nb_epoch, verbose=1, callbacks=[], validation_data=None, nb_val_samples=None, class_weight=None, max_q_size=10)
In [ ]:
history = model.fit_generator(data_gen, samples_per_epoch, nb_epoch, verbose=1, callbacks=[],validation_data=None, nb_val_samples=None, class_weight=None, max_q_size=10)
In [ ]:
history.__dict__
In [ ]:
print train_test_split(range(10), test_size=0.33, random_state=42)
In [ ]:
print len(X_train_S)
print len(X_test_S)
print len(y_train_S)
print len(y_test_S)
In [ ]:
y_test_S
In [ ]:
'.' in users
In [ ]:
users
In [ ]:
df = pd.read_sql_query('select * '
'from {table} '.format(table=table), disk_engine)
df.head()
In [ ]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(df.MRCH_NM)
In [ ]:
len(le.classes_)
In [ ]:
encoders['acct_id'].transform(['.'])
In [ ]:
df_u = get_user_info("'.'",'data')
In [ ]:
df_u= df
In [ ]:
df_u[df_u['FRD_IND']=='Y'].head()
In [ ]:
pd.Timestamp('20120101')
In [ ]:
df_u[df_u['FRD_IND_SWT_DT']>pd.Timestamp('20120101')]
In [ ]:
df_u.head()
In [ ]:
df_test = df_u['AUTHZN_AMT']>10
df_test.value_counts()
In [ ]:
df_u[df_u['FRD_IND']=='Y']
In [ ]:
df_u['FRD_IND'].value_counts()
In [ ]:
df_u[df_u['acct_id']=='337018623']['CAVV_CD'].head()
In [ ]:
pd.Timestamp('2013-05-11')
In [ ]:
le = preprocessing.LabelEncoder()
le.fit(df_u[df_u['acct_id']=='337018623']['CAVV_CD'])
In [ ]:
le.classes_
In [ ]:
le.transform([None])
In [ ]:
pd.to_numeric(pd.Series(pd.to_datetime('2013-05-11')))[0]
In [ ]:
pd.to_numeric(pd.Series(pd.to_datetime('2014-05-11')))[0]
In [ ]:
x = np.arange(10.0)
np.array_split(x, 30)
In [ ]:
history.__dict__
In [ ]:
init_notebook_mode()
In [ ]:
# Create a simple chart.
trace = Scatter(x=history.epoch,y=history.history['acc'])
data = [trace]
layout = Layout(title=title, width=800, height=640)
fig = Figure(data=data, layout=layout)
iplot(fig)
In [ ]:
title = 'Training Acc'
fig = {
'data': [Scatter(
x=history.epoch,
y=history.history['acc'])],
'layout': {'title': title}
}
# iplot(fig,filename='figures/'+title)
# iplot(fig,filename='figures/'+title,image='png')
In [ ]:
history.epoch
In [ ]:
help(fig)
In [ ]:
gru_dict = {}
lstm_dict = {}
for hidden_dim in hid_dim:
    gru_dict[hidden_dim] = {}
    lstm_dict[hidden_dim] = {}
    for opt_id in range(3):
        for lr in lr_s:
            optimizer = opts(opt_id, lr)
            gru_dict[hidden_dim][type(optimizer).__name__] = {}
            lstm_dict[hidden_dim][type(optimizer).__name__] = {}
            for num_layers in num_l:
                for rnn in ['gru', 'lstm']:
                    input_layer = Input(shape=(50, 44), name='main_input')
                    mask = Masking(mask_value=0)(input_layer)
                    if rnn == 'gru':
                        prev = GRU(hidden_dim,
                                   return_sequences=True, go_backwards=False, stateful=False,
                                   unroll=False, consume_less='gpu',
                                   init='glorot_uniform', inner_init='orthogonal', activation='tanh',
                                   inner_activation='hard_sigmoid', W_regularizer=None, U_regularizer=None,
                                   b_regularizer=None, dropout_W=0.0, dropout_U=0.0)(mask)
                    else:
                        # return_sequences=True so stacked layers and the
                        # TimeDistributed output receive full sequences
                        prev = LSTM(hidden_dim, return_sequences=True,
                                    init='glorot_uniform', inner_init='orthogonal',
                                    forget_bias_init='one', activation='tanh', inner_activation='hard_sigmoid',
                                    W_regularizer=None, U_regularizer=None, b_regularizer=None,
                                    dropout_W=0.0, dropout_U=0.0)(mask)
                    for i in range(num_layers - 1):
                        if rnn == 'gru':
                            prev = GRU(hidden_dim,
                                       return_sequences=True, go_backwards=False, stateful=False,
                                       unroll=False, consume_less='gpu',
                                       init='glorot_uniform', inner_init='orthogonal', activation='tanh',
                                       inner_activation='hard_sigmoid', W_regularizer=None, U_regularizer=None,
                                       b_regularizer=None, dropout_W=0.0, dropout_U=0.0)(prev)
                        else:
                            prev = LSTM(hidden_dim, return_sequences=True,
                                        init='glorot_uniform', inner_init='orthogonal',
                                        forget_bias_init='one', activation='tanh', inner_activation='hard_sigmoid',
                                        W_regularizer=None, U_regularizer=None, b_regularizer=None,
                                        dropout_W=0.0, dropout_U=0.0)(prev)
                    output_layer = TimeDistributed(Dense(1))(prev)
In [ ]:
title = title.replace('Loss','Acc')
In [ ]:
title.upper()
In [ ]:
gru_ = {}
gru_[5] = {}
gru_[5][1] = 's'
In [ ]:
help(optimizer)
In [ ]:
type(optimizer).__name__
In [ ]:
data_gen
In [ ]:
from keras.engine import generator_queue
In [ ]:
zs = np.zeros([100])
zs = zs + np.arange(100)
In [ ]:
zs[np.where(zs>65)] = 0
In [ ]:
encoders['FRD_IND'].inverse_transform([1,1])
In [ ]:
from ccfd_dnn.model import *
gen = eval_generator('train','test',disk_engine,encoders,table='data_trim',
sample_size=400,usr_ratio=80,class_weight=None,lbl_pad_val = 2, pad_val = -1)
gen = eval_trans_generator(disk_engine,encoders,table='data_trim',sample_size=400,usr_ratio=80,class_weight=None,lbl_pad_val = 2, pad_val = -1)
next(gen)
In [ ]:
arr = np.array([])
if arr.size == 0:
    print 'd'