In [ ]:
import pandas as pd
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import Imputer

%matplotlib inline

In [ ]:
# source shell config (note: each ! command runs in its own subshell, so exported variables do not persist)
!. ~/.bashrc

In [ ]:
from sklearn.cross_validation import train_test_split
np.random.seed(1337)
import theano

In [ ]:
import keras

In [ ]:
import math

Some useful tricks


In [ ]:
%load_ext line_profiler
%lprun -f function_to_profile statement_that_invokes_the_function

# %load_ext cythonmagic is deprecated; use %load_ext Cython instead


In [ ]:
%load_ext Cython

In [ ]:
%%cython
def sum_cythonized():
    cdef long a = 0 # this directive defines a type for the variable
    cdef int i = 0
    for i in range(100000):
        a += i
    return a

In [ ]:
def sum_uncythonized():
    a = 0
    for i in range(100000):
        a += i
    return a

In [ ]:
%timeit sum_cythonized()

In [ ]:
%timeit sum_uncythonized()
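
The %lprun recipe shown earlier, applied concretely to the pure-Python loop (assuming line_profiler is installed):

In [ ]:
%lprun -f sum_uncythonized sum_uncythonized()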

In [ ]:
def check_args(*types):
    def real_decorator(func):
        def wrapper(*args, **kwargs):
            for val, typ in zip(args, types):
                assert isinstance(val, typ), "Value {} is not of expected type {}".format(val, typ)
            return func(*args, **kwargs)
        return wrapper
    return real_decorator

import time

def do_long_computation(name):
    """ dummy function simulating a slow call """
    time.sleep(10)
    return "FruitMart"

@check_args(str, int, int)
def print_fruit(name, apples, oranges):
    pass
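
A quick check of check_args on hypothetical arguments; the second call violates the declared types and should raise:

In [ ]:
print_fruit('Mary', 12, 3)       # types match, passes silently
try:
    print_fruit(5, 'apples', 3)  # first argument should be str
except AssertionError as e:
    print e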

Data Analysis


In [ ]:
data_dir = './data/'
evt_name = 'Featurespace_events_output.csv'
auth_name = 'Featurespace_auths_output.csv'

In [ ]:
df = pd.read_csv(data_dir+evt_name)

In [ ]:
df

In [ ]:
df_pure = pd.read_csv(data_dir+auth_name,nrows=10000)

In [ ]:
df_pure.TSYS_DCLN_REAS_CD

In [ ]:
df_pure.columns.values

Count Nulls


In [ ]:
df.isnull().sum()

In [ ]:
df.dropna()

CARD_VFCN_REJ_CD has only NaNs


In [ ]:
df.dropna(how='all')
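
dropna(how='all') only drops rows in which every value is NaN; to drop a column that is entirely NaN, such as CARD_VFCN_REJ_CD, pass axis=1:

In [ ]:
df.dropna(axis=1, how='all')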

In [ ]:
grouped = df.groupby('acct_id')

In [ ]:
grouped.groups

In [ ]:
grouped_lbl = df.groupby('FRD_IND')

Feature Exploration


In [ ]:
grouped_lbl.count()

In [ ]:
var = grouped_lbl.count().stack()
temp = var.unstack()
x_list = temp['acct_id']
label_list = temp.index
plt.axis("equal")  # the pie chart is oval by default; pyplot.axis("equal") makes it a circle
# to show the percentage of each slice, pass an output format to the autopct parameter
plt.pie(x_list, labels=label_list, autopct="%1.1f%%")
plt.title("Transactions")
plt.show()

In [ ]:
col_names = list(df.columns.values)

In [ ]:
temp = grouped_lbl.count().stack().unstack()  # identical for every column, so compute it once
for c, col in enumerate(col_names):
    x_list = temp[col]
    label_list = temp.index
    plt.axis("equal")  # make the pie a circle rather than an oval
#     plt.subplot(12,4,c+1)
    plt.pie(x_list, labels=label_list, autopct="%1.1f%%")
    plt.title(col)
    plt.show()
    if c==45:
        break

Dealing with Missing Data


In [ ]:
def value_exist(val,col,df):
    keys = set(df.groupby(col).groups.keys())
    print keys
    return val in keys
col = 'AUTHZN_APPRL_CD'
value_exist(55555,col,df)

In [ ]:
val = 55555
df_rmna = df.fillna(value={'AUTHZN_APPRL_CD':val})
df_rmna

Authentication Data


In [ ]:
df_auth = pd.read_csv(data_dir+auth_name,nrows=500000)

In [ ]:
df_auth

In [ ]:
df_auth[df_auth['MRCH_CITY_NM'].isnull() & (df_auth['MRCH_NM'] == 'FYP')]

In [ ]:
df_auth['MRCH_NM'].isnull()

In [ ]:
df_auth.isnull().sum()

Estimated Time


In [ ]:
num_seq = 1597.0   # sequences processed per epoch
epoch_t = 638      # seconds per epoch
time_per_seq = epoch_t/num_seq  # ~0.4 s per sequence (was inverted in the original)
net_num_t = 2e6    # total number of sequences in the full dataset
total_time = net_num_t*time_per_seq  # ~800,000 s
hours = total_time/3600  # ~222 h
days = hours/24          # ~9.3 days
print 'NET hours', hours
print 'NET days', days

Data Engineering


In [ ]:
import plotly.tools as tls
import pandas as pd
from sqlalchemy import create_engine # database connection
import datetime as dt
from IPython.display import display

import plotly.plotly as py # interactive graphing
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import Bar, Scatter, Marker, Layout, Figure

In [ ]:
init_notebook_mode()

In [ ]:
data_dir = './data/'
evt_name = 'Featurespace_events_output.csv'
auth_name = 'Featurespace_auths_output.csv'
db_name = 'c1_agg.db'

In [3]:
5 if None else 10


Out[3]:
10

In [ ]:
import sqlite3
import time

In [ ]:
def init_sqlite3(dbname):
    conn = sqlite3.connect(dbname)
    return conn

In [ ]:
db_conn = init_sqlite3(data_dir+db_name)

In [ ]:
disk_engine = create_engine('sqlite:///'+data_dir+db_name,convert_unicode=True)
disk_engine.raw_connection().connection.text_factory = str
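
The 'auth' table is assumed to have been loaded from the auths CSV beforehand; a minimal chunked-load sketch (chunk size arbitrary, commented out to avoid duplicating rows):

In [ ]:
# One-time load of the auths CSV into the 'auth' table
# for chunk in pd.read_csv(data_dir+auth_name, chunksize=100000):
#     chunk.to_sql('auth', disk_engine, if_exists='append')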

In [ ]:
table = 'auth'

In [ ]:
t0 = time.time()
df = pd.read_sql_query('select distinct FRD_IND,count(distinct acct_id) as num_usr '
                       'from {table} '
                       'group by FRD_IND'.format(table=table), disk_engine)
t1 = time.time()
print str(t1-t0)
df

In [ ]:
t0 = time.time()
df = pd.read_sql_query('select distinct FRD_IND,count(distinct acct_id) as num_usr '
                       'from {table} '
                       'group by FRD_IND'.format(table=table), db_conn)
t1 = time.time()
print str(t1-t0)
df

In [ ]:
df['num_usr'][0]

In [ ]:
title = 'Fraud by Distinct Users'
fig = {
    'data': [{'labels': ['Fraud', 'Genuine'],
              'values': [df['num_usr'][1], df['num_usr'][0]],
              'type': 'pie'}],
    'layout': {'title': title}
     }
iplot(fig,filename='figures/'+title)

In [ ]:
usr_ratio = df['num_usr'][0]/ df['num_usr'][1]
usr_ratio

In [ ]:
usr_ratio= 80

In [ ]:
df_ds_u = pd.read_sql_query('select distinct acct_id, FRD_IND '
                       'from {table} '
                       'order by FRD_IND'.format(table=table), disk_engine)
df_ds_u

In [ ]:
df_t_u = pd.read_sql_query('select acct_id, count(*) as num_trans '
                       'from {table} '
                        'group by acct_id '
                        'order by -num_trans'.format(table=table), disk_engine)
df_t_u

In [ ]:
### TODO: graph bars - % fraud over all transactions
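
One possible take on that TODO, counting transactions per fraud indicator (a sketch; df_frd is a hypothetical name):

In [ ]:
df_frd = pd.read_sql_query('select FRD_IND, count(*) as num_trans '
                           'from {table} '
                           'group by FRD_IND'.format(table=table), disk_engine)
total = float(df_frd['num_trans'].sum())
iplot({'data': [Bar(x=list(df_frd['FRD_IND']), y=list(100*df_frd['num_trans']/total))],
       'layout': {'title': '% of All Transactions by Fraud Indicator'}})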

Encode Categories


In [ ]:
pad_val = -1

In [ ]:
table= 'data_little'

In [ ]:
from sklearn import preprocessing
def encode_column(df_col):
    # fit a LabelEncoder mapping each distinct value in the column to an integer id
    print df_col.shape
    le = preprocessing.LabelEncoder()
    le.fit(df_col)
    return le
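
In [ ]:
# Toy illustration of encode_column on hypothetical card-network values
le_demo = encode_column(pd.Series(['VISA', 'MC', 'VISA', 'AMEX']))
print le_demo.transform(['VISA', 'AMEX'])     # [2 0] - classes are sorted alphabetically
print le_demo.inverse_transform([0, 1, 2])    # ['AMEX' 'MC' 'VISA']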

In [ ]:
df = pd.read_sql_query('select * from {table}'.format(table=table),disk_engine)
df.head()

In [ ]:
encoders = {}
time_cols = ['AUTHZN_RQST_PROC_TM','PREV_ADR_CHNG_DT','PREV_PMT_DT','PREV_CARD_RQST_DT','FRD_IND_SWT_DT']
for c,r in enumerate(df):
    tp = df.dtypes[c]
#     print tp
    if tp == 'object':
        if r not in time_cols:
            encoders[r] = encode_column(df[r])
encoders

In [ ]:


In [ ]:
def populate_encoders(table,disk_engine):
    df = pd.read_sql_query('select * from {table}'.format(table=table),disk_engine)
    encoders = {}
    time_cols = ['AUTHZN_RQST_PROC_TM','PREV_ADR_CHNG_DT','PREV_PMT_DT','PREV_CARD_RQST_DT','FRD_IND_SWT_DT']
    for c,col_name in enumerate(df):
        tp = df.dtypes[c]
        if tp == 'object':
            if col_name not in time_cols:
                encoders[col_name] = encode_column(df[col_name])
    return encoders

In [ ]:
def populate_encoders_scale(table,disk_engine):
    # fit each encoder on 'select distinct' values so the full table never has to fit in memory
    df = pd.read_sql_query('select * from {table} limit 5'.format(table=table),disk_engine)
    col_names = df.columns.values
    encoders = {}
    time_cols = ['AUTHZN_RQST_PROC_TM','PREV_ADR_CHNG_DT','PREV_PMT_DT','PREV_CARD_RQST_DT','FRD_IND_SWT_DT']
    for c,name in enumerate(col_names):
        tp = df.dtypes[c]
    #     print tp
        if tp == 'object':
            if name not in time_cols:
                df_col = pd.read_sql_query('select distinct {col_name} from {table}'.format(col_name=name,table=table),disk_engine)
                encoders[name] = encode_column(np.array(df_col).ravel())
    return encoders

In [ ]:
table = 'auth'
encoders = populate_encoders_scale(table,disk_engine)
# data_gen =  data_generator(disk_engine,encoders,table=table)
# total_trans = 0
# sample_num=484
# while total_trans < sample_num:
#     total_trans += next(data_gen)[0].shape[0]

In [ ]:
# Interleave users: df_ds_u is ordered by FRD_IND, so the head holds genuine users
# and the tail fraudulent ones; taking usr_ratio from the head per one from the tail
# preserves the overall genuine-to-fraud user ratio in the sample.
users = set()
cnt = 0
head = 0
tail = len(df_ds_u.acct_id)-1
sample_size = tail
for i in range(sample_size):

    if cnt < usr_ratio:
        users.add(df_ds_u.acct_id[head])
        cnt += 1
        head += 1
    else:
        users.add(df_ds_u.acct_id[tail])
        tail -= 1
        cnt = 0

In [ ]:
def encode_df(df,encoders):
    # apply the fitted LabelEncoders in place; time columns become numeric timestamps
    for col in encoders.keys():
        try:
            df[col] = encoders[col].transform(df[col])
        except:
            print 'EXCEPTION in column:', col
            display(df[col])
            raise
    for col in time_cols:
        df[col] = pd.to_numeric(pd.to_datetime(df[col],errors='coerce'))

In [ ]:
def get_user_info(user,table):
    if user == '.':
        user = '"."'
    df_u = pd.read_sql_query('select * from {table} where acct_id = {user}'.format(table=table,user=user),disk_engine)
    return df_u

In [ ]:
def get_last_date(df_u,cuttoff_date):
#     print "Before Trim"
#     display(df_u)
    df_trim = df_u[df_u['FRD_IND_SWT_DT'] >= pd.to_numeric(pd.Series(pd.to_datetime(cuttoff_date)))[0]]
#     print "After Trim"
#     display(df_trim)
    ### a historically later transaction may have been confirmed earlier than a historically preceding one
    df_trim = df_trim.sort_values('AUTHZN_RQST_PROC_TM',ascending=True,inplace=False)
    df_trim = df_trim.reset_index(drop=True)
#     print "After Reorder"
#     display(df_trim)
#     display(df_trim)
    if not df_trim.empty:
#         print 'value to be returned',df_trim['AUTHZN_RQST_PROC_TM'][0]
        return df_trim['AUTHZN_RQST_PROC_TM'][0]
    else:
        return None

In [ ]:
cuttoff_date='2014-05-11'
# converts the date string to nanoseconds since the Unix epoch (matches the encoded time columns)
pd.to_numeric(pd.Series(pd.to_datetime(cuttoff_date)))[0]

In [ ]:
def get_col_id(col,df):
    col_list = list(df.columns.values)
    col_list.remove('index')
    return col_list.index(col)
    
def generate_sequence(user,table,encoders,cuttoff_date='2014-05-11'):
    df_u = get_user_info(user,table)
    # these fields are not available at authorization time, so shift them one transaction
    # back: each transaction only ever sees the previous transaction's outcome
    unav_cols = ['AUTHZN_APPRL_CD','TSYS_DCLN_REAS_CD','AUTHZN_RESPNS_CD','AUTHZN_APPRD_AMT',]
    nan_rpl = ['AUTHZN_APPRL_CD',]
    for col in unav_cols:
        df_u[col] = df_u[col].shift(1)
        loc = list(df_u.columns.values).index(col)
        if(col in nan_rpl):
            df_u.iloc[0,loc] = 'nan'
        else:
            df_u.iloc[0,loc] = pad_val
#     print df_u.count()
#     display(df_u.head())
#     display(df_u.sort_values('AUTHZN_RQST_PROC_TM',ascending=True))
    encode_df(df_u,encoders)
#     print df_u.count()
#     display(df_u.head())
#     display(df_u.sort_values('AUTHZN_RQST_PROC_TM',ascending=True))
    df_u = df_u.sort_values('AUTHZN_RQST_PROC_TM',ascending=True)
#     display(df_u[df_u['FRD_IND_SWT_DT'].isnull()])
    df_u = df_u.drop('index', axis=1)
#     display(df_u[df_u['FRD_IND_SWT_DT'] < pd.to_numeric(pd.Series(pd.to_datetime(cuttoff_date)))[0]].head(8))
### This is the last date before which transactions will be used for training.
### It corresponds to the date when the last known fraudulent transaction was confirmed.
    last_date_num = get_last_date(df_u,cuttoff_date)
    if last_date_num is None:
        train = np.array(df_u)
#         print "No cutt offs"
#         print train[:,0:-2].shape
#         print "labels"
#         print train[:,-2].shape
        return train[:,0:-2],[],train[:,-2],[]
    else:
        df_train = df_u[df_u['AUTHZN_RQST_PROC_TM'] < last_date_num]
        df_test = df_u[df_u['AUTHZN_RQST_PROC_TM'] >= last_date_num]
#     display(df_train)
#     display(df_test)

        
    train = np.array(df_train)
    test = np.array(df_test)

#     print train
#     print test
#     print "Shapes"
#     print train.shape
#     print test.shape
#     print "features"

#     print train[:,0:-2].shape
#     print test[:,0:-2].shape 
#     print "labels"
#     print train[:,-2].shape
#     print test[:,-2].shape 
    return train[:,0:-2],test[:,0:-2],train[:,-2],test[:,-2]

In [ ]:
user = '128237902'
table = 'data_trim'
train,test,y,y_tes = generate_sequence(user,table,encoders)

In [ ]:
usr_ratio

In [ ]:
loc = list(df.columns.values).index('AUTHZN_APPRL_CD')
df.iloc[0,loc] = 'nan'

In [ ]:
df.TSYS_DCLN_REAS_CD.dtype

In [ ]:


In [ ]:
train[:,1] = np.roll(train[:,1],1)

In [ ]:
def set_roll_values(array):
    pass  # stub: never implemented

In [ ]:
display(col_list[35])

In [ ]:
np.array_split(X_train_S[0:2], 5)[0][0].shape

In [ ]:
map(lambda x: len(x),X_train_S[0:10])

In [ ]:
split_seq = map(lambda x: np.array_split(x,math.ceil(len(x)/50.0)) if len(x)>50 else [x],X_train_S[0:10])

In [ ]:
172%50

In [ ]:
map(lambda x: len(x),split_seq)

In [ ]:
len(map(lambda x: reduce(lambda y,z: np.vstack([y,z]),x),split_seq))

In [ ]:
flattened = [sequence for user_seq in split_seq for sequence in user_seq]

In [ ]:
len(flattened)

In [ ]:
map(lambda x: len(x),flattened)

In [ ]:
def chunck_seq(seq_list,seq_len=50.0):
    # split any sequence longer than seq_len into roughly equal chunks of at most seq_len rows
    split_seq = map(lambda x: np.array_split(x,math.ceil(len(x)/seq_len)) if len(x)>seq_len else [x],seq_list)
    flattened = [sequence for user_seq in split_seq for sequence in user_seq]
    assert sum(map(lambda x: len(x),flattened)) == sum(map(lambda x: len(x),seq_list))
    return flattened
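
A toy check of chunck_seq on hypothetical sequence lengths 120, 50 and 7; no resulting chunk exceeds 50 rows:

In [ ]:
toy = [np.zeros((120,3)), np.zeros((50,3)), np.zeros((7,3))]
print map(len, chunck_seq(toy))  # [40, 40, 40, 50, 7]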

In [ ]:
from keras.preprocessing.sequence import pad_sequences

In [ ]:
pad_chunk = keras.preprocessing.sequence.pad_sequences(chunck_seq(X_train_S), maxlen=None,dtype='float32')

In [ ]:
pad_chunk.shape

In [ ]:
X_train_pad = keras.preprocessing.sequence.pad_sequences(X_train_S, maxlen=None,dtype='float32')

In [ ]:
X_train_pad.shape

In [ ]:
X_train_S[0][-1]

In [ ]:
X_train_pad[0][-1]

In [ ]:
def sequence_generator(users,encoders,mode='train',table='data_trim',class_weight=None):
    X_train_S = []
    X_test_S = []
    y_train_S =[]
    y_test_S = []
    print "Number of users:",len(users)
    for user in users:
    #     if user != '337018623': 
    #         continue
        X_train,X_test,y_train,y_test = generate_sequence(user,table,encoders)
        X_train_S.append(X_train)
        X_test_S.append(X_test) 
        y_train_S.append(y_train)
        y_test_S.append(y_test)
    #     break
    X_test_S = filter(lambda a: a != [], X_test_S)
    y_test_S = filter(lambda a: a != [], y_test_S)
    if mode =='train':
        X_train_pad = keras.preprocessing.sequence.pad_sequences(chunck_seq(X_train_S), maxlen=None,dtype='float32',value=pad_val)
        y_train_S = keras.preprocessing.sequence.pad_sequences(np.array(chunck_seq(y_train_S)), maxlen=None,dtype='float32',value=lbl_pad_val)
        y_train_S = np.expand_dims(y_train_S, -1)
        if class_weight is not None:
            shps = y_train_S.shape
            sample_w = []
            for i in range(shps[0]):
                sample_w.append([])
                for j in range(shps[1]):
                    sample_w[i].append(class_weight[y_train_S[i,j,0]])
            return X_train_pad,y_train_S,np.asarray(sample_w)
#         print y_train_S
#         print y_train_S.shape
#         y_train_S = to_categorical(y_train_S,3)
        return X_train_pad,y_train_S
    else:
        X_test_S_pad = keras.preprocessing.sequence.pad_sequences(chunck_seq(X_test_S), maxlen=None,dtype='float32',value=pad_val)
        y_test_S = keras.preprocessing.sequence.pad_sequences(np.array(chunck_seq(y_test_S)),value=lbl_pad_val)
        y_test_S = np.expand_dims(y_test_S, -1)
        return X_test_S_pad,y_test_S  # the padded array was computed but never returned in the original

In [ ]:
def user_generator(disk_engine,table='data_trim',sample_size=400,usr_ratio=80,mode='train'):
    dataFrame = pd.read_sql_query('select distinct acct_id, FRD_IND '
                       'from {table} '
                       'order by FRD_IND'.format(table=table), disk_engine)
#     display(dataFrame) 
    print "User List acquired"
    u_list = list(dataFrame.acct_id)
    print 'total # users:',len(u_list)
    user_tr,user_ts = train_test_split(u_list, test_size=0.33, random_state=42)
    
    if mode == 'train':
        u_list =  user_tr
    else:
        u_list =  user_ts                              
#     display(dataFrame.acct_id)
    print 'return set cardinality:',len(u_list)
    cnt = 0
    head = 0
    tail = len(u_list)-1
    
    while True:
        users = set()
        for i in range(sample_size):
            
            if cnt<usr_ratio:
                users.add(u_list[head])
                cnt+=1
                head+=1
            else:
                users.add(u_list[tail])
                tail-=1
                cnt=0
            if head ==tail:
                    head = 0
                    tail = len(u_list)-1
#         print head
#         print tail
        print 'return list length:',len(users)
        print '# users experiencing both', len(u_list)-len(users)
        yield users

In [ ]:
pd.set_option("display.max_rows",60)

In [ ]:
user_gen = user_generator(disk_engine)

In [ ]:
for i in range(3):
    print(next(user_gen))

In [ ]:
def data_generator(disk_engine,encoders,table='data_trim',sample_size=400,usr_ratio=80,class_weight=None):
    user_gen = user_generator(disk_engine,usr_ratio=usr_ratio,sample_size=sample_size,table=table)
    print "Users generator"
    while True:
        users = next(user_gen)
        yield sequence_generator(users,encoders,mode='train',table=table,class_weight=class_weight)

In [ ]:
data_gen =  data_generator(disk_engine,encoders,table='data_trim',class_weight=class_weight)

In [ ]:
# %debug
X_train_pad,y_train_S,sample_w = next(data_gen)
print X_train_pad.shape
print y_train_S.shape
print sample_w.shape

In [ ]:
print X_train_pad[-1]
print y_train_S[-1]

Deep Learning Model


In [ ]:
from keras.models import Model
from keras.layers import Input, Dense, GRU, LSTM, TimeDistributed, Masking
from keras.utils.np_utils import to_categorical

In [ ]:
hidden_dim = 400
num_layers = 1
optimizer=keras.optimizers.RMSprop(lr=0.01, rho=0.9, epsilon=1e-08)
samples_per_epoch = 485
nb_epoch = 20
table = 'data_trim'
lbl_pad_val = 2
pad_val = -1
class_weight = {0 : 1.,   # genuine
               1: 10.,    # fraud, up-weighted
               2: 0.}     # label-padding class; zero weight masks padded timesteps

In [ ]:
encoders = populate_encoders(table,disk_engine)

In [ ]:
encoders

In [ ]:
input_layer = Input(shape=(50, 44),name='main_input')
mask = Masking(mask_value=pad_val)(input_layer)
prev = GRU(hidden_dim,#input_length=50,
                  return_sequences=True,go_backwards=False,stateful=False,
                  unroll=False,consume_less='gpu',
                  init='glorot_uniform', inner_init='orthogonal', activation='tanh',
           inner_activation='hard_sigmoid', W_regularizer=None, U_regularizer=None,
           b_regularizer=None, dropout_W=0.0, dropout_U=0.0)(mask)
# for i in range(num_layers-1):
#     prev = GRU(output_dim, init='glorot_uniform', inner_init='orthogonal', activation='tanh',
#            inner_activation='hard_sigmoid', W_regularizer=None, U_regularizer=None,
#            b_regularizer=None, dropout_W=0.0, dropout_U=0.0)
output_layer = TimeDistributed(Dense(3,activation='softmax'))(prev)
model = Model(input=[input_layer],output=[output_layer])
model.compile(optimizer=optimizer,
              loss='sparse_categorical_crossentropy',
#               metrics=['accuracy','hinge','squared_hinge','binary_accuracy','binary_crossentropy'])
              metrics=['accuracy'],
             sample_weight_mode="temporal")
data_gen =  data_generator(disk_engine,encoders,table=table,class_weight=class_weight)
# data_gen =  data_generator(disk_engine,encoders,table=table,class_weight=class_weight)
history = model.fit_generator(data_gen, samples_per_epoch, nb_epoch, verbose=1, callbacks=[],validation_data=None, nb_val_samples=None, class_weight=class_weight, max_q_size=10000)

In [ ]:


In [ ]:
for i in range(X_train_pad.shape[0]):
    if 1 in (list(y_train_S[i])):
        print i

In [ ]:
prediction = model.predict(X_train_pad)

In [ ]:
prediction.shape

In [ ]:
y_train_S

In [ ]:
id_ = 391
join_= np.dstack([prediction,y_train_S])
df_pred = pd.DataFrame(join_[id_])
df_trim = df_pred.drop(df_pred[df_pred[3]==2.0].index)  # drop padded timesteps (label 2)
roc_join = np.array(df_trim)

In [ ]:
roc_join

In [ ]:
from sklearn.metrics import roc_curve, auc
# Compute ROC curve and area under the curve for one sequence

fpr, tpr, thresholds = roc_curve(roc_join[:,3], roc_join[:,1])
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, lw=1, label='ROC (area = %0.2f)' % roc_auc)

plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

In [ ]:
classes = np.unique(roc_join[:,3])
classes

In [ ]:
from scipy import interp

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []
join_ = np.dstack([prediction,y_train_S])
n_curves = prediction.shape[0]
for i in range(n_curves):
    id_ = i

    df_pred = pd.DataFrame(join_[id_])
#     display(df_pred)
    df_trim = df_pred.drop(df_pred[df_pred.ix[:,3]==2.0].index)  # drop padded timesteps
#     display(df_trim)
    roc_join = np.array(df_trim)

    classes = np.unique(roc_join[:,3])
    print classes
    fpr, tpr, thresholds = roc_curve(roc_join[:,3], roc_join[:,1])
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

mean_tpr /= n_curves  # 'cv' was undefined; average over the number of curves
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 'k--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [ ]:
index = 23
d = {'target' : pd.Series(np.reshape(y_train_S[index],len(y_train_S[index]))),
    'pred' : pd.Series(prediction[index][:,0]),    # per-timestep probability of class 0 (genuine)
    'pred_2' : pd.Series(prediction[index][:,1]),} # per-timestep probability of class 1 (fraud)
df_pred = pd.DataFrame(d)
df_pred

Grid Search


In [ ]:
hid_dim = [256,512,1024,2048]
num_l = [1,2,3,4,5,6,7,8,9,10]
lr_s = [1e-1,1e-2,1e-3]
opts = lambda x,lr:[keras.optimizers.RMSprop(lr=lr, rho=0.9, epsilon=1e-08),
                keras.optimizers.Adam(lr=lr, beta_1=0.9, beta_2=0.999, epsilon=1e-08),
                   keras.optimizers.Nadam(lr=lr, beta_1=0.9, beta_2=0.999, epsilon=1e-08, schedule_decay=0.004)][x] 
hidden_dim = 400
num_layers = 1
optimizer=keras.optimizers.RMSprop(lr=0.01, rho=0.9, epsilon=1e-08)
samples_per_epoch = 485
nb_epoch = 20
table = 'data_trim'
lbl_pad_val = 2
pad_val = -1
encoders = populate_encoders(table,disk_engine)

In [ ]:
opts(2,1)

In [ ]:
for hidden_dim in hid_dim:
    for opt_id in range(3):
        for lr in lr_s:
            optimizer = opts(opt_id,lr)
            for num_layers in num_l:
                for rnn in ['gru','lstm']:
                    input_layer = Input(shape=(50, 44),name='main_input')
                    mask = Masking(mask_value=pad_val)(input_layer)  # must match the padding value (original used 0)
                    if rnn == 'gru':
                        prev = GRU(hidden_dim,
                                   return_sequences=True,go_backwards=False,stateful=False,
                                   unroll=False,consume_less='gpu',
                                   init='glorot_uniform', inner_init='orthogonal', activation='tanh',
                                   inner_activation='hard_sigmoid', W_regularizer=None, U_regularizer=None,
                                   b_regularizer=None, dropout_W=0.0, dropout_U=0.0)(mask)
                    else:
                        prev = LSTM(hidden_dim, return_sequences=True,
                                    init='glorot_uniform', inner_init='orthogonal',
                                    forget_bias_init='one', activation='tanh', inner_activation='hard_sigmoid',
                                    W_regularizer=None, U_regularizer=None, b_regularizer=None,
                                    dropout_W=0.0, dropout_U=0.0)(mask)
                    for i in range(num_layers-1):
                        if rnn == 'gru':
                            prev = GRU(hidden_dim,
                                       return_sequences=True,go_backwards=False,stateful=False,
                                       unroll=False,consume_less='gpu',
                                       init='glorot_uniform', inner_init='orthogonal', activation='tanh',
                                       inner_activation='hard_sigmoid', W_regularizer=None, U_regularizer=None,
                                       b_regularizer=None, dropout_W=0.0, dropout_U=0.0)(prev)
                        else:
                            prev = LSTM(hidden_dim, return_sequences=True,
                                        init='glorot_uniform', inner_init='orthogonal',
                                        forget_bias_init='one', activation='tanh', inner_activation='hard_sigmoid',
                                        W_regularizer=None, U_regularizer=None, b_regularizer=None,
                                        dropout_W=0.0, dropout_U=0.0)(prev)
                    output_layer = TimeDistributed(Dense(1,activation='sigmoid'))(prev)  # sigmoid so binary_crossentropy gets probabilities
                    model = Model(input=[input_layer],output=[output_layer])
                    model.compile(optimizer=optimizer,
                                  loss='binary_crossentropy',
                                  metrics=['accuracy'])
                    data_gen = data_generator(disk_engine,encoders,table=table)
                    history = model.fit_generator(data_gen, samples_per_epoch, nb_epoch, verbose=1, callbacks=[],validation_data=None, nb_val_samples=None, class_weight=None, max_q_size=10)

In [ ]:
history = model.fit_generator(data_gen, samples_per_epoch, nb_epoch, verbose=1, callbacks=[],validation_data=None, nb_val_samples=None, class_weight=None, max_q_size=10)

In [ ]:
history.__dict__

In [ ]:


In [ ]:
print train_test_split(range(10), test_size=0.33, random_state=42)

In [ ]:
print len(X_train_S)
print len(X_test_S)
print len(y_train_S)
print len(y_test_S)

In [ ]:
y_test_S

In [ ]:
'.' in users

In [ ]:
users

In [ ]:
df = pd.read_sql_query('select * '
                       'from {table} '.format(table=table), disk_engine)
df.head()

In [ ]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(df.MRCH_NM)

In [ ]:
len(le.classes_)

In [ ]:
encoders['acct_id'].transform(['.'])

In [ ]:
df_u = get_user_info("'.'",'data')

In [ ]:
df_u= df

In [ ]:
df_u[df_u['FRD_IND']=='Y'].head()

In [ ]:
pd.Timestamp('20120101')

In [ ]:
df_u[df_u['FRD_IND_SWT_DT']>pd.Timestamp('20120101')]

In [ ]:
df_u.head()

In [ ]:
df_test = df_u['AUTHZN_AMT']>10
df_test.value_counts()

In [ ]:
df_u[df_u['FRD_IND']=='Y']

In [ ]:
df_u['FRD_IND'].value_counts()

In [ ]:
df_u[df_u['acct_id']=='337018623']['CAVV_CD'].head()

In [ ]:
pd.Timestamp('2013-05-11')

In [ ]:
le = preprocessing.LabelEncoder()
le.fit(df_u[df_u['acct_id']=='337018623']['CAVV_CD'])

In [ ]:
le.classes_

In [ ]:
le.transform([None])

In [ ]:
pd.to_numeric(pd.Series(pd.to_datetime('2013-05-11')))[0]

In [ ]:
pd.to_numeric(pd.Series(pd.to_datetime('2014-05-11')))[0]

In [ ]:
x = np.arange(10.0)
np.array_split(x, 30)

In [ ]:
history.__dict__

In [ ]:
init_notebook_mode()

In [ ]:
# Create a simple chart of training accuracy per epoch
trace = Scatter(x=history.epoch,y=history.history['acc'])
data = [trace]
layout = Layout(title=title, width=800, height=640)
fig = Figure(data=data, layout=layout)
iplot(fig)

In [ ]:
title = 'Training Acc'
fig = {
    'data': [Scatter(
        x=history.epoch,
        y=history.history['acc'])],
    'layout': {'title': title}
    }
# iplot(fig,filename='figures/'+title)
# iplot(fig,filename='figures/'+title,image='png')

In [ ]:
history.epoch

In [ ]:
help(fig)

In [ ]:
gru_dict = {}
lstm_dict = {}
for hidden_dim in hid_dim:
    gru_dict[hidden_dim] = {}
    lstm_dict[hidden_dim] = {}
    for opt_id in range(3):
        for lr in lr_s:
            optimizer = opts(opt_id,lr)
            gru_dict[hidden_dim][type(optimizer).__name__] = {}
            lstm_dict[hidden_dim][type(optimizer).__name__] = {}
            for num_layers in num_l:
                for rnn in ['gru','lstm']:
                    input_layer = Input(shape=(50, 44),name='main_input')
                    mask = Masking(mask_value=pad_val)(input_layer)  # must match the padding value (original used 0)
                    if rnn == 'gru':
                        prev = GRU(hidden_dim,
                                   return_sequences=True,go_backwards=False,stateful=False,
                                   unroll=False,consume_less='gpu',
                                   init='glorot_uniform', inner_init='orthogonal', activation='tanh',
                                   inner_activation='hard_sigmoid', W_regularizer=None, U_regularizer=None,
                                   b_regularizer=None, dropout_W=0.0, dropout_U=0.0)(mask)
                    else:
                        prev = LSTM(hidden_dim, return_sequences=True,
                                    init='glorot_uniform', inner_init='orthogonal',
                                    forget_bias_init='one', activation='tanh', inner_activation='hard_sigmoid',
                                    W_regularizer=None, U_regularizer=None, b_regularizer=None,
                                    dropout_W=0.0, dropout_U=0.0)(mask)  # first layer consumes the mask, not prev
                    for i in range(num_layers-1):
                        if rnn == 'gru':
                            prev = GRU(hidden_dim,
                                       return_sequences=True,go_backwards=False,stateful=False,
                                       unroll=False,consume_less='gpu',
                                       init='glorot_uniform', inner_init='orthogonal', activation='tanh',
                                       inner_activation='hard_sigmoid', W_regularizer=None, U_regularizer=None,
                                       b_regularizer=None, dropout_W=0.0, dropout_U=0.0)(prev)
                        else:
                            prev = LSTM(hidden_dim, return_sequences=True,
                                        init='glorot_uniform', inner_init='orthogonal',
                                        forget_bias_init='one', activation='tanh', inner_activation='hard_sigmoid',
                                        W_regularizer=None, U_regularizer=None, b_regularizer=None,
                                        dropout_W=0.0, dropout_U=0.0)(prev)
                    output_layer = TimeDistributed(Dense(1,activation='sigmoid'))(prev)

In [ ]:
title = title.replace('Loss','Acc')

In [ ]:
title.upper()

In [ ]:


In [ ]:
gru_ = {}
gru_[5] = {}
gru_[5][1] = 's'

In [ ]:
help(optimizer)

In [ ]:
type(optimizer).__name__

In [ ]:
data_gen

In [ ]:
from keras.engine import generator_queue

In [ ]:
zs = np.zeros([100])
zs = zs + np.arange(100)

In [ ]:
zs[np.where(zs>65)] = 0

In [ ]:
encoders['FRD_IND'].inverse_transform([1,1])

In [ ]:
from ccfd_dnn.model import *
gen = eval_generator('train','test',disk_engine,encoders,table='data_trim',
                   sample_size=400,usr_ratio=80,class_weight=None,lbl_pad_val = 2, pad_val = -1)

gen = eval_trans_generator(disk_engine,encoders,table='data_trim',sample_size=400,usr_ratio=80,class_weight=None,lbl_pad_val = 2, pad_val = -1)
next(gen)

In [ ]:
arr = np.array([])
if arr.size ==0:
    print 'd'

In [ ]: