In [ ]:
import pandas as pd
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import Imputer

%matplotlib inline

In [ ]:
!. ~/.bashrc

In [ ]:
from sklearn.cross_validation import train_test_split
np.random.seed(1337)
import theano

In [ ]:
import keras

In [ ]:
import math

Some useful tricks


In [ ]:
def check_args(*types):
    def real_decorator(func):
        def wrapper(*args, **kwargs):
            for val, typ in zip(args, types):
                assert isinstance(val, typ), "Value {} is not of expected type {}".format(val, typ)
            return func(*args, **kwargs)
        return wrapper
    return real_decorator

def do_long_computation(name):
    """ dummy function """
    time.sleep(10)
    return "FruitMart"

@check_args(str, int, int)
def print_fruit(name, apples, oranges):
    pass

Data Analysis


In [ ]:
data_dir = './data/'
evt_name = 'Featurespace_events_output.csv'
auth_name = 'Featurespace_auths_output.csv'

In [ ]:
df = pd.read_csv(data_dir+evt_name)

In [ ]:
df

In [ ]:
df_pure = pd.read_csv(data_dir+auth_name,nrows=10000)

In [ ]:
df_pure.TSYS_DCLN_REAS_CD

Count Nuls


In [ ]:
df.isnull().sum()

In [ ]:
df.dropna()

CARD_VFCN_REJ_CD has only NaNs


In [ ]:
df.dropna(how='all')

In [ ]:
grouped = df.groupby('acct_id')

In [ ]:
grouped.groups

In [ ]:
grouped_lbl = df.groupby('FRD_IND')

Feature Exploration


In [ ]:
grouped_lbl.count()

In [ ]:
var=grouped_lbl.count().stack()
temp=var.unstack()
type(temp)
x_list = temp['acct_id']
label_list = temp.index
plt.axis("equal") #The pie chart is oval by default. To make it a circle use pyplot.axis("equal")
#To show the percentage of each pie slice, pass an output format to the autopctparameter 
plt.pie(x_list,labels=label_list,autopct="%1.1f%%") 
plt.title("Transactions")
plt.show()

In [ ]:
col_names = list(df.columns.values)

In [ ]:
for c,col in enumerate(col_names):
    var=grouped_lbl.count().stack()
    temp=var.unstack()
    type(temp)
    x_list = temp[col]
    label_list = temp.index
    plt.axis("equal") #The pie chart is oval by default. To make it a circle use pyplot.axis("equal")
    #To show the percentage of each pie slice, pass an output format to the autopctparameter
#     plt.subplot(12,4,c+1)
    plt.pie(x_list,labels=label_list,autopct="%1.1f%%") 
    plt.title(col)
    plt.show()
    if c==45:
        break

Dealing with Missing Data


In [ ]:
def value_exist(val,col,df):
    keys = set(df.groupby(col).groups.keys())
    print keys
    return val in keys
col = 'AUTHZN_APPRL_CD'
value_exist(55555,col,df)

In [ ]:
val = 55555
df_rmna = df.fillna(value={'AUTHZN_APPRL_CD':val})
df_rmna

Authentication Data


In [ ]:
df_auth = pd.read_csv(data_dir+auth_name,nrows=500000)

In [ ]:
df_auth

In [ ]:
df_auth[df_auth['MRCH_CITY_NM'].isnull() & df_auth['MRCH_NM']=='FYP']

In [ ]:
df_auth['MRCH_NM'].isnull()

In [ ]:
df_auth.isnull().sum()

Data Engineering


In [255]:
import plotly.tools as tls
import pandas as pd
from sqlalchemy import create_engine # database connection
import datetime as dt
from IPython.display import display

import plotly.plotly as py # interactive graphing
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import Bar, Scatter, Marker, Layout, Figure

In [ ]:
init_notebook_mode()

In [ ]:
data_dir = './data/'
evt_name = 'Featurespace_events_output.csv'
auth_name = 'Featurespace_auths_output.csv'
db_name = 'c1_agg.db'

In [ ]:
disk_engine = create_engine('sqlite:///'+data_dir+db_name,convert_unicode=True)
disk_engine.raw_connection().connection.text_factory = str

In [ ]:
table = 'data_trim'

In [ ]:
df = pd.read_sql_query('select distinct FRD_IND,count(distinct acct_id) as num_usr '
                       'from {table} '
                       'group by FRD_IND'.format(table=table), disk_engine)
df

In [ ]:
df['num_usr'][0]

In [ ]:
title = 'Fraud by Distinct Users'
fig = {
    'data': [{'labels': ['Fraud', 'Genuine'],
              'values': [df['num_usr'][1], df['num_usr'][0]],
              'type': 'pie'}],
    'layout': {'title': title}
     }
iplot(fig,filename='figures/'+title)

In [ ]:
usr_ratio = df['num_usr'][0]/ df['num_usr'][1]
usr_ratio

In [ ]:
usr_ratio= 80

In [ ]:
df_ds_u = pd.read_sql_query('select distinct acct_id, FRD_IND '
                       'from {table} '
                       'order by FRD_IND'.format(table=table), disk_engine)
df_ds_u

In [ ]:
df_t_u = pd.read_sql_query('select acct_id, count(*) as num_trans '
                       'from {table} '
                        'group by acct_id '
                        'order by -num_trans'.format(table=table), disk_engine)
df_t_u

Total # Training Examples


In [ ]:
table

In [ ]:
num_trans = df_t_u['num_trans'] 
total_eg = 0
for t_len in num_trans:
    total_eg += math.ceil(t_len/50.0)
print total_eg*0.66

In [ ]:
math.ceil(152/50.0)

In [ ]:
###graph bars %fraud al transactions

Encode categories


In [ ]:
pad_val = -1

In [ ]:
table= 'data_little'

In [ ]:
from sklearn import preprocessing
def encode_column(df_col):
    le = preprocessing.LabelEncoder()
    le.fit(df_col)
    return le

In [ ]:
df = pd.read_sql_query('select * from {table}'.format(table=table),disk_engine)
df.head()

In [ ]:
encoders = {}
time_cols = ['AUTHZN_RQST_PROC_TM','PREV_ADR_CHNG_DT','PREV_PMT_DT','PREV_CARD_RQST_DT','FRD_IND_SWT_DT']
for c,r in enumerate(df):
    tp = df.dtypes[c]
#     print tp
    if tp == 'object':
        if r not in time_cols:
            encoders[r] = encode_column(df[r])
encoders

In [ ]:
def populate_encoders(table,disk_engine):
    df = pd.read_sql_query('select * from {table}'.format(table=table),disk_engine)
    df.head()
    encoders = {}
    time_cols = ['AUTHZN_RQST_PROC_TM','PREV_ADR_CHNG_DT','PREV_PMT_DT','PREV_CARD_RQST_DT','FRD_IND_SWT_DT']
    for c,r in enumerate(df):
        tp = df.dtypes[c]
    #     print tp
        if tp == 'object':
            if r not in time_cols:
                encoders[r] = encode_column(df[r])
    return encoders

In [ ]:
users = set()
cnt = 0
head = 0
tail = len(df_ds_u.acct_id)-1
batch_size = tail
for i in range(batch_size):
    
    if cnt<usr_ratio:
        users.add(df_ds_u.acct_id[head])
        cnt+=1
        head+=1
    else:
        users.add(df_ds_u.acct_id[tail])
        tail-=1
        cnt=0

In [ ]:
def encode_df(df,encoders):
    for col in encoders.keys():
        try: 
            df[col] = encoders[col].transform(df[col])
        except:
            print 'EXCEPTION'
            display(df[col])
            print col 
            raise
    for col in time_cols:
        df[col] = pd.to_numeric(pd.to_datetime(df[col],errors='coerce'))

In [ ]:
def get_user_info(user,table):
    if user == '.':
        user = '"."'
    df_u = pd.read_sql_query('select * from {table} where acct_id = {user}'.format(table=table,user=user),disk_engine)
    return df_u

In [ ]:
def get_last_date(df_u,cuttoff_date):
#     print "Before Trim"
#     display(df_u)
    df_trim = df_u[df_u['FRD_IND_SWT_DT'] >= pd.to_numeric(pd.Series(pd.to_datetime(cuttoff_date)))[0]]
#     print "After Trim"
#     display(df_trim)
    ### a historicly later transaction may have been confirmed earlier than a historicly preceeding T
    df_trim = df_trim.sort_values('AUTHZN_RQST_PROC_TM',ascending=True,inplace=False)
    df_trim = df_trim.reset_index(drop=True)
#     print "After Reorder"
#     display(df_trim)
#     display(df_trim)
    if not df_trim.empty:
#         print 'value to be returned',df_trim['AUTHZN_RQST_PROC_TM'][0]
        return df_trim['AUTHZN_RQST_PROC_TM'][0]
    else:
        return None

In [ ]:
query = ['select AUTHZN_RQST_PROC_TM '
        'from {table} '
        'where FRD_IND_SWT_DT >=' 
             '"',
        cutt_off_date,
             '" '
        'order by AUTHZN_RQST_PROC_TM limit 1 '
        ]
    query = ''.join(query)
    query = query.format(table=table)
    dataFrame = pd.read_sql_query(query
                       .format(table=table), disk_engine)
    dataFrame

In [ ]:
def get_col_id(col,df):
    col_list = list(df.columns.values)
    col_list.remove('index')
    col_list.index(col)
    
def generate_sequence(user,table,encoders,cuttoff_date='2014-05-11'):
    df_u = get_user_info(user,table)
    unav_cols = ['AUTHZN_APPRL_CD','TSYS_DCLN_REAS_CD','AUTHZN_RESPNS_CD','AUTHZN_APPRD_AMT',]
    nan_rpl = ['AUTHZN_APPRL_CD',]
    for col in unav_cols:
        df_u[col] = df_u[col].shift(1)
        loc = list(df_u.columns.values).index(col)
        if(col in nan_rpl):
            df_u.iloc[0,loc] = 'nan'
        else:
            df_u.iloc[0,loc] = pad_val
#     print df_u.count()
#     display(df_u.head())
#     display(df_u.sort_values('AUTHZN_RQST_PROC_TM',ascending=True))
    encode_df(df_u,encoders)
#     print df_u.count()
#     display(df_u.head())
#     display(df_u.sort_values('AUTHZN_RQST_PROC_TM',ascending=True))
    df_u = df_u.sort_values('AUTHZN_RQST_PROC_TM',ascending=True)
#     display(df_u[df_u['FRD_IND_SWT_DT'].isnull()])
    df_u = df_u.drop('index', axis=1)
#     display(df_u[df_u['FRD_IND_SWT_DT'] < pd.to_numeric(pd.Series(pd.to_datetime(cuttoff_date)))[0]].head(8))
### This is the last date, before which transaction will be used for trainning. 
### It coresponds to the date when the last knwon fraudulent transaction was confirmed
    last_date_num = get_last_date(df_u,cuttoff_date)
    if last_date_num == None:
        train = np.array(df_u)
#         print "No cutt offs"
#         print train[:,0:-2].shape
#         print "labels"
#         print train[:,-2].shape
        return train[:,0:-2],[],train[:,-2],[]
    else:
        df_train = df_u[df_u['AUTHZN_RQST_PROC_TM'] < last_date_num]
        df_test = df_u[df_u['AUTHZN_RQST_PROC_TM'] >= last_date_num]
        print 'train/test split:',np.array(df_train).shape[0],np.array(df_test).shape[0]
#     display(df_train)
#     display(df_test)
#     print 'is this running at all?!',df_test
        
    train = np.array(df_train)
    test = np.array(df_test)

#     print train
#     print test
#     print "Shapes"
#     print train.shape
#     print test.shape
#     print "features"

#     print train[:,0:-2].shape
#     print test[:,0:-2].shape 
#     print "labels"
#     print train[:,-2].shape
#     print test[:,-2].shape 
    return train[:,0:-2],test[:,0:-2],train[:,-2],test[:,-2]

In [ ]:
user = '128237902'
table = 'data_trim'
encoders = encoders
train,test,y,y_tes = generate_sequence(user,table,encoders)

In [ ]:
loc = list(df.columns.values).index('AUTHZN_APPRL_CD')
df.iloc[0,loc] = 'nan'

In [ ]:
df.TSYS_DCLN_REAS_CD.dtype

In [ ]:


In [ ]:
train[:,1] = np.roll(train[:,1],1)

In [ ]:
def set_roll_values(array)

In [ ]:
display(col_list[35])

In [ ]:
np.array_split(X_train_S[0:2], 5)[0][0].shape

In [ ]:
map(lambda x: len(x),X_train_S[0:10])

In [ ]:
split_seq = map(lambda x: np.array_split(x,math.ceil(len(x)/50.0)) if len(x)>50 else [x],X_train_S[0:10])

In [ ]:


In [ ]:
172%50

In [ ]:
map(lambda x: len(x),split_seq)

In [ ]:
len(map(lambda x: reduce(lambda y,z: np.vstack([y,z]),x),split_seq))

In [ ]:
flattened = [sequence for user_seq in split_seq for sequence in user_seq]

In [ ]:
len(flattened)

In [ ]:
map(lambda x: len(x),flattened)

In [ ]:
chunks_lens = map(lambda x: len(x),flattened)
chunks_lens[5] = 55

In [ ]:
for cnk in chunks_lens:
    assert cnk < 50, 'Sequence chunks are exceeding the max_len of {}'.format(seq_len_param)

In [ ]:
x_try = np.array([4])

In [ ]:
###has to be float!!!!!!!!!!
seq_len_param = 60.0
def chunck_seq(seq_list,seq_len=seq_len_param):
    split_seq = map(lambda x: np.array_split(x,math.ceil(len(x)/seq_len)) if len(x)>seq_len else [x],seq_list)
    flattened = [sequence for user_seq in split_seq for sequence in user_seq]
    assert sum(map(lambda x: len(x),flattened)) == sum(map(lambda x: len(x),seq_list))
    chunks_lens = map(lambda x: len(x),flattened)
    for cnk in chunks_lens:
        assert cnk <= seq_len_param, 'Sequence chunks are exceeding the max_len of {} \n {}'.format(seq_len_param,chunks_lens)
    return flattened

In [ ]:
from keras.preprocessing.sequence import pad_sequences

In [ ]:
pad_chunk = keras.preprocessing.sequence.pad_sequences(chunck_seq(X_train_S[0:10]), maxlen=int(seq_len_param),dtype='float32')

In [ ]:
pad_chunk.shape

In [ ]:
X_train_pad = keras.preprocessing.sequence.pad_sequences(X_train_S, maxlen=None,dtype='float32')

In [ ]:
X_train_pad.shape

In [ ]:
X_train_S[0][-1]

In [ ]:
X_train_pad[0][-1]

In [ ]:
X_train_S = []
    X_test_S = []
    y_train_S =[]
    y_test_S = []
    print "Number of users:",len(users)
    for user in users:
    #     if user != '337018623': 
    #         continue
        X_train,X_test,y_train,y_test = generate_sequence(user,'data_trim',encoders)
        X_train_S.append(X_train)
        X_test_S.append(X_test) 
        y_train_S.append(y_train)
        y_test_S.append(y_test)

In [ ]:
def generate_sample_w(y_true,class_weight):
    shps = y_true.shape
    sample_w = []
    for i in range(shps[0]):
        sample_w.append([])
        for j in range(shps[1]):
            sample_w[i].append(class_weight[y_true[i,j,0]])
    return np.asarray(sample_w)
def sequence_generator(users,encoders,mode='train',table='data_trim',class_weight=None):
    X_train_S = []
    X_test_S = []
    y_train_S =[]
    y_test_S = []
    print "Number of users:",len(users)
    for user in users:
    #     if user != '337018623': 
    #         continue
        X_train,X_test,y_train,y_test = generate_sequence(user,table,encoders)
        X_train_S.append(X_train)
        X_test_S.append(X_test) 
        y_train_S.append(y_train)
        y_test_S.append(y_test)
    #     break
    X_test_S = filter(lambda a: a != [], X_test_S)
    y_test_S = filter(lambda a: a != [], y_test_S)
    if mode =='train':
        # chuncked = chunck_seq(X_train_S)
        # assert 
        X_train_pad = keras.preprocessing.sequence.pad_sequences(chunck_seq(X_train_S), maxlen=int(seq_len_param),dtype='float32',value=pad_val)
        y_train_S = keras.preprocessing.sequence.pad_sequences(np.array(chunck_seq(y_train_S)), maxlen=int(seq_len_param),dtype='float32',value=lbl_pad_val)
        y_train_S = np.expand_dims(y_train_S, -1)
        if class_weight != None:

            sample_w = generate_sample_w(y_train_S,class_weight)
            return X_train_pad,y_train_S,sample_w
#         print y_train_S
#         print y_train_S.shape
#         y_train_S = to_categorical(y_train_S,3)
        return X_train_pad,y_train_S
    else:
        print 'len test',len(X_test_S)
        X_test_S_pad = keras.preprocessing.sequence.pad_sequences(chunck_seq(X_test_S), maxlen=int(seq_len_param),dtype='float32',value=pad_val)
        y_test_S = keras.preprocessing.sequence.pad_sequences(np.array(chunck_seq(y_test_S)),maxlen=int(seq_len_param),dtype='float32',value=lbl_pad_val)
        y_test_S = np.expand_dims(y_test_S, -1)
        if class_weight != None:
            sample_w = generate_sample_w(y_train_S,class_weight)
            return X_train_pad,y_train_S,sample_w
        return X_test_S_pad,y_test_S

In [ ]:
a = [1,[],[],1,2,3,21,1]
filter(lambda x:x !=[],a)

In [ ]:
seq_len = 50
math.ceil(1.0*seq_len/seq_len_param)

In [ ]:
table = 'data_little'
dataFrame_count = pd.read_sql_query('select acct_id, count(*) as num_trans '
                       'from {table} '
                       'group by acct_id '
                        'order by -num_trans'
                       .format(table=table), disk_engine)

In [ ]:
def get_count_table(table,disk_engine,cutt_off_date,trans_mode):
    query = ['select acct_id,count(*) '
        'as num_trans from {table} '
        'where AUTHZN_RQST_PROC_TM <= '
        '(select AUTHZN_RQST_PROC_TM '
        'from {table} '
        'where FRD_IND_SWT_DT >=' 
             '"',
        cutt_off_date,
             '" '
        'order by AUTHZN_RQST_PROC_TM limit 1) '
        'group by acct_id order by num_trans']
    query = ''.join(query)
    query = query.format(table=table)
    print trans_mode
    if trans_mode == 'test':
        print 'replaced'
        query = query.replace('<=','>')
    dataFrame = pd.read_sql_query(query
                       .format(table=table), disk_engine)
    display(dataFrame)
    return dataFrame

In [ ]:
def trans_num_table(table,disk_engine,mode='train',cutt_off_date='2014-05-11',trans_mode='train'):
#     dataFrame_acc = pd.read_sql_query('select distinct acct_id, FRD_IND '
#                        'from {table} '
#                        'order by FRD_IND'.format(table=table), disk_engine)
# #     dataFrame = pd.read_sql_query('select acct_id, count(*) as num_trans '
# #                        'from {table} '
# #                        'group by acct_id '
# #                         'order by num_trans'
# #                        .format(table=table), disk_engine)

    dataFrame = get_count_table(table,disk_engine,cutt_off_date,trans_mode)
    u_list = set(dataFrame.acct_id)
    
    user_tr,user_ts = train_test_split(list(u_list), test_size=0.33, random_state=42)

    total_t =0
    if mode == 'train':
        users = user_tr
    else:
        users = user_ts
    
    total_t = total_trans_batch(users,dataFrame)
    return math.ceil(total_t)

In [ ]:
trans_num_table('data_trim',disk_engine,mode='train',cutt_off_date='2014-05-11',trans_mode='train')

In [ ]:
cutt_off_date='2014-05-11'
    query = ['select acct_id,count(*) '
        'as num_trans from {table} '
        'where AUTHZN_RQST_PROC_TM < '
        '(select AUTHZN_RQST_PROC_TM '
        'from {table} '
        'where FRD_IND_SWT_DT >='
             '"',
        cutt_off_date,
             '" '
        'order by AUTHZN_RQST_PROC_TM limit 1) '
        'group by acct_id order by num_trans']

In [ ]:
query = ''.join(query)

In [ ]:
query.replace('<','>=')

In [ ]:
query.format(table='data_trim')

In [ ]:
trans_num_table('data_little',disk_engine)

In [ ]:
table = 'data_little'
dataFrame_count = pd.read_sql_query('select acct_id, count(*) as num_trans '
                       'from {table} '
                       'group by acct_id '
                        'order by -num_trans'
                       .format(table=table), disk_engine)

In [282]:
def total_trans_batch(users,dataFrame_count):
    num_trans = 0
    users = set(users)
    for user in users:
        num_trans+=get_num_trans(user,dataFrame_count)
    return num_trans

def get_num_trans(user,dfc):
    try:
        df = dfc[dfc['acct_id']==user]
        if df.empty:
            print " user not existing in the table",user
            seq_len = 0
        else:
            seq_len = dfc[dfc['acct_id']==user].values[0][1]
    except:
        display(dfc.head(5))
        print dfc[dfc['acct_id']==user]
        raise
    return math.ceil(1.0*seq_len/seq_len_param)

def add_user(index,u_list,dataFrame_count,users):
    cnt_trans = 0
    user = u_list[index]
    if user not in users:
        users.add(user)
        return get_num_trans(user,dataFrame_count)
    else:
        return 0
def user_generator(disk_engine,table='data_trim',batch_size=50,usr_ratio=80,
                   mode='train',cutt_off_date='2014-05-11',trans_mode='train',sub_sample=None):


    dataFrame_count = get_count_table(table,disk_engine,cutt_off_date,trans_mode)
    
#     display(dataFrame_count.head(5)) 
    print "User List acquired"
    u_list = list(dataFrame_count.acct_id)
#     u_list.extend(list(dataFrame_Y.acct_id))
    print 'total # users:',len(u_list)
    u_set = set(u_list)
    print 'total # unique users:',len(u_set) 
    user_tr,user_ts = train_test_split(list(u_set), test_size=0.33, random_state=42)
    print 'total # sequences:',total_trans_batch(list(u_set),dataFrame_count)
    if mode == 'train':
        u_list =  user_tr
    else:
        u_list =  user_ts
    if trans_mode == 'test':
        print 'used # sequences: value is inaccurate, please implement'
    print 'used # sequences:',total_trans_batch(u_list,dataFrame_count)                         
#     display(dataFrame.acct_id)
    
    u_list = list(set(u_list))
    print 'return set cardinality:',len(u_list)
    cnt = 0
    head = 0
    tail = len(u_list)-1
    u_list_all = u_list
    while True:
        users = set()
        cnt_trans = 0
        if sub_sample != None:
            assert sub_sample<len(u_list_all), 'sub_sample size select is {sub_sample}, but there are only {us} users'.format(sub_sample=sub_sample,us=len(u_list_all))
            u_list = np.random.choice(u_list_all, sub_sample,replace=False)
            ### reset tail value, to avoid outof bounds exception
            tail = len(u_list)-1
        while cnt_trans<batch_size:
            
            if cnt<usr_ratio:
                cnt_trans+=add_user(head,u_list,dataFrame_count,users)
                cnt+=1
                head+=1

            else:
                cnt_trans+=add_user(tail,u_list,dataFrame_count,users)
                tail-=1
                cnt=0
#             print 'head',head
#             print 'tail',tail
#             print 'cnt_trans',cnt_trans
            if head == tail+1:
                    head = 0
                    tail = len(u_list)-1
                    cnt_trans = 0
                    cnt = 0
                    #if you have go through all users - return in order not to overfill epoch
                    #the same logic could have been achieved with break and without the yield line
                    print "##########ALL COVERED##########"
                    yield users
                    users = set()

In [276]:
pd.set_option("display.max_rows",60)

In [ ]:
user_gen = user_generator(disk_engine,table='data_trim')

In [ ]:
seq_len_param

In [ ]:
user_gen = user_generator(disk_engine,table='data_trim')
for i in range(4):
    total_trans = 0
    for i in range(int(math.floor(484/seq_len_param))-1):
        t_num =total_trans_batch(next(user_gen),dataFrame_count)
        print t_num
        total_trans += t_num
    print "###########TOTAL",total_trans

In [283]:
epoch_smpls = [] 
sample_num = 10
seq_len_param = 60.0
table = 'data_trim'
dataFrame_count = get_count_table(table,disk_engine,cutt_off_date,trans_mode)
# dataFrame_count = pd.read_sql_query('select acct_id, count(*) as num_trans '
#                        'from {table} '
#                        'group by acct_id '
#                         'order by -num_trans'
#                        .format(table=table), disk_engine)
user_gen = user_generator(disk_engine,table=table,sub_sample=50)
for i in range(5):
    total_trans = 0
    while total_trans < sample_num:
        t_num =total_trans_batch(next(user_gen),dataFrame_count)
        print t_num
        total_trans += t_num
    epoch_smpls.append(total_trans)


User List acquired
total # users: 478
total # unique users: 478
total # sequences: 653.0
return set cardinality: 320
##########ALL COVERED##########
11.0
##########ALL COVERED##########
19.0
##########ALL COVERED##########
14.0
##########ALL COVERED##########
29.0
##########ALL COVERED##########
18.0

In [284]:
epoch_smpls


Out[284]:
[11.0, 19.0, 14.0, 29.0, 18.0]

In [ ]:
np.array(dataFrame_count.sum())[1]

In [ ]:
dataFrame_Y = pd.read_sql_query('select acct_id, FRD_IND, count(*) as num_trans '
                       'from {table} '
                       'where FRD_IND="Y"'
                       'group by acct_id '
                        'order by -num_trans'
                       .format(table=table), disk_engine)
display(dataFrame_Y)

In [ ]:
dataFrame_count = pd.read_sql_query('select acct_id, count(*) as num_trans '
                       'from {table} '
                       'group by acct_id '
                        'order by -num_trans'
                       .format(table=table), disk_engine)
dataFrame_count[dataFrame_count['acct_id']=='70557011'].values[0][1]

In [350]:
def data_generator(user_mode,trans_mode,disk_engine,encoders,table,
                   batch_size=400,usr_ratio=80,class_weight=None,lbl_pad_val = 2,
                   pad_val = -1,cutt_off_date='2014-05-11',sub_sample=None,epoch_size=None):
    user_gen = user_generator(disk_engine,usr_ratio=usr_ratio,batch_size=batch_size,table=table,mode=user_mode,trans_mode=trans_mode,sub_sample=sub_sample)
    print "Users generator"
    last_date = get_last_date(cutt_off_date,table,disk_engine)
    print 'last_date calculated!'
    x_acc = []
    y_acc = []
    sample_w = []
    total_eg = 0
    while True:
        users = next(user_gen)
        outs = sequence_generator(users,encoders,disk_engine,lbl_pad_val,pad_val,last_date,mode=trans_mode,table=table,class_weight=class_weight)
        
        if not(epoch_size == None):
            while True:
                num_seq = outs[0].shape[0]
                print 'num_Seq',num_seq
               
                remain = epoch_size - (total_eg + num_seq)
                print '{remain} = {epoch_size} - ({total_eg}+{num_seq})'.format(remain=remain,epoch_size=epoch_size,total_eg=total_eg,num_seq=num_seq)   
                print 'remain',remain
                if remain >=0:
                    total_eg +=num_seq
                    yield outs
                else:
                    ### remain <0 => num_seq - remain
                    cutline = num_seq + remain
                    temp = []
                    for i in range(len(outs)):
                        temp.append(outs[i][0:cutline])
                    yield tuple(temp)
                    ####end of epoch!

                    total_eg = 0
                    temp = []
                    for i in range(len(outs)):
                        temp.append(outs[0][cutline:])
                    outs =  tuple(temp) 
                if remain >=0:
                    break
        else:    
            yield outs

In [351]:
def inside(t):
    yield t+1
    yield t+2
def outside(t):
    inner = inside(t)
    while True:
        yield next(inner)
        t*=10
gen_test = outside(5)
print next(gen_test)
print next(gen_test)


6
7

In [355]:
user_mode = 'train'
trans_mode = 'train'
data_gen =  data_generator(user_mode,trans_mode,disk_engine,encoders,table='data_little',class_weight=class_weight,batch_size=400,sub_sample=80,epoch_size=30)
for i in range(15):
    X,y,s = next(data_gen)
    print 'X',X.shape
    print 'y',y.shape
    print 's',s.shape


Users generator
0    1398712475000000000
Name: AUTHZN_RQST_PROC_TM, dtype: int64
last_date calculated!
User List acquired
total # users: 1840
total # unique users: 1840
total # sequences: 2763.0
return set cardinality: 1232
##########ALL COVERED##########
Number of users: 80
num_Seq 121
-91 = 30 - (0+121)
remain -91
X (30, 60, 44)
y (30, 60, 1)
s (30, 60)
num_Seq 91
-61 = 30 - (0+91)
remain -61
X (30, 60, 44)
y (30, 60, 44)
s (30, 60, 44)
num_Seq 61
-31 = 30 - (0+61)
remain -31
X (30, 60, 44)
y (30, 60, 44)
s (30, 60, 44)
num_Seq 31
-1 = 30 - (0+31)
remain -1
X (30, 60, 44)
y (30, 60, 44)
s (30, 60, 44)
num_Seq 1
29 = 30 - (0+1)
remain 29
X (1, 60, 44)
y (1, 60, 44)
s (1, 60, 44)
##########ALL COVERED##########
Number of users: 80
num_Seq 121
-92 = 30 - (1+121)
remain -92
X (29, 60, 44)
y (29, 60, 1)
s (29, 60)
num_Seq 92
-62 = 30 - (0+92)
remain -62
X (30, 60, 44)
y (30, 60, 44)
s (30, 60, 44)
num_Seq 62
-32 = 30 - (0+62)
remain -32
X (30, 60, 44)
y (30, 60, 44)
s (30, 60, 44)
num_Seq 32
-2 = 30 - (0+32)
remain -2
X (30, 60, 44)
y (30, 60, 44)
s (30, 60, 44)
num_Seq 2
28 = 30 - (0+2)
remain 28
X (2, 60, 44)
y (2, 60, 44)
s (2, 60, 44)
##########ALL COVERED##########
Number of users: 80
num_Seq 121
-93 = 30 - (2+121)
remain -93
X (28, 60, 44)
y (28, 60, 1)
s (28, 60)
num_Seq 93
-63 = 30 - (0+93)
remain -63
X (30, 60, 44)
y (30, 60, 44)
s (30, 60, 44)
num_Seq 63
-33 = 30 - (0+63)
remain -33
X (30, 60, 44)
y (30, 60, 44)
s (30, 60, 44)
num_Seq 33
-3 = 30 - (0+33)
remain -3
X (30, 60, 44)
y (30, 60, 44)
s (30, 60, 44)
num_Seq 3
27 = 30 - (0+3)
remain 27
X (3, 60, 44)
y (3, 60, 44)
s (3, 60, 44)

In [ ]:
# %debug
X_train_pad,y_train_S,sample_w = next(data_gen)
print X_train_pad.shape
print y_train_S.shape
print sample_w.shape

In [ ]:
# %debug
X_train_pad,y_train_S,sample_w = next(data_gen)
print X_train_pad.shape
print y_train_S.shape
print sample_w.shape

In [ ]:
print X_train_pad[-1]
print y_train_S[-1]

Deep Learning Model


In [ ]:
from keras.models import Model
from keras.layers import Input, Dense, GRU, LSTM, TimeDistributed, Masking
from keras.utils.np_utils import to_categorical

In [ ]:
hidden_dim = 200
num_layers = 1
optimizer=keras.optimizers.RMSprop(lr=0.01, rho=0.9, epsilon=1e-08)
table = 'data_little'
samples_per_epoch = math.ceil(trans_num_table(table,disk_engine)*0.67)
nb_epoch = 100
lbl_pad_val = 2
pad_val = -1
class_weight = {0 : 1.,
               1: 10.,
               2: 0.}

In [ ]:
encoders = populate_encoders(table,disk_engine)

In [ ]:
encoders

In [ ]:
input_layer = Input(shape=(seq_len_param, 44),name='main_input')
mask = Masking(mask_value=pad_val)(input_layer)
prev = GRU(hidden_dim,#input_length=50,
                  return_sequences=True,go_backwards=False,stateful=False,
                  unroll=False,consume_less='gpu',
                  init='glorot_uniform', inner_init='orthogonal', activation='tanh',
           inner_activation='hard_sigmoid', W_regularizer=None, U_regularizer=None,
           b_regularizer=None, dropout_W=0.0, dropout_U=0.0)(mask)
# for i in range(num_layers-1):
#     prev = GRU(output_dim, init='glorot_uniform', inner_init='orthogonal', activation='tanh',
#            inner_activation='hard_sigmoid', W_regularizer=None, U_regularizer=None,
#            b_regularizer=None, dropout_W=0.0, dropout_U=0.0)
output_layer = TimeDistributed(Dense(3,activation='softmax'))(prev)
model = Model(input=[input_layer],output=[output_layer])
model.compile(optimizer=optimizer,
              loss='sparse_categorical_crossentropy',
#               metrics=['accuracy','hinge','squared_hinge','binary_accuracy','binary_crossentropy'])
              metrics=['accuracy'],
             sample_weight_mode=None)
data_gen =  data_generator(disk_engine,encoders,table=table)
history = model.fit_generator(data_gen, samples_per_epoch, nb_epoch, verbose=1, callbacks=[],validation_data=None, nb_val_samples=None, max_q_size=10000)

In [ ]:
history_2 = model.fit_generator(data_gen, samples_per_epoch, nb_epoch, verbose=1, callbacks=[],validation_data=None, nb_val_samples=None, max_q_size=10000)

In [ ]:
for i in range(X_train_pad.shape[0]):
    if 1 in (list(y_train_S[i])):
        print i

In [ ]:
prediction = model.predict(X_train_pad)

In [ ]:
prediction.shape

In [ ]:
prediction.shape

In [ ]:
y_train_S

In [ ]:
join_= np.dstack([prediction,y_train_S])
df_pred = pd.DataFrame(join_[391])
df_pred

In [ ]:
index = 23
d = {'target' : pd.Series(np.reshape(y_train_S[index],len(y_train_S[index]))),
    'pred' : pd.Series(np.reshape(prediction[index][0],len(prediction[index][0]))),
    'pred_2' : pd.Series(np.reshape(prediction[index][1],len(prediction[index][1]))),}
df_pred = pd.DataFrame(d)
df_pred

In [ ]:
input_layer = Input(shape=(50, 44),name='main_input')
mask = Masking(mask_value=0)(input_layer)
prev = GRU(hidden_dim,#input_length=50,
                  return_sequences=True,go_backwards=False,stateful=False,
                  unroll=False,consume_less='gpu',
                  init='glorot_uniform', inner_init='orthogonal', activation='tanh',
           inner_activation='hard_sigmoid', W_regularizer=None, U_regularizer=None,
           b_regularizer=None, dropout_W=0.0, dropout_U=0.0)(mask)
for i in range(num_layers-1):
    prev = GRU(hidden_dim,#input_length=50,
                  return_sequences=True,go_backwards=False,stateful=False,
                  unroll=False,consume_less='gpu',
                  init='glorot_uniform', inner_init='orthogonal', activation='tanh',
           inner_activation='hard_sigmoid', W_regularizer=None, U_regularizer=None,
           b_regularizer=None, dropout_W=0.0, dropout_U=0.0)(prev)
output_layer = TimeDistributed(Dense(1))(prev)
model = Model(input=[input_layer],output=[output_layer])
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])
data_gen =  data_generator(disk_engine,encoders,table=table)
history = model.fit_generator(data_gen, samples_per_epoch, nb_epoch, verbose=1, callbacks=[],validation_data=None, nb_val_samples=None, class_weight=None, max_q_size=10)

In [ ]:
history = model.fit_generator(data_gen, samples_per_epoch, nb_epoch, verbose=1, callbacks=[],validation_data=None, nb_val_samples=None, class_weight=None, max_q_size=10)

In [ ]:
history.__dict__

In [ ]:


In [ ]:
print train_test_split(range(10), test_size=0.33, random_state=42)

In [ ]:
print len(X_train_S)
print len(X_test_S)
print len(y_train_S)
print len(y_test_S)

In [ ]:
y_test_S

In [ ]:
'.' in users

In [ ]:
users

In [ ]:
df = pd.read_sql_query('select * '
                       'from {table} '.format(table=table), disk_engine)
df.head()

In [ ]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(df.MRCH_NM)

In [ ]:
len(le.classes_)

In [ ]:
encoders['acct_id'].transform(['.'])

In [ ]:
df_u = get_user_info("'.'",'data')

In [ ]:
df_u= df

In [ ]:
df_u[df_u['FRD_IND']=='Y'].head()

In [ ]:
pd.Timestamp('20120101')

In [ ]:
df_u[df_u['FRD_IND_SWT_DT']>pd.Timestamp('20120101')]

In [ ]:
df_u.head()

In [ ]:
df_test = df_u['AUTHZN_AMT']>10
df_test.value_counts()

In [ ]:
df_u[df_u['FRD_IND']=='Y']

In [ ]:
df_u['FRD_IND'].value_counts()

In [ ]:
df_u[df_u['acct_id']=='337018623']['CAVV_CD'].head()

In [ ]:
pd.Timestamp('2013-05-11')

In [ ]:
le = preprocessing.LabelEncoder()
    le.fit(df_u[df_u['acct_id']=='337018623']['CAVV_CD'])

In [ ]:
le.classes_

In [ ]:
le.transform([None])

In [ ]:
pd.to_numeric(pd.Series(pd.to_datetime('2013-05-11')))[0]

In [ ]:
pd.to_numeric(pd.Series(pd.to_datetime('2014-05-11')))[0]

In [ ]:
x = np.arange(10.0)
np.array_split(x, 30)

In [ ]:
import io
title = 'dsdssd'
with io.open('./data/gs_results.csv', 'a', encoding='utf-8') as output:
                            title_csv = title.replace('_',',')+','+str(history.history['acc'][-1])+','+str(history.history['loss'][-1])
                            print title_csv
                            output.write(unicode(title_csv))

In [ ]:
title = 'Trainin_Loss'
title.replace('Loss','acc')

In [ ]:
print s

Evaluation


In [263]:
table = 'data_little'
encoders = populate_encoders(table,disk_engine)


(100000,)
(100000,)
(100000,)
(100000,)
(100000,)
(100000,)
(100000,)
(100000,)
(100000,)
(100000,)
(100000,)
(100000,)
(100000,)
(100000,)
(100000,)
(100000,)
(100000,)
(100000,)
(100000,)
(100000,)
(100000,)

Test Data generators


In [ ]:
def eval_generator(user_mode,trans_mode,disk_engine,encoders,table='data_trim',
                   batch_size=400,usr_ratio=80,class_weight=None,lbl_pad_val = 2, pad_val = -1):
    user_gen = user_generator(disk_engine,usr_ratio=usr_ratio,batch_size=batch_size,table=table,mode=user_mode)
    print "Users generator"
    while True:
        users = next(user_gen)
        yield sequence_generator(users,encoders,disk_engine,lbl_pad_val,pad_val,mode=trans_mode,table=table,class_weight=class_weight)

In [ ]:
def eval_trans_generator(disk_engine,encoders,table='data_trim',batch_size=400,usr_ratio=80,class_weight=None,lbl_pad_val = 2, pad_val = -1):
    user_gen = user_generator(disk_engine,usr_ratio=usr_ratio,batch_size=batch_size,table=table)
    print "Users generator"
    while True:
        users = next(user_gen)
        yield sequence_generator(users,encoders,disk_engine,lbl_pad_val,pad_val,mode='test',table=table,class_weight=class_weight)

In [ ]:
def eval_users_generator(disk_engine,encoders,table='data_trim',batch_size=400,usr_ratio=80,class_weight=None,lbl_pad_val = 2, pad_val = -1):
    user_gen = user_generator(disk_engine,usr_ratio=usr_ratio,batch_size=batch_size,table=table,mode='test')
    print "Users generator"
    while True:
        users = next(user_gen)
        yield sequence_generator(users,encoders,disk_engine,lbl_pad_val,pad_val,mode='train',table=table,class_weight=class_weight)

In [ ]:
def eval_usertrans_generator(disk_engine,encoders,table='data_trim',batch_size=400,usr_ratio=80,class_weight=None,lbl_pad_val = 2, pad_val = -1):
    user_gen = user_generator(disk_engine,usr_ratio=usr_ratio,batch_size=batch_size,table=table,mode='test')
    print "Users generator"

In [ ]:
test_gen = eval_trans_generator(disk_engine,encoders,table=table,batch_size=400,usr_ratio=80,class_weight=None)
X_test_pad,y_test_S = next(test_gen)
print X_test_pad.shape
print y_test_S.shape

In [ ]:
test_gen = eval_users_generator(disk_engine,encoders,table=table,batch_size=400,usr_ratio=80,class_weight=None)
X_test_pad,y_test_S = next(test_gen)
print X_test_pad.shape
print y_test_S.shape

In [ ]:
user_mode = 'test'
trans_mode = 'train'
table = 'data_more'
test_gen = eval_generator(user_mode,trans_mode,disk_engine,encoders,table=table,batch_size=400,usr_ratio=80,class_weight=None)
X_test_pad,y_test_S = next(test_gen)
print X_test_pad.shape
print y_test_S.shape
Users generator User List acquired List lens: df- 1912 count- 1888 total # users: 1912 total # unique users: 1888 total # transactions: 1959.0 return set cardinality: 1264 return list length: 248 Number of users: 248 (437, 60, 44) (437, 60, 1)

In [ ]:
rnn = 'lstm'
hidden_dim = 300
num_layers = 3
lr= 1e-3
nb_epoch = 13

optimizer = keras.optimizers.RMSprop(lr=lr, rho=0.9, epsilon=1e-08)


title = 'Training_Loss'+'_'+rnn.upper()+'_'+str(hidden_dim)+'_'+str(num_layers)+'_'+str(type(optimizer).__name__)+'_'+str(lr)
print title
input_layer = Input(shape=(int(seq_len_param), 44),name='main_input')
mask = Masking(mask_value=0)(input_layer)
if rnn == 'gru':
    prev = GRU(hidden_dim,#input_length=50,
                        return_sequences=True,go_backwards=False,stateful=False,
                        unroll=False,consume_less='gpu',
                        init='glorot_uniform', inner_init='orthogonal', activation='tanh',
                inner_activation='hard_sigmoid', W_regularizer=None, U_regularizer=None,
                b_regularizer=None, dropout_W=0.0, dropout_U=0.0)(mask)
else:
    prev = LSTM(hidden_dim, return_sequences=True,go_backwards=False,stateful=False,
        init='glorot_uniform', inner_init='orthogonal', 
        forget_bias_init='one', activation='tanh', inner_activation='hard_sigmoid',
        W_regularizer=None, U_regularizer=None, b_regularizer=None, dropout_W=0.0, dropout_U=0.0)(mask)
for i in range(num_layers-1):
    if rnn == 'gru':
        prev = GRU(hidden_dim,#input_length=50,
                            return_sequences=True,go_backwards=False,stateful=False,
                            unroll=False,consume_less='gpu',
                            init='glorot_uniform', inner_init='orthogonal', activation='tanh',
                    inner_activation='hard_sigmoid', W_regularizer=None, U_regularizer=None,
                    b_regularizer=None, dropout_W=0.0, dropout_U=0.0)(prev)
    else:
        prev = LSTM(hidden_dim, return_sequences=True,go_backwards=False,stateful=False,
            init='glorot_uniform', inner_init='orthogonal', 
            forget_bias_init='one', activation='tanh', inner_activation='hard_sigmoid',
            W_regularizer=None, U_regularizer=None, b_regularizer=None, dropout_W=0.0, dropout_U=0.0)(prev)
output_layer = TimeDistributed(Dense(3,activation='softmax'))(prev)
model = Model(input=[input_layer],output=[output_layer])
model.compile(optimizer=optimizer,
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])
data_gen =  data_generator(disk_engine,encoders,table=table)
history = model.fit_generator(data_gen, samples_per_epoch, nb_epoch, verbose=1, callbacks=[],validation_data=None, nb_val_samples=None, class_weight=None, max_q_size=10000)

In [ ]:
history = model.fit_generator(data_gen, samples_per_epoch, 3, verbose=1, callbacks=[],validation_data=None, nb_val_samples=None, class_weight=None, max_q_size=10000)

In [ ]:
%load_ext autoreload
%autoreload 2
from ccfd_dnn.proto import compile_seq2seq_RNN
from ccfd_dnn.model import *

In [ ]:
nb_epoch = 1
table = 'data_little'
samples_per_epoch = 1959
model = compile_seq2seq_RNN(rnn = 'gru', hidden_dim = 300, num_layers = 3, lbl_pad_val = 2, pad_val = -1, optimizer = keras.optimizers.RMSprop(lr=1e-3, rho=0.9, epsilon=1e-08))

data_gen =  data_generator(disk_engine,encoders,table=table)
history = model.fit_generator(data_gen, samples_per_epoch, nb_epoch, verbose=1, callbacks=[],validation_data=None, nb_val_samples=None, class_weight=None, max_q_size=10000)

In [ ]:
pad_val

In [ ]:
val_samples = 400
eval_gen = eval_users_generator(disk_engine,encoders,table=table,
                     batch_size=400,usr_ratio=80,class_weight=None)
model.evaluate_generator(eval_gen, val_samples, max_q_size=10000)

ROC evaluation auc


In [ ]:
val_samples = 180
# model.predict_generator(eval_gen, val_samples, max_q_size=10000)
samples = 0
eval_gen = eval_users_generator(disk_engine,encoders,table=table,
                     batch_size=400,usr_ratio=80,class_weight=None)
for batch in eval_gen:

In [ ]:
val_samples = 1
outs = model.predict_generator(eval_gen, val_samples, max_q_size=10000)

In [ ]:
eval_gen = eval_users_generator(disk_engine,encoders,table=table,
                     batch_size=400,usr_ratio=80,class_weight=None)
x,y = next(eval_gen)
y_hat = model.predict(x)

In [ ]:
data_gen = data_generator(disk_engine,encoders,table=table,
                     batch_size=400,usr_ratio=80,class_weight=None)
x,y = next(eval_gen)
y_hat = model.predict(x)

In [ ]:
print y.shape
print y_hat.shape
print y_r.shape
print y_hat_r.shape

In [ ]:
from sklearn.metrics import roc_curve, auc
y_r = y.ravel()
y_hat_r = y_hat[:,:,1].ravel()
pad_ids = np.where(y_r!=2)
fpr,tpr,_ = roc_curve(y_r[pad_ids], y_hat_r[pad_ids])
trace = Scatter(x=fpr,y=tpr)
data = [trace]
title = 'ROC'
layout = Layout(title=title, width=800, height=640)
fig = Figure(data=data, layout=layout)
iplot(fig)
auc_val = auc(fpr, tpr)
auc_val

In [ ]:
x.shape

In [ ]:
eval_auc(model,mode,num_sample):
    if mode =='train':
        gen = data_gen = data_generator(disk_engine,encoders,table=table,
                     batch_size=400,usr_ratio=80,class_weight=None)

In [ ]:
data_gen = data_generator(disk_engine,encoders,table=table,
                     batch_size=400,usr_ratio=80,class_weight=None)
model.eval_auc_generator(data_gen, 484, max_q_size=10000)

In [ ]:
from keras.engine import Model

In [ ]:
from keras.engine.training import *

In [ ]:
from ccfd_dnn.model import eval_auc_generator

In [ ]:
eval_gen = eval_users_generator(disk_engine,encoders,table=table,
                     batch_size=400,usr_ratio=80,class_weight=None)
aucs = eval_auc_generator(model, eval_gen, 978, max_q_size=10000,plt_filename=None)

In [ ]:
aucs

In [ ]:
eval_gen = eval_users_generator(disk_engine,encoders,table=table,
                     batch_size=400,usr_ratio=80,class_weight=None)
all_outs = eval_auc_generator(model, eval_gen, val_samples, max_q_size=10000,plt_filename=None)
print all_outs

In [ ]:
encoders['FRD_IND'].classes_

In [ ]:
from keras.engine.training import *
from sklearn.metrics import *
def eval_auc_generator(model, generator, val_samples, max_q_size=10000,plt_filename=None,acc=True):
    '''Generates predictions for the input samples from a data generator.
    The generator should return the same kind of data as accepted by
    `predict_on_batch`.

    # Arguments
        generator: generator yielding batches of input samples.
        val_samples: total number of samples to generate from `generator`
            before returning.
        max_q_size: maximum size for the generator queue

    # Returns
        Numpy array(s) of predictions.
    '''


    processed_samples = 0
    wait_time = 0.01
    all_outs = []
    all_y_r = []
    all_y_hat = []

    data_gen_queue, _stop = generator_queue(generator, max_q_size=max_q_size)

    while processed_samples < val_samples:
        generator_output = None
        while not _stop.is_set():
            if not data_gen_queue.empty():
                generator_output = data_gen_queue.get()
                break
            else:
                time.sleep(wait_time)

        if isinstance(generator_output, tuple):
            if len(generator_output) == 2:
                x, y = generator_output
                sample_weight = None
            elif len(generator_output) == 3:
                x, y, sample_weight = generator_output
            else:
                _stop.set()
                raise Exception('output of generator should be a tuple '
                                '(x, y, sample_weight) '
                                'or (x, y). Found: ' + str(generator_output))
        else:
            _stop.set()
            raise Exception('output of generator should be a tuple '
                                '(x, y, sample_weight) '
                                'or (x, y). Found: ' + str(generator_output))

        try:
            y_hat = model.predict_on_batch(x)
            y_r = y.ravel()
            y_hat_r = y_hat[:,:,1].ravel()
            pad_ids = np.where(y_r!=2)
            all_y_r.extend(y_r[pad_ids])
            all_y_hat.extend(y_hat_r[pad_ids])
        except:
            _stop.set()
            raise
        nb_samples = x.shape[0]   

        processed_samples += nb_samples

    _stop.set()


    all_y_r = np.array(all_y_r,dtype=np.dtype(float))
    all_y_hat = np.array(all_y_hat,dtype=np.dtype(float))
    print all_y_r.shape
    print all_y_hat.shape
    print '#####################FRAUD TRNAS##################'
    print '# fraud transaction',all_y_hat[np.where(all_y_hat==1)].shape
    #######ROC CURVE
    fpr,tpr,tresholds = roc_curve(all_y_r,all_y_hat)
    print all_y_hat
    print tresholds
    print tresholds.shape
    auc_val = auc(fpr, tpr)
    print auc_val
    ############CLASSIFICATION REPORT########################
    target_names = ['Genuine', 'Fraud']
    #########Need to determine treshold 
    all_y_hat[np.where(all_y_hat>=tresholds[1])] = 1
    all_y_hat[np.where(all_y_hat<tresholds[1])]  = 0
    clc_report = classification_report(all_y_r, all_y_hat, target_names=target_names)
    ############Accuracy
    acc = accuracy_score(all_y_r,all_y_hat)
    if plt_filename != None:
        trace = Scatter(x=fpr,y=tpr)
        data = [trace]
        title = 'ROC'
        layout = Layout(title=title, width=800, height=640)
        fig = Figure(data=data, layout=layout)
        py.image.save_as(fig,filename=plt_filename)
    return [auc_val,clc_report,acc]

In [ ]:
plt_filename= None                    
    eval_gen = eval_users_generator(disk_engine,encoders,table=table,
                                batch_size=400,usr_ratio=80,class_weight=None,lbl_pad_val = lbl_pad_val, pad_val = pad_val)

    
    eval_list  = eval_auc_generator(model, eval_gen, val_samples, max_q_size=10000,plt_filename=plt_filename)

In [ ]:
filter(lambda a: a!=[],[1,2,[],2,2,[]])

In [264]:
from ccfd_dnn.model import *
data_gen = data_generator(user_mode,trans_mode,disk_engine,encoders,table=table,
                                         batch_size=400,usr_ratio=80,class_weight=None,lbl_pad_val = 2, pad_val = -1)
next(data_gen)


Users generator
0    1398712475000000000
Name: AUTHZN_RQST_PROC_TM, dtype: int64
last_date calculated!
User List acquired
total # users: 1840
total # unique users: 1840
total # sequences: 2763.0
used # sequences: 884.0
return set cardinality: 608
Number of users: 277
xs shape (400, 60, 44)
labels shape (400, 60, 1)
Out[264]:
(array([[[ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         ..., 
         [  1.59100000e+03,   1.39764640e+18,   5.90640000e+04, ...,
            1.39656957e+18,   1.85080002e+02,   1.39294077e+18],
         [  1.59100000e+03,   1.39837291e+18,   5.45270000e+04, ...,
            1.39656957e+18,   1.85080002e+02,   1.39294077e+18],
         [  1.59100000e+03,   1.39837291e+18,   5.26200000e+03, ...,
            1.39656957e+18,   1.85080002e+02,   1.39294077e+18]],
 
        [[ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         ..., 
         [  1.61500000e+03,   1.39756600e+18,   5.36580000e+04, ...,
           -9.22337204e+18,   0.00000000e+00,   1.39518721e+18],
         [  1.61500000e+03,   1.39836658e+18,   5.90640000e+04, ...,
           -9.22337204e+18,   0.00000000e+00,   1.39518721e+18],
         [  1.61500000e+03,   1.39842390e+18,   4.98300000e+04, ...,
           -9.22337204e+18,   0.00000000e+00,   1.39518721e+18]],
 
        [[ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         ..., 
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [  1.07900000e+03,   1.36794255e+18,   5.90640000e+04, ...,
            1.36676154e+18,   6.30200005e+01,   1.33816324e+18],
         [  1.07900000e+03,   1.36890311e+18,   5.46660000e+04, ...,
            1.36676154e+18,   6.30200005e+01,   1.33816324e+18]],
 
        ..., 
        [[ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         ..., 
         [  1.37000000e+03,   1.38829341e+18,   1.64620000e+04, ...,
            1.38628804e+18,   2.00000000e+01,   1.37090876e+18],
         [  1.37000000e+03,   1.39317909e+18,   5.90640000e+04, ...,
            1.39173117e+18,   1.00000000e+01,   1.37090876e+18],
         [  1.37000000e+03,   1.39522404e+18,   5.84490000e+04, ...,
            1.39432327e+18,   1.00000000e+01,   1.37090876e+18]],
 
        [[ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         ..., 
         [  1.60500000e+03,   1.39559980e+18,   7.54200000e+03, ...,
           -9.22337204e+18,   0.00000000e+00,   1.39466879e+18],
         [  1.60500000e+03,   1.39661506e+18,   1.90950000e+04, ...,
           -9.22337204e+18,   0.00000000e+00,   1.39466879e+18],
         [  1.60500000e+03,   1.39721869e+18,   5.19360000e+04, ...,
            1.39708799e+18,   9.40000000e+01,   1.39466879e+18]],
 
        [[ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         ..., 
         [  1.10400000e+03,   1.38748733e+18,   5.90640000e+04, ...,
            1.38386884e+18,   1.80000000e+01,   1.34084165e+18],
         [  1.10400000e+03,   1.38866216e+18,   1.95100000e+03, ...,
            1.38741119e+18,   2.25000000e+01,   1.34084165e+18],
         [  1.10400000e+03,   1.39162328e+18,   4.06700000e+03, ...,
            1.38905276e+18,   2.36000004e+01,   1.34084165e+18]]], dtype=float32),
 array([[[ 2.],
         [ 2.],
         [ 2.],
         ..., 
         [ 0.],
         [ 0.],
         [ 0.]],
 
        [[ 2.],
         [ 2.],
         [ 2.],
         ..., 
         [ 0.],
         [ 0.],
         [ 0.]],
 
        [[ 2.],
         [ 2.],
         [ 2.],
         ..., 
         [ 2.],
         [ 0.],
         [ 0.]],
 
        ..., 
        [[ 2.],
         [ 2.],
         [ 2.],
         ..., 
         [ 0.],
         [ 0.],
         [ 0.]],
 
        [[ 2.],
         [ 2.],
         [ 2.],
         ..., 
         [ 0.],
         [ 0.],
         [ 0.]],
 
        [[ 2.],
         [ 2.],
         [ 2.],
         ..., 
         [ 0.],
         [ 0.],
         [ 0.]]], dtype=float32))

In [259]:
df


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-259-7ed0097d7e9e> in <module>()
----> 1 df

NameError: name 'df' is not defined

In [266]:
','.join([str(1),str(2),str(3)])


Out[266]:
'1,2,3'

In [267]:
model.__dict__


Out[267]:
{'_function_kwargs': {},
 '_output_mask_cache': {'139972926101776_139972926060048': Any{2}.0,
  '139972926101776_9545840': Any{2}.0},
 '_output_shape_cache': {'(None, 60, 44)': (None, 60, 3)},
 '_output_tensor_cache': {'139972926101776_9545840': Reshape{3}.0},
 'built': True,
 'container_nodes': {'gru_7_ib-0',
  'gru_8_ib-0',
  'gru_9_ib-0',
  'main_input_ib-0',
  'masking_3_ib-0',
  'timedistributed_3_ib-0'},
 'history': <keras.callbacks.History at 0x7f4dc8c1cc50>,
 'inbound_nodes': [<keras.engine.topology.Node at 0x7f4def443550>],
 'input_layers': [<keras.engine.topology.InputLayer at 0x7f4dfc89d690>],
 'input_layers_node_indices': [0],
 'input_layers_tensor_indices': [0],
 'input_names': ['main_input'],
 'inputs': [main_input],
 'internal_input_shapes': [(None, 60, 44)],
 'internal_output_shapes': [(None, 60, 3)],
 'layers': [<keras.engine.topology.InputLayer at 0x7f4dfc89d690>,
  <keras.layers.core.Masking at 0x7f4dfc89d990>,
  <keras.layers.recurrent.GRU at 0x7f4dfc893690>,
  <keras.layers.recurrent.GRU at 0x7f4dedb3e790>,
  <keras.layers.recurrent.GRU at 0x7f4dee8a49d0>,
  <keras.layers.wrappers.TimeDistributed at 0x7f4def428050>],
 'layers_by_depth': {0: [<keras.layers.wrappers.TimeDistributed at 0x7f4def428050>],
  1: [<keras.layers.recurrent.GRU at 0x7f4dee8a49d0>],
  2: [<keras.layers.recurrent.GRU at 0x7f4dedb3e790>],
  3: [<keras.layers.recurrent.GRU at 0x7f4dfc893690>],
  4: [<keras.layers.core.Masking at 0x7f4dfc89d990>],
  5: [<keras.engine.topology.InputLayer at 0x7f4dfc89d690>]},
 'loss': 'sparse_categorical_crossentropy',
 'loss_functions': [<function keras.objectives.sparse_categorical_crossentropy>],
 'loss_weights': None,
 'metrics': [mean],
 'metrics_names': ['loss', 'acc'],
 'name': 'model_3',
 'nodes_by_depth': {0: [<keras.engine.topology.Node at 0x7f4def39fa50>],
  1: [<keras.engine.topology.Node at 0x7f4dee6a9950>],
  2: [<keras.engine.topology.Node at 0x7f4dee2c2790>],
  3: [<keras.engine.topology.Node at 0x7f4dedabf690>],
  4: [<keras.engine.topology.Node at 0x7f4dfc8934d0>],
  5: [<keras.engine.topology.Node at 0x7f4dfc89d950>]},
 'optimizer': <keras.optimizers.RMSprop at 0x7f4dfc89d610>,
 'outbound_nodes': [],
 'output_layers': [<keras.layers.wrappers.TimeDistributed at 0x7f4def428050>],
 'output_layers_node_indices': [0],
 'output_layers_tensor_indices': [0],
 'output_names': ['timedistributed_3'],
 'outputs': [Reshape{3}.0],
 'predict_function': <keras.backend.theano_backend.Function at 0x7f4dc982a250>,
 'sample_weight_mode': None,
 'sample_weight_modes': [None],
 'sample_weights': [timedistributed_3_sample_weights],
 'stop_training': False,
 'supports_masking': False,
 'targets': [timedistributed_3_target],
 'test_function': <keras.backend.theano_backend.Function at 0x7f4e055cda10>,
 'total_loss': Elemwise{mul,no_inplace}.0,
 'train_function': <keras.backend.theano_backend.Function at 0x7f4df10dda90>,
 'validation_data': None}

In [269]:
model.history.__dict__


Out[269]:
{'epoch': [0, 1],
 'history': {'acc': [0.99639272671563706, 0.99828143543342718],
  'loss': [0.030823190755963386, 0.021165021918162573]},
 'model': <keras.engine.training.Model at 0x7f4def443610>,
 'params': {'do_validation': False,
  'metrics': ['loss', 'acc', 'val_loss', 'val_acc'],
  'nb_epoch': 3,
  'nb_sample': 1959,
  'verbose': 1}}

In [ ]:
users =  
np.random.choice(aa_milne_arr, 5, replace=False)

In [271]:
next(data_gen)


Number of users: 268
Out[271]:
(array([[[ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         ..., 
         [  2.42000000e+02,   1.39394930e+18,   5.90640000e+04, ...,
            1.38983039e+18,   9.70000000e+01,   1.36995837e+18],
         [  2.42000000e+02,   1.39463869e+18,   1.93390000e+04, ...,
            1.38983039e+18,   9.70000000e+01,   1.36995837e+18],
         [  2.42000000e+02,   1.39463979e+18,   5.44880000e+04, ...,
            1.38983039e+18,   9.70000000e+01,   1.36995837e+18]],
 
        [[ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         ..., 
         [  6.23000000e+02,   1.39024174e+18,   5.73580000e+04, ...,
            1.38913921e+18,   5.00000000e+01,   1.38326397e+18],
         [  6.23000000e+02,   1.39090228e+18,   1.81970000e+04, ...,
            1.38913921e+18,   5.00000000e+01,   1.38326397e+18],
         [  6.23000000e+02,   1.39618295e+18,   5.11260000e+04, ...,
            1.39449603e+18,   3.03999996e+01,   1.38326397e+18]],
 
        [[ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         ..., 
         [  7.65000000e+02,   1.39825306e+18,   2.69500000e+04, ...,
            1.39639681e+18,   3.47970001e+02,   1.34429755e+18],
         [  7.65000000e+02,   1.39853811e+18,   5.69780000e+04, ...,
            1.39639681e+18,   3.47970001e+02,   1.34429755e+18],
         [  7.65000000e+02,   1.39860752e+18,   2.01500000e+03, ...,
            1.39639681e+18,   3.47970001e+02,   1.34429755e+18]],
 
        ..., 
        [[ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         ..., 
         [  5.82000000e+02,   1.38934798e+18,   5.76130000e+04, ...,
            1.38913921e+18,   1.49121997e+03,   1.36762561e+18],
         [  5.82000000e+02,   1.38936653e+18,   5.66200000e+03, ...,
            1.38913921e+18,   1.49121997e+03,   1.36762561e+18],
         [  5.82000000e+02,   1.38943773e+18,   5.01400000e+03, ...,
            1.38913921e+18,   1.49121997e+03,   1.36762561e+18]],
 
        [[ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         ..., 
         [  5.82000000e+02,   1.39808140e+18,   2.19760000e+04, ...,
            1.39700154e+18,   7.21359985e+02,   1.36762561e+18],
         [  5.82000000e+02,   1.39809514e+18,   9.47700000e+03, ...,
            1.39700154e+18,   7.21359985e+02,   1.36762561e+18],
         [  5.82000000e+02,   1.39854251e+18,   4.82200000e+03, ...,
            1.39700154e+18,   7.21359985e+02,   1.36762561e+18]],
 
        [[ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         [ -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00, ...,
           -1.00000000e+00,  -1.00000000e+00,  -1.00000000e+00],
         ..., 
         [  1.82000000e+02,   1.38591145e+18,   4.64510000e+04, ...,
            1.38551041e+18,   8.48099976e+01,   1.32287041e+18],
         [  1.82000000e+02,   1.38969075e+18,   5.06420000e+04, ...,
            1.38870724e+18,   3.56579987e+02,   1.32287041e+18],
         [  1.82000000e+02,   1.39075453e+18,   2.64090000e+04, ...,
            1.38870724e+18,   3.56579987e+02,   1.32287041e+18]]], dtype=float32),
 array([[[ 2.],
         [ 2.],
         [ 2.],
         ..., 
         [ 0.],
         [ 0.],
         [ 0.]],
 
        [[ 2.],
         [ 2.],
         [ 2.],
         ..., 
         [ 0.],
         [ 0.],
         [ 0.]],
 
        [[ 2.],
         [ 2.],
         [ 2.],
         ..., 
         [ 0.],
         [ 0.],
         [ 0.]],
 
        ..., 
        [[ 2.],
         [ 2.],
         [ 2.],
         ..., 
         [ 0.],
         [ 0.],
         [ 0.]],
 
        [[ 2.],
         [ 2.],
         [ 2.],
         ..., 
         [ 0.],
         [ 0.],
         [ 0.]],
 
        [[ 2.],
         [ 2.],
         [ 2.],
         ..., 
         [ 0.],
         [ 0.],
         [ 0.]]], dtype=float32))

In [ ]:
model.train_on_batch(self, x, y, sample_weight=None, class_weight=None)

In [356]:
py.sign_in('bottydim', 'o1kuyms9zv')

In [357]:
help(keras.callbacks.RemoteMonitor())


Help on RemoteMonitor in module keras.callbacks object:

class RemoteMonitor(Callback)
 |  Callback used to stream events to a server.
 |  
 |  Requires the `requests` library.
 |  
 |  # Arguments
 |      root: root url to which the events will be sent (at the end
 |          of every epoch). Events are sent to
 |          `root + '/publish/epoch/end/'`. Calls are HTTP POST,
 |          with a `data` argument which is a JSON-encoded dictionary
 |          of event data.
 |  
 |  Method resolution order:
 |      RemoteMonitor
 |      Callback
 |      __builtin__.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, root='http://localhost:9000')
 |  
 |  on_epoch_end(self, epoch, logs={})
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from Callback:
 |  
 |  on_batch_begin(self, batch, logs={})
 |  
 |  on_batch_end(self, batch, logs={})
 |  
 |  on_epoch_begin(self, epoch, logs={})
 |  
 |  on_train_begin(self, logs={})
 |  
 |  on_train_end(self, logs={})
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from Callback:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)


In [359]:
remote_log = keras.callbacks.RemoteMonitor(root='root_url', path='/publish/epoch/end/')


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-359-c4504c4675a1> in <module>()
----> 1 remote_log = keras.callbacks.RemoteMonitor(root='root_url', path='/publish/epoch/end/')

TypeError: __init__() got an unexpected keyword argument 'path'

In [ ]: