In [1]:
    
import h2o
# Initialise the h2o session used by the rest of the notebook.
# NOTE(review): max_mem_size = 2 is presumably a 2 GB memory cap -- confirm
# against the h2o.init documentation for the installed version.
h2o.init(max_mem_size = 2)
    
    
    
In [2]:
    
# NOTE(review): presumably drops every object already held by the connected
# h2o session so the notebook starts from a clean state -- confirm this is
# safe if the cluster is shared with other work.
h2o.remove_all()
    
In [3]:
    
# Render matplotlib figures inline in the notebook output.
# `run_line_magic` is the supported entry point; the older
# `get_ipython().magic(...)` helper is deprecated in IPython.
get_ipython().run_line_magic('matplotlib', 'inline')
    
In [4]:
    
import numpy as np
import pandas as pd
import os
from h2o.estimators.deeplearning import H2OAutoEncoderEstimator, H2ODeepLearningEstimator
    
In [5]:
    
# Resolve the relative CSV location to a real path, then load it with h2o.
creditcard_csv_path = os.path.realpath("../data/fraud/creditcard.csv")
credit_card_data = h2o.import_file(path=creditcard_csv_path)
    
    
In [6]:
    
# Preview the first rows of the imported frame (shown as the cell output).
credit_card_data.head()
    
    
    Out[6]:
In [7]:
    
# Convert the h2o frame to a pandas DataFrame (use_pandas = True); the
# exploration and plotting cells below all work on this pandas copy.
credit_card_data_df = credit_card_data.as_data_frame(use_pandas = True)
    
In [8]:
    
# Summary statistics of the 'Class' column.
credit_card_data_df['Class'].describe()
    
    Out[8]:
In [9]:
    
import matplotlib.pyplot as plt

# Print the number of rows per class straight from the grouping.
# The DataFrame is deliberately NOT given a constant helper column for this:
# such a column would stay in the frame and be carried into the H2OFrame
# conversion and the model inputs further down the notebook.
print(credit_card_data_df.groupby(['Class']).size())

# Histogram of the class column, drawn with the 'ggplot' matplotlib style.
plt.style.use('ggplot')
credit_card_data_df['Class'].hist()
    
    
    Out[9]:
    
In [12]:
    
# Feature 'Time' is in seconds: 1 hour = 60 * 60 = 3600 s,
# 1 day = 24 * 3600 = 86,400 s.
transaction_time = credit_card_data_df['Time']
print(transaction_time.describe())
# Coarse histogram over all rows first, then a finer one split by 'Class'.
transaction_time.hist(bins=20)
transaction_time.hist(bins=100, by=credit_card_data_df['Class'])
# Alternative: credit_card_data_df['Time'].plot(kind='hist', alpha=0.5)
    
    
    Out[12]:
    
    
In [39]:
    
# Number of rows for each (Time, Class) pair, returned as a flat table with
# the count in a 'Counter' column; only the first rows are displayed.
rows_per_time_and_class = credit_card_data_df.groupby(['Time', 'Class']).size()
rows_per_time_and_class.reset_index(name='Counter').head()
    
    Out[39]:
In [ ]:
    
# import matplotlib.cm as cm
# gg = credit_card_data_df.groupby(['Time', 'Class']).sum().unstack()
    
In [ ]:
    
# gg.columns = gg.columns.droplevel()
# gg.plot(kind = 'bar', colormap = cm.Accent, width = 1)
    
In [14]:
    
# Visualize the class distribution ...
from ggplot import *
ggplot(aes(x='Time', y='Class', color='Class'), data=credit_card_data_df) + geom_point()
    
    
    Out[14]:
In [20]:
    
# Print the 'Amount' summary statistics separately for each class value,
# each preceded by its heading.
for class_value, heading in ((0, "Class 0"), (1, "Class 1")):
    rows_in_class = credit_card_data_df[credit_card_data_df['Class'] == class_value]
    print(heading)
    print(rows_in_class['Amount'].describe())
    
    
In [32]:
    
# Show the first two rows of every column except 'Class'.
columns_without_class = credit_card_data_df.columns.difference(['Class'])
credit_card_data_df[columns_without_class].head(2)
    
    Out[32]:
In [35]:
    
# Wrap the pandas DataFrame in an H2OFrame, then split it with ratios
# 0.6 / 0.2 and a fixed seed; the split yields the three parts unpacked below.
credit_card_h2o_frame = h2o.H2OFrame(credit_card_data_df)
frame_parts = credit_card_h2o_frame.split_frame(ratios=[0.6, 0.2], seed=0)
train, valid, test = frame_parts
    
    
In [34]:
    
# Autoencoder settings: Tanh activation, hidden=[50, 50, 50], constant
# columns not ignored, 100 epochs.
autoencoder_settings = dict(
    activation="Tanh",
    hidden=[50, 50, 50],
    ignore_const_cols=False,
    epochs=100,
)
model = H2OAutoEncoderEstimator(**autoencoder_settings)
    
In [ ]:
    
# Fit the autoencoder on the training split only, so the `valid` and `test`
# splits created above remain unseen by the model.
# The 'Class' label and the constant 'Counter_on_Class' bookkeeping column
# (when present) are excluded from the inputs: feeding the label to the
# network would let it reconstruct rows using the answer itself.
non_feature_columns = {'Class', 'Counter_on_Class'}
feature_names = [name for name in train.col_names if name not in non_feature_columns]
model.train(x=feature_names, training_frame=train)