Download, parse and cleanse the data

In this notebook, we download, check, parse and cleanse the dataset.
Also, we translate several values and column names to their English counterparts.

Data checking variable.

As we want to quickly check whether our manipulations provide the desired result we add a checking variable. This will allow us to print tests to the screen


In [1]:
check = True

Download the data


In [2]:
import urllib
import os
import pandas as pd
import numpy as np

if not os.path.exists("Downloads"):
    os.makedirs("Downloads")
if not os.path.isfile("Downloads/data_berka.zip"):    
    zipfiles = urllib.URLopener()
    zipfiles.retrieve("http://lisp.vse.cz/pkdd99/DATA/data_berka.zip", "Downloads/data_berka.zip")
if not os.path.isfile("Downloads/berka.htm"):    
    challenge_file = urllib.URLopener()
    challenge_file.retrieve("http://lisp.vse.cz/pkdd99/Challenge/berka.htm", "Downloads/berka.htm")

Extract the zipfile


In [3]:
import zipfile

if not os.path.exists('Downloads/data_berka'):
    os.makedirs('Downloads/data_berka')
    
zip_ref = zipfile.ZipFile('Downloads/data_berka.zip', 'r')
zip_ref.extractall('Downloads/data_berka')
zip_ref.close()

Inspect filetype, seperator, quote and column types. Print it for the first and last two lines in the file.


In [4]:
files = os.listdir('Downloads/data_berka')
if check:
    for fl in files:
        with open('Downloads/data_berka/' + fl) as myfile:
            print fl
            lines = myfile.readlines()
            print '\t' + repr(lines[0])
            print '\t' + repr(lines[1])
            print '\t ...'
            print '\t' + repr(lines[-2])
            print '\t' + repr(lines[-1])


account.asc
	'"account_id";"district_id";"frequency";"date"\r\n'
	'576;55;"POPLATEK MESICNE";930101\r\n'
	 ...
	'1573;63;"POPLATEK MESICNE";971229\r\n'
	'3276;1;"POPLATEK MESICNE";971229\r\n'
card.asc
	'"card_id";"disp_id";"type";"issued"\r\n'
	'1005;9285;"classic";931107 00:00:00\r\n'
	 ...
	'685;4467;"classic";981229 00:00:00\r\n'
	'635;4083;"classic";981229 00:00:00\r\n'
client.asc
	'"client_id";"birth_number";"district_id"\r\n'
	'1;"706213";18\r\n'
	 ...
	'13971;"626019";67\r\n'
	'13998;"535812";74\r\n'
disp.asc
	'"disp_id";"client_id";"account_id";"type"\r\n'
	'1;1;1;"OWNER"\r\n'
	 ...
	'13663;13971;11362;"OWNER"\r\n'
	'13690;13998;11382;"OWNER"\r\n'
district.asc
	'A1;A2;A3;A4;A5;A6;A7;A8;A9;A10;A11;A12;A13;A14;A15;A16\r\n'
	'1;"Hl.m. Praha";"Prague";1204953;0;0;0;1;1;100.0;12541;0.29;0.43;167;85677;99107\r\n'
	 ...
	'76;"Sumperk";"north Moravia";127369;31;32;13;2;7;51.2;8369;4.73;5.88;107;3736;2807\r\n'
	'77;"Vsetin";"north Moravia";148545;8;35;12;3;4;53.5;8909;4.01;5.56;113;3460;3590\r\n'
loan.asc
	'"loan_id";"account_id";"date";"amount";"duration";"payments";"status"\r\n'
	'5314;1787;930705;96396;12;8033.00;"B"\r\n'
	 ...
	'5346;1928;981206;55632;24;2318.00;"C"\r\n'
	'6748;8645;981208;240900;60;4015.00;"C"\r\n'
order.asc
	'"order_id";"account_id";"bank_to";"account_to";"amount";"k_symbol"\r\n'
	'29401;1;"YZ";"87144583";2452.00;"SIPO"\r\n'
	 ...
	'46337;11362;"KL";"20009470";129.00;" "\r\n'
	'46338;11362;"MN";"61540514";5392.00;"UVER"\r\n'
trans.asc
	'"trans_id";"account_id";"date";"type";"operation";"amount";"balance";"k_symbol";"bank";"account"\r\n'
	'695247;2378;930101;"PRIJEM";"VKLAD";700.00;700.00;"";;\r\n'
	 ...
	'3626683;2907;981231;"PRIJEM";"";107.50;23453.00;"UROK";;\r\n'
	'3626540;2902;981231;"PRIJEM";"";164.10;41642.90;"UROK";;\r\n'

Load into pandas data.frames:

Pandas dataframes will be imported into a dictionary such that we can perform actions on all of them and quickly scan whether the import was succesfull.


In [5]:
data_dict = {}
keys = [fl.split('.')[0] for fl in files]

for fl, key in zip(files,keys):
    if key=="trans":
        data_dict[key] = pd.read_csv('Downloads/data_berka/' + fl, sep = ';', 
                                     quotechar = '\"',
                                     dtype = {'account':np.float64})
    else:
        data_dict[key] = pd.read_csv('Downloads/data_berka/' + fl, sep = ';', quotechar = '\"')

if check:
    for key in keys:
        print key
        print 'Shape: ' + str(data_dict[key].shape)
        print 'First_row:' 
        print data_dict[key].iloc[0]
        print '\n'


account
Shape: (4500, 4)
First_row:
account_id                  576
district_id                  55
frequency      POPLATEK MESICNE
date                     930101
Name: 0, dtype: object


card
Shape: (892, 4)
First_row:
card_id               1005
disp_id               9285
type               classic
issued     931107 00:00:00
Name: 0, dtype: object


client
Shape: (5369, 3)
First_row:
client_id            1
birth_number    706213
district_id         18
Name: 0, dtype: int64


disp
Shape: (5369, 4)
First_row:
disp_id           1
client_id         1
account_id        1
type          OWNER
Name: 0, dtype: object


district
Shape: (77, 16)
First_row:
A1               1
A2     Hl.m. Praha
A3          Prague
A4         1204953
A5               0
A6               0
A7               0
A8               1
A9               1
A10            100
A11          12541
A12           0.29
A13           0.43
A14            167
A15          85677
A16          99107
Name: 0, dtype: object


loan
Shape: (682, 7)
First_row:
loan_id         5314
account_id      1787
date          930705
amount         96396
duration          12
payments        8033
status             B
Name: 0, dtype: object


order
Shape: (6471, 6)
First_row:
order_id         29401
account_id           1
bank_to             YZ
account_to    87144583
amount            2452
k_symbol          SIPO
Name: 0, dtype: object


trans
Shape: (1056320, 10)
First_row:
trans_id      695247
account_id      2378
date          930101
type          PRIJEM
operation      VKLAD
amount           700
balance          700
k_symbol         NaN
bank             NaN
account          NaN
Name: 0, dtype: object


/Users/moeben001/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (8) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

District headers are not directly usefull.

Print the explanation file.


In [7]:
if check:
    from IPython.core.display import HTML
    html_file = open('Downloads/berka.htm').read()
    HTML(html_file)

Provide a mapping for the district headers (from the explanation file). We do this manually (Scraping is a waste of time).


In [7]:
new_headers = ['district_id', 'district_name', 'region', 
               'inhabitants', 'municipalties_499_less', 
               'municipalties_500_1999', 'municipalties_2000_9999', 
               'municipalties_10000_more', 'cities','urban_ratio',
               'avg_salary', 'unemployment_rate_1995', 
               'unemployment_rate_1996', 'entrepeneurs_per_1000', 
               'committed_crimes_1995','comitted_crimes_1996']
data_dict['district'].columns = new_headers
if check:
    data_dict['district'].describe()

Data cleansing (according to description)

Quickly setting all values to those we are used to work with.


In [8]:
# Set dates to a date format
data_dict['account']['date'] = pd.to_datetime(data_dict['account']['date'], format='%y%m%d')
data_dict['card']['issued'] = pd.to_datetime(data_dict['card']['issued'], 
                                             format='%y%m%d %H:%M:%S')
data_dict['trans']['date'] = pd.to_datetime(data_dict['trans']['date'], format='%y%m%d')
data_dict['loan']['date'] = pd.to_datetime(data_dict['loan']['date'], format='%y%m%d')

# Client needs modification (seperate the sexes and adjust the dates)
data_dict['client']['sex'] = "M"
tmp = (data_dict['client']['birth_number']/100).round().astype(int)
tmp = tmp % 100 > 50
data_dict['client'].loc[tmp, "sex"] = "F"
data_dict['client'].loc[tmp, "birth_number"] = data_dict['client'].loc[tmp, "birth_number"] - 5000

# Assumption: Everyone borne in the 1900's, (min birth_number -> 110820, max -> 870927)
# 113 is rather old.
data_dict['client']['birth_number'] = data_dict['client']['birth_number'] + 19000000
data_dict['client']['birth_number'] = pd.to_datetime(data_dict['client']['birth_number'], 
                                                format='%Y%m%d')

if check:
    print data_dict['account']['date'].describe()
    print data_dict['card']['issued'].describe()
    print data_dict['trans']['date'].describe()
    print data_dict['loan']['date'].describe()
    print data_dict['client']['birth_number'].describe()


count                    4500
unique                   1535
top       1993-02-08 00:00:00
freq                       13
first     1993-01-01 00:00:00
last      1997-12-29 00:00:00
Name: date, dtype: object
count                     892
unique                    607
top       1998-09-29 00:00:00
freq                        7
first     1993-11-07 00:00:00
last      1998-12-29 00:00:00
Name: issued, dtype: object
count                 1056320
unique                   2191
top       1998-06-30 00:00:00
freq                     9269
first     1993-01-01 00:00:00
last      1998-12-31 00:00:00
Name: date, dtype: object
count                     682
unique                    559
top       1998-07-12 00:00:00
freq                        4
first     1993-07-05 00:00:00
last      1998-12-08 00:00:00
Name: date, dtype: object
count                    5369
unique                   4738
top       1952-08-26 00:00:00
freq                        4
first     1911-08-20 00:00:00
last      1987-09-27 00:00:00
Name: birth_number, dtype: object

Change data to its English equivalent

As this is a Czech bank, we have to change the wording to Enlgish to make it workable for us.


In [9]:
# For the account data:
df = data_dict['account']
df.loc[df.frequency == 'POPLATEK MESICNE', 'frequency'] = 'monthly'
df.loc[df.frequency == 'POPLATEK TYDNE', 'frequency'] = 'weekly'
df.loc[df.frequency == 'POPLATEK PO OBRATU', 'frequency'] = 'after_transaction'

data_dict['account'] = df

# For the order data:
df = data_dict['order']
df.loc[df.k_symbol == 'POJISTNE', 'k_symbol'] = 'insurance'
df.loc[df.k_symbol == 'SIPO', 'k_symbol'] = 'household'
df.loc[df.k_symbol == 'LEASING', 'k_symbol'] = 'leasing'
df.loc[df.k_symbol == 'UVER', 'k_symbol'] = 'loan'

data_dict['order'] = df

# For the transaction data:
df = data_dict['trans']
df.loc[df.type == 'PRIJEM', 'type'] = 'credit'
df.loc[df.type == 'VYDAJ', 'type'] = 'withdrawal'
df.loc[df.operation == 'VYBER KARTOU', 'operation'] = 'cc_withdrawal'
df.loc[df.operation == 'VKLAD', 'operation'] = 'cash_credit'
df.loc[df.operation == 'PREVOD Z UCTU', 'operation'] = 'inc_bank'
df.loc[df.operation == 'VYBER', 'operation'] = 'cash_withdrawl'
df.loc[df.operation == 'PREVOD NA UCET', 'operation'] = 'outg_bank'
df.loc[df.k_symbol == 'POJISTNE', 'k_symbol'] = 'insurance'
df.loc[df.k_symbol == 'SLUZBY', 'k_symbol'] = 'payment_statement'
df.loc[df.k_symbol == 'UROK', 'k_symbol'] = 'interest_credited'
df.loc[df.k_symbol == 'SANKC. UROK', 'k_symbol'] = 'interest_sactioned'
df.loc[df.k_symbol == 'SIPO', 'k_symbol'] = 'household'
df.loc[df.k_symbol == 'DUCHOD', 'k_symbol'] = 'pension'
df.loc[df.k_symbol == 'UVER', 'k_symbol'] = 'loan'
data_dict['trans'] = df

# For the loan data:
df = data_dict['loan']
df.loc[df.status == 'A', 'status'] = 'fin_no_problem'
df.loc[df.status == 'B', 'status'] = 'fin_unpaid'
df.loc[df.status == 'C', 'status'] = 'run_no_problem'
df.loc[df.status == 'D', 'status'] = 'run_but_debt'
data_dict['loan'] = df

Merge data into three data.frames, client info, demographic info and transactions:

We can now start to merge the data in to less dataframes. All merges are left-joins such that none of the information in the left data frames is lost.


In [10]:
# Add disposition
client_info = pd.merge(left = data_dict['client'], 
                       right = data_dict['disp'], 
                       how ='left', 
                       on = 'client_id')
# Add account info
client_info = pd.merge(left = client_info, 
                       right = data_dict['account'], 
                       how ='left', 
                       on = 'account_id',
                       suffixes=('_client', '_branch'))

# Add Loans
client_info = pd.merge(left = client_info, 
                       right = data_dict['loan'], 
                       how ='left', 
                       on = 'account_id',
                       suffixes=('_client', '_loan'))
# Add credit cards
client_info = pd.merge(left = client_info, 
                       right = data_dict['card'], 
                       how ='left', 
                       on = 'disp_id',
                       suffixes=('_client', '_card'))

Save data


In [11]:
if not os.path.exists("data"):
    os.makedirs("data")
    
demographic_info = data_dict['district']
transaction_info = data_dict['trans']
order_info = data_dict['order']
loan_info = data_dict['loan']

client_info.to_csv('data/client_info.csv', index = False)
demographic_info.to_csv('data/demographic_data.csv', index = False)
transaction_info.to_csv('data/transction_info.csv', index = False)
order_info.to_csv('data/order_info.csv', index = False)
loan_info.to_csv('data/loan_info.csv', index = False)

In [12]:
client_info = pd.read_csv('data/client_info.csv')
loan_info = pd.read_csv('data/loan_info.csv')