In [1]:
check = True
In [2]:
import urllib
import os
import pandas as pd
import numpy as np
if not os.path.exists("Downloads"):
os.makedirs("Downloads")
if not os.path.isfile("Downloads/data_berka.zip"):
zipfiles = urllib.URLopener()
zipfiles.retrieve("http://lisp.vse.cz/pkdd99/DATA/data_berka.zip", "Downloads/data_berka.zip")
if not os.path.isfile("Downloads/berka.htm"):
challenge_file = urllib.URLopener()
challenge_file.retrieve("http://lisp.vse.cz/pkdd99/Challenge/berka.htm", "Downloads/berka.htm")
In [3]:
import zipfile
if not os.path.exists('Downloads/data_berka'):
os.makedirs('Downloads/data_berka')
zip_ref = zipfile.ZipFile('Downloads/data_berka.zip', 'r')
zip_ref.extractall('Downloads/data_berka')
zip_ref.close()
In [4]:
files = os.listdir('Downloads/data_berka')
if check:
for fl in files:
with open('Downloads/data_berka/' + fl) as myfile:
print fl
lines = myfile.readlines()
print '\t' + repr(lines[0])
print '\t' + repr(lines[1])
print '\t ...'
print '\t' + repr(lines[-2])
print '\t' + repr(lines[-1])
In [5]:
data_dict = {}
keys = [fl.split('.')[0] for fl in files]
for fl, key in zip(files,keys):
if key=="trans":
data_dict[key] = pd.read_csv('Downloads/data_berka/' + fl, sep = ';',
quotechar = '\"',
dtype = {'account':np.float64})
else:
data_dict[key] = pd.read_csv('Downloads/data_berka/' + fl, sep = ';', quotechar = '\"')
if check:
for key in keys:
print key
print 'Shape: ' + str(data_dict[key].shape)
print 'First_row:'
print data_dict[key].iloc[0]
print '\n'
In [7]:
if check:
from IPython.core.display import HTML
html_file = open('Downloads/berka.htm').read()
HTML(html_file)
Provide a mapping for the district headers (from the explanation file). We do this manually (Scraping is a waste of time).
In [7]:
new_headers = ['district_id', 'district_name', 'region',
'inhabitants', 'municipalties_499_less',
'municipalties_500_1999', 'municipalties_2000_9999',
'municipalties_10000_more', 'cities','urban_ratio',
'avg_salary', 'unemployment_rate_1995',
'unemployment_rate_1996', 'entrepeneurs_per_1000',
'committed_crimes_1995','comitted_crimes_1996']
data_dict['district'].columns = new_headers
if check:
data_dict['district'].describe()
In [8]:
# Set dates to a date format
data_dict['account']['date'] = pd.to_datetime(data_dict['account']['date'], format='%y%m%d')
data_dict['card']['issued'] = pd.to_datetime(data_dict['card']['issued'],
format='%y%m%d %H:%M:%S')
data_dict['trans']['date'] = pd.to_datetime(data_dict['trans']['date'], format='%y%m%d')
data_dict['loan']['date'] = pd.to_datetime(data_dict['loan']['date'], format='%y%m%d')
# Client needs modification (seperate the sexes and adjust the dates)
data_dict['client']['sex'] = "M"
tmp = (data_dict['client']['birth_number']/100).round().astype(int)
tmp = tmp % 100 > 50
data_dict['client'].loc[tmp, "sex"] = "F"
data_dict['client'].loc[tmp, "birth_number"] = data_dict['client'].loc[tmp, "birth_number"] - 5000
# Assumption: Everyone borne in the 1900's, (min birth_number -> 110820, max -> 870927)
# 113 is rather old.
data_dict['client']['birth_number'] = data_dict['client']['birth_number'] + 19000000
data_dict['client']['birth_number'] = pd.to_datetime(data_dict['client']['birth_number'],
format='%Y%m%d')
if check:
print data_dict['account']['date'].describe()
print data_dict['card']['issued'].describe()
print data_dict['trans']['date'].describe()
print data_dict['loan']['date'].describe()
print data_dict['client']['birth_number'].describe()
In [9]:
# For the account data:
df = data_dict['account']
df.loc[df.frequency == 'POPLATEK MESICNE', 'frequency'] = 'monthly'
df.loc[df.frequency == 'POPLATEK TYDNE', 'frequency'] = 'weekly'
df.loc[df.frequency == 'POPLATEK PO OBRATU', 'frequency'] = 'after_transaction'
data_dict['account'] = df
# For the order data:
df = data_dict['order']
df.loc[df.k_symbol == 'POJISTNE', 'k_symbol'] = 'insurance'
df.loc[df.k_symbol == 'SIPO', 'k_symbol'] = 'household'
df.loc[df.k_symbol == 'LEASING', 'k_symbol'] = 'leasing'
df.loc[df.k_symbol == 'UVER', 'k_symbol'] = 'loan'
data_dict['order'] = df
# For the transaction data:
df = data_dict['trans']
df.loc[df.type == 'PRIJEM', 'type'] = 'credit'
df.loc[df.type == 'VYDAJ', 'type'] = 'withdrawal'
df.loc[df.operation == 'VYBER KARTOU', 'operation'] = 'cc_withdrawal'
df.loc[df.operation == 'VKLAD', 'operation'] = 'cash_credit'
df.loc[df.operation == 'PREVOD Z UCTU', 'operation'] = 'inc_bank'
df.loc[df.operation == 'VYBER', 'operation'] = 'cash_withdrawl'
df.loc[df.operation == 'PREVOD NA UCET', 'operation'] = 'outg_bank'
df.loc[df.k_symbol == 'POJISTNE', 'k_symbol'] = 'insurance'
df.loc[df.k_symbol == 'SLUZBY', 'k_symbol'] = 'payment_statement'
df.loc[df.k_symbol == 'UROK', 'k_symbol'] = 'interest_credited'
df.loc[df.k_symbol == 'SANKC. UROK', 'k_symbol'] = 'interest_sactioned'
df.loc[df.k_symbol == 'SIPO', 'k_symbol'] = 'household'
df.loc[df.k_symbol == 'DUCHOD', 'k_symbol'] = 'pension'
df.loc[df.k_symbol == 'UVER', 'k_symbol'] = 'loan'
data_dict['trans'] = df
# For the loan data:
df = data_dict['loan']
df.loc[df.status == 'A', 'status'] = 'fin_no_problem'
df.loc[df.status == 'B', 'status'] = 'fin_unpaid'
df.loc[df.status == 'C', 'status'] = 'run_no_problem'
df.loc[df.status == 'D', 'status'] = 'run_but_debt'
data_dict['loan'] = df
In [10]:
# Add disposition
client_info = pd.merge(left = data_dict['client'],
right = data_dict['disp'],
how ='left',
on = 'client_id')
# Add account info
client_info = pd.merge(left = client_info,
right = data_dict['account'],
how ='left',
on = 'account_id',
suffixes=('_client', '_branch'))
# Add Loans
client_info = pd.merge(left = client_info,
right = data_dict['loan'],
how ='left',
on = 'account_id',
suffixes=('_client', '_loan'))
# Add credit cards
client_info = pd.merge(left = client_info,
right = data_dict['card'],
how ='left',
on = 'disp_id',
suffixes=('_client', '_card'))
In [11]:
if not os.path.exists("data"):
os.makedirs("data")
demographic_info = data_dict['district']
transaction_info = data_dict['trans']
order_info = data_dict['order']
loan_info = data_dict['loan']
client_info.to_csv('data/client_info.csv', index = False)
demographic_info.to_csv('data/demographic_data.csv', index = False)
transaction_info.to_csv('data/transction_info.csv', index = False)
order_info.to_csv('data/order_info.csv', index = False)
loan_info.to_csv('data/loan_info.csv', index = False)
In [12]:
client_info = pd.read_csv('data/client_info.csv')
loan_info = pd.read_csv('data/loan_info.csv')