Import packages


In [1]:
from datetime import datetime
import dateutil.parser
import re

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing

# The command below means that the output of multiple commands in a cell will be output at once
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# The command below tells jupyter to display up to 80 columns, this keeps everything visible
pd.set_option('display.max_columns', 80)
pd.set_option('expand_frame_repr', True)

# Show figures in notebook
%matplotlib inline

# Plotting defaults
plt.rcParams['figure.figsize'] = (15,8)
sns.set()
sns.set(font_scale=1.5)

Create dataframe with basic exploration


In [2]:
path = 'data/'
filename = 'loans.csv'

try:
    df = pd.read_csv(path+filename)
except FileNotFoundError:
    # If data is not found, download it from GitHub
    import os
    os.system(f'git clone --single-branch --depth=1 https://github.com/DeltaAnalytics/machine_learning_for_good_data {path}')

df.sample(n=2)
print(f'There are {df.shape[0]} observations and {df.shape[1]} features')


Out[2]:
id_number loan_amount lender_count status funded_date funded_amount repayment_term location_country_code sector description use
5376 1529256 8800 282 funded 2018-05-29T11:08:14Z 8800 6 CD Retail Madam Pelagie is the president of the banc vil... to buy and transport 12 goats to increase her ...
4976 1560912 800 19 fundraising NaN 475 11 TG Clothing Ya is a married woman, 55 years old, residing ... to pay for 3 bales of curtains, 3 of bed sheet...
There are 6019 observations and 11 features

Data Types


In [3]:
df.columns.tolist()
df.dtypes
df.describe()
df[df.dtypes[df.dtypes == 'object'].index].describe()


Out[3]:
['id_number',
 'loan_amount',
 'lender_count',
 'status',
 'funded_date',
 'funded_amount',
 'repayment_term',
 'location_country_code',
 'sector',
 'description',
 'use']
Out[3]:
id_number                 int64
loan_amount               int64
lender_count              int64
status                   object
funded_date              object
funded_amount             int64
repayment_term            int64
location_country_code    object
sector                   object
description              object
use                      object
dtype: object
Out[3]:
id_number loan_amount lender_count funded_amount repayment_term
count 6.019000e+03 6019.000000 6019.000000 6019.000000 6019.000000
mean 1.359770e+06 1499.011464 35.661406 1325.070610 11.803290
std 3.719316e+05 2512.517280 73.420256 2444.726815 9.114948
min 1.377200e+04 50.000000 0.000000 0.000000 3.000000
25% 1.425188e+06 300.000000 7.000000 200.000000 8.000000
50% 1.550673e+06 625.000000 16.000000 525.000000 10.000000
75% 1.566204e+06 1825.000000 41.000000 1525.000000 14.000000
max 1.573593e+06 80000.000000 2665.000000 80000.000000 133.000000
Out[3]:
status funded_date location_country_code sector description use
count 6019 5082 6002 6019 5677 5677
unique 3 4453 30 14 5277 4325
top funded 2018-07-22T15:54:41Z CD Food Anthony is 29 years old and a single father of... to pay for a stove.
freq 5082 9 400 1738 2 80

Missing Values


In [4]:
def num_missing(x):
    return sum(x.isnull())

print('Missing values per column:')
print(df.apply(num_missing,axis=0).where(lambda x : x != 0).dropna())


Missing values per column:
funded_date              937.0
location_country_code     17.0
description              342.0
use                      342.0
dtype: float64

Sanity Checks

  • is the range of values what you would expect. For example, are all loan_amounts above 0.
  • do you have the number of rows you would expect
  • is your data for the date range you would expect. For example, is there a strange year in the data like 1880.
  • are there unexpected spikes when you plot the data over time

In [5]:
#specific column analysis for numerical types
print(len(df['lender_count']))
print(max(df['funded_amount']))
print(df['loan_amount'].mean())


6019
80000
1499.0114636982887

In [6]:
#specific column analysis for categorical types
df['status'].value_counts()


Out[6]:
funded         5082
fundraising     841
expired          96
Name: status, dtype: int64

In [7]:
df[['loan_amount','funded_amount']].tail()


Out[7]:
loan_amount funded_amount
6014 200 200
6015 200 200
6016 200 200
6017 200 200
6018 200 200

In [8]:
# get rows 1 through 3 and columns 0 through 5
df.iloc[1:3,:5]


Out[8]:
id_number loan_amount lender_count status funded_date
1 743090 975 34 funded 2014-08-18T09:10:54Z
2 743120 950 25 funded 2014-08-09T17:46:35Z

In [9]:
# get rows with index values of 2-4 and the columns status and lender_count
df.loc[2:4,["status","lender_count"]]


Out[9]:
status lender_count
2 funded 25
3 funded 28
4 funded 21

In [10]:
df.iloc[:,:]


Out[10]:
id_number loan_amount lender_count status funded_date funded_amount repayment_term location_country_code sector description use
0 736066 4825 60 funded 2014-08-03T17:51:50Z 4825 8 BJ Retail NaN NaN
1 743090 975 34 funded 2014-08-18T09:10:54Z 975 12 BJ Food NaN NaN
2 743120 950 25 funded 2014-08-09T17:46:35Z 950 14 BJ Services NaN NaN
3 743121 825 28 funded 2014-08-24T17:00:38Z 825 14 BJ Retail NaN NaN
4 743124 725 21 funded 2014-08-25T03:24:54Z 725 13 BJ Retail NaN NaN
5 743125 725 27 funded 2014-08-18T23:08:00Z 725 13 BJ Retail NaN NaN
6 743130 725 27 funded 2014-08-19T12:14:49Z 725 13 BJ Retail NaN NaN
7 743134 525 14 funded 2014-07-28T06:20:44Z 525 13 BJ Services NaN NaN
8 743207 950 35 funded 2014-08-26T14:24:08Z 950 14 BJ Retail NaN NaN
9 743228 500 10 funded 2014-08-01T01:54:34Z 500 10 BJ Food NaN NaN
10 743233 200 6 funded 2014-07-28T14:06:20Z 200 11 BJ Services NaN NaN
11 743236 225 8 funded 2014-07-29T00:07:22Z 225 14 BJ Food NaN NaN
12 743238 400 10 funded 2014-08-17T22:03:01Z 400 14 BJ Retail NaN NaN
13 744232 5850 176 funded 2014-08-26T22:16:21Z 5850 12 BJ Retail NaN NaN
14 744237 1250 42 funded 2014-08-23T15:33:30Z 1250 10 BJ Clothing NaN NaN
15 744240 500 13 funded 2014-08-20T01:26:40Z 500 10 BJ Retail NaN NaN
16 744243 1000 31 funded 2014-08-26T10:01:10Z 1000 13 BJ Retail NaN NaN
17 767909 300 12 funded 2014-09-15T11:10:34Z 300 10 BJ Arts Ahmed Baba is a Beninese artisan who specializ... to invest in a bulk purchase of raw materials ...
18 1266423 50000 1519 funded 2017-04-18T06:42:43Z 50000 12 BJ Agriculture <a href="http://agpowerbenin.com/"> Tolaro Glo... to add value and jobs to the local economy by ...
19 1480779 50000 1574 funded 2018-04-06T22:46:13Z 50000 14 BJ Agriculture In February 2017, MCE made a 12-month loan of ... to promote the growth of the business by trans...
20 741324 950 30 funded 2014-08-22T18:52:41Z 950 12 BJ Retail NaN NaN
21 741327 1200 40 funded 2014-08-22T18:22:30Z 1200 14 BJ Retail NaN NaN
22 741383 1025 33 funded 2014-08-19T23:41:31Z 1025 12 BJ Services NaN NaN
23 741390 950 30 funded 2014-08-16T21:38:53Z 950 14 BJ Services NaN NaN
24 741395 900 30 funded 2014-08-22T16:39:02Z 900 12 BJ Retail NaN NaN
25 741422 3825 102 funded 2014-08-22T16:14:10Z 3825 9 BJ Retail NaN NaN
26 741431 850 25 funded 2014-08-21T18:18:17Z 850 12 BJ Services NaN NaN
27 741433 875 26 funded 2014-08-22T21:51:59Z 875 12 BJ Retail NaN NaN
28 741435 750 16 funded 2014-08-23T01:11:26Z 750 12 BJ Retail NaN NaN
29 741437 475 15 funded 2014-08-18T15:10:18Z 475 9 BJ Retail NaN NaN
... ... ... ... ... ... ... ... ... ... ... ...
5989 1569410 200 6 funded 2018-07-19T19:32:40Z 200 14 ZW Food Fidelity is a 26-year-old entrepreneur who liv... to start her own grocery shop.
5990 1569437 200 5 funded 2018-07-20T13:34:34Z 200 14 ZW Retail Anna is an 18-year-old entrepreneur who lives ... to start her own grocery shop, selling sweets,...
5991 1569452 200 8 funded 2018-07-21T15:30:14Z 200 14 ZW Retail Beverly is a 21-year-old entrepreneur who live... to purchase more groceries to sell and expand ...
5992 1569456 200 8 funded 2018-07-19T21:37:39Z 200 14 ZW Food Nancy is an 18-year-old entrepreneur who lives... to buy grocery products for her business.
5993 1569459 200 6 funded 2018-07-20T08:50:21Z 200 14 ZW Food Petronella is a 24-year-old entrepreneur who l... to buy grocery products for her business.
5994 1569464 200 8 funded 2018-07-21T11:30:11Z 200 14 ZW Retail Princess is a 20-year-old entrepreneur who liv... to purchase hair products.
5995 1569471 200 7 funded 2018-07-20T14:19:48Z 200 14 ZW Retail Edith is a 23-year-old entrepreneur who lives ... to start her own grocery shop selling differen...
5996 1569476 200 7 funded 2018-07-20T01:17:06Z 200 14 ZW Clothing Mary is a 24-year-old entrepreneur who lives w... to start her own clothing shop.
5997 1569481 200 8 funded 2018-07-20T15:47:27Z 200 14 ZW Food Nozinhle is a 25-year-old entrepreneur who liv... to buy groceries and hair products to resell.
5998 1569497 500 13 funded 2018-07-20T14:08:01Z 500 14 ZW Agriculture Sinanisiwe is a 30-year-old entrepreneur who l... to buy more stock, chicks and chick feed.
5999 1568794 200 8 funded 2018-07-19T12:39:07Z 200 14 ZW Food Mary is an 18-year-old entrepreneur who lives ... to increase the size and variety of her stock.
6000 1568795 200 8 funded 2018-07-18T21:21:30Z 200 14 ZW Manufacturing Rosemary is an 18-year-old entrepreneur who li... to start her own business manufacturing deterg...
6001 1568806 200 8 funded 2018-07-19T01:41:56Z 200 14 ZW Agriculture Sharon is a 17-year-old entrepreneur who lives... to buy chicks and chick feed.
6002 1568807 200 8 funded 2018-07-18T22:21:38Z 200 14 ZW Agriculture Loveness is a 29-year-old entrepreneur who liv... to start her own business growing crops to sel...
6003 1568825 200 6 funded 2018-07-19T11:54:08Z 200 14 ZW Clothing Joyce is a 25-year-old entrepreneur who lives ... to buy more clothes for her business.
6004 1568826 200 8 funded 2018-07-19T07:20:45Z 200 14 ZW Agriculture Shebba is a 22-year-old entrepreneur who lives... to start her own business of rearing chickens.
6005 1568829 200 8 funded 2018-07-19T02:02:36Z 200 14 ZW Agriculture Priscilar is an 18-year-old entrepreneur who l... to buy chicks and chick feed for her business.
6006 1568834 200 6 funded 2018-07-18T23:00:49Z 200 14 ZW Agriculture Delayed is a 19-year-old entrepreneur who live... to start her own business rearing chickens.
6007 1568835 200 6 funded 2018-07-19T06:24:30Z 200 14 ZW Agriculture Matilda is a 20-year-old entrepreneur who live... to start her own poultry business keeping broi...
6008 1568840 200 7 funded 2018-07-19T02:32:48Z 200 14 ZW Clothing Belinda is a 23-year-old entrepreneur who live... to start her own business selling clothes.
6009 1568842 200 8 funded 2018-07-19T04:57:26Z 200 14 ZW Agriculture Nyasha is a 22-year-old entrepreneur who lives... to start her own business rearing chickens.
6010 1568843 200 8 funded 2018-07-19T00:46:32Z 200 14 ZW Agriculture Lydia is a 19-year-old entrepreneur who lives ... to start her own poultry business keeping broi...
6011 1568850 200 8 funded 2018-07-19T07:58:32Z 200 14 ZW Food Linda is an 18-year-old entrepreneur who lives... to start a grocery business.
6012 1568857 200 7 funded 2018-07-19T02:00:45Z 200 14 ZW Retail Clara is a 19-year-old entrepreneur who lives ... to start her own grocery shop, where she will ...
6013 1568859 200 8 funded 2018-07-18T23:57:55Z 200 14 ZW Clothing Molline is a 34-year-old entrepreneur who live... to start a clothing and accessories retail bus...
6014 1568871 200 8 funded 2018-07-19T15:14:35Z 200 14 ZW Food Sethukelo is a 19-year-old entrepreneur who li... to purchase goods for starting a grocery store.
6015 1568880 200 8 funded 2018-07-19T19:22:43Z 200 14 ZW Food Hlanjiwe is a 20-year-old entrepreneur who liv... to buy grocery goods for her business.
6016 1568883 200 6 funded 2018-07-19T20:18:53Z 200 14 ZW Clothing Lebuhani is a 21-year-old entrepreneur who liv... to buy clothes for her business.
6017 1568887 200 8 funded 2018-07-18T23:38:44Z 200 14 ZW Food Jacqueline is a 23-year-old entrepreneur who l... her to buy goods to sell in her store.
6018 1568890 200 8 funded 2018-07-19T16:54:18Z 200 14 ZW Food Delligent is a 23-year-old entrepreneur who li... to buy grocery goods for her business.

6019 rows × 11 columns


In [11]:
df[df['status'] == 'fundraising']


Out[11]:
id_number loan_amount lender_count status funded_date funded_amount repayment_term location_country_code sector description use
210 1567810 950 15 fundraising NaN 750 8 BF Retail The group is made up of 3 members of whom the ... to buy some suitcases in quantity for resale.
213 1567905 2950 2 fundraising NaN 50 14 BF Food Fhatimatoun is 35 years old, married and mothe... to buy sacks of cereals and charcoal to sell.
214 1567938 2600 3 fundraising NaN 75 14 BF Food Matagara is 48 years old, married, and is the ... who wants to buy powdered milk and other ingre...
217 1568867 1875 4 fundraising NaN 100 10 BF Food Tewende Group has just finished their first Ki... to buy sprouted millet for the preparation of ...
218 1568894 2425 26 fundraising NaN 1125 8 BF Food Kiswendsida has just finished its Kiva loan an... to buy wheat flour and oil for the fried food ...
219 1568898 900 6 fundraising NaN 175 8 BF Food The group named “Eben Ezer” just finished its ... to pay for rice and condiments for her restaur...
220 1569523 1800 6 fundraising NaN 175 8 BF Clothing The Sidzadba group just completed its [previou... to buy a large quantity of "pagnes" to resell.
222 1560770 1275 31 fundraising NaN 800 10 BF Food Zeneba is 52 yeas old, married and the mother ... to buy 5 bags of rice, 10 packages of spaghett...
230 1562208 900 19 fundraising NaN 500 14 BF Agriculture Daouda is married and the father of two childr... To buy a hundred chickens.
231 1562277 900 5 fundraising NaN 150 14 BF Clothing Seni 1er Jumeau is married and father of four ... to buy 50 handbags and a dozen pairs of shoes.
232 1562341 900 6 fundraising NaN 150 14 BF Food Mr. LASSANE is the married father of two child... to pay for sugar, oil, and rice for his grocer...
242 1559166 1325 7 fundraising NaN 175 14 BF Retail Mr Ablace is married. He is the father of 2 ch... to buy 20 dozen tires and 10 dozen inner tubes.
275 1558451 1800 36 fundraising NaN 1150 12 BF Clothing Ramata is 50 years old, married, and the mothe... buy clothing.
280 1559148 900 11 fundraising NaN 325 14 BF Clothing Mr. Andre is unmarried. He has four dependents... to buy six cartons of shoes.
295 1556364 2400 25 fundraising NaN 650 14 BF Services Cécile is thirty years old and single. She is ... to buy beauty products and hair extensions to ...
601 1564730 550 6 fundraising NaN 150 15 CM Food Mireille is a 41 year old woman, engaged, moth... to buy cutlery and food items.
604 1565401 550 7 fundraising NaN 175 15 CM Services Claudine, 29, is a young woman who is engaged ... to buy hair extensions and cosmetics.
607 1566174 375 3 fundraising NaN 75 15 CM Services Gilles Brice is thirty-five years old, single ... to buy an audio visual projector.
609 1566286 550 1 fundraising NaN 25 15 CM Food Mr. Louis Marie is a 52-year-old man. He is wi... to buy merchandise.
610 1566835 1075 33 fundraising NaN 850 25 CM Services Flourence is separated with five children. The... to pay for rent, medicine, and other items.
613 1566985 375 2 fundraising NaN 50 15 CM Food Jeanette is a 36-year-old woman, married and t... to buy sacks of rice, cans of oil and the smal...
615 1568720 450 3 fundraising NaN 75 15 CM Retail Brigitte is a young woman, 34 years old, marri... to buy some perfume and some clothes.
616 1568869 625 0 fundraising NaN 0 15 CM Retail Adamou is a 51-year-old engaged man, father of... to buy charcoal stoves.
617 1569382 225 0 fundraising NaN 0 15 CM Food Miree is a fifty-five-year-old woman and the m... to buy a lot of fruits.
618 1569487 900 1 fundraising NaN 25 15 CM Wholesale Viviane is a thirty-seven-year-old married wom... to buy dry foodstuffs wholesale.
619 1569514 900 0 fundraising NaN 0 15 CM Clothing Audrey, 24, is a young single woman with no ch... to buy baby products (clothes, lotions, diaper...
620 1569568 450 0 fundraising NaN 0 15 CM Food Hubert is a 40-year-old man engaged to be marr... to pay for eggs.
633 1565433 450 6 fundraising NaN 150 20 CM Agriculture Nelson is married with three children. He and ... to buy farm inputs, manure, seedlings and pest...
638 1565481 450 4 fundraising NaN 100 20 CM Retail He is a widower and father of two who live wit... to pay rent, purchase tea flasks, tins of oval...
647 1561721 550 12 fundraising NaN 375 19 CM Agriculture He is married with six children. They live in ... to pay for clearing, tilling, seedlings and fe...
... ... ... ... ... ... ... ... ... ... ... ...
5851 1572453 1150 0 fundraising NaN 0 9 ZW Services Eustina has close to 15 years experience in ha... to purchase more weaves and wigs.
5853 1572465 1200 0 fundraising NaN 0 9 ZW Clothing Betty is a 60-year-old married grandmother and... to restock more jersey materials from town and...
5854 1572470 1100 0 fundraising NaN 0 9 ZW Services Ruramai has been a tailor since 2005 so she no... to buy clothing materials.
5855 1572488 1000 2 fundraising NaN 50 9 ZW Services -Together, everyone achieves more' is the mott... to buy bulk braids and weaves and some hair ch...
5856 1572515 1250 0 fundraising NaN 0 9 ZW Retail Yvonne’s flea market is well known because of ... to restock fish, blankets, comforters, and win...
5857 1572519 1300 0 fundraising NaN 0 8 ZW Agriculture Gladys comes from Idube Farm, an area in centr... to buy a batch of 100 birds and feed for her c...
5858 1572522 1250 3 fundraising NaN 75 9 ZW Health Abigirl has gathered great knowledge and ideas... to buy more herbs.
5862 1570622 500 5 fundraising NaN 125 14 ZW Clothing Tapiwa is a 24-year-old entrepreneur who lives... to buy more clothes.
5864 1570646 1200 7 fundraising NaN 175 9 ZW Food Having realized the existence of a niche marke... to buy more bakery ingredients in bulk.
5865 1570653 1100 0 fundraising NaN 0 9 ZW Retail Tadzeyi is a 42-year-old married mother of fou... to restock more groceries in bulk and to resto...
5867 1570659 1000 0 fundraising NaN 0 9 ZW Clothing At the age of 50, Hazvinei still feels that sh... to restock more bedding products and clothes f...
5868 1570661 1700 0 fundraising NaN 0 9 ZW Agriculture Ndaizivei is part of the Billionaires Group, w... to restock more feed for the chicks and add an...
5870 1570741 1150 0 fundraising NaN 0 9 ZW Clothing Residing in Stoneridge Park, Harare is a 38-ye... to restock clothes and blankets for her flea m...
5872 1571452 1000 5 fundraising NaN 125 8 ZW Food Comfort is a male aged 35 years old living in ... to purchase additional stock of drinks, cookin...
5874 1571492 900 10 fundraising NaN 250 8 ZW Clothing Revai is a female, aged 45 years and living in... to purchase additional additional stock of sc...
5876 1571543 600 13 fundraising NaN 325 8 ZW Services Kudzanayi is a 49-year-old woman living in Har... to purchase additional stocks of material to f...
5877 1571571 3000 7 fundraising NaN 250 8 ZW Retail Samuel is a married man aged 44 years with thr... to buy more grocery and footwear stocks.
5879 1570671 1200 2 fundraising NaN 50 9 ZW Retail Miriam is a 34-year-old mother of two school-g... to purchase more mobile phones with different ...
5880 1570686 2800 44 fundraising NaN 1225 8 ZW Agriculture Toriya raises poultry, keeping several types o... to purchase an incubator which allows her to h...
5882 1570692 1000 2 fundraising NaN 50 9 ZW Retail Sarah is a hardworking and determined lady age... to buy vegetables, bananas, fruits, potatoes a...
5883 1570698 1000 0 fundraising NaN 0 9 ZW Agriculture Evernice is a 43 year old businesslady residin... to buy hair chemicals and a water pump.
5885 1570706 1800 4 fundraising NaN 100 8 ZW Retail Dudzai is part of the Cheetah group, which con... to restock groceries for her tuck shop.
5887 1570721 1600 6 fundraising NaN 150 8 ZW Agriculture 32-year-old Moblin lives in Kasimbwi village. ... to add another batch of 100 birds.
5890 1570726 1400 0 fundraising NaN 0 9 ZW Clothing Tatenda is a young single mother of three chil... to restock cosmetics, weaves and clothes.
5892 1570730 1250 6 fundraising NaN 150 8 ZW Clothing With her ailing parents in mind, Grace could n... to expand her business by restocking warm clot...
5895 1570733 1000 0 fundraising NaN 0 9 ZW Food Increasing business value has always been a de... to restock kapenta, charcoal and paraffin from...
5897 1570738 1000 14 fundraising NaN 400 9 ZW Retail Sarudzai is one of two business-minded ladies ... to add electricity tokens, eggs, fruits, veget...
5898 1570739 200 2 fundraising NaN 50 14 ZW Clothing Kupukai is a 38-year-old entrepreneur, who liv... to buy a variety of clothes.
5915 1570649 2550 6 fundraising NaN 150 8 ZW Clothing Elizabeth is a 45-year-old married mother of 3... to buy more clothes, shoes, and bags to expand...
5918 1570687 1000 4 fundraising NaN 100 8 ZW Clothing Chihoro is a group with four lively women, who... to restock a variety of shoes and clothes to a...

841 rows × 11 columns


In [12]:
df[(df['status'] == 'fundraising')|(df['status'] == 'expired')]


Out[12]:
id_number loan_amount lender_count status funded_date funded_amount repayment_term location_country_code sector description use
210 1567810 950 15 fundraising NaN 750 8 BF Retail The group is made up of 3 members of whom the ... to buy some suitcases in quantity for resale.
213 1567905 2950 2 fundraising NaN 50 14 BF Food Fhatimatoun is 35 years old, married and mothe... to buy sacks of cereals and charcoal to sell.
214 1567938 2600 3 fundraising NaN 75 14 BF Food Matagara is 48 years old, married, and is the ... who wants to buy powdered milk and other ingre...
217 1568867 1875 4 fundraising NaN 100 10 BF Food Tewende Group has just finished their first Ki... to buy sprouted millet for the preparation of ...
218 1568894 2425 26 fundraising NaN 1125 8 BF Food Kiswendsida has just finished its Kiva loan an... to buy wheat flour and oil for the fried food ...
219 1568898 900 6 fundraising NaN 175 8 BF Food The group named “Eben Ezer” just finished its ... to pay for rice and condiments for her restaur...
220 1569523 1800 6 fundraising NaN 175 8 BF Clothing The Sidzadba group just completed its [previou... to buy a large quantity of "pagnes" to resell.
222 1560770 1275 31 fundraising NaN 800 10 BF Food Zeneba is 52 yeas old, married and the mother ... to buy 5 bags of rice, 10 packages of spaghett...
230 1562208 900 19 fundraising NaN 500 14 BF Agriculture Daouda is married and the father of two childr... To buy a hundred chickens.
231 1562277 900 5 fundraising NaN 150 14 BF Clothing Seni 1er Jumeau is married and father of four ... to buy 50 handbags and a dozen pairs of shoes.
232 1562341 900 6 fundraising NaN 150 14 BF Food Mr. LASSANE is the married father of two child... to pay for sugar, oil, and rice for his grocer...
242 1559166 1325 7 fundraising NaN 175 14 BF Retail Mr Ablace is married. He is the father of 2 ch... to buy 20 dozen tires and 10 dozen inner tubes.
275 1558451 1800 36 fundraising NaN 1150 12 BF Clothing Ramata is 50 years old, married, and the mothe... buy clothing.
280 1559148 900 11 fundraising NaN 325 14 BF Clothing Mr. Andre is unmarried. He has four dependents... to buy six cartons of shoes.
295 1556364 2400 25 fundraising NaN 650 14 BF Services Cécile is thirty years old and single. She is ... to buy beauty products and hair extensions to ...
402 1231896 4750 54 expired NaN 3550 10 BI Food Evariste is part of the Mageyo group and lives... to increase his capital and purchase a large q...
423 1213488 4700 76 expired NaN 3000 8 BI Food Jacques is a member of the Mutumba Business Gr... to buy a bag of sugar, a bag of corn flour and...
430 1215845 2575 57 expired NaN 1775 8 BI Food Jeanine is part of the Twungubumwe group and l... to increase her capital in order to buy banana...
433 1218187 2900 37 expired NaN 1625 9 BI Food Elie belongs to the Kayago Groupe and lives in... to increase their capital and purchase 40 crat...
434 1218287 4400 40 expired NaN 1850 10 BI Food Laurent is a member of the Twijukire-Ibikorwa ... to bolster his capital and buy unripe bananas...
435 1220071 5375 53 expired NaN 1650 10 BI Food Marcien is a member of the group called Gitwe-... to grow his working capital and buy a pig to s...
436 1220107 2875 58 expired NaN 2450 10 BI Clothing Alexis is part of the Gitwe-Twitezimbere group... to increase his capital and buy clothing to re...
439 1231289 4375 45 expired NaN 2025 9 BI Food Isidore is a member of the Butanuka group and ... to buy palm oil for resale in order to earn more.
440 1231292 3425 51 expired NaN 2975 9 BI Retail Egide is part of the Yagurukundo group and liv... to increase their capital and buy rice, beans,...
443 1181213 3350 58 expired NaN 2050 9 BI Retail Claude is a member of the Giriyuja-Muyira grou... to buy a large supply of beans, cassava flour,...
445 1192075 3750 23 expired NaN 2500 10 BI Food Claver is a member of the Twibukanye group and... to buy a large amount of beer and lemonade to ...
450 1199327 3900 52 expired NaN 2725 9 BI Food Radjabu belongs to the Senga group and lives i... to increase his capital and buy a large quanti...
457 1207767 4025 69 expired NaN 3775 14 BI Clothing Odette is a member of the Tubandanye group and... to increase her capital and buy clothes for re...
458 1207824 4125 38 expired NaN 1500 14 BI Retail Emmanuel is a member of Kundayehova Group and ... to increase his capital and purchase medicines...
460 1208874 3850 36 expired NaN 3125 10 BI Food Eric is a member of the Turavyiyemeje group an... to increase his working capital and to buy avo...
... ... ... ... ... ... ... ... ... ... ... ...
5851 1572453 1150 0 fundraising NaN 0 9 ZW Services Eustina has close to 15 years experience in ha... to purchase more weaves and wigs.
5853 1572465 1200 0 fundraising NaN 0 9 ZW Clothing Betty is a 60-year-old married grandmother and... to restock more jersey materials from town and...
5854 1572470 1100 0 fundraising NaN 0 9 ZW Services Ruramai has been a tailor since 2005 so she no... to buy clothing materials.
5855 1572488 1000 2 fundraising NaN 50 9 ZW Services -Together, everyone achieves more' is the mott... to buy bulk braids and weaves and some hair ch...
5856 1572515 1250 0 fundraising NaN 0 9 ZW Retail Yvonne’s flea market is well known because of ... to restock fish, blankets, comforters, and win...
5857 1572519 1300 0 fundraising NaN 0 8 ZW Agriculture Gladys comes from Idube Farm, an area in centr... to buy a batch of 100 birds and feed for her c...
5858 1572522 1250 3 fundraising NaN 75 9 ZW Health Abigirl has gathered great knowledge and ideas... to buy more herbs.
5862 1570622 500 5 fundraising NaN 125 14 ZW Clothing Tapiwa is a 24-year-old entrepreneur who lives... to buy more clothes.
5864 1570646 1200 7 fundraising NaN 175 9 ZW Food Having realized the existence of a niche marke... to buy more bakery ingredients in bulk.
5865 1570653 1100 0 fundraising NaN 0 9 ZW Retail Tadzeyi is a 42-year-old married mother of fou... to restock more groceries in bulk and to resto...
5867 1570659 1000 0 fundraising NaN 0 9 ZW Clothing At the age of 50, Hazvinei still feels that sh... to restock more bedding products and clothes f...
5868 1570661 1700 0 fundraising NaN 0 9 ZW Agriculture Ndaizivei is part of the Billionaires Group, w... to restock more feed for the chicks and add an...
5870 1570741 1150 0 fundraising NaN 0 9 ZW Clothing Residing in Stoneridge Park, Harare is a 38-ye... to restock clothes and blankets for her flea m...
5872 1571452 1000 5 fundraising NaN 125 8 ZW Food Comfort is a male aged 35 years old living in ... to purchase additional stock of drinks, cookin...
5874 1571492 900 10 fundraising NaN 250 8 ZW Clothing Revai is a female, aged 45 years and living in... to purchase additional additional stock of sc...
5876 1571543 600 13 fundraising NaN 325 8 ZW Services Kudzanayi is a 49-year-old woman living in Har... to purchase additional stocks of material to f...
5877 1571571 3000 7 fundraising NaN 250 8 ZW Retail Samuel is a married man aged 44 years with thr... to buy more grocery and footwear stocks.
5879 1570671 1200 2 fundraising NaN 50 9 ZW Retail Miriam is a 34-year-old mother of two school-g... to purchase more mobile phones with different ...
5880 1570686 2800 44 fundraising NaN 1225 8 ZW Agriculture Toriya raises poultry, keeping several types o... to purchase an incubator which allows her to h...
5882 1570692 1000 2 fundraising NaN 50 9 ZW Retail Sarah is a hardworking and determined lady age... to buy vegetables, bananas, fruits, potatoes a...
5883 1570698 1000 0 fundraising NaN 0 9 ZW Agriculture Evernice is a 43 year old businesslady residin... to buy hair chemicals and a water pump.
5885 1570706 1800 4 fundraising NaN 100 8 ZW Retail Dudzai is part of the Cheetah group, which con... to restock groceries for her tuck shop.
5887 1570721 1600 6 fundraising NaN 150 8 ZW Agriculture 32-year-old Moblin lives in Kasimbwi village. ... to add another batch of 100 birds.
5890 1570726 1400 0 fundraising NaN 0 9 ZW Clothing Tatenda is a young single mother of three chil... to restock cosmetics, weaves and clothes.
5892 1570730 1250 6 fundraising NaN 150 8 ZW Clothing With her ailing parents in mind, Grace could n... to expand her business by restocking warm clot...
5895 1570733 1000 0 fundraising NaN 0 9 ZW Food Increasing business value has always been a de... to restock kapenta, charcoal and paraffin from...
5897 1570738 1000 14 fundraising NaN 400 9 ZW Retail Sarudzai is one of two business-minded ladies ... to add electricity tokens, eggs, fruits, veget...
5898 1570739 200 2 fundraising NaN 50 14 ZW Clothing Kupukai is a 38-year-old entrepreneur, who liv... to buy a variety of clothes.
5915 1570649 2550 6 fundraising NaN 150 8 ZW Clothing Elizabeth is a 45-year-old married mother of 3... to buy more clothes, shoes, and bags to expand...
5918 1570687 1000 4 fundraising NaN 100 8 ZW Clothing Chihoro is a group with four lively women, who... to restock a variety of shoes and clothes to a...

937 rows × 11 columns


In [13]:
df.groupby(['sector'])['loan_amount'].sum().reset_index()


Out[13]:
sector loan_amount
0 Agriculture 1518025
1 Arts 65025
2 Clothing 940975
3 Construction 143950
4 Education 215850
5 Food 2973075
6 Health 263575
7 Housing 72800
8 Manufacturing 43200
9 Personal Use 170025
10 Retail 2049150
11 Services 465400
12 Transportation 62750
13 Wholesale 38750

Other Resources

Exploratory data analysis visualization techniques

Scatter Plots


In [14]:
# visualize relationship between two continuous features
numeric_vars = df.select_dtypes(include=[np.number]).columns.tolist()
for variable in numeric_vars:
    print(variable)


id_number
loan_amount
lender_count
funded_amount
repayment_term

In [15]:
ax = sns.regplot(x='lender_count', y='loan_amount',data=df)



In [16]:
sns.pairplot(df, hue='status', vars=['loan_amount','funded_amount'], dropna=True, diag_kind='hist');


Histograms


In [17]:
# distribution of values for a single variable
sns.distplot(df['loan_amount'].dropna(axis=0));



In [18]:
small_loans_df = df[(df['loan_amount']<5000)]
sns.distplot(small_loans_df['loan_amount']);


Bar Plot


In [19]:
# understanding how categorical groups are different with respect to a continous variable
p = sns.barplot(x='sector', y='loan_amount', data=df, estimator=np.mean);
p.set(title='Average loan amount by sector')
p.set_xticklabels(p.get_xticklabels(),rotation=-45);



In [20]:
p = sns.barplot(x='sector', y='loan_amount', data=df, estimator=np.sum);
p.set(title='Total loan amount by sector')
p.set_xticklabels(p.get_xticklabels(),rotation=-45);


Box Plots


In [21]:
# describes distribution of data based on min, max, median, first & third quartile
p = sns.boxplot(x='sector',
                y='loan_amount',
                data = df[(df['loan_amount']<50000)]);
p.set(title = 'Loan amounts across sectors');
p.set_xticklabels(p.get_xticklabels(),rotation=-45);


Time series


In [22]:
# useful to see how variable changes over time
time_column = 'funded_date'
df[time_column] = pd.to_datetime(df[time_column])

# resample to monthly intervals, taking mean of y variable
time_data = df.resample('M', on=time_column)['loan_amount'].mean().fillna(0)
fig, ax = plt.subplots(figsize=(15,18))
ax.plot(time_data)
plt.title('Mean loan_amount over time');


/Users/Jenny/anaconda3/lib/python3.7/site-packages/pandas/plotting/_converter.py:129: FutureWarning: Using an implicitly registered datetime converter for a matplotlib plotting method. The converter was registered by pandas on import. Future versions of pandas will require you to explicitly register matplotlib converters.

To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()
  warnings.warn(msg, FutureWarning)

In [23]:
# resample to monthly intervals, taking mean of y variable
time_data = df.resample('7D', on=time_column)['loan_amount'].mean().fillna(0)
fig, ax = plt.subplots(figsize=(15,18))
ax.plot(time_data)
plt.title('Mean loan_amount over time');


Feature Engineering

Feature pruning


In [24]:
# removing features that have 0 variation

for col in df.columns:
    if df[col].unique().size==0:
        print("Dropping column: {0}".format(col))
        df = df.drop(col,axis=1)

Temporal features


In [25]:
# raw datetimes may not be useful but many algos can find patterns if they are informed which obs occur in a given year (weekend vs. weekday, holidays etc)

columns = []
for col in [c for c in df.columns if "_date" in c]:
    df[col] = pd.to_datetime(df[col])

In [26]:
# .dt.accessor enables easy construction of additional features based off of datetimes

df_test = df
#df_test['posted_year'] = df_test['posted_date'].dt.year
#df_test['posted_month'] = df_test['posted_date'].dt.month

#df_test['time_to_fund'] = df_test['funded_date'] - df_test['posted_date']
#df_test['days_to_fund'] = df_test['time_to_fund'].dt.days

One-hot encoding


In [27]:
# converting categorical or string data into a binary
pd.get_dummies(df.sector).head(n=2)


Out[27]:
Agriculture Arts Clothing Construction Education Food Health Housing Manufacturing Personal Use Retail Services Transportation Wholesale
0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
1 0 0 0 0 0 1 0 0 0 0 0 0 0 0

Extracting features from strings


In [28]:
#df.get_dtype_counts()
df.select_dtypes(include=[object])


Out[28]:
status location_country_code sector description use
0 funded BJ Retail NaN NaN
1 funded BJ Food NaN NaN
2 funded BJ Services NaN NaN
3 funded BJ Retail NaN NaN
4 funded BJ Retail NaN NaN
5 funded BJ Retail NaN NaN
6 funded BJ Retail NaN NaN
7 funded BJ Services NaN NaN
8 funded BJ Retail NaN NaN
9 funded BJ Food NaN NaN
10 funded BJ Services NaN NaN
11 funded BJ Food NaN NaN
12 funded BJ Retail NaN NaN
13 funded BJ Retail NaN NaN
14 funded BJ Clothing NaN NaN
15 funded BJ Retail NaN NaN
16 funded BJ Retail NaN NaN
17 funded BJ Arts Ahmed Baba is a Beninese artisan who specializ... to invest in a bulk purchase of raw materials ...
18 funded BJ Agriculture <a href="http://agpowerbenin.com/"> Tolaro Glo... to add value and jobs to the local economy by ...
19 funded BJ Agriculture In February 2017, MCE made a 12-month loan of ... to promote the growth of the business by trans...
20 funded BJ Retail NaN NaN
21 funded BJ Retail NaN NaN
22 funded BJ Services NaN NaN
23 funded BJ Services NaN NaN
24 funded BJ Retail NaN NaN
25 funded BJ Retail NaN NaN
26 funded BJ Services NaN NaN
27 funded BJ Retail NaN NaN
28 funded BJ Retail NaN NaN
29 funded BJ Retail NaN NaN
... ... ... ... ... ...
5989 funded ZW Food Fidelity is a 26-year-old entrepreneur who liv... to start her own grocery shop.
5990 funded ZW Retail Anna is an 18-year-old entrepreneur who lives ... to start her own grocery shop, selling sweets,...
5991 funded ZW Retail Beverly is a 21-year-old entrepreneur who live... to purchase more groceries to sell and expand ...
5992 funded ZW Food Nancy is an 18-year-old entrepreneur who lives... to buy grocery products for her business.
5993 funded ZW Food Petronella is a 24-year-old entrepreneur who l... to buy grocery products for her business.
5994 funded ZW Retail Princess is a 20-year-old entrepreneur who liv... to purchase hair products.
5995 funded ZW Retail Edith is a 23-year-old entrepreneur who lives ... to start her own grocery shop selling differen...
5996 funded ZW Clothing Mary is a 24-year-old entrepreneur who lives w... to start her own clothing shop.
5997 funded ZW Food Nozinhle is a 25-year-old entrepreneur who liv... to buy groceries and hair products to resell.
5998 funded ZW Agriculture Sinanisiwe is a 30-year-old entrepreneur who l... to buy more stock, chicks and chick feed.
5999 funded ZW Food Mary is an 18-year-old entrepreneur who lives ... to increase the size and variety of her stock.
6000 funded ZW Manufacturing Rosemary is an 18-year-old entrepreneur who li... to start her own business manufacturing deterg...
6001 funded ZW Agriculture Sharon is a 17-year-old entrepreneur who lives... to buy chicks and chick feed.
6002 funded ZW Agriculture Loveness is a 29-year-old entrepreneur who liv... to start her own business growing crops to sel...
6003 funded ZW Clothing Joyce is a 25-year-old entrepreneur who lives ... to buy more clothes for her business.
6004 funded ZW Agriculture Shebba is a 22-year-old entrepreneur who lives... to start her own business of rearing chickens.
6005 funded ZW Agriculture Priscilar is an 18-year-old entrepreneur who l... to buy chicks and chick feed for her business.
6006 funded ZW Agriculture Delayed is a 19-year-old entrepreneur who live... to start her own business rearing chickens.
6007 funded ZW Agriculture Matilda is a 20-year-old entrepreneur who live... to start her own poultry business keeping broi...
6008 funded ZW Clothing Belinda is a 23-year-old entrepreneur who live... to start her own business selling clothes.
6009 funded ZW Agriculture Nyasha is a 22-year-old entrepreneur who lives... to start her own business rearing chickens.
6010 funded ZW Agriculture Lydia is a 19-year-old entrepreneur who lives ... to start her own poultry business keeping broi...
6011 funded ZW Food Linda is an 18-year-old entrepreneur who lives... to start a grocery business.
6012 funded ZW Retail Clara is a 19-year-old entrepreneur who lives ... to start her own grocery shop, where she will ...
6013 funded ZW Clothing Molline is a 34-year-old entrepreneur who live... to start a clothing and accessories retail bus...
6014 funded ZW Food Sethukelo is a 19-year-old entrepreneur who li... to purchase goods for starting a grocery store.
6015 funded ZW Food Hlanjiwe is a 20-year-old entrepreneur who liv... to buy grocery goods for her business.
6016 funded ZW Clothing Lebuhani is a 21-year-old entrepreneur who liv... to buy clothes for her business.
6017 funded ZW Food Jacqueline is a 23-year-old entrepreneur who l... her to buy goods to sell in her store.
6018 funded ZW Food Delligent is a 23-year-old entrepreneur who li... to buy grocery goods for her business.

6019 rows × 5 columns


In [29]:
df['description_length'] = df.description.str.len()
df['description_length'].tail()


Out[29]:
6014    660.0
6015    588.0
6016    614.0
6017    591.0
6018    592.0
Name: description_length, dtype: float64

Feature Scaling

Three common feature scaling techniques (depends on algo!)

  • normalization (rescale from 0-1)

X_norm = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))

X_rescaled = X_norm * (max - min) + min


In [30]:
min_max_scaler = preprocessing.MinMaxScaler()
normalized = min_max_scaler.fit_transform(df['loan_amount'].astype(np.float64).values.reshape(-1,1))[:,0]
print("Pre Scaling\tMin: {0}\t\tMax: {1}\tMean: {2:3f}".format(df['loan_amount'].min(),df['loan_amount'].max(),df['loan_amount'].mean()))
print("Post Scaling\tMin: {0}\tMax: {1:.3f}\tMean: {2:3f}".format(np.min(normalized),np.max(normalized),np.mean(normalized)))


Pre Scaling	Min: 50		Max: 80000	Mean: 1499.011464
Post Scaling	Min: 0.0	Max: 1.000	Mean: 0.018124
  • standardization (scales data so it has 0 variance)

(X - mean) / standard deviation


In [31]:
standardized = preprocessing.scale(df['loan_amount'].astype(np.float64))
print("Post Scaling\tMin: {0:3f}\t Max: {1:3f}\tMean: {2:3f}".format(np.min(standardized),np.max(standardized),np.mean(standardized)))


Post Scaling	Min: -0.576765	 Max: 31.246555	Mean: -0.000000
  • log-transformation

Used to fix skewed distributions


In [32]:
# find features with a skewed distribution

plt.hist(df['loan_amount'])
plt.show()
plt.hist(np.log(df['loan_amount']))
plt.show()


Out[32]:
(array([5.944e+03, 6.900e+01, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        5.000e+00, 0.000e+00, 0.000e+00, 1.000e+00]),
 array([5.0000e+01, 8.0450e+03, 1.6040e+04, 2.4035e+04, 3.2030e+04,
        4.0025e+04, 4.8020e+04, 5.6015e+04, 6.4010e+04, 7.2005e+04,
        8.0000e+04]),
 <a list of 10 Patch objects>)
Out[32]:
(array([ 190.,  772., 1293., 1479.,  880.,  792.,  572.,   35.,    0.,
           6.]),
 array([ 3.91202301,  4.6497989 ,  5.38757479,  6.12535068,  6.86312657,
         7.60090246,  8.33867835,  9.07645424,  9.81423013, 10.55200602,
        11.28978191]),
 <a list of 10 Patch objects>)

Data imputation / cleaning


In [33]:
# missing data can be imformative but can also prevent algos from training
# we can impute missing data with column's mean

df.get_dtype_counts()


Out[33]:
int64                  5
object                 5
datetime64[ns, UTC]    1
float64                1
dtype: int64

In [34]:
df.dtypes
df['funded_date'] = df['funded_date'].dt.tz_convert(None)
time_columns = df.select_dtypes(include=['datetime64','timedelta64']).columns
str_columns = df.select_dtypes(include=[object]).columns
numeric_columns = df.select_dtypes(exclude=[object,'datetime64','timedelta64']).columns


Out[34]:
id_number                              int64
loan_amount                            int64
lender_count                           int64
status                                object
funded_date              datetime64[ns, UTC]
funded_amount                          int64
repayment_term                         int64
location_country_code                 object
sector                                object
description                           object
use                                   object
description_length                   float64
dtype: object

In [35]:
df[time_columns].isnull().sum()[df[time_columns].isnull().sum()>0]


Out[35]:
funded_date    937
dtype: int64

In [36]:
df[str_columns].isnull().sum()[df[str_columns].isnull().sum()>0]


Out[36]:
location_country_code     17
description              342
use                      342
dtype: int64

In [37]:
df[numeric_columns].isnull().sum()[df[numeric_columns].isnull().sum()>0]


Out[37]:
description_length    342
dtype: int64

In [38]:
# is there a systematic difference between null and non null values?

df[df['funded_date'].isnull()].describe()


Out[38]:
id_number loan_amount lender_count funded_amount repayment_term description_length
count 9.370000e+02 937.000000 937.000000 937.000000 937.000000 937.000000
mean 1.553091e+06 1566.942369 13.839915 449.599787 12.324440 755.668090
std 6.970535e+04 1959.649153 24.887396 881.083386 4.269421 270.137706
min 1.035139e+06 100.000000 0.000000 0.000000 5.000000 209.000000
25% 1.563631e+06 425.000000 0.000000 0.000000 9.000000 572.000000
50% 1.568888e+06 725.000000 4.000000 100.000000 12.000000 711.000000
75% 1.571593e+06 1500.000000 14.000000 375.000000 14.000000 906.000000
max 1.573593e+06 10000.000000 196.000000 7450.000000 39.000000 1514.000000

In [39]:
df[~df['funded_date'].isnull()].describe()


Out[39]:
id_number loan_amount lender_count funded_amount repayment_term description_length
count 5.082000e+03 5082.000000 5082.000000 5082.000000 5082.000000 4740.000000
mean 1.324127e+06 1486.486619 39.684770 1486.486619 11.707202 738.523418
std 3.934281e+05 2601.625321 78.527144 2601.625321 9.746101 269.246968
min 1.377200e+04 50.000000 1.000000 50.000000 3.000000 74.000000
25% 1.138890e+06 275.000000 8.000000 275.000000 8.000000 553.000000
50% 1.541855e+06 600.000000 19.000000 600.000000 10.000000 696.000000
75% 1.563550e+06 1850.000000 45.000000 1850.000000 14.000000 894.250000
max 1.573032e+06 80000.000000 2665.000000 80000.000000 133.000000 3107.000000

In [40]:
# create columns indicating whether data is missing or not

for col in numeric_columns:
    df[col+'na_'] = pd.isnull(df[col])

In [41]:
# impute missing data with mean

df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

In [42]:
new_file_name = 'loans_additional_features.csv'

df.to_csv(path + new_file_name, index=False)

In [ ]: