In [1]:
from datetime import datetime
import dateutil.parser
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
# Display the output of every statement in a cell, not just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Tell pandas to display up to 80 columns, which keeps everything visible
pd.set_option('display.max_columns', 80)
pd.set_option('expand_frame_repr', True)
# Show figures in notebook
%matplotlib inline
# Plotting defaults
plt.rcParams['figure.figsize'] = (15,8)
sns.set()
sns.set(font_scale=1.5)
In [2]:
path = 'data/'
filename = 'loans.csv'
try:
    df = pd.read_csv(path + filename)
except FileNotFoundError:
    # If the data is not found locally, download it from GitHub, then read it
    import os
    os.system(f'git clone --single-branch --depth=1 https://github.com/DeltaAnalytics/machine_learning_for_good_data {path}')
    df = pd.read_csv(path + filename)
df.sample(n=2)
print(f'There are {df.shape[0]} observations and {df.shape[1]} features')
Out[2]:
In [3]:
df.columns.tolist()
df.dtypes
df.describe()
df[df.dtypes[df.dtypes == 'object'].index].describe()
Out[3]:
Out[3]:
Out[3]:
Out[3]:
In [4]:
def num_missing(x):
    return x.isnull().sum()

print('Missing values per column:')
print(df.apply(num_missing, axis=0).where(lambda x: x != 0).dropna())
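The same counts can be computed without `apply`; a minimal equivalent using pandas' vectorized null checks:
In [ ]:
# count nulls per column, then keep only the columns that have any
missing = df.isnull().sum()
missing[missing > 0]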
In [5]:
#specific column analysis for numerical types
print(len(df['lender_count']))
print(max(df['funded_amount']))
print(df['loan_amount'].mean())
In [6]:
#specific column analysis for categorical types
df['status'].value_counts()
Out[6]:
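Passing normalize=True to value_counts turns raw counts into proportions, which makes class imbalance easier to read:
In [ ]:
# share of loans in each status category
df['status'].value_counts(normalize=True)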
In [7]:
df[['loan_amount','funded_amount']].tail()
Out[7]:
In [8]:
# get the rows at positions 1-2 and the first five columns (iloc slices exclude the end)
df.iloc[1:3,:5]
Out[8]:
In [9]:
# get rows with index labels 2 through 4 (loc slices include both ends) and the columns status and lender_count
df.loc[2:4,["status","lender_count"]]
Out[9]:
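Note that `.loc` slices are inclusive of both endpoints while `.iloc` slices exclude the end, so the same bounds select different numbers of rows; a quick check, assuming the default RangeIndex:
In [ ]:
# .loc[2:4] -> labels 2, 3, 4 (three rows); .iloc[2:4] -> positions 2, 3 (two rows)
df.loc[2:4].shape
df.iloc[2:4].shape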
In [10]:
# select every row and every column (the whole frame)
df.iloc[:,:]
Out[10]:
In [11]:
df[df['status'] == 'fundraising']
Out[11]:
In [12]:
df[(df['status'] == 'fundraising')|(df['status'] == 'expired')]
Out[12]:
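Chaining `|` conditions gets verbose as the list of accepted values grows; `.isin()` is an equivalent, more compact filter:
In [ ]:
# same filter as above, written with isin
df[df['status'].isin(['fundraising', 'expired'])]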
In [13]:
df.groupby(['sector'])['loan_amount'].sum().reset_index()
Out[13]:
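groupby can also compute several aggregates in one pass via `.agg`; a sketch that reports the count, mean, and total per sector:
In [ ]:
# multiple summary statistics per sector at once
df.groupby('sector')['loan_amount'].agg(['count', 'mean', 'sum']).reset_index()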
In [14]:
# list the numeric features available for visualizing relationships between continuous variables
numeric_vars = df.select_dtypes(include=[np.number]).columns.tolist()
for variable in numeric_vars:
    print(variable)
In [15]:
ax = sns.regplot(x='lender_count', y='loan_amount',data=df)
In [16]:
sns.pairplot(df, hue='status', vars=['loan_amount','funded_amount'], dropna=True, diag_kind='hist');
In [17]:
# distribution of values for a single variable
# (distplot was removed from recent seaborn releases; histplot is its replacement)
sns.histplot(df['loan_amount'].dropna(axis=0), kde=True);
In [18]:
small_loans_df = df[(df['loan_amount']<5000)]
sns.histplot(small_loans_df['loan_amount'], kde=True);
In [19]:
# understanding how categorical groups differ with respect to a continuous variable
p = sns.barplot(x='sector', y='loan_amount', data=df, estimator=np.mean);
p.set(title='Average loan amount by sector')
p.set_xticklabels(p.get_xticklabels(),rotation=-45);
In [20]:
p = sns.barplot(x='sector', y='loan_amount', data=df, estimator=np.sum);
p.set(title='Total loan amount by sector')
p.set_xticklabels(p.get_xticklabels(),rotation=-45);
In [21]:
# box plots summarize a distribution: median, first & third quartiles, and the whisker range with outliers
p = sns.boxplot(x='sector',
                y='loan_amount',
                data=df[df['loan_amount'] < 50000]);
p.set(title = 'Loan amounts across sectors');
p.set_xticklabels(p.get_xticklabels(),rotation=-45);
In [22]:
# useful to see how a variable changes over time
time_column = 'funded_date'
df[time_column] = pd.to_datetime(df[time_column])
# resample to monthly intervals, taking mean of y variable
time_data = df.resample('M', on=time_column)['loan_amount'].mean().fillna(0)
fig, ax = plt.subplots(figsize=(15,18))
ax.plot(time_data)
plt.title('Mean loan_amount over time');
In [23]:
# resample to weekly (7-day) intervals, taking the mean of the y variable
time_data = df.resample('7D', on=time_column)['loan_amount'].mean().fillna(0)
fig, ax = plt.subplots(figsize=(15,18))
ax.plot(time_data)
plt.title('Mean loan_amount over time');
In [24]:
# remove features that have zero variation (at most one distinct value)
for col in df.columns:
    if df[col].unique().size <= 1:
        print("Dropping column: {0}".format(col))
        df = df.drop(col, axis=1)
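The same zero-variance screen can be written without an explicit loop; a sketch using `nunique`, which counts distinct values per column:
In [ ]:
# columns with at most one distinct value carry no information
constant_cols = df.columns[df.nunique(dropna=False) <= 1]
df = df.drop(columns=constant_cols)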
In [25]:
# raw datetimes are rarely useful as-is, but many algorithms can find patterns once told
# when an observation occurred (year, month, weekend vs. weekday, holidays, etc.)
for col in [c for c in df.columns if "_date" in c]:
    df[col] = pd.to_datetime(df[col])
In [26]:
# the .dt accessor enables easy construction of additional features from datetimes
df_test = df.copy()
#df_test['posted_year'] = df_test['posted_date'].dt.year
#df_test['posted_month'] = df_test['posted_date'].dt.month
#df_test['time_to_fund'] = df_test['funded_date'] - df_test['posted_date']
#df_test['days_to_fund'] = df_test['time_to_fund'].dt.days
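If you want the commented-out features above, a sketch applied to the copy; it assumes the frame has both posted_date and funded_date columns, parsed with compatible timezones:
In [ ]:
df_test['posted_year'] = df_test['posted_date'].dt.year
df_test['posted_month'] = df_test['posted_date'].dt.month
# timedelta between posting and funding, then its length in whole days
df_test['time_to_fund'] = df_test['funded_date'] - df_test['posted_date']
df_test['days_to_fund'] = df_test['time_to_fund'].dt.days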
In [27]:
# converting categorical or string data into binary indicator (dummy) columns
pd.get_dummies(df.sector).head(n=2)
Out[27]:
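get_dummies can also be applied to the whole frame, replacing the named columns and prefixing the new indicators:
In [ ]:
# encode sector inside the frame; new columns look like sector_Agriculture, ...
pd.get_dummies(df, columns=['sector'], prefix='sector').head(n=2)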
In [28]:
#df.dtypes.value_counts()
df.select_dtypes(include=[object])
Out[28]:
In [29]:
df['description_length'] = df.description.str.len()
df['description_length'].tail()
Out[29]:
In [30]:
min_max_scaler = preprocessing.MinMaxScaler()
normalized = min_max_scaler.fit_transform(df['loan_amount'].astype(np.float64).values.reshape(-1,1))[:,0]
print("Pre Scaling\tMin: {0}\t\tMax: {1}\tMean: {2:3f}".format(df['loan_amount'].min(),df['loan_amount'].max(),df['loan_amount'].mean()))
print("Post Scaling\tMin: {0}\tMax: {1:.3f}\tMean: {2:3f}".format(np.min(normalized),np.max(normalized),np.mean(normalized)))
Standardization rescales a feature to zero mean and unit variance: (X - mean) / standard deviation
In [31]:
standardized = preprocessing.scale(df['loan_amount'].astype(np.float64))
print("Post Scaling\tMin: {0:3f}\t Max: {1:3f}\tMean: {2:3f}".format(np.min(standardized),np.max(standardized),np.mean(standardized)))
A log transformation is often used to fix skewed distributions
In [32]:
# find features with a skewed distribution
plt.hist(df['loan_amount'])
plt.show()
plt.hist(np.log(df['loan_amount']))
plt.show()
Out[32]:
Out[32]:
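np.log is undefined at zero, so any zero loan amounts would produce -inf; np.log1p, which computes log(1 + x), is a safe variant when zeros may be present:
In [ ]:
# log(1 + x) handles zero amounts without producing -inf
plt.hist(np.log1p(df['loan_amount']))
plt.show()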
In [33]:
# missing data can be informative but can also prevent algorithms from training,
# so we impute missing values, e.g. with the column's mean
df.dtypes.value_counts()
Out[33]:
In [34]:
df.dtypes
# remove timezone info from funded_date (tz-aware columns have a different dtype,
# which would complicate the dtype selections below)
df['funded_date'] = df['funded_date'].dt.tz_convert(None)
time_columns = df.select_dtypes(include=['datetime64','timedelta64']).columns
str_columns = df.select_dtypes(include=[object]).columns
numeric_columns = df.select_dtypes(exclude=[object,'datetime64','timedelta64']).columns
Out[34]:
In [35]:
df[time_columns].isnull().sum()[df[time_columns].isnull().sum()>0]
Out[35]:
In [36]:
df[str_columns].isnull().sum()[df[str_columns].isnull().sum()>0]
Out[36]:
In [37]:
df[numeric_columns].isnull().sum()[df[numeric_columns].isnull().sum()>0]
Out[37]:
In [38]:
# is there a systematic difference between null and non-null values?
df[df['funded_date'].isnull()].describe()
Out[38]:
In [39]:
df[~df['funded_date'].isnull()].describe()
Out[39]:
In [40]:
# create columns indicating whether data is missing or not
for col in numeric_columns:
    df[col + '_na'] = pd.isnull(df[col])
In [41]:
# impute missing data with mean
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())
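The mean is sensitive to outliers such as very large loans; median imputation is a common, more robust alternative (a sketch, shown here without assigning back):
In [ ]:
# outlier-robust alternative: impute with the per-column median instead
imputed = df[numeric_columns].fillna(df[numeric_columns].median())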
In [42]:
new_file_name = 'loans_additional_features.csv'
df.to_csv(path + new_file_name, index=False)
In [ ]: