In [1]:
import matplotlib
import os
import sqlite3
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (16.0, 6.0)
plt.rcParams['legend.markerscale'] = 3
matplotlib.rcParams['font.size'] = 16.0
In [2]:
# Data directory is resolved relative to the notebook's current working
# directory (one level up, then into data/).
DIR = "".join((os.getcwd(), "/../data/"))
loan_csv_path = DIR + 'raw/loan.csv'
# Read the raw loan table and preview its first rows.
df = pd.read_csv(loan_csv_path)
df.head()
Out[2]:
In [3]:
# Histogram of the int_rate column, 25 bins.
ax = df['int_rate'].hist(bins=25)
ax.set(
    xlabel='Int. Rate',
    ylabel='Count',
    title='Interest Rates',
)
plt.show()
In [4]:
# Histogram of the loan_amnt column, 20 bins.
ax = df['loan_amnt'].hist(bins=20)
# The x-axis label used to be the single letter 'S', which says nothing about
# the axis. NOTE(review): presumably a mistyped '$' -- confirm the currency.
ax.set_xlabel('Loan Amount ($)')
ax.set_ylabel('Count')
ax.set_title('Loan Amount')
plt.show()
In [5]:
# Pie chart of how many loans fall into each grade; count() tallies the
# non-null id values per grade.
grade_counts = df.groupby('grade')['id'].count()
ax = grade_counts.plot.pie(figsize=(6, 6))
# Replace the default y-label (the series name) with a readable caption.
ax.set_ylabel('Loan Grades')
plt.show()
In [6]:
# Report the table size and how many rows contain missing data.
n_rows = len(df)
# any(axis=1) flags a row as soon as one of its columns is null. The previous
# code printed len(df.dropna()), which is the number of rows with *no* missing
# values -- the opposite of what the label announces.
n_rows_with_na = int(df.isnull().any(axis=1).sum())
# Single-argument print(...) is valid both as a Python 2 print statement and
# as a Python 3 function call; '%s %d' reproduces the separator the
# two-operand print statement used to emit.
print('%s %d' % ('Number of rows: ', n_rows))
print('%s %d' % ('Number of rows with at least 1 NA value: ', n_rows_with_na))
In [7]:
# Absolute pairwise correlations between the columns of df.
c = df.corr().abs()
# Single-argument print(...) works under both Python 2 and Python 3.
print('Most uncorrelated Variables')
# Flatten the matrix into one row per (column, column) pair, weakest first.
s = c.unstack()
so = pd.DataFrame(s.sort_values(kind="quicksort", ascending=True)).reset_index().dropna()
# Every pair appears twice in `so` ((A, B) and (B, A)). Keep the orientation
# whose first variable comes earlier in the matrix's column order. This also
# excludes self-pairs. The earlier approach de-duplicated on the coefficient
# value itself, which silently discarded unrelated pairs that happened to
# share the same coefficient.
column_position = dict((name, i) for i, name in enumerate(c.columns))
is_first_orientation = so['level_0'].map(column_position) < so['level_1'].map(column_position)
so[is_first_orientation].head(10)
Out[7]:
In [8]:
# Display the column labels of the loan frame loaded from raw/loan.csv.
df.columns
Out[8]:
In [9]:
# Six-column subset of df with display-friendly column names; these labels
# become the axis and legend text in the plots built from partial_df.
partial_df = df[
    ['grade', 'funded_amnt', 'installment', 'int_rate', 'term', 'pymnt_plan']
].rename(columns={
    'grade': 'Grade',
    'funded_amnt': 'Funding Amount',
    'installment': 'Installment',
    'int_rate': 'Interest Rate',
    'term': 'Term',
    'pymnt_plan': 'Payment Plan',
})
In [10]:
# Set the legend font size *before* building the grid: rc settings are read
# when artists are created, so setting it afterwards (as before) left this
# figure's legend untouched. The setting still carries over to later cells.
matplotlib.rc("legend", fontsize=20)
# Pairwise scatter/distribution grid of the numeric columns, coloured by Grade.
# NOTE(review): newer seaborn releases rename `size` to `height` -- adjust if
# the installed version rejects `size`.
ax = sns.pairplot(partial_df[['Grade', 'Funding Amount', 'Installment', 'Interest Rate']], hue="Grade", size=4)
# Tilt the x tick labels on every subplot. A preceding
# plt.xticks(rotation=45) call was dropped: it only touched the current axes
# and this loop immediately overrode it with 30 degrees.
for subplot in ax.axes.flat:
    for label in subplot.get_xticklabels():
        label.set_rotation(30)
plt.show()
In [ ]:
# Box plot of Interest Rate for each Grade, split by Term.
sns.boxplot(x="Grade", y="Interest Rate", hue='Term', data=partial_df)
# A stray bare `plt` expression statement (a no-op) was removed from here.
sns.despine(offset=0, trim=False)
In [ ]:
# Box plot of Interest Rate for each Payment Plan value, split by Term.
sns.boxplot(
    data=partial_df,
    x="Payment Plan",
    y="Interest Rate",
    hue='Term',
)
# Offset the remaining spines by 10 and trim them to the data range.
sns.despine(trim=True, offset=10)
In [ ]:
# Box plot of int_rate for each term value, drawn from the raw frame.
sns.boxplot(
    data=df,
    x="term",
    y="int_rate",
)
# Offset the remaining spines by 10 and trim them to the data range.
sns.despine(trim=True, offset=10)
In [ ]:
from scipy.stats import kendalltau

# Legend font size for figures created from this point on.
matplotlib.rc("legend", fontsize=14)
# Hex-binned joint plot of int_rate against funded_amnt; kendalltau is handed
# to seaborn as the statistic function.
sns.jointplot(
    x=df['int_rate'],
    y=df['funded_amnt'],
    kind="hex",
    stat_func=kendalltau,
    size=10,
)
In [ ]: