In [1]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
pd.options.display.max_rows = 125
Load loan data from CSV
In [24]:
loandata = pd.read_csv("data/loandata.csv", low_memory=False)
loandata = loandata[pd.to_datetime(loandata['ListedOnUTC']).dt.year > 2012]
First look on data:
In [25]:
loandata.head()
Out[25]:
ReportAsOfEOD
LoanId
LoanNumber
ListedOnUTC
BiddingStartedOn
BidsPortfolioManager
BidsApi
BidsManual
UserName
NewCreditCustomer
...
PreviousEarlyRepaymentsCountBeforeLoan
GracePeriodStart
GracePeriodEnd
NextPaymentDate
NextPaymentNr
NrOfScheduledPayments
ReScheduledOn
PrincipalDebtServicingCost
InterestAndPenaltyDebtServicingCost
ActiveLateLastPaymentCategory
0
2017-05-04
66AE108B-532B-4BB3-BAB7-0019A46412C1
483449
2016-03-23 16:07:19
2016-03-23 16:07:19
970
1150
5.0
BO965519
False
...
0
NaN
NaN
2017-05-12
13.0
60.0
NaN
0.0
0.0
NaN
1
2017-05-04
D152382E-A50D-46ED-8FF2-0053E0C86A70
378148
2015-06-25 11:02:28
2015-06-25 11:02:28
1295
0
1705.0
BOA9K172A
False
...
1
NaN
NaN
NaN
0.0
60.0
NaN
0.0
0.0
8-15
2
2017-05-04
87342E13-66CB-483F-833A-007953E50C78
451831
2016-01-14 10:00:21
2016-01-14 10:00:21
2700
565
5835.0
BO7971663
True
...
0
NaN
NaN
2017-05-22
16.0
60.0
NaN
0.0
0.0
31-60
3
2017-05-04
87227056-6BF9-410C-98D1-008F788E122A
349381
2015-03-24 15:55:44
2015-03-24 15:55:44
1115
0
385.0
BO76151K3
True
...
1
NaN
NaN
NaN
0.0
60.0
NaN
0.0
0.0
180+
4
2017-05-04
2DDE6336-E466-4624-A337-00A0ED1A1468
443082
2015-12-17 10:12:00
2015-12-17 10:12:00
305
0
785.0
BOK423A63
True
...
0
NaN
NaN
NaN
0.0
48.0
NaN
0.0
0.0
180+
5 rows × 112 columns
In [26]:
loandata.shape
Out[26]:
(32779, 112)
All the columns with types and number of unique values:
In [27]:
types = [loandata[c].dtype for c in loandata.columns]
uniques = [loandata[c].unique().size for c in loandata.columns]
nans = [loandata[c].isnull().sum() for c in loandata.columns]
# + ", " + str(loandata[i].unique().size))
columns = pd.DataFrame(data={'names': loandata.columns, 'types': types, 'uniques': uniques, 'nans': nans})
columns = columns.set_index('names').sort_values(by='nans')
columns.head(120)
Out[27]:
nans
types
uniques
names
ReportAsOfEOD
0
object
1
HomeOwnershipType
0
float64
10
IncomeFromPrincipalEmployer
0
float64
2735
IncomeFromPension
0
float64
1094
IncomeFromFamilyAllowance
0
float64
334
IncomeFromSocialWelfare
0
float64
392
IncomeFromLeavePay
0
float64
359
IncomeFromChildSupport
0
float64
189
IncomeOther
0
float64
462
IncomeTotal
0
float64
2940
ExistingLiabilities
0
int64
32
LiabilitiesTotal
0
float64
13069
RefinanceLiabilities
0
int64
21
DebtToIncome
0
float64
6744
FreeCash
0
float64
22985
OccupationArea
0
float64
20
MonthlyPaymentDay
0
int64
28
Status
0
object
3
Restructured
0
bool
2
PrincipalPaymentsMade
0
float64
19187
InterestAndPenaltyPaymentsMade
0
float64
24659
PrincipalWriteOffs
0
float64
164
InterestAndPenaltyWriteOffs
0
float64
386
PrincipalBalance
0
float64
21886
InterestAndPenaltyBalance
0
float64
12387
NoOfPreviousLoansBeforeLoan
0
int64
25
AmountOfPreviousLoansBeforeLoan
0
float64
1897
PreviousRepaymentsBeforeLoan
0
float64
10506
PreviousEarlyRepaymentsBefoleLoan
0
float64
348
PreviousEarlyRepaymentsCountBeforeLoan
0
int64
9
PrincipalDebtServicingCost
0
float64
126
InterestAndPenaltyDebtServicingCost
0
float64
3089
MaritalStatus
0
float64
5
ActiveScheduleFirstPaymentReached
0
bool
2
LoanApplicationStartedDate
0
object
32774
LoanId
0
object
32779
Education
0
float64
5
LoanNumber
0
int64
32779
ListedOnUTC
0
object
32749
BiddingStartedOn
0
object
32749
BidsPortfolioManager
0
int64
2215
MaturityDate_Last
0
object
1814
ApplicationSignedHour
0
int64
24
ApplicationSignedWeekday
0
int64
7
VerificationType
0
float64
4
LanguageCode
0
int64
13
Age
0
int64
56
Gender
0
float64
3
DateOfBirth
0
object
11682
AppliedAmount
0
float64
298
Amount
0
float64
1539
Interest
0
float64
4517
LoanDuration
0
int64
20
MaturityDate_Original
0
object
1736
FirstPaymentDate
0
object
929
BidsApi
0
int64
491
UseOfLoan
0
int64
9
BidsManual
0
float64
1135
UserName
0
object
22237
NewCreditCustomer
0
bool
2
Country
0
object
4
LoanDate
0
object
1493
WorkExperience
8
object
7
ProbabilityOfDefault
12
float64
7291
ModelVersion
12
float64
7
LossGivenDefault
12
float64
7
ExpectedLoss
12
float64
11309
ExpectedReturn
12
float64
13303
Rating
16
object
9
EmploymentStatus
53
float64
7
EmploymentPosition
126
object
2635
EmploymentDurationCurrentEmployer
274
object
8
City
396
object
4985
NrOfDependants
930
object
12
PlannedPrincipalTillDate
1674
float64
22723
PlannedInterestTillDate
1674
float64
28592
County
2675
object
906
LastPaymentOn
2940
object
1117
NextPaymentNr
6462
float64
52
MonthlyPayment
7061
float64
14515
PrincipalOverdueBySchedule
7167
float64
9856
Rating_V2
7643
object
9
NrOfScheduledPayments
8055
float64
64
CreditScoreEeMini
16835
float64
7
NextPaymentDate
17541
object
38
DebtOccuredOnForSecondary
19752
object
773
CurrentDebtDaysSecondary
19752
float64
773
EL_V1
19877
float64
1909
Rating_V1
19877
object
9
StageActiveSince
20457
object
7061
DebtOccuredOn
20709
object
737
CurrentDebtDaysPrimary
20709
float64
737
ActiveLateCategory
20724
object
10
RecoveryStage
20728
float64
4
WorseLateCategory
20953
object
10
ActiveLateLastPaymentCategory
21034
object
10
ContractEndDate
21627
object
1804
EAD1
21700
float64
9152
DefaultDate
21700
object
668
EAD2
21700
float64
9181
InterestRecovery
21750
float64
428
PrincipalRecovery
21750
float64
3922
PlannedInterestPostDefault
21750
float64
10295
PlannedPrincipalPostDefault
21750
float64
9920
ReScheduledOn
23809
object
989
CreditScoreEsEquifaxRisk
25523
object
7
CreditScoreEsMicroL
25523
object
11
CreditScoreFiAsiakasTietoRiskGrade
26472
object
7
Rating_V0
28210
object
9
EL_V0
28210
float64
1204
GracePeriodStart
28875
object
434
GracePeriodEnd
28875
object
583
In [33]:
loandata[loandata['PrincipalWriteOffs'] > 0][['PrincipalPaymentsMade', 'Status', 'InterestAndPenaltyPaymentsMade', 'Amount', 'Interest']]
Out[33]:
PrincipalPaymentsMade
Status
InterestAndPenaltyPaymentsMade
Amount
Interest
124
1184.65
Repaid
495.51
1455.0
57.09
176
2553.31
Repaid
599.17
3500.0
17.12
356
2346.45
Repaid
852.14
3190.0
47.22
766
1307.90
Repaid
331.71
1500.0
30.65
3826
334.17
Repaid
357.37
3000.0
28.00
4177
2432.03
Repaid
2850.04
3200.0
30.00
4298
382.28
Repaid
1014.40
5600.0
28.00
4362
2740.76
Repaid
474.56
3000.0
28.00
4385
714.87
Repaid
560.29
850.0
30.00
4483
234.86
Repaid
474.44
1700.0
30.00
4496
3606.00
Repaid
2158.95
4000.0
28.00
4497
3851.84
Repaid
784.97
4500.0
28.00
4755
8964.09
Repaid
2703.53
9000.0
25.00
4872
6338.79
Repaid
5164.43
7700.0
28.00
4913
1093.83
Repaid
544.15
1200.0
28.00
4934
2099.15
Repaid
180.33
2100.0
30.00
5090
1045.12
Repaid
384.76
1050.0
30.00
5268
5136.83
Repaid
2899.21
6100.0
25.00
5364
497.76
Repaid
107.76
500.0
20.00
5399
1643.61
Repaid
564.20
1700.0
28.00
5795
2529.12
Repaid
1256.79
2700.0
28.00
6045
2954.19
Repaid
788.15
3000.0
18.00
6166
4334.63
Repaid
371.11
5000.0
24.00
6479
1723.36
Repaid
272.87
2000.0
22.00
6615
1382.37
Repaid
539.84
1700.0
28.00
6978
1658.16
Repaid
70.84
1700.0
22.00
6979
1956.26
Repaid
1224.80
2000.0
33.00
7038
506.98
Repaid
317.36
1000.0
20.00
7211
3886.45
Repaid
937.45
4350.0
21.00
7374
1902.60
Repaid
958.15
2000.0
28.00
7405
1779.06
Repaid
814.16
3000.0
22.00
7712
5723.17
Repaid
2396.39
6000.0
33.00
7789
4306.03
Repaid
0.00
5000.0
22.00
8088
1911.81
Repaid
2175.65
9200.0
28.00
8111
9381.17
Repaid
950.45
10000.0
22.00
8147
7114.63
Repaid
1198.01
9100.0
22.00
8199
6207.30
Repaid
2009.08
6500.0
28.00
8319
421.64
Repaid
234.49
800.0
26.00
8431
6129.54
Repaid
2657.10
6500.0
31.00
8434
3410.78
Repaid
0.00
5000.0
22.00
8555
9559.64
Repaid
2558.53
10000.0
29.00
8563
1066.38
Repaid
279.82
1300.0
29.00
8589
1091.77
Repaid
298.92
2000.0
31.00
8679
5084.53
Repaid
4024.43
7000.0
26.00
8939
1044.22
Repaid
0.00
1100.0
41.00
9019
15.13
Repaid
0.00
3000.0
26.00
9041
919.70
Repaid
64.12
3000.0
22.00
9053
462.94
Repaid
236.20
500.0
33.00
9328
2013.11
Repaid
158.70
2500.0
31.00
9351
2647.94
Repaid
1360.44
5000.0
26.00
9407
1467.85
Repaid
0.00
1700.0
32.00
9454
2109.74
Repaid
776.40
2500.0
31.00
9499
1399.61
Repaid
0.00
1500.0
32.00
9702
2599.71
Repaid
117.08
3000.0
31.00
9876
680.67
Repaid
179.19
700.0
29.00
10034
446.02
Repaid
195.97
500.0
38.00
10047
374.21
Repaid
0.00
500.0
32.00
10191
1170.19
Repaid
356.15
1200.0
26.00
10342
2195.55
Repaid
252.04
3000.0
31.00
10360
1499.33
Repaid
0.00
2000.0
34.00
10398
1030.63
Repaid
559.27
1500.0
32.00
10515
1887.82
Repaid
842.00
1910.0
31.00
...
...
...
...
...
...
15864
7512.44
Repaid
3123.33
9800.0
29.71
15886
5377.63
Repaid
735.82
5525.0
23.44
15916
1825.56
Repaid
74.62
2000.0
30.56
16194
918.34
Repaid
45.26
1000.0
32.88
16195
459.80
Repaid
0.00
800.0
41.47
16404
690.33
Repaid
318.00
700.0
57.09
16464
1929.83
Repaid
173.52
2000.0
35.40
16778
191.24
Repaid
372.19
10000.0
21.62
16908
288.74
Repaid
598.98
10000.0
21.62
17029
0.00
Repaid
0.00
10000.0
21.62
17064
446.70
Repaid
15.15
500.0
20.44
17245
2141.40
Repaid
423.54
2455.0
40.40
17310
387.29
Repaid
199.98
500.0
51.62
17343
765.21
Repaid
346.10
1000.0
39.63
17381
874.49
Repaid
425.75
1100.0
52.08
17399
967.44
Repaid
37.76
1000.0
26.76
17526
422.50
Repaid
1003.85
10000.0
26.76
17618
488.93
Repaid
928.84
10000.0
21.62
17676
780.20
Repaid
451.39
1000.0
48.05
17683
198.33
Repaid
337.45
10000.0
20.22
17732
311.47
Repaid
565.81
10000.0
18.40
18355
2710.89
Repaid
934.57
3700.0
23.68
18421
892.84
Repaid
0.00
900.0
31.92
18505
1522.01
Repaid
280.34
2000.0
33.32
18702
2.00
Repaid
0.00
4625.0
23.68
18720
599.90
Repaid
168.24
800.0
28.98
18829
432.19
Repaid
0.00
500.0
17.09
18848
376.62
Repaid
32.31
530.0
17.69
19311
0.01
Repaid
0.00
3370.0
21.62
19336
3818.43
Repaid
0.00
5315.0
23.38
19422
2907.43
Repaid
0.00
3190.0
31.12
19629
3478.16
Repaid
1026.96
4250.0
32.98
19835
458.89
Repaid
86.80
530.0
41.43
20105
0.01
Repaid
0.00
10630.0
41.81
20312
0.01
Repaid
0.00
10630.0
30.44
20349
0.02
Repaid
0.00
10630.0
28.61
20464
0.01
Repaid
0.00
10630.0
34.53
20738
574.37
Repaid
108.23
740.0
36.36
20871
0.50
Repaid
0.00
2495.0
41.19
20898
459.00
Repaid
53.50
530.0
30.68
20938
437.06
Repaid
0.00
530.0
27.69
21189
856.29
Repaid
23.48
1275.0
20.46
21807
1535.94
Repaid
328.44
2125.0
42.30
21846
443.73
Repaid
0.00
530.0
35.09
21860
1562.12
Repaid
213.63
2125.0
36.86
21990
1763.63
Repaid
154.67
2020.0
32.53
22233
1489.58
Repaid
234.25
2075.0
37.66
22305
452.34
Repaid
0.00
530.0
37.21
22760
2581.77
Repaid
660.23
2895.0
66.65
22922
414.54
Repaid
16.74
530.0
33.98
23113
403.23
Repaid
41.90
530.0
38.99
23185
347.53
Repaid
38.75
530.0
20.73
23387
1876.71
Repaid
0.00
2130.0
75.03
23610
2.00
Repaid
0.00
3185.0
35.44
24497
398.10
Repaid
0.00
530.0
39.37
25280
5577.89
Repaid
0.00
6910.0
37.72
25575
396.50
Repaid
5.13
530.0
37.20
26313
942.72
Repaid
0.00
1275.0
28.36
27011
5654.68
Repaid
0.00
7970.0
27.22
35051
383.02
Repaid
22.76
530.0
27.55
35143
922.50
Repaid
535.52
1200.0
44.31
35215
479.69
Repaid
216.83
600.0
34.58
168 rows × 5 columns
In [29]:
loandata[['PrincipalPaymentsMade', 'Status', 'InterestAndPenaltyPaymentsMade', 'Amount', 'Interest']].head(30)
Out[29]:
PrincipalPaymentsMade
Status
InterestAndPenaltyPaymentsMade
Amount
Interest
0
268.62
Current
445.48
2125.0
20.97
1
421.44
Late
360.07
3000.0
17.12
2
1606.84
Late
1337.29
9100.0
13.67
3
65.32
Late
355.92
1500.0
40.40
4
0.01
Late
0.00
1090.0
68.39
5
0.04
Late
0.00
775.0
73.73
6
17.02
Current
294.52
635.0
42.66
7
1000.00
Repaid
403.89
1000.0
61.49
8
4000.00
Repaid
1567.36
4000.0
31.01
9
1251.38
Current
2140.94
5000.0
24.52
10
2000.00
Repaid
575.44
2000.0
17.11
11
530.00
Repaid
130.03
530.0
25.68
12
1156.71
Current
1749.47
5500.0
21.62
13
6900.00
Repaid
3.74
6900.0
21.63
14
264.37
Current
898.50
2655.0
26.88
15
217.78
Current
2010.16
3190.0
51.55
16
34.21
Late
299.79
530.0
50.82
17
49.28
Current
417.15
500.0
43.97
18
3720.00
Repaid
512.48
3720.0
26.94
19
52.72
Late
233.32
2500.0
32.58
20
1.00
Late
0.00
465.0
40.40
21
106.78
Late
445.10
3500.0
47.34
22
105.77
Late
202.26
3780.0
34.31
23
505.38
Current
846.91
2000.0
24.10
24
424.53
Current
1463.75
2500.0
44.99
25
31.80
Current
194.90
530.0
41.81
26
90.08
Current
455.58
1060.0
41.11
27
736.25
Current
1546.40
3000.0
28.36
28
249.75
Current
790.78
1595.0
43.25
29
1500.00
Repaid
204.25
1500.0
15.91
Analyse missing values:
In [6]:
missingvalues = (columns['nans'] / len(loandata.index) * 100)
missingvalues = missingvalues[missingvalues > 5].sort_values(ascending=False)
missingvalues.plot(kind='bar', figsize=(20, 2))
Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f59ea1efef0>
In [7]:
missingvalues
Out[7]:
names
GracePeriodEnd 88.961789
GracePeriodStart 88.961789
EL_V0 87.134289
Rating_V0 87.134289
CreditScoreFiAsiakasTietoRiskGrade 82.240306
CreditScoreEsMicroL 79.568046
CreditScoreEsEquifaxRisk 79.568046
ReScheduledOn 74.600850
InterestRecovery 66.372878
PrincipalRecovery 66.372878
PlannedPrincipalPostDefault 66.372878
PlannedInterestPostDefault 66.372878
DefaultDate 66.232084
EAD1 66.232084
EAD2 66.232084
ActiveLateLastPaymentCategory 65.716780
ActiveLateCategory 65.232450
DebtOccuredOn 65.190212
CurrentDebtDaysPrimary 65.190212
RecoveryStage 65.080393
StageActiveSince 64.263791
WorseLateCategory 63.799172
Rating_V1 63.613325
EL_V1 63.613325
CurrentDebtDaysSecondary 61.994199
DebtOccuredOnForSecondary 61.994199
ContractEndDate 61.980120
NextPaymentDate 57.052347
CreditScoreEeMini 55.047447
NrOfScheduledPayments 30.239631
Rating_V2 29.200574
MonthlyPayment 27.581449
PrincipalOverdueBySchedule 26.781742
NextPaymentNr 23.284431
LastPaymentOn 8.391293
Rating 7.687326
ModelVersion 7.676062
ExpectedLoss 7.676062
ProbabilityOfDefault 7.676062
ExpectedReturn 7.676062
LossGivenDefault 7.676062
County 7.583139
Name: nans, dtype: float64
Let's have a look at some of the columns:
In [8]:
loandata['yearmonth'] = pd.to_datetime(loandata['ListedOnUTC']).dt.to_period('M')
Number of loans in countries per YearMonth:
In [9]:
loandata.groupby(['yearmonth', 'Country']).size().unstack(1).sort_index(ascending=True).fillna(0).plot(figsize=(16, 5))
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f59e44da278>
In [10]:
r = pd.concat([loandata['yearmonth'], loandata[['Rating', 'Rating_V0', 'Rating_V1', 'Rating_V2']].notnull()], axis=1)
Different types of rating per YearMonth:
In [11]:
g = r.groupby('yearmonth').sum()
g = g[(g.T != 0).any()]
g.head()
Out[11]:
Rating
Rating_V0
Rating_V1
Rating_V2
yearmonth
2012-11
5.0
0.0
5.0
0.0
2012-12
15.0
0.0
15.0
7.0
2013-01
103.0
89.0
14.0
103.0
2013-02
87.0
78.0
9.0
87.0
2013-03
113.0
103.0
10.0
113.0
In [12]:
g.plot(figsize=(16,4))
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f59e408cf60>
In [ ]:
In [ ]:
In [ ]:
Content source: zczapran/datascienceintensive
Similar notebooks: