In [14]:
import pandas as pd
import numpy as np
import random
In [15]:
# Load the loan training set and preview a systematic sample: every 10th row.
data = pd.read_csv('../data/loan/train.csv')
# print() in call form runs unchanged on both Python 2 and Python 3.
print(len(data))
# Row positions 0, 10, 20, ... for DataFrame.take. np.arange always yields an
# integer array, even when the frame is empty.
sampler = np.arange(0, len(data), 10)
data.take(sampler)
614
Out[15]:
Loan_ID
Gender
Married
Dependents
Education
Self_Employed
ApplicantIncome
CoapplicantIncome
LoanAmount
Loan_Amount_Term
Credit_History
Property_Area
Loan_Status
0
LP001002
Male
No
0
Graduate
No
5849
0.0
NaN
360.0
1.0
Urban
Y
10
LP001024
Male
Yes
2
Graduate
No
3200
700.0
70.0
360.0
1.0
Urban
Y
20
LP001043
Male
Yes
0
Not Graduate
No
7660
0.0
104.0
360.0
0.0
Urban
N
30
LP001091
Male
Yes
1
Graduate
NaN
4166
3369.0
201.0
360.0
NaN
Urban
N
40
LP001119
Male
No
0
Graduate
No
3600
0.0
80.0
360.0
1.0
Urban
N
50
LP001155
Female
Yes
0
Not Graduate
No
1928
1644.0
100.0
360.0
1.0
Semiurban
Y
60
LP001205
Male
Yes
0
Graduate
No
2500
3796.0
120.0
360.0
1.0
Urban
Y
70
LP001243
Male
Yes
0
Graduate
No
3208
3066.0
172.0
360.0
1.0
Urban
Y
80
LP001265
Female
No
0
Graduate
No
3846
0.0
111.0
360.0
1.0
Semiurban
Y
90
LP001316
Male
Yes
0
Graduate
No
2958
2900.0
131.0
360.0
1.0
Semiurban
Y
100
LP001345
Male
Yes
2
Not Graduate
No
4288
3263.0
133.0
180.0
1.0
Urban
Y
110
LP001385
Male
No
0
Graduate
No
5316
0.0
136.0
360.0
1.0
Urban
Y
120
LP001426
Male
Yes
NaN
Graduate
No
5667
2667.0
180.0
360.0
1.0
Rural
Y
130
LP001469
Male
No
0
Graduate
Yes
20166
0.0
650.0
480.0
NaN
Urban
Y
140
LP001497
Male
Yes
2
Graduate
No
5042
2083.0
185.0
360.0
1.0
Rural
N
150
LP001528
Male
No
0
Graduate
No
6277
0.0
118.0
360.0
0.0
Rural
N
160
LP001560
Male
Yes
0
Not Graduate
No
1863
1041.0
98.0
360.0
1.0
Semiurban
Y
170
LP001581
Male
Yes
0
Not Graduate
NaN
1820
1769.0
95.0
360.0
1.0
Rural
Y
180
LP001633
Male
Yes
1
Graduate
No
6400
7250.0
180.0
360.0
0.0
Urban
N
190
LP001653
Male
No
0
Not Graduate
No
4885
0.0
48.0
360.0
1.0
Rural
Y
200
LP001674
Male
Yes
1
Not Graduate
No
2600
2500.0
90.0
360.0
1.0
Semiurban
Y
210
LP001708
Female
No
0
Graduate
No
10000
0.0
214.0
360.0
1.0
Semiurban
N
220
LP001736
Male
Yes
0
Graduate
No
2221
0.0
60.0
360.0
0.0
Urban
N
230
LP001765
Male
Yes
1
Graduate
No
2491
2054.0
104.0
360.0
1.0
Semiurban
Y
240
LP001798
Male
Yes
2
Graduate
No
5819
5000.0
120.0
360.0
1.0
Rural
Y
250
LP001835
Male
Yes
0
Not Graduate
No
1668
3890.0
201.0
360.0
0.0
Semiurban
N
260
LP001865
Male
Yes
1
Graduate
No
6083
4250.0
330.0
360.0
NaN
Urban
Y
270
LP001888
Female
No
0
Graduate
No
3237
0.0
30.0
360.0
1.0
Urban
Y
280
LP001910
Male
No
1
Not Graduate
Yes
4053
2426.0
158.0
360.0
0.0
Urban
N
290
LP001936
Male
Yes
0
Graduate
No
3075
2416.0
139.0
360.0
1.0
Rural
Y
...
...
...
...
...
...
...
...
...
...
...
...
...
...
320
LP002051
Male
Yes
0
Graduate
No
2400
2167.0
115.0
360.0
1.0
Semiurban
Y
330
LP002097
Male
No
1
Graduate
No
4384
1793.0
117.0
360.0
1.0
Urban
Y
340
LP002115
Male
Yes
3+
Not Graduate
No
2647
1587.0
173.0
360.0
1.0
Rural
N
350
LP002139
Male
Yes
0
Graduate
No
9083
0.0
228.0
360.0
1.0
Semiurban
Y
360
LP002161
Female
No
1
Graduate
No
4723
0.0
81.0
360.0
1.0
Semiurban
N
370
LP002194
Female
No
0
Graduate
Yes
15759
0.0
55.0
360.0
1.0
Semiurban
Y
380
LP002226
Male
Yes
0
Graduate
NaN
3333
2500.0
128.0
360.0
1.0
Semiurban
Y
390
LP002255
Male
No
3+
Graduate
No
9167
0.0
185.0
360.0
1.0
Rural
Y
400
LP002288
Male
Yes
2
Not Graduate
No
2889
0.0
45.0
180.0
0.0
Urban
N
410
LP002318
Female
No
1
Not Graduate
Yes
3867
0.0
62.0
360.0
1.0
Semiurban
N
420
LP002348
Male
Yes
0
Graduate
No
5829
0.0
138.0
360.0
1.0
Rural
Y
430
LP002377
Female
No
1
Graduate
Yes
8624
0.0
150.0
360.0
1.0
Semiurban
Y
440
LP002408
Male
No
0
Graduate
No
3660
5064.0
187.0
360.0
1.0
Semiurban
Y
450
LP002446
Male
Yes
2
Not Graduate
No
2309
1255.0
125.0
360.0
0.0
Rural
N
460
LP002478
NaN
Yes
0
Graduate
Yes
2083
4083.0
160.0
360.0
NaN
Semiurban
Y
470
LP002515
Male
Yes
1
Graduate
Yes
3450
2079.0
162.0
360.0
1.0
Semiurban
Y
480
LP002534
Female
No
0
Not Graduate
No
4350
0.0
154.0
360.0
1.0
Rural
Y
490
LP002560
Male
No
0
Not Graduate
No
2699
2785.0
96.0
360.0
NaN
Semiurban
Y
500
LP002603
Female
No
0
Graduate
No
645
3683.0
113.0
480.0
1.0
Rural
Y
510
LP002637
Male
No
0
Not Graduate
No
3598
1287.0
100.0
360.0
1.0
Rural
N
520
LP002689
Male
Yes
2
Not Graduate
No
2192
1742.0
45.0
360.0
1.0
Semiurban
Y
530
LP002717
Male
Yes
0
Graduate
No
1025
5500.0
216.0
360.0
NaN
Rural
Y
540
LP002741
Female
Yes
1
Graduate
No
4608
2845.0
140.0
180.0
1.0
Semiurban
Y
550
LP002778
Male
Yes
2
Graduate
Yes
6633
0.0
NaN
360.0
0.0
Rural
N
560
LP002807
Male
Yes
2
Not Graduate
No
3675
242.0
108.0
360.0
1.0
Semiurban
Y
570
LP002842
Male
Yes
1
Graduate
No
3417
1750.0
186.0
360.0
1.0
Urban
Y
580
LP002892
Male
Yes
2
Graduate
No
6540
0.0
205.0
360.0
1.0
Semiurban
Y
590
LP002928
Male
Yes
0
Graduate
No
3000
3416.0
56.0
180.0
1.0
Semiurban
Y
600
LP002949
Female
No
3+
Graduate
NaN
416
41667.0
350.0
180.0
NaN
Urban
N
610
LP002979
Male
Yes
3+
Graduate
No
4106
0.0
40.0
180.0
1.0
Rural
Y
62 rows × 13 columns
In [16]:
# Simple random sample of roughly 10% of the rows.
# random.sample picks row positions without replacement, so no row can be
# selected twice (np.random.randint would allow repeated positions).
random.seed(12345)  # fixed seed so a re-run reproduces the same sample
# range() and floor division (//) behave the same on Python 2 and Python 3;
# random.sample requires an integer sample size.
sampler = random.sample(range(len(data)), len(data) // 10)
# take() returns a new frame; sort_index() puts the picked rows back in
# their original index order without mutating anything in place.
sampled_data = data.take(sampler).sort_index()
sampled_data
Out[16]:
Loan_ID
Gender
Married
Dependents
Education
Self_Employed
ApplicantIncome
CoapplicantIncome
LoanAmount
Loan_Amount_Term
Credit_History
Property_Area
Loan_Status
0
LP001002
Male
No
0
Graduate
No
5849
0.000000
NaN
360.0
1.0
Urban
Y
43
LP001131
Male
Yes
0
Graduate
No
3941
2336.000000
134.0
360.0
1.0
Semiurban
Y
52
LP001164
Female
No
0
Graduate
No
4230
0.000000
112.0
360.0
1.0
Semiurban
N
76
LP001256
Male
No
0
Graduate
No
3750
4750.000000
176.0
360.0
1.0
Urban
N
77
LP001259
Male
Yes
1
Graduate
Yes
1000
3022.000000
110.0
360.0
1.0
Urban
N
86
LP001280
Male
Yes
2
Not Graduate
No
3333
2000.000000
99.0
360.0
NaN
Semiurban
Y
92
LP001319
Male
Yes
2
Not Graduate
No
3273
1820.000000
81.0
360.0
1.0
Urban
Y
98
LP001334
Male
Yes
0
Not Graduate
No
4188
0.000000
115.0
180.0
1.0
Semiurban
Y
104
LP001357
Male
NaN
NaN
Graduate
No
3816
754.000000
160.0
360.0
1.0
Urban
Y
107
LP001370
Male
No
0
Not Graduate
NaN
7333
0.000000
120.0
360.0
1.0
Rural
N
117
LP001405
Male
Yes
1
Graduate
No
2214
1398.000000
85.0
360.0
NaN
Urban
Y
119
LP001422
Female
No
0
Graduate
No
10408
0.000000
259.0
360.0
1.0
Urban
Y
133
LP001482
Male
Yes
0
Graduate
Yes
3459
0.000000
25.0
120.0
1.0
Semiurban
Y
136
LP001489
Female
Yes
0
Graduate
No
4583
0.000000
84.0
360.0
1.0
Rural
N
145
LP001514
Female
Yes
0
Graduate
No
2330
4486.000000
100.0
360.0
1.0
Semiurban
Y
158
LP001546
Male
No
0
Graduate
NaN
2980
2083.000000
120.0
360.0
1.0
Rural
Y
159
LP001552
Male
Yes
0
Graduate
No
4583
5625.000000
255.0
360.0
1.0
Semiurban
Y
167
LP001578
Male
Yes
0
Graduate
No
2439
3333.000000
129.0
360.0
1.0
Rural
Y
172
LP001586
Male
Yes
3+
Not Graduate
No
3522
0.000000
81.0
180.0
1.0
Rural
N
174
LP001603
Male
Yes
0
Not Graduate
Yes
4344
736.000000
87.0
360.0
1.0
Semiurban
N
176
LP001608
Male
Yes
2
Graduate
No
2045
1619.000000
101.0
360.0
1.0
Rural
Y
183
LP001637
Male
Yes
1
Graduate
No
33846
0.000000
260.0
360.0
1.0
Semiurban
N
186
LP001641
Male
Yes
1
Graduate
Yes
2178
0.000000
66.0
300.0
0.0
Rural
N
188
LP001644
NaN
Yes
0
Graduate
Yes
674
5296.000000
168.0
360.0
1.0
Rural
Y
197
LP001669
Female
No
0
Not Graduate
No
1907
2365.000000
120.0
NaN
1.0
Urban
Y
198
LP001671
Female
Yes
0
Graduate
No
3416
2816.000000
113.0
360.0
NaN
Semiurban
Y
207
LP001698
Male
No
0
Not Graduate
No
3975
2531.000000
55.0
360.0
1.0
Rural
Y
229
LP001761
Male
No
0
Graduate
Yes
6400
0.000000
200.0
360.0
1.0
Rural
Y
236
LP001786
Male
Yes
0
Graduate
NaN
5746
0.000000
255.0
360.0
NaN
Urban
N
238
LP001790
Female
No
1
Graduate
No
3812
0.000000
112.0
360.0
1.0
Rural
Y
...
...
...
...
...
...
...
...
...
...
...
...
...
...
270
LP001888
Female
No
0
Graduate
No
3237
0.000000
30.0
360.0
1.0
Urban
Y
312
LP002006
Female
No
0
Graduate
No
2507
0.000000
56.0
360.0
1.0
Rural
Y
321
LP002053
Male
Yes
3+
Graduate
No
4342
189.000000
124.0
360.0
1.0
Semiurban
Y
335
LP002106
Male
Yes
NaN
Graduate
Yes
5503
4490.000000
70.0
NaN
1.0
Semiurban
Y
337
LP002112
Male
Yes
2
Graduate
Yes
2500
4600.000000
176.0
360.0
1.0
Rural
Y
343
LP002126
Male
Yes
3+
Not Graduate
No
3173
0.000000
74.0
360.0
1.0
Semiurban
Y
361
LP002170
Male
Yes
2
Graduate
No
5000
3667.000000
236.0
360.0
1.0
Semiurban
Y
372
LP002201
Male
Yes
2
Graduate
Yes
9323
7873.000000
380.0
300.0
1.0
Rural
Y
378
LP002224
Male
No
0
Graduate
No
3069
0.000000
71.0
480.0
1.0
Urban
N
403
LP002300
Female
No
0
Not Graduate
No
1963
0.000000
53.0
360.0
1.0
Semiurban
Y
409
LP002317
Male
Yes
3+
Graduate
No
81000
0.000000
360.0
360.0
0.0
Rural
N
419
LP002347
Male
Yes
0
Graduate
No
3246
1417.000000
138.0
360.0
1.0
Semiurban
Y
428
LP002369
Male
Yes
0
Graduate
No
2920
16.120001
87.0
360.0
1.0
Rural
Y
433
LP002387
Male
Yes
0
Graduate
No
2425
2340.000000
143.0
360.0
1.0
Semiurban
Y
449
LP002444
Male
No
1
Not Graduate
Yes
2769
1542.000000
190.0
360.0
NaN
Semiurban
N
455
LP002455
Male
Yes
2
Graduate
No
3859
0.000000
96.0
360.0
1.0
Semiurban
Y
476
LP002529
Male
Yes
2
Graduate
No
6700
1750.000000
230.0
300.0
1.0
Semiurban
Y
478
LP002531
Male
Yes
1
Graduate
Yes
16667
2250.000000
86.0
360.0
1.0
Semiurban
Y
479
LP002533
Male
Yes
2
Graduate
No
2947
1603.000000
NaN
360.0
1.0
Urban
N
484
LP002543
Male
Yes
2
Graduate
No
8333
0.000000
246.0
360.0
1.0
Semiurban
Y
491
LP002562
Male
Yes
1
Not Graduate
No
5333
1131.000000
186.0
360.0
NaN
Urban
Y
503
LP002618
Male
Yes
1
Not Graduate
No
4050
5302.000000
138.0
360.0
NaN
Rural
N
527
LP002706
Male
Yes
1
Not Graduate
No
5285
1430.000000
161.0
360.0
0.0
Semiurban
Y
548
LP002776
Female
No
0
Graduate
No
5000
0.000000
103.0
360.0
0.0
Semiurban
N
551
LP002784
Male
Yes
1
Not Graduate
No
2492
2375.000000
NaN
360.0
1.0
Rural
Y
553
LP002788
Male
Yes
0
Not Graduate
No
2454
2333.000000
181.0
360.0
0.0
Urban
N
573
LP002862
Male
Yes
2
Not Graduate
No
6125
1625.000000
187.0
480.0
1.0
Semiurban
N
582
LP002894
Female
Yes
0
Graduate
No
3166
0.000000
36.0
360.0
1.0
Semiurban
Y
590
LP002928
Male
Yes
0
Graduate
No
3000
3416.000000
56.0
180.0
1.0
Semiurban
Y
599
LP002948
Male
Yes
2
Graduate
No
5780
0.000000
192.0
360.0
1.0
Urban
Y
61 rows × 13 columns
In [17]:
# Stratified sampling, 'Y' stratum: keep half of the rows whose Loan_Status is 'Y'.
data_Y = data.loc[data['Loan_Status'] == 'Y']
random.seed(12345)  # fixed seed so a re-run reproduces the same sample
# Positions are drawn without replacement. range() and // keep this runnable
# on both Python 2 and Python 3 (random.sample needs an integer size).
sampler = random.sample(range(len(data_Y)), len(data_Y) // 2)
# take() is positional, so the positions index into data_Y, not into data.
sampled_Y = data_Y.take(sampler)
sampled_Y
Out[17]:
Loan_ID
Gender
Married
Dependents
Education
Self_Employed
ApplicantIncome
CoapplicantIncome
LoanAmount
Loan_Amount_Term
Credit_History
Property_Area
Loan_Status
374
LP002209
Female
No
0
Graduate
NaN
2764
1459.000000
110.0
360.0
1.0
Urban
Y
608
LP002974
Male
Yes
0
Graduate
No
3232
1950.000000
108.0
360.0
1.0
Rural
Y
344
LP002128
Male
Yes
2
Graduate
NaN
2583
2330.000000
125.0
360.0
1.0
Rural
Y
283
LP001917
Female
No
0
Graduate
No
1811
1666.000000
54.0
360.0
1.0
Urban
Y
11
LP001027
Male
Yes
2
Graduate
NaN
2500
1840.000000
109.0
360.0
1.0
Urban
Y
143
LP001507
Male
Yes
0
Graduate
No
2698
2034.000000
122.0
360.0
1.0
Semiurban
Y
214
LP001716
Male
Yes
0
Graduate
No
3173
3021.000000
137.0
360.0
1.0
Urban
Y
368
LP002190
Male
Yes
1
Graduate
No
6325
0.000000
175.0
360.0
1.0
Semiurban
Y
444
LP002424
Male
Yes
0
Graduate
No
7333
8333.000000
175.0
300.0
NaN
Rural
Y
315
LP002031
Male
Yes
1
Not Graduate
No
3399
1640.000000
111.0
180.0
1.0
Urban
Y
334
LP002103
NaN
Yes
1
Graduate
Yes
9833
1833.000000
182.0
180.0
1.0
Urban
Y
559
LP002804
Female
Yes
0
Graduate
No
4180
2306.000000
182.0
360.0
1.0
Semiurban
Y
74
LP001253
Male
Yes
3+
Graduate
Yes
5266
1774.000000
187.0
360.0
1.0
Semiurban
Y
439
LP002407
Female
Yes
0
Not Graduate
Yes
7142
0.000000
138.0
360.0
1.0
Rural
Y
331
LP002098
Male
No
0
Graduate
No
2935
0.000000
98.0
360.0
1.0
Semiurban
Y
529
LP002716
Male
No
0
Not Graduate
No
6783
0.000000
130.0
360.0
1.0
Semiurban
Y
244
LP001811
Male
Yes
0
Not Graduate
No
3406
4417.000000
123.0
360.0
1.0
Semiurban
Y
488
LP002555
Male
Yes
2
Graduate
Yes
4583
2083.000000
160.0
360.0
1.0
Semiurban
Y
79
LP001264
Male
Yes
3+
Not Graduate
Yes
3333
2166.000000
130.0
360.0
NaN
Semiurban
Y
114
LP001398
Male
No
0
Graduate
NaN
5050
0.000000
118.0
360.0
1.0
Semiurban
Y
94
LP001325
Male
No
0
Not Graduate
No
3620
0.000000
25.0
120.0
1.0
Semiurban
Y
506
LP002624
Male
Yes
0
Graduate
No
20833
6667.000000
480.0
360.0
NaN
Urban
Y
588
LP002925
NaN
No
0
Graduate
No
4750
0.000000
94.0
360.0
1.0
Semiurban
Y
473
LP002522
Female
No
0
Graduate
Yes
2500
0.000000
93.0
360.0
NaN
Urban
Y
279
LP001908
Female
Yes
0
Not Graduate
No
4100
0.000000
124.0
360.0
NaN
Rural
Y
264
LP001872
Male
No
0
Graduate
Yes
5166
0.000000
128.0
360.0
1.0
Semiurban
Y
178
LP001616
Male
Yes
1
Graduate
No
3750
0.000000
116.0
360.0
1.0
Semiurban
Y
611
LP002983
Male
Yes
1
Graduate
No
8072
240.000000
253.0
360.0
1.0
Urban
Y
217
LP001726
Male
Yes
0
Graduate
No
3727
1775.000000
131.0
360.0
1.0
Semiurban
Y
306
LP001993
Female
No
0
Graduate
No
3762
1666.000000
135.0
360.0
1.0
Rural
Y
...
...
...
...
...
...
...
...
...
...
...
...
...
...
255
LP001846
Female
No
3+
Graduate
No
3083
0.000000
255.0
360.0
1.0
Rural
Y
101
LP001349
Male
No
0
Graduate
No
4843
3806.000000
151.0
360.0
1.0
Semiurban
Y
470
LP002515
Male
Yes
1
Graduate
Yes
3450
2079.000000
162.0
360.0
1.0
Semiurban
Y
60
LP001205
Male
Yes
0
Graduate
No
2500
3796.000000
120.0
360.0
1.0
Urban
Y
39
LP001116
Male
No
0
Not Graduate
No
3748
1668.000000
110.0
360.0
1.0
Semiurban
Y
432
LP002386
Male
No
0
Graduate
NaN
12876
0.000000
405.0
360.0
1.0
Semiurban
Y
86
LP001280
Male
Yes
2
Not Graduate
No
3333
2000.000000
99.0
360.0
NaN
Semiurban
Y
49
LP001151
Female
No
0
Graduate
No
4000
2275.000000
144.0
360.0
1.0
Semiurban
Y
528
LP002714
Male
No
1
Not Graduate
No
2679
1302.000000
94.0
360.0
1.0
Semiurban
Y
337
LP002112
Male
Yes
2
Graduate
Yes
2500
4600.000000
176.0
360.0
1.0
Rural
Y
197
LP001669
Female
No
0
Not Graduate
No
1907
2365.000000
120.0
NaN
1.0
Urban
Y
278
LP001907
Male
Yes
0
Graduate
No
14583
0.000000
436.0
360.0
1.0
Semiurban
Y
545
LP002767
Male
Yes
0
Graduate
No
2768
1950.000000
155.0
360.0
1.0
Rural
Y
463
LP002489
Female
No
1
Not Graduate
NaN
5191
0.000000
132.0
360.0
1.0
Semiurban
Y
70
LP001243
Male
Yes
0
Graduate
No
3208
3066.000000
172.0
360.0
1.0
Urban
Y
557
LP002795
Male
Yes
3+
Graduate
Yes
10139
0.000000
260.0
360.0
1.0
Semiurban
Y
10
LP001024
Male
Yes
2
Graduate
No
3200
700.000000
70.0
360.0
1.0
Urban
Y
520
LP002689
Male
Yes
2
Not Graduate
No
2192
1742.000000
45.0
360.0
1.0
Semiurban
Y
227
LP001758
Male
Yes
2
Graduate
No
6250
1695.000000
210.0
360.0
1.0
Semiurban
Y
290
LP001936
Male
Yes
0
Graduate
No
3075
2416.000000
139.0
360.0
1.0
Rural
Y
561
LP002813
Female
Yes
1
Graduate
Yes
19484
0.000000
600.0
360.0
1.0
Semiurban
Y
392
LP002263
Male
Yes
0
Graduate
No
2583
2115.000000
120.0
360.0
NaN
Urban
Y
19
LP001041
Male
Yes
0
Graduate
NaN
2600
3500.000000
115.0
NaN
1.0
Urban
Y
419
LP002347
Male
Yes
0
Graduate
No
3246
1417.000000
138.0
360.0
1.0
Semiurban
Y
51
LP001157
Female
No
0
Graduate
No
3086
0.000000
120.0
360.0
1.0
Semiurban
Y
332
LP002100
Male
No
NaN
Graduate
No
2833
0.000000
71.0
360.0
1.0
Urban
Y
607
LP002964
Male
Yes
2
Not Graduate
No
3987
1411.000000
157.0
360.0
1.0
Rural
Y
428
LP002369
Male
Yes
0
Graduate
No
2920
16.120001
87.0
360.0
1.0
Rural
Y
126
LP001448
NaN
Yes
3+
Graduate
No
23803
0.000000
370.0
360.0
1.0
Rural
Y
560
LP002807
Male
Yes
2
Not Graduate
No
3675
242.000000
108.0
360.0
1.0
Semiurban
Y
211 rows × 13 columns
In [18]:
# Stratified sampling, 'N' stratum: keep a tenth of the rows whose Loan_Status is 'N'.
data_N = data.loc[data['Loan_Status'] == 'N']
random.seed(12345)  # fixed seed so a re-run reproduces the same sample
# Draw positions WITHOUT replacement, the same way the 'Y' stratum is sampled.
# np.random.randint draws with replacement and could return the same row
# more than once. Floor division keeps the sample size an integer on
# Python 3 as well as Python 2.
sampler = random.sample(range(len(data_N)), len(data_N) // 10)
# take() is positional, so the positions index into data_N, not into data.
sampled_N = data_N.take(sampler)
sampled_N
Out[18]:
Loan_ID
Gender
Married
Dependents
Education
Self_Employed
ApplicantIncome
CoapplicantIncome
LoanAmount
Loan_Amount_Term
Credit_History
Property_Area
Loan_Status
153
LP001532
Male
Yes
2
Not Graduate
No
2281
0.0
113.0
360.0
1.0
Rural
N
76
LP001256
Male
No
0
Graduate
No
3750
4750.0
176.0
360.0
1.0
Urban
N
191
LP001656
Male
No
0
Graduate
No
12000
0.0
164.0
360.0
1.0
Semiurban
N
78
LP001263
Male
Yes
3+
Graduate
No
3167
4000.0
180.0
300.0
0.0
Semiurban
N
367
LP002188
Male
No
0
Graduate
No
5124
0.0
124.0
NaN
0.0
Rural
N
195
LP001665
Male
Yes
1
Graduate
No
3125
2583.0
170.0
360.0
1.0
Semiurban
N
36
LP001109
Male
Yes
0
Graduate
No
1828
1330.0
100.0
NaN
0.0
Urban
N
591
LP002931
Male
Yes
2
Graduate
Yes
6000
0.0
205.0
240.0
1.0
Semiurban
N
308
LP001996
Male
No
0
Graduate
No
20233
0.0
480.0
360.0
1.0
Rural
N
63
LP001213
Male
Yes
1
Graduate
No
4945
0.0
NaN
360.0
0.0
Rural
N
541
LP002743
Female
No
0
Graduate
No
2138
0.0
99.0
360.0
0.0
Semiurban
N
409
LP002317
Male
Yes
3+
Graduate
No
81000
0.0
360.0
360.0
0.0
Rural
N
583
LP002898
Male
Yes
1
Graduate
No
1880
0.0
61.0
360.0
NaN
Rural
N
366
LP002187
Male
No
0
Graduate
No
2500
0.0
96.0
480.0
1.0
Semiurban
N
34
LP001100
Male
No
3+
Graduate
No
12500
3000.0
320.0
360.0
1.0
Rural
N
30
LP001091
Male
Yes
1
Graduate
NaN
4166
3369.0
201.0
360.0
NaN
Urban
N
17
LP001036
Female
No
0
Graduate
No
3510
0.0
76.0
360.0
0.0
Urban
N
513
LP002648
Male
Yes
0
Graduate
No
2130
6666.0
70.0
180.0
1.0
Semiurban
N
423
LP002362
Male
Yes
1
Graduate
No
7250
1667.0
110.0
NaN
0.0
Urban
N
In [19]:
# Stack the two per-status samples into one frame and return the rows to
# index order in a single chained expression.
sampled_data = pd.concat([sampled_Y, sampled_N]).sort_index()
sampled_data
Out[19]:
Loan_ID
Gender
Married
Dependents
Education
Self_Employed
ApplicantIncome
CoapplicantIncome
LoanAmount
Loan_Amount_Term
Credit_History
Property_Area
Loan_Status
5
LP001011
Male
Yes
2
Graduate
Yes
5417
4196.0
267.0
360.0
1.0
Urban
Y
10
LP001024
Male
Yes
2
Graduate
No
3200
700.0
70.0
360.0
1.0
Urban
Y
11
LP001027
Male
Yes
2
Graduate
NaN
2500
1840.0
109.0
360.0
1.0
Urban
Y
14
LP001030
Male
Yes
2
Graduate
No
1299
1086.0
17.0
120.0
1.0
Urban
Y
16
LP001034
Male
No
1
Not Graduate
No
3596
0.0
100.0
240.0
NaN
Urban
Y
17
LP001036
Female
No
0
Graduate
No
3510
0.0
76.0
360.0
0.0
Urban
N
19
LP001041
Male
Yes
0
Graduate
NaN
2600
3500.0
115.0
NaN
1.0
Urban
Y
21
LP001046
Male
Yes
1
Graduate
No
5955
5625.0
315.0
360.0
1.0
Urban
Y
25
LP001066
Male
Yes
0
Graduate
Yes
9560
0.0
191.0
360.0
1.0
Semiurban
Y
26
LP001068
Male
Yes
0
Graduate
No
2799
2253.0
122.0
360.0
1.0
Semiurban
Y
29
LP001087
Female
No
2
Graduate
NaN
3750
2083.0
120.0
360.0
1.0
Semiurban
Y
30
LP001091
Male
Yes
1
Graduate
NaN
4166
3369.0
201.0
360.0
NaN
Urban
N
33
LP001098
Male
Yes
0
Graduate
No
3500
1667.0
114.0
360.0
1.0
Semiurban
Y
34
LP001100
Male
No
3+
Graduate
No
12500
3000.0
320.0
360.0
1.0
Rural
N
36
LP001109
Male
Yes
0
Graduate
No
1828
1330.0
100.0
NaN
0.0
Urban
N
38
LP001114
Male
No
0
Graduate
No
4166
7210.0
184.0
360.0
1.0
Urban
Y
39
LP001116
Male
No
0
Not Graduate
No
3748
1668.0
110.0
360.0
1.0
Semiurban
Y
44
LP001136
Male
Yes
0
Not Graduate
Yes
4695
0.0
96.0
NaN
1.0
Urban
Y
45
LP001137
Female
No
0
Graduate
No
3410
0.0
88.0
NaN
1.0
Urban
Y
49
LP001151
Female
No
0
Graduate
No
4000
2275.0
144.0
360.0
1.0
Semiurban
Y
51
LP001157
Female
No
0
Graduate
No
3086
0.0
120.0
360.0
1.0
Semiurban
Y
56
LP001195
Male
Yes
0
Graduate
No
2132
1591.0
96.0
360.0
1.0
Semiurban
Y
60
LP001205
Male
Yes
0
Graduate
No
2500
3796.0
120.0
360.0
1.0
Urban
Y
61
LP001206
Male
Yes
3+
Graduate
No
3029
0.0
99.0
360.0
1.0
Urban
Y
63
LP001213
Male
Yes
1
Graduate
No
4945
0.0
NaN
360.0
0.0
Rural
N
67
LP001233
Male
Yes
1
Graduate
No
10750
0.0
312.0
360.0
1.0
Urban
Y
70
LP001243
Male
Yes
0
Graduate
No
3208
3066.0
172.0
360.0
1.0
Urban
Y
74
LP001253
Male
Yes
3+
Graduate
Yes
5266
1774.0
187.0
360.0
1.0
Semiurban
Y
76
LP001256
Male
No
0
Graduate
No
3750
4750.0
176.0
360.0
1.0
Urban
N
78
LP001263
Male
Yes
3+
Graduate
No
3167
4000.0
180.0
300.0
0.0
Semiurban
N
...
...
...
...
...
...
...
...
...
...
...
...
...
...
540
LP002741
Female
Yes
1
Graduate
No
4608
2845.0
140.0
180.0
1.0
Semiurban
Y
541
LP002743
Female
No
0
Graduate
No
2138
0.0
99.0
360.0
0.0
Semiurban
N
542
LP002753
Female
No
1
Graduate
NaN
3652
0.0
95.0
360.0
1.0
Semiurban
Y
543
LP002755
Male
Yes
1
Not Graduate
No
2239
2524.0
128.0
360.0
1.0
Urban
Y
545
LP002767
Male
Yes
0
Graduate
No
2768
1950.0
155.0
360.0
1.0
Rural
Y
547
LP002772
Male
No
0
Graduate
No
2526
1783.0
145.0
360.0
1.0
Rural
Y
551
LP002784
Male
Yes
1
Not Graduate
No
2492
2375.0
NaN
360.0
1.0
Rural
Y
555
LP002792
Male
Yes
1
Graduate
No
5468
1032.0
26.0
360.0
1.0
Semiurban
Y
556
LP002794
Female
No
0
Graduate
No
2667
1625.0
84.0
360.0
NaN
Urban
Y
557
LP002795
Male
Yes
3+
Graduate
Yes
10139
0.0
260.0
360.0
1.0
Semiurban
Y
558
LP002798
Male
Yes
0
Graduate
No
3887
2669.0
162.0
360.0
1.0
Semiurban
Y
559
LP002804
Female
Yes
0
Graduate
No
4180
2306.0
182.0
360.0
1.0
Semiurban
Y
560
LP002807
Male
Yes
2
Not Graduate
No
3675
242.0
108.0
360.0
1.0
Semiurban
Y
561
LP002813
Female
Yes
1
Graduate
Yes
19484
0.0
600.0
360.0
1.0
Semiurban
Y
572
LP002855
Male
Yes
2
Graduate
No
16666
0.0
275.0
360.0
1.0
Urban
Y
579
LP002888
Male
No
0
Graduate
NaN
3182
2917.0
161.0
360.0
1.0
Urban
Y
580
LP002892
Male
Yes
2
Graduate
No
6540
0.0
205.0
360.0
1.0
Semiurban
Y
583
LP002898
Male
Yes
1
Graduate
No
1880
0.0
61.0
360.0
NaN
Rural
N
587
LP002917
Female
No
0
Not Graduate
No
2165
0.0
70.0
360.0
1.0
Semiurban
Y
588
LP002925
NaN
No
0
Graduate
No
4750
0.0
94.0
360.0
1.0
Semiurban
Y
590
LP002928
Male
Yes
0
Graduate
No
3000
3416.0
56.0
180.0
1.0
Semiurban
Y
591
LP002931
Male
Yes
2
Graduate
Yes
6000
0.0
205.0
240.0
1.0
Semiurban
N
593
LP002936
Male
Yes
0
Graduate
No
3859
3300.0
142.0
180.0
1.0
Rural
Y
594
LP002938
Male
Yes
0
Graduate
Yes
16120
0.0
260.0
360.0
1.0
Urban
Y
599
LP002948
Male
Yes
2
Graduate
No
5780
0.0
192.0
360.0
1.0
Urban
Y
602
LP002953
Male
Yes
3+
Graduate
No
5703
0.0
128.0
360.0
1.0
Urban
Y
606
LP002961
Male
Yes
1
Graduate
No
3400
2500.0
173.0
360.0
1.0
Semiurban
Y
607
LP002964
Male
Yes
2
Not Graduate
No
3987
1411.0
157.0
360.0
1.0
Rural
Y
608
LP002974
Male
Yes
0
Graduate
No
3232
1950.0
108.0
360.0
1.0
Rural
Y
611
LP002983
Male
Yes
1
Graduate
No
8072
240.0
253.0
360.0
1.0
Urban
Y
230 rows × 13 columns
In [20]:
# Stream the CSV in chunks and keep each row with a probability that depends
# on its Loan_Status: 0.5 for 'Y', 0.1 for 'N'. Rows with any other status
# are never kept.
np.random.seed(12345)  # fixed seed so the sample is reproducible
chunker = pd.read_csv('../data/loan/train.csv',
                      chunksize = 1000)
sampled_chunker = []
for piece in chunker:
    # One uniform draw in [0, 1) per row, in row order. Drawing them as one
    # array replaces a Python-level iterrows() loop with a vectorised mask.
    val = np.random.random(len(piece))
    status = piece['Loan_Status']
    keep = ((status == 'Y') & (val < 0.5)) | ((status == 'N') & (val < 0.1))
    sampled_chunker.append(piece.loc[keep])
# pd.concat raises on an empty list, so fall back to an empty frame when the
# reader produced no chunks at all.
if sampled_chunker:
    sampled_data = pd.concat(sampled_chunker).sort_index()
else:
    sampled_data = pd.DataFrame()
sampled_data
Out[20]:
Loan_ID
Gender
Married
Dependents
Education
Self_Employed
ApplicantIncome
CoapplicantIncome
LoanAmount
Loan_Amount_Term
Credit_History
Property_Area
Loan_Status
2
LP001005
Male
Yes
0
Graduate
Yes
3000
0.0
66.0
360.0
1.0
Urban
Y
3
LP001006
Male
Yes
0
Not Graduate
No
2583
2358.0
120.0
360.0
1.0
Urban
Y
12
LP001028
Male
Yes
2
Graduate
No
3073
8106.0
200.0
360.0
1.0
Urban
Y
14
LP001030
Male
Yes
2
Graduate
No
1299
1086.0
17.0
120.0
1.0
Urban
Y
29
LP001087
Female
No
2
Graduate
NaN
3750
2083.0
120.0
360.0
1.0
Semiurban
Y
30
LP001091
Male
Yes
1
Graduate
NaN
4166
3369.0
201.0
360.0
NaN
Urban
N
33
LP001098
Male
Yes
0
Graduate
No
3500
1667.0
114.0
360.0
1.0
Semiurban
Y
37
LP001112
Female
Yes
0
Graduate
No
3667
1459.0
144.0
360.0
1.0
Semiurban
Y
43
LP001131
Male
Yes
0
Graduate
No
3941
2336.0
134.0
360.0
1.0
Semiurban
Y
44
LP001136
Male
Yes
0
Not Graduate
Yes
4695
0.0
96.0
NaN
1.0
Urban
Y
45
LP001137
Female
No
0
Graduate
No
3410
0.0
88.0
NaN
1.0
Urban
Y
46
LP001138
Male
Yes
1
Graduate
No
5649
0.0
44.0
360.0
1.0
Urban
Y
47
LP001144
Male
Yes
0
Graduate
No
5821
0.0
144.0
360.0
1.0
Urban
Y
49
LP001151
Female
No
0
Graduate
No
4000
2275.0
144.0
360.0
1.0
Semiurban
Y
51
LP001157
Female
No
0
Graduate
No
3086
0.0
120.0
360.0
1.0
Semiurban
Y
55
LP001194
Male
Yes
2
Graduate
No
2708
1167.0
97.0
360.0
1.0
Semiurban
Y
56
LP001195
Male
Yes
0
Graduate
No
2132
1591.0
96.0
360.0
1.0
Semiurban
Y
60
LP001205
Male
Yes
0
Graduate
No
2500
3796.0
120.0
360.0
1.0
Urban
Y
61
LP001206
Male
Yes
3+
Graduate
No
3029
0.0
99.0
360.0
1.0
Urban
Y
71
LP001245
Male
Yes
2
Not Graduate
Yes
1875
1875.0
97.0
360.0
1.0
Semiurban
Y
72
LP001248
Male
No
0
Graduate
No
3500
0.0
81.0
300.0
1.0
Semiurban
Y
84
LP001275
Male
Yes
1
Graduate
No
3988
0.0
50.0
240.0
1.0
Urban
Y
85
LP001279
Male
No
0
Graduate
No
2366
2531.0
136.0
360.0
1.0
Semiurban
Y
86
LP001280
Male
Yes
2
Not Graduate
No
3333
2000.0
99.0
360.0
NaN
Semiurban
Y
87
LP001282
Male
Yes
0
Graduate
No
2500
2118.0
104.0
360.0
1.0
Semiurban
Y
89
LP001310
Male
Yes
0
Graduate
No
5695
4167.0
175.0
360.0
1.0
Semiurban
Y
94
LP001325
Male
No
0
Not Graduate
No
3620
0.0
25.0
120.0
1.0
Semiurban
Y
96
LP001327
Female
Yes
0
Graduate
No
2484
2302.0
137.0
360.0
1.0
Semiurban
Y
99
LP001343
Male
Yes
0
Graduate
No
1759
3541.0
131.0
360.0
1.0
Semiurban
Y
100
LP001345
Male
Yes
2
Not Graduate
No
4288
3263.0
133.0
180.0
1.0
Urban
Y
...
...
...
...
...
...
...
...
...
...
...
...
...
...
542
LP002753
Female
No
1
Graduate
NaN
3652
0.0
95.0
360.0
1.0
Semiurban
Y
543
LP002755
Male
Yes
1
Not Graduate
No
2239
2524.0
128.0
360.0
1.0
Urban
Y
544
LP002757
Female
Yes
0
Not Graduate
No
3017
663.0
102.0
360.0
NaN
Semiurban
Y
545
LP002767
Male
Yes
0
Graduate
No
2768
1950.0
155.0
360.0
1.0
Rural
Y
549
LP002777
Male
Yes
0
Graduate
No
2785
2016.0
110.0
360.0
1.0
Rural
Y
551
LP002784
Male
Yes
1
Not Graduate
No
2492
2375.0
NaN
360.0
1.0
Rural
Y
552
LP002785
Male
Yes
1
Graduate
No
3333
3250.0
158.0
360.0
1.0
Urban
Y
555
LP002792
Male
Yes
1
Graduate
No
5468
1032.0
26.0
360.0
1.0
Semiurban
Y
557
LP002795
Male
Yes
3+
Graduate
Yes
10139
0.0
260.0
360.0
1.0
Semiurban
Y
558
LP002798
Male
Yes
0
Graduate
No
3887
2669.0
162.0
360.0
1.0
Semiurban
Y
560
LP002807
Male
Yes
2
Not Graduate
No
3675
242.0
108.0
360.0
1.0
Semiurban
Y
561
LP002813
Female
Yes
1
Graduate
Yes
19484
0.0
600.0
360.0
1.0
Semiurban
Y
562
LP002820
Male
Yes
0
Graduate
No
5923
2054.0
211.0
360.0
1.0
Rural
Y
566
LP002836
Male
No
0
Graduate
No
3333
0.0
70.0
360.0
1.0
Urban
Y
569
LP002841
Male
Yes
0
Graduate
No
3166
2064.0
104.0
360.0
0.0
Urban
N
574
LP002863
Male
Yes
3+
Graduate
No
6406
0.0
150.0
360.0
1.0
Semiurban
N
577
LP002874
Male
No
0
Graduate
No
3229
2739.0
110.0
360.0
1.0
Urban
Y
579
LP002888
Male
No
0
Graduate
NaN
3182
2917.0
161.0
360.0
1.0
Urban
Y
580
LP002892
Male
Yes
2
Graduate
No
6540
0.0
205.0
360.0
1.0
Semiurban
Y
581
LP002893
Male
No
0
Graduate
No
1836
33837.0
90.0
360.0
1.0
Urban
N
587
LP002917
Female
No
0
Not Graduate
No
2165
0.0
70.0
360.0
1.0
Semiurban
Y
588
LP002925
NaN
No
0
Graduate
No
4750
0.0
94.0
360.0
1.0
Semiurban
Y
592
LP002933
NaN
No
3+
Graduate
Yes
9357
0.0
292.0
360.0
1.0
Semiurban
Y
595
LP002940
Male
No
0
Not Graduate
No
3833
0.0
110.0
360.0
1.0
Rural
Y
599
LP002948
Male
Yes
2
Graduate
No
5780
0.0
192.0
360.0
1.0
Urban
Y
603
LP002958
Male
No
0
Graduate
No
3676
4301.0
172.0
360.0
1.0
Rural
Y
608
LP002974
Male
Yes
0
Graduate
No
3232
1950.0
108.0
360.0
1.0
Rural
Y
609
LP002978
Female
No
0
Graduate
No
2900
0.0
71.0
360.0
1.0
Rural
Y
611
LP002983
Male
Yes
1
Graduate
No
8072
240.0
253.0
360.0
1.0
Urban
Y
612
LP002984
Male
Yes
2
Graduate
No
7583
0.0
187.0
360.0
1.0
Urban
Y
228 rows × 13 columns
Content source: yao-matrix/mLearning
Similar notebooks: