In [14]:
import pandas as pd
import numpy as np
import random

Task 1


In [15]:
# Task 1: load the loan training set and take a systematic sample
# (every 10th row, starting from the first one).
data = pd.read_csv('../data/loan/train.csv')
# Parenthesised form prints the same value on Python 2 and is valid on Python 3.
print(len(data))
# Positional indices 0, 10, 20, ... up to (but excluding) len(data).
sampler = np.arange(0, len(data), 10)
data.take(sampler)


614
Out[15]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
0 LP001002 Male No 0 Graduate No 5849 0.0 NaN 360.0 1.0 Urban Y
10 LP001024 Male Yes 2 Graduate No 3200 700.0 70.0 360.0 1.0 Urban Y
20 LP001043 Male Yes 0 Not Graduate No 7660 0.0 104.0 360.0 0.0 Urban N
30 LP001091 Male Yes 1 Graduate NaN 4166 3369.0 201.0 360.0 NaN Urban N
40 LP001119 Male No 0 Graduate No 3600 0.0 80.0 360.0 1.0 Urban N
50 LP001155 Female Yes 0 Not Graduate No 1928 1644.0 100.0 360.0 1.0 Semiurban Y
60 LP001205 Male Yes 0 Graduate No 2500 3796.0 120.0 360.0 1.0 Urban Y
70 LP001243 Male Yes 0 Graduate No 3208 3066.0 172.0 360.0 1.0 Urban Y
80 LP001265 Female No 0 Graduate No 3846 0.0 111.0 360.0 1.0 Semiurban Y
90 LP001316 Male Yes 0 Graduate No 2958 2900.0 131.0 360.0 1.0 Semiurban Y
100 LP001345 Male Yes 2 Not Graduate No 4288 3263.0 133.0 180.0 1.0 Urban Y
110 LP001385 Male No 0 Graduate No 5316 0.0 136.0 360.0 1.0 Urban Y
120 LP001426 Male Yes NaN Graduate No 5667 2667.0 180.0 360.0 1.0 Rural Y
130 LP001469 Male No 0 Graduate Yes 20166 0.0 650.0 480.0 NaN Urban Y
140 LP001497 Male Yes 2 Graduate No 5042 2083.0 185.0 360.0 1.0 Rural N
150 LP001528 Male No 0 Graduate No 6277 0.0 118.0 360.0 0.0 Rural N
160 LP001560 Male Yes 0 Not Graduate No 1863 1041.0 98.0 360.0 1.0 Semiurban Y
170 LP001581 Male Yes 0 Not Graduate NaN 1820 1769.0 95.0 360.0 1.0 Rural Y
180 LP001633 Male Yes 1 Graduate No 6400 7250.0 180.0 360.0 0.0 Urban N
190 LP001653 Male No 0 Not Graduate No 4885 0.0 48.0 360.0 1.0 Rural Y
200 LP001674 Male Yes 1 Not Graduate No 2600 2500.0 90.0 360.0 1.0 Semiurban Y
210 LP001708 Female No 0 Graduate No 10000 0.0 214.0 360.0 1.0 Semiurban N
220 LP001736 Male Yes 0 Graduate No 2221 0.0 60.0 360.0 0.0 Urban N
230 LP001765 Male Yes 1 Graduate No 2491 2054.0 104.0 360.0 1.0 Semiurban Y
240 LP001798 Male Yes 2 Graduate No 5819 5000.0 120.0 360.0 1.0 Rural Y
250 LP001835 Male Yes 0 Not Graduate No 1668 3890.0 201.0 360.0 0.0 Semiurban N
260 LP001865 Male Yes 1 Graduate No 6083 4250.0 330.0 360.0 NaN Urban Y
270 LP001888 Female No 0 Graduate No 3237 0.0 30.0 360.0 1.0 Urban Y
280 LP001910 Male No 1 Not Graduate Yes 4053 2426.0 158.0 360.0 0.0 Urban N
290 LP001936 Male Yes 0 Graduate No 3075 2416.0 139.0 360.0 1.0 Rural Y
... ... ... ... ... ... ... ... ... ... ... ... ... ...
320 LP002051 Male Yes 0 Graduate No 2400 2167.0 115.0 360.0 1.0 Semiurban Y
330 LP002097 Male No 1 Graduate No 4384 1793.0 117.0 360.0 1.0 Urban Y
340 LP002115 Male Yes 3+ Not Graduate No 2647 1587.0 173.0 360.0 1.0 Rural N
350 LP002139 Male Yes 0 Graduate No 9083 0.0 228.0 360.0 1.0 Semiurban Y
360 LP002161 Female No 1 Graduate No 4723 0.0 81.0 360.0 1.0 Semiurban N
370 LP002194 Female No 0 Graduate Yes 15759 0.0 55.0 360.0 1.0 Semiurban Y
380 LP002226 Male Yes 0 Graduate NaN 3333 2500.0 128.0 360.0 1.0 Semiurban Y
390 LP002255 Male No 3+ Graduate No 9167 0.0 185.0 360.0 1.0 Rural Y
400 LP002288 Male Yes 2 Not Graduate No 2889 0.0 45.0 180.0 0.0 Urban N
410 LP002318 Female No 1 Not Graduate Yes 3867 0.0 62.0 360.0 1.0 Semiurban N
420 LP002348 Male Yes 0 Graduate No 5829 0.0 138.0 360.0 1.0 Rural Y
430 LP002377 Female No 1 Graduate Yes 8624 0.0 150.0 360.0 1.0 Semiurban Y
440 LP002408 Male No 0 Graduate No 3660 5064.0 187.0 360.0 1.0 Semiurban Y
450 LP002446 Male Yes 2 Not Graduate No 2309 1255.0 125.0 360.0 0.0 Rural N
460 LP002478 NaN Yes 0 Graduate Yes 2083 4083.0 160.0 360.0 NaN Semiurban Y
470 LP002515 Male Yes 1 Graduate Yes 3450 2079.0 162.0 360.0 1.0 Semiurban Y
480 LP002534 Female No 0 Not Graduate No 4350 0.0 154.0 360.0 1.0 Rural Y
490 LP002560 Male No 0 Not Graduate No 2699 2785.0 96.0 360.0 NaN Semiurban Y
500 LP002603 Female No 0 Graduate No 645 3683.0 113.0 480.0 1.0 Rural Y
510 LP002637 Male No 0 Not Graduate No 3598 1287.0 100.0 360.0 1.0 Rural N
520 LP002689 Male Yes 2 Not Graduate No 2192 1742.0 45.0 360.0 1.0 Semiurban Y
530 LP002717 Male Yes 0 Graduate No 1025 5500.0 216.0 360.0 NaN Rural Y
540 LP002741 Female Yes 1 Graduate No 4608 2845.0 140.0 180.0 1.0 Semiurban Y
550 LP002778 Male Yes 2 Graduate Yes 6633 0.0 NaN 360.0 0.0 Rural N
560 LP002807 Male Yes 2 Not Graduate No 3675 242.0 108.0 360.0 1.0 Semiurban Y
570 LP002842 Male Yes 1 Graduate No 3417 1750.0 186.0 360.0 1.0 Urban Y
580 LP002892 Male Yes 2 Graduate No 6540 0.0 205.0 360.0 1.0 Semiurban Y
590 LP002928 Male Yes 0 Graduate No 3000 3416.0 56.0 180.0 1.0 Semiurban Y
600 LP002949 Female No 3+ Graduate NaN 416 41667.0 350.0 180.0 NaN Urban N
610 LP002979 Male Yes 3+ Graduate No 4106 0.0 40.0 180.0 1.0 Rural Y

62 rows × 13 columns

Task 2


In [16]:
# Task 2: simple random sample of 10% of the rows.
# random.sample draws distinct positions (no row can be picked twice).
# `range` and `//` are used instead of `xrange` and `/` so the cell runs
# unchanged on both Python 2 and Python 3 (the sample size must be an int).
sampler = random.sample(range(len(data)), len(data) // 10)
# take() selects by position; sort_index() restores the original row order.
sampled_data = data.take(sampler).sort_index()
sampled_data


Out[16]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
0 LP001002 Male No 0 Graduate No 5849 0.000000 NaN 360.0 1.0 Urban Y
43 LP001131 Male Yes 0 Graduate No 3941 2336.000000 134.0 360.0 1.0 Semiurban Y
52 LP001164 Female No 0 Graduate No 4230 0.000000 112.0 360.0 1.0 Semiurban N
76 LP001256 Male No 0 Graduate No 3750 4750.000000 176.0 360.0 1.0 Urban N
77 LP001259 Male Yes 1 Graduate Yes 1000 3022.000000 110.0 360.0 1.0 Urban N
86 LP001280 Male Yes 2 Not Graduate No 3333 2000.000000 99.0 360.0 NaN Semiurban Y
92 LP001319 Male Yes 2 Not Graduate No 3273 1820.000000 81.0 360.0 1.0 Urban Y
98 LP001334 Male Yes 0 Not Graduate No 4188 0.000000 115.0 180.0 1.0 Semiurban Y
104 LP001357 Male NaN NaN Graduate No 3816 754.000000 160.0 360.0 1.0 Urban Y
107 LP001370 Male No 0 Not Graduate NaN 7333 0.000000 120.0 360.0 1.0 Rural N
117 LP001405 Male Yes 1 Graduate No 2214 1398.000000 85.0 360.0 NaN Urban Y
119 LP001422 Female No 0 Graduate No 10408 0.000000 259.0 360.0 1.0 Urban Y
133 LP001482 Male Yes 0 Graduate Yes 3459 0.000000 25.0 120.0 1.0 Semiurban Y
136 LP001489 Female Yes 0 Graduate No 4583 0.000000 84.0 360.0 1.0 Rural N
145 LP001514 Female Yes 0 Graduate No 2330 4486.000000 100.0 360.0 1.0 Semiurban Y
158 LP001546 Male No 0 Graduate NaN 2980 2083.000000 120.0 360.0 1.0 Rural Y
159 LP001552 Male Yes 0 Graduate No 4583 5625.000000 255.0 360.0 1.0 Semiurban Y
167 LP001578 Male Yes 0 Graduate No 2439 3333.000000 129.0 360.0 1.0 Rural Y
172 LP001586 Male Yes 3+ Not Graduate No 3522 0.000000 81.0 180.0 1.0 Rural N
174 LP001603 Male Yes 0 Not Graduate Yes 4344 736.000000 87.0 360.0 1.0 Semiurban N
176 LP001608 Male Yes 2 Graduate No 2045 1619.000000 101.0 360.0 1.0 Rural Y
183 LP001637 Male Yes 1 Graduate No 33846 0.000000 260.0 360.0 1.0 Semiurban N
186 LP001641 Male Yes 1 Graduate Yes 2178 0.000000 66.0 300.0 0.0 Rural N
188 LP001644 NaN Yes 0 Graduate Yes 674 5296.000000 168.0 360.0 1.0 Rural Y
197 LP001669 Female No 0 Not Graduate No 1907 2365.000000 120.0 NaN 1.0 Urban Y
198 LP001671 Female Yes 0 Graduate No 3416 2816.000000 113.0 360.0 NaN Semiurban Y
207 LP001698 Male No 0 Not Graduate No 3975 2531.000000 55.0 360.0 1.0 Rural Y
229 LP001761 Male No 0 Graduate Yes 6400 0.000000 200.0 360.0 1.0 Rural Y
236 LP001786 Male Yes 0 Graduate NaN 5746 0.000000 255.0 360.0 NaN Urban N
238 LP001790 Female No 1 Graduate No 3812 0.000000 112.0 360.0 1.0 Rural Y
... ... ... ... ... ... ... ... ... ... ... ... ... ...
270 LP001888 Female No 0 Graduate No 3237 0.000000 30.0 360.0 1.0 Urban Y
312 LP002006 Female No 0 Graduate No 2507 0.000000 56.0 360.0 1.0 Rural Y
321 LP002053 Male Yes 3+ Graduate No 4342 189.000000 124.0 360.0 1.0 Semiurban Y
335 LP002106 Male Yes NaN Graduate Yes 5503 4490.000000 70.0 NaN 1.0 Semiurban Y
337 LP002112 Male Yes 2 Graduate Yes 2500 4600.000000 176.0 360.0 1.0 Rural Y
343 LP002126 Male Yes 3+ Not Graduate No 3173 0.000000 74.0 360.0 1.0 Semiurban Y
361 LP002170 Male Yes 2 Graduate No 5000 3667.000000 236.0 360.0 1.0 Semiurban Y
372 LP002201 Male Yes 2 Graduate Yes 9323 7873.000000 380.0 300.0 1.0 Rural Y
378 LP002224 Male No 0 Graduate No 3069 0.000000 71.0 480.0 1.0 Urban N
403 LP002300 Female No 0 Not Graduate No 1963 0.000000 53.0 360.0 1.0 Semiurban Y
409 LP002317 Male Yes 3+ Graduate No 81000 0.000000 360.0 360.0 0.0 Rural N
419 LP002347 Male Yes 0 Graduate No 3246 1417.000000 138.0 360.0 1.0 Semiurban Y
428 LP002369 Male Yes 0 Graduate No 2920 16.120001 87.0 360.0 1.0 Rural Y
433 LP002387 Male Yes 0 Graduate No 2425 2340.000000 143.0 360.0 1.0 Semiurban Y
449 LP002444 Male No 1 Not Graduate Yes 2769 1542.000000 190.0 360.0 NaN Semiurban N
455 LP002455 Male Yes 2 Graduate No 3859 0.000000 96.0 360.0 1.0 Semiurban Y
476 LP002529 Male Yes 2 Graduate No 6700 1750.000000 230.0 300.0 1.0 Semiurban Y
478 LP002531 Male Yes 1 Graduate Yes 16667 2250.000000 86.0 360.0 1.0 Semiurban Y
479 LP002533 Male Yes 2 Graduate No 2947 1603.000000 NaN 360.0 1.0 Urban N
484 LP002543 Male Yes 2 Graduate No 8333 0.000000 246.0 360.0 1.0 Semiurban Y
491 LP002562 Male Yes 1 Not Graduate No 5333 1131.000000 186.0 360.0 NaN Urban Y
503 LP002618 Male Yes 1 Not Graduate No 4050 5302.000000 138.0 360.0 NaN Rural N
527 LP002706 Male Yes 1 Not Graduate No 5285 1430.000000 161.0 360.0 0.0 Semiurban Y
548 LP002776 Female No 0 Graduate No 5000 0.000000 103.0 360.0 0.0 Semiurban N
551 LP002784 Male Yes 1 Not Graduate No 2492 2375.000000 NaN 360.0 1.0 Rural Y
553 LP002788 Male Yes 0 Not Graduate No 2454 2333.000000 181.0 360.0 0.0 Urban N
573 LP002862 Male Yes 2 Not Graduate No 6125 1625.000000 187.0 480.0 1.0 Semiurban N
582 LP002894 Female Yes 0 Graduate No 3166 0.000000 36.0 360.0 1.0 Semiurban Y
590 LP002928 Male Yes 0 Graduate No 3000 3416.000000 56.0 180.0 1.0 Semiurban Y
599 LP002948 Male Yes 2 Graduate No 5780 0.000000 192.0 360.0 1.0 Urban Y

61 rows × 13 columns

Task 3


In [17]:
# Task 3, approved stratum: keep a random half of the rows with Loan_Status 'Y'.
data_Y = data.loc[data['Loan_Status'] == 'Y']
# Distinct positions within data_Y; `range` and `//` keep the cell valid on
# both Python 2 and Python 3 (the sample size must be an int).
sampler = random.sample(range(len(data_Y)), len(data_Y) // 2)

# take() is positional, so the positions refer to rows of data_Y, not of data.
sampled_Y = data_Y.take(sampler)
sampled_Y


Out[17]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
374 LP002209 Female No 0 Graduate NaN 2764 1459.000000 110.0 360.0 1.0 Urban Y
608 LP002974 Male Yes 0 Graduate No 3232 1950.000000 108.0 360.0 1.0 Rural Y
344 LP002128 Male Yes 2 Graduate NaN 2583 2330.000000 125.0 360.0 1.0 Rural Y
283 LP001917 Female No 0 Graduate No 1811 1666.000000 54.0 360.0 1.0 Urban Y
11 LP001027 Male Yes 2 Graduate NaN 2500 1840.000000 109.0 360.0 1.0 Urban Y
143 LP001507 Male Yes 0 Graduate No 2698 2034.000000 122.0 360.0 1.0 Semiurban Y
214 LP001716 Male Yes 0 Graduate No 3173 3021.000000 137.0 360.0 1.0 Urban Y
368 LP002190 Male Yes 1 Graduate No 6325 0.000000 175.0 360.0 1.0 Semiurban Y
444 LP002424 Male Yes 0 Graduate No 7333 8333.000000 175.0 300.0 NaN Rural Y
315 LP002031 Male Yes 1 Not Graduate No 3399 1640.000000 111.0 180.0 1.0 Urban Y
334 LP002103 NaN Yes 1 Graduate Yes 9833 1833.000000 182.0 180.0 1.0 Urban Y
559 LP002804 Female Yes 0 Graduate No 4180 2306.000000 182.0 360.0 1.0 Semiurban Y
74 LP001253 Male Yes 3+ Graduate Yes 5266 1774.000000 187.0 360.0 1.0 Semiurban Y
439 LP002407 Female Yes 0 Not Graduate Yes 7142 0.000000 138.0 360.0 1.0 Rural Y
331 LP002098 Male No 0 Graduate No 2935 0.000000 98.0 360.0 1.0 Semiurban Y
529 LP002716 Male No 0 Not Graduate No 6783 0.000000 130.0 360.0 1.0 Semiurban Y
244 LP001811 Male Yes 0 Not Graduate No 3406 4417.000000 123.0 360.0 1.0 Semiurban Y
488 LP002555 Male Yes 2 Graduate Yes 4583 2083.000000 160.0 360.0 1.0 Semiurban Y
79 LP001264 Male Yes 3+ Not Graduate Yes 3333 2166.000000 130.0 360.0 NaN Semiurban Y
114 LP001398 Male No 0 Graduate NaN 5050 0.000000 118.0 360.0 1.0 Semiurban Y
94 LP001325 Male No 0 Not Graduate No 3620 0.000000 25.0 120.0 1.0 Semiurban Y
506 LP002624 Male Yes 0 Graduate No 20833 6667.000000 480.0 360.0 NaN Urban Y
588 LP002925 NaN No 0 Graduate No 4750 0.000000 94.0 360.0 1.0 Semiurban Y
473 LP002522 Female No 0 Graduate Yes 2500 0.000000 93.0 360.0 NaN Urban Y
279 LP001908 Female Yes 0 Not Graduate No 4100 0.000000 124.0 360.0 NaN Rural Y
264 LP001872 Male No 0 Graduate Yes 5166 0.000000 128.0 360.0 1.0 Semiurban Y
178 LP001616 Male Yes 1 Graduate No 3750 0.000000 116.0 360.0 1.0 Semiurban Y
611 LP002983 Male Yes 1 Graduate No 8072 240.000000 253.0 360.0 1.0 Urban Y
217 LP001726 Male Yes 0 Graduate No 3727 1775.000000 131.0 360.0 1.0 Semiurban Y
306 LP001993 Female No 0 Graduate No 3762 1666.000000 135.0 360.0 1.0 Rural Y
... ... ... ... ... ... ... ... ... ... ... ... ... ...
255 LP001846 Female No 3+ Graduate No 3083 0.000000 255.0 360.0 1.0 Rural Y
101 LP001349 Male No 0 Graduate No 4843 3806.000000 151.0 360.0 1.0 Semiurban Y
470 LP002515 Male Yes 1 Graduate Yes 3450 2079.000000 162.0 360.0 1.0 Semiurban Y
60 LP001205 Male Yes 0 Graduate No 2500 3796.000000 120.0 360.0 1.0 Urban Y
39 LP001116 Male No 0 Not Graduate No 3748 1668.000000 110.0 360.0 1.0 Semiurban Y
432 LP002386 Male No 0 Graduate NaN 12876 0.000000 405.0 360.0 1.0 Semiurban Y
86 LP001280 Male Yes 2 Not Graduate No 3333 2000.000000 99.0 360.0 NaN Semiurban Y
49 LP001151 Female No 0 Graduate No 4000 2275.000000 144.0 360.0 1.0 Semiurban Y
528 LP002714 Male No 1 Not Graduate No 2679 1302.000000 94.0 360.0 1.0 Semiurban Y
337 LP002112 Male Yes 2 Graduate Yes 2500 4600.000000 176.0 360.0 1.0 Rural Y
197 LP001669 Female No 0 Not Graduate No 1907 2365.000000 120.0 NaN 1.0 Urban Y
278 LP001907 Male Yes 0 Graduate No 14583 0.000000 436.0 360.0 1.0 Semiurban Y
545 LP002767 Male Yes 0 Graduate No 2768 1950.000000 155.0 360.0 1.0 Rural Y
463 LP002489 Female No 1 Not Graduate NaN 5191 0.000000 132.0 360.0 1.0 Semiurban Y
70 LP001243 Male Yes 0 Graduate No 3208 3066.000000 172.0 360.0 1.0 Urban Y
557 LP002795 Male Yes 3+ Graduate Yes 10139 0.000000 260.0 360.0 1.0 Semiurban Y
10 LP001024 Male Yes 2 Graduate No 3200 700.000000 70.0 360.0 1.0 Urban Y
520 LP002689 Male Yes 2 Not Graduate No 2192 1742.000000 45.0 360.0 1.0 Semiurban Y
227 LP001758 Male Yes 2 Graduate No 6250 1695.000000 210.0 360.0 1.0 Semiurban Y
290 LP001936 Male Yes 0 Graduate No 3075 2416.000000 139.0 360.0 1.0 Rural Y
561 LP002813 Female Yes 1 Graduate Yes 19484 0.000000 600.0 360.0 1.0 Semiurban Y
392 LP002263 Male Yes 0 Graduate No 2583 2115.000000 120.0 360.0 NaN Urban Y
19 LP001041 Male Yes 0 Graduate NaN 2600 3500.000000 115.0 NaN 1.0 Urban Y
419 LP002347 Male Yes 0 Graduate No 3246 1417.000000 138.0 360.0 1.0 Semiurban Y
51 LP001157 Female No 0 Graduate No 3086 0.000000 120.0 360.0 1.0 Semiurban Y
332 LP002100 Male No NaN Graduate No 2833 0.000000 71.0 360.0 1.0 Urban Y
607 LP002964 Male Yes 2 Not Graduate No 3987 1411.000000 157.0 360.0 1.0 Rural Y
428 LP002369 Male Yes 0 Graduate No 2920 16.120001 87.0 360.0 1.0 Rural Y
126 LP001448 NaN Yes 3+ Graduate No 23803 0.000000 370.0 360.0 1.0 Rural Y
560 LP002807 Male Yes 2 Not Graduate No 3675 242.000000 108.0 360.0 1.0 Semiurban Y

211 rows × 13 columns


In [18]:
# Task 3, rejected stratum: keep a random 10% of the rows with Loan_Status 'N'.
data_N = data.loc[data['Loan_Status'] == 'N']
# random.sample draws distinct positions. np.random.randint samples with
# replacement, so it could return the same row more than once and yield
# fewer unique rows than intended. `//` keeps the sample size an int on
# Python 3 as well as Python 2.
sampler = random.sample(range(len(data_N)), len(data_N) // 10)
# take() is positional, so the positions refer to rows of data_N, not of data.
sampled_N = data_N.take(sampler)
sampled_N


Out[18]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
153 LP001532 Male Yes 2 Not Graduate No 2281 0.0 113.0 360.0 1.0 Rural N
76 LP001256 Male No 0 Graduate No 3750 4750.0 176.0 360.0 1.0 Urban N
191 LP001656 Male No 0 Graduate No 12000 0.0 164.0 360.0 1.0 Semiurban N
78 LP001263 Male Yes 3+ Graduate No 3167 4000.0 180.0 300.0 0.0 Semiurban N
367 LP002188 Male No 0 Graduate No 5124 0.0 124.0 NaN 0.0 Rural N
195 LP001665 Male Yes 1 Graduate No 3125 2583.0 170.0 360.0 1.0 Semiurban N
36 LP001109 Male Yes 0 Graduate No 1828 1330.0 100.0 NaN 0.0 Urban N
591 LP002931 Male Yes 2 Graduate Yes 6000 0.0 205.0 240.0 1.0 Semiurban N
308 LP001996 Male No 0 Graduate No 20233 0.0 480.0 360.0 1.0 Rural N
63 LP001213 Male Yes 1 Graduate No 4945 0.0 NaN 360.0 0.0 Rural N
541 LP002743 Female No 0 Graduate No 2138 0.0 99.0 360.0 0.0 Semiurban N
409 LP002317 Male Yes 3+ Graduate No 81000 0.0 360.0 360.0 0.0 Rural N
583 LP002898 Male Yes 1 Graduate No 1880 0.0 61.0 360.0 NaN Rural N
366 LP002187 Male No 0 Graduate No 2500 0.0 96.0 480.0 1.0 Semiurban N
34 LP001100 Male No 3+ Graduate No 12500 3000.0 320.0 360.0 1.0 Rural N
30 LP001091 Male Yes 1 Graduate NaN 4166 3369.0 201.0 360.0 NaN Urban N
17 LP001036 Female No 0 Graduate No 3510 0.0 76.0 360.0 0.0 Urban N
513 LP002648 Male Yes 0 Graduate No 2130 6666.0 70.0 180.0 1.0 Semiurban N
423 LP002362 Male Yes 1 Graduate No 7250 1667.0 110.0 NaN 0.0 Urban N

In [19]:
# Stack the two per-class samples and put the rows back in original index order.
strata = [sampled_Y, sampled_N]
sampled_data = pd.concat(strata).sort_index()
sampled_data


Out[19]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
5 LP001011 Male Yes 2 Graduate Yes 5417 4196.0 267.0 360.0 1.0 Urban Y
10 LP001024 Male Yes 2 Graduate No 3200 700.0 70.0 360.0 1.0 Urban Y
11 LP001027 Male Yes 2 Graduate NaN 2500 1840.0 109.0 360.0 1.0 Urban Y
14 LP001030 Male Yes 2 Graduate No 1299 1086.0 17.0 120.0 1.0 Urban Y
16 LP001034 Male No 1 Not Graduate No 3596 0.0 100.0 240.0 NaN Urban Y
17 LP001036 Female No 0 Graduate No 3510 0.0 76.0 360.0 0.0 Urban N
19 LP001041 Male Yes 0 Graduate NaN 2600 3500.0 115.0 NaN 1.0 Urban Y
21 LP001046 Male Yes 1 Graduate No 5955 5625.0 315.0 360.0 1.0 Urban Y
25 LP001066 Male Yes 0 Graduate Yes 9560 0.0 191.0 360.0 1.0 Semiurban Y
26 LP001068 Male Yes 0 Graduate No 2799 2253.0 122.0 360.0 1.0 Semiurban Y
29 LP001087 Female No 2 Graduate NaN 3750 2083.0 120.0 360.0 1.0 Semiurban Y
30 LP001091 Male Yes 1 Graduate NaN 4166 3369.0 201.0 360.0 NaN Urban N
33 LP001098 Male Yes 0 Graduate No 3500 1667.0 114.0 360.0 1.0 Semiurban Y
34 LP001100 Male No 3+ Graduate No 12500 3000.0 320.0 360.0 1.0 Rural N
36 LP001109 Male Yes 0 Graduate No 1828 1330.0 100.0 NaN 0.0 Urban N
38 LP001114 Male No 0 Graduate No 4166 7210.0 184.0 360.0 1.0 Urban Y
39 LP001116 Male No 0 Not Graduate No 3748 1668.0 110.0 360.0 1.0 Semiurban Y
44 LP001136 Male Yes 0 Not Graduate Yes 4695 0.0 96.0 NaN 1.0 Urban Y
45 LP001137 Female No 0 Graduate No 3410 0.0 88.0 NaN 1.0 Urban Y
49 LP001151 Female No 0 Graduate No 4000 2275.0 144.0 360.0 1.0 Semiurban Y
51 LP001157 Female No 0 Graduate No 3086 0.0 120.0 360.0 1.0 Semiurban Y
56 LP001195 Male Yes 0 Graduate No 2132 1591.0 96.0 360.0 1.0 Semiurban Y
60 LP001205 Male Yes 0 Graduate No 2500 3796.0 120.0 360.0 1.0 Urban Y
61 LP001206 Male Yes 3+ Graduate No 3029 0.0 99.0 360.0 1.0 Urban Y
63 LP001213 Male Yes 1 Graduate No 4945 0.0 NaN 360.0 0.0 Rural N
67 LP001233 Male Yes 1 Graduate No 10750 0.0 312.0 360.0 1.0 Urban Y
70 LP001243 Male Yes 0 Graduate No 3208 3066.0 172.0 360.0 1.0 Urban Y
74 LP001253 Male Yes 3+ Graduate Yes 5266 1774.0 187.0 360.0 1.0 Semiurban Y
76 LP001256 Male No 0 Graduate No 3750 4750.0 176.0 360.0 1.0 Urban N
78 LP001263 Male Yes 3+ Graduate No 3167 4000.0 180.0 300.0 0.0 Semiurban N
... ... ... ... ... ... ... ... ... ... ... ... ... ...
540 LP002741 Female Yes 1 Graduate No 4608 2845.0 140.0 180.0 1.0 Semiurban Y
541 LP002743 Female No 0 Graduate No 2138 0.0 99.0 360.0 0.0 Semiurban N
542 LP002753 Female No 1 Graduate NaN 3652 0.0 95.0 360.0 1.0 Semiurban Y
543 LP002755 Male Yes 1 Not Graduate No 2239 2524.0 128.0 360.0 1.0 Urban Y
545 LP002767 Male Yes 0 Graduate No 2768 1950.0 155.0 360.0 1.0 Rural Y
547 LP002772 Male No 0 Graduate No 2526 1783.0 145.0 360.0 1.0 Rural Y
551 LP002784 Male Yes 1 Not Graduate No 2492 2375.0 NaN 360.0 1.0 Rural Y
555 LP002792 Male Yes 1 Graduate No 5468 1032.0 26.0 360.0 1.0 Semiurban Y
556 LP002794 Female No 0 Graduate No 2667 1625.0 84.0 360.0 NaN Urban Y
557 LP002795 Male Yes 3+ Graduate Yes 10139 0.0 260.0 360.0 1.0 Semiurban Y
558 LP002798 Male Yes 0 Graduate No 3887 2669.0 162.0 360.0 1.0 Semiurban Y
559 LP002804 Female Yes 0 Graduate No 4180 2306.0 182.0 360.0 1.0 Semiurban Y
560 LP002807 Male Yes 2 Not Graduate No 3675 242.0 108.0 360.0 1.0 Semiurban Y
561 LP002813 Female Yes 1 Graduate Yes 19484 0.0 600.0 360.0 1.0 Semiurban Y
572 LP002855 Male Yes 2 Graduate No 16666 0.0 275.0 360.0 1.0 Urban Y
579 LP002888 Male No 0 Graduate NaN 3182 2917.0 161.0 360.0 1.0 Urban Y
580 LP002892 Male Yes 2 Graduate No 6540 0.0 205.0 360.0 1.0 Semiurban Y
583 LP002898 Male Yes 1 Graduate No 1880 0.0 61.0 360.0 NaN Rural N
587 LP002917 Female No 0 Not Graduate No 2165 0.0 70.0 360.0 1.0 Semiurban Y
588 LP002925 NaN No 0 Graduate No 4750 0.0 94.0 360.0 1.0 Semiurban Y
590 LP002928 Male Yes 0 Graduate No 3000 3416.0 56.0 180.0 1.0 Semiurban Y
591 LP002931 Male Yes 2 Graduate Yes 6000 0.0 205.0 240.0 1.0 Semiurban N
593 LP002936 Male Yes 0 Graduate No 3859 3300.0 142.0 180.0 1.0 Rural Y
594 LP002938 Male Yes 0 Graduate Yes 16120 0.0 260.0 360.0 1.0 Urban Y
599 LP002948 Male Yes 2 Graduate No 5780 0.0 192.0 360.0 1.0 Urban Y
602 LP002953 Male Yes 3+ Graduate No 5703 0.0 128.0 360.0 1.0 Urban Y
606 LP002961 Male Yes 1 Graduate No 3400 2500.0 173.0 360.0 1.0 Semiurban Y
607 LP002964 Male Yes 2 Not Graduate No 3987 1411.0 157.0 360.0 1.0 Rural Y
608 LP002974 Male Yes 0 Graduate No 3232 1950.0 108.0 360.0 1.0 Rural Y
611 LP002983 Male Yes 1 Graduate No 8072 240.0 253.0 360.0 1.0 Urban Y

230 rows × 13 columns

Big File


In [20]:
# Big-file variant: stream the CSV in chunks and keep each row independently
# with a probability that depends on its Loan_Status.
np.random.seed(12345)
# Probability of keeping a row, per Loan_Status value. Rows with any other
# status (including missing) are never kept.
keep_prob = {'Y': 0.5, 'N': 0.1}
chunker = pd.read_csv('../data/loan/train.csv', 
                      chunksize = 1000)
sampled_chunker = []
for piece in chunker:
    # One uniform draw per row, in row order, so the random stream is consumed
    # exactly as a per-row loop would consume it.
    draws = np.random.random(len(piece))
    # Unknown statuses map to NaN; a threshold of 0.0 can never be beaten
    # because the draws lie in [0, 1).
    thresholds = piece['Loan_Status'].map(keep_prob).fillna(0.0).values
    # Boolean mask keeps the chunk's own column dtypes and avoids iterrows().
    sampled_chunker.append(piece[draws < thresholds])
if sampled_chunker:
    sampled_data = pd.concat(sampled_chunker).sort_index()
else:
    # No chunks were read (empty input): keep the result an empty frame.
    sampled_data = pd.DataFrame()
sampled_data


Out[20]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
2 LP001005 Male Yes 0 Graduate Yes 3000 0.0 66.0 360.0 1.0 Urban Y
3 LP001006 Male Yes 0 Not Graduate No 2583 2358.0 120.0 360.0 1.0 Urban Y
12 LP001028 Male Yes 2 Graduate No 3073 8106.0 200.0 360.0 1.0 Urban Y
14 LP001030 Male Yes 2 Graduate No 1299 1086.0 17.0 120.0 1.0 Urban Y
29 LP001087 Female No 2 Graduate NaN 3750 2083.0 120.0 360.0 1.0 Semiurban Y
30 LP001091 Male Yes 1 Graduate NaN 4166 3369.0 201.0 360.0 NaN Urban N
33 LP001098 Male Yes 0 Graduate No 3500 1667.0 114.0 360.0 1.0 Semiurban Y
37 LP001112 Female Yes 0 Graduate No 3667 1459.0 144.0 360.0 1.0 Semiurban Y
43 LP001131 Male Yes 0 Graduate No 3941 2336.0 134.0 360.0 1.0 Semiurban Y
44 LP001136 Male Yes 0 Not Graduate Yes 4695 0.0 96.0 NaN 1.0 Urban Y
45 LP001137 Female No 0 Graduate No 3410 0.0 88.0 NaN 1.0 Urban Y
46 LP001138 Male Yes 1 Graduate No 5649 0.0 44.0 360.0 1.0 Urban Y
47 LP001144 Male Yes 0 Graduate No 5821 0.0 144.0 360.0 1.0 Urban Y
49 LP001151 Female No 0 Graduate No 4000 2275.0 144.0 360.0 1.0 Semiurban Y
51 LP001157 Female No 0 Graduate No 3086 0.0 120.0 360.0 1.0 Semiurban Y
55 LP001194 Male Yes 2 Graduate No 2708 1167.0 97.0 360.0 1.0 Semiurban Y
56 LP001195 Male Yes 0 Graduate No 2132 1591.0 96.0 360.0 1.0 Semiurban Y
60 LP001205 Male Yes 0 Graduate No 2500 3796.0 120.0 360.0 1.0 Urban Y
61 LP001206 Male Yes 3+ Graduate No 3029 0.0 99.0 360.0 1.0 Urban Y
71 LP001245 Male Yes 2 Not Graduate Yes 1875 1875.0 97.0 360.0 1.0 Semiurban Y
72 LP001248 Male No 0 Graduate No 3500 0.0 81.0 300.0 1.0 Semiurban Y
84 LP001275 Male Yes 1 Graduate No 3988 0.0 50.0 240.0 1.0 Urban Y
85 LP001279 Male No 0 Graduate No 2366 2531.0 136.0 360.0 1.0 Semiurban Y
86 LP001280 Male Yes 2 Not Graduate No 3333 2000.0 99.0 360.0 NaN Semiurban Y
87 LP001282 Male Yes 0 Graduate No 2500 2118.0 104.0 360.0 1.0 Semiurban Y
89 LP001310 Male Yes 0 Graduate No 5695 4167.0 175.0 360.0 1.0 Semiurban Y
94 LP001325 Male No 0 Not Graduate No 3620 0.0 25.0 120.0 1.0 Semiurban Y
96 LP001327 Female Yes 0 Graduate No 2484 2302.0 137.0 360.0 1.0 Semiurban Y
99 LP001343 Male Yes 0 Graduate No 1759 3541.0 131.0 360.0 1.0 Semiurban Y
100 LP001345 Male Yes 2 Not Graduate No 4288 3263.0 133.0 180.0 1.0 Urban Y
... ... ... ... ... ... ... ... ... ... ... ... ... ...
542 LP002753 Female No 1 Graduate NaN 3652 0.0 95.0 360.0 1.0 Semiurban Y
543 LP002755 Male Yes 1 Not Graduate No 2239 2524.0 128.0 360.0 1.0 Urban Y
544 LP002757 Female Yes 0 Not Graduate No 3017 663.0 102.0 360.0 NaN Semiurban Y
545 LP002767 Male Yes 0 Graduate No 2768 1950.0 155.0 360.0 1.0 Rural Y
549 LP002777 Male Yes 0 Graduate No 2785 2016.0 110.0 360.0 1.0 Rural Y
551 LP002784 Male Yes 1 Not Graduate No 2492 2375.0 NaN 360.0 1.0 Rural Y
552 LP002785 Male Yes 1 Graduate No 3333 3250.0 158.0 360.0 1.0 Urban Y
555 LP002792 Male Yes 1 Graduate No 5468 1032.0 26.0 360.0 1.0 Semiurban Y
557 LP002795 Male Yes 3+ Graduate Yes 10139 0.0 260.0 360.0 1.0 Semiurban Y
558 LP002798 Male Yes 0 Graduate No 3887 2669.0 162.0 360.0 1.0 Semiurban Y
560 LP002807 Male Yes 2 Not Graduate No 3675 242.0 108.0 360.0 1.0 Semiurban Y
561 LP002813 Female Yes 1 Graduate Yes 19484 0.0 600.0 360.0 1.0 Semiurban Y
562 LP002820 Male Yes 0 Graduate No 5923 2054.0 211.0 360.0 1.0 Rural Y
566 LP002836 Male No 0 Graduate No 3333 0.0 70.0 360.0 1.0 Urban Y
569 LP002841 Male Yes 0 Graduate No 3166 2064.0 104.0 360.0 0.0 Urban N
574 LP002863 Male Yes 3+ Graduate No 6406 0.0 150.0 360.0 1.0 Semiurban N
577 LP002874 Male No 0 Graduate No 3229 2739.0 110.0 360.0 1.0 Urban Y
579 LP002888 Male No 0 Graduate NaN 3182 2917.0 161.0 360.0 1.0 Urban Y
580 LP002892 Male Yes 2 Graduate No 6540 0.0 205.0 360.0 1.0 Semiurban Y
581 LP002893 Male No 0 Graduate No 1836 33837.0 90.0 360.0 1.0 Urban N
587 LP002917 Female No 0 Not Graduate No 2165 0.0 70.0 360.0 1.0 Semiurban Y
588 LP002925 NaN No 0 Graduate No 4750 0.0 94.0 360.0 1.0 Semiurban Y
592 LP002933 NaN No 3+ Graduate Yes 9357 0.0 292.0 360.0 1.0 Semiurban Y
595 LP002940 Male No 0 Not Graduate No 3833 0.0 110.0 360.0 1.0 Rural Y
599 LP002948 Male Yes 2 Graduate No 5780 0.0 192.0 360.0 1.0 Urban Y
603 LP002958 Male No 0 Graduate No 3676 4301.0 172.0 360.0 1.0 Rural Y
608 LP002974 Male Yes 0 Graduate No 3232 1950.0 108.0 360.0 1.0 Rural Y
609 LP002978 Female No 0 Graduate No 2900 0.0 71.0 360.0 1.0 Rural Y
611 LP002983 Male Yes 1 Graduate No 8072 240.0 253.0 360.0 1.0 Urban Y
612 LP002984 Male Yes 2 Graduate No 7583 0.0 187.0 360.0 1.0 Urban Y

228 rows × 13 columns