In [47]:
import pandas as pd
Data Import #Hint wil take some time
fraud=pd.read_csv("https://packages.revolutionanalytics.com/datasets/ccFraud.csv")
In [48]:
import os as os
In [49]:
os.getcwd()
Out[49]:
'C:\\Users\\Dell\\Downloads'
In [50]:
os.chdir('C:\\Users\\Dell\\Downloads')
In [51]:
os.listdir()
Out[51]:
['140749_2017.pdf',
'2011-F01-0700-Rev4-MDDS.XLSX',
'20150817143155.pdf',
'20160111060911.pdf',
'20170214052225.pdf',
'861415_10151432783238421_2124270505_o (1).jpg',
'861415_10151432783238421_2124270505_o.jpg',
'AirPassengers.csv',
'ajayo.jpg',
'Alison Python Invoice - Sheet1.pdf',
'Alison SAS Invoice - Sheet1.pdf',
'Allison Interview Jones Invoice - Sheet1.pdf',
'Anaconda3-4.2.0-Windows-x86_64.exe',
'apachehttpd.exe',
'April invoice adaptive analytics - Sheet1.pdf',
'Assignment14_BusinessAnalytics (1).docx',
'Assignment14_BusinessAnalytics.docx',
'Assignment15_BusinessAnalytics.docx',
'Assignment16_BusinessAnalytics (1).docx',
'Assignment16_BusinessAnalytics (2).docx',
'Assignment16_BusinessAnalytics.docx',
'aug ust 2008.JPG',
'avast_free_antivirus_setup_online.exe',
'avinash_ltv.zip',
'BigDiamonds.csv',
'BigDiamonds.csv (1).zip',
'BigDiamonds.csv (2)',
'BigDiamonds.csv (2).zip',
'BigDiamonds.csv.zip',
'Boston (1).csv',
'Boston.csv',
'CAM- Ajay Ohri (1).pdf',
'CAM- Ajay Ohri.pdf',
'camtasia.exe',
'ccFraud.csv',
'Certificate of Incorporation - U74999DL2015PTC282030 (26 June 2015).pdf',
'CHAP1-6PythonforRUsersAnapproachforDataScience.docx',
'chapter+3+_+spark.html',
'chi+square+test.ipynb',
'chromeinstall-8u111.exe',
'Cisco_WebEx_Add-On.exe',
'class2.csv',
'Collabera Invoice (1).pdf',
'Collabera Invoice.pdf',
'Collectcent Invoice.pdf',
'college degrees.pdf',
'DAP 1.pdf',
'DAP 1.pptx',
'DAP 6 RDBMS and SQL.pdf',
'DAP 6 RDBMS and SQL.pptx',
'data+exploration.ipynb',
'data+manipulation.ipynb',
'data1.csv',
'datasets.csv',
'Decision Trees.pdf',
'DecisionStatsOfferLetter.docx',
'DecisionStatsRelievingLetter.docx',
'descriptive+stats+in+Python.ipynb',
'desktop.ini',
'Diamond (1).csv',
'Diamond (2).csv',
'Diamond (3).csv',
'Diamond (4).csv',
'Diamond (5).csv',
'Diamond (6).csv',
'Diamond.csv',
'DropboxInstaller.exe',
'edb_npgsql.exe',
'edb_pgjdbc.exe',
'edb_psqlodbc.exe',
'edb_psqlodbc.exe-20170203172812',
'edb_psqlodbc.exe-20170307203617',
'final invoice edureka - Sheet1.pdf',
'FinalPythonforRUsersAnapproachforDataScience (1).docx',
'FinalPythonforRUsersAnapproachforDataScience (2).docx',
'FinalPythonforRUsersAnapproachforDataScience (3).docx',
'FinalPythonforRUsersAnapproachforDataScience (4).docx',
'FinalPythonforRUsersAnapproachforDataScience.docx',
'final_webinar (1).pdf',
'final_webinar.pdf',
'Git-2.11.0-64-bit.exe',
'Git-2.12.0-64-bit.exe',
'GitHubSetup (1).exe',
'GitHubSetup (2).exe',
'GitHubSetup.exe',
'GOMAUDIOGLOBALSETUP.EXE',
'HP Downloads',
'HPSupportSolutionsFramework-12.5.32.203.exe',
'image.png',
'IMS PROSCHOOL Workshop.pptx.pdf',
'IMS PROSCHOOL Workshop.pptx.pptx',
'Introduction to SAS (1).pdf',
'Introduction to SAS Part 1 (1).pdf',
'Introduction to SAS Part 1.pdf',
'Introduction to SAS.pdf',
'Invoice for Digital Vidya.pdf',
'Invoice for Weekendr.pdf',
'Invoice format - Ajay Ohri CONTATA (1).xls',
'Invoice format - Ajay Ohri CONTATA.xls',
'invoice rapid miner.pdf',
'iris2 (1).ipynb',
'iris2 (2).ipynb',
'iris2.ipynb',
'January invoice Indicus .pdf',
'June AV Invoice - Sheet1.pdf',
'Lecture 6 - KNN & Naive Bayes.ppt',
'Local Disk (C) - Shortcut.lnk',
'logistic regression - script for ppt.R',
'logistic_regression_-_script_for_ppt.html',
'March invoice Indicus - Sheet1.pdf',
'mongodb-win32-x86_64-2008plus-ssl-3.4.2-signed.msi',
'mongodb-win32-x86_64-3.4.2-signed.msi',
'mtcarslm.R',
'nltk.ipynb',
'notebook-Copy1.html',
'Offer Letter - Ajay Ohri (1).pdf',
'Offer Letter - Ajay Ohri.pdf',
'Other Data Mining Methods (1).pdf',
'Other Data Mining Methods.pdf',
'output1 (1).xls',
'output1 (2).xls',
'output1.xls',
'passport image.pdf',
'Pawconinvoice2016.pdf',
'Pawconinvoice2017 (1).pdf',
'Pawconinvoice2017 (2).pdf',
'Pawconinvoice2017 (3).pdf',
'Pawconinvoice2017.pdf',
'Payslip Feb 2016 - Sheet1.pdf',
'Payslip Feb 2016.pdf',
'Payslip Format Decisionstats - Sheet1.pdf',
'Payslip Jan 2016 - Sheet1.pdf',
'Payslip Jan 2016.pdf',
'Payslip March 2016 - Sheet1.pdf',
'Payslip March 2016.pdf',
'pgd.csv',
'postgresql-9.6.1-1-windows-x64.exe',
'Program 1-results.rtf',
'protein.csv',
'python+with+postgres (1).ipynb',
'python+with+postgres.ipynb',
'R-3.3.2-win.exe',
'R-3.3.3-win.exe',
'RCertificationExam.pdf',
'reg+model.ipynb',
'Revision - Business Analytics (1).pdf',
'Revision - Business Analytics.pdf',
'RidingMowers.csv',
'rsconnect',
'RStudio-1.0.136.exe',
'Salary Slip, Feb 2016.pdf',
'Salary Slip, Jan 2016.pdf',
'Salary Slip, March 2016 (1).pdf',
'Salary Slip, March 2016 (2).pdf',
'Salary Slip, March 2016.pdf',
'sales-of-shampoo-over-a-three-ye.csv',
'SAS part 2.pdf',
'SAS Part 3.pdf',
'sas-university-edition-107140.pdf',
'Scan0095.pdf',
'Scanned Invoice for Collabera.pdf',
'Screenshot 2017-01-23 12.36.55.png',
'September invoice adaptive analytics - Sheet1.pdf',
'Sollers January.pdf',
'sqlalchemy.ipynb',
'stackoverflow-dump-analysis.html',
'Sunstone.pdf',
'Tableau.pdf',
'TableauPublicDesktop-64bit-10-1-3.exe',
'TableauPublicDesktop-64bit-10-1-4.exe',
'telecom.csv',
'TelecomServiceProviderCaseStudy.pdf',
'Text Mining (1).pdf',
'Text Mining.pdf',
'third.sas7bdat',
'Time Series Forecasting (1).pdf',
'Time Series Forecasting.pdf',
'ts.html',
'ts.R',
'Unconfirmed 373974.crdownload',
'Unconfirmed 376562.crdownload',
'Unconfirmed 376991.crdownload',
'Unconfirmed 930917.crdownload',
'Unconfirmed 950045.crdownload',
'unvbasicvapp__9411008__ova__en__sp0__1.ova.crdownload',
'VirtualBox-5.1.8-111374-Win (1).exe',
'VirtualBox-5.1.8-111374-Win.exe',
'Webinar for Business Analytics.pdf',
'WhatsApp Image 2017-02-18 at 08.42.55 (1).jpeg',
'WhatsApp Image 2017-02-18 at 08.42.55.jpeg']
In [52]:
fraud=pd.read_csv("ccFraud.csv")
In [53]:
fraud2=fraud
In [54]:
fraud3=fraud.copy()
In [55]:
type(fraud)
Out[55]:
pandas.core.frame.DataFrame
In [56]:
fraud.columns
Out[56]:
Index(['custID', 'gender', 'state', 'cardholder', 'balance', 'numTrans',
'numIntlTrans', 'creditLine', 'fraudRisk'],
dtype='object')
In [57]:
fraud.dtypes
Out[57]:
custID int64
gender int64
state int64
cardholder int64
balance int64
numTrans int64
numIntlTrans int64
creditLine int64
fraudRisk int64
dtype: object
In [58]:
fraud.shape
Out[58]:
(10000000, 9)
In [59]:
fraud.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 9 columns):
custID int64
gender int64
state int64
cardholder int64
balance int64
numTrans int64
numIntlTrans int64
creditLine int64
fraudRisk int64
dtypes: int64(9)
memory usage: 686.6 MB
In [60]:
len(fraud)
Out[60]:
10000000
In [61]:
len(fraud.columns)
Out[61]:
9
In [62]:
fraud.head()
Out[62]:
custID
gender
state
cardholder
balance
numTrans
numIntlTrans
creditLine
fraudRisk
0
1
1
35
1
3000
4
14
2
0
1
2
2
2
1
0
9
0
18
0
2
3
2
2
1
0
27
9
16
0
3
4
1
15
1
0
12
0
5
0
4
5
1
46
1
0
11
16
7
0
In [63]:
fraud.tail()
Out[63]:
custID
gender
state
cardholder
balance
numTrans
numIntlTrans
creditLine
fraudRisk
9999995
9999996
1
37
1
0
10
0
9
0
9999996
9999997
1
16
1
0
33
2
4
0
9999997
9999998
1
24
1
9000
38
0
8
0
9999998
9999999
1
28
1
7000
20
19
6
0
9999999
10000000
1
23
1
0
13
0
7
0
In [64]:
fraud.ix[0:10]
Out[64]:
custID
gender
state
cardholder
balance
numTrans
numIntlTrans
creditLine
fraudRisk
0
1
1
35
1
3000
4
14
2
0
1
2
2
2
1
0
9
0
18
0
2
3
2
2
1
0
27
9
16
0
3
4
1
15
1
0
12
0
5
0
4
5
1
46
1
0
11
16
7
0
5
6
2
44
2
5546
21
0
13
0
6
7
1
3
1
2000
41
0
1
0
7
8
1
10
1
6016
20
3
6
0
8
9
2
32
1
2428
4
10
22
0
9
10
1
23
1
0
18
56
5
0
10
11
1
46
1
4601
54
0
4
0
In [65]:
fraud[['gender','state','cardholder']]
Out[65]:
gender
state
cardholder
0
1
35
1
1
2
2
1
2
2
2
1
3
1
15
1
4
1
46
1
5
2
44
2
6
1
3
1
7
1
10
1
8
2
32
1
9
1
23
1
10
1
46
1
11
1
10
1
12
1
6
1
13
2
38
1
14
1
27
1
15
1
44
1
16
2
18
1
17
1
35
1
18
1
5
1
19
2
31
1
20
1
39
1
21
1
34
1
22
1
5
1
23
2
21
1
24
1
25
1
25
2
29
1
26
1
38
1
27
1
9
1
28
2
20
1
29
2
49
1
...
...
...
...
9999970
1
10
1
9999971
2
40
1
9999972
1
46
1
9999973
1
10
2
9999974
1
25
1
9999975
2
48
1
9999976
2
4
1
9999977
1
35
1
9999978
1
44
1
9999979
1
6
1
9999980
1
10
1
9999981
1
33
1
9999982
1
44
1
9999983
2
13
1
9999984
1
39
1
9999985
2
45
2
9999986
1
23
1
9999987
2
24
1
9999988
1
18
1
9999989
2
4
1
9999990
1
16
2
9999991
2
36
1
9999992
1
38
1
9999993
2
43
1
9999994
1
16
2
9999995
1
37
1
9999996
1
16
1
9999997
1
24
1
9999998
1
28
1
9999999
1
23
1
10000000 rows × 3 columns
In [66]:
fraud.iloc[:,1:3]
Out[66]:
gender
state
0
1
35
1
2
2
2
2
2
3
1
15
4
1
46
5
2
44
6
1
3
7
1
10
8
2
32
9
1
23
10
1
46
11
1
10
12
1
6
13
2
38
14
1
27
15
1
44
16
2
18
17
1
35
18
1
5
19
2
31
20
1
39
21
1
34
22
1
5
23
2
21
24
1
25
25
2
29
26
1
38
27
1
9
28
2
20
29
2
49
...
...
...
9999970
1
10
9999971
2
40
9999972
1
46
9999973
1
10
9999974
1
25
9999975
2
48
9999976
2
4
9999977
1
35
9999978
1
44
9999979
1
6
9999980
1
10
9999981
1
33
9999982
1
44
9999983
2
13
9999984
1
39
9999985
2
45
9999986
1
23
9999987
2
24
9999988
1
18
9999989
2
4
9999990
1
16
9999991
2
36
9999992
1
38
9999993
2
43
9999994
1
16
9999995
1
37
9999996
1
16
9999997
1
24
9999998
1
28
9999999
1
23
10000000 rows × 2 columns
In [67]:
fraud.iloc[0:10,1:3]
Out[67]:
gender
state
0
1
35
1
2
2
2
2
2
3
1
15
4
1
46
5
2
44
6
1
3
7
1
10
8
2
32
9
1
23
In [68]:
import numpy as np
In [69]:
np.random.choice(10,2)
Out[69]:
array([1, 0])
In [70]:
np.random.choice(len(fraud),0.000001*len(fraud))
C:\Users\Dell\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
if __name__ == '__main__':
Out[70]:
array([7100177, 4058088, 2302568, 9002606, 3792183, 888579, 6465822,
3062360, 3663548, 5578048])
In [71]:
b=np.random.choice(len(fraud),0.000001*len(fraud))
C:\Users\Dell\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
if __name__ == '__main__':
In [72]:
fraud.ix[b]
Out[72]:
custID
gender
state
cardholder
balance
numTrans
numIntlTrans
creditLine
fraudRisk
7461768
7461769
1
21
1
3000
15
2
2
0
5089359
5089360
1
5
1
8000
80
35
7
1
8548436
8548437
1
5
1
5000
3
0
4
0
3997580
3997581
1
32
1
10000
47
1
9
0
1776937
1776938
1
43
1
3000
100
0
2
0
3590544
3590545
2
5
1
4000
52
15
3
0
1824320
1824321
1
35
1
1118
87
8
5
0
906910
906911
1
10
1
6000
6
0
5
0
2265882
2265883
1
5
1
1426
17
0
8
0
7983346
7983347
2
13
1
3000
43
8
2
0
In [73]:
d=fraud.ix[b]
In [74]:
d.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 7461768 to 7983346
Data columns (total 9 columns):
custID 10 non-null int64
gender 10 non-null int64
state 10 non-null int64
cardholder 10 non-null int64
balance 10 non-null int64
numTrans 10 non-null int64
numIntlTrans 10 non-null int64
creditLine 10 non-null int64
fraudRisk 10 non-null int64
dtypes: int64(9)
memory usage: 800.0 bytes
In [75]:
del d
In [76]:
del fraud['gender']
In [77]:
fraud.columns
Out[77]:
Index(['custID', 'state', 'cardholder', 'balance', 'numTrans', 'numIntlTrans',
'creditLine', 'fraudRisk'],
dtype='object')
In [78]:
fraud.drop(fraud.columns[[1, 5]], axis=1, inplace=True)
In [79]:
fraud.columns
Out[79]:
Index(['custID', 'cardholder', 'balance', 'numTrans', 'creditLine',
'fraudRisk'],
dtype='object')
In [80]:
fraud.drop(fraud.columns[[1,2,3]], axis=1, inplace=True)
In [81]:
fraud.columns
Out[81]:
Index(['custID', 'creditLine', 'fraudRisk'], dtype='object')
In [82]:
fraud.head()
Out[82]:
custID
creditLine
fraudRisk
0
1
2
0
1
2
18
0
2
3
16
0
3
4
5
0
4
5
7
0
In [83]:
fraud.drop(fraud.index[[1,3]])
Out[83]:
custID
creditLine
fraudRisk
0
1
2
0
2
3
16
0
4
5
7
0
5
6
13
0
6
7
1
0
7
8
6
0
8
9
22
0
9
10
5
0
10
11
4
0
11
12
2
0
12
13
4
0
13
14
8
0
14
15
17
0
15
16
5
0
16
17
13
0
17
18
8
0
18
19
8
0
19
20
8
0
20
21
3
0
21
22
3
0
22
23
11
0
23
24
3
0
24
25
65
0
25
26
4
0
26
27
3
0
27
28
11
0
28
29
2
0
29
30
13
1
30
31
4
0
31
32
5
0
...
...
...
...
9999970
9999971
11
0
9999971
9999972
7
0
9999972
9999973
4
0
9999973
9999974
14
0
9999974
9999975
3
0
9999975
9999976
8
0
9999976
9999977
7
0
9999977
9999978
7
0
9999978
9999979
4
0
9999979
9999980
7
0
9999980
9999981
13
0
9999981
9999982
18
0
9999982
9999983
2
0
9999983
9999984
5
0
9999984
9999985
6
0
9999985
9999986
2
0
9999986
9999987
6
0
9999987
9999988
5
0
9999988
9999989
14
1
9999989
9999990
11
0
9999990
9999991
5
0
9999991
9999992
5
0
9999992
9999993
2
0
9999993
9999994
5
0
9999994
9999995
20
0
9999995
9999996
9
0
9999996
9999997
4
0
9999997
9999998
8
0
9999998
9999999
6
0
9999999
10000000
7
0
9999998 rows × 3 columns
In [84]:
pd.Series(range(1,101))
Out[84]:
0 1
1 2
2 3
3 4
4 5
5 6
6 7
7 8
8 9
9 10
10 11
11 12
12 13
13 14
14 15
15 16
16 17
17 18
18 19
19 20
20 21
21 22
22 23
23 24
24 25
25 26
26 27
27 28
28 29
29 30
...
70 71
71 72
72 73
73 74
74 75
75 76
76 77
77 78
78 79
79 80
80 81
81 82
82 83
83 84
84 85
85 86
86 87
87 88
88 89
89 90
90 91
91 92
92 93
93 94
94 95
95 96
96 97
97 98
98 99
99 100
dtype: int32
In [85]:
s=pd.Series(range(1,101))
In [86]:
fraud.drop(fraud.index[[s]])
Out[86]:
custID
creditLine
fraudRisk
0
1
2
0
101
102
10
0
102
103
28
0
103
104
22
0
104
105
10
0
105
106
4
0
106
107
11
0
107
108
6
0
108
109
4
0
109
110
4
0
110
111
10
1
111
112
4
0
112
113
3
0
113
114
5
0
114
115
4
0
115
116
2
0
116
117
4
0
117
118
13
0
118
119
9
0
119
120
8
0
120
121
3
0
121
122
11
0
122
123
5
0
123
124
4
0
124
125
13
0
125
126
4
0
126
127
5
0
127
128
6
0
128
129
3
0
129
130
2
0
...
...
...
...
9999970
9999971
11
0
9999971
9999972
7
0
9999972
9999973
4
0
9999973
9999974
14
0
9999974
9999975
3
0
9999975
9999976
8
0
9999976
9999977
7
0
9999977
9999978
7
0
9999978
9999979
4
0
9999979
9999980
7
0
9999980
9999981
13
0
9999981
9999982
18
0
9999982
9999983
2
0
9999983
9999984
5
0
9999984
9999985
6
0
9999985
9999986
2
0
9999986
9999987
6
0
9999987
9999988
5
0
9999988
9999989
14
1
9999989
9999990
11
0
9999990
9999991
5
0
9999991
9999992
5
0
9999992
9999993
2
0
9999993
9999994
5
0
9999994
9999995
20
0
9999995
9999996
9
0
9999996
9999997
4
0
9999997
9999998
8
0
9999998
9999999
6
0
9999999
10000000
7
0
9999900 rows × 3 columns
In [87]:
fraud.query('fraudRisk==0').head(6)
Out[87]:
custID
creditLine
fraudRisk
0
1
2
0
1
2
18
0
2
3
16
0
3
4
5
0
4
5
7
0
5
6
13
0
In [88]:
fraud.loc[fraud.fraudRisk == 0,'creditLine'] = -1; fraud
Out[88]:
custID
creditLine
fraudRisk
0
1
-1
0
1
2
-1
0
2
3
-1
0
3
4
-1
0
4
5
-1
0
5
6
-1
0
6
7
-1
0
7
8
-1
0
8
9
-1
0
9
10
-1
0
10
11
-1
0
11
12
-1
0
12
13
-1
0
13
14
-1
0
14
15
-1
0
15
16
-1
0
16
17
-1
0
17
18
-1
0
18
19
-1
0
19
20
-1
0
20
21
-1
0
21
22
-1
0
22
23
-1
0
23
24
-1
0
24
25
-1
0
25
26
-1
0
26
27
-1
0
27
28
-1
0
28
29
-1
0
29
30
13
1
...
...
...
...
9999970
9999971
-1
0
9999971
9999972
-1
0
9999972
9999973
-1
0
9999973
9999974
-1
0
9999974
9999975
-1
0
9999975
9999976
-1
0
9999976
9999977
-1
0
9999977
9999978
-1
0
9999978
9999979
-1
0
9999979
9999980
-1
0
9999980
9999981
-1
0
9999981
9999982
-1
0
9999982
9999983
-1
0
9999983
9999984
-1
0
9999984
9999985
-1
0
9999985
9999986
-1
0
9999986
9999987
-1
0
9999987
9999988
-1
0
9999988
9999989
14
1
9999989
9999990
-1
0
9999990
9999991
-1
0
9999991
9999992
-1
0
9999992
9999993
-1
0
9999993
9999994
-1
0
9999994
9999995
-1
0
9999995
9999996
-1
0
9999996
9999997
-1
0
9999997
9999998
-1
0
9999998
9999999
-1
0
9999999
10000000
-1
0
10000000 rows × 3 columns
In [89]:
fraud2.head()
Out[89]:
custID
creditLine
fraudRisk
0
1
-1
0
1
2
-1
0
2
3
-1
0
3
4
-1
0
4
5
-1
0
In [90]:
fraud3.head()
Out[90]:
custID
gender
state
cardholder
balance
numTrans
numIntlTrans
creditLine
fraudRisk
0
1
1
35
1
3000
4
14
2
0
1
2
2
2
1
0
9
0
18
0
2
3
2
2
1
0
27
9
16
0
3
4
1
15
1
0
12
0
5
0
4
5
1
46
1
0
11
16
7
0
In [ ]:
Content source: decisionstats/pythonfordatascience
Similar notebooks: