In [1]:
import pandas as pd
In [2]:
import os as os
In [11]:
os.getcwd()
Out[11]:
'C:\\Users\\Dell\\Downloads'
In [5]:
os.chdir('C:\\Users\\Dell\\Downloads')
In [6]:
os.listdir()
Out[6]:
['140749_2017.pdf',
'2011-F01-0700-Rev4-MDDS.XLSX',
'20150817143155.pdf',
'20160111060911.pdf',
'20170214052225.pdf',
'861415_10151432783238421_2124270505_o (1).jpg',
'861415_10151432783238421_2124270505_o.jpg',
'AirPassengers.csv',
'ajayo.jpg',
'Alison Python Invoice - Sheet1.pdf',
'Alison SAS Invoice - Sheet1.pdf',
'Allison Interview Jones Invoice - Sheet1.pdf',
'Anaconda3-4.2.0-Windows-x86_64.exe',
'apachehttpd.exe',
'April invoice adaptive analytics - Sheet1.pdf',
'Assignment14_BusinessAnalytics (1).docx',
'Assignment14_BusinessAnalytics.docx',
'Assignment15_BusinessAnalytics.docx',
'Assignment16_BusinessAnalytics (1).docx',
'Assignment16_BusinessAnalytics (2).docx',
'Assignment16_BusinessAnalytics.docx',
'aug ust 2008.JPG',
'avast_free_antivirus_setup_online.exe',
'avinash_ltv.zip',
'BigDiamonds.csv',
'BigDiamonds.csv (1).zip',
'BigDiamonds.csv (2)',
'BigDiamonds.csv (2).zip',
'BigDiamonds.csv.zip',
'Boston (1).csv',
'Boston.csv',
'CAM- Ajay Ohri (1).pdf',
'CAM- Ajay Ohri.pdf',
'camtasia.exe',
'ccFraud.csv',
'Certificate of Incorporation - U74999DL2015PTC282030 (26 June 2015).pdf',
'CHAP1-6PythonforRUsersAnapproachforDataScience.docx',
'chapter+3+_+spark.html',
'chi+square+test.ipynb',
'chromeinstall-8u111.exe',
'Cisco_WebEx_Add-On.exe',
'class2.csv',
'Collabera Invoice (1).pdf',
'Collabera Invoice.pdf',
'Collectcent Invoice.pdf',
'college degrees.pdf',
'DAP 1.pdf',
'DAP 1.pptx',
'DAP 6 RDBMS and SQL.pdf',
'DAP 6 RDBMS and SQL.pptx',
'data+exploration.ipynb',
'data+manipulation.ipynb',
'data1.csv',
'datasets.csv',
'Decision Trees.pdf',
'DecisionStatsOfferLetter.docx',
'DecisionStatsRelievingLetter.docx',
'descriptive+stats+in+Python.ipynb',
'desktop.ini',
'Diamond (1).csv',
'Diamond (2).csv',
'Diamond (3).csv',
'Diamond (4).csv',
'Diamond (5).csv',
'Diamond (6).csv',
'Diamond.csv',
'DropboxInstaller.exe',
'edb_npgsql.exe',
'edb_pgjdbc.exe',
'edb_psqlodbc.exe',
'edb_psqlodbc.exe-20170203172812',
'edb_psqlodbc.exe-20170307203617',
'final invoice edureka - Sheet1.pdf',
'FinalPythonforRUsersAnapproachforDataScience (1).docx',
'FinalPythonforRUsersAnapproachforDataScience (2).docx',
'FinalPythonforRUsersAnapproachforDataScience (3).docx',
'FinalPythonforRUsersAnapproachforDataScience (4).docx',
'FinalPythonforRUsersAnapproachforDataScience.docx',
'final_webinar (1).pdf',
'final_webinar.pdf',
'Git-2.11.0-64-bit.exe',
'Git-2.12.0-64-bit.exe',
'GitHubSetup (1).exe',
'GitHubSetup (2).exe',
'GitHubSetup.exe',
'GOMAUDIOGLOBALSETUP.EXE',
'Hdma.csv',
'Hedonic.csv',
'HP Downloads',
'HPSupportSolutionsFramework-12.5.32.203.exe',
'image.png',
'IMS PROSCHOOL Workshop.pptx.pdf',
'IMS PROSCHOOL Workshop.pptx.pptx',
'Introduction to SAS (1).pdf',
'Introduction to SAS Part 1 (1).pdf',
'Introduction to SAS Part 1.pdf',
'Introduction to SAS.pdf',
'Invoice for Digital Vidya.pdf',
'Invoice for Weekendr.pdf',
'Invoice format - Ajay Ohri CONTATA (1).xls',
'Invoice format - Ajay Ohri CONTATA.xls',
'invoice rapid miner.pdf',
'iris2 (1).ipynb',
'iris2 (2).ipynb',
'iris2.ipynb',
'January invoice Indicus .pdf',
'June AV Invoice - Sheet1.pdf',
'Lecture 6 - KNN & Naive Bayes.ppt',
'Local Disk (C) - Shortcut.lnk',
'logistic regression - script for ppt.R',
'logistic_regression_-_script_for_ppt.html',
'March invoice Indicus - Sheet1.pdf',
'mongodb-win32-x86_64-2008plus-ssl-3.4.2-signed.msi',
'mongodb-win32-x86_64-3.4.2-signed.msi',
'mtcarslm.R',
'nltk.ipynb',
'notebook-Copy1.html',
'Offer Letter - Ajay Ohri (1).pdf',
'Offer Letter - Ajay Ohri.pdf',
'Other Data Mining Methods (1).pdf',
'Other Data Mining Methods.pdf',
'output1 (1).xls',
'output1 (2).xls',
'output1.xls',
'pandas+11.ipynb',
'pandas+analysis+1.ipynb',
'passport image.pdf',
'Pawconinvoice2016.pdf',
'Pawconinvoice2017 (1).pdf',
'Pawconinvoice2017 (2).pdf',
'Pawconinvoice2017 (3).pdf',
'Pawconinvoice2017.pdf',
'Payslip Feb 2016 - Sheet1.pdf',
'Payslip Feb 2016.pdf',
'Payslip Format Decisionstats - Sheet1.pdf',
'Payslip Jan 2016 - Sheet1.pdf',
'Payslip Jan 2016.pdf',
'Payslip March 2016 - Sheet1.pdf',
'Payslip March 2016.pdf',
'pgd.csv',
'postgresql-9.6.1-1-windows-x64.exe',
'Program 1-results.rtf',
'protein.csv',
'python+with+postgres (1).ipynb',
'python+with+postgres.ipynb',
'Python.docx',
'R-3.3.2-win.exe',
'R-3.3.3-win.exe',
'RCertificationExam.pdf',
'reg+model.ipynb',
'Revision - Business Analytics (1).pdf',
'Revision - Business Analytics.pdf',
'RidingMowers.csv',
'rsconnect',
'RStudio-1.0.136.exe',
'Salary Slip, Feb 2016.pdf',
'Salary Slip, Jan 2016.pdf',
'Salary Slip, March 2016 (1).pdf',
'Salary Slip, March 2016 (2).pdf',
'Salary Slip, March 2016.pdf',
'sales-of-shampoo-over-a-three-ye.csv',
'SAS part 2.pdf',
'SAS Part 3.pdf',
'sas-university-edition-107140.pdf',
'Scan0095.pdf',
'Scanned Invoice for Collabera.pdf',
'Screenshot 2017-01-23 12.36.55.png',
'September invoice adaptive analytics - Sheet1.pdf',
'Sollers January.pdf',
'sqlalchemy.ipynb',
'stackoverflow-dump-analysis.html',
'Sunstone.pdf',
'Tableau.pdf',
'TableauPublicDesktop-64bit-10-1-3.exe',
'TableauPublicDesktop-64bit-10-1-4.exe',
'telecom.csv',
'TelecomServiceProviderCaseStudy.pdf',
'Text Mining (1).pdf',
'Text Mining.pdf',
'third.sas7bdat',
'Time Series Forecasting (1).pdf',
'Time Series Forecasting.pdf',
'ts.html',
'ts.R',
'Unconfirmed 373974.crdownload',
'Unconfirmed 376562.crdownload',
'Unconfirmed 376991.crdownload',
'Unconfirmed 930917.crdownload',
'Unconfirmed 950045.crdownload',
'unvbasicvapp__9411008__ova__en__sp0__1.ova.crdownload',
'VirtualBox-5.1.8-111374-Win (1).exe',
'VirtualBox-5.1.8-111374-Win.exe',
'Webinar for Business Analytics.pdf',
'WhatsApp Image 2017-02-18 at 08.42.55 (1).jpeg',
'WhatsApp Image 2017-02-18 at 08.42.55.jpeg']
In [8]:
fraud=pd.read_csv("ccFraud.csv")
In [9]:
fraud2=fraud
In [10]:
fraud3=fraud.copy()
In [12]:
type(fraud)
Out[12]:
pandas.core.frame.DataFrame
In [13]:
fraud.columns
Out[13]:
Index(['custID', 'gender', 'state', 'cardholder', 'balance', 'numTrans',
'numIntlTrans', 'creditLine', 'fraudRisk'],
dtype='object')
In [29]:
fraud.dtypes
Out[29]:
custID int64
gender int64
state int64
cardholder int64
balance int64
numTrans int64
numIntlTrans int64
creditLine int64
fraudRisk int64
dtype: object
In [26]:
fraud.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 9 columns):
custID int64
gender int64
state int64
cardholder int64
balance int64
numTrans int64
numIntlTrans int64
creditLine int64
fraudRisk int64
dtypes: int64(9)
memory usage: 686.6 MB
In [23]:
fraud.memory_usage()
Out[23]:
Index 80
custID 80000000
gender 80000000
state 80000000
cardholder 80000000
balance 80000000
numTrans 80000000
numIntlTrans 80000000
creditLine 80000000
fraudRisk 80000000
dtype: int64
In [18]:
fraud.memory_usage(index=True).sum()
Out[18]:
720000080
In [20]:
fraud.shape
Out[20]:
(10000000, 9)
In [30]:
len(fraud)
Out[30]:
10000000
In [32]:
fraud.shape[1]
Out[32]:
9
In [33]:
len(fraud.columns)
Out[33]:
9
In [34]:
fraud.head()
Out[34]:
custID
gender
state
cardholder
balance
numTrans
numIntlTrans
creditLine
fraudRisk
0
1
1
35
1
3000
4
14
2
0
1
2
2
2
1
0
9
0
18
0
2
3
2
2
1
0
27
9
16
0
3
4
1
15
1
0
12
0
5
0
4
5
1
46
1
0
11
16
7
0
In [37]:
fraud.head(10)
Out[37]:
custID
gender
state
cardholder
balance
numTrans
numIntlTrans
creditLine
fraudRisk
0
1
1
35
1
3000
4
14
2
0
1
2
2
2
1
0
9
0
18
0
2
3
2
2
1
0
27
9
16
0
3
4
1
15
1
0
12
0
5
0
4
5
1
46
1
0
11
16
7
0
5
6
2
44
2
5546
21
0
13
0
6
7
1
3
1
2000
41
0
1
0
7
8
1
10
1
6016
20
3
6
0
8
9
2
32
1
2428
4
10
22
0
9
10
1
23
1
0
18
56
5
0
In [41]:
fraud.tail()
Out[41]:
custID
gender
state
cardholder
balance
numTrans
numIntlTrans
creditLine
fraudRisk
9999995
9999996
1
37
1
0
10
0
9
0
9999996
9999997
1
16
1
0
33
2
4
0
9999997
9999998
1
24
1
9000
38
0
8
0
9999998
9999999
1
28
1
7000
20
19
6
0
9999999
10000000
1
23
1
0
13
0
7
0
In [39]:
fraud.gender.head()
Out[39]:
0 1
1 2
2 2
3 1
4 1
Name: gender, dtype: int64
In [40]:
fraud.head().gender
Out[40]:
0 1
1 2
2 2
3 1
4 1
Name: gender, dtype: int64
In [42]:
fraud['gender'].head()
Out[42]:
0 1
1 2
2 2
3 1
4 1
Name: gender, dtype: int64
In [43]:
fraud[['gender','state','fraudRisk']].head()
Out[43]:
gender
state
fraudRisk
0
1
35
0
1
2
2
0
2
2
2
0
3
1
15
0
4
1
46
0
In [52]:
fraud.iloc[:,[1,2,8]].head()
Out[52]:
gender
state
fraudRisk
0
1
35
0
1
2
2
0
2
2
2
0
3
1
15
0
4
1
46
0
In [53]:
fraud.iloc[20:30,[1,2,8]]
Out[53]:
gender
state
fraudRisk
20
1
39
0
21
1
34
0
22
1
5
0
23
2
21
0
24
1
25
0
25
2
29
0
26
1
38
0
27
1
9
0
28
2
20
0
29
2
49
1
In [54]:
fraud.ix[20:30]
Out[54]:
custID
gender
state
cardholder
balance
numTrans
numIntlTrans
creditLine
fraudRisk
20
21
1
39
1
4000
24
0
3
0
21
22
1
34
1
0
22
0
3
0
22
23
1
5
1
0
7
0
11
0
23
24
2
21
1
0
15
0
3
0
24
25
1
25
1
0
12
0
65
0
25
26
2
29
1
5000
4
9
4
0
26
27
1
38
1
4000
21
5
3
0
27
28
1
9
1
12000
20
0
11
0
28
29
2
20
1
0
19
0
2
0
29
30
2
49
1
5192
84
0
13
1
30
31
1
29
1
0
23
0
4
0
In [57]:
fraud[['gender','state','fraudRisk']].ix[20:30]
Out[57]:
gender
state
fraudRisk
20
1
39
0
21
1
34
0
22
1
5
0
23
2
21
0
24
1
25
0
25
2
29
0
26
1
38
0
27
1
9
0
28
2
20
0
29
2
49
1
30
1
29
0
In [58]:
import numpy as np
In [59]:
np.random.choice(100,5)
Out[59]:
array([50, 83, 66, 90, 11])
In [60]:
np.random.choice(len(fraud),0.000001*len(fraud))
C:\Users\Dell\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
if __name__ == '__main__':
Out[60]:
array([2148721, 5783663, 4642367, 1808583, 3507214, 9673216, 7101634,
2846151, 5954489, 7471532])
In [61]:
b=np.random.choice(len(fraud),0.000001*len(fraud))
C:\Users\Dell\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
if __name__ == '__main__':
In [62]:
fraud.ix[b]
Out[62]:
custID
gender
state
cardholder
balance
numTrans
numIntlTrans
creditLine
fraudRisk
802467
802468
2
44
1
11657
54
0
24
0
6215623
6215624
1
4
1
803
12
3
19
0
2763901
2763902
2
36
1
0
7
3
2
0
996086
996087
1
10
1
8711
10
0
10
0
2826430
2826431
1
44
1
3834
27
29
5
0
1298626
1298627
1
41
1
6000
10
0
5
0
4450853
4450854
2
49
1
9515
100
0
11
0
7596059
7596060
1
30
1
6764
16
0
6
0
2694049
2694050
2
2
1
2071
43
0
12
0
9762815
9762816
1
48
1
0
69
0
9
0
In [63]:
d=fraud.ix[b]
In [64]:
d.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 802467 to 9762815
Data columns (total 9 columns):
custID 10 non-null int64
gender 10 non-null int64
state 10 non-null int64
cardholder 10 non-null int64
balance 10 non-null int64
numTrans 10 non-null int64
numIntlTrans 10 non-null int64
creditLine 10 non-null int64
fraudRisk 10 non-null int64
dtypes: int64(9)
memory usage: 800.0 bytes
In [65]:
del d
In [67]:
del fraud['gender']
In [68]:
fraud.head()
Out[68]:
custID
state
cardholder
balance
numTrans
numIntlTrans
creditLine
fraudRisk
0
1
35
1
3000
4
14
2
0
1
2
2
1
0
9
0
18
0
2
3
2
1
0
27
9
16
0
3
4
15
1
0
12
0
5
0
4
5
46
1
0
11
16
7
0
In [69]:
fraud.drop(fraud.columns[[1, 3]], axis=1, inplace=True)
In [70]:
fraud.head()
Out[70]:
custID
cardholder
numTrans
numIntlTrans
creditLine
fraudRisk
0
1
1
4
14
2
0
1
2
1
9
0
18
0
2
3
1
27
9
16
0
3
4
1
12
0
5
0
4
5
1
11
16
7
0
In [71]:
fraud.drop(fraud.index[[1,2,3]])
Out[71]:
custID
cardholder
numTrans
numIntlTrans
creditLine
fraudRisk
0
1
1
4
14
2
0
4
5
1
11
16
7
0
5
6
2
21
0
13
0
6
7
1
41
0
1
0
7
8
1
20
3
6
0
8
9
1
4
10
22
0
9
10
1
18
56
5
0
10
11
1
54
0
4
0
11
12
1
20
0
2
0
12
13
1
45
2
4
0
13
14
1
41
3
8
0
14
15
1
60
0
17
0
15
16
1
22
0
5
0
16
17
1
20
0
13
0
17
18
1
13
6
8
0
18
19
1
20
2
8
0
19
20
1
21
10
8
0
20
21
1
24
0
3
0
21
22
1
22
0
3
0
22
23
1
7
0
11
0
23
24
1
15
0
3
0
24
25
1
12
0
65
0
25
26
1
4
9
4
0
26
27
1
21
5
3
0
27
28
1
20
0
11
0
28
29
1
19
0
2
0
29
30
1
84
0
13
1
30
31
1
23
0
4
0
31
32
1
8
0
5
0
32
33
1
49
0
10
0
...
...
...
...
...
...
...
9999970
9999971
1
12
0
11
0
9999971
9999972
1
31
3
7
0
9999972
9999973
1
69
0
4
0
9999973
9999974
2
36
31
14
0
9999974
9999975
1
18
0
3
0
9999975
9999976
1
23
0
8
0
9999976
9999977
1
14
0
7
0
9999977
9999978
1
24
0
7
0
9999978
9999979
1
12
0
4
0
9999979
9999980
1
5
3
7
0
9999980
9999981
1
54
0
13
0
9999981
9999982
1
23
26
18
0
9999982
9999983
1
14
0
2
0
9999983
9999984
1
2
0
5
0
9999984
9999985
1
30
0
6
0
9999985
9999986
2
4
0
2
0
9999986
9999987
1
59
0
6
0
9999987
9999988
1
46
0
5
0
9999988
9999989
1
72
0
14
1
9999989
9999990
1
17
7
11
0
9999990
9999991
2
8
0
5
0
9999991
9999992
1
6
0
5
0
9999992
9999993
1
7
0
2
0
9999993
9999994
1
6
0
5
0
9999994
9999995
2
3
0
20
0
9999995
9999996
1
10
0
9
0
9999996
9999997
1
33
2
4
0
9999997
9999998
1
38
0
8
0
9999998
9999999
1
20
19
6
0
9999999
10000000
1
13
0
7
0
9999997 rows × 6 columns
In [72]:
fraud.head()
Out[72]:
custID
cardholder
numTrans
numIntlTrans
creditLine
fraudRisk
0
1
1
4
14
2
0
1
2
1
9
0
18
0
2
3
1
27
9
16
0
3
4
1
12
0
5
0
4
5
1
11
16
7
0
In [73]:
fraud2.head()
Out[73]:
custID
cardholder
numTrans
numIntlTrans
creditLine
fraudRisk
0
1
1
4
14
2
0
1
2
1
9
0
18
0
2
3
1
27
9
16
0
3
4
1
12
0
5
0
4
5
1
11
16
7
0
In [74]:
fraud3.head()
Out[74]:
custID
gender
state
cardholder
balance
numTrans
numIntlTrans
creditLine
fraudRisk
0
1
1
35
1
3000
4
14
2
0
1
2
2
2
1
0
9
0
18
0
2
3
2
2
1
0
27
9
16
0
3
4
1
15
1
0
12
0
5
0
4
5
1
46
1
0
11
16
7
0
In [78]:
fraud3.shape
Out[78]:
(10000000, 9)
In [79]:
s=pd.Series(range(1,101))
In [81]:
type(s)
Out[81]:
pandas.core.series.Series
In [ ]:
Content source: decisionstats/pythonfordatascience
Similar notebooks: