In [47]:
import pandas as pd

Data Import #Hint wil take some time


In [48]:
import os as os

In [49]:
os.getcwd()


Out[49]:
'C:\\Users\\Dell\\Downloads'

In [50]:
os.chdir('C:\\Users\\Dell\\Downloads')

In [51]:
os.listdir()


Out[51]:
['140749_2017.pdf',
 '2011-F01-0700-Rev4-MDDS.XLSX',
 '20150817143155.pdf',
 '20160111060911.pdf',
 '20170214052225.pdf',
 '861415_10151432783238421_2124270505_o (1).jpg',
 '861415_10151432783238421_2124270505_o.jpg',
 'AirPassengers.csv',
 'ajayo.jpg',
 'Alison Python  Invoice   - Sheet1.pdf',
 'Alison SAS  Invoice   - Sheet1.pdf',
 'Allison Interview Jones Invoice   - Sheet1.pdf',
 'Anaconda3-4.2.0-Windows-x86_64.exe',
 'apachehttpd.exe',
 'April invoice adaptive analytics   - Sheet1.pdf',
 'Assignment14_BusinessAnalytics (1).docx',
 'Assignment14_BusinessAnalytics.docx',
 'Assignment15_BusinessAnalytics.docx',
 'Assignment16_BusinessAnalytics (1).docx',
 'Assignment16_BusinessAnalytics (2).docx',
 'Assignment16_BusinessAnalytics.docx',
 'aug ust 2008.JPG',
 'avast_free_antivirus_setup_online.exe',
 'avinash_ltv.zip',
 'BigDiamonds.csv',
 'BigDiamonds.csv (1).zip',
 'BigDiamonds.csv (2)',
 'BigDiamonds.csv (2).zip',
 'BigDiamonds.csv.zip',
 'Boston (1).csv',
 'Boston.csv',
 'CAM- Ajay Ohri (1).pdf',
 'CAM- Ajay Ohri.pdf',
 'camtasia.exe',
 'ccFraud.csv',
 'Certificate of Incorporation - U74999DL2015PTC282030 (26 June 2015).pdf',
 'CHAP1-6PythonforRUsersAnapproachforDataScience.docx',
 'chapter+3+_+spark.html',
 'chi+square+test.ipynb',
 'chromeinstall-8u111.exe',
 'Cisco_WebEx_Add-On.exe',
 'class2.csv',
 'Collabera Invoice (1).pdf',
 'Collabera Invoice.pdf',
 'Collectcent Invoice.pdf',
 'college degrees.pdf',
 'DAP 1.pdf',
 'DAP 1.pptx',
 'DAP 6 RDBMS and SQL.pdf',
 'DAP 6 RDBMS and SQL.pptx',
 'data+exploration.ipynb',
 'data+manipulation.ipynb',
 'data1.csv',
 'datasets.csv',
 'Decision Trees.pdf',
 'DecisionStatsOfferLetter.docx',
 'DecisionStatsRelievingLetter.docx',
 'descriptive+stats+in+Python.ipynb',
 'desktop.ini',
 'Diamond (1).csv',
 'Diamond (2).csv',
 'Diamond (3).csv',
 'Diamond (4).csv',
 'Diamond (5).csv',
 'Diamond (6).csv',
 'Diamond.csv',
 'DropboxInstaller.exe',
 'edb_npgsql.exe',
 'edb_pgjdbc.exe',
 'edb_psqlodbc.exe',
 'edb_psqlodbc.exe-20170203172812',
 'edb_psqlodbc.exe-20170307203617',
 'final invoice edureka  - Sheet1.pdf',
 'FinalPythonforRUsersAnapproachforDataScience (1).docx',
 'FinalPythonforRUsersAnapproachforDataScience (2).docx',
 'FinalPythonforRUsersAnapproachforDataScience (3).docx',
 'FinalPythonforRUsersAnapproachforDataScience (4).docx',
 'FinalPythonforRUsersAnapproachforDataScience.docx',
 'final_webinar (1).pdf',
 'final_webinar.pdf',
 'Git-2.11.0-64-bit.exe',
 'Git-2.12.0-64-bit.exe',
 'GitHubSetup (1).exe',
 'GitHubSetup (2).exe',
 'GitHubSetup.exe',
 'GOMAUDIOGLOBALSETUP.EXE',
 'HP Downloads',
 'HPSupportSolutionsFramework-12.5.32.203.exe',
 'image.png',
 'IMS PROSCHOOL Workshop.pptx.pdf',
 'IMS PROSCHOOL Workshop.pptx.pptx',
 'Introduction to SAS (1).pdf',
 'Introduction to SAS Part 1 (1).pdf',
 'Introduction to SAS Part 1.pdf',
 'Introduction to SAS.pdf',
 'Invoice for Digital Vidya.pdf',
 'Invoice for Weekendr.pdf',
 'Invoice format - Ajay Ohri CONTATA (1).xls',
 'Invoice format - Ajay Ohri CONTATA.xls',
 'invoice rapid miner.pdf',
 'iris2 (1).ipynb',
 'iris2 (2).ipynb',
 'iris2.ipynb',
 'January invoice Indicus  .pdf',
 'June AV   Invoice   - Sheet1.pdf',
 'Lecture 6 - KNN & Naive Bayes.ppt',
 'Local Disk (C) - Shortcut.lnk',
 'logistic regression - script for ppt.R',
 'logistic_regression_-_script_for_ppt.html',
 'March invoice Indicus   - Sheet1.pdf',
 'mongodb-win32-x86_64-2008plus-ssl-3.4.2-signed.msi',
 'mongodb-win32-x86_64-3.4.2-signed.msi',
 'mtcarslm.R',
 'nltk.ipynb',
 'notebook-Copy1.html',
 'Offer Letter - Ajay Ohri (1).pdf',
 'Offer Letter - Ajay Ohri.pdf',
 'Other Data Mining  Methods (1).pdf',
 'Other Data Mining  Methods.pdf',
 'output1 (1).xls',
 'output1 (2).xls',
 'output1.xls',
 'passport image.pdf',
 'Pawconinvoice2016.pdf',
 'Pawconinvoice2017 (1).pdf',
 'Pawconinvoice2017 (2).pdf',
 'Pawconinvoice2017 (3).pdf',
 'Pawconinvoice2017.pdf',
 'Payslip Feb 2016 - Sheet1.pdf',
 'Payslip Feb 2016.pdf',
 'Payslip Format Decisionstats - Sheet1.pdf',
 'Payslip Jan 2016 - Sheet1.pdf',
 'Payslip Jan 2016.pdf',
 'Payslip March 2016 - Sheet1.pdf',
 'Payslip March 2016.pdf',
 'pgd.csv',
 'postgresql-9.6.1-1-windows-x64.exe',
 'Program 1-results.rtf',
 'protein.csv',
 'python+with+postgres (1).ipynb',
 'python+with+postgres.ipynb',
 'R-3.3.2-win.exe',
 'R-3.3.3-win.exe',
 'RCertificationExam.pdf',
 'reg+model.ipynb',
 'Revision -  Business Analytics (1).pdf',
 'Revision -  Business Analytics.pdf',
 'RidingMowers.csv',
 'rsconnect',
 'RStudio-1.0.136.exe',
 'Salary Slip, Feb 2016.pdf',
 'Salary Slip, Jan 2016.pdf',
 'Salary Slip, March 2016 (1).pdf',
 'Salary Slip, March 2016 (2).pdf',
 'Salary Slip, March 2016.pdf',
 'sales-of-shampoo-over-a-three-ye.csv',
 'SAS part 2.pdf',
 'SAS Part 3.pdf',
 'sas-university-edition-107140.pdf',
 'Scan0095.pdf',
 'Scanned Invoice for Collabera.pdf',
 'Screenshot 2017-01-23 12.36.55.png',
 'September invoice adaptive analytics   - Sheet1.pdf',
 'Sollers January.pdf',
 'sqlalchemy.ipynb',
 'stackoverflow-dump-analysis.html',
 'Sunstone.pdf',
 'Tableau.pdf',
 'TableauPublicDesktop-64bit-10-1-3.exe',
 'TableauPublicDesktop-64bit-10-1-4.exe',
 'telecom.csv',
 'TelecomServiceProviderCaseStudy.pdf',
 'Text Mining (1).pdf',
 'Text Mining.pdf',
 'third.sas7bdat',
 'Time Series  Forecasting (1).pdf',
 'Time Series  Forecasting.pdf',
 'ts.html',
 'ts.R',
 'Unconfirmed 373974.crdownload',
 'Unconfirmed 376562.crdownload',
 'Unconfirmed 376991.crdownload',
 'Unconfirmed 930917.crdownload',
 'Unconfirmed 950045.crdownload',
 'unvbasicvapp__9411008__ova__en__sp0__1.ova.crdownload',
 'VirtualBox-5.1.8-111374-Win (1).exe',
 'VirtualBox-5.1.8-111374-Win.exe',
 'Webinar for Business Analytics.pdf',
 'WhatsApp Image 2017-02-18 at 08.42.55 (1).jpeg',
 'WhatsApp Image 2017-02-18 at 08.42.55.jpeg']

In [52]:
fraud=pd.read_csv("ccFraud.csv")

In [53]:
fraud2=fraud

In [54]:
fraud3=fraud.copy()

In [55]:
type(fraud)


Out[55]:
pandas.core.frame.DataFrame

In [56]:
fraud.columns


Out[56]:
Index(['custID', 'gender', 'state', 'cardholder', 'balance', 'numTrans',
       'numIntlTrans', 'creditLine', 'fraudRisk'],
      dtype='object')

In [57]:
fraud.dtypes


Out[57]:
custID          int64
gender          int64
state           int64
cardholder      int64
balance         int64
numTrans        int64
numIntlTrans    int64
creditLine      int64
fraudRisk       int64
dtype: object

In [58]:
fraud.shape


Out[58]:
(10000000, 9)

In [59]:
fraud.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 9 columns):
custID          int64
gender          int64
state           int64
cardholder      int64
balance         int64
numTrans        int64
numIntlTrans    int64
creditLine      int64
fraudRisk       int64
dtypes: int64(9)
memory usage: 686.6 MB

In [60]:
len(fraud)


Out[60]:
10000000

In [61]:
len(fraud.columns)


Out[61]:
9

In [62]:
fraud.head()


Out[62]:
custID gender state cardholder balance numTrans numIntlTrans creditLine fraudRisk
0 1 1 35 1 3000 4 14 2 0
1 2 2 2 1 0 9 0 18 0
2 3 2 2 1 0 27 9 16 0
3 4 1 15 1 0 12 0 5 0
4 5 1 46 1 0 11 16 7 0

In [63]:
fraud.tail()


Out[63]:
custID gender state cardholder balance numTrans numIntlTrans creditLine fraudRisk
9999995 9999996 1 37 1 0 10 0 9 0
9999996 9999997 1 16 1 0 33 2 4 0
9999997 9999998 1 24 1 9000 38 0 8 0
9999998 9999999 1 28 1 7000 20 19 6 0
9999999 10000000 1 23 1 0 13 0 7 0

In [64]:
fraud.ix[0:10]


Out[64]:
custID gender state cardholder balance numTrans numIntlTrans creditLine fraudRisk
0 1 1 35 1 3000 4 14 2 0
1 2 2 2 1 0 9 0 18 0
2 3 2 2 1 0 27 9 16 0
3 4 1 15 1 0 12 0 5 0
4 5 1 46 1 0 11 16 7 0
5 6 2 44 2 5546 21 0 13 0
6 7 1 3 1 2000 41 0 1 0
7 8 1 10 1 6016 20 3 6 0
8 9 2 32 1 2428 4 10 22 0
9 10 1 23 1 0 18 56 5 0
10 11 1 46 1 4601 54 0 4 0

In [65]:
fraud[['gender','state','cardholder']]


Out[65]:
gender state cardholder
0 1 35 1
1 2 2 1
2 2 2 1
3 1 15 1
4 1 46 1
5 2 44 2
6 1 3 1
7 1 10 1
8 2 32 1
9 1 23 1
10 1 46 1
11 1 10 1
12 1 6 1
13 2 38 1
14 1 27 1
15 1 44 1
16 2 18 1
17 1 35 1
18 1 5 1
19 2 31 1
20 1 39 1
21 1 34 1
22 1 5 1
23 2 21 1
24 1 25 1
25 2 29 1
26 1 38 1
27 1 9 1
28 2 20 1
29 2 49 1
... ... ... ...
9999970 1 10 1
9999971 2 40 1
9999972 1 46 1
9999973 1 10 2
9999974 1 25 1
9999975 2 48 1
9999976 2 4 1
9999977 1 35 1
9999978 1 44 1
9999979 1 6 1
9999980 1 10 1
9999981 1 33 1
9999982 1 44 1
9999983 2 13 1
9999984 1 39 1
9999985 2 45 2
9999986 1 23 1
9999987 2 24 1
9999988 1 18 1
9999989 2 4 1
9999990 1 16 2
9999991 2 36 1
9999992 1 38 1
9999993 2 43 1
9999994 1 16 2
9999995 1 37 1
9999996 1 16 1
9999997 1 24 1
9999998 1 28 1
9999999 1 23 1

10000000 rows × 3 columns


In [66]:
fraud.iloc[:,1:3]


Out[66]:
gender state
0 1 35
1 2 2
2 2 2
3 1 15
4 1 46
5 2 44
6 1 3
7 1 10
8 2 32
9 1 23
10 1 46
11 1 10
12 1 6
13 2 38
14 1 27
15 1 44
16 2 18
17 1 35
18 1 5
19 2 31
20 1 39
21 1 34
22 1 5
23 2 21
24 1 25
25 2 29
26 1 38
27 1 9
28 2 20
29 2 49
... ... ...
9999970 1 10
9999971 2 40
9999972 1 46
9999973 1 10
9999974 1 25
9999975 2 48
9999976 2 4
9999977 1 35
9999978 1 44
9999979 1 6
9999980 1 10
9999981 1 33
9999982 1 44
9999983 2 13
9999984 1 39
9999985 2 45
9999986 1 23
9999987 2 24
9999988 1 18
9999989 2 4
9999990 1 16
9999991 2 36
9999992 1 38
9999993 2 43
9999994 1 16
9999995 1 37
9999996 1 16
9999997 1 24
9999998 1 28
9999999 1 23

10000000 rows × 2 columns


In [67]:
fraud.iloc[0:10,1:3]


Out[67]:
gender state
0 1 35
1 2 2
2 2 2
3 1 15
4 1 46
5 2 44
6 1 3
7 1 10
8 2 32
9 1 23

In [68]:
import numpy as np

In [69]:
np.random.choice(10,2)


Out[69]:
array([1, 0])

In [70]:
np.random.choice(len(fraud),0.000001*len(fraud))


C:\Users\Dell\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  if __name__ == '__main__':
Out[70]:
array([7100177, 4058088, 2302568, 9002606, 3792183,  888579, 6465822,
       3062360, 3663548, 5578048])

In [71]:
b=np.random.choice(len(fraud),0.000001*len(fraud))


C:\Users\Dell\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  if __name__ == '__main__':

In [72]:
fraud.ix[b]


Out[72]:
custID gender state cardholder balance numTrans numIntlTrans creditLine fraudRisk
7461768 7461769 1 21 1 3000 15 2 2 0
5089359 5089360 1 5 1 8000 80 35 7 1
8548436 8548437 1 5 1 5000 3 0 4 0
3997580 3997581 1 32 1 10000 47 1 9 0
1776937 1776938 1 43 1 3000 100 0 2 0
3590544 3590545 2 5 1 4000 52 15 3 0
1824320 1824321 1 35 1 1118 87 8 5 0
906910 906911 1 10 1 6000 6 0 5 0
2265882 2265883 1 5 1 1426 17 0 8 0
7983346 7983347 2 13 1 3000 43 8 2 0

In [73]:
d=fraud.ix[b]

In [74]:
d.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 7461768 to 7983346
Data columns (total 9 columns):
custID          10 non-null int64
gender          10 non-null int64
state           10 non-null int64
cardholder      10 non-null int64
balance         10 non-null int64
numTrans        10 non-null int64
numIntlTrans    10 non-null int64
creditLine      10 non-null int64
fraudRisk       10 non-null int64
dtypes: int64(9)
memory usage: 800.0 bytes

In [75]:
del d

In [76]:
del fraud['gender']

In [77]:
fraud.columns


Out[77]:
Index(['custID', 'state', 'cardholder', 'balance', 'numTrans', 'numIntlTrans',
       'creditLine', 'fraudRisk'],
      dtype='object')

In [78]:
fraud.drop(fraud.columns[[1, 5]], axis=1, inplace=True)

In [79]:
fraud.columns


Out[79]:
Index(['custID', 'cardholder', 'balance', 'numTrans', 'creditLine',
       'fraudRisk'],
      dtype='object')

In [80]:
fraud.drop(fraud.columns[[1,2,3]], axis=1, inplace=True)

In [81]:
fraud.columns


Out[81]:
Index(['custID', 'creditLine', 'fraudRisk'], dtype='object')

In [82]:
fraud.head()


Out[82]:
custID creditLine fraudRisk
0 1 2 0
1 2 18 0
2 3 16 0
3 4 5 0
4 5 7 0

In [83]:
fraud.drop(fraud.index[[1,3]])


Out[83]:
custID creditLine fraudRisk
0 1 2 0
2 3 16 0
4 5 7 0
5 6 13 0
6 7 1 0
7 8 6 0
8 9 22 0
9 10 5 0
10 11 4 0
11 12 2 0
12 13 4 0
13 14 8 0
14 15 17 0
15 16 5 0
16 17 13 0
17 18 8 0
18 19 8 0
19 20 8 0
20 21 3 0
21 22 3 0
22 23 11 0
23 24 3 0
24 25 65 0
25 26 4 0
26 27 3 0
27 28 11 0
28 29 2 0
29 30 13 1
30 31 4 0
31 32 5 0
... ... ... ...
9999970 9999971 11 0
9999971 9999972 7 0
9999972 9999973 4 0
9999973 9999974 14 0
9999974 9999975 3 0
9999975 9999976 8 0
9999976 9999977 7 0
9999977 9999978 7 0
9999978 9999979 4 0
9999979 9999980 7 0
9999980 9999981 13 0
9999981 9999982 18 0
9999982 9999983 2 0
9999983 9999984 5 0
9999984 9999985 6 0
9999985 9999986 2 0
9999986 9999987 6 0
9999987 9999988 5 0
9999988 9999989 14 1
9999989 9999990 11 0
9999990 9999991 5 0
9999991 9999992 5 0
9999992 9999993 2 0
9999993 9999994 5 0
9999994 9999995 20 0
9999995 9999996 9 0
9999996 9999997 4 0
9999997 9999998 8 0
9999998 9999999 6 0
9999999 10000000 7 0

9999998 rows × 3 columns


In [84]:
pd.Series(range(1,101))


Out[84]:
0       1
1       2
2       3
3       4
4       5
5       6
6       7
7       8
8       9
9      10
10     11
11     12
12     13
13     14
14     15
15     16
16     17
17     18
18     19
19     20
20     21
21     22
22     23
23     24
24     25
25     26
26     27
27     28
28     29
29     30
     ... 
70     71
71     72
72     73
73     74
74     75
75     76
76     77
77     78
78     79
79     80
80     81
81     82
82     83
83     84
84     85
85     86
86     87
87     88
88     89
89     90
90     91
91     92
92     93
93     94
94     95
95     96
96     97
97     98
98     99
99    100
dtype: int32

In [85]:
s=pd.Series(range(1,101))

In [86]:
fraud.drop(fraud.index[[s]])


Out[86]:
custID creditLine fraudRisk
0 1 2 0
101 102 10 0
102 103 28 0
103 104 22 0
104 105 10 0
105 106 4 0
106 107 11 0
107 108 6 0
108 109 4 0
109 110 4 0
110 111 10 1
111 112 4 0
112 113 3 0
113 114 5 0
114 115 4 0
115 116 2 0
116 117 4 0
117 118 13 0
118 119 9 0
119 120 8 0
120 121 3 0
121 122 11 0
122 123 5 0
123 124 4 0
124 125 13 0
125 126 4 0
126 127 5 0
127 128 6 0
128 129 3 0
129 130 2 0
... ... ... ...
9999970 9999971 11 0
9999971 9999972 7 0
9999972 9999973 4 0
9999973 9999974 14 0
9999974 9999975 3 0
9999975 9999976 8 0
9999976 9999977 7 0
9999977 9999978 7 0
9999978 9999979 4 0
9999979 9999980 7 0
9999980 9999981 13 0
9999981 9999982 18 0
9999982 9999983 2 0
9999983 9999984 5 0
9999984 9999985 6 0
9999985 9999986 2 0
9999986 9999987 6 0
9999987 9999988 5 0
9999988 9999989 14 1
9999989 9999990 11 0
9999990 9999991 5 0
9999991 9999992 5 0
9999992 9999993 2 0
9999993 9999994 5 0
9999994 9999995 20 0
9999995 9999996 9 0
9999996 9999997 4 0
9999997 9999998 8 0
9999998 9999999 6 0
9999999 10000000 7 0

9999900 rows × 3 columns


In [87]:
fraud.query('fraudRisk==0').head(6)


Out[87]:
custID creditLine fraudRisk
0 1 2 0
1 2 18 0
2 3 16 0
3 4 5 0
4 5 7 0
5 6 13 0

In [88]:
fraud.loc[fraud.fraudRisk == 0,'creditLine'] = -1; fraud


Out[88]:
custID creditLine fraudRisk
0 1 -1 0
1 2 -1 0
2 3 -1 0
3 4 -1 0
4 5 -1 0
5 6 -1 0
6 7 -1 0
7 8 -1 0
8 9 -1 0
9 10 -1 0
10 11 -1 0
11 12 -1 0
12 13 -1 0
13 14 -1 0
14 15 -1 0
15 16 -1 0
16 17 -1 0
17 18 -1 0
18 19 -1 0
19 20 -1 0
20 21 -1 0
21 22 -1 0
22 23 -1 0
23 24 -1 0
24 25 -1 0
25 26 -1 0
26 27 -1 0
27 28 -1 0
28 29 -1 0
29 30 13 1
... ... ... ...
9999970 9999971 -1 0
9999971 9999972 -1 0
9999972 9999973 -1 0
9999973 9999974 -1 0
9999974 9999975 -1 0
9999975 9999976 -1 0
9999976 9999977 -1 0
9999977 9999978 -1 0
9999978 9999979 -1 0
9999979 9999980 -1 0
9999980 9999981 -1 0
9999981 9999982 -1 0
9999982 9999983 -1 0
9999983 9999984 -1 0
9999984 9999985 -1 0
9999985 9999986 -1 0
9999986 9999987 -1 0
9999987 9999988 -1 0
9999988 9999989 14 1
9999989 9999990 -1 0
9999990 9999991 -1 0
9999991 9999992 -1 0
9999992 9999993 -1 0
9999993 9999994 -1 0
9999994 9999995 -1 0
9999995 9999996 -1 0
9999996 9999997 -1 0
9999997 9999998 -1 0
9999998 9999999 -1 0
9999999 10000000 -1 0

10000000 rows × 3 columns


In [89]:
fraud2.head()


Out[89]:
custID creditLine fraudRisk
0 1 -1 0
1 2 -1 0
2 3 -1 0
3 4 -1 0
4 5 -1 0

In [90]:
fraud3.head()


Out[90]:
custID gender state cardholder balance numTrans numIntlTrans creditLine fraudRisk
0 1 1 35 1 3000 4 14 2 0
1 2 2 2 1 0 9 0 18 0
2 3 2 2 1 0 27 9 16 0
3 4 1 15 1 0 12 0 5 0
4 5 1 46 1 0 11 16 7 0

In [ ]: