In [1]:
import pandas as pd

In [2]:
import os as os

In [11]:
os.getcwd()


Out[11]:
'C:\\Users\\Dell\\Downloads'

In [5]:
os.chdir('C:\\Users\\Dell\\Downloads')

In [6]:
os.listdir()


Out[6]:
['140749_2017.pdf',
 '2011-F01-0700-Rev4-MDDS.XLSX',
 '20150817143155.pdf',
 '20160111060911.pdf',
 '20170214052225.pdf',
 '861415_10151432783238421_2124270505_o (1).jpg',
 '861415_10151432783238421_2124270505_o.jpg',
 'AirPassengers.csv',
 'ajayo.jpg',
 'Alison Python  Invoice   - Sheet1.pdf',
 'Alison SAS  Invoice   - Sheet1.pdf',
 'Allison Interview Jones Invoice   - Sheet1.pdf',
 'Anaconda3-4.2.0-Windows-x86_64.exe',
 'apachehttpd.exe',
 'April invoice adaptive analytics   - Sheet1.pdf',
 'Assignment14_BusinessAnalytics (1).docx',
 'Assignment14_BusinessAnalytics.docx',
 'Assignment15_BusinessAnalytics.docx',
 'Assignment16_BusinessAnalytics (1).docx',
 'Assignment16_BusinessAnalytics (2).docx',
 'Assignment16_BusinessAnalytics.docx',
 'aug ust 2008.JPG',
 'avast_free_antivirus_setup_online.exe',
 'avinash_ltv.zip',
 'BigDiamonds.csv',
 'BigDiamonds.csv (1).zip',
 'BigDiamonds.csv (2)',
 'BigDiamonds.csv (2).zip',
 'BigDiamonds.csv.zip',
 'Boston (1).csv',
 'Boston.csv',
 'CAM- Ajay Ohri (1).pdf',
 'CAM- Ajay Ohri.pdf',
 'camtasia.exe',
 'ccFraud.csv',
 'Certificate of Incorporation - U74999DL2015PTC282030 (26 June 2015).pdf',
 'CHAP1-6PythonforRUsersAnapproachforDataScience.docx',
 'chapter+3+_+spark.html',
 'chi+square+test.ipynb',
 'chromeinstall-8u111.exe',
 'Cisco_WebEx_Add-On.exe',
 'class2.csv',
 'Collabera Invoice (1).pdf',
 'Collabera Invoice.pdf',
 'Collectcent Invoice.pdf',
 'college degrees.pdf',
 'DAP 1.pdf',
 'DAP 1.pptx',
 'DAP 6 RDBMS and SQL.pdf',
 'DAP 6 RDBMS and SQL.pptx',
 'data+exploration.ipynb',
 'data+manipulation.ipynb',
 'data1.csv',
 'datasets.csv',
 'Decision Trees.pdf',
 'DecisionStatsOfferLetter.docx',
 'DecisionStatsRelievingLetter.docx',
 'descriptive+stats+in+Python.ipynb',
 'desktop.ini',
 'Diamond (1).csv',
 'Diamond (2).csv',
 'Diamond (3).csv',
 'Diamond (4).csv',
 'Diamond (5).csv',
 'Diamond (6).csv',
 'Diamond.csv',
 'DropboxInstaller.exe',
 'edb_npgsql.exe',
 'edb_pgjdbc.exe',
 'edb_psqlodbc.exe',
 'edb_psqlodbc.exe-20170203172812',
 'edb_psqlodbc.exe-20170307203617',
 'final invoice edureka  - Sheet1.pdf',
 'FinalPythonforRUsersAnapproachforDataScience (1).docx',
 'FinalPythonforRUsersAnapproachforDataScience (2).docx',
 'FinalPythonforRUsersAnapproachforDataScience (3).docx',
 'FinalPythonforRUsersAnapproachforDataScience (4).docx',
 'FinalPythonforRUsersAnapproachforDataScience.docx',
 'final_webinar (1).pdf',
 'final_webinar.pdf',
 'Git-2.11.0-64-bit.exe',
 'Git-2.12.0-64-bit.exe',
 'GitHubSetup (1).exe',
 'GitHubSetup (2).exe',
 'GitHubSetup.exe',
 'GOMAUDIOGLOBALSETUP.EXE',
 'Hdma.csv',
 'Hedonic.csv',
 'HP Downloads',
 'HPSupportSolutionsFramework-12.5.32.203.exe',
 'image.png',
 'IMS PROSCHOOL Workshop.pptx.pdf',
 'IMS PROSCHOOL Workshop.pptx.pptx',
 'Introduction to SAS (1).pdf',
 'Introduction to SAS Part 1 (1).pdf',
 'Introduction to SAS Part 1.pdf',
 'Introduction to SAS.pdf',
 'Invoice for Digital Vidya.pdf',
 'Invoice for Weekendr.pdf',
 'Invoice format - Ajay Ohri CONTATA (1).xls',
 'Invoice format - Ajay Ohri CONTATA.xls',
 'invoice rapid miner.pdf',
 'iris2 (1).ipynb',
 'iris2 (2).ipynb',
 'iris2.ipynb',
 'January invoice Indicus  .pdf',
 'June AV   Invoice   - Sheet1.pdf',
 'Lecture 6 - KNN & Naive Bayes.ppt',
 'Local Disk (C) - Shortcut.lnk',
 'logistic regression - script for ppt.R',
 'logistic_regression_-_script_for_ppt.html',
 'March invoice Indicus   - Sheet1.pdf',
 'mongodb-win32-x86_64-2008plus-ssl-3.4.2-signed.msi',
 'mongodb-win32-x86_64-3.4.2-signed.msi',
 'mtcarslm.R',
 'nltk.ipynb',
 'notebook-Copy1.html',
 'Offer Letter - Ajay Ohri (1).pdf',
 'Offer Letter - Ajay Ohri.pdf',
 'Other Data Mining  Methods (1).pdf',
 'Other Data Mining  Methods.pdf',
 'output1 (1).xls',
 'output1 (2).xls',
 'output1.xls',
 'pandas+11.ipynb',
 'pandas+analysis+1.ipynb',
 'passport image.pdf',
 'Pawconinvoice2016.pdf',
 'Pawconinvoice2017 (1).pdf',
 'Pawconinvoice2017 (2).pdf',
 'Pawconinvoice2017 (3).pdf',
 'Pawconinvoice2017.pdf',
 'Payslip Feb 2016 - Sheet1.pdf',
 'Payslip Feb 2016.pdf',
 'Payslip Format Decisionstats - Sheet1.pdf',
 'Payslip Jan 2016 - Sheet1.pdf',
 'Payslip Jan 2016.pdf',
 'Payslip March 2016 - Sheet1.pdf',
 'Payslip March 2016.pdf',
 'pgd.csv',
 'postgresql-9.6.1-1-windows-x64.exe',
 'Program 1-results.rtf',
 'protein.csv',
 'python+with+postgres (1).ipynb',
 'python+with+postgres.ipynb',
 'Python.docx',
 'R-3.3.2-win.exe',
 'R-3.3.3-win.exe',
 'RCertificationExam.pdf',
 'reg+model.ipynb',
 'Revision -  Business Analytics (1).pdf',
 'Revision -  Business Analytics.pdf',
 'RidingMowers.csv',
 'rsconnect',
 'RStudio-1.0.136.exe',
 'Salary Slip, Feb 2016.pdf',
 'Salary Slip, Jan 2016.pdf',
 'Salary Slip, March 2016 (1).pdf',
 'Salary Slip, March 2016 (2).pdf',
 'Salary Slip, March 2016.pdf',
 'sales-of-shampoo-over-a-three-ye.csv',
 'SAS part 2.pdf',
 'SAS Part 3.pdf',
 'sas-university-edition-107140.pdf',
 'Scan0095.pdf',
 'Scanned Invoice for Collabera.pdf',
 'Screenshot 2017-01-23 12.36.55.png',
 'September invoice adaptive analytics   - Sheet1.pdf',
 'Sollers January.pdf',
 'sqlalchemy.ipynb',
 'stackoverflow-dump-analysis.html',
 'Sunstone.pdf',
 'Tableau.pdf',
 'TableauPublicDesktop-64bit-10-1-3.exe',
 'TableauPublicDesktop-64bit-10-1-4.exe',
 'telecom.csv',
 'TelecomServiceProviderCaseStudy.pdf',
 'Text Mining (1).pdf',
 'Text Mining.pdf',
 'third.sas7bdat',
 'Time Series  Forecasting (1).pdf',
 'Time Series  Forecasting.pdf',
 'ts.html',
 'ts.R',
 'Unconfirmed 373974.crdownload',
 'Unconfirmed 376562.crdownload',
 'Unconfirmed 376991.crdownload',
 'Unconfirmed 930917.crdownload',
 'Unconfirmed 950045.crdownload',
 'unvbasicvapp__9411008__ova__en__sp0__1.ova.crdownload',
 'VirtualBox-5.1.8-111374-Win (1).exe',
 'VirtualBox-5.1.8-111374-Win.exe',
 'Webinar for Business Analytics.pdf',
 'WhatsApp Image 2017-02-18 at 08.42.55 (1).jpeg',
 'WhatsApp Image 2017-02-18 at 08.42.55.jpeg']

In [8]:
fraud=pd.read_csv("ccFraud.csv")

In [9]:
fraud2=fraud

In [10]:
fraud3=fraud.copy()

In [12]:
type(fraud)


Out[12]:
pandas.core.frame.DataFrame

In [13]:
fraud.columns


Out[13]:
Index(['custID', 'gender', 'state', 'cardholder', 'balance', 'numTrans',
       'numIntlTrans', 'creditLine', 'fraudRisk'],
      dtype='object')

In [29]:
fraud.dtypes


Out[29]:
custID          int64
gender          int64
state           int64
cardholder      int64
balance         int64
numTrans        int64
numIntlTrans    int64
creditLine      int64
fraudRisk       int64
dtype: object

In [26]:
fraud.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 9 columns):
custID          int64
gender          int64
state           int64
cardholder      int64
balance         int64
numTrans        int64
numIntlTrans    int64
creditLine      int64
fraudRisk       int64
dtypes: int64(9)
memory usage: 686.6 MB

In [23]:
fraud.memory_usage()


Out[23]:
Index                 80
custID          80000000
gender          80000000
state           80000000
cardholder      80000000
balance         80000000
numTrans        80000000
numIntlTrans    80000000
creditLine      80000000
fraudRisk       80000000
dtype: int64

In [18]:
fraud.memory_usage(index=True).sum()


Out[18]:
720000080

In [20]:
fraud.shape


Out[20]:
(10000000, 9)

In [30]:
len(fraud)


Out[30]:
10000000

In [32]:
fraud.shape[1]


Out[32]:
9

In [33]:
len(fraud.columns)


Out[33]:
9

In [34]:
fraud.head()


Out[34]:
custID gender state cardholder balance numTrans numIntlTrans creditLine fraudRisk
0 1 1 35 1 3000 4 14 2 0
1 2 2 2 1 0 9 0 18 0
2 3 2 2 1 0 27 9 16 0
3 4 1 15 1 0 12 0 5 0
4 5 1 46 1 0 11 16 7 0

In [37]:
fraud.head(10)


Out[37]:
custID gender state cardholder balance numTrans numIntlTrans creditLine fraudRisk
0 1 1 35 1 3000 4 14 2 0
1 2 2 2 1 0 9 0 18 0
2 3 2 2 1 0 27 9 16 0
3 4 1 15 1 0 12 0 5 0
4 5 1 46 1 0 11 16 7 0
5 6 2 44 2 5546 21 0 13 0
6 7 1 3 1 2000 41 0 1 0
7 8 1 10 1 6016 20 3 6 0
8 9 2 32 1 2428 4 10 22 0
9 10 1 23 1 0 18 56 5 0

In [41]:
fraud.tail()


Out[41]:
custID gender state cardholder balance numTrans numIntlTrans creditLine fraudRisk
9999995 9999996 1 37 1 0 10 0 9 0
9999996 9999997 1 16 1 0 33 2 4 0
9999997 9999998 1 24 1 9000 38 0 8 0
9999998 9999999 1 28 1 7000 20 19 6 0
9999999 10000000 1 23 1 0 13 0 7 0

In [39]:
fraud.gender.head()


Out[39]:
0    1
1    2
2    2
3    1
4    1
Name: gender, dtype: int64

In [40]:
fraud.head().gender


Out[40]:
0    1
1    2
2    2
3    1
4    1
Name: gender, dtype: int64

In [42]:
fraud['gender'].head()


Out[42]:
0    1
1    2
2    2
3    1
4    1
Name: gender, dtype: int64

In [43]:
fraud[['gender','state','fraudRisk']].head()


Out[43]:
gender state fraudRisk
0 1 35 0
1 2 2 0
2 2 2 0
3 1 15 0
4 1 46 0

In [52]:
fraud.iloc[:,[1,2,8]].head()


Out[52]:
gender state fraudRisk
0 1 35 0
1 2 2 0
2 2 2 0
3 1 15 0
4 1 46 0

In [53]:
fraud.iloc[20:30,[1,2,8]]


Out[53]:
gender state fraudRisk
20 1 39 0
21 1 34 0
22 1 5 0
23 2 21 0
24 1 25 0
25 2 29 0
26 1 38 0
27 1 9 0
28 2 20 0
29 2 49 1

In [54]:
fraud.ix[20:30]


Out[54]:
custID gender state cardholder balance numTrans numIntlTrans creditLine fraudRisk
20 21 1 39 1 4000 24 0 3 0
21 22 1 34 1 0 22 0 3 0
22 23 1 5 1 0 7 0 11 0
23 24 2 21 1 0 15 0 3 0
24 25 1 25 1 0 12 0 65 0
25 26 2 29 1 5000 4 9 4 0
26 27 1 38 1 4000 21 5 3 0
27 28 1 9 1 12000 20 0 11 0
28 29 2 20 1 0 19 0 2 0
29 30 2 49 1 5192 84 0 13 1
30 31 1 29 1 0 23 0 4 0

In [57]:
fraud[['gender','state','fraudRisk']].ix[20:30]


Out[57]:
gender state fraudRisk
20 1 39 0
21 1 34 0
22 1 5 0
23 2 21 0
24 1 25 0
25 2 29 0
26 1 38 0
27 1 9 0
28 2 20 0
29 2 49 1
30 1 29 0

In [58]:
import numpy as np

In [59]:
np.random.choice(100,5)


Out[59]:
array([50, 83, 66, 90, 11])

In [60]:
np.random.choice(len(fraud),0.000001*len(fraud))


C:\Users\Dell\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  if __name__ == '__main__':
Out[60]:
array([2148721, 5783663, 4642367, 1808583, 3507214, 9673216, 7101634,
       2846151, 5954489, 7471532])

In [61]:
b=np.random.choice(len(fraud),0.000001*len(fraud))


C:\Users\Dell\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  if __name__ == '__main__':

In [62]:
fraud.ix[b]


Out[62]:
custID gender state cardholder balance numTrans numIntlTrans creditLine fraudRisk
802467 802468 2 44 1 11657 54 0 24 0
6215623 6215624 1 4 1 803 12 3 19 0
2763901 2763902 2 36 1 0 7 3 2 0
996086 996087 1 10 1 8711 10 0 10 0
2826430 2826431 1 44 1 3834 27 29 5 0
1298626 1298627 1 41 1 6000 10 0 5 0
4450853 4450854 2 49 1 9515 100 0 11 0
7596059 7596060 1 30 1 6764 16 0 6 0
2694049 2694050 2 2 1 2071 43 0 12 0
9762815 9762816 1 48 1 0 69 0 9 0

In [63]:
d=fraud.ix[b]

In [64]:
d.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 802467 to 9762815
Data columns (total 9 columns):
custID          10 non-null int64
gender          10 non-null int64
state           10 non-null int64
cardholder      10 non-null int64
balance         10 non-null int64
numTrans        10 non-null int64
numIntlTrans    10 non-null int64
creditLine      10 non-null int64
fraudRisk       10 non-null int64
dtypes: int64(9)
memory usage: 800.0 bytes

In [65]:
del d

In [67]:
del fraud['gender']

In [68]:
fraud.head()


Out[68]:
custID state cardholder balance numTrans numIntlTrans creditLine fraudRisk
0 1 35 1 3000 4 14 2 0
1 2 2 1 0 9 0 18 0
2 3 2 1 0 27 9 16 0
3 4 15 1 0 12 0 5 0
4 5 46 1 0 11 16 7 0

In [69]:
fraud.drop(fraud.columns[[1, 3]], axis=1, inplace=True)

In [70]:
fraud.head()


Out[70]:
custID cardholder numTrans numIntlTrans creditLine fraudRisk
0 1 1 4 14 2 0
1 2 1 9 0 18 0
2 3 1 27 9 16 0
3 4 1 12 0 5 0
4 5 1 11 16 7 0

In [71]:
fraud.drop(fraud.index[[1,2,3]])


Out[71]:
custID cardholder numTrans numIntlTrans creditLine fraudRisk
0 1 1 4 14 2 0
4 5 1 11 16 7 0
5 6 2 21 0 13 0
6 7 1 41 0 1 0
7 8 1 20 3 6 0
8 9 1 4 10 22 0
9 10 1 18 56 5 0
10 11 1 54 0 4 0
11 12 1 20 0 2 0
12 13 1 45 2 4 0
13 14 1 41 3 8 0
14 15 1 60 0 17 0
15 16 1 22 0 5 0
16 17 1 20 0 13 0
17 18 1 13 6 8 0
18 19 1 20 2 8 0
19 20 1 21 10 8 0
20 21 1 24 0 3 0
21 22 1 22 0 3 0
22 23 1 7 0 11 0
23 24 1 15 0 3 0
24 25 1 12 0 65 0
25 26 1 4 9 4 0
26 27 1 21 5 3 0
27 28 1 20 0 11 0
28 29 1 19 0 2 0
29 30 1 84 0 13 1
30 31 1 23 0 4 0
31 32 1 8 0 5 0
32 33 1 49 0 10 0
... ... ... ... ... ... ...
9999970 9999971 1 12 0 11 0
9999971 9999972 1 31 3 7 0
9999972 9999973 1 69 0 4 0
9999973 9999974 2 36 31 14 0
9999974 9999975 1 18 0 3 0
9999975 9999976 1 23 0 8 0
9999976 9999977 1 14 0 7 0
9999977 9999978 1 24 0 7 0
9999978 9999979 1 12 0 4 0
9999979 9999980 1 5 3 7 0
9999980 9999981 1 54 0 13 0
9999981 9999982 1 23 26 18 0
9999982 9999983 1 14 0 2 0
9999983 9999984 1 2 0 5 0
9999984 9999985 1 30 0 6 0
9999985 9999986 2 4 0 2 0
9999986 9999987 1 59 0 6 0
9999987 9999988 1 46 0 5 0
9999988 9999989 1 72 0 14 1
9999989 9999990 1 17 7 11 0
9999990 9999991 2 8 0 5 0
9999991 9999992 1 6 0 5 0
9999992 9999993 1 7 0 2 0
9999993 9999994 1 6 0 5 0
9999994 9999995 2 3 0 20 0
9999995 9999996 1 10 0 9 0
9999996 9999997 1 33 2 4 0
9999997 9999998 1 38 0 8 0
9999998 9999999 1 20 19 6 0
9999999 10000000 1 13 0 7 0

9999997 rows × 6 columns


In [72]:
fraud.head()


Out[72]:
custID cardholder numTrans numIntlTrans creditLine fraudRisk
0 1 1 4 14 2 0
1 2 1 9 0 18 0
2 3 1 27 9 16 0
3 4 1 12 0 5 0
4 5 1 11 16 7 0

In [73]:
fraud2.head()


Out[73]:
custID cardholder numTrans numIntlTrans creditLine fraudRisk
0 1 1 4 14 2 0
1 2 1 9 0 18 0
2 3 1 27 9 16 0
3 4 1 12 0 5 0
4 5 1 11 16 7 0

In [74]:
fraud3.head()


Out[74]:
custID gender state cardholder balance numTrans numIntlTrans creditLine fraudRisk
0 1 1 35 1 3000 4 14 2 0
1 2 2 2 1 0 9 0 18 0
2 3 2 2 1 0 27 9 16 0
3 4 1 15 1 0 12 0 5 0
4 5 1 46 1 0 11 16 7 0

In [78]:
fraud3.shape


Out[78]:
(10000000, 9)

In [79]:
s=pd.Series(range(1,101))

In [81]:
type(s)


Out[81]:
pandas.core.series.Series

In [ ]: