In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
adult=pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",header=None)

In [3]:
adult.columns=["age ",
"workclass ",
"fnlwgt",
"education ",
"education-num",
"marital-status",
"occupation",
"relationship",
"race",
"sex",
"capital-gain",
"capital-loss",
"hours-per-week",
"native-country",
"income",
]

In [5]:
adult.head()


Out[5]:
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country income
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
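
Note: the same header-less UCI file can be read with the column names supplied up front. The raw file separates fields with a comma followed by a space, so skipinitialspace=True keeps string values such as 'State-gov' from carrying a leading blank. A minimal sketch (same column list as above):

    cols = ["age", "workclass", "fnlwgt", "education", "education-num",
            "marital-status", "occupation", "relationship", "race", "sex",
            "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]
    # names= assigns headers at read time; skipinitialspace=True trims the blank after each comma
    adult = pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
        header=None, names=cols, skipinitialspace=True)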

In [12]:
adult.iloc[:2,:3]


Out[12]:
age workclass fnlwgt
0 39 State-gov 77516
1 50 Self-emp-not-inc 83311

In [13]:
adult.iloc[1:2,2:3]


Out[13]:
fnlwgt
1 83311

In [14]:
adult.iloc[4]  # select row 4 by position; a single row comes back as a Series


Out[14]:
age                                28
workclass                     Private
fnlwgt                         338409
education                   Bachelors
education-num                      13
marital-status     Married-civ-spouse
occupation             Prof-specialty
relationship                     Wife
race                            Black
sex                            Female
capital-gain                        0
capital-loss                        0
hours-per-week                     40
native-country                   Cuba
income                          <=50K
Name: 4, dtype: object

In [15]:
adult.iloc[3:4,:]


Out[15]:
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country income
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
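
Note: .iloc selects by integer position and its slices are end-exclusive; .loc selects by index label and its slices are end-inclusive. With the default RangeIndex the labels coincide with the positions, which can hide the difference. A small sketch, assuming the cleaned column names set above:

    adult.iloc[0:2, 0:3]               # rows at positions 0-1, first three columns
    adult.loc[0:2, ["age", "fnlwgt"]]  # rows labelled 0, 1 AND 2, selected by column name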

In [20]:
adult.drop?

In [21]:
adult2=adult.drop('capital-loss', axis=1)

In [22]:
adult2.head()


Out[22]:
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain hours-per-week native-country income
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 40 Cuba <=50K

In [25]:
del adult2["capital-gain"]

In [27]:
adult2.head()


Out[27]:
age workclass fnlwgt education education-num marital-status occupation relationship race sex hours-per-week native-country income
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 40 Cuba <=50K
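
Note: the two cells above remove columns in different ways. drop returns a new frame and leaves the original untouched (the axis=1 keyword makes the intent explicit), while del mutates the frame in place, one column at a time. Restated as a sketch:

    adult2 = adult.drop("capital-loss", axis=1)   # returns a new frame; adult is unchanged
    del adult2["capital-gain"]                    # mutates adult2 in place
    adult2.head()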

In [28]:
titanic=pd.read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/datasets/Titanic.csv")

In [29]:
titanic.columns


Out[29]:
Index(['Unnamed: 0', 'Name', 'PClass', 'Age', 'Sex', 'Survived', 'SexCode'], dtype='object')

In [30]:
titanic=titanic.drop('Unnamed: 0', axis=1)

In [31]:
titanic.head()


Out[31]:
Name PClass Age Sex Survived SexCode
0 Allen, Miss Elisabeth Walton 1st 29.00 female 1 1
1 Allison, Miss Helen Loraine 1st 2.00 female 0 1
2 Allison, Mr Hudson Joshua Creighton 1st 30.00 male 0 0
3 Allison, Mrs Hudson JC (Bessie Waldo Daniels) 1st 25.00 female 0 1
4 Allison, Master Hudson Trevor 1st 0.92 male 1 0

In [32]:
import IPython

In [33]:
print(IPython.sys_info())


{'commit_hash': '5a894b9',
 'commit_source': 'installation',
 'default_encoding': 'cp1252',
 'ipython_path': 'C:\\Users\\KOGENTIX\\Anaconda3\\lib\\site-packages\\IPython',
 'ipython_version': '5.3.0',
 'os_name': 'nt',
 'platform': 'Windows-10-10.0.15063-SP0',
 'sys_executable': 'C:\\Users\\KOGENTIX\\Anaconda3\\python.exe',
 'sys_platform': 'win32',
 'sys_version': '3.6.1 |Anaconda 4.4.0 (64-bit)| (default, May 11 2017, '
                '13:25:24) [MSC v.1900 64 bit (AMD64)]'}

In [34]:
!pip install version_information


Collecting version_information
  Downloading version_information-1.0.3.tar.gz
Building wheels for collected packages: version-information
  Running setup.py bdist_wheel for version-information: started
  Running setup.py bdist_wheel for version-information: finished with status 'done'
  Stored in directory: C:\Users\KOGENTIX\AppData\Local\pip\Cache\wheels\4b\4c\f7\4d99d7820a507d8ae55204fcc00d66cdabf596d4b01228e7bd
Successfully built version-information
Installing collected packages: version-information
Successfully installed version-information-1.0.3

In [35]:
%load_ext version_information

In [36]:
%version_information


Out[36]:
Software   Version
Python     3.6.1 64bit [MSC v.1900 64 bit (AMD64)]
IPython    5.3.0
OS         Windows 10 10.0.15063 SP0
Tue Dec 12 14:13:34 2017 SE Asia Standard Time

In [37]:
!pip freeze


alabaster==0.7.10
anaconda-client==1.6.3
anaconda-navigator==1.6.2
anaconda-project==0.6.0
asn1crypto==0.22.0
astroid==1.4.9
astropy==1.3.2
Babel==2.4.0
backports.shutil-get-terminal-size==1.0.0
bcrypt==3.1.3
beautifulsoup4==4.6.0
bitarray==0.8.1
blaze==0.10.1
bleach==1.5.0
bokeh==0.12.5
boto==2.46.1
Bottleneck==1.2.1
brewer2mpl==1.4.1
cffi==1.10.0
chardet==3.0.3
click==6.7
cloudpickle==0.2.2
clyent==1.2.2
cm-api==16.0.0
colorama==0.3.9
comtypes==1.1.2
conda==4.3.22
contextlib2==0.5.5
cryptography==1.8.1
cycler==0.10.0
Cython==0.25.2
cytoolz==0.8.2
dask==0.14.3
datashape==0.5.4
decorator==4.0.11
distributed==1.16.3
docutils==0.13.1
entrypoints==0.2.2
et-xmlfile==1.0.1
fastcache==1.0.2
findspark==1.1.0
Flask==0.12.2
Flask-Cors==3.0.2
future==0.16.0
gevent==1.2.1
ggplot==0.11.5
greenlet==0.4.12
h5py==2.7.0
HeapDict==1.0.0
html5lib==0.999
idna==2.5
imagesize==0.7.1
ipykernel==4.6.1
ipython==5.3.0
ipython-genutils==0.2.0
ipywidgets==6.0.0
isort==4.2.5
itsdangerous==0.24
jdcal==1.3
jedi==0.10.2
Jinja2==2.9.6
jsonschema==2.6.0
jupyter==1.0.0
jupyter-client==5.0.1
jupyter-console==5.1.0
jupyter-core==4.3.0
kmodes==0.7
lazy-object-proxy==1.2.2
llvmlite==0.18.0
locket==0.2.0
lxml==3.7.3
MarkupSafe==0.23
matplotlib==2.0.2
menuinst==1.4.7
mistune==0.7.4
mpmath==0.19
msgpack-python==0.4.8
multipledispatch==0.4.9
navigator-updater==0.1.0
nbconvert==5.1.1
nbformat==4.3.0
networkx==1.11
nltk==3.2.3
nose==1.3.7
notebook==5.0.0
numba==0.33.0
numexpr==2.6.2
numpy==1.12.1
numpydoc==0.6.0
odo==0.5.0
olefile==0.44
openpyxl==2.4.7
packaging==16.8
pandas==0.20.1
pandasql==0.7.3
pandocfilters==1.4.1
paramiko==2.2.1
partd==0.3.8
path.py==10.3.1
pathlib2==2.2.1
patsy==0.4.1
pep8==1.7.0
pickleshare==0.7.4
Pillow==4.1.1
ply==3.10
prompt-toolkit==1.0.14
psutil==5.2.2
py==1.4.33
py4j==0.10.4
pyasn1==0.3.1
pycosat==0.6.2
pycparser==2.17
pycrypto==2.6.1
pycurl==7.43.0
pyflakes==1.5.0
Pygments==2.2.0
PyHive==0.4.0
pyhs2==0.6.0
pylint==1.6.4
PyNaCl==1.1.2
pyodbc==4.0.16
pyOpenSSL==17.0.0
pyparsing==2.1.4
pyspark==2.2.0
pytest==3.0.7
python-dateutil==2.6.0
pytz==2017.2
PyWavelets==0.5.2
pywin32==220
PyYAML==3.12
pyzmq==16.0.2
QtAwesome==0.4.4
qtconsole==4.3.0
QtPy==1.2.1
requests==2.14.2
rope-py3k==0.9.4.post1
rpy2==2.8.5
sasl==0.2.1
scikit-image==0.13.0
scikit-learn==0.18.1
scipy==0.19.0
seaborn==0.7.1
simplegeneric==0.8.1
singledispatch==3.4.0.3
six==1.10.0
snowballstemmer==1.2.1
sortedcollections==0.5.3
sortedcontainers==1.5.7
sphinx==1.5.6
spyder==3.1.4
SQLAlchemy==1.1.9
statsmodels==0.8.0
sympy==1.0
tables==3.2.2
tblib==1.3.2
testpath==0.3
thrift==0.10.0
thrift-sasl==0.2.1
thriftpy==0.3.9
toolz==0.8.2
tornado==4.5.1
traitlets==4.3.2
unicodecsv==0.14.1
version-information==1.0.3
wcwidth==0.1.7
Werkzeug==0.12.2
widgetsnbextension==2.0.0
win-unicode-console==0.5
wrapt==1.10.10
xlrd==1.0.0
XlsxWriter==0.9.6
xlwings==0.10.4
xlwt==1.2.0
zict==0.1.2

In [38]:
titanic.columns


Out[38]:
Index(['Name', 'PClass', 'Age', 'Sex', 'Survived', 'SexCode'], dtype='object')

In [40]:
titanic.head(10)


Out[40]:
Name PClass Age Sex Survived SexCode
0 Allen, Miss Elisabeth Walton 1st 29.00 female 1 1
1 Allison, Miss Helen Loraine 1st 2.00 female 0 1
2 Allison, Mr Hudson Joshua Creighton 1st 30.00 male 0 0
3 Allison, Mrs Hudson JC (Bessie Waldo Daniels) 1st 25.00 female 0 1
4 Allison, Master Hudson Trevor 1st 0.92 male 1 0
5 Anderson, Mr Harry 1st 47.00 male 1 0
6 Andrews, Miss Kornelia Theodosia 1st 63.00 female 1 1
7 Andrews, Mr Thomas, jr 1st 39.00 male 0 0
8 Appleton, Mrs Edward Dale (Charlotte Lamson) 1st 58.00 female 1 1
9 Artagaveytia, Mr Ramon 1st 71.00 male 0 0

In [41]:
titanic.tail()


Out[41]:
Name PClass Age Sex Survived SexCode
1308 Zakarian, Mr Artun 3rd 27.0 male 0 0
1309 Zakarian, Mr Maprieder 3rd 26.0 male 0 0
1310 Zenni, Mr Philip 3rd 22.0 male 0 0
1311 Lievens, Mr Rene 3rd 24.0 male 0 0
1312 Zimmerman, Leo 3rd 29.0 male 0 0

In [42]:
titanic.dtypes


Out[42]:
Name         object
PClass       object
Age         float64
Sex          object
Survived      int64
SexCode       int64
dtype: object

In [43]:
titanic.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 6 columns):
Name        1313 non-null object
PClass      1313 non-null object
Age         756 non-null float64
Sex         1313 non-null object
Survived    1313 non-null int64
SexCode     1313 non-null int64
dtypes: float64(1), int64(2), object(3)
memory usage: 61.6+ KB

In [44]:
type(titanic)


Out[44]:
pandas.core.frame.DataFrame

In [47]:
titanic2=titanic.iloc[:,1:6]

In [48]:
titanic2.columns


Out[48]:
Index(['PClass', 'Age', 'Sex', 'Survived', 'SexCode'], dtype='object')

In [49]:
del titanic2['Sex']

In [61]:
list1=[1,2,4,5]  # column positions for PClass, Age, Survived, SexCode

In [62]:
titanic2.columns


Out[62]:
Index(['PClass', 'Age', 'Survived', 'SexCode'], dtype='object')

In [63]:
titanic3=titanic.iloc[:,list1]

In [64]:
titanic3.columns


Out[64]:
Index(['PClass', 'Age', 'Survived', 'SexCode'], dtype='object')

In [65]:
titanic4=titanic[['PClass', 'Age', 'Survived', 'SexCode']]

In [66]:
titanic4.columns


Out[66]:
Index(['PClass', 'Age', 'Survived', 'SexCode'], dtype='object')

In [67]:
titanic.ix[20:28]


C:\Users\KOGENTIX\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  """Entry point for launching an IPython kernel.
Out[67]:
Name PClass Age Sex Survived SexCode
20 Behr, Mr Karl Howell 1st 26.0 male 1 0
21 Birnbaum, Mr Jakob 1st 25.0 male 0 0
22 Bishop, Mr Dickinson H 1st 25.0 male 1 0
23 Bishop, Mrs Dickinson H (Helen Walton) 1st 19.0 female 1 1
24 Bjornstrm-Steffansson, Mr Mauritz Hakan 1st 28.0 male 1 0
25 Blackwell, Mr Stephen Weart 1st 45.0 male 0 0
26 Blank, Mr Henry 1st 39.0 male 1 0
27 Bonnell, Miss Caroline 1st 30.0 female 1 1
28 Bonnell, Miss Elizabeth 1st 58.0 female 1 1

In [68]:
titanic.iloc[20:28,:]


Out[68]:
Name PClass Age Sex Survived SexCode
20 Behr, Mr Karl Howell 1st 26.0 male 1 0
21 Birnbaum, Mr Jakob 1st 25.0 male 0 0
22 Bishop, Mr Dickinson H 1st 25.0 male 1 0
23 Bishop, Mrs Dickinson H (Helen Walton) 1st 19.0 female 1 1
24 Bjornstrm-Steffansson, Mr Mauritz Hakan 1st 28.0 male 1 0
25 Blackwell, Mr Stephen Weart 1st 45.0 male 0 0
26 Blank, Mr Henry 1st 39.0 male 1 0
27 Bonnell, Miss Caroline 1st 30.0 female 1 1
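
Note: the deprecation warning above is worth acting on. .ix mixes label and positional semantics, which is why Out[67] includes row 28 (label slices are inclusive) while Out[68] stops at 27 (positional slices are end-exclusive). Rough equivalents with the non-deprecated indexers, for this default RangeIndex:

    titanic.loc[20:28]    # label-based, rows 20 through 28 inclusive (9 rows)
    titanic.iloc[20:29]   # positional, the same 9 rows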

In [71]:
titanic.Age.mean()


Out[71]:
30.397989417989415

In [72]:
import numpy as np

In [74]:
adult.index


Out[74]:
RangeIndex(start=0, stop=32561, step=1)

In [73]:
adult.index.values


Out[73]:
array([    0,     1,     2, ..., 32558, 32559, 32560], dtype=int64)

In [75]:
len(adult)


Out[75]:
32561

In [78]:
0.001*len(adult)


Out[78]:
32.561

In [79]:
round(0.001*len(adult))


Out[79]:
33

In [81]:
rows = np.random.choice(adult.index.values, round(0.001*len(adult)))
print(rows)


[16132  5411 11189 12552  1920  6733 31595 10530   278 23958  9603  8259
  3089 16182  6404  7994 19078 28333 14027 32534 14665 31239 32336 11534
 13507  7887 29310 13020 31124 16488  9102 17185  3422]

In [83]:
adult.iloc[rows,:]


Out[83]:
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country income
16132 47 Private 344916 Assoc-acdm 12 Divorced Transport-moving Not-in-family Black Male 0 0 40 United-States <=50K
5411 71 Local-gov 337064 Masters 14 Widowed Prof-specialty Not-in-family White Female 0 0 40 United-States <=50K
11189 59 Private 46466 HS-grad 9 Married-civ-spouse Transport-moving Husband White Male 0 0 40 United-States >50K
12552 51 Private 193720 HS-grad 9 Married-civ-spouse Craft-repair Husband White Male 0 0 42 United-States <=50K
1920 32 Private 120426 HS-grad 9 Separated Adm-clerical Unmarried White Female 0 0 40 United-States <=50K
6733 21 ? 152328 Some-college 10 Never-married ? Own-child White Male 0 0 20 United-States <=50K
31595 27 Private 278617 Some-college 10 Never-married Craft-repair Not-in-family White Male 0 0 40 United-States <=50K
10530 67 Local-gov 233681 Assoc-acdm 12 Married-civ-spouse Exec-managerial Husband White Male 0 0 35 United-States <=50K
278 25 Private 193787 Some-college 10 Never-married Tech-support Own-child White Female 0 0 40 United-States <=50K
23958 56 Private 265086 HS-grad 9 Married-civ-spouse Craft-repair Husband White Male 0 0 50 United-States >50K
9603 36 Private 272944 HS-grad 9 Never-married Transport-moving Not-in-family White Male 0 0 45 United-States <=50K
8259 23 Private 195767 HS-grad 9 Never-married Craft-repair Not-in-family White Male 0 0 40 United-States <=50K
3089 51 Self-emp-not-inc 145409 Bachelors 13 Married-civ-spouse Sales Husband White Male 15024 0 50 United-States >50K
16182 56 Private 145574 HS-grad 9 Married-civ-spouse Craft-repair Husband White Male 0 0 40 United-States >50K
6404 20 Self-emp-inc 95997 HS-grad 9 Never-married Farming-fishing Own-child White Male 0 0 70 United-States <=50K
7994 37 Private 178136 HS-grad 9 Married-civ-spouse Machine-op-inspct Husband Black Male 0 0 40 United-States <=50K
19078 60 Local-gov 259803 Bachelors 13 Married-civ-spouse Prof-specialty Wife White Female 0 0 45 United-States >50K
28333 61 Private 29059 HS-grad 9 Divorced Sales Unmarried White Female 0 2754 25 United-States <=50K
14027 34 Private 35644 Some-college 10 Married-civ-spouse Craft-repair Husband White Male 0 0 40 United-States >50K
32534 37 Private 179137 Some-college 10 Divorced Adm-clerical Unmarried White Female 0 0 39 United-States <=50K
14665 40 Private 26892 Bachelors 13 Married-civ-spouse Adm-clerical Husband White Male 0 0 40 United-States >50K
31239 51 Private 99987 10th 6 Separated Machine-op-inspct Unmarried Black Female 0 0 40 United-States <=50K
32336 32 Private 172415 HS-grad 9 Never-married Other-service Unmarried Black Female 0 0 40 United-States <=50K
11534 25 Private 35854 Some-college 10 Married-spouse-absent Sales Unmarried White Female 0 0 40 United-States <=50K
13507 34 Private 236543 HS-grad 9 Never-married Craft-repair Own-child White Male 0 0 40 United-States <=50K
7887 25 Private 34803 Bachelors 13 Never-married Exec-managerial Own-child White Female 0 0 20 United-States <=50K
29310 22 ? 313786 HS-grad 9 Divorced ? Other-relative Black Female 0 0 40 United-States <=50K
13020 34 Self-emp-not-inc 137223 10th 6 Never-married Other-service Own-child White Female 0 0 40 United-States <=50K
31124 63 Private 163708 9th 5 Widowed Other-service Not-in-family White Female 0 0 20 United-States <=50K
16488 33 ? 119918 Bachelors 13 Never-married ? Not-in-family Black Male 0 0 45 ? <=50K
9102 68 Private 201732 Some-college 10 Divorced Adm-clerical Unmarried White Female 0 0 40 United-States <=50K
17185 22 Private 202871 Assoc-voc 11 Married-civ-spouse Other-service Husband White Male 0 0 44 United-States <=50K
3422 40 Private 168936 Assoc-voc 11 Divorced Other-service Not-in-family White Female 0 0 32 United-States <=50K

In [84]:
adultsm=adult.iloc[rows,:]
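
Note: np.random.choice samples with replacement by default, so the 0.1% sample above can contain duplicate rows. Two hedged alternatives that sample without replacement:

    rows = np.random.choice(adult.index.values, round(0.001*len(adult)), replace=False)
    adultsm = adult.iloc[rows, :]

    # or let pandas draw the sample directly (without replacement by default)
    adultsm = adult.sample(n=round(0.001*len(adult)))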

In [85]:
diamonds=pd.read_csv("C:\\Users\\KOGENTIX\\Desktop\\training\\BigDiamonds.csv\\BigDiamonds.csv")

In [86]:
diamonds.head()


Out[86]:
Unnamed: 0 carat cut color clarity table depth cert measurements price x y z
0 1 0.25 V.Good K I1 59.0 63.7 GIA 3.96 x 3.95 x 2.52 NaN 3.96 3.95 2.52
1 2 0.23 Good G I1 61.0 58.1 GIA 4.00 x 4.05 x 2.30 NaN 4.00 4.05 2.30
2 3 0.34 Good J I2 58.0 58.7 GIA 4.56 x 4.53 x 2.67 NaN 4.56 4.53 2.67
3 4 0.21 V.Good D I1 60.0 60.6 GIA 3.80 x 3.82 x 2.31 NaN 3.80 3.82 2.31
4 5 0.31 V.Good K I1 59.0 62.2 EGL 4.35 x 4.26 x 2.68 NaN 4.35 4.26 2.68

In [87]:
diamonds.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 598024 entries, 0 to 598023
Data columns (total 13 columns):
Unnamed: 0      598024 non-null int64
carat           598024 non-null float64
cut             598024 non-null object
color           598024 non-null object
clarity         598024 non-null object
table           598024 non-null float64
depth           598024 non-null float64
cert            598024 non-null object
measurements    597978 non-null object
price           597311 non-null float64
x               596209 non-null float64
y               596172 non-null float64
z               595480 non-null float64
dtypes: float64(7), int64(1), object(5)
memory usage: 59.3+ MB

In [88]:
diamonds= diamonds.dropna(how='any')

In [89]:
diamonds.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 593784 entries, 493 to 598023
Data columns (total 13 columns):
Unnamed: 0      593784 non-null int64
carat           593784 non-null float64
cut             593784 non-null object
color           593784 non-null object
clarity         593784 non-null object
table           593784 non-null float64
depth           593784 non-null float64
cert            593784 non-null object
measurements    593784 non-null object
price           593784 non-null float64
x               593784 non-null float64
y               593784 non-null float64
z               593784 non-null float64
dtypes: float64(7), int64(1), object(5)
memory usage: 63.4+ MB

In [90]:
adult.describe()


Out[90]:
age fnlwgt education-num capital-gain capital-loss hours-per-week
count 32561.000000 3.256100e+04 32561.000000 32561.000000 32561.000000 32561.000000
mean 38.581647 1.897784e+05 10.080679 1077.648844 87.303830 40.437456
std 13.640433 1.055500e+05 2.572720 7385.292085 402.960219 12.347429
min 17.000000 1.228500e+04 1.000000 0.000000 0.000000 1.000000
25% 28.000000 1.178270e+05 9.000000 0.000000 0.000000 40.000000
50% 37.000000 1.783560e+05 10.000000 0.000000 0.000000 40.000000
75% 48.000000 2.370510e+05 12.000000 0.000000 0.000000 45.000000
max 90.000000 1.484705e+06 16.000000 99999.000000 4356.000000 99.000000

In [91]:
titanic.describe()


Out[91]:
Age Survived SexCode
count 756.000000 1313.000000 1313.000000
mean 30.397989 0.342727 0.351866
std 14.259049 0.474802 0.477734
min 0.170000 0.000000 0.000000
25% 21.000000 0.000000 0.000000
50% 28.000000 0.000000 0.000000
75% 39.000000 1.000000 1.000000
max 71.000000 1.000000 1.000000

In [93]:
diamonds.describe()


Out[93]:
Unnamed: 0 carat table depth price x y z
count 593784.000000 593784.000000 593784.000000 593784.000000 593784.000000 593784.000000 593784.000000 593784.000000
mean 299220.966754 1.072593 57.658755 61.091980 8755.808723 5.991952 6.200535 4.036075
std 172625.362546 0.813113 4.827985 7.487465 13022.108651 1.530444 1.485081 1.240932
min 494.000000 0.200000 0.000000 0.000000 300.000000 0.150000 1.000000 0.040000
25% 149637.750000 0.500000 56.000000 61.000000 1218.000000 4.740000 4.970000 3.120000
50% 299311.500000 0.900000 58.000000 62.000000 3503.000000 5.780000 6.050000 3.860000
75% 448775.250000 1.500000 59.000000 62.700000 11186.000000 6.970000 7.230000 4.610000
max 598024.000000 9.250000 75.900000 81.300000 99990.000000 13.890000 13.890000 13.180000

In [94]:
diamonds.price.describe()


Out[94]:
count    593784.000000
mean       8755.808723
std       13022.108651
min         300.000000
25%        1218.000000
50%        3503.000000
75%       11186.000000
max       99990.000000
Name: price, dtype: float64

In [95]:
diamonds.ppc=diamonds.price/diamonds.carat

In [96]:
diamonds.ppc.describe()


Out[96]:
count    593784.000000
mean       5788.585161
std        4570.993823
min         525.000000
25%        2666.666667
50%        4172.857143
75%        7437.198068
max       49519.402985
dtype: float64
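
Note on In [95]: dot-assignment (diamonds.ppc = ...) attaches a plain Python attribute to the DataFrame object rather than adding a column, which is why ppc never appears in diamonds.corr() or counts toward diamonds.shape below. Bracket assignment is the safer idiom; a sketch (this would add a real column, unlike the session shown):

    diamonds["ppc"] = diamonds["price"] / diamonds["carat"]  # real column: survives corr(), to_csv(), etc.
    diamonds["ppc"].describe()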

In [98]:
diamonds=diamonds.drop('Unnamed: 0', axis=1)

In [99]:
diamonds.corr()


Out[99]:
carat table depth price x y z
carat 1.000000 0.037631 0.008883 0.856340 0.859864 0.960857 0.791658
table 0.037631 1.000000 0.423914 0.023266 0.028462 0.045617 0.031170
depth 0.008883 0.423914 1.000000 -0.002129 -0.003632 0.007346 0.031961
price 0.856340 0.023266 -0.002129 1.000000 0.719537 0.796746 0.645191
x 0.859864 0.028462 -0.003632 0.719537 1.000000 0.893783 0.482109
y 0.960857 0.045617 0.007346 0.796746 0.893783 1.000000 0.819880
z 0.791658 0.031170 0.031961 0.645191 0.482109 0.819880 1.000000

In [101]:
diamonds.shape


Out[101]:
(593784, 12)

In [102]:
adult2=adult.copy()

In [107]:
! pip install pandasql


Requirement already satisfied: pandasql in c:\users\kogentix\anaconda3\lib\site-packages
Requirement already satisfied: numpy in c:\users\kogentix\anaconda3\lib\site-packages (from pandasql)
Requirement already satisfied: sqlalchemy in c:\users\kogentix\anaconda3\lib\site-packages (from pandasql)
Requirement already satisfied: pandas in c:\users\kogentix\anaconda3\lib\site-packages (from pandasql)
Requirement already satisfied: python-dateutil>=2 in c:\users\kogentix\anaconda3\lib\site-packages (from pandas->pandasql)
Requirement already satisfied: pytz>=2011k in c:\users\kogentix\anaconda3\lib\site-packages (from pandas->pandasql)
Requirement already satisfied: six>=1.5 in c:\users\kogentix\anaconda3\lib\site-packages (from python-dateutil>=2->pandas->pandasql)

In [109]:
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())

In [110]:
import pandas as pd

In [111]:
mycars=pd.read_csv("http://vincentarelbundock.github.io/Rdatasets/csv/datasets/mtcars.csv")

In [112]:
mycars.head()


Out[112]:
Unnamed: 0 mpg cyl disp hp drat wt qsec vs am gear carb
0 Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
1 Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
2 Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
3 Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
4 Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2

In [113]:
mycars.columns= ['brand','mpg', 'cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'vs',
       'am', 'gear', 'carb']

In [114]:
pysqldf("SELECT * FROM mycars LIMIT 10;")


Out[114]:
brand mpg cyl disp hp drat wt qsec vs am gear carb
0 Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
1 Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
2 Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
3 Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
4 Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
5 Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
6 Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
7 Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
8 Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
9 Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4

In [115]:
pysqldf("SELECT * from mycars where gear >3")


Out[115]:
brand mpg cyl disp hp drat wt qsec vs am gear carb
0 Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
1 Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
2 Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
3 Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
4 Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
5 Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
6 Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
7 Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
8 Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
9 Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
10 Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
11 Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
12 Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
13 Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
14 Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
15 Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
16 Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2

In [117]:
pysqldf("SELECT avg(mpg),gear from mycars group by gear ")


Out[117]:
avg(mpg) gear
0 16.106667 3
1 24.533333 4
2 21.380000 5
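
Note: the same aggregation can be written natively in pandas; the result should match the SQL output above, one mean mpg per gear value:

    # equivalent to: SELECT avg(mpg), gear FROM mycars GROUP BY gear
    mycars.groupby("gear")["mpg"].mean().reset_index()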

In [119]:
np.arange(0.1,1,0.1)


Out[119]:
array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9])

In [120]:
diamonds.quantile(np.arange(0.1,1,0.1))


Out[120]:
carat table depth price x y z
0.1 0.31 55.0 59.7 734.0 4.29 4.38 2.71
0.2 0.40 56.0 60.7 994.0 4.58 4.75 2.96
0.3 0.51 57.0 61.3 1510.0 4.95 5.15 3.22
0.4 0.70 57.0 61.7 2240.0 5.37 5.64 3.55
0.5 0.90 58.0 62.0 3503.0 5.78 6.05 3.86
0.6 1.01 58.0 62.3 5294.0 6.28 6.44 4.04
0.7 1.22 59.0 62.6 8362.0 6.64 6.87 4.44
0.8 1.57 60.0 63.0 14460.0 7.34 7.49 4.92
0.9 2.06 61.0 63.9 22211.0 8.12 8.21 5.60

In [121]:
titanic.columns


Out[121]:
Index(['Name', 'PClass', 'Age', 'Sex', 'Survived', 'SexCode'], dtype='object')

In [125]:
titanic.PClass.unique()


Out[125]:
array(['1st', '2nd', '*', '3rd'], dtype=object)

In [126]:
titanic.Survived.unique()


Out[126]:
array([1, 0], dtype=int64)

In [127]:
titanic.SexCode.unique()


Out[127]:
array([1, 0], dtype=int64)

In [128]:
titanic.PClass.value_counts()


Out[128]:
3rd    711
1st    322
2nd    279
*        1
Name: PClass, dtype: int64

In [129]:
titanic.Survived.value_counts()


Out[129]:
0    863
1    450
Name: Survived, dtype: int64

In [132]:
titanic.SexCode.value_counts()


Out[132]:
0    851
1    462
Name: SexCode, dtype: int64

In [133]:
pd.crosstab(titanic.SexCode,titanic.PClass)


Out[133]:
PClass * 1st 2nd 3rd
SexCode
0 1 179 172 499
1 0 143 107 212

In [134]:
pd.crosstab(titanic.Sex,titanic.PClass)


Out[134]:
PClass * 1st 2nd 3rd
Sex
female 0 143 107 212
male 1 179 172 499

In [135]:
pd.crosstab(titanic.Sex,titanic.Survived)


Out[135]:
Survived 0 1
Sex
female 154 308
male 709 142

In [136]:
pd.crosstab(titanic.PClass,titanic.Survived)


Out[136]:
Survived 0 1
PClass
* 1 0
1st 129 193
2nd 160 119
3rd 573 138

In [138]:
pd.crosstab(titanic.Sex,[titanic.PClass,titanic.Survived])


Out[138]:
PClass * 1st 2nd 3rd
Survived 0 0 1 0 1 0 1
Sex
female 0 9 134 13 94 132 80
male 1 120 59 147 25 441 58

In [140]:
x=titanic.groupby(['Survived'])

In [141]:
type(x)


Out[141]:
pandas.core.groupby.DataFrameGroupBy

In [142]:
x


Out[142]:
<pandas.core.groupby.DataFrameGroupBy object at 0x000002289F6CFCF8>

In [143]:
x.describe()


Out[143]:
Age SexCode
count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max
Survived
0 443.0 31.131670 13.438604 0.33 22.0 28.0 39.0 71.0 863.0 0.178447 0.383111 0.0 0.0 0.0 0.0 1.0
1 313.0 29.359585 15.307481 0.17 19.0 28.0 39.0 69.0 450.0 0.684444 0.465254 0.0 0.0 1.0 1.0 1.0

In [144]:
z=titanic.groupby(['Survived','Sex'])

In [145]:
z.Age


Out[145]:
<pandas.core.groupby.SeriesGroupBy object at 0x000002289F750BE0>

In [146]:
z.Age.mean()


Out[146]:
Survived  Sex   
0         female    24.901408
          male      32.320780
1         female    30.867143
          male      25.951875
Name: Age, dtype: float64

In [147]:
z.Age.mean().reset_index()


Out[147]:
Survived Sex Age
0 0 female 24.901408
1 0 male 32.320780
2 1 female 30.867143
3 1 male 25.951875

In [148]:
p=z.Age.mean().reset_index()

In [150]:
p.pivot(index='Survived',columns='Sex',values="Age")


Out[150]:
Sex female male
Survived
0 24.901408 32.320780
1 30.867143 25.951875

In [151]:
q=p.pivot(index='Survived',columns='Sex',values="Age")

In [152]:
q


Out[152]:
Sex female male
Survived
0 24.901408 32.320780
1 30.867143 25.951875

In [153]:
q.transpose()


Out[153]:
Survived 0 1
Sex
female 24.901408 30.867143
male 32.320780 25.951875
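
Note: the groupby-then-pivot sequence in In [146]-[150] can be collapsed into a single call; pivot_table groups and reshapes in one step and should reproduce the table above:

    titanic.pivot_table(values="Age", index="Survived", columns="Sex", aggfunc="mean")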

In [154]:
iris=pd.read_csv("https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/datasets/iris.csv")

In [165]:
iris=iris.drop('Unnamed: 0', axis=1)

In [166]:
len(iris)


Out[166]:
150

In [167]:
a=0.8*len(iris)

In [168]:
a


Out[168]:
120.0

In [169]:
np.arange(0,a)


Out[169]:
array([   0.,    1.,    2.,    3.,    4.,    5.,    6.,    7.,    8.,
          9.,   10.,   11.,   12.,   13.,   14.,   15.,   16.,   17.,
         18.,   19.,   20.,   21.,   22.,   23.,   24.,   25.,   26.,
         27.,   28.,   29.,   30.,   31.,   32.,   33.,   34.,   35.,
         36.,   37.,   38.,   39.,   40.,   41.,   42.,   43.,   44.,
         45.,   46.,   47.,   48.,   49.,   50.,   51.,   52.,   53.,
         54.,   55.,   56.,   57.,   58.,   59.,   60.,   61.,   62.,
         63.,   64.,   65.,   66.,   67.,   68.,   69.,   70.,   71.,
         72.,   73.,   74.,   75.,   76.,   77.,   78.,   79.,   80.,
         81.,   82.,   83.,   84.,   85.,   86.,   87.,   88.,   89.,
         90.,   91.,   92.,   93.,   94.,   95.,   96.,   97.,   98.,
         99.,  100.,  101.,  102.,  103.,  104.,  105.,  106.,  107.,
        108.,  109.,  110.,  111.,  112.,  113.,  114.,  115.,  116.,
        117.,  118.,  119.])

In [170]:
a=int(0.8*len(iris))

In [171]:
a


Out[171]:
120

In [172]:
np.arange(0,a)


Out[172]:
array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119])

In [173]:
b=np.arange(0,a)

In [174]:
iris.iloc[b,:]


Out[174]:
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
5 5.4 3.9 1.7 0.4 setosa
6 4.6 3.4 1.4 0.3 setosa
7 5.0 3.4 1.5 0.2 setosa
8 4.4 2.9 1.4 0.2 setosa
9 4.9 3.1 1.5 0.1 setosa
10 5.4 3.7 1.5 0.2 setosa
11 4.8 3.4 1.6 0.2 setosa
12 4.8 3.0 1.4 0.1 setosa
13 4.3 3.0 1.1 0.1 setosa
14 5.8 4.0 1.2 0.2 setosa
15 5.7 4.4 1.5 0.4 setosa
16 5.4 3.9 1.3 0.4 setosa
17 5.1 3.5 1.4 0.3 setosa
18 5.7 3.8 1.7 0.3 setosa
19 5.1 3.8 1.5 0.3 setosa
20 5.4 3.4 1.7 0.2 setosa
21 5.1 3.7 1.5 0.4 setosa
22 4.6 3.6 1.0 0.2 setosa
23 5.1 3.3 1.7 0.5 setosa
24 4.8 3.4 1.9 0.2 setosa
25 5.0 3.0 1.6 0.2 setosa
26 5.0 3.4 1.6 0.4 setosa
27 5.2 3.5 1.5 0.2 setosa
28 5.2 3.4 1.4 0.2 setosa
29 4.7 3.2 1.6 0.2 setosa
... ... ... ... ... ...
90 5.5 2.6 4.4 1.2 versicolor
91 6.1 3.0 4.6 1.4 versicolor
92 5.8 2.6 4.0 1.2 versicolor
93 5.0 2.3 3.3 1.0 versicolor
94 5.6 2.7 4.2 1.3 versicolor
95 5.7 3.0 4.2 1.2 versicolor
96 5.7 2.9 4.2 1.3 versicolor
97 6.2 2.9 4.3 1.3 versicolor
98 5.1 2.5 3.0 1.1 versicolor
99 5.7 2.8 4.1 1.3 versicolor
100 6.3 3.3 6.0 2.5 virginica
101 5.8 2.7 5.1 1.9 virginica
102 7.1 3.0 5.9 2.1 virginica
103 6.3 2.9 5.6 1.8 virginica
104 6.5 3.0 5.8 2.2 virginica
105 7.6 3.0 6.6 2.1 virginica
106 4.9 2.5 4.5 1.7 virginica
107 7.3 2.9 6.3 1.8 virginica
108 6.7 2.5 5.8 1.8 virginica
109 7.2 3.6 6.1 2.5 virginica
110 6.5 3.2 5.1 2.0 virginica
111 6.4 2.7 5.3 1.9 virginica
112 6.8 3.0 5.5 2.1 virginica
113 5.7 2.5 5.0 2.0 virginica
114 5.8 2.8 5.1 2.4 virginica
115 6.4 3.2 5.3 2.3 virginica
116 6.5 3.0 5.5 1.8 virginica
117 7.7 3.8 6.7 2.2 virginica
118 7.7 2.6 6.9 2.3 virginica
119 6.0 2.2 5.0 1.5 virginica

120 rows × 5 columns


In [184]:
test1=iris.iloc[b,:]

In [180]:
np.arange(a,len(iris))


Out[180]:
array([120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132,
       133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145,
       146, 147, 148, 149])

In [181]:
c=np.arange(a,len(iris))

In [183]:
iris.iloc[c,:]


Out[183]:
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
120 6.9 3.2 5.7 2.3 virginica
121 5.6 2.8 4.9 2.0 virginica
122 7.7 2.8 6.7 2.0 virginica
123 6.3 2.7 4.9 1.8 virginica
124 6.7 3.3 5.7 2.1 virginica
125 7.2 3.2 6.0 1.8 virginica
126 6.2 2.8 4.8 1.8 virginica
127 6.1 3.0 4.9 1.8 virginica
128 6.4 2.8 5.6 2.1 virginica
129 7.2 3.0 5.8 1.6 virginica
130 7.4 2.8 6.1 1.9 virginica
131 7.9 3.8 6.4 2.0 virginica
132 6.4 2.8 5.6 2.2 virginica
133 6.3 2.8 5.1 1.5 virginica
134 6.1 2.6 5.6 1.4 virginica
135 7.7 3.0 6.1 2.3 virginica
136 6.3 3.4 5.6 2.4 virginica
137 6.4 3.1 5.5 1.8 virginica
138 6.0 3.0 4.8 1.8 virginica
139 6.9 3.1 5.4 2.1 virginica
140 6.7 3.1 5.6 2.4 virginica
141 6.9 3.1 5.1 2.3 virginica
142 5.8 2.7 5.1 1.9 virginica
143 6.8 3.2 5.9 2.3 virginica
144 6.7 3.3 5.7 2.5 virginica
145 6.7 3.0 5.2 2.3 virginica
146 6.3 2.5 5.0 1.9 virginica
147 6.5 3.0 5.2 2.0 virginica
148 6.2 3.4 5.4 2.3 virginica
149 5.9 3.0 5.1 1.8 virginica

In [185]:
control1=iris.iloc[c,:]

In [188]:
rowsi = np.random.choice(iris.index.values, round(0.8*len(iris)),replace=False)
print(rowsi)


[147 112   8 139  99 119 110 109 135 144  60  89 138 117  83  38 101  66
  78  27   6  16 126 106 134  29  36  40  80  39  76  85  69  86 125  17
  82   4 132 118  62  93 121  22  35  63  53  30 127  67   5  32  28  48
  34 115   9  14  56  33 131  81 137 149 103 116  52  43  77 129  46  44
  21  96  75  68  25  92 122 102   1 124 108  57  97  50  94  73  90  64
 148 140   7 141 120  23  58  79 111 130  31 136  18  88  12  59  19 113
  26  42  70  71  91  20 143  72  95 107  49  74]

In [187]:
#np.random.choice?

In [189]:
test2=iris.iloc[rowsi,:]

In [190]:
test2


Out[190]:
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
147 6.5 3.0 5.2 2.0 virginica
112 6.8 3.0 5.5 2.1 virginica
8 4.4 2.9 1.4 0.2 setosa
139 6.9 3.1 5.4 2.1 virginica
99 5.7 2.8 4.1 1.3 versicolor
119 6.0 2.2 5.0 1.5 virginica
110 6.5 3.2 5.1 2.0 virginica
109 7.2 3.6 6.1 2.5 virginica
135 7.7 3.0 6.1 2.3 virginica
144 6.7 3.3 5.7 2.5 virginica
60 5.0 2.0 3.5 1.0 versicolor
89 5.5 2.5 4.0 1.3 versicolor
138 6.0 3.0 4.8 1.8 virginica
117 7.7 3.8 6.7 2.2 virginica
83 6.0 2.7 5.1 1.6 versicolor
38 4.4 3.0 1.3 0.2 setosa
101 5.8 2.7 5.1 1.9 virginica
66 5.6 3.0 4.5 1.5 versicolor
78 6.0 2.9 4.5 1.5 versicolor
27 5.2 3.5 1.5 0.2 setosa
6 4.6 3.4 1.4 0.3 setosa
16 5.4 3.9 1.3 0.4 setosa
126 6.2 2.8 4.8 1.8 virginica
106 4.9 2.5 4.5 1.7 virginica
134 6.1 2.6 5.6 1.4 virginica
29 4.7 3.2 1.6 0.2 setosa
36 5.5 3.5 1.3 0.2 setosa
40 5.0 3.5 1.3 0.3 setosa
80 5.5 2.4 3.8 1.1 versicolor
39 5.1 3.4 1.5 0.2 setosa
... ... ... ... ... ...
148 6.2 3.4 5.4 2.3 virginica
140 6.7 3.1 5.6 2.4 virginica
7 5.0 3.4 1.5 0.2 setosa
141 6.9 3.1 5.1 2.3 virginica
120 6.9 3.2 5.7 2.3 virginica
23 5.1 3.3 1.7 0.5 setosa
58 6.6 2.9 4.6 1.3 versicolor
79 5.7 2.6 3.5 1.0 versicolor
111 6.4 2.7 5.3 1.9 virginica
130 7.4 2.8 6.1 1.9 virginica
31 5.4 3.4 1.5 0.4 setosa
136 6.3 3.4 5.6 2.4 virginica
18 5.7 3.8 1.7 0.3 setosa
88 5.6 3.0 4.1 1.3 versicolor
12 4.8 3.0 1.4 0.1 setosa
59 5.2 2.7 3.9 1.4 versicolor
19 5.1 3.8 1.5 0.3 setosa
113 5.7 2.5 5.0 2.0 virginica
26 5.0 3.4 1.6 0.4 setosa
42 4.4 3.2 1.3 0.2 setosa
70 5.9 3.2 4.8 1.8 versicolor
71 6.1 2.8 4.0 1.3 versicolor
91 6.1 3.0 4.6 1.4 versicolor
20 5.4 3.4 1.7 0.2 setosa
143 6.8 3.2 5.9 2.3 virginica
72 6.3 2.5 4.9 1.5 versicolor
95 5.7 3.0 4.2 1.2 versicolor
107 7.3 2.9 6.3 1.8 virginica
49 5.0 3.3 1.4 0.2 setosa
74 6.4 2.9 4.3 1.3 versicolor

120 rows × 5 columns


In [191]:
rowsi


Out[191]:
array([147, 112,   8, 139,  99, 119, 110, 109, 135, 144,  60,  89, 138,
       117,  83,  38, 101,  66,  78,  27,   6,  16, 126, 106, 134,  29,
        36,  40,  80,  39,  76,  85,  69,  86, 125,  17,  82,   4, 132,
       118,  62,  93, 121,  22,  35,  63,  53,  30, 127,  67,   5,  32,
        28,  48,  34, 115,   9,  14,  56,  33, 131,  81, 137, 149, 103,
       116,  52,  43,  77, 129,  46,  44,  21,  96,  75,  68,  25,  92,
       122, 102,   1, 124, 108,  57,  97,  50,  94,  73,  90,  64, 148,
       140,   7, 141, 120,  23,  58,  79, 111, 130,  31, 136,  18,  88,
        12,  59,  19, 113,  26,  42,  70,  71,  91,  20, 143,  72,  95,
       107,  49,  74], dtype=int64)

In [192]:
indices = np.random.permutation(len(iris))
indices


Out[192]:
array([  9,  29, 130, 101,  74,  14, 139, 108,   5,  21,  88,  58, 122,
        27,  31,  32,  33, 107,  13, 129, 140,  43,  38, 125,  12, 141,
        80,   8,  16,  36,  75,   1,  93,  10,  82,  56, 119, 105,  67,
       114,  55,  57,  30, 137,  59,  34,  73,  91,  24, 149, 120,  17,
        90, 138,  35, 144, 136, 115,  99,  72,  79,  18,   4,  98,  70,
       112,  62,  15,  52, 121,  49, 146, 117, 110, 148, 133,  41,  26,
       106,  84,  89,  44,  94, 104, 118,  77,  78,  42,  47,  51,  68,
       132,   6,  69,  19,  86,   2, 143,  50, 103, 124, 127,  60,   3,
       128, 102,  39,  85, 126,  28,  96,  45,  71,  81, 116,  48, 135,
       123,  83,  25, 145,  22,  65,   7,  40, 142,  20,  46,  64,  54,
       111,  37,  66,   0,  97, 100, 113,  53,  87, 147,  76,  11, 109,
        63, 131, 134,  95,  61,  92,  23])

In [196]:
indices[0:120]


Out[196]:
array([  9,  29, 130, 101,  74,  14, 139, 108,   5,  21,  88,  58, 122,
        27,  31,  32,  33, 107,  13, 129, 140,  43,  38, 125,  12, 141,
        80,   8,  16,  36,  75,   1,  93,  10,  82,  56, 119, 105,  67,
       114,  55,  57,  30, 137,  59,  34,  73,  91,  24, 149, 120,  17,
        90, 138,  35, 144, 136, 115,  99,  72,  79,  18,   4,  98,  70,
       112,  62,  15,  52, 121,  49, 146, 117, 110, 148, 133,  41,  26,
       106,  84,  89,  44,  94, 104, 118,  77,  78,  42,  47,  51,  68,
       132,   6,  69,  19,  86,   2, 143,  50, 103, 124, 127,  60,   3,
       128, 102,  39,  85, 126,  28,  96,  45,  71,  81, 116,  48, 135,
       123,  83,  25])

In [195]:
indices[120:150]


Out[195]:
array([145,  22,  65,   7,  40, 142,  20,  46,  64,  54, 111,  37,  66,
         0,  97, 100, 113,  53,  87, 147,  76,  11, 109,  63, 131, 134,
        95,  61,  92,  23])
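
Note: the permuted positions above can be turned into an 80/20 split of the frame directly. A minimal sketch using the pandas iris frame loaded earlier (before it is reassigned to the sklearn Bunch below); the train/test names are illustrative:

    train = iris.iloc[indices[:120], :]   # first 80% of the shuffled positions
    test  = iris.iloc[indices[120:], :]   # remaining 20%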

In [197]:
from sklearn.linear_model import LogisticRegression

In [198]:
from sklearn import datasets

In [199]:
iris = datasets.load_iris()

In [200]:
type(iris)


Out[200]:
sklearn.datasets.base.Bunch

In [201]:
x,y=iris.data,iris.target

In [202]:
x


Out[202]:
array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2],
       [ 5.4,  3.9,  1.7,  0.4],
       [ 4.6,  3.4,  1.4,  0.3],
       [ 5. ,  3.4,  1.5,  0.2],
       [ 4.4,  2.9,  1.4,  0.2],
       [ 4.9,  3.1,  1.5,  0.1],
       [ 5.4,  3.7,  1.5,  0.2],
       [ 4.8,  3.4,  1.6,  0.2],
       [ 4.8,  3. ,  1.4,  0.1],
       [ 4.3,  3. ,  1.1,  0.1],
       [ 5.8,  4. ,  1.2,  0.2],
       [ 5.7,  4.4,  1.5,  0.4],
       [ 5.4,  3.9,  1.3,  0.4],
       [ 5.1,  3.5,  1.4,  0.3],
       [ 5.7,  3.8,  1.7,  0.3],
       [ 5.1,  3.8,  1.5,  0.3],
       [ 5.4,  3.4,  1.7,  0.2],
       [ 5.1,  3.7,  1.5,  0.4],
       [ 4.6,  3.6,  1. ,  0.2],
       [ 5.1,  3.3,  1.7,  0.5],
       [ 4.8,  3.4,  1.9,  0.2],
       [ 5. ,  3. ,  1.6,  0.2],
       [ 5. ,  3.4,  1.6,  0.4],
       [ 5.2,  3.5,  1.5,  0.2],
       [ 5.2,  3.4,  1.4,  0.2],
       [ 4.7,  3.2,  1.6,  0.2],
       [ 4.8,  3.1,  1.6,  0.2],
       [ 5.4,  3.4,  1.5,  0.4],
       [ 5.2,  4.1,  1.5,  0.1],
       [ 5.5,  4.2,  1.4,  0.2],
       [ 4.9,  3.1,  1.5,  0.1],
       [ 5. ,  3.2,  1.2,  0.2],
       [ 5.5,  3.5,  1.3,  0.2],
       [ 4.9,  3.1,  1.5,  0.1],
       [ 4.4,  3. ,  1.3,  0.2],
       [ 5.1,  3.4,  1.5,  0.2],
       [ 5. ,  3.5,  1.3,  0.3],
       [ 4.5,  2.3,  1.3,  0.3],
       [ 4.4,  3.2,  1.3,  0.2],
       [ 5. ,  3.5,  1.6,  0.6],
       [ 5.1,  3.8,  1.9,  0.4],
       [ 4.8,  3. ,  1.4,  0.3],
       [ 5.1,  3.8,  1.6,  0.2],
       [ 4.6,  3.2,  1.4,  0.2],
       [ 5.3,  3.7,  1.5,  0.2],
       [ 5. ,  3.3,  1.4,  0.2],
       [ 7. ,  3.2,  4.7,  1.4],
       [ 6.4,  3.2,  4.5,  1.5],
       [ 6.9,  3.1,  4.9,  1.5],
       [ 5.5,  2.3,  4. ,  1.3],
       [ 6.5,  2.8,  4.6,  1.5],
       [ 5.7,  2.8,  4.5,  1.3],
       [ 6.3,  3.3,  4.7,  1.6],
       [ 4.9,  2.4,  3.3,  1. ],
       [ 6.6,  2.9,  4.6,  1.3],
       [ 5.2,  2.7,  3.9,  1.4],
       [ 5. ,  2. ,  3.5,  1. ],
       [ 5.9,  3. ,  4.2,  1.5],
       [ 6. ,  2.2,  4. ,  1. ],
       [ 6.1,  2.9,  4.7,  1.4],
       [ 5.6,  2.9,  3.6,  1.3],
       [ 6.7,  3.1,  4.4,  1.4],
       [ 5.6,  3. ,  4.5,  1.5],
       [ 5.8,  2.7,  4.1,  1. ],
       [ 6.2,  2.2,  4.5,  1.5],
       [ 5.6,  2.5,  3.9,  1.1],
       [ 5.9,  3.2,  4.8,  1.8],
       [ 6.1,  2.8,  4. ,  1.3],
       [ 6.3,  2.5,  4.9,  1.5],
       [ 6.1,  2.8,  4.7,  1.2],
       [ 6.4,  2.9,  4.3,  1.3],
       [ 6.6,  3. ,  4.4,  1.4],
       [ 6.8,  2.8,  4.8,  1.4],
       [ 6.7,  3. ,  5. ,  1.7],
       [ 6. ,  2.9,  4.5,  1.5],
       [ 5.7,  2.6,  3.5,  1. ],
       [ 5.5,  2.4,  3.8,  1.1],
       [ 5.5,  2.4,  3.7,  1. ],
       [ 5.8,  2.7,  3.9,  1.2],
       [ 6. ,  2.7,  5.1,  1.6],
       [ 5.4,  3. ,  4.5,  1.5],
       [ 6. ,  3.4,  4.5,  1.6],
       [ 6.7,  3.1,  4.7,  1.5],
       [ 6.3,  2.3,  4.4,  1.3],
       [ 5.6,  3. ,  4.1,  1.3],
       [ 5.5,  2.5,  4. ,  1.3],
       [ 5.5,  2.6,  4.4,  1.2],
       [ 6.1,  3. ,  4.6,  1.4],
       [ 5.8,  2.6,  4. ,  1.2],
       [ 5. ,  2.3,  3.3,  1. ],
       [ 5.6,  2.7,  4.2,  1.3],
       [ 5.7,  3. ,  4.2,  1.2],
       [ 5.7,  2.9,  4.2,  1.3],
       [ 6.2,  2.9,  4.3,  1.3],
       [ 5.1,  2.5,  3. ,  1.1],
       [ 5.7,  2.8,  4.1,  1.3],
       [ 6.3,  3.3,  6. ,  2.5],
       [ 5.8,  2.7,  5.1,  1.9],
       [ 7.1,  3. ,  5.9,  2.1],
       [ 6.3,  2.9,  5.6,  1.8],
       [ 6.5,  3. ,  5.8,  2.2],
       [ 7.6,  3. ,  6.6,  2.1],
       [ 4.9,  2.5,  4.5,  1.7],
       [ 7.3,  2.9,  6.3,  1.8],
       [ 6.7,  2.5,  5.8,  1.8],
       [ 7.2,  3.6,  6.1,  2.5],
       [ 6.5,  3.2,  5.1,  2. ],
       [ 6.4,  2.7,  5.3,  1.9],
       [ 6.8,  3. ,  5.5,  2.1],
       [ 5.7,  2.5,  5. ,  2. ],
       [ 5.8,  2.8,  5.1,  2.4],
       [ 6.4,  3.2,  5.3,  2.3],
       [ 6.5,  3. ,  5.5,  1.8],
       [ 7.7,  3.8,  6.7,  2.2],
       [ 7.7,  2.6,  6.9,  2.3],
       [ 6. ,  2.2,  5. ,  1.5],
       [ 6.9,  3.2,  5.7,  2.3],
       [ 5.6,  2.8,  4.9,  2. ],
       [ 7.7,  2.8,  6.7,  2. ],
       [ 6.3,  2.7,  4.9,  1.8],
       [ 6.7,  3.3,  5.7,  2.1],
       [ 7.2,  3.2,  6. ,  1.8],
       [ 6.2,  2.8,  4.8,  1.8],
       [ 6.1,  3. ,  4.9,  1.8],
       [ 6.4,  2.8,  5.6,  2.1],
       [ 7.2,  3. ,  5.8,  1.6],
       [ 7.4,  2.8,  6.1,  1.9],
       [ 7.9,  3.8,  6.4,  2. ],
       [ 6.4,  2.8,  5.6,  2.2],
       [ 6.3,  2.8,  5.1,  1.5],
       [ 6.1,  2.6,  5.6,  1.4],
       [ 7.7,  3. ,  6.1,  2.3],
       [ 6.3,  3.4,  5.6,  2.4],
       [ 6.4,  3.1,  5.5,  1.8],
       [ 6. ,  3. ,  4.8,  1.8],
       [ 6.9,  3.1,  5.4,  2.1],
       [ 6.7,  3.1,  5.6,  2.4],
       [ 6.9,  3.1,  5.1,  2.3],
       [ 5.8,  2.7,  5.1,  1.9],
       [ 6.8,  3.2,  5.9,  2.3],
       [ 6.7,  3.3,  5.7,  2.5],
       [ 6.7,  3. ,  5.2,  2.3],
       [ 6.3,  2.5,  5. ,  1.9],
       [ 6.5,  3. ,  5.2,  2. ],
       [ 6.2,  3.4,  5.4,  2.3],
       [ 5.9,  3. ,  5.1,  1.8]])

In [203]:
y


Out[203]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [205]:
from sklearn.cross_validation import train_test_split


C:\Users\KOGENTIX\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [206]:
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.8)
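
Note: two things to flag in this last cell. The import comes from the deprecated sklearn.cross_validation module (the warning above points to model_selection), and test_size=0.8 reserves 80% of the rows for the test set, leaving only 20% for training. A hedged rewrite that keeps the usual 80/20 train/test split:

    from sklearn.model_selection import train_test_split

    # test_size=0.2 holds out 20% for testing; random_state pins the shuffle for reproducibility
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)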

In [ ]: