In [1]:
import pandas as pd
import os as os
import numpy as np
In [2]:
# Load the UCI "Adult" census extract straight from the archive.
# header=None: the file has no header row, so columns start out numbered 0..14.
# skipinitialspace=True: the raw file separates fields with ", ", and without
# this flag every string value would be stored with a leading blank
# (e.g. " Private" instead of "Private").
adult = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None,
    skipinitialspace=True,
)
In [3]:
# Replace the 15 positional column numbers with descriptive labels.
# The labels are written without stray whitespace so that label lookups such
# as adult["age"] or adult.workclass match exactly (the previous "age ",
# "workclass " and "education " labels carried a trailing blank).
adult.columns = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]
In [5]:
# Preview the first rows of the frame.
adult.head()
Out[5]:
In [12]:
# Positional slice: rows 0-1 and columns 0-2 (the end of each range is excluded).
adult.iloc[:2,:3]
Out[12]:
In [13]:
# Half-open ranges of length one: row position 1, column position 2.
# Slices keep the result two-dimensional.
adult.iloc[1:2,2:3]
Out[13]:
In [14]:
# A single integer position (the trailing comma makes a one-element tuple)
# selects one row instead of a slice of rows.
adult.iloc[4,]  # row at positional index 4
Out[14]:
In [15]:
# Same idea with a slice: only row position 3, all columns.
adult.iloc[3:4,:]
Out[15]:
In [20]:
adult.drop?
In [21]:
adult2=adult.drop('capital-loss',1)
In [22]:
adult2.head()
Out[22]:
In [25]:
del adult2["capital-gain"]
In [27]:
adult2.head()
Out[27]:
In [28]:
titanic=pd.read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/datasets/Titanic.csv")
In [29]:
titanic.columns
Out[29]:
In [30]:
titanic=titanic.drop('Unnamed: 0',1)
In [31]:
titanic.head()
Out[31]:
In [32]:
import IPython
In [33]:
# Print IPython's own description of the running environment.
print (IPython.sys_info())
In [34]:
# Shell escape: installs the `version_information` extension with no version pin.
# NOTE(review): `!pip` runs whichever pip is first on PATH, which may not be
# the kernel's environment — confirm, or use the %pip magic where available.
!pip install version_information
In [35]:
# Register the extension's line magic with this kernel.
%load_ext version_information
In [36]:
# Run the magic provided by the extension loaded in the previous cell.
%version_information
Out[36]:
In [37]:
# List every installed distribution with its version (shell escape).
!pip freeze
In [38]:
# Column labels after the 'Unnamed: 0' column was dropped.
titanic.columns
Out[38]:
In [40]:
# First ten rows.
titanic.head(10)
Out[40]:
In [41]:
# Last rows of the frame.
titanic.tail()
Out[41]:
In [42]:
# dtype pandas chose for each column.
titanic.dtypes
Out[42]:
In [43]:
# Column summary; info() prints its report and returns None, hence no Out cell.
titanic.info()
In [44]:
# Show the Python type of the object bound to `titanic`.
type(titanic)
Out[44]:
In [47]:
# Three ways to pick a subset of columns.
# 1) Positional slice: all rows, column positions 1-5 (position 6 is excluded).
titanic2=titanic.iloc[:,1:6]
In [48]:
titanic2.columns
Out[48]:
In [49]:
# Remove 'Sex' from titanic2 in place; re-running this cell raises KeyError.
del titanic2['Sex']
In [61]:
# 2) Explicit list of column positions (position 3 is skipped).
# presumably position 3 is the 'Sex' column removed above — confirm against titanic.columns
list1=[1,2,4,5]
In [62]:
titanic2.columns
Out[62]:
In [63]:
# Select the listed positions from the full frame.
titanic3=titanic.iloc[:,list1]
In [64]:
titanic3.columns
Out[64]:
In [65]:
# 3) Explicit list of column labels; the double brackets return a DataFrame.
titanic4=titanic[['PClass', 'Age', 'Survived', 'SexCode']]
In [66]:
titanic4.columns
Out[66]:
In [67]:
titanic.ix[20:28]
Out[67]:
In [68]:
titanic.iloc[20:28,:]
Out[68]:
In [71]:
titanic.Age.mean()
Out[71]:
In [72]:
import numpy as np
In [74]:
adult.index
Out[74]:
In [73]:
adult.index.values
Out[73]:
In [75]:
len(adult)
Out[75]:
In [78]:
0.001*len(adult)
Out[78]:
In [79]:
round(0.001*len(adult))
Out[79]:
In [81]:
rows = np.random.choice(adult.index.values, round(0.001*len(adult)))
print(rows)
In [83]:
adult.iloc[rows,:]
Out[83]:
In [84]:
adultsm=adult.iloc[rows,:]
In [85]:
# Load the diamonds data from a CSV on the local disk.
# NOTE(review): absolute, user-specific Windows path — this cell only runs on
# the original author's machine; consider a configurable data directory.
diamonds=pd.read_csv("C:\\Users\\KOGENTIX\\Desktop\\training\\BigDiamonds.csv\\BigDiamonds.csv")
In [86]:
diamonds.head()
Out[86]:
In [87]:
# Column summary before cleaning (compare with the info() call after dropna).
diamonds.info()
In [88]:
# Discard every row that has at least one missing value. The name `diamonds`
# is rebound, so the unfiltered frame is no longer reachable after this cell.
diamonds= diamonds.dropna(how='any')
In [89]:
# Same summary after the rows with missing values were removed.
diamonds.info()
In [90]:
# Summary statistics for each of the three frames.
adult.describe()
Out[90]:
In [91]:
titanic.describe()
Out[91]:
In [93]:
diamonds.describe()
Out[93]:
In [94]:
# Summary statistics for the single `price` column.
diamonds.price.describe()
Out[94]:
In [95]:
diamonds.ppc=diamonds.price/diamonds.carat
In [96]:
diamonds.ppc.describe()
Out[96]:
In [98]:
diamonds=diamonds.drop('Unnamed: 0',1)
In [99]:
diamonds.corr()
Out[99]:
In [101]:
# (rows, columns) of the diamonds frame.
diamonds.shape
Out[101]:
In [102]:
# Rebind adult2 to a fresh copy of `adult`; this discards the earlier adult2
# that had 'capital-loss' and 'capital-gain' removed.
adult2=adult.copy()
In [107]:
# Shell escape: installs pandasql with no version pin.
# NOTE(review): `!pip` may target a different environment than the kernel — confirm.
! pip install pandasql
In [109]:
from pandasql import sqldf
def pysqldf(q):
    """Run the SQL query string ``q`` through ``sqldf`` and return its result.

    The module-level namespace (``globals()``) is handed to ``sqldf`` as the
    environment, so names used as tables in the query are looked up among the
    notebook's top-level variables.
    """
    return sqldf(q, globals())
In [110]:
import pandas as pd
In [111]:
mycars=pd.read_csv("http://vincentarelbundock.github.io/Rdatasets/csv/datasets/mtcars.csv")
In [112]:
mycars.head()
Out[112]:
In [113]:
mycars.columns= ['brand','mpg', 'cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'vs',
'am', 'gear', 'carb']
In [114]:
pysqldf("SELECT * FROM mycars LIMIT 10;")
Out[114]:
In [115]:
pysqldf("SELECT * from mycars where gear >3")
Out[115]:
In [117]:
pysqldf("SELECT avg(mpg),gear from mycars group by gear ")
Out[117]:
In [119]:
np.arange(0.1,1,0.1)
Out[119]:
In [120]:
diamonds.quantile(np.arange(0.1,1,0.1))
Out[120]:
In [121]:
# Column labels, as a reminder before the categorical summaries below.
titanic.columns
Out[121]:
In [125]:
# Distinct values present in each of three columns.
titanic.PClass.unique()
Out[125]:
In [126]:
titanic.Survived.unique()
Out[126]:
In [127]:
titanic.SexCode.unique()
Out[127]:
In [128]:
# Frequency of each distinct value in the same three columns.
titanic.PClass.value_counts()
Out[128]:
In [129]:
titanic.Survived.value_counts()
Out[129]:
In [132]:
titanic.SexCode.value_counts()
Out[132]:
In [133]:
# Two-way frequency tables: first argument gives the rows, second the columns.
pd.crosstab(titanic.SexCode,titanic.PClass)
Out[133]:
In [134]:
pd.crosstab(titanic.Sex,titanic.PClass)
Out[134]:
In [135]:
pd.crosstab(titanic.Sex,titanic.Survived)
Out[135]:
In [136]:
pd.crosstab(titanic.PClass,titanic.Survived)
Out[136]:
In [138]:
# Three-way table: passing a list as the second argument nests Survived
# under PClass in the column header.
pd.crosstab(titanic.Sex,[titanic.PClass,titanic.Survived])
Out[138]:
In [140]:
# Group the passengers by the Survived column; nothing is computed yet.
# NOTE(review): `x` is rebound to iris.data in a later cell of this notebook.
x=titanic.groupby(['Survived'])
In [141]:
type(x)
Out[141]:
In [142]:
# Displaying the groupby object shows only its repr, not any data.
x
Out[142]:
In [143]:
# Summary statistics computed separately for each Survived group.
x.describe()
Out[143]:
In [144]:
# Group by two keys: one group per (Survived, Sex) combination.
z=titanic.groupby(['Survived','Sex'])
In [145]:
# Select the Age column within each group (still lazy).
z.Age
Out[145]:
In [146]:
# Mean age per (Survived, Sex) group, indexed by both keys.
z.Age.mean()
Out[146]:
In [147]:
# reset_index turns the two group keys back into ordinary columns.
z.Age.mean().reset_index()
Out[147]:
In [148]:
p=z.Age.mean().reset_index()
In [150]:
# Reshape long to wide: one row per Survived value, one column per Sex value,
# cells holding the mean Age.
p.pivot(index='Survived',columns='Sex',values="Age")
Out[150]:
In [151]:
q=p.pivot(index='Survived',columns='Sex',values="Age")
In [152]:
q
Out[152]:
In [153]:
# Swap rows and columns of the pivoted table.
q.transpose()
Out[153]:
In [154]:
iris=pd.read_csv("https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/datasets/iris.csv")
In [165]:
iris=iris.drop('Unnamed: 0',1)
In [166]:
len(iris)
Out[166]:
In [167]:
# Sequential 80/20 split by row position.
# First attempt: 80% of the row count, which is a float at this point.
a=0.8*len(iris)
In [168]:
a
Out[168]:
In [169]:
# arange with a float stop yields a float array, which is why `a` is
# converted to int in the next cell before being used for positions.
np.arange(0,a)
Out[169]:
In [170]:
# Same cut-off as an integer.
a=int(0.8*len(iris))
In [171]:
a
Out[171]:
In [172]:
np.arange(0,a)
Out[172]:
In [173]:
# Positions 0 .. a-1: the first 80% of the rows.
b=np.arange(0,a)
In [174]:
iris.iloc[b,:]
Out[174]:
In [184]:
# First 80% of the rows, in file order (no shuffling).
test1=iris.iloc[b,:]
In [180]:
np.arange(a,len(iris))
Out[180]:
In [181]:
# Positions a .. len(iris)-1: the remaining 20% of the rows.
c=np.arange(a,len(iris))
In [183]:
iris.iloc[c,:]
Out[183]:
In [185]:
# Remaining 20% of the rows, in file order.
control1=iris.iloc[c,:]
In [188]:
# Random 80% of the row labels; replace=False means no label is drawn twice.
# No random seed is set, so each run selects different rows.
rowsi = np.random.choice(iris.index.values, round(0.8*len(iris)),replace=False)
print(rowsi)
In [187]:
# Help lookup used while writing the cell above (kept disabled): np.random.choice?
In [189]:
# Rows at the sampled positions.
test2=iris.iloc[rowsi,:]
In [190]:
test2
Out[190]:
In [191]:
rowsi
Out[191]:
In [192]:
# Alternative: shuffle all row positions once, then cut the shuffled array.
indices = np.random.permutation(len(iris))
indices
Out[192]:
In [196]:
# First 120 shuffled positions.
# NOTE(review): 120 and 150 are hard-coded — presumably 80% and 100% of a
# 150-row frame; confirm against len(iris).
indices[0:120]
Out[196]:
In [195]:
# Shuffled positions 120 through 149.
indices[120:150]
Out[195]:
In [197]:
from sklearn.linear_model import LogisticRegression
In [198]:
from sklearn import datasets
In [199]:
# Load scikit-learn's bundled iris data set.
# NOTE: this rebinds `iris`, which until now held the DataFrame read from CSV;
# cells above that call DataFrame methods on `iris` must not be re-run after this.
iris = datasets.load_iris()
In [200]:
# Show the type of the object returned by load_iris().
type(iris)
Out[200]:
In [201]:
# Take the `data` and `target` attributes of the loaded object.
# NOTE: `x` previously held the titanic groupby object and is rebound here.
x,y=iris.data,iris.target
In [202]:
x
Out[202]:
In [203]:
y
Out[203]:
In [205]:
from sklearn.cross_validation import train_test_split
In [206]:
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.8)
In [ ]: