In [1]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/datascienceschool/docker_rpython/master/data/titanic.csv')
df.tail()
Out[1]:
Column descriptions — survived: whether the passenger survived; pclass: ticket class (1st/2nd/3rd); sex: sex; age: age; sibsp: number of siblings/spouses aboard; parch: number of parents/children aboard; fare: ticket fare; embarked: port of embarkation.
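As a quick check on these columns, a minimal sketch (assuming the df loaded above follows the standard Titanic column layout); the dtypes and missing-value counts motivate the label encoding and imputation steps that follow:
In [ ]:
cols = ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
print df[cols].dtypes          # sex and embarked are strings -> label encoding below
print df[cols].isnull().sum()  # age and embarked contain NaN -> imputation below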
In [2]:
df1 = df.iloc[:, 0:8]
df1.tail() # the later columns (from 'who' onward) look like ones the instructor added for explanation, so everything from 'who' to the end was dropped
Out[2]:
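The same eight columns can also be kept by name instead of position; a minimal equivalent sketch (label-based `.loc` slicing includes the end column, assuming the columns appear in the order described above):
In [ ]:
df1 = df.loc[:, 'survived':'embarked']  # keep survived ... embarked, drop 'who' onward
df1.tail()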
In [3]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df1['sex'] = le.fit_transform(df1['sex'])           # encode the string categories as integer codes
df1['embarked'] = le.fit_transform(df1['embarked'])  # same for the embarkation port
In [4]:
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='median', axis=0)  # fill NaN with each column's median
df2 = imp.fit_transform(df1)  # returns a NumPy array, so a DataFrame is rebuilt below
df2
Out[4]:
In [5]:
df3 = pd.DataFrame(df2, columns = ['survived', 'pclass','sex','age','sibsp','parch','fare','embarked'])
df3.tail()
Out[5]:
In [6]:
df3.info()
In [6]:
y = df3.iloc[:, 0]   # target: survived
x = df3.iloc[:, 1:]  # features: pclass ... embarked
In [7]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33, random_state= 42 )
In [8]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda = QuadraticDiscriminantAnalysis(store_covariances = True).fit(x,y)
In [84]:
qda.priors_
Out[84]:
In [90]:
549.0/(549 + 342)
Out[90]:
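The hand computation above checks the first entry of qda.priors_: 549 of the 891 passengers did not survive, so the empirical prior for class 0 is 549/891 ≈ 0.616. A minimal sketch of the same check taken straight from the labels (assuming y as defined above):
In [ ]:
y.value_counts(normalize=True)  # empirical class frequencies; should match qda.priors_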
In [73]:
qda.means_
Out[73]:
In [75]:
qda.covariances_
Out[75]:
In [9]:
qda_score_train = qda.score(x_train,y_train)
print "Train Score :", qda_score_train
print "=" * 40
qda_score_test = qda.score(x_test, y_test)
print "Test Score :", qda_score_test
In [11]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
LDA = LinearDiscriminantAnalysis(n_components = 2, solver = 'svd', store_covariance = True).fit(x,y)
solver options — svd: singular value decomposition (default); does not compute the covariance matrix, so it is recommended for data with a large number of features. lsqr: least-squares solution; can be combined with shrinkage. eigen: eigenvalue decomposition; can be combined with shrinkage.
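As the note above mentions, the lsqr and eigen solvers can be combined with shrinkage; a minimal sketch (shrinkage='auto' uses the Ledoit-Wolf estimate, and the choice of solver here is only illustrative):
In [ ]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda_shrunk = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto').fit(x_train, y_train)
print "lsqr + shrinkage Test Score :", lda_shrunk.score(x_test, y_test)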
In [101]:
LDA.coef_
Out[101]:
In [102]:
LDA.intercept_
Out[102]:
In [103]:
LDA.covariance_
Out[103]:
In [104]:
LDA.means_
Out[104]:
In [80]:
LDA.n_components
Out[80]:
In [105]:
LDA.explained_variance_ratio_
Out[105]:
In [12]:
LDA_score_train = LDA.score(x_train,y_train)
print "Train Score :", LDA_score_train
print "=" * 40
LDA_score_test = LDA.score(x_test, y_test)
print "Test Score :", LDA_score_test
In [13]:
from sklearn.naive_bayes import GaussianNB
NB = GaussianNB().fit(x,y)
In [108]:
NB.classes_
Out[108]:
In [109]:
NB.class_count_
Out[109]:
In [111]:
NB.class_prior_
Out[111]:
In [113]:
NB.theta_
Out[113]:
In [14]:
NB_score_train = NB.score(x_train,y_train)
print "Train Score :", NB_score_train
print "=" * 40
NB_score_test = NB.score(x_test, y_test)
print "Test Score :", NB_score_test
In [15]:
from sklearn.naive_bayes import MultinomialNB
MNB = MultinomialNB().fit(x,y)
In [137]:
MNB.classes_
Out[137]:
In [138]:
MNB.class_count_
Out[138]:
In [16]:
MNB_score_train = MNB.score(x_train,y_train)
print "Train Score :", MNB_score_train
print "=" * 40
MNB_score_test = MNB.score(x_test, y_test)
print "Test Score :", MNB_score_test
In [17]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
clf_1 = Pipeline([
('clf', MultinomialNB())
])
In [18]:
clf_1.fit(x_train, y_train)
print(classification_report(y_test, clf_1.predict(x_test), digits = 4))
In [17]:
from sklearn.tree import DecisionTreeClassifier
tree1 = DecisionTreeClassifier(criterion = 'entropy', max_depth = 1, random_state = 0).fit(x,y)
In [18]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5).fit(x_train, y_train)
In [19]:
from sklearn import tree
decision = tree.DecisionTreeClassifier()
decision.fit(x_train, y_train)
decision_score_train = decision.score(x_train, y_train)
print "Train score : ",decision_score_train
decision_score_test = decision.score(x_test, y_test)
print "-" * 40
print "Test score : ",decision_score_test
In [213]:
from sklearn.linear_model import Perceptron
perceptron = Perceptron(n_iter = 1, eta0 = 0.3, random_state= 1).fit(x,y)
perceptron_score_train = perceptron.score(x_train, y_train)
print "Train score : ", perceptron_score_train
print "=" *40
perceptron_score_test = perceptron.score(x_test, y_test)
print "Test score : ", perceptron_score_test
In [214]:
perceptron = Perceptron(n_iter = 5, eta0 = 0.3, random_state= 1).fit(x,y)
perceptron_score_train = perceptron.score(x_train, y_train)
print "Train score : ", perceptron_score_train
print "=" *40
perceptron_score_test = perceptron.score(x_test, y_test)
print "Test score : ", perceptron_score_test
In [215]:
perceptron = Perceptron(n_iter = 10, eta0 = 0.3, random_state= 1).fit(x,y)
perceptron_score_train = perceptron.score(x_train, y_train)
print "Train score : ", perceptron_score_train
print "=" *40
perceptron_score_test = perceptron.score(x_test, y_test)
print "Test score : ", perceptron_score_test
In [216]:
perceptron = Perceptron(n_iter = 20, eta0 = 0.3, random_state= 1).fit(x,y)
perceptron_score_train = perceptron.score(x_train, y_train)
print "Train score : ", perceptron_score_train
print "=" *40
perceptron_score_test = perceptron.score(x_test, y_test)
print "Test score : ", perceptron_score_test
In [217]:
perceptron = Perceptron(n_iter = 100, eta0 = 0.3, random_state= 1).fit(x,y)
perceptron_score_train = perceptron.score(x_train, y_train)
print "Train score : ", perceptron_score_train
print "=" *40
perceptron_score_test = perceptron.score(x_test, y_test)
print "Test score : ", perceptron_score_test
In [220]:
perceptron = Perceptron(n_iter = 150, eta0 = 0.3, random_state= 1).fit(x,y)
perceptron_score_train = perceptron.score(x_train, y_train)
print "Train score : ", perceptron_score_train
print "=" *40
perceptron_score_test = perceptron.score(x_test, y_test)
print "Test score : ", perceptron_score_test
In [21]:
from sklearn.svm import SVC
svm = SVC() # the default kernel is 'rbf'
svm.fit(x_train, y_train)
svm_score_train = svm.score(x_train, y_train)
print "Train Score : ", svm_score_train
svm_score_test = svm.score(x_test, y_test)
print "-" * 40
print "Test Score : ", svm_score_test
In [22]:
from sklearn.svm import SVC
svm = SVC(kernel = 'linear')
svm.fit(x_train, y_train)
svm_score_train = svm.score(x_train, y_train)
print "Train Score : ", svm_score_train
svm_score_test = svm.score(x_test, y_test)
print "-" * 40
print "Test Score : ", svm_score_test
In [22]:
from sklearn.svm import SVC
svm = SVC(kernel = 'sigmoid')
svm.fit(x_train, y_train)
svm_score_train = svm.score(x_train, y_train)
print "Train Score : ", svm_score_train
svm_score_test = svm.score(x_test, y_test)
print "-" * 40
print "Test Score : ", svm_score_test
In [ ]:
from sklearn.svm import SVC
svm = SVC(kernel = 'poly')
svm.fit(x_train, y_train)
svm_score_train = svm.score(x_train, y_train)
print "Train Score : ", svm_score_train
svm_score_test = svm.score(x_test, y_test)
print "-" * 40
print "Test Score : ", svm_score_test
In [23]:
models = pd.DataFrame({
'Model' : ['QDA', 'LDA', 'GaussianNB', 'MultinomialNB', 'DecisionTree', 'SVM'],
'Train_Score' : [qda_score_train, LDA_score_train, NB_score_train, MNB_score_train, decision_score_train, svm_score_train],
'Test_Score' : [qda_score_test, LDA_score_test, NB_score_test, MNB_score_test, decision_score_test, svm_score_test]
})
models.sort_values(by='Test_Score', ascending=True)
Out[23]: