In [19]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [20]:
#load the watermelon dataset
def get_watermelon_dataset():
    with open('/Users/HansZeng/Desktop/watermelon-dataset.txt') as file:
        lines = file.readlines()
    # split each line on commas, stripping the trailing newline
    m = [line.strip().split(",") for line in lines]
    df = pd.DataFrame(m[1:])  # drop the header line
    # append the 好瓜 (good melon) column: 是 for the first 8 rows, 否 for the last 9
    labels = ['是']*8
    labels.extend(['否']*9)
    df[9] = labels
    df.columns = ['编号','色泽','根蒂','敲声','纹理','脐部','触感','密度','含糖率','好瓜']
    return df
df = get_watermelon_dataset()
df
Out[20]:
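As an aside, if the text file really is a plain comma-separated table with a header row (which the m[1:] slice above suggests), pandas can also load it in one call; a minimal sketch under that assumption, reusing the same path:
In [ ]:
# hypothetical one-call loader, assuming a comma-separated file with a header row
df2 = pd.read_csv('/Users/HansZeng/Desktop/watermelon-dataset.txt')
df2.head()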
In [115]:
"""preprocess the dataset """
#从原始数据集中抽取新的数据集,新的数据集只包含“密度”和“含糖量”的数据
s1 = df['密度']
s2 = df['含糖率']
s3 = [1]*8
s3.extend([0]*9)
df = pd.DataFrame({'label':s3,'密度':s1,'含糖率':s2})
#convert the dataframe to matrix
m = df.values
#convert the string in matrix to float
for i in range(m.shape[0]):
for j in range(m.shape[1]):
m[i,j] = float(m[i,j])
#draw scatter diagram to plot raw data
xdim = m[:,1]
ydim = m[:,2]
plt.scatter(xdim, ydim, c=m[:,0])
plt.xlabel("ratio sugar")
plt.ylabel("density")
Out[115]:
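To make the two classes easier to tell apart, each class can also be drawn as its own series so matplotlib attaches a legend; a small sketch using boolean masks over m:
In [ ]:
# plot each class separately so a legend can be attached
pos = m[:,0] == 1
plt.scatter(m[pos,1], m[pos,2], label='good melon')
plt.scatter(m[~pos,1], m[~pos,2], label='bad melon')
plt.xlabel("density")
plt.ylabel("sugar ratio")
plt.legend()
plt.show()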
In [116]:
"""use the sklearn libaray to complete logistic regression homework"""
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import utils
from sklearn.metrics import classification_report
from sklearn import model_selection
#make the y_label encoder
lab_enc = preprocessing.LabelEncoder()
y_label = lab_enc.fit_transform(m[:,0])
#select the train and test set
X_train, X_test, y_train, y_test = model_selection.train_test_split(m[:,1:3], y_label,test_size=0.5, random_state=42)
#train the X_train
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
#get the summarize of fitting report
print(classification_report(y_test, y_pred))
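To see what was actually learned, the fitted weights and per-sample probabilities can be inspected; a short sketch using the classifier trained above:
In [ ]:
# inspect the fitted model: weights, intercept, probabilities, and accuracy
print(classifier.coef_, classifier.intercept_)
print(classifier.predict_proba(X_test))  # columns are P(y=0) and P(y=1)
print(classifier.score(X_test, y_test))  # mean accuracy on the test split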
As the report shows, the classification above is not very good. How can it be improved?
One quick thing to try within sklearn is standardizing the two features before fitting (a sketch below); after that, we implement the gradient descent algorithm ourselves.
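A hedged sketch of the standardization idea, using sklearn's StandardScaler in a pipeline; whether it actually helps on a 17-sample dataset would need to be checked:
In [ ]:
# possible improvement: standardize features before fitting (untested sketch)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit(X_train, y_train)
print(classification_report(y_test, pipe.predict(X_test)))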
In [149]:
import numpy as np
def likelihood_func(X, y, beta):  # the objective to minimize, equation (3.27) in Zhou Zhihua's book
    """
    @param X: the sample matrix, shape (17, 3)
    @param y: the label array
    @param beta: the parameter vector of (3.27)
    @return: the value of (3.27) at beta
    """
    r, c = X.shape
    total = 0.0
    for i in range(r):
        z = np.dot(beta, X[i])
        total += -y[i]*z + np.log(1 + np.exp(z))
    return total
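For reference, the function above is equation (3.27) from Zhou Zhihua's book,
$$\ell(\beta) = \sum_{i=1}^{m}\left(-y_i\beta^{\mathrm{T}}\hat{x}_i + \ln\left(1 + e^{\beta^{\mathrm{T}}\hat{x}_i}\right)\right),$$
where $\hat{x}_i = (x_i; 1)$. Because $e^{\beta^{\mathrm{T}}\hat{x}_i}$ can overflow for large arguments, a numerically safer, vectorized variant can use np.logaddexp; a sketch computing the same quantity:
In [ ]:
# numerically stable, vectorized sketch of the same objective
def likelihood_func_stable(X, y, beta):
    z = X.dot(beta)                               # beta^T x_i for all samples at once
    return np.sum(-y * z + np.logaddexp(0.0, z))  # log(1 + e^z) without overflow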
In [272]:
def gradient_descent(X, y, h):
    """
    Minimize (3.27) by gradient descent with a numerically estimated gradient.
    @param X: the sample matrix, shape (17, 3)
    @param y: the label array
    @param h: the step length of each iteration (also used as the perturbation size)
    @return: (beta, lhs) -- the fitted parameter of (3.27) and the objective value per iteration
    """
    maxtime = 500  # iteration limit
    r, c = X.shape
    eps = h  # finite-difference perturbation size
    beta = np.zeros(c)
    cur_lh = likelihood_func(X, y, beta)  # objective value at the starting point
    lhs = []  # record the objective value at each beta
    for i in range(maxtime):
        # estimate each partial derivative with a forward difference
        grad = np.zeros(c)
        for j in range(c):
            beta[j] += eps
            grad[j] = (likelihood_func(X, y, beta) - cur_lh) / eps
            beta[j] -= eps  # restore beta[j] before perturbing the next coordinate
        beta -= h * grad  # step against the estimated gradient
        cur_lh = likelihood_func(X, y, beta)
        lhs.append(cur_lh)
    return (beta, lhs)
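The inner loop above is a forward-difference estimate of each partial derivative,
$$\frac{\partial \ell}{\partial \beta_j} \approx \frac{\ell(\beta + \varepsilon e_j) - \ell(\beta)}{\varepsilon},$$
where $e_j$ is the $j$-th unit vector and $\varepsilon$ is the perturbation size (tied here to the step length $h$); the update then takes a step of length $h$ against this estimated gradient.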
In [290]:
# build the augmented samples x_hat = (density, sugar ratio, 1) and the label vector
X = np.column_stack((m[:,1:3], np.ones(m.shape[0])))
y = m[:,0]
In [242]:
"""draw line diagram to view the converage condition"""
x = range(500)
plt.subplot(3,1,1)
plt.plot(range(500),gradient_descent(X,y,0.1)[1])
plt.title("converage diagram with 0.1 step")
plt.subplot(3,1,2)
plt.plot(range(500),gradient_descent(X,y,0.05)[1])
plt.title("converage diagram with 0.05 step")
plt.subplot(3,1,3)
plt.plot(range(500),gradient_descent(X,y,0.01)[1])
plt.title("converage diagram with 0.01 step")
plt.show()
In [270]:
def gradient_descent_version2(X, y, h):
    """
    Minimize (3.27) by gradient descent with the analytic gradient.
    @param X: the sample matrix, shape (17, 3)
    @param y: the label array
    @param h: the step length of each iteration
    @return: (beta, lhs) -- the fitted parameter of (3.27) and the objective value per iteration
    """
    maxtime = 500  # iteration limit
    r, c = X.shape
    beta = np.zeros(c)
    lhs = []  # record the objective value at each beta
    for i in range(maxtime):
        beta -= h * partial_likelihood(X, y, beta)  # step against the analytic gradient
        lhs.append(likelihood_func(X, y, beta))
    return (beta, lhs)
def partial_likelihood(X, y, beta):
    """Analytic gradient of (3.27): the sum over samples of x_i * (p1(x_i) - y_i)."""
    r, c = X.shape
    partial = np.zeros(c)
    for i in range(r):
        p1 = 1 - 1/(1 + np.exp(np.dot(beta, X[i])))  # P(y=1 | x_i)
        partial += X[i] * (p1 - y[i])
    return partial
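partial_likelihood is the analytic gradient of (3.27), equation (3.30) in the book,
$$\frac{\partial \ell(\beta)}{\partial \beta} = -\sum_{i=1}^{m}\hat{x}_i\left(y_i - p_1(\hat{x}_i;\beta)\right), \qquad p_1(\hat{x}_i;\beta) = \frac{e^{\beta^{\mathrm{T}}\hat{x}_i}}{1 + e^{\beta^{\mathrm{T}}\hat{x}_i}},$$
which is exactly the X[i] * (p1 - y[i]) accumulation in the code.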
In [277]:
"""draw line diagram to view the converage condition for gradient_descent_version2"""
x = range(500)
plt.subplot(3,1,1)
plt.plot(range(500),gradient_descent_version2(X,y,0.1)[1])
plt.title("converage diagram with 0.1 step")
plt.subplot(3,1,2)
plt.plot(range(500),gradient_descent_version2(X,y,0.05)[1])
plt.title("converage diagram with 0.05 step")
plt.subplot(3,1,3)
plt.plot(range(500),gradient_descent_version2(X,y,0.01)[1])
plt.title("converage diagram with 0.01 step")
plt.show()
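A quick way to gain confidence in the analytic gradient is to compare it against a finite-difference estimate at a random point; a small sketch (the random seed and perturbation size are arbitrary choices):
In [ ]:
# sanity check: analytic gradient vs. forward-difference estimate
rng = np.random.RandomState(0)
beta0 = rng.randn(X.shape[1])
eps = 1e-6
num_grad = np.array([(likelihood_func(X, y, beta0 + eps * np.eye(X.shape[1])[j])
                      - likelihood_func(X, y, beta0)) / eps
                     for j in range(X.shape[1])])
print(num_grad)
print(partial_likelihood(X, y, beta0))  # should closely match num_grad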
In [288]:
'''
use our own logistic regression implementation to complete the homework
'''
# split the augmented matrix X so the constant bias column is carried along
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y_label, test_size=0.5, random_state=42)
def prob_select_true(beta, x):
    """Return P(y=1 | x) under the logistic model."""
    return 1 - 1/(1 + np.exp(np.dot(beta, x)))
beta = gradient_descent(X_train, y_train, 0.1)[0]  # get the optimized parameters
y_pred = []
for i in range(len(y_test)):
    y_pred.append(prob_select_true(beta, X_test[i]))
# print the predicted probabilities so they can be compared with y_test
print("y_test:", y_test, " y_pred:", y_pred)