In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import os
os.chdir("/Users/zhouyu/Documents/Zhou_Yu/DS/kaggle_challenge/House price/")
In [2]:
dataset = pd.read_csv('train.csv')
dataset.head(5)
Out[2]:
In [3]:
testset = pd.read_csv("test.csv")
testset.head(5)
Out[3]:
In [4]:
dataset.info()
In [5]:
dataset.columns[dataset.isnull().sum()>0]
Out[5]:
In [6]:
testset.info()
In [7]:
testset.columns[testset.isnull().sum()>0]
Out[7]:
In [8]:
dataset_exp = dataset.copy()  # create an experimental copy of the training set
testset_exp = testset.copy()  # create an experimental copy of the test set
# split dataset_exp into features and target
feas = dataset_exp.shape[1]
X_train = dataset_exp.iloc[:, :feas-1]  # all columns except the last one (SalePrice)
y_target = dataset_exp['SalePrice']
print "Train data shape is:", X_train.shape
print "target data shape is:", y_target.shape
X = X_train.append(testset_exp, ignore_index=True)  # stack train and test features for joint preprocessing
print "all features shape is:", X.shape
In [9]:
X.info()
In [10]:
X.index.is_unique
Out[10]:
In [11]:
# replace nulls in all object (categorical) columns with a 'Mis' category
cat_cols = []    # all categorical features
cat_n_cols = []  # categorical features that contain null values
n_cols = []      # all numerical features
for i in X.columns:
    if X[i].dtype == 'object':
        if X[i].isnull().sum() > 0:
            cat_n_cols.append(i)
            X[i].fillna("Mis", inplace=True)  # do not forget inplace=True
        cat_cols.append(i)
    else:
        n_cols.append(i)
print "Here are all categorical features: ", '||'.join(cat_cols)
print "Here are all categorical features that contain missing values: ", '||'.join(cat_n_cols)
print "Here are all numerical features: ", '||'.join(n_cols)
In [12]:
n_n_cols = X.columns[X.isnull().sum()>0]
print "Numerical features with missing values:","||".join(n_n_cols)
In [13]:
import seaborn as sns
In [15]:
# visualize the effect of mapping missing values to the 'Mis' category
g = sns.PairGrid(pd.concat([X.iloc[:1460, :], y_target], axis=1), x_vars=cat_n_cols[18:], y_vars="SalePrice")
g = g.map(sns.stripplot)
# possibly delete columns where a large proportion of the values is missing
# (given that those missing values show no obvious correlation with the sale price)
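In [ ]:
# quick sketch: since the categorical nulls above were recoded as "Mis", the
# fraction of "Mis" per column approximates the original missing-value proportion
# and helps decide which of these columns are sparse enough to drop
mis_frac = (X[cat_n_cols] == "Mis").mean().sort_values(ascending=False)
print mis_frac.head(10)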
In [16]:
# PoolQC has a large number of missing values and shows
# no obvious correlation with the sale price,
# so I simply drop it
X.drop(['PoolQC'], axis=1, inplace=True)
In [17]:
X[n_n_cols].info()
In [19]:
g = sns.PairGrid(pd.concat([X.iloc[:1460, :], y_target], axis=1), x_vars=n_n_cols, y_vars="SalePrice", dropna=False)
g = g.map(sns.jointplot)
In [20]:
# fill the remaining (numerical) NA values with each column's median
X.fillna(X.median()[n_n_cols], inplace=True)
Out[20]:
In [21]:
X.head(5)
Out[21]:
In [23]:
import matplotlib.pylab as plt
train_set = pd.concat([X.iloc[:1460, :], y_target], axis=1)
corr = train_set.corr()
sns.set(style="white")
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True  # show only the lower triangle
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, square=True, linewidths=.5,
            cbar_kws={"shrink": .5}, ax=ax)
Out[23]:
In [26]:
num_features = 0
cat_cols.remove('PoolQC')
for i in cat_cols:
    temp = len(X[i].unique())
    num_features += temp
    #print i, "with %d unique values" % temp
    print X[i].value_counts()
    if temp > 10:
        print "feature", i, "has %d unique values" % temp
        #print X[i].value_counts()
print "There are a total of %d features after converting %d categorical features" % (num_features, len(cat_cols))
# one-hot encoding every categorical feature gives ~259 binary features plus ~37 numerical ones,
# which is almost ~20% of the number of training rows
# To reduce the dimensionality:
# 1. match the data with the test set
# 2. eliminate some really sparse features
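In [ ]:
# alternative sketch (not used below): pandas' get_dummies one-hot encodes the
# combined train/test frame in a single call, which avoids mismatched category
# levels between train and test
X_dummies = pd.get_dummies(X[cat_cols])
print X_dummies.shape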
In [27]:
# encode each categorical feature as integer labels
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
d = defaultdict(LabelEncoder)
X_cat = X[cat_cols].apply(lambda x: d[x.name].fit_transform(x))
# expand the integer labels into a binary (one-hot) matrix
from sklearn.preprocessing import OneHotEncoder
Ohe = OneHotEncoder(sparse=False)
Ohe.fit(X_cat)
X_cat = Ohe.transform(X_cat)
X_cat = pd.DataFrame(X_cat)
print "Total number of cols after conversion: %d" % X_cat.shape[1]
print X_cat.shape
X_n = X[n_cols]
X_allcat = pd.concat([X_cat, X_n], axis=1)
print "the new feature matrix has the shape", X_allcat.shape
In [108]:
# collect one-hot columns that are non-zero for at most one sample
col_sparse = []
for i in X_allcat.columns:
    if len((X_allcat[i].nonzero())[0]) <= 1:
        col_sparse.append(i)
print len(col_sparse)
In [109]:
X_2 = X_allcat.drop(col_sparse, axis = 1)
print "the non-sparse features matrix has the shape", X_2.shape
In [95]:
# split the combined matrix back into train/test, then carve out a validation set
from sklearn.model_selection import train_test_split
X_train_a = X_allcat.iloc[:1460, :]
X_test_a = X_allcat.iloc[1460:, :]
y_target = dataset_exp['SalePrice']
X_train, X_val, y_train, y_val = train_test_split(X_train_a, y_target, test_size=0.1, random_state=1)
print "Training set shape is:", X_train.shape
print "Val set shape is:", X_val.shape
print "y_train shape is:", y_train.shape
print "y_val shape is:", y_val.shape
In [110]:
# same split, but using the reduced (non-sparse) feature matrix
from sklearn.model_selection import train_test_split
X_train_a = X_2.iloc[:1460, :]
X_test_a = X_2.iloc[1460:, :]
y_target = dataset_exp['SalePrice']
X_train, X_val, y_train, y_val = train_test_split(X_train_a, y_target, test_size=0.1, random_state=1)
print "Training set shape is:", X_train.shape
print "Val set shape is:", X_val.shape
print "y_train shape is:", y_train.shape
print "y_val shape is:", y_val.shape
In [98]:
X_test_a['LotArea'].head(5)  # a quick sanity check that the train/test split indices line up
Out[98]:
In [190]:
# examine the distribution of the log-transformed target
import matplotlib.mlab as mlab
from scipy.stats import norm, lognorm
lg_y_target = np.log(y_target)
(mu, sigma) = norm.fit(lg_y_target)
n, bins, patches = plt.hist(lg_y_target, 60, normed=1, facecolor='green', alpha=0.75)
y = mlab.normpdf(bins, mu, sigma)
l = plt.plot(bins, y, 'r--', linewidth=2)
plt.xlabel('log sale price')
plt.ylabel('probability')
plt.grid(True)
plt.show()
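In [ ]:
# additional check (sketch): a normal probability plot of the log-transformed
# target; points lying close to the line support the log transform
from scipy import stats
stats.probplot(lg_y_target, dist="norm", plot=plt)
plt.show()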
In [143]:
# Machine learning method selection:
# 1. Random Forest trained on the raw target, evaluated with RMSE of the log prices
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
lg_y_train = np.log(y_train)
lg_y_val = np.log(y_val)
rf_dict = {}
for n_est in np.arange(80, 120, 10):
    rf = RandomForestRegressor(n_estimators=n_est)
    forest = rf.fit(X_train, y_train)
    y_train_pred = np.log(rf.predict(X_train))
    y_val_pred = np.log(rf.predict(X_val))
    #train_score = rf.score(X_train, y_train)
    #res_score = rf.score(X_val, y_val)
    train_score = mean_squared_error(lg_y_train, y_train_pred)
    res_score = mean_squared_error(lg_y_val, y_val_pred)
    print "n_estimators: ", n_est
    print "training set score is:", sqrt(train_score), "Val set score is:", sqrt(res_score)
    rf_dict[n_est] = (train_score, res_score)
# 110 estimators seems to be a reasonable choice
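In [ ]:
# quick sketch: visualize the train/validation RMSE stored in rf_dict above
ns = sorted(rf_dict.keys())
plt.plot(ns, [sqrt(rf_dict[n][0]) for n in ns], 'o-', label='train RMSE (log price)')
plt.plot(ns, [sqrt(rf_dict[n][1]) for n in ns], 'o-', label='val RMSE (log price)')
plt.xlabel('n_estimators')
plt.legend()
plt.show()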
In [132]:
# 2. Random Forest trained on the log-transformed target (worse than the raw target)
lg_y_train = np.log(y_train)
lg_y_val = np.log(y_val)
rf_log_dic = {}
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from math import sqrt
for n_est in np.arange(50, 200, 10):
    rf = RandomForestRegressor(n_estimators=n_est)
    forest = rf.fit(X_train, lg_y_train)
    y_train_pred = rf.predict(X_train)
    y_val_pred = rf.predict(X_val)
    #train_score = r2_score(y_train, y_train_pred)
    train_score = mean_squared_error(lg_y_train, y_train_pred)
    #res_score = r2_score(y_val, y_val_pred)
    res_score = mean_squared_error(lg_y_val, y_val_pred)
    print "n_estimators: ", n_est
    print "training set score is:", sqrt(train_score), "Val set score is:", sqrt(res_score)
    rf_log_dic[n_est] = (train_score, res_score)
In [128]:
import xgboost as xgb
# XGBoost regressor with manually tuned hyperparameters
regr = xgb.XGBRegressor(colsample_bytree=0.2, gamma=0.0, learning_rate=0.005, max_depth=6,
                        min_child_weight=2, n_estimators=7200, reg_alpha=0.9,
                        reg_lambda=0.6, subsample=0.3, seed=40, silent=1)
In [129]:
from sklearn.metrics import mean_squared_error
from math import sqrt
# fit XGBoost on the log-transformed target and report the validation RMSE
lg_y_train = np.log(y_train)
lg_y_val = np.log(y_val)
regr.fit(X_train, lg_y_train)
y_val_pred = regr.predict(X_val)
score = mean_squared_error(lg_y_val, y_val_pred)
#print r2_score(y_val, y_val_pred)
print sqrt(score)
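In [ ]:
# sketch (not how the model above was tuned): let XGBoost choose the number of
# boosting rounds via early stopping on the validation split; depending on the
# xgboost version, early_stopping_rounds may need to go to the constructor instead
regr_es = xgb.XGBRegressor(colsample_bytree=0.2, learning_rate=0.005, max_depth=6,
                           min_child_weight=2, n_estimators=10000, subsample=0.3,
                           seed=40, silent=1)
regr_es.fit(X_train, lg_y_train, eval_set=[(X_val, lg_y_val)],
            eval_metric='rmse', early_stopping_rounds=200, verbose=False)
print sqrt(mean_squared_error(lg_y_val, regr_es.predict(X_val)))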
In [171]:
# Lasso regression on the log-transformed target
from sklearn.linear_model import Lasso
#alpha = 0.00099
alpha_1 = 0.00029
las = Lasso(alpha=alpha_1, max_iter=50000)
las.fit(X_train, lg_y_train)
y_val_pred = las.predict(X_val)
score = mean_squared_error(lg_y_val, y_val_pred)
print sqrt(score)
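In [ ]:
# quick sketch: count how many features the Lasso actually keeps (non-zero
# coefficients), since L1 regularization also performs feature selection
print "non-zero coefficients: %d of %d" % (np.sum(las.coef_ != 0), len(las.coef_))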
In [176]:
import matplotlib.pyplot as plt
# compare Lasso and XGBoost predictions on the training set (log-price scale)
plt.plot(las.predict(X_train), regr.predict(X_train), 'o')
plt.xlim(11, 13.5)
plt.ylim(11, 13.5)
Out[176]:
In [185]:
# weighted blend of Lasso and XGBoost predictions, evaluated on the validation set
pred_w = 0.4 * las.predict(X_val) + 0.6 * regr.predict(X_val)
cb_score = mean_squared_error(lg_y_val, pred_w)
print sqrt(cb_score)
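In [ ]:
# small sketch: scan the blend weight on the validation set instead of fixing 0.4/0.6
las_val = las.predict(X_val)
xgb_val = regr.predict(X_val)
for w in np.arange(0.0, 1.01, 0.1):
    blend = w * las_val + (1 - w) * xgb_val
    print "w = %.1f, val RMSE = %.5f" % (w, sqrt(mean_squared_error(lg_y_val, blend)))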
In [115]:
# baseline random forest on the raw target; score() reports R^2 on the validation set
rf_f = RandomForestRegressor(n_estimators=100)
forest_f = rf_f.fit(X_train, y_train)
res_score = rf_f.score(X_val, y_val)
print res_score
In [227]:
pred = rf_f.predict(X_test_a)
print pred.shape
In [130]:
# XGBoost: predict on the test set and back-transform from log prices
pred = np.exp(regr.predict(X_test_a))
In [188]:
submission = pd.read_csv('sample_submission.csv')
submission.iloc[:,1] = pred
submission.to_csv('submission.csv',index = None)
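In [ ]:
# sanity-check sketch (assumes the standard Kaggle format with an Id column):
# the written file should have 1459 rows whose Ids match the test set
sub_check = pd.read_csv('submission.csv')
print sub_check.shape
print (sub_check['Id'].values == testset['Id'].values).all()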
In [172]:
pred = np.exp(las.predict(X_test_a))  # Lasso: back-transform from log prices
In [187]:
# final blend: weighted average of the Lasso and XGBoost log-price predictions, back-transformed
pred_s = 0.4 * las.predict(X_test_a) + 0.6 * regr.predict(X_test_a)
pred = np.exp(pred_s)
In [ ]: