In [1]:
import numpy as np
import pandas as pd
#load the files
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
data = pd.concat([train, test])
#size of training dataset
train_samples = train.shape[0]
test_samples = test.shape[0]
In [2]:
# remove the Id feature
data.drop(['Id'], axis=1, inplace=True)
In [3]:
#data.describe()
Here we have to find the 'NaN' values and fill them with the mean of each column. It is probably not the best way to impute the missing values, but at least it roughly preserves the distribution: the mean of each column stays the same, while the standard deviation shrinks a little because the filled values sit exactly at the mean.
In [4]:
datanum = data.select_dtypes([np.number])
# fill missing values with each column's mean (pandas skips NaN when computing the mean)
datanum = datanum.fillna(datanum.mean())
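As a quick sanity check (a sketch reusing 'data' and the imputed 'datanum' from above), the column means should be unchanged while the standard deviations can only shrink:
# sketch: compare the moments before and after mean imputation
datanum_raw = data.select_dtypes([np.number])             # still contains the NaNs
print(np.allclose(datanum_raw.mean(), datanum.mean()))    # means preserved
print((datanum.std() <= datanum_raw.std() + 1e-9).all())  # std never increases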
The 'SalePrice' distribution is skewed. We can stabilize it by applying a logarithm, since all the values are positive.
In [5]:
import matplotlib.pyplot as plt
%matplotlib inline
# Transforming to non-skewed SalePrice
data.SalePrice = data.SalePrice.apply(np.log)
data.SalePrice.hist(bins=50)
Out[5]:
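As a rough check (a sketch; it assumes the cells are run in order, since the name 'train' is reused for a function later on), the skewness of the log prices should be much closer to zero than that of the raw prices:
# sketch: skewness of the raw prices vs. the log-transformed prices (train rows only)
raw_skew = train['SalePrice'].skew()
log_skew = data['SalePrice'].iloc[:train_samples].skew()
print("skew raw: {:.2f}  skew log: {:.2f}".format(raw_skew, log_skew))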
There are a lot of features in this dataset, so we are going to keep only the features most correlated with 'SalePrice'. The query below shows that the first ten features have a good correlation. We also define a helper function so the number of correlated features to retrieve can be customized.
In [6]:
# Correlation features
datanum.corr()['SalePrice'].drop('SalePrice').sort_values(ascending=False).head(10)
Out[6]:
In [7]:
def getDataWithHighCorrFeatures(data, numberFeatures=10):
    high_corr_feat_names = data.corr()['SalePrice'].drop('SalePrice').sort_values(ascending=False).head(numberFeatures).axes[0].tolist()
    return data[high_corr_feat_names]
In [8]:
from sklearn.model_selection import KFold
# random_state only takes effect when shuffle=True
kf = KFold(n_splits=10, shuffle=True, random_state=13)
print(kf)
We implement two methods to plot a PCA projection in case we need to visualize the information in a 2D graph. For that we reduce all the features to a single feature (component).
In [9]:
#plotting PCA
from sklearn.decomposition import PCA
def getX_PCA(X):
    pca = PCA(n_components=1)
    return pca.fit(X).transform(X)

def plotPCA(X, y):
    pca = PCA(n_components=1)
    X_r = pca.fit(X).transform(X)
    plt.plot(X_r, y, 'x')
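A quick check of the projection (a sketch using the numeric frame 'datanum' from above, without the target column):
# sketch: project the numeric features onto one component and inspect the variance it keeps
Xnum = datanum.drop('SalePrice', axis=1).values
pca_check = PCA(n_components=1)
Xnum_1d = pca_check.fit_transform(Xnum)
print(Xnum_1d.shape)                        # (n_samples, 1)
print(pca_check.explained_variance_ratio_)  # share of variance captured by the single component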
By removing 1% of the training samples as anomalies we can get a more stable prediction, although that is not guaranteed. It will probably help; the anomalous rows are excluded only from fitting, while the error is still measured on the full training set.
In [10]:
from sklearn.covariance import EllipticEnvelope
def removeAnomalies(X_train, y_train, verbose=False):
    # fit the model
    ee = EllipticEnvelope(contamination=0.01,
                          assume_centered=True,
                          random_state=13)
    ee.fit(X_train)
    pred = ee.predict(X_train)
    X_anom = X_train[pred != 1]
    y_anom = y_train[pred != 1]
    X_no_anom = X_train[pred == 1]
    y_no_anom = y_train[pred == 1]
    if verbose:
        print("Number samples no anomalies: {}".format(X_no_anom.shape[0]))
        #after removing anomalies
        #plt.scatter(getX_PCA(X_no_anom), y_no_anom)
        #plt.scatter(getX_PCA(X_anom), y_anom)
    return X_no_anom, y_no_anom

def idxNotAnomalies(X):
    ee = EllipticEnvelope(contamination=0.01,
                          assume_centered=True,
                          random_state=13)
    ee.fit(X)
    pred = ee.predict(X)
    return [index[0] for index, x in np.ndenumerate(pred) if x == 1]
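A minimal usage sketch, reusing 'datanum' and keeping only the training rows, just to see how many samples the 1% contamination flags:
# sketch: flag the most anomalous training rows on the numeric features
Xcheck = datanum.drop('SalePrice', axis=1).values[:train_samples]
keep_idx = idxNotAnomalies(Xcheck)
print("kept {} of {} rows".format(len(keep_idx), Xcheck.shape[0]))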
In [11]:
# Regression model (XGBoost; LinearRegression kept as a commented alternative)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def train(X_train, y_train, verbose=False):
    # note: this function shadows the 'train' DataFrame loaded at the top, which is no longer needed
    #lr = LinearRegression()
    import xgboost as xgb
    lr = xgb.XGBRegressor(max_depth=5,
                          n_estimators=250,
                          min_child_weight=7,
                          n_jobs=4)
    batch = 0
    for train_idx, val_idx in kf.split(X_train, y_train):
        X_t, X_v = X_train[train_idx], X_train[val_idx]
        y_t, y_v = y_train[train_idx], y_train[val_idx]
        #training
        lr.fit(X_t, y_t)
        #calculate costs
        t_error = mean_squared_error(y_t, lr.predict(X_t))**0.5
        v_error = mean_squared_error(y_v, lr.predict(X_v))**0.5
        if verbose:
            print("{}) Training error: {:.2f} Validation error: {:.2f} Score: {:.2f}"
                  .format(batch, t_error, v_error, lr.score(X_v, y_v)))
        batch += 1
    return lr

def metrics(model, X, y, verbose=False):
    #Scores
    if verbose:
        print("Training score: {:.4f}".format(model.score(X, y)))
    #RMSLE (prices are already logs, so the RMSE of the logs is the RMSLE)
    #print(np.count_nonzero(~np.isfinite(model.predict(X))))
    rmsle = mean_squared_error(y, model.predict(X))**0.5
    if verbose:
        print("RMSLE: {:.4f}".format(rmsle))
    # Plotting the results
    plt.scatter(model.predict(X), y)
    return rmsle, model.score(X, y)
In [12]:
# Get polynomial features
from sklearn.preprocessing import PolynomialFeatures
def getPolynomial(X_train, X_no_anom, X_test):
    poly = PolynomialFeatures(degree=2)
    # fit_transform each array separately; the generated columns depend only on the number of input features
    return poly.fit_transform(X_train), poly.fit_transform(X_no_anom), poly.fit_transform(X_test)
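For reference, PolynomialFeatures with degree=2 on n input columns produces (n+2 choose 2) output columns (bias, linear terms, squares and pairwise products), so 15 numeric features expand to 136 columns. A quick check (sketch):
# sketch: count of degree-2 polynomial columns generated from 15 input features
from scipy.special import comb
print(comb(15 + 2, 2, exact=True))  # 136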
In [13]:
def getKeyWithMinError(X_train, X_no_anom, y_train, y_no_anom, verbose=False):
    rmsles = {}
    for f in range(1, X_train.shape[1]):
        model = train(X_no_anom[:,:f], y_no_anom, verbose=False)
        rmsles[f] = metrics(model, X_train[:,:f], y_train, verbose=False)[0]  # keep only the RMSLE
    min_error_key = min(rmsles, key=rmsles.get)
    if verbose:
        print("Min error (k={}):{}".format(min_error_key, rmsles[min_error_key]))
    #model = train(X_train_pol[:,:min_error_key], y_train)
    #metrics(model, X_train_orig_pol[:,:min_error_key], y_train_orig)
    #pd.Series(rmsles).plot()
    return min_error_key
In [14]:
import warnings
warnings.filterwarnings('ignore')
errors = []
for f in range(1, 17):
    #print("====Corr feat: {}====".format(f))
    datanum_high_corr = getDataWithHighCorrFeatures(datanum, f)
    y = np.array(data['SalePrice'])
    X = np.array(datanum_high_corr)
    #split by idx
    idx = train_samples
    X_train, X_test = X[:idx], X[idx:]
    y_train, y_test = y[:idx], y[idx:]
    #print("Shape X train: {}".format(X_train.shape))
    X_no_anom, y_no_anom = removeAnomalies(X_train, y_train)
    #print("Shape X train (no anom): {}".format(X_no_anom.shape))
    X_train, X_no_anom, X_test = getPolynomial(X_train, X_no_anom, X_test)
    #print("Shape X train (poly): {}".format(X_no_anom.shape))
    key = 1000 #getKeyWithMinError(X_train, X_no_anom, y_train, y_no_anom)
    model = train(X_no_anom[:,:key], y_no_anom)
    error, score = metrics(model, X_train[:,:key], y_train)
    print("f:{} err:{:.3f} score:{:.3f}".format(f, error, score))
    errors.append(error)
# show graph
pd.Series(errors).plot()
Out[14]:
In [15]:
features = data.select_dtypes(['object']).axes[1].tolist()
features.append('SalePrice')
datacat = pd.get_dummies(data[features])
# fill the remaining NaNs ('SalePrice' in the test rows) with the column mean
datacat = datacat.fillna(datacat.mean())
datacat.corr()['SalePrice'].drop('SalePrice').sort_values(ascending=False).head(10)
Out[15]:
In [16]:
features_order_by_corr = datacat.corr()['SalePrice'].drop('SalePrice').sort_values(ascending=False).axes[0].tolist()
In [17]:
import warnings
warnings.filterwarnings('ignore')
datanum_high_corr = getDataWithHighCorrFeatures(datanum, 15)
Xn = np.array(datanum_high_corr)
#choosing the number of categorical
num_cat = 10
Xc = np.array(datacat[features_order_by_corr[:num_cat]])
y = np.array(data['SalePrice'])
poly = PolynomialFeatures(degree=2)
Xpn = poly.fit_transform(Xn)
X = np.concatenate([Xpn, Xc], axis=1)
no_anom_idx = idxNotAnomalies(Xn[:train_samples])  # computed only from the numeric features of the training rows
print(Xpn.shape)
print(Xc.shape)
print(X.shape)
In [18]:
#split by idx
idx = train_samples
X_train, X_test = X[:idx], X[idx:]
y_train, y_test = y[:idx], y[idx:]
print("Shape X train: {}".format(X_train.shape))
print("Shape X test: {}".format(X_test.shape))
print("Shape X train (no anom): {}".format(X_train[no_anom_idx].shape))
In [19]:
X_no_anom = X_train[no_anom_idx]
y_no_anom = y_train[no_anom_idx]
errors = {}
scores = {}
# note: 15^2 is bitwise XOR in Python (it equals 13), not 15**2
for f in range(15^2, X_train.shape[1]):
    modelt = train(X_no_anom[:,:f], y_no_anom, verbose=False)
    err, score = metrics(modelt, X_train[:,:f], y_train, verbose=False)
    if err > 1e-7:
        errors[f] = err
        scores[f] = score
    else:
        break
min_error_key = min(errors, key=errors.get)
max_score_key = max(scores, key=scores.get)
print("Min error: {:.3f}".format(errors[min_error_key]))
print("Max score: {:.3f}".format(scores[max_score_key]))
In [20]:
modelt = train(X_no_anom[:,:235], y_no_anom, verbose=False)
err, score = metrics(modelt, X_train[:,:235], y_train, verbose=False)
print(err)
print(score)
In [21]:
pd.Series(errors).plot()
Out[21]:
In [22]:
pd.Series(scores).plot()
Out[22]:
In [23]:
print(min_error_key)
In [24]:
import os
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
min_error_key = 235
def RMSE(y_true, y_pred):
    rmse = mean_squared_error(y_true, y_pred)**0.5
    return rmse

modelt = train(X_no_anom[:,:min_error_key], y_no_anom, verbose=False)
err, score = metrics(modelt, X_train[:,:min_error_key], y_train, verbose=False)
print("Err: {:.3f} | R2: {:.3f}".format(err, score))
# greater_is_better=True keeps the printed values positive; for model selection an error metric would normally use greater_is_better=False
scores = cross_val_score(modelt, X_train[:,:min_error_key], y_train,
                         scoring=make_scorer(RMSE, greater_is_better=True), cv=10)
print("Scores: {}".format(scores))
print("Score (mean): {:.3f}".format(scores.mean()))
In [25]:
predict = modelt.predict(X_test[:,:min_error_key])
#predictions are logs, return to the value
predict = np.exp(predict)
file = "Id,SalePrice" + os.linesep
startId = 1461
for i in range(len(X_test)):
    file += "{},{}".format(startId, int(predict[i])) + os.linesep
    startId += 1
In [92]:
# Save to file
with open('attempt.txt', 'w') as f:
    f.write(file)
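The same submission could also be built with pandas (a sketch; 'submission.csv' is a hypothetical file name and the Ids are taken from the 'test' frame loaded at the top):
# sketch: equivalent submission file written with a DataFrame
submission = pd.DataFrame({'Id': test['Id'].values, 'SalePrice': predict})
submission.to_csv('submission.csv', index=False)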
In [ ]:
# Using XGBRegressor?
#lr = XGBRegressor(max_depth=5, n_estimators=250, min_child_weight=10)