In [1]:
# LB score log
# 0.48500 <- eliminate outliers with |z| > 3, 12-300-30-10-1 architecture
# 0.52512 <- Epoch 11/200 16s - loss: 0.4993 - val_loss: 0.3509
# 0.38798 <- 12-300-30-5-1
# 0.41785 <- 9-col grp1, loss: 0.3123 - acc: 0.0000e+00 - val_loss: 0.2602
# 0.43247 <- 3-col
# 0.53085 <- Adadelta
# 0.50647 <- decay
# 0.48978 <- RMSLE loss
#
# Tasks:
# - feature extraction
# - feature selection
# - optimizer selection, lr, decay
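The optimizer/learning-rate/decay comparison listed in the tasks above could be scripted along these lines. This is a minimal sketch only, assuming the X_train/y_train and baseline_model defined in the cells further down; the candidate settings are placeholders, not the ones behind the scores logged here.
from keras.optimizers import SGD, RMSprop, Adadelta

# Candidate optimizer settings to compare (illustrative values only).
candidates = {
    'rmsprop_decay': RMSprop(lr=0.001, decay=1e-4),
    'adadelta': Adadelta(),
    'sgd_momentum': SGD(lr=0.01, momentum=0.9, decay=1e-4),
}
results = {}
for name, opt in candidates.items():
    m = baseline_model()  # rebuild so each run starts from fresh weights
    m.compile(loss='mean_squared_logarithmic_error', optimizer=opt)  # re-compile with the candidate
    h = m.fit(X_train, y_train, epochs=20, batch_size=180,
              validation_split=0.3, verbose=0)
    results[name] = min(h.history['val_loss'])
print(results)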
In [1]:
### importing libraries
%matplotlib inline
from keras.layers.convolutional import Convolution2D, MaxPooling2D, Convolution1D
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.models import Sequential
from keras.optimizers import SGD,RMSprop
from keras.datasets import mnist
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping
import numpy as np
import pandas as pd
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import matplotlib as mpl
#mpl.use('Agg')
import matplotlib.pyplot as plt
import time
timestr = time.strftime("%Y%m%d-%H%M%S")
print(timestr)
In [64]:
df_train = pd.read_csv("../input/train.csv", parse_dates=['timestamp'])
df_test = pd.read_csv("../input/test.csv", parse_dates=['timestamp'])
df_macro = pd.read_csv("../input/macro.csv", parse_dates=['timestamp'])
df_train.head()
Out[64]:
In [65]:
print( df_train.shape)
In [66]:
# This section added: drop extreme outliers (more than 3 standard deviations from the column mean)
print(df_train.life_sq.max())
gr1 = ["full_sq", "life_sq", "floor", "max_floor", "material", "build_year",
       "num_room", "kitch_sq", "state", "material", "radiation_km", "green_zone_km"]
# note: "material" is listed twice, so the model below sees 12 input columns
clist = gr1  # ['life_sq','floor']
for cname in clist:
    df_train.drop(df_train[np.abs((df_train[cname] - df_train[cname].mean()) / df_train[cname].std()) > 3].index,
                  inplace=True)
print(df_train.shape)
In [67]:
df_train['price_doc'].hist(bins=50)
Out[67]:
In [68]:
y_train = df_train['price_doc'].values
id_test = df_test['id']
df_train.drop(['id', 'price_doc'], axis=1, inplace=True)
df_test.drop(['id'], axis=1, inplace=True)
# Build df_all = (df_train+df_test).join(df_macro)
num_train = len(df_train)
df_all = pd.concat([df_train, df_test])
# join macro features by date: index df_macro by timestamp so the join keys match
df_all = df_all.join(df_macro.set_index('timestamp'), on='timestamp', rsuffix='_macro')
print(df_all.shape)
# Add month-year
month_year = (df_all.timestamp.dt.month + df_all.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
df_all['month_year_cnt'] = month_year.map(month_year_cnt_map)
# Add week-year count
week_year = (df_all.timestamp.dt.weekofyear + df_all.timestamp.dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
df_all['week_year_cnt'] = week_year.map(week_year_cnt_map)
# Add month and day-of-week
df_all['month'] = df_all.timestamp.dt.month
df_all['dow'] = df_all.timestamp.dt.dayofweek
# Other feature engineering
#df_all['rel_floor'] = df_all['floor'] / df_all['max_floor'].astype(float)
#df_all['rel_kitch_sq'] = df_all['kitch_sq'] / df_all['full_sq'].astype(float)
# Remove timestamp column (may overfit the model in train)
df_all.drop(['timestamp'], axis=1, inplace=True)
In [69]:
factorize = lambda t: pd.factorize(t[1])[0]
df_obj = df_all.select_dtypes(include=['object'])
X_all = np.c_[
df_all.select_dtypes(exclude=['object']).values,
np.array(list(map(factorize, df_obj.iteritems()))).T
]
print(X_all.shape)
X_train = X_all[:num_train]
X_test = X_all[num_train:]
In [70]:
# Deal with categorical values
df_numeric = df_all.select_dtypes(exclude=['object'])
df_obj = df_all.select_dtypes(include=['object']).copy()
for c in df_obj:
    df_obj[c] = pd.factorize(df_obj[c])[0]
df_values = pd.concat([df_numeric, df_obj], axis=1)
df_values = df_values.fillna(df_values.mean())
df_values = df_values.dropna(axis="columns", how='all')
df_values.drop(['area_m', 'ID_metro'], axis=1, inplace=True)
# keep only the gr1 feature group defined above
df_values = df_values[gr1]
#df_values.mean()
df_values.shape
Out[70]:
In [71]:
# Convert to numpy values
X_all = df_values.values
print(X_all.shape)
X_train = X_all[:num_train]
X_test = X_all[num_train:]
df_columns = df_values.columns.tolist()
df_columns
Out[71]:
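StandardScaler and Pipeline are imported at the top but never used; since the network below feeds relu layers with raw, very differently scaled columns, standardizing the inputs would be a natural step. A minimal sketch, not part of the runs logged above:
# Optional: standardize features (fit on the training rows only, then apply to test).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)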
In [72]:
#X_all.tofile("x_all.csv",format="%s",sep=",")
In [73]:
#df_valuesclean=df_values.dropna(axis="columns", how='any')
#df_valuesclean.shape
In [74]:
#df_valuesclean.to_csv("x_allpd.csv")
In [75]:
# define base model
def baseline_model():
    model = Sequential()
    #model.add(Convolution1D(20, 5, border_mode='valid', input_shape=(244, 1)))
    model.add(Dense(300, input_dim=df_values.shape[1], activation='relu'))
    #model.add(Convolution1D(20, 5, strides=1, padding='valid', dilation_rate=1, activation='relu'))
    #model.add(Flatten())
    #model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(30, activation="relu"))
    model.add(Dropout(0.5))
    #model.add(Dense(30, activation="relu"))
    #model.add(Dropout(0.5))
    #model.add(BatchNormalization())
    model.add(Dense(10, activation="relu"))
    model.add(Dropout(0.25))
    model.add(Dense(1, activation="relu"))
    # Compile model
    #sgd = SGD(lr=0.01, momentum=0.0, decay=0.0, nesterov=False)
    model.compile(loss='mean_squared_logarithmic_error', optimizer=RMSprop(decay=0.0001))
    #Adadelta
    #sgd
    return model
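KerasRegressor, cross_val_score and KFold are also imported at the top without being used; if a cross-validated estimate of baseline_model were wanted instead of a single validation split, a sketch could look like this. The epoch count and the 'neg_mean_squared_log_error' scorer are assumptions (the scorer needs a scikit-learn version that provides it).
# Hedged sketch: 5-fold CV of the Keras model through the scikit-learn wrapper.
estimator = KerasRegressor(build_fn=baseline_model, epochs=30,
                           batch_size=180, verbose=0)
kfold = KFold(n_splits=5, shuffle=True, random_state=7)
scores = cross_val_score(estimator, X_train, y_train, cv=kfold,
                         scoring='neg_mean_squared_log_error')
print("CV MSLE: %.4f (+/- %.4f)" % (-scores.mean(), scores.std()))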
In [76]:
model=baseline_model()
model.summary()
In [77]:
#model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['accuracy'])
print ("^^^INFO: Fit Model^^^")
#X_train = X_train.reshape(X_train.shape[0],244,1)
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, verbose=2)
]
history = model.fit(X_train, y_train, epochs=200, batch_size=180,
                    validation_split=0.3, verbose=2, callbacks=callbacks)
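With patience=5 and no weight restoration, the weights left in the model are from five epochs past the best validation loss. If the best-epoch weights were wanted, the callbacks above could be extended with a ModelCheckpoint; a sketch only, not used for the scores logged here, with an arbitrary filename (saving to HDF5 needs h5py installed):
# Hedged sketch: also keep the best-val_loss weights on disk for reuse.
from keras.callbacks import ModelCheckpoint

callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, verbose=2),
    ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True),
]
# ...and after fitting with these callbacks:
# model.load_weights('best_model.h5')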
In [78]:
# list all data in history
'''
print(history.history.keys())
# summarize history for accuracy
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
# axes = plt.gca()
# axes.set_xlim([0,120])
# axes.set_ylim([90,100])
#plt.savefig('acc.png') # save the figure to file
plt.show()
#plt.close()
'''
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
#plt.savefig('loss.png')
#plt.show()
plt.ylim([0, min(history.history['val_loss']) * 1.5])
plt.show()
#plt.close()
In [79]:
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.ylim([0, 2])
plt.show()
In [ ]:
In [52]:
#X_test = X_test.reshape(X_test.shape[0],244,1)
In [80]:
y_test=model.predict(X_test)
In [81]:
y_test[:,0].shape
Out[81]:
In [82]:
df_sub = pd.DataFrame({'id': id_test, 'price_doc': y_test[:,0]})
df_sub.to_csv('sub 2017-04-30_rmsle_12_elimOut.csv', index=False)
# old LB score 0.47631
# Epoch 25/25
# 8s - loss: 0.3268 - acc: 0.0000e+00 - val_loss: 0.3183 - val_acc: 0.0000e+00, now LB 0.48978
# 0.50647 with decay
# 0.5308 Adadelta
# Epoch 173/205
# 0s - loss: 0.3716 - acc: 0.0000e+00 - val_loss: 0.3339 - val_acc: 0.0000e+00
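The LB scores noted throughout are RMSLE, while the loss/val_loss values above are MSLE, so the square root of val_loss is the comparable figure (sqrt(0.3183) ≈ 0.56 on the 30% validation split versus 0.48978 on the LB). A small helper for checking this locally; a sketch that mirrors how validation_split takes the tail of the training data:
# Hedged helper: RMSLE, the LB metric; assumes non-negative predictions
# (guaranteed here by the relu output layer).
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))

split_at = int(len(X_train) * (1 - 0.3))  # validation_split=0.3 takes the last 30% of rows
val_pred = model.predict(X_train[split_at:])[:, 0]
print(rmsle(y_train[split_at:], val_pred))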
In [ ]: