In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
def read_housing_data(source='train'):
    dataset = load_data(source)
    features, labels = featurize(dataset, source)
    return features, labels

def load_data(source):
    if source == 'train':
        return pd.read_csv('../input/train.csv.gz')
    else:
        return pd.read_csv('../input/test.csv.gz')

def featurize(dataset, source='train'):
    print('the source is ' + source)
    if source == 'train':
        labels = dataset.iloc[:, -1]   # the label (sale price) is the last column
    else:
        labels = dataset.iloc[:, 0]    # for the test set, keep the Id column
    notnullcount = dataset.count()
    # Drop columns with fewer than 30% non-null values (i.e. more than 70% missing)
    removablecolumns = notnullcount[notnullcount < 0.3 * dataset.shape[0]].index.tolist()
    dataset = dataset.drop(removablecolumns, axis=1)
    print(dataset.shape)
    if source == 'train':
        all_data = dataset.iloc[:, 1:-1]  # removing id and label
    else:
        all_data = dataset.iloc[:, 1:]    # removing id only
    print(all_data.shape)
    # Impute missing values: mode for categorical columns, mean for numeric ones
    for col in all_data.columns:
        if all_data[col].dtype == np.dtype('O'):
            all_data[col] = all_data[col].fillna(all_data[col].value_counts().index[0])
        else:
            all_data[col] = all_data[col].fillna(all_data[col].mean())
    # Check that no value is null
    print(all_data.isnull().any().value_counts())
    print(dataset.shape)
    # Keep only the numeric columns
    print(all_data.columns)
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    features = all_data.select_dtypes(include=numerics)
    print(features.shape)
    return features, labels

def feature_normalize(dataset):
    mu = np.mean(dataset, axis=0)
    sigma = np.std(dataset, axis=0)
    return (dataset - mu) / sigma

def append_bias_reshape(features, labels):
    n_training_samples = features.shape[0]
    n_dim = features.shape[1]
    # Prepend a column of ones as the bias term
    f = np.reshape(np.c_[np.ones(n_training_samples), features], [n_training_samples, n_dim + 1])
    l = np.reshape(labels, [n_training_samples, 1])
    return f, l
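# A hedged addition, not part of the original pipeline: if any selected numeric column is
# constant, its std is 0 and feature_normalize divides by zero (NaN/inf). A minimal guard:
def feature_normalize_safe(dataset):
    mu = np.mean(dataset, axis=0)
    sigma = np.std(dataset, axis=0)
    sigma = np.where(sigma == 0, 1, sigma)  # leave constant columns centred but unscaled
    return (dataset - mu) / sigma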
In [3]:
features,labels = read_housing_data(source='train')
normalized_features = feature_normalize(features)
f, l = append_bias_reshape(normalized_features,labels)
rnd_indices = np.random.rand(len(f)) < 0.80
train_x = f[rnd_indices]
train_y = l[rnd_indices]
test_x = f[~rnd_indices]
test_y = l[~rnd_indices]
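# Optional check (not in the original): the boolean mask gives only an approximate 80/20
# split and changes on every run; seeding NumPy beforehand (e.g. np.random.seed(42)) makes
# it reproducible. Report the realised split sizes:
print(train_x.shape[0], test_x.shape[0])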
In [4]:
n_dim = train_x.shape[1]
print(n_dim)
learning_rate = 0.01
training_epochs = 250
X = tf.placeholder(tf.float32, [None, n_dim])
Y = tf.placeholder(tf.float32, [None, 1])
W = tf.Variable(tf.ones([n_dim, 1]))
init = tf.global_variables_initializer()
In [5]:
y_ = tf.matmul(X, W)
cost = tf.reduce_mean(tf.square(y_ - Y))
training_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
In [6]:
sess = tf.Session()
sess.run(init)
cost_history = np.empty(shape=[0], dtype=float)  # start empty; shape=[1] would add an uninitialized value to the plot
for epoch in range(training_epochs):
sess.run(training_step,feed_dict={X:train_x,Y:train_y})
cost_history = np.append(cost_history,sess.run(cost,feed_dict={X: train_x,Y: train_y}))
plt.plot(range(len(cost_history)),cost_history)
plt.axis([0,training_epochs,0,np.max(cost_history)])
plt.show()
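# Optional check (not in the original): the plot's y-axis is dominated by the early epochs,
# since W starts at ones against unscaled sale prices, so also print the final training cost.
print('final training cost: %.4f' % cost_history[-1])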
In [7]:
pred_y = sess.run(y_, feed_dict={X: test_x})
mse = tf.reduce_mean(tf.square(pred_y - test_y))
print('MSE: %.4f' % sess.run(mse))
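# Equivalent check without building a new TF op (a sketch): pred_y and test_y are plain
# NumPy arrays at this point, so the same MSE can be computed directly.
print('MSE (NumPy): %.4f' % np.mean((pred_y - test_y) ** 2))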
In [8]:
fig, ax = plt.subplots()
ax.scatter(test_y, pred_y)
ax.plot([test_y.min(), test_y.max()], [test_y.min(), test_y.max()], 'k--', lw=3)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()
In [9]:
x_validation,x_ids = read_housing_data(source='test')
normalized_features = feature_normalize(x_validation)
x_validation, ids = append_bias_reshape(normalized_features,x_ids)
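# Caveat (a sketch with hypothetical names, assuming the training `features` DataFrame from
# In [3] is still in scope): the test features above are standardised with their own mean/std.
# For consistency with the fitted weights one would normally reuse the training statistics:
raw_test_features, raw_test_ids = read_housing_data(source='test')
reuse_scaled = (raw_test_features - np.mean(features, axis=0)) / np.std(features, axis=0)
x_validation_alt, _ = append_bias_reshape(reuse_scaled, raw_test_ids)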
In [10]:
#x_validation.shape
train_x.shape
Out[10]:
In [15]:
predictions = sess.run(y_, feed_dict={X: x_validation})
print(predictions.shape)  # a bare expression mid-cell is not echoed by the notebook
x_ids = np.reshape(x_ids,[x_validation.shape[0],1])
In [34]:
#df = pd.concat([x_ids, predictions], axis=1)
df = np.column_stack((x_ids.astype(int), predictions))
# Name the columns so the CSV header reads Id,SalePrice rather than the default 0,1
solution = pd.DataFrame(df, columns=['Id', 'SalePrice'])
solution['Id'] = solution['Id'].astype(int)  # column_stack promotes the ids to float, so cast back
solution.to_csv("submission2.csv", index=False)
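# Quick sanity check (optional): read the submission back to confirm the header and first rows.
print(pd.read_csv("submission2.csv").head())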
In [ ]: