In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
def read_housing_data(source='train'):
  dataset = load_data(source)
  features, labels = featurize(dataset, source)
  return features, labels

def load_data(source):
  if source == 'train':
    return pd.read_csv('../input/train.csv.gz')
  else:
    return pd.read_csv('../input/test.csv.gz')

def featurize(dataset, source='train'):
  print('the source is ' + source)
  if source == 'train':
    labels = dataset.iloc[:, -1]  # SalePrice is the last column
  else:
    labels = dataset.iloc[:, 0]   # the test set has no labels; keep the Id column instead

  notnullcount = dataset.count()
  # Drop columns with more than 70% missing values
  # (i.e. fewer than 30% non-null entries)
  removablecolumns = notnullcount[notnullcount < 0.3 * dataset.shape[0]].index.tolist()
  dataset = dataset.drop(removablecolumns, axis=1)
  print(dataset.shape)
  
  if source == 'train':
    all_data = dataset.iloc[:, 1:-1]  # drop the Id column and the SalePrice label
  else:
    all_data = dataset.iloc[:, 1:]    # drop the Id column only

  for col in all_data.columns:
      if all_data[col].dtype == np.dtype('O'):
          # Categorical column: fill missing values with the most frequent level
          all_data[col] = all_data[col].fillna(all_data[col].value_counts().index[0])
      else:
          # Numeric column: fill missing values with the mean
          all_data[col] = all_data[col].fillna(all_data[col].mean())
  # Check if any value is null
  print(all_data.isnull().any().value_counts())

  # Keep only the numeric columns as model features
  numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
  features = all_data.select_dtypes(include=numerics)

  return features, labels

def feature_normalize(dataset):
  # Standardize each column to zero mean and unit variance
  mu = np.mean(dataset, axis=0)
  sigma = np.std(dataset, axis=0)
  return (dataset - mu) / sigma

def append_bias_reshape(features, labels):
  n_training_samples = features.shape[0]
  n_dim = features.shape[1]
  # Prepend a column of ones as the bias term
  f = np.reshape(np.c_[np.ones(n_training_samples), features], [n_training_samples, n_dim + 1])
  # np.asarray avoids the pandas FutureWarning raised by reshaping a Series
  l = np.asarray(labels).reshape([n_training_samples, 1])
  return f, l
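
One caveat: feature_normalize computes mu and sigma from whatever frame it is given, so the held-out test file (cell In [9]) ends up standardized with its own statistics rather than the training set's. A minimal sketch of a variant that returns the fitted statistics so they can be reused; the names feature_normalize_fit and apply_normalization are hypothetical, not part of the code above:

def feature_normalize_fit(dataset):
  # Fit the standardization on the training data and return the statistics
  mu = np.mean(dataset, axis=0)
  sigma = np.std(dataset, axis=0)
  return (dataset - mu) / sigma, mu, sigma

def apply_normalization(dataset, mu, sigma):
  # Reuse training-set statistics on held-out data
  return (dataset - mu) / sigma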

In [3]:
features, labels = read_housing_data(source='train')
normalized_features = feature_normalize(features)
f, l = append_bias_reshape(normalized_features, labels)

# Random ~80/20 train/test split
rnd_indices = np.random.rand(len(f)) < 0.80

train_x = f[rnd_indices]
train_y = l[rnd_indices]
test_x = f[~rnd_indices]
test_y = l[~rnd_indices]


the source is train
(1460, 77)
False    75
dtype: int64

In [4]:
n_dim = train_x.shape[1]
print(n_dim)
learning_rate = 0.01
training_epochs = 500

X = tf.placeholder(tf.float32, [None, n_dim])
Y = tf.placeholder(tf.float32, [None, 1])
W = tf.Variable(tf.ones([n_dim, 1]))

init = tf.global_variables_initializer()


37
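
The 37 columns are the 36 numeric features that survive the missing-value filtering in featurize, plus the bias column prepended by append_bias_reshape.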

In [5]:
y_ = tf.matmul(X, W)
cost = tf.reduce_mean(tf.square(y_ - Y))
training_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
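
Because this model is ordinary linear least squares, the gradient-descent fit can be sanity-checked against the closed-form solution. A minimal sketch using numpy's solver on the same training arrays (purely illustrative, outside the TensorFlow graph):

# Closed-form fit: w_exact minimizes ||train_x . w - train_y||^2
w_exact = np.linalg.lstsq(train_x, train_y)[0]
baseline_cost = np.mean(np.square(train_x.dot(w_exact) - train_y))
print(baseline_cost)  # the gradient-descent cost should approach this floor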

In [6]:
sess = tf.Session()
sess.run(init)
cost_history = np.empty(shape=[0], dtype=float)  # start empty, not with a garbage value
for epoch in range(training_epochs):
    sess.run(training_step, feed_dict={X: train_x, Y: train_y})
    cost_history = np.append(cost_history, sess.run(cost, feed_dict={X: train_x, Y: train_y}))

# Plot once, after training, rather than on every iteration
plt.plot(range(len(cost_history)), cost_history)
plt.axis([0, training_epochs, 0, np.max(cost_history)])
plt.show()
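
The loop above does full-batch gradient descent and records only the training cost. If held-out behavior is of interest, the same pattern extends to the test split; a minimal standalone variant (note it re-initializes W and re-trains the model):

train_hist, test_hist = [], []
sess.run(init)  # reset W before re-training
for epoch in range(training_epochs):
    sess.run(training_step, feed_dict={X: train_x, Y: train_y})
    train_hist.append(sess.run(cost, feed_dict={X: train_x, Y: train_y}))
    test_hist.append(sess.run(cost, feed_dict={X: test_x, Y: test_y}))
plt.plot(train_hist, label='train')
plt.plot(test_hist, label='test')
plt.legend()
plt.show()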



In [7]:
pred_y = sess.run(y_, feed_dict={X: test_x})
mse = tf.reduce_mean(tf.square(pred_y - test_y))
print('MSE: %.4f' % sess.run(mse))


MSE: 775896411.1744
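
For scale, the square root puts this back in the units of SalePrice: roughly $27.9k of typical error on this split.

rmse = np.sqrt(np.mean(np.square(pred_y - test_y)))
print('RMSE: %.2f' % rmse)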

In [8]:
fig, ax = plt.subplots()
ax.scatter(test_y, pred_y)
ax.plot([test_y.min(), test_y.max()], [test_y.min(), test_y.max()], 'k--', lw=3)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()


In [9]:
x_validation, x_ids = read_housing_data(source='test')
normalized_features = feature_normalize(x_validation)
# append_bias_reshape also turns the ids into an (n, 1) column
x_validation, x_ids = append_bias_reshape(normalized_features, x_ids)


the source is test
(1459, 76)
False    75
dtype: int64
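
Note that feature_normalize here standardizes the test features with the test set's own mean and standard deviation. For predictions on the same scale the model was trained on, the training-set mu and sigma should be reused instead, as in the feature_normalize_fit / apply_normalization sketch after cell In [2].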

In [10]:
# Sanity check: the validation matrix must have the same width as train_x
#x_validation.shape
train_x.shape


Out[10]:
(1196, 37)

In [11]:
predictions = sess.run(y_, feed_dict={X: x_validation})
predictions.shape
x_ids = np.reshape(x_ids,[x_validation.shape[0],1])

In [12]:
# Build the submission with the header Kaggle expects (Id, SalePrice)
solution = pd.DataFrame({'Id': x_ids.ravel().astype(int),
                         'SalePrice': predictions.ravel()},
                        columns=['Id', 'SalePrice'])
solution.to_csv("submission2.csv", index=False)

In [ ]: