In [32]:
import numpy as np
import plotly
import plotly.graph_objs as go
plotly.offline.init_notebook_mode()



In [44]:
# Load data from file
data = np.loadtxt('chatt_houses.csv',dtype='float',delimiter=',')
# Remove outliers
data=data[data[:,0]<8e3,:] #  (keep if sqft < 8k)
data=data[data[:,1]<8e5,:] #  (keep if price < $800k)
X = data[:,:-1]; y = data[:,-1:]
print "Number of data points:", X.shape[0]
print "Number of features:",X.shape[1]
print "Sample data:"
print data[:5]


Number of data points: 499
Number of features: 1
Sample data:
[[   1980.   45000.]
 [   1686.   30660.]
 [   1097.   20000.]
 [   2350.  220000.]
 [   2276.  404900.]]

In [45]:
dataplot = go.Scatter(x = X[:,0], y = y[:,0], mode = 'markers', name = 'Training Data')
plotly.offline.iplot([dataplot])


Drawing...

In [46]:
def getMuSigma(X):
    """ Calculate mean and stdev of each feature"""
    m = len(X.T)
    mu, sigma = np.zeros((2,m))
    for i in range(m):
        mu[i] = np.mean(X[:,i])
        sigma[i] = np.std(X[:,i])
    return mu, sigma

def normalize(X, mu, sigma):
    return np.divide(X - mu,sigma)

def unNormalize(X, mu, sigma):
    return (X * sigma) + mu

In [47]:
def gradientDescent(X, y, theta, alpha, iterations):
    m = len(y)
    for i in xrange(iterations):
        theta = theta - alpha/m * X.T.dot(X.dot(theta) - y)
    return theta

In [48]:
m = len(y)
mu_X,sigma_X = getMuSigma(X)
mu_y,sigma_y = getMuSigma(y)
# Add a column of ones to X
X_ = np.concatenate((np.ones((m,1)),normalize(X,mu_X,sigma_X)),axis=1)
y_ = normalize(y,mu_y,sigma_y)
# Gradient descent settings
iterations = 15000
alpha = 0.01
theta_i = np.zeros((2,1))
# Run gradient descent
theta = unNormalize(gradientDescent(X_, y_, theta_i, alpha, iterations),mu_y,sigma_y)
print theta


[[ 148638.59318637]
 [ 220777.95689813]]

In [49]:
dataplot = go.Scatter(x = X[:,0], y = y[:,0], mode = 'markers', name = 'Training data')
theta_x = np.array([[1,min(X)],[1,max(X)]])
theta_y = theta_x.dot(theta)
print theta_x
print theta_y
thetaplot = go.Scatter(x = X[:,0], y = X_.dot(theta)[:,0],
                       mode = 'lines', name = 'Linear Regression')
plotly.offline.iplot([dataplot,thetaplot])


[[  1.00000000e+00   4.64000000e+02]
 [  1.00000000e+00   7.42000000e+03]]
[[  1.02589611e+08]
 [  1.63832108e+09]]
Drawing...

In [50]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [51]:
df = pd.DataFrame()
df['SqFt'] = X[:,0]
df['Price'] = y[:,0]
df.head()


Out[51]:
SqFt Price
0 1980 45000
1 1686 30660
2 1097 20000
3 2350 220000
4 2276 404900

In [52]:
sns.regplot(x = 'SqFt', y = 'Price',data=df,robust=True);



In [53]:
sns.jointplot('SqFt', 'Price', data=df, kind='reg')


Out[53]:
<seaborn.axisgrid.JointGrid at 0x114524750>

In [ ]:


In [ ]: