In [32]:
import numpy as np
import plotly
import plotly.graph_objs as go
plotly.offline.init_notebook_mode()
In [44]:
# Load the housing data. The outlier filters below treat column 0 as square
# footage and column 1 as sale price.
SQFT_MAX = 8e3    # keep rows with sqft < 8k
PRICE_MAX = 8e5   # keep rows with price < $800k

data = np.loadtxt('chatt_houses.csv', dtype='float', delimiter=',')

# Remove outliers: a row survives only if it passes both thresholds.
keep = (data[:, 0] < SQFT_MAX) & (data[:, 1] < PRICE_MAX)
data = data[keep, :]

# Every column except the last is a feature; the last column is the target.
# The `-1:` slice keeps y two-dimensional (one column) so it lines up with
# the matrix products used later in the notebook.
X = data[:, :-1]
y = data[:, -1:]

# Single-argument print(...) produces identical text on Python 2 and 3.
print("Number of data points: %d" % X.shape[0])
print("Number of features: %d" % X.shape[1])
print("Sample data:")
print(data[:5])
In [45]:
# Scatter trace of the training set: first feature column on the x axis,
# target column on the y axis.
dataplot = go.Scatter(
    x=X[:, 0],
    y=y[:, 0],
    mode='markers',
    name='Training Data',
)
traces = [dataplot]
plotly.offline.iplot(traces)
In [46]:
def getMuSigma(X):
    """Return the per-column mean and standard deviation of ``X``.

    ``X`` is indexed as ``X[:, j]``, so it must be two-dimensional with one
    column per feature.  The result is a pair ``(mu, sigma)`` of 1-D float
    arrays holding one entry per column; ``sigma`` uses ``np.std``'s default
    (population) definition.
    """
    n_features = len(X.T)
    columns = [X[:, j] for j in range(n_features)]
    mu = np.array([np.mean(col) for col in columns], dtype=float)
    sigma = np.array([np.std(col) for col in columns], dtype=float)
    return mu, sigma
def normalize(X, mu, sigma):
    """Standardize ``X`` as ``(X - mu) / sigma`` using numpy broadcasting.

    Parameters
    ----------
    X : array-like
        Values to standardize.
    mu : array-like or scalar
        Offset subtracted from ``X``.
    sigma : array-like or scalar
        Scale that the centered values are divided by.  Entries equal to
        zero are treated as 1.

    Returns
    -------
    The centered and scaled values, with the broadcast shape of the inputs.
    """
    sigma = np.asarray(sigma, dtype=float)
    # A constant feature has sigma == 0; dividing by it would yield nan/inf
    # (plus a runtime warning).  Dividing by 1 instead leaves the centered
    # values untouched, which are all zeros when mu is that feature's mean.
    safe_sigma = np.where(sigma == 0, 1.0, sigma)
    return np.divide(X - mu, safe_sigma)
def unNormalize(X, mu, sigma):
    """Map standardized values back to their original scale.

    Computes ``X * sigma + mu``, i.e. the inverse of ``(X - mu) / sigma``.
    """
    rescaled = X * sigma
    return rescaled + mu
In [47]:
def gradientDescent(X, y, theta, alpha, iterations):
    """Fit linear-regression coefficients by batch gradient descent.

    Every iteration applies the update
    ``theta <- theta - (alpha / m) * X.T.dot(X.dot(theta) - y)``
    with ``m = len(y)``.

    Parameters
    ----------
    X : ndarray
        Design matrix, one row per sample.
    y : ndarray
        Targets; must be shape-compatible with ``X.dot(theta)``.
    theta : ndarray
        Starting coefficients.  The array passed in is not modified; each
        step builds a new array.
    alpha : number
        Learning rate.
    iterations : int
        Number of update steps to perform.

    Returns
    -------
    ndarray
        The coefficients after ``iterations`` updates (``theta`` itself when
        ``iterations`` is 0).
    """
    # float() forces true division, so an integer alpha is not floored to 0
    # on Python 2.  The step size is loop-invariant, so compute it once.
    step = alpha / float(len(y))
    # range (not xrange) exists on both Python 2 and Python 3.
    for _ in range(iterations):
        theta = theta - step * X.T.dot(X.dot(theta) - y)
    return theta
In [48]:
m = len(y)
mu_X, sigma_X = getMuSigma(X)
mu_y, sigma_y = getMuSigma(y)

# Design matrix: a leading column of ones (intercept term) followed by the
# standardized features.  The target is standardized the same way.
X_ = np.concatenate((np.ones((m, 1)), normalize(X, mu_X, sigma_X)), axis=1)
y_ = normalize(y, mu_y, sigma_y)

# Gradient descent settings
iterations = 15000
alpha = 0.01
# One coefficient per column of X_ (intercept + features).
theta_i = np.zeros((X_.shape[1], 1))

# Run gradient descent in standardized space.
theta_norm = gradientDescent(X_, y_, theta_i, alpha, iterations)

# Convert the coefficients so that X_.dot(theta) predicts y on its original
# scale:  sigma_y * X_.dot(theta_norm) + mu_y.  Every coefficient is scaled
# by sigma_y, but the mu_y offset belongs to the intercept only, because the
# column of ones is the only constant term in X_.
theta = theta_norm * sigma_y
theta[0] += mu_y
print(theta)
In [49]:
dataplot = go.Scatter(x=X[:, 0], y=y[:, 0], mode='markers', name='Training data')

# Two endpoints are enough to draw the fitted line.  X[:, 0].min()/.max()
# return plain scalars (the builtin min/max on a 2-D array return 1-element
# arrays, which cannot be packed into a rectangular matrix).
x_ends = np.array([[X[:, 0].min()], [X[:, 0].max()]])

# theta is applied to [1, standardized feature] rows (as in X_.dot(theta)),
# so the endpoints are standardized with the training mean/stdev before the
# intercept column is prepended.
theta_x = np.concatenate(
    (np.ones((2, 1)), normalize(x_ends, mu_X[0], sigma_X[0])), axis=1)
theta_y = theta_x.dot(theta)
print(theta_x)
print(theta_y)

# The line is drawn against the raw (un-standardized) feature values.
thetaplot = go.Scatter(x=x_ends[:, 0], y=theta_y[:, 0],
                       mode='lines', name='Linear Regression')
plotly.offline.iplot([dataplot, thetaplot])
In [50]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
In [51]:
# Two-column frame for the seaborn plots: first feature column as 'SqFt',
# target column as 'Price'.  `columns=` pins the column order.
df = pd.DataFrame(
    {'SqFt': X[:, 0], 'Price': y[:, 0]},
    columns=['SqFt', 'Price'],
)
df.head()
Out[51]:
In [52]:
# Scatter plus a robust regression fit of Price on SqFt; the trailing
# semicolon suppresses the Axes repr in the cell output.
sns.regplot(
    data=df,
    x='SqFt',
    y='Price',
    robust=True,
);
In [53]:
# Joint distribution of SqFt and Price with a regression fit.  x and y are
# passed by keyword (matching the regplot call) because current seaborn
# releases accept them only as keyword arguments.
sns.jointplot(x='SqFt', y='Price', data=df, kind='reg')
Out[53]:
In [ ]:
In [ ]: