In [1]:
    
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import linear_model

try:
    # Current scikit-learn releases ship train_test_split in model_selection.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older releases only provide the (since removed) cross_validation module.
    from sklearn.cross_validation import train_test_split
plt.style.use('ggplot')
%matplotlib inline
    
In [2]:
    
# Load the home data CSV into a DataFrame; the path is relative to the
# directory the notebook runs from.
sales = pd.read_csv("../data/home_data.csv")
    
In [3]:
    
# Show the first ten rows as a quick sanity check of the import.
sales.head(10)
    
    Out[3]:
In [4]:
    
# List the dtype pandas inferred for each column.
sales.dtypes
    
    Out[4]:
The import into pandas looks correct, with two exceptions: the `date` column has not been parsed as a datetime, and `zipcode` is interpreted as a number rather than as a category. Let's fix both.
In [5]:
    
# Parse `date` into datetimes and store `zipcode` as a categorical, building
# the corrected frame in a single expression.
sales = sales.assign(
    date=pd.to_datetime(sales['date']),
    zipcode=sales['zipcode'].astype('category'),
)
    
In [6]:
    
# Re-display the first ten rows after the dtype conversion.
sales.head(10)
    
    Out[6]:
In [7]:
    
# Number of rows in the table.
len(sales)
    
    Out[7]:
In [8]:
    
# Scatter of price against living area, via the plot accessor's scatter method.
sales.plot.scatter(x='sqft_living', y='price')
    
    Out[8]:
    
In [9]:
    
# Split the rows into train and test sets with scikit-learn's
# train_test_split (imported at the top). test_size=0.2 gives the size of
# the test set (the lecture specified the size of the train data instead),
# and random_state=42 is the seed, so the split is repeatable.
train_data, test_data = train_test_split(sales, test_size=0.2, random_state=42)
    
In [10]:
    
# Row counts of the two splits: train first, then test.
print(len(train_data), len(test_data))
    
    
In [11]:
    
# Fit a one-feature linear regression: price as a function of sqft_living.
sqft_model = linear_model.LinearRegression()
# scikit-learn expects a 2-D feature matrix. Reshape the column's underlying
# NumPy array rather than calling np.reshape on the Series itself, which
# depends on Series.reshape (not available in current pandas releases).
train_X = train_data['sqft_living'].values.reshape(-1, 1)
train_Y = train_data['price']
sqft_model.fit(X = train_X, y = train_Y)
    
    Out[11]:
In [12]:
    
# Mean price in the test split, useful as a scale reference for error metrics.
print(test_data['price'].mean())
    
    
In [13]:
    
# Helper for scoring a fitted model against held-out data
def evaluate(model, test_x, test_y):
    """Return error metrics for ``model`` on a test set.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(test_x)``.
    test_x : array-like
        Features handed to ``model.predict`` unchanged.
    test_y : array-like
        True target values; subtracted from the predictions.

    Returns
    -------
    dict
        ``'rmse'``: square root of the mean squared error.
        ``'max_error'``: largest absolute prediction error.
    """
    abs_residuals = np.absolute(model.predict(test_x) - test_y)
    mean_squared = np.mean(abs_residuals ** 2)
    metrics = {}
    metrics['rmse'] = sqrt(mean_squared)
    metrics['max_error'] = abs_residuals.max()
    return metrics
    
In [14]:
    
# Build the 2-D test feature matrix from the column's underlying NumPy array;
# np.reshape on the Series itself depends on Series.reshape, which current
# pandas releases no longer provide.
test_X = test_data['sqft_living'].values.reshape(-1, 1)
test_Y = test_data['price']
# Score the single-feature model on the held-out rows.
evaluate(sqft_model, test_X, test_Y)
    
    Out[14]:
In [15]:
    
# Overlay the model's predictions (drawn as a line, '-') on the actual test
# prices (drawn as points, '.'), both against sqft_living.
plt.plot(test_data['sqft_living'], test_data['price'], '.',
        test_data['sqft_living'], sqft_model.predict(test_X), '-')
    
    Out[15]:
    
In [16]:
    
# Fitted slope (coefficient of sqft_living) followed by the intercept.
print(sqft_model.coef_[0], sqft_model.intercept_)
    
    
In [17]:
    
# List the available column names.
sales.columns
    
    Out[17]:
In [18]:
    
# Columns used as inputs for the multi-feature model.
my_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']
    
In [19]:
    
# Summary statistics for the selected features; include='all' asks describe
# to report on every column, not only the numeric ones.
sales[my_features].describe(include = 'all')
    
    Out[19]:
In [20]:
    
# Histograms of the selected feature columns.
sales[my_features].hist()
    
    Out[20]:
    
In [20]:
    
# Box plot of price grouped by zipcode; rot=-45 rotates the tick labels.
sales.boxplot(column='price', by='zipcode', rot=-45)
    
    Out[20]:
    
In [21]:
    
# One-hot encode the categorical feature(s) separately for each split.
train_with_dummies = pd.get_dummies(train_data[my_features])
test_with_dummies = pd.get_dummies(test_data[my_features])
# The two frames are encoded independently, so verify they expose the same
# columns in the same order; a mismatch would make a model fitted on the
# train matrix misread (or reject) the test matrix.
assert list(train_with_dummies.columns) == list(test_with_dummies.columns), \
    "train/test dummy columns differ"
train_with_dummies.head()
    
    Out[21]:
In [22]:
    
# Fit a second linear regression on the one-hot-encoded feature matrix,
# using the same training prices (train_Y) as the single-feature model.
my_features_model = linear_model.LinearRegression()
my_features_model.fit(X = train_with_dummies, y = train_Y)
    
    Out[22]:
In [23]:
    
# Compare both models on the test split: the single-feature model first,
# then the multi-feature model.
print(evaluate(sqft_model, test_X, test_Y))
print(evaluate(my_features_model, test_with_dummies, test_Y))
    
    
In [24]:
    
# Select the row(s) of sales whose id equals 5309101200.
house1 = sales.loc[sales['id'] == 5309101200]
    
In [25]:
    
# Display the selected listing.
house1
    
    Out[25]:
In [26]:
    
# Recorded price of the selected listing.
print(house1.price)
    
    
In [27]:
    
# Prediction from the sqft_model. The column's underlying NumPy array is
# reshaped to the 2-D layout scikit-learn expects; np.reshape on the Series
# itself depends on Series.reshape, which current pandas no longer provides.
sqft_model.predict(house1['sqft_living'].values.reshape(-1, 1))
    
    Out[27]:
In [28]:
    
# Prediction from my_features_model (with zipcode converted to dummy).
# NOTE(review): presumably this only lines up with the training columns
# because zipcode is categorical, so get_dummies emits a column per known
# category even for this small selection -- confirm if the dtype changes.
my_features_model.predict(pd.get_dummies(house1[my_features]))
    
    Out[28]:
In [29]:
    
# Select the row(s) of sales whose id equals 1925069082.
house2 = sales.loc[sales['id'] == 1925069082]
    
In [30]:
    
# Display the second selected listing.
house2
    
    Out[30]:
In [31]:
    
# Single-feature prediction for house2. Reshape the underlying NumPy array to
# 2-D; np.reshape on the Series itself depends on Series.reshape, which
# current pandas releases no longer provide.
sqft_model.predict(house2['sqft_living'].values.reshape(-1, 1))
    
    Out[31]:
In [32]:
    
# Multi-feature prediction for house2, one-hot encoding its selected features.
my_features_model.predict(pd.get_dummies(house2[my_features]))
    
    Out[32]:
In [33]:
    
# Start from a copy of house2 so that the edits to bill_gates do not touch
# house2 itself.
bill_gates = house2.copy()
    
In [34]:
    
# Overwrite the copied listing's features with the values of a hypothetical,
# much larger house.
bill_gates['bedrooms'] = 8
bill_gates['bathrooms'] = 25
bill_gates['sqft_living'] = 50000
bill_gates['sqft_lot'] = 225000
bill_gates['floors'] = 4
# Build the zipcode explicitly as a Categorical that shares the categories of
# sales['zipcode'], so get_dummies emits the same one-hot columns the model
# was trained on instead of relying on how `.loc[:, col] = scalar` treats the
# column's dtype. Fail loudly if the zipcode is not a known category, since
# the Categorical constructor would otherwise silently store NaN.
zipcode_categories = sales['zipcode'].cat.categories
assert 98039 in zipcode_categories, "zipcode 98039 is not a known category"
bill_gates['zipcode'] = pd.Categorical(
    [98039] * len(bill_gates), categories=zipcode_categories
)
    
In [35]:
    
# Inspect the zipcode column (values and dtype) after the assignment.
bill_gates['zipcode']
    
    Out[35]:
In [36]:
    
# Single-feature prediction for the hypothetical house. Reshape the
# underlying NumPy array to 2-D; np.reshape on the Series itself depends on
# Series.reshape, which current pandas releases no longer provide.
sqft_model.predict(bill_gates['sqft_living'].values.reshape(-1, 1))
    
    Out[36]:
In [37]:
    
# Multi-feature prediction for the hypothetical house.
# NOTE(review): assumes bill_gates['zipcode'] is still categorical so the
# dummy columns match the training matrix -- confirm via the cell above.
my_features_model.predict(pd.get_dummies(bill_gates[my_features]))
    
    Out[37]: