In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
%matplotlib inline
# Read the house-sales CSV from the working directory into a DataFrame,
# then show the first 15 rows as a quick sanity check of the load.
df = pd.read_csv(filepath_or_buffer="kc_house_data.csv")
df.head(n=15)
Out[1]:
In [2]:
# Summarise the frame: column names, dtypes, non-null counts and memory usage.
df.info()
In [3]:
# Count missing values per column to see whether any cleaning is required.
df.isnull().sum()
Out[3]:
In [4]:
# Drop the columns that are neither plotted nor used as model features in the
# cells below (judging by their names: identifier, date and location fields).
#
# errors='ignore' keeps this cell idempotent: the result is assigned back to
# `df`, so without it a second run of the cell raises KeyError because the
# columns are already gone.
df = df.drop(['id', 'date', 'lat', 'long', 'zipcode'], axis=1, errors='ignore')
In [5]:
# Preview the first five rows of the current `df`.
df.head()
Out[5]:
In [6]:
import matplotlib.pyplot as plt

# Bedrooms against construction year. `yr_built` is treated as a categorical
# axis by stripplot (one tick per distinct value), hence the very wide canvas.
plt.figure(figsize=(48, 6))
sb.stripplot(data=df, x="yr_built", y="bedrooms");
In [7]:
# Sale price against bedroom count.
plt.figure(figsize=(20, 8))
# set_context changes seaborn's global plotting context, so the larger fonts
# and thicker lines also apply to every plot drawn after this cell.
sb.set_context(
    "notebook",
    font_scale=1.5,
    rc={"lines.linewidth": 2.5},
)
sb.stripplot(data=df, x="bedrooms", y="price");
In [8]:
# Price per bedroom count (barplot's default aggregate), with one bar per
# `grade` inside each bedroom group; the wide canvas leaves room for them.
plt.figure(figsize=(48, 8))
sb.barplot(data=df, x="bedrooms", y="price", hue="grade");
In [9]:
# Number of listings for each bedroom count, coloured with the 'hls' palette.
sb.countplot(data=df, x='bedrooms', palette='hls')
Out[9]:
In [6]:
from sklearn.model_selection import train_test_split

# Predictor columns passed to the model; `price` is the regression target.
columns = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade', 'sqft_above',
    'sqft_basement', 'yr_built', 'yr_renovated', 'sqft_living15',
    'sqft_lot15',
]
labels = df['price'].values
# `columns` is already a list, so it can index the frame directly.
features = df[columns].values

# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# shuffle, and therefore every score reported below, reproducible across
# kernel restarts; without it each run produces a different split.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.30, random_state=42
)
In [7]:
from sklearn import linear_model

# Fit an ordinary least-squares model on the training split.
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

# LinearRegression.score returns the coefficient of determination (R^2),
# not a classification accuracy: 1.0 is a perfect fit and the value can be
# negative for a model worse than predicting the mean. The output is labelled
# accordingly. The variable names `Accuracy` / `accuracy` are kept unchanged
# in case other cells read them.
#
# print() is called with a single pre-formatted string so the cell runs, and
# prints the same text, on both Python 2 and Python 3 (the original
# `print "..."` statement form is a SyntaxError on Python 3).
Accuracy = regr.score(X_train, y_train)
print("R^2 on the training data: {} %".format(Accuracy * 100))
accuracy = regr.score(X_test, y_test)
print("R^2 on the test data: {} %".format(accuracy * 100))
In [ ]: