In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
%matplotlib inline

df = pd.read_csv("kc_house_data.csv")

df.head(15)


Out[1]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
0 7129300520 20141013T000000 221900.0 3 1.00 1180 5650 1.0 0 0 ... 7 1180 0 1955 0 98178 47.5112 -122.257 1340 5650
1 6414100192 20141209T000000 538000.0 3 2.25 2570 7242 2.0 0 0 ... 7 2170 400 1951 1991 98125 47.7210 -122.319 1690 7639
2 5631500400 20150225T000000 180000.0 2 1.00 770 10000 1.0 0 0 ... 6 770 0 1933 0 98028 47.7379 -122.233 2720 8062
3 2487200875 20141209T000000 604000.0 4 3.00 1960 5000 1.0 0 0 ... 7 1050 910 1965 0 98136 47.5208 -122.393 1360 5000
4 1954400510 20150218T000000 510000.0 3 2.00 1680 8080 1.0 0 0 ... 8 1680 0 1987 0 98074 47.6168 -122.045 1800 7503
5 7237550310 20140512T000000 1225000.0 4 4.50 5420 101930 1.0 0 0 ... 11 3890 1530 2001 0 98053 47.6561 -122.005 4760 101930
6 1321400060 20140627T000000 257500.0 3 2.25 1715 6819 2.0 0 0 ... 7 1715 0 1995 0 98003 47.3097 -122.327 2238 6819
7 2008000270 20150115T000000 291850.0 3 1.50 1060 9711 1.0 0 0 ... 7 1060 0 1963 0 98198 47.4095 -122.315 1650 9711
8 2414600126 20150415T000000 229500.0 3 1.00 1780 7470 1.0 0 0 ... 7 1050 730 1960 0 98146 47.5123 -122.337 1780 8113
9 3793500160 20150312T000000 323000.0 3 2.50 1890 6560 2.0 0 0 ... 7 1890 0 2003 0 98038 47.3684 -122.031 2390 7570
10 1736800520 20150403T000000 662500.0 3 2.50 3560 9796 1.0 0 0 ... 8 1860 1700 1965 0 98007 47.6007 -122.145 2210 8925
11 9212900260 20140527T000000 468000.0 2 1.00 1160 6000 1.0 0 0 ... 7 860 300 1942 0 98115 47.6900 -122.292 1330 6000
12 114101516 20140528T000000 310000.0 3 1.00 1430 19901 1.5 0 0 ... 7 1430 0 1927 0 98028 47.7558 -122.229 1780 12697
13 6054650070 20141007T000000 400000.0 3 1.75 1370 9680 1.0 0 0 ... 7 1370 0 1977 0 98074 47.6127 -122.045 1370 10208
14 1175000570 20150312T000000 530000.0 5 2.00 1810 4850 1.5 0 0 ... 7 1810 0 1900 0 98107 47.6700 -122.394 1360 4850

15 rows × 21 columns


In [2]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
id               21613 non-null int64
date             21613 non-null object
price            21613 non-null float64
bedrooms         21613 non-null int64
bathrooms        21613 non-null float64
sqft_living      21613 non-null int64
sqft_lot         21613 non-null int64
floors           21613 non-null float64
waterfront       21613 non-null int64
view             21613 non-null int64
condition        21613 non-null int64
grade            21613 non-null int64
sqft_above       21613 non-null int64
sqft_basement    21613 non-null int64
yr_built         21613 non-null int64
yr_renovated     21613 non-null int64
zipcode          21613 non-null int64
lat              21613 non-null float64
long             21613 non-null float64
sqft_living15    21613 non-null int64
sqft_lot15       21613 non-null int64
dtypes: float64(5), int64(15), object(1)
memory usage: 3.5+ MB

In [3]:
df.isnull().sum()


Out[3]:
id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

In [4]:
df = df.drop(['id','date', 'lat', 'long','zipcode'], axis =1)

In [5]:
df.head()


Out[5]:
price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition grade sqft_above sqft_basement yr_built yr_renovated sqft_living15 sqft_lot15
0 221900.0 3 1.00 1180 5650 1.0 0 0 3 7 1180 0 1955 0 1340 5650
1 538000.0 3 2.25 2570 7242 2.0 0 0 3 7 2170 400 1951 1991 1690 7639
2 180000.0 2 1.00 770 10000 1.0 0 0 3 6 770 0 1933 0 2720 8062
3 604000.0 4 3.00 1960 5000 1.0 0 0 5 7 1050 910 1965 0 1360 5000
4 510000.0 3 2.00 1680 8080 1.0 0 0 3 8 1680 0 1987 0 1800 7503

In [6]:
import matplotlib.pyplot as plt
plt.figure(figsize=(48, 6))
sb.stripplot(x="yr_built", y="bedrooms", data=df);



In [7]:
plt.figure(figsize=(20, 8))
sb.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
sb.stripplot(x="bedrooms", y="price", data=df);



In [8]:
plt.figure(figsize=(48, 8))
sb.barplot(x="bedrooms", y="price", hue="grade", data=df);



In [9]:
sb.countplot(x='bedrooms',data=df, palette='hls')


Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0xe4b9320>

In [6]:
from sklearn.model_selection import train_test_split

columns = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade','sqft_above', 'sqft_basement', 'yr_built','yr_renovated', 'sqft_living15', 'sqft_lot15']
labels = df['price'].values
features = df[list(columns)].values

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.30)

In [7]:
from sklearn import linear_model

regr = linear_model.LinearRegression()

regr.fit(X_train, y_train)

Accuracy = regr.score(X_train, y_train)
print "Accuracy in the training data: ", Accuracy*100, "%"

accuracy = regr.score(X_test, y_test)
print "Accuracy in the test data", accuracy*100, "%"


Accuracy in the training data:  65.445910834 %
Accuracy in the test data 65.1300159738 %

In [ ]: