notebook.community

Edit and run



In [1]:

    
import numpy as np
import pandas as pd
import seaborn as sb
%matplotlib inline

df = pd.read_csv("kc_house_data.csv")

df.head(15)









    Out[1]:






  
    
      
      id
      date
      price
      bedrooms
      bathrooms
      sqft_living
      sqft_lot
      floors
      waterfront
      view
      ...
      grade
      sqft_above
      sqft_basement
      yr_built
      yr_renovated
      zipcode
      lat
      long
      sqft_living15
      sqft_lot15
    
  
  
    
      0
      7129300520
      20141013T000000
      221900.0
      3
      1.00
      1180
      5650
      1.0
      0
      0
      ...
      7
      1180
      0
      1955
      0
      98178
      47.5112
      -122.257
      1340
      5650
    
    
      1
      6414100192
      20141209T000000
      538000.0
      3
      2.25
      2570
      7242
      2.0
      0
      0
      ...
      7
      2170
      400
      1951
      1991
      98125
      47.7210
      -122.319
      1690
      7639
    
    
      2
      5631500400
      20150225T000000
      180000.0
      2
      1.00
      770
      10000
      1.0
      0
      0
      ...
      6
      770
      0
      1933
      0
      98028
      47.7379
      -122.233
      2720
      8062
    
    
      3
      2487200875
      20141209T000000
      604000.0
      4
      3.00
      1960
      5000
      1.0
      0
      0
      ...
      7
      1050
      910
      1965
      0
      98136
      47.5208
      -122.393
      1360
      5000
    
    
      4
      1954400510
      20150218T000000
      510000.0
      3
      2.00
      1680
      8080
      1.0
      0
      0
      ...
      8
      1680
      0
      1987
      0
      98074
      47.6168
      -122.045
      1800
      7503
    
    
      5
      7237550310
      20140512T000000
      1225000.0
      4
      4.50
      5420
      101930
      1.0
      0
      0
      ...
      11
      3890
      1530
      2001
      0
      98053
      47.6561
      -122.005
      4760
      101930
    
    
      6
      1321400060
      20140627T000000
      257500.0
      3
      2.25
      1715
      6819
      2.0
      0
      0
      ...
      7
      1715
      0
      1995
      0
      98003
      47.3097
      -122.327
      2238
      6819
    
    
      7
      2008000270
      20150115T000000
      291850.0
      3
      1.50
      1060
      9711
      1.0
      0
      0
      ...
      7
      1060
      0
      1963
      0
      98198
      47.4095
      -122.315
      1650
      9711
    
    
      8
      2414600126
      20150415T000000
      229500.0
      3
      1.00
      1780
      7470
      1.0
      0
      0
      ...
      7
      1050
      730
      1960
      0
      98146
      47.5123
      -122.337
      1780
      8113
    
    
      9
      3793500160
      20150312T000000
      323000.0
      3
      2.50
      1890
      6560
      2.0
      0
      0
      ...
      7
      1890
      0
      2003
      0
      98038
      47.3684
      -122.031
      2390
      7570
    
    
      10
      1736800520
      20150403T000000
      662500.0
      3
      2.50
      3560
      9796
      1.0
      0
      0
      ...
      8
      1860
      1700
      1965
      0
      98007
      47.6007
      -122.145
      2210
      8925
    
    
      11
      9212900260
      20140527T000000
      468000.0
      2
      1.00
      1160
      6000
      1.0
      0
      0
      ...
      7
      860
      300
      1942
      0
      98115
      47.6900
      -122.292
      1330
      6000
    
    
      12
      114101516
      20140528T000000
      310000.0
      3
      1.00
      1430
      19901
      1.5
      0
      0
      ...
      7
      1430
      0
      1927
      0
      98028
      47.7558
      -122.229
      1780
      12697
    
    
      13
      6054650070
      20141007T000000
      400000.0
      3
      1.75
      1370
      9680
      1.0
      0
      0
      ...
      7
      1370
      0
      1977
      0
      98074
      47.6127
      -122.045
      1370
      10208
    
    
      14
      1175000570
      20150312T000000
      530000.0
      5
      2.00
      1810
      4850
      1.5
      0
      0
      ...
      7
      1810
      0
      1900
      0
      98107
      47.6700
      -122.394
      1360
      4850
    
  

15 rows × 21 columns



In [2]:

    
df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
id               21613 non-null int64
date             21613 non-null object
price            21613 non-null float64
bedrooms         21613 non-null int64
bathrooms        21613 non-null float64
sqft_living      21613 non-null int64
sqft_lot         21613 non-null int64
floors           21613 non-null float64
waterfront       21613 non-null int64
view             21613 non-null int64
condition        21613 non-null int64
grade            21613 non-null int64
sqft_above       21613 non-null int64
sqft_basement    21613 non-null int64
yr_built         21613 non-null int64
yr_renovated     21613 non-null int64
zipcode          21613 non-null int64
lat              21613 non-null float64
long             21613 non-null float64
sqft_living15    21613 non-null int64
sqft_lot15       21613 non-null int64
dtypes: float64(5), int64(15), object(1)
memory usage: 3.5+ MB



In [3]:

    
df.isnull().sum()









    Out[3]:





id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64



In [4]:

    
df = df.drop(['id','date', 'lat', 'long','zipcode'], axis =1)



In [5]:

    
df.head()









    Out[5]:






  
    
      
      price
      bedrooms
      bathrooms
      sqft_living
      sqft_lot
      floors
      waterfront
      view
      condition
      grade
      sqft_above
      sqft_basement
      yr_built
      yr_renovated
      sqft_living15
      sqft_lot15
    
  
  
    
      0
      221900.0
      3
      1.00
      1180
      5650
      1.0
      0
      0
      3
      7
      1180
      0
      1955
      0
      1340
      5650
    
    
      1
      538000.0
      3
      2.25
      2570
      7242
      2.0
      0
      0
      3
      7
      2170
      400
      1951
      1991
      1690
      7639
    
    
      2
      180000.0
      2
      1.00
      770
      10000
      1.0
      0
      0
      3
      6
      770
      0
      1933
      0
      2720
      8062
    
    
      3
      604000.0
      4
      3.00
      1960
      5000
      1.0
      0
      0
      5
      7
      1050
      910
      1965
      0
      1360
      5000
    
    
      4
      510000.0
      3
      2.00
      1680
      8080
      1.0
      0
      0
      3
      8
      1680
      0
      1987
      0
      1800
      7503



In [6]:

    
import matplotlib.pyplot as plt
plt.figure(figsize=(48, 6))
sb.stripplot(x="yr_built", y="bedrooms", data=df);



In [7]:

    
plt.figure(figsize=(20, 8))
sb.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
sb.stripplot(x="bedrooms", y="price", data=df);



In [8]:

    
plt.figure(figsize=(48, 8))
sb.barplot(x="bedrooms", y="price", hue="grade", data=df);



In [9]:

    
sb.countplot(x='bedrooms',data=df, palette='hls')









    Out[9]:





<matplotlib.axes._subplots.AxesSubplot at 0xe4b9320>



In [6]:

    
from sklearn.model_selection import train_test_split

columns = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade','sqft_above', 'sqft_basement', 'yr_built','yr_renovated', 'sqft_living15', 'sqft_lot15']
labels = df['price'].values
features = df[list(columns)].values

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.30)



In [7]:

    
from sklearn import linear_model

regr = linear_model.LinearRegression()

regr.fit(X_train, y_train)

Accuracy = regr.score(X_train, y_train)
print "Accuracy in the training data: ", Accuracy*100, "%"

accuracy = regr.score(X_test, y_test)
print "Accuracy in the test data", accuracy*100, "%"









    



Accuracy in the training data:  65.445910834 %
Accuracy in the test data 65.1300159738 %



In [ ]:

	id	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	...	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15
0	7129300520	20141013T000000	221900.0	3	1.00	1180	5650	1.0	...	7	1180	0	1955	0	98178	47.5112	-122.257	1340	5650
1	6414100192	20141209T000000	538000.0	3	2.25	2570	7242	2.0	...	7	2170	400	1951	1991	98125	47.7210	-122.319	1690	7639
2	5631500400	20150225T000000	180000.0	2	1.00	770	10000	1.0	...	6	770	0	1933	0	98028	47.7379	-122.233	2720	8062
3	2487200875	20141209T000000	604000.0	4	3.00	1960	5000	1.0	...	7	1050	910	1965	0	98136	47.5208	-122.393	1360	5000
4	1954400510	20150218T000000	510000.0	3	2.00	1680	8080	1.0	...	8	1680	0	1987	0	98074	47.6168	-122.045	1800	7503
5	7237550310	20140512T000000	1225000.0	4	4.50	5420	101930	1.0	...	11	3890	1530	2001	0	98053	47.6561	-122.005	4760	101930
6	1321400060	20140627T000000	257500.0	3	2.25	1715	6819	2.0	...	7	1715	0	1995	0	98003	47.3097	-122.327	2238	6819
7	2008000270	20150115T000000	291850.0	3	1.50	1060	9711	1.0	...	7	1060	0	1963	0	98198	47.4095	-122.315	1650	9711
8	2414600126	20150415T000000	229500.0	3	1.00	1780	7470	1.0	...	7	1050	730	1960	0	98146	47.5123	-122.337	1780	8113
9	3793500160	20150312T000000	323000.0	3	2.50	1890	6560	2.0	...	7	1890	0	2003	0	98038	47.3684	-122.031	2390	7570
10	1736800520	20150403T000000	662500.0	3	2.50	3560	9796	1.0	...	8	1860	1700	1965	0	98007	47.6007	-122.145	2210	8925
11	9212900260	20140527T000000	468000.0	2	1.00	1160	6000	1.0	...	7	860	300	1942	0	98115	47.6900	-122.292	1330	6000
12	114101516	20140528T000000	310000.0	3	1.00	1430	19901	1.5	...	7	1430	0	1927	0	98028	47.7558	-122.229	1780	12697
13	6054650070	20141007T000000	400000.0	3	1.75	1370	9680	1.0	...	7	1370	0	1977	0	98074	47.6127	-122.045	1370	10208
14	1175000570	20150312T000000	530000.0	5	2.00	1810	4850	1.5	...	7	1810	0	1900	0	98107	47.6700	-122.394	1360	4850