In [10]:
# import
import graphlab as gl
import matplotlib.pyplot as plt

In [11]:
gl.canvas.set_target('ipynb')
%matplotlib inline

In [4]:
# reading the data
data = gl.SFrame.read_csv('data/Philadelphia_Crime_Rate_noNA.csv')
data.head(4)


Finished parsing file D:\Git\MachineLearning\ML_UoW\Course01_Regression\data\Philadelphia_Crime_Rate_noNA.csv
Parsing completed. Parsed 99 lines in 0.024024 secs.
------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[long,float,float,float,float,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
Finished parsing file D:\Git\MachineLearning\ML_UoW\Course01_Regression\data\Philadelphia_Crime_Rate_noNA.csv
Parsing completed. Parsed 99 lines in 0.020014 secs.
Out[4]:
HousePrice HsPrc ($10,000) CrimeRate MilesPhila PopChg Name County
140463 14.0463 29.7 10.0 -1.0 Abington Montgome
113033 11.3033 24.1 18.0 4.0 Ambler Montgome
124186 12.4186 19.5 25.0 8.0 Aston Delaware
110490 11.049 49.4 25.0 2.7 Bensalem Bucks
[4 rows x 7 columns]

In [8]:
# making a scatter plot
data.show(x='CrimeRate', y='HousePrice', view='Scatter Plot')


Fitting the Regression model


In [9]:
crime_model = gl.linear_regression.create(data,
                                         features=['CrimeRate'],
                                         target='HousePrice')


Linear regression:
--------------------------------------------------------
Number of examples          : 99
Number of features          : 1
Number of unpacked features : 1
Number of coefficients    : 2
Starting Newton Method
--------------------------------------------------------
+-----------+----------+--------------+--------------------+---------------+
| Iteration | Passes   | Elapsed Time | Training-max_error | Training-rmse |
+-----------+----------+--------------+--------------------+---------------+
| 1         | 2        | 1.062593     | 314982.574662      | 83468.940148  |
+-----------+----------+--------------+--------------------+---------------+
SUCCESS: Optimal solution found.


In [20]:
plt.plot(data['CrimeRate'], data['HousePrice'], '.',
        data['CrimeRate'], crime_model.predict(data), '-')


Out[20]:
[<matplotlib.lines.Line2D at 0x20de12e8>,
 <matplotlib.lines.Line2D at 0x20de1390>]

Removing the outlier data


In [30]:
data2 = data[data['MilesPhila']!=0.0]

In [31]:
crime_model_noCC = gl.linear_regression.create(data2,
                                         features=['CrimeRate'],
                                         target='HousePrice')


Linear regression:
--------------------------------------------------------
Number of examples          : 98
Number of features          : 1
Number of unpacked features : 1
Number of coefficients    : 2
Starting Newton Method
--------------------------------------------------------
+-----------+----------+--------------+--------------------+---------------+
| Iteration | Passes   | Elapsed Time | Training-max_error | Training-rmse |
+-----------+----------+--------------+--------------------+---------------+
| 1         | 2        | 0.000501     | 315335.534886      | 78052.676125  |
+-----------+----------+--------------+--------------------+---------------+
SUCCESS: Optimal solution found.


In [32]:
plt.plot(data2['CrimeRate'], data2['HousePrice'], '.',
        data2['CrimeRate'], crime_model_noCC.predict(data2), '-')


Out[32]:
[<matplotlib.lines.Line2D at 0x1d783b00>,
 <matplotlib.lines.Line2D at 0x1d783cc0>]

Comparing the co-efficieitns


In [33]:
print('data: ',crime_model.coefficients)
print('data2: ',crime_model_noCC.coefficients)


('data: ', Columns:
	name	str
	index	str
	value	float
	stderr	float

Rows: 2

Data:
+-------------+-------+----------------+---------------+
|     name    | index |     value      |     stderr    |
+-------------+-------+----------------+---------------+
| (intercept) |  None | 176626.046881  | 11245.5882194 |
|  CrimeRate  |  None | -576.804949058 |  226.90225951 |
+-------------+-------+----------------+---------------+
[2 rows x 4 columns]
)
('data2: ', Columns:
	name	str
	index	str
	value	float
	stderr	float

Rows: 2

Data:
+-------------+-------+----------------+---------------+
|     name    | index |     value      |     stderr    |
+-------------+-------+----------------+---------------+
| (intercept) |  None | 225204.604303  | 16404.0247514 |
|  CrimeRate  |  None | -2287.69717443 | 491.537478123 |
+-------------+-------+----------------+---------------+
[2 rows x 4 columns]
)

In [34]:
crime_model.get('coefficients')


Out[34]:
name index value stderr
(intercept) None 176626.046881 11245.5882194
CrimeRate None -576.804949058 226.90225951
[2 rows x 4 columns]

In [35]:
crime_model_noCC.coefficients


Out[35]:
name index value stderr
(intercept) None 225204.604303 16404.0247514
CrimeRate None -2287.69717443 491.537478123
[2 rows x 4 columns]

Removing the High End House


In [36]:
data_noHighPrice = data2[data2['HousePrice']<350000]

In [41]:
crime_model_noHighPrice = gl.linear_regression.create(data_noHighPrice,
                                         features=['CrimeRate'],
                                         target='HousePrice')


Linear regression:
--------------------------------------------------------
Number of examples          : 93
Number of features          : 1
Number of unpacked features : 1
Number of coefficients    : 2
Starting Newton Method
--------------------------------------------------------
+-----------+----------+--------------+--------------------+---------------+
| Iteration | Passes   | Elapsed Time | Training-max_error | Training-rmse |
+-----------+----------+--------------+--------------------+---------------+
| 1         | 2        | 0.000000     | 153105.996748      | 54958.023447  |
+-----------+----------+--------------+--------------------+---------------+
SUCCESS: Optimal solution found.


In [42]:
crime_model_noHighPrice.get('coefficients')


Out[42]:
name index value stderr
(intercept) None 199073.589615 11932.5101105
CrimeRate None -1837.71280989 351.519609333
[2 rows x 4 columns]

In [43]:
crime_model_noCC.get('coefficients')


Out[43]:
name index value stderr
(intercept) None 225204.604303 16404.0247514
CrimeRate None -2287.69717443 491.537478123
[2 rows x 4 columns]

In [ ]: