In [1]:
import os  
import numpy as np  
import pandas as pd  
import matplotlib.pyplot as plt  
%matplotlib inline

Importing the Training Data


In [5]:
path = os.path.join(os.getcwd(), 'ex1data1.txt')
data = pd.read_csv(path, header=None, names=['X', 'Y'])
data.head()


Out[5]:
        X         Y
0  6.1101  17.59200
1  5.5277   9.13020
2  8.5186  13.66200
3  7.0032  11.85400
4  5.8598   6.82330

(The full dataset has 97 rows × 2 columns; only the first five are shown here.)


In [7]:
data.describe()


Out[7]:
X Y
count 97.000000 97.000000
mean 8.159800 5.839135
std 3.869884 5.510262
min 5.026900 -2.680700
25% 5.707700 1.986900
50% 6.589400 4.562300
75% 8.578100 7.046700
max 22.203000 24.147000

Data Plotting and Visualization


In [8]:
data.plot(kind='scatter', x='X', y='Y', figsize=(12,8))


Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a5074e0>

Implementing One-Dimensional Linear Regression


In [9]:
def computeCost(X, y, theta):
    # squared-error cost: sum of squared residuals over all samples, halved and averaged
    inner = np.power(((X * theta.T) - y), 2)
    return np.sum(inner) / (2 * len(X))
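In equation form, this is the standard squared-error cost over the m training examples:

    J(\theta) = \frac{1}{2m} \sum_{i=1}^{m} \left( x^{(i)} \theta^{T} - y^{(i)} \right)^{2}

where x^{(i)} is the i-th row of X (including the column of ones added in the next cell).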

In [10]:
# append a column of ones to the front of the data set,
# so the first model parameter acts as the intercept term
data.insert(0, 'Ones', 1)

# set X (training data) and y (target variable)
cols = data.shape[1]  
X = data.iloc[:,0:cols-1]  
y = data.iloc[:,cols-1:cols]

In [11]:
# convert from data frames to numpy matrices
X = np.matrix(X.values)  
y = np.matrix(y.values)  
theta = np.matrix(np.array([0,0]))
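np.matrix keeps the `*` operator meaning matrix multiplication, which the code here relies on, but the class is discouraged in newer NumPy releases. For reference, a minimal plain-ndarray equivalent of the same setup (a sketch assuming the `@` operator from Python 3.5+; the names Xa, ya, theta_a are just illustrative):

    # plain-ndarray equivalent of the matrix setup above
    Xa = data.iloc[:, 0:cols-1].values     # (97, 2) array, including the ones column
    ya = data.iloc[:, cols-1:cols].values  # (97, 1) array of targets
    theta_a = np.zeros((1, 2))
    initial_cost = np.sum((Xa @ theta_a.T - ya) ** 2) / (2 * len(Xa))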

In [12]:
X.shape, theta.shape, y.shape


Out[12]:
((97, 2), (1, 2), (97, 1))

In [13]:
computeCost(X, y, theta)


Out[13]:
32.072733877455676

In [14]:
def gradientDescent(X, y, theta, alpha, iters):
    temp = np.matrix(np.zeros(theta.shape))
    parameters = int(theta.ravel().shape[1])
    cost = np.zeros(iters)               # cost history, one entry per iteration
    Wtheta = np.zeros(shape=(iters, 2))  # parameter history (intercept, slope)

    for i in range(iters):
        error = (X * theta.T) - y        # residuals for the current parameters

        # update each parameter from the partial derivative of the cost
        for j in range(parameters):
            term = np.multiply(error, X[:, j])
            temp[0, j] = theta[0, j] - ((alpha / len(X)) * np.sum(term))

        theta = temp                     # simultaneous update of all parameters
        cost[i] = computeCost(X, y, theta)
        Wtheta[i, 0] = theta[0, 0]
        Wtheta[i, 1] = theta[0, 1]

    return Wtheta, cost
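The per-parameter inner loop mirrors the derivative term by term, but the same update can be written in one vectorized step. A minimal sketch of that variant (assuming plain (m, n), (m, 1), and (n, 1) ndarrays rather than the np.matrix shapes used above):

    def gradientDescentVectorized(X, y, theta, alpha, iters):
        # X: (m, n), y: (m, 1), theta: (n, 1)
        m = len(X)
        cost = np.zeros(iters)
        for i in range(iters):
            grad = (X.T @ (X @ theta - y)) / m  # (n, 1) gradient of the cost
            theta = theta - alpha * grad        # simultaneous update of all parameters
            cost[i] = np.sum((X @ theta - y) ** 2) / (2 * m)
        return theta, cost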

In [15]:
# initialize variables for learning rate and iterations
alpha = 0.01
iters = 1000

# perform gradient descent to "fit" the model parameters
W, cost = gradientDescent(X, y, theta, alpha, iters)
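W stores the parameter trajectory, so its last row holds the fitted intercept and slope. A quick way to read them off and make a point prediction (the input 7.0 here is just an arbitrary example value):

    b, w = W[-1, 0], W[-1, 1]  # final intercept and slope after the last iteration
    y_pred = b + w * 7.0       # predicted Y for a hypothetical input X = 7.0
    print(b, w, y_pred)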

In [17]:
# evenly spaced inputs spanning the data range, used to draw the fitted lines
x = np.linspace(data.X.min(), data.X.max(), 10)

In [18]:
fig, ax = plt.subplots(figsize=(12,8))
# plot every intermediate fit in blue to show how the line converges
for i in range(iters):
    f = W[i, 0] + (W[i, 1] * x)
    ax.plot(x, f, 'b')
# plot the final fitted line in red on top
f = W[iters-1, 0] + (W[iters-1, 1] * x)
ax.plot(x, f, 'r')
ax.scatter(data.X, data.Y, label='Training Data')
ax.legend(loc=2)
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_title('Predicted Y vs. X')


Out[18]:
<matplotlib.text.Text at 0x11d1cd978>

In [19]:
fig, ax = plt.subplots(figsize=(12,8))  
ax.plot(np.arange(iters), cost, 'r')  
ax.set_xlabel('Iterations')  
ax.set_ylabel('Cost')  
ax.set_title('Error vs. Training Epoch')


Out[19]:
<matplotlib.text.Text at 0x11e6ef748>
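Because this problem is tiny, the closed-form normal equation, theta = (X^T X)^{-1} X^T y, makes a handy cross-check on the gradient-descent result. A minimal sketch using the X and y matrices defined above:

    # closed-form least squares as a sanity check on gradient descent
    theta_exact = np.linalg.solve(X.T * X, X.T * y)  # solves (X^T X) theta = X^T y
    print(theta_exact.ravel())  # should land close to the last row of W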
