Univariate linear regression



In [1]:

    
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model



In [2]:

    
# Univariate data set: read the first two comma-separated columns of the file.
# unpack=True transposes the loaded array, so data[0] holds the first column
# and data[1] the second (one row of `data` per file column).
datafile = 'ex1/ex1data1.txt'
data = np.loadtxt(
    datafile,
    delimiter=',',
    usecols=(0, 1),
    unpack=True,
)



In [3]:

    
# Turn each unpacked file column into an (n_samples, 1) column vector, the 2-D
# layout the estimator is fitted on further down.
# -1 lets NumPy infer the number of rows from the data instead of hard-coding
# 97, so the cell no longer raises if the data file gains or loses rows.
X = data[0].reshape(-1, 1)  # feature column (plotted later as city population)
y = data[1].reshape(-1, 1)  # target column (plotted later as profit)



In [4]:

    
# Hold out the last 20 rows of X as the test set; every row before them is
# used for training. The split is purely positional (no shuffling).
X_train, X_test = X[:-20], X[-20:]



In [5]:

    
# Positional split of the targets: the last 20 rows form the test set and the
# remaining rows the training set, mirroring the [:-20] / [-20:] split of X.
y_train, y_test = y[:-20], y[-20:]



In [6]:

    
# Instantiate the scikit-learn linear regression estimator.
# fit_intercept=True is the library default; it is spelled out only to make
# explicit that an intercept term is learned alongside the coefficient.
lr = linear_model.LinearRegression(fit_intercept=True)



In [7]:

    
# Fit the estimator on the training split only. The call is kept as the last
# expression of the cell so the notebook still displays the returned estimator.
lr.fit(X=X_train, y=y_train)









    



/usr/local/lib/python2.7/site-packages/scipy/linalg/basic.py:884: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
  warnings.warn(mesg, RuntimeWarning)






    Out[7]:





LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)



In [8]:

    
# Report the fitted coefficient and evaluate the model on the held-out test split.
# Each print() receives a single pre-formatted string, so the cell runs
# unchanged on both Python 2 and Python 3 (the former print statements were a
# SyntaxError on Python 3).
print("Coefficient:  %s" % (lr.coef_,))
# np.mean of the squared residuals is the mean squared error (an average, not
# a sum), so the label now says so instead of "Residual sum of squares".
print("Mean squared error: %.2f" % np.mean((lr.predict(X_test) - y_test) ** 2))
# LinearRegression.score returns the coefficient of determination (R^2) of the
# predictions on the given data; 1.0 would be a perfect fit.
print('Variance score: %.2f' % lr.score(X_test, y_test))









    



Coefficient:  [[ 1.19367575]]
Residual sum of squares: 6.59
Variance score: 0.20



In [9]:

    
# Scatter every sample (train and test alike) in red and overlay the fitted
# regression line in blue, using the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X, y, color='red')
ax.plot(X, lr.predict(X), color='blue', linewidth=1)
ax.grid(True)
ax.set_ylabel('Profit in $10,000s')
ax.set_xlabel('Population of City in 10,000s')
plt.show()

Multivariate linear regression



In [10]:

    
# Multivariate data set: read the first three comma-separated columns of the
# file. unpack=True transposes the loaded array, so data[0], data[1] and
# data[2] each hold one complete file column.
datafile = 'ex1/ex1data2.txt'
data = np.loadtxt(
    datafile,
    delimiter=',',
    usecols=(0, 1, 2),
    unpack=True,
)



In [11]:

    
# `data` was loaded with unpack=True, so each of its rows is one file column.
# The first two columns are the features; X keeps them as rows, i.e. with
# shape (2, n_samples).
X = data[0:2]
# The third column is the target, reshaped to an (n_samples, 1) column vector.
# -1 lets NumPy infer the row count instead of hard-coding 47, so the cell no
# longer raises if the data file gains or loses rows.
y = data[2].reshape(-1, 1)



In [12]:

    
# Split the data into training/testing sets.
# X holds one feature per row; turn each into an (n_samples, 1) column vector.
# -1 lets NumPy infer the row count instead of hard-coding 47, so the cell no
# longer raises if the data file gains or loses rows.
X_1 = X[0].reshape(-1, 1)
X_2 = X[1].reshape(-1, 1)

# Stack the two feature columns side by side into (n_rows, 2) matrices.
# The last 20 rows are held out for testing; the rest are used for training
# (positional split, no shuffling).
X_train = np.hstack((X_1[:-20], X_2[:-20]))
X_test = np.hstack((X_1[-20:], X_2[-20:]))



In [13]:

    
# Positional split of the targets: the last 20 rows form the test set and the
# remaining rows the training set, matching the row split used for X_train/X_test.
y_train, y_test = y[:-20], y[-20:]



In [14]:

    
# Fresh linear regression estimator for the two-feature problem; this rebinds
# `lr`, replacing the univariate model fitted earlier in the notebook.
# fit_intercept=True is the library default, written out for readability.
lr = linear_model.LinearRegression(fit_intercept=True)



In [15]:

    
# Fit the estimator on the two-feature training split only. The call is kept as
# the last expression of the cell so the notebook still displays the returned
# estimator.
lr.fit(X=X_train, y=y_train)









    Out[15]:





LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)



In [16]:

    
# Report the fitted coefficients and evaluate the model on the held-out test split.
# Each print() receives a single pre-formatted string, so the cell runs
# unchanged on both Python 2 and Python 3 (the former print statements were a
# SyntaxError on Python 3).
print("Coefficient:  %s" % (lr.coef_,))
# np.mean of the squared residuals is the mean squared error (an average, not
# a sum), so the label now says so instead of "Residual sum of squares".
print("Mean squared error: %.2f" % np.mean((lr.predict(X_test) - y_test) ** 2))
# LinearRegression.score returns the coefficient of determination (R^2) of the
# predictions on the given data; 1.0 would be a perfect fit.
print('Variance score: %.2f' % lr.score(X_test, y_test))









    



Coefficient:  [[  143.62075383  8517.08072987]]
Residual sum of squares: 4739542362.77
Variance score: 0.59



In [ ]: