In [172]:

    
#libraries used
import pandas as pd
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt
%matplotlib inline

Week 1: Types of machine learning, when to use machine learning, neural network architecture

Linear regression demo

brain_body.txt is a dataset of brain and body sizes.



In [72]:

    
# Read the fixed-width brain/body table. Double brackets keep each selection
# as a one-column DataFrame (2-D), which is the input shape scikit-learn's
# estimators work with.
df = pd.read_fwf('linear_regression_demo/brain_body.txt')
x_values = df[['Brain']]
y_values = df[['Body']]

# Fit an ordinary least-squares line predicting Body from Brain.
body_reg = linear_model.LinearRegression()
body_reg.fit(x_values, y_values)

# Draw the observations and the fitted line on explicitly labelled axes so the
# figure can be understood on its own; the line gets a contrasting colour so
# it stands out from the scattered points.
fig, ax = plt.subplots()
ax.scatter(x_values, y_values, label='data')
ax.plot(x_values, body_reg.predict(x_values), color='red', label='linear fit')
ax.set_xlabel('Brain')
ax.set_ylabel('Body')
ax.set_title('Body size vs. brain size with linear fit')
ax.legend()
plt.show()

Siraj's Week 1 challenge

The weekly challenge is to predict life expectancy from BMI at birth.

The challenge for this video is to use scikit-learn to create a line of best fit for the included 'challenge_dataset'. Then, make a prediction for an existing data point and see how closely it matches the actual value. Print out the error you get.



In [109]:

    
# Read the challenge data; the file is loaded with explicit column names
# 'Data' (feature) and 'Outcome' (target).
df = pd.read_csv('linear_regression_demo/challenge_dataset.txt', names=['Data','Outcome'])
x_values = df[['Data']]
y_values = df[['Outcome']]

# Fit an ordinary least-squares line predicting Outcome from Data.
reg = linear_model.LinearRegression()
reg.fit(x_values, y_values)

# Draw the observations and the fitted line on explicitly labelled axes so the
# figure can be understood on its own; the line gets a contrasting colour so
# it stands out from the scattered points.
fig, ax = plt.subplots()
ax.scatter(x_values, y_values, label='data')
ax.plot(x_values, reg.predict(x_values), color='red', label='linear fit')
ax.set_xlabel('Data')
ax.set_ylabel('Outcome')
ax.set_title('Challenge dataset with linear fit')
ax.legend()
plt.show()

So now we have a simple model trained on the dataset. Now let's make a prediction.



In [115]:

    
# Store the model's fitted value for every row, then the signed residual
# (prediction minus actual outcome) alongside it, and preview the result.
df['Predictions'] = reg.predict(x_values)
df['Pred_Error'] = df['Predictions'].sub(df['Outcome'])
df.head(n=5)









    Out[115]:






  
    
      
      Data
      Outcome
      Predictions
      Pred_Error
    
  
  
    
      0
      6.1101
      17.5920
      3.393774
      -14.198226
    
    
      1
      5.5277
      9.1302
      2.698951
      -6.431249
    
    
      2
      8.5186
      13.6620
      6.267196
      -7.394804
    
    
      3
      7.0032
      11.8540
      4.459272
      -7.394728
    
    
      4
      5.8598
      6.8233
      3.095158
      -3.728142



In [114]:

    
# Mean absolute error: the average magnitude of the prediction residuals.
# Computed with vectorised pandas operations instead of a Python-level loop.
df['Pred_Error'].abs().mean()









    Out[114]:





2.194245398827007

Linear Regression Quiz



In [124]:

    
import pandas as pd
from sklearn.linear_model import LinearRegression

# Read the BMI / life-expectancy table into a DataFrame.
bmi_life_data = pd.read_csv('bmi_and_life_expectancy.csv')
# Print the (rows, columns) shape, then show the first five rows as the cell output.
print(bmi_life_data.shape)
bmi_life_data.head(n=5)









    



(163, 3)






    Out[124]:






  
    
      
      Country
      Life expectancy
      BMI
    
  
  
    
      0
      Afghanistan
      52.8
      20.62058
    
    
      1
      Albania
      76.8
      26.44657
    
    
      2
      Algeria
      75.5
      24.59620
    
    
      3
      Andorra
      84.6
      27.63048
    
    
      4
      Angola
      56.7
      22.25083



In [136]:

    
# Make and fit the linear regression model.
# Double brackets keep the feature and the target as one-column DataFrames (2-D).
x_vals = bmi_life_data[['BMI']]
y_vals = bmi_life_data[['Life expectancy']]

bmi_life_model = LinearRegression()
bmi_life_model.fit(x_vals, y_vals)

# Draw the observations and the fitted line on explicitly labelled axes so the
# figure can be understood on its own; the line gets a contrasting colour so
# it stands out from the scattered points.
fig, ax = plt.subplots()
ax.scatter(x_vals, y_vals, label='data')
ax.plot(x_vals, bmi_life_model.predict(x_vals), color='red', label='linear fit')
ax.set_xlabel('BMI')
ax.set_ylabel('Life expectancy')
ax.set_title('Life expectancy vs. BMI with linear fit')
ax.legend()
plt.show()



In [138]:

    
# Make a prediction using the model: life expectancy for a BMI value of 21.07931.
# predict() needs 2-D input of shape (n_samples, n_features); a bare scalar is
# rejected by current scikit-learn. A one-row frame with the same 'BMI' column
# the model was fitted on supplies that shape and keeps feature names consistent.
laos_life_exp = bmi_life_model.predict(pd.DataFrame({'BMI': [21.07931]}))
laos_life_exp









    Out[138]:





array([[ 60.31564716]])

Programming Quiz: Multiple Linear Regression

In this quiz, you'll be using the Boston house-prices dataset. The dataset consists of 13 features of 506 houses and their median value in $1000s. You'll fit a model on the 13 features to predict the value of houses.



In [157]:

    
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_boston

# Load the Boston house-prices dataset.
# NOTE(review): load_boston is no longer shipped in recent scikit-learn
# releases (removed in 1.2) - confirm the pinned version before re-running.
boston_data = load_boston()
# Show the first house: its feature vector followed by its target value.
print(boston_data['data'][0], boston_data['target'][0])









    



[  6.32000000e-03   1.80000000e+01   2.31000000e+00   0.00000000e+00
   5.38000000e-01   6.57500000e+00   6.52000000e+01   4.09000000e+00
   1.00000000e+00   2.96000000e+02   1.53000000e+01   3.96900000e+02
   4.98000000e+00] 24.0



In [162]:

    
# Feature matrix and target vector taken from the loaded dataset.
x, y = boston_data.data, boston_data.target

# Create the regressor and fit it on all features at once
# (fit() returns the fitted estimator itself).
model = LinearRegression().fit(x, y)

# A single house described by its feature values; the outer list makes it a
# one-row 2-D input for predict().
sample_house = [[
    0.22969, 0.0, 10.59, 0.0, 0.489,
    6.326, 52.5, 4.3549, 4.0, 277.0,
    18.6, 394.87, 10.97,
]]

# Predict the housing price for sample_house and print it.
prediction = model.predict(sample_house)
print(prediction)









    



[ 23.68420569]

Siraj's Linear Regression live course

Siraj's code is here.

The dataset contains student test scores and the number of hours they studied. Intuitively, there must be a relationship, right? The more you study, the better your test scores should be. We're going to use linear regression to demonstrate this relationship.



In [181]:

    
# Step 1 - collect our data.
# The CSV has no header row, so pandas numbers the columns 0 and 1.
df = pd.read_csv(
    'linear_regression_live/data.csv',
    header=None,
)
df.head(n=5)



In [192]:

    
# Load the same CSV as a plain NumPy array (one row per observation) for the
# hand-written gradient descent below, and preview the first five rows.
points = np.genfromtxt(
    'linear_regression_live/data.csv',
    delimiter=',',
)
points[0:5]









    Out[192]:





array([[ 32.50234527,  31.70700585],
       [ 53.42680403,  68.77759598],
       [ 61.53035803,  62.5623823 ],
       [ 47.47563963,  71.54663223],
       [ 59.81320787,  87.23092513]])



In [235]:

    
# Quick look at the raw observations: column 0 on the x-axis, column 1 on the y-axis.
# NOTE(review): presumably hours studied vs. test score - confirm the column order.
plt.scatter(x=df[0], y=df[1])
plt.show()

Step 2 - define our hyperparameters for the equation $y = mx + b$ (slope-intercept form). How fast should our model converge?



In [185]:

    
# Gradient-descent settings for fitting y = m*x + b.
learning_rate = 1e-4          # multiplier applied to each gradient before updating b and m
initial_b, initial_m = 0, 0   # starting intercept and slope
num_iterations = 1000         # number of gradient steps to run

Step 3: Train the model



In [230]:

    
def compute_error_for_line_given_points(b, m, points):
    """Return the mean squared error of the line ``y = m*x + b`` over ``points``.

    Parameters
    ----------
    b : float
        Intercept of the candidate line.
    m : float
        Slope of the candidate line.
    points : array-like of shape (n, 2)
        One row per observation; column 0 holds x and column 1 holds y.
        Any array-like (ndarray, list of pairs, ...) is accepted.

    Returns
    -------
    float
        Average of the squared residuals ``(y - (m*x + b)) ** 2``.

    Raises
    ------
    ValueError
        If ``points`` contains no observations (the mean is undefined).
    """
    data = np.asarray(points, dtype=float)
    if data.size == 0:
        raise ValueError("points must contain at least one (x, y) observation")
    x, y = data[:, 0], data[:, 1]
    # Vectorised residuals replace the per-row Python loop.
    residuals = y - (m * x + b)
    return float(np.mean(residuals ** 2))

def gradient_descent_runner(points, starting_b, starting_m, learning_rate, num_iterations):
    """Run gradient descent for a fixed number of steps and return ``[b, m]``.

    Parameters
    ----------
    points : array-like
        Observations passed on to ``step_gradient`` (column 0 is x, column 1 is y).
    starting_b : float
        Initial intercept.
    starting_m : float
        Initial slope.
    learning_rate : float
        Step size forwarded to ``step_gradient`` on every iteration.
    num_iterations : int
        Number of gradient steps to perform.

    Returns
    -------
    list
        ``[b, m]`` after ``num_iterations`` updates.
    """
    # Convert once up front: the data does not change between iterations, so
    # rebuilding the array inside the loop only repeated the same copy.
    data = np.array(points)
    b, m = starting_b, starting_m

    for _ in range(num_iterations):
        # Replace b and m with the values produced by one gradient step.
        b, m = step_gradient(b, m, data, learning_rate)
    return [b, m]

def step_gradient(b_current, m_current, points, learningRate):
    """Perform one gradient-descent update of the line ``y = m*x + b``.

    The gradients are the partial derivatives of the mean squared error with
    respect to ``b`` and ``m``, evaluated at the current parameters.

    Parameters
    ----------
    b_current : float
        Current intercept.
    m_current : float
        Current slope.
    points : array-like of shape (n, 2)
        One row per observation; column 0 holds x and column 1 holds y.
        Any array-like (ndarray, list of pairs, ...) is accepted.
    learningRate : float
        Multiplier applied to each gradient before it is subtracted.

    Returns
    -------
    list
        ``[new_b, new_m]`` as floats.
    """
    data = np.asarray(points, dtype=float)
    if data.size == 0:
        # With no observations both gradients are zero, so the parameters
        # are returned unchanged (same result as looping over nothing).
        return [float(b_current), float(m_current)]

    x, y = data[:, 0], data[:, 1]
    residuals = y - (m_current * x + b_current)

    # d(MSE)/db = -(2/N) * sum(residual);  d(MSE)/dm = -(2/N) * sum(x * residual)
    b_gradient = -2.0 * np.mean(residuals)
    m_gradient = -2.0 * np.mean(x * residuals)

    # Move against the gradient, scaled by the learning rate.
    new_b = b_current - learningRate * b_gradient
    new_m = m_current - learningRate * m_gradient
    return [float(new_b), float(new_m)]



In [233]:

    
# Report the error of the starting line, fit with gradient descent, then
# report the fitted parameters together with their error.
initial_error = compute_error_for_line_given_points(initial_b, initial_m, points)
print('starting gradient descent at b = {0}, m = {1}, error = {2}'.format(initial_b, initial_m, initial_error))

b, m = gradient_descent_runner(points, initial_b, initial_m, learning_rate, num_iterations)

final_error = compute_error_for_line_given_points(b, m, points)
print('ending point after {0} iterations at b = {1}, m = {2}, error = {3}'.format(num_iterations, b, m, final_error))









    



starting gradient descent at b = 0, m = 0, error = 5565.107834483211
ending point after 1000 iterations at b = 0.08893651993741346, m = 1.4777440851894448, error = 112.61481011613473



In [232]:

    
# Display the fitted intercept and slope as a tuple.
(b, m)









    Out[232]:





(0.088936519937413458, 1.4777440851894448)



In [245]:

    
# Fit scikit-learn's linear regression on the same data for comparison with the
# hand-written gradient descent: column 0 is the feature, column 1 the target.
x_vals, y_vals = df[[0]], df[[1]]

# fit() returns the fitted estimator, so creation and training can be chained.
live_model = LinearRegression().fit(x_vals, y_vals)

# Show the observations with the fitted line drawn over them.
plt.scatter(x_vals, y_vals)
plt.plot(x_vals, live_model.predict(x_vals))
plt.show()



In [246]:

    
run gradient_descent.py









    



           0          1
0  32.502345  31.707006
1  53.426804  68.777596
2  61.530358  62.562382
3  47.475640  71.546632
4  59.813208  87.230925






    












    



starting gradient descent at b = 0, m = 0, error = 5565.107834483211
ending point at b = 0.08893651993741346, m = 1.4777440851894448, error = 112.61481011613473



In [ ]:

	Data	Outcome	Predictions	Pred_Error
0	6.1101	17.5920	3.393774	-14.198226
1	5.5277	9.1302	2.698951	-6.431249
2	8.5186	13.6620	6.267196	-7.394804
3	7.0032	11.8540	4.459272	-7.394728
4	5.8598	6.8233	3.095158	-3.728142

	Country	Life expectancy	BMI
0	Afghanistan	52.8	20.62058
1	Albania	76.8	26.44657
2	Algeria	75.5	24.59620
3	Andorra	84.6	27.63048
4	Angola	56.7	22.25083

	0	1
0	32.502345	31.707006
1	53.426804	68.777596
2	61.530358	62.562382
3	47.475640	71.546632
4	59.813208	87.230925