In [172]:
#libraries used
import pandas as pd
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt
%matplotlib inline

Week 1: Types of machine learning, when to use machine learning, neural network architecture

Linear regression demo

brain_body.txt is a dataset of brain and body sizes.


In [72]:
# Load the fixed-width brain/body table. Each column is selected with a
# list so it stays a one-column DataFrame (2-D), which is what the
# estimator is fitted on.
df = pd.read_fwf('linear_regression_demo/brain_body.txt')
x_values = df.loc[:, ['Brain']]
y_values = df.loc[:, ['Body']]

# Fit a least-squares line of Body against Brain; fit() returns the
# estimator itself, so construction and fitting can be chained.
body_reg = linear_model.LinearRegression().fit(x_values, y_values)

# Draw the raw observations and overlay the fitted line.
plt.scatter(x=x_values, y=y_values)
plt.plot(x_values, body_reg.predict(x_values))
plt.show()


Siraj's Week 1 challenge

The weekly challenge is to make a prediction of life expectancy from BMI at birth.

The challenge for this video is to use scikit-learn to create a line of best fit for the included 'challenge_dataset'. Then, make a prediction for an existing data point and see how close it matches up to the actual value. Print out the error you get.


In [109]:
# Load the challenge data. Because explicit column names are supplied,
# pandas treats every row of the file as data.
df = pd.read_csv(
    'linear_regression_demo/challenge_dataset.txt',
    names=['Data', 'Outcome'],
)
x_values = df.loc[:, ['Data']]
y_values = df.loc[:, ['Outcome']]

# Fit the line of best fit (Outcome as a function of Data).
reg = linear_model.LinearRegression().fit(x_values, y_values)

# Scatter the observations and overlay the fitted line.
plt.scatter(x=x_values, y=y_values)
plt.plot(x_values, reg.predict(x_values))
plt.show()


So now we have a simple model trained on the dataset. Now to make a prediction.


In [115]:
# Store the model's fitted value for every row, then the signed error
# (prediction minus observed outcome) alongside it.
df["Predictions"] = reg.predict(x_values)
df["Pred_Error"] = df["Predictions"].sub(df["Outcome"])
df.head(n=5)


Out[115]:
Data Outcome Predictions Pred_Error
0 6.1101 17.5920 3.393774 -14.198226
1 5.5277 9.1302 2.698951 -6.431249
2 8.5186 13.6620 6.267196 -7.394804
3 7.0032 11.8540 4.459272 -7.394728
4 5.8598 6.8233 3.095158 -3.728142

In [114]:
# Average absolute prediction error over all rows.
# Vectorized with pandas instead of accumulating in a Python loop:
# abs() drops the sign of each error and mean() averages them.
mean_abs_error = df['Pred_Error'].abs().mean()
mean_abs_error


Out[114]:
2.194245398827007

Linear Regression Quiz


In [124]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# Load the BMI / life-expectancy CSV into the variable the quiz expects.
bmi_life_data = pd.read_csv(filepath_or_buffer='bmi_and_life_expectancy.csv')
# Print the (rows, columns) shape, then preview the first five rows.
print(bmi_life_data.shape)
bmi_life_data.head(n=5)


(163, 3)
Out[124]:
Country Life expectancy BMI
0 Afghanistan 52.8 20.62058
1 Albania 76.8 26.44657
2 Algeria 75.5 24.59620
3 Andorra 84.6 27.63048
4 Angola 56.7 22.25083

In [136]:
# Build the quiz model: life expectancy as a linear function of BMI.
# Both columns are selected with a list so they remain 2-D DataFrames.
x_vals = bmi_life_data.loc[:, ['BMI']]
y_vals = bmi_life_data.loc[:, ['Life expectancy']]

# fit() returns the estimator, so bmi_life_model is the fitted model.
bmi_life_model = LinearRegression().fit(x_vals, y_vals)

# Plot the observations with the fitted line drawn over them.
plt.scatter(x=x_vals, y=y_vals)
plt.plot(x_vals, bmi_life_model.predict(x_vals))
plt.show()



In [138]:
# Make a prediction using the model
# TODO: Predict life expectancy for a BMI value of 21.07931
# predict() expects 2-D input of shape (n_samples, n_features); a bare
# scalar is rejected by current scikit-learn. Pass the single BMI value as
# a one-row frame using the same 'BMI' column the model was fitted on.
laos_life_exp = bmi_life_model.predict(pd.DataFrame({'BMI': [21.07931]}))
laos_life_exp


Out[138]:
array([[ 60.31564716]])

Programming Quiz: Multiple Linear Regression

In this quiz, you'll be using the Boston house-prices dataset. The dataset consists of 13 features of 506 houses and their median value in $1000's. You'll fit a model on the 13 features to predict on the value of houses.


In [157]:
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_boston

# Load the data from the Boston house-prices dataset.
# NOTE(review): load_boston is believed to be deprecated and then removed in
# newer scikit-learn releases — confirm against the installed version before
# relying on this cell in a fresh environment.
boston_data = load_boston()
# Print the first sample's feature vector followed by its target value.
print(boston_data.data[0], boston_data.target[0])


[  6.32000000e-03   1.80000000e+01   2.31000000e+00   0.00000000e+00
   5.38000000e-01   6.57500000e+00   6.52000000e+01   4.09000000e+00
   1.00000000e+00   2.96000000e+02   1.53000000e+01   3.96900000e+02
   4.98000000e+00] 24.0

In [162]:
# Feature matrix and target vector from the loaded dataset.
x, y = boston_data['data'], boston_data['target']

# Make and fit the linear regression model; fit() returns the estimator
# itself, so `model` is the fitted model.
model = LinearRegression().fit(x, y)

# One house described by its 13 feature values, wrapped in an outer list so
# predict() receives a single-row 2-D input.
sample_house = [[0.22969, 0.0, 10.59, 0.0, 0.489, 6.326, 52.5,
                 4.3549, 4.0, 277.0, 18.6, 394.87, 10.97]]

# Predict the value for the sample house and print it.
prediction = model.predict(sample_house)
print(prediction)


[ 23.68420569]

Siraj's Linear Regression live course

Siraj's code is here.

This is a dataset of student test scores and the number of hours they studied. Intuitively, there must be a relationship, right? The more you study, the better your test scores should be. We're going to use linear regression to demonstrate this relationship.


In [181]:
# Step 1 - collect our data.
# header=None means no row is used as a header, so the columns are
# labelled 0 and 1.
df = pd.read_csv(filepath_or_buffer='linear_regression_live/data.csv', header=None)
df.head(n=5)


Out[181]:
0 1
0 32.502345 31.707006
1 53.426804 68.777596
2 61.530358 62.562382
3 47.475640 71.546632
4 59.813208 87.230925

In [192]:
# Load the same CSV with NumPy, splitting fields on commas, and show the
# first five rows.
points = np.genfromtxt(fname='linear_regression_live/data.csv', delimiter=',')
points[0:5]


Out[192]:
array([[ 32.50234527,  31.70700585],
       [ 53.42680403,  68.77759598],
       [ 61.53035803,  62.5623823 ],
       [ 47.47563963,  71.54663223],
       [ 59.81320787,  87.23092513]])

In [235]:
# Scatter column 0 against column 1 to eyeball the relationship.
plt.scatter(x=df[0], y=df[1])
plt.show()


Step 2 - define our hyperparameters for the equation y = mx + b (slope-intercept form). How fast should our model converge?


In [185]:
# Gradient-descent settings: step size, starting intercept (b) and slope (m),
# and the number of update steps to run.
learning_rate = 1e-4
initial_b, initial_m = 0, 0
num_iterations = 1000

Step 3: Train the model


In [230]:
def compute_error_for_line_given_points(b, m, points):
    """Return the mean squared error of the line y = m*x + b over ``points``.

    ``points`` is read as ``points[i, 0]`` (x) and ``points[i, 1]`` (y), so it
    must support 2-D integer indexing (for example a NumPy array).

    The sum of squared residuals is divided by ``float(len(points))``, so an
    empty ``points`` raises ``ZeroDivisionError``.
    """
    n_points = len(points)
    squared_error_sum = 0
    idx = 0
    while idx < n_points:
        # Squared vertical distance between the observed y and the line.
        predicted = m * points[idx, 0] + b
        squared_error_sum += (points[idx, 1] - predicted) ** 2
        idx += 1
    return squared_error_sum / float(n_points)

def gradient_descent_runner(points, starting_b, starting_m, learning_rate, num_iterations):
    """Run ``num_iterations`` gradient-descent updates and return ``[b, m]``.

    Starts from ``starting_b`` / ``starting_m`` and repeatedly replaces
    ``(b, m)`` with the result of ``step_gradient`` using ``learning_rate``.

    Returns:
        A two-element list ``[b, m]`` holding the final intercept and slope.
    """
    # Convert once up front: np.array() copies its input, so doing this
    # inside the loop re-copied the whole dataset on every iteration.
    points = np.array(points)
    b = starting_b
    m = starting_m

    for _ in range(num_iterations):
        # Replace b and m with the values after one more update step.
        b, m = step_gradient(b, m, points, learning_rate)
    return [b, m]

def step_gradient(b_current, m_current, points, learningRate):
    """Perform one gradient-descent update for the line y = m*x + b.

    For every row of ``points`` (x at ``points[i, 0]``, y at ``points[i, 1]``)
    the residual ``y - (m_current * x + b_current)`` contributes
    ``-(2/N) * residual`` to the intercept gradient and
    ``-(2/N) * x * residual`` to the slope gradient, where ``N`` is the number
    of points. Each parameter is then moved against its gradient, scaled by
    ``learningRate``.

    Returns:
        A two-element list ``[new_b, new_m]``.
    """
    count = len(points)
    N = float(count)
    grad_b = 0
    grad_m = 0
    for row in range(count):
        x_val = points[row, 0]
        y_val = points[row, 1]
        # Residual of the current line at this point; shared by both
        # partial derivatives of the error function.
        residual = y_val - ((m_current * x_val) + b_current)
        # -(2/N) is evaluated inside the loop so that an empty ``points``
        # never divides by zero.
        grad_b += -(2 / N) * residual
        grad_m += -(2 / N) * x_val * residual
    # Step each parameter against its accumulated gradient.
    return [
        b_current - (learningRate * grad_b),
        m_current - (learningRate * grad_m),
    ]

In [233]:
# Report the error at the starting parameters, run gradient descent, then
# report the final parameters and their error.
print(
    'starting gradient descent at b = {0}, m = {1}, error = {2}'.format(
        initial_b,
        initial_m,
        compute_error_for_line_given_points(initial_b, initial_m, points),
    )
)
b, m = gradient_descent_runner(points, initial_b, initial_m, learning_rate, num_iterations)
print(
    'ending point after {0} iterations at b = {1}, m = {2}, error = {3}'.format(
        num_iterations,
        b,
        m,
        compute_error_for_line_given_points(b, m, points),
    )
)


starting gradient descent at b = 0, m = 0, error = 5565.107834483211
ending point after 1000 iterations at b = 0.08893651993741346, m = 1.4777440851894448, error = 112.61481011613473

In [232]:
# Display the fitted intercept and slope as a tuple.
(b, m)


Out[232]:
(0.088936519937413458, 1.4777440851894448)

In [245]:
# Fit scikit-learn's LinearRegression on the same two columns.
# Selecting with a list keeps each column as a one-column DataFrame.
x_vals = df.loc[:, [0]]
y_vals = df.loc[:, [1]]

# fit() returns the estimator, so live_model is the fitted model.
live_model = LinearRegression().fit(x_vals, y_vals)

# Scatter the observations and overlay the fitted line.
plt.scatter(x=x_vals, y=y_vals)
plt.plot(x_vals, live_model.predict(x_vals))
plt.show()



In [246]:
run gradient_descent.py


           0          1
0  32.502345  31.707006
1  53.426804  68.777596
2  61.530358  62.562382
3  47.475640  71.546632
4  59.813208  87.230925
starting gradient descent at b = 0, m = 0, error = 5565.107834483211
ending point at b = 0.08893651993741346, m = 1.4777440851894448, error = 112.61481011613473

In [ ]: