Simple_Linear_Regression


In [1]:
# Import the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


/opt/conda/lib/python3.5/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')
/opt/conda/lib/python3.5/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

In [9]:
# The dataset has two columns: YearsExperience and Salary.

In [3]:
# Import the dataset: read the salary CSV (path is relative to the working
# directory) into a DataFrame.
dataset = pd.read_csv('datasets/Salary_Data.csv')
# Feature matrix: every column except the last, kept as a 2-D array.
X = dataset.iloc[:, :-1].values
# Target vector: the column at position 1, as a 1-D array.
Y = dataset.iloc[:, 1].values

In [8]:
# Preview the first rows to check that the data loaded with the expected columns.
dataset.head()


Out[8]:
YearsExperience Salary
0 1.1 39343.0
1 1.3 46205.0
2 1.5 37731.0
3 2.0 43525.0
4 2.2 39891.0

In [5]:
# Split the dataset into training and test sets.
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. The fallback
# keeps the cell working on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

# Hold out one third of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1/3, random_state=0)

In [6]:
# Fit simple linear regression to the training set.
from sklearn.linear_model import LinearRegression
# Default-configured estimator; the fitted model is reused by the prediction
# and plotting cells below.
regressor = LinearRegression()
# Fit on the training rows only; as the cell's last expression, the call's
# return value is what the notebook displays.
regressor.fit(X_train,Y_train)


Out[6]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [7]:
# Predict the test set results: salaries the fitted model assigns to the
# held-out X_test rows, for comparison against Y_test.
Y_pred = regressor.predict(X_test)

In [10]:
# Training-set view: observed salaries as red points, with the fitted
# regression line drawn in blue over the same inputs.
fig, ax = plt.subplots()
ax.scatter(X_train, Y_train, color='red')
ax.plot(X_train, regressor.predict(X_train), color='blue')
ax.set_title('Salary vs. Experience[Training Set]')
ax.set_xlabel('Year of experience')
ax.set_ylabel('Salary')
plt.show()



In [11]:
# In the figure above, the red dots are the actual salaries.
# The blue line shows the salaries predicted by the model.
# The model is a good fit.

Now, let's see how this model predicts salaries for new observations.


In [12]:
# Test-set view: held-out salaries as red points against the regression line.
fig, ax = plt.subplots()
ax.scatter(X_test, Y_test, color='red')
# The line is deliberately still drawn from X_train: the regressor was
# fitted on the training set, so this is the same line as in the training plot.
ax.plot(X_train, regressor.predict(X_train), color='blue')
ax.set_title('Salary vs. Experience[Test Set]')
ax.set_xlabel('Year of experience')
ax.set_ylabel('Salary')
plt.show()



In [14]:
# In the figure above, the blue line is our simple linear regression
# model, trained on the training set.
# The results are reasonably good on new observations.