In [1]:
# Import the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
In [9]:
# The dataset has two columns: YearsExperience and Salary
In [3]:
# Import the dataset
dataset = pd.read_csv('datasets/Salary_Data.csv')
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 1].values
In [8]:
dataset.head()
Out[8]:
In [5]:
# Split the dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1/3, random_state=0)
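As a quick sanity check (a minimal optional sketch, not part of the original notebook), we can confirm that roughly one third of the rows ended up in the test set:
In [ ]:
# Sanity check: test_size=1/3 should leave about a third of the rows for testing
print(X_train.shape, X_test.shape)
print(Y_train.shape, Y_test.shape)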
In [6]:
# Fit simple linear regression to the training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
Out[6]:
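Since simple linear regression fits the line Salary = intercept + slope * YearsExperience, the learned parameters can be inspected directly (a small optional sketch, not part of the original notebook):
In [ ]:
# Inspect the fitted line: Salary = intercept_ + coef_[0] * YearsExperience
print('Intercept:', regressor.intercept_)
print('Slope (salary increase per year of experience):', regressor.coef_[0])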
In [7]:
# Predict the test set results
Y_pred = regressor.predict(X_test)
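Before plotting, a side-by-side comparison of actual and predicted test salaries gives a quick feel for the errors (an optional sketch, not part of the original notebook):
In [ ]:
# Compare actual vs. predicted salaries on the test set
comparison = pd.DataFrame({'Actual': Y_test, 'Predicted': Y_pred})
print(comparison)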
In [10]:
# Visualize the training set results
plt.scatter(X_train, Y_train, color = 'red')
plt.plot(X_train, regressor.predict(X_train), color = 'blue')
plt.title('Salary vs. Experience [Training Set]')
plt.xlabel('Years of experience')
plt.ylabel('Salary')
plt.show()
In [11]:
# In the figure above, the red dots are the actual training salaries,
# while the blue line shows the salaries predicted by the regression model.
# The line fits the training data well.
Now, let's see how this model predicts salaries for new observations.
In [12]:
# Visualize the test set results
plt.scatter(X_test, Y_test, color = 'red')
plt.plot(X_train, regressor.predict(X_train), color = 'blue')
plt.title('Salary vs. Experience [Test Set]')
plt.xlabel('Years of experience')
plt.ylabel('Salary')
plt.show()
# Note: we keep X_train in the plt.plot() call above; the regressor is already
# trained, so predicting on X_train or X_test draws the same fitted line.
In [14]:
# In the figure above, the blue line is the simple linear regression model
# fitted on the training set, and the red dots are the unseen test observations.
# The predictions track the actual test salaries closely, so the model
# generalizes reasonably well to new observations.
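The quality of the fit can also be quantified with standard sklearn metrics (a minimal sketch, not part of the original notebook, so no output is shown here):
In [ ]:
# Quantify the fit on the test set with R^2 and RMSE
from sklearn.metrics import r2_score, mean_squared_error
print('R^2 on test set:', r2_score(Y_test, Y_pred))
print('RMSE on test set:', np.sqrt(mean_squared_error(Y_test, Y_pred)))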