In [1]:
# Import the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
In [9]:
# The dataset has two columns: YearsExperience and Salary
In [3]:
# Import the dataset
dataset = pd.read_csv('datasets/Salary_Data.csv')
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 1].values
In [8]:
dataset.head()
Out[8]:
In [5]:
# Split the dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1/3, random_state=0)
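As a quick sanity check (a minimal optional sketch, not part of the original notebook), we can confirm that roughly one third of the rows ended up in the test set:
In [ ]:
# Sanity check: test_size=1/3 should leave about a third of the rows for testing
print(X_train.shape, X_test.shape)
print(Y_train.shape, Y_test.shape)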
In [6]:
# Fit simple linear regression to the training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
Out[6]:
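Since simple linear regression fits the line Salary = intercept + slope * YearsExperience, the learned parameters can be inspected directly (a small optional sketch, not part of the original notebook):
In [ ]:
# Inspect the fitted line: Salary = intercept_ + coef_[0] * YearsExperience
print('Intercept:', regressor.intercept_)
print('Slope (salary increase per year of experience):', regressor.coef_[0])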
In [7]:
# Predict the test set results
Y_pred = regressor.predict(X_test)
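Before plotting, a side-by-side comparison of actual and predicted test salaries gives a quick feel for the errors (an optional sketch, not part of the original notebook):
In [ ]:
# Compare actual vs. predicted salaries on the test set
comparison = pd.DataFrame({'Actual': Y_test, 'Predicted': Y_pred})
print(comparison)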
In [10]:
# Visualize the training set results
plt.scatter(X_train, Y_train, color = 'red')
plt.plot(X_train, regressor.predict(X_train), color = 'blue')
plt.title('Salary vs. Experience [Training Set]')
plt.xlabel('Years of experience')
plt.ylabel('Salary')
plt.show()
In [11]:
# In the figure above, the red dots are the actual training salaries,
# while the blue line shows the salaries predicted by the regression model.
# The line fits the training data well.
Now, let's see how this model predicts salaries for new observations.
In [12]:
# Visualize the test set results
plt.scatter(X_test, Y_test, color = 'red')
plt.plot(X_train, regressor.predict(X_train), color = 'blue')
plt.title('Salary vs. Experience [Test Set]')
plt.xlabel('Years of experience')
plt.ylabel('Salary')
plt.show()
# Note: we keep X_train in the plt.plot() call above; the regressor is already
# trained, so predicting on X_train or X_test draws the same fitted line.
In [14]:
# In the figure above, the blue line is the simple linear regression model
# fitted on the training set, and the red dots are the unseen test observations.
# The predictions track the actual test salaries closely, so the model
# generalizes reasonably well to new observations.
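The quality of the fit can also be quantified with standard sklearn metrics (a minimal sketch, not part of the original notebook, so no output is shown here):
In [ ]:
# Quantify the fit on the test set with R^2 and RMSE
from sklearn.metrics import r2_score, mean_squared_error
print('R^2 on test set:', r2_score(Y_test, Y_pred))
print('RMSE on test set:', np.sqrt(mean_squared_error(Y_test, Y_pred)))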