In [13]:
import pandas as pd
import numpy as np
from sklearn import linear_model, datasets
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
In [28]:
# Synthetic data: y grows roughly linearly in i, with multiplicative noise
# drawn from randint(1, 20) (upper bound exclusive).
np.random.seed(42)  # seed so the notebook is reproducible under Restart & Run All
y = np.array([0.72 * i * np.random.randint(1, 20) + 2 * i for i in range(50)])
x = np.arange(len(y))  # 0..49, equivalent to the list comprehension but idiomatic
plt.scatter(x, y)
plt.xlabel('x')
plt.ylabel('y')
plt.title('Synthetic noisy linear data');
Now to fit a linear regression to it.
In [29]:
from sklearn.linear_model import LinearRegression

a = LinearRegression()
# Wrap the 1-D input in a DataFrame: scikit-learn estimators expect a 2-D
# feature matrix of shape (n_samples, n_features), not a 1-D array.
df = pd.DataFrame({'Input': x})
df.head()
Out[29]:
In [30]:
# Fit on the full dataset and overlay the fitted line on the raw points.
a.fit(df, y)
fitted_line = a.predict(df)
plt.scatter(x, y, label='Original Data')
plt.plot(fitted_line, color='r', label='Predictions')
plt.legend();
Split your data set into testing and training:
In [40]:
# Hold out a test set (default test_size=0.25). random_state pins the shuffle
# so the split -- and every downstream figure and claim -- is reproducible
# when the kernel is restarted.
x_train, x_test, y_train, y_test = train_test_split(df, y, random_state=42)
len(x_train), len(x_test), len(y_train), len(y_test)
Out[40]:
Here we can see the model does a relatively good job of predicting the data it was trained on:
In [48]:
a2 = LinearRegression()
a2.fit(x_train, y_train)
# In-sample predictions vs. the training targets.  Note: train_test_split
# shuffles rows, so the horizontal axis is sample order, not the original x.
train_preds = a2.predict(x_train)
plt.plot(train_preds, color='r', label='Predictions')
plt.plot(y_train, label="Actual", color="b")
plt.legend();
But it doesn't do a good job on the test data:
In [49]:
# Out-of-sample predictions vs. the held-out targets.
test_preds = a2.predict(x_test)
plt.plot(test_preds, color='r', label='Predictions')
plt.plot(y_test, label="Actual", color="b")
plt.legend();
In [10]:
# sklearn.datasets.load_boston was deprecated in scikit-learn 1.0 and removed
# in 1.2 (ethical concerns with the Boston dataset), so the original call
# raises on modern installs.  California housing is the recommended stand-in;
# the variable keeps its old name so later cells referencing `boston` still run.
boston = datasets.fetch_california_housing()
data = pd.DataFrame(boston.data, columns=boston.feature_names)
data.head()
Out[10]:
In [11]:
# 1-D regression target, aligned row-for-row with `data`.
target = boston.target
# Sanity check: should be (n_samples,) -- same length as `data`.
target.shape
Out[11]:
In [12]:
lm = LinearRegression()
lm.fit(data, target)
predicted = lm.predict(data)
plt.plot(predicted, label='predicted')
plt.plot(target, label='target')
plt.legend();
In [50]:
from sklearn.metrics import r2_score

# R^2 of the in-sample predictions (1.0 would be a perfect fit; this is a
# training-set score, so it overstates generalization performance).
r2_score(y_true=target, y_pred=predicted)
Out[50]:
In [ ]: