In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
In [80]:
# Fix the RNG seed so the whole notebook is reproducible on Restart & Run All.
np.random.seed(42)

# Population X: 1000 integers drawn uniformly from [0, 100).
rand_1kx = np.random.randint(0, 100, 1000)
x_mean = np.mean(rand_1kx)
x_sd = np.std(rand_1kx)
x_mean
Out[80]:
In [81]:
# True population parameters for the linear model y = b0 + b1*x + eps.
pop_intercept = 30
pop_slope = 1.8

# Noise term: standard-normal draws scaled up, since without the boost
# the x-y correlation was unrealistically close to 1.
error_boost = 10
pop_error = error_boost * np.random.standard_normal(size=rand_1kx.size)

# Generate the population response.
y = pop_intercept + pop_slope * rand_1kx + pop_error
y_mean = np.mean(y)
y_sd = np.std(y)
y_mean
Out[81]:
Make a scatter plot of the X and y variables.
In [82]:
# seaborn >= 0.12 requires the data vectors to be passed by keyword.
sns.jointplot(x=rand_1kx, y=y)
Out[82]:
X follows a uniform distribution (and y is a linear function of X plus noise, so y itself is not uniform), while the error $\epsilon$ is generated from a standard normal distribution scaled by a boosting factor. Let us plot the error's histogram to verify its distribution.
In [83]:
# sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot(..., kde=True, stat="density") reproduces the old behaviour.
sns.histplot(pop_error, kde=True, stat="density")
Out[83]:
In [84]:
from sklearn.linear_model import LinearRegression
# Column-vector (n_samples, 1) versions of X and y for scikit-learn.
X_train_full = rand_1kx[:, np.newaxis]
y_train_full = y[:, np.newaxis]
In [85]:
y_train_full.shape  # sanity check: expect (1000, 1)
Out[85]:
In [86]:
# BUG FIX: this cell originally called lm.fit(X_train, y_train), but `lm`,
# `X_train` and `y_train` are only defined in LATER cells — it ran only
# thanks to stale kernel state. The `predicted_pop_*` names show the intent
# is a fit on the full population data prepared above, so create a fresh
# model here and fit it on X_train_full / y_train_full.
lm = LinearRegression()
lm.fit(X_train_full, y_train_full)
# print the linear model built
predicted_pop_slope = lm.coef_[0][0]
predicted_pop_intercept = lm.intercept_[0]
print("y = " + str(predicted_pop_slope) + "*X" + " + " + str(predicted_pop_intercept))
In [87]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Hold out a third of the population for testing; fix random_state so the
# split (and every result downstream of it) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    rand_1kx, y, test_size=0.33, random_state=42)
print(X_train.size)
lm = LinearRegression()
In [88]:
# Turn the 1-D split arrays into (n_samples, 1) column vectors for scikit-learn.
X_train = X_train[:, np.newaxis]
X_test = X_test[:, np.newaxis]
y_train = y_train[:, np.newaxis]
y_test = y_test[:, np.newaxis]
In [89]:
y_train.shape  # sanity check: expect (670, 1) with a 0.33 test split of 1000 rows
Out[89]:
In [90]:
# Fit the linear model on the 67% training subset and report the
# estimated equation.
lm.fit(X_train, y_train)
predicted_subset_slope = lm.coef_[0][0]
predicted_subset_intercept = lm.intercept_[0]
print("y = " + str(predicted_subset_slope) + "*X" + " + "
      + str(predicted_subset_intercept))
In [95]:
# Evaluate on the held-out test set; residuals = observed - predicted.
y_predicted = lm.predict(X_test)
residuals = y_test - y_predicted
Scatter plot of predicted (fitted) vs. actual y values.
In [96]:
# seaborn >= 0.12 requires keyword arguments; ravel() flattens the (n, 1)
# column vectors to the 1-D shape seaborn expects.
jax = sns.jointplot(x=y_test.ravel(), y=y_predicted.ravel())
jax.set_axis_labels(xlabel='Y', ylabel='Predicted Y')
Out[96]:
In [98]:
# sns.distplot was removed from seaborn; histplot(..., kde=True,
# stat="density") is the modern equivalent. ravel() gives the 1-D vector
# seaborn expects instead of the (n, 1) column.
dax = sns.histplot(residuals.ravel(), kde=True, stat="density")
dax.set_title('Distribution of residuals')
Out[98]:
In [99]:
# seaborn >= 0.12 requires keyword arguments; ravel() the (n, 1) columns.
jax = sns.jointplot(x=y_predicted.ravel(), y=residuals.ravel())
jax.set_axis_labels(xlabel='Predicted Y', ylabel='Residuals')
Out[99]:
In [100]:
# seaborn >= 0.12 requires keyword arguments; ravel() the (n, 1) columns.
jax = sns.jointplot(x=y_test.ravel(), y=residuals.ravel())
jax.set_axis_labels(xlabel='Y', ylabel='Residuals')
Out[100]:
In [101]:
# Collect the full population into a DataFrame for easy row sampling.
pop_df = pd.DataFrame({'x': rand_1kx, 'y': y})
pop_df.head()
Out[101]:
In [102]:
pop_df.shape  # sanity check: expect (1000, 2)
Out[102]:
In [103]:
# Repeatedly fit the model on small samples drawn (with replacement) from
# the population and record the fitted coefficients, so we can study the
# sampling distribution of the slope and intercept.
# NOTE: the loop-body indentation was lost in the notebook-to-text export
# and is reconstructed here.
sample_slopes = []
sample_intercepts = []
for _ in range(50):
    # perform a choice on the dataframe index (sampling with replacement)
    sample_index = np.random.choice(pop_df.index, size=50)
    # select the subset using that index
    sample_df = pop_df.iloc[sample_index]
    # convert to numpy and reshape to (n_samples, 1) for lm.fit
    sample_x = np.array(sample_df['x']).reshape(-1, 1)
    sample_y = np.array(sample_df['y']).reshape(-1, 1)
    lm.fit(X=sample_x, y=sample_y)
    sample_slopes.append(lm.coef_[0][0])
    sample_intercepts.append(lm.intercept_[0])
Plot the distribution of sample slopes and intercepts
In [104]:
mean_sample_slope = np.mean(sample_slopes)
mean_sample_intercept = np.mean(sample_intercepts)

fig, ax = plt.subplots(1, 2, figsize=(15, 6))

# plot sample slopes (distplot was removed from seaborn; histplot with
# kde/density reproduces it)
sns.histplot(sample_slopes, kde=True, stat="density", ax=ax[0])
ax[0].set_title('Distribution of sample slopes. Mean: '
                + str(round(mean_sample_slope, 2)))
ax[0].axvline(mean_sample_slope, color='black')

# plot sample intercepts (the original comment said "slopes" here — a
# copy-paste slip)
sns.histplot(sample_intercepts, kde=True, stat="density", ax=ax[1])
ax[1].set_title('Distribution of sample intercepts. Mean: '
                + str(round(mean_sample_intercept, 2)))
ax[1].axvline(mean_sample_intercept, color='black')
Out[104]:
In [114]:
# Compare the three estimation strategies against the true population
# parameters (pop_intercept, pop_slope).
print("Predicting using population")
print("----------------------------")
print(f"Error in intercept: {pop_intercept - predicted_pop_intercept}")
print(f"Error in slope: {pop_slope - predicted_pop_slope}")

print("\n\nPredicting using subset")
print("----------------------------")
print(f"Error in intercept: {pop_intercept - predicted_subset_intercept}")
print(f"Error in slope: {pop_slope - predicted_subset_slope}")

print("\n\nPredicting using a number of smaller samples")
print("------------------------------------------------")
print(f"Error in intercept: {pop_intercept - mean_sample_intercept}")
print(f"Error in slope: {pop_slope - mean_sample_slope}")
As we can see, the error is quite small in all 3 cases, especially for the slope. Averaging predictions over a number of smaller samples gives us a slope much closer to the population value.
For the intercept, the smallest error came from the prediction using the subset, which is still interesting, as prediction using the whole population yielded a poorer intercept!
In general, for really large datasets that cannot be held in system memory, we can apply the Central Limit Theorem to estimate the slope and intercept by averaging over a number of smaller samples.