In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LassoCV, Lasso
from math import sqrt
import seaborn as sns
np.set_printoptions(suppress=True, precision=4)
plt.rcParams['figure.figsize'] = 10, 6
%matplotlib inline
In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/abulbasar/data/master/startups.csv")
df
Out[2]:
In [3]:
df.info()
There are 50 observations and 5 columns. Four columns - R&D Spend, Administration, Marketing Spend, and Profit - are numeric, and one - State - is categorical. There are 2 null values in the R&D Spend column and 3 in Marketing Spend.
Replace the null values with the median for the respective state.
In [4]:
df_null_idx = df[df.isnull().sum(axis = 1) > 0].index
df.iloc[df_null_idx]
Out[4]:
In [5]:
median_values = df.groupby("State")[["R&D Spend", "Marketing Spend"]].median()
median_values
Out[5]:
In [6]:
df["R&D Spend"] = df.apply(lambda row: median_values.loc[row["State"], "R&D Spend"] if np.isnan(row["R&D Spend"]) else row["R&D Spend"], axis = 1 )
df["Marketing Spend"] = df.apply(lambda row: median_values.loc[row["State"], "Marketing Spend"] if np.isnan(row["Marketing Spend"]) else row["Marketing Spend"], axis = 1 )
df.iloc[df_null_idx]
Out[6]:
In [7]:
# Check if there are any more null values.
df.isnull().sum()
Out[7]:
Let's see the distribution of Profit using a histogram and check for outliers using a boxplot.
In [8]:
plt.figure(figsize = (8, 6))
plt.subplot(2, 1, 1)
df.Profit.plot.hist(bins = 10, density = True)
df.Profit.plot.kde(title = "Histogram of Profit")
plt.subplot(2, 1, 2)
df.Profit.plot.box(vert = False, title = "Boxplot of Profit")
plt.tight_layout()
Profit has one outlier. We could take the log of Profit to tame the outlier before doing any prediction, but for now let's ignore it.
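For reference, such a transform would look like this (just a sketch; it is not used in the rest of the notebook):
In [ ]:
# np.log1p handles zeros safely; the transformed distribution is less skewed.
np.log1p(df.Profit).plot.hist(bins = 10, title = "Histogram of log(1 + Profit)")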
Let's plot the association between each pair of columns.
In [9]:
sns.pairplot(df)
Out[9]:
The pairplot displays only the numeric columns. Let's see how the average Profit varies by State.
In [10]:
df.groupby("State").Profit.mean().sort_values().plot.bar(title = "Avg Profit by State")
plt.xlabel("State")
plt.ylabel("Profit")
Out[10]:
Average Profit is highest in the state of Florida and lowest in California.
Let's create the y vector containing the outcome column.
In [11]:
y = df.Profit.values
y
Out[11]:
Create dummy variables for the categorical feature.
In [12]:
df_features = df.iloc[:, 0:4]
df_dummied = pd.get_dummies(df_features, columns=["State"], drop_first=True)
df_dummied.sample(10)
Out[12]:
The State column has been replaced by two indicator columns - one for Florida and one for New York. The first category (California) has been dropped to avoid a collinearity issue; a quick illustration follows.
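Without drop_first the three state indicators always sum to 1, which is perfectly collinear with the model intercept (a minimal check):
In [ ]:
all_dummies = pd.get_dummies(df_features, columns = ["State"]).filter(like = "State")
all_dummies.sum(axis = 1).unique()   # every row sums to exactly 1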
Now, let's create the X feature matrix and y outcome vector.
In [13]:
X = df_dummied.values
X[0, :]
Out[13]:
Let's normalize the feature values to bring them to a similar scale.
In [14]:
scaler = StandardScaler()
X_std = scaler.fit_transform(X)
pd.DataFrame(X_std).head()
Out[14]:
Split the X and y into training and test sets.
In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y,
test_size = 0.3, random_state = 100)
In [16]:
print("Training set: ", X_train.shape, y_train.shape)
In [17]:
print("Test set: ", X_test.shape, y_test.shape)
Ratio of the training set size to the full dataset:
In [18]:
X_train.shape[0] / df.shape[0]
Out[18]:
Fit a linear regression model.
In [19]:
lr = LinearRegression()
lr.fit(X_train, y_train)
Out[19]:
In [20]:
lr.intercept_, lr.coef_
Out[20]:
Looking at the coefficients, we can conclude that R&D Spend has the highest influence on the outcome variable.
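To make that easier to read, we can line the coefficients up with the feature names (a quick sketch; since the features were standardized, the coefficient magnitudes are directly comparable):
In [ ]:
coef_df = pd.DataFrame({"feature": df_dummied.columns, "coef": lr.coef_})
coef_df.reindex(coef_df.coef.abs().sort_values(ascending = False).index)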
Predict the outcome based on the model
In [21]:
y_test_pred = lr.predict(X_test)
In [22]:
output = pd.DataFrame({"actual": y_test, "prediction": y_test_pred})
output["error"] = output.actual - output.prediction
output
Out[22]:
The simplest prediction model would have been to always predict the average.
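To make that baseline concrete, here is a quick comparison (a sketch, using the training-set mean as the constant prediction):
In [ ]:
baseline_pred = np.full_like(y_test, y_train.mean())
print("Baseline rmse:", sqrt(mean_squared_error(y_test, baseline_pred)))
print("Model rmse:   ", sqrt(mean_squared_error(y_test, y_test_pred)))
Now let's see how the model did overall against one feature.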
In [23]:
X_test_inv = scaler.inverse_transform(X_test)
plt.scatter(X_test_inv[:, 0], y_test, alpha = 0.3, c = "blue", label = "Actual")
plt.scatter(X_test_inv[:, 0], y_test_pred, c = "red", label = "Predicted")
plt.xlabel("R&D Spend")
plt.ylabel("Profit")
plt.title("Profit Actual vs Estimate")
plt.legend()
Out[23]:
In [24]:
np.mean((y_test_pred - y_test) ** 2)
Out[24]:
In [25]:
y_train_pred = lr.predict(X_train)
Compare the root mean squared error (RMSE) on the test dataset against that on the training dataset.
In [26]:
print("Test rmse: ", sqrt(mean_squared_error(y_test, y_test_pred)),
"\nTraining rmse:", sqrt(mean_squared_error(y_train, y_train_pred)))
The R2 score has a maximum value of 1; a negative R2 means the model performs worse than simply predicting the mean.
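As a quick illustration (a sketch), predicting the training-set mean for every test point gives an R2 close to zero, and anything worse than that goes negative:
In [ ]:
r2_score(y_test, np.full_like(y_test, y_train.mean()))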
In [27]:
r2_score(y_test, y_test_pred), r2_score(y_train, y_train_pred)
Out[27]:
On the training set, both the RMSE and R2 scores are naturally better than those on the test dataset.
Let's calculate R2 score manually.
In [28]:
SSR = np.sum((y_train - y_train_pred) ** 2) # Sum of squared residuals
SST = np.sum((y_train - np.mean(y_train)) ** 2) # Total sum of squares
R2 = 1 - SSR/SST
R2
Out[28]:
R2 can also be viewed as 1 - MSE / Var(y).
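Since MSE = SSR / n and Var(y) = SST / n, the same value can be computed in one line:
In [ ]:
1 - mean_squared_error(y_train, y_train_pred) / np.var(y_train)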
In [29]:
from sklearn.feature_selection import f_regression
In [30]:
_, p_vals = f_regression(X_train, y_train)
p_vals
Out[30]:
In [31]:
pd.DataFrame({"feature": df_dummied.columns, "p_value": p_vals})
Out[31]:
The p-value indicates the significance of each feature: a p-value < 0.05 means the corresponding feature is statistically significant. We can rebuild the model, excluding the non-significant features one by one, until all remaining features are significant.
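A sketch of one elimination step (drop the least significant feature, refit, and repeat until every remaining p-value is below 0.05):
In [ ]:
drop_idx = np.argmax(p_vals)                      # feature with the largest p-value
keep = np.ones(X_train.shape[1], dtype = bool)
keep[drop_idx] = False
lr_reduced = LinearRegression().fit(X_train[:, keep], y_train)
print("Dropped feature:", df_dummied.columns[drop_idx])
print("Test rmse:", sqrt(mean_squared_error(y_test, lr_reduced.predict(X_test[:, keep]))))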
In [32]:
df = pd.read_csv("/data/Combined_Cycle_Power_Plant.csv")
df.head()
Out[32]:
In [33]:
X = df.iloc[:, 0:4].values
y = df.PE.values
In [34]:
sns.pairplot(df)
Out[34]:
In [35]:
scaler = StandardScaler()
X_std = scaler.fit_transform(X)
In [36]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size = 0.3, random_state = 1)
In [37]:
def rmse(y_true, y_pred):
    return sqrt(mean_squared_error(y_true, y_pred))
In [38]:
lr = LinearRegression()  # features are already standardized above
lr.fit(X_train, y_train)
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)
rmse(y_test, y_test_pred)
Out[38]:
In [39]:
from scipy import stats
In [40]:
residuals = y_test - y_test_pred
plt.figure(figsize=(15, 6))
plt.subplot(1, 2, 1)
plt.scatter(y_test, residuals)
plt.xlabel("y_test")
plt.ylabel("Residuals")
plt.hlines([0], xmin = 420, xmax = 500, linestyles = "dashed")
plt.subplot(1, 2, 2)
stats.probplot(residuals, plot=plt)
Out[40]:
The residual plot shows outliers at the lower end of the y_test values. The Q-Q plot shows that the residuals are not normally distributed, indicating non-linearity in the model.
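As a quick numeric check to complement the plots (a sketch), a Shapiro-Wilk test on the residuals; a very small p-value is evidence against normality.
In [ ]:
stat, p_value = stats.shapiro(residuals)
print("Shapiro-Wilk statistic: %.4f, p-value: %.4g" % (stat, p_value))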
In [41]:
poly = PolynomialFeatures(degree=2)
X = df.iloc[:, 0:4].values
X_poly = poly.fit_transform(X)
X_poly_train, X_poly_test, y_train, y_test = train_test_split(X_poly, y, test_size = 0.3, random_state = 100)
X_poly_train_std = scaler.fit_transform(X_poly_train)
X_poly_test_std = scaler.transform(X_poly_test)
pd.DataFrame(X_poly_train_std).head()
Out[41]:
In [42]:
lr.fit(X_poly_train_std, y_train)
print("Train rmse: ", rmse(y_train, lr.predict(X_poly_train_std)))
print("Test rmse: ", rmse(y_test, lr.predict(X_poly_test_std)))
In [43]:
print(lr.intercept_, lr.coef_)
Polynomial regression generally suffers from overfitting. Let's regularize the model using Lasso.
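For reference, scikit-learn's Lasso minimizes $\frac{1}{2 n_{samples}} \lVert y - Xw \rVert_2^2 + \alpha \lVert w \rVert_1$, so a larger alpha pushes more coefficients exactly to zero.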
In [44]:
lasso = Lasso(alpha=0.03, max_iter=10000, random_state=100)
lasso.fit(X_poly_train_std, y_train)
print("Train rmse: ", rmse(y_train, lasso.predict(X_poly_train_std)))
print("Test rmse: ", rmse(y_test, lasso.predict(X_poly_test_std)))
print(lasso.intercept_, lasso.coef_)
Let's compute the cross-validation score, which is more reliable in the sense that every observation is used for both training and testing.
In [45]:
X_poly_std = scaler.fit_transform(X_poly)
lasso = Lasso(alpha=0.03, max_iter=10000, random_state=100)
scores = cross_val_score(lasso, X_poly_std, y, cv = 10, scoring="neg_mean_squared_error")
scores = np.sqrt(-scores)
print("RMSE scores", scores)
print("Mean rmse: ", np.mean(scores))
In [46]:
from sklearn.pipeline import Pipeline
In [47]:
pipeline = Pipeline(steps = [
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("scaler", StandardScaler()),
    ("lasso", Lasso(alpha=0.03, max_iter=10000, random_state=1))
])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)
pipeline.fit(X_train, y_train)
rmse(y_test, pipeline.predict(X_test))
Out[47]:
LassoCV helps find the best alpha. We could also use model tuning techniques to find the best alpha.
In [48]:
# Find best alpha
lassocv = LassoCV(cv = 10, max_iter=10000, tol=1e-5)
lassocv.fit(X_poly_std, y)
print("Lassocv alpha: ", lassocv.alpha_)
# Apply the best alpha to find cross validation score
lasso = Lasso(alpha = lassocv.alpha_, max_iter=10000, random_state=100)
scores = cross_val_score(lasso, X_poly_std, y, cv = 10, scoring="neg_mean_squared_error")
print("Mean rmse: ", np.mean(np.sqrt(-scores)))
Look at the coefficient values: many of them are now exactly zero, making the model parsimonious and hence more robust - that is, less prone to overfitting.
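A quick check of that sparsity, using the lassocv model fitted above:
In [ ]:
print("Non-zero coefficients:", np.sum(lassocv.coef_ != 0), "out of", lassocv.coef_.size)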
Let's plot how the coefficients reach 0 as the alpha value varies.
In [49]:
coefs = []
alphas = 10 ** np.linspace(-5, 5, 20)
for alpha in alphas:
    lasso = Lasso(alpha=alpha, max_iter=10000, tol=1e-5, random_state=100)
    lasso.fit(X_poly_std, y)
    coefs.append(lasso.coef_)
plt.plot(alphas, coefs)
plt.xscale("log")
plt.xlabel("Alpha (penalty term on the coefficients)")
plt.ylabel("Coefficients of the features")
Out[49]:
From this graph, which alpha value should we select? That question can be answered by checking which alpha gives the best performance (RMSE, for example). LassoCV does that for us, or we can use model tuning with grid search (a brief sketch follows; grid search is explained in more detail later).
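A minimal sketch of that grid search, reusing the pipeline defined earlier (the alpha grid here is an arbitrary choice):
In [ ]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipeline,
                    param_grid = {"lasso__alpha": 10 ** np.linspace(-3, 1, 9)},
                    cv = 5, scoring = "neg_mean_squared_error")
grid.fit(X, y)
print("Best alpha:", grid.best_params_["lasso__alpha"])
print("Mean rmse at best alpha:", np.sqrt(-grid.best_score_))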
In [52]:
poly = PolynomialFeatures(degree=2)
X = df.iloc[:, 0:4].values
X_poly = poly.fit_transform(X)
X_poly_train, X_poly_test, y_train, y_test = train_test_split(X_poly, y, test_size = 0.3, random_state = 100)
X_poly_train_std = scaler.fit_transform(X_poly_train)
X_poly_test_std = scaler.transform(X_poly_test)
In [53]:
gbm = xgb.XGBRegressor(max_depth=10, learning_rate=0.1, n_estimators=100,
                       objective='reg:squarederror', booster='gbtree',
                       reg_alpha=0.01, reg_lambda=1, random_state=0)
gbm.fit(X_poly_train_std, y_train)
print("rmse:", rmse(y_test, gbm.predict(X_poly_test_std)))
In [55]:
param = {
    'verbosity': 0,
    'objective': 'reg:squarederror',
    'booster': 'gbtree',
    'alpha': 0.01,
    'lambda': 1
}
dtrain = xgb.DMatrix(X_poly_train_std, label=y_train)
dtest = xgb.DMatrix(X_poly_test_std, label=y_test)
watchlist = [(dtrain, 'train'), (dtest, 'eval')]
num_round = 100
bst = xgb.train(param, dtrain, num_round, watchlist, verbose_eval=False)
print("rmse:", rmse(y_test, bst.predict(dtest)))
fig, ax = plt.subplots(figsize=(8, 10))
xgb.plot_importance(bst, ax=ax)
Out[55]:
In [ ]: