In [2]:
##Some code to run at the beginning of the file, to be able to show images in the notebook
##Don't worry about this cell
#Print the plots in this screen
%matplotlib inline
#Be able to plot images saved in the hard drive
from IPython.display import Image
#Make the notebook wider
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
import seaborn as sns
import pylab as plt
import pandas as pd
import numpy as np
import scipy.stats
import statsmodels.formula.api as smf
import sklearn
from sklearn.model_selection import train_test_split
In [3]:
# Load the companies data (tab-separated) and add log10-transformed size
# variables; the raw measures span several orders of magnitude.
df_companies = pd.read_csv("data/big3_position.csv",sep="\t")
for raw_col, log_col in [("Revenue", "log_revenue"),
                         ("Assets", "log_assets"),
                         ("Employees", "log_employees"),
                         ("MarketCap", "log_marketcap")]:
    df_companies[log_col] = np.log10(df_companies[raw_col])
# Keep only the columns of interest, restricted to industrial companies.
keep_cols = ["log_revenue","log_assets","log_employees","log_marketcap","Company_name","TypeEnt"]
df_companies = df_companies.loc[:, keep_cols]
df_companies = df_companies.loc[df_companies["TypeEnt"]=="Industrial company"]
# log10(0) produced -inf values; turn them into NaN and drop incomplete rows.
df_companies = df_companies.replace([np.inf,-np.inf],np.nan)
df_companies = df_companies.dropna()
df_companies.head()
Out[3]:
Correlation between variables
In [4]:
# Compute the correlation matrix.
# FIX: the frame still holds the string columns Company_name and TypeEnt;
# modern pandas (>= 2.0) raises unless numeric_only is requested explicitly.
corr = df_companies.corr(numeric_only=True)
# Generate a mask for the upper triangle (hide the upper triangle)
# FIX: np.bool was removed in NumPy 1.24 -- the builtin bool is the replacement.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, square=True, linewidths=.5, cmap="YlOrRd", vmin=0, vmax=1)
plt.show()
Revenue, employees and assets are highly correlated.
Let's imagine we want to explain the market capitalization in terms of the other variables.
In [5]:
# Explain (log) market capitalization with the three (log) size measures.
full_formula = 'log_marketcap ~ log_revenue + log_employees + log_assets'
mod = smf.ols(formula=full_formula, data=df_companies)
res = mod.fit()
print(res.summary())
In [6]:
#The residuals are fine
plt.figure(figsize=(4,3))
# FIX: seaborn >= 0.12 requires x/y to be passed by keyword to regplot.
# res.resid is exactly observed minus fitted (same as the original expression).
sns.regplot(x=res.predict(), y=res.resid)
Out[6]:
In [7]:
#Fit several nested models to see how each coefficient changes
from statsmodels.iolib.summary2 import summary_col
# FIX: the seven copy-pasted fits are now a formula list + comprehension,
# so adding/removing a specification is a one-line change.
formulas = [
    'log_marketcap ~ log_revenue + log_employees + log_assets',
    'log_marketcap ~ log_revenue + log_assets',
    'log_marketcap ~ log_employees + log_assets',
    'log_marketcap ~ log_assets',
    'log_marketcap ~ log_revenue + log_employees ',
    'log_marketcap ~ log_revenue ',
    'log_marketcap ~ log_employees ',
]
models = [smf.ols(formula=f, data=df_companies).fit() for f in formulas]
output = summary_col(models, stars=True)
# Adjusted R-squared of every specification, in the same order as `formulas`.
print(*(m.rsquared_adj for m in models))
output
Out[7]:
In [8]:
# Predictor matrix: the three (log) size variables.
predictor_cols = ["log_revenue", "log_employees", "log_assets"]
X = df_companies[predictor_cols]
X.head(2)
Out[8]:
In [9]:
# Standardize each predictor column to mean 0 and standard deviation 1, so
# they can be combined on a common scale.
from sklearn.preprocessing import scale
X_to_combine = scale(X)
# Show the standardized array.
X_to_combine
Out[9]:
In [10]:
# Collapse the standardized predictors into one index by summing row-wise.
X_combined = X_to_combine.sum(axis=1)
X_combined
Out[10]:
In [11]:
# Store the combined index as a new column and regress market cap on it.
df_companies["combined"] = X_combined
combined_fit = smf.ols(formula='log_marketcap ~ combined ', data=df_companies).fit()
print(combined_fit.summary())
In [12]:
# Project the three correlated predictors onto two principal components.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
new_X = pca.fit_transform(X)
# Share of total variance captured by each component.
print("Explained variance")
print(pca.explained_variance_ratio_)
print()
# Loadings: how much each original variable contributes to each component.
print("Weight of components")
print(["log_revenue","log_employees","log_assets"])
print(pca.components_)
print()
new_X
Out[12]:
In [13]:
# Attach the two principal-component scores as new columns (one per component).
for component_idx, col in enumerate(("pca_x1", "pca_x2")):
    df_companies[col] = new_X[:, component_idx]
In [14]:
# Regress market cap on the two (orthogonal) principal components.
pca_fit = smf.ols(formula='log_marketcap ~ pca_x1 + pca_x2 ', data=df_companies).fit()
print(pca_fit.summary())
In [15]:
print("Before")
# FIX: seaborn >= 0.12 removed positional x/y arguments for lmplot;
# they must be passed by keyword.
sns.lmplot(x="log_revenue", y="log_assets", data=df_companies, fit_reg=False)
Out[15]:
In [16]:
print("After")
# FIX: seaborn >= 0.12 removed positional x/y arguments for lmplot;
# they must be passed by keyword.
sns.lmplot(x="pca_x1", y="pca_x2", data=df_companies, fit_reg=False)
Out[16]:
In [43]:
# Factor analysis: model the three predictors with two latent factors.
from sklearn.decomposition import FactorAnalysis
fa = FactorAnalysis(n_components=2)
new_X = fa.fit_transform(X)
# Loadings of each original variable on each latent factor.
print("Weight of components")
print(["log_revenue","log_employees","log_assets"])
print(fa.components_)
print()
new_X
Out[43]:
In [44]:
# Attach the two factor scores as new columns (one per latent factor).
for factor_idx, col in enumerate(("fa_x1", "fa_x2")):
    df_companies[col] = new_X[:, factor_idx]
In [45]:
# Regress market cap on the two latent factors.
fa_fit = smf.ols(formula='log_marketcap ~ fa_x1 + fa_x2 ', data=df_companies).fit()
print(fa_fit.summary())
In [46]:
print("After")
# FIX: seaborn >= 0.12 removed positional x/y arguments for lmplot;
# they must be passed by keyword.
sns.lmplot(x="fa_x1", y="fa_x2", data=df_companies, fit_reg=False)
Out[46]:
Both have a regularization parameter that penalizes having many terms. How do we choose the best value of this parameter?
In [47]:
# External illustration of how the regularization strength trades off
# under- vs over-fitting.
regularization_img_url = "http://www.holehouse.org/mlclass/07_Regularization_files/Image.png"
Image(url=regularization_img_url)
Out[47]:
In [19]:
# train_test_split is already imported in the setup cell at the top.
y = df_companies["log_marketcap"]
X = df_companies.loc[:,["log_revenue","log_employees","log_assets"]]
# FIX: a fixed random_state makes the split -- and every score computed on it
# below -- reproducible across kernel restarts. (The original mid-cell
# X.head(2) was dead code: its value was discarded.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train.head()
Out[19]:
Linear regression (to compare)
In [25]:
# Rebuild a training frame (features + target) and fit the plain OLS baseline.
# FIX: removed the dead mid-cell df_train.head() -- its value was discarded.
df_train = X_train.copy()
df_train["log_marketcap"] = y_train
mod = smf.ols(formula='log_marketcap ~ log_revenue + log_employees + log_assets', data=df_train).fit()
print("log_revenue log_employees log_assets ")
# params[0] is the intercept; show only the three slope coefficients.
print(mod.params.values[1:])
SVR
In [56]:
# Support vector regression with a linear kernel, so the fitted coefficients
# are directly comparable to the OLS slopes.
from sklearn.svm import SVR
clf = SVR(C=0.1, epsilon=0.2, kernel="linear")
clf.fit(X_train, y_train)
print("log_revenue log_employees log_assets ")
print(clf.coef_)
Lasso
In [53]:
# Lasso regression: the L1 penalty can shrink coefficients all the way to zero.
from sklearn import linear_model
reg = linear_model.Lasso(alpha=0.01)
reg.fit(X_train, y_train)
print("log_revenue log_employees log_assets ")
print(reg.coef_)
Summary
In [57]:
print(["SVR","Lasso","Linear regression"])
# Test-set mean squared error for each model. MSE is symmetric in its two
# arguments, but the sklearn signature is (y_true, y_pred) -- use the
# conventional order so the code reads correctly.
err1 = sklearn.metrics.mean_squared_error(y_test, clf.predict(X_test))
err2 = sklearn.metrics.mean_squared_error(y_test, reg.predict(X_test))
err3 = sklearn.metrics.mean_squared_error(y_test, mod.predict(X_test))
print(err1,err2,err3)
In [58]:
print(["SVR","Lasso","Linear regression"])
# BUG FIX: r2_score is NOT symmetric -- its signature is
# r2_score(y_true, y_pred). The predictions were being passed as y_true,
# which produces a different (wrong) score.
err1 = sklearn.metrics.r2_score(y_test, clf.predict(X_test))
err2 = sklearn.metrics.r2_score(y_test, reg.predict(X_test))
err3 = sklearn.metrics.r2_score(y_test, mod.predict(X_test))
print(err1,err2,err3)