In [63]:
import pandas as pd
import pylab as plt
import seaborn as sns
import numpy as np
import scikits.bootstrap as bootstrap
import scipy.stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import statsmodels.formula.api as smf
%matplotlib inline
In [ ]:
#Read file "data/company_data.csv" (beware the encoding) and print header
In [6]:
# Tab-separated export with Latin-1 (ISO 8859-1) encoding, not UTF-8.
df = pd.read_csv("data/company_data.csv", sep="\t", encoding="iso-8859-1")
df.head()
Out[6]:
In [ ]:
#Use df.describe() to see what it looks like. Why is dollars_th not there? (describe only describes numeric columns)
In [4]:
# Summary statistics. describe() covers numeric columns only, which is why
# Dollars_th is missing: something in it keeps the column as dtype object.
df.describe()
Out[4]:
In [ ]:
#Sort the df by dollars_th to find out why this column is not numeric (check ascending=False and ascending=True)
In [9]:
# Sorting descending surfaces the offending rows: some Dollars_th values carry
# a stray backtick ("`"), which forces the whole column to dtype object.
df.sort_values(by="Dollars_th",ascending=False).head()
Out[9]:
In [ ]:
#Use df.replace() to replace the weird symbol `(you'll probably need the option regex=True)
In [10]:
# Strip the stray backtick from every cell; the column then parses as float.
df = df.replace(to_replace="`", value="", regex=True)
df["Dollars_th"] = df["Dollars_th"].astype("float64")
In [ ]:
#Pivot or Melt to tidy format
In [12]:
# Identifier columns that stay as-is (one row per company after pivoting).
variables_already_present = ["Company_name","Company_ID","Big3Share","Position","Exchange","TypeEnt"]
# Column whose distinct values become new columns.
column_to_split = ["VAR"]
# Column providing the cell values.
column_with_values = "Dollars_th"
# Wide/tidy layout: one column per VAR level.
df_fixed = (
    df.pivot_table(values=column_with_values,
                   index=variables_already_present,
                   columns=column_to_split)
      .reset_index()
)
df_fixed.head()
Out[12]:
In [ ]:
#Merge with this other dataset ("data/employees.csv")
In [17]:
# Inner join on the columns the two frames share (pandas' default `on`).
df_emp = pd.read_csv("data/employees.csv", sep="\t")
df_merged = df_emp.merge(df_fixed)
df_merged.head()
Out[17]:
In [ ]:
#Check normality (qqplot and histogram)
In [18]:
def qq_plot(x):
    """Draw a normal Q-Q plot of ``x``.

    Plots observed quantiles against theoretical normal quantiles plus the
    fitted reference line; points close to the line suggest normality.
    """
    # scipy.stats is already imported at the top of the notebook; the original
    # redundant function-local import has been removed.
    (osm, osr), (slope, intercept, r) = scipy.stats.probplot(x, dist='norm', plot=None)
    plt.plot(osm, osr, '.', osm, slope * osm + intercept)
    plt.xlabel('Quantiles', fontsize=14)
    plt.ylabel('Quantiles Obs', fontsize=14)
In [20]:
# Q-Q plots of the raw variables. The four near-identical calls are folded
# into a loop; the name bindings are kept because later cells reuse them.
emp = df_merged["Employees"]
rev = df_merged["Revenue"]
assets = df_merged["Assets"]
mark = df_merged["MarketCap"]
for series in (emp, rev, assets, mark):
    qq_plot(series)
In [ ]:
#Transform variables to log
In [47]:
# Log10-transform the skewed variables (copy-paste folded into a loop).
# Zeros map to -inf under log10, so convert +/-inf to NaN and drop those rows.
for col in ["Employees", "Revenue", "Assets", "MarketCap"]:
    df_merged["log_" + col] = np.log10(df_merged[col])
df_merged = df_merged.replace([np.inf, -np.inf], np.nan).dropna()
In [48]:
# Re-check normality after the log transform; the Q-Q points should now hug
# the reference line. Bindings kept — the next cell plots these same series.
emp = df_merged["log_Employees"]
rev = df_merged["log_Revenue"]
assets = df_merged["log_Assets"]
mark = df_merged["log_MarketCap"]
for series in (emp, rev, assets, mark):
    qq_plot(series)
In [49]:
# KDE of each log variable on one axis. sns.distplot is deprecated (removed in
# seaborn 0.13); kdeplot is the direct replacement for distplot(hist=False).
for series, name in ((emp, 'emp'), (rev, "rev"), (assets, "assets"), (mark, "mark")):
    sns.kdeplot(series.dropna(), label=name)
plt.legend()  # kdeplot does not draw the legend automatically
Out[49]:
In [ ]:
#create 4 subsets (rows with position=1,position=2,position=3,position>3)
In [51]:
pos1 = df_merged.loc[df_merged["Position"]==1,"log_MarketCap"]
pos2 = df_merged.loc[df_merged["Position"]==2,"log_MarketCap"]
pos3 = df_merged.loc[df_merged["Position"]==3,"log_MarketCap"]
pos4 = df_merged.loc[df_merged["Position"]>3,"log_MarketCap"]
In [ ]:
#calculate the confidence intervals of the subsets
In [52]:
# Bootstrap confidence intervals for the mean log market cap of each group
# (copy-paste folded into a loop; output order is unchanged).
for group in (pos1, pos2, pos3, pos4):
    print(bootstrap.ci(group, statfunction=np.mean, n_samples=100000))
In [ ]:
#Do the right test (anova/kruskalwallis). If same variance (or very similar) do tukey test
In [54]:
# Levene's test: H0 = equal variances across the four groups. A large p-value
# (fail to reject) supports using ANOVA and the Tukey test below.
scipy.stats.levene(pos1,pos2,pos3,pos4)
Out[54]:
In [55]:
# One-way ANOVA: H0 = all group means are equal.
scipy.stats.f_oneway(pos1,pos2,pos3,pos4)
Out[55]:
In [60]:
# Pool positions beyond 3rd into category 3 before the Tukey test.
# BUG FIX: the boolean mask must come from df_merged — the frame being
# modified — not from the original df. After pivoting, merging and dropping
# NaN rows the two frames have different indexes/lengths, so a mask built on
# df would silently mislabel (or miss) rows.
df_merged.loc[df_merged["Position"] > 3, "Position"] = 3
In [61]:
# Tukey HSD: all pairwise comparisons of mean log market cap between positions,
# with family-wise error control; then the simultaneous-CI plot.
endog = df_merged["log_MarketCap"]
groups = df_merged["Position"]
res2 = pairwise_tukeyhsd(endog, groups)
print(res2)
res2.plot_simultaneous(comparison_name=None, xlabel='diffs', ylabel='Group')
plt.show()
In [ ]:
#Make a scatter plot of x = assets and y = market capitalization with hue = Type of ent
In [76]:
# Scatter + per-group regression line of log market cap vs. log assets,
# colored by entity type; clamp the y-axis so extreme fits don't dominate.
sns.lmplot(x="log_Assets",y="log_MarketCap",hue="TypeEnt",data=df_merged)
plt.ylim((-2,10))
Out[76]:
In [ ]:
#Run a liner regression explaining market capitalization in terms of the assets and
#the type of entity column.
In [66]:
# OLS: log market cap ~ log assets + entity-type dummies (C() treats TypeEnt
# as categorical). NOTE(review): the original flags this specification as
# "wrong" — presumably because some entity-type groups have almost no members
# (see the influence analysis below); confirm before trusting the dummies.
mod = smf.ols(formula='log_MarketCap ~ log_Assets + C(TypeEnt)', data=df_merged)
res = mod.fit()
print(res.summary())
In [ ]:
#Check the assumptions of the regression
In [67]:
# Residuals vs. fitted values: look for non-random structure (funnel shapes,
# curvature) that would violate homoscedasticity/linearity assumptions.
# seaborn >= 0.12 requires keyword x=/y= arguments; positional data arguments
# to regplot are deprecated.
sns.regplot(x=res.predict(), y=res.resid)
plt.ylabel('Residual')
plt.xlabel('Fitted values')
Out[67]:
In [68]:
# Residual histogram: roughly bell-shaped and centered at 0 supports the
# normality assumption. sns.distplot is deprecated (removed in seaborn 0.13);
# histplot (count histogram, no KDE by default) replaces distplot(kde=False).
sns.histplot(res.resid)
plt.ylabel('Count')
plt.xlabel('Normalized residuals')
plt.xlim((-3,3))
Out[68]:
In [69]:
# Second normality check: Q-Q plot of the residuals.
qq_plot(res.resid)
In [71]:
# Influence plot: observations with high leverage AND large residuals are the
# ones that can distort the fitted model.
# Import only the function we use — `from module import *` pollutes the
# namespace and hides where names come from.
from statsmodels.graphics.regressionplots import influence_plot
influence_plot(res)
plt.show()
In [80]:
# Inspect one of the high-influence observations flagged by the plot above.
df_merged.loc[3305].values
Out[80]:
In [81]:
# Another flagged high-influence observation.
df_merged.loc[2975].values
Out[81]:
In [82]:
# Third flagged high-influence observation.
df_merged.loc[1309].values
Out[82]:
The high-leverage points are in the groups with barely any members (insurance companies, venture companies and research institutes). We should probably drop those groups, but it's okay.