In [63]:
import pandas as pd
import pylab as plt
import seaborn as sns
import numpy as np
import scikits.bootstrap as bootstrap
import scipy.stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import statsmodels.formula.api as smf
%matplotlib inline
In [ ]:
#Read file "data/company_data.csv" (beware the encoding) and print header
In [6]:
# Tab-separated export with Latin-1 (ISO 8859-1) encoding, not UTF-8.
df = pd.read_csv("data/company_data.csv", sep="\t", encoding="iso-8859-1")
df.head()
Out[6]:
In [ ]:
#Use df.describe() to see what it looks like. Why is dollars_th not there? (describe only describes numeric columns)
In [4]:
# Summary statistics. describe() covers numeric columns only, which is why
# Dollars_th is missing: something in it keeps the column as dtype object.
df.describe()
Out[4]:
In [ ]:
#Sort the df by dollars_th to find out why this column is not numeric (check ascending=False and ascending=True)
In [9]:
# Sorting descending surfaces the offending rows: some Dollars_th values carry
# a stray backtick ("`"), which forces the whole column to dtype object.
df.sort_values(by="Dollars_th",ascending=False).head()
Out[9]:
In [ ]:
#Use df.replace() to replace the weird symbol `(you'll probably need the option regex=True)
In [10]:
# Strip the stray backtick from every cell; the column then parses as float.
df = df.replace(to_replace="`", value="", regex=True)
df["Dollars_th"] = df["Dollars_th"].astype("float64")
In [ ]:
#Pivot or Melt to tidy format
In [12]:
# Identifier columns that stay as-is (one row per company after pivoting).
variables_already_present = ["Company_name","Company_ID","Big3Share","Position","Exchange","TypeEnt"]
# Column whose distinct values become new columns.
column_to_split = ["VAR"]
# Column providing the cell values.
column_with_values = "Dollars_th"
# Wide/tidy layout: one column per VAR level.
df_fixed = (
    df.pivot_table(values=column_with_values,
                   index=variables_already_present,
                   columns=column_to_split)
      .reset_index()
)
df_fixed.head()
Out[12]:
In [ ]:
#Merge with this other dataset ("data/employees.csv")
In [17]:
# Inner join on the columns the two frames share (pandas' default `on`).
df_emp = pd.read_csv("data/employees.csv", sep="\t")
df_merged = df_emp.merge(df_fixed)
df_merged.head()
Out[17]:
In [ ]:
#Check normality (qqplot and histogram)
In [18]:
def qq_plot(x):
    """Draw a normal Q-Q plot of ``x``.

    Plots observed quantiles against theoretical normal quantiles plus the
    fitted reference line; points close to the line suggest normality.
    """
    # scipy.stats is already imported at the top of the notebook; the original
    # redundant function-local import has been removed.
    (osm, osr), (slope, intercept, r) = scipy.stats.probplot(x, dist='norm', plot=None)
    plt.plot(osm, osr, '.', osm, slope * osm + intercept)
    plt.xlabel('Quantiles', fontsize=14)
    plt.ylabel('Quantiles Obs', fontsize=14)
In [20]:
# Q-Q plots of the raw variables. The four near-identical calls are folded
# into a loop; the name bindings are kept because later cells reuse them.
emp = df_merged["Employees"]
rev = df_merged["Revenue"]
assets = df_merged["Assets"]
mark = df_merged["MarketCap"]
for series in (emp, rev, assets, mark):
    qq_plot(series)
In [ ]:
#Transform variables to log
In [47]:
# Log10-transform the skewed variables (copy-paste folded into a loop).
# Zeros map to -inf under log10, so convert +/-inf to NaN and drop those rows.
for col in ["Employees", "Revenue", "Assets", "MarketCap"]:
    df_merged["log_" + col] = np.log10(df_merged[col])
df_merged = df_merged.replace([np.inf, -np.inf], np.nan).dropna()
In [48]:
# Re-check normality after the log transform; the Q-Q points should now hug
# the reference line. Bindings kept — the next cell plots these same series.
emp = df_merged["log_Employees"]
rev = df_merged["log_Revenue"]
assets = df_merged["log_Assets"]
mark = df_merged["log_MarketCap"]
for series in (emp, rev, assets, mark):
    qq_plot(series)
In [49]:
# KDE of each log variable on one axis. sns.distplot is deprecated (removed in
# seaborn 0.13); kdeplot is the direct replacement for distplot(hist=False).
for series, name in ((emp, 'emp'), (rev, "rev"), (assets, "assets"), (mark, "mark")):
    sns.kdeplot(series.dropna(), label=name)
plt.legend()  # kdeplot does not draw the legend automatically
Out[49]:
In [ ]:
#create 4 subsets (rows with position=1,position=2,position=3,position>3)
In [51]:
pos1 = df_merged.loc[df_merged["Position"]==1,"log_MarketCap"]
pos2 = df_merged.loc[df_merged["Position"]==2,"log_MarketCap"]
pos3 = df_merged.loc[df_merged["Position"]==3,"log_MarketCap"]
pos4 = df_merged.loc[df_merged["Position"]>3,"log_MarketCap"]
In [ ]:
#calculate the confidence intervals of the subsets
In [52]:
# Bootstrap confidence intervals for the mean log market cap of each group
# (copy-paste folded into a loop; output order is unchanged).
for group in (pos1, pos2, pos3, pos4):
    print(bootstrap.ci(group, statfunction=np.mean, n_samples=100000))
In [ ]:
#Do the right test (anova/kruskalwallis). If same variance (or very similar) do tukey test
In [54]:
# Levene's test: H0 = equal variances across the four groups. A large p-value
# (fail to reject) supports using ANOVA and the Tukey test below.
scipy.stats.levene(pos1,pos2,pos3,pos4)
Out[54]:
In [55]:
# One-way ANOVA: H0 = all group means are equal.
scipy.stats.f_oneway(pos1,pos2,pos3,pos4)
Out[55]:
In [60]:
# Pool positions beyond 3rd into category 3 before the Tukey test.
# BUG FIX: the boolean mask must come from df_merged — the frame being
# modified — not from the original df. After pivoting, merging and dropping
# NaN rows the two frames have different indexes/lengths, so a mask built on
# df would silently mislabel (or miss) rows.
df_merged.loc[df_merged["Position"] > 3, "Position"] = 3
In [61]:
# Tukey HSD: all pairwise comparisons of mean log market cap between positions,
# with family-wise error control; then the simultaneous-CI plot.
endog = df_merged["log_MarketCap"]
groups = df_merged["Position"]
res2 = pairwise_tukeyhsd(endog, groups)
print(res2)
res2.plot_simultaneous(comparison_name=None, xlabel='diffs', ylabel='Group')
plt.show()
In [ ]:
#Make a scatter plot of x = assets and y = market capitalization with hue = Type of ent
In [76]:
# Scatter + per-group regression line of log market cap vs. log assets,
# colored by entity type; clamp the y-axis so extreme fits don't dominate.
sns.lmplot(x="log_Assets",y="log_MarketCap",hue="TypeEnt",data=df_merged)
plt.ylim((-2,10))
Out[76]:
In [ ]:
#Run a liner regression explaining market capitalization in terms of the assets and
#the type of entity column.
In [66]:
# OLS: log market cap ~ log assets + entity-type dummies (C() treats TypeEnt
# as categorical). NOTE(review): the original flags this specification as
# "wrong" — presumably because some entity-type groups have almost no members
# (see the influence analysis below); confirm before trusting the dummies.
mod = smf.ols(formula='log_MarketCap ~ log_Assets + C(TypeEnt)', data=df_merged)
res = mod.fit()
print(res.summary())
In [ ]:
#Check the assumptions of the regression
In [67]:
# Residuals vs. fitted values: look for non-random structure (funnel shapes,
# curvature) that would violate homoscedasticity/linearity assumptions.
# seaborn >= 0.12 requires keyword x=/y= arguments; positional data arguments
# to regplot are deprecated.
sns.regplot(x=res.predict(), y=res.resid)
plt.ylabel('Residual')
plt.xlabel('Fitted values')
Out[67]:
In [68]:
# Residual histogram: roughly bell-shaped and centered at 0 supports the
# normality assumption. sns.distplot is deprecated (removed in seaborn 0.13);
# histplot (count histogram, no KDE by default) replaces distplot(kde=False).
sns.histplot(res.resid)
plt.ylabel('Count')
plt.xlabel('Normalized residuals')
plt.xlim((-3,3))
Out[68]:
In [69]:
# Second normality check: Q-Q plot of the residuals.
qq_plot(res.resid)
In [71]:
# Influence plot: observations with high leverage AND large residuals are the
# ones that can distort the fitted model.
# Import only the function we use — `from module import *` pollutes the
# namespace and hides where names come from.
from statsmodels.graphics.regressionplots import influence_plot
influence_plot(res)
plt.show()
In [80]:
# Inspect one of the high-influence observations flagged by the plot above.
df_merged.loc[3305].values
Out[80]:
In [81]:
# Another flagged high-influence observation.
df_merged.loc[2975].values
Out[81]:
In [82]:
# Third flagged high-influence observation.
df_merged.loc[1309].values
Out[82]:
The high-leverage points are in the groups with barely any members (insurance companies, venture companies and research institutes). We should probably drop those groups, but it's okay.