In [ ]:
#Read file "data/company_data.csv" (beware the encoding) and print header

In [ ]:


In [ ]:
#Use df.describe() to see what it looks like. Why is dollars_th not there? (describe only describes numeric columns)

In [ ]:


In [ ]:
#Sort the df by dollars_th to find out why this column is not numeric (check ascending=False and ascending=True)

In [ ]:


In [ ]:
#Use df.replace() to replace the weird symbol (you'll probably need the option regex=True)

In [ ]:


In [ ]:
#Pivot or Melt to tidy format

In [ ]:


In [ ]:
#Merge with this other dataset ("data/employees.csv")

In [ ]:


In [ ]:
#Check normality (qqplot)

In [ ]:


In [ ]:
#Transform variables to log and check normality (qqplot and histograms); also drop inf values by first replacing them with NaN

In [ ]:


In [ ]:
#create 4 subsets for the market capitalization (rows with position=1,position=2,position=3,position>3)

In [ ]:


In [ ]:
#calculate the confidence intervals of the subsets. Interpret

In [ ]:


In [ ]:
#Run the appropriate test (ANOVA/Kruskal-Wallis). If the variances are the same (or very similar), do a Tukey test. Interpret

In [ ]:


In [ ]:
#Run a linear regression explaining market capitalization in terms of the assets and
#the type of entity column. Interpret

In [ ]:


In [ ]:
#Check the assumptions of the regression. Interpret