In [ ]:
#Read file "data/company_data.csv" (beware the encoding) and print header
In [ ]:
In [ ]:
#Use df.describe() to see what it looks like. Why is dollars_th not there? (describe only describes numeric columns)
In [ ]:
In [ ]:
#Sort the df by dollars_th to find out why this column is not numeric (check ascending=False and ascending=True)
In [ ]:
In [ ]:
#Use df.replace() to replace the weird symbol (you'll probably need the option regex=True)
In [ ]:
In [ ]:
#Pivot or Melt to tidy format
In [ ]:
In [ ]:
#Merge with this other dataset ("data/employees.csv")
In [ ]:
In [ ]:
#Check normality (qqplot)
In [ ]:
In [ ]:
#Transform variables to log and check normality (qqplot and histograms); also drop inf values (e.g. by replacing them with NaN and then dropping the NaN rows)
In [ ]:
In [ ]:
#Create 4 subsets for the market capitalization (rows with position=1, position=2, position=3, position>3)
In [ ]:
In [ ]:
#Calculate the confidence intervals of the subsets. Interpret
In [ ]:
In [ ]:
#Do the right test (ANOVA/Kruskal-Wallis). If same variance (or very similar) do Tukey test. Interpret
In [ ]:
In [ ]:
#Run a linear regression explaining market capitalization in terms of the assets and
#the type of entity column. Interpret
In [ ]:
In [ ]:
#Check the assumptions of the regression. Interpret