The purpose of this study is to quickly present the Glassdoor data and its different attributes.
The datasets have been cleaned with cleaning functions written in Python.
We will then study the significance of the missing data.
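For reference, here is a minimal sketch of the kind of cleaning helper assumed upstream (the actual cleaning functions are not included in this notebook, and basic_clean is a hypothetical name):
In [ ]:
import numpy as np
import pandas as pd

def basic_clean(df):
    # Strip whitespace from string columns and turn empty strings into NaN
    df = df.copy()
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].str.strip().replace('', np.nan)
    return df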
In [3]:
# Packages
import pandas as pd
from autoc import DataExploration, NaImputer
from autoc.naimputer import missing_map
%pylab inline --no-import-all
import pylab as pl
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
# Seaborn options
sns.set_palette("deep", desat=.6)
sns.set_context(rc={"figure.figsize": (8, 4)})
plt.style.use('ggplot')  # ggplot2 style for matplotlib
# path to cleaned datasets
path_reviews_cleaned = '~/Google Drive/Auto_clean/Datasets/Glassdoor/'
In [4]:
df_reviews = pd.read_csv(path_reviews_cleaned + 'glassdoor_reviews_cleaned_utf8_170415.csv')
In [5]:
df_reviews.head()
Out[5]:
In [6]:
df_reviews[df_reviews.company_name == 'Google'].iloc[0]
Out[6]:
In [7]:
df_reviews[df_reviews.company_name == 'Dataiku'] # no reviews found (the dataset is too old)
Out[7]:
In [8]:
df_reviews[df_reviews.company_name == 'Uber'].iloc[0]['url']
Out[8]:
In [9]:
df_reviews.columns
Out[9]:
In [10]:
df_reviews.head()
Out[10]:
In [11]:
exploration = DataExploration(df_reviews)
In [12]:
exploration.structure()
Out[12]:
In [13]:
df_reviews[pd.isnull(df_reviews.stars)].company_name
Out[13]:
In [14]:
df_reviews = df_reviews.drop(labels=['benefits_below', 'benefits_above'], axis=1)
df_reviews = df_reviews.dropna(subset=['stars'])
In [15]:
# Let's see where Uber ranks among the best places to work, based on stars
df_sort = df_reviews[df_reviews.nb_c_reviews_detailled > 50].sort_values('stars', ascending=False).reset_index()
print('Uber ranks number {} among the happiest companies'.format(df_sort[df_sort.company_name == "Uber"].index[0]))
In [16]:
df_sort.head(5)
Out[16]:
In [17]:
p = plt.hist(df_reviews.stars, bins=30, histtype="stepfilled", color="#F08080", alpha=.5)
Note: you can see the problem with a real-life distribution: it is discontinuous because of small companies with few reviews.
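A quick numeric illustration (not part of the original analysis): with n integer ratings between 1 and 5, the average can only take values of the form k/n, so companies with few reviews can only land on a coarse grid of star values:
In [ ]:
from itertools import combinations_with_replacement
import numpy as np

n = 3  # a small company with only 3 reviews
possible_means = sorted({float(np.mean(c)) for c in combinations_with_replacement(range(1, 6), n)})
print(len(possible_means))  # only 13 distinct averages between 1.0 and 5.0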
In [18]:
order = [u'1 to 5 Employees', u'6 to 15 Employees', u'16 to 50 Employees',
         u'50 to 149 Employees', u'150 to 499 Employees', u'500 to 999 Employees',
         u'1000 to 5000 Employees', u'5000+ Employees', u'Unknown']
In [19]:
# Violin plot
pl.figure(figsize=(20, 10))
sns.violinplot(df_reviews.stars, df_reviews['size'], order=order)
Out[19]:
Note: you can see the discontinuity for companies with few employees.
In [20]:
# Stars per size of the company
pl.figure(figsize=(20, 10))
sns.barplot("size", "stars", order=order, data=df_reviews)
Out[20]:
In [21]:
big_companies = df_reviews.loc[df_reviews['size'] == "5000+ Employees"]
In [22]:
sns.distplot(big_companies.stars, color='#F08080')
Out[22]:
This is a real, scraped dataset with a lot of missing data, so we are going to investigate the missingness mechanism in the sense of Little and Rubin's theory (MCAR, MAR, MNAR).
We are going to use statistics such as conditional expectations.
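A minimal sketch of the idea, using columns from this dataset: compare the conditional expectation of observed numeric variables given the missingness indicator of another column. Under MCAR the two conditional means should be close; large gaps suggest the data are at best MAR:
In [ ]:
indicator = df_reviews['ceo_rating'].isnull().astype(int)
# E[X | ceo_rating missing] vs E[X | ceo_rating observed]
print(df_reviews.groupby(indicator)[['founded', 'stars']].mean())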
In [23]:
exploration = DataExploration(df_reviews)
In [24]:
exploration.nacolcount()
Out[24]:
In [25]:
df_test = df_reviews.copy()
In [26]:
df_test['is_na_interview_difficulty'] = df_test.interview_difficulty.isnull().astype(int)
In [27]:
def cserie(serie):
    # Return the index labels where the boolean Series is True
    return serie[serie].index.tolist()

cserie((df_test.dtypes == int) | (df_test.dtypes == float))
Out[27]:
In [28]:
def plot_hist_na(df, colname):
    """Histogram of every numeric column, split by whether `colname` is missing."""
    df_h = df.copy()
    na_name = "is_na_{}".format(colname)
    df_h[na_name] = df_h[colname].isnull().astype(int)
    measure_col = cserie((df.dtypes == int) | (df.dtypes == float))
    df_h.groupby(na_name)[measure_col].hist()
In [29]:
plot_hist_na(df_reviews,"revenue")
In [30]:
plot_hist_na(df_reviews,"interview_difficulty")
In [31]:
df_test.dtypes
Out[31]:
In [32]:
df_test['is_na_interview_difficulty']
Out[32]:
In [33]:
df_test.groupby('is_na_interview_difficulty').describe()
Out[33]:
In [34]:
df_test.groupby('is_na_interview_difficulty')[['founded','ceo_rating']].hist()
Out[34]:
In [35]:
# g = sns.FacetGrid(tips, col="time")
# g.map(plt.hist, "tip");
g = sns.FacetGrid(data=df_test, col='is_na_interview_difficulty')
g.map(plt.hist, "founded")
Out[35]:
In [36]:
for col in ['founded', 'ceo_rating']:
    g = sns.FacetGrid(data=df_test, col='is_na_interview_difficulty', hue="is_na_interview_difficulty")
    g.map(sns.distplot, col)
In [37]:
g = sns.PairGrid(df_test,
                 y_vars=["founded", "ceo_rating", "nb_c_interviews"],
                 x_vars=["is_na_interview_difficulty"],
                 aspect=.75, size=3.5)
g.map(sns.violinplot, palette="pastel")
Out[37]:
In [38]:
missing_map(df_reviews, nmax=1000)
Out[38]:
In [39]:
na = NaImputer(df_reviews)
In [40]:
na.get_isna_mean(colname='ceo_rating')
Out[40]:
In [41]:
na.isna_summary(colname='ceo_rating')
Out[41]:
In [42]:
na.infos_na()
Out[42]:
In [44]:
na.plot_corrplot_na(size=7, figsize=(20, 10))