Study of Glassdoor Data

The purpose of this study is too quickly present the Glassdoor data with the different attributes

The datasets have been cleaned with cleaning functions written in python

We will after studied missing data siginification

Import packages and constants and helpers


In [3]:
# Packages 
import pandas as pd 
from autoc import DataExploration,NaImputer
from autoc.naimputer import missing_map

%pylab inline --no-import
import pylab as pl
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

# Seaborn options 
sns.set_palette("deep", desat=.6)
sns.set_context(rc={"figure.figsize": (8, 4)})
plt.style.use('ggplot') # ggplot2 style for mathplotlib 

# path to cleaned datasets 
path_reviews_cleaned = '~/Google Drive/Auto_clean/Datasets/Glassdoor/'


Populating the interactive namespace from numpy and matplotlib

Glassdoor Cleaned Reviews Data

Quick Exploration


In [4]:
df_reviews = pd.read_csv(path_reviews_cleaned + 'glassdoor_reviews_cleaned_utf8_170415.csv')


/Users/ericfourrier/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (8) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [5]:
df_reviews.head()


Out[5]:
website interview_difficulty ceo_name nb_c_jobs company_description registered_employer benefits_below similar_companies acquired_by founded ... awards ceo_rating type industry operated_by headquarters_city headquarters_state nb_awards_after_2000 has_website ticker
0 www.target.com 2.8 Brian Cornell NaN From our Minneapolis-based headquarters to ove... Engaged Employer NaN Walmart,The Home Depot,Walgreens,Macy's,Bank o... NaN 1962 ... Best Companies for Hourly Workers,Working Moth... 66 Company - Public with Ticker Retail NaN Minneapolis MN 2 1 TGT
1 www.homedepot.com 2.3 Craig Menear 1700 NaN Engaged Employer NaN NaN NaN 1978 ... World's Most Admired Companies of 2013,Fortune... 80 Company - Public with Ticker Retail NaN Atlanta GA 6 1 HD
2 www.army.mil 2.3 John McHugh NaN The US Department of the Army has its marching... Engaged Employer NaN Walmart,US Navy,The Home Depot,UPS,Bank of Ame... NaN 1775 ... Best Places to Work for Commuters,National Cen... 77 Government Government NaN Washington DC 2 1 NaN
3 www.att.com 2.7 Randall L. Stephenson 2100 We understand that our customers want an easie... NaN NaN NaN NaN 1876 ... Happiest Companies for Young Professionals,For... 71 Company - Public with Ticker Telecommunications NaN Dallas TX 5 1 T
4 www.bestbuy.com 2.4 Hubert Joly NaN The biggest consumer electronics outlet in the... Engaged Employer NaN Walmart,The Home Depot,AT&T,Lowe's,Target,Bank... NaN 1966 ... Employees’ Choice -50 Best Places to Work,Glas... 74 Company - Public with Ticker Retail NaN Richfield MN 2 1 BBY

5 rows × 35 columns


In [6]:
df_reviews[df_reviews.company_name == 'Google'].iloc[0]


Out[6]:
website                                                      www.google.com
interview_difficulty                                                    3.4
ceo_name                                                         Larry Page
nb_c_jobs                                                               NaN
company_description       Google is not a conventional company, and we d...
registered_employer                                        Engaged Employer
benefits_below                                                          NaN
similar_companies         Facebook,Amazon.com,IBM,Apple,Microsoft,Cisco ...
acquired_by                                                             NaN
founded                                                                1998
affiliated_company        YouTube,Nest,Motorola Mobility,Wildfire Intera...
nb_ratings_ceo                                                         1777
stars                                                                   4.4
nb_c_reviews_detailled                                                 3237
size                                                        5000+ Employees
nb_c_interviews                                                        4000
friend_recommend                                                         93
url                       http://www.glassdoor.com/Overview/Working-at-G...
benefits_above                                                          NaN
employer_id                                                            9079
revenue                                         $10+ billion (USD) per year
headquarters                                              Mountain View, CA
competitors                                        Microsoft,Apple,Facebook
company_name                                                         Google
nb_c_salaries                                                         12000
awards                    Best Companies to Work For,Fortune,2014,100 Be...
ceo_rating                                                               97
type                                           Company - Public with Ticker
industry                                             Information Technology
operated_by                                                             NaN
headquarters_city                                             Mountain View
headquarters_state                                                       CA
nb_awards_after_2000                                                      2
has_website                                                               1
ticker                                                                 GOOG
Name: 42, dtype: object

In [7]:
df_reviews[df_reviews.company_name == 'Dataiku'] # too bad (dataset too old)


Out[7]:
website interview_difficulty ceo_name nb_c_jobs company_description registered_employer benefits_below similar_companies acquired_by founded ... awards ceo_rating type industry operated_by headquarters_city headquarters_state nb_awards_after_2000 has_website ticker

0 rows × 35 columns


In [8]:
df_reviews[df_reviews.company_name == 'Uber'].iloc[0]['url']


Out[8]:
'http://www.glassdoor.com/Overview/Working-at-Uber-EI_IE575263.11,15.htm'

In [9]:
df_reviews.columns


Out[9]:
Index([u'website', u'interview_difficulty', u'ceo_name', u'nb_c_jobs',
       u'company_description', u'registered_employer', u'benefits_below',
       u'similar_companies', u'acquired_by', u'founded', u'affiliated_company',
       u'nb_ratings_ceo', u'stars', u'nb_c_reviews_detailled', u'size',
       u'nb_c_interviews', u'friend_recommend', u'url', u'benefits_above',
       u'employer_id', u'revenue', u'headquarters', u'competitors',
       u'company_name', u'nb_c_salaries', u'awards', u'ceo_rating', u'type',
       u'industry', u'operated_by', u'headquarters_city',
       u'headquarters_state', u'nb_awards_after_2000', u'has_website',
       u'ticker'],
      dtype='object')

In [10]:
df_reviews.head()


Out[10]:
website interview_difficulty ceo_name nb_c_jobs company_description registered_employer benefits_below similar_companies acquired_by founded ... awards ceo_rating type industry operated_by headquarters_city headquarters_state nb_awards_after_2000 has_website ticker
0 www.target.com 2.8 Brian Cornell NaN From our Minneapolis-based headquarters to ove... Engaged Employer NaN Walmart,The Home Depot,Walgreens,Macy's,Bank o... NaN 1962 ... Best Companies for Hourly Workers,Working Moth... 66 Company - Public with Ticker Retail NaN Minneapolis MN 2 1 TGT
1 www.homedepot.com 2.3 Craig Menear 1700 NaN Engaged Employer NaN NaN NaN 1978 ... World's Most Admired Companies of 2013,Fortune... 80 Company - Public with Ticker Retail NaN Atlanta GA 6 1 HD
2 www.army.mil 2.3 John McHugh NaN The US Department of the Army has its marching... Engaged Employer NaN Walmart,US Navy,The Home Depot,UPS,Bank of Ame... NaN 1775 ... Best Places to Work for Commuters,National Cen... 77 Government Government NaN Washington DC 2 1 NaN
3 www.att.com 2.7 Randall L. Stephenson 2100 We understand that our customers want an easie... NaN NaN NaN NaN 1876 ... Happiest Companies for Young Professionals,For... 71 Company - Public with Ticker Telecommunications NaN Dallas TX 5 1 T
4 www.bestbuy.com 2.4 Hubert Joly NaN The biggest consumer electronics outlet in the... Engaged Employer NaN Walmart,The Home Depot,AT&T,Lowe's,Target,Bank... NaN 1966 ... Employees’ Choice -50 Best Places to Work,Glas... 74 Company - Public with Ticker Retail NaN Richfield MN 2 1 BBY

5 rows × 35 columns


In [11]:
exploration = DataExploration(df_reviews)

In [12]:
exploration.structure()


Out[12]:
dtypes_p dtypes_r nb_missing perc_missing nb_unique_values constant_columns na_columns is_key dtype_infer string_length
website object character 9796 0.053928 169844 False False False mixed 204
interview_difficulty float64 numeric 122927 0.676721 41 False False False floating NaN
ceo_name object character 90996 0.500939 85347 False False False mixed 68
nb_c_jobs float64 numeric 179486 0.988082 478 False False False floating NaN
company_description object character 146083 0.804196 35523 False False False mixed 1093
registered_employer object factor 465 0.002560 2 False False False mixed 17
benefits_below float64 numeric 181651 1.000000 0 False True False floating NaN
similar_companies object character 9683 0.053306 55616 False False False mixed 442
acquired_by object character 181151 0.997247 432 False False False mixed 39
founded float64 numeric 125140 0.688903 312 False False False floating NaN
affiliated_company object character 179053 0.985698 2432 False False False mixed 561
nb_ratings_ceo int64 numeric 0 0.000000 526 False False False integer NaN
stars float64 numeric 364 0.002004 41 False False False floating NaN
nb_c_reviews_detailled int64 numeric 0 0.000000 873 False False False integer NaN
size object factor 0 0.000000 9 False False False string 22
nb_c_interviews float64 numeric 115333 0.634915 451 False False False floating NaN
friend_recommend float64 numeric 55236 0.304078 99 False False False floating NaN
url object character 0 0.000000 181651 False False True string 161
benefits_above float64 numeric 181651 1.000000 0 False True False floating NaN
employer_id int64 numeric 0 0.000000 181651 False False True integer NaN
revenue object character 566 0.003116 14 False False False mixed 41
headquarters object character 3852 0.021205 13843 False False False mixed 44
competitors object character 1277 0.007030 20663 False False False mixed 118
company_name object character 0 0.000000 181651 False False True string 94
nb_c_salaries float64 numeric 63970 0.352159 913 False False False floating NaN
awards object character 171181 0.942362 7812 False False False mixed 811
ceo_rating float64 numeric 132343 0.728556 100 False False False floating NaN
type object character 0 0.000000 15 False False False string 30
industry object character 0 0.000000 26 False False False string 34
operated_by object character 180142 0.991693 742 False False False mixed 42
headquarters_city object character 3852 0.021205 10371 False False False mixed 36
headquarters_state object character 4290 0.023617 202 False False False mixed 28
nb_awards_after_2000 float64 numeric 0 0.000000 10 False False False floating NaN
has_website int64 numeric 0 0.000000 2 False False False integer NaN
ticker object character 175746 0.967493 5659 False False False mixed 15

In [13]:
df_reviews[pd.isnull(df_reviews.stars)].company_name


Out[13]:
63930                                               NowThis
73517                                            CittaNuova
78808        Panasonic Factory Solutions Company of America
78821                                        Showcase Honda
79144                                Central Point Partners
79152                                           CarePayment
84702                                            TrueWealth
91635                                        Valuation Link
102302                              Clayton Williams Energy
102328                           Anchor Manufacturing Group
102384                                     Stanley Electric
102712                                   Lunda Construction
102801                                       ALMAG Aluminum
102831                               Top Grade Construction
102852                                              OEwaves
102887                                 Venture Technologies
102986                                    The Grupe Company
103185                             Gospel Music Association
103799                                Kansas Highway Patrol
103886                  Tuckahoe Union Free School District
103973                                     Benvenue Medical
104089                                          TalentQuest
105175                          Reed City Power Line Supply
105229                                     Hilltop Holdings
105233                                        Service Force
105264                                       Pride Products
105300                                        Concord Litho
105377                                   Moderne Promotions
105425                     Global Entertainment Corporation
105441                                       Shell Aviation
                                ...                        
178384                                     Showa Best Glove
178414                             SecureOne Data Solutions
178416                        Sentech Architectural Systems
178426                      Precision Manufacturing Company
178428                     Northwest Neurobehavioral Health
178741                                       Foreign Policy
179307                                            Slingshot
180170                     Evolution Academy Charter School
180284                                            Hillsides
180301                                Storch Amini & Munves
180443                                     ACP IT Solutions
180792    Law Office of Robert L. Lewis Immigrant Defens...
181069                                Health Care Navigator
181306                             Eagle Financial Services
181382                            Colorado Mountain Express
181526                              Randy Marion Auto Group
181581                                      Don Ayres Honda
181583                                              2pointb
181629                                     The Paint Doctor
181639                        Stetson Hills Family Medicine
181640                               Ava Anderson Non Toxic
181641                              Expose Gentleman's Club
181642                           Hagerstown Marketing Group
181643                     Advance Mechanical Systems, Inc.
181644                              Palm Beach Broadcasting
181645                                    Big Buddy Program
181646               Sebewaing Tool and Engineering Company
181647                                     Quality Lighting
181649                                           HolaDoctor
181650                                                 BWTP
Name: company_name, dtype: object

In [14]:
df_reviews = df_reviews.drop(labels = ['benefits_below','benefits_above'],axis = 1)
df_reviews = df_reviews.dropna(subset=['stars'])

In [15]:
# Let's see what if Lending Club is one of the best place to work based on stars 
df_sort = df_reviews[df_reviews.nb_c_reviews_detailled > 50].sort_values('stars',ascending=False).reset_index()


print('Uber is the {} happiest company'.format(str(df_sort[(df_sort.company_name == "Uber")].index[0])))


Uber is the 2394 happiest company

In [16]:
df_sort.head(5)


Out[16]:
index website interview_difficulty ceo_name nb_c_jobs company_description registered_employer similar_companies acquired_by founded ... awards ceo_rating type industry operated_by headquarters_city headquarters_state nb_awards_after_2000 has_website ticker
0 4875 www.elitesem.com 3.3 Ben Kirshner NaN NaN Engaged Employer Andiamo Partners,Crowdtap,Carrot,Twitter,Wink,... NaN 2004 ... Best Places to Work 2013,Crain's New York Busi... 100 Company - Private Business Services NaN New York NY 2 1 NaN
1 3681 www.pluralsight.com 3.3 Aaron Skonnard NaN At Pluralsight, we have three core values that... Engaged Employer Instructure,Domo,Qualtrics,Overstock.com,Curas... NaN 2004 ... Best Companies to Work For (Utah),Utah Busines... 100 Company - Private Education NaN Farmington UT 2 1 NaN
2 3901 www.gohomeside.com 3.8 Michael Baynes NaN Founded by industry veterans, Homeside Financi... Engaged Employer PMAC Lending Services,Residential Finance,Worl... NaN 2013 ... NaN 100 Company - Private Finance NaN Columbia MD 0 1 NaN
3 1986 www.dropbox.com 3.1 Drew Houston 184 Dropbox is the home for your most important st... Engaged Employer NaN NaN 2007 ... Founder of the Year Crunchie,Tech Crunch,2014 98 Company - Private Information Technology NaN San Francisco CA 1 1 NaN
4 4617 www.bestversionmedia.com 1.6 David Durand 3500 Best Version Media (BVM) brings neighbors toge... NaN NaN NaN 2007 ... NaN 100 Company - Private Media NaN Brookfield WI 0 1 NaN

5 rows × 34 columns

Look at stars data


In [17]:
p = plt.hist(df_reviews.stars,bins = 30,histtype="stepfilled", color="#F08080", alpha=.5)


Notes: You can see the problem of a real life distribution (discontinuous because of small companies)


In [18]:
order = [u'1 to 5 Employees',u'6 to 15 Employees',u'16 to 50 Employees',
         u'50 to 149 Employees', u'150 to 499 Employees', u'500 to 999 Employees',
         u'1000 to 5000 Employees', u'5000+ Employees','Unknown']

In [19]:
# Violin plot 
pl.figure(figsize=(20, 10))
sns.violinplot(df_reviews.stars, df_reviews['size'],order=order)


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x118ebad50>

Notes: You can see the discontinuity for company with few employees


In [20]:
# Stars per size of the company
pl.figure(figsize=(20, 10))
sns.barplot("size", "stars",order = order,data = df_reviews)


Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x119d1af10>

In [21]:
big_companies = df_reviews.loc[df_reviews['size'] == "5000+ Employees"]

In [22]:
sns.distplot(big_companies.stars,color = '#F08080')


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x11c384fd0>

Study on Missing data

Some Theory

Lay and Rubbin

This is a scraped and real dataset with a lot of missing data we are going to try to respond to lay and rubin theory

  • MCAR : "Missing completely at random": the missing values are completely random and doesn/t depend from observations or any other factor.
  • MAR : "Missing at random": missing values depends from the observations.
  • NMAR : , "Not Missing at random": missing values depends on the unobserved original data values.
Purpose of the study

We are going to use statistic such as conditionnal expectation

Using autoc DataExploration class


In [23]:
exploration = DataExploration(df_reviews)

In [24]:
exploration.nacolcount()


Out[24]:
Nanumber Napercentage
website 9775 0.053920
interview_difficulty 122600 0.676276
ceo_name 90716 0.500400
nb_c_jobs 179122 0.988058
company_description 145774 0.804106
registered_employer 465 0.002565
similar_companies 9544 0.052646
acquired_by 180787 0.997242
founded 124865 0.688770
affiliated_company 178694 0.985697
nb_ratings_ceo 0 0.000000
stars 0 0.000000
nb_c_reviews_detailled 0 0.000000
size 0 0.000000
nb_c_interviews 115009 0.634403
friend_recommend 54872 0.302680
url 0 0.000000
employer_id 0 0.000000
revenue 566 0.003122
headquarters 3846 0.021215
competitors 1277 0.007044
company_name 0 0.000000
nb_c_salaries 63758 0.351696
awards 170826 0.942296
ceo_rating 131979 0.728011
type 0 0.000000
industry 0 0.000000
operated_by 179778 0.991676
headquarters_city 3846 0.021215
headquarters_state 4284 0.023631
nb_awards_after_2000 0 0.000000
has_website 0 0.000000
ticker 175389 0.967466

In [25]:
df_test = df_reviews.copy()

In [26]:
df_test['is_na_interview_difficulty'] = df_test.interview_difficulty.isnull().astype(int)

In [27]:
def cserie(serie):
    return serie[serie].index.tolist()
cserie((df_test.dtypes == int) | (df_test.dtypes == float))


Out[27]:
['interview_difficulty',
 'nb_c_jobs',
 'founded',
 'nb_ratings_ceo',
 'stars',
 'nb_c_reviews_detailled',
 'nb_c_interviews',
 'friend_recommend',
 'employer_id',
 'nb_c_salaries',
 'ceo_rating',
 'nb_awards_after_2000',
 'has_website',
 'is_na_interview_difficulty']

In [28]:
def plot_hist_na(df, colname):
    df_h = df.copy()
    na_name = "is_na_{}".format(colname)
    df_h[na_name] = df_h[colname].isnull().astype(int)
    measure_col = cserie((df.dtypes == int) | (df.dtypes == float))
    df_h.groupby(na_name)[measure_col].hist()

In [29]:
plot_hist_na(df_reviews,"revenue")



In [30]:
plot_hist_na(df_reviews,"interview_difficulty")



In [31]:
df_test.dtypes


Out[31]:
website                        object
interview_difficulty          float64
ceo_name                       object
nb_c_jobs                     float64
company_description            object
registered_employer            object
similar_companies              object
acquired_by                    object
founded                       float64
affiliated_company             object
nb_ratings_ceo                  int64
stars                         float64
nb_c_reviews_detailled          int64
size                           object
nb_c_interviews               float64
friend_recommend              float64
url                            object
employer_id                     int64
revenue                        object
headquarters                   object
competitors                    object
company_name                   object
nb_c_salaries                 float64
awards                         object
ceo_rating                    float64
type                           object
industry                       object
operated_by                    object
headquarters_city              object
headquarters_state             object
nb_awards_after_2000          float64
has_website                     int64
ticker                         object
is_na_interview_difficulty      int64
dtype: object

In [32]:
df_test['is_na_interview_difficulty']


Out[32]:
0         0
1         0
2         0
3         0
4         0
5         0
6         0
7         0
8         0
9         0
10        0
11        0
12        0
13        0
14        0
15        0
16        0
17        0
18        0
19        0
20        0
21        0
22        0
23        0
24        0
25        0
26        0
27        0
28        0
29        0
         ..
181609    1
181610    1
181611    1
181612    1
181613    1
181614    1
181615    1
181616    1
181617    1
181618    1
181619    1
181620    0
181621    1
181622    1
181623    1
181624    0
181625    1
181626    1
181627    1
181628    1
181630    1
181631    1
181632    1
181633    1
181634    1
181635    1
181636    1
181637    1
181638    1
181648    1
Name: is_na_interview_difficulty, dtype: int64

In [33]:
df_test.groupby('is_na_interview_difficulty').describe()


Out[33]:
ceo_rating employer_id founded friend_recommend has_website interview_difficulty nb_awards_after_2000 nb_c_interviews nb_c_jobs nb_c_reviews_detailled nb_c_salaries nb_ratings_ceo stars
is_na_interview_difficulty
0 count 31279.000000 58687.000000 26797.000000 50790.000000 58687.000000 58687.000000 58687.000000 58675.000000 2052.000000 58687.000000 52746.000000 58687.000000 58687.000000
mean 73.162985 350825.464600 1975.331380 63.559362 0.977985 2.517479 0.284935 10.264815 345.139864 34.341847 61.818470 12.573602 3.202242
std 25.149155 268166.243355 40.582616 25.589916 0.146734 0.835546 0.840877 77.009011 1455.770293 215.913688 592.767673 81.060969 0.902330
min 1.000000 1.000000 1616.000000 2.000000 0.000000 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 0.000000 1.000000
25% 54.000000 115120.500000 1967.000000 44.000000 1.000000 2.000000 0.000000 1.000000 16.000000 3.000000 3.000000 0.000000 2.700000
50% 77.000000 320247.000000 1990.000000 62.000000 1.000000 2.700000 0.000000 2.000000 45.000000 7.000000 8.000000 1.000000 3.200000
75% 100.000000 568464.000000 2001.000000 85.000000 1.000000 3.000000 0.000000 5.000000 173.250000 19.000000 24.000000 7.000000 3.800000
max 100.000000 982307.000000 2015.000000 100.000000 1.000000 5.000000 9.000000 6000.000000 33000.000000 13979.000000 70000.000000 4806.000000 5.000000
1 count 18029.000000 122600.000000 29625.000000 75625.000000 122600.000000 0.000000 122600.000000 7603.000000 113.000000 122600.000000 64783.000000 122600.000000 122600.000000
mean 85.580620 543785.492569 1978.486008 83.448754 0.930808 NaN 0.028662 1.076549 39.548673 2.216256 6.867789 0.440587 3.323085
std 23.414423 276571.324115 36.299990 25.156986 0.253782 NaN 0.221837 0.304627 160.747472 2.527796 41.397776 1.381665 1.254790
min 5.000000 9.000000 1601.000000 5.000000 0.000000 NaN 0.000000 1.000000 1.000000 1.000000 1.000000 0.000000 1.000000
25% 70.000000 317496.250000 1969.000000 63.000000 1.000000 NaN 0.000000 1.000000 5.000000 1.000000 1.000000 0.000000 2.400000
50% 100.000000 586726.500000 1989.000000 100.000000 1.000000 NaN 0.000000 1.000000 11.000000 1.000000 2.000000 0.000000 3.400000
75% 100.000000 780885.250000 2001.000000 100.000000 1.000000 NaN 0.000000 1.000000 23.000000 2.000000 4.000000 0.000000 4.100000
max 100.000000 982443.000000 2015.000000 100.000000 1.000000 NaN 5.000000 6.000000 1600.000000 76.000000 4300.000000 54.000000 5.000000

In [34]:
df_test.groupby('is_na_interview_difficulty')[['founded','ceo_rating']].hist()


Out[34]:
is_na_interview_difficulty
0    [[Axes(0.125,0.125;0.336957x0.775), Axes(0.563...
1    [[Axes(0.125,0.125;0.336957x0.775), Axes(0.563...
dtype: object

In [35]:
# g = sns.FacetGrid(tips, col="time")
# g.map(plt.hist, "tip");
g = sns.FacetGrid(data=df_test, col='is_na_interview_difficulty')
g.map(plt.hist, "founded")


Out[35]:
<seaborn.axisgrid.FacetGrid at 0x1218fa290>

In [36]:
for col in ['founded','ceo_rating']:
    g = sns.FacetGrid(data=df_test, col='is_na_interview_difficulty',hue="is_na_interview_difficulty")
    g.map(sns.distplot, col)



In [37]:
g = sns.PairGrid(df_test,
                 y_vars=["founded", "ceo_rating", "nb_c_interviews"],
                 x_vars=["is_na_interview_difficulty"],
                 aspect=.75, size=3.5)
g.map(sns.violinplot, palette="pastel")


Out[37]:
<seaborn.axisgrid.PairGrid at 0x12178f9d0>

Naimputer Examples


In [38]:
missing_map(df_reviews, nmax=1000)


Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x12c6bcd10>

In [39]:
na = NaImputer(df_reviews)

In [40]:
na.get_isna_mean(colname='ceo_rating')


Out[40]:
is_na_ceo_rating 0 1
interview_difficulty mean 2.604396 2.418287
std 0.724465 0.936731
sem 0.004096 0.005658
nb_c_jobs mean 344.203731 90.257812
std 1460.309833 280.136475
sem 32.355603 24.760800
founded mean 1977.595504 1976.415594
std 38.078957 38.740949
sem 0.230211 0.227252
nb_ratings_ceo mean 15.779265 0.105161
std 88.115540 0.547349
sem 0.396820 0.001507
stars mean 3.405332 3.238622
std 0.794065 1.259332
sem 0.003576 0.003466
nb_c_reviews_detailled mean 39.134319 2.708749
std 234.874781 8.662609
sem 1.057737 0.023845
nb_c_interviews mean 16.267320 1.966548
std 101.284068 4.734609
sem 0.552764 0.026181
friend_recommend mean 64.669850 81.982950
std 25.160246 26.202291
sem 0.115267 0.093360
employer_id mean 296562.000000 550345.000000
std 270713.666646 263310.671888
sem 1219.134124 724.796428
nb_c_salaries mean 72.722814 6.876085
std 648.780787 35.274075
sem 3.092836 0.130087
nb_awards_after_2000 mean 0.348807 0.023011
std 0.911100 0.202185
sem 0.004103 0.000557
has_website mean 0.992435 0.928761
std 0.086647 0.257224
sem 0.000390 0.000708

In [41]:
na.isna_summary(colname='ceo_rating')


Out[41]:
is_na_ceo_rating 0 1
count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max
ceo_rating 49308 77.703375 25.247404 1 58.00 87.0 100.0 100 0 NaN NaN NaN NaN NaN NaN NaN
employer_id 49308 296562.216659 270713.666647 1 27534.25 262714.5 460558.5 980483 131979 550345.931178 263310.671888 24 331475.50 585889.0 773030.5 982443
founded 27360 1977.595504 38.078957 1616 1970.00 1991.0 2001.0 2015 29062 1976.415594 38.740949 1601 1967.00 1988.0 2001.0 2015
friend_recommend 47645 64.669850 25.160246 2 46.00 64.0 86.0 100 78770 81.982950 26.202291 2 59.00 100.0 100.0 100
has_website 49308 0.992435 0.086647 0 1.00 1.0 1.0 1 131979 0.928761 0.257224 0 1.00 1.0 1.0 1
interview_difficulty 31279 2.604396 0.724465 0 2.10 2.7 3.0 5 27408 2.418287 0.936731 0 2.00 2.6 3.0 5
nb_awards_after_2000 49308 0.348807 0.911100 0 0.00 0.0 0.0 9 131979 0.023011 0.202185 0 0.00 0.0 0.0 7
nb_c_interviews 33574 16.267320 101.284068 1 1.00 3.0 8.0 6000 32704 1.966548 4.734609 1 1.00 1.0 2.0 545
nb_c_jobs 2037 344.203731 1460.309833 1 16.00 45.0 170.0 33000 128 90.257812 280.136475 2 7.75 19.5 36.5 2400
nb_c_reviews_detailled 49308 39.134319 234.874781 1 4.00 8.0 21.0 13979 131979 2.708749 8.662609 1 1.00 1.0 2.0 2086
nb_c_salaries 44003 72.722814 648.780787 1 3.00 9.0 29.0 70000 73526 6.876085 35.274075 1 1.00 2.0 5.0 4300
nb_ratings_ceo 49308 15.779265 88.115540 1 2.00 4.0 10.0 4806 131979 0.105161 0.547349 0 0.00 0.0 0.0 46
stars 49308 3.405332 0.794065 1 2.90 3.4 4.0 5 131979 3.238622 1.259332 1 2.20 3.1 4.0 5

In [42]:
na.infos_na()


Out[42]:
{'low_na_col': ['registered_employer',
  'nb_ratings_ceo',
  'stars',
  'nb_c_reviews_detailled',
  'size',
  'url',
  'employer_id',
  'revenue',
  'headquarters',
  'competitors',
  'company_name',
  'type',
  'industry',
  'headquarters_city',
  'headquarters_state',
  'nb_awards_after_2000',
  'has_website'],
 'many_na_col': ['nb_c_jobs',
  'acquired_by',
  'affiliated_company',
  'awards',
  'operated_by',
  'ticker'],
 'nacolcount':                         Nanumber  Napercentage
 website                     9775      0.053920
 interview_difficulty      122600      0.676276
 ceo_name                   90716      0.500400
 nb_c_jobs                 179122      0.988058
 company_description       145774      0.804106
 registered_employer          465      0.002565
 similar_companies           9544      0.052646
 acquired_by               180787      0.997242
 founded                   124865      0.688770
 affiliated_company        178694      0.985697
 nb_ratings_ceo                 0      0.000000
 stars                          0      0.000000
 nb_c_reviews_detailled         0      0.000000
 size                           0      0.000000
 nb_c_interviews           115009      0.634403
 friend_recommend           54872      0.302680
 url                            0      0.000000
 employer_id                    0      0.000000
 revenue                      566      0.003122
 headquarters                3846      0.021215
 competitors                 1277      0.007044
 company_name                   0      0.000000
 nb_c_salaries              63758      0.351696
 awards                    170826      0.942296
 ceo_rating                131979      0.728011
 type                           0      0.000000
 industry                       0      0.000000
 operated_by               179778      0.991676
 headquarters_city           3846      0.021215
 headquarters_state          4284      0.023631
 nb_awards_after_2000           0      0.000000
 has_website                    0      0.000000
 ticker                    175389      0.967466,
 'narowcount':         Nanumber  Napercentage
 0              4      0.121212
 1              5      0.151515
 2              4      0.121212
 3              5      0.151515
 4              4      0.121212
 5              3      0.090909
 6              3      0.090909
 7              3      0.090909
 8              4      0.121212
 9              3      0.090909
 10             3      0.090909
 11             4      0.121212
 12             4      0.121212
 13             3      0.090909
 14             5      0.151515
 15             5      0.151515
 16             7      0.212121
 17             4      0.121212
 18             3      0.090909
 19             5      0.151515
 20             6      0.181818
 21             5      0.151515
 22             6      0.181818
 23             5      0.151515
 24             6      0.181818
 25             3      0.090909
 26             3      0.090909
 27             7      0.212121
 28             4      0.121212
 29             4      0.121212
 ...          ...           ...
 181609        14      0.424242
 181610        15      0.454545
 181611        15      0.454545
 181612        16      0.484848
 181613        15      0.454545
 181614        14      0.424242
 181615        15      0.454545
 181616        16      0.484848
 181617        15      0.454545
 181618        14      0.424242
 181619        14      0.424242
 181620        12      0.363636
 181621        15      0.454545
 181622        14      0.424242
 181623        14      0.424242
 181624        13      0.393939
 181625        16      0.484848
 181626        14      0.424242
 181627        15      0.454545
 181628        15      0.454545
 181630        15      0.454545
 181631        15      0.454545
 181632        15      0.454545
 181633        15      0.454545
 181634        15      0.454545
 181635        14      0.424242
 181636        15      0.454545
 181637        15      0.454545
 181638        15      0.454545
 181648        14      0.424242
 
 [181287 rows x 2 columns],
 'nb_total_na': 1947772,
 'total_pct_na': 0.325579848193163}

In [44]:
na.plot_corrplot_na(size=7,figsize=(20,10))



In [ ]: