notebook.community

Edit and run



In [112]:

    
#The HR Analytics dateset from kaggle competitions 
#Import of  necessary libraries, Modules and classifiers
import numpy as np  #fundamental package for scientific computing with Python
import pandas as pd #package providing fast, flexible, and expressive data structures
import matplotlib.pyplot as plt #for plotting different kinds of diagrams
#commands in cells below the cell that outputs a plot will not affect the plot inline command 
#(commentation on the same line causes an error):
%matplotlib inline 
import seaborn as sns #visualization library based on matplotlib, for statistical data visualization
from IPython.display import display_html
import scipy.stats as sp
from scipy.stats.stats import pearsonr

hr_data=pd.read_csv('.\HR_comma_sep.csv',header=0) #read the data from a csv-file; ensure that the  v
#alues are separated by commas otherwise you need to specify the delimiter explicitly within the load-statement

hr_data_copy=hr_data.copy()  #create a deep copy of the data set for demonstrating how to handle missing values (mv)

hr_data.head() #show the first five entries; attribute in brackets will give the # of printed lines









    Out[112]:







  
    
      
      satisfaction_level
      last_evaluation
      number_project
      average_montly_hours
      time_spend_company
      Work_accident
      left
      promotion_last_5years
      sales
      salary
    
  
  
    
      0
      0.38
      0.53
      2
      157
      3
      0
      1
      0
      sales
      low
    
    
      1
      0.80
      0.86
      5
      262
      6
      0
      1
      0
      sales
      medium
    
    
      2
      0.11
      0.88
      7
      272
      4
      0
      1
      0
      sales
      medium
    
    
      3
      0.72
      0.87
      5
      223
      5
      0
      1
      0
      sales
      low
    
    
      4
      0.37
      0.52
      2
      159
      3
      0
      1
      0
      sales
      low



In [113]:

    
###We define small helper functions (Code from E-M-A-D; https://www.kaggle.com/etakla)
#A function for annotating the bars with its total and relative number. 
def annotate_bars(bar_plt, bar_plt_var, by=None, x_offset=0, y_offset=0, txt_color="white", 
                  fnt_size=12, fnt_weight='bold'):
    if by is None:
        for p in bar_plt.patches:
            bar_plt.annotate(str( int(p.get_height()) ) + "\n" + str(round( 
                        (100.0* p.get_height()) /bar_plt_var.count(), 1) )
                             + "%", 
                             (p.get_x() + x_offset, p.get_height()-y_offset),
                             color=txt_color, fontsize=fnt_size, fontweight=fnt_weight)
    else:
        grouped = bar_plt_var.groupby(by)
        for p in bar_plt.patches:            
            #This part is tricky. The problem is that not each x-tick gets drawn in order, 
            #i.e. yes/no of the first group then yes/no of the second group located on the 
            #next tick, but rather all the yes on all the x-ticks get drawn first then all 
            #the nos next. So we need to know we are using a patch that belongs to which 
            #tick (the x-tick) ultimately refers to one of the groups. So, we get the x absolute 
            #coordinate, round it to know this patch is closest to which tick (Assuming that it 
            #will always belong to its closest tick), then get the group count of that tick and 
            #use it as a total to compute the percentage.
            total = grouped.get_group(bar_plot.get_xticks()[int(round(p.get_x()))]).count()
            bar_plt.annotate(str( int(p.get_height()) ) + "\n" + str(round( (100.0* 
                                                                             p.get_height()) /total, 1) )+ "%", 
                             (p.get_x() + x_offset, p.get_height()-y_offset),
                             color=txt_color, fontsize=fnt_size, fontweight=fnt_weight)
            
#A function that returns the order of a group_by object according to the average of certain parameter param.
def get_ordered_group_index(df, group_by, param, ascending=False):
    return df.groupby(group_by)[param].mean().sort_values(ascending=ascending).index

#helper function that returns the order of a group_by object according to the average of certain parameter param.
def group_by_2_level_perc(df, level1, level2, level1_index_order = None, level2_index_order = None):
    #http://stackoverflow.com/questions/23377108/pandas-percentage-of-total-with-groupby
    df_by_lvl1_lvl2 = df.groupby([level1, level2]).agg({level1: 'count'})
    df_by_lvl1_lvl2_perc = df_by_lvl1_lvl2.groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))
    #Reorder them in logical ascending order, but first make sure it is not an empty input
    if level1_index_order:
        df_by_lvl1_lvl2_perc = df_by_lvl1_lvl2_perc.reindex_axis(level1_index_order, axis=0, level=0)
    #If a second level order is passed, apply it, else use the default
    if level2_index_order:
        df_by_lvl1_lvl2_perc = df_by_lvl1_lvl2_perc.reindex_axis(level2_index_order, axis=0, level=1)
    return df_by_lvl1_lvl2_perc

#A function that adds some styling to the graphs, like custom ticks for axes, axes labels and a grid
def customise_2lvl_perc_area_graph(p, legend_lst, xtick_label = "", x_label="", y_label=""):
    #If custom ticks are passed, spread them on the axe and write the tick values
    if xtick_label:
        p.set_xticks(range(0,len(xtick_label)))
        p.set_xticklabels(xtick_label)
    #Create y ticks for grid. It will always be a percentage, so it is not customisable
    p.set_yticks(range(0,110,10)) 
    p.set_yticklabels(['{:3.0f}%'.format(x) for x in range(0,110,10)])
    p.set_yticks(range(0,110,5), minor=True) 

    #Draw grid and set y limit to be only 100 (By default it had an empty area at the top of the graph)
    p.xaxis.grid('on', which='major', zorder=1, color='gray', linestyle='dashed')
    p.yaxis.grid('on', which='major', zorder=1, color='gray', alpha=0.2)
    p.yaxis.grid('on', which='minor', zorder=1, color='gray', linestyle='dashed', alpha=0.2)
    p.set(ylim=(0,100))

    #Customise legend
    p.legend(labels=legend_lst, frameon=True).get_frame().set_alpha(0.2)

    #Put the axes labels
    if x_label:
        p.set_xlabel(x_label)
    if y_label:
        p.set_ylabel(y_label);



In [114]:

    
hr_data.info() #attribut specifications









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
satisfaction_level       14999 non-null float64
last_evaluation          14999 non-null float64
number_project           14999 non-null int64
average_montly_hours     14999 non-null int64
time_spend_company       14999 non-null int64
Work_accident            14999 non-null int64
left                     14999 non-null int64
promotion_last_5years    14999 non-null int64
sales                    14999 non-null object
salary                   14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB



In [115]:

    
hr_data.describe()  # show some statistics about the attributes of the data









    Out[115]:







  
    
      
      satisfaction_level
      last_evaluation
      number_project
      average_montly_hours
      time_spend_company
      Work_accident
      left
      promotion_last_5years
    
  
  
    
      count
      14999.000000
      14999.000000
      14999.000000
      14999.000000
      14999.000000
      14999.000000
      14999.000000
      14999.000000
    
    
      mean
      0.612834
      0.716102
      3.803054
      201.050337
      3.498233
      0.144610
      0.238083
      0.021268
    
    
      std
      0.248631
      0.171169
      1.232592
      49.943099
      1.460136
      0.351719
      0.425924
      0.144281
    
    
      min
      0.090000
      0.360000
      2.000000
      96.000000
      2.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      0.440000
      0.560000
      3.000000
      156.000000
      3.000000
      0.000000
      0.000000
      0.000000
    
    
      50%
      0.640000
      0.720000
      4.000000
      200.000000
      3.000000
      0.000000
      0.000000
      0.000000
    
    
      75%
      0.820000
      0.870000
      5.000000
      245.000000
      4.000000
      0.000000
      0.000000
      0.000000
    
    
      max
      1.000000
      1.000000
      7.000000
      310.000000
      10.000000
      1.000000
      1.000000
      1.000000



In [116]:

    
#To observe the distribution of satisfaction level among the employees we generate metrics 
#of skewness and plot the histogram of the satisfaction level.
print ("Skew is:", hr_data.satisfaction_level.skew()) #*.skew() shows tendency (0=no skewness, (-)=left skewed)

plt.hist(hr_data.satisfaction_level, color='blue',bins=5) #plot the histogram
#plt.hist needs argument "data" in form of a 1d numpy array
#we adress columns as numpy arrays by just adding their name to the data frame (e.g. df.variable_name).
#we choose the parameter color to be blue --> blue histogram and bins=5 to fit the data to 5 pillars.

plt.xlabel('satisfaction_level') #naming the x-axis
plt.ylabel('Staff') #labeling the y-axis
plt.title('Satisfaction Distribution') 
 
plt.show() #display the plot









    



Skew is: -0.476360341284



In [117]:

    
if(not hr_data.isnull().values.any()):  #Checking for NaN-values  
    #If the iterable is empty, it returns False, True if any element of the iterable is true.  
    print('QC (Y): Dataset does not contain missing values')
else:
    print('QC (N): Dataset contains missing values')









    



QC (Y): Dataset does not contain missing values



In [118]:

    
#for showing how to handle missing values(mv), we randomly create some NaN-values inside the dataset 
import random
ix = [(row, col) for row in range(hr_data_copy.shape[0]) for col in range(hr_data_copy.shape[1])]
for row, col in random.sample(ix, int(round(.1*len(ix)))):   #for a 10% random sample in the sequenz ix
       hr_data_copy.iloc[row, col] = np.nan                  #entries replaced by NaN-values



In [119]:

    
hr_data_copy.head() #show the first five entries again with NaN-entries









    Out[119]:







  
    
      
      satisfaction_level
      last_evaluation
      number_project
      average_montly_hours
      time_spend_company
      Work_accident
      left
      promotion_last_5years
      sales
      salary
    
  
  
    
      0
      0.38
      NaN
      2.0
      NaN
      NaN
      0.0
      1.0
      0.0
      sales
      low
    
    
      1
      NaN
      0.86
      5.0
      262.0
      6.0
      0.0
      NaN
      0.0
      sales
      medium
    
    
      2
      0.11
      0.88
      7.0
      272.0
      NaN
      0.0
      1.0
      0.0
      sales
      medium
    
    
      3
      0.72
      0.87
      NaN
      223.0
      5.0
      0.0
      1.0
      0.0
      sales
      low
    
    
      4
      0.37
      0.52
      2.0
      159.0
      3.0
      0.0
      1.0
      0.0
      sales
      low



In [120]:

    
hr_data_copy.describe() #just to show the mv-effect, now less entries, because of mv









    Out[120]:







  
    
      
      satisfaction_level
      last_evaluation
      number_project
      average_montly_hours
      time_spend_company
      Work_accident
      left
      promotion_last_5years
    
  
  
    
      count
      13542.000000
      13535.000000
      13487.000000
      13461.000000
      13461.000000
      13499.000000
      13511.000000
      13496.000000
    
    
      mean
      0.613880
      0.716163
      3.801661
      201.211277
      3.494391
      0.145122
      0.236844
      0.021710
    
    
      std
      0.248214
      0.170944
      1.233175
      49.931955
      1.457746
      0.352237
      0.425162
      0.145741
    
    
      min
      0.090000
      0.360000
      2.000000
      96.000000
      2.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      0.440000
      0.560000
      3.000000
      156.000000
      3.000000
      0.000000
      0.000000
      0.000000
    
    
      50%
      0.650000
      0.720000
      4.000000
      200.000000
      3.000000
      0.000000
      0.000000
      0.000000
    
    
      75%
      0.820000
      0.870000
      5.000000
      245.000000
      4.000000
      0.000000
      0.000000
      0.000000
    
    
      max
      1.000000
      1.000000
      7.000000
      310.000000
      10.000000
      1.000000
      1.000000
      1.000000



In [121]:

    
##there are 3 possibilities for handling missing values(mv): 
#1.to ignore  #2.to impute(fill)  3.to drop the datapoint with mv



In [122]:

    
##2.way - impute
#imputation strategies:
#  “mean” to replace missing values using the mean along the axis.
#  “median” to replace missing values using the median along the axis.
#  “most_frequent” to replace missing using the most frequent value along the axis.
hr_no_missing_f=hr_data_copy.fillna(hr_data_copy.iloc[:10].median()) 
hr_no_missing_f.describe()









    Out[122]:







  
    
      
      satisfaction_level
      last_evaluation
      number_project
      average_montly_hours
      time_spend_company
      Work_accident
      left
      promotion_last_5years
    
  
  
    
      count
      14999.000000
      14999.000000
      14999.000000
      14999.000000
      14999.000000
      14999.000000
      14999.000000
      14999.000000
    
    
      mean
      0.594075
      0.729226
      3.922461
      203.548037
      3.597507
      0.130609
      0.312554
      0.019535
    
    
      std
      0.243456
      0.167174
      1.223760
      47.805067
      1.414276
      0.336983
      0.463550
      0.138399
    
    
      min
      0.090000
      0.360000
      2.000000
      96.000000
      2.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      0.410000
      0.570000
      3.000000
      159.000000
      3.000000
      0.000000
      0.000000
      0.000000
    
    
      50%
      0.600000
      0.750000
      4.000000
      212.000000
      3.000000
      0.000000
      0.000000
      0.000000
    
    
      75%
      0.800000
      0.850000
      5.000000
      241.000000
      4.500000
      0.000000
      1.000000
      0.000000
    
    
      max
      1.000000
      1.000000
      7.000000
      310.000000
      10.000000
      1.000000
      1.000000
      1.000000



In [123]:

    
#3.way - drop mv's
hr_no_missing_d = hr_data_copy.dropna()
hr_no_missing_d.head()









    Out[123]:







  
    
      
      satisfaction_level
      last_evaluation
      number_project
      average_montly_hours
      time_spend_company
      Work_accident
      left
      promotion_last_5years
      sales
      salary
    
  
  
    
      4
      0.37
      0.52
      2.0
      159.0
      3.0
      0.0
      1.0
      0.0
      sales
      low
    
    
      5
      0.41
      0.50
      2.0
      153.0
      3.0
      0.0
      1.0
      0.0
      sales
      low
    
    
      6
      0.10
      0.77
      6.0
      247.0
      4.0
      0.0
      1.0
      0.0
      sales
      low
    
    
      7
      0.92
      0.85
      5.0
      259.0
      5.0
      0.0
      1.0
      0.0
      sales
      low
    
    
      8
      0.89
      1.00
      5.0
      224.0
      5.0
      0.0
      1.0
      0.0
      sales
      low



In [124]:

    
hr_no_missing_d.describe()









    Out[124]:







  
    
      
      satisfaction_level
      last_evaluation
      number_project
      average_montly_hours
      time_spend_company
      Work_accident
      left
      promotion_last_5years
    
  
  
    
      count
      5252.000000
      5252.000000
      5252.000000
      5252.000000
      5252.000000
      5252.000000
      5252.000000
      5252.000000
    
    
      mean
      0.615773
      0.714254
      3.790556
      200.684501
      3.476009
      0.140137
      0.230769
      0.019992
    
    
      std
      0.248805
      0.172041
      1.222808
      50.128170
      1.426681
      0.347162
      0.421365
      0.139987
    
    
      min
      0.090000
      0.360000
      2.000000
      96.000000
      2.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      0.440000
      0.560000
      3.000000
      156.000000
      3.000000
      0.000000
      0.000000
      0.000000
    
    
      50%
      0.650000
      0.720000
      4.000000
      199.000000
      3.000000
      0.000000
      0.000000
      0.000000
    
    
      75%
      0.820000
      0.870000
      5.000000
      245.000000
      4.000000
      0.000000
      0.000000
      0.000000
    
    
      max
      1.000000
      1.000000
      7.000000
      310.000000
      10.000000
      1.000000
      1.000000
      1.000000



In [125]:

    
hr_data.info() #attribut specifications, shows the datatype-information about the attributes









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
satisfaction_level       14999 non-null float64
last_evaluation          14999 non-null float64
number_project           14999 non-null int64
average_montly_hours     14999 non-null int64
time_spend_company       14999 non-null int64
Work_accident            14999 non-null int64
left                     14999 non-null int64
promotion_last_5years    14999 non-null int64
sales                    14999 non-null object
salary                   14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB



In [126]:

    
print('Departments: ', ', '.join(hr_data['sales'].unique())) #show and join the unique entries in sales
#with the more precise description "departments"
print('Salary levels: ', ', '.join(hr_data['salary'].unique())) #show and join the unique entries in 
#salery with a descriptive level









    



Departments:  sales, accounting, hr, technical, support, management, IT, product_mng, marketing, RandD
Salary levels:  low, medium, high



In [127]:

    
hr_data.rename(columns={'sales':'department'}, inplace=True) #rename Column, note: you do need to 
#specify the existing label first followed by the new label to rename it to afterward 
hr_data_new = pd.get_dummies(hr_data, ['department', 'salary'] ,drop_first = True) #Whether to get k-1 
#dummies variables out of k categorical by removing the first level. New in Pandas version 0.18.0.
hr_data_new.head()









    Out[127]:







  
    
      
      satisfaction_level
      last_evaluation
      number_project
      average_montly_hours
      time_spend_company
      Work_accident
      left
      promotion_last_5years
      department_RandD
      department_accounting
      department_hr
      department_management
      department_marketing
      department_product_mng
      department_sales
      department_support
      department_technical
      salary_low
      salary_medium
    
  
  
    
      0
      0.38
      0.53
      2
      157
      3
      0
      1
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      1
      0
    
    
      1
      0.80
      0.86
      5
      262
      6
      0
      1
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      1
    
    
      2
      0.11
      0.88
      7
      272
      4
      0
      1
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      1
    
    
      3
      0.72
      0.87
      5
      223
      5
      0
      1
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      1
      0
    
    
      4
      0.37
      0.52
      2
      159
      3
      0
      1
      0
      0
      0
      0
      0
      0
      0
      1
      0
      0
      1
      0



In [128]:

    
'''Observe that "IT" and "high" are the baseline levels for the assigned department and salary level, 
respectively. Also note that we saved the data with dummy variables as another dataframe in case we need to 
access the string values, such as for a cross-tabulation table.'''

hr_data_new.describe() #Generates descriptive statistics that summarize the central tendency, 
#dispersion and shape of a dataset’s distribution.









    Out[128]:







  
    
      
      satisfaction_level
      last_evaluation
      number_project
      average_montly_hours
      time_spend_company
      Work_accident
      left
      promotion_last_5years
      department_RandD
      department_accounting
      department_hr
      department_management
      department_marketing
      department_product_mng
      department_sales
      department_support
      department_technical
      salary_low
      salary_medium
    
  
  
    
      count
      14999.000000
      14999.000000
      14999.000000
      14999.000000
      14999.000000
      14999.000000
      14999.000000
      14999.000000
      14999.000000
      14999.000000
      14999.000000
      14999.000000
      14999.000000
      14999.000000
      14999.000000
      14999.000000
      14999.000000
      14999.000000
      14999.000000
    
    
      mean
      0.612834
      0.716102
      3.803054
      201.050337
      3.498233
      0.144610
      0.238083
      0.021268
      0.052470
      0.051137
      0.049270
      0.042003
      0.057204
      0.060137
      0.276018
      0.148610
      0.181345
      0.487766
      0.429762
    
    
      std
      0.248631
      0.171169
      1.232592
      49.943099
      1.460136
      0.351719
      0.425924
      0.144281
      0.222981
      0.220284
      0.216438
      0.200602
      0.232239
      0.237749
      0.447041
      0.355715
      0.385317
      0.499867
      0.495059
    
    
      min
      0.090000
      0.360000
      2.000000
      96.000000
      2.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      0.440000
      0.560000
      3.000000
      156.000000
      3.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      50%
      0.640000
      0.720000
      4.000000
      200.000000
      3.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      75%
      0.820000
      0.870000
      5.000000
      245.000000
      4.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      1.000000
      0.000000
      0.000000
      1.000000
      1.000000
    
    
      max
      1.000000
      1.000000
      7.000000
      310.000000
      10.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000



In [129]:

    
#Employees who Left their Jobs in sum
employees_left_plt = sns.countplot(hr_data.left);

for p in employees_left_plt.patches:
    employees_left_plt.annotate(str( int(p.get_height()) ) + "\n" + str(round( (100.0* p.get_height()) 
                                                                              /hr_data.left.count(), 1) )+ "%", 
                                (p.get_x() + 0.3, p.get_height()-1100),
                                color='white', fontsize=12, fontweight='bold')



In [130]:

    
#proportion of leaving and staying in the different departments: 
dept_table = pd.crosstab(hr_data['department'], hr_data['left'],normalize='index')
#We created a cross tabulation of columns left and department, the normalize parameter is
#dividing all values by the sum of values.
#parameter list for pandas.crosstab can be found in pandas documentation:
#https://pandas.pydata.org/pandas-docs/stable/generated/pandas.crosstab.html
dept_table.index.names = ['Department'] #naming of the index column
dept_table #print the cross tab









    Out[130]:







  
    
      left
      0
      1
    
    
      Department
      
      
    
  
  
    
      IT
      0.777506
      0.222494
    
    
      RandD
      0.846252
      0.153748
    
    
      accounting
      0.734029
      0.265971
    
    
      hr
      0.709066
      0.290934
    
    
      management
      0.855556
      0.144444
    
    
      marketing
      0.763403
      0.236597
    
    
      product_mng
      0.780488
      0.219512
    
    
      sales
      0.755072
      0.244928
    
    
      support
      0.751009
      0.248991
    
    
      technical
      0.743750
      0.256250



In [131]:

    
g=sns.countplot(x='department', hue='left', data=hr_data) #Show the number of observations 
#in each categorical bin of 'department' using bars.
for item in g.get_xticklabels():  #for every entry in x rotate the x-axis for better reading 
    item.set_rotation(60)         # No'in brackets = degrees in positive direction



In [132]:

    
# Correlation matrix is used to do some basic visualizations and show any relationships in the data.
sns.heatmap(hr_data.corr(), annot=True,fmt='.2f');  #compute pairwise correlation of columns, 
#excluding NA/null values; annot=True presents heatmap with values, the format-configuration 
#makes it better to read (2 decimal places), <;> is hiding the processing steps



In [133]:

    
plt.figure(figsize=(10, 10)) #figure module, which contains all the plot elements
sns.pairplot(hr_data,  hue="left"); #distinguish the left feature









    





<matplotlib.figure.Figure at 0x1d804b6ee80>



In [134]:

    
#Univariate Analysis
hr_by_left = hr_data.groupby('left')  #determine the groups on each value of the object’s index
employees_left = hr_by_left.get_group(1) #determine and store the group that left
employees_stayed = hr_by_left.get_group(0) #determine and store the group that stayed



In [135]:

    
#distribution of Satisfaction Level
fig, axs = plt.subplots(nrows= 3, figsize=(13, 5))

sns.kdeplot(employees_left.satisfaction_level, ax=axs[0], shade=True, color="r")
kde_plot = sns.kdeplot(employees_stayed.satisfaction_level, ax=axs[0], shade=True, color="g")
kde_plot.legend(labels=['Left', 'Stayed'])

hist_plot = sns.distplot(hr_data.satisfaction_level, ax=axs[1])
box_plot = sns.boxplot(hr_data.satisfaction_level, ax=axs[2])

kde_plot.set(xlim=(0,1.1))
hist_plot.set(xlim=(0,1.1))
box_plot.set(xlim=(0,1.1));



In [136]:

    
#Promotion within the Past 5 Years
fig, axs = plt.subplots(ncols= 2, figsize=(13, 5))

promoted_5years_plt = sns.countplot(hr_data.promotion_last_5years, ax=axs[0]);
annotate_bars(bar_plt=promoted_5years_plt, bar_plt_var=hr_data.promotion_last_5years, 
              x_offset=0.3, txt_color="black")
    
bar_plot = sns.countplot(x=hr_data.promotion_last_5years, 
                         hue=hr_data.left, ax=axs[1])
annotate_bars(bar_plt=bar_plot, by=hr_data.promotion_last_5years, 
              bar_plt_var=hr_data.promotion_last_5years, 
              x_offset=0.1, txt_color="black")
bar_plot.set(ylim=(0,16000));



In [137]:

    
#Create groups
employees_by_promotion = hr_data.groupby("promotion_last_5years")
employees_promoted = employees_by_promotion.get_group(1)
employees_not_promoted = employees_by_promotion.get_group(0)

#Get counts
employees_promoted_stayed = employees_promoted.groupby("left").get_group(0).left.count()
employees_promoted_left = employees_promoted.groupby("left").get_group(1).left.count()

employees_not_promoted_stayed = employees_not_promoted.groupby("left").get_group(0).left.count()
employees_not_promoted_left = employees_not_promoted.groupby("left").get_group(1).left.count()

#Create rows that makeup the contingency table
promoted_row = [employees_promoted_stayed, employees_promoted_left, 
                employees_promoted_stayed + employees_promoted_left]
not_promoted_row = [employees_not_promoted_stayed, employees_not_promoted_left, 
                    employees_not_promoted_stayed + employees_not_promoted_left]
total_row = [employees_promoted_stayed+employees_not_promoted_stayed,
             employees_promoted_left+employees_not_promoted_left,
             hr_data.left.count()]

#Create the contingency table
contingency_table = pd.DataFrame({'Promoted': promoted_row ,
                                  'Not Promoted': not_promoted_row ,
                                  'Total, By Left': total_row},
                                 index = ['Stayed', 'Left', 'Total, by Promotion'], 
                                 columns = [ 'Promoted', 'Not Promoted', 'Total, By Left'])

display_html(contingency_table)









    







  
    
      
      Promoted
      Not Promoted
      Total, By Left
    
  
  
    
      Stayed
      300
      11128
      11428
    
    
      Left
      19
      3552
      3571
    
    
      Total, by Promotion
      319
      14680
      14999



In [138]:

    
chi_squared, p, degrees_of_freedom, expected_frequency = sp.chi2_contingency( contingency_table )

print("Chi Squared: ", chi_squared)
print("p value: ", p)
print("Degrees of Freedom", degrees_of_freedom)
print("Expected Frequency for The Not Promoted Employees:", expected_frequency[0])
print("Expected Frequency for The Promoted Employees:", expected_frequency[1])









    



Chi Squared:  57.2627339495
p value:  1.08970012479e-11
Degrees of Freedom 4
Expected Frequency for The Not Promoted Employees: [   243.05167011  11184.94832989  11428.        ]
Expected Frequency for The Promoted Employees: [   75.94832989  3495.05167011  3571.        ]



In [139]:

    
fig, axs = plt.subplots(figsize=(13, 6))

department_plt = sns.countplot(hr_data.department, order = hr_data.department.value_counts().index);

annotate_bars(bar_plt=department_plt, bar_plt_var=hr_data.department, x_offset=0.2, y_offset=450, 
              txt_color="black")



In [140]:

    
#Departments & Who Left; Is there a pattern?
fig, axs = plt.subplots(figsize=(13, 4))

#Order the bars descendingly according to the PERCENTAGE % of those who left in each department
total_employees_by_dept = hr_data.groupby(["department"]).satisfaction_level.count()
left_count_by_dept = hr_data[hr_data["left"] == 1].groupby(["department"]).satisfaction_level.count()
percentages_left_by_dept = (left_count_by_dept / total_employees_by_dept).sort_values(ascending=False)
axe_name_order = percentages_left_by_dept.index

department_plt = sns.countplot(hr_data.department, order = axe_name_order, color='g');
sns.countplot(employees_left.department, order = axe_name_order, color='r');

department_plt.legend(labels=['Stayed', 'Left'])
department_plt.set(xlabel='Department\n Sorted for "Left" Percentage')

#Annotate the percentages of those who stayed. It was more straightforward to loop for each 
#category (left, stayed) than doing all the work in one loop.
#The zip creates an output that is equal to the shortest parameter, so we do not need to adjust the 
#patches length, since the loop will stop after finishing the columns of those who stayed
for p, current_column in zip(department_plt.patches, axe_name_order):
    current_column_total = hr_data[hr_data['department'] == current_column].department.count()
    stayed_count = p.get_height() - employees_left[employees_left['department'] == current_column].department.count()
    department_plt.annotate(str(round( (100.0* stayed_count) /current_column_total, 1) )+ "%", 
                                (p.get_x() + 0.2, p.get_height()-10),
                                color='black', fontsize=12)
    
#In this loop,  use the patches located on the second half of patches list, which are the 
#bars for those who left.
for p, current_column in zip(department_plt.patches[int(len(department_plt.patches)/2):], axe_name_order):
    current_column_total = hr_data[hr_data['department'] == current_column].department.count()
    left_count = p.get_height()
    department_plt.annotate(str(round( (100.0* left_count) /current_column_total, 1) )+ "%", 
                                (p.get_x() + 0.2, p.get_height()-10),
                                color='black', fontsize=12)



In [141]:

    
g=sns.boxplot(x='department', y='satisfaction_level', data=hr_data) #Draw a box plot to show 
#distributions (satisfaction level) with respect to categories (departments). A box plot (or
#box-and-whisker plot) shows the distribution of quantitative data in a way that facilitates 
#comparisons between variables or across levels of a categorical variable. The box shows the 
#quartiles of the dataset while the whiskers extend to show the rest of the distribution, 
#except for points that are determined to be “outliers”. 
for item in g.get_xticklabels():  #rotate the x-axis for better reading 
    item.set_rotation(60) # No'in brackets = degrees in positive direction



In [142]:

    
#Satisfaction level of each department
fig, axs = plt.subplots(figsize=(13, 4))

bar_plot = sns.barplot(x=hr_data.department, y=hr_data.satisfaction_level, 
                       order=get_ordered_group_index(hr_data, 'department', 'satisfaction_level'))



In [143]:

    
department_plt = sns.countplot(hr_data.salary, order = hr_data.salary.value_counts().index);

for p in department_plt.patches:
    department_plt.annotate(str( int(p.get_height()) ) + "\n" + str(round( 
                (100.0* p.get_height()) /hr_data.salary.count(), 1) )+ "%", 
                                (p.get_x() + 0.3, p.get_height()-800),
                                color='white', fontsize=12, fontweight='bold')



In [144]:

    
g=sns.countplot(x='salary', hue='left', data=hr_data) #shows the number of 
#observations in each categorical bin using bars; 
#here: the dependecy of staying from the catagories of salary



In [145]:

    
sns.boxplot(x='salary', y='satisfaction_level', data=hr_data); #satisfaction level referring to salary;
#again <;> is hiding the internal and irrelevant processing step



In [146]:

    
fig, axs = plt.subplots(nrows=3, figsize=(13, 4))

sns.kdeplot(employees_left.average_montly_hours, ax=axs[0], shade=True, color="r") #tool in seaborn for 
#examining univariate and bivariate distributions
kde_plot = sns.kdeplot(employees_stayed.average_montly_hours, ax=axs[0], shade=True, color="g")
kde_plot.legend(labels=['Left', 'Stayed'])

hist_plot = sns.distplot(hr_data.average_montly_hours, ax=axs[1])#plot a univariate distribution
box_plot = sns.boxplot(hr_data.average_montly_hours, ax=axs[2]) #Draw a box plot to show distributions

kde_plot.set(xlim=(0,350)) #set or query x-axis limits
hist_plot.set(xlim=(0,350))
box_plot.set(xlim=(0,350));



In [147]:

    
#Number of Years Working for the Company
fig, axs = plt.subplots(nrows= 3, figsize=(13, 5))

sns.kdeplot(employees_left.time_spend_company, ax=axs[0], shade=True, color="r")
kde_plot = sns.kdeplot(employees_stayed.time_spend_company, ax=axs[0], shade=True, color="g")
kde_plot.legend(labels=['Left', 'Stayed'])

hist_plot = sns.distplot(hr_data.time_spend_company, ax=axs[1], kde=False)
box_plot = sns.boxplot(hr_data.time_spend_company, ax=axs[2])

kde_plot.set(xlim=(0,12))
hist_plot.set(xlim=(0,12))
box_plot.set(xlim=(0,12));



In [148]:

    
#How Hard does Each Department Work?
fig, axs = plt.subplots(figsize=(13, 4))

bar_plot = sns.barplot(x=hr_data.department, y=hr_data.average_montly_hours, 
                       order=get_ordered_group_index(hr_data, 'department', 'average_montly_hours') )



In [149]:

    
g=sns.factorplot(x='number_project', y='last_evaluation', hue='department', data=hr_data,size=8, aspect=1) 
#multiple graphs on the same plot in seaborn with factorplot, hue==colors in the legend, size(extension 
#of x-axis) and aspect ratio for better distinction (extension of y-axis)
#more information about the plot can be found: http://seaborn.pydata.org/generated/seaborn.factorplot.html



In [150]:

    
#showing the satisfaction level against the completed number of projects 
sns.boxplot(x='number_project', y='satisfaction_level', data=hr_data_new);
#again <;> is hiding the internal and irrelevant processing step
#more information about the boxplot can be found in the seaborn-documentation:
#https://seaborn.pydata.org/generated/seaborn.boxplot.html



In [151]:

    
fig, axs = plt.subplots(nrows= 3, figsize=(13, 5))

sns.kdeplot(employees_left.number_project, ax=axs[0], shade=True, color="r")
kde_plot = sns.kdeplot(employees_stayed.number_project, ax=axs[0], shade=True, color="g")
kde_plot.legend(labels=['Left', 'Stayed'])

hist_plot = sns.distplot(hr_data.number_project, ax=axs[1], kde=False)
box_plot = sns.boxplot(hr_data.number_project, ax=axs[2])

kde_plot.set(xlim=(0,8))
hist_plot.set(xlim=(0,8))
box_plot.set(xlim=(0,8));



In [152]:

    
#How Many Projects are Assigned on Average per Employee?
fig, axs = plt.subplots(figsize=(13, 4))

bar_plot = sns.barplot(x=hr_data.department, y=hr_data.number_project, 
                       order=get_ordered_group_index(hr_data,'department', 'number_project'))



In [153]:

    
#General Overview about salaries
fig, axs = plt.subplots(figsize=(13, 4))

axe_name_order = hr_data.salary.value_counts().index

salary_plt = sns.countplot(hr_data.salary, order = axe_name_order, color='g');
sns.countplot(employees_left.salary, order = axe_name_order, color='r');

salary_plt.legend(labels=['Stayed', 'Left'])

#Annotate the percentages of those who stayed. It was more straightforward to loop for each 
#category (left, stayed) than doing all the work in one loop. The zip creates an output that 
#is equal to the shortest parameter, so we do not need to adjust the patches length, since
#the loop will stop after finishing the columns of those who stayed
for p, current_column in zip(salary_plt.patches, axe_name_order):
    current_column_total = hr_data[hr_data['salary'] == current_column].salary.count()
    stayed_count = p.get_height() - employees_left[employees_left['salary'] == current_column].salary.count()
    salary_plt.annotate(str(round( (100.0* stayed_count) /current_column_total, 1) )+ "%", 
                                (p.get_x() + 0.35, p.get_height()-10),
                                color='black', fontsize=12)
    
#In this loop, we want to use the patches located on the second half of patches list, which are the 
#bars for those who left.
for p, current_column in zip(salary_plt.patches[int(len(salary_plt.patches)/2):], axe_name_order):
    current_column_total = hr_data[hr_data['salary'] == current_column].salary.count()
    left_count = p.get_height()
    salary_plt.annotate(str(round( (100.0* left_count) /current_column_total, 1) )+ "%", 
                                (p.get_x() + 0.35, p.get_height()-10),
                                color='black', fontsize=12)



In [154]:

    
timeplot = sns.factorplot(x='time_spend_company', hue='left', y='department', row='salary', 
                          data=hr_data, aspect=2)
#the factorplot draws a categorical plot onto a FacetGrid; It is possible to make rather complex 
#plots using this function more information about parameters and using factorplots can be found in 
#documntation: https://seaborn.pydata.org/generated/seaborn.factorplot.html



In [155]:

    
fig, axs = plt.subplots(figsize=(16, 4))

sns.stripplot(y = 'salary', x='average_montly_hours', hue='left', data=hr_data);



In [156]:

    
#Promotions in the departments
fig, axs = plt.subplots(figsize=(13, 4))

bar_plot = sns.barplot(x=hr_data.department, y=hr_data.promotion_last_5years, 
                       order=get_ordered_group_index(hr_data, 'department', 'promotion_last_5years'))



In [157]:

    
#Evaluation of the Management Department 
fig, axs = plt.subplots(figsize=(13, 4))

bar_plot = sns.barplot(x=hr_data.department, y=hr_data.last_evaluation, 
                       order=get_ordered_group_index(hr_data,'department', 'last_evaluation'))



In [158]:

    
#Maybe they Stayed Longer?

fig, axs = plt.subplots(figsize=(13, 4))

bar_plot = sns.barplot(x=hr_data.department, y=hr_data.time_spend_company, 
                       order=get_ordered_group_index(hr_data, 'department', 'time_spend_company'))



In [159]:

    
fig, axs = plt.subplots(ncols= 2, figsize=(13, 5)) #set the number and size of the diagram spaces

work_accidents_plt = sns.countplot(hr_data.Work_accident, ax=axs[0]);
annotate_bars(bar_plt=work_accidents_plt, bar_plt_var=hr_data.Work_accident, 
              x_offset=0.3, y_offset=1100)
    
bar_plot = sns.countplot(x=hr_data.Work_accident, hue=hr_data.left, ax=axs[1])
annotate_bars(bar_plt=bar_plot, by=hr_data.Work_accident, bar_plt_var=hr_data.Work_accident, 
              x_offset=0.1, txt_color="black")
bar_plot.set(ylim=(0,14000));



In [160]:

    
#Work Related Accidents
fig, axs = plt.subplots(figsize=(13, 4))

bar_plot = sns.barplot(x=hr_data.department, y=hr_data.Work_accident, 
                       order=get_ordered_group_index(hr_data, 'department', 'Work_accident'))



In [161]:

    
accidentplot = plt.figure(figsize=(10,6)) #advanced plotting with figure module for subplotting; 
#figsize-->w,h tuple in inches constructs the plotting area
#more information about the figure module can be found at: https://matplotlib.org/api/figure_api.html
accidentplotax = accidentplot.add_axes([0,0,1,1]) #Add an axes at position[left, bottom, width, height] 
#where all quantities are in fractions of figure width and height.
accidentplotax = sns.violinplot(x='department', y='average_montly_hours', 
                                hue='Work_accident', split=True, data = hr_data, jitter = 0.47)
#A violin plot plays a similar role as a box and whisker plot. It shows the distribution of 
#quantitative data across several levels of one (or more) categorical variables such that those 
#distributions can be compared. 
#more information about parameters and using violinplots can be found in documntation:
#https://seaborn.pydata.org/generated/seaborn.violinplot.html



In [162]:

    
satisaccident = plt.figure(figsize=(10,6)) #we set the overall size of the diagram to 10x6 inches
satisaccidentax = satisaccident.add_axes([0,0,1,1]) #We scale the plot to the axes 

satisaccidentax = sns.violinplot(x='left', hue='Work_accident', y='satisfaction_level', 
                                 split=True, data=hr_data)
#We plot the entries in work accident with different colors (hue) and show the two different values in 
#one plot for the same argument.



In [163]:

    
#A function to bin the average monthly hours into the categories described above
def work_load_cat(avg_mnthly_hrs):
    work_load = "unknown"
    if avg_mnthly_hrs < 168:
        work_load = "low"
    elif (avg_mnthly_hrs >= 168) & (avg_mnthly_hrs < 210):
        work_load = "average"
    elif (avg_mnthly_hrs >= 210) & (avg_mnthly_hrs < 252):
        work_load = "above_average"
    elif avg_mnthly_hrs >= 252:
        work_load = "workoholic"
        
    return work_load



In [164]:

    
hr_data['work_load'] = hr_data.average_montly_hours.apply(work_load_cat)

sns.countplot(x='work_load', hue='left', data=hr_data, order = ['low', 'average', 'above_average', 'workoholic']);



In [165]:

    
#Normalised stacked
departments = list(set(hr_data.department.values))
number_of_departments = len(departments)

fig, axs = plt.subplots(nrows= int(number_of_departments/2), ncols=2, figsize=(13, 20))

for i in range(number_of_departments):
    current_dep = departments[i]
    
    ratio_df = 100*hr_data[hr_data.department == current_dep].groupby(['work_load', 'left']).agg(
        {'work_load': 'count'})/hr_data[hr_data.department == current_dep].groupby(['work_load']).agg(
        {'work_load': 'count'})
    ratio_df = ratio_df.reindex_axis(["low", "average", "above_average", "workoholic"], axis=0, level=0)
    #plot the department
    ratio_df.unstack().plot(kind='area',stacked=True, colormap= 'Spectral', ax=axs[int(i/2),i%2])
    axs[int(i/2),i%2].set_title(current_dep)
    axs[int(i/2),i%2].set_xlabel("")
    
axs[int(i/2),i%2].set_xlabel("work_load")
plt.subplots_adjust(hspace=0.3);



In [166]:

    
#Understanding how the Company Evaluates its Employees
#A function to bin last evaluation into one of 5 categories
def last_evaluation_cat(last_evaluation):
    evaluation = "unknown"
    if last_evaluation < 0.45:
        evaluation = "very_low"
    elif (last_evaluation >= 0.45) & (last_evaluation < 0.55):
        evaluation = "mediocre"
    elif (last_evaluation >= 0.55) & (last_evaluation < 0.8):
        evaluation = "average"
    elif (last_evaluation >= 0.8) & (last_evaluation < 0.9):
        evaluation = "very_good"
    elif last_evaluation >= 0.9:
        evaluation = "excellent"
        
    return evaluation



In [167]:

    
hr_data['evaluation'] = hr_data.last_evaluation.apply(last_evaluation_cat)



In [168]:

    
sns.countplot(x='evaluation',  data=hr_data, order = ["unknown", 'very_low', 'mediocre', 
                                                      'average', 'very_good', 'excellent']);



In [169]:

    
sns.countplot(x='evaluation',  hue = 'left', data=hr_data, 
              order = ["unknown", 'very_low', 'mediocre', 'average', 'very_good', 'excellent']);



In [170]:

    
evaluation_index_order = ["unknown", 'very_low', 'mediocre', 'average', 'very_good', 'excellent']
evaluation_xticks = ['Very Low\n (eval < .45)', 'Mediocre\n ( .45 < eval < .55 )', 
                     'Average\n ( .55 < eval < .8 )', 'Very Good\n ( .8 < eval < .9 )', 
                     'Excellent\n ( .9 < eval)']
evaluation_x_label = "Company Evaluation for the Employee"



In [171]:

    
#A function to bin the average monthly hours into the categories described above
def work_load_cat(avg_mnthly_hrs):
    work_load = "unknown"
    if avg_mnthly_hrs < 168:
        work_load = "low"
    elif (avg_mnthly_hrs >= 168) & (avg_mnthly_hrs < 210):
        work_load = "average"
    elif (avg_mnthly_hrs >= 210) & (avg_mnthly_hrs < 252):
        work_load = "above_average"
    elif avg_mnthly_hrs >= 252:
        work_load = "workoholic"
        
    return work_load



In [172]:

    
#Categories Makeup in Terms of Working Hours
employees_by_eval_and_workload = group_by_2_level_perc(hr_data, 
                                                       'evaluation', 'work_load',
                                                       evaluation_index_order, 
                                                       ['low','average','above_average', 'workoholic'])#Index Order

workload_legend = ['Low Workload (< 40hrs/week)', 'Average Workload (40 < wl < 50 hrs/week)',
                   'Above Average Workload (50 < wl < 60hrs/week)', 
                   'Workoholic Workload (wl > 60hrs/week)']

#Plot the Graph
p=employees_by_eval_and_workload.unstack().plot(kind='area',stacked=True, colormap= 'Spectral', 
                                                figsize=(15, 6), zorder=0)

customise_2lvl_perc_area_graph(p, workload_legend, 
                               xtick_label=evaluation_xticks, x_label=evaluation_x_label, 
                               y_label="Percentage of Monthly Workload")



In [173]:

    
#Number of Years with the Company
employees_by_eval_and_time_in_company_perc = group_by_2_level_perc(hr_data, 
                                                                   'evaluation', 
                                                                   'time_spend_company',
                                                                   evaluation_index_order)

#Plot the Graph
p=employees_by_eval_and_time_in_company_perc.unstack().plot(kind='area',stacked=True, 
                                                            colormap= 'Spectral', 
                                                            figsize=(15, 6), zorder=0)

time_spent_legend = [str(x) + " years" for x in range(2,9)] + ['10 years']

customise_2lvl_perc_area_graph(p, time_spent_legend, xtick_label=evaluation_xticks, 
                               x_label=evaluation_x_label, #Company Evaluation Graph
                               y_label="Percentage of Years in Company")



In [174]:

    
#Number of Projects
employees_by_eval_and_time_in_company_perc = group_by_2_level_perc(hr_data, 
                                                                   'evaluation', 'number_project',
                                                                   evaluation_index_order)

#Plot the Graph
p=employees_by_eval_and_time_in_company_perc.unstack().plot(kind='area',stacked=True, colormap= 'Spectral', 
                                                            figsize=(15, 6), zorder=0)

num_projects_legend = [str(x) + " projects" for x in range(2,8)]

customise_2lvl_perc_area_graph(p, num_projects_legend, 
                               xtick_label=evaluation_xticks, x_label=evaluation_x_label, 
                               y_label="Percentage of Number of Projects Assigned")



In [175]:

    
#salary
employees_by_eval_and_salary_perc = group_by_2_level_perc(hr_data, 
                                                          'evaluation', 'salary', 
                                                          evaluation_index_order, 
                                                          ['low', 'medium', 'high'])

#Plot the Graph
p=employees_by_eval_and_salary_perc.unstack().plot(kind='area',stacked=True, colormap= 'Spectral', 
                                                   figsize=(15, 6), zorder=0)

num_projects_legend = ['Low', 'Medium', 'High']

customise_2lvl_perc_area_graph(p, num_projects_legend, 
                               xtick_label=evaluation_xticks, x_label=evaluation_x_label, 
                               y_label="Percentage of Salary Range")



In [176]:

    
#satisfaction level
#Create a satisfaction categories
#Arbitrary boundaries:
# < 4.5 low
# 4.5 < < 7.5 medium
# 7.5 < high
def rank_satisfaction(employee):
    level = "unknown"
    if employee.satisfaction_level < 0.45:
        level='low'
    elif employee.satisfaction_level < 0.75:
        level = 'medium'
    else:
        level = 'high'
    return level



In [177]:

    
hr_data['satisfaction'] = hr_data.apply(rank_satisfaction, axis=1)



In [178]:

    
employees_by_eval_and_satisfaction_perc = group_by_2_level_perc(hr_data, 
                                                                'evaluation', 
                                                                'satisfaction', 
                                                                evaluation_index_order, 
                                                                ['low', 'medium', 'high'])

#Plot the Graph
p=employees_by_eval_and_satisfaction_perc.unstack().plot(kind='area',stacked=True, 
                                                         colormap= 'Spectral', figsize=(15, 6), 
                                                         zorder=0)


satisfaction_lvl_legend = ['Low', 'Medium', 'High']

customise_2lvl_perc_area_graph(p, satisfaction_lvl_legend, 
                               xtick_label=evaluation_xticks, x_label=evaluation_x_label, 
                               y_label="Percentage of Employee's Satisfaction Level")



In [ ]:



In [179]:

    
import sys
print(sys.version)









    



3.5.2 |Anaconda custom (64-bit)| (default, Jul  5 2016, 11:41:13) [MSC v.1900 64 bit (AMD64)]



In [ ]:

	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	left	sales	salary
0	0.38	0.53	2	157	3	1	sales	low
1	0.80	0.86	5	262	6	1	sales	medium
2	0.11	0.88	7	272	4	1	sales	medium
3	0.72	0.87	5	223	5	1	sales	low
4	0.37	0.52	2	159	3	1	sales	low

	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	Work_accident	left	promotion_last_5years
count	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000
mean	0.612834	0.716102	3.803054	201.050337	3.498233	0.144610	0.238083	0.021268
std	0.248631	0.171169	1.232592	49.943099	1.460136	0.351719	0.425924	0.144281
min	0.090000	0.360000	2.000000	96.000000	2.000000	0.000000	0.000000	0.000000
25%	0.440000	0.560000	3.000000	156.000000	3.000000	0.000000	0.000000	0.000000
50%	0.640000	0.720000	4.000000	200.000000	3.000000	0.000000	0.000000	0.000000
75%	0.820000	0.870000	5.000000	245.000000	4.000000	0.000000	0.000000	0.000000
max	1.000000	1.000000	7.000000	310.000000	10.000000	1.000000	1.000000	1.000000

	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	Work_accident	left	promotion_last_5years
count	13542.000000	13535.000000	13487.000000	13461.000000	13461.000000	13499.000000	13511.000000	13496.000000
mean	0.613880	0.716163	3.801661	201.211277	3.494391	0.145122	0.236844	0.021710
std	0.248214	0.170944	1.233175	49.931955	1.457746	0.352237	0.425162	0.145741
min	0.090000	0.360000	2.000000	96.000000	2.000000	0.000000	0.000000	0.000000
25%	0.440000	0.560000	3.000000	156.000000	3.000000	0.000000	0.000000	0.000000
50%	0.650000	0.720000	4.000000	200.000000	3.000000	0.000000	0.000000	0.000000
75%	0.820000	0.870000	5.000000	245.000000	4.000000	0.000000	0.000000	0.000000
max	1.000000	1.000000	7.000000	310.000000	10.000000	1.000000	1.000000	1.000000

	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	left	sales	salary
4	0.37	0.52	2.0	159.0	3.0	1.0	sales	low
5	0.41	0.50	2.0	153.0	3.0	1.0	sales	low
6	0.10	0.77	6.0	247.0	4.0	1.0	sales	low
7	0.92	0.85	5.0	259.0	5.0	1.0	sales	low
8	0.89	1.00	5.0	224.0	5.0	1.0	sales	low

	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	Work_accident	left	promotion_last_5years
count	5252.000000	5252.000000	5252.000000	5252.000000	5252.000000	5252.000000	5252.000000	5252.000000
mean	0.615773	0.714254	3.790556	200.684501	3.476009	0.140137	0.230769	0.019992
std	0.248805	0.172041	1.222808	50.128170	1.426681	0.347162	0.421365	0.139987
min	0.090000	0.360000	2.000000	96.000000	2.000000	0.000000	0.000000	0.000000
25%	0.440000	0.560000	3.000000	156.000000	3.000000	0.000000	0.000000	0.000000
50%	0.650000	0.720000	4.000000	199.000000	3.000000	0.000000	0.000000	0.000000
75%	0.820000	0.870000	5.000000	245.000000	4.000000	0.000000	0.000000	0.000000
max	1.000000	1.000000	7.000000	310.000000	10.000000	1.000000	1.000000	1.000000

left	0	1
Department
IT	0.777506	0.222494
RandD	0.846252	0.153748
accounting	0.734029	0.265971
hr	0.709066	0.290934
management	0.855556	0.144444
marketing	0.763403	0.236597
product_mng	0.780488	0.219512
sales	0.755072	0.244928
support	0.751009	0.248991
technical	0.743750	0.256250

	Promoted	Not Promoted	Total, By Left
Stayed	300	11128	11428
Left	19	3552	3571
Total, by Promotion	319	14680	14999