In [112]:
#The HR Analytics dateset from kaggle competitions 
#Import of  necessary libraries, Modules and classifiers
import numpy as np  #fundamental package for scientific computing with Python
import pandas as pd #package providing fast, flexible, and expressive data structures
import matplotlib.pyplot as plt #for plotting different kinds of diagrams
#commands in cells below the cell that outputs a plot will not affect the plot inline command 
#(commentation on the same line causes an error):
%matplotlib inline 
import seaborn as sns #visualization library based on matplotlib, for statistical data visualization
from IPython.display import display_html
import scipy.stats as sp
from scipy.stats.stats import pearsonr

hr_data=pd.read_csv('.\HR_comma_sep.csv',header=0) #read the data from a csv-file; ensure that the  v
#alues are separated by commas otherwise you need to specify the delimiter explicitly within the load-statement

hr_data_copy=hr_data.copy()  #create a deep copy of the data set for demonstrating how to handle missing values (mv)

hr_data.head() #show the first five entries; attribute in brackets will give the # of printed lines


Out[112]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years sales salary
0 0.38 0.53 2 157 3 0 1 0 sales low
1 0.80 0.86 5 262 6 0 1 0 sales medium
2 0.11 0.88 7 272 4 0 1 0 sales medium
3 0.72 0.87 5 223 5 0 1 0 sales low
4 0.37 0.52 2 159 3 0 1 0 sales low

In [113]:
###We define small helper functions (Code from E-M-A-D; https://www.kaggle.com/etakla)
#A function for annotating the bars with its total and relative number. 
def annotate_bars(bar_plt, bar_plt_var, by=None, x_offset=0, y_offset=0, txt_color="white", 
                  fnt_size=12, fnt_weight='bold'):
    if by is None:
        for p in bar_plt.patches:
            bar_plt.annotate(str( int(p.get_height()) ) + "\n" + str(round( 
                        (100.0* p.get_height()) /bar_plt_var.count(), 1) )
                             + "%", 
                             (p.get_x() + x_offset, p.get_height()-y_offset),
                             color=txt_color, fontsize=fnt_size, fontweight=fnt_weight)
    else:
        grouped = bar_plt_var.groupby(by)
        for p in bar_plt.patches:            
            #This part is tricky. The problem is that not each x-tick gets drawn in order, 
            #i.e. yes/no of the first group then yes/no of the second group located on the 
            #next tick, but rather all the yes on all the x-ticks get drawn first then all 
            #the nos next. So we need to know we are using a patch that belongs to which 
            #tick (the x-tick) ultimately refers to one of the groups. So, we get the x absolute 
            #coordinate, round it to know this patch is closest to which tick (Assuming that it 
            #will always belong to its closest tick), then get the group count of that tick and 
            #use it as a total to compute the percentage.
            total = grouped.get_group(bar_plot.get_xticks()[int(round(p.get_x()))]).count()
            bar_plt.annotate(str( int(p.get_height()) ) + "\n" + str(round( (100.0* 
                                                                             p.get_height()) /total, 1) )+ "%", 
                             (p.get_x() + x_offset, p.get_height()-y_offset),
                             color=txt_color, fontsize=fnt_size, fontweight=fnt_weight)
            
#A function that returns the order of a group_by object according to the average of certain parameter param.
def get_ordered_group_index(df, group_by, param, ascending=False):
    return df.groupby(group_by)[param].mean().sort_values(ascending=ascending).index

#helper function that returns the order of a group_by object according to the average of certain parameter param.
def group_by_2_level_perc(df, level1, level2, level1_index_order = None, level2_index_order = None):
    #http://stackoverflow.com/questions/23377108/pandas-percentage-of-total-with-groupby
    df_by_lvl1_lvl2 = df.groupby([level1, level2]).agg({level1: 'count'})
    df_by_lvl1_lvl2_perc = df_by_lvl1_lvl2.groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))
    #Reorder them in logical ascending order, but first make sure it is not an empty input
    if level1_index_order:
        df_by_lvl1_lvl2_perc = df_by_lvl1_lvl2_perc.reindex_axis(level1_index_order, axis=0, level=0)
    #If a second level order is passed, apply it, else use the default
    if level2_index_order:
        df_by_lvl1_lvl2_perc = df_by_lvl1_lvl2_perc.reindex_axis(level2_index_order, axis=0, level=1)
    return df_by_lvl1_lvl2_perc

#A function that adds some styling to the graphs, like custom ticks for axes, axes labels and a grid
def customise_2lvl_perc_area_graph(p, legend_lst, xtick_label = "", x_label="", y_label=""):
    #If custom ticks are passed, spread them on the axe and write the tick values
    if xtick_label:
        p.set_xticks(range(0,len(xtick_label)))
        p.set_xticklabels(xtick_label)
    #Create y ticks for grid. It will always be a percentage, so it is not customisable
    p.set_yticks(range(0,110,10)) 
    p.set_yticklabels(['{:3.0f}%'.format(x) for x in range(0,110,10)])
    p.set_yticks(range(0,110,5), minor=True) 

    #Draw grid and set y limit to be only 100 (By default it had an empty area at the top of the graph)
    p.xaxis.grid('on', which='major', zorder=1, color='gray', linestyle='dashed')
    p.yaxis.grid('on', which='major', zorder=1, color='gray', alpha=0.2)
    p.yaxis.grid('on', which='minor', zorder=1, color='gray', linestyle='dashed', alpha=0.2)
    p.set(ylim=(0,100))

    #Customise legend
    p.legend(labels=legend_lst, frameon=True).get_frame().set_alpha(0.2)

    #Put the axes labels
    if x_label:
        p.set_xlabel(x_label)
    if y_label:
        p.set_ylabel(y_label);

In [114]:
hr_data.info() #attribut specifications


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
satisfaction_level       14999 non-null float64
last_evaluation          14999 non-null float64
number_project           14999 non-null int64
average_montly_hours     14999 non-null int64
time_spend_company       14999 non-null int64
Work_accident            14999 non-null int64
left                     14999 non-null int64
promotion_last_5years    14999 non-null int64
sales                    14999 non-null object
salary                   14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB

In [115]:
hr_data.describe()  # show some statistics about the attributes of the data


Out[115]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years
count 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000
mean 0.612834 0.716102 3.803054 201.050337 3.498233 0.144610 0.238083 0.021268
std 0.248631 0.171169 1.232592 49.943099 1.460136 0.351719 0.425924 0.144281
min 0.090000 0.360000 2.000000 96.000000 2.000000 0.000000 0.000000 0.000000
25% 0.440000 0.560000 3.000000 156.000000 3.000000 0.000000 0.000000 0.000000
50% 0.640000 0.720000 4.000000 200.000000 3.000000 0.000000 0.000000 0.000000
75% 0.820000 0.870000 5.000000 245.000000 4.000000 0.000000 0.000000 0.000000
max 1.000000 1.000000 7.000000 310.000000 10.000000 1.000000 1.000000 1.000000

In [116]:
#To observe the distribution of satisfaction level among the employees we generate metrics 
#of skewness and plot the histogram of the satisfaction level.
print ("Skew is:", hr_data.satisfaction_level.skew()) #*.skew() shows tendency (0=no skewness, (-)=left skewed)

plt.hist(hr_data.satisfaction_level, color='blue',bins=5) #plot the histogram
#plt.hist needs argument "data" in form of a 1d numpy array
#we adress columns as numpy arrays by just adding their name to the data frame (e.g. df.variable_name).
#we choose the parameter color to be blue --> blue histogram and bins=5 to fit the data to 5 pillars.

plt.xlabel('satisfaction_level') #naming the x-axis
plt.ylabel('Staff') #labeling the y-axis
plt.title('Satisfaction Distribution') 
 
plt.show() #display the plot


Skew is: -0.476360341284

In [117]:
if(not hr_data.isnull().values.any()):  #Checking for NaN-values  
    #If the iterable is empty, it returns False, True if any element of the iterable is true.  
    print('QC (Y): Dataset does not contain missing values')
else:
    print('QC (N): Dataset contains missing values')


QC (Y): Dataset does not contain missing values

In [118]:
#for showing how to handle missing values(mv), we randomly create some NaN-values inside the dataset 
import random
ix = [(row, col) for row in range(hr_data_copy.shape[0]) for col in range(hr_data_copy.shape[1])]
for row, col in random.sample(ix, int(round(.1*len(ix)))):   #for a 10% random sample in the sequenz ix
       hr_data_copy.iloc[row, col] = np.nan                  #entries replaced by NaN-values

In [119]:
hr_data_copy.head() #show the first five entries again with NaN-entries


Out[119]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years sales salary
0 0.38 NaN 2.0 NaN NaN 0.0 1.0 0.0 sales low
1 NaN 0.86 5.0 262.0 6.0 0.0 NaN 0.0 sales medium
2 0.11 0.88 7.0 272.0 NaN 0.0 1.0 0.0 sales medium
3 0.72 0.87 NaN 223.0 5.0 0.0 1.0 0.0 sales low
4 0.37 0.52 2.0 159.0 3.0 0.0 1.0 0.0 sales low

In [120]:
hr_data_copy.describe() #just to show the mv-effect, now less entries, because of mv


Out[120]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years
count 13542.000000 13535.000000 13487.000000 13461.000000 13461.000000 13499.000000 13511.000000 13496.000000
mean 0.613880 0.716163 3.801661 201.211277 3.494391 0.145122 0.236844 0.021710
std 0.248214 0.170944 1.233175 49.931955 1.457746 0.352237 0.425162 0.145741
min 0.090000 0.360000 2.000000 96.000000 2.000000 0.000000 0.000000 0.000000
25% 0.440000 0.560000 3.000000 156.000000 3.000000 0.000000 0.000000 0.000000
50% 0.650000 0.720000 4.000000 200.000000 3.000000 0.000000 0.000000 0.000000
75% 0.820000 0.870000 5.000000 245.000000 4.000000 0.000000 0.000000 0.000000
max 1.000000 1.000000 7.000000 310.000000 10.000000 1.000000 1.000000 1.000000

In [121]:
##there are 3 possibilities for handling missing values(mv): 
#1.to ignore  #2.to impute(fill)  3.to drop the datapoint with mv

In [122]:
##2.way - impute
#imputation strategies:
#  “mean” to replace missing values using the mean along the axis.
#  “median” to replace missing values using the median along the axis.
#  “most_frequent” to replace missing using the most frequent value along the axis.
hr_no_missing_f=hr_data_copy.fillna(hr_data_copy.iloc[:10].median()) 
hr_no_missing_f.describe()


Out[122]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years
count 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000
mean 0.594075 0.729226 3.922461 203.548037 3.597507 0.130609 0.312554 0.019535
std 0.243456 0.167174 1.223760 47.805067 1.414276 0.336983 0.463550 0.138399
min 0.090000 0.360000 2.000000 96.000000 2.000000 0.000000 0.000000 0.000000
25% 0.410000 0.570000 3.000000 159.000000 3.000000 0.000000 0.000000 0.000000
50% 0.600000 0.750000 4.000000 212.000000 3.000000 0.000000 0.000000 0.000000
75% 0.800000 0.850000 5.000000 241.000000 4.500000 0.000000 1.000000 0.000000
max 1.000000 1.000000 7.000000 310.000000 10.000000 1.000000 1.000000 1.000000

In [123]:
#3.way - drop mv's
hr_no_missing_d = hr_data_copy.dropna()
hr_no_missing_d.head()


Out[123]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years sales salary
4 0.37 0.52 2.0 159.0 3.0 0.0 1.0 0.0 sales low
5 0.41 0.50 2.0 153.0 3.0 0.0 1.0 0.0 sales low
6 0.10 0.77 6.0 247.0 4.0 0.0 1.0 0.0 sales low
7 0.92 0.85 5.0 259.0 5.0 0.0 1.0 0.0 sales low
8 0.89 1.00 5.0 224.0 5.0 0.0 1.0 0.0 sales low

In [124]:
hr_no_missing_d.describe()


Out[124]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years
count 5252.000000 5252.000000 5252.000000 5252.000000 5252.000000 5252.000000 5252.000000 5252.000000
mean 0.615773 0.714254 3.790556 200.684501 3.476009 0.140137 0.230769 0.019992
std 0.248805 0.172041 1.222808 50.128170 1.426681 0.347162 0.421365 0.139987
min 0.090000 0.360000 2.000000 96.000000 2.000000 0.000000 0.000000 0.000000
25% 0.440000 0.560000 3.000000 156.000000 3.000000 0.000000 0.000000 0.000000
50% 0.650000 0.720000 4.000000 199.000000 3.000000 0.000000 0.000000 0.000000
75% 0.820000 0.870000 5.000000 245.000000 4.000000 0.000000 0.000000 0.000000
max 1.000000 1.000000 7.000000 310.000000 10.000000 1.000000 1.000000 1.000000

In [125]:
hr_data.info() #attribut specifications, shows the datatype-information about the attributes


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
satisfaction_level       14999 non-null float64
last_evaluation          14999 non-null float64
number_project           14999 non-null int64
average_montly_hours     14999 non-null int64
time_spend_company       14999 non-null int64
Work_accident            14999 non-null int64
left                     14999 non-null int64
promotion_last_5years    14999 non-null int64
sales                    14999 non-null object
salary                   14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB

In [126]:
print('Departments: ', ', '.join(hr_data['sales'].unique())) #show and join the unique entries in sales
#with the more precise description "departments"
print('Salary levels: ', ', '.join(hr_data['salary'].unique())) #show and join the unique entries in 
#salery with a descriptive level


Departments:  sales, accounting, hr, technical, support, management, IT, product_mng, marketing, RandD
Salary levels:  low, medium, high

In [127]:
hr_data.rename(columns={'sales':'department'}, inplace=True) #rename Column, note: you do need to 
#specify the existing label first followed by the new label to rename it to afterward 
hr_data_new = pd.get_dummies(hr_data, ['department', 'salary'] ,drop_first = True) #Whether to get k-1 
#dummies variables out of k categorical by removing the first level. New in Pandas version 0.18.0.
hr_data_new.head()


Out[127]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years department_RandD department_accounting department_hr department_management department_marketing department_product_mng department_sales department_support department_technical salary_low salary_medium
0 0.38 0.53 2 157 3 0 1 0 0 0 0 0 0 0 1 0 0 1 0
1 0.80 0.86 5 262 6 0 1 0 0 0 0 0 0 0 1 0 0 0 1
2 0.11 0.88 7 272 4 0 1 0 0 0 0 0 0 0 1 0 0 0 1
3 0.72 0.87 5 223 5 0 1 0 0 0 0 0 0 0 1 0 0 1 0
4 0.37 0.52 2 159 3 0 1 0 0 0 0 0 0 0 1 0 0 1 0

In [128]:
'''Observe that "IT" and "high" are the baseline levels for the assigned department and salary level, 
respectively. Also note that we saved the data with dummy variables as another dataframe in case we need to 
access the string values, such as for a cross-tabulation table.'''

hr_data_new.describe() #Generates descriptive statistics that summarize the central tendency, 
#dispersion and shape of a dataset’s distribution.


Out[128]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years department_RandD department_accounting department_hr department_management department_marketing department_product_mng department_sales department_support department_technical salary_low salary_medium
count 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000 14999.000000
mean 0.612834 0.716102 3.803054 201.050337 3.498233 0.144610 0.238083 0.021268 0.052470 0.051137 0.049270 0.042003 0.057204 0.060137 0.276018 0.148610 0.181345 0.487766 0.429762
std 0.248631 0.171169 1.232592 49.943099 1.460136 0.351719 0.425924 0.144281 0.222981 0.220284 0.216438 0.200602 0.232239 0.237749 0.447041 0.355715 0.385317 0.499867 0.495059
min 0.090000 0.360000 2.000000 96.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.440000 0.560000 3.000000 156.000000 3.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 0.640000 0.720000 4.000000 200.000000 3.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 0.820000 0.870000 5.000000 245.000000 4.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 1.000000 1.000000
max 1.000000 1.000000 7.000000 310.000000 10.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

In [129]:
#Employees who Left their Jobs in sum
employees_left_plt = sns.countplot(hr_data.left);

for p in employees_left_plt.patches:
    employees_left_plt.annotate(str( int(p.get_height()) ) + "\n" + str(round( (100.0* p.get_height()) 
                                                                              /hr_data.left.count(), 1) )+ "%", 
                                (p.get_x() + 0.3, p.get_height()-1100),
                                color='white', fontsize=12, fontweight='bold')



In [130]:
#proportion of leaving and staying in the different departments: 
dept_table = pd.crosstab(hr_data['department'], hr_data['left'],normalize='index')
#We created a cross tabulation of columns left and department, the normalize parameter is
#dividing all values by the sum of values.
#parameter list for pandas.crosstab can be found in pandas documentation:
#https://pandas.pydata.org/pandas-docs/stable/generated/pandas.crosstab.html
dept_table.index.names = ['Department'] #naming of the index column
dept_table #print the cross tab


Out[130]:
left 0 1
Department
IT 0.777506 0.222494
RandD 0.846252 0.153748
accounting 0.734029 0.265971
hr 0.709066 0.290934
management 0.855556 0.144444
marketing 0.763403 0.236597
product_mng 0.780488 0.219512
sales 0.755072 0.244928
support 0.751009 0.248991
technical 0.743750 0.256250

In [131]:
g=sns.countplot(x='department', hue='left', data=hr_data) #Show the number of observations 
#in each categorical bin of 'department' using bars.
for item in g.get_xticklabels():  #for every entry in x rotate the x-axis for better reading 
    item.set_rotation(60)         # No'in brackets = degrees in positive direction



In [132]:
# Correlation matrix is used to do some basic visualizations and show any relationships in the data.
sns.heatmap(hr_data.corr(), annot=True,fmt='.2f');  #compute pairwise correlation of columns, 
#excluding NA/null values; annot=True presents heatmap with values, the format-configuration 
#makes it better to read (2 decimal places), <;> is hiding the processing steps



In [133]:
plt.figure(figsize=(10, 10)) #figure module, which contains all the plot elements
sns.pairplot(hr_data,  hue="left"); #distinguish the left feature


<matplotlib.figure.Figure at 0x1d804b6ee80>

In [134]:
#Univariate Analysis
hr_by_left = hr_data.groupby('left')  #determine the groups on each value of the object’s index
employees_left = hr_by_left.get_group(1) #determine and store the group that left
employees_stayed = hr_by_left.get_group(0) #determine and store the group that stayed

In [135]:
#distribution of Satisfaction Level
fig, axs = plt.subplots(nrows= 3, figsize=(13, 5))

sns.kdeplot(employees_left.satisfaction_level, ax=axs[0], shade=True, color="r")
kde_plot = sns.kdeplot(employees_stayed.satisfaction_level, ax=axs[0], shade=True, color="g")
kde_plot.legend(labels=['Left', 'Stayed'])

hist_plot = sns.distplot(hr_data.satisfaction_level, ax=axs[1])
box_plot = sns.boxplot(hr_data.satisfaction_level, ax=axs[2])

kde_plot.set(xlim=(0,1.1))
hist_plot.set(xlim=(0,1.1))
box_plot.set(xlim=(0,1.1));



In [136]:
#Promotion within the Past 5 Years
fig, axs = plt.subplots(ncols= 2, figsize=(13, 5))

promoted_5years_plt = sns.countplot(hr_data.promotion_last_5years, ax=axs[0]);
annotate_bars(bar_plt=promoted_5years_plt, bar_plt_var=hr_data.promotion_last_5years, 
              x_offset=0.3, txt_color="black")
    
bar_plot = sns.countplot(x=hr_data.promotion_last_5years, 
                         hue=hr_data.left, ax=axs[1])
annotate_bars(bar_plt=bar_plot, by=hr_data.promotion_last_5years, 
              bar_plt_var=hr_data.promotion_last_5years, 
              x_offset=0.1, txt_color="black")
bar_plot.set(ylim=(0,16000));



In [137]:
#Create groups
employees_by_promotion = hr_data.groupby("promotion_last_5years")
employees_promoted = employees_by_promotion.get_group(1)
employees_not_promoted = employees_by_promotion.get_group(0)

#Get counts
employees_promoted_stayed = employees_promoted.groupby("left").get_group(0).left.count()
employees_promoted_left = employees_promoted.groupby("left").get_group(1).left.count()

employees_not_promoted_stayed = employees_not_promoted.groupby("left").get_group(0).left.count()
employees_not_promoted_left = employees_not_promoted.groupby("left").get_group(1).left.count()

#Create rows that makeup the contingency table
promoted_row = [employees_promoted_stayed, employees_promoted_left, 
                employees_promoted_stayed + employees_promoted_left]
not_promoted_row = [employees_not_promoted_stayed, employees_not_promoted_left, 
                    employees_not_promoted_stayed + employees_not_promoted_left]
total_row = [employees_promoted_stayed+employees_not_promoted_stayed,
             employees_promoted_left+employees_not_promoted_left,
             hr_data.left.count()]

#Create the contingency table
contingency_table = pd.DataFrame({'Promoted': promoted_row ,
                                  'Not Promoted': not_promoted_row ,
                                  'Total, By Left': total_row},
                                 index = ['Stayed', 'Left', 'Total, by Promotion'], 
                                 columns = [ 'Promoted', 'Not Promoted', 'Total, By Left'])

display_html(contingency_table)


Promoted Not Promoted Total, By Left
Stayed 300 11128 11428
Left 19 3552 3571
Total, by Promotion 319 14680 14999

In [138]:
chi_squared, p, degrees_of_freedom, expected_frequency = sp.chi2_contingency( contingency_table )

print("Chi Squared: ", chi_squared)
print("p value: ", p)
print("Degrees of Freedom", degrees_of_freedom)
print("Expected Frequency for The Not Promoted Employees:", expected_frequency[0])
print("Expected Frequency for The Promoted Employees:", expected_frequency[1])


Chi Squared:  57.2627339495
p value:  1.08970012479e-11
Degrees of Freedom 4
Expected Frequency for The Not Promoted Employees: [   243.05167011  11184.94832989  11428.        ]
Expected Frequency for The Promoted Employees: [   75.94832989  3495.05167011  3571.        ]

In [139]:
fig, axs = plt.subplots(figsize=(13, 6))

department_plt = sns.countplot(hr_data.department, order = hr_data.department.value_counts().index);

annotate_bars(bar_plt=department_plt, bar_plt_var=hr_data.department, x_offset=0.2, y_offset=450, 
              txt_color="black")



In [140]:
#Departments & Who Left; Is there a pattern?
fig, axs = plt.subplots(figsize=(13, 4))

#Order the bars descendingly according to the PERCENTAGE % of those who left in each department
total_employees_by_dept = hr_data.groupby(["department"]).satisfaction_level.count()
left_count_by_dept = hr_data[hr_data["left"] == 1].groupby(["department"]).satisfaction_level.count()
percentages_left_by_dept = (left_count_by_dept / total_employees_by_dept).sort_values(ascending=False)
axe_name_order = percentages_left_by_dept.index

department_plt = sns.countplot(hr_data.department, order = axe_name_order, color='g');
sns.countplot(employees_left.department, order = axe_name_order, color='r');

department_plt.legend(labels=['Stayed', 'Left'])
department_plt.set(xlabel='Department\n Sorted for "Left" Percentage')

#Annotate the percentages of those who stayed. It was more straightforward to loop for each 
#category (left, stayed) than doing all the work in one loop.
#The zip creates an output that is equal to the shortest parameter, so we do not need to adjust the 
#patches length, since the loop will stop after finishing the columns of those who stayed
for p, current_column in zip(department_plt.patches, axe_name_order):
    current_column_total = hr_data[hr_data['department'] == current_column].department.count()
    stayed_count = p.get_height() - employees_left[employees_left['department'] == current_column].department.count()
    department_plt.annotate(str(round( (100.0* stayed_count) /current_column_total, 1) )+ "%", 
                                (p.get_x() + 0.2, p.get_height()-10),
                                color='black', fontsize=12)
    
#In this loop,  use the patches located on the second half of patches list, which are the 
#bars for those who left.
for p, current_column in zip(department_plt.patches[int(len(department_plt.patches)/2):], axe_name_order):
    current_column_total = hr_data[hr_data['department'] == current_column].department.count()
    left_count = p.get_height()
    department_plt.annotate(str(round( (100.0* left_count) /current_column_total, 1) )+ "%", 
                                (p.get_x() + 0.2, p.get_height()-10),
                                color='black', fontsize=12)



In [141]:
g=sns.boxplot(x='department', y='satisfaction_level', data=hr_data) #Draw a box plot to show 
#distributions (satisfaction level) with respect to categories (departments). A box plot (or
#box-and-whisker plot) shows the distribution of quantitative data in a way that facilitates 
#comparisons between variables or across levels of a categorical variable. The box shows the 
#quartiles of the dataset while the whiskers extend to show the rest of the distribution, 
#except for points that are determined to be “outliers”. 
for item in g.get_xticklabels():  #rotate the x-axis for better reading 
    item.set_rotation(60) # No'in brackets = degrees in positive direction



In [142]:
#Satisfaction level of each department
fig, axs = plt.subplots(figsize=(13, 4))

bar_plot = sns.barplot(x=hr_data.department, y=hr_data.satisfaction_level, 
                       order=get_ordered_group_index(hr_data, 'department', 'satisfaction_level'))



In [143]:
department_plt = sns.countplot(hr_data.salary, order = hr_data.salary.value_counts().index);

for p in department_plt.patches:
    department_plt.annotate(str( int(p.get_height()) ) + "\n" + str(round( 
                (100.0* p.get_height()) /hr_data.salary.count(), 1) )+ "%", 
                                (p.get_x() + 0.3, p.get_height()-800),
                                color='white', fontsize=12, fontweight='bold')



In [144]:
g=sns.countplot(x='salary', hue='left', data=hr_data) #shows the number of 
#observations in each categorical bin using bars; 
#here: the dependecy of staying from the catagories of salary



In [145]:
sns.boxplot(x='salary', y='satisfaction_level', data=hr_data); #satisfaction level referring to salary;
#again <;> is hiding the internal and irrelevant processing step



In [146]:
fig, axs = plt.subplots(nrows=3, figsize=(13, 4))

sns.kdeplot(employees_left.average_montly_hours, ax=axs[0], shade=True, color="r") #tool in seaborn for 
#examining univariate and bivariate distributions
kde_plot = sns.kdeplot(employees_stayed.average_montly_hours, ax=axs[0], shade=True, color="g")
kde_plot.legend(labels=['Left', 'Stayed'])

hist_plot = sns.distplot(hr_data.average_montly_hours, ax=axs[1])#plot a univariate distribution
box_plot = sns.boxplot(hr_data.average_montly_hours, ax=axs[2]) #Draw a box plot to show distributions

kde_plot.set(xlim=(0,350)) #set or query x-axis limits
hist_plot.set(xlim=(0,350))
box_plot.set(xlim=(0,350));



In [147]:
#Number of Years Working for the Company
fig, axs = plt.subplots(nrows= 3, figsize=(13, 5))

sns.kdeplot(employees_left.time_spend_company, ax=axs[0], shade=True, color="r")
kde_plot = sns.kdeplot(employees_stayed.time_spend_company, ax=axs[0], shade=True, color="g")
kde_plot.legend(labels=['Left', 'Stayed'])

hist_plot = sns.distplot(hr_data.time_spend_company, ax=axs[1], kde=False)
box_plot = sns.boxplot(hr_data.time_spend_company, ax=axs[2])

kde_plot.set(xlim=(0,12))
hist_plot.set(xlim=(0,12))
box_plot.set(xlim=(0,12));



In [148]:
#How Hard does Each Department Work?
fig, axs = plt.subplots(figsize=(13, 4))

bar_plot = sns.barplot(x=hr_data.department, y=hr_data.average_montly_hours, 
                       order=get_ordered_group_index(hr_data, 'department', 'average_montly_hours') )



In [149]:
g=sns.factorplot(x='number_project', y='last_evaluation', hue='department', data=hr_data,size=8, aspect=1) 
#multiple graphs on the same plot in seaborn with factorplot, hue==colors in the legend, size(extension 
#of x-axis) and aspect ratio for better distinction (extension of y-axis)
#more information about the plot can be found: http://seaborn.pydata.org/generated/seaborn.factorplot.html



In [150]:
#showing the satisfaction level against the completed number of projects 
sns.boxplot(x='number_project', y='satisfaction_level', data=hr_data_new);
#again <;> is hiding the internal and irrelevant processing step
#more information about the boxplot can be found in the seaborn-documentation:
#https://seaborn.pydata.org/generated/seaborn.boxplot.html



In [151]:
fig, axs = plt.subplots(nrows= 3, figsize=(13, 5))

sns.kdeplot(employees_left.number_project, ax=axs[0], shade=True, color="r")
kde_plot = sns.kdeplot(employees_stayed.number_project, ax=axs[0], shade=True, color="g")
kde_plot.legend(labels=['Left', 'Stayed'])

hist_plot = sns.distplot(hr_data.number_project, ax=axs[1], kde=False)
box_plot = sns.boxplot(hr_data.number_project, ax=axs[2])

kde_plot.set(xlim=(0,8))
hist_plot.set(xlim=(0,8))
box_plot.set(xlim=(0,8));



In [152]:
#How Many Projects are Assigned on Average per Employee?
fig, axs = plt.subplots(figsize=(13, 4))

bar_plot = sns.barplot(x=hr_data.department, y=hr_data.number_project, 
                       order=get_ordered_group_index(hr_data,'department', 'number_project'))



In [153]:
#General Overview about salaries
fig, axs = plt.subplots(figsize=(13, 4))

axe_name_order = hr_data.salary.value_counts().index

salary_plt = sns.countplot(hr_data.salary, order = axe_name_order, color='g');
sns.countplot(employees_left.salary, order = axe_name_order, color='r');

salary_plt.legend(labels=['Stayed', 'Left'])

#Annotate the percentages of those who stayed. It was more straightforward to loop for each 
#category (left, stayed) than doing all the work in one loop. The zip creates an output that 
#is equal to the shortest parameter, so we do not need to adjust the patches length, since
#the loop will stop after finishing the columns of those who stayed
for p, current_column in zip(salary_plt.patches, axe_name_order):
    current_column_total = hr_data[hr_data['salary'] == current_column].salary.count()
    stayed_count = p.get_height() - employees_left[employees_left['salary'] == current_column].salary.count()
    salary_plt.annotate(str(round( (100.0* stayed_count) /current_column_total, 1) )+ "%", 
                                (p.get_x() + 0.35, p.get_height()-10),
                                color='black', fontsize=12)
    
#In this loop, we want to use the patches located on the second half of patches list, which are the 
#bars for those who left.
for p, current_column in zip(salary_plt.patches[int(len(salary_plt.patches)/2):], axe_name_order):
    current_column_total = hr_data[hr_data['salary'] == current_column].salary.count()
    left_count = p.get_height()
    salary_plt.annotate(str(round( (100.0* left_count) /current_column_total, 1) )+ "%", 
                                (p.get_x() + 0.35, p.get_height()-10),
                                color='black', fontsize=12)



In [154]:
timeplot = sns.factorplot(x='time_spend_company', hue='left', y='department', row='salary', 
                          data=hr_data, aspect=2)
#the factorplot draws a categorical plot onto a FacetGrid; It is possible to make rather complex 
#plots using this function more information about parameters and using factorplots can be found in 
#documntation: https://seaborn.pydata.org/generated/seaborn.factorplot.html



In [155]:
fig, axs = plt.subplots(figsize=(16, 4))

sns.stripplot(y = 'salary', x='average_montly_hours', hue='left', data=hr_data);



In [156]:
#Promotions in the departments
fig, axs = plt.subplots(figsize=(13, 4))

bar_plot = sns.barplot(x=hr_data.department, y=hr_data.promotion_last_5years, 
                       order=get_ordered_group_index(hr_data, 'department', 'promotion_last_5years'))



In [157]:
#Evaluation of the Management Department 
fig, axs = plt.subplots(figsize=(13, 4))

bar_plot = sns.barplot(x=hr_data.department, y=hr_data.last_evaluation, 
                       order=get_ordered_group_index(hr_data,'department', 'last_evaluation'))



In [158]:
#Maybe they Stayed Longer?

fig, axs = plt.subplots(figsize=(13, 4))

bar_plot = sns.barplot(x=hr_data.department, y=hr_data.time_spend_company, 
                       order=get_ordered_group_index(hr_data, 'department', 'time_spend_company'))



In [159]:
fig, axs = plt.subplots(ncols= 2, figsize=(13, 5)) #set the number and size of the diagram spaces

work_accidents_plt = sns.countplot(hr_data.Work_accident, ax=axs[0]);
annotate_bars(bar_plt=work_accidents_plt, bar_plt_var=hr_data.Work_accident, 
              x_offset=0.3, y_offset=1100)
    
bar_plot = sns.countplot(x=hr_data.Work_accident, hue=hr_data.left, ax=axs[1])
annotate_bars(bar_plt=bar_plot, by=hr_data.Work_accident, bar_plt_var=hr_data.Work_accident, 
              x_offset=0.1, txt_color="black")
bar_plot.set(ylim=(0,14000));



In [160]:
#Work Related Accidents
fig, axs = plt.subplots(figsize=(13, 4))

bar_plot = sns.barplot(x=hr_data.department, y=hr_data.Work_accident, 
                       order=get_ordered_group_index(hr_data, 'department', 'Work_accident'))



In [161]:
accidentplot = plt.figure(figsize=(10,6)) #advanced plotting with figure module for subplotting; 
#figsize-->w,h tuple in inches constructs the plotting area
#more information about the figure module can be found at: https://matplotlib.org/api/figure_api.html
accidentplotax = accidentplot.add_axes([0,0,1,1]) #Add an axes at position[left, bottom, width, height] 
#where all quantities are in fractions of figure width and height.
accidentplotax = sns.violinplot(x='department', y='average_montly_hours', 
                                hue='Work_accident', split=True, data = hr_data, jitter = 0.47)
#A violin plot plays a similar role as a box and whisker plot. It shows the distribution of 
#quantitative data across several levels of one (or more) categorical variables such that those 
#distributions can be compared. 
#more information about parameters and using violinplots can be found in documntation:
#https://seaborn.pydata.org/generated/seaborn.violinplot.html



In [162]:
satisaccident = plt.figure(figsize=(10,6)) #we set the overall size of the diagram to 10x6 inches
satisaccidentax = satisaccident.add_axes([0,0,1,1]) #We scale the plot to the axes 

satisaccidentax = sns.violinplot(x='left', hue='Work_accident', y='satisfaction_level', 
                                 split=True, data=hr_data)
#We plot the entries in work accident with different colors (hue) and show the two different values in 
#one plot for the same argument.



In [163]:
#A function to bin the average monthly hours into the categories described above
def work_load_cat(avg_mnthly_hrs):
    work_load = "unknown"
    if avg_mnthly_hrs < 168:
        work_load = "low"
    elif (avg_mnthly_hrs >= 168) & (avg_mnthly_hrs < 210):
        work_load = "average"
    elif (avg_mnthly_hrs >= 210) & (avg_mnthly_hrs < 252):
        work_load = "above_average"
    elif avg_mnthly_hrs >= 252:
        work_load = "workoholic"
        
    return work_load

In [164]:
hr_data['work_load'] = hr_data.average_montly_hours.apply(work_load_cat)

sns.countplot(x='work_load', hue='left', data=hr_data, order = ['low', 'average', 'above_average', 'workoholic']);



In [165]:
#Normalised stacked
departments = list(set(hr_data.department.values))
number_of_departments = len(departments)

fig, axs = plt.subplots(nrows= int(number_of_departments/2), ncols=2, figsize=(13, 20))

for i in range(number_of_departments):
    current_dep = departments[i]
    
    ratio_df = 100*hr_data[hr_data.department == current_dep].groupby(['work_load', 'left']).agg(
        {'work_load': 'count'})/hr_data[hr_data.department == current_dep].groupby(['work_load']).agg(
        {'work_load': 'count'})
    ratio_df = ratio_df.reindex_axis(["low", "average", "above_average", "workoholic"], axis=0, level=0)
    #plot the department
    ratio_df.unstack().plot(kind='area',stacked=True, colormap= 'Spectral', ax=axs[int(i/2),i%2])
    axs[int(i/2),i%2].set_title(current_dep)
    axs[int(i/2),i%2].set_xlabel("")
    
axs[int(i/2),i%2].set_xlabel("work_load")
plt.subplots_adjust(hspace=0.3);



In [166]:
#Understanding how the Company Evaluates its Employees
#A function to bin last evaluation into one of 5 categories
def last_evaluation_cat(last_evaluation):
    evaluation = "unknown"
    if last_evaluation < 0.45:
        evaluation = "very_low"
    elif (last_evaluation >= 0.45) & (last_evaluation < 0.55):
        evaluation = "mediocre"
    elif (last_evaluation >= 0.55) & (last_evaluation < 0.8):
        evaluation = "average"
    elif (last_evaluation >= 0.8) & (last_evaluation < 0.9):
        evaluation = "very_good"
    elif last_evaluation >= 0.9:
        evaluation = "excellent"
        
    return evaluation

In [167]:
hr_data['evaluation'] = hr_data.last_evaluation.apply(last_evaluation_cat)

In [168]:
sns.countplot(x='evaluation',  data=hr_data, order = ["unknown", 'very_low', 'mediocre', 
                                                      'average', 'very_good', 'excellent']);



In [169]:
sns.countplot(x='evaluation',  hue = 'left', data=hr_data, 
              order = ["unknown", 'very_low', 'mediocre', 'average', 'very_good', 'excellent']);



In [170]:
evaluation_index_order = ["unknown", 'very_low', 'mediocre', 'average', 'very_good', 'excellent']
evaluation_xticks = ['Very Low\n (eval < .45)', 'Mediocre\n ( .45 < eval < .55 )', 
                     'Average\n ( .55 < eval < .8 )', 'Very Good\n ( .8 < eval < .9 )', 
                     'Excellent\n ( .9 < eval)']
evaluation_x_label = "Company Evaluation for the Employee"

In [171]:
#A function to bin the average monthly hours into the categories described above
def work_load_cat(avg_mnthly_hrs):
    work_load = "unknown"
    if avg_mnthly_hrs < 168:
        work_load = "low"
    elif (avg_mnthly_hrs >= 168) & (avg_mnthly_hrs < 210):
        work_load = "average"
    elif (avg_mnthly_hrs >= 210) & (avg_mnthly_hrs < 252):
        work_load = "above_average"
    elif avg_mnthly_hrs >= 252:
        work_load = "workoholic"
        
    return work_load

In [172]:
#Categories Makeup in Terms of Working Hours
employees_by_eval_and_workload = group_by_2_level_perc(hr_data, 
                                                       'evaluation', 'work_load',
                                                       evaluation_index_order, 
                                                       ['low','average','above_average', 'workoholic'])#Index Order

workload_legend = ['Low Workload (< 40hrs/week)', 'Average Workload (40 < wl < 50 hrs/week)',
                   'Above Average Workload (50 < wl < 60hrs/week)', 
                   'Workoholic Workload (wl > 60hrs/week)']

#Plot the Graph
p=employees_by_eval_and_workload.unstack().plot(kind='area',stacked=True, colormap= 'Spectral', 
                                                figsize=(15, 6), zorder=0)

customise_2lvl_perc_area_graph(p, workload_legend, 
                               xtick_label=evaluation_xticks, x_label=evaluation_x_label, 
                               y_label="Percentage of Monthly Workload")



In [173]:
#Number of Years with the Company
employees_by_eval_and_time_in_company_perc = group_by_2_level_perc(hr_data, 
                                                                   'evaluation', 
                                                                   'time_spend_company',
                                                                   evaluation_index_order)

#Plot the Graph
p=employees_by_eval_and_time_in_company_perc.unstack().plot(kind='area',stacked=True, 
                                                            colormap= 'Spectral', 
                                                            figsize=(15, 6), zorder=0)

time_spent_legend = [str(x) + " years" for x in range(2,9)] + ['10 years']

customise_2lvl_perc_area_graph(p, time_spent_legend, xtick_label=evaluation_xticks, 
                               x_label=evaluation_x_label, #Company Evaluation Graph
                               y_label="Percentage of Years in Company")



In [174]:
#Number of Projects
employees_by_eval_and_time_in_company_perc = group_by_2_level_perc(hr_data, 
                                                                   'evaluation', 'number_project',
                                                                   evaluation_index_order)

#Plot the Graph
p=employees_by_eval_and_time_in_company_perc.unstack().plot(kind='area',stacked=True, colormap= 'Spectral', 
                                                            figsize=(15, 6), zorder=0)

num_projects_legend = [str(x) + " projects" for x in range(2,8)]

customise_2lvl_perc_area_graph(p, num_projects_legend, 
                               xtick_label=evaluation_xticks, x_label=evaluation_x_label, 
                               y_label="Percentage of Number of Projects Assigned")



In [175]:
#salary
employees_by_eval_and_salary_perc = group_by_2_level_perc(hr_data, 
                                                          'evaluation', 'salary', 
                                                          evaluation_index_order, 
                                                          ['low', 'medium', 'high'])

#Plot the Graph
p=employees_by_eval_and_salary_perc.unstack().plot(kind='area',stacked=True, colormap= 'Spectral', 
                                                   figsize=(15, 6), zorder=0)

num_projects_legend = ['Low', 'Medium', 'High']

customise_2lvl_perc_area_graph(p, num_projects_legend, 
                               xtick_label=evaluation_xticks, x_label=evaluation_x_label, 
                               y_label="Percentage of Salary Range")



In [176]:
#satisfaction level
#Create a satisfaction categories
#Arbitrary boundaries:
# < 4.5 low
# 4.5 < < 7.5 medium
# 7.5 < high
def rank_satisfaction(employee):
    level = "unknown"
    if employee.satisfaction_level < 0.45:
        level='low'
    elif employee.satisfaction_level < 0.75:
        level = 'medium'
    else:
        level = 'high'
    return level

In [177]:
hr_data['satisfaction'] = hr_data.apply(rank_satisfaction, axis=1)

In [178]:
employees_by_eval_and_satisfaction_perc = group_by_2_level_perc(hr_data, 
                                                                'evaluation', 
                                                                'satisfaction', 
                                                                evaluation_index_order, 
                                                                ['low', 'medium', 'high'])

#Plot the Graph
p=employees_by_eval_and_satisfaction_perc.unstack().plot(kind='area',stacked=True, 
                                                         colormap= 'Spectral', figsize=(15, 6), 
                                                         zorder=0)


satisfaction_lvl_legend = ['Low', 'Medium', 'High']

customise_2lvl_perc_area_graph(p, satisfaction_lvl_legend, 
                               xtick_label=evaluation_xticks, x_label=evaluation_x_label, 
                               y_label="Percentage of Employee's Satisfaction Level")



In [ ]:


In [179]:
import sys
print(sys.version)


3.5.2 |Anaconda custom (64-bit)| (default, Jul  5 2016, 11:41:13) [MSC v.1900 64 bit (AMD64)]

In [ ]: