In [0]:
%matplotlib inline

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'retina'
sns.set()

In [0]:
df = pd.read_csv("https://raw.githubusercontent.com/theleadio/datascience_demo/master/HR_dataset.csv")

In [10]:
df.head()
# head() shows the first 5 rows


Out[10]:
names satisfaction_level last_evaluation number_project average_monthly_hours exp_in_company work_accident left promotion_last_5years role salary
0 Jessica Stroud 0.38 0.53 2 157 3 0 1 0 sales low
1 Daryl Fields 0.80 0.86 5 262 6 0 1 0 sales medium
2 Daisy Anderson 0.11 0.88 7 272 4 0 1 0 sales medium
3 Joseph Fernandez 0.72 0.87 5 223 5 0 1 0 sales low
4 Herbert Moore 0.37 0.52 2 159 3 0 1 0 sales low

In [12]:
df.tail()
# tail() shows the last 5 rows


Out[12]:
names satisfaction_level last_evaluation number_project average_monthly_hours exp_in_company work_accident left promotion_last_5years role salary
14995 Roger King 0.37 0.48 2 160 3 0 1 0 support low
14996 Ora Lyles 0.37 0.53 2 143 3 0 1 0 support low
14997 Steven Freedman 0.11 0.96 6 280 4 0 1 0 support low
14998 Randall Williams 0.37 0.52 2 158 3 0 1 0 support low
14999 Bessie Coleman 0.82 0.91 5 232 5 0 1 0 technical low

In [15]:
df.info()
# info() gives a technical summary: column dtypes, non-null counts, and memory usage


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 11 columns):
names                    15000 non-null object
satisfaction_level       15000 non-null float64
last_evaluation          15000 non-null float64
number_project           15000 non-null int64
average_monthly_hours    15000 non-null int64
exp_in_company           15000 non-null int64
work_accident            15000 non-null int64
left                     15000 non-null int64
promotion_last_5years    15000 non-null int64
role                     15000 non-null object
salary                   15000 non-null object
dtypes: float64(2), int64(6), object(3)
memory usage: 1.3+ MB
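
info() already reports the non-null counts; as a quick complementary check, the missing values per column can be counted directly (a minimal sketch, not part of the original run):

In [ ]:
# count missing values per column (all zeros here, consistent with the non-null counts above)
df.isnull().sum()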

In [19]:
sns.set(rc={'figure.figsize':(12,10)})
sns.set_context("talk", font_scale=1)

sns.heatmap(df.corr(), cmap='Blues', annot=True)
# correlation heatmap of the numeric columns
# observation: the higher the satisfaction level, the lower the probability that the employee left


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f14a24bfc18>
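
Note: on pandas 2.0+, df.corr() raises an error when the frame still contains non-numeric columns such as names, role and salary. A sketch that restricts the frame to numeric columns first (assuming a newer pandas than the one used above):

In [ ]:
# compute the correlation matrix on numeric columns only (required on pandas >= 2.0)
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr, cmap='Blues', annot=True)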

In [21]:
df.describe()


Out[21]:
satisfaction_level last_evaluation number_project average_monthly_hours exp_in_company work_accident left promotion_last_5years
count 15000.000000 15000.000000 15000.000000 15000.000000 15000.000000 15000.000000 15000.000000 15000.000000
mean 0.612847 0.716115 3.803133 201.052400 3.498333 0.144600 0.238133 0.021267
std 0.248628 0.171171 1.232590 49.942074 1.460139 0.351709 0.425955 0.144277
min 0.090000 0.360000 2.000000 96.000000 2.000000 0.000000 0.000000 0.000000
25% 0.440000 0.560000 3.000000 156.000000 3.000000 0.000000 0.000000 0.000000
50% 0.640000 0.720000 4.000000 200.000000 3.000000 0.000000 0.000000 0.000000
75% 0.820000 0.870000 5.000000 245.000000 4.000000 0.000000 0.000000 0.000000
max 1.000000 1.000000 7.000000 310.000000 10.000000 1.000000 1.000000 1.000000
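
The same kind of summary can be split by the target column to see how leavers differ from stayers; a minimal sketch (the column choice here is illustrative):

In [ ]:
# compare means between employees who stayed (left=0) and left (left=1)
df.groupby('left')[['satisfaction_level', 'average_monthly_hours', 'exp_in_company']].mean()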

In [23]:
# Draw a count plot that shows the count of employees per department

ax = sns.countplot(df['role'])

# rotate the x tick labels so the department names don't overlap
loc, labels = plt.xticks()
ax.set_xticklabels(labels, rotation=90);


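The positional call above relies on older seaborn behavior; the keyword form is more explicit and works across versions (a minimal sketch of the same plot):

In [ ]:
# same department count plot, written with keyword arguments
ax = sns.countplot(x='role', data=df)
ax.tick_params(axis='x', rotation=90)
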

In [27]:
# pass the data frame and map salary to the x axis and role to the hue
sns.countplot(x="salary", hue="role", data=df, palette='Set2')


Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f14a216cd68>
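
salary is an ordered category (low < medium < high), so passing an explicit order keeps the bars in that sequence (a small variant, not from the original run):

In [ ]:
# same plot with the salary levels in ascending order
sns.countplot(x="salary", hue="role", data=df, palette='Set2', order=['low', 'medium', 'high'])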

In [29]:
sns.countplot(y="salary", hue="left", data=df)
# switch to a horizontal orientation by swapping x and y


Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f14a2047f98>

In [31]:
sns.countplot(x="number_project", hue="left", data=df)


Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f14a2011e80>

In [33]:
# Distribution plots: histograms of three numeric features side by side
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 4))

sns.distplot(df['satisfaction_level'], norm_hist=False, kde=False, ax=ax1);
sns.distplot(df['last_evaluation'], norm_hist=False, kde=False, ax=ax2);
sns.distplot(df['average_monthly_hours'], norm_hist=False, kde=False, ax=ax3);
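
distplot has been deprecated since seaborn 0.11; histplot draws the same histograms on newer releases (a sketch, assuming a newer seaborn than the one used above):

In [ ]:
# equivalent histograms with the newer histplot API
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 4))
sns.histplot(df['satisfaction_level'], ax=ax1);
sns.histplot(df['last_evaluation'], ax=ax2);
sns.histplot(df['average_monthly_hours'], ax=ax3);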



In [0]:
from sklearn.linear_model import LogisticRegression

columns = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_monthly_hours',
           'exp_in_company', 'work_accident', 'promotion_last_5years']

# Establishing the base dataset
X = df[columns]
y = df['left']

In [41]:
from sklearn.metrics import accuracy_score

logmodel = LogisticRegression()
logmodel.fit(X, y)

# predict on the same rows the model was trained on
predictions = logmodel.predict(X)

# training-set accuracy (optimistic; see the held-out sketch below)
accuracy_score(y, predictions)


/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Out[41]:
0.7658666666666667
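
The 0.766 above is accuracy on the training data itself, so it is likely optimistic. A held-out estimate with a train/test split, plus an explicit solver to silence the FutureWarning, might look like the sketch below (not part of the original run; the exact number will differ):

In [ ]:
from sklearn.model_selection import train_test_split

# hold out 30% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# specify the solver explicitly to avoid the FutureWarning above
holdout_model = LogisticRegression(solver='lbfgs', max_iter=1000)
holdout_model.fit(X_train, y_train)

# accuracy on rows the model has not seen
accuracy_score(y_test, holdout_model.predict(X_test))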

In [0]:
predict = pd.read_csv("https://raw.githubusercontent.com/theleadio/datascience_demo/master/hr_predict.csv")

# score the new employees with the trained model
X_predict = predict[columns]

predictions = logmodel.predict(X_predict)

# attach the predictions as a new column
predict['left_predict'] = predictions

predict

# save the scored table and download it from Colab
predict.to_csv('predict.csv')

from google.colab import files
files.download('predict.csv')
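
As a quick way to read the fitted model, the logistic-regression coefficients can be lined up with the feature names; positive values push the predicted odds toward leaving (a minimal sketch using the logmodel trained above):

In [ ]:
# pair each feature with its coefficient, sorted from most negative to most positive
pd.Series(logmodel.coef_[0], index=columns).sort_values()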