In [0]:
%matplotlib inline

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'retina'
sns.set()

In [0]:
df = pd.read_csv("https://raw.githubusercontent.com/theleadio/datascience_demo/master/HR_dataset.csv")

In [10]:
df.head()
# head() shows the first 5 rows


Out[10]:
names satisfaction_level last_evaluation number_project average_monthly_hours exp_in_company work_accident left promotion_last_5years role salary
0 Jessica Stroud 0.38 0.53 2 157 3 0 1 0 sales low
1 Daryl Fields 0.80 0.86 5 262 6 0 1 0 sales medium
2 Daisy Anderson 0.11 0.88 7 272 4 0 1 0 sales medium
3 Joseph Fernandez 0.72 0.87 5 223 5 0 1 0 sales low
4 Herbert Moore 0.37 0.52 2 159 3 0 1 0 sales low

In [12]:
df.tail()
# tail() shows the last 5 rows


Out[12]:
names satisfaction_level last_evaluation number_project average_monthly_hours exp_in_company work_accident left promotion_last_5years role salary
14995 Roger King 0.37 0.48 2 160 3 0 1 0 support low
14996 Ora Lyles 0.37 0.53 2 143 3 0 1 0 support low
14997 Steven Freedman 0.11 0.96 6 280 4 0 1 0 support low
14998 Randall Williams 0.37 0.52 2 158 3 0 1 0 support low
14999 Bessie Coleman 0.82 0.91 5 232 5 0 1 0 technical low

In [15]:
df.info()
# info() gives a technical summary: column dtypes, non-null counts, and memory usage


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 11 columns):
names                    15000 non-null object
satisfaction_level       15000 non-null float64
last_evaluation          15000 non-null float64
number_project           15000 non-null int64
average_monthly_hours    15000 non-null int64
exp_in_company           15000 non-null int64
work_accident            15000 non-null int64
left                     15000 non-null int64
promotion_last_5years    15000 non-null int64
role                     15000 non-null object
salary                   15000 non-null object
dtypes: float64(2), int64(6), object(3)
memory usage: 1.3+ MB
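
info() already reports the non-null counts; as a quick complementary check, the missing values per column can be counted directly (a minimal sketch, not part of the original run):

In [ ]:
# count missing values per column (all zeros here, consistent with the non-null counts above)
df.isnull().sum()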

In [19]:
sns.set(rc={'figure.figsize':(12,10)})
sns.set_context("talk", font_scale=1)

sns.heatmap(df.corr(), cmap='Blues', annot=True)
# correlation heatmap of the numeric columns
# observation: the higher the satisfaction level, the lower the probability that the employee left


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f14a24bfc18>
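
Note: on pandas 2.0+, df.corr() raises an error when the frame still contains non-numeric columns such as names, role and salary. A sketch that restricts the frame to numeric columns first (assuming a newer pandas than the one used above):

In [ ]:
# compute the correlation matrix on numeric columns only (required on pandas >= 2.0)
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr, cmap='Blues', annot=True)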

In [21]:
df.describe()


Out[21]:
satisfaction_level last_evaluation number_project average_monthly_hours exp_in_company work_accident left promotion_last_5years
count 15000.000000 15000.000000 15000.000000 15000.000000 15000.000000 15000.000000 15000.000000 15000.000000
mean 0.612847 0.716115 3.803133 201.052400 3.498333 0.144600 0.238133 0.021267
std 0.248628 0.171171 1.232590 49.942074 1.460139 0.351709 0.425955 0.144277
min 0.090000 0.360000 2.000000 96.000000 2.000000 0.000000 0.000000 0.000000
25% 0.440000 0.560000 3.000000 156.000000 3.000000 0.000000 0.000000 0.000000
50% 0.640000 0.720000 4.000000 200.000000 3.000000 0.000000 0.000000 0.000000
75% 0.820000 0.870000 5.000000 245.000000 4.000000 0.000000 0.000000 0.000000
max 1.000000 1.000000 7.000000 310.000000 10.000000 1.000000 1.000000 1.000000
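
The same kind of summary can be split by the target column to see how leavers differ from stayers; a minimal sketch (the column choice here is illustrative):

In [ ]:
# compare means between employees who stayed (left=0) and left (left=1)
df.groupby('left')[['satisfaction_level', 'average_monthly_hours', 'exp_in_company']].mean()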

In [23]:
# Draw a count plot that shows the count of employees per department

ax = sns.countplot(df['role'])

# rotate the x tick labels so the department names don't overlap
loc, labels = plt.xticks()
ax.set_xticklabels(labels, rotation=90);


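The positional call above relies on older seaborn behavior; the keyword form is more explicit and works across versions (a minimal sketch of the same plot):

In [ ]:
# same department count plot, written with keyword arguments
ax = sns.countplot(x='role', data=df)
ax.tick_params(axis='x', rotation=90)
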

In [27]:
# pass the data frame and map salary to the x axis and role to the hue
sns.countplot(x="salary", hue="role", data=df, palette='Set2')


Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f14a216cd68>
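
salary is an ordered category (low < medium < high), so passing an explicit order keeps the bars in that sequence (a small variant, not from the original run):

In [ ]:
# same plot with the salary levels in ascending order
sns.countplot(x="salary", hue="role", data=df, palette='Set2', order=['low', 'medium', 'high'])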

In [29]:
sns.countplot(y="salary", hue="left", data=df)
# switch to a horizontal orientation by swapping x and y


Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f14a2047f98>

In [31]:
sns.countplot(x="number_project", hue="left", data=df)


Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f14a2011e80>

In [33]:
# Distribution plots: histograms of three numeric features side by side
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 4))

sns.distplot(df['satisfaction_level'], norm_hist=False, kde=False, ax=ax1);
sns.distplot(df['last_evaluation'], norm_hist=False, kde=False, ax=ax2);
sns.distplot(df['average_monthly_hours'], norm_hist=False, kde=False, ax=ax3);
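
distplot has been deprecated since seaborn 0.11; histplot draws the same histograms on newer releases (a sketch, assuming a newer seaborn than the one used above):

In [ ]:
# equivalent histograms with the newer histplot API
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 4))
sns.histplot(df['satisfaction_level'], ax=ax1);
sns.histplot(df['last_evaluation'], ax=ax2);
sns.histplot(df['average_monthly_hours'], ax=ax3);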



In [0]:
from sklearn.linear_model import LogisticRegression

columns = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_monthly_hours',
           'exp_in_company', 'work_accident', 'promotion_last_5years']

# Establishing the base dataset
X = df[columns]
y = df['left']

In [41]:
from sklearn.metrics import accuracy_score

logmodel = LogisticRegression()
logmodel.fit(X, y)

# predict on the same rows the model was trained on
predictions = logmodel.predict(X)

# training-set accuracy (optimistic; see the held-out sketch below)
accuracy_score(y, predictions)


/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Out[41]:
0.7658666666666667
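
The 0.766 above is accuracy on the training data itself, so it is likely optimistic. A held-out estimate with a train/test split, plus an explicit solver to silence the FutureWarning, might look like the sketch below (not part of the original run; the exact number will differ):

In [ ]:
from sklearn.model_selection import train_test_split

# hold out 30% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# specify the solver explicitly to avoid the FutureWarning above
holdout_model = LogisticRegression(solver='lbfgs', max_iter=1000)
holdout_model.fit(X_train, y_train)

# accuracy on rows the model has not seen
accuracy_score(y_test, holdout_model.predict(X_test))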

In [0]:
predict = pd.read_csv("https://raw.githubusercontent.com/theleadio/datascience_demo/master/hr_predict.csv")

# score the new employees with the trained model
X_predict = predict[columns]

predictions = logmodel.predict(X_predict)

# attach the predictions as a new column
predict['left_predict'] = predictions

predict

# save the scored table and download it from Colab
predict.to_csv('predict.csv')

from google.colab import files
files.download('predict.csv')
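
As a quick way to read the fitted model, the logistic-regression coefficients can be lined up with the feature names; positive values push the predicted odds toward leaving (a minimal sketch using the logmodel trained above):

In [ ]:
# pair each feature with its coefficient, sorted from most negative to most positive
pd.Series(logmodel.coef_[0], index=columns).sort_values()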