In [0]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Emit inline figures in 'retina' (high-resolution) format so they stay sharp on high-DPI displays.
%config InlineBackend.figure_format = 'retina'
# Apply seaborn's default theme to the figures created below.
sns.set()
In [0]:
# Load the HR dataset directly from its raw GitHub URL.
HR_DATASET_URL = "https://raw.githubusercontent.com/theleadio/datascience_demo/master/HR_dataset.csv"
df = pd.read_csv(HR_DATASET_URL)
In [10]:
# Preview the first five rows of the dataset.
df.head(n=5)
Out[10]:
In [12]:
# Preview the last five rows of the dataset.
df.tail(n=5)
Out[12]:
In [15]:
# Print pandas' concise summary of the frame: column names, non-null counts, dtypes and memory usage.
df.info()
In [19]:
# Correlation heatmap of the numeric features.
sns.set(rc={'figure.figsize': (12, 10)})
sns.set_context("talk", font_scale=1)
# Correlate only numeric/boolean columns. The frame also carries columns that
# later cells treat as categories (e.g. `role`, `salary`); if those are text,
# pandas >= 2.0 raises on a plain df.corr(), while older versions silently
# dropped them. Selecting the dtypes explicitly behaves the same on both.
numeric_corr = df.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(numeric_corr, cmap='Blues', annot=True)
# Observation: the higher the satisfaction level, the less likely an employee
# is to have left.
Out[19]:
In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()
Out[21]:
In [23]:
# Draw a count plot that shows the number of employees per role (department).
# The column is passed by keyword: newer seaborn releases interpret a lone
# positional argument as `data`, not `x`, so the positional form is not portable.
ax = sns.countplot(x='role', data=df)
# Rotate the tick labels so long role names do not overlap. tick_params
# avoids re-setting the label objects, which warns when tick positions are not fixed.
ax.tick_params(axis='x', labelrotation=90);
In [27]:
# Employee counts per salary band, with one coloured bar per role:
# `data` is the source frame, `x` the category axis and `hue` the grouping.
sns.countplot(
    data=df,
    x="salary",
    hue="role",
    palette='Set2',
)
Out[27]:
In [29]:
# Passing the category as `y` instead of `x` switches to horizontal bars.
sns.countplot(
    data=df,
    y="salary",
    hue="left",
)
Out[29]:
In [31]:
# Employee counts per number of projects, split by the `left` column.
sns.countplot(
    data=df,
    x="number_project",
    hue="left",
)
Out[31]:
In [33]:
# Histograms (raw counts, no KDE) of three continuous features, side by side.
import matplotlib.pyplot as plt
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 4))
# sns.distplot is deprecated and has been removed from recent seaborn releases;
# sns.histplot with its default stat="count" and no KDE is the equivalent of
# distplot(norm_hist=False, kde=False).
histogram_columns = ['satisfaction_level', 'last_evaluation', 'average_monthly_hours']
for column_name, axis in zip(histogram_columns, (ax1, ax2, ax3)):
    sns.histplot(data=df, x=column_name, ax=axis)
In [0]:
from sklearn.linear_model import LogisticRegression

# Columns used as model inputs.
columns = [
    'satisfaction_level',
    'last_evaluation',
    'number_project',
    'average_monthly_hours',
    'exp_in_company',
    'work_accident',
    'promotion_last_5years',
]

# Base dataset: feature matrix X and the target y (the `left` column).
X = df.loc[:, columns]
y = df.loc[:, 'left']
In [41]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Evaluate on rows the model has never seen. Scoring on the training data
# itself (as this cell originally did) gives an optimistic accuracy.
# The split is stratified on y so both parts keep the same share of leavers,
# and seeded so the reported number is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

# The features are unscaled, so the default iteration cap can stop the solver
# before convergence; a higher cap avoids that.
logmodel = LogisticRegression(max_iter=1000)
logmodel.fit(X_train, y_train)

# Hold-out accuracy: fraction of test rows whose `left` label is predicted correctly.
predictions = logmodel.predict(X_test)
accuracy_score(y_test, predictions)
Out[41]:
In [0]:
# Score the prediction file with the fitted model and export the result.
predict = pd.read_csv("https://raw.githubusercontent.com/theleadio/datascience_demo/master/hr_predict.csv")
X_predict = predict[columns]
predictions = logmodel.predict(X_predict)
predict['left_predict'] = predictions

# Persist the scored rows (the DataFrame index is written as the first column).
predict.to_csv('predict.csv')

# The browser download helper only exists on Google Colab. Elsewhere the CSV
# stays in the working directory rather than the cell failing on the import.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; predictions were saved to predict.csv")
else:
    files.download('predict.csv')

# A bare expression is only rendered when it is the last statement of the cell,
# so the scored frame is displayed here rather than mid-cell.
predict