Notebook address: https://github.com/ghvn7777/kaggle/blob/master/ibm_employee/predict_ibm_attrition.ipynb
Keeping employees happy and satisfied with the company is an age-old challenge. If you invest heavily in an employee who then leaves, you have to spend even more time hiring someone else. In the spirit of Kaggle, let's build a predictive model on the IBM dataset to predict IBM employee attrition.
This notebook covers: exploratory data analysis (KDE plots and a Pearson correlation heatmap), encoding the categorical features, rebalancing the target with SMOTE, and training and interpreting Random Forest and Gradient Boosting classifiers.
Let's go.
In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Import statements required for Plotly
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from imblearn.over_sampling import SMOTE
import xgboost
# Import and suppress warnings
import warnings
warnings.filterwarnings('ignore')
In [2]:
attrition = pd.read_csv('./inputs/WA_Fn-UseC_-HR-Employee-Attrition.csv')
attrition.head()
Out[2]:
In [3]:
#Looking for NaN
attrition.isnull().any()
Out[3]:
In [4]:
# Plotting the KDE plots
f, axes = plt.subplots(3, 3, figsize=(10, 10), sharex=False, sharey=False)
# Defining our colormap scheme
# s was originally meant to generate the palette start values, but in the end
# they were specified by hand in steps of 0.333...
#s = np.linspace(0, 3, 10) # 10 evenly spaced numbers over [0, 3]
# Create a palette: light sets the intensity of the lightest color (1 is lightest),
# and as_cmap=True returns a matplotlib colormap
cmap = sns.cubehelix_palette(start=0.0, light=1, as_cmap=True)
# Generate and plot
x = attrition['Age'].values
y = attrition['TotalWorkingYears'].values
# Draw a univariate or bivariate kernel density estimate; shade=True fills the
# contours when the data is bivariate
# cut=5 extends the evaluation grid 5 bandwidths (bw, another kdeplot parameter
# that controls how closely the estimate fits the data) past the extreme data points;
# the larger cut is, the smaller the plot and the denser the data appears
# The ax parameter selects the axes to draw on; the current axes is the default
sns.kdeplot(x, y, cmap=cmap, shade=True, cut=5, ax=axes[0,0])
axes[0,0].set( title = 'Age against Total working years')
cmap = sns.cubehelix_palette(start=0.333333333333, light=1, as_cmap=True)
# Generate and plot
x = attrition['Age'].values
y = attrition['DailyRate'].values
sns.kdeplot(x, y, cmap=cmap, shade=True, ax=axes[0,1])
axes[0,1].set( title = 'Age against Daily Rate')
cmap = sns.cubehelix_palette(start=0.666666666667, light=1, as_cmap=True)
# Generate and plot
x = attrition['YearsInCurrentRole'].values
y = attrition['Age'].values
sns.kdeplot(x, y, cmap=cmap, shade=True, ax=axes[0,2])
axes[0,2].set( title = 'Years in role against Age')
cmap = sns.cubehelix_palette(start=1.0, light=1, as_cmap=True)
# Generate and plot
x = attrition['DailyRate'].values
y = attrition['DistanceFromHome'].values
sns.kdeplot(x, y, cmap=cmap, shade=True, ax=axes[1,0])
axes[1,0].set( title = 'Daily Rate against Distance from Home')
cmap = sns.cubehelix_palette(start=1.333333333333, light=1, as_cmap=True)
# Generate and plot
x = attrition['DailyRate'].values
y = attrition['JobSatisfaction'].values
sns.kdeplot(x, y, cmap=cmap, shade=True, ax=axes[1,1])
axes[1,1].set( title = 'Daily Rate against Job satisfaction')
cmap = sns.cubehelix_palette(start=1.666666666667, light=1, as_cmap=True)
# Generate and plot
x = attrition['YearsAtCompany'].values
y = attrition['JobSatisfaction'].values
sns.kdeplot(x, y, cmap=cmap, shade=True, ax=axes[1,2])
axes[1,2].set( title = 'Years at company against Job satisfaction')
cmap = sns.cubehelix_palette(start=2.0, light=1, as_cmap=True)
# Generate and plot
x = attrition['YearsAtCompany'].values
y = attrition['DailyRate'].values
sns.kdeplot(x, y, cmap=cmap, shade=True, ax=axes[2,0])
axes[2,0].set( title = 'Years at company against Daily Rate')
cmap = sns.cubehelix_palette(start=2.333333333333, light=1, as_cmap=True)
# Generate and plot
x = attrition['RelationshipSatisfaction'].values
y = attrition['YearsWithCurrManager'].values
sns.kdeplot(x, y, cmap=cmap, shade=True, ax=axes[2,1])
axes[2,1].set( title = 'Relationship Satisfaction vs years with manager')
cmap = sns.cubehelix_palette(start=2.666666666667, light=1, as_cmap=True)
# Generate and plot
x = attrition['WorkLifeBalance'].values
y = attrition['JobSatisfaction'].values
sns.kdeplot(x, y, cmap=cmap, shade=True, ax=axes[2,2])
axes[2,2].set( title = 'Work-life Balance against Job Satisfaction')
f.tight_layout()
In [5]:
# Define a dictionary for the target mapping
target_map = {'Yes':1, 'No':0}
# Use the pandas apply method to numerically encode our attrition target variable
attrition["Attrition_numerical"] = attrition["Attrition"].apply(lambda x: target_map[x])
In [6]:
attrition
Out[6]:
In [7]:
# creating a list of only numerical values
numerical = [u'Age', u'DailyRate', u'DistanceFromHome', u'Education', u'EmployeeNumber', u'EnvironmentSatisfaction',
u'HourlyRate', u'JobInvolvement', u'JobLevel', u'JobSatisfaction',
u'MonthlyIncome', u'MonthlyRate', u'NumCompaniesWorked',
u'PercentSalaryHike', u'PerformanceRating', u'RelationshipSatisfaction',
u'StockOptionLevel', u'TotalWorkingYears',
u'TrainingTimesLastYear', u'WorkLifeBalance', u'YearsAtCompany',
u'YearsInCurrentRole', u'YearsSinceLastPromotion',
u'YearsWithCurrManager']
data = [
go.Heatmap(
z= attrition[numerical].astype(float).corr().values, # Generating the Pearson correlation
x=attrition[numerical].columns.values,
y=attrition[numerical].columns.values,
colorscale='Viridis',
reversescale = False, # whether to reverse the colorscale
#text = True, # Heatmap's text expects an array of labels; newer plotly versions reject a bare True
opacity = 1.0 # opacity of the heatmap
)
]
layout = go.Layout(
title='Pearson Correlation of numerical features',
xaxis = dict(ticks='', nticks=36),
yaxis = dict(ticks='' ),
width = 900, height = 700,
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='labelled-heatmap')
In [8]:
# Refining our list of numerical variables
numerical = [u'Age', u'DailyRate', u'JobSatisfaction',
u'MonthlyIncome', u'PerformanceRating',
u'WorkLifeBalance', u'YearsAtCompany', u'Attrition_numerical']
#g = sns.pairplot(attrition[numerical], hue='Attrition_numerical', palette='seismic', diag_kind = 'kde',diag_kws=dict(shade=True))
#g.set(xticklabels=[])
In [9]:
attrition
Out[9]:
In [10]:
# Drop the Attrition_numerical column from attrition dataset first - Don't want to include that
attrition = attrition.drop(['Attrition_numerical'], axis=1)
# Empty list to store columns with categorical data
categorical = []
for col, value in attrition.items():  # iteritems() was removed in pandas 2.0; items() is equivalent
    if value.dtype == 'object':
        categorical.append(col)
print(categorical)
# Store the numerical columns in a list called numerical
numerical = attrition.columns.difference(categorical)
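The same split can be obtained more compactly with pandas' select_dtypes (a minimal equivalent sketch):
categorical = attrition.select_dtypes(include='object').columns.tolist()
numerical = attrition.columns.difference(categorical)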
In [11]:
numerical
Out[11]:
Having confirmed which of our features contain categorical data, we can encode them numerically; Pandas' get_dummies() method makes this easy.
In [12]:
# Store the categorical data in a dataframe called attrition_cat
attrition_cat = attrition[categorical] # extract the non-numerical columns
attrition_cat = attrition_cat.drop(['Attrition'], axis=1) # Dropping the target column
print(attrition_cat)
Applying the get_dummies() method encodes the categories automatically, and we can conveniently inspect the encoded result with the following code:
In [13]:
attrition_cat = pd.get_dummies(attrition_cat)
attrition_cat.head(3)
Out[13]:
Extract the numerical columns:
In [14]:
# Store the numerical features to a dataframe attrition_num
attrition_num = attrition[numerical]
We have encoded the non-numerical variables and extracted the numerical ones; now we merge them into the final training data.
In [15]:
# Concat the two dataframes together columnwise
attrition_final = pd.concat([attrition_num, attrition_cat], axis=1)
In [16]:
# Define a dictionary for the target mapping
target_map = {'Yes':1, 'No':0}
# Use the pandas apply method to numerically encode our attrition target variable
target = attrition["Attrition"].apply(lambda x: target_map[x])
target.head(3)
Out[16]:
However, if we check the counts of Yes and No, we find the data is heavily imbalanced:
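The imbalance can also be quantified directly before plotting (a minimal sketch):
# Share of each class in the target; in this dataset roughly 84% No vs 16% Yes
print(attrition["Attrition"].value_counts(normalize=True))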
In [17]:
data = [go.Bar(
x=attrition["Attrition"].value_counts().index.values,
y= attrition["Attrition"].value_counts().values
)]
py.iplot(data, filename='basic-bar')
In [18]:
# Import the train_test_split method
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed; model_selection is the current module
from sklearn.model_selection import StratifiedShuffleSplit
# Split data into train and test sets as well as for validation and testing
train, test, target_train, target_val = train_test_split(attrition_final, target, train_size=0.75, random_state=0)
#train, test, target_train, target_val = StratifiedShuffleSplit(attrition_final, target, random_state=0);
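Since the classes are imbalanced, it may be worth passing stratify=target so that both splits keep the same Yes/No ratio; a minimal alternative sketch:
train, test, target_train, target_val = train_test_split(
    attrition_final, target, train_size=0.75, stratify=target, random_state=0)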
In [19]:
oversampler = SMOTE(random_state=0)
# fit_resample is named fit_sample in older imbalanced-learn releases
smote_train, smote_target = oversampler.fit_resample(train, target_train)
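A quick sanity check that SMOTE actually balanced the training classes (a minimal sketch reusing the variables above):
import collections
print(collections.Counter(target_train))   # original, imbalanced counts
print(collections.Counter(smote_target))   # both classes equal after SMOTE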
In [20]:
seed = 0 # We set our random seed to zero for reproducibility
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 800,
    'warm_start': True,
    'max_depth': 9,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',  # the original dict listed 'max_features' twice (0.3 and 'sqrt'); Python keeps the later 'sqrt'
    'random_state': seed,
    'verbose': 0
}
We can initialize the random forest with scikit-learn's RandomForestClassifier() and pass in the parameters:
In [21]:
rf = RandomForestClassifier(**rf_params)
Now we start training:
In [22]:
rf.fit(smote_train, smote_target)
print("Fitting of Random Forest as finished")
Now we can make predictions on the test data:
In [24]:
rf_predictions = rf.predict(test)
print("Predictions finished")
Score the predictions:
In [25]:
accuracy_score(target_val, rf_predictions)
Out[25]:
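Because the test split keeps the original class imbalance, accuracy alone can be misleading here; a confusion matrix and per-class metrics give a fuller picture (a minimal sketch using the predictions above):
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(target_val, rf_predictions))
print(classification_report(target_val, rf_predictions, target_names=['No', 'Yes']))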
In [26]:
# Scatter plot
trace = go.Scatter(
y = rf.feature_importances_,
x = attrition_final.columns.values,
mode='markers',
marker=dict(
sizemode = 'diameter',
sizeref = 1,
size = 13,
#size= rf.feature_importances_,
#color = np.random.randn(500), #set color equal to a variable
color = rf.feature_importances_,
colorscale='Portland',
showscale=True
),
text = attrition_final.columns.values
)
data = [trace]
layout= go.Layout(
autosize= True,
title= 'Random Forest Feature Importance',
hovermode= 'closest',
xaxis= dict(
ticklen= 5,
showgrid=False,
zeroline=False,
showline=False
),
yaxis=dict(
title= 'Feature Importance',
showgrid=False,
zeroline=False,
ticklen= 5,
gridwidth= 2
),
showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter2010')
From the plot above we can see which features matter most to the model: the algorithm ranks the OverTime feature highest, followed by marital status.
I don't know which factor matters most to you, but for me overtime genuinely affects how satisfied I am with my job, so perhaps we should not be surprised that the classifier ranks overtime importance at the top.
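To read the top features off numerically rather than from the scatter plot, we can sort the importances (a minimal sketch using the fitted rf):
importances = pd.Series(rf.feature_importances_, index=attrition_final.columns)
print(importances.sort_values(ascending=False).head(10))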
Let's display one of our trees. We can fit a single DecisionTreeClassifier and use the export_graphviz() function to render it as a PNG image:
In [27]:
from sklearn import tree
from IPython.display import Image as PImage
from subprocess import check_call
from PIL import Image, ImageDraw, ImageFont
import re
decision_tree = tree.DecisionTreeClassifier(max_depth = 4)
decision_tree.fit(train, target_train)
# Predicting results for test dataset
y_pred = decision_tree.predict(test)
# Export our trained model as a .dot file
with open("tree1.dot", 'w') as f:
f = tree.export_graphviz(decision_tree,
out_file=f,
max_depth = 4,
impurity = False,
feature_names = attrition_final.columns.values,
class_names = ['No', 'Yes'],
rounded = True,
filled= True )
# Convert .dot to .png for display in the web notebook (requires the Graphviz 'dot' binary on the PATH)
check_call(['dot','-Tpng','tree1.dot','-o','tree1.png'])
# Annotating chart with PIL (a draw handle is created, though no annotation is added before saving)
img = Image.open("tree1.png")
draw = ImageDraw.Draw(img)
img.save('sample-out.png')
PImage("sample-out.png")
Out[27]:
Gradient boosting is an ensemble technique, much like the random forest, that combines weak tree learners into a single strong learner. The technique involves defining some method (algorithm) to minimize a loss function. As the name suggests, the minimization is done by gradient descent: each step moves in the direction that reduces the value of the loss function.
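To make the idea concrete, here is a minimal, self-contained sketch (not from the original notebook) of boosting with squared loss, where each new tree is fit to the residuals, i.e. the negative gradient of the loss, and added to the ensemble with a small learning rate:
import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(0)
X = rng.uniform(0, 10, size=(200, 1))
y = np.sin(X).ravel() + rng.normal(scale=0.3, size=200)

learning_rate = 0.1
pred = np.full_like(y, y.mean())      # start from a constant model
for _ in range(100):
    residuals = y - pred              # negative gradient of squared loss
    stump = DecisionTreeRegressor(max_depth=2).fit(X, residuals)
    pred += learning_rate * stump.predict(X)  # step in the direction that reduces the loss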
Using the Gradient Boosted classifier in sklearn is very simple and takes only a few lines of code. First we set the classifier parameters.
Generally, there are a few key parameters when setting up a gradient boosting classifier: the number of estimators, the maximum depth of the model, and the minimum number of samples per leaf.
In [28]:
# Gradient Boosting Parameters
gb_params = {
    'n_estimators': 500,
    'learning_rate': 0.2,
    'max_depth': 11,
    'min_samples_leaf': 2,
    'subsample': 1,
    'max_features': 'sqrt',  # the original dict listed 'max_features' twice (0.9 and 'sqrt'); Python keeps the later 'sqrt'
    'random_state': seed,
    'verbose': 0
}
With the parameters defined, we can train, predict, and score:
In [30]:
gb = GradientBoostingClassifier(**gb_params)
# Fit the model to our SMOTEd train and target
gb.fit(smote_train, smote_target)
# Get our predictions
gb_predictions = gb.predict(test)
print("Predictions have finished")
accuracy_score(target_val, gb_predictions)
Out[30]:
In [33]:
# Scatter plot
trace = go.Scatter(
y = gb.feature_importances_,
x = attrition_final.columns.values,
mode='markers',
marker=dict(
sizemode = 'diameter',
sizeref = 1,
size = 13,
#size = gb.feature_importances_,
#color = np.random.randn(500), #set color equal to a variable
color = gb.feature_importances_,
colorscale='Portland',
showscale=True
),
text = attrition_final.columns.values
)
data = [trace]
layout= go.Layout(
autosize= True,
title= 'Gradient Boosting Model Feature Importance',
hovermode= 'closest',
xaxis= dict(
ticklen= 5,
showgrid=False,
zeroline=False,
showline=False
),
yaxis=dict(
title= 'Feature Importance',
showgrid=False,
zeroline=False,
ticklen= 5,
gridwidth= 2
),
showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter')