In [2]:

    
import numpy as np
import pandas as pd
import matplotlib as plt



In [3]:

    
df = pd.read_csv('HR_comma_sep.csv')
df.head()









    Out[3]:






  
    
      
      satisfaction_level
      last_evaluation
      number_project
      average_montly_hours
      time_spend_company
      Work_accident
      left
      promotion_last_5years
      sales
      salary
    
  
  
    
      0
      0.38
      0.53
      2
      157
      3
      0
      1
      0
      sales
      low
    
    
      1
      0.80
      0.86
      5
      262
      6
      0
      1
      0
      sales
      medium
    
    
      2
      0.11
      0.88
      7
      272
      4
      0
      1
      0
      sales
      medium
    
    
      3
      0.72
      0.87
      5
      223
      5
      0
      1
      0
      sales
      low
    
    
      4
      0.37
      0.52
      2
      159
      3
      0
      1
      0
      sales
      low



In [4]:

    
df.dtypes









    Out[4]:





satisfaction_level       float64
last_evaluation          float64
number_project             int64
average_montly_hours       int64
time_spend_company         int64
Work_accident              int64
left                       int64
promotion_last_5years      int64
sales                     object
salary                    object
dtype: object



In [48]:

    
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


# import the scatter_matrix functionality
from pandas.tools.plotting import scatter_matrix

# define colors list, to be used to plot survived either red (=0) or green (=1)
colors=['cyan','magenta']

# make a scatter plot
df_numeric = df[['satisfaction_level','last_evaluation','average_montly_hours']]
scatter_matrix(df_numeric,figsize=[20,20],marker='.',c=df.left.apply(lambda x:colors[x]))

df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
satisfaction_level       14999 non-null float64
last_evaluation          14999 non-null float64
number_project           14999 non-null int64
average_montly_hours     14999 non-null int64
time_spend_company       14999 non-null int64
Work_accident            14999 non-null int64
left                     14999 non-null int64
promotion_last_5years    14999 non-null int64
sales                    14999 non-null object
salary                   14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB



In [6]:

    
left = df[df['left'] == 1]
stayed = df[df['left'] == 0]



In [7]:

    
df.dtypes









    Out[7]:





satisfaction_level       float64
last_evaluation          float64
number_project             int64
average_montly_hours       int64
time_spend_company         int64
Work_accident              int64
left                       int64
promotion_last_5years      int64
sales                     object
salary                    object
dtype: object



In [8]:

    
left.describe()









    Out[8]:






  
    
      
      satisfaction_level
      last_evaluation
      number_project
      average_montly_hours
      time_spend_company
      Work_accident
      left
      promotion_last_5years
    
  
  
    
      count
      3571.000000
      3571.000000
      3571.000000
      3571.000000
      3571.000000
      3571.000000
      3571.0
      3571.000000
    
    
      mean
      0.440098
      0.718113
      3.855503
      207.419210
      3.876505
      0.047326
      1.0
      0.005321
    
    
      std
      0.263933
      0.197673
      1.818165
      61.202825
      0.977698
      0.212364
      0.0
      0.072759
    
    
      min
      0.090000
      0.450000
      2.000000
      126.000000
      2.000000
      0.000000
      1.0
      0.000000
    
    
      25%
      0.130000
      0.520000
      2.000000
      146.000000
      3.000000
      0.000000
      1.0
      0.000000
    
    
      50%
      0.410000
      0.790000
      4.000000
      224.000000
      4.000000
      0.000000
      1.0
      0.000000
    
    
      75%
      0.730000
      0.900000
      6.000000
      262.000000
      5.000000
      0.000000
      1.0
      0.000000
    
    
      max
      0.920000
      1.000000
      7.000000
      310.000000
      6.000000
      1.000000
      1.0
      1.000000



In [9]:

    
stayed.describe()









    Out[9]:






  
    
      
      satisfaction_level
      last_evaluation
      number_project
      average_montly_hours
      time_spend_company
      Work_accident
      left
      promotion_last_5years
    
  
  
    
      count
      11428.000000
      11428.000000
      11428.000000
      11428.000000
      11428.000000
      11428.000000
      11428.0
      11428.000000
    
    
      mean
      0.666810
      0.715473
      3.786664
      199.060203
      3.380032
      0.175009
      0.0
      0.026251
    
    
      std
      0.217104
      0.162005
      0.979884
      45.682731
      1.562348
      0.379991
      0.0
      0.159889
    
    
      min
      0.120000
      0.360000
      2.000000
      96.000000
      2.000000
      0.000000
      0.0
      0.000000
    
    
      25%
      0.540000
      0.580000
      3.000000
      162.000000
      2.000000
      0.000000
      0.0
      0.000000
    
    
      50%
      0.690000
      0.710000
      4.000000
      198.000000
      3.000000
      0.000000
      0.0
      0.000000
    
    
      75%
      0.840000
      0.850000
      4.000000
      238.000000
      4.000000
      0.000000
      0.0
      0.000000
    
    
      max
      1.000000
      1.000000
      6.000000
      287.000000
      10.000000
      1.000000
      0.0
      1.000000



In [10]:

    
stayed_summary = pd.DataFrame(stayed.describe())
left_summary = pd.DataFrame(left.describe())
mean_stayed = stayed_summary.iloc[1]
mean_left = left_summary.iloc[1]
#pd.mean_stayed
means = pd.concat([mean_stayed,mean_left],axis=1)
means.columns = ['stayed','left']
means









    Out[10]:






  
    
      
      stayed
      left
    
  
  
    
      satisfaction_level
      0.666810
      0.440098
    
    
      last_evaluation
      0.715473
      0.718113
    
    
      number_project
      3.786664
      3.855503
    
    
      average_montly_hours
      199.060203
      207.419210
    
    
      time_spend_company
      3.380032
      3.876505
    
    
      Work_accident
      0.175009
      0.047326
    
    
      left
      0.000000
      1.000000
    
    
      promotion_last_5years
      0.026251
      0.005321



In [55]:

    
#changed the gradiation
correlations = df.corr()
plt.figure(figsize=(10,10))
import seaborn as sns
sns.heatmap(correlations,vmin=-.5,vmax=.5,square=True,annot=True,cmap='cool_r')
plt.title('Correlation Matrix')









    Out[55]:





<matplotlib.text.Text at 0x7fd601353ef0>



In [12]:

    
sns.barplot(x="time_spend_company", y="salary",hue="left", data=df)









    Out[12]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fd601fa9be0>



In [20]:

    
dependents = df[['last_evaluation','time_spend_company']]
fig, ax = plt.subplots()
ax.scatter(df.last_evaluation, df.time_spend_company, color=df.left.apply(lambda x:colors[x]), marker='.', alpha=.4)
ax.label('')
#scatter_matrix(dependents,figsize=[20,20],marker='.',c=df.left.apply(lambda x:colors[x]))









    Out[20]:





<matplotlib.collections.PathCollection at 0x7fd613fc0d30>



In [1]:

    
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score









    



/home/dsci/anaconda3/lib/python3.6/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)



In [15]:

    
# histogram of education
df.satisfaction_level.hist()
plt.title('Histogram of Salary')
plt.xlabel('Salary Category')
plt.ylabel('Frequency')









    Out[15]:





<matplotlib.text.Text at 0x7f14a03fbf28>



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:

Cleaning



In [190]:

    
df['salary_ordinal'] = df['salary']
df['salary_ordinal'] = df['salary_ordinal'].replace('low',0)
df['salary_ordinal'] = df['salary_ordinal'].replace('medium',1)
df['salary_ordinal'] = df['salary_ordinal'].replace('high',2)
df[[9,11]].head()









    Out[190]:






  
    
      
      salary
      salary_ordinal
    
  
  
    
      0
      low
      0
    
    
      1
      medium
      1
    
    
      2
      medium
      1
    
    
      3
      low
      0
    
    
      4
      low
      0



In [90]:

    
from sklearn import datasets
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

# load the iris datasets
features = ['satisfaction_level','satisfaction_level','last_evaluation','number_project',
            'average_montly_hours','time_spend_company','Work_accident','promotion_last_5years']
#output_df = df[['left']]
X,y = df[features],df['left']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

print(len(X_train))
print(len(X_test))
3750/(3750+11249)









    



11249
3750






    Out[90]:





0.25001666777785186



In [91]:

    
# fit a logistic regression model to the data
model = LogisticRegression(fit_intercept=False)
model.fit(X_train, y_train)









    Out[91]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [112]:

    
coef = pd.DataFrame(np.transpose(model.coef_))
coef.columns = ['coefficients']
coef.index = features
coef









    Out[112]:






  
    
      
      coefficients
    
  
  
    
      satisfaction_level
      -2.009112
    
    
      satisfaction_level
      -2.009112
    
    
      last_evaluation
      0.774820
    
    
      number_project
      -0.296261
    
    
      average_montly_hours
      0.004692
    
    
      time_spend_company
      0.240505
    
    
      Work_accident
      -1.419920
    
    
      promotion_last_5years
      -1.533047



In [114]:

    
from sklearn.metrics import confusion_matrix
# make predictions
expected = y_test
df['predicted'] = model.predict(X)
predicted = model.predict(X_test)
# summarize the fit of the model
pd.DataFrame(confusion_matrix(expected, predicted))

missing categorical variables(salary and sales)



In [192]:

    
#running the logistic with salary
from sklearn import datasets
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

# load the iris datasets
features = ['satisfaction_level','satisfaction_level','last_evaluation','number_project',
            'average_montly_hours','time_spend_company','Work_accident','promotion_last_5years','salary_ordinal']
#output_df = df[['left']]
X,y = df[features],df['left']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
model = LogisticRegression(fit_intercept=False)
model.fit(X_train, y_train)
coef = pd.DataFrame(np.transpose(model.coef_))
coef.columns = ['coefficients']
coef.index = features
coef
from sklearn.metrics import confusion_matrix
# make predictions
expected = y_test
df['predicted'] = model.predict(X)
predicted = model.predict(X_test)
# summarize the fit of the model
pd.DataFrame(confusion_matrix(expected, predicted))



In [ ]:



In [ ]:

    
#next step would either be a PCA or to add in those simulation ranges as features



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [109]:

    
#just a different package
from statsmodels.formula.api import logit
affair_mod = logit(("left ~ satisfaction_level+last_evaluation+number_project+"
                   "average_montly_hours+time_spend_company+Work_accident+promotion_last_5years"), df).fit()









    



Optimization terminated successfully.
         Current function value: 0.444362
         Iterations 7



In [110]:

    
print(affair_mod.summary())









    



                           Logit Regression Results                           
==============================================================================
Dep. Variable:                   left   No. Observations:                14999
Model:                          Logit   Df Residuals:                    14991
Method:                           MLE   Df Model:                            7
Date:                Wed, 15 Mar 2017   Pseudo R-squ.:                  0.1904
Time:                        22:35:49   Log-Likelihood:                -6665.0
converged:                       True   LL-Null:                       -8232.3
                                        LLR p-value:                     0.000
=========================================================================================
                            coef    std err          z      P>|z|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept                 0.2240      0.117      1.915      0.055        -0.005     0.453
satisfaction_level       -4.1234      0.096    -42.731      0.000        -4.312    -3.934
last_evaluation           0.7626      0.146      5.235      0.000         0.477     1.048
number_project           -0.3085      0.021    -14.808      0.000        -0.349    -0.268
average_montly_hours      0.0043      0.001      8.611      0.000         0.003     0.005
time_spend_company        0.2268      0.015     15.295      0.000         0.198     0.256
Work_accident            -1.4952      0.088    -16.949      0.000        -1.668    -1.322
promotion_last_5years    -1.7945      0.256     -7.015      0.000        -2.296    -1.293
=========================================================================================



In [111]:

    
affair_mod.pred_table()









    Out[111]:





array([[ 10581.,    847.],
       [  2670.,    901.]])



In [99]:

    
model.fit_intercept









    Out[99]:





False



In [47]:

    
#plt.subplot(2, 1, 1)
#plt.hist(left['time_spend_company'],bins=10,normed=1,color='red',label='left')
#plt.hist(stayed['time_spend_company'],bins=10,normed=1,rwidth=.5,alpha=0.75,color='green',label='stayed')
#plt.title("Attrition versus Time Spent in Company")
#plt.xlabel("Time Spent in Company")
#plt.ylabel("Proportion")
#plt.legend()
#plt.show()

#plt.subplot(2, 1, 2)
weights_stayed = np.ones_like(stayed['time_spend_company'])/len(stayed)
weights_left = np.ones_like(left['time_spend_company'])/len(left)
plt.hist(left['time_spend_company'],bins=10,color='magenta',label='left',weights=weights_left)
plt.hist(stayed['time_spend_company'],bins=10,rwidth=.5,alpha=0.75,color='cyan',label='stayed',weights=weights_stayed)
plt.title("Attrition versus Time Spent in Company")
plt.xlabel("Time Spent in Company")
plt.ylabel("Count")
plt.legend()
plt.show()



In [45]:

    
weights_stayed = np.ones_like(stayed['time_spend_company'])/len(stayed)
plt.hist(stayed['time_spend_company'],color='cyan',label='stayed',weights=weights_stayed)









    Out[45]:





(array([ 0.27922646,  0.42500875,  0.14586979,  0.0560028 ,  0.        ,
         0.04453973,  0.01645082,  0.01417571,  0.        ,  0.01872594]),
 array([  2. ,   2.8,   3.6,   4.4,   5.2,   6. ,   6.8,   7.6,   8.4,
          9.2,  10. ]),
 <a list of 10 Patch objects>)

Although it doesnt appear the number of individuals leave the longer they are at the agency, it does seem theey're both skewed right which might imply employees typically dont work there longer than 6 years or less.

	0	1
0	2651	230
1	650	219

	0	1
0	2661	220
1	634	235

	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	left	sales	salary
0	0.38	0.53	2	157	3	1	sales	low
1	0.80	0.86	5	262	6	1	sales	medium
2	0.11	0.88	7	272	4	1	sales	medium
3	0.72	0.87	5	223	5	1	sales	low
4	0.37	0.52	2	159	3	1	sales	low

	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	Work_accident	left	promotion_last_5years
count	3571.000000	3571.000000	3571.000000	3571.000000	3571.000000	3571.000000	3571.0	3571.000000
mean	0.440098	0.718113	3.855503	207.419210	3.876505	0.047326	1.0	0.005321
std	0.263933	0.197673	1.818165	61.202825	0.977698	0.212364	0.0	0.072759
min	0.090000	0.450000	2.000000	126.000000	2.000000	0.000000	1.0	0.000000
25%	0.130000	0.520000	2.000000	146.000000	3.000000	0.000000	1.0	0.000000
50%	0.410000	0.790000	4.000000	224.000000	4.000000	0.000000	1.0	0.000000
75%	0.730000	0.900000	6.000000	262.000000	5.000000	0.000000	1.0	0.000000
max	0.920000	1.000000	7.000000	310.000000	6.000000	1.000000	1.0	1.000000

	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	Work_accident	left	promotion_last_5years
count	11428.000000	11428.000000	11428.000000	11428.000000	11428.000000	11428.000000	11428.0	11428.000000
mean	0.666810	0.715473	3.786664	199.060203	3.380032	0.175009	0.0	0.026251
std	0.217104	0.162005	0.979884	45.682731	1.562348	0.379991	0.0	0.159889
min	0.120000	0.360000	2.000000	96.000000	2.000000	0.000000	0.0	0.000000
25%	0.540000	0.580000	3.000000	162.000000	2.000000	0.000000	0.0	0.000000
50%	0.690000	0.710000	4.000000	198.000000	3.000000	0.000000	0.0	0.000000
75%	0.840000	0.850000	4.000000	238.000000	4.000000	0.000000	0.0	0.000000
max	1.000000	1.000000	6.000000	287.000000	10.000000	1.000000	0.0	1.000000

	stayed	left
satisfaction_level	0.666810	0.440098
last_evaluation	0.715473	0.718113
number_project	3.786664	3.855503
average_montly_hours	199.060203	207.419210
time_spend_company	3.380032	3.876505
Work_accident	0.175009	0.047326
left	0.000000	1.000000
promotion_last_5years	0.026251	0.005321

	coefficients
satisfaction_level	-2.009112
satisfaction_level	-2.009112
last_evaluation	0.774820
number_project	-0.296261
average_montly_hours	0.004692
time_spend_company	0.240505
Work_accident	-1.419920
promotion_last_5years	-1.533047