In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [3]:
df = pd.read_csv('HR_comma_sep.csv')
df.head()
Out[3]:
In [4]:
df.dtypes
Out[4]:
In [48]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# import the scatter_matrix functionality
from pandas.plotting import scatter_matrix
# define a colors list: stayed (left=0) plots cyan, left (left=1) plots magenta
colors = ['cyan', 'magenta']
# make a scatter plot
df_numeric = df[['satisfaction_level','last_evaluation','average_montly_hours']]
scatter_matrix(df_numeric,figsize=[20,20],marker='.',c=df.left.apply(lambda x:colors[x]))
df.info()
In [6]:
left = df[df['left'] == 1]
stayed = df[df['left'] == 0]
In [7]:
df.dtypes
Out[7]:
In [8]:
left.describe()
Out[8]:
In [9]:
stayed.describe()
Out[9]:
In [10]:
stayed_summary = stayed.describe()
left_summary = left.describe()
# grab the 'mean' row from each summary
mean_stayed = stayed_summary.loc['mean']
mean_left = left_summary.loc['mean']
means = pd.concat([mean_stayed, mean_left], axis=1)
means.columns = ['stayed', 'left']
means
Out[10]:
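The same side-by-side comparison can be built in one step with groupby; a minimal sketch, assuming only the numeric columns should be averaged:
In [ ]:
# mean of every numeric column, split by attrition, transposed to match the table above
df.groupby('left').mean(numeric_only=True).T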
In [55]:
# narrowed the color gradation to +/-0.5 to make weaker correlations easier to see
correlations = df.corr(numeric_only=True)
plt.figure(figsize=(10,10))
import seaborn as sns
sns.heatmap(correlations,vmin=-.5,vmax=.5,square=True,annot=True,cmap='cool_r')
plt.title('Correlation Matrix')
Out[55]:
In [12]:
# salary is a string category, so map it to an ordinal scale before averaging
sns.barplot(x="time_spend_company", y=df["salary"].map({'low': 0, 'medium': 1, 'high': 2}), hue="left", data=df)
Out[12]:
In [20]:
dependents = df[['last_evaluation','time_spend_company']]
fig, ax = plt.subplots()
ax.scatter(df.last_evaluation, df.time_spend_company, color=df.left.apply(lambda x:colors[x]), marker='.', alpha=.4)
ax.set_xlabel('last_evaluation')
ax.set_ylabel('time_spend_company')
#scatter_matrix(dependents,figsize=[20,20],marker='.',c=df.left.apply(lambda x:colors[x]))
Out[20]:
In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
In [15]:
# histogram of satisfaction level
df.satisfaction_level.hist()
plt.title('Histogram of Satisfaction Level')
plt.xlabel('Satisfaction Level')
plt.ylabel('Frequency')
Out[15]:
In [190]:
# encode salary as an ordinal feature: low=0, medium=1, high=2
df['salary_ordinal'] = df['salary'].map({'low': 0, 'medium': 1, 'high': 2})
df[['salary', 'salary_ordinal']].head()
Out[190]:
In [90]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
# select the numeric predictors for the attrition model
features = ['satisfaction_level','last_evaluation','number_project',
            'average_montly_hours','time_spend_company','Work_accident','promotion_last_5years']
#output_df = df[['left']]
X,y = df[features],df['left']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
print(len(X_train))
print(len(X_test))
# the default split holds out 25% of the 14,999 rows for testing
3750/(3750+11249)
Out[90]:
In [91]:
# fit a logistic regression model to the data
model = LogisticRegression(fit_intercept=False)
model.fit(X_train, y_train)
Out[91]:
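cross_val_score was imported above but never used; a minimal sketch of 5-fold cross-validation on the same estimator:
In [ ]:
# 5-fold cross-validated accuracy over the full feature matrix
scores = cross_val_score(LogisticRegression(fit_intercept=False), X, y, cv=5)
print(scores.mean())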
In [112]:
coef = pd.DataFrame(np.transpose(model.coef_))
coef.columns = ['coefficients']
coef.index = features
coef
Out[112]:
In [114]:
from sklearn.metrics import confusion_matrix
# make predictions on the held-out set
expected = y_test
# also store in-sample predictions on the full frame for later inspection
df['predicted'] = model.predict(X)
predicted = model.predict(X_test)
# summarize the fit of the model with a confusion matrix
pd.DataFrame(confusion_matrix(expected, predicted))
Out[114]:
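The raw confusion matrix can be summarized further; a short sketch using sklearn's metrics module:
In [ ]:
# overall accuracy plus per-class precision/recall on the held-out set
print(metrics.accuracy_score(expected, predicted))
print(metrics.classification_report(expected, predicted))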
Missing categorical variables (salary and sales): salary is added as an ordinal feature in the next cell, while the sales (department) column is still unused; see the one-hot encoding sketch after that cell.
In [192]:
# re-running the logistic regression with salary included
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
# add the ordinal salary encoding to the predictors
features = ['satisfaction_level','last_evaluation','number_project',
            'average_montly_hours','time_spend_company','Work_accident','promotion_last_5years','salary_ordinal']
#output_df = df[['left']]
X,y = df[features],df['left']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
model = LogisticRegression(fit_intercept=False)
model.fit(X_train, y_train)
coef = pd.DataFrame(np.transpose(model.coef_))
coef.columns = ['coefficients']
coef.index = features
coef
from sklearn.metrics import confusion_matrix
# make predictions
expected = y_test
df['predicted'] = model.predict(X)
predicted = model.predict(X_test)
# summarize the fit of the model
pd.DataFrame(confusion_matrix(expected, predicted))
Out[192]:
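The sales (department) column is still absent from the model; a minimal sketch of one-hot encoding it with pd.get_dummies and refitting, assuming the same split and estimator settings as above:
In [ ]:
# one-hot encode the department column and refit the same model
sales_dummies = pd.get_dummies(df['sales'], prefix='sales')
X_full = pd.concat([df[features], sales_dummies], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X_full, df['left'], random_state=0)
model_full = LogisticRegression(fit_intercept=False).fit(X_train, y_train)
pd.DataFrame(confusion_matrix(y_test, model_full.predict(X_test)))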
In [ ]:
#next step would either be a PCA or to add in those simulation ranges as features
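A minimal sketch of the PCA idea mentioned above, standardizing first since the features are on very different scales:
In [ ]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# project the standardized predictors onto their first two principal components
X_scaled = StandardScaler().fit_transform(df[features])
pca = PCA(n_components=2)
components = pca.fit_transform(X_scaled)
print(pca.explained_variance_ratio_)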
In [109]:
# the same model with statsmodels, which provides a full coefficient summary
from statsmodels.formula.api import logit
attrition_mod = logit(("left ~ satisfaction_level+last_evaluation+number_project+"
"average_montly_hours+time_spend_company+Work_accident+promotion_last_5years"), df).fit()
In [110]:
print(attrition_mod.summary())
In [111]:
attrition_mod.pred_table()
Out[111]:
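A quick sketch for interpreting the logit coefficients as odds ratios, by exponentiating the fitted params:
In [ ]:
# odds ratio per unit change in each predictor
np.exp(attrition_mod.params)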
In [99]:
model.fit_intercept
Out[99]:
In [47]:
# weight each observation by 1/n so the two groups are comparable as proportions
weights_stayed = np.ones_like(stayed['time_spend_company'])/len(stayed)
weights_left = np.ones_like(left['time_spend_company'])/len(left)
plt.hist(left['time_spend_company'],bins=10,color='magenta',label='left',weights=weights_left)
plt.hist(stayed['time_spend_company'],bins=10,rwidth=.5,alpha=0.75,color='cyan',label='stayed',weights=weights_stayed)
plt.title("Attrition versus Time Spent in Company")
plt.xlabel("Time Spent in Company")
plt.ylabel("Proportion")
plt.legend()
plt.show()
In [45]:
# sanity check: the 'stayed' distribution alone, with the same proportional weighting
weights_stayed = np.ones_like(stayed['time_spend_company'])/len(stayed)
plt.hist(stayed['time_spend_company'],color='cyan',label='stayed',weights=weights_stayed)
Out[45]:
Although it doesn't appear that more individuals leave the longer they are at the agency, both distributions are skewed right, which suggests employees typically don't stay longer than about six years.