In [4]:
#Import the necessary libraries, Modules and classifiers
import numpy as np #NumPy is the fundamental package for scientific computing with Python.
import pandas as pd #Package is providing fast, flexible, and expressive data structures
#designed to make working with “relational” or “labeled” data both easy and intuitive.
import matplotlib.pyplot as plt #Python 2D plotting library
#Render figures inline, directly below the cell that creates them; code in later cells cannot alter an already rendered plot:
%matplotlib inline
#(Commenting on the same line causes an error)
import seaborn as sns #visualization library based on matplotlib, for statistical data visualization
from sklearn.linear_model import LinearRegression #module for Ordinary least squares Linear Regression.
from sklearn import linear_model #Module for applying the linear model using coefficients
#w_1, ..., w_p to minimize the residual sum of squares between the observed responses in the dataset.
from sklearn.model_selection import train_test_split #module for splitting data in train and test set
from sklearn.model_selection import cross_val_score #module for calculating the cross-validation-score
#Read the data from a CSV file located in the working directory of the notebook. The values must be
#separated by commas; otherwise specify the delimiter explicitly via the `sep` argument of read_csv.
#The path is written without a backslash: '.\HR_comma_sep.csv' only resolves on Windows, and '\H'
#is an invalid escape sequence inside a normal Python string literal.
hr_data = pd.read_csv('HR_comma_sep.csv', header=0) #header=0: the first line holds the column names
#(counting in Python starts with 0)
hr_data.head() #show the first five entries; a number in brackets sets the # of printed lines
Out[4]:
In [5]:
hr_data.info() #attribute summary: shows the datatype information about the attributes (columns)
In [6]:
#Rename the column 'sales' to 'department'; the mapping is {existing label: new label}.
#The result is reassigned instead of using inplace=True, so the frame is not mutated in place.
hr_data = hr_data.rename(columns={'sales': 'department'})
#One-hot encode the two categorical columns. The list has to be passed through the `columns`
#keyword: the second positional parameter of get_dummies is `prefix`, not `columns`.
#drop_first=True keeps k-1 dummies out of k categorical levels by removing the first level.
hr_data_new = pd.get_dummies(hr_data, columns=['department', 'salary'], drop_first=True)
hr_data_new.head() #show the first five entries; a number in brackets sets the # of printed lines
Out[6]:
In [7]:
from sklearn.model_selection import train_test_split #utility function to split the data into a
#development set used for fitting a GridSearchCV instance and an evaluation set for its final evaluation
#Separate the target variable (y) from the feature variables (X)
y = hr_data_new['satisfaction_level'] #target variable
X = hr_data_new.drop(columns='satisfaction_level') #all remaining columns are used as features
#Split into train and test set; proportion of the test set = 40%.
#A fixed random_state (seed) always produces the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)
In [8]:
#Print the dimensions of each part of the split, in the order X_train, X_test, y_train, y_test
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)
In [9]:
###linear regression###
In [10]:
X.columns.tolist() #list the available features (column labels of X)
Out[10]:
In [11]:
#Plot the pairwise relation between satisfaction_level and a selection of features.
#The first argument of pairplot is the data source, x_vars/y_vars select the features to observe,
#height and aspect set the size of each diagram, and kind='reg' fits a linear regression model
#to the scatter plots.
#`height` is used instead of `size`: the parameter was renamed in seaborn 0.9 and `size` is no
#longer accepted by current releases.
sns.pairplot(
    hr_data,
    x_vars=['last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company'],
    y_vars='satisfaction_level',
    height=7,
    aspect=0.5,
    kind='reg',
);
In [12]:
#load the packages for using a linear regression model
from sklearn import linear_model
from sklearn import metrics #for calculating metrics
#Instantiate an ordinary least squares model.
#The former `normalize=True` argument is omitted: it was deprecated in scikit-learn 1.0 and removed
#in 1.2, where passing it raises a TypeError. For an unpenalized least squares fit it did not alter
#the fitted predictions; if standardized regressors are wanted, scale them explicitly before fitting.
lr = linear_model.LinearRegression()
#Fit the model to the training data; the fitted estimator is kept under the name linreg as well
linreg = lr.fit(X_train, y_train)
In [13]:
#Check the size of the split sets once more (X_train, X_test, y_train, y_test), one shape per line
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')
In [14]:
#Show the parameters of the fitted model: first the intercept, then the coefficients.
#linreg is the result of lr.fit(...), so both values are read from the fitted estimator.
print(linreg.intercept_)
print(linreg.coef_)
In [15]:
#Evaluate the fitted model on both sets; score() returns the coefficient of determination R²
linreg_score_train = linreg.score(X_train, y_train)
linreg_score_test = linreg.score(X_test, y_test)
print("Training score: ",linreg_score_train)
print("Testing score: ",linreg_score_test)
In [16]:
#Plot predicted vs. actual values of the test set - shows the quality of the model visually
y_pred = lr.predict(X_test) #predictions based on the testing features
actual_values = y_test
fig, ax = plt.subplots()
ax.scatter(y_pred, actual_values, alpha=.75, color='black') #predicted values against the testing values
ax.plot(y_test, y_test, linewidth=2.0) #reference line (angle bisector): actual values against themselves
ax.set_title('Graphical linear Model Evaluation') #title of the plot
ax.set_xlabel('Predicted Satisfaction Level') #name of the x-axis
ax.set_ylabel('Actual Satisfaction Level') #name of the y-axis
plt.show()
In [17]:
##Regression metrics, calculated on the test set
from sklearn.metrics import mean_absolute_error, mean_squared_error
#Mean Absolute Error between the actual test values and the predicted values
ae = mean_absolute_error(y_test, y_pred)
print ('absolute error is: \n', ae)
#Root Mean Squared Error: compared to the Mean Absolute Error, RMSE amplifies and severely
#punishes large errors
mse = mean_squared_error(y_test, y_pred)
print ('RMSE is: \n', mse**0.5)
In [18]:
###logistic regression###
In [19]:
from sklearn.linear_model import LogisticRegression #import the model for Logistic regression
In [20]:
# We use model_selection and import the method to split the data randomly.
from sklearn.model_selection import train_test_split
#Initialize the logistic regression model with a fixed seed
#NOTE(review): the features are not scaled in this notebook; if fitting emits a ConvergenceWarning,
#consider raising max_iter or scaling the features - confirm on the installed scikit-learn version
logis = LogisticRegression(random_state=42)
#Separate the target variable 'left' (y) from the available features (X)
y = hr_data_new['left']
X = hr_data_new.drop(columns='left')
#Random split into training and test set with 40% test set; the fixed seed gives results that are
#comparable with each other and with other classifiers.
#stratify=y makes the split so that the proportion of the values of y in each sample is the same
#as the proportion in y itself.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1,
    stratify=y,
)
In [21]:
logis.fit(X_train, y_train) #fit the logistic regression model to the training data
Out[21]:
In [22]:
#Calculate the score of the fitted classifier on the training and on the testing set
logis_score_train = logis.score(X_train,y_train)
logis_score_test = logis.score(X_test,y_test)
print("Training score: ", logis_score_train)
print("Testing score: ", logis_score_test)
In [25]:
#Check the trained model: the intercept is printed first, the coefficients on the following line(s)
print(logis.intercept_, logis.coef_, sep='\n')
In [26]:
#we use ROC curve for visualization of the true positive rate(TPR) against the false positive rate(FPR)
from sklearn.metrics import roc_curve, roc_auc_score #import the modules for the curve and metrics
#To plot the curve, probability estimates are used; they are calculated with the fitted
#logistic regression classifier
probabilities = logis.predict_proba(X_test)
positive_scores = probabilities[:, 1] #second column of the estimates, used as the score below
#The curve is calculated for the entries in y_test against their score from the classifier.
#roc_curve returns fpr, tpr, thresholds; for further information see:
#http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
#The returned rates are saved in a dataframe
rates = pd.DataFrame({'False Positive Rate': fpr, 'True Positive Rate': tpr})
roc = plt.figure(figsize=(10, 6))
rocax = roc.add_axes([0, 0, 1, 1])
rocax.plot(fpr, tpr, color='b', label='Logistic Regression')
rocax.plot([0, 1], [0, 1], color='gray', ls='--', label='Baseline (Random Guessing)') #angle bisector
rocax.set_title('ROC Curve') #label of the diagram itself
rocax.set_xlabel('False Positive Rate') #label of the x-axis
rocax.set_ylabel('True Positive Rate') #label of the y-axis
rocax.legend() #show the legend
#Calculate and print the AUC = Area Under the Curve
print('Area Under the Curve:', roc_auc_score(y_test, positive_scores))