This project analyses backorder data from a product distributor, with the goal of predicting which SKUs will go on backorder.
In [1]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from time import time
import scipy
import matplotlib.patches as mpatches
from sklearn.metrics import fbeta_score, accuracy_score
from sklearn.decomposition import PCA,NMF
from sklearn.preprocessing import normalize
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import IsolationForest
from sklearn.pipeline import Pipeline
import helper as hp
# Pretty display for notebooks
%matplotlib inline
print "Libraries imported successfully"
In [2]:
# Load the backorders training dataset
try:
    data = pd.read_csv('backorderTrain.csv', header=0)
    print("backorders dataset has {} samples with {} features each.".format(*data.shape))
except FileNotFoundError:
    print("Dataset could not be loaded. Is the dataset missing?")
In [3]:
# Encode the yes/no flag columns as 0/1; the target column becomes the label array
for col in ['potential_issue', 'deck_risk', 'oe_constraint', 'ppap_risk', 'stop_auto_buy', 'rev_stop', 'went_on_backorder']:
    data[col] = (data[col] == 'Yes').astype(int)
labels = data.went_on_backorder
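Backorders are rare, so the class balance is worth checking up front; the heavy imbalance is also why sensitivity is used as the scoring metric later on. A quick check (sketch, assuming the `labels` array built above):
In [ ]:
# Inspect the class balance; the positive class should be a small minority
print(labels.value_counts())
print("positive rate: {:.4%}".format(labels.mean()))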
In [4]:
# Summary checks: missing values, SKU uniqueness, and the lead_time distribution per class
#display(data.describe())
display(data.isnull().sum())
display(data.sku.unique().shape[0])
display(data.sku.count())
data.hist(column='lead_time', by='went_on_backorder', figsize=(12, 12))
Out[4]:
In [5]:
#numerical columns
num_col = ['national_inv','in_transit_qty','forecast_3_month','forecast_6_month',
'forecast_9_month','sales_1_month','sales_3_month','sales_6_month','sales_9_month','min_bank',
'pieces_past_due','perf_6_month_avg','perf_12_month_avg','local_bo_qty']
In [6]:
display(data.hist(column='lead_time', by='went_on_backorder', figsize=(12, 6), sharex=True, bins=50))
In [7]:
# Drop the SKU identifier, the target, and lead_time before scaling
data.drop(['lead_time', 'sku', 'went_on_backorder'], axis=1, inplace=True)
In [8]:
# Scale the features: first squash each column into a fixed range with
# MinMaxScaler, then standardise to zero mean and unit standard deviation
from sklearn.preprocessing import MinMaxScaler, StandardScaler
mms = MinMaxScaler(feature_range=(-5000, 5000), copy=True)
sc = StandardScaler(copy=True, with_mean=True, with_std=True)
scaler = Pipeline([('min_max', mms), ('std', sc)])
std_data = pd.DataFrame(scaler.fit_transform(data), columns=data.keys())
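As a sanity check, the scaled output should have roughly zero mean and unit standard deviation per column; a minimal sketch:
In [ ]:
# After the MinMaxScaler -> StandardScaler pipeline every column should be
# standardised: mean ~0 and standard deviation ~1
print(std_data.mean().abs().max())  # expect a value close to 0
print(std_data.std().mean())        # expect a value close to 1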
In [26]:
#save standardised data
pickle.dump((std_data,labels), open('std_data.p', 'wb'))
In [11]:
# Save a 40% sample of the standardised data (work on a copy so std_data is untouched)
samples = std_data.copy()
samples['label'] = labels
samples = samples.sample(frac=0.4)
samples.hist(column='label', by='label', figsize=(12, 12), sharey=True)
samples_labels = samples.label
samples.drop('label', axis=1, inplace=True)
pickle.dump((samples, samples_labels), open('samples.p', 'wb'))
In [51]:
# Univariate feature selection: keep the 10 features with the highest scores
kb = SelectKBest(k=10)
feat_data = kb.fit_transform(std_data, labels)
indices = kb.get_support(indices=True)  # indices align with std_data/data columns, so no offset is needed
print(indices)
feat_data = pd.DataFrame(feat_data, columns=data.columns[indices])
display(feat_data.info())
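To see why these ten features were selected, the underlying scores can be inspected; a sketch (SelectKBest's default score_func is f_classif, the ANOVA F-value):
In [ ]:
# Rank every feature by its SelectKBest score, highest first
scores = pd.Series(kb.scores_, index=std_data.columns).sort_values(ascending=False)
display(scores)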
In [ ]:
# Radviz plot of the selected features (work on a copy so feat_data keeps only features)
plt.figure(figsize=(12, 12))
samples = feat_data.copy()
samples['label'] = labels
display(pd.plotting.radviz(samples, 'label'))
In [ ]:
plt.figure(figsize=(12, 12))
samples = std_data.copy()
samples['label'] = labels
samples_labels = samples.label
pd.plotting.radviz(samples.sample(50000), 'label')
samples.drop('label', axis=1, inplace=True)  # keep samples features-only for the cells below
In [15]:
def vs_frontiers(good_data, good_labels):
    """Fit an IsolationForest on the majority class (in 2-D PCA space) and
    plot its decision frontier against held-out regular and abnormal points."""
    from scipy import stats
    import matplotlib.font_manager
    from sklearn import svm
    pca = PCA(n_components=2, random_state=42)
    X_train = pca.fit_transform(good_data[good_labels == 0])
    X_test = pca.transform(good_data[good_labels == 0].sample(10634))
    X_outliers = pca.transform(good_data[good_labels == 1])
    outliers_fraction = 0.1
    # fit the model
    clf = IsolationForest(n_jobs=-1, random_state=42).fit(X_train)
    #clf = svm.OneClassSVM().fit(X_train)
    xmin, ymin = np.amin(X_train, axis=0)
    print(xmin, ymin)
    xmax, ymax = np.amax(X_train, axis=0)
    print(xmax, ymax)
    xmin, ymin, xmax, ymax = xmin - 20, ymin - 20, xmax + 20, ymax + 20
    xx, yy = np.meshgrid(np.linspace(xmin, xmax, 600), np.linspace(ymin, ymax, 600))
    # predict
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    y_pred_outliers = clf.predict(X_outliers)
    n_error_train = y_pred_train[y_pred_train == -1].size
    n_error_test = y_pred_test[y_pred_test == -1].size
    n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size
    # plot the frontier, the points, and the decision surface
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure(figsize=(12, 12))
    plt.title("Novelty Detection")
    scores_pred = clf.decision_function(X_train)
    threshold = stats.scoreatpercentile(scores_pred, 100 * outliers_fraction)
    #threshold = -0.18
    print(Z.min())
    print(threshold)
    print(Z.max())
    plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7), cmap=plt.cm.PuBu)
    a = plt.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='darkred')
    plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='palevioletred')
    s = 30
    b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s)
    b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=s)
    c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=s)
    plt.axis('tight')
    plt.xlim((xmin, xmax))
    plt.ylim((ymin, ymax))
    plt.legend([a.collections[0], b1, b2, c],
               ["learned frontier", "training observations",
                "new regular observations", "new abnormal observations"],
               loc="upper left",
               prop=matplotlib.font_manager.FontProperties(size=11))
    plt.xlabel("errors train: %d/%d ; errors novel regular: %d/%d ; "
               "errors novel abnormal: %d/%d"
               % (n_error_train, len(X_train), n_error_test, len(X_test),
                  n_error_outliers, len(X_outliers)))
    plt.show()
In [55]:
vs_frontiers(feat_data,labels)
In [16]:
#vs_frontiers(std_data,labels)
vs_frontiers(samples,samples_labels)
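Instead of computing a percentile threshold over the decision scores by hand, IsolationForest also accepts a `contamination` parameter and applies the matching threshold inside `predict()` itself; a minimal sketch on the full standardised data (assumes `std_data` and `labels` as above and an assumed 10% contamination rate):
In [ ]:
# Fit on the majority class only and let the forest set its own threshold
iso = IsolationForest(contamination=0.1, n_jobs=-1, random_state=42)
iso.fit(std_data[labels == 0])
flags = iso.predict(std_data)  # +1 = inlier, -1 = flagged as an outlier
print("flagged as outliers: {:.2%}".format((flags == -1).mean()))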
In [12]:
# PCA explanation
def pca_results(good_data, pca):
    '''
    Create a DataFrame of the PCA results.
    Includes dimension feature weights and explained variance.
    Visualizes the PCA results.
    '''
    # Dimension indexing
    dimensions = ['Dimension {}'.format(i) for i in range(1, len(pca.components_) + 1)]
    # PCA components
    components = pd.DataFrame(np.round(pca.components_, 4), columns=good_data.keys())
    components.index = dimensions
    # PCA explained variance
    ratios = pca.explained_variance_ratio_.reshape(len(pca.components_), 1)
    variance_ratios = pd.DataFrame(np.round(ratios, 4), columns=['Explained Variance'])
    variance_ratios.index = dimensions
    # Create a bar plot visualization
    fig, ax = plt.subplots(figsize=(14, 12))
    # Plot the feature weights as a function of the components
    components.plot(ax=ax, kind='bar')
    ax.set_ylabel("Feature Weights")
    ax.set_xticklabels(dimensions, rotation=1)
    # Display the explained variance ratios
    for i, ev in enumerate(pca.explained_variance_ratio_):
        ax.text(i - 0.40, ax.get_ylim()[1] + 0.05, "Explained Variance\n %.4f" % ev)
    # Return a concatenated DataFrame
    return pd.concat([variance_ratios, components], axis=1)

pca = PCA(n_components=2, random_state=42).fit(samples)
pca_result = pca_results(samples, pca)
In [55]:
pca_result['Explained Variance'].cumsum()
Out[55]:
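Two components keep only part of the variance; one way to pick `n_components` (sketch) is to fit a full PCA and read off the cumulative explained variance curve:
In [ ]:
# Cumulative explained variance of a full PCA; keep enough components to
# cover, say, ~95% of the variance
full_pca = PCA(random_state=42).fit(samples)
cum_var = np.cumsum(full_pca.explained_variance_ratio_)
plt.plot(range(1, len(cum_var) + 1), cum_var, marker='o')
plt.axhline(0.95, linestyle='--', color='grey')
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.show()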
In [43]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_selection import SelectKBest
from imblearn.metrics import sensitivity_score
from sklearn.linear_model import LogisticRegression

def fit_model(X, y):
    # The dataset is high-dimensional, so reduce it with PCA ...
    pca = PCA(random_state=42)
    # ... but keep the best original features as well
    selection = SelectKBest(k=1)
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
    # sensitivity_score needs discrete class predictions, so use a classifier
    # (LogisticRegression) rather than LinearRegression here
    clf = LogisticRegression()
    # Set up the pipeline
    pipeline = Pipeline([("features", combined_features), ("clf", clf)])
    # Parameter dictionary for the grid search
    param_grid = dict(features__pca__n_components=[12],
                      features__univ_select__k=[1, 2])
    # Sensitivity (recall on the positive class) as the scoring function
    scoring_fnc = make_scorer(sensitivity_score)
    # Create the grid search object
    grid = GridSearchCV(pipeline, param_grid, scoring=scoring_fnc, cv=3)
    # Fit the grid search object to the data to find the optimal model
    grid = grid.fit(X, y)
    # Return the optimal model after fitting the data
    return grid.best_estimator_
In [17]:
# Split the data into training and testing sets for the grid search
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(std_data, labels, test_size=0.3, random_state=42)
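Because positive examples are so rare, a stratified split that preserves the class ratio in both folds may be preferable; a minimal variant (sketch):
In [ ]:
# Stratify on the labels so train and test keep the same backorder rate
X_train, X_test, y_train, y_test = train_test_split(
    std_data, labels, test_size=0.3, random_state=42, stratify=labels)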
In [ ]:
clf = fit_model(X_train, y_train)
print('operation completed')
display(clf.get_params())
In [ ]:
# Save the estimator
pickle.dump(clf, open('estimator.p', 'wb'))
In [ ]:
# Load the kepler test dataset
try:
    Testdata = pd.read_csv(os.path.join("project_dataset", 'exoTest.csv'), index_col=0)
    print("kepler dataset has {} samples with {} features each.".format(*Testdata.shape))
except FileNotFoundError:
    print("Dataset could not be loaded. Is the dataset missing?")
In [ ]:
# Create the label array and drop it from the features
# (original labels are 1 and 2; remap to 0/1 with 2 as the positive class)
testlabels = Testdata.LABEL.replace({1: 0, 2: 1})
Testdata.drop('LABEL', axis=1, inplace=True)
In [ ]:
from sklearn.metrics import classification_report
# The fitted grid-search pipeline already applies the feature transforms,
# so predict on the raw test features directly (clf from fit_model above)
y_pred = clf.predict(Testdata)
print(classification_report(testlabels, y_pred))
In [ ]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(testlabels, y_pred)
df_cm = pd.DataFrame(100 * cm / float(cm.sum()))
ax = sns.heatmap(df_cm.round(2), annot=True, cmap='Greys', fmt='g', linewidths=1)
ax.set_title("Confusion Matrix - per 100 predictions")
ax.set_xlabel('Predicted', fontsize=16)
ax.set_ylabel('True', fontsize=16, rotation=90)
plt.show()
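Sensitivity (the metric the grid search optimised) and precision for the positive class can also be read straight off the confusion matrix; a sketch:
In [ ]:
# Derive recall (sensitivity) and precision for the positive class from cm;
# sklearn's confusion_matrix ravels as (tn, fp, fn, tp) for binary labels
tn, fp, fn, tp = cm.ravel()
recall = tp / float(tp + fn)
precision = tp / float(tp + fp) if (tp + fp) else 0.0
print("recall (sensitivity): {:.3f}, precision: {:.3f}".format(recall, precision))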