This notebook contains a very early prototype for a LogisticRegression machine learning model.

There is much more that needs to be done to flesh out this solution. We are predicting the probability of a order item being cancelled, but it may make more sense to predict the probability of the overall order being cancelled.


In [1]:
!pip install sklearn
!pip install pandas


Requirement already satisfied: sklearn in /usr/local/lib/python2.7/site-packages
Requirement already satisfied: scikit-learn in /usr/local/lib/python2.7/site-packages (from sklearn)
Requirement already satisfied: pandas in /usr/local/lib/python2.7/site-packages
Requirement already satisfied: numpy>=1.7.0 in /usr/local/lib/python2.7/site-packages (from pandas)
Requirement already satisfied: python-dateutil in /usr/local/lib/python2.7/site-packages (from pandas)
Requirement already satisfied: pytz>=2011k in /usr/local/lib/python2.7/site-packages (from pandas)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python2.7/site-packages (from python-dateutil->pandas)

In [2]:
from sklearn import cluster
import pandas as pd

In [3]:
ONLINE_RETAIL_XLSX  = '../data/OnlineRetail.xlsx'

In [4]:
df = pd.read_excel(ONLINE_RETAIL_XLSX, sheetname='Online Retail')

In [5]:
df.columns


Out[5]:
Index([u'InvoiceNo', u'StockCode', u'Description', u'Quantity', u'InvoiceDate',
       u'UnitPrice', u'CustomerID', u'Country'],
      dtype='object')

In [6]:
# If this code starts with letter 'c', it indicates a cancellation. 
df['Cancelled'] = df['InvoiceNo'].str.startswith('C')

mask = df['Cancelled'] == True
df.loc[mask, 'Cancelled'] = 1

mask = df['Cancelled'].isnull()
df.loc[mask, 'Cancelled'] = 0

df


Out[6]:
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country Cancelled
0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 2010-12-01 08:26:00 2.55 17850.0 United Kingdom 0
1 536365 71053 WHITE METAL LANTERN 6 2010-12-01 08:26:00 3.39 17850.0 United Kingdom 0
2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 2010-12-01 08:26:00 2.75 17850.0 United Kingdom 0
3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 2010-12-01 08:26:00 3.39 17850.0 United Kingdom 0
4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 2010-12-01 08:26:00 3.39 17850.0 United Kingdom 0
5 536365 22752 SET 7 BABUSHKA NESTING BOXES 2 2010-12-01 08:26:00 7.65 17850.0 United Kingdom 0
6 536365 21730 GLASS STAR FROSTED T-LIGHT HOLDER 6 2010-12-01 08:26:00 4.25 17850.0 United Kingdom 0
7 536366 22633 HAND WARMER UNION JACK 6 2010-12-01 08:28:00 1.85 17850.0 United Kingdom 0
8 536366 22632 HAND WARMER RED POLKA DOT 6 2010-12-01 08:28:00 1.85 17850.0 United Kingdom 0
9 536367 84879 ASSORTED COLOUR BIRD ORNAMENT 32 2010-12-01 08:34:00 1.69 13047.0 United Kingdom 0
10 536367 22745 POPPY'S PLAYHOUSE BEDROOM 6 2010-12-01 08:34:00 2.10 13047.0 United Kingdom 0
11 536367 22748 POPPY'S PLAYHOUSE KITCHEN 6 2010-12-01 08:34:00 2.10 13047.0 United Kingdom 0
12 536367 22749 FELTCRAFT PRINCESS CHARLOTTE DOLL 8 2010-12-01 08:34:00 3.75 13047.0 United Kingdom 0
13 536367 22310 IVORY KNITTED MUG COSY 6 2010-12-01 08:34:00 1.65 13047.0 United Kingdom 0
14 536367 84969 BOX OF 6 ASSORTED COLOUR TEASPOONS 6 2010-12-01 08:34:00 4.25 13047.0 United Kingdom 0
15 536367 22623 BOX OF VINTAGE JIGSAW BLOCKS 3 2010-12-01 08:34:00 4.95 13047.0 United Kingdom 0
16 536367 22622 BOX OF VINTAGE ALPHABET BLOCKS 2 2010-12-01 08:34:00 9.95 13047.0 United Kingdom 0
17 536367 21754 HOME BUILDING BLOCK WORD 3 2010-12-01 08:34:00 5.95 13047.0 United Kingdom 0
18 536367 21755 LOVE BUILDING BLOCK WORD 3 2010-12-01 08:34:00 5.95 13047.0 United Kingdom 0
19 536367 21777 RECIPE BOX WITH METAL HEART 4 2010-12-01 08:34:00 7.95 13047.0 United Kingdom 0
20 536367 48187 DOORMAT NEW ENGLAND 4 2010-12-01 08:34:00 7.95 13047.0 United Kingdom 0
21 536368 22960 JAM MAKING SET WITH JARS 6 2010-12-01 08:34:00 4.25 13047.0 United Kingdom 0
22 536368 22913 RED COAT RACK PARIS FASHION 3 2010-12-01 08:34:00 4.95 13047.0 United Kingdom 0
23 536368 22912 YELLOW COAT RACK PARIS FASHION 3 2010-12-01 08:34:00 4.95 13047.0 United Kingdom 0
24 536368 22914 BLUE COAT RACK PARIS FASHION 3 2010-12-01 08:34:00 4.95 13047.0 United Kingdom 0
25 536369 21756 BATH BUILDING BLOCK WORD 3 2010-12-01 08:35:00 5.95 13047.0 United Kingdom 0
26 536370 22728 ALARM CLOCK BAKELIKE PINK 24 2010-12-01 08:45:00 3.75 12583.0 France 0
27 536370 22727 ALARM CLOCK BAKELIKE RED 24 2010-12-01 08:45:00 3.75 12583.0 France 0
28 536370 22726 ALARM CLOCK BAKELIKE GREEN 12 2010-12-01 08:45:00 3.75 12583.0 France 0
29 536370 21724 PANDA AND BUNNIES STICKER SHEET 12 2010-12-01 08:45:00 0.85 12583.0 France 0
... ... ... ... ... ... ... ... ... ...
541879 581585 22726 ALARM CLOCK BAKELIKE GREEN 8 2011-12-09 12:31:00 3.75 15804.0 United Kingdom 0
541880 581585 22727 ALARM CLOCK BAKELIKE RED 4 2011-12-09 12:31:00 3.75 15804.0 United Kingdom 0
541881 581585 16016 LARGE CHINESE STYLE SCISSOR 10 2011-12-09 12:31:00 0.85 15804.0 United Kingdom 0
541882 581585 21916 SET 12 RETRO WHITE CHALK STICKS 24 2011-12-09 12:31:00 0.42 15804.0 United Kingdom 0
541883 581585 84692 BOX OF 24 COCKTAIL PARASOLS 25 2011-12-09 12:31:00 0.42 15804.0 United Kingdom 0
541884 581585 84946 ANTIQUE SILVER T-LIGHT GLASS 12 2011-12-09 12:31:00 1.25 15804.0 United Kingdom 0
541885 581585 21684 SMALL MEDINA STAMPED METAL BOWL 12 2011-12-09 12:31:00 0.85 15804.0 United Kingdom 0
541886 581585 22398 MAGNETS PACK OF 4 SWALLOWS 12 2011-12-09 12:31:00 0.39 15804.0 United Kingdom 0
541887 581585 23328 SET 6 SCHOOL MILK BOTTLES IN CRATE 4 2011-12-09 12:31:00 3.75 15804.0 United Kingdom 0
541888 581585 23145 ZINC T-LIGHT HOLDER STAR LARGE 12 2011-12-09 12:31:00 0.95 15804.0 United Kingdom 0
541889 581585 22466 FAIRY TALE COTTAGE NIGHT LIGHT 12 2011-12-09 12:31:00 1.95 15804.0 United Kingdom 0
541890 581586 22061 LARGE CAKE STAND HANGING STRAWBERY 8 2011-12-09 12:49:00 2.95 13113.0 United Kingdom 0
541891 581586 23275 SET OF 3 HANGING OWLS OLLIE BEAK 24 2011-12-09 12:49:00 1.25 13113.0 United Kingdom 0
541892 581586 21217 RED RETROSPOT ROUND CAKE TINS 24 2011-12-09 12:49:00 8.95 13113.0 United Kingdom 0
541893 581586 20685 DOORMAT RED RETROSPOT 10 2011-12-09 12:49:00 7.08 13113.0 United Kingdom 0
541894 581587 22631 CIRCUS PARADE LUNCH BOX 12 2011-12-09 12:50:00 1.95 12680.0 France 0
541895 581587 22556 PLASTERS IN TIN CIRCUS PARADE 12 2011-12-09 12:50:00 1.65 12680.0 France 0
541896 581587 22555 PLASTERS IN TIN STRONGMAN 12 2011-12-09 12:50:00 1.65 12680.0 France 0
541897 581587 22728 ALARM CLOCK BAKELIKE PINK 4 2011-12-09 12:50:00 3.75 12680.0 France 0
541898 581587 22727 ALARM CLOCK BAKELIKE RED 4 2011-12-09 12:50:00 3.75 12680.0 France 0
541899 581587 22726 ALARM CLOCK BAKELIKE GREEN 4 2011-12-09 12:50:00 3.75 12680.0 France 0
541900 581587 22730 ALARM CLOCK BAKELIKE IVORY 4 2011-12-09 12:50:00 3.75 12680.0 France 0
541901 581587 22367 CHILDRENS APRON SPACEBOY DESIGN 8 2011-12-09 12:50:00 1.95 12680.0 France 0
541902 581587 22629 SPACEBOY LUNCH BOX 12 2011-12-09 12:50:00 1.95 12680.0 France 0
541903 581587 23256 CHILDRENS CUTLERY SPACEBOY 4 2011-12-09 12:50:00 4.15 12680.0 France 0
541904 581587 22613 PACK OF 20 SPACEBOY NAPKINS 12 2011-12-09 12:50:00 0.85 12680.0 France 0
541905 581587 22899 CHILDREN'S APRON DOLLY GIRL 6 2011-12-09 12:50:00 2.10 12680.0 France 0
541906 581587 23254 CHILDRENS CUTLERY DOLLY GIRL 4 2011-12-09 12:50:00 4.15 12680.0 France 0
541907 581587 23255 CHILDRENS CUTLERY CIRCUS PARADE 4 2011-12-09 12:50:00 4.15 12680.0 France 0
541908 581587 22138 BAKING SET 9 PIECE RETROSPOT 3 2011-12-09 12:50:00 4.95 12680.0 France 0

541909 rows × 9 columns


In [7]:
df['Cancelled'].value_counts()


Out[7]:
0    532621
1      9288
Name: Cancelled, dtype: int64

In [8]:
df['Quantity'] = df['Quantity'].abs()
df['Quantity'].describe()


Out[8]:
count    541909.000000
mean         11.340487
std         217.995482
min           1.000000
25%           1.000000
50%           3.000000
75%          10.000000
max       80995.000000
Name: Quantity, dtype: float64

In [9]:
df['UnitPrice'] = df['UnitPrice'].abs()
df['UnitPrice'].describe()


Out[9]:
count    541909.000000
mean          4.692766
std          96.755927
min           0.000000
25%           1.250000
50%           2.080000
75%           4.130000
max       38970.000000
Name: UnitPrice, dtype: float64

In [10]:
# Remove rows where CustomerID is null
#df = df[pd.notnull(df['CustomerID'])] 

df.dropna(subset=['CustomerID'], how='all', inplace=True)

In [11]:
df['CustomerID'] = df['CustomerID'].astype(int)

In [12]:
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()

In [13]:
X = df[['UnitPrice', 'Quantity', 'CustomerID']]
Y = df['Cancelled']

In [14]:
logistic.fit(X.values, list(Y.values))


Out[14]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
# predict one row
test = df[['UnitPrice', 'Quantity', 'CustomerID']].iloc[[2]]
logistic.predict(test)


Out[15]:
array([0])

In [26]:
# predict all rows
test = df[['UnitPrice', 'Quantity', 'CustomerID']]
prediction = logistic.predict(test)
pd.DataFrame(prediction).describe()


Out[26]:
0
count 406829.000000
mean 0.000101
std 0.010038
min 0.000000
25% 0.000000
50% 0.000000
75% 0.000000
max 1.000000

In [30]:
# predict all rows with probability
prediction = logistic.predict_proba(test)
p_df = pd.DataFrame(prediction)
p_df.head()


Out[30]:
0 1
0 0.989178 0.010822
1 0.989154 0.010846
2 0.989171 0.010829
3 0.989154 0.010846
4 0.989154 0.010846

In [31]:
from sklearn.externals import joblib
joblib.dump(logistic, 'logistic.pkl')


Out[31]:
['logistic.pkl']

In [35]:
testdf = pd.DataFrame(data=[[1,1,1]], columns=['UnitPrice', 'Quantity', 'CustomerID'])

In [36]:
prediction = logistic.predict(testdf)

In [37]:
prediction


Out[37]:
array([1])

In [ ]: