This notebook contains a very early prototype for a LogisticRegression machine learning model.
There is much more that needs to be done to flesh out this solution. We are predicting the probability of an order item being cancelled, but it may make more sense to predict the probability of the overall order being cancelled.
In [1]:
!pip install sklearn
!pip install pandas
In [2]:
from sklearn import cluster
import pandas as pd
In [3]:
# Path to the Online Retail Excel workbook, relative to this notebook's working directory.
ONLINE_RETAIL_XLSX = '../data/OnlineRetail.xlsx'
In [4]:
# Load the 'Online Retail' worksheet into a DataFrame.
# `sheet_name` is the supported keyword; the older `sheetname` spelling was
# deprecated and then removed from pandas, where it raises a TypeError.
df = pd.read_excel(ONLINE_RETAIL_XLSX, sheet_name='Online Retail')
In [5]:
# Show the column labels of the loaded frame.
df.columns
Out[5]:
In [6]:
# An invoice number that starts with the letter 'C' indicates a cancellation.
# Cast to str first: the .str accessor returns NaN for non-string cells, so
# casting guarantees every row gets a clean integer flag (1 = cancelled,
# 0 = not cancelled) instead of a mix of True/False/NaN objects.
df['Cancelled'] = df['InvoiceNo'].astype(str).str.startswith('C').astype(int)
# Preview a few rows rather than rendering the whole frame.
df.head()
Out[6]:
In [7]:
# Count how many rows carry each value of the cancellation flag.
df['Cancelled'].value_counts()
Out[7]:
In [8]:
# Keep only the magnitude of each quantity, then summarise the column.
quantity_abs = df['Quantity'].abs()
df['Quantity'] = quantity_abs
quantity_abs.describe()
Out[8]:
In [9]:
# Keep only the magnitude of each unit price, then summarise the column.
unit_price_abs = df['UnitPrice'].abs()
df['UnitPrice'] = unit_price_abs
unit_price_abs.describe()
Out[9]:
In [10]:
# Remove rows where CustomerID is null. The drop is performed in place, so
# `df` itself is modified and no new frame is bound.
# Alternative boolean-mask form, kept for reference:
#df = df[pd.notnull(df['CustomerID'])]
df.dropna(subset=['CustomerID'], how='all', inplace=True)
In [11]:
# Convert CustomerID to integers (rows with a missing CustomerID are dropped beforehand).
df['CustomerID'] = df['CustomerID'].astype(int)
In [12]:
# Create an untrained logistic-regression classifier using scikit-learn's default settings.
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()
In [13]:
# Model inputs (price, quantity, customer id) and the target (cancellation flag).
# NOTE(review): CustomerID is an identifier used as a numeric feature — confirm this is intended.
feature_columns = ['UnitPrice', 'Quantity', 'CustomerID']
X, Y = df[feature_columns], df['Cancelled']
In [14]:
# Fit on the DataFrame itself rather than a bare ndarray so the estimator
# records the feature names. The predict cells pass DataFrames, and recent
# scikit-learn releases warn when feature names are present at predict time
# but were absent at fit time.
# astype(int) normalises the labels to plain integers (the column may hold
# 1/0/False objects), yielding the same values list(Y.values) would.
logistic.fit(X, Y.astype(int))
Out[14]:
In [15]:
# Predict the label for a single row (positional index 2).
# The list inside iloc keeps the selection two-dimensional (a one-row frame).
feature_names = ['UnitPrice', 'Quantity', 'CustomerID']
test = df.iloc[[2]][feature_names]
logistic.predict(test)
Out[15]:
In [26]:
# Predict the label for every row and summarise the predicted values.
test = df.loc[:, ['UnitPrice', 'Quantity', 'CustomerID']]
prediction = logistic.predict(test)
prediction_summary = pd.DataFrame(prediction).describe()
prediction_summary
Out[26]:
In [30]:
# Per-class probabilities for every row of `test`; show the first five.
prediction = logistic.predict_proba(test)
p_df = pd.DataFrame(data=prediction)
p_df.head(n=5)
Out[30]:
In [31]:
# Persist the trained model to 'logistic.pkl' in the working directory.
# sklearn.externals.joblib was removed from newer scikit-learn releases, so
# prefer the standalone joblib package and fall back to the vendored copy
# only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

joblib.dump(logistic, 'logistic.pkl')
Out[31]:
In [35]:
# One hand-made sample row with the same feature columns the model uses.
sample_columns = ['UnitPrice', 'Quantity', 'CustomerID']
testdf = pd.DataFrame([[1, 1, 1]], columns=sample_columns)
In [36]:
# Predict the label for the hand-made sample row.
prediction = logistic.predict(testdf)
In [37]:
# Display the predicted label for the sample row.
prediction
Out[37]:
In [ ]: