In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [2]:
# Load the customer dataset from the file "Ecommerce Customers"
# (the path is relative to the notebook's working directory).
ecomCustomers = pd.read_csv("Ecommerce Customers")
In [3]:
# Preview the first rows of the loaded frame.
ecomCustomers.head()
Out[3]:
In [4]:
# Descriptive statistics for the frame's columns.
ecomCustomers.describe()
Out[4]:
In [5]:
# Column names, dtypes, and non-null counts.
ecomCustomers.info()
In [6]:
# Joint distribution of time on the website vs. yearly amount spent.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.jointplot(x='Time on Website', y='Yearly Amount Spent', data=ecomCustomers)
Out[6]:
In [7]:
# Joint distribution of time on the app vs. yearly amount spent.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.jointplot(x='Time on App', y='Yearly Amount Spent', data=ecomCustomers)
Out[7]:
In [8]:
# Hex-binned joint distribution of time on the app vs. length of membership.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.jointplot(x='Time on App', y='Length of Membership', data=ecomCustomers, kind='hex')
Out[8]:
In [9]:
# Pairwise relationships between the columns of the frame.
sns.pairplot(data=ecomCustomers)
Out[9]:
Time on App and Length of Membership both appear to be positively correlated with Yearly Amount Spent.
All of the variables look approximately normally distributed.
In [10]:
# Scatter plot with a fitted regression line: length of membership vs. yearly amount spent.
sns.lmplot(
    data=ecomCustomers,
    x="Length of Membership",
    y="Yearly Amount Spent",
)
Out[10]:
In [11]:
# Keep only the float64 columns of the customer frame.
X = ecomCustomers.select_dtypes(include="float64")
In [12]:
# Feature matrix: every float64 column except the regression target.
# Built straight from the source frame (rather than from X itself) so that
# re-running this cell does not raise KeyError on an already-dropped column.
X = ecomCustomers.select_dtypes(include=['float64']).drop(columns='Yearly Amount Spent')
In [13]:
# Regression target: the "Yearly Amount Spent" column.
y = ecomCustomers["Yearly Amount Spent"]
In [14]:
from sklearn.model_selection import train_test_split
In [15]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)
In [16]:
def printShapes(df_list):
    """Print the ``shape`` attribute of each object in ``df_list``, one per line.

    Parameters
    ----------
    df_list : iterable
        Objects exposing a ``.shape`` attribute (e.g. DataFrames, Series, arrays).
    """
    for df in df_list:
        print(df.shape)
In [17]:
# Sanity check: print the shape of each piece of the train/test split.
df_list = [X_train, y_train, X_test, y_test]
printShapes(df_list)
In [18]:
from sklearn.linear_model import LinearRegression
In [19]:
# Instantiate the linear regression estimator with its default settings.
lm = LinearRegression()
In [20]:
# Fit the model on the training split.
lm.fit(X=X_train, y=y_train)
Out[20]:
In [21]:
# Show the fitted coefficients.
print(lm.coef_)
In [22]:
# Predict the target for the held-out test features.
predictions = lm.predict(X=X_test)
In [23]:
# Predicted vs. actual values on the test split; points close to the diagonal
# are accurate predictions. Axes are labelled so the figure stands on its own.
plt.scatter(predictions, y_test)
plt.xlabel("Predicted Yearly Amount Spent")
plt.ylabel("Actual Yearly Amount Spent")
Out[23]:
In [24]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
In [25]:
# Mean absolute error of the test-set predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print("MAE = ", mae)
In [26]:
# Mean squared error of the test-set predictions.
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print("MSE = ", mse)
In [27]:
# RMSE is the square root of the MSE.
rmse = np.sqrt(mse)
print("RMSE = ", rmse)
In [28]:
# Distribution of the residuals (actual minus predicted) on the test split.
# sns.distplot is deprecated (seaborn >= 0.11); histplot with a KDE overlay and
# density normalisation is its documented replacement.
sns.histplot(y_test - predictions, bins=50, kde=True, stat="density")
Out[28]:
In [29]:
# One row per feature column of X, holding that feature's fitted coefficient.
coeff_df = pd.DataFrame(
    data=lm.coef_,
    index=X.columns,
    columns=["Coefficient"],
)
coeff_df
Out[29]:
In [30]:
# Ratio of the two fitted coefficients: "Time on App" over "Time on Website".
# Uses a single .loc[row, column] lookup per value instead of chained indexing.
appOverWebsite = (
    coeff_df.loc["Time on App", "Coefficient"]
    / coeff_df.loc["Time on Website", "Coefficient"]
)
print("Time on App affects Yearly Amount Spent {0:.2f}x more than Time on Website".format(appOverWebsite))