In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import preprocessing
from math import sqrt
In [2]:
# Read data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
In [3]:
# Categorical columns that I chose
categorical_columns = ["Product_ID", "Gender", "Age", "Occupation", "City_Category", "Stay_In_Current_City_Years",
"Marital_Status", "Product_Category_1", "Product_Category_2", "Product_Category_3"]
In [4]:
# label
train_y = np.array(train["Purchase"])
In [5]:
train_X = train.copy()
test_X = test.copy()
In [6]:
# Fill missing values (mainly in Product_Category_2/3) with 0; after label
# encoding, 0 simply becomes its own category.
train_X = train_X.fillna(0)
test_X = test_X.fillna(0)
In [7]:
# Feature: average purchase amount per Product_ID.
# I tried a lot of other options here, e.g.:
# 1. Average purchase price by gender, age group, product category 1, product category 2, product category 3
product_id_res = train_X.groupby(["Product_ID"])["Purchase"].mean()
avg_cost = train_X["Purchase"].mean()
# For a product id with no average price in train, fall back to the global average.
product_id_res_map = {}
In [8]:
# Build a map from product id to its average price
for key, value in product_id_res.items():
    product_id_res_map[str(key)] = value
In [9]:
# Category-level average maps from earlier experiments; left empty here so the
# lookups below are safe (the sketch after this cell shows one way to fill them).
product_category_1_res = {}
product_category_2_res = {}
product_category_3_res = {}

def get_purchase_mean(product_id, product_category=None, key=None):
    key_pair = str(product_id)
    key_pair_pid = str(product_id) + str(product_category)
    if key == "1":
        if key_pair_pid in product_category_1_res:
            return product_category_1_res[key_pair_pid]
    elif key == "2":
        if key_pair_pid in product_category_2_res:
            return product_category_2_res[key_pair_pid]
    elif key == "3":
        if key_pair_pid in product_category_3_res:
            return product_category_3_res[key_pair_pid]
    # Fall back to the per-product average, then the global average.
    if key_pair in product_id_res_map:
        return product_id_res_map[key_pair]
    return avg_cost
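In [ ]:
# Hedged sketch, not part of the original run: one way the category-level
# average maps referenced above could be built, keyed the way
# get_purchase_mean() expects (str(product_id) + str(category)). Shown for
# Product_Category_1 only; the other two maps would be built the same way.
cat1_means = train_X.groupby(["Product_ID", "Product_Category_1"])["Purchase"].mean()
for (p_id, cat), value in cat1_means.items():
    product_category_1_res[str(p_id) + str(cat)] = value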
In [10]:
# Feature: map each product_id to the average price of that product
train_X["purchase_avg_by_p_id"] = train_X["Product_ID"].map(get_purchase_mean)
test_X["purchase_avg_by_p_id"] = test_X["Product_ID"].map(get_purchase_mean)
In [11]:
# Another feature: a User_ID to purchase-power category.
# I computed the distribution of each user's total purchase sum and
# bucketed it into 10 hard-coded ranges (the cut points in the next cell).
user_id_to_category_map = {}
customer_purchase_power = train_X.groupby("User_ID")["Purchase"].sum()
values = customer_purchase_power.items()
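In [ ]:
# Hedged sketch, not part of the original run: the cut points hard-coded in
# the next cell look like deciles of the per-user purchase sums; they could
# be recomputed directly like this.
print(customer_purchase_power.quantile(np.arange(0.1, 1.0, 0.1)))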
In [12]:
for key, val in values:
    if val <= 146570.0:
        user_id_to_category_map[key] = 1
    elif val <= 205272.0:
        user_id_to_category_map[key] = 2
    elif val <= 279288.0:
        user_id_to_category_map[key] = 3
    elif val <= 383455.0:
        user_id_to_category_map[key] = 4
    elif val <= 521213.0:
        user_id_to_category_map[key] = 5
    elif val <= 698842.0:
        user_id_to_category_map[key] = 6
    elif val <= 942900.0:
        user_id_to_category_map[key] = 7
    elif val <= 1355245.0:
        user_id_to_category_map[key] = 8
    elif val <= 2069404.0:
        user_id_to_category_map[key] = 9
    else:
        user_id_to_category_map[key] = 10
In [13]:
def get_customer_category(user_id):
    # Users unseen in train fall back to the middle bucket.
    if user_id in user_id_to_category_map:
        return user_id_to_category_map[user_id]
    return 5
In [14]:
# Tag each user with a purchase-power category id
train_X["user_category"] = train_X["User_ID"].map(get_customer_category)
test_X["user_category"] = test_X["User_ID"].map(get_customer_category)
In [15]:
# Encode categorical variables with label encoding; each encoder is fit on the
# combined train+test values so both splits share one consistent mapping.
for var in categorical_columns:
    lb = preprocessing.LabelEncoder()
    full_var_data = pd.concat((train_X[var], test_X[var]), axis=0).astype('str')
    lb.fit(full_var_data)
    train_X[var] = lb.transform(train_X[var].astype('str'))
    test_X[var] = lb.transform(test_X[var].astype('str'))
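In [ ]:
# Hedged sketch, not part of the original run: a minimal illustration of why
# the encoder is fit on train+test together. transform() raises on labels it
# never saw during fit, which would happen for products only present in test.
le = preprocessing.LabelEncoder()
le.fit(["a", "b"])
try:
    le.transform(["c"])  # "c" was never seen during fit
except ValueError as e:
    print(e)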
In [16]:
train_X = train_X.drop(['Purchase'], axis=1)
train_X = np.array(train_X)
In [17]:
# I built 3 models to make predictions and submitted the average of the three.
print("1st model")
# 1st model
params = {}
params["objective"] = "reg:linear"
params["eta"] = 0.1
params["min_child_weight"] = 10
params["subsample"] = 0.7
params["colsample_bytree"] = 0.7
params["scale_pos_weight"] = 0.8
params["max_depth"] = 8
# Note: early_stopping_rounds has no effect inside params; xgb.train() only
# applies it when passed as an argument together with an eval set (sketch below).
params["early_stopping_rounds"] = 10
params["seed"] = 42
plst = list(params.items())
In [18]:
xgtrain = xgb.DMatrix(train_X, label=train_y)
# Build the test DMatrix from a plain numpy array, matching train_X, so the
# two matrices carry the same (absent) feature names.
xgtest = xgb.DMatrix(np.array(test_X))
num_rounds = 1420
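In [ ]:
# Hedged sketch, not part of the original run: to actually use early stopping,
# xgb.train() needs an eval set and the early_stopping_rounds argument. The
# 90/10 holdout split here is an assumption for illustration.
split = int(len(train_X) * 0.9)
dtrain = xgb.DMatrix(train_X[:split], label=train_y[:split])
dvalid = xgb.DMatrix(train_X[split:], label=train_y[split:])
model_es = xgb.train(plst, dtrain, num_rounds,
                     evals=[(dvalid, "valid")], early_stopping_rounds=10)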
In [19]:
model = xgb.train(plst, xgtrain, num_rounds)
In [20]:
pred_test_y_xgb1 = model.predict(xgtest)
In [21]:
print "2nd model"
# 2nd model
# NOTE: I have changed the paramertes since i last uploaded the results. so the final score might vary.
params = {}
params["objective"] = "reg:linear"
params["eta"] = 0.1
params["min_child_weight"] = 10
params["subsample"] = 0.7
params["colsample_bytree"] = 0.7
params["scale_pos_weight"] = 0.8
params["max_depth"] = 8
params["early_stopping_rounds"] = 10
params["seed"] = 333
plst = list(params.items())
In [22]:
# Shuffle the train matrix while keeping features and labels aligned:
# merge X and y into one array, shuffle its rows, then split it back apart.
merged_train_x_and_y = np.c_[train_X.reshape(len(train_X), -1), train_y.reshape(len(train_y), -1)]
np.random.shuffle(merged_train_x_and_y)
shuffled_train_x = merged_train_x_and_y[:, :train_X.size // len(train_X)].reshape(train_X.shape)
shuffled_train_y = merged_train_x_and_y[:, train_X.size // len(train_X):].reshape(train_y.shape)
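In [ ]:
# Hedged sketch, not part of the original run: an equivalent and simpler way
# to shuffle features and labels in lockstep is to index both arrays with a
# single permutation of the row indices.
idx = np.random.permutation(len(train_X))
shuffled_train_x = train_X[idx]
shuffled_train_y = train_y[idx]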
In [23]:
# Shuffled train matrix is now shuffled_train_x
xgtrain = xgb.DMatrix(shuffled_train_x, label=shuffled_train_y)
In [24]:
model = xgb.train(plst, xgtrain, num_rounds)
In [25]:
pred_test_y_xgb2 = model.predict(xgtest)
In [26]:
print "3rd model"
# 3rd model
# NOTE: I have changed the paramertes since i last uploaded the results. so the final score might vary.
params = {}
params["objective"] = "reg:linear"
params["eta"] = 0.1
params["min_child_weight"] = 10
params["subsample"] = 0.7
params["colsample_bytree"] = 0.7
params["scale_pos_weight"] = 0.8
params["max_depth"] = 8
params["early_stopping_rounds"] = 10
params["seed"] = 777
plst = list(params.items())
In [27]:
# Shuffle the train matrix again (same merge/shuffle/split as above).
merged_train_x_and_y = np.c_[train_X.reshape(len(train_X), -1), train_y.reshape(len(train_y), -1)]
np.random.shuffle(merged_train_x_and_y)
shuffled_train_x = merged_train_x_and_y[:, :train_X.size // len(train_X)].reshape(train_X.shape)
shuffled_train_y = merged_train_x_and_y[:, train_X.size // len(train_X):].reshape(train_y.shape)
In [28]:
xgtrain = xgb.DMatrix(shuffled_train_x, label=shuffled_train_y)
In [29]:
model = xgb.train(plst, xgtrain, num_rounds)
In [30]:
pred_test_y_xgb3 = model.predict(xgtest)
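In [ ]:
# Hedged sketch, not part of the original run: the three near-identical model
# blocks above could be collapsed into one loop over seeds, reshuffling and
# retraining each time.
preds = []
for seed in (42, 333, 777):
    np.random.seed(seed)
    idx = np.random.permutation(len(train_X))
    params["seed"] = seed
    booster = xgb.train(list(params.items()),
                        xgb.DMatrix(train_X[idx], label=train_y[idx]),
                        num_rounds)
    preds.append(booster.predict(xgtest))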
In [31]:
# Final prediction: average of the three models.
test['Purchase'] = (pred_test_y_xgb1 + pred_test_y_xgb2 + pred_test_y_xgb3) / 3
test.to_csv('final_xgb.csv', columns=['User_ID', 'Product_ID', 'Purchase'], index=False)
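In [ ]:
# Hedged sketch, not part of the original run: "reg:linear" can produce
# negative predictions while Purchase is non-negative, so clipping the
# averaged predictions at zero is a cheap safeguard before submitting.
# The output filename here is hypothetical.
test['Purchase'] = np.clip(test['Purchase'], 0, None)
test.to_csv('final_xgb_clipped.csv', columns=['User_ID', 'Product_ID', 'Purchase'], index=False)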