In [83]:
import pandas as pd
import numpy as np
# download the data here: https://www.kaggle.com/c/avazu-ctr-prediction/data
In [8]:
df = pd.read_csv('train', nrows=9977)
In [9]:
df.head()
Out[9]:
In [10]:
df.dtypes
Out[10]:
In [11]:
df.T.apply(lambda x: x.nunique(), axis=1)
Out[11]:
In [13]:
# Step 1 - Data Preprocessing, convert to libsvm format
## drop id, hour here
df = df.drop(['id', 'hour'], axis=1)
label_df = df['click']
In [66]:
# Baseline - Just Keep Current Numerical and Categorical data types as they are
## Convert the data to libsvm format
## Put the Label at the very beginning of each row
## numerical data convert to: column_index:column_index:value
## categorical data convert to: column_idx:encoded_value_index:1
numerical_lst = df.select_dtypes(include=["int64"]).columns.tolist()[1:]
categorical_lst = df.select_dtypes(exclude=["int64"]).columns.tolist()
cat_encodes = {}
global cat_code
cat_code = len(numerical_lst)
def convert2libsvm_format(df_row):
global cat_code # put this at the top of my function, so that python interpreter will know it's a global variable
result_str = ""
result_str += str(df_row["click"])
# numerical data
for i in range(len(numerical_lst)):
result_str += " " + str(i) + ":" + str(i) + ":" + str(df_row[numerical_lst[i]])
# categorical data
for j in range(len(categorical_lst)):
cat_col = categorical_lst[j]
cat_value = df_row[cat_col]
cat_encodes.setdefault(cat_col, {})
if cat_value not in cat_encodes[cat_col].keys():
cat_code += 1 # only add 1 for a new categorical value in this categorical column
cat_encodes[cat_col][cat_value] = cat_code
result_str += " " + str(j) + ":" + str(cat_encodes[cat_col][cat_value]) + ":1"
return result_str
str_df = df.apply(convert2libsvm_format, axis=1) # with axis=1, you are applying the function to each row
In [67]:
# Peek at the first few converted rows to sanity-check the libffm strings.
str_df.head()
Out[67]:
In [93]:
# Split data into train and test, print them into files so that ffm can read
training_df, test_df = str_df[:9000], str_df[9000:] # first 90% as training data, last 10% as testing data
train_df, validate_df = training_df[:7000], training_df[7000:]
def diy_df2txt(my_df, txt_out_path):
    """Write each element of my_df to txt_out_path, one element per line."""
    with open(txt_out_path, 'w') as out_file:
        out_file.writelines(row + "\n" for row in my_df)
# Write the three splits to disk in libffm text format for xLearn to consume.
diy_df2txt(train_df, 'ffm_train.txt')
diy_df2txt(validate_df, 'ffm_validate.txt')
diy_df2txt(test_df, 'ffm_test.txt')
In [106]:
# Train a field-aware factorization machine (FFM) with xLearn on the files written above.
import xlearn as xl
ffm_model = xl.create_ffm()
ffm_model.setTrain("ffm_train.txt")        # training set path
ffm_model.setValidate("ffm_validate.txt")  # validation set path, used for the 'metric' below
param = {'task':'binary', # 'binary' for classification, 'reg' for Regression
'k':2, # Size of latent factor
'lr':0.1, # Learning rate for GD
'lambda':0.0002, # L2 Regularization Parameter
'metric':'auc', # Metric for monitoring validation set performance
'epoch':25 # Maximum number of Epochs
}
# Fit and save the learned model to "model.out" (read back by predict() below).
ffm_model.fit(param, "model.out")
In [107]:
# Cross-validate with the same hyper-parameters (uses the setTrain file).
ffm_model.cv(param) # cross validation
In [108]:
ffm_model.setTest("ffm_test.txt")
# NOTE(review): xLearn docs describe setSign() as binarizing predictions to 0/1;
# the raw negative scores quoted in the notes below suggest it may not have taken
# effect for that run — confirm against the xLearn Python API docs.
ffm_model.setSign() # negative value means negative example, positive value means positive example
# Score the test file with the saved model; predictions go to ffm_output.txt.
ffm_model.predict("model.out", "ffm_output.txt")
In [ ]:
# Then in your terminal:
"""
cd to the folder where you generated your model output file
In this case, read the file by typing: head -n 5 ffm_output.txt
I got:
-1.55223
-1.55333
-1.6009
-1.6009
-1.56682
Well, all of them are negative, not very accurate
"""
In [ ]:
"""
Details about xLearn ffm: http://xlearn-doc.readthedocs.io/en/latest/python_api.html
The good part of this library is, it allows you to try multi-threading in a simple way
The bad part is, it outputs too many files during the validation and testing stages; you'd better run everything
through the terminal, otherwise you will miss too much information (IPython won't show it)
"""