In [83]:
import pandas as pd
import numpy as np

# download the data here: https://www.kaggle.com/c/avazu-ctr-prediction/data

In [8]:
# Read only the first 9,977 rows of the raw Avazu 'train' file as a quick sample.
# NOTE(review): 'train' has no extension — presumably the unzipped Kaggle file; confirm path.
df = pd.read_csv('train', nrows=9977)

In [9]:
# Peek at the first rows to see the column layout (24 columns, mixed int/object).
df.head()


Out[9]:
id click hour C1 banner_pos site_id site_domain site_category app_id app_domain ... device_type device_conn_type C14 C15 C16 C17 C18 C19 C20 C21
0 1000009418151094273 0 14102100 1005 0 1fbe01fe f3845767 28905ebd ecad2386 7801e8d9 ... 1 2 15706 320 50 1722 0 35 -1 79
1 10000169349117863715 0 14102100 1005 0 1fbe01fe f3845767 28905ebd ecad2386 7801e8d9 ... 1 0 15704 320 50 1722 0 35 100084 79
2 10000371904215119486 0 14102100 1005 0 1fbe01fe f3845767 28905ebd ecad2386 7801e8d9 ... 1 0 15704 320 50 1722 0 35 100084 79
3 10000640724480838376 0 14102100 1005 0 1fbe01fe f3845767 28905ebd ecad2386 7801e8d9 ... 1 0 15706 320 50 1722 0 35 100084 79
4 10000679056417042096 0 14102100 1005 1 fe8cc448 9166c161 0569f928 ecad2386 7801e8d9 ... 1 0 18993 320 50 2161 0 35 -1 157

5 rows × 24 columns


In [10]:
# Inspect dtypes: the int64 columns are treated as numerical below,
# the object columns as categorical.
df.dtypes


Out[10]:
id                  uint64
click                int64
hour                 int64
C1                   int64
banner_pos           int64
site_id             object
site_domain         object
site_category       object
app_id              object
app_domain          object
app_category        object
device_id           object
device_ip           object
device_model        object
device_type          int64
device_conn_type     int64
C14                  int64
C15                  int64
C16                  int64
C17                  int64
C18                  int64
C19                  int64
C20                  int64
C21                  int64
dtype: object

In [11]:
# Count distinct values per column. df.nunique() produces the same Series as
# df.T.apply(lambda x: x.nunique(), axis=1) but without materializing the transpose.
df.nunique()


Out[11]:
id                  9977
click                  2
hour                   1
C1                     6
banner_pos             4
site_id              381
site_domain          317
site_category         14
app_id               313
app_domain            31
app_category          14
device_id           1075
device_ip           7285
device_model        1167
device_type            4
device_conn_type       4
C14                  271
C15                    4
C16                    5
C17                  111
C18                    4
C19                   32
C20                  108
C21                   29
dtype: int64

In [13]:
# Step 1 - Data Preprocessing, convert to libsvm format

## drop id, hour here
## 'id' is a unique row identifier (9977 distinct values above) and 'hour' has a
## single value in this sample (see nunique output), so neither carries signal here.
df = df.drop(['id', 'hour'], axis=1)

# Keep the label column as its own Series ('click' also remains inside df).
# NOTE(review): label_df is not used again in the visible cells — confirm it is needed.
label_df = df['click']

In [66]:
# Baseline - Just Keep Current Numerical and Categorical data types as they are

## Convert the data to libsvm format
## Put the Label at the very beginning of each row
## numerical data convert to: column_index:column_index:value
## categorical data convert to: column_idx:encoded_value_index:1
numerical_lst = df.select_dtypes(include=["int64"]).columns.tolist()[1:]
categorical_lst = df.select_dtypes(exclude=["int64"]).columns.tolist()
cat_encodes = {}
global cat_code
cat_code = len(numerical_lst)


def convert2libsvm_format(df_row):
    global cat_code  # put this at the top of my function, so that python interpreter will know it's a global variable
    result_str = ""
    result_str += str(df_row["click"])
    
    # numerical data
    for i in range(len(numerical_lst)):
        result_str += " " + str(i) + ":" + str(i) + ":" + str(df_row[numerical_lst[i]])
        
    # categorical data
    for j in range(len(categorical_lst)):
        cat_col = categorical_lst[j]
        cat_value = df_row[cat_col]
        cat_encodes.setdefault(cat_col, {})
        if cat_value not in cat_encodes[cat_col].keys():
            cat_code += 1  # only add 1 for a new categorical value in this categorical column
            cat_encodes[cat_col][cat_value] = cat_code
        result_str += " " + str(j) + ":" + str(cat_encodes[cat_col][cat_value]) + ":1"
    
    return result_str

str_df = df.apply(convert2libsvm_format, axis=1)  # with axis=1, you are applying the function to each row

In [67]:
# Sanity-check the first few converted lines: 'label field:feature:value ...'.
str_df.head()


Out[67]:
0    0 0:0:1005 1:1:0 2:2:1 3:3:2 4:4:15706 5:5:320...
1    0 0:0:1005 1:1:0 2:2:1 3:3:0 4:4:15704 5:5:320...
2    0 0:0:1005 1:1:0 2:2:1 3:3:0 4:4:15704 5:5:320...
3    0 0:0:1005 1:1:0 2:2:1 3:3:0 4:4:15706 5:5:320...
4    0 0:0:1005 1:1:1 2:2:1 3:3:0 4:4:18993 5:5:320...
dtype: object

In [93]:
# Split data into train and test, print them into files so that ffm can read

# NOTE(review): this is a chronological (unshuffled) positional split of rows
# in file order — confirm that is intended rather than a random split.
training_df, test_df = str_df[:9000], str_df[9000:]  # first 90% as training data, last 10% as testing data
train_df, validate_df = training_df[:7000], training_df[7000:]

def diy_df2txt(my_df, txt_out_path):
    """Write each row string of `my_df` to `txt_out_path`, one line per row."""
    with open(txt_out_path, 'w') as out_file:
        out_file.writelines(row + "\n" for row in my_df)
            
# Write the three splits as libffm-format text files for xlearn to consume.
diy_df2txt(train_df, 'ffm_train.txt')
diy_df2txt(validate_df, 'ffm_validate.txt')
diy_df2txt(test_df, 'ffm_test.txt')

In [106]:
import xlearn as xl

# Field-aware Factorization Machine; the train/validation files are the
# libffm-format text files written above.
ffm_model = xl.create_ffm()
ffm_model.setTrain("ffm_train.txt")
ffm_model.setValidate("ffm_validate.txt")

param = {'task':'binary', # 'binary' for classification, 'reg' for regression
         'k':2,           # Size of latent factor
         'lr':0.1,        # Learning rate for GD
         'lambda':0.0002, # L2 Regularization Parameter
         'metric':'auc',  # Metric for monitoring validation set performance
         'epoch':25       # Maximum number of Epochs
        }

ffm_model.fit(param, "model.out")  # trains and saves the fitted model to model.out

In [107]:
ffm_model.cv(param)  # cross validation (runs on the data registered via setTrain — see xlearn docs)

In [108]:
ffm_model.setTest("ffm_test.txt")
# NOTE(review): per the xlearn docs setSign() should convert raw scores to 0/1 class
# labels, yet the sample output pasted below shows raw negative scores — confirm
# whether setSign actually took effect in that run. (Original note: "negative value
# means negative example, positive value means positive example".)
ffm_model.setSign()
ffm_model.predict("model.out", "ffm_output.txt")  # writes one prediction per test row

In [ ]:
# Then in your terminal:

"""
cd to the folder where you generated your model output file
In this case, read the file by typing: head -n 5 ffm_output.txt

I got:
-1.55223
-1.55333
-1.6009
-1.6009
-1.56682

Well, all of them are negative, not very accurate
"""

In [ ]:
"""
Details about xLearn ffm: http://xlearn-doc.readthedocs.io/en/latest/python_api.html

The good part of this library is, it allows you to try multi-threading in a simple way
The bad part is, it outputs too many files during the validation and testing stages; you'd better run everything
through the terminal, otherwise you will miss a lot of information (IPython won't show it)
"""