In [83]:
import pandas as pd
import numpy as np

# download the data here: https://www.kaggle.com/c/avazu-ctr-prediction/data

In [8]:
# Read only the first 9,977 rows of the raw Avazu 'train' file as a quick sample.
# NOTE(review): 'train' has no extension — presumably the unzipped Kaggle file; confirm path.
df = pd.read_csv('train', nrows=9977)

In [9]:
# Peek at the first rows to see the column layout (24 columns, mixed int/object).
df.head()


Out[9]:
id click hour C1 banner_pos site_id site_domain site_category app_id app_domain ... device_type device_conn_type C14 C15 C16 C17 C18 C19 C20 C21
0 1000009418151094273 0 14102100 1005 0 1fbe01fe f3845767 28905ebd ecad2386 7801e8d9 ... 1 2 15706 320 50 1722 0 35 -1 79
1 10000169349117863715 0 14102100 1005 0 1fbe01fe f3845767 28905ebd ecad2386 7801e8d9 ... 1 0 15704 320 50 1722 0 35 100084 79
2 10000371904215119486 0 14102100 1005 0 1fbe01fe f3845767 28905ebd ecad2386 7801e8d9 ... 1 0 15704 320 50 1722 0 35 100084 79
3 10000640724480838376 0 14102100 1005 0 1fbe01fe f3845767 28905ebd ecad2386 7801e8d9 ... 1 0 15706 320 50 1722 0 35 100084 79
4 10000679056417042096 0 14102100 1005 1 fe8cc448 9166c161 0569f928 ecad2386 7801e8d9 ... 1 0 18993 320 50 2161 0 35 -1 157

5 rows × 24 columns


In [10]:
# Inspect dtypes: the int64 columns are treated as numerical below,
# the object columns as categorical.
df.dtypes


Out[10]:
id                  uint64
click                int64
hour                 int64
C1                   int64
banner_pos           int64
site_id             object
site_domain         object
site_category       object
app_id              object
app_domain          object
app_category        object
device_id           object
device_ip           object
device_model        object
device_type          int64
device_conn_type     int64
C14                  int64
C15                  int64
C16                  int64
C17                  int64
C18                  int64
C19                  int64
C20                  int64
C21                  int64
dtype: object

In [11]:
# Count distinct values per column. df.nunique() produces the same Series as
# df.T.apply(lambda x: x.nunique(), axis=1) but without materializing the transpose.
df.nunique()


Out[11]:
id                  9977
click                  2
hour                   1
C1                     6
banner_pos             4
site_id              381
site_domain          317
site_category         14
app_id               313
app_domain            31
app_category          14
device_id           1075
device_ip           7285
device_model        1167
device_type            4
device_conn_type       4
C14                  271
C15                    4
C16                    5
C17                  111
C18                    4
C19                   32
C20                  108
C21                   29
dtype: int64

In [13]:
# Step 1 - Data Preprocessing, convert to libsvm format

## drop id, hour here
## 'id' is a unique row identifier (9977 distinct values above) and 'hour' has a
## single value in this sample (see nunique output), so neither carries signal here.
df = df.drop(['id', 'hour'], axis=1)

# Keep the label column as its own Series ('click' also remains inside df).
# NOTE(review): label_df is not used again in the visible cells — confirm it is needed.
label_df = df['click']

In [66]:
# Baseline - Just Keep Current Numerical and Categorical data types as they are

## Convert the data to libsvm format
## Put the Label at the very beginning of each row
## numerical data convert to: column_index:column_index:value
## categorical data convert to: column_idx:encoded_value_index:1
numerical_lst = df.select_dtypes(include=["int64"]).columns.tolist()[1:]
categorical_lst = df.select_dtypes(exclude=["int64"]).columns.tolist()
cat_encodes = {}
global cat_code
cat_code = len(numerical_lst)


def convert2libsvm_format(df_row):
    global cat_code  # put this at the top of my function, so that python interpreter will know it's a global variable
    result_str = ""
    result_str += str(df_row["click"])
    
    # numerical data
    for i in range(len(numerical_lst)):
        result_str += " " + str(i) + ":" + str(i) + ":" + str(df_row[numerical_lst[i]])
        
    # categorical data
    for j in range(len(categorical_lst)):
        cat_col = categorical_lst[j]
        cat_value = df_row[cat_col]
        cat_encodes.setdefault(cat_col, {})
        if cat_value not in cat_encodes[cat_col].keys():
            cat_code += 1  # only add 1 for a new categorical value in this categorical column
            cat_encodes[cat_col][cat_value] = cat_code
        result_str += " " + str(j) + ":" + str(cat_encodes[cat_col][cat_value]) + ":1"
    
    return result_str

str_df = df.apply(convert2libsvm_format, axis=1)  # with axis=1, you are applying the function to each row

In [67]:
# Sanity-check the first few converted lines: 'label field:feature:value ...'.
str_df.head()


Out[67]:
0    0 0:0:1005 1:1:0 2:2:1 3:3:2 4:4:15706 5:5:320...
1    0 0:0:1005 1:1:0 2:2:1 3:3:0 4:4:15704 5:5:320...
2    0 0:0:1005 1:1:0 2:2:1 3:3:0 4:4:15704 5:5:320...
3    0 0:0:1005 1:1:0 2:2:1 3:3:0 4:4:15706 5:5:320...
4    0 0:0:1005 1:1:1 2:2:1 3:3:0 4:4:18993 5:5:320...
dtype: object

In [93]:
# Split data into train and test, print them into files so that ffm can read

# NOTE(review): this is a chronological (unshuffled) positional split of rows
# in file order — confirm that is intended rather than a random split.
training_df, test_df = str_df[:9000], str_df[9000:]  # first 90% as training data, last 10% as testing data
train_df, validate_df = training_df[:7000], training_df[7000:]

def diy_df2txt(my_df, txt_out_path):
    """Write each row string of `my_df` to `txt_out_path`, one line per row."""
    with open(txt_out_path, 'w') as out_file:
        out_file.writelines(row + "\n" for row in my_df)
            
# Write the three splits as libffm-format text files for xlearn to consume.
diy_df2txt(train_df, 'ffm_train.txt')
diy_df2txt(validate_df, 'ffm_validate.txt')
diy_df2txt(test_df, 'ffm_test.txt')

In [106]:
import xlearn as xl

# Field-aware Factorization Machine; the train/validation files are the
# libffm-format text files written above.
ffm_model = xl.create_ffm()
ffm_model.setTrain("ffm_train.txt")
ffm_model.setValidate("ffm_validate.txt")

param = {'task':'binary', # 'binary' for classification, 'reg' for regression
         'k':2,           # Size of latent factor
         'lr':0.1,        # Learning rate for GD
         'lambda':0.0002, # L2 Regularization Parameter
         'metric':'auc',  # Metric for monitoring validation set performance
         'epoch':25       # Maximum number of Epochs
        }

ffm_model.fit(param, "model.out")  # trains and saves the fitted model to model.out

In [107]:
ffm_model.cv(param)  # cross validation (runs on the data registered via setTrain — see xlearn docs)

In [108]:
ffm_model.setTest("ffm_test.txt")
# NOTE(review): per the xlearn docs setSign() should convert raw scores to 0/1 class
# labels, yet the sample output pasted below shows raw negative scores — confirm
# whether setSign actually took effect in that run. (Original note: "negative value
# means negative example, positive value means positive example".)
ffm_model.setSign()
ffm_model.predict("model.out", "ffm_output.txt")  # writes one prediction per test row

In [ ]:
# Then in your terminal:

"""
cd to the folder where you generated your model output file
In this case, read the file by typing: head -n 5 ffm_output.txt

I got:
-1.55223
-1.55333
-1.6009
-1.6009
-1.56682

Well, all of them are negative, not very accurate
"""

In [ ]:
"""
Details about xLearn ffm: http://xlearn-doc.readthedocs.io/en/latest/python_api.html

The good part of this library is, it allows you to try multi-threading in a simple way
The bad part is, it outputs too many files during the validation and testing stages; you'd better run everything
through the terminal, otherwise you will miss a lot of information (IPython won't show it)
"""