Factorization machines workshop

Dataset

https://movielens.org - a non-commercial movie recommendation service

https://grouplens.org/datasets/movielens/ - the datasets from that site.

100K is the small dataset, 20M is the large one.
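The notebook assumes both archives were already downloaded to /tmp, e.g. with something like the commands below (the URLs are assumed to be the current GroupLens download locations):

!wget https://files.grouplens.org/datasets/movielens/ml-100k.zip -P /tmp
!wget https://files.grouplens.org/datasets/movielens/ml-20m.zip -P /tmp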


In [1]:
!unzip /tmp/ml-20m.zip -d .


Archive:  /tmp/ml-20m.zip
   creating: ./ml-20m/
  inflating: ./ml-20m/genome-scores.csv  
  inflating: ./ml-20m/genome-tags.csv  
  inflating: ./ml-20m/links.csv      
  inflating: ./ml-20m/movies.csv     
  inflating: ./ml-20m/ratings.csv    
  inflating: ./ml-20m/README.txt     
  inflating: ./ml-20m/tags.csv       

In [3]:
!unzip /tmp/ml-100k.zip -d .


Archive:  /tmp/ml-100k.zip
   creating: ./ml-100k/
  inflating: ./ml-100k/allbut.pl     
  inflating: ./ml-100k/mku.sh        
  inflating: ./ml-100k/README        
  inflating: ./ml-100k/u.data        
  inflating: ./ml-100k/u.genre       
  inflating: ./ml-100k/u.info        
  inflating: ./ml-100k/u.item        
  inflating: ./ml-100k/u.occupation  
  inflating: ./ml-100k/u.user        
  inflating: ./ml-100k/u1.base       
  inflating: ./ml-100k/u1.test       
  inflating: ./ml-100k/u2.base       
  inflating: ./ml-100k/u2.test       
  inflating: ./ml-100k/u3.base       
  inflating: ./ml-100k/u3.test       
  inflating: ./ml-100k/u4.base       
  inflating: ./ml-100k/u4.test       
  inflating: ./ml-100k/u5.base       
  inflating: ./ml-100k/u5.test       
  inflating: ./ml-100k/ua.base       
  inflating: ./ml-100k/ua.test       
  inflating: ./ml-100k/ub.base       
  inflating: ./ml-100k/ub.test       

In [4]:
!ls ml-100k


README	   u.data   u.item	  u1.base  u2.test  u4.base  u5.test  ub.base
allbut.pl  u.genre  u.occupation  u1.test  u3.base  u4.test  ua.base  ub.test
mku.sh	   u.info   u.user	  u2.base  u3.test  u5.base  ua.test

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [6]:
user = pd.read_csv('ml-100k/u.user', header=None, names=['info'])
user['user_id'] = user['info'].apply(lambda rec: rec.split('|')[0])
user['age'] = user['info'].apply(lambda rec: rec.split('|')[1])
user['gender'] = user['info'].apply(lambda rec: rec.split('|')[2])
user['occupation'] = user['info'].apply(lambda rec: rec.split('|')[3])
user.head()


Out[6]:
info user_id age gender occupation
0 1|24|M|technician|85711 1 24 M technician
1 2|53|F|other|94043 2 53 F other
2 3|23|M|writer|32067 3 23 M writer
3 4|24|M|technician|43537 4 24 M technician
4 5|33|F|other|15213 5 33 F other
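The same table can be built more directly by letting pandas split on the pipe separator; a minimal sketch (without the intermediate info column), assuming the documented u.user layout user id | age | gender | occupation | zip code:

user = pd.read_csv('ml-100k/u.user', sep='|', header=None,
                   names=['user_id', 'age', 'gender', 'occupation', 'zip'],
                   dtype=str)  # keep ids as strings so the later merges on user_id line up
user.head()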

In [7]:
movie = pd.read_csv('ml-100k/u.item', header=None, names=['info'])
movie['movie_id'] = movie['info'].apply(lambda rec: rec.split('|')[0])
movie['title'] = movie['info'].apply(lambda rec: rec.split('|')[1])
# genre flags occupy fields 6..23 of the pipe-separated u.item record
genres = ['Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
          'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
          'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
for pos, genre in enumerate(genres, start=6):
    movie[genre] = movie['info'].apply(
        lambda rec, pos=pos: rec.split('|')[pos] if len(rec.split('|')) == 24 else 0)
movie.head()


Out[7]:
info movie_id title Action Adventure Animation Childrens Comedy Crime Documentary ... Fantasy Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War Western
0 1|Toy Story (1995)|01-Jan-1995||http://us.imdb... 1 Toy Story (1995) 0 0 1 1 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 2|GoldenEye (1995)|01-Jan-1995||http://us.imdb... 2 GoldenEye (1995) 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
2 3|Four Rooms (1995)|01-Jan-1995||http://us.imd... 3 Four Rooms (1995) 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
3 4|Get Shorty (1995)|01-Jan-1995||http://us.imd... 4 Get Shorty (1995) 1 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 5|Copycat (1995)|01-Jan-1995||http://us.imdb.c... 5 Copycat (1995) 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 1 0 0

5 rows × 21 columns


In [13]:
ratings = pd.read_csv('ml-100k/u.data', header=None, names=['info'])
ratings['user_id'] = ratings['info'].apply(lambda rec: rec.split('\t')[0])
ratings['movie_id'] = ratings['info'].apply(lambda rec: rec.split('\t')[1])
ratings['rating'] = ratings['info'].apply(lambda rec: rec.split('\t')[2])
ratings.head()


Out[13]:
info user_id movie_id rating
0 196\t242\t3\t881250949 196 242 3
1 186\t302\t3\t891717742 186 302 3
2 22\t377\t1\t878887116 22 377 1
3 244\t51\t2\t880606923 244 51 2
4 166\t346\t1\t886397596 166 346 1

In [14]:
df = ratings.merge(user, on='user_id').merge(movie, on='movie_id')

Simple dataset


In [118]:
import numpy as np
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from pyfm import pylibfm


# Read in data
def loadData(filename,path="ml-100k/"):
    data = []
    y = []
    users=set()
    items=set()
    with open(path+filename) as f:
        for line in f:
            (user,movieid,rating,ts)=line.split('\t')
            data.append({ "user_id": str(user), "movie_id": str(movieid)})
            y.append(float(rating))
            users.add(user)
            items.add(movieid)

    return (data, np.array(y), users, items)

(train_data, y_train, train_users, train_items) = loadData("ua.base")
(test_data, y_test, test_users, test_items) = loadData("ua.test")
v = DictVectorizer()
X_train = v.fit_transform(train_data)
X_test = v.transform(test_data)
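DictVectorizer one-hot encodes the two categorical features, so every row of X_train has exactly two non-zero entries: one column for the user id and one for the movie id (on the order of 943 + 1682 columns for ML-100K). A quick, purely illustrative sanity check:

print(X_train.shape)          # (number of ratings, user columns + movie columns)
print(X_train[0].nnz)         # 2 non-zeros per row: user indicator + movie indicator
print(v.feature_names_[:3])   # names look like 'movie_id=1', 'user_id=308', ...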

Linear Regression


In [25]:
lr = LinearRegression()
lr.fit(X_train, y_train)
train_score = mean_squared_error(y_train, lr.predict(X_train))
test_score = mean_squared_error(y_test, lr.predict(X_test))
train_score, test_score


Out[25]:
(0.82847107974499679, 0.92806375584295875)

In [29]:
ratings[(ratings.user_id == '308') & (ratings.movie_id == '1')]


Out[29]:
info user_id movie_id rating
24 308\t1\t4\t887736532 308 1 4

In [31]:
ex = v.transform([{'user_id': '308'}])
print(ex.indices)
ex = v.transform([{'movie_id': '1'}])
print(ex.indices)


[1912]
[0]

In [32]:
lr.coef_[0], lr.coef_[1912], lr.intercept_


Out[32]:
(0.53830791489356944, -0.15291660287472478, 3.5238268742409184)

In [33]:
0.53830791489356944 + (-0.15291660287472478) + 3.5238268742409184


Out[33]:
3.9092181862597633

In [35]:
ex = v.transform([{'user_id': '308', 'movie_id': '1'}])
print(lr.predict(ex))


[ 3.90921819]
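This matches the coefficient sum above: for a one-hot (user, movie) row the linear model's prediction is simply intercept + w(movie_id=1) + w(user_id=308) = 3.524 + 0.538 - 0.153 ≈ 3.909. A plain linear model can therefore only learn a per-movie bias and a per-user bias; it has no term that depends on the particular user-movie combination, which is exactly what the factorization machine adds next.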

Factorization machine
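pylibfm fits a standard second-order factorization machine: each feature gets a $k$-dimensional latent vector $v_i$ (here num_factors=14), and the prediction is

$\hat{y}(x) = w_0 + \sum_i w_i x_i + \sum_{i<j} \langle v_i, v_j \rangle x_i x_j$

For a one-hot (user, movie) row this reduces to $w_0 + w_{user} + w_{movie} + \langle v_{user}, v_{movie} \rangle$: the same bias terms as the linear regression plus a learned rank-$k$ user-movie interaction.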


In [36]:
fm = pylibfm.FM(num_factors=14, num_iter=100, verbose=True, task="regression", 
                initial_learning_rate=0.001, learning_rate_schedule="optimal")
fm.fit(X_train, y_train)
train_score = mean_squared_error(y_train, fm.predict(X_train))
test_score = mean_squared_error(y_test, fm.predict(X_test))
print('train: ', train_score, ' test: ', test_score)


Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.59498
-- Epoch 2
Training MSE: 0.51803
-- Epoch 3
Training MSE: 0.49009
-- Epoch 4
Training MSE: 0.47416
-- Epoch 5
Training MSE: 0.46351
-- Epoch 6
Training MSE: 0.45599
-- Epoch 7
Training MSE: 0.45005
-- Epoch 8
Training MSE: 0.44539
-- Epoch 9
Training MSE: 0.44151
-- Epoch 10
Training MSE: 0.43830
-- Epoch 11
Training MSE: 0.43543
-- Epoch 12
Training MSE: 0.43283
-- Epoch 13
Training MSE: 0.43058
-- Epoch 14
Training MSE: 0.42849
-- Epoch 15
Training MSE: 0.42652
-- Epoch 16
Training MSE: 0.42479
-- Epoch 17
Training MSE: 0.42315
-- Epoch 18
Training MSE: 0.42159
-- Epoch 19
Training MSE: 0.42012
-- Epoch 20
Training MSE: 0.41873
-- Epoch 21
Training MSE: 0.41730
-- Epoch 22
Training MSE: 0.41595
-- Epoch 23
Training MSE: 0.41465
-- Epoch 24
Training MSE: 0.41327
-- Epoch 25
Training MSE: 0.41203
-- Epoch 26
Training MSE: 0.41058
-- Epoch 27
Training MSE: 0.40918
-- Epoch 28
Training MSE: 0.40780
-- Epoch 29
Training MSE: 0.40633
-- Epoch 30
Training MSE: 0.40472
-- Epoch 31
Training MSE: 0.40323
-- Epoch 32
Training MSE: 0.40146
-- Epoch 33
Training MSE: 0.39979
-- Epoch 34
Training MSE: 0.39804
-- Epoch 35
Training MSE: 0.39619
-- Epoch 36
Training MSE: 0.39412
-- Epoch 37
Training MSE: 0.39211
-- Epoch 38
Training MSE: 0.39008
-- Epoch 39
Training MSE: 0.38786
-- Epoch 40
Training MSE: 0.38571
-- Epoch 41
Training MSE: 0.38324
-- Epoch 42
Training MSE: 0.38090
-- Epoch 43
Training MSE: 0.37852
-- Epoch 44
Training MSE: 0.37606
-- Epoch 45
Training MSE: 0.37352
-- Epoch 46
Training MSE: 0.37093
-- Epoch 47
Training MSE: 0.36844
-- Epoch 48
Training MSE: 0.36592
-- Epoch 49
Training MSE: 0.36335
-- Epoch 50
Training MSE: 0.36089
-- Epoch 51
Training MSE: 0.35831
-- Epoch 52
Training MSE: 0.35588
-- Epoch 53
Training MSE: 0.35341
-- Epoch 54
Training MSE: 0.35102
-- Epoch 55
Training MSE: 0.34861
-- Epoch 56
Training MSE: 0.34637
-- Epoch 57
Training MSE: 0.34410
-- Epoch 58
Training MSE: 0.34188
-- Epoch 59
Training MSE: 0.33969
-- Epoch 60
Training MSE: 0.33762
-- Epoch 61
Training MSE: 0.33563
-- Epoch 62
Training MSE: 0.33359
-- Epoch 63
Training MSE: 0.33172
-- Epoch 64
Training MSE: 0.32984
-- Epoch 65
Training MSE: 0.32805
-- Epoch 66
Training MSE: 0.32627
-- Epoch 67
Training MSE: 0.32467
-- Epoch 68
Training MSE: 0.32300
-- Epoch 69
Training MSE: 0.32136
-- Epoch 70
Training MSE: 0.31990
-- Epoch 71
Training MSE: 0.31848
-- Epoch 72
Training MSE: 0.31706
-- Epoch 73
Training MSE: 0.31585
-- Epoch 74
Training MSE: 0.31452
-- Epoch 75
Training MSE: 0.31335
-- Epoch 76
Training MSE: 0.31210
-- Epoch 77
Training MSE: 0.31111
-- Epoch 78
Training MSE: 0.31004
-- Epoch 79
Training MSE: 0.30902
-- Epoch 80
Training MSE: 0.30815
-- Epoch 81
Training MSE: 0.30726
-- Epoch 82
Training MSE: 0.30643
-- Epoch 83
Training MSE: 0.30560
-- Epoch 84
Training MSE: 0.30485
-- Epoch 85
Training MSE: 0.30415
-- Epoch 86
Training MSE: 0.30345
-- Epoch 87
Training MSE: 0.30282
-- Epoch 88
Training MSE: 0.30219
-- Epoch 89
Training MSE: 0.30156
-- Epoch 90
Training MSE: 0.30115
-- Epoch 91
Training MSE: 0.30052
-- Epoch 92
Training MSE: 0.30013
-- Epoch 93
Training MSE: 0.29962
-- Epoch 94
Training MSE: 0.29924
-- Epoch 95
Training MSE: 0.29887
-- Epoch 96
Training MSE: 0.29844
-- Epoch 97
Training MSE: 0.29819
-- Epoch 98
Training MSE: 0.29773
-- Epoch 99
Training MSE: 0.29748
-- Epoch 100
Training MSE: 0.29707
train:  0.594370616337  test:  0.894449652828

In [40]:
ex = v.transform([{'user_id': '308', 'movie_id': '1'}])
print(fm.predict(ex))


[ 3.8576649]

Statistically the FM is better (lower test MSE), but on this particular example it did slightly worse than the linear regression.

Vowpal Wabbit

https://habrahabr.ru/company/mlclass/blog/248779/ - "When you really have a lot of data: Vowpal Wabbit" (@akrot)
https://habrahabr.ru/company/ods/blog/326418/ - "Open Machine Learning Course. Topic 8. Learning on gigabytes of data with Vowpal Wabbit" (Open Data Science)


In [5]:
with open('ml-100k/ua.base') as fh, open('train.vw', 'w') as vw:
    for line in fh:
        (user, movieid, rating, ts)=line.split('\t')
        vw.write('{rating} |u user:{user} |m movie:{movie}\n'.format(
            rating=rating, user=user, movie=movieid))

with open('ml-100k/ua.test') as fh, open('test.vw', 'w') as vw:
    for line in fh:
        (user, movieid, rating, ts)=line.split('\t')
        vw.write('{rating} |u user:{user} |m movie:{movie}\n'.format(
            rating=rating, user=user, movie=movieid))
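A record like 196\t242\t3 from the ratings file becomes the line

3 |u user:196 |m movie:242

Note that in VW syntax user:196 is a single feature named user with numeric value 196 inside namespace u (hence "current features 3" in the logs below: constant + user + movie), rather than a one-hot indicator. The first run below trains a plain linear model on these features; the later runs add --lrq um14 and --lrqfa um14, which ask VW for a rank-14 low-rank quadratic interaction between the u and m namespaces, i.e. an FM-style user x movie term.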

In [6]:
!vw --loss_function squared train.vw -b 29 --passes 5 --cache_file cache -f model.vw


final_regressor = model.vw
Num weight bits = 29
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
creating cache_file = cache
Reading datafile = train.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
25.000000 25.000000            1            1.0   5.0000   0.0000        3
14.381521 3.763042            2            2.0   3.0000   1.0601        3
9.299075 4.216629            4            4.0   3.0000   1.7951        3
6.438642 3.578208            8            8.0   1.0000   2.8163        3
4.745547 3.052453           16           16.0   3.0000   3.8444        3
3.134292 1.523038           32           32.0   2.0000   2.8383        3
2.143737 1.153181           64           64.0   1.0000   3.7473        3
2.005677 1.867618          128          128.0   4.0000   2.9109        3
1.732375 1.459073          256          256.0   4.0000   3.7371        3
1.834211 1.936046          512          512.0   5.0000   2.8074        3
1.530229 1.226248         1024         1024.0   3.0000   3.3625        3
1.529839 1.529449         2048         2048.0   3.0000   3.5993        3
1.368792 1.207745         4096         4096.0   4.0000   3.7780        3
1.218920 1.069047         8192         8192.0   5.0000   3.5081        3
1.207996 1.197073        16384        16384.0   3.0000   3.3259        3
1.172874 1.137751        32768        32768.0   2.0000   3.6545        3
1.155978 1.139083        65536        65536.0   4.0000   3.6415        3
1.210399 1.210399       131072       131072.0   3.0000   3.9090        3 h
1.210201 1.210004       262144       262144.0   5.0000   3.5868        3 h

finished run
number of examples per pass = 81513
passes used = 4
weighted example sum = 326052.000000
weighted label sum = 1148632.000000
average loss = 1.157558 h
best constant = 3.522849
total feature number = 978156

In [8]:
!vw --loss_function squared -i model.vw test.vw


Num weight bits = 29
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = test.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
5.996206 5.996206            1            1.0   4.0000   1.5513        3
4.226705 2.457205            2            2.0   4.0000   2.4325        3
2.399238 0.571770            4            4.0   3.0000   3.5336        3
2.158566 1.917895            8            8.0   3.0000   3.4092        3
1.774687 1.390807           16           16.0   3.0000   3.7884        3
2.151676 2.528665           32           32.0   4.0000   2.6427        3
1.994070 1.836465           64           64.0   3.0000   3.4915        3
1.529432 1.064793          128          128.0   4.0000   4.0766        3
1.606020 1.682608          256          256.0   3.0000   3.7606        3
1.350834 1.095647          512          512.0   4.0000   3.5630        3
1.289408 1.227982         1024         1024.0   3.0000   3.3484        3
1.306678 1.323949         2048         2048.0   4.0000   3.3871        3
1.225587 1.144495         4096         4096.0   3.0000   3.2708        3
1.237257 1.248926         8192         8192.0   2.0000   3.7169        3

finished run
number of examples per pass = 9430
passes used = 1
weighted example sum = 9430.000000
weighted label sum = 33833.000000
average loss = 1.229125
best constant = 3.587805
total feature number = 28290

In [11]:
!rm cache

In [10]:
!vw --loss_function squared train.vw -b 29 --lrq um14 --passes 5 --cache_file cache -f model.vw


final_regressor = model.vw
creating low rank quadratic features for pairs: um14 
Num weight bits = 29
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
creating cache_file = cache
Reading datafile = train.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
25.000000 25.000000            1            1.0   5.0000   0.0000        3
14.500000 4.000000            2            2.0   3.0000   5.0000        3
7.673673 0.847346            4            4.0   3.0000   3.8256        3
5.585479 3.497286            8            8.0   1.0000   4.2263        3
4.207890 2.830300           16           16.0   3.0000   4.9807        3
2.995360 1.782831           32           32.0   2.0000   2.1405        3
2.074197 1.153034           64           64.0   1.0000   3.6647        3
2.098773 2.123349          128          128.0   4.0000   2.1646        3
1.894462 1.690150          256          256.0   4.0000   3.5794        3
2.028764 2.163067          512          512.0   5.0000   2.6327        3
1.684760 1.340756         1024         1024.0   3.0000   3.1179        3
1.594485 1.504210         2048         2048.0   3.0000   4.1025        3
1.379674 1.164864         4096         4096.0   4.0000   3.8819        3
1.221207 1.062739         8192         8192.0   5.0000   3.6725        3
1.194231 1.167256        16384        16384.0   3.0000   3.3936        3
1.151618 1.109004        32768        32768.0   2.0000   3.8339        3
1.128993 1.106368        65536        65536.0   4.0000   3.8179        3
1.178586 1.178586       131072       131072.0   3.0000   3.8563        3 h
1.184498 1.190409       262144       262144.0   5.0000   3.6205        3 h

finished run
number of examples per pass = 81513
passes used = 4
weighted example sum = 326052.000000
weighted label sum = 1148632.000000
average loss = 1.120113 h
best constant = 3.522849
total feature number = 978156

In [12]:
!vw --loss_function squared -i model.vw test.vw


creating low rank quadratic features for pairs: um14 
Num weight bits = 29
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = test.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
6.908770 6.908770            1            1.0   4.0000   1.3715        3
3.954385 1.000000            2            2.0   4.0000   5.0000        3
3.170250 2.386114            4            4.0   3.0000   5.0000        3
2.833448 2.496647            8            8.0   3.0000   4.3165        3
2.450947 2.068445           16           16.0   3.0000   3.2628        3
2.554576 2.658206           32           32.0   4.0000   2.6436        3
2.165291 1.776006           64           64.0   3.0000   4.2277        3
1.714051 1.262811          128          128.0   4.0000   3.8675        3
1.697511 1.680971          256          256.0   3.0000   3.8690        3
1.416145 1.134779          512          512.0   4.0000   3.6455        3
1.320989 1.225834         1024         1024.0   3.0000   3.4087        3
1.324566 1.328142         2048         2048.0   4.0000   3.2788        3
1.230803 1.137041         4096         4096.0   3.0000   2.9843        3
1.242380 1.253957         8192         8192.0   2.0000   3.8026        3

finished run
number of examples per pass = 9430
passes used = 1
weighted example sum = 9430.000000
weighted label sum = 33833.000000
average loss = 1.233394
best constant = 3.587805
total feature number = 28290

In [2]:
!rm cache

In [3]:
!vw --loss_function squared train.vw -b 29 --lrqfa um14 --passes 5 --cache_file cache -f model.vw


final_regressor = model.vw
Num weight bits = 29
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
creating cache_file = cache
Reading datafile = train.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
25.000000 25.000000            1            1.0   5.0000   0.0000        3
14.500000 4.000000            2            2.0   3.0000   5.0000        3
7.673719 0.847438            4            4.0   3.0000   3.8255        3
5.585484 3.497250            8            8.0   1.0000   4.2263        3
4.207909 2.830333           16           16.0   3.0000   4.9807        3
2.995374 1.782839           32           32.0   2.0000   2.1405        3
2.074204 1.153033           64           64.0   1.0000   3.6647        3
2.098776 2.123348          128          128.0   4.0000   2.1646        3
1.894464 1.690151          256          256.0   4.0000   3.5794        3
2.028768 2.163072          512          512.0   5.0000   2.6327        3
1.684764 1.340760         1024         1024.0   3.0000   3.1179        3
1.594488 1.504212         2048         2048.0   3.0000   4.1025        3
1.379676 1.164865         4096         4096.0   4.0000   3.8819        3
1.221208 1.062739         8192         8192.0   5.0000   3.6725        3
1.194232 1.167256        16384        16384.0   3.0000   3.3936        3
1.151618 1.109004        32768        32768.0   2.0000   3.8339        3
1.128993 1.106368        65536        65536.0   4.0000   3.8179        3
1.178586 1.178586       131072       131072.0   3.0000   3.8563        3 h
1.184497 1.190408       262144       262144.0   5.0000   3.6205        3 h

finished run
number of examples per pass = 81513
passes used = 4
weighted example sum = 326052.000000
weighted label sum = 1148632.000000
average loss = 1.120113 h
best constant = 3.522849
total feature number = 978156

In [4]:
!vw --loss_function squared -i model.vw test.vw


Num weight bits = 29
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = test.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
6.908787 6.908787            1            1.0   4.0000   1.3715        3
3.954393 1.000000            2            2.0   4.0000   5.0000        3
3.170230 2.386066            4            4.0   3.0000   5.0000        3
2.833476 2.496722            8            8.0   3.0000   4.3166        3
2.450954 2.068432           16           16.0   3.0000   3.2628        3
2.554575 2.658196           32           32.0   4.0000   2.6437        3
2.165282 1.775990           64           64.0   3.0000   4.2277        3
1.714047 1.262812          128          128.0   4.0000   3.8675        3
1.697512 1.680978          256          256.0   3.0000   3.8690        3
1.416144 1.134777          512          512.0   4.0000   3.6455        3
1.320989 1.225833         1024         1024.0   3.0000   3.4087        3
1.324566 1.328143         2048         2048.0   4.0000   3.2788        3
1.230803 1.137041         4096         4096.0   3.0000   2.9843        3
1.242380 1.253957         8192         8192.0   2.0000   3.8026        3

finished run
number of examples per pass = 9430
passes used = 1
weighted example sum = 9430.000000
weighted label sum = 33833.000000
average loss = 1.233394
best constant = 3.587805
total feature number = 28290

LibFFM and classification

File format is similar to LibSVM:

<label> <field1>:<index1>:<value1> <field2>:<index2>:<value2> ...
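With the converter below, a 5-star rating by user 196 on movie 242 is written as

1 u:196:1 m:242:1

and a 1-star rating as -1 u:196:1 m:242:1. Ratings are collapsed to a binary like/dislike label: in the training file every rating other than 1 maps to +1, while the test file keeps only the extreme ratings 1 and 5.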

In [102]:
with open('ml-100k/ua.base') as fh, open('train.libffm', 'w') as vw:
    for line in fh:
        (user, movieid, rating, ts)=line.split('\t')
        rating = int(rating)
        label = -1 if rating == 1 else 1
        vw.write('{label} u:{user}:1 m:{movie}:1\n'.format(
            label=label, user=user, movie=movieid))

with open('ml-100k/ua.test') as fh, open('test.libffm', 'w') as vw:
    for line in fh:
        (user, movieid, rating, ts)=line.split('\t')
        rating = int(rating)
        if rating in {2, 3, 4}:
            continue
        label = -1 if rating == 1 else 1
        vw.write('{label} u:{user}:1 m:{movie}:1\n'.format(
            label=label, user=user, movie=movieid))

In [154]:
!ffm-train -p test.libffm -l 0.05 -t 50 -k 14 train.libffm model.libffm


iter   tr_logloss   va_logloss
   1      0.33135      0.45627
   2      0.25695      0.45696
   3      0.25340      0.45518
   4      0.25215      0.45604
   5      0.25150      0.45771
   6      0.25071      0.45612
   7      0.25058      0.45815
   8      0.25047      0.45632
   9      0.25020      0.45643
  10      0.24997      0.45812
  11      0.24990      0.45714
  12      0.24948      0.45746
  13      0.24975      0.45848
  14      0.24929      0.45791
  15      0.24941      0.45771
  16      0.24935      0.45808
  17      0.24933      0.45783
  18      0.24911      0.45840
  19      0.24889      0.45856
  20      0.24910      0.45799
  21      0.24878      0.45867
  22      0.24911      0.45767
  23      0.24901      0.45831
  24      0.24862      0.45803
  25      0.24896      0.45728
  26      0.24870      0.45716
  27      0.24910      0.45851
  28      0.24857      0.45817
  29      0.24870      0.45805
  30      0.24869      0.45848
  31      0.24837      0.45771
  32      0.24884      0.45867
  33      0.24842      0.45810
  34      0.24855      0.45826
  35      0.24871      0.45813
  36      0.24873      0.45763
  37      0.24848      0.45747
  38      0.24832      0.45797
  39      0.24872      0.45836
  40      0.24812      0.45790
  41      0.24829      0.45827
  42      0.24836      0.45845
  43      0.24844      0.45840
  44      0.24858      0.45796
  45      0.24822      0.45806
  46      0.24828      0.45841
  47      0.24838      0.45814
  48      0.24831      0.45797
  49      0.24834      0.45827
  50      0.24853      0.45837

In [106]:
!ffm-predict test.libffm model.libffm output


logloss = 0.48078

In [138]:
# restrict both sets to the extreme ratings 1 and 5 for a binary like/dislike task
idx = (y_train == 5) | (y_train == 1)
y_train_label = y_train[idx]
X_train_bin = X_train[idx]

idx = (y_test == 5) | (y_test == 1)
y_test_label = y_test[idx]
X_test_bin = X_test[idx]

In [139]:
# map ratings {1, 5} to binary labels {0, 1}
y_train_label = (y_train_label - 1) / 4
y_test_label = (y_test_label - 1) / 4

In [142]:
lr = LogisticRegression()
lr.fit(X_train_bin, y_train_label)
train_score = log_loss(y_train_label, lr.predict_proba(X_train_bin))
test_score = log_loss(y_test_label, lr.predict_proba(X_test_bin))
train_score, test_score


Out[142]:
(0.24903047224116151, 0.30373051559950098)

In [ ]: