https://movielens.org - некоммерческая рекомендательная система
https://grouplens.org/datasets/movielens/ - datasets с сайта.
100k - малый dataset, 20M - большой
In [1]:
!unzip /tmp/ml-20m.zip -d .
In [3]:
!unzip /tmp/ml-100k.zip -d .
In [4]:
!ls ml-100k
In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
In [6]:
# Load the user table. Every raw record is kept in `info`; its first four
# '|'-separated fields are exposed as string columns.
user = pd.read_csv('ml-100k/u.user', header=None, names=['info'])
for position, column in enumerate(['user_id', 'age', 'gender', 'occupation']):
    # `position=position` binds the current index to this lambda.
    user[column] = user['info'].apply(
        lambda rec, position=position: rec.split('|')[position])
user.head()
Out[6]:
In [7]:
# Load the movie catalogue from the '|'-delimited ml-100k/u.item file.
#
# The file is read line by line instead of through `pd.read_csv` with its
# default ',' delimiter: a record containing a comma would be cut apart there,
# which is presumably why the original code needed a
# `len(rec.split('|')) == 24` guard on every genre column.
#
# Resulting columns (all strings):
#   info      - the raw record
#   movie_id  - field 0
#   title     - field 1
#   <genre>   - fields 6..23, '1'/'0' flags; '0' when the field is missing
#
# NOTE(review): the file is assumed to be Latin-1 encoded - confirm. Latin-1
# maps every byte to a character, so decoding itself can never fail.
with open('ml-100k/u.item', 'rb') as item_file:
    item_records = [raw.decode('latin-1').rstrip('\r\n') for raw in item_file]

# Blank lines carry no record.
movie = pd.DataFrame({'info': [rec for rec in item_records if rec]})

# Split every record once; reindex guarantees that all 24 positions exist
# even if some (or all) records are shorter.
item_fields = (movie['info'].str.split('|', expand=True)
               .reindex(columns=range(24)))

movie['movie_id'] = item_fields[0]
movie['title'] = item_fields[1]

# Genre flags occupy consecutive fields starting at position 6.
movie_genres = [
    'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
for position, genre in enumerate(movie_genres, 6):
    movie[genre] = item_fields[position].fillna('0')
movie.head()
Out[7]:
In [13]:
# Load the ratings table. Every raw record is kept in `info`; its first three
# tab-separated fields are exposed as string columns.
ratings = pd.read_csv('ml-100k/u.data', header=None, names=['info'])
for position, column in enumerate(['user_id', 'movie_id', 'rating']):
    # `position=position` binds the current index to this lambda.
    ratings[column] = ratings['info'].apply(
        lambda rec, position=position: rec.split('\t')[position])
ratings.head()
Out[13]:
In [14]:
# Attach the user attributes and then the movie attributes to every rating.
df = pd.merge(
    pd.merge(ratings, user, on='user_id'),
    movie,
    on='movie_id',
)
In [118]:
import numpy as np
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from pyfm import pylibfm
# Read in data
def loadData(filename, path="ml-100k/"):
    """Read a tab-separated ratings file.

    Every non-blank line must hold exactly four tab-separated fields:
    user id, movie id, rating and timestamp. Blank lines are skipped.

    Args:
        filename: Name of the ratings file; it is appended to ``path``.
        path: Prefix prepended verbatim to ``filename``, so a directory
            must end with a path separator.

    Returns:
        A tuple ``(data, y, users, items)``:
            data: list of ``{"user_id": str, "movie_id": str}`` dicts, one
                per rating, in file order.
            y: NumPy array with the ratings as floats, aligned with ``data``.
            users: set of the distinct user ids (strings).
            items: set of the distinct movie ids (strings).

    Raises:
        ValueError: If a non-blank line does not have exactly four fields or
            its rating is not a number.
    """
    data = []
    y = []
    users = set()
    items = set()
    with open(path + filename) as f:
        for line in f:
            # A blank (e.g. trailing) line would otherwise break the
            # four-way unpacking below.
            if not line.strip():
                continue
            user, movieid, rating, _ts = line.rstrip('\r\n').split('\t')
            data.append({"user_id": user, "movie_id": movieid})
            y.append(float(rating))
            users.add(user)
            items.add(movieid)
    return (data, np.array(y), users, items)
# Load both splits and turn the {"user_id", "movie_id"} dicts into sparse
# feature matrices. The vectorizer is fitted on the training split only and
# then reused for the test split.
train_data, y_train, train_users, train_items = loadData("ua.base")
test_data, y_test, test_users, test_items = loadData("ua.test")

v = DictVectorizer()
v.fit(train_data)
X_train = v.transform(train_data)
X_test = v.transform(test_data)
In [25]:
# Fit a linear regression on the training matrix and report the mean squared
# error on the train and test splits.
lr = LinearRegression().fit(X_train, y_train)
train_score, test_score = (
    mean_squared_error(target, lr.predict(features))
    for features, target in ((X_train, y_train), (X_test, y_test))
)
train_score, test_score
Out[25]:
In [29]:
# Show the rating row(s) of user '308' for movie '1' (ids are compared as strings).
ratings[(ratings['user_id'] == '308') & (ratings['movie_id'] == '1')]
Out[29]:
In [31]:
# Print the feature-matrix column index that the vectorizer uses for
# user '308' and for movie '1'.
# print() with a single argument behaves the same on Python 2 and Python 3.
ex = v.transform([{'user_id': '308'}])
print(ex.indices)
ex = v.transform([{'movie_id': '1'}])
print(ex.indices)
In [32]:
# Coefficient of movie '1', coefficient of user '308', and the intercept.
# The columns are looked up by feature name rather than hard-coded (the
# original used 0 and 1912, presumably copied from the printed indices), so
# the cell stays correct if the vectorizer's column order changes.
(lr.coef_[v.vocabulary_['movie_id=1']],
 lr.coef_[v.vocabulary_['user_id=308']],
 lr.intercept_)
Out[32]:
In [33]:
# Add the three numbers by hand to reproduce the linear model's prediction
# (values were pasted from an earlier cell's output - presumably the two
# coefficients and the intercept).
0.53830791489356944 - 0.15291660287472478 + 3.5238268742409184
Out[33]:
In [35]:
# Linear-regression prediction for the (user '308', movie '1') pair.
# print() with a single argument behaves the same on Python 2 and Python 3.
ex = v.transform([{'user_id': '308', 'movie_id': '1'}])
print(lr.predict(ex))
In [36]:
# Train a factorization machine (14 latent factors, 100 iterations) on the
# same matrices and report the mean squared error on both splits.
fm = pylibfm.FM(num_factors=14, num_iter=100, verbose=True, task="regression",
                initial_learning_rate=0.001, learning_rate_schedule="optimal")
fm.fit(X_train, y_train)
train_score = mean_squared_error(y_train, fm.predict(X_train))
test_score = mean_squared_error(y_test, fm.predict(X_test))
# One pre-formatted string keeps the printed text identical to the old
# Python 2 `print a, b, c, d` statement while also running on Python 3.
print('train:  {0}  test:  {1}'.format(train_score, test_score))
In [40]:
# Factorization-machine prediction for the (user '308', movie '1') pair.
# print() with a single argument behaves the same on Python 2 and Python 3.
ex = v.transform([{'user_id': '308', 'movie_id': '1'}])
print(fm.predict(ex))
Статистически FM лучше, но в данном конкретном эпизоде повела себя чуть хуже, чем linear regression.
https://habrahabr.ru/company/mlclass/blog/248779/ — «Когда данных действительно много: Vowpal Wabbit» (@akrot); https://habrahabr.ru/company/ods/blog/326418/ — «Открытый курс машинного обучения. Тема 8. Обучение на гигабайтах с Vowpal Wabbit» (Open Data Science)
In [5]:
# Convert the train/test rating files to Vowpal Wabbit's text format:
#
#     <rating> |u <user id> |m <movie id>
#
# The ids are written as bare feature names. In VW's input format `name:value`
# means "feature `name` with numeric value `value`", so the previous
# `user:308` form produced a single numeric feature called `user` instead of
# one indicator feature per user (same for movies).
for ratings_path, vw_path in (('ml-100k/ua.base', 'train.vw'),
                              ('ml-100k/ua.test', 'test.vw')):
    with open(ratings_path) as fh, open(vw_path, 'w') as vw:
        for line in fh:
            # Skip blank lines; they cannot be unpacked into four fields.
            if not line.strip():
                continue
            # `user_id` (not `user`) so the module-level `user` DataFrame is
            # not overwritten by this loop.
            user_id, movie_id, rating, _ts = line.rstrip('\r\n').split('\t')
            vw.write('{rating} |u {user} |m {movie}\n'.format(
                rating=rating, user=user_id, movie=movie_id))
In [6]:
!vw --loss_function squared train.vw -b 29 --passes 5 --cache_file cache -f model.vw
In [8]:
!vw --loss_function squared -i model.vw test.vw
In [11]:
!rm cache
In [10]:
!vw --loss_function squared train.vw -b 29 --lrq um14 --passes 5 --cache_file cache -f model.vw
In [12]:
!vw --loss_function squared -i model.vw test.vw
In [2]:
!rm cache
In [3]:
!vw --loss_function squared train.vw -b 29 --lrqfa um14 --passes 5 --cache_file cache -f model.vw
In [4]:
!vw --loss_function squared -i model.vw test.vw
File format is similar to LibSVM:
<label> <field1>:<index1>:<value1> <field2>:<index2>:<value2> ...
In [102]:
# Export a binary "1 star vs 5 stars" task in LIBFFM format:
#
#     <label> <field>:<index>:<value> ...
#
# Changes compared with the previous version of this cell:
#   * Fields are numeric (0 = user, 1 = movie). LIBFFM expects integer fields
#     and indices, so the earlier textual fields `u` / `m` were not valid.
#   * Users and movies get distinct feature indices from one shared map, so
#     user N and movie N no longer share a feature. The map is reused for the
#     test file so both files agree on the numbering.
#   * Ratings 2-4 are dropped from the training file as well, matching the
#     test file and the `(y == 5) | (y == 1)` selection used for the logistic
#     regression baseline.
#     NOTE(review): the old training file kept them as positives - confirm
#     this was not intentional.
feature_index = {}
for ratings_path, ffm_path in (('ml-100k/ua.base', 'train.libffm'),
                               ('ml-100k/ua.test', 'test.libffm')):
    with open(ratings_path) as fh, open(ffm_path, 'w') as out:
        for line in fh:
            # Skip blank lines; they cannot be unpacked into four fields.
            if not line.strip():
                continue
            # `user_id` (not `user`) so the module-level `user` DataFrame is
            # not overwritten by this loop.
            user_id, movie_id, rating, _ts = line.rstrip('\r\n').split('\t')
            rating = int(rating)
            if rating in (2, 3, 4):
                continue
            label = -1 if rating == 1 else 1
            # setdefault hands out the next free index on first sight of a key.
            user_feature = feature_index.setdefault(
                ('u', user_id), len(feature_index))
            movie_feature = feature_index.setdefault(
                ('m', movie_id), len(feature_index))
            out.write('{label} 0:{user}:1 1:{movie}:1\n'.format(
                label=label, user=user_feature, movie=movie_feature))
In [154]:
!ffm-train -p test.libffm -l 0.05 -t 50 -k 14 train.libffm model.libffm
In [106]:
!ffm-predict test.libffm model.libffm output
In [138]:
# Keep only the 1-star and 5-star ratings of each split.
idx = np.logical_or(y_train == 5, y_train == 1)
X_train_bin, y_train_label = X_train[idx], y_train[idx]

idx = np.logical_or(y_test == 5, y_test == 1)
X_test_bin, y_test_label = X_test[idx], y_test[idx]
In [139]:
# Rescale the remaining ratings to class labels: 1 -> 0.0 and 5 -> 1.0.
# Re-running this cell on already rescaled labels would shift them again.
y_train_label, y_test_label = (
    (y_train_label - 1) / 4,
    (y_test_label - 1) / 4,
)
In [142]:
# Fit a logistic regression on the binary subset and report the log loss on
# the train and test splits.
lr = LogisticRegression().fit(X_train_bin, y_train_label)
train_score, test_score = (
    log_loss(labels, lr.predict_proba(features))
    for features, labels in ((X_train_bin, y_train_label),
                             (X_test_bin, y_test_label))
)
train_score, test_score
Out[142]:
In [ ]: