Predict likes by tags

Getting the data


In [17]:
import sys
import warnings
from tqdm import tqdm

import csv
import pandas as pd
import matplotlib.pyplot as plt

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine, Table, Column, Integer, String, Boolean, DateTime, Date, MetaData, ForeignKey
from sqlalchemy.engine.url import URL
from sqlalchemy.orm import sessionmaker
from sqlalchemy.pool import NullPool
local_name = 'sqlite:///../corpus/stories.sqlite'
engine = create_engine(local_name, echo=True)

%matplotlib inline

In [118]:
df = pd.read_sql_query('select * from stories', engine)


2018-03-21 04:14:35,330 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
INFO:sqlalchemy.engine.base.Engine:SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2018-03-21 04:14:35,338 INFO sqlalchemy.engine.base.Engine ()
INFO:sqlalchemy.engine.base.Engine:()
2018-03-21 04:14:35,343 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
INFO:sqlalchemy.engine.base.Engine:SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2018-03-21 04:14:35,348 INFO sqlalchemy.engine.base.Engine ()
INFO:sqlalchemy.engine.base.Engine:()
2018-03-21 04:14:35,351 INFO sqlalchemy.engine.base.Engine select * from stories
INFO:sqlalchemy.engine.base.Engine:select * from stories
2018-03-21 04:14:35,353 INFO sqlalchemy.engine.base.Engine ()
INFO:sqlalchemy.engine.base.Engine:()

In [119]:
print(df.shape)
df.head()


(23558, 8)
Out[119]:
id title published tags text likes hrefs url
0 2 Это был не металл! 20090910 women sellers Работаю в провинциальном городе в магазине отд... 0 http://zadolba.li/story/2
1 4 Интернет — он большой! 20090915 advert В понедельник у меня образовался очередной кли... 0 http://zadolba.li/story/4
2 5 Атака одушевленного обьекта 20090915 insurers mail Работаю в страховой компании. Любимые клиенты ... 0 http://zadolba.li/story/5
3 6 Мы его почти победили! 20091006 mail Переписка с клиентом:Я: Здравствуйте, а у вас ... 828 http://zadolba.li/story/6
4 8 На фирму Маше 20091006 byphone secretaries Работаю секретарем. Люди встречаются разные.Я:... 1127 http://zadolba.li/story/8

Easy mode: tags -> likes


In [234]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [235]:
df_tags_likes = df[['tags', 'likes']]
print(df_tags_likes.shape)
df_tags_likes[:10]


(23558, 2)
Out[235]:
tags likes
0 women sellers 0
1 advert 0
2 insurers mail 0
3 mail 828
4 byphone secretaries 1127
5 women 2882
6 women 2412
7 advert 2554
8 secretaries 2399
9 byphone 4241

In [248]:
from sklearn.preprocessing import Binarizer
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X, y = vectorizer.fit_transform(df_tags_likes['tags']), df_tags_likes['likes'].values
# one_hot_df = pd.DataFrame(X.toarray(), columns=sorted(vectorizer.vocabulary_, key=lambda k: vectorizer.vocabulary_[k]))

In [249]:
# one_hot_df['likes'] = df_tags_likes['likes']
# one_hot_df.shape
X, y


Out[249]:
(<23558x66 sparse matrix of type '<class 'numpy.int64'>'
 	with 39060 stored elements in Compressed Sparse Row format>,
 array([  0,   0,   0, ..., 541, 530, 860]))

In [252]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [253]:
lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)

In [254]:
lm.score(X_test, y_test)


Out[254]:
0.11439169094616364

Now let's try text


In [258]:
df_text_likes = df[['text', 'likes']]
print(df_text_likes.shape)

vectorizer = CountVectorizer()
X, y = vectorizer.fit_transform(df_text_likes['text']), df_text_likes['likes']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)

lm.score(X_test, y_test)


(23558, 2)
(18846, 252225) (4712, 252225) (18846,) (4712,)
Out[258]:
-0.23939961968084658

In [261]:
from sklearn.feature_extraction.text import TfidfVectorizer

df_text_likes = df[['text', 'likes']]
print(df_text_likes.shape)

vectorizer  = TfidfVectorizer()
X, y = vectorizer.fit_transform(df_text_likes['text']), df_text_likes['likes']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)

lm.score(X_test, y_test)


(23558, 2)
(18846, 252225) (4712, 252225) (18846,) (4712,)
Out[261]:
0.090341157290913188

In [ ]: