In [ ]:
import json, io, h5py
import pandas as pd
import numpy as np
import artm
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
In [ ]:
from utils import sample_from
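In [ ]:
# `sample_from` lives in the local utils module and is not shown here. Judging by how it is used
# below (an Inception probability vector goes in, a token -> count mapping comes out), it presumably
# draws pseudo-counts proportional to the class probabilities. A minimal sketch of that idea, for
# orientation only (the name `sample_from_sketch` and n=100 are made up; the real code may differ):
def sample_from_sketch(probs, n=100):
    probs = np.asarray(probs, dtype=float)
    draws = np.random.choice(len(probs), size=n, p=probs / probs.sum())
    return Counter(draws)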
In [ ]:
with open('../data/insception-classes.tsv') as income:
    class_names = dict(map(lambda l: map(str.strip, l.split('\t')), income))  # Inception class token -> human-readable name
In [ ]:
with open('../data/second_big_cleaned_andlenght-filtered.json') as income:
    df = pd.DataFrame(map(json.loads, income))
with h5py.File("../data/img_url2inception.backup.h5", 'r') as hdf5_inception_dreams:
    %time df['classes'] = df.img_url.apply(hdf5_inception_dreams.get).apply(np.array)  # beware: random disk access, this is slow
df.dropna(axis=0, subset=['classes'], inplace=True)
In [ ]:
df = df.sample(frac=1., random_state=42) # some shuffling
In [ ]:
samples_size = 300
In [ ]:
only_pics = df.iloc[:samples_size].copy()                  # hold-out with pictures only
only_texts = df.iloc[samples_size:2*samples_size].copy()   # hold-out with texts/tags only
df = df.iloc[2*samples_size:]
In [ ]:
only_pics.drop('tag', axis=1, inplace=True)
only_pics.drop('text', axis=1, inplace=True)
only_pics.shape
In [ ]:
only_texts.drop('classes', axis=1, inplace=True)
only_texts.shape
In [ ]:
df.shape
In [ ]:
idx = 1
sample = sample_from(df.classes.iloc[idx])
plt.scatter(np.log(df.classes.iloc[idx]), zip(*sample.items())[-1])
plt.xlabel('log prob')
plt.ylabel('times sampled')
In [ ]:
for _ in range(3):
    sample = sample_from(df.classes.iloc[idx])  # one draw per iteration; labels and bar heights come from the same draw
    keys, counts = zip(*sample.items())
    sns.barplot(list(keys), list(counts))
In [ ]:
df.text = df.text.apply(Counter)
only_texts.text = only_texts.text.apply(Counter)
In [ ]:
df.tag = df.tag.apply(Counter)
only_texts.tag = only_texts.tag.apply(Counter)
In [ ]:
df.classes = df.classes.apply(sample_from)
only_pics.classes = only_pics.classes.apply(sample_from)
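In [ ]:
# Quick peek at one converted row: after sample_from, `classes` holds pseudo-counts over Inception
# class ids (assuming a dict-like mapping is returned); show the five heaviest entries.
sorted(df.classes.iloc[0].items(), key=lambda kv: -kv[1])[:5]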
In [ ]:
once_warned = set()
def to_vw(row, key='img_url', fields=['tag', 'text', 'classes']):
    # one Vowpal Wabbit line per document: doc id (url without the scheme), then a |field section per modality
    line = u'%s ' % row[key][len('https://'):]
    for field in fields:
        if field not in row:
            if field not in once_warned:
                print 'WARNING: there is no %s field in the row %s' % (field, row[key])
                once_warned.add(field)
            continue
        if row[field] is not None and len(row[field]) > 0:
            line += u'|%s ' % field
            line += u'%s ' % ' '.join('%s:%i' % (unicode(pair[0]).replace(':', ''), pair[-1])
                                      for pair in row[field].items()
                                      if pair[-1] > 0 and len(unicode(pair[0]).replace(':', '')) > 0)
    return u'%s\n' % line
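In [ ]:
# Quick illustration of the Vowpal Wabbit line produced by to_vw, on a made-up row
# (the url, words and class id below are invented for illustration only):
toy_row = pd.Series({'img_url': 'https://example.com/pic.jpg',
                     'tag': Counter({'travel': 2}),
                     'text': Counter({'sea': 3, 'sun': 1}),
                     'classes': Counter({42: 7})})
print to_vw(toy_row)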
In [ ]:
df.shape
In [ ]:
# earlier dataset-name variants, kept for reference; the final run below uses 'main_trainset'
dataset_name = '68743_of_tags_text_classes'
In [ ]:
dataset_name = '17424_of_tags_text_classes'
In [ ]:
dataset_name = 'to_filter_of_tags_text'
In [ ]:
In [ ]:
def prepare_for_artm(df, dataset_name):
    vw_path = '../data/%s.vw' % dataset_name
    with io.open(vw_path, 'w', encoding='utf8') as outcome:
        for k, row in df.iterrows():
            outcome.write(to_vw(row))
    artm_path = '../data/%s_batches' % dataset_name
    artm.BatchVectorizer(target_folder=artm_path, data_path=vw_path, data_format='vowpal_wabbit', batch_size=1000)
In [ ]:
prepare_for_artm(df, 'main_trainset')
In [ ]:
prepare_for_artm(only_texts, 'only_text_testset')
In [ ]:
prepare_for_artm(only_pics, 'only_pics_testset')
In [ ]:
dataset_name = 'main_trainset'
In [ ]:
artm_path = '../data/%s_batches'%dataset_name
In [ ]:
batch_to_train = artm.BatchVectorizer(data_path=artm_path)
In [ ]:
In [ ]:
from nltk.corpus import stopwords
# from pymorphy2 import MorphAnalyzer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import RussianStemmer
rus_stop_words = set(stopwords.words('russian'))
en_stop_words = set(stopwords.words('english'))
In [ ]:
batch_to_train.dictionary.filter('text', min_df=2, max_df_rate=0.5)
batch_to_train.dictionary.filter('tag', min_df=2)
batch_to_train.dictionary.filter('classes', min_df=2, max_df_rate=0.5)
In [ ]:
batch_to_train.dictionary.save_text('../data/dict.csv')
In [ ]:
artm_dict = pd.read_csv('../data/dict.csv', skiprows=1, sep=', ', engine='python')  # skip the 'name: ...' header line; columns are ', '-separated
In [ ]:
artm_dict.token = artm_dict.token.apply(lambda w: unicode(w.decode('utf8')))
In [ ]:
stemmer = PorterStemmer()
ru_stemmer = RussianStemmer()
print artm_dict.shape
en_stoplist = set(map(stemmer.stem, en_stop_words))
artm_dict = artm_dict[~artm_dict.token.isin(en_stoplist)]
print artm_dict.shape
# morph = MorphAnalyzer()  # pymorphy2 is not imported above and morph is never used below
ru_stoplist = set(ru_stemmer.stem(tok) for tok in rus_stop_words)
artm_dict = artm_dict[~artm_dict.token.isin(ru_stoplist)]
print artm_dict.shape
In [ ]:
artm_dict = artm_dict[~((artm_dict.class_id == 'text') & (artm_dict.token.str.len()<2))]
In [ ]:
filtered_path = '../data/dict.filtered'
In [ ]:
artm_dict.to_csv(filtered_path, encoding='utf8', index=None, header=True)
! echo "name: en_ru_stops_filtered" > /tmp/header_of_dict
! cp $filtered_path /tmp/full.dict
! cat /tmp/full.dict | sed -ne 's/,\(\w\)/, \1/gp' > /tmp/full.dict.filtered
! cat /tmp/header_of_dict /tmp/full.dict.filtered > $filtered_path
! rm /tmp/full.dict.filtered /tmp/full.dict
In [ ]:
cleaned_dict = artm.Dictionary()
cleaned_dict.load_text(filtered_path)
In [ ]:
sparsed_topics = 39
smoothed_topics = 5
topics = ['good_%i'%_ for _ in range(sparsed_topics)] + ['mess_%s'%_ for _ in range(smoothed_topics)]
In [ ]:
tm = artm.ARTM(topic_names=topics, num_processors=2)
tm.initialize(cleaned_dict)
In [ ]:
tm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='text_sparser', tau=-0.7, class_ids=['text']))
tm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='classes_sparser', tau=-0.3, class_ids=['classes']))
tm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='tags_smoother', tau=1, class_ids=['tag']))
tm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='topic_sparser', tau=-2, topic_names=topics[:sparsed_topics]))
tm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='topic_smoother', tau=2.5, topic_names=topics[sparsed_topics:]))  # smooth only the trailing 'mess_' topics
In [ ]:
In [ ]:
tm.scores.add(artm.PerplexityScore(name='perp', class_ids=['text', 'tag', 'classes']))
tm.scores.add(artm.TopTokensScore(name='top_words', class_id='text', num_tokens=250))
tm.scores.add(artm.TopTokensScore(name='top_tags', class_id='tag'))
tm.scores.add(artm.TopTokensScore(name='top_classes', class_id='classes', num_tokens=50))
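In [ ]:
# Optionally, sparsity can be tracked as well to watch the sparsing regularizers take effect
# (a small sketch; the score names here are arbitrary):
tm.scores.add(artm.SparsityPhiScore(name='text_phi_sparsity', class_id='text'))
tm.scores.add(artm.SparsityThetaScore(name='theta_sparsity'))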
In [ ]:
In [ ]:
%time tm.fit_offline(batch_to_train, 10)
In [ ]:
tm.regularizers.add(artm.TopicSelectionThetaRegularizer(name='selector', tau=0.4, topic_names=topics))
In [ ]:
%%time
for _ in range(10):
    tm.regularizers['selector'].tau += 0.02
    tm.fit_offline(batch_to_train, 2)
In [ ]:
%%time
for _ in range(4):
    tm.regularizers['selector'].tau += 0.05
    tm.fit_offline(batch_to_train, 5)
In [ ]:
%time tm.fit_offline(batch_to_train, 5)
In [ ]:
tm.regularizers['selector'].tau
In [ ]:
perplexity_score = tm.score_tracker['perp']
In [ ]:
plt.plot(perplexity_score.value)
In [ ]:
In [ ]:
def get_weighted(modality_score_tracker):
    # build a topic x token DataFrame from the last top-tokens snapshot
    res = {}
    lst_tokens = modality_score_tracker.last_tokens
    lst_weights = modality_score_tracker.last_weights
    for k in sorted(lst_tokens.keys()):
        res[k] = [(word, v) for (word, v) in
                  zip(lst_tokens[k], lst_weights[k])
                  if v > 0]
    topic_names, weights = zip(*res.items())
    weights = map(dict, weights)
    return pd.DataFrame(weights, index=topic_names).fillna(0).sort_index()

def get_top(row, threshold=0.95, number=5):
    # keep the heaviest entries of a row until `threshold` of the mass or `number` items is collected
    sorted_row = row.sort_values(ascending=False)
    res = []
    prob_mass = 0
    for k, val in sorted_row.iteritems():
        prob_mass += val
        res.append((k, val))
        if prob_mass > threshold:
            break
        if len(res) == number:
            res.append(('other', (1. - prob_mass)))
            break
    return res
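In [ ]:
# Tiny illustration of get_top's output on a made-up distribution (values are arbitrary):
# it keeps the heaviest (label, weight) pairs until 95% of the mass or 5 items are collected,
# adding an 'other' remainder in the latter case.
get_top(pd.Series({'a': 0.7, 'b': 0.2, 'c': 0.06, 'd': 0.04}))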
In [ ]:
def print_modality_top(mod_name, subst_name=None, threshold=0.95, max_items=5, need_print=False):
    topic_words = get_weighted(tm.score_tracker[mod_name])\
        .apply(lambda r: get_top(r, threshold, max_items), axis=1)
    if need_print:
        for t, words in topic_words.iteritems():
            print '%s: %s' % (t, ',\t'.join(
                '%s:%3.3f' % (name if subst_name is None else subst_name.get(name, 'empty_subst'), value)
                for name, value in words if value > 0))
    return topic_words
In [ ]:
print_modality_top('top_tags')
In [ ]:
print_modality_top('top_words', max_items=15, need_print=True)
In [ ]:
print_modality_top('top_classes', subst_name=class_names)
In [ ]:
In [ ]:
tm.save('../data/big_model.artm.mtx')
In [ ]:
def save_modality_mat(mat, name, ):
mat.to_csv('../data/%s.csv'%name, sep = '\t', encoding='utf8')
with io.open('../data/%s.json'%name, 'w', encoding='utf8') as outcome:
for u, data in mat.iterrows():
ans = dict((k, float(v)) for k,v in data.to_dict().items())
ans['token'] = u
outcome.write(u'%s\n'% json.dumps(ans))
In [ ]:
phi_mat_text = tm.get_phi(topic_names=topics, class_ids=['text'])
In [ ]:
(phi_mat_text.sum(axis=1) < 10**-9).sum()  # how many 'text' tokens ended up with (near-)zero weight in every topic
In [ ]:
phi_mat_text.sum() > 0.5
In [ ]:
save_modality_mat(phi_mat_text, 'phi_text')
In [ ]:
phi_mat_classes = tm.get_phi(topic_names=topics, class_ids=['classes'])
In [ ]:
save_modality_mat(phi_mat_classes, 'phi_classes')
In [ ]:
phi_mat_tag = tm.get_phi(topic_names=topics, class_ids=['tag'])
In [ ]:
save_modality_mat(phi_mat_tag, 'phi_tag')
In [ ]:
only_pic_batch = artm.BatchVectorizer(data_path='../data/only_pics_testset_batches')
In [ ]:
only_text_batch = artm.BatchVectorizer(data_path='../data/only_text_testset_batches')
In [ ]:
In [ ]:
only_pic_topics = tm.transform(only_pic_batch, predict_class_id='text')
In [ ]:
for _, v in only_pic_topics.apply(lambda x: get_top(x), axis=0).iteritems():
    print ', '.join('-'.join(map(unicode, r)) for r in v)
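In [ ]:
# A symmetric check for the text-only hold-out, along the lines of the cell above (a sketch):
# predict the missing 'classes' modality from texts and tags alone.
only_text_topics = tm.transform(only_text_batch, predict_class_id='classes')
for _, v in only_text_topics.apply(lambda x: get_top(x), axis=0).iteritems():
    print ', '.join('-'.join(map(unicode, r)) for r in v)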
In [ ]:
In [ ]:
res = pd.DataFrame(index=df.img_url)
In [ ]:
tm2 = artm.ARTM(num_topics=sparsed_topics + smoothed_topics)  # 39 + 5 = 44 topics, matching the saved model
In [ ]:
tm2.load('../data/big_model.artm.mtx')
In [ ]:
tm2.regularizers.add(artm.SmoothSparsePhiRegularizer(name='text_sparser', tau=-0.7, class_ids=['text']))
tm2.regularizers.add(artm.SmoothSparsePhiRegularizer(name='classes_sparser', tau=-0.3, class_ids=['classes']))
tm2.regularizers.add(artm.SmoothSparsePhiRegularizer(name='tags_smoother', tau=1, class_ids=['tag']))
tm2.regularizers.add(artm.SmoothSparseThetaRegularizer(name='topic_sparser', tau=-2, topic_names=topics[:sparsed_topics]))
tm2.regularizers.add(artm.SmoothSparseThetaRegularizer(name='topic_smoother', tau=2.5, topic_names=topics[sparsed_topics:]))  # smooth only the trailing 'mess_' topics
In [ ]:
In [ ]:
In [ ]:
In [ ]:
badset = set()
In [ ]:
goodset = set()
In [ ]:
very_bad_words = [u'арт', u'артикул', u'app2255775', u'app4216068']  # 'арт'/'артикул' = item/SKU number markers, plus two recurring app tokens
In [ ]:
for word in very_bad_words:
    print 'for', word, 'there are', df.text.apply(lambda l: word in l).sum(), 'records'
    badset.update(df[df.text.apply(lambda l: word in l)].sample(frac=0.01, replace=True).img_url)
In [ ]:
bad_words = [u'продать', u'продажа', u'склад', u'товар', u'прокат', u'ретушь', u'сантиметр']  # sell, sale, warehouse, goods, rental, retouching, centimetre
In [ ]:
for word in bad_words:
    print 'for', word, 'there are', df.text.apply(lambda l: word in l).sum(), 'records'
    badset.update(df[df.text.apply(lambda l: word in l)].sample(100, replace=True).img_url)
In [ ]:
# castle/lock, friendship, family, work, go (on a trip), hotel, island,
# stroll, hand in / rent out, get lucky, swim, delightful, laugh, go away
good_words = [u'замок', u'дружба', u'семья',
              u'работа', u'поехать', u'отель', u'остров',
              u'гулять', u'сдать', u'повезти', u'плавать',
              u'восхитительно', u'смеяться', u'уехать']
In [ ]:
for word in good_words:
    print 'for', word, 'there are', df.text.apply(lambda l: word in l).sum(), 'records'
    goodset.update(df[df.text.apply(lambda l: word in l)].sample(frac=0.1, replace=True).img_url)
In [ ]:
len(goodset)
In [ ]:
len(badset)
In [ ]:
from sklearn.svm import SVC
In [ ]:
model = SVC(kernel='linear', probability=True)
In [ ]:
topics_to_filter = tm.transform(batch_to_train).T  # theta for the whole training collection: rows = documents, columns = topics
topics_to_filter.sort_index(inplace=True)
topics_to_filter.index = df.img_url
topics_to_filter.shape
In [ ]:
topics_to_filter.columns = topics  # label the theta columns with the topic names
In [ ]:
train_df = topics_to_filter.loc[list(goodset)].append(topics_to_filter.loc[list(badset)])
In [ ]:
train_df['is_ad'] = [0]*len(goodset)+[1]*len(badset)
In [ ]:
train_df = train_df.sample(frac=1)
In [ ]:
model.fit(train_df.drop('is_ad', axis=1), train_df.is_ad)
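In [ ]:
# Rough sanity check of the ad classifier (a sketch; 5 folds chosen arbitrarily):
# cross-validated accuracy on the weakly labelled good/bad sample.
from sklearn.model_selection import cross_val_score
cross_val_score(SVC(kernel='linear'), train_df.drop('is_ad', axis=1), train_df.is_ad, cv=5)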
In [ ]:
preds = model.predict(topics_to_filter)
In [ ]:
preds.shape[0] - preds.sum()  # how many records were predicted as non-ads (is_ad == 0)
In [ ]:
In [ ]:
with open('../data/for_ivan_', 'w') as outcome:
    for url in topics_to_filter[preds < 0.5].index:
        outcome.write('%s\n' % url)
In [ ]:
import io
In [ ]:
with io.open('../data/big_to_download.json', 'w', encoding='utf8') as outcome:
    for u, data in df[preds < 0.5].iterrows():
        ans = data.to_dict()  # the row already carries its img_url, text, tag and classes
        outcome.write(u'%s\n' % json.dumps(ans))
In [ ]: