In [ ]:
import json, io, h5py
import pandas as pd
import numpy as np

import artm

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

In [ ]:
from utils import sample_from
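
utils.sample_from is a local helper that is not included in this notebook. A minimal sketch of what it is assumed to do here (the name sample_from_sketch and the n_samples parameter are made up): turn an Inception class-probability vector into "how many times each class was drawn" counts, so the picture modality can later be treated as a bag of class tokens.

In [ ]:
# Hypothetical sketch of utils.sample_from -- an assumption for readability, not the real implementation.
def sample_from_sketch(probs, n_samples=100):
    probs = np.asarray(probs, dtype=float)
    probs = probs / probs.sum()                                    # normalize to a proper distribution
    draws = np.random.choice(len(probs), size=n_samples, p=probs)  # multinomial sampling of class indices
    counts = Counter(draws)
    return dict((i, counts.get(i, 0)) for i in range(len(probs)))  # class index -> times drawn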

In [ ]:
with open('../data/insception-classes.tsv') as income:
    class_names = dict(map(lambda l: map(str.strip, l.split('\t')), income))

In [ ]:
with open('../data/second_big_cleaned_andlenght-filtered.json') as income:
    df = pd.DataFrame(map(json.loads, income))

with h5py.File("../data/img_url2inception.backup.h5", 'r') as hdf5_inception_dreams:
    %time df['classes'] = df.img_url.apply(hdf5_inception_dreams.get).apply(np.array)  # Beware: random disk access!

df.dropna(axis=0, subset=['classes'], inplace=True)

In [ ]:
df = df.sample(frac=1., random_state=42) # some shuffling

In [ ]:
samples_size = 300

In [ ]:
only_pics = df.iloc[:samples_size]
only_texts = df.iloc[samples_size:2*samples_size]
df = df.iloc[2*samples_size:]

In [ ]:
only_pics.drop(['tag', 'text'], axis=1, inplace=True)
only_pics.shape

In [ ]:
only_texts.drop('classes', axis=1, inplace=True)
only_texts.shape

In [ ]:
df.shape

In [ ]:
idx = 1
plt.scatter(np.log(df.classes[idx]), zip(*sample_from(df.classes[idx]).items())[-1])  # log class probability vs. sampled count
plt.xlabel('prob')
plt.ylabel('times')

In [ ]:
for _ in range(3):
    sampled = sample_from(df.classes[idx])  # sample once per iteration so x and y come from the same draw
    keys, counts = zip(*sampled.items())
    sns.barplot(keys, counts)

In [ ]:
df.text = df.text.apply(Counter)
only_texts.text = only_texts.text.apply(Counter)

In [ ]:
df.tag = df.tag.apply(Counter)
only_texts.tag = only_texts.tag.apply(Counter)

In [ ]:
df.classes = df.classes.apply(sample_from)
only_pics.classes = only_pics.classes.apply(sample_from)

In [ ]:
warned_fields = set()


def to_vw(row, key='img_url', fields=['tag', 'text', 'classes']):
    line = '%s ' % row[key][len('https://'):]

    for field in fields:
        if field not in row:
            if field not in warned_fields:
                print 'WARNING: no %s field in row %s' % (field, row[key])
                warned_fields.add(field)
            continue

        if row[field] is not None and len(row[field]) > 0:
            line += '|%s ' % field
            line += '%s ' % ' '.join(
                '%s:%i' % (unicode(token).replace(':', ''), count)
                for token, count in row[field].items()
                if count > 0 and len(unicode(token).replace(':', '')) > 0)
    return '%s\n'%line
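
For reference, here is a toy call of to_vw (the row values below are invented) showing the shape of the Vowpal Wabbit line it emits: the document id is the URL without its scheme, followed by one |field block per modality with token:count pairs.

In [ ]:
# Toy illustration of the Vowpal Wabbit line produced by to_vw; all values are invented.
toy_row = {
    'img_url': 'https://example.com/pic.jpg',
    'tag': Counter({u'cats': 2}),
    'text': Counter({u'hello': 1, u'world': 3}),
    'classes': {281: 4, 285: 1},
}
print to_vw(toy_row)
# -> example.com/pic.jpg |tag cats:2 |text world:3 hello:1 |classes 281:4 285:1
#    (token order inside a block may differ)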

In [ ]:
df.shape

In [ ]:
dataset_name = '68743_of_tags_text_classes'

In [ ]:
dataset_name = '17424_of_tags_text_classes'

In [ ]:
dataset_name = 'to_filter_of_tags_text'

In [ ]:


In [ ]:
def prepare_for_artm(df, dataset_name):
    vw_path = '../data/%s.vw'%dataset_name
    with io.open(vw_path, 'w', encoding='utf8') as outcome:
        for k, row in df.iterrows():
            outcome.write(to_vw(row))
    artm_path = '../data/%s_batches'%dataset_name
    artm.BatchVectorizer(target_folder=artm_path, data_path=vw_path, data_format='vowpal_wabbit', batch_size=1000)

In [ ]:
prepare_for_artm(df, 'main_trainset')

In [ ]:
prepare_for_artm(only_texts, 'only_text_testset')

In [ ]:
prepare_for_artm(only_pics, 'only_pics_testset')

In [ ]:
dataset_name = 'main_trainset'

In [ ]:
artm_path = '../data/%s_batches'%dataset_name

In [ ]:
batch_to_train = artm.BatchVectorizer(data_path=artm_path)

In [ ]:

Cleaning up the dictionary. Close your eyes.


In [ ]:
from nltk.corpus import stopwords
# from pymorphy2 import MorphAnalyzer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import RussianStemmer

rus_stop_words = set(stopwords.words('russian'))
en_stop_words = set(stopwords.words('english'))

In [ ]:
batch_to_train.dictionary.filter('text', min_df=2, max_df_rate=0.5)
batch_to_train.dictionary.filter('tag', min_df=2)
batch_to_train.dictionary.filter('classes', min_df=2, max_df_rate=0.5)

In [ ]:
batch_to_train.dictionary.save_text('../data/dict.csv')

In [ ]:
artm_dict = pd.read_csv('../data/dict.csv', skiprows=1, sep=', ')

In [ ]:
artm_dict.token = artm_dict.token.apply(lambda w: w.decode('utf8'))

In [ ]:
stemmer = PorterStemmer()
ru_stemmer = RussianStemmer()

print artm_dict.shape

# drop stemmed English stop words
en_stoplist = set(map(stemmer.stem, en_stop_words))
artm_dict = artm_dict[~artm_dict.token.isin(en_stoplist)]
print artm_dict.shape

# drop stemmed Russian stop words
ru_stoplist = set(ru_stemmer.stem(tok) for tok in rus_stop_words)
artm_dict = artm_dict[~artm_dict.token.isin(ru_stoplist)]
print artm_dict.shape

In [ ]:
artm_dict = artm_dict[~((artm_dict.class_id == 'text') & (artm_dict.token.str.len()<2))]

In [ ]:
filtered_path = '../data/dict.filtered'

In [ ]:
artm_dict.to_csv(filtered_path, encoding='utf8', index=None, header=True)

# Dictionary.load_text expects the "name: ..." header line and ", "-separated columns
# (the format save_text wrote), so restore the header and the space after each comma
# that to_csv dropped.
! echo "name: en_ru_stops_filtered" > /tmp/header_of_dict
! cp $filtered_path /tmp/full.dict
! cat /tmp/full.dict | sed -ne 's/,\(\w\)/, \1/gp' > /tmp/full.dict.filtered
! cat /tmp/header_of_dict /tmp/full.dict.filtered > $filtered_path

! rm /tmp/full.dict.filtered /tmp/full.dict

Load it!!!


In [ ]:
cleaned_dict = artm.Dictionary()
cleaned_dict.load_text(filtered_path)

ARTM!


In [ ]:
sparsed_topics = 39
smoothed_topics = 5

topics = ['good_%i'%_ for _ in range(sparsed_topics)] + ['mess_%s'%_ for _ in range(smoothed_topics)]
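
A quick sanity check of the slices the regularizers below rely on: topics[:sparsed_topics] are the sparse 'good_*' subject topics, topics[sparsed_topics:] are the smoothed 'mess_*' background topics.

In [ ]:
# Sanity check: which topic names go to the sparser vs. the smoother.
print topics[:sparsed_topics][0], '...', topics[:sparsed_topics][-1]
print topics[sparsed_topics:]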

In [ ]:
tm = artm.ARTM(topic_names=topics, num_processors=2)

tm.initialize(cleaned_dict)

In [ ]:
tm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='text_sparser', tau=-0.7, class_ids=['text']))

tm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='classes_sparser', tau=-0.3, class_ids=['classes']))

tm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='tags_smoother', tau=1, class_ids=['tag']))

tm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='topic_sparser', tau=-2, topic_names=topics[:sparsed_topics]))

tm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='topic_smoother', tau=2.5, topic_names=topics[sparsed_topics:]))  # smooth only the 'mess_*' background topics

In [ ]:


In [ ]:
tm.scores.add(artm.PerplexityScore(name='perp', class_ids=['text', 'tag', 'classes']))

tm.scores.add(artm.TopTokensScore(name='top_words', class_id='text', num_tokens=250))

tm.scores.add(artm.TopTokensScore(name='top_tags', class_id='tag'))

tm.scores.add(artm.TopTokensScore(name='top_classes', class_id='classes', num_tokens=50))

In [ ]:


In [ ]:
%time tm.fit_offline(batch_to_train, 10)

In [ ]:
tm.regularizers.add(artm.TopicSelectionThetaRegularizer(name='selector', tau=0.4, topic_names=topics))

In [ ]:
%%time
for _ in range(10):
    tm.regularizers['selector'].tau +=0.02
    tm.fit_offline(batch_to_train, 2)

In [ ]:
%%time
for _ in range(4):
    tm.regularizers['selector'].tau +=0.05
    tm.fit_offline(batch_to_train, 5)

In [ ]:
%time tm.fit_offline(batch_to_train, 5)

In [ ]:
tm.regularizers['selector'].tau

End of ARTM training.


In [ ]:
perplexity_score = tm.score_tracker['perp']

In [ ]:
plt.plot(perplexity_score.value)

In [ ]:


In [ ]:
def get_weighted(modality_score_tracker):
    res = {}

    lst_tokens = modality_score_tracker.last_tokens
    lst_weights = modality_score_tracker.last_weights

    for k in sorted(lst_tokens.keys()):
        res[k] = [(word, v) for (word, v) in
                  zip(lst_tokens[k], lst_weights[k])
                  if v > 0]
    topic_names, weights = zip(*res.items())
    weights = map(dict, weights)
    return pd.DataFrame(weights, index=topic_names).fillna(0).sort_index()


def get_top(row, threshold=0.95, number=5):
    sorted_row = row.sort_values(ascending=False)

    res = []
    prob_mass = 0
    for k, val in sorted_row.iteritems():
        prob_mass += val
        res.append((k, val))
        if prob_mass > threshold:
            break
        if len(res) == number:
            # item limit reached before the threshold: lump the remaining mass into 'other'
            res.append(('other', (1. - prob_mass)))
            break
    
    return res
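
A toy check of get_top with invented numbers: it keeps the largest entries until the accumulated probability mass passes the threshold, and if it hits the item limit first it lumps the remaining mass into an 'other' entry.

In [ ]:
# Toy check of get_top on an invented distribution.
toy = pd.Series({'good_0': 0.5, 'good_1': 0.3, 'good_2': 0.15, 'mess_0': 0.05})
print get_top(toy, threshold=0.9, number=2)
# -> [('good_0', 0.5), ('good_1', 0.3), ('other', ~0.2)]  (item limit hit, remainder lumped into 'other')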

In [ ]:
def print_modality_top(mod_name, subst_name=None, threshold=0.95, max_items=5, need_print=False):
    topic_words = get_weighted(tm.score_tracker[mod_name])\
                                .apply(lambda r: get_top(r, threshold, max_items), axis=1)
    if need_print:
        for t, words in topic_words.iteritems():
            print '%s: %s' % (t, ',\t'.join(
                '%s:%3.3f' % (name if subst_name is None else subst_name.get(name, 'empty_subst'), value)
                                            for name, value in words if value > 0))
    return topic_words

In [ ]:
print_modality_top('top_tags')

In [ ]:
print_modality_top('top_words', max_items=15, need_print=True)

In [ ]:
print_modality_top('top_classes', subst_name=class_names)

In [ ]:


In [ ]:
tm.save('../data/big_model.artm.mtx')

In [ ]:
def save_modality_mat(mat, name):
    mat.to_csv('../data/%s.csv' % name, sep='\t', encoding='utf8')

    with io.open('../data/%s.json'%name, 'w', encoding='utf8') as outcome:
        for u, data in mat.iterrows():
            ans = dict((k, float(v)) for k,v in data.to_dict().items())
            ans['token'] = u
            outcome.write(u'%s\n'% json.dumps(ans))

In [ ]:
phi_mat_text = tm.get_phi(topic_names=topics, class_ids=['text'])

In [ ]:
(phi_mat_text.sum(axis=1) < 10**-9).sum()

In [ ]:
phi_mat_text.sum() > 0.5

In [ ]:
save_modality_mat(phi_mat_text, 'phi_text')

In [ ]:
phi_mat_classes = tm.get_phi(topic_names=topics, class_ids=['classes'])

In [ ]:
save_modality_mat(phi_mat_classes, 'phi_classes')

In [ ]:
phi_mat_tag = tm.get_phi(topic_names=topics, class_ids=['tag'])

In [ ]:
save_modality_mat(phi_mat_tag, 'phi_tag')

Better not to read past this point; the rest is just leftovers lying around.


In [ ]:
only_pic_batch = artm.BatchVectorizer(data_path='../data/only_pics_testset_batches')

In [ ]:
only_text_batch = artm.BatchVectorizer(data_path='../data/only_text_testset_batches')

In [ ]:


In [ ]:
only_pic_topics = tm.transform(only_pic_batch, predict_class_id='text')

In [ ]:
for _,v in only_pic_topics.apply(lambda x: get_top(x), axis=0).iteritems():
    print ', '.join('-'.join(map(unicode, r)) for r in v)

In [ ]:


In [ ]:
res = pd.DataFrame(index=df.img_url)

In [ ]:
tm2 = artm.ARTM(num_topics=44)  # 39 sparse + 5 background topics, as in the saved model

In [ ]:
tm2.load('../data/big_model.artm.mtx')

In [ ]:
tm2.regularizers.add(artm.SmoothSparsePhiRegularizer(name='text_sparser', tau=-0.7, class_ids=['text']))
tm2.regularizers.add(artm.SmoothSparsePhiRegularizer(name='classes_sparser', tau=-0.3, class_ids=['classes']))
tm2.regularizers.add(artm.SmoothSparsePhiRegularizer(name='tags_smoother', tau=1, class_ids=['tag']))
tm2.regularizers.add(artm.SmoothSparseThetaRegularizer(name='topic_sparser', tau=-2, topic_names=topics[:sparsed_topics]))
tm2.regularizers.add(artm.SmoothSparseThetaRegularizer(name='topic_smoother', tau=2.5, topic_names=topics[sparsed_topics:]))  # smooth only the 'mess_*' background topics

In [ ]:

In [ ]:
badset = set()

In [ ]:
goodset = set()

In [ ]:
very_bad_words = [u'арт', u'артикул', u'app2255775', u'app4216068']

In [ ]:
for word in very_bad_words:
    mask = df.text.apply(lambda l: word in l)
    print 'for', word, 'there are', mask.sum(), 'records'
    badset.update(df[mask].sample(frac=0.01, replace=True).index)

In [ ]:
bad_words = [u'продать', u'продажа', u'склад', u'товар', u'прокат', u'ретушь', u'сантиметр']

In [ ]:
for word in bad_words:
    mask = df.text.apply(lambda l: word in l)
    print 'for', word, 'there are', mask.sum(), 'records'
    badset.update(df[mask].sample(100, replace=True).index)

In [ ]:
good_words = [u'замок', u'дружба', u'семья', u'дружба', 
              u'работа', u'поехать', u'отель', u'остров', 
              u'гулять', u'сдать', u'повезти', u'плавать',
              u'восхитительно', u'смеяться', u'уехать']

In [ ]:
for word in good_words:
    mask = df.text.apply(lambda l: word in l)
    print 'for', word, 'there are', mask.sum(), 'records'
    goodset.update(df[mask].sample(frac=0.1, replace=True).index)

In [ ]:
len(goodset)

In [ ]:
len(badset)

In [ ]:
from sklearn.svm import SVC

In [ ]:
model = SVC(kernel='linear', probability=True)

In [ ]:
topics_to_filter = tm.transform(batch_to_predict).T  # batch_to_predict: batches for the documents in df (presumably batch_to_train built above)

topics_to_filter.sort_index(inplace=True)

topics_to_filter.index = df.img_url

topics_to_filter.shape

In [ ]:
topics_to_filter = topics_to_filter[topics]  # keep only the topic columns, in the declared topic order

In [ ]:
train_df = topics_to_filter.loc[goodset].append(topics_to_filter.loc[badset])

In [ ]:
train_df['is_ad'] = [0]*len(goodset)+[1]*len(badset)

In [ ]:
train_df = train_df.sample(frac=1)

In [ ]:
model.fit(train_df.drop('is_ad', axis=1), train_df.is_ad)

In [ ]:
preds = model.predict(topics_to_filter)

In [ ]:
preds.shape[0] - preds.sum()  # number of documents predicted as non-ads (is_ad == 0)

In [ ]:


In [ ]:
with open('../data/for_ivan_', 'w') as outcome:
    for url in topics_to_filter[preds < 0.5].index:
        outcome.write('%s\n'%url)

In [ ]:
import io

In [ ]:
with io.open('../data/big_to_download.json', 'w', encoding='utf8') as outcome:
    for _, data in df[preds < 0.5].iterrows():
        # img_url is already a column of df, so the row dict keeps the URL as-is
        outcome.write(u'%s\n' % json.dumps(data.to_dict()))

In [ ]: