import json, io, h5py
import pandas as pd
import numpy as np

import artm

%matplotlib inline
import seaborn as sns

from collections import Counter

from utils import sample_from

# Build class_names from the TSV: every line is split on tabs, each field is
# stripped, and dict() consumes the resulting pairs (so exactly two fields per
# line are expected).
with open('../data/insception-classes.tsv') as income:
    class_names = dict(map(lambda l: map(str.strip, l.split('\t')), income))

# The input file holds one JSON document per line; each becomes one row of df.
with open('../data/second_big_cleaned_andlenght-filtered.json') as income:
    df = pd.DataFrame(map(json.loads, income))

# Look up every row's img_url in the HDF5 file and wrap the result in a numpy
# array, storing it in the new 'classes' column.
# NOTE(review): h5py's `.get` presumably returns None for an unknown key and
# np.array(None) is a 0-d object array rather than NaN, so the dropna below
# may not remove rows that have no stored vector -- confirm.
with h5py.File("../data/img_url2inception.backup.h5", 'r') as hdf5_inception_dreams:
    %time df['classes'] = df.img_url.apply(hdf5_inception_dreams.get).apply(np.array) # Aware! Random disc access!

# Drop rows whose 'classes' value is missing.
df.dropna(axis=0, subset=['classes'], inplace=True)

df = df.sample(frac=1., random_state=42) # some shuffling

# Size of each of the two hold-out sets carved off the shuffled frame.
samples_size = 300

# First `samples_size` rows -> only_pics, the next `samples_size` rows ->
# only_texts, everything after that stays in df.
only_pics = df.iloc[:samples_size]
only_texts = df.iloc[samples_size:2*samples_size]
df = df.iloc[2*samples_size:]

# only_pics loses its 'tag' and 'text' columns; only_texts loses 'classes'.
# NOTE(review): these are in-place drops on slices of df; pandas may emit
# SettingWithCopyWarning here -- an explicit .copy() on the slices would avoid it.
only_pics.drop('tag', axis=1, inplace=True)
only_pics.drop('text', axis=1, inplace=True)

only_texts.drop('classes', axis=1, inplace=True)

# Visual sanity check of `sample_from` on a single row of df.classes.
# NOTE(review): df.classes[idx] is a label-based lookup; after the shuffle and
# the iloc split the label 1 may no longer be present in df -- confirm.
idx = 1
sns.plt.scatter(np.log(df.classes[idx]), zip(*sample_from(df.classes[idx]).items())[-1])

# NOTE(review): sample_from is called twice per bar plot (once for the keys,
# once for the values); if it is randomised the two halves come from different
# draws -- confirm against utils.sample_from.
for _ in range (3):
    sns.barplot(zip(*sample_from(df.classes[idx]).items())[0], zip(*sample_from(df.classes[idx]).items())[1] )

# Replace the 'text' and 'tag' values with Counter objects built from them.
df.text = df.text.apply(Counter)
only_texts.text = only_texts.text.apply(Counter)

df.tag = df.tag.apply(Counter)
only_texts.tag = only_texts.tag.apply(Counter)

# Replace the raw 'classes' value with the result of sample_from (used later
# through `.items()`, so presumably a mapping -- verify against utils.sample_from).
df.classes = df.classes.apply(sample_from)
only_pics.classes = only_pics.classes.apply(sample_from)

# Fields for which the "missing field" warning has already been printed.
once_awared = set()

try:  # Python 2
    _text_type = unicode
except NameError:  # Python 3: str is already a unicode string
    _text_type = str


def to_vw(row, key='img_url', fields=['tag', 'text', 'classes']):
    """Serialise one record into a single Vowpal Wabbit formatted line.

    The line starts with ``row[key]`` minus its first ``len('https://')``
    characters, followed by one ``|field tok:count tok:count ...`` section
    per non-empty field.

    Args:
        row: mapping-like record (dict or pandas Series) holding ``key`` and,
            optionally, each of ``fields``.
        key: name of the entry used as the document identifier.
        fields: modality names to emit, in order; every present value must be
            ``None`` or a mapping of token -> count.  The default list is
            only iterated, never mutated.

    Returns:
        The formatted line, terminated by a newline.

    A field missing from ``row`` is skipped, and a warning is printed the
    first time each such field name is encountered.
    """
    # NOTE(review): the prefix is cut by length, not by content, so a URL that
    # does not start with 'https://' loses its first 8 characters anyway.
    line = '%s ' % row[key][len('https://'):]
    for field in fields:
        if field not in row:
            if field not in once_awared:
                # Remember the field so the warning really is printed once.
                once_awared.add(field)
                print('WARNING: there is no %s field in the row %s' % (field, row[key]))
            # Nothing to serialise; indexing row[field] would raise.
            continue
        counts = row[field]
        if counts is None or len(counts) == 0:
            continue
        tokens = []
        for token, count in counts.items():
            # ':' separates token and weight in the output, so strip it.
            name = _text_type(token).replace(':', '')
            if count > 0 and len(name) > 0:
                tokens.append('%s:%i' % (name, count))
        line += '|%s ' % field
        line += '%s ' % ' '.join(tokens)
    return '%s\n' % line

# Three alternative dataset names left over from separate notebook cells.
# Each assignment overrides the previous one, so the last value wins when the
# file is run top to bottom.  The notebook cell markers are kept as comments
# because a bare `In [ ]:` is not valid Python.
dataset_name = '68743_of_tags_text_classes'

# In [ ]:
dataset_name = '17424_of_tags_text_classes'

# In [ ]:
dataset_name = 'to_filter_of_tags_text'

def prepare_for_artm(df, dataset_name):
    """Dump ``df`` to a Vowpal Wabbit file and convert it into ARTM batches.

    Writes ``../data/<dataset_name>.vw`` with one ``to_vw`` line per row, then
    asks ``artm.BatchVectorizer`` to build batches of 1000 documents in
    ``../data/<dataset_name>_batches``.

    Args:
        df: DataFrame whose rows are accepted by ``to_vw``.
        dataset_name: base name used for both output paths.
    """
    vw_path = '../data/%s.vw' % dataset_name
    # The `with` header and the loop body were damaged in the source; they are
    # reconstructed from the surviving `'w', encoding='utf8'` arguments and
    # from the `to_vw` serialiser defined for exactly this row format.
    with io.open(vw_path, 'w', encoding='utf8') as outcome:
        for k, row in df.iterrows():
            # NOTE(review): io.open in text mode needs unicode on Python 2;
            # to_vw is presumably fed unicode tokens -- confirm.
            outcome.write(to_vw(row))
    artm_path = '../data/%s_batches' % dataset_name
    # Constructed for its side effect of writing the batches to target_folder.
    artm.BatchVectorizer(target_folder=artm_path, data_path=vw_path, data_format='vowpal_wabbit', batch_size=1000)

# Write the VW files and ARTM batches for the training set and for the two
# hold-out sets (text-only and picture-only).
prepare_for_artm(df, 'main_trainset')

prepare_for_artm(only_texts, 'only_text_testset')

prepare_for_artm(only_pics, 'only_pics_testset')

# From here on the training set is the active dataset.
dataset_name = 'main_trainset'

# Same path pattern prepare_for_artm uses for its target folder.
artm_path = '../data/%s_batches'%dataset_name

# Re-open the batches written above as the training input.
batch_to_train = artm.BatchVectorizer(data_path=artm_path)

# Чищу словарь. Закройте глаза. (Cleaning up the dictionary. Look away.)

from nltk.corpus import stopwords
# from pymorphy2 import MorphAnalyzer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import RussianStemmer

# NLTK stop-word lists for both languages present in the texts.
rus_stop_words = set(stopwords.words('russian'))
en_stop_words = set(stopwords.words('english'))

# Frequency-based filtering of the batch dictionary for the 'text', 'tag' and
# 'classes' modalities ('tag' gets no upper document-frequency bound).
batch_to_train.dictionary.filter('text', min_df=2, max_df_rate=0.5)
batch_to_train.dictionary.filter('tag', min_df=2)
batch_to_train.dictionary.filter('classes', min_df=2, max_df_rate=0.5)

# Read a text dump of the dictionary: the first line is skipped and fields are
# separated by ', '.
# NOTE(review): nothing visible here writes '../data/dict.csv'; presumably it
# was exported from batch_to_train.dictionary in a cell that is not shown.
artm_dict = pd.read_csv('../data/dict.csv', skiprows=1, sep=', ')

# Decode the token column from UTF-8 byte strings to unicode (Python 2).
artm_dict.token = artm_dict.token.apply(lambda w: unicode(w.decode('utf8')))

stemmer = PorterStemmer()
ru_stemmer = RussianStemmer()

# Remove tokens equal to a stemmed English stop word; the shape is printed
# before and after to show how many rows were removed.
print artm_dict.shape
for stoplist in [map(stemmer.stem, en_stop_words)]:
    artm_dict = artm_dict[~artm_dict.token.isin(stoplist)]
print artm_dict.shape

# `MorphAnalyzer` is not imported (the pymorphy2 import is commented out) and
# `morph` is not used by the visible code, so the instantiation is disabled
# instead of raising NameError.
# morph = MorphAnalyzer()

# Remove tokens equal to a stemmed Russian stop word.
stoplist = {ru_stemmer.stem(tok) for tok in rus_stop_words}
artm_dict = artm_dict[~artm_dict.token.isin(stoplist)]

print(artm_dict.shape)

# Drop 'text' tokens shorter than two characters.
artm_dict = artm_dict[~((artm_dict.class_id == 'text') & (artm_dict.token.str.len()<2))]

filtered_path = '../data/dict.filtered'

# Write the filtered table as CSV with a header row and without the index.
artm_dict.to_csv(filtered_path, encoding='utf8', index=None, header=True)

# Shell post-processing of the CSV:
#   1. write a one-line "name: ..." header to a temp file;
#   2. copy the CSV aside;
#   3. turn every ',<word char>' into ', <word char>' -- with `-n` and the `p`
#      flag sed emits only the lines where a substitution happened;
#   4. concatenate header + rewritten body back into filtered_path.
! echo "name: en_ru_stops_filtered" > /tmp/header_of_dict
! cp $filtered_path /tmp/full.dict
! cat /tmp/full.dict | sed -ne 's/,\(\w\)/, \1/gp' > /tmp/full.dict.filtered
! cat /tmp/header_of_dict /tmp/full.dict.filtered > $filtered_path

# Remove the temporary copies (the header temp file is left in place).
! rm /tmp/full.dict.filtered /tmp/full.dict

# Load the filtered dictionary.

# Load the dictionary from its text form; falls back to the default path when
# filtered_path is None.
# NOTE(review): cleaned_dict is not referenced again in the visible code
# (it is not passed to the model) -- confirm whether that is intended.
cleaned_dict = artm.Dictionary()
cleaned_dict.load_text(filtered_path if filtered_path is not None else '../data/dict.filtered')


# Topic layout: `sparsed_topics` topics named 'good_*' followed by
# `smoothed_topics` topics named 'mess_*'.
sparsed_topics = 39
smoothed_topics = 5

topics = ['good_%i'%_ for _ in range(sparsed_topics)] + ['mess_%s'%_ for _ in range(smoothed_topics)]

tm = artm.ARTM(topic_names=topics, num_processors=2)


# Phi regularizers, one per modality: negative tau for 'text' and 'classes',
# positive tau for 'tag'.
tm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='text_sparser', tau=-0.7, class_ids=['text']))

tm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='classes_sparser', tau=-0.3, class_ids=['classes']))

tm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='tags_smoother', tau=1, class_ids=['tag']))

# Theta regularizers: negative tau on the 'good_*' topics ...
tm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='topic_sparser', tau=-2, topic_names=topics[:sparsed_topics]))

# ... and positive tau on the 'mess_*' topics only.  The slice must start at
# `sparsed_topics`; the previous `topics[smoothed_topics:]` started at index 5
# and therefore also covered most of the 'good_*' topics.
tm.regularizers.add(artm.SmoothSparseThetaRegularizer(name='topic_smoother', tau=2.5, topic_names=topics[sparsed_topics:]))

# Scores tracked during training: perplexity over all three modalities and
# top tokens for each modality.
tm.scores.add(artm.PerplexityScore(name='perp', class_ids=['text', 'tag', 'classes']))

tm.scores.add(artm.TopTokensScore(name='top_words', class_id='text', num_tokens=250))

tm.scores.add(artm.TopTokensScore(name='top_tags', class_id='tag'))

tm.scores.add(artm.TopTokensScore(name='top_classes', class_id='classes', num_tokens=50))

# Initial training run (second argument is presumably the number of
# collection passes -- confirm against the artm API).
%time tm.fit_offline(batch_to_train, 10)

# Add a topic-selection regularizer over all topics, starting at tau=0.4.
tm.regularizers.add(artm.TopicSelectionThetaRegularizer(name='selector', tau=0.4, topic_names=topics))

# Raise the selector's tau gradually: ten steps of +0.02 (0.4 -> 0.6), each
# followed by a short fit ...
for _ in range(10):
    tm.regularizers['selector'].tau +=0.02
    tm.fit_offline(batch_to_train, 2)

# ... then four steps of +0.05 (0.6 -> 0.8), each followed by a longer fit.
for _ in range(4):
    tm.regularizers['selector'].tau +=0.05
    tm.fit_offline(batch_to_train, 5)

# Final run with tau left at its last value.
%time tm.fit_offline(batch_to_train, 5)

# End of ARTM training.

# Tracker of the score registered under the name 'perp'.
perplexity_score = tm.score_tracker['perp']

def get_weighted(modality_score_tracker):
    """Turn a top-tokens score tracker into a topic x token weight table.

    Args:
        modality_score_tracker: object exposing ``last_tokens`` and
            ``last_weights``, two mappings keyed by topic name whose values
            are parallel sequences of tokens and weights.

    Returns:
        A DataFrame indexed by topic name (sorted) with one column per token.
        Only strictly positive weights are kept; every other cell is 0.
        An empty tracker yields an empty DataFrame instead of raising.
    """
    last_tokens = modality_score_tracker.last_tokens
    last_weights = modality_score_tracker.last_weights

    topics = sorted(last_tokens.keys())
    # One {token: weight} dict per topic, in the same order as `topics`.
    # Built as real lists so the result does not depend on zip/map returning
    # lists and so that an empty tracker needs no tuple unpacking.
    weights = [
        dict((word, v)
             for word, v in zip(last_tokens[k], last_weights[k])
             if v > 0)
        for k in topics
    ]
    return pd.DataFrame(weights, index=topics).fillna(0).sort_index()

def get_top(row, treshold=0.95, number=5):
    """Return the heaviest entries of ``row`` as ``(label, value)`` pairs.

    Entries are taken in descending order of value until either the
    accumulated value exceeds ``treshold`` or ``number`` entries have been
    collected.  In the second case a final ``('other', 1 - accumulated)``
    pair is appended.

    Args:
        row: pandas Series of weights.
        treshold: accumulated weight above which collection stops
            (the parameter name keeps its original spelling for callers).
        number: maximum number of real entries to collect.

    Returns:
        List of ``(label, value)`` tuples; empty for an empty ``row``.
    """
    sorted_row = row.sort_values(ascending=False)
    res = []
    prob_mass = 0
    # `.items()` rather than `.iteritems()`: the latter is gone in pandas 2.
    for k, val in sorted_row.items():
        res.append((k, val))
        # The source had lost the accumulation and both loop exits; without
        # them the threshold test could never fire and the loop never stopped.
        prob_mass += val
        if prob_mass > treshold:
            break
        if len(res) == number:
            res.append(('other', (1. - prob_mass)))
            break
    return res

def print_modality_top(mod_name, subst_name=None, thresold=0.95, max_items=5, need_print = False):
    """Compute, and optionally print, the top tokens of every topic.

    Reads the score tracker ``tm.score_tracker[mod_name]`` of the module-level
    model ``tm``, converts it with ``get_weighted`` and reduces each topic row
    with ``get_top``.

    Args:
        mod_name: name of a top-tokens score registered on ``tm``.
        subst_name: optional mapping token -> display name; tokens missing
            from it are printed as ``'empty_subst'``.
        thresold: forwarded to ``get_top`` as its threshold (the parameter
            name keeps its original spelling for callers).
        max_items: forwarded to ``get_top`` as its item limit.
        need_print: when true, print one line per topic.

    Returns:
        The per-topic result of ``get_top`` as produced by ``DataFrame.apply``.
    """
    weighted = get_weighted(tm.score_tracker[mod_name])
    topic_words = weighted.apply(lambda r: get_top(r, thresold, max_items), axis=1)
    if need_print:
        # `.items()` and the function form of print work on both Python 2
        # and Python 3 / pandas 2, unlike `iteritems` and the print statement.
        for t, words in topic_words.items():
            labelled = []
            for name, value in words:
                if value > 0:
                    label = name if subst_name is None else subst_name.get(name, 'empty_subst')
                    labelled.append('%s:%3.3f' % (label, value))
            print('%s: %s' % (t, ',\t'.join(labelled)))
    return topic_words

# Print the 15 heaviest 'text' tokens of every topic.
print_modality_top('top_words', max_items=15, need_print=True)

# Compute the top 'classes' tokens per topic, with class ids mapped to
# readable names through class_names (nothing is printed: need_print is False).
print_modality_top('top_classes', subst_name=class_names)

# NOTE(review): the next cell was damaged in the source -- only the cell
# marker and the argument `'../data/big_model.artm.mtx')` survive, which is
# not valid Python.  It presumably saved the model; the call name is lost, so
# the line is kept disabled until confirmed.
# In [ ]: tm.save('../data/big_model.artm.mtx')

def save_modality_mat(mat, name, out_dir='../data'):
    """Save a matrix both as a tab-separated CSV and as JSON lines.

    Writes ``<out_dir>/<name>.csv`` (tab-separated, UTF-8) and
    ``<out_dir>/<name>.json`` where every line is a JSON object holding the
    row's values as floats plus the row label under ``'token'``.

    Args:
        mat: DataFrame to export; all values must be convertible to float.
        name: base file name, without extension.
        out_dir: target directory; defaults to the original ``'../data'``.
    """
    mat.to_csv('%s/%s.csv' % (out_dir, name), sep='\t', encoding='utf8')

    # The `io.open(` part of this header was lost in the source; it is
    # restored from the surviving `'w', encoding='utf8'` arguments.
    with io.open('%s/%s.json' % (out_dir, name), 'w', encoding='utf8') as outcome:
        for u, data in mat.iterrows():
            # float() turns numpy scalars into values json can serialise.
            ans = dict((k, float(v)) for k, v in data.to_dict().items())
            ans['token'] = u
            outcome.write(u'%s\n' % json.dumps(ans))

# Phi matrix restricted to the 'text' modality, over all topics.
phi_mat_text = tm.get_phi(topic_names=topics, class_ids=['text'])

# Notebook display: number of rows whose sum over the columns is below 1e-9.
(phi_mat_text.sum(axis=1) < 10**-9).sum()

# Notebook display: per column, whether its total exceeds 0.5.
phi_mat_text.sum() > 0.5

save_modality_mat(phi_mat_text, 'phi_text')

# Same export for the 'classes' modality ...
phi_mat_classes = tm.get_phi(topic_names=topics, class_ids=['classes'])

save_modality_mat(phi_mat_classes, 'phi_classes')

# ... and for the 'tag' modality.
phi_mat_tag = tm.get_phi(topic_names=topics, class_ids=['tag'])

save_modality_mat(phi_mat_tag, 'phi_tag')

# Дальше лучше не читать, оно там просто валяется. (Better not to read past this point; what follows is just leftover scratch code.)

# Open the batches written earlier for the two hold-out sets.
only_pic_batch = artm.BatchVectorizer(data_path='../data/only_pics_testset_batches')

only_text_batch = artm.BatchVectorizer(data_path='../data/only_text_testset_batches')

# Run the trained model over the picture-only batches, asking for predictions
# in the 'text' modality.
only_pic_topics = tm.transform(only_pic_batch, predict_class_id='text')

# For every column of the result, print its get_top() entries as
# "label-value" pairs joined by ", ".
for _,v in only_pic_topics.apply(lambda x: get_top(x), axis=0).iteritems():
    print ', '.join('-'.join(map(unicode, r)) for r in v)

# Empty frame indexed by image URL (not filled in the visible code).
res = pd.DataFrame(index=df.img_url)

# Second model with 44 topics (= sparsed_topics + smoothed_topics) and the
# same phi/theta regularizers as `tm`.
# NOTE(review): tm2 is created from a topic count, not from `topics`, so the
# topic names passed to the theta regularizers below may not match the names
# tm2 actually uses -- confirm, or build it with topic_names=topics.
tm2 = artm.ARTM(44)

tm2.regularizers.add(artm.SmoothSparsePhiRegularizer(name='text_sparser', tau=-0.7, class_ids=['text']))
tm2.regularizers.add(artm.SmoothSparsePhiRegularizer(name='classes_sparser', tau=-0.3, class_ids=['classes']))
tm2.regularizers.add(artm.SmoothSparsePhiRegularizer(name='tags_smoother', tau=1, class_ids=['tag']))
tm2.regularizers.add(artm.SmoothSparseThetaRegularizer(name='topic_sparser', tau=-2, topic_names=topics[:sparsed_topics]))
# The smoother applies to the trailing 'mess_*' topics only; the previous
# slice `topics[smoothed_topics:]` started at index 5 and overlapped the
# topics handled by 'topic_sparser'.
tm2.regularizers.add(artm.SmoothSparseThetaRegularizer(name='topic_smoother', tau=2.5, topic_names=topics[sparsed_topics:]))

# Index labels of df rows sampled as negative (badset) and positive (goodset)
# examples, chosen by the presence of marker words in the row's text Counter.
badset = set()

goodset = set()

# Markers whose matching rows are sampled at 1% (with replacement) into badset.
very_bad_words = [u'арт', u'артикул', u'app2255775', u'app4216068']

for word in very_bad_words:
    print 'for', word, 'there', df.text.apply(lambda l: word in l).sum(), 'records'
    badset.update(df[df.text.apply(lambda l: word in l)].sample(frac=0.01, replace=True).index)

# Markers from which a fixed 100 rows each (with replacement) go into badset.
bad_words = [u'продать', u'продажа', u'склад', u'товар', u'прокат', u'ретушь', u'сантиметр']

for word in bad_words:
    print 'for', word, 'there', df.text.apply(lambda l: word in l).sum(), 'records'
    badset.update(df[df.text.apply(lambda l: word in l)].sample(100, replace=True).index)

# Markers whose matching rows are sampled at 10% (with replacement) into
# goodset.  NOTE(review): u'дружба' is listed twice, so it is sampled twice.
good_words = [u'замок', u'дружба', u'семья', u'дружба', 
              u'работа', u'поехать', u'отель', u'остров', 
              u'гулять', u'сдать', u'повезти', u'плавать',
              u'восхитительно', u'смеяться', u'уехать']

# NOTE(review): nothing prevents a row from landing in both goodset and badset.
for word in good_words:
    print 'for', word, 'there', df.text.apply(lambda l: word in l).sum(), 'records'
    goodset.update(df[df.text.apply(lambda l: word in l)].sample(frac=0.1, replace=True).index)

from sklearn.svm import SVC

# Linear SVM that separates "ad" documents (badset) from the rest (goodset)
# using the documents' topic vectors as features.
model = SVC(kernel='linear', probability=True)

# NOTE(review): batch_to_predict is not defined in the visible code.
topics_to_filter = tm.transform(batch_to_predict).T


# NOTE(review): the frame is re-indexed by image URL here, while goodset and
# badset hold labels taken from df.index -- confirm the two label spaces
# agree, otherwise the .loc lookups below cannot find their rows.
topics_to_filter.index = df.img_url

# The original line here was `topics_to_filter = topics`, which replaced the
# DataFrame with the list of topic names and broke every use below.  It is
# disabled; if the intent was to rename the columns, use
# `topics_to_filter.columns = topics`.
# topics_to_filter = topics

train_df = topics_to_filter.loc[goodset].append(topics_to_filter.loc[badset])

# Labels follow the concatenation order: good rows first (0), bad rows next (1).
train_df['is_ad'] = [0]*len(goodset)+[1]*len(badset)

# Shuffle the training rows.
train_df = train_df.sample(frac=1)

# The source line was damaged (only `'is_ad', axis=1), train_df.is_ad)`
# survived); reconstructed as a fit on all columns except the label.
model.fit(train_df.drop('is_ad', axis=1), train_df.is_ad)

preds = model.predict(topics_to_filter)

# Notebook display: number of documents predicted as class 0.
preds.shape[0] - (preds).sum()

# Write the index labels of all documents predicted as class 0, one per line.
# The loop body was missing in the source (a syntax error); writing the label
# is the reconstruction the surrounding code suggests.
with open('../data/for_ivan_', 'w') as outcome:
    for url in topics_to_filter[preds < 0.5].index:
        outcome.write('%s\n' % url)

import io

# Dump the full records of the same documents as JSON lines.  The `io.open(`
# part of the header was lost in the source and is restored from the
# surviving `'w', encoding='utf8'` arguments.
with io.open('../data/big_to_download.json', 'w', encoding='utf8') as outcome:
    for u, data in df[preds < 0.5].iterrows():
        ans = data.to_dict()
        # NOTE(review): this overwrites the record's img_url with the df index
        # label; correct only if df is indexed by URL at this point -- confirm.
        ans['img_url'] = u
        outcome.write(u'%s\n'% json.dumps(ans))

