In [1]:
import json

import pandas as pd
import numpy as np

In [5]:
import sys, os
sys.path.append('bigartm-0.8.2-py2.7.egg')

In [6]:
from gensim.corpora import dictionary
from sklearn.svm import SVC

from random import sample

import re
from nltk import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from pymorphy2 import MorphAnalyzer

In [7]:
from utils import get_text_processor

In [8]:
hash_tags_regex = r'#[^#\s]*'
hash_tags_tokenizer = RegexpTokenizer(hash_tags_regex, gaps=False)

text_processing = get_text_processor()

def get_tags_and_process_text(text):
    tokens = hash_tags_tokenizer.tokenize(text)
    other = re.sub(hash_tags_regex, '', text)
    return tokens, list(text_processing(other))
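
A quick sanity check of the splitter on a made-up caption (the exact normalized tokens depend on what get_text_processor returns, so the printed values are only indicative):

In [ ]:
tags, tokens = get_tags_and_process_text(u'morning in the mountains #sunrise #nofilter')
print tags    # hashtags picked up by the RegexpTokenizer: [u'#sunrise', u'#nofilter']
print tokens  # the remaining text with hashtags stripped, normalized by text_processing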

In [23]:
def do_all_with_line(line):
    try:
        doc = json.loads(line)

        res = get_tags_and_process_text(doc['text'])

        is_trash_predicted = len(res[-1]) < 20  # or model.predict(np.asmatrix(to_vector(gd.doc2bow(res[-1]))))

        return None if is_trash_predicted else (doc['img_url'], res)
    except Exception as e:
        print e
        return None
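
For illustration, the filter can be exercised on a hand-written record (the img_url and text fields mirror the dump format; the values below are made up):

In [ ]:
sample_line = json.dumps({'img_url': 'http://example.com/photo.jpg',
                          'text': u'a short caption #tag'})
print do_all_with_line(sample_line)  # None, since fewer than 20 tokens survive processing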

In [24]:
from joblib import Parallel, delayed
from tqdm import tqdm_notebook
import io

TODO: don't forget to make the records unique


In [ ]:
%%time
with open('../data/up_sample_gena.json') as income:
    data = Parallel(n_jobs=2)(delayed(do_all_with_line)(line) for line in tqdm_notebook(income))

In [16]:
%%time
with open('/home/shtechgen/study_space/annotator/data/4gena/x0000_uids') as income:
    data = Parallel(n_jobs=2)(delayed(do_all_with_line)(line) for line in tqdm_notebook(income))


CPU times: user 7.28 s, sys: 504 ms, total: 7.79 s
Wall time: 17.9 s

In [25]:
def get_all_from_dir(path):
    paths = sorted([os.path.join(path, filename) for filename in os.listdir(path)])
    for file_path in tqdm_notebook(paths):
        with io.open(file_path, encoding='utf8') as income:
            for line in income:
                yield line

In [26]:
data = Parallel(n_jobs=2)(delayed(do_all_with_line)(line) 
                          for line 
                          in tqdm_notebook(get_all_from_dir('/home/shtechgen/study_space/annotator/data/4gena/')))



string index out of range

In [29]:
import itertools as it

In [31]:
with io.open('/home/shtechgen/study_space/annotator/data/second_data_wave.json', 'w', encoding='utf8') as outcome:
    for line in tqdm_notebook(filter(lambda x: x is not None, data)):
        outcome.write(u'%s\n'%json.dumps(line))

In [ ]:
%%time
with open('../data/users_photos.full.backup.json') as income,\
        io.open('../data/big_clean_data.json', 'w', encoding='utf8') as outcome:
    for line in tqdm_notebook(Parallel(n_jobs=2)(delayed(do_all_with_line)(line) for line in tqdm_notebook(income))):
        if line is not None:
            outcome.write(u'%s\n'%json.dumps(line))

In [ ]:
import itertools as it

from tqdm import tqdm_notebook

In [ ]:
%%time
with open('../data/users_photos.full.backup.json') as income,\
        io.open('../data/big_clean_data.json', 'w', encoding='utf8') as outcome:
    for line in tqdm_notebook(it.imap(do_all_with_line, income)):
        if line is not None:
            outcome.write(u'%s\n'%json.dumps(line))

In [ ]:
import io

In [ ]:
import h5py

In [ ]:
with io.open('../data/big_clean_data.json') as income:
    long_enough = map(json.loads, income)

In [ ]:
len(long_enough)

In [ ]:
with open('../data/big_cleaned_andlenght-filtered.json') as income:
    df = pd.DataFrame(map(json.loads, income))

with h5py.File("../data/img_url2inception.backup.h5", 'r') as hdf5_inception_dreams:
    %time df['classes'] = df.img_url.apply(hdf5_inception_dreams.get).apply(np.array)  # Beware! Random disk access!

df.dropna(axis=0, subset=['classes'], inplace=True)

In [ ]:
interesting = set(df.img_url)

In [ ]:
filtered_raw = filter(lambda row: row[0] in interesting, long_enough)

In [ ]:
len(filtered_raw)

In [ ]:
long_enough = filtered_raw

In [ ]:
def do_all_with_line(line):
    doc = json.loads(line)
    
    is_trash_predicted = doc['img_url'] not in interesting  # or model.predict(np.asmatrix(to_vector(gd.doc2bow(res[-1]))))
    
    return None if is_trash_predicted else doc

In [ ]:
from tqdm import tqdm_notebook
import itertools as it

In [ ]:
%%time

res = []

with open('../data/users_photos.full.backup.json') as income:
    for line in tqdm_notebook(it.imap(do_all_with_line, income)):
        if line is not None:
            res.append(line)

In [ ]:
alt_long_enough = map(lambda doc: (doc['img_url'], get_tags_and_process_text(doc['text'])), tqdm_notebook(res))

In [ ]:
long_enough = alt_long_enough

In [ ]:
# long_enough = [d for d in data if len(d[-1][-1]) > 33]

long_enough_df = pd.DataFrame(map(list, zip(*long_enough)[-1]))

long_enough_df.index = zip(*long_enough)[0]
long_enough_df.columns = ['tag', 'text']

long_enough_df = long_enough_df.reset_index().drop_duplicates('index').set_index('index')

In [ ]:
gd = dictionary.Dictionary(documents=long_enough_df.text)

gd.filter_extremes()

gd.compactify()

In [ ]:
goods, bads = set(), set()

In [ ]:
def add_to(name, setting, what):
    print 'adding "%s"' % what.strip()
    setting.add(what.strip())
    print 'length of %s is %i'%(name, len(setting))

In [ ]:
for u, d in sample(long_enough, 1):
    print u, ', '.join(d[-1])
    print

In [ ]:
add_to('bads', bads, u)

In [ ]:
add_to('goods', goods, u)

In [ ]:
def to_vector(bow):
    bow = dict(bow)
    return [bow.get(_, 0) for _ in range(len(gd)+1)]
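
A small check of the bag-of-words densification (the dense vector has len(gd)+1 slots, one per dictionary id):

In [ ]:
bow = gd.doc2bow(long_enough_df.text.iloc[0])   # sparse (id, count) pairs for one document
vec = to_vector(bow)
print len(vec), sum(vec)                        # dense length and total token count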

In [ ]:
model = SVC(kernel='linear', probability=True, random_state=42)

In [ ]:
model.fit(np.array(map(list, long_enough_df.loc[bads].append(long_enough_df.loc[goods])\
                       .text.apply(gd.doc2bow).apply(to_vector))), [1]*len(bads)+[0]*len(goods))

In [ ]:
preds = long_enough_df.text.apply(gd.doc2bow).apply(to_vector).apply(lambda x: model.predict(np.matrix(x)))

In [ ]:
add_to('goods', goods, 'https://pp.vk.me/c222/v222237/2ec/VuHMGBXxSJ4.jpg')

In [ ]:
add_to('bads', bads, 'https://pp.vk.me/c836121/v836121322/1674e/mz9wrbfnuYU.jpg')

In [ ]:
sum(preds.apply(lambda x: x[0]) < 0.5)

In [ ]:
long_enough_df.loc[preds.apply(lambda x: x[0]) < 0.5].sample(replace=True, n=10)

In [ ]:
print ', '.join(long_enough_df.loc['https://pp.vk.me/c222/v222237/2ec/VuHMGBXxSJ4.jpg'].text)

In [ ]:


In [ ]:
with open('../data/bads', 'w') as outcome:
    for bad in bads:
        outcome.write('%s\n'%bad)

In [ ]:
with open('../data/bads') as income:
    bads = set(map(str.strip, income))

In [ ]:
with open('../data/goods', 'w') as outcome:
    for good in goods:
        outcome.write('%s\n'%good)

In [ ]:
with open('../data/goods') as income:
    goods = set(map(str.strip, income))

In [ ]:


In [ ]:


In [ ]:
import io

In [ ]:
with io.open('../data/big_cleaned_andlenght-filtered.json', 'w', encoding='utf8') as outcome:
    for u, data in long_enough_df.iterrows():
        ans = data.to_dict()
        ans['img_url'] = u
        outcome.write(u'%s\n'% json.dumps(ans))

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: