In [1]:
import json
import pandas as pd
import numpy as np
In [5]:
import sys, os
sys.path.append('bigartm-0.8.2-py2.7.egg')
In [6]:
from gensim.corpora import dictionary  # needed later for dictionary.Dictionary(...)
from sklearn.svm import SVC
from random import sample
import re
from nltk import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from pymorphy2 import MorphAnalyzer
In [7]:
from utils import get_text_processor
In [8]:
hash_tags_regex = r'#[^#\s]*'
hash_tags_tokenizer = RegexpTokenizer(hash_tags_regex, gaps=False)
text_processing = get_text_processor()

def get_tags_and_process_text(text):
    # split a post into its hashtags and the processed remaining text
    tokens = hash_tags_tokenizer.tokenize(text)
    other = re.sub(hash_tags_regex, '', text)
    return tokens, list(text_processing(other))
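In [ ]:
# Quick illustrative check (not part of the original run): hashtags are pulled out
# by the regex, the rest of the text goes through the processor from utils, whose
# exact output (tokenisation/lemmatisation) is an assumption here.
tags, words = get_tags_and_process_text(u'my cat photo #cat #cute so fluffy')
print tags    # expected: [u'#cat', u'#cute']
print words   # whatever get_text_processor() yields for the remaining words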
In [23]:
def do_all_with_line(line):
    try:
        doc = json.loads(line)
        res = get_tags_and_process_text(doc['text'])
        # treat posts with fewer than 20 processed tokens as trash
        is_trash_predicted = len(res[-1]) < 20  # or model.predict(np.asmatrix(to_vector(gd.doc2bow(res[-1]))))
        return None if is_trash_predicted else (doc['img_url'], res)
    except Exception as e:
        print e
        return None
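In [ ]:
# Illustrative call on a made-up line: each JSON record is expected to carry
# 'text' and 'img_url'; anything whose processed text is shorter than 20 tokens
# is treated as trash and returned as None, to be dropped downstream.
print do_all_with_line(json.dumps({'img_url': 'http://example.com/1.jpg',
                                   'text': u'too short #test'}))  # -> None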
In [24]:
from joblib import Parallel, delayed
from tqdm import tqdm_notebook
import io
In [ ]:
%%time
with open('../data/up_sample_gena.json') as income:
    data = Parallel(n_jobs=2)(delayed(do_all_with_line)(line) for line in tqdm_notebook(income))
In [16]:
%%time
with open('/home/shtechgen/study_space/annotator/data/4gena/x0000_uids') as income:
    data = Parallel(n_jobs=2)(delayed(do_all_with_line)(line) for line in tqdm_notebook(income))
In [25]:
def get_all_from_dir(path):
    # iterate over every file in the directory, yielding its lines one by one
    paths = sorted([os.path.join(path, filename) for filename in os.listdir(path)])
    for filepath in tqdm_notebook(paths):
        with io.open(filepath, encoding='utf8') as income:
            for line in income:
                yield line
In [26]:
data = Parallel(n_jobs=2)(delayed(do_all_with_line)(line)
                          for line
                          in tqdm_notebook(get_all_from_dir('/home/shtechgen/study_space/annotator/data/4gena/')))
In [29]:
import itertools as it
In [31]:
with io.open('/home/shtechgen/study_space/annotator/data/second_data_wave.json', 'w', encoding='utf8') as outcome:
    for line in tqdm_notebook(filter(lambda x: x is not None, data)):
        outcome.write(u'%s\n' % json.dumps(line))
In [ ]:
%%time
with open('../data/users_photos.full.backup.json') as income,\
     io.open('../data/big_clean_data.json', 'w', encoding='utf8') as outcome:
    for line in tqdm_notebook(Parallel(n_jobs=2)(delayed(do_all_with_line)(line) for line in tqdm_notebook(income))):
        if line is not None:
            outcome.write(u'%s\n' % json.dumps(line))
In [ ]:
import itertools as it
from tqdm import tqdm_notebook
In [ ]:
%%time
with open('../data/users_photos.full.backup.json') as income,\
     io.open('../data/big_clean_data.json', 'w', encoding='utf8') as outcome:
    for line in tqdm_notebook(it.imap(do_all_with_line, income)):
        if line is not None:
            outcome.write(u'%s\n' % json.dumps(line))
In [ ]:
import io
In [ ]:
import h5py
In [ ]:
with io.open('../data/big_clean_data.json') as income:
    long_enough = map(json.loads, income)
In [ ]:
len(long_enough)
In [ ]:
with open('../data/big_cleaned_andlenght-filtered.json') as income:
    df = pd.DataFrame(map(json.loads, income))
with h5py.File("../data/img_url2inception.backup.h5", 'r') as hdf5_inception_dreams:
    %time df['classes'] = df.img_url.apply(hdf5_inception_dreams.get).apply(np.array)  # beware: random disk access!
df.dropna(axis=0, subset=['classes'], inplace=True)
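In [ ]:
# Note on the lookup above: h5py's Group.get() behaves like dict.get(), handing
# back None for img_urls that have no stored Inception vector. A quick spot
# check on a single url (illustrative only):
with h5py.File("../data/img_url2inception.backup.h5", 'r') as h5:
    print h5.get(df.img_url.iloc[0]) is not None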
In [ ]:
interesting = set(df.img_url)
In [ ]:
filtered_raw = filter(lambda row: row[0] in interesting, long_enough)
In [ ]:
len(filtered_raw)
In [ ]:
long_enough = filtered_raw
In [ ]:
def do_all_with_line(line):
    # second pass: keep only documents whose img_url has an Inception vector
    doc = json.loads(line)
    is_trash_predicted = doc['img_url'] not in interesting  # or model.predict(np.asmatrix(to_vector(gd.doc2bow(res[-1]))))
    return None if is_trash_predicted else doc
In [ ]:
from tqdm import tqdm_notebook
import itertools as it
In [ ]:
%%time
res = []
with open('../data/users_photos.full.backup.json') as income:
    for line in tqdm_notebook(it.imap(do_all_with_line, income)):
        if line is not None:
            res.append(line)
In [ ]:
alt_long_enough = map(lambda doc: (doc['img_url'], get_tags_and_process_text(doc['text'])), tqdm_notebook(res))
In [ ]:
long_enough = alt_long_enough
In [ ]:
# long_enough = [d for d in data if len(d[-1][-1]) > 33]
long_enough_df = pd.DataFrame(map(list, zip(*long_enough)[-1]))
long_enough_df.index = zip(*long_enough)[0]
long_enough_df.columns = ['tag', 'text']
long_enough_df = long_enough_df.reset_index().drop_duplicates('index').set_index('index')
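In [ ]:
# The zip(*long_enough) trick above transposes [(img_url, (tags, text)), ...]
# into ([img_url, ...], [(tags, text), ...]); a tiny stand-alone illustration:
print zip(*[('u1', ('t1', 'x1')), ('u2', ('t2', 'x2'))])
# -> [('u1', 'u2'), (('t1', 'x1'), ('t2', 'x2'))]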
In [ ]:
gd = dictionary.Dictionary(documents=long_enough_df.text)
gd.filter_extremes()
gd.compactify()
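In [ ]:
# filter_extremes() above runs with gensim's defaults (roughly: drop tokens seen
# in fewer than 5 documents or in more than half of them, cap the vocabulary at
# 100k); tuning those thresholds is optional. doc2bow then maps a token list to
# sparse (token_id, count) pairs, which to_vector() densifies further down:
print gd.doc2bow(long_enough_df.text.iloc[0])[:10]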
In [ ]:
goods, bads = set(), set()
In [ ]:
def add_to(name, setting, what):
    # helper for interactive labelling: add a stripped url to the chosen set and report its size
    print 'adding "%s"' % what.strip()
    setting.add(what.strip())
    print 'length of %s is %i' % (name, len(setting))
In [ ]:
for u, d in sample(long_enough, 1):
    print u, ', '.join(d[-1])
    print
In [ ]:
add_to('bads', bads, u)
In [ ]:
add_to('goods', goods, u)
In [ ]:
def to_vector(bow):
    # expand a sparse gensim bag-of-words into a dense count vector
    bow = dict(bow)
    return [bow.get(_, 0) for _ in range(len(gd)+1)]
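In [ ]:
# to_vector() pads the sparse pairs out to a fixed-length dense count vector
# (length len(gd)+1), so every document lines up column-wise for the SVM.
# Tiny illustration with made-up ids:
print to_vector([(0, 2), (3, 1)])[:5]  # -> [2, 0, 0, 1, 0]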
In [ ]:
model = SVC(kernel='linear', probability=True, random_state=42)
In [ ]:
model.fit(np.array(map(list, long_enough_df.loc[bads].append(long_enough_df.loc[goods])\
                             .text.apply(gd.doc2bow).apply(to_vector))),
          [1]*len(bads) + [0]*len(goods))  # bads are labelled 1, goods 0
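In [ ]:
# Optional alternative to the hard labels used below: because the SVC was built
# with probability=True, predict_proba() is also available; columns follow
# model.classes_, i.e. [0 (good), 1 (bad)]. Purely illustrative.
print model.predict_proba(np.matrix(to_vector(gd.doc2bow(long_enough_df.text.iloc[0]))))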
In [ ]:
preds = long_enough_df.text.apply(gd.doc2bow).apply(to_vector).apply(lambda x: model.predict(np.matrix(x)))
In [ ]:
add_to('goods', goods, 'https://pp.vk.me/c222/v222237/2ec/VuHMGBXxSJ4.jpg')
In [ ]:
add_to('bads', bads, 'https://pp.vk.me/c836121/v836121322/1674e/mz9wrbfnuYU.jpg')
In [ ]:
sum(preds.apply(lambda x: x[0]) < 0.5)
In [ ]:
long_enough_df.loc[preds.apply(lambda x: x[0]) < 0.5].sample(replace=True, n=10)
In [ ]:
print ', '.join(long_enough_df.loc['https://pp.vk.me/c222/v222237/2ec/VuHMGBXxSJ4.jpg'].text)
In [ ]:
with open('../data/bads', 'w') as outcome:
    for bad in bads:
        outcome.write('%s\n' % bad)
In [ ]:
with open('../data/bads') as income:
    bads = set(map(str.strip, income))
In [ ]:
with open('../data/goods', 'w') as outcome:
    for good in goods:
        outcome.write('%s\n' % good)
In [ ]:
with open('../data/goods') as income:
    goods = set(map(str.strip, income))
In [ ]:
import io
In [ ]:
with io.open('../data/big_cleaned_andlenght-filtered.json', 'w', encoding='utf8') as outcome:
    for u, data in long_enough_df.iterrows():
        ans = data.to_dict()
        ans['img_url'] = u
        outcome.write(u'%s\n' % json.dumps(ans))