In [1]:
import json
import pandas as pd
import numpy as np
In [5]:
import sys, os
sys.path.append('bigartm-0.8.2-py2.7.egg')
In [6]:
from gensim.corpora import dictionary  # needed later for dictionary.Dictionary(...)
from sklearn.svm import SVC
from random import sample
import re
from nltk import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from pymorphy2 import MorphAnalyzer
In [7]:
from utils import get_text_processor
In [8]:
hash_tags_regex = r'#[^#\s]*'
hash_tags_tokenizer = RegexpTokenizer(hash_tags_regex, gaps=False)
text_processing = get_text_processor()

def get_tags_and_process_text(text):
    # split a post into its hashtags and the processed remaining text
    tokens = hash_tags_tokenizer.tokenize(text)
    other = re.sub(hash_tags_regex, '', text)
    return tokens, list(text_processing(other))
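In [ ]:
# Quick illustrative check (not part of the original run): hashtags are pulled out
# by the regex, the rest of the text goes through the processor from utils, whose
# exact output (tokenisation/lemmatisation) is an assumption here.
tags, words = get_tags_and_process_text(u'my cat photo #cat #cute so fluffy')
print tags    # expected: [u'#cat', u'#cute']
print words   # whatever get_text_processor() yields for the remaining words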
In [23]:
def do_all_with_line(line):
    try:
        doc = json.loads(line)
        res = get_tags_and_process_text(doc['text'])
        # treat posts with fewer than 20 processed tokens as trash
        is_trash_predicted = len(res[-1]) < 20  # or model.predict(np.asmatrix(to_vector(gd.doc2bow(res[-1]))))
        return None if is_trash_predicted else (doc['img_url'], res)
    except Exception as e:
        print e
        return None
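In [ ]:
# Illustrative call on a made-up line: each JSON record is expected to carry
# 'text' and 'img_url'; anything whose processed text is shorter than 20 tokens
# is treated as trash and returned as None, to be dropped downstream.
print do_all_with_line(json.dumps({'img_url': 'http://example.com/1.jpg',
                                   'text': u'too short #test'}))  # -> None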
In [24]:
from joblib import Parallel, delayed
from tqdm import tqdm_notebook
import io
In [ ]:
%%time
with open('../data/up_sample_gena.json') as income:
    data = Parallel(n_jobs=2)(delayed(do_all_with_line)(line) for line in tqdm_notebook(income))
In [16]:
%%time
with open('/home/shtechgen/study_space/annotator/data/4gena/x0000_uids') as income:
    data = Parallel(n_jobs=2)(delayed(do_all_with_line)(line) for line in tqdm_notebook(income))
In [25]:
def get_all_from_dir(path):
    # iterate over every file in the directory, yielding its lines one by one
    paths = sorted([os.path.join(path, filename) for filename in os.listdir(path)])
    for filepath in tqdm_notebook(paths):
        with io.open(filepath, encoding='utf8') as income:
            for line in income:
                yield line
In [26]:
data = Parallel(n_jobs=2)(delayed(do_all_with_line)(line)
                          for line
                          in tqdm_notebook(get_all_from_dir('/home/shtechgen/study_space/annotator/data/4gena/')))
In [29]:
import itertools as it
In [31]:
with io.open('/home/shtechgen/study_space/annotator/data/second_data_wave.json', 'w', encoding='utf8') as outcome:
    for line in tqdm_notebook(filter(lambda x: x is not None, data)):
        outcome.write(u'%s\n' % json.dumps(line))
In [ ]:
%%time
with open('../data/users_photos.full.backup.json') as income,\
     io.open('../data/big_clean_data.json', 'w', encoding='utf8') as outcome:
    for line in tqdm_notebook(Parallel(n_jobs=2)(delayed(do_all_with_line)(line) for line in tqdm_notebook(income))):
        if line is not None:
            outcome.write(u'%s\n' % json.dumps(line))
In [ ]:
import itertools as it
from tqdm import tqdm_notebook
In [ ]:
%%time
with open('../data/users_photos.full.backup.json') as income,\
     io.open('../data/big_clean_data.json', 'w', encoding='utf8') as outcome:
    for line in tqdm_notebook(it.imap(do_all_with_line, income)):
        if line is not None:
            outcome.write(u'%s\n' % json.dumps(line))
In [ ]:
import io
In [ ]:
import h5py
In [ ]:
with io.open('../data/big_clean_data.json') as income:
    long_enough = map(json.loads, income)
In [ ]:
len(long_enough)
In [ ]:
with open('../data/big_cleaned_andlenght-filtered.json') as income:
    df = pd.DataFrame(map(json.loads, income))
with h5py.File("../data/img_url2inception.backup.h5", 'r') as hdf5_inception_dreams:
    %time df['classes'] = df.img_url.apply(hdf5_inception_dreams.get).apply(np.array)  # beware: random disk access!
df.dropna(axis=0, subset=['classes'], inplace=True)
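In [ ]:
# Note on the lookup above: h5py's Group.get() behaves like dict.get(), handing
# back None for img_urls that have no stored Inception vector. A quick spot
# check on a single url (illustrative only):
with h5py.File("../data/img_url2inception.backup.h5", 'r') as h5:
    print h5.get(df.img_url.iloc[0]) is not None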
In [ ]:
interesting = set(df.img_url)
In [ ]:
filtered_raw = filter(lambda row: row[0] in interesting, long_enough)
In [ ]:
len(filtered_raw)
In [ ]:
long_enough = filtered_raw
In [ ]:
def do_all_with_line(line):
    # second pass: keep only documents whose img_url has an Inception vector
    doc = json.loads(line)
    is_trash_predicted = doc['img_url'] not in interesting  # or model.predict(np.asmatrix(to_vector(gd.doc2bow(res[-1]))))
    return None if is_trash_predicted else doc
In [ ]:
from tqdm import tqdm_notebook
import itertools as it
In [ ]:
%%time
res = []
with open('../data/users_photos.full.backup.json') as income:
    for line in tqdm_notebook(it.imap(do_all_with_line, income)):
        if line is not None:
            res.append(line)
In [ ]:
alt_long_enough = map(lambda doc: (doc['img_url'], get_tags_and_process_text(doc['text'])), tqdm_notebook(res))
In [ ]:
long_enough = alt_long_enough
In [ ]:
# long_enough = [d for d in data if len(d[-1][-1]) > 33]
long_enough_df = pd.DataFrame(map(list, zip(*long_enough)[-1]))
long_enough_df.index = zip(*long_enough)[0]
long_enough_df.columns = ['tag', 'text']
long_enough_df = long_enough_df.reset_index().drop_duplicates('index').set_index('index')
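In [ ]:
# The zip(*long_enough) trick above transposes [(img_url, (tags, text)), ...]
# into ([img_url, ...], [(tags, text), ...]); a tiny stand-alone illustration:
print zip(*[('u1', ('t1', 'x1')), ('u2', ('t2', 'x2'))])
# -> [('u1', 'u2'), (('t1', 'x1'), ('t2', 'x2'))]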
In [ ]:
gd = dictionary.Dictionary(documents=long_enough_df.text)
gd.filter_extremes()
gd.compactify()
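In [ ]:
# filter_extremes() above runs with gensim's defaults (roughly: drop tokens seen
# in fewer than 5 documents or in more than half of them, cap the vocabulary at
# 100k); tuning those thresholds is optional. doc2bow then maps a token list to
# sparse (token_id, count) pairs, which to_vector() densifies further down:
print gd.doc2bow(long_enough_df.text.iloc[0])[:10]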
In [ ]:
goods, bads = set(), set()
In [ ]:
def add_to(name, setting, what):
    # helper for interactive labelling: add a stripped url to the chosen set and report its size
    print 'adding "%s"' % what.strip()
    setting.add(what.strip())
    print 'length of %s is %i' % (name, len(setting))
In [ ]:
for u, d in sample(long_enough, 1):
    print u, ', '.join(d[-1])
    print
In [ ]:
add_to('bads', bads, u)
In [ ]:
add_to('goods', goods, u)
In [ ]:
def to_vector(bow):
    # expand a sparse gensim bag-of-words into a dense count vector
    bow = dict(bow)
    return [bow.get(_, 0) for _ in range(len(gd)+1)]
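In [ ]:
# to_vector() pads the sparse pairs out to a fixed-length dense count vector
# (length len(gd)+1), so every document lines up column-wise for the SVM.
# Tiny illustration with made-up ids:
print to_vector([(0, 2), (3, 1)])[:5]  # -> [2, 0, 0, 1, 0]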
In [ ]:
model = SVC(kernel='linear', probability=True, random_state=42)
In [ ]:
model.fit(np.array(map(list, long_enough_df.loc[bads].append(long_enough_df.loc[goods])\
                             .text.apply(gd.doc2bow).apply(to_vector))),
          [1]*len(bads) + [0]*len(goods))  # bads are labelled 1, goods 0
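In [ ]:
# Optional alternative to the hard labels used below: because the SVC was built
# with probability=True, predict_proba() is also available; columns follow
# model.classes_, i.e. [0 (good), 1 (bad)]. Purely illustrative.
print model.predict_proba(np.matrix(to_vector(gd.doc2bow(long_enough_df.text.iloc[0]))))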
In [ ]:
preds = long_enough_df.text.apply(gd.doc2bow).apply(to_vector).apply(lambda x: model.predict(np.matrix(x)))
In [ ]:
add_to('goods', goods, 'https://pp.vk.me/c222/v222237/2ec/VuHMGBXxSJ4.jpg')
In [ ]:
add_to('bads', bads, 'https://pp.vk.me/c836121/v836121322/1674e/mz9wrbfnuYU.jpg')
In [ ]:
sum(preds.apply(lambda x: x[0]) < 0.5)
In [ ]:
long_enough_df.loc[preds.apply(lambda x: x[0]) < 0.5].sample(replace=True, n=10)
In [ ]:
print ', '.join(long_enough_df.loc['https://pp.vk.me/c222/v222237/2ec/VuHMGBXxSJ4.jpg'].text)
In [ ]:
with open('../data/bads', 'w') as outcome:
    for bad in bads:
        outcome.write('%s\n' % bad)
In [ ]:
with open('../data/bads') as income:
    bads = set(map(str.strip, income))
In [ ]:
with open('../data/goods', 'w') as outcome:
    for good in goods:
        outcome.write('%s\n' % good)
In [ ]:
with open('../data/goods') as income:
    goods = set(map(str.strip, income))
In [ ]:
import io
In [ ]:
with io.open('../data/big_cleaned_andlenght-filtered.json', 'w', encoding='utf8') as outcome:
    for u, data in long_enough_df.iterrows():
        ans = data.to_dict()
        ans['img_url'] = u
        outcome.write(u'%s\n' % json.dumps(ans))