In [1]:
import sys; sys.path.append('..')

In [47]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import pandas as pd

In [2]:
from time import time
import re
import marshal                   # used by load_lem_dict below
from collections import Counter  # likely also re-exported by the starred imports; made explicit here

from nlp import *
from frame import *
from tsv import *
from cache import *

from multiprocessing import Pool

In [3]:
def load_lem_dict():
    # Pool initializer: load the shared lemma dictionary into the worker
    global lem_dict
    t0=time()
    lem_dict = marshal.load(open('../data/lem_dict.mrl','rb'))
    #print('lem_dict {:.2f} s'.format(time()-t0))

def lem_only(tokens):
    # map tokens to their lemmas, dropping tokens not in lem_dict
    out = []
    for t in tokens:
        lem = lem_dict.get(t)
        if not lem: continue
        out.append(lem)
    return out

def in_dict_only(tokens):
    # keep original tokens, but only those that have an entry in lem_dict
    out = []
    for t in tokens:
        lem = lem_dict.get(t)
        if not lem: continue
        out.append(t)
    return out

def export_part(args):
    # write one shard of the dense feature matrix as TSV:
    # col, id, then a count for every term id in [0, feature_cnt)
    p,lo,hi,feature_cnt,frame,V = args
    with open('../data/vectorized_{}.tsv'.format(p),'wb',100000) as fo:
        for i in range(lo,hi):
            fo.write(frame['col'][i])
            fo.write('\t')
            fo.write(frame['id'][i])
            fo.write('\t')

            vi = V[i]
            v = [str(vi.get(j,0)) for j in range(feature_cnt)]
            fo.write('\t'.join(v))
            fo.write('\n')

re_num = re.compile(r'\b\d+\b')
def replace_numbers(text):
    # collapse standalone numbers into a single _NUM_ token
    return re_num.sub('_NUM_',text)

re_link = re.compile(r'>>>>[^<]*<<<<')
def replace_links(text):
    # collapse >>>>...<<<< link markers into a single _LINK_ token
    return re_link.sub('_LINK_',text)

# disk cache for the expensive pipeline steps
cache = disk_cache('../cache','t5v2',verbose=True,linear=True)

# worker pool; each worker loads lem_dict once on startup
pool = Pool(4,load_lem_dict)
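
In [ ]:
# Quick sanity check of the helpers above (illustrative only).
# load_lem_dict() normally runs inside the pool workers; calling it here
# loads the lemma dictionary into this process as well.
load_lem_dict()
print(replace_numbers('mieszkanie 3 pokoje, 65 m'))  # -> 'mieszkanie _NUM_ pokoje, _NUM_ m'
print(replace_links('zobacz >>>>example.com<<<<'))   # -> 'zobacz _LINK_'
print(lem_only(replace_numbers('mam 2 koty').split()))  # output depends on lem_dict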

In [4]:
# frame
t0=time()
rows = tsv_iter('../data/__all__.txt')
frame = frame_from_iter(rows, ['col', 'id', 'text'])
print('frame\t{:.2f} s'.format(time()-t0))
print('frame\t{} rows'.format(len(frame['id'])))


frame	4.85 s
frame	80736 rows

In [5]:
# noise
# TODO: also test splitting on "..."
noise = cache.use('noise', get_df,
    frame['text'],
    split_pattern=r'[\s;]*[;.][\s;]*',
    preprocessor=[replace_numbers,replace_links], postprocessor=None,
    min_df_part=2, min_df=2)


noise	0.59 s	36.3 MB	from cache
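
In [ ]:
# A minimal sketch of the idea behind the noise step (the real get_df lives
# in nlp.py and also supports partitioned counting via min_df_part, which is
# not reproduced here): split each text into sentence-like fragments, count
# in how many documents each fragment occurs, and keep fragments seen in at
# least min_df documents. clean_x then strips these as boilerplate stop words.
def noise_sketch(texts, split_pattern=r'[\s;]*[;.][\s;]*', min_df=2):
    splitter = re.compile(split_pattern)
    df = Counter()
    for text in texts:
        df.update(set(splitter.split(text)))  # count each fragment once per doc
    return set(f for f,n in df.items() if n >= min_df)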

In [6]:
# clean_x
X = cache.use('clean_x',get_clean_x,
    frame['text'],
    split_pattern=r'[\s;]*[;.][\s;]*',
    preprocessor=[replace_numbers,replace_links], postprocessor=None,
    replace=u' ; ', stop_words=noise)


clean_x	0.85 s	195.6 MB	from cache

In [7]:
# dfy
dfy = cache.use('dfy', get_dfy,
    X, frame['col'],
    postprocessor=lem_only,
    min_df=10,
    mp_pool=pool)

# row weight: inverse class frequency, so the largest class gets 1.0
# and smaller classes are up-weighted
ny = Counter(frame['col'])
max_ny = max(ny.values())
wy = {y:1.0*max_ny/ny[y] for y in dfy}
frame['rw'] = [wy[y] for y in frame['col']]

# df
df = get_df_from_dfy(dfy)


dfy	0.13 s	2.2 MB	from cache
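
In [ ]:
# Worked toy example of the row-weight scheme above: each row gets
# max_class_size / own_class_size, so the largest class keeps weight 1.0
# and smaller classes are up-weighted to balance the topics.
toy_ny = Counter({'ogrod': 900, 'smog': 180, 'alergie': 150})
toy_max = max(toy_ny.values())
print({y: 1.0*toy_max/n for y,n in toy_ny.items()})
# {'ogrod': 1.0, 'smog': 5.0, 'alergie': 6.0}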

In [10]:
cache.missed = False

# chiy
chiy = cache.use('chiy', get_chiy, df, len(X), dfy, Counter(frame['col']))
mchiy = cache.use('mchiy', get_mcdy, chiy)

# wcpy
wcpy = cache.use('wcpy', get_wcpy, df, dfy)
mwcpy = cache.use('mwcpy', get_mcdy, wcpy)

# giniy
giniy = cache.use('giniy', get_giniy, df, dfy, Counter(frame['col']))
mginiy = cache.use('mginiy', get_mcdy, giniy)

# cmfsy
cmfsy = cache.use('cmfsy', get_cmfsy, df, dfy)
mcmfsy = cache.use('mcmfsy', get_mcdy, cmfsy)

# mdfy
mdfy = cache.use('mdfy', get_mcdy, dfy)

# TODO more words for categories with a large share of zeros
# TODO test mixing feature-selection algorithms
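
In [ ]:
# A minimal sketch of what a per-class chi-square score can look like,
# assuming get_chiy follows the standard 2x2 contingency formula; the actual
# implementation lives in nlp.py and may differ. Argument order mirrors the
# call above: df[t] is the document frequency of term t, n the corpus size,
# dfy[y][t] the frequency of t within class y, and ny[y] the class size.
def chi2_sketch(df, n, dfy, ny):
    chiy = {}
    for y in dfy:
        scores = {}
        for t, a in dfy[y].items():
            b = df[t] - a        # docs containing t outside class y
            c = ny[y] - a        # docs in y not containing t
            d = n - ny[y] - b    # docs outside y not containing t
            den = (a+b) * (c+d) * (a+c) * (b+d)
            scores[t] = 1.0 * n * (a*d - b*c)**2 / den if den else 0.0
        chiy[y] = scores
    return chiy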

In [67]:
measure = mcmfsy
# top 10 features per topic under the selected measure
table = []
for y in dfy:
    for t,v in Counter(measure[y]).most_common(10):
        table.append([y,t,v])
pd.DataFrame(table,columns=['topic','token','value'])


Out[67]:
topic token value
0 ogrod ogrodowisko 2.662843e-06
1 ogrod kwitnienie 2.206400e-06
2 ogrod ogrodnictwo 2.202467e-06
3 ogrod rozgałęziony/rozgałęzić 2.166497e-06
4 ogrod nawożenie/nawozić 2.076825e-06
5 ogrod storczyk 2.030102e-06
6 ogrod fotogaleria 1.996873e-06
7 ogrod jednoroczny 1.955641e-06
8 ogrod opryskiwać 1.933721e-06
9 ogrod uprawa 1.929924e-06
10 nieruchomosci kanalizacja 9.708574e-07
11 nieruchomosci deweloperski 9.493907e-07
12 nieruchomosci kotłownia 9.462552e-07
13 nieruchomosci przynależeć 9.354584e-07
14 nieruchomosci kondygnacja 9.240832e-07
15 nieruchomosci pośrednik 9.126564e-07
16 nieruchomosci wielorodzinny 9.070873e-07
17 nieruchomosci ocieplenie/ocieplić 8.822207e-07
18 nieruchomosci schodowy 8.776037e-07
19 nieruchomosci garażowy 8.756797e-07
20 alergie alergologiczny 1.369115e-05
21 alergie odczulać 1.096319e-05
22 alergie pyłek 1.012854e-05
23 alergie immunoterapia 9.228771e-06
24 alergie zarodnik 9.228771e-06
25 alergie uczulający/uczulać 8.638549e-06
26 alergie anafilaktyczny 8.498646e-06
27 alergie wziewny 8.332759e-06
28 alergie pylić 8.214689e-06
29 alergie uczulenie/uczulić 7.821463e-06
... ... ... ...
240 fitness ostrzec 2.988082e-06
241 fitness redukcyjny 1.876669e-06
242 fitness dietetyk 1.578952e-06
243 fitness glikemiczny 1.557451e-06
244 fitness noworoczny 1.525348e-06
245 fitness muskulatura 1.515296e-06
246 fitness kulturystyczny 1.515296e-06
247 fitness odchudzać/odchudzanie 1.382399e-06
248 fitness kulturystyka 1.364415e-06
249 fitness termogeneza 1.359138e-06
250 milosnicy morusek 1.040947e-05
251 milosnicy psiak 9.217034e-06
252 milosnicy galopować 9.034227e-06
253 milosnicy jeździec 8.310064e-06
254 milosnicy szczeniak 7.930044e-06
255 milosnicy owczarka/owczarek 7.880943e-06
256 milosnicy weterynarz 7.719668e-06
257 milosnicy jeździecki 7.680406e-06
258 milosnicy jeździectwo 6.720928e-06
259 milosnicy sunia 6.583003e-06
260 podroznicy agroturystyka 1.517833e-06
261 podroznicy kwatera 1.508356e-06
262 podroznicy zajazd 1.507497e-06
263 podroznicy wczasowy 1.503790e-06
264 podroznicy noclegowy 1.490757e-06
265 podroznicy namiotowy 1.486570e-06
266 podroznicy hostel 1.482087e-06
267 podroznicy motel/motela 1.480459e-06
268 podroznicy czajnik 1.478482e-06
269 podroznicy wyżywienie/wyżywić 1.463162e-06

270 rows × 3 columns


In [69]:
measure = mcmfsy
# vocaby
vocaby = {}
for y in dfy:
    #t_v = Counter(chiy[y]).most_common(20) # 200 -> 9% zeros, GLM 19% err
    #t_v = Counter(wcpy[y]).most_common(20) # 200 -> 48% zeros, GLM 10% err
    #t_v = Counter(giniy[y]).most_common(20) # 200 -> 16% zeros, GLM 18% err
    #t_v = Counter(cmfsy[y]).most_common(20) # 200 -> 47% zeros, GLM 10% err
    #t_v = Counter(mginiy[y]).most_common(200) # 200 -> 32% zeros, GLM % err
    t_v = Counter(measure[y]).most_common(200)
    vocaby[y] = set([t for t,v in t_v])

In [70]:
cache.missed = False

table = []
# score vocabulary overlap between every pair of topics
# TODO: list the shared words
vy_score = {}
for y1 in dfy:
    for y2 in dfy:
        if y2<=y1: continue
        common = len(vocaby[y1]&vocaby[y2])
        # every topic vocabulary holds 200 terms, so the ratio is symmetric
        score = 1.0 * common / len(vocaby[y1])
        vy_score[y1,y2] = score
vy_score = Counter(vy_score)
for (y1,y2),p in vy_score.most_common(100):
    table.append([y1,y2,p])
pd.DataFrame(table,columns=['topic1','topic2','common_features_pct'])


Out[70]:
topic1 topic2 common_features_pct
0 domowi ogrod 0.395
1 kobiety trendsetterzy 0.370
2 kadra przedsiebiorcy 0.315
3 biegacze sportowcy 0.315
4 kinomaniacy nowoczesny 0.190
5 fitness sportowcy 0.130
6 pochwa zdrowie 0.125
7 alergie pochwa 0.090
8 alergie zdrowie 0.090
9 parenting pochwa 0.080
10 alergie smog 0.055
11 fitness zdrowie 0.040
12 alergie parenting 0.035
13 milosnicy nieruchomosci 0.030
14 parenting zdrowie 0.025
15 fitness gospodyni 0.020
16 kadra smog 0.020
17 kinomaniacy mlodziez 0.020
18 jedzacy milosnicy 0.015
19 nieruchomosci smog 0.015
20 sportowcy wedkarstwo 0.015
21 milosnicy sportowcy 0.015
22 alergie milosnicy 0.015
23 kinomaniacy smog 0.015
24 nowoczesny trendsetterzy 0.015
25 milosnicy smog 0.010
26 alergie domowi 0.010
27 biegacze milosnicy 0.010
28 planszomaniak smog 0.010
29 alergie fitness 0.010
... ... ... ...
70 domowi jedzacy 0.005
71 kobiety sportowcy 0.005
72 alergie ogrod 0.005
73 planszomaniak wedkarstwo 0.005
74 gadzeciarze wedkarstwo 0.005
75 kibic milosnicy 0.005
76 nowoczesny wedkarstwo 0.005
77 kobiety milosnicy 0.005
78 biegacze fitness 0.005
79 kinomaniacy pochwa 0.005
80 fitness jedzacy 0.005
81 nowoczesny smog 0.005
82 gadzeciarze planszomaniak 0.005
83 kobiety pochwa 0.005
84 jedzacy wedkarstwo 0.005
85 smog sportowcy 0.005
86 alergie kobiety 0.000
87 mlodziez parenting 0.000
88 domowi sportowcy 0.000
89 kibic parenting 0.000
90 kinomaniacy nieruchomosci 0.000
91 kibic kobiety 0.000
92 automaniak domowi 0.000
93 milosnicy mlodziez 0.000
94 biegacze pochwa 0.000
95 kibic smog 0.000
96 domowi kinomaniacy 0.000
97 mlodziez smog 0.000
98 jedzacy kibic 0.000
99 automaniak planszomaniak 0.000

100 rows × 3 columns


In [12]:
# vocab
vocab = set()
for y in vocaby:
    vocab.update(vocaby[y])
print('len_vocab',len(vocab))

# term_id: stable index for every vocabulary term
term_id = {t:i for i,t in enumerate(vocab)}

# vec_vocaby: topic vocabularies as sets of term ids, for fast intersection
vec_vocaby = {}
for y in vocaby:
    vec_vocaby[y] = set([term_id[t] for t in vocaby[y]])

# vectorized
V = cache.use('vectorized',
    #vectorize, X, vocab,
    vectorize, frame['text'], vocab,
    #preprocessor=[replace_numbers,replace_links],
    postprocessor=lem_only,
    mp_pool=pool)

frame['tf'] = V


('len_vocab', 5013)
vectorized	2.06 s	39.4 MB	from cache
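
In [ ]:
# A minimal sketch of the vectorized representation, inferred from how V is
# consumed below (vi.get(j,0) and set(tf)): each row is a sparse dict mapping
# term_id -> count. The real vectorize is in nlp.py and takes mp_pool; the
# whitespace tokenizer here is a placeholder.
def vectorize_sketch(texts, vocab, postprocessor=None):
    tid = {t: i for i, t in enumerate(vocab)}  # assumed to match term_id above
    rows = []
    for text in texts:
        tokens = text.split()
        if postprocessor:
            tokens = postprocessor(tokens)
        row = {}
        for t in tokens:
            i = tid.get(t)
            if i is not None:          # count only in-vocabulary terms
                row[i] = row.get(i, 0) + 1
        rows.append(row)
    return rows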

In [13]:
# col_score & all_score
# col_score: distinct terms from the row's own topic vocabulary
# all_score: distinct vocabulary terms from any topic
t0 = time()
col_score = []
all_score = []
for col,tf in zip(frame['col'],frame['tf']):
    common = set(tf) & vec_vocaby[col]
    col_score.append(len(common))
    all_score.append(len(tf))
frame['col_score'] = col_score
frame['all_score'] = all_score
print('col_score\t{:.2f} s'.format(time()-t0))


col_score	0.72 s

In [16]:
# inspect low-score examples for one topic (print left disabled)
topic='kibic'
for col,als,cls,text in zip(frame['col'],frame['all_score'],frame['col_score'],frame['text']):
    if col==topic and cls==0:
        pass
        #print(col,als,cls,text)

In [30]:
table = []
# score vocab: per-topic zero-score statistics
cs_zero = Counter()  # rows with col_score == 0 (no terms from own topic vocab)
as_zero = Counter()  # rows with all_score == 0 (no vocabulary terms at all)
n_rows = Counter()   # rows per topic
for col,cs,as_ in zip(frame['col'],frame['col_score'],frame['all_score']):
    n_rows[col] += 1
    if cs==0:
        cs_zero[col] += 1
    if as_==0:
        as_zero[col] += 1
for col in sorted(n_rows):
    x = 1.0 * cs_zero[col]/n_rows[col]
    table.append([col,x,as_zero[col],cs_zero[col],n_rows[col]])

total_cs_zero = sum(cs_zero.values())
total_as_zero = sum(as_zero.values())
total_rows = sum(n_rows.values())
x = 1.0 * total_cs_zero / total_rows

table.sort(key=lambda r:r[1],reverse=True)
table.insert(0,['TOTAL',x,total_as_zero,total_cs_zero,total_rows])
pd.DataFrame(table,columns=['topic','zero_score_topic_pct','zero_score_cnt','zero_score_topic_cnt','topic_cnt'])


Out[30]:
topic zero_score_topic_pct zero_score_cnt zero_score_topic_cnt topic_cnt
0 TOTAL 0.350897 558 28330 80736
1 trendsetterzy 0.687018 18 10341 15052
2 kinomaniacy 0.679796 2 1467 2158
3 fitness 0.550598 0 691 1255
4 milosnicy 0.494459 19 580 1173
5 gospodyni 0.409593 5 1076 2627
6 nieruchomosci 0.376267 175 2746 7298
7 nowoczesny 0.350254 2 1035 2955
8 podroznicy 0.330857 16 2093 6326
9 domowi 0.278566 120 1041 3737
10 kadra 0.271887 12 618 2273
11 kobiety 0.270588 3 759 2805
12 zdrowie 0.244043 3 891 3651
13 jedzacy 0.238710 1 333 1395
14 kibic 0.230263 2 560 2432
15 ogrod 0.222462 1 206 926
16 przedsiebiorcy 0.211291 13 509 2409
17 smog 0.190476 0 32 168
18 sportowcy 0.188794 0 310 1642
19 planszomaniak 0.180165 2 109 605
20 mlodziez 0.177062 12 176 994
21 biegacze 0.170183 0 121 711
22 gadzeciarze 0.164951 4 705 4274
23 wedkarstwo 0.157706 0 88 558
24 parenting 0.150690 17 1638 10870
25 automaniak 0.098639 131 203 2058
26 alergie 0.012739 0 2 157
27 pochwa 0.000000 0 0 227

In [ ]:
# export (both branches below are disabled; change a 0 to 1 to run one)
t0 = time()
if 0:
    topics = list(sorted(dfy))
    feature_cnt = len(vocab)
    with open('../data/vectorized.tsv','wb',100000) as fo:
        for row in iter_from_frame(frame,['col','rw','id','col_score','tf']):
            if row[-2]==0: continue # omit col_score==0

            # col/topic
            fo.write(row[0]+'\t')

            # is_topic
            if 0:
                is_topic = ['n']*len(topics)
                ti = topics.index(row[0])
                is_topic[ti]='y'
                fo.write('\t'.join(is_topic)+'\t')

            # row weight
            fo.write('{:.2f}'.format(row[1])+'\t')

            # id
            fo.write(row[2]+'\t')

            # features
            tf = row[-1]
            features = [str(tf.get(j,0)) for j in range(feature_cnt)]
            #features = [str(min(1,tf.get(j,0))) for j in range(feature_cnt)] # binary features
            fo.write('\t'.join(features))
            fo.write('\n')
elif 0:
    args = []
    feature_cnt = len(vocab)
    for p,(lo,hi) in enumerate(partitions(len(frame['id']),4)):
        f = {}
        f['col'] = frame['col']
        f['id'] = frame['id']
        args += [(p,lo,hi,feature_cnt,f,V)]
    pool.map(export_part,args)
print('export',time()-t0)


frame	5.55 s
frame	80736 rows
noise	0.68 s	36.3 MB	from cache
clean_x	1.00 s	195.6 MB	from cache
dfy	0.11 s	2.2 MB	from cache
chiy	3.28 s	14.0 MB	
mchiy	14.29 s	14.0 MB	
wcpy	1.39 s	14.0 MB	
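
In [ ]:
# A minimal sketch of the partitions() helper assumed by the parallel export
# branch above: split n rows into k contiguous (lo, hi) ranges. The real
# helper comes from the starred imports and may differ.
def partitions_sketch(n, k):
    step = (n + k - 1) // k  # ceil(n / k)
    return [(lo, min(lo + step, n)) for lo in range(0, n, step)]

partitions_sketch(10, 4)  # [(0, 3), (3, 6), (6, 9), (9, 10)]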
