In [ ]:
TODO
- join info from dropped keywords

In [1]:
from pattern.en import pluralize, singularize
 
print pluralize('child') #children
print singularize('wolves') #wolf


children
wolf

In [2]:
from pattern.es import pluralize, singularize
 
print pluralize('calefactor') #children
print singularize('calefactores') #calefactor


calefactores
calefactor

In [5]:
print pluralize('ropas y calzados') #children
print singularize('ropas y calzados') #wolf


ropas y calzados
ropas y calzado

In [36]:
import unicodedata
def strip_accents(s):
    s= s.decode("utf-8")
    return ''.join((c for c in unicodedata.normalize('NFD', unicode(s)) if unicodedata.category(c) != 'Mn'))

strip_accents('Östblocket')


Out[36]:
u'Ostblocket'

In [37]:
pluralize(strip_accents('grafía'))


Out[37]:
u'grafias'

In [33]:
'grafía'.decode("utf-8") == u'grafía'


Out[33]:
True

In [8]:
from pandas import DataFrame
import psycopg2

def connect_clm():
    
    host_url = 'group-clm-redshift.onap.io'
    db = "clm"

    conn = psycopg2.connect(
    host= host_url,
    user="johannes_braun",
    port=5439,
    password="b-A2#83e670d2d72",
    dbname=db)
 
    cur = conn.cursor() # create a cursor for executing queries
    return cur, conn

def execute_query(query):
    cur, conn = connect_clm()
    cur.execute(query);
    res = cur.fetchall()
    df = DataFrame(res, columns = [desc[0] for desc in cur.description])
    return df

In [175]:
top_100 = execute_query("""
select stats.*, cat.category_nk, cat.freq as cat_freq, lkp.category_name_lc, lkp.category_l1_nk, lkp.category_l1_name_lc
from recommendations.temp_johannes_search_stats stats
left join recommendations.recommend_search2category cat
on stats.search_term = cat.search_term
and cat.country_sk = 'letgo|latam|ar'
and cat.category_level = 2
and cat.rec_rank = 1
left join global_bi.dim_categories lkp
on lkp.country_sk = 'letgo|latam|ar'
and lkp.category_nk = cat.category_nk
order by distinct_sessions desc
""")

In [415]:
search_terms_with_cat = execute_query("""
with total_freq as (
 select country_sk, search_term, sum(freq) as total_freq
 FROM recommendations.recommend_search2category
 group by 1,2
),
full_breakdown as (
SELECT cat.*, tf.total_freq, freq :: DECIMAL(19,4)/total_freq :: DECIMAL(19,4) as significance, lkp.category_name_lc, lkp.category_l1_nk, lkp.category_l1_name_lc
FROM recommendations.recommend_search2category cat

inner join  total_freq tf on(
tf.country_sk = cat.country_sk
and tf.search_term = cat.search_term)

left join global_bi.dim_categories lkp
on lkp.country_sk = 'letgo|latam|ar'
and lkp.category_nk = cat.category_nk

where cat.country_sk = 'letgo|latam|ar'
--and cat.search_term = 'honda wave'
---and cat.category_level in (1,2,3)
--and cat.rec_rank in ( 1,2)
),

first_three_cats AS(
SELECT
country_sk,
search_term,
total_freq as total_cat_freq,

max(case when (category_level = 2 and rec_rank = 1) then category_nk else '0' end) as first_cat_id,

max(case when (category_level = 2 and rec_rank = 2) then category_nk else '0' end) as second_cat_id,
max(case when (category_level = 2 and rec_rank = 3) then category_nk else '0' end) as third_cat_id,

max(case when (category_level = 2 and rec_rank = 1) then category_name_lc else '' end) as first_cat_name,
max(case when (category_level = 2 and rec_rank = 2) then category_name_lc else '' end) as second_cat_name,
max(case when (category_level = 2 and rec_rank = 3) then category_name_lc else '' end) as third_cat_name,

max(case when category_level = 3 then significance else 0 end) :: DECIMAL(19,4) as no_cat_significance,
max(case when (category_level = 2 and rec_rank = 1) then significance else 0 end) :: DECIMAL(19,4) as first_cat_significance,
max(case when (category_level = 2 and rec_rank = 2) then significance else 0 end) :: DECIMAL(19,4) as second_cat_significance,
max(case when (category_level = 2 and rec_rank = 3) then significance else 0 end) :: DECIMAL(19,4) as third_cat_significance
from full_breakdown
where significance >0.05
and  category_level in (2,3)
-- and search_term in ('zapatillas', 'nike')
group by 1,2,3)

select 
stats.reply_conversion, 
stats.research_quote, 
stats.distinct_sessions, 
stats.num_subsequent_replies,
stats.reply_5min_conversion, 
stats.research_15sec_conversion, 
stats.rows,
cat.country_sk as cat_country_sk,
cat.total_cat_freq, 
cat.first_cat_id,
second_cat_id,
third_cat_id, 
first_cat_name,
second_cat_name,
third_cat_name,
no_cat_significance,
first_cat_significance, 
second_cat_significance,
third_cat_significance,
stats.search_term

from recommendations.temp_johannes_search_stats stats
left join first_three_cats cat
on stats.search_term = cat.search_term
and cat.country_sk = 'letgo|latam|ar'
order by distinct_sessions desc
""")

In [416]:
len(search_terms_with_cat)


Out[416]:
82665

In [417]:
search_terms_with_cat[:10]


Out[417]:
reply_conversion research_quote distinct_sessions num_subsequent_replies reply_5min_conversion research_15sec_conversion rows cat_country_sk total_cat_freq first_cat_id second_cat_id third_cat_id first_cat_name second_cat_name third_cat_name no_cat_significance first_cat_significance second_cat_significance third_cat_significance search_term
0 0.0532 0.0061 118822 19417 6317 729 118822 letgo|latam|ar 2195.0 362 0 0 Vehículos 0.2810 0.2756 0.0000 0.0000 fiat 147
1 0.0481 0.0096 114212 22293 5491 1097 114212 letgo|latam|ar 7510.0 362 800 0 Vehículos Electrónica 0.2194 0.1989 0.0617 0.0000 motos
2 0.0613 0.0105 101423 22720 6214 1067 101423 letgo|latam|ar 25689.0 815 888 853 Ropa y Accesorios de Mujer Ropa y Accesorios de Hombre Bebés y Niños 0.1946 0.1541 0.1271 0.0896 zapatillas
3 0.0505 0.0118 97578 17432 4930 1152 97579 letgo|latam|ar 4861.0 881 0 0 Deportes y Hobbies 0.2316 0.2135 0.0000 0.0000 bicicletas
4 0.0405 0.0145 97207 15176 3938 1409 97207 letgo|latam|ar 1321.0 362 0 0 Vehículos 0.2778 0.2694 0.0000 0.0000 fiat uno
5 0.0377 0.0153 88181 19924 3322 1347 88181 letgo|latam|ar 9600.0 362 853 0 Vehículos Bebés y Niños 0.2227 0.1928 0.0589 0.0000 autos
6 0.0530 0.0115 83509 16491 4425 958 83509 letgo|latam|ar 1571.0 362 0 0 Vehículos 0.2845 0.2762 0.0000 0.0000 gol
7 0.0819 0.0101 77226 28572 6322 780 77226 letgo|latam|ar 19082.0 800 0 0 Electrónica 0.2274 0.2133 0.0000 0.0000 celulares
8 0.0481 0.0074 75213 11209 3614 557 75213 letgo|latam|ar 1212.0 362 0 0 Vehículos 0.2920 0.2904 0.0000 0.0000 fiat 128
9 0.0506 0.0117 73770 11375 3736 865 73770 letgo|latam|ar 121.0 362 0 0 Vehículos 0.3140 0.3057 0.0000 0.0000 toyota hilux

In [418]:
search_terms_with_cat.columns


Out[418]:
Index([u'reply_conversion', u'research_quote', u'distinct_sessions',
       u'num_subsequent_replies', u'reply_5min_conversion',
       u'research_15sec_conversion', u'rows', u'cat_country_sk',
       u'total_cat_freq', u'first_cat_id', u'second_cat_id', u'third_cat_id',
       u'first_cat_name', u'second_cat_name', u'third_cat_name',
       u'no_cat_significance', u'first_cat_significance',
       u'second_cat_significance', u'third_cat_significance', u'search_term'],
      dtype='object')

In [419]:
search_terms_with_cat['ascii'] = search_terms_with_cat['search_term'].apply(strip_accents)

In [420]:
search_terms_with_cat['singular'] = search_terms_with_cat['ascii'].apply(singularize)

In [421]:
'fiat 147' in search_terms_with_cat['search_term'].unique()


Out[421]:
True

In [422]:
#u = search_terms_with_cat['search_term'].unique()
#search_terms_with_cat['is_valid'] = [ s['singular'] in u    for i, s in search_terms_with_cat.iterrows()]

In [423]:
#search_terms_with_cat['is_org'] = [ s['ascii']==s['singular'] for i, s in search_terms_with_cat.iterrows()]

In [424]:
search_terms_with_cat[search_terms_with_cat['search_term'] == 'peugeot chocado'].index.tolist()


Out[424]:
[24400]

In [127]:
def safe_list_get (l, idx, default):
    try:
        return l[idx]
    except IndexError:
        return default

In [425]:
search_terms_with_cat['index'] = search_terms_with_cat.index

In [426]:
search_terms_with_cat[:5]


Out[426]:
reply_conversion research_quote distinct_sessions num_subsequent_replies reply_5min_conversion research_15sec_conversion rows cat_country_sk total_cat_freq first_cat_id ... second_cat_name third_cat_name no_cat_significance first_cat_significance second_cat_significance third_cat_significance search_term ascii singular index
0 0.0532 0.0061 118822 19417 6317 729 118822 letgo|latam|ar 2195.0 362 ... 0.2810 0.2756 0.0000 0.0000 fiat 147 fiat 147 fiat 147 0
1 0.0481 0.0096 114212 22293 5491 1097 114212 letgo|latam|ar 7510.0 362 ... Electrónica 0.2194 0.1989 0.0617 0.0000 motos motos moto 1
2 0.0613 0.0105 101423 22720 6214 1067 101423 letgo|latam|ar 25689.0 815 ... Ropa y Accesorios de Hombre Bebés y Niños 0.1946 0.1541 0.1271 0.0896 zapatillas zapatillas zapatilla 2
3 0.0505 0.0118 97578 17432 4930 1152 97579 letgo|latam|ar 4861.0 881 ... 0.2316 0.2135 0.0000 0.0000 bicicletas bicicletas bicicleta 3
4 0.0405 0.0145 97207 15176 3938 1409 97207 letgo|latam|ar 1321.0 362 ... 0.2778 0.2694 0.0000 0.0000 fiat uno fiat uno fiat uno 4

5 rows × 23 columns


In [427]:
search_terms_with_cat['singular_pos'] = [ safe_list_get(search_terms_with_cat[search_terms_with_cat['ascii']==s['singular']].index.tolist(),0,False) for i, s in search_terms_with_cat.iterrows()]

In [428]:
# more popular plural version exists
a = list( search_terms_with_cat[(search_terms_with_cat['singular_pos']>search_terms_with_cat['index']) &(search_terms_with_cat['singular_pos'] >0) ]['singular_pos'])

In [429]:
# more popular singular version exists
b = list(search_terms_with_cat[(search_terms_with_cat['singular_pos']<search_terms_with_cat['index']) & (search_terms_with_cat['singular_pos'] >0) ]['index'])

In [430]:
bad_mask = a+b

In [431]:
bad_mask[:10]


Out[431]:
[41, 543, 18, 208, 100, 70, 38, 974, 2503, 653]

In [432]:
len(bad_mask)


Out[432]:
6293

In [433]:
search_terms_with_cat.loc[bad_mask[:10]]


Out[433]:
reply_conversion research_quote distinct_sessions num_subsequent_replies reply_5min_conversion research_15sec_conversion rows cat_country_sk total_cat_freq first_cat_id ... third_cat_name no_cat_significance first_cat_significance second_cat_significance third_cat_significance search_term ascii singular index singular_pos
41 0.0589 0.0212 36897 8974 2175 782 36897 letgo|latam|ar 6394.0 362 ... 0.2219 0.1878 0.0619 0.0000 moto moto moto 41 41
543 0.0658 0.0124 7430 1827 489 92 7430 letgo|latam|ar 2903.0 815 ... Bebés y Niños 0.1904 0.1395 0.1181 0.0974 zapatilla zapatilla zapatilla 543 543
18 0.0654 0.0166 53392 12918 3493 885 53392 letgo|latam|ar 9360.0 881 ... 0.2292 0.2081 0.0000 0.0000 bicicleta bicicleta bicicleta 18 18
208 0.0425 0.0224 15652 4735 665 350 15652 letgo|latam|ar 3347.0 362 ... 0.2201 0.1819 0.0567 0.0000 auto auto auto 208 208
100 0.1032 0.0135 25163 11339 2596 339 25163 letgo|latam|ar 11728.0 800 ... 0.2268 0.2136 0.0000 0.0000 celular celular celular 100 100
70 0.0246 0.0096 31320 3150 770 301 31320 letgo|latam|ar 1064.0 362 ... 0.2612 0.2481 0.0000 0.0000 camioneta camioneta camioneta 70 70
38 0.0851 0.0092 37229 10627 3167 341 37230 letgo|latam|ar 7240.0 806 ... 0.2291 0.2147 0.0563 0.0000 heladera heladera heladera 38 38
974 0.0434 0.0280 4243 391 184 119 4243 letgo|latam|ar 49.0 815 ... Hogar y Jardín 0.2244 0.1224 0.1224 0.1020 masaje masaje masaje 974 974
2503 0.0640 0.0257 1673 359 107 43 1673 letgo|latam|ar 767.0 815 ... Ropa y Accesorios de Hombre 0.2346 0.2203 0.0586 0.0573 bota bota bota 2503 2503
653 0.0316 0.0232 6238 642 197 145 6238 letgo|latam|ar 172.0 362 ... 0.2732 0.2616 0.0000 0.0000 llanta llanta llanta 653 653

10 rows × 24 columns


In [434]:
search_terms_with_cat[bad_rows][:10]


Out[434]:
reply_conversion research_quote distinct_sessions num_subsequent_replies reply_5min_conversion research_15sec_conversion rows cat_country_sk total_cat_freq first_cat_id ... third_cat_name no_cat_significance first_cat_significance second_cat_significance third_cat_significance search_term ascii singular index singular_pos
18 0.0654 0.0166 53392 12918 3493 885 53392 letgo|latam|ar 9360.0 881 ... 0.2292 0.2081 0.0000 0.0000 bicicleta bicicleta bicicleta 18 18
38 0.0851 0.0092 37229 10627 3167 341 37230 letgo|latam|ar 7240.0 806 ... 0.2291 0.2147 0.0563 0.0000 heladera heladera heladera 38 38
41 0.0589 0.0212 36897 8974 2175 782 36897 letgo|latam|ar 6394.0 362 ... 0.2219 0.1878 0.0619 0.0000 moto moto moto 41 41
65 0.0822 0.0183 31914 11026 2624 584 31914 letgo|latam|ar 23049.0 815 ... Bebés y Niños 0.2135 0.1737 0.1124 0.0765 campera campera campera 65 65
70 0.0246 0.0096 31320 3150 770 301 31320 letgo|latam|ar 1064.0 362 ... 0.2612 0.2481 0.0000 0.0000 camioneta camioneta camioneta 70 70
84 0.0592 0.0079 28551 5772 1690 225 28551 letgo|latam|ar 2078.0 806 ... 0.2353 0.2170 0.0000 0.0000 cocinas cocinas cocina 84 75
100 0.1032 0.0135 25163 11339 2596 339 25163 letgo|latam|ar 11728.0 800 ... 0.2268 0.2136 0.0000 0.0000 celular celular celular 100 100
104 0.0178 0.0086 24726 2884 440 213 24726 letgo|latam|ar 978.0 362 ... 0.2443 0.1973 0.0000 0.0000 autos usados autos usados autos usado 104 72
106 0.0371 0.0077 24115 3105 894 185 24115 letgo|latam|ar 141.0 362 ... 0.2836 0.2836 0.0000 0.0000 ford ranger ford ranger ford ranger 106 106
114 0.0874 0.0225 23281 8584 2034 524 23281 letgo|latam|ar 1516.0 800 ... 0.2717 0.2658 0.0000 0.0000 iphone 6s iphone 6s iphone 6 114 74

10 rows × 24 columns


In [435]:
bad_rows = search_terms_with_cat.index.isin(bad_mask)
clean_df = search_terms_with_cat[~bad_rows]

In [436]:
clean_df[:10]


Out[436]:
reply_conversion research_quote distinct_sessions num_subsequent_replies reply_5min_conversion research_15sec_conversion rows cat_country_sk total_cat_freq first_cat_id ... third_cat_name no_cat_significance first_cat_significance second_cat_significance third_cat_significance search_term ascii singular index singular_pos
0 0.0532 0.0061 118822 19417 6317 729 118822 letgo|latam|ar 2195.0 362 ... 0.2810 0.2756 0.0000 0.0000 fiat 147 fiat 147 fiat 147 0 0
1 0.0481 0.0096 114212 22293 5491 1097 114212 letgo|latam|ar 7510.0 362 ... 0.2194 0.1989 0.0617 0.0000 motos motos moto 1 41
2 0.0613 0.0105 101423 22720 6214 1067 101423 letgo|latam|ar 25689.0 815 ... Bebés y Niños 0.1946 0.1541 0.1271 0.0896 zapatillas zapatillas zapatilla 2 543
3 0.0505 0.0118 97578 17432 4930 1152 97579 letgo|latam|ar 4861.0 881 ... 0.2316 0.2135 0.0000 0.0000 bicicletas bicicletas bicicleta 3 18
4 0.0405 0.0145 97207 15176 3938 1409 97207 letgo|latam|ar 1321.0 362 ... 0.2778 0.2694 0.0000 0.0000 fiat uno fiat uno fiat uno 4 4
5 0.0377 0.0153 88181 19924 3322 1347 88181 letgo|latam|ar 9600.0 362 ... 0.2227 0.1928 0.0589 0.0000 autos autos auto 5 208
6 0.0530 0.0115 83509 16491 4425 958 83509 letgo|latam|ar 1571.0 362 ... 0.2845 0.2762 0.0000 0.0000 gol gol gol 6 6
7 0.0819 0.0101 77226 28572 6322 780 77226 letgo|latam|ar 19082.0 800 ... 0.2274 0.2133 0.0000 0.0000 celulares celulares celular 7 100
8 0.0481 0.0074 75213 11209 3614 557 75213 letgo|latam|ar 1212.0 362 ... 0.2920 0.2904 0.0000 0.0000 fiat 128 fiat 128 fiat 128 8 8
9 0.0506 0.0117 73770 11375 3736 865 73770 letgo|latam|ar 121.0 362 ... 0.3140 0.3057 0.0000 0.0000 toyota hilux toyota hilux toyota hilux 9 9

10 rows × 24 columns


In [437]:
def num_there(s):
    return any(i.isdigit() for i in s)

In [438]:
def ends_with_num_s(s):
    ews = False
    if len(s)>1:
        ews = (s[-2].isdigit()) & (s[-1]=="s")
    return ews

In [439]:
ends_with_num_s('iphons')


Out[439]:
False

In [440]:
search_terms_with_cat['ends_with_number'] = search_terms_with_cat['search_term'].apply(ends_with_num_s)

In [441]:
iphone6s = search_terms_with_cat[search_terms_with_cat['ends_with_number']]

In [442]:
iphone6s[:10]


Out[442]:
reply_conversion research_quote distinct_sessions num_subsequent_replies reply_5min_conversion research_15sec_conversion rows cat_country_sk total_cat_freq first_cat_id ... no_cat_significance first_cat_significance second_cat_significance third_cat_significance search_term ascii singular index singular_pos ends_with_number
114 0.0874 0.0225 23281 8584 2034 524 23281 letgo|latam|ar 1516.0 800 ... 0.2717 0.2658 0.0000 0.0000 iphone 6s iphone 6s iphone 6 114 74 True
166 0.1032 0.0141 17765 7161 1834 250 17765 letgo|latam|ar 1888.0 800 ... 0.2611 0.2537 0.0000 0.0000 iphone 5s iphone 5s iphone 5 166 307 True
1215 0.1062 0.0121 3465 1463 368 42 3465 letgo|latam|ar 301.0 800 ... 0.2724 0.2558 0.0000 0.0000 iphone 4s iphone 4s iphone 4 1215 991 True
4947 0.0344 0.0652 813 105 28 53 813 letgo|latam|ar 15.0 362 ... 0.2000 0.2000 0.2000 0.0000 renault 4s renault 4s renault 4 4947 445 True
8554 0.0584 0.0360 445 93 26 16 445 letgo|latam|ar 25.0 800 ... 0.2800 0.2000 0.0800 0.0800 iphone 7s iphone 7s iphone 7 8554 136 True
8821 0.0837 0.0186 430 170 36 8 430 None NaN None ... None None None None i phone 5s i phone 5s i phone 5 8821 11243 True
13710 0.1732 0.0197 254 183 44 5 254 letgo|latam|ar 54.0 800 ... 0.2962 0.2962 0.0000 0.0000 6s 6s 6 13710 26726 True
15791 0.1395 0.0186 215 165 30 4 215 letgo|latam|ar 12.0 800 ... 0.3333 0.3333 0.0000 0.0000 5s 5s 5 15791 29332 True
16426 0.0728 0.0485 206 57 15 10 206 letgo|latam|ar 13.0 800 ... 0.2307 0.2307 0.1538 0.0000 4s 4s 4 16426 20238 True
16698 0.0746 0.0149 201 42 15 3 201 None NaN None ... None None None None iphones 5s iphones 5s iphones 5 16698 21677 True

10 rows × 25 columns


In [443]:
df = iphone6s.append(clean_df)

In [444]:
df = df.sort_values(by='distinct_sessions', ascending= False)

In [445]:
len(df)


Out[445]:
76436

In [446]:
df[df['search_term'].str.startswith("iph")]['search_term'][:10]


Out[446]:
16             iphone
74           iphone 6
114         iphone 6s
136          iphone 7
166         iphone 5s
166         iphone 5s
278     iphone 6 plus
326     iphone 7 plus
396    iphone 6s plus
714           iphones
Name: search_term, dtype: object

In [447]:
df[df['search_term'].str.startswith("ford r")]['search_term'][:10]


Out[447]:
66                   ford rangers
889               ford ranger 4x4
929                 ford ranchero
2026                ford ranchera
3890    ford ranger cabina simple
4084              ford ranger xlt
4608          ford ranger limited
6206             ford ranger 2012
6577             ford ranger 2013
6800              ford ranger 4x2
Name: search_term, dtype: object

In [448]:
df.columns


Out[448]:
Index([u'ascii', u'cat_country_sk', u'distinct_sessions', u'ends_with_number',
       u'first_cat_id', u'first_cat_name', u'first_cat_significance', u'index',
       u'no_cat_significance', u'num_subsequent_replies',
       u'reply_5min_conversion', u'reply_conversion',
       u'research_15sec_conversion', u'research_quote', u'rows',
       u'search_term', u'second_cat_id', u'second_cat_name',
       u'second_cat_significance', u'singular', u'singular_pos',
       u'third_cat_id', u'third_cat_name', u'third_cat_significance',
       u'total_cat_freq'],
      dtype='object')

In [449]:
df['popularity'] = df["distinct_sessions"]/100000

In [450]:
df["popularity"][df["popularity"]>1.0] = 1.0


/Users/johannes.braun/anaconda3/envs/python2/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [451]:
len(df)


Out[451]:
76436

In [452]:
liquid_kws_df = df[df['reply_conversion']>0]

In [453]:
len(liquid_kws_df)


Out[453]:
63676

In [454]:
liquid_kws_df[:100]


Out[454]:
ascii cat_country_sk distinct_sessions ends_with_number first_cat_id first_cat_name first_cat_significance index no_cat_significance num_subsequent_replies ... second_cat_id second_cat_name second_cat_significance singular singular_pos third_cat_id third_cat_name third_cat_significance total_cat_freq popularity
0 fiat 147 letgo|latam|ar 118822 NaN 362 Vehículos 0.2756 0 0.2810 19417 ... 0 0.0000 fiat 147 0 0 0.0000 2195.0 1.00000
1 motos letgo|latam|ar 114212 NaN 362 Vehículos 0.1989 1 0.2194 22293 ... 800 Electrónica 0.0617 moto 41 0 0.0000 7510.0 1.00000
2 zapatillas letgo|latam|ar 101423 NaN 815 Ropa y Accesorios de Mujer 0.1541 2 0.1946 22720 ... 888 Ropa y Accesorios de Hombre 0.1271 zapatilla 543 853 Bebés y Niños 0.0896 25689.0 1.00000
3 bicicletas letgo|latam|ar 97578 NaN 881 Deportes y Hobbies 0.2135 3 0.2316 17432 ... 0 0.0000 bicicleta 18 0 0.0000 4861.0 0.97578
4 fiat uno letgo|latam|ar 97207 NaN 362 Vehículos 0.2694 4 0.2778 15176 ... 0 0.0000 fiat uno 4 0 0.0000 1321.0 0.97207
5 autos letgo|latam|ar 88181 NaN 362 Vehículos 0.1928 5 0.2227 19924 ... 853 Bebés y Niños 0.0589 auto 208 0 0.0000 9600.0 0.88181
6 gol letgo|latam|ar 83509 NaN 362 Vehículos 0.2762 6 0.2845 16491 ... 0 0.0000 gol 6 0 0.0000 1571.0 0.83509
7 celulares letgo|latam|ar 77226 NaN 800 Electrónica 0.2133 7 0.2274 28572 ... 0 0.0000 celular 100 0 0.0000 19082.0 0.77226
8 fiat 128 letgo|latam|ar 75213 NaN 362 Vehículos 0.2904 8 0.2920 11209 ... 0 0.0000 fiat 128 8 0 0.0000 1212.0 0.75213
9 toyota hilux letgo|latam|ar 73770 NaN 362 Vehículos 0.3057 9 0.3140 11375 ... 0 0.0000 toyota hilux 9 0 0.0000 121.0 0.73770
10 renault 12 letgo|latam|ar 72740 NaN 362 Vehículos 0.2814 10 0.2894 12301 ... 0 0.0000 renault 12 10 0 0.0000 995.0 0.72740
11 ford falcon letgo|latam|ar 71176 NaN 362 Vehículos 0.2744 11 0.2800 7298 ... 0 0.0000 ford falcon 11 0 0.0000 725.0 0.71176
12 corsa letgo|latam|ar 70450 NaN 362 Vehículos 0.2667 12 0.2740 13051 ... 0 0.0000 corsa 12 0 0.0000 1091.0 0.70450
13 ford ka letgo|latam|ar 70059 NaN 362 Vehículos 0.2607 13 0.2702 11804 ... 0 0.0000 ford ka 13 0 0.0000 1047.0 0.70059
14 volkswagen gol letgo|latam|ar 68971 NaN 362 Vehículos 0.2601 14 0.2926 11528 ... 0 0.0000 volkswagen gol 14 0 0.0000 123.0 0.68971
15 fiat 600 letgo|latam|ar 62422 NaN 362 Vehículos 0.2590 15 0.2618 8112 ... 0 0.0000 fiat 600 15 0 0.0000 1795.0 0.62422
16 iphone letgo|latam|ar 56656 NaN 800 Electrónica 0.2446 16 0.2546 20035 ... 0 0.0000 iphone 16 0 0.0000 10135.0 0.56656
17 peugeot 206 letgo|latam|ar 56381 NaN 362 Vehículos 0.2595 17 0.2745 8190 ... 0 0.0000 peugeot 206 17 0 0.0000 601.0 0.56381
19 ford f100 letgo|latam|ar 53296 NaN 362 Vehículos 0.2477 19 0.2566 6223 ... 0 0.0000 ford f100 19 0 0.0000 226.0 0.53296
20 camionetas letgo|latam|ar 53051 NaN 362 Vehículos 0.2478 20 0.2631 6165 ... 0 0.0000 camioneta 70 0 0.0000 1049.0 0.53051
21 motos 110 letgo|latam|ar 48876 NaN 362 Vehículos 0.2568 21 0.2684 10955 ... 0 0.0000 motos 110 21 0 0.0000 950.0 0.48876
22 renault clio letgo|latam|ar 48149 NaN 362 Vehículos 0.2668 22 0.2849 7363 ... 0 0.0000 renault clio 22 0 0.0000 386.0 0.48149
23 chevrolet corsa letgo|latam|ar 47911 NaN 362 Vehículos 0.2933 23 0.3016 7030 ... 0 0.0000 chevrolet corsa 23 0 0.0000 242.0 0.47911
24 peugeot 504 letgo|latam|ar 46143 NaN 362 Vehículos 0.2854 24 0.2917 6764 ... 0 0.0000 peugeot 504 24 0 0.0000 634.0 0.46143
25 clio letgo|latam|ar 45201 NaN 362 Vehículos 0.2781 25 0.2837 8477 ... 0 0.0000 clio 25 0 0.0000 888.0 0.45201
26 fiat palio letgo|latam|ar 43716 NaN 362 Vehículos 0.2801 26 0.2862 6566 ... 0 0.0000 fiat palio 26 0 0.0000 489.0 0.43716
27 bora letgo|latam|ar 43593 NaN 362 Vehículos 0.2620 27 0.2720 5728 ... 0 0.0000 bora 27 0 0.0000 500.0 0.43593
28 sillones letgo|latam|ar 42457 NaN 806 Hogar y Jardín 0.2345 28 0.2483 6161 ... 0 0.0000 sillón False 0 0.0000 4618.0 0.42457
29 f100 letgo|latam|ar 41482 NaN 362 Vehículos 0.2567 29 0.2719 5036 ... 0 0.0000 f100 29 0 0.0000 331.0 0.41482
30 bmw letgo|latam|ar 40963 NaN 362 Vehículos 0.2361 30 0.2515 3958 ... 0 0.0000 bmw 30 0 0.0000 652.0 0.40963
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
75 cocina letgo|latam|ar 30604 NaN 806 Hogar y Jardín 0.2091 75 0.2223 6821 ... 0 0.0000 cocina 75 0 0.0000 5450.0 0.30604
76 sillon letgo|latam|ar 30456 NaN 806 Hogar y Jardín 0.2356 76 0.2461 4981 ... 0 0.0000 sillon 76 0 0.0000 6942.0 0.30456
77 samsung j7 letgo|latam|ar 30426 NaN 800 Electrónica 0.2682 77 0.2769 9416 ... 0 0.0000 samsung j7 77 0 0.0000 1394.0 0.30426
78 play 3 letgo|latam|ar 29986 NaN 811 Gaming 0.1952 78 0.2211 8741 ... 800 Electrónica 0.0862 play 3 78 0 0.0000 3523.0 0.29986
79 permuto letgo|latam|ar 29762 NaN 800 Electrónica 0.1147 79 0.1681 14080 ... 362 Vehículos 0.0586 permuto 79 0 0.0000 4829.0 0.29762
80 gol trend letgo|latam|ar 29505 NaN 362 Vehículos 0.2721 80 0.2819 4328 ... 0 0.0000 gol trend 80 0 0.0000 305.0 0.29505
81 zapatillas nike letgo|latam|ar 29405 NaN 815 Ropa y Accesorios de Mujer 0.1589 81 0.1962 6620 ... 888 Ropa y Accesorios de Hombre 0.1429 zapatillas nike 81 853 Bebés y Niños 0.0678 6247.0 0.29405
82 puertas letgo|latam|ar 28844 NaN 806 Hogar y Jardín 0.2025 82 0.2149 4139 ... 0 0.0000 puerta 251 0 0.0000 2019.0 0.28844
83 focus letgo|latam|ar 28839 NaN 362 Vehículos 0.2608 83 0.2771 3424 ... 0 0.0000 focu 34324 0 0.0000 184.0 0.28839
85 cuatriciclos letgo|latam|ar 28278 NaN 362 Vehículos 0.1889 85 0.2119 3428 ... 853 Bebés y Niños 0.0829 cuatriciclo 259 0 0.0000 217.0 0.28278
86 chevrolet s10 letgo|latam|ar 28095 NaN 362 Vehículos 0.3333 86 0.3333 3571 ... 0 0.0000 chevrolet s10 86 0 0.0000 21.0 0.28095
87 peugeot letgo|latam|ar 28091 NaN 362 Vehículos 0.2511 87 0.2677 3998 ... 0 0.0000 peugeot 87 0 0.0000 661.0 0.28091
88 falcon letgo|latam|ar 27845 NaN 362 Vehículos 0.2482 88 0.2646 3480 ... 0 0.0000 falcon 88 0 0.0000 733.0 0.27845
89 perros letgo|latam|ar 27760 NaN 887 Otros 0.1723 89 0.2067 3266 ... 806 Hogar y Jardín 0.0700 perro 707 0 0.0000 2298.0 0.27760
90 volkswagen saveiro letgo|latam|ar 27751 NaN 362 Vehículos 0.3333 90 0.3333 3571 ... 0 0.0000 volkswagen saveiro 90 0 0.0000 9.0 0.27751
91 fox letgo|latam|ar 27227 NaN 362 Vehículos 0.2123 91 0.2486 4146 ... 888 Ropa y Accesorios de Hombre 0.0544 fox 91 0 0.0000 551.0 0.27227
92 patines letgo|latam|ar 27111 NaN 881 Deportes y Hobbies 0.1766 92 0.2011 4075 ... 853 Bebés y Niños 0.0887 patín False 815 Ropa y Accesorios de Mujer 0.0654 3605.0 0.27111
93 chevrolet letgo|latam|ar 27107 NaN 362 Vehículos 0.2512 93 0.2564 3246 ... 0 0.0000 chevrolet 93 0 0.0000 390.0 0.27107
94 ps3 letgo|latam|ar 27032 NaN 811 Gaming 0.1992 94 0.2232 7480 ... 800 Electrónica 0.0827 ps3 94 0 0.0000 3579.0 0.27032
95 honda tornado letgo|latam|ar 26640 NaN 362 Vehículos 0.2938 95 0.2938 4807 ... 0 0.0000 honda tornado 95 0 0.0000 194.0 0.26640
96 casa letgo|latam|ar 26143 NaN 806 Hogar y Jardín 0.0892 96 0.1946 2382 ... 800 Electrónica 0.0548 casa 96 853 Bebés y Niños 0.0516 930.0 0.26143
97 kangoo letgo|latam|ar 25857 NaN 362 Vehículos 0.2747 97 0.2852 4306 ... 0 0.0000 kangoo 97 0 0.0000 666.0 0.25857
98 alquiler departamentos None 25768 NaN None None None 98 None 10917 ... None None None alquiler departamento 1545 None None None NaN 0.25768
99 lanchas letgo|latam|ar 25689 NaN 362 Vehículos 0.2041 99 0.2145 2165 ... 881 Deportes y Hobbies 0.0726 lancha 167 887 Otros 0.0519 289.0 0.25689
101 ford ecosports None 24996 NaN None None None 101 None 3227 ... None None None ford ecosport 799 None None None NaN 0.24996
102 peugeot 307 letgo|latam|ar 24948 NaN 362 Vehículos 0.2901 102 0.2941 2654 ... 0 0.0000 peugeot 307 102 0 0.0000 255.0 0.24948
103 netbook letgo|latam|ar 24763 NaN 800 Electrónica 0.2289 103 0.2383 6260 ... 0 0.0000 netbook 103 0 0.0000 4066.0 0.24763
105 torino letgo|latam|ar 24512 NaN 362 Vehículos 0.2442 105 0.2519 1946 ... 0 0.0000 torino 105 0 0.0000 389.0 0.24512
107 peugeot 207 letgo|latam|ar 24105 NaN 362 Vehículos 0.2978 107 0.3262 2805 ... 0 0.0000 peugeot 207 107 0 0.0000 141.0 0.24105
108 palio letgo|latam|ar 23963 NaN 362 Vehículos 0.2665 108 0.2705 4865 ... 0 0.0000 palio 108 0 0.0000 499.0 0.23963

100 rows × 26 columns


In [455]:
export = liquid_kws_df[['search_term', 'popularity', 'reply_conversion', 'research_quote', 'cat_country_sk', 
             'no_cat_significance', 'first_cat_significance', 'second_cat_significance', 'third_cat_significance',
            'first_cat_id', 'second_cat_id', 'third_cat_id','first_cat_name', 'second_cat_name', 'third_cat_name']]

In [456]:
export[:50]


Out[456]:
search_term popularity reply_conversion research_quote cat_country_sk no_cat_significance first_cat_significance second_cat_significance third_cat_significance first_cat_id second_cat_id third_cat_id first_cat_name second_cat_name third_cat_name
0 fiat 147 1.00000 0.0532 0.0061 letgo|latam|ar 0.2810 0.2756 0.0000 0.0000 362 0 0 Vehículos
1 motos 1.00000 0.0481 0.0096 letgo|latam|ar 0.2194 0.1989 0.0617 0.0000 362 800 0 Vehículos Electrónica
2 zapatillas 1.00000 0.0613 0.0105 letgo|latam|ar 0.1946 0.1541 0.1271 0.0896 815 888 853 Ropa y Accesorios de Mujer Ropa y Accesorios de Hombre Bebés y Niños
3 bicicletas 0.97578 0.0505 0.0118 letgo|latam|ar 0.2316 0.2135 0.0000 0.0000 881 0 0 Deportes y Hobbies
4 fiat uno 0.97207 0.0405 0.0145 letgo|latam|ar 0.2778 0.2694 0.0000 0.0000 362 0 0 Vehículos
5 autos 0.88181 0.0377 0.0153 letgo|latam|ar 0.2227 0.1928 0.0589 0.0000 362 853 0 Vehículos Bebés y Niños
6 gol 0.83509 0.0530 0.0115 letgo|latam|ar 0.2845 0.2762 0.0000 0.0000 362 0 0 Vehículos
7 celulares 0.77226 0.0819 0.0101 letgo|latam|ar 0.2274 0.2133 0.0000 0.0000 800 0 0 Electrónica
8 fiat 128 0.75213 0.0481 0.0074 letgo|latam|ar 0.2920 0.2904 0.0000 0.0000 362 0 0 Vehículos
9 toyota hilux 0.73770 0.0506 0.0117 letgo|latam|ar 0.3140 0.3057 0.0000 0.0000 362 0 0 Vehículos
10 renault 12 0.72740 0.0522 0.0083 letgo|latam|ar 0.2894 0.2814 0.0000 0.0000 362 0 0 Vehículos
11 ford falcon 0.71176 0.0320 0.0059 letgo|latam|ar 0.2800 0.2744 0.0000 0.0000 362 0 0 Vehículos
12 corsa 0.70450 0.0509 0.0101 letgo|latam|ar 0.2740 0.2667 0.0000 0.0000 362 0 0 Vehículos
13 ford ka 0.70059 0.0518 0.0084 letgo|latam|ar 0.2702 0.2607 0.0000 0.0000 362 0 0 Vehículos
14 volkswagen gol 0.68971 0.0440 0.0231 letgo|latam|ar 0.2926 0.2601 0.0000 0.0000 362 0 0 Vehículos
15 fiat 600 0.62422 0.0436 0.0070 letgo|latam|ar 0.2618 0.2590 0.0000 0.0000 362 0 0 Vehículos
16 iphone 0.56656 0.0926 0.0158 letgo|latam|ar 0.2546 0.2446 0.0000 0.0000 800 0 0 Electrónica
17 peugeot 206 0.56381 0.0397 0.0196 letgo|latam|ar 0.2745 0.2595 0.0000 0.0000 362 0 0 Vehículos
19 ford f100 0.53296 0.0362 0.0060 letgo|latam|ar 0.2566 0.2477 0.0000 0.0000 362 0 0 Vehículos
20 camionetas 0.53051 0.0274 0.0103 letgo|latam|ar 0.2631 0.2478 0.0000 0.0000 362 0 0 Vehículos
21 motos 110 0.48876 0.0664 0.0055 letgo|latam|ar 0.2684 0.2568 0.0000 0.0000 362 0 0 Vehículos
22 renault clio 0.48149 0.0415 0.0192 letgo|latam|ar 0.2849 0.2668 0.0000 0.0000 362 0 0 Vehículos
23 chevrolet corsa 0.47911 0.0401 0.0124 letgo|latam|ar 0.3016 0.2933 0.0000 0.0000 362 0 0 Vehículos
24 peugeot 504 0.46143 0.0417 0.0099 letgo|latam|ar 0.2917 0.2854 0.0000 0.0000 362 0 0 Vehículos
25 clio 0.45201 0.0516 0.0118 letgo|latam|ar 0.2837 0.2781 0.0000 0.0000 362 0 0 Vehículos
26 fiat palio 0.43716 0.0395 0.0124 letgo|latam|ar 0.2862 0.2801 0.0000 0.0000 362 0 0 Vehículos
27 bora 0.43593 0.0444 0.0100 letgo|latam|ar 0.2720 0.2620 0.0000 0.0000 362 0 0 Vehículos
28 sillones 0.42457 0.0430 0.0078 letgo|latam|ar 0.2483 0.2345 0.0000 0.0000 806 0 0 Hogar y Jardín
29 f100 0.41482 0.0380 0.0069 letgo|latam|ar 0.2719 0.2567 0.0000 0.0000 362 0 0 Vehículos
30 bmw 0.40963 0.0293 0.0110 letgo|latam|ar 0.2515 0.2361 0.0000 0.0000 362 0 0 Vehículos
31 fiat 0.40957 0.0411 0.0224 letgo|latam|ar 0.2605 0.2498 0.0000 0.0000 362 0 0 Vehículos
32 saveiro 0.40692 0.0446 0.0099 letgo|latam|ar 0.3144 0.3109 0.0000 0.0000 362 0 0 Vehículos
33 heladeras 0.40065 0.0729 0.0069 letgo|latam|ar 0.2263 0.2091 0.0612 0.0000 806 800 0 Hogar y Jardín Electrónica
34 masajes 0.40001 0.0589 0.0102 letgo|latam|ar 0.1894 0.0912 0.0807 0.0666 806 887 815 Hogar y Jardín Otros Ropa y Accesorios de Mujer
35 ps4 0.39890 0.0804 0.0090 letgo|latam|ar 0.2296 0.2030 0.0961 0.0000 811 800 0 Gaming Electrónica
36 renault 9 0.39748 0.0518 0.0074 letgo|latam|ar 0.2782 0.2739 0.0000 0.0000 362 0 0 Vehículos
37 jeep 0.39597 0.0244 0.0117 letgo|latam|ar 0.2527 0.2416 0.0000 0.0000 362 0 0 Vehículos
39 botas 0.37048 0.0656 0.0117 letgo|latam|ar 0.2305 0.2193 0.0687 0.0506 815 853 888 Ropa y Accesorios de Mujer Bebés y Niños Ropa y Accesorios de Hombre
40 llantas 0.36927 0.0547 0.0135 letgo|latam|ar 0.2639 0.2562 0.0000 0.0000 362 0 0 Vehículos
42 ford focus 0.36850 0.0306 0.0073 letgo|latam|ar 0.2877 0.2625 0.0000 0.0000 362 0 0 Vehículos
43 amarok 0.36743 0.0441 0.0084 letgo|latam|ar 0.3004 0.2906 0.0000 0.0000 362 0 0 Vehículos
44 sillas 0.36720 0.0514 0.0092 letgo|latam|ar 0.2321 0.2188 0.0000 0.0000 806 0 0 Hogar y Jardín
45 muebles 0.36608 0.0359 0.0092 letgo|latam|ar 0.2254 0.2131 0.0000 0.0000 806 0 0 Hogar y Jardín
46 renault 19 0.36166 0.0466 0.0085 letgo|latam|ar 0.2906 0.2889 0.0000 0.0000 362 0 0 Vehículos
47 vento 0.35676 0.0387 0.0094 letgo|latam|ar 0.3036 0.2827 0.0000 0.0000 362 0 0 Vehículos
48 botines 0.35620 0.0739 0.0119 letgo|latam|ar 0.1678 0.1455 0.0997 0.0809 888 881 815 Ropa y Accesorios de Hombre Deportes y Hobbies Ropa y Accesorios de Mujer
49 toyota 0.34473 0.0289 0.0162 letgo|latam|ar 0.2795 0.2639 0.0000 0.0000 362 0 0 Vehículos
50 renault kangoo 0.34215 0.0421 0.0062 letgo|latam|ar 0.3442 0.3278 0.0000 0.0000 362 0 0 Vehículos
51 toyota corolla 0.34042 0.0353 0.0086 letgo|latam|ar 0.2525 0.2323 0.0000 0.0000 362 0 0 Vehículos
52 lavarropas 0.33703 0.0830 0.0064 letgo|latam|ar 0.2508 0.2429 0.0000 0.0000 806 0 0 Hogar y Jardín

In [463]:
synonyms_to_generate = search_terms_with_cat[bad_rows]
synonyms_to_generate = synonyms_to_generate[synonyms_to_generate['search_term']!=synonyms_to_generate['singular']]
synonyms_to_generate[['singular','search_term']]


/Users/johannes.braun/anaconda3/envs/python2/lib/python2.7/site-packages/pandas/core/ops.py:737: UnicodeWarning: Unicode unequal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
  result = lib.vec_compare(x, y, op)
Out[463]:
singular search_term
84 cocina cocinas
104 autos usado autos usados
114 iphone 6 iphone 6s
159 mesa mesas
179 chocado chocados
222 notebook notebooks
244 casa casas
260 tablet tablets
311 guitarra guitarras
319 caballo caballos
337 fiat duna fiat dunas
341 fiat uno fiat unos
350 tractor tractores
370 ropero roperos
372 alquiler alquileres
383 casco cascos
390 escritorio escritorios
393 trabajo trabajos
407 fondo de comercio fondo de comercios
418 salamandra salamandras
427 estereo estereos
435 ford fiesta ford fiestas
441 balanza balanzas
442 cachorro cachorros
492 trailer trailers
510 estufa estufas
567 alacena alacenas
572 bateria baterias
577 potencia potencias
590 freezer freezers
... ... ...
82130 mameluco termico mameluco térmico
82131 lustradora de auto lustradora de autos
82154 led para moto led para motos
82178 repuestos de falcon repuestos de falcón
82184 heladera batea heladera bateas
82187 peugeot chocado peugeot chocados
82192 estetica esteticas
82199 mochila tactica mochila táctica
82200 pala cargadora pala cargadoras
82201 zapatillas bebe zapatillas bebé
82226 tricargo tricargos
82227 reloj para hombre reloj para hombres
82238 tanque de nafta moto tanque de nafta motos
82272 cosas para bebe cosas para bebé
82288 botines de papi futbol botines de papi fútbol
82347 scooter moto scooter motos
82367 camioneta chocada camioneta chocadas
82382 sillon de algarrobo sillón de algarrobo
82388 liquido urgente líquido urgente
82394 panaderia panaderías
82416 bajo mesada y alacena bajo mesada y alacenas
82431 maquina de cortar cesped máquina de cortar cesped
82485 cubierta usada cubierta usadas
82491 camion mercedes benz 1518 camión mercedes benz 1518
82505 alquiler sin garantia alquiler sin garantía
82509 auto roto auto rotos
82531 tablet rota tablet rotas
82624 sacos para hombre sacos para hombres
82636 estereo peugeot estéreo peugeot
82653 autos en oferta autos en ofertas

4378 rows × 2 columns


In [468]:
export.columns


Out[468]:
Index([u'search_term', u'popularity', u'reply_conversion', u'research_quote',
       u'cat_country_sk', u'no_cat_significance', u'first_cat_significance',
       u'second_cat_significance', u'third_cat_significance', u'first_cat_id',
       u'second_cat_id', u'third_cat_id', u'first_cat_name',
       u'second_cat_name', u'third_cat_name'],
      dtype='object')

In [470]:
export = export.rename( columns={"search_term": "query"})

In [471]:
len(export)


Out[471]:
63676

In [481]:
type(export['popularity'][0])


Out[481]:
numpy.float64

In [482]:
type(export['first_cat_significance'][0])


Out[482]:
decimal.Decimal

In [484]:
export['first_cat_significance'] = export['first_cat_significance'].astype(float)

In [485]:
export['no_cat_significance'] = export['no_cat_significance'].astype(float)
export['reply_conversion'] = export['reply_conversion'].astype(float)
export['research_quote'] = export['research_quote'].astype(float)
export['second_cat_significance'] = export['second_cat_significance'].astype(float)
export['third_cat_significance'] = export['third_cat_significance'].astype(float)

In [508]:
import numpy as np
export['first_cat_id'] = export['first_cat_id'].replace('0', np.nan)
export['second_cat_id'] = export['second_cat_id'].replace('0', np.nan)
export['third_cat_id'] = export['third_cat_id'].replace('0', np.nan)

In [507]:
export['first_cat_significance'][export['first_cat_significance']<=0] = None
export['second_cat_significance'][export['second_cat_significance']<=0] = None
export['third_cat_significance'][export['third_cat_significance']<=0] = None


/Users/johannes.braun/anaconda3/envs/python2/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
/Users/johannes.braun/anaconda3/envs/python2/lib/python2.7/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
/Users/johannes.braun/anaconda3/envs/python2/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()

In [ ]:
export['third_cat_significance'][export['third_cat_significance']<=0] = None

In [472]:
export.to_csv("liquid_kws_ar_v2.csv", index=False)

In [513]:
sample_json = []
for i, s in export[0:50000].iterrows():
    d = s.dropna().to_dict()
    sample_json.append(d)

In [563]:
import json
with open('popular_queries.json', 'w') as fp:
    json.dump(sample_json, fp)

In [564]:
pwd


Out[564]:
u'/Users/johannes.braun/dev/solr-6.3.0'

In [565]:
ls


CHANGES.txt                     example/
LICENSE.txt                     image_name.jpg
LUCENE_CHANGES.txt              licenses/
NOTICE.txt                      liquid_kws_ar_v2.csv
README.txt                      plural singular synonyms.ipynb
bin/                            popular_queries.json
contrib/                        server/
dist/                           twiggle python 2.ipynb
docs/

In [514]:
sample_json[:5]


Out[514]:
[{'cat_country_sk': 'letgo|latam|ar',
  'first_cat_id': '362',
  'first_cat_name': 'Veh\xc3\xadculos',
  'first_cat_significance': 0.2756,
  'no_cat_significance': 0.281,
  'popularity': 1.0,
  'query': 'fiat 147',
  'reply_conversion': 0.0532,
  'research_quote': 0.0061,
  'second_cat_name': '',
  'third_cat_name': ''},
 {'cat_country_sk': 'letgo|latam|ar',
  'first_cat_id': '362',
  'first_cat_name': 'Veh\xc3\xadculos',
  'first_cat_significance': 0.1989,
  'no_cat_significance': 0.2194,
  'popularity': 1.0,
  'query': 'motos',
  'reply_conversion': 0.0481,
  'research_quote': 0.0096,
  'second_cat_id': '800',
  'second_cat_name': 'Electr\xc3\xb3nica',
  'second_cat_significance': 0.0617,
  'third_cat_name': ''},
 {'cat_country_sk': 'letgo|latam|ar',
  'first_cat_id': '815',
  'first_cat_name': 'Ropa y Accesorios de Mujer',
  'first_cat_significance': 0.1541,
  'no_cat_significance': 0.1946,
  'popularity': 1.0,
  'query': 'zapatillas',
  'reply_conversion': 0.0613,
  'research_quote': 0.0105,
  'second_cat_id': '888',
  'second_cat_name': 'Ropa y Accesorios de Hombre',
  'second_cat_significance': 0.1271,
  'third_cat_id': '853',
  'third_cat_name': 'Beb\xc3\xa9s y Ni\xc3\xb1os',
  'third_cat_significance': 0.0896},
 {'cat_country_sk': 'letgo|latam|ar',
  'first_cat_id': '881',
  'first_cat_name': 'Deportes y Hobbies',
  'first_cat_significance': 0.2135,
  'no_cat_significance': 0.2316,
  'popularity': 0.97578,
  'query': 'bicicletas',
  'reply_conversion': 0.0505,
  'research_quote': 0.0118,
  'second_cat_name': '',
  'third_cat_name': ''},
 {'cat_country_sk': 'letgo|latam|ar',
  'first_cat_id': '362',
  'first_cat_name': 'Veh\xc3\xadculos',
  'first_cat_significance': 0.2694,
  'no_cat_significance': 0.2778,
  'popularity': 0.97207,
  'query': 'fiat uno',
  'reply_conversion': 0.0405,
  'research_quote': 0.0145,
  'second_cat_name': '',
  'third_cat_name': ''}]

In [631]:
import requests
def index_solr_list_of_docs(sample_json):
    url = 'http://localhost:8983/solr/popular_queries/update?commit=true'
    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
    r = requests.post(url, headers = headers, 
                  data = json.dumps(sample_json)) 
    return r

In [632]:
import json
r = index_solr_list_of_docs(sample_json[:5])

In [633]:
r.json()


Out[633]:
{u'responseHeader': {u'QTime': 10, u'status': 0}}

In [595]:
import requests
def suggest(search_term):
    #url = 'http://localhost:8983/solr/solrpedia_instant/select?q=query_ngram:'+search_term+'&%20sort=popularity%20desc&rows=10&fl=query&wt=json'
    url = 'http://localhost:8983/solr/popular_queries/select?q=query_edgengram:'+search_term+'&%20sort=product(popularity,reply_conversion)%20desc&rows=10&fl=query&wt=json'
    url2 = 'http://localhost:8983/solr/popular_queries/select?q=query_edgengram:'+search_term +'&%20sort='\
            'product(popularity,reply_conversion,pow(strdist(%22'+search_term+'%22,query,ngram),2))%20desc'\
            '&rows=10&fl=query&wt=json&fl=query,popularity,reply_conversion,research_quote,'\
            'first_cat_name,second_cat_name,second_cat_significance,first_cat_significance,'\
            'product(popularity,reply_conversion,pow(strdist(%22'+search_term+'%22,query,ngram),2)),'\
            'strdist(%22'+search_term+'%22,query,ngram),'\
            'pow(strdist(%22'+search_term+'%22,query,ngram),2),score&debugQuery=on'
    r = requests.get(url2).json()
    return [d.get('query', 'error')for d in r.get('response', {}).get('docs',{})], url2

In [611]:
import requests
def suggest(search_term, field):
    #url = 'http://localhost:8983/solr/solrpedia_instant/select?q=query_ngram:'+search_term+'&%20sort=popularity%20desc&rows=10&fl=query&wt=json'
    url = 'http://localhost:8983/solr/popular_queries/select?q='+field+':'+search_term+'&%20sort=product(popularity,reply_conversion)%20desc&rows=10&fl=query&wt=json'
    url2 = 'http://localhost:8983/solr/popular_queries/select?q='+field+':'+search_term +'&%20sort='\
            'product(popularity,reply_conversion,pow(strdist(%22'+search_term+'%22,query,ngram),2))%20desc'\
            '&rows=10&fl=query&wt=json&fl=query,popularity,reply_conversion,research_quote,'\
            'first_cat_name,second_cat_name,second_cat_significance,first_cat_significance,'\
            'product(popularity,reply_conversion,pow(strdist(%22'+search_term+'%22,query,ngram),2)),'\
            'strdist(%22'+search_term+'%22,query,ngram),'\
            'pow(strdist(%22'+search_term+'%22,query,ngram),2),score&debugQuery=on'
    r = requests.get(url2).json()
    return [d.get('query', 'error')for d in r.get('response', {}).get('docs',{})], url2

In [596]:
import requests
def suggest_sim(search_term):
    #url = 'http://localhost:8983/solr/solrpedia_instant/select?q=query_ngram:'+search_term+'&%20sort=popularity%20desc&rows=10&fl=query&wt=json'
    url2 = 'http://localhost:8983/solr/popular_queries/select?q=query_ngram:'+search_term +'&%20sort='\
            'product(popularity,pow(strdist(%22'+search_term+'%22,query,ngram),10))%20desc'\
            '&rows=10&fl=query&wt=json&fl=query,popularity,'\
            'product(popularity,pow(strdist(%22'+search_term+'%22,query,ngram),10)),'\
            'strdist(%22'+search_term+'%22,query,ngram),score&debugQuery=on'
    r = requests.get(url2).json()
    return [d.get('query', 'error')for d in r.get('response', {}).get('docs',{})], url2

In [627]:
s = suggest('onda', 'query_ngram')
for n in s[0]:
    print(n)
print(s[1])


honda
honda wave
honda biz
microondas
honda tornado
honda titan
honda civic
honda twister
honda fit
honda xr
http://localhost:8983/solr/popular_queries/select?q=query_ngram:onda&%20sort=product(popularity,reply_conversion,pow(strdist(%22onda%22,query,ngram),2))%20desc&rows=10&fl=query&wt=json&fl=query,popularity,reply_conversion,research_quote,first_cat_name,second_cat_name,second_cat_significance,first_cat_significance,product(popularity,reply_conversion,pow(strdist(%22onda%22,query,ngram),2)),strdist(%22onda%22,query,ngram),pow(strdist(%22onda%22,query,ngram),2),score&debugQuery=on

In [628]:
spell_check('qdida', 'olx')


http://localhost:8983/solr/olx/spell?df=query&spellcheck.q=qdida&spellcheck=true&spellcheck.collateParam.q.op=AND&wt=json
Out[628]:
(u'queda',
 [{u'endOffset': 5,
   u'numFound': 8,
   u'origFreq': 0,
   u'startOffset': 0,
   u'suggestion': [{u'freq': 7, u'word': u'adida'},
    {u'freq': 161, u'word': u'queda'},
    {u'freq': 3, u'word': u'quita'},
    {u'freq': 1, u'word': u'qdaba'},
    {u'freq': 1, u'word': u'quiza'},
    {u'freq': 3, u'word': u'qeda'},
    {u'freq': 1, u'word': u'quda'},
    {u'freq': 1, u'word': u'quid'}],
   'token': u'qdida'}])

In [535]:
matches = []
for i, p in export[:1000].iterrows():
    for l in [3]:
        match = p['query'] in suggest(p['query'][:l])
        if not match:
            print(p['query'], suggest(p['query'][:l]))
        matches.append(match)


('fiat punto', [u'fiat', u'fiat 147', u'fiat uno', u'fiat 128', u'fiat 600', u'fiat duna', u'fiat palio', u'fiat siena', u'fiat 1', u'fiat 125'])
('motorhome', [u'motos', u'moto g', u'motos 110', u'moto 110', u'moto g3', u'moto g4', u'moto x', u'motor', u'motorola', u'moto g2'])
('volkswagen vento', [u'volkswagen gol', u'volvo', u'volkswagen bora', u'volkswagen saveiro', u'volkswagen gol trend', u'volkswagen fox', u'volkswagen amarok', u'volkswagen', u'volkswagen golf', u'volcom'])
('fiat uno scr', [u'fiat', u'fiat 147', u'fiat uno', u'fiat 128', u'fiat 600', u'fiat duna', u'fiat palio', u'fiat siena', u'fiat 1', u'fiat 125'])
('peugeot 308', [u'peugeot 206', u'peugeot', u'peugeot 504', u'peugeot 405', u'peugeot 306', u'peugeot 207', u'peugeot 307', u'peugeot partner', u'peugeot 505', u'peugeot 205'])
('chevrolet cruze', [u'chevy', u'chevrolet', u'chevrolet corsa', u'chevrolet s10', u'chery', u'cher', u'chevrolet c10', u'chevrolet 400', u'chevrolet astra', u'cheeky'])
('fiat fiorino', [u'fiat', u'fiat 147', u'fiat uno', u'fiat 128', u'fiat 600', u'fiat duna', u'fiat palio', u'fiat siena', u'fiat 1', u'fiat 125'])
('peugeot 208', [u'peugeot 206', u'peugeot', u'peugeot 504', u'peugeot 405', u'peugeot 306', u'peugeot 207', u'peugeot 307', u'peugeot partner', u'peugeot 505', u'peugeot 205'])
('ford f 100', [u'ford ka', u'ford', u'ford f100', u'ford falcon', u'ford escort', u'ford fiesta', u'ford focus', u'ford rangers', u'ford taunus', u'ford ecosports'])
('campera de cuero', [u'camas', u'camperas', u'camisa', u'camionetas', u'camiones', u'camion', u'camara', u'camiseta', u'cambio', u'camperon'])
('renault trafic', [u'renault 12', u'renault 9', u'renault 19', u'renault', u'renault clio', u'renault 11', u'renault 18', u'renault kangoo', u'renault megane', u'renault 21'])
('play station 4', [u'play 3', u'play 2', u'play 4', u'play', u'placard', u'plan', u'plancha', u'play station 2', u'planchita', u'playera'])
('volkswagen suran', [u'volkswagen gol', u'volvo', u'volkswagen bora', u'volkswagen saveiro', u'volkswagen gol trend', u'volkswagen fox', u'volkswagen amarok', u'volkswagen', u'volkswagen golf', u'volcom'])
('fiat strada', [u'fiat', u'fiat 147', u'fiat uno', u'fiat 128', u'fiat 600', u'fiat duna', u'fiat palio', u'fiat siena', u'fiat 1', u'fiat 125'])
('play station 3', [u'play 3', u'play 2', u'play 4', u'play', u'placard', u'plan', u'plancha', u'play station 2', u'planchita', u'playera'])
('volkswagen voyage', [u'volkswagen gol', u'volvo', u'volkswagen bora', u'volkswagen saveiro', u'volkswagen gol trend', u'volkswagen fox', u'volkswagen amarok', u'volkswagen', u'volkswagen golf', u'volcom'])
('chevrolet onix', [u'chevy', u'chevrolet', u'chevrolet corsa', u'chevrolet s10', u'chery', u'cher', u'chevrolet c10', u'chevrolet 400', u'chevrolet astra', u'cheeky'])
('ford-f100', [u'ford ka', u'ford', u'ford f100', u'ford falcon', u'ford escort', u'ford fiesta', u'ford focus', u'ford rangers', u'ford taunus', u'ford ecosports'])
('renault duster', [u'renault 12', u'renault 9', u'renault 19', u'renault', u'renault clio', u'renault 11', u'renault 18', u'renault kangoo', u'renault megane', u'renault 21'])
('peugeot 408', [u'peugeot 206', u'peugeot', u'peugeot 504', u'peugeot 405', u'peugeot 306', u'peugeot 207', u'peugeot 307', u'peugeot partner', u'peugeot 505', u'peugeot 205'])
('ford fiesta kinetic', [u'ford ka', u'ford', u'ford f100', u'ford falcon', u'ford escort', u'ford fiesta', u'ford focus', u'ford rangers', u'ford taunus', u'ford ecosports'])
('fiat ducato', [u'fiat', u'fiat 147', u'fiat uno', u'fiat 128', u'fiat 600', u'fiat duna', u'fiat palio', u'fiat siena', u'fiat 1', u'fiat 125'])
('chevrolet corsa classic', [u'chevy', u'chevrolet', u'chevrolet corsa', u'chevrolet s10', u'chery', u'cher', u'chevrolet c10', u'chevrolet 400', u'chevrolet astra', u'cheeky'])
('volkswagen polo', [u'volkswagen gol', u'volvo', u'volkswagen bora', u'volkswagen saveiro', u'volkswagen gol trend', u'volkswagen fox', u'volkswagen amarok', u'volkswagen', u'volkswagen golf', u'volcom'])
('samsung grand prime', [u'samsung', u'samsung j7', u'samsung s6', u'samsung s7', u'samsung j5', u'samsung s4', u'samsung s5', u'samsung a5', u'samsung j1', u'samsung j2'])
('renault twingo', [u'renault 12', u'renault 9', u'renault 19', u'renault', u'renault clio', u'renault 11', u'renault 18', u'renault kangoo', u'renault megane', u'renault 21'])
('fiat uno fire', [u'fiat', u'fiat 147', u'fiat uno', u'fiat 128', u'fiat 600', u'fiat duna', u'fiat palio', u'fiat siena', u'fiat 1', u'fiat 125'])
('peugeot 106', [u'peugeot 206', u'peugeot', u'peugeot 504', u'peugeot 405', u'peugeot 306', u'peugeot 207', u'peugeot 307', u'peugeot partner', u'peugeot 505', u'peugeot 205'])
('fiat toro', [u'fiat', u'fiat 147', u'fiat uno', u'fiat 128', u'fiat 600', u'fiat duna', u'fiat palio', u'fiat siena', u'fiat 1', u'fiat 125'])
('volkswagen senda', [u'volkswagen gol', u'volvo', u'volkswagen bora', u'volkswagen saveiro', u'volkswagen gol trend', u'volkswagen fox', u'volkswagen amarok', u'volkswagen', u'volkswagen golf', u'volcom'])
('moto g4 plus', [u'motos', u'moto g', u'motos 110', u'moto 110', u'moto g3', u'moto g4', u'moto x', u'motor', u'motorola', u'moto g2'])
('iphone 6s plus', [u'iphone', u'iphone 6', u'iphone 7', u'iphone 6s', u'iphone 5s', u'iphone 5s', u'iphones', u'iphone 6 plus', u'iphone 4', u'iphone 7 plus'])
('juego de comedor', [u'juegos ps4', u'juegos ps3', u'juegos de ps3', u'juegos', u'juegos de ps4', u'juegos de play 3', u'juego ps4', u'juego de living', u'juego ps3', u'juego de ba\xf1o'])
('chevrolet aveo', [u'chevy', u'chevrolet', u'chevrolet corsa', u'chevrolet s10', u'chery', u'cher', u'chevrolet c10', u'chevrolet 400', u'chevrolet astra', u'cheeky'])
('placa de video', [u'play 3', u'play 2', u'play 4', u'play', u'placard', u'plan', u'plancha', u'play station 2', u'planchita', u'playera'])
('renault sandero stepway', [u'renault 12', u'renault 9', u'renault 19', u'renault', u'renault clio', u'renault 11', u'renault 18', u'renault kangoo', u'renault megane', u'renault 21'])
('samsung s6 edge', [u'samsung', u'samsung j7', u'samsung s6', u'samsung s7', u'samsung j5', u'samsung s4', u'samsung s5', u'samsung a5', u'samsung j1', u'samsung j2'])
('ford sierra', [u'ford ka', u'ford', u'ford f100', u'ford falcon', u'ford escort', u'ford fiesta', u'ford focus', u'ford rangers', u'ford taunus', u'ford ecosports'])
('renault fuego', [u'renault 12', u'renault 9', u'renault 19', u'renault', u'renault clio', u'renault 11', u'renault 18', u'renault kangoo', u'renault megane', u'renault 21'])
('honda tornado 250', [u'honda', u'honda wave', u'honda biz', u'honda tornado', u'honda titan', u'honda civic', u'honda twister', u'honda fit', u'honda xr', u'honda cg'])
('chevrolet agile', [u'chevy', u'chevrolet', u'chevrolet corsa', u'chevrolet s10', u'chery', u'cher', u'chevrolet c10', u'chevrolet 400', u'chevrolet astra', u'cheeky'])
('renault 4', [u'renault 12', u'renault 9', u'renault 19', u'renault', u'renault clio', u'renault 11', u'renault 18', u'renault kangoo', u'renault megane', u'renault 21'])
('chevrolet prisma', [u'chevy', u'chevrolet', u'chevrolet corsa', u'chevrolet s10', u'chery', u'cher', u'chevrolet c10', u'chevrolet 400', u'chevrolet astra', u'cheeky'])
('honda crv', [u'honda', u'honda wave', u'honda biz', u'honda tornado', u'honda titan', u'honda civic', u'honda twister', u'honda fit', u'honda xr', u'honda cg'])
('honda biz 125', [u'honda', u'honda wave', u'honda biz', u'honda tornado', u'honda titan', u'honda civic', u'honda twister', u'honda fit', u'honda xr', u'honda cg'])
('honda titan 150', [u'honda', u'honda wave', u'honda biz', u'honda tornado', u'honda titan', u'honda civic', u'honda twister', u'honda fit', u'honda xr', u'honda cg'])
('motos honda', [u'motos', u'moto g', u'motos 110', u'moto 110', u'moto g3', u'moto g4', u'moto x', u'motor', u'motorola', u'moto g2'])
('renault sandero', [u'renault 12', u'renault 9', u'renault 19', u'renault', u'renault clio', u'renault 11', u'renault 18', u'renault kangoo', u'renault megane', u'renault 21'])
('ropa y calzados', [u'ropero', u'ropa', u'roperos y placares', u'ropa deportiva', u'ropa de hombre', u'ropa interior', u'ropa de mujer', u'ropa adidas', u'ropa de bebe', u'ropa de nena'])
('honda cg titan 150', [u'honda', u'honda wave', u'honda biz', u'honda tornado', u'honda titan', u'honda civic', u'honda twister', u'honda fit', u'honda xr', u'honda cg'])
('samsung s7 edge', [u'samsung', u'samsung j7', u'samsung s6', u'samsung s7', u'samsung j5', u'samsung s4', u'samsung s5', u'samsung a5', u'samsung j1', u'samsung j2'])
('chevrolet vectra', [u'chevy', u'chevrolet', u'chevrolet corsa', u'chevrolet s10', u'chery', u'cher', u'chevrolet c10', u'chevrolet 400', u'chevrolet astra', u'cheeky'])
('honda xr 125', [u'honda', u'honda wave', u'honda biz', u'honda tornado', u'honda titan', u'honda civic', u'honda twister', u'honda fit', u'honda xr', u'honda cg'])
('motomel', [u'motos', u'moto g', u'motos 110', u'moto 110', u'moto g3', u'moto g4', u'moto x', u'motor', u'motorola', u'moto g2'])
('honda falcon', [u'honda', u'honda wave', u'honda biz', u'honda tornado', u'honda titan', u'honda civic', u'honda twister', u'honda fit', u'honda xr', u'honda cg'])
('chevrolet celta', [u'chevy', u'chevrolet', u'chevrolet corsa', u'chevrolet s10', u'chery', u'cher', u'chevrolet c10', u'chevrolet 400', u'chevrolet astra', u'cheeky'])
('renault master', [u'renault 12', u'renault 9', u'renault 19', u'renault', u'renault clio', u'renault 11', u'renault 18', u'renault kangoo', u'renault megane', u'renault 21'])
('honda city', [u'honda', u'honda wave', u'honda biz', u'honda tornado', u'honda titan', u'honda civic', u'honda twister', u'honda fit', u'honda xr', u'honda cg'])
('honda dax', [u'honda', u'honda wave', u'honda biz', u'honda tornado', u'honda titan', u'honda civic', u'honda twister', u'honda fit', u'honda xr', u'honda cg'])
('zapatillas new balance', [u'zapatillas', u'zapatos', u'zapatillas nike', u'zapatillas adidas', u'zapas', u'zapatero', u'zapatillas 42', u'zapatos de mujer', u'zapatillas 38', u'zapatillas de mujer'])
('renault logan', [u'renault 12', u'renault 9', u'renault 19', u'renault', u'renault clio', u'renault 11', u'renault 18', u'renault kangoo', u'renault megane', u'renault 21'])
('renault fluence', [u'renault 12', u'renault 9', u'renault 19', u'renault', u'renault clio', u'renault 11', u'renault 18', u'renault kangoo', u'renault megane', u'renault 21'])
('juego de sillones', [u'juegos ps4', u'juegos ps3', u'juegos de ps3', u'juegos', u'juegos de ps4', u'juegos de play 3', u'juego ps4', u'juego de living', u'juego ps3', u'juego de ba\xf1o'])
('fiat 1500', [u'fiat', u'fiat 147', u'fiat uno', u'fiat 128', u'fiat 600', u'fiat duna', u'fiat palio', u'fiat siena', u'fiat 1', u'fiat 125'])
('ford 100', [u'ford ka', u'ford', u'ford f100', u'ford falcon', u'ford escort', u'ford fiesta', u'ford focus', u'ford rangers', u'ford taunus', u'ford ecosports'])
('moto x play', [u'motos', u'moto g', u'motos 110', u'moto 110', u'moto g3', u'moto g4', u'moto x', u'motor', u'motorola', u'moto g2'])
('campera adidas', [u'camas', u'camperas', u'camisa', u'camionetas', u'camiones', u'camion', u'camara', u'camiseta', u'cambio', u'camperon'])
('ford mondeo', [u'ford ka', u'ford', u'ford f100', u'ford falcon', u'ford escort', u'ford fiesta', u'ford focus', u'ford rangers', u'ford taunus', u'ford ecosports'])
('placares', [u'play 3', u'play 2', u'play 4', u'play', u'placard', u'plan', u'plancha', u'play station 2', u'planchita', u'playera'])
('renault laguna', [u'renault 12', u'renault 9', u'renault 19', u'renault', u'renault clio', u'renault 11', u'renault 18', u'renault kangoo', u'renault megane', u'renault 21'])
('motor fuera de borda', [u'motos', u'moto g', u'motos 110', u'moto 110', u'moto g3', u'moto g4', u'moto x', u'motor', u'motorola', u'moto g2'])
('cascos para motos', [u'casco', u'casa', u'casa en alquiler', u'casio', u'casillas', u'casas en ventas', u'casas en alquiler', u'casa rodante', u'casco moto', u'caseros'])
('camperas de mujer', [u'camas', u'camperas', u'camisa', u'camionetas', u'camiones', u'camion', u'camara', u'camiseta', u'cambio', u'camperon'])
('motosierra', [u'motos', u'moto g', u'motos 110', u'moto 110', u'moto g3', u'moto g4', u'moto x', u'motor', u'motorola', u'moto g2'])
('peugeot 207 compact', [u'peugeot 206', u'peugeot', u'peugeot 504', u'peugeot 405', u'peugeot 306', u'peugeot 207', u'peugeot 307', u'peugeot partner', u'peugeot 505', u'peugeot 205'])
('camas cuchetas', [u'camas', u'camperas', u'camisa', u'camionetas', u'camiones', u'camion', u'camara', u'camiseta', u'cambio', u'camperon'])
('peugeot 404', [u'peugeot 206', u'peugeot', u'peugeot 504', u'peugeot 405', u'peugeot 306', u'peugeot 207', u'peugeot 307', u'peugeot partner', u'peugeot 505', u'peugeot 205'])
('ford-ranger', [u'ford ka', u'ford', u'ford f100', u'ford falcon', u'ford escort', u'ford fiesta', u'ford focus', u'ford rangers', u'ford taunus', u'ford ecosports'])
('renault 6', [u'renault 12', u'renault 9', u'renault 19', u'renault', u'renault clio', u'renault 11', u'renault 18', u'renault kangoo', u'renault megane', u'renault 21'])
('honda cbr', [u'honda', u'honda wave', u'honda biz', u'honda tornado', u'honda titan', u'honda civic', u'honda twister', u'honda fit', u'honda xr', u'honda cg'])
('peugeot 406', [u'peugeot 206', u'peugeot', u'peugeot 504', u'peugeot 405', u'peugeot 306', u'peugeot 207', u'peugeot 307', u'peugeot partner', u'peugeot 505', u'peugeot 205'])
('chevrolet meriva', [u'chevy', u'chevrolet', u'chevrolet corsa', u'chevrolet s10', u'chery', u'cher', u'chevrolet c10', u'chevrolet 400', u'chevrolet astra', u'cheeky'])
('perros en adopcion', [u'permuto', u'perros', u'perchero', u'permutas', u'perchas', u'perfumes', u'perfume', u'vendo o permuto', u'perritos', u'permuto celular'])
('samsung galaxy j7', [u'samsung', u'samsung j7', u'samsung s6', u'samsung s7', u'samsung j5', u'samsung s4', u'samsung s5', u'samsung a5', u'samsung j1', u'samsung j2'])
('parlantes para autos', [u'parlantes', u'partner', u'parka', u'peugeot partner', u'parrilla', u'paraguitas', u'parca', u'paraguas', u'paruolo', u'parner'])
('fiat 500', [u'fiat', u'fiat 147', u'fiat uno', u'fiat 128', u'fiat 600', u'fiat duna', u'fiat palio', u'fiat siena', u'fiat 1', u'fiat 125'])
('chevrolet silverado', [u'chevy', u'chevrolet', u'chevrolet corsa', u'chevrolet s10', u'chery', u'cher', u'chevrolet c10', u'chevrolet 400', u'chevrolet astra', u'cheeky'])
('ford 350', [u'ford ka', u'ford', u'ford f100', u'ford falcon', u'ford escort', u'ford fiesta', u'ford focus', u'ford rangers', u'ford taunus', u'ford ecosports'])
('renault express', [u'renault 12', u'renault 9', u'renault 19', u'renault', u'renault clio', u'renault 11', u'renault 18', u'renault kangoo', u'renault megane', u'renault 21'])
('moto z play', [u'motos', u'moto g', u'motos 110', u'moto 110', u'moto g3', u'moto g4', u'moto x', u'motor', u'motorola', u'moto g2'])
('peugeot 307 hdi', [u'peugeot 206', u'peugeot', u'peugeot 504', u'peugeot 405', u'peugeot 306', u'peugeot 207', u'peugeot 307', u'peugeot partner', u'peugeot 505', u'peugeot 205'])
('peugeot boxer', [u'peugeot 206', u'peugeot', u'peugeot 504', u'peugeot 405', u'peugeot 306', u'peugeot 207', u'peugeot 307', u'peugeot partner', u'peugeot 505', u'peugeot 205'])
('cordoba-vende', [u'corsa', u'corolla', u'cortinas', u'chevrolet corsa', u'core', u'corven', u'core 2', u'toyota corolla', u'corsa 2', u'corpi\xf1o'])
('botas de mujer', [u'botas', u'botines', u'botes', u'botitas', u'botin', u'botines de futbol', u'botiquin', u'botinetas', u'botines nike', u'botas de lluvia'])
('chevrolet apache', [u'chevy', u'chevrolet', u'chevrolet corsa', u'chevrolet s10', u'chery', u'cher', u'chevrolet c10', u'chevrolet 400', u'chevrolet astra', u'cheeky'])
('cortadora de fiambre', [u'corsa', u'corolla', u'cortinas', u'chevrolet corsa', u'core', u'corven', u'core 2', u'toyota corolla', u'corsa 2', u'corpi\xf1o'])
('moto z', [u'motos', u'moto g', u'motos 110', u'moto 110', u'moto g3', u'moto g4', u'moto x', u'motor', u'motorola', u'moto g2'])
('camiones mercedes benz', [u'camas', u'camperas', u'camisa', u'camionetas', u'camiones', u'camion', u'camara', u'camiseta', u'cambio', u'camperon'])
('samsung a9', [u'samsung', u'samsung j7', u'samsung s6', u'samsung s7', u'samsung j5', u'samsung s4', u'samsung s5', u'samsung a5', u'samsung j1', u'samsung j2'])
('cama de dos plazas', [u'camas', u'camperas', u'camisa', u'camionetas', u'camiones', u'camion', u'camara', u'camiseta', u'cambio', u'camperon'])
('honda twister 250', [u'honda', u'honda wave', u'honda biz', u'honda tornado', u'honda titan', u'honda civic', u'honda twister', u'honda fit', u'honda xr', u'honda cg'])
('campera hombre', [u'camas', u'camperas', u'camisa', u'camionetas', u'camiones', u'camion', u'camara', u'camiseta', u'cambio', u'camperon'])
('toyota-hilux', [u'toyota', u'toyota hilux', u'toyota corolla', u'toyota hilux 4x4', u'toyota sw4', u'caniche toy', u'toyota etios', u'toyota hilux srv', u'caniche mini toy', u'toyotas hilux'])
('chevrolet s 10', [u'chevy', u'chevrolet', u'chevrolet corsa', u'chevrolet s10', u'chery', u'cher', u'chevrolet c10', u'chevrolet 400', u'chevrolet astra', u'cheeky'])
('mesa de pool', [u'mesa', u'mesada', u'mesa y sillas', u'mesa tv', u'mesita', u'mesa de luz', u'mesa ratona', u'mesas y sillas', u'bajo mesada', u'mesa pc'])
('peugeot partner patagonica', [u'peugeot 206', u'peugeot', u'peugeot 504', u'peugeot 405', u'peugeot 306', u'peugeot 207', u'peugeot 307', u'peugeot partner', u'peugeot 505', u'peugeot 205'])
('chevrolet blazer', [u'chevy', u'chevrolet', u'chevrolet corsa', u'chevrolet s10', u'chery', u'cher', u'chevrolet c10', u'chevrolet 400', u'chevrolet astra', u'cheeky'])
('honda storm', [u'honda', u'honda wave', u'honda biz', u'honda tornado', u'honda titan', u'honda civic', u'honda twister', u'honda fit', u'honda xr', u'honda cg'])
('zapatillas nike air max', [u'zapatillas', u'zapatos', u'zapatillas nike', u'zapatillas adidas', u'zapas', u'zapatero', u'zapatillas 42', u'zapatos de mujer', u'zapatillas 38', u'zapatillas de mujer'])
('motomel 150', [u'motos', u'moto g', u'motos 110', u'moto 110', u'moto g3', u'moto g4', u'moto x', u'motor', u'motorola', u'moto g2'])
('samsung j3', [u'samsung', u'samsung j7', u'samsung s6', u'samsung s7', u'samsung j5', u'samsung s4', u'samsung s5', u'samsung a5', u'samsung j1', u'samsung j2'])
('vw vento', [u'vw gol', u'vw fox', u'vw', u'vw bora', u'vw golf', u'vw polo', u'vw senda', u'vw up', u'vw amarok', u'vw saveiro'])
('honda xr 150', [u'honda', u'honda wave', u'honda biz', u'honda tornado', u'honda titan', u'honda civic', u'honda twister', u'honda fit', u'honda xr', u'honda cg'])
('renault r12', [u'renault 12', u'renault 9', u'renault 19', u'renault', u'renault clio', u'renault 11', u'renault 18', u'renault kangoo', u'renault megane', u'renault 21'])
('chevrolet chevy', [u'chevy', u'chevrolet', u'chevrolet corsa', u'chevrolet s10', u'chery', u'cher', u'chevrolet c10', u'chevrolet 400', u'chevrolet astra', u'cheeky'])
('campera de mujer', [u'camas', u'camperas', u'camisa', u'camionetas', u'camiones', u'camion', u'camara', u'camiseta', u'cambio', u'camperon'])
('casa-rodante', [u'casco', u'casa', u'casa en alquiler', u'casio', u'casillas', u'casas en ventas', u'casas en alquiler', u'casa rodante', u'casco moto', u'caseros'])
('ford transit', [u'ford ka', u'ford', u'ford f100', u'ford falcon', u'ford escort', u'ford fiesta', u'ford focus', u'ford rangers', u'ford taunus', u'ford ecosports'])
('camperas de cuero', [u'camas', u'camperas', u'camisa', u'camionetas', u'camiones', u'camion', u'camara', u'camiseta', u'cambio', u'camperon'])
('moto g 3', [u'motos', u'moto g', u'motos 110', u'moto 110', u'moto g3', u'moto g4', u'moto x', u'motor', u'motorola', u'moto g2'])
('moto x style', [u'motos', u'moto g', u'motos 110', u'moto 110', u'moto g3', u'moto g4', u'moto x', u'motor', u'motorola', u'moto g2'])
('fiat 128 super europa', [u'fiat', u'fiat 147', u'fiat uno', u'fiat 128', u'fiat 600', u'fiat duna', u'fiat palio', u'fiat siena', u'fiat 1', u'fiat 125'])
('motomel 110', [u'motos', u'moto g', u'motos 110', u'moto 110', u'moto g3', u'moto g4', u'moto x', u'motor', u'motorola', u'moto g2'])
('cocina industrial', [u'cocina', u'coche', u'coches', u'cocker', u'coca', u'cochecitos', u'cocinero', u'cochesito', u'cochera', u'cocinas usada'])
('motos usadas', [u'motos', u'moto g', u'motos 110', u'moto 110', u'moto g3', u'moto g4', u'moto x', u'motor', u'motorola', u'moto g2'])
('camionetas ford', [u'camas', u'camperas', u'camisa', u'camionetas', u'camiones', u'camion', u'camara', u'camiseta', u'cambio', u'camperon'])
('chevrolet spin', [u'chevy', u'chevrolet', u'chevrolet corsa', u'chevrolet s10', u'chery', u'cher', u'chevrolet c10', u'chevrolet 400', u'chevrolet astra', u'cheeky'])
('casillas rodantes', [u'casco', u'casa', u'casa en alquiler', u'casio', u'casillas', u'casas en ventas', u'casas en alquiler', u'casa rodante', u'casco moto', u'caseros'])
('bicicleta rodado 26', [u'bici', u'bicicletas', u'bicimoto', u'bici fija', u'bicicleta fija', u'bici bmx', u'bicicleta playera', u'bici moto', u'bicicletas usada', u'bicicletas bmx'])
('campera nike', [u'camas', u'camperas', u'camisa', u'camionetas', u'camiones', u'camion', u'camara', u'camiseta', u'cambio', u'camperon'])
('renault scenic', [u'renault 12', u'renault 9', u'renault 19', u'renault', u'renault clio', u'renault 11', u'renault 18', u'renault kangoo', u'renault megane', u'renault 21'])
('ford-ka', [u'ford ka', u'ford', u'ford f100', u'ford falcon', u'ford escort', u'ford fiesta', u'ford focus', u'ford rangers', u'ford taunus', u'ford ecosports'])
('ford ranger 4x4', [u'ford ka', u'ford', u'ford f100', u'ford falcon', u'ford escort', u'ford fiesta', u'ford focus', u'ford rangers', u'ford taunus', u'ford ecosports'])
('playstation 4', [u'play 3', u'play 2', u'play 4', u'play', u'placard', u'plan', u'plancha', u'play station 2', u'planchita', u'playera'])
('honda cb1', [u'honda', u'honda wave', u'honda biz', u'honda tornado', u'honda titan', u'honda civic', u'honda twister', u'honda fit', u'honda xr', u'honda cg'])
('ford orion', [u'ford ka', u'ford', u'ford f100', u'ford falcon', u'ford escort', u'ford fiesta', u'ford focus', u'ford rangers', u'ford taunus', u'ford ecosports'])
('motorhomes', [u'motos', u'moto g', u'motos 110', u'moto 110', u'moto g3', u'moto g4', u'moto x', u'motor', u'motorola', u'moto g2'])
('golf gti', [u'gol', u'golf', u'gol trend', u'vw gol', u'volkswagen gol', u'gol gl', u'golden', u'gol power', u'gola', u'gol 94'])
('motos 125', [u'motos', u'moto g', u'motos 110', u'moto 110', u'moto g3', u'moto g4', u'moto x', u'motor', u'motorola', u'moto g2'])
('volkswagen gol power', [u'volkswagen gol', u'volvo', u'volkswagen bora', u'volkswagen saveiro', u'volkswagen gol trend', u'volkswagen fox', u'volkswagen amarok', u'volkswagen', u'volkswagen golf', u'volcom'])
('fiat idea', [u'fiat', u'fiat 147', u'fiat uno', u'fiat 128', u'fiat 600', u'fiat duna', u'fiat palio', u'fiat siena', u'fiat 1', u'fiat 125'])
('fiat tipo', [u'fiat', u'fiat 147', u'fiat uno', u'fiat 128', u'fiat 600', u'fiat duna', u'fiat palio', u'fiat siena', u'fiat 1', u'fiat 125'])
('fiat palio adventure', [u'fiat', u'fiat 147', u'fiat uno', u'fiat 128', u'fiat 600', u'fiat duna', u'fiat palio', u'fiat siena', u'fiat 1', u'fiat 125'])
('camisetas de futbol', [u'camas', u'camperas', u'camisa', u'camionetas', u'camiones', u'camion', u'camara', u'camiseta', u'cambio', u'camperon'])
('ford ranchero', [u'ford ka', u'ford', u'ford f100', u'ford falcon', u'ford escort', u'ford fiesta', u'ford focus', u'ford rangers', u'ford taunus', u'ford ecosports'])
('corsa 3 puertas', [u'corsa', u'corolla', u'cortinas', u'chevrolet corsa', u'core', u'corven', u'core 2', u'toyota corolla', u'corsa 2', u'corpi\xf1o'])
('venta de fondo de comercio', [u'vento', u'ventanas', u'venzo', u'vendo o permuto', u'ventilador', u'vento tdi', u'vw vento', u'ventiluz', u'volkswagen vento', u'vendo'])
('samsung a3', [u'samsung', u'samsung j7', u'samsung s6', u'samsung s7', u'samsung j5', u'samsung s4', u'samsung s5', u'samsung a5', u'samsung j1', u'samsung j2'])
('cochecito de bebe', [u'cocina', u'coche', u'coches', u'cocker', u'coca', u'cochecitos', u'cocinero', u'cochesito', u'cochera', u'cocinas usada'])
('departamentos en venta', [u'alquiler departamentos', u'depiladora', u'departamento', u'depilacion', u'ropa deportiva', u'departamentos en alquiler', u'deportivo', u'deposito', u'depto', u'deportiva'])
/Users/johannes.braun/anaconda3/envs/python2/lib/python2.7/site-packages/ipykernel/__main__.py:4: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
('juego de ba\xc3\xb1o', [u'juegos ps4', u'juegos ps3', u'juegos de ps3', u'juegos', u'juegos de ps4', u'juegos de play 3', u'juego ps4', u'juego de living', u'juego ps3', u'juego de ba\xf1o'])
('camionetas en venta', [u'camas', u'camperas', u'camisa', u'camionetas', u'camiones', u'camion', u'camara', u'camiseta', u'cambio', u'camperon'])
('vw gol trend', [u'vw gol', u'vw fox', u'vw', u'vw bora', u'vw golf', u'vw polo', u'vw senda', u'vw up', u'vw amarok', u'vw saveiro'])
('camion volcador', [u'camas', u'camperas', u'camisa', u'camionetas', u'camiones', u'camion', u'camara', u'camiseta', u'cambio', u'camperon'])
('motos 150', [u'motos', u'moto g', u'motos 110', u'moto 110', u'moto g3', u'moto g4', u'moto x', u'motor', u'motorola', u'moto g2'])
('cama cucheta', [u'camas', u'camperas', u'camisa', u'camionetas', u'camiones', u'camion', u'camara', u'camiseta', u'cambio', u'camperon'])
('chevrolet luv', [u'chevy', u'chevrolet', u'chevrolet corsa', u'chevrolet s10', u'chery', u'cher', u'chevrolet c10', u'chevrolet 400', u'chevrolet astra', u'cheeky'])
('chery tiggo', [u'chevy', u'chevrolet', u'chevrolet corsa', u'chevrolet s10', u'chery', u'cher', u'chevrolet c10', u'chevrolet 400', u'chevrolet astra', u'cheeky'])
('plantas', [u'play 3', u'play 2', u'play 4', u'play', u'placard', u'plan', u'plancha', u'play station 2', u'planchita', u'playera'])
('zapatillas mujer', [u'zapatillas', u'zapatos', u'zapatillas nike', u'zapatillas adidas', u'zapas', u'zapatero', u'zapatillas 42', u'zapatos de mujer', u'zapatillas 38', u'zapatillas de mujer'])
('motos 110 en venta', [u'motos', u'moto g', u'motos 110', u'moto 110', u'moto g3', u'moto g4', u'moto x', u'motor', u'motorola', u'moto g2'])
('campera mujer', [u'camas', u'camperas', u'camisa', u'camionetas', u'camiones', u'camion', u'camara', u'camiseta', u'cambio', u'camperon'])
('ford eco sport', [u'ford ka', u'ford', u'ford f100', u'ford falcon', u'ford escort', u'ford fiesta', u'ford focus', u'ford rangers', u'ford taunus', u'ford ecosports'])
('honda fan', [u'honda', u'honda wave', u'honda biz', u'honda tornado', u'honda titan', u'honda civic', u'honda twister', u'honda fit', u'honda xr', u'honda cg'])
('zapatillas hombre', [u'zapatillas', u'zapatos', u'zapatillas nike', u'zapatillas adidas', u'zapas', u'zapatero', u'zapatillas 42', u'zapatos de mujer', u'zapatillas 38', u'zapatillas de mujer'])
('gol 2000', [u'gol', u'golf', u'gol trend', u'vw gol', u'volkswagen gol', u'gol gl', u'golden', u'gol power', u'gola', u'gol 94'])
('camilla', [u'camas', u'camperas', u'camisa', u'camionetas', u'camiones', u'camion', u'camara', u'camiseta', u'cambio', u'camperon'])
('juego de sillones de living', [u'juegos ps4', u'juegos ps3', u'juegos de ps3', u'juegos', u'juegos de ps4', u'juegos de play 3', u'juego ps4', u'juego de living', u'juego ps3', u'juego de ba\xf1o'])
('termotanque electrico', [u'terrenos', u'termo', u'termotanque', u'terrenos en ventas', u'termotanques', u'tern', u'bull terrier', u'terrier', u'termofusora', u'terios'])
('renault clio mio', [u'renault 12', u'renault 9', u'renault 19', u'renault', u'renault clio', u'renault 11', u'renault 18', u'renault kangoo', u'renault megane', u'renault 21'])
('moto e', [u'motos', u'moto g', u'motos 110', u'moto 110', u'moto g3', u'moto g4', u'moto x', u'motor', u'motorola', u'moto g2'])
('planchita de pelo', [u'play 3', u'play 2', u'play 4', u'play', u'placard', u'plan', u'plancha', u'play station 2', u'planchita', u'playera'])
('botines adidas', [u'botas', u'botines', u'botes', u'botitas', u'botin', u'botines de futbol', u'botiquin', u'botinetas', u'botines nike', u'botas de lluvia'])
('ford fiesta max', [u'ford ka', u'ford', u'ford f100', u'ford falcon', u'ford escort', u'ford fiesta', u'ford focus', u'ford rangers', u'ford taunus', u'ford ecosports'])
('fiat palio gnc', [u'fiat', u'fiat 147', u'fiat uno', u'fiat 128', u'fiat 600', u'fiat duna', u'fiat palio', u'fiat siena', u'fiat 1', u'fiat 125'])
('ford k', [u'ford ka', u'ford', u'ford f100', u'ford falcon', u'ford escort', u'ford fiesta', u'ford focus', u'ford rangers', u'ford taunus', u'ford ecosports'])
('motor-fuera-de-borda', [u'motos', u'moto g', u'motos 110', u'moto 110', u'moto g3', u'moto g4', u'moto x', u'motor', u'motorola', u'moto g2'])
('yamaha cripton', [u'yamaha', u'yamaha fz', u'yamaha ybr', u'yamaha ybr 125', u'yamaha crypton', u'yamaha xtz 125', u'yamaha xtz', u'yamaha r6', u'yamaha fz 16', u'yamaha dt'])
('cortadora de cesped', [u'corsa', u'corolla', u'cortinas', u'chevrolet corsa', u'core', u'corven', u'core 2', u'toyota corolla', u'corsa 2', u'corpi\xf1o'])
('casilla rodante', [u'casco', u'casa', u'casa en alquiler', u'casio', u'casillas', u'casas en ventas', u'casas en alquiler', u'casa rodante', u'casco moto', u'caseros'])
('motomel skua', [u'motos', u'moto g', u'motos 110', u'moto 110', u'moto g3', u'moto g4', u'moto x', u'motor', u'motorola', u'moto g2'])
('samsung core 2', [u'samsung', u'samsung j7', u'samsung s6', u'samsung s7', u'samsung j5', u'samsung s4', u'samsung s5', u'samsung a5', u'samsung j1', u'samsung j2'])
('volkswagen up', [u'volkswagen gol', u'volvo', u'volkswagen bora', u'volkswagen saveiro', u'volkswagen gol trend', u'volkswagen fox', u'volkswagen amarok', u'volkswagen', u'volkswagen golf', u'volcom'])
('chevrolet sonic', [u'chevy', u'chevrolet', u'chevrolet corsa', u'chevrolet s10', u'chery', u'cher', u'chevrolet c10', u'chevrolet 400', u'chevrolet astra', u'cheeky'])
---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
<ipython-input-535-f644699e54b2> in <module>()
      2 for i, p in export[:1000].iterrows():
      3     for l in [3]:
----> 4         match = p['query'] in suggest(p['query'][:l])
      5         if not match:
      6             print(p['query'], suggest(p['query'][:l]))

<ipython-input-518-296c2104f9be> in suggest(search_term)
      4     url = 'http://localhost:8983/solr/popular_queries/select?q=query_ngram:'+search_term+'&%20sort=product(popularity,reply_conversion)%20desc&rows=10&fl=query&wt=json'
      5     url2 = 'http://localhost:8983/solr/popular_queries/select?q=query_ngram:'+search_term +'&%20sort='            'product(popularity,reply_conversion,pow(strdist(%22'+search_term+'%22,query,ngram),2))%20desc'            '&rows=10&fl=query&wt=json&fl=query,popularity,reply_conversion,research_quote,'            'first_cat_name,second_cat_name,second_cat_significance,first_cat_significance,'            'product(popularity,reply_conversion,pow(strdist(%22'+search_term+'%22,query,ngram),2)),'            'strdist(%22'+search_term+'%22,query,ngram),'            'pow(strdist(%22'+search_term+'%22,query,ngram),2),score&debugQuery=on'
----> 6     r = requests.get(url2).json()
      7     return [d.get('query', 'error')for d in r.get('response', {}).get('docs',{})]

/Users/johannes.braun/anaconda3/envs/python2/lib/python2.7/site-packages/requests/api.pyc in get(url, params, **kwargs)
     68 
     69     kwargs.setdefault('allow_redirects', True)
---> 70     return request('get', url, params=params, **kwargs)
     71 
     72 

/Users/johannes.braun/anaconda3/envs/python2/lib/python2.7/site-packages/requests/api.pyc in request(method, url, **kwargs)
     54     # cases, and look like a memory leak in others.
     55     with sessions.Session() as session:
---> 56         return session.request(method=method, url=url, **kwargs)
     57 
     58 

/Users/johannes.braun/anaconda3/envs/python2/lib/python2.7/site-packages/requests/sessions.pyc in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    472             hooks = hooks,
    473         )
--> 474         prep = self.prepare_request(req)
    475 
    476         proxies = proxies or {}

/Users/johannes.braun/anaconda3/envs/python2/lib/python2.7/site-packages/requests/sessions.pyc in prepare_request(self, request)
    405             auth=merge_setting(auth, self.auth),
    406             cookies=merged_cookies,
--> 407             hooks=merge_hooks(request.hooks, self.hooks),
    408         )
    409         return p

/Users/johannes.braun/anaconda3/envs/python2/lib/python2.7/site-packages/requests/models.pyc in prepare(self, method, url, headers, files, data, params, auth, cookies, hooks, json)
    300 
    301         self.prepare_method(method)
--> 302         self.prepare_url(url, params)
    303         self.prepare_headers(headers)
    304         self.prepare_cookies(cookies)

/Users/johannes.braun/anaconda3/envs/python2/lib/python2.7/site-packages/requests/models.pyc in prepare_url(self, url, params)
    340         #: https://github.com/kennethreitz/requests/pull/2238
    341         if isinstance(url, bytes):
--> 342             url = url.decode('utf8')
    343         else:
    344             url = unicode(url) if is_py2 else str(url)

/Users/johannes.braun/anaconda3/envs/python2/lib/python2.7/encodings/utf_8.pyc in decode(input, errors)
     14 
     15 def decode(input, errors='strict'):
---> 16     return codecs.utf_8_decode(input, errors, True)
     17 
     18 class IncrementalEncoder(codecs.IncrementalEncoder):

UnicodeDecodeError: 'utf8' codec can't decode byte 0xc3 in position 66: invalid continuation byte

In [536]:
(sum(matches)+0.0)/len(matches)


Out[536]:
0.8052910052910053

In [630]:
sum(matches)


Out[630]:
761

In [629]:
len(matches)


Out[629]:
945

In [567]:
def transform_tolist(sc):
    d = []
    while len(sc) >= 2:
        sc[1]['token'] = sc[0]
        d.append(sc[1])
        sc = sc[2:]
    return d

In [568]:
def spell_check(search_term, index):
    #url = 'http://localhost:8983/solr/letgoar/select?q='+search_term+'&df=suggest&wt=json'
    url = 'http://localhost:8983/solr/'+index+'/spell?df=query&spellcheck.q='+search_term+'&spellcheck=true&spellcheck.collateParam.q.op=AND&wt=json'
    print(url)
    r = requests.get(url).json()
    suggestions = r.get('spellcheck', {}).get('suggestions', [])
    suggestions = transform_tolist(suggestions)
    checked = search_term
    for s in suggestions:
        t = s['token']
        freq = s['origFreq']
        for ss in s['suggestion']:
            if freq < ss['freq']:
                t = ss['word']
                freq = ss['freq']
        checked = checked.replace(s['token'], t)
    return checked , suggestions

In [571]:
suggest('adida')


Out[571]:
([u'adidas',
  u'zapatillas adidas',
  u'campera adidas',
  u'adidas superstar',
  u'buzo adidas',
  u'conjunto adidas',
  u'camperas adidas',
  u'ropa adidas',
  u'botines adidas',
  u'pantalon adidas'],
 'http://localhost:8983/solr/popular_queries/select?q=query_ngram:adida&%20sort=product(popularity,reply_conversion,pow(strdist(%22adida%22,query,ngram),2))%20desc&rows=10&fl=query&wt=json&fl=query,popularity,reply_conversion,research_quote,first_cat_name,second_cat_name,second_cat_significance,first_cat_significance,product(popularity,reply_conversion,pow(strdist(%22adida%22,query,ngram),2)),strdist(%22adida%22,query,ngram),pow(strdist(%22adida%22,query,ngram),2),score&debugQuery=on')

In [569]:
spell_check("adida", 'popular_queries')


http://localhost:8983/solr/popular_queries/spell?df=query&spellcheck.q=adida&spellcheck=true&spellcheck.collateParam.q.op=AND&wt=json
Out[569]:
(u'adidas',
 [{u'endOffset': 5,
   u'numFound': 10,
   u'origFreq': 0,
   u'startOffset': 0,
   u'suggestion': [{u'freq': 218, u'word': u'adidas'},
    {u'freq': 1, u'word': u'adidaa'},
    {u'freq': 1, u'word': u'adidad'},
    {u'freq': 5, u'word': u'adda'},
    {u'freq': 3, u'word': u'akira'},
    {u'freq': 3, u'word': u'akita'},
    {u'freq': 2, u'word': u'addidas'},
    {u'freq': 2, u'word': u'adiddas'},
    {u'freq': 1, u'word': u'adela'},
    {u'freq': 1, u'word': u'awada'}],
   'token': u'adida'}])

In [570]:
spell_check("adida", 'olx')


http://localhost:8983/solr/olx/spell?df=query&spellcheck.q=adida&spellcheck=true&spellcheck.collateParam.q.op=AND&wt=json
Out[570]:
(u'adidas',
 [{u'endOffset': 5,
   u'numFound': 5,
   u'origFreq': 7,
   u'startOffset': 0,
   u'suggestion': [{u'freq': 586, u'word': u'adidas'},
    {u'freq': 3, u'word': u'adidad'},
    {u'freq': 1, u'word': u'aadida'},
    {u'freq': 1, u'word': u'addida'},
    {u'freq': 1, u'word': u'adita'}],
   'token': u'adida'}])

In [ ]: