In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
# import pfile.to
# import pstr.to
from bs4 import BeautifulSoup
import requests
# import pstr
import re
# import pickle
# import urllib2
# from selenium import webdriver
# from selenium.webdriver.common.keys import Keys
import urlparse
import urllib
from datetime import datetime
import random

import pfile.to
import pstr.to
import parse.util
import pstr.trans
import pickle

import pandas as pd
import numpy as np

import pfile.accessor

import pprint
ppr = pprint.PrettyPrinter(indent=4)

In [210]:
%pdb


Automatic pdb calling has been turned ON


In [4]:
##########
# SETTINGS
# Project root: resolved under the directory named by the GD_FOLDER environment
# variable (os.environ[...] raises KeyError if that variable is not set).
root_folder = os.path.join(os.environ['GD_FOLDER'], 'Shared/ms_otosense')
# Sub-folder that is listed with os.listdir and read through `dfacc` below.
parse_dicts_folder = os.path.join(root_folder, 'parse_dicts')

# Accessors from the project-local `pfile` package. In this notebook `facc(name)`
# is used to get a full path under root_folder and `facc.save(obj, name)` to
# store an object there; `dfacc.load(name)` is used to load one parse-dict file.
# NOTE(review): exact meaning of extension/force_extension is defined in
# pfile.accessor, not here - presumably it appends '.dict_list' to names; confirm.
facc = pfile.accessor.for_local(root_folder)
dfacc = pfile.accessor.for_local(parse_dicts_folder, extension='dict_list', force_extension=True)

# Path of a text file under root_folder (not read or written in the cells shown here).
list_of_gshop_text_filename = os.path.join(root_folder, 'list_of_gshop_files.txt')

In [5]:
def get_subdict(d, list_of_keys):
    '''
    Restrict a dict to a given collection of keys.

    :param d: dict to take the values from
    :param list_of_keys: iterable of keys to keep; keys missing from d are skipped silently
    :return: new dict holding the key:value pairs of d whose key is in list_of_keys
    '''
    return {key: d[key] for key in list_of_keys if key in d}

In [6]:
# dict_list = pfile.name.files_of(parse_dicts_folder)
dict_list = os.listdir(parse_dicts_folder)[:-1]
len(dict_list)


Out[6]:
498

In [7]:
dict_list[-1]


Out[7]:
'http{\xc2\xa7\xc2\xa7www.google.com\xc2\xa7search?start=401&num=100&tbm=shop&query=deaf+clock+vibrating&tbs=p_ord%3Arv.dict_list'

In [8]:
def mk_item_info_dict(parse_dict_item):
    '''
    Extract the fields of interest from one parsed item.

    :param parse_dict_item: dict describing one parsed item
    :return: dict with any of the keys 'desc', 'img_src', 'title' (taken from
        'href_text'), 'price_str' and 'provider' (first two entries of
        'psliprice_divs_text'); a key is only present when its source is.
    '''
    # (key in the parsed item, key in the result); 'href' is deliberately not copied.
    copied_fields = (
        ('desc', 'desc'),
        ('img_src', 'img_src'),
        ('href_text', 'title'),
    )
    info_dict = {}
    for source_key, target_key in copied_fields:
        if source_key in parse_dict_item:
            info_dict[target_key] = parse_dict_item[source_key]

    if 'psliprice_divs_text' in parse_dict_item:
        price_divs = parse_dict_item['psliprice_divs_text']
        # Price and provider are only taken when both entries are available.
        if len(price_divs) >= 2:
            info_dict['price_str'], info_dict['provider'] = price_divs[0], price_divs[1]
    return info_dict

def mk_item_info_df_from_parse_dict(parse_dict):
    '''
    Build a DataFrame with one row per item of a parse dict.

    :param parse_dict: iterable of parsed items (each one is passed to mk_item_info_dict)
    :return: pandas DataFrame built from the resulting info dicts
    '''
    item_infos = list(map(mk_item_info_dict, parse_dict))
    return pd.DataFrame(item_infos)

def mk_item_info_df_from_multiple_parse_dicts(parse_dict_enum):
    '''
    Stack the item-info frames of several parse dicts into one DataFrame.

    :param parse_dict_enum: iterable of parse dicts (e.g. a DictLoader); it is
        consumed once, so one-shot iterators are fine
    :return: DataFrame with the rows of every parse dict, in iteration order,
        re-indexed 0..n-1; an empty DataFrame when there is nothing to stack
    '''
    # Use the iterable that was passed in (it used to be ignored in favour of a
    # fresh DictLoader()), and concatenate once instead of folding pd.concat
    # over the frames, which re-copies the accumulated rows at every step.
    frames = [mk_item_info_df_from_parse_dict(parse_dict) for parse_dict in parse_dict_enum]
    if not frames:
        # pd.concat raises ValueError on an empty list
        return pd.DataFrame()
    return pd.concat(frames).reset_index(drop=True)


class DictLoader:
    '''
    Iterator that loads the parse-dict files of parse_dicts_folder one at a time.

    Each step returns dfacc.load(<file name>) for the next file name, so only
    one parse dict is loaded per step.

    :param max_idx: index of the last file to load (inclusive); None (default)
        means every listed file. Values past the end are clamped.
    '''
    def __init__(self, max_idx=None):
        # NOTE(review): the last entry returned by os.listdir is dropped, and
        # os.listdir gives no ordering guarantee, so which file is skipped is
        # not well defined - confirm what this slice is meant to exclude.
        self.dict_list = os.listdir(parse_dicts_folder)[:-1]
        last_idx = len(self.dict_list) - 1
        if max_idx is None:
            # compare against None: `max_idx or ...` turned an explicit 0 into "all files"
            self.max_idx = last_idx
        else:
            # clamp so that iteration ends with StopIteration rather than IndexError
            self.max_idx = min(max_idx, last_idx)
        self.idx = -1  # index of the most recently returned file

    def __iter__(self):
        return self

    def next(self):
        '''Load and return the next parse dict; raise StopIteration past max_idx.'''
        self.idx += 1
        if self.idx > self.max_idx:
            raise StopIteration
        return dfacc.load(self.dict_list[self.idx])

    # Python 3 spelling of the iterator protocol
    __next__ = next

In [215]:
df = mk_item_info_df_from_multiple_parse_dicts(DictLoader())

In [19]:
# Build one frame per parse dict and concatenate once. Folding pd.concat over
# the frames with reduce re-copies the accumulated frame at every step.
frames = [mk_item_info_df_from_parse_dict(parse_dict) for parse_dict in DictLoader()]
# pd.concat raises ValueError on an empty list, so fall back to an empty frame
df = pd.concat(frames).reset_index(drop=True) if frames else pd.DataFrame()

In [15]:


In [16]:
tt = reduce(lambda x,y: pd.concat([x,mk_item_info_df_from_parse_dict(y)]), t, pd.DataFrame())

In [17]:
tt


Out[17]:
desc img_src price_str provider title
0 Bundle including the CLARITY-52510 AL10 AlertM... http://t1.gstatic.com/shopping?q=tbn:ANd9GcQWv... $178.00 eBay Clarity-52510120 Al10 Alertmaster Alert Master...
1 MSRP: $179.95 Clarity AlertMaster visual alert... http://t0.gstatic.com/shopping?q=tbn:ANd9GcTIJ... $165.00 eBay Clarity-52510-100 Al10 Alertmaster W/ Door Kno...
0 Sleep peacefully while feeling confident that ... http://t1.gstatic.com/shopping?q=tbn:ANd9GcQlF... $248.99 LibertyHealthSupply.com Safe Awake
0 Smoke alarm with silence and latching, screene... http://t3.gstatic.com/shopping?q=tbn:ANd9GcQ4U... $7 from 25+ stores First Alert 9120-12st Smoke Alarm, 120 Volt
1 A 120V AC Wire In Alarm with Battery Backup pr... http://t1.gstatic.com/shopping?q=tbn:ANd9GcQeY... $29.95 Maxi-Aids Kidde Smoke Alarm with Battery Backup
2 120V AC Wire In Combination Carbon Monoxide an... http://t3.gstatic.com/shopping?q=tbn:ANd9GcTrf... $179.95 Maxi-Aids Kidde Combo Carbon Monoxide and Smoke Alarm wi...
3 A 120V AC Wire In Smoke Alarm with Battery Bac... http://t3.gstatic.com/shopping?q=tbn:ANd9GcQyi... $129.95 Maxi-Aids Kidde Smoke Alarm with Strobe Light
4 http://t2.gstatic.com/shopping?q=tbn:ANd9GcTXK... $6.88 Electric Bargain Store First Alert BRK SA100B 120V AC Hardwired Ioniz...
5 "The 710CS smoke detector is a hard wired smok... http://t1.gstatic.com/shopping?q=tbn:ANd9GcQHc... $86 from 5+ stores Gentex 710CS Hard Wired Ceiling Mount Smoke Alarm
6 A 120V AC Wire In Alarm with Battery Backup pr... http://t2.gstatic.com/shopping?q=tbn:ANd9GcR-j... $81.95 Maxi-Aids Kidde Combination Smoke and Carbon Monoxide Al...
7 120V AC Wire In hard wiriring Carbon Monoxide ... http://t3.gstatic.com/shopping?q=tbn:ANd9GcTsi... $189.95 Maxi-Aids Kidde Carbon Monoxide and Smoke Alarms with St...
8 Sleep peacefully while feeling confident that ... http://t1.gstatic.com/shopping?q=tbn:ANd9GcQlF... $248.99 LibertyHealthSupply.com Safe Awake
9 Save $39.95 with this package over purchasing ... http://t0.gstatic.com/shopping?q=tbn:ANd9GcSGw... $127 from 10+ stores Central Alert System Receiver and Clock HC-CA360
0 Save $39.95 with this package over purchasing ... http://t0.gstatic.com/shopping?q=tbn:ANd9GcSGw... $127 from 10+ stores Central Alert System Receiver and Clock HC-CA360
0 We dare you to sleep through this alarm clock.... http://t1.gstatic.com/shopping?q=tbn:ANd9GcQt3... $31 from 10+ stores Reizen SBB500SS Sonic Bomb Alarm Clock and Bed...
1 Guaranteed to wake even the deepest sleeper in... http://t0.gstatic.com/shopping?q=tbn:ANd9GcQ3e... $45 from 50+ stores Sonic Alert SA-SBD375SS - Dual Alarm Clock w/ ...
2 Built-in telephone signaler. 87 dB extra-loud ... http://t3.gstatic.com/shopping?q=tbn:ANd9GcQG2... $90 from 25+ stores Clear Sounds CLS-SW200 Shake Up Wake Up Alarm ...
3 Sonic Boom Alarm Clock The Sonic Boom is a cos... http://t1.gstatic.com/shopping?q=tbn:ANd9GcSCX... $38 from 25+ stores Sonic Alert Sonic Boom SB300ss Large Display A...
4 The Multi-Alarm Vibrating Reminder Watch for t... http://t3.gstatic.com/shopping?q=tbn:ANd9GcQ3p... $232.91 Rehabmart.com Multi-Alarm Vibrating Reminder Watch for the H...
5 Shake - n - Wake zzZ Silent Alarm Clock wakes ... http://t0.gstatic.com/shopping?q=tbn:ANd9GcQNO... $27.99 eBay Vibrating "shake-n-wake" Personal Alarm Clock ...
6 SILENT CALL MIDLAND NOAA WEATHER HAZARDS LERT ... http://t1.gstatic.com/shopping?q=tbn:ANd9GcSUL... $144.50 eBay - myheargear Silent Call Midland Noaa Weather Alert Radio -...
7 Bundle including the CLARITY-52510 AL10 AlertM... http://t1.gstatic.com/shopping?q=tbn:ANd9GcQWv... $178.00 eBay Clarity-52510120 Al10 Alertmaster Alert Master...
8 Wake up to a reliable discreet vibration The R... http://t2.gstatic.com/shopping?q=tbn:ANd9GcSyw... $14.95 Maxi-Aids Reizen Shake U Up Compact Vibrating Alarm Clock
9 No more forgetting appointments or medicine do... http://t3.gstatic.com/shopping?q=tbn:ANd9GcRCb... $11.95 Maxi-Aids Vibrating Alarm Clock Keychain
10 Great for the deaf and the hard of hearing as ... http://t1.gstatic.com/shopping?q=tbn:ANd9GcQA_... $19.70 Maxi-Aids Hydas Pillow Vibrating Alarm
11 Silent Vibrating Personal Alarm Clock "Shake-N... http://t3.gstatic.com/shopping?q=tbn:ANd9GcShC... $38.95 eBay - sportsc4you Wrist Worn Silent Vibrating Alarm Clock Detach...
12 Sonic Alert SB200ss Portable Vibrating Travel ... http://t3.gstatic.com/shopping?q=tbn:ANd9GcQvM... $38.88 As Seen On TV Junkies Sonic Alert Sb200ss Portable Vibrating Travel ...
13 The Global 360 Vibrating Alarm Clock is design... http://t2.gstatic.com/shopping?q=tbn:ANd9GcTBy... $48 from 5+ stores Global Assistive 360 Vibrating Alarm Clock,Cha...
14 The stylish yet functional Casio Futurist Mens... http://t1.gstatic.com/shopping?q=tbn:ANd9GcQWc... $65 Maxi-Aids Casio Futurist Mens Vibrating Alarm Watch Black
15 This extra loud vibrating alarm clock is desig... http://t3.gstatic.com/shopping?q=tbn:ANd9GcRDi... $59.00 Dynamic Living Wake Assure Vibrating Alarm Clock
16 The stylish Sonic Boom SB300ss from Sonic Aler... http://t1.gstatic.com/shopping?q=tbn:ANd9GcRVo... $48.00 Vaughn Engineering Sonic Alert Sonic Boom SB300ss Vibrating Alarm...
17 Silent Vibrating Alarm Clock Wrist Watch Pillo... http://t3.gstatic.com/shopping?q=tbn:ANd9GcSh5... $39.95 Bonanza - Abby Winston Bargain Shop Silent Vibrating Alarm Clock Wrist Watch Pillo...
18 mrtiny1 store SONIC ALERT SONIC BOOM SB300SS A... http://t3.gstatic.com/shopping?q=tbn:ANd9GcTMx... $36.95 eBay - myheargear Sonic Alert Sonic Boom Sb300ss Vibrating Alarm...
19 IN STOCK NOW AND READY TO SHIP Be As Green As ... http://t2.gstatic.com/shopping?q=tbn:ANd9GcR_3... $35.75 eBay - thewholesalegoliath 113db Really Very Loud Alarm Clock Bed Shaker/...
20 Product Features:Silent Vibrating Alarm Won't ... http://t1.gstatic.com/shopping?q=tbn:ANd9GcSYo... $29.99 eBay Personal Alarm Clock "shake-n-wake" Silent Vib...
21 Product Features:Silent Vibrating Alarm Won't ... http://t0.gstatic.com/shopping?q=tbn:ANd9GcT2m... $32.79 eBay Silent Vibrating Personal Alarm Clock "shake-n...
22 The Amplicom TCL 100 Analog Alarm Clock with W... http://t3.gstatic.com/shopping?q=tbn:ANd9GcSfs... $72.95 Maxi-Aids Amplicom Alarm Clock Ring Signaler with Vibrator
23 Sonic Boom Extra Loud Vibrating Alarm Clock Al... http://t3.gstatic.com/shopping?q=tbn:ANd9GcSUF... $49.95 eBay - samw4174 Sonic Alert Adjustable Extra Loud Vibrating Al...
24 You are bidding on a brand new Sonic Boom Elec... http://t1.gstatic.com/shopping?q=tbn:ANd9GcQP-... $39.95 eBay - samw4174 Sonic Boom Analog Vibrating Extra Loud Alarm C...
25 NaN NaN NaN NaN NaN
0 We dare you to sleep through this alarm clock.... http://t1.gstatic.com/shopping?q=tbn:ANd9GcQt3... $31 from 10+ stores Reizen SBB500SS Sonic Bomb Alarm Clock and Bed...
1 Guaranteed to wake even the deepest sleeper in... http://t0.gstatic.com/shopping?q=tbn:ANd9GcQ3e... $45 from 50+ stores Sonic Alert SA-SBD375SS - Dual Alarm Clock w/ ...
2 Built-in telephone signaler. 87 dB extra-loud ... http://t3.gstatic.com/shopping?q=tbn:ANd9GcQG2... $90 from 25+ stores Clear Sounds CLS-SW200 Shake Up Wake Up Alarm ...
3 Sonic Boom Alarm Clock The Sonic Boom is a cos... http://t1.gstatic.com/shopping?q=tbn:ANd9GcSCX... $38 from 25+ stores Sonic Alert Sonic Boom SB300ss Large Display A...
4 The Multi-Alarm Vibrating Reminder Watch for t... http://t3.gstatic.com/shopping?q=tbn:ANd9GcQ3p... $232.91 Rehabmart.com Multi-Alarm Vibrating Reminder Watch for the H...
5 Shake - n - Wake zzZ Silent Alarm Clock wakes ... http://t0.gstatic.com/shopping?q=tbn:ANd9GcQNO... $27.99 eBay Vibrating "shake-n-wake" Personal Alarm Clock ...
6 NaN NaN NaN NaN NaN
0 Built-in telephone signaler. 87 dB extra-loud ... http://t3.gstatic.com/shopping?q=tbn:ANd9GcQG2... $90 from 25+ stores Clear Sounds CLS-SW200 Shake Up Wake Up Alarm ...
1 Sonic Alert Sonic Boom SB1000 Alarm Clock w/ L... http://t1.gstatic.com/shopping?q=tbn:ANd9GcSJU... $54.25 eBay Sonic Alert Sonic Boom Sb1000 Alarm Clock W/ L...
2 The Amplicom TCL 100 Analog Alarm Clock with W... http://t3.gstatic.com/shopping?q=tbn:ANd9GcSfs... $72.95 Maxi-Aids Amplicom Alarm Clock Ring Signaler with Vibrator
3 The Amplicom TCL 200 Talking Digital Alarm Clo... http://t2.gstatic.com/shopping?q=tbn:ANd9GcS_V... $92.95 Maxi-Aids Amplicom Talking Digital Alarm Clock with Vibr...
4 You are bidding on a brand new Sonic Boom Elec... http://t1.gstatic.com/shopping?q=tbn:ANd9GcQP-... $39.95 eBay - samw4174 Sonic Boom Analog Vibrating Extra Loud Alarm C...
5 Bundle including the CLARITY-52510 AL10 AlertM... http://t1.gstatic.com/shopping?q=tbn:ANd9GcQWv... $178.00 eBay Clarity-52510120 Al10 Alertmaster Alert Master...
6 MSRP: $179.95 Clarity AlertMaster visual alert... http://t0.gstatic.com/shopping?q=tbn:ANd9GcTIJ... $165.00 eBay Clarity-52510-100 Al10 Alertmaster W/ Door Kno...
7 Sleep peacefully while feeling confident that ... http://t1.gstatic.com/shopping?q=tbn:ANd9GcQlF... $248.99 LibertyHealthSupply.com Safe Awake
8 The Quake N Wake 3 Alert Multi Timer offers 3 ... http://t3.gstatic.com/shopping?q=tbn:ANd9GcQ9n... $19.95 Maxi-Aids Quake N Wake 3 Alert Multi Timer Silver
9 The Quake N Wake 3 Alert Multi Timer offers 3 ... http://t2.gstatic.com/shopping?q=tbn:ANd9GcQFI... $19.95 Maxi-Aids Quake N Wake 3 Alert Multi Timer Black
10 Features 4 different timer stations that can b... http://t0.gstatic.com/shopping?q=tbn:ANd9GcQjR... $58.95 Maxi-Aids Multi Station Digital Timer 95dB
11 SquareTrade AP6.0 SILENT CALL MIDLAND NOAA EME... http://t1.gstatic.com/shopping?q=tbn:ANd9GcQ41... $99.50 eBay - myheargear Silent Call Midland Noaa Weather Alert Radio W...
12 The Sonic Ring Jr TR50 telephone signaler aler... http://t3.gstatic.com/shopping?q=tbn:ANd9GcRHS... $25 from 25+ stores Sonic Alert SA-TR50 Sonic Ring Jr. Telephone S...
... ... ... ... ...

1113 rows × 5 columns


In [216]:
len(df)


Out[216]:
1113

In [217]:
df.head()


Out[217]:
desc img_src price_str provider title
0 Bundle including the CLARITY-52510 AL10 AlertM... http://t1.gstatic.com/shopping?q=tbn:ANd9GcQWv... $178.00 eBay Clarity-52510120 Al10 Alertmaster Alert Master...
1 MSRP: $179.95 Clarity AlertMaster visual alert... http://t0.gstatic.com/shopping?q=tbn:ANd9GcTIJ... $165.00 eBay Clarity-52510-100 Al10 Alertmaster W/ Door Kno...
2 Sleep peacefully while feeling confident that ... http://t1.gstatic.com/shopping?q=tbn:ANd9GcQlF... $248.99 LibertyHealthSupply.com Safe Awake
3 Smoke alarm with silence and latching, screene... http://t3.gstatic.com/shopping?q=tbn:ANd9GcQ4U... $7 from 25+ stores First Alert 9120-12st Smoke Alarm, 120 Volt
4 A 120V AC Wire In Alarm with Battery Backup pr... http://t1.gstatic.com/shopping?q=tbn:ANd9GcQeY... $29.95 Maxi-Aids Kidde Smoke Alarm with Battery Backup

5 rows × 5 columns


In [218]:
# Number of distinct values found in each extracted column, one per line.
for column in ('desc', 'img_src', 'price_str', 'provider', 'title'):
    print(len(df[column].unique()))


442
465
359
91
469

In [219]:
# Total rows, fully distinct rows, and how many rows are exact duplicates.
# drop_duplicates() is evaluated once and its length reused.
n_rows = len(df)
n_distinct_rows = len(df.drop_duplicates())
print(n_rows)
print(n_distinct_rows)
print(n_rows - n_distinct_rows)


1113
532
581

In [220]:
# Count rows per provider.
d = df[['provider']].groupby('provider').count()
# NOTE(review): `sort` is referenced but not called, so `d` is rebound to the
# bound method instead of a sorted frame. Presumably a sort by count was
# intended (as done with d.sort(columns=['count'], ...) further down) - confirm.
d = d.sort

In [221]:
import daf.gr
d = daf.gr.group_and_count(df[['provider']])
d = d.sort(columns=['count'], ascending=False)

In [223]:
d.head()


Out[223]:
provider count
81 from 2 stores 158
36 Maxi-Aids 110
83 from 25+ stores 104
87 from 50+ stores 71
86 from 5+ stores 55

5 rows × 2 columns


In [225]:
t = df.sort(columns=['title', 'price_str'], ascending=[True, False])
t.head()


Out[225]:
desc img_src price_str provider title
806 NaN http://t1.gstatic.com/shopping?q=tbn:ANd9GcQ34... $99.95 Sears - MaxiAids.com
543 NaN http://t0.gstatic.com/shopping?q=tbn:ANd9GcQru... $54.99 LibertyHealthSupply.com
513 NaN http://t2.gstatic.com/shopping?q=tbn:ANd9GcSly... $468.99 CHIEF
746 NaN http://t2.gstatic.com/shopping?q=tbn:ANd9GcRKA... $41.00 Newegg.com - IPC Store
231 NaN http://t3.gstatic.com/shopping?q=tbn:ANd9GcROR... $152 from 2 stores

5 rows × 5 columns


In [227]:
t['title'].iloc[0]


Out[227]:
u''

In [239]:
print len(t)
dd = t[t['title']!='']
print len(dd)
dd.head()


1113
1107
Out[239]:
desc img_src price_str provider title
297 Bellman Classic Vibrating Alarm Clock includes... http://t3.gstatic.com/shopping?q=tbn:ANd9GcTuI... $74.99 Buy Amplified Phones "Bellman Classic Vibrating Alarm Clock,4.3""H ...
300 "Bellman Pro Vibrating Alarm Clock with LED Fl... http://t1.gstatic.com/shopping?q=tbn:ANd9GcRjD... $98.99 Buy Amplified Phones "Bellman Pro Vibrating Alarm Clock with LED Fl...
313 "Bellman Pro Vibrating Alarm Clock with LED Fl... http://t1.gstatic.com/shopping?q=tbn:ANd9GcRjD... $98.99 Buy Amplified Phones "Bellman Pro Vibrating Alarm Clock with LED Fl...
347 "Bellman Pro Vibrating Alarm Clock with LED Fl... http://t1.gstatic.com/shopping?q=tbn:ANd9GcRjD... $98.99 Buy Amplified Phones "Bellman Pro Vibrating Alarm Clock with LED Fl...
622 "Bellman Pro Vibrating Alarm Clock with LED Fl... http://t1.gstatic.com/shopping?q=tbn:ANd9GcRjD... $98.99 Buy Amplified Phones "Bellman Pro Vibrating Alarm Clock with LED Fl...

5 rows × 5 columns


In [229]:
t.tail(3)


Out[229]:
desc img_src price_str provider title
563 Smoke Alarm, Detection Method Ionization, No I... http://t0.gstatic.com/shopping?q=tbn:ANd9GcQZK... $9.40 ZORO Tools kidde 0914e smoke alarm,ionization,9v
564 Smoke Alarm, Detection Method Ionization, No I... http://t2.gstatic.com/shopping?q=tbn:ANd9GcTI5... $7.99 ZORO Tools kidde 0916e smoke alarm,ionization,9v
228 USI-2413 USI Hearing Impaired Smoke and Fire A... http://t3.gstatic.com/shopping?q=tbn:ANd9GcRi2... $114.54 homegarden universal security instruments usi-2413 smoke ...

3 rows × 5 columns


In [234]:
# Pattern for provider strings of the form 'from <N>+ stores'.
# Raw string literal, so \d and \+ reach the regex engine instead of being
# read as (invalid) string escapes.
exp = re.compile(r'from \d+\+ stores')
m = exp.match('from 25+ stores')
if m:
    # m.pos is the index at which the match attempt started (0 here)
    print(m.pos)
else:
    print('nope')


nope

In [272]:
print len(t)
dd = df[df['provider']!='']
dd = dd.dropna(subset=['provider'])
print len(dd)


1113
1095

In [278]:
# Boolean mask: True where the provider does NOT match the `exp` pattern.
lidx = [exp.match(provider) is None for provider in list(dd['provider'])]

In [280]:
ddd = dd[lidx]
ddd.head()


Out[280]:
desc img_src price_str provider title
0 Bundle including the CLARITY-52510 AL10 AlertM... http://t1.gstatic.com/shopping?q=tbn:ANd9GcQWv... $178.00 eBay Clarity-52510120 Al10 Alertmaster Alert Master...
1 MSRP: $179.95 Clarity AlertMaster visual alert... http://t0.gstatic.com/shopping?q=tbn:ANd9GcTIJ... $165.00 eBay Clarity-52510-100 Al10 Alertmaster W/ Door Kno...
2 Sleep peacefully while feeling confident that ... http://t1.gstatic.com/shopping?q=tbn:ANd9GcQlF... $248.99 LibertyHealthSupply.com Safe Awake
4 A 120V AC Wire In Alarm with Battery Backup pr... http://t1.gstatic.com/shopping?q=tbn:ANd9GcQeY... $29.95 Maxi-Aids Kidde Smoke Alarm with Battery Backup
5 120V AC Wire In Combination Carbon Monoxide an... http://t3.gstatic.com/shopping?q=tbn:ANd9GcTrf... $179.95 Maxi-Aids Kidde Combo Carbon Monoxide and Smoke Alarm wi...

5 rows × 5 columns


In [285]:
d = daf.gr.group_and_count(ddd[['provider']])
d = d.sort(columns=['count'], ascending=False)
d.to_excel(facc('provider_counts.xlsx'))
d.head()


Out[285]:
provider count
79 from 2 stores 158
36 Maxi-Aids 110
80 from 3 stores 55
61 eBay 55
41 Phone Merchants 51

5 rows × 2 columns


In [285]:


In [286]:
url = df['img_src'].iloc[0]
url


Out[286]:
'http://t1.gstatic.com/shopping?q=tbn:ANd9GcQWvScogopDaS-VRp_NpZLe1xFemevuM657iSHjbJZWvjWE_ek4T3IUfdu7LEW3wiLQUWoV7w&usqp=CAE'

In [290]:
r = requests.get(url, stream=True)

In [292]:
import shutil

In [293]:
with open(facc('img.png'), 'wb') as out_file:
    shutil.copyfileobj(r.raw, out_file)

In [ ]:


In [294]:
w = list(df['img_src'].unique())

In [297]:
facc.save(w, 'img_src_list.pickle')

In [ ]:


In [162]:
root_folder


Out[162]:
'/Users/thor/Google Drive/Shared/ms_otosense'

In [164]:
facc('raw_provider_counts.html')


Out[164]:
'/Users/thor/Google Drive/Shared/ms_otosense/raw_provider_counts.html'

In [165]:
d.to_html(facc('raw_provider_counts.html'))

In [166]:
parse.util.open_in_firefox(facc('raw_provider_counts.html'))


opening /Users/thor/Google Drive/Shared/ms_otosense/raw_provider_counts.html

In [ ]:


In [184]:
import semantics.term_stats
import semantics.term_stats_maker

In [189]:
w = df[['title']].reset_index()

In [190]:
w.head()


Out[190]:
index title
0 0 Clarity-52510120 Al10 Alertmaster Alert Master...
1 1 Clarity-52510-100 Al10 Alertmaster W/ Door Kno...
2 2 Safe Awake
3 3 First Alert 9120-12st Smoke Alarm, 120 Volt
4 4 Kidde Smoke Alarm with Battery Backup

5 rows × 2 columns


In [193]:
w = semantics.term_stats_maker.mk_terms_df(df[['title']], text_cols=['title'], id_cols=['index'])


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-193-18cdc288bbdc> in <module>()
----> 1 w = semantics.term_stats_maker.mk_terms_df(df[['title']], text_cols=['title'], id_cols=['index'])

/D/Dropbox/dev/py/proj/ms_utils/semantics/term_stats_maker.py in mk_terms_df(df, text_cols, id_cols, tokenizer_re)
     32     for c in text_cols:
     33         d = df[id_cols]
---> 34         d['term'] = map(lambda x : re.findall(tokenizer_re, x), df[c])
     35         d = daf_manip.rollout_cols(d, cols_to_rollout='term')
     36         dd = pd.concat([dd, d])

/D/Dropbox/dev/py/proj/ms_utils/semantics/term_stats_maker.py in <lambda>(x)
     32     for c in text_cols:
     33         d = df[id_cols]
---> 34         d['term'] = map(lambda x : re.findall(tokenizer_re, x), df[c])
     35         d = daf_manip.rollout_cols(d, cols_to_rollout='term')
     36         dd = pd.concat([dd, d])

/Users/thor/anaconda/lib/python2.7/re.pyc in findall(pattern, string, flags)
    175 
    176     Empty matches are included in the result."""
--> 177     return _compile(pattern, flags).findall(string)
    178 
    179 if sys.hexversion >= 0x02020000:

TypeError: expected string or buffer
> /Users/thor/anaconda/lib/python2.7/re.py(177)findall()
    176     Empty matches are included in the result."""
--> 177     return _compile(pattern, flags).findall(string)
    178 

ipdb> u
> /D/Dropbox/dev/py/proj/ms_utils/semantics/term_stats_maker.py(34)<lambda>()
     33         d = df[id_cols]
---> 34         d['term'] = map(lambda x : re.findall(tokenizer_re, x), df[c])
     35         d = daf_manip.rollout_cols(d, cols_to_rollout='term')

ipdb> u
> /D/Dropbox/dev/py/proj/ms_utils/semantics/term_stats_maker.py(34)mk_terms_df()
     33         d = df[id_cols]
---> 34         d['term'] = map(lambda x : re.findall(tokenizer_re, x), df[c])
     35         d = daf_manip.rollout_cols(d, cols_to_rollout='term')

ipdb> df[c]
0     Clarity-52510120 Al10 Alertmaster Alert Master...
1     Clarity-52510-100 Al10 Alertmaster W/ Door Kno...
2                                            Safe Awake
3           First Alert 9120-12st Smoke Alarm, 120 Volt
4                 Kidde Smoke Alarm with Battery Backup
5     Kidde Combo Carbon Monoxide and Smoke Alarm wi...
6                   Kidde Smoke Alarm with Strobe Light
7     First Alert BRK SA100B 120V AC Hardwired Ioniz...
8     Gentex 710CS Hard Wired Ceiling Mount Smoke Alarm
9     Kidde Combination Smoke and Carbon Monoxide Al...
10    Kidde Carbon Monoxide and Smoke Alarms with St...
11                                           Safe Awake
12     Central Alert System Receiver and Clock HC-CA360
13     Central Alert System Receiver and Clock HC-CA360
14    Reizen SBB500SS Sonic Bomb Alarm Clock and Bed...
...
1098    Harris Communications Amplicall 10 Telephone R...
1099    Harris Communications CL-XL45 Clarity XL45 Amp...
1100    Geemarc Telephone Ringer and Flasher,14cm x 11...
1101    Serene Innovations CentralAlert Door Chime, In...
1102    Silent Call Communications Silent Call Wireles...
1103    "Krown Tactile Telephone Ring Signaler,Tactile...
1104    Serene Innovations CentralAlert Door Hanger wi...
1105        Sonic Informer Telephone Receiver - P_SASA101
1106                   Silent Call Vibra Call Basic Kit A
1107                                Vibra Call Deluxe Kit
1108               Silent Call Sidekick II Receiver Combo
1109             Silent Call VC4002-2 Vibra Call Receiver
1110         Vibralite 3 Chrome with Stainless Steel Band
1111                Vibralite 3 White with Pale Blue Trim
1112                                                  NaN
Name: title, Length: 1113, dtype: object
ipdb> ww = map(lambda x : re.findall(tokenizer_re, x), df[c])
*** TypeError: expected string or buffer
ipdb> tokenizer_re
<_sre.SRE_Pattern object at 0x10a0adc10>
ipdb> df[c].iloc[0]
u'Clarity-52510120 Al10 Alertmaster Alert Master Al12 W/ Remote Hard ...'
ipdb> exit

In [ ]:


In [182]:
w = semantics.term_stats.termdoc_to_termdoc_count(df[['title']])


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-182-fdc9491ab3f5> in <module>()
----> 1 w = semantics.term_stats.termdoc_to_termdoc_count(df[['title']])

/D/Dropbox/dev/py/proj/ms_utils/semantics/term_stats.pyc in termdoc_to_termdoc_count(term_doc_df, doc_var, term_var, count_var)
     36 def termdoc_to_termdoc_count(term_doc_df, doc_var=None, term_var='term', count_var='count'):
     37     # processing input
---> 38     term_doc_df, doc_var, term_var = __process_term_doc_var_names__(term_doc_df, doc_var=doc_var, term_var=term_var)
     39     term_doc_df = term_doc_df.groupby([doc_var, term_var]).count()
     40     term_doc_df = daf_ch.ch_col_names(term_doc_df, count_var, term_var)

/D/Dropbox/dev/py/proj/ms_utils/semantics/term_stats.pyc in __process_term_doc_var_names__(term_doc_df, doc_var, term_var)
     72     if doc_var is None:  # try to guess it
     73         if len(cols) != 2:
---> 74             raise ValueError("In order to guess the doc_var, there needs to be only two columns")
     75         else:
     76             doc_var = list(set(cols)-set([term_var]))[0]

ValueError: In order to guess the doc_var, there needs to be only two columns
> /D/Dropbox/dev/py/proj/ms_utils/semantics/term_stats.py(74)__process_term_doc_var_names__()
     73         if len(cols) != 2:
---> 74             raise ValueError("In order to guess the doc_var, there needs to be only two columns")
     75         else:

ipdb> exit

In [ ]:


In [194]:
t = list(df['title'])
len(t)


Out[194]:
1113

IMAGE PROCESSING


In [304]:
imfacc = pfile.accessor.for_local(facc('slurp_images'))
# File names (inside the folder served by `imfacc`) of the images to compare.
# Written as a list literal: assigning into range(3) only works where range()
# returns a list (Python 2).
fi = [
    'http{§§t0.gstatic.com§shopping?q=tbn{ANd9GcQruvEIH0LiDP_kQSbHMNQmxZ7fPXxCyig_GhU3C48nmQoEdeBu05QK-B-b9L0bhdtInhZYFQ&usqp=CAE.png',
    'http{§§t0.gstatic.com§shopping?q=tbn{ANd9GcQTYJWmInTSU4Ip9AIb19kdn5rC5Xxxyr9iO5CQrvZwRtf-OhQ22AJ2JkpzC5ysCXq3Th8dlA&usqp=CAE.png',
    'http{§§t0.gstatic.com§shopping?q=tbn{ANd9GcQu0uo9SbTFPnnOCSSMY36niM1lzM9SYSUIuWvPujiIj1cfbWiO4QHzo4U08mBsNwF62nVrhA&usqp=CAE.png',
]

In [298]:
import scipy as sp
from scipy.misc import imread
from scipy.signal.signaltools import correlate2d

In [300]:
def imscore(imfile):
    '''
    Load an image and return it as a normalised grey-scale array.

    :param imfile: path of the image file (passed to imread)
    :return: 2-D float array with zero mean and unit standard deviation.
        A constant image has zero standard deviation, so its result is not finite.
    '''
    # image as an array: (rows, cols, channels) for colour, (rows, cols) for grey
    data = np.asarray(imread(imfile), dtype=float)
    if data.ndim == 3:
        # convert to grey-scale using the W3C luminance weights on R, G, B;
        # the slice ignores an alpha channel if the file has one
        data = np.inner(data[..., :3], [299, 587, 114]) / 1000.0
    # normalize per http://en.wikipedia.org/wiki/Cross-correlation
    return (data - data.mean()) / data.std()

In [307]:
# Normalised grey-scale array for every file in `fi`. The list is built
# directly from `fi`, so it follows the length of `fi` rather than a
# hard-coded 3 and does not rely on range() returning a list.
n = len(fi)
im = [imscore(imfacc(fname)) for fname in fi]

In [310]:
im[0].shape


Out[310]:
(114, 120)

In [334]:
def comp(i,j):
    '''
    Similarity score between two of the loaded images.

    :param i: index into the module-level list `im`
    :param j: index into the module-level list `im`
    :return: largest value of the 2-D cross-correlation (mode='same') of im[i] with im[j]
    '''
    cross_corr = correlate2d(im[i], im[j], mode='same')
    return cross_corr.max()

In [335]:
# Pairwise similarity matrix: compmat[i, j] = comp(i, j).
# Sized with n so that it always agrees with the loop bounds below.
compmat = np.zeros([n, n])
for i in range(n):
    for j in range(n):
        compmat[i,j] = comp(i,j)

In [336]:
compmat


Out[336]:
array([[ 13680.        ,  13390.04226973,   8301.97241631],
       [ 13390.04226973,  14400.        ,   8403.24086603],
       [  8301.97241631,   8403.24086603,  12000.        ]])

In [338]:
%%timeit
comp(1,2)


1 loops, best of 3: 1.82 s per loop

In [339]:
500*500*2/3600


Out[339]:
138

In [326]:
pd.DataFrame


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-326-b3d80b2a559e> in <module>()
----> 1 comp[[91,20]]

IndexError: index 91 is out of bounds for size 2
> <ipython-input-326-b3d80b2a559e>(1)<module>()
----> 1 comp[[91,20]]

ipdb> exit

In [ ]:
import ms_utils
import pdict.manip

In [2]:
import importlib

In [7]:
t = importlib.import_module


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-7-606333557576> in <module>()
----> 1 t = importlib.import_module('/D/Dropbox/dev/py/proj/khan')

/Users/thor/anaconda/lib/python2.7/importlib/__init__.pyc in import_module(name, package)
     35             level += 1
     36         name = _resolve_name(name[level:], package, level)
---> 37     __import__(name)
     38     return sys.modules[name]

ImportError: Import by filename is not supported.

In [8]:
importlib.


Out[8]:
<module 'importlib' from '/Users/thor/anaconda/lib/python2.7/importlib/__init__.pyc'>

In [1]:
import ms_utils as ms

In [3]:
ms


Out[3]:
<module 'ms_utils' from '/D/Dropbox/dev/py/proj/ms_utils/__init__.pyc'>

In [5]:
import sys

In [6]:
sys.path


Out[6]:
['',
 '/D/Dropbox/dev/py/packages/xgoogle/xgoogle',
 '/D/Dropbox/dev/py/proj/ms_utils',
 '/D/Dropbox/dev/py/proj',
 '/usr/local/lib/python',
 '/D/Dropbox/dev/py/proj/khan',
 '/D/Dropbox/dev/py/proj/global_spa/notebooks/thor',
 '/Users/thor/anaconda/lib/python27.zip',
 '/Users/thor/anaconda/lib/python2.7',
 '/Users/thor/anaconda/lib/python2.7/plat-darwin',
 '/Users/thor/anaconda/lib/python2.7/plat-mac',
 '/Users/thor/anaconda/lib/python2.7/plat-mac/lib-scriptpackages',
 '/Users/thor/anaconda/lib/python2.7/lib-tk',
 '/Users/thor/anaconda/lib/python2.7/lib-old',
 '/Users/thor/anaconda/lib/python2.7/lib-dynload',
 '/Users/thor/anaconda/lib/python2.7/site-packages',
 '/Users/thor/anaconda/lib/python2.7/site-packages/Numeric',
 '/Users/thor/anaconda/lib/python2.7/site-packages/PIL',
 '/Users/thor/anaconda/lib/python2.7/site-packages/setuptools-0.6c11-py2.7.egg-info',
 '/Users/thor/anaconda/lib/python2.7/site-packages/IPython/extensions']

In [ ]: