This notebook creates a function that scrapes the content of given URLs.
It then applies this function to URLs from the video views we retrieved from vidible_raw,
in order to create a URL-to-locations table.
NOTE: because we scrape the web-page content and not the video content, later queries should be limited to seq=1
(rows with seq>1 are not necessarily related to the page content).
Scraping is done with the pup CLI tool; download: https://github.com/EricChiang/pup/releases/tag/v0.4.0
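As a quick optional check before running the scraping cells, you can confirm that the pup binary is on the PATH. A minimal sketch (not part of the original flow; assumes Python 3 for shutil.which):
In [ ]:
import shutil

# optional sanity check: pup must be installed and on the PATH for scraping to work
if shutil.which('pup') is None:
    print("pup not found - install it from the release page above")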
Define:
site_scrape_dict: site-to-CSS-selector dictionary
extract_locales: function that takes 'url' and 'site' args and returns a set of locations
In [1]:
import numpy as np
import pandas as pd
from spacy.symbols import pobj
site_scrape_dict = {
    # each entry maps a site to a CSS selector that retrieves the headline plus the first two text paragraphs
'aol.com': '#article-wrapper h1, #article-wrapper > div.article-content > p:nth-child(2) , #article-wrapper > div.article-content > p:nth-child(3)',
'homepage.aol.com': '#article-wrapper h1, #article-wrapper > div.article-content > p:nth-child(2) , #article-wrapper > div.article-content > p:nth-child(3)',
'hp-desktop.aol.com': '#article-wrapper h1, #article-wrapper > div.article-content > p:nth-child(2) , #article-wrapper > div.article-content > p:nth-child(3)',
    'help.aol.com': '#article-wrapper h1, #article-wrapper > div.article-content > p:nth-child(2) , #article-wrapper > div.article-content > p:nth-child(3)', # we might need to exclude this site
'aol.co.uk': 'body > div.lo-container > div > section > article > header > div.show-article-title > h1, body > div.lo-container > div > section > article > section:nth-child(2) > div > div > p:nth-child(2), body > div.lo-container > div > section > article > section:nth-child(2) > div > div > p:nth-child(3), body > div.lo-container > div > section > article > section:nth-child(2) > div > div > p:nth-child(4)',
'build.aol.com': '#build-video-player > div.video-content-main > div.videoplayer-info > div > div.videotext > h1, #build-video-player > div.video-content-main > div.videoplayer-info > div > div.videotext > span.videodesc',
}
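# Optional helper (not in the original notebook) for tuning the selectors above:
# run a candidate CSS selector through pup against a locally saved page and return
# the text it matches. Assumes pup is installed; the example path is hypothetical.
import subprocess

def preview_selector(html_path, css_selector):
    with open(html_path, 'rb') as fh:
        p = subprocess.Popen(['pup', css_selector + ' text{}'],
                             stdin=fh, stdout=subprocess.PIPE)
        out, _ = p.communicate()
    return out.decode('utf-8')

# example: preview_selector('/tmp/saved_article.html', site_scrape_dict['aol.com'])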
def extract_locales(url, site):
""" returns a set of gpe unicode strings """
raw_text = _scrape_site(url, site)
# print(raw_text) #debugging
gpe_list = _get_gpes(raw_text)
return gpe_list
import spacy
nlp = spacy.load('en')
def _get_gpes(raw_text):
    """
    Extract GPE (geopolitical entity) mentions that appear as prepositional objects
    in the given text and return them as a set of strings.
    """
gpe_list = set()
if raw_text is None:
return gpe_list
raw_text = raw_text.strip().replace("\n", " ").replace("\r", " ")
doc = nlp(raw_text)
for chunk in list(doc.noun_chunks):
gpe = None
isPobj = False
for sub_chunk in list(chunk.subtree):
if(sub_chunk.ent_type_ == 'GPE'):
gpe = sub_chunk.string
if(sub_chunk.dep == pobj):
isPobj = True
        if gpe is not None and isPobj:
            # print(gpe)  # the same value can be added more than once - chunk.subtree may return the same phrase more than once
gpe_list.add(gpe)
return gpe_list
# list(list(doc.noun_chunks)[6].subtree)[1].ent_type_
# list(list(doc.noun_chunks)[6].subtree)[2].dep_
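# Quick illustration (not in the original notebook): on a toy sentence, _get_gpes should
# pick up the GPE mentions used as prepositional objects ("in Chicago", "to Paris").
# The exact strings returned depend on the spaCy model, so treat the output as approximate.
print(_get_gpes(u"The mayor spoke at a rally in Chicago before flying to Paris."))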
import subprocess
def _scrape_site(url, site):
if site in site_scrape_dict:
html_selector = site_scrape_dict[site]
else:
        html_selector = 'h1'  # this might be dangerous - it can return too many results
        # return ''  # another option is to scrape only sites we know
command = "curl -s '" + url + "' |pup '" + html_selector + " text{}'"
# print("DEBUG scrape: {}".format(command))
p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = p.communicate()
if out:
return out.decode('utf-8')
if err:
print("failed to scrape {}".format(url))
return ''
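Because the command string above splices the URL directly into a shell pipeline, a URL containing a single quote would break the command. The sketch below is a hypothetical variant of _scrape_site that pipes curl into pup without shell=True, so the URL never passes through shell quoting; it illustrates the same idea rather than replacing the function above.
In [ ]:
import subprocess

def _scrape_site_noshell(url, site):
    """Hypothetical variant of _scrape_site: same curl | pup pipeline, but without a shell."""
    html_selector = site_scrape_dict.get(site, 'h1')
    curl = subprocess.Popen(['curl', '-s', url], stdout=subprocess.PIPE)
    pup = subprocess.Popen(['pup', html_selector + ' text{}'],
                           stdin=curl.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    curl.stdout.close()  # let curl receive SIGPIPE if pup exits early
    out, err = pup.communicate()
    if out:
        return out.decode('utf-8')
    if err:
        print("failed to scrape {}".format(url))
    return ''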
Create a table of unique ['url', 'site'] pairs.
NOTE: in this example we take only 5 rows (scraping may run for a long time).
In [17]:
df = pd.read_csv('/Users/ezer/dev/ml/factorization_matrix/baseline/data/memsql/memsql_test3.csv', skiprows=1000, header=1, nrows=5, parse_dates=['reporttime'], names=['ip','sid','vid','seq','site','r','pid','countrycode','stateprovince','city','devType','max_vpt','max_t','max_pct','reporttime'])
print("num of rows (before unique): {}".format(df.shape[0]))
df = df.filter(['r','site'], axis=1)  # NOTE: per the intro, this should also be restricted to df['seq'] == 1
df = df.groupby(['r', 'site']).count() #.reset_index()
df = df.reset_index()
print("columns: {}".format(df.columns))
print("num of rows for scraping: {}".format(df.shape[0]))
if df.shape[0] > 10:
print("WARNING! executing large number of rows may take a long while: {}".format(df.shape[0]))
Create a new CSV that maps each URL to its extracted locations (pipe-delimited).
In [20]:
total = df.shape[0]
current = 0
OUTPUT_FILE = '/tmp/locals_of_urls.csv'
with open(OUTPUT_FILE,'w') as f:
f.write('url,locations\n')
for index, row in df.iterrows():
url, site = row['r'], row['site']
local_set = extract_locales(url, site)
csv_locals = '|'.join(str(s).strip() for s in local_set)
line = "{},{}\n".format(url, csv_locals)
f.write(line)
current+=1
if current%10 == 0: # print every 10 urls (reduce garbage..)
print("adding [{} of {}], url: {}".format(current, total, url))
print "*** Done! ***"
Check the result in the new file.
In [21]:
locations_df = pd.read_csv(OUTPUT_FILE, na_filter=False)
print("locations_df num of rows: {}".format(locations_df.shape[0]))
if locations_df.shape[0] != df.shape[0]:
    print("there is a count mismatch between original: {} and location urls: {}".format(df.shape[0], locations_df.shape[0]))
locations_df.head()
Out[21]:
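If later steps need the locations as Python lists rather than pipe-delimited strings, a small follow-up sketch (assuming the 'url'/'locations' column names written above):
In [ ]:
# split the pipe-delimited locations back into lists (empty string -> empty list)
locations_df['locations'] = locations_df['locations'].fillna('').astype(str).apply(
    lambda s: [loc for loc in s.split('|') if loc])
locations_df.head()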
In [ ]: