Scraping video geo context

using the URL of the page that holds it

This notebook defines a function that scrapes the content of a given URL. We then apply it to the URLs of the video views retrieved from vidible_raw, in order to build a url-to-locations table.

NOTE: because we scrape the web-page content and not the video content, later queries should be limited to seq=1 (views with seq>1 are not necessarily related to the page content).

Scraping is done using the pup CLI tool; download: https://github.com/EricChiang/pup/releases/tag/v0.4.0

https://github.com/ericchiang/pup
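
Before running anything it is worth confirming that the pup binary is actually on the PATH; a minimal check (an assumption here: a Unix-like shell where the which command is available):


In [ ]:
import subprocess

# quick check that pup is installed and reachable; the scraping function defined below shells out to it
if subprocess.call('which pup > /dev/null 2>&1', shell=True) != 0:
    print("pup not found on PATH - install it from the release link above")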

Below we define:

site_scrape_dict: a site-to-CSS-selector dictionary

extract_locales: a function that takes 'url' and 'site' args and returns a set of locations


In [1]:
import numpy as np
import pandas as pd

from spacy.symbols import pobj

site_scrape_dict = {
    # each selector below retrieves the article header + the first two text paragraphs
    'aol.com': '#article-wrapper h1, #article-wrapper > div.article-content > p:nth-child(2) , #article-wrapper > div.article-content > p:nth-child(3)',
    'homepage.aol.com': '#article-wrapper h1, #article-wrapper > div.article-content > p:nth-child(2) , #article-wrapper > div.article-content > p:nth-child(3)',
    'hp-desktop.aol.com': '#article-wrapper h1, #article-wrapper > div.article-content > p:nth-child(2) , #article-wrapper > div.article-content > p:nth-child(3)',
    'help.aol.com': '#article-wrapper h1, #article-wrapper > div.article-content > p:nth-child(2) , #article-wrapper > div.article-content > p:nth-child(3)', # we might need to exclude it
    'aol.co.uk': 'body > div.lo-container > div > section > article > header > div.show-article-title > h1, body > div.lo-container > div > section > article > section:nth-child(2) > div > div > p:nth-child(2), body > div.lo-container > div > section > article > section:nth-child(2) > div > div > p:nth-child(3), body > div.lo-container > div > section > article > section:nth-child(2) > div > div > p:nth-child(4)',
    'build.aol.com': '#build-video-player > div.video-content-main > div.videoplayer-info > div > div.videotext > h1, #build-video-player > div.video-content-main > div.videoplayer-info > div > div.videotext > span.videodesc',
}

def extract_locales(url, site):
    """ returns a set of GPE (geopolitical entity) unicode strings """
    raw_text = _scrape_site(url, site)
#     print(raw_text)  # debugging
    gpe_list = _get_gpes(raw_text)
    return gpe_list


import spacy
nlp = spacy.load('en')

def _get_gpes(raw_text):
    """Return the set of GPE (geopolitical entity) strings that appear
    as prepositional objects in raw_text."""
    gpe_list = set()
    if raw_text is None:
        return gpe_list
    raw_text = raw_text.strip().replace("\n", " ").replace("\r", " ")
    doc = nlp(raw_text)
    for chunk in doc.noun_chunks:
        gpe = None
        is_pobj = False
        for sub_chunk in chunk.subtree:
            if sub_chunk.ent_type_ == 'GPE':
                gpe = sub_chunk.string
            if sub_chunk.dep == pobj:
                is_pobj = True
        if gpe is not None and is_pobj:
#             print(gpe)  # the same value can be printed more than once - chunk.subtree may yield the same phrase more than once
            gpe_list.add(gpe)
    return gpe_list


import subprocess

def _scrape_site(url, site):
    """Fetch the page with curl and extract the text of the configured selectors with pup."""
    if site in site_scrape_dict:
        html_selector = site_scrape_dict[site]
    else:
        html_selector = 'h1'  # fallback - this might be risky, returning too many results
        # return ''  # another option is to scrape only sites we know
    command = "curl -s '" + url + "' | pup '" + html_selector + " text{}'"
#     print("DEBUG scrape: {}".format(command))
    p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = p.communicate()
    if out:
        return out.decode('utf-8')
    if err:
        print("failed to scrape {}".format(url))
    return ''
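
A quick offline sanity check of the entity-extraction step (no network involved); the sentence below is a made-up example, not data from the source:


In [ ]:
# offline check of _get_gpes on a sample sentence (illustrative only)
sample_text = u"Flood warnings were issued for several counties in Florida over the weekend."
print(_get_gpes(sample_text))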

Create a unique list of [url, site] pairs (the url column is named 'r' in the raw data).

NOTE in this example we take only 5 rows (scraping may run for a long time).


In [17]:
df = pd.read_csv('/Users/ezer/dev/ml/factorization_matrix/baseline/data/memsql/memsql_test3.csv', skiprows=1000, header=1, nrows=5, parse_dates=['reporttime'], names=['ip','sid','vid','seq','site','r','pid','countrycode','stateprovince','city','devType','max_vpt','max_t','max_pct','reporttime'])
print("num of rows (before unique): {}".format(df.shape[0]))
df = df.filter(['r','site'], axis=1)  # per the NOTE above, consider also restricting to df['seq'] == 1
df = df.groupby(['r', 'site']).count()
df = df.reset_index()
print("columns: {}".format(df.columns))
print("num of rows for scraping: {}".format(df.shape[0]))
if df.shape[0] > 10:
    print("WARNING! executing large number of rows may take a long while: {}".format(df.shape[0]))


num of rows (before unique): 5
columns: Index([u'r', u'site'], dtype='object')
num of rows for scraping: 4
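
An equivalent and somewhat more direct way to get the unique pairs is drop_duplicates; a sketch, assuming the raw frame is kept under the hypothetical name df_raw before the filtering above:


In [ ]:
# dedup sketch: one row per (url, site) pair, restricted to seq == 1 per the NOTE at the top
# df_raw is a hypothetical name for the frame as returned by read_csv above
df_unique = (df_raw[df_raw['seq'] == 1]
             .filter(['r', 'site'], axis=1)
             .drop_duplicates()
             .reset_index(drop=True))
print("num of rows for scraping: {}".format(df_unique.shape[0]))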

Create a new CSV that maps each url to its extracted locations (pipe-delimited).


In [20]:
total = df.shape[0]
current = 0
OUTPUT_FILE = '/tmp/locals_of_urls.csv'
with open(OUTPUT_FILE,'w') as f:
    f.write('url,locations\n')
    for index, row in df.iterrows():
        url, site = row['r'], row['site']
        local_set = extract_locales(url, site)
        csv_locals = '|'.join(str(s).strip() for s in local_set)
        line = "{},{}\n".format(url, csv_locals)
        f.write(line)
        current += 1
        if current % 10 == 0:  # print progress every 10 urls (reduce log noise)
            print("adding [{} of {}], url: {}".format(current, total, url))
print("*** Done! ***")


*** Done! ***

Check the result in the new file.


In [21]:
locations_df = pd.read_csv(OUTPUT_FILE, na_filter=False)  # keep empty location cells as '' rather than NaN
print("locations_df num of rows: {}".format(locations_df.shape[0]))
if locations_df.shape[0] != df.shape[0]:
    print("there is a count mismatch between original: {} and location urls: {}".format(df.shape[0], locations_df.shape[0]))
locations_df.head()


locations_df num of rows: 4
Out[21]:
url locations
0 https://www.aol.com/article/entertainment/2017...
1 https://www.aol.com/article/entertainment/2017...
2 https://www.aol.com/article/news/2017/06/23/of... Alabama
3 https://www.aol.com/article/news/2017/06/24/fl... Florida
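
Downstream consumers will need to split the pipe-delimited locations column back into Python collections; a small sketch over the frame read above:


In [ ]:
# split the pipe-delimited locations back into lists (empty string -> empty list)
locations_df['location_list'] = locations_df['locations'].fillna('').apply(
    lambda s: [loc for loc in s.split('|') if loc])
print("urls with at least one location: {}".format(
    (locations_df['location_list'].apply(len) > 0).sum()))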
