notebook.community

Edit and run



In [7]:

    
import pandas as pd
from bs4 import BeautifulSoup
from bs4 import Comment


import re
import glob
import ntpath
import networkx as nx
import os
from bs4.element import Comment
def path_leaf(path):
    """Return the last component of ``path``.

    ``ntpath`` is used for the split, so both ``/`` and ``\\`` are treated
    as separators. When the path ends with a separator (the split yields an
    empty tail), the basename of the remaining head is returned instead.

    Args:
        path: A file-system path string.

    Returns:
        The final path component, or ``''`` if there is none.
    """
    parent, leaf = ntpath.split(path)
    if leaf:
        return leaf
    # Trailing separator: the leaf is the last component of the parent part.
    return ntpath.basename(parent)



In [2]:

    
# Path to the CSV with the `filename` and `URL` columns — presumably the
# mapping from each crawled HTML file to the URL it was fetched from.
# NOTE(review): absolute, machine-specific path; adjust for your environment.
csv_file = '/media/rna/yahoo_crawl_data/Yahoo-20190406T235503Z-001/Yahoo/URLtoHTML_yahoo_news.csv'



In [3]:

    
# Load the filename/URL table, order it by filename then URL, and renumber
# the rows from 0 so the index follows the sorted order.
mapping_file_df = (
    pd.read_csv(csv_file)
    .sort_values(by=['filename', 'URL'])
    .reset_index(drop=True)
)
# Uncomment to preview the first rows:
# mapping_file_df.head()









    Out[3]:







  
    
      
      filename
      URL
    
  
  
    
      0
      00090e25-3aeb-4e0e-abfa-00ea58a6a48d.html
      https://www.yahoo.com/news/harry-s-truman-cour...
    
    
      1
      00193717-117b-45b6-84dc-2e88026d41ca.html
      https://www.yahoo.com/news/clinton-has-the-map...
    
    
      2
      00272c51-112a-455e-81e4-45b033b05475.html
      https://www.yahoo.com/news/definitive-ranking-...
    
    
      3
      003247be-6972-419b-8b05-8d53a013db55.html
      https://www.yahoo.com/news/george-h-w-bush-sli...
    
    
      4
      003ccc65-6f50-4219-a98d-3515db0986df.html
      https://www.yahoo.com/news/lebron-james-hillar...



In [4]:

    
# Directory containing the crawled HTML files.
# NOTE(review): absolute, machine-specific path; adjust for your environment.
crawl_data_dir = '/media/rna/yahoo_crawl_data/Yahoo-20190406T235503Z-001/Yahoo/yahoo/'

# os.path.join avoids the doubled separator that '{}/*.html'.format(...) produced
# because crawl_data_dir already ends with '/'.
# glob.glob returns matches in arbitrary order; sorting makes every downstream
# step (and the file it writes) reproducible from run to run.
list_of_html_files = sorted(glob.glob(os.path.join(crawl_data_dir, '*.html')))



In [21]:

    
# Credits: https://stackoverflow.com/a/1983219/756986
def tag_visible(element):
    """Decide whether a parsed text node counts as visible page text.

    Args:
        element: A text node from a BeautifulSoup tree; its ``parent.name``
            is inspected.

    Returns:
        False if the node's parent is one of the container names listed
        below, or if the node is a ``Comment``; True otherwise.
    """
    # Text sitting directly inside these containers is treated as not visible.
    inside_hidden_tag = element.parent.name in (
        'style', 'script', 'head', 'title', 'meta', '[document]',
    )
    # `or` short-circuits, so the parent check still runs before the Comment check.
    return not (inside_hidden_tag or isinstance(element, Comment))


def text_from_html(body):
    """Extract the visible text of an HTML document as a single string.

    Args:
        body: HTML markup to parse with the built-in ``html.parser`` backend.

    Returns:
        Every text node accepted by ``tag_visible``, stripped of surrounding
        whitespace and joined with single spaces. Whitespace-only nodes
        contribute an empty string, so the result may contain runs of
        spaces; callers that tokenize with ``str.split()`` are unaffected.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # find_all(string=True) is the current spelling of the legacy
    # findAll(text=True); it returns the same text nodes without relying on
    # the deprecated method name and keyword.
    text_nodes = soup.find_all(string=True)
    return u" ".join(node.strip() for node in text_nodes if tag_visible(node))

def is_likely_a_word(string):
    """Report whether ``string`` consists solely of alphabetic characters.

    Tokens containing digits, punctuation or apostrophes (e.g. ``"word,"`` or
    ``"don't"``) are rejected, as is the empty string.

    Args:
        string: The token to test.

    Returns:
        The result of ``string.isalpha()``.
    """
    only_letters = string.isalpha()
    return only_letters

# Not used by this cell; kept defined in case another cell refers to it.
big = []

# Dump every alphabetic token from every crawled page, lower-cased, one per
# line, into ./yahoo_big.txt (duplicates are kept).
with open('./yahoo_big.txt', 'w') as fh:
    for f in list_of_html_files:
        # Close each page as soon as it has been read; the previous bare
        # open(f).read() left one file handle open per crawled page until
        # garbage collection.
        with open(f) as page:
            html = page.read()
        text = text_from_html(html).split()
        words = filter(is_likely_a_word, text)
        for word in words:
            fh.write('{}\n'.format(word.lower()))

	filename	URL
0	00090e25-3aeb-4e0e-abfa-00ea58a6a48d.html	https://www.yahoo.com/news/harry-s-truman-cour...
1	00193717-117b-45b6-84dc-2e88026d41ca.html	https://www.yahoo.com/news/clinton-has-the-map...
2	00272c51-112a-455e-81e4-45b033b05475.html	https://www.yahoo.com/news/definitive-ranking-...
3	003247be-6972-419b-8b05-8d53a013db55.html	https://www.yahoo.com/news/george-h-w-bush-sli...
4	003ccc65-6f50-4219-a98d-3515db0986df.html	https://www.yahoo.com/news/lebron-james-hillar...