In [7]:
import pandas as pd
from bs4 import BeautifulSoup
from bs4 import Comment


import re
import glob
import ntpath
import networkx as nx
import os
from bs4.element import Comment
def path_leaf(path):
    """Return the last component of ``path``.

    The path is split with ``ntpath``, so both ``/`` and ``\\`` are treated as
    separators. When the path ends with a separator (so the final component
    is empty), the base name of the remaining head is returned instead.
    """
    parent, leaf = ntpath.split(path)
    if leaf:
        return leaf
    # Trailing separator: the component of interest is the end of the head.
    return ntpath.basename(parent)

In [2]:
# Path of the CSV file that is loaded into `mapping_file_df` with pandas.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
csv_file = '/media/rna/yahoo_crawl_data/Yahoo-20190406T235503Z-001/Yahoo/URLtoHTML_yahoo_news.csv'

In [3]:
# Load the mapping CSV, order the rows by `filename` and then `URL`, and
# renumber the index from 0 so it follows the sorted order.
mapping_file_df = (
    pd.read_csv(csv_file)
    .sort_values(by=['filename', 'URL'])
    .reset_index(drop=True)
)
#mapping_file_df.head()


Out[3]:
filename URL
0 00090e25-3aeb-4e0e-abfa-00ea58a6a48d.html https://www.yahoo.com/news/harry-s-truman-cour...
1 00193717-117b-45b6-84dc-2e88026d41ca.html https://www.yahoo.com/news/clinton-has-the-map...
2 00272c51-112a-455e-81e4-45b033b05475.html https://www.yahoo.com/news/definitive-ranking-...
3 003247be-6972-419b-8b05-8d53a013db55.html https://www.yahoo.com/news/george-h-w-bush-sli...
4 003ccc65-6f50-4219-a98d-3515db0986df.html https://www.yahoo.com/news/lebron-james-hillar...

In [4]:
# Directory that is searched for the crawled `*.html` files.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
crawl_data_dir = '/media/rna/yahoo_crawl_data/Yahoo-20190406T235503Z-001/Yahoo/yahoo/'
# glob.glob returns its matches in arbitrary (filesystem-dependent) order.
# Sorting makes every run visit the files -- and therefore produce any
# derived output -- in the same order.
list_of_html_files = sorted(glob.glob('{}/*.html'.format(crawl_data_dir)))

In [21]:
# Credits: https://stackoverflow.com/a/1983219/756986
def tag_visible(element):
    """Tell whether a text node should count as visible page text.

    Returns ``False`` when the node's parent is one of the non-rendered
    containers listed below, or when the node itself is a ``Comment``;
    returns ``True`` otherwise.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    if element.parent.name in hidden_parents:
        return False
    # Anything left is visible unless it is an HTML comment.
    return not isinstance(element, Comment)


def text_from_html(body):
    """Return the visible text of an HTML document as a single string.

    ``body`` is parsed with BeautifulSoup's built-in ``html.parser``. Every
    text node accepted by ``tag_visible`` is stripped of surrounding
    whitespace and the pieces are joined with single spaces. Whitespace-only
    nodes contribute empty pieces, so the result may contain runs of spaces.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # `find_all(string=True)` is the current spelling of the deprecated
    # `findAll(text=True)`; it selects every text node in the document.
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip() for t in visible_texts)

def is_likely_a_word(string):
    """Return ``True`` when ``string`` is non-empty and made up only of letters.

    This is exactly the result of ``string.isalpha()``: digits, punctuation,
    whitespace or an empty string all yield ``False``.
    """
    only_letters = string.isalpha()
    return only_letters

# NOTE: `big` is never filled in by this cell; the name is kept only in case
# other cells still reference it.
big = []
# Write every purely alphabetic token of every crawled page to
# ./yahoo_big.txt, lower-cased, one token per line.
# NOTE(review): both files are opened with the platform default encoding --
# consider passing an explicit encoding if this must run on other machines.
with open('./yahoo_big.txt', 'w') as fh:
    for f in list_of_html_files:
        # Read through a context manager so each page's file handle is closed
        # right away instead of staying open until garbage collection.
        with open(f) as html_fh:
            html = html_fh.read()
        text = text_from_html(html).split()
        words = filter(is_likely_a_word, text)
        for line in words:
            fh.write('{}\n'.format(line.lower()))