In [22]:
import requests
import os, shutil, re
from bs4 import BeautifulSoup

# establishing a session so connections are reused across requests
s = requests.Session()

def load_page(url, session):  # load the page and return it as a parsed tree
    r = session.get(url)
    # trust the declared charset only if the server actually sent one
    encoding = r.encoding if 'charset' in r.headers.get('content-type', '').lower() else None
    soup = BeautifulSoup(r.content, 'lxml', from_encoding=encoding)
    return soup
def clear_dir(path):
    # create the directory if it does not exist, then remove any files already in it
    if not os.path.exists(path):
        os.makedirs(path)
    for the_file in os.listdir(path):
        file_path = os.path.join(path, the_file)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
        except Exception as e:
            print(e)
clear_dir('./cyberleninka_all/')

# patterns for filtering out titles that are not plain Russian text
english = r'[A-Za-z]+$'            # titles ending in Latin characters
bad_chars = r'\{[^}]+|\$+|\\+|/+'  # TeX markup, braces, slashes
garbage = r'\.(jpg|png|bmp|pdf)'   # file-name leftovers

page = 1
k = 0
while k < 10:
    url = 'http://cyberleninka.ru/article/c/matematika/%d' % page
    soup = load_page(url, s)
    article_list = soup.find_all('div', {'class': 'title'})  # all article titles on the page
    if not article_list:  # no more results, stop paging
        break
    for a in article_list:
        if re.search(english, a.text) or re.search(bad_chars, a.text) or re.search(garbage, a.text):
            continue
        with open('./cyberleninka_all/page_%d.html' % page, 'a', encoding='utf-8') as output_file:
            output_file.write(a.text + '\n')
    page += 1
    k += 1
In [20]:
with open('./cyberleninka_all.txt', 'w', encoding='utf-8') as outfile:  # merge all page files into one corpus
    for fname in os.listdir('./cyberleninka_all/'):
        file_path = os.path.join('./cyberleninka_all/', fname)
        with open(file_path, encoding='utf-8') as infile:
            outfile.write(infile.read())