In [22]:
import requests
import os, shutil, re
from bs4 import BeautifulSoup

# establishing a session so connections are reused across requests
s = requests.Session()

def load_page(url, session):  # load the page and return it as a parsed tree
    r = session.get(url)
    # trust the declared charset only if the server actually sent one
    encoding = r.encoding if 'charset' in r.headers.get('content-type', '').lower() else None
    soup = BeautifulSoup(r.content, 'lxml', from_encoding=encoding)
    return soup
def clear_dir(path):
    # create the directory if it does not exist, then remove any files already in it
    if not os.path.exists(path):
        os.makedirs(path)
    for the_file in os.listdir(path):
        file_path = os.path.join(path, the_file)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
        except Exception as e:
            print(e)
clear_dir('./cyberleninka_all/')

# patterns for filtering out titles that are not plain Russian text
english = r'[A-Za-z]+$'            # titles ending in Latin characters
bad_chars = r'\{[^}]+|\$+|\\+|/+'  # TeX markup, braces, slashes
garbage = r'\.(jpg|png|bmp|pdf)'   # file-name leftovers

page = 1
k = 0
while k < 10:
    url = 'http://cyberleninka.ru/article/c/matematika/%d' % page
    soup = load_page(url, s)
    article_list = soup.find_all('div', {'class': 'title'})  # all article titles on the page
    if not article_list:  # no more results, stop paging
        break
    for a in article_list:
        if re.search(english, a.text) or re.search(bad_chars, a.text) or re.search(garbage, a.text):
            continue
        with open('./cyberleninka_all/page_%d.html' % page, 'a', encoding='utf-8') as output_file:
            output_file.write(a.text + '\n')
    page += 1
    k += 1
In [20]:
with open('./cyberleninka_all.txt', 'w', encoding='utf-8') as outfile:  # merge all page files into one corpus
    for fname in os.listdir('./cyberleninka_all/'):
        file_path = os.path.join('./cyberleninka_all/', fname)
        with open(file_path, encoding='utf-8') as infile:
            outfile.write(infile.read())