In [1]:
from urllib.request import urlopen          # HTTP requests
from bs4 import BeautifulSoup               # HTML parser (lxml is a good alternative)
import re
import sys
sys.path.append('..')
from uni_cache.cache_function import cache_function
import pymysql
import pandas as pd                          # used below for pd.to_datetime
import collections
import mysql_credits
In [2]:
# Edit this path to match the project location on your computer
project_folder = '/home/bogdan/PycharmProjects/universities_sentiment/'
cache_folder = project_folder + 'cache/'
site = 'https://www.timeshighereducation.com'
connection = pymysql.connect(
    host=mysql_credits.db_host,
    user=mysql_credits.db_user,
    password=mysql_credits.db_password,
    db=mysql_credits.db,
    charset='utf8mb4',
    cursorclass=pymysql.cursors.DictCursor
)
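Before any scraping starts, the database connection can be verified with a quick query. This is a minimal sketch, not part of the original notebook, and assumes the mysql_credits module points at a reachable server.
In [ ]:
# Hypothetical sanity check: confirm the database is reachable
with connection.cursor() as cursor:
    cursor.execute("SELECT VERSION()")
    print(cursor.fetchone())   # with DictCursor this prints something like {'VERSION()': '...'}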
In [3]:
def stemming_article(article):
    from nltk.stem import SnowballStemmer
    # Characters intended to be stripped from the text (defined here but not applied below)
    for_removing = "№#©@&%\\\/=+/~^*,.;:\"'`“”‘’–-—_{}[]()1234567890!@#?$"
    with open(article, "r", encoding="utf8", errors='replace') as f:
        lines = f.readlines()
    stemmer = SnowballStemmer("russian")
    for line in lines:
        line = line.replace("\n", "")
        print(stemmer.stem(line))
#stemming_article("minus_words.csv")
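The for_removing set above is never applied inside the function. The sketch below shows how it could be used to clean a line before stemming; this is an assumption about the intended behaviour, not the notebook's original code.
In [ ]:
# Hypothetical cleanup step, assuming for_removing was meant to strip punctuation and digits
for_removing = "№#©@&%\\/=+/~^*,.;:\"'`“”‘’–-—_{}[]()1234567890!@#?$"
cleanup = str.maketrans('', '', for_removing)
print("Новости — 2024!".translate(cleanup))   # the dash, digits and '!' are stripped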
In [54]:
def parsing_function(site_tree):
    # Wrap the cached HTML in a BeautifulSoup object
    site_bs = BeautifulSoup(site_tree, "html.parser")
    # Find every article teaser on the listing page
    site_titles = site_bs.find_all('div', attrs={"class": "views-row"})
    articles_data = []
    for site_title in site_titles:
        if site_title.find('div', attrs={"class": "submitted"}):
            # Collect the article metadata in an ordered dict
            article_meta_data = collections.OrderedDict()
            # Article URL
            article_meta_data['article_url'] = site + site_title.find('a', attrs={"data-position": "list"}).get('href')
            # Article title
            article_meta_data['article_title'] = re.sub(' +', ' ', site_title.get_text().replace("\\n", "").replace("\n", ""))
            # Publication date
            article_meta_data['article_date'] = re.sub(' +', ' ', site_title.find('div', attrs={"class": "submitted"}).get_text().replace("\\n", "").replace("\n", ""))
            # Append the dict to the list of parsed articles
            articles_data.append(article_meta_data)
    return articles_data
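Each element returned by parsing_function is an OrderedDict with three keys. The snippet below is illustrative only and assumes the news listing page is already reachable through cache_function.
In [ ]:
# Peek at the structure returned by parsing_function
sample = parsing_function(cache_function(site + '/student/news'))
if sample:
    print(list(sample[0].keys()))   # ['article_url', 'article_title', 'article_date']
    print(sample[0]['article_url'])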
In [56]:
for i in range(1, 2):
    if i == 1:
        site_news_section = '/student/news'
    else:
        site_news_section = '/student/news?page=' + str(i)
    site_tree = cache_function(site + site_news_section)
    articles = parsing_function(site_tree)
    print(len(articles))
    # Parse every article collected from the listing page
    for article in articles:
        site_tree = cache_function(article['article_url'])
        page_article_bs = BeautifulSoup(site_tree, "html.parser")
        page_article = page_article_bs.find_all('div', class_='field-item even')
        article_content = ''
        for wrapper in page_article:
            if wrapper.find("p"):
                article_content = article_content + wrapper.find("p").get_text()
        try:
            with connection.cursor() as cursor:
                # Insert the article as a new record (parameterised query)
                sql = '''
                    INSERT INTO `article` (`article_pub_date`, `article_title`, `article_text`, `article_url`, `article_rating`, `article_uni`, `uni_site_id`)
                    VALUES (%s, %s, %s, %s, %s, %s, %s);
                '''
                cursor.execute(sql, (str(pd.to_datetime(article['article_date'])), article['article_title'], article_content, article['article_url'], 0, 0, 0))
            # The connection is not autocommit by default, so commit to save the changes
            connection.commit()
        finally:
            print('processed', article['article_url'])
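To confirm the inserts landed, the row count can be checked directly. This is a hypothetical check, assuming the article table already exists in the configured database.
In [ ]:
# Illustrative only: count the rows stored so far
with connection.cursor() as cursor:
    cursor.execute("SELECT COUNT(*) AS n FROM `article`")
    print(cursor.fetchone()['n'])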
In [6]:
from pathlib import Path
import json
from time import gmtime, strftime
import hashlib
from urllib.request import Request, urlopen
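The imports in this cell (hashlib, Path, Request) mirror what a disk cache for fetched pages needs. The real cache_function lives in uni_cache/cache_function.py and is not shown here; the sketch below is only an assumption about how such a helper might work: hash the URL into a file name under cache_folder and fetch the page only on a cache miss.
In [ ]:
# Hypothetical sketch of a cache_function-style helper (the actual implementation is in uni_cache)
def cached_fetch(url, cache_dir=cache_folder):
    key = hashlib.md5(url.encode('utf8')).hexdigest()
    cache_file = Path(cache_dir) / (key + '.html')
    if cache_file.exists():
        return cache_file.read_text(encoding='utf8')
    # Some sites reject the default urllib User-Agent, so send a browser-like one
    request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    html = urlopen(request).read().decode('utf8', errors='replace')
    cache_file.parent.mkdir(parents=True, exist_ok=True)
    cache_file.write_text(html, encoding='utf8')
    return html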
In [57]:
for i in range(1, 2):
    if i == 1:
        site_news_section = '/student/blogs'
    else:
        site_news_section = '/student/blogs?page=' + str(i)
    site_tree = cache_function(site + site_news_section)
    articles = parsing_function(site_tree)
    print(len(articles))
    # Parse every blog post collected from the listing page
    for article in articles:
        site_tree = cache_function(article['article_url'])
        page_article_bs = BeautifulSoup(site_tree, "html.parser")
        page_article = page_article_bs.find_all('div', class_='field-item even')
        article_content = ''
        for wrapper in page_article:
            if wrapper.find("p"):
                article_content = article_content + wrapper.find("p").get_text()
        try:
            with connection.cursor() as cursor:
                # Insert the post as a new record (parameterised query)
                sql = '''
                    INSERT INTO `article` (`article_pub_date`, `article_title`, `article_text`, `article_url`, `article_rating`, `article_uni`, `uni_site_id`)
                    VALUES (%s, %s, %s, %s, %s, %s, %s);
                '''
                cursor.execute(sql, (str(pd.to_datetime(article['article_date'])), article['article_title'], article_content, article['article_url'], 0, 0, 0))
            # The connection is not autocommit by default, so commit to save the changes
            connection.commit()
        finally:
            print('processed', article['article_url'])
In [ ]: