In [1]:
from urllib.request import urlopen # for fetching pages over HTTP
from bs4 import BeautifulSoup # HTML parser; lxml is also a nice backend
import pandas as pd
import re
import sys
sys.path.append('..')
from uni_cache.cache_function import cache_function
import pymysql
import collections
import mysql_credits
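mysql_credits is a small local module that keeps the database settings out of the notebook; its actual contents are not shown here. An assumed layout (all values are placeholders):
# mysql_credits.py -- assumed layout, every value below is a placeholder
db_host = 'localhost'
db_user = 'uni_user'
db_password = 'secret'
db = 'universities_sentiment'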
In [2]:
# This path should be edited to match the project location on your computer
project_folder = '/home/bogdan/PycharmProjects/universities_sentiment/'
cache_folder = project_folder + 'cache/'
site = 'https://www.unigo.com/colleges/harvard-university'
connection = pymysql.connect(
    host=mysql_credits.db_host,
    user=mysql_credits.db_user,
    password=mysql_credits.db_password,
    db=mysql_credits.db,
    charset='utf8mb4',
    cursorclass=pymysql.cursors.DictCursor
)
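The `article` table that the crawler writes to is assumed to already exist. A minimal schema that matches the INSERT used at the end of the notebook (column types are guesses, not the project's actual DDL) could be created like this:
In [ ]:
with connection.cursor() as cursor:
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS `article` (
            `article_id`       INT AUTO_INCREMENT PRIMARY KEY,
            `article_pub_date` DATETIME,
            `article_title`    TEXT,
            `article_text`     MEDIUMTEXT,
            `article_url`      TEXT,
            `article_rating`   VARCHAR(32),
            `article_uni`      VARCHAR(255),
            `uni_site_id`      INT
        ) CHARACTER SET utf8mb4
    ''')
connection.commit()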
In [ ]:
def parsing_function(site_tree):
    # Convert the fetched page into a BeautifulSoup object
    site_bs = BeautifulSoup(site_tree, "html.parser")
    # Find every review card on the page
    site_reviews = site_bs.find_all('div', attrs={"class": "minicard"})
    articles_data = []
    for site_review in site_reviews:
        # Build something like an associative array for one review
        article_meta_data = collections.OrderedDict()
        # Store the review text
        if site_review.find("div", attrs={"class": "show-on-open"}):
            article_meta_data['review_text'] = site_review.find("div", attrs={"class": "show-on-open"}).find("p").get_text()
        else:
            article_meta_data['review_text'] = 'no_review_text'
        # Store the review rating (second class of the starCount div, e.g. 'stars--lg--4')
        article_meta_data['review_stars'] = int(site_review.find('div', class_="starCount").attrs['class'][1].replace('stars--lg--', ''))
        # Store the review date
        article_meta_data['review_date'] = site_review.find("meta", attrs={"itemprop": "datePublished"}).attrs['content']
        # Append this dict to the plain list
        articles_data.append(article_meta_data)
    return articles_data
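A quick check of parsing_function on the cached Harvard page (this assumes cache_function returns the raw HTML as a string):
In [ ]:
reviews = parsing_function(cache_function(site))
print(len(reviews))
print(reviews[0]['review_date'], reviews[0]['review_stars'])
print(reviews[0]['review_text'][:100])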
In [6]:
with open('UK.txt', 'r', encoding='windows-1251') as f:
    content = f.read().splitlines()
print(content[0])
In [9]:
main_site=cache_function(content[0])
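cache_function lives in the local uni_cache package and its implementation is not part of this notebook. A minimal sketch of the idea, assuming it keys cache files by an MD5 of the URL and stores the raw HTML under cache_folder (the name cache_function_sketch is hypothetical):
In [ ]:
import hashlib
import os

def cache_function_sketch(url, cache_dir=cache_folder):
    # Hypothetical stand-in for uni_cache.cache_function:
    # fetch the page once, then serve it from the on-disk cache on later calls
    os.makedirs(cache_dir, exist_ok=True)
    cache_file = os.path.join(cache_dir, hashlib.md5(url.encode('utf-8')).hexdigest() + '.html')
    if os.path.exists(cache_file):
        with open(cache_file, 'r', encoding='utf-8') as f:
            return f.read()
    html = urlopen(url).read().decode('utf-8')
    with open(cache_file, 'w', encoding='utf-8') as f:
        f.write(html)
    return html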
In [27]:
site_reviews = BeautifulSoup(main_site, "html.parser").find_all('div', attrs={"class":"minicard"})
In [42]:
dd_cln
Out[42]:
In [40]:
#soup = BeautifulSoup(html_doc,'html.parser')
# Collect the inline style of each starCount div and parse it into a dict like {'width': '100%'}
my_att = [i.find('div', class_="starCount").attrs['style'] for i in site_reviews]
dd = ''.join(my_att).split(";")
dd_cln = [i.strip() for i in filter(None, dd)]
my_dict = dict(i.split(':') for i in dd_cln)
print(my_dict['width'])
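The width taken from the starCount style can also be mapped back to a star value, assuming a full-width bar (100%) corresponds to five stars:
In [ ]:
width_pct = int(my_dict['width'].rstrip('%'))
print(round(width_pct / 100 * 5))  # assumption: 100% width == 5 stars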
In [56]:
site_reviews[0].find('div', attrs={'style':'width:100%'})
In [21]:
last_page_href = BeautifulSoup(main_site, "html.parser").find('li', attrs={"class": "PagedList-skipToLast"}).find('a').get('href')
int(last_page_href[last_page_href.rfind('/') + 1:])
Out[21]:
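The same last-page lookup reappears in the crawl loop below; a small helper (hypothetical name get_max_page) would keep it in one place:
In [ ]:
def get_max_page(html):
    # Number of the last review page, taken from the PagedList-skipToLast link, e.g. '.../reviews/12' -> 12
    last_link = BeautifulSoup(html, "html.parser").find('li', attrs={"class": "PagedList-skipToLast"}).find('a').get('href')
    return int(last_link[last_link.rfind('/') + 1:])

get_max_page(main_site)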
In [ ]:
re.sub(' +', ' ', site_title.find('em').get_text().replace("\\n", ""))
In [26]:
re.sub(' +',' ', BeautifulSoup(main_site, "html.parser").find('h1', attrs={"class":"hero-full__content--titleText titleText collegePage"}).get_text().replace("\\n", "").replace("\\r", ""))
Out[26]:
In [ ]:
dict_uni={}
In [ ]:
n = 1
for uni in content:
    # Fetch the page for this university and parse it with BeautifulSoup
    site = uni
    main_site = cache_function(site)
    last_page_href = BeautifulSoup(main_site, "html.parser").find('li', attrs={"class": "PagedList-skipToLast"}).find('a').get('href')
    max_page = int(last_page_href[last_page_href.rfind('/') + 1:])
    name_uni = BeautifulSoup(main_site, "html.parser").find('h1', attrs={"class": "hero-full__content--titleText titleText collegePage"}).get_text()
    dict_uni[name_uni] = n
    print(max_page)
    for i in range(1, max_page + 1):
        site_news_section = '?pageno=' + str(i)
        site_tree = cache_function(site + site_news_section)
        articles = parsing_function(site_tree)
        dict_uni[name_uni] = n
        print(len(articles))
        for article in articles:
            site_tree = cache_function(article['article_url'])
            print()
            page_article_bs = BeautifulSoup(site_tree, "html.parser")
            #page_article = page_article_bs.find_all('div', class_='field-item even')
            if page_article_bs.find("div", attrs={"id": "wide"}):
                article_content = page_article_bs.find("div", attrs={"id": "wide"}).get_text()
                #print(article_content)
            else:
                article_content = 'no_review_text'
            # Store the review rating
            if page_article_bs.find("div", attrs={"id": "wide"}):
                review_stars = dict_stars[page_article_bs.find("div", attrs={"id": "wide"}).attrs['class'][-1]]  #page_article_bs.find_all('div', attrs={"class":"snapshot"})[-1].find('span').get_text()
                print(review_stars)
            else:
                review_stars = 'no_review_stars'
            try:
                with connection.cursor() as cursor:
                    # Create a new record
                    #INSERT INTO `article` (`article_title`, `article_text`, `article_url`, `article_categories`) VALUES (%s, %s, %s, 'null');
                    sql = '''
                        INSERT INTO `article`
                            (`article_pub_date`, `article_title`, `article_text`,
                             `article_url`, `article_rating`, `article_uni`, `uni_site_id`)
                        VALUES (%s, %s, %s, %s, %s, %s, %s);
                    '''
                    #sql="INSERT INTO `article` (`article_pub_date`, `article_title`, `article_text`, `article_url`, `article_rating`, `article_uni`, `uni_site_id`) VALUES ('"+article['article_date']+"','"+article['article_title']+ "', '"+article_content+"', '"+article['article_url']+"', 'null', 'null',null');"
                    cursor.execute(sql, (str(pd.to_datetime(article['article_date'])), article['article_title'], article_content, article['article_url'], review_stars, name_uni, 0))
                    # connection is not autocommit by default, so commit to save the changes
                    connection.commit()
            finally:
                print('finally')
    n += 1
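Once the crawl finishes, the database connection can be closed explicitly:
In [ ]:
connection.close()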