In [1]:
from urllib.request import urlopen  # Library for urlopen
from bs4 import BeautifulSoup  # Library for html parser (scraper), lxml is also nice
import pandas as pd
import re
import sys
sys.path.append('..') 
from uni_cache.cache_function import cache_function
import pymysql
import collections
import mysql_credits
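
mysql_credits is a local settings module that only needs to define the four names used by the connection in the next cell; a placeholder version (the values here are made up, not real credentials):

In [ ]:
# mysql_credits.py -- placeholder values, replace with your own settings
db_host = 'localhost'
db_user = 'universities'
db_password = 'change-me'
db = 'universities_sentiment'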

In [2]:
# Edit this path to match the project location on your computer
project_folder = '/home/bogdan/PycharmProjects/universities_sentiment/'
cache_folder = project_folder + 'cache/'
site = 'https://www.unigo.com/colleges/harvard-university'


connection = pymysql.connect(
    host=mysql_credits.db_host,
    user=mysql_credits.db_user,
    password=mysql_credits.db_password,
    db=mysql_credits.db,
    charset='utf8mb4',
    cursorclass=pymysql.cursors.DictCursor
)
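
The cache_function helper imported above lives in uni_cache and is not shown in this notebook. A minimal sketch of what such a URL-keyed page cache might look like, assuming it stores one file per URL in cache_folder and falls back to urlopen on a miss; the hashing scheme and file layout are assumptions, not the real uni_cache implementation:

In [ ]:
import hashlib
import os

def cache_function_sketch(url, cache_dir=cache_folder):
    # Hypothetical cache layout: one HTML file per URL, keyed by an md5 of the URL
    cache_path = os.path.join(cache_dir, hashlib.md5(url.encode('utf-8')).hexdigest() + '.html')
    if os.path.exists(cache_path):
        print('Using web page from cache...')
        with open(cache_path, 'r', encoding='utf-8') as f:
            return f.read()
    # Not cached yet: download the page and store it for next time
    html = urlopen(url).read().decode('utf-8')
    os.makedirs(cache_dir, exist_ok=True)
    with open(cache_path, 'w', encoding='utf-8') as f:
        f.write(html)
    return html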

In [ ]:
def parsing_function(site_tree):
    # Convert the raw HTML into a BeautifulSoup object
    site_bs = BeautifulSoup(site_tree, "html.parser")
    # Find every review card on the page
    site_reviews = site_bs.find_all('div', attrs={"class": "minicard"})
    articles_data = []
    for site_review in site_reviews:
        # Collect the fields of one review into an ordered dict
        article_meta_data = collections.OrderedDict()
        # Store the review text, if the card has any
        if site_review.find("div", attrs={"class": "show-on-open"}):
            article_meta_data['review_text'] = site_review.find("div", attrs={"class": "show-on-open"}).find("p").get_text()
        else:
            article_meta_data['review_text'] = 'no_review_text'
        # Store the rating: the star count is taken from the last class on the star counter, e.g. "stars--lg--4"
        article_meta_data['review_stars'] = int(site_review.find('div', class_="starCount").attrs['class'][-1].replace('stars--lg--', ''))
        # Store the review publication date
        article_meta_data['review_date'] = site_review.find("meta", attrs={"itemprop": "datePublished"}).attrs['content']
        # Append the review dict to the result list
        articles_data.append(article_meta_data)
    return articles_data
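
A quick way to sanity-check the parser on one cached page; the keys printed are the ones parsing_function builds, and the slice of three reviews is only for illustration:

In [ ]:
reviews = parsing_function(cache_function(site))
print(len(reviews))
for review in reviews[:3]:
    # Date and star rating, followed by the first 80 characters of the review text
    print(review['review_date'], review['review_stars'])
    print(review['review_text'][:80])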

In [6]:
with open('UK.txt', 'r', encoding='windows-1251') as f:
    content = f.read().splitlines()
print(content[0])


https://www.unigo.com/colleges/harvard-university

In [9]:
main_site = cache_function(content[0])


Using web page from cache...

In [27]:
site_reviews = BeautifulSoup(main_site, "html.parser").find_all('div', attrs={"class":"minicard"})

In [42]:
dd_cln


Out[42]:
["\\'width:\\'width:\\'width:\\'width:\\'width:\\'width:\\'width:\\'width:\\'width:\\'width:"]

In [40]:
#soup = BeautifulSoup(html_doc,'html.parser')    
my_att = [i.find('div', class_="starCount").attrs['style'] for  i in site_reviews]
dd = ''.join(my_att).split(";")
dd_cln= filter(None, dd)
dd_cln= [i.strip() for i in dd_cln ]
my_dict = dict(i.split(':') for i  in dd_cln)
print (my_dict['width'])


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-40-ba7fe4b03e1c> in <module>()
      4 dd_cln= filter(None, dd)
      5 dd_cln= [i.strip() for i in dd_cln ]
----> 6 my_dict = dict(i.split(':') for i  in dd_cln)
      7 print (my_dict['width'])

ValueError: dictionary update sequence element #0 has length 11; 2 is required

In [56]:
site_reviews[0].find('div', attrs={'style':'width:100%'})
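
The exploration above suggests the rating is rendered as a bar whose inline width encodes the score (width:100% for a full bar). A small helper that pulls the percentage out of the starCount style; the 20%-per-star mapping is an assumption:

In [ ]:
def stars_from_style(review):
    # Extract the width percentage from the inline style, e.g. 'width:80%' -> 80
    style = review.find('div', class_="starCount").attrs.get('style', '')
    match = re.search(r'width:\s*(\d+)%', style)
    if not match:
        return None
    return int(match.group(1)) // 20  # assumed: 100% width corresponds to five stars

print([stars_from_style(r) for r in site_reviews[:5]])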

In [21]:
last_page_href = BeautifulSoup(main_site, "html.parser").find('li', attrs={"class": "PagedList-skipToLast"}).find('a').get('href')
int(last_page_href[last_page_href.rfind('/') + 1:])


Out[21]:
25
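
With the page count known, the paginated listing URLs that the crawl loop below will request can be previewed (first university only; ?pageno= is the same parameter the loop uses later):

In [ ]:
last_page_href = BeautifulSoup(main_site, "html.parser").find('li', attrs={"class": "PagedList-skipToLast"}).find('a').get('href')
max_page = int(last_page_href[last_page_href.rfind('/') + 1:])
page_urls = [content[0] + '?pageno=' + str(i) for i in range(1, max_page + 1)]
print(page_urls[:3])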

In [ ]:
re.sub(' +', ' ', site_title.find('em').get_text().replace("\n", ""))

In [26]:
title_text = BeautifulSoup(main_site, "html.parser").find('h1', attrs={"class": "hero-full__content--titleText titleText collegePage"}).get_text()
re.sub(' +', ' ', title_text.replace("\n", "").replace("\r", ""))


Out[26]:
' Cambridge, MA Harvard University '

In [ ]:
dict_uni={}

In [ ]:
n = 1
for uni in content:
    # Parse the university's main review page
    site = uni
    main_site = cache_function(site)
    site_bs = BeautifulSoup(main_site, "html.parser")
    # The number of review pages comes from the "skip to last page" link
    last_page_href = site_bs.find('li', attrs={"class": "PagedList-skipToLast"}).find('a').get('href')
    max_page = int(last_page_href[last_page_href.rfind('/') + 1:])
    # University name, with line breaks and repeated spaces removed
    name_uni = site_bs.find('h1', attrs={"class": "hero-full__content--titleText titleText collegePage"}).get_text()
    name_uni = re.sub(' +', ' ', name_uni.replace("\n", "").replace("\r", ""))
    dict_uni[name_uni] = n
    print(max_page)
    for i in range(1, max_page + 1):
        # Walk through every page of the paginated review listing
        site_news_section = '?pageno=' + str(i)
        site_tree = cache_function(site + site_news_section)
        articles = parsing_function(site_tree)
        print(len(articles))
        for article in articles:
            # parsing_function has already extracted each review's text, rating and date
            article_content = article.get('review_text', 'no_review_text')
            review_stars = article.get('review_stars', 'no_review_stars')
            print(review_stars)

            try:
                with connection.cursor() as cursor:
                    # Insert one review per row
                    sql = '''
                    INSERT INTO `article`
                    (`article_pub_date`, `article_title`, `article_text`,
                    `article_url`, `article_rating`, `article_uni`, `uni_site_id`)
                    VALUES (%s, %s, %s, %s, %s, %s, %s);
                    '''
                    cursor.execute(sql, (
                        str(pd.to_datetime(article['review_date'])),
                        name_uni,                    # reviews carry no separate title, so the university name is reused
                        article_content,
                        site + site_news_section,    # listing page the review was scraped from
                        review_stars,
                        name_uni,
                        0
                    ))
                # The connection is not autocommit by default, so commit explicitly
                connection.commit()
            finally:
                print('finally')

    n += 1
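
The article table itself is never created in this notebook; a hypothetical DDL consistent with the INSERT above (the column names match the query, the types are assumptions):

In [ ]:
create_sql = '''
CREATE TABLE IF NOT EXISTS `article` (
    `article_id` INT AUTO_INCREMENT PRIMARY KEY,
    `article_pub_date` DATETIME,
    `article_title` TEXT,
    `article_text` TEXT,
    `article_url` VARCHAR(512),
    `article_rating` VARCHAR(32),
    `article_uni` VARCHAR(255),
    `uni_site_id` INT
) CHARACTER SET utf8mb4;
'''
with connection.cursor() as cursor:
    cursor.execute(create_sql)
connection.commit()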