In [1]:
from urllib.request import urlopen  # HTTP requests
from bs4 import BeautifulSoup  # HTML parser; lxml is another good option
import re
import sys
sys.path.append('..')  # make the uni_cache package importable from the parent directory
from uni_cache.cache_function import cache_function
import pymysql
import collections
import pandas as pd  # pd.to_datetime is used when inserting articles below
import mysql_credits

In [2]:
# Edit this path to match the project location on your computer
project_folder = '/home/bogdan/PycharmProjects/universities_sentiment/'
cache_folder = project_folder + 'cache/'
site = 'https://www.timeshighereducation.com'


connection = pymysql.connect(
    host=mysql_credits.db_host,
    user=mysql_credits.db_user,
    password=mysql_credits.db_password,
    db=mysql_credits.db,
    charset='utf8mb4',
    cursorclass=pymysql.cursors.DictCursor
)
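
The `mysql_credits` module only needs to expose the four settings read above. A minimal sketch of such a module, with placeholder values rather than real credentials:

# mysql_credits.py (placeholder values, not the real credentials)
db_host = 'localhost'
db_user = 'universities_user'
db_password = 'secret'
db = 'universities_sentiment'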

In [3]:
def stemming_article(article):
    from nltk.stem import SnowballStemmer
    # Characters stripped from each line before stemming
    for_removing = "№#©@&%\\\/=+/~^*,.;:\"'`“”‘’–-—_{}[]()1234567890!@#?$"
    stemmer = SnowballStemmer("russian")
    with open(article, "r", encoding="utf8", errors='replace') as lines:
        for line in lines:
            line = line.replace("\n", "").translate({ord(c): None for c in for_removing})
            print(stemmer.stem(line))
#stemming_article("minus_words.csv")
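
As a quick check of what SnowballStemmer does (assuming NLTK is installed; the sample word is arbitrary):

from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("russian")
print(stemmer.stem("университеты"))  # expected output: "университет"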

In [54]:
def parsing_function(site_tree):
    # Convert the raw HTML into a BeautifulSoup object
    site_bs = BeautifulSoup(site_tree, "html.parser")
    # Find every listing block on the index page that links to an article
    site_titles = site_bs.find_all('div', attrs={"class": "views-row"})
    articles_data = []
    for site_title in site_titles:
        if site_title.find('div', attrs={"class": "submitted"}):
            # An ordered dict keeps the fields in insertion order
            article_meta_data = collections.OrderedDict()
            # Article URL
            article_meta_data['article_url'] = site + site_title.find('a', attrs={"data-position": "list"}).get('href')
            # Article title, with newlines removed and repeated spaces collapsed
            article_meta_data['article_title'] = re.sub(' +', ' ', site_title.get_text().replace("\\n", "").replace("\n", ""))
            # Publication date, cleaned the same way
            article_meta_data['article_date'] = re.sub(' +', ' ', site_title.find('div', attrs={"class": "submitted"}).get_text().replace("\\n", "").replace("\n", ""))
            # Append the record to the result list
            articles_data.append(article_meta_data)
    return articles_data
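
Note that find() returns None when nothing matches, so a listing row without a "data-position" link would raise an AttributeError above. A defensive variant (a sketch; the guard is my addition) would skip such rows:

link = site_title.find('a', attrs={"data-position": "list"})
if link is None:
    continue  # skip listing rows that carry no article link
article_meta_data['article_url'] = site + link.get('href')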

In [56]:
for i in range(1, 2):  # raise the upper bound to crawl additional listing pages
    if i == 1:
        site_news_section = '/student/news'
    else:
        site_news_section = '/student/news?page=' + str(i)
    site_tree = cache_function(site + site_news_section)
    articles = parsing_function(site_tree)
    print(len(articles))
    # Parse every article collected from the listing page
    for article in articles:
        site_tree = cache_function(article['article_url'])
        page_article_bs = BeautifulSoup(site_tree, "html.parser")
        page_article = page_article_bs.find_all('div', class_='field-item even')
        article_content = ''
        for wrapper in page_article:
            if wrapper.find("p"):
                article_content = article_content + wrapper.find("p").get_text()
        try:
            with connection.cursor() as cursor:
                # Insert a new record; %s placeholders let the driver escape the values
                sql = '''
                INSERT INTO `article` (`article_pub_date`, `article_title`, `article_text`, `article_url`, `article_rating`, `article_uni`, `uni_site_id`)
                VALUES (%s, %s, %s, %s, %s, %s, %s);
                '''
                cursor.execute(sql, (str(pd.to_datetime(article['article_date'])), article['article_title'], article_content, article['article_url'], 0, 0, 0))
            # The connection does not autocommit by default, so commit to save the changes
            connection.commit()
        finally:
            print('finally')


Using web page from internet...
8
Using web page from internet...
finally
Using web page from internet...
finally
Using web page from internet...
finally
Using web page from internet...
finally
Using web page from internet...
finally
Using web page from internet...
finally
Using web page from internet...
finally
Using web page from internet...
finally
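
The insert above relies on pd.to_datetime being able to parse the human-readable date scraped from the listing. A quick check (the sample date is arbitrary):

import pandas as pd
print(pd.to_datetime('February 22, 2018'))  # 2018-02-22 00:00:00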

In [6]:
from pathlib import Path
import json
from time import gmtime, strftime
import hashlib
from urllib.request import Request, urlopen
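
These imports hint at the shape of cache_function: hash the URL, serve the page from the cache folder when it is already there, otherwise fetch and store it. A sketch of such a helper, assuming that design (the real implementation lives in uni_cache/cache_function.py):

def cache_function(url):
    # Cache files are keyed by a hash of the URL (assumed naming scheme)
    cache_file = Path(cache_folder) / (hashlib.md5(url.encode('utf8')).hexdigest() + '.html')
    if cache_file.exists():
        return cache_file.read_text(encoding='utf8')
    print('Using web page from internet...')  # matches the log lines in the cell outputs
    request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    page = urlopen(request).read().decode('utf8', errors='replace')
    cache_file.write_text(page, encoding='utf8')
    return page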

In [57]:
for i in range(1, 2):  # raise the upper bound to crawl additional listing pages
    if i == 1:
        site_news_section = '/student/blogs'
    else:
        site_news_section = '/student/blogs?page=' + str(i)
    site_tree = cache_function(site + site_news_section)
    articles = parsing_function(site_tree)
    print(len(articles))
    # Parse every article collected from the listing page
    for article in articles:
        site_tree = cache_function(article['article_url'])
        page_article_bs = BeautifulSoup(site_tree, "html.parser")
        page_article = page_article_bs.find_all('div', class_='field-item even')
        article_content = ''
        for wrapper in page_article:
            if wrapper.find("p"):
                article_content = article_content + wrapper.find("p").get_text()
        try:
            with connection.cursor() as cursor:
                # Insert a new record; %s placeholders let the driver escape the values
                sql = '''
                INSERT INTO `article` (`article_pub_date`, `article_title`, `article_text`, `article_url`, `article_rating`, `article_uni`, `uni_site_id`)
                VALUES (%s, %s, %s, %s, %s, %s, %s);
                '''
                cursor.execute(sql, (str(pd.to_datetime(article['article_date'])), article['article_title'], article_content, article['article_url'], 0, 0, 0))
            # The connection does not autocommit by default, so commit to save the changes
            connection.commit()
        finally:
            print('finally')


Using web page from internet...
8
Using web page from internet...
finally
Using web page from internet...
finally
Using web page from internet...
finally
Using web page from internet...
finally
Using web page from internet...
finally
Using web page from internet...
finally
Using web page from internet...
finally
Using web page from internet...
finally
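
The news and blogs cells above are identical except for the section path. A possible refactor (a sketch, not run against the live site) that factors the shared logic into one helper:

SQL_INSERT = '''
INSERT INTO `article` (`article_pub_date`, `article_title`, `article_text`, `article_url`, `article_rating`, `article_uni`, `uni_site_id`)
VALUES (%s, %s, %s, %s, %s, %s, %s);
'''

def scrape_section(section_path, pages=1):
    # section_path is '/student/news' or '/student/blogs'
    for i in range(1, pages + 1):
        suffix = '' if i == 1 else '?page=' + str(i)
        listing_tree = cache_function(site + section_path + suffix)
        for article in parsing_function(listing_tree):
            page_bs = BeautifulSoup(cache_function(article['article_url']), "html.parser")
            # Concatenate the first paragraph of every matching wrapper, as above
            content = ''.join(w.find("p").get_text()
                              for w in page_bs.find_all('div', class_='field-item even')
                              if w.find("p"))
            with connection.cursor() as cursor:
                cursor.execute(SQL_INSERT, (str(pd.to_datetime(article['article_date'])),
                                            article['article_title'], content,
                                            article['article_url'], 0, 0, 0))
            connection.commit()

# scrape_section('/student/news')
# scrape_section('/student/blogs')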
