In [1]:
from urllib.request import urlopen # Library for urlopen
from bs4 import BeautifulSoup # HTML parser; lxml is also a good alternative
import pandas as pd
import numpy as np
import re
import sys
sys.path.append('..')
from uni_cache.cache_function import cache_function
import pymysql
import collections
import mysql_credits
In [2]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from datetime import datetime, timedelta
from pyvirtualdisplay import Display
from selenium.webdriver.chrome.options import Options
In [3]:
import time
from selenium.webdriver.support.ui import Select
In [4]:
# Edit this path to match the project location on your computer
project_folder = '/home/bogdan/PycharmProjects/universities_sentiment/'
cache_folder = project_folder + 'cache/'
#site = 'https://www.studentsreview.com/CA/CIT_comments.html'
connection = pymysql.connect(
host=mysql_credits.db_host,
user=mysql_credits.db_user,
password=mysql_credits.db_password,
db=mysql_credits.db,
charset='utf8mb4',
cursorclass=pymysql.cursors.DictCursor
)
cursor = connection.cursor()
sql = '''SELECT article_categories FROM article WHERE article_rating > 0'''
cursor.execute(sql)
raw = cursor.fetchall()
df = pd.DataFrame(raw, columns=[i[0] for i in cursor.description])
display = Display(visible=0, size=(800, 600))
display.start()
chrome_options = Options()
chrome_options.add_argument("--no-startup-window")
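A quick sanity check on what came back from MySQL (a minimal sketch; `article_categories` is the only column selected above):
In [ ]:
print(df.shape)
df['article_categories'].value_counts().head()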
In [5]:
browser = webdriver.Chrome(executable_path="..\\chromedriver.exe")
url = "https://www.niche.com/colleges/stanford-university/reviews/"
browser.get(url)
site_bs = BeautifulSoup(browser.page_source, "html.parser")
browser.find_element_by_css_selector('.icon-arrowright-thin--pagination').click()
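The bare `.click()` above can be flaky on slow page loads. Since `WebDriverWait`, `expected_conditions`, and `By` are already imported, an explicit wait is one alternative (a sketch, not what the notebook actually runs):
In [ ]:
# Wait up to 10 seconds for the pagination arrow to become clickable
wait = WebDriverWait(browser, 10)
arrow = wait.until(EC.element_to_be_clickable(
    (By.CSS_SELECTOR, '.icon-arrowright-thin--pagination')))
arrow.click()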
In [6]:
main_site = 'https://www.niche.com/colleges/search/best-colleges/' # the ?page=N parameter is appended below
In [26]:
# Take only the first 500 universities (25 per page, 20 pages)
site_links = []
for page in range(1, 21):
    page_site = main_site + '?page=' + str(page)
    page_site_cache = cache_function(page_site)
    #browser.get(page_site)
    site_bs = BeautifulSoup(page_site_cache, "html.parser")
    link_uni = site_bs.find_all('a', attrs={"class": "search-result__link"})
    for link in link_uni:
        site_links.append(link.attrs['href'])
        #site_links.append(site_bs.find_all('h2', attrs={"class":"search-result__link"}).get_text())
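With 20 result pages at 25 universities each, `site_links` should now hold about 500 links; a quick check (sketch):
In [ ]:
print(len(site_links))
site_links[:3]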
In [8]:
from selenium.webdriver.support.ui import Select
In [9]:
categories=["Academics", "Overall Experience","Value"]
In [50]:
text='''Financial Aid is awesome. The only person I know with a problem has issues because her Dad died in high school and things with insurance are crazy, but she talked with financial aid and they figured out a way for her to be able to stay here without too much in loans. I am frickin broke and I get to live more or less the same way as the super rich people do. Pretty awesome!
'''
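The sample review above is handy for testing the first-sentence title extraction used in the scraping loop below (same regex, shown in isolation):
In [ ]:
sentences = re.split('(?<=[.!?]) +', text.replace("\n", ""))
print(sentences[0])  # this is what becomes article_title in the loop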
In [10]:
import time
In [11]:
import string
In [12]:
string.punctuation
Out[12]:
'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
In [13]:
import json
In [14]:
#sentences = [re.sub('([,.!]){1,}', '', sentence).strip() for sentence in sentence_list]
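A runnable version of the commented-out cleanup above (a sketch; `sentence_list` is hypothetical and not defined elsewhere in this notebook):
In [ ]:
sentence_list = ['Pretty awesome!!', 'I am frickin broke...']  # hypothetical sample input
[re.sub('([,.!]){1,}', '', sentence).strip() for sentence in sentence_list]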
In [17]:
article_title
Out[17]:
In [21]:
print(page_rewiew)
In [24]:
if not dropdown_bs.find('select', attrs={"class": "pagination__pages__selector"}):
    print(1)
In [ ]:
for page in site_links[6:]:
    page_rewiew = page + 'reviews/'
    page_rewiew_cache = cache_function(page_rewiew)
    page_rewiew_cache_bs = BeautifulSoup(page_rewiew_cache, "html.parser")
    name_uni = page_rewiew_cache_bs.find('a', attrs={"class": "entity-name__link"}).get_text()
    for category in categories:
        page_rewiew_category = page_rewiew + '?category=' + category
        browser.get(page_rewiew)
        time.sleep(1)
        # Switch the reviews dropdown to the current category
        Select(browser.find_element_by_css_selector(".review-categories")).select_by_value(category)
        time.sleep(1)
        dropdown_bs = BeautifulSoup(browser.page_source, "html.parser")
        # The page selector only appears when there is more than one page of reviews
        if dropdown_bs.find('select', attrs={"class": "pagination__pages__selector"}):
            max_page = int(dropdown_bs.find('select', attrs={"class": "pagination__pages__selector"}).find_all('option')[-1].attrs['value'])
        else:
            max_page = 1
        print(name_uni, category, max_page)
        for i in range(1, max_page + 1):
            if i > 1:
                browser.find_element_by_css_selector('.icon-arrowright-thin--pagination').click()
                time.sleep(1)
            site_bs = BeautifulSoup(browser.page_source, "html.parser")
            site_reviews = site_bs.find_all('div', attrs={"class": "review"})
            for site_review in site_reviews:
                # Pull the review rating into the record; fall back to 0 when missing
                if site_review.find('meta', attrs={"itemprop": "ratingValue"}):
                    review_stars = site_review.find('meta', attrs={"itemprop": "ratingValue"}).attrs['content']
                else:
                    review_stars = 0
                # Publication date; default to today when the page omits it
                if site_review.find('meta', attrs={"itemprop": "datePublished"}):
                    article_date = site_review.find('meta', attrs={"itemprop": "datePublished"}).attrs['content']
                else:
                    article_date = datetime.today().strftime('%Y-%m-%d')
                if site_review.find("div", attrs={"itemprop": "reviewBody"}):
                    article_content = site_review.find("div", attrs={"itemprop": "reviewBody"}).get_text()
                else:
                    article_content = 'no_review_text'
                # Strip emoji from the review text before storing it
                emoji_pattern = re.compile("["
                    u"\U0001F600-\U0001F64F"  # emoticons
                    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                    u"\U0001F680-\U0001F6FF"  # transport & map symbols
                    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                    "]+", flags=re.UNICODE)
                article_content = emoji_pattern.sub(r'', article_content)
                # Use the first sentence of the review as the article title
                # (re.split always returns at least one element, so [0] is safe)
                article_title = re.split('(?<=[.!?]) +', article_content.replace("\n", ""))[0]
                try:
                    with connection.cursor() as cursor:
                        # Create a new record
                        sql = '''
                        INSERT INTO `article`
                        (`article_pub_date`, `article_title`, `article_text`,
                        `article_url`, `article_rating`, `article_uni`, `uni_site_id`)
                        VALUES (%s, %s, %s, %s, %s, %s, %s);
                        '''
                        cursor.execute(sql, (str(pd.to_datetime(article_date)), article_title,
                                             article_content, page_rewiew_category, review_stars,
                                             name_uni, 0))
                    # The connection is not autocommit by default, so commit to save the row
                    connection.commit()
                finally:
                    print('finally')
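Once the loop finishes, the browser, the virtual display, and the database connection opened above should be released; a minimal cleanup cell (sketch):
In [ ]:
browser.quit()
display.stop()
connection.close()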