In [1]:
from urllib.request import urlopen  # fetch pages over HTTP
from bs4 import BeautifulSoup  # HTML parser; lxml is a good alternative parser
import pandas as pd
import numpy as np
import re
import sys
sys.path.append('..') 
from uni_cache.cache_function import cache_function
import pymysql
import collections
import mysql_credits

In [2]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from datetime import datetime, timedelta
from pyvirtualdisplay import Display
from selenium.webdriver.chrome.options import Options

In [3]:
import time

In [4]:
# Edit this path to match the project location on your computer
project_folder = '/home/bogdan/PycharmProjects/universities_sentiment/'
cache_folder = project_folder + 'cache/'
#site = 'https://www.studentsreview.com/CA/CIT_comments.html'


connection = pymysql.connect(
    host=mysql_credits.db_host,
    user=mysql_credits.db_user,
    password=mysql_credits.db_password,
    db=mysql_credits.db,
    charset='utf8mb4',
    cursorclass=pymysql.cursors.DictCursor
)

cursor = connection.cursor()

sql = '''SELECT article_categories FROM article WHERE article_rating > 0'''

cursor.execute(sql)

raw = cursor.fetchall()

df = pd.DataFrame(raw, columns=[i[0] for i in cursor.description])

display = Display(visible=0, size=(800, 600))
display.start()

chrome_options = Options()
chrome_options.add_argument("--no-startup-window")


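As an aside, the fetchall-plus-DataFrame steps above can be collapsed into one call with pandas' SQL reader; a minimal sketch, assuming the same pymysql connection object:

df = pd.read_sql(
    'SELECT article_categories FROM article WHERE article_rating > 0',
    connection,  # pd.read_sql accepts a DBAPI connection and infers column names
)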
In [5]:
browser = webdriver.Chrome(executable_path="..\\chromedriver.exe")

url = "https://www.niche.com/colleges/stanford-university/reviews/"
browser.get(url)
site_bs = BeautifulSoup(browser.page_source, "html.parser")

browser.find_element_by_css_selector('.icon-arrowright-thin--pagination').click()
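Note that the bare .click() above can race the page render. The WebDriverWait and expected_conditions imports from In [2] support an explicit wait instead; a minimal sketch using the same pagination selector:

wait = WebDriverWait(browser, 10)  # poll for up to 10 seconds
next_button = wait.until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, '.icon-arrowright-thin--pagination'))
)
next_button.click()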


In [6]:
main_site = 'https://www.niche.com/colleges/search/best-colleges/'  # paginated via ?page=N

In [26]:
# Take only the first 500 universities (25 per page)
site_links = []
for page in range(1, 21):
    page_site = main_site + '?page=' + str(page)
    page_site_cache = cache_function(page_site)
    site_bs = BeautifulSoup(page_site_cache, "html.parser")
    link_uni = site_bs.find_all('a', attrs={"class": "search-result__link"})
    for link in link_uni:
        site_links.append(link.attrs['href'])


Using web page from cache...
Using web page from internet...
Using web page from internet...
Using web page from internet...
Using web page from internet...
Using web page from internet...
Using web page from internet...
Using web page from internet...
Using web page from internet...
Using web page from internet...
Using web page from internet...
Using web page from internet...
Using web page from internet...
Using web page from internet...
Using web page from internet...
Using web page from internet...
Using web page from internet...
Using web page from internet...
Using web page from internet...
Using web page from internet...
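cache_function lives in the local uni_cache module, so its implementation is not shown here; the messages above suggest it fetches each URL once and replays the saved copy afterwards. A hypothetical sketch of such a helper, using the cache_folder defined earlier:

import hashlib
from urllib.request import urlopen

def cache_function(url, cache_dir=cache_folder):
    # Hypothetical re-implementation: key the cache file on the URL's md5 hash
    cache_file = cache_dir + hashlib.md5(url.encode('utf-8')).hexdigest() + '.html'
    try:
        with open(cache_file, encoding='utf-8') as f:
            print('Using web page from cache...')
            return f.read()
    except FileNotFoundError:
        print('Using web page from internet...')
        html = urlopen(url).read().decode('utf-8')
        with open(cache_file, 'w', encoding='utf-8') as f:
            f.write(html)
        return html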

In [9]:
categories = ["Academics", "Overall Experience", "Value"]

In [50]:
text='''Financial Aid is awesome. The only person I know with a problem has issues because her Dad died in high school and things with insurance are crazy, but she talked with financial aid and they figured out a way for her to be able to stay here without too much in loans. I am frickin broke and I get to live more or less the same way as the super rich people do. Pretty awesome!
'''

In [11]:
import string

In [12]:
string.punctuation


Out[12]:
'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [13]:
import json

In [14]:
#sentences = [re.sub('([,.!]){1,}', '', sentence).strip() for sentence in sentence_list]
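Later, the scraping loop splits each review into sentences with a lookbehind regex and keeps the first sentence as the article title; a quick worked example of that split:

review = 'Professors are world class. Though not always world class teachers! Worth it?'
sentences = re.split(r'(?<=[.!?]) +', review)
# ['Professors are world class.', 'Though not always world class teachers!', 'Worth it?']
article_title = sentences[0]  # 'Professors are world class.'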

In [17]:
article_title


Out[17]:
['While some classes are large and impersonal, those classes always have TAs that are devoted and sections that are small.',
 'Professors are world class, though not always world class teachers.']

In [21]:
print(page_review)


https://www.niche.com/colleges/princeton-university/reviews/

In [24]:
if not dropdown_bs.find('select', attrs={"class":"pagination__pages__selector"}):
    print(1)


1

In [ ]:
for page in site_links[6:]:  # skip the first six links (resume point)
    page_review = page + 'reviews/'
    page_review_cache = cache_function(page_review)
    page_review_cache_bs = BeautifulSoup(page_review_cache, "html.parser")
    name_uni = page_review_cache_bs.find('a', attrs={"class": "entity-name__link"}).get_text()

    for category in categories:
        # Category-specific URL; stored as article_url when inserting below
        page_review_category = page_review + '?category=' + category

        browser.get(page_review)
        time.sleep(1)

        # Switch the review category via the dropdown, then re-parse the page
        Select(browser.find_element_by_css_selector(".review-categories")).select_by_value(category)
        time.sleep(1)
        dropdown_bs = BeautifulSoup(browser.page_source, "html.parser")

        # The pagination selector is absent when there is a single page of reviews
        if dropdown_bs.find('select', attrs={"class": "pagination__pages__selector"}):
            max_page = int(dropdown_bs.find('select', attrs={"class": "pagination__pages__selector"}).find_all('option')[-1].attrs['value'])
        else:
            max_page = 1
        print(name_uni, category, max_page)

        for i in range(1, max_page + 1):
            if i > 1:
                browser.find_element_by_css_selector('.icon-arrowright-thin--pagination').click()
                time.sleep(1)

            site_bs = BeautifulSoup(browser.page_source, "html.parser")
            site_reviews = site_bs.find_all('div', attrs={"class": "review"})
            for site_review in site_reviews:
                # Review rating; default to 0 when the meta tag is missing
                if site_review.find('meta', attrs={"itemprop": "ratingValue"}):
                    review_stars = site_review.find('meta', attrs={"itemprop": "ratingValue"}).attrs['content']
                else:
                    review_stars = 0

                # Publication date; fall back to today's date when missing
                if site_review.find('meta', attrs={"itemprop": "datePublished"}):
                    article_date = site_review.find('meta', attrs={"itemprop": "datePublished"}).attrs['content']
                else:
                    article_date = datetime.today().strftime('%Y-%m-%d')

                if site_review.find("div", attrs={"itemprop": "reviewBody"}):
                    article_content = site_review.find("div", attrs={"itemprop": "reviewBody"}).get_text()
                else:
                    article_content = 'no_review_text'

                # Strip emoji from the review text before storing it
                emoji_pattern = re.compile("["
                                           u"\U0001F600-\U0001F64F"  # emoticons
                                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                           "]+", flags=re.UNICODE)
                article_content = emoji_pattern.sub(r'', article_content)

                # Use the first sentence of the review as its title
                article_title = re.split('(?<=[.!?]) +', article_content.replace("\n", ""))[0]

                try:
                    with connection.cursor() as cursor:
                        sql = '''
                            INSERT INTO `article`
                            (`article_pub_date`, `article_title`, `article_text`,
                            `article_url`, `article_rating`, `article_uni`, `uni_site_id`)
                            VALUES (%s, %s, %s, %s, %s, %s, %s);
                            '''
                        cursor.execute(sql, (str(pd.to_datetime(article_date)), article_title,
                                             article_content, page_review_category, review_stars,
                                             name_uni, 0))
                        # The connection is not autocommit, so commit to save the row
                        connection.commit()
                finally:
                    print('finally')


Using web page from internet...
University of Pennsylvania Academics 6
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
University of Pennsylvania Overall Experience 8
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
University of Pennsylvania Value 4
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
finally
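The browser, virtual display, and database connection opened earlier stay alive after the loop; a closing cell along these lines would release them:

browser.quit()      # shut down ChromeDriver
display.stop()      # stop the pyvirtualdisplay X server
connection.close()  # close the pymysql connection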