In [ ]:
'''
Copyright (C) 2015 Leiden University
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
'''
In [ ]:
'''
This script takes all HTML files in a folder with peer-reviewed assignments and stores them in a SQLite database with
two tables:
(1) Assignments (full text)
(2) Reviews (scores)
Meta:
- Written by: Jasper Ginn
- Affiliation: Online learning lab, Leiden Centre for Innovation, Leiden University
- Date: 27-05-2015
'''
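In [ ]:
'''
For reference, main() below assumes a folder layout like the sketch underneath. This sketch is an
illustration only: apart from 'fields.html', which the script reads explicitly, all folder and file
names are hypothetical.

/home/vagrant/PR_ASSIGNMENT_001/            <- 'path' argument passed to main()
    course_folder/                          <- first subdirectory on level 1
        submission_001/
            submission/fields.html          <- full text of the submitted assignment
            evaluator_1/evaluation.html     <- one HTML file per peer evaluation
            evaluator_2/evaluation.html
        submission_002/
            . . .
'''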
In [450]:
'''
+++ LOAD MODULES +++
'''
# Create a unique ID for the assignment
import uuid
# OS for folder mapping
import os
# Parse HTML
from bs4 import BeautifulSoup
# Store data in SQLite
import sqlite3 as lite
# Make requests
import urllib2
# Regex
import re
# PDF mining
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
'''
Set encoding settings
'''
import sys
# reload() restores setdefaultencoding, which site.py removes at interpreter startup
reload(sys)
sys.setdefaultencoding('utf8')
In [513]:
'''
+++ MAIN FUNCTION +++
'''
def main(path, dbname, dbpath = '~/desktop', override = "TRUE"):
    # Set up the database
    dbSetup(dbname, dbpath, override = override)
    # Folder mapping on level 1: the submission folders one level down
    dirs = os.walk(path)
    submDir = [x[1] for x in dirs][1]
    # Walk again to get the top-level subdirectory
    dirs = os.walk(path)
    subDir = [x[1] for x in dirs][0]
    # Build the full path to each submission folder
    fDir = [ "{}{}/{}".format(path, subDir[0], nDir)
             for nDir in submDir ]
    # For each folder . . .
    for folder in fDir:
        '''
        For each submission
        '''
        # Folder mapping on level 2. First, get the submission . . .
        dirs = os.walk(folder)
        # Get the subdirectories
        subd = [x[0] for x in dirs]
        PR_sub = "{}/{}".format(subd[1], "fields.html")
        '''
        Read the HTML of the submitted PR assignment and take out the text
        '''
        f = urllib2.urlopen("file://{}".format(PR_sub))
        soupi = BeautifulSoup(f, "html.parser")
        # User session ID
        uniqID = getID(soupi)
        # Hash the user ID & use it as the peer assignment ID
        PA_ID = hash(uniqID)
        # Check whether the submission links to a PDF
        res = controlLink(soupi)
        if res:
            MAIN = soupi.find("div", {'class':'field-value'})
            # Get the link
            link = MAIN.find('a').get('href')
            # Grab the text from the PDF
            txt = convPDF(str(link))
        else:
            # Get the text from the HTML
            txt = extractText(soupi)
        # Create the values to send to the db
        vals = [ ( PA_ID,
                   uniqID,
                   txt.encode("utf-8") ) ]
        # Store in the database
        dbInsert(vals, dbname, "PR_assignment", dbpath)
        '''
        For each submission, get the evaluations
        '''
        # Folder mapping on level 3
        evals = []
        for edi in subd:
            if "evaluator" in edi:
                fileR = os.listdir(edi)
                evals.append("{}/{}".format(edi, fileR[0]))
        # Get the evaluations
        for evali in evals:
            f = urllib2.urlopen("file://{}".format(evali))
            soupi = BeautifulSoup(f, "html.parser")
            # Get the evaluator's session ID
            ID = getID(soupi)
            # Get the scores
            Qvals = extractValues(soupi)
            # Link the review to the assignment via PA_ID and record the evaluator's session ID
            vals = [ ( PA_ID,
                       ID,
                       Qvals[0],
                       Qvals[1],
                       Qvals[2]) ]
            # Store in the database
            dbInsert(vals, dbname, "PR_review", dbpath)
In [506]:
'''
+++ HELPER FUNCTIONS +++
'''
'''
FUNCTION 1 : Helper function that builds the full path to the database file. It checks whether the path specified
by the user ends with '/'; if not, a '/' is inserted before the database name to avoid malformed paths.
parameters :
    dbname : string
        name of the database
    path : string
        system path where the database is stored. Defaults to '~/desktop'
'''
def pathMaker(dbname, path):
    # Expand '~' so that sqlite3 does not treat it as a literal directory name
    path = os.path.expanduser(path)
    if path.endswith('/'):
        return(path + dbname + '.db')
    else:
        return(path + '/' + dbname + '.db')
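'''
EXAMPLE (illustrative) : both branches of pathMaker should yield the same file path. The paths below
are placeholders.
'''
print pathMaker("TEST", "/home/vagrant/")   # -> /home/vagrant/TEST.db
print pathMaker("TEST", "/home/vagrant")    # -> /home/vagrant/TEST.db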
'''
FUNCTION 2 : create the SQLite database and set up the tables
Parameters :
    dbname : string
        name of the database
    path : string
        path in which to store the database. Defaults to '~/desktop'
    override : string
        if "TRUE", any existing tables are dropped and recreated
'''
def dbSetup(dbname, path = '~/desktop', override = "TRUE"):
    # Want to replace the database?
    if override == 'TRUE':
        pathfile = pathMaker(dbname, path)
        con = lite.connect(pathfile)
        cur = con.cursor()
        # Create the tables
        # Assignments
        cur.execute("DROP TABLE IF EXISTS PR_assignment;")
        cur.execute("CREATE TABLE PR_assignment(assignment_id INT, session_user_id TEXT, assignment TEXT);")
        # Reviews
        cur.execute("DROP TABLE IF EXISTS PR_review;")
        cur.execute("CREATE TABLE PR_review(assignment_id INT, session_user_id TEXT, Q1 INT, Q2 INT, Q3 INT);")
        # Commit
        con.commit()
        # Close the connection
        con.close()
    else:
        print "Database {} already exists at {}. You set the override option to {}, so the database will be left alone . . . yay!".format(dbname, path, override)
'''
FUNCTION 3 : Insert the results for each page into the database
Parameters :
    values_list : list
        list of value tuples to send to the database
    dbname : string
        name of the database
    table : string
        name of the table in which to store the results
    path : string
        path to the database. Defaults to '~/desktop/'
'''
def dbInsert(values_list, dbname, table, path = '~/desktop/'):
    # Path to the database
    pathfile = pathMaker(dbname, path)
    # Connect and insert
    con = lite.connect(pathfile)
    con.text_factory = str
    with con:
        # Cursor
        cur = con.cursor()
        # Choose the table and write the values to the db
        if table == "PR_assignment":
            cur.executemany("INSERT INTO PR_assignment (assignment_id, session_user_id, assignment) VALUES(?, ?, ?);", values_list)
        if table == "PR_review":
            cur.executemany("INSERT INTO PR_review (assignment_id, session_user_id, Q1, Q2, Q3) VALUES(?, ?, ?, ?, ?);", values_list)
        # The 'with con' block commits (i.e. saves) the changes on a clean exit
    # Close the connection
    con.close()
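'''
EXAMPLE (illustrative) : each element of values_list is a tuple matching the column order of the target
table. The values below are made-up placeholders, so the calls are commented out.
'''
#dbInsert([ (1234567890, "abc123sessionid", "Full text of the assignment . . .") ],
#         "TEST", "PR_assignment", "/home/vagrant/")
#dbInsert([ (1234567890, "def456sessionid", 3, 2, 3) ],
#         "TEST", "PR_review", "/home/vagrant/")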
'''
FUNCTION 4 : Convert PDF provided via URL to text
parameters :
url : string
url linking to pdf
Function is taken from : http://stackoverflow.com/questions/22800100/parsing-a-pdf-via-url-with-python-using-pdfminer
'''
def convPDF(url):
    # Settings
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    # Unicode
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Open the url provided as an argument to the function and read the content
    f = urllib2.urlopen(urllib2.Request(url)).read()
    # Cast to a StringIO object
    fp = StringIO(f)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    # Process each page; the extracted text accumulates in the retstr buffer
    for page in PDFPage.get_pages(fp,
                                  pagenos,
                                  maxpages=maxpages,
                                  password=password,
                                  caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)
    fp.close()
    device.close()
    # Use 'text' rather than shadowing the built-in 'str'
    text = retstr.getvalue()
    retstr.close()
    return text
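'''
EXAMPLE (illustrative) : convPDF fetches the PDF over the network, so the call is commented out. The S3
URL is a made-up placeholder for the kind of link found in the submissions.
'''
#txt = convPDF("https://s3.amazonaws.com/some-bucket/assignment.pdf")
#print txt[:200]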
'''
FUNCTION 5 : Get session_user_id from a submitted assignment
parameters :
soup : soup object
'''
def getID(soup):
    # Get the page title, which contains the session_user_id
    title = soup.find("title").text
    # Extract the ID with a regex
    SUI = re.findall('session_user_id: (.*[^,)])', title)[0]
    # Return
    return SUI
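'''
EXAMPLE (illustrative) : getID expects a <title> tag that embeds the session_user_id. The title string
below is an assumption about the export format, not taken from a real file.
'''
example_page = BeautifulSoup("<html><head><title>Peer assessment (session_user_id: abc123sessionid)</title></head></html>", "html.parser")
print getID(example_page)   # -> abc123sessionid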
'''
FUNCTION 6 : Check whether a link to a document is provided. If so, return True.
parameters :
    soup : soup object
'''
def controlLink(soup):
    MAIN = soup.find("div", {'class':'field-value'})
    # Get all 'a' tags
    tags = MAIN.findAll('a')
    # Get the hyperlinks
    links = [ t.get('href')
              for t in tags ]
    # Filter out tags without an href attribute
    links = [x for x in links if x is not None]
    # If any link points to Amazon S3 ("amazonaws"), a link to the assignment document is provided
    return any("amazonaws" in t for t in links)
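'''
EXAMPLE (illustrative) : the two cases controlLink distinguishes. The HTML snippets are hypothetical
stand-ins for the exported submission pages.
'''
with_pdf = BeautifulSoup('<div class="field-value"><a href="https://s3.amazonaws.com/bucket/file.pdf">pdf</a></div>', "html.parser")
inline_only = BeautifulSoup('<div class="field-value"><p>Answer written inline.</p></div>', "html.parser")
print controlLink(with_pdf)      # -> True
print controlLink(inline_only)   # -> False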
'''
FUNCTION 7 : Convert a bunch of <p> tags into a single document
parameters :
    soup : soup object
'''
def extractText(soup):
    # Main text
    MAIN = soup.find("div", {'class':'field-value'})
    # Find all text enclosed in <p> tags
    st = MAIN.findAll('p')
    # Extract the text of each tag
    sd = [ pe.text
           for pe in st ]
    # Join into one string
    res = " ".join(sd).replace("\n", " ")
    # Return
    return res
'''
FUNCTION 8 : Extract peer review values
parameters :
soup : soup object
'''
def extractValues(soup):
    # Cast every field value in the soup object to an integer score
    field_values = [ int(f.text)
                     for f in soup.findAll('div', {'class':'field-value'}) ]
    # Return
    return(field_values)
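In [ ]:
'''
EXAMPLE (illustrative) : a small check of the two extractors on hand-written HTML. The snippets are
hypothetical, mirroring the 'field-value' divs of the exported pages.
'''
sub = BeautifulSoup('<div class="field-value"><p>First paragraph.</p><p>Second paragraph.</p></div>', "html.parser")
print extractText(sub)      # -> First paragraph. Second paragraph.
rev = BeautifulSoup('<div class="field-value">3</div><div class="field-value">2</div><div class="field-value">1</div>', "html.parser")
print extractValues(rev)    # -> [3, 2, 1]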
In [ ]:
path = "/home/vagrant/PR_ASSIGNMENT_001/"
dbname = "TEST"
dbpath = "/home/vagrant/"
override = "TRUE"
main(path = path, dbname = dbname, dbpath = dbpath, override = override)