In [4]:
'''
Copyright (C) 2015  Leiden University

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see [http://www.gnu.org/licenses/].
'''

'''
This script looks up the IP address information for MOOC users. NOTE: it does not yet distinguish between the use 
of multiple IP addresses per user, although it would be relatively straightforward to do this.

Written by : Jasper Ginn
Affiliation : Online Learning Lab, Leiden Centre for Innovation, Leiden University
Date : 23-03-2015
Last modified : 24-03-2015
Please send suggestions/comments to : Jasperginn@cdh.leidenuniv.nl
'''

# --------------------------------------------------------------------------------

'''
Import modules
'''

# Import IPwhois
from ipwhois import IPWhois
# Import SQLite
import sqlite3 as lite
# Import pymongo
import pymongo

In [6]:
'''
+++ MAIN FUNCTION +++
'''

def mainFunction(dbname, dbpath, tablename_n, mongo_collection, mongo_field, override = "TRUE", **kwargs):
    # Setup database
    dbSetup(dbname, tablename_n, path = dbpath, override = override)
    # Get IP addresses
    q = getUniqueVals(mongo_collection, mongo_field, **kwargs)
    # For each IP, get country code
    for IP in q:
        # Check if there is more than 1 IP address
        iplen = len(IP.split(','))
        # If longer than 1, then we have more IP addresses
        if iplen > 1:
            # For every IP, do . . .
            for ipobs in IP.split(','):
                # Replace whitespace
                ipobs = ipobs.replace(" ", "")
                # Check if already in db
                res = dbCheck(ipobs, dbname, tablename_n, dbpath)
                if res != None and override == "FALSE":
                    continue
                else:
                    try:
                        # Look up
                        vals = [ ( ipobs, IPtoCC(ipobs) ) ]
                        # Insert into database
                        dbInsert(vals, dbname, tablename_n , path = dbpath)
                    except: # IPdefinedError
                        print('There was an error . . . moving on')
        else:
            # Check if already in db
            res = dbCheck(IP, dbname, tablename_n, dbpath)
            if res != None and override == "FALSE":
                continue
            else:
                try:
                    # Look up
                    vals = [ ( IP, IPtoCC(IP) ) ]
                    # Insert into database
                    dbInsert(vals, dbname, tablename_n , path = dbpath)
                except: # IPdefinedError
                    print('There was an error . . . moving on')

In [2]:
'''
+++ HELPER FUNCTIONS +++
'''

'''
FUNCTION 1 : Helper function that creates the path for the database. It evaluates whether the path specified by the user ends with
'/'. If yes, then paste. If no, then add the '/' to avoid problems.
    parameters :
        dbname : string
            name of the database
        path : string
            system path where the database is stored. Defaults to '~/desktop'
'''

def pathMaker(dbname, path):
    if path.endswith('/'):
        return(path + dbname + '.db')
    else:
        return(path + '/' + dbname + '.db')
    
'''
FUNCTION 2 (DEPRECEATED) : Helper function that retrieves all the IP addresses from the database, stores them in a list. 
    parameters :
        db_name : string
            name of database
        db_path : string
            path to the database
        table_name : string
            table in which IP addresses are stored
            
** NOTE: this function is no longer being used.
'''

def getIPs(db_name, db_path, table_name):
    # Create path to db
    dbpath = pathMaker(db_name, db_path)
    # Connect
    conn = lite.connect(dbpath)
    # With connection, do . . .
    with conn:    
        # Cursor
        cur = conn.cursor()
        # Select all observations in the db
        cur.execute("SELECT * FROM {}".format(table_name))
        # Fetch the rows
        rows = cur.fetchall()
        # Store tuples in list
        IPs = [ row[0] 
                for row in rows ]
    # Close connection
    conn.close()
    # Return
    return(IPs)

'''
FUNCTION 3 : Convert IP address to country code (two-letter cc)
    parameters :
        IP_address : string
            IPv4 or IPv6 address to convert 
'''

def IPtoCC(IP_address):
    # Look up the address
    obj = IPWhois(IP_address)
    results = obj.lookup()
    # return cc
    return(results['asn_country_code'])

'''
FUNCTION 4 : create the SQLite database and commit headers
    Parameters :
        dbname    : string
            name of the database
        tablename : string
            name of the table in which to store results
        path  : string
            path to store database. Defaults to '/home/vagrant/Documents/'
'''

def dbSetup(dbname, tablename, path = '~/desktop', override = "TRUE"):
    # Want to replace the database?
    if override == 'TRUE':
        pathfile = pathMaker(dbname, path)
        con = lite.connect(pathfile)
        cur = con.cursor()
        # send headers and create table
        cur.execute("DROP TABLE IF EXISTS {};".format(tablename))
        cur.execute("CREATE TABLE {}(IP TEXT, CC TEXT);".format(tablename))
        # Commit
        con.commit()
    else:
        print "A table with the name {} already exists for path {}. You specified the override option to be {}. The database will be left alone . . . yay!".format(dbname, path, str(override))

'''
FUNCTION 5 : Insert results form each page to the database
    Parameters :
        values_list : list 
            list of values to send to the database
        dbname      : string
            name of the database
        tablename   : string
            name of the table in which to store results
        path        : string
            path to the database. Defaults to '/home/vagrant/Documents/'
'''

def dbInsert(values_list, dbname, tablename , path = '~/desktop/'):
    # Path to db
    pathfile = pathMaker(dbname, path)
    # Try connecting and inserting
    try:
        con = lite.connect(pathfile) 
        with con:  
            # Cursor file
            cur = con.cursor()
            # Write values to db
            cur.executemany("INSERT INTO {} (IP, CC) VALUES(?, ?);".format(tablename), values_list)
            # Commit (i.e. save) changes
            con.commit()
        # Close connection
        con.close()           
    except:
        print 'Error while setting up the database. Quitting the script now . . . '
        
'''
FUNCTION 9 : Helper function to check whether an IP already exists in the database. Here, we are checking the specific IP
(which is basically a unique ID) against all IPs that already exist in the db.
    parameters : 
        url : string
            url of the specific report at reclaimnaija
        dbname : string
            name of the database
        dbtable : string
            table in which reclaimnaija results are stored
        path : string
            system path where the database is stored. Defaults to '~/desktop'
'''

def dbCheck(IP_add, dbname, dbtable, path = '~/desktop/'):
    pathsal = pathMaker(dbname, path)
    con = lite.connect(pathsal)
    # Cursor file
    with con:
        cur = con.cursor()
        cur.execute("SELECT IP FROM {} WHERE IP = ?".format(dbtable), (IP_add,))
        data=cur.fetchone()
        if data is None:
            return(None)
        else:
            print('Report for IP {} already in database . . . moving on'.format(IP_add))
            return(data[0])
    # Close db connection
    con.close()
    
'''
FUNCTION 10 : Helper function to retrieve all unique values of a collection in a mongodb instance
    parameters :
        collection : string
            Collection to query from
        field : string
            Field for which you want unique values
        **kwargs : 
            Additional arguments for mongodb (i.e. host name, user name, password etc.)
'''

def getUniqueVals(collection, field, **kwargs):
    # Connect to mongo
    client = pymongo.MongoClient(**kwargs)
    # Connect to MOOC db
    mon_db = client["MOOCs"]
    # Connect to collection
    coll = mon_db[collection]
    # Get distinct values for field
    UV = coll.distinct(field)
    # Disconnect
    client.close()
    # Return
    return(UV)

In [ ]:
'''
+++ RUN +++
'''

# Details
dbname = "IP_clickstream_metals-001"
dbpath = "/users/jasper/desktop"
tablename_n = "IPandCC"
mongo_collection = "terrorism004"
mongo_field = "user_ip" 
host = 'localhost' 
port = 27017

# Run
mainFunction(dbname, dbpath, tablename_n, mongo_collection, mongo_field, override = "TRUE", host=host, port=port)


There was an error . . . moving on
There was an error . . . moving on
There was an error . . . moving on
There was an error . . . moving on
There was an error . . . moving on
There was an error . . . moving on
There was an error . . . moving on
There was an error . . . moving on
There was an error . . . moving on
There was an error . . . moving on