In [1]:
from __future__ import print_function
import nltk.tokenize
import psycopg2
import pandas as pd
import sys
# Gateway host whose SSH tunnel forwards to the PostgreSQL stylometry database.
databaseConnectionServer = 'srn02.cs.cityu.edu.hk'
from sshtunnel import SSHTunnelForwarder
# Number of tokens per document chunk produced by the tokenization cell below.
chunk_size = 1000
# Initial author-id list; overwritten by the gender-balanced query results in the next cell.
authors = [127, 136, 24, 36, 47, 49, 87, 101, 40, 73, 78, 82, 76, 21]

In [2]:
# Open an SSH tunnel to the database host, then fetch a gender-balanced set of
# author ids and all of their documents. Produces:
#   rows    - list of (author_id, doc_content) tuples for the chunking cell
#   authors - dict mapping author_id -> "F"/"M"
#   output, i - shared state consumed by the chunking cell further down
# NOTE(review): credentials are hardcoded here and in the connect() call below;
# move them to environment variables or a secrets manager before sharing.
with SSHTunnelForwarder((databaseConnectionServer, 22), ssh_username='stylometry', ssh_password='stylometry', 
                        remote_bind_address=('localhost', 5432), local_bind_address=('localhost', 5400)):
    PORT = 5400
    df = pd.DataFrame()
    conn = None
    output = []  # filled by the chunking cell below
    i = 1        # running token counter, shared with the chunking cell
    # nltk.download('punkt')
    try:
        conn = psycopg2.connect(user="stylometry", password="stylometry",
                                database="stylometry_v2", host="localhost", port=PORT)
        cur = conn.cursor()

        documentTable = 'author'

        # Fetch every female author id.
        query = "SELECT author_id FROM " + str(documentTable)
        query += " WHERE gender like '%F%' ;"
        cur.execute(query)
        print("Execution completed")
        rows = cur.fetchall()
        print("Read completed")
        lenOfFemale = len(rows)
        print("Number of rows: %s" % (lenOfFemale))
        authors = [[row[0], "F"] for row in rows]

        # Fetch the same number of male author ids so the classes stay balanced.
        query = "SELECT author_id FROM " + str(documentTable)
        query += " WHERE gender like '%M%' LIMIT " + str(lenOfFemale) + " ;"
        cur.execute(query)
        print("Execution completed")
        rows = cur.fetchall()
        print("Read completed")
        lenOfMale = len(rows)
        # BUGFIX: this previously printed lenOfFemale, masking any imbalance
        # between the two result sets.
        print("Number of rows: %s" % (lenOfMale))

        for row in rows:
            authors.append([row[0], "M"])

        documentTable = 'document'

        # Pull every document written by one of the selected authors.
        # The ids come straight from the database query above, so string
        # interpolation is safe here; a parameterized query (execute with a
        # tuple placeholder) would still be the more robust choice.
        id_list = ", ".join(str(auth[0]) for auth in authors)
        query = ("SELECT author_id, doc_content FROM " + str(documentTable) +
                 " WHERE author_id IN (" + id_list + ") ;")
        cur.execute(query)
        print("Execution completed")
        rows = cur.fetchall()
        print("Read completed")
        print("Number of rows: %s" % (len(rows)))
        # Convert the [id, gender] pairs into a lookup dict for the chunking cell.
        authors = dict(authors)

    except psycopg2.Error as e:
        if conn:
            conn.rollback()
        print('Error %s' % e)
        sys.exit(1)

    finally:
        if conn is not None:
            conn.close()


Execution completed
Read completed
Number of rows: 14
Execution completed
Read completed
Number of rows: 14
Execution completed
Read completed
Number of rows: 218

In [3]:
%tb


No traceback available to show.

In [4]:
# Split each document into fixed-size chunks of `chunk_size` tokens and label
# every chunk with its author id and gender.
# NOTE: `i` and `output` carry over from the database cell above, so the token
# counter continues across documents exactly as in the original run.
count = 0
for row in rows:
    # doc_content arrives as bytes (Python 2 psycopg2); decode before tokenizing.
    tokens = nltk.word_tokenize(row[1].decode("utf8"))
    chunk1 = []
    for x in tokens:
        chunk1.append(x.encode("utf8"))
        if i < chunk_size:
            i += 1
        else:
            # Chunk is full: flush it as one space-joined string.
            output.append([row[0], str(' '.join(chunk1)), authors[row[0]]])
            chunk1 = []
            i = 1
    if len(chunk1) > 0:
        # BUGFIX: the trailing partial chunk previously appended only
        # [author_id, text], omitting the gender column and leaving NaN
        # values in the resulting DataFrame.
        output.append([row[0], str(' '.join(chunk1)), authors[row[0]]])
        chunk1 = []
        i = 1
    count += 1
    if (count % 1000) == 0:
        print("%s completed" % (str(count)))


df = pd.DataFrame(output, columns=["author_id", "doc_content", "gender"])
del output

In [5]:
print(df.shape)


(24987, 3)

In [ ]: