In [1]:
from __future__ import print_function
import sys

import nltk.tokenize
import pandas as pd
import psycopg2
from sshtunnel import SSHTunnelForwarder

databaseConnectionServer = 'srn02.cs.cityu.edu.hk'
chunk_size = 1000  # tokens per document chunk
authors = [127, 136, 24, 36, 47, 49, 87, 101, 40, 73, 78, 82, 76, 21]  # overwritten by the query below
In [2]:
with SSHTunnelForwarder((databaseConnectionServer, 22),
                        ssh_username='stylometry', ssh_password='stylometry',
                        remote_bind_address=('localhost', 5432),
                        local_bind_address=('localhost', 5400)):
    PORT = 5400
    df = pd.DataFrame()
    conn = None
    output = []
    i = 1  # token counter for the chunking pass below
    # nltk.download('punkt')  # run once to fetch the tokenizer models
    try:
        conn = psycopg2.connect(user="stylometry", password="stylometry",
                                database="stylometry_v2", host="localhost", port=PORT)
        cur = conn.cursor()
        documentTable = 'author'
        # Fetch every female author id.
        query = "SELECT author_id FROM " + documentTable
        query += " WHERE gender LIKE '%F%' ;"
        cur.execute(query)
        print("Execution completed")
        rows = cur.fetchall()
        print("Read completed")
        lenOfFemale = len(rows)
        print("Number of rows: %s" % lenOfFemale)
        authors = [[row[0], "F"] for row in rows]
        # Fetch the same number of male author ids to keep the classes balanced.
        query = "SELECT author_id FROM " + documentTable
        query += " WHERE gender LIKE '%M%' LIMIT " + str(lenOfFemale) + " ;"
        cur.execute(query)
        print("Execution completed")
        rows = cur.fetchall()
        print("Read completed")
        lenOfMale = len(rows)
        print("Number of rows: %s" % lenOfMale)
        for row in rows:
            authors.append([row[0], "M"])
        # Pull every document written by one of the selected authors.
        documentTable = 'document'
        idList = ", ".join(str(auth[0]) for auth in authors)
        query = ("SELECT author_id, doc_content FROM " + documentTable +
                 " WHERE author_id IN (" + idList + ") ;")
        cur.execute(query)
        print("Execution completed")
        rows = cur.fetchall()
        print("Read completed")
        print("Number of rows: %s" % len(rows))
        authors = dict(authors)  # author_id -> gender lookup for the chunking pass
    except psycopg2.Error as e:
        if conn:
            conn.rollback()
        print('Error %s' % e)
        sys.exit(1)
    finally:
        if conn is not None:
            conn.close()
In [3]:
%tb
In [4]:
count = 0
for row in rows:
    tokens = nltk.word_tokenize(row[1].decode("utf8"))
    gender = authors[row[0]]
    chunk1 = []
    for x in tokens:
        chunk1.append(x.encode("utf8"))
        if i < chunk_size:
            i += 1
        else:
            # A full chunk of chunk_size tokens: join it and emit a row.
            output.append([row[0], ' '.join(chunk1), gender])
            chunk1 = []
            i = 1
    if len(chunk1) > 0:
        # Emit the final, shorter chunk of this document.
        output.append([row[0], ' '.join(chunk1), gender])
        i = 1
    count += 1
    if (count % 1000) == 0:
        print("%s completed" % count)
df = pd.DataFrame(output, columns=["author_id", "doc_content", "gender"])
del output
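The chunking pass above amounts to splitting each document's token list into fixed-size blocks. The same logic can be written as a small helper; a sketch under the same chunk_size semantics (the name iter_chunks is illustrative):

def iter_chunks(tokens, size):
    # Yield successive space-joined blocks of at most `size` tokens;
    # only the final block of a document may be shorter.
    for start in range(0, len(tokens), size):
        yield ' '.join(tokens[start:start + size])

# e.g. output.extend([row[0], chunk, gender] for chunk in iter_chunks(tokens, chunk_size))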
In [5]:
print(df.shape)
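Beyond the shape, it is worth confirming that the two gender classes stayed roughly balanced and that the chunk lengths look right; a quick illustrative check:

print(df['gender'].value_counts())                         # chunks per class
print(df['doc_content'].str.split().str.len().describe())  # tokens per chunk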