2017.07.15 - work log - database disagreement evaluation
In [ ]:
# imports
import csv
import datetime
import json
import psycopg2
import psycopg2.extras
import six
print( "packages imported at " + str( datetime.datetime.now() ) )
In [ ]:
%pwd
In [ ]:
%ls
First, initialize my dev django project, so I can run code in this notebook that references my django models and can talk to the database using my project's settings.
You need to have installed your django virtualenv as a jupyter kernel, then selected that kernel for this notebook.
In [ ]:
%run django_init.py
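The contents of django_init.py are not reproduced in this log. A minimal sketch of what such an init script typically does (the settings module name below is illustrative, not necessarily this project's actual value):
In [ ]:
# sketch only - typical contents of a django_init.py-style setup script, not this project's actual file.
import os
import django
# point Django at the project's settings module (module name here is an assumption).
os.environ.setdefault( "DJANGO_SETTINGS_MODULE", "research.settings" )
# initialize Django's app registry so project models can be imported in this notebook.
django.setup()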
Import any sourcenet or context_analysis models or classes.
In [ ]:
# context_analysis models.
from context_analysis.models import Reliability_Names
from context_analysis.models import Reliability_Names_Evaluation
print( "sourcenet and context_analysis packages imported at " + str( datetime.datetime.now() ) )
In [ ]:
# ==> PostgreSQL (psycopg2) example:
db_type = "postgresql"
db_username = "<username>"
db_password = "<password>"
db_host = "research.local"
db_port = "5432"
db_database = "sourcenet"
db_encoding = "utf8"
# create psycopg2 connection to PostgreSQL using connection variables defined above.
pgsql_connection = psycopg2.connect( host = db_host, port = db_port, database = db_database, user = db_username, password = db_password )
print( "psycopg2 database connection created at " + str( datetime.datetime.now() ) + ": " + str( pgsql_connection ) )
In [ ]:
# create psycopg2 cursor using pgsql_connection.
pgsql_cursor = pgsql_connection.cursor( cursor_factory = psycopg2.extras.DictCursor )
print( "psycopg2 database cursor created at " + str( datetime.datetime.now() ) + ": " + str( pgsql_cursor ) )
In [ ]:
# rollback transaction:
pgsql_connection.rollback()
print( "connection transaction rolled back at " + str( datetime.datetime.now() ) )
Read and parse the delimited file of Reliability_Names rows we've already processed while evaluating disagreements, to make sure ground truth is correct. Create a database row for each.
In [ ]:
# CONSTANTS-ish
# header indexes:
# ID | Name | Article | Article_Data_List | Status | Error? (SHB = Should Have Been) | Notes
INDEX_RELIABILITY_NAMES_ID = 0
INDEX_PERSON_NAME = 1
INDEX_ARTICLE_INFO = 2
INDEX_ARTICLE_DATA_LIST = 3
INDEX_STATUS = 4
INDEX_STATUS_MESSAGE = 5
INDEX_NOTES = 6
# declare variables - program control
DEBUG = True
do_updates = False
# declare variables
delimited_file_path = ""
delimited_file = None
delimited_reader = None
current_row = None
rn_instance = None
rne_instance = None
row_counter = -1
manual_counter = -1
# declare variables - column values
reliability_names_id = -1
person_name = ""
article_info = ""
article_data_info = ""
status = ""
status_message = ""
notes = ""
label = ""
event_type = ""
# declare variables - create by hand
work_string = ""
work_list = []
article_id = -1
article_data_list = []
article_data_item = None
article_data_id_list = []
article_data_id = -1
# file path?
delimited_file_path = "reliability_names_evaluations.txt"
# read file
with open( delimited_file_path ) as delimited_file:
    # CSV reader
    delimited_reader = csv.reader( delimited_file, delimiter='|' )
    # first row is column names
    current_row = six.next( delimited_reader )
    # init shared values
    label = "prelim_month"
    event_type = Reliability_Names_Evaluation.EVENT_TYPE_REMOVE_TAGS
    # now loop over rest of rows.
    row_counter = 0
    manual_counter = 0
    for current_row in delimited_reader:
        # increment row counter
        row_counter += 1
        # print out the row.
        column_count = len( current_row )
        print( "==> row column count = " + str( column_count ) + " ( " + str( column_count == 7 ) + " ) - row: " + ",".join( current_row ) )
        # get values
        reliability_names_id = current_row[ INDEX_RELIABILITY_NAMES_ID ].strip()
        reliability_names_id = int( reliability_names_id )
        person_name = current_row[ INDEX_PERSON_NAME ].strip()
        article_info = current_row[ INDEX_ARTICLE_INFO ].strip()
        article_data_info = current_row[ INDEX_ARTICLE_DATA_LIST ].strip()
        status = current_row[ INDEX_STATUS ].strip()
        status_message = current_row[ INDEX_STATUS_MESSAGE ].strip()
        notes = current_row[ INDEX_NOTES ].strip()
        # parse out additional data.
        # ==> article_id
        # - Article [20645](http://research.local/research/context/text/article/article_data/view_with_text/?article_id=20645)
        # split first on the leading square bracket around the ID.
        work_list = article_info.split( "[" )
        # then split item 2 (index 1) on right square bracket.
        work_string = work_list[ 1 ]
        work_list = work_string.split( "]" )
        article_id = work_list[ 0 ].strip()
        article_id = int( article_id )
        #print( "----> article_id = " + str( article_id ) )
        # ==> article_data_id
        # - Article_Data: [2322 (coder=8)](http://research.local/research/context/text/article/article_data/view/?article_id=20645&article_data_id_select=2322); [2984 (coder=2)](http://research.local/research/context/text/article/article_data/view/?article_id=20645&article_data_id_select=2984)
        # split first on semi-colon
        article_data_list = article_data_info.split( ";" )
        article_data_id_list = []
        for article_data_item in article_data_list:
            # use work_list to isolate the ID - first, split on left square bracket.
            work_list = article_data_item.split( "[" )
            # get 2nd thing in list.
            work_string = work_list[ 1 ]
            # split then on left paren
            work_list = work_string.split( "(" )
            # get the first item in list.
            article_data_id = work_list[ 0 ].strip()
            article_data_id = int( article_data_id )
            # add to list.
            article_data_id_list.append( article_data_id )
            #print( "----> article_data_id = " + str( article_data_id ) )
        #-- END loop over article data items. --#
        if ( DEBUG == True ):
            print( "----> reliability_names_id = " + str( reliability_names_id ) )
            print( "----> person_name = " + person_name )
            print( "----> article_id = " + str( article_id ) )
            print( "----> article_data_id_list = " + str( article_data_id_list ) )
            print( "----> status = " + status )
            print( "----> status_message = " + status_message )
            print( "----> notes = " + notes )
        #-- END DEBUG --#
        if ( do_updates == True ):
            # call Reliability_Names_Evaluation.create_from_reliability_data()
            rne_instance = Reliability_Names_Evaluation.create_from_reliability_data(
                reliability_names_id,
                label_IN = label,
                person_name_IN = person_name,
                article_id_IN = article_id,
                article_data_id_list_IN = article_data_id_list,
                status_IN = status,
                status_message_IN = status_message,
                notes_IN = notes
            )
            # set a few more things and save.
            #rne_instance.is_single_name = True
            #rne_instance.tags.add( "single_name" )
            #rne_instance.save()
        #-- END check to see if we do_updates --#
    #-- END loop over rows. --#
#-- END with open( delimited_file_path ) --#
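For reference, the bracket-splitting above assumes each link string is well formed; a regular expression can pull the same IDs out a little more defensively. A minimal sketch of that alternative (the helper function name is mine, not part of the log), assuming the "Article [20645](url)" and "[2322 (coder=8)](url)" formats shown in the comments above:
In [ ]:
# sketch only - regex-based ID extraction, as an alternative to the split() chains above.
import re
def extract_bracketed_ids( link_text_IN ):
    # return every integer that immediately follows a left square bracket, in order.
    return [ int( id_string ) for id_string in re.findall( r"\[\s*(\d+)", link_text_IN ) ]
#-- END function extract_bracketed_ids() --#
# example: extract_bracketed_ids( article_info )[ 0 ] would be the article ID,
#     and extract_bracketed_ids( article_data_info ) the list of Article_Data IDs.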
Read and parse delimited file of Reliability_Names rows we've already removed while evaluating entries with a single name part for removal. Create database row for each.
In [ ]:
# CONSTANTS-ish
# before running, MAX( id ) = 80.
# header indexes:
# ID | Article | Article_Data | Article_Subject | Type
INDEX_RELIABILITY_NAMES_ID = 0
INDEX_ARTICLE_INFO = 1
INDEX_ARTICLE_DATA_LIST = 2
INDEX_ARTICLE_SUBJECT = 3
INDEX_STATUS = 4
# declare variables - program control
DEBUG = True
do_updates = False
# declare variables
delimited_file_path = ""
delimited_file = None
delimited_reader = None
current_row = None
rn_instance = None
rne_instance = None
row_counter = -1
manual_counter = -1
# declare variables - column values
reliability_names_id = -1
person_name = ""
article_info = ""
article_data_info = ""
article_subject = ""
status_message = ""
status = ""
notes = ""
label = ""
event_type = ""
# declare variables - create by hand
work_string = ""
work_list = []
article_id = -1
article_data_list = []
article_data_item = None
article_data_id_list = []
article_data_id = -1
status_choice_info = None
status_value = None
# file path?
delimited_file_path = "reliability_single_name_evaluation.txt"
# read file
with open( delimited_file_path ) as delimited_file:
    # CSV reader
    delimited_reader = csv.reader( delimited_file, delimiter='|' )
    # first row is column names
    current_row = six.next( delimited_reader )
    # init shared values
    event_type = Reliability_Names_Evaluation.EVENT_TYPE_DELETE
    label = "prelim_month"
    notes = "Deleted Reliability_Names because we are not working with single names for now. If CORRECT, OpenCalais got it right, but not doing single names."
    # now loop over rest of rows.
    row_counter = 0
    manual_counter = 0
    for current_row in delimited_reader:
        # increment row counter
        row_counter += 1
        # print out the row.
        column_count = len( current_row )
print( "==> row column count = " + str( column_count ) + " ( " + str( column_count == 7 ) + " ) - row: " + ",".join( current_row ) )
        # initialize
        person_name = None
        status = None
        # get values
        reliability_names_id = current_row[ INDEX_RELIABILITY_NAMES_ID ].strip()
        reliability_names_id = int( reliability_names_id )
        #person_name = current_row[ INDEX_PERSON_NAME ].strip()
        article_info = current_row[ INDEX_ARTICLE_INFO ].strip()
        article_data_info = current_row[ INDEX_ARTICLE_DATA_LIST ].strip()
        article_subject = current_row[ INDEX_ARTICLE_SUBJECT ].strip()
        # status = current_row[ INDEX_STATUS ].strip()
        status_message = current_row[ INDEX_STATUS ].strip()
        # parse out additional data.
        # ==> person_name
        # - from Article_Subject: 12010 (AS) - Betty ( id = 885; capture_method = OpenCalais_REST_API_v1 ) (mentioned; individual) ==> name: Betty
        # split on "==> name:"
        work_list = article_subject.split( "==> name:" )
        # Take the 2nd item
        person_name = work_list[ 1 ].strip()
        # ==> article_id
        # - Article [20645](http://research.local/research/context/text/article/article_data/view_with_text/?article_id=20645)
        # split first on the leading square bracket around the ID.
        work_list = article_info.split( "[" )
        # then split item 2 (index 1) on right square bracket.
        work_string = work_list[ 1 ]
        work_list = work_string.split( "]" )
        article_id = work_list[ 0 ].strip()
        article_id = int( article_id )
        # ==> article_data_id
        # - Article_Data [3193](http://research.local/research/context/text/article/article_data/view/?article_id=23065&article_data_id_select=3193)
        # split first on semi-colon
        article_data_list = article_data_info.split( ";" )
        article_data_id_list = []
        for article_data_item in article_data_list:
            # use work_list to isolate the ID - first, split on left square bracket.
            work_list = article_data_item.split( "[" )
            # get 2nd thing in list.
            work_string = work_list[ 1 ]
            # split then on right square bracket
            work_list = work_string.split( "]" )
            # get the first item in list.
            article_data_id = work_list[ 0 ].strip()
            article_data_id = int( article_data_id )
            # add to list.
            article_data_id_list.append( article_data_id )
            # print( "----> article_data_id = " + str( article_data_id ) )
        #-- END loop over article data items. --#
        # ==> Status value
        # loop over statuses. If a value found in status_message, set status
        # to that value.
        for status_choice_info in Reliability_Names_Evaluation.STATUS_CHOICES:
            # get status value (first thing in status choice info)
            status_value = status_choice_info[ 0 ]
            # in status_message?
            if ( status_value in status_message ):
                # CORRECT!
                status = status_value
            #-- END check for status in message --#
        #-- END loop over status choices. --#
        # got a status?
        if ( ( status is None ) or ( status == "" ) ):
            # no status - default to "ERROR".
            status = Reliability_Names_Evaluation.STATUS_ERROR
        #-- END check for no status match. --#
        if ( DEBUG == True ):
            print( "----> reliability_names_id = " + str( reliability_names_id ) )
            print( "----> person_name = " + person_name )
            print( "----> article_id = " + str( article_id ) )
            print( "----> article_data_id_list = " + str( article_data_id_list ) )
            print( "----> status = " + status )
            print( "----> status_message = " + status_message )
        #-- END DEBUG --#
        if ( do_updates == True ):
            # call Reliability_Names_Evaluation.create_from_reliability_data()
            rne_instance = Reliability_Names_Evaluation.create_from_reliability_data(
                reliability_names_id,
                label_IN = label,
                person_name_IN = person_name,
                article_id_IN = article_id,
                article_data_id_list_IN = article_data_id_list,
                status_IN = status,
                status_message_IN = status_message,
                event_type_IN = event_type
            )
            # set a few more things and save.
            rne_instance.is_single_name = True
            rne_instance.tags.add( "single_name" )
            rne_instance.save()
        #-- END check to see if we do_updates --#
    #-- END loop over rows. --#
#-- END with open( delimited_file_path ) --#
Reset the Reliability_Names_Evaluation table and its child tables.
In [ ]:
# DELETE all rows from the table, then reset its ID sequences.
table_name = "context_analysis_reliability_names_evaluation"
#sql_string = "DELETE FROM " + table_name
#sql_string += ";"
# run it.
#pgsql_cursor.execute( sql_string )
# loop over records in table.
evaluation_qs = None
evaluation_qs = Reliability_Names_Evaluation.objects.all()
eval_record = None
record_count = 0
for eval_record in evaluation_qs:
    # counter
    record_count += 1
    # output row details:
    print( eval_record )
    # delete.
    eval_record.delete()
#-- END loop over records. --#
print( "deleted " + str( record_count ) + " records." )
# reset the unique ID sequences to 0.
sequence_name_list = []
sequence_name_list.append( table_name + "_id_seq" )
sequence_name_list.append( "context_analysis_reliability_names_evaluation_article__id_seq" )
sequence_name_list.append( "context_analysis_reliability_names_evaluation_persons_id_seq" )
for sequence_name in sequence_name_list:
sql_string = "ALTER SEQUENCE " + sequence_name + " RESTART WITH 1;"
print( "==> " + sql_string )
pgsql_cursor.execute( sql_string )
#-- END loop over sequence names to reset. --#
# commit
pgsql_connection.commit()
print( "reset table " + table_name + " at " + str( datetime.datetime.now() ) )
TODO:
DONE:
remove_single_names-work_log.ipynb