2017.07.15 - work log - database disagreement evaluation

Setup

Setup - Imports


In [ ]:
# imports
import csv
import datetime
import json
import psycopg2
import psycopg2.extras
import six

print( "packages imported at " + str( datetime.datetime.now() ) )

In [ ]:
%pwd

In [ ]:
%ls

Setup - Initialize Django

First, initialize my dev Django project so that code in this notebook can reference my Django models and talk to the database using my project's settings.

You need to have installed your virtualenv (with Django in it) as a Jupyter kernel, then selected that kernel for this notebook.


In [ ]:
%run django_init.py
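
The contents of django_init.py aren't shown in this log; a minimal version of such a script would just point Django at the project's settings module and call django.setup() so the models can be imported here.  A sketch (the settings module name below is a placeholder assumption, in the same spirit as the "<username>" placeholders later in this log):

# hypothetical django_init.py - minimal Django bootstrap for a notebook kernel.
import os
import django

# assumed settings module name - replace with the actual project's settings.
os.environ.setdefault( "DJANGO_SETTINGS_MODULE", "<project>.settings" )
django.setup()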

Import any sourcenet or context_analysis models or classes.


In [ ]:
# context_analysis models.
from context_analysis.models import Reliability_Names
from context_analysis.models import Reliability_Names_Evaluation

print( "sourcenet and context_analysis packages imported at " + str( datetime.datetime.now() ) )

Setup - Database


In [ ]:
# ==> PostgreSQL (psycopg2) example:
db_type = "postgresql"
db_username = "<username>"
db_password = "<password>"
db_host = "research.local"
db_port = "5432"
db_database = "sourcenet"
db_encoding = "utf8"

# create psycopg2 connection to PostgreSQL using connection variables defined above.
pgsql_connection = psycopg2.connect( host = db_host, port = db_port, database = db_database, user = db_username, password = db_password )

print( "psycopg2 database connection created at " + str( datetime.datetime.now() ) + ": " + str( pgsql_connection ) )

In [ ]:
# create psycopg2 cursor using pgsql_connection.
pgsql_cursor = pgsql_connection.cursor( cursor_factory = psycopg2.extras.DictCursor )

print( "psycopg2 database cursor created at " + str( datetime.datetime.now() ) + ": " + str( pgsql_cursor ) )

In [ ]:
# rollback transaction:
pgsql_connection.rollback()

print( "connection transaction rolled back at " + str( datetime.datetime.now() ) )

Work

Read and parse the delimited files of Reliability_Names rows we've already processed, creating a database row for each.

Work - database ground truth evaluation data

Read and parse the delimited file of Reliability_Names rows we already processed while evaluating disagreements, to make sure the ground truth is correct. Create a database row for each.


In [ ]:
# CONSTANTS-ish

# header indexes:
# ID | Name | Article | Article_Data_List | Status | Error? (SHB = Should Have Been) | Notes
INDEX_RELIABILITY_NAMES_ID = 0
INDEX_PERSON_NAME = 1
INDEX_ARTICLE_INFO = 2
INDEX_ARTICLE_DATA_LIST = 3
INDEX_STATUS = 4
INDEX_STATUS_MESSAGE = 5
INDEX_NOTES = 6

# declare variables - program control
DEBUG = True
do_updates = False

# declare variables
delimited_file_path = ""
delimited_file = None
delimited_reader = None
current_row = None
rn_instance = None
rne_instance = None
row_counter = -1
manual_counter = -1

# declare variables - column values
reliability_names_id = -1
person_name = ""
article_info = ""
article_data_info = ""
status = ""
status_message = ""
notes = ""
label = ""
event_type = ""

# declare variables - create by hand
work_string = ""
work_list = []
article_id = -1
article_data_list = []
article_data_item = None
article_data_id_list = []
article_data_id = -1

# file path?
delimited_file_path = "reliability_names_evaluations.txt"

# read file
with open( delimited_file_path ) as delimited_file:
    
    # CSV reader
    delimited_reader = csv.reader( delimited_file, delimiter='|' )

    # first row is column names
    current_row = six.next( delimited_reader )
    
    # init shared values
    label = "prelim_month"
    event_type = Reliability_Names_Evaluation.EVENT_TYPE_REMOVE_TAGS
    
    # now loop over rest of rows.
    row_counter = 0
    manual_counter = 0
    for current_row in delimited_reader:
        
        # increment row counter
        row_counter += 1
        
        # print out the row.
        column_count = len( current_row )
        print( "==> row column count = " + str( column_count ) + " ( " + str( column_count == 7 ) + " ) - row: " + ",".join( current_row ) )
        
        # get values
        reliability_names_id = current_row[ INDEX_RELIABILITY_NAMES_ID ].strip()
        reliability_names_id = int( reliability_names_id )
        person_name = current_row[ INDEX_PERSON_NAME ].strip()
        article_info = current_row[ INDEX_ARTICLE_INFO ].strip()
        article_data_info = current_row[ INDEX_ARTICLE_DATA_LIST ].strip()
        status = current_row[ INDEX_STATUS ].strip()
        status_message = current_row[ INDEX_STATUS_MESSAGE ].strip()
        notes = current_row[ INDEX_NOTES ].strip()
        
        # parse out additional data.
        
        # ==> article_id
        # - Article [20645](http://research.local/research/context/text/article/article_data/view_with_text/?article_id=20645)

        # split first on the leading square bracket around the ID.
        work_list = article_info.split( "[" )

        # then split item 2 (index 1) on right square bracket.
        work_string = work_list[ 1 ]
        work_list = work_string.split( "]" )
        article_id = work_list[ 0 ].strip()
        article_id = int( article_id )
        #print( "----> article_id = " + str( article_id ) )

        # ==> article_data_id
        # - Article_Data: [2322 (coder=8)](http://research.local/research/context/text/article/article_data/view/?article_id=20645&article_data_id_select=2322); [2984 (coder=2)](http://research.local/research/context/text/article/article_data/view/?article_id=20645&article_data_id_select=2984)

        # split first on semi-colon
        article_data_list = article_data_info.split( ";" )
        article_data_id_list = []
        for article_data_item in article_data_list:

            # use work_list to isolate the ID - first, split on left square bracket.
            work_list = article_data_item.split( "[" )

            # get 2nd thing in list.
            work_string = work_list[ 1 ]

            # split then on left paren
            work_list = work_string.split( "(" )

            # get the first item in list.
            article_data_id = work_list[ 0 ].strip()
            article_data_id = int( article_data_id )

            # add to list.
            article_data_id_list.append( article_data_id )

            #print( "----> article_data_id = " + str( article_data_id ) )

        #-- END loop over article data items. --#
        
        if ( DEBUG == True ):
            
            print( "----> reliability_names_id = " + str( reliability_names_id ) )
            print( "----> person_name = " + person_name )
            print( "----> article_id = " + str( article_id ) )
            print( "----> article_data_id_list = " + str( article_data_id_list ) )
            print( "----> status = " + status )
            print( "----> status_message = " + status_message )
            print( "----> notes = " + notes )
            
        
        #-- END DEBUG --#
        
        if ( do_updates == True ):
        
            # call Reliability_Names_Evaluation.create_from_reliability_data()
            rne_instance = Reliability_Names_Evaluation.create_from_reliability_data(
                               reliability_names_id,
                               label_IN = label,
                               person_name_IN = person_name,
                               article_id_IN = article_id,
                               article_data_id_list_IN = article_data_id_list,
                               status_IN = status,
                               status_message_IN = status_message,
                               notes_IN = notes
                           )       
            
            # set a few more things and save.
            #rne_instance.is_single_name = True
            #rne_instance.tags.add( "single_name" )
            #rne_instance.save()
            
        #-- END check to see if we do_updates --#

    #-- END loop over rows. --#
    
#-- END with open( delimited_file_path ) --#
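
As an aside, the same IDs could be pulled out of the markdown-style link strings with a regular expression instead of chained split() calls.  A minimal sketch, using the example strings from the comments above (not part of the original run):

# alternative parsing sketch - the IDs are the digits right after each left square bracket.
import re

article_info_example = "- Article [20645](http://research.local/research/context/text/article/article_data/view_with_text/?article_id=20645)"
article_data_info_example = "- Article_Data: [2322 (coder=8)](http://research.local/research/context/text/article/article_data/view/?article_id=20645&article_data_id_select=2322); [2984 (coder=2)](http://research.local/research/context/text/article/article_data/view/?article_id=20645&article_data_id_select=2984)"

# article ID = digits inside the first pair of square brackets.
article_id = int( re.search( r"\[(\d+)", article_info_example ).group( 1 ) )

# Article_Data IDs = digits after each left square bracket.
article_data_id_list = [ int( id_string ) for id_string in re.findall( r"\[(\d+)", article_data_info_example ) ]

print( str( article_id ) + " - " + str( article_data_id_list ) )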

Work - database single name evaluation data

Read and parse the delimited file of Reliability_Names rows we removed while evaluating entries whose name has only a single name part. Create a database row for each.


In [ ]:
# CONSTANTS-ish

# before running, MAX( id ) = 80.

# header indexes:
# ID | Article | Article_Data | Article_Subject | Type
INDEX_RELIABILITY_NAMES_ID = 0
INDEX_ARTICLE_INFO = 1
INDEX_ARTICLE_DATA_LIST = 2
INDEX_ARTICLE_SUBJECT = 3
INDEX_STATUS = 4

# declare variables - program control
DEBUG = True
do_updates = False

# declare variables
delimited_file_path = ""
delimited_file = None
delimited_reader = None
current_row = None
rn_instance = None
rne_instance = None
row_counter = -1
manual_counter = -1

# declare variables - column values
reliability_names_id = -1
person_name = ""
article_info = ""
article_data_info = ""
article_subject = ""
status_message = ""
status = ""
notes = ""
label = ""
event_type = ""

# declare variables - create by hand
work_string = ""
work_list = []
article_id = -1
article_data_list = []
article_data_item = None
article_data_id_list = []
article_data_id = -1
status_choice_info = None
status_value = None

# file path?
delimited_file_path = "reliability_single_name_evaluation.txt"

# read file
with open( delimited_file_path ) as delimited_file:
    
    # CSV reader
    delimited_reader = csv.reader( delimited_file, delimiter='|' )

    # first row is column names
    current_row = six.next( delimited_reader )
    
    # init shared values
    event_type = Reliability_Names_Evaluation.EVENT_TYPE_DELETE
    label = "prelim_month"
    notes = "Deleted Reliability_Names because we are not working with single names for now.  If CORRECT, OpenCalais got it right, but not doing single names."
    
    # now loop over rest of rows.
    row_counter = 0
    manual_counter = 0
    for current_row in delimited_reader:
        
        # increment row counter
        row_counter += 1
        
        # print out the row.
        column_count = len( current_row )
        print( "==> row column count = " + str( column_count ) + " ( " + str( column_count == 7 ) + " ) - row: " + ",".join( current_row ) )
        
        # initialize
        person_name = None
        status = None
        
        # get values
        reliability_names_id = current_row[ INDEX_RELIABILITY_NAMES_ID ].strip()
        reliability_names_id = int( reliability_names_id )
        #person_name = current_row[ INDEX_PERSON_NAME ].strip()
        article_info = current_row[ INDEX_ARTICLE_INFO ].strip()
        article_data_info = current_row[ INDEX_ARTICLE_DATA_LIST ].strip()
        article_subject = current_row[ INDEX_ARTICLE_SUBJECT ].strip()
        # status = current_row[ INDEX_STATUS ].strip()
        status_message = current_row[ INDEX_STATUS ].strip()
        
        # parse out additional data.
        
        # ==> person_name
        # - from Article_Subject: 12010 (AS) - Betty ( id = 885; capture_method = OpenCalais_REST_API_v1 ) (mentioned; individual) ==> name: Betty
        
        # split on "==> name:"
        work_list = article_subject.split( "==> name:" )
        
        # Take the 2nd item
        person_name = work_list[ 1 ].strip()

        # ==> article_id
        # - Article [20645](http://research.local/research/context/text/article/article_data/view_with_text/?article_id=20645)

        # split first on the leading square bracket around the ID.
        work_list = article_info.split( "[" )

        # then split item 2 (index 1) on right square bracket.
        work_string = work_list[ 1 ]
        work_list = work_string.split( "]" )
        article_id = work_list[ 0 ].strip()
        article_id = int( article_id )

        # ==> article_data_id
        # - Article_Data [3193](http://research.local/research/context/text/article/article_data/view/?article_id=23065&article_data_id_select=3193)

        # split first on semi-colon
        article_data_list = article_data_info.split( ";" )
        article_data_id_list = []
        for article_data_item in article_data_list:

            # use work_list to isolate the ID - first, split on left square bracket.
            work_list = article_data_item.split( "[" )

            # get 2nd thing in list.
            work_string = work_list[ 1 ]

            # split then on right square bracket
            work_list = work_string.split( "]" )

            # get the first item in list.
            article_data_id = work_list[ 0 ].strip()
            article_data_id = int( article_data_id )

            # add to list.
            article_data_id_list.append( article_data_id )

            # print( "----> article_data_id = " + str( article_data_id ) )

        #-- END loop over article data items. --#
        
        # ==> Status value
        
        # loop over statuses.  If a value found in status_message, set status
        #     to that value.
        for status_choice_info in Reliability_Names_Evaluation.STATUS_CHOICES:
            
            # get status value (first thing in status choice info)
            status_value = status_choice_info[ 0 ]
            
            # in status_message?
            if ( status_value in status_message ):

                # CORRECT!
                status = status_value

            #-- END check for status in message --#
            
        #-- END loop over status choices. --#
        
        # got a status?
        if ( ( status is None ) or ( status == "" ) ):
            
            # no status - default to "ERROR".
            status = Reliability_Names_Evaluation.STATUS_ERROR
            
        #-- END check for no status match. --#
        
        if ( DEBUG == True ):
            
            print( "----> reliability_names_id = " + str( reliability_names_id ) )
            print( "----> person_name = " + person_name )
            print( "----> article_id = " + str( article_id ) )
            print( "----> article_data_id_list = " + str( article_data_id_list ) )
            print( "----> status = " + status )
            print( "----> status_message = " + status_message )
        
        #-- END DEBUG --#
        
        if ( do_updates == True ):
        
            # call Reliability_Names_Evaluation.create_from_reliability_data()
            rne_instance = Reliability_Names_Evaluation.create_from_reliability_data(
                               reliability_names_id,
                               label_IN = label,
                               person_name_IN = person_name,
                               article_id_IN = article_id,
                               article_data_id_list_IN = article_data_id_list,
                               status_IN = status,
                               status_message_IN = status_message,
                               event_type_IN = event_type
                           )
            
            # set a few more things and save.
            rne_instance.is_single_name = True
            rne_instance.tags.add( "single_name" )
            rne_instance.save()
            
        #-- END check to see if we do_updates --#
        
    #-- END loop over rows. --#
    
#-- END with open( delimited_file_path ) --#
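
A small follow-up check I could run after the load (field names here are an assumption - I'm assuming the model fields mirror the ..._IN parameters and flags used above):

# count the single-name evaluation rows created for this label.
single_name_qs = Reliability_Names_Evaluation.objects.filter( label = "prelim_month", is_single_name = True )
print( "single-name evaluation rows for label \"prelim_month\": " + str( single_name_qs.count() ) )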

Work - reset reliability names evaluation table

Reset the Reliability_Names_Evaluation table and its child tables.


In [ ]:
# DELETE all rows from the table (raw SQL alternative commented out below).
table_name = "context_analysis_reliability_names_evaluation"
#sql_string = "DELETE FROM " + table_name
#sql_string += ";"

# run it.
#pgsql_cursor.execute( sql_string )

# loop over records in table.
evaluation_qs = None
evaluation_qs = Reliability_Names_Evaluation.objects.all()
eval_record = None
record_count = 0
for eval_record in evaluation_qs:
    
    # counter
    record_count += 1
    
    # output row details:
    print( eval_record )
    
    # delete.
    eval_record.delete()
    
#-- END loop over records. --#

print( "deleted " + str( record_count ) + " records." )

# reset the unique ID sequences so they restart at 1.
sequence_name_list = []
sequence_name_list.append( table_name + "_id_seq" )
sequence_name_list.append( "context_analysis_reliability_names_evaluation_article__id_seq" )
sequence_name_list.append( "context_analysis_reliability_names_evaluation_persons_id_seq" )

for sequence_name in sequence_name_list:
    
    sql_string = "ALTER SEQUENCE " + sequence_name + " RESTART WITH 1;"
    print( "==> " + sql_string )
    pgsql_cursor.execute( sql_string )
    
#-- END loop over sequence names to reset. --#

# commit
pgsql_connection.commit()

print( "reset table " + table_name + " at " + str( datetime.datetime.now() ) )

TODO

TODO:

  • TK

DONE:

  • // check if Reliability_Names exists. If not, parse Article, Article_Data, Persons, and make a row by hand...
  • // wipe table and re-run.
  • // go through and update by hand the flags for the other lists.
  • // make a list of those that were merged.
  • // update with single name removals from "remove_single_names-work_log.ipynb"