2017.07.01 - work log - prelim_month - evaluate disagreements
In [ ]:
import datetime
import json
import six
print( "packages imported at " + str( datetime.datetime.now() ) )
In [ ]:
%pwd
First, initialize my dev django project, so I can run code in this notebook that references my django models and can talk to the database using my project's settings.
You need to have installed your django virtualenv as a Jupyter kernel, then select that kernel for this notebook.
In [ ]:
%run django_init.py
Import any sourcenet or context_analysis models or classes.
In [ ]:
# django imports
from django.contrib.auth.models import User
# sourcenet shared
from context_text.shared.person_details import PersonDetails
# sourcenet models.
from context_text.models import Article
from context_text.models import Article_Data
from context_text.models import Article_Subject
from context_text.models import Person
from context_text.shared.context_text_base import ContextTextBase
from context_text.tests.models.test_Article_Data_model import Article_Data_Copy_Tester
# sourcenet article_coding
from context_text.article_coding.article_coding import ArticleCoder
from context_text.article_coding.manual_coding.manual_article_coder import ManualArticleCoder
# context_analysis models.
from context_analysis.models import Reliability_Names
from context_analysis.models import Reliability_Names_Evaluation
from context_analysis.reliability.reliability_names_builder import ReliabilityNamesBuilder
print( "sourcenet and context_analysis packages imported at " + str( datetime.datetime.now() ) )
Retrieve the ground truth user, then make a deep copy of an Article_Data record, assigning it to the ground truth user.
In [ ]:
def copy_to_ground_truth_user( source_article_data_id_IN ):
'''
Accepts ID of Article_Data instance to copy to ground_truth user,
for correcting coding error made by human coder. Performs a deep
copy of Article_Data instance, then assigns it to the ground_truth
user. Prints any validation errors, returns the new Article_Data.
'''
# return reference
new_article_data_instance_OUT = -1
# declare variables
ground_truth_user = None
ground_truth_user_id = -1
id_of_article_data_to_copy = -1
new_article_data = None
new_article_data_id = -1
validation_error_list = None
validation_error_count = -1
validation_error = None
# set ID of article data we want to copy.
id_of_article_data_to_copy = source_article_data_id_IN
# get the ground_truth user's ID.
ground_truth_user = ContextTextBase.get_ground_truth_coding_user()
ground_truth_user_id = ground_truth_user.id
# make the copy
new_article_data = Article_Data.make_deep_copy( id_of_article_data_to_copy,
new_coder_user_id_IN = ground_truth_user_id )
new_article_data_id = new_article_data.id
# validate it.
validation_error_list = Article_Data_Copy_Tester.validate_article_data_deep_copy( original_article_data_id_IN = id_of_article_data_to_copy,
copy_article_data_id_IN = new_article_data_id,
copy_coder_user_id_IN = ground_truth_user_id )
# get error count:
validation_error_count = len( validation_error_list )
if ( validation_error_count > 0 ):
# loop and output messages
for validation_error in validation_error_list:
print( "- Validation erorr: " + str( validation_error ) )
#-- END loop over validation errors. --#
else:
# no errors - success!
print( "Record copy a success (as far as we know)!" )
#-- END check to see if validation errors --#
print( "copied Article_Data id " + str( id_of_article_data_to_copy ) + " INTO Article_Data id " + str( new_article_data_id ) + " at " + str( datetime.datetime.now() ) )
new_article_data_instance_OUT = new_article_data
return new_article_data_instance_OUT
#-- END function copy_to_ground_truth_user() --#
print( "function copy_to_ground_truth_user() defined at " + str( datetime.datetime.now() ) )
In [ ]:
# Example: set ID of article data we want to copy.
#copy_to_ground_truth_user( 2342 )
Delete the Article_Data whose ID you specify (intended only for when you accidentally create an extra "ground_truth" copy).
In [ ]:
def delete_article_data( article_data_id_IN, do_delete_IN = False ):
'''
Accepts ID of Article_Data instance to delete, plus a do_delete flag.
Looks up the Article_Data, then deletes it only if do_delete_IN is
True; otherwise just prints what was found (a dry run).
'''
# declare variables
article_data_id = -1
article_data = None
do_delete = False
# set do_delete from parameter.
do_delete = do_delete_IN
# set ID.
article_data_id = article_data_id_IN
# get model instance
article_data = Article_Data.objects.get( id = article_data_id )
# got something?
if ( article_data is not None ):
# yes. Delete?
if ( do_delete == True ):
# delete.
print( "Deleting Article_Data: " + str( article_data ) )
article_data.delete()
else:
# no delete.
print( "Found Article_Data: " + str( article_data ) + ", but not deleting." )
#-- END check to see if we delete --#
#-- END check to see if Article_Data match. --#
#-- END function delete_article_data() --#
print( "function delete_article_data() defined at " + str( datetime.datetime.now() ) )
Update the label on existing Reliability_Names rows for a given article (used below to move excluded articles to a different label).
In [ ]:
def update_reliability_names_label_for_article( article_id_IN, new_label_IN ):
'''
Accepts an article ID and a new label. Updates all "prelim_month"
Reliability_Names rows for that article so they have the new label,
then prints a summary string for each row affected.
'''
# declare variables
article_id = -1
label = ""
row_string_list = None
# first, get existing Reliability_Names rows for article and label.
article_id = article_id_IN
label = "prelim_month"
# Do the update
row_string_list = Reliability_Names.update_reliabilty_names_for_article( article_id,
filter_label_IN = label,
new_label_IN = new_label_IN,
do_delete_IN = False )
# print the strings.
for row_string in row_string_list:
# print it.
print( row_string )
#-- END loop over row strings --#
#-- END function update_reliability_names_label_for_article() --#
print( "function update_reliability_names_label_for_article() defined at " + str( datetime.datetime.now() ) )
Delete the existing "prelim_month" Reliability_Names rows for a given article (used when rebuilding after fixing coding).
In [ ]:
def delete_reliability_names_for_article( article_id_IN ):
'''
Accepts an article ID. Deletes all "prelim_month" Reliability_Names
rows for that article, then prints a summary string for each row
affected.
'''
# declare variables
article_id = -1
label = ""
do_delete = False
row_string_list = None
# first, get existing Reliability_Names rows for article and label.
article_id = article_id_IN
label = "prelim_month"
do_delete = True
# Do the delete
row_string_list = Reliability_Names.delete_reliabilty_names_for_article( article_id,
label_IN = label,
do_delete_IN = do_delete )
# print the strings.
for row_string in row_string_list:
# print it.
print( row_string )
#-- END loop over row strings --#
#-- END function delete_reliability_names_for_article() --#
print( "function delete_reliability_names_for_article() defined at " + str( datetime.datetime.now() ) )
In [ ]:
def rebuild_reliability_names_for_article( article_id_IN, delete_existing_first_IN = True ):
'''
Remove existing Reliability_Names records for article, then rebuild them
from related Article_Data that matches any specified criteria.
Detailed logic:
- remove old Reliability_Names for that article ( [Delete existing `Reliability_Names` for article](#Delete-existing-Reliability_Names-for-article) ). Make sure to specify both label and Article ID, so you don't delete more than you intend.
- re-run Reliability_Names creation for the article ( [Make new `Reliability_Names`](#Make-new-Reliability_Names) ). Specify:
- Article ID list (just put the ID of the article you want to reprocess in the list).
- label: make sure this is the same as the label of the rest of your Reliability_Names records ("prelim_month").
- Tag list: If you want to make even more certain that you don't do something unexpected, also specify the article tags that make up your current data set, so if you accidentally specify the ID of an article not in your data set, it won't process. Current tag is "grp_month".
- Which coders to assign to which index in the Reliability_Names record, and with what priority. You can assign multiple coders to a given index - for example, when multiple coders coded subsets of a data set and you want their combined coding to be used as "coder 1" or "coder 2". See the cell for an example.
- Automated coder type: You can specify the particular automated coding type you want for automated coder, to filter out coding done by other automated methods. See the cell for an example for "OpenCalais v2".
'''
# django imports
#from django.contrib.auth.models import User
# sourcenet imports
#from context_text.shared.context_text_base import ContextTextBase
# context_analysis imports
#from context_analysis.reliability.reliability_names_builder import ReliabilityNamesBuilder
# declare variables
my_reliability_instance = None
tag_in_list = []
article_id_in_list = []
label = ""
# declare variables - user setup
current_coder = None
current_coder_id = -1
current_index = -1
# declare variables - Article_Data filtering.
coder_type = ""
# delete old Reliability_Names?
if ( delete_existing_first_IN == True ):
# delete first
delete_reliability_names_for_article( article_id_IN )
#-- END check to see if we delete first --#
# make reliability instance
my_reliability_instance = ReliabilityNamesBuilder()
#===============================================================================
# configure
#===============================================================================
# list of tags of articles we want to process.
tag_in_list = [ "grp_month", ]
# list of IDs of articles we want to process:
article_id_in_list = [ article_id_IN, ]
# label to associate with results, for subsequent lookup.
label = "prelim_month"
# ! ====> map coders to indices
# set it up so that...
# ...the ground truth user has highest priority (4) for index 1...
current_coder = ContextTextBase.get_ground_truth_coding_user()
current_coder_id = current_coder.id
current_index = 1
current_priority = 4
my_reliability_instance.add_coder_at_index( current_coder_id, current_index, priority_IN = current_priority )
# ...coder ID 8 is priority 3 for index 1...
current_coder_id = 8
current_index = 1
current_priority = 3
my_reliability_instance.add_coder_at_index( current_coder_id, current_index, priority_IN = current_priority )
# ...coder ID 9 is priority 2 for index 1...
current_coder_id = 9
current_index = 1
current_priority = 2
my_reliability_instance.add_coder_at_index( current_coder_id, current_index, priority_IN = current_priority )
# ...coder ID 10 is priority 1 for index 1...
current_coder_id = 10
current_index = 1
current_priority = 1
my_reliability_instance.add_coder_at_index( current_coder_id, current_index, priority_IN = current_priority )
# ...and automated coder (2) is index 2
current_coder = ContextTextBase.get_automated_coding_user()
current_coder_id = current_coder.id
current_index = 2
current_priority = 1
my_reliability_instance.add_coder_at_index( current_coder_id, current_index, priority_IN = current_priority )
# and only look at coding by those users. And...
# configure so that it limits to automated coder_type of OpenCalais_REST_API_v2.
coder_type = "OpenCalais_REST_API_v2"
#my_reliability_instance.limit_to_automated_coder_type = "OpenCalais_REST_API_v2"
my_reliability_instance.automated_coder_type_include_list.append( coder_type )
# output debug JSON to file
#my_reliability_instance.debug_output_json_file_path = "/home/jonathanmorgan/" + label + ".json"
#===============================================================================
# process
#===============================================================================
# process articles
my_reliability_instance.process_articles( tag_in_list,
article_id_in_list_IN = article_id_in_list )
# output to database.
my_reliability_instance.output_reliability_data( label )
#-- END function rebuild_reliability_names_for_article() --#
print( "function rebuild_reliability_names_for_article() defined at " + str( datetime.datetime.now() ) )
First, assign "TODO" tag to all disagreements using the "View reliability name information" screen:
To do this:
First, enter the following in the fields there:
Click the "Submit Query" button. This should load all the disagreement rows (424 after removing single-word names).
TODO
" (without the quotes).Need to go through each disagreement and make sure that the ground truth is correct. In the interest of accuracy/precision/recall, my human coding serves as ground truth to compare computer against. So, will look at all the disagreements and make sure that the human coding is right. This isn't perfect. The error where both incorrectly agree is still unaddressed, and would effectively require me to re-code all the articles (which I could do...). But, better than not checking.
Evaluate disagreements using the "View reliability name information" screen.
To start, enter "TODO" (without the quotes) in the tag filter field there, then click the "Submit Query" button.
You should see all the records with disagreements that still need to be evaluated (we remove "TODO" from records as we go, to keep track of which we have evaluated). To start, the same 424 records that had disagreements after removing single names should be assigned the "TODO" tag.
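If you want to check progress from the notebook rather than the screen, something like the following could count the rows still tagged "TODO". This is a sketch only: it assumes Reliability_Names has a "label" field and a taggit-style "tags" relation, which the screens and admin links in this notebook suggest but which is not verified here.
# count Reliability_Names rows for this label still tagged "TODO" (assumed fields: label, tags).
todo_qs = Reliability_Names.objects.filter( label = "prelim_month" )
todo_qs = todo_qs.filter( tags__name__in = [ "TODO" ] ).distinct()
print( "Reliability_Names rows still tagged TODO: " + str( todo_qs.count() ) )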
First, I need to make sure that the article in question is actually a news article. Some lists and columns are written in such a different style from traditional news articles that they really shouldn't be included in this study. Others might be OK to include. For now, try to fix ground truth on them all, and I'll come back - probably will give the non-news Article_Data a separate label, then do the analysis on only "prelim_month", then on the combination of news and crazy to see the difference.
Excluded articles (label "prelim_month_exclude"):
In [ ]:
article_id = 22869
new_label = "prelim_month_exclude"
#update_reliability_names_label_for_article( article_id, new_label )
print( "==> Updated labels for article " + str( article_id ) + " to " + str( new_label ) + " at " + str( datetime.datetime.now() ) )
Need to look at each instance where there is a disagreement and make sure the human coding is correct.
Most are probably instances where the computer screwed up, but since we are calling this human coding "ground truth", want to winnow out as much human error as possible.
For each disagreement, to check for coder error (like just capturing a name part for a person whose full name was in the story), click the "Article ID" in the column that has a link to article ID. It will take you to a view of the article where all the people who coded the article are included, with each detection of a mention or quotation displayed next to the paragraph where the person was originally first detected.
If the disagreement deals with mentions only (and the person should not instead have been quoted), it is OK to skip fixing a human coder error, since mentions are not included in this work. It is also OK to fix it if you want.
For each disagreement, click on the article ID link in the row to go to the article and check to see if the human coding for the disagreement in question is correct ( http://research.local/research/context/text/article/article_data/view_with_text/ ).
Once you've evaluated and verified the human coding, remove the tag "TODO" (without the quotes) from the current record (either from the single-article view above if you've removed all disagreements, or from the disagreement view if not).
This will also place information on the Reliability_Names record into a Reliability_Names_Evaluation record in the database. The message that results from this action completing will include a link to the record (the first number in the output). Click the link to open the record and update it with additional details. Specifically:
- status - status of the human coder's coding: if the human coding was in error, set "status" to "ERROR".
- if problems were caused by automated coder error, check the "is_automated_error" checkbox.
- update "status_message" so it contains a brief description of what exactly happened (should have been mentioned, should have been quoted, missed the person entirely, etc.).
- update "Notes" with more details.
- add "Tags" if appropriate (for sports articles, for example, add "sports" tag).
NOTE: Always remove TODO tag first, so you have a record of status for each Reliability Names record. Then, once you've done that, you can merge, delete, etc.
| tag | description |
| --- | --- |
| ambiguous | if it is something that is ambiguous because of the article's implementation: "ambiguous". |
| complex | if it is something genuinely complicated, ambiguous, or confusing: "complex". |
| complex_titles | issues with long or complex titles: "complex_titles" |
| compound_attribution | single statement attributed to two or more people ("... Williams and Helder said."): "compound_attribution" |
| compound_names | issue with compound names, later fixed in admin (Dave and Krista Mason): "compound_names" |
| contributed_to | for problems because of reporters credited in the last paragraph: "contributed_to". |
| dictionary_error | issues with name parts that appear to be in a dictionary: "dictionary_error" |
| disambiguation | specific topic of ambiguity when matching name text to stored named entities: "disambiguation" |
| editing_error | issues with editing errors: "editing_error" |
| error | for particularly interesting OpenCalais errors: "error". |
| follow_on_attribution | problems with pronoun attribution ("he said" in the paragraph after a person is introduced - follow-on attribution): "follow_on_attribution". |
| foreign_names | for issues related to foreign names, add "foreign_names" tag. |
| gender_confusion | issues with gender confusion (names that can refer to either gender - Dominique, etc.): "gender_confusion" |
| initials | issues with initials (R.J. Smith): "initials" |
| interesting | if something is interesting: "interesting" (for examples for paper). |
| layout_or_design | issues with article layout/design: "layout_or_design" |
| list | issues with lists within an article: "list" |
| lookup | issues with looking up a person based on a name string: "lookup" |
| no_html | for problems because the OpenCalais API doesn't deal well with HTML, so I passed flattened text: "no_html" |
| non_news | for non-news articles, for example sports or book reviews, add "non_news" tag. |
| pronouns / pronoun_attribution | when a pronoun reference is ambiguous/indeterminate, or other "pronoun" chicanery |
| proper_nouns | issues with proper nouns and names referring to something other than people: "proper_nouns" |
| quote_distance | problem with distance between intro of person and quote (a guess...): "quote_distance". |
| said_verb | problem with said verb: "said_verb". |
| second_hand | for second-hand attribution fooling OpenCalais, use "second_hand". |
| spanish | for issues related to the Spanish language, add "spanish" tag. |
| sports | for sports articles, add "sports" tag. |
| straightforward | if a decision seems very straightforward, but OpenCalais erred, use "straightforward". |
| short_n-gram | issues with short n-grams: "short_n-gram" |
| title_prefix | issues with titles that precede a name: "title_prefix" |
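To get a rough sense of how often these tags have been applied so far, a quick tally over Reliability_Names_Evaluation could look like the sketch below. It reuses the taggit-style "tags" relation from the flagging cell near the end of this notebook; the tag list here is just an illustrative subset.
# tally evaluation records per tag for this label (illustrative subset of the tags above).
for tag_value in [ "sports", "non_news", "list", "complex", "error" ]:
    tag_count = Reliability_Names_Evaluation.objects.filter( label = "prelim_month", tags__name = tag_value ).count()
    print( "- " + tag_value + ": " + str( tag_count ) )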
Order of operations:
If the human coder did not detect a person or made some other kind of error:
- In the Reliability_Names disagreement view ( http://research.local/research/context/analysis/reliability/names/disagreement/view ), remove the tag "TODO" (without the quotes) from any items related to this disagreement and save. This will also place information on the Reliability_Names record into a Reliability_Names_Evaluation record in the database. The message that results from this action completing will include a link to the record (the first number in the output). Click the link to open the record and update it with additional details. Specifically:
    - set "status" to "ERROR".
    - update the "status_message" so it contains a brief description of what exactly happened (should have been mentioned, should have been quoted, missed the person entirely, etc.).
    - update "Notes" with more details, including the text in question.
    - check the "is_ground_truth_fixed" checkbox.
    - check the "is_automated_error" checkbox, if appropriate.
    - check the "is_ambiguous" checkbox, if appropriate.
    - add "Tags" if appropriate:
In [ ]:
# Setup variables of interest.
resolve_article_id = 24132
human_article_data_id = 2801
print( "SET variables:" )
print( "- resolve_article_id = " + str( resolve_article_id ) )
print( "- human_article_data_id = " + str( human_article_data_id ) )
print( "at " + str( datetime.datetime.now() ) )
use the function "copy_to_ground_truth_user()
" defined in section Tool - copy Article_Data to user ground_truth to create a copy of the person's Article_Data
and assign it to coder "ground_truth
". Make a code cell and set up a call to "copy_to_ground_truth_user()
", passing it the ID of the Article_Data
you want to copy to ground_truth
. Example:
# copy Article_Data 12345 to ground_truth user.
copy_to_ground_truth_user( 12345 )
In [ ]:
# copy Article_Data to ground_truth user.
print( "==> copy_to_ground_truth_user() (article: " + str( resolve_article_id ) + ") run at " + str( datetime.datetime.now() ) )
print( "" )
copy_to_ground_truth_user( human_article_data_id )
In [ ]:
# if you screw up and create two, you can delete one:
delete_article_data( 3388, do_delete_IN = True )
If this is the first time you've used the "ground_truth" user, log into the django admin ( http://research.local/research/admin/ ) and set the "ground_truth" user's password. Then log in to the coding tool ( http://research.local/research/context/text/article/code/ ) as the "ground_truth" user, fix the coding for the article in question, and save.
Make a code cell and call "rebuild_reliability_names_for_article()", passing it the ID of the article whose Reliability_Names records you want to rebuild. It will automatically delete the existing records and then rebuild them, using all the right parameters. Example:
# rebuild Reliability_Names for article 12345
rebuild_reliability_names_for_article( 12345 )
In [ ]:
# rebuild Reliability_Names for article
print( "==> rebuild_reliability_names_for_article() (article: " + str( resolve_article_id ) + ") run at " + str( datetime.datetime.now() ) )
print( "" )
rebuild_reliability_names_for_article( resolve_article_id )
Then, you'll need to re-fix any other problems with the article. Specifically:
- Load just the Reliability_Names for this article in the disagreement view ( http://research.local/research/context/analysis/reliability/names/disagreement/view ): in the article ID filter, enter "<article_id>," (without the quotes).
- Check for single names, either to remove, or to tie an erroneously parsed name to the correct person (forgot to capture first name, for example).
- Add the "TODO" tag (without the quotes) again to any rows with disagreement that haven't already been evaluated.
_Note: as you re-process, you should check to see if any of the steps already has a Reliability_Names_Evaluation row, and if so, remove the newer one so you don't have duplicates of any of the actions recorded._
If there is a problem where the human and computer coding of the same person are so different that they split into different rows, merge the computer row into the human row, then remove the computer row.
- First, merge the computer row into the human row, then update the resulting Reliability_Names_Evaluation record (set "is_automated_error" to checked, for example), add notes, adjust the status message, etc.
- Second, delete the computer-only row.
Table of Reliability_Names records with disagreements, then separate tables of those where:
Track each Reliability_Names that we evaluate:
- Reliability_Names_Evaluation table in django: http://research.local/research/admin/context_analysis/reliability_names_evaluation/?label=prelim_month&o=-1.7.8.3.5
For some, the error will be on the part of the human coder. For human error, we create a new "ground_truth" record that we then correct, so we preserve the original coding (and evidence of errors) in case we want or need that information later. Below, we have a table of the articles where we had to fix ground truth. To find the original coding, click the Article link.
- "is_ground_truth_fixed" set to True in the Reliability_Names_Evaluation table in django: http://research.local/research/admin/context_analysis/reliability_names_evaluation/?is_ground_truth_fixed__exact=1&label=prelim_month&o=-1.7.8.3.5
For some, we need to merge a single-name detection by OpenCalais with a full-name detection by ground_truth (an OpenCalais error - it did not detect the full name - combined with a lookup error - it didn't look up the right person since it missed part of his or her name). We will still have subsequently deleted one or more duplicate rows.
- "event_type" set to "merge" in the Reliability_Names_Evaluation table in django: http://research.local/research/admin/context_analysis/reliability_names_evaluation/?event_type__exact=merge&label=prelim_month&o=-1.7.8.3.5
Some records are just broken and need to be deleted.
- "event_type" set to "delete" in the Reliability_Names_Evaluation table in django: http://research.local/research/admin/context_analysis/reliability_names_evaluation/?event_type__exact=delete&label=prelim_month&o=-1.7.8.3.5
Notes and questions:
Errors of note in automated coding:
- "is_automated_error" set to True in the Reliability_Names_Evaluation table in django: http://research.local/research/admin/context_analysis/reliability_names_evaluation/?is_automated_error__exact=1&label=prelim_month&o=-1.7.8.3.5
TODO:
- Want a way to limit to disagreements where quoted? Might not - this is a start to assessing erroneous agreement. If yes, 1 < coding time < 4 hours.
- Reliability_Names.person_type only has three values - "author", "subject", "source" - might need row-level measures of "has_mention" and "has_quote" to more readily capture rows where the disagreement is over quoted-or-not.
TODO:
- Update sections of code that output table markdown to also insert that information into the database, in Reliability_Names_Evaluation.
    - // import all of the existing rows from the pipe-delimited string.
    - // update the places where the code outputs the pipe-delimited lists to also write to the database.
- Merge didn't populate merge fields in the evaluation record. Need to back up the VM, then debug.
- // Add a field to the evaluation table for non-news (and probably need a way to denote this in the articles themselves, also...).
- Need to set a flag for any Reliability_Names_Evaluation row with tags of "sports", "list", or "non_news".
In [ ]:
# declare variables
tags_in_list = None
rne_qs = None
rne_instance = None
# include only those with certain tags.
tags_in_list = [ "sports", "list", "non_news" ]
if ( len( tags_in_list ) > 0 ):
# filter
print( "filtering to just Reliability_Names_Evaluation records with tags: " + str( tags_in_list ) )
# use distinct() so a record with multiple matching tags is only processed once.
rne_qs = Reliability_Names_Evaluation.objects.filter( tags__name__in = tags_in_list ).distinct()
# loop
for rne_instance in rne_qs:
# set is_not_hard_news, then save.
print( "==> Updating " + str( rne_instance ) )
rne_instance.is_not_hard_news = True
rne_instance.save()
#-- END loop over Reliability_Names_Evaluation instances --#
#-- END check to see if we have a specific list of tags we want to include --#
Quotes with newlines in them (not sure how that is captured on the way to the server, in the database, etc.) break the article coder: http://research.local/research/context/text/article/code/.
When you load JSON that contains quote text that spans lines, the newlines within the text cause the JSON parsing to break. It looks like the text is read and parsed correctly when submitted to the server (except for the graf number - it evaluates to -1 - so that is a bug, too, since there are no newlines in any of the text we are looking at, just paragraph breaks).
How to fix?:
Examples:
Article 21001: http://research.local/research/context/text/article/article_data/view_with_text/?article_id=21001
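One possible direction for a fix - a sketch only, not what the coding tool actually does; the helper name and where it would hook in are assumptions - is to strip or escape newlines in quote text before it is placed into the JSON that the coding page loads:
# hypothetical helper: remove raw newlines from quote text before embedding it in JSON,
#     so client-side parsing doesn't break on a line break inside a string value.
def clean_quote_text_for_json( text_IN ):
    cleaned_text = text_IN
    if ( cleaned_text is not None ):
        # replace carriage returns and newlines with spaces (or escape them as "\\n"
        #     if they need to be preserved).
        cleaned_text = cleaned_text.replace( "\r", " " ).replace( "\n", " " )
    return cleaned_text
# example: json.dumps( { "quote_text": clean_quote_text_for_json( quote_text ) } )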
In ajax-selects lookup filter for person - need to match on first and last name, excluding middle name.
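A rough sketch of what that match might look like against the Person model imported above - the first_name / last_name field names are assumptions about the model, and the real ajax-selects channel would live in its own lookup class:
# hypothetical lookup: match on first and last name, ignoring middle name (assumed field names).
def find_person_matches( first_name_IN, last_name_IN ):
    match_qs = Person.objects.filter( first_name__iexact = first_name_IN )
    match_qs = match_qs.filter( last_name__iexact = last_name_IN )
    # no filter on middle name, so "Dave Mason" and "Dave A. Mason" would both match.
    return match_qs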
done:
- add example of Kevin Matthews and Jack Doles from Article 23356: ""There are two hockey sequences in the movie, and we just had Kevin Matthews of WLAV and Jack Doles from Channel 8 out to record the play-by-play," Zandstra said."
- add example of Sam Olivo (talked to sources for the story, quoted second-hand based on the sources' statements) in Article 21627: "Slowly, Sam Olivo, a 55-year-old from St. Johns and a 22-year state prison system employee, revealed he was jumped by at least one and up to five inmates assigned to a nearby work detail."
what to do about sources who are quoted from a letter or document? Not a source - a subject.