In [1]:
import sys
sys.path.append('../../common/')
import pandas as pd
from functools import reduce
import glob
import ntpath
In [2]:
from crowd_data_aggregator import CrowdsourceAggregator
from data_input_output_functions import *
In [3]:
batch = '8'
In [4]:
# Set the path to the crowd-annotated constructiveness files for this batch
constructiveness_path = '../../CF_output/constructiveness/batch' + str(batch) + '/'
In [5]:
input_csv = get_full_annotation_csv(constructiveness_path, batch)
In [6]:
# Create a CrowdsourceAggregator object for the constructiveness annotations
constructiveness_aggregator = CrowdsourceAggregator(input_csv)
In [7]:
# Keep only the non-gold (regular) questions.
constructiveness_df = constructiveness_aggregator.get_non_gold_questions()
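get_non_gold_questions() removes the gold (test) questions that the crowdsourcing platform mixes into each job for quality control. As a rough illustration of that filter, assuming the raw report carries CrowdFlower's standard _golden flag (an assumption about the input data, not the helper's actual code):

# Illustration only; the real filter lives inside CrowdsourceAggregator.
def drop_gold_rows(df):
    # Assumes the raw report exposes CrowdFlower's boolean '_golden' column.
    return df[df['_golden'] == False].copy()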
In [8]:
# Set the aggregation attributes for the constructiveness columns
attribs = {}
attribs['unit_id_col'] = ['_unit_id']
attribs['meta_cols'] = ['article_author', 'article_id', 'article_published_date',
'article_text', 'article_title', 'article_url', 'comment_author',
'comment_counter', 'comment_text','constructive_gold']
attribs['avg_cols'] = ['agree', 'constructive']
attribs['nominal_cols'] = ['constructive_characteristics', 'non_constructive_characteristics']
attribs['text_cols'] = ['crowd_comments', 'other_con_chars', 'other_noncon_chars']
In [9]:
# Get aggregated data
constructiveness_aggregated_df = constructiveness_aggregator.aggregate_annotations(constructiveness_df, attribs)
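aggregate_annotations() collapses the multiple crowd judgments per unit into a single row, guided by the attribs dictionary above: metadata is taken once per unit, avg_cols are averaged, nominal_cols are tallied, and text_cols are collected verbatim. A rough pandas sketch of that per-column strategy (an illustration of the intended behaviour, not CrowdsourceAggregator's actual implementation):

# Sketch only -- not the library's code.
def aggregate_sketch(df, attribs):
    agg_map = {}
    for col in attribs['meta_cols']:
        agg_map[col] = 'first'                                        # metadata is identical within a unit
    for col in attribs['avg_cols']:
        agg_map[col] = 'mean'                                         # average the numeric ratings
    for col in attribs['nominal_cols']:
        agg_map[col] = lambda s: s.value_counts().to_dict()           # tally checkbox answers
    for col in attribs['text_cols']:
        agg_map[col] = lambda s: ' | '.join(s.dropna().astype(str))   # keep the free text
    return df.groupby(attribs['unit_id_col'][0]).agg(agg_map).reset_index()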
In [10]:
# Rename some of the columns
constructiveness_aggregated_df.rename(columns = {'agree':'agree_constructiveness_expt',
'crowd_comments':'crowd_comments_constructiveness_expt',
'constructive_gold':'constructive_internal_gold'
}, inplace=True)
In [11]:
# Relevant columns
cols = (['article_id', 'article_author', 'article_published_date', 'article_text', 'article_title',
'article_url', 'comment_author', 'comment_counter', 'comment_text', 'agree_constructiveness_expt', 'constructive',
'constructive_internal_gold',
'constructive_characteristics', 'non_constructive_characteristics', 'other_con_chars',
'other_noncon_chars', 'crowd_comments_constructiveness_expt'])
In [12]:
# Note: slice off the suffix rather than rstrip('.csv'), which strips characters, not a suffix
constructiveness_aggregator.write_csv(constructiveness_aggregated_df, cols, input_csv[:-len('.csv')] + '_aggregated.csv')
In [13]:
# Set the path to the crowd-annotated toxicity files for this batch
toxicity_path = '../../CF_output/toxicity/batch' + str(batch) + '/'
# Previously the input file was located by naming convention: a file name containing
# exactly one underscore was taken to be the input file to be aggregated.
#input_csv_files = [f for f in glob.glob(toxicity_path + 'batch' + str(batch) + '_f*.csv') if ntpath.basename(f).count('_') == 1]
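The helper get_full_annotation_csv() presumably encapsulates this lookup. A stand-in sketch using the single-underscore convention described above (the helper's real signature and behaviour may differ):

# Stand-in sketch; the notebook actually uses get_full_annotation_csv() from data_input_output_functions.
def find_input_csv(path, batch):
    candidates = [f for f in glob.glob(path + 'batch' + str(batch) + '_f*.csv')
                  if ntpath.basename(f).count('_') == 1]
    assert len(candidates) == 1, 'Expected exactly one input CSV in ' + path
    return candidates[0]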
In [14]:
input_csv = get_full_annotation_csv(toxicity_path, batch)
In [15]:
# Create a CrowdsourceAggregator object for the toxicity annotations
toxicity_aggregator = CrowdsourceAggregator(input_csv)
In [16]:
# Keep only the non-gold (regular) questions.
toxicity_df = toxicity_aggregator.get_non_gold_questions()
In [17]:
# Set the aggregation attributes for the toxicity columns
attribs = {}
attribs['unit_id_col'] = ['_unit_id']
attribs['meta_cols'] = ['article_author', 'article_id', 'article_published_date',
'article_text', 'article_title', 'article_url', 'comment_author',
'comment_counter', 'comment_text', 'crowd_toxicity_level_gold']
attribs['avg_cols'] = ['agree', 'crowd_toxicity_level']
attribs['nominal_cols'] = ['toxicity_characteristics']
attribs['text_cols'] = ['crowd_comments', 'other_toxic_chars', 'crowd_discard', 'expert_has_content']
In [18]:
# Get aggregated data
toxicity_aggregated_df = toxicity_aggregator.aggregate_annotations(toxicity_df, attribs)
In [19]:
toxicity_aggregated_df.columns
In [20]:
# Rename some of the columns
toxicity_aggregated_df.rename(columns = {'expert_has_content':'has_content',
'agree':'agree_toxicity_expt',
'crowd_comments':'crowd_comments_toxicity_expt',
'crowd_toxicity_level_gold':'crowd_toxicity_level_internal_gold'
}, inplace=True)
In [21]:
# Relevant columns
cols = (['article_id', 'article_author', 'article_published_date', 'article_text', 'article_title',
'article_url', 'comment_author', 'comment_counter', 'comment_text', 'agree_toxicity_expt', 'crowd_toxicity_level',
'crowd_toxicity_level_internal_gold','has_content', 'crowd_discard',
'toxicity_characteristics', 'other_toxic_chars', 'crowd_comments_toxicity_expt'])
In [22]:
# Note: slice off the suffix rather than rstrip('.csv'), which strips characters, not a suffix
toxicity_aggregator.write_csv(toxicity_aggregated_df, cols, input_csv[:-len('.csv')] + '_aggregated.csv')
In [23]:
# The two aggregated dataframes to be combined
dfs = [constructiveness_aggregated_df, toxicity_aggregated_df]
In [24]:
# Merge the constructiveness and toxicity results on the shared article/comment columns
df_merged = constructiveness_aggregated_df.merge(toxicity_aggregated_df,
on=['article_id', 'article_author', 'article_published_date',
'article_title', 'article_url', 'article_text',
'comment_author', 'comment_counter', 'comment_text'],
how='outer')
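With an outer merge on this many key columns, a mismatch in any single field (e.g., slightly different comment_text) silently produces unmatched rows. An optional sanity check, not part of the original pipeline, is to repeat the merge with pandas' merge indicator and count rows present in only one of the two experiments:

# Optional check (not in the original notebook)
check = constructiveness_aggregated_df.merge(
    toxicity_aggregated_df,
    on=['article_id', 'article_author', 'article_published_date',
        'article_title', 'article_url', 'article_text',
        'comment_author', 'comment_counter', 'comment_text'],
    how='outer', indicator=True)
print(check['_merge'].value_counts())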
In [25]:
# Sort the merged dataframe on constructiveness and toxicity
df_sorted = df_merged.sort_values(by=['constructive', 'crowd_toxicity_level'], ascending = False)
In [26]:
# Relevant columns
cols = (['article_id', 'article_author', 'article_published_date',
'article_title', 'article_url', 'article_text',
'comment_author', 'comment_counter', 'comment_text',
'agree_constructiveness_expt', 'agree_toxicity_expt', 'constructive', 'constructive_internal_gold',
'crowd_toxicity_level', 'crowd_toxicity_level_internal_gold',
'has_content', 'crowd_discard',
'constructive_characteristics', 'non_constructive_characteristics',
'toxicity_characteristics',
'crowd_comments_constructiveness_expt',
'crowd_comments_toxicity_expt',
'other_con_chars', 'other_noncon_chars', 'other_toxic_chars'
])
In [27]:
output_dir = '../../CF_output/combined/'
In [28]:
df_sorted.to_csv(output_dir + 'batch' + str(batch) + '_constructiveness_and_toxicity_combined.csv', columns=cols, index=False)