In [26]:
from articledata import *
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import names
from nltk.tag import StanfordNERTagger
import os
from nltk.tokenize import word_tokenize

In [8]:
# Load the previously pickled article DataFrame that the rest of the notebook works on.
entity_pickle_path = '/Users/teresaborcuch/capstone_project/notebooks/ss_entity_data.pkl'
data = pd.read_pickle(entity_pickle_path)

In [10]:
# Peek at the first row to see which columns the loaded frame carries.
data.head(n=1)


Out[10]:
title date body section source condensed_section SA_body SA_title SA_diff total_persons_title total_places_title
0 $5 Million for a Super Bowl Ad. Another Millio... 2017-01-29 This month, Anheuser-Busch InBev hosted a doze... business NYT business 0.01624 -0.023148 0.039388 0 0

In [11]:
# Pair every name from the NLTK names corpus with the gender of the file it came from.
# Male entries come first, followed by female entries.
male_labeled = [(name, 'male') for name in names.words("male.txt")]
female_labeled = [(name, 'female') for name in names.words('female.txt')]
labeled_names = male_labeled + female_labeled

In [19]:
# Plain name lists (one per corpus file) used for the membership checks in later cells.
male_names, female_names = names.words("male.txt"), names.words("female.txt")

In [16]:
# Show the first five entries of the male-names list as a sanity check.
male_names[0:5]


Out[16]:
[u'Aamir', u'Aaron', u'Abbey', u'Abbie', u'Abbot']

In [13]:
# evaluate_entities comes from the `articledata` star import and returns a pair;
# only the first element is kept here, the second is discarded.
# NOTE(review): presumably `people` holds the person entities found in Fox politics
# articles -- confirm against articledata.evaluate_entities.
people, _ = evaluate_entities(
    data=data,
    section='politics',
    source='Fox',
)

In [18]:
# Count how many entries of `people` match a name in the NLTK male-names list.
# Matching is an exact, case-sensitive equality test against each list entry.
count = sum(1 for person in people if person in male_names)

# Parenthesised form prints the same value on Python 2 and is also valid on Python 3,
# where the bare `print count` statement is a SyntaxError.
print(count)


79

In [20]:
# Count how many entries of `people` match a name in the NLTK female-names list.
# Matching is an exact, case-sensitive equality test against each list entry.
count = sum(1 for person in people if person in female_names)

# Parenthesised form prints the same value on Python 2 and is also valid on Python 3,
# where the bare `print count` statement is a SyntaxError.
print(count)


36

In [31]:
# For every article body, count the PERSON-tagged tokens that match a known male or
# female first name. The two result lists are parallel to data['body']:
# male_counts[i] / female_counts[i] belong to the i-th article.
male_counts = []
female_counts = []

# NLTK's Stanford wrapper locates the NER jar and the classifier models through
# these environment variables.
os.environ['CLASSPATH'] = "/Users/teresaborcuch/stanford-ner-2013-11-12/stanford-ner.jar"
os.environ['STANFORD_MODELS'] = '/Users/teresaborcuch/stanford-ner-2013-11-12/classifiers'
st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')

# Build the lookup sets once: set membership is a hash lookup, whereas testing
# against the original lists scanned every name for each PERSON token.
male_name_set = set(male_names)
female_name_set = set(female_names)

# NOTE(review): tagging one article per st.tag() call is likely the slow part of this
# cell; batching through the tagger's tag_sents() may help -- confirm with the
# installed NLTK version before switching.
for x in data['body']:
    m_count = 0
    f_count = 0
    tokens = word_tokenize(x)
    tags = st.tag(tokens)
    for token, label in tags:
        if label != 'PERSON':
            continue
        # The male set is checked first, so a name that appears in both name lists
        # is counted as male only. Tokens matching neither list are ignored.
        if token in male_name_set:
            m_count += 1
        elif token in female_name_set:
            f_count += 1
    male_counts.append(m_count)
    female_counts.append(f_count)

In [33]:
# Attach the per-article name counts as new columns (female first, then male).
data['f_count'], data['m_count'] = female_counts, male_counts

In [35]:
# Write the frame, now including the f_count / m_count columns, back to disk.
# This is the same path the notebook loads `data` from, so the file is overwritten.
output_pickle_path = '/Users/teresaborcuch/capstone_project/notebooks/ss_entity_data.pkl'
data.to_pickle(output_pickle_path)

In [37]:
# Confirm the new f_count / m_count columns are present on the first row.
data.head(n=1)


Out[37]:
title date body section source condensed_section SA_body SA_title SA_diff total_persons_title total_places_title f_count m_count
0 $5 Million for a Super Bowl Ad. Another Millio... 2017-01-29 This month, Anheuser-Busch InBev hosted a doze... business NYT business 0.01624 -0.023148 0.039388 0 0 4 5

In [40]:
# Aggregate the name counts per condensed section using pivot_table's default
# aggregation (the mean), then list sections from most to fewest female names.
section_gender_means = data.pivot_table(
    index=['condensed_section'],
    values=['f_count', 'm_count'],
)
section_gender_means.sort_values('f_count', ascending=False)


Out[40]:
f_count m_count
condensed_section
entertainment 3.590985 10.497496
politics 1.786932 10.514205
sports 1.758621 15.737069
education 1.692308 8.846154
other 1.407960 9.134328
world 1.175589 5.083512
business 1.097087 6.087379
sci_health 1.082251 3.376623
opinion 0.973684 6.172249
technology 0.333333 3.404040