In [26]:
from articledata import *
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import names
from nltk.tag import StanfordNERTagger
import os
from nltk.tokenize import word_tokenize
In [8]:
# Load the pickled DataFrame this notebook works on.  The same path is
# written back to further down, once the f_count / m_count columns exist.
data = pd.read_pickle(
    '/Users/teresaborcuch/capstone_project/notebooks/ss_entity_data.pkl'
)
In [10]:
# Show only the first row to check which columns were loaded.
data.head(n=1)
Out[10]:
In [11]:
# Pair every entry of NLTK's names corpus with a gender label: all entries
# from male.txt first, followed by all entries from female.txt.
labeled_names = [(name, 'male') for name in names.words("male.txt")]
labeled_names += [(name, 'female') for name in names.words('female.txt')]
In [19]:
# Unlabelled first-name lists from NLTK's names corpus, one per gender file.
male_names, female_names = names.words("male.txt"), names.words("female.txt")
In [16]:
# Peek at the first five male names to see how entries are formatted.
male_names[0:5]
Out[16]:
In [13]:
# evaluate_entities is not defined in this notebook (presumably it comes from
# the articledata star import -- confirm).  It is unpacked as a pair here;
# only the first element is kept, the second is discarded.
people, _ = evaluate_entities(data=data, section='politics', source='Fox')
In [18]:
# Count how many entries of `people` exactly match a name in NLTK's male
# first-name list.  The list is converted to a set once so that every
# membership test is a hash lookup rather than a scan of the whole list.
male_name_set = set(male_names)
count = sum(1 for person in people if person in male_name_set)
# The parenthesised form prints the same value under Python 2 and is also
# valid Python 3.
print(count)
In [20]:
# Count how many entries of `people` exactly match a name in NLTK's female
# first-name list.  The list is converted to a set once so that every
# membership test is a hash lookup rather than a scan of the whole list.
female_name_set = set(female_names)
count = sum(1 for person in people if person in female_name_set)
# The parenthesised form prints the same value under Python 2 and is also
# valid Python 3.
print(count)
In [31]:
# For every article body, count the PERSON-tagged tokens that match a male
# or a female first name.  The results are two lists, male_counts and
# female_counts, with one entry per element of data['body'], in order.

# The Stanford NER wrapper locates its jar and classifier directory through
# these environment variables, so they are set before the tagger is built.
os.environ['CLASSPATH'] = "/Users/teresaborcuch/stanford-ner-2013-11-12/stanford-ner.jar"
os.environ['STANFORD_MODELS'] = '/Users/teresaborcuch/stanford-ner-2013-11-12/classifiers'
st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')

# Build the lookup sets once: the name lists are tested against every PERSON
# token of every article, and scanning a list each time is needlessly slow.
male_name_set = set(male_names)
female_name_set = set(female_names)

male_counts = []
female_counts = []
# NOTE(review): the tagger is invoked once per article; if this is too slow,
# batching the articles through st.tag_sents may help -- confirm.
for x in data['body']:
    m_count = 0
    f_count = 0
    tokens = word_tokenize(x)
    tags = st.tag(tokens)
    for token, label in tags:
        if label != 'PERSON':
            continue
        # The male check comes first, so a name present in both lists is
        # counted as male only.
        if token in male_name_set:
            m_count += 1
        elif token in female_name_set:
            f_count += 1
    # One entry per article keeps both lists aligned with the rows of `data`.
    male_counts.append(m_count)
    female_counts.append(f_count)
In [33]:
# Attach the per-article counts as new columns; f_count is assigned first,
# then m_count, so the column order matches the original two statements.
data['f_count'], data['m_count'] = female_counts, male_counts
In [35]:
# Persist the DataFrame, now including f_count and m_count.  This is the
# same path the data was read from, so the input file is overwritten.
data.to_pickle(
    '/Users/teresaborcuch/capstone_project/notebooks/ss_entity_data.pkl'
)
In [37]:
# Show the first row again to confirm the two count columns are present.
data.head(n=1)
Out[37]:
In [40]:
# Aggregate f_count and m_count per condensed_section (pivot_table's default
# aggregation is the mean) and list sections by f_count, highest first.
(
    data
    .pivot_table(index=['condensed_section'], values=['f_count', 'm_count'])
    .sort_values('f_count', ascending=False)
)
Out[40]: