notebook.community

Edit and run



In [26]:

    
from articledata import *
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import names
from nltk.tag import StanfordNERTagger
import os
from nltk.tokenize import word_tokenize



In [8]:

    
# Load the pickled article DataFrame that the rest of the notebook works on.
# NOTE(review): this is an absolute, machine-specific path, and unpickling
# executes whatever the file contains -- only load pickles you produced yourself.
entity_pickle_path = '/Users/teresaborcuch/capstone_project/notebooks/ss_entity_data.pkl'
data = pd.read_pickle(entity_pickle_path)



In [10]:

    
# Preview the first row to see which columns the loaded frame carries.
data.head(1)









    Out[10]:






  
    
      
      title
      date
      body
      section
      source
      condensed_section
      SA_body
      SA_title
      SA_diff
      total_persons_title
      total_places_title
    
  
  
    
      0
      $5 Million for a Super Bowl Ad. Another Millio...
      2017-01-29
      This month, Anheuser-Busch InBev hosted a doze...
      business
      NYT
      business
      0.01624
      -0.023148
      0.039388
      0
      0



In [11]:

    
# Build one flat list of (name, gender) pairs: every entry of the corpus file
# "male.txt" labelled 'male', followed by every entry of 'female.txt'
# labelled 'female'.
labeled_names = []
for corpus_file, gender in (("male.txt", 'male'), ('female.txt', 'female')):
    labeled_names.extend((name, gender) for name in names.words(corpus_file))



In [19]:

    
# Read the two gendered first-name word lists from NLTK's `names` corpus;
# the generator yields them in the same order as the targets on the left.
male_names, female_names = (
    names.words(corpus_file) for corpus_file in ("male.txt", "female.txt")
)



In [16]:

    
# Peek at the first five entries of the male name list.
male_names[:5]









    Out[16]:





[u'Aamir', u'Aaron', u'Abbey', u'Abbie', u'Abbot']



In [13]:

    
# Run the entity helper on the Fox / politics slice of the data and keep only
# the first of its two return values.
# NOTE(review): `evaluate_entities` is not defined in this notebook -- presumably
# it arrives via the wildcard `articledata` import, and the discarded second
# value is presumably the place entities. Confirm against that module.
people, _ = evaluate_entities(
    data=data,
    section='politics',
    source='Fox',
)



In [18]:

    
# Count how many items yielded by iterating `people` are exact (case-sensitive)
# members of the male first-name list.
count = sum(1 for person in people if person in male_names)

# Function-call form of print: output is identical under Python 2 for a single
# argument, and it no longer raises a SyntaxError under Python 3.
print(count)



In [20]:

    
# Count how many items yielded by iterating `people` are exact (case-sensitive)
# members of the female first-name list.
count = sum(1 for person in people if person in female_names)

# Function-call form of print: output is identical under Python 2 for a single
# argument, and it no longer raises a SyntaxError under Python 3.
print(count)



In [31]:

    
# For every article body, count how many PERSON-tagged tokens are known male
# or female first names. Produces two lists, `male_counts` and `female_counts`,
# each with one integer per row of data['body'], in row order.

# Point NLTK's Stanford NER wrapper at the local jar and classifier directory.
# NOTE(review): these are absolute, machine-specific paths.
os.environ['CLASSPATH'] = "/Users/teresaborcuch/stanford-ner-2013-11-12/stanford-ner.jar"
os.environ['STANFORD_MODELS'] = '/Users/teresaborcuch/stanford-ner-2013-11-12/classifiers'
st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')

# Membership is tested once per PERSON token across the whole corpus, so hash
# the name lists once here instead of scanning them linearly on every test.
male_name_set = set(male_names)
female_name_set = set(female_names)

male_counts = []
female_counts = []
for x in data['body']:
    m_count = 0
    f_count = 0
    tokens = word_tokenize(x)
    tags = st.tag(tokens)
    for token, label in tags:
        if label != 'PERSON':
            continue
        # Male list is checked first, so a name present in both lists is
        # counted as male only. PERSON tokens in neither list are ignored.
        if token in male_name_set:
            m_count += 1
        elif token in female_name_set:
            f_count += 1
    male_counts.append(m_count)
    female_counts.append(f_count)



In [33]:

    
# Attach the per-article name counts to the frame as new columns, adding
# 'f_count' before 'm_count' so the resulting column order is unchanged.
for column_name, per_article_counts in (('f_count', female_counts), ('m_count', male_counts)):
    data[column_name] = per_article_counts



In [35]:

    
# Persist the frame, now carrying the count columns, to disk.
# NOTE(review): this is the same absolute path the notebook read `data` from,
# so the input file is overwritten in place.
output_pickle_path = '/Users/teresaborcuch/capstone_project/notebooks/ss_entity_data.pkl'
data.to_pickle(output_pickle_path)



In [37]:

    
# Re-inspect the first row to check the frame after the f_count / m_count
# columns were assigned.
data.head(1)









    Out[37]:






  
    
      
      title
      date
      body
      section
      source
      condensed_section
      SA_body
      SA_title
      SA_diff
      total_persons_title
      total_places_title
      f_count
      m_count
    
  
  
    
      0
      $5 Million for a Super Bowl Ad. Another Millio...
      2017-01-29
      This month, Anheuser-Busch InBev hosted a doze...
      business
      NYT
      business
      0.01624
      -0.023148
      0.039388
      0
      0
      4
      5



In [40]:

    
# Aggregate f_count and m_count per condensed section using pivot_table's
# default aggregation (the mean), then order sections by f_count, highest first.
(
    data
    .pivot_table(index=['condensed_section'], values=['f_count', 'm_count'])
    .sort_values('f_count', ascending=False)
)









    Out[40]:






  
    
      
      f_count
      m_count
    
    
      condensed_section
      
      
    
  
  
    
      entertainment
      3.590985
      10.497496
    
    
      politics
      1.786932
      10.514205
    
    
      sports
      1.758621
      15.737069
    
    
      education
      1.692308
      8.846154
    
    
      other
      1.407960
      9.134328
    
    
      world
      1.175589
      5.083512
    
    
      business
      1.097087
      6.087379
    
    
      sci_health
      1.082251
      3.376623
    
    
      opinion
      0.973684
      6.172249
    
    
      technology
      0.333333
      3.404040

	f_count	m_count
condensed_section
entertainment	3.590985	10.497496
politics	1.786932	10.514205
sports	1.758621	15.737069
education	1.692308	8.846154
other	1.407960	9.134328
world	1.175589	5.083512
business	1.097087	6.087379
sci_health	1.082251	3.376623
opinion	0.973684	6.172249
technology	0.333333	3.404040