Analyzing UK News Data Sets



author: Sands Fish
org: Berkman Center for Internet & Society
date: Aug. 2014

Libraries, data, functions


In [1]:
%matplotlib inline
from IPython.core.display import HTML
from collections import Counter
import pandas as pd
import numpy as np
import json
import dateutil.parser
import math
import numexpr
import re

In [2]:
# Read In All Data
guardian = pd.read_csv('/Users/sands/Data/all_articles_2/guardian_fulltext.tsv', sep='\t')
telegraph = pd.read_csv('/Users/sands/Data/all_articles_2/telegraph_fulltext.tsv', sep='\t')
dailymail = pd.read_csv('/Users/sands/Data/all_articles_2/dailymail_fulltext.tsv', sep='\t')

In [5]:
# Build Section Totals
def column_totals(df, column):
    column_totals = {}
    for c in df[column].unique():
        column_totals[c] = df[df[column] == c][column].count()
    return column_totals

def plot_section_counts(counts, threshold=None):
    section_frame = pd.DataFrame.from_dict(counts, orient='index')
    # section_frame.index.values.tolist()
    section_series = section_frame[section_frame[0] > threshold].sort(ascending=False)
    section_series.plot(kind='bar', alpha=0.3, color='g', figsize=(18,6), title='section counts', legend=[])
    
def summarize_column_stats(df, column, value):
    df_total = float(len(df))
    df_value = float(len(df[df[column] == value]))

    output_html = '<hr>'
    output_html = output_html + 'Total Count: <b>%0.0f</b>' % (df_total) + '<br>'
    output_html = output_html + '\'%s\' Count: <b>%0.0f</b>' % (value, df_value) + '<br>'
    output_html = output_html + "<b>%0.2f</b> %% where '%s' == '%s'" % (df_value/df_total, column, value) + '<hr>'
    return HTML(output_html)

def plot_section_over_time(df, section, color='g', label=''):
    # Subset to relevant section
    df_oped = df[df['section'] == section]
    
    # Create Index DataFrame with DateTime Objects
    date_object_index = [dateutil.parser.parse(d) for d in df_oped['date']]

    # Set index
    df_oped.index = date_object_index
    
    # Plot grouped on year/month/day
    df_oped.groupby([df_oped.index.year, df_oped.index.month, df_oped.index.day]).size().plot(figsize=(18,6), legend='Both, Female, Male', color=color, label=label, alpha=0.5, title = "%s section over time" % section)

Gender Specifications


In [6]:
#  B - both
#  E - error
#  F - female
#  M - male
#  X - unknown?
print('Guardian Genders: ' + str(sorted(guardian['gender'].unique().tolist())))
print('Telegraph Genders: ' + str(sorted(telegraph['gender'].unique().tolist())))
print('Daily Mail Genders: ' + str(sorted(dailymail['gender'].unique().tolist())))


Guardian Genders: ['B', 'E', 'F', 'M', 'X']
Telegraph Genders: ['B', 'F', 'M', 'X']
Daily Mail Genders: ['B', 'F', 'M', 'X']

The Guardian: "commentisfree"


In [7]:
section_totals = column_totals(guardian, 'section')

guardian_male = guardian[guardian['gender'] == 'M']
guardian_female = guardian[guardian['gender'] == 'F']

plot_section_over_time(guardian_female, 'commentisfree', color='g', label='Female')
plot_section_over_time(guardian_male, 'commentisfree', color='b', label='Male')

plot_section_counts(section_totals, threshold=1000)
summarize_column_stats(guardian, 'section', 'commentisfree')


Out[7]:

Total Count: 143515
'commentisfree' Count: 10323
0.07 % where 'section' == 'commentisfree'

The Telegraph: "Comment"


In [10]:
section_totals = column_totals(telegraph, 'section')

# plot_section_over_time(telegraph, 'Comment')

telegraph_male = telegraph[telegraph['gender'] == 'M']
telegraph_female = telegraph[telegraph['gender'] == 'F']

plot_section_over_time(telegraph_male, 'Comment', color='b', label='Male')
plot_section_over_time(telegraph_female, 'Comment', color='g', label='Female')

plot_section_counts(section_totals, threshold=1000)
summarize_column_stats(telegraph, 'section', 'Comment')


Out[10]:

Total Count: 110029
'Comment' Count: 5312
0.05 % where 'section' == 'Comment'

Daily Mail: "debate"


In [9]:
section_totals = column_totals(dailymail, 'section')

plot_section_over_time(dailymail, 'debate')
plot_section_counts(section_totals)
summarize_column_stats(dailymail, 'section', 'debate')


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-9-23094ad8a542> in <module>()
      1 section_totals = column_totals(dailymail, 'section')
      2 
----> 3 plot_section_over_time(dailymail, 'debate')
      4 plot_section_counts(section_totals)
      5 summarize_column_stats(dailymail, 'section', 'debate')

<ipython-input-5-8f6ace3331a3> in plot_section_over_time(df, section, color, label)
     27 
     28     # Create Index DataFrame with DateTime Objects
---> 29     date_object_index = [dateutil.parser.parse(d) for d in df_oped['date']]
     30 
     31     # Set index

/Users/sands/Virtualenvs/datawork/lib/python2.7/site-packages/dateutil/parser.pyc in parse(timestr, parserinfo, **kwargs)
    746         return parser(parserinfo).parse(timestr, **kwargs)
    747     else:
--> 748         return DEFAULTPARSER.parse(timestr, **kwargs)
    749 
    750 

/Users/sands/Virtualenvs/datawork/lib/python2.7/site-packages/dateutil/parser.pyc in parse(self, timestr, default, ignoretz, tzinfos, **kwargs)
    308 
    309 
--> 310         res, skipped_tokens = self._parse(timestr, **kwargs)
    311 
    312         if res is None:

/Users/sands/Virtualenvs/datawork/lib/python2.7/site-packages/dateutil/parser.pyc in _parse(self, timestr, dayfirst, yearfirst, fuzzy, fuzzy_with_tokens)
    364             yearfirst = info.yearfirst
    365         res = self._result()
--> 366         l = _timelex.split(timestr)
    367 
    368 

/Users/sands/Virtualenvs/datawork/lib/python2.7/site-packages/dateutil/parser.pyc in split(cls, s)
    148 
    149     def split(cls, s):
--> 150         return list(cls(s))
    151     split = classmethod(split)
    152 

/Users/sands/Virtualenvs/datawork/lib/python2.7/site-packages/dateutil/parser.pyc in next(self)
    145 
    146     def next(self):
--> 147         return self.__next__()  # Python 2.x support
    148 
    149     def split(cls, s):

/Users/sands/Virtualenvs/datawork/lib/python2.7/site-packages/dateutil/parser.pyc in __next__(self)
    139 
    140     def __next__(self):
--> 141         token = self.get_token()
    142         if token is None:
    143             raise StopIteration

/Users/sands/Virtualenvs/datawork/lib/python2.7/site-packages/dateutil/parser.pyc in get_token(self)
     70                 nextchar = self.charstack.pop(0)
     71             else:
---> 72                 nextchar = self.instream.read(1)
     73                 while nextchar == '\x00':
     74                     nextchar = self.instream.read(1)

AttributeError: 'numpy.int64' object has no attribute 'read'

In [ ]: