In [13]:
%matplotlib inline
from IPython.core.display import HTML
from collections import Counter
import pandas as pd
import numpy as np
import json
import dateutil.parser
import math
import numexpr
import re
In [14]:
# Read in all UK data (one TSV of full-text articles per publication).
# NOTE(review): absolute local path — hoisted into a single constant so the
# notebook can be pointed at another machine's data directory in one place.
DATA_DIR = '/Users/sands/Data/all_articles_2'
guardian = pd.read_csv(DATA_DIR + '/guardian_fulltext.tsv', sep='\t')
telegraph = pd.read_csv(DATA_DIR + '/telegraph_fulltext.tsv', sep='\t')
dailymail = pd.read_csv(DATA_DIR + '/dailymail_fulltext.tsv', sep='\t')
In [15]:
# Function to extract word features into the format for the NLTK classifier.
def word_features(article_text):
    """Return presence features {word: True} for each whitespace token.

    article_text is coerced with str() so non-string values (e.g. NaN
    leaking from pandas) do not raise; a float NaN becomes the single
    token 'nan'. An empty/whitespace-only string yields an empty dict.
    """
    # Dict comprehension is the idiomatic form of dict((k, True) for ...).
    return {word: True for word in str(article_text).split()}
In [16]:
# Stack the three publications into one frame, then drop any row with a
# missing field before feature extraction.
all_uk = pd.concat([guardian, dailymail, telegraph], ignore_index=True)
clean_uk = all_uk.dropna()
n_total, n_clean = len(all_uk), len(clean_uk)
print('Out of %d UK articles, %d are clean/non-nan' % (n_total, n_clean))
In [17]:
# Select the op-ed sections. NOTE(review): section names presumably map to
# Guardian ('commentisfree'), Telegraph ('Comment') and Daily Mail
# ('debate') — confirm against the source data.
OPED_SECTION_NAMES = ['commentisfree', 'Comment', 'debate']
oped_sections = [clean_uk[clean_uk['section'] == name] for name in OPED_SECTION_NAMES]
oped = pd.concat(oped_sections)
# Positive training examples: bag-of-words features labeled 'oped'.
pos_examples = [(word_features(row['text']), 'oped') for _, row in oped.iterrows()]
for section_frame in oped_sections:
    print('Publication Op-Ed Total: %d' % (len(section_frame)))
print('%d Op-Ed Articles in clean UK set' % (len(oped)))
In [18]:
# Everything outside the three op-ed sections is a negative example pool.
non_oped = clean_uk.query('section != "commentisfree" and section != "Comment" and section != "debate"')
# Draw as many negative rows as there are positive examples so the training
# set is balanced. NOTE(review): np.random.choice samples WITH replacement
# by default, so the negative set may contain duplicate articles, and no
# seed is set, so results vary between runs.
rows = np.random.choice(non_oped.index.values, len(pos_examples))
# BUG FIX: .ix was deprecated in pandas 0.20 and removed in 1.0; use .loc
# for label-based row lookup (these are index labels, not positions).
sampled_rows = non_oped.loc[rows]
neg_examples = [(word_features(s['text']), 'non') for i, s in sampled_rows.iterrows()]
In [19]:
# Sanity checks on the sampling step; each should print True.
checks = [
    # Sampling from the non-op-ed index produced the requested count.
    len(pos_examples) == len(rows) == len(sampled_rows),
    # Op-ed and non-op-ed partition the clean set exactly.
    len(non_oped) + len(oped) == len(clean_uk),
    # Positive and negative examples are balanced.
    len(neg_examples) == len(pos_examples),
]
for check in checks:
    print(check)
In [20]:
from nltk.classify import NaiveBayesClassifier

# Training data: all positives followed by all negatives (ordering is
# irrelevant to Naive Bayes).
train_set = []
train_set.extend(pos_examples)
train_set.extend(neg_examples)
classifier = NaiveBayesClassifier.train(train_set)
In [21]:
# Smoke test: classify one known-positive example; the bare last
# expression displays the predicted label.
sample_features, _ = pos_examples[5]
predicted_label = classifier.classify(sample_features)
predicted_label
Out[21]:
In [22]:
# Predict every positive (op-ed) example; the Counter shows how many the
# classifier labels correctly on its own training data.
results = [classifier.classify(features) for features, _ in pos_examples]
Counter(results)
Out[22]:
In [23]:
# Same check for the negative (non-op-ed) training examples.
results = [classifier.classify(features) for features, _ in neg_examples]
Counter(results)
Out[23]:
In [24]:
# Export the labeled article texts to TSV for use outside this notebook.
# NOTE(review): labels here are 'oped' / 'non-oped', while the classifier
# cells above used 'non' for negatives — the two label vocabularies are
# inconsistent; unify if the file feeds back into training.
pos_text_labeled = [(s['text'], 'oped') for i, s in oped.iterrows()]
neg_text_labeled = [(s['text'], 'non-oped') for i, s in sampled_rows.iterrows()]
all_text_and_labels = pos_text_labeled + neg_text_labeled
count = 0
# BUG FIX: the file was opened in binary mode ('wb') while writing str,
# which raises TypeError on every write under Python 3 — open in text mode.
# No header row is written, so read it back with header=None.
with open('uk_oped_classifier_data.tsv', 'w') as oped_out:
    for story_text, label in all_text_and_labels:
        try:
            # Flatten tabs/newlines so each record stays on one TSV line.
            oped_out.write(label + '\t' + story_text.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ') + '\n')
        except (TypeError, AttributeError):
            # Non-string text (e.g. a stray float) has no .replace and
            # cannot be concatenated; count it and skip the row.
            count = count + 1
print('%d Errors Writing' % count)
In [25]:
# BUG FIX for the "minor discrepancy in output counts": the TSV was written
# WITHOUT a header row, but read_csv defaults to header='infer', which
# consumed the first record as column names and made the count one short.
# Read with explicit column names instead.
open_output = pd.read_csv('uk_oped_classifier_data.tsv', sep='\t',
                          header=None, names=['label', 'text'])
print(len(open_output))
print(len(all_text_and_labels))
# True unless some rows failed to write in the export cell above.
len(open_output) == len(all_text_and_labels)
Out[25]: