In [1]:
import re
import random
from os import path
import nltk
In [2]:
path_train = path.join(path.curdir, "train.txt")
path_final_testing = path.join(path.curdir, "test.csv")
In [3]:
def file_to_sets(fname, ignore_header=True):
    """
    Takes a file where each line is in the format "category,text" and turns it
    into a list of tuples in the format (category, text).
    """
    out = []
    with open(fname, 'r') as f:
        if ignore_header:
            # Skip the header line of the file
            next(f)
        for line in f:
            # Split the category from the text with a simple regex
            out.append(re.match(r"(\d+),(.+$)", line).groups())
    return out
In [5]:
sample_sets = file_to_sets(path_train, ignore_header=False)
final_sets = file_to_sets(path_final_testing, ignore_header=True)
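As a quick sanity check (a minimal sketch, assuming both files parsed cleanly), every entry should be a (category, text) pair:

assert all(len(pair) == 2 for pair in sample_sets + final_sets)
print(len(sample_sets), len(final_sets))  # sample counts for each file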
In [6]:
def get_sets(samples, test_fraction=3):
    """
    Takes a list of samples, shuffles them, then returns two lists, train_sets and test_sets.
    The size of test_sets is len(samples)//test_fraction; train_sets is the remainder.
    """
    samples = samples[:]     # copy so the caller's list is not reordered
    random.shuffle(samples)  # shuffle so the split is not biased by file order
    test_size = len(samples) // test_fraction
    test_sets = samples[:test_size]
    train_sets = samples[test_size:]
    return train_sets, test_sets
In [7]:
train_sets, test_sets = get_sets(sample_sets)
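With the default test_fraction of 3, roughly a third of the samples are held out (a quick check against the split above):

print(len(train_sets), len(test_sets))
assert len(test_sets) == len(sample_sets) // 3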
In [8]:
class FeatureExtractor(object):
    """A class to make it easy to combine and shuffle around feature extractors."""
    def __init__(self, extractors):
        """
        Takes a list of extractors to use in extracting features.
        Each extractor should take a piece of text and return a dictionary
        mapping feature names to feature values.
        """
        if not isinstance(extractors, list):
            # Allow a single extractor to be passed without wrapping it in a list
            extractors = [extractors]
        self.extractors = extractors

    def __call__(self, text):
        features = {}
        for e in self.extractors:
            # Merge each extractor's output into one feature dictionary
            features.update(e(text))
        return features

    def add_extractor(self, extractor):
        self.extractors.append(extractor)
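A hypothetical usage sketch (both extractors are invented here for illustration) showing how two extractors' outputs merge into one feature dictionary:

def length_extractor(text):
    return {"length": len(text)}

def caps_extractor(text):
    return {"has_caps": any(c.isupper() for c in text)}

fx = FeatureExtractor([length_extractor, caps_extractor])
fx("Some Text")  # -> {'length': 9, 'has_caps': True}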
In [9]:
stopwords = nltk.corpus.stopwords.words('english')

def get_terms(t):
    """Tokenize a piece of text and drop English stopwords."""
    tokens = nltk.word_tokenize(t)
    return [w for w in tokens if w not in stopwords]
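If the tokenizer model or the stopword corpus has not been downloaded, the cell above raises a LookupError; a one-time download fixes it (assuming a standard NLTK install):

nltk.download('punkt')
nltk.download('stopwords')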
In [12]:
def category_term_scorer(sample_list):
    """
    Takes a list of tuples in the format (category, text) and scores each term
    for its relevance to each category.
    """
    categories = {}
    for c, s in sample_list:
        categories.setdefault(c, []).extend(get_terms(s))
    # One frequency distribution over all terms, plus one per category
    fd_all = nltk.FreqDist([w for wl in categories.values() for w in wl])
    fd_categories = {c: nltk.FreqDist(v) for c, v in categories.items()}
    term_scores = {}
    for term in fd_all.keys():
        # A term scores 1 for a category when it is relatively more frequent
        # there than in the corpus as a whole
        term_scores[term] = {c: 1 if fd.freq(term) > fd_all.freq(term) else 0
                             for c, fd in fd_categories.items()}
    return term_scores
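A tiny run on invented samples (hypothetical categories and texts, just to show the output shape) maps each term to a per-category 0/1 relevance score:

toy = [("1", "cats purr loudly"), ("2", "dogs bark loudly")]
category_term_scorer(toy)
# e.g. 'cats' -> {'1': 1, '2': 0}; 'loudly' -> {'1': 0, '2': 0} (no category edge)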
In [13]:
class TermScoreClassifier(nltk.classify.ClassifierI):
    """
    Tries to classify text using scored terms.
    """
    def __init__(self, samples=None, scorer=category_term_scorer, terms=None, key="TermScore"):
        """
        Params:
        samples -- a list of samples where each entry is a tuple in the format (category, text).
                   This argument only works if scorer is also passed.
        scorer -- a function that takes the list of samples and scores them. Must return a
                  dictionary in the same format as terms.
        terms -- a dictionary of terms where keys are the terms and values are dictionaries
                 with the score for each category, e.g. {"term": {"c1": 0, "c2": 10}}
        key -- the key to use in the returned dictionary.
        """
        self.key = key
        if samples and scorer:
            terms = scorer(samples)
        if not terms:
            raise ValueError("You must either pass a list of samples or a dictionary of terms")
        self.terms = terms

    def __call__(self, text):
        """
        Picks a category for the text using the term list.
        """
        scores = {}
        for w in nltk.word_tokenize(text):
            if w in self.terms:
                # Accumulate each known term's per-category scores
                for c, s in self.terms[w].items():
                    scores[c] = scores.get(c, 0) + s
        # Rank categories by total score and return the best one
        totals = sorted(scores.items(), key=lambda s: s[1], reverse=True)
        return {self.key: totals[0][0]}
In [14]:
classifier = TermScoreClassifier(train_sets)
In [21]:
sum(1 for c, t in test_sets if c == classifier(t)[classifier.key]) / len(test_sets)
Out[21]:
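Because __call__ returns a dictionary keyed by self.key, the trained classifier can also be dropped into FeatureExtractor alongside other extractors (a sketch, reusing the classifier fitted above):

fx = FeatureExtractor([classifier])
fx(test_sets[0][1])  # -> {'TermScore': <predicted category>}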