In [1]:
import re
import random
from os import path

import nltk

In [2]:
path_train = path.join(path.curdir, "train.txt")
path_final_testing = path.join(path.curdir, "test.csv")

In [3]:
def file_to_sets(fname, ignore_header=True):
    """
    Takes a file where each line is in the format "category,text" and turns it into a list of tuples
    in format "(category, text)"
    """
    
    f = open(fname, 'r')
    
    if ignore_header:
        # This skips the first line of the file
        next(f)
    
    out = []
    for line in f:
        # iterate over lines, use simple regex to separate the category from text
        out.append(re.match(r"(\d+),(.+$)", line).groups())
    
    f.close()
    return out

In [5]:
sample_sets = file_to_sets(path_train, ignore_header=False)
final_sets = file_to_sets(path_final_testing, ignore_header=True)

In [6]:
def get_sets(samples, test_fraction=3):
    """
    takes a set of samples, shuffles them, then returns two lists, train_sets and test_sets. 
    The size of test_sets is len(samples)/test_fraction, train_sets is the remainder. 
    """
    
    test_size = int(len(samples)/test_fraction)
    test_sets = samples[0:test_size]
    train_sets = samples[test_size:]
    
    return train_sets, test_sets

In [7]:
train_sets, test_sets = get_sets(sample_sets)

In [8]:
class FeatureExtractor(object):
    """A class to make it easy to combine and shuffle around feature extractors"""
    
    def __init__(self, extractors):
        """
        Takes a list of extractors to use in extracting features. 
        Extractors should take a piece of text and return a dictionary where the key is
        the desired key and the value is the feature value. 
        """
        if type(extractors) != 'list':
            extractors = [extractors]
        self.extractors = extractors
        
    def __call__(self, text):
        features = {}
        for e in self.extractors:
            f = e(text)
            for k, v in f.iteritems():
                features[k]=v
        
        return features
    
    def add_extractor(self,extractor):
        self.extractors.append(extractor)

In [9]:
stopwords = nltk.corpus.stopwords.words('english')

def get_terms(t):
    tokens = nltk.word_tokenize(t)
    return [w for w in tokens if w not in stopwords]

In [12]:
def category_term_scorer(sample_list):
    """
    takes a list of tuples in format (category,text) and creates scores for each term
    for its relevance to each category
    """
    categories = {}
    
    for c,s in sample_list:
        if c not in categories:
            categories[c]=[]
        for w in get_terms(s):
            categories[c].append(w)
    
    fd_all = nltk.FreqDist([w for wl in categories.values() for w in wl])
    
    fd_categories = {c:nltk.FreqDist(v) for c,v in categories.iteritems()}
    
    term_scores = {}
    for term in fd_all.iterkeys():
        d = {}
        for c,fd in fd_categories.iteritems():
            d[c]= 1 if fd.freq(term) > fd_all.freq(term) else 0
        term_scores[term]=d
    
    return term_scores

In [13]:
class TermScoreClassiffier(nltk.classify.ClassifierI):
    """
    Tries to classify text using scored terms. 
    """
    
    def __init__(self, samples=None, scorer=category_term_scorer, terms=None, key="TermScore"):
        """
        Params:
        
        samples -- a list of samples where each entry is a tuple in format (category,text)
                this argument only works if scorer is also passed. 
                
        scorer -- a function that takes the list of samples and scores them. Must return a dictionary
                in the same format as terms
        
        terms -- a dictionary of terms where keys are the terms and values are dictionaries 
        with the score for each category. ie: {"term": {"c1":0, "c2":10}
        
        key -- The key to used in the returned dictionary. 
        """
        self.key = key
        
        if samples and scorer:
            terms = scorer(samples)
        
        if not terms:
            raise ValueError("You must either pass a list of samples or a list of terms")
        
        self.terms = terms
    
    def __call__(self, text):
        """
        Picks a category for text using the term list
        """
        
        tokens = nltk.word_tokenize(text)
        scores = {}
        for w in tokens:
            if w in self.terms:
                for c,s in self.terms[w].iteritems():
                    if c in scores:
                        scores[c] += s
                    else:
                        scores[c] = s
        
        totals = scores.items()
        totals.sort(key= lambda s:s[1], reverse=True)
        
        return {self.key: setotals[0][0]}

In [14]:
classifier = TermScoreClassiffier(train_sets)

In [21]:
float(len([1 for s in test_sets if s[0] == classifier(s[1])]))/len(test_sets)


Out[21]:
1799