In [2]:
import re
import random
from os import path

from testing_util import get_terms
import nltk


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-2-eaed3979feb1> in <module>()
      3 from os import path
      4 
----> 5 from testing_util import get_terms
      6 import nltk

ImportError: cannot import name get_terms

In [ ]:
def category_term_scorer(sample_list):
    """
    takes a list of tuples in format (category,text) and creates scores for each term
    for its relevance to each category
    """
    categories = {}
    
    for c,s in sample_list:
        if c not in categories:
            categories[c]=[]
        for w in get_terms(s):
            categories[c].append(w)
    
    fd_all = nltk.FreqDist([w for wl in categories.values() for w in wl])
    
    fd_categories = {c:nltk.FreqDist(v) for c,v in categories.iteritems()}
    
    term_scores = {}
    for term in fd_all.iterkeys():
        d = {}
        for c,fd in fd_categories.iteritems():
            d[c]= 1 if fd.freq(term) > fd_all.freq(term) else 0
        term_scores[term]=d
    
    return term_scores

In [ ]:
class TermScoreClassiffier(nltk.classify.ClassifierI):
    """
    Tries to classify text using scored terms. 
    """
    
    def __init__(self, samples=None, scorer=category_term_scorer, terms=None, key="TermScore"):
        """
        Params:
        
        samples -- a list of samples where each entry is a tuple in format (category,text)
                this argument only works if scorer is also passed. 
                
        scorer -- a function that takes the list of samples and scores them. Must return a dictionary
                in the same format as terms
        
        terms -- a dictionary of terms where keys are the terms and values are dictionaries 
        with the score for each category. ie: {"term": {"c1":0, "c2":10}
        
        key -- The key to used in the returned dictionary. 
        """
        self.key = key
        
        if samples and scorer:
            terms = scorer(samples)
        
        if not terms:
            raise ValueError("You must either pass a list of samples or a list of terms")
        
        self.terms = terms
    
    def __call__(self, text):
        """
        Picks a category for text using the term list
        """
        
        tokens = nltk.word_tokenize(text)
        scores = {}
        for w in tokens:
            if w in self.terms:
                for c,s in self.terms[w].iteritems():
                    if c in scores:
                        scores[c] += s
                    else:
                        scores[c] = s
        
        totals = scores.items()
        totals.sort(key= lambda s:s[1], reverse=True)
        
        return {self.key: totals[0][0]}