In [1]:
import re
import random
from os import path

import nltk

import testing_util as util
import feature_extractors
from testing_util import sample_sets, final_sets

In [2]:
def feature_num_tokens(text):
    """Feature: number of NLTK word tokens in `text`, scaled down by 10.

    The /10 keeps this feature's magnitude comparable to the other
    (roughly single-digit) features fed to the classifier.
    """
    tokens = nltk.word_tokenize(text)
    return {"num_tokens": len(tokens) / 10}

In [3]:
def feature_avg_len_word(text):
    """Feature: average character length of the word tokens in `text`.

    Returns {"word_len": mean token length}. For input that tokenizes to
    nothing (empty or whitespace-only text) this returns 0.0 instead of
    raising ZeroDivisionError, which the original did.
    """
    tokens = nltk.word_tokenize(text)
    if not tokens:
        # Guard: len(tokens) == 0 would divide by zero below.
        return {"word_len": 0.0}
    return {"word_len": sum(len(w) for w in tokens) / len(tokens)}

In [4]:
feature_term_score = feature_extractors.TermScoreClassiffier(sample_sets)

In [5]:
# Assemble the combined feature extractor from the three features above:
# token count, average word length, and the fitted term-score feature.
extractor = util.FeatureExtractor()
extractor.add_extractor(feature_num_tokens)
extractor.add_extractor(feature_avg_len_word)
extractor.add_extractor(feature_term_score)

In [6]:
util.fold_test_extractor(extractor, sample_sets, folds=3)


test 0 - 87.987%
test 1 - 85.762%
test 2 - 84.556%

In [7]:
final_features = util.make_feature(extractor, final_sets)

In [8]:
cl = util.make_classifier(extractor, sample_sets)

In [9]:
submission = util.make_submission(cl, final_features, writeto="submission1.csv")