In [1]:
import re
import random
from os import path
import nltk
import testing_util as util
import feature_extractors
from testing_util import sample_sets, final_sets
In [2]:
def feature_num_tokens(text):
    # Token count, scaled down by a factor of 10 so it stays in a range
    # comparable to the other features.
    return {"num_tokens": len(nltk.word_tokenize(text)) / 10}
In [3]:
def feature_avg_len_word(text):
    # Mean token length; empty input would otherwise divide by zero.
    tokens = nltk.word_tokenize(text)
    if not tokens:
        return {"word_len": 0}
    return {"word_len": sum(len(w) for w in tokens) / len(tokens)}
In [4]:
# Term-based scorer fitted on the labelled sample data
feature_term_score = feature_extractors.TermScoreClassiffier(sample_sets)
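TermScoreClassiffier lives in the local feature_extractors module, whose source is not shown here. A minimal sketch of what such a term-score feature could look like, assuming sample_sets is an iterable of (text, label) pairs (the real class may differ):

from collections import Counter, defaultdict

class TermScoreSketch:
    # Hypothetical stand-in for feature_extractors.TermScoreClassiffier;
    # assumes the training data is an iterable of (text, label) pairs.
    def __init__(self, labeled_texts):
        self.term_counts = defaultdict(Counter)
        for text, label in labeled_texts:
            self.term_counts[label].update(nltk.word_tokenize(text.lower()))

    def __call__(self, text):
        tokens = nltk.word_tokenize(text.lower())
        # One feature per label: mean per-token frequency in that label's texts
        return {"term_score_%s" % label: sum(c[t] for t in tokens) / max(len(tokens), 1)
                for label, c in self.term_counts.items()}

Making the instance callable lets it be registered alongside the plain-function extractors in the next cell.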
In [5]:
# Combine the three feature extractors into a single pipeline
extractor = util.FeatureExtractor()
extractor.add_extractor(feature_num_tokens)
extractor.add_extractor(feature_avg_len_word)
extractor.add_extractor(feature_term_score)
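util.FeatureExtractor is also local code; a plausible minimal version simply merges the dicts produced by each registered callable (a sketch under that assumption, not the actual implementation):

class FeatureExtractorSketch:
    def __init__(self):
        self.extractors = []

    def add_extractor(self, fn):
        # fn: callable mapping text -> dict of feature name -> value
        self.extractors.append(fn)

    def extract(self, text):
        features = {}
        for fn in self.extractors:
            features.update(fn(text))  # later extractors win on name clashes
        return features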
In [6]:
# Estimate performance with 3-fold cross-validation on the sample data
util.fold_test_extractor(extractor, sample_sets, folds=3)
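fold_test_extractor presumably runs k-fold cross-validation over the labelled samples. A sketch of that idea with NLTK's Naive Bayes classifier, again assuming sample_sets is a list of (text, label) pairs (hypothetical helper, not util's code):

def fold_test_sketch(extractor, labeled_texts, folds=3):
    data = [(extractor.extract(text), label) for text, label in labeled_texts]
    random.shuffle(data)
    scores = []
    for i in range(folds):
        test = data[i::folds]  # every folds-th item as the held-out set
        train = [d for j, d in enumerate(data) if j % folds != i]
        cl = nltk.NaiveBayesClassifier.train(train)
        scores.append(nltk.classify.accuracy(cl, test))
    return sum(scores) / folds  # mean accuracy across folds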
In [7]:
# Extract features for the unlabelled final (test) set
final_features = util.make_feature(extractor, final_sets)
In [8]:
# Train the final classifier on all of the labelled samples
cl = util.make_classifier(extractor, sample_sets)
In [9]:
# Classify the final set and write the predictions to a CSV file
submission = util.make_submission(cl, final_features, writeto="submission1.csv")
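make_submission is again local code; a sketch of the final step, assuming final_features is a list of (id, featureset) pairs and the output wants one id and predicted label per row (the column names and row format here are guesses):

import csv

def make_submission_sketch(classifier, id_featuresets, writeto):
    with open(writeto, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["id", "label"])  # assumed header
        for item_id, features in id_featuresets:
            writer.writerow([item_id, classifier.classify(features)])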