Import useful libraries


In [2]:
import nltk
import re, pprint
import os
import csv
from nltk.stem.snowball import SnowballStemmer
import random
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np

Install NLTK components:

nltk.download_gui()

The above opens the NLTK download GUI.

Select the following packages:

stopwords (under Corpora)
averaged_perceptron_tagger (under Models)
wordnet (under Corpora)

OR you can download all the NLTK components with: nltk.download('all')

Please note: downloading everything can take a while (30-60 minutes, depending on your internet speed)
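If you prefer to skip the GUI, you can instead fetch just the three packages used in this notebook programmatically (a minimal sketch using their standard package ids):

nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')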


In [ ]:
sentence = "The big brown fox jumped over a lazy dog."
sentence2 = "Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo"

Preprocess


In [ ]:
# Convert the sentence to lower case.
# String comparison is case sensitive: 'This' == 'this' evaluates to False.
lower_sentence = sentence.lower()
lower_sentence

Tokenize - extract individual words


In [ ]:
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(lower_sentence)
tokens

Filter the tokens to remove non-useful (stop) words


In [ ]:
filtered_words = [word for word in tokens if word not in stopwords.words('english')]
filtered_words

In [ ]:
def preprocess(sentence):
    # Lower-case, tokenize, and remove stop words
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence.lower())
    filtered_words = [word for word in tokens if word not in stopwords.words('english')]
    return filtered_words

In [ ]:
preprocessed_sentence = preprocess(sentence)
print(preprocessed_sentence)

Tagging


In [ ]:
# Use the parts of speech tagging
tags = nltk.pos_tag(preprocessed_sentence)
print(tags)
# see https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html for more details

Extracting only nouns, verbs, and other content words


In [ ]:
def filter_words(sentences):
    features = []
    for tagged_word in sentences:
        word, tag = tagged_word
        # Keep nouns, verbs, adverbs, pronouns and adjectives
        if tag in ('NN', 'VBN', 'NNS', 'VBP', 'RB', 'VBZ', 'VBG', 'PRP', 'JJ'):
            features.append(word)
    return features

In [ ]:
filter_words(tags)
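
As an extra check, the same tag-and-filter pipeline can be run on sentence2 (a sketch; the exact output depends on how the tagger labels the repeated words):


In [ ]:
filter_words(nltk.pos_tag(preprocess(sentence2)))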

Lemmatize words


In [ ]:
lmtzr = WordNetLemmatizer()
print(lmtzr.lemmatize('cacti'))
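
The lemmatizer also handles irregular forms; for example, 'feet' (which also appears in the stemming list below) should map back to its base form:


In [ ]:
print(lmtzr.lemmatize('feet'))  # expected: foot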

Stem words


In [ ]:
words_for_stemming = ['stem', 'stemming', 'stemmed', 'stemmer', 'stems','feet','willing']

In [ ]:
stemmer = SnowballStemmer("english")
# Stem each of the words above
print([stemmer.stem(word) for word in words_for_stemming])

Putting it all together


In [ ]:
def extract_feature(text):
    # Preprocess, keep only the useful parts of speech, then lemmatize and stem
    words = filter_words(nltk.pos_tag(preprocess(text)))
    stemmed_words = [stemmer.stem(lmtzr.lemmatize(word)) for word in words]
    return stemmed_words

In [ ]:
words = extract_feature(sentence)
print(words)

Implementing bag of words


In [ ]:
def word_feats(words):
    return dict([(word, True) for word in words])

In [ ]:
word_feats(words)
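
To see the shape of the featureset, you can also call it on a small hand-written token list (a hypothetical example):


In [ ]:
word_feats(['big', 'brown', 'fox'])  # -> {'big': True, 'brown': True, 'fox': True}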

Parsing the whole document


In [ ]:
def extract_feature_from_doc(data):
    result = []
    corpus = []
    for (text, category, answer) in data:
        # Build a (featureset, label) pair for each row and collect its words for the corpus
        features = extract_feature(text)
        result.append((word_feats(features), category))
        corpus.append(features)
    return (result, sum(corpus, []))

In [ ]:
extract_feature_from_doc([['this is text','category','answer to give']])

In [ ]:
def get_content(filename):
    doc = os.path.join(filename)
    with open(doc, 'r') as content_file:
        lines = csv.reader(content_file,delimiter='|')
        data = [x for x in lines if len(x) == 3]
        return data
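
get_content keeps only rows that split into exactly three '|'-separated fields (text, category, answer), so each line of the data file is assumed to look like this (a hypothetical line mirroring the test row above):

this is text|category|answer to give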

In [ ]:
filename = 'leaves.txt'
data = get_content(filename)

In [ ]:
features_data, corpus = extract_feature_from_doc(data)

In [ ]:
print(features_data[10])

Train a model using these features


In [ ]:
## split data into train and test sets
split_ratio = 0.8

In [ ]:
def split_dataset(data, split_ratio):
    random.shuffle(data)
    data_length = len(data)
    train_split = int(data_length * split_ratio)
    return (data[:train_split]), (data[train_split:])
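
random.shuffle makes the split different on every run; if you want a reproducible train/test split, you can seed the random module first (optional):


In [ ]:
random.seed(42)  # any fixed seed makes the shuffle, and hence the split, repeatable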

In [ ]:
training_data, test_data = split_dataset(features_data, split_ratio)

In [ ]:
def train_using_decision_tree(training_data, test_data):
    # Train NLTK's decision tree classifier: http://www.nltk.org/api/nltk.classify.html
    classifier = nltk.classify.DecisionTreeClassifier.train(training_data)
    classifier_name = type(classifier).__name__
    training_set_accuracy = nltk.classify.accuracy(classifier, training_data)
    print('training set accuracy: ', training_set_accuracy)
    test_set_accuracy = nltk.classify.accuracy(classifier, test_data)
    print('test set accuracy: ', test_set_accuracy)
    return classifier, classifier_name, test_set_accuracy, training_set_accuracy

In [ ]:
classifier, classifier_name, test_set_accuracy, training_set_accuracy = train_using_decision_tree(training_data, test_data)

In [ ]:
print(classifier.pretty_format())

In [ ]: