Import useful libraries


In [2]:
import nltk
import re, pprint
import os
import csv
from nltk.stem.snowball import SnowballStemmer
import random
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np

Install NLTK components:

nltk.download_gui()

The above opens the NLTK download GUI.

Select the following packages:

stopwords (under Corpora)
averaged_perceptron_tagger (under Models)
wordnet (under Corpora)

OR you can download all the NLTK components with: nltk.download('all')

Please note: downloading everything can take a while (30-60 minutes, depending on your internet speed)
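If you prefer to skip the GUI, you can instead fetch just the three packages used in this notebook programmatically (a minimal sketch using their standard package ids):

nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')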


In [ ]:
sentence = "The big brown fox jumped over a lazy dog."
sentence2 = "Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo"

Preprocess


In [ ]:
# Convert the sentence to lower case.
# String comparison is case sensitive: 'This' == 'this' evaluates to False.
lower_sentence = sentence.lower()
lower_sentence

Tokenize - extract individual words


In [ ]:
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(lower_sentence)
tokens

Filter the tokens to remove non-useful (stop) words


In [ ]:
filtered_words = [word for word in tokens if word not in stopwords.words('english')]
filtered_words

In [ ]:
def preprocess(sentence):
    # Lower-case, tokenize, and remove stop words
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence.lower())
    filtered_words = [word for word in tokens if word not in stopwords.words('english')]
    return filtered_words

In [ ]:
preprocessed_sentence = preprocess(sentence)
print(preprocessed_sentence)

Tagging


In [ ]:
# Use the parts of speech tagging
tags = nltk.pos_tag(preprocessed_sentence)
print(tags)
# see https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html for more details

Extracting only nouns, verbs, and other content words


In [ ]:
def filter_words(sentences):
    features = []
    for tagged_word in sentences:
        word, tag = tagged_word
        # Keep nouns, verbs, adverbs, pronouns and adjectives
        if tag in ('NN', 'VBN', 'NNS', 'VBP', 'RB', 'VBZ', 'VBG', 'PRP', 'JJ'):
            features.append(word)
    return features

In [ ]:
filter_words(tags)
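
As an extra check, the same tag-and-filter pipeline can be run on sentence2 (a sketch; the exact output depends on how the tagger labels the repeated words):


In [ ]:
filter_words(nltk.pos_tag(preprocess(sentence2)))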

Lemmatize words


In [ ]:
lmtzr = WordNetLemmatizer()
print(lmtzr.lemmatize('cacti'))
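
The lemmatizer also handles irregular forms; for example, 'feet' (which also appears in the stemming list below) should map back to its base form:


In [ ]:
print(lmtzr.lemmatize('feet'))  # expected: foot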

Stem words


In [ ]:
words_for_stemming = ['stem', 'stemming', 'stemmed', 'stemmer', 'stems','feet','willing']

In [ ]:
stemmer = SnowballStemmer("english")
# Stem each of the words above
print([stemmer.stem(word) for word in words_for_stemming])

Putting it all together


In [ ]:
def extract_feature(text):
    # Preprocess, keep only the useful parts of speech, then lemmatize and stem
    words = filter_words(nltk.pos_tag(preprocess(text)))
    stemmed_words = [stemmer.stem(lmtzr.lemmatize(word)) for word in words]
    return stemmed_words

In [ ]:
words = extract_feature(sentence)
print(words)

Implementing bag of words


In [ ]:
def word_feats(words):
    return dict([(word, True) for word in words])

In [ ]:
word_feats(words)
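
To see the shape of the featureset, you can also call it on a small hand-written token list (a hypothetical example):


In [ ]:
word_feats(['big', 'brown', 'fox'])  # -> {'big': True, 'brown': True, 'fox': True}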

Parsing the whole document


In [ ]:
def extract_feature_from_doc(data):
    result = []
    corpus = []
    for (text, category, answer) in data:
        # Build a (featureset, label) pair for each row and collect its words for the corpus
        features = extract_feature(text)
        result.append((word_feats(features), category))
        corpus.append(features)
    return (result, sum(corpus, []))

In [ ]:
extract_feature_from_doc([['this is text','category','answer to give']])

In [ ]:
def get_content(filename):
    doc = os.path.join(filename)
    with open(doc, 'r') as content_file:
        lines = csv.reader(content_file,delimiter='|')
        data = [x for x in lines if len(x) == 3]
        return data
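
get_content keeps only rows that split into exactly three '|'-separated fields (text, category, answer), so each line of the data file is assumed to look like this (a hypothetical line mirroring the test row above):

this is text|category|answer to give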

In [ ]:
filename = 'leaves.txt'
data = get_content(filename)

In [ ]:
features_data, corpus = extract_feature_from_doc(data)

In [ ]:
print(features_data[10])

Train a model using these features


In [ ]:
## split data into train and test sets
split_ratio = 0.8

In [ ]:
def split_dataset(data, split_ratio):
    random.shuffle(data)
    data_length = len(data)
    train_split = int(data_length * split_ratio)
    return (data[:train_split]), (data[train_split:])
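
random.shuffle makes the split different on every run; if you want a reproducible train/test split, you can seed the random module first (optional):


In [ ]:
random.seed(42)  # any fixed seed makes the shuffle, and hence the split, repeatable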

In [ ]:
training_data, test_data = split_dataset(features_data, split_ratio)

In [ ]:
def train_using_decision_tree(training_data, test_data):
    # Train NLTK's decision tree classifier: http://www.nltk.org/api/nltk.classify.html
    classifier = nltk.classify.DecisionTreeClassifier.train(training_data)
    classifier_name = type(classifier).__name__
    training_set_accuracy = nltk.classify.accuracy(classifier, training_data)
    print('training set accuracy: ', training_set_accuracy)
    test_set_accuracy = nltk.classify.accuracy(classifier, test_data)
    print('test set accuracy: ', test_set_accuracy)
    return classifier, classifier_name, test_set_accuracy, training_set_accuracy

In [ ]:
classifier, classifier_name, test_set_accuracy, training_set_accuracy = train_using_decision_tree(training_data, test_data)

In [ ]:
print(classifier.pretty_format())

In [ ]: