In [2]:
import nltk
import re, pprint
import os
import csv
from nltk.stem.snowball import SnowballStemmer
import random
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
nltk.download_gui()
Select the following in the downloader:
stopwords (from the Corpora tab)
averaged_perceptron_tagger (from the All Packages tab)
wordnet
OR you can download all the NLTK components with: nltk.download()
Please note: downloading everything can take a while (30-60 minutes, depending on Internet speed).
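If you prefer a non-interactive setup, the same packages can also be downloaded programmatically; a minimal sketch covering only the packages listed above:
In [ ]:
# fetch just the resources this notebook needs
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')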
In [ ]:
sentence = "The big brown fox jumped over a lazy dog."
sentence2 = "Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo"
In [ ]:
# convert the sentence to lower case ('This' == 'this' evaluates to False, so case matters)
sentence = sentence.lower()
sentence
In [ ]:
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(sentence)  # keep runs of word characters, dropping punctuation
tokens
In [ ]:
filtered_words = [w for w in tokens if w not in stopwords.words('english')]  # remove English stop words
filtered_words
In [ ]:
def preprocess(sentence):
    # Your code here:
    return filtered_words
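One possible way to fill in preprocess, chaining the steps from the previous cells (lower-casing, RegexpTokenizer tokenization, stop-word removal); treat it as a sketch rather than the only valid answer:
In [ ]:
def preprocess(sentence):
    # lower-case, tokenize on word characters, then drop English stop words
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence.lower())
    filtered_words = [w for w in tokens if w not in stopwords.words('english')]
    return filtered_words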
In [ ]:
preprocessed_sentence = preprocess(sentence)
print(preprocessed_sentence)
In [ ]:
# Use the parts of speech tagging
tags = nltk.pos_tag(preprocessed_sentence)  # nltk.pos_tag expects a list of tokens
print(tags)
# see https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html for more details
In [ ]:
def filter_words(sentences):
    # keep only words whose Penn Treebank tag marks them as nouns, verbs, adverbs, pronouns or adjectives
    features = []
    for tagged_word in sentences:
        word, tag = tagged_word
        if tag in ('NN', 'VBN', 'NNS', 'VBP', 'RB', 'VBZ', 'VBG', 'PRP', 'JJ'):
            features.append(word)
    return features
In [ ]:
filter_words(tags)
In [ ]:
lmtzr = WordNetLemmatizer()
print(lmtzr.lemmatize('cacti'))
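The lemmatizer treats words as nouns by default; passing a part-of-speech hint can change the result. A small illustration (not needed for the rest of the notebook):
In [ ]:
print(lmtzr.lemmatize('running'))       # treated as a noun, stays 'running'
print(lmtzr.lemmatize('running', 'v'))  # treated as a verb, becomes 'run'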
In [ ]:
words_for_stemming = ['stem', 'stemming', 'stemmed', 'stemmer', 'stems','feet','willing']
In [ ]:
stemmer = SnowballStemmer("english")
# stem words, e.g. by applying the stemmer to each word in the list above
stemmed = [stemmer.stem(w) for w in words_for_stemming]
print(stemmed)
In [ ]:
def extract_feature(text):
    # Your code here
    return stemmed_words
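A possible completion that chains the helpers defined above (preprocess, nltk.pos_tag, filter_words and the Snowball stemmer); this is one reading of the intended pipeline, not the only one:
In [ ]:
def extract_feature(text):
    # preprocess -> POS-tag -> keep informative tags -> stem each surviving word
    words = preprocess(text)
    tagged = nltk.pos_tag(words)
    kept = filter_words(tagged)
    stemmed_words = [stemmer.stem(w) for w in kept]
    return stemmed_words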
In [ ]:
words = extract_feature(sentence)
print(words)
In [ ]:
def word_feats(words):
    # bag-of-words featureset in the {word: True} dict format that NLTK classifiers expect
    return dict([(word, True) for word in words])
In [ ]:
word_feats(words)
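These featuresets only become useful for training once each one is paired with a label; producing that pairing for every (text, category, answer) row is what extract_feature_from_doc below is expected to do.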
In [ ]:
def extract_feature_from_doc(data):
    result = []
    corpus = []
    for (text, category, answer) in data:
        # Your Code here
        pass
    return (result, sum(corpus, []))
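One way the loop body could be filled in, so that result holds (featureset, category) pairs for the classifier and corpus collects every extracted word; this is inferred from the return value and the cells that follow, so adapt as needed:
In [ ]:
def extract_feature_from_doc(data):
    result = []
    corpus = []
    for (text, category, answer) in data:
        features = extract_feature(text)                  # stemmed, filtered words for this text
        corpus.append(features)                           # flattened into one word list by sum(corpus, [])
        result.append((word_feats(features), category))   # labelled featureset for training
    return (result, sum(corpus, []))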
In [ ]:
extract_feature_from_doc([['this is text','category','answer to give']])
In [ ]:
def get_content(filename):
    doc = os.path.join(filename)
    with open(doc, 'r') as content_file:
        lines = csv.reader(content_file, delimiter='|')
        data = [x for x in lines if len(x) == 3]  # keep only well-formed rows with exactly three fields
    return data
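get_content therefore expects every usable line of the input file to contain three pipe-separated fields, i.e. text|category|answer; rows with any other shape are silently dropped by the len(x) == 3 check.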
In [ ]:
filename = 'leaves.txt'
data = get_content(filename)
In [ ]:
features_data, corpus = extract_feature_from_doc(data)
In [ ]:
print(features_data[10])
In [ ]:
## split data into train and test sets
split_ratio = 0.8
In [ ]:
def split_dataset(data, split_ratio):
    # shuffle in place, then cut the list at split_ratio into (train, test)
    random.shuffle(data)
    data_length = len(data)
    train_split = int(data_length * split_ratio)
    return (data[:train_split]), (data[train_split:])
In [ ]:
training_data, test_data = split_dataset(features_data, split_ratio)
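Because random.shuffle reorders the data in place, each run produces a different split; calling random.seed with a fixed value (for example random.seed(42), an arbitrary choice) before split_dataset makes the split reproducible.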
In [ ]:
def train_using_decision_tree(training_data, test_data):
    # Implement NLTK's decision tree classifier: http://www.nltk.org/api/nltk.classify.html
    classifier = None  # Your code here: train the classifier on training_data
    classifier_name = type(classifier).__name__
    training_set_accuracy = None  # What's the accuracy on the training data?
    print('training set accuracy: ', training_set_accuracy)
    test_set_accuracy = None  # What's the accuracy on the test data?
    print('test set accuracy: ', test_set_accuracy)
    return classifier, classifier_name, test_set_accuracy, training_set_accuracy
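A sketch of one way to complete the function, using nltk.classify.DecisionTreeClassifier (as the hint above suggests) and nltk.classify.accuracy; hyperparameters are left at their defaults:
In [ ]:
def train_using_decision_tree(training_data, test_data):
    # train NLTK's decision tree on the labelled featuresets
    classifier = nltk.classify.DecisionTreeClassifier.train(training_data)
    classifier_name = type(classifier).__name__
    training_set_accuracy = nltk.classify.accuracy(classifier, training_data)
    print('training set accuracy: ', training_set_accuracy)
    test_set_accuracy = nltk.classify.accuracy(classifier, test_data)
    print('test set accuracy: ', test_set_accuracy)
    return classifier, classifier_name, test_set_accuracy, training_set_accuracy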
In [ ]:
classifier, classifier_name, test_set_accuracy, training_set_accuracy = train_using_decision_tree(training_data, test_data)
In [ ]:
print(classifier.pretty_format())
In [ ]: