In [2]:
# Let's start with the NLTK package
from nltk.stem.snowball import SnowballStemmer
import nltk
In [3]:
# Create a language specific stemmer instance
stemmer = SnowballStemmer("english")
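In [ ]:
# Side note (not in the original run): the class exposes the set of
# supported languages, so "english" above is one of several options.
print(SnowballStemmer.languages)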
In [4]:
print('running ---> {}'.format(stemmer.stem("running")))
print('expedition ---> {}'.format(stemmer.stem("expedition")))
print('resignation ---> {}'.format(stemmer.stem("resignation")))
print('Windows ---> {}'.format(stemmer.stem("Windows")))
print('assassination ---> {}'.format(stemmer.stem("assassination")))
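In [ ]:
# A minimal sketch of stemming a whole sentence rather than single words,
# assuming the 'punkt' tokenizer data is installed (the example sentence
# here is made up for illustration).
from nltk import word_tokenize
example = "The runners were running through several expeditions"
print([stemmer.stem(token) for token in word_tokenize(example)])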
In [ ]:
# Basic part-of-speech (POS) tagging
from nltk import pos_tag, word_tokenize
# The sentence to tag:
sentence = "Information Retrieval is the best course ever"
# 1. Tokenize the sentence into words
text = word_tokenize(sentence)
# 2. Generate the part-of-speech tags
print(pos_tag(text))
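In [ ]:
# If the cell above raises a LookupError, the tokenizer and tagger models
# are likely missing; these are the standard NLTK resource names.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')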
In [23]:
print('NN --> {}'.format("Noun, singular or mass"))
print('NNP --> {}'.format("Proper noun, singular"))
print('VBZ --> {}'.format("Verb, 3rd person singular present"))
print('DT --> {}'.format("Determiner"))
print('JJS --> {}'.format("Adjective, superlative"))
print('RB --> {}'.format("Adverb"))
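In [ ]:
# Rather than memorizing the glosses above, NLTK can print the official
# Penn Treebank definition of any tag (requires the 'tagsets' data package).
nltk.download('tagsets')
nltk.help.upenn_tagset('VBZ')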
In [6]:
sentence = [("the", "DT"), ("little", "JJ"), ("black", "JJ"), ("cat", "NN"), ("meowed", "VBD"),
("at", "IN"), ("the", "DT"), ("dog", "NN")]
# Let's define a simple chunk grammar using regular expressions
# Rule: an NP chunk is an optional determiner (DT), followed by any number
# of adjectives (JJ), and ending in a noun (NN)
grammar = "NP: {<DT>?<JJ>*<NN>}"
# Invoke the RegexpParser and parse the tagged sentence
cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)
In [14]:
print(result)
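In [ ]:
# result is an nltk.Tree; the matched NP chunks can be pulled out by label
# (a small usage sketch, not part of the original notebook).
for subtree in result.subtrees(filter=lambda t: t.label() == 'NP'):
    print(subtree)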