In [2]:
# lets start with the nltk package
from nltk.stem.snowball import SnowballStemmer
import nltk

Stemming- Snowball, try Porter


In [3]:
# Create a language specific stemmer instance
stemmer = SnowballStemmer("english")

In [4]:
print('running ---> {}'.format(stemmer.stem("running")))
print('expedition ---> {}'.format(stemmer.stem("expedition")))
print('resignation ---> {}'.format(stemmer.stem("resignation")))
print('Windows ---> {}'.format(stemmer.stem("Windows")))
print('assination ---> {}'.format(stemmer.stem("assination")))


running ---> run
expedition ---> expedit
resignation ---> resign
Windows ---> window
assination ---> assin

POS Tagging


In [ ]:
# A basic POS tagging
from nltk import pos_tag, word_tokenize
# sentence:
sentence = "Information Retrieval is the best course ever"
# 1. tokenize the words
text = word_tokenize(sentence)
# 2. Generate the Parts of Speech
print(pos_tag(text))

In [23]:
print('NN --> {}'.format("Noun"))
print('NNP --> {}'.format("Proper Noun"))
print('VBZ --> {}'.format("Verb"))
print('DT --> {}'.format("Determiner"))
print('JJS --> {}'.format("Adjective"))
print('RB --> {}'.format("Adverb"))


NN --> Noun
NNP --> Proper Noun
VBZ --> Verb
DT --> Determiner
JJS --> Adjective
RB --> Adverb

Chunking:


In [6]:
sentence = [("the", "DT"), ("little", "JJ"), ("black", "JJ"), ("cat", "NN"), ("meowed", "VBD"), 
            ("at", "IN"),  ("the", "DT"), ("dog", "NN")]
# Lets define a simple grammar using regular experssions
# Rule: 
## determiner (DT) is optional, it could be any number of adjectives (JJ) and a noun (NN)
grammar = "NP: {<DT>?<JJ>*<NN>}"

# Invoke the RegexParser and parse the sentence
cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)

In [14]:
print(result)


(S
  (NP the/DT little/JJ black/JJ cat/NN)
  meowed/VBD
  at/IN
  (NP the/DT dog/NN))