Based on the "NLTK with Python 3 for Natural Language Processing" tutorial series by Sentdex
In [1]:
import nltk
from nltk import tokenize
In [2]:
# TODO: we don't really want to download packages each time we launch this script,
# so it would be better to check whether the packages are already present — or download them on demand
# nltk.download()
In [3]:
example = 'Hello Mr. Smith, how are you doing today? The weather is great, and Python is awesome. ' \
'The sky is pinkish-blue. You shouldn\'t eat cardboard.'
In [4]:
# Split the sample text into sentences; the last expression is displayed as the cell output.
example_sentences = tokenize.sent_tokenize(example)
example_sentences
Out[4]:
In [5]:
# Split the same text into word/punctuation tokens; displayed as the cell output.
example_tokens = tokenize.word_tokenize(example)
example_tokens
Out[5]:
In [6]:
from nltk import corpus, tokenize
In [7]:
# Remove English stop words from a sample sentence.
example_sentence = 'This is a sample sentence, showing off the stop words filtration.'
stop_words = set(corpus.stopwords.words('english'))
words = tokenize.word_tokenize(example_sentence)
# NLTK's stop-word list is all lower-case, so compare case-insensitively;
# otherwise capitalized stop words such as "This" slip through the filter.
filtered_sentence = [w for w in words if w.lower() not in stop_words]
print(filtered_sentence)
In [8]:
from nltk import stem, tokenize
In [9]:
# Porter stemmer and a set of "python" variations to demonstrate stemming;
# both names are reused by the cells below, so they must keep these names.
ps = stem.PorterStemmer()
example_words = ['python', 'pythoner', 'pythoning', 'pythoned', 'pythonly', 'pythonic', 'pythonista']
In [10]:
# Show each word next to its Porter stem; displayed as the cell output.
['{} --> {}'.format(word, ps.stem(word)) for word in example_words]
Out[10]:
In [11]:
# A longer sample sentence (typos included, matching the original tutorial)
# for stemming tokenized running text in the next cell.
example_text = ('It is important to by very pythonly while you are pythoning '
                'with python. All pythoners have pythoned poorly at least once.')
In [12]:
# Tokenize the running text and stem every token; displayed as the cell output.
['{} --> {}'.format(token, ps.stem(token)) for token in tokenize.word_tokenize(example_text)]
Out[12]:
POS tag list:
CC coordinating conjunction
CD cardinal digit
DT determiner
EX existential there (like: "there is" ... think of it like "there exists")
FW foreign word
IN preposition/subordinating conjunction
JJ adjective 'big'
JJR adjective, comparative 'bigger'
JJS adjective, superlative 'biggest'
LS list marker 1)
MD modal could, will
NN noun, singular 'desk'
NNS noun plural 'desks'
NNP proper noun, singular 'Harrison'
NNPS proper noun, plural 'Americans'
PDT predeterminer 'all the kids'
POS possessive ending parent's
PRP personal pronoun I, he, she
PRP$ possessive pronoun my, his, hers
RB adverb very, silently,
RBR adverb, comparative better
RBS adverb, superlative best
RP particle give up
TO to go 'to' the store.
UH interjection errrrrrrrm
VB verb, base form take
VBD verb, past tense took
VBG verb, gerund/present participle taking
VBN verb, past participle taken
VBP verb, sing. present, non-3d take
VBZ verb, 3rd person sing. present takes
WDT wh-determiner which
WP wh-pronoun who, what
WP$ possessive wh-pronoun whose
WRB wh-adverb where, when
In [13]:
import nltk
from nltk import corpus, tokenize
In [14]:
# Raw text of two State of the Union addresses from NLTK's state_union corpus:
# the 2005 speech trains the Punkt sentence tokenizer below, the 2006 speech
# is the text that gets tokenized and POS-tagged.
# NOTE(review): requires the 'state_union' corpus to be downloaded — confirm.
train_text = corpus.state_union.raw('2005-GWBush.txt')
sample_text = corpus.state_union.raw('2006-GWBush.txt')
In [15]:
# Map tag to description, useful for annotations
tag_to_description = {
'CC': 'coordinating conjunction',
'CD': 'cardinal digit',
'DT': 'determiner',
'EX': 'existential there (like: "there is" ... think of it like "there exists")',
'FW': 'foreign word',
'IN': 'preposition/subordinating conjunction',
'JJ': 'adjective "big"',
'JJR': 'adjective, comparative "bigger"',
'JJS': 'adjective, superlative "biggest"',
'LS': 'list marker 1)',
'MD': 'modal could, will',
'NN': 'noun, singular "desk"',
'NNS': 'noun plural "desks"',
'NNP': 'proper noun, singular "Harrison"',
'NNPS': 'proper noun, plural "Americans"',
'PDT': 'predeterminer "all tdhe kids"',
'POS': 'possessive ending parent"s',
'PRP': 'personal pronoundß I, he, she',
'PRP$': 'possessive pronoun my, his, hers',
'RB': 'adverb very, silently,',
'RBR': 'adverb, comparative better',
'RBS': 'adverb, superlative best',
'RP': 'particle give up',
'TO': 'to go "to" the store.',
'UH': 'interjection errrrrrrrm',
'VB': 'verb, base form take',
'VBD': 'verb, past tense took',
'VBG': 'verb, gerund/present participle taking',
'VBN': 'verb, past participle taken',
'VBP': 'verb, sing. present, non-3d take',
'VBZ': 'verb, 3rd person sing. present takes',
'WDT': 'wh-determiner which',
'WP': 'wh-pronoun who, what',
'WP$': 'possessive wh-pronoun whose',
'WRB': 'wh-abverb where, when',
}
In [16]:
from collections import Counter
from operator import itemgetter  # attrgetter was imported but never used

# Train an unsupervised Punkt sentence tokenizer on the 2005 speech, then use
# it to split the 2006 speech into sentences.
custom_sent_tokenizer = tokenize.PunktSentenceTokenizer(train_text)
tokenized_text = custom_sent_tokenizer.tokenize(sample_text)

# POS-tag the first few sentences and accumulate per-tag counts.
total_counts = Counter()
for sentence in tokenized_text[:5]:
    words = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(words)
    print('# Sentence:')
    print(sentence)
    print('# Words:')
    print(words)
    print('# Tagged:')
    print(tagged)
    total_counts += Counter(tag for word, tag in tagged)
    print('\n')

# Relative frequency of each tag.  Items are taken in sorted (alphabetical)
# order so that tags with equal frequency keep a stable, alphabetical order in
# the value-sorted report below (sorted() is stable).
total = sum(total_counts.values())
freq = {tag: count / total for tag, count in sorted(total_counts.items())}
print('# Counts:')
print('\n\n-----\n\n'.join(
    '{}\n[{}] {}'.format(f, tag, tag_to_description.get(tag, tag))
    for tag, f in sorted(freq.items(), key=itemgetter(1), reverse=True)))
In [17]:
# Render matplotlib figures inline in the notebook (modern Jupyter does this
# by default; the explicit magic is harmless).
%matplotlib inline
In [18]:
import matplotlib as mpl
import matplotlib.pyplot as plt

# Chunk grammar: an optional run of adverbs (<RB.?>*) and verbs (<VB.?>*)
# followed by one or more proper nouns (<NNP>+) and an optional common noun.
chunk_grammar = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
chunk_parser = nltk.RegexpParser(chunk_grammar)
for sentence in tokenized_text[:5]:
    words = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(words)
    chunked = chunk_parser.parse(tagged)
    # chunked.draw() opens a blocking Tkinter window, which fails on a
    # headless setup (e.g. Jupyter inside Docker) — the original TODO noted
    # exactly this.  Print the bracketed tree instead; for graphical inline
    # rendering see
    # https://stackoverflow.com/questions/31779707/how-do-you-make-nltk-draw-trees-that-are-inline-in-ipython-jupyter
    chunked.pprint()