In [1]:
# Ignore the "twython library missing" warning from NLTK — we aren't using its functionality.
# Requires NLTK data: run nltk.download() and fetch the Opinion Lexicon and Vader Lexicon.
# NOTE(review): star import pulls every nlp helper into the notebook namespace
# (tokenize, pos, freq, grams, sentimentGrams, ...); explicit imports would be
# clearer, but nlp's full export list isn't visible from here to do so safely.
from nlp import *

Tokenize and Tag


In [ ]:
# Sample text to tokenize, tag, and analyze in the cells below.
text = """I had my suspicions in early 2003. Who wouldn't? That is why when my wife, Stacy, out on her daily walk yesterday, I texted my Stacy that I was going shopping and I would be back soon."""

# Build a regex-based tokenizer (alphanumerics and '&', with an optional
# apostrophe + letter, e.g. "wouldn't" -> "wouldn't") and tokenize with it.
# The original `tokens = tokenize(text)` line was dead code: its result was
# immediately overwritten by the regex tokenizer's output, so it is removed.
tok = tokenizeFindAllRegex(r"""([A-Za-z0-9&]+[']?[A-Za-z]?)""")
tokens = tok(text)
tagged = pos(tokens)  # POS-tag the token list for the cells below
tokens

Lemmatize


In [ ]:
# Lemmatize the POS-tagged tokens; the lemma list is the cell's displayed output.
lemmatize(tagged)

Process Suffixes


In [ ]:
# Token suffix frequency distribution.
# Previously tokenSuffixes(tokens) was computed twice and the first call's
# result was silently discarded (only a cell's last expression is displayed);
# compute it once and show the frequency dict.
suffixes = tokenSuffixes(tokens)
dict(freq(suffixes))

Create N-Grams


In [ ]:
# Unigram (1-gram) frequency distribution over the tokens.
dict(freq(grams(tokens, 1)))

POS Grams


In [ ]:
# Strip the words, keeping just the POS tag sequence, and show
# the frequency distribution of tag trigrams.
freq(grams(posTagOnly(tagged), 3))

Syllable grams


In [ ]:
# Trigrams over the tokens' syllable representation.
syllableGrams(tokens, 3)

Vowelless grams


In [ ]:
# Trigrams over the vowel-stripped ("vowelless") token forms.
vowelGrams(tokens, 3)

Uppercase/Lowercase


In [ ]:
# Per-token upper/lower-case and length features; reused by the next three cells.
ull = upperLowerLen(tokens)

In [ ]:
# Frequency of capital letters derived from the case/length features above.
capLetterFreq(ull)

In [ ]:
# Per-token word-case labels (e.g. upper/lower/mixed); used by the next cell.
cases = wordCases(ull)

In [ ]:
# Frequency distribution of trigrams over the word-case label sequence.
freq(grams(cases, 3))

In [ ]:
# Sentiment for each half of the token stream, then for token trigrams.
# display() each result explicitly: only a cell's LAST expression is shown,
# so the first two calls' outputs were previously discarded.
mid = len(tokens) // 2  # same split point as int(len(tokens)/2)
display(sentimentGrams([tokens[:mid]]))
display(sentimentGrams([tokens[mid:]]))
# NOTE(review): other cells use grams(); `ngrams` here presumably comes from
# the nlp star import (nltk-style helper) — confirm it is intentional and
# not a typo for grams(tokens, 3).
sentimentGrams(ngrams(tokens, 3))

In [ ]:
# Sentiment over the entire token stream as a single sequence.
sentimentGrams([tokens])

Chunk and Entity Removal


In [ ]:
# Chunk the tagged tokens (named-entity chunking), then remove named
# entities — with numbers also removed in this variant.
chunked = chunk(tagged)
removeNamedEntities(chunked, removeNumbers=True)

In [ ]:
# Same removal as above, but keeping numeric tokens for comparison.
removeNamedEntities(chunked, removeNumbers=False)

Punctuation features


In [ ]:
# Punctuation-based features computed from the raw (untokenized) text.
punctuationFeatures(text)

In [ ]:


In [11]:
# Reddit-specific cleanup: per Out[11] below, the /u/ username is replaced
# with the 'NameTOK' placeholder and the trailing '~' is dropped.
cleanTokensReddit("/u/dog said to me that he's the funniest guy ever.~")


Out[11]:
['NameTOK',
 'said',
 'to',
 'me',
 'that',
 'he',
 "'s",
 'the',
 'funniest',
 'guy',
 'ever',
 '.']

In [ ]: