In [1]:
# Ignore the "twython library missing" warning from NLTK — we aren't using its functionality.
# Requires NLTK data: run nltk.download() and fetch the Opinion Lexicon and Vader Lexicon.
# NOTE(review): star import pulls every nlp helper into the notebook namespace
# (tokenize, pos, freq, grams, sentimentGrams, ...); explicit imports would be
# clearer, but nlp's full export list isn't visible from here to do so safely.
from nlp import *

Tokenize and Tag


In [ ]:
# Sample text to tokenize, tag, and analyze in the cells below.
text = """I had my suspicions in early 2003. Who wouldn't? That is why when my wife, Stacy, out on her daily walk yesterday, I texted my Stacy that I was going shopping and I would be back soon."""

# Build a regex-based tokenizer (alphanumerics and '&', with an optional
# apostrophe + letter, e.g. "wouldn't" -> "wouldn't") and tokenize with it.
# The original `tokens = tokenize(text)` line was dead code: its result was
# immediately overwritten by the regex tokenizer's output, so it is removed.
tok = tokenizeFindAllRegex(r"""([A-Za-z0-9&]+[']?[A-Za-z]?)""")
tokens = tok(text)
tagged = pos(tokens)  # POS-tag the token list for the cells below
tokens

Lemmatize


In [ ]:
# Lemmatize the POS-tagged tokens; the lemma list is the cell's displayed output.
lemmatize(tagged)

Process Suffixes


In [ ]:
# Token suffix frequency distribution.
# Previously tokenSuffixes(tokens) was computed twice and the first call's
# result was silently discarded (only a cell's last expression is displayed);
# compute it once and show the frequency dict.
suffixes = tokenSuffixes(tokens)
dict(freq(suffixes))

Create N-Grams


In [ ]:
# Unigram (1-gram) frequency distribution over the tokens.
dict(freq(grams(tokens, 1)))

POS Grams


In [ ]:
# Strip the words, keeping just the POS tag sequence, and show
# the frequency distribution of tag trigrams.
freq(grams(posTagOnly(tagged), 3))

Syllable grams


In [ ]:
# Trigrams over the tokens' syllable representation.
syllableGrams(tokens, 3)

Vowelless grams


In [ ]:
# Trigrams over the vowel-stripped ("vowelless") token forms.
vowelGrams(tokens, 3)

Uppercase/Lowercase


In [ ]:
# Per-token upper/lower-case and length features; reused by the next three cells.
ull = upperLowerLen(tokens)

In [ ]:
# Frequency of capital letters derived from the case/length features above.
capLetterFreq(ull)

In [ ]:
# Per-token word-case labels (e.g. upper/lower/mixed); used by the next cell.
cases = wordCases(ull)

In [ ]:
# Frequency distribution of trigrams over the word-case label sequence.
freq(grams(cases, 3))

In [ ]:
# Sentiment for each half of the token stream, then for token trigrams.
# display() each result explicitly: only a cell's LAST expression is shown,
# so the first two calls' outputs were previously discarded.
mid = len(tokens) // 2  # same split point as int(len(tokens)/2)
display(sentimentGrams([tokens[:mid]]))
display(sentimentGrams([tokens[mid:]]))
# NOTE(review): other cells use grams(); `ngrams` here presumably comes from
# the nlp star import (nltk-style helper) — confirm it is intentional and
# not a typo for grams(tokens, 3).
sentimentGrams(ngrams(tokens, 3))

In [ ]:
# Sentiment over the entire token stream as a single sequence.
sentimentGrams([tokens])

Chunk and Entity Removal


In [ ]:
# Chunk the tagged tokens (named-entity chunking), then remove named
# entities — with numbers also removed in this variant.
chunked = chunk(tagged)
removeNamedEntities(chunked, removeNumbers=True)

In [ ]:
# Same removal as above, but keeping numeric tokens for comparison.
removeNamedEntities(chunked, removeNumbers=False)

Punctuation features


In [ ]:
# Punctuation-based features computed from the raw (untokenized) text.
punctuationFeatures(text)

In [ ]:


In [11]:
# Reddit-specific cleanup: per Out[11] below, the /u/ username is replaced
# with the 'NameTOK' placeholder and the trailing '~' is dropped.
cleanTokensReddit("/u/dog said to me that he's the funniest guy ever.~")


Out[11]:
['NameTOK',
 'said',
 'to',
 'me',
 'that',
 'he',
 "'s",
 'the',
 'funniest',
 'guy',
 'ever',
 '.']

In [ ]: