In [1]:
# Import Dependencies
import nltk
# nltk.download('punkt')  # uncomment on first run: the Punkt model is needed by the tokenizers below

In [2]:
# Sample Text
text = """If I could be a superhero, just for the day,
I would want to be Supergirl, in every way.
She’s the young cousin of Superman with long golden locks,
But don’t let that fool you because she’s tougher than rocks.

Her powers consist of flying with speed,
To the moon, around the world or wherever the need.
She can hear a pin drop or the beat of a human’s heart,
Not to mention the faintest whisper, oh how very smart!

In addition to mind control, Supergirl’s vision is x-ray
She also has eyes that generate heat without delay.
Just like her cousin, she has her weakness too,
Kryptonite, oh Kryptonite. There’s only one and not two.

So why would I want to be this superhero for the day?
Well, that’s easy, I will tell you. So listen to what I say.
Bullying has become a major problem everywhere we turn.
Our teachers discuss the issue, but there is more that we need to learn.

Throughout the school halls and at the lunchroom tables,
Students are teased or pushed, and fighting back, well they aren’t able.
As Supergirl I would stop all this nonsense,
By using my powers to aid in every victim’s defense.

Throughout the day, I would listen for the negative chatter
And change each bully’s insults to words that matter.
Before the first punch is thrown or a foot trips another,
I would zap the tormentor’s behind with heat until he calls his mother.

It’s too bad I can’t be this superhero for longer,
It will take more than a day to help bullied victims to become stronger.
The truth is that no one deserves this cruel and hateful treatment
Everyone deserves happiness and that should be a unanimous agreement."""

In [3]:
# Sentence Tokenization
from nltk.tokenize import sent_tokenize

In [4]:
tokenize_sent = sent_tokenize(text)

In [5]:
tokenize_sent


Out[5]:
['If I could be a superhero, just for the day,\nI would want to be Supergirl, in every way.',
 'She’s the young cousin of Superman with long golden locks,\nBut don’t let that fool you because she’s tougher than rocks.',
 'Her powers consist of flying with speed,\nTo the moon, around the world or wherever the need.',
 'She can hear a pin drop or the beat of a human’s heart,\nNot to mention the faintest whisper, oh how very smart!',
 'In addition to mind control, Supergirl’s vision is x-ray\nShe also has eyes that generate heat without delay.',
 'Just like her cousin, she has her weakness too,\nKryptonite, oh Kryptonite.',
 'There’s only one and not two.',
 'So why would I want to be this superhero for the day?',
 'Well, that’s easy, I will tell you.',
 'So listen to what I say.',
 'Bullying has become a major problem everywhere we turn.',
 'Our teachers discuss the issue, but there is more that we need to learn.',
 'Throughout the school halls and at the lunchroom tables,\nStudents are teased or pushed, and fighting back, well they aren’t able.',
 'As Supergirl I would stop all this nonsense,\nBy using my powers to aid in every victim’s defense.',
 'Throughout the day, I would listen for the negative chatter\nAnd change each bully’s insults to words that matter.',
 'Before the first punch is thrown or a foot trips another,\nI would zap the tormentor’s behind with heat until he calls his mother.',
 'It’s too bad I can’t be this superhero for longer,\nIt will take more than a day to help bullied victims to become stronger.',
 'The truth is that no one deserves this cruel and hateful treatment\nEveryone deserves happiness and that should be a unanimous agreement.']
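
Note that sent_tokenize applies NLTK's pre-trained Punkt model, which splits on sentence-ending punctuation rather than on line breaks; that is why several of the items above span two poem lines. A quick check of the split count:

len(tokenize_sent)   # 18 sentences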

In [6]:
import re

In [7]:
# Preprocess Sentences
for i in range(len(tokenize_sent)):
    # Convert the sentence to lowercase
    tokenize_sent[i] = tokenize_sent[i].lower()
    # Replace every non-word character (punctuation, newlines) with a space
    tokenize_sent[i] = re.sub(r'\W', ' ', tokenize_sent[i])
    # Collapse runs of whitespace into a single space
    tokenize_sent[i] = re.sub(r'\s+', ' ', tokenize_sent[i])

In [8]:
tokenize_sent


Out[8]:
['if i could be a superhero just for the day i would want to be supergirl in every way ',
 'she s the young cousin of superman with long golden locks but don t let that fool you because she s tougher than rocks ',
 'her powers consist of flying with speed to the moon around the world or wherever the need ',
 'she can hear a pin drop or the beat of a human s heart not to mention the faintest whisper oh how very smart ',
 'in addition to mind control supergirl s vision is x ray she also has eyes that generate heat without delay ',
 'just like her cousin she has her weakness too kryptonite oh kryptonite ',
 'there s only one and not two ',
 'so why would i want to be this superhero for the day ',
 'well that s easy i will tell you ',
 'so listen to what i say ',
 'bullying has become a major problem everywhere we turn ',
 'our teachers discuss the issue but there is more that we need to learn ',
 'throughout the school halls and at the lunchroom tables students are teased or pushed and fighting back well they aren t able ',
 'as supergirl i would stop all this nonsense by using my powers to aid in every victim s defense ',
 'throughout the day i would listen for the negative chatter and change each bully s insults to words that matter ',
 'before the first punch is thrown or a foot trips another i would zap the tormentor s behind with heat until he calls his mother ',
 'it s too bad i can t be this superhero for longer it will take more than a day to help bullied victims to become stronger ',
 'the truth is that no one deserves this cruel and hateful treatment everyone deserves happiness and that should be a unanimous agreement ']
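
The cleaned sentences still contain stopwords ('the', 'to', 'a') and the stray single letters ('s', 't') left behind by contractions such as "she's". If you wanted to drop these as well, here is a minimal optional sketch using NLTK's stopword list (it assumes nltk.download('stopwords') has been run; the rest of this notebook keeps the unfiltered sentences):

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
# Keep only tokens that are not stopwords and are longer than one character
filtered_sent = [' '.join(w for w in sent.split()
                          if w not in stop_words and len(w) > 1)
                 for sent in tokenize_sent]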

In [9]:
# Create a Histogram
word2count = {}

for sent in tokenize_sent:
    # Tokenize the sentence into words
    words = nltk.word_tokenize(sent)
    for word in words:
        # First occurrence: add the word with a count of 1
        if word not in word2count:
            word2count[word] = 1
        # Otherwise increment its count
        else:
            word2count[word] += 1
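
The same histogram can be built in one pass with collections.Counter; a brief equivalent sketch (Counter is a dict subclass, so the word2count.get and heapq.nlargest calls below work unchanged, only the printed repr differs):

from collections import Counter

# Count every token across all preprocessed sentences in a single pass
word2count = Counter(word for sent in tokenize_sent
                     for word in nltk.word_tokenize(sent))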

In [10]:
word2count


Out[10]:
{'if': 1,
 'i': 9,
 'could': 1,
 'be': 5,
 'a': 7,
 'superhero': 3,
 'just': 2,
 'for': 4,
 'the': 16,
 'day': 4,
 'would': 5,
 'want': 2,
 'to': 11,
 'supergirl': 3,
 'in': 3,
 'every': 2,
 'way': 1,
 'she': 5,
 's': 10,
 'young': 1,
 'cousin': 2,
 'of': 3,
 'superman': 1,
 'with': 3,
 'long': 1,
 'golden': 1,
 'locks': 1,
 'but': 2,
 'don': 1,
 't': 3,
 'let': 1,
 'that': 7,
 'fool': 1,
 'you': 2,
 'because': 1,
 'tougher': 1,
 'than': 2,
 'rocks': 1,
 'her': 3,
 'powers': 2,
 'consist': 1,
 'flying': 1,
 'speed': 1,
 'moon': 1,
 'around': 1,
 'world': 1,
 'or': 4,
 'wherever': 1,
 'need': 2,
 'can': 2,
 'hear': 1,
 'pin': 1,
 'drop': 1,
 'beat': 1,
 'human': 1,
 'heart': 1,
 'not': 2,
 'mention': 1,
 'faintest': 1,
 'whisper': 1,
 'oh': 2,
 'how': 1,
 'very': 1,
 'smart': 1,
 'addition': 1,
 'mind': 1,
 'control': 1,
 'vision': 1,
 'is': 4,
 'x': 1,
 'ray': 1,
 'also': 1,
 'has': 3,
 'eyes': 1,
 'generate': 1,
 'heat': 2,
 'without': 1,
 'delay': 1,
 'like': 1,
 'weakness': 1,
 'too': 2,
 'kryptonite': 2,
 'there': 2,
 'only': 1,
 'one': 2,
 'and': 6,
 'two': 1,
 'so': 2,
 'why': 1,
 'this': 4,
 'well': 2,
 'easy': 1,
 'will': 2,
 'tell': 1,
 'listen': 2,
 'what': 1,
 'say': 1,
 'bullying': 1,
 'become': 2,
 'major': 1,
 'problem': 1,
 'everywhere': 1,
 'we': 2,
 'turn': 1,
 'our': 1,
 'teachers': 1,
 'discuss': 1,
 'issue': 1,
 'more': 2,
 'learn': 1,
 'throughout': 2,
 'school': 1,
 'halls': 1,
 'at': 1,
 'lunchroom': 1,
 'tables': 1,
 'students': 1,
 'are': 1,
 'teased': 1,
 'pushed': 1,
 'fighting': 1,
 'back': 1,
 'they': 1,
 'aren': 1,
 'able': 1,
 'as': 1,
 'stop': 1,
 'all': 1,
 'nonsense': 1,
 'by': 1,
 'using': 1,
 'my': 1,
 'aid': 1,
 'victim': 1,
 'defense': 1,
 'negative': 1,
 'chatter': 1,
 'change': 1,
 'each': 1,
 'bully': 1,
 'insults': 1,
 'words': 1,
 'matter': 1,
 'before': 1,
 'first': 1,
 'punch': 1,
 'thrown': 1,
 'foot': 1,
 'trips': 1,
 'another': 1,
 'zap': 1,
 'tormentor': 1,
 'behind': 1,
 'until': 1,
 'he': 1,
 'calls': 1,
 'his': 1,
 'mother': 1,
 'it': 2,
 'bad': 1,
 'longer': 1,
 'take': 1,
 'help': 1,
 'bullied': 1,
 'victims': 1,
 'stronger': 1,
 'truth': 1,
 'no': 1,
 'deserves': 2,
 'cruel': 1,
 'hateful': 1,
 'treatment': 1,
 'everyone': 1,
 'happiness': 1,
 'should': 1,
 'unanimous': 1,
 'agreement': 1}

In [11]:
word2count.keys()


Out[11]:
dict_keys(['if', 'i', 'could', 'be', 'a', 'superhero', 'just', 'for', 'the', 'day', 'would', 'want', 'to', 'supergirl', 'in', 'every', 'way', 'she', 's', 'young', 'cousin', 'of', 'superman', 'with', 'long', 'golden', 'locks', 'but', 'don', 't', 'let', 'that', 'fool', 'you', 'because', 'tougher', 'than', 'rocks', 'her', 'powers', 'consist', 'flying', 'speed', 'moon', 'around', 'world', 'or', 'wherever', 'need', 'can', 'hear', 'pin', 'drop', 'beat', 'human', 'heart', 'not', 'mention', 'faintest', 'whisper', 'oh', 'how', 'very', 'smart', 'addition', 'mind', 'control', 'vision', 'is', 'x', 'ray', 'also', 'has', 'eyes', 'generate', 'heat', 'without', 'delay', 'like', 'weakness', 'too', 'kryptonite', 'there', 'only', 'one', 'and', 'two', 'so', 'why', 'this', 'well', 'easy', 'will', 'tell', 'listen', 'what', 'say', 'bullying', 'become', 'major', 'problem', 'everywhere', 'we', 'turn', 'our', 'teachers', 'discuss', 'issue', 'more', 'learn', 'throughout', 'school', 'halls', 'at', 'lunchroom', 'tables', 'students', 'are', 'teased', 'pushed', 'fighting', 'back', 'they', 'aren', 'able', 'as', 'stop', 'all', 'nonsense', 'by', 'using', 'my', 'aid', 'victim', 'defense', 'negative', 'chatter', 'change', 'each', 'bully', 'insults', 'words', 'matter', 'before', 'first', 'punch', 'thrown', 'foot', 'trips', 'another', 'zap', 'tormentor', 'behind', 'until', 'he', 'calls', 'his', 'mother', 'it', 'bad', 'longer', 'take', 'help', 'bullied', 'victims', 'stronger', 'truth', 'no', 'deserves', 'cruel', 'hateful', 'treatment', 'everyone', 'happiness', 'should', 'unanimous', 'agreement'])

In [12]:
word2count.values()


Out[12]:
dict_values([1, 9, 1, 5, 7, 3, 2, 4, 16, 4, 5, 2, 11, 3, 3, 2, 1, 5, 10, 1, 2, 3, 1, 3, 1, 1, 1, 2, 1, 3, 1, 7, 1, 2, 1, 1, 2, 1, 3, 2, 1, 1, 1, 1, 1, 1, 4, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 3, 1, 1, 2, 1, 1, 1, 1, 2, 2, 2, 1, 2, 6, 1, 2, 1, 4, 2, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1])

In [13]:
# Split the histogram into parallel key/value lists for plotting
keys = list(word2count.keys())
vals = list(word2count.values())

In [14]:
vals


Out[14]:
[1,
 9,
 1,
 5,
 7,
 3,
 2,
 4,
 16,
 4,
 5,
 2,
 11,
 3,
 3,
 2,
 1,
 5,
 10,
 1,
 2,
 3,
 1,
 3,
 1,
 1,
 1,
 2,
 1,
 3,
 1,
 7,
 1,
 2,
 1,
 1,
 2,
 1,
 3,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 1,
 1,
 1,
 3,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 1,
 2,
 6,
 1,
 2,
 1,
 4,
 2,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [15]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(30,20))
plt.bar(keys, vals, align='center')
plt.xlabel('Keys')
plt.ylabel('Values')
plt.xticks(rotation=90)
plt.title('Key Value Count Bar Plot')


Out[15]:
Text(0.5,1,'Key Value Count Bar Plot')
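
With 170-plus bars in dictionary order, the plot above is hard to read. An optional tweak before re-plotting is to sort the (word, count) pairs by count, descending, so the most frequent words come first:

# Sort the histogram entries by count before plotting
sorted_pairs = sorted(word2count.items(), key=lambda kv: kv[1], reverse=True)
keys = [k for k, v in sorted_pairs]
vals = [v for k, v in sorted_pairs]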

In [16]:
# Select Most Frequent N words
import heapq

In [17]:
# Get the 100 words with the highest counts
frequent_words = heapq.nlargest(100, word2count, key=word2count.get)

In [18]:
frequent_words


Out[18]:
['the',
 'to',
 's',
 'i',
 'a',
 'that',
 'and',
 'be',
 'would',
 'she',
 'for',
 'day',
 'or',
 'is',
 'this',
 'superhero',
 'supergirl',
 'in',
 'of',
 'with',
 't',
 'her',
 'has',
 'just',
 'want',
 'every',
 'cousin',
 'but',
 'you',
 'than',
 'powers',
 'need',
 'can',
 'not',
 'oh',
 'heat',
 'too',
 'kryptonite',
 'there',
 'one',
 'so',
 'well',
 'will',
 'listen',
 'become',
 'we',
 'more',
 'throughout',
 'it',
 'deserves',
 'if',
 'could',
 'way',
 'young',
 'superman',
 'long',
 'golden',
 'locks',
 'don',
 'let',
 'fool',
 'because',
 'tougher',
 'rocks',
 'consist',
 'flying',
 'speed',
 'moon',
 'around',
 'world',
 'wherever',
 'hear',
 'pin',
 'drop',
 'beat',
 'human',
 'heart',
 'mention',
 'faintest',
 'whisper',
 'how',
 'very',
 'smart',
 'addition',
 'mind',
 'control',
 'vision',
 'x',
 'ray',
 'also',
 'eyes',
 'generate',
 'without',
 'delay',
 'like',
 'weakness',
 'only',
 'two',
 'why',
 'easy']
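
heapq.nlargest breaks ties by iteration order, which is why the count-1 words above appear in the order they were first seen. A comparable selection can be written with collections.Counter; a brief sketch:

from collections import Counter

# most_common(100) returns the 100 highest-count (word, count) pairs
frequent_words = [word for word, count in Counter(word2count).most_common(100)]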

In [19]:
# Parallel lists of the top-100 words and their counts for plotting
keys = list(frequent_words)
vals = [word2count[word] for word in frequent_words]

In [20]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(30,20))
plt.bar(keys, vals, align='center')
plt.xlabel('Keys')
plt.ylabel('Values')
plt.xticks(rotation=90)
plt.title('Key Value Count Bar Plot for top 100 words')


Out[20]:
Text(0.5,1,'Key Value Count Bar Plot for top 100 words')

In [21]:
# BOW Model
X = []

In [22]:
for sent in tokenize_sent:
    # Tokenize the sentence once, then test each frequent word for membership
    sent_words = nltk.word_tokenize(sent)
    word_vector = []
    for word in frequent_words:
        # 1 if the frequent word occurs in this sentence, 0 otherwise
        if word in sent_words:
            word_vector.append(1)
        else:
            word_vector.append(0)
    # Append this sentence's binary vector to the BoW matrix
    X.append(word_vector)

In [23]:
import numpy as np

X = np.asarray(X)

In [24]:
X.shape


Out[24]:
(18, 100)
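
As a cross-check, scikit-learn's CountVectorizer builds a comparable binary bag-of-words matrix in a few lines. Its defaults differ slightly from the manual pipeline above (for instance, its tokenizer drops single-character tokens such as 's' and 't'), so the columns will not match one-for-one; a sketch:

from sklearn.feature_extraction.text import CountVectorizer

# binary=True records presence/absence instead of raw counts;
# max_features=100 keeps the 100 most frequent terms as the vocabulary
vectorizer = CountVectorizer(max_features=100, binary=True)
X_cv = vectorizer.fit_transform(tokenize_sent).toarray()
X_cv.shape   # (18, 100)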

In [25]:
# Show the BoW matrix as a heatmap
import seaborn as sns

In [26]:
plt.figure(figsize=(20,10))
sns.heatmap(X,annot=True, linewidths=0.5, cmap="YlGnBu")


Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x251c3ef6b00>
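
With annot=True on an 18×100 matrix the cell annotations are cramped; a variant sketch that drops them and labels the columns with the frequent words themselves:

plt.figure(figsize=(20, 10))
# Label each column with its word instead of a bare index
sns.heatmap(X, linewidths=0.5, cmap="YlGnBu", xticklabels=frequent_words)
plt.xticks(rotation=90)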