In [1]:
# Import Dependencies
import nltk
nltk.download('punkt')  # Punkt models are needed by sent_tokenize/word_tokenize below
In [2]:
# Sample Text
text = """If I could be a superhero, just for the day,
I would want to be Supergirl, in every way.
She’s the young cousin of Superman with long golden locks,
But don’t let that fool you because she’s tougher than rocks.
Her powers consist of flying with speed,
To the moon, around the world or wherever the need.
She can hear a pin drop or the beat of a human’s heart,
Not to mention the faintest whisper, oh how very smart!
In addition to mind control, Supergirl’s vision is x-ray
She also has eyes that generate heat without delay.
Just like her cousin, she has her weakness too,
Kryptonite, oh Kryptonite. There’s only one and not two.
So why would I want to be this superhero for the day?
Well, that’s easy, I will tell you. So listen to what I say.
Bullying has become a major problem everywhere we turn.
Our teachers discuss the issue, but there is more that we need to learn.
Throughout the school halls and at the lunchroom tables,
Students are teased or pushed, and fighting back, well they aren’t able.
As Supergirl I would stop all this nonsense,
By using my powers to aid in every victim’s defense.
Throughout the day, I would listen for the negative chatter
And change each bully’s insults to words that matter.
Before the first punch is thrown or a foot trips another,
I would zap the tormentor’s behind with heat until he calls his mother.
It’s too bad I can’t be this superhero for longer,
It will take more than a day to help bullied victims to become stronger.
The truth is that no one deserves this cruel and hateful treatment
Everyone deserves happiness and that should be a unanimous agreement."""
In [3]:
# Sentence Tokenization
from nltk.tokenize import sent_tokenize
In [4]:
tokenize_sent = sent_tokenize(text)
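As a quick illustration (not from the original notebook), sent_tokenize uses NLTK's pre-trained Punkt model to split raw text on sentence boundaries:

# Hypothetical mini example of sentence splitting
sent_tokenize("Supergirl is tough. She can fly.")
# -> ['Supergirl is tough.', 'She can fly.']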
In [5]:
tokenize_sent
Out[5]:
In [6]:
import re
In [7]:
# Preprocess Sentences
for i in range(len(tokenize_sent)):
    # Convert all words to lowercase
    tokenize_sent[i] = tokenize_sent[i].lower()
    # Replace non-word characters (punctuation) with a space
    tokenize_sent[i] = re.sub(r'\W', ' ', tokenize_sent[i])
    # Collapse runs of whitespace into a single space
    tokenize_sent[i] = re.sub(r'\s+', ' ', tokenize_sent[i])
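The same cleanup (lowercase, strip punctuation, collapse whitespace) can also be written as a single list comprehension; this is just an equivalent sketch of the loop above:

# Sketch: apply the lowercase and regex steps to every sentence in one pass
tokenize_sent = [re.sub(r'\s+', ' ', re.sub(r'\W', ' ', s.lower())) for s in tokenize_sent]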
In [8]:
tokenize_sent
Out[8]:
In [9]:
# Create a Histogram
word2count = {}
for sent in tokenize_sent:
    # Tokenize the sentence into words
    words = nltk.word_tokenize(sent)
    for word in words:
        # If the word has not been seen yet, add it with a count of 1
        if word not in word2count.keys():
            word2count[word] = 1
        # Otherwise increment its count
        else:
            word2count[word] += 1
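For reference, collections.Counter builds the same word histogram in one step; this is an alternative sketch, not part of the original workflow:

from collections import Counter
# Count every token across all preprocessed sentences
word2count_alt = Counter(word for sent in tokenize_sent for word in nltk.word_tokenize(sent))
# word2count_alt should hold the same counts as word2count above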
In [10]:
word2count
Out[10]:
In [11]:
word2count.keys()
Out[11]:
In [12]:
word2count.values()
Out[12]:
In [13]:
vals = []
keys = []
for key in word2count.keys():
    keys.append(key)
    vals.append(word2count.get(key))
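Because Python 3.7+ dicts preserve insertion order, the loop above is equivalent to converting the dict views directly:

keys = list(word2count.keys())
vals = list(word2count.values())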
In [14]:
vals
Out[14]:
In [15]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(30, 20))
plt.bar(keys, vals, align='center')
plt.xlabel('Word')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.title('Word Frequency Bar Plot')
Out[15]:
In [16]:
# Select Most Frequent N words
import heapq
In [17]:
# Get the 100 words with the highest counts
frequent_words = heapq.nlargest(100, word2count, key=word2count.get)
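Equivalently (ties aside), wrapping the histogram in a collections.Counter and calling most_common gives the same top-100 selection; shown only as a sketch:

from collections import Counter
frequent_words_alt = [w for w, _ in Counter(word2count).most_common(100)]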
In [18]:
frequent_words
Out[18]:
In [19]:
vals = []
keys = []
for word in frequent_words:
    keys.append(word)
    vals.append(word2count.get(word))
In [20]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(30, 20))
plt.bar(keys, vals, align='center')
plt.xlabel('Word')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.title('Word Frequency Bar Plot for Top 100 Words')
Out[20]:
In [21]:
# BOW Model
X = []
In [22]:
for sent in tokenize_sent:
    word_vector = []
    for word in frequent_words:
        # Mark 1 if this frequent word appears in the sentence, else 0
        if word in nltk.word_tokenize(sent):
            word_vector.append(1)
        else:
            word_vector.append(0)
    # Append this sentence's vector to the BOW matrix
    X.append(word_vector)
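For comparison only, scikit-learn's CountVectorizer can build a similar binary bag-of-words matrix restricted to the same vocabulary; its built-in tokenizer differs slightly from NLTK's (e.g. it drops single-character tokens), so the values may not match exactly:

from sklearn.feature_extraction.text import CountVectorizer
# binary=True records presence/absence instead of raw counts;
# vocabulary=frequent_words restricts the columns to the top-100 words selected above
vectorizer = CountVectorizer(binary=True, vocabulary=frequent_words)
X_sklearn = vectorizer.fit_transform(tokenize_sent).toarray()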
In [23]:
import numpy as np
X = np.asarray(X)
In [24]:
X.shape
Out[24]:
In [25]:
# Visualize the BOW matrix as a heatmap
import seaborn as sns
In [26]:
plt.figure(figsize=(20,10))
sns.heatmap(X, annot=True, linewidths=0.5, cmap="YlGnBu")
Out[26]: