In [4]:
# This function finds the top N most common words in every cluster.
def topic(srcaddr, indaddr, topN=10, N=9):
    # Import packages
    import pandas as pd
    from nltk.corpus import stopwords
    from nltk.tokenize import RegexpTokenizer
    # Load the original csv file and the vectorized csv file
    src = pd.read_csv(srcaddr)
    ind = pd.read_csv(indaddr)
    # Create a 2D list to store the words of the original csv file,
    # grouped by the cluster number from the vectorized csv file
    ar = []
    for i in range(N):
        ar.append([])
    # Initialize the punctuation eliminator
    tokenizer = RegexpTokenizer(r'\w+')
    # Create a list of irrelevant words
    irrelevant_lists = ['rose','Rose','I','edu','hulman','Hulman','www','com','http','The','1','2','3','4','5','6','7','8','9','0','If','It','it','if']
    # Build the stopword set once instead of rebuilding it on every row
    stop_words = set(stopwords.words('english'))
    # Loop through all rows
    for i in range(src.shape[0]):
        # Concatenate the title and the body of the same row into one string
        index = int(ind.cluster[i])
        string = str(src.Title[i]) + " " + str(src.Body[i])
        # Get rid of the punctuation, stopwords and irrelevant words
        a = tokenizer.tokenize(string)
        filtered_word = [word for word in a if word not in stop_words]
        for word in filtered_word:
            if word not in irrelevant_lists:
                # Append the word to the list whose index is the cluster number
                # from the vectorized csv
                ar[index].append(word)
    # Create an empty list to store the top N words of every cluster
    top10 = []
    # Loop through the clusters
    for k in range(N):
        # Create an empty map
        maps = {}
        # Count each word: the word is the key, its frequency is the value
        for word in ar[k]:
            maps[word] = 0
        for word in ar[k]:
            maps[word] = maps[word] + 1
        # Create an empty list to store the top N words of this cluster
        temp = []
        # Repeatedly pull out the most frequent remaining word
        for h in range(topN):
            mapscount = 0
            mapskey = ""
            for key in maps:
                if maps[key] > mapscount:
                    mapskey = key
                    mapscount = maps[key]
            # Zero the count so the same word is not selected again
            maps[mapskey] = 0
            temp.append(mapskey)
        # Store this cluster's word list in the top10 list
        top10.append(temp)
    # Return the top10 2D list
    return top10
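
The nested counting loops above can also be expressed with collections.Counter, which tallies frequencies and exposes most_common. The cell below is only a sketch of that same idea, not the notebook's original code; it assumes `ar` is the per-cluster word list built inside `topic`, and `top_words_per_cluster` is a hypothetical helper name.
In [ ]:
# Sketch: the same per-cluster top-N extraction using collections.Counter.
# `ar` is assumed to be the 2D word list built above; this is an alternative,
# not the original implementation.
from collections import Counter

def top_words_per_cluster(ar, topN=10):
    # For each cluster, Counter counts word frequencies and
    # most_common(topN) returns the topN (word, count) pairs.
    return [[word for word, count in Counter(words).most_common(topN)]
            for words in ar]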
In [5]:
result = topic('announcements_cbow.csv','tensorflow.csv')
for words in result:
    print(words)
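
The default N=9 assumes the vectorized csv contains cluster ids 0 through 8; an id outside that range would raise an IndexError when indexing `ar`. One way to avoid hard-coding it is to read the cluster count from the data first. This is a hypothetical variant of the call above, not part of the original notebook:
In [ ]:
# Derive the cluster count from the vectorized csv instead of relying on
# the default N=9 (assumes cluster ids run from 0 to their maximum value,
# as the function does).
import pandas as pd

ind = pd.read_csv('tensorflow.csv')
n_clusters = int(ind.cluster.max()) + 1
result = topic('announcements_cbow.csv', 'tensorflow.csv', N=n_clusters)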