In [4]:
# This function finds the top N most common words in every cluster.
def topic(srcaddr, indaddr, topN=10, N=9):
    # Import packages
    import pandas as pd
    from nltk.corpus import stopwords
    from nltk.tokenize import RegexpTokenizer
    # Load the original csv file and the vectorized csv file
    src = pd.read_csv(srcaddr)
    ind = pd.read_csv(indaddr)
    # Create a 2D list to store the words of the original csv file,
    # grouped by the cluster number from the vectorized csv file
    ar = []
    for i in range(N):
        ar.append([])
    # Initialize the punctuation eliminator
    tokenizer = RegexpTokenizer(r'\w+')
    # Create a list of irrelevant words
    irrelevant_lists = ['rose','Rose','I','edu','hulman','Hulman','www','com','http','The','1','2','3','4','5','6','7','8','9','0','If','It','it','if']
    # Build the stopword set once instead of rebuilding it on every row
    stop_words = set(stopwords.words('english'))
    # Loop through all rows
    for i in range(src.shape[0]):
        # Concatenate the title and the body of the same row into one string
        index = int(ind.cluster[i])
        string = str(src.Title[i]) + " " + str(src.Body[i])
        # Get rid of the punctuation, stopwords and irrelevant words
        a = tokenizer.tokenize(string)
        filtered_word = [word for word in a if word not in stop_words]
        for word in filtered_word:
            if word not in irrelevant_lists:
                # Append the word to the list whose index is the cluster number
                # from the vectorized csv
                ar[index].append(word)
    # Create an empty list to store the top N words of every cluster
    top10 = []
    # Loop through the clusters
    for k in range(N):
        # Create an empty map
        maps = {}
        # Count each word: the word is the key, its frequency is the value
        for word in ar[k]:
            maps[word] = 0
        for word in ar[k]:
            maps[word] = maps[word] + 1
        # Create an empty list to store the top N words of this cluster
        temp = []
        # Repeatedly pull out the most frequent remaining word
        for h in range(topN):
            mapscount = 0
            mapskey = ""
            for key in maps:
                if maps[key] > mapscount:
                    mapskey = key
                    mapscount = maps[key]
            # Zero the count so the same word is not selected again
            maps[mapskey] = 0
            temp.append(mapskey)
        # Store this cluster's word list in the top10 list
        top10.append(temp)
    # Return the top10 2D list
    return top10
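
The nested counting loops above can also be expressed with collections.Counter, which tallies frequencies and exposes most_common. The cell below is only a sketch of that same idea, not the notebook's original code; it assumes `ar` is the per-cluster word list built inside `topic`, and `top_words_per_cluster` is a hypothetical helper name.
In [ ]:
# Sketch: the same per-cluster top-N extraction using collections.Counter.
# `ar` is assumed to be the 2D word list built above; this is an alternative,
# not the original implementation.
from collections import Counter

def top_words_per_cluster(ar, topN=10):
    # For each cluster, Counter counts word frequencies and
    # most_common(topN) returns the topN (word, count) pairs.
    return [[word for word, count in Counter(words).most_common(topN)]
            for words in ar]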
In [5]:
result = topic('announcements_cbow.csv','tensorflow.csv')
for words in result:
    print(words)
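
The default N=9 assumes the vectorized csv contains cluster ids 0 through 8; an id outside that range would raise an IndexError when indexing `ar`. One way to avoid hard-coding it is to read the cluster count from the data first. This is a hypothetical variant of the call above, not part of the original notebook:
In [ ]:
# Derive the cluster count from the vectorized csv instead of relying on
# the default N=9 (assumes cluster ids run from 0 to their maximum value,
# as the function does).
import pandas as pd

ind = pd.read_csv('tensorflow.csv')
n_clusters = int(ind.cluster.max()) + 1
result = topic('announcements_cbow.csv', 'tensorflow.csv', N=n_clusters)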