In [1]:
from pprint import pprint
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

from atod import Heroes

In [2]:
heroes = Heroes.all()


No abilities for this HeroID == 16

In [3]:
def replace_many(string: str, replacements: list):
    ''' Performs several str.replace() calls in a row.
    
    Args:
        string: string to be changed
        replacements (list of tuples): arguments for str.replace()
            in the form (old, new)
            
    Returns:
        str: `string` with all the replacements applied
        
    '''
    for repl in replacements:
        string = string.replace(*repl)
        
    return string
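
Replacements are applied left to right, so their order matters; a quick check on a made-up string (the '\\n' below is a literal backslash-n, as it appears in the raw texts):

In [ ]:
print(replace_many('Deals 100%% of damage\\nto the target',
                   [('\\n', ' '), ('%%', '%')]))
# -> Deals 100% of damage to the target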

In [4]:
def count_words(doc):
    ''' Returns the number of unique words in the document.'''
    words = set()
    for text in doc:
        words.update(text.split())
        
    return len(words)
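
A quick sanity check on a toy input:

In [ ]:
print(count_words(['stuns the target', 'silences the target']))  # 4 unique words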

In [5]:
# replace (old, new); the '%%' rule goes before the '%' rule so that
# replacements run from the most to the least specific pattern
replacements = [('%%', ''), ('%', ''), ('\\n', ' '), ('_', ' ')]

abilities = {h.name: list(map(lambda x: replace_many(x, replacements), 
                              h.abilities.get_texts()))
                     for h in heroes}

texts_list = [text for hero in abilities.values() for text in hero]
print(count_words(texts_list))

# What the above dict comprehension does: 
# abilities = dict()
# for hero in heroes:
#     abilities[hero.name] = list()
#     for ability in hero.abilities.get_texts():
#         for old, new in replacements:
#             ability = ability.replace(old, new)
#         abilities[hero.name].append(ability)
# I just love lambdas and comprehensions, so :)

In [ ]:
# Create the list of stop words:
# heroes' names commonly occur in descriptions,
# so they need to be removed
heroes_names = [h.name for h in heroes]
words_in_heroes_names = [word.lower() 
                         for name in heroes_names 
                         for word in name.split(' ')]

eng_stop_words = TfidfVectorizer(stop_words='english').get_stop_words()
# 'deal' and 'damage' are listed separately: sklearn filters stop words
# token by token, so a multi-word entry would never match
stop_words = set(words_in_heroes_names + list(eng_stop_words) 
                 + ['font', 'color', '7998b5', 'target', 'enemy', 'friendly', 'allied',
                    'remnant', 'aghanim', 'scepter', 'units', 'deal', 'damage',
                    'cause', 'creep'])

In [ ]:
# For every hero, concatenate all of their ability descriptions
# into a single document (a stemmed variant is sketched below).
corpus = dict()

for hero, texts in abilities.items():
    corpus[hero] = ' '.join(word for doc in texts
                                 for word in doc.split(' '))
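
Stemming was left out above; a minimal sketch of how it could be plugged in, using the EnglishStemmer imported in the first cell:

In [ ]:
stemmer = EnglishStemmer()

# stem every word of every hero document, and the stop words too,
# so that both live in the same reduced vocabulary
stemmed_corpus = {hero: ' '.join(stemmer.stem(word) for word in doc.split(' '))
                  for hero, doc in corpus.items()}
stemmed_stop_words = [stemmer.stem(word) for word in stop_words]

print(count_words(list(stemmed_corpus.values())))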

In [ ]:
vectorizer = TfidfVectorizer(stop_words=list(stop_words),
                             ngram_range=(1, 2),
                             min_df=2)
vectorizer.fit(corpus.values())

In [ ]:
print(len(vectorizer.vocabulary_))

In [ ]:
print(vectorizer.vocabulary_)

In [ ]:
# find the most popular words: keep the 20 terms that occur
# in the largest number of documents
most_popular_words = [('', 0)] * 20

id2word = {id_: word for word, id_ in vectorizer.vocabulary_.items()}
corpus_matrix = vectorizer.transform(corpus.values())

for index in range(corpus_matrix.shape[1]):
    # nnz of a column == number of documents containing this term
    col = corpus_matrix.getcol(index)
    
    if col.nnz > most_popular_words[0][1]:
        # replace the current minimum and keep the list sorted ascending
        most_popular_words[0] = (id2word[index], col.nnz)
        most_popular_words = sorted(most_popular_words, key=lambda x: x[1])

In [ ]:
print(most_popular_words)
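
The same document frequencies can be read straight off the sparse matrix; a sketch using numpy instead of the column-by-column loop above:

In [ ]:
import numpy as np

# number of documents each term occurs in, computed in one shot
doc_freq = np.asarray((corpus_matrix > 0).sum(axis=0)).ravel()
top20 = doc_freq.argsort()[-20:][::-1]
print([(id2word[int(i)], int(doc_freq[i])) for i in top20])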

Conclusion

TF-IDF doesn't work for this task: meaningful words ('stun', 'silence') are common across the whole corpus, and within a single text they occur about as often as unimportant words, so their scores end up indistinguishable.

CountVectorizer doesn't work either, because an important word can occur in a text exactly as many times as an unimportant one. To improve this method, one could weight words by how often they occur across the corpus, as the sketch below illustrates.
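
A minimal sketch of that weighting idea, scaling raw counts down by corpus-wide document frequency (an illustration only, not something tried above):

In [ ]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words=list(stop_words))
counts = cv.fit_transform(corpus.values()).toarray().astype(float)

# down-weight every word by the number of documents it occurs in
doc_freq = (counts > 0).sum(axis=0)
weighted_counts = counts / doc_freq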

There are too few documents to try something like word2vec.


In [ ]: