Topic modeling of abilities texts

In this file I'm trying to cluster the abilities by performing topic modeling (with NMF decomposition) on their descriptions.


In [1]:
import pandas as pd
from pprint import pprint
from sklearn.decomposition import NMF
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

from atod import Abilities, Heroes

NMF


In [2]:
# create stop-words list
heroes = Heroes.all()
heroes_names = [h.name for h in heroes]
# names of heroes commonly occur in descriptions, so
# deleting them is a good idea
words_in_heroes_names = [word.lower() for name in heroes_names for word in name.split(' ')]

eng_stop_words = TfidfVectorizer(stop_words='english').get_stop_words()
stop_words = set(words_in_heroes_names + list(eng_stop_words) 
                 + ['font', 'color', '7998b5', 'target', 'enemy', 'friendly', 'allied',
                    'remnant', 'aghanim', 'scepter', 'units', 'deal damage'
                    'cause', 'creep'])


No abilities for this HeroID == 16

In [3]:
# get all texts for all abilities
texts = Abilities.all().get_texts()
# get abilities names and  descriptions
descriptions = texts[['description', 'name']]
corpus = [a.replace('\\n', ' ').replace('%%', '%') for a in descriptions['description']]
corpus.extend(['stun', 'silence', 
               'blink is short distance teleportation', 
               'silence', 
               'healing',
               'invisibility', 'area of usage', 'armor', 'percentage',
               'DOT is damage over time (seconds)',
               'summon or place a ward', 'illusions, images, duplicates, nemesisis',
               'critical damage', 'movement speed', 'attack speed'])

stemmer = EnglishStemmer()
stemmed_corpus = []
for doc in corpus:
    stemmed_corpus.append(' '.join([stemmer.stem(word) for word in doc.split(' ')]))

corpus = stemmed_corpus

In [4]:
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words,
                                   ngram_range=(1,3))
tfidf_vectorizer.fit(corpus)
tf_corpus = tfidf_vectorizer.transform(corpus)
tf_features_names = tfidf_vectorizer.get_feature_names()

tf_corpus.shape


Out[4]:
(503, 12251)

In [5]:
nmf = NMF(n_components=40, init='nndsvd').fit(tf_corpus)

for topic_idx, topic in enumerate(nmf.components_):
    print("Topic %d:" % (topic_idx))
    print(", ".join([tf_features_names[i]
                    for i in topic.argsort()[:-10 - 1:-1]]))


Topic 0:
area, area deal, raze, raze area, area deal damag, raze area deal, deal damag enemi, enemi unit area, unit area, damag enemi unit
Topic 1:
movement speed, movement, speed, increas movement speed, increas movement, increas, attack movement speed, attack movement, bear, bonus movement
Topic 2:
damag time, time, deal damag time, deal damag, deal, damag, poison, enemi unit, seconds, unit
Topic 3:
dispel, type, dispel type, dispel type basic, basic dispel, type basic dispel, type basic, basic, purg, type strong
Topic 4:
base level, level, base, invok, level quas, base level quas, quas, level cabe68 exort, base level cabe68, level cabe68
Topic 5:
slow movement, slow movement speed, damag slow movement, damag slow, slow, movement, movement speed, speed, unit damag slow, kick unit damag
Topic 6:
friend, friend unit, heal friend unit, heal friend, unit, friend unit damag, unit damag, instant heal, unit damag nearbi, instant heal friend
Topic 7:
stun, damag stun, stun deal, stun damag, damag, stun damag enemi, throw, unit stun, unit damag stun, throw boulder unit
Topic 8:
silenc, enchant, damag silenc, addit damag, addit, silenc enemi unit, silenc enemi, deal addit damag, silenc caus, silenc caus addit
Topic 9:
nearbi enemi, nearbi, damag nearbi enemi, damag nearbi, slam, ground, slam ground, slam ground stun, ground stun, enemi
Topic 10:
attack speed, attack, speed, increas attack, increas attack speed, increas, gain increas, gain increas attack, gain, alli
Topic 11:
unit, enemi unit, enemi, damag enemi unit, damag enemi, send, stone, damag, line, wave
Topic 12:
short distanc teleport, short distanc, distanc teleport, teleport, distanc, short, teleport allow, distanc teleport allow, combat, allow
Topic 13:
poison slow, poison, slow attack, slow attack movement, movement speeds, inflict poison, inflict poison slow, poison slow attack, attack movement speeds, speeds
Topic 14:
armor, reduc, reduc armor, seconds, armor enemi, armor nearbi enemies, reduc armor nearbi, armor nearbi, presenc reduc, presenc reduc armor
Topic 15:
bonus, bonus damag, deal bonus, deal bonus damag, damag, deal, attack, grant, grant bonus, attack enemi
Topic 16:
great, area, area effect, damag time attack, affect enemi unit, great reduced, unit damag time, launch independ, attack speed great, area effect affect
Topic 17:
spike, hurl air stun, straight path enemi, unit hurl air, spike burst, spike burst straight, enemi unit hurl, burst straight path, hurl air, unit hurl
Topic 18:
hero, enemi hero, enemi, upgrad, alli hero, steal, alli, essenc, duration, true sight
Topic 19:
point, strength, morphl, morph, process reversible, shift form, change passiv, change, pull point, morphl shift form
Topic 20:
ward, summon, ward attack enemi, ward attack, attack enemi, immun, summon place, place ward, summon place ward, place
Topic 21:
spells, cast spells, area cast, unit area cast, area cast spells, cast, enemi unit area, unit area, prevent enemi, prevent
Topic 22:
slow, slow enemi, slow enemi unit, damag slow, damag slow enemi, enemi, area, channel, damag, movement attack
Topic 23:
invis, attack use ability, use ability, attack use, ability, becom invis, use, invis unit, becom, unit moment attack
Topic 24:
percentag, health, current health, current, beam, damag enemi heal, enemi heal, heal percentag, percentag current, percentag current health
Topic 25:
deal damag stun, blast enemi, blast enemi unit, blast, wave deal damag, wave deal, enemi unit wave, unit wave deal, unit wave, damag stun
Topic 26:
uniqu, uniqu attack, attack, modifiers, attack modifiers, uniqu attack modifiers, doe, doe stack uniqu, modifier doe, uniqu attack modifier
Topic 27:
instanc provid increas, instanc provid, provid increas, allow manipul, elements, manipul, instanc, allow, provid, provid increas attack
Topic 28:
heal, alli, heal nearbi alli, heal nearbi, nearbi alli, focus magic heal, focus magic, magic heal, magic heal nearbi, unit heal
Topic 29:
miss, caus, caus miss, attacks, miss attacks, caus miss attacks, blind, area, caus attack, unit caus
Topic 30:
mana, equal, lose mana, point mana, lose, pool, mana pool, total mana, absorb, restor
Topic 31:
extra, extra damag, deal extra damag, deal extra, damag attack, deal, chanc deal, nearbi unit attacking, extra damag nearbi, damag nearbi unit
Topic 32:
critic, strike, critic strike, critic damag, chanc, chanc deal, attack, ad, hit, victim
Topic 33:
ani, teleport, snowbal, teleport ani, ani enemi, tether, ani point, launch, point, teleport ani point
Topic 34:
creat, illus, deal damage, creat illus, terrorblad, damage, terrorblad deal damage, illus terrorblad deal, illus terrorblad, terrorblad deal
Topic 35:
enemies, nearbi enemies, nearbi, damag nearbi enemies, damag nearbi, slow nearbi enemies, slow nearbi, attacked, attack deal, trap slow nearbi
Topic 36:
damage, damage upgrad, upgrad, deal massiv, massiv, deal massiv damage, massiv damage, massiv damage upgrad, deal, magic
Topic 37:
gain, gain bonus, bonus, bonus damage, damage, gain bonus damage, kill, unit, die, avoid
Topic 38:
lightn, bolt, bolt lightn, strike, strike enemi, lightn bolt, cast lightn, cast lightn bolt, true sight, sight
Topic 39:
second, damag second, burn, deal damag second, golem, damag, everi, enemi damag second, golem nearbi, golem nearbi enemi

In [6]:
# find the ability with `index` in descriptions DataFrame
# and define categories of this ability
index = 19
test = tfidf_vectorizer.transform([descriptions['description'][index]])
weights = nmf.transform(test)
topics = weights.argsort()[0][-3:]

print(descriptions['name'][index])
print(descriptions['description'][index])
print('-' * len(descriptions['name'][index]))

for topic in topics:
    for i in nmf.components_[topic].argsort()[-3:]:
        print(tf_features_names[i], end=', ')
        
    print()


fissure
Slams the ground with a mighty totem, creating an impassable ridge of stone while stunning and damaging enemy units along its line.
-------
enemi, enemi unit, unit, 
deal extra damag, extra damag, extra, 
damag nearbi enemi, nearbi, nearbi enemi, 

In [7]:
# distribution of categories
%matplotlib inline
categories_dist = dict()
skills_by_category = dict()

for index, text in enumerate(descriptions['description']):
    vect = tfidf_vectorizer.transform([text])
    weights = nmf.transform(vect)
    topics = weights.argsort()[0][-2:]
    
    for topic in topics:
        # find words describing topic
        for i in nmf.components_[topic].argsort()[-1:]:
            skills_by_category.setdefault(tf_features_names[i], [])
            skills_by_category[tf_features_names[i]].append(descriptions['name'][index])
            
# categories_dist = categories_dist.dropna()
categories_dist = pd.Series({k: len(skills_by_category[k]) for k in skills_by_category})
# categories_dist.plot()


/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

In [8]:
print('#categories', categories_dist.shape[0])
print(categories_dist)


NOTE: the name of category can be misleading, check the above example with fissure
#categories 39
ani                        37
armor                      20
attack speed               29
base level                 18
bonus                      10
creat                      43
critic                     14
damag time                  1
damage                    161
deal damag stun            10
dispel                     27
enemies                    78
extra                      12
friend                     11
gain                       79
great                       8
heal                       15
hero                       29
instanc provid increas      7
invis                      13
lightn                     28
mana                       24
miss                       32
movement speed             68
nearbi enemi                7
percentag                  30
point                      14
poison slow                 8
second                     27
short distanc teleport     16
silenc                      3
slow                       13
slow movement               8
spells                     11
spike                      10
stun                       11
uniqu                      15
unit                       18
ward                       11
dtype: int64

In [14]:
# print abilities by category
print(skills_by_category['deal damag stun'])


['dragon_slave', 'death_pulse', 'sonic_wave', 'shadow_wave', 'dual_breath', 'deafening_blast', 'fireblast', 'unrefined_fireblast', 'rip_tide', 'shockwave']

In [ ]:
# print categories which were separated good enough from others
sorted_abilities = dict()
for category in skills_by_category:
    if len(skills_by_category[category]) <= 30:
        sorted_abilities[category] = skills_by_category[category]
        
pprint(sorted_abilities)

In [ ]:


In [ ]: