Topic modeling of abilities texts

In this file I'm trying to cluster the abilities by performing topic modeling (with NMF decomposition) on their descriptions.

In [1]:
import pandas as pd
from pprint import pprint
from sklearn.decomposition import NMF
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

from atod import Abilities, Heroes


In [2]:
# create stop-words list
heroes = Heroes.all()
heroes_names = [ for h in heroes]
# names of heroes commonly occur in descriptions, so
# deleting them is a good idea
words_in_heroes_names = [word.lower() for name in heroes_names for word in name.split(' ')]

eng_stop_words = TfidfVectorizer(stop_words='english').get_stop_words()
stop_words = set(words_in_heroes_names + list(eng_stop_words) 
                 + ['font', 'color', '7998b5', 'target', 'enemy', 'friendly', 'allied',
                    'remnant', 'aghanim', 'scepter', 'units', 'deal damage'
                    'cause', 'creep'])

No abilities for this HeroID == 16

In [3]:
# get all texts for all abilities
texts = Abilities.all().get_texts()
# get abilities names and  descriptions
descriptions = texts[['description', 'name']]
corpus = [a.replace('\\n', ' ').replace('%%', '%') for a in descriptions['description']]
corpus.extend(['stun', 'silence', 
               'blink is short distance teleportation', 
               'invisibility', 'area of usage', 'armor', 'percentage',
               'DOT is damage over time (seconds)',
               'summon or place a ward', 'illusions, images, duplicates, nemesisis',
               'critical damage', 'movement speed', 'attack speed'])

stemmer = EnglishStemmer()
stemmed_corpus = []
for doc in corpus:
    stemmed_corpus.append(' '.join([stemmer.stem(word) for word in doc.split(' ')]))

corpus = stemmed_corpus

In [4]:
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words,
tf_corpus = tfidf_vectorizer.transform(corpus)
tf_features_names = tfidf_vectorizer.get_feature_names()


(503, 12251)

In [5]:
nmf = NMF(n_components=40, init='nndsvd').fit(tf_corpus)

for topic_idx, topic in enumerate(nmf.components_):
    print("Topic %d:" % (topic_idx))
    print(", ".join([tf_features_names[i]
                    for i in topic.argsort()[:-10 - 1:-1]]))

Topic 0:
area, area deal, raze, raze area, area deal damag, raze area deal, deal damag enemi, enemi unit area, unit area, damag enemi unit
Topic 1:
movement speed, movement, speed, increas movement speed, increas movement, increas, attack movement speed, attack movement, bear, bonus movement
Topic 2:
damag time, time, deal damag time, deal damag, deal, damag, poison, enemi unit, seconds, unit
Topic 3:
dispel, type, dispel type, dispel type basic, basic dispel, type basic dispel, type basic, basic, purg, type strong
Topic 4:
base level, level, base, invok, level quas, base level quas, quas, level cabe68 exort, base level cabe68, level cabe68
Topic 5:
slow movement, slow movement speed, damag slow movement, damag slow, slow, movement, movement speed, speed, unit damag slow, kick unit damag
Topic 6:
friend, friend unit, heal friend unit, heal friend, unit, friend unit damag, unit damag, instant heal, unit damag nearbi, instant heal friend
Topic 7:
stun, damag stun, stun deal, stun damag, damag, stun damag enemi, throw, unit stun, unit damag stun, throw boulder unit
Topic 8:
silenc, enchant, damag silenc, addit damag, addit, silenc enemi unit, silenc enemi, deal addit damag, silenc caus, silenc caus addit
Topic 9:
nearbi enemi, nearbi, damag nearbi enemi, damag nearbi, slam, ground, slam ground, slam ground stun, ground stun, enemi
Topic 10:
attack speed, attack, speed, increas attack, increas attack speed, increas, gain increas, gain increas attack, gain, alli
Topic 11:
unit, enemi unit, enemi, damag enemi unit, damag enemi, send, stone, damag, line, wave
Topic 12:
short distanc teleport, short distanc, distanc teleport, teleport, distanc, short, teleport allow, distanc teleport allow, combat, allow
Topic 13:
poison slow, poison, slow attack, slow attack movement, movement speeds, inflict poison, inflict poison slow, poison slow attack, attack movement speeds, speeds
Topic 14:
armor, reduc, reduc armor, seconds, armor enemi, armor nearbi enemies, reduc armor nearbi, armor nearbi, presenc reduc, presenc reduc armor
Topic 15:
bonus, bonus damag, deal bonus, deal bonus damag, damag, deal, attack, grant, grant bonus, attack enemi
Topic 16:
great, area, area effect, damag time attack, affect enemi unit, great reduced, unit damag time, launch independ, attack speed great, area effect affect
Topic 17:
spike, hurl air stun, straight path enemi, unit hurl air, spike burst, spike burst straight, enemi unit hurl, burst straight path, hurl air, unit hurl
Topic 18:
hero, enemi hero, enemi, upgrad, alli hero, steal, alli, essenc, duration, true sight
Topic 19:
point, strength, morphl, morph, process reversible, shift form, change passiv, change, pull point, morphl shift form
Topic 20:
ward, summon, ward attack enemi, ward attack, attack enemi, immun, summon place, place ward, summon place ward, place
Topic 21:
spells, cast spells, area cast, unit area cast, area cast spells, cast, enemi unit area, unit area, prevent enemi, prevent
Topic 22:
slow, slow enemi, slow enemi unit, damag slow, damag slow enemi, enemi, area, channel, damag, movement attack
Topic 23:
invis, attack use ability, use ability, attack use, ability, becom invis, use, invis unit, becom, unit moment attack
Topic 24:
percentag, health, current health, current, beam, damag enemi heal, enemi heal, heal percentag, percentag current, percentag current health
Topic 25:
deal damag stun, blast enemi, blast enemi unit, blast, wave deal damag, wave deal, enemi unit wave, unit wave deal, unit wave, damag stun
Topic 26:
uniqu, uniqu attack, attack, modifiers, attack modifiers, uniqu attack modifiers, doe, doe stack uniqu, modifier doe, uniqu attack modifier
Topic 27:
instanc provid increas, instanc provid, provid increas, allow manipul, elements, manipul, instanc, allow, provid, provid increas attack
Topic 28:
heal, alli, heal nearbi alli, heal nearbi, nearbi alli, focus magic heal, focus magic, magic heal, magic heal nearbi, unit heal
Topic 29:
miss, caus, caus miss, attacks, miss attacks, caus miss attacks, blind, area, caus attack, unit caus
Topic 30:
mana, equal, lose mana, point mana, lose, pool, mana pool, total mana, absorb, restor
Topic 31:
extra, extra damag, deal extra damag, deal extra, damag attack, deal, chanc deal, nearbi unit attacking, extra damag nearbi, damag nearbi unit
Topic 32:
critic, strike, critic strike, critic damag, chanc, chanc deal, attack, ad, hit, victim
Topic 33:
ani, teleport, snowbal, teleport ani, ani enemi, tether, ani point, launch, point, teleport ani point
Topic 34:
creat, illus, deal damage, creat illus, terrorblad, damage, terrorblad deal damage, illus terrorblad deal, illus terrorblad, terrorblad deal
Topic 35:
enemies, nearbi enemies, nearbi, damag nearbi enemies, damag nearbi, slow nearbi enemies, slow nearbi, attacked, attack deal, trap slow nearbi
Topic 36:
damage, damage upgrad, upgrad, deal massiv, massiv, deal massiv damage, massiv damage, massiv damage upgrad, deal, magic
Topic 37:
gain, gain bonus, bonus, bonus damage, damage, gain bonus damage, kill, unit, die, avoid
Topic 38:
lightn, bolt, bolt lightn, strike, strike enemi, lightn bolt, cast lightn, cast lightn bolt, true sight, sight
Topic 39:
second, damag second, burn, deal damag second, golem, damag, everi, enemi damag second, golem nearbi, golem nearbi enemi

In [6]:
# find the ability with `index` in descriptions DataFrame
# and define categories of this ability
index = 19
test = tfidf_vectorizer.transform([descriptions['description'][index]])
weights = nmf.transform(test)
topics = weights.argsort()[0][-3:]

print('-' * len(descriptions['name'][index]))

for topic in topics:
    for i in nmf.components_[topic].argsort()[-3:]:
        print(tf_features_names[i], end=', ')

Slams the ground with a mighty totem, creating an impassable ridge of stone while stunning and damaging enemy units along its line.
enemi, enemi unit, unit, 
deal extra damag, extra damag, extra, 
damag nearbi enemi, nearbi, nearbi enemi, 

In [7]:
# distribution of categories
%matplotlib inline
categories_dist = dict()
skills_by_category = dict()

for index, text in enumerate(descriptions['description']):
    vect = tfidf_vectorizer.transform([text])
    weights = nmf.transform(vect)
    topics = weights.argsort()[0][-2:]
    for topic in topics:
        # find words describing topic
        for i in nmf.components_[topic].argsort()[-1:]:
            skills_by_category.setdefault(tf_features_names[i], [])
# categories_dist = categories_dist.dropna()
categories_dist = pd.Series({k: len(skills_by_category[k]) for k in skills_by_category})
# categories_dist.plot()

/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/matplotlib/ UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/matplotlib/ UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

In [8]:
print('#categories', categories_dist.shape[0])

NOTE: the name of category can be misleading, check the above example with fissure
#categories 39
ani                        37
armor                      20
attack speed               29
base level                 18
bonus                      10
creat                      43
critic                     14
damag time                  1
damage                    161
deal damag stun            10
dispel                     27
enemies                    78
extra                      12
friend                     11
gain                       79
great                       8
heal                       15
hero                       29
instanc provid increas      7
invis                      13
lightn                     28
mana                       24
miss                       32
movement speed             68
nearbi enemi                7
percentag                  30
point                      14
poison slow                 8
second                     27
short distanc teleport     16
silenc                      3
slow                       13
slow movement               8
spells                     11
spike                      10
stun                       11
uniqu                      15
unit                       18
ward                       11
dtype: int64

In [14]:
# print abilities by category
print(skills_by_category['deal damag stun'])

['dragon_slave', 'death_pulse', 'sonic_wave', 'shadow_wave', 'dual_breath', 'deafening_blast', 'fireblast', 'unrefined_fireblast', 'rip_tide', 'shockwave']

In [ ]:
# print categories which were separated good enough from others
sorted_abilities = dict()
for category in skills_by_category:
    if len(skills_by_category[category]) <= 30:
        sorted_abilities[category] = skills_by_category[category]

In [ ]:

In [ ]: