In [1]:
from __future__ import print_function
from src import main

In [2]:
# Load every configured project and keep only the ArgoUML entries.
projects = main.load_projects()
# Build a real list rather than calling filter(): on Python 3 filter() returns
# a lazy iterator, which cannot be indexed when a later cell does prjs[1].
prjs = [p for p in projects if p.name == "argouml"]

In [3]:
# Select the second ArgoUML entry (index 1) and print it so the chosen
# configuration is recorded in the cell output.
# NOTE(review): the positional index relies on the order in which
# main.load_projects() yields entries — confirm that order is stable.
prj = prjs[1]
print(prj)


Project(name='argouml', printable_name='ArgoUML', version='v0.22', level='method', ref='refs/tags/VERSION_0_22', alpha=None, eta=None, passes=5, iterations=1000, num_topics=500, src_url='http://argouml-downloads.tigris.org/nonav/argouml-0.22/ArgoUML-0.22-src.zip', data_path='data/argouml/', full_path='data/argouml/v0.22/', src_path='data/argouml/v0.22/src')

In [4]:
# Load the repositories for the selected project; both corpora are built from them.
repos = main.load_repos(prj)
# Corpus built with the main.TaserSnapshotCorpus class.
snapshot = main.create_corpus(prj, repos, main.TaserSnapshotCorpus)
# Corpus built with the main.ChangesetCorpus class. use_level=False is passed
# only for this corpus.
# NOTE(review): presumably this ignores the project's `level` setting — confirm in src.main.
changeset = main.create_corpus(prj, repos, main.ChangesetCorpus, use_level=False)

In [5]:
# Display the sizes of the two corpora as a (snapshot, changeset) tuple.
len(snapshot), len(changeset)


Out[5]:
(12353, 9923)

In [6]:
# Turn the corpus' metadata flag on *before* materialising it: later cells
# unpack each stored item as a (doc, meta) pair, so the flag must be set first.
snapshot.metadata = True
# Materialise the whole corpus in memory so it can be filtered and re-iterated.
snapshot_list = list(snapshot)

In [7]:
# Switch the metadata flag off on each corpus before handing it to
# main.create_lda_model.
# NOTE(review): presumably so iteration yields bare documents for training — confirm.
snapshot.metadata = False
# Each call returns a pair; the second element (snap_fn / change_fn) is kept
# but not used by the cells visible in this notebook.
snap_lda, snap_fn = main.create_lda_model(prj, snapshot, snapshot.id2word, "Release")
changeset.metadata = False
# use_level=False mirrors the flag used when the changeset corpus was created.
change_lda, change_fn = main.create_lda_model(prj, changeset, changeset.id2word, "Changeset", use_level=False)


WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy

In [8]:
# Display a sample of topics from the snapshot model, five words per topic.
snap_lda.show_topics(num_words=5)


Out[8]:
[u'0.058*classifier + 0.039*model + 0.036*object + 0.034*previous + 0.033*parse',
 u'0.132*submachine + 0.075*action + 0.043*uml + 0.040*list + 0.034*figs',
 u'0.176*xmi + 0.113*reference + 0.058*object + 0.039*ref + 0.031*string',
 u'0.130*list + 0.055*model + 0.042*choices + 0.033*selected + 0.030*library',
 u'0.056*writer + 0.047*model + 0.036*xmi + 0.034*output + 0.033*count',
 u'0.383*enabled + 0.110*button + 0.082*set + 0.057*code + 0.027*enable',
 u'0.303*language + 0.082*name + 0.064*lang + 0.045*string + 0.043*generator',
 u'0.055*target + 0.029*determine + 0.027*method + 0.027*layout + 0.026*title',
 u'0.168*child + 0.073*name + 0.072*tree + 0.052*parent + 0.051*object',
 u'0.094*value + 0.067*string + 0.049*memento + 0.049*param + 0.029*model']

In [9]:
# Display a sample of topics from the changeset model, five words per topic.
change_lda.show_topics(num_words=5)


Out[9]:
[u'0.025*set + 0.022*add + 0.020*name + 0.019*item + 0.017*element',
 u'0.031*para + 0.028*list + 0.021*org + 0.021*href + 0.019*argouml',
 u'0.165*token + 0.039*string + 0.037*exception + 0.027*set + 0.022*text',
 u'0.060*name + 0.044*build + 0.044*ant + 0.033*home + 0.030*property',
 u'0.043*graph + 0.037*model + 0.023*edge + 0.020*object + 0.017*event',
 u'0.087*utility + 0.069*stereotype + 0.041*singleton + 0.024*sep + 0.022*argo',
 u'0.017*die + 0.016*ocl + 0.014*title + 0.014*sect + 0.012*para',
 u'0.057*flavor + 0.054*drawing + 0.047*image + 0.039*clipboard + 0.038*diagram',
 u'0.117*object + 0.061*helper + 0.060*lang + 0.060*java + 0.050*model',
 u'0.184*modelelement + 0.067*owning + 0.042*enabler + 0.040*link + 0.026*undo']

In [10]:
# Collect the union of all entity names that appear in any issue's goldset.
gs = main.load_goldsets(prj)
goldset = set()
for issue, classes in gs:
    goldset |= classes

# Keep only the snapshot documents whose metadata name (entry[1][0]) is in the
# goldset. A list comprehension is used instead of filter() so the result is a
# reusable list on Python 3 as well; a lazy filter object would be exhausted
# after one pass and the iterator cell could not be re-run to restart the walk.
goldset_docs = [entry for entry in snapshot_list if entry[1][0] in goldset]

In [11]:
# Iterator over the goldset documents. The following cell calls next(it), so
# re-running that cell steps through the documents one at a time; re-run this
# cell to start again from the first document.
it = iter(goldset_docs)

In [25]:
# Advance to the next goldset document and show its metadata.
# Raises StopIteration once every goldset document has been visited.
doc, meta = next(it)
print(meta)

# Print the 16 most frequent terms of the document, highest count first.
for id_, f in sorted(doc, key=lambda x: x[1], reverse=True)[:16]:
    print(snapshot.id2word[id_], f)

print()
# Topic distribution of the same document under each model, strongest first.
# `s` and `c` are reused by the following cells.
s = sorted(snap_lda[doc], key=lambda x: x[1], reverse=True)
c = sorted(change_lda[doc], key=lambda x: x[1], reverse=True)
print(len(s), len(c))
# A model can report no topics for a document (e.g. when every topic
# probability is below its reporting threshold), so guard the top-topic
# lookup instead of letting s[0] / c[0] raise IndexError.
print(s[0] if s else None, c[0] if c else None)


(u'org.argouml.cognitive.ui.TabToDo.setTarget(Object)', u'taser_snapshot')
item 6
target 6
set 3
description 3
object 2
wizard 2
param 1
enabled 1
update 1
current 1
panel 1
tab 1
actions 1
step 1
directly 1
sets 1

11 7
(110, 0.21550797218872364) (88, 0.52802651538824008)

In [26]:
# Walk the strongest snapshot-model topics for the current document (at most
# 11 of them) and print each topic's id, its weight and its four top words.
for topic_id, weight in s[:11]:
    print(topic_id, weight)
    print(snap_lda.show_topic(topic_id, topn=4))
    print()


110 0.215507972189
[(0.52557765129093525, u'target'), (0.073104592259274861, u'set'), (0.061937107208129297, u'event'), (0.03858754656647944, u'org')]

22 0.183831421055
[(0.17430687718627111, u'item'), (0.03436755243794834, u'set'), (0.032147564966498468, u'target'), (0.027389042624312968, u'offender')]

72 0.168766196996
[(0.20443825726912221, u'step'), (0.10489351339446448, u'panel'), (0.065655416517407458, u'wizard'), (0.055260066823427371, u'item')]

466 0.11172058788
[(0.17706299893668043, u'action'), (0.14878792746180786, u'set'), (0.13434218196134937, u'localize'), (0.13288250477999122, u'translator')]

132 0.0873139957126
[(0.061471706729372487, u'fig'), (0.056419230693087075, u'text'), (0.048791446782496473, u'stereotype'), (0.029907841696302968, u'append')]

105 0.0511660602708
[(0.11770739179684299, u'target'), (0.11100148168932976, u'object'), (0.085305502657971308, u'source'), (0.082514604988587634, u'expression')]

46 0.0420854246185
[(0.38317946829028754, u'enabled'), (0.10964957557562469, u'button'), (0.082141092423643758, u'set'), (0.056797883471116938, u'code')]

133 0.0322618858922
[(0.2310841277168558, u'actions'), (0.077886234971077689, u'event'), (0.050652175669684392, u'object'), (0.046152836743998241, u'pop')]

360 0.0289465894835
[(0.12906581374670167, u'sender'), (0.10348407832439553, u'method'), (0.085525015250260267, u'handle'), (0.080538389595597323, u'param')]

51 0.0282812555876
[(0.35297755023091076, u'tab'), (0.10248920294268994, u'panel'), (0.038655711938384692, u'name'), (0.037005710071030679, u'pane')]

71 0.0221757531722
[(0.070718542427537015, u'diagram'), (0.043542444175177573, u'target'), (0.035932704100859622, u'set'), (0.035240321805707082, u'model')]


In [27]:
# Walk the strongest changeset-model topics for the current document (at most
# 11 of them) and print each topic's id, its weight and its four top words.
for topic_id, weight in c[:11]:
    print(topic_id, weight)
    print(change_lda.show_topic(topic_id, topn=4))
    print()


88 0.528026515388
[(0.023580645224864076, u'fig'), (0.017151698671636694, u'action'), (0.015513341210878356, u'node'), (0.014375489859590434, u'editor')]

34 0.172636474282
[(0.026084768139419876, u'linus'), (0.020634776176311549, u'tolke'), (0.018918245945228947, u'thomas'), (0.018880683627983081, u'thierry')]

348 0.107738639153
[(0.081025864870549719, u'set'), (0.075589543541814255, u'mvw'), (0.050745896144566774, u'inset'), (0.045685152999262478, u'text')]

167 0.0521812300199
[(0.078405858901823819, u'fig'), (0.07535738322254068, u'avoid'), (0.054455759694670904, u'edge'), (0.04249853179101002, u'dst')]

12 0.0505034259026
[(0.07147439814354567, u'object'), (0.071289188847634452, u'factory'), (0.03979859110691316, u'uml'), (0.034451418335677775, u'port')]

70 0.0283540318063
[(0.089534587895765183, u'beta'), (0.044688564912113482, u'argo'), (0.027915414992029808, u'version'), (0.024933294541555493, u'uml')]

209 0.0254785568656
[(0.087986713436332284, u'client'), (0.07982324490089357, u'supplier'), (0.069748992257782194, u'namespace'), (0.066813923923659368, u'uuid')]


In [14]:


In [14]:


In [14]:


In [14]:


In [14]:


In [14]:


In [14]:


In [14]:


In [14]:


In [14]:


In [14]: