In [25]:
%pylab
from __future__ import print_function
%matplotlib inline
In [26]:
from __future__ import print_function
import csv
from src import main, utils
import sys
projects = main.load_projects()
interests = ['argouml v0.26.2 method', 'mucommander v0.8.5 method']

# Collect every loaded project whose "name version level" label is listed in
# `interests`; everything else is ignored.
p = []
for project in projects:
    label = ' '.join([project.name, project.version, project.level])
    if label in interests:
        p.append(project)
In [27]:
# Work on the first project that passed the interest filter.
# NOTE(review): later cells hard-code an argouml path and the ids 5258/5088,
# so p[0] is presumably expected to be the argouml project -- confirm.
project = p[0]
print(project)

# Raw inputs for the selected project.
repos = main.load_repos(project)
goldsets = main.load_goldsets(project)
queries = main.create_queries(project)

# Two corpora: one built from the release, one built from changesets.
snapshot = main.create_release_corpus(project, repos)
changesets = main.create_corpus(
    project,
    repos,
    main.ChangesetCorpus,
    use_level=False,
)

# One LDA model per corpus; the second value returned is not used here.
snapshot_lda, _ = main.create_lda_model(
    project, snapshot, None, "Release", use_level=True
)
changeset_lda, _ = main.create_lda_model(
    project, changesets, None, "Changeset", use_level=False
)

# Previously stored rankings for each model.
snapshot_ranks = main.read_ranks(project, "release")
changeset_ranks = main.read_ranks(project, "changeset")

# Re-key the get_frms() triples by their middle element so later cells can
# look them up by id (e.g. snapshot_frms[5258]); the value keeps the first and
# third elements, and the third is later passed on as a document id.
snapshot_frms = {
    key: (first, third)
    for first, key, third in main.get_frms(goldsets, snapshot_ranks)
}
changeset_frms = {
    key: (first, third)
    for first, key, third in main.get_frms(goldsets, changeset_ranks)
}
In [28]:
def _ranked_topics(model, corpus, ids):
    """Return ``{id: topics}`` for the selected documents, strongest topic first.

    Thin wrapper around ``main.get_topics(model, corpus, by_ids=ids,
    full=False)``. For every ``(entry, topics)`` pair it yields, the topics are
    sorted by their second element in descending order and stored under
    ``entry[0]``.

    Args:
        model: topic model handed straight to ``main.get_topics``.
        corpus: corpus whose documents are looked up by id.
        ids: list of document ids to select.

    Returns:
        dict mapping ``entry[0]`` to the sorted topic list.
    """
    ranked = dict()
    for entry, topics in main.get_topics(model, corpus, by_ids=ids, full=False):
        ranked[entry[0]] = sorted(topics, key=lambda pair: pair[1], reverse=True)
    return ranked


# Topics of the two queries under each model.
q = _ranked_topics(snapshot_lda, queries, ["5258", "5088"])
qc = _ranked_topics(changeset_lda, queries, ["5258", "5088"])

# Topics of the snapshot document recorded in the *_frms entry for 5258,
# looked up under each model (both lookups read the snapshot corpus).
q5258 = _ranked_topics(snapshot_lda, snapshot, [snapshot_frms[5258][1]])
qc5258 = _ranked_topics(changeset_lda, snapshot, [changeset_frms[5258][1]])

# Same for 5088.
q5088 = _ranked_topics(snapshot_lda, snapshot, [snapshot_frms[5088][1]])
qc5088 = _ranked_topics(changeset_lda, snapshot, [changeset_frms[5088][1]])
In [29]:
def printer(q, model, min_weight=0.05):
    """Print every entry's topics and the heavier words of each topic.

    For each ``(qid, topics)`` item of ``q`` a header line with the number of
    topics is printed. Then, for each topic, its first two elements are
    printed, followed by every pair from ``model.show_topic(topic_id)`` whose
    first element is at least ``min_weight``.

    Args:
        q: mapping of id -> sequence of topics; each topic is indexable and
            ``topic[0]`` is passed to ``model.show_topic``.
        model: object exposing ``show_topic(topic_id)`` and returning
            two-element sequences.
            NOTE(review): the comparison on element 0 presumably expects
            ``(weight, word)`` ordering -- confirm against the installed
            library version.
        min_weight: inclusive lower bound on a pair's first element for the
            pair to be printed. Defaults to 0.05, the previously hard-coded
            cut-off.
    """
    for qid, topics in q.items():
        print(qid, "num topics:", len(topics))
        for topic in topics:
            print(" ", topic[0], topic[1])
            for word in model.show_topic(topic[0]):
                if word[0] >= min_weight:
                    print(" {0} {1}".format(*word))
In [30]:
# Show the stored entry for each id of interest, snapshot model first and
# changeset model second.
for frm_id in (5258, 5088):
    print(snapshot_frms[frm_id])
    print(changeset_frms[frm_id])
In [31]:
# Shell escape: dump the method-level goldset file 5258.txt of argouml v0.26.2.
!cat data/argouml/v0.26.2/goldsets/method/5258.txt
In [32]:
# Shell escape: dump the method-level goldset file 5088.txt of argouml v0.26.2.
!cat data/argouml/v0.26.2/goldsets/method/5088.txt
In [33]:
# Presumably makes iterating `queries` yield (document, metadata) pairs, which
# is how the items are unpacked below -- confirm against the corpus class.
# NOTE(review): the flag stays set on `queries` after this cell.
queries.metadata = True

# Keep only the two queries of interest, matched on the first metadata field.
wanted_ids = ["5258", "5088"]
qs = [entry for entry in queries if entry[1][0] in wanted_ids]

for query, metadata in qs:
    # Order the (word id, freq) pairs of the query by freq, largest first.
    by_freq = sorted(query, key=lambda pair: pair[1], reverse=True)
    words = []
    for wid, freq in by_freq:
        words.append((freq, queries.id2word[wid]))
    print(metadata[0], "num words:", len(words))
    for freq, term in words:
        print(" {1} ({0})".format(freq, term))
In [34]:
# Topic listing for the two queries, one section per model, separated by a
# blank line.
sections = (("Snapshot", q, snapshot_lda), ("Changeset", qc, changeset_lda))
for position, (heading, topic_map, lda) in enumerate(sections):
    if position:
        print()
    print(heading)
    printer(topic_map, lda)
In [35]:
# Topic listing for the 5258 document, one section per model, separated by a
# blank line.
sections = (("Snapshot", q5258, snapshot_lda), ("Changeset", qc5258, changeset_lda))
for position, (heading, topic_map, lda) in enumerate(sections):
    if position:
        print()
    print(heading)
    printer(topic_map, lda)
In [36]:
# Topic listing for the 5088 document; every section (including the first) is
# preceded by a blank line.
for heading, topic_map, lda in (
    ("Snapshot", q5088, snapshot_lda),
    ("Changeset", qc5088, changeset_lda),
):
    print()
    print(heading)
    printer(topic_map, lda)
In [ ]: