In [1]:
import json
import matplotlib.pyplot as plt
In [2]:
# Load the project records. The cells below read '_id', 'description' and
# 'contributors' from each record.
# JSON text is UTF-8 by specification, so the encoding is passed explicitly
# instead of relying on the platform's locale-dependent default.
with open('data/projects_with_contributors.json', 'r', encoding='utf-8') as f:
    projects = json.load(f)
print(len(projects))
In [3]:
# Keep only the projects whose description splits into more than 10
# space-separated pieces.
pruned_projects = []
for candidate in projects:
    pieces = candidate['description'].split(' ')
    if len(pieces) > 10:
        pruned_projects.append(candidate)
In [4]:
# Number of projects remaining after the description-length filter.
print(len(pruned_projects))
In [5]:
# Lookup table: project id -> description, for the retained projects.
project_dict = {
    entry['_id']: entry['description'] for entry in pruned_projects
}
In [6]:
# Invert the project -> contributors relation into
# contributor -> list of project ids (in the order the projects are visited).
author_dict = {}
for entry in pruned_projects:
    for person in entry['contributors']:
        # setdefault creates the list on first sight of a contributor.
        author_dict.setdefault(person, []).append(entry['_id'])
In [7]:
# "Major" authors: contributors listed on more than 10 retained projects.
major_author = [
    name for name, project_ids in author_dict.items() if len(project_ids) > 10
]
print(len(major_author))
In [8]:
# Load the prediction file; the next cell reads an 'id' and a 'category'
# from each entry.
# The encoding is explicit so decoding does not depend on the platform locale.
with open('cnn/runs/1500578799/prediction.json', encoding='utf-8') as f:
    results = json.load(f)
In [9]:
# Lookup table: project id -> predicted category.
# If an id appears more than once, the last entry wins.
result = {entry['id']: entry['category'] for entry in results}
In [10]:
def get_count(score, authors=None, projects_by_author=None, categories=None, top_n=2):
    """Count authors whose categorised projects concentrate in a few categories.

    For every author, the projects that have a usable category are tallied
    per category. The author is counted when the share of those projects that
    falls into the ``top_n`` most frequent categories is strictly greater
    than ``score``.

    Args:
        score: Threshold on the concentration ratio (compared with ``>``).
        authors: Iterable of author keys to examine. Defaults to the
            notebook-level ``major_author``.
        projects_by_author: Mapping of author -> list of project ids.
            Defaults to the notebook-level ``author_dict``.
        categories: Mapping of project id -> category label. Entries whose
            category is ``-1`` are skipped. Defaults to the notebook-level
            ``result``.
        top_n: Number of most frequent categories to sum. Defaults to 2.

    Returns:
        int: Number of authors whose concentration ratio exceeds ``score``.

    Raises:
        KeyError: If an author is missing from ``projects_by_author`` or a
            project id is missing from ``categories``.
    """
    from collections import Counter

    # Fall back to the notebook globals so existing get_count(score) calls
    # keep working unchanged.
    if authors is None:
        authors = major_author
    if projects_by_author is None:
        projects_by_author = author_dict
    if categories is None:
        categories = result

    total = 0
    for author in authors:
        # A Counter accepts any category label, not just 0..9.
        tally = Counter(
            categories[project_id] for project_id in projects_by_author[author]
        )
        # Projects marked -1 take no part in the ratio.
        tally.pop(-1, None)
        classified = sum(tally.values())
        if classified == 0:
            # No usable category for this author: the ratio is undefined, so
            # the author is not counted (previously a ZeroDivisionError).
            continue
        concentrated = sum(n for _, n in tally.most_common(top_n))
        if concentrated / classified > score:
            total += 1
    return total
In [11]:
# Sweep the concentration threshold over 0.00, 0.05, ..., 0.95 and record,
# for each value, the fraction of major authors that get_count reports.
thresholds = [step / 20 for step in range(20)]
count = [get_count(threshold) / len(major_author) for threshold in thresholds]
In [12]:
def plot(count):
    """Draw the given series as a connected line with point markers.

    Args:
        count: Sequence of y-values. The i-th value is drawn at
            x = i / len(count); for a 20-element series this is the
            0.00, 0.05, ..., 0.95 grid the function originally hard-coded.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Derive the x grid from the data so the x and y lengths always match.
    n_points = len(count)
    thresholds = [i / n_points for i in range(n_points)]

    fig, ax1 = plt.subplots()
    ax1.set_xlabel('percentage of projects in top 2 categories ')
    ax1.set_ylabel('Percentage of authors in major_author')
    # The legend needs a labelled artist; without one matplotlib warns and
    # draws an empty legend.
    ax1.plot(thresholds, count, '-o', label='authors above threshold')
    ax1.legend(loc='best')
    plt.show()
In [13]:
# Render the series computed by the threshold sweep in the earlier cell.
plot(count)