In [ ]:
from pymongo import MongoClient
In [ ]:
# TODO: read connection settings from config files or, better, a general db management class (talk to Donny)
conn = MongoClient('localhost', 57001, j=False)
db_sp = conn['...']  # submissions database (MPComplete 'jobs' collection)
db_sp.authenticate('...')
db_sp.jobs.count()
conn4 = MongoClient('localhost', 57004, j=False)
db_jp = conn4['...']  # core materials database ('materials' collection)
db_jp.authenticate('...')
db_jp.materials.count()
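In [ ]:
# Sketch of the config-file approach mentioned in the comment above (not wired up here):
# the 'db.yaml' file name and its keys are hypothetical placeholders, not part of the
# existing setup.
import yaml

def get_db(config_path, alias):
    """Open one authenticated database handle described under `alias` in a YAML config."""
    with open(config_path) as f:
        cfg = yaml.safe_load(f)[alias]  # e.g. {host: ..., port: ..., db: ..., user: ..., password: ...}
    conn = MongoClient(cfg['host'], cfg['port'], j=False)
    db = conn[cfg['db']]
    db.authenticate(cfg['user'], cfg['password'])
    return db

# db_sp = get_db('db.yaml', 'submissions')
# db_jp = get_db('db.yaml', 'materials')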
In [ ]:
from datetime import datetime
from collections import OrderedDict
import pytz, calendar
from bson.objectid import ObjectId
pacific = pytz.timezone('US/Pacific')
months = OrderedDict()
months['2014-0'] = ( # aggregate all users before operating time as offset
    ObjectId.from_datetime(
        pacific.localize(datetime(2014,1,1))
    ),
    ObjectId.from_datetime(
        pacific.localize(datetime(2014,12,31,23,59,59,999999))
    )
)
for year in [2015, 2016]: # operating time of MPComplete
    for month in range(1, 13):
        last_day = calendar.monthrange(year,month)[1]
        dt_start = pacific.localize(datetime(year,month,1,0,0,0,0))
        dt_end = pacific.localize(datetime(year,month,last_day,23,59,59,999999))
        months['{}-{}'.format(year, month)] = (
            ObjectId.from_datetime(dt_start),
            ObjectId.from_datetime(dt_end)
        )
print months.keys()
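In [ ]:
# Sanity check (illustrative only): ObjectId.from_datetime builds a dummy id whose leading
# bytes encode the UTC timestamp, so the '$gte'/'$lte' filters on '_id' below effectively
# select documents by creation time.
start, end = months['2015-6']
print start.generation_time, end.generation_time  # interval boundaries converted back to UTC
print start < end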
In [ ]:
from collections import Counter
import plotly.plotly as py
import plotly.graph_objs as go
In [ ]:
users = set() # keep track of user emails up to and including previous month
gateway_users = Counter()
gateway_users_month = OrderedDict()
gateway_new_users_month = OrderedDict()
submissions_month = OrderedDict()
pauling_file_submissions_month = OrderedDict()
for month,interval in months.iteritems(): # OrderedDict -> chronological order matters for the new-user tracking below
    gateway_users_month[month] = Counter()
    gateway_new_users_month[month] = Counter()
    pauling_file_submissions_month[month] = Counter()
    submissions_month[month] = OrderedDict()
    submissions = list(db_sp.jobs.find(
        {'about.remarks': 'MP user submission',
         '_id': {'$gte': interval[0], '$lte': interval[1]}},
        ['submitter_email', 'about.authors', 'state', 'submitted_at']))
    for s in submissions:
        # attribute the submission to the submitter if listed as an author, else to the first author
        author_emails = [a['email'] for a in s['about']['authors']]
        email = s['submitter_email'] if s['submitter_email'] in author_emails else author_emails[0]
        if email == 'sbajaj@lbl.gov':
            pauling_file_submissions_month[month][s['state']] += 1
            continue # skip Pauling File submissions
        if email not in submissions_month[month]:
            submissions_month[month][email] = Counter()
        submissions_month[month][email][s['state']] += 1
        gateway_users[email] += 1
        gateway_users_month[month][email] += 1 # all monthly users, all submissions
        if email not in users:
            gateway_new_users_month[month][email] += 1 # only new monthly users and their submissions
    users.update(gateway_users.keys()) # emails seen up to and including this month
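In [ ]:
# Quick look at one month's tallies (illustrative only): number of distinct submitters, how
# many of them were new that month, and each submitter's per-state submission counts.
m = '2016-5'  # any key from `months` works here
print len(gateway_users_month[m]), 'submitters,', len(gateway_new_users_month[m]), 'new'
print dict((email, dict(c)) for email, c in submissions_month[m].items())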
In [ ]:
monat, jahr = 10, 2016
print 'Since 2014-01-01, a total of', len(gateway_users), 'users submitted', \
sum(gateway_users.values()), 'possibly duplicate structures.',
gateway_users_year = Counter()
for month, emails in gateway_users_month.items():
    y, m = map(int, month.split('-'))
    if y < jahr or m < monat-1:
        continue # keep only months since month (monat-1) of year jahr
    print y, m
    for email, nsubs in emails.items():
        gateway_users_year[email] += nsubs
returning_gateway_users_fraction = 0
for email, nsubs in gateway_users_year.items():
    returning_gateway_users_fraction += int(nsubs > 1)
returning_gateway_users_fraction /= float(len(gateway_users_year))
print 'After', '{}-{}'.format(monat, jahr), 'alone,', 'Z={}'.format(len(gateway_users_year)), 'users submitted', \
'Y={}'.format(sum(gateway_users_year.values())), 'of them.', \
'{:.0f}%'.format(returning_gateway_users_fraction*100.), 'of users returned to submit more structures.',
submissions_year = Counter()
submissions_icsd_year = Counter()
for month, emails in submissions_month.items():
    if int(month.split('-')[0]) < jahr: # restrict to the current year
        continue
    for email, counter in emails.items():
        submissions_year['COMPLETED'] += counter['COMPLETED']
        submissions_year['READY/RUNNING'] += counter['READY'] + counter['RUNNING']
        submissions_year['FIZZLED/ERROR'] += counter['FIZZLED'] + counter['ERROR']
        submissions_year['REJECTED/DEFUSED'] += counter['REJECTED'] + counter['DEFUSED']
        if email == 'w6ye@ucsd.edu': # account used for the ICSD submissions
            submissions_icsd_year['COMPLETED'] += counter['COMPLETED']
            submissions_icsd_year['READY/RUNNING'] += counter['READY'] + counter['RUNNING']
            submissions_icsd_year['FIZZLED/ERROR'] += counter['FIZZLED'] + counter['ERROR']
            submissions_icsd_year['REJECTED/DEFUSED'] += counter['REJECTED'] + counter['DEFUSED']
#print month, submissions_year['FIZZLED/ERROR'] # (2696-1118)/(6191*0.44) = 58%
total_submissions = float(sum(submissions_year.values()))
completed_submissions = submissions_year['COMPLETED']/total_submissions*100.
ready_running_submissions = submissions_year['READY/RUNNING']/total_submissions*100.
rejected_defused_submissions = submissions_year['REJECTED/DEFUSED']/total_submissions*100.
errored_submissions = submissions_year['FIZZLED/ERROR']/total_submissions*100.
print '{:.0f}% (X={}) of the {} submissions were successfully completed and ' \
      '{:.0f}% are waiting to continue or start. Only {:.0f}% were rejected and ' \
      '{:.0f}% failed.'.format(
          completed_submissions, submissions_year['COMPLETED'], jahr, ready_running_submissions,
          rejected_defused_submissions, errored_submissions
      ),
print 'These numbers include the {} ICSD submissions, {} of which were successfully completed and ' \
'{} are waiting to continue or start ({} failed, {} rejected/defused).'.format(
sum(submissions_icsd_year.values()), submissions_icsd_year['COMPLETED'],
submissions_icsd_year['READY/RUNNING'], submissions_icsd_year['FIZZLED/ERROR'],
submissions_icsd_year['REJECTED/DEFUSED']
),
pauling_file_submissions_year = Counter()
for month, counter in pauling_file_submissions_month.items():
    if int(month.split('-')[0]) < jahr: # restrict to the current year
        continue
    pauling_file_submissions_year['COMPLETED'] += counter['COMPLETED']
    pauling_file_submissions_year['READY/RUNNING'] += counter['READY'] + counter['RUNNING']
    pauling_file_submissions_year['FIZZLED/ERROR'] += counter['FIZZLED'] + counter['ERROR']
    pauling_file_submissions_year['REJECTED/DEFUSED'] += counter['REJECTED'] + counter['DEFUSED']
total_submissions = float(sum(pauling_file_submissions_year.values()))
completed_submissions = pauling_file_submissions_year['COMPLETED']/total_submissions*100.
ready_running_submissions = pauling_file_submissions_year['READY/RUNNING']/total_submissions*100.
rejected_defused_submissions = pauling_file_submissions_year['REJECTED/DEFUSED']/total_submissions*100.
errored_submissions = pauling_file_submissions_year['FIZZLED/ERROR']/total_submissions*100.
print 'However, these numbers exclude the {} Pauling File submissions, ' \
'{:.0f}% of which were successfully completed and ' \
'{:.0f}% are waiting to continue or start ({:.0f}% failed).'.format(
int(total_submissions), completed_submissions, ready_running_submissions, errored_submissions
),
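In [ ]:
# The percentage breakdowns above repeat the same arithmetic; this helper (a sketch, not
# used by the cells above) makes the calculation explicit for any of the state counters.
def state_percentages(counter):
    """Convert a counter over the aggregated states into percentages of its total."""
    total = float(sum(counter.values()))
    return dict((state, count / total * 100.) for state, count in counter.items())

print state_percentages(submissions_year)
print state_percentages(pauling_file_submissions_year)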
In [ ]:
total_unique = 0
tr_x, tr_y = [], [[], [], []]
averages = OrderedDict()
for month, new_users in gateway_new_users_month.items():
    monthly_new = len(new_users)
    total_unique += monthly_new
    recurring_users = len(gateway_users_month[month]) - monthly_new
    current_year = int(month.split('-')[0])
    if current_year < 2015:
        continue # don't show prior to start of operating time
    tr_x.append(month)
    tr_y[0].append(total_unique)
    tr_y[1].append(monthly_new)
    tr_y[2].append(recurring_users)
    if current_year not in averages:
        averages[current_year] = {'new': 0, 'recurring': 0}
    averages[current_year]['new'] += monthly_new/12.
    averages[current_year]['recurring'] += recurring_users/12.
name = ['total users', 'new users', 'recurring users']
# append two dotted traces with the yearly averages of new and recurring users:
# 12 points per year, concatenated across years to line up with tr_x
for yr_idx, (yr, trcs) in enumerate(averages.items()):
    for trc_idx, (trc_name, trc_avg) in enumerate(trcs.items()):
        trc_avg_arr = [trc_avg]*12
        if yr_idx == 0:
            tr_y.append(trc_avg_arr)
            name.append('yearly averages')
        else:
            tr_y[trc_idx+3] += trc_avg_arr
data = [go.Scatter(
x=tr_x, y=t, name=name[i], mode='lines',
line=dict(color=('black'), dash='dot') if i > 2 else dict(),
showlegend=bool(i<4)
) for i,t in enumerate(tr_y)
]
layout = dict(title = 'Growth in MPComplete Users', yaxis = dict(type='log'))
fig = dict(data=data, layout=layout)
py.iplot(fig, filename='mpcomplete-users')
In [ ]:
tr_x, tr_y = [], []
corr_avg = {'numer': 0, 'denom': 0} # average number of submissions/month for months with <300 submissions
for month, emails in submissions_month.items():
    current_year = int(month.split('-')[0])
    if current_year < 2015:
        continue # don't show prior to start of operating time
    nsubs = 0
    for email, counter in emails.items():
        nsubs += sum(counter.values())
    tr_x.append(month)
    tr_y.append(nsubs)
    if nsubs < 300:
        corr_avg['numer'] += nsubs
        corr_avg['denom'] += 1
print corr_avg['numer']/float(corr_avg['denom'])
trace = go.Bar(x=tr_x, y=tr_y)
layout = dict(title='Number of Submissions per Month', yaxis=dict(type='log', range=[0.5,3.5]))
fig = dict(data=[trace], layout=layout)
py.iplot(fig, filename='mpcomplete-submissions')
In [ ]:
from toolz import concat, pluck
In [ ]:
oid_start = months['2016-1'][0]
oid_end = months['2016-12'][1]
submissions = list(db_sp.jobs.find(
{'about.remarks': 'MP user submission',
'_id': {'$gte': oid_start, '$lte': oid_end}},
['task_dict']))
print(len(submissions))
# flatten all task ids referenced by the submissions' task_dict entries
task_ids = list(concat([d.values() for d in pluck('task_dict', submissions) if d]))
print(len(task_ids))
material_ids = list(pluck('task_id', db_jp.materials.find({
'_id': {'$gte': oid_start}, # sometimes documents are re-built, so no '$lte' spec here
'snl_final.about.remarks': 'MP user submission',
'task_ids': {'$in': task_ids}},
['task_id'])))
print(len(material_ids))
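In [ ]:
# Rough conversion rate (a sketch with caveats): share of the 2016 user submissions that are
# linked to a built materials document. This treats each submission as yielding at most one
# material, which the task_dict/task_ids join above does not strictly guarantee.
print '{:.0f}%'.format(100. * len(material_ids) / len(submissions))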