Set DB connections


In [ ]:
from pymongo import MongoClient

In [ ]:
# TODO: move the connection settings into config files or, better, a general
# db management class (talk to Donny).
# Database names and credentials appear only as '...' placeholders in this file.

# First MongoDB instance (local port 57001); `db_sp` provides the `jobs`
# collection queried in the cells below.
# NOTE(review): `j` looks like pymongo's journal write-concern option -- confirm.
conn = MongoClient('localhost', 57001, j=False)
db_sp = conn['...']
db_sp.authenticate('...')
# Result is discarded (not the last expression of the cell); presumably a
# quick check that the authenticated connection works.
db_sp.jobs.count()

# Second MongoDB instance (local port 57004); `db_jp` provides the `materials`
# collection queried in the last cell.
conn4 = MongoClient('localhost', 57004, j=False)
db_jp = conn4['...']
db_jp.authenticate('...')
# Last expression of the cell, so the materials count is shown as cell output.
db_jp.materials.count()

Set time interval of interest


In [ ]:
from datetime import datetime
from collections import OrderedDict
import pytz, calendar
from bson.objectid import ObjectId

pacific = pytz.timezone('US/Pacific')
months = OrderedDict()
months['2014-0'] = ( # aggregate all users before operating time as offset
    ObjectId.from_datetime(
        pacific.localize(datetime(2014,1,1))
    ),
    ObjectId.from_datetime(
        pacific.localize(datetime(2014,12,31,23,59,59,999999))
    )
)
for year in [2015, 2016]: # operating time of MPComplete
    for month in range(1, 13):
        last_day = calendar.monthrange(year,month)[1]
        dt_start = pacific.localize(datetime(year,month,1,0,0,0,0))
        dt_end = pacific.localize(datetime(year,month,last_day,23,59,59,999999))
        months['{}-{}'.format(year, month)] = (
            ObjectId.from_datetime(dt_start),
            ObjectId.from_datetime(dt_end)
        )
print months.keys()

Unique gateway users during time interval


In [ ]:
from collections import Counter
# import plotly.plotly as py
# import plotly.graph_objs as go

In [ ]:
# Accumulators filled month by month from the `jobs` collection.
users = set()                                   # emails seen up to and including the previous month
gateway_users = Counter()                       # email -> submissions over all months
gateway_users_month = OrderedDict()             # month -> Counter(email -> submissions)
gateway_new_users_month = OrderedDict()         # month -> Counter(first-time email -> submissions)
submissions_month = OrderedDict()               # month -> OrderedDict(email -> Counter(state -> count))
pauling_file_submissions_month = OrderedDict()  # month -> Counter(state -> count)

for month, interval in months.iteritems():
    oid_first, oid_last = interval
    month_users = Counter()
    month_new_users = Counter()
    month_pauling = Counter()
    month_by_email = OrderedDict()
    gateway_users_month[month] = month_users
    gateway_new_users_month[month] = month_new_users
    pauling_file_submissions_month[month] = month_pauling
    submissions_month[month] = month_by_email

    # All user submissions whose _id falls inside this month's ObjectId range.
    query = {
        'about.remarks': 'MP user submission',
        '_id': {'$gte': oid_first, '$lte': oid_last},
    }
    fields = ['submitter_email', 'about.authors', 'state', 'submitted_at']
    submissions = list(db_sp.jobs.find(query, fields))

    for s in submissions:
        # Attribute the submission to the submitter if listed as an author,
        # otherwise to the first author.
        author_emails = [a['email'] for a in s['about']['authors']]
        if s['submitter_email'] in author_emails:
            email = s['submitter_email']
        else:
            email = author_emails[0]

        if email == 'sbajaj@lbl.gov':
            # Pauling File submissions are tallied separately and skipped below.
            month_pauling[s['state']] += 1
            continue

        month_by_email.setdefault(email, Counter())[s['state']] += 1
        gateway_users[email] += 1
        month_users[email] += 1  # all monthly users, all submissions
        if email not in users:
            month_new_users[email] += 1  # only new monthly users and their submissions

    # Only after the month is complete do its users count as "known".
    users.update(gateway_users.keys())

In [ ]:
monat, jahr = 10, 2016

print 'Since 2014-01-01, a total of', len(gateway_users), 'users submitted', \
    sum(gateway_users.values()), 'possibly duplicate structures.',

gateway_users_year = Counter()
for month, emails in gateway_users_month.items():
    y, m = map(int, month.split('-'))
    if y < jahr or m < monat-1:
        continue
    print y, m
    for email, nsubs in emails.items():
        gateway_users_year[email] += nsubs

returning_gateway_users_fraction = 0
for email, nsubs in gateway_users_year.items():
    returning_gateway_users_fraction += int(nsubs > 1)
returning_gateway_users_fraction /= float(len(gateway_users_year))

print 'After', '{}-{}'.format(monat, jahr), 'alone,', 'Z={}'.format(len(gateway_users_year)), 'users submitted', \
    'Y={}'.format(sum(gateway_users_year.values())), 'of them.', \
    '{:.0f}%'.format(returning_gateway_users_fraction*100.), 'of users returned to submit more structures.',

submissions_year = Counter()
submissions_icsd_year = Counter()
for month, emails in submissions_month.items():
    if int(month.split('-')[0]) < year:
        continue
    for email, counter in emails.items():
        submissions_year['COMPLETED'] += counter['COMPLETED']
        submissions_year['READY/RUNNING'] += counter['READY'] + counter['RUNNING']
        submissions_year['FIZZLED/ERROR'] += counter['FIZZLED'] + counter['ERROR']
        submissions_year['REJECTED/DEFUSED'] += counter['REJECTED'] + counter['DEFUSED']
        if email == 'w6ye@ucsd.edu':
            submissions_icsd_year['COMPLETED'] += counter['COMPLETED']
            submissions_icsd_year['READY/RUNNING'] += counter['READY'] + counter['RUNNING']
            submissions_icsd_year['FIZZLED/ERROR'] += counter['FIZZLED'] + counter['ERROR']
            submissions_icsd_year['REJECTED/DEFUSED'] += counter['REJECTED'] + counter['DEFUSED']
    #print month, submissions_year['FIZZLED/ERROR'] # (2696-1118)/(6191*0.44) = 58%

total_submissions = float(sum(submissions_year.values()))
completed_submissions = submissions_year['COMPLETED']/total_submissions*100.
ready_running_submissions = submissions_year['READY/RUNNING']/total_submissions*100.
rejected_defused_submissions = submissions_year['REJECTED/DEFUSED']/total_submissions*100.
errored_submissions = submissions_year['FIZZLED/ERROR']/total_submissions*100.

print '{:.0f}% (X={}) of the {} submissions were successfully completed and ' \
      '{:.0f}% are waiting to continue or start. Only {:.0f}% were rejected and ' \
      '{:.0f}% failed.'.format(
        completed_submissions, submissions_year['COMPLETED'], year, ready_running_submissions,
        rejected_defused_submissions, errored_submissions
    ),

print 'These numbers include the {} ICSD submissions, {} of which were successfully completed and ' \
      '{} are waiting to continue or start ({} failed, {} rejected/defused).'.format(
        sum(submissions_icsd_year.values()), submissions_icsd_year['COMPLETED'],
        submissions_icsd_year['READY/RUNNING'], submissions_icsd_year['FIZZLED/ERROR'],
        submissions_icsd_year['REJECTED/DEFUSED']
    ), 
        

pauling_file_submissions_year = Counter()
for month, counter in pauling_file_submissions_month.items():
    if int(month.split('-')[0]) < year:
        continue
    pauling_file_submissions_year['COMPLETED'] += counter['COMPLETED']
    pauling_file_submissions_year['READY/RUNNING'] += counter['READY'] + counter['RUNNING']
    pauling_file_submissions_year['FIZZLED/ERROR'] += counter['FIZZLED'] + counter['ERROR']
    pauling_file_submissions_year['REJECTED/DEFUSED'] += counter['REJECTED'] + counter['DEFUSED']

total_submissions = float(sum(pauling_file_submissions_year.values()))
completed_submissions = pauling_file_submissions_year['COMPLETED']/total_submissions*100.
ready_running_submissions = pauling_file_submissions_year['READY/RUNNING']/total_submissions*100.
rejected_defused_submissions = pauling_file_submissions_year['REJECTED/DEFUSED']/total_submissions*100.
errored_submissions = pauling_file_submissions_year['FIZZLED/ERROR']/total_submissions*100.

print 'However, these numbers excludes the {} Pauling File submissions, ' \
      '{:.0f}% of which were successfully completed and ' \
      '{:.0f}% are waiting to continue or start ({:.0f}% failed).'.format(
        int(total_submissions), completed_submissions, ready_running_submissions, errored_submissions
    ),

In [ ]:
# Build the traces for the user-growth plot.
# NOTE: this cell needs `go` and `py`; the only imports for them in this
# notebook (plotly.graph_objs / plotly.plotly) are commented out in the
# imports cell above and must be re-enabled before running.
total_unique = 0
tr_x, tr_y = [], [[], [], []]  # tr_y: cumulative total, new per month, recurring per month
averages = OrderedDict()       # year -> {'new': monthly average, 'recurring': monthly average}

for month, new_users in gateway_new_users_month.items():
    monthly_new = len(new_users)
    total_unique += monthly_new
    recurring_users = len(gateway_users_month[month]) - monthly_new
    current_year = int(month.split('-')[0])
    # Months before 2015 only feed the running total; they are not plotted.
    if current_year >= 2015:
        tr_x.append(month)
        for series, value in zip(tr_y, (total_unique, monthly_new, recurring_users)):
            series.append(value)
        year_avg = averages.setdefault(current_year, {'new': 0, 'recurring': 0})
        # Each month contributes a twelfth, so the sum over a year is its monthly mean.
        year_avg['new'] += monthly_new/12.
        year_avg['recurring'] += recurring_users/12.

# Append one flat line per average type; later years extend the same lines.
name = ['total users', 'new users', 'recurring users']
for yr_idx, (yr, trcs) in enumerate(averages.items()):
    for trc_idx, (trc_name, trc_avg) in enumerate(trcs.items()):
        trc_avg_arr = [trc_avg]*12
        if yr_idx:
            tr_y[trc_idx+3].extend(trc_avg_arr)
        else:
            tr_y.append(trc_avg_arr)
            name.append('yearly averages')

# Traces after the first three are the averages: dotted black, and only the
# first of them gets a legend entry.
data = []
for i, t in enumerate(tr_y):
    line_style = dict(color=('black'), dash='dot') if i > 2 else dict()
    data.append(go.Scatter(
        x=tr_x, y=t, name=name[i], mode='lines',
        line=line_style,
        showlegend=bool(i<4)
    ))

layout = {'title': 'Growth in MPComplete Users', 'yaxis': {'type': 'log'}}
fig = {'data': data, 'layout': layout}
py.iplot(fig, filename='mpcomplete-users')

In [ ]:
tr_x, tr_y = [], []
corr_avg = {'numer': 0, 'denom': 0} # average number of submissions/month for months with <300 submissions
for month, emails in submissions_month.items():
    current_year = int(month.split('-')[0])
    if current_year < 2015:
        continue # don't show prior to start of operating time
    nsubs = 0
    for email, counter in emails.items():
        nsubs += sum(counter.values())
    tr_x.append(month)
    tr_y.append(nsubs)
    if nsubs < 300:
        corr_avg['numer'] += nsubs
        corr_avg['denom'] += 1

print corr_avg['numer']/float(corr_avg['denom'])

trace = go.Bar(x=tr_x, y=tr_y)
layout = dict(title='Number of Submissions per Month', yaxis=dict(type='log', range=[0.5,3.5]))
fig = dict(data=[trace], layout=layout)
py.iplot(fig, filename='mpcomplete-submissions')

New materials built during time interval


In [ ]:
from toolz import concat, pluck

In [ ]:
# ObjectId range spanning all of 2016.
oid_start, oid_end = months['2016-1'][0], months['2016-12'][1]

# 1) User submissions created within that range; only `task_dict` is fetched.
job_query = {
    'about.remarks': 'MP user submission',
    '_id': {'$gte': oid_start, '$lte': oid_end},
}
submissions = list(db_sp.jobs.find(job_query, ['task_dict']))
print(len(submissions))

# 2) Flatten the values of every non-empty task_dict into one list of task ids.
filled_task_dicts = (d for d in pluck('task_dict', submissions) if d)
task_ids = list(concat(d.values() for d in filled_task_dicts))
print(len(task_ids))

# 3) Materials that reference any of those tasks.
material_query = {
    '_id': {'$gte': oid_start}, # sometimes documents are re-built, so no '$lte' spec here
    'snl_final.about.remarks': 'MP user submission',
    'task_ids': {'$in': task_ids},
}
material_docs = db_jp.materials.find(material_query, ['task_id'])
material_ids = list(pluck('task_id', material_docs))
print(len(material_ids))