In [ ]:
"""
In theory this should work with reasonable outputs from PPMS system. One needs certain fields, like System, Year. This
script generates various useful statistics for ScopeM members.
Assumptions:
1. Paid project hours are:
* coming from project LM/EM or other EM projects at 0 / 40 / 200 CHF / h
* when using Hour's value metrics (default)
** if activity brings [0,40)CHF income it is assumed that its duration is equal to the number of hours recorded
When the income is suggesting more than 40CHF / hour duration is COMPUTED such as each 40CHF correspond to one hour
2. Unpaid project hours are:
* coming from Trials
* coming from personal R&D
Python:
* requires python-3.2
:bug:
None known.
:organization:
ETH
"""
__authors__="""Szymon Stoma"""
__contact__="xxx"
__license__="Cecill-C"
__date__="17-12-05"
__version__="0.1"
__docformat__= "restructuredtext en"
# ---------------------------------------------------------- imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# ---------------------------------------------------------- const
cfg_sstoma_id = "Stoma Szymon"
cfg_andrzej_id = "Rzepiela Andrzej"
cfg_sfn_id = "Noerrelykke Simon"
cfg_ppms_csv_path = 'detailed-list.csv'
cfg_ppms_csv_path = 'detailed-list-2017.csv'
cfg_hour_baseline = 40 #CHF per h
# ---------------------------------------------------------- reading files & adding columns
# reading and listing headers (tricks and hacks applied to fck strange output format)
df = pd.read_csv(cfg_ppms_csv_path, encoding='cp1250', skiprows=[0])
list(df.columns.values)
# removing records without a project since they are useless
df = df[ pd.notnull(df.Project) ]
# adding month to stratify data
dates = pd.to_datetime(df.Date, format="%d/%m/%Y")
df['Month'] = dates.dt.month
df['Year'] = dates.dt.year
# adding time in hours (decimal)
df['Total Booked Hours'] = df['Length booked'] / 60.
# making sens of Amount* column (* is added when it was not paid yet...)
def convert_amount_starred( a ):
if a.find('*') == -1: return float(a)
else: return float(a[:-1])
df['Amount*'] = df['Amount*'].apply(convert_amount_starred)
# money can be stored everywhere
df['Total Charged [CHF]'] = df['Amount*'] + df['Prepaid']
# adding ratio of charged to all (under assumption 40 CHF / hr.)
df['Total Charged Hours'] = df['Total Booked Hours'] * (df['Total Charged [CHF]'] > 0)
# backcalculating:
# for 0 CHF the actual length will be used
# for >40 CHF the hours will be incrased
df["Hour's value"] = pd.DataFrame([df['Total Booked Hours'], df['Total Charged [CHF]'] / cfg_hour_baseline]).max()
# adding a field for classification
df["PPMS_category"] = "NONE"
# listing columns
list(df.columns.values)
# ---------------------------------------------------------- filtering
# prepare filters
# divide into project types
cat_paid_projects = (df.Project.str.match('[LE]M/.*') | df.Project.str.match('\d+_\w*') | df.Project.str.match('\d+ \w*') | df.Project.str.match('\d+'))
cat_unpaid_projects = (df.Project.str.match('_Trials') | df.Project.str.match('_LM/ScopeM/Trials')| df.Project.str.match('0060_Trials')) # currently (2017) some projects which should be Trials are in Paid_projects
cat_rd_personal_dev = df.Project.str.match('.*_PD') | df.Project.str.match('_LM/ScopeM/Scientific Collaboration')
cat_teaching = (df.Project.str.match('.*(MOOC).*') | df.Project.str.match('_Teach.*') | df.Project.str.match('_LM/ScopeM/Schools') | df.Project.str.match('_Microscopy_Program') )
cat_maintenance = (df.Project.str.match('_Main.*') | df.Project.str.match('_LM/ScopeM/Maintenance') | df.Project.str.match('EAWAG SP5') | df.Project.str.match('_LM/ScopeM/Visitors') )
cat_rd_other = df.Project.str.match('.*(CellStar|Crispr|Flu).*') | df.Project.str.match('_H2020*') | df.Project.str.match('ETHProj201*')
cat_sstoma = (df.System == cfg_sstoma_id)
cat_sfn = (df.System == cfg_sfn_id)
cat_andrzej = (df.System == cfg_andrzej_id)
cat_ida = (df.System == cfg_sstoma_id) | (df.System == cfg_sfn_id) | (df.System == cfg_andrzej_id)
# assign a PPMS tag based on filters
# careful, order matches since some things fall into two exp.
df.loc[cat_maintenance, "PPMS_category"] = "PPMS_Maintenance"
df.loc[cat_paid_projects, "PPMS_category"] = "PPMS_Projects"
df.loc[cat_unpaid_projects, "PPMS_category"] = "PPMS_Trials"
df.loc[cat_rd_personal_dev, "PPMS_category"] = "PPMS_RD_Personal"
df.loc[cat_rd_other, "PPMS_category"] = "PPMS_RD"
df.loc[cat_teaching, "PPMS_category"] = "PPMS_Teaching"
# define groups of users: so far only IDA
users_labels = df.System.unique()
users_filters = [df.System == u for u in users_labels]
users = list(zip(users_labels, users_filters))
users.append(("IDA", cat_ida))
# ---------------------------------------------------------- displaying stats
# if activity brings [0,40)CHF income it is assumed that its duration is equal to the number of hours recorded
# When the income is suggesting more than 40CHF / hour duration is COMPUTED such as each 40CHF correspond to one hour
metrics = "Hour's value"
#metrics = 'Total Booked Hours'
# do text outputs and compute useful stats
print('# Summary: PPMS_project AVG booked hours using metrics ' + metrics )
u_in_years = df.groupby("Year")["System"].nunique()
print(df[ df.PPMS_category == "PPMS_Projects" ].groupby(['Year'])[ metrics ].aggregate(np.sum) / u_in_years)
print('\n')
print('# Summary: Total income from PPMS_projects' )
print(df[ df.PPMS_category == "PPMS_Projects" ].groupby(['Year'])[ 'Total Charged [CHF]' ].aggregate(np.sum))
print('\n')
with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
for cat in pd.unique(df.PPMS_category):
print('# Summary: '+ cat+' hours computed using metrics ' + metrics )
print(df[ df.PPMS_category == cat].groupby([ 'Year', 'System'])[ metrics ].sum().round(0))
print('\n')
print('# Summary: Total booked hours computed using metrics ' + metrics )
print(df.groupby([ 'Year', 'System'])[ metrics ].sum().round(0))
print('\n')
print('# Summary: Total booked hours computed using metrics ' + metrics )
print(df.groupby([ 'Year', 'System', 'PPMS_category'])[ metrics ].sum().round(0))
print('\n')
print('# Summary: Total booked hours computed using metrics ' + metrics )
print(df.groupby([ 'Year', 'System', 'PPMS_category', 'Project' ])[ metrics ].sum().round(0))
print('\n')
# do plots by PPMS_category
for cat in pd.unique(df.PPMS_category):
df[ df.PPMS_category == cat ].groupby(['Year', 'System'])[ metrics ].aggregate(np.sum).unstack().plot(kind = 'bar', title = u[ 0 ]+': by Year, '+cat).legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.ylabel(metrics)
plt.show()
# do plots foir each user
for u in users:
df[ u[1] ].groupby(['Year'])[ metrics ].aggregate(np.sum).plot(kind = 'bar', title = u[ 0 ]+': by Year, total PPMS_hours').legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.ylabel(metrics)
plt.show()
for u in users:
df[ u[1] ].groupby(['Year', 'PPMS_category'])[ metrics ].aggregate(np.sum).unstack().plot(kind = 'bar', title = u[ 0 ]+': by Year, PPMS_category').legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.ylabel(metrics)
plt.show()