In [ ]:

    
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
import logging

##us calendar stuff
from pandas.tseries.offsets import CustomBusinessDay, CustomBusinessHour
from pandas.tseries.holiday import USFederalHolidayCalendar


##For merging pandas together from array
from functools import reduce



In [ ]:

    
# Fix the seed of the stdlib random generator so that every run of the
# notebook draws the same sequence of interruptions.
random.seed(33)

CustomBusinessDay and CustomBusinessHour

Very useful to map out simulation on a real 8-5 schedule with weekends and holidays in play.



In [ ]:

    
## task_duration: planned length of the whole task -- 10 business days,
## skipping weekends and US federal holidays.
## (wall-clock alternative: task_duration = pd.Timedelta(days=10))
task_duration = CustomBusinessDay(n=10, calendar=USFederalHolidayCalendar())

## duration_segment_length: the simulation step -- one business hour, on the
## same US federal holiday calendar.
## (wall-clock alternative: duration_segment_length = pd.Timedelta(minutes=60))
## NOTE(review): no start/end is passed, so pandas' default business hours
## apply (09:00-17:00 per the pandas docs) -- confirm that matches the
## intended working day.
duration_segment_length = CustomBusinessHour(n=1, calendar=USFederalHolidayCalendar())



In [ ]:

    
# The probability of an interruption during one work segment.
# By default there is a 3 in 10 chance (uniform draw) per segment, i.e. for
# each hour worked there is a 30% chance that there will be an interruption.
def prob_interrupt(row, probability=0.30):
    """Decide at random whether the current work segment is interrupted.

    Args:
        row: the simulation row for the segment.  Not read; accepted so the
            function can be called like the other per-row functions.
        probability (float, optional): chance of an interruption, from 0.0
            (never) to 1.0 (always).  Defaults to 0.30.

    Returns:
        bool: True when an interruption occurs in this segment.
    """
    # random.random() draws from [0.0, 1.0); the strict '<' makes
    # probability=0.0 mean "never" and probability=1.0 mean "always".
    return random.random() < probability

# The duration of the interruption: random and uniform, a whole number of
# minutes between 1 and 180 (3hrs).
# Could use the task duration to change the interruption based on the data
# available.
def derive_interruptions(row):
    """Record how long the current segment was interrupted for.

    Asks prob_interrupt(row) whether an interruption happens.  If it does,
    the length is drawn with random.randint(1, 180) minutes; otherwise the
    length is zero.  The result is stored in row['time_interrupt'] as a
    pd.Timedelta.

    Args:
        row: mapping for the current simulation step; modified in place.

    Returns:
        The same ``row`` object, with 'time_interrupt' set.
    """
    # The length is only drawn when an interruption actually happens, so an
    # uninterrupted segment consumes a single random draw.
    if prob_interrupt(row):
        minutes_lost = random.randint(1, 3 * 60)
    else:
        minutes_lost = 0
    row['time_interrupt'] = pd.Timedelta(minutes=minutes_lost)
    return row

# The time added to switch context back after an interruption.
# Guess: the cost is 0.1 of the interruption's duration, but never less than
# 5 minutes -- even small interruptions take five minutes to recover from.
# A segment without an interruption costs nothing.
def derive_context_switch_time(row):
    """Record the recovery time caused by this segment's interruption.

    Reads row['time_interrupt'] and stores the recovery time in
    row['ctxt_switch_time'] as a pd.Timedelta: zero when there was no
    interruption, otherwise the larger of 5 minutes and 0.1 times the
    interruption.

    Args:
        row: mapping for the current simulation step; must already contain
            'time_interrupt'.  Modified in place.

    Returns:
        The same ``row`` object, with 'ctxt_switch_time' set.
    """
    interruption = row['time_interrupt']
    if not interruption:
        row['ctxt_switch_time'] = pd.Timedelta(minutes=0)
        return row

    minimum_cost = pd.Timedelta(minutes=5)
    proportional_cost = 0.1 * interruption
    row['ctxt_switch_time'] = max(minimum_cost, proportional_cost)
    return row



In [ ]:

    
def interruption_sim_extend(frame, series):
    """Work out how far the simulation must be extended to absorb lost time.

    The lost time ("overtime") of one simulation pass is the sum of its
    'time_interrupt' and 'ctxt_switch_time' columns.

    Args:
        frame (pd.DataFrame): the steps simulated in the latest pass; must
            have timedelta columns 'time_interrupt' and 'ctxt_switch_time'.
        series: the index those steps were simulated on -- a DatetimeIndex,
            TimedeltaIndex or PeriodIndex with a ``freq``.

    Returns:
        None when no time was lost (the simulation is done).  Otherwise a new
        index of the same kind and frequency as ``series`` that starts one
        step after ``series.max()`` and ends ``overtime`` later.

    Raises:
        ValueError: if time was lost but ``series`` has no ``freq``.
        TypeError: if time was lost but ``series`` is not one of the
            supported index types.
    """
    overtime = frame['time_interrupt'].sum() + frame['ctxt_switch_time'].sum()
    print("overtime: {}".format(overtime))

    if overtime <= pd.Timedelta(minutes=0):
        return None

    if series.freq is None:
        raise ValueError("series must have a freq to be extended")

    # The extension picks up one step after the last simulated one.
    seqs = series.max() + series.freq
    # NOTE(review): overtime is added as elapsed time; with a business-hour
    # freq the extension will therefore hold fewer working segments than
    # `overtime` hours -- confirm this is the intended model.
    seqe = seqs + overtime

    # pandas 1.0 removed the Index(start=, end=, freq=) constructor form, so
    # the matching *_range factory is used to build the extension instead.
    if isinstance(series, pd.DatetimeIndex):
        make_range = pd.date_range
    elif isinstance(series, pd.TimedeltaIndex):
        make_range = pd.timedelta_range
    elif isinstance(series, pd.PeriodIndex):
        make_range = pd.period_range
    else:
        raise TypeError("cannot extend a series of type {}".format(type(series).__name__))

    return make_range(start=seqs, end=seqe, freq=series.freq)



In [ ]:

    
##Simulator.  Builds a dataframe based on possible actions every row.
class StepSim(object):
    """Step-wise simulation of work.

    Starting from a planned series of steps, ``simwork`` builds one dataframe
    row per step by running every function in ``flist`` on that row, then asks
    ``extend_func`` whether a further series of steps is needed.
    ``simresult`` joins the dataframes of all passes into one.
    """

    def __init__(self,
                 s,
                 flist=None,
                 extendFunc=None,
                 maxiter=50):
        """
        Args:
            s (:obj:`series`): The series type class (e.g. a pandas index)
                that defines the first operations of the simulation.
            flist (:obj:`list` of callables, optional): Functions executed in
                order on each step of the simulation.  Each is called as
                ``f(row)`` with the row dict and must change it in place;
                return values are ignored.  Defaults to a single no-op.
            extendFunc (callable, optional): Called as
                ``extendFunc(frame, series)`` after every pass with the
                dataframe just built and the series it was built on.  Return
                a series of the same type to run another pass, or anything
                else (e.g. None) to finish.  Defaults to always finishing.
            maxiter (int, optional): Maximum number of iterations
                to perform before giving up on completion.
        """
        # None sentinels avoid a mutable default shared between instances.
        # The list of functions that will operate on each row.
        self.flist = [lambda row: row] if flist is None else flist

        # Function that decides whether the simulation is done or must be
        # extended.  It is always called with (frame, series).
        self.extend_func = (lambda frame, series: None) if extendFunc is None else extendFunc

        # The planned simulation steps: one series per pass, the last one
        # being the next to run.
        self.series_steps = [s]

        # The calculated simulation steps: one dataframe per pass.
        self.sim_steps = []

        # Safety stop for the simulation.  If it runs longer than this there
        # may be a condition that never ends, e.g. a small, very likely event
        # that adds a large block of time.
        self.max_iter = maxiter

    def __repr__(self):
        return 'StepSim(series_steps={}, sim_steps={}, max_iter={})'.format(
            len(self.series_steps), len(self.sim_steps), self.max_iter)

    def __str__(self):
        return repr(self)

    @staticmethod
    def __funcApply(flist, h):
        """
        Helper: apply the list of functions one after another to the data,
        which is passed by reference.  How h is changed depends on the
        functions; their return values are not used.

        flist :obj: function list; each is applied to the dictionary sequentially.
        h :obj: object that will be operated on by each function in flist.
        """
        for f in flist:
            f(h)
        return h

    def simwork(self):
        """
        Run through the simulation.

        Each pass of the loop builds a pandas dataframe with one row per
        entry of the latest series.  A row starts as ``{'iter': <pass number>}``
        and is then handed to every function in flist, which may add columns
        and may derive information from what is already in the row.

        After each pass the extend function is asked whether the simulation
        is done.  If it returns a series of the same type as the one just
        run, that series becomes the next pass; otherwise the loop ends.

        Raises:
            RuntimeError: when the pass counter exceeds ``max_iter``.
        """
        loop_index = 0
        while True:
            ## the next series of simulation steps to run.
            series = self.series_steps[-1]
            print("next_start: {}, next_end: {}".format(series.min(), series.max()))

            rows = [StepSim.__funcApply(flist=self.flist, h={'iter': loop_index})
                    for _ in series]
            result = pd.DataFrame(rows, index=series)
            result.index.name = 'idx'

            ##accumulate the actual simulation steps
            self.sim_steps.append(result)

            ##Calculate how much more needs to be done...
            ##if more needs to be done, then expect a type that will be the extension series.
            newseries = self.extend_func(result, series)
            if isinstance(newseries, type(series)):
                self.series_steps.append(newseries)
            else:
                break

            ##don't go crazy: if we've gone past max_iter loops, then something may be wrong.
            loop_index += 1
            if loop_index > self.max_iter:
                raise RuntimeError("hit max_iter. Stopping simulation at: {}". format(self.max_iter))

    def simresult(self):
        """
        Join the dataframes of all passes, in order, into one dataframe.

        Adds a 'total_interrupt_cumsum' column: the running total of
        'ctxt_switch_time' plus 'time_interrupt'.  The stored per-pass
        dataframes are left untouched.

        Raises:
            RuntimeError: if no pass has been run yet.
        """
        if not self.sim_steps:
            raise RuntimeError("no simulation steps recorded; call simwork() first")

        # pd.concat builds a new frame in one go (DataFrame.append was
        # removed in pandas 2.0).
        result = pd.concat(self.sim_steps)
        result['total_interrupt_cumsum'] = (result['ctxt_switch_time'] + result['time_interrupt']).cumsum()
        return result



In [ ]:

    
## Starting condition: the date the work begins and the date it is expected
## to end if nothing interrupts it.
## NOTE(review): the 08:00 start may fall outside the hours covered by
## duration_segment_length if that offset keeps pandas' default business
## hours -- confirm.
next_start = pd.to_datetime('2017-05-01 08:00:00')
next_end = next_start + task_duration

print("Expected start date: {0}\nExpected end date: {1}\nSimulation Segments: {2}\n\n".format(
    next_start, next_end, duration_segment_length))



In [ ]:

    
## Planned schedule: one entry per work segment between the expected start
## and end of the task, simulated with the interruption and context-switch
## functions.
## pd.date_range is used because pandas 1.0 removed the
## DatetimeIndex(start=, end=, freq=) constructor form.
foo = StepSim(pd.date_range(start=next_start,
                            end=next_end,
                            freq=duration_segment_length),
              flist=[derive_interruptions, derive_context_switch_time],
              extendFunc=interruption_sim_extend)
foo



In [ ]:

    
## Run the simulation, then gather every pass into one dataframe.
## NOTE: simwork() appends to the steps already stored on `foo`, so running
## this cell again without rebuilding `foo` adds further passes.
foo.simwork()
simr = foo.simresult()
simr.head(5)



In [ ]:

    
## Display the combined simulation result (one row per simulated step).
simr



In [ ]:

    
# Calculate total interrupt hours: context switch time plus actual
# interruption time, accumulated over the whole simulation.
# `full_work_sequence` is not assigned by any earlier cell shown here, so it
# is derived from the combined simulation result `simr`.
# NOTE(review): assumes `simr` is the frame this cell was written for.  The
# pass counter written by StepSim is named 'iter'; it is exposed here as
# 'iteration' on the assumption that this is the name later code expects --
# confirm.
full_work_sequence = simr.rename(columns={'iter': 'iteration'})
full_work_sequence['total_interrupt_hours'] = (
    full_work_sequence.ctxt_switch_time / np.timedelta64(1, 'h')
    + full_work_sequence.time_interrupt / np.timedelta64(1, 'h')
).cumsum()



In [ ]:

    
# A little naive: shows the accumulation of work caused by interruptions,
# with the running total summed into hourly bins.
# Without interruptions, this line is flat at zero.
mpl1 = simr['total_interrupt_cumsum'].resample('1h')
mpl1.sum().plot()



In [ ]:

    
# Hourly plot of the accumulated interruption time, with labelled axes.
# `mpl1` is the hourly resampler, not a matplotlib Axes, so the labels are
# set on the Axes returned by .plot().
# The timedelta totals are converted to float hours so that the y axis
# matches its label -- NOTE(review): confirm hours is the intended unit.
interrupt_ax = (mpl1.sum() / np.timedelta64(1, 'h')).plot()
interrupt_ax.set_ylabel('cumulative hours interruption')
interrupt_ax.set_xlabel('work duration')

split and reshape

https://pandas.pydata.org/docs/user_guide/reshaping.html

very helpful site on reshaping data.

In this case I want to break out columns for each of the iterations



In [ ]:

    
# One column per simulation pass, holding the running interruption total in
# hours.  Built from `simr`, whose rows carry the 'iter' counter written by
# StepSim.simwork and the 'total_interrupt_cumsum' column added by
# simresult().
# NOTE(review): assumes the cumulative total from simresult() is the quantity
# meant to be broken out per pass -- confirm.
work_seq_piv = (
    simr.assign(total_interrupt_hours=simr['total_interrupt_cumsum'] / np.timedelta64(1, 'h'))
        .pivot(columns='iter', values='total_interrupt_hours')
)



In [ ]:

    
# Area plot of the per-pass interruption totals, summed into hourly bins.
# A rise along the y axis shows an interruption;
# gaps show end of day and weekend time.
hourly_by_pass = work_seq_piv.resample('1h').sum()
wspl1 = hourly_by_pass.plot.area()
wspl1.set_ylabel('cumulative hours interruption')
wspl1.set_xlabel('work duration')



In [ ]: