Adding custom functionality

Writing a new parser

Here we will go through the process of writing a parser that parses data from an arbitrary format. In this case, the format is the one used in the file sample_parser_docs_data.xlsx: a workbook with one sheet ('data') containing the measurements grouped by time point, and another sheet ('identifiers') containing the identifier for each well.

Generally, it makes sense to start with one of the existing parsers as a guide. In this case, the spectromax_OD parser would be most relevant. The sample data is available in tests/test_data/sample_parser_docs_data.xlsx


In [1]:
from impact.parsers import Parser, parse_raw_identifier, parse_time_point_list
from impact import TimePoint

def new_parser(experiment, data, id_type='traverse'):
    """Parse plate-layout OD600 data, grouped by time point, into ``experiment``.

    The ``'data'`` sheet is read as a stack of plate blocks: each block is
    ``plate_size`` rows tall and blocks are separated by ``spacing`` rows.
    The first cell of a block's first row holds the time, and the
    measurements sit in a fixed window of columns on every row of the block.
    Reading stops at the first block whose first cell is ``'~End'``.

    The ``'identifiers'`` sheet has the same plate layout and gives the raw
    identifier for each well. It is parsed once and reused for every time
    point. Wells with no identifier, or with no measurement, are skipped;
    this includes wells that fall outside the identifier grid when it is
    smaller than the plate block.

    Args:
        experiment: object the parsed replicate trials are added to via
            ``add_replicate_trial``.
        data: mapping with the keys ``'identifiers'`` and ``'data'``, each a
            list of rows (one list of cell values per row).
        id_type: identifier format, passed through to
            ``parse_raw_identifier``.

    Returns:
        None. The results are added to ``experiment`` in place.
    """
    # Define the layout of our data
    first_row_index = 0
    plate_size = 8       # rows per plate block
    plate_columns = 12   # wells per plate row
    spacing = 1          # rows between consecutive plate blocks
    time_row_col = [0, 0]   # (row, col) of the time, relative to the block start
    data_first_col = 2      # first column holding measurements

    # Define the type of data this parser accepts
    analyte_name = 'OD600'
    analyte_type = 'biomass'

    # Values treated as "no identifier in this well"
    empty_identifier_values = ['', 0, '0', None]

    # First prepare the data by extracting the relevant information from each sheet
    unparsed_identifiers = data['identifiers']
    raw_data = data['data']

    # We first parse the identifiers, as these can be recycled
    # (the only thing that changes between plate blocks is the time).
    # Empty wells are kept as None so the grid stays aligned with the plate,
    # which lets us handle plates that are not full.
    identifiers = [
        [
            None if raw_identifier in empty_identifier_values
            else parse_raw_identifier(raw_identifier, id_type)
            for raw_identifier in identifier_row
        ]
        for identifier_row in unparsed_identifiers
    ]

    timepoint_list = []
    for start_row_index in range(first_row_index, len(raw_data), plate_size + spacing):
        # '~End' in the first cell marks the end of the data
        if raw_data[start_row_index][0] == '~End':
            break

        time = int(raw_data[start_row_index + time_row_col[0]][time_row_col[1]])

        # Define the data for a single plate, single timepoint
        plate_data = [
            row[data_first_col:data_first_col + plate_columns]
            for row in raw_data[start_row_index:start_row_index + plate_size]
        ]

        # Load the data point by point, pairing each well with its identifier
        for identifier_row, value_row in zip(identifiers, plate_data):
            for trial_identifier, value in zip(identifier_row, value_row):
                # Skip wells where no identifier is listed or no data present
                if trial_identifier is None or value in [None, '']:
                    continue
                trial_identifier.analyte_type = analyte_type
                trial_identifier.analyte_name = analyte_name
                timepoint_list.append(TimePoint(trial_identifier, time, float(value)))

    # Finally we parse all of the time points (into their logical structure
    # based on identifiers) and add them to the experiment
    replicate_trial_list = parse_time_point_list(timepoint_list)
    for rep in replicate_trial_list:
        experiment.add_replicate_trial(rep)


C:\Users\Naveen\Anaconda3\lib\site-packages\IPython\html.py:14: ShimWarning: The `IPython.html` package has been deprecated since IPython 4.0. You should import from `notebook` instead. `IPython.html.widgets` has moved to `ipywidgets`.
  "`IPython.html.widgets` has moved to `ipywidgets`.", ShimWarning)

With the new parser defined, we can register it with the Parser class and directly parse our data. Parser.parse_raw_data will return an Experiment instance containing all the data.


In [2]:
# Register the parser function under a format name of our choosing, then
# parse the sample workbook by referring to that same name.
parser_name = 'my_new_format'
sample_file = '../tests/test_data/sample_parser_docs_data.xlsx'

Parser.register_parser(parser_name, new_parser)
expt = Parser.parse_raw_data(parser_name, file_name=sample_file)


Importing data from ../tests/test_data/sample_parser_docs_data.xlsx...0.0s
Parsing time point list...Parsed 246 time points in 0.2s
Parsing analyte list...Parsed 82 analytes in 482.0ms
Parsing single trial list...Parsed 32 replicates in 0.1s

In [3]:
# Printing the experiment lists each strain/media/environment combination
# together with the analytes parsed for it.
print(expt)


strain             media                 environment    analytes
-----------------  --------------------  -------------  ----------
3KO-D1 + pKDL071   Base + 1.0 a.u. aTc                  ['OD600']
3KO-D1 + pKDL071   Base + 2.0 a.u. IPTG                 ['OD600']
3KO-D28 + pKDL071  Base + 1.0 a.u. aTc                  ['OD600']
3KO-D28 + pKDL071  Base + 2.0 a.u. IPTG                 ['OD600']
3KO-D59 + pKDL071  Base + 1.0 a.u. aTc                  ['OD600']
3KO-D59 + pKDL071  Base + 2.0 a.u. IPTG                 ['OD600']
IMPT1 + pIMPT001   Base + 1.0 a.u. aTc                  ['OD600']
IMPT1 + pIMPT001   Base + 2.0 a.u. IPTG                 ['OD600']
IMPT1 + pIMPT002   Base + 1.0 a.u. aTc                  ['OD600']
IMPT1 + pIMPT002   Base + 2.0 a.u. IPTG                 ['OD600']
IMPT1 + pIMPT003   Base + 1.0 a.u. aTc                  ['OD600']
IMPT1 + pIMPT003   Base + 2.0 a.u. IPTG                 ['OD600']
IMPT1 + pIMPT004   Base + 1.0 a.u. aTc                  ['OD600']
IMPT1 + pIMPT004   Base + 2.0 a.u. IPTG                 ['OD600']
IMPT2 + pIMPT001   Base + 1.0 a.u. aTc                  ['OD600']
IMPT2 + pIMPT001   Base + 2.0 a.u. IPTG                 ['OD600']
IMPT2 + pIMPT002   Base + 1.0 a.u. aTc                  ['OD600']
IMPT2 + pIMPT002   Base + 2.0 a.u. IPTG                 ['OD600']
IMPT2 + pIMPT003   Base + 1.0 a.u. aTc                  ['OD600']
IMPT2 + pIMPT003   Base + 2.0 a.u. IPTG                 ['OD600']
IMPT2 + pIMPT004   Base + 1.0 a.u. aTc                  ['OD600']
IMPT2 + pIMPT004   Base + 2.0 a.u. IPTG                 ['OD600']
IMPT3 + pIMPT001   Base + 1.0 a.u. aTc                  ['OD600']
IMPT3 + pIMPT001   Base + 2.0 a.u. IPTG                 ['OD600']
IMPT3 + pIMPT002   Base + 1.0 a.u. aTc                  ['OD600']
IMPT3 + pIMPT002   Base + 2.0 a.u. IPTG                 ['OD600']
IMPT3 + pIMPT003   Base + 1.0 a.u. aTc                  ['OD600']
IMPT3 + pIMPT003   Base + 2.0 a.u. IPTG                 ['OD600']
IMPT3 + pIMPT004   Base + 1.0 a.u. aTc                  ['OD600']
IMPT3 + pIMPT004   Base + 2.0 a.u. IPTG                 ['OD600']
dlacI + pKDL071    Base + 1.0 a.u. aTc                  ['OD600']
dlacI + pKDL071    Base + 2.0 a.u. IPTG                 ['OD600']