Learning how to write functions

TODO: make paths acceptable! TODO: Make argparse accept: 1.) summary_file or 2.) preset values. Then run xcms


In [1]:
generate_blank_summary_file <- function(path) {
    output = '### Peak Detection Parameters
peak_picking\tcentWave
ppm
peak_width
prefilter

### Peak-grouping parameters
bw
mzwid
minfrac\t0
minsamp\t2

### retention-correction parameters
retcor_method\tloess
missing
extra'
    params_file = paste(path, '/xcms_params.tab', sep='')
    write(output, file=params_file)
}

generate_blank_summary_file(getwd())

In [2]:
# Parse a tab-delim summary file
char_to_numeric <- function(char) {
    # Take in a space-delimited character that represents
    # a number or vector. Convert it to numeric.
    numeric <- as.numeric(strsplit(char, " ")[[1]])
    return(numeric)
}

get_params <- function(path, char_to_numeric) {
    # INPUT: tab-delim file of 
    # space-delimited values 
    # OUTPUT: A labeled list with each label from the first column
    # of the file. Each value, character or numeric, from the second column
    # It will accept vectors as space-delimited entries in the second column 
    # of the text file
    df <- read.table(path, sep='\t', header=FALSE, row.names=1,
                    colClasses="character", blank.lines.skip=TRUE, fill=TRUE)
    print(df)
    print(dim(df))
    # There should only be one column of values
    if (dim(df)[2] > 1){
        stop(paste("Your summary file should only have 1 (tab-delimited) column of values.",
                  sprintf("it has %s columns", dim(df)[2])))
    }
    print('read table output:')
    print(paste('Df dimensions', dim(df)))
    #lst_params <- setNames(split(df, seq(nrow(df))), rownames(df))
    # Make an empty list to gather param values
    lst = setNames(vector("list", dim(df)[1]), rownames(df))
    for (idx in rownames(df)){
        print(df[idx,])
        print(sprintf('----------- %s', df[idx,]))
        if (df[idx,] == "") {
            stop(paste(sprintf('The parameter %s is missing!', idx),
                       'Please set it, or use one of the preset parameter settings'))
            }
        # if no alphabetic characters, convert character to numeric
        digit = grepl("[[:digit:]]", df[idx,])
        alpha = grepl("[[:alpha:]]", df[idx,])
        if (digit &&! alpha) {
            sprintf('%s is a number(s)', df[idx,])
            print(class(df[idx,]))
            lst[[idx]] = char_to_numeric(df[idx,])
            }
        else if (alpha &&! digit){
            lst[[idx]] = df[idx,]
        }
    }
    return(lst)
}



path_txt = paste('/home/irockafe/',
                 'Dropbox (MIT)/Alm_Lab/projects/',
                 'revo_healthcare/notebooks/playing_around/',
                 'summary_file.txt', sep='')
path_relative = ('summary_file.txt')
params = get_params(path_relative, char_to_numeric)

params
class(params)
class(params$method)
class(params$prefilter)


                V2
bw               2
poop            37
prefilter    10 60
method    centWave
[1] 4 1
[1] "read table output:"
[1] "Df dimensions 4" "Df dimensions 1"
[1] "2"
[1] "----------- 2"
[1] "character"
[1] "37"
[1] "----------- 37"
[1] "character"
[1] "10 60"
[1] "----------- 10 60"
[1] "character"
[1] "centWave"
[1] "----------- centWave"
$bw
2
$poop
37
$prefilter
  1. 10
  2. 60
$method
'centWave'
'list'
'character'
'numeric'

In [3]:
if (1 == 2){
    print('poop')
    
} else if(1==1){
    print('yayyy')
}


[1] "yayyy"

In [4]:
# Get default settings from user input string

default_params = function(string) {
    # Default xcms parameters for various MS and LC combinations
    # Taken from "Meta-analysis of untargeted metabolomic 
    # data from multiple profiling experiments", Patti et al   
    # RETURN [param_name1=param_value1, param_name2=param_value2...]
    
    # Define the preset values as nested list, indexed on 
    # as [LC-MS[ppm, peakwidth, bw, mzwid, prefilter]]
    default_params = list(
     hplc_qtof = list(ppm=30, peakwidth=c(10,60), bw=5, 
                    mzwid=0.025, prefilter=c(0,0)),
     hplc_qtof_hires = list(ppm=15, peakwidth=c(10,60), bw=5,
                    mzwid=0.015, prefilter=c(0,0)),
     hplc_orbitrap = list(ppm=2.5, peakwidth=c(10,60), bw=5,
                    mzwid=0.015, prefilter=c(3,5000)),
     uplc_qtof = list(ppm=30, peakwidth=c(5,20), bw=2,
                     mzwid=0.025, prefilter=c(0,0)),
     uplc_qtof_hires = list(ppm=15, peakwidth=c(5,20), bw=2,
                           mzwid=0.015, prefilter=c(0,0)),
     uplc_orbitrap = list(ppm=2.5, peakwidth=c(5,20), bw=2,
                         mzwid=0.015, prefilter=c(3,5000))
    )
    # If your string is found in the default list,
    # return the default params
    if (exists(string, where=default_params)) {
        # double brackets because it removes the name of
        # the top-level nested-list (i.e. uplc_orbitrap$ppm becomes
        # simply $ppm when you use double brackets )
        params = default_params[[string]]
        return(params)
        }
    else{
        msg = sprintf(paste("I couldn't understand your input.",
                    "The allowed inputs are: %s"),
                paste(names(default_params), collapse=', '))
        stop(msg)
    }
    
    
}
        
preset_params = default_params('hplc_qtof')
preset_params
default_params('asfdasfdasd')


$ppm
30
$peakwidth
  1. 10
  2. 60
$bw
5
$mzwid
0.025
$prefilter
  1. 0
  2. 0
Error in default_params("asfdasfdasd"): I couldn't understand your input. The allowed inputs are: hplc_qtof, hplc_qtof_hires, hplc_orbitrap, uplc_qtof, uplc_qtof_hires, uplc_orbitrap
Traceback:

1. default_params("asfdasfdasd")
2. stop(msg)   # at line 38 of file <text>

In [5]:
NULL
print(is.null(NULL))


NULL
[1] TRUE

In [6]:
# TODO: Adapt xcms_script to accept parameters
# ??? How to take a list of {key:value} from r and
# define parameters from it? Use default arguments for everything, 
# then overwrite them if present?
# Could overwrite with default params

# use minsample exclusively to determine how many samples
# a peak must be present in to work
run_xcms = function(data_dir, params, polarity_mode,
                    # peak detection params
                    detection_method='centWave', ppm=25, 
                    peak_width=c(10,60), prefilter=c(3,100),
                    # grouping parameters
                    bw=20, mzwid=0.025, minfrac=0, minsamp=2,
                    # retention-correction parameters
                    retcor_method='loess', 
                    missing=1, extra=1
                   ) 
            {
    
    
    # check if param_value in user-defined list
    
    
    # escape spaces in data_dir path
    output_dir = getwd()
    xcms_feature_table = "xcms_result"
    camera_feature_table = "xcms_camera_results.csv"
    nSlaves=0
 
    ### change working directory to your files, see +++ section
    setwd(data_dir)
    # wrtie parameters to file
    peak_width_str = paste(paste(peak_width), collapse=" ")
    prefilter_str = paste(paste(prefilter), collapse=" ")
    param_string = sprintf('### Peak Detection Parameters
peak_picking\t%s
ppm\t%i
peak_width\t%s
prefilter\t%s

### Peak-grouping parameters
bw\t%i
mzwid\t%.4f
minfrac\t%i
minsamp\t%i

### retention-correction parameters
retcor_method\t%s
missing\t%i
extra\t%i
', 
    detection_method, ppm, peak_width_str, prefilter_str,
    bw, mzwid, minfrac, minsamp, 
    retcor_method, missing, extra)
    
    params_file = paste(output_dir, '/xcms_params.tab', sep='')
    write(param_string, file=params_file)
    
    
    stop("Don't want to actually run xcms when testin")
    
    # Load packages
    library(xcms)
    library(CAMERA)
    library(snowfall)
    
    # Detect peaks
    xset <- xcmsSet(method=detection_method, ppm=ppm, prefilter=prefilter,
                        peakwidth=peak_width, )
    print("finished peak Detection!")
    # Group peaks together across samples
    # save the detected peaks in case downstream processing fails 
    # and you have to go in manually to figure out parameters
    # TODO - should the bw parameter here be changed depending
    # on the platform used (i.e. bw=30 is default, but for uplc, 
    # maybe the coarse grouping should be bw=15 or something)
    xset <- group(xset)
    print("finished first group command!")
    print(xset)
    saveRDS(xset, paste(output_dir, '/xset.Rdata', sep=''))
    
    # Try to retention-correct
    xset2 <- retcor(xset, method=retcor_method,
                    family="s", plottype="m",
                    missing=missing)
    print("finished retcor!")
    
    # regroup after retention time correction
    # Base grouping on minimum number of samples, not fraction.
    xset2 <- group(xset2, bw =bw, mzwid=mzwid,
                  minfrac=0, minsamp=minsamp)
    saveRDS(xset2, paste(output_dir, '/xset2.Rdata', sep=''))
    print("finished second group command!")
    
    # Fill in peaks that weren't detected
    xset3 <- fillPeaks(xset2)
    print("Finished filling peaks!")
    print(xset3)
    # Move back to output_dir and write out your feature table
    setwd(output_dir)
    xcms_peaklist = peakTable(xset3, filebase=xcms_feature_table)
    write.csv(xcms_peaklist, paste(xcms_feature_table,ppm, sep='_'))
    print("Finished xcms!")
    }


# Make paths work!

mzml_path = '/home/irockafe/Dropbox (MIT)/Alm_Lab/projects/revo_healthcare/notebooks/playing_around/sample_data'
mzml_path = gsub(' ', '\ ', mzml_path, fixed=TRUE)
mzml_path
print(paste('Working dir: ', getwd()))
print(mzml_path)
setwd(mzml_path)
print(sprintf('Working dir now: %s', getwd()))
setwd(mzml_path)

#run_xcms(mzml_path, 'positive')


'/home/irockafe/Dropbox (MIT)/Alm_Lab/projects/revo_healthcare/notebooks/playing_around/sample_data'
[1] "Working dir:  /home/irockafe/Dropbox (MIT)/Alm_Lab/projects/revo_healthcare/notebooks/playing_around"
[1] "/home/irockafe/Dropbox (MIT)/Alm_Lab/projects/revo_healthcare/notebooks/playing_around/sample_data"
[1] "Working dir now: /home/irockafe/Dropbox (MIT)/Alm_Lab/projects/revo_healthcare/notebooks/playing_around/sample_data"

In [9]:
preset_params
preset_params$asdf


$ppm
30
$peakwidth
  1. 10
  2. 60
$bw
5
$mzwid
0.025
$prefilter
  1. 0
  2. 0
NULL

In [21]:
get_params('xcms_params.tab')


                    V2
peak_picking  centWave
ppm                   
peak_width       10 60
prefilter             
bw                   2
mzwid           0.0250
minfrac              0
minsamp              5
retcor_method         
missing              1
extra                1
[1] 11  1
[1] "read table output:"
[1] "Df dimensions 11" "Df dimensions 1" 
[1] "centWave"
[1] "----------- centWave"
[1] ""
[1] "----------- "
Error in get_params("xcms_params.tab"): The parameter ppm is missing! Please set it, or use one of the preset parameter settings
Traceback:

1. get_params("xcms_params.tab")
2. stop(paste(sprintf("The parameter %s is missing!", idx), "Please set it, or use one of the preset parameter settings"))   # at line 34-35 of file <text>