In [1]:
import numpy as np
import pandas as pd
from __future__ import division
In [2]:
path_kraken = "/Users/luke/krse2011/kraken/krse2011_kraken_merged.tsv" # argv[1]
path_numreads = "/Users/luke/krse2011/kraken/krse2011_numreads.list" # argv[2]
In [3]:
# save kraken merged report as pd dataframe
df1 = pd.DataFrame.from_csv(path_kraken, header=0, index_col = None, sep='\t')
In [4]:
# save kraken dataframe as ndarray, excluding 0th column (taxonomy)
rawcounts = np.array(df1.iloc[0:,1:])
In [5]:
# save number of reads list as pd dataframe
df2 = pd.DataFrame.from_csv(path_numreads, header=None, index_col = None, sep='\t')
In [6]:
# save numreads dataframe as nd array, transposed
numreads = np.array(df2).transpose()
In [7]:
# divide each column by the total number of reads checked by kraken (prinseq pairs)
normcounts = rawcounts/numreads
# IF I WANTED TO SPLIT UP INTO SEPARATE FILES BY TAXONOMIC LEVEL, COULD DO THIS:
# go through each line of df
# get ID
# split by "|" into array
# pop off last element
# if it begins with [letter x from argv]
# store as new dictionary entry
In [9]:
# save ndarray as pd.DataFrame with same row and column names as df1
df3 = pd.DataFrame(normcounts, index=df1.iloc[:,0:1], columns=df1.columns[1:])
In [14]:
# export as tsv file
df3.to_csv(path_or_buf="temp.tsv", sep="\t")
In [16]:
# remove "('" and "',)" from index (row names)
!cat temp.tsv | sed "s/('//g" | sed "s/',)//g" > krse2011_kraken_merged_norm.tsv
!/bin/rm temp.tsv
Then manually add ID to start:
$ echo -n "ID" | cat - krse2011_kraken_merged_norm.tsv > temp.tsv && mv temp.tsv krse2011_kraken_merged_norm.tsv
Then run these commands to get non-zero lines of merged.tsv and merged_norm.tsv:
cat krse2011_kraken_merged.tsv | perl -alne 'use List::Util qw(sum); shift(@F); print if sum(@F)>0' > krse2011_kraken_merged_nonzero.tsv
cat krse2011_kraken_merged_norm.tsv | perl -alne 'use List::Util qw(sum); shift(@F); print if sum(@F)>0' > krse2011_kraken_merged_norm_nonzero.tsv
Then add header line to those two files.
In [ ]: