This would probably make more sense as a script, but I did not bother to convert it. It writes out a pickled file for use elsewhere.
In [19]:
import pandas as pd
import re
import glob
import cPickle as cpk
from IPython.core.debugger import Tracer #used this to step into the function and debug it, also need line with Tracer()()
Now go in and read one BRITE file (originally for the compounds)
In [101]:
# Made-up sample of a C-level entry so this cell also runs on a fresh kernel
# (no earlier cell defines `line`).
line = 'C    01100  Metabolic pathways'
# raw string: '\d' in a normal string literal is an invalid escape sequence
textC = re.compile(r'\d+')
mc = textC.search(line)
mc.group(0)
#so, now I have the map number...how do I get the rest of the line?
Out[101]:
In [116]:
# Made-up sample of a C-level entry so this cell also runs on a fresh kernel
# (no earlier cell defines `line`).
line = 'C    01100  Metabolic pathways'
# group 1 = first run of digits (the map number), group 2 = rest of the line
textC = re.compile(r'(\d+)\s*(.*)$')
mC = textC.search(line)
# single-argument print(...) gives the same output under Python 2 and 3
print(mC.group(1))
print(mC.group(2))
In [ ]:
#next: clean up the top of the data file (want to skip a few rows)...
In [129]:
# Sample line starting with '#', used by the next cells to try out the row-skipping test
line = '#DEFINITION KEGG pathway maps'
In [130]:
# echo the current value of `line`
line
Out[130]:
In [131]:
# first character of `line` -- the character the skip test inspects
line[0]
Out[131]:
In [132]:
# Compare by value: `is not` tests object identity, which only happens to
# work for short strings because of interpreter caching.
if line[0] != '#':
    print('yes')
In [133]:
# Compare by value: `is` tests object identity, which only happens to
# work for short strings because of interpreter caching.
if line[0] == '#':
    print('yes')
In [ ]:
In [134]:
def ReadBRITEfile(briteFile):
    """Parse one KEGG BRITE hierarchy text file into a DataFrame.

    Every line that does not start with '#' is tested against three
    patterns, in this order:

    * A header -- ``A<b>name</b>``
    * B header -- a line starting with ``B``
    * C entry  -- any remaining line containing a run of digits; the digits
      become the map number and the text after them the entry name

    Lines matching none of the patterns are dropped.  The parser relies on
    the rows being in order: an A header, then its B subheadings, then their
    C entries.  Each B and C row is labelled with the most recent A (and B)
    heading seen so far.

    Parameters
    ----------
    briteFile : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``['map', 'A', 'B', 'C', 'wholeThing']``, one row per matched
        line, indexed by the zero-based line number within the file.
        ``wholeThing`` holds the raw line (including its newline) as a
        double check.  Levels that do not apply to a row are left missing.
    """
    columns = ['map', 'A', 'B', 'C', 'wholeThing']

    # set up the expressions to match each level in the BRITE hierarchy
    textA = re.compile(r'(^A<b>)(.+)(</b>)\s*(.*)$')
    textB = re.compile(r'(^B)\s*(.*)$')
    textC = re.compile(r'(\d+)\s*(.*)$')

    # None (rather than an empty list) marks "no heading seen yet", so a B or
    # C row that precedes any A header gets a missing value, not a stray [].
    currentA = None
    currentB = None

    # Collect rows first and build the frame once at the end; growing a
    # DataFrame cell by cell with .loc re-allocates on every new row.
    lineNumbers = []
    records = []

    with open(briteFile) as f:
        for idx, line in enumerate(f):
            # value comparison; `line[0] is not '#'` only tested identity
            if line.startswith('#'):
                continue

            record = {'wholeThing': line}
            mA = textA.search(line)
            if mA:
                currentA = mA.group(2)
                # a new A section starts: forget the previous section's B
                # heading so it cannot leak into this section's rows
                currentB = None
            else:
                mB = textB.search(line)
                if mB:
                    currentB = mB.group(2)
                else:
                    mC = textC.search(line)
                    if mC is None:
                        continue  # not part of the hierarchy
                    record['map'] = mC.group(1)
                    record['C'] = mC.group(2)

            if currentA is not None:
                record['A'] = currentA
            if currentB is not None:
                record['B'] = currentB

            lineNumbers.append(idx)
            records.append(record)

    return pd.DataFrame(records, index=lineNumbers, columns=columns)
In [135]:
# Parse every '*keg.txt' file in the working directory and stack the results.
# Sorted so the processing order does not depend on the file system.
D = sorted(glob.glob('*keg.txt'))
briteFrames = []
for idx, nof in enumerate(D):
    print('%d %s' % (idx, nof))  #easy visible counter in Python
    # collect each file's table; assigning straight to allBRITE here would
    # throw away every file except the last one
    briteFrames.append(ReadBRITEfile(nof))

if briteFrames:
    # Row labels are line numbers within each source file, so they can repeat
    # when more than one file is stacked.
    allBRITE = pd.concat(briteFrames)
else:
    # keep allBRITE a DataFrame so the cells that filter on 'map' still run
    print('no files matching *keg.txt were found')
    allBRITE = pd.DataFrame(columns=['map', 'A', 'B', 'C', 'wholeThing'])
In [138]:
# check what kind of object `allBRITE` currently is
type(allBRITE)
Out[138]:
In [146]:
# every row whose map number is '01100'
allBRITE[allBRITE['map'] == '01100']
Out[146]:
In [147]:
# the C-level name(s) recorded for map '01100'
allBRITE.loc[allBRITE['map'] == '01100', 'C']
Out[147]:
In [148]:
# what kind of object the single-column selection returns
type(allBRITE.loc[allBRITE['map'] == '01100', 'C'])
Out[148]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [137]:
# display the parsed table
allBRITE
Out[137]:
In [ ]:
In [9]:
#now...save all that so I don't have to do this everytime
# `with` closes the file as soon as the dump finishes, so the pickle is
# completely written to disk before any later cell reads it back
with open('BRITE_compoundsOnly.pickle', 'wb') as pickleFile:
    cpk.dump(allBRITE, pickleFile)
In [10]:
# Read the pickle back as a round-trip check; `with` closes the file handle.
# NOTE: unpickling executes code from the file -- only load pickles you wrote yourself.
with open('BRITE_compoundsOnly.pickle', 'rb') as pickleFile:
    reloadedBRITE = cpk.load(pickleFile)
reloadedBRITE
Out[10]: