In [1]:
import shelve

In [17]:
sys.path.append("/home/gavin/Documents/MRes/opencast-bio/")

In [18]:
import ocbio.extract

In [22]:
cd ~/Documents/MRes/


/home/gavin/Documents/MRes

In [80]:
parser = ocbio.extract.ProteinPairParser("/home/gavin/Documents/MRes/HIPPIE/hippie_current.txt",
                                         "/home/gavin/Documents/MRes/HIPPIE/feature.HIPPIE.db",
                                         protindexes=(1,3),valindexes=[4])

In [81]:
parser.regenerate(force=True, verbose=True)


Forcing regeneration of database /home/gavin/Documents/MRes/HIPPIE/feature.HIPPIE.db from data file /home/gavin/Documents/MRes/HIPPIE/hippie_current.txt.
Filling database.........................................................................................................................................................................
Parsed 169626 lines.

In [54]:
examplekeys = parser.db.keys()[0:10]

In [82]:
print len(parser.db.keys())


169323

In [88]:
parser.db.close()

In [77]:
parser2 = ocbio.extract.ProteinPairParser("/home/gavin/Documents/MRes/HIPPIE/hippie_current.txt",
                                         "/home/gavin/Documents/MRes/HIPPIE/feature2.HIPPIE.db",
                                         protindexes=(1,3),valindexes=[4])

In [78]:
print len(parser2.db.keys())


0

In [49]:
parser.close()

In [51]:
cd test/


/home/gavin/Documents/MRes/test

In [55]:
class test():
    def __init__(self):
        self.db = ocbio.extract.ProteinPairDB("test.db")
    def __setitem__(self,key,value):
        self.db[key] = value

In [56]:
inst = test()

In [57]:
for k in examplekeys:
    inst[k] = 1

In [58]:
inst.db.keys()


Out[58]:
[frozenset({'10054', '201626'}),
 frozenset({'10236', '3930'}),
 frozenset({'10320', '6605'}),
 frozenset({'10399', '8364'}),
 frozenset({'10474', '1499'}),
 frozenset({'1058', '1060'}),
 frozenset({'10289', '10857'}),
 frozenset({'11091110911109111091'}),
 frozenset({'112495', '9329'}),
 frozenset({'10379', '1822'}),
 frozenset({'something'})]

In [60]:
inst.db.close()

In [61]:
inst = test()

In [62]:
inst.db.keys()


Out[62]:
[frozenset({'10054', '201626'}),
 frozenset({'10236', '3930'}),
 frozenset({'10320', '6605'}),
 frozenset({'10399', '8364'}),
 frozenset({'10474', '1499'}),
 frozenset({'1058', '1060'}),
 frozenset({'10289', '10857'}),
 frozenset({'11091110911109111091'}),
 frozenset({'112495', '9329'}),
 frozenset({'10379', '1822'}),
 frozenset({'something'})]

In [85]:
%load ../opencast-bio/ocbio/extract.py

In [87]:
# Feature extraction code
# Header pending

import os
import time
import subprocess
import csv
import shelve
import re
import sys
import pickle
import pdb

def verbosecheck(verbose):
    '''returns a function depending on the state of the verbose flag'''
    if verbose:
        def v_print(*args):
            '''declare v_print function that prints to stdout
            if verbose flag is on'''
            for argument in args:
                print argument,
                print
    else:
        def v_print(*args):
            None
    return v_print

class ProteinPairDB(shelve.DbfilenameShelf):
    '''A simple database for protein pairs using shelve.'''
    def __setitem__(self, key, value):
        # key will be frozenset so make it a list first
        key = list(key)
        # then make it a string
        if len(key) == 1:
            key = key[0] * 2
        else:
            key = key[0] + "\t" + key[1]
        shelve.DbfilenameShelf.__setitem__(self, key, value)
        return None

    def __getitem__(self, key):
        # make two strings from the key
        key = list(key)
        if len(key) == 1:
            key1 = key[0] * 2
            key1 = key[0] * 2
        else:
            key1 = key[0] + "\t" + key[1]
            key2 = key[1] + "\t" + key[0]
        # try the first one
        try:
            value = shelve.DbfilenameShelf.__getitem__(self, key1)
        except KeyError:
            # couldn't find the first key, try the second
            value = shelve.DbfilenameShelf.__getitem__(self, key2)
            # if we don't find this one then error out as usual
        return value

    def keys(self):
        # retrieve the string keys used by shelve
        ks = shelve.DbfilenameShelf.keys(self)
        # convert them to frozensets
        ks = map(lambda x: frozenset(x.split("\t")), ks)
        return ks


class ProteinPairParser():
    '''Does simple parsing on data files to produce protein pair files with feature values'''
    def __init__(self,
                 datadir,
                 outdir,
                 protindexes=(0, 1),
                 valindexes=[2],
                 script=None,
                 csvdelim="\t",
                 ignoreheader=0,
                 generator=False):
        # first, store the initialisation
        self.datadir = datadir
        self.outdir = outdir
        self.protindexes = protindexes
        # had to hack this together from the list above
        # passing tuple in as default did not work
        self.valindexes = tuple(valindexes)
        self.script = script
        self.csvdelim = csvdelim
        self.ignoreheader = ignoreheader
        if generator:
            #then open up this pickle file
            f = open(generator)
            self.generator = pickle.load(f)
            f.close()
            self.db = None
        else:
            #otherwise open the database that is assumed to exist 
            self.generator = None
            self.db = openpairshelf(self.outdir)
        return None

    def regenerate(self, force=False, verbose=False):
        '''Regenerate the pair file from the data source
        if the data source is newer than the pair file'''
        v_print = verbosecheck(verbose)
        if self.generator == None:
            # so first check the ages of both files
            datamtime = os.stat(self.datadir)[-2]
            if os.path.isfile(self.outdir):
                pairmtime = os.stat(self.outdir)[-2]
            else:
                # bit of a hack
                pairmtime = 0
            # if the data modification time is greater than output modification time
            if datamtime > pairmtime or force is True:
                # now regenerate the data file according to the options defined above:
                if verbose and datamtime > pairmtime:
                    if pairmtime == 0:
                        print "Database file not found, regenerating at {0} from {1}.".format(self.outdir, self.datadir)
                    else:
                        print "Data file {0} is newer than processed database {1}, regenerating.".format(self.datadir, self.outdir)
                if verbose and force:
                    print "Forcing regeneration of database {0} from data file {1}.".format(self.outdir, self.datadir)
                # if there's a script to run
                if self.script is not None:
                    v_print("Executing script: {0}.".format(self.script))
                    # then execute the script
                    retcode = subprocess.call("python2 {0}".format(self.script), shell=True)

                    v_print("Script returned: {0}".format(retcode))
                # first delete out of date file, if it's there
                if os.path.isfile(self.outdir):
                    os.remove(self.outdir)
                # open the data file
                c = csv.reader(open(self.datadir), delimiter=self.csvdelim)
                # if the header should be ignored then ignore it

                if self.ignoreheader == "1":
                    v_print("Ignoring header.")
                    c.next()

                if verbose:
                    sys.stdout.write("Filling database")
                    lcount = 0

                for line in c:
                    # each line use the protein pair as a key
                    # by formatting it as a frozenset
                    pair = frozenset([line[self.protindexes[0]], line[self.protindexes[1]]])
                    # and the value is indexed by valindexes
                    values = []

                    for i in self.valindexes:
                        values.append(line[i])

                    self.db[pair] = values[:]

                    if verbose:
                        lcount = lcount + 1
                        if lcount % 1000 == 0:
                            sys.stdout.write(".")

                if verbose:
                    sys.stdout.write("\n")
                    print "Parsed {0} lines.".format(lcount)
        else:
            v_print("Custom generator function, no database to regenerate.")

        return None

    def __getitem__(self,key):
        if self.generator != None:
            #try and read a key from the custom generator
            return self.generator[key]
        else:
            #read key from database
            return self.db[key]

    def close(self):
        self.db.close()
        return None


def openpairshelf(filename, flag='c', protocol=None, writeback=False):
    """Returns a ProteinPairDB object, with similar functionality to shelve.open()"""
    return ProteinPairDB(filename, flag, protocol, writeback)

In [89]:
parser = ProteinPairParser("/home/gavin/Documents/MRes/HIPPIE/hippie_current.txt",
                                         "/home/gavin/Documents/MRes/HIPPIE/feature.HIPPIE.db",
                                         protindexes=(1,3),valindexes=[4])

In [90]:
parser.db.keys()


Out[90]:
[]

In [92]:
parser.regenerate(force=True)

In [93]:
print len(parser.db.keys())


169323

In [80]:
parser = ocbio.extract.ProteinPairParser("/home/gavin/Documents/MRes/HIPPIE/hippie_current.txt",
                                         "/home/gavin/Documents/MRes/HIPPIE/feature.HIPPIE.db",
                                         protindexes=(1,3),valindexes=[4])

In [ ]: