In [1]:
import sys, os
sys.path.append('../../../libs/')
import csv
import random
import pdb
import pickle
import numpy as np
import pandas as pd
import scipy.stats as ss
import sklearn
from sklearn import cross_validation, preprocessing, svm
from sklearn.cross_validation import train_test_split
from sklearn.svm import LinearSVC
from dateutil import parser
from datetime import timedelta
from itertools import izip #new
import IO_class
from IO_class import FileOperator
from DL_libs import *
import pprocess



In [31]:
#filename = 'SUCCESS_log_CrossValidation_load_DL_remoteFisherM1_DL_RE_US_DL_RE_US_1_1_19MAY2014.txt'
filename = 'listOfDDIsHaveOver2InterfacesHave40-75_Examples_2010_real_selected.txt' #for testing
#filename = 'list_of_SUCCESS_log_run3DID_STAT_200_275_examples_201001MAY2014.txt'
file_obj = FileOperator(filename)
ddis = file_obj.readStripLines()
import logging
import time
current = time.strftime("%m_%d_%Y")

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

logname = 'DL_contact_matrix_' + current + '.log'
handler = logging.FileHandler(logname)
handler.setLevel(logging.DEBUG)

# create a logging format

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)

# add the handlers to the logger

logger.addHandler(handler)

logger.info('Input DDI file: ' + filename)
#logger.debug('This message should go to the log file')


INFO:__main__:Input DDI file: listOfDDIsHaveOver2InterfacesHave40-75_Examples_2010_real_selected.txt
number of lines in listOfDDIsHaveOver2InterfacesHave40-75_Examples_2010_real_selected.txt:37

In [18]:
ddi = ddis[0]
ddis


Out[18]:
['6PGD_int_NAD_binding_2',
 'Activin_recp_int_TGF_beta',
 'ADSL_C_int_Lyase_1',
 'AICARFT_IMPCHas_int_MGS',
 'AIRS_int_AIRS_C',
 'Ald_Xan_dh_C_int_FAD_binding_5',
 'Alpha-amylase_int_CBM_48',
 'AMNp_N_int_PNP_UDP_1',
 'ARPC4_int_WD40',
 'CagX_int_TrbI',
 'Cation_ATPase_C_int_E1-E2_ATPase',
 'Ca_chan_IQ_int_efhand',
 'CBM_20_int_Glyco_hydro_14',
 'Cytochrom_B_C_int_Rieske',
 'Cytochrom_B_N_int_UCR_14kD',
 'Cytochrom_B_N_int_UCR_Fe-S_N',
 'Dioxygenase_C_int_Dioxygenase_N',
 'E1-E2_ATPase_int_Hydrolase',
 'EFG_C_int_GTP_EFTU',
 'efhand_int_IQ',
 'efhand_int_Troponin',
 'Fapy_DNA_glyco_int_H2TH',
 'Fer4_NifH_int_Oxidored_nitro',
 'FGF_int_I-set',
 'FumaraseC_C_int_Lyase_1',
 'Furin-like_int_Recep_L_domain',
 'Glyco_hydro_10_int_Ricin_B_lectin',
 'GP120_int_ig',
 'H2TH_int_zf-FPG_IleRS',
 'Ion_trans_2_int_V-set',
 'JmjC_int_JmjN',
 'Kringle_int_PAN_1',
 'MDH_int_PQQ',
 'PA_int_Peptidase_M28',
 'Peptidase_M28_int_TFR_dimer',
 'Photo_RC_int_PSII',
 'Stathmin_int_Tubulin']

In [41]:
class DDI_family_base(object):
    def __init__(self, ddi, Vectors_Fishers_aaIndex_raw_folder = '/home/du/Documents/Vectors_Fishers_aaIndex_raw_2014/'):
    #def __init__(self, ddi, Vectors_Fishers_aaIndex_raw_folder = '/home/sun/Downloads/contactmatrix/contactmatrixanddeeplearningcode/data_test/'):
        """ get total number of sequences in a ddi familgy
        Attributes:
            ddi: string ddi name
            Vectors_Fishers_aaIndex_raw_folder: string, folder
            total_number_of_sequences: int
            raw_data: dict raw_data[2]
        LOO_data['FisherM1'][1]

        """
        self.ddi = ddi
        self.Vectors_Fishers_aaIndex_raw_folder = Vectors_Fishers_aaIndex_raw_folder
        self.ddi_folder = self.Vectors_Fishers_aaIndex_raw_folder + ddi + '/'
        self.total_number_of_sequences = self.get_total_number_of_sequences()
        self.raw_data = {}
        self.positve_negative_number = {}
        self.equal_size_data = {}
        for seq_no in range(1, self.total_number_of_sequences+1):
            self.raw_data[seq_no] = self.get_raw_data_for_selected_seq(seq_no)
            try:
                # the last column is the label: count positives directly
                count_pos = int(np.sum(self.raw_data[seq_no][:, -1]))
                count_neg = self.raw_data[seq_no].shape[0] - count_pos
                self.positve_negative_number[seq_no] = {'numPos': count_pos}
                self.positve_negative_number[seq_no]['numNeg'] = count_neg
            except Exception as e:
                print ddi, seq_no
                print str(e)
            # get data for equal positive and negative
            n_pos = self.positve_negative_number[seq_no]['numPos']
            n_neg = self.positve_negative_number[seq_no]['numNeg']
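            # NOTE (assumption): the exported matrix lists all positive rows
            # first, then all negatives, so rows [0, n_pos) are positive and
            # rows [n_pos, n_pos + n_neg) are negative.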
            index_neg = range(n_pos, n_pos + n_neg)
            random.shuffle(index_neg)
            index_neg = index_neg[: n_pos]
            positive_examples = self.raw_data[seq_no][ : n_pos, :]
            negative_examples = self.raw_data[seq_no][index_neg, :]
            self.equal_size_data[seq_no] = np.vstack((positive_examples, negative_examples))
    def get_LOO_training_and_reduced_traing(self, seq_no, fisher_mode = 'FisherM1ONLY' , reduce_ratio = 4):
        """ Get the leave-one-out training data plus a reduced training set.

        The LOO set pools every sequence except seq_no; the reduced set is a
        random 1/reduce_ratio subset of those sequences (at least one).
        Parameters:
            seq_no: int, sequence held out for testing
            fisher_mode: default 'FisherM1ONLY'
            reduce_ratio: int, reduction factor for the reduced training set
        Returns:
            (train_X_LOO, train_y_LOO), (train_X_reduced, train_y_reduced), (test_X, test_y)
        """
        train_X_LOO = np.array([])
        train_y_LOO = np.array([])
        train_X_reduced = np.array([])
        train_y_reduced = np.array([])

        total_number_of_sequences = self.total_number_of_sequences
        equal_size_data_selected_sequence = self.equal_size_data[seq_no]

        #get test data for selected sequence
        test_X, test_y = self.select_X_y(equal_size_data_selected_sequence, fisher_mode = fisher_mode)
        total_sequences = range(1, total_number_of_sequences+1)
        loo_sequences = [i for i in total_sequences if i != seq_no]
        number_of_reduced = len(loo_sequences)/reduce_ratio if len(loo_sequences)/reduce_ratio !=0 else 1
        random.shuffle(loo_sequences)
        reduced_sequences = loo_sequences[:number_of_reduced]

        #for loo data
        for current_no in loo_sequences:
            raw_current_data = self.equal_size_data[current_no]
            current_X, current_y = self.select_X_y(raw_current_data, fisher_mode = fisher_mode)
            if train_X_LOO.ndim ==1:
                train_X_LOO = current_X
            else:
                train_X_LOO = np.vstack((train_X_LOO, current_X))
            train_y_LOO = np.concatenate((train_y_LOO, current_y))

        #for reduced data
        for current_no in reduced_sequences:
            raw_current_data = self.equal_size_data[current_no]
            current_X, current_y = self.select_X_y(raw_current_data, fisher_mode = fisher_mode)
            if train_X_reduced.ndim ==1:
                train_X_reduced = current_X
            else:
                train_X_reduced = np.vstack((train_X_reduced, current_X))
            train_y_reduced = np.concatenate((train_y_reduced, current_y))                

        return (train_X_LOO, train_y_LOO),(train_X_reduced, train_y_reduced), (test_X, test_y)
    def get_total_number_of_sequences(self):
        """ get total number of sequences in a ddi familgy
        Parameters:
            ddi: string
            Vectors_Fishers_aaIndex_raw_folder: string
        Returns:
            n: int
        """
        folder_path = self.Vectors_Fishers_aaIndex_raw_folder + self.ddi + '/' 
        filename = folder_path +'allPairs.txt'
        all_pairs = np.loadtxt(filename)
        return len(all_pairs)

    def get_raw_data_for_selected_seq(self, seq_no):
        """ get raw data for selected seq no in a family
        Parameters:
            ddi: 
            seq_no: 
        Returns:
            data: raw data in the sequence file
        """
        folder_path = self.Vectors_Fishers_aaIndex_raw_folder + self.ddi + '/' 
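        # the 'ouput' spelling below is kept as-is; it is assumed to match
        # the file names the upstream pipeline actually writes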
        filename = folder_path + 'F0_20_F1_20_Sliding_17_11_F0_20_F1_20_Sliding_17_11_ouput_'+ str(seq_no) + '.txt'
        data = np.loadtxt(filename)
        return data
    def select_X_y(self, data, fisher_mode = ''):
        """ Select a feature subset from the raw input matrix.

        Parameters:
            data: matrix exported from the MATLAB txt file
            fisher_mode: which columns to keep ('FisherM1', 'FisherM1ONLY' or 'AAONLY')
        Returns:
            selected X, y
        """
        y = data[:,-1] # label is the last column
        if fisher_mode == 'FisherM1': # fisher m1 plus AA index
            a = data[:, 20:227]
            b = data[:, 247:454]
            X = np.hstack((a,b))
        elif fisher_mode == 'FisherM1ONLY': 
            a = data[:, 20:40]
            b = data[:, 247:267]
            X = np.hstack((a,b))
        elif fisher_mode == 'AAONLY':
            a = data[:, 40:227]
            b = data[:, 267:454]
            X = np.hstack((a,b))
        else:
            raise ValueError('unknown fisher_mode: %s' % fisher_mode)
        return X, y

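The slicing in select_X_y implies a fixed layout of 455 columns per file: for each domain partner, 20 leading columns are skipped, then come 20 Fisher M1 columns and 187 AA-index columns, with the label in the last column. A minimal sanity check of that bookkeeping on a dummy matrix (hypothetical shapes, no disk access):

In [ ]:
# Expected widths per fisher_mode: FisherM1 = 207+207, FisherM1ONLY = 20+20,
# AAONLY = 187+187.
fam = DDI_family_base.__new__(DDI_family_base)  # bypass __init__: no files read
dummy = np.zeros((5, 455))
for mode, width in [('FisherM1', 414), ('FisherM1ONLY', 40), ('AAONLY', 374)]:
    X, y = fam.select_X_y(dummy, fisher_mode = mode)
    assert X.shape == (5, width) and y.shape == (5,)
    print mode, X.shape
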
In [45]:
import sklearn.preprocessing
class Precessing_Scaler_0_9(sklearn.preprocessing.StandardScaler):
    def __init__(self):
        # StandardScaler only tests the truthiness of with_std, so 0.333
        # behaves like True (the parent still divides by the std)
        super(Precessing_Scaler_0_9, self).__init__(with_std=0.333)
    def transform(self, X): # map standardized data into [0.1, 0.9]
        new_X = super(Precessing_Scaler_0_9, self).transform(X)
        new_X[new_X > 1] = 1
        new_X[new_X < -1] = -1
        new_X = (new_X + 1) * 0.4 + 0.1
        return new_X
    def fit_transform(self, X, y = None):
        raise NotImplementedError('use fit() followed by transform()')
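# Mapping check (pure arithmetic): a standardized value z is clipped to
# [-1, 1], then (z + 1) * 0.4 + 0.1 sends -1 -> 0.1, 0 -> 0.5, 1 -> 0.9.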
def performance_score(target_label, predicted_label, predicted_score = False, print_report = True): 
    """ Get performance metrics for a prediction.
        Attributes:
            target_label: int 0, 1
            predicted_label: 0, 1 or a ranking score
            predicted_score: bool. If False, predicted_label holds 0/1 labels.
                If True, predicted_label holds ranking scores: the AUC is
                computed first, then the scores are thresholded at 0.5 for the
                label-based metrics.
            print_report: if True, print the performance on screen
    """
    import sklearn
    from sklearn.metrics import roc_auc_score
    from collections import OrderedDict
    # keep insertion order so the CSV columns line up with the header row
    score = OrderedDict()
    if predicted_score == False:
        score['accuracy'] = sklearn.metrics.accuracy_score(target_label, predicted_label)
        score['precision'] = sklearn.metrics.precision_score(target_label, predicted_label, pos_label=1)
        score['recall'] = sklearn.metrics.recall_score(target_label, predicted_label, pos_label=1)
    if predicted_score == True:
        score['auc_score'] = roc_auc_score(target_label, predicted_label)
        # threshold the predicted scores, not the targets
        predicted_label = [x >= 0.5 for x in predicted_label]
        score['accuracy'] = sklearn.metrics.accuracy_score(target_label, predicted_label)
        score['precision'] = sklearn.metrics.precision_score(target_label, predicted_label, pos_label=1)
        score['recall'] = sklearn.metrics.recall_score(target_label, predicted_label, pos_label=1)
    if print_report == True:
        for key, value in score.iteritems():
            print key, '{percent:.1%}'.format(percent=value)
    return score
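
# Returned dict on toy labels (hypothetical example):
#   performance_score([1, 0, 1, 1], [1, 0, 0, 1], print_report = False)
#   -> accuracy 0.75, precision 1.0, recall 0.667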

def LOO_out_performance_for_all(ddis):
    for ddi in ddis:
        process_one_ddi(ddi)
def process_one_ddi(ddi):
    """Run the leave-one-out evaluation for a single DDI family."""
    logger.info('DDI: %s' % ddi)
    one_ddi_family = {}
    one_ddi_family[ddi] = LOO_out_performance_for_one_ddi(ddi)
    one_ddi_family[ddi].get_LOO_perfermance('FisherM1', '')
    return None
def parallel_process(function, ddis, nproc = 2):
    # maximum number of simultaneous processes desired
    results = pprocess.Map(limit=nproc, reuse=1)
    parallel_function = results.manage(pprocess.MakeReusable(function))
    [parallel_function(ddi) for ddi in ddis]  # Start computing things
    return results[:]
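
# pprocess notes (as used above): Map(limit=nproc) caps the number of
# concurrent worker processes, manage(MakeReusable(function)) wraps the
# function so each call runs in a worker, and slicing results[:] blocks
# until all workers return; an exception raised in a worker is pickled
# back and re-raised at that point (see the KeyError traceback below).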

class LOO_out_performance_for_one_ddi(object):
        """ get the performance of ddi families
        Attributes:
            ddi: string ddi name
            Vectors_Fishers_aaIndex_raw_folder: string, folder
            total_number_of_sequences: int
            raw_data: dict raw_data[2]

        """
        def __init__(self, ddi):
            self.ddi_obj = DDI_family_base(ddi)
            self.ddi = ddi
        def analysis_score(self, target_label, predicted_label): #new
            score = (sklearn.metrics.accuracy_score(target_label, predicted_label),
                     sklearn.metrics.precision_score(target_label, predicted_label, pos_label=1),
                     sklearn.metrics.recall_score(target_label, predicted_label, pos_label=1))
            return score
        def saveAsCsv(self, predicted_score, fname, *arguments): #new
            logger.info('saving to file: %s' % fname)
            newfile = not os.path.isfile(fname + '_report.csv')
            csvfile = open(fname + '_report.csv', 'a+')
            writer = csv.writer(csvfile)
            if newfile == True:
                if predicted_score == False:
                    writer.writerow(['DDI', 'no.', 'FisherMode', 'method', 'isTest', 'accuracy', 'precision', 'recall']) #, 'AUC'])
                else:
                    writer.writerow(['DDI', 'no.', 'FisherMode', 'method', 'isTest', 'AUC', 'accuracy', 'precision', 'recall'])
            for arg in arguments:        
                writer.writerows(arg)
            csvfile.close()

        def get_LOO_perfermance(self, fisher_mode, settings = None):
            analysis_scr = []
            predicted_score = False
            reduce_ratio = 4
            for seq_no in range(1, self.ddi_obj.total_number_of_sequences+1):
                print seq_no
                print "SVM"
                (train_X_LOO, train_y_LOO),(train_X_reduced, train_y_reduced), (test_X, test_y) = self.ddi_obj.get_LOO_training_and_reduced_traing(seq_no, reduce_ratio = reduce_ratio)
                standard_scaler = preprocessing.StandardScaler().fit(train_X_reduced)
                scaled_train_X = standard_scaler.transform(train_X_reduced)
                scaled_test_X = standard_scaler.transform(test_X)
                Linear_SVC = LinearSVC(C=1, penalty="l2")
                Linear_SVC.fit(scaled_train_X, train_y_reduced)
                predicted_test_y = Linear_SVC.predict(scaled_test_X)
                isTest = True; #new
                analysis_scr.append((self.ddi, seq_no, fisher_mode, 'SVM', isTest) + tuple(performance_score(test_y, predicted_test_y).values())) #new
                
                
                # reuse the fitted model; refitting on the same data is redundant
                predicted_train_y = Linear_SVC.predict(scaled_train_X)
                isTest = False; #new
                analysis_scr.append((self.ddi, seq_no, fisher_mode, 'SVM', isTest) + tuple(performance_score(train_y_reduced, predicted_train_y).values()))
                
                
                print "direct deep learning"
                # direct deep learning 
                min_max_scaler = Precessing_Scaler_0_9()
                min_max_scaler.fit(train_X_reduced)
                X_train_pre_validation_minmax = min_max_scaler.transform(train_X_reduced)
                x_test_minmax = min_max_scaler.transform(test_X)
                pretraining_X_minmax = min_max_scaler.transform(train_X_LOO)
                x_train_minmax, x_validation_minmax, y_train_minmax, y_validation_minmax = train_test_split(
                    X_train_pre_validation_minmax, train_y_reduced, test_size=0.2, random_state=42)
                finetune_lr = 1
                batch_size = 30
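                # cal_epochs comes from DL_libs; from its use here it appears
                # to convert a target number of updates (10000) into an epoch
                # count for the given training set size and batch size.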
                pretraining_epochs = cal_epochs(10000, x_train_minmax, batch_size = batch_size)
                #pretrain_lr=0.001
                pretrain_lr = 0.01
                training_epochs = 1000
                hidden_layers_sizes= [100, 100]
                corruption_levels = [0,0]
                sda = trainSda(x_train_minmax, y_train_minmax,
                             x_validation_minmax, y_validation_minmax,
                             x_test_minmax, test_y,
                             hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels,
                             batch_size = batch_size, training_epochs = training_epochs,
                             pretraining_epochs = pretraining_epochs,
                             pretrain_lr = pretrain_lr, finetune_lr = finetune_lr)
                print 'hidden_layers_sizes:', hidden_layers_sizes
                print 'corruption_levels:', corruption_levels
                training_predicted = sda.predict(x_train_minmax)
                y_train = y_train_minmax
                isTest = False; #new
                analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL', isTest) + tuple(performance_score(y_train, training_predicted).values()))
                
                test_predicted = sda.predict(x_test_minmax)
                y_test = test_y
                isTest = True; #new
                analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL', isTest) + tuple(performance_score(y_test, test_predicted).values()))
                
                
                # deep learning using unlabeled data for pretraining
                print 'deep learning with unlabeled data'
                pretraining_epochs = cal_epochs(10000, pretraining_X_minmax, batch_size = batch_size)
                sda_unlabel = trainSda(x_train_minmax, y_train_minmax,
                             x_validation_minmax, y_validation_minmax,
                             x_test_minmax, test_y,
                             pretraining_X_minmax = pretraining_X_minmax,
                             hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels,
                             batch_size = batch_size, training_epochs = training_epochs,
                             pretraining_epochs = pretraining_epochs,
                             pretrain_lr = pretrain_lr, finetune_lr = finetune_lr)
                print 'hidden_layers_sizes:', hidden_layers_sizes
                print 'corruption_levels:', corruption_levels
                training_predicted = sda_unlabel.predict(x_train_minmax)
                y_train = y_train_minmax
                isTest = False; #new
                analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL_U', isTest) + tuple(performance_score(y_train, training_predicted, predicted_score).values()))
                
                test_predicted = sda_unlabel.predict(x_test_minmax)
                y_test = test_y
                
                isTest = True; #new
                analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL_U', isTest) + tuple(performance_score(y_test, test_predicted, predicted_score).values()))
            report_name = filename + '_' + '_'.join(map(str, hidden_layers_sizes)) + \
                            '_' + str(pretrain_lr) + '_' + str(finetune_lr) + '_' + str(reduce_ratio)
            self.saveAsCsv(predicted_score, report_name, analysis_scr)

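The CSV report name is composed from the input file name and the run's hyperparameters; a quick check of that composition with the values used above (pure string work, nothing is trained or written):

In [ ]:
hidden_layers_sizes = [100, 100]; pretrain_lr = 0.01; finetune_lr = 1; reduce_ratio = 4
report_name = filename + '_' + '_'.join(map(str, hidden_layers_sizes)) + \
                '_' + str(pretrain_lr) + '_' + str(finetune_lr) + '_' + str(reduce_ratio)
print report_name + '_report.csv'
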
In [46]:
'DDI: %s' % ddi


Out[46]:
'DDI: 6PGD_int_NAD_binding_2'

In [1]:
process_one_ddi(ddis[0])


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-463297ca20f0> in <module>()
----> 1 process_one_ddi(ddis[0])

NameError: name 'process_one_ddi' is not defined

In [ ]:


In [7]:
parallel_process(process_one_ddi, ddis, nproc = 7)


Ald_Xan_dh_C_int_FAD_binding_5 20
error return without exception set
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-7-b21b401df1e6> in <module>()
      1 
----> 2 parallel_process(process_one_ddi, ddis, nproc = 7)

<ipython-input-5-b97ff39cb95a> in parallel_process(function, ddis, nproc)
     53     parallel_function = results.manage(pprocess.MakeReusable(function))
     54     [parallel_function(ddi) for ddi in ddis]  # Start computing things
---> 55     return results[:]
     56 
     57 class LOO_out_performance_for_one_ddi(object):

/usr/local/lib/python2.7/dist-packages/pprocess.pyc in __getitem__(self, i)
    782 
    783         while self.unfinished():
--> 784             self.store()
    785             try:
    786                 return self._get(i)

/usr/local/lib/python2.7/dist-packages/pprocess.pyc in store(self, timeout)
    401             for channel in self.ready(timeout):
    402                 try:
--> 403                     self.store_data(channel)
    404                     self.start_waiting(channel)
    405                 except IOError, exc:

/usr/local/lib/python2.7/dist-packages/pprocess.pyc in store_data(self, channel)
    747         "Accumulate the incoming data, associating results with channels."
    748 
--> 749         data = channel.receive()
    750         self.results[self.channels[channel]] = data
    751         del self.channels[channel]

/usr/local/lib/python2.7/dist-packages/pprocess.pyc in receive(self)
    136 
    137         try:
--> 138             obj = self._receive()
    139             return obj
    140         finally:

/usr/local/lib/python2.7/dist-packages/pprocess.pyc in _receive(self)
    122         obj = pickle.load(self.read_pipe)
    123         if isinstance(obj, Exception):
--> 124             raise obj
    125         else:
    126             return obj

KeyError: 20

In [ ]:


In [2]:
LOO_out_performance_for_all(ddis)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-bc27e0896ebe> in <module>()
----> 1 LOO_out_performance_for_all(ddis)

NameError: name 'LOO_out_performance_for_all' is not defined

In [ ]:


In [3]:
one_ddi_family.get_LOO_perfermance('AAONLY', '1')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-f1a291fe5526> in <module>()
----> 1 one_ddi_family.get_LOO_perfermance('AAONLY', '1')

NameError: name 'one_ddi_family' is not defined

In [ ]:
# scratch: assumes `a` already holds one family's raw data matrix
data = a
a = data[:, 20:227]
b = data[:, 247:454]
X = np.hstack((a, b))
print a.shape, b.shape, X.shape, X

In [ ]: