In [1]:
import sys, os
sys.path.append('../../../libs/')
import os.path
import IO_class
from IO_class import FileOperator
from sklearn import cross_validation
import sklearn
import numpy as np
import csv
from dateutil import parser
from datetime import timedelta
from sklearn import svm
import numpy as np
import pandas as pd
import pdb
import pickle
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
import sklearn
import scipy.stats as ss
from sklearn.svm import LinearSVC
import random
from DL_libs import *
from itertools import izip #new
import pprocess
In [31]:
#filename = 'SUCCESS_log_CrossValidation_load_DL_remoteFisherM1_DL_RE_US_DL_RE_US_1_1_19MAY2014.txt'
filename = 'listOfDDIsHaveOver2InterfacesHave40-75_Examples_2010_real_selected.txt' #for testing
#filename = 'list_of_SUCCESS_log_run3DID_STAT_200_275_examples_201001MAY2014.txt'
file_obj = FileOperator(filename)
ddis = file_obj.readStripLines()
import logging
import time
current = time.strftime("%m_%d_%Y")
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logname = 'DL_contact_matrix_' + current + '.log'
handler = logging.FileHandler(logname)
handler.setLevel(logging.DEBUG)
# create a logging format
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
# add the handlers to the logger
logger.addHandler(handler)
logger.info('Input DDI file: ' + filename)
#logger.debug('This message should go to the log file')
In [18]:
ddi = ddis[0]
ddis
Out[18]:
In [41]:
class DDI_family_base(object):
def __init__(self, ddi, Vectors_Fishers_aaIndex_raw_folder = '/home/du/Documents/Vectors_Fishers_aaIndex_raw_2014/'):
#def __init__(self, ddi, Vectors_Fishers_aaIndex_raw_folder = '/home/sun/Downloads/contactmatrix/contactmatrixanddeeplearningcode/data_test/'):
""" get total number of sequences in a ddi familgy
Attributes:
ddi: string ddi name
Vectors_Fishers_aaIndex_raw_folder: string, folder
total_number_of_sequences: int
raw_data: dict raw_data[2]
LOO_data['FisherM1'][1]
"""
self.ddi = ddi
self.Vectors_Fishers_aaIndex_raw_folder = Vectors_Fishers_aaIndex_raw_folder
self.ddi_folder = self.Vectors_Fishers_aaIndex_raw_folder + ddi + '/'
self.total_number_of_sequences = self.get_total_number_of_sequences()
self.raw_data = {}
self.positve_negative_number = {}
self.equal_size_data = {}
for seq_no in range(1, self.total_number_of_sequences+1):
self.raw_data[seq_no] = self.get_raw_data_for_selected_seq(seq_no)
try:
#positive_file = self.ddi_folder + 'numPos_'+ str(seq_no) + '.txt'
#file_obj = FileOperator(positive_file)
#lines = file_obj.readStripLines()
#import pdb; pdb.set_trace()
count_pos = int(np.sum(self.raw_data[seq_no][:, -1]))
count_neg = self.raw_data[seq_no].shape[0] - count_pos
#self.positve_negative_number[seq_no] = {'numPos': int(float(lines[0]))}
#assert int(float(lines[0])) == count_pos
self.positve_negative_number[seq_no] = {'numPos': count_pos}
#negative_file = self.ddi_folder + 'numNeg_'+ str(seq_no) + '.txt'
#file_obj = FileOperator(negative_file)
#lines = file_obj.readStripLines()
#self.positve_negative_number[seq_no]['numNeg'] = int(float(lines[0]))
self.positve_negative_number[seq_no]['numNeg'] = count_neg
except Exception,e:
print ddi, seq_no
print str(e)
# get data for equal positive and negative
n_pos = self.positve_negative_number[seq_no]['numPos']
n_neg = self.positve_negative_number[seq_no]['numNeg']
index_neg = range(n_pos, n_pos + n_neg)
random.shuffle(index_neg)
index_neg = index_neg[: n_pos]
positive_examples = self.raw_data[seq_no][ : n_pos, :]
negative_examples = self.raw_data[seq_no][index_neg, :]
self.equal_size_data[seq_no] = np.vstack((positive_examples, negative_examples))
def get_LOO_training_and_reduced_traing(self, seq_no, fisher_mode = 'FisherM1ONLY' , reduce_ratio = 4):
""" get the leave one out traing data, reduced traing
Parameters:
seq_no:
fisher_mode: default 'FisherM1ONLY'
Returns:
(train_X_LOO, train_y_LOO),(train_X_reduced, train_y_reduced), (test_X, test_y)
"""
train_X_LOO = np.array([])
train_y_LOO = np.array([])
train_X_reduced = np.array([])
train_y_reduced = np.array([])
total_number_of_sequences = self.total_number_of_sequences
equal_size_data_selected_sequence = self.equal_size_data[seq_no]
#get test data for selected sequence
test_X, test_y = self.select_X_y(equal_size_data_selected_sequence, fisher_mode = fisher_mode)
total_sequences = range(1, total_number_of_sequences+1)
loo_sequences = [i for i in total_sequences if i != seq_no]
number_of_reduced = len(loo_sequences)/reduce_ratio if len(loo_sequences)/reduce_ratio !=0 else 1
random.shuffle(loo_sequences)
reduced_sequences = loo_sequences[:number_of_reduced]
#for loo data
for current_no in loo_sequences:
raw_current_data = self.equal_size_data[current_no]
current_X, current_y = self.select_X_y(raw_current_data, fisher_mode = fisher_mode)
if train_X_LOO.ndim ==1:
train_X_LOO = current_X
else:
train_X_LOO = np.vstack((train_X_LOO, current_X))
train_y_LOO = np.concatenate((train_y_LOO, current_y))
#for reduced data
for current_no in reduced_sequences:
raw_current_data = self.equal_size_data[current_no]
current_X, current_y = self.select_X_y(raw_current_data, fisher_mode = fisher_mode)
if train_X_reduced.ndim ==1:
train_X_reduced = current_X
else:
train_X_reduced = np.vstack((train_X_reduced, current_X))
train_y_reduced = np.concatenate((train_y_reduced, current_y))
return (train_X_LOO, train_y_LOO),(train_X_reduced, train_y_reduced), (test_X, test_y)
def get_total_number_of_sequences(self):
""" get total number of sequences in a ddi familgy
Parameters:
ddi: string
Vectors_Fishers_aaIndex_raw_folder: string
Returns:
n: int
"""
folder_path = self.Vectors_Fishers_aaIndex_raw_folder + self.ddi + '/'
filename = folder_path +'allPairs.txt'
all_pairs = np.loadtxt(filename)
return len(all_pairs)
def get_raw_data_for_selected_seq(self, seq_no):
""" get raw data for selected seq no in a family
Parameters:
ddi:
seq_no:
Returns:
data: raw data in the sequence file
"""
folder_path = self.Vectors_Fishers_aaIndex_raw_folder + self.ddi + '/'
filename = folder_path + 'F0_20_F1_20_Sliding_17_11_F0_20_F1_20_Sliding_17_11_ouput_'+ str(seq_no) + '.txt'
data = np.loadtxt(filename)
return data
def select_X_y(self, data, fisher_mode = ''):
""" select subset from the raw input data set
Parameters:
data: data from matlab txt file
fisher_mode: subset base on this Fisher of AAONLY...
Returns:
selected X, y
"""
y = data[:,-1] # get lable
if fisher_mode == 'FisherM1': # fisher m1 plus AA index
a = data[:, 20:227]
b = data[:, 247:454]
X = np.hstack((a,b))
elif fisher_mode == 'FisherM1ONLY':
a = data[:, 20:40]
b = data[:, 247:267]
X = np.hstack((a,b))
elif fisher_mode == 'AAONLY':
a = data[:, 40:227]
b = data[:, 267:454]
X = np.hstack((a,b))
else:
raise('there is an error in mode')
return X, y
In [45]:
import sklearn.preprocessing
class Precessing_Scaler_0_9(sklearn.preprocessing.StandardScaler):
def __init__(self):
super(Precessing_Scaler_0_9, self).__init__(self, with_std=0.333)
def transform(self, X): # transform data to 0.1 to 0.9
new_X = super(Precessing_Scaler_0_9, self).transform(X)
print
new_X[new_X > 1] = 1
new_X[new_X < -1] = -1
new_X = (new_X + 1) * 0.4 + 0.1
return new_X
def fit_transform(self):
print 'Did not implement'
def performance_score(target_label, predicted_label, predicted_score = False, print_report = True):
""" get performance matrix for prediction
Attributes:
target_label: int 0, 1
predicted_label: 0, 1 or ranking
predicted_score: bool if False, predicted_label is from 0, 1. If Ture, predicted_label is ranked, need to get AUC score.
print_report: if True, print the perfromannce on screen
"""
import sklearn
from sklearn.metrics import roc_auc_score
score = {}
if predicted_score == False:
score['accuracy'] = sklearn.metrics.accuracy_score(target_label, predicted_label)
score['precision'] = sklearn.metrics.precision_score(target_label, predicted_label, pos_label=1)
score['recall'] = sklearn.metrics.recall_score(target_label, predicted_label, pos_label=1)
if predicted_score == True:
auc_score = roc_auc_score(target_label, predicted_label)
score['auc_score'] = auc_score
target_label = [x >= 0.5 for x in target_label]
score['accuracy'] = sklearn.metrics.accuracy_score(target_label, predicted_label)
score['precision'] = sklearn.metrics.precision_score(target_label, predicted_label, pos_label=1)
score['recall'] = sklearn.metrics.recall_score(target_label, predicted_label, pos_label=1)
if print_report == True:
for key, value in score.iteritems():
print key, '{percent:.1%}'.format(percent=value)
return score
def LOO_out_performance_for_all(ddis):
for ddi in ddis:
process_one_ddi(ddi)
def process_one_ddi(ddi):
"""A function to waste CPU cycles"""
logger.info('DDI: %s' % ddi)
one_ddi_family = {}
one_ddi_family[ddi] = LOO_out_performance_for_one_ddi(ddi)
one_ddi_family[ddi].get_LOO_perfermance('FisherM1', '')
return None
def parallel_process(function, ddis, nproc = 2):
# maximum number of simultaneous processes desired
results = pprocess.Map(limit=nproc, reuse=1)
parallel_function = results.manage(pprocess.MakeReusable(function))
[parallel_function(ddi) for ddi in ddis] # Start computing things
return results[:]
class LOO_out_performance_for_one_ddi(object):
""" get the performance of ddi families
Attributes:
ddi: string ddi name
Vectors_Fishers_aaIndex_raw_folder: string, folder
total_number_of_sequences: int
raw_data: dict raw_data[2]
"""
def __init__(self, ddi):
self.ddi_obj = DDI_family_base(ddi)
self.ddi = ddi
def analysis_score(self, target_label, predicted_label): #new
score = (sklearn.metrics.accuracy_score(target_label, predicted_label),
sklearn.metrics.precision_score(target_label, predicted_label, pos_label=1),
sklearn.metrics.recall_score(target_label, predicted_label, pos_label=1))
return score
def saveAsCsv(self, predicted_score, fname, *arguments): #new
newfile = False
logger.info('saving to file: %s' %fname)
if os.path.isfile(fname + '_report.csv'):
pass
else:
newfile = True
csvfile = open(fname + '_report.csv', 'a+')
writer = csv.writer(csvfile)
if newfile == True:
if predicted_score == False:
writer.writerow(['DDI', 'no.', 'FisherMode', 'method', 'isTest', 'accuracy', 'precision', 'recall']) #, 'AUC'])
else:
writer.writerow(['DDI', 'no.', 'FisherMode', 'method', 'isTest', 'AUC', 'accuracy', 'precision', 'recall'])
for arg in arguments:
writer.writerows(arg)
csvfile.close()
def get_LOO_perfermance(self, fisher_mode, settings = None):
analysis_scr = []
predicted_score = False
reduce_ratio = 4
for seq_no in range(1, self.ddi_obj.total_number_of_sequences+1):
print seq_no
print "SVM"
(train_X_LOO, train_y_LOO),(train_X_reduced, train_y_reduced), (test_X, test_y) = self.ddi_obj.get_LOO_training_and_reduced_traing(seq_no, reduce_ratio = reduce_ratio)
standard_scaler = preprocessing.StandardScaler().fit(train_X_reduced)
scaled_train_X = standard_scaler.transform(train_X_reduced)
scaled_test_X = standard_scaler.transform(test_X)
Linear_SVC = LinearSVC(C=1, penalty="l2")
Linear_SVC.fit(scaled_train_X, train_y_reduced)
predicted_test_y = Linear_SVC.predict(scaled_test_X)
isTest = True; #new
analysis_scr.append((self.ddi, seq_no, fisher_mode, 'SVM', isTest) + tuple(performance_score(test_y, predicted_test_y).values())) #new
Linear_SVC = LinearSVC(C=1, penalty="l2")
Linear_SVC.fit(scaled_train_X, train_y_reduced)
predicted_train_y = Linear_SVC.predict(scaled_train_X)
isTest = False; #new
analysis_scr.append((self.ddi, seq_no, fisher_mode, 'SVM', isTest) + tuple(performance_score(train_y_reduced, predicted_train_y).values()))
print "direct deep learning"
# direct deep learning
min_max_scaler = Precessing_Scaler_0_9()
X_train_pre_validation_minmax = min_max_scaler.fit(train_X_reduced)
X_train_pre_validation_minmax = min_max_scaler.transform(train_X_reduced)
x_test_minmax = min_max_scaler.transform(test_X)
pretraining_X_minmax = min_max_scaler.transform(train_X_LOO)
x_train_minmax, x_validation_minmax, y_train_minmax, y_validation_minmax = train_test_split(X_train_pre_validation_minmax,
train_y_reduced
, test_size=0.2, random_state=42)
finetune_lr = 1
batch_size = 30
pretraining_epochs = cal_epochs(10000, x_train_minmax, batch_size = batch_size)
#pretrain_lr=0.001
pretrain_lr = 0.01
training_epochs = 1000
hidden_layers_sizes= [100, 100]
corruption_levels = [0,0]
sda = trainSda(x_train_minmax, y_train_minmax,
x_validation_minmax, y_validation_minmax ,
x_test_minmax, test_y,
hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
training_epochs = training_epochs, pretraining_epochs = pretraining_epochs,
pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
)
print 'hidden_layers_sizes:', hidden_layers_sizes
print 'corruption_levels:', corruption_levels
training_predicted = sda.predict(x_train_minmax)
y_train = y_train_minmax
isTest = False; #new
analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL', isTest) + tuple(performance_score(y_train, training_predicted).values()))
test_predicted = sda.predict(x_test_minmax)
y_test = test_y
isTest = True; #new
analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL', isTest) + tuple(performance_score(y_test, test_predicted).values()))
# deep learning using unlabeled data for pretraining
print 'deep learning with unlabel data'
pretraining_epochs = cal_epochs(10000, pretraining_X_minmax, batch_size = batch_size)
sda_unlabel = trainSda(x_train_minmax, y_train_minmax,
x_validation_minmax, y_validation_minmax ,
x_test_minmax, test_y,
pretraining_X_minmax = pretraining_X_minmax,
hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
training_epochs = training_epochs, pretraining_epochs = pretraining_epochs,
pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
)
print 'hidden_layers_sizes:', hidden_layers_sizes
print 'corruption_levels:', corruption_levels
training_predicted = sda_unlabel.predict(x_train_minmax)
y_train = y_train_minmax
isTest = False; #new
analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL_U', isTest) + tuple(performance_score(y_train, training_predicted, predicted_score).values()))
test_predicted = sda_unlabel.predict(x_test_minmax)
y_test = test_y
isTest = True; #new
analysis_scr.append((self.ddi, seq_no, fisher_mode, 'DL_U', isTest) + tuple(performance_score(y_test, test_predicted, predicted_score).values()))
report_name = filename + '_' + '_'.join(map(str, hidden_layers_sizes)) + \
'_' + str(pretrain_lr) + '_' + str(finetune_lr) + '_' + str(reduce_ratio)
self.saveAsCsv(predicted_score, report_name, analysis_scr)
In [46]:
'DDI: %s' % ddi
Out[46]:
In [1]:
process_one_ddi(ddis[0])
In [ ]:
In [7]:
parallel_process(process_one_ddi, ddis, nproc = 2)
In [ ]:
In [2]:
LOO_out_performance_for_all(ddis)
In [ ]:
In [3]:
one_ddi_family.get_LOO_perfermance('AAONLY', '1')
In [ ]:
data =a
a = data[:, 20:227]
b = data[:, 247:454]
X = np.hstack((a,b))
print a.shape, b.shape, X.shape, X
In [ ]: