BuildingBatches



In [62]:
import pandas as pd
import numpy as np
import csv
import gc
import os
from sklearn.model_selection import train_test_split
from random import randint

In [63]:
directory='../GeneInteractionsBN_Datasets/Batches'
if not os.path.exists(directory):
    os.makedirs(directory)

In [67]:
# Dat Size and proportions
#files = pd.DataFrame(np.random.randn(10, 5), columns=['a', 'b', 'c', 'd', 'e'])
directory = '../GeneInteractionsBN_Datasets/Labeled/'
data_list = {}
total_size = 0
for filename in os.listdir(directory):
    if filename.endswith(".csv"): 
        print(os.path.join(directory, filename))
        csv = pd.read_csv(filepath_or_buffer=os.path.join(directory, filename),sep=',')
        key_name = filename.split('.')[0].split('_')[0]
        data_list[key_name]=[0,[]]
        #print(csv['Target'][0])
        for i in range(0,len(csv)):
            value = int(csv['Target'][i])
            if (value==1 or value==0):
                # data_list[key_name][1][0] for index
                # data_list[key_name][1][2] for marker
                data_list[key_name][1].append([i,value,0])
                data_list[key_name][0]=data_list[key_name][0]+1
        #print(key_name)
        #print(data_list[key_name])
        total_size = total_size + data_list[key_name][0]
        print(data_list[key_name][0])
        print(total_size)
        del csv
        gc.collect()
        continue
    else:
        continue


../GeneInteractionsBN_Datasets/Labeled/LEF1_Target.csv
68070
68070
../GeneInteractionsBN_Datasets/Labeled/CTNNB1_Target.csv
82038
150108
../GeneInteractionsBN_Datasets/Labeled/Erk_Target.csv
140730
290838
../GeneInteractionsBN_Datasets/Labeled/Ikk2_Target.csv
108984
399822
../GeneInteractionsBN_Datasets/Labeled/MYC_Target.csv
64002
463824
../GeneInteractionsBN_Datasets/Labeled/IRF4_Target.csv
96654
560478
../GeneInteractionsBN_Datasets/Labeled/Jnk_Target.csv
213102
773580

In [65]:
marked = 0
for key in data_list:
    for i in data_list[key][1]:
        if i[2]==1:
            marked=marked+1
print(marked)


0

In [ ]:
directory = '../GeneInteractionsBN_Datasets/DataGeneCausality/Raw/'

batch_size = 6000
n_loops = int(total_size/batch_size)
for i in range(0,n_loops):
    csv_output = []
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            print(os.path.join(directory, filename))
            csv = pd.read_csv(filepath_or_buffer=os.path.join(directory, filename),sep=';')
            pd_headers = csv.columns.insert(0,'TrueIndex')
            key_name = filename.split('.')[0]
            contribution_size = int(data_list[key_name][0]*batch_size/total_size)
            print(contribution_size)
            while contribution_size>0:
                chosen_index = randint(0, data_list[key_name][0]-1)
                # data_list[key_name][1][i][0] for index
                # data_list[key_name][1][i][2] for marker
                if data_list[key_name][1][chosen_index][2] == 0: # proceeds
                    data_list[key_name][1][chosen_index][2] = 1 # sets marker
                    contribution_size = contribution_size-1
                    # inserting first the true index and then the values
                    csv_output.append(np.insert(csv.iloc[data_list[key_name][1][chosen_index][0],:].values,0,data_list[key_name][1][chosen_index][0]))
            del csv
            gc.collect()
            continue
        else:
            continue
    df = pd.DataFrame(data=csv_output,columns=pd_headers)
    df.to_csv('../GeneInteractionsBN_Datasets/Batches/'+'batch_'+str(i)+'.csv')
    del csv_output
    del df


../GeneInteractionsBN_Datasets/DataGeneCausality/Raw/LEF1.csv
527
../GeneInteractionsBN_Datasets/DataGeneCausality/Raw/MYC.csv
496
../GeneInteractionsBN_Datasets/DataGeneCausality/Raw/CTNNB1.csv
636
../GeneInteractionsBN_Datasets/DataGeneCausality/Raw/Erk.csv
1091
../GeneInteractionsBN_Datasets/DataGeneCausality/Raw/IRF4.csv
749
../GeneInteractionsBN_Datasets/DataGeneCausality/Raw/Ikk2.csv
845
../GeneInteractionsBN_Datasets/DataGeneCausality/Raw/Jnk.csv
1652
../GeneInteractionsBN_Datasets/DataGeneCausality/Raw/LEF1.csv
527
../GeneInteractionsBN_Datasets/DataGeneCausality/Raw/MYC.csv

In [56]:
marked = 0
for key in data_list:
    for i in data_list[key][1]:
        if i[2]==1:
            marked=marked+1
print(marked)


767488

In [57]:
n_loops


Out[57]:
128

In [ ]: