In [62]:
import csv
import gc
import os
from random import randint, sample

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
In [63]:
# Make sure the folder that will receive the generated batch files exists.
# exist_ok=True replaces the separate exists() check, so there is no window
# between checking and creating, and re-running the cell is harmless.
directory='../GeneInteractionsBN_Datasets/Batches'
os.makedirs(directory, exist_ok=True)
In [67]:
# Data size and proportions: index the labeled rows of every dataset.
#
# Produces two globals:
#   data_list[key_name] -> [n_labeled, [[row_index, target, marker], ...]]
#   total_size          -> number of labeled rows summed over all files
# A row counts as labeled when int(Target) is 0 or 1. Every marker starts at 0.
directory = '../GeneInteractionsBN_Datasets/Labeled/'
data_list = {}
total_size = 0
for filename in os.listdir(directory):
    if not filename.endswith(".csv"):
        continue
    labeled_path = os.path.join(directory, filename)
    print(labeled_path)
    # Only the Target column is read, so only that column is parsed. The
    # series gets its own name so the imported `csv` module is not shadowed
    # and then deleted.
    targets = pd.read_csv(filepath_or_buffer=labeled_path, sep=',', usecols=['Target'])['Target']
    # Key: file name up to the first '.', then up to the first '_'.
    key_name = filename.split('.')[0].split('_')[0]
    labeled_rows = []
    for row_index, raw_value in enumerate(targets):
        value = int(raw_value)
        if value == 0 or value == 1:
            # [row position in the file, target value, marker]
            labeled_rows.append([row_index, value, 0])
    data_list[key_name] = [len(labeled_rows), labeled_rows]
    total_size = total_size + len(labeled_rows)
    print(len(labeled_rows))
    print(total_size)
    # Release the column before the next file is loaded.
    del targets
    gc.collect()
In [65]:
# Tally the entries whose marker (third slot of each entry) is set to 1.
marked = 0
for _n_labeled, entries in data_list.values():
    marked += sum(1 for entry in entries if entry[2] == 1)
print(marked)
In [ ]:
# Write n_loops batch files of randomly chosen labeled rows.
#
# Reads the globals `data_list` and `total_size`. For every batch, each raw
# file contributes int(n_labeled * batch_size / total_size) rows, i.e. roughly
# its share of the labeled data. A chosen row gets its marker set to 1 so it
# is not written to a later batch.
# NOTE(review): assumes the row positions stored in data_list line up with
# the rows of the raw file of the same key - confirm.
directory = '../GeneInteractionsBN_Datasets/DataGeneCausality/Raw/'
batch_size = 6000
n_loops = int(total_size/batch_size)
for batch_number in range(0,n_loops):
    csv_output = []
    for filename in os.listdir(directory):
        if not filename.endswith(".csv"):
            continue
        raw_path = os.path.join(directory, filename)
        print(raw_path)
        # Named raw_frame so the imported `csv` module is not shadowed.
        raw_frame = pd.read_csv(filepath_or_buffer=raw_path, sep=';')
        pd_headers = raw_frame.columns.insert(0,'TrueIndex')
        key_name = filename.split('.')[0]
        contribution_size = int(data_list[key_name][0]*batch_size/total_size)
        print(contribution_size)
        # Draw without replacement from the rows that are still unmarked.
        # Retrying random indices until an unmarked one turns up never ends
        # once fewer unmarked rows remain than requested (for example when
        # this cell is run a second time), so the request is capped instead.
        unmarked = [entry for entry in data_list[key_name][1] if entry[2] == 0]
        if contribution_size > len(unmarked):
            print('only', len(unmarked), 'unmarked rows left for', key_name)
            contribution_size = len(unmarked)
        for entry in sample(unmarked, contribution_size):
            entry[2] = 1  # sets marker
            row_index = entry[0]
            # the true index goes first, followed by the row's values
            csv_output.append(np.insert(raw_frame.iloc[row_index,:].values,0,row_index))
        # Release the frame before the next file is loaded.
        del raw_frame
        gc.collect()
    df = pd.DataFrame(data=csv_output,columns=pd_headers)
    df.to_csv('../GeneInteractionsBN_Datasets/Batches/'+'batch_'+str(batch_number)+'.csv')
    del csv_output
    del df
In [56]:
# Tally the entries whose marker (third slot of each entry) is set to 1.
marked = 0
for _n_labeled, entries in data_list.values():
    marked += sum(1 for entry in entries if entry[2] == 1)
print(marked)
In [57]:
# Display the batch count, computed earlier as int(total_size/batch_size).
n_loops
Out[57]:
In [ ]: