# In [20]:
import csv
import io
import re
from sys import argv
# The input path is hard-coded; the script does not read it from the command
# line.
input_filename = 'uniprot-sausa300.tab'

# Names in the gene-names column that start with this prefix are treated as
# the gene identifier; every other name is treated as an alias.
GENE_PREFIX = 'SAUSA300'


def split_gene_names(gene_field):
    """Split a space-separated gene-names field into (gene, aliases).

    Args:
        gene_field: Content of the gene-names column, names separated by
            single spaces.

    Returns:
        A ``(gene, aliases)`` tuple. ``gene`` is the name starting with
        ``GENE_PREFIX`` (the last one if several match), or ``None`` if no
        name matches. ``aliases`` is the non-matching names re-joined with
        single spaces, in their original order.
    """
    gene = None
    aliases = []
    for name in gene_field.split(' '):
        if name.startswith(GENE_PREFIX):
            gene = name
        else:
            aliases.append(name)
    return gene, ' '.join(aliases)


def parse_line(line):
    """Turn one tab-separated input line into an output row.

    Args:
        line: A raw line from the input file, with or without its line
            ending.

    Returns:
        ``[entry, entry name, protein names, gene, gene name alias]`` built
        from columns 0, 1, 3 and 4, or ``None`` when the line has fewer than
        five columns or no gene name starting with ``GENE_PREFIX`` (this
        also skips a header row whose gene column has no such name).
    """
    # Strip the line ending first: when the gene-names column is the last
    # one, the newline would otherwise end up inside the gene or alias text.
    fields = line.rstrip('\r\n').split('\t')
    if len(fields) < 5:
        # Blank or truncated line: nothing to extract.
        return None
    gene, aliases = split_gene_names(fields[4])
    if not gene:
        return None
    return [fields[0], fields[1], fields[3], gene, aliases]


def format_csv_row(fields):
    """Return *fields* as one CSV line terminated by a newline.

    Fields containing commas or double quotes are quoted, so free-text
    values such as protein names cannot shift the columns. Fields without
    such characters are written unchanged.
    """
    buffer = io.StringIO()
    # '\n' keeps the same line ending the script has always written; the
    # text-mode output file applies any platform newline translation.
    csv.writer(buffer, lineterminator='\n').writerow(fields)
    return buffer.getvalue()


def convert(input_filename):
    """Write the two CSV files derived from a tab-separated input file.

    Reads ``input_filename`` and creates ``<input_filename>.csv`` (entry,
    entry name, protein names, gene, aliases) and
    ``<input_filename>gene.csv`` (gene, aliases). Only lines for which
    ``parse_line`` returns a row are written.

    Args:
        input_filename: Path of the tab-separated input file.

    Raises:
        OSError: If the input cannot be opened or an output cannot be
            created.
    """
    # Context managers guarantee all three files are flushed and closed,
    # even if processing stops half-way through.
    with open(input_filename, encoding='utf-8') as input_file, \
            open(input_filename + '.csv', 'w',
                 encoding='utf-8') as output_csv, \
            open(input_filename + 'gene.csv', 'w',
                 encoding='utf-8') as output_gene:
        output_gene.write('Gene, Gene name alias\n')
        output_csv.write(
            'Uniprot entry, Entry name, Protein names, Gene, '
            'Gene name alias\n'
        )
        for line in input_file:
            row = parse_line(line)
            if row is None:
                continue
            # The last two fields are the gene and its aliases.
            output_gene.write(format_csv_row(row[3:]))
            output_csv.write(format_csv_row(row))


if __name__ == '__main__':
    convert(input_filename)
# In [ ]: