In [20]:
import csv
import re
from sys import argv

#script, input_file = argv

input_filename = 'uniprot-sausa300.tab'

# Convert a tab-separated table into two CSV files:
#   <input>.csv     - entry, entry name, protein names, gene, gene name alias
#   <input>gene.csv - gene, gene name alias
# Only rows whose gene-names column (index 4) contains a name starting with
# 'SAUSA300' are written; every other row is skipped.
#
# All three files are opened in a single `with` statement so they are flushed
# and closed when the cell finishes, even if a row raises. Outputs use
# newline='' as the csv module requires; the writers emit '\n' themselves.
with open(input_filename) as input_file, \
        open(input_filename + '.csv', 'w', encoding='utf-8', newline='') as output_csv, \
        open(input_filename + 'gene.csv', 'w', encoding='utf-8', newline='') as output_gene:

    # Headers are written verbatim (including the space after each comma) so
    # the first line of each file is unchanged for existing consumers.
    output_gene.write('Gene, Gene name alias\n')
    output_csv.write('Uniprot entry, Entry name, Protein names, Gene, Gene name alias\n')

    # csv.writer quotes any field containing a comma or a double quote, so
    # such text in the protein names or aliases can no longer shift columns.
    gene_writer = csv.writer(output_gene, lineterminator='\n')
    full_writer = csv.writer(output_csv, lineterminator='\n')

    for line in input_file:
        # Drop the line terminator first so it cannot end up inside the last
        # column's value.
        entry = line.rstrip('\n').split('\t')
        if len(entry) < 5:
            # Blank or truncated line: there is no gene-names column to read.
            continue

        gene = None
        alias = []
        for name in entry[4].split(' '):  # gene names, space separated
            if name.startswith('SAUSA300'):
                # If several names carry the prefix, the last one wins and
                # the earlier ones are dropped (same as the original logic).
                gene = name
            else:
                alias.append(name)

        if gene:
            aliases = ' '.join(alias)
            gene_writer.writerow([gene, aliases])
            # entry, entry name, protein names, gene, gene name alias
            full_writer.writerow([entry[0], entry[1], entry[3], gene, aliases])

In [ ]: