In [1]:
import os

import pandas as pd

# Output file names.
# TODO: the JSON export step was accidentally erased; json_filename is kept so that
# step can be re-integrated later.
json_filename = 'x.json'
gexf_filename = 'gh43.gexf'

# Work from the Dropbox folder; relative file names below resolve against it.
# (Alternate location used previously: C:/Users/David/Documents/Dropbox)
os.chdir("C:/_Dropbox/Dropbox")

# dtype=str: read every column as text so IDs and branch lengths keep their
# exact textual form.
df = pd.read_csv("gh43treetable.txt", dtype=str)

In [2]:
# Name the five columns with a FLAT list. A nested list ([[...]]) can be turned by
# pandas into a one-level MultiIndex instead of plain column labels, which breaks
# the df['ancid'] / df.ancid style lookups used in the following cells.
df.columns = ['ancid', 'desc1', 'desc2', 'branchlength1', 'branchlength2']

In [3]:
# Clean the three ID columns (trim surrounding whitespace, spaces -> underscores),
# collect every ID occurrence (duplicates included), then remove duplicates.
id_fields = ['ancid', 'desc1', 'desc2']
for field in id_fields:
    # Assign the whole column back in one step. The earlier df[field][idx] = ...
    # form is chained indexing, which pandas does not guarantee to write through
    # to df (it may modify a temporary copy instead).
    df[field] = [raw_id.strip().replace(" ", "_") for raw_id in df[field]]

# One entry per table cell, row by row (ancid, desc1, desc2 for each row).
id_list = [node_id
           for row_ids in zip(df['ancid'], df['desc1'], df['desc2'])
           for node_id in row_ids]
print(len(id_list))
# set() drops duplicates; the resulting order is arbitrary, and positions in this
# list are what the later cells use as numeric node ids.
id_list = list(set(id_list))
print(len(id_list))


639
427

In [4]:
# Spot-check the third row of the table and the first ten entries of id_list.
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print(df.iloc[2])
print(id_list[:10])


ancid                      217
desc1                      216
desc2             ASPNI_A2QT85
branchlength1     0.0000000000
branchlength2     0.0000000000
Name: 2, dtype: object
['Paeby1p7_018872', 'Paeby1p7_018871', 'bri_CHGT_02150', 'ABN43C_PENCH', '344', '345', 'ABN43A_PENCH', 'CORTH_1_02834', '340', '341']

In [5]:
# Two lookup tables over id_list:
#   dict_pti: position in id_list -> ID string
#   dict_itp: ID string -> position in id_list
dict_pti = dict(enumerate(id_list))
dict_itp = {node_id: position for position, node_id in enumerate(id_list)}

In [6]:
# Build the link list: each table row contributes two [ancestor, descendant]
# pairs, first for desc1 and then for desc2.
link_list = []
# Iterate the three columns in parallel instead of iterrows() plus per-row label
# lookups (df.ancid[idx]); the row order and the resulting pairs are the same.
for ancestor, first_desc, second_desc in zip(df['ancid'], df['desc1'], df['desc2']):
    link_list.append([ancestor, first_desc])
    link_list.append([ancestor, second_desc])
print(link_list[:10])


[['215', 'ABN43A_ASPNG'], ['215', 'ANIG203143G'], ['216', '215'], ['216', 'ASPNI_P42256'], ['217', '216'], ['217', 'ASPNI_A2QT85'], ['218', 'PENCH_Q5H7M8'], ['218', 'ABN43A_PENCH'], ['219', 'Paeby1p7_018872'], ['219', '218']]

In [8]:
# Write the graph as GEXF 1.2: one <node> per entry of id_list (id = position,
# label = ID string) and one directed <edge> per [source, target] pair in link_list.
from xml.sax.saxutils import escape  # stdlib; imported here so this cell is self-contained

# Extra entity for text placed inside double-quoted attribute values.
attr_entities = {'"': '&quot;'}

# A single "w" handle replaces the earlier write-then-reopen-in-append sequence;
# the bytes written are the same.
with open(gexf_filename, "w") as f:
    # NOTE(review): lastmodifieddate and creator are fixed literal values, not
    # derived from this run.
    f.write('<gexf xmlns="http://www.gexf.net/1.2draft" version="1.2">\n    <meta lastmodifieddate="2009-03-20">\n        <creator>Gexf.net</creator>\n')
    f.write('        <description>')
    # Escape &, < and > so an unusual file name cannot break the XML.
    f.write(escape(gexf_filename))
    f.write('</description>\n    </meta>\n    <graph mode="static" defaultedgetype="directed">\n')

    f.write('        <nodes>\n')
    for pos in range(len(id_list)):
        # Labels come from the input table, so escape them for attribute context;
        # plain alphanumeric/underscore IDs are written unchanged.
        node_label = escape(str(dict_pti[pos]), attr_entities)
        f.write('            <node id="%s" label="%s" />\n' % (pos, node_label))
    f.write('        </nodes>\n')

    f.write('        <edges>\n')
    for i, (source_id, target_id) in enumerate(link_list):
        # Edges refer to nodes by their numeric position, looked up from the ID string.
        f.write('            <edge id="%s" source="%s" target="%s" />\n'
                % (i, dict_itp[source_id], dict_itp[target_id]))
    f.write('        </edges>\n')

    f.write('    </graph>\n')
    f.write('</gexf>\n')

In [ ]: