In [1]:
import pandas as pd
import numpy as np
import csv
import re

In [2]:
# Load the raw species table. The file has no header row, so pandas labels
# the columns with integers starting at 0.
MW = pd.read_csv('cti_MW.txt', header=None)
# Keep columns 0, 3, 4 and 5 and give them descriptive labels in one step.
original_species = MW.iloc[:, [0, 3, 4, 5]].rename(
    columns={0: 'name', 3: 'C_num', 4: 'H_num', 5: 'O_num'}
)

In [3]:
# Show the raw, still-unparsed table (name field plus the C/H/O fields).
original_species


Out[3]:
name C_num H_num O_num
0 'CH3OH' : [32 [1 4.0 1 ]
1 'ETOH' : [46 [2 6.0 1 ]
2 'ALD3' : [58 [3 6.0 1 ]
3 'PLIGH' : [436 [22 28.0 9 ]
4 'LIGH' : [437 NaN NaN NaN
5 'PLIGO' : [422 [20 22.0 10]
6 'LIGO' : [423 NaN NaN NaN
7 'PLIGC' : [300 [17 16.0 5 ]
8 'LIGC' : [301 NaN NaN NaN
9 'PLIGM2' : [378 NaN NaN NaN
10 'LIGM2' : [379 NaN NaN NaN
11 'PLIG' : [258 NaN NaN NaN
12 'LIG' : [259 NaN NaN NaN
13 'PADIOM2' : [227 NaN NaN NaN
14 'ADIOM2' : [228 NaN NaN NaN
15 'PADIO' : [167 NaN NaN NaN
16 'ADIO' : [168 NaN NaN NaN
17 'PFET3' : [165 NaN NaN NaN
18 'PFET3M2' : [225 NaN NaN NaN
19 'SYNAPYL' : [210 NaN NaN NaN
20 'COUMARYL' : [150 NaN NaN NaN
21 'PKETM2' : [225 NaN NaN NaN
22 'KETM2' : [226 NaN NaN NaN
23 'KET' : [166 NaN NaN NaN
24 'KETDM2' : [208 NaN NaN NaN
25 'KETD' : [148 NaN NaN NaN
26 'MGUAI' : [154 NaN NaN NaN
27 'PHENOL' : [94 NaN NaN NaN
28 'C10H2M4' : [244 NaN NaN NaN
29 'C10H2M2' : [182 NaN NaN NaN
... ... ... ... ...
63 'C2H6' : [30 [2 6.0 0 ]]
64 'PCH3' : [15 NaN NaN NaN
65 'CH3CHO' : [44 [2 4.0 1 ]]
66 'CO2' : [44 [1 0.0 2 ]]
67 'CH2CO' : [42 [2 2.0 1 ]]
68 'OH' : [17 [0 1.0 1 ]]
69 'C3H6' : [42 [3 6.0 0 ]]
70 'C3H6O2' : [74 [3 6.0 2 ]
71 'C3H4O2' : [72 [3 4.0 2 ]
72 'RC3H7O2' : [75 [3 7.0 2 ]
73 'RC3H5O2' : [73 [3 5.0 2 ]
74 'RC3H3O' : [55 [3 3.0 1 ]
75 'PRLIGH' : [435 NaN NaN NaN
76 'PRLIGH2' : [435 NaN NaN NaN
77 'RMGUAI' : [153 NaN NaN NaN
78 'RLIGH' : [436 NaN NaN NaN
79 'PCOHP2' : [29 NaN NaN NaN
80 'VADIOM2' : [228 [11 16.0 5 ]
81 'VKETM2' : [226 [11 14.0 5 ]
82 'VKETDM2' : [208 [11 12.0 4 ]
83 'VSYNAPYL' : [210 [11 14.0 4 ]
84 'VMGUAI' : [154 [8 10.0 3 ]
85 'VCOUMARYL': [150 [9 10.0 2 ]
86 'VADIO' : [168 [9 12.0 3 ]
87 'VKET' : [166 [9 10.0 3 ]
88 'VKETD' : [148 [9 8.0 2 ]
89 'VPHENOL' : [94 [6 6.0 1 ]
90 'C3H4O' : [56 [3 4.0 1 ]
91 'C3H8O2' : [76 [3 8.0 2 ]
92 'CH4' : [16 [1 4.0 0 ]]

93 rows × 4 columns


In [4]:
# Clean the species names and start a new DataFrame called species.
# A raw name field looks like "'CH3OH' : [32": take the text before the
# colon and keep its first whitespace-delimited token.
species = pd.DataFrame()
species_name = [raw_name.split(':')[0].split()[0] for raw_name in original_species.name]

species['name'] = species_name

In [5]:
# Clean the C_num column.
# Non-missing raw values are strings such as "[1" or "[22"; the leading
# bracket is dropped by extracting the first number in the field.
C_num_list = []
# Iterate over the column itself so the cell works for any number of rows
# instead of assuming exactly 93.
for raw_c in original_species.C_num:
    # pd.isna recognises every NaN/None value, whereas "is np.nan" only
    # matches the np.nan singleton object.
    if pd.isna(raw_c):
        # Species without atom data keep the textual placeholder 'nan'.
        C_num_list.append(str(np.nan))
    else:
        C_num = str(int(re.findall(r"\d+\.?\d*", raw_c)[0]))
        C_num_list.append(C_num)

species['C_num'] = C_num_list

In [6]:
# Clean the H_num column.
# The displayed table shows this column as numeric (4.0, 6.0, ...), so no
# regex is needed; values are only converted to integer strings.
H_num_list = []
# Iterate over the column itself so the cell works for any number of rows
# instead of assuming exactly 93.
for raw_h in original_species.H_num:
    # Test H_num itself rather than C_num: a row missing only its hydrogen
    # count would otherwise reach int(nan), which raises ValueError.
    if pd.isna(raw_h):
        # Species without atom data keep the textual placeholder 'nan'.
        H_num_list.append(str(np.nan))
    else:
        H_num = str(int(raw_h))
        H_num_list.append(H_num)

species['H_num'] = H_num_list

In [7]:
# Clean the O_num column.
# Non-missing raw values are strings such as "1 ]" or "0 ]]"; the trailing
# brackets are dropped by extracting the first number in the field.
O_num_list = []
# Iterate over the column itself so the cell works for any number of rows
# instead of assuming exactly 93.
for raw_o in original_species.O_num:
    # pd.isna recognises every NaN/None value, whereas "is np.nan" only
    # matches the np.nan singleton object.
    if pd.isna(raw_o):
        # Species without atom data keep the textual placeholder 'nan'.
        O_num_list.append(str(np.nan))
    else:
        O_num = str(int(re.findall(r"\d+\.?\d*", raw_o)[0]))
        O_num_list.append(O_num)

species['O_num'] = O_num_list

In [8]:
# Preview the first rows of the cleaned table to check the parsed values.
species.head()


Out[8]:
name C_num H_num O_num
0 'CH3OH' 1 4 1
1 'ETOH' 2 6 1
2 'ALD3' 3 6 1
3 'PLIGH' 22 28 9
4 'LIGH' nan nan nan

In [9]:
# Build one species(...) declaration string per row:
#   species(name='X',atoms='C:c H:h O:o')   when atom counts are available
#   species(name='X')                       when they are missing
df = pd.DataFrame()
species_list = []
# Walk the columns in parallel so the cell works for any number of rows
# instead of assuming exactly 93.
for name, raw_c, c_num, h_num, o_num in zip(species.name,
                                            original_species.C_num,
                                            species.C_num,
                                            species.H_num,
                                            species.O_num):
    # The species name already carries its surrounding single quotes.
    content_name = "species"+"(name="+name
    # A missing raw C_num marks a species with no atom data. pd.isna
    # recognises every NaN/None value, whereas "is np.nan" only matches the
    # np.nan singleton object.
    if pd.isna(raw_c):
        content_atoms = ")"
    else:
        content_atoms = ",atoms='C:"+c_num+" "+"H:"+h_num+" "+"O:"+o_num+"')"
    content = content_name+content_atoms
    species_list.append(content)

df['Species_data'] = species_list

In [25]:
# Display the generated species(...) declaration strings.
df


Out[25]:
Species_data
0 species(name='CH3OH',atoms='C:1 H:4 O:1')
1 species(name='ETOH',atoms='C:2 H:6 O:1')
2 species(name='ALD3',atoms='C:3 H:6 O:1')
3 species(name='PLIGH',atoms='C:22 H:28 O:9')
4 species(name='LIGH')
5 species(name='PLIGO',atoms='C:20 H:22 O:10')
6 species(name='LIGO')
7 species(name='PLIGC',atoms='C:17 H:16 O:5')
8 species(name='LIGC')
9 species(name='PLIGM2')
10 species(name='LIGM2')
11 species(name='PLIG')
12 species(name='LIG')
13 species(name='PADIOM2')
14 species(name='ADIOM2')
15 species(name='PADIO')
16 species(name='ADIO')
17 species(name='PFET3')
18 species(name='PFET3M2')
19 species(name='SYNAPYL')
20 species(name='COUMARYL')
21 species(name='PKETM2')
22 species(name='KETM2')
23 species(name='KET')
24 species(name='KETDM2')
25 species(name='KETD')
26 species(name='MGUAI')
27 species(name='PHENOL')
28 species(name='C10H2M4')
29 species(name='C10H2M2')
... ...
63 species(name='C2H6',atoms='C:2 H:6 O:0')
64 species(name='PCH3')
65 species(name='CH3CHO',atoms='C:2 H:4 O:1')
66 species(name='CO2',atoms='C:1 H:0 O:2')
67 species(name='CH2CO',atoms='C:2 H:2 O:1')
68 species(name='OH',atoms='C:0 H:1 O:1')
69 species(name='C3H6',atoms='C:3 H:6 O:0')
70 species(name='C3H6O2',atoms='C:3 H:6 O:2')
71 species(name='C3H4O2',atoms='C:3 H:4 O:2')
72 species(name='RC3H7O2',atoms='C:3 H:7 O:2')
73 species(name='RC3H5O2',atoms='C:3 H:5 O:2')
74 species(name='RC3H3O',atoms='C:3 H:3 O:1')
75 species(name='PRLIGH')
76 species(name='PRLIGH2')
77 species(name='RMGUAI')
78 species(name='RLIGH')
79 species(name='PCOHP2')
80 species(name='VADIOM2',atoms='C:11 H:16 O:5')
81 species(name='VKETM2',atoms='C:11 H:14 O:5')
82 species(name='VKETDM2',atoms='C:11 H:12 O:4')
83 species(name='VSYNAPYL',atoms='C:11 H:14 O:4')
84 species(name='VMGUAI',atoms='C:8 H:10 O:3')
85 species(name='VCOUMARYL',atoms='C:9 H:10 O:2')
86 species(name='VADIO',atoms='C:9 H:12 O:3')
87 species(name='VKET',atoms='C:9 H:10 O:3')
88 species(name='VKETD',atoms='C:9 H:8 O:2')
89 species(name='VPHENOL',atoms='C:6 H:6 O:1')
90 species(name='C3H4O',atoms='C:3 H:4 O:1')
91 species(name='C3H8O2',atoms='C:3 H:8 O:2')
92 species(name='CH4',atoms='C:1 H:4 O:0')

93 rows × 1 columns


In [19]:
# Write the declarations as plain text, one per line, under the same
# 'Species_data' header line that to_csv(index=False) emitted.
# The csv writer is deliberately avoided: with quoting=csv.QUOTE_NONE and an
# escapechar it prefixes every comma inside a field with a backslash, so
# "name='CH3OH',atoms=..." was written as "name='CH3OH'\,atoms=...".
with open('cti_species_data.txt', 'w', encoding='utf-8') as species_file:
    species_file.write('Species_data\n')
    species_file.writelines(entry + '\n' for entry in df['Species_data'])