In [32]:
# Approximation data (based on Levenshtein distance)
# Directory holding the raw verification files.  The trailing slash matters:
# file names are appended to this string by plain concatenation.
path = 'data/verification/'

# Disabled pandas-based loading of the raw files.
# NOTE(review): presumably superseded by read_data(); confirm before deleting.
#DB_medium = pandas.read_csv(path + 'DBS_medium_raw2.csv')
#M13_medium = pandas.read_csv(path + 'M13_medium_raw2.csv')

def read_data(filename):
    """Parse a raw verification file into ``(sequence, energies)`` records.

    The first line of the file is taken as the first sequence.  Each later
    line is classified as follows:

    * a line starting with ``'0.0'`` is ignored;
    * a line containing ``'&'`` must split into exactly three
      comma-separated fields; the first field is converted to ``float`` and
      appended to the energies of the current sequence;
    * any other line closes the current record and becomes the next sequence.

    Returns:
        A list of ``(sequence, energies)`` tuples in file order, where
        ``energies`` is a list of floats.

    Raises:
        ValueError: if an ``'&'`` line does not have exactly three fields or
            its first field is not a valid float.
    """
    records = []
    with open(filename, 'r') as handle:
        current_sequence = handle.readline().strip()
        current_energies = []
        for raw_line in handle.readlines():
            if raw_line.startswith('0.0'):
                # Skipped on purpose.  NOTE(review): this prefix test also
                # matches values such as 0.05 -- confirm that is intended.
                continue
            if '&' not in raw_line:
                # Header line: flush the finished record and start a new one.
                records.append((current_sequence, current_energies))
                current_sequence = raw_line.strip()
                current_energies = []
                continue
            energy_field, _, _ = raw_line.strip().split(',')
            current_energies.append(float(energy_field))
        # The last record has no following header line to flush it.
        records.append((current_sequence, current_energies))
    return records
                
# Load the DBS records and report how many (sequence, energies) entries were
# parsed.  print(...) with a single argument behaves the same on Python 2 and 3.
data = read_data(path + 'DBS_medium_raw2.csv')
print(len(data))


629

In [36]:
import bigfloat

def get_boltzmann_distribution(energies, temperature=293.15):
    """Return the normalised Boltzmann probability of each energy.

    For energies ``E_i`` the result is
    ``exp(-E_i * 4184 / (R * T)) / sum_j exp(-E_j * 4184 / (R * T))``,
    i.e. the energies are treated as kcal/mol and converted to joules.

    Args:
        energies: iterable of numeric energies.
        temperature: absolute temperature in kelvin.  Defaults to 293.15,
            the value that used to be hard-coded.

    Returns:
        A list of floats, one per input energy and in the same order, that
        sums to 1 (up to rounding).  An empty input yields an empty list.
    """
    import math  # local import keeps this cell self-contained

    R = 8.3144621  # gas constant, J/(mol*K)
    factor = 4184.0  # joules_per_kcal

    exponents = [(-energy * factor) / (R * temperature) for energy in energies]
    if not exponents:
        return []

    # Subtracting the largest exponent leaves the ratios unchanged but keeps
    # every exp() argument <= 0, so nothing can overflow a double and no
    # arbitrary-precision arithmetic is needed.  The largest term becomes
    # exactly 1.0, which also guarantees a non-zero normaliser.
    shift = max(exponents)
    weights = [math.exp(exponent - shift) for exponent in exponents]
    total = math.fsum(weights)
    return [weight / total for weight in weights]

# Write one "<sequence>,<probability>" row per record, where the probability
# is the Boltzmann weight of the record's first listed energy.  A record with
# no energies raises IndexError here.
with open('DBS_verification.csv', 'w') as out:
    for record in data:
        sequence, energies = record[0], record[1]
        probabilities = get_boltzmann_distribution(energies)
        out.write(','.join([str(sequence), str(probabilities[0])]) + '\n')