In [48]:
import numpy as np
import pandas as pd

In [49]:
inference_directory = r'C:\mallet\clinton\data\inference'

In [50]:
# Held-out log-likelihood estimates, one row per document. The column
# abbreviations presumably map to the four estimators compared in Wallach
# et al. (2009), "Evaluation Methods for Topic Models": Chib-style (chibs),
# harmonic mean (hm), importance sampling (is), and left-to-right (lr);
# 'lengths' is the document length in tokens.
elda_df = pd.read_csv(r'{}\estimates.csv'.format(inference_directory))
elda_df.columns = [ 'doc#', 'chibs', 'hm', 'is', 'lengths', 'lr' ]

In [51]:
elda_df.loc[:, [ 'doc#', 'chibs', 'hm', 'is', 'lr', 'lengths' ]]


Out[51]:
    doc#          chibs             hm             is              lr  lengths
 0    -1   -3308.066350   -4057.316653   -4833.843998    -6979.759686    526.0
 1   112   -2194.937182   -2856.354385   -3203.586426    -3753.796858    351.0
 2   116   -7483.372249  -10951.847889  -12795.477996   -79225.555694   1403.0
 3   120     -48.490558     -55.460270     -16.491044      -24.962511      5.0
 4   143  -13447.325183  -18445.449022  -21766.299243   -44525.262588   2321.0
 5   145     -97.303550    -188.407712    -145.152475     -116.830846     20.0
 6   163    -172.382007    -270.985854    -204.684733     -227.548619     28.0
 7   165     -30.017762     -85.502313     -41.103530      -59.482084      8.0
 8   172     -56.021985     -76.144785     -34.592093      -52.393771      8.0
 9   191   -4106.326833   -6150.510195   -6433.514249    -8551.285793    706.0
10   192   -2803.084441   -4094.311340   -4445.139750    -5262.427147    497.0
11   200     -14.914917     -86.151111     -35.155587      -39.206374      8.0
12   213    -892.787334   -1420.265081   -1329.312324    -1599.878574    158.0
13   225    -244.775343    -347.489823    -324.899533    -3370.872687     53.0
14   226    -125.339999    -177.194220    -118.787917     -142.395256     18.0
15    24    -122.562321    -202.027510    -121.503106      -95.786155     18.0
16    25     -59.670483    -141.808724     -98.170743      -75.758010     15.0
17    30   -1512.635409   -2211.316185   -2311.815227    -3358.740777    250.0
18    38   -6768.717885   -9160.411167  -11088.509753   -20538.781602   1244.0
19    39     -57.926385     -75.653664     -34.567976      -47.054955      7.0
20    66     -19.478866     -20.253009       0.274041      -15.711790      2.0
21    72    -265.247270    -412.648026    -326.038144     -353.309789     45.0
22    81  -20275.023190  -28901.223720  -33727.429473   -93511.181363   3643.0
23    87  -25307.518361  -34634.673918  -41478.053965  -167038.102826   4485.0
24    94    -730.092663   -1163.826986   -1079.985824    -1265.651605    125.0

In [52]:
## perplexity, per document: exp(-log_likelihood / token_count); probably not usable ...

In [53]:
def calc_perplexity(estimates, tokens):
    """Per-document perplexity: exp(-log_likelihood / token_count)."""
    assert len(estimates) == len(tokens)

    return [np.exp(-estimate / length)
            for estimate, length in zip(estimates, tokens)]
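
In [ ]:
# Spot check on the formula (a sketch, reusing the doc# 120 row above: a
# 5-token document with a Chib-style estimate of -48.490558); the result
# should match its chibs_perplexity in the next output (~16286.82).
np.exp(48.490558 / 5)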

In [54]:
# Per-document token counts are the normalizer; note the bracket access for
# the 'is' column, since `is` is a Python keyword and elda_df.is won't parse.
tokens = elda_df.lengths.values.tolist()
elda_df['chibs_perplexity'] = calc_perplexity(elda_df.chibs.values.tolist(), tokens)
elda_df['hm_perplexity'] = calc_perplexity(elda_df.hm.values.tolist(), tokens)
elda_df['is_perplexity'] = calc_perplexity(elda_df['is'].values.tolist(), tokens)
elda_df['lr_perplexity'] = calc_perplexity(elda_df.lr.values.tolist(), tokens)

elda_df.loc[:, ['doc#', 'lengths', 'chibs', 'chibs_perplexity', 'hm', 'hm_perplexity', 'is', 'is_perplexity', 'lr', 'lr_perplexity']]


Out[54]:
    doc#  lengths          chibs  chibs_perplexity             hm  hm_perplexity             is  is_perplexity              lr  lr_perplexity
 0    -1    526.0   -3308.066350        538.668054   -4057.316653    2238.429445   -4833.843998    9796.862762    -6979.759686   5.792591e+05
 1   112    351.0   -2194.937182        519.767858   -2856.354385    3421.251781   -3203.586426    9200.626470    -3753.796858   4.411601e+04
 2   116   1403.0   -7483.372249        207.231441  -10951.847889    2455.341915  -12795.477996    9136.970030   -79225.555694   3.342223e+24
 3   120      5.0     -48.490558      16286.820873     -55.460270   65647.445166     -16.491044      27.064119      -24.962511   1.473045e+02
 4   143   2321.0  -13447.325183        328.246071  -18445.449022    2827.643792  -21766.299243    11825.142942   -44525.262588   2.144645e+08
 5   145     20.0     -97.303550        129.693954    -188.407712   12337.338630    -145.152475    1418.880901     -116.830846   3.443100e+02
 6   163     28.0    -172.382007        471.774088    -270.985854   15963.596684    -204.684733    1495.429945     -227.548619   3.383738e+03
 7   165      8.0     -30.017762         42.615594     -85.502313   43817.525298     -41.103530     170.364667      -59.482084   1.694699e+03
 8   172      8.0     -56.021985       1099.651030     -76.144785   13603.713839     -34.592093      75.490860      -52.393771   6.987000e+02
 9   191    706.0   -4106.326833        335.736612   -6150.510195    6073.988880   -6433.514249    9069.082812    -8551.285793   1.820985e+05
10   192    497.0   -2803.084441        281.465233   -4094.311340    3782.161637   -4445.139750    7661.347491    -5262.427147   3.967135e+04
11   200      8.0     -14.914917          6.451836     -86.151111   47519.189849     -35.155587      80.999937      -39.206374   1.343968e+02
12   213    158.0    -892.787334        284.448650   -1420.265081    8014.594725   -1329.312324    4506.919393    -1599.878574   2.497957e+04
13   225     53.0    -244.775343        101.332044    -347.489823     703.741963    -324.899533     459.518807    -3370.872687   4.185281e+27
14   226     18.0    -125.339999       1057.151499    -177.194220   18847.269391    -118.787917     734.601908     -142.395256   2.726700e+03
15    24     18.0    -122.562321        905.980562    -202.027510   74888.120000    -121.503106     854.206128      -95.786155   2.046811e+02
16    25     15.0     -59.670483         53.411827    -141.808724   12758.014108     -98.170743     695.559233      -75.758010   1.561058e+02
17    30    250.0   -1512.635409        424.342807   -2211.316185    6941.441494   -2311.815227    10376.105652    -3358.740777   6.834871e+05
18    38   1244.0   -6768.717885        230.693859   -9160.411167    1577.623014  -11088.509753    7432.317861   -20538.781602   1.480203e+07
19    39      7.0     -57.926385       3925.299203     -75.653664   49398.052412     -34.567976     139.530375      -47.054955   8.305901e+02
20    66      2.0     -19.478866      16973.911126     -20.253009   24996.836579       0.274041       0.871952      -15.711790   2.580903e+03
21    72     45.0    -265.247270        362.993083    -412.648026    9604.203408    -326.038144    1401.491198     -353.309789   2.569146e+03
22    81   3643.0  -20275.023190        261.249107  -28901.223720    2788.776211  -33727.429473    10489.677809   -93511.181363   1.405360e+11
23    87   4485.0  -25307.518361        282.224252  -34634.673918    2258.227114  -41478.053965    10385.569436  -167038.102826   1.495354e+16
24    94    125.0    -730.092663        344.034281   -1163.826986   11054.754525   -1079.985824    5652.688726    -1265.651605   2.496457e+04
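
In [ ]:
# The per-document numbers swing by orders of magnitude with document length
# (compare the 5-token doc# 120 with the 4485-token doc# 87), which is likely
# why they are "probably not usable". A corpus-level figure pools the
# log-likelihoods and token counts before exponentiating. A minimal sketch
# over the same DataFrame, assuming the per-document estimates can simply
# be summed:
total_tokens = elda_df['lengths'].sum()
for column in [ 'chibs', 'hm', 'is', 'lr' ]:
    corpus_perplexity = np.exp(-elda_df[column].sum() / total_tokens)
    print('{}: {:.2f}'.format(column, corpus_perplexity))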

In [ ]: