In [1]:
import pandas as pd

pd.options.display.max_rows = 999

columns_1 = ['leave','value_1']
columns_2 = ['leave','value_2']

In [2]:
df1 = pd.read_csv('leave_values_20200508_filled_nulls.csv', header=None, names=columns_1).set_index('leave')

In [3]:
# df2 = pd.read_csv('leave_values_20191208.csv', header=None)
# df2 = pd.read_csv('leave_values_20191208.csv', header=None)
# df2 = pd.read_csv('leave_values_20200423_filled_nulls.csv', header=None, names=columns_2).set_index('leave')
df2 = pd.read_csv('quackle_leaves.csv', header=None, names=columns_2).set_index('leave')

In [4]:
df = pd.concat([df1,df2],axis=1)
df = df.reset_index()

In [5]:
df['delta'] = df['value_1']-df['value_2']
df['length'] = df['leave'].apply(lambda x: len(x))
df['vowels'] = df['leave'].apply(lambda x: sum([y in 'AEIOU' for y in x]))
df['consonants'] = df['length']-df['vowels']
df['has_a_blank'] = df['leave'].apply(lambda x: sum([y=='?' for y in x])>0)
df['has_two_blanks'] = df['leave'].apply(lambda x: sum([y=='?' for y in x])==2)

In [6]:
df


Out[6]:
leave value_1 value_2 delta length vowels consonants has_a_blank has_two_blanks
0 ? 24.078203 25.570300 -1.492097 1 0 1 True False
1 A 1.371000 -0.636719 2.007719 1 1 0 False False
2 B -2.625032 -2.007810 -0.617222 1 0 1 False False
3 C 0.810967 0.847656 -0.036689 1 0 1 False False
4 D 0.532724 0.449219 0.083505 1 0 1 False False
... ... ... ... ... ... ... ... ... ...
914619 ??WXYY 31.301190 25.257800 6.043390 6 0 6 True True
914620 ?WXYYZ 22.522311 10.207000 12.315311 6 0 6 True False
914621 ??WXYZ 12.106311 29.906200 -17.799889 6 0 6 True True
914622 ??WYYZ 32.654428 25.316400 7.338028 6 0 6 True True
914623 ??XYYZ 30.771954 25.968800 4.803154 6 0 6 True True

914624 rows × 9 columns

Remaining ideas

  1. RMS error by leave length
  2. Heatmap of error by vowel-consonant ratio

In [17]:
x_tile_df = {i: df.loc[df['length']==i][['leave','value_1','value_2','delta']].sort_values('delta') for i in range(1,7)}

In [18]:
one_tile_df = x_tile_df[1].rename(columns={'value_1':'macondo_0508','value_2':'quackle'})

In [19]:
two_tile_df = x_tile_df[2].rename(columns={'value_1':'macondo_0508','value_2':'quackle'})

In [14]:
one_tile_df.to_csv('comp_1tileleaves_0508_quackle.csv', index=False)

In [15]:
two_tile_df.to_csv('comp_2tileleaves_0508_quackle.csv', index=False)

In [25]:
x_tile_df[6]


Out[25]:
leave value_1 value_2 delta
894238 ?MPPYZ -13.209689 24.3984 -37.608089
704275 ??FGGG -9.321189 27.1680 -36.489189
876321 ?LLLLV -34.996129 0.6875 -35.683629
560349 ??DDVV -16.429689 18.8164 -35.246089
532265 ?CLLLL -29.270104 5.4375 -34.707604
... ... ... ... ...
886010 ??LPQX 121.631644 18.6797 102.951944
887144 ??LQXY 134.498811 21.5391 112.959711
879005 ??LLQX 132.487597 17.5586 114.928997
618171 ??DLQX 228.663267 16.8477 211.815567
559834 ??DDQX 230.742978 14.5391 216.203878

737311 rows × 4 columns


In [ ]: