In [1]:
# Switch the working directory to the project checkout; the relative CSV path read
# further down is resolved against it.
# NOTE(review): this path is machine-specific - adjust it when running elsewhere.
%cd ~/NetBeansProjects/ExpLosion/
# Star import: `pd`, `plt` and `sns` used below presumably come from this module -
# confirm in notebooks/common_imports.py.
from notebooks.common_imports import *
In [2]:
def _coerce_numeric(column):
    """Parse an object column as numbers, leaving it untouched if nothing in it is numeric.

    `DataFrame.convert_objects` was deprecated and then removed in pandas 1.0. This helper
    approximates what `convert_objects(convert_numeric=True)` did: unparseable entries of a
    partly numeric column become NaN, while columns with no numeric content at all (e.g. the
    string-valued `param_*` columns filtered on later) are returned unchanged.

    :param column: one column of the loaded table (a pandas Series)
    :return: the numeric version of `column`, or `column` itself
    """
    if column.dtype != object:
        # read_csv has already given this column a non-object dtype
        return column
    converted = pd.to_numeric(column, errors='coerce')
    return column if converted.isnull().all() else converted


# Load the coverage statistics (first CSV column is the index) and coerce mixed columns to numbers.
df = pd.read_csv('../thesisgenerator/coverage_stats.csv', index_col=0).apply(_coerce_numeric)
In [3]:
# Preview the first rows of the loaded table.
df.head()
Out[3]:
In [4]:
# Keep only the rows for one configuration: "Add" composer, word2vec algorithm,
# "wiki" as the unlabelled corpus.
selected_configuration = 'param_composer=="Add" & param_algorithm=="word2vec" & param_unlabelled=="wiki"'
ddf = df.query(selected_configuration)
In [5]:
# Order the rows by ascending 'param_unlabelled_percentage' so the lines plotted from
# `ddf` run left to right.
# `DataFrame.sort` was deprecated in pandas 0.17 and removed in 0.20; `sort_values` is its replacement.
ddf = ddf.sort_values('param_unlabelled_percentage')
In [6]:
# Plot selected `*_count_total` columns against the percentage of unlabelled data used.
# Other columns of `ddf` that can be added to `series_to_plot` (suggested label in parentheses):
#   AN_count_total (AN), NN_count_total (NN), N_count_weighted (NW), V_count_weighted (VW),
#   1-GRAM_count_weighted (1grW), 1-GRAM_count_total (1gr)
series_to_plot = [
    ('N_count_total', 'N'),
    ('J_count_total', 'J'),
    ('V_count_total', 'V'),
]

fig, ax = plt.subplots()
for column, label in series_to_plot:
    ax.plot(ddf.param_unlabelled_percentage, ddf[column], label=label)

# NOTE(review): the black vertical line marks x=15; the reason for that value is not stated here.
ax.axvline(15, c='k')
ax.set_xlabel('param_unlabelled_percentage')
ax.set_ylabel('count_total')
ax.legend(loc='upper left');
Out[6]:
In [7]:
# (column in `ddf`, display name) pairs. The first pair is the identifier column;
# the remaining ones are stacked into long format below.
column_names = [
    ('param_unlabelled_percentage', 'Percent'),
    ('N_count_total', 'Nouns'),
    ('J_count_total', 'Adjs'),
    ('V_count_total', 'Verbs'),
    ('AN_count_total', 'ANs'),
    ('NN_count_total', 'NNs'),
]
source_columns = [source for source, display in column_names]
display_names = [display for source, display in column_names]

# Select the columns of interest and give them their display names.
wide = ddf[source_columns].rename(columns=dict(column_names))

# Long format: one row per (Percent, PoS) pair, with the count stored under 'Types'.
cov = pd.melt(wide, id_vars=display_names[:1], value_vars=display_names[1:],
              var_name='PoS', value_name='Types')
In [8]:
# Give every row the same constant 'unit' id (0), i.e. all rows belong to one sampling unit.
row_count = cov.shape[0]
cov['unit'] = [0] * row_count
# Preview the long-format table.
cov.head()
Out[8]:
In [9]:
# Corpus sizes in tokens, keyed by corpus name. Defined once instead of being rebuilt per row.
CORPUS_SIZES = {'cwiki': 525000000, 'wiki': 1500000000}


def compute_token_count(row, corpus='wiki'):
    """Convert a percentage of a corpus into a token count.

    :param row: any object exposing a numeric `Percent` attribute, e.g. a row handed over by
        `DataFrame.apply(..., axis=1)`
    :param corpus: key into `CORPUS_SIZES`; defaults to 'wiki', the only corpus the original
        implementation used
    :return: size of `corpus` multiplied by `row.Percent / 100`
    :raises KeyError: if `corpus` is not a key of `CORPUS_SIZES`
    """
    return CORPUS_SIZES[corpus] * (row.Percent / 100)
# Add the number of tokens that corresponds to each row's 'Percent' value.
cov['Tokens'] = cov.apply(compute_token_count, axis=1)

# `sns.tsplot` was deprecated in seaborn 0.9 and removed in 0.10. It was given a single
# sampling unit here (the 'unit' column is constant), so all it drew was one line per PoS.
# Draw those lines directly with matplotlib, which works with any seaborn version.
pos_groups = list(cov.groupby('PoS', sort=False))  # sort=False keeps the order of first appearance
palette = sns.color_palette("cubehelix", len(pos_groups))

fig, ax = plt.subplots()
for colour, (pos, group) in zip(palette, pos_groups):
    # Index by token count and sort so each line is drawn left to right.
    types_by_tokens = group.set_index('Tokens')['Types'].sort_index()
    ax.plot(types_by_tokens.index, types_by_tokens.values,
            color=colour, marker='s', linewidth=4, label=pos)
ax.set_xlabel('Tokens')
ax.set_ylabel('Types')

# Place the legend outside the axes, to the right; bbox_inches='tight' keeps it in the saved file.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
fig.savefig('plot-type-coverage.pdf', format='pdf', dpi=300, bbox_inches='tight', pad_inches=0.1)