This notebook was put together by [Roman Prokofyev](http://prokofyev.ch) @ [eXascale Infolab](http://exascale.info/). Source code and license information are available on [GitHub](https://github.com/dragoon/kilogram/).
In [1]:
import matplotlib.pyplot as plt
from mpltools import style
import numpy as np
style.use('ggplot')
%matplotlib inline
import pandas as pd
import shelve
from collections import defaultdict
In [2]:
# Build one record per (uri, label) pair from the predicted-label counts, then
# overlay the organic counts for pairs that occur in both files.
# Each input line is split as: uri <TAB> label <TAB> first_count,second_count
count_dict = {}

# `with` guarantees the handle is closed even if a malformed line raises.
with open('../mapreduce/predicted_label_counts.txt') as predicted_file:
    for line in predicted_file:
        uri, label, values = line.split('\t')
        upper_count, lower_count = values.split(',')
        count_dict[(uri, label)] = {
            'infer_normal': int(upper_count),
            'infer_lower': int(lower_count),
            # number of '_'-separated tokens in the label
            'len': len(label.split('_')),
            'label': label,
            # organic counts default to 0 until the second pass fills them in
            'organ_normal': 0,
            'organ_lower': 0,
            'uri': uri,
        }

with open('../mapreduce/organic_label_counts.txt') as organic_file:
    for line in organic_file:
        uri, label, values = line.split('\t')
        # organic pairs that were never predicted are ignored
        if (uri, label) in count_dict:
            upper_count, lower_count = values.split(',')
            count_dict[(uri, label)].update(
                {'organ_normal': int(upper_count), 'organ_lower': int(lower_count)})

# Materialise the values: on Python 3 dict.values() is a view, and a plain list
# of record dicts is the form the DataFrame constructor handles on every version.
counts_df = pd.DataFrame(list(count_dict.values()))
# the intermediate dict is no longer needed once the frame exists
del count_dict
counts_df.head()
Out[2]:
In [22]:
from __future__ import division
"""
We never exclude uppercase labels since we don't match at the beginning of a sentence
"""
# Write the tab-separated "label<TAB>uri" pairs that pass the filters below.
# The ratios rely on true division, enabled by the `from __future__ import division`
# at the top of this cell when running on Python 2.
# `with` closes (and flushes) the output file even if a row raises mid-loop.
with open('../mapreduce/unambiguous_labels.txt', 'w') as includes:
    for _, row in counts_df.iterrows():
        label = row['label']
        uri = row['uri']
        # all-uppercase labels are always written as-is
        if label.isupper():
            includes.write(label+'\t'+uri+'\n')
            continue
        # if label appears only in lowercase - add to lower includes
        if row['organ_normal'] == 0:  # means label is lowercase
            if row['organ_lower'] > 1:
                includes.write(label+'\t'+uri+'\n')
            continue
        # `or 1` substitutes a denominator of 1 when the lowercase count is 0
        infer_ratio = row['infer_normal']/(row['infer_lower'] or 1)
        orig_ratio = row['organ_normal']/(row['organ_lower'] or 1)
        if infer_ratio == 0:
            # weird label, e.g. 中华人民共和国
            continue
        # always write a normal-case label
        includes.write(label+'\t'+uri+'\n')
        # additionally emit the lowercased variant when the organic ratio is
        # less than twice the inferred ratio and a lowercase inferred count exists
        if orig_ratio/infer_ratio < 2 and row['infer_lower'] > 0:
            includes.write(label.lower()+'\t'+uri+'\n')
hdfs dfs -cat /user/roman/wikipedia_ngrams/* | python spark_typed_ngrams_from_plain.py > typed_ngrams.txt
hdfs dfs -put typed_ngrams.txt /user/roman/wikipedia_typed_ngrams/
./run_job.py -m ./type_prediction/mapper.py -r ./type_prediction/reducer.py "/user/roman/wikipedia_typed_ngrams" /user/roman/hbase_wikipedia_typed_ngrams
pig -p table=typogram -p path=/user/roman/hbase_wikipedia_typed_ngrams ../extra/hbase_upload_array.pig
In [3]:
# Show every row whose uri column equals 'Cicada'.
counts_df.loc[counts_df['uri'] == 'Cicada']
Out[3]:
In [19]:
# Rows with organ_normal > 0 and infer_lower > 0 whose infer_normal count is 0.
counts_df[
    counts_df['organ_normal'].gt(0)
    & counts_df['infer_lower'].gt(0)
    & counts_df['infer_normal'].eq(0)
]
Out[19]:
In [ ]: