In [1]:
# !pip install git+https://github.com/windj007/TexSoup timeout-decorator
# !apt-get install -y latexmk
# !pip install ngram
In [1]:
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import tqdm
%pylab inline
from table_gen import *
In [3]:
# # pdf2samples('./data/arxiv/1/1312.6989.tar.gz', './data/arxiv/buf/', get_table_info, aggregate_object_bboxes, display_demo=True)
# pdf2samples('./data/arxiv/1/44/1601.04208.tar.gz', './data/arxiv/buf/', get_table_info, aggregate_object_bboxes, display_demo=True)
# pdf2samples('./data/arxiv/sources/1006.1798.tar.gz', './data/arxiv/buf/', get_table_info, aggregate_object_bboxes, display_demo=True)
# # pdf2samples('./data/arxiv/1/5/1201.2088.tar.gz', './data/arxiv/buf/', get_table_info, aggregate_object_bboxes, display_demo=True)
# pdf2samples('./data/arxiv/1/8/0708.1672.tar.gz', './data/arxiv/buf/', get_table_info, aggregate_object_bboxes, display_demo=True)
In [4]:
# frequent_errors = collections.Counter(err
#                                        for f in glob.glob('./data/arxiv/err_logs/*.log')
#                                        for err in {line
#                                                    for line in open(f, 'r', errors='replace')
#                                                    if "error:" in line})
# frequent_errors.most_common(10)
In [147]:
# preprocess_latex_file('./data/arxiv/1/44/The_Chiral_Anomaly_Final_Posting.tex')
compile_latex('./111/tex-playground/')
# !mkdir ./data/arxiv/1/44/pages/
pages = pdf_to_pages('./111/tex-playground/playground.pdf', './111/tex-playground/pages/')
with open('./111/tex-playground/playground.tex') as f:
    soup = TexSoup.TexSoup(f.read())
# test_latex = r'''
# \documentclass{llncs}
# \usepackage{graphicx}
# \usepackage{multirow}
# \usepackage{hyperref}
# \usepackage[a4paper, landscape, margin={0.1in, 0.1in}]{geometry}
# \usepackage{tabularx}
# \usepackage{makecell}
# \begin{document}
# \begin{table}
# \renewcommand{\arraystretch}{0.42}
# \setlength{\tabcolsep}{1.52pt}
# \begin{tabular}{ c c r|c|r|l|c|}
# & .Myrnvnl & \multicolumn{5}{ c }{Bd iH VXDy -aL} \\
# & & \multicolumn{2}{|c|}{AlUBLk.cv} & \multicolumn{2}{ c }{ \makecell{ nUd qLoieco jVsmTLRAf \\ UPS TJL xGIH } } & qe.V.. \\
# & & \makecell{ MG MTBSgR, \\ ,lHm Ihmd \\ lbrT } & -OfQuxW & MeY XR & kSG,dEFX & \\
# \hline \makecell{ LuekQjL NSs TVq \\ NDC } & 8.80 Mv & osw & K*Dgc & 53.16 Tr & 8.92 & 44.18 j- \\
# \hline oL & 55.67 UueS & vGkGl & -MUJhqduw & 67.86 sxRy- & 63.51 & 10.85 A*,hKg \\
# nA & 7.46 ll & yVw,P & vuege & 96.36 FuEa & 80.27 & 40.46 NeWuNVi \\
# fA & 0.47 j,Gg.Gv & TrwtXRS & yfhyTWJ & 42.20 sWdg & 8.76 & 98.68 ND \\
# \hline \makecell{ hD XXOl dMCTp Yib \\ p.IE TcBn } & 7.90 Pm & CbyWQtUTY, & FPFh.M & 22.38 Hs & 16.03 & 33.20 hU \\
# \hline \makecell{ LAxtFM cmBvrJj hCRx, \\ LiQYh } & 97.15 *a & ..pb & ejNtniag & 84.67 F.xHN & 10.31 & 23.57 R,rdK \\
# x*d afKGwJw & 82.46 REuwGLME & cIQv & iCLkFNY & 95.92 iHL & 79.26 & 80.85 L-NR \\
# \end{tabular}
# \end{table}
# \end{document}
# '''
# soup = TexSoup.TexSoup(test_latex)
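In [ ]:
# Not part of table_gen: a minimal stand-in showing roughly what pdf_to_pages
# has to do, assuming the poppler-utils `pdftoppm` CLI is installed. The real
# helper imported above may differ in naming, resolution, and output format.
import os
import subprocess

def pdf_to_pages_sketch(pdf_path, out_dir, dpi=150):
    os.makedirs(out_dir, exist_ok=True)
    # pdftoppm renders each page to <out_dir>/page-<N>.png at the given DPI
    subprocess.check_call(['pdftoppm', '-png', '-r', str(dpi),
                           pdf_path, os.path.join(out_dir, 'page')])
    # lexicographic sort; sufficient as a sketch for short documents
    return sorted(os.path.join(out_dir, f)
                  for f in os.listdir(out_dir) if f.endswith('.png'))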
In [84]:
# !cat -n ./data/arxiv/1/44/The_Chiral_Anomaly_Final_Posting.tex
In [85]:
tables = list(soup.find_all('table'))
In [86]:
t = tables[0]
In [87]:
t.tabular
Out[87]:
In [91]:
qq = structurize_tabular_contents(t.tabular)
qq
Out[91]:
In [92]:
list(get_all_tokens(qq.rows[8][2]))
Out[92]:
In [93]:
ww = next(iter(get_all_tokens(qq.rows[6][0])))
print(ww)
print(type(ww))
src_pos = soup.char_pos_to_line(ww.position + len(ww.text) // 2)
src_pos
Out[93]:
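In [ ]:
# Sketch of the conversion char_pos_to_line performs above (assumed semantics:
# flat character offset into the .tex source -> zero-based (line, column) pair,
# which the next cell turns 1-based for synctex).
def char_pos_to_line_sketch(text, pos):
    line = text.count('\n', 0, pos)             # newlines before pos = line index
    col = pos - (text.rfind('\n', 0, pos) + 1)  # offset from start of that line
    return line, col

char_pos_to_line_sketch('a\nbc\ndef', 5)  # -> (2, 0), the 'd'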
In [94]:
o = subprocess.check_output(['synctex', 'view',
                             '-i', '{}:{}:{}'.format(src_pos[0] + 1,
                                                     src_pos[1] + 1,
                                                     'playground.tex'),
                             '-o', 'playground.pdf'],
                            cwd='./111/tex-playground/').decode('ascii')
p = parse_synctex_output(o)
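In [ ]:
# Sketch of what parse_synctex_output extracts (assumed record layout: `synctex
# view` prints repeated Page:/h:/v:/W:/H: fields per located box, in PDF points).
# The real parser in table_gen may read more fields and handle more cases.
import collections
import re

def parse_synctex_output_sketch(output):
    boxes = collections.defaultdict(list)
    cur = {}
    for line in output.splitlines():
        m = re.match(r'^(Page|h|v|W|H):([-0-9.]+)$', line)
        if not m:
            continue
        cur[m.group(1)] = float(m.group(2))
        if len(cur) == 5:  # one full Page/h/v/W/H record collected
            boxes[int(cur['Page'])].append((cur['h'], cur['v'], cur['W'], cur['H']))
            cur = {}
    return dict(boxes)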
In [95]:
page_i, boxes = list(p.items())[0]
box = boxes[2]
print(page_i, boxes)
In [96]:
pdf = PdfMinerWrapper('./111/tex-playground/playground.pdf')
pdf.load()
In [97]:
page_info = pdf.get_page(page_i-1)
found_boxes = list(pdf.get_boxes(page_i-1, [convert_coords_to_pq(b, page_info[1].cropbox)
                                            for b in boxes]))
print('; '.join(pdf.get_text(page_i-1,
                             [convert_coords_to_pq(b, page_info[1].cropbox)])
                for b in boxes))
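In [ ]:
# Sketch of the coordinate flip convert_coords_to_pq presumably performs
# (assumption: synctex boxes are top-left-origin with y pointing down, while
# pdfminer bboxes are bottom-left-origin with y pointing up, so y is mirrored
# against the cropbox height).
def convert_coords_to_pq_sketch(box, cropbox):
    x0, y0, x1, y1 = box
    page_height = cropbox[3] - cropbox[1]
    return (x0, page_height - y1, x1, page_height - y0)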
In [98]:
table_info = list(get_table_info(soup))[1]
In [99]:
page_img = load_image_opaque(pages[page_i - 1])
make_demo_mask(page_img,
               [(1,
                 (convert_coords_from_pq(fb.bbox, page_info[1].cropbox) * POINTS_TO_PIXELS_FACTOR).astype('int'))
                for fb in found_boxes] +
               [(1, (numpy.array(b) * POINTS_TO_PIXELS_FACTOR).astype('int')) for b in boxes])
Out[99]:
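In [ ]:
# For reference: PDF coordinates are in points (72 per inch), so
# POINTS_TO_PIXELS_FACTOR is just render_dpi / 72. The DPI below is an assumed
# value; it must match whatever resolution pdf_to_pages rendered the pages at.
RENDER_DPI = 150  # hypothetical
points_to_pixels_factor = RENDER_DPI / 72.0  # ~2.083 at 150 dpi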
In [3]:
pdf_latex_to_samples('1',
                     '.',
                     './111/tex-playground/playground.tex',
                     './111/tex-playground/playground.pdf',
                     './111/tex-playground/',
                     get_table_info,
                     boxes_aggregator=aggregate_object_bboxes,
                     display_demo=True)
In [ ]:
# print('\n*********\n'.join(map(str, get_all_tokens(t.tabular))))
In [11]:
# table_def = gen_table_contents()
# print('columns', len(table_def[2][0]), 'rows', len(table_def[2]))
In [12]:
# # %%prun
# render_table(table_def, '/notebook/templates/springer/', '/notebook/data/generated/1.pdf',
#              print_latex_content=True,
#              display_demo=True,
#              on_wrong_parse='ignore')
In [13]:
def gen_and_save_table(i, seed):
    # seed each worker process explicitly so the parallel run is reproducible
    numpy.random.seed(seed)
    table_def = gen_table_contents()
    render_table(table_def, '/notebook/templates/springer/',
                 '/notebook/data/generated_with_char_info/big_simple_lined/src/{}'.format(i))

seeds = numpy.random.randint(0, 2000, size=2000)
joblib.Parallel(n_jobs=6)(joblib.delayed(gen_and_save_table)(i, s) for i, s in enumerate(seeds))
Out[13]:
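In [ ]:
# Note: numpy.random.randint(0, 2000, size=2000) almost surely draws duplicate
# seeds (birthday effect), so some workers regenerate identical tables. A
# collision-free alternative, sketched here, keeps the run reproducible:
unique_seeds = numpy.random.choice(10**6, size=2000, replace=False)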
In [32]:
# for dirname in ['complex_clean', 'dense', 'lined', 'multiline_lined', 'no_lined', 'big_simple_lined', 'big_simple_no_lined']:
#     print(dirname)
#     for subdir in ['demo', 'src']:
#         print(subdir)
#         src_full_dirname = os.path.join('./data/generated', dirname, subdir)
#         target_full_dirname = os.path.join('./data/generated/full', subdir)
#         for fname in tqdm.tqdm(os.listdir(src_full_dirname)):
#             shutil.copy2(os.path.join(src_full_dirname, fname),
#                          os.path.join(target_full_dirname, dirname + '_' + fname))
In [2]:
archive_files = list(glob.glob('./data/arxiv/sources/*.tar.gz'))
print('Total downloaded', len(archive_files))
# def _get_archive_content_type(fname):
#     return read_metadata(fname)['content_type']
# print('Types:\n', collections.Counter(joblib.Parallel(n_jobs=-1)(joblib.delayed(_get_archive_content_type)(archive)
#                                                                  for archive in archive_files)).most_common())
# print()
Total downloaded 208559
Types:
[('application/x-eprint-tar', 149642), ('application/x-eprint', 40360), ('application/pdf', 18292), ('application/vnd.openxmlformats-officedocument.wordprocessingml.document', 218), ('application/postscript', 47)]
In [3]:
good_papers = set()
bad_papers = set()
if os.path.exists('./good_papers.lst'):
    with open('./good_papers.lst', 'r') as f:
        good_papers = set(line.strip() for line in f)
if os.path.exists('./bad_papers.lst'):
    with open('./bad_papers.lst', 'r') as f:
        bad_papers = set(line.strip() for line in f)
print('Good papers', len(good_papers))
print('Bad papers', len(bad_papers))
In [ ]:
# def check_archive_func(fname):
#     return (fname,
#             contains_something_interesting(fname, get_table_info))
# archive_files_with_check_res = joblib.Parallel(n_jobs=12)(joblib.delayed(check_archive_func)(fname)
#                                                           for fname in archive_files
#                                                           if not (fname in bad_papers or fname in good_papers))
# for fname, is_good in archive_files_with_check_res:
#     if is_good:
#         good_papers.add(fname)
#     else:
#         bad_papers.add(fname)
In [ ]:
# with open('./good_papers.lst', 'w') as f:
#     f.write('\n'.join(sorted(good_papers)))
# with open('./bad_papers.lst', 'w') as f:
#     f.write('\n'.join(sorted(bad_papers)))
In [ ]:
ARXIV_INOUT_PAIRS_DIR = './data/arxiv/inout_pairs/'

def _pdf2samples_mp(archive):
    try:
        pdf2samples(archive,
                    ARXIV_INOUT_PAIRS_DIR,
                    lambda s: get_table_info(s, extract_cells=False),
                    aggregate_object_bboxes)
    except Exception as ex:
        # log the failure per archive instead of aborting the whole parallel run
        with open(os.path.join(ARXIV_INOUT_PAIRS_DIR, os.path.basename(archive) + '.log'), 'w') as f:
            f.write(str(ex) + '\n')
            f.write(traceback.format_exc())

_ = joblib.Parallel(n_jobs=10)(joblib.delayed(_pdf2samples_mp)(arc)
                               for arc in good_papers)
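In [ ]:
# After the run, the per-archive .log files written by _pdf2samples_mp can be
# summarized to surface the most common failure modes (same idea as the
# frequent_errors cell near the top). The first line of each log is str(ex).
failure_counts = collections.Counter(
    open(f, errors='replace').readline().strip()
    for f in glob.glob(os.path.join(ARXIV_INOUT_PAIRS_DIR, '*.log')))
failure_counts.most_common(10)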
In [ ]: