In [1]:
    
# !pip install git+https://github.com/windj007/TexSoup timeout-decorator
# !apt-get install -y latexmk
# !pip install ngram
    
In [1]:
    
%load_ext autoreload
%autoreload 2
%matplotlib inline
# Explicit imports for names used directly in this notebook; table_gen's
# star import may also provide some of these.
import collections, glob, os, subprocess, traceback
import joblib, numpy, tqdm, TexSoup
import matplotlib.pyplot as plt
from table_gen import *
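    
In [ ]:
    
# Environment sanity check (a sketch: compile_latex presumably shells out to
# latexmk/pdflatex, and the synctex CLI is used further down; fail fast if
# any of them is missing from PATH).
import shutil
for tool in ('latexmk', 'pdflatex', 'synctex'):
    print(tool, '->', shutil.which(tool))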
    
    
In [3]:
    
# # pdf2samples('./data/arxiv/1/1312.6989.tar.gz', './data/arxiv/buf/', get_table_info, aggregate_object_bboxes, display_demo=True)
# pdf2samples('./data/arxiv/1/44/1601.04208.tar.gz', './data/arxiv/buf/', get_table_info, aggregate_object_bboxes, display_demo=True)
# pdf2samples('./data/arxiv/sources/1006.1798.tar.gz', './data/arxiv/buf/', get_table_info, aggregate_object_bboxes, display_demo=True)
# # pdf2samples('./data/arxiv/1/5/1201.2088.tar.gz', './data/arxiv/buf/', get_table_info, aggregate_object_bboxes, display_demo=True)
# pdf2samples('./data/arxiv/1/8/0708.1672.tar.gz', './data/arxiv/buf/', get_table_info, aggregate_object_bboxes, display_demo=True)
    
In [4]:
    
# frequent_errors = collections.Counter(err
#                                       for f in glob.glob('./data/arxiv/err_logs/*.log')
#                                       for err in {line
#                                                   for line in open(f, 'r', errors='replace')
#                                                   if "error:" in line})
# frequent_errors.most_common(10)
    
In [147]:
    
# preprocess_latex_file('./data/arxiv/1/44/The_Chiral_Anomaly_Final_Posting.tex')
# Compile the playground document, rasterize its pages, and parse its source.
compile_latex('./111/tex-playground/')
# !mkdir ./data/arxiv/1/44/pages/
pages = pdf_to_pages('./111/tex-playground/playground.pdf', './111/tex-playground/pages/')
with open('./111/tex-playground/playground.tex') as f:
    soup = TexSoup.TexSoup(f.read())
# test_latex = r'''
# \documentclass{llncs}
# \usepackage{graphicx}
# \usepackage{multirow}
# \usepackage{hyperref}
# \usepackage[a4paper, landscape, margin={0.1in, 0.1in}]{geometry}
# \usepackage{tabularx}
# \usepackage{makecell}
# \begin{document}
# \begin{table}
# \renewcommand{\arraystretch}{0.42}
# \setlength{\tabcolsep}{1.52pt}
# \begin{tabular}{ c c r|c|r|l|c|}
#   & .Myrnvnl & \multicolumn{5}{ c }{Bd iH VXDy -aL} \\
#   &  & \multicolumn{2}{|c|}{AlUBLk.cv} & \multicolumn{2}{ c }{ \makecell{ nUd qLoieco jVsmTLRAf \\ UPS TJL xGIH } } & qe.V.. \\
#   &  &  \makecell{ MG MTBSgR, \\ ,lHm Ihmd \\ lbrT }  & -OfQuxW & MeY XR & kSG,dEFX &  \\
# \hline  \makecell{ LuekQjL NSs TVq \\ NDC }  & 8.80 Mv & osw & K*Dgc & 53.16 Tr & 8.92 & 44.18 j- \\
# \hline oL & 55.67 UueS & vGkGl & -MUJhqduw & 67.86 sxRy- & 63.51 & 10.85 A*,hKg \\
#  nA & 7.46 ll & yVw,P & vuege & 96.36 FuEa & 80.27 & 40.46 NeWuNVi \\
#  fA & 0.47 j,Gg.Gv & TrwtXRS & yfhyTWJ & 42.20 sWdg & 8.76 & 98.68 ND \\
# \hline  \makecell{ hD XXOl dMCTp Yib \\ p.IE TcBn }  & 7.90 Pm & CbyWQtUTY, & FPFh.M & 22.38 Hs & 16.03 & 33.20 hU \\
# \hline  \makecell{ LAxtFM cmBvrJj hCRx, \\ LiQYh }  & 97.15 *a & ..pb & ejNtniag & 84.67 F.xHN & 10.31 & 23.57 R,rdK \\
#  x*d afKGwJw & 82.46 REuwGLME & cIQv & iCLkFNY & 95.92 iHL & 79.26 & 80.85 L-NR \\
# \end{tabular}
# \end{table}
# \end{document}
# '''
# soup = TexSoup.TexSoup(test_latex)
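    
In [ ]:
    
# Minimal TexSoup sanity check (a sketch; it only uses TexSoup.TexSoup and
# find_all, the same calls the surrounding cells rely on).
demo_soup = TexSoup.TexSoup(r'\begin{table}\begin{tabular}{cc} a & b \\ \end{tabular}\end{table}')
print(list(demo_soup.find_all('tabular')))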
    
In [84]:
    
# !cat -n ./data/arxiv/1/44/The_Chiral_Anomaly_Final_Posting.tex
    
In [85]:
    
tables = list(soup.find_all('table'))
    
In [86]:
    
t = tables[0]
    
In [87]:
    
t.tabular
    
    Out[87]:
In [91]:
    
# Parse the tabular body into a structured grid of rows and cells.
qq = structurize_tabular_contents(t.tabular)
qq
    
    Out[91]:
In [92]:
    
list(get_all_tokens(qq.rows[8][2]))
    
    Out[92]:
In [93]:
    
# Take one token from a cell and map the middle of its source span back to a
# (line, column) position in playground.tex; synctex needs source coordinates.
ww = next(iter(get_all_tokens(qq.rows[6][0])))
print(ww)
print(type(ww))
src_pos = soup.char_pos_to_line(ww.position + len(ww.text) // 2)
src_pos
    
    
    Out[93]:
In [94]:
    
# synctex expects 1-based line/column numbers; `synctex view` returns the PDF
# boxes that the given source position was typeset into.
o = subprocess.check_output(
    ['synctex', 'view',
     '-i', '{}:{}:{}'.format(src_pos[0] + 1, src_pos[1] + 1, 'playground.tex'),
     '-o', 'playground.pdf'],
    cwd='./111/tex-playground/').decode('ascii')
p = parse_synctex_output(o)
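    
In [ ]:
    
# Fallback parser for `synctex view` output (a sketch under the assumption
# that records carry Page:/h:/v:/W:/H: fields, as in the SyncTeX CLI output;
# parse_synctex_output from table_gen is the authoritative version here).
import collections
import re

def parse_synctex_view_fallback(text):
    pages = collections.defaultdict(list)
    record = {}
    for line in text.splitlines():
        m = re.match(r'(Page|h|v|W|H):(.+)', line)
        if not m:
            continue
        record[m.group(1)] = m.group(2)
        if m.group(1) == 'H':  # H: closes the geometry part of a record
            pages[int(record['Page'])].append(tuple(float(record[k])
                                                    for k in ('h', 'v', 'W', 'H')))
            record = {}
    return dict(pages)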
    
In [95]:
    
page_i, boxes = list(p.items())[0]
box = boxes[2]  # one representative box; `boxes` holds all of them
print(page_i, boxes)
    
    
In [96]:
    
pdf = PdfMinerWrapper('./111/tex-playground/playground.pdf')
pdf.load()
    
In [97]:
    
page_info = pdf.get_page(page_i - 1)
# Convert the synctex boxes into pdfminer page coordinates, then fetch the
# layout objects and the text under each box.
found_boxes = list(pdf.get_boxes(page_i - 1, [convert_coords_to_pq(b, page_info[1].cropbox)
                                              for b in boxes]))
print('; '.join(pdf.get_text(page_i - 1,
                             [convert_coords_to_pq(b, page_info[1].cropbox)])
                for b in boxes))
    
    
In [98]:
    
table_info = list(get_table_info(soup))[1]
    
In [99]:
    
# Overlay the pdfminer boxes (converted back to page coordinates and scaled to
# pixels) and the raw synctex boxes on the rendered page image.
page_img = load_image_opaque(pages[page_i - 1])
make_demo_mask(page_img,
               [(1,
                 (convert_coords_from_pq(fb.bbox, page_info[1].cropbox) * POINTS_TO_PIXELS_FACTOR).astype('int'))
                for fb in found_boxes] +
               [(1, (numpy.array(b) * POINTS_TO_PIXELS_FACTOR).astype('int')) for b in boxes])
    
    Out[99]:
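In [ ]:
    
# Where a points-to-pixels factor comes from (illustration only; table_gen
# defines the real POINTS_TO_PIXELS_FACTOR): PDF coordinates are measured in
# points, 1 pt = 1/72 inch, so a page rendered at DPI dots per inch scales by
# DPI / 72. RENDER_DPI below is a hypothetical value.
RENDER_DPI = 300
print('points -> pixels factor at {} DPI:'.format(RENDER_DPI), RENDER_DPI / 72.0)
    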
In [3]:
    
pdf_latex_to_samples('1',
                     '.',
                     './111/tex-playground/playground.tex',
                     './111/tex-playground/playground.pdf',
                     './111/tex-playground/',
                     get_table_info,
                     boxes_aggregator=aggregate_object_bboxes,
                     display_demo=True)
    
    
In [ ]:
    
# print('\n*********\n'.join(map(str, get_all_tokens(t.tabular))))
    
In [11]:
    
# table_def = gen_table_contents()
# print('columns', len(table_def[2][0]), 'rows', len(table_def[2]))
    
In [12]:
    
# # %%prun
# render_table(table_def, '/notebook/templates/springer/', '/notebook/data/generated/1.pdf',
#              print_latex_content=True,
#              display_demo=True,
#              on_wrong_parse='ignore')
    
In [13]:
    
def gen_and_save_table(i, seed):
    # Seed per task so each parallel worker generates a reproducible table.
    numpy.random.seed(seed)
    table_def = gen_table_contents()
    render_table(table_def, '/notebook/templates/springer/', '/notebook/data/generated_with_char_info/big_simple_lined/src/{}'.format(i))
# NB: randint may repeat seeds, so some of the 2000 tables can be duplicates.
seeds = numpy.random.randint(0, 2000, size=2000)
joblib.Parallel(n_jobs=6)(joblib.delayed(gen_and_save_table)(i, s) for i, s in enumerate(seeds))
    
    Out[13]:
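In [ ]:
    
# Since randint above can repeat seeds, two workers may regenerate the same
# table. A collision-free alternative (a sketch; same downstream usage):
import numpy
unique_seeds = numpy.random.permutation(2 ** 20)[:2000]  # 2000 distinct seeds
    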
In [32]:
    
# for dirname in ['complex_clean', 'dense', 'lined', 'multiline_lined', 'no_lined', 'big_simple_lined', 'big_simple_no_lined']:
#     print(dirname)
#     for subdir in ['demo', 'src']:
#         print(subdir)
#         src_full_dirname = os.path.join('./data/generated', dirname, subdir)
#         target_full_dirname = os.path.join('./data/generated/full', subdir)
#         for fname in tqdm.tqdm(os.listdir(src_full_dirname)):
#             shutil.copy2(os.path.join(src_full_dirname, fname),
#                          os.path.join(target_full_dirname, dirname + '_' + fname))
    
In [2]:
    
archive_files = list(glob.glob('./data/arxiv/sources/*.tar.gz'))
print('Total downloaded', len(archive_files))
# def _get_archive_content_type(fname):
#     return read_metadata(fname)['content_type']
# print('Types:\n', collections.Counter(joblib.Parallel(n_jobs=-1)(joblib.delayed(_get_archive_content_type)(archive)
#                                                                  for archive in archive_files)).most_common())
# print()
    
    
Total downloaded 208559
Types:
[('application/x-eprint-tar', 149642), ('application/x-eprint', 40360), ('application/pdf', 18292), ('application/vnd.openxmlformats-officedocument.wordprocessingml.document', 218), ('application/postscript', 47)]
In [3]:
    
good_papers = set()
bad_papers = set()
if os.path.exists('./good_papers.lst'):
    with open('./good_papers.lst', 'r') as f:
        good_papers = set(line.strip() for line in f)
if os.path.exists('./bad_papers.lst'):
    with open('./bad_papers.lst', 'r') as f:
        bad_papers = set(line.strip() for line in f)
print('Good papers', len(good_papers))
print('Bad papers', len(bad_papers))
    
    
In [ ]:
    
# def check_archive_func(fname):
#     return (fname,
#             contains_something_interesting(fname, get_table_info))
# archive_files_with_check_res = joblib.Parallel(n_jobs=12)(joblib.delayed(check_archive_func)(fname)
#                                                           for fname in archive_files
#                                                           if not (fname in bad_papers or fname in good_papers))
# for fname, is_good in archive_files_with_check_res:
#     if is_good:
#         good_papers.add(fname)
#     else:
#         bad_papers.add(fname)
    
In [ ]:
    
# with open('./good_papers.lst', 'w') as f:
#     f.write('\n'.join(sorted(good_papers)))
# with open('./bad_papers.lst', 'w') as f:
#     f.write('\n'.join(sorted(bad_papers)))
    
In [ ]:
    
ARXIV_INOUT_PAIRS_DIR = './data/arxiv/inout_pairs/'
def _pdf2samples_mp(archive):
    # Process a single arXiv source archive; log failures per archive so one
    # bad paper does not abort the whole parallel run.
    try:
        pdf2samples(archive,
                    ARXIV_INOUT_PAIRS_DIR,
                    lambda s: get_table_info(s, extract_cells=False),
                    aggregate_object_bboxes)
    except Exception as ex:
        with open(os.path.join(ARXIV_INOUT_PAIRS_DIR, os.path.basename(archive) + '.log'), 'w') as f:
            f.write(str(ex) + '\n')
            f.write(traceback.format_exc())
_ = joblib.Parallel(n_jobs=10)(joblib.delayed(_pdf2samples_mp)(arc)
                               for arc in good_papers)
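    
In [ ]:
    
# Quick triage after the run: each failed archive left a .log next to the
# samples (written by _pdf2samples_mp above), so counting them gives the
# failure rate.
failed = glob.glob(os.path.join(ARXIV_INOUT_PAIRS_DIR, '*.log'))
print('Failed archives:', len(failed))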
    
In [ ]: