In [1]:
# !pip install git+https://github.com/windj007/TexSoup timeout-decorator
# !apt-get install -y latexmk
# !pip install ngram

In [1]:
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import tqdm
%pylab inline

# table_gen presumably re-exports the helpers used below (compile_latex, pdf_to_pages,
# PdfMinerWrapper, render_table, get_table_info, ...) along with numpy, joblib, TexSoup, etc.
from table_gen import *


Populating the interactive namespace from numpy and matplotlib

In [3]:
# # pdf2samples('./data/arxiv/1/1312.6989.tar.gz', './data/arxiv/buf/', get_table_info, aggregate_object_bboxes, display_demo=True)
# pdf2samples('./data/arxiv/1/44/1601.04208.tar.gz', './data/arxiv/buf/', get_table_info, aggregate_object_bboxes, display_demo=True)
# pdf2samples('./data/arxiv/sources/1006.1798.tar.gz', './data/arxiv/buf/', get_table_info, aggregate_object_bboxes, display_demo=True)
# # pdf2samples('./data/arxiv/1/5/1201.2088.tar.gz', './data/arxiv/buf/', get_table_info, aggregate_object_bboxes, display_demo=True)
# pdf2samples('./data/arxiv/1/8/0708.1672.tar.gz', './data/arxiv/buf/', get_table_info, aggregate_object_bboxes, display_demo=True)

Analyze error logs


In [4]:
# frequent_errors = collections.Counter(err
#                                       for f in glob.glob('./data/arxiv/err_logs/*.log')
#                                       for err in {line
#                                                   for line in open(f, 'r', errors='replace')
#                                                   if "error:" in line})
# frequent_errors.most_common(10)
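
For reference, a self-contained, uncommented version of the same tally (it assumes the ./data/arxiv/err_logs/*.log layout used above):

import collections
import glob

frequent_errors = collections.Counter(
    err
    for f in glob.glob('./data/arxiv/err_logs/*.log')
    for err in {line
                for line in open(f, 'r', errors='replace')
                if "error:" in line})
frequent_errors.most_common(10)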

Debug


In [147]:
# preprocess_latex_file('./data/arxiv/1/44/The_Chiral_Anomaly_Final_Posting.tex')
compile_latex('./111/tex-playground/')  # build playground.tex into playground.pdf
# !mkdir ./data/arxiv/1/44/pages/
pages = pdf_to_pages('./111/tex-playground/playground.pdf', './111/tex-playground/pages/')  # rasterize per-page images
with open('./111/tex-playground/playground.tex') as f:
    soup = TexSoup.TexSoup(f.read())  # parse the LaTeX source into a TexSoup tree

# test_latex = r'''
# \documentclass{llncs}
# \usepackage{graphicx}
# \usepackage{multirow}
# \usepackage{hyperref}
# \usepackage[a4paper, landscape, margin={0.1in, 0.1in}]{geometry}
# \usepackage{tabularx}
# \usepackage{makecell}

# \begin{document}


# \begin{table}
# \renewcommand{\arraystretch}{0.42}
# \setlength{\tabcolsep}{1.52pt}
# \begin{tabular}{ c c r|c|r|l|c|}
#   & .Myrnvnl & \multicolumn{5}{ c }{Bd iH VXDy -aL} \\
#   &  & \multicolumn{2}{|c|}{AlUBLk.cv} & \multicolumn{2}{ c }{ \makecell{ nUd qLoieco jVsmTLRAf \\ UPS TJL xGIH } } & qe.V.. \\
#   &  &  \makecell{ MG MTBSgR, \\ ,lHm Ihmd \\ lbrT }  & -OfQuxW & MeY XR & kSG,dEFX &  \\
# \hline  \makecell{ LuekQjL NSs TVq \\ NDC }  & 8.80 Mv & osw & K*Dgc & 53.16 Tr & 8.92 & 44.18 j- \\
# \hline oL & 55.67 UueS & vGkGl & -MUJhqduw & 67.86 sxRy- & 63.51 & 10.85 A*,hKg \\
#  nA & 7.46 ll & yVw,P & vuege & 96.36 FuEa & 80.27 & 40.46 NeWuNVi \\
#  fA & 0.47 j,Gg.Gv & TrwtXRS & yfhyTWJ & 42.20 sWdg & 8.76 & 98.68 ND \\
# \hline  \makecell{ hD XXOl dMCTp Yib \\ p.IE TcBn }  & 7.90 Pm & CbyWQtUTY, & FPFh.M & 22.38 Hs & 16.03 & 33.20 hU \\
# \hline  \makecell{ LAxtFM cmBvrJj hCRx, \\ LiQYh }  & 97.15 *a & ..pb & ejNtniag & 84.67 F.xHN & 10.31 & 23.57 R,rdK \\
#  x*d afKGwJw & 82.46 REuwGLME & cIQv & iCLkFNY & 95.92 iHL & 79.26 & 80.85 L-NR \\

# \end{tabular}
# \end{table}


# \end{document}
# '''
# soup = TexSoup.TexSoup(test_latex)

In [84]:
# !cat -n ./data/arxiv/1/44/The_Chiral_Anomaly_Final_Posting.tex

In [85]:
tables = list(soup.find_all('table'))

In [86]:
t = tables[0]

In [87]:
t.tabular


Out[87]:
\begin{tabular}{|c c|l c c c|l c }
  &  &  
\makecell{ xk*,h OKJpTH kn \\ ht.WGOf }
  & 
\multicolumn{5}{ c }{is Agd*}
 \\
  &  &  & 
\multicolumn{2}{|c|}{LxL ,Ataf,}
 & 
\multicolumn{3}{ c }{hJwk -QNS .Llf}
 \\
  &  &  & .WWk jUWRtqd & PK & VVRVOo*K & rCqbmH VYbt Iyf*wxi,LmW wFGPCCsbJBr & Lyk \\

\cline{2-4}

 
\multirow{2}{*}{DkEyETN ,TPuErkV}
 & wLp.P DO & 27.95 wyXgy & 10.62 & 55.88 wwRM & 13.59 KIXlEYbdvFb & 58.85 & -c UbUt*Xnvi \\

\cline{2-4}

  & UlJq Ivkht & 75.64 gsPoV & 16.15 & 68.77 Uq & 67.00 dKcnw & 21.23 & lQyaTLStQk lFBrIkM.P \\

\hline DTj JDNKBd XN pXd &  & 64.29 EbtES- & 11.08 & 59.40 .TwXn & 91.38 Hr & 49.64 & Il pg \\

\hline CI &  & 14.97 Rut & 59.36 & 82.93 yi & 57.05 dshL & 99.76 & vySj QHh \\
  
\makecell{ qkXa bxUq \\ LEr JpbVkyB \\ eedHrG }
  &  & 84.47 UWjqn & 54.89 & 44.58 df & 77.92 pQnE & 14.32 & wOEmkE Wd \\
 ff.h LMp KaQ ,o &  & 91.19 bRi & 16.26 & 39.69 th.inRwn & 21.75 brvkFB & 12.32 & oRiQw glGmwj \\

\end{tabular}

In [91]:
qq = structurize_tabular_contents(t.tabular)
qq


Out[91]:
\makecell{ xk*,h OKJpTH kn \\ ht.WGOf } \multicolumn{5}{ c }{is Agd*}
\multicolumn{2}{|c|}{LxL ,Ataf,} \multicolumn{3}{ c }{hJwk -QNS .Llf}
.WWk jUWRtqd PK VVRVOo*K rCqbmH VYbt Iyf*wxi,LmW wFGPCCsbJBr Lyk
\multirow{2}{*}{DkEyETN ,TPuErkV} wLp.P DO 27.95 wyXgy 10.62 55.88 wwRM 13.59 KIXlEYbdvFb 58.85 -c UbUt*Xnvi
UlJq Ivkht 75.64 gsPoV 16.15 68.77 Uq 67.00 dKcnw 21.23 lQyaTLStQk lFBrIkM.P
DTj JDNKBd XN pXd 64.29 EbtES- 11.08 59.40 .TwXn 91.38 Hr 49.64 Il pg
CI 14.97 Rut 59.36 82.93 yi 57.05 dshL 99.76 vySj QHh
\makecell{ qkXa bxUq \\ LEr JpbVkyB \\ eedHrG } 84.47 UWjqn 54.89 44.58 df 77.92 pQnE 14.32 wOEmkE Wd
ff.h LMp KaQ ,o 91.19 bRi 16.26 39.69 th.inRwn 21.75 brvkFB 12.32 oRiQw glGmwj

In [92]:
list(get_all_tokens(qq.rows[8][2]))


Out[92]:
['91.19', 'bRi']

In [93]:
# Take the first token of the cell at row 6, column 0 and map the middle of its
# text back to a (line, column) position in the LaTeX source.
ww = next(iter(get_all_tokens(qq.rows[6][0])))
print(ww)
print(type(ww))
src_pos = soup.char_pos_to_line(ww.position + len(ww.text) // 2)
src_pos


CI
<class 'TexSoup.utils.TokenWithPosition'>
Out[93]:
(25, 8)
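
As a sanity check, the reported position can be read straight from the source file. This is a hypothetical snippet; char_pos_to_line appears to return a 0-based (line, column) pair, which is why the synctex call below adds 1 to both.

with open('./111/tex-playground/playground.tex') as f:
    src_lines = f.read().splitlines()
line_no, col_no = src_pos
# A small window around the reported position; it should include the 'CI' cell.
print(src_lines[line_no][max(col_no - 10, 0):col_no + 10])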

In [94]:
# Ask synctex where that source position lands in the compiled PDF
# (line and column are converted to 1-based for synctex).
o = subprocess.check_output(['synctex', 'view',
                             '-i', '{}:{}:{}'.format(src_pos[0] + 1,
                                                     src_pos[1] + 1,
                                                     'playground.tex'),
                             '-o', 'playground.pdf'],
                            cwd='./111/tex-playground/').decode('ascii')
p = parse_synctex_output(o)  # page number -> list of candidate bounding boxes
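
parse_synctex_output itself lives in table_gen. The sketch below only illustrates the general idea of grouping the geometry records that synctex view prints (Page:, h:, v:, W:, H:) by page; the field handling and the box format here are assumptions, not the actual implementation.

import collections
import re

def parse_synctex_output_sketch(text):
    # Collect {page_number: [(h, v, W, H), ...]} from the synctex view report.
    pages = collections.defaultdict(list)
    record = {}
    for line in text.splitlines():
        m = re.match(r'(Page|h|v|W|H):([-0-9.]+)$', line.strip())
        if not m:
            continue
        key, value = m.group(1), float(m.group(2))
        if key == 'Page':
            record = {'Page': int(value)}
        else:
            record[key] = value
        if len(record) == 5:  # Page plus h, v, W, H all seen
            pages[record['Page']].append((record['h'], record['v'],
                                          record['W'], record['H']))
            record = {'Page': record['Page']}
    return dict(pages)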

In [95]:
page_i, boxes = list(p.items())[0]
box = boxes[2]
print(page_i, boxes)


1 [[120.86764900000001, 157.165283, 145.941574, 238.599922], [120.86764900000001, 7.20043, 145.941574, 107.33000799999999], [120.86764900000001, 157.165283, 145.941574, 238.599922], [120.86764900000001, 238.599915, 145.941574, 313.12020900000005], [120.86764900000001, 313.120209, 145.941574, 380.22259499999996], [120.86764900000001, 380.222595, 145.941574, 469.603706], [120.86764900000001, 469.603729, 145.941574, 659.9803919999999], [120.86764900000001, 659.980347, 145.941574, 765.1120070000001]]

In [96]:
pdf = PdfMinerWrapper('./111/tex-playground/playground.pdf')
pdf.load()

In [97]:
# synctex pages are 1-based while PdfMinerWrapper is 0-based; the synctex boxes
# are converted into pdfminer's coordinate space before querying.
page_info = pdf.get_page(page_i - 1)
found_boxes = list(pdf.get_boxes(page_i - 1, [convert_coords_to_pq(b, page_info[1].cropbox)
                                              for b in boxes]))
print('; '.join(pdf.get_text(page_i - 1,
                             [convert_coords_to_pq(b, page_info[1].cropbox)])
                for b in boxes))


14.97Rut; CI; 14.97Rut; 59.36; 82.93yi; 57.05dshL; 99.76; vySjQHh
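
convert_coords_to_pq is another table_gen helper. Conceptually it has to move the synctex boxes (y measured downwards from the top of the page) into pdfminer's coordinate system (y measured upwards from the bottom). A minimal sketch of such a flip, assuming the boxes are (top, left, bottom, right) tuples in points; the real helper may differ:

def convert_coords_to_pq_sketch(box, cropbox):
    # cropbox is (x0, y0, x1, y1) in points; flip the vertical axis.
    top, left, bottom, right = box
    page_height = cropbox[3] - cropbox[1]
    return (left, page_height - bottom, right, page_height - top)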

In [98]:
table_info = list(get_table_info(soup))[1]

In [99]:
# Overlay both the pdfminer-found boxes and the raw synctex boxes on the rasterized
# page image; coordinates in points are scaled to pixels via POINTS_TO_PIXELS_FACTOR.
page_img = load_image_opaque(pages[page_i - 1])
make_demo_mask(page_img,
               [(1,
                 (convert_coords_from_pq(fb.bbox, page_info[1].cropbox) * POINTS_TO_PIXELS_FACTOR).astype('int'))
                for fb in found_boxes] +
               [(1, (numpy.array(b) * POINTS_TO_PIXELS_FACTOR).astype('int')) for b in boxes])


Out[99]:

In [3]:
pdf_latex_to_samples('1',
                     '.',
                     './111/tex-playground/playground.tex',
                     './111/tex-playground/playground.pdf',
                     './111/tex-playground/',
                     get_table_info,
                     boxes_aggregator=aggregate_object_bboxes,
                     display_demo=True)



In [ ]:
# print('\n*********\n'.join(map(str, get_all_tokens(t.tabular))))

Generate tables


In [11]:
# table_def = gen_table_contents()
# print('columns', len(table_def[2][0]), 'rows', len(table_def[2]))

In [12]:
# # %%prun
# render_table(table_def, '/notebook/templates/springer/', '/notebook/data/generated/1.pdf',
#              print_latex_content=True,
#              display_demo=True,
#              on_wrong_parse='ignore')

In [13]:
def gen_and_save_table(i, seed):
    numpy.random.seed(seed)  # seed per task so each generated table is reproducible
    table_def = gen_table_contents()
    render_table(table_def, '/notebook/templates/springer/', '/notebook/data/generated_with_char_info/big_simple_lined/src/{}'.format(i))

seeds = numpy.random.randint(0, 2000, size=2000)
joblib.Parallel(n_jobs=6)(joblib.delayed(gen_and_save_table)(i, s) for i, s in enumerate(seeds))


Out[13]:
[None,
 None,
 None,
 ...]
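
The long Out[13] above is just joblib returning one None per rendered table. Note also that numpy.random.randint draws with replacement, so some of the 2000 seeds will repeat and the corresponding tables will be identical; a small variation that avoids both issues:

seeds = numpy.random.choice(10**6, size=2000, replace=False)  # distinct seeds
_ = joblib.Parallel(n_jobs=6)(joblib.delayed(gen_and_save_table)(i, s)
                              for i, s in enumerate(seeds))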

In [32]:
# for dirname in ['complex_clean', 'dense', 'lined', 'multiline_lined', 'no_lined', 'big_simple_lined', 'big_simple_no_lined']:
#     print(dirname)
#     for subdir in ['demo', 'src']:
#         print(subdir)
#         src_full_dirname = os.path.join('./data/generated', dirname, subdir)
#         target_full_dirname = os.path.join('./data/generated/full', subdir)
#         for fname in tqdm.tqdm(os.listdir(src_full_dirname)):
#             shutil.copy2(os.path.join(src_full_dirname, fname),
#                          os.path.join(target_full_dirname, dirname + '_' + fname))

Get some statistics


In [2]:
archive_files = list(glob.glob('./data/arxiv/sources/*.tar.gz'))
print('Total downloaded', len(archive_files))

# def _get_archive_content_type(fname):
#     return read_metadata(fname)['content_type']
# print('Types:\n', collections.Counter(joblib.Parallel(n_jobs=-1)(joblib.delayed(_get_archive_content_type)(archive)
#                                                                  for archive in archive_files)).most_common())
# print()


Total downloaded 208559

Types:

[('application/x-eprint-tar', 149642), ('application/x-eprint', 40360), ('application/pdf', 18292), ('application/vnd.openxmlformats-officedocument.wordprocessingml.document', 218), ('application/postscript', 47)]


In [3]:
good_papers = set()
bad_papers = set()

if os.path.exists('./good_papers.lst'):
    with open('./good_papers.lst', 'r') as f:
        good_papers = set(line.strip() for line in f)
if os.path.exists('./bad_papers.lst'):
    with open('./bad_papers.lst', 'r') as f:
        bad_papers = set(line.strip() for line in f)

print('Good papers', len(good_papers))
print('Bad papers', len(bad_papers))


Good papers 15140
Bad papers 193419

In [ ]:
# def check_archive_func(fname):
#     return (fname,
#             contains_something_interesting(fname, get_table_info))

# archive_files_with_check_res = joblib.Parallel(n_jobs=12)(joblib.delayed(check_archive_func)(fname)
#                                                           for fname in archive_files
#                                                           if not (fname in bad_papers or fname in good_papers))
# for fname, is_good in archive_files_with_check_res:
#     if is_good:
#         good_papers.add(fname)
#     else:
#         bad_papers.add(fname)

In [ ]:
# with open('./good_papers.lst', 'w') as f:
#     f.write('\n'.join(sorted(good_papers)))
# with open('./bad_papers.lst', 'w') as f:
#     f.write('\n'.join(sorted(bad_papers)))

Apply pipeline to some papers


In [ ]:
ARXIV_INOUT_PAIRS_DIR = './data/arxiv/inout_pairs/'

def _pdf2samples_mp(archive):
    try:
        pdf2samples(archive,
                    ARXIV_INOUT_PAIRS_DIR,
                    lambda s: get_table_info(s, extract_cells=False),
                    aggregate_object_bboxes)
    except Exception as ex:
        # Log the failure per archive instead of aborting the whole parallel run.
        with open(os.path.join(ARXIV_INOUT_PAIRS_DIR, os.path.basename(archive) + '.log'), 'w') as f:
            f.write(str(ex) + '\n')
            f.write(traceback.format_exc())

_ = joblib.Parallel(n_jobs=10)(joblib.delayed(_pdf2samples_mp)(arc)
                               for arc in good_papers)
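
Failures end up as per-archive .log files next to the generated samples, so a quick post-run check is simply counting them (assuming the directory layout above):

import glob
import os

failed_logs = glob.glob(os.path.join(ARXIV_INOUT_PAIRS_DIR, '*.tar.gz.log'))
print('Archives that failed:', len(failed_logs))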
