Extract the text from PDF files with the textbox module


In [1]:
import textbox

In [2]:
# Reload the local textbox module so that edits made to textbox.py after the
# initial import are picked up without restarting the kernel.
import importlib
importlib.reload(textbox)


Out[2]:
<module 'textbox' from '/home/benjamin/Documents/eviacybernetics/Projets/OCR/textbox.py'>

In [3]:
# Run textbox.extract_text on the given directory and keep the result in df.
# In the recorded run the progress bar counts 14721 items and the frame shown
# in Out[4] has 14721 rows with index / filename / text columns.
# NOTE(review): the same filename appears on several consecutive rows, so each
# row is presumably one page of a document -- confirm in textbox.py.
# NOTE(review): the input is an absolute path on a local/external drive; the
# notebook will not run elsewhere without editing it.
df = textbox.extract_text('/media/benjamin/Elements/pdfs/txt')


100%|██████████| 14721/14721 [01:16<00:00, 192.60it/s]  | 35/14721 [00:00<00:42, 345.55it/s]

In [8]:
# Post-process the extracted frame: first merge_pages, then text_properties.
# NOTE(review): presumably merge_pages collapses the rows of one file into a
# single row indexed by filename, and text_properties adds the text_length,
# text_list and nb_words fields seen in Out[20] -- confirm in textbox.py.
# NOTE(review): df is overwritten by its own result, so re-running this cell
# feeds already-processed data back into merge_pages; re-run the extraction
# cell first if this cell has to be executed again.
df = textbox.merge_pages(df)
df = textbox.text_properties(df)

In [9]:
# Write the processed frame to ./texts1.pkl (path relative to the notebook's
# working directory) using textbox.save.
# NOTE(review): the .pkl extension suggests a pickle file -- confirm the
# on-disk format in textbox.py.
textbox.save(df,'./texts1.pkl')

In [4]:
# Display the frame (the notebook shows a truncated view of the 14721 rows).
# NOTE(review): the recorded output carries execution count 4, i.e. it was
# produced before the merge_pages / text_properties cell (count 8) ran, so it
# shows the raw extraction result rather than the processed frame.
df


Out[4]:
index filename text
0 0.0 SCAN_2016_11_03_140924 \n\n' cReonAGRICOLE llllllllllllllllllll|ll||...
1 1.0 SCAN_2016_11_03_140924 \n\n!\n\ncRenwAGRICOLE llllllllllllllllllllll...
2 2.0 SCAN_2016_11_03_140924 \n\n' cRED'TAGR'COLE ||ll|llllllllllllll||||l...
3 3.0 SCAN_2016_11_03_140924 \n\n \n\nLoi Informatique. Fichiers et Libert...
4 4.0 SCAN_2016_11_03_140924 d\n\nCREDIT AGRICOLE »\n_ DESSAvore liililllll...
5 5.0 secourismeBenjaminR !Ël '\n\n{iii :\n\nPQOË\n‘5‘)\n\n \n\nmama/:02...
6 6.0 EVIAN_LES_BAINS-PARIS_GARE_LYON_21-10-16_RICAU... VOTRE E-BILLET\n\nEVIAN LES BAINS > PARIS GARE...
7 7.0 EVIAN_LES_BAINS-PARIS_GARE_LYON_21-10-16_RICAU... VOTRE E-BILLET\n\nEVIAN LES BAINS > PARIS GARE...
8 8.0 EVIAN_LES_BAINS-PARIS_GARE_LYON_21-10-16_RICAU... VOTRE E-BILLET\n\nPARIS GARE LYON > EVIAN LES ...
9 9.0 sfr-facture-09-B516-013324644 BZBMOADSV066V038V070\n\nVotre facture du 07/12...
10 10.0 sfr-facture-09-B516-013324644 \n\n \n\nDES QUESTIONS? Pour information\n\nR...
11 11.0 statuttype MODELE de STATUTS pour une ASSOCIATION d’ANCIE...
12 12.0 statuttype ARTICLE 6 - Ressources\n\nLes ressources de l’...
13 13.0 SCAN_2016_11_03_140847 \n\nL AUTO-CERTIFICATION DE RESIDENCE FISCALE...
14 14.0 SCAN_2016_11_03_140847 {\n\nCREDIT AGRICOLE\n_: DES…Œ lllllllllllllll...
15 15.0 SCAN_2016_11_03_140847 I\n_ SEEM?“ llllilllililillllilill…iiilllillll...
16 16.0 attestation(1) à\npâte emptoi\n\nM. RICAUD BENJAMIN\n\n253 RU...
17 17.0 imprimante-hp-envy-4522-compatible-instant-ink... @ HP ENVY série 4520 tout—en—un\n\n
18 18.0 imprimante-hp-envy-4522-compatible-instant-ink...
19 19.0 imprimante-hp-envy-4522-compatible-instant-ink... \n\nSommaire\n\n \n\n \n\n \n\n \n\n \n\n1 Co...
20 20.0 imprimante-hp-envy-4522-compatible-instant-ink... Conseils pour l'utilisation des Services Web ....
21 21.0 imprimante-hp-envy-4522-compatible-instant-ink... \n\nFRWW\n\nComment faire ?\n\nApprenez a uti...
22 22.0 image2016-11-10-154822
23 23.0 image2016-11-10-154822 \n\n‘ _.r'vn: -._—-._—-_— —— -\n\n'10 janvier...
24 24.0 STATUTSDELAPE.doc \n\nSTATUTS DE L'ASSOCIATION DE PARENTS D'ELE...
25 25.0 STATUTSDELAPE.doc 3.1 - La qualité de parent d’élève, membre de ...
26 26.0 STATUTSDELAPE.doc 6.1 — Composition et pouvoirs\nL’APE est admin...
27 27.0 STATUTSDELAPE.doc trésorier-adjoint.\n\n8.1 — Le Président\n\nLe...
28 28.0 STATUTSDELAPE.doc adressées au moins dix jours à l’avance par vo...
29 29.0 statutsEvias Evia Cybernetics\n\nSOCIÉTÉ PAR ACTIONS SIMPLI...
... ... ... ...
14691 14691.0 collection \n\n \n\n
14692 14692.0 figure_today 4.0\n\n3.5\n\n3.0\n\n2.5\n\n2.0\n\n1.5\n\n1.0\...
14693 14693.0 figure_suptitle 1.0\n\n0.8\n\n0.6\n\n0.4\n\n0.2\n\n0%.\n\nwe\n...
14694 14694.0 fancyarrow_test_image
14695 14695.0 move
14696 14696.0 subplots
14697 14697.0 back \n\n
14698 14698.0 hand
14699 14699.0 matplotlib
14700 14700.0 qt4_editor_options
14701 14701.0 home
14702 14702.0 filesave
14703 14703.0 forward
14704 14704.0 zoom_to_rect
14705 14705.0 divider_append_axes 120\n100\n80\n60\n40\n20\n\n \n\n \n\n \n\n \n...
14706 14706.0 quiver3d \n\n \n\n
14707 14707.0 quiver3d_empty
14708 14708.0 wireframe3d \n\n \n\n
14709 14709.0 lines3d .—\n\n
14710 14710.0 quiver3d_masked \n\n \n\n
14711 14711.0 mixedsubplot \n\n \n\n \n\n \n\n \n\n
14712 14712.0 bar3d \n\n \n\n
14713 14713.0 contour3d
14714 14714.0 surface3d \n\n \n\n
14715 14715.0 trisurf3d \n\n
14716 14716.0 contourf3d \n\n \n\n
14717 14717.0 contourf3d_fill
14718 14718.0 text3d 2 D Text\n\n«> {»\n(z,,gM), dir=N99Qé5\n//\ \\...
14719 14719.0 scatter3d .—\n\n
14720 14720.0 xpdfimport_err This PDF file is encrypted and can't be opened....

14721 rows × 3 columns


In [26]:
# Index labels of every document whose nb_words value is at most 5
# (near-empty extractions), returned as a plain Python list.
df.index[df['nb_words'] <= 5].tolist()


Out[26]:
['CIOlympe',
 'legend_expand',
 'fig_stft_tiling',
 'hist_stacked_step',
 'markevery_linear_scales',
 'fig_wavelet_tiling',
 'fig_wft_tiling',
 'fig_dilated_kernels3-eps-converted-to',
 'boxplot_rc_parameters',
 'fig_swiss_g_62_450_spectral',
 'markevery_log_scales',
 'skew_rects',
 'markevery_linear_scales_zoomed',
 'fig_wavelet_tiling2',
 'docgraphs',
 'docgraphsorig',
 'example-grid-100x100bp',
 'diagrams',
 'fig_trans_norms_grid',
 'example-image-16x10',
 'euscript',
 'grSQCycle',
 'vline_hline_zorder',
 'example-image-1x1',
 'symlog2',
 'example-image-9x16',
 'fig_swiss_g_62_983_spectral',
 'fig_wgft_gik_hat',
 'example-image-10x16',
 'poster2',
 'dpe-11',
 'example-image-16x9',
 'im12',
 'fig_swiss_g_62_100_spectral',
 'test3-pdf',
 'Fig1c',
 'fig_minn_heat',
 'pcolormesh',
 'fig_swiss_atoms_gsd1f',
 'drawing',
 'mathtext_stixsans_37',
 'plotgaussian1-eps-converted-to',
 'plotgaussian2-eps-converted-to',
 'dpe-10',
 'transparent-final',
 'bbox_inches_tight',
 'fig_swiss_gsd3f',
 't2-ghsb',
 'Fig2a',
 'fig_atom_mod1',
 'scatter_rc1',
 'streamplot_colormap',
 'PoleZeros',
 'fig_classical_atom2',
 'background5',
 'fig_classical_atom1',
 'fig_classical_atom3',
 'something',
 'contour_manual_labels',
 'mathtext_stix_21',
 'fig_clustering_filters',
 'mathtext_dejavusans_21',
 'background3',
 'Fig5',
 'stackplot_test_baseline',
 'foreground2',
 'mathtext_stix_53',
 'mathtext_stixsans_53',
 'fig_atoms_b',
 'mathtext_stix_80',
 'mathtext_cm_53',
 'patheffect1',
 'mathtext_cm_52',
 'mp',
 'mpgraph',
 'foreground1',
 'mathtext_dejavuserif_21',
 'framealpha',
 'system2',
 'mathtext_cm_21',
 'mathtext_dejavusans_75',
 'mathtext_dejavuserif_73',
 'mathtext_dejavuserif_74',
 'spines_axes_positions',
 'mathtext_cm_73',
 'image_alpha',
 'mathtext_dejavusans_73',
 'mathtext_dejavuserif_75',
 'mathtext_stixsans_21',
 'mathtext_cm_74',
 'mathtext_dejavusans_74',
 'mathtext_dejavusans_41',
 'mathtext_dejavuserif_22',
 'genezik',
 'mathtext_stixsans_22',
 'rgba_markers',
 'mathtext_stixsans_20',
 'fig_swiss_gsd1f',
 'result-picture',
 'mathtext_dejavuserif_41',
 'roundtri',
 'mathtext_cm_22',
 'mathtext_dejavusans_22',
 'background4',
 'mathtext_stix_22',
 'figS',
 'mathtext_stix_75',
 'fig3',
 'tiny_example_1',
 'mathtext_cm_41',
 'example-image-golden-upright',
 'graphspectrogram1',
 'circle',
 'legend_auto1',
 'mathtext_stix_41',
 'mathtext_cm_75',
 'skew_axes',
 'nqueen',
 'dpe-9',
 'chrysant',
 'mathtext_dejavuserif_53',
 'polar_theta_position',
 'mathtext_stixsans_68',
 'mathtext_stixsans_54',
 'mathtext_cm_68',
 'mathtext_cm_38',
 'mathtext_dejavusans_68',
 'background1',
 'mathtext_dejavuserif_68',
 'mathtext_stix_52',
 'mathtext_stixsans_41',
 'mathtext_stix_68',
 'fill_between_interpolate',
 'hist_steplog',
 'hist_stacked_stepfilled_alpha',
 'patheffect3',
 'mathtext_dejavusans_53',
 'hist_stacked_stepfilled',
 'polar_units',
 'mathtext_cm_09',
 'low_pass_filter',
 'offset_points',
 'buttrfly',
 'spiral',
 'log_scales',
 'colors',
 'streamplot_startpoints',
 'gr-edgeingraphmodloop',
 'streamplot_masks_and_nans_test_image',
 'earth-moon',
 'mathtext_stix_48',
 'mathtext_stix_47',
 'mathtext_cm_23',
 'fig_swiss_g_62_100_vertex',
 'patch_alpha_coloring',
 'mathtext_stixsans_38',
 'polar_coords',
 'para_equal_perp',
 'activity',
 'polar_rlabel_position',
 'mathtext_cm_05',
 'mathtext_stixsans_31',
 'mathtext_stixsans_05',
 'mathtext_dejavuserif_23',
 'mathtext_dejavusans_23',
 'mathtext_stix_05',
 'mathtext_stix_06',
 'nonfinite_limits',
 'mathtext_stix_23',
 'mathtext_cm_06',
 'mathtext_cm_67',
 'mathtext_dejavusans_05',
 'mathtext_dejavuserif_06',
 'mathtext_dejavuserif_05',
 'polar_rmin',
 'pasted-image-155',
 'mathtext_dejavuserif_70',
 'fig_classical_sliding3',
 'noticeLegoPolice',
 'mathtext_stixsans_23',
 'mathtext_dejavuserif_67',
 'grWheel',
 'example-image',
 'mathtext_stixsans_40',
 'mathtext_dejavusans_40',
 'mathtext_stix_38',
 'mathtext_dejavusans_38',
 'ekflogo',
 'mathtext_cm_31',
 'mathtext_dejavusans_06',
 'mathtext_dejavusans_31',
 'legend_auto2',
 'mathtext_dejavusans_70',
 'xkcd',
 'mathtext_cm_79',
 'mathtext_stix_40',
 'mathtext_stixsans_19',
 'mathtext_cm_40',
 'fig_path_signal',
 'mathtext_stixsans_60',
 'mathtext_cm_60',
 'mathtext_stixsans_70',
 'vector',
 'mathtext_dejavusans_60',
 'mathtext_dejavuserif_60',
 'fs',
 'mathtext_dejavuserif_38',
 'fig_trans_norms_comet',
 'mathtext_stixsans_06',
 'mathtext_stix_67',
 'fig_swiss_g_62_450_vertex',
 'fig_ring_red',
 'labelgraph',
 'test_alpha',
 'truetype-conversion',
 'incgraph-example-b',
 'incgraph-example-c',
 'image_cliprect',
 'incgraph-example-a',
 'fig_minn_heat_mod',
 'mixedsubplot',
 'mathtext_stix_70',
 'mathtext_dejavusans_67',
 'mathtext_stix_79',
 'overflow',
 'background2',
 'mathtext_dejavuserif_31',
 'mathtext_stix_29',
 'patch_custom_linestyle',
 'doclicense-CC-zero',
 'doclicense-CC-pd',
 'bbox_inches_tight_clipping',
 'mathtext_dejavusans_29',
 'mathtext_stixsans_28',
 'mathtext_stixsans_29',
 'colArray',
 'mathtext_cm_70',
 'mathtext_cm_29',
 'mathtext_cm_27',
 'mathtext_dejavusans_27',
 'mathtext_stixsans_03',
 'mathtext_stixsans_27',
 'transistor',
 'polar_wrap_360',
 'unclassified',
 'mathtext_dejavuserif_29',
 'confidential',
 'mathtext_cm_03',
 'mathtext_stix_27',
 'mathtext_stix_31',
 'mathtext_stix_03',
 'mathtext_stix_09',
 'mathtext_dejavuserif_27',
 'polar_wrap_180',
 'wavelet_1',
 'mathtext_dejavuserif_40',
 'line',
 'photomayaPS',
 'syracuse',
 'syracuse-crop',
 'Fig13',
 'mathtext_stix_71',
 'mathtext_cm_71',
 'mathtext_dejavusans_09',
 'mathtext_cm_76',
 'pst-doc',
 'wedge_range',
 'mathtext_stix_02',
 'mathtext_dejavusans_07',
 'topsecret',
 'classified',
 'mathtext_dejavusans_56',
 'dash_offset',
 'mathtext_stixsans_07',
 'mathtext_stix_26',
 'mathtext_dejavuserif_07',
 'mathtext_stixsans_02',
 'mathtext_stix_76',
 'mathtext_dejavusans_76',
 'mathtext_dejavusans_02',
 'fig_T1000',
 'helix',
 'mathtext_stix_60',
 'mathtext_dejavuserif_02',
 'mathtext_dejavuserif_09',
 'standard',
 'mathtext_dejavuserif_00',
 'mathtext_dejavuserif_76',
 'mathtext_cm_39',
 'Fig.2',
 'no_interpolation_origin',
 'dpe-8',
 'mathtext_stixsans_76',
 'mathtext_cm_63',
 'mathtext_cm_07',
 'mathtext_cm_02',
 'fig_minn_T1000',
 'pgf_bbox_inches',
 'eLaToutline',
 'mathtext_stix_00',
 'wiki_memory',
 'mathtext_dejavusans_03',
 'mathtext_cm_10',
 'mathtext_stixsans_09',
 'mathtext_stix_36',
 'mathtext_dejavusans_49',
 'mathtext_stixsans_10',
 'mathtext_cm_00',
 'mathtext_stix_10',
 'mathtext_cm_36',
 'mathtext_stixsans_39',
 'mathtext_stixsans_00',
 'mathtext_stix_07',
 'mathtext_stix_28',
 'dpe-5',
 'dpe-6',
 'mathtext_dejavuserif_03',
 'mathtext_dejavusans_00',
 'mathtext_stixsans_71',
 'mathtext_dejavuserif_63',
 'mathtext_dejavusans_71',
 'mathtext_stixsans_44',
 'mathtext_dejavuserif_36',
 'mathtext_stix_72',
 'zedat',
 'mathtext_cm_32',
 'permisLahille',
 'mathtext_dejavusans_36',
 'mathtext_cm_62',
 'mathtext_stixsans_72',
 'mathtext_dejavuserif_62',
 'mathtext_cm_69',
 'mathtext_dejavusans_10',
 'mathtext_stixsans_32',
 'mathtext_cm_72',
 'mathtext_stixsans_36',
 'mathtext_dejavuserif_10',
 'mathtext_stix_32',
 'dpe-7',
 'mathtext_dejavuserif_25',
 'fig_atoms_a',
 'fig_atoms_c',
 'turtle',
 'mathtext_cm_18',
 'photoAnthoninMS3',
 'patheffect2',
 'Fig2b',
 'mathtext_stix_45',
 'mathtext_dejavuserif_49',
 'mathtext_dejavuserif_57',
 'fig_minn_scaling',
 'Fig4',
 'grCLadder',
 'mathtext_dejavuserif_79',
 'tight_layout5',
 'mathtext_cm_25',
 'Fig1a',
 'Fig1b',
 'mathtext_dejavusans_62',
 'mathtext_cm_49',
 'fig_swiss_vertex3',
 'mathtext_dejavusans_57',
 'bbox_image_inverted',
 'button',
 'mathtext_dejavusans_44',
 'mathtext_dejavusans_39',
 'mathtext_stixsans_18',
 'random_memory',
 'mathtext_stixsans_26',
 'mathtext_dejavusans_26',
 'mathtext_dejavusans_25',
 'fig_swiss_atoms2',
 'mathtext_stixsans_57',
 'image3-eps-converted-to',
 'mathtext_cm_26',
 'mathtext_stixsans_25',
 'poles_zeros',
 'root',
 'mathtext_stix_57',
 'stackplot_test_image',
 'mathtext_stix_63',
 'dpe-4',
 'dpe-2',
 'dpe-3',
 'rotate_image',
 'interp_nearest_vs_none',
 'hist_stacked_weights',
 'fig_trans_norms_minn',
 'mathtext_dejavusans_63',
 'mathtext_stix_13',
 'mathtext_stixsans_13',
 'bclogo',
 'mathtext_stix_25',
 'mathtext_stix_64',
 'pst-uml-encapsuled-pdf-fig',
 'mathtext_stix_62',
 'mathtext_stixsans_64',
 'mathtext_stixsans_62',
 'mathtext_stix_39',
 'mathtext_stix_18',
 'secret',
 'mathtext_dejavusans_08',
 'mathtext_dejavuserif_13',
 'mathtext_dejavuserif_45',
 'mathtext_cm_64',
 'mathtext_dejavusans_13',
 'mathtext_dejavusans_18',
 'mathtext_dejavuserif_18',
 'mathtext_dejavuserif_39',
 'mathtext_dejavuserif_33',
 'mathtext_dejavuserif_64',
 'd2tpstexamples',
 'mathtext_cm_13',
 'mathtext_dejavusans_64',
 'mathtext_stixsans_08',
 'mathtext_dejavuserif_08',
 'mathtext_stix_33',
 'doclicense-CC-by-nc',
 'mathtext_stixsans_33',
 'mathtext_stix_66',
 'mathtext_dejavuserif_32',
 'mathtext_dejavusans_32',
 'mathtext_dejavuserif_71',
 'rasterize_10dpi',
 'mathtext_cm_58',
 'mathtext_cm_08',
 'epfl_logo',
 'mathtext_stix_08',
 'mathtext_stix_61',
 'mathtext_stixsans_66',
 'mathtext_cm_65',
 'pic',
 'heat_tau_25',
 'heat_tau_50',
 'heat_tau_10',
 'wavelet_filtering',
 'mathtext_dejavusans_66',
 'mathtext_stixsans_01',
 'fig_minn_graph',
 'fig_trans7',
 'palm',
 'shema_richmont',
 '3dsystem',
 'wireframe3d',
 'mathtext_dejavusans_45',
 'mathtext_cm_66',
 'fig_joint_counter1',
 'mathtext_cm_51',
 'bar3d',
 'mathtext_stixsans_45',
 'colorbar_closed_patch',
 'mathtext_stix_35',
 'put',
 'heat_tau_1',
 'wavelet_4',
 'quiver3d_masked',
 'quiver3d',
 'fithesis-fi-color',
 'surface3d',
 'mathtext_dejavuserif_66',
 'collection',
 'contourf3d',
 'wavelet_3',
 'wavelet_2',
 'fithesis-fi',
 'mathtext_stixsans_65',
 'mathtext_dejavuserif_43',
 'mathtext_stixsans_51',
 'mathtext_stixsans_59',
 'mathtext_stixsans_69',
 'mathtext_stix_58',
 'mathtext_dejavusans_01',
 'mathtext_stixsans_43',
 'mathtext_stixsans_35',
 'grandcanyon',
 'mathtext_stix_59',
 'mathtext_cm_59',
 'mathtext_dejavuserif_51',
 'mathtext_cm_57',
 'doclicense-CC-by-sa',
 'mathtext_dejavuserif_59',
 'mathtext_cm_43',
 'mathtext_stix_01',
 'mathtext_dejavuserif_65',
 'mathtext_cm_14',
 'mathtext_cm_12',
 'mathtext_cm_01',
 'mathtext_dejavuserif_58',
 'mathtext_dejavusans_51',
 'mathtext_stix_43',
 'mathtext_dejavusans_58',
 'mathtext_stix_42',
 'mathtext_stix_69',
 'mathtext_dejavuserif_69',
 'mathtext_stix_65',
 'mathtext_dejavusans_43',
 'mathtext_dejavusans_35',
 'mathtext_dejavusans_69',
 'mathtext_dejavusans_65',
 'mathtext_dejavuserif_01',
 'mathtext_dejavuserif_04',
 'lines3d',
 'mathtext_stixsans_04',
 'mathtext_dejavusans_59',
 'mathtext_stix_14',
 'compatibility',
 'mathtext_stix_51',
 'scatter3d',
 'doclicense-CC-by-nd',
 'instructions-differential',
 'mathtext_stixsans_14',
 'im06',
 'doclicense-CC-by-nc-eu',
 'mathtext_dejavusans_14',
 'mail_europass_icon',
 'mathtext_cm_04',
 'mathtext_dejavusans_04',
 'mathtext_stixsans_42',
 'mathtext_dejavuserif_14',
 'mathtext_dejavuserif_42',
 'beamernavsymbols',
 'fancytipmark4',
 'LTS_logo',
 'W',
 'fig_clustering_wgft',
 'draft',
 'Montreux_Jazz_s_dataset',
 'Wood-Brown',
 'fig_joint_counter3',
 'fig_clustering_actual',
 'fancytipmark1',
 'Foto',
 'vecb6',
 'IndianBlanket',
 'default-testpage',
 'europasslogo',
 'vecb5',
 'contour_hatching',
 '7_Multitone_example-optimalWinLatShear_zoom',
 '7_Multitone_example-optimalWinLatNoShear_zoom',
 'button1c',
 'trisurf3d',
 'beamerexample-lecture-logo',
 'back',
 'buttongc',
 'beamer2',
 'beamer1',
 'beamer0',
 'tiger',
 'fancytipmark',
 'fig_joint_counter2',
 'contour_colorbar',
 'pgf_mixedmode',
 'fpnummodule-mandelbrot',
 'ifmlogoc',
 'overlay4',
 'fond',
 'overlay3',
 'overlay2',
 'summa',
 'overlay10',
 'grDoubleMod',
 'overlay1',
 'shorthyp_t1xtts',
 'graphspectrogram2',
 'mathtext_dejavusans_34',
 'noscale',
 'ifmlogog',
 'mathtext_cm_42',
 'mollweide_grid',
 'fithesis-phil',
 'lahilleCP005',
 'metalgray',
 'metagreen',
 'hist2d',
 'mathtext_dejavuserif_34',
 'metagray',
 'mathtext_dejavuserif_35',
 'metablue',
 'hypercube',
 'lts2_logo',
 'spines_capstyle',
 'lts4logo2',
 'lvb',
 'markevery_polar',
 'fithesis-phil-color',
 'hist2d_transpose',
 'image_composite_background',
 'fithesis-fsps',
 'put2',
 'fig_sensor_signal6',
 'overlay9',
 'overlay8',
 'fithesis-base',
 'fithesis-base-color',
 'overlay5',
 'fig_sensor_signal',
 'BZ1',
 'fithesis-econ',
 'photoAnthoninMS',
 'fithesis-econ-color',
 'fithesis-fsps-color',
 'fithesis-fss-color',
 'tex',
 'overlay7',
 'fithesis-law-color',
 'fithesis-law',
 'fithesis-fss',
 'fig_sensor_strip',
 'mathtext_stix_30',
 'mathtext_dejavuserif_61',
 'mathtext_stix_24',
 'warn',
 'mathtext_stix_34',
 'mathtext_dejavuserif_72',
 'mathtext_stix_44',
 'mathtext_stix_46',
 'mathtext_stix_49',
 'quiver3d_empty',
 'mathtext_dejavuserif_77',
 'mathtext_stix_19',
 'qt4_editor_options',
 'mathtext_stix_04',
 'clipping_diamond',
 'clipping',
 'clipper_edge',
 'mathtext_stix_11',
 'clip_path_clipping',
 'mathtext_dejavuserif_56',
 'mathtext_stix_12',
 'mathtext_stix_17',
 'mathtext_stix_15',
 'mathtext_stix_16',
 'fig_minn_signal',
 'mathtext_dejavuserif_26',
 'mathtext_dejavuserif_55',
 'mathtext_dejavuserif_48',
 'BZ4',
 'BZ3',
 'mathtext_dejavusans_79',
 'mathtext_dejavusans_77',
 'BZ2',
 'mathtext_dejavusans_72',
 'BZ10',
 'mathtext_dejavusans_61',
 'mathtext_dejavusans_55',
 'mathtext_dejavusans_50',
 'mathtext_dejavusans_48',
 'mathtext_dejavusans_47',
 'default',
 'mathtext_dejavusans_46',
 'Figure2',
 'BZ5',
 'BZ6',
 'BZ7',
 'contourf3d_fill',
 'mathtext_dejavuserif_47',
 'mathtext_dejavuserif_46',
 'mathtext_dejavuserif_44',
 'contour3d',
 'BZ8',
 'mathtext_dejavuserif_30',
 'mathtext_dejavuserif_28',
 'mathtext_dejavuserif_11',
 'mathtext_stix_50',
 'mathtext_dejavuserif_24',
 'mathtext_dejavuserif_17',
 'mathtext_dejavuserif_16',
 'mathtext_dejavuserif_15',
 'mathtext_dejavuserif_12',
 'buttonge',
 'blank',
 'mathtext_stix_54',
 'matplotlib',
 'b6of3',
 'b6of4',
 'overlay0',
 'background-eps-converted-to',
 'overlay',
 'outward_ticks',
 'offsetbox_clipping',
 'xor',
 'news_bgr',
 'move',
 'mobile_europass_icon',
 'mixed_collection',
 'Back_2015',
 'bbox_inches_tight_raster',
 'xfig325',
 'b6of2',
 'b6of1',
 'autoscale_tiny_range',
 'pasted-image-149',
 'pgfmanual-mindmap-1',
 'pgfmanual-mindmap-2',
 'zoom_to_rect',
 'TeXoutline',
 'patch_alpha_override',
 'pasted-image-160',
 'pasted-image-140',
 'arc_ellipse',
 'pasted-image-135',
 'address_europass_icon',
 'overlay6',
 'polycollection_joinstyle',
 'polygon',
 'ITS_logo',
 'BZ9',
 'mathtext_stixsans_79',
 'mathtext_stix_55',
 'mathtext_stixsans_77',
 'mathtext_stixsans_34',
 'mathtext_stixsans_30',
 'mathtext_stixsans_24',
 'but',
 'mathtext_stixsans_17',
 'website_europass_icon',
 'mathtext_stixsans_16',
 'mathtext_stixsans_15',
 'button1e',
 'buttonec',
 'buttonee',
 'mathtext_stixsans_12',
 'mathtext_stixsans_11',
 'mathtext_stix_77',
 'mathtext_stix_56',
 'mathtext_stixsans_46',
 'mathtext_stixsans_49',
 'mathtext_stixsans_50',
 'beamericonbook.20',
 'mathtext_stixsans_75',
 'mathtext_stixsans_67',
 'pst',
 'beamericonarticle',
 'beamericonarticle.20',
 'beamericonbook',
 'beamericononline',
 'mathtext_stixsans_55',
 'beamericononline.20',
 'beamerlogo',
 'mathtext_stixsans_63',
 'mathtext_stixsans_61',
 'mathtext_stixsans_58',
 'mathtext_stixsans_56',
 'mathtext_dejavusans_42',
 'mathtext_cm_61',
 'mathtext_dejavusans_33',
 'fibeamer-mu-fss-english',
 'fibeamer-mu-econ-czech',
 'fibeamer-mu-econ-english',
 'fibeamer-mu-fi-czech',
 'fibeamer-mu-fi-english',
 'fibeamer-mu-fsps-czech',
 'fibeamer-mu-fsps-english',
 'fibeamer-mu-fss-czech',
 'fibeamer-mu-law-czech',
 'fancytipmark3',
 'fibeamer-mu-law-english',
 'fibeamer-mu-med-czech',
 'hilbertcurves',
 'hatching_legend',
 'fibeamer-mu-med-english',
 'fibeamer-mu-ped-czech',
 'fibeamer-mu-ped-english',
 'fft_peaks',
 'fancytipmark2',
 'im13',
 'example-image-c',
 'im10',
 'im1',
 'im05',
 'im04',
 'im03',
 'example-image-a',
 'example-image-b',
 'im02',
 'hist_log',
 'im01',
 'im0',
 'hypercubed',
 'hypercube_simple',
 'houses',
 'home',
 'fancyarrow_test_image',
 'fibeamer-mu-phil-czech',
 'fibeamer-mu-phil-english',
 'fibeamer-mu-sci-czech',
 'telephone_europass_icon',
 'fithesis-ped',
 'fithesis-med-color',
 'fithesis-med',
 'filesave',
 'fig_wgft_f',
 'fig_wgft_Tig',
 'fig_translated',
 'fig_path_signal_300',
 'fibeamer-mu-sci-english',
 'fig_path_signal3',
 'fig_path_signal2',
 'fig_path_red',
 'fig_path_graph_scale',
 'fig_minn_T2000',
 'fig_minn_translated',
 'fig_minn_signal_3',
 'fithesis-ped-color',
 'fithesis-sci',
 'fithesis-sci-color',
 't-ghsb',
 'hatch_simplify',
 'hand',
 'stepHidacs',
 'grid',
 'grenoble',
 'fig_T2000',
 'grayscale_alpha',
 'EventCollection_plot__set_ls_dash',
 'grStar',
 'grLadder',
 'subplots',
 'gnuplot42',
 'genealogytree-example-2',
 'genealogytree-example-1',
 'forward',
 'im11',
 'im14',
 'mathtext_dejavusans_30',
 'mathtext_cm_34',
 'dpe-0',
 'dpe-1',
 'mathtext_cm_47',
 'mathtext_cm_46',
 'mathtext_cm_45',
 'mathtext_cm_44',
 'mathtext_cm_35',
 'mathtext_cm_33',
 'mathtext_cm_50',
 'mathtext_cm_30',
 'mathtext_cm_28',
 'mathtext_cm_24',
 'mathtext_cm_19',
 'mathtext_cm_17',
 'mathtext_cm_16',
 'mathtext_cm_15',
 'mathtext_cm_48',
 'mathtext_cm_55',
 'example-grid-100x100pt',
 'mathtext_dejavusans_15',
 'mathtext_dejavusans_28',
 'Fig9',
 'sample-bxeepic',
 'mathtext_dejavusans_24',
 'mathtext_dejavusans_19',
 'mathtext_dejavusans_17',
 'mathtext_dejavusans_16',
 'doclicense-CC-by',
 'mathtext_cm_56',
 'mathtext_dejavusans_12',
 'mathtext_dejavusans_11',
 'doclicense-CC-by-nc-nd-eu',
 'doclicense-CC-by-nc-sa',
 'mathtext_cm_77',
 'doclicense-CC-by-nc-sa-eu',
 'fig_minn_signal2',
 'mathtext_cm_11',
 'mask_image',
 'marker_paths',
 'im4',
 'im_europass_icon',
 'im9',
 'im8',
 'im7',
 'transparent_markers',
 'im6',
 'im5',
 'europasslogo2013',
 'logo',
 'im3',
 'im2',
 'eventplot',
 'smallgraph',
 'example',
 'im16',
 'im15',
 'image',
 'image1-eps-converted-to',
 'image4-eps-converted-to',
 'image_composite_alpha',
 'log_scale_image',
 'line_dashes',
 'keystroke_right',
 'keystroke_middle',
 'keystroke_left',
 'EuropeFlagBW',
 'EuropeFlagBlueCMYK',
 'Fig14',
 'EuropeFlagCMYK',
 'info',
 'EuropeFlagWB',
 'imshow',
 'image_shift',
 'simplify_curve',
 'sine',
 'marker_edges']

In [19]:
# Raw extracted text stored for the document labelled 'CIOlympe'.
df.loc['CIOlympe', 'text']


Out[19]:
' \n\nIDFla.:cauo<<<<<<<<<<<<<<<<<<<762l71\ntoturbzozszzoocvure<<<<<<<<ibooirzro\n\n \n\n \n\n \n\n \n\n'

In [20]:
# Show every field stored for the 'CIOlympe' document; in the recorded output
# these are text, text_length, text_list and nb_words.
df.loc['CIOlympe']


Out[20]:
text            \n\nIDFla.:cauo<<<<<<<<<<<<<<<<<<<762l71\ntot...
text_length                                                   90
text_list      [IDFla.:cauo<<<<<<<<<<<<<<<<<<<762l71, toturbz...
nb_words                                                       2
Name: CIOlympe, dtype: object

In [31]:
# Time a single call to textbox.auto_extract. According to the log recorded
# below, it extracts the texts from the input directory, processes them,
# writes the list of empty files to extract_log.csv and saves the result to
# ./texts2.pkl.
# NOTE(review): the input is the same hard-coded absolute path used by the
# extract_text cell; keep the two in sync if the data location changes.
%time textbox.auto_extract('/media/benjamin/Elements/pdfs/txt','./texts2.pkl')


Extracting the texts from /media/benjamin/Elements/pdfs/txt
Extraction done.
Processing the text...
list of empty files written in extract_log.csv
Saving to file ./texts2.pkl
CPU times: user 27 s, sys: 1.09 s, total: 28.1 s
Wall time: 58.2 s

In [ ]: