In [1]:
import textbox
In [2]:
# Re-import the already-loaded `textbox` module so that edits made to
# textbox.py since the first import take effect without restarting the kernel.
import importlib
importlib.reload(textbox)
Out[2]:
<module 'textbox' from '/home/benjamin/Documents/eviacybernetics/Projets/OCR/textbox.py'>
In [3]:
# Directory handed to textbox.extract_text; the progress bar recorded for this
# cell reports 14721 items processed.
# NOTE(review): absolute, machine-specific path -- adjust before re-running elsewhere.
txt_dir = '/media/benjamin/Elements/pdfs/txt'
df = textbox.extract_text(txt_dir)
100%|██████████| 14721/14721 [01:16<00:00, 192.60it/s] | 35/14721 [00:00<00:42, 345.55it/s]
In [8]:
# Merge the extracted rows with textbox.merge_pages, then let
# textbox.text_properties derive the per-document properties; `df` is rebound
# to the final result.
# NOTE(review): this rebinds `df` from its own previous value, so the cell is
# not meant to be run twice on the same kernel state.
df = textbox.text_properties(textbox.merge_pages(df))
In [9]:
# Persist the processed frame via textbox.save (presumably a pickle, judging by
# the .pkl extension -- confirm in textbox.py).
textbox.save(df, './texts1.pkl')
In [4]:
# NOTE(review): this cell carries execution count 4 while the merge/properties
# cell above carries 8, so the table recorded below (columns index / filename /
# text, 14721 rows) appears to be the raw extract_text result rather than the
# merged frame -- re-run top to bottom to refresh it.
df
Out[4]:
index
filename
text
0
0.0
SCAN_2016_11_03_140924
\n\n' cReonAGRICOLE llllllllllllllllllll|ll||...
1
1.0
SCAN_2016_11_03_140924
\n\n!\n\ncRenwAGRICOLE llllllllllllllllllllll...
2
2.0
SCAN_2016_11_03_140924
\n\n' cRED'TAGR'COLE ||ll|llllllllllllll||||l...
3
3.0
SCAN_2016_11_03_140924
\n\n \n\nLoi Informatique. Fichiers et Libert...
4
4.0
SCAN_2016_11_03_140924
d\n\nCREDIT AGRICOLE »\n_ DESSAvore liililllll...
5
5.0
secourismeBenjaminR
!Ël '\n\n{iii :\n\nPQOË\n‘5‘)\n\n \n\nmama/:02...
6
6.0
EVIAN_LES_BAINS-PARIS_GARE_LYON_21-10-16_RICAU...
VOTRE E-BILLET\n\nEVIAN LES BAINS > PARIS GARE...
7
7.0
EVIAN_LES_BAINS-PARIS_GARE_LYON_21-10-16_RICAU...
VOTRE E-BILLET\n\nEVIAN LES BAINS > PARIS GARE...
8
8.0
EVIAN_LES_BAINS-PARIS_GARE_LYON_21-10-16_RICAU...
VOTRE E-BILLET\n\nPARIS GARE LYON > EVIAN LES ...
9
9.0
sfr-facture-09-B516-013324644
BZBMOADSV066V038V070\n\nVotre facture du 07/12...
10
10.0
sfr-facture-09-B516-013324644
\n\n \n\nDES QUESTIONS? Pour information\n\nR...
11
11.0
statuttype
MODELE de STATUTS pour une ASSOCIATION d’ANCIE...
12
12.0
statuttype
ARTICLE 6 - Ressources\n\nLes ressources de l’...
13
13.0
SCAN_2016_11_03_140847
\n\nL AUTO-CERTIFICATION DE RESIDENCE FISCALE...
14
14.0
SCAN_2016_11_03_140847
{\n\nCREDIT AGRICOLE\n_: DES…Œ lllllllllllllll...
15
15.0
SCAN_2016_11_03_140847
I\n_ SEEM?“ llllilllililillllilill…iiilllillll...
16
16.0
attestation(1)
à\npâte emptoi\n\nM. RICAUD BENJAMIN\n\n253 RU...
17
17.0
imprimante-hp-envy-4522-compatible-instant-ink...
@ HP ENVY série 4520 tout—en—un\n\n
18
18.0
imprimante-hp-envy-4522-compatible-instant-ink...
19
19.0
imprimante-hp-envy-4522-compatible-instant-ink...
\n\nSommaire\n\n \n\n \n\n \n\n \n\n \n\n1 Co...
20
20.0
imprimante-hp-envy-4522-compatible-instant-ink...
Conseils pour l'utilisation des Services Web ....
21
21.0
imprimante-hp-envy-4522-compatible-instant-ink...
\n\nFRWW\n\nComment faire ?\n\nApprenez a uti...
22
22.0
image2016-11-10-154822
23
23.0
image2016-11-10-154822
\n\n‘ _.r'vn: -._—-._—-_— —— -\n\n'10 janvier...
24
24.0
STATUTSDELAPE.doc
\n\nSTATUTS DE L'ASSOCIATION DE PARENTS D'ELE...
25
25.0
STATUTSDELAPE.doc
3.1 - La qualité de parent d’élève, membre de ...
26
26.0
STATUTSDELAPE.doc
6.1 — Composition et pouvoirs\nL’APE est admin...
27
27.0
STATUTSDELAPE.doc
trésorier-adjoint.\n\n8.1 — Le Président\n\nLe...
28
28.0
STATUTSDELAPE.doc
adressées au moins dix jours à l’avance par vo...
29
29.0
statutsEvias
Evia Cybernetics\n\nSOCIÉTÉ PAR ACTIONS SIMPLI...
...
...
...
...
14691
14691.0
collection
\n\n \n\n
14692
14692.0
figure_today
4.0\n\n3.5\n\n3.0\n\n2.5\n\n2.0\n\n1.5\n\n1.0\...
14693
14693.0
figure_suptitle
1.0\n\n0.8\n\n0.6\n\n0.4\n\n0.2\n\n0%.\n\nwe\n...
14694
14694.0
fancyarrow_test_image
14695
14695.0
move
14696
14696.0
subplots
14697
14697.0
back
\n\n
14698
14698.0
hand
14699
14699.0
matplotlib
14700
14700.0
qt4_editor_options
14701
14701.0
home
14702
14702.0
filesave
14703
14703.0
forward
14704
14704.0
zoom_to_rect
14705
14705.0
divider_append_axes
120\n100\n80\n60\n40\n20\n\n \n\n \n\n \n\n \n...
14706
14706.0
quiver3d
\n\n \n\n
14707
14707.0
quiver3d_empty
14708
14708.0
wireframe3d
\n\n \n\n
14709
14709.0
lines3d
.—\n\n
14710
14710.0
quiver3d_masked
\n\n \n\n
14711
14711.0
mixedsubplot
\n\n \n\n \n\n \n\n \n\n
14712
14712.0
bar3d
\n\n \n\n
14713
14713.0
contour3d
14714
14714.0
surface3d
\n\n \n\n
14715
14715.0
trisurf3d
\n\n
14716
14716.0
contourf3d
\n\n \n\n
14717
14717.0
contourf3d_fill
14718
14718.0
text3d
2 D Text\n\n«> {»\n(z,,gM), dir=N99Qé5\n//\ \\...
14719
14719.0
scatter3d
.—\n\n
14720
14720.0
xpdfimport_err
This PDF file is encrypted and can't be opened....
14721 rows × 3 columns
In [26]:
# Index labels of the rows whose `nb_words` value is at most 5, i.e. entries
# with little or no extracted text.
list(df.index[df['nb_words'] <= 5])
Out[26]:
['CIOlympe',
'legend_expand',
'fig_stft_tiling',
'hist_stacked_step',
'markevery_linear_scales',
'fig_wavelet_tiling',
'fig_wft_tiling',
'fig_dilated_kernels3-eps-converted-to',
'boxplot_rc_parameters',
'fig_swiss_g_62_450_spectral',
'markevery_log_scales',
'skew_rects',
'markevery_linear_scales_zoomed',
'fig_wavelet_tiling2',
'docgraphs',
'docgraphsorig',
'example-grid-100x100bp',
'diagrams',
'fig_trans_norms_grid',
'example-image-16x10',
'euscript',
'grSQCycle',
'vline_hline_zorder',
'example-image-1x1',
'symlog2',
'example-image-9x16',
'fig_swiss_g_62_983_spectral',
'fig_wgft_gik_hat',
'example-image-10x16',
'poster2',
'dpe-11',
'example-image-16x9',
'im12',
'fig_swiss_g_62_100_spectral',
'test3-pdf',
'Fig1c',
'fig_minn_heat',
'pcolormesh',
'fig_swiss_atoms_gsd1f',
'drawing',
'mathtext_stixsans_37',
'plotgaussian1-eps-converted-to',
'plotgaussian2-eps-converted-to',
'dpe-10',
'transparent-final',
'bbox_inches_tight',
'fig_swiss_gsd3f',
't2-ghsb',
'Fig2a',
'fig_atom_mod1',
'scatter_rc1',
'streamplot_colormap',
'PoleZeros',
'fig_classical_atom2',
'background5',
'fig_classical_atom1',
'fig_classical_atom3',
'something',
'contour_manual_labels',
'mathtext_stix_21',
'fig_clustering_filters',
'mathtext_dejavusans_21',
'background3',
'Fig5',
'stackplot_test_baseline',
'foreground2',
'mathtext_stix_53',
'mathtext_stixsans_53',
'fig_atoms_b',
'mathtext_stix_80',
'mathtext_cm_53',
'patheffect1',
'mathtext_cm_52',
'mp',
'mpgraph',
'foreground1',
'mathtext_dejavuserif_21',
'framealpha',
'system2',
'mathtext_cm_21',
'mathtext_dejavusans_75',
'mathtext_dejavuserif_73',
'mathtext_dejavuserif_74',
'spines_axes_positions',
'mathtext_cm_73',
'image_alpha',
'mathtext_dejavusans_73',
'mathtext_dejavuserif_75',
'mathtext_stixsans_21',
'mathtext_cm_74',
'mathtext_dejavusans_74',
'mathtext_dejavusans_41',
'mathtext_dejavuserif_22',
'genezik',
'mathtext_stixsans_22',
'rgba_markers',
'mathtext_stixsans_20',
'fig_swiss_gsd1f',
'result-picture',
'mathtext_dejavuserif_41',
'roundtri',
'mathtext_cm_22',
'mathtext_dejavusans_22',
'background4',
'mathtext_stix_22',
'figS',
'mathtext_stix_75',
'fig3',
'tiny_example_1',
'mathtext_cm_41',
'example-image-golden-upright',
'graphspectrogram1',
'circle',
'legend_auto1',
'mathtext_stix_41',
'mathtext_cm_75',
'skew_axes',
'nqueen',
'dpe-9',
'chrysant',
'mathtext_dejavuserif_53',
'polar_theta_position',
'mathtext_stixsans_68',
'mathtext_stixsans_54',
'mathtext_cm_68',
'mathtext_cm_38',
'mathtext_dejavusans_68',
'background1',
'mathtext_dejavuserif_68',
'mathtext_stix_52',
'mathtext_stixsans_41',
'mathtext_stix_68',
'fill_between_interpolate',
'hist_steplog',
'hist_stacked_stepfilled_alpha',
'patheffect3',
'mathtext_dejavusans_53',
'hist_stacked_stepfilled',
'polar_units',
'mathtext_cm_09',
'low_pass_filter',
'offset_points',
'buttrfly',
'spiral',
'log_scales',
'colors',
'streamplot_startpoints',
'gr-edgeingraphmodloop',
'streamplot_masks_and_nans_test_image',
'earth-moon',
'mathtext_stix_48',
'mathtext_stix_47',
'mathtext_cm_23',
'fig_swiss_g_62_100_vertex',
'patch_alpha_coloring',
'mathtext_stixsans_38',
'polar_coords',
'para_equal_perp',
'activity',
'polar_rlabel_position',
'mathtext_cm_05',
'mathtext_stixsans_31',
'mathtext_stixsans_05',
'mathtext_dejavuserif_23',
'mathtext_dejavusans_23',
'mathtext_stix_05',
'mathtext_stix_06',
'nonfinite_limits',
'mathtext_stix_23',
'mathtext_cm_06',
'mathtext_cm_67',
'mathtext_dejavusans_05',
'mathtext_dejavuserif_06',
'mathtext_dejavuserif_05',
'polar_rmin',
'pasted-image-155',
'mathtext_dejavuserif_70',
'fig_classical_sliding3',
'noticeLegoPolice',
'mathtext_stixsans_23',
'mathtext_dejavuserif_67',
'grWheel',
'example-image',
'mathtext_stixsans_40',
'mathtext_dejavusans_40',
'mathtext_stix_38',
'mathtext_dejavusans_38',
'ekflogo',
'mathtext_cm_31',
'mathtext_dejavusans_06',
'mathtext_dejavusans_31',
'legend_auto2',
'mathtext_dejavusans_70',
'xkcd',
'mathtext_cm_79',
'mathtext_stix_40',
'mathtext_stixsans_19',
'mathtext_cm_40',
'fig_path_signal',
'mathtext_stixsans_60',
'mathtext_cm_60',
'mathtext_stixsans_70',
'vector',
'mathtext_dejavusans_60',
'mathtext_dejavuserif_60',
'fs',
'mathtext_dejavuserif_38',
'fig_trans_norms_comet',
'mathtext_stixsans_06',
'mathtext_stix_67',
'fig_swiss_g_62_450_vertex',
'fig_ring_red',
'labelgraph',
'test_alpha',
'truetype-conversion',
'incgraph-example-b',
'incgraph-example-c',
'image_cliprect',
'incgraph-example-a',
'fig_minn_heat_mod',
'mixedsubplot',
'mathtext_stix_70',
'mathtext_dejavusans_67',
'mathtext_stix_79',
'overflow',
'background2',
'mathtext_dejavuserif_31',
'mathtext_stix_29',
'patch_custom_linestyle',
'doclicense-CC-zero',
'doclicense-CC-pd',
'bbox_inches_tight_clipping',
'mathtext_dejavusans_29',
'mathtext_stixsans_28',
'mathtext_stixsans_29',
'colArray',
'mathtext_cm_70',
'mathtext_cm_29',
'mathtext_cm_27',
'mathtext_dejavusans_27',
'mathtext_stixsans_03',
'mathtext_stixsans_27',
'transistor',
'polar_wrap_360',
'unclassified',
'mathtext_dejavuserif_29',
'confidential',
'mathtext_cm_03',
'mathtext_stix_27',
'mathtext_stix_31',
'mathtext_stix_03',
'mathtext_stix_09',
'mathtext_dejavuserif_27',
'polar_wrap_180',
'wavelet_1',
'mathtext_dejavuserif_40',
'line',
'photomayaPS',
'syracuse',
'syracuse-crop',
'Fig13',
'mathtext_stix_71',
'mathtext_cm_71',
'mathtext_dejavusans_09',
'mathtext_cm_76',
'pst-doc',
'wedge_range',
'mathtext_stix_02',
'mathtext_dejavusans_07',
'topsecret',
'classified',
'mathtext_dejavusans_56',
'dash_offset',
'mathtext_stixsans_07',
'mathtext_stix_26',
'mathtext_dejavuserif_07',
'mathtext_stixsans_02',
'mathtext_stix_76',
'mathtext_dejavusans_76',
'mathtext_dejavusans_02',
'fig_T1000',
'helix',
'mathtext_stix_60',
'mathtext_dejavuserif_02',
'mathtext_dejavuserif_09',
'standard',
'mathtext_dejavuserif_00',
'mathtext_dejavuserif_76',
'mathtext_cm_39',
'Fig.2',
'no_interpolation_origin',
'dpe-8',
'mathtext_stixsans_76',
'mathtext_cm_63',
'mathtext_cm_07',
'mathtext_cm_02',
'fig_minn_T1000',
'pgf_bbox_inches',
'eLaToutline',
'mathtext_stix_00',
'wiki_memory',
'mathtext_dejavusans_03',
'mathtext_cm_10',
'mathtext_stixsans_09',
'mathtext_stix_36',
'mathtext_dejavusans_49',
'mathtext_stixsans_10',
'mathtext_cm_00',
'mathtext_stix_10',
'mathtext_cm_36',
'mathtext_stixsans_39',
'mathtext_stixsans_00',
'mathtext_stix_07',
'mathtext_stix_28',
'dpe-5',
'dpe-6',
'mathtext_dejavuserif_03',
'mathtext_dejavusans_00',
'mathtext_stixsans_71',
'mathtext_dejavuserif_63',
'mathtext_dejavusans_71',
'mathtext_stixsans_44',
'mathtext_dejavuserif_36',
'mathtext_stix_72',
'zedat',
'mathtext_cm_32',
'permisLahille',
'mathtext_dejavusans_36',
'mathtext_cm_62',
'mathtext_stixsans_72',
'mathtext_dejavuserif_62',
'mathtext_cm_69',
'mathtext_dejavusans_10',
'mathtext_stixsans_32',
'mathtext_cm_72',
'mathtext_stixsans_36',
'mathtext_dejavuserif_10',
'mathtext_stix_32',
'dpe-7',
'mathtext_dejavuserif_25',
'fig_atoms_a',
'fig_atoms_c',
'turtle',
'mathtext_cm_18',
'photoAnthoninMS3',
'patheffect2',
'Fig2b',
'mathtext_stix_45',
'mathtext_dejavuserif_49',
'mathtext_dejavuserif_57',
'fig_minn_scaling',
'Fig4',
'grCLadder',
'mathtext_dejavuserif_79',
'tight_layout5',
'mathtext_cm_25',
'Fig1a',
'Fig1b',
'mathtext_dejavusans_62',
'mathtext_cm_49',
'fig_swiss_vertex3',
'mathtext_dejavusans_57',
'bbox_image_inverted',
'button',
'mathtext_dejavusans_44',
'mathtext_dejavusans_39',
'mathtext_stixsans_18',
'random_memory',
'mathtext_stixsans_26',
'mathtext_dejavusans_26',
'mathtext_dejavusans_25',
'fig_swiss_atoms2',
'mathtext_stixsans_57',
'image3-eps-converted-to',
'mathtext_cm_26',
'mathtext_stixsans_25',
'poles_zeros',
'root',
'mathtext_stix_57',
'stackplot_test_image',
'mathtext_stix_63',
'dpe-4',
'dpe-2',
'dpe-3',
'rotate_image',
'interp_nearest_vs_none',
'hist_stacked_weights',
'fig_trans_norms_minn',
'mathtext_dejavusans_63',
'mathtext_stix_13',
'mathtext_stixsans_13',
'bclogo',
'mathtext_stix_25',
'mathtext_stix_64',
'pst-uml-encapsuled-pdf-fig',
'mathtext_stix_62',
'mathtext_stixsans_64',
'mathtext_stixsans_62',
'mathtext_stix_39',
'mathtext_stix_18',
'secret',
'mathtext_dejavusans_08',
'mathtext_dejavuserif_13',
'mathtext_dejavuserif_45',
'mathtext_cm_64',
'mathtext_dejavusans_13',
'mathtext_dejavusans_18',
'mathtext_dejavuserif_18',
'mathtext_dejavuserif_39',
'mathtext_dejavuserif_33',
'mathtext_dejavuserif_64',
'd2tpstexamples',
'mathtext_cm_13',
'mathtext_dejavusans_64',
'mathtext_stixsans_08',
'mathtext_dejavuserif_08',
'mathtext_stix_33',
'doclicense-CC-by-nc',
'mathtext_stixsans_33',
'mathtext_stix_66',
'mathtext_dejavuserif_32',
'mathtext_dejavusans_32',
'mathtext_dejavuserif_71',
'rasterize_10dpi',
'mathtext_cm_58',
'mathtext_cm_08',
'epfl_logo',
'mathtext_stix_08',
'mathtext_stix_61',
'mathtext_stixsans_66',
'mathtext_cm_65',
'pic',
'heat_tau_25',
'heat_tau_50',
'heat_tau_10',
'wavelet_filtering',
'mathtext_dejavusans_66',
'mathtext_stixsans_01',
'fig_minn_graph',
'fig_trans7',
'palm',
'shema_richmont',
'3dsystem',
'wireframe3d',
'mathtext_dejavusans_45',
'mathtext_cm_66',
'fig_joint_counter1',
'mathtext_cm_51',
'bar3d',
'mathtext_stixsans_45',
'colorbar_closed_patch',
'mathtext_stix_35',
'put',
'heat_tau_1',
'wavelet_4',
'quiver3d_masked',
'quiver3d',
'fithesis-fi-color',
'surface3d',
'mathtext_dejavuserif_66',
'collection',
'contourf3d',
'wavelet_3',
'wavelet_2',
'fithesis-fi',
'mathtext_stixsans_65',
'mathtext_dejavuserif_43',
'mathtext_stixsans_51',
'mathtext_stixsans_59',
'mathtext_stixsans_69',
'mathtext_stix_58',
'mathtext_dejavusans_01',
'mathtext_stixsans_43',
'mathtext_stixsans_35',
'grandcanyon',
'mathtext_stix_59',
'mathtext_cm_59',
'mathtext_dejavuserif_51',
'mathtext_cm_57',
'doclicense-CC-by-sa',
'mathtext_dejavuserif_59',
'mathtext_cm_43',
'mathtext_stix_01',
'mathtext_dejavuserif_65',
'mathtext_cm_14',
'mathtext_cm_12',
'mathtext_cm_01',
'mathtext_dejavuserif_58',
'mathtext_dejavusans_51',
'mathtext_stix_43',
'mathtext_dejavusans_58',
'mathtext_stix_42',
'mathtext_stix_69',
'mathtext_dejavuserif_69',
'mathtext_stix_65',
'mathtext_dejavusans_43',
'mathtext_dejavusans_35',
'mathtext_dejavusans_69',
'mathtext_dejavusans_65',
'mathtext_dejavuserif_01',
'mathtext_dejavuserif_04',
'lines3d',
'mathtext_stixsans_04',
'mathtext_dejavusans_59',
'mathtext_stix_14',
'compatibility',
'mathtext_stix_51',
'scatter3d',
'doclicense-CC-by-nd',
'instructions-differential',
'mathtext_stixsans_14',
'im06',
'doclicense-CC-by-nc-eu',
'mathtext_dejavusans_14',
'mail_europass_icon',
'mathtext_cm_04',
'mathtext_dejavusans_04',
'mathtext_stixsans_42',
'mathtext_dejavuserif_14',
'mathtext_dejavuserif_42',
'beamernavsymbols',
'fancytipmark4',
'LTS_logo',
'W',
'fig_clustering_wgft',
'draft',
'Montreux_Jazz_s_dataset',
'Wood-Brown',
'fig_joint_counter3',
'fig_clustering_actual',
'fancytipmark1',
'Foto',
'vecb6',
'IndianBlanket',
'default-testpage',
'europasslogo',
'vecb5',
'contour_hatching',
'7_Multitone_example-optimalWinLatShear_zoom',
'7_Multitone_example-optimalWinLatNoShear_zoom',
'button1c',
'trisurf3d',
'beamerexample-lecture-logo',
'back',
'buttongc',
'beamer2',
'beamer1',
'beamer0',
'tiger',
'fancytipmark',
'fig_joint_counter2',
'contour_colorbar',
'pgf_mixedmode',
'fpnummodule-mandelbrot',
'ifmlogoc',
'overlay4',
'fond',
'overlay3',
'overlay2',
'summa',
'overlay10',
'grDoubleMod',
'overlay1',
'shorthyp_t1xtts',
'graphspectrogram2',
'mathtext_dejavusans_34',
'noscale',
'ifmlogog',
'mathtext_cm_42',
'mollweide_grid',
'fithesis-phil',
'lahilleCP005',
'metalgray',
'metagreen',
'hist2d',
'mathtext_dejavuserif_34',
'metagray',
'mathtext_dejavuserif_35',
'metablue',
'hypercube',
'lts2_logo',
'spines_capstyle',
'lts4logo2',
'lvb',
'markevery_polar',
'fithesis-phil-color',
'hist2d_transpose',
'image_composite_background',
'fithesis-fsps',
'put2',
'fig_sensor_signal6',
'overlay9',
'overlay8',
'fithesis-base',
'fithesis-base-color',
'overlay5',
'fig_sensor_signal',
'BZ1',
'fithesis-econ',
'photoAnthoninMS',
'fithesis-econ-color',
'fithesis-fsps-color',
'fithesis-fss-color',
'tex',
'overlay7',
'fithesis-law-color',
'fithesis-law',
'fithesis-fss',
'fig_sensor_strip',
'mathtext_stix_30',
'mathtext_dejavuserif_61',
'mathtext_stix_24',
'warn',
'mathtext_stix_34',
'mathtext_dejavuserif_72',
'mathtext_stix_44',
'mathtext_stix_46',
'mathtext_stix_49',
'quiver3d_empty',
'mathtext_dejavuserif_77',
'mathtext_stix_19',
'qt4_editor_options',
'mathtext_stix_04',
'clipping_diamond',
'clipping',
'clipper_edge',
'mathtext_stix_11',
'clip_path_clipping',
'mathtext_dejavuserif_56',
'mathtext_stix_12',
'mathtext_stix_17',
'mathtext_stix_15',
'mathtext_stix_16',
'fig_minn_signal',
'mathtext_dejavuserif_26',
'mathtext_dejavuserif_55',
'mathtext_dejavuserif_48',
'BZ4',
'BZ3',
'mathtext_dejavusans_79',
'mathtext_dejavusans_77',
'BZ2',
'mathtext_dejavusans_72',
'BZ10',
'mathtext_dejavusans_61',
'mathtext_dejavusans_55',
'mathtext_dejavusans_50',
'mathtext_dejavusans_48',
'mathtext_dejavusans_47',
'default',
'mathtext_dejavusans_46',
'Figure2',
'BZ5',
'BZ6',
'BZ7',
'contourf3d_fill',
'mathtext_dejavuserif_47',
'mathtext_dejavuserif_46',
'mathtext_dejavuserif_44',
'contour3d',
'BZ8',
'mathtext_dejavuserif_30',
'mathtext_dejavuserif_28',
'mathtext_dejavuserif_11',
'mathtext_stix_50',
'mathtext_dejavuserif_24',
'mathtext_dejavuserif_17',
'mathtext_dejavuserif_16',
'mathtext_dejavuserif_15',
'mathtext_dejavuserif_12',
'buttonge',
'blank',
'mathtext_stix_54',
'matplotlib',
'b6of3',
'b6of4',
'overlay0',
'background-eps-converted-to',
'overlay',
'outward_ticks',
'offsetbox_clipping',
'xor',
'news_bgr',
'move',
'mobile_europass_icon',
'mixed_collection',
'Back_2015',
'bbox_inches_tight_raster',
'xfig325',
'b6of2',
'b6of1',
'autoscale_tiny_range',
'pasted-image-149',
'pgfmanual-mindmap-1',
'pgfmanual-mindmap-2',
'zoom_to_rect',
'TeXoutline',
'patch_alpha_override',
'pasted-image-160',
'pasted-image-140',
'arc_ellipse',
'pasted-image-135',
'address_europass_icon',
'overlay6',
'polycollection_joinstyle',
'polygon',
'ITS_logo',
'BZ9',
'mathtext_stixsans_79',
'mathtext_stix_55',
'mathtext_stixsans_77',
'mathtext_stixsans_34',
'mathtext_stixsans_30',
'mathtext_stixsans_24',
'but',
'mathtext_stixsans_17',
'website_europass_icon',
'mathtext_stixsans_16',
'mathtext_stixsans_15',
'button1e',
'buttonec',
'buttonee',
'mathtext_stixsans_12',
'mathtext_stixsans_11',
'mathtext_stix_77',
'mathtext_stix_56',
'mathtext_stixsans_46',
'mathtext_stixsans_49',
'mathtext_stixsans_50',
'beamericonbook.20',
'mathtext_stixsans_75',
'mathtext_stixsans_67',
'pst',
'beamericonarticle',
'beamericonarticle.20',
'beamericonbook',
'beamericononline',
'mathtext_stixsans_55',
'beamericononline.20',
'beamerlogo',
'mathtext_stixsans_63',
'mathtext_stixsans_61',
'mathtext_stixsans_58',
'mathtext_stixsans_56',
'mathtext_dejavusans_42',
'mathtext_cm_61',
'mathtext_dejavusans_33',
'fibeamer-mu-fss-english',
'fibeamer-mu-econ-czech',
'fibeamer-mu-econ-english',
'fibeamer-mu-fi-czech',
'fibeamer-mu-fi-english',
'fibeamer-mu-fsps-czech',
'fibeamer-mu-fsps-english',
'fibeamer-mu-fss-czech',
'fibeamer-mu-law-czech',
'fancytipmark3',
'fibeamer-mu-law-english',
'fibeamer-mu-med-czech',
'hilbertcurves',
'hatching_legend',
'fibeamer-mu-med-english',
'fibeamer-mu-ped-czech',
'fibeamer-mu-ped-english',
'fft_peaks',
'fancytipmark2',
'im13',
'example-image-c',
'im10',
'im1',
'im05',
'im04',
'im03',
'example-image-a',
'example-image-b',
'im02',
'hist_log',
'im01',
'im0',
'hypercubed',
'hypercube_simple',
'houses',
'home',
'fancyarrow_test_image',
'fibeamer-mu-phil-czech',
'fibeamer-mu-phil-english',
'fibeamer-mu-sci-czech',
'telephone_europass_icon',
'fithesis-ped',
'fithesis-med-color',
'fithesis-med',
'filesave',
'fig_wgft_f',
'fig_wgft_Tig',
'fig_translated',
'fig_path_signal_300',
'fibeamer-mu-sci-english',
'fig_path_signal3',
'fig_path_signal2',
'fig_path_red',
'fig_path_graph_scale',
'fig_minn_T2000',
'fig_minn_translated',
'fig_minn_signal_3',
'fithesis-ped-color',
'fithesis-sci',
'fithesis-sci-color',
't-ghsb',
'hatch_simplify',
'hand',
'stepHidacs',
'grid',
'grenoble',
'fig_T2000',
'grayscale_alpha',
'EventCollection_plot__set_ls_dash',
'grStar',
'grLadder',
'subplots',
'gnuplot42',
'genealogytree-example-2',
'genealogytree-example-1',
'forward',
'im11',
'im14',
'mathtext_dejavusans_30',
'mathtext_cm_34',
'dpe-0',
'dpe-1',
'mathtext_cm_47',
'mathtext_cm_46',
'mathtext_cm_45',
'mathtext_cm_44',
'mathtext_cm_35',
'mathtext_cm_33',
'mathtext_cm_50',
'mathtext_cm_30',
'mathtext_cm_28',
'mathtext_cm_24',
'mathtext_cm_19',
'mathtext_cm_17',
'mathtext_cm_16',
'mathtext_cm_15',
'mathtext_cm_48',
'mathtext_cm_55',
'example-grid-100x100pt',
'mathtext_dejavusans_15',
'mathtext_dejavusans_28',
'Fig9',
'sample-bxeepic',
'mathtext_dejavusans_24',
'mathtext_dejavusans_19',
'mathtext_dejavusans_17',
'mathtext_dejavusans_16',
'doclicense-CC-by',
'mathtext_cm_56',
'mathtext_dejavusans_12',
'mathtext_dejavusans_11',
'doclicense-CC-by-nc-nd-eu',
'doclicense-CC-by-nc-sa',
'mathtext_cm_77',
'doclicense-CC-by-nc-sa-eu',
'fig_minn_signal2',
'mathtext_cm_11',
'mask_image',
'marker_paths',
'im4',
'im_europass_icon',
'im9',
'im8',
'im7',
'transparent_markers',
'im6',
'im5',
'europasslogo2013',
'logo',
'im3',
'im2',
'eventplot',
'smallgraph',
'example',
'im16',
'im15',
'image',
'image1-eps-converted-to',
'image4-eps-converted-to',
'image_composite_alpha',
'log_scale_image',
'line_dashes',
'keystroke_right',
'keystroke_middle',
'keystroke_left',
'EuropeFlagBW',
'EuropeFlagBlueCMYK',
'Fig14',
'EuropeFlagCMYK',
'info',
'EuropeFlagWB',
'imshow',
'image_shift',
'simplify_curve',
'sine',
'marker_edges']
In [19]:
# Raw extracted text stored for the 'CIOlympe' entry (one of the short-text rows).
df.loc['CIOlympe', 'text']
Out[19]:
' \n\nIDFla.:cauo<<<<<<<<<<<<<<<<<<<762l71\ntoturbzozszzoocvure<<<<<<<<ibooirzro\n\n \n\n \n\n \n\n \n\n'
In [20]:
# Full row for the 'CIOlympe' entry: text, text_length, text_list and nb_words.
df.loc['CIOlympe']
Out[20]:
text \n\nIDFla.:cauo<<<<<<<<<<<<<<<<<<<762l71\ntot...
text_length 90
text_list [IDFla.:cauo<<<<<<<<<<<<<<<<<<<762l71, toturbz...
nb_words 2
Name: CIOlympe, dtype: object
In [31]:
%time textbox.auto_extract('/media/benjamin/Elements/pdfs/txt','./texts2.pkl')
Extracting the texts from /media/benjamin/Elements/pdfs/txt
Extraction done.
Processing the text...
list of empty files written in extract_log.csv
Saving to file ./texts2.pkl
CPU times: user 27 s, sys: 1.09 s, total: 28.1 s
Wall time: 58.2 s
In [ ]:
Content source: bricaud/OCR-classif
Similar notebooks: