Download the existing gle_uncial traineddata now; it is unpacked and rebuilt later in the notebook
In [0]:
# Fetch the released gle_uncial model (v0.1beta2) into the current directory.
# A later cell reads it back as /content/gle_uncial.traineddata.
!wget https://github.com/jimregan/tesseract-gle-uncial/releases/download/v0.1beta2/gle_uncial.traineddata
Install dependencies
In [0]:
# Development headers needed to build Tesseract and its training tools
# (ICU, Pango and Cairo for text rendering, Leptonica for image handling).
# -y answers apt's confirmation prompt so the cell cannot stall or abort
# when run non-interactively (e.g. via "Run all").
!apt-get install -y libicu-dev libpango1.0-dev libcairo2-dev libleptonica-dev
Clone, compile and set up Tesseract
In [0]:
# Clone the upstream Tesseract sources into ./tesseract.
# No branch or tag is given, so this builds whatever the default branch is today.
!git clone https://github.com/tesseract-ocr/tesseract
In [0]:
import os
# Change the kernel's working directory (not just one shell's) so the
# following `!` build commands all run inside the cloned source tree.
os.chdir('tesseract')
In [0]:
# Run the repository's autogen.sh; the next cell expects it to have
# produced ./configure.
!sh autogen.sh
In [0]:
# Configure the build. --disable-graphics presumably switches off Tesseract's
# graphical debugging support, which a headless notebook cannot use — confirm.
!./configure --disable-graphics
In [0]:
# Build and install Tesseract, then its training tools. Order matters:
# each step depends on the one before it.
# Compile with up to 8 parallel jobs.
!make -j 8
!make install
# Refresh the dynamic linker cache so the newly installed library is found.
!ldconfig
# Build and install the training tools — presumably these provide the
# combine_tessdata, wordlist2dawg and text2image commands used in later
# cells; confirm.
!make training
!make training-install
Fetch the helper scripts and tools needed to scrape the RIA corpus
In [0]:
import os
# Return to /content (the previous cells left the kernel inside the
# tesseract source tree) before cloning.
os.chdir('/content')
# This repository supplies the Perl helper scripts and fonts.txt that later
# cells reference under /content/tesseract-gle-uncial/.
!git clone https://github.com/jimregan/tesseract-gle-uncial/
In [0]:
# lynx is used by the scraping cell to dump the corpus index pages as text.
# -y answers apt's confirmation prompt so the cell runs unattended.
!apt-get install -y lynx
Scrape the RIA corpus
In [0]:
# Walk the A–Z index pages of the RIA corpus and keep the last field of every
# dumped line that mentions the site (i.e. the link URLs from lynx's
# reference list). The redirect sits after `done` and truncates, so
# re-running the cell rebuilds `list` instead of appending a second copy.
! for i in A B C D E F G H I J K L M N O P Q R S T U V W X Y Z; do lynx -dump "http://corpas.ria.ie/index.php?fsg_function=1&fsg_page=$i" | grep 'http://corpas.ria.ie' | awk '{print $NF}'; done > list
In [0]:
# Keep only the corpus links that use function=3, de-duplicate them, and
# rewrite each to the function=5 form that the download cell fetches.
!grep 'function=3' list | grep corpas.ria | sort | uniq | sed 's/function=3/function=5/' > input
In [0]:
# Download every URL listed in `input`.
#   -x  recreate the host/path directory structure (files land under corpas.ria.ie/)
#   -c  continue partially downloaded files, so the cell can be resumed
#   -i  read the URLs from the given file
!wget -x -c -i input
In [0]:
# Convert each downloaded page to plain text with extract-ria.pl, writing one
# file per document named after the value following the last '=' in the
# downloaded file name.
# -p keeps the cell re-runnable when text/ already exists.
!mkdir -p text
# The downloaded names contain '?' and '&', so every expansion is quoted to
# stop the shell from globbing or splitting them.
!for i in corpas.ria.ie/*; do doc_id=$(echo "$i" | awk -F'=' '{print $NF}'); cat "$i" | perl /content/tesseract-gle-uncial/scripts/extract-ria.pl > "text/$doc_id.txt"; done
Get the raw corpus in a single text file
In [0]:
# Concatenate all extracted documents and drop empty lines.
!cat text/*.txt|grep -v '^$' > ria-raw.txt
Compress the raw text; this can be downloaded through the file browser on the left, so the scraping steps can be skipped in future
In [0]:
# Replaces ria-raw.txt with ria-raw.txt.gz (gzip removes the original);
# the next cell restores the uncompressed file.
!gzip ria-raw.txt
...and can be re-added using the upload feature in the file browser
In [0]:
# Restore ria-raw.txt from the archive; later cells read the uncompressed file.
!gzip -d ria-raw.txt.gz
This next part is so I can update the langdata files
In [0]:
import os
os.chdir('/content')
# Upstream langdata checkout; its gle_uncial/ directory is both read from and
# written to by the cells below.
!git clone https://github.com/tesseract-ocr/langdata
In [0]:
# Pipe the raw corpus through toponc.pl to produce ria-ponc.txt.
# NOTE(review): judging by the script name this converts the text to dotted
# (ponc) orthography — confirm against the script itself.
!cat ria-raw.txt | perl /content/tesseract-gle-uncial/scripts/toponc.pl > ria-ponc.txt
In [0]:
# Output directory for the generated word lists; -p makes re-running harmless.
!mkdir -p genwlout
In [0]:
# Generate the language-data lists from the converted corpus.
# Presumably -i is the input text, -d the output directory and -p the file
# name prefix — verify against genlangdata.pl. A later cell reads
# genwlout/gle_uncial.wordlist.unsorted, so that file at least comes from here.
!perl /content/tesseract-gle-uncial/scripts/genlangdata.pl -i ria-ponc.txt -d genwlout -p gle_uncial
In [0]:
import os
os.chdir('/content/genwlout')
# NOTE(review): the active loop below reads $i.sorted, but the only step
# visible in this notebook that creates those files is the commented-out
# line that follows. Confirm genlangdata.pl writes *.sorted itself;
# otherwise re-enable this line before running the merge.
#!for i in gle_uncial.word.bigrams gle_uncial.wordlist gle_uncial.numbers gle_uncial.punc; do cat $i.unsorted | awk -F'\t' '{print $1}' | sort | uniq > $i.sorted;done
# Merge each freshly generated list with the existing langdata list of the
# same name, sorted and de-duplicated, into a file named without a suffix.
!for i in gle_uncial.word.bigrams gle_uncial.wordlist gle_uncial.numbers gle_uncial.punc; do cat $i.sorted /content/langdata/gle_uncial/$i | sort | uniq > $i;done
In [0]:
# Publish the four merged lists back into the langdata checkout.
!cp gle_uncial.word.bigrams gle_uncial.wordlist gle_uncial.numbers gle_uncial.punc /content/langdata/gle_uncial/
In [0]:
Grab the fonts
In [0]:
import os
os.chdir('/content')
# -p: do not fail when fonts/ is left over from a previous run.
!mkdir -p fonts
os.chdir('fonts')
# Download every font archive listed in the repository's fonts.txt into fonts/.
!wget -i /content/tesseract-gle-uncial/fonts.txt
In [0]:
# Unpack every downloaded font archive in place.
# "$i" is quoted so archive names containing spaces survive word splitting.
# -o overwrites without asking: archives that share a member name (or a
# re-run of this cell) would otherwise stop at unzip's interactive
# "replace?" prompt.
!for i in *.zip; do unzip -o "$i"; done
Unpack the existing traineddata, rebuild its word-list DAWGs from the updated lists, and recombine it
In [0]:
# Imported here as in the other cells, so this cell does not depend on an
# earlier one having been run.
import os
os.chdir('/content')
# -p keeps the cell re-runnable when unpack/ already exists.
!mkdir -p unpack
# Split the downloaded model into its components, written as unpack/gle_uncial.*
!combine_tessdata -u /content/gle_uncial.traineddata unpack/gle_uncial.
In [0]:
os.chdir('unpack')
# Bring the four merged lists from genwlout/ alongside the unpacked model components.
!cp /content/genwlout/gle_uncial.word.bigrams /content/genwlout/gle_uncial.wordlist /content/genwlout/gle_uncial.numbers /content/genwlout/gle_uncial.punc .
In [0]:
# Compile the plain-text lists into the LSTM DAWG components.
# Arguments are: input list, output DAWG, unicharset. The unicharset is the
# lstm-unicharset unpacked from the original model.
!wordlist2dawg gle_uncial.numbers gle_uncial.lstm-number-dawg gle_uncial.lstm-unicharset
!wordlist2dawg gle_uncial.punc gle_uncial.lstm-punc-dawg gle_uncial.lstm-unicharset
!wordlist2dawg gle_uncial.wordlist gle_uncial.lstm-word-dawg gle_uncial.lstm-unicharset
In [0]:
# Delete the plain-text lists again now that the DAWGs are built —
# presumably so they are not picked up when the model is recombined; confirm.
!rm gle_uncial.numbers gle_uncial.word.bigrams gle_uncial.punc gle_uncial.wordlist
In [0]:
os.chdir('/content')
# Keep the downloaded model under a .orig name as a backup.
!mv gle_uncial.traineddata gle_uncial.traineddata.orig
# Reassemble the components; a later cell picks the result up as
# unpack/gle_uncial.traineddata.
!combine_tessdata unpack/gle_uncial.
In [0]:
os.chdir('/content')
# NOTE(review): tesstrain.sh is invoked with no arguments here, unlike the
# final cell. This looks like a check that the script is present and prints
# its usage — confirm, or remove the cell.
!bash /content/tesseract/src/training/tesstrain.sh
In [0]:
# List the fonts text2image can see in fonts/, to check that the unpacked
# fonts are usable before training.
!text2image --fonts_dir fonts --list_available_fonts
In [0]:
# Swap the two tab-separated columns of the unsorted word list so the second
# column comes first, then sort numerically in descending order.
!awk -F'\t' '{print $2 "\t" $1}' genwlout/gle_uncial.wordlist.unsorted | sort -nr > freqlist
In [0]:
# Keep only the second column of freqlist (in its existing order) and drop empty entries.
!awk -F'\t' '{print $2}' freqlist | grep -v '^$' > wordlist
In [0]:
# Training text: the first 400000 distinct lines of the converted corpus, in sorted order.
!sort ria-ponc.txt | uniq | head -n 400000 > gle_uncial.training_text
In [0]:
# Install the recombined model where the final tesstrain.sh call looks for it
# (--tessdata_dir). Tesseract is built from source in this notebook and no
# tesseract apt package is installed, so this directory is not guaranteed to
# exist; create it first so cp cannot fail or write a plain file named
# "tessdata".
!mkdir -p /usr/share/tesseract-ocr/4.00/tessdata
!cp unpack/gle_uncial.traineddata /usr/share/tesseract-ocr/4.00/tessdata
In [0]:
# Copy the generated training text into the langdata directory that
# tesstrain.sh is pointed at. The file is written as
# gle_uncial.training_text (with an underscore), so the name without the
# underscore would not match anything.
!cp gle_uncial.training_text langdata/gle_uncial/
In [0]:
# Destination for the generated training data; -p makes re-running harmless.
!mkdir -p output
In [0]:
# Generate the training line data for gle_uncial:
#   --fonts_dir     the unpacked fonts
#   --langdata_dir  the langdata checkout holding the updated lists and training text
#   --tessdata_dir  where the recombined gle_uncial.traineddata was copied
#   --output_dir    the output/ directory created above
# --linedata_only and --noextract_font_properties are passed through as-is;
# see the tesstrain.sh usage text for their exact effect.
!bash tesseract/src/training/tesstrain.sh --fonts_dir fonts --lang gle_uncial --linedata_only --noextract_font_properties --langdata_dir langdata --tessdata_dir /usr/share/tesseract-ocr/4.00/tessdata --output_dir output