diff --git a/.gitignore b/.gitignore index 1174595..ce107aa 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ .vagrant .idea +Vagrantfile +package.box +*.traineddata \ No newline at end of file diff --git a/README.md b/README.md index ed5ff33..398af87 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,57 @@ # Tesseract OCR Vagrant Box +Машина с настроенным Tesseract OCR и всеми необходимыми либами для обучения/тюнинга моделей ## First UP - ```shell script +cp Vagrantfile.dist Vagrantfile vagrant plugin install vagrant-vbguest vagrant vbguest vagrant up ``` +## Create BOX +Для использования готовой коробки (в которой все установлено и собрано для обучения): +```shell script +vagrant package +vagrant destroy -f +rm Vagrantfile +cp Vagrantfile.feat Vagrantfile +``` + +## Links +* [How to use the tools provided to train Tesseract 4.00](https://tesseract-ocr.github.io/tessdoc/TrainingTesseract-4.00.html#additional-libraries-required) + + +## Example +Создайте подпапку в ~/training/ и перейдите в нее. Далее: +```shell script +PAGES=100 +FONT="GOST Type A" +BASE=$TESSDATA_PREFIX/rus.traineddata +MODELNAME=result + +mkdir -p model_existed model_training model_result + +tesstrain.sh --fonts_dir $FONTS_DIR --fontlist "$FONT" --lang rus \ + --linedata_only --noextract_font_properties --langdata_dir /tesseract-4.1.1/langdata \ + --tessdata_dir $TESSDATA_PREFIX --maxpages $PAGES --output_dir ./ + +combine_tessdata -e $BASE ./model_existed/model.lstm + +lstmtraining --model_output ./model_training/our \ + --continue_from model_existed/model.lstm \ + --traineddata $TESSDATA_PREFIX/rus.traineddata \ + --train_listfile rus.training_files.txt + +lstmtraining --stop_training \ + --continue_from ./model_training/our_checkpoint \ + --traineddata $TESSDATA_PREFIX/rus.traineddata \ + --model_output ./model_result/$MODELNAME.traineddata +``` + +## Notes +`tesstrain_utils.sh.fixed` - исправленная версия оф. хелпера. 
+ +```-l ${LANG_CODE} ``` добавлено в `tesstrain_utils.sh.fixed:467` +иначе будет проблема - язык не указан. Официальный файл заменен исправленным при провижине машины +(см. install.sh) \ No newline at end of file diff --git a/Vagrantfile b/Vagrantfile.dist similarity index 82% rename from Vagrantfile rename to Vagrantfile.dist index b9cd16f..dfb1498 100644 --- a/Vagrantfile +++ b/Vagrantfile.dist @@ -8,19 +8,12 @@ Vagrant.configure("2") do |config| config.vm.provider :virtualbox do |vb| vb.customize ["modifyvm", :id, "--natdnshostresolver1", "on"] - # Display the VirtualBox GUI when booting the machine - #vb.gui = true - - # Customize the amount of memory on the VM: vb.memory = 2048 - - # Use 1 CPUs - vb.cpus = 2 + vb.cpus = 4 end config.vm.network "forwarded_port", guest: 8000, host: 8001, auto_correct: true config.vm.network "forwarded_port", guest: 80, host: 81, auto_correct: true - config.vm.network "forwarded_port", guest: 5000, host: 5001, auto_correct: true config.vm.synced_folder ".", "/vagrant" config.vm.provision "shell", privileged: true, path: "./provision.sh" @@ -35,7 +28,8 @@ Vagrant.configure("2") do |config| config.trigger.after :up do |trigger| trigger.info = "Trigger: install dependencies on every up" trigger.run_remote = {inline: <<-SHELL - + chmod +x /vagrant/install-fonts.sh + runuser -l vagrant -c '/vagrant/install-fonts.sh' SHELL } end diff --git a/Vagrantfile.feat b/Vagrantfile.feat new file mode 100644 index 0000000..306dd11 --- /dev/null +++ b/Vagrantfile.feat @@ -0,0 +1,35 @@ +# encoding: utf-8 +# -*- mode: ruby -*- +# vi: set ft=ruby : + +Vagrant.configure("2") do |config| + config.vm.box = "package.box" + config.vm.box_check_update = false + + config.vm.provider :virtualbox do |vb| + vb.customize ["modifyvm", :id, "--natdnshostresolver1", "on"] + vb.memory = 2048 + vb.cpus = 4 + end + + config.vm.network "forwarded_port", guest: 8000, host: 8001, auto_correct: true + config.vm.network "forwarded_port", guest: 80, host: 81, 
auto_correct: true + config.vm.network "forwarded_port", guest: 5000, host: 5001, auto_correct: true + + config.vm.synced_folder ".", "/vagrant" + config.vm.provision "shell", privileged: true, inline: <<-SHELL + + SHELL + + config.trigger.after :up do |trigger| + trigger.info = "Trigger: install dependencies on every up" + trigger.run_remote = {inline: <<-SHELL + + SHELL + } end + + # Enable X11 forwarding for graphical apps. + # Make sure you have xquartz installed if using OSX host! + config.ssh.forward_agent = true + config.ssh.forward_x11 = true +end diff --git a/add-fonts-to-ocr.sh b/add-fonts-to-ocr.sh deleted file mode 100755 index 1555068..0000000 --- a/add-fonts-to-ocr.sh +++ /dev/null @@ -1,7 +0,0 @@ -if [ ! -d $FONTS_DIR ]; then - echo "set FONTS_DIR envvar"; exit 1 -fi -LNG_DIR=/vagrant/init/langdata/rus -text2image --find_fonts --fonts_dir $FONTS_DIR \ - --text $LNG_DIR/rus.training_text --min_coverage .9 \ - --outputbase $LNG_DIR/rus |& grep raw | sed -e 's/ :.*/@ \\/g' | sed -e "s/^/ '/" | sed -e "s/@/'/g" > $LNG_DIR/fontslist.txt \ No newline at end of file diff --git a/install-fonts.sh b/install-fonts.sh new file mode 100644 index 0000000..3b9e6b7 --- /dev/null +++ b/install-fonts.sh @@ -0,0 +1,15 @@ +[ -z "$FONTS_DIR" ] && echo "\$FONTS_DIR is empty!" 
&& exit 1 + +sudo chown -R vagrant:vagrant $FONTS_DIR +sudo chmod -R 775 $FONTS_DIR + +echo "Remove installed fonts" +rm -fv $FONTS_DIR/*.ttf + +echo "Copy fonts from /vagrant/fonts" +cp -v /vagrant/fonts/*.ttf $FONTS_DIR + +sudo chmod 644 $FONTS_DIR/* + +fc-cache +fc-list | grep 'GOST' diff --git a/install.sh b/install.sh index b253475..83f3f7f 100755 --- a/install.sh +++ b/install.sh @@ -57,3 +57,7 @@ wget https://raw.githubusercontent.com/tesseract-ocr/langdata_lstm/master/rus/ru echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH"; \ echo "export TESSDATA_PREFIX=$TESSDATA_PREFIX"; \ } >> "$HOME"/.bashrc + +# Work around the Phase E problem in tesstrain_utils.sh: keep the original as .bak and install the fixed copy shipped in /vagrant +mv -v $VBIN/tesstrain_utils.sh $VBIN/tesstrain_utils.sh.bak +cp -v /vagrant/tesstrain_utils.sh.fixed $VBIN/tesstrain_utils.sh \ No newline at end of file diff --git a/provision.sh b/provision.sh index 1954479..a3846e1 100644 --- a/provision.sh +++ b/provision.sh @@ -11,7 +11,7 @@ echo "%vagrant ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/vagrant export DEBIAN_FRONTEND=noninteractive locale-gen en_US.UTF-8 apt-get update -apt-get install -y htop nano libsm6 libxext6 libxrender-dev \ +apt-get install -y htop nano tree libsm6 libxext6 libxrender-dev \ libicu-dev libpango1.0-dev libcairo2-dev g++ autoconf automake \ libtool pkg-config libpng-dev libjpeg62-turbo-dev libtiff5-dev \ zlib1g-dev libicu-dev libleptonica-dev @@ -27,12 +27,6 @@ cd /vagrant || exit 1 chmod +x ./install.sh runuser -l vagrant -c '/vagrant/install.sh' -# Устанавливаем чертежные шрифты в систему +# Path to the fonts directory (the fonts themselves are installed by install-fonts.sh) export FONTS_DIR=/usr/local/share/fonts -cp /vagrant/fonts/* $FONTS_DIR -chown -R vagrant:vagrant $FONTS_DIR -chmod -R 775 $FONTS_DIR -chmod 644 $FONTS_DIR/* -fc-cache -fc-list | grep 'GOST' -echo "export FONTS_DIR=$FONTS_DIR" >> /home/vagrant/.bashrc +echo "export FONTS_DIR=$FONTS_DIR" >> /home/vagrant/.bashrc \ No newline at end of file diff --git a/tesstrain_utils.sh.fixed b/tesstrain_utils.sh.fixed new file mode 100644 index 0000000..7a1b43f --- 
/dev/null +++ b/tesstrain_utils.sh.fixed @@ -0,0 +1,627 @@ +#!/bin/bash +# (C) Copyright 2014, Google Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This script defines functions that are used by tesstrain.sh +# For a detailed description of the phases, see +# https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract +# +# USAGE: source tesstrain_utils.sh + +if [ -n "$BASH_VERSION" ];then + set -u # comment in case of "unbound variable" error or fix the code + set -eo pipefail; +else + echo "Warning: you aren't running script in bash - expect problems..." + fi + +UNAME=$(uname -s | tr 'A-Z' 'a-z') # lower-cased kernel name; used just below to pick the default fonts directory + +FONT_CONFIG_CACHE=$(mktemp -d -t font_tmp.XXXXXXXXXX) # scratch directory handed to text2image via --fontconfig_tmpdir + +if [[ ($UNAME == *darwin*) ]]; then + FONTS_DIR="/Library/Fonts/" +else + FONTS_DIR="/usr/share/fonts/" +fi + +DISTORT_IMAGE=false # set to true by --distort_image +EXTRACT_FONT_PROPERTIES=false # toggled by --extract_font_properties / --noextract_font_properties +LINEDATA=false # set to true by --linedata_only +MAX_PAGES=0 # overridden by --maxpages; forwarded to text2image as --max_pages +MY_BOXTIFF_DIR="" # overridden by --my_boxtiff_dir +OUTPUT_DIR="/tmp/tesstrain/tessdata" # overridden by --output_dir +OVERWRITE=false # set to true by --overwrite +RUN_SHAPE_CLUSTERING=false # no command-line flag in parse_flags; read by phase_S_cluster_shapes +SAVE_BOX_TIFF=false # set to true by --save_box_tiff +WORKSPACE_DIR=$(mktemp -d) # replaced (and this temp dir removed) by --workspace_dir +X_SIZE=3600 # overridden by --xsize; forwarded to text2image as --xsize +PT_SIZE=12 # overridden by --ptsize; forwarded to text2image as --ptsize + +# Default TESSDATA_PREFIX to the empty string when the environment does not define it, so "set -u" does not abort on an unbound variable +TESSDATA_PREFIX=${TESSDATA_PREFIX:-} + +# Logging helper functions. 
+tlog() { # Prints its arguments with "echo -e"; when LOG_FILE is set and non-empty the text is also appended to that file. + if test -z "${LOG_FILE:-}"; then + echo -e "$*" + else + echo -e "$*" | tee -a "${LOG_FILE}" + fi +} + +err_exit() { # Reports "ERROR: <arguments>" the same way tlog does, then exits with status 1. + if test -z "${LOG_FILE:-}"; then + echo -e "ERROR: $*" + else + echo -e "ERROR: $*" | tee -a "${LOG_FILE}" + fi + exit 1 +} + +# Helper function to run a command and append its output to a log. Aborts early +# if the program file is not found, and aborts when the program itself fails. +# Usage: run_command CMD ARG1 ARG2... +run_command() { + local cmd + cmd=$(which $1 || \ + for d in api training; do + which $d/$1 && break + done) || err_exit "'$1' not found" + shift + tlog "[$(date)] ${cmd} $*" + if ! "${cmd}" "$@" 2>&1 | tee -a "${LOG_FILE:-/dev/null}"; then + err_exit "Program $(basename ${cmd}) failed. Abort." + fi +} + +# Check if all the given files exist, or exit otherwise. +# Used to check required input files and produced output files in each phase. +# Usage: check_file_readable FILE1 FILE2... +check_file_readable() { + for file in "$@"; do + if [[ ! -r ${file} ]]; then + err_exit "${file} does not exist or is not readable" + fi + done +} + +# Sets the named variable to given value. Aborts if the value is missing or +# if it looks like a flag. +# Usage: parse_value VAR_NAME VALUE +parse_value() { + local val="${2:-}" + if [[ -z "$val" ]]; then + err_exit "Missing value for variable $1" + exit # not reached: err_exit has already exited + fi + if [[ ${val:0:2} == "--" ]]; then + err_exit "Invalid value $val passed for variable $1" + exit # not reached: err_exit has already exited + fi + printf -v "$1" '%s' "$val" # literal assignment: unlike eval, quotes, dollar signs and backticks in the value are not re-interpreted +} + +# Does simple command-line parsing and initialization. 
+parse_flags() { # Reads the global ARGV array, sets the option globals, checks the required ones and derives the working paths. + local i=0 + while test $i -lt ${#ARGV[@]}; do + local j=$((i+1)) + case ${ARGV[$i]} in + --) + break;; + --fontlist) + fn=0 + FONTS="" + while test $j -lt ${#ARGV[@]}; do + test -z "${ARGV[$j]}" && break + [[ "${ARGV[$j]:0:2}" == "--" ]] && break + FONTS[$fn]="${ARGV[$j]}" + fn=$((fn+1)) + j=$((j+1)) + done + i=$((j-1)) ;; + --exposures) + exp="" + while test $j -lt ${#ARGV[@]}; do + test -z "${ARGV[$j]}" && break + [[ "${ARGV[$j]:0:2}" == "--" ]] && break + exp="$exp ${ARGV[$j]}" + j=$((j+1)) + done + parse_value "EXPOSURES" "$exp" + i=$((j-1)) ;; + --fonts_dir) + parse_value "FONTS_DIR" "${ARGV[$j]:-}" + i=$j ;; + --tmp_dir) + parse_value "TMP_DIR" "${ARGV[$j]:-}" + i=$j ;; + --lang) + parse_value "LANG_CODE" "${ARGV[$j]:-}" + i=$j ;; + --langdata_dir) + parse_value "LANGDATA_ROOT" "${ARGV[$j]:-}" + i=$j ;; + --maxpages) + parse_value "MAX_PAGES" "${ARGV[$j]:-}" + i=$j ;; + --ptsize) + parse_value "PT_SIZE" "${ARGV[$j]:-}" + i=$j ;; + --my_boxtiff_dir) + parse_value "MY_BOXTIFF_DIR" "${ARGV[$j]:-}" + i=$j ;; + --distort_image) + DISTORT_IMAGE=true ;; + --output_dir) + parse_value "OUTPUT_DIR" "${ARGV[$j]:-}" + i=$j ;; + --overwrite) + OVERWRITE=true ;; + --save_box_tiff) + SAVE_BOX_TIFF=true ;; + --linedata_only) + LINEDATA=true ;; + --extract_font_properties) + EXTRACT_FONT_PROPERTIES=true ;; + --noextract_font_properties) + EXTRACT_FONT_PROPERTIES=false ;; + --tessdata_dir) + parse_value "TESSDATA_DIR" "${ARGV[$j]:-}" + i=$j ;; + --training_text) + parse_value "TRAINING_TEXT" "${ARGV[$j]:-}" + i=$j ;; + --wordlist) + parse_value "WORDLIST_FILE" "${ARGV[$j]:-}" + i=$j ;; + --workspace_dir) + rmdir "$FONT_CONFIG_CACHE" + rmdir "$WORKSPACE_DIR" + parse_value "WORKSPACE_DIR" "${ARGV[$j]:-}" + FONT_CONFIG_CACHE=$WORKSPACE_DIR/fc-cache + mkdir -p "$FONT_CONFIG_CACHE" + i=$j ;; + --xsize) + parse_value "X_SIZE" "${ARGV[$j]:-}" + i=$j ;; + *) + err_exit "Unrecognized argument ${ARGV[$i]}" ;; + esac + i=$((i+1)) + done + if [[ -z ${LANG_CODE:-} ]]; then 
+ err_exit "Need to specify a language --lang" + fi + if [[ -z ${LANGDATA_ROOT:-} ]]; then + err_exit "Need to specify path to language files --langdata_dir" + fi + if [[ -z ${TESSDATA_DIR:-} ]]; then + if [[ -z ${TESSDATA_PREFIX} ]]; then + err_exit "Need to specify a --tessdata_dir or have a "\ + "TESSDATA_PREFIX variable defined in your environment" + else + TESSDATA_DIR="${TESSDATA_PREFIX}" + fi + fi + if [[ ! -d "${OUTPUT_DIR}" ]]; then + tlog "Creating new directory ${OUTPUT_DIR}" + mkdir -p "${OUTPUT_DIR}" + fi + + # Location where intermediate files will be created. + TIMESTAMP=$(date +%Y-%m-%d) + if [[ -z ${TMP_DIR:-} ]]; then + TMP_DIR=$(mktemp -d -t ${LANG_CODE}-${TIMESTAMP}.XXX) + else + TMP_DIR=$(mktemp -d -p ${TMP_DIR} -t ${LANG_CODE}-${TIMESTAMP}.XXX) + fi + TRAINING_DIR=${TMP_DIR} + # Location of log file for the whole run. + LOG_FILE=${TRAINING_DIR}/tesstrain.log + + # Take training text and wordlist from the langdata directory if not + # specified in the command-line. + TRAINING_TEXT=${TRAINING_TEXT:-${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.training_text} + WORDLIST_FILE=${WORDLIST_FILE:-${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.wordlist} + + WORD_BIGRAMS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.word.bigrams + NUMBERS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.numbers + PUNC_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.punc + BIGRAM_FREQS_FILE=${TRAINING_TEXT}.bigram_freqs + UNIGRAM_FREQS_FILE=${TRAINING_TEXT}.unigram_freqs + TRAIN_NGRAMS_FILE=${TRAINING_TEXT}.train_ngrams + GENERATE_DAWGS=1 +} + +# Function initializes font config with a unique font cache dir. 
+initialize_fontconfig() { + export FONT_CONFIG_CACHE + local sample_path=${FONT_CONFIG_CACHE}/sample_text.txt + echo "Text" >${sample_path} # one-word sample that is rendered just below with the first font in FONTS + run_command text2image --fonts_dir=${FONTS_DIR} --ptsize ${PT_SIZE} \ + --font="${FONTS[0]}" --outputbase=${sample_path} --text=${sample_path} \ + --fontconfig_tmpdir=${FONT_CONFIG_CACHE} +} + +# Helper function for phase_I_generate_image. Generates the image for a single +# language/font combination in a way that can be run in parallel. Usage: generate_font_image FONT +generate_font_image() { + local font="$1" + tlog "Rendering using ${font}" + local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g') # spaces become underscores, commas are dropped + local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE} + + local common_args="--fontconfig_tmpdir=${FONT_CONFIG_CACHE}" # built as one string; expanded unquoted below so it splits into separate arguments + common_args+=" --fonts_dir=${FONTS_DIR} --strip_unrenderable_words" + common_args+=" --leading=${LEADING} --xsize=${X_SIZE}" + common_args+=" --char_spacing=${CHAR_SPACING} --exposure=${EXPOSURE}" + common_args+=" --outputbase=${outbase} --max_pages=${MAX_PAGES}" + if $DISTORT_IMAGE; then + common_args+=" --distort_image --invert=false" + fi + + # add --writing_mode=vertical-upright to common_args if the font is + # specified to be rendered vertically. + for vfont in "${VERTICAL_FONTS[@]}"; do + if [[ "${font}" == "${vfont}" ]]; then + common_args+=" --writing_mode=vertical-upright " + break + fi + done + + run_command text2image ${common_args} --font="${font}" --ptsize ${PT_SIZE} \ + --text=${TRAINING_TEXT} ${TEXT2IMAGE_EXTRA_ARGS:-} + check_file_readable ${outbase}.box ${outbase}.tif # aborts unless both rendering outputs exist + + if $EXTRACT_FONT_PROPERTIES && + [[ -r ${TRAIN_NGRAMS_FILE} ]]; then + tlog "Extracting font properties of ${font}" + run_command text2image ${common_args} --font="${font}" \ + --ligatures=false --text=${TRAIN_NGRAMS_FILE} \ + --only_extract_font_properties --ptsize=32 + check_file_readable ${outbase}.fontinfo + fi +} + +# Phase I : Generate (I)mages from training text for each font. 
+phase_I_generate_image() { # Usage: phase_I_generate_image [PAR_FACTOR]; PAR_FACTOR caps the number of concurrent rendering jobs. + local par_factor=${1:-} + if ! [[ "${par_factor}" -gt 0 ]]; then + par_factor=1 + fi + tlog "\n=== Phase I: Generating training images ===" + if [[ -z ${TRAINING_TEXT:-} ]] || test ! -r "${TRAINING_TEXT}"; then + err_exit "Could not find training text file ${TRAINING_TEXT:-}" + fi + CHAR_SPACING="0.0" + + for EXPOSURE in $EXPOSURES; do + if $EXTRACT_FONT_PROPERTIES && [[ -r ${BIGRAM_FREQS_FILE} ]]; then + # Parse .bigram_freqs file and compose a .train_ngrams file with text + # for tesseract to recognize during training. Take only the ngrams whose + # combined weight accounts for 99% of all the bigrams in the language. + NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \ + | awk '{s=s+$2}; END {print (s/100)*p}' p=99) + sort -rnk2 ${BIGRAM_FREQS_FILE} \ + | awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \ + x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE} + check_file_readable ${TRAIN_NGRAMS_FILE} + fi + + local jobs= + trap "kill $$" INT + for font in "${FONTS[@]}"; do + sleep 1 + test $(jobs -r | wc -l) -ge $par_factor && wait -n + generate_font_image "${font}" & + jobs="$jobs $!" + done + wait $jobs + # Check that each process was successful. + for font in "${FONTS[@]}"; do + local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g') + local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE} + check_file_readable ${outbase}.box ${outbase}.tif + done + done + if $SAVE_BOX_TIFF && ( ! $LINEDATA ) ; then + tlog "\n=== Saving box/tiff pairs for training data ===" + for f in "${TRAINING_DIR}/${LANG_CODE}".*.box; do + tlog "Copying ${f} to ${OUTPUT_DIR}" + cp "${f}" "${OUTPUT_DIR}" + done + for f in "${TRAINING_DIR}/${LANG_CODE}".*.tif; do + tlog "Copying ${f} to ${OUTPUT_DIR}" + cp "${f}" "${OUTPUT_DIR}" + done + fi +} + +# Phase UP : Generate (U)nicharset and (P)roperties file. 
+phase_UP_generate_unicharset() { + tlog "\n=== Phase UP: Generating unicharset and unichar properties files ===" + + local box_files=$(ls ${TRAINING_DIR}/*.box) + UNICHARSET_FILE="${TRAINING_DIR}/${LANG_CODE}.unicharset" + run_command unicharset_extractor --output_unicharset "${UNICHARSET_FILE}" \ + --norm_mode "${NORM_MODE}" ${box_files} + check_file_readable ${UNICHARSET_FILE} + + XHEIGHTS_FILE="${TRAINING_DIR}/${LANG_CODE}.xheights" + run_command set_unicharset_properties \ + -U ${UNICHARSET_FILE} -O ${UNICHARSET_FILE} -X ${XHEIGHTS_FILE} \ + --script_dir=${LANGDATA_ROOT} + check_file_readable ${XHEIGHTS_FILE} +} + +# Phase D : Generate (D)awg files from unicharset file and wordlist files +phase_D_generate_dawg() { + tlog "\n=== Phase D: Generating Dawg files ===" + + # Skip if requested + if [[ ${GENERATE_DAWGS} -eq 0 ]]; then + tlog "Skipping ${phase_name:-Phase D}" + return + fi + + # Output files + WORD_DAWG=${TRAINING_DIR}/${LANG_CODE}.word-dawg + FREQ_DAWG=${TRAINING_DIR}/${LANG_CODE}.freq-dawg + PUNC_DAWG=${TRAINING_DIR}/${LANG_CODE}.punc-dawg + NUMBER_DAWG=${TRAINING_DIR}/${LANG_CODE}.number-dawg + BIGRAM_DAWG=${TRAINING_DIR}/${LANG_CODE}.bigram-dawg + + # Word DAWG + local freq_wordlist_file=${TRAINING_DIR}/${LANG_CODE}.wordlist.clean.freq + if [[ -s ${WORDLIST_FILE} ]]; then + tlog "Generating word Dawg" + check_file_readable ${UNICHARSET_FILE} + run_command wordlist2dawg -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \ + ${UNICHARSET_FILE} + check_file_readable ${WORD_DAWG} + + FREQ_DAWG_SIZE=100 + head -n ${FREQ_DAWG_SIZE} ${WORDLIST_FILE} > ${freq_wordlist_file} + fi + + # Freq-word DAWG + if [[ -s ${freq_wordlist_file} ]]; then + check_file_readable ${UNICHARSET_FILE} + tlog "Generating frequent-word Dawg" + run_command wordlist2dawg -r 1 ${freq_wordlist_file} \ + ${FREQ_DAWG} ${UNICHARSET_FILE} + check_file_readable ${FREQ_DAWG} + fi + + # Punctuation DAWG + # -r arguments to wordlist2dawg denote RTL reverse policy + # (see Trie::RTLReversePolicy enum in 
tesseract/src/dict/trie.h). + # We specify 0/RRP_DO_NO_REVERSE when generating number DAWG, + # 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS, + # 2/RRP_FORCE_REVERSE for the punctuation DAWG. + local punc_reverse_policy=0; + if [[ "${LANG_IS_RTL}" == "1" ]]; then + punc_reverse_policy=2 + fi + if [[ ! -s ${PUNC_FILE} ]]; then + PUNC_FILE="${LANGDATA_ROOT}/common.punc" + fi + check_file_readable ${PUNC_FILE} + run_command wordlist2dawg -r ${punc_reverse_policy} \ + ${PUNC_FILE} ${PUNC_DAWG} ${UNICHARSET_FILE} + check_file_readable ${PUNC_DAWG} + + # Numbers DAWG + if [[ -s ${NUMBERS_FILE} ]]; then + run_command wordlist2dawg -r 0 \ + ${NUMBERS_FILE} ${NUMBER_DAWG} ${UNICHARSET_FILE} + check_file_readable ${NUMBER_DAWG} + fi + + # Bigram dawg + if [[ -s ${WORD_BIGRAMS_FILE} ]]; then + run_command wordlist2dawg -r 1 \ + ${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE} + check_file_readable ${BIGRAM_DAWG} + fi +} + +# Phase E : (E)xtract .tr feature files from .tif/.box files +phase_E_extract_features() { + local box_config=$1 + local par_factor=$2 + local ext=$3 + if ! [[ "${par_factor}" -gt 0 ]]; then + par_factor=1 + fi + tlog "\n=== Phase E: Generating ${ext} files ===" + + local img_files="" + for exposure in ${EXPOSURES}; do + img_files=${img_files}' '$(ls ${TRAINING_DIR}/*.exp${exposure}.tif) + done + + # Use any available language-specific configs. + local config="" + if [[ -r ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config ]]; then + config=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config + fi + + OLD_TESSDATA_PREFIX=${TESSDATA_PREFIX} + export TESSDATA_PREFIX=${TESSDATA_DIR} + tlog "Using TESSDATA_PREFIX=${TESSDATA_PREFIX}" + local jobs= + trap "kill $$" INT + for img_file in ${img_files}; do + test $(jobs -r | wc -l) -ge $par_factor && wait -n + run_command tesseract -l ${LANG_CODE} ${img_file} ${img_file%.*} \ + ${box_config} ${config} & + jobs="$jobs $!" 
+ done + wait $jobs + export TESSDATA_PREFIX=${OLD_TESSDATA_PREFIX} + # Check that all the output files were produced. + for img_file in ${img_files}; do + check_file_readable "${img_file%.*}.${ext}" + done +} + +# Phase C : (C)luster feature prototypes in .tr into normproto file (cnTraining) +# Usage: phase_C_cluster_prototypes ${TRAINING_DIR}/${LANG_CODE}.normproto +phase_C_cluster_prototypes() { + tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ===" + local out_normproto=$1 + + run_command cntraining -D "${TRAINING_DIR}/" \ + $(ls ${TRAINING_DIR}/*.tr) + + check_file_readable ${TRAINING_DIR}/normproto + mv ${TRAINING_DIR}/normproto ${out_normproto} +} + +# Phase S : (S)hape clustering +phase_S_cluster_shapes() { + if ! $RUN_SHAPE_CLUSTERING; then + tlog "\n=== Shape Clustering disabled ===" + return + fi + check_file_readable ${LANGDATA_ROOT}/font_properties + local font_props="-F ${LANGDATA_ROOT}/font_properties" + if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] &&\ + [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then + font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights" + fi + + run_command shapeclustering \ + -D "${TRAINING_DIR}/" \ + -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \ + -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \ + ${font_props} \ + $(ls ${TRAINING_DIR}/*.tr) + check_file_readable ${TRAINING_DIR}/shapetable \ + ${TRAINING_DIR}/${LANG_CODE}.mfunicharset +} + +# Phase M : Clustering microfeatures (mfTraining) +phase_M_cluster_microfeatures() { + tlog "\n=== Phase M : Clustering microfeatures (mfTraining) ===" + + check_file_readable ${LANGDATA_ROOT}/font_properties + font_props="-F ${LANGDATA_ROOT}/font_properties" + if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] && \ + [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then + font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights" + fi + + run_command mftraining \ + -D "${TRAINING_DIR}/" \ + -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \ + -O 
${TRAINING_DIR}/${LANG_CODE}.mfunicharset \ + ${font_props} \ + $(ls ${TRAINING_DIR}/*.tr) + check_file_readable ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/shapetable \ + ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.mfunicharset + mv ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/${LANG_CODE}.inttemp + mv ${TRAINING_DIR}/shapetable ${TRAINING_DIR}/${LANG_CODE}.shapetable + mv ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.pffmtable + mv ${TRAINING_DIR}/${LANG_CODE}.mfunicharset ${TRAINING_DIR}/${LANG_CODE}.unicharset +} + +phase_B_generate_ambiguities() { # Phase B : copies a hand-made unicharambigs file from the langdata dir into TRAINING_DIR when one exists. + tlog "\n=== Phase B : ambiguities training ===" + + # Check for manually created ambiguities data. + if [[ -r ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs ]]; then + tlog "Found file ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs" + cp ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs \ + ${TRAINING_DIR}/${LANG_CODE}.unicharambigs + # Make it writable, as it may be read-only in the client. + chmod u+w ${TRAINING_DIR}/${LANG_CODE}.unicharambigs + return + else + tlog "No unicharambigs file found!" + fi + + # TODO: Add support for generating ambiguities automatically. +} + +make__lstmdata() { # Runs combine_lang_model, then moves the .lstmf files to OUTPUT_DIR and lists them in <lang>.training_files.txt. + tlog "\n=== Constructing LSTM training data ===" + local lang_prefix="${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}" + local lang_is_rtl="" + if [[ "${LANG_IS_RTL}" == "1" ]]; then + lang_is_rtl="--lang_is_rtl" + fi + local pass_through="" + if [[ "${NORM_MODE}" -ge "2" ]]; then + pass_through="--pass_through_recoder" + fi + + # Build the starter traineddata from the inputs. 
+ run_command combine_lang_model \ + --input_unicharset "${TRAINING_DIR}/${LANG_CODE}.unicharset" \ + --script_dir "${LANGDATA_ROOT}" \ + --words "${lang_prefix}.wordlist" \ + --numbers "${lang_prefix}.numbers" \ + --puncs "${lang_prefix}.punc" \ + --output_dir "${OUTPUT_DIR}" --lang "${LANG_CODE}" \ + "${pass_through}" "${lang_is_rtl}" + + if $SAVE_BOX_TIFF; then + tlog "\n=== Saving box/tiff pairs for training data ===" + for f in "${TRAINING_DIR}/${LANG_CODE}".*.box; do + tlog "Moving ${f} to ${OUTPUT_DIR}" + mv "${f}" "${OUTPUT_DIR}" + done + for f in "${TRAINING_DIR}/${LANG_CODE}".*.tif; do + tlog "Moving ${f} to ${OUTPUT_DIR}" + mv "${f}" "${OUTPUT_DIR}" + done + fi + + tlog "\n=== Moving lstmf files for training data ===" + for f in "${TRAINING_DIR}/${LANG_CODE}".*.lstmf; do + tlog "Moving ${f} to ${OUTPUT_DIR}" + mv "${f}" "${OUTPUT_DIR}" + done + local lstm_list="${OUTPUT_DIR}/${LANG_CODE}.training_files.txt" + ls -1 "${OUTPUT_DIR}/${LANG_CODE}".*.lstmf > "${lstm_list}" +} + +make__traineddata() { # Runs combine_tessdata on TRAINING_DIR and copies the resulting <lang>.traineddata to OUTPUT_DIR. + tlog "\n=== Making final traineddata file ===" + local lang_prefix=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE} + + # Combine available files for this language from the langdata dir. + if [[ -r ${lang_prefix}.config ]]; then + tlog "Copying ${lang_prefix}.config to ${TRAINING_DIR}" + cp ${lang_prefix}.config ${TRAINING_DIR} + chmod u+w ${TRAINING_DIR}/${LANG_CODE}.config + fi + if [[ -r ${lang_prefix}.params-model ]]; then + tlog "Copying ${lang_prefix}.params-model to ${TRAINING_DIR}" + cp ${lang_prefix}.params-model ${TRAINING_DIR} + chmod u+w ${TRAINING_DIR}/${LANG_CODE}.params-model + fi + + # Compose the traineddata file. + run_command combine_tessdata ${TRAINING_DIR}/${LANG_CODE}. + + # Copy it to the output dir, overwriting only if allowed by the cmdline flag. + local destfile=${OUTPUT_DIR}/${LANG_CODE}.traineddata; + if [[ -f ${destfile} ]] && ! 
$OVERWRITE; then + err_exit "File ${destfile} exists and no --overwrite specified"; + fi + tlog "Copying ${TRAINING_DIR}/${LANG_CODE}.traineddata to ${OUTPUT_DIR}" + cp -f ${TRAINING_DIR}/${LANG_CODE}.traineddata ${destfile} +} diff --git a/train.sh b/train.sh deleted file mode 100644 index 32a1042..0000000 --- a/train.sh +++ /dev/null @@ -1,5 +0,0 @@ - - -tesstrain.sh --fonts_dir $FONTS_DIR --fontlist "GOST Type A" --lang rus \ - --linedata_only --noextract_font_properties --langdata_dir /tesseract-4.1.1/langdata \ - --tessdata_dir $TESSDATA_PREFIX --output_dir ./ \ No newline at end of file diff --git a/training/example1.jpg b/training/example1.jpg deleted file mode 100644 index 58f3f0f..0000000 Binary files a/training/example1.jpg and /dev/null differ