записал рабочий процесс обучения

удалил лишние файлы доработал провижн и вынес установку шрифтов в отдельный шаг
2020-02-13 19:56:11 +03:00 · 2020-02-13 19:56:11 +03:00 · c5b51f1c3f
commit c5b51f1c3f
parent a34b9ffbab
11 changed files with 738 additions and 31 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1,5 @@
 .vagrant
 .idea
+Vagrantfile
+package.box
+*.traineddata
--- a/README.md
+++ b/README.md
@ -1,10 +1,57 @@
 # Tesseract OCR Vagrant Box
+Машина с настроенным Tesseract OCR и всеми необходимыми либами для обучения/тюнинга моделей

 ## First UP
-
 ```shell script
+cp Vagrantfile.dist Vagrantfile
 vagrant plugin install vagrant-vbguest
 vagrant vbguest
 vagrant up
 ```

+## Create BOX
+Для использования готовой коробки (в которой все установлено и собрано для обучения):
+```shell script
+vagrant package
+vagrant destroy -f
+rm Vagrantfile
+cp Vagrantfile.feat Vagrantfile
+```
+
+## Links
+* [How to use the tools provided to train Tesseract 4.00](https://tesseract-ocr.github.io/tessdoc/TrainingTesseract-4.00.html#additional-libraries-required)
+
+
+## Example
+Создайте подпапку в ~/training/ и перейдите в нее. Далее:
+```shell script
+PAGES=100
+FONT="GOST Type A"
+BASE=$TESSDATA_PREFIX/rus.traineddata
+MODELNAME=result
+
+mkdir -p model_existed model_training model_result
+
+# "$FONT" must stay quoted: the font name contains spaces and is one --fontlist entry.
+tesstrain.sh --fonts_dir "$FONTS_DIR" --fontlist "$FONT" --lang rus \
+  --linedata_only --noextract_font_properties --langdata_dir /tesseract-4.1.1/langdata \
+  --tessdata_dir "$TESSDATA_PREFIX" --maxpages $PAGES --output_dir ./
+
+combine_tessdata -e $BASE ./model_existed/model.lstm
+
+lstmtraining --model_output ./model_training/our \
+  --continue_from model_existed/model.lstm \
+  --traineddata $TESSDATA_PREFIX/rus.traineddata \
+  --train_listfile rus.training_files.txt
+
+lstmtraining --stop_training \
+  --continue_from ./model_training/our_checkpoint \
+  --traineddata $TESSDATA_PREFIX/rus.traineddata \
+  --model_output ./model_result/$MODELNAME.traineddata
+```
+
+## Notes
+`tesstrain_utils.sh.fixed` - исправленная версия оф. хелпера.
+
+`-l ${LANG_CODE}` добавлено в `tesstrain_utils.sh.fixed:467`,
+иначе возникает проблема: язык не указан. Официальный файл заменяется исправленным при провижине машины
+(см. install.sh).
--- a/Vagrantfile.dist
+++ b/Vagrantfile.dist
@ -8,19 +8,12 @@ Vagrant.configure("2") do |config|

    config.vm.provider :virtualbox do |vb|
        vb.customize ["modifyvm", :id, "--natdnshostresolver1", "on"]
-        # Display the VirtualBox GUI when booting the machine
-        #vb.gui = true
-
-        # Customize the amount of memory on the VM:
        vb.memory = 2048
-
-        # Use 1 CPUs
-        vb.cpus = 2
+        vb.cpus = 4
    end

    config.vm.network "forwarded_port", guest: 8000, host: 8001, auto_correct: true
    config.vm.network "forwarded_port", guest: 80, host: 81, auto_correct: true
-    config.vm.network "forwarded_port", guest: 5000, host: 5001, auto_correct: true

    config.vm.synced_folder ".", "/vagrant"
    config.vm.provision "shell", privileged: true, path: "./provision.sh"
@ -35,7 +28,8 @@ Vagrant.configure("2") do |config|
    config.trigger.after :up do |trigger|
        trigger.info = "Trigger: install dependencies on every up"
        trigger.run_remote = {inline: <<-SHELL
-
+            # runuser -l starts in the vagrant user's home directory, so the
+            # script has to be addressed by its absolute path under /vagrant.
+            chmod +x /vagrant/install-fonts.sh
+            runuser -l vagrant -c '/vagrant/install-fonts.sh'
        SHELL
    } end

--- a/Vagrantfile.feat
+++ b/Vagrantfile.feat
@ -0,0 +1,35 @@
+# encoding: utf-8
+# -*- mode: ruby -*-
+# vi: set ft=ruby :
+
+# Vagrantfile for running the pre-built box produced by `vagrant package`
+# (package.box). Both the provisioner script and the after-up trigger script
+# are intentionally empty placeholders.
+Vagrant.configure("2") do |config|
+    config.vm.box = "package.box"
+    config.vm.box_check_update = false
+
+    config.vm.provider :virtualbox do |vb|
+        vb.customize ["modifyvm", :id, "--natdnshostresolver1", "on"]
+        vb.memory = 2048
+        vb.cpus = 4
+    end
+
+    config.vm.network "forwarded_port", guest: 8000, host: 8001, auto_correct: true
+    config.vm.network "forwarded_port", guest: 80, host: 81, auto_correct: true
+    config.vm.network "forwarded_port", guest: 5000, host: 5001, auto_correct: true
+
+    config.vm.synced_folder ".", "/vagrant"
+    # A Ruby heredoc opens with `<<-`; the former `<<<-` was a syntax error.
+    config.vm.provision "shell", privileged: true, inline: <<-SHELL
+
+    SHELL
+
+    config.trigger.after :up do |trigger|
+        trigger.info = "Trigger: install dependencies on every up"
+        trigger.run_remote = {inline: <<-SHELL
+
+        SHELL
+    } end
+
+    # Enable X11 forwarding for graphical apps.
+    # Make sure you have xquartz installed if using OSX host!
+    config.ssh.forward_agent = true
+    config.ssh.forward_x11 = true
+end
--- a/add-fonts-to-ocr.sh
+++ b/add-fonts-to-ocr.sh
@ -1,7 +0,0 @@
-if [ ! -d $FONTS_DIR ]; then
-  echo "set FONTS_DIR envvar"; exit 1
-fi
-LNG_DIR=/vagrant/init/langdata/rus
-text2image --find_fonts --fonts_dir $FONTS_DIR \
-  --text $LNG_DIR/rus.training_text --min_coverage .9 \
-  --outputbase $LNG_DIR/rus |& grep raw | sed -e 's/ :.*/@ \\/g' | sed -e "s/^/  '/" | sed -e "s/@/'/g" > $LNG_DIR/fontslist.txt
--- a/install-fonts.sh
+++ b/install-fonts.sh
@ -0,0 +1,15 @@
+# Replaces the *.ttf fonts installed in $FONTS_DIR with the ones shipped in
+# /vagrant/fonts and refreshes the fontconfig cache.
+# Requires: FONTS_DIR set in the environment; sudo available.
+[ -z "$FONTS_DIR" ] && echo "\$FONTS_DIR is empty!" && exit 1
+
+# Make the font directory writable for the vagrant user.
+sudo chown -R vagrant:vagrant "$FONTS_DIR"
+sudo chmod -R 775 "$FONTS_DIR"
+
+echo "Remove installed fonts"
+rm -fv "$FONTS_DIR"/*.ttf
+
+echo "Copy fonts from /vagrant/fonts to $FONTS_DIR"
+cp -v /vagrant/fonts/*.ttf "$FONTS_DIR"
+
+sudo chmod 644 "$FONTS_DIR"/*
+
+fc-cache
+# Sanity check: list the GOST fonts; grep exits non-zero when none is found.
+fc-list | grep 'GOST'
--- a/install.sh
+++ b/install.sh
@ -57,3 +57,7 @@ wget https://raw.githubusercontent.com/tesseract-ocr/langdata_lstm/master/rus/ru
  echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH"; \
  echo "export TESSDATA_PREFIX=$TESSDATA_PREFIX"; \
 } >> "$HOME"/.bashrc
+
+# Work around the phase E problem in tesstrain_utils.sh: keep the stock helper
+# as a .bak copy and install the patched version shipped as
+# /vagrant/tesstrain_utils.sh.fixed in its place.
+mv -v "$VBIN/tesstrain_utils.sh" "$VBIN/tesstrain_utils.sh.bak"
+cp -v /vagrant/tesstrain_utils.sh.fixed "$VBIN/tesstrain_utils.sh"
--- a/provision.sh
+++ b/provision.sh
@ -11,7 +11,7 @@ echo "%vagrant ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/vagrant
 export DEBIAN_FRONTEND=noninteractive
 locale-gen en_US.UTF-8
 apt-get update
-apt-get install -y htop nano libsm6 libxext6 libxrender-dev \
+apt-get install -y htop nano tree libsm6 libxext6 libxrender-dev \
  libicu-dev libpango1.0-dev libcairo2-dev g++ autoconf automake \
  libtool pkg-config libpng-dev libjpeg62-turbo-dev libtiff5-dev \
  zlib1g-dev libicu-dev libleptonica-dev
@ -27,12 +27,6 @@ cd /vagrant || exit 1
 chmod +x ./install.sh
 runuser -l vagrant -c '/vagrant/install.sh'

-# Устанавливаем чертежные шрифты в систему
+# путь к шрифтам
 export FONTS_DIR=/usr/local/share/fonts
-cp /vagrant/fonts/* $FONTS_DIR
-chown -R vagrant:vagrant $FONTS_DIR
-chmod -R 775 $FONTS_DIR
-chmod 644 $FONTS_DIR/*
-fc-cache
-fc-list | grep 'GOST'
 echo "export FONTS_DIR=$FONTS_DIR" >> /home/vagrant/.bashrc
--- a/tesstrain_utils.sh.fixed
+++ b/tesstrain_utils.sh.fixed
@ -0,0 +1,627 @@
+#!/bin/bash
+# (C) Copyright 2014, Google Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This script defines functions that are used by tesstrain.sh
+# For a detailed description of the phases, see
+# https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract
+#
+# USAGE: source tesstrain_utils.sh
+
+# Turn on strict error handling under bash; any other shell only gets a warning.
+if [ -z "$BASH_VERSION" ]; then
+    echo "Warning: you aren't running script in bash - expect problems..."
+else
+    set -u  # comment in case of "unbound variable" error or fix the code
+    set -eo pipefail
+fi
+
+# Lower-cased kernel name, used below to pick the default font directory.
+UNAME=$(uname -s | tr 'A-Z' 'a-z')
+
+# Temporary directory used as the fontconfig cache for this run.
+FONT_CONFIG_CACHE=$(mktemp -d -t font_tmp.XXXXXXXXXX)
+
+# Default font directory: macOS location on darwin, /usr/share/fonts/ otherwise.
+case "${UNAME}" in
+    *darwin*) FONTS_DIR="/Library/Fonts/" ;;
+    *)        FONTS_DIR="/usr/share/fonts/" ;;
+esac
+
+# Default settings; parse_flags overrides most of them from the command line.
+DISTORT_IMAGE=false
+EXTRACT_FONT_PROPERTIES=false
+LINEDATA=false
+MAX_PAGES=0
+MY_BOXTIFF_DIR=""
+OUTPUT_DIR="/tmp/tesstrain/tessdata"
+OVERWRITE=false
+RUN_SHAPE_CLUSTERING=false
+SAVE_BOX_TIFF=false
+WORKSPACE_DIR=$(mktemp -d)
+X_SIZE=3600
+PT_SIZE=12
+
+# Default TESSDATA_PREFIX to the empty string when the environment does not
+# define it, so later reads do not trip "set -u".
+TESSDATA_PREFIX=${TESSDATA_PREFIX:-}
+
+# Logging helper functions.
+# Prints its arguments (with backslash escapes interpreted) to stdout and,
+# when LOG_FILE is set and non-empty, also appends them to that file.
+tlog() {
+    if [[ -n "${LOG_FILE:-}" ]]; then
+        echo -e $* | tee -a ${LOG_FILE}
+        return
+    fi
+    echo -e $*
+}
+
+# Prints "ERROR: " followed by its arguments (also appended to LOG_FILE when
+# that variable is set and non-empty), then terminates the shell with status 1.
+err_exit() {
+    if [[ -n "${LOG_FILE:-}" ]]; then
+        echo -e "ERROR: "$* | tee -a ${LOG_FILE}
+    else
+        echo -e "ERROR: "$*
+    fi
+    exit 1
+}
+
+# Helper function to run a command and append its output to a log. Aborts early
+# if the program file is not found.
+# The program is looked up on PATH first, then under api/ and training/
+# relative to the current directory.
+# Usage: run_command CMD ARG1 ARG2...
+run_command() {
+    local cmd
+    cmd=$(which "$1" || \
+              for d in api training; do
+                  which "$d/$1" && break
+              done) || err_exit "'$1' not found"
+    shift
+    tlog "[$(date)] ${cmd} $@"
+    # LOG_FILE may still be unset when this runs before parse_flags; fall back
+    # to /dev/null instead of aborting on an unbound variable under "set -u"
+    # (tlog and err_exit tolerate an unset LOG_FILE in the same way).
+    if ! "${cmd}" "$@" 2>&1 | tee -a "${LOG_FILE:-/dev/null}"; then
+        err_exit "Program $(basename "${cmd}") failed. Abort."
+    fi
+}
+
+# Check if all the given files exist, or exit otherwise.
+# Used to check required input files and produced output files in each phase.
+# Each argument is treated as exactly one path, so paths containing spaces
+# work when the caller quotes them.
+# Usage: check_file_readable FILE1 FILE2...
+check_file_readable() {
+    local file  # keep the loop variable out of the global scope
+    for file in "$@"; do
+        if [[ ! -r ${file} ]]; then
+            err_exit "${file} does not exist or is not readable"
+        fi
+    done
+}
+
+# Sets the named variable to given value. Aborts if the value is missing or
+# if it looks like a flag (starts with "--").
+# Usage: parse_value VAR_NAME VALUE
+parse_value() {
+    local val="${2:-}"
+    if [[ -z "$val" ]]; then
+        err_exit "Missing value for variable $1"
+    fi
+    if [[ ${val:0:2} == "--" ]]; then
+        err_exit "Invalid value $val passed for variable $1"
+    fi
+    # Assign without eval so that quotes, "$" or backticks inside the value are
+    # stored literally instead of being re-parsed by the shell.
+    printf -v "$1" '%s' "$val"
+}
+
+# Does simple command-line parsing and initialization.
+# Reads the flags from the global ARGV array, stores their values in the
+# corresponding global variables, validates the mandatory ones (--lang,
+# --langdata_dir, and --tessdata_dir or TESSDATA_PREFIX), creates OUTPUT_DIR
+# and a temporary training directory, and derives the langdata file paths.
+# Flag values are passed to parse_value quoted, so values containing spaces
+# (for example directory paths) are kept intact.
+parse_flags() {
+    local i=0
+    while test $i -lt ${#ARGV[@]}; do
+        local j=$((i+1))
+        case ${ARGV[$i]} in
+            --)
+                break;;
+            --fontlist)
+                # Collect every following argument up to the next flag (or an
+                # empty argument) as one font name each.
+                fn=0
+                FONTS=""
+                while test $j -lt ${#ARGV[@]}; do
+                    test -z "${ARGV[$j]}" && break
+                    [[ "${ARGV[$j]}" == --* ]] && break
+                    FONTS[$fn]="${ARGV[$j]}"
+                    fn=$((fn+1))
+                    j=$((j+1))
+                done
+                i=$((j-1)) ;;
+            --exposures)
+                # Collect the following arguments into a space-separated list.
+                exp=""
+                while test $j -lt ${#ARGV[@]}; do
+                    test -z "${ARGV[$j]}" && break
+                    [[ "${ARGV[$j]}" == --* ]] && break
+                    exp="$exp ${ARGV[$j]}"
+                    j=$((j+1))
+                done
+                parse_value "EXPOSURES" "$exp"
+                i=$((j-1)) ;;
+            --fonts_dir)
+                parse_value "FONTS_DIR" "${ARGV[$j]:-}"
+                i=$j ;;
+            --tmp_dir)
+                parse_value "TMP_DIR" "${ARGV[$j]:-}"
+                i=$j ;;
+            --lang)
+                parse_value "LANG_CODE" "${ARGV[$j]:-}"
+                i=$j ;;
+            --langdata_dir)
+                parse_value "LANGDATA_ROOT" "${ARGV[$j]:-}"
+                i=$j ;;
+            --maxpages)
+                parse_value "MAX_PAGES" "${ARGV[$j]:-}"
+                i=$j ;;
+            --ptsize)
+                parse_value "PT_SIZE" "${ARGV[$j]:-}"
+                i=$j ;;
+            --my_boxtiff_dir)
+                parse_value "MY_BOXTIFF_DIR" "${ARGV[$j]:-}"
+                i=$j ;;
+            --distort_image)
+                DISTORT_IMAGE=true ;;
+            --output_dir)
+                parse_value "OUTPUT_DIR" "${ARGV[$j]:-}"
+                i=$j ;;
+            --overwrite)
+                OVERWRITE=true ;;
+            --save_box_tiff)
+                SAVE_BOX_TIFF=true ;;
+            --linedata_only)
+                LINEDATA=true ;;
+            --extract_font_properties)
+                EXTRACT_FONT_PROPERTIES=true ;;
+            --noextract_font_properties)
+                EXTRACT_FONT_PROPERTIES=false ;;
+            --tessdata_dir)
+                parse_value "TESSDATA_DIR" "${ARGV[$j]:-}"
+                i=$j ;;
+            --training_text)
+                parse_value "TRAINING_TEXT" "${ARGV[$j]:-}"
+                i=$j ;;
+            --wordlist)
+                parse_value "WORDLIST_FILE" "${ARGV[$j]:-}"
+                i=$j ;;
+            --workspace_dir)
+                # Drop the temporary directories created at source time and
+                # keep the fontconfig cache inside the requested workspace.
+                rmdir "$FONT_CONFIG_CACHE"
+                rmdir "$WORKSPACE_DIR"
+                parse_value "WORKSPACE_DIR" "${ARGV[$j]:-}"
+                FONT_CONFIG_CACHE=$WORKSPACE_DIR/fc-cache
+                mkdir -p "$FONT_CONFIG_CACHE"
+                i=$j ;;
+            --xsize)
+                parse_value "X_SIZE" "${ARGV[$j]:-}"
+                i=$j ;;
+            *)
+                err_exit "Unrecognized argument ${ARGV[$i]}" ;;
+        esac
+        i=$((i+1))
+    done
+    if [[ -z ${LANG_CODE:-} ]]; then
+        err_exit "Need to specify a language --lang"
+    fi
+    if [[ -z ${LANGDATA_ROOT:-} ]]; then
+        err_exit "Need to specify path to language files --langdata_dir"
+    fi
+    if [[ -z ${TESSDATA_DIR:-} ]]; then
+        if [[ -z ${TESSDATA_PREFIX} ]]; then
+            err_exit "Need to specify a --tessdata_dir or have a "\
+        "TESSDATA_PREFIX variable defined in your environment"
+        else
+            TESSDATA_DIR="${TESSDATA_PREFIX}"
+        fi
+    fi
+    if [[ ! -d "${OUTPUT_DIR}" ]]; then
+        tlog "Creating new directory ${OUTPUT_DIR}"
+        mkdir -p "${OUTPUT_DIR}"
+    fi
+
+    # Location where intermediate files will be created.
+    TIMESTAMP=$(date +%Y-%m-%d)
+    if [[ -z ${TMP_DIR:-} ]]; then
+        TMP_DIR=$(mktemp -d -t ${LANG_CODE}-${TIMESTAMP}.XXX)
+    else
+        TMP_DIR=$(mktemp -d -p "${TMP_DIR}" -t ${LANG_CODE}-${TIMESTAMP}.XXX)
+    fi
+    TRAINING_DIR=${TMP_DIR}
+    # Location of log file for the whole run.
+    LOG_FILE=${TRAINING_DIR}/tesstrain.log
+
+    # Take training text and wordlist from the langdata directory if not
+    # specified in the command-line.
+    TRAINING_TEXT=${TRAINING_TEXT:-${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.training_text}
+    WORDLIST_FILE=${WORDLIST_FILE:-${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.wordlist}
+
+    WORD_BIGRAMS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.word.bigrams
+    NUMBERS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.numbers
+    PUNC_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.punc
+    BIGRAM_FREQS_FILE=${TRAINING_TEXT}.bigram_freqs
+    UNIGRAM_FREQS_FILE=${TRAINING_TEXT}.unigram_freqs
+    TRAIN_NGRAMS_FILE=${TRAINING_TEXT}.train_ngrams
+    GENERATE_DAWGS=1
+}
+
+# Exports FONT_CONFIG_CACHE and runs text2image once on a one-word sample
+# file, using the first requested font and FONT_CONFIG_CACHE as
+# --fontconfig_tmpdir.
+initialize_fontconfig() {
+    export FONT_CONFIG_CACHE
+    local probe=${FONT_CONFIG_CACHE}/sample_text.txt
+    echo "Text" >${probe}
+    run_command text2image \
+        --fonts_dir=${FONTS_DIR} \
+        --ptsize ${PT_SIZE} \
+        --font="${FONTS[0]}" \
+        --outputbase=${probe} \
+        --text=${probe} \
+        --fontconfig_tmpdir=${FONT_CONFIG_CACHE}
+}
+
+# Helper for phase_I_generate_image: renders TRAINING_TEXT with one font for
+# the current EXPOSURE. Self-contained so that several fonts can be rendered
+# in parallel.
+# Usage: generate_font_image FONT_NAME
+generate_font_image() {
+    local face="$1"
+    tlog "Rendering using ${face}"
+    # File-name friendly font name: spaces become underscores, commas go away.
+    local face_slug=$(echo ${face} | tr ' ' '_' | sed 's/,//g')
+    local out_prefix=${TRAINING_DIR}/${LANG_CODE}.${face_slug}.exp${EXPOSURE}
+
+    # Arguments shared by both text2image invocations below.
+    local t2i_args="--fontconfig_tmpdir=${FONT_CONFIG_CACHE}"
+    t2i_args="${t2i_args} --fonts_dir=${FONTS_DIR} --strip_unrenderable_words"
+    t2i_args="${t2i_args} --leading=${LEADING} --xsize=${X_SIZE}"
+    t2i_args="${t2i_args} --char_spacing=${CHAR_SPACING} --exposure=${EXPOSURE}"
+    t2i_args="${t2i_args} --outputbase=${out_prefix} --max_pages=${MAX_PAGES}"
+    if $DISTORT_IMAGE; then
+        t2i_args="${t2i_args} --distort_image --invert=false"
+    fi
+
+    # Fonts listed in VERTICAL_FONTS are rendered in vertical-upright mode.
+    local vertical_face
+    for vertical_face in "${VERTICAL_FONTS[@]}"; do
+      if [[ "${face}" == "${vertical_face}" ]]; then
+        t2i_args="${t2i_args} --writing_mode=vertical-upright "
+        break
+      fi
+    done
+
+    run_command text2image ${t2i_args} --font="${face}" --ptsize ${PT_SIZE} \
+        --text=${TRAINING_TEXT}  ${TEXT2IMAGE_EXTRA_ARGS:-}
+    check_file_readable ${out_prefix}.box ${out_prefix}.tif
+
+    # Font properties are extracted only on request and only when the
+    # train_ngrams file is readable.
+    if ! $EXTRACT_FONT_PROPERTIES || [[ ! -r ${TRAIN_NGRAMS_FILE} ]]; then
+        return 0
+    fi
+    tlog "Extracting font properties of ${face}"
+    run_command text2image ${t2i_args} --font="${face}" \
+        --ligatures=false --text=${TRAIN_NGRAMS_FILE} \
+        --only_extract_font_properties --ptsize=32
+    check_file_readable ${out_prefix}.fontinfo
+}
+
+# Phase I : Generate (I)mages from training text for each font.
+# Usage: phase_I_generate_image [PAR_FACTOR]
+#   PAR_FACTOR  maximum number of fonts rendered in parallel; anything that is
+#               not a positive number falls back to 1.
+phase_I_generate_image() {
+    local par_factor=${1:-}
+    if ! [[ "${par_factor}" -gt 0 ]]; then
+        par_factor=1
+    fi
+    tlog "\n=== Phase I: Generating training images ==="
+    if [[ -z ${TRAINING_TEXT:-} ]] || test ! -r "${TRAINING_TEXT}"; then
+        err_exit "Could not find training text file ${TRAINING_TEXT:-}"
+    fi
+    CHAR_SPACING="0.0"
+
+    for EXPOSURE in $EXPOSURES; do
+        if $EXTRACT_FONT_PROPERTIES && [[ -r ${BIGRAM_FREQS_FILE} ]]; then
+            # Parse .bigram_freqs file and compose a .train_ngrams file with text
+            # for tesseract to recognize during training. Take only the ngrams whose
+            # combined weight stays within p percent of all the bigrams in the
+            # language; p is 99 below (the original comment said 95%).
+            NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \
+                | awk '{s=s+$2}; END {print (s/100)*p}' p=99)
+            sort -rnk2 ${BIGRAM_FREQS_FILE} \
+                | awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \
+                x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE}
+            check_file_readable ${TRAIN_NGRAMS_FILE}
+        fi
+
+        # Render the fonts in background jobs, at most par_factor at a time.
+        local jobs=
+        trap "kill $$" INT
+        for font in "${FONTS[@]}"; do
+            sleep 1
+            test $(jobs -r | wc -l) -ge $par_factor && wait -n
+            generate_font_image "${font}" &
+            jobs="$jobs $!"
+        done
+        wait $jobs
+        # Check that each process was successful.
+        for font in "${FONTS[@]}"; do
+            local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
+            local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
+            check_file_readable ${outbase}.box ${outbase}.tif
+        done
+    done
+    if $SAVE_BOX_TIFF && ( ! $LINEDATA ) ; then
+        tlog "\n=== Saving box/tiff pairs for training data ==="
+        # The pairs are copied, not moved: the files stay in TRAINING_DIR.
+        for f in "${TRAINING_DIR}/${LANG_CODE}".*.box; do
+            tlog "Copying ${f} to ${OUTPUT_DIR}"
+            cp "${f}" "${OUTPUT_DIR}"
+        done
+        for f in "${TRAINING_DIR}/${LANG_CODE}".*.tif; do
+            tlog "Copying ${f} to ${OUTPUT_DIR}"
+            cp "${f}" "${OUTPUT_DIR}"
+        done
+    fi
+}
+
+# Phase UP : Generate (U)nicharset and (P)roperties file.
+# Builds UNICHARSET_FILE from every .box file in TRAINING_DIR, then lets
+# set_unicharset_properties rewrite it in place and write XHEIGHTS_FILE.
+phase_UP_generate_unicharset() {
+    tlog "\n=== Phase UP: Generating unicharset and unichar properties files ==="
+
+    local boxes=$(ls ${TRAINING_DIR}/*.box)
+    UNICHARSET_FILE="${TRAINING_DIR}/${LANG_CODE}.unicharset"
+    run_command unicharset_extractor \
+      --output_unicharset "${UNICHARSET_FILE}" \
+      --norm_mode "${NORM_MODE}" \
+      ${boxes}
+    check_file_readable ${UNICHARSET_FILE}
+
+    XHEIGHTS_FILE="${TRAINING_DIR}/${LANG_CODE}.xheights"
+    run_command set_unicharset_properties \
+        -U ${UNICHARSET_FILE} \
+        -O ${UNICHARSET_FILE} \
+        -X ${XHEIGHTS_FILE} \
+        --script_dir=${LANGDATA_ROOT}
+    check_file_readable ${XHEIGHTS_FILE}
+}
+
+# Phase D : Generate (D)awg files from unicharset file and wordlist files
+# Produces the word, frequent-word, punctuation, number and bigram DAWGs in
+# TRAINING_DIR; every DAWG except punctuation is built only when its input
+# list is non-empty. Does nothing when GENERATE_DAWGS is 0.
+phase_D_generate_dawg() {
+    tlog "\n=== Phase D: Generating Dawg files ==="
+
+    # Skip if requested
+    if [[ ${GENERATE_DAWGS} -eq 0 ]]; then
+      # The message used to reference an undefined ${phase_name}, which aborts
+      # the script under "set -u"; name the phase explicitly instead.
+      tlog "Skipping Phase D"
+      return
+    fi
+
+    # Output files
+    WORD_DAWG=${TRAINING_DIR}/${LANG_CODE}.word-dawg
+    FREQ_DAWG=${TRAINING_DIR}/${LANG_CODE}.freq-dawg
+    PUNC_DAWG=${TRAINING_DIR}/${LANG_CODE}.punc-dawg
+    NUMBER_DAWG=${TRAINING_DIR}/${LANG_CODE}.number-dawg
+    BIGRAM_DAWG=${TRAINING_DIR}/${LANG_CODE}.bigram-dawg
+
+    # Word DAWG
+    local freq_wordlist_file=${TRAINING_DIR}/${LANG_CODE}.wordlist.clean.freq
+    if [[ -s ${WORDLIST_FILE} ]]; then
+        tlog "Generating word Dawg"
+        check_file_readable ${UNICHARSET_FILE}
+        run_command wordlist2dawg -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \
+            ${UNICHARSET_FILE}
+        check_file_readable ${WORD_DAWG}
+
+        # The first FREQ_DAWG_SIZE lines of the wordlist feed the freq DAWG.
+        FREQ_DAWG_SIZE=100
+        head -n ${FREQ_DAWG_SIZE} ${WORDLIST_FILE} > ${freq_wordlist_file}
+    fi
+
+    # Freq-word DAWG
+    if [[ -s ${freq_wordlist_file} ]]; then
+        check_file_readable ${UNICHARSET_FILE}
+        tlog "Generating frequent-word Dawg"
+        run_command wordlist2dawg  -r 1 ${freq_wordlist_file} \
+            ${FREQ_DAWG} ${UNICHARSET_FILE}
+        check_file_readable ${FREQ_DAWG}
+    fi
+
+    # Punctuation DAWG
+    # -r arguments to wordlist2dawg denote RTL reverse policy
+    # (see Trie::RTLReversePolicy enum in tesseract/src/dict/trie.h).
+    # We specify 0/RRP_DO_NO_REVERSE when generating number DAWG,
+    # 1/RRP_REVERSE_IF_HAS_RTL for freq, word and bigram DAWGS, and for the
+    # punctuation DAWG 0, or 2/RRP_FORCE_REVERSE when LANG_IS_RTL is "1".
+    local punc_reverse_policy=0;
+    if [[ "${LANG_IS_RTL}" == "1" ]]; then
+      punc_reverse_policy=2
+    fi
+    # Fall back to the shared punctuation list when the language has none.
+    if [[ ! -s ${PUNC_FILE} ]]; then
+        PUNC_FILE="${LANGDATA_ROOT}/common.punc"
+    fi
+    check_file_readable ${PUNC_FILE}
+    run_command wordlist2dawg -r ${punc_reverse_policy} \
+        ${PUNC_FILE} ${PUNC_DAWG} ${UNICHARSET_FILE}
+    check_file_readable ${PUNC_DAWG}
+
+    # Numbers DAWG
+    if [[ -s ${NUMBERS_FILE} ]]; then
+        run_command wordlist2dawg -r 0 \
+            ${NUMBERS_FILE} ${NUMBER_DAWG} ${UNICHARSET_FILE}
+        check_file_readable ${NUMBER_DAWG}
+    fi
+
+    # Bigram dawg
+    if [[ -s ${WORD_BIGRAMS_FILE} ]]; then
+        run_command wordlist2dawg -r 1 \
+            ${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE}
+        check_file_readable ${BIGRAM_DAWG}
+    fi
+}
+
+# Phase E : (E)xtract .tr feature files from .tif/.box files
+# Runs tesseract on every ${TRAINING_DIR}/*.exp<exposure>.tif image and checks
+# that an output file with the requested extension exists for each of them.
+# Usage: phase_E_extract_features BOX_CONFIG PAR_FACTOR EXT
+#   BOX_CONFIG  passed to tesseract after the output base name
+#   PAR_FACTOR  maximum number of tesseract jobs run in parallel; anything
+#               that is not a positive number falls back to 1
+#   EXT         extension of the output file expected next to each image
+phase_E_extract_features() {
+    local box_config=$1
+    local par_factor=$2
+    local ext=$3
+    if ! [[ "${par_factor}" -gt 0 ]]; then
+        par_factor=1
+    fi
+    tlog "\n=== Phase E: Generating ${ext} files ==="
+
+    # Collect the images of every exposure into one space-separated list.
+    local img_files=""
+    for exposure in ${EXPOSURES}; do
+        img_files=${img_files}' '$(ls ${TRAINING_DIR}/*.exp${exposure}.tif)
+    done
+
+    # Use any available language-specific configs.
+    local config=""
+    if [[ -r ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config ]]; then
+        config=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config
+    fi
+
+    # Point TESSDATA_PREFIX at TESSDATA_DIR for the tesseract runs only; the
+    # previous value is restored once all jobs have finished.
+    OLD_TESSDATA_PREFIX=${TESSDATA_PREFIX}
+    export TESSDATA_PREFIX=${TESSDATA_DIR}
+    tlog "Using TESSDATA_PREFIX=${TESSDATA_PREFIX}"
+    local jobs=
+    trap "kill $$" INT
+    for img_file in ${img_files}; do
+        # Wait for a free slot when par_factor jobs are already running.
+        test $(jobs -r | wc -l) -ge $par_factor && wait -n
+        # The language is passed explicitly with -l ${LANG_CODE}.
+        run_command tesseract -l ${LANG_CODE} ${img_file} ${img_file%.*} \
+            ${box_config} ${config} &
+        jobs="$jobs $!"
+    done
+    wait $jobs
+    export TESSDATA_PREFIX=${OLD_TESSDATA_PREFIX}
+    # Check that all the output files were produced.
+    for img_file in ${img_files}; do
+        check_file_readable "${img_file%.*}.${ext}"
+    done
+}
+
+# Phase C : (C)luster feature prototypes in .tr into normproto file (cnTraining)
+# Runs cntraining on every .tr file in TRAINING_DIR and moves the resulting
+# normproto file to the given destination.
+# Usage: phase_C_cluster_prototypes ${TRAINING_DIR}/${LANG_CODE}.normproto
+phase_C_cluster_prototypes() {
+    tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ==="
+    local out_normproto=$1
+    local produced=${TRAINING_DIR}/normproto
+
+    run_command cntraining \
+        -D "${TRAINING_DIR}/" \
+        $(ls ${TRAINING_DIR}/*.tr)
+
+    check_file_readable ${produced}
+    mv ${produced} ${out_normproto}
+}
+
+# Phase S : (S)hape clustering
+# Runs shapeclustering on every .tr file in TRAINING_DIR; a no-op unless
+# RUN_SHAPE_CLUSTERING is true.
+phase_S_cluster_shapes() {
+    if ! $RUN_SHAPE_CLUSTERING; then
+        tlog "\n=== Shape Clustering disabled ==="
+        return
+    fi
+    check_file_readable ${LANGDATA_ROOT}/font_properties
+    local prop_flags="-F ${LANGDATA_ROOT}/font_properties"
+    # Pass the xheights file along only when it is readable and non-empty.
+    local xheights=${TRAINING_DIR}/${LANG_CODE}.xheights
+    if [[ -r ${xheights} && -s ${xheights} ]]; then
+        prop_flags="${prop_flags} -X ${xheights}"
+    fi
+
+    run_command shapeclustering \
+        -D "${TRAINING_DIR}/" \
+        -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
+        -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
+        ${prop_flags} \
+        $(ls ${TRAINING_DIR}/*.tr)
+    check_file_readable \
+        ${TRAINING_DIR}/shapetable \
+        ${TRAINING_DIR}/${LANG_CODE}.mfunicharset
+}
+
+# Phase M : Clustering microfeatures (mfTraining)
+# Runs mftraining on every .tr file in TRAINING_DIR, then renames its outputs
+# (inttemp, shapetable, pffmtable) with the LANG_CODE prefix and replaces the
+# unicharset with the generated mfunicharset.
+phase_M_cluster_microfeatures() {
+    tlog "\n=== Phase M : Clustering microfeatures (mfTraining) ==="
+
+    check_file_readable ${LANGDATA_ROOT}/font_properties
+    # Declared local, as in phase_S_cluster_shapes, so it does not leak into
+    # the global scope.
+    local font_props="-F ${LANGDATA_ROOT}/font_properties"
+    # Pass the xheights file along only when it is readable and non-empty.
+    if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] && \
+       [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then
+        font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
+    fi
+
+    run_command mftraining \
+        -D "${TRAINING_DIR}/" \
+        -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
+        -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
+        ${font_props} \
+        $(ls ${TRAINING_DIR}/*.tr)
+    check_file_readable ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/shapetable \
+        ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.mfunicharset
+    mv ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/${LANG_CODE}.inttemp
+    mv ${TRAINING_DIR}/shapetable ${TRAINING_DIR}/${LANG_CODE}.shapetable
+    mv ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.pffmtable
+    mv ${TRAINING_DIR}/${LANG_CODE}.mfunicharset ${TRAINING_DIR}/${LANG_CODE}.unicharset
+}
+
+# Phase B : copies a manually created unicharambigs file from the langdata
+# directory into TRAINING_DIR when one is readable; otherwise only logs that
+# none was found.
+phase_B_generate_ambiguities() {
+  tlog "\n=== Phase B : ambiguities training ==="
+
+  # Check for manually created ambiguities data.
+  local ambigs_src=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs
+  if [[ ! -r ${ambigs_src} ]]; then
+      tlog "No unicharambigs file found!"
+      # TODO: Add support for generating ambiguities automatically.
+      return
+  fi
+
+  local ambigs_dst=${TRAINING_DIR}/${LANG_CODE}.unicharambigs
+  tlog "Found file ${ambigs_src}"
+  cp ${ambigs_src} ${ambigs_dst}
+  # Make it writable, as it may be read-only in the client.
+  chmod u+w ${ambigs_dst}
+}
+
+# Builds the starter traineddata with combine_lang_model, moves the generated
+# .lstmf files (and, with SAVE_BOX_TIFF, the box/tiff pairs) into OUTPUT_DIR
+# and writes the list of .lstmf files to ${LANG_CODE}.training_files.txt.
+make__lstmdata() {
+  tlog "\n=== Constructing LSTM training data ==="
+  local lang_prefix="${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}"
+  local lang_is_rtl=""
+  if [[ "${LANG_IS_RTL}" == "1" ]]; then
+    lang_is_rtl="--lang_is_rtl"
+  fi
+  local pass_through=""
+  if [[ "${NORM_MODE}" -ge "2" ]]; then
+    pass_through="--pass_through_recoder"
+  fi
+
+  # Build the starter traineddata from the inputs.
+  # The two optional flags are expanded unquoted on purpose: when empty they
+  # vanish instead of reaching the program as empty-string arguments.
+  run_command combine_lang_model \
+    --input_unicharset "${TRAINING_DIR}/${LANG_CODE}.unicharset" \
+    --script_dir "${LANGDATA_ROOT}" \
+    --words "${lang_prefix}.wordlist" \
+    --numbers "${lang_prefix}.numbers" \
+    --puncs "${lang_prefix}.punc" \
+    --output_dir "${OUTPUT_DIR}" --lang "${LANG_CODE}" \
+    ${pass_through} ${lang_is_rtl}
+
+  if $SAVE_BOX_TIFF; then
+    tlog "\n=== Saving box/tiff pairs for training data ==="
+    for f in "${TRAINING_DIR}/${LANG_CODE}".*.box; do
+      tlog "Moving ${f} to ${OUTPUT_DIR}"
+      mv "${f}" "${OUTPUT_DIR}"
+    done
+    for f in "${TRAINING_DIR}/${LANG_CODE}".*.tif; do
+      tlog "Moving ${f} to ${OUTPUT_DIR}"
+      mv "${f}" "${OUTPUT_DIR}"
+    done
+  fi
+
+  tlog "\n=== Moving lstmf files for training data ==="
+  for f in "${TRAINING_DIR}/${LANG_CODE}".*.lstmf; do
+    tlog "Moving ${f} to ${OUTPUT_DIR}"
+    mv "${f}" "${OUTPUT_DIR}"
+  done
+  local lstm_list="${OUTPUT_DIR}/${LANG_CODE}.training_files.txt"
+  ls -1 "${OUTPUT_DIR}/${LANG_CODE}".*.lstmf > "${lstm_list}"
+}
+
+# Combines the files in TRAINING_DIR into ${LANG_CODE}.traineddata and copies
+# the result to OUTPUT_DIR. Refuses to replace an existing destination file
+# unless OVERWRITE is true.
+make__traineddata() {
+  tlog "\n=== Making final traineddata file ==="
+  local lang_prefix=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}
+
+  # Combine available files for this language from the langdata dir.
+  if [[ -r ${lang_prefix}.config ]]; then
+    tlog "Copying ${lang_prefix}.config to ${TRAINING_DIR}"
+    cp ${lang_prefix}.config ${TRAINING_DIR}
+    chmod u+w ${TRAINING_DIR}/${LANG_CODE}.config
+  fi
+  if [[ -r ${lang_prefix}.params-model ]]; then
+    tlog "Copying ${lang_prefix}.params-model to ${TRAINING_DIR}"
+    cp ${lang_prefix}.params-model ${TRAINING_DIR}
+    chmod u+w ${TRAINING_DIR}/${LANG_CODE}.params-model
+  fi
+
+  # Compose the traineddata file.
+  run_command combine_tessdata ${TRAINING_DIR}/${LANG_CODE}.
+
+  # Copy it to the output dir, overwriting only if allowed by the cmdline flag.
+  local destfile=${OUTPUT_DIR}/${LANG_CODE}.traineddata;
+  if [[ -f ${destfile} ]] && ! $OVERWRITE; then
+      err_exit "File ${destfile} exists and no --overwrite specified";
+  fi
+  # The file is copied (cp -f), so the log message says so.
+  tlog "Copying ${TRAINING_DIR}/${LANG_CODE}.traineddata to ${OUTPUT_DIR}"
+  cp -f ${TRAINING_DIR}/${LANG_CODE}.traineddata ${destfile}
+}
--- a/train.sh
+++ b/train.sh
@ -1,5 +0,0 @@
-
-
-tesstrain.sh --fonts_dir $FONTS_DIR --fontlist "GOST Type A" --lang rus \
-  --linedata_only --noextract_font_properties --langdata_dir /tesseract-4.1.1/langdata \
-  --tessdata_dir $TESSDATA_PREFIX --output_dir ./
--- a/training/example1.jpg
+++ b/training/example1.jpg