tesseract-box/install.sh
Nikita Orlov c5b51f1c3f записал рабочий процесс обучения
удалил лишние файлы
доработал провижн и вынес установку шрифтов в отдельный шаг
2020-02-13 19:56:11 +03:00

63 lines
2.7 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Build Tesseract 4.1.1 from its source tree and install it, including the
# training targets, into a per-user prefix under $HOME/local.
cd /tesseract-4.1.1 || exit 1

# Layout of the install prefix.
VLOC=$HOME/local
VBIN=$VLOC/bin
VINC=$VLOC/include
VLIB=$VLOC/lib

# The pkgconfig directory is created under lib/ so that it matches
# PKG_CONFIG_PATH below (it was previously created under bin/).
mkdir -p "$VINC" "$VBIN" "$VLIB/pkgconfig" || exit 1

export PKG_CONFIG_PATH=$VLIB/pkgconfig
export LD_LIBRARY_PATH=$VLIB

# Every step depends on the previous one, so stop at the first failure
# instead of carrying on with a half-built tree.
./autogen.sh || exit 1
LIBLEPT_HEADERSDIR="$VINC" ./configure --prefix="$VLOC" --with-extra-libraries="$VLIB" || exit 1
make || exit 1
make install || exit 1
make training || exit 1
make training-install || exit 1
# Download the Russian model from tessdata_best into the source tree's
# tessdata directory, which is also exported as TESSDATA_PREFIX.
export TESSDATA_PREFIX=/tesseract-4.1.1/tessdata
cd "$TESSDATA_PREFIX" || exit 1
# Abort if the model cannot be fetched rather than continuing without it.
wget https://github.com/tesseract-ocr/tessdata_best/raw/master/rus.traineddata || exit 1
# Download langdata: shared files go to /tesseract-4.1.1/langdata and the
# Russian-specific files to /tesseract-4.1.1/langdata/rus.
LANGDATA_URL=https://raw.githubusercontent.com/tesseract-ocr/langdata_lstm/master

# Fetch files from the langdata_lstm repository into the current directory.
# Arguments: $1 - path prefix inside the repository ('' or e.g. 'rus/')
#            $2.. - file names to download, in order
# A failed download is reported on stderr but does not stop the remaining
# downloads (the original script also kept going after a failed wget).
fetch_langdata() {
  local prefix=$1 name
  shift
  for name in "$@"; do
    wget "$LANGDATA_URL/$prefix$name" ||
      printf 'warning: failed to download %s%s\n' "$prefix" "$name" >&2
  done
}

# -p keeps re-provisioning from failing when the directories already exist.
mkdir -p /tesseract-4.1.1/langdata/rus || exit 1

# A failed cd would make wget drop the files into the wrong directory.
cd /tesseract-4.1.1/langdata || exit 1
fetch_langdata '' \
  radical-stroke.txt \
  common.punc \
  font_properties \
  Latin.unicharset \
  Latin.xheights \
  Cyrillic.unicharset \
  Cyrillic.xheights \
  desired_bigrams.txt \
  common.unicharambigs \
  forbidden_characters_default

cd rus || exit 1
fetch_langdata rus/ \
  rus.training_text \
  rus.punc \
  rus.numbers \
  rus.wordlist \
  desired_characters \
  okfonts.txt \
  rus.singles_text \
  rus.unicharambigs \
  rus.unicharset
# Persist the build environment by appending export lines to ~/.bashrc so
# that later shells see the installed binaries, libraries and tessdata.

# Print the export lines on stdout.
# Globals: VBIN, PKG_CONFIG_PATH, LD_LIBRARY_PATH, TESSDATA_PREFIX (read)
# $PATH is written literally (single-quoted format string, intentional) so it
# is expanded when .bashrc is sourced instead of freezing the provisioning
# shell's PATH into the file; the other values are expanded now.
print_env_exports() {
  printf 'export PATH=$PATH:%s\n' "$VBIN"
  printf 'export PKG_CONFIG_PATH=%s\n' "$PKG_CONFIG_PATH"
  printf 'export LD_LIBRARY_PATH=%s\n' "$LD_LIBRARY_PATH"
  printf 'export TESSDATA_PREFIX=%s\n' "$TESSDATA_PREFIX"
}

print_env_exports >> "$HOME"/.bashrc
# Work around the Phase E problem in tesstrain_utils.sh: keep the installed
# script as a .bak backup and replace it with the patched copy from /vagrant.
mv -v -- "$VBIN/tesstrain_utils.sh" "$VBIN/tesstrain_utils.sh.bak"
cp -v -- /vagrant/tesstrain_utils.sh "$VBIN/tesstrain_utils.sh"