diff options
Diffstat (limited to 'debian/ocr-scripts/xsane2tess3.sh')
-rwxr-xr-x | debian/ocr-scripts/xsane2tess3.sh | 145 |
1 files changed, 145 insertions, 0 deletions
diff --git a/debian/ocr-scripts/xsane2tess3.sh b/debian/ocr-scripts/xsane2tess3.sh new file mode 100755 index 0000000..14683bf --- /dev/null +++ b/debian/ocr-scripts/xsane2tess3.sh @@ -0,0 +1,145 @@ +#!/bin/bash +# xsane2tess3 - tesseractOCR directly from xsane +# Copyright (C) 2012 Heinrich Schwietering +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# +############################################################################## +# +# xsane2tess3 0.1 +# +# *** tesseract made simple *** +# +# +############################################################################## +# +# xsane2tess is a TesseractOCR 3.0x wrapper to be able to use tesseract with xsane +# +# +# +TEMP_DIR=/tmp/ # folder for temporary files (all files) +ERRORLOG="xsane2tess3.log" # file where STDERR goes + +if [[ -z "$1" ]] + then + echo "Usage: $0 [OPTIONS] + + xsane2tess3 scans images with TesseractOCR + and outputs the text in a file or as hocr/html document + + OPTIONS: + -i <file1> define input file (any image-format supported) + -o <file2> define output-file (*.txt/hOCR) + -l <lang> define language-data tesseract should use + -e <config> filename for tesseract + -f </path/to/Final> name and path fot multiscan document + + Progress- & error-messages will be stored in this logfile: + $TEMP_DIR$ERRORLOG + + xsane2tess depends on + - XSane, http://www.xsane.org/ + - TesseractOCR, http://code.google.com/p/tesseract-ocr/ + + Some coding was stolen from 'ocube' + http://www.geocities.com/thierryguy/ocube.html + + This adaption is based on xsane2tess + http://doc.ubuntu-fr.org/xsane2tess, + + Hints always welcome! heinrich (dot) schwietering (at) gmx (dot) de + +" + exit +fi + + +# get options... +while getopts ":i:o:l:c:f:" OPTION + do + case $OPTION in + i ) # input filename (with path) + FILE_PATH="$OPTARG" + ;; + o ) # output filename + FILE_OUT="$OPTARG" + ;; + l ) # Language-selection + LANG="$OPTARG" + ;; + c ) # use hocr configfile + CONFILE="$OPTARG" + ;; + f ) # final name for multiscan ocr file + FINAL="$OPTARG" + ;; + esac +done + +# redirect STDOUT to FILE_OUT +exec 1>>$FILE_OUT + +# redirect STDERR to ERRORLOG +exec 2>>$TEMP_DIR$ERRORLOG + +# strip path from FILE_PATH, use filename only +IN_FILE="${FILE_PATH##*/.*}" + +echo "~~~+++~~~~+++~~~" 1>&2 + +# start OCR (tesseract expands output with *.txt/.html) +tesseract "$IN_FILE" "$FILE_OUT" -l "$LANG" "$CONFILE" 1>&2 +echo Tesseract used with "$LANG" "$CONFILE" 1>&2 + +{ if [[ "$FINAL" != '' ]] + then + { if [[ "$CONFILE" == "" ]] + then + # check if final txt file is already existing + { if [[ ! -a "$FINAL".txt ]] + then +# start final ocr file + cp "$FILE_OUT".txt "$FINAL".txt 1>&2 + echo "$FINAL.txt started" 1>&2 + else + mv "$FINAL".txt "$FINAL".new.txt + cat "$FINAL".new.txt "$FILE_OUT".txt > "$FINAL".txt + echo "$FILE_OUT.txt added to $FINAL.txt" 1>&2 + rm "$FINAL".new.txt + fi } + else +# check if final hocr file is already existing + { if [[ ! -a "$FINAL".html ]] + then +# start final ocr file + cp "$FILE_OUT.html" "$FINAL".html 1>&2 + echo "$FINAL.html started" 1>&2 + else + mv "$FINAL".html "$FINAL".new.html + cat "$FINAL".new.html "$FILE_OUT".html > "$FINAL".html + echo "$FILE_OUT.html added to $FINAL.html" 1>&2 + rm "$FINAL".new.html + fi } + fi } + rm $FILE_OUT + else +# STDOUT scanned text => FILE_OUT + cat "$FILE_OUT".* + +fi } + +rm $FILE_OUT.* + +echo "~~~+++~~~~+++~~~"$(date +%c) 1>&2 |