#!/bin/bash
# xsane2tess3 - tesseractOCR directly from xsane
# Copyright (C) 2012 Heinrich Schwietering
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
##############################################################################
#
#                              xsane2tess3 0.1
#
#                       *** tesseract made simple ***
#
##############################################################################
#
# xsane2tess is a TesseractOCR 3.0x wrapper to be able to use tesseract
# with xsane.
#
# Usage: xsane2tess3 -i <image> -o <output> [-l <lang>] [-c <config>]
#                    [-f </path/to/final>]
# Called without arguments it prints the help text below and exits.
#

TEMP_DIR=/tmp/              # folder for temporary files (all files)
ERRORLOG="xsane2tess3.log"  # file where STDERR goes

# No (or an empty) first argument: there is nothing to scan, so show the help
# text on STDOUT and leave with a success status.
if [[ -z "${1-}" ]]; then
  cat <<EOF
Usage: $0 [OPTIONS]

  xsane2tess3 scans images with TesseractOCR and outputs the text in a file
  or as hocr/html document

  OPTIONS:
    -i <file1>           define input file (any image-format supported)
    -o <file2>           define output-file (*.txt/hOCR)
    -l <lang>            define language-data tesseract should use
    -c <config>          filename for tesseract
    -f </path/to/Final>  name and path for multiscan document

  Progress- & error-messages will be stored in this logfile:
     $TEMP_DIR$ERRORLOG

  xsane2tess depends on
    - XSane, http://www.xsane.org/
    - TesseractOCR, http://code.google.com/p/tesseract-ocr/

  Some coding was stolen from 'ocube'
  http://www.geocities.com/thierryguy/ocube.html

  This adaption is based on xsane2tess
  http://doc.ubuntu-fr.org/xsane2tess,

  Hints always welcome! heinrich (dot) schwietering (at) gmx (dot) de

EOF
  exit 0
fi
#######################################
# Add one OCR result page to the multiscan document.
# The document is created from the page when it does not exist yet,
# otherwise the page is appended to its end.
# Arguments: $1 - page file written by tesseract
#            $2 - multiscan document to start or extend
# Outputs:   one progress line on STDERR
# Returns:   0 on success, the status of the failing cp/cat otherwise
#######################################
append_page() {
  local page=$1 document=$2

  if [[ ! -e "$document" ]]; then
    # start final ocr file
    cp -- "$page" "$document" || return
    printf '%s started\n' "$document" >&2
  else
    # Appending in place yields the same document as moving it aside,
    # concatenating and deleting the copy, without the temporary file.
    cat -- "$page" >>"$document" || return
    printf '%s added to %s\n' "$page" "$document" >&2
  fi
}

#######################################
# Assemble the tesseract command line.
# Optional parts are only added when they are non-empty, so tesseract never
# receives an empty string as language or config name.
# Globals:   TESS_CMD (written) - array holding the command and its arguments
# Arguments: $1 - input image, passed on unchanged (including its path)
#            $2 - output base name (tesseract appends the extension itself)
#            $3 - language, may be empty
#            $4 - config name, may be empty
# Returns:   0
#######################################
build_tesseract_cmd() {
  local image=$1 out_base=$2 lang=$3 config=$4

  TESS_CMD=(tesseract "$image" "$out_base")
  if [[ -n "$lang" ]]; then
    TESS_CMD+=(-l "$lang")
  fi
  if [[ -n "$config" ]]; then
    TESS_CMD+=("$config")
  fi
  return 0
}

#######################################
# Parse the options, run tesseract and deliver the recognised text.
# Without -f the text is written to the -o file; with -f it is collected in
# <final>.txt (or <final>.html when a config was given) and the -o file is
# removed.
# Globals:   TEMP_DIR, ERRORLOG (read) - location of the log file
#            TESS_CMD (written via build_tesseract_cmd)
# Arguments: the script's command line (-i -o -l -c -f)
# Outputs:   recognised text to the -o file, progress/errors to the log file
# Returns:   exits 2 on missing -i/-o, 1 if the output file cannot be opened,
#            tesseract's status if the OCR run fails
#######################################
main() {
  local image='' out_file='' ocr_lang='' config='' final=''
  local opt ext status
  local OPTIND=1

  # get options...
  while getopts ':i:o:l:c:f:' opt; do
    case "$opt" in
      i) image=$OPTARG ;;     # input filename (with path)
      o) out_file=$OPTARG ;;  # output filename
      # Kept in a local variable: assigning to LANG would change the locale
      # seen by tesseract and date.
      l) ocr_lang=$OPTARG ;;  # language selection
      c) config=$OPTARG ;;    # use hocr configfile
      f) final=$OPTARG ;;     # final name for multiscan ocr file
      :)
        printf '%s: option -%s requires an argument\n' "$0" "$OPTARG" >&2
        ;;
      \?)
        # Unknown options are still skipped so existing XSane setups that
        # pass extra flags keep working.
        printf '%s: ignoring unknown option -%s\n' "$0" "$OPTARG" >&2
        ;;
    esac
  done

  if [[ -z "$image" || -z "$out_file" ]]; then
    printf '%s: both -i <input> and -o <output> are required\n' "$0" >&2
    exit 2
  fi

  # redirect STDOUT to the output file
  exec 1>>"$out_file" || exit 1

  # redirect STDERR to the log file
  exec 2>>"${TEMP_DIR}${ERRORLOG}"

  printf '%s\n' "~~~+++~~~~+++~~~" >&2

  # start OCR (tesseract expands output with *.txt/.html)
  build_tesseract_cmd "$image" "$out_file" "$ocr_lang" "$config"
  "${TESS_CMD[@]}" 1>&2
  status=$?
  printf 'Tesseract used with %s %s\n' "$ocr_lang" "$config" >&2

  if ((status != 0)); then
    printf 'tesseract failed with status %d\n' "$status" >&2
    # drop partial results; in multiscan mode the -o file is never kept
    rm -f -- "$out_file".*
    if [[ -n "$final" ]]; then
      rm -f -- "$out_file"
    fi
    exit "$status"
  fi

  if [[ -n "$final" ]]; then
    # plain text pages unless a config (hocr) was requested
    ext=txt
    if [[ -n "$config" ]]; then
      ext=html
    fi
    append_page "$out_file.$ext" "$final.$ext"
    rm -- "$out_file"
  else
    # STDOUT scanned text => output file
    cat -- "$out_file".*
  fi

  rm -- "$out_file".*
  printf '~~~+++~~~~+++~~~%s\n' "$(date +%c)" >&2
}

# Nothing to process when the script is called without arguments.
if (($# > 0)); then
  main "$@"
fi