diff options
author | Jörg Frings-Fürst <debian@jff-webhosting.net> | 2015-11-26 22:16:11 +0100 |
---|---|---|
committer | Jörg Frings-Fürst <debian@jff-webhosting.net> | 2015-11-26 22:16:11 +0100 |
commit | 8d7557a5f5b0006448ddc6c29a3cfa610008adf0 (patch) | |
tree | 8c6b626d3fa94c07db7b226173698fb35e8e60b4 /debian/ocr-scripts | |
parent | 665bbcae09168c39c73117e04c5f14d09e25b50c (diff) |
Add some more ocr commandline tools to suggests
Diffstat (limited to 'debian/ocr-scripts')
-rwxr-xr-x | debian/ocr-scripts/xsane2cunei.sh | 84 | ||||
-rwxr-xr-x | debian/ocr-scripts/xsane2ocrad.sh | 87 | ||||
-rwxr-xr-x | debian/ocr-scripts/xsane2tess3.sh | 145 |
3 files changed, 316 insertions, 0 deletions
diff --git a/debian/ocr-scripts/xsane2cunei.sh b/debian/ocr-scripts/xsane2cunei.sh
new file mode 100755
index 0000000..b704d52
--- /dev/null
+++ b/debian/ocr-scripts/xsane2cunei.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+#
+# ###############################################################################
+# #                             xsane2cunei 0.3                                 #
+# #                      *** cuneiform made simple ***                          #
+# ###############################################################################
+#
+# xsane2cunei is a wrapper to be able to use Cuneiform-Linux with XSane.
+# The image to read is given with -i, the result file with -o.
+#
+TEMP_DIR=~/tmp/                # folder for temporary files
+ERRORLOG="xsane2cunei.log"     # file where STDERR goes
+
+if [[ -z "$1" ]]; then
+  echo "Usage: $0 [OPTIONS]
+
+  xsane2cunei scans image files with XSane,
+  recognizes the text using cuneiform-linux
+  and outputs the text in a file.
+
+  OPTIONS:
+    -i <file1>         define input file (any image-format supported)
+    -o <file2>         define output file (txt, html, hocr, rtf)
+    -l <language>      define the language used for recognition
+    -f <format>        define the format used for output
+    -e <extraoptions>  optional: dotmatrix, fax, singlecolumn
+
+  Progress- & error-messages will be stored in this logfile:
+  $TEMP_DIR$ERRORLOG
+
+  xsane2cunei depends on
+    - XSane            http://www.xsane.org/
+    - libmagick-++dev  http://www.imagemagick.org/
+    - cuneiform-linux  https://launchpad.net/cuneiform-linux Cuneiform-Linux
+
+  Some coding was stolen from 'ocube'
+  http://www.geocities.com/thierryguy/ocube.html
+
+  This Cuneiform adaption is based on xsane2tess
+  http://doc.ubuntu-fr.org/xsane2tess,
+
+  Hints always welcome! heinrich (dot) schwietering (at) gmx (dot) de
+"
+  exit
+fi
+
+# get options...
+while getopts ":i:o:l:f:e:" OPTION; do
+  case "$OPTION" in
+    i) FILE_PATH="$OPTARG" ;;  # input filename (with path)
+    o) FILE_OUT="$OPTARG" ;;   # output filename
+    l) LANGUAGE="$OPTARG" ;;   # recognition language
+    f) FORMAT="$OPTARG" ;;     # output format
+    e) EXTRA="$OPTARG" ;;      # extra option: dotmatrix, fax or singlecolumn
+  esac
+done
+
+# Both file names are mandatory; without them every later step would fail.
+if [[ -z "$FILE_PATH" || -z "$FILE_OUT" ]]; then
+  echo "$0: -i <input file> and -o <output file> are required" >&2
+  exit 2
+fi
+
+# The log folder is not guaranteed to exist, so create it before redirecting.
+mkdir -p -- "$TEMP_DIR" || exit 1
+
+# redirect STDOUT to FILE_OUT
+exec 1>>"$FILE_OUT"
+
+# redirect STDERR to ERRORLOG
+exec 2>>"$TEMP_DIR$ERRORLOG"
+
+# Hand over only the options that were given: an empty -l/-f value or a bare
+# "--" (from an empty -e) would reach cuneiform as a meaningless argument.
+OCR_ARGS=()
+[[ -n "$LANGUAGE" ]] && OCR_ARGS+=(-l "$LANGUAGE")
+[[ -n "$FORMAT" ]] && OCR_ARGS+=(-f "$FORMAT")
+OCR_ARGS+=(-o "$FILE_OUT")
+[[ -n "$EXTRA" ]] && OCR_ARGS+=("--$EXTRA")
+
+# start OCR; the image is passed with its full path. The former
+# "${FILE_PATH##*/.*}" left ordinary paths untouched but reduced any path
+# containing "/." (a hidden directory) to an empty string.
+cuneiform "${OCR_ARGS[@]}" "$FILE_PATH" 1>&2
diff --git a/debian/ocr-scripts/xsane2ocrad.sh b/debian/ocr-scripts/xsane2ocrad.sh
new file mode 100755
index 0000000..0e5fb13
--- /dev/null
+++ b/debian/ocr-scripts/xsane2ocrad.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+# xsane2ocrad - ocr with ocrad directly from xsane
+# Copyright (C) 2012 Heinrich Schwietering
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+################################################################################
+#                               xsane2ocrad 0.1                                #
+#                          *** ocrad made simple ***                           #
+################################################################################
+#
+# xsane2ocrad is a wrapper to use Ocrad with XSane
+#
+TEMP_DIR=/tmp/                 # folder for temporary files
+ERRORLOG="xsane2ocrad.log"     # file where STDERR goes
+
+if [[ -z "$1" ]]; then
+  echo "Usage: $0 [OPTIONS]
+
+  xsane2ocrad scans image files with XSane,
+  recognizes the text using ocrad
+  and outputs the text in a file.
+
+  OPTIONS:
+    -i <file1>    define input file (any image-format supported)
+    -o <file2>    define output file (txt, html, hocr, rtf)
+    -e <options>  optional, all ocrad-Options, use quotes
+
+  Progress- & error-messages will be stored in this logfile:
+  $TEMP_DIR$ERRORLOG
+
+  xsane2ocrad depends on
+    - XSane, http://www.xsane.org/
+    - ocrad, http://www.gnu.org/software/ocrad/
+
+  Some coding was stolen from 'ocube'
+  http://www.geocities.com/thierryguy/ocube.html
+
+  This ocrad adaption is based on xsane2tess
+  http://doc.ubuntu-fr.org/xsane2tess,
+
+  Hints always welcome! heinrich (dot) schwietering (at) gmx (dot) de
+"
+  exit
+fi
+
+# get options...
+while getopts ":i:o:e:" OPTION; do
+  case "$OPTION" in
+    i) FILE_PATH="$OPTARG" ;;  # input filename (with path)
+    o) FILE_OUT="$OPTARG" ;;   # output filename
+    e) EXTRA="$OPTARG" ;;      # extra options, handed over as one string
+  esac
+done
+
+# Both file names are mandatory; ocrad cannot do anything useful without them.
+if [[ -z "$FILE_PATH" || -z "$FILE_OUT" ]]; then
+  echo "$0: -i <input file> and -o <output file> are required" >&2
+  exit 2
+fi
+
+# redirect STDERR to ERRORLOG
+exec 2>>"$TEMP_DIR$ERRORLOG"
+echo "~~~+++~~~~+++~~~" 1>&2
+
+# -e carries several ocrad options in one string: split it on blanks into an
+# array so each option becomes its own argument, without glob expansion.
+read -r -a OCRAD_OPTS <<< "$EXTRA"
+
+ocrad "$FILE_PATH" -o "$FILE_OUT" "${OCRAD_OPTS[@]}" 1>&2
+STATUS=$?
+# log the command as it was run, together with its result
+echo "ocrad $FILE_PATH -o $FILE_OUT $EXTRA ausgeführt (exit status $STATUS)" 1>&2
+
+echo "~~~+++~~~~+++~~~$(date +%c)" 1>&2
+exit "$STATUS"
diff --git a/debian/ocr-scripts/xsane2tess3.sh b/debian/ocr-scripts/xsane2tess3.sh
new file mode 100755
index 0000000..14683bf
--- /dev/null
+++ b/debian/ocr-scripts/xsane2tess3.sh
@@ -0,0 +1,145 @@
+#!/bin/bash
+# xsane2tess3 - tesseractOCR directly from xsane
+# Copyright (C) 2012 Heinrich Schwietering
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+##############################################################################
+#
+# xsane2tess3 0.1
+#
+# *** tesseract made simple ***
+#
+##############################################################################
+#
+# xsane2tess3 is a TesseractOCR 3.0x wrapper to be able to use tesseract with
+# xsane.
+#
+# Without -f the recognized text ends up in the file given with -o.
+# With -f every scan is appended to <Final>.txt (or <Final>.html when a
+# config file is used), so several scans can form one document.
+#
+TEMP_DIR=/tmp/                 # folder for temporary files (all files)
+ERRORLOG="xsane2tess3.log"     # file where STDERR goes
+
+if [[ -z "$1" ]]; then
+  echo "Usage: $0 [OPTIONS]
+
+  xsane2tess3 scans images with TesseractOCR
+  and outputs the text in a file or as hocr/html document
+
+  OPTIONS:
+    -i <file1>           define input file (any image-format supported)
+    -o <file2>           define output-file (*.txt/hOCR)
+    -l <lang>            define language-data tesseract should use
+    -c <config>          filename for tesseract (-e is accepted as well)
+    -f </path/to/Final>  name and path for multiscan document
+
+  Progress- & error-messages will be stored in this logfile:
+  $TEMP_DIR$ERRORLOG
+
+  xsane2tess depends on
+    - XSane, http://www.xsane.org/
+    - TesseractOCR, http://code.google.com/p/tesseract-ocr/
+
+  Some coding was stolen from 'ocube'
+  http://www.geocities.com/thierryguy/ocube.html
+
+  This adaption is based on xsane2tess
+  http://doc.ubuntu-fr.org/xsane2tess,
+
+  Hints always welcome! heinrich (dot) schwietering (at) gmx (dot) de
+
+"
+  exit
+fi
+
+# get options...
+# The language goes into OCR_LANG, not LANG: LANG is the locale variable of
+# the environment, and overwriting it with e.g. "deu" would change the locale
+# seen by tesseract and date.
+# -e is accepted as an alias of -c because the usage text advertised -e while
+# only -c was evaluated.
+while getopts ":i:o:l:c:e:f:" OPTION; do
+  case "$OPTION" in
+    i) # input filename (with path)
+      FILE_PATH="$OPTARG"
+      ;;
+    o) # output filename
+      FILE_OUT="$OPTARG"
+      ;;
+    l) # Language-selection
+      OCR_LANG="$OPTARG"
+      ;;
+    c | e) # use hocr configfile
+      CONFILE="$OPTARG"
+      ;;
+    f) # final name for multiscan ocr file
+      FINAL="$OPTARG"
+      ;;
+  esac
+done
+
+# Both file names are mandatory; without them every later step would fail.
+if [[ -z "$FILE_PATH" || -z "$FILE_OUT" ]]; then
+  echo "$0: -i <input file> and -o <output file> are required" >&2
+  exit 2
+fi
+
+# redirect STDOUT to FILE_OUT
+exec 1>>"$FILE_OUT"
+
+# redirect STDERR to ERRORLOG
+exec 2>>"$TEMP_DIR$ERRORLOG"
+
+echo "~~~+++~~~~+++~~~" 1>&2
+
+# Pass -l and the config file only when they were given; empty arguments
+# would be handed to tesseract as a language / config file named "".
+TESS_ARGS=()
+[[ -n "$OCR_LANG" ]] && TESS_ARGS+=(-l "$OCR_LANG")
+[[ -n "$CONFILE" ]] && TESS_ARGS+=("$CONFILE")
+
+# start OCR (tesseract expands output with *.txt/.html); the image is passed
+# with its full path so it is found regardless of the current directory.
+tesseract "$FILE_PATH" "$FILE_OUT" "${TESS_ARGS[@]}" 1>&2
+STATUS=$?
+echo "Tesseract used with $OCR_LANG $CONFILE (exit status $STATUS)" 1>&2
+
+if [[ -n "$FINAL" ]]; then
+  # the result is expected as .html when a config file was used, else as .txt
+  if [[ -z "$CONFILE" ]]; then
+    EXT=txt
+  else
+    EXT=html
+  fi
+  if [[ ! -e "$FINAL.$EXT" ]]; then
+    # start final ocr file
+    cp -- "$FILE_OUT.$EXT" "$FINAL.$EXT" 1>&2
+    echo "$FINAL.$EXT started" 1>&2
+  else
+    # append this scan to the already existing final file
+    cat -- "$FILE_OUT.$EXT" >> "$FINAL.$EXT"
+    echo "$FILE_OUT.$EXT added to $FINAL.$EXT" 1>&2
+  fi
+  rm -- "$FILE_OUT"
+else
+  # STDOUT scanned text => FILE_OUT
+  cat -- "$FILE_OUT".*
+fi
+
+rm -- "$FILE_OUT".*
+
+echo "~~~+++~~~~+++~~~$(date +%c)" 1>&2
+exit "$STATUS"