Skip to content

Instantly share code, notes, and snippets.

@kba
Last active September 9, 2021 16:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kba/2b4dc816d0eaadc5253218b03f97ddb6 to your computer and use it in GitHub Desktop.
Save kba/2b4dc816d0eaadc5253218b03f97ddb6 to your computer and use it in GitHub Desktop.
#!/bin/bash
# Convert the files of one file group into TSV files with page2tsv and,
# unless disabled, register each result with `ocrd workspace add`.
#
# Options:
#   -I, --input-file-grp GRP    directory holding the input files
#                               (default: OCR-D-OCR-TESS)
#   -O, --output-file-grp GRP   directory receiving the TSV files (default: TSV)
#   -d, --directory DIR         directory to change into before working
#                               (default: current directory)
#   -P, --ppn PPN               value substituted for {{ PPN }} in the IIIF URL
#                               template (default: name of the directory)
#   -w, --add-to-workspace      run `ocrd workspace add` per output (default)
#   -W, --no-add-to-workspace   do not run `ocrd workspace add`
#       --scale-by-filegrp GRP  directory with reference images; the ratio of
#                               the PAGE imageWidth to the reference image
#                               width is passed to page2tsv as --scale-factor
#   --                          stop option parsing
set -e

ADD_TO_WORKSPACE=true
INPUT_FILE_GROUP=OCR-D-OCR-TESS
OUTPUT_FILE_GROUP=TSV
DIRECTORY=$PWD
PPN=
IIIF_URL_TEMPLATE="https://content.staatsbibliothek-berlin.de/dc/{{ PPN }}-{{ pageno }}/left,top,width,height/full/0/default.jpg"
SCALE_FILEGRP=

# Abort with a diagnostic when option $1 is not followed by a value.
# Without this check the trailing `shift` fails under `set -e` and the script
# dies without any message.
_need_value() {
  if (( $# < 2 )); then
    printf '%s: option %s requires an argument\n' "${0##*/}" "$1" >&2
    exit 2
  fi
}

# Consume leading options from the given argument list and store their values
# in the global configuration variables above. Parsing stops at the first
# argument that does not start with "-" or after "--".
parse_args() {
  while [[ ${1-} = -* ]]; do
    case "$1" in
      -I|--input-file-grp)
        _need_value "$@"; INPUT_FILE_GROUP=$2; shift ;;
      -O|--output-file-grp)
        _need_value "$@"; OUTPUT_FILE_GROUP=$2; shift ;;
      -d|--directory)
        _need_value "$@"; DIRECTORY=$2; shift ;;
      -P|--ppn)
        _need_value "$@"; PPN=$2; shift ;;
      -W|--no-add-to-workspace)
        ADD_TO_WORKSPACE=false ;;
      -w|--add-to-workspace)
        ADD_TO_WORKSPACE=true ;;
      --scale-by-filegrp)
        _need_value "$@"; SCALE_FILEGRP=$2; shift ;;
      --)
        shift
        break ;;
      *)
        # Unknown options were always skipped; keep doing so, but say it.
        printf '%s: warning: ignoring unknown option %s\n' "${0##*/}" "$1" >&2 ;;
    esac
    shift
  done
}

parse_args "$@"
# Work from inside the target directory so that the file groups can be
# addressed as relative paths.
cd -- "$DIRECTORY"

# Without an explicit PPN, fall back to the name of the directory. Derive it
# from $PWD (set by the cd above) instead of $DIRECTORY, so that relative
# values such as "." or ".." yield the actual directory name.
if [[ -z "$PPN" ]]; then
  PPN=$(basename -- "$PWD")
fi

# -p makes this a no-op when the output directory already exists; the quoting
# keeps names containing whitespace intact.
mkdir -p -- "$OUTPUT_FILE_GROUP"
# Print a diagnostic to stderr and abort the script.
fail() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

# Convert every file of the input file group to TSV and, if requested,
# register the result with `ocrd workspace add`.
for infile in "$INPUT_FILE_GROUP"/*; do
  # An unmatched glob stays a literal pattern; do not hand that to page2tsv.
  [[ -e "$infile" ]] || continue

  infile_name=${infile##*/}
  outfile=$OUTPUT_FILE_GROUP/${infile_name%.xml}.tsv
  # XXX this does not help, we need IIIF URL
  # img=$(grep -Po 'imageFilename=".*?"' "$infile")
  # img=${img:15:-1}

  scale_factor=1.0
  if [[ -n "$SCALE_FILEGRP" ]]; then
    # The reference image is the file in SCALE_FILEGRP whose name contains the
    # digits of the input file name.
    numeric_part=$(grep -Po '\d+' <<<"$infile_name") \
      || fail "no digits in file name: $infile"
    # With several candidates pick the first in sorted order, so that exactly
    # one path reaches identify.
    compare_file=$(find "$SCALE_FILEGRP" -name "*${numeric_part}*" | LC_ALL=C sort | head -n 1)
    [[ -n "$compare_file" ]] \
      || fail "no file matching *${numeric_part}* in $SCALE_FILEGRP"
    should_width=$(identify -format '%w' "$compare_file")
    is_width=$(grep -Po 'imageWidth="\K\d+' "$infile" | head -n 1)
    if [[ ! "$should_width" =~ ^[1-9][0-9]*$ || ! "$is_width" =~ ^[0-9]+$ ]]; then
      fail "cannot determine image widths for $infile (PAGE: '$is_width', reference: '$should_width')"
    fi
    # Floating-point division: plain `bc` uses scale=0 and would truncate the
    # ratio to an integer (0 whenever is_width < should_width).
    scale_factor=$(LC_ALL=C awk -v have="$is_width" -v want="$should_width" \
      'BEGIN { printf "%.6f", have / want }')
  fi

  # The first run of four digits in the file name is the page number. Force
  # base 10 so zero-padded values such as 0008 are not read as octal; a name
  # without such a run counts as page 0.
  pageno=0
  if [[ "$infile_name" =~ [0-9]{4} ]]; then
    pageno=$((10#${BASH_REMATCH[0]}))
  fi
  printf -v pageno_url '%08d' "$pageno"
  printf -v pageno_phys '%04d' "$pageno"

  # Fill in the template with parameter expansion instead of sed, so a PPN
  # containing "/" cannot break the substitution.
  iiif_url=${IIIF_URL_TEMPLATE/'{{ PPN }}'/"$PPN"}
  iiif_url=${iiif_url/'{{ pageno }}'/$pageno_url}

  page2tsv --purpose OCR --scale-factor "$scale_factor" --image-url "$iiif_url" "$infile" "$outfile"

  if [[ "$ADD_TO_WORKSPACE" = true ]]; then
    ocrd workspace add -i "${outfile##*/}" -m text/tsv -G "$OUTPUT_FILE_GROUP" -g "PHYS_${pageno_phys}" "$outfile"
  fi
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment