Skip to content

Instantly share code, notes, and snippets.

@bertsky
Last active October 18, 2019 12:45
Show Gist options
  • Save bertsky/1f3a69a2defae662ee40e6b00b4d0d39 to your computer and use it in GitHub Desktop.
Save bertsky/1f3a69a2defae662ee40e6b00b4d0d39 to your computer and use it in GitHub Desktop.
Commands to prepare pixel classifier training data from OCR-D GT
# Needs OCR-D/core#327 OCR-D/ocrd_olena#10 OCR-D/ocrd_segment#11 bertsky/ocrd_cis
# Runs a preprocessing and resegmentation workflow for GT annotation,
# then extracts page images along with JSON descriptions of region polygons and classes;
# finally, creates a flattened directory under $TARGET.
# Run: preprocess-ocrd-gt.sh [TARGET-DIRECTORY [METS-FILE]]
# (default is all METS files anywhere under CWD)
# Directory that receives the flattened result: the first argument if it is
# given and non-empty, otherwise the built-in default.
if [ -n "${1:-}" ]; then
  TARGET="$1"
else
  TARGET="../1000pages-crop-sauvola-denoise-deskew-repair"
fi
# METS files to process: the second argument if it is given and non-empty,
# otherwise every mets.xml found below the current directory.
if [ -n "${2:-}" ]; then
  WORKSPACES="$2"
else
  WORKSPACES="$(find . -name mets.xml)"
fi
#set -e
# Prepare a single OCR-D ground-truth workspace and run the preprocessing and
# resegmentation workflow inside it.
# Arguments:
#   $1 - path of the workspace's METS file (expected to end in "mets.xml")
# Outputs:
#   progress messages plus whatever the invoked tools print
# Returns:
#   non-zero if the workspace directory cannot be entered; otherwise the
#   status of the final popd. Failures of individual workflow steps are not
#   propagated (best effort).
function process {
  local workspace_dir page img file

  # Succeeds iff the workspace already has a file group named exactly $1.
  # Whole-line fixed-string matching keeps e.g. OCR-D-SEG-LINE-TESS from being
  # mistaken for OCR-D-SEG-LINE. Defined inside process because only process
  # itself is exported to the shells spawned by parallel.
  _process_has_group() {
    ocrd workspace list-group | grep -qxF -e "$1"
  }

  echo "starting $1"
  workspace_dir=${1%mets.xml}
  # A bare "mets.xml" leaves an empty prefix, which means the current
  # directory. Bail out if the directory cannot be entered, so that the
  # in-place edits below never hit files of some other directory.
  pushd "${workspace_dir:-.}" || return

  # fix MIME type:
  sed -i.orig 's|MIMETYPE="image/jpeg" ID="OCR-D-GT-SEG|MIMETYPE="application/vnd.prima.page+xml" ID="OCR-D-GT-SEG|' mets.xml

  # fix PAGE imageFilename:
  if [[ $1 =~ 1000pages ]]; then
    # fix imageFilename (relative to METS, not to PAGE)
    while IFS= read -r file; do
      [[ -f $file ]] || continue
      sed -i.orig 's|imageFilename="../|imageFilename="|' "$file"
    done < <(ocrd workspace find -m application/vnd.prima.page+xml -k local_filename)
  else
    # fix imageFilename (find PAGE filename in METS, find image filename via
    # same pageId in METS). The page list is read from fd 3 so that the
    # commands in the loop body cannot consume it via stdin.
    while IFS= read -r page <&3; do
      [[ -n $page ]] || continue
      img=$(ocrd workspace find -G OCR-D-IMG -g "$page" -k local_filename)
      # No image known for this page: leave its PAGE files alone rather than
      # overwriting the attribute with an empty value.
      [[ -n $img ]] || continue
      while IFS= read -r file; do
        [[ -f $file ]] || continue
        sed -i.orig "s|imageFilename=\"[^\"]*\"|imageFilename=\"$img\"|" "$file"
      done < <(
        ocrd workspace find -G OCR-D-GT-SEG-PAGE -g "$page" -k local_filename
        ocrd workspace find -G OCR-D-GT-SEG-BLOCK -g "$page" -k local_filename
      )
    done 3< <(ocrd workspace find -k pageId | sort -u)
  fi

  # process: every step is skipped when its output file group already exists,
  # so an interrupted run can simply be started again.
  _process_has_group OCR-D-GT-SEG-BLOCK-BIN \
    || ocrd-olena-binarize -I OCR-D-GT-SEG-BLOCK -O OCR-D-GT-SEG-BLOCK-BIN,OCR-D-IMG-BIN \
      -p <(echo '{"impl": "sauvola-ms-split"}')
  _process_has_group OCR-D-GT-SEG-BLOCK-BIN-DENOISE \
    || ocrd-cis-ocropy-denoise -I OCR-D-GT-SEG-BLOCK-BIN -O OCR-D-GT-SEG-BLOCK-BIN-DENOISE \
      -p <(echo '{"level-of-operation": "page"}')
  _process_has_group OCR-D-GT-SEG-BLOCK-BIN-DENOISE-DESKEW \
    || ocrd-cis-ocropy-deskew -I OCR-D-GT-SEG-BLOCK-BIN-DENOISE -O OCR-D-GT-SEG-BLOCK-BIN-DENOISE-DESKEW \
      -p <(echo '{"level-of-operation": "page"}')
  _process_has_group OCR-D-SEG-LINE \
    || ocrd-cis-ocropy-segment -I OCR-D-GT-SEG-BLOCK-BIN-DENOISE-DESKEW -O OCR-D-SEG-LINE \
      -p <(echo '{"spread": 2.4}')
  _process_has_group OCR-D-SEG-BLOCK \
    || ocrd-segment-repair -I OCR-D-SEG-LINE -O OCR-D-SEG-BLOCK \
      -p <(echo '{"sanitize": true}')
  # The guard tests the group this step actually writes (OCR-D-IMG-REGIONS).
  _process_has_group OCR-D-IMG-REGIONS \
    || ocrd-segment-extract-regions -I OCR-D-SEG-BLOCK -O OCR-D-IMG-REGIONS \
      -p <(echo '{"transparency": true}')

  echo "done with $1"
  popd
}
# Make process() available in the shells spawned by parallel.
export -f process
echo starting workflow
# for mets in $(find . -name mets.xml); do
# sem --id preprocess-ocrd-gt -j6 process $mets || return
# done
# sem --id preprocess-ocrd-gt --wait
# WORKSPACES holds a whitespace-separated list of METS paths; it is left
# unquoted on purpose so that every path becomes one job.
# shellcheck disable=SC2086
parallel process ::: $WORKSPACES
echo done with workflow
echo "creating flat $TARGET"
# -p: re-running into an already existing target directory is not an error
mkdir -p -- "$TARGET"
# NUL-delimited traversal keeps paths with whitespace or glob characters intact.
while IFS= read -r -d '' file; do
  # workspace-relative directory: strip the file group part and the leading ./
  dir=${file%/OCR-D-IMG-REGIONS/*}
  dir=${dir#./}
  # Link name: directory path with "/" turned into "_", then the file name.
  # -f replaces a link left over from an earlier run instead of failing.
  ln -rsf -- "$file" "$TARGET/${dir//\//_}_${file##*/}"
done < <(find . -type f -name "OCR-D-IMG-REGIONS_*" -print0)
#pushd $(dirname $TARGET)
#tar -chvf $(basename $TARGET).tar $(basename $TARGET)
#popd
echo done with everything
# Needs OCR-D/core#327 OCR-D/ocrd_olena#10 OCR-D/ocrd_segment#11 bertsky/ocrd_cis OCR-D/ocrd_tesserocr#80 anyocrbase
# Runs a preprocessing and segmentation workflow for input images,
# then extracts page images along with JSON descriptions of region polygons and classes;
# finally, creates a flattened directory under $TARGET.
# Run: preprocess-ocrd-baseline-tesseract.sh [TARGET-DIRECTORY [METS-FILE]]
# (default is all METS files anywhere under CWD)
# Directory that receives the flattened result: the first argument if it is
# given and non-empty, otherwise the built-in default.
if [ -n "${1:-}" ]; then
  TARGET="$1"
else
  TARGET="../1000pages-crop-sauvola-denoise-deskew-tess-repair"
fi
# METS files to process: the second argument if it is given and non-empty,
# otherwise every mets.xml found below the current directory.
if [ -n "${2:-}" ]; then
  WORKSPACES="$2"
else
  WORKSPACES="$(find . -name mets.xml)"
fi
#set -e
# Prepare a single OCR-D workspace and run the preprocessing and
# Tesseract-based segmentation workflow inside it.
# Arguments:
#   $1 - path of the workspace's METS file (expected to end in "mets.xml")
# Outputs:
#   progress messages plus whatever the invoked tools print
# Returns:
#   non-zero if the workspace directory cannot be entered; otherwise the
#   status of the final popd. Failures of individual workflow steps are not
#   propagated (best effort).
function process {
  local workspace_dir page img file input_file_group

  # Succeeds iff the workspace already has a file group named exactly $1.
  # Whole-line fixed-string matching keeps e.g. OCR-D-SEG-PAGE-BIN from being
  # mistaken for OCR-D-SEG-PAGE. Defined inside process because only process
  # itself is exported to the shells spawned by parallel.
  _process_has_group() {
    ocrd workspace list-group | grep -qxF -e "$1"
  }

  echo "starting $1"
  workspace_dir=${1%mets.xml}
  # A bare "mets.xml" leaves an empty prefix, which means the current
  # directory. Bail out if the directory cannot be entered, so that the
  # in-place edits below never hit files of some other directory.
  pushd "${workspace_dir:-.}" || return

  # fix MIME type:
  sed -i 's|MIMETYPE="image/jpeg" ID="OCR-D-GT-SEG|MIMETYPE="application/vnd.prima.page+xml" ID="OCR-D-GT-SEG|' mets.xml

  # fix PAGE imageFilename:
  if [[ $1 =~ 1000pages ]]; then
    # fix imageFilename (relative to METS, not to PAGE)
    while IFS= read -r file; do
      [[ -f $file ]] || continue
      sed -i 's|imageFilename="../|imageFilename="|' "$file"
    done < <(ocrd workspace find -m application/vnd.prima.page+xml -k local_filename)
  else
    # fix imageFilename (find PAGE filename in METS, find image filename via
    # same pageId in METS). The page list is read from fd 3 so that the
    # commands in the loop body cannot consume it via stdin.
    while IFS= read -r page <&3; do
      [[ -n $page ]] || continue
      img=$(ocrd workspace find -G OCR-D-IMG -g "$page" -k local_filename)
      # No image known for this page: leave its PAGE files alone rather than
      # overwriting the attribute with an empty value.
      [[ -n $img ]] || continue
      while IFS= read -r file; do
        [[ -f $file ]] || continue
        sed -i "s|imageFilename=\"[^\"]*\"|imageFilename=\"$img\"|" "$file"
      done < <(
        ocrd workspace find -G OCR-D-GT-SEG-PAGE -g "$page" -k local_filename
        ocrd workspace find -G OCR-D-GT-SEG-BLOCK -g "$page" -k local_filename
      )
    done 3< <(ocrd workspace find -k pageId | sort -u)
  fi

  # process: start from the GT page segmentation if the workspace has one,
  # otherwise from a cropping result (computed here unless already present).
  if _process_has_group OCR-D-GT-SEG-PAGE; then
    input_file_group=OCR-D-GT-SEG-PAGE
  else
    _process_has_group OCR-D-SEG-PAGE \
      || ocrd-anyocrbase-crop -I OCR-D-IMG -O OCR-D-SEG-PAGE
    input_file_group=OCR-D-SEG-PAGE
  fi

  # Every step is skipped when its output file group already exists, so an
  # interrupted run can simply be started again.
  _process_has_group "${input_file_group}-BIN" \
    || ocrd-olena-binarize -I "$input_file_group" -O "${input_file_group}-BIN,OCR-D-IMG-BIN" \
      -p <(echo '{"impl": "sauvola-ms-split"}')
  _process_has_group "${input_file_group}-BIN-DENOISE" \
    || ocrd-cis-ocropy-denoise -I "${input_file_group}-BIN" -O "${input_file_group}-BIN-DENOISE" \
      -p <(echo '{"level-of-operation": "page"}')
  _process_has_group "${input_file_group}-BIN-DENOISE-DESKEW" \
    || ocrd-cis-ocropy-deskew -I "${input_file_group}-BIN-DENOISE" -O "${input_file_group}-BIN-DENOISE-DESKEW" \
      -p <(echo '{"level-of-operation": "page"}')
  _process_has_group OCR-D-SEG-BLOCK-TESS \
    || ocrd-tesserocr-segment-region -I "${input_file_group}-BIN-DENOISE-DESKEW" -O OCR-D-SEG-BLOCK-TESS
  _process_has_group OCR-D-SEG-BLOCK-TESS-DESKEW \
    || ocrd-cis-ocropy-deskew -I OCR-D-SEG-BLOCK-TESS -O OCR-D-SEG-BLOCK-TESS-DESKEW \
      -p <(echo '{"level-of-operation": "region"}')
  _process_has_group OCR-D-SEG-LINE-TESS \
    || ocrd-cis-ocropy-segment -I OCR-D-SEG-BLOCK-TESS-DESKEW -O OCR-D-SEG-LINE-TESS \
      -p <(echo '{"spread": 2.4}')
  _process_has_group OCR-D-SEG-BLOCK-TESS-TIGHT \
    || ocrd-segment-repair -I OCR-D-SEG-LINE-TESS -O OCR-D-SEG-BLOCK-TESS-TIGHT \
      -p <(echo '{"sanitize": true}')
  # The guard tests the group this step actually writes (OCR-D-IMG-REGIONS-TESS).
  _process_has_group OCR-D-IMG-REGIONS-TESS \
    || ocrd-segment-extract-regions -I OCR-D-SEG-BLOCK-TESS-TIGHT -O OCR-D-IMG-REGIONS-TESS \
      -p <(echo '{"transparency": true}')

  echo "done with $1"
  popd
}
# Make process() available in the shells spawned by parallel.
export -f process
echo starting workflow
# for mets in $(find . -name mets.xml); do
# sem --id preprocess-ocrd-gt -j6 process $mets || return
# done
# sem --id preprocess-ocrd-gt --wait
# WORKSPACES holds a whitespace-separated list of METS paths; it is left
# unquoted on purpose so that every path becomes one job.
# shellcheck disable=SC2086
parallel process ::: $WORKSPACES
echo done with workflow
echo "creating flat $TARGET"
# -p: re-running into an already existing target directory is not an error
mkdir -p -- "$TARGET"
# NUL-delimited traversal keeps paths with whitespace or glob characters intact.
while IFS= read -r -d '' file; do
  # workspace-relative directory: strip the file group part and the leading ./
  dir=${file%/OCR-D-IMG-REGIONS-TESS/*}
  dir=${dir#./}
  # Link name: directory path with "/" turned into "_", then the file name.
  # -f replaces a link left over from an earlier run instead of failing.
  ln -rsf -- "$file" "$TARGET/${dir//\//_}_${file##*/}"
done < <(find . -type f -name "OCR-D-IMG-REGIONS-TESS_*" -print0)
#pushd $(dirname $TARGET)
#tar -chvf $(basename $TARGET).tar $(basename $TARGET)
#popd
echo done with everything
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment