Skip to content

Instantly share code, notes, and snippets.

View kba's full-sized avatar

Konstantin Baierer kba

View GitHub Profile
<?xml version="1.0" encoding="UTF-8"?>
<mets:mets xmlns:mets="http://www.loc.gov/METS/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xlink="http://www.w3.org/1999/xlink" xsi:schemaLocation="info:lc/xmlns/premis-v2 http://www.loc.gov/standards/premis/v2/premis-v2-0.xsd http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-6.xsd http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version17/mets.v1-7.xsd http://www.loc.gov/mix/v10 http://www.loc.gov/standards/mix/mix10/mix10.xsd">
<mets:metsHdr CREATEDATE="2017-11-30T16:18:26">
<mets:agent OTHERTYPE="SOFTWARE" ROLE="CREATOR" TYPE="OTHER">
<mets:name>DFG-Koordinierungsprojekt zur Weiterentwicklung von Verfahren der Optical Character Recognition (OCR-D)</mets:name>
<mets:note>OCR-D</mets:note>
</mets:agent>
<mets:agent TYPE="OTHER" OTHERTYPE="SOFTWARE" ROLE="OTHER" OTHERROLE="preprocessing/optimization/binarization">
<mets:name>ocrd-cis-ocropy-binarize v0.1.5</mets:name>
<mets:note xmlns:
time ocrd process --overwrite "tesserocr-segment -P find_tables true -I OCR-D-IMG -O TESS" -g PHYS_0020
12:45:37.784 INFO ocrd.task_sequence.run_tasks - Start processing task 'tesserocr-segment -I OCR-D-IMG -O TESS -p '{"find_tables": true, "dpi": 0, "padding": 4, "shrink_polygons": false, "block_polygons": false, "find_staves": false, "sparse_text": false}''
12:45:38.758 INFO processor.TesserocrSegment - INPUT FILE 0 / PHYS_0020
12:45:38.835 INFO processor.TesserocrSegment - Page 'PHYS_0020' images will use 300 DPI from image meta-data
12:45:38.835 INFO processor.TesserocrSegment - Processing page 'PHYS_0020'
libpng warning: iCCP: profile 'ICC Profile': 'desc': ICC profile tag start not a multiple of 4
libpng warning: iCCP: profile 'ICC Profile': 'rXYZ': ICC profile tag start not a multiple of 4
libpng warning: iCCP: profile 'ICC Profile': 'gXYZ': ICC profile tag start not a multiple of 4
libpng warning: iCCP: profile 'ICC Profile': 'bXYZ': ICC profile tag start not a multiple of 4
libpng warning: iCCP: prof
@kba
kba / git-pr
Created October 21, 2021 15:19
#!/usr/bin/env bash
# reset environment variables that could interfere with normal usage
unset GREP_OPTIONS
# put all utility functions here
# Create a uniquely named temporary file whose name is prefixed with the
# basename of the running script; mktemp prints the resulting path on stdout.
git_extra_mktemp() {
    mktemp -t "$(basename "$0").XXXXXXX"
}
@kba
kba / ocrd-page2tsv
Last active September 9, 2021 16:05
#!/bin/bash
set -e
ADD_TO_WORKSPACE=true
INPUT_FILE_GROUP=OCR-D-OCR-TESS
OUTPUT_FILE_GROUP=TSV
DIRECTORY=$PWD
PPN=
#!/bin/bash
# Run the Windows ocrd.exe with the given arguments and strip carriage
# returns from its stdout so the output uses LF-only line endings.
# Returns the exit status of ocrd.exe itself rather than that of `tr`,
# so a failed ocrd invocation is not reported as success.
ocrd () {
    "C:/Users/kb/AppData/Local/Programs/Python/Python39/Scripts/ocrd.exe" "$@" | tr -d '\r'
    # PIPESTATUS has to be read immediately after the pipeline; index 0 is ocrd.exe.
    return "${PIPESTATUS[0]}"
}
layouteval () {
set -x
"/cygdrive/c/Users/kb/Downloads/LayoutEvalCmd_1-9-106/LayoutEvalCmd 1.9/layoutevalcmd-1-9-106.exe" \
-printWarnings \
-csvValues \

Typische OCR-Fehler

       1        0   {A}-{H} 
       1        0   {h}-{b} 
       1        0   {l}-{i} 
       1        0   {n}-{u} 
       1        0   {n}-{v} 
       1        0   {n}-{y} 
       1        0   {t}-{k} 

Fix imageFilename in PAGE

sed -i 's,imageFilename=",imageFilename="page/",' page/*.xml

Initiate mets.xml

ocrd workspace init
# tei2textpages.py
import lxml.etree as ET
import sys
import re
tei_file = sys.argv[1]
with open(tei_file, 'r') as f:
tei_xml = f.read()
tei_xml = re.sub(r'<\?xml version="1.0" encoding="UTF-8"\?>', '', tei_xml)
@kba
kba / ocrd.csl
Last active February 20, 2020 10:44
<?xml version="1.0" encoding="utf-8"?>
<style xmlns="http://purl.org/net/xbiblio/csl" class="note" default-locale="de-CH" version="1.0" name-delimiter="; " delimiter-precedes-last="always" delimiter-precedes-et-al="never">
<info>
<title>OCR-D_infoclio.ch (German - Switzerland)</title>
<id>http://www.zotero.org/styles/ocr-d_infoclioch</id>
<link href="http://www.zotero.org/styles/ocr-d_infoclioch" rel="self"/>
<link href="http://www.zotero.org/styles/infoclio-fr-smallcaps" rel="template"/>
<link href="https://www.infoclio.ch/de/node/133932" rel="documentation"/>
<author>
<name>Nicolas Chachereau</name>
@kba
kba / mock-data.txt
Created November 27, 2018 13:17
bagit mock data
Lorem ipsum dolor sit