Last active
March 16, 2023 13:40
-
-
Save pdaengeli/4ac6112e841493bc09b53084b94ffdf0 to your computer and use it in GitHub Desktop.
Data Hackdays BE 2023, Datafying Bärn: line by line content from ALTO XML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?> | |
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" | |
xmlns:xs="http://www.w3.org/2001/XMLSchema" | |
xmlns:xd="http://www.oxygenxml.com/ns/doc/xsl" | |
exclude-result-prefixes="xs xd" | |
xpath-default-namespace="http://www.loc.gov/standards/alto/ns-v3#" | |
expand-text="true" | |
version="3.0"> | |
<xd:doc scope="stylesheet"> | |
<xd:desc> | |
<xd:p><xd:b>Created on:</xd:b> Mar 15, 2023</xd:p> | |
<xd:p><xd:b>Author:</xd:b> pd</xd:p> | |
<xd:p>Data Hackdays BE 2023, Datafying Bärn: line by line content from ALTO XML</xd:p> | |
</xd:desc> | |
</xd:doc> | |
<!-- TODO: iterate over METS urls | |
https://www.e-rara.ch/oai?verb=GetRecord&metadataPrefix=mets&identifier=26012591 | |
//mets:fileGrp/mets:file/mets:FLocat/@xlink:href) | |
--> | |
<!-- TODO: group entries by their first comma-separated token, or resolve dash
references using the nearest preceding name -->
<!-- TODO: add tel. regions (as they appear in the source) to $tel-regions-regex --> | |
<!-- Static parameter choosing the serialization; it is evaluated at compile time
     by the use-when below. Supported values: text | xml -->
<xsl:param name="method" static="true" select="'text'"/>
<!-- Indentation applies to the default (XML) serialization. -->
<xsl:output indent="true"/>
<!-- Text serialization: one result item per output line.
     The separator must be written as a character reference. A literal line break
     inside an attribute value is normalized to a single space by the XML parser,
     which would put all items on one line. -->
<xsl:output method="text" item-separator="&#10;" use-when="$method='text'"/>
<!-- Modes of the multi-pass pipeline.
     lines-pass1 is declared explicitly with the default fallback (text-only-copy),
     so its behaviour for unmatched nodes is the same as when left undeclared.
     Passes 2 to 5 copy unmatched nodes unchanged; the last pass drops them. -->
<xsl:mode name="lines-pass1" on-no-match="text-only-copy"/>
<xsl:mode name="lines-pass2" on-no-match="shallow-copy"/>
<xsl:mode name="lines-pass3" on-no-match="shallow-copy"/>
<xsl:mode name="lines-pass4" on-no-match="shallow-copy"/>
<xsl:mode name="lines-pass5" on-no-match="shallow-copy"/>
<xsl:mode name="lines-last-pass" on-no-match="shallow-skip"/>
<!-- ALTO document to process; defaults to one page fetched via doc(). -->
<xsl:param name="input-doc"
           as="document-node()"
           select="doc('https://www.e-rara.ch/bes_1/download/fulltext/alto3/26012749')"/>
<!-- Regex fragment listing the abbreviated region names, each followed by a
     literal dot; it is spliced into the pattern built in mode lines-pass2. -->
<xsl:param name="tel-regions-regex"
           select="'(Bollw|Ch|Chr|Christ|Zähr)\.'"/>
<!-- Pass 1: turn every ALTO TextLine into an <l> element (no namespace) inside a
     single <block>, flagging lines that end in a hyphenation part or a dash. -->
<xsl:variable name="textlines-pass1" as="element(Q{}block)+">
  <block xmlns="">
    <xsl:apply-templates select="$input-doc//TextLine" mode="lines-pass1"/>
  </block>
</xsl:variable>

<!-- Pass 2: flag a line for joining when the line after it matches the
     pattern built from $tel-regions-regex (trailing address / phone lines). -->
<xsl:variable name="textlines-pass2" as="element(Q{}block)+">
  <xsl:apply-templates select="$textlines-pass1" mode="lines-pass2"/>
</xsl:variable>

<!-- Pass 3: flag a line for joining when the line after it contains no comma. -->
<xsl:variable name="textlines-pass3" as="element(Q{}block)+">
  <xsl:apply-templates select="$textlines-pass2" mode="lines-pass3"/>
</xsl:variable>

<!-- Pass 4: detect alphabetical section headers (a capital letter plus a dot). -->
<xsl:variable name="textlines-pass4" as="element(Q{}block)+">
  <xsl:apply-templates select="$textlines-pass3" mode="lines-pass4"/>
</xsl:variable>

<!-- Pass 5: flag a line for joining when the line after it starts with a
     capital letter other than the letter of the current section. -->
<xsl:variable name="textlines-pass5" as="element(Q{}block)+">
  <xsl:apply-templates select="$textlines-pass4" mode="lines-pass5"/>
</xsl:variable>

<!-- Last pass: merge every run of flagged lines into a single <l>. -->
<xsl:variable name="textlines-last-pass" as="element(Q{}l)+">
  <xsl:apply-templates select="$textlines-pass5" mode="lines-last-pass"/>
</xsl:variable>
<!-- Entry point: emits the fully processed lines.
     The data comes from $input-doc, not from the principal source document, so
     the template is also named xsl:initial-template. This lets the stylesheet be
     started without any source document; invoking it on an arbitrary document
     (match="/") keeps working as before. -->
<xsl:template match="/" name="xsl:initial-template">
  <xsl:sequence select="$textlines-last-pass"/>
</xsl:template>
<!-- Pass 1: one <l> per ALTO TextLine. The line is flagged join="following"
     when its last String is a first hyphenation part or consists of a dash. -->
<xsl:template match="TextLine" mode="lines-pass1">
  <xsl:variable name="final-string" select="String[last()]"/>
  <l xmlns="">
    <xsl:if test="$final-string[@CONTENT='-' or @SUBS_TYPE='HypPart1']">
      <xsl:attribute name="join">following</xsl:attribute>
    </xsl:if>
    <xsl:apply-templates select="node()" mode="lines-pass1"/>
  </l>
</xsl:template>
<!-- A String contributes the value of its CONTENT attribute. -->
<xsl:template match="String" mode="lines-pass1">
  <xsl:value-of select="@CONTENT"/>
</xsl:template>
<!-- An SP element contributes a single space. -->
<xsl:template match="SP" mode="lines-pass1">
  <xsl:text> </xsl:text>
</xsl:template>
<!-- Pass 2: copy the line and flag it join="following" when the text of the
     next line matches the pattern built around $tel-regions-regex.
     The line's own text is copied unchanged in either case. -->
<xsl:template match="*:l" mode="lines-pass2">
  <xsl:variable name="trailing-line-pattern"
                select="'^(\w+\.?\s)?\d*\s?'||$tel-regions-regex||'\s?\d+'"/>
  <xsl:variable name="next-line-text" select="following-sibling::*:l[1]/text()"/>
  <xsl:copy>
    <xsl:sequence select="@*"/>
    <xsl:if test="matches($next-line-text, $trailing-line-pattern)">
      <xsl:attribute name="join">following</xsl:attribute>
    </xsl:if>
    <xsl:sequence select="text()"/>
  </xsl:copy>
</xsl:template>
<!-- Pass 3: flag a line join="following" when the line after it contains no
     comma. When that next line contains a capital letter, a separating space
     is appended to the current text.
     The exists() guard keeps the very last line from being flagged: without it,
     contains((), ',') is false and the last line would be marked for joining
     although nothing follows it. -->
<xsl:template match="*:l" mode="lines-pass3">
  <xsl:variable name="next-line" select="following-sibling::*:l[1]"/>
  <xsl:copy>
    <xsl:copy-of select="@*"/>
    <xsl:choose>
      <xsl:when test="exists($next-line) and not(contains($next-line/text(),','))">
        <xsl:attribute name="join" select="'following'"/>
        <xsl:sequence select="text()||(if (matches($next-line/text(),'[A-Z]')) then ' ' else '')"/>
      </xsl:when>
      <xsl:otherwise>
        <xsl:sequence select="text()"/>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:copy>
</xsl:template>
<!-- Pass 4: a line consisting of one capital letter and a dot is a section
     header. An empty line and a three-line banner are emitted in front of it,
     and the header is copied with an added section attribute holding its text. -->
<xsl:template match="*:l[matches(text(),'^[A-Z]\.$')]" mode="lines-pass4">
  <xsl:for-each select="('', '%%%%%%%%%%%%%%%%%%%%', '%% SECTION BEGINS %%', '%%%%%%%%%%%%%%%%%%%%')">
    <l xmlns="">{.}</l>
  </xsl:for-each>
  <xsl:copy>
    <xsl:sequence select="@*"/>
    <xsl:attribute name="section" select="string(.)"/>
    <xsl:copy-of select="text()"/>
  </xsl:copy>
</xsl:template>
<!-- Pass 5: flag a line join="following" when the line after it starts with a
     capital letter that is not the letter of the current section.
     Two corrections for section headers:
     * a header line counts as belonging to its own section (previously it was
       compared against the preceding section and therefore got joined with
       the first entry of its own section);
     * a line is never joined with a following section header (previously the
       banner line in front of a header was joined with the header). -->
<xsl:template match="*:l" mode="lines-pass5">
  <xsl:variable name="next-line" select="following-sibling::*:l[1]"/>
  <!-- Section letter with non-word characters removed; the empty string when no
       section has started yet, in which case nothing is flagged here. -->
  <xsl:variable name="current-section"
                select="(self::*[@section], preceding-sibling::*:l[@section][1])[1] => replace('\W','')"/>
  <xsl:copy>
    <xsl:copy-of select="@*"/>
    <xsl:choose>
      <xsl:when test="not($next-line/@section) and matches($next-line/text(),'^[A-Z]') and not(starts-with($next-line/text(),$current-section))">
        <xsl:attribute name="join" select="'following'"/>
        <xsl:sequence select="text() => normalize-space()||' '"/>
      </xsl:when>
      <xsl:otherwise>
        <xsl:sequence select="text()"/>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:copy>
</xsl:template>
<!-- Last pass: a line that carries no join attribute and is not the
     continuation of a flagged predecessor is emitted unchanged. -->
<xsl:template match="*:l[not(@join or preceding-sibling::*:l[1][@join='following'])]" mode="lines-last-pass">
  <xsl:copy-of select="."/>
</xsl:template>
<!-- Last pass: the first line of a run of flagged lines. It is copied without
     its join attribute, and its content is the concatenation produced by the
     named template "join", started on this line itself. That template applies
     the same trailing-hyphen removal and spacing rule to every flagged line,
     then appends the text of the first unflagged line that ends the run. -->
<xsl:template match="*:l[@join='following'][not(preceding-sibling::*:l[1][@join='following'])]" mode="lines-last-pass">
  <xsl:copy>
    <xsl:copy-of select="@* except @join"/>
    <xsl:value-of>
      <xsl:call-template name="join">
        <xsl:with-param name="next" select="."/>
      </xsl:call-template>
    </xsl:value-of>
  </xsl:copy>
</xsl:template>
<!-- Recursive helper returning the text pieces of a run of lines.
     $next: the line to process (used as a single node or the empty sequence).
     A flagged line yields its text with a trailing hyphen removed and whitespace
     normalized, plus a space when the line after it starts with a capital
     letter; the recursion then continues with that following line.
     An unflagged line yields its text as is and ends the recursion. -->
<xsl:template name="join" as="xs:string*">
  <xsl:param name="next" as="node()*"/>
  <xsl:choose>
    <xsl:when test="$next/@join='following'">
      <xsl:variable name="successor" select="$next/following-sibling::*:l[1]"/>
      <xsl:sequence select="replace($next/text()[last()],'-\s?$','') => normalize-space() || (if(matches($successor/text(),'^[A-Z]')) then ' ' else '')"/>
      <xsl:call-template name="join">
        <xsl:with-param name="next" select="$successor"/>
      </xsl:call-template>
    </xsl:when>
    <xsl:otherwise>
      <xsl:sequence select="$next/text()"/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
</xsl:stylesheet> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment