Skip to content

Instantly share code, notes, and snippets.

@pdaengeli
Last active March 16, 2023 13:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pdaengeli/4ac6112e841493bc09b53084b94ffdf0 to your computer and use it in GitHub Desktop.
Save pdaengeli/4ac6112e841493bc09b53084b94ffdf0 to your computer and use it in GitHub Desktop.
Data Hackdays BE 2023, Datafying Bärn: line by line content from ALTO XML
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:xd="http://www.oxygenxml.com/ns/doc/xsl"
exclude-result-prefixes="xs xd"
xpath-default-namespace="http://www.loc.gov/standards/alto/ns-v3#"
expand-text="true"
version="3.0">
<xd:doc scope="stylesheet">
<xd:desc>
<xd:p><xd:b>Created on:</xd:b> Mar 15, 2023</xd:p>
<xd:p><xd:b>Author:</xd:b> pd</xd:p>
<xd:p>Data Hackdays BE 2023, Datafying Bärn: line by line content from ALTO XML</xd:p>
</xd:desc>
</xd:doc>
<!-- TODO: iterate over the file URLs listed in a METS record, e.g.
https://www.e-rara.ch/oai?verb=GetRecord&metadataPrefix=mets&identifier=26012591
(XPath: //mets:fileGrp/mets:file/mets:FLocat/@xlink:href)
-->
<!-- TODO: grouping by first comma separated token or resolving dash references
by first preceding name -->
<!-- TODO: add tel. regions (as they appear in the source) to $tel-regions-regex -->
<!-- Static switch for the serialization: 'text' activates the text output declaration below;
     with any other value only the indent setting remains in effect. -->
<xsl:param name="method" static="true" select="'text'"/><!-- text | xml -->
<xsl:output indent="true"/>
<!-- Compiled in only when $method is 'text'; a line feed is written between result items. -->
<xsl:output method="text" item-separator="&#xA;" use-when="$method='text'"/>
<!-- One mode per pass of the line pipeline.
     lines-pass1 is declared explicitly with the built-in default behaviour so that every mode
     used in this stylesheet is listed here. Passes 2 to 5 copy everything they do not match;
     the last pass drops everything it does not match. -->
<xsl:mode name="lines-pass1" on-no-match="text-only-copy"/>
<xsl:mode name="lines-pass2" on-no-match="shallow-copy"/>
<xsl:mode name="lines-pass3" on-no-match="shallow-copy"/>
<xsl:mode name="lines-pass4" on-no-match="shallow-copy"/>
<xsl:mode name="lines-pass5" on-no-match="shallow-copy"/>
<xsl:mode name="lines-last-pass" on-no-match="shallow-skip"/>
<!-- Document the pipeline reads its TextLine elements from; defaults to one ALTO v3 page fetched from e-rara. -->
<xsl:param name="input-doc" as="document-node()" select="doc('https://www.e-rara.ch/bes_1/download/fulltext/alto3/26012749')"/>
<!-- Abbreviations (each followed by a literal dot) that are spliced into the pass-2 regex
     used to recognise trailing address / phone-number lines. -->
<xsl:param name="tel-regions-regex" select="'(Bollw|Ch|Chr|Christ|Zähr)\.'"/>
<!-- Pass 1 (hyphenation): every TextLine found in $input-doc is handed to mode lines-pass1
     and the results are collected under a single no-namespace <block> wrapper. -->
<xsl:variable as="element(Q{}block)+" name="textlines-pass1">
  <block xmlns="">
    <xsl:apply-templates select="$input-doc//TextLine" mode="lines-pass1"/>
  </block>
</xsl:variable>
<!-- Pass 2 (trailing lines): the pass-1 result is processed in mode lines-pass2, which flags
     lines whose successor looks like a trailing address or phone-number line. -->
<xsl:variable as="element(Q{}block)+" name="textlines-pass2">
  <xsl:apply-templates select="$textlines-pass1" mode="lines-pass2"/>
</xsl:variable>
<!-- Pass 3 (comma check): the pass-2 result is processed in mode lines-pass3, which flags
     lines whose successor contains no comma so the two can be joined later. -->
<xsl:variable as="element(Q{}block)+" name="textlines-pass3">
  <xsl:apply-templates select="$textlines-pass2" mode="lines-pass3"/>
</xsl:variable>
<!-- Pass 4 (sections): the pass-3 result is processed in mode lines-pass4, which recognises
     alphabetical section headers and marks them. -->
<xsl:variable as="element(Q{}block)+" name="textlines-pass4">
  <xsl:apply-templates select="$textlines-pass3" mode="lines-pass4"/>
</xsl:variable>
<!-- Pass 5 (section mismatch): the pass-4 result is processed in mode lines-pass5, which flags
     a line for joining when its successor starts with a capital letter that does not belong
     to the current section. -->
<xsl:variable as="element(Q{}block)+" name="textlines-pass5">
  <xsl:apply-templates select="$textlines-pass4" mode="lines-pass5"/>
</xsl:variable>
<!-- Last pass: join the flagged lines. Mode lines-last-pass skips the <block> wrapper, so the
     result is a flat sequence of <l> elements.
     The sequence may be empty: an input page without any TextLine yields no <l> at all, and
     that case should produce empty output rather than a type error. -->
<xsl:variable name="textlines-last-pass" as="element(Q{}l)*">
  <xsl:apply-templates mode="lines-last-pass" select="$textlines-pass5"/>
</xsl:variable>
<!-- initiate transformation -->
<!-- Entry point: matching the document node only triggers the evaluation. The returned lines
     come from $textlines-last-pass, which is derived from $input-doc and not from the
     document matched here. -->
<xsl:template match="/">
<xsl:sequence select="$textlines-last-pass"/>
</xsl:template>
<!-- Pass 1: one <l> per ALTO TextLine. The line is flagged with join="following" when its
     last String is the first part of a hyphenated word or is a bare hyphen. The children are
     rendered by the String / SP templates of the same mode. -->
<xsl:template mode="lines-pass1" match="TextLine">
  <xsl:variable name="final-string" select="String[last()]"/>
  <l xmlns="">
    <xsl:if test="$final-string/@SUBS_TYPE = 'HypPart1' or $final-string/@CONTENT = '-'">
      <xsl:attribute name="join" select="'following'"/>
    </xsl:if>
    <xsl:apply-templates mode="lines-pass1"/>
  </l>
</xsl:template>
<!-- Pass 1: a String contributes the value of its CONTENT attribute. -->
<xsl:template mode="lines-pass1" match="String">
  <xsl:value-of select="@CONTENT"/>
</xsl:template>
<!-- Pass 1: an SP element contributes a single space character. -->
<xsl:template mode="lines-pass1" match="SP">
  <xsl:text> </xsl:text>
</xsl:template>
<!-- Pass 2: copy the line with its attributes and text. When the text of the next line
     matches the trailing-line pattern (built around $tel-regions-regex), the current line
     is flagged with join="following". -->
<xsl:template mode="lines-pass2" match="*:l">
  <xsl:variable name="regex" select="'^(\w+\.?\s)?\d*\s?'||$tel-regions-regex||'\s?\d+'"/>
  <xsl:variable name="successor-text" select="following-sibling::*:l[1]/text()"/>
  <xsl:copy>
    <xsl:copy-of select="@*"/>
    <!-- The text is emitted unchanged either way; only the flag depends on the match. -->
    <xsl:if test="matches($successor-text, $regex)">
      <xsl:attribute name="join" select="'following'"/>
    </xsl:if>
    <xsl:sequence select="text()"/>
  </xsl:copy>
</xsl:template>
<!-- Pass 3: copy the line; when the text of the next line contains no comma, flag the current
     line with join="following" and append a space if the next line contains a capital letter.
     A line without a successor is flagged as well, because the comma test is then false.
     NOTE(review): the capital-letter test here is unanchored ('[A-Z]') whereas the last pass
     uses '^[A-Z]'; confirm which one is intended. -->
<xsl:template mode="lines-pass3" match="*:l">
  <xsl:variable name="successor-text" select="following-sibling::*:l[1]/text()"/>
  <xsl:variable name="joins" as="xs:boolean" select="not(contains($successor-text, ','))"/>
  <xsl:copy>
    <xsl:copy-of select="@*"/>
    <xsl:if test="$joins">
      <xsl:attribute name="join" select="'following'"/>
    </xsl:if>
    <xsl:sequence select="if ($joins)
                          then text() || (if (matches($successor-text, '[A-Z]')) then ' ' else '')
                          else text()"/>
  </xsl:copy>
</xsl:template>
<!-- Pass 4: a line consisting of exactly one capital letter followed by a dot is treated as a
     section header. An empty line and a three-line banner are inserted before it, and the
     header itself is copied with an added section attribute holding its own string value. -->
<xsl:template mode="lines-pass4" match="*:l[matches(text(),'^[A-Z]\.$')]">
  <!-- Banner lines, the first one being empty. -->
  <xsl:for-each select="('', '%%%%%%%%%%%%%%%%%%%%', '%% SECTION BEGINS %%', '%%%%%%%%%%%%%%%%%%%%')">
    <l xmlns="">{.}</l>
  </xsl:for-each>
  <xsl:copy>
    <xsl:copy-of select="@*"/>
    <xsl:attribute name="section" select="string(.)"/>
    <xsl:sequence select="text()"/>
  </xsl:copy>
</xsl:template>
<!-- Pass 5: copy the line; flag it with join="following" (and normalise its text, adding one
     trailing space) when the next line starts with a capital letter that is not the letter of
     the current section.
     Two guards keep section headers intact when the input contains more than one section:
     the current section is taken from the line itself or, failing that, from the nearest
     preceding header, so a header is compared against its own letter; and a line is never
     joined into a following section header, so the banner before a header stays separate. -->
<xsl:template match="*:l" mode="lines-pass5">
  <!-- Section letter with punctuation removed; the empty string before the first header,
       in which case starts-with() below is always true and nothing is flagged. -->
  <xsl:variable name="current-section" select="(self::*:l | preceding-sibling::*:l)[@section][last()] => replace('\W','')"/>
  <xsl:variable name="next-line" select="following-sibling::*:l[1]"/>
  <xsl:copy>
    <xsl:copy-of select="@*"/>
    <xsl:choose>
      <xsl:when test="not($next-line/@section) and matches($next-line/text(),'^[A-Z]') and not(starts-with($next-line/text(),$current-section))">
        <xsl:attribute name="join" select="'following'"/>
        <xsl:sequence select="text() => normalize-space()||' '"/>
      </xsl:when>
      <xsl:otherwise>
        <xsl:sequence select="text()"/>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:copy>
</xsl:template>
<!-- Last pass: a line that carries no join flag and does not follow a flagged line is
     returned unchanged. -->
<xsl:template mode="lines-last-pass"
              match="*:l[not(@join)][not(preceding-sibling::*:l[1]/@join = 'following')]">
  <xsl:sequence select="."/>
</xsl:template>
<!-- Last pass: first line of a run of lines to be joined (flagged itself, not preceded by a
     flagged line). The line is copied without its join attribute and its text is replaced by
     the concatenation produced by the named template "join".
     The run is started at the current line itself: the match pattern already guarantees
     join="following", so "join" emits this line's cleaned-up text first and then recurses
     through the following lines (this also covers runs of more than two source lines).
     xsl:value-of with a sequence constructor concatenates the pieces without a separator. -->
<xsl:template match="*:l[@join='following'][not(preceding-sibling::*:l[1][@join='following'])]" mode="lines-last-pass">
  <xsl:copy>
    <xsl:copy-of select="@* except @join"/>
    <xsl:value-of>
      <xsl:call-template name="join">
        <xsl:with-param name="next" select="."/>
      </xsl:call-template>
    </xsl:value-of>
  </xsl:copy>
</xsl:template>
<!-- Recursive helper for the last pass.
     $next: the line to emit next (may be empty at the end of the block).
     Returns the text pieces of the run as strings:
     - a flagged line yields its last text node with a trailing hyphen (optionally followed by
       one whitespace character) removed and whitespace normalised, plus a single space when
       the line after it starts with a capital letter; the template then recurses into that
       line;
     - an unflagged line yields its text as is and ends the recursion. -->
<xsl:template name="join" as="xs:string*">
  <xsl:param name="next" as="node()*"/>
  <xsl:choose>
    <xsl:when test="$next/@join = 'following'">
      <xsl:variable name="successor" select="$next/following-sibling::*:l[1]"/>
      <xsl:sequence select="normalize-space(replace($next/text()[last()], '-\s?$', ''))
                            || (if (matches($successor/text(), '^[A-Z]')) then ' ' else '')"/>
      <xsl:call-template name="join">
        <xsl:with-param name="next" select="$successor"/>
      </xsl:call-template>
    </xsl:when>
    <xsl:otherwise>
      <xsl:sequence select="$next/text()"/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
</xsl:stylesheet>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment