Last active
January 9, 2024 22:19
-
-
Save DavidBuchanan314/798db4d18ed264920b9afd1c71b7f8bd to your computer and use it in GitHub Desktop.
A second attempt at simdutf incremental utf8 validation (proof-of-concept, not rigorously tested, see https://github.com/simdutf/simdutf/issues/361 )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include "utf8_incremental.h" | |
// a very page-aligned buffer, which maybe helps io performance (untested!) | |
static char buf[0x10000] __attribute__ ((aligned (0x10000))); | |
int main() | |
{ | |
unsigned int state = UTF8_ACCEPT; | |
for (;;) { | |
size_t readlen = fread(buf, 1, sizeof(buf), stdin); | |
state = validate_utf8_incremental(state, buf, readlen); | |
if (state == 1) break; // optional early-exit (would still work without this line, though) | |
if (readlen < sizeof(buf)) break; | |
} | |
if (feof(stdin) && (state == UTF8_ACCEPT)) { | |
printf("Success!\n"); | |
return 0; | |
} | |
printf("failed :(\n%d\n", state); | |
return -1; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CFLAGS := -Wall -Wextra -Wpedantic -O3 | |
CXXFLAGS := ${CFLAGS} | |
LDFLAGS := -lsimdutf | |
main: main.o utf8_incremental.o |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <simdutf.h> | |
// XXX: The version of simdutf I have installed is outdated, so I'm copying simdutf::trim_partial_utf8 here, verbatim | |
static inline size_t trim_partial_utf8(const char *input, size_t length) { | |
if (length < 3) { | |
switch (length) { | |
case 2: | |
if (uint8_t(input[length-1]) >= 0xc0) { return length-1; } // 2-, 3- and 4-byte characters with only 1 byte left | |
if (uint8_t(input[length-2]) >= 0xe0) { return length-2; } // 3- and 4-byte characters with only 2 bytes left | |
return length; | |
case 1: | |
if (uint8_t(input[length-1]) >= 0xc0) { return length-1; } // 2-, 3- and 4-byte characters with only 1 byte left | |
return length; | |
case 0: | |
return length; | |
} | |
} | |
if (uint8_t(input[length-1]) >= 0xc0) { return length-1; } // 2-, 3- and 4-byte characters with only 1 byte left | |
if (uint8_t(input[length-2]) >= 0xe0) { return length-2; } // 3- and 4-byte characters with only 1 byte left | |
if (uint8_t(input[length-3]) >= 0xf0) { return length-3; } // 4-byte characters with only 3 bytes left | |
return length; | |
} | |
extern "C" { | |
#include "utf8_incremental.h" | |
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> | |
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. | |
// Modified by David Buchanan, 2024 | |
// NB: the values in these tables are all 4-bit, and so could be packed into nibbles, but it's perhaps not worth it. | |
static const unsigned char utf8_class_lut[] = { | |
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f | |
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f | |
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f | |
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f | |
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f | |
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf | |
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df | |
0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef | |
0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff | |
}; | |
static const unsigned char utf8_transition_lut[9][16] = { | |
{0,1,2,3,5,8,7,1,1,1,4,6,1,1,1,1}, // s0 | |
{1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, // s1 | |
{1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1}, // s2 | |
{1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1}, // s3 | |
{1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1}, // s4 | |
{1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1}, // s5 | |
{1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1}, // s6 | |
{1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1}, // s7 | |
{1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, // s8 | |
}; | |
static unsigned int inline bytewise_utf8_validate_next_state(unsigned int state, unsigned char byte) { | |
return utf8_transition_lut[state][utf8_class_lut[byte]]; | |
} | |
// end Bjoern Hoehrmann copyright // | |
/* | |
This function takes the performance of simdutf, and wraps it an (imho) more flexible and ergonomic API | |
by combining it with Bjoern Hoehrmann's byte-wise state machine validator. | |
simdutf does the heavy lifting, but the state machine handles the fiddly boundary conditions. | |
This ought to work equally well with any other byte-wise state machine validation function, | |
since its performance is largely irrelevant, but Hoehrmann's is particularly concise. | |
Return value: | |
0 (UTF8_ACCEPT) Definitely valid and complete string, up to this point. | |
1 (UTF8_REJECT) Definitely invalid UTF-8 | |
other values: Incomplete string, but plausibly-valid thus far | |
*/ | |
unsigned int validate_utf8_incremental(unsigned int state, const char *buf, size_t len) | |
{ | |
if (state > 8) return UTF8_REJECT; // not a valid state (also functions as a bounds check) | |
// search for the next accept state, i.e. a codepoint boundary. | |
// NB: this loop will execute a maximum of 3 iterations | |
size_t i = 0; | |
while ((i < len) && (state > 1)) { | |
state = bytewise_utf8_validate_next_state(state, buf[i++]); | |
} | |
if ((state == UTF8_REJECT) || (i >= len)) return state; | |
//assert(state == UTF8_ACCEPT); // always true | |
//assert(i <= 3); // also always true | |
size_t partial_len = trim_partial_utf8(buf + i, len - i); // i<len, so this can't overflow | |
// the bulk of the work should happen here | |
if (!simdutf::validate_utf8(buf + i, partial_len)) return UTF8_REJECT; | |
i += partial_len; | |
while (i < len) { // this will also execute a maximum of 3 iterations | |
state = bytewise_utf8_validate_next_state(state, buf[i++]); | |
} | |
return state; | |
} | |
} // extern "C" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define UTF8_ACCEPT 0 | |
#define UTF8_REJECT 1 | |
unsigned int validate_utf8_incremental(unsigned int state, const char *buf, size_t len); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment