Skip to content

Instantly share code, notes, and snippets.

@DavidBuchanan314
Last active January 9, 2024 22:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save DavidBuchanan314/798db4d18ed264920b9afd1c71b7f8bd to your computer and use it in GitHub Desktop.
Save DavidBuchanan314/798db4d18ed264920b9afd1c71b7f8bd to your computer and use it in GitHub Desktop.
A second attempt at simdutf incremental utf8 validation (proof-of-concept, not rigorously tested, see https://github.com/simdutf/simdutf/issues/361 )
#include <stdio.h>
#include "utf8_incremental.h"
// a very page-aligned buffer, which maybe helps io performance (untested!)
static char buf[0x10000] __attribute__ ((aligned (0x10000)));
int main()
{
unsigned int state = UTF8_ACCEPT;
for (;;) {
size_t readlen = fread(buf, 1, sizeof(buf), stdin);
state = validate_utf8_incremental(state, buf, readlen);
if (state == 1) break; // optional early-exit (would still work without this line, though)
if (readlen < sizeof(buf)) break;
}
if (feof(stdin) && (state == UTF8_ACCEPT)) {
printf("Success!\n");
return 0;
}
printf("failed :(\n%d\n", state);
return -1;
}
CFLAGS := -Wall -Wextra -Wpedantic -O3
CXXFLAGS := ${CFLAGS}
LDFLAGS := -lsimdutf
main: main.o utf8_incremental.o
#include <simdutf.h>
// XXX: The version of simdutf I have installed is outdated, so I'm copying simdutf::trim_partial_utf8 here, verbatim
static inline size_t trim_partial_utf8(const char *input, size_t length) {
if (length < 3) {
switch (length) {
case 2:
if (uint8_t(input[length-1]) >= 0xc0) { return length-1; } // 2-, 3- and 4-byte characters with only 1 byte left
if (uint8_t(input[length-2]) >= 0xe0) { return length-2; } // 3- and 4-byte characters with only 2 bytes left
return length;
case 1:
if (uint8_t(input[length-1]) >= 0xc0) { return length-1; } // 2-, 3- and 4-byte characters with only 1 byte left
return length;
case 0:
return length;
}
}
if (uint8_t(input[length-1]) >= 0xc0) { return length-1; } // 2-, 3- and 4-byte characters with only 1 byte left
if (uint8_t(input[length-2]) >= 0xe0) { return length-2; } // 3- and 4-byte characters with only 1 byte left
if (uint8_t(input[length-3]) >= 0xf0) { return length-3; } // 4-byte characters with only 3 bytes left
return length;
}
extern "C" {
#include "utf8_incremental.h"
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
// Modified by David Buchanan, 2024
// NB: the values in these tables are all 4-bit, and so could be packed into nibbles, but it's perhaps not worth it.
static const unsigned char utf8_class_lut[] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
};
static const unsigned char utf8_transition_lut[9][16] = {
{0,1,2,3,5,8,7,1,1,1,4,6,1,1,1,1}, // s0
{1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, // s1
{1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1}, // s2
{1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1}, // s3
{1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1}, // s4
{1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1}, // s5
{1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1}, // s6
{1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1}, // s7
{1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, // s8
};
static unsigned int inline bytewise_utf8_validate_next_state(unsigned int state, unsigned char byte) {
return utf8_transition_lut[state][utf8_class_lut[byte]];
}
// end Bjoern Hoehrmann copyright //
/*
This function takes the performance of simdutf, and wraps it an (imho) more flexible and ergonomic API
by combining it with Bjoern Hoehrmann's byte-wise state machine validator.
simdutf does the heavy lifting, but the state machine handles the fiddly boundary conditions.
This ought to work equally well with any other byte-wise state machine validation function,
since its performance is largely irrelevant, but Hoehrmann's is particularly concise.
Return value:
0 (UTF8_ACCEPT) Definitely valid and complete string, up to this point.
1 (UTF8_REJECT) Definitely invalid UTF-8
other values: Incomplete string, but plausibly-valid thus far
*/
unsigned int validate_utf8_incremental(unsigned int state, const char *buf, size_t len)
{
if (state > 8) return UTF8_REJECT; // not a valid state (also functions as a bounds check)
// search for the next accept state, i.e. a codepoint boundary.
// NB: this loop will execute a maximum of 3 iterations
size_t i = 0;
while ((i < len) && (state > 1)) {
state = bytewise_utf8_validate_next_state(state, buf[i++]);
}
if ((state == UTF8_REJECT) || (i >= len)) return state;
//assert(state == UTF8_ACCEPT); // always true
//assert(i <= 3); // also always true
size_t partial_len = trim_partial_utf8(buf + i, len - i); // i<len, so this can't overflow
// the bulk of the work should happen here
if (!simdutf::validate_utf8(buf + i, partial_len)) return UTF8_REJECT;
i += partial_len;
while (i < len) { // this will also execute a maximum of 3 iterations
state = bytewise_utf8_validate_next_state(state, buf[i++]);
}
return state;
}
} // extern "C"
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1
unsigned int validate_utf8_incremental(unsigned int state, const char *buf, size_t len);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment