DavidBuchanan314 · January 9, 2024 22:19
diff --git a/main.c b/main.c
 #include <stdio.h>

 #include "utf8_incremental.h"

 // a very page-aligned buffer, which maybe helps io performance (untested!)
 static char buf[0x10000] __attribute__ ((aligned (0x10000)));

 int main()
 {
 	unsigned int state = UTF8_ACCEPT;
 	for (;;) {
 		size_t readlen = fread(buf, 1, sizeof(buf), stdin);
 		state = validate_utf8_incremental(state, buf, readlen);
 		if (state == 1) break; // optional early-exit (would still work without this line, though)
 		if (readlen < sizeof(buf)) break;
 	}
 	if (feof(stdin) && (state == UTF8_ACCEPT)) {
 		printf("Success!\n");
 		return 0;
 	}
 	printf("failed :(\n%d\n", state);
 	return -1;
 }
diff --git a/Makefile b/Makefile
 CFLAGS := -Wall -Wextra -Wpedantic -O3
 CXXFLAGS := ${CFLAGS}
 LDFLAGS := -lsimdutf

 main: main.o utf8_incremental.o
diff --git a/utf8_incremental.cpp b/utf8_incremental.cpp
 #include <simdutf.h>

 // XXX: The version of simdutf I have installed is outdated, so I'm copying simdutf::trim_partial_utf8 here, verbatim
 static inline size_t trim_partial_utf8(const char *input, size_t length) {
  if (length < 3) {
    switch (length) {
      case 2:
        if (uint8_t(input[length-1]) >= 0xc0) { return length-1; } // 2-, 3- and 4-byte characters with only 1 byte left
        if (uint8_t(input[length-2]) >= 0xe0) { return length-2; } // 3- and 4-byte characters with only 2 bytes left
        return length;
      case 1:
        if (uint8_t(input[length-1]) >= 0xc0) { return length-1; } // 2-, 3- and 4-byte characters with only 1 byte left
        return length;
      case 0:
        return length;
    }
  }
  if (uint8_t(input[length-1]) >= 0xc0) { return length-1; } // 2-, 3- and 4-byte characters with only 1 byte left
  if (uint8_t(input[length-2]) >= 0xe0) { return length-2; } // 3- and 4-byte characters with only 1 byte left
  if (uint8_t(input[length-3]) >= 0xf0) { return length-3; } // 4-byte characters with only 3 bytes left
  return length;
 }

 extern "C" {

 #include "utf8_incremental.h"

 // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
 // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
 // Modified by David Buchanan, 2024

 // NB: the values in these tables are all 4-bit, and so could be packed into nibbles, but it's perhaps not worth it.
 static const unsigned char utf8_class_lut[] = {
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
 	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
 	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
 	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
 	0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
 	0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
 };

 static const unsigned char utf8_transition_lut[9][16] = {
 	{0,1,2,3,5,8,7,1,1,1,4,6,1,1,1,1}, // s0
 	{1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, // s1
 	{1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1}, // s2
 	{1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1}, // s3
 	{1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1}, // s4
 	{1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1}, // s5
 	{1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1}, // s6
 	{1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1}, // s7
 	{1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, // s8
 };

 static unsigned int inline bytewise_utf8_validate_next_state(unsigned int state, unsigned char byte) {
 	return utf8_transition_lut[state][utf8_class_lut[byte]];
 }

 // end Bjoern Hoehrmann copyright //


 /*

 This function takes the performance of simdutf, and wraps it an (imho) more flexible and ergonomic API
 by combining it with Bjoern Hoehrmann's byte-wise state machine validator.

 simdutf does the heavy lifting, but the state machine handles the fiddly boundary conditions.

 This ought to work equally well with any other byte-wise state machine validation function,
 since its performance is largely irrelevant, but Hoehrmann's is particularly concise.

 Return value:
 0 (UTF8_ACCEPT) Definitely valid and complete string, up to this point.
 1 (UTF8_REJECT) Definitely invalid UTF-8
 other values: Incomplete string, but plausibly-valid thus far
 */
 unsigned int validate_utf8_incremental(unsigned int state, const char *buf, size_t len)
 {
 	if (state > 8) return UTF8_REJECT; // not a valid state (also functions as a bounds check)

 	// search for the next accept state, i.e. a codepoint boundary.
 	// NB: this loop will execute a maximum of 3 iterations
 	size_t i = 0;
 	while ((i < len) && (state > 1)) {
 		state = bytewise_utf8_validate_next_state(state, buf[i++]);
 	}

 	if ((state == UTF8_REJECT) || (i >= len)) return state;
 	//assert(state == UTF8_ACCEPT); // always true
 	//assert(i <= 3); // also always true

 	size_t partial_len = trim_partial_utf8(buf + i, len - i); // i<len, so this can't overflow

 	// the bulk of the work should happen here
 	if (!simdutf::validate_utf8(buf + i, partial_len)) return UTF8_REJECT;

 	i += partial_len;
 	while (i < len) { // this will also execute a maximum of 3 iterations
 		state = bytewise_utf8_validate_next_state(state, buf[i++]);
 	}

 	return state;
 }

 } // extern "C"
diff --git a/utf8_incremental.h b/utf8_incremental.h
 #define UTF8_ACCEPT 0
 #define UTF8_REJECT 1

 unsigned int validate_utf8_incremental(unsigned int state, const char *buf, size_t len);
	#include <stdio.h>

	#include "utf8_incremental.h"

	// a very page-aligned buffer, which maybe helps io performance (untested!)
	static char buf[0x10000] __attribute__ ((aligned (0x10000)));

	int main()
	{
	unsigned int state = UTF8_ACCEPT;
	for (;;) {
	size_t readlen = fread(buf, 1, sizeof(buf), stdin);
	state = validate_utf8_incremental(state, buf, readlen);
	if (state == 1) break; // optional early-exit (would still work without this line, though)
	if (readlen < sizeof(buf)) break;
	}
	if (feof(stdin) && (state == UTF8_ACCEPT)) {
	printf("Success!\n");
	return 0;
	}
	printf("failed :(\n%d\n", state);
	return -1;
	}
	CFLAGS := -Wall -Wextra -Wpedantic -O3
	CXXFLAGS := ${CFLAGS}
	LDFLAGS := -lsimdutf

	main: main.o utf8_incremental.o
	#include <simdutf.h>

	// XXX: The version of simdutf I have installed is outdated, so I'm copying simdutf::trim_partial_utf8 here, verbatim
	static inline size_t trim_partial_utf8(const char *input, size_t length) {
	if (length < 3) {
	switch (length) {
	case 2:
	if (uint8_t(input[length-1]) >= 0xc0) { return length-1; } // 2-, 3- and 4-byte characters with only 1 byte left
	if (uint8_t(input[length-2]) >= 0xe0) { return length-2; } // 3- and 4-byte characters with only 2 bytes left
	return length;
	case 1:
	if (uint8_t(input[length-1]) >= 0xc0) { return length-1; } // 2-, 3- and 4-byte characters with only 1 byte left
	return length;
	case 0:
	return length;
	}
	}
	if (uint8_t(input[length-1]) >= 0xc0) { return length-1; } // 2-, 3- and 4-byte characters with only 1 byte left
	if (uint8_t(input[length-2]) >= 0xe0) { return length-2; } // 3- and 4-byte characters with only 1 byte left
	if (uint8_t(input[length-3]) >= 0xf0) { return length-3; } // 4-byte characters with only 3 bytes left
	return length;
	}

	extern "C" {

	#include "utf8_incremental.h"

	// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
	// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
	// Modified by David Buchanan, 2024

	// NB: the values in these tables are all 4-bit, and so could be packed into nibbles, but it's perhaps not worth it.
	static const unsigned char utf8_class_lut[] = {
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
	0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
	0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
	};

	static const unsigned char utf8_transition_lut[9][16] = {
	{0,1,2,3,5,8,7,1,1,1,4,6,1,1,1,1}, // s0
	{1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, // s1
	{1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1}, // s2
	{1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1}, // s3
	{1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1}, // s4
	{1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1}, // s5
	{1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1}, // s6
	{1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1}, // s7
	{1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, // s8
	};

	static unsigned int inline bytewise_utf8_validate_next_state(unsigned int state, unsigned char byte) {
	return utf8_transition_lut[state][utf8_class_lut[byte]];
	}

	// end Bjoern Hoehrmann copyright //


	/*

	This function takes the performance of simdutf, and wraps it an (imho) more flexible and ergonomic API
	by combining it with Bjoern Hoehrmann's byte-wise state machine validator.

	simdutf does the heavy lifting, but the state machine handles the fiddly boundary conditions.

	This ought to work equally well with any other byte-wise state machine validation function,
	since its performance is largely irrelevant, but Hoehrmann's is particularly concise.

	Return value:
	0 (UTF8_ACCEPT) Definitely valid and complete string, up to this point.
	1 (UTF8_REJECT) Definitely invalid UTF-8
	other values: Incomplete string, but plausibly-valid thus far
	*/
	unsigned int validate_utf8_incremental(unsigned int state, const char *buf, size_t len)
	{
	if (state > 8) return UTF8_REJECT; // not a valid state (also functions as a bounds check)

	// search for the next accept state, i.e. a codepoint boundary.
	// NB: this loop will execute a maximum of 3 iterations
	size_t i = 0;
	while ((i < len) && (state > 1)) {
	state = bytewise_utf8_validate_next_state(state, buf[i++]);
	}

	if ((state == UTF8_REJECT) \|\| (i >= len)) return state;
	//assert(state == UTF8_ACCEPT); // always true
	//assert(i <= 3); // also always true

	size_t partial_len = trim_partial_utf8(buf + i, len - i); // i<len, so this can't overflow

	// the bulk of the work should happen here
	if (!simdutf::validate_utf8(buf + i, partial_len)) return UTF8_REJECT;

	i += partial_len;
	while (i < len) { // this will also execute a maximum of 3 iterations
	state = bytewise_utf8_validate_next_state(state, buf[i++]);
	}

	return state;
	}

	} // extern "C"
	#define UTF8_ACCEPT 0
	#define UTF8_REJECT 1

	unsigned int validate_utf8_incremental(unsigned int state, const char *buf, size_t len);