初始版本

2025-08-19 09:49:41 +08:00
parent 10f1ddf1c1
commit 6df0f7d96e
2974 changed files with 1712873 additions and 54 deletions
--- a/external/utf_converter/LICENSE
+++ b/external/utf_converter/LICENSE
@@ -0,0 +1,9 @@
+Copyright (c) 2016 Pietro Gagliardi
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+(this is called the MIT License or Expat License; see http://www.opensource.org/licenses/MIT)
--- a/external/utf_converter/utf.c
+++ b/external/utf_converter/utf.c
@@ -0,0 +1,350 @@
+// utf by pietro gagliardi (andlabs) — https://github.com/andlabs/utf/
+// 10 november 2016
+#include "utf.h"
+
+// this code imitates Go's unicode/utf8 and unicode/utf16
+// the biggest difference is that a rune is unsigned instead of signed (because Go guarantees what a right shift on a signed number will do, whereas C does not)
+// it is also an imitation so we can license it under looser terms than the Go source
+#define badrune 0xFFFD	// U+FFFD; stands in for any rune or sequence the functions in this file reject
+
+// utf8EncodeRune stores the UTF-8 encoding of rune in encoded and returns the number of bytes stored (1 to 4), so encoded needs room for 4 bytes; no terminating NUL is written
+// TODO clean this code up somehow
+size_t utf8EncodeRune(uint32_t rune, char *encoded)
+{
+	uint8_t b;		// lead byte; every path below assigns it before use
+	uint8_t c = 0;	// second byte, stored only when n > 1
+	uint8_t d = 0;	// third byte, stored only when n > 2
+	uint8_t e = 0;	// fourth byte, stored only when n > 3
+	size_t  n;		// number of bytes in the encoding
+
+	// not in the valid range for Unicode
+	if (rune > 0x10FFFF)
+		rune = badrune;	// 0xFFFD then falls into the three-byte path below
+	// surrogate runes cannot be encoded
+	if (rune >= 0xD800 && rune < 0xE000)
+		rune = badrune;	// likewise encoded as 0xFFFD
+
+	if (rune < 0x80) {		// ASCII bytes represent themselves
+		b = (uint8_t) (rune & 0xFF);
+		n = 1;
+		goto done;
+	}
+	if (rune < 0x800) {		// two-byte encoding
+		c = (uint8_t) (rune & 0x3F);	// low 6 bits go in the continuation byte
+		c |= 0x80;	// continuation marker 10xxxxxx
+		rune >>= 6;
+		b = (uint8_t) (rune & 0x1F);	// remaining 5 bits
+		b |= 0xC0;	// lead marker 110xxxxx
+		n = 2;
+		goto done;
+	}
+	if (rune < 0x10000) {	// three-byte encoding
+		d = (uint8_t) (rune & 0x3F);	// bytes are filled last to first, 6 bits at a time
+		d |= 0x80;
+		rune >>= 6;
+		c = (uint8_t) (rune & 0x3F);
+		c |= 0x80;
+		rune >>= 6;
+		b = (uint8_t) (rune & 0x0F);	// remaining 4 bits
+		b |= 0xE0;	// lead marker 1110xxxx
+		n = 3;
+		goto done;
+	}
+	// otherwise use a four-byte encoding
+	e = (uint8_t) (rune & 0x3F);
+	e |= 0x80;
+	rune >>= 6;
+	d = (uint8_t) (rune & 0x3F);
+	d |= 0x80;
+	rune >>= 6;
+	c = (uint8_t) (rune & 0x3F);
+	c |= 0x80;
+	rune >>= 6;
+	b = (uint8_t) (rune & 0x07);	// remaining 3 bits
+	b |= 0xF0;	// lead marker 11110xxx
+	n = 4;
+
+done:	// write out only the n bytes that make up the encoding
+	encoded[0] = b;
+	if (n > 1)
+		encoded[1] = c;
+	if (n > 2)
+		encoded[2] = d;
+	if (n > 3)
+		encoded[3] = e;
+	return n;
+}
+
+const char *utf8DecodeRune(const char *s, size_t nElem, uint32_t *rune)	// decodes one rune from s into *rune and returns the position just past the bytes consumed
+{	// nElem == 0 means s is unbounded and NUL-terminated; every failure path stores badrune and consumes exactly one byte
+	uint8_t b, c;	// b: lead byte; c: continuation byte under inspection
+	uint8_t lowestAllowed, highestAllowed;	// inclusive range accepted for the next continuation byte
+	size_t i, expected;	// expected: number of continuation bytes the lead byte calls for
+	int bad;	// set when a continuation byte falls outside the allowed range
+
+	b = (uint8_t) (*s);	// as uint8_t, bytes >= 0x80 compare as 128..255 even where char is signed
+	if (b < 0x80) {		// ASCII bytes represent themselves
+		*rune = b;
+		s++;
+		return s;
+	}
+	// 0xC0 and 0xC1 cover 2-byte overlong equivalents
+	// 0xF5 to 0xFD cover values > 0x10FFFF
+	// 0xFE and 0xFF were never defined (always illegal)
+	if (b < 0xC2 || b > 0xF4) {		// invalid
+		*rune = badrune;
+		s++;
+		return s;
+	}
+
+	// this determines the range of allowed first continuation bytes
+	lowestAllowed = 0x80;
+	highestAllowed = 0xBF;
+	switch (b) {	// only these four lead bytes narrow the default 0x80..0xBF range
+	case 0xE0:
+		// disallow 3-byte overlong equivalents
+		lowestAllowed = 0xA0;
+		break;
+	case 0xED:
+		// disallow surrogate characters
+		highestAllowed = 0x9F;
+		break;
+	case 0xF0:
+		// disallow 4-byte overlong equivalents
+		lowestAllowed = 0x90;
+		break;
+	case 0xF4:
+		// disallow values > 0x10FFFF
+		highestAllowed = 0x8F;
+		break;
+	}
+
+	// and this determines how many continuation bytes are expected
+	expected = 1;
+	if (b >= 0xE0)
+		expected++;	// lead bytes 0xE0 and up take at least two
+	if (b >= 0xF0)
+		expected++;	// lead bytes 0xF0 and up take three
+	if (nElem != 0) {				// are there enough bytes?
+		nElem--;	// discount the lead byte itself
+		if (nElem < expected) {	// nope
+			*rune = badrune;
+			s++;
+			return s;
+		}
+	}
+
+	// ensure that everything is correct
+	// if not, **only** consume the initial byte
+	bad = 0;
+	for (i = 0; i < expected; i++) {
+		c = (uint8_t) (s[1 + i]);	// in unbounded mode a terminating NUL fails the check below, which ends the scan
+		if (c < lowestAllowed || c > highestAllowed) {
+			bad = 1;
+			break;
+		}
+		// the old lowestAllowed and highestAllowed is only for the first continuation byte
+		lowestAllowed = 0x80;
+		highestAllowed = 0xBF;
+	}
+	if (bad) {
+		*rune = badrune;
+		s++;
+		return s;
+	}
+
+	// now do the topmost bits
+	if (b < 0xE0)
+		*rune = b & 0x1F;	// two-byte lead carries 5 payload bits
+	else if (b < 0xF0)
+		*rune = b & 0x0F;	// three-byte lead carries 4 payload bits
+	else
+		*rune = b & 0x07;	// four-byte lead carries 3 payload bits
+	s++;		// we can finally move on
+
+	// now do the continuation bytes
+	for (; expected; expected--) {	// bytes were validated above, so just accumulate 6 bits from each
+		c = (uint8_t) (*s);
+		s++;
+		c &= 0x3F;		// strip continuation bits
+		*rune <<= 6;
+		*rune |= c;
+	}
+
+	return s;
+}
+
+// utf16EncodeRune stores the UTF-16 encoding of rune in encoded and returns the number of elements stored (1 or 2), so encoded needs room for 2 elements
+size_t utf16EncodeRune(uint32_t rune, uint16_t *encoded)
+{
+	uint16_t low, high;	// 10-bit halves used to build the surrogate pair
+
+	// not in the valid range for Unicode
+	if (rune > 0x10FFFF)
+		rune = badrune;
+	// surrogate runes cannot be encoded
+	if (rune >= 0xD800 && rune < 0xE000)
+		rune = badrune;
+
+	if (rune < 0x10000) {	// fits in a single element (this includes a substituted badrune)
+		encoded[0] = (uint16_t) rune;
+		return 1;
+	}
+
+	rune -= 0x10000;	// the pair encodes the offset from 0x10000
+	low = (uint16_t) (rune & 0x3FF);	// bottom 10 bits of the offset
+	rune >>= 10;
+	high = (uint16_t) (rune & 0x3FF);	// top 10 bits of the offset
+	encoded[0] = high | 0xD800;	// 0xD800..0xDBFF element goes first
+	encoded[1] = low | 0xDC00;	// 0xDC00..0xDFFF element goes second
+	return 2;
+}
+
+// TODO see if this can be cleaned up somehow
+const uint16_t *utf16DecodeRune(const uint16_t *s, size_t nElem, uint32_t *rune)	// decodes one rune from s into *rune and returns the position just past the elements consumed
+{	// every failure path stores badrune and consumes exactly one element
+	uint16_t high, low;	// 10-bit payloads of the two surrogates
+
+	if (*s < 0xD800 || *s >= 0xE000) {
+		// self-representing character
+		*rune = *s;
+		s++;
+		return s;
+	}
+	if (*s >= 0xDC00) {
+		// out-of-order surrogates
+		*rune = badrune;
+		s++;
+		return s;
+	}
+	if (nElem == 1) {		// not enough elements
+		*rune = badrune;
+		s++;
+		return s;
+	}
+	high = *s;	// from here on *s is known to be in 0xD800..0xDBFF
+	high &= 0x3FF;	// keep the 10 payload bits
+	if (s[1] < 0xDC00 || s[1] >= 0xE000) {	// in unbounded mode (nElem == 0) a terminating 0 at s[1] is rejected here too
+		// bad surrogate pair
+		*rune = badrune;
+		s++;
+		return s;
+	}
+	s++;
+	low = *s;
+	s++;	// both elements of the pair are now consumed
+	low &= 0x3FF;
+	*rune = high;
+	*rune <<= 10;
+	*rune |= low;
+	*rune += 0x10000;	// undo the 0x10000 offset that utf16EncodeRune subtracts
+	return s;
+}
+
+// TODO find a way to reduce the code in all of these somehow
+// TODO find a way to remove u as well
+size_t utf8RuneCount(const char *s, size_t nElem)	// returns how many runes utf8DecodeRune yields from s; a byte that does not start a valid sequence counts as one rune
+{
+	size_t len;	// running rune count
+	uint32_t rune;	// decoded value; ignored, only the count matters
+
+	if (nElem != 0) {	// bounded input: decode until all nElem bytes are consumed
+		const char *t, *u;	// t: current position; u: position after the rune at t
+
+		len = 0;
+		t = s;
+		while (nElem != 0) {
+			u = utf8DecodeRune(t, nElem, &rune);
+			len++;
+			nElem -= u - t;	// subtract the bytes just consumed
+			t = u;
+		}
+		return len;
+	}
+	len = 0;	// unbounded input: stop at, and do not count, the terminating NUL
+	while (*s) {
+		s = utf8DecodeRune(s, nElem, &rune);
+		len++;
+	}
+	return len;
+}
+
+size_t utf8UTF16Count(const char *s, size_t nElem)	// returns how many uint16_t elements utf16EncodeRune would emit for the runes decoded from the UTF-8 input s
+{
+	size_t len;	// running element count
+	uint32_t rune;	// rune most recently decoded from s
+	uint16_t encoded[2];	// scratch output; only the length returned by the encoder is used
+
+	if (nElem != 0) {	// bounded input: decode until all nElem bytes are consumed
+		const char *t, *u;	// t: current position; u: position after the rune at t
+
+		len = 0;
+		t = s;
+		while (nElem != 0) {
+			u = utf8DecodeRune(t, nElem, &rune);
+			len += utf16EncodeRune(rune, encoded);	// adds 1 or 2
+			nElem -= u - t;	// subtract the bytes just consumed
+			t = u;
+		}
+		return len;
+	}
+	len = 0;	// unbounded input: stop at, and do not count, the terminating NUL
+	while (*s) {
+		s = utf8DecodeRune(s, nElem, &rune);
+		len += utf16EncodeRune(rune, encoded);
+	}
+	return len;
+}
+
+size_t utf16RuneCount(const uint16_t *s, size_t nElem)	// returns how many runes utf16DecodeRune yields from s; an unpaired surrogate counts as one rune
+{
+	size_t len;	// running rune count
+	uint32_t rune;	// decoded value; ignored, only the count matters
+
+	if (nElem != 0) {	// bounded input: decode until all nElem elements are consumed
+		const uint16_t *t, *u;	// t: current position; u: position after the rune at t
+
+		len = 0;
+		t = s;
+		while (nElem != 0) {
+			u = utf16DecodeRune(t, nElem, &rune);
+			len++;
+			nElem -= u - t;	// subtract the elements just consumed (1 or 2)
+			t = u;
+		}
+		return len;
+	}
+	len = 0;	// unbounded input: stop at, and do not count, the terminating 0 element
+	while (*s) {
+		s = utf16DecodeRune(s, nElem, &rune);
+		len++;
+	}
+	return len;
+}
+
+size_t utf16UTF8Count(const uint16_t *s, size_t nElem)	// returns how many bytes utf8EncodeRune would emit for the runes decoded from the UTF-16 input s
+{
+	size_t len;	// running byte count
+	uint32_t rune;	// rune most recently decoded from s
+	char encoded[4];	// scratch output; only the length returned by the encoder is used
+
+	if (nElem != 0) {	// bounded input: decode until all nElem elements are consumed
+		const uint16_t *t, *u;	// t: current position; u: position after the rune at t
+
+		len = 0;
+		t = s;
+		while (nElem != 0) {
+			u = utf16DecodeRune(t, nElem, &rune);
+			len += utf8EncodeRune(rune, encoded);	// adds 1 to 4
+			nElem -= u - t;	// subtract the elements just consumed (1 or 2)
+			t = u;
+		}
+		return len;
+	}
+	len = 0;	// unbounded input: stop at, and do not count, the terminating 0 element
+	while (*s) {
+		s = utf16DecodeRune(s, nElem, &rune);
+		len += utf8EncodeRune(rune, encoded);
+	}
+	return len;
+}
--- a/external/utf_converter/utf.h
+++ b/external/utf_converter/utf.h
@@ -0,0 +1,61 @@
+// utf by pietro gagliardi (andlabs) — https://github.com/andlabs/utf/
+// 10 november 2016
+
+#ifdef __cplusplus	// NOTE(review): nothing in this header guards against double inclusion; a second #include in an MSVC C++ build would redefine the inline overloads further down
+extern "C" {
+#endif
+
+#include <stddef.h>	// NOTE(review): the standard headers are pulled in inside the extern "C" block; consider including them before it
+#include <stdint.h>
+
+// if nElem == 0, assume the buffer has no upper limit and is '\0' terminated
+// otherwise, assume buffer is NOT '\0' terminated but is bounded by nElem *elements*
+
+extern size_t utf8EncodeRune(uint32_t rune, char *encoded);	// writes 1 to 4 bytes to encoded and returns how many
+extern const char *utf8DecodeRune(const char *s, size_t nElem, uint32_t *rune);	// stores one rune in *rune and returns the position after it
+extern size_t utf16EncodeRune(uint32_t rune, uint16_t *encoded);	// writes 1 or 2 elements to encoded and returns how many
+extern const uint16_t *utf16DecodeRune(const uint16_t *s, size_t nElem, uint32_t *rune);	// stores one rune in *rune and returns the position after it
+
+extern size_t utf8RuneCount(const char *s, size_t nElem);	// number of runes in UTF-8 input
+extern size_t utf8UTF16Count(const char *s, size_t nElem);	// number of UTF-16 elements needed for UTF-8 input
+extern size_t utf16RuneCount(const uint16_t *s, size_t nElem);	// number of runes in UTF-16 input
+extern size_t utf16UTF8Count(const uint16_t *s, size_t nElem);	// number of UTF-8 bytes needed for UTF-16 input
+
+#ifdef __cplusplus
+}
+
+// Provide overloads on Windows for using these functions with wchar_t and WCHAR when wchar_t is a keyword in C++ mode (the default).
+// Otherwise, you'd need to cast to pass a wchar_t pointer, WCHAR pointer, or equivalent to these functions.
+// We use __wchar_t to be independent of the setting; see https://blogs.msdn.microsoft.com/oldnewthing/20161201-00/?p=94836 (ironically posted one day after I initially wrote this code!).
+// TODO check this on MinGW-w64
+// TODO check this under /Wall
+// TODO C-style casts enough? or will that fail in /Wall?
+// TODO same for UniChar/unichar on Mac? if both are unsigned then we have nothing to worry about
+#if defined(_MSC_VER)
+
+inline size_t utf16EncodeRune(uint32_t rune, __wchar_t *encoded)	// __wchar_t overload: forwards to the uint16_t version so callers need no cast
+{
+	return utf16EncodeRune(rune, reinterpret_cast<uint16_t *>(encoded));	// assumes __wchar_t and uint16_t share size and representation - TODO confirm
+}
+
+inline const __wchar_t *utf16DecodeRune(const __wchar_t *s, size_t nElem, uint32_t *rune)	// __wchar_t overload: forwards to the uint16_t version so callers need no cast
+{
+	const uint16_t *ret;	// position returned by the uint16_t version
+
+	ret = utf16DecodeRune(reinterpret_cast<const uint16_t *>(s), nElem, rune);	// assumes __wchar_t and uint16_t share size and representation - TODO confirm
+	return reinterpret_cast<const __wchar_t *>(ret);	// hand the position back in the caller's pointer type
+}
+
+inline size_t utf16RuneCount(const __wchar_t *s, size_t nElem)	// __wchar_t overload: forwards to the uint16_t version so callers need no cast
+{
+	return utf16RuneCount(reinterpret_cast<const uint16_t *>(s), nElem);	// assumes __wchar_t and uint16_t share size and representation - TODO confirm
+}
+
+inline size_t utf16UTF8Count(const __wchar_t *s, size_t nElem)	// __wchar_t overload: forwards to the uint16_t version so callers need no cast
+{
+	return utf16UTF8Count(reinterpret_cast<const uint16_t *>(s), nElem);	// assumes __wchar_t and uint16_t share size and representation - TODO confirm
+}
+
+#endif
+
+#endif