初始版本

This commit is contained in:
xiaozhengsheng
2025-08-19 09:49:41 +08:00
parent 10f1ddf1c1
commit 6df0f7d96e
2974 changed files with 1712873 additions and 54 deletions

9
external/utf_converter/LICENSE vendored Normal file
View File

@@ -0,0 +1,9 @@
Copyright (c) 2016 Pietro Gagliardi
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
(this is called the MIT License or Expat License; see http://www.opensource.org/licenses/MIT)

350
external/utf_converter/utf.c vendored Normal file
View File

@@ -0,0 +1,350 @@
// utf by pietro gagliardi (andlabs) — https://github.com/andlabs/utf/
// 10 november 2016
#include "utf.h"
// this code imitates Go's unicode/utf8 and unicode/utf16
// the biggest difference is that a rune is unsigned instead of signed (because Go guarantees what a right shift on a signed number will do, whereas C does not)
// it is also an imitation so we can license it under looser terms than the Go source
// the Unicode replacement character; substituted for anything that cannot be encoded or decoded
#define badrune 0xFFFD

// utf8EncodeRune() stores the UTF-8 encoding of rune in encoded and returns
// the number of bytes stored (1 to 4).
// encoded must have room for at least 4 bytes; bytes beyond the returned
// count are left untouched.
// A rune above 0x10FFFF or inside the surrogate range 0xD800-0xDFFF is
// encoded as badrune instead.
size_t utf8EncodeRune(uint32_t rune, char *encoded)
{
	uint8_t lead;
	size_t len;
	size_t k;

	// neither out-of-range values nor surrogates have a UTF-8 form
	if (rune > 0x10FFFF || (rune >= 0xD800 && rune <= 0xDFFF))
		rune = badrune;

	// ASCII represents itself
	if (rune < 0x80) {
		encoded[0] = (uint8_t) rune;
		return 1;
	}

	// choose the sequence length and the marker bits of the lead byte
	if (rune < 0x800) {
		len = 2;
		lead = 0xC0;
	} else if (rune < 0x10000) {
		len = 3;
		lead = 0xE0;
	} else {
		len = 4;
		lead = 0xF0;
	}

	// emit the continuation bytes from last to first, six payload bits each
	for (k = len - 1; k > 0; k--) {
		encoded[k] = (uint8_t) (0x80 | (rune & 0x3F));
		rune >>= 6;
	}

	// what remains of rune now fits in the payload bits of the lead byte
	// (5 bits for two-byte, 4 for three-byte, 3 for four-byte sequences)
	encoded[0] = (uint8_t) (lead | rune);
	return len;
}
// utf8DecodeRune() decodes the UTF-8 sequence starting at s, stores the
// resulting code point in *rune, and returns a pointer to the first byte
// after the sequence.
// If nElem is 0 the input is taken to be '\0'-terminated; otherwise nElem
// is the number of bytes available starting at s.
// On any malformed, overlong, surrogate, out-of-range, or truncated
// sequence, *rune is set to badrune and exactly one byte is consumed.
const char *utf8DecodeRune(const char *s, size_t nElem, uint32_t *rune)
{
	uint8_t lead = (uint8_t) s[0];
	uint8_t lo = 0x80;		// bounds for the first continuation byte
	uint8_t hi = 0xBF;
	uint8_t cont;
	uint32_t value;
	size_t tail;			// number of continuation bytes
	size_t k;

	// ASCII represents itself
	if (lead < 0x80) {
		*rune = lead;
		return s + 1;
	}

	// 0x80-0xBF are stray continuation bytes
	// 0xC0 and 0xC1 could only start overlong two-byte forms
	// 0xF5 and up would encode values above 0x10FFFF or were never defined
	if (lead < 0xC2 || lead > 0xF4)
		goto invalid;

	// sequence length and payload bits carried by the lead byte
	if (lead < 0xE0) {
		tail = 1;
		value = lead & 0x1F;
	} else if (lead < 0xF0) {
		tail = 2;
		value = lead & 0x0F;
	} else {
		tail = 3;
		value = lead & 0x07;
	}

	// some lead bytes narrow the legal range of the first continuation byte
	if (lead == 0xE0)
		lo = 0xA0;		// no overlong three-byte forms
	else if (lead == 0xED)
		hi = 0x9F;		// no surrogates
	else if (lead == 0xF0)
		lo = 0x90;		// no overlong four-byte forms
	else if (lead == 0xF4)
		hi = 0x8F;		// nothing above 0x10FFFF

	// in a bounded buffer, make sure the whole sequence is present
	if (nElem != 0 && nElem - 1 < tail)
		goto invalid;

	// validate and accumulate in one pass; nothing is committed until
	// every continuation byte has been accepted
	for (k = 1; k <= tail; k++) {
		cont = (uint8_t) s[k];
		if (cont < lo || cont > hi)
			goto invalid;
		value = (value << 6) | (uint32_t) (cont & 0x3F);
		// only the first continuation byte has the narrowed range
		lo = 0x80;
		hi = 0xBF;
	}
	*rune = value;
	return s + 1 + tail;

invalid:
	// consume **only** the initial byte
	*rune = badrune;
	return s + 1;
}
// utf16EncodeRune() stores the UTF-16 encoding of rune in encoded and
// returns the number of elements stored (1 or 2).
// encoded must have room for at least 2 elements.
// A rune above 0x10FFFF or inside the surrogate range 0xD800-0xDFFF is
// encoded as badrune instead.
size_t utf16EncodeRune(uint32_t rune, uint16_t *encoded)
{
	uint32_t offset;

	// neither out-of-range values nor surrogates can be encoded
	if (rune > 0x10FFFF || (rune >= 0xD800 && rune <= 0xDFFF))
		rune = badrune;

	// everything in the basic multilingual plane is a single element
	if (rune <= 0xFFFF) {
		encoded[0] = (uint16_t) rune;
		return 1;
	}

	// otherwise split the 20-bit offset into a high and a low surrogate
	offset = rune - 0x10000;
	encoded[0] = (uint16_t) (0xD800 | ((offset >> 10) & 0x3FF));
	encoded[1] = (uint16_t) (0xDC00 | (offset & 0x3FF));
	return 2;
}
// TODO see if this can be cleaned up somehow
// utf16DecodeRune() decodes the UTF-16 sequence starting at s, stores the
// resulting code point in *rune, and returns a pointer to the first
// element after the sequence.
// If nElem is 0 the input is taken to be terminated by a zero element;
// otherwise nElem is the number of elements available starting at s.
// A low surrogate without a preceding high surrogate, or a high surrogate
// not followed by a low surrogate, yields badrune and consumes exactly
// one element.
const uint16_t *utf16DecodeRune(const uint16_t *s, size_t nElem, uint32_t *rune)
{
	uint16_t first = s[0];
	uint16_t second;

	// anything outside the surrogate range represents itself
	if (first < 0xD800 || first > 0xDFFF) {
		*rune = first;
		return s + 1;
	}

	// first is a surrogate; it is only usable if it is a high surrogate
	// and there is a following element to look at
	if (first <= 0xDBFF && nElem != 1) {
		second = s[1];
		if (second >= 0xDC00 && second <= 0xDFFF) {
			// ten bits from each half, then add back the plane offset
			*rune = ((((uint32_t) first & 0x3FF) << 10) | ((uint32_t) second & 0x3FF)) + 0x10000;
			return s + 2;
		}
	}

	// out-of-order surrogate, truncated pair, or bad second half
	*rune = badrune;
	return s + 1;
}
// TODO find a way to reduce the code in all of these somehow
// TODO find a way to remove u as well
// utf8RuneCount() returns the number of runes that utf8DecodeRune()
// produces when applied repeatedly to s.
// If nElem is 0, s is read up to (not including) its '\0' terminator;
// otherwise exactly nElem bytes are read.
// Invalid bytes count as one rune each, since the decoder turns each of
// them into a replacement rune.
size_t utf8RuneCount(const char *s, size_t nElem)
{
	uint32_t ignored;
	size_t count = 0;
	const char *next;

	if (nElem == 0) {
		// unbounded mode: the terminator ends the string
		while (*s != '\0') {
			s = utf8DecodeRune(s, 0, &ignored);
			count++;
		}
		return count;
	}

	// bounded mode: shrink the remaining length by whatever was consumed
	for (; nElem != 0; count++) {
		next = utf8DecodeRune(s, nElem, &ignored);
		nElem -= (size_t) (next - s);
		s = next;
	}
	return count;
}
// utf8UTF16Count() returns the number of uint16_t elements needed to hold
// the UTF-16 form of the UTF-8 string s (no terminator is counted).
// If nElem is 0, s is read up to (not including) its '\0' terminator;
// otherwise exactly nElem bytes are read.
size_t utf8UTF16Count(const char *s, size_t nElem)
{
	uint16_t scratch[2];	// receives each encoding; only the length matters
	uint32_t decoded;
	size_t total = 0;
	const char *next;

	if (nElem == 0) {
		// unbounded mode: the terminator ends the string
		while (*s != '\0') {
			s = utf8DecodeRune(s, 0, &decoded);
			total += utf16EncodeRune(decoded, scratch);
		}
		return total;
	}

	// bounded mode: shrink the remaining length by whatever was consumed
	while (nElem != 0) {
		next = utf8DecodeRune(s, nElem, &decoded);
		total += utf16EncodeRune(decoded, scratch);
		nElem -= (size_t) (next - s);
		s = next;
	}
	return total;
}
// utf16RuneCount() returns the number of runes that utf16DecodeRune()
// produces when applied repeatedly to s.
// If nElem is 0, s is read up to (not including) its zero terminator;
// otherwise exactly nElem elements are read.
// Unpaired surrogates count as one rune each, since the decoder turns
// each of them into a replacement rune.
size_t utf16RuneCount(const uint16_t *s, size_t nElem)
{
	uint32_t ignored;
	size_t count = 0;
	const uint16_t *next;

	if (nElem == 0) {
		// unbounded mode: the terminator ends the string
		while (*s != 0) {
			s = utf16DecodeRune(s, 0, &ignored);
			count++;
		}
		return count;
	}

	// bounded mode: shrink the remaining length by whatever was consumed
	for (; nElem != 0; count++) {
		next = utf16DecodeRune(s, nElem, &ignored);
		nElem -= (size_t) (next - s);
		s = next;
	}
	return count;
}
// utf16UTF8Count() returns the number of bytes needed to hold the UTF-8
// form of the UTF-16 string s (no terminator is counted).
// If nElem is 0, s is read up to (not including) its zero terminator;
// otherwise exactly nElem elements are read.
size_t utf16UTF8Count(const uint16_t *s, size_t nElem)
{
	char scratch[4];	// receives each encoding; only the length matters
	uint32_t decoded;
	size_t total = 0;
	const uint16_t *next;

	if (nElem == 0) {
		// unbounded mode: the terminator ends the string
		while (*s != 0) {
			s = utf16DecodeRune(s, 0, &decoded);
			total += utf8EncodeRune(decoded, scratch);
		}
		return total;
	}

	// bounded mode: shrink the remaining length by whatever was consumed
	while (nElem != 0) {
		next = utf16DecodeRune(s, nElem, &decoded);
		total += utf8EncodeRune(decoded, scratch);
		nElem -= (size_t) (next - s);
		s = next;
	}
	return total;
}

61
external/utf_converter/utf.h vendored Normal file
View File

@@ -0,0 +1,61 @@
// utf by pietro gagliardi (andlabs) — https://github.com/andlabs/utf/
// 10 november 2016
// Include guard: without it, a second inclusion in a C++ translation unit
// built with MSVC would redefine the inline overloads at the bottom.
#ifndef ANDLABS_UTF_H
#define ANDLABS_UTF_H

// Standard headers are included before the linkage block opens so that
// their own declarations are never placed inside extern "C".
#include <stddef.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

// Conventions shared by every function that takes an nElem argument:
// if nElem == 0, assume the buffer has no upper limit and is '\0' terminated
// otherwise, assume buffer is NOT '\0' terminated but is bounded by nElem *elements*
//
// Anything that cannot be encoded or decoded (values above 0x10FFFF,
// surrogates, malformed or truncated sequences) is replaced by U+FFFD.

// Writes the UTF-8 form of rune to encoded, which must have room for at
// least 4 bytes. Returns the number of bytes written (1 to 4).
extern size_t utf8EncodeRune(uint32_t rune, char *encoded);

// Decodes one rune from s into *rune and returns a pointer just past the
// bytes consumed. An invalid sequence consumes exactly one byte.
extern const char *utf8DecodeRune(const char *s, size_t nElem, uint32_t *rune);

// Writes the UTF-16 form of rune to encoded, which must have room for at
// least 2 elements. Returns the number of elements written (1 or 2).
extern size_t utf16EncodeRune(uint32_t rune, uint16_t *encoded);

// Decodes one rune from s into *rune and returns a pointer just past the
// elements consumed. An invalid sequence consumes exactly one element.
extern const uint16_t *utf16DecodeRune(const uint16_t *s, size_t nElem, uint32_t *rune);

// Returns the number of runes in the UTF-8 string s.
extern size_t utf8RuneCount(const char *s, size_t nElem);

// Returns the number of uint16_t elements needed to store the UTF-8
// string s as UTF-16, not counting any terminator.
extern size_t utf8UTF16Count(const char *s, size_t nElem);

// Returns the number of runes in the UTF-16 string s.
extern size_t utf16RuneCount(const uint16_t *s, size_t nElem);

// Returns the number of bytes needed to store the UTF-16 string s as
// UTF-8, not counting any terminator.
extern size_t utf16UTF8Count(const uint16_t *s, size_t nElem);

#ifdef __cplusplus
}

// Provide overloads on Windows for using these functions with wchar_t and WCHAR when wchar_t is a keyword in C++ mode (the default).
// Otherwise, you'd need to cast to pass a wchar_t pointer, WCHAR pointer, or equivalent to these functions.
// We use __wchar_t to be independent of the setting; see https://blogs.msdn.microsoft.com/oldnewthing/20161201-00/?p=94836 (ironically posted one day after I initially wrote this code!).
// TODO check this on MinGW-w64
// TODO check this under /Wall
// TODO C-style casts enough? or will that fail in /Wall?
// TODO same for UniChar/unichar on Mac? if both are unsigned then we have nothing to worry about
#if defined(_MSC_VER)

inline size_t utf16EncodeRune(uint32_t rune, __wchar_t *encoded)
{
	return utf16EncodeRune(rune, reinterpret_cast<uint16_t *>(encoded));
}

inline const __wchar_t *utf16DecodeRune(const __wchar_t *s, size_t nElem, uint32_t *rune)
{
	const uint16_t *ret;

	ret = utf16DecodeRune(reinterpret_cast<const uint16_t *>(s), nElem, rune);
	return reinterpret_cast<const __wchar_t *>(ret);
}

inline size_t utf16RuneCount(const __wchar_t *s, size_t nElem)
{
	return utf16RuneCount(reinterpret_cast<const uint16_t *>(s), nElem);
}

inline size_t utf16UTF8Count(const __wchar_t *s, size_t nElem)
{
	return utf16UTF8Count(reinterpret_cast<const uint16_t *>(s), nElem);
}

#endif // defined(_MSC_VER)

#endif // __cplusplus

#endif // ANDLABS_UTF_H