bpe_framework/src/tokenizer/unicode_utils (copy 1).cpp
2025-09-13 12:45:42 -07:00

129 lines
3.7 KiB
C++
Executable File
Raw Blame History

// src/tokenizer/unicode_utils.cpp
#include "lm/tokenizer/unicode_utils.hpp"

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <string>
#include <vector>

#include <unicode/normalizer2.h>
#include <unicode/normlzr.h>
#include <unicode/uchar.h>
#include <unicode/unistr.h>
#include <unicode/ustring.h>
namespace lm::unicode {
// Reports whether ICU's u_isUWhiteSpace() classifies `codepoint` as
// whitespace. The ICU result is converted to a plain bool.
bool is_whitespace(uint32_t codepoint) {
    const bool icu_says_whitespace = u_isUWhiteSpace(codepoint);
    return icu_says_whitespace;
}
// Reports whether ICU's u_ispunct() classifies `codepoint` as punctuation.
// The ICU result is converted to a plain bool.
bool is_punctuation(uint32_t codepoint) {
    const bool icu_says_punctuation = u_ispunct(codepoint);
    return icu_says_punctuation;
}
// Reports whether ICU's u_iscntrl() classifies `codepoint` as a control
// character. The ICU result is converted to a plain bool.
bool is_control(uint32_t codepoint) {
    const bool icu_says_control = u_iscntrl(codepoint);
    return icu_says_control;
}
std::string normalize(const std::string& text) {
try {
icu::UnicodeString unicode_str = icu::UnicodeString::fromUTF8(text);
icu::UnicodeString normalized;
UErrorCode status = U_ZERO_ERROR;
icu::Normalizer::normalize(unicode_str, UNORM_NFC, 0, normalized, status);
if (U_FAILURE(status)) {
throw std::runtime_error("Unicode normalization failed");
}
std::string result;
normalized.toUTF8String(result);
return result;
} catch (const std::exception& e) {
throw std::runtime_error("Unicode normalization error: " + std::string(e.what()));
}
}
std::vector<CodePoint> to_code_points(const std::string& text) {
std::vector<CodePoint> code_points;
for (size_t i = 0; i < text.size(); ) {
CodePoint cp;
uint32_t codepoint;
int offset = 0;
// Decode UTF-8
U8_NEXT(text.c_str(), i, text.size(), codepoint);
if (codepoint == U_SENTINEL) {
// Handle invalid UTF-8 gracefully instead of throwing
// Use replacement character (U+FFFD) for invalid sequences
cp.value = 0xFFFD;
cp.utf8 = "<EFBFBD>"; // Replacement character
code_points.push_back(cp);
// Skip this byte and continue
i++;
continue;
}
// Get the UTF-8 bytes for this code point
char utf8_buf[5] = {0};
U8_APPEND_UNSAFE(utf8_buf, offset, codepoint);
cp.value = codepoint;
cp.utf8 = std::string(utf8_buf, offset);
code_points.push_back(cp);
i += offset;
}
return code_points;
}
std::string from_code_points(const std::vector<CodePoint>& code_points) {
std::string result;
for (const auto& cp : code_points) {
result += cp.utf8;
}
return result;
}
// The definitions below need no "unicode::" qualification: this file is already inside namespace lm::unicode.
// Splits UTF-8 `text` into one string per encoded character, using only the
// byte patterns (no ICU decoding).
//
// The lead byte determines the expected sequence length (1-4 bytes). Only
// bytes of the form 10xxxxxx are accepted as trail bytes, so a malformed or
// truncated sequence ends at the first byte that is not a trail byte instead
// of swallowing the characters that follow. A stray trail byte or an invalid
// lead byte becomes a one-byte element. Concatenating the result always
// reproduces `text`.
std::vector<std::string> unicode_split(const std::string& text) {
    std::vector<std::string> characters;
    const std::size_t length = text.length();
    std::size_t i = 0;  // unsigned index: matches length(), no overflow on large inputs
    while (i < length) {
        const unsigned char lead = static_cast<unsigned char>(text[i]);

        // Sequence length announced by the lead byte; anything unrecognised
        // (ASCII, stray trail byte, invalid lead) is treated as one byte.
        std::size_t expected_len = 1;
        if ((lead & 0xE0) == 0xC0) {
            expected_len = 2;
        } else if ((lead & 0xF0) == 0xE0) {
            expected_len = 3;
        } else if ((lead & 0xF8) == 0xF0) {
            expected_len = 4;
        }

        // Consume trail bytes only while they really are trail bytes and
        // remain inside the string.
        std::size_t char_len = 1;
        while (char_len < expected_len && i + char_len < length &&
               (static_cast<unsigned char>(text[i + char_len]) & 0xC0) == 0x80) {
            ++char_len;
        }

        characters.push_back(text.substr(i, char_len));
        i += char_len;
    }
    return characters;
}
std::vector<std::string> split_on_character_boundaries(const std::string& text) {
std::vector<std::string> characters;
auto code_points = to_code_points(text);
for (const auto& cp : code_points) {
characters.push_back(cp.utf8);
}
return characters;
}
} // namespace lm::unicode