// File: bpe_framework/include/lm/tokenizer/bpe_tokenizer.hpp
// Timestamp: 2025-08-27 14:02:03 -07:00
//
// Repository listing metadata: 58 lines, 1.5 KiB, C++, executable file

#pragma once
#include <string>
#include <vector>
#include <unordered_map>
#include <map>
#include <memory>
#include <utility>
#include <cstdint> // For uint16_t
#include <queue>
#include <functional>
#include <Eigen/Dense>
namespace lm {
// 16-bit token identifier: at most 65,536 distinct IDs (0..65535).
// Spelled std::uint16_t because <cstdint> only guarantees the std:: name;
// the unqualified global alias is optional.
using TokenID = std::uint16_t;
/// Byte-pair-encoding tokenizer interface.
///
/// Only declarations live in this header; all state and behavior sit behind
/// the opaque `Impl` struct, which is defined elsewhere. The notes below
/// therefore describe the declared contract, not implementation details.
///
/// Ownership: the implementation is held through a std::unique_ptr, so the
/// copy operations were already implicitly deleted; they are now deleted
/// explicitly to make that intent (and the resulting diagnostics) clear.
/// Because a destructor is user-declared, no move operations are generated
/// either, so instances are neither copyable nor movable.
class BPETokenizer {
public:
    BPETokenizer();
    ~BPETokenizer();

    // Non-copyable: the tokenizer uniquely owns its implementation object.
    BPETokenizer(const BPETokenizer&) = delete;
    BPETokenizer& operator=(const BPETokenizer&) = delete;

    // --- Training ---------------------------------------------------------

    /// Trains the tokenizer on an in-memory corpus.
    /// @param corpus      Input strings to train on.
    /// @param vocab_size  Requested vocabulary size (defaults to 30000).
    ///                    NOTE(review): TokenID is 16 bits wide, so sizes above
    ///                    65536 presumably cannot be represented - confirm how
    ///                    the implementation handles that.
    void train(const std::vector<std::string>& corpus, size_t vocab_size = 30000);

    /// Trains the tokenizer using the file named by `filename`.
    /// @param filename    Path of the training input.
    /// @param vocab_size  Requested vocabulary size (defaults to 30000).
    void train_from_file(const std::string& filename, size_t vocab_size = 30000);

    // --- Tokenization -----------------------------------------------------

    /// Converts `text` into a sequence of token IDs.
    std::vector<TokenID> encode(const std::string& text) const;

    /// Converts a sequence of token IDs back into a string.
    std::string decode(const std::vector<TokenID>& tokens) const;

    // --- Serialization ----------------------------------------------------

    /// Saves the tokenizer to `filename`.
    /// @return A bool status; presumably true on success - confirm against
    ///         the implementation.
    bool save(const std::string& filename) const;

    /// Loads the tokenizer from `filename`.
    /// @return A bool status; presumably true on success - confirm against
    ///         the implementation.
    bool load(const std::string& filename);

    // --- Vocabulary access ------------------------------------------------

    /// Returns the vocabulary size.
    size_t vocab_size() const;

    /// Returns the token string for `id`. Handling of IDs that are not in
    /// the vocabulary is not visible from this header.
    std::string id_to_token(TokenID id) const;

    /// Returns the ID for `token`. Handling of tokens that are not in the
    /// vocabulary is not visible from this header.
    TokenID token_to_id(const std::string& token) const;

    // --- Configuration ----------------------------------------------------

    /// Sets the string used as the unknown token.
    void set_unknown_token(const std::string& token);

    /// Registers `token` as a special token.
    void add_special_token(const std::string& token);

    // --- Unicode-specific settings ------------------------------------------

    /// Enables or disables normalization.
    void set_normalization(bool enabled);

    /// Enables or disables byte fallback.
    void set_byte_fallback(bool enabled);

    // --- Eigen interop ----------------------------------------------------

    /// Encodes `text` and returns the result as an Eigen integer vector.
    /// Note the element type is int here, whereas encode() yields TokenID.
    Eigen::VectorXi encode_to_vector(const std::string& text) const;

    /// Decodes token IDs supplied as an Eigen integer vector.
    std::string decode_from_vector(const Eigen::VectorXi& tokens) const;

    /// Returns token frequencies as an Eigen float vector. The indexing and
    /// any normalization of the values are defined by the implementation.
    Eigen::VectorXf token_frequencies() const;

private:
    // Pimpl: Impl is only forward-declared here, which is why the destructor
    // is declared above rather than left implicit.
    struct Impl;
    std::unique_ptr<Impl> pimpl_;
};
} // namespace lm