From 86e4024d7250d098a244e8c043048096b93cd9c9 Mon Sep 17 00:00:00 2001 From: "Tim O\\'Neil" Date: Sat, 16 Aug 2025 20:25:23 -0700 Subject: [PATCH] Latest commit to Fred's git server --- .gitignore | 41 +++++ LICENSE | 29 ++- README.md | 6 +- TODO | 244 +++++++++++++++++++++++++ docs/spec.md | Bin 0 -> 8788 bytes src/tokenizer/openmt_adapter.cpp | 0 src/tokenizer/tok_adapter.h | 39 ++++ tests/tokenizer/test_openmtadapter.cpp | 12 ++ 8 files changed, 366 insertions(+), 5 deletions(-) create mode 100644 .gitignore create mode 100644 TODO create mode 100644 docs/spec.md create mode 100644 src/tokenizer/openmt_adapter.cpp create mode 100644 src/tokenizer/tok_adapter.h create mode 100644 tests/tokenizer/test_openmtadapter.cpp diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d4fb281 --- /dev/null +++ b/.gitignore @@ -0,0 +1,41 @@ +# Prerequisites +*.d + +# Compiled Object files +*.slo +*.lo +*.o +*.obj + +# Precompiled Headers +*.gch +*.pch + +# Linker files +*.ilk + +# Debugger Files +*.pdb + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod +*.smod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Executables +*.exe +*.out +*.app + +# debug information files +*.dwo diff --git a/LICENSE b/LICENSE index 850ea9f..588435f 100644 --- a/LICENSE +++ b/LICENSE @@ -1,5 +1,28 @@ -Copyright (C) 2025 by interval interval1066@gmail.com +BSD 3-Clause License -Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted. +Copyright (c) 2025, interval1066 -THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md index 08dc2ec..fdaf7d8 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ -# big_llm +# Generative Pre-trained Transformer AI Language Model +My own large language model +This is just the very start of a agetic ai llm that I've been thinking about. Has a few external dependancies, the main one being the OpenNMT Tokenizer from MIT (https://github.com/OpenNMT/Tokenizer), for now. -My agentic model effort. \ No newline at end of file +Todo: build NMT as a shared lib and register it with the linux system librarian. diff --git a/TODO b/TODO new file mode 100644 index 0000000..64fa05b --- /dev/null +++ b/TODO @@ -0,0 +1,244 @@ +1. Initialization Module (lm::runtime) +include/lm/runtime/init.hpp +cpp + +#pragma once +#include +#include // JSON library +#include + +namespace lm::runtime { + +class SystemState { +public: + // Singleton access + static SystemState& get_instance(); + + // Initialize from JSON config + void initialize(const std::filesystem::path& config_path); + + // Configuration accessors + const nlohmann::json& config() const noexcept; + std::string get_string(const std::string& key) const; + int get_int(const std::string& key, int default_val = 0) const; + + // Subsystem states + bool is_tokenizer_ready() const noexcept; + bool is_model_loaded() const noexcept; + +private: + SystemState() = default; // Private constructor + nlohmann::json config_; + bool tokenizer_ready_ = false; + bool model_loaded_ = false; +}; + +} // namespace lm::runtime + +2. Shutdown Module (lm::runtime) +include/lm/runtime/shutdown.hpp +cpp + +#pragma once +#include +#include + +namespace lm::runtime { + +class ShutdownHandler { +public: + // Serialize state to JSON + static void save_state( + const std::filesystem::path& output_path, + bool include_model_weights = false + ); + + // Cleanup hooks + static void register_cleanup(void (*func)()); + static void execute_cleanup(); +}; + +} // namespace lm::runtime + +3. Implementation Files +src/runtime/init.cpp +cpp + +#include "lm/runtime/init.hpp" +#include +#include + +using namespace lm::runtime; + +void SystemState::initialize(const std::filesystem::path& config_path) { + try { + // Load JSON config + std::ifstream f(config_path); + config_ = nlohmann::json::parse(f); + + // Validate required fields + if (!config_.contains("tokenizer") || !config_.contains("model")) { + throw std::runtime_error("Invalid config: missing required sections"); + } + + // Initialize subsystems + tokenizer_ready_ = initialize_tokenizer(config_["tokenizer"]); + model_loaded_ = initialize_model(config_["model"]); + + } catch (const std::exception& e) { + throw std::runtime_error("Initialization failed: " + std::string(e.what())); + } +} + +// ... (Other method implementations) + +src/runtime/shutdown.cpp +cpp + +#include "lm/runtime/shutdown.hpp" +#include +#include + +namespace { + std::vector cleanup_functions; + std::mutex cleanup_mutex; +} + +void ShutdownHandler::save_state( + const std::filesystem::path& output_path, + bool include_model_weights) +{ + nlohmann::json state; + + // Capture framework state + state["tokenizer"] = serialize_tokenizer_state(); + state["model"] = serialize_model_state(include_model_weights); + state["threading"] = serialize_thread_pool_stats(); + + // Write to file + std::ofstream(output_path) << state.dump(2); // Pretty print +} + +void ShutdownHandler::register_cleanup(void (*func)()) { + std::lock_guard lock(cleanup_mutex); + cleanup_functions.push_back(func); +} + +void ShutdownHandler::execute_cleanup() { + std::lock_guard lock(cleanup_mutex); + for (auto it = cleanup_functions.rbegin(); it != cleanup_functions.rend(); ++it) { + (*it)(); // Execute in reverse order + } +} + +4. Configuration JSON Schema +config_schema.json +json + +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "required": ["tokenizer", "model"], + "properties": { + "tokenizer": { + "type": "object", + "properties": { + "type": {"enum": ["bpe", "sentencepiece"]}, + "vocab_path": {"type": "string"}, + "dropout": {"type": "number", "minimum": 0, "maximum": 1} + } + }, + "model": { + "type": "object", + "properties": { + "path": {"type": "string"}, + "max_seq_len": {"type": "integer", "minimum": 1} + } + }, + "threading": { + "type": "object", + "properties": { + "worker_threads": {"type": "integer", "minimum": 1} + } + } + } +} + +5. Usage Example +Initialization +cpp + +#include "lm/runtime/init.hpp" + +int main() { + try { + auto& state = lm::runtime::SystemState::get_instance(); + state.initialize("config.json"); + + if (!state.is_model_loaded()) { + throw std::runtime_error("Model failed to load"); + } + } catch (const std::exception& e) { + std::cerr << "Fatal error: " << e.what() << std::endl; + return 1; + } +} + +Shutdown +cpp + +#include "lm/runtime/shutdown.hpp" + +void cleanup_db_connections() { + // Custom cleanup logic +} + +int main() { + lm::runtime::ShutdownHandler::register_cleanup(cleanup_db_connections); + + // ... run application ... + + lm::runtime::ShutdownHandler::save_state("shutdown_state.json"); + lm::runtime::ShutdownHandler::execute_cleanup(); +} + +6. CMake Integration +cmake + +# Add to your CMakeLists.txt +find_package(nlohmann_json 3.9 REQUIRED) + +add_library(lm_runtime + src/runtime/init.cpp + src/runtime/shutdown.cpp +) + +target_link_libraries(lm_runtime + PRIVATE nlohmann_json::nlohmann_json +) + +Key Features + + Thread-Safe Cleanup: Reverse-order execution of registered handlers + + Config Validation: JSON schema verification during initialization + + State Capture: Full system snapshot on shutdown + + Singleton Pattern: Controlled access to system state + +Error Handling +Error Case Recovery Action +Invalid JSON Throw std::runtime_error with path info +Missing Required Field Fail fast during initialization +Shutdown Serialization Failure Log warning, continue cleanup + +This implementation provides a robust foundation for managing the framework lifecycle. Would you like to add any of the following? + + Configuration hot-reloading + + Encrypted config files + + Distributed state serialization + + diff --git a/docs/spec.md b/docs/spec.md new file mode 100644 index 0000000000000000000000000000000000000000..da45f5c7d2c902134301a57f2f0bed0aa7b38d75 GIT binary patch literal 8788 zcma)h1yG#Jwl(f9gImzx1SdEN1b270!QGwUu3>Nqkl+phg1cLAhk>BM{lU5Sy_0j^ zedqsoS53`S_4=m1wR^Ab-Ah3l8U_af0RaKR^$Vgl#BYK8{9WJC6zIgl{PJ0rAS(?m zf)aZCfGc*i3}guFAH94fYWL~_J~;j0XnYi&-LmXyMFgo?AtI;mo5$G+@ubb@V~!js zGE$_g)KbKOb)uQo%6(CQh0I`zrCD7tZdxi`hrV}~^$R??LZV#V>f zQjcpB3t-EH(a1BCXJ(rRMBCN18mnr3s=vFv0^kGpNxfTkv!eINYS5lzIz{04E!n5mlNrOQ`2uCajIt>m)u|pt5aYTqecF zNOq|yDA82{3;mk&^AKiiGhTP4ka6x8A8^GTqwv% zLSk4%6l$9z1sX+9j-5V{t*Y$ zAEY=jkcXY%EY-`+^(12DhY095SV`~JyU~wR6%%?lInuwGU(en3Rz_OuavTserUm4F zV)$wMBVGs#P5oVpcEbr&LAzFM0>uod`c?1f^28kRkFE(1oq-FIaOI;X%H={yM>Zw9 zakbgX&5>1xym~N1M)}C)CU)_mIH7+eQvL1=)AuUtP5n!o@+swmtth$lYk2LFz zw7PlsqSy0Z=sLsfCQndHZU(>0DJ)60mHMUFDz{7ZO9ZUtEG{NFx;BuoT2;pm&{0^U zq_WUY;DTw%nH~aot{X>j(s*rpVk$I;etOW8bZ7fD9)!jroeZrMrji z9#4zNgExCs#XORKlpbtD3uQ0eFsGY$VgSWKa-V#qixYYyzOEK>xr`JZ$DfJ2S+5v;2C@KEmhD=4l6Xzg z*^+hoJy!ud0(|{g1@|-QH2+9C@iXaec8(@2{}PMm0&%f1wRL{s{tw~kN$jui#tLExUvU{aba0S1fE`3X^dIsNN3#YkOvq>9=c06LR~#8wfU z%z*mATeRG{#gEGMcEkg<^<-WvT88klXyH@Km1tv46iB0m*@DkF zh0J1*YJ&CdR}M&C2#%dR=)`*__#FEGB|Co*{5!RWx<73RB>|7^@`^?U{faFmg`>PKd)ZIEn?H&r8s*cyk_L;|U%%5y)SR{O`7i%dxeoxoA^qil-RC6y^W^;yh=g9O^G^!FrlpC zugB#SsaF;x@TOo^FJ{4O6pAHpC#~juw&NR-I~E%{&q4tgF7(W)VSe?^ww4P~O%FJ_ zU;gsK8^)0hvX3!ok$JPxGKfY*I_`iJDx$<#p_rxy?$@F+8E$!-m5~AVG|PFVPtvV$ zrOGgHP~l2BQe6E_bN^K(BDb(*BE+bIG&4HkGRs;Dd5+z@j{D}Z*?EDimT0PjW+B@> zTE_QH>l4md(5JOwn^fY8AfNq@(QiUIZebm~9l}R>T@eh%-*d*M;e1zIi1bW%mRn&s z$8H|IgDg%Ya)4qTf$*2rRYIj2=fPmaO@SbXw^O3Lj>=8cr{>FVj3R&4^MMna{)*=m z@i>0^6!sGK7Obg(@x{MaK{F?2Q*4tdOz-FdbQ1b2IRoi9FA;pA;|UoGN}5|AJ6@{0USZHk~tEP&lJC`s_U z8d5rPmxYGpJ1?m5>AGolCQNLwK|`7N_R{)ezN4{>Pwpf{!+=USach&moPO&dX%0zh zu2riZSAVo5Kc`Z=%#SPLgD-B4YI~a3-1Bd6<7ldPHvu9&Qu7i4$hTlhUUzJ3S7TLU zNbc~hgF`dlN3hg>JGJO9Vo}Fsw~i-xeNO_4*URo)oF&2n5$Y6JZIwtN-wXAjCQo3p z2Ojail+GO+$bEf#L4$Sk4oh76tTCuJupJG&mY;vsMeuqcVxZ@iI%SnvYrAffB^b@0>b${NgB{Y zhA3#tRp~Wq;T?K#ePzDhIE;EhWwMVLf-KoEsrLeFoYY?4v7gVBc)9luEzh!tR`i7c z{vLs!VWitM?p;%;`vlbY zPF3(VwRPzDP=$F$Y4!KDvT<4}W?Vz-W1LI~F!E`MF%C9}2#}7P_=lx$W)+N7auSt~ zg@PRsqu>>9oE~G!`ubjFdnv<@R$J?O>EosKcwvE1g{Vr+#)8M0FT6v4AXI#1p$is>CD{UUFGc12OI7sU7JsmP> zH)0!@RM}6wT@ZEg4W!puutUzSHCC6taJ$7+J<%?Oh&V!QDN^_W_k%3*M990GyB{gK-`b~E z!VBrTArtZpsvs1SbMn<7!>!hxHm0#H?SyG-k)I3y$sUt4;xnB88d0}+m%JFK&n9ho{yU9O z)*w}4_Q2L|A_Re)?}`I30dYJ!3ql|nw3#NC@uYK;@T7xZV{@GFr09MDanMx>T>KhK zAH=t520!mvuf89hbLbEI$h`5jeRpk@z%P%@z2pg*5JtiJcEpdm?r&}^vegXarG~o|8*f_Bi1X)k z2=HRSfK+!c$(}x}Vz&G)5Wqwzn!<36g+MDPTkCiP`f|dq^N1eN%z!>SE=6elQ&C5a zPDfJXU>8|6@%~3dnR9Y~FxzCgarL&fRA3ZClIxa)82QKMXpBl-oaFV4Vn)fWug0t= zg5QZ-0{LeGMDB!CZr1|SmsIho=ukvy!aSM2bBS$U5Rz;HVI7$H&NRXzf~xSLdX!^C7QPvtgpVN_}*xg zr;P64^|%mjI#P;pmcptGzs=)+#NsEv_1H1zOFu`Q5Ev**%3mrSkox z!Wy#a0Sh7Is*-PebpyeaQYb#1!}657RjOn1@mO_b;-`yN3M$$Vq8O0xTNO$q`5NIx zZqGWe`^A>%$x(FZU!8(|+D;^9l%EJ$NCa7L znLpMz{JBQ4T_I$CnwW8$W&~6+Wam?KgvMP|zO|w-GEV zT3N^dt6m95vCzcTY#t43gtxprcsW$i=_f~je?e^|(8*i(q1uWfyn~A#&MQ+~HIxZn zo|LjETjrW3ag3H0ThRW^`ud`ntjAZ5?4fO1oc59J(3!9I2BPyxh~sv9ypD$V0wP7x zj}PYibUXJv!ZU1k*a4$reZG+23w}KExF_%T?BNqUgv1bd!OiG=Q3my5ai^a@hFTmh z81Ymi8R6l6Nd=cW`%ue2e3onc=qGmWZG-vft@bsQrd5H34TgaoX5N{G4Q9wt1KUbf zrAle#eR2Sp6_2xo;p1pG1b;Xr>+Jw$;36h1o+hH-H@OmtDj|=mycFVG2hgl7mY6VEy6BI*Z*r{H~^o_z1wRbv^qcj1q z{(blJ0`?7`^Lo9Fr6~LclC5$>qsXNG zPTVe$+H)Y$yGjARA2~>u{h8eHebJKmO-SNL6>0ImGKt=|FmZRe9s%n!BHe!0ik*o( zd0GtQdKD1<@Gs=3EUARKcFEDh;~E$@KZ~^{mbh_(6%gSR`R0(pGjhnnAHD5eQ;5Xf zXRlM03lWZXf$5@N6*yFxwnyYB;=NF1W_X!YNHYEjQYCOj8B#B)4=ddLRd_KRLp*d- zJHDi*hA7;3WgvE{JtIOXC^Mg=yaD-^RlG&`E`0Q>Oz?(AnPsgS1zt~$ex&;6!W!{1 zB|hEJcL+PIqni4eL6-<(;+K{t6d7Zr%(N(&Ard@K8wn7-(7_Z6oEMg>ZxniBF&3yS zA~)ITS6Nq3$O9$zy-oI(a2d8e)@R`(1VYaCv6X|%;91_j`U%y-&`8oerq^1^}*-O(YwPsqiCD=;WYjX&`0&FhsLCF**c@RI^Sp8LAA4HXJ*RJADl^1uI3q1n>4SB=uFNPKyjHkTEIYgZ+T| zKI2A7pE?7UVGWK9y7yU+g+ngwf{Ky|7wJm4K7R zR0Hebe0lc}$I=4Qh>ASo6v{KFuJnXavnZ2y{oSq9(u@Wkqaav-$#x)plTo!;NW6-A zLQeSG-3UBK^V?nhQKQnC_+U$Ziou)UV@xq?s<5AXKF5^?C!GR$*xZcoemT(MYYnIz z4bd4{Gjst@d=x*PrlAx$F=-{>v(lQdX8a7-v#e@3{s{Cd@Z+M|$$0H9Zm&f?P$MG@ zqq801w_=89_SQgZt8s~f!L7J_kJFu%0-Ug@cUm@a9 zukeTP?Qw5+=oK;W5a#V%Md1D3y#&b7Fqh(FUmhKK^RDgQyY~ZLMox!9iG3i{ES4m( zpP|=+Xx~Xtb%foC_sFIVZQUC&Og0*3HXPWmOjn##wMrVos3;Fc%wxFJD61<|`ia-3 zr+)NT{UkA;wgAtnXR}=HsbnTseC$@^?NCqZ6XeD-Ocj>?x66-B%bijf9 z6Q*AlKL#dF&WS?=c=`Ouh+RlcSe$UEg_U4@iydnHDYMI-KGeWSz9N#rwKD%~=icV5 zf_q|}>m2Cr;ARB3RlHo`BvoZ)YrC!Mb^Ng(58RHo=saLa&9mS6xw3f3g9#k-8!) z!?MWpbvqTc&?Tgc%;*hcgOM{FnP5{S!HnEGuZ}9aT5Ly?heP`IvYrM{MV{iz6+_Sn z1NOmTksG|H!f8m(esODI5fjp#%8GK<$G2o$RwxJz1UD$u(Lr)##!p+fzif|03t{3O zeb?3a_0Fy;M^-C1_fz`K;@LX6wD%2Fnl9^4S9&I`bLA9^j0*y02lX9&wfhwHj&sk~ zUS*m}3^%yG`naekbg)v&ZE-ah)pMP@UM6~L)^_V~?uXlWOhP(RbI}JC%G71+Bv6TrSdl6Dd>O4Zxn&=-TB>8?*u6aY(2Yk#2n^LFK z%y?(S{8rzE_V4PIw{m3#rw(P8;)Uq%_>V1fch5M9>-15wv1k!8!m)!7!;Q1Iwx3?K z2w9pZSsn%gA{_spvmvs($@S%jt)U1whO`-eaAjXCMA7qc47)LDkTx0 zNK2(8$qd-6E1St?j|e;KSZ4b68Jt`3pLy#jJZE`t-JjiImD)Jg#}b^ zFkW@~u3fM)T-O@aEZzNbcjG+wRHEQ|AIuK&Vqk1tJk7}14&!tV3@QJ)2at(LY2sEd zCB125+(~;izsqUHr|u{cKN)MZ!_t&C8;GYNRaVd)8%Tckx-@OBZoO#^fotDWXQC8C z=a-T$E-1!bQw{bgCR!B4?#|>f78z<(G;0h?^I^|x$U}8VwjspkF)%@vglUvOI{tcL zx~zUSp9Z&NdJtu`Rx@-!B>QlFMN-|Ts)Qt9afnWof7x8vJcmn zK15F%z&w~#U5^LN+K-dRlbIQW0-aOs*33?J7&;Bxk4}a?MSSy@n&uZ>#8v4x=HgwA zUyhn1j~X{+1}-tk{&eV_NE_PB8 zYk9L&5Z86c=h;VTDQ~=nNS)krjQje0(^-6Ip5gz)fx8UUl1 zfVBTKn7Tm_Y|#H?}`3yMgQ5tLHiiE%zB&k$S0qQ%;(DSazNW z+{41Ciw|0Jr0WNoYZ-&kn$B7PO_NhgO<+8Ux66TMiiXLarKVN9>1WgQ_gp+oLsGhODzBYRfT4Dw$3VsM!-KDq_XD=@=n%gXU!{u?rNgb@Dm2C zY=$eL4B-hxk&9&kRry9z=OPPE98HR(^I};z$y2*fOXB>BMU_}z1d!%qkP1>QFYcf% zdy`)4ijY(XDG>4#quDQiM)Ro_$+ye{o`ChHZHJ73!8=b76-eE}A6_LNwJHGwNfu+9 zQM(nfpcpfp+dMHqCrgF}Jvit@;3iE#FP*%Jyd5p_Q~o`%)k!XNxyp#6rz`D36)d() zMUn^r8Q23)P^l}b7huf4b0BQ~M%Hw)6*~WX-SKDhIkN&owtDIv2@fn$bf1DF)*VeV ze;Vpu@hrH#E=yEOhkgOj5OI^LWM>sr9>kOaR+%peyo|(Xoo?F6>;|De5p{a~NM;$Y z;CBh@UMQ_xqT3m2ECHM3oZ_6S7$8Ppa(8}8EKI8~w*rZ}70d)BBFZ-67b_iBHM;m3 z+(@Vh4Bwk(-ab+bm^nPI?a{tuC)-*vUhnyyefwD_|JmOAZ{Ki2p)b?<#(H}%9JW7b=+K$= zRBYaDGT>;kI`cT@z?CN%t_mLP@Fj+oI_46ImhZyB#cL2H9tx8> z4a5glXC+EZnDb?%E8}5n4{Az0SY(y%&$~NAYU$o|9Yi+8Br(EHIMw5(5rK>QER!g+ z&^i^V@ra4sOI83;i6?!WXgCTSeVNjqEHE`LF-^s2ITlh! z`ZUMDjp5cLKBLCt%))jRd+ueRU5eQS$4~XT_7}b?=znFOKJOTI&z$|=d|^M&&=z}M z$u~Ha;`Y^B$tC`MvYawwOL&&71@WHFs5+|3Q zg9#_)bhveQlt)rEZwGWxsg%e~sT9{l5c$_gG)7?mq+T`LF($@%?xB?{49X z5&LJ // OpenNMT's tokenizer +#include + +namespace lm::tokenizer { + +class OpenNMTAdapter : public BPETokenizer { +public: + // Supported tokenization modes + enum class Mode { BPE, SENTENCEPIECE, WORDPIECE }; + + // Initialize with OpenNMT config file + explicit OpenNMTAdapter(const std::filesystem::path& config_path); + + // Tokenization with mode selection + std::vector encode(std::string_view text, + Mode mode = Mode::BPE, + const SamplingOptions& opts = {}) const override; + + // Conversion utilities + static std::vector to_opennmt_tokens(const std::vector& our_tokens); + static std::vector from_opennmt_tokens(const std::vector& opennmt_tokens); + + // Configuration + void set_mode(Mode mode) { mode_ = mode; } + +private: + mutable std::mutex mutex_; // Thread safety for OpenNMT's tokenizer + Mode mode_ = Mode::BPE; + std::unique_ptr opennmt_tokenizer_; + + // Internal implementations + std::vector encode_bpe(std::string_view text, const SamplingOptions& opts) const; + std::vector encode_sp(std::string_view text) const; + std::vector encode_wp(std::string_view text) const; +}; + +} // namespace lm::tokenizer diff --git a/tests/tokenizer/test_openmtadapter.cpp b/tests/tokenizer/test_openmtadapter.cpp new file mode 100644 index 0000000..068a57a --- /dev/null +++ b/tests/tokenizer/test_openmtadapter.cpp @@ -0,0 +1,12 @@ +#include "tokenizer/opennmt_adapter.hpp" +#include + +TEST(OpenNMTAdapter, ModeSwitch) { + OpenNMTAdapter tokenizer("config.json"); + + auto bpe_tokens = tokenizer.encode("hello", OpenNMTAdapter::Mode::BPE); + auto sp_tokens = tokenizer.encode("hello", OpenNMTAdapter::Mode::SENTENCEPIECE); + + EXPECT_NE(bpe_tokens, sp_tokens); // Different tokenization schemes +} +