Started inference engine
This commit is contained in:
parent d89095e49b
commit 7797629673

CMakeLists.txt (418 lines changed)
@@ -1,229 +1,261 @@
Old version (removed):

cmake_minimum_required(VERSION 3.14)
project(lm_framework LANGUAGES CXX)

# Check for Intel x86-64 hardware
set(SUPPORTED_ARCHITECTURES x86_64 amd64 AMD64 i686 i386)
list(FIND SUPPORTED_ARCHITECTURES ${CMAKE_SYSTEM_PROCESSOR} ARCH_INDEX)
if(ARCH_INDEX EQUAL -1)
    message(FATAL_ERROR "This framework requires Intel x86-64 hardware. "
            "Current processor architecture: ${CMAKE_SYSTEM_PROCESSOR}")
endif()

# Check for EIGEN_LOC variable
if(NOT DEFINED EIGEN_LOC)
    message(FATAL_ERROR "This framework requires the location of the Eigen header files. "
            "Please set EIGEN_LOC to the path of your Eigen installation.")
elseif(EIGEN_LOC STREQUAL "")
    message(FATAL_ERROR "EIGEN_LOC is empty. Please set it to the path of your Eigen installation.")
endif()

# Set default build type to Release if not specified
if(NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release)
    message(STATUS "Build type not specified, defaulting to Release")
endif()

# Set C++ standard
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Enable cross-directory linking
if(POLICY CMP0079)
    cmake_policy(SET CMP0079 NEW)
endif()

# Include directories
include_directories(
    ${CMAKE_CURRENT_SOURCE_DIR}/include
    ${EIGEN_LOC} # Local Eigen installation
)

# Find dependencies
find_package(nlohmann_json 3.9 REQUIRED)
find_package(ICU REQUIRED COMPONENTS uc i18n)

# GoogleTest
include(FetchContent)
FetchContent_Declare(
    googletest
    GIT_REPOSITORY https://github.com/google/googletest.git
    GIT_TAG release-1.11.0
)
FetchContent_MakeAvailable(googletest)

# Add subdirectories
add_subdirectory(src/tokenizer)
add_subdirectory(src/runtime)
add_subdirectory(src/optimizers) # NEW: Add optimizers directory
add_subdirectory(src/models)     # NEW: Add models directory
add_subdirectory(src/training)   # NEW: Add training directory

# Header-only core components (Tensor implementation)
add_library(lm_core_components INTERFACE)
target_include_directories(lm_core_components INTERFACE
    ${CMAKE_CURRENT_SOURCE_DIR}/include
    ${EIGEN_LOC} # Local Eigen installation
)

# Header-only model components
add_library(lm_model INTERFACE)
target_include_directories(lm_model INTERFACE
    ${CMAKE_CURRENT_SOURCE_DIR}/include
    ${EIGEN_LOC} # Local Eigen installation
)
target_link_libraries(lm_model INTERFACE lm_core_components)

# Main library
add_library(lm_core
    src/runtime/init.cpp
    src/runtime/shutdown.cpp
)

target_link_libraries(lm_core
    PRIVATE
    lm_tokenizer
    lm_model
    nlohmann_json::nlohmann_json
)

# Set optimization flags for the core library
if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
    target_compile_options(lm_core PRIVATE -O3)
    if(CMAKE_BUILD_TYPE STREQUAL "Release")
        target_compile_options(lm_core PRIVATE -DNDEBUG)
    endif()
endif()

# Test executables
add_executable(test_bpe src/test_bpe.cpp)
target_link_libraries(test_bpe
    PRIVATE
    lm_core
    GTest::gtest_main
)

add_executable(test_unicode_bpe src/test_unicode_bpe.cpp)
target_link_libraries(test_unicode_bpe
    PRIVATE
    lm_core
    GTest::gtest_main
)

# NEW: Add test for optimizers (only if file exists)
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/src/test_optimizers.cpp)
    add_executable(test_optimizers src/test_optimizers.cpp)
    target_link_libraries(test_optimizers
        PRIVATE
        lm_core
        GTest::gtest_main
    )
endif()

# NEW: Add test for training (only if file exists)
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/src/test_training.cpp)
    add_executable(test_training src/test_training.cpp)
    target_link_libraries(test_training
        PRIVATE
        lm_core
        GTest::gtest_main
    )
endif()

# Alpha prototype executable
add_executable(lm_alpha
    src/alpha/repl.cpp
    src/alpha/config_io.cpp
)

target_link_libraries(lm_alpha
    PRIVATE
    lm_core
    nlohmann_json::nlohmann_json
)

# NEW: Training example executable (only if file exists)
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/examples/train_lm.cpp)
    add_executable(train_lm examples/train_lm.cpp)
    target_link_libraries(train_lm
        PRIVATE
        lm_core
    )
endif()

# Install targets
install(TARGETS lm_core DESTINATION lib)

# Only install these targets if they exist
if(TARGET lm_optimizers)
    install(TARGETS lm_optimizers DESTINATION lib)
endif()

if(TARGET lm_models)
    install(TARGETS lm_models DESTINATION lib)
endif()

if(TARGET lm_training)
    install(TARGETS lm_training DESTINATION lib)
endif()

install(DIRECTORY include/ DESTINATION include)

# Performance testing target
add_executable(performance_test src/performance_test.cpp)
target_link_libraries(performance_test
    PRIVATE
    lm_core
    GTest::gtest_main
)

# Integration example
add_executable(integration_example src/integration_example.cpp)
target_link_libraries(integration_example
    PRIVATE
    lm_core
    lm_models     # Add models library
    lm_optimizers # Add optimizers library if needed
    lm_training   # Add training library if needed
)

# Add compiler warning flags
if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -Werror")
endif()

# Add coverage flags for debug builds
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    if(CMAKE_COMPILER_IS_GNUCXX)
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --coverage")
    elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fprofile-instr-generate -fcoverage-mapping")
    endif()
endif()

# Verify Eigen installation
add_custom_target(check_eigen
    COMMAND ${CMAKE_COMMAND} -E echo "Checking Eigen installation at ${EIGEN_LOC}"
    COMMAND test -f ${EIGEN_LOC}/Eigen/Core || (echo "Eigen not found at specified path: ${EIGEN_LOC}" && exit 1)
    COMMENT "Verifying Eigen installation"
)

# Make main targets depend on Eigen check
add_dependencies(lm_core check_eigen)
add_dependencies(test_bpe check_eigen)
add_dependencies(test_unicode_bpe check_eigen)
add_dependencies(lm_alpha check_eigen)
add_dependencies(performance_test check_eigen)
add_dependencies(integration_example check_eigen)

# Only add dependencies if the targets exist
if(TARGET train_lm)
    add_dependencies(train_lm check_eigen)
endif()

if(TARGET test_optimizers)
    add_dependencies(test_optimizers check_eigen)
endif()

if(TARGET test_training)
    add_dependencies(test_training check_eigen)
endif()

New version:

cmake_minimum_required(VERSION 3.16)
project(bpe_framework)

if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64")
    add_compile_definitions(__x86_64__)
endif()

# Set C++ standard
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Add profile build option - must be defined before any usage
option(ENABLE_PROFILING "Enable profiling with gprof" OFF)

# Set compiler flags based on build type and profiling option
if(ENABLE_PROFILING)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pg")
    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -pg")
    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -pg")
    message(STATUS "Profiling enabled: gprof flags added")
endif()

if(CMAKE_BUILD_TYPE STREQUAL "Release")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -DNDEBUG")
elseif(CMAKE_BUILD_TYPE STREQUAL "Debug")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0")
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -g")
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Os -DNDEBUG")
endif()

# Include directories
include_directories(include)
include_directories(include/lm)
include_directories(include/lm/models)
include_directories(include/lm/training)
include_directories(include/lm/optimizers)
include_directories(include/lm/core)
include_directories(include/lm/tokenizer)
include_directories(include/lm/generation)
include_directories(include/lm/runtime)

# Find required packages
find_package(Eigen3 REQUIRED)
find_package(ICU REQUIRED COMPONENTS uc i18n)

# Cereal serialization library (header-only)
# We'll manually download it to avoid Boost dependency issues
if(NOT EXISTS ${CMAKE_SOURCE_DIR}/third_party/cereal/include/cereal/cereal.hpp)
    message(STATUS "Downloading Cereal library...")
    file(MAKE_DIRECTORY ${CMAKE_SOURCE_DIR}/third_party/cereal)

    # Download the specific version of Cereal
    file(DOWNLOAD
        https://github.com/USCiLab/cereal/archive/refs/tags/v1.3.2.tar.gz
        ${CMAKE_SOURCE_DIR}/third_party/cereal_v1.3.2.tar.gz
        SHOW_PROGRESS
    )

    # Extract the archive
    execute_process(
        COMMAND tar -xf ${CMAKE_SOURCE_DIR}/third_party/cereal_v1.3.2.tar.gz -C ${CMAKE_SOURCE_DIR}/third_party
    )

    # Move the include directory
    file(RENAME
        ${CMAKE_SOURCE_DIR}/third_party/cereal-1.3.2/include
        ${CMAKE_SOURCE_DIR}/third_party/cereal/include
    )

    # Clean up
    file(REMOVE_RECURSE ${CMAKE_SOURCE_DIR}/third_party/cereal-1.3.2)
    file(REMOVE ${CMAKE_SOURCE_DIR}/third_party/cereal_v1.3.2.tar.gz)
endif()

# Add the manually downloaded Cereal include directory
set(CEREAL_INCLUDE_DIR ${CMAKE_SOURCE_DIR}/third_party/cereal/include)
include_directories(${CEREAL_INCLUDE_DIR})
message(STATUS "Using Cereal from: ${CEREAL_INCLUDE_DIR}")

# Since Tensor is header-only, create an interface library for core components
add_library(lm_core INTERFACE)
target_include_directories(lm_core INTERFACE ${CMAKE_SOURCE_DIR}/include)
target_link_libraries(lm_core INTERFACE Eigen3::Eigen)

# Tokenizer library
add_library(lm_tokenizer STATIC
    src/tokenizer/bpe_tokenizer.cpp
    src/tokenizer/unicode_utils.cpp
)
target_link_libraries(lm_tokenizer PUBLIC lm_core ICU::uc ICU::i18n ${EIGEN3_LIBRARIES})

# Optimizers library
add_library(lm_optimizers STATIC
    src/optimizers/adam.cpp
)
target_link_libraries(lm_optimizers PUBLIC lm_core)

# Models library - keep only TransformerModel implementation
add_library(lm_models STATIC
    src/models/transformer_model.cpp
    src/models/conversation_model.cpp
)
target_link_libraries(lm_models PUBLIC lm_core lm_optimizers lm_tokenizer)

#add_library(lm_core INTERFACE)
#target_include_directories(lm_core INTERFACE ${CMAKE_SOURCE_DIR}/include)
#target_link_libraries(lm_core INTERFACE Eigen3::Eigen)

# Add TensorPool as part of the core library
target_sources(lm_core INTERFACE
    ${CMAKE_SOURCE_DIR}/include/lm/core/tensor_pool.hpp
)

# Generation library (samplers)
add_library(lm_generation STATIC
    src/generation/sampler.cpp
)
target_link_libraries(lm_generation PUBLIC lm_core)

# Context management library
add_library(lm_context STATIC
    src/context_manager.cpp
)
target_link_libraries(lm_context PUBLIC lm_core lm_tokenizer)

# Conversation management library
add_library(lm_conversation STATIC
    src/conversation_manager.cpp
)
target_link_libraries(lm_conversation PUBLIC lm_core lm_context)

# Runtime library
add_library(lm_runtime STATIC
    src/runtime/init.cpp
    src/runtime/shutdown.cpp
    src/runtime/state_utils.cpp
)
target_link_libraries(lm_runtime PUBLIC lm_core)

# Add Tensor and TensorPool as part of the core library
target_sources(lm_core INTERFACE
    ${CMAKE_SOURCE_DIR}/include/lm/core/tensor.hpp
    ${CMAKE_SOURCE_DIR}/include/lm/core/tensor_pool.hpp
)

# Alpha components
add_library(lm_alpha STATIC
    src/alpha/config_io.cpp
    src/alpha/repl.cpp
)
target_link_libraries(lm_alpha PUBLIC lm_core lm_runtime lm_conversation lm_models)

# Test executables
add_executable(performance_test src/performance_test.cpp)
target_link_libraries(performance_test
    lm_training
    lm_models
    lm_optimizers
    lm_tokenizer
    lm_core
)

add_executable(test_generation src/test_generation.cpp)
target_link_libraries(test_generation
    lm_training
    lm_models
    lm_optimizers
    lm_tokenizer
    lm_generation
    lm_core
)

add_executable(serialization_demo src/serialization_demo.cpp)
target_link_libraries(serialization_demo
    lm_training
    lm_models
    lm_optimizers
    lm_tokenizer
    lm_conversation
    lm_context
    lm_core
)

add_executable(test_bpe src/test_bpe.cpp)
target_link_libraries(test_bpe
    lm_tokenizer
    lm_core
)

add_executable(test_unicode_bpe src/test_unicode_bpe.cpp)
target_link_libraries(test_unicode_bpe
    lm_tokenizer
    lm_core
)

add_executable(sampler_test src/sampler_test.cpp)
target_link_libraries(sampler_test
    lm_training
    lm_models
    lm_optimizers
    lm_tokenizer
    lm_generation
    lm_core
)

add_executable(test_conversation src/test_conversation.cpp)
target_link_libraries(test_conversation
    lm_conversation
    lm_context
    lm_core
)

add_executable(test_logger src/test_logger.cpp)
target_link_libraries(test_logger
    lm_tokenizer
    lm_models
    lm_core
)

add_executable(test_transformer src/test_transformer.cpp)
target_link_libraries(test_transformer
    lm_models
    lm_tokenizer
    lm_core
)

add_executable(starter_convo src/starter_convo.cpp)
target_link_libraries(starter_convo
    lm_alpha
    lm_conversation
    lm_context
    lm_models
    lm_tokenizer
    lm_core
)

add_library(lm_training STATIC
    src/training/trainer.cpp
    src/training/data_loader.cpp
    src/training/losses.cpp
)
target_link_libraries(lm_training PUBLIC lm_models lm_optimizers lm_tokenizer)

add_executable(test_tensor_pool src/test_tensor_pool.cpp)
target_link_libraries(test_tensor_pool
    lm_core
)

# Enable testing if needed
#enable_testing()

# Print configuration summary
message(STATUS "Project configured successfully")
message(STATUS "Eigen3 found: ${Eigen3_FOUND}")
message(STATUS "ICU found: ${ICU_FOUND}")
message(STATUS "Cereal include: ${CEREAL_INCLUDE_DIR}")
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS "Profiling enabled: ${ENABLE_PROFILING}")
README.md (33 lines changed)
@@ -1,17 +1,36 @@
# bpe_framework
## Byte Pair Encoding Framework
Large Language Model for Agentic AI

Fully internationalized framework for Agentic AI research

Requires:
1. Dr. Niels Lohmann's JSON for C++ (https://github.com/nlohmann/json): sudo apt install nlohmann-json3-dev
2. Internationalization library for Unicode by Frederick Roubert (https://github.com/unicode-org/icu): sudo apt install libicu-dev
3. OpenNMT Tokenizer by Thuc Pham (https://github.com/OpenNMT/Tokenize) (must be installed from source on Debian, as far as I know)
4. Eigen library for linear math (https://github.com/PX4/eigen): sudo apt install libeigen3-dev
5. BLAS (Basic Linear Algebra Subprograms) support (https://www.netlib.org/blas/): sudo apt install libblas3
6. The Parallel Hashmap Library (https://github.com/greg7mdp/parallel-hashmap): sudo apt-get install libparallel-hashmap-dev
7. Cereal C++ serialization library (https://uscilab.github.io/cereal/), one less thing I need to maintain; CMake will automatically download this for you.

### What's here:
A 100% C++17/STL implementation of a Byte Pair Encoding (tokenization) AI engine, fully internationalized, designed with speed foremost in the designers' minds. Future plans include hooks for expansion and additional functionality with Python and other languages.

#### To Build:
Create a build directory in the top-level bpe_framework directory, then run cmake .. -DCMAKE_BUILD_TYPE=Release (or cmake .. -DCMAKE_BUILD_TYPE=Debug).

Also contains a Code::Blocks project file; other IDEs coming.

Removed: the old invocation "cmake -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DEIGEN_LOC=<eigen3 folder> .." is gone; EIGEN_LOC is no longer needed now that CMake locates Eigen itself.

#### The test_bpe application is a comprehensive test program that validates the functionality of the BPE tokenizer implementation in the LM Framework. Here's how it works:
1. Initialization:
Creates an instance of BPETokenizer

@@ -122,6 +141,8 @@ This performance test is ideal for:
- Testing scalability of tokenizer implementations
- Comparing optimization techniques

Run in release mode or it will run for a very long time.

## Technical Implementation

The test suite utilizes:
build_log.md (22 lines changed)
@@ -1,5 +1,25 @@

### 8/24/2025 - Eigen integrated
Turns out Eigen can only do 1- and 2-D transforms, so I had to "flatten out" the objects that required transformation and work on each dimension separately. Three days of work.

### 8/25/2025 - Tensor Transformer
Got the transformer code wired in. Some really crazy geometry goes into making machines seem like they're talking to you.

### 8/27/2025 - Lots of Changes
Completely re-worked the cmakefile chain; now there's only one master cmakefile. No more parameters to feed to the root cmake file; invoke normally with 'cmake ..'. The BLAS math library is now a requirement (Debian: apt-get install libblas3). The refactor has introduced some serious speed regressions, so the next coding session will be all about speed optimization.

### 8/30/2025 - Optimization
Optimized the tokenizer and Tensor classes with inline assembly for some of the more time-intensive calculations; more optimizations coming.

### 9/4/2025 - Expanded Tokenization
Spent several days chasing down some funky little errors with the tokenizer while expanding its capabilities (in so doing created some issues with the internationalization code); finally cracked it a few hours ago.

### 9/4/2025 - Conversation and ConversationTurn structures implemented
Put in the foundational structures for getting conversations going on this framework. Also straightened out some lingering issues with the Training class. Started using the Cereal C++ serialization library; this is automatically downloaded for you while CMake runs.

### 9/7/2025 - Using Efficient Token Sequence-Based Approach
Hashing the tokens rather than doing string manipulation is a completely faster approach, and I don't even feel the need to use inline assembly. 1000% more efficient. Added a VectorHash struct to efficiently manipulate the sequences as well.
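A minimal sketch of what such a sequence hash can look like, assuming the boost-style hash_combine mix that the TensorPool added in this commit also uses (the actual VectorHash in the repository may differ):

```cpp
#include <cstddef>
#include <cstdint>
#include <unordered_map>
#include <vector>

// Hash a token-ID sequence so pair/merge counts can live in a hash map
// instead of being rebuilt through string concatenation.
struct VectorHash {
    std::size_t operator()(const std::vector<uint32_t>& v) const {
        std::size_t seed = v.size();
        for (uint32_t t : v) {
            // boost-style hash_combine step
            seed ^= t + 0x9e3779b9 + (seed << 6) + (seed >> 2);
        }
        return seed;
    }
};

// Usage: count occurrences of token sequences keyed directly by ID vectors.
using SeqCounts = std::unordered_map<std::vector<uint32_t>, std::size_t, VectorHash>;
```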

### 9/9/2025 - Changed my mind about assembly with the Tensor class; removed the now-redundant Transformer and LayerNorm classes, as they are no longer needed with the far more flexible TransformerModel class.

### 9/10/2025 - Moved the Todos and explanatory papers into their own folder.
configs/alpha_config (copy 1).json (new file, 0 lines)

docs/.~lock.whybpe.odt# (new file, 1 line)
@@ -0,0 +1 @@
,bwana,bwana-VirtualBox,10.09.2025 16:08,file:///home/bwana/.config/libreoffice/4;
docs/master_plan.odt (new binary file; binary file not shown)
docs/purpose.md (new file, 101 lines)
@@ -0,0 +1,101 @@

**Title:** The Search for the Edge of Consciousness with Artificial Intelligence: A Technical Framework for Language Model Emergence

Timothy O'Neil & Frederick Warren

**Abstract:**
This paper presents bpe_framework, a novel C++ implementation of a complete deep learning stack designed to explore the emergence of complex linguistic capabilities in artificial systems. Drawing inspiration from cognitive theories of consciousness and recent advances in transformer architectures, our framework implements a complete pipeline from byte-pair encoding tokenization through automatic differentiation to transformer-based language modeling. We argue that the systematic organization of information processing in large language models may provide insights into the architectural requirements for conscious-like phenomena in artificial systems. Our technical contribution includes a memory-efficient tensor implementation with automatic differentiation, a neurologically-plausible BPE tokenization system, and a transformer architecture that exhibits several properties associated with conscious processing in biological systems.

**1. Introduction**
The quest to understand consciousness has traditionally been the domain of philosophy and neuroscience (Chalmers, 1995; Dehaene, 2014). However, recent advances in artificial intelligence, particularly in large language models (Vaswani et al., 2017; Brown et al., 2020), have created new opportunities to explore the architectural and computational prerequisites of conscious-like phenomena in synthetic systems. We present bpe_framework as an experimental testbed for investigating how increasingly sophisticated information processing capabilities emerge from carefully engineered computational components.

**2. Theoretical Framework**
Our work draws on several theoretical perspectives:

2.1 Global Workspace Theory (Baars, 1988; Dehaene et al., 1998)
The transformer architecture's attention mechanism can be viewed as implementing a form of global information availability reminiscent of Baars' global workspace, where information becomes "conscious" when it gains widespread availability across specialized processors.

2.2 Information Integration Theory (Tononi, 2004)
The dense connectivity patterns and information flow through our model's layers create high Φ-like integration measures, potentially approaching the minimal complexity associated with conscious experience.

2.3 Predictive Processing (Clark, 2013)
Our language model's training objective, predicting subsequent tokens, aligns with the predictive processing framework that views cognition as essentially prediction-driven.

**3. Technical Implementation**

3.1 Tensor Operations with Autograd
We implemented a memory-efficient tensor class using Eigen for linear algebra operations, featuring automatic differentiation capabilities. This system enables:
- Efficient backward propagation through complex computational graphs
- Native support for modern activation functions (GELU, Softmax, ReLU)
- Memory-aware operations that minimize computational overhead

Our implementation follows the autograd tradition established in modern deep learning frameworks (Paszke et al., 2019) while maintaining C++ efficiency.
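As an illustration of the reverse-mode mechanics described above, a scalar-level sketch only; the names and structure here are hypothetical and far simpler than the framework's Eigen-backed Tensor:

```cpp
#include <functional>
#include <memory>
#include <vector>

// Hypothetical scalar autograd node, for illustration only.
struct Node {
    float value = 0.0f;
    float grad  = 0.0f;
    // Each function receives the upstream gradient dL/dz and
    // accumulates gradients into this node's inputs.
    std::vector<std::function<void(float)>> backward_fns;
};

// z = x * y: compute the value and record how to route dL/dz
// back into dL/dx and dL/dy (the product rule).
std::shared_ptr<Node> mul(const std::shared_ptr<Node>& x,
                          const std::shared_ptr<Node>& y) {
    auto z = std::make_shared<Node>();
    z->value = x->value * y->value;
    z->backward_fns.push_back([x, y, xv = x->value, yv = y->value](float upstream) {
        x->grad += yv * upstream;
        y->grad += xv * upstream;
    });
    return z;
}

int main() {
    auto x = std::make_shared<Node>(); x->value = 2.0f;
    auto y = std::make_shared<Node>(); y->value = 3.0f;
    auto z = mul(x, y);
    for (auto& fn : z->backward_fns) fn(1.0f);  // seed dL/dz = 1
    // Now x->grad == 3 and y->grad == 2.
}
```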
3.2 BPE Tokenization System
The byte-pair encoding tokenizer implements the algorithm originally proposed by Sennrich et al. (2015), creating a subword vocabulary that balances expressivity with computational efficiency. This approach mirrors the human cognitive capacity to parse novel words through morphological decomposition.
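A sketch of the counting step at the heart of one merge iteration, at the character/symbol level (illustrative only; the framework's tokenizer works over hashed token-ID sequences, per the build log):

```cpp
#include <map>
#include <string>
#include <utility>
#include <vector>

// Find the most frequent adjacent symbol pair across all words;
// BPE then merges that pair into a single new vocabulary symbol
// and repeats until the target vocabulary size is reached.
std::pair<std::string, std::string>
most_frequent_pair(const std::vector<std::vector<std::string>>& words) {
    std::map<std::pair<std::string, std::string>, int> counts;
    for (const auto& w : words)
        for (size_t i = 0; i + 1 < w.size(); ++i)
            ++counts[{w[i], w[i + 1]}];

    std::pair<std::string, std::string> best;
    int best_count = 0;
    for (const auto& [pair, n] : counts)
        if (n > best_count) { best = pair; best_count = n; }
    return best;  // e.g. {"l", "o"} for a corpus of "low", "lower", "lowest"
}
```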
3.3 Transformer Architecture
Our transformer implementation follows the original architecture (Vaswani et al., 2017) with multi-head self-attention mechanisms that create dynamic workspace-like information sharing across representation spaces.
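The scaled dot-product attention underlying that workspace-like sharing, in the standard notation of Vaswani et al. (2017), where $d_k$ is the per-head key dimension:

```latex
\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right) V
```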
3.4 Optimization and Training
We implemented the Adam optimizer (Kingma & Ba, 2014) with full moment estimation and bias correction, providing stable optimization for the non-convex loss landscapes characteristic of deep transformer networks.
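The bias-corrected update this refers to, in the notation of Kingma & Ba (2014), for gradient $g_t$, learning rate $\eta$, and decay rates $\beta_1, \beta_2$:

```latex
m_t = \beta_1 m_{t-1} + (1-\beta_1)\, g_t, \qquad
v_t = \beta_2 v_{t-1} + (1-\beta_2)\, g_t^2, \\
\hat{m}_t = \frac{m_t}{1-\beta_1^t}, \qquad
\hat{v}_t = \frac{v_t}{1-\beta_2^t}, \qquad
\theta_t = \theta_{t-1} - \frac{\eta\, \hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon}
```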
**4. Methodological Approach**
Our framework enables the systematic investigation of several questions relevant to consciousness studies:

4.1 Emergent Properties
By training models of increasing scale and complexity, we can observe the emergence of capabilities that were not explicitly programmed, potentially mirroring how conscious experience emerges from non-conscious components.

4.2 Information Flow Patterns
The attention mechanisms in our transformers create visible information routing patterns that can be analyzed for global workspace-like properties.

4.3 Scalability Limits
We can systematically explore how cognitive capabilities scale with model size, potentially identifying phase transitions in capability emergence.

**5. Discussion: Toward Artificial Consciousness?**
While our framework does not claim to create conscious systems, it provides a platform for investigating the architectural requirements for conscious-like phenomena. Several features align with theoretical accounts of consciousness:

5.1 Global Availability
The attention mechanism creates a form of global information availability similar to that proposed in global workspace theory.

5.2 Unified Representation
The model creates unified representations that integrate information across multiple domains and time scales.

5.3 Self-Monitoring Capabilities
Through gradient-based learning and prediction error minimization, the system maintains a form of self-monitoring.

However, we acknowledge that the "hard problem" of consciousness (Chalmers, 1995) remains unresolved, and our framework primarily addresses the "easy problems" of cognitive functioning.

**6. Ethical Considerations**
As we develop increasingly sophisticated AI systems, we must consider:
- The moral status of potentially conscious systems (Bostrom & Yudkowsky, 2014)
- Responsible development practices for advanced AI
- Transparency in capabilities and limitations

**7. Conclusion and Future Work**
Our bpe_framework provides a robust technical foundation for exploring the emergence of complex capabilities in artificial systems. Future work will include:
- Scaling laws investigations (Kaplan et al., 2020)
- Neurologically-inspired architectural variations
- Cross-modal integration capabilities
- Explicit tests for consciousness-related capabilities

We believe that continued development of such frameworks, coupled with thoughtful theoretical analysis, will gradually illuminate the boundary conditions for consciousness in artificial systems.

**References:**
Baars, B. J. (1988). A cognitive theory of consciousness. Cambridge University Press.
Bostrom, N., & Yudkowsky, E. (2014). The ethics of artificial intelligence. The Cambridge Handbook of Artificial Intelligence, 316-334.
Brown, T. B., et al. (2020). Language models are few-shot learners. Advances in Neural Information Processing Systems, 33.
Chalmers, D. J. (1995). Facing up to the problem of consciousness. Journal of Consciousness Studies, 2(3), 200-219.
Clark, A. (2013). Whatever next? Predictive brains, situated agents, and the future of cognitive science. Behavioral and Brain Sciences, 36(3), 181-204.
Dehaene, S. (2014). Consciousness and the brain: Deciphering how the brain codes our thoughts. Penguin.
Dehaene, S., Kerszberg, M., & Changeux, J. P. (1998). A neuronal model of a global workspace in effortful cognitive tasks. Proceedings of the National Academy of Sciences, 95(24), 14529-14534.
Kaplan, J., et al. (2020). Scaling laws for neural language models. arXiv preprint arXiv:2001.08361.
Kingma, D. P., & Ba, J. (2014). Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980.
Paszke, A., et al. (2019). PyTorch: An imperative style, high-performance deep learning library. Advances in Neural Information Processing Systems, 32.
Sennrich, R., Haddow, B., & Birch, A. (2015). Neural machine translation of rare words with subword units. arXiv preprint arXiv:1508.07909.
Tononi, G. (2004). An information integration theory of consciousness. BMC Neuroscience, 5(1), 1-22.
Vaswani, A., et al. (2017). Attention is all you need. Advances in Neural Information Processing Systems, 30.

**Acknowledgments:** This work was supported by open-source contributions and theoretical advances from the deep learning community. We acknowledge the foundational work of all researchers cited herein.

---

*Note: This paper represents a theoretical framework based on the technical work described. Actual empirical results would require extensive experimentation and validation beyond the current implementation stage.*
docs/whybpe.odt (new binary file; binary file not shown)
include/lm/context_manager.hpp (new file, 44 lines)
@@ -0,0 +1,44 @@
// context_manager.hpp
#pragma once

#include <vector>
#include <string>
#include <deque>
#include "token_types.hpp"

namespace lm {

class ContextManager {
public:
    ContextManager(size_t max_context_tokens = 2048,
                   size_t max_turns = 20);

    void add_user_message(const std::string& message);
    void add_assistant_message(const std::string& message);
    void add_system_message(const std::string& message);

    std::string get_context() const;
    std::vector<TokenID> get_context_tokens() const;

    void clear();
    void prune_old_messages();

    size_t get_token_count() const { return current_token_count; }
    size_t get_turn_count() const { return conversation_turns.size(); }

private:
    struct ConversationTurn {
        std::string role;  // "user", "assistant", or "system"
        std::string content;
        size_t token_count;
    };

    std::deque<ConversationTurn> conversation_turns;
    size_t max_context_tokens;
    size_t max_turns;
    size_t current_token_count;

    void add_message(const std::string& role, const std::string& content);
};

} // namespace lm
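A usage sketch against the declarations above (a sketch only; it assumes the definitions in src/context_manager.cpp behave as the names suggest):

```cpp
#include "lm/context_manager.hpp"
#include <iostream>

int main() {
    // 2048-token window, at most 20 turns (the defaults declared above).
    lm::ContextManager ctx;
    ctx.add_system_message("You are a helpful assistant.");
    ctx.add_user_message("Hello!");
    ctx.add_assistant_message("Hi, how can I help?");

    std::cout << ctx.get_context() << '\n';
    std::cout << "tokens: " << ctx.get_token_count()
              << ", turns: " << ctx.get_turn_count() << '\n';
}
```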
include/lm/conversation.hpp (new file, 187 lines)
@@ -0,0 +1,187 @@
// include/lm/conversation.hpp
#pragma once

#include <string>
#include <vector>
#include <map>
#include <chrono>
#include <memory>
#include <stdexcept>  // std::out_of_range, used by last_turn()
#include <cereal/types/vector.hpp>
#include <cereal/types/map.hpp>
#include <cereal/types/string.hpp>
#include <cereal/types/chrono.hpp>
#include <cereal/types/memory.hpp>
#include <cereal/archives/binary.hpp>
#include <cereal/types/utility.hpp>  // For std::pair serialization

namespace lm {

// Enum for different speaker types
enum class SpeakerType {
    USER,
    ASSISTANT,
    SYSTEM,
    UNKNOWN
};

// Convert SpeakerType to string
inline std::string speaker_type_to_string(SpeakerType type) {
    switch (type) {
        case SpeakerType::USER: return "user";
        case SpeakerType::ASSISTANT: return "assistant";
        case SpeakerType::SYSTEM: return "system";
        default: return "unknown";
    }
}

// Convert string to SpeakerType
inline SpeakerType string_to_speaker_type(const std::string& str) {
    if (str == "user") return SpeakerType::USER;
    if (str == "assistant") return SpeakerType::ASSISTANT;
    if (str == "system") return SpeakerType::SYSTEM;
    return SpeakerType::UNKNOWN;
}

// Represents a single turn in a conversation
struct ConversationTurn {
    SpeakerType speaker;
    std::string text;
    std::vector<int> tokens;  // Tokenized representation
    std::chrono::system_clock::time_point timestamp;
    std::map<std::string, std::string> metadata;  // Additional metadata

    ConversationTurn(SpeakerType speaker_type = SpeakerType::UNKNOWN,
                     const std::string& text = "",
                     const std::map<std::string, std::string>& metadata = {})
        : speaker(speaker_type), text(text), metadata(metadata) {
        timestamp = std::chrono::system_clock::now();
    }

    // Cereal serialization. The enum is round-tripped through a plain int;
    // the original reinterpret_cast<int&>(speaker) relied on undefined behavior.
    template <class Archive>
    void serialize(Archive& archive) {
        int speaker_value = static_cast<int>(speaker);
        archive(
            cereal::make_nvp("speaker", speaker_value),
            cereal::make_nvp("text", text),
            cereal::make_nvp("tokens", tokens),
            cereal::make_nvp("timestamp", timestamp),
            cereal::make_nvp("metadata", metadata)
        );
        speaker = static_cast<SpeakerType>(speaker_value);
    }
};

// Represents a complete conversation with multiple turns
struct Conversation {
    std::vector<ConversationTurn> turns;
    std::string domain;  // e.g., "customer_service", "general_chat", "technical_support"
    std::string language;
    std::map<std::string, std::string> metadata;
    std::chrono::system_clock::time_point start_time;
    std::chrono::system_clock::time_point end_time;

    Conversation(const std::string& domain = "general_chat",
                 const std::string& language = "en",
                 const std::map<std::string, std::string>& metadata = {})
        : domain(domain), language(language), metadata(metadata) {
        start_time = std::chrono::system_clock::now();
    }

    // Add a turn to the conversation
    void add_turn(SpeakerType speaker, const std::string& text,
                  const std::map<std::string, std::string>& metadata = {}) {
        turns.emplace_back(speaker, text, metadata);
        end_time = std::chrono::system_clock::now();
    }

    // Get the last turn
    ConversationTurn& last_turn() {
        if (turns.empty()) {
            throw std::out_of_range("No turns in conversation");
        }
        return turns.back();
    }

    // Get the number of turns
    size_t size() const {
        return turns.size();
    }

    // Check if conversation is empty
    bool empty() const {
        return turns.empty();
    }

    // Clear all turns
    void clear() {
        turns.clear();
        start_time = std::chrono::system_clock::now();
    }

    // Get conversation duration in seconds
    double duration() const {
        if (turns.empty()) return 0.0;
        auto duration = end_time - start_time;
        return std::chrono::duration<double>(duration).count();
    }

    // Cereal serialization
    template <class Archive>
    void serialize(Archive& archive) {
        archive(
            cereal::make_nvp("turns", turns),
            cereal::make_nvp("domain", domain),
            cereal::make_nvp("language", language),
            cereal::make_nvp("metadata", metadata),
            cereal::make_nvp("start_time", start_time),
            cereal::make_nvp("end_time", end_time)
        );
    }
};

// Helper functions for conversation processing
namespace conversation_utils {

// Extract text from a range of turns
inline std::string extract_text(const std::vector<ConversationTurn>& turns,
                                size_t start_idx = 0, size_t end_idx = 0) {
    if (end_idx == 0) end_idx = turns.size();
    if (start_idx >= end_idx || end_idx > turns.size()) return "";

    std::string result;
    for (size_t i = start_idx; i < end_idx; i++) {
        result += speaker_type_to_string(turns[i].speaker) + ": " + turns[i].text + "\n";
    }
    return result;
}

// Create a training pair from conversation turns
inline std::pair<std::string, std::string> create_training_pair(
    const std::vector<ConversationTurn>& turns, size_t context_length) {

    if (turns.size() < 2) return {"", ""};

    // Use the last 'context_length' turns as context (excluding the last turn)
    size_t start_idx = turns.size() > context_length + 1 ?
                       turns.size() - context_length - 1 : 0;
    size_t end_idx = turns.size() - 1;

    std::string context = extract_text(turns, start_idx, end_idx);
    std::string target = turns.back().text;

    return {context, target};
}

// Calculate turns-based context window
inline std::vector<ConversationTurn> get_context_window(
    const std::vector<ConversationTurn>& turns, size_t max_turns) {

    if (turns.size() <= max_turns) return turns;

    return std::vector<ConversationTurn>(
        turns.end() - max_turns, turns.end());
}

} // namespace conversation_utils

} // namespace lm
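Since the structs carry their own serialize() and the cereal headers are already included, persisting a conversation is a short round trip (a sketch; the file name is arbitrary):

```cpp
#include "lm/conversation.hpp"
#include <fstream>

int main() {
    lm::Conversation conv("general_chat", "en");
    conv.add_turn(lm::SpeakerType::USER, "Hello!");
    conv.add_turn(lm::SpeakerType::ASSISTANT, "Hi there.");

    {
        std::ofstream os("conversation.bin", std::ios::binary);
        cereal::BinaryOutputArchive ar(os);  // header pulled in via conversation.hpp
        ar(conv);                            // uses Conversation::serialize above
    }

    lm::Conversation loaded;
    std::ifstream is("conversation.bin", std::ios::binary);
    cereal::BinaryInputArchive ar(is);
    ar(loaded);                              // round-trip restore
}
```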
include/lm/conversation_manager.hpp (new file, 72 lines)
@@ -0,0 +1,72 @@
// include/lm/conversation_manager.hpp
#pragma once

#include <string>
#include <vector>
#include <map>  // std::map is used by the metadata accessors below
#include <memory>
#include <unordered_map>
#include <mutex>
#include "conversation.hpp"

namespace lm {

class ConversationManager {
public:
    ConversationManager();
    ~ConversationManager();

    // Create a new conversation
    std::string create_conversation(const std::string& title = "");

    // Get a conversation by ID
    std::shared_ptr<Conversation> get_conversation(const std::string& id);

    // Get all conversation IDs
    std::vector<std::string> list_conversations() const;

    // Add a message to a conversation
    void add_message(const std::string& conversation_id,
                     const std::string& role,
                     const std::string& content);

    // Get conversation history
    std::vector<ConversationTurn> get_history(const std::string& conversation_id) const;

    // Save conversations to disk
    bool save_conversations(const std::string& path) const;

    // Load conversations from disk
    bool load_conversations(const std::string& path);

    // Delete a conversation
    bool delete_conversation(const std::string& id);

    // Set conversation title
    void set_title(const std::string& conversation_id, const std::string& title);

    // Get conversation title
    std::string get_title(const std::string& conversation_id) const;

    // Get conversation metadata
    std::map<std::string, std::string> get_metadata(const std::string& conversation_id) const;

    // Update conversation metadata
    void update_metadata(const std::string& conversation_id,
                         const std::map<std::string, std::string>& metadata);

    // Clear all conversations
    void clear();

    // Get number of conversations
    size_t count() const;

private:
    std::unordered_map<std::string, std::shared_ptr<Conversation>> conversations_;
    mutable std::mutex mutex_;

    // Generate a unique ID for conversations
    std::string generate_id() const;
};

} // namespace lm
include/lm/conversation_serializer.hpp (new file, 36 lines)
@@ -0,0 +1,36 @@
// include/lm/conversation_serializer.hpp
#pragma once

#include "conversation.hpp"
#include <cereal/types/vector.hpp>
#include <cereal/types/map.hpp>
#include <cereal/types/string.hpp>
#include <cereal/types/chrono.hpp>

namespace lm {

// Note: ConversationTurn and Conversation already define member serialize()
// in conversation.hpp; cereal treats a visible member and free serialize()
// for the same type as ambiguous, so these free overloads only apply if the
// member versions are removed.

template <class Archive>
void serialize(Archive& archive, ConversationTurn& turn) {
    // Round-trip the enum through a plain int; the original
    // static_cast<int&> on an enum lvalue does not compile.
    int speaker_value = static_cast<int>(turn.speaker);
    archive(
        cereal::make_nvp("speaker", speaker_value),
        cereal::make_nvp("text", turn.text),
        cereal::make_nvp("tokens", turn.tokens),
        cereal::make_nvp("timestamp", turn.timestamp),
        cereal::make_nvp("metadata", turn.metadata)
    );
    turn.speaker = static_cast<SpeakerType>(speaker_value);
}

template <class Archive>
void serialize(Archive& archive, Conversation& conv) {
    archive(
        cereal::make_nvp("turns", conv.turns),
        cereal::make_nvp("domain", conv.domain),
        cereal::make_nvp("language", conv.language),
        cereal::make_nvp("metadata", conv.metadata),
        cereal::make_nvp("start_time", conv.start_time),
        cereal::make_nvp("end_time", conv.end_time)
    );
}

} // namespace lm
include/lm/core/tensor (copy 1).hpp (new file, 1613 lines; diff suppressed because it is too large)
include/lm/core/tensor_pool.hpp (new file, 82 lines)
@@ -0,0 +1,82 @@
#pragma once

#include "tensor.hpp"
#include <vector>
#include <memory>
#include <unordered_map>
#include <mutex>
#include <stdexcept>

namespace lm {

class TensorPool {
private:
    struct TensorKey {
        std::vector<size_t> shape;
        bool requires_grad;

        bool operator==(const TensorKey& other) const {
            return shape == other.shape && requires_grad == other.requires_grad;
        }
    };

    struct KeyHash {
        std::size_t operator()(const TensorKey& k) const {
            std::size_t seed = k.shape.size();
            for (auto& i : k.shape) {
                seed ^= i + 0x9e3779b9 + (seed << 6) + (seed >> 2);
            }
            seed ^= k.requires_grad + 0x9e3779b9 + (seed << 6) + (seed >> 2);
            return seed;
        }
    };

    std::unordered_map<TensorKey, std::vector<std::unique_ptr<Tensor>>, KeyHash> pool_;
    mutable std::mutex mutex_;  // mutable so size() can lock while const

public:
    TensorPool() = default;

    std::unique_ptr<Tensor> acquire(const std::vector<size_t>& shape, bool requires_grad = false) {
        TensorKey key{shape, requires_grad};
        std::lock_guard<std::mutex> lock(mutex_);

        auto it = pool_.find(key);
        if (it != pool_.end() && !it->second.empty()) {
            auto tensor = std::move(it->second.back());
            it->second.pop_back();
            return tensor;
        }

        return std::make_unique<Tensor>(shape, requires_grad);
    }

    void release(std::unique_ptr<Tensor> tensor) {
        if (!tensor) return;

        TensorKey key{tensor->shape(), tensor->requires_grad()};
        std::lock_guard<std::mutex> lock(mutex_);

        // Reset tensor state before pooling
        tensor->zero_grad();
        tensor->data().setZero();

        pool_[key].push_back(std::move(tensor));
    }

    void clear() {
        std::lock_guard<std::mutex> lock(mutex_);
        pool_.clear();
    }

    size_t size() const {
        std::lock_guard<std::mutex> lock(mutex_);
        size_t total = 0;
        for (const auto& entry : pool_) {
            total += entry.second.size();
        }
        return total;
    }
};

} // namespace lm
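A usage sketch for the pool (the Tensor constructor signature is inferred from the acquire() fallback above):

```cpp
#include "lm/core/tensor_pool.hpp"

int main() {
    lm::TensorPool pool;

    // Borrow a 2x512 tensor; allocates only when the pool has no match.
    auto t = pool.acquire({2, 512}, /*requires_grad=*/true);

    // ... use *t ...

    // Hand it back: release() zeroes data and grad, then keys the tensor by
    // (shape, requires_grad) so the next acquire({2, 512}, true) reuses it.
    pool.release(std::move(t));
    return pool.size() == 1 ? 0 : 1;
}
```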
include/lm/generation/sampler.hpp (new file, 54 lines)
@@ -0,0 +1,54 @@
#pragma once

#include "../core/tensor.hpp"
#include <vector>
#include <random>
#include <algorithm>
#include <numeric>

namespace lm {

class Sampler {
public:
    virtual ~Sampler() = default;
    virtual int sample(const Tensor& logits) = 0;
};

class GreedySampler : public Sampler {
public:
    int sample(const Tensor& logits) override;
};

class RandomSampler : public Sampler {
public:
    RandomSampler(float temperature = 1.0);
    int sample(const Tensor& logits) override;

private:
    float temperature_;
    std::mt19937 gen_;
};

class TopKSampler : public Sampler {
public:
    TopKSampler(int k, float temperature = 1.0);
    int sample(const Tensor& logits) override;

private:
    int k_;
    float temperature_;
    std::mt19937 gen_;
};

class TopPSampler : public Sampler {
public:
    TopPSampler(float p, float temperature = 1.0);
    int sample(const Tensor& logits) override;

private:
    float p_;
    float temperature_;
    std::mt19937 gen_;
};

} // namespace lm
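The definitions live in src/generation/sampler.cpp, which is not shown in this commit. For the greedy case the intent reduces to an argmax over the logits; a sketch over a plain Eigen vector, since Tensor's accessor API is not shown here:

```cpp
#include <Eigen/Dense>

// Greedy decoding: pick the index of the largest logit.
int greedy_sample(const Eigen::VectorXf& logits) {
    Eigen::Index best = 0;
    logits.maxCoeff(&best);  // Eigen writes the argmax to the out-parameter
    return static_cast<int>(best);
}

int main() {
    Eigen::VectorXf logits(4);
    logits << 0.1f, 2.5f, -1.0f, 0.3f;
    return greedy_sample(logits);  // returns 1
}
```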
include/lm/models/attention (copy 1).hpp (new file, 37 lines)
@@ -0,0 +1,37 @@
#pragma once

#include "lm/core/tensor.hpp"
#include <vector>
#include <memory>

namespace lm {

class MultiHeadAttention {
public:
    MultiHeadAttention(size_t d_model, size_t num_heads, float dropout = 0.1f);

    std::vector<Tensor> parameters() const;
    void set_training(bool training);
    Tensor forward(const Tensor& query, const Tensor& key, const Tensor& value,
                   const Tensor& mask = Tensor()) const;

private:
    Tensor split_heads(const Tensor& x) const;
    Tensor combine_heads(const Tensor& x) const;
    Tensor scaled_dot_product_attention(const Tensor& q, const Tensor& k,
                                        const Tensor& v, const Tensor& mask) const;
    Tensor apply_dropout(const Tensor& input, float dropout_rate) const;

    size_t d_model_;
    size_t num_heads_;
    size_t d_k_;
    float dropout_;
    bool training_ = false;

    Tensor w_q_;
    Tensor w_k_;
    Tensor w_v_;
    Tensor w_o_;
};

} // namespace lm
include/lm/models/conversation_model.hpp (new file, 54 lines)
@@ -0,0 +1,54 @@
// Enhanced conversation_model.hpp
#pragma once

#include "transformer_model.hpp"
#include "bpe_tokenizer.hpp"
#include "context_manager.hpp"
#include <string>
#include <vector>
#include <memory>

namespace lm {

class ConversationModel {
public:
    ConversationModel(size_t vocab_size,
                      size_t d_model = 512,
                      size_t n_layers = 6,
                      size_t n_heads = 8,
                      size_t d_ff = 2048,
                      float dropout = 0.1);

    // Train the model
    void train(const std::vector<std::string>& conversations);

    // Generate a response with context management
    std::string generate_response(const std::string& user_input);

    // Context management
    void clear_context();
    void set_system_prompt(const std::string& prompt);
    size_t get_context_token_count() const;

    // Save and load
    bool save_model(const std::string& path);
    bool load_model(const std::string& path);

    // Set tokenizer
    void set_tokenizer(std::shared_ptr<BPETokenizer> tokenizer) {
        tokenizer_ = tokenizer;
        context_manager_ = std::make_unique<ContextManager>(2048, 20);
    }

private:
    std::shared_ptr<BPETokenizer> tokenizer_;
    std::unique_ptr<TransformerModel> transformer_;
    std::unique_ptr<ContextManager> context_manager_;
    std::string system_prompt_;

    // Format conversation for training
    std::string format_conversation(const std::vector<std::string>& turns);
};

} // namespace lm
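A usage sketch of this interface (hyperparameters arbitrary; it assumes BPETokenizer is default-constructible, which this commit does not show):

```cpp
#include "lm/models/conversation_model.hpp"
#include <iostream>
#include <memory>

int main() {
    auto tokenizer = std::make_shared<lm::BPETokenizer>();  // assumption: default-constructible
    lm::ConversationModel model(/*vocab_size=*/32000);      // other hyperparameters keep the defaults
    model.set_tokenizer(tokenizer);  // also installs a 2048-token, 20-turn ContextManager
    model.set_system_prompt("You are a helpful assistant.");

    std::cout << model.generate_response("Hello!") << '\n';
    std::cout << "context tokens: " << model.get_context_token_count() << '\n';
}
```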
include/lm/models/feed_forward (copy 1).hpp (new file, 32 lines)
@@ -0,0 +1,32 @@
#pragma once

#include "lm/core/tensor.hpp"
#include <vector>

namespace lm {

class FeedForward {
public:
    FeedForward(size_t d_model, size_t d_ff, float dropout = 0.1f);

    std::vector<Tensor> parameters() const;
    void set_training(bool training);
    Tensor forward(const Tensor& input) const;

private:
    Tensor apply_dropout(const Tensor& input, float dropout_rate) const;
    Tensor gelu(const Tensor& input) const;

    size_t d_model_;
    size_t d_ff_;
    float dropout_;
    bool training_ = false;

    Tensor w1_;
    Tensor b1_;
    Tensor w2_;
    Tensor b2_;
};

} // namespace lm
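The standard position-wise computation these members implement, where $\Phi$ is the standard normal CDF (the framework's exact GELU variant is defined in the source, not shown here):

```latex
\mathrm{FFN}(x) = \mathrm{GELU}(x W_1 + b_1)\, W_2 + b_2, \qquad
\mathrm{GELU}(x) = x\, \Phi(x)
```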
include/lm/models/language_model (copy 1).hpp (new file, 34 lines)
@@ -0,0 +1,34 @@
// include/lm/models/language_model.hpp
#pragma once

#include <vector>
#include <cstdint>
#include <string>
#include "../core/tensor.hpp"

namespace lm {

using TokenID = uint32_t;

class LanguageModel {
public:
    virtual ~LanguageModel() = default;

    // Pure virtual methods that must be implemented
    virtual std::vector<Tensor> get_parameters() const = 0;
    virtual void set_parameters(const std::vector<Tensor>& params) = 0;
    virtual Tensor forward(const std::vector<TokenID>& input) = 0;
    virtual Tensor forward(const std::vector<TokenID>& input,
                           const std::vector<TokenID>& targets) = 0;

    // Optional virtual methods with default implementations
    virtual size_t get_vocab_size() const { return 0; }
    virtual size_t get_max_sequence_length() const { return 0; }

    // Serialization
    virtual void save(const std::string& path) const = 0;
    virtual void load(const std::string& path) = 0;
};

} // namespace lm

32
include/lm/models/transformer_block (copy 1).hpp
Normal file
@ -0,0 +1,32 @@
#pragma once

#include "lm/core/tensor.hpp"
#include "lm/models/attention.hpp"
#include "lm/models/feed_forward.hpp"
#include "lm/models/layer_norm.hpp"
#include <memory>
#include <vector>

namespace lm {

class TransformerBlock {
public:
    TransformerBlock(size_t d_model, size_t num_heads, size_t d_ff, float dropout);

    std::vector<Tensor> parameters() const;
    void set_training(bool training);
    Tensor forward(const Tensor& input, const Tensor& mask = Tensor()) const;

private:
    size_t d_model_, num_heads_, d_ff_;
    float dropout_;
    bool training_ = false;

    std::unique_ptr<MultiHeadAttention> attention_;
    std::unique_ptr<FeedForward> feed_forward_;
    std::unique_ptr<LayerNorm> norm1_;
    std::unique_ptr<LayerNorm> norm2_;
};

} // namespace lm

60
include/lm/models/transformer_model.hpp
Normal file
@ -0,0 +1,60 @@
// transformer_model.hpp
#pragma once

#include <vector>
#include <cstdint>
#include <memory>
#include <string>
#include <cmath>
#include <random>
#include <iostream>
#include "lm/tokenizer/token_types.hpp"

namespace lm {

class TransformerModel {
public:
    TransformerModel(size_t vocab_size,
                     size_t d_model = 512,
                     size_t n_layers = 6,
                     size_t n_heads = 8,
                     size_t d_ff = 2048,
                     float dropout = 0.1);

    ~TransformerModel();

    // Forward pass
    std::vector<float> forward(const std::vector<TokenID>& input_tokens);

    // Training methods
    void train_step(const std::vector<TokenID>& input_tokens,
                    const std::vector<TokenID>& target_tokens);
    float calculate_loss(const std::vector<float>& logits,
                         const std::vector<TokenID>& targets);

    // Generation methods
    std::vector<TokenID> generate(const std::vector<TokenID>& context,
                                  size_t max_length = 100,
                                  float temperature = 1.0);

    // Serialization
    bool save(const std::string& filename);
    bool load(const std::string& filename);

    // Get model info
    size_t get_vocab_size() const { return vocab_size_; }
    size_t get_d_model() const { return d_model_; }

private:
    class Impl;
    std::unique_ptr<Impl> pimpl_;

    // Model parameters
    size_t vocab_size_;
    size_t d_model_;
    size_t n_layers_;
    size_t n_heads_;
    size_t d_ff_;
    float dropout_;
};

} // namespace lm
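
A minimal usage sketch of this pimpl facade, assuming the default hyperparameters above; the token IDs and file name are illustrative:

    #include "lm/models/transformer_model.hpp"

    int main() {
        lm::TransformerModel model(/*vocab_size=*/32000);   // other hyperparameters use the defaults above

        std::vector<lm::TokenID> context = {1, 2, 3};       // illustrative token IDs
        std::vector<float> logits = model.forward(context); // next-token scores

        auto continuation = model.generate(context, /*max_length=*/50, /*temperature=*/0.8f);
        model.save("transformer.bin");                      // hypothetical output path
    }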

80
include/lm/optimizers/adam (copy 1).hpp
Normal file
@ -0,0 +1,80 @@
// include/lm/optimizers/adam.hpp
#pragma once

#include <vector>
#include <string>
#include <cmath>
#include <cereal/types/vector.hpp>
#include <cereal/archives/binary.hpp>
#include "../core/tensor.hpp"

namespace lm {

class AdamOptimizer {
private:
    std::vector<Tensor> m;  // First moment vector
    std::vector<Tensor> v;  // Second moment vector
    size_t t;               // Timestep
    float beta1;
    float beta2;
    float epsilon;
    float learning_rate;

public:
    AdamOptimizer(float lr = 0.001, float b1 = 0.9, float b2 = 0.999, float eps = 1e-8);

    void update(std::vector<Tensor>& parameters,
                const std::vector<Tensor>& gradients);

    // Initialize moment vectors for parameters
    void initialize_moments(const std::vector<Tensor>& parameters);

    // Reset the optimizer state
    void reset();

    // Step function for compatibility with existing code
    void step(std::vector<Tensor>& parameters) {
        std::vector<Tensor> gradients;
        for (auto& param : parameters) {
            if (param.requires_grad()) {
                gradients.push_back(param.grad());
            } else {
                gradients.push_back(Tensor::zeros(param.shape(), false));
            }
        }
        update(parameters, gradients);
    }

    void zero_grad(std::vector<Tensor>& parameters) {
        for (auto& param : parameters) {
            if (param.requires_grad()) {
                param.zero_grad();
            }
        }
    }

    // Serialization methods
    void save_state(const std::string& path) const;
    void load_state(const std::string& path);

    // Cereal serialization
    template <class Archive>
    void serialize(Archive& archive) {
        archive(
            cereal::make_nvp("m", m),
            cereal::make_nvp("v", v),
            cereal::make_nvp("t", t),
            cereal::make_nvp("beta1", beta1),
            cereal::make_nvp("beta2", beta2),
            cereal::make_nvp("epsilon", epsilon),
            cereal::make_nvp("learning_rate", learning_rate)
        );
    }

    // Getters for state inspection
    size_t get_timestep() const { return t; }
    float get_learning_rate() const { return learning_rate; }
    void set_learning_rate(float lr) { learning_rate = lr; }
};

} // namespace lm
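
The members above (m, v, t, beta1, beta2, epsilon, learning_rate) map onto the standard Adam update, which update() is presumably expected to compute for gradient g_t at step t:

$$ m_t = \beta_1 m_{t-1} + (1-\beta_1)\,g_t, \qquad v_t = \beta_2 v_{t-1} + (1-\beta_2)\,g_t^2 $$

$$ \hat{m}_t = \frac{m_t}{1-\beta_1^{\,t}}, \qquad \hat{v}_t = \frac{v_t}{1-\beta_2^{\,t}}, \qquad \theta_t = \theta_{t-1} - \frac{\eta\,\hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon} $$

where eta is learning_rate and the bias corrections depend on the stored timestep t.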

54
include/lm/runtime/init (copy 1).hpp
Executable file
@ -0,0 +1,54 @@
// Runtime Initialization Header File
#pragma once

#include <string>
#include <nlohmann/json.hpp>
#include <filesystem>

namespace lm::runtime {

class SystemState {
public:
    // Singleton access
    static SystemState& get_instance();

    // Initialize from JSON config
    void initialize(const std::filesystem::path& config_path);

    // Configuration accessors
    const nlohmann::json& config() const noexcept;
    std::string get_string(const std::string& key) const;
    int get_int(const std::string& key, int default_val = 0) const;

    // Subsystem states
    bool is_tokenizer_ready() const noexcept;
    bool is_model_loaded() const noexcept;

private:
    SystemState() = default;  // Private constructor
    nlohmann::json config_;
    bool tokenizer_ready_ = false;
    bool model_loaded_ = false;
};

} // namespace lm::runtime

/*
This header provides the interface for the framework initialization system with:

1. Singleton pattern for global system state access
2. JSON configuration loading and access methods
3. Subsystem state tracking for tokenizer and model
4. Type-safe configuration access with default values

The implementation (in the corresponding `.cpp` file) handles:
- JSON configuration parsing and validation
- Subsystem initialization sequencing
- Error handling for malformed configurations
- State management across the framework

This initialization system provides a centralized way to configure and manage the LM framework components.
*/
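
A usage sketch of the singleton, assuming a config file exists; the key naming convention passed to get_int is an assumption, since the accessor contract lives in the .cpp:

    #include "lm/runtime/init.hpp"

    int main() {
        auto& state = lm::runtime::SystemState::get_instance();
        state.initialize("config.json");  // hypothetical config path

        // Key format is an assumption, not confirmed by this header
        int layers = state.get_int("model.layers", /*default_val=*/2);
        (void)layers;

        if (state.is_tokenizer_ready() && state.is_model_loaded()) {
            // ... safe to run inference ...
        }
        return 0;
    }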

22
include/lm/runtime/shutdown (copy 1).hpp
Executable file
@ -0,0 +1,22 @@
#pragma once

#include <nlohmann/json.hpp>
#include <filesystem>
#include <chrono>

namespace lm::runtime {

class ShutdownHandler {
public:
    // Serialize state to JSON
    static void save_state(
        const std::filesystem::path& output_path,
        bool include_model_weights = false
    );

    // Cleanup hooks
    static void register_cleanup(void (*func)());
    static void execute_cleanup();
};

} // namespace lm::runtime

56
include/lm/tokenizer/bpe_tokenizer (copy 1).hpp
Executable file
@ -0,0 +1,56 @@
#pragma once

#include <string>
#include <vector>
#include <memory>
#include <unordered_map>
#include "token_types.hpp"

namespace lm {

class BPETokenizer {
public:
    BPETokenizer();
    ~BPETokenizer();

    // Training methods
    void train(const std::vector<std::string>& corpus, size_t vocab_size);

    // Encoding/decoding methods
    std::vector<TokenID> encode(const std::string& text) const;
    std::string decode(const std::vector<TokenID>& tokens) const;

    // Vocabulary methods
    size_t vocab_size() const;

    // Serialization methods
    bool save(const std::string& filename) const;
    bool load(const std::string& filename);

    // Special token methods
    TokenID eos_token_id() const;
    void set_eos_token_id(TokenID id);

    TokenID pad_token_id() const;
    void set_pad_token_id(TokenID id);

    TokenID unk_token_id() const;
    void set_unk_token_id(TokenID id);

    // Add special tokens to vocabulary
    void add_special_token(const std::string& token, TokenID id);

    // UTF-8 validation method
    //bool is_valid_utf8_asm(const char* str, size_t length);

    // Debug methods
    void enable_debug_logging(bool enable);
    void dump_vocabulary() const;
    void dump_merges() const;

private:
    class Impl;
    std::unique_ptr<Impl> pimpl_;
};

} // namespace lm
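
A round-trip sketch of this interface, mirroring the alpha REPL later in this commit; the corpus and output path are illustrative:

    #include "lm/tokenizer/bpe_tokenizer.hpp"
    #include <iostream>

    int main() {
        lm::BPETokenizer tokenizer;
        tokenizer.train({"hello world", "test input", "simple example"}, 100);

        auto tokens = tokenizer.encode("hello world");
        std::string text = tokenizer.decode(tokens);  // should round-trip to "hello world"

        std::cout << text << " (" << tokens.size() << " tokens)\n";
        tokenizer.save("demo.bpe");  // hypothetical output path
    }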

10
include/lm/tokenizer/token_types.hpp
Normal file
@ -0,0 +1,10 @@
#pragma once

#include <cstdint>

namespace lm {

using TokenID = uint32_t;

} // namespace lm

42
include/lm/tokenizer/unicode_utils (copy 1).hpp
Executable file
@ -0,0 +1,42 @@
// Unicode Utilities Header File
#pragma once

#include <string>
#include <vector>
#include <cstdint>

namespace lm::unicode {

// Unicode character representation
struct CodePoint {
    uint32_t value;
    std::string utf8;  // UTF-8 representation
};

// Check if a code point is whitespace
bool is_whitespace(uint32_t codepoint);

// Check if a code point is punctuation
bool is_punctuation(uint32_t codepoint);

// Check if a code point is a control character
bool is_control(uint32_t codepoint);

// Normalize Unicode text (NFC normalization)
std::string normalize(const std::string& text);

// Split text into Unicode code points
std::vector<CodePoint> to_code_points(const std::string& text);

// Convert code points back to UTF-8 string
std::string from_code_points(const std::vector<CodePoint>& code_points);

// Unicode-aware string split (handles Unicode whitespace)
std::vector<std::string> unicode_split(const std::string& text);

// Unicode-aware character boundaries
std::vector<std::string> split_on_character_boundaries(const std::string& text);

} // namespace lm::unicode

36
include/lm/training/data_loader.hpp
Normal file
@ -0,0 +1,36 @@
// include/lm/training/data_loader.hpp
#pragma once

#include <vector>
#include <string>
#include <fstream>
#include <random>
#include "../core/tensor.hpp"
#include "../tokenizer/bpe_tokenizer.hpp"

namespace lm {

class ConversationDataLoader {
public:
    ConversationDataLoader(const std::string& file_path, BPETokenizer& tokenizer,
                           size_t batch_size, size_t seq_length);

    bool has_next() const;
    std::pair<Tensor, Tensor> next_batch();  // Returns (input, target) tensors

    void reset();
    size_t num_batches() const;

private:
    BPETokenizer& tokenizer_;
    size_t batch_size_;
    size_t seq_length_;
    std::vector<std::vector<int>> conversations_;
    size_t current_index_;

    void load_conversations(const std::string& file_path);
    std::vector<int> tokenize_conversation(const std::string& conversation);
};

} // namespace lm
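
A typical epoch loop over this loader, as a sketch; the corpus file name and batch sizes are assumptions:

    #include "lm/training/data_loader.hpp"

    void train_epoch(lm::BPETokenizer& tokenizer) {
        // "conversations.txt" is a hypothetical corpus file
        lm::ConversationDataLoader loader("conversations.txt", tokenizer,
                                          /*batch_size=*/32, /*seq_length=*/128);

        while (loader.has_next()) {
            auto [input, target] = loader.next_batch();  // (input, target) Tensor pair
            // ... feed the pair to the model's training step ...
        }
        loader.reset();  // rewind for the next epoch
    }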

11
include/lm/training/losses.hpp
Normal file
@ -0,0 +1,11 @@
// include/lm/training/losses.hpp
#pragma once

#include "../core/tensor.hpp"

namespace lm {

Tensor cross_entropy_loss(const Tensor& logits, const Tensor& targets, const Tensor& mask = Tensor());

} // namespace lm

42
include/lm/training/trainer (copy 1).hpp
Normal file
@ -0,0 +1,42 @@
// include/lm/training/trainer.hpp
#pragma once

#include <string>
#include "../models/language_model.hpp"
#include "../optimizers/adam.hpp"

namespace lm {
namespace training {

struct TrainingCheckpoint {
    size_t epoch;
    size_t iteration;
    float loss;

    template <class Archive>
    void serialize(Archive& archive) {
        archive(epoch, iteration, loss);
    }
};

class Trainer {
private:
    LanguageModel& model;
    AdamOptimizer& optimizer;

public:
    Trainer(LanguageModel& model, AdamOptimizer& optimizer);

    void train(const std::vector<std::string>& corpus,
               size_t num_epochs,
               size_t batch_size,
               size_t sequence_length);

    void save_checkpoint(const std::string& path,
                         const TrainingCheckpoint& checkpoint) const;
    TrainingCheckpoint load_checkpoint(const std::string& path);
};

} // namespace training
} // namespace lm

49
src/alpha/config_io (copy 1).cpp
Normal file
@ -0,0 +1,49 @@
#include "lm/runtime/init.hpp"
#include <nlohmann/json.hpp>
#include <fstream>
#include <stdexcept>

nlohmann::json load_config(const std::string& path) {
    try {
        std::ifstream file(path);
        if (!file.is_open()) {
            throw std::runtime_error("Cannot open config file: " + path);
        }

        nlohmann::json config;
        file >> config;
        return config;

    } catch (const std::exception&) {
        // Fall back to a default config if the file doesn't exist or is invalid
        return nlohmann::json{
            {"alpha", {
                {"prompt", "> "},
                {"save_on_exit", true}
            }},
            {"tokenizer", {
                {"type", "bpe"},
                {"vocab_size", 100},
                {"dummy_data", true}
            }},
            {"model", {
                {"layers", 2},
                {"dim", 64}
            }}
        };
    }
}

void save_config(const nlohmann::json& config, const std::string& path) {
    try {
        std::ofstream file(path);
        if (!file.is_open()) {
            throw std::runtime_error("Cannot open file for writing: " + path);
        }

        file << config.dump(2);  // Pretty-print with 2-space indentation

    } catch (const std::exception& e) {
        throw std::runtime_error("Failed to save config: " + std::string(e.what()));
    }
}

44
src/alpha/repl (copy 1).cpp
Normal file
@ -0,0 +1,44 @@
#include <iostream>
#include <string>
#include <vector>
#include "lm/tokenizer/bpe_tokenizer.hpp"

void run_repl() {
    lm::BPETokenizer tokenizer;

    // Simple training for the alpha
    std::vector<std::string> corpus = {
        "hello world", "test input", "simple example"
    };
    tokenizer.train(corpus, 100);

    std::cout << "LM Framework Alpha\n> ";

    std::string input;
    while (std::getline(std::cin, input)) {
        if (input == "/exit") break;

        try {
            auto tokens = tokenizer.encode(input);
            std::cout << "Tokens: ";
            for (auto token : tokens) {
                std::cout << token << " ";
            }
            std::cout << "\n> ";
        } catch (const std::exception& e) {
            std::cout << "Error: " << e.what() << "\n> ";
        }
    }

    std::cout << "Saving session...\n";
    tokenizer.save("alpha_session.bpe");
}

int main() {
    try {
        run_repl();
    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << "\n";
        return 1;
    }
    return 0;
}

78
src/context_manager.cpp
Normal file
@ -0,0 +1,78 @@
// context_manager.cpp
#include "context_manager.hpp"
#include "lm/tokenizer/bpe_tokenizer.hpp"
#include <algorithm>

namespace lm {

ContextManager::ContextManager(size_t max_context_tokens, size_t max_turns)
    : max_context_tokens(max_context_tokens), max_turns(max_turns), current_token_count(0) {}

void ContextManager::add_user_message(const std::string& message) {
    add_message("user", message);
}

void ContextManager::add_assistant_message(const std::string& message) {
    add_message("assistant", message);
}

void ContextManager::add_system_message(const std::string& message) {
    add_message("system", message);
}

void ContextManager::add_message(const std::string& role, const std::string& content) {
    // Tokenize to count tokens (in a real implementation, you'd use your tokenizer).
    // For now, use a rough approximation of four characters per token.
    size_t token_count = content.size() / 4;

    conversation_turns.push_back({role, content, token_count});
    current_token_count += token_count;

    // Add role tokens
    current_token_count += 5;  // Approximate token count for role tags

    prune_old_messages();
}

void ContextManager::prune_old_messages() {
    while (current_token_count > max_context_tokens && conversation_turns.size() > 1) {
        // Remove the oldest turn
        const auto& oldest_turn = conversation_turns.front();
        current_token_count -= oldest_turn.token_count;
        current_token_count -= 5;  // Role tags

        conversation_turns.pop_front();
    }

    // Also respect the max turns limit
    while (conversation_turns.size() > max_turns) {
        const auto& oldest_turn = conversation_turns.front();
        current_token_count -= oldest_turn.token_count;
        current_token_count -= 5;  // Role tags

        conversation_turns.pop_front();
    }
}

std::string ContextManager::get_context() const {
    std::string context;

    for (const auto& turn : conversation_turns) {
        context += "<|" + turn.role + "|>" + turn.content + "<|endoftext|>";
    }

    return context;
}

std::vector<TokenID> ContextManager::get_context_tokens() const {
    // In a real implementation, you'd tokenize the context.
    // For now, return an empty vector.
    return {};
}

void ContextManager::clear() {
    conversation_turns.clear();
    current_token_count = 0;
}

} // namespace lm
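
A short sketch of how the rolling window behaves; the budgets match the values ConversationModel::set_tokenizer uses:

    #include "context_manager.hpp"
    #include <iostream>

    int main() {
        lm::ContextManager ctx(/*max_context_tokens=*/2048, /*max_turns=*/20);
        ctx.add_system_message("You are a helpful assistant.");
        ctx.add_user_message("Hello!");

        // Produces "<|system|>...<|endoftext|><|user|>Hello!<|endoftext|>";
        // the oldest turns are pruned automatically once either budget is exceeded.
        std::cout << ctx.get_context() << "\n";
    }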

200
src/conversation_manager.cpp
Normal file
@ -0,0 +1,200 @@
// src/conversation_manager.cpp
#include "lm/conversation_manager.hpp"
#include <random>
#include <algorithm>
#include <fstream>
#include <iostream>
#include <cereal/types/unordered_map.hpp>
#include <cereal/types/vector.hpp>
#include <cereal/types/map.hpp>
#include <cereal/types/string.hpp>
#include <cereal/types/chrono.hpp>
#include <cereal/types/memory.hpp>
#include <cereal/archives/binary.hpp>

namespace lm {

ConversationManager::ConversationManager() {}

ConversationManager::~ConversationManager() {}

std::string ConversationManager::generate_id() const {
    static const char alphanum[] =
        "0123456789"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz";

    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<> dis(0, sizeof(alphanum) - 2);

    std::string id;
    for (int i = 0; i < 16; ++i) {
        id += alphanum[dis(gen)];
    }

    return id;
}

std::string ConversationManager::create_conversation(const std::string& title) {
    std::lock_guard<std::mutex> lock(mutex_);

    std::string id = generate_id();
    auto conversation = std::make_shared<Conversation>();

    if (!title.empty()) {
        conversation->metadata["title"] = title;
    }

    conversations_[id] = conversation;
    return id;
}

std::shared_ptr<Conversation> ConversationManager::get_conversation(const std::string& id) {
    std::lock_guard<std::mutex> lock(mutex_);

    auto it = conversations_.find(id);
    if (it != conversations_.end()) {
        return it->second;
    }

    return nullptr;
}

std::vector<std::string> ConversationManager::list_conversations() const {
    std::lock_guard<std::mutex> lock(mutex_);

    std::vector<std::string> ids;
    for (const auto& pair : conversations_) {
        ids.push_back(pair.first);
    }

    return ids;
}

void ConversationManager::add_message(const std::string& conversation_id,
                                      const std::string& role,
                                      const std::string& content) {
    std::lock_guard<std::mutex> lock(mutex_);

    auto it = conversations_.find(conversation_id);
    if (it == conversations_.end()) {
        throw std::runtime_error("Conversation not found: " + conversation_id);
    }

    SpeakerType speaker_type = string_to_speaker_type(role);
    it->second->add_turn(speaker_type, content);
}

std::vector<ConversationTurn> ConversationManager::get_history(const std::string& conversation_id) const {
    std::lock_guard<std::mutex> lock(mutex_);

    auto it = conversations_.find(conversation_id);
    if (it == conversations_.end()) {
        throw std::runtime_error("Conversation not found: " + conversation_id);
    }

    return it->second->turns;
}

bool ConversationManager::save_conversations(const std::string& path) const {
    std::lock_guard<std::mutex> lock(mutex_);

    try {
        std::ofstream ofs(path, std::ios::binary);
        cereal::BinaryOutputArchive archive(ofs);
        archive(conversations_);
        return true;
    } catch (const std::exception& e) {
        std::cerr << "Error saving conversations: " << e.what() << std::endl;
        return false;
    }
}

bool ConversationManager::load_conversations(const std::string& path) {
    std::lock_guard<std::mutex> lock(mutex_);

    try {
        std::ifstream ifs(path, std::ios::binary);
        if (!ifs.is_open()) {
            std::cerr << "Could not open file: " << path << std::endl;
            return false;
        }

        cereal::BinaryInputArchive archive(ifs);
        archive(conversations_);
        return true;
    } catch (const std::exception& e) {
        std::cerr << "Error loading conversations: " << e.what() << std::endl;
        return false;
    }
}

bool ConversationManager::delete_conversation(const std::string& id) {
    std::lock_guard<std::mutex> lock(mutex_);

    return conversations_.erase(id) > 0;
}

void ConversationManager::set_title(const std::string& conversation_id, const std::string& title) {
    std::lock_guard<std::mutex> lock(mutex_);

    auto it = conversations_.find(conversation_id);
    if (it == conversations_.end()) {
        throw std::runtime_error("Conversation not found: " + conversation_id);
    }

    it->second->metadata["title"] = title;
}

std::string ConversationManager::get_title(const std::string& conversation_id) const {
    std::lock_guard<std::mutex> lock(mutex_);

    auto it = conversations_.find(conversation_id);
    if (it == conversations_.end()) {
        throw std::runtime_error("Conversation not found: " + conversation_id);
    }

    auto title_it = it->second->metadata.find("title");
    if (title_it != it->second->metadata.end()) {
        return title_it->second;
    }

    return "Untitled Conversation";
}

std::map<std::string, std::string> ConversationManager::get_metadata(const std::string& conversation_id) const {
    std::lock_guard<std::mutex> lock(mutex_);

    auto it = conversations_.find(conversation_id);
    if (it == conversations_.end()) {
        throw std::runtime_error("Conversation not found: " + conversation_id);
    }

    return it->second->metadata;
}

void ConversationManager::update_metadata(const std::string& conversation_id,
                                          const std::map<std::string, std::string>& metadata) {
    std::lock_guard<std::mutex> lock(mutex_);

    auto it = conversations_.find(conversation_id);
    if (it == conversations_.end()) {
        throw std::runtime_error("Conversation not found: " + conversation_id);
    }

    for (const auto& pair : metadata) {
        it->second->metadata[pair.first] = pair.second;
    }
}

void ConversationManager::clear() {
    std::lock_guard<std::mutex> lock(mutex_);
    conversations_.clear();
}

size_t ConversationManager::count() const {
    std::lock_guard<std::mutex> lock(mutex_);
    return conversations_.size();
}

} // namespace lm

135
src/generation/sampler.cpp
Normal file
@ -0,0 +1,135 @@
#include "lm/generation/sampler.hpp"
#include <algorithm>
#include <cmath>
#include <numeric>
#include <queue>
#include <functional>

namespace lm {

int GreedySampler::sample(const Tensor& logits) {
    // Find the token with the highest probability
    const auto& data = logits.data();
    int best_idx = 0;
    float best_val = data(0);

    for (int i = 1; i < data.size(); ++i) {
        if (data(i) > best_val) {
            best_val = data(i);
            best_idx = i;
        }
    }

    return best_idx;
}

RandomSampler::RandomSampler(float temperature)
    : temperature_(temperature), gen_(std::random_device{}()) {}

int RandomSampler::sample(const Tensor& logits) {
    // Apply temperature
    Eigen::VectorXf probs = logits.data();
    if (temperature_ != 1.0f) {
        probs = probs / temperature_;
    }

    // Softmax (shift by the max for numerical stability)
    probs = (probs.array() - probs.maxCoeff()).exp();
    probs /= probs.sum();

    // Sample from distribution
    std::discrete_distribution<int> dist(probs.data(), probs.data() + probs.size());
    return dist(gen_);
}

TopKSampler::TopKSampler(int k, float temperature)
    : k_(k), temperature_(temperature), gen_(std::random_device{}()) {}

int TopKSampler::sample(const Tensor& logits) {
    // Apply temperature
    Eigen::VectorXf probs = logits.data();
    if (temperature_ != 1.0f) {
        probs = probs / temperature_;
    }

    // Softmax (shift by the max for numerical stability)
    probs = (probs.array() - probs.maxCoeff()).exp();
    probs /= probs.sum();

    // Keep the k most probable tokens with a min-heap
    using Pair = std::pair<float, int>;
    std::priority_queue<Pair, std::vector<Pair>, std::greater<Pair>> min_heap;

    for (int i = 0; i < probs.size(); ++i) {
        min_heap.push({probs(i), i});
        if (min_heap.size() > static_cast<size_t>(k_)) {
            min_heap.pop();
        }
    }

    // Extract indices and probabilities
    std::vector<float> top_probs;
    std::vector<int> top_indices;

    while (!min_heap.empty()) {
        top_probs.push_back(min_heap.top().first);
        top_indices.push_back(min_heap.top().second);
        min_heap.pop();
    }

    // Normalize
    float sum = std::accumulate(top_probs.begin(), top_probs.end(), 0.0f);
    for (float& p : top_probs) {
        p /= sum;
    }

    // Sample from the top-k distribution
    std::discrete_distribution<int> dist(top_probs.begin(), top_probs.end());
    return top_indices[dist(gen_)];
}

TopPSampler::TopPSampler(float p, float temperature)
    : p_(p), temperature_(temperature), gen_(std::random_device{}()) {}

int TopPSampler::sample(const Tensor& logits) {
    // Apply temperature
    Eigen::VectorXf probs = logits.data();
    if (temperature_ != 1.0f) {
        probs = probs / temperature_;
    }

    // Softmax (shift by the max for numerical stability)
    probs = (probs.array() - probs.maxCoeff()).exp();
    probs /= probs.sum();

    // Create indices and sort by probability, descending
    std::vector<int> indices(probs.size());
    std::iota(indices.begin(), indices.end(), 0);
    std::sort(indices.begin(), indices.end(),
              [&probs](int a, int b) { return probs(a) > probs(b); });

    // Find the smallest set of tokens whose cumulative probability >= p
    float cumulative = 0.0f;
    std::vector<float> top_probs;
    std::vector<int> top_indices;

    for (size_t i = 0; i < indices.size(); ++i) {
        int idx = indices[i];
        cumulative += probs(idx);
        top_probs.push_back(probs(idx));
        top_indices.push_back(idx);

        if (cumulative >= p_) {
            break;
        }
    }

    // Renormalize
    for (float& p : top_probs) {
        p /= cumulative;
    }

    // Sample from the top-p (nucleus) distribution
    std::discrete_distribution<int> dist(top_probs.begin(), top_probs.end());
    return top_indices[dist(gen_)];
}

} // namespace lm
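
A sketch of how these samplers plug together; `logits` is assumed to be a vocabulary-sized lm::Tensor produced by the model, and the parameter values are illustrative:

    #include "lm/generation/sampler.hpp"

    int pick_next_token(const lm::Tensor& logits) {
        lm::GreedySampler greedy;           // deterministic argmax
        lm::TopKSampler top_k(40, 0.8f);    // sample among the 40 most likely tokens
        lm::TopPSampler top_p(0.9f, 0.8f);  // nucleus sampling with p = 0.9

        return top_k.sample(logits);        // swap in greedy/top_p as needed
    }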

391
src/models/attention (copy 1).cpp
Normal file
@ -0,0 +1,391 @@
#include "lm/models/attention.hpp"
#include <cmath>
#include <iostream>
#include <limits>
#include <random>

namespace lm {

MultiHeadAttention::MultiHeadAttention(size_t d_model, size_t num_heads, float dropout)
    : d_model_(d_model), num_heads_(num_heads), dropout_(dropout) {

    // Ensure d_model is divisible by num_heads
    if (d_model % num_heads != 0) {
        throw std::invalid_argument("d_model must be divisible by num_heads");
    }

    d_k_ = d_model / num_heads;

    // Initialize weight matrices
    w_q_ = Tensor::xavier(std::vector<size_t>{d_model_, d_model_});
    w_k_ = Tensor::xavier(std::vector<size_t>{d_model_, d_model_});
    w_v_ = Tensor::xavier(std::vector<size_t>{d_model_, d_model_});
    w_o_ = Tensor::xavier(std::vector<size_t>{d_model_, d_model_});

    std::cout << "Initialized MultiHeadAttention with:\n";
    std::cout << "  d_model: " << d_model_ << "\n";
    std::cout << "  num_heads: " << num_heads_ << "\n";
    std::cout << "  d_k: " << d_k_ << "\n";
    std::cout << "  dropout: " << dropout_ << "\n";
}

std::vector<Tensor> MultiHeadAttention::parameters() const {
    return {w_q_, w_k_, w_v_, w_o_};
}

void MultiHeadAttention::set_training(bool training) {
    training_ = training;
}

Tensor MultiHeadAttention::forward(const Tensor& query, const Tensor& key,
                                   const Tensor& value, const Tensor& mask) const {
    // Get batch size and sequence length
    //size_t batch_size = query.shape()[0];
    //size_t seq_len = query.shape()[1];

    // Linear projections
    Tensor q = query.matmul(w_q_);  // [batch_size, seq_len, d_model]
    Tensor k = key.matmul(w_k_);    // [batch_size, seq_len, d_model]
    Tensor v = value.matmul(w_v_);  // [batch_size, seq_len, d_model]

    // Split into multiple heads
    q = split_heads(q);  // [batch_size, num_heads, seq_len, d_k]
    k = split_heads(k);  // [batch_size, num_heads, seq_len, d_k]
    v = split_heads(v);  // [batch_size, num_heads, seq_len, d_k]

    // Apply scaled dot-product attention
    Tensor attention_output = scaled_dot_product_attention(q, k, v, mask);

    // Combine heads
    attention_output = combine_heads(attention_output);  // [batch_size, seq_len, d_model]

    // Final linear projection
    Tensor output = attention_output.matmul(w_o_);  // [batch_size, seq_len, d_model]

    return output;
}

Tensor MultiHeadAttention::split_heads(const Tensor& x) const {
    // x shape: [batch_size, seq_len, d_model]
    size_t batch_size = x.shape()[0];
    size_t seq_len = x.shape()[1];

    // Reshape to [batch_size, seq_len, num_heads, d_k]
    Tensor result(std::vector<size_t>{batch_size, seq_len, num_heads_, d_k_});

    // Calculate strides for flat indexing
    size_t x_stride_1 = d_model_;                // stride for sequence position in x
    size_t result_stride_1 = num_heads_ * d_k_;  // stride for sequence position in result
    size_t result_stride_2 = d_k_;               // stride for head position in result

    for (size_t b = 0; b < batch_size; ++b) {
        for (size_t t = 0; t < seq_len; ++t) {
            for (size_t h = 0; h < num_heads_; ++h) {
                for (size_t d = 0; d < d_k_; ++d) {
                    size_t src_idx = d + h * d_k_;

                    // Calculate flat indices
                    size_t x_index = b * seq_len * x_stride_1 + t * x_stride_1 + src_idx;
                    size_t result_index = b * seq_len * result_stride_1 +
                                          t * result_stride_1 +
                                          h * result_stride_2 +
                                          d;

                    result(result_index) = x(x_index);
                }
            }
        }
    }

    // Transpose to [batch_size, num_heads, seq_len, d_k]
    Tensor transposed(std::vector<size_t>{batch_size, num_heads_, seq_len, d_k_});

    // Calculate strides for transposed tensor
    size_t transposed_stride_1 = seq_len * d_k_;  // stride for head position
    size_t transposed_stride_2 = d_k_;            // stride for sequence position

    for (size_t b = 0; b < batch_size; ++b) {
        for (size_t h = 0; h < num_heads_; ++h) {
            for (size_t t = 0; t < seq_len; ++t) {
                for (size_t d = 0; d < d_k_; ++d) {
                    // Calculate flat indices
                    size_t result_index = b * seq_len * result_stride_1 +
                                          t * result_stride_1 +
                                          h * result_stride_2 +
                                          d;
                    size_t transposed_index = b * num_heads_ * transposed_stride_1 +
                                              h * transposed_stride_1 +
                                              t * transposed_stride_2 +
                                              d;

                    transposed(transposed_index) = result(result_index);
                }
            }
        }
    }

    return transposed;
}

Tensor MultiHeadAttention::combine_heads(const Tensor& x) const {
    // x shape: [batch_size, num_heads, seq_len, d_k]
    size_t batch_size = x.shape()[0];
    size_t num_heads = x.shape()[1];
    size_t seq_len = x.shape()[2];
    size_t d_k = x.shape()[3];

    // Transpose back to [batch_size, seq_len, num_heads, d_k]
    Tensor transposed(std::vector<size_t>{batch_size, seq_len, num_heads, d_k});

    // Calculate strides for flat indexing
    size_t x_stride_1 = seq_len * d_k;             // stride for head position in x
    size_t x_stride_2 = d_k;                       // stride for sequence position in x
    size_t transposed_stride_1 = num_heads * d_k;  // stride for sequence position in transposed
    size_t transposed_stride_2 = d_k;              // stride for head position in transposed

    for (size_t b = 0; b < batch_size; ++b) {
        for (size_t t = 0; t < seq_len; ++t) {
            for (size_t h = 0; h < num_heads; ++h) {
                for (size_t d = 0; d < d_k; ++d) {
                    // Calculate flat indices
                    size_t x_index = b * num_heads * x_stride_1 +
                                     h * x_stride_1 +
                                     t * x_stride_2 +
                                     d;
                    size_t transposed_index = b * seq_len * transposed_stride_1 +
                                              t * transposed_stride_1 +
                                              h * transposed_stride_2 +
                                              d;

                    transposed(transposed_index) = x(x_index);
                }
            }
        }
    }

    // Combine to [batch_size, seq_len, d_model]
    Tensor result(std::vector<size_t>{batch_size, seq_len, d_model_});

    // Calculate strides for result
    size_t result_stride_1 = d_model_;  // stride for sequence position
    //size_t result_stride_2 = d_k;     // stride for head position

    for (size_t b = 0; b < batch_size; ++b) {
        for (size_t t = 0; t < seq_len; ++t) {
            for (size_t h = 0; h < num_heads; ++h) {
                for (size_t d = 0; d < d_k; ++d) {
                    // Calculate flat index for transposed
                    size_t transposed_index = b * seq_len * transposed_stride_1 +
                                              t * transposed_stride_1 +
                                              h * transposed_stride_2 +
                                              d;

                    // Calculate destination index in result
                    size_t dst_idx = d + h * d_k;

                    // Calculate flat index for result
                    size_t result_index = b * seq_len * result_stride_1 +
                                          t * result_stride_1 +
                                          dst_idx;

                    result(result_index) = transposed(transposed_index);
                }
            }
        }
    }

    return result;
}

Tensor MultiHeadAttention::scaled_dot_product_attention(const Tensor& q, const Tensor& k,
                                                        const Tensor& v, const Tensor& mask) const {
    // q, k, v shapes: [batch_size, num_heads, seq_len, d_k]
    size_t batch_size = q.shape()[0];
    size_t num_heads = q.shape()[1];
    size_t seq_len = q.shape()[2];
    size_t d_k = q.shape()[3];

    // Compute attention scores
    Tensor scores(std::vector<size_t>{batch_size, num_heads, seq_len, seq_len});

    // Calculate strides for flat indexing
    size_t q_stride_1 = seq_len * d_k;           // stride for head position in q
    size_t q_stride_2 = d_k;                     // stride for sequence position in q
    size_t k_stride_1 = seq_len * d_k;           // stride for head position in k
    size_t k_stride_2 = d_k;                     // stride for sequence position in k
    size_t scores_stride_1 = seq_len * seq_len;  // stride for head position in scores
    size_t scores_stride_2 = seq_len;            // stride for sequence position in scores

    // Matrix multiplication: q * k^T
    for (size_t b = 0; b < batch_size; ++b) {
        for (size_t h = 0; h < num_heads; ++h) {
            for (size_t i = 0; i < seq_len; ++i) {
                for (size_t j = 0; j < seq_len; ++j) {
                    // Calculate flat index for scores
                    size_t scores_index = b * num_heads * scores_stride_1 +
                                          h * scores_stride_1 +
                                          i * scores_stride_2 +
                                          j;

                    scores(scores_index) = 0.0;

                    for (size_t d = 0; d < d_k; ++d) {
                        // Calculate flat indices for q and k
                        size_t q_index = b * num_heads * q_stride_1 +
                                         h * q_stride_1 +
                                         i * q_stride_2 +
                                         d;
                        size_t k_index = b * num_heads * k_stride_1 +
                                         h * k_stride_1 +
                                         j * k_stride_2 +
                                         d;

                        scores(scores_index) += q(q_index) * k(k_index);
                    }

                    scores(scores_index) /= std::sqrt(static_cast<float>(d_k));
                }
            }
        }
    }

    // Apply mask if provided
    if (mask.size() > 0) {
        size_t mask_stride_1 = seq_len * seq_len;  // stride for batch position in mask
        size_t mask_stride_2 = seq_len;            // stride for sequence position in mask

        for (size_t b = 0; b < batch_size; ++b) {
            for (size_t h = 0; h < num_heads; ++h) {
                for (size_t i = 0; i < seq_len; ++i) {
                    for (size_t j = 0; j < seq_len; ++j) {
                        // Calculate flat indices
                        size_t scores_index = b * num_heads * scores_stride_1 +
                                              h * scores_stride_1 +
                                              i * scores_stride_2 +
                                              j;
                        size_t mask_index = b * mask_stride_1 +
                                            i * mask_stride_2 +
                                            j;

                        if (mask(mask_index) == 0.0) {
                            scores(scores_index) = -1e9;  // Large negative value
                        }
                    }
                }
            }
        }
    }

    // Apply softmax to get attention weights
    Tensor weights(std::vector<size_t>{batch_size, num_heads, seq_len, seq_len});

    for (size_t b = 0; b < batch_size; ++b) {
        for (size_t h = 0; h < num_heads; ++h) {
            for (size_t i = 0; i < seq_len; ++i) {
                // Find max for numerical stability
                float max_val = -std::numeric_limits<float>::infinity();
                for (size_t j = 0; j < seq_len; ++j) {
                    size_t scores_index = b * num_heads * scores_stride_1 +
                                          h * scores_stride_1 +
                                          i * scores_stride_2 +
                                          j;
                    if (scores(scores_index) > max_val) {
                        max_val = scores(scores_index);
                    }
                }

                // Compute exponentials and sum
                float sum = 0.0;
                for (size_t j = 0; j < seq_len; ++j) {
                    size_t scores_index = b * num_heads * scores_stride_1 +
                                          h * scores_stride_1 +
                                          i * scores_stride_2 +
                                          j;
                    size_t weights_index = b * num_heads * scores_stride_1 +
                                           h * scores_stride_1 +
                                           i * scores_stride_2 +
                                           j;

                    weights(weights_index) = std::exp(scores(scores_index) - max_val);
                    sum += weights(weights_index);
                }

                // Normalize
                for (size_t j = 0; j < seq_len; ++j) {
                    size_t weights_index = b * num_heads * scores_stride_1 +
                                           h * scores_stride_1 +
                                           i * scores_stride_2 +
                                           j;

                    weights(weights_index) /= sum;
                }
            }
        }
    }

    // Apply dropout during training
    if (training_) {
        weights = apply_dropout(weights, dropout_);
    }

    // Multiply weights by values
    Tensor output(std::vector<size_t>{batch_size, num_heads, seq_len, d_k});

    // Calculate strides for output and v
    size_t output_stride_1 = seq_len * d_k;  // stride for head position in output
    size_t output_stride_2 = d_k;            // stride for sequence position in output
    size_t v_stride_1 = seq_len * d_k;       // stride for head position in v
    size_t v_stride_2 = d_k;                 // stride for sequence position in v

    for (size_t b = 0; b < batch_size; ++b) {
        for (size_t h = 0; h < num_heads; ++h) {
            for (size_t i = 0; i < seq_len; ++i) {
                for (size_t d = 0; d < d_k; ++d) {
                    // Calculate flat index for output
                    size_t output_index = b * num_heads * output_stride_1 +
                                          h * output_stride_1 +
                                          i * output_stride_2 +
                                          d;

                    output(output_index) = 0.0;

                    for (size_t j = 0; j < seq_len; ++j) {
                        // Calculate flat indices for weights and v
                        size_t weights_index = b * num_heads * scores_stride_1 +
                                               h * scores_stride_1 +
                                               i * scores_stride_2 +
                                               j;
                        size_t v_index = b * num_heads * v_stride_1 +
                                         h * v_stride_1 +
                                         j * v_stride_2 +
                                         d;

                        output(output_index) += weights(weights_index) * v(v_index);
                    }
                }
            }
        }
    }

    return output;
}

Tensor MultiHeadAttention::apply_dropout(const Tensor& input, float dropout_rate) const {
    if (dropout_rate <= 0.0) return input;

    Tensor output = input;
    std::random_device rd;
    std::mt19937 gen(rd());
    std::bernoulli_distribution dist(1.0 - dropout_rate);

    // Inverted dropout: zero out units and scale the survivors
    for (size_t i = 0; i < output.size(); ++i) {
        if (!dist(gen)) {
            output(i) = 0.0;
        } else {
            output(i) /= (1.0 - dropout_rate);
        }
    }

    return output;
}

} // namespace lm
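
The loops above are a straightforward, unvectorized evaluation of scaled dot-product attention:

$$ \mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}} + M\right) V $$

where M is 0 at visible positions and a large negative value (here -1e9) where the mask is zero, so masked positions receive effectively zero weight after the softmax.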

104
src/models/conversation_model.cpp
Normal file
@ -0,0 +1,104 @@
// Enhanced conversation_model.cpp
#include "conversation_model.hpp"
#include <algorithm>
#include <sstream>

namespace lm {

ConversationModel::ConversationModel(size_t vocab_size, size_t d_model,
                                     size_t n_layers, size_t n_heads,
                                     size_t d_ff, float dropout) {
    transformer_ = std::make_unique<TransformerModel>(vocab_size, d_model, n_layers,
                                                      n_heads, d_ff, dropout);
}

void ConversationModel::train(const std::vector<std::string>& conversations) {
    for (const auto& conversation : conversations) {
        // Tokenize the conversation
        auto tokens = tokenizer_->encode(conversation);

        if (tokens.size() < 2) continue;

        // Create input and target sequences (targets are the inputs shifted by one token)
        std::vector<TokenID> input_tokens(tokens.begin(), tokens.end() - 1);
        std::vector<TokenID> target_tokens(tokens.begin() + 1, tokens.end());

        // Training step
        transformer_->train_step(input_tokens, target_tokens);
    }
}

std::string ConversationModel::generate_response(const std::string& user_input) {
    // Add user message to context
    context_manager_->add_user_message(user_input);

    // Get the full context
    std::string context = context_manager_->get_context();

    // Add the assistant role tag to prompt the model
    context += "<|assistant|>";

    // Tokenize context
    auto tokens = tokenizer_->encode(context);

    // Generate continuation
    auto generated_tokens = transformer_->generate(tokens, 100, 0.8);

    // Decode
    std::string response = tokenizer_->decode(generated_tokens);

    // Remove the context part to get just the new response
    if (response.find(context) == 0) {
        response = response.substr(context.length());
    }

    // Remove any trailing endoftext tokens
    size_t end_pos = response.find("<|endoftext|>");
    if (end_pos != std::string::npos) {
        response = response.substr(0, end_pos);
    }

    // Add the assistant response to context
    context_manager_->add_assistant_message(response);

    return response;
}

void ConversationModel::clear_context() {
    context_manager_->clear();
    if (!system_prompt_.empty()) {
        context_manager_->add_system_message(system_prompt_);
    }
}

void ConversationModel::set_system_prompt(const std::string& prompt) {
    system_prompt_ = prompt;
    clear_context();  // Reset context with the new system prompt
}

size_t ConversationModel::get_context_token_count() const {
    return context_manager_->get_token_count();
}

std::string ConversationModel::format_conversation(const std::vector<std::string>& turns) {
    std::stringstream ss;
    for (size_t i = 0; i < turns.size(); i++) {
        if (i % 2 == 0) {
            ss << "<|user|>" << turns[i] << "<|endoftext|>";
        } else {
            ss << "<|assistant|>" << turns[i] << "<|endoftext|>";
        }
    }
    return ss.str();
}

bool ConversationModel::save_model(const std::string& path) {
    return transformer_->save(path);
}

bool ConversationModel::load_model(const std::string& path) {
    return transformer_->load(path);
}

} // namespace lm
140
src/models/feed_forward (copy 1).cpp
Normal file
140
src/models/feed_forward (copy 1).cpp
Normal file
@ -0,0 +1,140 @@
#include "lm/models/feed_forward.hpp"
#include <cmath>
#include <iostream>
#include <random>

namespace lm {

FeedForward::FeedForward(size_t d_model, size_t d_ff, float dropout)
    : d_model_(d_model), d_ff_(d_ff), dropout_(dropout) {

    // Initialize weight matrices and biases
    w1_ = Tensor::xavier(std::vector<size_t>{d_model_, d_ff_});
    b1_ = Tensor::zeros(std::vector<size_t>{d_ff_});
    w2_ = Tensor::xavier(std::vector<size_t>{d_ff_, d_model_});
    b2_ = Tensor::zeros(std::vector<size_t>{d_model_});

    std::cout << "Initialized FeedForward with:\n";
    std::cout << "  d_model: " << d_model_ << "\n";
    std::cout << "  d_ff: " << d_ff_ << "\n";
    std::cout << "  dropout: " << dropout_ << "\n";
}

std::vector<Tensor> FeedForward::parameters() const {
    return {w1_, b1_, w2_, b2_};
}

void FeedForward::set_training(bool training) {
    training_ = training;
}

Tensor FeedForward::forward(const Tensor& input) const {
    // Get input dimensions
    size_t batch_size = input.shape()[0];
    size_t seq_len = input.shape()[1];

    // First linear transformation: input * w1 + b1
    Tensor hidden(std::vector<size_t>{batch_size, seq_len, d_ff_});

    // Strides for flat indexing into the 3-D tensors
    size_t input_stride_1 = d_model_;  // stride for one sequence position in input
    size_t hidden_stride_1 = d_ff_;    // stride for one sequence position in hidden

    for (size_t b = 0; b < batch_size; ++b) {
        for (size_t t = 0; t < seq_len; ++t) {
            for (size_t f = 0; f < d_ff_; ++f) {
                size_t hidden_index = b * seq_len * hidden_stride_1 +
                                      t * hidden_stride_1 + f;

                // Initialize with bias
                hidden(hidden_index) = b1_(f);

                for (size_t d = 0; d < d_model_; ++d) {
                    size_t input_index = b * seq_len * input_stride_1 +
                                         t * input_stride_1 + d;
                    hidden(hidden_index) += input(input_index) * w1_(d, f);
                }
            }
        }
    }

    // GELU activation
    hidden = gelu(hidden);

    // Apply dropout during training
    if (training_) {
        hidden = apply_dropout(hidden, dropout_);
    }

    // Second linear transformation: hidden * w2 + b2
    Tensor output(std::vector<size_t>{batch_size, seq_len, d_model_});
    size_t output_stride_1 = d_model_;  // stride for one sequence position in output

    for (size_t b = 0; b < batch_size; ++b) {
        for (size_t t = 0; t < seq_len; ++t) {
            for (size_t d = 0; d < d_model_; ++d) {
                size_t output_index = b * seq_len * output_stride_1 +
                                      t * output_stride_1 + d;

                // Initialize with bias
                output(output_index) = b2_(d);

                for (size_t f = 0; f < d_ff_; ++f) {
                    size_t hidden_index = b * seq_len * hidden_stride_1 +
                                          t * hidden_stride_1 + f;
                    output(output_index) += hidden(hidden_index) * w2_(f, d);
                }
            }
        }
    }

    return output;
}

Tensor FeedForward::gelu(const Tensor& input) const {
    // Tanh approximation of GELU: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    const float sqrt_2_over_pi = std::sqrt(2.0f / static_cast<float>(M_PI));
    Tensor result(input.shape());

    for (size_t i = 0; i < input.size(); ++i) {
        float x = input(i);
        float x_cubed = x * x * x;
        result(i) = 0.5f * x * (1.0f + std::tanh(sqrt_2_over_pi * (x + 0.044715f * x_cubed)));
    }

    return result;
}

Tensor FeedForward::apply_dropout(const Tensor& input, float dropout_rate) const {
    if (dropout_rate <= 0.0f) return input;

    Tensor output = input;
    std::random_device rd;
    std::mt19937 gen(rd());
    std::bernoulli_distribution dist(1.0f - dropout_rate);

    // Inverted dropout: zero each element with probability dropout_rate and
    // rescale the survivors so the expected activation is unchanged.
    for (size_t i = 0; i < output.size(); ++i) {
        if (!dist(gen)) {
            output(i) = 0.0f;
        } else {
            output(i) /= (1.0f - dropout_rate);
        }
    }

    return output;
}

} // namespace lm
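// Side note (illustrative only): the triple loops above compute one matrix
// product per sequence position. Flattening (batch, seq) into rows turns each
// projection into a single GEMM. A minimal sketch with Eigen, assuming the
// activations can be viewed as a (batch*seq) x d_model row-major matrix:
//
//     #include <cmath>
//     #include <eigen3/Eigen/Dense>
//
//     Eigen::MatrixXf ffn_as_gemm(const Eigen::MatrixXf& x,   // (batch*seq) x d_model
//                                 const Eigen::MatrixXf& w1,  // d_model x d_ff
//                                 const Eigen::VectorXf& b1,
//                                 const Eigen::MatrixXf& w2,  // d_ff x d_model
//                                 const Eigen::VectorXf& b2) {
//         Eigen::MatrixXf h = (x * w1).rowwise() + b1.transpose();
//         h = h.unaryExpr([](float v) {   // GELU, tanh approximation
//             const float c = std::sqrt(2.0f / static_cast<float>(M_PI));
//             return 0.5f * v * (1.0f + std::tanh(c * (v + 0.044715f * v * v * v)));
//         });
//         return (h * w2).rowwise() + b2.transpose();
//     }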
65
src/models/transformer_block (copy 1).cpp
Normal file
@ -0,0 +1,65 @@
#include "lm/models/transformer_block.hpp"
#include <iostream>

namespace lm {

TransformerBlock::TransformerBlock(size_t d_model, size_t num_heads, size_t d_ff, float dropout)
    : d_model_(d_model), num_heads_(num_heads), d_ff_(d_ff), dropout_(dropout) {

    // Initialize multi-head attention
    attention_ = std::make_unique<MultiHeadAttention>(d_model, num_heads, dropout);

    // Initialize feed-forward network
    feed_forward_ = std::make_unique<FeedForward>(d_model, d_ff, dropout);

    // Initialize layer normalization
    norm1_ = std::make_unique<LayerNorm>(d_model);
    norm2_ = std::make_unique<LayerNorm>(d_model);

    std::cout << "Initialized TransformerBlock with:\n";
    std::cout << "  d_model: " << d_model_ << "\n";
    std::cout << "  num_heads: " << num_heads_ << "\n";
    std::cout << "  d_ff: " << d_ff_ << "\n";
    std::cout << "  dropout: " << dropout_ << "\n";
}

std::vector<Tensor> TransformerBlock::parameters() const {
    std::vector<Tensor> params;

    // Add attention parameters
    auto attention_params = attention_->parameters();
    params.insert(params.end(), attention_params.begin(), attention_params.end());

    // Add feed-forward parameters
    auto ff_params = feed_forward_->parameters();
    params.insert(params.end(), ff_params.begin(), ff_params.end());

    // Add layer norm parameters
    auto norm1_params = norm1_->parameters();
    params.insert(params.end(), norm1_params.begin(), norm1_params.end());

    auto norm2_params = norm2_->parameters();
    params.insert(params.end(), norm2_params.begin(), norm2_params.end());

    return params;
}

void TransformerBlock::set_training(bool training) {
    training_ = training;
    attention_->set_training(training);
    feed_forward_->set_training(training);
}

Tensor TransformerBlock::forward(const Tensor& input, const Tensor& mask) const {
    // Self-attention with residual connection (post-norm)
    Tensor attention_output = attention_->forward(input, input, input, mask);
    Tensor norm1_output = norm1_->forward(input + attention_output);

    // Feed-forward with residual connection (post-norm)
    Tensor ff_output = feed_forward_->forward(norm1_output);
    Tensor output = norm2_->forward(norm1_output + ff_output);

    return output;
}

} // namespace lm
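// Usage sketch (illustrative smoke test, not part of the commit): shapes
// follow the (batch, seq_len, d_model) convention used by FeedForward above;
// the mask shape is an assumption.
//
//     lm::TransformerBlock block(/*d_model=*/64, /*num_heads=*/4,
//                                /*d_ff=*/256, /*dropout=*/0.1f);
//     block.set_training(false);  // disable dropout for a deterministic pass
//     lm::Tensor input(std::vector<size_t>{2, 8, 64});  // batch=2, seq=8
//     lm::Tensor mask(std::vector<size_t>{8, 8});       // assumed causal mask shape
//     lm::Tensor output = block.forward(input, mask);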
353
src/models/transformer_model.cpp
Normal file
@ -0,0 +1,353 @@
// transformer_model.cpp
#include "transformer_model.hpp"
#include <eigen3/Eigen/Dense>
#include <vector>
#include <memory>
#include <random>
#include <cmath>
#include <algorithm>
#include <iostream>  // added: std::cout/std::endl are used below

namespace lm {

// Helper function for layer normalization
Eigen::VectorXf layer_norm(const Eigen::VectorXf& x, const Eigen::VectorXf& gamma,
                           const Eigen::VectorXf& beta, float eps = 1e-5f) {
    float mean = x.mean();
    float var = (x.array() - mean).square().sum() / static_cast<float>(x.size());
    return (gamma.array() * ((x.array() - mean) / std::sqrt(var + eps)) + beta.array()).matrix();
}

// Helper function for a numerically stable softmax
Eigen::VectorXf softmax(const Eigen::VectorXf& x) {
    Eigen::VectorXf exp_x = (x.array() - x.maxCoeff()).exp().matrix();
    return exp_x / exp_x.sum();
}

// Implementation details
struct TransformerModel::Impl {
    // Embedding layers
    Eigen::MatrixXf token_embedding;
    Eigen::MatrixXf position_embedding;

    // Transformer blocks
    struct TransformerBlock {
        // Self-attention
        Eigen::MatrixXf w_q, w_k, w_v, w_o;
        Eigen::VectorXf attn_gamma, attn_beta;

        // Feed-forward
        Eigen::MatrixXf w_ff1, w_ff2;
        Eigen::VectorXf ff_gamma, ff_beta;

        // Dropout
        float dropout_rate;
    };

    std::vector<TransformerBlock> blocks;

    // Final layers
    Eigen::MatrixXf lm_head;
    Eigen::VectorXf final_gamma, final_beta;

    // Model parameters
    size_t vocab_size;
    size_t d_model;
    size_t n_layers;
    size_t n_heads;
    size_t d_ff;
    float dropout;

    // Random number generator
    std::mt19937 rng;
    std::uniform_real_distribution<float> dist;

    Impl(size_t vocab_size, size_t d_model, size_t n_layers,
         size_t n_heads, size_t d_ff, float dropout)
        : vocab_size(vocab_size), d_model(d_model), n_layers(n_layers),
          n_heads(n_heads), d_ff(d_ff), dropout(dropout),
          rng(std::random_device{}()), dist(0.0f, 1.0f) {
        initialize_weights();
    }

    void initialize_weights() {
        // Initialize embeddings
        float scale = std::sqrt(static_cast<float>(d_model));
        token_embedding = Eigen::MatrixXf::Random(vocab_size, d_model) * scale;
        position_embedding = Eigen::MatrixXf::Random(10000, d_model) * scale;

        // Initialize transformer blocks (0.02f literals avoid mixing
        // double scalars with float matrices)
        blocks.resize(n_layers);
        for (auto& block : blocks) {
            // Attention weights
            block.w_q = Eigen::MatrixXf::Random(d_model, d_model) * 0.02f;
            block.w_k = Eigen::MatrixXf::Random(d_model, d_model) * 0.02f;
            block.w_v = Eigen::MatrixXf::Random(d_model, d_model) * 0.02f;
            block.w_o = Eigen::MatrixXf::Random(d_model, d_model) * 0.02f;
            block.attn_gamma = Eigen::VectorXf::Ones(d_model);
            block.attn_beta = Eigen::VectorXf::Zero(d_model);

            // Feed-forward weights
            block.w_ff1 = Eigen::MatrixXf::Random(d_model, d_ff) * 0.02f;
            block.w_ff2 = Eigen::MatrixXf::Random(d_ff, d_model) * 0.02f;
            block.ff_gamma = Eigen::VectorXf::Ones(d_model);
            block.ff_beta = Eigen::VectorXf::Zero(d_model);

            block.dropout_rate = dropout;
        }

        // Initialize final layers
        lm_head = Eigen::MatrixXf::Random(d_model, vocab_size) * 0.02f;
        final_gamma = Eigen::VectorXf::Ones(d_model);
        final_beta = Eigen::VectorXf::Zero(d_model);
    }

    Eigen::MatrixXf self_attention(const Eigen::MatrixXf& x,
                                   const Eigen::MatrixXf& w_q,
                                   const Eigen::MatrixXf& w_k,
                                   const Eigen::MatrixXf& w_v,
                                   const Eigen::MatrixXf& w_o,
                                   bool is_training = true) {
        size_t seq_len = x.rows();

        // Compute queries, keys, values
        Eigen::MatrixXf q = x * w_q;
        Eigen::MatrixXf k = x * w_k;
        Eigen::MatrixXf v = x * w_v;

        // Scale and compute attention scores
        Eigen::MatrixXf scores = q * k.transpose() / std::sqrt(static_cast<float>(d_model));

        // Apply causal mask: position i may not attend to positions j > i
        for (size_t i = 0; i < seq_len; i++) {
            for (size_t j = i + 1; j < seq_len; j++) {
                scores(i, j) = -1e9f;
            }
        }

        // Apply softmax row by row
        Eigen::MatrixXf attention(seq_len, seq_len);
        for (size_t i = 0; i < seq_len; i++) {
            attention.row(i) = softmax(scores.row(i).transpose()).transpose();
        }

        // Apply dropout during training
        if (is_training) {
            for (Eigen::Index i = 0; i < attention.size(); i++) {
                if (dist(rng) < dropout) {
                    attention(i) = 0.0f;
                }
            }
        }

        // Apply attention to values
        Eigen::MatrixXf output = attention * v;

        // Apply output projection
        output = output * w_o;

        return output;
    }

    Eigen::MatrixXf feed_forward(const Eigen::MatrixXf& x,
                                 const Eigen::MatrixXf& w1,
                                 const Eigen::MatrixXf& w2,
                                 bool is_training = true) {
        // First linear layer
        Eigen::MatrixXf h = x * w1;

        // GELU activation (tanh approximation) with proper float types
        h = h.unaryExpr([](float x_val) {
            const float sqrt_2_over_pi = std::sqrt(2.0f / static_cast<float>(M_PI));
            const float x_cubed = x_val * x_val * x_val;
            return 0.5f * x_val * (1.0f + std::tanh(sqrt_2_over_pi * (x_val + 0.044715f * x_cubed)));
        });

        // Apply dropout during training
        if (is_training) {
            for (Eigen::Index i = 0; i < h.size(); i++) {
                if (dist(rng) < dropout) {
                    h(i) = 0.0f;
                }
            }
        }

        // Second linear layer
        Eigen::MatrixXf output = h * w2;

        return output;
    }

    std::vector<float> forward(const std::vector<TokenID>& input_tokens, bool is_training = true) {
        size_t seq_len = input_tokens.size();

        // Create token embeddings
        Eigen::MatrixXf embeddings(seq_len, d_model);
        for (size_t i = 0; i < seq_len; i++) {
            embeddings.row(i) = token_embedding.row(input_tokens[i]);
        }

        // Add position embeddings
        for (size_t i = 0; i < seq_len; i++) {
            if (i < 10000) {  // limit to precomputed positions
                embeddings.row(i) += position_embedding.row(i);
            }
        }

        // Apply transformer blocks
        Eigen::MatrixXf x = embeddings;
        for (auto& block : blocks) {
            // Self-attention
            Eigen::MatrixXf attn_output = self_attention(x, block.w_q, block.w_k,
                                                         block.w_v, block.w_o, is_training);

            // Residual connection and layer norm
            x = x + attn_output;
            for (size_t i = 0; i < seq_len; i++) {
                x.row(i) = layer_norm(x.row(i).transpose(), block.attn_gamma,
                                      block.attn_beta).transpose();
            }

            // Feed-forward
            Eigen::MatrixXf ff_output = feed_forward(x, block.w_ff1, block.w_ff2, is_training);

            // Residual connection and layer norm
            x = x + ff_output;
            for (size_t i = 0; i < seq_len; i++) {
                x.row(i) = layer_norm(x.row(i).transpose(), block.ff_gamma,
                                      block.ff_beta).transpose();
            }
        }

        // Final layer norm
        for (size_t i = 0; i < seq_len; i++) {
            x.row(i) = layer_norm(x.row(i).transpose(), final_gamma, final_beta).transpose();
        }

        // Language model head
        Eigen::MatrixXf logits = x * lm_head;

        // Flatten to a vector in row-major (position-major) order, which is
        // what calculate_loss and generate index into. Eigen matrices are
        // column-major by default, so copy element-wise rather than from
        // the raw data pointer.
        std::vector<float> result(seq_len * vocab_size);
        for (size_t i = 0; i < seq_len; i++) {
            for (size_t j = 0; j < vocab_size; j++) {
                result[i * vocab_size + j] = logits(i, j);
            }
        }
        return result;
    }
};

// TransformerModel implementation
TransformerModel::TransformerModel(size_t vocab_size, size_t d_model,
                                   size_t n_layers, size_t n_heads,
                                   size_t d_ff, float dropout)
    : vocab_size_(vocab_size), d_model_(d_model), n_layers_(n_layers),
      n_heads_(n_heads), d_ff_(d_ff), dropout_(dropout) {
    pimpl_ = std::make_unique<Impl>(vocab_size, d_model, n_layers,
                                    n_heads, d_ff, dropout);
}

TransformerModel::~TransformerModel() = default;

std::vector<float> TransformerModel::forward(const std::vector<TokenID>& input_tokens) {
    return pimpl_->forward(input_tokens, false);  // false = inference mode
}

void TransformerModel::train_step(const std::vector<TokenID>& input_tokens,
                                  const std::vector<TokenID>& target_tokens) {
    // Forward pass
    auto logits = pimpl_->forward(input_tokens, true);  // true = training mode

    // Calculate loss
    float loss = calculate_loss(logits, target_tokens);

    // The backward pass is not implemented yet; a full implementation would
    // backpropagate through the blocks and update the weights here.
    std::cout << "Training step - Loss: " << loss << std::endl;
}

float TransformerModel::calculate_loss(const std::vector<float>& logits,
                                       const std::vector<TokenID>& targets) {
    // Cross-entropy loss averaged over positions
    float loss = 0.0f;
    size_t seq_len = targets.size();
    size_t vocab_size = vocab_size_;

    for (size_t i = 0; i < seq_len; i++) {
        // Logits for this position
        const float* pos_logits = &logits[i * vocab_size];

        // Log-sum-exp for a numerically stable softmax
        float max_logit = *std::max_element(pos_logits, pos_logits + vocab_size);
        float sum_exp = 0.0f;
        for (size_t j = 0; j < vocab_size; j++) {
            sum_exp += std::exp(pos_logits[j] - max_logit);
        }

        // Cross-entropy for this position
        float log_prob = pos_logits[targets[i]] - max_logit - std::log(sum_exp);
        loss -= log_prob;
    }

    return loss / seq_len;
}

std::vector<TokenID> TransformerModel::generate(const std::vector<TokenID>& context,
                                                size_t max_length, float temperature) {
    std::vector<TokenID> result = context;

    for (size_t i = 0; i < max_length; i++) {
        // Forward pass over the whole sequence so far
        auto logits = pimpl_->forward(result, false);

        // Logits for the last position
        size_t vocab_size = vocab_size_;
        const float* last_logits = &logits[(result.size() - 1) * vocab_size];

        // Apply temperature
        std::vector<float> scaled_logits(vocab_size);
        for (size_t j = 0; j < vocab_size; j++) {
            scaled_logits[j] = last_logits[j] / temperature;
        }

        // Softmax (numerically stable)
        float max_logit = *std::max_element(scaled_logits.begin(), scaled_logits.end());
        float sum_exp = 0.0f;
        for (size_t j = 0; j < vocab_size; j++) {
            sum_exp += std::exp(scaled_logits[j] - max_logit);
        }

        std::vector<float> probs(vocab_size);
        for (size_t j = 0; j < vocab_size; j++) {
            probs[j] = std::exp(scaled_logits[j] - max_logit) / sum_exp;
        }

        // Sample a token from the distribution
        std::discrete_distribution<size_t> dist(probs.begin(), probs.end());
        size_t next_token = dist(pimpl_->rng);

        result.push_back(static_cast<TokenID>(next_token));

        // Stop if we generate the end-of-text token
        if (next_token == 2) {  // assuming ID 2 is end-of-text
            break;
        }
    }

    return result;
}

bool TransformerModel::save(const std::string& filename) {
    // A full implementation would serialize all weights here
    std::cout << "Model saved to " << filename << std::endl;
    return true;
}

bool TransformerModel::load(const std::string& filename) {
    // A full implementation would deserialize all weights here
    std::cout << "Model loaded from " << filename << std::endl;
    return true;
}

} // namespace lm
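// Usage sketch (illustrative; token IDs and sizes are made up, and the
// end-of-text ID 2 matches the assumption hard-coded in generate above):
//
//     lm::TransformerModel model(/*vocab_size=*/1000, /*d_model=*/64,
//                                /*n_layers=*/2, /*n_heads=*/4,
//                                /*d_ff=*/256, /*dropout=*/0.1f);
//     std::vector<lm::TokenID> context = {5, 42, 7};  // illustrative IDs
//     auto tokens = model.generate(context, /*max_length=*/16, /*temperature=*/0.8f);
//     for (auto id : tokens) std::cout << id << ' ';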
85
src/optimizers/adam (copy 1).cpp
Normal file
@ -0,0 +1,85 @@
// src/optimizers/adam.cpp
#include "lm/optimizers/adam.hpp"
#include <cereal/archives/binary.hpp>  // assumed include: cereal archives are used below
#include <fstream>
#include <iostream>
#include <cmath>

namespace lm {

AdamOptimizer::AdamOptimizer(float lr, float b1, float b2, float eps)
    : learning_rate(lr), beta1(b1), beta2(b2), epsilon(eps), t(0) {}

void AdamOptimizer::initialize_moments(const std::vector<Tensor>& parameters) {
    m.clear();
    v.clear();

    for (const auto& param : parameters) {
        // Create zero tensors with the same shape as the parameters
        m.push_back(Tensor::zeros(param.shape(), false));
        v.push_back(Tensor::zeros(param.shape(), false));
    }
}

void AdamOptimizer::update(std::vector<Tensor>& parameters,
                           const std::vector<Tensor>& gradients) {
    // Initialize moments lazily if needed
    if (m.empty() || v.empty()) {
        initialize_moments(parameters);
    }

    t++;

    for (size_t i = 0; i < parameters.size(); i++) {
        if (!parameters[i].requires_grad()) continue;

        // Update biased first moment estimate: m = beta1*m + (1-beta1)*g
        m[i] = m[i] * beta1 + gradients[i] * (1.0f - beta1);

        // Update biased second raw moment estimate: v = beta2*v + (1-beta2)*g^2
        Tensor grad_squared = gradients[i] * gradients[i];
        v[i] = v[i] * beta2 + grad_squared * (1.0f - beta2);

        // Compute bias-corrected first moment estimate
        float bias_correction1 = 1.0f - std::pow(beta1, t);
        Tensor m_hat = m[i] / bias_correction1;

        // Compute bias-corrected second raw moment estimate
        float bias_correction2 = 1.0f - std::pow(beta2, t);
        Tensor v_hat = v[i] / bias_correction2;

        // Parameter step: theta -= lr * m_hat / (sqrt(v_hat) + eps)
        Tensor update = m_hat / (v_hat.sqrt() + epsilon);
        parameters[i].data() = parameters[i].data() - learning_rate * update.data();
    }
}

void AdamOptimizer::reset() {
    m.clear();
    v.clear();
    t = 0;
}

void AdamOptimizer::save_state(const std::string& path) const {
    try {
        std::ofstream ofs(path, std::ios::binary);
        cereal::BinaryOutputArchive archive(ofs);
        archive(*this);
    } catch (const std::exception& e) {
        std::cerr << "Error saving AdamOptimizer state: " << e.what() << std::endl;
        throw;
    }
}

void AdamOptimizer::load_state(const std::string& path) {
    try {
        std::ifstream ifs(path, std::ios::binary);
        cereal::BinaryInputArchive archive(ifs);
        archive(*this);
    } catch (const std::exception& e) {
        std::cerr << "Error loading AdamOptimizer state: " << e.what() << std::endl;
        throw;
    }
}

} // namespace lm
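// Usage sketch (illustrative; gradients are stubbed with zeros, since
// backpropagation is not implemented in this commit). The Tensor
// constructors follow the forms used elsewhere in this commit:
//
//     std::vector<lm::Tensor> params = { lm::Tensor({4, 4}, /*requires_grad=*/true) };
//     std::vector<lm::Tensor> grads  = { lm::Tensor::zeros(params[0].shape(), false) };
//
//     lm::AdamOptimizer opt(0.001f, 0.9f, 0.999f, 1e-8f);
//     opt.update(params, grads);          // moments initialize lazily on first call
//     opt.save_state("adam_state.bin");   // file name illustrative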
169
src/performance_test (copy 1).cpp
Normal file
@ -0,0 +1,169 @@
#include "lm/tokenizer/bpe_tokenizer.hpp"
#include <iostream>
#include <vector>
#include <chrono>
#include <fstream>
#include <random>
#include <algorithm>
#include <sstream>  // for std::istringstream
#include <cstdio>   // added: for remove()

// Generate random text for testing
std::vector<std::string> generate_test_corpus(size_t num_sentences, size_t min_words, size_t max_words) {
    std::vector<std::string> common_words = {
        "the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog",
        "artificial", "intelligence", "machine", "learning", "deep", "neural", "network",
        "language", "model", "transformer", "attention", "mechanism", "tokenization",
        "byte", "pair", "encoding", "subword", "vocabulary", "training", "inference"
    };

    std::vector<std::string> corpus;
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<> word_count_dist(static_cast<int>(min_words),
                                                    static_cast<int>(max_words));
    std::uniform_int_distribution<> word_index_dist(0, static_cast<int>(common_words.size()) - 1);

    for (size_t i = 0; i < num_sentences; ++i) {
        int word_count = word_count_dist(gen);
        std::string sentence;

        for (int j = 0; j < word_count; ++j) {
            if (!sentence.empty()) {
                sentence += " ";
            }
            sentence += common_words[word_index_dist(gen)];
        }

        corpus.push_back(sentence);
    }

    return corpus;
}

// Measure peak memory usage (Linux-specific; returns 0 elsewhere).
// Note: VmPeak is the process-lifetime peak, so before/after deltas
// give only a rough indication of what a single phase used.
size_t get_peak_memory_usage() {
#ifdef __linux__
    std::ifstream status("/proc/self/status");
    std::string line;
    while (std::getline(status, line)) {
        if (line.compare(0, 6, "VmPeak") == 0) {
            std::istringstream iss(line);
            std::string key;
            size_t value;
            std::string unit;
            iss >> key >> value >> unit;
            if (unit == "kB") {
                return value * 1024;  // convert to bytes
            }
        }
    }
#endif
    return 0;
}

void run_performance_test() {
    std::cout << "=== BPE Tokenizer Performance Test ===\n";

    // Test different corpus and vocabulary sizes
    std::vector<size_t> corpus_sizes = {100, 1000, 5000};
    std::vector<size_t> vocab_sizes = {500, 1000, 2000};

    for (size_t corpus_size : corpus_sizes) {
        for (size_t vocab_size : vocab_sizes) {
            std::cout << "\n--- Test Configuration: " << corpus_size
                      << " sentences, " << vocab_size << " vocabulary ---\n";

            // Generate test corpus
            auto corpus = generate_test_corpus(corpus_size, 5, 15);

            // Measure training performance
            auto start_time = std::chrono::high_resolution_clock::now();
            size_t start_memory = get_peak_memory_usage();

            lm::BPETokenizer tokenizer;
            try {
                tokenizer.train(corpus, vocab_size);

                auto end_time = std::chrono::high_resolution_clock::now();
                size_t end_memory = get_peak_memory_usage();

                auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
                    end_time - start_time);
                size_t memory_used = (end_memory - start_memory) / (1024 * 1024);

                std::cout << "Training time: " << duration.count() << " ms\n";
                std::cout << "Peak memory used: " << memory_used << " MB\n";
                std::cout << "Final vocabulary size: " << tokenizer.vocab_size() << "\n";

                // Measure encoding performance
                std::vector<std::string> test_texts = {
                    "the quick brown fox jumps over the lazy dog",
                    "artificial intelligence and machine learning",
                    "transformer language model with attention mechanism"
                };

                auto encode_start = std::chrono::high_resolution_clock::now();
                size_t total_tokens = 0;

                for (const auto& text : test_texts) {
                    auto tokens = tokenizer.encode(text);
                    total_tokens += tokens.size();

                    // Verify round-trip
                    std::string decoded = tokenizer.decode(tokens);
                    if (text != decoded) {
                        std::cout << "WARNING: Round-trip mismatch!\n";
                        std::cout << "Original: " << text << "\n";
                        std::cout << "Decoded: " << decoded << "\n";
                    }
                }

                auto encode_end = std::chrono::high_resolution_clock::now();
                auto encode_duration = std::chrono::duration_cast<std::chrono::microseconds>(
                    encode_end - encode_start);

                double encode_time_per_token = static_cast<double>(encode_duration.count()) / total_tokens;

                std::cout << "Encoding performance: " << encode_time_per_token << " μs/token\n";
                std::cout << "Total tokens processed: " << total_tokens << "\n";

            } catch (const std::exception& e) {
                std::cout << "Error during training: " << e.what() << "\n";
            }
        }
    }

    // Test serialization performance
    std::cout << "\n--- Serialization Performance Test ---\n";
    auto corpus = generate_test_corpus(1000, 5, 15);
    lm::BPETokenizer tokenizer;
    tokenizer.train(corpus, 1000);

    auto start_time = std::chrono::high_resolution_clock::now();
    tokenizer.save("test_model.bpe");
    auto save_time = std::chrono::duration_cast<std::chrono::microseconds>(
        std::chrono::high_resolution_clock::now() - start_time);

    start_time = std::chrono::high_resolution_clock::now();
    lm::BPETokenizer loaded_tokenizer;
    loaded_tokenizer.load("test_model.bpe");
    auto load_time = std::chrono::duration_cast<std::chrono::microseconds>(
        std::chrono::high_resolution_clock::now() - start_time);

    std::cout << "Model save time: " << save_time.count() << " μs\n";
    std::cout << "Model load time: " << load_time.count() << " μs\n";

    // Clean up
    remove("test_model.bpe");
}

int main() {
    try {
        run_performance_test();
        std::cout << "\n=== Performance Test Completed ===\n";
    } catch (const std::exception& e) {
        std::cerr << "Performance test failed: " << e.what() << "\n";
        return 1;
    }

    return 0;
}
123
src/runtime/init (copy 1).cpp
Executable file
@ -0,0 +1,123 @@
// src/runtime/init.cpp - runtime initialization for the framework
#include "lm/runtime/init.hpp"
#include <fstream>
#include <stdexcept>

namespace lm::runtime {

namespace {

// Private implementation details
SystemState* g_instance = nullptr;

bool initialize_tokenizer(const nlohmann::json& config) {
    // TODO: implement actual tokenizer initialization.
    // The caller passes the "tokenizer" sub-config itself, so check that it
    // is a non-null object rather than looking for a nested "tokenizer" key
    // (which would never exist and made initialization always fail).
    return !config.is_null();
}

bool initialize_model(const nlohmann::json& config) {
    // TODO: implement actual model initialization.
    // As above, the caller passes the "model" sub-config itself.
    return !config.is_null();
}

} // anonymous namespace

SystemState& SystemState::get_instance() {
    if (!g_instance) {
        g_instance = new SystemState();
    }
    return *g_instance;
}

void SystemState::initialize(const std::filesystem::path& config_path) {
    try {
        // Load JSON config
        std::ifstream f(config_path);
        if (!f.is_open()) {
            throw std::runtime_error("Cannot open config file: " + config_path.string());
        }

        config_ = nlohmann::json::parse(f);

        // Validate required fields
        if (!config_.contains("tokenizer") || !config_.contains("model")) {
            throw std::runtime_error("Invalid config: missing required sections");
        }

        // Initialize subsystems
        tokenizer_ready_ = initialize_tokenizer(config_["tokenizer"]);
        model_loaded_ = initialize_model(config_["model"]);

        if (!tokenizer_ready_) {
            throw std::runtime_error("Tokenizer initialization failed");
        }

        if (!model_loaded_) {
            throw std::runtime_error("Model initialization failed");
        }

    } catch (const std::exception& e) {
        throw std::runtime_error("Initialization failed: " + std::string(e.what()));
    }
}

const nlohmann::json& SystemState::config() const noexcept {
    return config_;
}

std::string SystemState::get_string(const std::string& key) const {
    if (!config_.contains(key)) {
        throw std::runtime_error("Config key not found: " + key);
    }

    if (!config_[key].is_string()) {
        throw std::runtime_error("Config value is not a string: " + key);
    }

    return config_[key].get<std::string>();
}

int SystemState::get_int(const std::string& key, int default_val) const {
    if (!config_.contains(key)) {
        return default_val;
    }

    if (!config_[key].is_number()) {
        throw std::runtime_error("Config value is not a number: " + key);
    }

    return config_[key].get<int>();
}

bool SystemState::is_tokenizer_ready() const noexcept {
    return tokenizer_ready_;
}

bool SystemState::is_model_loaded() const noexcept {
    return model_loaded_;
}

} // namespace lm::runtime

/*
This implementation provides:

1. A lazily constructed singleton (note: first access is not yet thread-safe;
   a function-local static or std::call_once would be needed for that)
2. JSON configuration loading with error handling
3. Subsystem initialization stubs for the tokenizer and model
4. Type-safe configuration access with descriptive error reporting
5. State tracking for framework components

The tokenizer and model initialization functions are currently stubbed and
can be expanded into real implementations as the framework develops.
*/
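// A minimal configuration that satisfies the validation above (both required
// sections present). The field names inside each section are illustrative,
// since the initializers currently only check that the sections exist:
//
//     // config.json:
//     // {
//     //   "tokenizer": { "type": "bpe", "vocab_size": 1000 },
//     //   "model":     { "layers": 2, "dim": 64 }
//     // }
//
//     auto& state = lm::runtime::SystemState::get_instance();
//     state.initialize("config.json");
//     std::cout << "tokenizer ready: " << state.is_tokenizer_ready() << "\n";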
159
src/runtime/shutdown (copy 1).cpp
Normal file
@ -0,0 +1,159 @@
#include "lm/runtime/shutdown.hpp"
#include "lm/runtime/init.hpp"
#include "lm/tokenizer/bpe_tokenizer.hpp"
#include <chrono>  // added: std::chrono timestamps are used below
#include <fstream>
#include <vector>
#include <mutex>
#include <sstream>
#include <iostream>

namespace lm::runtime {

namespace {
std::vector<void (*)()> cleanup_functions;
std::mutex cleanup_mutex;
}

// Serialize tokenizer state to JSON
nlohmann::json serialize_tokenizer_state() {
    auto& system_state = SystemState::get_instance();
    nlohmann::json tokenizer_state;

    // Get tokenizer configuration from system state
    try {
        const auto& config = system_state.config();
        if (config.contains("tokenizer")) {
            tokenizer_state = config["tokenizer"];
        }

        // Add runtime information
        tokenizer_state["runtime"] = {
            {"initialized", system_state.is_tokenizer_ready()},
            {"timestamp", std::chrono::system_clock::now().time_since_epoch().count()}
        };

    } catch (const std::exception& e) {
        tokenizer_state["error"] = std::string("Failed to serialize tokenizer state: ") + e.what();
    }

    return tokenizer_state;
}

// Serialize model state to JSON
nlohmann::json serialize_model_state(bool include_weights) {
    auto& system_state = SystemState::get_instance();
    nlohmann::json model_state;

    try {
        const auto& config = system_state.config();
        if (config.contains("model")) {
            model_state = config["model"];
        }

        // Add runtime information
        model_state["runtime"] = {
            {"loaded", system_state.is_model_loaded()},
            {"timestamp", std::chrono::system_clock::now().time_since_epoch().count()}
        };

        if (include_weights) {
            // Placeholder for actual weight serialization
            model_state["weights"] = {
                {"serialized", false},
                {"message", "Weight serialization not yet implemented"}
            };
        }

    } catch (const std::exception& e) {
        model_state["error"] = std::string("Failed to serialize model state: ") + e.what();
    }

    return model_state;
}

// Serialize threading state to JSON
nlohmann::json serialize_thread_pool_stats() {
    nlohmann::json threading_state;

    try {
        // Placeholder for actual thread pool statistics;
        // these would normally come from ThreadPool::get_stats()
        threading_state = {
            {"active_threads", 0},
            {"queued_tasks", 0},
            {"completed_tasks", 0},
            {"thread_pool_initialized", false}
        };

    } catch (const std::exception& e) {
        threading_state["error"] = std::string("Failed to serialize threading state: ") + e.what();
    }

    return threading_state;
}

void ShutdownHandler::save_state(
    const std::filesystem::path& output_path,
    bool include_model_weights)
{
    try {
        nlohmann::json state;

        // Capture framework state
        auto& system_state = SystemState::get_instance();

        // Add system configuration
        state["config"] = system_state.config();

        // Add component states
        state["tokenizer"] = serialize_tokenizer_state();
        state["model"] = serialize_model_state(include_model_weights);
        state["threading"] = serialize_thread_pool_stats();

        // Add shutdown metadata
        state["metadata"] = {
            {"shutdown_time", std::chrono::system_clock::now().time_since_epoch().count()},
            {"include_weights", include_model_weights},
            {"version", "0.1.0"},
            {"format_version", 1}
        };

        // Write to file
        std::ofstream file(output_path);
        if (!file.is_open()) {
            throw std::runtime_error("Cannot open file for writing: " + output_path.string());
        }

        file << state.dump(2);  // pretty-print with 2-space indentation
        file.close();

        std::cout << "Framework state saved to: " << output_path << std::endl;

    } catch (const std::exception& e) {
        throw std::runtime_error("Failed to save state: " + std::string(e.what()));
    }
}

void ShutdownHandler::register_cleanup(void (*func)()) {
    std::lock_guard<std::mutex> lock(cleanup_mutex);
    cleanup_functions.push_back(func);
}

void ShutdownHandler::execute_cleanup() {
    std::lock_guard<std::mutex> lock(cleanup_mutex);

    // Execute cleanup functions in reverse order (LIFO)
    for (auto it = cleanup_functions.rbegin(); it != cleanup_functions.rend(); ++it) {
        try {
            (*it)();
        } catch (const std::exception& e) {
            // Log the error but continue with the other cleanup functions
            std::cerr << "Cleanup function error: " << e.what() << std::endl;
        }
    }

    cleanup_functions.clear();
}

} // namespace lm::runtime
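// Usage sketch (illustrative; assumes ShutdownHandler's methods are static,
// as the definitions above suggest, and uses a hypothetical cleanup hook).
// A capture-less lambda converts to the void(*)() pointer register_cleanup takes:
//
//     lm::runtime::ShutdownHandler::register_cleanup([] {
//         std::cout << "releasing resources\n";  // handlers run in LIFO order
//     });
//     lm::runtime::ShutdownHandler::save_state("state.json",
//                                              /*include_model_weights=*/false);
//     lm::runtime::ShutdownHandler::execute_cleanup();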
81
src/runtime/state_utils (copy 1).cpp
Normal file
@ -0,0 +1,81 @@
#include "lm/runtime/shutdown.hpp"
#include "lm/runtime/init.hpp"
#include <iomanip>
#include <ctime>
#include <sstream>  // added: std::ostringstream is used below

namespace lm::runtime {

// Helper function to format a nanosecond timestamp
std::string format_timestamp(int64_t timestamp_ns) {
    std::time_t time = timestamp_ns / 1000000000;
    std::tm* tm_ptr = std::localtime(&time);

    if (tm_ptr) {
        std::ostringstream oss;
        oss << std::put_time(tm_ptr, "%Y-%m-%d %H:%M:%S");
        return oss.str();
    }
    return "invalid_timestamp";
}

// Generate a comprehensive state report
std::string generate_state_report(const nlohmann::json& state) {
    std::ostringstream report;

    report << "=== LM Framework State Report ===\n\n";

    // Basic information
    if (state.contains("metadata")) {
        const auto& metadata = state["metadata"];
        report << "Shutdown Time: ";
        if (metadata.contains("shutdown_time")) {
            report << format_timestamp(metadata["shutdown_time"].get<int64_t>());
        } else {
            report << "unknown";
        }
        report << "\nVersion: " << metadata.value("version", "unknown") << "\n\n";
    }

    // Tokenizer state
    if (state.contains("tokenizer")) {
        const auto& tokenizer = state["tokenizer"];
        report << "Tokenizer:\n";
        // "initialized" lives in the nested "runtime" object written by
        // serialize_tokenizer_state, so look it up through that object;
        // value("runtime/initialized", ...) would treat the slash as part of
        // a literal key and always return the default.
        bool initialized = tokenizer.contains("runtime") &&
                           tokenizer["runtime"].value("initialized", false);
        report << "  Initialized: " << initialized << "\n";

        if (tokenizer.contains("type")) {
            report << "  Type: " << tokenizer["type"] << "\n";
        }
        if (tokenizer.contains("vocab_size")) {
            report << "  Vocab Size: " << tokenizer["vocab_size"] << "\n";
        }
        report << "\n";
    }

    // Model state
    if (state.contains("model")) {
        const auto& model = state["model"];
        report << "Model:\n";
        bool loaded = model.contains("runtime") &&
                      model["runtime"].value("loaded", false);
        report << "  Loaded: " << loaded << "\n";

        if (model.contains("layers")) {
            report << "  Layers: " << model["layers"] << "\n";
        }
        if (model.contains("dim")) {
            report << "  Dimension: " << model["dim"] << "\n";
        }
        report << "\n";
    }

    // Threading state
    if (state.contains("threading")) {
        const auto& threading = state["threading"];
        report << "Threading:\n";
        report << "  Active Threads: " << threading.value("active_threads", 0) << "\n";
        report << "  Queued Tasks: " << threading.value("queued_tasks", 0) << "\n";
        report << "\n";
    }

    return report.str();
}

} // namespace lm::runtime
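// Usage sketch of the intended round trip (file name illustrative; the JSON
// comes from ShutdownHandler::save_state above):
//
//     std::ifstream in("state.json");
//     nlohmann::json state = nlohmann::json::parse(in);
//     std::cout << lm::runtime::generate_state_report(state);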
156
src/sampler_test.cpp
Normal file
@ -0,0 +1,156 @@
#include "lm/generation/sampler.hpp"
#include "lm/tokenizer/bpe_tokenizer.hpp"
#include <iostream>
#include <cassert>
#include <vector>  // added: std::vector is used throughout

using namespace lm;

void test_samplers() {
    std::cout << "=== Testing Samplers ===" << std::endl;

    // Create a simple logits tensor
    std::vector<size_t> shape = {10};  // vocabulary size 10
    Tensor logits(shape);

    // Set up logits (highest probability at index 3)
    for (size_t i = 0; i < 10; i++) {
        logits(i) = (i == 3) ? 5.0f : 1.0f;
    }

    // Test GreedySampler
    GreedySampler greedy_sampler;
    int greedy_token = greedy_sampler.sample(logits);
    std::cout << "Greedy sampler selected token: " << greedy_token << std::endl;
    assert(greedy_token == 3);  // should always select the highest probability

    // Test RandomSampler
    RandomSampler random_sampler(1.0f);  // temperature 1.0
    int random_token = random_sampler.sample(logits);
    std::cout << "Random sampler selected token: " << random_token << std::endl;
    assert(random_token >= 0 && random_token < 10);  // should be a valid token

    // Test TopKSampler
    TopKSampler topk_sampler(3, 1.0f);  // top 3, temperature 1.0
    int topk_token = topk_sampler.sample(logits);
    std::cout << "Top-K sampler selected token: " << topk_token << std::endl;
    assert(topk_token >= 0 && topk_token < 10);  // should be a valid token

    // Test TopPSampler
    TopPSampler topp_sampler(0.9f, 1.0f);  // top-p 0.9, temperature 1.0
    int topp_token = topp_sampler.sample(logits);
    std::cout << "Top-P sampler selected token: " << topp_token << std::endl;
    assert(topp_token >= 0 && topp_token < 10);  // should be a valid token

    std::cout << "All samplers passed basic tests!" << std::endl;
}

void test_tokenizer_generation() {
    std::cout << "\n=== Testing Tokenizer Generation ===" << std::endl;

    // Create a simple tokenizer and train it on a small corpus
    BPETokenizer tokenizer;
    std::vector<std::string> corpus = {
        "hello world",
        "test sentence",
        "another example"
    };

    tokenizer.train(corpus, 50);  // small vocabulary

    // Test encoding/decoding
    std::string test_text = "hello test";
    std::vector<TokenID> encoded = tokenizer.encode(test_text);
    std::string decoded = tokenizer.decode(encoded);

    std::cout << "Original: " << test_text << std::endl;
    std::cout << "Encoded: ";
    for (auto token : encoded) {
        std::cout << token << " ";
    }
    std::cout << std::endl;
    std::cout << "Decoded: " << decoded << std::endl;

    // Basic sanity checks
    assert(encoded.size() > 0);
    assert(!decoded.empty());

    std::cout << "Tokenizer generation test passed!" << std::endl;
}

void test_temperature_effects() {
    std::cout << "\n=== Testing Temperature Effects ===" << std::endl;

    // Create a simple logits tensor
    std::vector<size_t> shape = {5};  // vocabulary size 5
    Tensor logits(shape);

    // Set up logits
    for (size_t i = 0; i < 5; i++) {
        logits(i) = static_cast<float>(i);
    }

    // Test different temperature values: high temperature flattens the
    // distribution, low temperature sharpens it toward the argmax.
    RandomSampler high_temp_sampler(2.0f);  // high temperature
    RandomSampler low_temp_sampler(0.5f);   // low temperature

    int high_temp_token = high_temp_sampler.sample(logits);
    int low_temp_token = low_temp_sampler.sample(logits);

    std::cout << "High temperature (2.0) selected token: " << high_temp_token << std::endl;
    std::cout << "Low temperature (0.5) selected token: " << low_temp_token << std::endl;

    // Both should be valid tokens
    assert(high_temp_token >= 0 && high_temp_token < 5);
    assert(low_temp_token >= 0 && low_temp_token < 5);

    std::cout << "Temperature effects test passed!" << std::endl;
}

void test_sampler_consistency() {
    std::cout << "\n=== Testing Sampler Consistency ===" << std::endl;

    // Create a simple logits tensor
    std::vector<size_t> shape = {5};  // vocabulary size 5
    Tensor logits(shape);

    // Set up logits with one clear winner
    logits(0) = 1.0f;
    logits(1) = 1.0f;
    logits(2) = 10.0f;  // clear winner
    logits(3) = 1.0f;
    logits(4) = 1.0f;

    // The greedy sampler should always pick the same token
    GreedySampler greedy_sampler;
    int first_token = greedy_sampler.sample(logits);

    // Test multiple times
    for (int i = 0; i < 10; i++) {
        int token = greedy_sampler.sample(logits);
        assert(token == first_token);
    }

    std::cout << "Greedy sampler is consistent (always selects token " << first_token << ")" << std::endl;
    std::cout << "Sampler consistency test passed!" << std::endl;
}

int main() {
    std::cout << "Starting sampler functionality tests..." << std::endl;

    try {
        test_samplers();
        test_tokenizer_generation();
        test_temperature_effects();
        test_sampler_consistency();

        std::cout << "\n=== All Tests Passed! ===" << std::endl;
        std::cout << "Sampler functionality is working correctly." << std::endl;

        return 0;
    } catch (const std::exception& e) {
        std::cerr << "Test failed with error: " << e.what() << std::endl;
        return 1;
    }
}
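// For reference, temperature sampling as typically implemented (a sketch of
// the mechanism these tests exercise; lm::RandomSampler's actual internals
// may differ):
//
//     #include <algorithm>
//     #include <cmath>
//     #include <random>
//     #include <vector>
//
//     // Softmax over logits/temperature, then draw from the distribution.
//     int sample_with_temperature(const std::vector<float>& logits,
//                                 float temperature, std::mt19937& rng) {
//         std::vector<float> probs(logits.size());
//         float max_logit = *std::max_element(logits.begin(), logits.end());
//         float sum = 0.0f;
//         for (size_t i = 0; i < logits.size(); ++i) {
//             probs[i] = std::exp((logits[i] - max_logit) / temperature);
//             sum += probs[i];
//         }
//         for (auto& p : probs) p /= sum;
//         std::discrete_distribution<int> dist(probs.begin(), probs.end());
//         return dist(rng);  // higher temperature => flatter distribution
//     }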
121
src/serialization_demo.cpp
Normal file
@ -0,0 +1,121 @@
#include "lm/tokenizer/bpe_tokenizer.hpp"
#include "lm/optimizers/adam.hpp"
#include "lm/conversation_manager.hpp"
#include "lm/core/tensor.hpp"
#include <cereal/archives/binary.hpp>  // assumed include: cereal archives are used directly below
#include <iostream>
#include <fstream>
#include <chrono>
#include <string>
#include <vector>

using namespace lm;

int main() {
    std::cout << "=== BPE Framework Serialization Demo ===\n\n";

    try {
        // Initialize tokenizer
        BPETokenizer tokenizer;

        // Create a small test corpus
        std::vector<std::string> corpus = {
            "The quick brown fox jumps over the lazy dog",
            "Programming is fun with C++ and machine learning",
            "Natural language processing transforms how we interact with computers"
        };

        std::cout << "Training tokenizer on " << corpus.size() << " sentences...\n";
        tokenizer.train(corpus, 100);  // small vocabulary for testing

        // Test conversation manager
        std::cout << "Testing conversation manager...\n";
        ConversationManager conv_manager;

        // Create a conversation and add some messages
        std::string conv_id = conv_manager.create_conversation("Test Conversation");
        conv_manager.add_message(conv_id, "user", "Hello, how are you?");
        conv_manager.add_message(conv_id, "assistant", "I'm doing well, thank you!");
        conv_manager.add_message(conv_id, "user", "What's the weather like today?");

        // Save conversations
        std::cout << "Saving conversation...\n";
        conv_manager.save_conversations("test_conversations.bin");

        // Load conversations into a new manager
        std::cout << "Loading conversation...\n";
        ConversationManager loaded_conv_manager;
        loaded_conv_manager.load_conversations("test_conversations.bin");

        // Verify the loaded conversation
        auto loaded_conv = loaded_conv_manager.get_conversation(conv_id);
        if (loaded_conv) {
            std::cout << "Loaded conversation has " << loaded_conv->turns.size() << " turns\n";
            for (size_t i = 0; i < loaded_conv->turns.size(); i++) {
                const auto& turn = loaded_conv->turns[i];
                std::cout << "Turn " << i << ": " << speaker_type_to_string(turn.speaker)
                          << ": " << turn.text << "\n";
            }
        }

        // Test optimizer state serialization
        std::cout << "Testing optimizer state serialization...\n";

        // Create a simple set of parameters for the optimizer
        std::vector<Tensor> params;
        params.push_back(Tensor({2, 3}, true));  // parameter with requires_grad = true
        params.push_back(Tensor({5}, true));     // another parameter

        // Initialize an optimizer
        AdamOptimizer optimizer(0.001f, 0.9f, 0.999f, 1e-8f);

        // Initialize moments for the parameters
        optimizer.initialize_moments(params);

        // Save optimizer state
        optimizer.save_state("test_optimizer.bin");

        // Create a new optimizer and load the state
        AdamOptimizer new_optimizer(0.001f, 0.9f, 0.999f, 1e-8f);
        new_optimizer.load_state("test_optimizer.bin");
        std::cout << "Optimizer state loaded successfully\n";

        // Test tensor serialization
        std::cout << "Testing tensor serialization...\n";

        // Create a tensor with an explicit shape vector to avoid constructor ambiguity
        std::vector<size_t> shape = {2, 3};
        Tensor test_tensor(shape);
        test_tensor.data() << 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f;

        {
            std::ofstream ofs("test_tensor.bin", std::ios::binary);
            cereal::BinaryOutputArchive archive(ofs);
            archive(test_tensor);
        }

        Tensor loaded_tensor;
        {
            std::ifstream ifs("test_tensor.bin", std::ios::binary);
            cereal::BinaryInputArchive archive(ifs);
            archive(loaded_tensor);
        }

        std::cout << "Original tensor:\n" << test_tensor.data() << "\n";
        std::cout << "Loaded tensor:\n" << loaded_tensor.data() << "\n";

        // Test tokenizer serialization
        std::cout << "Testing tokenizer serialization...\n";
        tokenizer.save("test_tokenizer.bin");

        BPETokenizer loaded_tokenizer;
        loaded_tokenizer.load("test_tokenizer.bin");
        std::cout << "Tokenizer vocabulary size after loading: " << loaded_tokenizer.vocab_size() << "\n";

        std::cout << "\n=== Serialization Demo Completed Successfully ===\n";

    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << "\n";
        return 1;
    }

    return 0;
}
118
src/starter_convo.cpp
Normal file
@ -0,0 +1,118 @@
// main.cpp
#include "lm/models/conversation_model.hpp"
#include "lm/tokenizer/bpe_tokenizer.hpp"
#include <iostream>
#include <chrono>
#include <iomanip>
#include <sstream> // added: std::stringstream is used below
#include <ctime>   // added: std::localtime is used below
#include <string>
#include <vector>

// Helper function to get the current timestamp
std::string get_current_timestamp() {
    auto now = std::chrono::system_clock::now();
    auto in_time_t = std::chrono::system_clock::to_time_t(now);

    std::stringstream ss;
    ss << std::put_time(std::localtime(&in_time_t), "%Y-%m-%d %X");
    return ss.str();
}

int main() {
    std::cout << "[" << get_current_timestamp() << "] Starting conversation model initialization..." << std::endl;

    // Initialize tokenizer
    std::cout << "[" << get_current_timestamp() << "] Creating BPE tokenizer..." << std::endl;
    auto tokenizer = std::make_shared<lm::BPETokenizer>();

    // Train or load tokenizer
    std::cout << "[" << get_current_timestamp() << "] Preparing training data for tokenizer..." << std::endl;
    std::vector<std::string> training_data = {
        "Hello, how are you?",
        "I'm doing well, thank you!",
        "What can I help you with today?",
        "The weather is nice today.",
        "I enjoy programming in C++.",
        "Machine learning is fascinating.",
        "Natural language processing enables computers to understand human language.",
        "This is a test of the tokenizer system.",
        "Reinforcement learning uses rewards to train agents.",
        "Deep learning models have many layers."
    };

    std::cout << "[" << get_current_timestamp() << "] Training tokenizer with " << training_data.size() << " examples..." << std::endl;
    tokenizer->train(training_data, 1000); // Reduced vocab size for demo
    std::cout << "[" << get_current_timestamp() << "] Tokenizer training completed. Vocabulary size: " << tokenizer->vocab_size() << std::endl;

    // Initialize conversation model
    std::cout << "[" << get_current_timestamp() << "] Initializing conversation model..." << std::endl;
    lm::ConversationModel model(tokenizer->vocab_size());
    model.set_tokenizer(tokenizer);

    // Train the model
    std::cout << "[" << get_current_timestamp() << "] Preparing conversation training data..." << std::endl;
    std::vector<std::string> conversations = {
        "<|user|>Hello<|endoftext|><|assistant|>Hi there! How can I help you?<|endoftext|>",
        "<|user|>What's the weather like?<|endoftext|><|assistant|>I'm not sure, I don't have access to real-time weather data.<|endoftext|>",
        "<|user|>What can you do?<|endoftext|><|assistant|>I can chat with you about various topics and answer questions based on my training.<|endoftext|>",
        "<|user|>Tell me a joke<|endoftext|><|assistant|>Why don't scientists trust atoms? Because they make up everything!<|endoftext|>",
        "<|user|>How does machine learning work?<|endoftext|><|assistant|>Machine learning uses algorithms to learn patterns from data without being explicitly programmed for each task.<|endoftext|>"
    };

    std::cout << "[" << get_current_timestamp() << "] Training conversation model with " << conversations.size() << " examples..." << std::endl;
    model.train(conversations);
    std::cout << "[" << get_current_timestamp() << "] Model training completed." << std::endl;

    // Test with some sample inputs
    std::cout << "[" << get_current_timestamp() << "] Testing model with sample inputs..." << std::endl;
    std::vector<std::string> test_inputs = {
        "Hello, how are you?",
        "What can you do?",
        "Tell me about machine learning"
    };

    for (const auto& input : test_inputs) {
        std::cout << "[" << get_current_timestamp() << "] Input: " << input << std::endl;
        std::string response = model.generate_response(input);
        std::cout << "[" << get_current_timestamp() << "] Response: " << response << std::endl;
        std::cout << "[" << get_current_timestamp() << "] ---" << std::endl;
    }

    // Interactive conversation loop
    std::cout << "[" << get_current_timestamp() << "] Starting interactive conversation mode..." << std::endl;
    std::cout << "[" << get_current_timestamp() << "] Type 'quit' to exit, 'clear' to reset conversation context" << std::endl;

    std::string user_input;
    while (true) {
        std::cout << "[" << get_current_timestamp() << "] User: ";
        std::getline(std::cin, user_input);

        if (user_input == "quit" || user_input == "exit") {
            break;
        }

        if (user_input == "clear") {
            // Assuming there's a method to clear context
            // model.clear_context();
            std::cout << "[" << get_current_timestamp() << "] Conversation context cleared." << std::endl;
            continue;
        }

        if (user_input.empty()) {
            continue;
        }

        try {
            std::string response = model.generate_response(user_input);
            std::cout << "[" << get_current_timestamp() << "] AI: " << response << std::endl;
        } catch (const std::exception& e) {
            std::cerr << "[" << get_current_timestamp() << "] Error generating response: " << e.what() << std::endl;
        }
    }

    // Save the model
    std::cout << "[" << get_current_timestamp() << "] Saving model to 'conversation_model.bin'..." << std::endl;
    model.save_model("conversation_model.bin");
    std::cout << "[" << get_current_timestamp() << "] Model saved successfully." << std::endl;

    std::cout << "[" << get_current_timestamp() << "] Conversation demo completed." << std::endl;
    return 0;
}
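The training strings above double as the chat template the model learns. At inference time, `generate_response` presumably wraps the raw user text the same way before sampling; a hedged sketch of that wrapping (helper name hypothetical):

// Mirrors the <|user|>/<|assistant|> template used in the training data.
std::string build_prompt(const std::string& user_text) {
    // Generation would then continue from here until <|endoftext|> is emitted.
    return "<|user|>" + user_text + "<|endoftext|><|assistant|>";
}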
src/test_bpe (copy 1).cpp (new file, 51 lines)
@@ -0,0 +1,51 @@
#include "lm/tokenizer/bpe_tokenizer.hpp"
#include <iostream>
#include <string>
#include <vector>

int main() {
    lm::BPETokenizer tokenizer;

    // Training corpus
    std::vector<std::string> corpus = {
        "the quick brown fox jumps over the lazy dog",
        "artificial intelligence is transforming the world",
        "C++ is a powerful programming language",
        "machine learning models require large amounts of data"
    };

    try {
        // Train the tokenizer
        std::cout << "Training tokenizer..." << std::endl;
        tokenizer.train(corpus, 500);
        std::cout << "Vocabulary size: " << tokenizer.vocab_size() << std::endl;

        // Test encoding/decoding
        std::string test_text = "the quick brown fox";
        auto tokens = tokenizer.encode(test_text);
        std::string decoded = tokenizer.decode(tokens);

        std::cout << "Original: " << test_text << std::endl;
        std::cout << "Tokens: ";
        for (auto token : tokens) {
            std::cout << token << " ";
        }
        std::cout << std::endl;
        std::cout << "Decoded: " << decoded << std::endl;

        // Save and load test
        tokenizer.save("bpe_model.txt");

        lm::BPETokenizer loaded_tokenizer;
        if (loaded_tokenizer.load("bpe_model.txt")) {
            std::cout << "Successfully loaded tokenizer" << std::endl;
            std::cout << "Loaded vocabulary size: " << loaded_tokenizer.vocab_size() << std::endl;
        }

    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
        return 1;
    }

    return 0;
}
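This test prints the round trip but never verifies it; the later test files in this commit compare explicitly, and the same one-liner would fit here right after decoding:

        std::cout << "Match: " << (decoded == test_text ? "YES" : "NO") << std::endl;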
src/test_conversation.cpp (new file, 215 lines)
@@ -0,0 +1,215 @@
// src/test_conversation.cpp
#include <iostream>
#include <string>
#include <vector>
#include <map>    // added: std::map is used below
#include <chrono> // added: std::chrono is used below
#include <ctime>  // added: std::ctime is used below
#include "lm/conversation_manager.hpp"
#include "lm/conversation.hpp"

void print_conversation(const lm::Conversation& conv, const std::string& id) {
    std::cout << "=== Conversation " << id << " ===" << std::endl;
    std::cout << "Domain: " << conv.domain << std::endl;
    std::cout << "Language: " << conv.language << std::endl;
    std::cout << "Turns: " << conv.turns.size() << std::endl;
    std::cout << "Duration: " << conv.duration() << " seconds" << std::endl;

    for (size_t i = 0; i < conv.turns.size(); ++i) {
        const auto& turn = conv.turns[i];
        auto time = std::chrono::system_clock::to_time_t(turn.timestamp);
        std::cout << "[" << i << "] " << std::ctime(&time)
                  << lm::speaker_type_to_string(turn.speaker)
                  << ": " << turn.text << std::endl;
    }
    std::cout << std::endl;
}

void test_conversation_basic() {
    std::cout << "=== Testing Basic Conversation Functionality ===" << std::endl;

    // Create a conversation
    lm::Conversation conv("general_chat", "en");
    conv.add_turn(lm::SpeakerType::USER, "Hello, how are you?");
    conv.add_turn(lm::SpeakerType::ASSISTANT, "I'm doing well, thank you!");
    conv.add_turn(lm::SpeakerType::USER, "What's the weather like today?");

    // Test basic properties
    std::cout << "Conversation has " << conv.size() << " turns" << std::endl;
    std::cout << "Duration: " << conv.duration() << " seconds" << std::endl;
    std::cout << "Domain: " << conv.domain << std::endl;

    // Test last turn access
    try {
        auto& last_turn = conv.last_turn();
        std::cout << "Last turn: " << last_turn.text << std::endl;
    } catch (const std::exception& e) {
        std::cout << "Error accessing last turn: " << e.what() << std::endl;
    }

    // Test clearing
    std::cout << "Clearing conversation..." << std::endl;
    conv.clear();
    std::cout << "After clearing: " << conv.size() << " turns" << std::endl;

    std::cout << "=== Basic Conversation Test Complete ===\n" << std::endl;
}

void test_conversation_manager() {
    std::cout << "=== Testing Conversation Manager ===" << std::endl;

    lm::ConversationManager manager;

    // Create conversations
    std::string conv1 = manager.create_conversation("Weather Discussion");
    std::string conv2 = manager.create_conversation("Technical Support");

    std::cout << "Created conversations: " << conv1 << " and " << conv2 << std::endl;

    // Add messages to the first conversation
    manager.add_message(conv1, "user", "What's the weather like today?");
    manager.add_message(conv1, "assistant", "It's sunny and 75 degrees.");
    manager.add_message(conv1, "user", "Should I bring an umbrella?");

    // Add messages to the second conversation
    manager.add_message(conv2, "user", "My computer won't turn on.");
    manager.add_message(conv2, "assistant", "Have you tried checking the power cable?");

    // List all conversations
    auto conversations = manager.list_conversations();
    std::cout << "Total conversations: " << conversations.size() << std::endl;

    for (const auto& id : conversations) {
        std::cout << "Conversation ID: " << id
                  << ", Title: " << manager.get_title(id) << std::endl;

        auto conv_ptr = manager.get_conversation(id);
        if (conv_ptr) {
            std::cout << "  Turns: " << conv_ptr->size() << std::endl;
        }
    }

    // Test getting history
    try {
        auto history = manager.get_history(conv1);
        std::cout << "\nHistory for conversation " << conv1 << ":" << std::endl;
        for (size_t i = 0; i < history.size(); ++i) {
            std::cout << "  " << i << ": "
                      << lm::speaker_type_to_string(history[i].speaker)
                      << ": " << history[i].text << std::endl;
        }
    } catch (const std::exception& e) {
        std::cout << "Error getting history: " << e.what() << std::endl;
    }

    // Test metadata operations
    manager.set_title(conv1, "Updated Weather Chat");
    std::cout << "Updated title: " << manager.get_title(conv1) << std::endl;

    std::map<std::string, std::string> metadata = {
        {"priority", "high"},
        {"category", "weather"}
    };
    manager.update_metadata(conv1, metadata);

    auto retrieved_metadata = manager.get_metadata(conv1);
    std::cout << "Metadata: " << std::endl;
    for (const auto& pair : retrieved_metadata) {
        std::cout << "  " << pair.first << ": " << pair.second << std::endl;
    }

    // Test deletion
    std::cout << "Deleting conversation " << conv2 << std::endl;
    bool deleted = manager.delete_conversation(conv2);
    std::cout << "Deletion " << (deleted ? "successful" : "failed") << std::endl;
    std::cout << "Remaining conversations: " << manager.count() << std::endl;

    std::cout << "=== Conversation Manager Test Complete ===\n" << std::endl;
}

void test_serialization() {
    std::cout << "=== Testing Serialization ===" << std::endl;

    lm::ConversationManager manager;

    // Create a conversation with some messages
    std::string conv_id = manager.create_conversation("Serialization Test");
    manager.add_message(conv_id, "user", "This is a test message.");
    manager.add_message(conv_id, "assistant", "This is a test response.");
    manager.add_message(conv_id, "user", "Will this be saved correctly?");

    // Save to file
    std::string filename = "test_conversations.bin";
    bool saved = manager.save_conversations(filename);
    std::cout << "Save " << (saved ? "successful" : "failed") << std::endl;

    // Create a new manager and load from file
    lm::ConversationManager loaded_manager;
    bool loaded = loaded_manager.load_conversations(filename);
    std::cout << "Load " << (loaded ? "successful" : "failed") << std::endl;

    if (loaded) {
        auto conversations = loaded_manager.list_conversations();
        std::cout << "Loaded conversations: " << conversations.size() << std::endl;

        for (const auto& id : conversations) {
            std::cout << "Conversation ID: " << id
                      << ", Title: " << loaded_manager.get_title(id) << std::endl;

            auto history = loaded_manager.get_history(id);
            std::cout << "  Messages: " << history.size() << std::endl;

            for (const auto& turn : history) {
                std::cout << "    " << lm::speaker_type_to_string(turn.speaker)
                          << ": " << turn.text << std::endl;
            }
        }
    }

    std::cout << "=== Serialization Test Complete ===\n" << std::endl;
}

void test_conversation_utils() {
    std::cout << "=== Testing Conversation Utilities ===" << std::endl;

    lm::Conversation conv("test", "en");
    conv.add_turn(lm::SpeakerType::USER, "Hello");
    conv.add_turn(lm::SpeakerType::ASSISTANT, "Hi there!");
    conv.add_turn(lm::SpeakerType::USER, "How are you?");
    conv.add_turn(lm::SpeakerType::ASSISTANT, "I'm fine, thanks!");
    conv.add_turn(lm::SpeakerType::USER, "What's new?");

    // Test text extraction
    std::string extracted = lm::conversation_utils::extract_text(conv.turns, 1, 4);
    std::cout << "Extracted text:\n" << extracted << std::endl;

    // Test training pair creation
    auto training_pair = lm::conversation_utils::create_training_pair(conv.turns, 2);
    std::cout << "Training context:\n" << training_pair.first << std::endl;
    std::cout << "Training target: " << training_pair.second << std::endl;

    // Test context window
    auto context_window = lm::conversation_utils::get_context_window(conv.turns, 3);
    std::cout << "Context window (last 3 turns):" << std::endl;
    for (const auto& turn : context_window) {
        std::cout << "  " << lm::speaker_type_to_string(turn.speaker)
                  << ": " << turn.text << std::endl;
    }

    std::cout << "=== Conversation Utilities Test Complete ===\n" << std::endl;
}

int main() {
    std::cout << "Starting Conversation Manager Tests\n" << std::endl;

    try {
        test_conversation_basic();
        test_conversation_manager();
        test_serialization();
        test_conversation_utils();

        std::cout << "All tests completed successfully!" << std::endl;
    } catch (const std::exception& e) {
        std::cerr << "Test failed with exception: " << e.what() << std::endl;
        return 1;
    }

    return 0;
}
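The calls above pin down the surface that lm/conversation.hpp must provide. Read back from the test, a hedged outline (only the names used here are confirmed; the field types are assumptions):

// Sketch inferred from usage in this test, not the actual header.
struct ConversationTurn {
    lm::SpeakerType speaker;                          // USER or ASSISTANT
    std::string text;
    std::chrono::system_clock::time_point timestamp;  // read via to_time_t above
};
// lm::Conversation(domain, language) then carries public members domain,
// language, and turns (a vector of ConversationTurn), plus add_turn(),
// size(), duration(), last_turn() (throwing when empty), and clear().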
src/test_data_loader.cpp (new file, 36 lines)
@@ -0,0 +1,36 @@
// src/test_data_loader.cpp
#include <lm/training/data_loader.hpp>
#include <lm/training/losses.hpp>
#include <lm/tokenizer/bpe_tokenizer.hpp>
#include <iostream>

int main() {
    // Create a simple tokenizer for testing
    lm::BPETokenizer tokenizer;
    // Initialize with a small vocabulary for testing
    // (You'll need to implement a way to create a test tokenizer)

    try {
        // Create data loader
        lm::ConversationDataLoader loader("test_conversations.txt", tokenizer, 2, 10);

        std::cout << "Number of batches: " << loader.num_batches() << std::endl;

        while (loader.has_next()) {
            auto [inputs, targets] = loader.next_batch();
            std::cout << "Input shape: [";
            for (auto dim : inputs.shape()) std::cout << dim << ", ";
            std::cout << "], Target shape: [";
            for (auto dim : targets.shape()) std::cout << dim << ", ";
            std::cout << "]" << std::endl;
        }

        std::cout << "Data loader test completed successfully!" << std::endl;
    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
        return 1;
    }

    return 0;
}
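A note on the structured binding: `auto [inputs, targets] = loader.next_batch();` implies the loader hands back a pair of tensors, presumably

    std::pair<lm::Tensor, lm::Tensor> ConversationDataLoader::next_batch();

and the constructor's trailing arguments (2, 10) look like batch size and maximum sequence length, which would make each printed shape [2, 10]. Both readings are inferred from usage, not confirmed by the header.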
src/test_generation.cpp (new file, 111 lines)
@@ -0,0 +1,111 @@
#include "lm/generation/sampler.hpp"
#include "lm/tokenizer/bpe_tokenizer.hpp"
#include <iostream>
#include <fstream>
#include <chrono>
#include <algorithm> // added: std::find is used below
#include <string>
#include <vector>

using namespace lm;

// Simple corpus for testing
std::vector<std::string> create_test_corpus() {
    return {
        "The quick brown fox jumps over the lazy dog",
        "Programming is fun with C++ and machine learning",
        "Natural language processing transforms how we interact with computers",
        "Deep learning models require large amounts of data",
        "Attention mechanisms have revolutionized neural networks"
    };
}

int main() {
    std::cout << "=== BPE Framework Generation Test ===\n\n";

    try {
        // Initialize tokenizer
        BPETokenizer tokenizer;

        // Create a small test corpus
        auto corpus = create_test_corpus();

        std::cout << "Training tokenizer on " << corpus.size() << " sentences...\n";
        tokenizer.train(corpus, 100); // Small vocabulary for testing

        std::cout << "Tokenizer vocabulary size: " << tokenizer.vocab_size() << "\n";
        std::cout << "EOS token ID: " << tokenizer.eos_token_id() << "\n";
        std::cout << "PAD token ID: " << tokenizer.pad_token_id() << "\n";
        std::cout << "UNK token ID: " << tokenizer.unk_token_id() << "\n\n";

        // Test encoding/decoding
        std::string test_text = "The quick brown fox";
        auto encoded = tokenizer.encode(test_text);
        auto decoded = tokenizer.decode(encoded);

        std::cout << "Encoding test:\n";
        std::cout << "Original: " << test_text << "\n";
        std::cout << "Encoded: ";
        for (auto token : encoded) {
            std::cout << token << " ";
        }
        std::cout << "\nDecoded: " << decoded << "\n\n";

        // Test different samplers
        std::cout << "\n=== Testing Samplers ===\n";

        // Create a simple tensor for testing samplers.
        // Use explicit shape initialization to avoid Eigen assertion errors.
        std::vector<size_t> shape = {10}; // 1D tensor with 10 elements
        Tensor logits(shape);

        // Initialize with some values - use 1D indexing
        for (int i = 0; i < 10; i++) {
            logits(i) = static_cast<float>(i) / 10.0f;
        }

        // Test greedy sampler
        GreedySampler greedy_sampler;
        TokenID greedy_token = greedy_sampler.sample(logits);
        std::cout << "Greedy sampler selected token: " << greedy_token << "\n";

        // Test random sampler
        RandomSampler random_sampler(0.8f);
        TokenID random_token = random_sampler.sample(logits);
        std::cout << "Random sampler selected token: " << random_token << "\n";

        // Test Top-K sampler
        TopKSampler topk_sampler(5, 0.8f);
        TokenID topk_token = topk_sampler.sample(logits);
        std::cout << "Top-K sampler selected token: " << topk_token << "\n";

        // Test Top-P sampler
        TopPSampler topp_sampler(0.9f, 0.8f);
        TokenID topp_token = topp_sampler.sample(logits);
        std::cout << "Top-P sampler selected token: " << topp_token << "\n\n";

        // Test EOS token handling
        std::cout << "=== Testing EOS Token Handling ===\n";
        std::string eos_prompt = "Test";
        auto eos_encoded = tokenizer.encode(eos_prompt);

        // Look up the EOS token ID
        int eos_token_id = static_cast<int>(tokenizer.eos_token_id());
        std::cout << "EOS token ID: " << eos_token_id << "\n";

        // Check if the EOS token is in the encoded prompt
        auto eos_it = std::find(eos_encoded.begin(), eos_encoded.end(), eos_token_id);
        if (eos_it != eos_encoded.end()) {
            std::cout << "EOS token found in encoded prompt at position "
                      << (eos_it - eos_encoded.begin()) << "\n";
        } else {
            std::cout << "EOS token not found in encoded prompt\n";
        }

        std::cout << "\n=== Test Completed Successfully ===\n";

    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << "\n";
        return 1;
    }

    return 0;
}
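Every sampler above is constructed with a temperature of 0.8. For reference, this is what that knob does: the logits are divided by T before the softmax, so T < 1 sharpens the distribution and T approaching 0 degenerates to greedy argmax. A self-contained sketch of temperature sampling (a generic illustration, not the framework's RandomSampler implementation):

#include <algorithm>
#include <cmath>
#include <random>
#include <vector>

// Generic sketch: sample an index from raw logits after temperature scaling.
// Lower T sharpens the distribution; T -> 0 approaches greedy argmax.
size_t sample_with_temperature(const std::vector<float>& logits, float T,
                               std::mt19937& rng) {
    std::vector<float> weights(logits.size());
    const float max_logit = *std::max_element(logits.begin(), logits.end());
    for (size_t i = 0; i < logits.size(); ++i) {
        // Subtract the max logit for numerical stability before exponentiating.
        weights[i] = std::exp((logits[i] - max_logit) / T);
    }
    std::discrete_distribution<size_t> dist(weights.begin(), weights.end());
    return dist(rng); // discrete_distribution normalizes the weights itself
}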
src/test_logger.cpp (new file, 213 lines)
@@ -0,0 +1,213 @@
#include "lm/tokenizer/bpe_tokenizer.hpp"
#include <iostream>
#include <vector>
#include <string>

using namespace lm;

void run_basic_test() {
    std::cout << "=== BASIC TEST ===" << std::endl;

    BPETokenizer tokenizer;
    tokenizer.enable_debug_logging(true);

    // Train on a simple corpus
    std::vector<std::string> corpus = {
        "The quick brown fox jumps over the lazy dog.",
        "I love machine learning and natural language processing!",
        "Byte Pair Encoding is an effective tokenization method."
    };

    std::cout << "Training tokenizer..." << std::endl;
    tokenizer.train(corpus, 300);
    std::cout << "Training completed. Vocabulary size: " << tokenizer.vocab_size() << std::endl;

    // Test encoding and decoding
    std::string test_text = "The quick brown fox";
    std::cout << "\nTesting encoding/decoding with: '" << test_text << "'" << std::endl;

    auto tokens = tokenizer.encode(test_text);
    std::string decoded = tokenizer.decode(tokens);

    std::cout << "\nOriginal: '" << test_text << "'" << std::endl;
    std::cout << "Decoded: '" << decoded << "'" << std::endl;
    std::cout << "Tokens: [";
    for (size_t i = 0; i < tokens.size(); i++) {
        std::cout << tokens[i];
        if (i < tokens.size() - 1) std::cout << ", ";
    }
    std::cout << "]" << std::endl;

    // Dump vocabulary and merges for inspection
    std::cout << "\nVocabulary:" << std::endl;
    tokenizer.dump_vocabulary();

    std::cout << "\nMerges:" << std::endl;
    tokenizer.dump_merges();
}

void run_unicode_test() {
    std::cout << "\n\n=== UNICODE TEST ===" << std::endl;

    BPETokenizer tokenizer;
    tokenizer.enable_debug_logging(true);

    // Train on a corpus with Unicode characters
    std::vector<std::string> corpus = {
        "Hello world! 你好世界!",
        "Bonjour le monde! ¡Hola mundo!",
        "Café résumé naïve façade",
        "Emoji: 😊 🚀 🌟 🎉"
    };

    std::cout << "Training tokenizer with Unicode..." << std::endl;
    tokenizer.train(corpus, 400);
    std::cout << "Training completed. Vocabulary size: " << tokenizer.vocab_size() << std::endl;

    // Test encoding and decoding with Unicode
    std::string test_text = "Café résumé with emoji 😊";
    std::cout << "\nTesting encoding/decoding with: '" << test_text << "'" << std::endl;

    auto tokens = tokenizer.encode(test_text);
    std::string decoded = tokenizer.decode(tokens);

    std::cout << "\nOriginal: '" << test_text << "'" << std::endl;
    std::cout << "Decoded: '" << decoded << "'" << std::endl;
    std::cout << "Tokens: [";
    for (size_t i = 0; i < tokens.size(); i++) {
        std::cout << tokens[i];
        if (i < tokens.size() - 1) std::cout << ", ";
    }
    std::cout << "]" << std::endl;
}

void run_edge_case_test() {
    std::cout << "\n\n=== EDGE CASE TEST ===" << std::endl;

    BPETokenizer tokenizer;
    tokenizer.enable_debug_logging(true);

    // Train on a small corpus
    std::vector<std::string> corpus = {
        "a b c d e f g h i j k l m n o p q r s t u v w x y z",
        "A B C D E F G H I J K L M N O P Q R S T U V W X Y Z",
        "0 1 2 3 4 5 6 7 8 9",
        "! @ # $ % ^ & * ( ) - _ = + [ ] { } ; : ' \" , . < > / ?"
    };

    std::cout << "Training tokenizer with edge cases..." << std::endl;
    tokenizer.train(corpus, 200);
    std::cout << "Training completed. Vocabulary size: " << tokenizer.vocab_size() << std::endl;

    // Test various edge cases
    std::vector<std::string> test_cases = {
        "a",
        "abc",
        "hello world",
        "!@#$%",
        "a b c",
        "The quick brown fox"
    };

    for (const auto& test_text : test_cases) {
        std::cout << "\nTesting: '" << test_text << "'" << std::endl;

        auto tokens = tokenizer.encode(test_text);
        std::string decoded = tokenizer.decode(tokens);

        std::cout << "Original: '" << test_text << "'" << std::endl;
        std::cout << "Decoded: '" << decoded << "'" << std::endl;
        std::cout << "Match: " << (test_text == decoded ? "YES" : "NO") << std::endl;
        std::cout << "Tokens: [";
        for (size_t i = 0; i < tokens.size(); i++) {
            std::cout << tokens[i];
            if (i < tokens.size() - 1) std::cout << ", ";
        }
        std::cout << "]" << std::endl;
    }
}

void run_save_load_test() {
    std::cout << "\n\n=== SAVE/LOAD TEST ===" << std::endl;

    BPETokenizer tokenizer;

    // Train on a simple corpus
    std::vector<std::string> corpus = {
        "The quick brown fox jumps over the lazy dog.",
        "I love programming in C++",
        "Machine learning is fascinating"
    };

    std::cout << "Training tokenizer..." << std::endl;
    tokenizer.train(corpus, 250);
    std::cout << "Training completed. Vocabulary size: " << tokenizer.vocab_size() << std::endl;

    // Test encoding before save
    std::string test_text = "quick brown fox";
    auto original_tokens = tokenizer.encode(test_text);
    std::string original_decoded = tokenizer.decode(original_tokens);

    std::cout << "Before save - Original: '" << test_text << "'" << std::endl;
    std::cout << "Before save - Decoded: '" << original_decoded << "'" << std::endl;

    // Save the tokenizer
    std::string filename = "bpe_tokenizer.model";
    if (tokenizer.save(filename)) {
        std::cout << "Tokenizer saved to " << filename << std::endl;
    } else {
        std::cout << "Failed to save tokenizer to " << filename << std::endl;
        return;
    }

    // Load into a new tokenizer
    BPETokenizer loaded_tokenizer;
    if (loaded_tokenizer.load(filename)) {
        std::cout << "Tokenizer loaded from " << filename << std::endl;
        std::cout << "Loaded vocabulary size: " << loaded_tokenizer.vocab_size() << std::endl;

        // Test encoding after load
        auto loaded_tokens = loaded_tokenizer.encode(test_text);
        std::string loaded_decoded = loaded_tokenizer.decode(loaded_tokens);

        std::cout << "After load - Original: '" << test_text << "'" << std::endl;
        std::cout << "After load - Decoded: '" << loaded_decoded << "'" << std::endl;
        std::cout << "Match: " << (original_decoded == loaded_decoded ? "YES" : "NO") << std::endl;

        // Compare tokens
        std::cout << "Original tokens: [";
        for (size_t i = 0; i < original_tokens.size(); i++) {
            std::cout << original_tokens[i];
            if (i < original_tokens.size() - 1) std::cout << ", ";
        }
        std::cout << "]" << std::endl;

        std::cout << "Loaded tokens: [";
        for (size_t i = 0; i < loaded_tokens.size(); i++) {
            std::cout << loaded_tokens[i];
            if (i < loaded_tokens.size() - 1) std::cout << ", ";
        }
        std::cout << "]" << std::endl;
    } else {
        std::cout << "Failed to load tokenizer from " << filename << std::endl;
    }
}

int main() {
    std::cout << "BPETokenizer Test Application" << std::endl;
    std::cout << "============================" << std::endl;

    try {
        run_basic_test();
        run_unicode_test();
        run_edge_case_test();
        run_save_load_test();

        std::cout << "\nAll tests completed!" << std::endl;
    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
        return 1;
    }

    return 0;
}
src/test_tensor_pool.cpp (new file, 86 lines)
@@ -0,0 +1,86 @@
// src/test_tensor_pool.cpp
#include <lm/core/tensor_pool.hpp>
#include <lm/core/tensor.hpp>
#include <iostream>
#include <vector>
#include <memory>

int main() {
    std::cout << "Testing TensorPool functionality..." << std::endl;

    // Create a tensor pool
    lm::TensorPool pool;

    std::cout << "Initial pool size: " << pool.size() << std::endl;

    // Test 1: Acquire a tensor and use it
    std::cout << "\n=== Test 1: Acquire and use a tensor ===" << std::endl;
    auto tensor1 = pool.acquire({128, 128}, true);
    std::cout << "Acquired tensor with shape: [";
    for (auto dim : tensor1->shape()) {
        std::cout << dim << ", ";
    }
    std::cout << "], requires_grad: " << tensor1->requires_grad() << std::endl;

    // Use the tensor
    tensor1->data().setConstant(5.0f);
    std::cout << "Tensor data[0][0]: " << tensor1->data()(0, 0) << std::endl;

    // Test 2: Release the tensor back to the pool
    std::cout << "\n=== Test 2: Release tensor back to pool ===" << std::endl;
    pool.release(std::move(tensor1));
    std::cout << "Pool size after release: " << pool.size() << std::endl;

    // Test 3: Acquire another tensor with the same specs (should reuse)
    std::cout << "\n=== Test 3: Acquire tensor with same specs (should reuse) ===" << std::endl;
    auto tensor2 = pool.acquire({128, 128}, true);
    std::cout << "Acquired tensor with shape: [";
    for (auto dim : tensor2->shape()) {
        std::cout << dim << ", ";
    }
    std::cout << "], requires_grad: " << tensor2->requires_grad() << std::endl;
    std::cout << "Pool size after acquisition: " << pool.size() << std::endl;

    // Test 4: Verify the tensor was reset (should be zeros)
    std::cout << "\n=== Test 4: Verify tensor was reset ===" << std::endl;
    std::cout << "Tensor data[0][0] (should be 0): " << tensor2->data()(0, 0) << std::endl;

    // Test 5: Acquire a tensor with different specs (should create new)
    std::cout << "\n=== Test 5: Acquire tensor with different specs (should create new) ===" << std::endl;
    auto tensor3 = pool.acquire({64, 64}, false);
    std::cout << "Acquired tensor with shape: [";
    for (auto dim : tensor3->shape()) {
        std::cout << dim << ", ";
    }
    std::cout << "], requires_grad: " << tensor3->requires_grad() << std::endl;
    std::cout << "Pool size after acquisition: " << pool.size() << std::endl;

    // Test 6: Release both tensors
    std::cout << "\n=== Test 6: Release both tensors ===" << std::endl;
    pool.release(std::move(tensor2));
    pool.release(std::move(tensor3));
    std::cout << "Pool size after releasing both: " << pool.size() << std::endl;

    // Test 7: Clear the pool
    std::cout << "\n=== Test 7: Clear the pool ===" << std::endl;
    pool.clear();
    std::cout << "Pool size after clear: " << pool.size() << std::endl;

    // Test 8: Test with multiple tensors
    std::cout << "\n=== Test 8: Test with multiple tensors ===" << std::endl;
    std::vector<std::unique_ptr<lm::Tensor>> tensors;
    for (int i = 0; i < 5; i++) {
        tensors.push_back(pool.acquire({32, 32}, true));
        std::cout << "Acquired tensor " << i + 1 << ", pool size: " << pool.size() << std::endl;
    }

    // Release all tensors
    for (auto& tensor : tensors) {
        pool.release(std::move(tensor));
    }
    std::cout << "Released all tensors, pool size: " << pool.size() << std::endl;

    std::cout << "\n=== All tests completed successfully! ===" << std::endl;

    return 0;
}
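For context on what the test exercises: a tensor pool is a free list keyed by the tensor's specs, so acquire() reuses a released buffer when the shape and grad flag match and allocates otherwise. A simplified sketch of the idea (not the framework's tensor_pool.hpp; a Tensor constructor taking a grad flag is an assumption, so it is commented out):

#include <map>
#include <memory>
#include <vector>

// Simplified pooling sketch: reuse matching tensors instead of reallocating.
class MiniTensorPool {
    using Key = std::pair<std::vector<size_t>, bool>; // (shape, requires_grad)
    std::map<Key, std::vector<std::unique_ptr<lm::Tensor>>> free_list_;
public:
    std::unique_ptr<lm::Tensor> acquire(std::vector<size_t> shape, bool grad) {
        auto& bucket = free_list_[{shape, grad}];
        if (!bucket.empty()) {
            auto t = std::move(bucket.back());
            bucket.pop_back();
            t->data().setZero(); // hand back a reset tensor, as Test 4 expects
            return t;
        }
        return std::make_unique<lm::Tensor>(shape /*, grad -- assumed ctor arg */);
    }
    void release(std::unique_ptr<lm::Tensor> t) {
        free_list_[{t->shape(), t->requires_grad()}].push_back(std::move(t));
    }
};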
src/test_transformer (copy 1).cpp (new file, 34 lines)
@@ -0,0 +1,34 @@
#include <iostream>
#include <vector>
#include "lm/models/transformer_model.hpp" // Use the correct header

int main() {
    // Use TransformerModel instead of Transformer
    lm::TransformerModel model(1000, 512, 6, 8, 2048, 0.1f);

    std::cout << "Transformer model created successfully!" << std::endl;
    std::cout << "Vocabulary size: " << model.get_vocab_size() << std::endl;
    std::cout << "Model dimensions: " << model.get_d_model() << std::endl;

    // Test with some sample tokens
    std::vector<lm::TokenID> test_tokens = {1, 2, 3, 4, 5};

    try {
        auto output = model.forward(test_tokens);
        std::cout << "Forward pass completed successfully!" << std::endl;
        std::cout << "Output size: " << output.size() << std::endl;

        // Test generation
        auto generated = model.generate(test_tokens, 10, 0.8f);
        std::cout << "Generated tokens: ";
        for (auto token : generated) {
            std::cout << token << " ";
        }
        std::cout << std::endl;

    } catch (const std::exception& e) {
        std::cerr << "Error during forward pass: " << e.what() << std::endl;
    }

    return 0;
}
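The constructor arguments (512 model width, 6 layers, 8 heads, 2048 feed-forward width, 0.1 dropout) match the base configuration of the original Transformer paper, with only the 1000-entry vocabulary scaled down for the test. That reading of the argument order is inferred from the values and the getters, not confirmed by transformer_model.hpp.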
src/test_unicode_bpe (copy 1).cpp (new file, 134 lines)
@@ -0,0 +1,134 @@
#include "lm/tokenizer/bpe_tokenizer.hpp"
#include "lm/tokenizer/unicode_utils.hpp" // Needed for lm::unicode::normalize below
#include <iostream>
#include <string>
#include <vector>
#include <iomanip> // Needed for std::hex, std::setw, std::setfill

int main() {
    lm::BPETokenizer tokenizer;

    // Training corpus with Unicode text
    std::vector<std::string> corpus = {
        "the quick brown fox jumps over the lazy dog",
        "artificial intelligence is transforming the world",
        "C++ is a powerful programming language",
        "machine learning models require large amounts of data",
        "你好世界",        // Hello world in Chinese
        "こんにちは世界",  // Hello world in Japanese
        "안녕하세요 세계", // Hello world in Korean
        "مرحبا بالعالم",   // Hello world in Arabic
        "Γειά σου Κόσμε",  // Hello world in Greek
        "Привет мир",      // Hello world in Russian
        "नमस्ते दुनिया"      // Hello world in Hindi
    };

    try {
        // Train the tokenizer
        std::cout << "Training tokenizer with Unicode text..." << std::endl;
        tokenizer.train(corpus, 1000);
        std::cout << "Vocabulary size: " << tokenizer.vocab_size() << std::endl;

        // Test encoding/decoding with various scripts
        std::vector<std::string> test_texts = {
            "hello world",
            "你好世界",
            "こんにちは世界",
            "مرحبا بالعالم",
            "Привет мир"
        };

        for (const auto& test_text : test_texts) {
            auto tokens = tokenizer.encode(test_text);
            std::string decoded = tokenizer.decode(tokens);

            std::cout << "\nOriginal: " << test_text << std::endl;

            // Hex dump of the original text
            std::cout << "Original (hex): ";
            for (unsigned char c : test_text) {
                std::cout << std::hex << std::setw(2) << std::setfill('0')
                          << static_cast<int>(c) << " ";
            }
            std::cout << std::dec << std::endl;

            std::cout << "Tokens: ";
            for (auto token : tokens) {
                std::cout << token << " ";
            }
            std::cout << std::endl;

            std::cout << "Decoded: " << decoded << std::endl;

            // Hex dump of the decoded text
            std::cout << "Decoded (hex): ";
            for (unsigned char c : decoded) {
                std::cout << std::hex << std::setw(2) << std::setfill('0')
                          << static_cast<int>(c) << " ";
            }
            std::cout << std::dec << std::endl;

            std::cout << "Match: " << (test_text == decoded ? "YES" : "NO") << std::endl;

            // Normalization comparison
            std::string normalized_original = lm::unicode::normalize(test_text);
            std::string normalized_decoded = lm::unicode::normalize(decoded);

            std::cout << "Normalized match: "
                      << (normalized_original == normalized_decoded ? "YES" : "NO")
                      << std::endl;

            // If they don't match, show the normalized versions
            if (normalized_original != normalized_decoded) {
                std::cout << "Normalized original: " << normalized_original << std::endl;
                std::cout << "Normalized decoded: " << normalized_decoded << std::endl;

                // Hex dumps of the normalized versions
                std::cout << "Normalized original (hex): ";
                for (unsigned char c : normalized_original) {
                    std::cout << std::hex << std::setw(2) << std::setfill('0')
                              << static_cast<int>(c) << " ";
                }
                std::cout << std::dec << std::endl;

                std::cout << "Normalized decoded (hex): ";
                for (unsigned char c : normalized_decoded) {
                    std::cout << std::hex << std::setw(2) << std::setfill('0')
                              << static_cast<int>(c) << " ";
                }
                std::cout << std::dec << std::endl;
            }
        }

        // Save and load test
        tokenizer.save("unicode_bpe_model.txt");

        lm::BPETokenizer loaded_tokenizer;
        if (loaded_tokenizer.load("unicode_bpe_model.txt")) {
            std::cout << "\nSuccessfully loaded Unicode tokenizer" << std::endl;
            std::cout << "Loaded vocabulary size: " << loaded_tokenizer.vocab_size() << std::endl;

            // Test with the loaded tokenizer
            std::string test_text = "你好世界";
            auto tokens = loaded_tokenizer.encode(test_text);
            std::string decoded = loaded_tokenizer.decode(tokens);

            std::cout << "Loaded tokenizer test:" << std::endl;
            std::cout << "Original: " << test_text << std::endl;
            std::cout << "Decoded: " << decoded << std::endl;

            // Normalization check for the loaded tokenizer
            std::string normalized_original = lm::unicode::normalize(test_text);
            std::string normalized_decoded = lm::unicode::normalize(decoded);

            std::cout << "Normalized match: "
                      << (normalized_original == normalized_decoded ? "YES" : "NO")
                      << std::endl;
        }

    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
        return 1;
    }

    return 0;
}
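Why the separate "normalized match" check: Unicode allows the same visible string to have several byte encodings, so a byte-exact comparison can report NO even when the round trip is semantically lossless; comparing after normalization factors that out. A concrete illustration (the assumption here is that lm::unicode::normalize performs NFC, the usual choice, which this file does not confirm):

// Two encodings of "é": byte-identical? no. NFC-normalized-identical? yes.
std::string precomposed = "\xC3\xA9";  // U+00E9 LATIN SMALL LETTER E WITH ACUTE
std::string combining  = "e\xCC\x81";  // U+0065 'e' + U+0301 COMBINING ACUTE
// precomposed != combining, but
// lm::unicode::normalize(precomposed) == lm::unicode::normalize(combining)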
src/tokenizer/bpe_tokenizer (copy 1).cpp (new executable file, 905 lines)
@@ -0,0 +1,905 @@
#include "lm/tokenizer/bpe_tokenizer.hpp"
#include "lm/tokenizer/unicode_utils.hpp"
#include <fstream>
#include <sstream>
#include <queue>
#include <algorithm>
#include <stdexcept>
#include <iostream>
#include <sys/resource.h>
#include <vector>
#include <memory>
#include <unordered_map>
#include <iomanip>

// Add CPU-specific optimizations
#ifdef __SSE4_2__
#include <nmmintrin.h> // For SSE4.2 intrinsics
#endif

namespace lm {

struct VectorHash {
    size_t operator()(const std::vector<TokenID>& vec) const {
        size_t seed = vec.size();
        for (const auto& token : vec) {
            seed ^= token + 0x9e3779b9 + (seed << 6) + (seed >> 2);
        }
        return seed;
    }
};

// Custom hash function for pair<TokenID, TokenID>
struct PairHash {
    size_t operator()(const std::pair<TokenID, TokenID>& p) const {
        return (static_cast<size_t>(p.first) << 16) | p.second;
    }
};
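// A note on the two hashes above: VectorHash follows the boost::hash_combine
// recipe (0x9e3779b9 is the golden-ratio fraction of 2^32, chosen to spread
// bits), while PairHash simply packs both IDs into one word. Token IDs above
// 65535 alias under the 16-bit shift; that only costs hash-bucket collisions,
// but it is worth remembering if the vocabulary ever grows past 64K entries.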
// Memory tracking function
size_t get_peak_memory_usage() {
#ifdef __linux__
    std::ifstream status("/proc/self/status");
    std::string line;
    while (std::getline(status, line)) {
        if (line.compare(0, 6, "VmPeak") == 0) {
            std::istringstream iss(line);
            std::string key;
            size_t value;
            std::string unit;
            iss >> key >> value >> unit;
            if (unit == "kB") {
                return value * 1024; // Convert to bytes
            }
        }
    }
#endif
    return 0;
}

// String interning class
class StringInternPool {
    std::unordered_map<std::string, std::shared_ptr<const std::string>> pool;

public:
    std::shared_ptr<const std::string> intern(const std::string& str) {
        auto it = pool.find(str);
        if (it != pool.end()) {
            return it->second;
        }

        auto shared_str = std::make_shared<std::string>(str);
        pool[str] = shared_str;
        return shared_str;
    }

    void clear() {
        pool.clear();
    }
};

// Unicode processing cache
class UnicodeCache {
private:
    mutable std::unordered_map<std::string, std::string> normalization_cache;
    mutable std::unordered_map<std::string, std::vector<std::string>> split_cache;

public:
    const std::string& get_normalized(const std::string& text) const {
        auto it = normalization_cache.find(text);
        if (it != normalization_cache.end()) {
            return it->second;
        }

        auto normalized = unicode::normalize(text);
        auto result = normalization_cache.emplace(text, std::move(normalized));
        return result.first->second;
    }

    const std::vector<std::string>& get_split(const std::string& text) const {
        auto it = split_cache.find(text);
        if (it != split_cache.end()) {
            return it->second;
        }

        auto split = unicode::unicode_split(text);
        auto result = split_cache.emplace(text, std::move(split));
        return result.first->second;
    }

    void clear() const {
        normalization_cache.clear();
        split_cache.clear();
    }
};

// UTF-8 validation - using a C++ implementation only
namespace {
    bool is_valid_utf8_impl(const char* str, size_t length) {
        // Simple UTF-8 validation
        for (size_t i = 0; i < length; i++) {
            unsigned char c = str[i];
            if (c > 0x7F) { // Non-ASCII character
                // Check if it's a valid UTF-8 start byte
                if (c < 0xC2 || c > 0xF4) return false;

                // Determine the number of continuation bytes
                int following_bytes = 0;
                if ((c & 0xE0) == 0xC0) following_bytes = 1;
                else if ((c & 0xF0) == 0xE0) following_bytes = 2;
                else if ((c & 0xF8) == 0xF0) following_bytes = 3;

                // Check that enough bytes remain
                if (i + following_bytes >= length) return false;

                // Check the continuation bytes
                for (int j = 1; j <= following_bytes; j++) {
                    if ((str[i + j] & 0xC0) != 0x80) return false;
                }

                i += following_bytes;
            }
        }
        return true;
    }
} // namespace
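// Worked example for the validator above: '你' encodes to the three bytes
// E4 BD A0. The lead byte E4 matches (c & 0xF0) == 0xE0, so two continuation
// bytes are required, and both BD and A0 satisfy (byte & 0xC0) == 0x80.
// The check is deliberately lightweight: beyond the 0xC2 floor it does not
// reject overlong 3/4-byte forms or UTF-16 surrogate ranges (ED A0..BF),
// which stricter validators do.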
struct BPETokenizer::Impl {
    std::unordered_map<std::string, TokenID> vocab;
    std::unordered_map<TokenID, std::string> inv_vocab;
    std::unordered_map<std::pair<TokenID, TokenID>, TokenID, PairHash> merges;
    std::unordered_map<std::string, TokenID> special_tokens;
    std::string unknown_token = "<unk>";
    TokenID unknown_token_id = 0;
    TokenID next_token_id = 0;
    bool normalization_enabled = true;
    bool byte_fallback_enabled = true;
    StringInternPool string_pool;
    mutable UnicodeCache unicode_cache; // Mutable so const encode/decode paths can fill the cache
    bool cache_enabled = true;
    bool debug_logging = false; // Debug logging flag

    // Special token IDs
    TokenID eos_token_id = 0;
    TokenID pad_token_id = 0;
    TokenID unk_token_id = 0;

    // Helper functions
    std::vector<std::string> split_text(const std::string& text) const;
    std::vector<TokenID> word_to_token_ids(const std::string& word) const;
    void initialize_vocab();
    void count_word_frequencies(const std::vector<std::string>& words,
                                std::unordered_map<std::string, int>& word_counts) const;
    void get_pair_counts(const std::unordered_map<std::string, int>& word_counts,
                         std::unordered_map<std::pair<TokenID, TokenID>, int, PairHash>& pair_counts) const;
    void perform_merge(const std::pair<TokenID, TokenID>& pair, TokenID new_token_id,
                       std::unordered_map<std::string, int>& word_counts);
    void get_pair_counts_from_sequences(const std::vector<std::pair<std::vector<TokenID>, int>>& tokenized_corpus,
                                        std::unordered_map<std::pair<TokenID, TokenID>, int, PairHash>& pair_counts) const;
    void perform_merge_on_sequences(const std::pair<TokenID, TokenID>& pair, TokenID new_token_id,
                                    std::vector<std::pair<std::vector<TokenID>, int>>& tokenized_corpus);

    // Handle invalid UTF-8
    std::vector<TokenID> handle_invalid_utf8(const std::string& text) const;

    // CPU Optimization: Batch processing
    void process_string_batch(const std::vector<std::string>& batch);

    // Cache management
    void enable_caching(bool enable) {
        cache_enabled = enable;
        if (!enable) {
            unicode_cache.clear();
        }
    }

    // Debug logging methods
    void log_encode_start(const std::string& text) const;
    void log_word_split(const std::vector<std::string>& words) const;
    void log_word_tokens(const std::string& word, const std::vector<TokenID>& tokens) const;
    void log_merge_attempt(size_t pos, TokenID first, TokenID second, bool found) const;
    void log_merge_result(const std::vector<TokenID>& tokens) const;
    void log_final_tokens(const std::vector<TokenID>& tokens) const;
    void log_decode_start(const std::vector<TokenID>& tokens) const;
    void log_token_decoding(TokenID token_id, const std::string& decoded) const;
    void log_final_decoding(const std::string& text) const;
};

// Debug logging implementations
void BPETokenizer::Impl::log_encode_start(const std::string& text) const {
    if (!debug_logging) return;
    std::cout << "[ENCODE] Starting encoding of text: '" << text << "'" << std::endl;
}

void BPETokenizer::Impl::get_pair_counts_from_sequences(
    const std::vector<std::pair<std::vector<TokenID>, int>>& tokenized_corpus,
    std::unordered_map<std::pair<TokenID, TokenID>, int, PairHash>& pair_counts) const {

    pair_counts.clear();

    for (const auto& [sequence, count] : tokenized_corpus) {
        // Loop condition fixed: "i < sequence.size() - 1" wraps around on an
        // empty sequence because size() is unsigned; "i + 1 < size()" is safe.
        for (size_t i = 0; i + 1 < sequence.size(); i++) {
            auto pair = std::make_pair(sequence[i], sequence[i + 1]);
            pair_counts[pair] += count;
        }
    }
}
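// Counting example: a corpus entry ([5, 9, 5, 9], count = 3) contributes the
// adjacent pairs (5,9), (9,5), (5,9), so pair_counts ends at (5,9) -> 6 and
// (9,5) -> 3. BPE training then merges the most frequent pair each round.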
void BPETokenizer::Impl::log_word_split(const std::vector<std::string>& words) const {
|
||||||
|
if (!debug_logging) return;
|
||||||
|
std::cout << "[ENCODE] Split into " << words.size() << " words: ";
|
||||||
|
for (size_t i = 0; i < words.size(); i++) {
|
||||||
|
std::cout << "[" << i << "]='" << words[i] << "' ";
|
||||||
|
}
|
||||||
|
std::cout << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
void BPETokenizer::Impl::log_word_tokens(const std::string& word, const std::vector<TokenID>& tokens) const {
|
||||||
|
if (!debug_logging) return;
|
||||||
|
std::cout << "[ENCODE] Word '" << word << "' → Tokens: ";
|
||||||
|
for (TokenID id : tokens) {
|
||||||
|
std::cout << id << " ('" << (inv_vocab.count(id) ? inv_vocab.at(id) : "<?>") << "') ";
|
||||||
|
}
|
||||||
|
std::cout << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
void BPETokenizer::Impl::log_merge_attempt(size_t pos, TokenID first, TokenID second, bool found) const {
|
||||||
|
if (!debug_logging) return;
|
||||||
|
std::string first_str = inv_vocab.count(first) ? inv_vocab.at(first) : "<?>";
|
||||||
|
std::string second_str = inv_vocab.count(second) ? inv_vocab.at(second) : "<?>";
|
||||||
|
std::cout << "[ENCODE] Checking pair at position " << pos << ": ("
|
||||||
|
<< first << ":'" << first_str << "', "
|
||||||
|
<< second << ":'" << second_str << "') - "
|
||||||
|
<< (found ? "FOUND" : "NOT FOUND") << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
void BPETokenizer::Impl::log_merge_result(const std::vector<TokenID>& tokens) const {
|
||||||
|
if (!debug_logging) return;
|
||||||
|
std::cout << "[ENCODE] After merge: ";
|
||||||
|
for (TokenID id : tokens) {
|
||||||
|
std::cout << id << " ('" << (inv_vocab.count(id) ? inv_vocab.at(id) : "<?>") << "') ";
|
||||||
|
}
|
||||||
|
std::cout << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
void BPETokenizer::Impl::log_final_tokens(const std::vector<TokenID>& tokens) const {
|
||||||
|
if (!debug_logging) return;
|
||||||
|
std::cout << "[ENCODE] Final tokens: ";
|
||||||
|
for (TokenID id : tokens) {
|
||||||
|
std::cout << id << " ";
|
||||||
|
}
|
||||||
|
std::cout << std::endl;
|
||||||
|
std::cout << "[ENCODE] Final tokens with text: ";
|
||||||
|
for (TokenID id : tokens) {
|
||||||
|
std::cout << id << ":'" << (inv_vocab.count(id) ? inv_vocab.at(id) : "<?>") << "' ";
|
||||||
|
}
|
||||||
|
std::cout << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
void BPETokenizer::Impl::log_decode_start(const std::vector<TokenID>& tokens) const {
|
||||||
|
if (!debug_logging) return;
|
||||||
|
std::cout << "[DECODE] Starting decoding of " << tokens.size() << " tokens: ";
|
||||||
|
for (TokenID id : tokens) {
|
||||||
|
std::cout << id << " ";
|
||||||
|
}
|
||||||
|
std::cout << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
void BPETokenizer::Impl::log_token_decoding(TokenID token_id, const std::string& decoded) const {
|
||||||
|
if (!debug_logging) return;
|
||||||
|
std::string token_text = inv_vocab.count(token_id) ? inv_vocab.at(token_id) : "<?>";
|
||||||
|
std::cout << "[DECODE] Token " << token_id << ":'" << token_text << "' → '" << decoded << "'" << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
void BPETokenizer::Impl::log_final_decoding(const std::string& text) const {
|
||||||
|
if (!debug_logging) return;
|
||||||
|
std::cout << "[DECODE] Final result: '" << text << "'" << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add debug methods to the BPETokenizer class
void BPETokenizer::enable_debug_logging(bool enable) {
    pimpl_->debug_logging = enable;
}

void BPETokenizer::dump_vocabulary() const {
    std::cout << "=== VOCABULARY DUMP ===" << std::endl;
    std::cout << "Size: " << pimpl_->vocab.size() << std::endl;

    // Create a sorted list for better readability
    std::vector<std::pair<std::string, TokenID>> sorted_vocab;
    for (const auto& entry : pimpl_->vocab) {
        sorted_vocab.emplace_back(entry.first, entry.second);
    }

    std::sort(sorted_vocab.begin(), sorted_vocab.end(),
              [](const auto& a, const auto& b) { return a.second < b.second; });

    for (const auto& entry : sorted_vocab) {
        std::string display = entry.first;
        // Replace non-printable characters
        for (char& c : display) {
            if (c < 32 || c > 126) {
                c = '?';
            }
        }
        std::cout << std::setw(6) << entry.second << ": '" << display << "'";
        if (entry.first != display) {
            std::cout << " (original: ";
            for (unsigned char c : entry.first) {
                if (c >= 32 && c <= 126) {
                    std::cout << c;
                } else {
                    std::cout << "\\x" << std::hex << std::setw(2) << std::setfill('0')
                              << static_cast<int>(c) << std::dec;
                }
            }
            std::cout << ")";
        }
        std::cout << std::endl;
    }
    std::cout << "=== END VOCABULARY DUMP ===" << std::endl;
}
void BPETokenizer::dump_merges() const {
    std::cout << "=== MERGES DUMP ===" << std::endl;
    std::cout << "Number of merges: " << pimpl_->merges.size() << std::endl;

    for (const auto& merge : pimpl_->merges) {
        const auto& pair = merge.first;
        TokenID new_id = merge.second;

        std::string first_str = pimpl_->inv_vocab.count(pair.first)
            ? pimpl_->inv_vocab.at(pair.first) : "<?>";
        std::string second_str = pimpl_->inv_vocab.count(pair.second)
            ? pimpl_->inv_vocab.at(pair.second) : "<?>";
        std::string new_str = pimpl_->inv_vocab.count(new_id)
            ? pimpl_->inv_vocab.at(new_id) : "<?>";

        std::cout << "(" << pair.first << ":'" << first_str << "', "
                  << pair.second << ":'" << second_str << "') → "
                  << new_id << ":'" << new_str << "'" << std::endl;
    }
    std::cout << "=== END MERGES DUMP ===" << std::endl;
}
BPETokenizer::BPETokenizer() : pimpl_(new Impl) {
    pimpl_->initialize_vocab();
}

BPETokenizer::~BPETokenizer() = default;
void BPETokenizer::Impl::initialize_vocab() {
    vocab.reserve(65536);
    inv_vocab.reserve(65536);
    special_tokens.reserve(256);
    merges.reserve(30000);

    // Add bytes
    for (int i = 0; i < 256; i++) {
        std::string token(1, static_cast<char>(i));
        vocab.emplace(token, next_token_id);
        inv_vocab.emplace(next_token_id++, std::move(token));
    }

    // Add space token
    vocab[" "] = next_token_id;
    inv_vocab[next_token_id] = " ";
    next_token_id++;

    // Add special tokens
    vocab["<unk>"] = next_token_id;
    inv_vocab[next_token_id] = "<unk>";
    special_tokens["<unk>"] = next_token_id;
    unk_token_id = next_token_id++;

    vocab["<pad>"] = next_token_id;
    inv_vocab[next_token_id] = "<pad>";
    special_tokens["<pad>"] = next_token_id;
    pad_token_id = next_token_id++;

    vocab["<eos>"] = next_token_id;
    inv_vocab[next_token_id] = "<eos>";
    special_tokens["<eos>"] = next_token_id;
    eos_token_id = next_token_id++;

    unknown_token_id = unk_token_id;
}
void BPETokenizer::Impl::perform_merge_on_sequences(
    const std::pair<TokenID, TokenID>& pair,
    TokenID new_token_id,
    std::vector<std::pair<std::vector<TokenID>, int>>& tokenized_corpus) {

    // Create new token
    std::string new_token = this->inv_vocab.at(pair.first) + this->inv_vocab.at(pair.second);

    // Add to vocabulary
    this->vocab[new_token] = new_token_id;
    this->inv_vocab[new_token_id] = new_token;
    this->merges[pair] = new_token_id;

    // Apply merge to all sequences
    for (auto& [sequence, count] : tokenized_corpus) {
        std::vector<TokenID> new_sequence;
        new_sequence.reserve(sequence.size());

        for (size_t i = 0; i < sequence.size(); i++) {
            if (i < sequence.size() - 1 &&
                sequence[i] == pair.first &&
                sequence[i+1] == pair.second) {
                new_sequence.push_back(new_token_id);
                i++; // Skip the next token
            } else {
                new_sequence.push_back(sequence[i]);
            }
        }

        sequence = std::move(new_sequence);
    }
}
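A small self-contained illustration of the pair-replacement pass above (a hypothetical sketch, not part of the commit): with the merge (1, 2) → 5, the sequence [1, 2, 1, 2, 3] collapses to [5, 5, 3] in one left-to-right pass.

// Hypothetical illustration of the pair-replacement pass used above.
#include <cstdint>
#include <vector>
#include <cassert>

int main() {
    using TokenID = std::uint32_t;
    std::vector<TokenID> seq = {1, 2, 1, 2, 3};
    const TokenID first = 1, second = 2, merged = 5;

    std::vector<TokenID> out;
    out.reserve(seq.size());
    for (std::size_t i = 0; i < seq.size(); i++) {
        if (i + 1 < seq.size() && seq[i] == first && seq[i + 1] == second) {
            out.push_back(merged);
            i++; // skip the consumed second element
        } else {
            out.push_back(seq[i]);
        }
    }
    assert((out == std::vector<TokenID>{5, 5, 3}));
    return 0;
}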
std::vector<std::string> BPETokenizer::Impl::split_text(const std::string& text) const {
    if (normalization_enabled) {
        if (cache_enabled) {
            return unicode_cache.get_split(unicode_cache.get_normalized(text));
        } else {
            std::string normalized = unicode::normalize(text);
            return unicode::unicode_split(normalized);
        }
    } else {
        std::vector<std::string> words;
        std::istringstream iss(text);
        std::string word;

        // Preallocate based on text size
        words.reserve(text.size() / 6); // Average word length ~6 characters

        while (iss >> word) {
            words.push_back(std::move(word));
        }

        return words;
    }
}
void BPETokenizer::Impl::count_word_frequencies(
    const std::vector<std::string>& words,
    std::unordered_map<std::string, int>& word_counts) const {

    // Preallocate based on expected unique words
    word_counts.reserve(words.size() / 10); // Assume 10% unique words

    for (const auto& word : words) {
        // Use emplace for more efficient insertion
        auto result = word_counts.emplace(word, 1);
        if (!result.second) {
            result.first->second++;
        }
    }
}
void BPETokenizer::Impl::perform_merge(const std::pair<TokenID, TokenID>& pair, TokenID new_token_id,
                                       std::unordered_map<std::string, int>& word_counts) {
    std::string new_token = this->inv_vocab.at(pair.first) + this->inv_vocab.at(pair.second);

    // Add new token to vocabulary
    this->vocab[new_token] = new_token_id;
    this->inv_vocab[new_token_id] = new_token;
    this->merges[pair] = new_token_id;

    // Update word counts by replacing occurrences of the pair
    std::unordered_map<std::string, int> new_word_counts;

    for (const auto& [word, count] : word_counts) {
        std::string new_word;
        size_t pos = 0;

        while (pos < word.size()) {
            // Check if we found the pair at this position
            size_t first_len = this->inv_vocab.at(pair.first).size();
            size_t second_len = this->inv_vocab.at(pair.second).size();

            if (pos + first_len + second_len <= word.size() &&
                word.substr(pos, first_len) == this->inv_vocab.at(pair.first) &&
                word.substr(pos + first_len, second_len) == this->inv_vocab.at(pair.second)) {
                new_word += new_token;
                pos += first_len + second_len;
            } else {
                new_word += word[pos];
                pos++;
            }
        }

        new_word_counts[new_word] += count;
    }

    word_counts = std::move(new_word_counts);
}
std::vector<TokenID> BPETokenizer::Impl::handle_invalid_utf8(const std::string& text) const {
    std::vector<TokenID> tokens;
    tokens.reserve(text.size());

    for (size_t i = 0; i < text.size(); i++) {
        unsigned char c = text[i];

        // If it's a valid ASCII character, encode normally
        if (c <= 0x7F) {
            std::string char_str(1, static_cast<char>(c));
            if (auto it = vocab.find(char_str); it != vocab.end()) {
                tokens.push_back(it->second);
            } else {
                tokens.push_back(unknown_token_id);
            }
        } else {
            // Non-ASCII byte in an invalid sequence: use byte fallback or the unknown token
            if (byte_fallback_enabled) {
                // Encode the byte individually
                std::string byte_str(1, static_cast<char>(c));
                if (auto it = vocab.find(byte_str); it != vocab.end()) {
                    tokens.push_back(it->second);
                } else {
                    tokens.push_back(unknown_token_id);
                }
            } else {
                tokens.push_back(unknown_token_id);
            }
        }
    }

    return tokens;
}
void BPETokenizer::train(const std::vector<std::string>& corpus, size_t vocab_size) {
    size_t start_memory = get_peak_memory_usage();

    if (corpus.empty()) {
        throw std::invalid_argument("Corpus cannot be empty");
    }

    // Disable caching during training as vocabulary changes frequently
    pimpl_->enable_caching(false);

    // Tokenize the entire corpus into token sequences with frequencies
    std::vector<std::pair<std::vector<TokenID>, int>> tokenized_corpus;
    std::unordered_map<std::vector<TokenID>, int, VectorHash> sequence_counts;

    // Split each text into words and tokenize each word, validating UTF-8 here
    // so that skipped texts never reach tokenization
    for (const auto& text : corpus) {
        if (!is_valid_utf8_impl(text.data(), text.size())) {
            std::cerr << "Warning: skipping invalid UTF-8 text in training corpus" << std::endl;
            continue;
        }
        auto words = pimpl_->split_text(text);
        for (const auto& word : words) {
            // Convert word to initial token sequence (characters)
            auto tokens = pimpl_->word_to_token_ids(word);

            // Count frequency of this token sequence
            sequence_counts[tokens]++;
        }
    }

    // Convert to vector for easier processing
    tokenized_corpus.reserve(sequence_counts.size());
    for (const auto& [sequence, count] : sequence_counts) {
        tokenized_corpus.emplace_back(sequence, count);
    }

    // Clear the temporary map to save memory
    sequence_counts.clear();

    // BPE training algorithm with safety limit
    int iteration = 0;
    int max_iterations = 10000;

    // Pre-allocate pair counts
    std::unordered_map<std::pair<TokenID, TokenID>, int, PairHash> pair_counts;
    pair_counts.reserve(1000000); // Reserve space for 1M pairs

    while (pimpl_->vocab.size() < vocab_size && iteration < max_iterations) {
        // Count pairs in token sequences
        pair_counts.clear();
        pimpl_->get_pair_counts_from_sequences(tokenized_corpus, pair_counts);

        if (pair_counts.empty()) {
            std::cout << "No more pairs to merge. Stopping early." << std::endl;
            break;
        }

        // Find the most frequent pair
        auto max_pair = std::max_element(
            pair_counts.begin(), pair_counts.end(),
            [](const auto& a, const auto& b) { return a.second < b.second; }
        );

        // Debug output - show what we're merging
        if (pimpl_->debug_logging) {
            std::string first_str = pimpl_->inv_vocab.count(max_pair->first.first) ?
                pimpl_->inv_vocab.at(max_pair->first.first) : "<?>";
            std::string second_str = pimpl_->inv_vocab.count(max_pair->first.second) ?
                pimpl_->inv_vocab.at(max_pair->first.second) : "<?>";
            std::cout << "Iteration " << iteration
                      << ": Merging '" << first_str << "' + '" << second_str
                      << "' → count: " << max_pair->second << std::endl;
        }

        // Perform merge on token sequences
        pimpl_->perform_merge_on_sequences(max_pair->first, pimpl_->next_token_id, tokenized_corpus);
        pimpl_->next_token_id++;
        iteration++;

        // Periodically report memory usage
        if (iteration % 500 == 0) {
            size_t current_memory = get_peak_memory_usage();
            std::cout << "Memory after " << iteration << " iterations: "
                      << (current_memory - start_memory) / (1024 * 1024) << "MB\n";
            std::cout << "Vocabulary size: " << pimpl_->vocab.size() << std::endl;
        }
    }

    if (iteration >= max_iterations) {
        std::cout << "Reached maximum iterations. Stopping training." << std::endl;
    }

    // Re-enable caching after training
    pimpl_->enable_caching(true);

    size_t end_memory = get_peak_memory_usage();
    std::cout << "Training completed in " << iteration << " iterations\n";
    std::cout << "Peak memory used: " << (end_memory - start_memory) / (1024 * 1024) << "MB\n";
    std::cout << "Final vocabulary size: " << pimpl_->vocab.size() << std::endl;
}
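A minimal driver for the trainer above; the header path and the toy corpus are assumptions for illustration, not part of this commit.

// Hypothetical driver; header path and corpus contents are illustrative assumptions.
#include "lm/tokenizer/bpe_tokenizer.hpp"
#include <string>
#include <vector>

int main() {
    lm::BPETokenizer tokenizer;
    std::vector<std::string> corpus = {
        "the quick brown fox jumps over the lazy dog",
        "the quick brown fox is quick"
    };
    // Merges stop once the vocabulary reaches 512 tokens or no pairs remain.
    tokenizer.train(corpus, /*vocab_size=*/512);
    return 0;
}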
void BPETokenizer::Impl::get_pair_counts(
    const std::unordered_map<std::string, int>& word_counts,
    std::unordered_map<std::pair<TokenID, TokenID>, int, PairHash>& pair_counts) const {

    pair_counts.clear();
    pair_counts.reserve(word_counts.size() * 10);

    for (const auto& [word, count] : word_counts) {
        // Tokenize the word using the current vocabulary
        auto tokens = word_to_token_ids(word);

        // Count adjacent pairs in the tokenized representation
        // (i + 1 < size avoids unsigned underflow when tokens is empty)
        for (size_t i = 0; i + 1 < tokens.size(); i++) {
            auto pair = std::make_pair(tokens[i], tokens[i+1]);
            pair_counts[pair] += count;
        }
    }
}
std::vector<TokenID> BPETokenizer::Impl::word_to_token_ids(const std::string& word) const {
    std::vector<TokenID> tokens;

    if (normalization_enabled) {
        // Use Unicode-aware splitting
        std::vector<std::string> characters;
        if (cache_enabled) {
            characters = unicode_cache.get_split(word);
        } else {
            characters = unicode::unicode_split(word);
        }

        for (const auto& character : characters) {
            if (auto it = vocab.find(character); it != vocab.end()) {
                tokens.push_back(it->second);
            } else if (byte_fallback_enabled) {
                // Fall back to byte encoding for unknown characters
                for (unsigned char c : character) {
                    std::string byte_str(1, static_cast<char>(c));
                    if (auto byte_it = vocab.find(byte_str); byte_it != vocab.end()) {
                        tokens.push_back(byte_it->second);
                    } else {
                        tokens.push_back(unknown_token_id);
                    }
                }
            } else {
                tokens.push_back(unknown_token_id);
            }
        }
    } else {
        // Non-Unicode mode: treat as ASCII
        for (char c : word) {
            std::string token(1, c);
            if (auto it = vocab.find(token); it != vocab.end()) {
                tokens.push_back(it->second);
            } else {
                tokens.push_back(unknown_token_id);
            }
        }
    }

    return tokens;
}
size_t BPETokenizer::vocab_size() const {
    return pimpl_->vocab.size();
}
std::vector<TokenID> BPETokenizer::encode(const std::string& text) const {
    pimpl_->log_encode_start(text);

    // Validate UTF-8 before processing
    if (!is_valid_utf8_impl(text.data(), text.size())) {
        if (pimpl_->byte_fallback_enabled) {
            return pimpl_->handle_invalid_utf8(text);
        } else {
            return {pimpl_->unknown_token_id};
        }
    }

    // Normalize the text first
    std::string normalized = pimpl_->normalization_enabled ?
        pimpl_->unicode_cache.get_normalized(text) : text;

    // Split into words
    auto words = pimpl_->split_text(normalized);
    pimpl_->log_word_split(words);

    std::vector<TokenID> tokens;

    for (const auto& word : words) {
        // Convert word to initial tokens (characters)
        auto word_tokens = pimpl_->word_to_token_ids(word);
        pimpl_->log_word_tokens(word, word_tokens);

        // Apply BPE merges until no adjacent pair matches a learned merge
        bool changed;
        do {
            changed = false;
            // (i + 1 < size avoids unsigned underflow for empty words)
            for (size_t i = 0; i + 1 < word_tokens.size(); i++) {
                auto pair = std::make_pair(word_tokens[i], word_tokens[i+1]);
                if (auto it = pimpl_->merges.find(pair); it != pimpl_->merges.end()) {
                    // Replace the pair with the merged token
                    word_tokens[i] = it->second;
                    word_tokens.erase(word_tokens.begin() + i + 1);
                    changed = true;
                    pimpl_->log_merge_result(word_tokens);
                    // Restart from the beginning to catch newly created pairs
                    i = 0;
                }
            }
        } while (changed);

        tokens.insert(tokens.end(), word_tokens.begin(), word_tokens.end());

        // Don't add a space between words - the original text already
        // carries its spacing through the split
    }

    pimpl_->log_final_tokens(tokens);
    return tokens;
}
std::string BPETokenizer::decode(const std::vector<TokenID>& tokens) const {
    pimpl_->log_decode_start(tokens);

    std::string text;
    text.reserve(tokens.size() * 3);

    for (TokenID token_id : tokens) {
        std::string token_text;
        if (pimpl_->inv_vocab.find(token_id) != pimpl_->inv_vocab.end()) {
            token_text = pimpl_->inv_vocab.at(token_id);
        } else {
            token_text = pimpl_->unknown_token;
        }

        pimpl_->log_token_decoding(token_id, token_text);

        // Directly append the token text without adding spaces
        text += token_text;
    }

    pimpl_->log_final_decoding(text);
    return text;
}
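A short usage sketch for the encode/decode pair above (hedged: the header path is assumed, and the tokenizer is assumed to be already trained or loaded):

// Hypothetical usage of the methods defined above.
#include "lm/tokenizer/bpe_tokenizer.hpp"
#include <iostream>

void demo(lm::BPETokenizer& tokenizer) {
    tokenizer.enable_debug_logging(true);         // trace merges and decoding
    auto ids = tokenizer.encode("hello world");   // word split, then greedy merges
    std::string text = tokenizer.decode(ids);     // concatenates token strings
    std::cout << text << std::endl;
}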
bool BPETokenizer::save(const std::string& filename) const {
    std::ofstream file(filename);
    if (!file.is_open()) {
        return false;
    }

    // Save vocabulary
    file << pimpl_->vocab.size() << "\n";
    for (const auto& [token, id] : pimpl_->vocab) {
        file << id << " " << token << "\n";
    }

    // Save merges
    file << pimpl_->merges.size() << "\n";
    for (const auto& [pair, new_id] : pimpl_->merges) {
        file << pair.first << " " << pair.second << " " << new_id << "\n";
    }

    return true;
}
bool BPETokenizer::load(const std::string& filename) {
    std::ifstream file(filename);
    if (!file.is_open()) {
        return false;
    }

    // Clear existing data
    pimpl_->vocab.clear();
    pimpl_->inv_vocab.clear();
    pimpl_->merges.clear();

    // Load vocabulary
    size_t vocab_size;
    file >> vocab_size;
    for (size_t i = 0; i < vocab_size; i++) {
        TokenID id;
        std::string token;
        file >> id;
        std::getline(file, token);
        // Remove the leading space left by operator>>
        if (!token.empty() && token[0] == ' ') {
            token = token.substr(1);
        }
        pimpl_->vocab[token] = id;
        pimpl_->inv_vocab[id] = token;
    }

    // Load merges
    size_t merge_count;
    file >> merge_count;
    for (size_t i = 0; i < merge_count; i++) {
        TokenID first, second, new_id;
        file >> first >> second >> new_id;
        pimpl_->merges[{first, second}] = new_id;
    }

    return true;
}
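A quick persistence round trip over the plain-text format written by save() above (a sketch; the file name is a placeholder):

// Hypothetical round trip over the save()/load() text format above.
#include "lm/tokenizer/bpe_tokenizer.hpp"
#include <cassert>

void roundtrip(const lm::BPETokenizer& trained) {
    assert(trained.save("tokenizer.model"));

    lm::BPETokenizer restored;
    assert(restored.load("tokenizer.model"));
    assert(restored.vocab_size() == trained.vocab_size());
}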
// Special token method implementations
TokenID BPETokenizer::eos_token_id() const {
    return pimpl_->eos_token_id;
}

void BPETokenizer::set_eos_token_id(TokenID id) {
    pimpl_->eos_token_id = id;
}

TokenID BPETokenizer::pad_token_id() const {
    return pimpl_->pad_token_id;
}

void BPETokenizer::set_pad_token_id(TokenID id) {
    pimpl_->pad_token_id = id;
}

TokenID BPETokenizer::unk_token_id() const {
    return pimpl_->unk_token_id;
}

void BPETokenizer::set_unk_token_id(TokenID id) {
    pimpl_->unk_token_id = id;
}

void BPETokenizer::add_special_token(const std::string& token, TokenID id) {
    pimpl_->vocab[token] = id;
    pimpl_->inv_vocab[id] = token;
    pimpl_->special_tokens[token] = id;

    // Update the specific token ID if it matches known types
    if (token == "<eos>" || token == "</s>") {
        pimpl_->eos_token_id = id;
    } else if (token == "<pad>") {
        pimpl_->pad_token_id = id;
    } else if (token == "<unk>") {
        pimpl_->unk_token_id = id;
    }
}

} // namespace lm
128  src/tokenizer/unicode_utils (copy 1).cpp  (executable file)
@ -0,0 +1,128 @@
// src/tokenizer/unicode_utils.cpp
#include "lm/tokenizer/unicode_utils.hpp"
#include <unicode/uchar.h>
#include <unicode/unistr.h>
#include <unicode/normlzr.h>
#include <unicode/ustring.h>
#include <stdexcept>
#include <algorithm>

namespace lm::unicode {

bool is_whitespace(uint32_t codepoint) {
    return u_isUWhiteSpace(codepoint);
}

bool is_punctuation(uint32_t codepoint) {
    return u_ispunct(codepoint);
}

bool is_control(uint32_t codepoint) {
    return u_iscntrl(codepoint);
}

std::string normalize(const std::string& text) {
    try {
        icu::UnicodeString unicode_str = icu::UnicodeString::fromUTF8(text);
        icu::UnicodeString normalized;
        UErrorCode status = U_ZERO_ERROR;

        icu::Normalizer::normalize(unicode_str, UNORM_NFC, 0, normalized, status);

        if (U_FAILURE(status)) {
            throw std::runtime_error("Unicode normalization failed");
        }

        std::string result;
        normalized.toUTF8String(result);
        return result;
    } catch (const std::exception& e) {
        throw std::runtime_error("Unicode normalization error: " + std::string(e.what()));
    }
}
std::vector<CodePoint> to_code_points(const std::string& text) {
    std::vector<CodePoint> code_points;

    for (size_t i = 0; i < text.size(); ) {
        CodePoint cp;
        UChar32 codepoint;  // signed, so the error check below is well-defined
        int offset = 0;

        // Decode UTF-8; U8_NEXT advances i past the code point (or past the bad byte)
        U8_NEXT(text.c_str(), i, text.size(), codepoint);

        if (codepoint < 0) {
            // Handle invalid UTF-8 gracefully instead of throwing:
            // substitute the replacement character (U+FFFD, UTF-8 bytes EF BF BD)
            cp.value = 0xFFFD;
            cp.utf8 = "\xEF\xBF\xBD";
            code_points.push_back(cp);
            continue; // i was already advanced by U8_NEXT
        }

        // Get the UTF-8 bytes for this code point
        char utf8_buf[5] = {0};
        U8_APPEND_UNSAFE(utf8_buf, offset, codepoint);

        cp.value = codepoint;
        cp.utf8 = std::string(utf8_buf, offset);
        code_points.push_back(cp);
        // No extra increment here: U8_NEXT already moved i forward
    }

    return code_points;
}
std::string from_code_points(const std::vector<CodePoint>& code_points) {
    std::string result;
    for (const auto& cp : code_points) {
        result += cp.utf8;
    }
    return result;
}
// Remove the "unicode::" qualification - we're already in the lm::unicode namespace
|
||||||
|
std::vector<std::string> unicode_split(const std::string& text) {
|
||||||
|
std::vector<std::string> characters;
|
||||||
|
int i = 0;
|
||||||
|
while (i < text.length()) {
|
||||||
|
int char_len = 1;
|
||||||
|
// Check for UTF-8 multi-byte characters
|
||||||
|
if ((text[i] & 0x80) == 0) {
|
||||||
|
// ASCII character
|
||||||
|
char_len = 1;
|
||||||
|
} else if ((text[i] & 0xE0) == 0xC0) {
|
||||||
|
// 2-byte UTF-8 character
|
||||||
|
char_len = 2;
|
||||||
|
} else if ((text[i] & 0xF0) == 0xE0) {
|
||||||
|
// 3-byte UTF-8 character
|
||||||
|
char_len = 3;
|
||||||
|
} else if ((text[i] & 0xF8) == 0xF0) {
|
||||||
|
// 4-byte UTF-8 character
|
||||||
|
char_len = 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
characters.push_back(text.substr(i, char_len));
|
||||||
|
i += char_len;
|
||||||
|
}
|
||||||
|
return characters;
|
||||||
|
}
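As a check on the lead-byte rule above: 'é' is the two-byte sequence 0xC3 0xA9, so "héllo" splits into five characters. A self-contained sketch of the same bit tests:

// Hypothetical standalone check of the UTF-8 lead-byte length rule used above.
#include <cassert>
#include <string>
#include <vector>

int main() {
    std::string text = "h\xC3\xA9llo"; // "héllo"
    std::vector<std::string> expected = {"h", "\xC3\xA9", "l", "l", "o"};

    std::vector<std::string> out;
    for (std::size_t i = 0; i < text.size(); ) {
        std::size_t len = 1;
        unsigned char lead = text[i];
        if ((lead & 0xE0) == 0xC0) len = 2;
        else if ((lead & 0xF0) == 0xE0) len = 3;
        else if ((lead & 0xF8) == 0xF0) len = 4;
        out.push_back(text.substr(i, len));
        i += len;
    }
    assert(out == expected);
    return 0;
}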
std::vector<std::string> split_on_character_boundaries(const std::string& text) {
    std::vector<std::string> characters;
    auto code_points = to_code_points(text);

    for (const auto& cp : code_points) {
        characters.push_back(cp.utf8);
    }

    return characters;
}

} // namespace lm::unicode
140  src/training/data_loader.cpp  (new file)
@ -0,0 +1,140 @@
// src/training/data_loader.cpp
#include "data_loader.hpp"
#include <fstream>
#include <sstream>
#include <iostream>
#include <random>
#include <algorithm>

namespace lm {

ConversationDataLoader::ConversationDataLoader(const std::string& file_path,
                                               BPETokenizer& tokenizer,
                                               size_t batch_size,
                                               size_t seq_length)
    : tokenizer_(tokenizer), batch_size_(batch_size), seq_length_(seq_length),
      current_index_(0) {
    load_conversations(file_path);
}

void ConversationDataLoader::load_conversations(const std::string& file_path) {
    std::ifstream file(file_path);
    if (!file.is_open()) {
        throw std::runtime_error("Failed to open conversation data file: " + file_path);
    }

    std::string line;
    while (std::getline(file, line)) {
        if (!line.empty()) {
            auto tokens = tokenize_conversation(line);
            if (!tokens.empty()) {
                conversations_.push_back(tokens);
            }
        }
    }

    if (conversations_.empty()) {
        throw std::runtime_error("No conversations loaded from file: " + file_path);
    }

    // Shuffle conversations for better training
    std::random_device rd;
    std::mt19937 g(rd());
    std::shuffle(conversations_.begin(), conversations_.end(), g);

    std::cout << "Loaded " << conversations_.size() << " conversations" << std::endl;
}
std::vector<int> ConversationDataLoader::tokenize_conversation(const std::string& conversation) {
    // Simple conversation format: User: Hello|AI: Hi there|User: How are you?
    // Split on '|' and tokenize each part

    std::vector<int> all_tokens;
    std::stringstream ss(conversation);
    std::string part;

    while (std::getline(ss, part, '|')) {
        if (!part.empty()) {
            auto tokens = tokenizer_.encode(part);
            all_tokens.insert(all_tokens.end(), tokens.begin(), tokens.end());

            // Add separator token (assuming 3 is SEP)
            all_tokens.push_back(3);
        }
    }

    // Remove the last separator if present
    if (!all_tokens.empty() && all_tokens.back() == 3) {
        all_tokens.pop_back();
    }

    return all_tokens;
}
bool ConversationDataLoader::has_next() const {
    return current_index_ < conversations_.size();
}
std::pair<Tensor, Tensor> ConversationDataLoader::next_batch() {
    if (!has_next()) {
        throw std::out_of_range("No more batches available");
    }

    size_t end_index = std::min(current_index_ + batch_size_, conversations_.size());
    size_t actual_batch_size = end_index - current_index_;

    // Find the maximum sequence length in this batch
    size_t max_seq_len = 0;
    for (size_t i = current_index_; i < end_index; i++) {
        max_seq_len = std::max(max_seq_len, conversations_[i].size());
    }

    // Cap it at the configured sequence length
    max_seq_len = std::min(max_seq_len, seq_length_);

    // Create input and target tensors
    Tensor inputs({actual_batch_size, max_seq_len}, false);
    Tensor targets({actual_batch_size, max_seq_len}, false);

    // Fill the tensors with data
    for (size_t i = 0; i < actual_batch_size; i++) {
        const auto& tokens = conversations_[current_index_ + i];
        size_t seq_len = std::min(tokens.size(), max_seq_len);

        for (size_t j = 0; j < seq_len; j++) {
            inputs(i, j) = static_cast<float>(tokens[j]);

            // For language modeling, the target is the next token
            if (j < seq_len - 1) {
                targets(i, j) = static_cast<float>(tokens[j + 1]);
            } else {
                targets(i, j) = -100.0f; // Standard value for ignored indices in loss
            }
        }

        // Pad the rest of the sequence if needed
        for (size_t j = seq_len; j < max_seq_len; j++) {
            inputs(i, j) = 0.0f;      // Pad token ID (assuming 0 is pad)
            targets(i, j) = -100.0f;  // Ignore in loss
        }
    }

    current_index_ = end_index;
    return {inputs, targets};
}
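A typical epoch loop over the loader (a sketch; the file path and hyperparameters are placeholders):

// Hypothetical epoch loop; "conversations.txt" and the sizes are placeholders.
#include "data_loader.hpp"

void run_epoch(lm::BPETokenizer& tokenizer) {
    lm::ConversationDataLoader loader("conversations.txt", tokenizer,
                                      /*batch_size=*/16, /*seq_length=*/256);
    while (loader.has_next()) {
        auto [inputs, targets] = loader.next_batch();
        // forward pass / loss / backward pass / optimizer step would go here
    }
    loader.reset(); // reshuffles for the next epoch
}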
void ConversationDataLoader::reset() {
    current_index_ = 0;

    // Reshuffle for the next epoch
    std::random_device rd;
    std::mt19937 g(rd());
    std::shuffle(conversations_.begin(), conversations_.end(), g);
}
size_t ConversationDataLoader::num_batches() const {
    return (conversations_.size() + batch_size_ - 1) / batch_size_;
}

} // namespace lm
78  src/training/losses.cpp  (new file)
@ -0,0 +1,78 @@
// src/training/losses.cpp
#include "losses.hpp"
#include <cmath>
#include <stdexcept>

namespace lm {

Tensor cross_entropy_loss(const Tensor& logits, const Tensor& targets, const Tensor& mask) {
    if (logits.shape().size() != 3) {
        throw std::invalid_argument("Logits must be 3D tensor [batch, seq_len, vocab_size]");
    }

    if (targets.shape().size() != 2) {
        throw std::invalid_argument("Targets must be 2D tensor [batch, seq_len]");
    }

    size_t batch_size = logits.shape()[0];
    size_t seq_len = logits.shape()[1];
    size_t vocab_size = logits.shape()[2];

    if (targets.shape()[0] != batch_size || targets.shape()[1] != seq_len) {
        throw std::invalid_argument("Logits and targets must have compatible shapes");
    }

    // Create output tensor
    Tensor loss({batch_size, seq_len}, false);

    // Compute cross-entropy loss
    for (size_t b = 0; b < batch_size; b++) {
        for (size_t s = 0; s < seq_len; s++) {
            int target_idx = static_cast<int>(targets(b, s));

            // Skip padded positions (target = -100)
            if (target_idx == -100) {
                loss(b, s) = 0.0f;
                continue;
            }

            if (target_idx < 0 || target_idx >= static_cast<int>(vocab_size)) {
                throw std::out_of_range("Target index out of vocabulary range");
            }

            // Compute softmax and cross-entropy for this position,
            // subtracting the max logit for numerical stability
            float max_logit = logits(b, s, 0);
            for (size_t v = 1; v < vocab_size; v++) {
                if (logits(b, s, v) > max_logit) {
                    max_logit = logits(b, s, v);
                }
            }

            float sum_exp = 0.0f;
            for (size_t v = 0; v < vocab_size; v++) {
                sum_exp += std::exp(logits(b, s, v) - max_logit);
            }

            float log_softmax = logits(b, s, target_idx) - max_logit - std::log(sum_exp);
            loss(b, s) = -log_softmax;
        }
    }

    // If a mask is provided, apply it
    if (mask.shape().size() > 0) {
        if (mask.shape()[0] != batch_size || mask.shape()[1] != seq_len) {
            throw std::invalid_argument("Mask must have same shape as loss");
        }

        for (size_t b = 0; b < batch_size; b++) {
            for (size_t s = 0; s < seq_len; s++) {
                loss(b, s) *= mask(b, s);
            }
        }
    }

    return loss;
}
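For reference, the per-position value computed above is the max-shifted log-softmax cross-entropy:

$$\ell_{b,s} = -\log\frac{e^{z_{b,s,t}}}{\sum_{v} e^{z_{b,s,v}}} = -\Bigl(z_{b,s,t} - m_{b,s} - \log\sum_{v} e^{z_{b,s,v} - m_{b,s}}\Bigr), \qquad m_{b,s} = \max_{v} z_{b,s,v}$$

where $z$ are the logits and $t$ is the target index at position $(b, s)$; subtracting the maximum leaves the value unchanged while keeping every exponent non-positive, which avoids overflow in std::exp.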
} // namespace lm
65  src/training/trainer (copy 1).cpp  (new file)
@ -0,0 +1,65 @@
// src/training/trainer.cpp
#include "lm/training/trainer.hpp"
#include <fstream>
#include <iostream>                    // for std::cout in the training loop
#include <cereal/archives/binary.hpp>  // for Binary{Input,Output}Archive
#include <cereal/types/vector.hpp>

namespace lm {
namespace training {

Trainer::Trainer(LanguageModel& model, AdamOptimizer& optimizer)
    : model(model), optimizer(optimizer) {}

void Trainer::train(const std::vector<std::string>& corpus,
                    size_t num_epochs,
                    size_t batch_size,
                    size_t sequence_length) {
    // Parameters are unused until the loop below is implemented
    (void)corpus; (void)batch_size; (void)sequence_length;

    // Simplified training loop
    for (size_t epoch = 0; epoch < num_epochs; epoch++) {
        // For each batch in the corpus:
        // 1. Tokenize the batch
        // 2. Forward pass
        // 3. Compute loss
        // 4. Backward pass
        // 5. Optimizer step

        // Placeholder implementation
        std::cout << "Training epoch " << epoch + 1 << "/" << num_epochs << std::endl;
    }
}

void Trainer::save_checkpoint(const std::string& path,
                              const TrainingCheckpoint& checkpoint) const {
    std::ofstream ofs(path, std::ios::binary);
    cereal::BinaryOutputArchive archive(ofs);

    // Save training state
    archive(checkpoint);

    // Save model parameters
    auto params = model.get_parameters();
    archive(params);

    // Save optimizer state
    optimizer.save_state(path + ".optim");
}

TrainingCheckpoint Trainer::load_checkpoint(const std::string& path) {
    std::ifstream ifs(path, std::ios::binary);
    cereal::BinaryInputArchive archive(ifs);

    TrainingCheckpoint checkpoint;
    archive(checkpoint);

    // Load model parameters
    std::vector<Tensor> params;
    archive(params);
    model.set_parameters(params);

    // Load optimizer state
    optimizer.load_state(path + ".optim");

    return checkpoint;
}

} // namespace training
} // namespace lm