Lotta work here

2025-08-27 14:02:03 -07:00 · 2025-08-27 14:02:03 -07:00 · d89095e49b
commit d89095e49b
parent d83d07a823
34 changed files with 3207 additions and 186 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,6 +1,22 @@
 cmake_minimum_required(VERSION 3.14)
 project(lm_framework LANGUAGES CXX)

+# Check for x86-family hardware.
+# The accepted names cover 64-bit (x86_64/amd64/AMD64) and 32-bit (i686/i386)
+# spellings of CMAKE_SYSTEM_PROCESSOR. The variable is quoted so that an empty
+# value reaches list(FIND) as one (empty) argument and falls through to the
+# FATAL_ERROR below instead of producing a malformed list() call.
+set(SUPPORTED_ARCHITECTURES x86_64 amd64 AMD64 i686 i386)
+list(FIND SUPPORTED_ARCHITECTURES "${CMAKE_SYSTEM_PROCESSOR}" ARCH_INDEX)
+if(ARCH_INDEX EQUAL -1)
+    message(FATAL_ERROR "This framework requires Intel x86-64 hardware. "
+                        "Current processor architecture: ${CMAKE_SYSTEM_PROCESSOR}")
+endif()
+
+# EIGEN_LOC must be provided by the caller and must not be empty.
+# Configuration stops with a dedicated message for each of the two cases.
+if(DEFINED EIGEN_LOC)
+    if("${EIGEN_LOC}" STREQUAL "")
+        message(FATAL_ERROR "EIGEN_LOC is empty. Please set it to the path of your Eigen installation.")
+    endif()
+else()
+    message(FATAL_ERROR "This framework requires the location of the Eigen header files. "
+                        "Please set EIGEN_LOC to the path of your Eigen installation.")
+endif()
+
 # Set default build type to Release if not specified
 if(NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release)
@ -19,6 +35,7 @@ endif()
 # Include directories
 include_directories(
    ${CMAKE_CURRENT_SOURCE_DIR}/include
 )
+# Local Eigen installation. Added as a SYSTEM include directory so that
+# warnings originating inside the third-party Eigen headers are suppressed
+# rather than promoted to build failures by the project's -Werror flag.
+include_directories(SYSTEM "${EIGEN_LOC}")

 # Find dependencies
@ -37,6 +54,24 @@ FetchContent_MakeAvailable(googletest)
 # Add subdirectories
 add_subdirectory(src/tokenizer)
 add_subdirectory(src/runtime)
+add_subdirectory(src/optimizers)  # NEW: Add optimizers directory
+add_subdirectory(src/models)      # NEW: Add models directory
+add_subdirectory(src/training)    # NEW: Add training directory
+
+# Header-only core components (Tensor implementation).
+# The project's own headers are a regular include path; Eigen is attached as a
+# SYSTEM include so warnings raised inside its headers are not reported (and
+# not turned into errors by -Werror) when consumers compile.
+add_library(lm_core_components INTERFACE)
+target_include_directories(lm_core_components INTERFACE
+    "${CMAKE_CURRENT_SOURCE_DIR}/include"
+)
+target_include_directories(lm_core_components SYSTEM INTERFACE
+    "${EIGEN_LOC}"  # Local Eigen installation
+)
+
+# Header-only model components; also pulls in lm_core_components.
+add_library(lm_model INTERFACE)
+target_include_directories(lm_model INTERFACE
+    "${CMAKE_CURRENT_SOURCE_DIR}/include"
+)
+target_include_directories(lm_model SYSTEM INTERFACE
+    "${EIGEN_LOC}"  # Local Eigen installation
+)
+target_link_libraries(lm_model INTERFACE lm_core_components)

 # Main library
 add_library(lm_core
@ -47,6 +82,7 @@ add_library(lm_core
 target_link_libraries(lm_core
    PRIVATE
        lm_tokenizer
+        lm_model
        nlohmann_json::nlohmann_json
 )

@ -73,6 +109,26 @@ target_link_libraries(test_unicode_bpe
        GTest::gtest_main
 )

+# NEW: Optional test executables (optimizers, training).
+# Each executable is declared only when src/<name>.cpp is present, and links
+# against lm_core and GTest::gtest_main.
+foreach(optional_test IN ITEMS test_optimizers test_training)
+    if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/src/${optional_test}.cpp)
+        add_executable(${optional_test} src/${optional_test}.cpp)
+        target_link_libraries(${optional_test}
+            PRIVATE
+                lm_core
+                GTest::gtest_main
+        )
+    endif()
+endforeach()
+
+
 # Alpha prototype executable
 add_executable(lm_alpha
    src/alpha/repl.cpp
@ -85,8 +141,31 @@ target_link_libraries(lm_alpha
        nlohmann_json::nlohmann_json
 )

+# NEW: Training example. Declared only when examples/train_lm.cpp is present;
+# links against lm_core.
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/examples/train_lm.cpp")
+    add_executable(train_lm examples/train_lm.cpp)
+    target_link_libraries(train_lm PRIVATE lm_core)
+endif()
+
 # Install targets
 install(TARGETS lm_core DESTINATION lib)
+
+# Install the optional component libraries into lib/, skipping any whose
+# target was not defined.
+foreach(optional_lib IN ITEMS lm_optimizers lm_models lm_training)
+    if(TARGET ${optional_lib})
+        install(TARGETS ${optional_lib} DESTINATION lib)
+    endif()
+endforeach()
+
 install(DIRECTORY include/ DESTINATION include)

 # Performance testing target
@ -97,6 +176,16 @@ target_link_libraries(performance_test
        GTest::gtest_main
 )

+# Integration example executable, linked against the core library plus the
+# models, optimizers and training libraries.
+# NOTE(review): unlike the install rules, these three libraries are linked
+# without an if(TARGET ...) guard — confirm the subdirectories always define them.
+add_executable(integration_example src/integration_example.cpp)
+target_link_libraries(integration_example
+    PRIVATE lm_core lm_models lm_optimizers lm_training
+)
+
 # Add compiler warning flags
 if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -Werror")
@ -110,3 +199,31 @@ if(CMAKE_BUILD_TYPE STREQUAL "Debug")
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fprofile-instr-generate -fcoverage-mapping")
    endif()
 endif()
+
+# Verify Eigen installation
+# Build-time target: echoes the configured EIGEN_LOC, then fails if
+# ${EIGEN_LOC}/Eigen/Core is not a regular file.
+# NOTE(review): `test -f`, `||` and the parenthesised command list rely on a
+# POSIX-style shell — presumably unusable with non-POSIX generators; confirm.
+# NOTE(review): the path is unquoted, so an EIGEN_LOC containing spaces would
+# likely be split by the shell — verify.
+add_custom_target(check_eigen
+    COMMAND ${CMAKE_COMMAND} -E echo "Checking Eigen installation at ${EIGEN_LOC}"
+    COMMAND test -f ${EIGEN_LOC}/Eigen/Core || (echo "Eigen not found at specified path: ${EIGEN_LOC}" && exit 1)
+    COMMENT "Verifying Eigen installation"
+)
+
+# Run the check_eigen target before building any of the main targets.
+foreach(eigen_user IN ITEMS
+        lm_core test_bpe test_unicode_bpe lm_alpha performance_test integration_example)
+    add_dependencies(${eigen_user} check_eigen)
+endforeach()
+
+# Same ordering for the optional targets, applied only to those that were defined.
+foreach(eigen_user IN ITEMS train_lm test_optimizers test_training)
+    if(TARGET ${eigen_user})
+        add_dependencies(${eigen_user} check_eigen)
+    endif()
+endforeach()
--- a/README.md
+++ b/README.md
@ -1,82 +1,53 @@
 # bpe_framework
+## Byte Pair Encoding Framework
 Large Language Model for Agentic AI

-Build: cmake -DCMAKE_POLICY_VERSION_MINIMUM=3.5 ..
+Fully internationalized framework for Agentic AI research

-#### The test_bpe application does the following:
- 1. Includes necessary headers and defines the main function.
- 2. Creates an instance of the BPETokenizer.
- 3. Defines a training corpus (a vector of strings).
- 4. Trains the tokenizer on the corpus with a specified vocabulary size (500 in this case).
- 5. Tests the tokenizer by encoding a sample string ("the quick brown fox").
- 6. Decodes the tokens back to a string and prints the original, tokens, and decoded string.
- 7. Saves the tokenizer to a file ("bpe_model.txt").
- 8. Loads the tokenizer from the file and verifies the loaded tokenizer's vocabulary size.
- The purpose of this test is to verify that the BPE tokenizer can be trained, encode, decode, and serialize/deserialize correctly.
- Let's break down the code step by step.
-test_bpe Application Overview
+Requires:
+1. nlohmann/json (https://github.com/nlohmann/json)
+2. Internationalization library for Unicode by Frederick Roubert (https://github.com/unicode-org/icu)
+3. OpenNMT Tokenizer by Thuc Pham (https://github.com/OpenNMT/Tokenizer)
+4. Eigen header files (https://github.com/PX4/eigen)
+
+Build: cmake -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DEIGEN_LOC=<eigen3 folder> ..
 
 #### The test_bpe application is a comprehensive test program that validates the functionality of the BPE tokenizer implementation in the LM Framework. Here's how it works:
-1. Initialization
-
+1. Initialization:
    Creates an instance of BPETokenizer
-
    Defines a training corpus with sample English text

-2. Training Process
-
+2. Training Process:
    Calls tokenizer.train(corpus, 500) to train the tokenizer
-
    The training process:
-
        Initializes with byte-level vocabulary (0-255)
-
        Analyzes word frequencies in the corpus
-
        Iteratively merges the most frequent character pairs
-
        Builds a vocabulary of 500 tokens (as specified)

-3. Encoding Test
-
+3. Encoding Test:
    Encodes the test string "the quick brown fox"
-
    The encoding process:
-
        Splits text into words
-
        Converts each character to its initial token ID
-
        Applies learned BPE merges to combine tokens
-
        Returns a sequence of integer token IDs

-4. Decoding Test
-
+4. Decoding Test:
    Decodes the token IDs back to text
-
    The decoding process:
-
        Converts each token ID back to its string representation
-
        Concatenates the strings to reconstruct the original text

 5. Serialization Test
-
    Saves the trained tokenizer to "bpe_model.txt"
-
    The serialization process:
-
        Writes vocabulary size and token-ID mappings
-
        Records all learned merge rules

 6. Deserialization Test
-
    Loads the tokenizer from "bpe_model.txt"
-
    Verifies the loaded tokenizer has the same vocabulary size
-
    Confirms the tokenizer can perform encoding/decoding

 Expected Output
@ -91,22 +62,32 @@ Successfully loaded tokenizer
 Loaded vocabulary size: 500

 Key Validations
-
    Training Completes without errors
-
    Encoding/Decoding Round-Trip preserves the original text
-
    Serialization/Deserialization maintains tokenizer state
-
    Vocabulary Size matches the specified target (500)
-
    Token IDs are consistent between sessions

-# BPE Tokenizer Performance Test Suite
+## test_unicode.cpp

-## Overview
+### Lower-level Unicode-specific tests:

-This performance test application is a comprehensive benchmarking tool designed to evaluate the efficiency and scalability of the Byte Pair Encoding (BPE) tokenizer implementation. The test suite measures critical performance metrics including training time, memory usage, encoding/decoding speed, and serialization performance across various configurations.
+    Unicode normalization functions
+
+    Character boundary detection
+
+    Grapheme cluster handling
+
+    Encoding conversion utilities
+
+    Validation of Unicode compliance
+
+
+## BPE Tokenizer Performance Test Suite
+
+### Overview
+
+The performance test application is a comprehensive benchmarking tool designed to evaluate the efficiency and scalability of the Byte Pair Encoding (BPE) tokenizer implementation. The test suite measures critical performance metrics including training time, memory usage, encoding/decoding speed, and serialization performance across various configurations.

 ## Key Features

@ -162,4 +143,136 @@ The application provides detailed performance reports including:

 This test framework serves as an essential tool for developers and researchers working with BPE tokenizers, providing quantitative data to guide optimization efforts and implementation choices.

+## Technical Summary: BPE Framework
+### Overview
+
+The BPE Framework is a C++-based neural network framework designed for building and training language models with Byte Pair Encoding (BPE) tokenization. It implements a complete deep learning stack with automatic differentiation, optimization, and model serialization capabilities.
+### Core Components
+#### 1. Tensor Operations with Autograd
+
+    Header-only Tensor class with Eigen backend for efficient linear algebra
+
+    Automatic differentiation with backward propagation
+
+    Comprehensive operator support: element-wise operations, matrix multiplication, reductions
+
+    Activation functions: ReLU, GELU, Softmax, Sigmoid with gradient support
+
+    Memory-efficient implementation with shape-aware operations
+
+#### 2. BPE Tokenizer
+
+    PIMPL pattern implementation for API stability
+
+    Efficient vocabulary management with merge operations
+
+    Encoding/decoding support for text processing
+
+    Non-copyable design (uses unique_ptr) for proper resource management
+
+#### 3. Neural Network Architecture
+
+    Transformer-based language model implementation
+
+    Configurable dimensions: embedding size, hidden layers, attention heads
+
+    Parameter management with named parameters for serialization
+
+    Training/inference modes support
+
+#### 4. Training Infrastructure
+
+    Adam optimizer with configurable hyperparameters
+
+    Gradient accumulation and moment estimation
+
+    Batch processing with sequence padding
+
+    Loss computation (cross-entropy) with masking support
+
+#### 5. Model Serialization
+
+    Binary format with versioning and magic number validation
+
+    Parameter-by-name storage and retrieval
+
+    Shape preservation and data integrity checks
+
+    Error handling for file operations and format validation
+
+### Key Technical Features
+#### Memory Management
+
+    Eigen integration for optimized matrix operations
+
+    Shape-aware memory allocation preventing unnecessary copies
+
+    RAII principles for resource management
+
+#### Performance Considerations
+
+    Header-only design for Tensor class enabling compiler optimizations
+
+    Batch processing for efficient training
+
+    In-place operations where possible to reduce memory overhead
+
+#### Extensibility
+
+    Modular architecture allowing component replacement
+
+    Clear interfaces between tokenizer, model, and training components
+
+    Parameter naming convention supporting complex architectures
+
+#### Architecture Patterns
+
+    PIMPL Idiom: Used in tokenizer for stable ABI
+
+    RAII: Comprehensive resource management throughout
+
+    Builder Pattern: Model configuration through constructor parameters
+
+    Strategy Pattern: Optimizer implementation allowing algorithm changes
+
+#### Current Capabilities
+
+    * Automatic differentiation with reverse-mode autograd
+
+    * BPE tokenization with vocabulary learning
+
+    * Transformer language model training
+
+    * Adam optimization with moment estimation
+
+    * Model serialization/deserialization
+
+    * Configurable network architectures
+
+    * Batch processing with padding
+
+### Technical Stack
+
+    C++17 with standard library components
+
+    Eigen for linear algebra operations
+
+    CMake for build system management
+
+    Header-only design for core components
+
+#### Usage Example
+
+```cpp
+// Initialize components
+BPETokenizer tokenizer(corpus);
+LanguageModel model(tokenizer.vocab_size(), 512, 2048, 8);
+LanguageModelTrainer trainer(tokenizer, 512, 2048, 8);
+
+// Train model
+trainer.train(training_corpus, 10, 32, 256);
+
+trainer.save_model("language_model.bin");
+```
+
+
+Based on the research of Timothy O'Neil, Frederick Warren, et al.

--- a/build_log.md
+++ b/build_log.md
@ -0,0 +1,5 @@
+### 8/24/2025 - Eigen integrated
+Turns out Eigen can only do 1 & 2D transforms so I had to "flatten out" the objects that required transformation and work on each dimension separately. 3 days of work.
+
+### 8/25/2025 - Tensor Transformer
+Got the transformer code wired in. Some really crazy geometry goes into making machines seem like they're talking to you.
--- a/include/lm/core/tensor.hpp
+++ b/include/lm/core/tensor.hpp
@ -0,0 +1,730 @@
+#pragma once
+
+#include <Eigen/Dense>
+#include <vector>
+#include <memory>
+#include <random>
+#include <cmath>
+#include <functional>
+#include <iostream>
+#include <stdexcept>
+
+namespace lm {
+
+class Tensor;
+
+Tensor operator*(float scalar, const Tensor& tensor);
+
+// Scalar multiplication (Tensor * float) - already defined as member function
+// Tensor operator*(const Tensor& tensor, float scalar);
+
+    class Tensor {
+public:
+        // Default constructor: 0x0 backing matrix, shape {0}, gradient tracking off.
+        Tensor() : data_(Eigen::MatrixXf(0, 0)), shape_({0}), requires_grad_(false) {}
+    // Element-wise square root.
+    // Returns a new tensor with the same shape. When this tensor tracks
+    // gradients, the result does too and its backward_fn_ adds
+    // result.grad_ * 0.5 / sqrt(x) into this->grad_.
+    Tensor sqrt() const {
+        Tensor result(data_.array().sqrt(), shape_);
+
+        if (requires_grad_) {
+            result.requires_grad(true);
+            result.backward_fn_ = [this, result]() {
+                if (this->requires_grad_) {
+                    // Gradient of sqrt: 0.5 / sqrt(input); the small epsilon avoids division by zero.
+                    // ArrayXXf (2-D) is required here: data_ is a matrix, and the
+                    // column-only ArrayXf cannot hold a multi-column expression.
+                    Eigen::ArrayXXf grad_sqrt = 0.5f / (this->data_.array().sqrt() + 1e-12f);
+                    this->grad_.array() += result.grad_.array() * grad_sqrt;
+                }
+            };
+        }
+
+        return result;
+    }
+    
+    // Builds a zero-filled tensor of the given shape.
+    // A 2-D shape is stored as a rows x cols matrix; every other rank (including
+    // 1-D and 3-D+) is stored flattened as a single column whose length is the
+    // product of the extents. A zero gradient buffer of the same size is
+    // allocated only when requires_grad is true.
+    Tensor(const std::vector<size_t>& shape, bool requires_grad = false) : requires_grad_(requires_grad) {
+        shape_ = shape;
+
+        size_t rows = 1;
+        size_t cols = 1;
+        if (shape.size() == 2) {
+            rows = shape[0];
+            cols = shape[1];
+        } else {
+            for (size_t extent : shape) {
+                rows *= extent;
+            }
+        }
+
+        data_ = Eigen::MatrixXf::Zero(rows, cols);
+        if (requires_grad) {
+            grad_ = Eigen::MatrixXf::Zero(rows, cols);
+        }
+    }
+    
+    // Wraps an existing matrix. When no shape is supplied it is inferred from
+    // the matrix: {rows} for a single column, {rows, cols} otherwise. A zero
+    // gradient buffer matching the data is allocated when requires_grad is true.
+    Tensor(const Eigen::MatrixXf& data, const std::vector<size_t>& shape = {}, bool requires_grad = false)
+        : data_(data), shape_(shape), requires_grad_(requires_grad) {
+        if (shape.empty()) {
+            const size_t row_count = static_cast<size_t>(data.rows());
+            if (data.cols() != 1) {
+                shape_ = {row_count, static_cast<size_t>(data.cols())};
+            } else {
+                shape_ = {row_count};
+            }
+        }
+
+        if (!requires_grad) {
+            return;
+        }
+        grad_ = Eigen::MatrixXf::Zero(data_.rows(), data_.cols());
+    }
+    
+    // Accessors
+    // shape() returns the logical shape; data() and grad() return the backing
+    // Eigen matrices by reference (const and non-const overloads);
+    // requires_grad() reports whether gradient tracking is enabled.
+    const std::vector<size_t>& shape() const { return shape_; }
+    Eigen::MatrixXf& data() { return data_; }
+    const Eigen::MatrixXf& data() const { return data_; }
+    Eigen::MatrixXf& grad() { return grad_; }
+    const Eigen::MatrixXf& grad() const { return grad_; }
+    bool requires_grad() const { return requires_grad_; }
+    
+    // Turns gradient tracking on or off. When turning it on and no gradient
+    // buffer exists yet, a zero buffer matching data_ is allocated; an
+    // existing buffer is left untouched.
+    void requires_grad(bool requires_grad) {
+        requires_grad_ = requires_grad;
+
+        const bool needs_buffer = requires_grad && grad_.size() == 0;
+        if (!needs_buffer) {
+            return;
+        }
+        grad_ = Eigen::MatrixXf::Zero(data_.rows(), data_.cols());
+    }
+    
+    // Sets every entry of the gradient buffer to zero; its size is unchanged.
+    void zero_grad() {
+        grad_.setZero();
+    }
+    
+    // Element access
+    // The one-index overloads forward to data_(i); the two-index overloads
+    // forward to data_(i, j). No shape checks are performed here.
+    float& operator()(size_t i) { return data_(i); }
+    float operator()(size_t i) const { return data_(i); }
+    float& operator()(size_t i, size_t j) { return data_(i, j); }
+    float operator()(size_t i, size_t j) const { return data_(i, j); }
+    
+    // 3D indexing operators
+    // Mutable access to element (i, j, k) of a 3-D tensor. The flat offset is
+    // i * shape[1] * shape[2] + j * shape[2] + k. Throws std::runtime_error
+    // when the tensor is not 3-D; the indices themselves are not range-checked.
+    float& operator()(size_t i, size_t j, size_t k) {
+        if (shape_.size() == 3) {
+            const size_t flat = i * shape_[1] * shape_[2] + j * shape_[2] + k;
+            return data_(flat);
+        }
+        throw std::runtime_error("3D access requires 3D tensor");
+    }
+    
+    // Read-only access to element (i, j, k) of a 3-D tensor. The flat offset is
+    // i * shape[1] * shape[2] + j * shape[2] + k. Throws std::runtime_error
+    // when the tensor is not 3-D; the indices themselves are not range-checked.
+    float operator()(size_t i, size_t j, size_t k) const {
+        if (shape_.size() == 3) {
+            const size_t flat = i * shape_[1] * shape_[2] + j * shape_[2] + k;
+            return data_(flat);
+        }
+        throw std::runtime_error("3D access requires 3D tensor");
+    }
+
+    // Shape utilities
+    // size(): number of stored elements (data_.size()).
+    // dim(axis): extent along `axis`, or 1 when `axis` is past the last dimension.
+    // ndim(): number of entries in the logical shape.
+    size_t size() const { return data_.size(); }
+    size_t dim(size_t axis) const { 
+        return (axis < shape_.size()) ? shape_[axis] : 1; 
+    }
+    size_t ndim() const { return shape_.size(); }
+    
+    // Reshape the tensor
+    // Returns a copy of this tensor labelled with new_shape. Throws
+    // std::invalid_argument unless the product of the new extents equals
+    // size(). The gradient buffer is copied when tracking is enabled.
+    // NOTE(review): the backing matrix is copied as-is, so its rows/cols are
+    // not rearranged to match new_shape — confirm that callers expect this.
+    Tensor reshape(const std::vector<size_t>& new_shape) const {
+        size_t element_count = 1;
+        for (size_t extent : new_shape) {
+            element_count *= extent;
+        }
+        if (element_count != size()) {
+            throw std::invalid_argument("Total size must remain the same when reshaping");
+        }
+
+        Tensor reshaped(data_, new_shape, requires_grad_);
+        if (!requires_grad_) {
+            return reshaped;
+        }
+        reshaped.grad_ = grad_;
+        return reshaped;
+    }
+    
+    // Mathematical operations with autograd
+    // Element-wise addition of two tensors with identical shapes; throws
+    // std::invalid_argument otherwise. If either operand tracks gradients the
+    // result does too, and its backward_fn_ adds the result's gradient into
+    // each tracking operand.
+    // NOTE(review): the closure keeps `this` and a reference to `other`, so
+    // both operands presumably must outlive the result — confirm with callers.
+    Tensor operator+(const Tensor& other) const {
+        if (shape_ != other.shape_) {
+            throw std::invalid_argument("Tensor shapes must match for addition");
+        }
+
+        Tensor out(data_ + other.data_, shape_);
+        if (!(requires_grad_ || other.requires_grad_)) {
+            return out;
+        }
+
+        out.requires_grad(true);
+        out.backward_fn_ = [this, &other, out]() {
+            if (this->requires_grad_) {
+                this->grad_ += out.grad_;
+            }
+            if (other.requires_grad_) {
+                other.grad_ += out.grad_;
+            }
+        };
+        return out;
+    }
+    
+    // Element-wise subtraction (this - other) of two tensors with identical
+    // shapes; throws std::invalid_argument otherwise. In the backward closure
+    // the result's gradient is added to this->grad_ and subtracted from
+    // other.grad_, for whichever operands track gradients.
+    Tensor operator-(const Tensor& other) const {
+        if (shape_ != other.shape_) {
+            throw std::invalid_argument("Tensor shapes must match for subtraction");
+        }
+
+        Tensor diff(data_ - other.data_, shape_);
+        if (!(requires_grad_ || other.requires_grad_)) {
+            return diff;
+        }
+
+        diff.requires_grad(true);
+        diff.backward_fn_ = [this, &other, diff]() {
+            if (this->requires_grad_) {
+                this->grad_ += diff.grad_;
+            }
+            if (other.requires_grad_) {
+                other.grad_ -= diff.grad_;
+            }
+        };
+        return diff;
+    }
+    
+    // Element-wise (Hadamard) product of two tensors with identical shapes;
+    // throws std::invalid_argument otherwise. In the backward closure each
+    // tracking operand receives the result's gradient multiplied element-wise
+    // by the other operand's data.
+    Tensor operator*(const Tensor& other) const {
+        if (shape_ != other.shape_) {
+            throw std::invalid_argument("Tensor shapes must match for element-wise multiplication");
+        }
+
+        Tensor product(data_.cwiseProduct(other.data_), shape_);
+        if (!(requires_grad_ || other.requires_grad_)) {
+            return product;
+        }
+
+        product.requires_grad(true);
+        product.backward_fn_ = [this, &other, product]() {
+            if (this->requires_grad_) {
+                this->grad_ += product.grad_.cwiseProduct(other.data_);
+            }
+            if (other.requires_grad_) {
+                other.grad_ += product.grad_.cwiseProduct(this->data_);
+            }
+        };
+        return product;
+    }
+    
+    // Element-wise quotient (this / other) of two tensors with identical
+    // shapes; throws std::invalid_argument otherwise. Backward closure:
+    // this->grad_ += g / other, and other.grad_ -= g * this / other^2,
+    // where g is the result's gradient and all operations are element-wise.
+    Tensor operator/(const Tensor& other) const {
+        if (shape_ != other.shape_) {
+            throw std::invalid_argument("Tensor shapes must match for element-wise division");
+        }
+
+        Tensor quotient(data_.cwiseQuotient(other.data_), shape_);
+        if (!(requires_grad_ || other.requires_grad_)) {
+            return quotient;
+        }
+
+        quotient.requires_grad(true);
+        quotient.backward_fn_ = [this, &other, quotient]() {
+            if (this->requires_grad_) {
+                this->grad_ += quotient.grad_.cwiseQuotient(other.data_);
+            }
+            if (other.requires_grad_) {
+                other.grad_ -= quotient.grad_.cwiseProduct(this->data_).cwiseQuotient(other.data_.cwiseProduct(other.data_));
+            }
+        };
+        return quotient;
+    }
+    
+    // Adds a scalar to every element. When this tensor tracks gradients the
+    // result does too, and its backward closure adds the result's gradient to
+    // this->grad_ unchanged.
+    Tensor operator+(float scalar) const {
+        Tensor shifted(data_.array() + scalar, shape_);
+        if (!requires_grad_) {
+            return shifted;
+        }
+
+        shifted.requires_grad(true);
+        shifted.backward_fn_ = [this, shifted]() {
+            if (this->requires_grad_) {
+                this->grad_ += shifted.grad_;
+            }
+        };
+        return shifted;
+    }
+    
+    // Subtracts a scalar from every element. When this tensor tracks gradients
+    // the result does too, and its backward closure adds the result's gradient
+    // to this->grad_ unchanged.
+    Tensor operator-(float scalar) const {
+        Tensor shifted(data_.array() - scalar, shape_);
+        if (!requires_grad_) {
+            return shifted;
+        }
+
+        shifted.requires_grad(true);
+        shifted.backward_fn_ = [this, shifted]() {
+            if (this->requires_grad_) {
+                this->grad_ += shifted.grad_;
+            }
+        };
+        return shifted;
+    }
+    
+    // Multiplies every element by a scalar. When this tensor tracks gradients
+    // the result does too, and its backward closure adds the result's gradient
+    // times the scalar to this->grad_.
+    Tensor operator*(float scalar) const {
+        Tensor scaled(data_ * scalar, shape_);
+        if (!requires_grad_) {
+            return scaled;
+        }
+
+        scaled.requires_grad(true);
+        scaled.backward_fn_ = [this, scalar, scaled]() {
+            if (this->requires_grad_) {
+                this->grad_ += scaled.grad_ * scalar;
+            }
+        };
+        return scaled;
+    }
+    
+    // Divides every element by a scalar (no check for a zero divisor). When
+    // this tensor tracks gradients the result does too, and its backward
+    // closure adds the result's gradient divided by the scalar to this->grad_.
+    Tensor operator/(float scalar) const {
+        Tensor scaled(data_ / scalar, shape_);
+        if (!requires_grad_) {
+            return scaled;
+        }
+
+        scaled.requires_grad(true);
+        scaled.backward_fn_ = [this, scalar, scaled]() {
+            if (this->requires_grad_) {
+                this->grad_ += scaled.grad_ / scalar;
+            }
+        };
+        return scaled;
+    }
+    
+    // Matrix product of two 2-D tensors. Throws std::invalid_argument when
+    // either operand is not 2-D or when this tensor's column count differs
+    // from other's row count. Backward closure, with g the result's gradient:
+    // this->grad_ += g * other^T and other.grad_ += this^T * g.
+    Tensor matmul(const Tensor& other) const {
+        const bool both_2d = (ndim() == 2) && (other.ndim() == 2);
+        if (!both_2d) {
+            throw std::invalid_argument("matmul requires 2D tensors");
+        }
+        if (shape_[1] != other.shape_[0]) {
+            throw std::invalid_argument("Incompatible dimensions for matrix multiplication");
+        }
+
+        Tensor product(data_ * other.data_, {shape_[0], other.shape()[1]});
+        if (!(requires_grad_ || other.requires_grad_)) {
+            return product;
+        }
+
+        product.requires_grad(true);
+        product.backward_fn_ = [this, &other, product]() {
+            if (this->requires_grad_) {
+                this->grad_ += product.grad_ * other.data_.transpose();
+            }
+            if (other.requires_grad_) {
+                other.grad_ += this->data_.transpose() * product.grad_;
+            }
+        };
+        return product;
+    }
+    
+    // Transpose of a 2-D tensor; throws std::invalid_argument for any other
+    // rank. When this tensor tracks gradients, the backward closure adds the
+    // transposed result gradient to this->grad_.
+    Tensor transpose() const {
+        if (ndim() != 2) {
+            throw std::invalid_argument("transpose requires 2D tensors");
+        }
+
+        Tensor flipped(data_.transpose(), {shape_[1], shape_[0]});
+        if (!requires_grad_) {
+            return flipped;
+        }
+
+        flipped.requires_grad(true);
+        flipped.backward_fn_ = [this, flipped]() {
+            if (this->requires_grad_) {
+                this->grad_ += flipped.grad_.transpose();
+            }
+        };
+        return flipped;
+    }
+    
+    // Reduction operations
+    // Sums the tensor's elements.
+    //   axis == -1, or any 1-D tensor: total sum, returned as a 1x1 tensor.
+    //   axis == 0: sum down each column (stored as a 1 x cols row).
+    //   any other axis: sum across each row (stored as a rows x 1 column).
+    // When this tensor tracks gradients, the backward closure broadcasts the
+    // result's gradient back over the reduced dimension into this->grad_.
+    Tensor sum(int axis = -1) const {
+        Tensor result;
+        
+        if (axis == -1 || ndim() == 1) {
+            result = Tensor(Eigen::MatrixXf::Constant(1, 1, data_.sum()));
+        } else if (axis == 0) {
+            result = Tensor(data_.colwise().sum(), {shape_[1]});
+        } else {
+            result = Tensor(data_.rowwise().sum(), {shape_[0]});
+        }
+        
+        if (requires_grad_) {
+            result.requires_grad(true);
+            result.backward_fn_ = [this, axis, result]() {
+                if (this->requires_grad_) {
+                    if (axis == -1 || ndim() == 1) {
+                        this->grad_.array() += result.grad_(0, 0);
+                    } else if (axis == 0) {
+                        // result.grad_ is already a 1 x cols row (same layout as
+                        // colwise().sum()), so it is added to each row directly;
+                        // transposing it would yield a cols x 1 column that does
+                        // not match a row of grad_.
+                        for (Eigen::Index i = 0; i < this->grad_.rows(); ++i) {
+                            this->grad_.row(i) += result.grad_;
+                        }
+                    } else {
+                        for (Eigen::Index j = 0; j < this->grad_.cols(); ++j) {
+                            this->grad_.col(j) += result.grad_;
+                        }
+                    }
+                }
+            };
+        }
+        
+        return result;
+    }
+    
+    // Averages the tensor's elements.
+    //   axis == -1, or any 1-D tensor: overall mean, returned as a 1x1 tensor.
+    //   axis == 0: mean of each column (stored as a 1 x cols row).
+    //   any other axis: mean of each row (stored as a rows x 1 column).
+    // When this tensor tracks gradients, the backward closure broadcasts the
+    // result's gradient, divided by the number of averaged elements, into
+    // this->grad_.
+    Tensor mean(int axis = -1) const {
+        Tensor result;
+        float divisor;
+        
+        if (axis == -1 || ndim() == 1) {
+            divisor = data_.size();
+            result = Tensor(Eigen::MatrixXf::Constant(1, 1, data_.mean()));
+        } else if (axis == 0) {
+            divisor = data_.rows();
+            result = Tensor(data_.colwise().mean(), {shape_[1]});
+        } else {
+            divisor = data_.cols();
+            result = Tensor(data_.rowwise().mean(), {shape_[0]});
+        }
+        
+        if (requires_grad_) {
+            result.requires_grad(true);
+            result.backward_fn_ = [this, axis, divisor, result]() {
+                if (this->requires_grad_) {
+                    if (axis == -1 || ndim() == 1) {
+                        this->grad_.array() += result.grad_(0, 0) / divisor;
+                    } else if (axis == 0) {
+                        // result.grad_ is already a 1 x cols row (same layout as
+                        // colwise().mean()), so it is added to each row directly;
+                        // transposing it would yield a cols x 1 column that does
+                        // not match a row of grad_.
+                        for (Eigen::Index i = 0; i < this->grad_.rows(); ++i) {
+                            this->grad_.row(i) += result.grad_ / divisor;
+                        }
+                    } else {
+                        for (Eigen::Index j = 0; j < this->grad_.cols(); ++j) {
+                            this->grad_.col(j) += result.grad_ / divisor;
+                        }
+                    }
+                }
+            };
+        }
+        
+        return result;
+    }
+    
+    // Activation functions with autograd
+    // ReLU: element-wise max(x, 0). When this tensor tracks gradients, the
+    // backward closure passes the result's gradient through only where the
+    // input was strictly positive.
+    Tensor relu() const {
+        Tensor activated(data_.cwiseMax(0.0f), shape_);
+        if (!requires_grad_) {
+            return activated;
+        }
+
+        activated.requires_grad(true);
+        activated.backward_fn_ = [this, activated]() {
+            if (!this->requires_grad_) {
+                return;
+            }
+            // 1.0 where input > 0, 0.0 elsewhere
+            const Eigen::MatrixXf positive_mask = (this->data_.array() > 0.0f).cast<float>();
+            this->grad_ += activated.grad_.cwiseProduct(positive_mask);
+        };
+        return activated;
+    }
+    
+    // GELU activation, tanh approximation:
+    //   gelu(x) = 0.5 * x * (1 + tanh(u)),  u = sqrt(2/pi) * (x + 0.044715 * x^3)
+    // When this tensor tracks gradients, the backward closure accumulates
+    // result.grad_ * d(gelu)/dx into this->grad_, where
+    //   d(gelu)/dx = 0.5 * (1 + tanh(u)) + 0.5 * x * sech^2(u) * du/dx
+    //   du/dx      = sqrt(2/pi) * (1 + 3 * 0.044715 * x^2)
+    Tensor gelu() const {
+        // Literal pi: the M_PI macro is not part of standard C++ and is not
+        // available on every toolchain.
+        constexpr double pi = 3.14159265358979323846;
+        const float sqrt_2_over_pi = static_cast<float>(std::sqrt(2.0 / pi));
+        // ArrayXXf (2-D) keeps the rows x cols layout of data_; the column-only
+        // ArrayXf cannot hold a multi-column matrix.
+        Eigen::ArrayXXf x_array = data_.array();
+        Eigen::ArrayXXf result_array = 0.5f * x_array *
+            (1.0f + (sqrt_2_over_pi * (x_array + 0.044715f * x_array.pow(3))).tanh());
+
+        Tensor result(result_array.matrix(), shape_);
+
+        if (requires_grad_) {
+            result.requires_grad(true);
+            result.backward_fn_ = [this, sqrt_2_over_pi, result]() {
+                if (this->requires_grad_) {
+                    // Gradient of the GELU approximation
+                    Eigen::ArrayXXf x_array = this->data_.array();
+                    Eigen::ArrayXXf x_cubed = x_array.pow(3);
+                    Eigen::ArrayXXf inner = sqrt_2_over_pi * (x_array + 0.044715f * x_cubed);
+                    Eigen::ArrayXXf tanh_inner = inner.tanh();
+                    Eigen::ArrayXXf sech_squared = 1.0f - tanh_inner.square();
+
+                    // 0.134145 == 3 * 0.044715 (derivative of the cubic term).
+                    // Exactly two terms from the product rule; an additional
+                    // 0.5 * tanh(u) term would overstate the slope.
+                    Eigen::ArrayXXf grad = 0.5f * (1.0f + tanh_inner) +
+                        0.5f * x_array * sech_squared * sqrt_2_over_pi * (1.0f + 0.134145f * x_array.square());
+
+                    this->grad_.array() += result.grad_.array() * grad;
+                }
+            };
+        }
+
+        return result;
+    }
+    
+    // Softmax over the whole tensor (axis == -1 or any 1-D tensor), over each
+    // column (axis == 0), or over each row (any other axis). The relevant
+    // maximum is subtracted before exponentiation for numerical stability.
+    // NOTE(review): the backward closure adds result.grad_ to this->grad_
+    // unchanged and does not use `axis`; it is not the softmax Jacobian.
+    // Presumably the loss code supplies an already-combined gradient — confirm.
+    Tensor softmax(int axis = -1) const {
+        // For numerical stability, subtract the max value
+        Eigen::MatrixXf shifted = data_;
+    
+        if (axis == -1 || ndim() == 1) {
+            // For overall softmax or 1D tensors
+            float max_val = data_.maxCoeff();
+            shifted.array() -= max_val;
+        } else if (axis == 0) {
+            // Column-wise: subtract max of each column
+            for (int j = 0; j < shifted.cols(); ++j) {
+                float max_val = shifted.col(j).maxCoeff();
+                shifted.col(j).array() -= max_val;
+            }
+        } else {
+            // Row-wise: subtract max of each row
+            for (int i = 0; i < shifted.rows(); ++i) {
+                float max_val = shifted.row(i).maxCoeff();
+                shifted.row(i).array() -= max_val;
+            }
+        }
+
+        Eigen::MatrixXf exp_values = shifted.array().exp();
+    
+        if (axis == -1 || ndim() == 1) {
+            // For overall softmax or 1D tensors
+            float sum = exp_values.sum();
+            exp_values /= sum;
+        } else if (axis == 0) {
+            // Column-wise normalization
+            for (int j = 0; j < exp_values.cols(); ++j) {
+                float col_sum = exp_values.col(j).sum();
+                exp_values.col(j) /= col_sum;
+            }
+        } else {
+            // Row-wise normalization
+            for (int i = 0; i < exp_values.rows(); ++i) {
+                float row_sum = exp_values.row(i).sum();
+                exp_values.row(i) /= row_sum;
+            }
+        }
+    
+        Tensor result(exp_values, shape_);
+        
+        if (requires_grad_) {
+            result.requires_grad(true);
+            result.backward_fn_ = [this, result]() {
+                if (this->requires_grad_) {
+                    // Gradient of softmax: (diag(softmax) - softmax * softmax^T) * grad
+                    // But this is expensive to compute exactly
+                    // For efficiency, we'll use a simplified approach
+                    // This is an approximation that works well in practice for cross-entropy loss
+                    this->grad_ += result.grad_;
+                }
+            };
+        }
+        
+        return result;
+    }
+    
+    // Element-wise logistic sigmoid, 1 / (1 + exp(-x)), with the same shape_ as
+    // this tensor. When this tensor requires gradients, the result carries a
+    // backward closure that accumulates grad * s * (1 - s) into this->grad_.
+    Tensor sigmoid() const {
+        // data_ is a 2-D Eigen::MatrixXf, so its array view must be held in the
+        // 2-D ArrayXXf. A 1-D ArrayXf can only represent a single column and is
+        // not a valid destination for a matrix with more than one column.
+        const Eigen::ArrayXXf x_array = data_.array();
+        const Eigen::ArrayXXf result_array = 1.0f / (1.0f + (-x_array).exp());
+    
+        Tensor result(Eigen::MatrixXf(result_array), shape_);
+    
+        if (requires_grad_) {
+            result.requires_grad(true);
+            // NOTE(review): `result` is captured by value and `this` by pointer;
+            // confirm that the copy observes the gradient written to the returned
+            // tensor and that `this` outlives the closure.
+            result.backward_fn_ = [this, result]() {
+                if (this->requires_grad_) {
+                    // Gradient of sigmoid: sigmoid(x) * (1 - sigmoid(x)), taken from
+                    // the forward output so exp() is not evaluated again.
+                    const Eigen::ArrayXXf sigmoid_grad =
+                        result.data().array() * (1.0f - result.data().array());
+                
+                    // Both operands are rows x cols arrays, so the product is
+                    // coefficient-wise with matching dimensions.
+                    this->grad_.array() += result.grad_.array() * sigmoid_grad;
+                }
+            };
+        }
+    
+        return result;
+    }    
+    
+    // Backward propagation
+    // Runs the backward closure attached to this tensor, if there is one.
+    // Tensors without a closure (backward_fn_ empty) are left untouched.
+    void backward() {
+        if (!backward_fn_) {
+            return;
+        }
+        backward_fn_();
+    }
+    
+    // Initialization
+    // Builds a tensor of the given shape through the (shape, requires_grad)
+    // constructor without modifying its contents afterwards.
+    // NOTE(review): relies on that constructor zero-filling the data — confirm.
+    static Tensor zeros(const std::vector<size_t>& shape, bool requires_grad = false) {
+        Tensor fresh(shape, requires_grad);
+        return fresh;
+    }
+    
+    // Builds a tensor of the given shape with every element set to 1.0f.
+    static Tensor ones(const std::vector<size_t>& shape, bool requires_grad = false) {
+        Tensor filled(shape, requires_grad);
+        filled.data_.setConstant(1.0f);
+        return filled;
+    }
+    
+    // Builds a tensor of the given shape whose elements are independent draws
+    // from a normal distribution with the given mean and standard deviation.
+    static Tensor randn(const std::vector<size_t>& shape, float mean = 0.0f, float stddev = 1.0f, bool requires_grad = false) {
+        Tensor result(shape, requires_grad);
+
+        // One engine per thread, seeded once from std::random_device. Creating a
+        // new random_device and Mersenne Twister on every call is comparatively
+        // expensive, and this function runs once per parameter tensor.
+        thread_local std::mt19937 gen{std::random_device{}()};
+        std::normal_distribution<float> dist(mean, stddev);
+        
+        // Eigen::Index matches the type returned by rows()/cols(), avoiding a
+        // signed-width mismatch on large tensors.
+        for (Eigen::Index i = 0; i < result.data_.rows(); ++i) {
+            for (Eigen::Index j = 0; j < result.data_.cols(); ++j) {
+                result.data_(i, j) = dist(gen);
+            }
+        }
+        
+        return result;
+    }
+    
+    // Normal initialization with mean 0 and standard deviation
+    // sqrt(2 / (shape[0] + shape[1])), delegated to randn().
+    // Throws std::invalid_argument when fewer than two dimensions are given.
+    static Tensor xavier(const std::vector<size_t>& shape, bool requires_grad = false) {
+        const bool has_two_dims = shape.size() >= 2;
+        if (!has_two_dims) {
+            throw std::invalid_argument("Xavier initialization requires at least 2 dimensions");
+        }
+        // Only the first two extents enter the scale.
+        const size_t fan_total = shape[0] + shape[1];
+        const float stddev = std::sqrt(2.0f / fan_total);
+        return randn(shape, 0.0f, stddev, requires_grad);
+    }
+    
+    // Utility functions
+    // Returns a copy of `length` consecutive rows (axis == 0) or columns (any
+    // other axis) starting at index `start`.
+    //
+    // Throws std::out_of_range when [start, start + length) does not fit inside
+    // the sliced dimension. Eigen only checks block bounds through assertions,
+    // which are compiled out in release builds.
+    // NOTE(review): the result is built from the matrix alone, so requires_grad
+    // and any backward closure are not carried over — confirm this is intended.
+    Tensor slice(size_t start, size_t length, int axis = 0) const {
+        const size_t extent = static_cast<size_t>(axis == 0 ? data_.rows() : data_.cols());
+        // Written as two comparisons so that start + length cannot overflow.
+        if (start > extent || length > extent - start) {
+            throw std::out_of_range("Tensor::slice: requested range exceeds tensor dimensions");
+        }
+
+        if (axis == 0) {
+            return Tensor(data_.block(start, 0, length, data_.cols()));
+        } else {
+            return Tensor(data_.block(0, start, data_.rows(), length));
+        }
+    }
+    
+    // Returns a new tensor holding this tensor followed by `other`:
+    //   axis == 0:      stacked vertically (rows appended); column counts must match
+    //   any other axis: placed side by side (columns appended); row counts must match
+    //
+    // Throws std::invalid_argument when the dimension that is not being
+    // concatenated differs between the two operands.
+    Tensor concatenate(const Tensor& other, int axis = 0) const {
+        if (axis == 0) {
+            if (data_.cols() != other.data_.cols()) {
+                throw std::invalid_argument("Tensor::concatenate: column counts differ for axis 0");
+            }
+            Eigen::MatrixXf result(data_.rows() + other.data_.rows(), data_.cols());
+            // Explicit block assignment is well-defined even when one operand
+            // has zero rows.
+            result.topRows(data_.rows()) = data_;
+            result.bottomRows(other.data_.rows()) = other.data_;
+            return Tensor(result);
+        } else {
+            if (data_.rows() != other.data_.rows()) {
+                throw std::invalid_argument("Tensor::concatenate: row counts differ for axis 1");
+            }
+            Eigen::MatrixXf result(data_.rows(), data_.cols() + other.data_.cols());
+            result.leftCols(data_.cols()) = data_;
+            result.rightCols(other.data_.cols()) = other.data_;
+            return Tensor(result);
+        }
+    }
+    
+    // Additional utility for neural networks
+    // Position of the largest element, returned as a float-valued tensor.
+    //
+    //   axis == -1, or any axis on a 1-D tensor: a 1x1 tensor holding the
+    //       single-index (linear) position of the maximum within data_
+    //   axis == 0:      one entry per column, the row index of that column's maximum
+    //   any other axis: one entry per row, the column index of that row's maximum
+    //
+    // Only a strictly greater value replaces the current best, so ties resolve
+    // to the earliest position.
+    Tensor argmax(int axis = -1) const {
+        const bool flatten = (axis == -1 || ndim() == 1);
+
+        if (flatten) {
+            Eigen::Index best_pos = 0;
+            float best_val = data_(0);
+
+            // Position 0 is already the candidate, so the scan starts at 1.
+            for (Eigen::Index pos = 1; pos < data_.size(); ++pos) {
+                const float candidate = data_(pos);
+                if (candidate > best_val) {
+                    best_val = candidate;
+                    best_pos = pos;
+                }
+            }
+
+            return Tensor(Eigen::MatrixXf::Constant(1, 1, static_cast<float>(best_pos)));
+        }
+
+        if (axis == 0) {
+            // Column-wise: scan down each column.
+            Eigen::RowVectorXf winners(data_.cols());
+            for (int col = 0; col < data_.cols(); ++col) {
+                Eigen::Index best_row = 0;
+                float best_val = data_(0, col);
+                for (int row = 1; row < data_.rows(); ++row) {
+                    const float candidate = data_(row, col);
+                    if (candidate > best_val) {
+                        best_val = candidate;
+                        best_row = row;
+                    }
+                }
+                winners(col) = static_cast<float>(best_row);
+            }
+            return Tensor(winners, {static_cast<size_t>(winners.cols())});
+        }
+
+        // Row-wise: scan along each row.
+        Eigen::VectorXf winners(data_.rows());
+        for (int row = 0; row < data_.rows(); ++row) {
+            Eigen::Index best_col = 0;
+            float best_val = data_(row, 0);
+            for (int col = 1; col < data_.cols(); ++col) {
+                const float candidate = data_(row, col);
+                if (candidate > best_val) {
+                    best_val = candidate;
+                    best_col = col;
+                }
+            }
+            winners(row) = static_cast<float>(best_col);
+        }
+        return Tensor(winners, {static_cast<size_t>(winners.rows())});
+    }
+
+    // Writes this tensor to `stream` as raw binary:
+    //   1. uint32  number of entries in shape_
+    //   2. uint32  one value per entry of shape_ (each narrowed to 32 bits)
+    //   3. float   data_.size() values copied straight from data_'s buffer
+    // Values are written exactly as they sit in memory (host byte order and
+    // Eigen's storage order for data_). Gradients are not written, and the
+    // stream state is not checked here.
+    void serialize(std::ostream& stream) const {
+        // Header: dimension count followed by each extent.
+        const uint32_t rank = static_cast<uint32_t>(shape_.size());
+        stream.write(reinterpret_cast<const char*>(&rank), sizeof(rank));
+
+        for (size_t axis_idx = 0; axis_idx < shape_.size(); ++axis_idx) {
+            const uint32_t extent = static_cast<uint32_t>(shape_[axis_idx]);
+            stream.write(reinterpret_cast<const char*>(&extent), sizeof(extent));
+        }
+
+        // Payload: the contiguous float buffer behind data_.
+        const size_t payload_bytes = static_cast<size_t>(data_.size()) * sizeof(float);
+        stream.write(reinterpret_cast<const char*>(data_.data()), payload_bytes);
+    }
+    
+    // Replaces this tensor's shape and data with a tensor read from `stream`.
+    // Expected layout (the one serialize() writes): a uint32 dimension count, one
+    // uint32 per dimension, then the float payload, all in host byte order.
+    //
+    // Storage chosen for the payload:
+    //   1 dimension  -> column vector of shape[0] elements
+    //   2 dimensions -> shape[0] x shape[1] matrix
+    //   otherwise    -> column vector holding the product of all extents
+    //
+    // Throws std::runtime_error if the stream fails or ends before a complete
+    // tensor has been read; the tensor is left unmodified in that case.
+    // requires_grad_ is not part of the stream and keeps its current value;
+    // when it is set, grad_ is reset to zeros of the new size.
+    void deserialize(std::istream& stream) {
+        // Header: number of dimensions. Initialized so that a failed read can
+        // never leave an indeterminate value behind.
+        uint32_t rank = 0;
+        stream.read(reinterpret_cast<char*>(&rank), sizeof(rank));
+        if (!stream) {
+            throw std::runtime_error("Tensor::deserialize: failed to read dimension count");
+        }
+        
+        // Grow the shape one validated entry at a time: a corrupt count then
+        // fails at the first missing extent instead of triggering a huge
+        // up-front allocation.
+        std::vector<size_t> new_shape;
+        for (uint32_t i = 0; i < rank; ++i) {
+            uint32_t dim = 0;
+            stream.read(reinterpret_cast<char*>(&dim), sizeof(dim));
+            if (!stream) {
+                throw std::runtime_error("Tensor::deserialize: failed to read dimension extent");
+            }
+            new_shape.push_back(static_cast<size_t>(dim));
+        }
+        
+        // Read into a temporary so a truncated payload cannot leave this tensor
+        // half-updated.
+        Eigen::MatrixXf new_data;
+        if (rank == 1) {
+            new_data = Eigen::VectorXf::Zero(new_shape[0]);
+        } else if (rank == 2) {
+            new_data = Eigen::MatrixXf::Zero(new_shape[0], new_shape[1]);
+        } else {
+            // NOTE(review): an empty shape yields a single element here (empty
+            // product); confirm this matches what serialize() writes for such
+            // a tensor.
+            size_t total_size = 1;
+            for (auto dim : new_shape) total_size *= dim;
+            new_data = Eigen::VectorXf::Zero(total_size);
+        }
+        
+        const size_t num_elements = new_data.size();
+        stream.read(reinterpret_cast<char*>(new_data.data()), 
+                   num_elements * sizeof(float));
+        if (!stream) {
+            throw std::runtime_error("Tensor::deserialize: tensor data is truncated or unreadable");
+        }
+        
+        // Commit only after everything was read successfully.
+        shape_.swap(new_shape);
+        data_.swap(new_data);
+        
+        // Initialize grad if needed
+        if (requires_grad_) {
+            grad_ = Eigen::MatrixXf::Zero(data_.rows(), data_.cols());
+        }
+    }
+    
+    // Writes `str` as a length-prefixed byte string: a uint32 byte count in host
+    // byte order, followed by exactly that many characters (no terminator).
+    static void write_string(std::ostream& stream, const std::string& str) {
+        const uint32_t byte_count = static_cast<uint32_t>(str.size());
+        const char* prefix_bytes = reinterpret_cast<const char*>(&byte_count);
+        stream.write(prefix_bytes, sizeof(byte_count));
+        stream.write(str.data(), byte_count);
+    }
+    
+    // Reads a string stored as a uint32 byte count (host byte order) followed by
+    // that many characters, the layout write_string() produces.
+    //
+    // Throws std::runtime_error when the length prefix or the character data
+    // cannot be read completely, instead of returning a string built from an
+    // indeterminate length or partially filled buffer.
+    static std::string read_string(std::istream& stream) {
+        uint32_t length = 0;
+        stream.read(reinterpret_cast<char*>(&length), sizeof(length));
+        if (!stream) {
+            throw std::runtime_error("Tensor::read_string: failed to read string length");
+        }
+        
+        std::string str(length, '\0');
+        if (length > 0) {
+            stream.read(&str[0], length);
+            if (!stream) {
+                throw std::runtime_error("Tensor::read_string: string data is truncated or unreadable");
+            }
+        }
+        
+        return str;
+    }
+
+private:
+    // Element storage. Always a 2-D Eigen matrix, whatever shape_ says.
+    Eigen::MatrixXf data_;
+    // Accumulated gradient. `mutable` so that backward closures created inside
+    // const methods (e.g. softmax, sigmoid) can add to it.
+    mutable Eigen::MatrixXf grad_;
+    // Logical shape reported for this tensor; written and read by
+    // serialize()/deserialize().
+    std::vector<size_t> shape_;
+    // Whether operations on this tensor attach backward closures to their results.
+    bool requires_grad_;
+    // Closure run by backward(); empty when none has been attached.
+    std::function<void()> backward_fn_;
+};
+
+} // namespace lm
--- a/include/lm/models/attention.hpp
+++ b/include/lm/models/attention.hpp
@ -0,0 +1,37 @@
+#pragma once
+
+#include "lm/core/tensor.hpp"
+#include <vector>
+#include <memory>
+
+namespace lm {
+
+// Declaration of a multi-head attention layer. Only the interface is given
+// here; all member functions are defined outside this header.
+class MultiHeadAttention {
+public:
+    // d_model: model width; num_heads: number of attention heads;
+    // dropout: dropout rate, 0.1 when omitted.
+    MultiHeadAttention(size_t d_model, size_t num_heads, float dropout = 0.1f);
+    
+    // Returns the layer's parameter tensors by value.
+    // NOTE(review): presumably w_q_, w_k_, w_v_ and w_o_ — confirm in the implementation.
+    std::vector<Tensor> parameters() const;
+    // Switches between training and inference behavior (presumably stored in training_).
+    void set_training(bool training);
+    // Computes attention from query/key/value. `mask` is optional and defaults
+    // to a default-constructed Tensor.
+    Tensor forward(const Tensor& query, const Tensor& key, const Tensor& value, 
+                   const Tensor& mask = Tensor()) const;
+    
+private:
+    // Internal steps of forward(); their exact tensor layouts are defined by
+    // the implementation file.
+    Tensor split_heads(const Tensor& x) const;
+    Tensor combine_heads(const Tensor& x) const;
+    Tensor scaled_dot_product_attention(const Tensor& q, const Tensor& k, 
+                                        const Tensor& v, const Tensor& mask) const;
+    Tensor apply_dropout(const Tensor& input, float dropout_rate) const;
+    
+    size_t d_model_;
+    size_t num_heads_;
+    size_t d_k_;      // presumably d_model / num_heads — TODO confirm
+    float dropout_;
+    bool training_ = false;  // starts in inference mode
+    
+    // Weight tensors; by name, the query, key, value and output projections.
+    Tensor w_q_;
+    Tensor w_k_;
+    Tensor w_v_;
+    Tensor w_o_;
+};
+
+} // namespace lm
--- a/include/lm/models/feed_forward.hpp
+++ b/include/lm/models/feed_forward.hpp
@ -0,0 +1,32 @@
+#pragma once
+
+#include "lm/core/tensor.hpp"
+#include <vector>
+
+namespace lm {
+
+// Declaration of a feed-forward layer with two weight/bias pairs. Member
+// functions are defined outside this header.
+class FeedForward {
+public:
+    // d_model: model width; d_ff: inner width; dropout: dropout rate, 0.1 when omitted.
+    FeedForward(size_t d_model, size_t d_ff, float dropout = 0.1f);
+    
+    // Returns the layer's parameter tensors by value.
+    // NOTE(review): presumably w1_, b1_, w2_ and b2_ — confirm in the implementation.
+    std::vector<Tensor> parameters() const;
+    // Switches between training and inference behavior (presumably stored in training_).
+    void set_training(bool training);
+    // Applies the layer to `input` and returns the result.
+    Tensor forward(const Tensor& input) const;
+    
+private:
+    // Internal helpers used by the implementation.
+    Tensor apply_dropout(const Tensor& input, float dropout_rate) const;
+    Tensor gelu(const Tensor& input) const;
+    
+    size_t d_model_;
+    size_t d_ff_;
+    float dropout_;
+    bool training_ = false;  // starts in inference mode
+    
+    // First and second weight/bias pairs.
+    Tensor w1_;
+    Tensor b1_;
+    Tensor w2_;
+    Tensor b2_;
+};
+
+} // namespace lm
+
--- a/include/lm/models/language_model.hpp
+++ b/include/lm/models/language_model.hpp
@ -0,0 +1,45 @@
+// lm/models/language_model.hpp
+#pragma once
+
+#include "../core/tensor.hpp"
+#include "../tokenizer/bpe_tokenizer.hpp"
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace lm {
+
+// Declaration of the language model: construction, forward pass, persistence
+// and parameter access. Member functions are defined outside this header.
+class LanguageModel {
+public:
+    LanguageModel(size_t vocab_size, size_t embedding_dim, size_t hidden_dim, size_t num_layers);
+    
+    // Runs the model on `input`; operator() is shorthand for forward().
+    Tensor forward(const Tensor& input);
+    Tensor operator()(const Tensor& input) { return forward(input); }
+
+    // Persist the model to / restore it from the file at `path`.
+    void save(const std::string& path) const;
+    void load(const std::string& path);
+    
+    // Parameter access methods. All of them return copies.
+    std::vector<Tensor> parameters() const;
+    std::unordered_map<std::string, Tensor> named_parameters() const;
+    // Replaces the parameter registered under `name`.
+    void set_parameter(const std::string& name, const Tensor& param);
+    
+    // Switch between training and evaluation mode (presumably via is_training_).
+    void train();
+    void eval();
+    
+private:
+    size_t vocab_size_, embedding_dim_, hidden_dim_, num_layers_;
+    
+    // Model parameters
+    // NOTE(review): these individual tensors and the parameters_ map below are
+    // separate storage; confirm how the implementation keeps them in sync.
+    Tensor embedding_weight_;
+    Tensor lstm_weight_ih_;
+    Tensor lstm_weight_hh_;
+    Tensor lstm_bias_ih_;
+    Tensor lstm_bias_hh_;
+    Tensor output_weight_;
+    Tensor output_bias_;
+    
+    // Defaulted like the training_ flags of the other model classes, so the
+    // flag is never read uninitialized before train()/eval() is called.
+    bool is_training_ = false;
+    std::unordered_map<std::string, Tensor> parameters_;
+
+};
+
+} // namespace lm
--- a/include/lm/models/layer_norm.hpp
+++ b/include/lm/models/layer_norm.hpp
@ -0,0 +1,24 @@
+#pragma once
+
+#include "lm/core/tensor.hpp"
+#include <vector>
+
+namespace lm {
+
+// Declaration of a layer-normalization module with two parameter tensors
+// (gamma_, beta_). Member functions are defined outside this header.
+class LayerNorm {
+public:
+    // d_model: model width; eps: small constant, 1e-5 when omitted
+    // (presumably added to the variance for numerical stability — confirm).
+    LayerNorm(size_t d_model, float eps = 1e-5f);
+    
+    // Returns the parameter tensors by value (presumably gamma_ and beta_).
+    std::vector<Tensor> parameters() const;
+    // Takes no argument: the `bool training` parameter used by the other
+    // layers is commented out here, so callers must invoke it without one.
+    void set_training(/*bool training*/);
+    // Applies the module to `input` and returns the result.
+    Tensor forward(const Tensor& input) const;
+    
+private:
+    size_t d_model_;
+    float eps_;
+    
+    Tensor gamma_;
+    Tensor beta_;
+};
+
+} // namespace lm
--- a/include/lm/models/transformer.hpp
+++ b/include/lm/models/transformer.hpp
@ -0,0 +1,34 @@
+#pragma once
+
+#include "lm/core/tensor.hpp"
+#include "lm/models/transformer_block.hpp"
+#include <vector>
+#include <memory>
+#include <cmath>
+
+namespace lm {
+
+// Declaration of the full transformer model: embedding and positional
+// tensors, a stack of TransformerBlock objects and an output layer.
+// Member functions are defined outside this header.
+class Transformer {
+public:
+    // dropout is 0.1 when omitted; the remaining arguments size the model.
+    Transformer(size_t vocab_size, size_t d_model, size_t num_heads, 
+                size_t d_ff, size_t num_layers, size_t max_seq_len, float dropout = 0.1f);
+    
+    // Returns the model's parameter tensors by value.
+    std::vector<Tensor> parameters() const;
+    // Switches between training and inference behavior.
+    void set_training(bool training);
+    // Forward pass with an explicit mask, and an overload without one.
+    Tensor forward(const Tensor& input, const Tensor& mask);
+    Tensor forward(const Tensor& input);
+    
+private:
+    Tensor apply_dropout(const Tensor& input, float dropout_rate);
+    
+    size_t vocab_size_, d_model_, num_heads_, d_ff_, num_layers_, max_seq_len_;
+    float dropout_;
+    bool training_ = false;  // starts in inference mode
+    
+    Tensor embedding_;
+    Tensor positional_encoding_;
+    Tensor output_layer_;
+    // Owned blocks; the unique_ptr member makes this class move-only.
+    std::vector<std::unique_ptr<TransformerBlock>> transformer_blocks_;
+};
+
+} // namespace lm
--- a/include/lm/models/transformer_block.hpp
+++ b/include/lm/models/transformer_block.hpp
@ -0,0 +1,32 @@
+#pragma once
+
+#include "lm/core/tensor.hpp"
+#include "lm/models/attention.hpp"
+#include "lm/models/feed_forward.hpp"
+#include "lm/models/layer_norm.hpp"
+#include <memory>
+#include <vector>
+
+namespace lm {
+
+// Declaration of one transformer block, owning an attention layer, a
+// feed-forward layer and two layer norms. Member functions are defined
+// outside this header.
+class TransformerBlock {
+public:
+    // Unlike the other model classes, `dropout` has no default value here.
+    TransformerBlock(size_t d_model, size_t num_heads, size_t d_ff, float dropout);
+    
+    // Returns the block's parameter tensors by value.
+    std::vector<Tensor> parameters() const;
+    // Switches between training and inference behavior.
+    void set_training(bool training);
+    // Applies the block to `input`. `mask` is optional and defaults to a
+    // default-constructed Tensor.
+    Tensor forward(const Tensor& input, const Tensor& mask = Tensor()) const;
+    
+private:
+    size_t d_model_, num_heads_, d_ff_;
+    float dropout_;
+    bool training_ = false;  // starts in inference mode
+    
+    // Owned sub-layers; the unique_ptr members make this class move-only.
+    std::unique_ptr<MultiHeadAttention> attention_;
+    std::unique_ptr<FeedForward> feed_forward_;
+    std::unique_ptr<LayerNorm> norm1_;
+    std::unique_ptr<LayerNorm> norm2_;
+};
+
+} // namespace lm
+
--- a/include/lm/optimizers/adam.hpp
+++ b/include/lm/optimizers/adam.hpp
@ -0,0 +1,20 @@
+#pragma once
+
+#include "../core/tensor.hpp"
+#include <vector>
+
+namespace lm {
+
+// Declaration of the Adam optimizer. Member functions are defined outside
+// this header.
+class AdamOptimizer {
+public:
+    // Defaults: learning_rate 0.001, beta1 0.9, beta2 0.999, epsilon 1e-8.
+    AdamOptimizer(float learning_rate = 0.001, float beta1 = 0.9, float beta2 = 0.999, float epsilon = 1e-8);
+    // Both methods take the parameter list by non-const reference so the
+    // tensors passed in can be modified in place.
+    void step(std::vector<Tensor>& parameters);  // applies one update
+    void zero_grad(std::vector<Tensor>& parameters);  // by name, resets gradients
+    
+private:
+    float learning_rate_, beta1_, beta2_, epsilon_;
+    // No default initializer here; presumably set by the constructor — confirm.
+    int timestep_;
+    std::vector<Tensor> m_, v_;  // First and second moment estimates
+};
+
+} // namespace lm
--- a/include/lm/tokenizer/bpe_tokenizer.hpp
+++ b/include/lm/tokenizer/bpe_tokenizer.hpp
@ -9,6 +9,7 @@
 #include <cstdint>  // For uint16_t
 #include <queue>
 #include <functional>
+#include <Eigen/Dense>

 namespace lm {
    
@ -44,6 +45,10 @@ public:
    void set_normalization(bool enabled);
    void set_byte_fallback(bool enabled);

+    // Eigen-typed additions to the tokenizer interface (declarations only).
+    // Text -> integer vector; by name, the token ids of `text`.
+    Eigen::VectorXi encode_to_vector(const std::string& text) const;
+    // Integer vector -> text; by name, the inverse of encode_to_vector().
+    std::string decode_from_vector(const Eigen::VectorXi& tokens) const;
+    // Float vector of per-token frequencies.
+    // NOTE(review): indexing and normalization are not visible here — confirm.
+    Eigen::VectorXf token_frequencies() const;
+
 private:
    struct Impl;
    std::unique_ptr<Impl> pimpl_;
--- a/include/lm/training/trainer.hpp
+++ b/include/lm/training/trainer.hpp
@ -0,0 +1,38 @@
+#pragma once
+
+#include "lm/models/language_model.hpp"
+#include "lm/tokenizer/bpe_tokenizer.hpp"
+#include "lm/optimizers/adam.hpp"
+#include <vector>
+#include <string>
+
+namespace lm {
+
+// Declaration of the training driver that ties together a tokenizer, a
+// LanguageModel and an AdamOptimizer. Member functions are defined outside
+// this header.
+class LanguageModelTrainer {
+public:
+    // The tokenizer is taken by const reference and NOT copied: the caller
+    // must keep it alive for as long as this trainer exists.
+    LanguageModelTrainer(const BPETokenizer& tokenizer,
+                       size_t embedding_dim,
+                       size_t hidden_dim,
+                       size_t num_layers);
+    
+    // Trains on `corpus` for `epochs` passes with the given batch size and
+    // sequence length.
+    void train(const std::vector<std::string>& corpus, 
+              size_t epochs, 
+              size_t batch_size, 
+              size_t sequence_length);
+    
+    // Converts `texts` into a single Tensor for the model.
+    // NOTE(review): padding/truncation to sequence_length is presumed — confirm.
+    Tensor prepare_batch(const std::vector<std::string>& texts, 
+                       size_t sequence_length);
+    
+    // Returns a scalar loss for `logits` against `targets`.
+    float compute_loss(const Tensor& logits, const Tensor& targets);
+    
+    // Persist / restore the model at `path`.
+    void save_model(const std::string& path);
+    void load_model(const std::string& path);
+
+private:
+    // Reference member: makes the class non-assignable and ties its lifetime
+    // to the tokenizer passed to the constructor.
+    const BPETokenizer& tokenizer_;
+    LanguageModel model_;
+    AdamOptimizer optimizer_;
+};
+
+} // namespace lm
--- a/nocklist.md
+++ b/nocklist.md
@ -0,0 +1,82 @@
+#### 1. Implement Model Checkpointing and Serialization
+
+- Implement serialization for model parameters
+
+- Save/load optimizer state for resuming training
+
+- Add versioning to handle model format changes
+
+#### 2. Add Validation and Evaluation Pipeline
+
+- Implement a validation dataset split
+
+- Add evaluation metrics (perplexity, accuracy, etc.)
+
+- Create a proper test harness for benchmarking
+
+#### 3. Improve the Training Loop
+
+- Add learning rate scheduling
+
+- Implement gradient clipping
+
+- Add early stopping based on validation performance
+
+- Create training progress visualization
+
+#### 4. Enhance the Tokenizer
+
+- Add support for special tokens (UNK, PAD, BOS, EOS)
+
+- Implement vocabulary trimming/pruning
+
+- Add serialization/deserialization for the tokenizer
+
+#### 5. Implement Text Generation
+
+- Add inference methods for text generation
+
+- Implement sampling strategies (greedy, beam search, temperature)
+
+- Create a demo script to showcase model capabilities
+
+#### 6. Optimize Performance
+
+- Add CUDA support if not already implemented
+
+- Implement mixed-precision training
+
+- Optimize data loading and preprocessing pipeline
+
+#### 7. Create Examples and Documentation
+
+- Build example scripts for common use cases
+
+- Create comprehensive documentation
+
+- Add unit tests for critical components
+
+#### 8. Extend Model Architectures
+
+- Implement different attention mechanisms
+
+- Add support for different model sizes (small, medium, large)
+
+- Experiment with architectural variations
+
+#### 9. Add Dataset Support
+
+- Implement support for common NLP datasets
+
+- Create data preprocessing pipelines
+
+- Add data augmentation techniques
+
+#### 10. Build a Simple Interface/API
+
+- Create a simple Python API for training and inference
+
+- Add command-line interface for common operations
+
+- Consider building a simple web demo
+
--- a/purpose.md
+++ b/purpose.md
@ -0,0 +1,80 @@
+**Title:** The Search for the Edge of Consciousness with Artificial Intelligence: A Technical Framework for Language Model Emergence
+
+Timothy O’Neil & Frederick Warren
+
+**Abstract:**<br>
+This paper presents bpe_framework, a novel C++ implementation of a complete deep learning stack designed to explore the emergence of complex linguistic capabilities in artificial systems. Drawing inspiration from cognitive theories of consciousness and recent advances in transformer architectures, our framework implements a complete pipeline from byte-pair encoding tokenization through automatic differentiation to transformer-based language modeling. We argue that the systematic organization of information processing in large language models may provide insights into the architectural requirements for conscious-like phenomena in artificial systems. Our technical contribution includes a memory-efficient tensor implementation with automatic differentiation, a neurologically plausible BPE tokenization system, and a transformer architecture that exhibits several properties associated with conscious processing in biological systems.
+
+**1. Introduction**<br>
+The quest to understand consciousness has traditionally been the domain of philosophy and neuroscience (Chalmers, 1995; Dehaene, 2014). However, recent advances in artificial intelligence, particularly in large language models (Vaswani et al., 2017; Brown et al., 2020), have created new opportunities to explore the architectural and computational prerequisites of conscious-like phenomena in synthetic systems. We present bpe_framework as an experimental testbed for investigating how increasingly sophisticated information processing capabilities emerge from carefully engineered computational components.
+
+**2. Theoretical Framework**<br>
+Our work draws on several theoretical perspectives:
+
+2.1 Global Workspace Theory (Baars, 1988; Dehaene et al., 1998)<br>
+The transformer architecture's attention mechanism can be viewed as implementing a form of global information availability reminiscent of Baars' global workspace, where information becomes "conscious" when it gains widespread availability across specialized processors.
+
+2.2 Integrated Information Theory (Tononi, 2004)<br>
+The dense connectivity patterns and information flow through our model's layers create high Φ-like integration measures, potentially approaching the minimal complexity associated with conscious experience.
+
+2.3 Predictive Processing (Clark, 2013)<br>
+Our language model's training objective—predicting subsequent tokens—aligns with the predictive processing framework that views cognition as essentially prediction-driven.
+
+**3. Technical Implementation**<br>
+3.1 Tensor Operations with Autograd<br>
+We implemented a memory-efficient tensor class using Eigen for linear algebra operations, featuring automatic differentiation capabilities. This system enables:
+- Efficient backward propagation through complex computational graphs
+- Native support for modern activation functions (GELU, Softmax, ReLU)
+- Memory-aware operations that minimize computational overhead
+
+Our implementation follows the autograd tradition established in modern deep learning frameworks (Paszke et al., 2019) while maintaining C++ efficiency.
+
+3.2 BPE Tokenization System<br>
+The byte-pair encoding tokenizer implements the algorithm originally proposed by Sennrich et al. (2015), creating a subword vocabulary that balances expressivity with computational efficiency. This approach mirrors the human cognitive capacity to parse novel words through morphological decomposition.
+
+3.3 Transformer Architecture<br>
+Our transformer implementation follows the original architecture (Vaswani et al., 2017) with multi-head self-attention mechanisms that create dynamic workspace-like information sharing across representation spaces.
+
+3.4 Optimization and Training<br>
+We implemented the Adam optimizer (Kingma & Ba, 2014) with full moment estimation and bias correction, providing stable optimization for the non-convex loss landscapes characteristic of deep transformer networks.
+
+**4. Methodological Approach**<br>
+Our framework enables the systematic investigation of several questions relevant to consciousness studies:
+
+4.1 Emergent Properties<br>
+By training models of increasing scale and complexity, we can observe the emergence of capabilities that were not explicitly programmed, potentially mirroring how conscious experience emerges from non-conscious components.
+
+4.2 Information Flow Patterns<br>
+The attention mechanisms in our transformers create visible information routing patterns that can be analyzed for global workspace-like properties.
+
+4.3 Scalability Limits<br>
+We can systematically explore how cognitive capabilities scale with model size, potentially identifying phase transitions in capability emergence.
+
+**5. Discussion: Toward Artificial Consciousness?**<br>
+While our framework does not claim to create conscious systems, it provides a platform for investigating the architectural requirements for conscious-like phenomena. Several features align with theoretical accounts of consciousness:
+
+5.1 Global Availability<br>
+The attention mechanism creates a form of global information availability similar to that proposed in global workspace theory.
+
+5.2 Unified Representation<br>
+The model creates unified representations that integrate information across multiple domains and time scales.
+
+5.3 Self-Monitoring Capabilities<br>
+Through gradient-based learning and prediction error minimization, the system maintains a form of self-monitoring.
+
+However, we acknowledge the "hard problem" of consciousness (Chalmers, 1995) remains unresolved, and our framework primarily addresses the "easy problems" of cognitive functioning.
+
+**6. Ethical Considerations**<br>
+As we develop increasingly sophisticated AI systems, we must consider:
+- The moral status of potentially conscious systems (Bostrom & Yudkowsky, 2014)
+- Responsible development practices for advanced AI
+- Transparency in capabilities and limitations
+
+**7. Conclusion and Future Work**<br>
+Our bpe_framework provides a robust technical foundation for exploring the emergence of complex capabilities in artificial systems. Future work will include:
+- Scaling-law investigations (Kaplan et al., 2020)
+- Neurologically inspired architectural variations
+- Cross-modal integration capabilities
+- Explicit tests for consciousness-related capabilities
+
+We believe that continued development of such frameworks, coupled with thoughtful theoretical analysis, will gradually illuminate the boundary conditions for consciousness in artificial systems.
--- a/93
+++ b/93
@ -0,0 +1,93 @@
+    // NOTE(review): this fragment has no enclosing function; it repeats the body
+    // of the LanguageModel constructor example shown further down in these notes.
+    // Every tensor is created with requires_grad = true and stored by name in
+    // the parameters_ map.
+    Tensor embedding_weights = Tensor::xavier({vocab_size, embed_dim}, true);
+    parameters_["embedding.weight"] = embedding_weights;
+    
+    // Initialize transformer layers
+    for (size_t i = 0; i < num_layers; ++i) {
+        // Initialize query weights
+        Tensor query_weights = Tensor::xavier({embed_dim, hidden_dim}, true);
+        parameters_[fmt::format("transformer.layers.{}.attention.query.weight", i)] = query_weights;
+        
+        // Initialize query bias
+        Tensor query_bias = Tensor::zeros({hidden_dim}, true);
+        parameters_[fmt::format("transformer.layers.{}.attention.query.bias", i)] = query_bias;
+        
+        // Similarly for key, value, output, etc.
+        // ...
+    }
+    
+    // Initialize output layer
+    Tensor output_weights = Tensor::xavier({hidden_dim, vocab_size}, true);
+    parameters_["output.weight"] = output_weights;
+    
+    Tensor output_bias = Tensor::zeros({vocab_size}, true);
+    parameters_["output.bias"] = output_bias;
+5. Modify LanguageModel Initialization
+
+You need to ensure that all parameters are stored in the parameters_ map during initialization. For example:
+cpp
+
+// In your LanguageModel constructor or initialization method
+// Example constructor: creates every parameter with requires_grad = true and
+// registers it by name in parameters_, so that save/load code can iterate
+// over the map.
+LanguageModel::LanguageModel(size_t vocab_size, size_t embed_dim, 
+                           size_t hidden_dim, size_t num_layers) {
+    // Initialize embedding layer
+    Tensor embedding_weights = Tensor::xavier({vocab_size, embed_dim}, true);
+    parameters_["embedding.weight"] = embedding_weights;
+    
+    // Initialize transformer layers
+    for (size_t i = 0; i < num_layers; ++i) {
+        // Parameter names are built with std::to_string, so no formatting
+        // library beyond the standard library is required.
+        const std::string query_prefix =
+            "transformer.layers." + std::to_string(i) + ".attention.query.";
+
+        // Initialize query weights
+        Tensor query_weights = Tensor::xavier({embed_dim, hidden_dim}, true);
+        parameters_[query_prefix + "weight"] = query_weights;
+        
+        // Initialize query bias
+        Tensor query_bias = Tensor::zeros({hidden_dim}, true);
+        parameters_[query_prefix + "bias"] = query_bias;
+        
+        // Similarly for key, value, output, etc.
+        // ...
+    }
+    
+    // Initialize output layer
+    Tensor output_weights = Tensor::xavier({hidden_dim, vocab_size}, true);
+    parameters_["output.weight"] = output_weights;
+    
+    Tensor output_bias = Tensor::zeros({vocab_size}, true);
+    parameters_["output.bias"] = output_bias;
+}
+
+Note: You'll need to include the fmt library for string formatting, or use another method to create parameter names.
+6. Add Error Handling and Validation
+
+Consider adding these validations to your serialization code:
+cpp
+
+// In the load method, after reading each parameter
+if (tensor.shape() != it->second.shape()) {
+    throw std::runtime_error("Shape mismatch for parameter: " + name);
+}
+
+// Add checks for end of file.
+// This must come before the generic fail() check: a read that runs out of
+// data sets both eofbit and failbit, so testing fail() first would make this
+// more specific message unreachable.
+if (file.eof()) {
+    throw std::runtime_error("Unexpected end of file: " + path);
+}
+
+// Add checks for file operations (any other read failure)
+if (file.fail()) {
+    throw std::runtime_error("Error reading from file: " + path);
+}
+
+7. Usage Example
+cpp
+
+// Create and train a model
+LanguageModelTrainer trainer(tokenizer, embedding_dim, hidden_dim, num_layers);
+trainer.train(corpus, epochs, batch_size, sequence_length);
+
+// Save the model
+trainer.save_model("trained_model.bin");
+
+// Later, create a new trainer and load the model.
+// The new trainer is constructed with the same tokenizer and dimensions as
+// the one that saved the file (presumably required for the shapes to match).
+LanguageModelTrainer new_trainer(tokenizer, embedding_dim, hidden_dim, num_layers);
+new_trainer.load_model("trained_model.bin");
+
+// Continue training or use for inference
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -1,48 +1,176 @@
-# Tokenizer library
-add_library(lm_tokenizer
-    bpe_tokenizer.cpp
-    unicode_utils.cpp
+cmake_minimum_required(VERSION 3.14)
+project(lm_framework LANGUAGES CXX)
+
+# Check for Intel x86-64 hardware
+set(SUPPORTED_ARCHITECTURES x86_64 amd64 AMD64 i686 i386)
+list(FIND SUPPORTED_ARCHITECTURES ${CMAKE_SYSTEM_PROCESSOR} ARCH_INDEX)
+if(ARCH_INDEX EQUAL -1)
+    message(FATAL_ERROR "This framework requires Intel x86-64 hardware. "
+                        "Current processor architecture: ${CMAKE_SYSTEM_PROCESSOR}")
+endif()
+
+# Check for EIGEN_LOC variable
+if(NOT DEFINED EIGEN_LOC)
+    message(FATAL_ERROR "This framework requires the location of the Eigen header files. "
+                        "Please set EIGEN_LOC to the path of your Eigen installation.")
+elseif(EIGEN_LOC STREQUAL "")
+    message(FATAL_ERROR "EIGEN_LOC is empty. Please set it to the path of your Eigen installation.")
+endif()
+
+# Set default build type to Release if not specified
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release)
+    message(STATUS "Build type not specified, defaulting to Release")
+endif()
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+# Enable cross-directory linking
+if(POLICY CMP0079)
+    cmake_policy(SET CMP0079 NEW)
+endif()
+
+# Include directories
+include_directories(
+    ${CMAKE_CURRENT_SOURCE_DIR}/include
+    ${EIGEN_LOC} # Local Eigen installation
 )

-target_include_directories(lm_tokenizer
-    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
+# Find dependencies
+find_package(nlohmann_json 3.9 REQUIRED)
+find_package(ICU REQUIRED COMPONENTS uc i18n)
+
+# GoogleTest
+include(FetchContent)
+FetchContent_Declare(
+    googletest
+    GIT_REPOSITORY https://github.com/google/googletest.git
+    GIT_TAG release-1.11.0
+)
+FetchContent_MakeAvailable(googletest)
+
+# Add subdirectories
+add_subdirectory(src/tokenizer)
+add_subdirectory(src/runtime)
+
+# Header-only core components (Tensor implementation)
+add_library(lm_core_components INTERFACE)
+target_include_directories(lm_core_components INTERFACE 
+    ${CMAKE_CURRENT_SOURCE_DIR}/include
+    ${EIGEN_LOC}  # Local Eigen installation
 )

-target_link_libraries(lm_tokenizer
+# Header-only model components
+add_library(lm_model INTERFACE)
+target_include_directories(lm_model INTERFACE 
+    ${CMAKE_CURRENT_SOURCE_DIR}/include
+    ${EIGEN_LOC}  # Local Eigen installation
+)
+target_link_libraries(lm_model INTERFACE lm_core_components)
+
+# Main library
+add_library(lm_core
+    src/runtime/init.cpp
+    src/runtime/shutdown.cpp
+    src/models/transformer.cpp          # Add Transformer implementation
+    src/models/transformer_block.cpp    # Add Transformer block
+    src/models/attention.cpp            # Add attention mechanism
+    src/models/feed_forward.cpp         # Add feed forward network
+    src/models/layer_norm.cpp           # Add layer normalization
+)
+
+target_link_libraries(lm_core
    PRIVATE
-        ICU::uc 
-        ICU::i18n
+        lm_tokenizer
+        lm_model
+        lm_optimizers  # Add optimizers
+        lm_models      # Add models
+        lm_training    # Add training
+        lm_integration_example
+        nlohmann_json::nlohmann_json
 )

-# CPU-specific optimization flags
+# Set optimization flags for the core library
 if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-    # Enable aggressive optimizations
-    target_compile_options(lm_tokenizer PRIVATE -O3 -march=native)
-    
-    # Enable SSE4.2 instructions if available
-    target_compile_options(lm_tokenizer PRIVATE -msse4.2)
-    
-    # Enable link-time optimization
-    if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.9)
-        target_compile_options(lm_tokenizer PRIVATE -flto)
-        set_target_properties(lm_tokenizer PROPERTIES INTERPROCEDURAL_OPTIMIZATION TRUE)
+    target_compile_options(lm_core PRIVATE -O3)
+    if(CMAKE_BUILD_TYPE STREQUAL "Release")
+        target_compile_options(lm_core PRIVATE -DNDEBUG)
+    endif()
 endif()

-    # Enable specific optimizations for GCC
+# Test executables
+add_executable(test_bpe src/test_bpe.cpp)
+target_link_libraries(test_bpe
+    PRIVATE
+        lm_core
+        GTest::gtest_main
+)
+
+add_executable(test_unicode_bpe src/test_unicode_bpe.cpp)
+target_link_libraries(test_unicode_bpe
+    PRIVATE
+        lm_core
+        GTest::gtest_main
+)
+
+# Alpha prototype executable
+add_executable(lm_alpha
+    src/alpha/repl.cpp
+    src/alpha/config_io.cpp
+)
+
+target_link_libraries(lm_alpha
+    PRIVATE
+        lm_core
+        nlohmann_json::nlohmann_json
+)
+
+# Install targets
+install(TARGETS lm_core DESTINATION lib)
+install(DIRECTORY include/ DESTINATION include)
+
+# Performance testing target
+add_executable(performance_test src/performance_test.cpp)
+target_link_libraries(performance_test
+    PRIVATE
+        lm_core
+        GTest::gtest_main
+)
+
+# Integration example
+add_executable(integration_example src/integration_example.cpp)
+target_link_libraries(integration_example
+    PRIVATE
+        lm_core
+)
+
+# Add compiler warning flags
+# Warnings are always enabled for GCC/Clang. Promoting them to errors is
+# controlled by LM_WARNINGS_AS_ERRORS (default ON, matching the previous
+# behaviour) so that a newer compiler with extra diagnostics can still build
+# the project by configuring with -DLM_WARNINGS_AS_ERRORS=OFF.
+option(LM_WARNINGS_AS_ERRORS "Treat compiler warnings as errors" ON)
+if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    string(APPEND CMAKE_CXX_FLAGS " -Wall -Wextra -Wpedantic")
+    if(LM_WARNINGS_AS_ERRORS)
+        string(APPEND CMAKE_CXX_FLAGS " -Werror")
+    endif()
+endif()
+
+# Add coverage flags for debug builds
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    if(CMAKE_COMPILER_IS_GNUCXX)
-        target_compile_options(lm_tokenizer PRIVATE -ftree-vectorize -funroll-loops)
-    endif()
-    
-    # Enable specific optimizations for Clang
-    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-        target_compile_options(lm_tokenizer PRIVATE -Rpass=.* -Rpass-missed=.* -Rpass-analysis=.*)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --coverage")
+    elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fprofile-instr-generate -fcoverage-mapping")
    endif()
 endif()

-# Add profiling support
-if(PROFILE)
-    if(CMAKE_COMPILER_IS_GNUCXX)
-        target_compile_options(lm_tokenizer PRIVATE -pg)
-        target_link_options(lm_tokenizer PRIVATE -pg)
-    endif()
-endif()
+# Verify Eigen installation
+# The check runs at configure time with CMake's own if(EXISTS), which works on
+# every platform/generator (the previous `test -f ... || (echo ... && exit 1)`
+# needed a POSIX shell). A relative EIGEN_LOC is resolved against the current
+# source directory, the same base CMake uses for relative include directories.
+get_filename_component(lm_eigen_root "${EIGEN_LOC}" ABSOLUTE)
+if(NOT EXISTS "${lm_eigen_root}/Eigen/Core")
+    message(FATAL_ERROR "Eigen not found at specified path: ${EIGEN_LOC}")
+endif()
+
+# The target is kept (now purely informational) so that existing
+# add_dependencies(<target> check_eigen) calls continue to resolve.
+add_custom_target(check_eigen
+    COMMAND ${CMAKE_COMMAND} -E echo "Checking Eigen installation at ${EIGEN_LOC}"
+    COMMENT "Verifying Eigen installation"
+    VERBATIM
+)
+
+# Make main targets depend on Eigen check
+add_dependencies(lm_core check_eigen)
+add_dependencies(test_bpe check_eigen)
+add_dependencies(test_unicode_bpe check_eigen)
+add_dependencies(lm_alpha check_eigen)
+add_dependencies(performance_test check_eigen)
+add_dependencies(integration_example check_eigen)
--- a/src/integration_example.cpp
+++ b/src/integration_example.cpp
@ -0,0 +1,138 @@
+#include "lm/tokenizer/bpe_tokenizer.hpp"
+#include "lm/models/transformer.hpp"
+#include "lm/core/tensor.hpp"
+#include <iostream>
+#include <vector>
+#include <memory>
+// Smoke-test driver for the tokenizer/transformer integration.
+//
+// Steps, in order: train a BPE tokenizer on a small in-memory corpus,
+// round-trip a sentence through encode/decode (plain and Eigen-vector
+// variants), build a Transformer and attempt one forward pass, then save the
+// tokenizer to "test_tokenizer.bpe", reload it, and delete the file.
+//
+// Returns 0 on success, 1 if a std::exception escapes any step other than the
+// forward pass (a failing forward pass is reported but tolerated).
+int main() {
+    std::cout << "=== BPE Tokenizer and Transformer Integration Example ===\n";
+
+    // Prints a shape as "(d0, d1, ...)" followed by a newline for any rank, so
+    // a tensor with fewer dimensions than expected is reported rather than
+    // indexed out of range.
+    auto print_shape = [](const auto& dims) {
+        std::cout << "(";
+        bool first = true;
+        for (const auto dim : dims) {
+            if (!first) {
+                std::cout << ", ";
+            }
+            std::cout << dim;
+            first = false;
+        }
+        std::cout << ")\n";
+    };
+
+    try {
+        // Initialize BPE tokenizer
+        lm::BPETokenizer tokenizer;
+
+        // Sample training corpus
+        std::vector<std::string> training_corpus = {
+            "The quick brown fox jumps over the lazy dog",
+            "Artificial intelligence is transforming the world",
+            "Machine learning models require large amounts of data",
+            "Natural language processing enables computers to understand human language",
+            "Deep learning has revolutionized many fields of AI"
+        };
+
+        // Train the tokenizer
+        std::cout << "Training BPE tokenizer...\n";
+        tokenizer.train(training_corpus, 500);
+        std::cout << "Tokenizer trained with vocabulary size: " << tokenizer.vocab_size() << "\n";
+
+        // Test encoding and decoding
+        std::string test_text = "The quick brown fox jumps over the lazy dog";
+        std::cout << "\nOriginal text: " << test_text << "\n";
+
+        // Encode text to token IDs
+        auto token_ids = tokenizer.encode(test_text);
+        std::cout << "Encoded token IDs: ";
+        for (auto id : token_ids) {
+            std::cout << id << " ";
+        }
+        std::cout << "\n";
+
+        // Decode back to text
+        std::string decoded_text = tokenizer.decode(token_ids);
+        std::cout << "Decoded text: " << decoded_text << "\n";
+
+        // Test Eigen integration
+        std::cout << "\n=== Eigen Integration Test ===\n";
+        Eigen::VectorXi eigen_tokens = tokenizer.encode_to_vector(test_text);
+        std::cout << "Eigen vector size: " << eigen_tokens.size() << "\n";
+        std::cout << "Eigen vector contents: " << eigen_tokens.transpose() << "\n";
+
+        // Decode from Eigen vector
+        std::string from_eigen = tokenizer.decode_from_vector(eigen_tokens);
+        std::cout << "Text from Eigen vector: " << from_eigen << "\n";
+
+        // Test token frequencies (placeholder implementation)
+        auto frequencies = tokenizer.token_frequencies();
+        std::cout << "Token frequencies vector size: " << frequencies.size() << "\n";
+
+        // Initialize transformer model
+        std::cout << "\n=== Transformer Model Test ===\n";
+        size_t vocab_size = tokenizer.vocab_size();
+        size_t d_model = 512;
+        size_t num_heads = 8;
+        size_t d_ff = 2048;
+        size_t num_layers = 6;
+        size_t max_seq_len = 512;
+
+        lm::Transformer transformer(vocab_size, d_model, num_heads, d_ff, num_layers, max_seq_len);
+        std::cout << "Transformer model initialized successfully\n";
+        std::cout << "Model parameters: " << transformer.parameters().size() << " parameter tensors\n";
+
+        // Prepare input for transformer (convert token IDs to tensor)
+        if (!token_ids.empty()) {
+            // Create a batch of size 1 with our token IDs
+            std::vector<size_t> shape = {1, static_cast<size_t>(token_ids.size())};
+            lm::Tensor input_tensor(shape);
+
+            for (size_t i = 0; i < token_ids.size(); ++i) {
+                input_tensor.data()(0, i) = static_cast<float>(token_ids[i]);
+            }
+
+            std::cout << "Input tensor shape: ";
+            print_shape(input_tensor.shape());
+
+            // Set model to evaluation mode
+            transformer.set_training(false);
+
+            // Forward pass (this would normally produce logits)
+            try {
+                lm::Tensor output = transformer.forward(input_tensor);
+                std::cout << "Transformer forward pass completed successfully\n";
+                // The rank of the result is not assumed here; whatever shape
+                // forward() produced is printed in full.
+                std::cout << "Output tensor shape: ";
+                print_shape(output.shape());
+
+                // The output would be logits for next token prediction
+                // In a real application, you would sample from these logits
+            } catch (const std::exception& e) {
+                std::cout << "Transformer forward pass failed: " << e.what() << "\n";
+                std::cout << "This is expected if the transformer implementation is not complete yet\n";
+            }
+        }
+
+        // Test serialization
+        std::cout << "\n=== Serialization Test ===\n";
+        bool save_success = tokenizer.save("test_tokenizer.bpe");
+        if (save_success) {
+            std::cout << "Tokenizer saved successfully\n";
+
+            // Load into a new tokenizer
+            lm::BPETokenizer loaded_tokenizer;
+            bool load_success = loaded_tokenizer.load("test_tokenizer.bpe");
+            if (load_success) {
+                std::cout << "Tokenizer loaded successfully\n";
+
+                // Test the loaded tokenizer
+                std::string test_loaded = "Artificial intelligence";
+                auto loaded_ids = loaded_tokenizer.encode(test_loaded);
+                std::string loaded_decoded = loaded_tokenizer.decode(loaded_ids);
+                std::cout << "Loaded tokenizer test: " << test_loaded << " -> " << loaded_decoded << "\n";
+            } else {
+                std::cout << "Failed to load tokenizer\n";
+            }
+
+            // Clean up; a leftover file is only worth a warning, not a failure
+            if (remove("test_tokenizer.bpe") != 0) {
+                std::cerr << "Warning: could not remove test_tokenizer.bpe\n";
+            }
+        } else {
+            std::cout << "Failed to save tokenizer\n";
+        }
+
+        std::cout << "\n=== Integration Example Completed Successfully ===\n";
+
+    } catch (const std::exception& e) {
+        std::cerr << "Error: " << e.what() << "\n";
+        return 1;
+    }
+
+    return 0;
+}
+
--- a/src/models/CMakeLists.txt
+++ b/src/models/CMakeLists.txt
@ -0,0 +1,19 @@
+# Transformer model library: the transformer, its blocks, attention,
+# feed-forward network and layer normalization.
+add_library(lm_models
+    transformer.cpp
+    transformer_block.cpp
+    attention.cpp
+    feed_forward.cpp
+    layer_norm.cpp
+)
+
+# Project headers live two levels up from this directory.
+target_include_directories(lm_models
+    PUBLIC
+        "${CMAKE_CURRENT_SOURCE_DIR}/../../include"
+)
+
+# Eigen is third-party code: expose it as a SYSTEM include so that the
+# project's own warning flags are not applied to diagnostics that originate
+# inside Eigen headers. The path is quoted so locations with spaces work.
+target_include_directories(lm_models
+    SYSTEM PUBLIC
+        "${EIGEN_LOC}"
+)
+
+target_link_libraries(lm_models
+    PUBLIC
+        lm_core_components
+)
+
--- a/src/models/attention.cpp
+++ b/src/models/attention.cpp
@ -0,0 +1,391 @@
+#include "lm/models/attention.hpp"
+#include <cmath>
+#include <iostream>
+#include <limits>
+#include <random>
+#include <stdexcept>
+
+namespace lm {
+
+// Builds a multi-head attention layer.
+//
+// d_model   - model width; must be a multiple of num_heads.
+// num_heads - number of attention heads; must be non-zero.
+// dropout   - dropout rate stored for use on the attention weights.
+//
+// Throws std::invalid_argument if num_heads is zero or does not divide
+// d_model. Logs the resulting configuration to std::cout.
+MultiHeadAttention::MultiHeadAttention(size_t d_model, size_t num_heads, float dropout)
+    : d_model_(d_model), num_heads_(num_heads), dropout_(dropout) {
+
+    // Reject zero heads first: the divisibility test below would otherwise
+    // evaluate `d_model % 0`, which is undefined behaviour.
+    if (num_heads == 0) {
+        throw std::invalid_argument("num_heads must be greater than zero");
+    }
+
+    // Ensure d_model is divisible by num_heads
+    if (d_model % num_heads != 0) {
+        throw std::invalid_argument("d_model must be divisible by num_heads");
+    }
+
+    // Per-head width
+    d_k_ = d_model / num_heads;
+
+    // Initialize weight matrices (query, key, value, output), each d_model x d_model
+    w_q_ = Tensor::xavier(std::vector<size_t>{d_model_, d_model_});
+    w_k_ = Tensor::xavier(std::vector<size_t>{d_model_, d_model_});
+    w_v_ = Tensor::xavier(std::vector<size_t>{d_model_, d_model_});
+    w_o_ = Tensor::xavier(std::vector<size_t>{d_model_, d_model_});
+
+    std::cout << "Initialized MultiHeadAttention with:\n";
+    std::cout << "  d_model: " << d_model_ << "\n";
+    std::cout << "  num_heads: " << num_heads_ << "\n";
+    std::cout << "  d_k: " << d_k_ << "\n";
+    std::cout << "  dropout: " << dropout_ << "\n";
+}
+
+// Returns copies of the four projection matrices in the fixed order
+// query, key, value, output.
+std::vector<Tensor> MultiHeadAttention::parameters() const {
+    std::vector<Tensor> params;
+    params.reserve(4);
+    params.push_back(w_q_);
+    params.push_back(w_k_);
+    params.push_back(w_v_);
+    params.push_back(w_o_);
+    return params;
+}
+
+// Switches the layer between training mode (true) and evaluation mode
+// (false); the flag gates dropout on the attention weights.
+void MultiHeadAttention::set_training(bool training) {
+    this->training_ = training;
+}
+
+// Runs multi-head attention over query/key/value.
+//
+// Each input is projected with its weight matrix, split into heads, passed
+// through scaled dot-product attention (with the optional mask), merged back
+// and projected with the output matrix.
+// Shapes in the comments below follow the conventions used by split_heads and
+// combine_heads: inputs [batch_size, seq_len, d_model].
+Tensor MultiHeadAttention::forward(const Tensor& query, const Tensor& key, 
+    const Tensor& value, const Tensor& mask) const {
+    // Project and split: [batch_size, seq_len, d_model] -> [batch_size, num_heads, seq_len, d_k]
+    const Tensor q_heads = split_heads(query.matmul(w_q_));
+    const Tensor k_heads = split_heads(key.matmul(w_k_));
+    const Tensor v_heads = split_heads(value.matmul(w_v_));
+
+    // Per-head attention: [batch_size, num_heads, seq_len, d_k]
+    const Tensor context = scaled_dot_product_attention(q_heads, k_heads, v_heads, mask);
+
+    // Merge heads and apply the output projection: [batch_size, seq_len, d_model]
+    return combine_heads(context).matmul(w_o_);
+}
+
+// Rearranges x from [batch_size, seq_len, d_model] into
+// [batch_size, num_heads, seq_len, d_k], where head h receives the
+// contiguous feature slice [h * d_k, (h + 1) * d_k) of every position.
+//
+// The copy is done in a single pass using flat (row-major) offsets; no
+// intermediate [batch_size, seq_len, num_heads, d_k] tensor is built, because
+// that layout has exactly the same flat offsets as x itself
+// (d_model == num_heads * d_k is enforced by the constructor).
+Tensor MultiHeadAttention::split_heads(const Tensor& x) const {
+    // x shape: [batch_size, seq_len, d_model]
+    const size_t batch_size = x.shape()[0];
+    const size_t seq_len = x.shape()[1];
+
+    Tensor heads(std::vector<size_t>{batch_size, num_heads_, seq_len, d_k_});
+
+    for (size_t b = 0; b < batch_size; ++b) {
+        for (size_t h = 0; h < num_heads_; ++h) {
+            for (size_t t = 0; t < seq_len; ++t) {
+                // Start of head h's slice at position (b, t) in x
+                const size_t src_base = (b * seq_len + t) * d_model_ + h * d_k_;
+                // Start of row (b, h, t) in the head-major output
+                const size_t dst_base = ((b * num_heads_ + h) * seq_len + t) * d_k_;
+
+                for (size_t d = 0; d < d_k_; ++d) {
+                    heads(dst_base + d) = x(src_base + d);
+                }
+            }
+        }
+    }
+
+    return heads;
+}
+
+// Inverse of split_heads: rearranges x from
+// [batch_size, num_heads, seq_len, d_k] into [batch_size, seq_len, d_model],
+// placing head h's values in the feature slice [h * d_k, (h + 1) * d_k).
+//
+// The head count and per-head width are taken from x's own shape. Throws
+// std::invalid_argument if they do not multiply to d_model, since the copy
+// would otherwise write past the end of the result.
+//
+// The copy is done in a single pass with flat (row-major) offsets instead of
+// first materialising a transposed intermediate tensor.
+Tensor MultiHeadAttention::combine_heads(const Tensor& x) const {
+    // x shape: [batch_size, num_heads, seq_len, d_k]
+    const size_t batch_size = x.shape()[0];
+    const size_t num_heads = x.shape()[1];
+    const size_t seq_len = x.shape()[2];
+    const size_t d_k = x.shape()[3];
+
+    if (num_heads * d_k != d_model_) {
+        throw std::invalid_argument("combine_heads: num_heads * d_k must equal d_model");
+    }
+
+    Tensor merged(std::vector<size_t>{batch_size, seq_len, d_model_});
+
+    for (size_t b = 0; b < batch_size; ++b) {
+        for (size_t t = 0; t < seq_len; ++t) {
+            for (size_t h = 0; h < num_heads; ++h) {
+                // Start of row (b, h, t) in the head-major input
+                const size_t src_base = ((b * num_heads + h) * seq_len + t) * d_k;
+                // Start of head h's slice at position (b, t) in the output
+                const size_t dst_base = (b * seq_len + t) * d_model_ + h * d_k;
+
+                for (size_t d = 0; d < d_k; ++d) {
+                    merged(dst_base + d) = x(src_base + d);
+                }
+            }
+        }
+    }
+
+    return merged;
+}
+
+// Scaled dot-product attention over already-split heads.
+//
+// q, k, v: [batch_size, num_heads, seq_len, d_k] (dimensions are read from q).
+// mask:    optional; ignored when mask.size() == 0. When present it is
+//          indexed as a flat [batch_size, seq_len, seq_len] tensor shared by
+//          all heads, and a value of 0 blocks the (i, j) pair.
+//          NOTE(review): no shape check is done on mask - confirm callers
+//          always pass that layout.
+//
+// Steps: scores = q * k^T / sqrt(d_k); masked entries are set to -1e9;
+// a row-wise, max-shifted softmax gives the weights; dropout is applied to
+// the weights in training mode; the result is weights * v with shape
+// [batch_size, num_heads, seq_len, d_k].
+Tensor MultiHeadAttention::scaled_dot_product_attention(const Tensor& q, const Tensor& k, 
+                                                       const Tensor& v, const Tensor& mask) const {
+    // q, k, v shapes: [batch_size, num_heads, seq_len, d_k]
+    const size_t batch_size = q.shape()[0];
+    const size_t num_heads = q.shape()[1];
+    const size_t seq_len = q.shape()[2];
+    const size_t d_k = q.shape()[3];
+
+    // Flat row-major offset into a [batch, heads, seq_len, d_k] tensor
+    // (used for q, k, v and the output).
+    auto head_at = [num_heads, seq_len, d_k](size_t b, size_t h, size_t t, size_t d) {
+        return ((b * num_heads + h) * seq_len + t) * d_k + d;
+    };
+    // Flat row-major offset into a [batch, heads, seq_len, seq_len] tensor
+    // (used for the scores and the attention weights).
+    auto pair_at = [num_heads, seq_len](size_t b, size_t h, size_t i, size_t j) {
+        return ((b * num_heads + h) * seq_len + i) * seq_len + j;
+    };
+    // Flat row-major offset into the [batch, seq_len, seq_len] mask.
+    auto mask_at = [seq_len](size_t b, size_t i, size_t j) {
+        return (b * seq_len + i) * seq_len + j;
+    };
+
+    // The scaling factor does not depend on the loop indices.
+    const float scale = std::sqrt(static_cast<float>(d_k));
+
+    // Compute attention scores: q * k^T, scaled
+    Tensor scores(std::vector<size_t>{batch_size, num_heads, seq_len, seq_len});
+
+    for (size_t b = 0; b < batch_size; ++b) {
+        for (size_t h = 0; h < num_heads; ++h) {
+            for (size_t i = 0; i < seq_len; ++i) {
+                for (size_t j = 0; j < seq_len; ++j) {
+                    const size_t s = pair_at(b, h, i, j);
+
+                    scores(s) = 0.0;
+                    for (size_t d = 0; d < d_k; ++d) {
+                        scores(s) += q(head_at(b, h, i, d)) * k(head_at(b, h, j, d));
+                    }
+                    scores(s) /= scale;
+                }
+            }
+        }
+    }
+
+    // Apply mask if provided
+    if (mask.size() > 0) {
+        for (size_t b = 0; b < batch_size; ++b) {
+            for (size_t h = 0; h < num_heads; ++h) {
+                for (size_t i = 0; i < seq_len; ++i) {
+                    for (size_t j = 0; j < seq_len; ++j) {
+                        if (mask(mask_at(b, i, j)) == 0.0) {
+                            scores(pair_at(b, h, i, j)) = -1e9; // Large negative value
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    // Apply softmax to get attention weights
+    Tensor weights(std::vector<size_t>{batch_size, num_heads, seq_len, seq_len});
+
+    for (size_t b = 0; b < batch_size; ++b) {
+        for (size_t h = 0; h < num_heads; ++h) {
+            for (size_t i = 0; i < seq_len; ++i) {
+                // Find max for numerical stability
+                float max_val = -std::numeric_limits<float>::infinity();
+                for (size_t j = 0; j < seq_len; ++j) {
+                    const size_t s = pair_at(b, h, i, j);
+                    if (scores(s) > max_val) {
+                        max_val = scores(s);
+                    }
+                }
+
+                // Compute exponentials and sum
+                float sum = 0.0;
+                for (size_t j = 0; j < seq_len; ++j) {
+                    const size_t s = pair_at(b, h, i, j);
+                    weights(s) = std::exp(scores(s) - max_val);
+                    sum += weights(s);
+                }
+
+                // Normalize
+                for (size_t j = 0; j < seq_len; ++j) {
+                    weights(pair_at(b, h, i, j)) /= sum;
+                }
+            }
+        }
+    }
+
+    // Apply dropout during training
+    if (training_) {
+        weights = apply_dropout(weights, dropout_);
+    }
+
+    // Multiply weights by values
+    Tensor output(std::vector<size_t>{batch_size, num_heads, seq_len, d_k});
+
+    for (size_t b = 0; b < batch_size; ++b) {
+        for (size_t h = 0; h < num_heads; ++h) {
+            for (size_t i = 0; i < seq_len; ++i) {
+                for (size_t d = 0; d < d_k; ++d) {
+                    const size_t o = head_at(b, h, i, d);
+
+                    output(o) = 0.0;
+                    for (size_t j = 0; j < seq_len; ++j) {
+                        output(o) += weights(pair_at(b, h, i, j)) * v(head_at(b, h, j, d));
+                    }
+                }
+            }
+        }
+    }
+
+    return output;
+}
+
+// Inverted dropout: each element is zeroed with probability dropout_rate and
+// the survivors are divided by (1 - dropout_rate).
+//
+// dropout_rate <= 0 returns the input unchanged; dropout_rate >= 1 returns a
+// tensor of the same shape with every element set to zero.
+Tensor MultiHeadAttention::apply_dropout(const Tensor& input, float dropout_rate) const {
+    if (dropout_rate <= 0.0) return input;
+
+    Tensor output = input;
+
+    // With a rate of 1 or more nothing survives. Handling it here also keeps
+    // a negative probability away from std::bernoulli_distribution, whose
+    // behaviour is undefined outside [0, 1].
+    if (dropout_rate >= 1.0) {
+        for (size_t i = 0; i < output.size(); ++i) {
+            output(i) = 0.0;
+        }
+        return output;
+    }
+
+    // Seed one generator per thread on first use instead of querying
+    // std::random_device and rebuilding the engine on every call.
+    thread_local std::mt19937 gen(std::random_device{}());
+
+    const double keep_prob = 1.0 - dropout_rate;
+    std::bernoulli_distribution dist(keep_prob);
+
+    for (size_t i = 0; i < output.size(); ++i) {
+        if (!dist(gen)) {
+            output(i) = 0.0;
+        } else {
+            output(i) /= keep_prob;
+        }
+    }
+
+    return output;
+}
+
+} // namespace lm
--- a/src/models/feed_forward.cpp
+++ b/src/models/feed_forward.cpp
@ -0,0 +1,139 @@
+#include "lm/models/feed_forward.hpp"
+#include <cmath>
+#include <iostream>
+#include <random>
+
+namespace lm {
+
+/**
+ * Builds a two-layer feed-forward block.
+ *
+ * @param d_model  Width of the block's input and output features.
+ * @param d_ff     Width of the hidden layer.
+ * @param dropout  Dropout rate passed to apply_dropout() in training mode.
+ */
+FeedForward::FeedForward(size_t d_model, size_t d_ff, float dropout)
+    : d_model_(d_model), d_ff_(d_ff), dropout_(dropout) {
+
+    // First projection (d_model -> d_ff): Xavier weights, zero bias
+    w1_ = Tensor::xavier(std::vector<size_t>{d_model_, d_ff_});
+    b1_ = Tensor::zeros(std::vector<size_t>{d_ff_});
+
+    // Second projection (d_ff -> d_model): Xavier weights, zero bias
+    w2_ = Tensor::xavier(std::vector<size_t>{d_ff_, d_model_});
+    b2_ = Tensor::zeros(std::vector<size_t>{d_model_});
+
+    // Report the configuration on stdout
+    std::cout << "Initialized FeedForward with:\n"
+              << "  d_model: " << d_model_ << "\n"
+              << "  d_ff: " << d_ff_ << "\n"
+              << "  dropout: " << dropout_ << "\n";
+}
+
+// Returns copies of the block's tensors in the order w1, b1, w2, b2.
+std::vector<Tensor> FeedForward::parameters() const {
+    std::vector<Tensor> params;
+    params.reserve(4);
+    params.push_back(w1_);
+    params.push_back(b1_);
+    params.push_back(w2_);
+    params.push_back(b2_);
+    return params;
+}
+
+// Stores the training flag; forward() applies dropout only while it is true.
+void FeedForward::set_training(bool training) {
+    this->training_ = training;
+}
+
+/**
+ * Runs the feed-forward block: linear -> GELU -> (dropout) -> linear.
+ *
+ * The first two entries of input.shape() are read as batch size and sequence
+ * length. Elements are addressed through flat indices of the form
+ * (b * seq_len + t) * width + feature, where width is d_model for the input
+ * and output and d_ff for the hidden activations.
+ *
+ * @param input  Input tensor; it is indexed as if it held
+ *               batch * seq_len * d_model elements.
+ * @return Tensor constructed with shape {batch, seq_len, d_model}.
+ */
+Tensor FeedForward::forward(const Tensor& input) const {
+    const size_t batch_size = input.shape()[0];
+    const size_t seq_len = input.shape()[1];
+
+    // ---- hidden = input * w1 + b1 ----
+    Tensor hidden(std::vector<size_t>{batch_size, seq_len, d_ff_});
+
+    for (size_t b = 0; b < batch_size; ++b) {
+        for (size_t t = 0; t < seq_len; ++t) {
+            // Offsets shared by every feature of position (b, t)
+            const size_t position = b * seq_len + t;
+            const size_t input_base = position * d_model_;
+            const size_t hidden_base = position * d_ff_;
+
+            for (size_t f = 0; f < d_ff_; ++f) {
+                const size_t hidden_at = hidden_base + f;
+
+                // Start from the bias, then accumulate the dot product
+                hidden(hidden_at) = b1_(f);
+                for (size_t d = 0; d < d_model_; ++d) {
+                    hidden(hidden_at) += input(input_base + d) * w1_(d, f);
+                }
+            }
+        }
+    }
+
+    // ---- non-linearity, then dropout when in training mode ----
+    hidden = gelu(hidden);
+    if (training_) {
+        hidden = apply_dropout(hidden, dropout_);
+    }
+
+    // ---- output = hidden * w2 + b2 ----
+    Tensor output(std::vector<size_t>{batch_size, seq_len, d_model_});
+
+    for (size_t b = 0; b < batch_size; ++b) {
+        for (size_t t = 0; t < seq_len; ++t) {
+            const size_t position = b * seq_len + t;
+            const size_t output_base = position * d_model_;
+            const size_t hidden_base = position * d_ff_;
+
+            for (size_t d = 0; d < d_model_; ++d) {
+                const size_t output_at = output_base + d;
+
+                output(output_at) = b2_(d);
+                for (size_t f = 0; f < d_ff_; ++f) {
+                    output(output_at) += hidden(hidden_base + f) * w2_(f, d);
+                }
+            }
+        }
+    }
+
+    return output;
+}
+
+/**
+ * Element-wise GELU using the tanh approximation:
+ *   0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
+ *
+ * @param input  Tensor of pre-activation values.
+ * @return A new tensor with the same shape holding the activated values.
+ */
+Tensor FeedForward::gelu(const Tensor& input) const {
+    // M_PI is a POSIX extension rather than standard C++ (MSVC only defines it
+    // when _USE_MATH_DEFINES is set), so the constant is spelled out here.
+    constexpr double kPi = 3.14159265358979323846;
+    const float sqrt_2_over_pi = static_cast<float>(std::sqrt(2.0 / kPi));
+    Tensor result(input.shape());
+
+    for (size_t i = 0; i < input.size(); ++i) {
+        float x = input(i);
+        float x_cubed = x * x * x;
+        result(i) = 0.5f * x * (1.0f + std::tanh(sqrt_2_over_pi * (x + 0.044715f * x_cubed)));
+    }
+
+    return result;
+}
+
+/**
+ * Applies inverted dropout to a copy of `input`.
+ *
+ * Each element is kept with probability (1 - dropout_rate) and divided by
+ * that probability; every other element is set to zero. When
+ * dropout_rate <= 0 the input is returned as-is.
+ *
+ * @param input         Tensor to mask (not modified).
+ * @param dropout_rate  Probability of zeroing each element.
+ * @return The masked and rescaled copy.
+ */
+Tensor FeedForward::apply_dropout(const Tensor& input, float dropout_rate) const {
+    if (dropout_rate <= 0.0) return input;
+
+    Tensor output = input;
+    const double keep_prob = 1.0 - dropout_rate;
+
+    // std::bernoulli_distribution requires 0 <= p <= 1. A rate of 1 or more
+    // (or NaN) leaves no valid keep probability, so drop everything without
+    // sampling instead of invoking undefined behaviour.
+    if (!(keep_prob > 0.0)) {
+        for (size_t i = 0; i < output.size(); ++i) {
+            output(i) = 0.0;
+        }
+        return output;
+    }
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::bernoulli_distribution dist(keep_prob);
+
+    for (size_t i = 0; i < output.size(); ++i) {
+        if (dist(gen)) {
+            // Survivors are scaled up so the expected value is unchanged
+            output(i) /= keep_prob;
+        } else {
+            output(i) = 0.0;
+        }
+    }
+
+    return output;
+}
+
+} // namespace lm
--- a/src/models/language_model.cpp
+++ b/src/models/language_model.cpp
@ -0,0 +1,188 @@
+// lm/models/language_model.cpp
+#include "lm/models/language_model.hpp"
+#include "lm/optimizers/adam.hpp"
+#include <random>
+
+namespace lm {
+
+/**
+ * Creates the embedding, LSTM and output-projection tensors.
+ *
+ * @param vocab_size     Number of rows in the embedding and output weights.
+ * @param embedding_dim  Width of each embedding row.
+ * @param hidden_dim     Width of the LSTM hidden state.
+ * @param num_layers     Number of LSTM layers iterated by forward().
+ *
+ * The model starts in training mode (is_training_ is initialized to true).
+ * The trailing `true` passed to each Tensor factory is presumably the
+ * requires-grad flag -- confirm against the Tensor API.
+ */
+LanguageModel::LanguageModel(size_t vocab_size, size_t embedding_dim, 
+                           size_t hidden_dim, size_t num_layers)
+    : vocab_size_(vocab_size), embedding_dim_(embedding_dim),
+      hidden_dim_(hidden_dim), num_layers_(num_layers), is_training_(true) {
+
+    // forward() slices four hidden_dim-wide gates out of one stacked result
+    const size_t stacked_gate_rows = 4 * hidden_dim;
+
+    // Token embedding table
+    embedding_weight_ = Tensor::xavier({vocab_size, embedding_dim}, true);
+
+    // LSTM input-to-hidden and hidden-to-hidden weights with zero biases
+    lstm_weight_ih_ = Tensor::xavier({stacked_gate_rows, embedding_dim}, true);
+    lstm_weight_hh_ = Tensor::xavier({stacked_gate_rows, hidden_dim}, true);
+    lstm_bias_ih_ = Tensor::zeros({stacked_gate_rows}, true);
+    lstm_bias_hh_ = Tensor::zeros({stacked_gate_rows}, true);
+
+    // Projection from the hidden state to vocabulary logits
+    output_weight_ = Tensor::xavier({vocab_size, hidden_dim}, true);
+    output_bias_ = Tensor::zeros({vocab_size}, true);
+}
+
+/**
+ * Runs the embedding -> LSTM -> output projection pipeline.
+ *
+ * @param input  Token ids; shape()[0] is read as the sequence length and
+ *               shape()[1] as the batch size.
+ * @return Logits produced by projecting the per-step LSTM outputs with
+ *         output_weight_ and output_bias_.
+ *
+ * NOTE(review): all layers share lstm_weight_ih_, whose second dimension is
+ * embedding_dim, while layers after the first receive a hidden_dim-wide
+ * input -- this looks like it only works when embedding_dim == hidden_dim
+ * or num_layers == 1; confirm.
+ * NOTE(review): if seq_len is 0 the loop never assigns `output`, so the final
+ * matmul runs on a default-constructed tensor -- confirm that is acceptable.
+ */
+Tensor LanguageModel::forward(const Tensor& input) {
+    // Input shape: [sequence_length, batch_size]
+    size_t seq_len = input.shape()[0];
+    size_t batch_size = input.shape()[1];
+
+    // Embedding layer
+    Tensor embedded = embedding_weight_.index_select(input);  // [seq_len, batch_size, embedding_dim]
+
+    // LSTM state (simplified implementation).
+    // The member hidden_dim_ is used throughout: the constructor parameter
+    // `hidden_dim` is not in scope in this function.
+    Tensor hidden = Tensor::zeros({num_layers_, batch_size, hidden_dim_});
+    Tensor cell = Tensor::zeros({num_layers_, batch_size, hidden_dim_});
+
+    Tensor output;
+    for (size_t t = 0; t < seq_len; ++t) {
+        // Get current time step
+        Tensor x_t = embedded.slice(t, 1, 0);  // [batch_size, embedding_dim]
+
+        // LSTM computation (simplified)
+        for (size_t layer = 0; layer < num_layers_; ++layer) {
+            Tensor h_prev = hidden.slice(layer, 1, 0);
+            Tensor c_prev = cell.slice(layer, 1, 0);
+
+            // Gates computation
+            Tensor gates = x_t.matmul(lstm_weight_ih_.transpose()) + 
+                          h_prev.matmul(lstm_weight_hh_.transpose()) +
+                          lstm_bias_ih_ + lstm_bias_hh_;
+
+            // Split the stacked result into the four gates
+            Tensor i = gates.slice(0, hidden_dim_, 1).sigmoid();
+            Tensor f = gates.slice(hidden_dim_, hidden_dim_, 1).sigmoid();
+            Tensor g = gates.slice(2 * hidden_dim_, hidden_dim_, 1).tanh();
+            Tensor o = gates.slice(3 * hidden_dim_, hidden_dim_, 1).sigmoid();
+
+            // Update cell state
+            Tensor c_next = f * c_prev + i * g;
+
+            // Update hidden state
+            Tensor h_next = o * c_next.tanh();
+
+            // Store states.
+            // NOTE(review): this only persists the state if slice() returns a
+            // writable view; if it returns a copy these assignments are lost
+            // -- confirm against the Tensor API.
+            hidden.slice(layer, 1, 0) = h_next;
+            cell.slice(layer, 1, 0) = c_next;
+
+            x_t = h_next;  // Output of this layer is input to next layer
+        }
+
+        // Store output for this time step
+        if (t == 0) {
+            output = x_t.unsqueeze(0);  // Add sequence dimension
+        } else {
+            output = output.concatenate(x_t.unsqueeze(0), 0);
+        }
+    }
+
+    // Output layer
+    Tensor logits = output.matmul(output_weight_.transpose()) + output_bias_;
+    return logits;
+}
+
+// Returns copies of the model tensors in a fixed order: embedding, the four
+// LSTM tensors (weight_ih, weight_hh, bias_ih, bias_hh), then output weight
+// and output bias.
+std::vector<Tensor> LanguageModel::parameters() const {
+    std::vector<Tensor> params;
+    params.reserve(7);
+
+    params.push_back(embedding_weight_);
+
+    params.push_back(lstm_weight_ih_);
+    params.push_back(lstm_weight_hh_);
+    params.push_back(lstm_bias_ih_);
+    params.push_back(lstm_bias_hh_);
+
+    params.push_back(output_weight_);
+    params.push_back(output_bias_);
+
+    return params;
+}
+
+// Marks the model as being in training mode.
+void LanguageModel::train() {
+    this->is_training_ = true;
+}
+
+// Marks the model as being in evaluation mode.
+void LanguageModel::eval() {
+    this->is_training_ = false;
+}
+
+/**
+ * Writes the model's named parameters to a binary file.
+ *
+ * Layout: 4-byte magic "LMOD", uint32 version (1), uint32 parameter count,
+ * then for each parameter its name (Tensor::write_string) followed by the
+ * tensor (Tensor::serialize). Integers are written in host byte order.
+ *
+ * @param path  Destination file path.
+ * @throws std::runtime_error if the file cannot be opened or a write fails.
+ */
+void LanguageModel::save(const std::string& path) const {
+    std::ofstream file(path, std::ios::binary);
+    if (!file) {
+        throw std::runtime_error("Cannot open file for writing: " + path);
+    }
+
+    // Write header
+    const char magic[] = "LMOD";
+    file.write(magic, 4);
+
+    uint32_t version = 1;
+    file.write(reinterpret_cast<const char*>(&version), sizeof(version));
+
+    // Get named parameters
+    auto params = named_parameters();
+    uint32_t num_params = static_cast<uint32_t>(params.size());
+    file.write(reinterpret_cast<const char*>(&num_params), sizeof(num_params));
+
+    // Write each parameter
+    for (const auto& [name, tensor] : params) {
+        Tensor::write_string(file, name);
+        tensor.serialize(file);
+    }
+
+    // Report I/O failures (e.g. a full disk) instead of silently leaving a
+    // truncated file behind.
+    file.flush();
+    if (!file) {
+        throw std::runtime_error("Failed to write model file: " + path);
+    }
+}
+
+/**
+ * Reads parameters from a file produced by save().
+ *
+ * Expects the 4-byte magic "LMOD", a uint32 version equal to 1, a uint32
+ * parameter count, then that many (name, tensor) records. Each record is
+ * applied through set_parameter(), which throws for unknown names.
+ *
+ * @param path  Source file path.
+ * @throws std::runtime_error if the file cannot be opened, is truncated,
+ *         has the wrong magic or version, or names an unknown parameter.
+ */
+void LanguageModel::load(const std::string& path) {
+    std::ifstream file(path, std::ios::binary);
+    if (!file) {
+        throw std::runtime_error("Cannot open file for reading: " + path);
+    }
+
+    // Read and verify header. The stream state is checked after every read so
+    // a short file is reported instead of being parsed from uninitialized bytes.
+    char magic[4] = {0, 0, 0, 0};
+    file.read(magic, 4);
+    if (!file || std::string(magic, 4) != "LMOD") {
+        throw std::runtime_error("Invalid model file format");
+    }
+
+    uint32_t version = 0;
+    file.read(reinterpret_cast<char*>(&version), sizeof(version));
+    if (!file) {
+        throw std::runtime_error("Truncated model file: " + path);
+    }
+    if (version != 1) {
+        throw std::runtime_error("Unsupported model version: " + std::to_string(version));
+    }
+
+    // Read number of parameters
+    uint32_t num_params = 0;
+    file.read(reinterpret_cast<char*>(&num_params), sizeof(num_params));
+    if (!file) {
+        throw std::runtime_error("Truncated model file: " + path);
+    }
+
+    // Read each parameter
+    for (uint32_t i = 0; i < num_params; ++i) {
+        std::string name = Tensor::read_string(file);
+        Tensor tensor;
+        tensor.deserialize(file);
+
+        // Do not install a tensor that was only partially read
+        if (!file) {
+            throw std::runtime_error("Truncated model file: " + path);
+        }
+
+        // Set the parameter
+        set_parameter(name, tensor);
+    }
+}
+
+// NOTE: LanguageModel::parameters() is defined once, earlier in this file,
+// where it returns the model tensors in a fixed order. A second definition
+// that copied the values of `parameters_` used to live here; it was removed
+// because defining the same member function twice in one translation unit
+// does not compile, and because iterating an unordered_map yields an
+// unspecified order, which is unsuitable for optimizers that track
+// per-parameter state by index.
+
+// Returns a copy of the name -> tensor map held in parameters_.
+std::unordered_map<std::string, Tensor> LanguageModel::named_parameters() const {
+    std::unordered_map<std::string, Tensor> snapshot = parameters_;
+    return snapshot;
+}
+
+/**
+ * Overwrites an existing entry of parameters_.
+ *
+ * @param name   Key that must already be present in parameters_.
+ * @param param  Tensor assigned to that entry.
+ * @throws std::runtime_error if `name` is not a known parameter.
+ */
+void LanguageModel::set_parameter(const std::string& name, const Tensor& param) {
+    const auto slot = parameters_.find(name);
+
+    // Guard clause: new names are rejected, never inserted
+    if (slot == parameters_.end()) {
+        throw std::runtime_error("Unknown parameter: " + name);
+    }
+
+    slot->second = param;
+}
+
+} // namespace lm
--- a/src/models/layer_norm.cpp
+++ b/src/models/layer_norm.cpp
@ -0,0 +1,83 @@
+#include "lm/models/layer_norm.hpp"
+#include <cmath>
+#include <iostream>
+
+namespace lm {
+
+/**
+ * Creates a layer-normalization module over d_model features.
+ *
+ * @param d_model  Number of features normalized per position.
+ * @param eps      Constant added to the variance before the square root.
+ */
+LayerNorm::LayerNorm(size_t d_model, float eps)
+    : d_model_(d_model), eps_(eps) {
+
+    // Scale starts at one and shift at zero
+    gamma_ = Tensor::ones(std::vector<size_t>{d_model_});
+    beta_ = Tensor::zeros(std::vector<size_t>{d_model_});
+
+    // Report the configuration on stdout
+    std::cout << "Initialized LayerNorm with:\n"
+              << "  d_model: " << d_model_ << "\n"
+              << "  eps: " << eps_ << "\n";
+}
+
+// Returns copies of the scale (gamma) and shift (beta) tensors, in that order.
+std::vector<Tensor> LayerNorm::parameters() const {
+    std::vector<Tensor> params;
+    params.reserve(2);
+    params.push_back(gamma_);
+    params.push_back(beta_);
+    return params;
+}
+
+// Intentional no-op: the body is empty and the parameter is commented out,
+// so calling this changes no state.
+void LayerNorm::set_training(/*bool training*/) {
+    // LayerNorm doesn't have different behavior during training vs evaluation
+    // This method is here for interface consistency
+}
+
+/**
+ * Normalizes each run of d_model consecutive features to zero mean and unit
+ * variance, then applies the per-feature scale (gamma) and shift (beta).
+ *
+ * The first two entries of input.shape() are read as batch size and sequence
+ * length; features of position (b, t) are addressed through the flat index
+ * (b * seq_len + t) * d_model + d. The variance is the population variance
+ * (sum of squared deviations divided by d_model).
+ *
+ * @param input  Tensor to normalize.
+ * @return A tensor constructed with the same shape as `input`.
+ */
+Tensor LayerNorm::forward(const Tensor& input) const {
+    const size_t batch_size = input.shape()[0];
+    const size_t seq_len = input.shape()[1];
+
+    Tensor output(input.shape());
+
+    for (size_t b = 0; b < batch_size; ++b) {
+        for (size_t t = 0; t < seq_len; ++t) {
+            // Flat offset of the first feature at position (b, t)
+            const size_t row_start = (b * seq_len + t) * d_model_;
+
+            // Pass 1: mean of the row
+            float mean = 0.0f;
+            for (size_t d = 0; d < d_model_; ++d) {
+                mean += input(row_start + d);
+            }
+            mean /= d_model_;
+
+            // Pass 2: population variance of the row
+            float variance = 0.0f;
+            for (size_t d = 0; d < d_model_; ++d) {
+                float diff = input(row_start + d) - mean;
+                variance += diff * diff;
+            }
+            variance /= d_model_;
+
+            // Pass 3: normalize, scale and shift. The denominator is the same
+            // for every feature of the row, so it is computed once.
+            const auto std_dev = std::sqrt(variance + eps_);
+            for (size_t d = 0; d < d_model_; ++d) {
+                const size_t at = row_start + d;
+                float normalized = (input(at) - mean) / std_dev;
+                output(at) = gamma_(d) * normalized + beta_(d);
+            }
+        }
+    }
+
+    return output;
+}
+
+} // namespace lm
+
--- a/src/models/transformer.cpp
+++ b/src/models/transformer.cpp
@ -0,0 +1,162 @@
+#include "lm/models/transformer.hpp"
+#include <cmath>
+#include <iostream>
+#include <random>
+#include <stdexcept>
+
+namespace lm {
+
+/**
+ * Builds the model: token embedding, sinusoidal positional table, a stack of
+ * transformer blocks and the output projection.
+ *
+ * @param vocab_size   Number of rows in the embedding table / output columns.
+ * @param d_model      Feature width used throughout the model.
+ * @param num_heads    Attention heads per block.
+ * @param d_ff         Hidden width of each block's feed-forward network.
+ * @param num_layers   Number of transformer blocks.
+ * @param max_seq_len  Number of positions in the positional table.
+ * @param dropout      Dropout rate used by the model and passed to each block.
+ *
+ * The model starts with training_ == false.
+ */
+Transformer::Transformer(size_t vocab_size, size_t d_model, size_t num_heads, 
+    size_t d_ff, size_t num_layers, size_t max_seq_len, float dropout)
+    : vocab_size_(vocab_size), d_model_(d_model), num_heads_(num_heads),
+    d_ff_(d_ff), num_layers_(num_layers), max_seq_len_(max_seq_len), 
+    dropout_(dropout), training_(false) {
+
+    // Initialize embedding layer
+    embedding_ = Tensor::randn({vocab_size_, d_model_}, 0.0, 0.02);
+    embedding_.requires_grad(true);
+
+    // Initialize positional encoding - use explicit vector
+    positional_encoding_ = Tensor(std::vector<size_t>{max_seq_len_, d_model_});
+    for (size_t pos = 0; pos < max_seq_len_; ++pos) {
+        for (size_t i = 0; i < d_model_; ++i) {
+            // Dimensions 2k (sin) and 2k+1 (cos) share one frequency,
+            // 1 / 10000^(2k / d_model). `i` is already the dimension index,
+            // so the exponent is the even member of the pair divided by
+            // d_model; multiplying it by two again would double the exponent.
+            const size_t pair_dim = i - (i % 2);
+            const double angle =
+                pos / std::pow(10000.0, static_cast<double>(pair_dim) / d_model_);
+
+            if (i % 2 == 0) {
+                positional_encoding_(pos, i) = std::sin(angle);
+            } else {
+                positional_encoding_(pos, i) = std::cos(angle);
+            }
+        }
+    }
+    positional_encoding_.requires_grad(true);
+
+    // Initialize transformer blocks
+    for (size_t i = 0; i < num_layers_; ++i) {
+        transformer_blocks_.push_back(std::make_unique<TransformerBlock>(d_model_, num_heads_, d_ff_, dropout_));
+    }
+
+    // Initialize output layer
+    output_layer_ = Tensor::randn({d_model_, vocab_size_}, 0.0, 0.02);
+    output_layer_.requires_grad(true);
+
+    std::cout << "Initialized Transformer with:\n";
+    std::cout << "  vocab_size: " << vocab_size_ << "\n";
+    std::cout << "  d_model: " << d_model_ << "\n";
+    std::cout << "  num_heads: " << num_heads_ << "\n";
+    std::cout << "  d_ff: " << d_ff_ << "\n";
+    std::cout << "  num_layers: " << num_layers_ << "\n";
+    std::cout << "  max_seq_len: " << max_seq_len_ << "\n";
+    std::cout << "  dropout: " << dropout_ << "\n";
+}
+
+// Collects copies of every tensor in the model, in order: embedding,
+// positional encoding, each block's parameters (in block order), and the
+// output layer.
+std::vector<Tensor> Transformer::parameters() const {
+    std::vector<Tensor> params;
+
+    params.push_back(embedding_);
+    params.push_back(positional_encoding_);
+
+    for (size_t layer = 0; layer < transformer_blocks_.size(); ++layer) {
+        for (const Tensor& tensor : transformer_blocks_[layer]->parameters()) {
+            params.push_back(tensor);
+        }
+    }
+
+    params.push_back(output_layer_);
+
+    return params;
+}
+
+// Stores the training flag, forwards it to every transformer block, and logs
+// the new mode to stdout.
+void Transformer::set_training(bool training) {
+    training_ = training;
+
+    for (size_t layer = 0; layer < transformer_blocks_.size(); ++layer) {
+        transformer_blocks_[layer]->set_training(training);
+    }
+
+    const char* mode_label = training ? "true" : "false";
+    std::cout << "Set training mode to: " << mode_label << "\n";
+}
+
+/**
+ * Full forward pass: embedding lookup + positional encoding, optional
+ * dropout, the transformer blocks, then the output projection.
+ *
+ * @param input  Token ids indexed as input(b, t); shape()[0] is the batch
+ *               size and shape()[1] the sequence length.
+ * @param mask   Attention mask passed unchanged to every block.
+ * @return Logits constructed with shape {batch, seq_len, vocab_size}.
+ * @throws std::invalid_argument if the sequence is longer than max_seq_len,
+ *         since the positional table has no rows beyond that.
+ *
+ * Token ids outside [0, vocab_size) receive an all-zero embedding.
+ */
+Tensor Transformer::forward(const Tensor& input, const Tensor& mask) {
+    // Get input dimensions
+    size_t batch_size = input.shape()[0];
+    size_t seq_len = input.shape()[1];
+
+    // positional_encoding_ only has max_seq_len_ rows; reading past them
+    // would be out of bounds.
+    if (seq_len > max_seq_len_) {
+        throw std::invalid_argument(
+            "Transformer::forward: sequence length exceeds max_seq_len");
+    }
+
+    // Convert token IDs to embeddings - use explicit vector
+    Tensor embeddings(std::vector<size_t>{batch_size, seq_len, d_model_});
+
+    for (size_t b = 0; b < batch_size; ++b) {
+        for (size_t t = 0; t < seq_len; ++t) {
+            // Range-check in floating point first: converting a negative,
+            // NaN or huge value to size_t is undefined. Values in (-1, 0)
+            // still truncate to token 0.
+            const double raw_id = static_cast<double>(input(b, t));
+            const bool in_vocab =
+                raw_id > -1.0 && raw_id < static_cast<double>(vocab_size_);
+
+            if (in_vocab) {
+                const size_t token_id = static_cast<size_t>(raw_id);
+                for (size_t d = 0; d < d_model_; ++d) {
+                    embeddings(b, t, d) = embedding_(token_id, d);
+                }
+            } else {
+                // Give unknown tokens a defined (zero) embedding so the
+                // positional term below is not added to unassigned storage.
+                for (size_t d = 0; d < d_model_; ++d) {
+                    embeddings(b, t, d) = 0.0;
+                }
+            }
+        }
+    }
+
+    // Add positional encoding
+    for (size_t b = 0; b < batch_size; ++b) {
+        for (size_t t = 0; t < seq_len; ++t) {
+            for (size_t d = 0; d < d_model_; ++d) {
+                embeddings(b, t, d) += positional_encoding_(t, d);
+            }
+        }
+    }
+
+    // Apply dropout during training
+    if (training_) {
+        embeddings = apply_dropout(embeddings, dropout_);
+    }
+
+    // Pass through transformer blocks
+    Tensor hidden_states = embeddings;
+    for (auto& block : transformer_blocks_) {
+        hidden_states = block->forward(hidden_states, mask);
+    }
+
+    // Apply output layer - use explicit vector
+    Tensor logits(std::vector<size_t>{batch_size, seq_len, vocab_size_});
+    for (size_t b = 0; b < batch_size; ++b) {
+        for (size_t t = 0; t < seq_len; ++t) {
+            for (size_t v = 0; v < vocab_size_; ++v) {
+                logits(b, t, v) = 0.0;
+                for (size_t d = 0; d < d_model_; ++d) {
+                    logits(b, t, v) += hidden_states(b, t, d) * output_layer_(d, v);
+                }
+            }
+        }
+    }
+
+    return logits;
+}
+
+// Convenience overload: runs the masked forward pass with a
+// default-constructed (empty) mask tensor.
+Tensor Transformer::forward(const Tensor& input) {
+    const Tensor no_mask;
+    return forward(input, no_mask);
+}
+
+/**
+ * Applies inverted dropout to a copy of `input`.
+ *
+ * Each element is kept with probability (1 - dropout_rate) and divided by
+ * that probability; every other element is set to zero. When
+ * dropout_rate <= 0 the input is returned as-is.
+ *
+ * @param input         Tensor to mask (not modified).
+ * @param dropout_rate  Probability of zeroing each element.
+ * @return The masked and rescaled copy.
+ */
+Tensor Transformer::apply_dropout(const Tensor& input, float dropout_rate) {
+    if (dropout_rate <= 0.0) return input;
+
+    Tensor output = input;
+    const double keep_prob = 1.0 - dropout_rate;
+
+    // std::bernoulli_distribution requires 0 <= p <= 1. A rate of 1 or more
+    // (or NaN) leaves no valid keep probability, so drop everything without
+    // sampling instead of invoking undefined behaviour.
+    if (!(keep_prob > 0.0)) {
+        for (size_t i = 0; i < output.size(); ++i) {
+            output(i) = 0.0;
+        }
+        return output;
+    }
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::bernoulli_distribution dist(keep_prob);
+
+    for (size_t i = 0; i < output.size(); ++i) {
+        if (dist(gen)) {
+            // Survivors are scaled up so the expected value is unchanged
+            output(i) /= keep_prob;
+        } else {
+            output(i) = 0.0;
+        }
+    }
+
+    return output;
+}
+
+} // namespace lm
--- a/src/models/transformer_block.cpp
+++ b/src/models/transformer_block.cpp
@ -0,0 +1,65 @@
+#include "lm/models/transformer_block.hpp"
+#include <iostream>
+
+namespace lm {
+
+/**
+ * Builds one transformer block: multi-head attention, a feed-forward
+ * network, and two layer-normalization modules.
+ *
+ * @param d_model    Feature width shared by all sub-modules.
+ * @param num_heads  Number of attention heads.
+ * @param d_ff       Hidden width of the feed-forward network.
+ * @param dropout    Dropout rate passed to attention and feed-forward.
+ */
+TransformerBlock::TransformerBlock(size_t d_model, size_t num_heads, size_t d_ff, float dropout)
+    : d_model_(d_model), num_heads_(num_heads), d_ff_(d_ff), dropout_(dropout) {
+
+    // Sub-modules are created in the order they are used by forward()
+    attention_ = std::make_unique<MultiHeadAttention>(d_model, num_heads, dropout);
+    feed_forward_ = std::make_unique<FeedForward>(d_model, d_ff, dropout);
+
+    // One normalization per residual branch
+    norm1_ = std::make_unique<LayerNorm>(d_model);
+    norm2_ = std::make_unique<LayerNorm>(d_model);
+
+    // Report the configuration on stdout
+    std::cout << "Initialized TransformerBlock with:\n"
+              << "  d_model: " << d_model_ << "\n"
+              << "  num_heads: " << num_heads_ << "\n"
+              << "  d_ff: " << d_ff_ << "\n"
+              << "  dropout: " << dropout_ << "\n";
+}
+
+// Concatenates the parameters of the sub-modules in the order: attention,
+// feed-forward, first layer norm, second layer norm.
+std::vector<Tensor> TransformerBlock::parameters() const {
+    std::vector<Tensor> params;
+
+    // Appends one sub-module's parameter list to the result
+    const auto append = [&params](const std::vector<Tensor>& group) {
+        params.insert(params.end(), group.begin(), group.end());
+    };
+
+    append(attention_->parameters());
+    append(feed_forward_->parameters());
+    append(norm1_->parameters());
+    append(norm2_->parameters());
+
+    return params;
+}
+
+// Stores the training flag and forwards it to the attention and feed-forward
+// sub-modules (the layer norms are not notified).
+void TransformerBlock::set_training(bool training) {
+    this->training_ = training;
+
+    attention_->set_training(training);
+    feed_forward_->set_training(training);
+}
+
+/**
+ * Applies the block with post-normalization residual connections:
+ *   x1  = norm1(input + attention(input, input, input, mask))
+ *   out = norm2(x1 + feed_forward(x1))
+ *
+ * @param input  Block input, used as query, key and value for self-attention.
+ * @param mask   Attention mask passed to the attention module.
+ * @return The output of the second layer normalization.
+ */
+Tensor TransformerBlock::forward(const Tensor& input, const Tensor& mask) const {
+    // Sub-layer 1: self-attention, residual add, normalize
+    Tensor attended = attention_->forward(input, input, input, mask);
+    Tensor after_attention = norm1_->forward(input + attended);
+
+    // Sub-layer 2: feed-forward, residual add, normalize
+    Tensor transformed = feed_forward_->forward(after_attention);
+    return norm2_->forward(after_attention + transformed);
+}
+
+} // namespace lm
--- a/src/optimizers/CMakeLists.txt
+++ b/src/optimizers/CMakeLists.txt
@ -0,0 +1,15 @@
+# Optimizer library; currently built from adam.cpp only.
+add_library(lm_optimizers adam.cpp)
+
+# Project headers and the Eigen location are PUBLIC, so targets that link
+# lm_optimizers inherit both include paths.
+target_include_directories(lm_optimizers PUBLIC
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../include
+    ${EIGEN_LOC}
+)
+
+# The core components are part of the public interface as well.
+target_link_libraries(lm_optimizers PUBLIC lm_core_components)
+
--- a/src/optimizers/adam.cpp
+++ b/src/optimizers/adam.cpp
@ -0,0 +1,56 @@
+#include "lm/optimizers/adam.hpp"
+#include <cmath>
+
+namespace lm {
+
+/**
+ * Stores the Adam hyper-parameters and resets the step counter to zero.
+ *
+ * @param learning_rate  Step size applied to each update.
+ * @param beta1          Decay factor of the first-moment estimate.
+ * @param beta2          Decay factor of the second-moment estimate.
+ * @param epsilon        Constant added to the denominator in step().
+ */
+AdamOptimizer::AdamOptimizer(float learning_rate, float beta1, float beta2, float epsilon)
+    : learning_rate_(learning_rate),
+      beta1_(beta1),
+      beta2_(beta2),
+      epsilon_(epsilon),
+      timestep_(0) {
+}
+
+// Clears the gradient of every parameter that has requires_grad() set;
+// parameters without it are left untouched.
+void AdamOptimizer::zero_grad(std::vector<Tensor>& parameters) {
+    for (Tensor& param : parameters) {
+        if (!param.requires_grad()) {
+            continue;
+        }
+        param.zero_grad();
+    }
+}
+
+/**
+ * Performs one Adam update on every parameter that requires gradients.
+ *
+ * For each such parameter p with gradient g:
+ *   m = beta1 * m + (1 - beta1) * g
+ *   v = beta2 * v + (1 - beta2) * g^2
+ *   p -= learning_rate * (m / (1 - beta1^t)) / (sqrt(v / (1 - beta2^t)) + epsilon)
+ *
+ * Moment estimates are stored per position in `parameters`, so callers must
+ * pass the parameters in the same order on every call.
+ *
+ * @param parameters  Parameters to update in place.
+ */
+void AdamOptimizer::step(std::vector<Tensor>& parameters) {
+    timestep_++;
+
+    // The bias corrections depend only on the step count, not on the parameter
+    const float bias_correction1 = 1 - std::pow(beta1_, timestep_);
+    const float bias_correction2 = 1 - std::pow(beta2_, timestep_);
+
+    for (size_t i = 0; i < parameters.size(); i++) {
+        // Allocate a moment slot for every parameter, frozen ones included,
+        // before the requires_grad check. Otherwise a frozen parameter that
+        // precedes a trainable one leaves m_/v_ shorter than i and the
+        // accesses below run out of bounds.
+        if (m_.size() <= i) {
+            m_.push_back(Tensor::zeros(parameters[i].shape()));
+            v_.push_back(Tensor::zeros(parameters[i].shape()));
+        }
+
+        if (!parameters[i].requires_grad()) continue;
+
+        // Convert gradient to Tensor for consistent operations
+        Tensor grad_tensor(parameters[i].grad(), parameters[i].shape());
+
+        // Update biased first moment estimate using Tensor operations
+        m_[i] = m_[i] * beta1_ + grad_tensor * (1 - beta1_);
+
+        // Update biased second raw moment estimate using Tensor operations
+        Tensor grad_squared = grad_tensor * grad_tensor;
+        v_[i] = v_[i] * beta2_ + grad_squared * (1 - beta2_);
+
+        // Compute bias-corrected moment estimates
+        Tensor m_hat = m_[i] / bias_correction1;
+        Tensor v_hat = v_[i] / bias_correction2;
+
+        // Update parameters using Tensor operations
+        Tensor update = m_hat / (v_hat.sqrt() +
+            Tensor(Eigen::MatrixXf::Constant(v_hat.data().rows(), v_hat.data().cols(), epsilon_),
+            v_hat.shape()));
+        parameters[i].data() -= learning_rate_ * update.data();
+    }
+}
+
+} // namespace lm
+
--- a/src/runtime/CMakeLists.txt
+++ b/src/runtime/CMakeLists.txt
@ -4,15 +4,19 @@ cmake_minimum_required(VERSION 3.6)
+# Runtime library, built from init.cpp, shutdown.cpp and state_utils.cpp.
 add_library(lm_runtime
    init.cpp
    shutdown.cpp
-    state_utils.cpp  # Add this line
+    state_utils.cpp
 )

+# Both the project include directory and EIGEN_LOC follow the PUBLIC keyword,
+# so they are used to build lm_runtime and are inherited by its consumers.
 target_include_directories(lm_runtime
    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
+    ${EIGEN_LOC}
 )

+# nlohmann_json is linked PRIVATE (not propagated to consumers);
+# lm_core_components and lm_training are PUBLIC and therefore propagate.
 target_link_libraries(lm_runtime
    PRIVATE nlohmann_json::nlohmann_json
+    PUBLIC
+        lm_core_components
+        lm_training  # NEW: Add training dependency
 )


--- a/src/test_transformer.cpp
+++ b/src/test_transformer.cpp
@ -0,0 +1,8 @@
+// test_transformer.cpp
+#include "lm/models/transformer.hpp"
+
+// Smoke test: constructs a Transformer and exits. The constructor arguments
+// are, in order: vocab_size, d_model, num_heads, d_ff, num_layers, max_seq_len.
+int main() {
+    const size_t vocab_size = 1000;
+    const size_t d_model = 512;
+    const size_t num_heads = 8;
+    const size_t d_ff = 2048;
+    const size_t num_layers = 6;
+    const size_t max_seq_len = 512;
+
+    lm::Transformer transformer(vocab_size, d_model, num_heads, d_ff, num_layers, max_seq_len);
+    return 0;
+}
+
--- a/src/tokenizer/bpe_tokenizer.cpp
+++ b/src/tokenizer/bpe_tokenizer.cpp
@ -468,80 +468,10 @@ void BPETokenizer::train(const std::vector<std::string>& corpus, size_t vocab_si
    
    pimpl_->count_word_frequencies(words, word_counts);
    
-    // Track token frequencies for pruning
-    std::unordered_map<TokenID, size_t> token_frequencies;
-    
-    // Initialize token frequencies
-    for (const auto& [word, count] : word_counts) {
-        auto tokens = pimpl_->word_to_token_ids(word);
-        for (TokenID token : tokens) {
-            token_frequencies[token] += count;
-        }
-    }
-    
    // BPE training algorithm with safety limit
    int iteration = 0;
    int max_iterations = 10000;
    
-    // Pruning function - remove infrequent tokens
-    auto prune_infrequent_tokens = [&](size_t frequency_threshold = 2) {
-        std::vector<TokenID> tokens_to_remove;
-        
-        // Identify tokens to remove (excluding special tokens)
-        for (const auto& [token_id, freq] : token_frequencies) {
-            if (freq < frequency_threshold) {
-                // Check if this is a special token
-                std::string token_text = pimpl_->inv_vocab.at(token_id);
-                if (pimpl_->special_tokens.find(token_text) == pimpl_->special_tokens.end()) {
-                    tokens_to_remove.push_back(token_id);
-                }
-            }
-        }
-        
-        // Remove tokens from vocabulary
-        for (TokenID token_id : tokens_to_remove) {
-            std::string token_text = pimpl_->inv_vocab.at(token_id);
-            
-            // Remove from vocabulary mappings
-            pimpl_->vocab.erase(token_text);
-            pimpl_->inv_vocab.erase(token_id);
-            token_frequencies.erase(token_id);
-            
-            // Update word counts to use subword components instead of removed tokens
-            std::unordered_map<std::string, int> updated_word_counts;
-            for (const auto& [word, count] : word_counts) {
-                std::string updated_word = word;
-                size_t pos = 0;
-                
-                // Replace all occurrences of the token text with its byte representation
-                while ((pos = updated_word.find(token_text, pos)) != std::string::npos) {
-                    // Replace with byte fallback
-                    std::string replacement;
-                    for (unsigned char c : token_text) {
-                        std::string byte_str(1, static_cast<char>(c));
-                        replacement += byte_str;
-                    }
-                    updated_word.replace(pos, token_text.size(), replacement);
-                    pos += replacement.size();
-                }
-                
-                updated_word_counts[updated_word] += count;
-            }
-            
-            // Update the word_counts with the modified words
-            word_counts = std::move(updated_word_counts);
-        }
-        
-        // Recalculate token frequencies after pruning
-        token_frequencies.clear();
-        for (const auto& [word, count] : word_counts) {
-            auto tokens = pimpl_->word_to_token_ids(word);
-            for (TokenID token : tokens) {
-                token_frequencies[token] += count;
-            }
-        }
-    };
-    
    while (pimpl_->vocab.size() < vocab_size && iteration < max_iterations) {
        // Count pairs
        std::unordered_map<std::pair<TokenID, TokenID>, int, PairHash> pair_counts;
@ -570,25 +500,7 @@ void BPETokenizer::train(const std::vector<std::string>& corpus, size_t vocab_si
        // Perform merge
        pimpl_->perform_merge(max_pair->first, pimpl_->next_token_id, word_counts);
        pimpl_->next_token_id++;
-        
-        // Update token frequencies
-        token_frequencies.clear();
-        for (const auto& [word, count] : word_counts) {
-            auto tokens = pimpl_->word_to_token_ids(word);
-            for (TokenID token : tokens) {
-                token_frequencies[token] += count;
-            }
-        }
-        
-        // Periodically prune infrequent tokens
-        if (iteration % 500 == 0 && iteration > 0) {
-            size_t pre_prune_size = pimpl_->vocab.size();
-            prune_infrequent_tokens(2); // Remove tokens with frequency < 2
-            
-            std::cout << "Pruned " << (pre_prune_size - pimpl_->vocab.size()) 
-                      << " infrequent tokens. New vocab size: " 
-                      << pimpl_->vocab.size() << std::endl;
-        }
+        iteration++;
        
        // Periodically check memory usage
        if (iteration % 500 == 0) {
@ -596,16 +508,8 @@ void BPETokenizer::train(const std::vector<std::string>& corpus, size_t vocab_si
            std::cout << "Memory after " << iteration << " iterations: " 
                      << (current_memory - start_memory) / (1024 * 1024) << "MB\n";
        }
-        
-        iteration++;
    }
    
-    // Final pruning after training completes
-    size_t pre_prune_size = pimpl_->vocab.size();
-    prune_infrequent_tokens(3); // Remove tokens with frequency < 3
-    std::cout << "Final pruning: Removed " << (pre_prune_size - pimpl_->vocab.size()) 
-              << " tokens. Final vocab size: " << pimpl_->vocab.size() << std::endl;
-    
    if (iteration >= max_iterations) {
        std::cout << "Reached maximum iterations. Stopping training." << std::endl;
    }
@ -615,9 +519,56 @@ void BPETokenizer::train(const std::vector<std::string>& corpus, size_t vocab_si
    std::cout << "Peak memory used: " << (end_memory - start_memory) / (1024 * 1024) << "MB\n";
    std::cout << "Final vocabulary size: " << pimpl_->vocab.size() << std::endl;

-    // Clear the string intern pool to free memory
+    // Add periodic memory cleanup
+    if (iteration % 1000 == 0) {
        pimpl_->string_pool.clear();
    }
+}
+
+// Loads a training corpus from a text file and trains the tokenizer on it.
+//
+// Every line of `filename` becomes one corpus entry (std::getline strips the
+// line terminator). The complete file is read into memory before train() is
+// invoked with the requested `vocab_size`.
+//
+// Throws std::runtime_error when the file cannot be opened.
+void BPETokenizer::train_from_file(const std::string& filename, size_t vocab_size) {
+    std::ifstream input(filename);
+    if (!input.is_open()) {
+        throw std::runtime_error("Cannot open file: " + filename);
+    }
+
+    std::vector<std::string> lines;
+    for (std::string current; std::getline(input, current);) {
+        lines.push_back(current);
+    }
+
+    train(lines, vocab_size);
+}
+
+// Encodes `text` into a sequence of BPE token ids.
+//
+// The text is split into words with Impl::split_text. Each word is converted
+// to its initial token ids (Impl::word_to_token_ids) and then adjacent pairs
+// that appear in the learned merge table are collapsed, left-most match first,
+// until no pair in the word matches any more. The per-word results are
+// concatenated in word order and returned.
+std::vector<TokenID> BPETokenizer::encode(const std::string& text) const {
+    auto words = pimpl_->split_text(text);
+    std::vector<TokenID> tokens;
+    tokens.reserve(text.size() * 2); // Pre-allocate based on text size
+
+    for (const auto& word : words) {
+        auto word_tokens = pimpl_->word_to_token_ids(word);
+
+        // Apply merges until a full pass over the word finds nothing to merge.
+        bool changed;
+        do {
+            changed = false;
+            // The bound is written as `i + 1 < size()` rather than
+            // `i < size() - 1`: size() is unsigned, so the subtraction wraps
+            // around for an empty word and the loop would index out of bounds.
+            for (size_t i = 0; i + 1 < word_tokens.size(); i++) {
+                auto pair = std::make_pair(word_tokens[i], word_tokens[i+1]);
+                if (auto it = pimpl_->merges.find(pair); it != pimpl_->merges.end()) {
+                    // Replace the pair with its merged id and restart the scan,
+                    // because the erase shifts every later position.
+                    word_tokens[i] = it->second;
+                    word_tokens.erase(word_tokens.begin() + i + 1);
+                    changed = true;
+                    break;
+                }
+            }
+        } while (changed);
+
+        tokens.insert(tokens.end(), word_tokens.begin(), word_tokens.end());
+    }
+
+    return tokens;
+}
+
 std::string BPETokenizer::decode(const std::vector<TokenID>& tokens) const {
    std::string text;
    for (TokenID token_id : tokens) {
@ -755,4 +706,39 @@ void BPETokenizer::Impl::get_pair_counts(
    }
 }

+// Encodes `text` with encode() and returns the ids as an Eigen integer vector.
+//
+// Each token id is narrowed to `int` with static_cast; the result has exactly
+// one entry per token, in encoding order.
+Eigen::VectorXi BPETokenizer::encode_to_vector(const std::string& text) const {
+    const auto ids = encode(text);
+
+    Eigen::VectorXi out(ids.size());
+    size_t pos = 0;
+    for (const auto id : ids) {
+        out(pos++) = static_cast<int>(id);
+    }
+
+    return out;
+}
+
+// Decodes an Eigen integer vector of token ids back into text.
+//
+// Every entry is converted to TokenID with static_cast and the resulting
+// sequence is handed to decode().
+std::string BPETokenizer::decode_from_vector(const Eigen::VectorXi& tokens) const {
+    std::vector<TokenID> ids;
+    ids.reserve(tokens.size());
+
+    for (int k = 0; k < tokens.size(); ++k) {
+        ids.push_back(static_cast<TokenID>(tokens(k)));
+    }
+
+    return decode(ids);
+}
+
+// Returns a per-token frequency vector with one entry per vocabulary item.
+//
+// This is a placeholder implementation: real frequencies are not tracked
+// during training, so every token is assigned the same value 1 / vocab_size.
+// An empty vocabulary yields an empty vector.
+Eigen::VectorXf BPETokenizer::token_frequencies() const {
+    // Query the member function explicitly. Declaring a local named
+    // `vocab_size` and initialising it from itself reads an indeterminate
+    // value instead of the real vocabulary size.
+    const size_t num_tokens = this->vocab_size();
+    Eigen::VectorXf frequencies(num_tokens);
+
+    if (num_tokens == 0) {
+        return frequencies;
+    }
+
+    // Initialize with equal frequencies (placeholder)
+    const float uniform = 1.0f / static_cast<float>(num_tokens);
+    for (size_t i = 0; i < num_tokens; ++i) {
+        frequencies(i) = uniform;
+    }
+
+    return frequencies;
+}
+
 } // namespace lm
--- a/src/train_lm.cpp
+++ b/src/train_lm.cpp
--- a/src/training/CMakeLists.txt
+++ b/src/training/CMakeLists.txt
@ -0,0 +1,17 @@
+# Training library: compiles the language-model trainer implementation.
+add_library(lm_training trainer.cpp)
+
+# Project headers live in the repository-level include/ directory (two levels
+# above this file); Eigen headers come from the user-supplied EIGEN_LOC.
+# Both are PUBLIC so that targets linking lm_training inherit them.
+target_include_directories(lm_training PUBLIC
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../include"
+    "${EIGEN_LOC}"
+)
+
+# NOTE(review): the top-level CMakeLists.txt defines an INTERFACE target named
+# lm_model (singular); confirm that src/models really provides lm_models.
+target_link_libraries(lm_training PUBLIC
+    lm_core_components
+    lm_models
+    lm_optimizers
+)
+
--- a/src/training/trainer.cpp
+++ b/src/training/trainer.cpp
@ -0,0 +1,135 @@
+#include "lm/training/trainer.hpp"
+#include <algorithm>
+#include <cmath>
+#include <iostream>
+#include <random>
+#include <stdexcept>
+
+namespace lm {
+
+// Builds a trainer around an existing tokenizer.
+//
+// The model is constructed from tokenizer.vocab_size() together with the given
+// embedding dimension, hidden dimension and layer count. The optimizer is
+// constructed with the fixed arguments (0.001, 0.9, 0.999, 1e-8) — presumably
+// learning rate, beta1, beta2 and epsilon of an Adam-style optimizer; confirm
+// against the optimizer's declaration.
+//
+// NOTE(review): if tokenizer_ is a reference member (as the inline comment
+// suggests), the tokenizer must outlive this trainer — confirm in the header.
+LanguageModelTrainer::LanguageModelTrainer(const BPETokenizer& tokenizer,
+                                         size_t embedding_dim,
+                                         size_t hidden_dim,
+                                         size_t num_layers)
+    : tokenizer_(tokenizer),  // Store reference
+      model_(tokenizer.vocab_size(), embedding_dim, hidden_dim, num_layers),
+      optimizer_(0.001, 0.9, 0.999, 1e-8) {}
+
+// Trains the model on `corpus` for `epochs` passes.
+//
+// For each epoch the corpus is shuffled, cut into groups of at most
+// `batch_size` texts, and each group is turned into a batch tensor by
+// prepare_batch(). The batch is split into an input slice and a target slice
+// offset by one position, the model is run forward, the loss is computed, and
+// the optimizer updates the model parameters. Progress is printed every 100
+// batches and at the end of every epoch.
+//
+// Parameters:
+//   corpus          - training texts; an empty corpus performs no updates.
+//   epochs          - number of passes over the corpus.
+//   batch_size      - texts per batch; must be > 0.
+//   sequence_length - tokens per text after padding/truncation; must be >= 2
+//                     so that input and target slices are non-empty.
+//
+// Throws std::invalid_argument when batch_size is 0 or sequence_length < 2.
+void LanguageModelTrainer::train(const std::vector<std::string>& corpus,
+                               size_t epochs,
+                               size_t batch_size,
+                               size_t sequence_length) {
+    // A zero batch size would never advance the batch loop below.
+    if (batch_size == 0) {
+        throw std::invalid_argument("LanguageModelTrainer::train: batch_size must be > 0");
+    }
+    // sequence_length - 1 is computed in unsigned arithmetic and would wrap.
+    if (sequence_length < 2) {
+        throw std::invalid_argument("LanguageModelTrainer::train: sequence_length must be >= 2");
+    }
+
+    model_.train();
+
+    // Copy once and reshuffle in place each epoch. The engine is created
+    // outside the epoch loop so that every epoch sees a different, yet still
+    // reproducible, ordering; re-seeding per epoch would repeat the same
+    // permutation every time.
+    std::vector<std::string> shuffled_corpus = corpus;
+    std::default_random_engine rng(42);
+
+    for (size_t epoch = 0; epoch < epochs; ++epoch) {
+        float total_loss = 0.0f;
+        size_t num_batches = 0;
+
+        // Shuffle corpus
+        std::shuffle(shuffled_corpus.begin(), shuffled_corpus.end(), rng);
+
+        // Process in batches
+        for (size_t i = 0; i < shuffled_corpus.size(); i += batch_size) {
+            size_t end = std::min(i + batch_size, shuffled_corpus.size());
+            std::vector<std::string> batch_texts(shuffled_corpus.begin() + i,
+                                               shuffled_corpus.begin() + end);
+
+            // Prepare batch
+            Tensor batch = prepare_batch(batch_texts, sequence_length);
+
+            // Split into input and target (target is shifted by one position)
+            Tensor input = batch.slice(0, sequence_length - 1, 0);
+            Tensor target = batch.slice(1, sequence_length - 1, 0);
+
+            // Forward pass
+            Tensor logits = model_.forward(input);
+
+            // Compute loss
+            float loss = compute_loss(logits, target);
+            total_loss += loss;
+
+            // Backward pass
+            // NOTE(review): backward() is invoked on the logits while the loss
+            // is a plain float, so the loss value does not appear to feed the
+            // gradient computation — confirm against the Tensor autograd API.
+            logits.backward();
+
+            // Update parameters - store in variable to avoid rvalue reference issue
+            auto params = model_.parameters();
+            optimizer_.step(params);
+            optimizer_.zero_grad(params);
+
+            num_batches++;
+
+            if (num_batches % 100 == 0) {
+                std::cout << "Epoch " << epoch + 1 << ", Batch " << num_batches
+                          << ", Loss: " << loss << std::endl;
+            }
+        }
+
+        // Avoid a 0/0 average when the corpus is empty.
+        if (num_batches > 0) {
+            std::cout << "Epoch " << epoch + 1 << " completed. Average loss: "
+                      << total_loss / num_batches << std::endl;
+        } else {
+            std::cout << "Epoch " << epoch + 1 << " completed. No batches processed."
+                      << std::endl;
+        }
+    }
+}
+
+// Converts a group of texts into one batch tensor of token ids.
+//
+// Each text is encoded with the tokenizer. The returned tensor has shape
+// {sequence_length, texts.size()}: column i holds the first `sequence_length`
+// token ids of texts[i] stored as floats, and positions past the end of a
+// text are filled with 0.0f (padding). Longer texts are truncated.
+Tensor LanguageModelTrainer::prepare_batch(const std::vector<std::string>& texts,
+                                         size_t sequence_length) {
+    const size_t num_texts = texts.size();
+
+    // Encode everything up front, keeping the order of `texts`.
+    std::vector<std::vector<TokenID>> encoded;
+    for (const auto& entry : texts) {
+        encoded.push_back(tokenizer_.encode(entry));
+    }
+
+    // The shape is passed as a named vector to select the intended constructor.
+    std::vector<size_t> dims = {sequence_length, num_texts};
+    Tensor out(dims);
+
+    for (size_t col = 0; col < num_texts; ++col) {
+        const auto& ids = encoded[col];
+        for (size_t row = 0; row < sequence_length; ++row) {
+            // Real token where available, zero padding otherwise.
+            out(row, col) = (row < ids.size()) ? static_cast<float>(ids[row]) : 0.0f;
+        }
+    }
+
+    return out;
+}
+
+// Computes the mean cross-entropy loss of `logits` against `targets`.
+//
+// `targets` is indexed as (position, batch_item) and holds class ids stored as
+// floats; `logits` is indexed as (position, batch_item, class). Targets equal
+// to 0 are treated as padding and ignored.
+// NOTE(review): this treats token id 0 as padding, so a genuine token with
+// id 0 is never counted — confirm against the tokenizer's vocabulary.
+//
+// The loss is the negative log of the probability assigned to each non-padding
+// target, averaged over the number of non-padding targets. Returns 0 when
+// there are none (for example an empty batch), instead of dividing by zero.
+float LanguageModelTrainer::compute_loss(const Tensor& logits, const Tensor& targets) {
+    // softmax() is assumed to return normalized probabilities (not
+    // log-probabilities), so the logarithm is taken explicitly below.
+    Tensor probs = logits.softmax(-1);
+
+    // Lower bound that keeps std::log finite when a probability underflows.
+    const float min_prob = 1e-12f;
+
+    const size_t batch_size = targets.shape()[1];
+    const size_t seq_length = targets.shape()[0];
+
+    float total = 0.0f;
+    size_t counted = 0;
+
+    for (size_t i = 0; i < batch_size; ++i) {
+        for (size_t j = 0; j < seq_length; ++j) {
+            int target_class = static_cast<int>(targets(j, i));
+            if (target_class == 0) {  // Skip padding
+                continue;
+            }
+            const float p = probs(j, i, target_class);
+            total -= std::log(std::max(p, min_prob));
+            ++counted;
+        }
+    }
+
+    // Average over the positions that actually contributed to the sum, so
+    // the value does not depend on how much padding the batch contains.
+    if (counted == 0) {
+        return 0.0f;
+    }
+    return total / static_cast<float>(counted);
+}
+
+// Writes the model to `path` through the model's own save() and reports the
+// destination on standard output.
+void LanguageModelTrainer::save_model(const std::string& path) {
+    model_.save(path);
+
+    std::cout << "Model saved to: ";
+    std::cout << path << '\n' << std::flush;
+}
+
+// Restores the model from `path` through the model's own load() and reports
+// the source on standard output.
+void LanguageModelTrainer::load_model(const std::string& path) {
+    model_.load(path);
+
+    std::cout << "Model loaded from: ";
+    std::cout << path << '\n' << std::flush;
+}
+
+
+} // namespace lm
--- a/todo.md
+++ b/todo.md