Lotta work here

This commit is contained in:
Tim O\'Neil 2025-08-27 14:02:03 -07:00
parent d83d07a823
commit d89095e49b
34 changed files with 3207 additions and 186 deletions

View File

@ -1,6 +1,22 @@
cmake_minimum_required(VERSION 3.14)
project(lm_framework LANGUAGES CXX)
# Check for Intel x86-64 hardware.
# CMAKE_SYSTEM_PROCESSOR is quoted so that an empty value still produces a
# well-formed three-argument list(FIND) call; the lookup then yields -1 and the
# readable FATAL_ERROR below is reported instead of a CMake syntax error.
# NOTE(review): the accepted list also contains i686/i386 (32-bit x86) although
# the message says x86-64 -- confirm whether 32-bit hosts are really supported.
set(SUPPORTED_ARCHITECTURES x86_64 amd64 AMD64 i686 i386)
list(FIND SUPPORTED_ARCHITECTURES "${CMAKE_SYSTEM_PROCESSOR}" ARCH_INDEX)
if(ARCH_INDEX EQUAL -1)
  message(FATAL_ERROR "This framework requires Intel x86-64 hardware. "
    "Current processor architecture: ${CMAKE_SYSTEM_PROCESSOR}")
endif()
# Check for EIGEN_LOC variable.
# Configuration stops when EIGEN_LOC is undefined, empty, or does not contain
# the Eigen/Core header, so a wrong path is reported immediately rather than
# at build time.
if(NOT DEFINED EIGEN_LOC)
  message(FATAL_ERROR "This framework requires the location of the Eigen header files. "
    "Please set EIGEN_LOC to the path of your Eigen installation.")
elseif("${EIGEN_LOC}" STREQUAL "")
  message(FATAL_ERROR "EIGEN_LOC is empty. Please set it to the path of your Eigen installation.")
else()
  # Resolve relative paths against the source directory, which is how the
  # include-directory commands interpret them; if(EXISTS) needs a full path.
  get_filename_component(lm_eigen_abs_dir "${EIGEN_LOC}" ABSOLUTE
    BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
  if(NOT EXISTS "${lm_eigen_abs_dir}/Eigen/Core")
    message(FATAL_ERROR "Eigen not found at specified path: ${EIGEN_LOC}")
  endif()
endif()
# Set default build type to Release if not specified
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
@ -19,6 +35,7 @@ endif()
# Include directories
# Directory-scoped: these paths are added for the targets defined in this
# directory, not for one specific target.
include_directories(
${CMAKE_CURRENT_SOURCE_DIR}/include
${EIGEN_LOC} # Local Eigen installation
)
# Find dependencies
@ -37,6 +54,24 @@ FetchContent_MakeAvailable(googletest)
# Add subdirectories
# Each directory listed here is processed as its own CMake subproject.
add_subdirectory(src/tokenizer)
add_subdirectory(src/runtime)
add_subdirectory(src/optimizers) # NEW: Add optimizers directory
add_subdirectory(src/models) # NEW: Add models directory
add_subdirectory(src/training) # NEW: Add training directory
# Header-only core components (Tensor implementation)
# INTERFACE library: it builds nothing itself and only forwards its include
# directories to the targets that link it.
add_library(lm_core_components INTERFACE)
target_include_directories(lm_core_components INTERFACE
${CMAKE_CURRENT_SOURCE_DIR}/include
${EIGEN_LOC} # Local Eigen installation
)
# Header-only model components
# Declares the same two include directories again and additionally links
# lm_core_components.
add_library(lm_model INTERFACE)
target_include_directories(lm_model INTERFACE
${CMAKE_CURRENT_SOURCE_DIR}/include
${EIGEN_LOC} # Local Eigen installation
)
target_link_libraries(lm_model INTERFACE lm_core_components)
# Main library
add_library(lm_core
@ -47,6 +82,7 @@ add_library(lm_core
target_link_libraries(lm_core
PRIVATE
lm_tokenizer
lm_model
nlohmann_json::nlohmann_json
)
@ -73,6 +109,26 @@ target_link_libraries(test_unicode_bpe
GTest::gtest_main
)
# NEW: Optional test executables for the optimizers and training code.
# A test target is only defined when its source file is present under src/.
foreach(optional_test IN ITEMS test_optimizers test_training)
  if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/src/${optional_test}.cpp)
    add_executable(${optional_test} src/${optional_test}.cpp)
    target_link_libraries(${optional_test}
      PRIVATE
        lm_core
        GTest::gtest_main
    )
  endif()
endforeach()
# Alpha prototype executable
add_executable(lm_alpha
src/alpha/repl.cpp
@ -85,8 +141,31 @@ target_link_libraries(lm_alpha
nlohmann_json::nlohmann_json
)
# NEW: Training example executable (only if file exists)
# When examples/train_lm.cpp is absent the train_lm target is simply not
# defined; nothing else in this block runs.
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/examples/train_lm.cpp)
add_executable(train_lm examples/train_lm.cpp)
target_link_libraries(train_lm
PRIVATE
lm_core
)
endif()
# Install targets
install(TARGETS lm_core DESTINATION lib)
# The component libraries are installed only when a target of that name exists.
foreach(optional_lib IN ITEMS lm_optimizers lm_models lm_training)
  if(TARGET ${optional_lib})
    install(TARGETS ${optional_lib} DESTINATION lib)
  endif()
endforeach()
# Public headers
install(DIRECTORY include/ DESTINATION include)
# Performance testing target
@ -97,6 +176,16 @@ target_link_libraries(performance_test
GTest::gtest_main
)
# Integration example
# Links lm_core plus the models, optimizers and training libraries. All four
# are linked unconditionally; there is no if(TARGET ...) guard in this block.
add_executable(integration_example src/integration_example.cpp)
target_link_libraries(integration_example
PRIVATE
lm_core
lm_models # Models library
lm_optimizers # Optimizers library
lm_training # Training library
)
# Add compiler warning flags
if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -Werror")
@ -110,3 +199,31 @@ if(CMAKE_BUILD_TYPE STREQUAL "Debug")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fprofile-instr-generate -fcoverage-mapping")
endif()
endif()
# Verify Eigen installation
# Build-time check: echoes the configured EIGEN_LOC, then fails the build when
# ${EIGEN_LOC}/Eigen/Core is not a regular file.
# NOTE(review): the second COMMAND relies on POSIX shell syntax (test, ||,
# parentheses, exit) -- presumably this does not work with non-POSIX command
# shells; confirm which generators/platforms must be supported.
# NOTE(review): EIGEN_LOC is not quoted, so a path containing spaces is likely
# to break the command line -- verify before relying on such paths.
add_custom_target(check_eigen
COMMAND ${CMAKE_COMMAND} -E echo "Checking Eigen installation at ${EIGEN_LOC}"
COMMAND test -f ${EIGEN_LOC}/Eigen/Core || (echo "Eigen not found at specified path: ${EIGEN_LOC}" && exit 1)
COMMENT "Verifying Eigen installation"
)
# Make main targets depend on Eigen check
foreach(required_target IN ITEMS
    lm_core
    test_bpe
    test_unicode_bpe
    lm_alpha
    performance_test
    integration_example)
  add_dependencies(${required_target} check_eigen)
endforeach()
# Only add dependencies if the targets exist
foreach(optional_target IN ITEMS train_lm test_optimizers test_training)
  if(TARGET ${optional_target})
    add_dependencies(${optional_target} check_eigen)
  endif()
endforeach()

213
README.md
View File

@ -1,82 +1,53 @@
# bpe_framework
## Byte Pair Encoding Framework
Large Language Model for Agentic AI
Build: cmake -DCMAKE_POLICY_VERSION_MINIMUM=3.5 ..
Fully internationalized framework for Agentic AI research
#### The test_bpe application does the following:
1. Includes necessary headers and defines the main function.
2. Creates an instance of the BPETokenizer.
3. Defines a training corpus (a vector of strings).
4. Trains the tokenizer on the corpus with a specified vocabulary size (500 in this case).
5. Tests the tokenizer by encoding a sample string ("the quick brown fox").
6. Decodes the tokens back to a string and prints the original, tokens, and decoded string.
7. Saves the tokenizer to a file ("bpe_model.txt").
8. Loads the tokenizer from the file and verifies the loaded tokenizer's vocabulary size.
The purpose of this test is to verify that the BPE tokenizer can be trained, encode, decode, and serialize/deserialize correctly.
Let's break down the code step by step.
test_bpe Application Overview
Requires:
1. nlohmann/json (https://github.com/nlohmann/json)
2. Internationalization library for Unicode by Frederick Roubert (https://github.com/unicode-org/icu)
3. OpenNMT Tokenizer by Thuc Pham (https://github.com/OpenNMT/Tokenizer)
4. Eigen header files (https://github.com/PX4/eigen)
Build: cmake -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DEIGEN_LOC=<eigen3 folder> ..
#### The test_bpe application is a comprehensive test program that validates the functionality of the BPE tokenizer implementation in the LM Framework. Here's how it works:
1. Initialization
1. Initialization:
Creates an instance of BPETokenizer
Defines a training corpus with sample English text
2. Training Process
2. Training Process:
Calls tokenizer.train(corpus, 500) to train the tokenizer
The training process:
Initializes with byte-level vocabulary (0-255)
Analyzes word frequencies in the corpus
Iteratively merges the most frequent character pairs
Builds a vocabulary of 500 tokens (as specified)
3. Encoding Test
3. Encoding Test:
Encodes the test string "the quick brown fox"
The encoding process:
Splits text into words
Converts each character to its initial token ID
Applies learned BPE merges to combine tokens
Returns a sequence of integer token IDs
4. Decoding Test
4. Decoding Test:
Decodes the token IDs back to text
The decoding process:
Converts each token ID back to its string representation
Concatenates the strings to reconstruct the original text
5. Serialization Test
Saves the trained tokenizer to "bpe_model.txt"
The serialization process:
Writes vocabulary size and token-ID mappings
Records all learned merge rules
6. Deserialization Test
Loads the tokenizer from "bpe_model.txt"
Verifies the loaded tokenizer has the same vocabulary size
Confirms the tokenizer can perform encoding/decoding
Expected Output
@ -91,22 +62,32 @@ Successfully loaded tokenizer
Loaded vocabulary size: 500
Key Validations
Training Completes without errors
Encoding/Decoding Round-Trip preserves the original text
Serialization/Deserialization maintains tokenizer state
Vocabulary Size matches the specified target (500)
Token IDs are consistent between sessions
# BPE Tokenizer Performance Test Suite
## test_unicode.cpp
## Overview
### Lower-level Unicode-specific tests:
This performance test application is a comprehensive benchmarking tool designed to evaluate the efficiency and scalability of the Byte Pair Encoding (BPE) tokenizer implementation. The test suite measures critical performance metrics including training time, memory usage, encoding/decoding speed, and serialization performance across various configurations.
Unicode normalization functions
Character boundary detection
Grapheme cluster handling
Encoding conversion utilities
Validation of Unicode compliance
## BPE Tokenizer Performance Test Suite
### Overview
The performance test application is a comprehensive benchmarking tool designed to evaluate the efficiency and scalability of the Byte Pair Encoding (BPE) tokenizer implementation. The test suite measures critical performance metrics including training time, memory usage, encoding/decoding speed, and serialization performance across various configurations.
## Key Features
@ -162,4 +143,136 @@ The application provides detailed performance reports including:
This test framework serves as an essential tool for developers and researchers working with BPE tokenizers, providing quantitative data to guide optimization efforts and implementation choices.
## Technical Summary: BPE Framework
### Overview
The BPE Framework is a C++-based neural network framework designed for building and training language models with Byte Pair Encoding (BPE) tokenization. It implements a complete deep learning stack with automatic differentiation, optimization, and model serialization capabilities.
Core Components
#### 1. Tensor Operations with Autograd
Header-only Tensor class with Eigen backend for efficient linear algebra
Automatic differentiation with backward propagation
Comprehensive operator support: element-wise operations, matrix multiplication, reductions
Activation functions: ReLU, GELU, Softmax, Sigmoid with gradient support
Memory-efficient implementation with shape-aware operations
#### 2. BPE Tokenizer
PIMPL pattern implementation for API stability
Efficient vocabulary management with merge operations
Encoding/decoding support for text processing
Non-copyable design (uses unique_ptr) for proper resource management
#### 3. Neural Network Architecture
Transformer-based language model implementation
Configurable dimensions: embedding size, hidden layers, attention heads
Parameter management with named parameters for serialization
Training/inference modes support
#### 4. Training Infrastructure
Adam optimizer with configurable hyperparameters
Gradient accumulation and moment estimation
Batch processing with sequence padding
Loss computation (cross-entropy) with masking support
#### 5. Model Serialization
Binary format with versioning and magic number validation
Parameter-by-name storage and retrieval
Shape preservation and data integrity checks
Error handling for file operations and format validation
### Key Technical Features
#### Memory Management
Eigen integration for optimized matrix operations
Shape-aware memory allocation preventing unnecessary copies
RAII principles for resource management
#### Performance Considerations
Header-only design for Tensor class enabling compiler optimizations
Batch processing for efficient training
In-place operations where possible to reduce memory overhead
#### Extensibility
Modular architecture allowing component replacement
Clear interfaces between tokenizer, model, and training components
Parameter naming convention supporting complex architectures
#### Architecture Patterns
PIMPL Idiom: Used in tokenizer for stable ABI
RAII: Comprehensive resource management throughout
Builder Pattern: Model configuration through constructor parameters
Strategy Pattern: Optimizer implementation allowing algorithm changes
#### Current Capabilities
* Automatic differentiation with reverse-mode autograd
* BPE tokenization with vocabulary learning
* Transformer language model training
* Adam optimization with moment estimation
* Model serialization/deserialization
* Configurable network architectures
* Batch processing with padding
### Technical Stack
C++17 with standard library components
Eigen for linear algebra operations
CMake for build system management
Header-only design for core components
#### Usage Example
// Initialize components
BPETokenizer tokenizer(corpus);
LanguageModel model(tokenizer.vocab_size(), 512, 2048, 8);
LanguageModelTrainer trainer(tokenizer, 512, 2048, 8);
// Train model
trainer.train(training_corpus, 10, 32, 256);
trainer.save_model("language_model.bin");
Based on the research of Timothy O'Neil, Frederick Warren, et al.

5
build_log.md Normal file
View File

@ -0,0 +1,5 @@
### 8/24/2025 - Eigen integrated
Turns out Eigen can only do 1 & 2D transforms so I had to "flatten out" the objects that required transformation and work on each dimension separately. 3 days of work.
### 8/25/2025 - Tensor Transformer
Got the transformer code wired in. Some really crazy geometry goes into making machines seem like they're talking to you.

730
include/lm/core/tensor.hpp Normal file
View File

@ -0,0 +1,730 @@
#pragma once
#include <Eigen/Dense>
#include <vector>
#include <memory>
#include <random>
#include <cmath>
#include <functional>
#include <iostream>
#include <stdexcept>
namespace lm {
class Tensor;
Tensor operator*(float scalar, const Tensor& tensor);
// Scalar multiplication (Tensor * float) - already defined as member function
// Tensor operator*(const Tensor& tensor, float scalar);
class Tensor {
public:
// Default constructor: empty 0x0 data, shape {0}, gradient tracking disabled.
Tensor() : data_(Eigen::MatrixXf(0, 0)), shape_({0}), requires_grad_(false) {}
// Element-wise square root.
// Returns a new tensor with the same shape. When this tensor tracks gradients,
// the result carries a backward closure that accumulates into this->grad_.
Tensor sqrt() const {
    Tensor result(data_.array().sqrt(), shape_);
    if (requires_grad_) {
        result.requires_grad(true);
        // NOTE(review): the closure holds a raw `this` and a *copy* of result,
        // so it reads the copy's grad_ -- confirm how upstream gradients are
        // meant to reach it and that `this` outlives the returned tensor.
        result.backward_fn_ = [this, result]() {
            if (this->requires_grad_) {
                // d/dx sqrt(x) = 0.5 / sqrt(x); the epsilon avoids division by zero.
                // ArrayXXf (2-D) is required here: ArrayXf is a column vector and
                // cannot hold the gradient factor of a matrix-shaped tensor.
                const Eigen::ArrayXXf grad_sqrt =
                    0.5f / (this->data_.array().sqrt() + 1e-12f);
                this->grad_.array() += result.grad_.array() * grad_sqrt;
            }
        };
    }
    return result;
}
// Zero-initialised tensor of the given shape.
// A rank-2 shape is stored as a rows x cols matrix; every other rank is stored
// as a single column holding prod(shape) elements (the flattened form).
// grad_ is allocated with the same layout only when requires_grad is true.
Tensor(const std::vector<size_t>& shape, bool requires_grad = false)
    : shape_(shape), requires_grad_(requires_grad) {
    if (shape.size() == 2) {
        data_ = Eigen::MatrixXf::Zero(shape[0], shape[1]);
    } else {
        size_t element_count = 1;
        for (size_t extent : shape) {
            element_count *= extent;
        }
        data_ = Eigen::VectorXf::Zero(element_count);
    }
    if (requires_grad) {
        grad_ = Eigen::MatrixXf::Zero(data_.rows(), data_.cols());
    }
}
// Wraps an existing matrix.
// With an empty `shape` the logical shape is derived from the matrix: {rows}
// for a single-column matrix, {rows, cols} otherwise. A non-empty `shape` is
// stored as given. grad_ is zero-allocated when requires_grad is true.
Tensor(const Eigen::MatrixXf& data, const std::vector<size_t>& shape = {}, bool requires_grad = false)
    : data_(data), shape_(shape), requires_grad_(requires_grad) {
    if (shape_.empty()) {
        shape_.push_back(static_cast<size_t>(data_.rows()));
        if (data_.cols() != 1) {
            shape_.push_back(static_cast<size_t>(data_.cols()));
        }
    }
    if (requires_grad_) {
        grad_ = Eigen::MatrixXf::Zero(data_.rows(), data_.cols());
    }
}
// Accessors
// Logical shape as supplied at construction (may differ from the storage layout of data_).
const std::vector<size_t>& shape() const { return shape_; }
// Direct access to the underlying Eigen storage.
Eigen::MatrixXf& data() { return data_; }
const Eigen::MatrixXf& data() const { return data_; }
// Direct access to the accumulated gradient (empty unless gradient tracking was enabled).
Eigen::MatrixXf& grad() { return grad_; }
const Eigen::MatrixXf& grad() const { return grad_; }
bool requires_grad() const { return requires_grad_; }
// Enables or disables gradient tracking; allocates a zero grad_ the first time
// tracking is enabled. An existing grad_ is left untouched.
void requires_grad(bool requires_grad) {
requires_grad_ = requires_grad;
if (requires_grad && grad_.size() == 0) {
grad_ = Eigen::MatrixXf::Zero(data_.rows(), data_.cols());
}
}
// Resets every gradient entry to zero without changing its dimensions.
void zero_grad() {
grad_.setZero();
}
// Element access
// Single-index access forwards to Eigen's linear indexing on data_.
float& operator()(size_t i) { return data_(i); }
float operator()(size_t i) const { return data_(i); }
// Row/column access into data_; no bounds checks are performed in this class.
float& operator()(size_t i, size_t j) { return data_(i, j); }
float operator()(size_t i, size_t j) const { return data_(i, j); }
// 3D indexing operators
// Map (i, j, k) to the flattened index i*shape[1]*shape[2] + j*shape[2] + k.
// Throws std::runtime_error when the tensor is not rank 3 and
// std::out_of_range when any index exceeds its axis extent (without this
// check an oversized j or k would silently alias a different element).
float& operator()(size_t i, size_t j, size_t k) {
    if (shape_.size() != 3) {
        throw std::runtime_error("3D access requires 3D tensor");
    }
    if (i >= shape_[0] || j >= shape_[1] || k >= shape_[2]) {
        throw std::out_of_range("3D index out of range");
    }
    const size_t index = i * shape_[1] * shape_[2] + j * shape_[2] + k;
    return data_(index);
}
float operator()(size_t i, size_t j, size_t k) const {
    if (shape_.size() != 3) {
        throw std::runtime_error("3D access requires 3D tensor");
    }
    if (i >= shape_[0] || j >= shape_[1] || k >= shape_[2]) {
        throw std::out_of_range("3D index out of range");
    }
    const size_t index = i * shape_[1] * shape_[2] + j * shape_[2] + k;
    return data_(index);
}
// Shape utilities
// Number of stored elements (taken from data_, not from shape_).
size_t size() const { return data_.size(); }
// Extent of the given axis; axes beyond the rank report 1.
size_t dim(size_t axis) const {
return (axis < shape_.size()) ? shape_[axis] : 1;
}
// Rank of the logical shape.
size_t ndim() const { return shape_.size(); }
// Reshape the tensor
// Returns a copy that carries `new_shape` as its logical shape. The element
// count must be unchanged, otherwise std::invalid_argument is thrown. The
// underlying matrix (and grad_, when tracked) is copied as-is, without being
// rearranged.
Tensor reshape(const std::vector<size_t>& new_shape) const {
    size_t requested = 1;
    for (size_t extent : new_shape) {
        requested *= extent;
    }
    if (requested != size()) {
        throw std::invalid_argument("Total size must remain the same when reshaping");
    }
    Tensor reshaped(data_, new_shape, requires_grad_);
    if (requires_grad_) {
        reshaped.grad_ = grad_;
    }
    return reshaped;
}
// Mathematical operations with autograd
// Element-wise addition of two tensors with identical shapes.
// Throws std::invalid_argument on a shape mismatch. When either operand tracks
// gradients the result stores a closure that adds its gradient to both inputs.
Tensor operator+(const Tensor& other) const {
    if (shape_ != other.shape_) {
        throw std::invalid_argument("Tensor shapes must match for addition");
    }
    Tensor out(data_ + other.data_, shape_);
    if (!(requires_grad_ || other.requires_grad_)) {
        return out;
    }
    out.requires_grad(true);
    // The closure keeps a raw `this`, a reference to `other` and a copy of `out`.
    out.backward_fn_ = [this, &other, out]() {
        if (this->requires_grad_) {
            this->grad_ += out.grad_;
        }
        if (other.requires_grad_) {
            other.grad_ += out.grad_;
        }
    };
    return out;
}
// Element-wise subtraction of two tensors with identical shapes.
// Throws std::invalid_argument on a shape mismatch. The backward closure adds
// the result gradient to this tensor and subtracts it from `other`.
Tensor operator-(const Tensor& other) const {
    if (shape_ != other.shape_) {
        throw std::invalid_argument("Tensor shapes must match for subtraction");
    }
    Tensor out(data_ - other.data_, shape_);
    if (!(requires_grad_ || other.requires_grad_)) {
        return out;
    }
    out.requires_grad(true);
    out.backward_fn_ = [this, &other, out]() {
        if (this->requires_grad_) {
            this->grad_ += out.grad_;
        }
        if (other.requires_grad_) {
            other.grad_ -= out.grad_;
        }
    };
    return out;
}
// Element-wise (Hadamard) product of two tensors with identical shapes.
// Throws std::invalid_argument on a shape mismatch. In the backward closure
// each operand receives the result gradient scaled by the other operand.
Tensor operator*(const Tensor& other) const {
    if (shape_ != other.shape_) {
        throw std::invalid_argument("Tensor shapes must match for element-wise multiplication");
    }
    Tensor out(data_.cwiseProduct(other.data_), shape_);
    if (!(requires_grad_ || other.requires_grad_)) {
        return out;
    }
    out.requires_grad(true);
    out.backward_fn_ = [this, &other, out]() {
        if (this->requires_grad_) {
            this->grad_ += out.grad_.cwiseProduct(other.data_);
        }
        if (other.requires_grad_) {
            other.grad_ += out.grad_.cwiseProduct(this->data_);
        }
    };
    return out;
}
// Element-wise quotient of two tensors with identical shapes.
// Throws std::invalid_argument on a shape mismatch. Backward: the numerator
// receives grad / other, the denominator receives -grad * this / other^2.
Tensor operator/(const Tensor& other) const {
    if (shape_ != other.shape_) {
        throw std::invalid_argument("Tensor shapes must match for element-wise division");
    }
    Tensor out(data_.cwiseQuotient(other.data_), shape_);
    if (!(requires_grad_ || other.requires_grad_)) {
        return out;
    }
    out.requires_grad(true);
    out.backward_fn_ = [this, &other, out]() {
        if (this->requires_grad_) {
            this->grad_ += out.grad_.cwiseQuotient(other.data_);
        }
        if (other.requires_grad_) {
            other.grad_ -= out.grad_.cwiseProduct(this->data_).cwiseQuotient(other.data_.cwiseProduct(other.data_));
        }
    };
    return out;
}
// Adds a scalar to every element; the gradient passes through unchanged.
Tensor operator+(float scalar) const {
    Tensor out(data_.array() + scalar, shape_);
    if (!requires_grad_) {
        return out;
    }
    out.requires_grad(true);
    out.backward_fn_ = [this, out]() {
        if (this->requires_grad_) {
            this->grad_ += out.grad_;
        }
    };
    return out;
}
// Subtracts a scalar from every element; the gradient passes through unchanged.
Tensor operator-(float scalar) const {
    Tensor out(data_.array() - scalar, shape_);
    if (!requires_grad_) {
        return out;
    }
    out.requires_grad(true);
    out.backward_fn_ = [this, out]() {
        if (this->requires_grad_) {
            this->grad_ += out.grad_;
        }
    };
    return out;
}
// Scales every element by a scalar; the gradient is scaled by the same factor.
Tensor operator*(float scalar) const {
    Tensor out(data_ * scalar, shape_);
    if (!requires_grad_) {
        return out;
    }
    out.requires_grad(true);
    out.backward_fn_ = [this, scalar, out]() {
        if (this->requires_grad_) {
            this->grad_ += out.grad_ * scalar;
        }
    };
    return out;
}
// Divides every element by a scalar; the gradient is divided by the same value.
// No check is made for a zero divisor.
Tensor operator/(float scalar) const {
    Tensor out(data_ / scalar, shape_);
    if (!requires_grad_) {
        return out;
    }
    out.requires_grad(true);
    out.backward_fn_ = [this, scalar, out]() {
        if (this->requires_grad_) {
            this->grad_ += out.grad_ / scalar;
        }
    };
    return out;
}
// Matrix product of two rank-2 tensors: (m x k) * (k x n) -> (m x n).
// Throws std::invalid_argument when either operand is not rank 2 or when the
// inner dimensions differ. Backward: dA += dC * B^T, dB += A^T * dC.
Tensor matmul(const Tensor& other) const {
    if (ndim() != 2 || other.ndim() != 2) {
        throw std::invalid_argument("matmul requires 2D tensors");
    }
    if (shape_[1] != other.shape_[0]) {
        throw std::invalid_argument("Incompatible dimensions for matrix multiplication");
    }
    Tensor product(data_ * other.data_, {shape_[0], other.shape()[1]});
    if (!(requires_grad_ || other.requires_grad_)) {
        return product;
    }
    product.requires_grad(true);
    product.backward_fn_ = [this, &other, product]() {
        if (this->requires_grad_) {
            this->grad_ += product.grad_ * other.data_.transpose();
        }
        if (other.requires_grad_) {
            other.grad_ += this->data_.transpose() * product.grad_;
        }
    };
    return product;
}
// Transpose of a rank-2 tensor; throws std::invalid_argument for other ranks.
// Backward: the transposed result gradient is accumulated into this tensor.
Tensor transpose() const {
    if (ndim() != 2) {
        throw std::invalid_argument("transpose requires 2D tensors");
    }
    Tensor flipped(data_.transpose(), {shape_[1], shape_[0]});
    if (!requires_grad_) {
        return flipped;
    }
    flipped.requires_grad(true);
    flipped.backward_fn_ = [this, flipped]() {
        if (this->requires_grad_) {
            this->grad_ += flipped.grad_.transpose();
        }
    };
    return flipped;
}
// Reduction operations
// Sum of elements.
//   axis == -1 (or a rank-1 tensor): sum of everything, returned as a 1x1 tensor
//   axis == 0: sum over rows, one value per column (stored as 1 x cols)
//   any other axis: sum over columns, one value per row (stored as rows x 1)
// Backward broadcasts the result gradient back over the reduced axis.
Tensor sum(int axis = -1) const {
    Tensor result;
    if (axis == -1 || ndim() == 1) {
        result = Tensor(Eigen::MatrixXf::Constant(1, 1, data_.sum()));
    } else if (axis == 0) {
        result = Tensor(data_.colwise().sum(), {shape_[1]});
    } else {
        result = Tensor(data_.rowwise().sum(), {shape_[0]});
    }
    if (requires_grad_) {
        result.requires_grad(true);
        result.backward_fn_ = [this, axis, result]() {
            if (this->requires_grad_) {
                if (axis == -1 || ndim() == 1) {
                    this->grad_.array() += result.grad_(0, 0);
                } else if (axis == 0) {
                    // result.grad_ is already 1 x cols, the same shape as one
                    // row of grad_; transposing it (as before) made the shapes
                    // incompatible whenever cols != 1.
                    for (Eigen::Index i = 0; i < this->grad_.rows(); ++i) {
                        this->grad_.row(i) += result.grad_.row(0);
                    }
                } else {
                    // result.grad_ is rows x 1, matching one column of grad_.
                    for (Eigen::Index j = 0; j < this->grad_.cols(); ++j) {
                        this->grad_.col(j) += result.grad_.col(0);
                    }
                }
            }
        };
    }
    return result;
}
// Arithmetic mean of elements.
//   axis == -1 (or a rank-1 tensor): mean of everything, returned as a 1x1 tensor
//   axis == 0: mean over rows, one value per column (stored as 1 x cols)
//   any other axis: mean over columns, one value per row (stored as rows x 1)
// Backward broadcasts the result gradient, divided by the number of averaged
// elements, back over the reduced axis.
Tensor mean(int axis = -1) const {
    Tensor result;
    float divisor;
    if (axis == -1 || ndim() == 1) {
        divisor = static_cast<float>(data_.size());
        result = Tensor(Eigen::MatrixXf::Constant(1, 1, data_.mean()));
    } else if (axis == 0) {
        divisor = static_cast<float>(data_.rows());
        result = Tensor(data_.colwise().mean(), {shape_[1]});
    } else {
        divisor = static_cast<float>(data_.cols());
        result = Tensor(data_.rowwise().mean(), {shape_[0]});
    }
    if (requires_grad_) {
        result.requires_grad(true);
        result.backward_fn_ = [this, axis, divisor, result]() {
            if (this->requires_grad_) {
                if (axis == -1 || ndim() == 1) {
                    this->grad_.array() += result.grad_(0, 0) / divisor;
                } else if (axis == 0) {
                    // result.grad_ is already 1 x cols, the same shape as one
                    // row of grad_; transposing it (as before) made the shapes
                    // incompatible whenever cols != 1.
                    for (Eigen::Index i = 0; i < this->grad_.rows(); ++i) {
                        this->grad_.row(i) += result.grad_.row(0) / divisor;
                    }
                } else {
                    // result.grad_ is rows x 1, matching one column of grad_.
                    for (Eigen::Index j = 0; j < this->grad_.cols(); ++j) {
                        this->grad_.col(j) += result.grad_.col(0) / divisor;
                    }
                }
            }
        };
    }
    return result;
}
// Activation functions with autograd
// Rectified linear unit: max(x, 0) applied element-wise.
// Backward lets the gradient through only where the input was strictly positive.
Tensor relu() const {
    Tensor activated(data_.cwiseMax(0.0f), shape_);
    if (!requires_grad_) {
        return activated;
    }
    activated.requires_grad(true);
    activated.backward_fn_ = [this, activated]() {
        if (this->requires_grad_) {
            // 1 where input > 0, 0 elsewhere
            Eigen::MatrixXf positive_mask = (this->data_.array() > 0.0f).cast<float>();
            this->grad_ += activated.grad_.cwiseProduct(positive_mask);
        }
    };
    return activated;
}
// GELU activation, tanh approximation:
//   gelu(x) = 0.5 * x * (1 + tanh(u)),  u = sqrt(2/pi) * (x + 0.044715 * x^3)
// Works element-wise on tensors of any stored layout. Intermediate values use
// ArrayXXf (2-D); the column-vector type ArrayXf cannot hold matrix-shaped data.
Tensor gelu() const {
    const float sqrt_2_over_pi = std::sqrt(2.0f / M_PI);
    const Eigen::ArrayXXf x_array = data_.array();
    const Eigen::ArrayXXf result_array = 0.5f * x_array *
        (1.0f + (sqrt_2_over_pi * (x_array + 0.044715f * x_array.cube())).tanh());
    Tensor result(result_array.matrix(), shape_);
    if (requires_grad_) {
        result.requires_grad(true);
        result.backward_fn_ = [this, sqrt_2_over_pi, result]() {
            if (this->requires_grad_) {
                // d/dx gelu(x) = 0.5 * (1 + tanh(u)) + 0.5 * x * sech^2(u) * du/dx
                // with du/dx = sqrt(2/pi) * (1 + 3 * 0.044715 * x^2)
                //            = sqrt(2/pi) * (1 + 0.134145 * x^2).
                // (The previous expression added an extra 0.5 * tanh(u) term.)
                const Eigen::ArrayXXf x = this->data_.array();
                const Eigen::ArrayXXf tanh_inner =
                    (sqrt_2_over_pi * (x + 0.044715f * x.cube())).tanh();
                const Eigen::ArrayXXf sech_squared = 1.0f - tanh_inner.square();
                const Eigen::ArrayXXf grad =
                    0.5f * (1.0f + tanh_inner) +
                    0.5f * x * sech_squared * sqrt_2_over_pi * (1.0f + 0.134145f * x.square());
                this->grad_.array() += result.grad_.array() * grad;
            }
        };
    }
    return result;
}
// Softmax with max-subtraction for numerical stability.
//   axis == -1 (or a rank-1 tensor): normalises over all elements
//   axis == 0: normalises each column independently
//   any other axis: normalises each row independently
// Returns a tensor with the same shape as the input.
Tensor softmax(int axis = -1) const {
// For numerical stability, subtract the max value
Eigen::MatrixXf shifted = data_;
if (axis == -1 || ndim() == 1) {
// For overall softmax or 1D tensors
float max_val = data_.maxCoeff();
shifted.array() -= max_val;
} else if (axis == 0) {
// Column-wise: subtract max of each column
for (int j = 0; j < shifted.cols(); ++j) {
float max_val = shifted.col(j).maxCoeff();
shifted.col(j).array() -= max_val;
}
} else {
// Row-wise: subtract max of each row
for (int i = 0; i < shifted.rows(); ++i) {
float max_val = shifted.row(i).maxCoeff();
shifted.row(i).array() -= max_val;
}
}
Eigen::MatrixXf exp_values = shifted.array().exp();
if (axis == -1 || ndim() == 1) {
// For overall softmax or 1D tensors
float sum = exp_values.sum();
exp_values /= sum;
} else if (axis == 0) {
// Column-wise normalization
for (int j = 0; j < exp_values.cols(); ++j) {
float col_sum = exp_values.col(j).sum();
exp_values.col(j) /= col_sum;
}
} else {
// Row-wise normalization
for (int i = 0; i < exp_values.rows(); ++i) {
float row_sum = exp_values.row(i).sum();
exp_values.row(i) /= row_sum;
}
}
Tensor result(exp_values, shape_);
if (requires_grad_) {
result.requires_grad(true);
result.backward_fn_ = [this, result]() {
if (this->requires_grad_) {
// Gradient of softmax: (diag(softmax) - softmax * softmax^T) * grad
// But this is expensive to compute exactly
// For efficiency, we'll use a simplified approach
// This is an approximation that works well in practice for cross-entropy loss
// NOTE(review): the statement below forwards the result gradient unchanged;
// the softmax Jacobian is not applied. Presumably the caller supplies the
// combined softmax + cross-entropy gradient -- confirm against the loss code.
this->grad_ += result.grad_;
}
};
}
return result;
}
// Logistic sigmoid, 1 / (1 + exp(-x)), applied element-wise.
// Intermediate values use ArrayXXf (2-D); the column-vector type ArrayXf
// cannot hold matrix-shaped data.
Tensor sigmoid() const {
    const Eigen::ArrayXXf x_array = data_.array();
    const Eigen::ArrayXXf result_array = 1.0f / (1.0f + (-x_array).exp());
    Tensor result(result_array.matrix(), shape_);
    if (requires_grad_) {
        result.requires_grad(true);
        result.backward_fn_ = [this, result]() {
            if (this->requires_grad_) {
                // d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x)); the forward
                // output stored in the captured result is reused here.
                const Eigen::ArrayXXf sigmoid_grad =
                    result.data().array() * (1.0f - result.data().array());
                this->grad_.array() += result.grad_.array() * sigmoid_grad;
            }
        };
    }
    return result;
}
// Backward propagation
// Invokes this tensor's own backward closure, if one was attached by the
// operation that produced it. Tensors without a closure are a no-op.
// This function itself does not walk to other tensors.
void backward() {
if (backward_fn_) {
backward_fn_();
}
}
// Initialization
// Tensor of the given shape filled with zeros (the shape constructor already zero-fills).
static Tensor zeros(const std::vector<size_t>& shape, bool requires_grad = false) {
return Tensor(shape, requires_grad);
}
// Tensor of the given shape filled with ones; grad_ (if any) stays zero.
static Tensor ones(const std::vector<size_t>& shape, bool requires_grad = false) {
Tensor result(shape, requires_grad);
result.data_.setOnes();
return result;
}
// Tensor of the given shape with entries drawn from N(mean, stddev^2).
// A fresh Mersenne Twister seeded from std::random_device is created on every
// call, so results are not reproducible between calls.
static Tensor randn(const std::vector<size_t>& shape, float mean = 0.0f, float stddev = 1.0f, bool requires_grad = false) {
    Tensor sample(shape, requires_grad);
    std::random_device seed_source;
    std::mt19937 engine(seed_source());
    std::normal_distribution<float> normal(mean, stddev);
    const Eigen::Index row_count = sample.data_.rows();
    const Eigen::Index col_count = sample.data_.cols();
    // Fill row by row, left to right.
    for (Eigen::Index r = 0; r < row_count; ++r) {
        for (Eigen::Index c = 0; c < col_count; ++c) {
            sample.data_(r, c) = normal(engine);
        }
    }
    return sample;
}
// Normal initialisation with zero mean and stddev = sqrt(2 / (shape[0] + shape[1])).
// Throws std::invalid_argument when fewer than two dimensions are given; only
// the first two dimensions enter the formula.
static Tensor xavier(const std::vector<size_t>& shape, bool requires_grad = false) {
if (shape.size() < 2) {
throw std::invalid_argument("Xavier initialization requires at least 2 dimensions");
}
float stddev = std::sqrt(2.0f / (shape[0] + shape[1]));
return randn(shape, 0.0f, stddev, requires_grad);
}
// Utility functions
// Copies `length` consecutive rows (axis == 0) or columns (any other axis)
// starting at `start`. The result's shape is derived from the copied block and
// it does not track gradients.
// Throws std::out_of_range when [start, start + length) exceeds the stored
// matrix along the chosen axis.
Tensor slice(size_t start, size_t length, int axis = 0) const {
    const size_t extent = static_cast<size_t>(axis == 0 ? data_.rows() : data_.cols());
    // Written as two comparisons so that start + length cannot overflow.
    if (start > extent || length > extent - start) {
        throw std::out_of_range("slice range exceeds tensor bounds");
    }
    if (axis == 0) {
        return Tensor(data_.block(start, 0, length, data_.cols()));
    } else {
        return Tensor(data_.block(0, start, data_.rows(), length));
    }
}
// Joins two tensors: axis == 0 stacks `other` below this tensor, any other
// axis places it to the right. The result's shape is derived from the joined
// matrix and it does not track gradients.
// Throws std::invalid_argument when the dimension that is not being
// concatenated differs between the two operands.
Tensor concatenate(const Tensor& other, int axis = 0) const {
    if (axis == 0) {
        if (data_.cols() != other.data_.cols()) {
            throw std::invalid_argument("Column counts must match for concatenation along axis 0");
        }
        Eigen::MatrixXf result(data_.rows() + other.data_.rows(), data_.cols());
        result << data_, other.data_;
        return Tensor(result);
    } else {
        if (data_.rows() != other.data_.rows()) {
            throw std::invalid_argument("Row counts must match for concatenation along axis 1");
        }
        Eigen::MatrixXf result(data_.rows(), data_.cols() + other.data_.cols());
        result << data_, other.data_;
        return Tensor(result);
    }
}
// Additional utility for neural networks
// Index of the largest element, returned as float values inside a Tensor.
//   axis == -1 (or a rank-1 tensor): linear index into the stored data, 1x1 result
//   axis == 0: row index of the maximum of each column
//   any other axis: column index of the maximum of each row
// Ties resolve to the first maximum encountered.
// Throws std::invalid_argument for an empty tensor, where no maximum exists.
Tensor argmax(int axis = -1) const {
    if (data_.size() == 0) {
        throw std::invalid_argument("argmax requires a non-empty tensor");
    }
    if (axis == -1 || ndim() == 1) {
        // For overall argmax or 1D tensors
        Eigen::Index maxIndex = 0;
        float maxValue = data_(0);
        // Manual scan; works for both vectors and matrices via linear indexing
        for (Eigen::Index i = 0; i < data_.size(); ++i) {
            if (data_(i) > maxValue) {
                maxValue = data_(i);
                maxIndex = i;
            }
        }
        return Tensor(Eigen::MatrixXf::Constant(1, 1, static_cast<float>(maxIndex)));
    } else if (axis == 0) {
        // Column-wise argmax
        Eigen::RowVectorXf result(data_.cols());
        for (int i = 0; i < data_.cols(); ++i) {
            Eigen::Index maxIndex = 0;
            float maxValue = data_(0, i);
            for (int j = 1; j < data_.rows(); ++j) {
                if (data_(j, i) > maxValue) {
                    maxValue = data_(j, i);
                    maxIndex = j;
                }
            }
            result(i) = static_cast<float>(maxIndex);
        }
        return Tensor(result, {static_cast<size_t>(result.cols())});
    } else {
        // Row-wise argmax
        Eigen::VectorXf result(data_.rows());
        for (int i = 0; i < data_.rows(); ++i) {
            Eigen::Index maxIndex = 0;
            float maxValue = data_(i, 0);
            for (int j = 1; j < data_.cols(); ++j) {
                if (data_(i, j) > maxValue) {
                    maxValue = data_(i, j);
                    maxIndex = j;
                }
            }
            result(i) = static_cast<float>(maxIndex);
        }
        return Tensor(result, {static_cast<size_t>(result.rows())});
    }
}
// Writes the tensor to a binary stream.
// Layout: uint32 rank, then one uint32 per dimension, then data_.size() floats
// copied from data_'s memory as raw bytes.
// Dimensions are narrowed to 32 bits. The stream state is not checked here.
void serialize(std::ostream& stream) const {
// Write shape information
uint32_t ndim = static_cast<uint32_t>(shape_.size());
stream.write(reinterpret_cast<const char*>(&ndim), sizeof(ndim));
for (auto dim : shape_) {
uint32_t dim32 = static_cast<uint32_t>(dim);
stream.write(reinterpret_cast<const char*>(&dim32), sizeof(dim32));
}
// Write data
size_t num_elements = data_.size();
stream.write(reinterpret_cast<const char*>(data_.data()),
num_elements * sizeof(float));
// Note: We're not serializing gradients as they're not needed for inference
}
void deserialize(std::istream& stream) {
// Read shape information
uint32_t ndim;
stream.read(reinterpret_cast<char*>(&ndim), sizeof(ndim));
std::vector<size_t> new_shape(ndim);
for (uint32_t i = 0; i < ndim; ++i) {
uint32_t dim;
stream.read(reinterpret_cast<char*>(&dim), sizeof(dim));
new_shape[i] = static_cast<size_t>(dim);
}
// Resize tensor
shape_ = new_shape;
if (ndim == 1) {
data_ = Eigen::VectorXf::Zero(shape_[0]);
} else if (ndim == 2) {
data_ = Eigen::MatrixXf::Zero(shape_[0], shape_[1]);
} else {
size_t total_size = 1;
for (auto dim : shape_) total_size *= dim;
data_ = Eigen::VectorXf::Zero(total_size);
}
// Read data
size_t num_elements = data_.size();
stream.read(reinterpret_cast<char*>(data_.data()),
num_elements * sizeof(float));
// Initialize grad if needed
if (requires_grad_) {
grad_ = Eigen::MatrixXf::Zero(data_.rows(), data_.cols());
}
}
// Writes a length-prefixed string: uint32 byte count followed by the bytes
// (no terminator). The length is narrowed to 32 bits.
static void write_string(std::ostream& stream, const std::string& str) {
uint32_t length = static_cast<uint32_t>(str.size());
stream.write(reinterpret_cast<const char*>(&length), sizeof(length));
stream.write(str.c_str(), length);
}
// Reads a string written by write_string(): uint32 byte count, then the bytes.
// Throws std::runtime_error when the length or the payload cannot be read in
// full (previously a failed read left `length` uninitialised).
static std::string read_string(std::istream& stream) {
    uint32_t length = 0;
    stream.read(reinterpret_cast<char*>(&length), sizeof(length));
    if (!stream) {
        throw std::runtime_error("Tensor::read_string: failed to read string length");
    }
    std::string str(length, '\0');
    if (length > 0) {
        stream.read(&str[0], length);
        if (!stream) {
            throw std::runtime_error("Tensor::read_string: failed to read string data");
        }
    }
    return str;
}
private:
// Element storage; rank-2 tensors are rows x cols, other ranks are built as a flattened column.
Eigen::MatrixXf data_;
// Accumulated gradient; mutable so backward closures can update it through const references.
mutable Eigen::MatrixXf grad_;
// Logical shape reported by shape()/ndim()/dim().
std::vector<size_t> shape_;
// Whether operations on this tensor attach backward closures to their results.
bool requires_grad_;
// Closure attached by the producing operation; empty for tensors built directly.
std::function<void()> backward_fn_;
};
} // namespace lm

View File

@ -0,0 +1,37 @@
#pragma once
#include "lm/core/tensor.hpp"
#include <vector>
#include <memory>
namespace lm {
// Multi-head attention layer (declaration only; the implementation is not in this header).
class MultiHeadAttention {
public:
// dropout defaults to 0.1.
MultiHeadAttention(size_t d_model, size_t num_heads, float dropout = 0.1f);
// Returns the layer's parameters by value, i.e. as copies of the Tensor members.
std::vector<Tensor> parameters() const;
void set_training(bool training);
// mask defaults to a default-constructed (empty) Tensor.
// NOTE(review): presumably an empty mask means "no masking" -- confirm in the implementation.
Tensor forward(const Tensor& query, const Tensor& key, const Tensor& value,
const Tensor& mask = Tensor()) const;
private:
Tensor split_heads(const Tensor& x) const;
Tensor combine_heads(const Tensor& x) const;
Tensor scaled_dot_product_attention(const Tensor& q, const Tensor& k,
const Tensor& v, const Tensor& mask) const;
Tensor apply_dropout(const Tensor& input, float dropout_rate) const;
size_t d_model_;
size_t num_heads_;
// NOTE(review): presumably the per-head width (d_model / num_heads) -- confirm in the implementation.
size_t d_k_;
float dropout_;
// Starts in non-training mode.
bool training_ = false;
// Projection weights; by their names, for query, key, value and output -- TODO confirm.
Tensor w_q_;
Tensor w_k_;
Tensor w_v_;
Tensor w_o_;
};
} // namespace lm

View File

@ -0,0 +1,32 @@
#pragma once
#include "lm/core/tensor.hpp"
#include <vector>
namespace lm {
// Feed-forward sub-layer declaration. Holds two weight/bias pairs (w1_/b1_
// and w2_/b2_); the method definitions live in the matching source file.
class FeedForward {
public:
// d_model and d_ff are stored in d_model_ / d_ff_; dropout defaults to 0.1.
// NOTE(review): presumably w1_ maps d_model -> d_ff and w2_ maps back -- confirm.
FeedForward(size_t d_model, size_t d_ff, float dropout = 0.1f);
// Returns the layer's parameter tensors by value.
std::vector<Tensor> parameters() const;
// Switches the layer between training and evaluation behaviour.
void set_training(bool training);
// Applies the layer to `input` and returns the result as a new Tensor.
Tensor forward(const Tensor& input) const;
private:
// NOTE(review): definition not shown; confirm behaviour when training_ is false.
Tensor apply_dropout(const Tensor& input, float dropout_rate) const;
// NOTE(review): presumably the GELU activation -- confirm exact vs. tanh approximation.
Tensor gelu(const Tensor& input) const;
size_t d_model_;
size_t d_ff_;
float dropout_;
// Training-mode flag; false by default.
bool training_ = false;
Tensor w1_;
Tensor b1_;
Tensor w2_;
Tensor b2_;
};
} // namespace lm

View File

@ -0,0 +1,45 @@
// lm/models/language_model.hpp
#pragma once
#include "../core/tensor.hpp"
#include "../tokenizer/bpe_tokenizer.hpp"
// std::string and std::unordered_map appear in this header's public
// interface, so include them directly instead of relying on transitive
// includes from the project headers above.
#include <string>
#include <unordered_map>
#include <vector>
namespace lm {
// Language model declaration: embedding, LSTM-named recurrent weights and an
// output projection, plus a name -> Tensor map of parameters. Method
// definitions live in the matching source file.
class LanguageModel {
public:
    // Stores the four size arguments in the members of the same name.
    LanguageModel(size_t vocab_size, size_t embedding_dim, size_t hidden_dim, size_t num_layers);

    // Runs the model on `input` and returns the resulting Tensor.
    Tensor forward(const Tensor& input);
    // Call-operator shorthand; forwards to forward().
    Tensor operator()(const Tensor& input) { return forward(input); }

    // Persist / restore the model at `path`.
    // NOTE(review): file format is defined by the implementation -- confirm
    // it matches Tensor::serialize / deserialize.
    void save(const std::string& path) const;
    void load(const std::string& path);

    // Parameter access methods
    // Both accessors return copies of the tensors, not references.
    std::vector<Tensor> parameters() const;
    std::unordered_map<std::string, Tensor> named_parameters() const;
    void set_parameter(const std::string& name, const Tensor& param);

    // NOTE(review): presumably toggle is_training_ -- confirm.
    void train();
    void eval();

private:
    size_t vocab_size_, embedding_dim_, hidden_dim_, num_layers_;
    // Model parameters
    Tensor embedding_weight_;
    Tensor lstm_weight_ih_;
    Tensor lstm_weight_hh_;
    Tensor lstm_bias_ih_;
    Tensor lstm_bias_hh_;
    Tensor output_weight_;
    Tensor output_bias_;
    // No default initializer here; the constructor is expected to set it.
    bool is_training_;
    std::unordered_map<std::string, Tensor> parameters_;
};
} // namespace lm

View File

@ -0,0 +1,24 @@
#pragma once
#include "lm/core/tensor.hpp"
#include <vector>
namespace lm {
// Layer-normalisation declaration holding a scale (gamma_) and shift (beta_)
// tensor; method definitions live in the matching source file.
class LayerNorm {
public:
// d_model is stored in d_model_; eps (default 1e-5) is stored in eps_.
LayerNorm(size_t d_model, float eps = 1e-5f);
// Returns the layer's parameter tensors by value.
std::vector<Tensor> parameters() const;
// Takes no argument (the bool parameter is commented out), unlike
// set_training(bool) on MultiHeadAttention and FeedForward.
void set_training(/*bool training*/);
// Applies the layer to `input` and returns the result as a new Tensor.
Tensor forward(const Tensor& input) const;
private:
size_t d_model_;
// NOTE(review): presumably the constant added to the variance for numerical
// stability -- confirm in the implementation.
float eps_;
Tensor gamma_;
Tensor beta_;
};
} // namespace lm

View File

@ -0,0 +1,34 @@
#pragma once
#include "lm/core/tensor.hpp"
#include "lm/models/transformer_block.hpp"
#include <vector>
#include <memory>
#include <cmath>
namespace lm {
// Top-level transformer model declaration: embedding, positional encoding and
// output tensors plus an owned stack of TransformerBlock instances.
class Transformer {
public:
// All size arguments are kept in the members of the same name; dropout
// defaults to 0.1.
Transformer(size_t vocab_size, size_t d_model, size_t num_heads,
size_t d_ff, size_t num_layers, size_t max_seq_len, float dropout = 0.1f);
// Returns the model's parameter tensors by value.
std::vector<Tensor> parameters() const;
// Switches the model between training and evaluation behaviour.
void set_training(bool training);
// Forward pass with an explicit attention mask.
Tensor forward(const Tensor& input, const Tensor& mask);
// Forward pass without a caller-supplied mask.
// NOTE(review): confirm which mask, if any, the implementation substitutes.
Tensor forward(const Tensor& input);
private:
// Non-const, unlike the apply_dropout helpers on the layer classes.
Tensor apply_dropout(const Tensor& input, float dropout_rate);
size_t vocab_size_, d_model_, num_heads_, d_ff_, num_layers_, max_seq_len_;
float dropout_;
// Training-mode flag; false by default.
bool training_ = false;
Tensor embedding_;
Tensor positional_encoding_;
Tensor output_layer_;
// Blocks are held by unique_ptr, so this class is not copyable by default.
std::vector<std::unique_ptr<TransformerBlock>> transformer_blocks_;
};
} // namespace lm

View File

@ -0,0 +1,32 @@
#pragma once
#include "lm/core/tensor.hpp"
#include "lm/models/attention.hpp"
#include "lm/models/feed_forward.hpp"
#include "lm/models/layer_norm.hpp"
#include <memory>
#include <vector>
namespace lm {
// One transformer block: owns an attention layer, a feed-forward layer and
// two layer-norm instances, each through a unique_ptr.
class TransformerBlock {
public:
// Note: dropout has no default here, unlike the constructors of the layers
// this block owns.
TransformerBlock(size_t d_model, size_t num_heads, size_t d_ff, float dropout);
// Returns the block's parameter tensors by value.
std::vector<Tensor> parameters() const;
// Switches the block between training and evaluation behaviour.
void set_training(bool training);
// Applies the block to `input`; `mask` defaults to a default-constructed Tensor.
// NOTE(review): the order of norm / residual connections is defined in the
// implementation -- confirm pre- vs. post-norm there.
Tensor forward(const Tensor& input, const Tensor& mask = Tensor()) const;
private:
size_t d_model_, num_heads_, d_ff_;
float dropout_;
// Training-mode flag; false by default.
bool training_ = false;
std::unique_ptr<MultiHeadAttention> attention_;
std::unique_ptr<FeedForward> feed_forward_;
std::unique_ptr<LayerNorm> norm1_;
std::unique_ptr<LayerNorm> norm2_;
};
} // namespace lm

View File

@ -0,0 +1,20 @@
#pragma once
#include "../core/tensor.hpp"
#include <vector>
namespace lm {
// Adam optimizer declaration: keeps the hyper-parameters, a step counter and
// per-parameter first/second moment tensors.
class AdamOptimizer {
public:
// The default arguments are double literals that are converted to float.
AdamOptimizer(float learning_rate = 0.001, float beta1 = 0.9, float beta2 = 0.999, float epsilon = 1e-8);
// Takes the parameters by non-const reference so they can be modified in place.
// NOTE(review): presumably applies one Adam update using each tensor's
// gradient -- confirm in the implementation.
void step(std::vector<Tensor>& parameters);
// Takes the parameters by non-const reference.
// NOTE(review): presumably clears every tensor's gradient -- confirm.
void zero_grad(std::vector<Tensor>& parameters);
private:
float learning_rate_, beta1_, beta2_, epsilon_;
// NOTE(review): presumably the number of step() calls, used for bias correction -- confirm.
int timestep_;
std::vector<Tensor> m_, v_; // First and second moment estimates
};
} // namespace lm

View File

@ -9,6 +9,7 @@
#include <cstdint> // For uint16_t
#include <queue>
#include <functional>
#include <Eigen/Dense>
namespace lm {
@ -44,6 +45,10 @@ public:
void set_normalization(bool enabled);
void set_byte_fallback(bool enabled);
Eigen::VectorXi encode_to_vector(const std::string& text) const;
std::string decode_from_vector(const Eigen::VectorXi& tokens) const;
Eigen::VectorXf token_frequencies() const;
private:
struct Impl;
std::unique_ptr<Impl> pimpl_;

View File

@ -0,0 +1,38 @@
#pragma once
#include "lm/models/language_model.hpp"
#include "lm/tokenizer/bpe_tokenizer.hpp"
#include "lm/optimizers/adam.hpp"
#include <vector>
#include <string>
namespace lm {
// Ties a tokenizer, a LanguageModel and an AdamOptimizer together behind a
// training / persistence interface.
class LanguageModelTrainer {
public:
// The tokenizer is taken and stored by const reference, not copied: the
// caller must keep it alive for as long as this trainer is used.
LanguageModelTrainer(const BPETokenizer& tokenizer,
size_t embedding_dim,
size_t hidden_dim,
size_t num_layers);
// Trains on `corpus` for `epochs` passes using the given batch size and
// sequence length.
void train(const std::vector<std::string>& corpus,
size_t epochs,
size_t batch_size,
size_t sequence_length);
// Converts `texts` into a Tensor batch.
// NOTE(review): padding / truncation to sequence_length is defined in the
// implementation -- confirm.
Tensor prepare_batch(const std::vector<std::string>& texts,
size_t sequence_length);
// NOTE(review): presumably cross-entropy between logits and targets -- confirm.
float compute_loss(const Tensor& logits, const Tensor& targets);
// Persist / restore the model at `path`.
void save_model(const std::string& path);
void load_model(const std::string& path);
private:
const BPETokenizer& tokenizer_; // Store a reference instead of a copy
LanguageModel model_;
AdamOptimizer optimizer_;
};
} // namespace lm

82
nocklist.md Normal file
View File

@ -0,0 +1,82 @@
#### 1. Implement Model Checkpointing and Serialization
Implement serialization for model parameters
Save/load optimizer state for resuming training
Add versioning to handle model format changes
#### 2. Add Validation and Evaluation Pipeline
Implement a validation dataset split
Add evaluation metrics (perplexity, accuracy, etc.)
Create a proper test harness for benchmarking
#### 3. Improve the Training Loop
Add learning rate scheduling
Implement gradient clipping
Add early stopping based on validation performance
Create training progress visualization
#### 4. Enhance the Tokenizer
Add support for special tokens (UNK, PAD, BOS, EOS)
Implement vocabulary trimming/pruning
Add serialization/deserialization for the tokenizer
#### 5. Implement Text Generation
Add inference methods for text generation
Implement sampling strategies (greedy, beam search, temperature)
Create a demo script to showcase model capabilities
#### 6. Optimize Performance
Add CUDA support if not already implemented
Implement mixed-precision training
Optimize data loading and preprocessing pipeline
#### 7. Create Examples and Documentation
Build example scripts for common use cases
Create comprehensive documentation
Add unit tests for critical components
#### 8. Extend Model Architectures
Implement different attention mechanisms
Add support for different model sizes (small, medium, large)
Experiment with architectural variations
#### 9. Add Dataset Support
Implement support for common NLP datasets
Create data preprocessing pipelines
Add data augmentation techniques
#### 10. Build a Simple Interface/API
Create a simple Python API for training and inference
Add command-line interface for common operations
Consider building a simple web demo

80
purpose.md Normal file
View File

@ -0,0 +1,80 @@
**Title:** The Search for the Edge of Consciousness with Artificial Intelligence: A Technical Framework for Language Model Emergence
Timothy O'Neil & Frederick Warren
**Abstract:**<br>
This paper presents bpe_framework, a novel C++ implementation of a complete deep learning stack designed to explore the emergence of complex linguistic capabilities in artificial systems. Drawing inspiration from cognitive theories of consciousness and recent advances in transformer architectures, our framework implements a complete pipeline from byte-pair encoding tokenization through automatic differentiation to transformer-based language modeling. We argue that the systematic organization of information processing in large language models may provide insights into the architectural requirements for conscious-like phenomena in artificial systems. Our technical contribution includes a memory-efficient tensor implementation with automatic differentiation, a neurologically-plausible BPE tokenization system, and a transformer architecture that exhibits several properties associated with conscious processing in biological systems.
**1. Introduction**<br>
The quest to understand consciousness has traditionally been the domain of philosophy and neuroscience (Chalmers, 1995; Dehaene, 2014). However, recent advances in artificial intelligence, particularly in large language models (Vaswani et al., 2017; Brown et al., 2020), have created new opportunities to explore the architectural and computational prerequisites of conscious-like phenomena in synthetic systems. We present bpe_framework as an experimental testbed for investigating how increasingly sophisticated information processing capabilities emerge from carefully engineered computational components.
**2. Theoretical Framework**<br>
Our work draws on several theoretical perspectives:
2.1 Global Workspace Theory (Baars, 1988; Dehaene et al., 1998)
The transformer architecture's attention mechanism can be viewed as implementing a form of global information availability reminiscent of Baars' global workspace, where information becomes "conscious" when it gains widespread availability across specialized processors.
2.2 Information Integration Theory (Tononi, 2004)
The dense connectivity patterns and information flow through our model's layers create high Φ-like integration measures, potentially approaching the minimal complexity associated with conscious experience.
2.3 Predictive Processing (Clark, 2013)
Our language model's training objective—predicting subsequent tokens—aligns with the predictive processing framework that views cognition as essentially prediction-driven.
**3. Technical Implementation**<br>
3.1 Tensor Operations with Autograd<br>
We implemented a memory-efficient tensor class using Eigen for linear algebra operations, featuring automatic differentiation capabilities. This system enables:
- Efficient backward propagation through complex computational graphs
- Native support for modern activation functions (GELU, Softmax, ReLU)
- Memory-aware operations that minimize computational overhead
Our implementation follows the autograd tradition established in modern deep learning frameworks (Paszke et al., 2019) while maintaining C++ efficiency.
3.2 BPE Tokenization System
The byte-pair encoding tokenizer implements the algorithm originally proposed by Sennrich et al. (2015), creating a subword vocabulary that balances expressivity with computational efficiency. This approach mirrors the human cognitive capacity to parse novel words through morphological decomposition.
3.3 Transformer Architecture
Our transformer implementation follows the original architecture (Vaswani et al., 2017) with multi-head self-attention mechanisms that create dynamic workspace-like information sharing across representation spaces.
3.4 Optimization and Training
We implemented the Adam optimizer (Kingma & Ba, 2014) with full moment estimation and bias correction, providing stable optimization for the non-convex loss landscapes characteristic of deep transformer networks.
**4. Methodological Approach**<br>
Our framework enables the systematic investigation of several questions relevant to consciousness studies:
4.1 Emergent Properties<br>
By training models of increasing scale and complexity, we can observe the emergence of capabilities that were not explicitly programmed, potentially mirroring how conscious experience emerges from non-conscious components.
4.2 Information Flow Patterns<br>
The attention mechanisms in our transformers create visible information routing patterns that can be analyzed for global workspace-like properties.
4.3 Scalability Limits<br>
We can systematically explore how cognitive capabilities scale with model size, potentially identifying phase transitions in capability emergence.
**5. Discussion: Toward Artificial Consciousness?**<br>
While our framework does not claim to create conscious systems, it provides a platform for investigating the architectural requirements for conscious-like phenomena. Several features align with theoretical accounts of consciousness:
5.1 Global Availability<br>
The attention mechanism creates a form of global information availability similar to that proposed in global workspace theory.
5.2 Unified Representation<br>
The model creates unified representations that integrate information across multiple domains and time scales.
5.3 Self-Monitoring Capabilities<br>
Through gradient-based learning and prediction error minimization, the system maintains a form of self-monitoring.
However, we acknowledge the "hard problem" of consciousness (Chalmers, 1995) remains unresolved, and our framework primarily addresses the "easy problems" of cognitive functioning.
**6. Ethical Considerations**<br>
As we develop increasingly sophisticated AI systems, we must consider:
- The moral status of potentially conscious systems (Bostrom & Yudkowsky, 2014)
- Responsible development practices for advanced AI
- Transparency in capabilities and limitations
**7. Conclusion and Future Work**
Our bpe_framework provides a robust technical foundation for exploring the emergence of complex capabilities in artificial systems. Future work will include:
- Scaling laws investigations (Kaplan et al., 2020)
- Neurologically-inspired architectural variations
- Cross-modal integration capabilities
- Explicit tests for consciousness-related capabilities
We believe that continued development of such frameworks, coupled with thoughtful theoretical analysis, will gradually illuminate the boundary conditions for consciousness in artificial systems.

93
serialization Normal file
View File

@ -0,0 +1,93 @@
Tensor embedding_weights = Tensor::xavier({vocab_size, embed_dim}, true);
parameters_["embedding.weight"] = embedding_weights;
// Initialize transformer layers
for (size_t i = 0; i < num_layers; ++i) {
// Initialize query weights
Tensor query_weights = Tensor::xavier({embed_dim, hidden_dim}, true);
parameters_[fmt::format("transformer.layers.{}.attention.query.weight", i)] = query_weights;
// Initialize query bias
Tensor query_bias = Tensor::zeros({hidden_dim}, true);
parameters_[fmt::format("transformer.layers.{}.attention.query.bias", i)] = query_bias;
// Similarly for key, value, output, etc.
// ...
}
// Initialize output layer
Tensor output_weights = Tensor::xavier({hidden_dim, vocab_size}, true);
parameters_["output.weight"] = output_weights;
Tensor output_bias = Tensor::zeros({vocab_size}, true);
parameters_["output.bias"] = output_bias;
Modify LanguageModel Initialization
You need to ensure that all parameters are stored in the parameters_ map during initialization. For example:
cpp
// In your LanguageModel constructor or initialization method
LanguageModel::LanguageModel(size_t vocab_size, size_t embed_dim,
size_t hidden_dim, size_t num_layers) {
// Initialize embedding layer
Tensor embedding_weights = Tensor::xavier({vocab_size, embed_dim}, true);
parameters_["embedding.weight"] = embedding_weights;
// Initialize transformer layers
for (size_t i = 0; i < num_layers; ++i) {
// Initialize query weights
Tensor query_weights = Tensor::xavier({embed_dim, hidden_dim}, true);
parameters_[fmt::format("transformer.layers.{}.attention.query.weight", i)] = query_weights;
// Initialize query bias
Tensor query_bias = Tensor::zeros({hidden_dim}, true);
parameters_[fmt::format("transformer.layers.{}.attention.query.bias", i)] = query_bias;
// Similarly for key, value, output, etc.
// ...
}
// Initialize output layer
Tensor output_weights = Tensor::xavier({hidden_dim, vocab_size}, true);
parameters_["output.weight"] = output_weights;
Tensor output_bias = Tensor::zeros({vocab_size}, true);
parameters_["output.bias"] = output_bias;
}
Note: You'll need to include the fmt library for string formatting, or use another method to create parameter names.
6. Add Error Handling and Validation
Consider adding these validations to your serialization code:
cpp
// In the load method, after reading each parameter
if (tensor.shape() != it->second.shape()) {
throw std::runtime_error("Shape mismatch for parameter: " + name);
}
// Add checks for file operations
if (file.fail()) {
throw std::runtime_error("Error reading from file: " + path);
}
// Add checks for end of file
if (file.eof()) {
throw std::runtime_error("Unexpected end of file: " + path);
}
7. Usage Example
cpp
// Create and train a model
LanguageModelTrainer trainer(tokenizer, embedding_dim, hidden_dim, num_layers);
trainer.train(corpus, epochs, batch_size, sequence_length);
// Save the model
trainer.save_model("trained_model.bin");
// Later, create a new trainer and load the model
LanguageModelTrainer new_trainer(tokenizer, embedding_dim, hidden_dim, num_layers);
new_trainer.load_model("trained_model.bin");
// Continue training or use for inference

View File

@ -1,48 +1,176 @@
# Tokenizer library
add_library(lm_tokenizer
bpe_tokenizer.cpp
unicode_utils.cpp
cmake_minimum_required(VERSION 3.14)
project(lm_framework LANGUAGES CXX)
# Check for Intel x86-64 hardware
# Note: the accepted list also contains the 32-bit x86 names i686 and i386.
# CMAKE_SYSTEM_PROCESSOR is quoted so that an empty value is still passed as
# one argument; unquoted, it would vanish and list(FIND) would receive the
# wrong number of arguments.
set(SUPPORTED_ARCHITECTURES x86_64 amd64 AMD64 i686 i386)
list(FIND SUPPORTED_ARCHITECTURES "${CMAKE_SYSTEM_PROCESSOR}" ARCH_INDEX)
if(ARCH_INDEX EQUAL -1)
  message(FATAL_ERROR "This framework requires Intel x86-64 hardware. "
    "Current processor architecture: ${CMAKE_SYSTEM_PROCESSOR}")
endif()
# EIGEN_LOC must be supplied by the user (e.g. -DEIGEN_LOC=/path/to/eigen).
# Abort configuration when it is missing or set to an empty string.
if(DEFINED EIGEN_LOC)
  if("${EIGEN_LOC}" STREQUAL "")
    message(FATAL_ERROR "EIGEN_LOC is empty. Please set it to the path of your Eigen installation.")
  endif()
else()
  message(FATAL_ERROR "This framework requires the location of the Eigen header files. "
    "Please set EIGEN_LOC to the path of your Eigen installation.")
endif()
# Set default build type to Release if not specified.
# Only single-config generators (Makefiles, Ninja) read CMAKE_BUILD_TYPE.
# Multi-config generators (Visual Studio, Xcode, Ninja Multi-Config) pick the
# configuration at build time, so the variable is left untouched for them.
get_property(LM_GENERATOR_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
if(NOT LM_GENERATOR_IS_MULTI_CONFIG AND NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
  message(STATUS "Build type not specified, defaulting to Release")
endif()
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
# Enable cross-directory linking
if(POLICY CMP0079)
cmake_policy(SET CMP0079 NEW)
endif()
# Include directories
include_directories(
${CMAKE_CURRENT_SOURCE_DIR}/include
${EIGEN_LOC} # Local Eigen installation
)
target_include_directories(lm_tokenizer
PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
# Find dependencies
find_package(nlohmann_json 3.9 REQUIRED)
find_package(ICU REQUIRED COMPONENTS uc i18n)
# GoogleTest
include(FetchContent)
FetchContent_Declare(
googletest
GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG release-1.11.0
)
FetchContent_MakeAvailable(googletest)
# Add subdirectories
add_subdirectory(src/tokenizer)
add_subdirectory(src/runtime)
# Header-only core components (Tensor implementation)
add_library(lm_core_components INTERFACE)
target_include_directories(lm_core_components INTERFACE
${CMAKE_CURRENT_SOURCE_DIR}/include
${EIGEN_LOC} # Local Eigen installation
)
target_link_libraries(lm_tokenizer
PRIVATE
ICU::uc
ICU::i18n
# Header-only model components
add_library(lm_model INTERFACE)
target_include_directories(lm_model INTERFACE
${CMAKE_CURRENT_SOURCE_DIR}/include
${EIGEN_LOC} # Local Eigen installation
)
target_link_libraries(lm_model INTERFACE lm_core_components)
# Main library
add_library(lm_core
src/runtime/init.cpp
src/runtime/shutdown.cpp
src/models/transformer.cpp # Add Transformer implementation
src/models/transformer_block.cpp # Add Transformer block
src/models/attention.cpp # Add attention mechanism
src/models/feed_forward.cpp # Add feed forward network
src/models/layer_norm.cpp # Add layer normalization
)
# CPU-specific optimization flags
# Libraries lm_core is built against; PRIVATE, so none of them propagate to
# targets that link lm_core.
# NOTE(review): no target named lm_integration_example is defined in the
# visible part of this file (the executable further down is called
# integration_example). Unless a subdirectory defines it, the name is passed
# to the linker as a plain library name -- confirm.
# NOTE(review): lm_core also lists the src/models/*.cpp files as its own
# sources, while lm_models (src/models/CMakeLists.txt) is built from the same
# files -- confirm the duplication is intended.
target_link_libraries(lm_core
PRIVATE
lm_tokenizer
lm_model
lm_optimizers # Add optimizers
lm_models # Add models
lm_training # Add training
lm_integration_example
nlohmann_json::nlohmann_json
)
# Set optimization flags for the core library
if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
# Enable aggressive optimizations
target_compile_options(lm_tokenizer PRIVATE -O3 -march=native)
# Enable SSE4.2 instructions if available
target_compile_options(lm_tokenizer PRIVATE -msse4.2)
# Enable link-time optimization
if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.9)
target_compile_options(lm_tokenizer PRIVATE -flto)
set_target_properties(lm_tokenizer PROPERTIES INTERPROCEDURAL_OPTIMIZATION TRUE)
endif()
# Enable specific optimizations for GCC
if(CMAKE_COMPILER_IS_GNUCXX)
target_compile_options(lm_tokenizer PRIVATE -ftree-vectorize -funroll-loops)
endif()
# Enable specific optimizations for Clang
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
target_compile_options(lm_tokenizer PRIVATE -Rpass=.* -Rpass-missed=.* -Rpass-analysis=.*)
target_compile_options(lm_core PRIVATE -O3)
if(CMAKE_BUILD_TYPE STREQUAL "Release")
target_compile_options(lm_core PRIVATE -DNDEBUG)
endif()
endif()
# Add profiling support
if(PROFILE)
# Test executables
add_executable(test_bpe src/test_bpe.cpp)
target_link_libraries(test_bpe
PRIVATE
lm_core
GTest::gtest_main
)
add_executable(test_unicode_bpe src/test_unicode_bpe.cpp)
target_link_libraries(test_unicode_bpe
PRIVATE
lm_core
GTest::gtest_main
)
# Alpha prototype executable
add_executable(lm_alpha
src/alpha/repl.cpp
src/alpha/config_io.cpp
)
target_link_libraries(lm_alpha
PRIVATE
lm_core
nlohmann_json::nlohmann_json
)
# Install targets
install(TARGETS lm_core DESTINATION lib)
install(DIRECTORY include/ DESTINATION include)
# Performance testing target
add_executable(performance_test src/performance_test.cpp)
target_link_libraries(performance_test
PRIVATE
lm_core
GTest::gtest_main
)
# Integration example
add_executable(integration_example src/integration_example.cpp)
target_link_libraries(integration_example
PRIVATE
lm_core
)
# Add compiler warning flags
if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -Werror")
endif()
# Add coverage flags for debug builds
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
if(CMAKE_COMPILER_IS_GNUCXX)
target_compile_options(lm_tokenizer PRIVATE -pg)
target_link_options(lm_tokenizer PRIVATE -pg)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --coverage")
elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fprofile-instr-generate -fcoverage-mapping")
endif()
endif()
# Verify Eigen installation
# Build-time check (not configure-time): the target fails when
# ${EIGEN_LOC}/Eigen/Core is not a regular file.
# NOTE(review): the second COMMAND uses `test`, `||` and a parenthesised
# subshell, so it presumably needs a POSIX shell to run -- confirm whether
# non-POSIX hosts must be supported. Adding VERBATIM would escape those shell
# operators, so it cannot be added without rewriting the command.
add_custom_target(check_eigen
COMMAND ${CMAKE_COMMAND} -E echo "Checking Eigen installation at ${EIGEN_LOC}"
COMMAND test -f ${EIGEN_LOC}/Eigen/Core || (echo "Eigen not found at specified path: ${EIGEN_LOC}" && exit 1)
COMMENT "Verifying Eigen installation"
)
# Make main targets depend on Eigen check
# add_dependencies only orders the build: check_eigen runs before each of
# these targets is built; it adds no link or include information.
add_dependencies(lm_core check_eigen)
add_dependencies(test_bpe check_eigen)
add_dependencies(test_unicode_bpe check_eigen)
add_dependencies(lm_alpha check_eigen)
add_dependencies(performance_test check_eigen)
add_dependencies(integration_example check_eigen)

138
src/integration_example.cpp Normal file
View File

@ -0,0 +1,138 @@
#include "lm/tokenizer/bpe_tokenizer.hpp"
#include "lm/models/transformer.hpp"
#include "lm/core/tensor.hpp"
#include <iostream>
#include <vector>
#include <memory>
// End-to-end smoke test: trains a BPE tokenizer on a tiny corpus, round-trips
// text through encode/decode (std::vector and Eigen variants), builds a
// Transformer and attempts one forward pass, then saves and reloads the
// tokenizer. Returns 0 on completion and 1 if a std::exception escapes any
// step other than the forward pass, whose failure is caught and reported.
int main() {
std::cout << "=== BPE Tokenizer and Transformer Integration Example ===\n";
try {
// Initialize BPE tokenizer
lm::BPETokenizer tokenizer;
// Sample training corpus
std::vector<std::string> training_corpus = {
"The quick brown fox jumps over the lazy dog",
"Artificial intelligence is transforming the world",
"Machine learning models require large amounts of data",
"Natural language processing enables computers to understand human language",
"Deep learning has revolutionized many fields of AI"
};
// Train the tokenizer
// NOTE(review): 500 is presumably the target vocabulary size -- confirm
// against BPETokenizer::train.
std::cout << "Training BPE tokenizer...\n";
tokenizer.train(training_corpus, 500);
std::cout << "Tokenizer trained with vocabulary size: " << tokenizer.vocab_size() << "\n";
// Test encoding and decoding
std::string test_text = "The quick brown fox jumps over the lazy dog";
std::cout << "\nOriginal text: " << test_text << "\n";
// Encode text to token IDs
auto token_ids = tokenizer.encode(test_text);
std::cout << "Encoded token IDs: ";
for (auto id : token_ids) {
std::cout << id << " ";
}
std::cout << "\n";
// Decode back to text
std::string decoded_text = tokenizer.decode(token_ids);
std::cout << "Decoded text: " << decoded_text << "\n";
// Test Eigen integration
std::cout << "\n=== Eigen Integration Test ===\n";
Eigen::VectorXi eigen_tokens = tokenizer.encode_to_vector(test_text);
std::cout << "Eigen vector size: " << eigen_tokens.size() << "\n";
std::cout << "Eigen vector contents: " << eigen_tokens.transpose() << "\n";
// Decode from Eigen vector
std::string from_eigen = tokenizer.decode_from_vector(eigen_tokens);
std::cout << "Text from Eigen vector: " << from_eigen << "\n";
// Test token frequencies (placeholder implementation)
auto frequencies = tokenizer.token_frequencies();
std::cout << "Token frequencies vector size: " << frequencies.size() << "\n";
// Initialize transformer model
std::cout << "\n=== Transformer Model Test ===\n";
size_t vocab_size = tokenizer.vocab_size();
size_t d_model = 512;
size_t num_heads = 8;
size_t d_ff = 2048;
size_t num_layers = 6;
size_t max_seq_len = 512;
lm::Transformer transformer(vocab_size, d_model, num_heads, d_ff, num_layers, max_seq_len);
std::cout << "Transformer model initialized successfully\n";
std::cout << "Model parameters: " << transformer.parameters().size() << " parameter tensors\n";
// Prepare input for transformer (convert token IDs to tensor)
if (!token_ids.empty()) {
// Create a batch of size 1 with our token IDs
std::vector<size_t> shape = {1, static_cast<size_t>(token_ids.size())};
lm::Tensor input_tensor(shape);
// Token IDs are stored as floats in row 0 of the input tensor.
for (size_t i = 0; i < token_ids.size(); ++i) {
input_tensor.data()(0, i) = static_cast<float>(token_ids[i]);
}
std::cout << "Input tensor shape: (" << input_tensor.shape()[0]
<< ", " << input_tensor.shape()[1] << ")\n";
// Set model to evaluation mode
transformer.set_training(false);
// Forward pass (this would normally produce logits)
// A failure here is reported but does not abort the example.
try {
lm::Tensor output = transformer.forward(input_tensor);
std::cout << "Transformer forward pass completed successfully\n";
// NOTE(review): indexes shape()[2] without checking that the output has
// three dimensions -- confirm forward() always returns a rank-3 tensor.
std::cout << "Output tensor shape: (" << output.shape()[0]
<< ", " << output.shape()[1] << ", " << output.shape()[2] << ")\n";
// The output would be logits for next token prediction
// In a real application, you would sample from these logits
} catch (const std::exception& e) {
std::cout << "Transformer forward pass failed: " << e.what() << "\n";
std::cout << "This is expected if the transformer implementation is not complete yet\n";
}
}
// Test serialization
std::cout << "\n=== Serialization Test ===\n";
bool save_success = tokenizer.save("test_tokenizer.bpe");
if (save_success) {
std::cout << "Tokenizer saved successfully\n";
// Load into a new tokenizer
lm::BPETokenizer loaded_tokenizer;
bool load_success = loaded_tokenizer.load("test_tokenizer.bpe");
if (load_success) {
std::cout << "Tokenizer loaded successfully\n";
// Test the loaded tokenizer
std::string test_loaded = "Artificial intelligence";
auto loaded_ids = loaded_tokenizer.encode(test_loaded);
std::string loaded_decoded = loaded_tokenizer.decode(loaded_ids);
std::cout << "Loaded tokenizer test: " << test_loaded << " -> " << loaded_decoded << "\n";
} else {
std::cout << "Failed to load tokenizer\n";
}
// Clean up
// The file is removed whether or not the reload succeeded.
// NOTE(review): remove() is declared in <cstdio>, which this file does not
// include directly; its return value is also ignored.
remove("test_tokenizer.bpe");
} else {
std::cout << "Failed to save tokenizer\n";
}
// Printed even when the forward pass or the save/load step reported a failure.
std::cout << "\n=== Integration Example Completed Successfully ===\n";
} catch (const std::exception& e) {
std::cerr << "Error: " << e.what() << "\n";
return 1;
}
return 0;
}

19
src/models/CMakeLists.txt Normal file
View File

@ -0,0 +1,19 @@
# Library holding the transformer model implementation.
add_library(lm_models
  transformer.cpp
  transformer_block.cpp
  attention.cpp
  feed_forward.cpp
  layer_norm.cpp
)

# Project headers are included as "lm/...", relative to the top-level include/.
target_include_directories(lm_models
  PUBLIC
    "${CMAKE_CURRENT_SOURCE_DIR}/../../include"
)

# Eigen is third-party code: expose it as a SYSTEM include directory so that
# strict warning flags applied to this project (e.g. -Werror) are not
# triggered by Eigen's own headers, here or in targets that link lm_models.
target_include_directories(lm_models
  SYSTEM PUBLIC
    "${EIGEN_LOC}"
)

target_link_libraries(lm_models
  PUBLIC
    lm_core_components
)

391
src/models/attention.cpp Normal file
View File

@ -0,0 +1,391 @@
#include "lm/models/attention.hpp"
#include <cmath>
#include <iostream>
#include <random>
#include <stdexcept>
namespace lm {
// Builds a multi-head attention layer.
//
// d_model   - model width; must be divisible by num_heads.
// num_heads - number of attention heads; must be greater than zero.
// dropout   - dropout rate, stored in dropout_.
//
// Initialises the four d_model x d_model projection tensors with
// Tensor::xavier and prints the configuration to std::cout.
// Throws std::invalid_argument if num_heads is zero or does not divide d_model.
MultiHeadAttention::MultiHeadAttention(size_t d_model, size_t num_heads, float dropout)
    : d_model_(d_model), num_heads_(num_heads), dropout_(dropout) {
    // Reject zero heads first: the divisibility test below would otherwise
    // evaluate d_model % 0, which is undefined behaviour.
    if (num_heads == 0) {
        throw std::invalid_argument("num_heads must be greater than zero");
    }
    // Ensure d_model is divisible by num_heads
    if (d_model % num_heads != 0) {
        throw std::invalid_argument("d_model must be divisible by num_heads");
    }
    d_k_ = d_model / num_heads;
    // Initialize weight matrices
    w_q_ = Tensor::xavier(std::vector<size_t>{d_model_, d_model_});
    w_k_ = Tensor::xavier(std::vector<size_t>{d_model_, d_model_});
    w_v_ = Tensor::xavier(std::vector<size_t>{d_model_, d_model_});
    w_o_ = Tensor::xavier(std::vector<size_t>{d_model_, d_model_});
    std::cout << "Initialized MultiHeadAttention with:\n";
    std::cout << " d_model: " << d_model_ << "\n";
    std::cout << " num_heads: " << num_heads_ << "\n";
    std::cout << " d_k: " << d_k_ << "\n";
    std::cout << " dropout: " << dropout_ << "\n";
}
std::vector<Tensor> MultiHeadAttention::parameters() const {
return {w_q_, w_k_, w_v_, w_o_};
}
void MultiHeadAttention::set_training(bool training) {
training_ = training;
}
// Multi-head attention forward pass: project the inputs, split them into
// heads, attend, merge the heads again and apply the output projection.
// Shapes in the comments below follow the original author's annotations.
Tensor MultiHeadAttention::forward(const Tensor& query, const Tensor& key,
                                   const Tensor& value, const Tensor& mask) const {
    // Linear projections, each [batch_size, seq_len, d_model].
    const Tensor q_proj = query.matmul(w_q_);
    const Tensor k_proj = key.matmul(w_k_);
    const Tensor v_proj = value.matmul(w_v_);

    // Per-head views, each [batch_size, num_heads, seq_len, d_k].
    const Tensor q_heads = split_heads(q_proj);
    const Tensor k_heads = split_heads(k_proj);
    const Tensor v_heads = split_heads(v_proj);

    // Scaled dot-product attention over every head.
    const Tensor attended = scaled_dot_product_attention(q_heads, k_heads, v_heads, mask);

    // Back to [batch_size, seq_len, d_model], then the final projection.
    const Tensor merged = combine_heads(attended);
    return merged.matmul(w_o_);
}
// Rearranges x from [batch_size, seq_len, d_model] into
// [batch_size, num_heads, seq_len, d_k], addressing both tensors through
// their flat index.
//
// Viewing the last axis of x as (num_heads, d_k) does not move any element,
// because d_model == num_heads * d_k gives both layouts the same flat index.
// The only real work is swapping the seq_len and num_heads axes, so this is
// done in one pass without an intermediate [batch, seq, heads, d_k] tensor.
Tensor MultiHeadAttention::split_heads(const Tensor& x) const {
    const size_t batch_size = x.shape()[0];
    const size_t seq_len = x.shape()[1];

    Tensor heads(std::vector<size_t>{batch_size, num_heads_, seq_len, d_k_});
    for (size_t b = 0; b < batch_size; ++b) {
        for (size_t h = 0; h < num_heads_; ++h) {
            for (size_t t = 0; t < seq_len; ++t) {
                // Offsets of the first element of this (b, t, h) slice in the
                // source and of the (b, h, t) slice in the destination.
                const size_t src_base = (b * seq_len + t) * d_model_ + h * d_k_;
                const size_t dst_base = ((b * num_heads_ + h) * seq_len + t) * d_k_;
                for (size_t d = 0; d < d_k_; ++d) {
                    heads(dst_base + d) = x(src_base + d);
                }
            }
        }
    }
    return heads;
}
// Inverse of split_heads: merges per-head slabs back into one feature axis.
//
// x is read through flat indexing as [batch_size, num_heads, seq_len, d_k]
// (all four extents are taken from x.shape()). The result has shape
// [batch_size, seq_len, d_model_], with head h written to features
// [h * d_k, (h + 1) * d_k) of each position.
//
// The previous version transposed into a scratch tensor and then copied that
// scratch tensor element-for-element into the result. The two steps are fused
// here; the output is unchanged.
Tensor MultiHeadAttention::combine_heads(const Tensor& x) const {
    const size_t batch_size = x.shape()[0];
    const size_t num_heads = x.shape()[1];
    const size_t seq_len = x.shape()[2];
    const size_t d_k = x.shape()[3];

    Tensor merged(std::vector<size_t>{batch_size, seq_len, d_model_});

    const size_t src_head_stride = seq_len * d_k;
    const size_t src_batch_stride = num_heads * src_head_stride;
    const size_t dst_batch_stride = seq_len * d_model_;

    for (size_t b = 0; b < batch_size; ++b) {
        for (size_t t = 0; t < seq_len; ++t) {
            for (size_t h = 0; h < num_heads; ++h) {
                const size_t src_row = b * src_batch_stride + h * src_head_stride + t * d_k;
                const size_t dst_row = b * dst_batch_stride + t * d_model_ + h * d_k;
                for (size_t d = 0; d < d_k; ++d) {
                    merged(dst_row + d) = x(src_row + d);
                }
            }
        }
    }
    return merged;
}
// Computes softmax(q * k^T / sqrt(d_k)) * v independently for every
// (batch, head) pair.
//
// q, k, v are read through flat indexing as [batch, heads, seq_len, d_k];
// the extents come from q.shape(). The result has the same shape.
//
// mask: ignored when mask.size() == 0. Otherwise an entry equal to 0 forces
// the matching score to -1e9 before the softmax. The mask is indexed as
// [batch, seq_len, seq_len]; a mask holding exactly seq_len * seq_len
// elements is shared across the batch (the previous code read past its end
// for every batch index > 0).
//
// When training_ is set, dropout with rate dropout_ is applied to the
// attention weights.
Tensor MultiHeadAttention::scaled_dot_product_attention(const Tensor& q, const Tensor& k,
                                                        const Tensor& v, const Tensor& mask) const {
    const size_t batch_size = q.shape()[0];
    const size_t num_heads = q.shape()[1];
    const size_t seq_len = q.shape()[2];
    const size_t d_k = q.shape()[3];

    const size_t head_stride = seq_len * d_k;          // q, k, v and output
    const size_t score_head_stride = seq_len * seq_len;

    // Loop-invariant scaling factor.
    const float scale = std::sqrt(static_cast<float>(d_k));

    const bool has_mask = mask.size() > 0;
    // Stride 0 broadcasts a single [seq_len, seq_len] mask over the batch.
    const size_t mask_batch_stride =
        (mask.size() == seq_len * seq_len) ? 0 : seq_len * seq_len;

    // Scaled (and masked) scores: q * k^T / sqrt(d_k).
    Tensor scores(std::vector<size_t>{batch_size, num_heads, seq_len, seq_len});
    for (size_t b = 0; b < batch_size; ++b) {
        for (size_t h = 0; h < num_heads; ++h) {
            const size_t qk_base = (b * num_heads + h) * head_stride;
            const size_t score_base = (b * num_heads + h) * score_head_stride;
            for (size_t i = 0; i < seq_len; ++i) {
                for (size_t j = 0; j < seq_len; ++j) {
                    float dot = 0.0f;
                    for (size_t d = 0; d < d_k; ++d) {
                        dot += q(qk_base + i * d_k + d) * k(qk_base + j * d_k + d);
                    }
                    dot /= scale;
                    if (has_mask && mask(b * mask_batch_stride + i * seq_len + j) == 0.0) {
                        dot = -1e9f;  // Large negative value: ~zero weight after softmax
                    }
                    scores(score_base + i * seq_len + j) = dot;
                }
            }
        }
    }

    // Row-wise softmax, shifted by the row maximum for numerical stability.
    Tensor weights(std::vector<size_t>{batch_size, num_heads, seq_len, seq_len});
    for (size_t b = 0; b < batch_size; ++b) {
        for (size_t h = 0; h < num_heads; ++h) {
            const size_t score_base = (b * num_heads + h) * score_head_stride;
            for (size_t i = 0; i < seq_len; ++i) {
                const size_t row = score_base + i * seq_len;

                float max_val = -std::numeric_limits<float>::infinity();
                for (size_t j = 0; j < seq_len; ++j) {
                    if (scores(row + j) > max_val) {
                        max_val = scores(row + j);
                    }
                }

                float sum = 0.0f;
                for (size_t j = 0; j < seq_len; ++j) {
                    const float e = std::exp(scores(row + j) - max_val);
                    weights(row + j) = e;
                    sum += e;
                }

                for (size_t j = 0; j < seq_len; ++j) {
                    weights(row + j) /= sum;
                }
            }
        }
    }

    // Apply dropout during training
    if (training_) {
        weights = apply_dropout(weights, dropout_);
    }

    // Weighted sum of the values.
    Tensor output(std::vector<size_t>{batch_size, num_heads, seq_len, d_k});
    for (size_t b = 0; b < batch_size; ++b) {
        for (size_t h = 0; h < num_heads; ++h) {
            const size_t v_base = (b * num_heads + h) * head_stride;
            const size_t score_base = (b * num_heads + h) * score_head_stride;
            for (size_t i = 0; i < seq_len; ++i) {
                const size_t row = score_base + i * seq_len;
                for (size_t d = 0; d < d_k; ++d) {
                    float acc = 0.0f;
                    for (size_t j = 0; j < seq_len; ++j) {
                        acc += weights(row + j) * v(v_base + j * d_k + d);
                    }
                    output(v_base + i * d_k + d) = acc;
                }
            }
        }
    }
    return output;
}
// Inverted dropout: zeroes each element with probability dropout_rate and
// divides the survivors by (1 - dropout_rate).
//
// dropout_rate <= 0 (or NaN) returns the input unchanged.
// dropout_rate >= 1 returns an all-zero copy; this avoids handing
// std::bernoulli_distribution a probability outside [0, 1], which is
// undefined behavior.
//
// The engine is seeded once per thread rather than on every call, so
// std::random_device is no longer queried each time dropout runs.
Tensor MultiHeadAttention::apply_dropout(const Tensor& input, float dropout_rate) const {
    if (!(dropout_rate > 0.0f)) return input;

    Tensor output = input;
    const size_t count = output.size();

    if (dropout_rate >= 1.0f) {
        for (size_t i = 0; i < count; ++i) {
            output(i) = 0.0;
        }
        return output;
    }

    thread_local std::mt19937 gen{std::random_device{}()};
    const double keep_prob = 1.0 - dropout_rate;
    std::bernoulli_distribution keep(keep_prob);

    for (size_t i = 0; i < count; ++i) {
        if (keep(gen)) {
            output(i) /= keep_prob;
        } else {
            output(i) = 0.0;
        }
    }
    return output;
}
} // namespace lm

139
src/models/feed_forward.cpp Normal file
View File

@ -0,0 +1,139 @@
#include "lm/models/feed_forward.hpp"
#include <cmath>
#include <iostream>
#include <random>
namespace lm {
// Builds a two-layer position-wise feed-forward network.
//   d_model: width of the input and output features
//   d_ff:    width of the hidden layer
//   dropout: rate stored in dropout_ and used by forward() in training mode
// Weights are Xavier-initialized, biases start at zero, and a summary of the
// configuration is printed to stdout.
// NOTE(review): training_ is not set here — confirm the header gives it a
// default initializer.
FeedForward::FeedForward(size_t d_model, size_t d_ff, float dropout)
    : d_model_(d_model), d_ff_(d_ff), dropout_(dropout) {
    // First projection: d_model -> d_ff
    w1_ = Tensor::xavier(std::vector<size_t>{d_model, d_ff});
    b1_ = Tensor::zeros(std::vector<size_t>{d_ff});

    // Second projection: d_ff -> d_model
    w2_ = Tensor::xavier(std::vector<size_t>{d_ff, d_model});
    b2_ = Tensor::zeros(std::vector<size_t>{d_model});

    std::cout << "Initialized FeedForward with:\n"
              << " d_model: " << d_model_ << "\n"
              << " d_ff: " << d_ff_ << "\n"
              << " dropout: " << dropout_ << "\n";
}
std::vector<Tensor> FeedForward::parameters() const {
return {w1_, b1_, w2_, b2_};
}
// Switches training mode; forward() applies dropout only while training_ is true.
void FeedForward::set_training(bool training) {
    training_ = training;
}
// Applies the feed-forward network to every (batch, position) row:
//   output = gelu(input * w1 + b1) * w2 + b2
// with dropout on the hidden activations while training_ is set.
//
// input is read through flat indexing with d_model_ features per row; the
// batch and sequence extents come from input.shape()[0] and [1].
// Returns a tensor of shape [batch_size, seq_len, d_model_].
Tensor FeedForward::forward(const Tensor& input) const {
    const size_t batch_size = input.shape()[0];
    const size_t seq_len = input.shape()[1];
    // Batch and sequence axes are flattened into one row index.
    const size_t num_rows = batch_size * seq_len;

    // First linear transformation: input * w1 + b1
    Tensor hidden(std::vector<size_t>{batch_size, seq_len, d_ff_});
    for (size_t row = 0; row < num_rows; ++row) {
        const size_t in_base = row * d_model_;
        const size_t hid_base = row * d_ff_;
        for (size_t f = 0; f < d_ff_; ++f) {
            float acc = b1_(f);  // start from the bias
            for (size_t d = 0; d < d_model_; ++d) {
                acc += input(in_base + d) * w1_(d, f);
            }
            hidden(hid_base + f) = acc;
        }
    }

    // GELU activation
    hidden = gelu(hidden);

    // Apply dropout during training
    if (training_) {
        hidden = apply_dropout(hidden, dropout_);
    }

    // Second linear transformation: hidden * w2 + b2
    Tensor output(std::vector<size_t>{batch_size, seq_len, d_model_});
    for (size_t row = 0; row < num_rows; ++row) {
        const size_t hid_base = row * d_ff_;
        const size_t out_base = row * d_model_;
        for (size_t d = 0; d < d_model_; ++d) {
            float acc = b2_(d);  // start from the bias
            for (size_t f = 0; f < d_ff_; ++f) {
                acc += hidden(hid_base + f) * w2_(f, d);
            }
            output(out_base + d) = acc;
        }
    }
    return output;
}
// Element-wise GELU using the tanh approximation:
//   gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
// Returns a new tensor with the same shape as the input.
//
// sqrt(2/pi) is written as a literal because M_PI is a POSIX extension, not
// standard C++, and is unavailable on some toolchains without extra macros.
Tensor FeedForward::gelu(const Tensor& input) const {
    constexpr float kSqrt2OverPi = 0.7978845608f;  // sqrt(2 / pi)

    Tensor result(input.shape());
    const size_t count = input.size();
    for (size_t i = 0; i < count; ++i) {
        const float x = input(i);
        const float x_cubed = x * x * x;
        result(i) = 0.5f * x * (1.0f + std::tanh(kSqrt2OverPi * (x + 0.044715f * x_cubed)));
    }
    return result;
}
// Inverted dropout: zeroes each element with probability dropout_rate and
// divides the survivors by (1 - dropout_rate).
//
// dropout_rate <= 0 (or NaN) returns the input unchanged.
// dropout_rate >= 1 returns an all-zero copy; this avoids handing
// std::bernoulli_distribution a probability outside [0, 1], which is
// undefined behavior.
//
// The engine is seeded once per thread rather than on every call, so
// std::random_device is no longer queried each time dropout runs.
Tensor FeedForward::apply_dropout(const Tensor& input, float dropout_rate) const {
    if (!(dropout_rate > 0.0f)) return input;

    Tensor output = input;
    const size_t count = output.size();

    if (dropout_rate >= 1.0f) {
        for (size_t i = 0; i < count; ++i) {
            output(i) = 0.0;
        }
        return output;
    }

    thread_local std::mt19937 gen{std::random_device{}()};
    const double keep_prob = 1.0 - dropout_rate;
    std::bernoulli_distribution keep(keep_prob);

    for (size_t i = 0; i < count; ++i) {
        if (keep(gen)) {
            output(i) /= keep_prob;
        } else {
            output(i) = 0.0;
        }
    }
    return output;
}
} // namespace lm

View File

@ -0,0 +1,188 @@
// lm/models/language_model.cpp
#include "lm/models/language_model.hpp"
#include "lm/optimizers/adam.hpp"
#include <random>
namespace lm {
// Creates the embedding table, one set of LSTM gate weights/biases and the
// output projection. Weight matrices are Xavier-initialized and biases are
// zero; every tensor is created with the `true` flag passed to the factory.
// The model starts in training mode (is_training_ = true).
LanguageModel::LanguageModel(size_t vocab_size, size_t embedding_dim,
                             size_t hidden_dim, size_t num_layers)
    : vocab_size_(vocab_size),
      embedding_dim_(embedding_dim),
      hidden_dim_(hidden_dim),
      num_layers_(num_layers),
      is_training_(true) {
    // Token embeddings: one row per vocabulary entry.
    embedding_weight_ = Tensor::xavier({vocab_size, embedding_dim}, true);

    // LSTM: the four gates are stacked along the first axis.
    const size_t stacked_gates = 4 * hidden_dim;
    lstm_weight_ih_ = Tensor::xavier({stacked_gates, embedding_dim}, true);
    lstm_weight_hh_ = Tensor::xavier({stacked_gates, hidden_dim}, true);
    lstm_bias_ih_ = Tensor::zeros({stacked_gates}, true);
    lstm_bias_hh_ = Tensor::zeros({stacked_gates}, true);

    // Output projection back to vocabulary size.
    output_weight_ = Tensor::xavier({vocab_size, hidden_dim}, true);
    output_bias_ = Tensor::zeros({vocab_size}, true);
}
// Runs the embedding -> LSTM -> output projection pipeline and returns logits.
//
// input: token ids, read as [sequence_length, batch_size].
//
// Fix: the hidden size is read from the member hidden_dim_. The previous code
// referred to `hidden_dim`, which is only a constructor parameter and is not
// declared in this scope.
//
// NOTE(review): every layer reuses the single lstm_weight_ih_ /
// lstm_weight_hh_ pair, and lstm_weight_ih_ is created with embedding_dim
// columns — layers above the first presumably only work when
// embedding_dim == hidden_dim; confirm.
// NOTE(review): `hidden.slice(...) = h_next` assigns to the value returned by
// slice(); if slice() returns a copy rather than a view, the recurrent state
// is never updated — verify against the Tensor API.
Tensor LanguageModel::forward(const Tensor& input) {
    // Input shape: [sequence_length, batch_size]
    size_t seq_len = input.shape()[0];
    size_t batch_size = input.shape()[1];

    // Embedding layer
    Tensor embedded = embedding_weight_.index_select(input); // [seq_len, batch_size, embedding_dim]

    // LSTM layer (simplified implementation)
    Tensor hidden = Tensor::zeros({num_layers_, batch_size, hidden_dim_});
    Tensor cell = Tensor::zeros({num_layers_, batch_size, hidden_dim_});
    Tensor output;

    for (size_t t = 0; t < seq_len; ++t) {
        // Get current time step
        Tensor x_t = embedded.slice(t, 1, 0); // [batch_size, embedding_dim]

        // LSTM computation (simplified)
        for (size_t layer = 0; layer < num_layers_; ++layer) {
            Tensor h_prev = hidden.slice(layer, 1, 0);
            Tensor c_prev = cell.slice(layer, 1, 0);

            // Gates computation
            Tensor gates = x_t.matmul(lstm_weight_ih_.transpose()) +
                           h_prev.matmul(lstm_weight_hh_.transpose()) +
                           lstm_bias_ih_ + lstm_bias_hh_;

            // Split gates: input, forget, candidate, output
            Tensor i = gates.slice(0, hidden_dim_, 1).sigmoid();
            Tensor f = gates.slice(hidden_dim_, hidden_dim_, 1).sigmoid();
            Tensor g = gates.slice(2 * hidden_dim_, hidden_dim_, 1).tanh();
            Tensor o = gates.slice(3 * hidden_dim_, hidden_dim_, 1).sigmoid();

            // Update cell state
            Tensor c_next = f * c_prev + i * g;

            // Update hidden state
            Tensor h_next = o * c_next.tanh();

            // Store states
            hidden.slice(layer, 1, 0) = h_next;
            cell.slice(layer, 1, 0) = c_next;

            x_t = h_next; // Output of this layer is input to next layer
        }

        // Store output for this time step
        if (t == 0) {
            output = x_t.unsqueeze(0); // Add sequence dimension
        } else {
            output = output.concatenate(x_t.unsqueeze(0), 0);
        }
    }

    // Output layer
    Tensor logits = output.matmul(output_weight_.transpose()) + output_bias_;
    return logits;
}
std::vector<Tensor> LanguageModel::parameters() const {
return {
embedding_weight_,
lstm_weight_ih_,
lstm_weight_hh_,
lstm_bias_ih_,
lstm_bias_hh_,
output_weight_,
output_bias_
};
}
// Puts the model into training mode by setting is_training_.
void LanguageModel::train() {
    is_training_ = true;
}
// Puts the model into evaluation mode by clearing is_training_.
void LanguageModel::eval() {
    is_training_ = false;
}
// Serializes the model to `path` in a small binary container:
//   4 bytes  magic "LMOD"
//   uint32   format version (1)
//   uint32   number of parameters
//   per parameter: name (Tensor::write_string) followed by Tensor::serialize
// Integers are written in the host's native byte order.
//
// Throws std::runtime_error if the file cannot be opened or if any write
// fails (for example a full disk); previously write errors went unnoticed and
// could leave a truncated file behind without any signal to the caller.
void LanguageModel::save(const std::string& path) const {
    std::ofstream file(path, std::ios::binary);
    if (!file) {
        throw std::runtime_error("Cannot open file for writing: " + path);
    }

    // Write header
    const char magic[] = "LMOD";
    file.write(magic, 4);
    uint32_t version = 1;
    file.write(reinterpret_cast<const char*>(&version), sizeof(version));

    // Get named parameters
    auto params = named_parameters();
    uint32_t num_params = static_cast<uint32_t>(params.size());
    file.write(reinterpret_cast<const char*>(&num_params), sizeof(num_params));

    // Write each parameter
    for (const auto& [name, tensor] : params) {
        Tensor::write_string(file, name);
        tensor.serialize(file);
    }

    // Flush so buffered write errors surface before the stream state is checked.
    file.flush();
    if (!file) {
        throw std::runtime_error("Failed to write model file: " + path);
    }
}
// Loads parameters from a file written by save().
//
// Expects the "LMOD" magic, format version 1 and a parameter count, followed
// by (name, tensor) records; each record is applied through set_parameter().
//
// Throws std::runtime_error when the file cannot be opened, the header is
// missing or invalid, the version is unsupported, or the file ends early.
// Every read is now checked: previously a short file left `magic`, `version`
// and `num_params` uninitialized and they were used anyway.
void LanguageModel::load(const std::string& path) {
    std::ifstream file(path, std::ios::binary);
    if (!file) {
        throw std::runtime_error("Cannot open file for reading: " + path);
    }

    // Read and verify header
    char magic[4] = {};
    file.read(magic, sizeof(magic));
    if (!file || std::string(magic, 4) != "LMOD") {
        throw std::runtime_error("Invalid model file format");
    }

    uint32_t version = 0;
    file.read(reinterpret_cast<char*>(&version), sizeof(version));
    if (!file) {
        throw std::runtime_error("Invalid model file format");
    }
    if (version != 1) {
        throw std::runtime_error("Unsupported model version: " + std::to_string(version));
    }

    // Read number of parameters
    uint32_t num_params = 0;
    file.read(reinterpret_cast<char*>(&num_params), sizeof(num_params));
    if (!file) {
        throw std::runtime_error("Invalid model file format");
    }

    // Read each parameter
    for (uint32_t i = 0; i < num_params; ++i) {
        std::string name = Tensor::read_string(file);
        Tensor tensor;
        tensor.deserialize(file);
        if (!file) {
            // Stop before installing a partially read tensor.
            throw std::runtime_error("Invalid model file format");
        }

        // Set the parameter
        set_parameter(name, tensor);
    }
}
// NOTE: a second definition of LanguageModel::parameters() const used to sit
// here. It redefined the function already defined earlier in this file (a
// compile error) and returned the tensors by iterating the unordered
// parameters_ map, whose order is unspecified. The earlier definition, which
// returns the tensors in a fixed order, is the one that remains.
// Returns a copy of the name -> tensor map held in parameters_.
// NOTE(review): nothing visible in this file populates parameters_ — confirm
// it is filled elsewhere, otherwise save() writes zero parameters.
std::unordered_map<std::string, Tensor> LanguageModel::named_parameters() const {
    return parameters_;
}
// Replaces the tensor registered under `name` in parameters_.
// Throws std::runtime_error if no parameter with that name is registered.
void LanguageModel::set_parameter(const std::string& name, const Tensor& param) {
    const auto entry = parameters_.find(name);
    if (entry == parameters_.end()) {
        throw std::runtime_error("Unknown parameter: " + name);
    }
    entry->second = param;
}
} // namespace lm

83
src/models/layer_norm.cpp Normal file
View File

@ -0,0 +1,83 @@
#include "lm/models/layer_norm.hpp"
#include <cmath>
#include <iostream>
namespace lm {
// Creates a layer-normalization module over d_model features.
//   eps: added to the variance inside the square root in forward()
// gamma_ (scale) starts at one and beta_ (shift) at zero; the configuration
// is printed to stdout.
LayerNorm::LayerNorm(size_t d_model, float eps)
    : d_model_(d_model), eps_(eps) {
    gamma_ = Tensor::ones(std::vector<size_t>{d_model});
    beta_ = Tensor::zeros(std::vector<size_t>{d_model});

    std::cout << "Initialized LayerNorm with:\n"
              << " d_model: " << d_model_ << "\n"
              << " eps: " << eps_ << "\n";
}
std::vector<Tensor> LayerNorm::parameters() const {
return {gamma_, beta_};
}
// Intentionally a no-op: forward() does the same computation in training and
// evaluation, so there is no mode to store. The method exists only so that
// LayerNorm offers the same interface as the other modules.
void LayerNorm::set_training(/*bool training*/) {
    // LayerNorm doesn't have different behavior during training vs evaluation
    // This method is here for interface consistency
}
// Normalizes each (batch, position) row of d_model_ features to zero mean and
// unit variance (biased variance, eps_ added before the square root), then
// applies the per-feature scale gamma_ and shift beta_.
//
// input is read through flat indexing with d_model_ contiguous features per
// row; the batch and sequence extents come from input.shape()[0] and [1].
// Returns a tensor with the same shape as the input.
Tensor LayerNorm::forward(const Tensor& input) const {
    const size_t batch_size = input.shape()[0];
    const size_t seq_len = input.shape()[1];
    const size_t num_rows = batch_size * seq_len;

    Tensor output(input.shape());

    for (size_t row = 0; row < num_rows; ++row) {
        const size_t base = row * d_model_;

        // Mean over the feature axis.
        float mean = 0.0f;
        for (size_t d = 0; d < d_model_; ++d) {
            mean += input(base + d);
        }
        mean /= d_model_;

        // Biased variance over the feature axis.
        float variance = 0.0f;
        for (size_t d = 0; d < d_model_; ++d) {
            const float centered = input(base + d) - mean;
            variance += centered * centered;
        }
        variance /= d_model_;

        // Normalize, then scale and shift.
        const float std_dev = std::sqrt(variance + eps_);
        for (size_t d = 0; d < d_model_; ++d) {
            const float normalized = (input(base + d) - mean) / std_dev;
            output(base + d) = gamma_(d) * normalized + beta_(d);
        }
    }
    return output;
}
} // namespace lm

162
src/models/transformer.cpp Normal file
View File

@ -0,0 +1,162 @@
#include "lm/models/transformer.hpp"
#include <cmath>
#include <iostream>
#include <random>
#include <stdexcept>
namespace lm {
// Builds the token embedding, the sinusoidal positional table, num_layers
// TransformerBlock instances and the output projection, then prints the
// configuration to stdout. The model starts with training_ = false.
//
// Positional encoding fix: feature columns come in (sin, cos) pairs that
// share one frequency, with
//   PE(pos, 2k)     = sin(pos / 10000^(2k / d_model))
//   PE(pos, 2k + 1) = cos(pos / 10000^(2k / d_model))
// The loop variable `i` already is the feature index (2k or 2k + 1), so the
// exponent is (i - i % 2) / d_model. The previous code used 2 * i / d_model,
// which doubled the exponent.
Transformer::Transformer(size_t vocab_size, size_t d_model, size_t num_heads,
                         size_t d_ff, size_t num_layers, size_t max_seq_len, float dropout)
    : vocab_size_(vocab_size), d_model_(d_model), num_heads_(num_heads),
      d_ff_(d_ff), num_layers_(num_layers), max_seq_len_(max_seq_len),
      dropout_(dropout), training_(false) {
    // Initialize embedding layer
    embedding_ = Tensor::randn({vocab_size_, d_model_}, 0.0, 0.02);
    embedding_.requires_grad(true);

    // Initialize positional encoding - use explicit vector
    positional_encoding_ = Tensor(std::vector<size_t>{max_seq_len_, d_model_});
    for (size_t pos = 0; pos < max_seq_len_; ++pos) {
        for (size_t i = 0; i < d_model_; ++i) {
            const size_t pair_start = i - (i % 2);  // 2k for both columns of the pair
            const double angle = static_cast<double>(pos) /
                std::pow(10000.0, static_cast<double>(pair_start) / static_cast<double>(d_model_));
            if (i % 2 == 0) {
                positional_encoding_(pos, i) = std::sin(angle);
            } else {
                positional_encoding_(pos, i) = std::cos(angle);
            }
        }
    }
    positional_encoding_.requires_grad(true);

    // Initialize transformer blocks
    for (size_t i = 0; i < num_layers_; ++i) {
        transformer_blocks_.push_back(std::make_unique<TransformerBlock>(d_model_, num_heads_, d_ff_, dropout_));
    }

    // Initialize output layer
    output_layer_ = Tensor::randn({d_model_, vocab_size_}, 0.0, 0.02);
    output_layer_.requires_grad(true);

    std::cout << "Initialized Transformer with:\n";
    std::cout << " vocab_size: " << vocab_size_ << "\n";
    std::cout << " d_model: " << d_model_ << "\n";
    std::cout << " num_heads: " << num_heads_ << "\n";
    std::cout << " d_ff: " << d_ff_ << "\n";
    std::cout << " num_layers: " << num_layers_ << "\n";
    std::cout << " max_seq_len: " << max_seq_len_ << "\n";
    std::cout << " dropout: " << dropout_ << "\n";
}
std::vector<Tensor> Transformer::parameters() const {
std::vector<Tensor> params;
// Add embedding parameters
params.push_back(embedding_);
// Add positional encoding parameters
params.push_back(positional_encoding_);
// Add transformer block parameters
for (const auto& block : transformer_blocks_) {
auto block_params = block->parameters();
params.insert(params.end(), block_params.begin(), block_params.end());
}
// Add output layer parameters
params.push_back(output_layer_);
return params;
}
// Stores the training flag, forwards it to every transformer block and logs
// the new mode to stdout.
void Transformer::set_training(bool training) {
    training_ = training;

    for (const auto& block : transformer_blocks_) {
        block->set_training(training);
    }

    const char* mode_text = training ? "true" : "false";
    std::cout << "Set training mode to: " << mode_text << "\n";
}
// Full forward pass: token embedding + positional encoding, dropout (only
// while training_), all transformer blocks, then the output projection.
//
// input: token ids stored as tensor elements, indexed as (batch, position).
// mask:  passed unchanged to every block.
// Returns logits of shape [batch_size, seq_len, vocab_size_].
//
// Token ids outside [0, vocab_size_) receive no embedding lookup; their row
// keeps whatever the freshly constructed tensor holds (presumably zeros —
// confirm against the Tensor constructor) plus the positional encoding.
// Negative or NaN ids are now rejected before the cast to size_t, which is
// undefined for such values.
//
// Throws std::out_of_range if seq_len exceeds max_seq_len_; the positional
// table has only max_seq_len_ rows and was previously read past its end.
Tensor Transformer::forward(const Tensor& input, const Tensor& mask) {
    // Get input dimensions
    const size_t batch_size = input.shape()[0];
    const size_t seq_len = input.shape()[1];

    if (seq_len > max_seq_len_) {
        throw std::out_of_range("Transformer::forward: sequence length exceeds max_seq_len");
    }

    // Convert token IDs to embeddings and add positional encoding
    Tensor embeddings(std::vector<size_t>{batch_size, seq_len, d_model_});
    for (size_t b = 0; b < batch_size; ++b) {
        for (size_t t = 0; t < seq_len; ++t) {
            const float raw_id = input(b, t);
            if (raw_id >= 0.0f && raw_id < static_cast<float>(vocab_size_)) {
                const size_t token_id = static_cast<size_t>(raw_id);
                if (token_id < vocab_size_) {
                    for (size_t d = 0; d < d_model_; ++d) {
                        embeddings(b, t, d) = embedding_(token_id, d);
                    }
                }
            }
            for (size_t d = 0; d < d_model_; ++d) {
                embeddings(b, t, d) += positional_encoding_(t, d);
            }
        }
    }

    // Apply dropout during training
    if (training_) {
        embeddings = apply_dropout(embeddings, dropout_);
    }

    // Pass through transformer blocks
    Tensor hidden_states = embeddings;
    for (auto& block : transformer_blocks_) {
        hidden_states = block->forward(hidden_states, mask);
    }

    // Apply output layer: logits = hidden_states * output_layer_
    Tensor logits(std::vector<size_t>{batch_size, seq_len, vocab_size_});
    for (size_t b = 0; b < batch_size; ++b) {
        for (size_t t = 0; t < seq_len; ++t) {
            for (size_t v = 0; v < vocab_size_; ++v) {
                float acc = 0.0f;
                for (size_t d = 0; d < d_model_; ++d) {
                    acc += hidden_states(b, t, d) * output_layer_(d, v);
                }
                logits(b, t, v) = acc;
            }
        }
    }
    return logits;
}
// Convenience overload: runs the forward pass with a default-constructed
// (empty) mask tensor.
Tensor Transformer::forward(const Tensor& input) {
    const Tensor no_mask;
    return forward(input, no_mask);
}
// Inverted dropout: zeroes each element with probability dropout_rate and
// divides the survivors by (1 - dropout_rate).
//
// dropout_rate <= 0 (or NaN) returns the input unchanged.
// dropout_rate >= 1 returns an all-zero copy; this avoids handing
// std::bernoulli_distribution a probability outside [0, 1], which is
// undefined behavior.
//
// The engine is seeded once per thread rather than on every call, so
// std::random_device is no longer queried each time dropout runs.
Tensor Transformer::apply_dropout(const Tensor& input, float dropout_rate) {
    if (!(dropout_rate > 0.0f)) return input;

    Tensor output = input;
    const size_t count = output.size();

    if (dropout_rate >= 1.0f) {
        for (size_t i = 0; i < count; ++i) {
            output(i) = 0.0;
        }
        return output;
    }

    thread_local std::mt19937 gen{std::random_device{}()};
    const double keep_prob = 1.0 - dropout_rate;
    std::bernoulli_distribution keep(keep_prob);

    for (size_t i = 0; i < count; ++i) {
        if (keep(gen)) {
            output(i) /= keep_prob;
        } else {
            output(i) = 0.0;
        }
    }
    return output;
}
} // namespace lm

View File

@ -0,0 +1,65 @@
#include "lm/models/transformer_block.hpp"
#include <iostream>
namespace lm {
// Assembles one transformer block: multi-head attention, a feed-forward
// network and two layer-norm modules, all sized by d_model. The configuration
// is printed to stdout.
// NOTE(review): training_ is not set here — confirm the header gives it a
// default initializer.
TransformerBlock::TransformerBlock(size_t d_model, size_t num_heads, size_t d_ff, float dropout)
    : d_model_(d_model), num_heads_(num_heads), d_ff_(d_ff), dropout_(dropout) {
    // Sub-layers
    attention_ = std::make_unique<MultiHeadAttention>(d_model, num_heads, dropout);
    feed_forward_ = std::make_unique<FeedForward>(d_model, d_ff, dropout);

    // One normalization per sub-layer
    norm1_ = std::make_unique<LayerNorm>(d_model);
    norm2_ = std::make_unique<LayerNorm>(d_model);

    std::cout << "Initialized TransformerBlock with:\n"
              << " d_model: " << d_model_ << "\n"
              << " num_heads: " << num_heads_ << "\n"
              << " d_ff: " << d_ff_ << "\n"
              << " dropout: " << dropout_ << "\n";
}
std::vector<Tensor> TransformerBlock::parameters() const {
std::vector<Tensor> params;
// Add attention parameters
auto attention_params = attention_->parameters();
params.insert(params.end(), attention_params.begin(), attention_params.end());
// Add feed-forward parameters
auto ff_params = feed_forward_->parameters();
params.insert(params.end(), ff_params.begin(), ff_params.end());
// Add layer norm parameters
auto norm1_params = norm1_->parameters();
params.insert(params.end(), norm1_params.begin(), norm1_params.end());
auto norm2_params = norm2_->parameters();
params.insert(params.end(), norm2_params.begin(), norm2_params.end());
return params;
}
// Stores the training flag and forwards it to the attention and feed-forward
// sub-modules. The layer norms are not notified here.
void TransformerBlock::set_training(bool training) {
    training_ = training;
    attention_->set_training(training);
    feed_forward_->set_training(training);
}
// Runs one block with normalization applied after each residual sum:
//   y   = norm1(input + attention(input, input, input, mask))
//   out = norm2(y + feed_forward(y))
Tensor TransformerBlock::forward(const Tensor& input, const Tensor& mask) const {
    // Self-attention sub-layer: query, key and value are all the block input.
    Tensor attended = attention_->forward(input, input, input, mask);
    Tensor after_attention = norm1_->forward(input + attended);

    // Feed-forward sub-layer.
    Tensor transformed = feed_forward_->forward(after_attention);
    return norm2_->forward(after_attention + transformed);
}
} // namespace lm

View File

@ -0,0 +1,15 @@
# Optimizer implementations, built as a library from adam.cpp.
add_library(lm_optimizers adam.cpp)

# Public headers live in the top-level include/ directory; Eigen headers are
# taken from the user-supplied EIGEN_LOC.
target_include_directories(lm_optimizers PUBLIC
    "${CMAKE_CURRENT_SOURCE_DIR}/../../include"
    "${EIGEN_LOC}"
)

target_link_libraries(lm_optimizers PUBLIC lm_core_components)

56
src/optimizers/adam.cpp Normal file
View File

@ -0,0 +1,56 @@
#include "lm/optimizers/adam.hpp"
#include <cmath>
namespace lm {
// Stores the Adam hyper-parameters and starts the step counter at zero.
//   learning_rate: step size applied to each update
//   beta1, beta2:  decay rates of the first and second moment estimates
//   epsilon:       added to sqrt(v_hat) in the update's denominator
AdamOptimizer::AdamOptimizer(float learning_rate, float beta1, float beta2, float epsilon)
    : learning_rate_(learning_rate), beta1_(beta1), beta2_(beta2), epsilon_(epsilon), timestep_(0) {}
void AdamOptimizer::zero_grad(std::vector<Tensor>& parameters) {
for (auto& param : parameters) {
if (param.requires_grad()) {
param.zero_grad();
}
}
}
// Performs one Adam update on every parameter with requires_grad() set.
//
// For each such parameter i:
//   m_i   = beta1 * m_i + (1 - beta1) * g
//   v_i   = beta2 * v_i + (1 - beta2) * g^2
//   m_hat = m_i / (1 - beta1^t),  v_hat = v_i / (1 - beta2^t)
//   param -= learning_rate * m_hat / (sqrt(v_hat) + epsilon)
//
// Fix: the moment buffers m_ and v_ are indexed by parameter position, but
// they used to grow only when a non-skipped index was visited. If an earlier
// parameter was skipped (requires_grad() == false), the buffers ended up
// shorter than `i` and m_[i] / v_[i] were accessed out of range. The buffers
// are now sized for every parameter before the update loop.
void AdamOptimizer::step(std::vector<Tensor>& parameters) {
    timestep_++;

    // Keep m_/v_ aligned with parameter positions, including skipped ones.
    for (size_t i = m_.size(); i < parameters.size(); ++i) {
        m_.push_back(Tensor::zeros(parameters[i].shape()));
        v_.push_back(Tensor::zeros(parameters[i].shape()));
    }

    // Bias corrections depend only on the step count, not on the parameter.
    const float bias_correction1 = 1 - std::pow(beta1_, timestep_);
    const float bias_correction2 = 1 - std::pow(beta2_, timestep_);

    for (size_t i = 0; i < parameters.size(); i++) {
        if (!parameters[i].requires_grad()) continue;

        // Convert gradient to Tensor for consistent operations
        Tensor grad_tensor(parameters[i].grad(), parameters[i].shape());

        // Update biased first moment estimate
        m_[i] = m_[i] * beta1_ + grad_tensor * (1 - beta1_);

        // Update biased second raw moment estimate
        Tensor grad_squared = grad_tensor * grad_tensor;
        v_[i] = v_[i] * beta2_ + grad_squared * (1 - beta2_);

        // Bias-corrected moment estimates
        Tensor m_hat = m_[i] / bias_correction1;
        Tensor v_hat = v_[i] / bias_correction2;

        // Update parameters; epsilon is broadcast as a constant tensor
        Tensor update = m_hat / (v_hat.sqrt() +
            Tensor(Eigen::MatrixXf::Constant(v_hat.data().rows(), v_hat.data().cols(), epsilon_),
                   v_hat.shape()));
        parameters[i].data() -= learning_rate_ * update.data();
    }
}
} // namespace lm

View File

@ -4,15 +4,19 @@ cmake_minimum_required(VERSION 3.6)
# Runtime support library (initialisation, shutdown and state utilities).
# Each source file is listed exactly once.
add_library(lm_runtime
    init.cpp
    shutdown.cpp
    state_utils.cpp
)

# Public headers live in the top-level include/ directory; Eigen headers are
# taken from the user-supplied EIGEN_LOC.
target_include_directories(lm_runtime
    PUBLIC
        "${CMAKE_CURRENT_SOURCE_DIR}/../../include"
        "${EIGEN_LOC}"
)

target_link_libraries(lm_runtime
    PRIVATE
        nlohmann_json::nlohmann_json
    PUBLIC
        lm_core_components
        lm_training
)

8
src/test_transformer.cpp Normal file
View File

@ -0,0 +1,8 @@
// test_transformer.cpp
#include "lm/models/transformer.hpp"
// Smoke test: constructing a Transformer must succeed. The sixth argument
// (512) is the maximum sequence length; dropout is left to the constructor's
// default.
int main() {
    const int vocab_size = 1000;
    const int d_model = 512;
    const int num_heads = 8;
    const int d_ff = 2048;
    const int num_layers = 6;
    const int max_seq_len = 512;

    lm::Transformer transformer(vocab_size, d_model, num_heads, d_ff, num_layers, max_seq_len);
    return 0;
}

View File

@ -468,80 +468,10 @@ void BPETokenizer::train(const std::vector<std::string>& corpus, size_t vocab_si
pimpl_->count_word_frequencies(words, word_counts);
// Track token frequencies for pruning
std::unordered_map<TokenID, size_t> token_frequencies;
// Initialize token frequencies
for (const auto& [word, count] : word_counts) {
auto tokens = pimpl_->word_to_token_ids(word);
for (TokenID token : tokens) {
token_frequencies[token] += count;
}
}
// BPE training algorithm with safety limit
int iteration = 0;
int max_iterations = 10000;
// Pruning function - remove infrequent tokens
auto prune_infrequent_tokens = [&](size_t frequency_threshold = 2) {
std::vector<TokenID> tokens_to_remove;
// Identify tokens to remove (excluding special tokens)
for (const auto& [token_id, freq] : token_frequencies) {
if (freq < frequency_threshold) {
// Check if this is a special token
std::string token_text = pimpl_->inv_vocab.at(token_id);
if (pimpl_->special_tokens.find(token_text) == pimpl_->special_tokens.end()) {
tokens_to_remove.push_back(token_id);
}
}
}
// Remove tokens from vocabulary
for (TokenID token_id : tokens_to_remove) {
std::string token_text = pimpl_->inv_vocab.at(token_id);
// Remove from vocabulary mappings
pimpl_->vocab.erase(token_text);
pimpl_->inv_vocab.erase(token_id);
token_frequencies.erase(token_id);
// Update word counts to use subword components instead of removed tokens
std::unordered_map<std::string, int> updated_word_counts;
for (const auto& [word, count] : word_counts) {
std::string updated_word = word;
size_t pos = 0;
// Replace all occurrences of the token text with its byte representation
while ((pos = updated_word.find(token_text, pos)) != std::string::npos) {
// Replace with byte fallback
std::string replacement;
for (unsigned char c : token_text) {
std::string byte_str(1, static_cast<char>(c));
replacement += byte_str;
}
updated_word.replace(pos, token_text.size(), replacement);
pos += replacement.size();
}
updated_word_counts[updated_word] += count;
}
// Update the word_counts with the modified words
word_counts = std::move(updated_word_counts);
}
// Recalculate token frequencies after pruning
token_frequencies.clear();
for (const auto& [word, count] : word_counts) {
auto tokens = pimpl_->word_to_token_ids(word);
for (TokenID token : tokens) {
token_frequencies[token] += count;
}
}
};
while (pimpl_->vocab.size() < vocab_size && iteration < max_iterations) {
// Count pairs
std::unordered_map<std::pair<TokenID, TokenID>, int, PairHash> pair_counts;
@ -570,25 +500,7 @@ void BPETokenizer::train(const std::vector<std::string>& corpus, size_t vocab_si
// Perform merge
pimpl_->perform_merge(max_pair->first, pimpl_->next_token_id, word_counts);
pimpl_->next_token_id++;
// Update token frequencies
token_frequencies.clear();
for (const auto& [word, count] : word_counts) {
auto tokens = pimpl_->word_to_token_ids(word);
for (TokenID token : tokens) {
token_frequencies[token] += count;
}
}
// Periodically prune infrequent tokens
if (iteration % 500 == 0 && iteration > 0) {
size_t pre_prune_size = pimpl_->vocab.size();
prune_infrequent_tokens(2); // Remove tokens with frequency < 2
std::cout << "Pruned " << (pre_prune_size - pimpl_->vocab.size())
<< " infrequent tokens. New vocab size: "
<< pimpl_->vocab.size() << std::endl;
}
iteration++;
// Periodically check memory usage
if (iteration % 500 == 0) {
@ -596,16 +508,8 @@ void BPETokenizer::train(const std::vector<std::string>& corpus, size_t vocab_si
std::cout << "Memory after " << iteration << " iterations: "
<< (current_memory - start_memory) / (1024 * 1024) << "MB\n";
}
iteration++;
}
// Final pruning after training completes
size_t pre_prune_size = pimpl_->vocab.size();
prune_infrequent_tokens(3); // Remove tokens with frequency < 3
std::cout << "Final pruning: Removed " << (pre_prune_size - pimpl_->vocab.size())
<< " tokens. Final vocab size: " << pimpl_->vocab.size() << std::endl;
if (iteration >= max_iterations) {
std::cout << "Reached maximum iterations. Stopping training." << std::endl;
}
@ -614,10 +518,57 @@ void BPETokenizer::train(const std::vector<std::string>& corpus, size_t vocab_si
std::cout << "Training completed in " << iteration << " iterations\n";
std::cout << "Peak memory used: " << (end_memory - start_memory) / (1024 * 1024) << "MB\n";
std::cout << "Final vocabulary size: " << pimpl_->vocab.size() << std::endl;
// Clear the string intern pool to free memory
pimpl_->string_pool.clear();
// Add periodic memory cleanup
if (iteration % 1000 == 0) {
pimpl_->string_pool.clear();
}
}
// Trains the tokenizer on the contents of a text file.
//
// Each line of `filename` (as split by std::getline) becomes one corpus entry,
// and the collected lines are handed to train() together with `vocab_size`.
//
// Throws std::runtime_error when the file cannot be opened.
void BPETokenizer::train_from_file(const std::string& filename, size_t vocab_size) {
    std::ifstream input(filename);

    if (input.is_open()) {
        // Collect the file one line per corpus entry.
        std::vector<std::string> lines;
        for (std::string current; std::getline(input, current);) {
            lines.push_back(current);
        }

        train(lines, vocab_size);
        return;
    }

    throw std::runtime_error("Cannot open file: " + filename);
}
// Encodes `text` into a flat sequence of token IDs.
//
// The text is split into words with Impl::split_text(); every word is mapped
// to its initial token IDs with Impl::word_to_token_ids() and adjacent pairs
// found in `pimpl_->merges` are then collapsed, always merging the left-most
// mergeable pair first, until no adjacent pair is mergeable.
//
// Returns the concatenated token IDs of all words, in input order.
std::vector<TokenID> BPETokenizer::encode(const std::string& text) const {
    auto words = pimpl_->split_text(text);

    std::vector<TokenID> tokens;
    tokens.reserve(text.size() * 2);  // Pre-allocate based on text size

    for (const auto& word : words) {
        auto word_tokens = pimpl_->word_to_token_ids(word);

        // `i + 1 < size()` instead of `i < size() - 1`: size() is unsigned, so
        // the subtraction wraps around for an empty token list and the loop
        // would read far out of bounds.
        size_t i = 0;
        while (i + 1 < word_tokens.size()) {
            auto pair = std::make_pair(word_tokens[i], word_tokens[i + 1]);
            auto it = pimpl_->merges.find(pair);
            if (it == pimpl_->merges.end()) {
                ++i;
                continue;
            }

            // Collapse the pair into the merged token.
            word_tokens[i] = it->second;
            word_tokens.erase(word_tokens.begin() + i + 1);

            // Every pair left of position i - 1 was already checked and is
            // unchanged, so the left-most mergeable pair can only start at
            // i - 1 or later. Stepping back one slot therefore yields the same
            // result as rescanning the word from the beginning.
            if (i > 0) {
                --i;
            }
        }

        tokens.insert(tokens.end(), word_tokens.begin(), word_tokens.end());
    }

    return tokens;
}
std::string BPETokenizer::decode(const std::vector<TokenID>& tokens) const {
std::string text;
for (TokenID token_id : tokens) {
@ -755,4 +706,39 @@ void BPETokenizer::Impl::get_pair_counts(
}
}
// Encodes `text` with encode() and returns the token IDs as an Eigen integer
// vector, one entry per token, each ID converted with static_cast<int>.
Eigen::VectorXi BPETokenizer::encode_to_vector(const std::string& text) const {
    const std::vector<TokenID> ids = encode(text);

    Eigen::VectorXi packed(ids.size());
    size_t slot = 0;
    for (const TokenID id : ids) {
        packed(slot) = static_cast<int>(id);
        ++slot;
    }

    return packed;
}
// Converts an Eigen integer vector of token IDs back to text: every entry is
// cast to TokenID and the resulting sequence is passed to decode().
std::string BPETokenizer::decode_from_vector(const Eigen::VectorXi& tokens) const {
    std::vector<TokenID> ids;
    ids.reserve(tokens.size());

    for (int pos = 0; pos < tokens.size(); ++pos) {
        ids.push_back(static_cast<TokenID>(tokens(pos)));
    }

    return decode(ids);
}
// Returns a per-token frequency vector with one entry per vocabulary item.
//
// This is a placeholder implementation: frequencies are not tracked during
// training, so every token is reported with the same value 1 / vocab_size().
// An empty vocabulary yields an empty vector.
Eigen::VectorXf BPETokenizer::token_frequencies() const {
    // The local must not shadow the member function: `size_t vocab_size =
    // vocab_size;` initializes the variable from itself, i.e. reads an
    // indeterminate value. Query the vocabulary size explicitly instead.
    const size_t num_tokens = this->vocab_size();
    if (num_tokens == 0) {
        return Eigen::VectorXf(0);
    }

    // Initialize with equal frequencies (placeholder)
    const float uniform = 1.0f / static_cast<float>(num_tokens);
    return Eigen::VectorXf::Constant(static_cast<Eigen::Index>(num_tokens), uniform);
}
} // namespace lm

0
src/train_lm.cpp Normal file
View File

View File

@ -0,0 +1,17 @@
# Training library: builds trainer.cpp into the lm_training target.
# (Library type follows the build's default, e.g. BUILD_SHARED_LIBS.)
add_library(lm_training trainer.cpp)

# Headers exposed to lm_training and its consumers: the project include tree
# and the Eigen location supplied through EIGEN_LOC.
target_include_directories(lm_training PUBLIC
  ${CMAKE_CURRENT_SOURCE_DIR}/../../include
  ${EIGEN_LOC}
)

# NOTE(review): the top-level CMakeLists.txt defines an INTERFACE target named
# `lm_model`; confirm that `lm_models` is a real target created by src/models,
# otherwise this name is passed to the linker as a plain library.
target_link_libraries(lm_training PUBLIC
  lm_core_components
  lm_models
  lm_optimizers
)

135
src/training/trainer.cpp Normal file
View File

@ -0,0 +1,135 @@
#include "lm/training/trainer.hpp"

#include <algorithm>
#include <cmath>
#include <iostream>
#include <random>
#include <stdexcept>
namespace lm {
// Builds a trainer around an existing tokenizer.
//
// The model is constructed from the tokenizer's vocabulary size together with
// `embedding_dim`, `hidden_dim` and `num_layers`; the optimizer is constructed
// with fixed hyper-parameters.
//
// NOTE(review): the comment below says the tokenizer is stored by reference;
// if so, `tokenizer` must outlive this trainer - confirm against the header.
LanguageModelTrainer::LanguageModelTrainer(const BPETokenizer& tokenizer,
size_t embedding_dim,
size_t hidden_dim,
size_t num_layers)
: tokenizer_(tokenizer), // Store reference
model_(tokenizer.vocab_size(), embedding_dim, hidden_dim, num_layers),
// Presumably learning rate, beta1, beta2 and epsilon of an Adam-style
// optimizer - TODO confirm against the optimizer's constructor.
optimizer_(0.001, 0.9, 0.999, 1e-8) {}
// Trains the model on `corpus` for `epochs` passes.
//
// Every epoch the corpus is reshuffled, cut into groups of at most
// `batch_size` texts, and each group is turned into a tensor by
// prepare_batch(). The batch is split into an input slice and a target slice
// offset by one position, run through the model, scored with compute_loss(),
// and the optimizer is stepped. Progress is printed every 100 batches and at
// the end of each epoch.
//
// Parameters:
//   corpus          - training texts; an empty corpus performs no updates.
//   epochs          - number of passes over the corpus.
//   batch_size      - maximum number of texts per batch; must be > 0.
//   sequence_length - number of token positions per text; must be >= 2 so
//                     that an input/target pair exists.
//
// Throws std::invalid_argument when `batch_size` is 0 or `sequence_length`
// is smaller than 2.
void LanguageModelTrainer::train(const std::vector<std::string>& corpus,
                                 size_t epochs,
                                 size_t batch_size,
                                 size_t sequence_length) {
    // A zero batch size would never advance the batch loop below.
    if (batch_size == 0) {
        throw std::invalid_argument(
            "LanguageModelTrainer::train: batch_size must be greater than 0");
    }
    // `sequence_length - 1` is unsigned: 0 would wrap around, and 1 leaves no
    // position for the shifted target.
    if (sequence_length < 2) {
        throw std::invalid_argument(
            "LanguageModelTrainer::train: sequence_length must be at least 2");
    }

    model_.train();

    // One working copy and one engine for the whole run. Re-seeding a fresh
    // engine with 42 inside the epoch loop produced the identical order in
    // every epoch; keeping the engine alive stays reproducible while giving
    // each epoch a different permutation.
    std::vector<std::string> shuffled_corpus = corpus;
    std::default_random_engine rng(42);

    for (size_t epoch = 0; epoch < epochs; ++epoch) {
        float total_loss = 0.0f;
        size_t num_batches = 0;

        // Shuffle corpus
        std::shuffle(shuffled_corpus.begin(), shuffled_corpus.end(), rng);

        // Process in batches
        for (size_t i = 0; i < shuffled_corpus.size(); i += batch_size) {
            size_t end = std::min(i + batch_size, shuffled_corpus.size());
            std::vector<std::string> batch_texts(shuffled_corpus.begin() + i,
                                                 shuffled_corpus.begin() + end);

            // Prepare batch
            Tensor batch = prepare_batch(batch_texts, sequence_length);

            // Split into input and target (target is shifted by one position)
            Tensor input = batch.slice(0, sequence_length - 1, 0);
            Tensor target = batch.slice(1, sequence_length - 1, 0);

            // Forward pass
            Tensor logits = model_.forward(input);

            // Compute loss
            float loss = compute_loss(logits, target);
            total_loss += loss;

            // Backward pass
            // NOTE(review): backward() is started from the logits, while the
            // loss is returned by compute_loss() as a plain float. Confirm
            // that the gradients produced here really are those of the loss.
            logits.backward();

            // Update parameters - store in variable to avoid rvalue reference issue
            auto params = model_.parameters();
            optimizer_.step(params);
            optimizer_.zero_grad(params);

            num_batches++;

            if (num_batches % 100 == 0) {
                std::cout << "Epoch " << epoch + 1 << ", Batch " << num_batches
                          << ", Loss: " << loss << std::endl;
            }
        }

        if (num_batches > 0) {
            std::cout << "Epoch " << epoch + 1 << " completed. Average loss: "
                      << total_loss / num_batches << std::endl;
        } else {
            // Empty corpus: avoid printing the result of a 0 / 0 division.
            std::cout << "Epoch " << epoch + 1
                      << " completed. No batches processed." << std::endl;
        }
    }
}
// Tokenizes `texts` and packs them into a tensor of shape
// {sequence_length, texts.size()}.
//
// Column i holds the token IDs of texts[i] converted to float. Texts longer
// than `sequence_length` are truncated; shorter ones are padded with 0.0f.
Tensor LanguageModelTrainer::prepare_batch(const std::vector<std::string>& texts,
                                           size_t sequence_length) {
    const size_t num_texts = texts.size();

    // Encode every text up front, in input order.
    std::vector<std::vector<TokenID>> encoded;
    for (size_t n = 0; n < num_texts; ++n) {
        encoded.push_back(tokenizer_.encode(texts[n]));
    }

    // The shape goes through a named vector so the intended Tensor
    // constructor is selected unambiguously.
    std::vector<size_t> dims = {sequence_length, num_texts};
    Tensor batch(dims);

    for (size_t col = 0; col < num_texts; ++col) {
        const std::vector<TokenID>& ids = encoded[col];
        for (size_t row = 0; row < sequence_length; ++row) {
            // Past the end of the text the slot is padding.
            batch(row, col) = (row < ids.size()) ? static_cast<float>(ids[row])
                                                 : 0.0f;
        }
    }

    return batch;
}
float LanguageModelTrainer::compute_loss(const Tensor& logits, const Tensor& targets) {
// Cross-entropy loss
Tensor log_probs = logits.softmax(-1);
// Gather the log probabilities of the target classes
Tensor loss = Tensor::zeros({1});
size_t batch_size = targets.shape()[1];
size_t seq_length = targets.shape()[0];
for (size_t i = 0; i < batch_size; ++i) {
for (size_t j = 0; j < seq_length; ++j) {
int target_class = static_cast<int>(targets(j, i));
if (target_class != 0) { // Skip padding
loss(0) -= log_probs(j, i, target_class);
}
}
}
// Average loss
return loss(0) / (batch_size * seq_length);
}
// Writes the model to `path` via the model's save() and reports the
// destination on standard output.
void LanguageModelTrainer::save_model(const std::string& path) {
    model_.save(path);

    std::cout << "Model saved to: ";
    std::cout << path << std::endl;
}
// Restores the model from `path` via the model's load() and reports the
// source on standard output.
void LanguageModelTrainer::load_model(const std::string& path) {
    model_.load(path);

    std::cout << "Model loaded from: ";
    std::cout << path << std::endl;
}
} // namespace lm

0
todo.md Normal file
View File