From 261d0d700749af646afbda49241f2e923c976a1c Mon Sep 17 00:00:00 2001
From: Palash Tyagi <23239946+Magnus167@users.noreply.github.com>
Date: Sun, 6 Jul 2025 20:03:16 +0100
Subject: [PATCH] Refactor DenseNN implementation to enhance activation
 function handling and improve training process

---
 src/compute/models/dense_nn.rs | 307 ++++++++++++++++++++++++++++-----
 1 file changed, 260 insertions(+), 47 deletions(-)

diff --git a/src/compute/models/dense_nn.rs b/src/compute/models/dense_nn.rs
index 1959c2a..7be04a2 100644
--- a/src/compute/models/dense_nn.rs
+++ b/src/compute/models/dense_nn.rs
@@ -1,65 +1,278 @@
 use crate::matrix::{Matrix, SeriesOps};
-use crate::compute::activations::{relu, sigmoid, drelu};
-use rand::Rng;
+use crate::compute::activations::{relu, drelu, sigmoid};
+use rand::prelude::*;
 
+/// Supported activation functions
+#[derive(Clone)]
+pub enum ActivationKind {
+    Relu,
+    Sigmoid,
+    Tanh,
+}
+
+impl ActivationKind {
+    /// Apply activation elementwise
+    pub fn forward(&self, z: &Matrix<f64>) -> Matrix<f64> {
+        match self {
+            ActivationKind::Relu => relu(z),
+            ActivationKind::Sigmoid => sigmoid(z),
+            ActivationKind::Tanh => z.map(|v| v.tanh()),
+        }
+    }
+
+    /// Compute elementwise derivative w.r.t. pre-activation z
+    pub fn derivative(&self, z: &Matrix<f64>) -> Matrix<f64> {
+        match self {
+            ActivationKind::Relu => drelu(z),
+            ActivationKind::Sigmoid => {
+                let s = sigmoid(z);
+                s.zip(&s, |si, sj| si * (1.0 - sj))
+            }
+            ActivationKind::Tanh => z.map(|v| 1.0 - v.tanh().powi(2)),
+        }
+    }
+}
+
+/// Weight initialization schemes
+#[derive(Clone)]
+pub enum InitializerKind {
+    /// Uniform(-limit .. limit)
+    Uniform(f64),
+    /// Xavier/Glorot uniform
+    Xavier,
+    /// He (Kaiming) uniform
+    He,
+}
+
+impl InitializerKind {
+    pub fn initialize(&self, rows: usize, cols: usize) -> Matrix<f64> {
+        let mut rng = rand::rng();
+        let fan_in = rows;
+        let fan_out = cols;
+        let limit = match self {
+            InitializerKind::Uniform(l) => *l,
+            InitializerKind::Xavier => (6.0 / (fan_in + fan_out) as f64).sqrt(),
+            InitializerKind::He => (2.0 / fan_in as f64).sqrt(),
+        };
+        let data = (0..rows * cols)
+            .map(|_| rng.random_range(-limit..limit))
+            .collect::<Vec<f64>>();
+        Matrix::from_vec(data, rows, cols)
+    }
+}
+
+/// Supported losses
+#[derive(Clone)]
+pub enum LossKind {
+    /// Mean Squared Error: L = 1/m * sum((y_hat - y)^2)
+    MSE,
+    /// Binary Cross-Entropy: L = -1/m * sum(y*log(y_hat) + (1-y)*log(1-y_hat))
+    BCE,
+}
+
+impl LossKind {
+    /// Compute gradient dL/dy_hat (before applying activation derivative)
+    pub fn gradient(&self, y_hat: &Matrix<f64>, y: &Matrix<f64>) -> Matrix<f64> {
+        let m = y.rows() as f64;
+        match self {
+            LossKind::MSE => (y_hat - y) * (2.0 / m),
+            LossKind::BCE => (y_hat - y) * (1.0 / m),
+        }
+    }
+}
+
+/// Configuration for a dense neural network
+pub struct DenseNNConfig {
+    pub input_size: usize,
+    pub hidden_layers: Vec<usize>,
+    /// Must have length = hidden_layers.len() + 1
+    pub activations: Vec<ActivationKind>,
+    pub output_size: usize,
+    pub initializer: InitializerKind,
+    pub loss: LossKind,
+    pub learning_rate: f64,
+    pub epochs: usize,
+}
+
+/// A multi-layer perceptron with full configurability
 pub struct DenseNN {
-    w1: Matrix<f64>, // (n_in, n_hidden)
-    b1: Matrix<f64>, // (1, n_hidden)
-    w2: Matrix<f64>, // (n_hidden, n_out)
-    b2: Matrix<f64>, // (1, n_out)
+    weights: Vec<Matrix<f64>>,
+    biases: Vec<Matrix<f64>>,
+    activations: Vec<ActivationKind>,
+    loss: LossKind,
+    lr: f64,
+    epochs: usize,
 }
 
 impl DenseNN {
-    pub fn new(n_in: usize, n_hidden: usize, n_out: usize) -> Self {
-        let mut rng = rand::rng();
-        let mut init = |rows, cols| {
-            let data = (0..rows * cols)
-                .map(|_| rng.random_range(-1.0..1.0))
-                .collect::<Vec<f64>>();
-            Matrix::from_vec(data, rows, cols)
-        };
-        Self {
-            w1: init(n_in, n_hidden),
-            b1: Matrix::zeros(1, n_hidden),
-            w2: init(n_hidden, n_out),
-            b2: Matrix::zeros(1, n_out),
+    /// Build a new DenseNN from the given configuration
+    pub fn new(config: DenseNNConfig) -> Self {
+        let mut sizes = vec![config.input_size];
+        sizes.extend(&config.hidden_layers);
+        sizes.push(config.output_size);
+
+        assert_eq!(
+            config.activations.len(),
+            sizes.len() - 1,
+            "Number of activation functions must match number of layers"
+        );
+
+        let mut weights = Vec::with_capacity(sizes.len() - 1);
+        let mut biases = Vec::with_capacity(sizes.len() - 1);
+
+        for i in 0..sizes.len() - 1 {
+            let w = config.initializer.initialize(sizes[i], sizes[i + 1]);
+            let b = Matrix::zeros(1, sizes[i + 1]);
+            weights.push(w);
+            biases.push(b);
+        }
+
+        DenseNN {
+            weights,
+            biases,
+            activations: config.activations,
+            loss: config.loss,
+            lr: config.learning_rate,
+            epochs: config.epochs,
         }
     }
 
-    pub fn forward(&self, x: &Matrix<f64>) -> (Matrix<f64>, Matrix<f64>, Matrix<f64>) {
-        // z1 = X·W1 + b1 ; a1 = ReLU(z1)
-        let z1 = x.dot(&self.w1) + &self.b1;
-        let a1 = relu(&z1);
-        // z2 = a1·W2 + b2 ; a2 = softmax(z2) (here binary => sigmoid)
-        let z2 = a1.dot(&self.w2) + &self.b2;
-        let a2 = sigmoid(&z2); // binary output
-        (a1, z2, a2) // keep intermediates for back-prop
+    /// Perform a full forward pass, returning pre-activations (z) and activations (a)
+    fn forward_full(&self, x: &Matrix<f64>) -> (Vec<Matrix<f64>>, Vec<Matrix<f64>>) {
+        let mut zs = Vec::with_capacity(self.weights.len());
+        let mut activs = Vec::with_capacity(self.weights.len() + 1);
+        activs.push(x.clone());
+
+        let mut a = x.clone();
+        for (i, (w, b)) in self.weights.iter().zip(self.biases.iter()).enumerate() {
+            let z = &a.dot(w) + &Matrix::repeat_rows(b, a.rows());
+            let a_next = self.activations[i].forward(&z);
+            zs.push(z);
+            activs.push(a_next.clone());
+            a = a_next;
+        }
+
+        (zs, activs)
     }
 
-    pub fn train(&mut self, x: &Matrix<f64>, y: &Matrix<f64>, lr: f64, epochs: usize) {
+    /// Train the network on inputs X and targets Y
+    pub fn train(&mut self, x: &Matrix<f64>, y: &Matrix<f64>) {
         let m = x.rows() as f64;
-        for _ in 0..epochs {
-            let (a1, _z2, y_hat) = self.forward(x);
+        for _ in 0..self.epochs {
+            let (zs, activs) = self.forward_full(x);
+            let y_hat = activs.last().unwrap().clone();
 
-            // -------- backwards ----------
-            // dL/da2 = y_hat - y   (BCE derivative)
-            let dz2 = &y_hat - y; // (m, n_out)
-            let dw2 = a1.transpose().dot(&dz2) / m; // (n_h, n_out)
-            // let db2 = dz2.sum_vertical() * (1.0 / m); // broadcast ok
-            let db2 = Matrix::from_vec(dz2.sum_vertical(), 1, dz2.cols()) * (1.0 / m); // (1, n_out)
-            let da1 = dz2.dot(&self.w2.transpose()); // (m,n_h)
-            let dz1 = da1.zip(&a1, |g, act| g * drelu(&Matrix::from_cols(vec![vec![act]])).data()[0]); // (m,n_h)
-
-            // real code: drelu returns Matrix, broadcasting needed; you can optimise.
+            // Initial delta (dL/dz) on output
+            let mut delta = match self.loss {
+                LossKind::BCE => self.loss.gradient(&y_hat, y),
+                LossKind::MSE => {
+                    let grad = self.loss.gradient(&y_hat, y);
+                    let dz = self.activations.last().unwrap().derivative(zs.last().unwrap());
+                    grad.zip(&dz, |g, da| g * da)
+                }
+            };
 
-            let dw1 = x.transpose().dot(&dz1) / m; // (n_in,n_h)
-            let db1 = Matrix::from_vec(dz1.sum_vertical(), 1, dz1.cols()) * (1.0 / m); // (1, n_h)
+            // Backpropagate through layers
+            for l in (0..self.weights.len()).rev() {
+                let a_prev = &activs[l];
+                let dw = a_prev.transpose().dot(&delta) / m;
+                let db = Matrix::from_vec(delta.sum_vertical(), 1, delta.cols()) / m;
 
-            // -------- update ----------
-            self.w2 = &self.w2 - &(dw2 * lr);
-            self.b2 = &self.b2 - &(db2 * lr);
-            self.w1 = &self.w1 - &(dw1 * lr);
-            self.b1 = &self.b1 - &(db1 * lr);
+                // Update weights & biases
+                self.weights[l] = &self.weights[l] - &(dw * self.lr);
+                self.biases[l] = &self.biases[l] - &(db * self.lr);
+
+                // Propagate delta to previous layer
+                if l > 0 {
+                    let w_t = self.weights[l].transpose();
+                    let da = self.activations[l - 1].derivative(&zs[l - 1]);
+                    delta = delta.dot(&w_t).zip(&da, |d, a| d * a);
+                }
+            }
         }
     }
+
+    /// Run a forward pass and return the network's output
+    pub fn predict(&self, x: &Matrix<f64>) -> Matrix<f64> {
+        let mut a = x.clone();
+        for (i, (w, b)) in self.weights.iter().zip(self.biases.iter()).enumerate() {
+            let z = &a.dot(w) + &Matrix::repeat_rows(b, a.rows());
+            a = self.activations[i].forward(&z);
+        }
+        a
+    }
+}
+
+// ------------------------------
+// Simple tests
+// ------------------------------
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::matrix::Matrix;
+
+    #[test]
+    fn test_predict_shape() {
+        let config = DenseNNConfig {
+            input_size: 1,
+            hidden_layers: vec![2],
+            activations: vec![ActivationKind::Relu, ActivationKind::Sigmoid],
+            output_size: 1,
+            initializer: InitializerKind::Uniform(0.1),
+            loss: LossKind::MSE,
+            learning_rate: 0.01,
+            epochs: 0,
+        };
+        let model = DenseNN::new(config);
+        let x = Matrix::from_vec(vec![1.0, 2.0, 3.0], 3, 1);
+        let preds = model.predict(&x);
+        assert_eq!(preds.rows(), 3);
+        assert_eq!(preds.cols(), 1);
+    }
+
+    #[test]
+    fn test_train_no_epochs() {
+        let config = DenseNNConfig {
+            input_size: 1,
+            hidden_layers: vec![2],
+            activations: vec![ActivationKind::Relu, ActivationKind::Sigmoid],
+            output_size: 1,
+            initializer: InitializerKind::Uniform(0.1),
+            loss: LossKind::MSE,
+            learning_rate: 0.01,
+            epochs: 0,
+        };
+        let mut model = DenseNN::new(config);
+        let x = Matrix::from_vec(vec![1.0, 2.0], 2, 1);
+        let before = model.predict(&x);
+        model.train(&x, &before);
+        let after = model.predict(&x);
+        for i in 0..before.rows() {
+            assert!((before[(i, 0)] - after[(i, 0)]).abs() < 1e-12);
+        }
+    }
+
+    #[test]
+    fn test_dense_nn_step() {
+        let config = DenseNNConfig {
+            input_size: 1,
+            hidden_layers: vec![2],
+            activations: vec![ActivationKind::Relu, ActivationKind::Sigmoid],
+            output_size: 1,
+            initializer: InitializerKind::He,
+            loss: LossKind::BCE,
+            learning_rate: 0.01,
+            epochs: 5000,
+        };
+        let mut model = DenseNN::new(config);
+        let x = Matrix::from_vec(vec![1.0, 2.0, 3.0, 4.0], 4, 1);
+        let y = Matrix::from_vec(vec![0.0, 0.0, 1.0, 1.0], 4, 1);
+        model.train(&x, &y);
+        let preds = model.predict(&x);
+        assert!((preds[(0, 0)] - 0.0).abs() < 0.5);
+        assert!((preds[(1, 0)] - 0.0).abs() < 0.5);
+        assert!((preds[(2, 0)] - 1.0).abs() < 0.5);
+        assert!((preds[(3, 0)] - 1.0).abs() < 0.5);
+    }
+}
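A minimal usage sketch of the refactored API follows, mirroring test_dense_nn_step above. It is not part of the patch: the function name dense_nn_usage_sketch is hypothetical, and the module path crate::compute::models::dense_nn is assumed from the file location; Matrix::from_vec, indexing, and rows() follow the calls already used in the tests.

use crate::compute::models::dense_nn::{
    ActivationKind, DenseNN, DenseNNConfig, InitializerKind, LossKind,
};
use crate::matrix::Matrix;

// Hypothetical example, not part of the patch.
fn dense_nn_usage_sketch() {
    // One input feature, a single hidden layer of 4 units, sigmoid output.
    // The activations vector must have hidden_layers.len() + 1 entries.
    let config = DenseNNConfig {
        input_size: 1,
        hidden_layers: vec![4],
        activations: vec![ActivationKind::Tanh, ActivationKind::Sigmoid],
        output_size: 1,
        initializer: InitializerKind::Xavier,
        loss: LossKind::BCE,
        learning_rate: 0.05,
        epochs: 2000,
    };
    let mut model = DenseNN::new(config);

    // Same toy threshold task as test_dense_nn_step: inputs 1..4, targets 0/1.
    let x = Matrix::from_vec(vec![1.0, 2.0, 3.0, 4.0], 4, 1);
    let y = Matrix::from_vec(vec![0.0, 0.0, 1.0, 1.0], 4, 1);

    model.train(&x, &y);

    // Predictions should move toward 0 for the first two rows and 1 for the last two.
    let preds = model.predict(&x);
    for i in 0..preds.rows() {
        println!("x = {:.1} -> y_hat = {:.3}", x[(i, 0)], preds[(i, 0)]);
    }
}

The same pattern covers the other variants: swap LossKind::MSE for regression-style targets (the MSE path applies the output activation's derivative, as in train above) or InitializerKind::He / Uniform(limit) for different weight scales.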