diff --git a/Cargo.toml b/Cargo.toml index d3d44f8..ec38c65 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,8 +14,6 @@ crate-type = ["cdylib", "lib"] [dependencies] chrono = "^0.4.10" criterion = { version = "0.5", features = ["html_reports"], optional = true } - -[dev-dependencies] rand = "^0.9.1" [features] diff --git a/src/compute/mod.rs b/src/compute/mod.rs new file mode 100644 index 0000000..6aa9b32 --- /dev/null +++ b/src/compute/mod.rs @@ -0,0 +1,3 @@ +pub mod models; + +pub mod stats; diff --git a/src/compute/models/activations.rs b/src/compute/models/activations.rs new file mode 100644 index 0000000..661fe52 --- /dev/null +++ b/src/compute/models/activations.rs @@ -0,0 +1,135 @@ +use crate::matrix::{Matrix, SeriesOps}; + +pub fn sigmoid(x: &Matrix) -> Matrix { + x.map(|v| 1.0 / (1.0 + (-v).exp())) +} + +pub fn dsigmoid(y: &Matrix) -> Matrix { + // derivative w.r.t. pre-activation; takes y = sigmoid(x) + y.map(|v| v * (1.0 - v)) +} + +pub fn relu(x: &Matrix) -> Matrix { + x.map(|v| if v > 0.0 { v } else { 0.0 }) +} + +pub fn drelu(x: &Matrix) -> Matrix { + x.map(|v| if v > 0.0 { 1.0 } else { 0.0 }) +} + +pub fn leaky_relu(x: &Matrix) -> Matrix { + x.map(|v| if v > 0.0 { v } else { 0.01 * v }) +} + +pub fn dleaky_relu(x: &Matrix) -> Matrix { + x.map(|v| if v > 0.0 { 1.0 } else { 0.01 }) +} + +mod tests { + use super::*; + + // Helper function to round all elements in a matrix to n decimal places + fn _round_matrix(mat: &Matrix, decimals: u32) -> Matrix { + let factor = 10f64.powi(decimals as i32); + let rounded: Vec = mat + .to_vec() + .iter() + .map(|v| (v * factor).round() / factor) + .collect(); + Matrix::from_vec(rounded, mat.rows(), mat.cols()) + } + + #[test] + fn test_sigmoid() { + let x = Matrix::from_vec(vec![-1.0, 0.0, 1.0], 3, 1); + let expected = Matrix::from_vec(vec![0.26894142, 0.5, 0.73105858], 3, 1); + let result = sigmoid(&x); + assert_eq!(_round_matrix(&result, 6), _round_matrix(&expected, 6)); + } + + #[test] + fn test_sigmoid_edge_case() { + let x = Matrix::from_vec(vec![-1000.0, 0.0, 1000.0], 3, 1); + let expected = Matrix::from_vec(vec![0.0, 0.5, 1.0], 3, 1); + let result = sigmoid(&x); + + for (r, e) in result.data().iter().zip(expected.data().iter()) { + assert!((r - e).abs() < 1e-6); + } + } + + #[test] + fn test_relu() { + let x = Matrix::from_vec(vec![-1.0, 0.0, 1.0], 3, 1); + let expected = Matrix::from_vec(vec![0.0, 0.0, 1.0], 3, 1); + assert_eq!(relu(&x), expected); + } + + #[test] + fn test_relu_edge_case() { + let x = Matrix::from_vec(vec![-1e-10, 0.0, 1e10], 3, 1); + let expected = Matrix::from_vec(vec![0.0, 0.0, 1e10], 3, 1); + assert_eq!(relu(&x), expected); + } + + #[test] + fn test_dsigmoid() { + let y = Matrix::from_vec(vec![0.26894142, 0.5, 0.73105858], 3, 1); + let expected = Matrix::from_vec(vec![0.19661193, 0.25, 0.19661193], 3, 1); + let result = dsigmoid(&y); + assert_eq!(_round_matrix(&result, 6), _round_matrix(&expected, 6)); + } + + #[test] + fn test_dsigmoid_edge_case() { + let y = Matrix::from_vec(vec![0.0, 0.5, 1.0], 3, 1); // Assume these are outputs from sigmoid(x) + let expected = Matrix::from_vec(vec![0.0, 0.25, 0.0], 3, 1); + let result = dsigmoid(&y); + + for (r, e) in result.data().iter().zip(expected.data().iter()) { + assert!((r - e).abs() < 1e-6); + } + } + + #[test] + fn test_drelu() { + let x = Matrix::from_vec(vec![-1.0, 0.0, 1.0], 3, 1); + let expected = Matrix::from_vec(vec![0.0, 0.0, 1.0], 3, 1); + assert_eq!(drelu(&x), expected); + } + + #[test] + fn test_drelu_edge_case() { + let x = Matrix::from_vec(vec![-1e-10, 
0.0, 1e10], 3, 1); + let expected = Matrix::from_vec(vec![0.0, 0.0, 1.0], 3, 1); + assert_eq!(drelu(&x), expected); + } + + #[test] + fn test_leaky_relu() { + let x = Matrix::from_vec(vec![-1.0, 0.0, 1.0], 3, 1); + let expected = Matrix::from_vec(vec![-0.01, 0.0, 1.0], 3, 1); + assert_eq!(leaky_relu(&x), expected); + } + + #[test] + fn test_leaky_relu_edge_case() { + let x = Matrix::from_vec(vec![-1e-10, 0.0, 1e10], 3, 1); + let expected = Matrix::from_vec(vec![-1e-12, 0.0, 1e10], 3, 1); + assert_eq!(leaky_relu(&x), expected); + } + + #[test] + fn test_dleaky_relu() { + let x = Matrix::from_vec(vec![-1.0, 0.0, 1.0], 3, 1); + let expected = Matrix::from_vec(vec![0.01, 0.01, 1.0], 3, 1); + assert_eq!(dleaky_relu(&x), expected); + } + + #[test] + fn test_dleaky_relu_edge_case() { + let x = Matrix::from_vec(vec![-1e-10, 0.0, 1e10], 3, 1); + let expected = Matrix::from_vec(vec![0.01, 0.01, 1.0], 3, 1); + assert_eq!(dleaky_relu(&x), expected); + } +} diff --git a/src/compute/models/dense_nn.rs b/src/compute/models/dense_nn.rs new file mode 100644 index 0000000..cdb2e32 --- /dev/null +++ b/src/compute/models/dense_nn.rs @@ -0,0 +1,524 @@ +use crate::compute::models::activations::{drelu, relu, sigmoid}; +use crate::matrix::{Matrix, SeriesOps}; +use rand::prelude::*; + +/// Supported activation functions +#[derive(Clone)] +pub enum ActivationKind { + Relu, + Sigmoid, + Tanh, +} + +impl ActivationKind { + /// Apply activation elementwise + pub fn forward(&self, z: &Matrix) -> Matrix { + match self { + ActivationKind::Relu => relu(z), + ActivationKind::Sigmoid => sigmoid(z), + ActivationKind::Tanh => z.map(|v| v.tanh()), + } + } + + /// Compute elementwise derivative w.r.t. pre-activation z + pub fn derivative(&self, z: &Matrix) -> Matrix { + match self { + ActivationKind::Relu => drelu(z), + ActivationKind::Sigmoid => { + let s = sigmoid(z); + s.zip(&s, |si, sj| si * (1.0 - sj)) + } + ActivationKind::Tanh => z.map(|v| 1.0 - v.tanh().powi(2)), + } + } +} + +/// Weight initialization schemes +#[derive(Clone)] +pub enum InitializerKind { + /// Uniform(-limit .. 
limit) + Uniform(f64), + /// Xavier/Glorot uniform + Xavier, + /// He (Kaiming) uniform + He, +} + +impl InitializerKind { + pub fn initialize(&self, rows: usize, cols: usize) -> Matrix { + let mut rng = rand::rng(); + let fan_in = rows; + let fan_out = cols; + let limit = match self { + InitializerKind::Uniform(l) => *l, + InitializerKind::Xavier => (6.0 / (fan_in + fan_out) as f64).sqrt(), + InitializerKind::He => (2.0 / fan_in as f64).sqrt(), + }; + let data = (0..rows * cols) + .map(|_| rng.random_range(-limit..limit)) + .collect::>(); + Matrix::from_vec(data, rows, cols) + } +} + +/// Supported losses +#[derive(Clone)] +pub enum LossKind { + /// Mean Squared Error: L = 1/m * sum((y_hat - y)^2) + MSE, + /// Binary Cross-Entropy: L = -1/m * sum(y*log(y_hat) + (1-y)*log(1-y_hat)) + BCE, +} + +impl LossKind { + /// Compute gradient dL/dy_hat (before applying activation derivative) + pub fn gradient(&self, y_hat: &Matrix, y: &Matrix) -> Matrix { + let m = y.rows() as f64; + match self { + LossKind::MSE => (y_hat - y) * (2.0 / m), + LossKind::BCE => (y_hat - y) * (1.0 / m), + } + } +} + +/// Configuration for a dense neural network +pub struct DenseNNConfig { + pub input_size: usize, + pub hidden_layers: Vec, + /// Must have length = hidden_layers.len() + 1 + pub activations: Vec, + pub output_size: usize, + pub initializer: InitializerKind, + pub loss: LossKind, + pub learning_rate: f64, + pub epochs: usize, +} + +/// A multi-layer perceptron with full configurability +pub struct DenseNN { + weights: Vec>, + biases: Vec>, + activations: Vec, + loss: LossKind, + lr: f64, + epochs: usize, +} + +impl DenseNN { + /// Build a new DenseNN from the given configuration + pub fn new(config: DenseNNConfig) -> Self { + let mut sizes = vec![config.input_size]; + sizes.extend(&config.hidden_layers); + sizes.push(config.output_size); + + assert_eq!( + config.activations.len(), + sizes.len() - 1, + "Number of activation functions must match number of layers" + ); + + let mut weights = Vec::with_capacity(sizes.len() - 1); + let mut biases = Vec::with_capacity(sizes.len() - 1); + + for i in 0..sizes.len() - 1 { + let w = config.initializer.initialize(sizes[i], sizes[i + 1]); + let b = Matrix::zeros(1, sizes[i + 1]); + weights.push(w); + biases.push(b); + } + + DenseNN { + weights, + biases, + activations: config.activations, + loss: config.loss, + lr: config.learning_rate, + epochs: config.epochs, + } + } + + /// Perform a full forward pass, returning pre-activations (z) and activations (a) + fn forward_full(&self, x: &Matrix) -> (Vec>, Vec>) { + let mut zs = Vec::with_capacity(self.weights.len()); + let mut activs = Vec::with_capacity(self.weights.len() + 1); + activs.push(x.clone()); + + let mut a = x.clone(); + for (i, (w, b)) in self.weights.iter().zip(self.biases.iter()).enumerate() { + let z = &a.dot(w) + &Matrix::repeat_rows(b, a.rows()); + let a_next = self.activations[i].forward(&z); + zs.push(z); + activs.push(a_next.clone()); + a = a_next; + } + + (zs, activs) + } + + /// Train the network on inputs X and targets Y + pub fn train(&mut self, x: &Matrix, y: &Matrix) { + let m = x.rows() as f64; + for _ in 0..self.epochs { + let (zs, activs) = self.forward_full(x); + let y_hat = activs.last().unwrap().clone(); + + // Initial delta (dL/dz) on output + let mut delta = match self.loss { + LossKind::BCE => self.loss.gradient(&y_hat, y), + LossKind::MSE => { + let grad = self.loss.gradient(&y_hat, y); + let dz = self + .activations + .last() + .unwrap() + .derivative(zs.last().unwrap()); + grad.zip(&dz, 
|g, da| g * da) + } + }; + + // Backpropagate through layers + for l in (0..self.weights.len()).rev() { + let a_prev = &activs[l]; + let dw = a_prev.transpose().dot(&delta) / m; + let db = Matrix::from_vec(delta.sum_vertical(), 1, delta.cols()) / m; + + // Update weights & biases + self.weights[l] = &self.weights[l] - &(dw * self.lr); + self.biases[l] = &self.biases[l] - &(db * self.lr); + + // Propagate delta to previous layer + if l > 0 { + let w_t = self.weights[l].transpose(); + let da = self.activations[l - 1].derivative(&zs[l - 1]); + delta = delta.dot(&w_t).zip(&da, |d, a| d * a); + } + } + } + } + + /// Run a forward pass and return the network's output + pub fn predict(&self, x: &Matrix) -> Matrix { + let mut a = x.clone(); + for (i, (w, b)) in self.weights.iter().zip(self.biases.iter()).enumerate() { + let z = &a.dot(w) + &Matrix::repeat_rows(b, a.rows()); + a = self.activations[i].forward(&z); + } + a + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::matrix::Matrix; + + /// Compute MSE = 1/m * Σ (ŷ - y)² + fn mse_loss(y_hat: &Matrix, y: &Matrix) -> f64 { + let m = y.rows() as f64; + y_hat + .zip(y, |yh, yv| (yh - yv).powi(2)) + .data() + .iter() + .sum::() + / m + } + + #[test] + fn test_predict_shape() { + let config = DenseNNConfig { + input_size: 1, + hidden_layers: vec![2], + activations: vec![ActivationKind::Relu, ActivationKind::Sigmoid], + output_size: 1, + initializer: InitializerKind::Uniform(0.1), + loss: LossKind::MSE, + learning_rate: 0.01, + epochs: 0, + }; + let model = DenseNN::new(config); + let x = Matrix::from_vec(vec![1.0, 2.0, 3.0], 3, 1); + let preds = model.predict(&x); + assert_eq!(preds.rows(), 3); + assert_eq!(preds.cols(), 1); + } + + #[test] + #[should_panic(expected = "Number of activation functions must match number of layers")] + fn test_invalid_activation_count() { + let config = DenseNNConfig { + input_size: 2, + hidden_layers: vec![3], + activations: vec![ActivationKind::Relu], // Only one activation for two layers + output_size: 1, + initializer: InitializerKind::Uniform(0.1), + loss: LossKind::MSE, + learning_rate: 0.01, + epochs: 0, + }; + let _model = DenseNN::new(config); + } + + #[test] + fn test_train_no_epochs_does_nothing() { + let config = DenseNNConfig { + input_size: 1, + hidden_layers: vec![2], + activations: vec![ActivationKind::Relu, ActivationKind::Sigmoid], + output_size: 1, + initializer: InitializerKind::Uniform(0.1), + loss: LossKind::MSE, + learning_rate: 0.01, + epochs: 0, + }; + let mut model = DenseNN::new(config); + let x = Matrix::from_vec(vec![0.0, 1.0], 2, 1); + let y = Matrix::from_vec(vec![0.0, 1.0], 2, 1); + + let before = model.predict(&x); + model.train(&x, &y); + let after = model.predict(&x); + + for i in 0..before.rows() { + for j in 0..before.cols() { + // "prediction changed despite 0 epochs" + assert!((before[(i, j)] - after[(i, j)]).abs() < 1e-12); + } + } + } + + #[test] + fn test_train_one_epoch_changes_predictions() { + // Single-layer sigmoid regression so gradients flow. 
+ let config = DenseNNConfig { + input_size: 1, + hidden_layers: vec![], + activations: vec![ActivationKind::Sigmoid], + output_size: 1, + initializer: InitializerKind::Uniform(0.1), + loss: LossKind::MSE, + learning_rate: 1.0, + epochs: 1, + }; + let mut model = DenseNN::new(config); + + let x = Matrix::from_vec(vec![0.0, 1.0], 2, 1); + let y = Matrix::from_vec(vec![0.0, 1.0], 2, 1); + + let before = model.predict(&x); + model.train(&x, &y); + let after = model.predict(&x); + + // At least one of the two outputs must move by >ϵ + let mut moved = false; + for i in 0..before.rows() { + if (before[(i, 0)] - after[(i, 0)]).abs() > 1e-8 { + moved = true; + } + } + assert!(moved, "predictions did not change after 1 epoch"); + } + + #[test] + fn test_training_reduces_mse_loss() { + // Same single‐layer sigmoid setup; check loss goes down. + let config = DenseNNConfig { + input_size: 1, + hidden_layers: vec![], + activations: vec![ActivationKind::Sigmoid], + output_size: 1, + initializer: InitializerKind::Uniform(0.1), + loss: LossKind::MSE, + learning_rate: 1.0, + epochs: 10, + }; + let mut model = DenseNN::new(config); + + let x = Matrix::from_vec(vec![0.0, 1.0, 0.5], 3, 1); + let y = Matrix::from_vec(vec![0.0, 1.0, 0.5], 3, 1); + + let before_preds = model.predict(&x); + let before_loss = mse_loss(&before_preds, &y); + + model.train(&x, &y); + + let after_preds = model.predict(&x); + let after_loss = mse_loss(&after_preds, &y); + + // MSE did not decrease (before: {}, after: {}) + assert!(after_loss < before_loss); + } + + #[test] + fn test_activation_kind_forward_tanh() { + let input = Matrix::from_vec(vec![-1.0, 0.0, 1.0], 3, 1); + let expected = Matrix::from_vec(vec![-0.76159415595, 0.0, 0.76159415595], 3, 1); + let output = ActivationKind::Tanh.forward(&input); + + for i in 0..input.rows() { + for j in 0..input.cols() { + // Tanh forward output mismatch at ({}, {}) + assert!((output[(i, j)] - expected[(i, j)]).abs() < 1e-9); + } + } + } + + #[test] + fn test_activation_kind_derivative_relu() { + let input = Matrix::from_vec(vec![-1.0, 0.0, 1.0], 3, 1); + let expected = Matrix::from_vec(vec![0.0, 0.0, 1.0], 3, 1); + let output = ActivationKind::Relu.derivative(&input); + + for i in 0..input.rows() { + for j in 0..input.cols() { + // "ReLU derivative output mismatch at ({}, {})" + assert!((output[(i, j)] - expected[(i, j)]).abs() < 1e-9); + } + } + } + + #[test] + fn test_activation_kind_derivative_tanh() { + let input = Matrix::from_vec(vec![-1.0, 0.0, 1.0], 3, 1); + let expected = Matrix::from_vec(vec![0.41997434161, 1.0, 0.41997434161], 3, 1); // 1 - tanh(x)^2 + let output = ActivationKind::Tanh.derivative(&input); + + for i in 0..input.rows() { + for j in 0..input.cols() { + // "Tanh derivative output mismatch at ({}, {})" + assert!((output[(i, j)] - expected[(i, j)]).abs() < 1e-9); + } + } + } + + #[test] + fn test_initializer_kind_xavier() { + let rows = 10; + let cols = 20; + let initializer = InitializerKind::Xavier; + let matrix = initializer.initialize(rows, cols); + let limit = (6.0 / (rows + cols) as f64).sqrt(); + + assert_eq!(matrix.rows(), rows); + assert_eq!(matrix.cols(), cols); + + for val in matrix.data() { + // Xavier initialized value out of range + assert!(*val >= -limit && *val <= limit); + } + } + + #[test] + fn test_initializer_kind_he() { + let rows = 10; + let cols = 20; + let initializer = InitializerKind::He; + let matrix = initializer.initialize(rows, cols); + let limit = (2.0 / rows as f64).sqrt(); + + assert_eq!(matrix.rows(), rows); + 
assert_eq!(matrix.cols(), cols); + + for val in matrix.data() { + // He initialized value out of range + assert!(*val >= -limit && *val <= limit); + } + } + + #[test] + fn test_loss_kind_bce_gradient() { + let y_hat = Matrix::from_vec(vec![0.1, 0.9, 0.4], 3, 1); + let y = Matrix::from_vec(vec![0.0, 1.0, 0.5], 3, 1); + let expected_gradient = Matrix::from_vec(vec![0.1 / 3.0, -0.1 / 3.0, -0.1 / 3.0], 3, 1); // (y_hat - y) * (1.0 / m) + let output_gradient = LossKind::BCE.gradient(&y_hat, &y); + + assert_eq!(output_gradient.rows(), expected_gradient.rows()); + assert_eq!(output_gradient.cols(), expected_gradient.cols()); + + for i in 0..output_gradient.rows() { + for j in 0..output_gradient.cols() { + // BCE gradient output mismatch at ({}, {}) + assert!((output_gradient[(i, j)] - expected_gradient[(i, j)]).abs() < 1e-9); + } + } + } + + #[test] + fn test_training_reduces_bce_loss() { + // Single-layer sigmoid setup; check BCE loss goes down. + let config = DenseNNConfig { + input_size: 1, + hidden_layers: vec![], + activations: vec![ActivationKind::Sigmoid], + output_size: 1, + initializer: InitializerKind::Uniform(0.1), + loss: LossKind::BCE, + learning_rate: 1.0, + epochs: 10, + }; + let mut model = DenseNN::new(config); + + let x = Matrix::from_vec(vec![0.0, 1.0, 0.5], 3, 1); + let y = Matrix::from_vec(vec![0.0, 1.0, 0.5], 3, 1); + + let before_preds = model.predict(&x); + // BCE loss calculation for testing + let before_loss = -1.0 / (y.rows() as f64) + * before_preds + .zip(&y, |yh, yv| yv * yh.ln() + (1.0 - yv) * (1.0 - yh).ln()) + .data() + .iter() + .sum::(); + + model.train(&x, &y); + + let after_preds = model.predict(&x); + let after_loss = -1.0 / (y.rows() as f64) + * after_preds + .zip(&y, |yh, yv| yv * yh.ln() + (1.0 - yv) * (1.0 - yh).ln()) + .data() + .iter() + .sum::(); + + // BCE did not decrease (before: {}, after: {}) + assert!(after_loss < before_loss,); + } + + #[test] + fn test_train_backprop_delta_propagation() { + // Network with two layers to test delta propagation to previous layer (l > 0) + let config = DenseNNConfig { + input_size: 2, + hidden_layers: vec![3], + activations: vec![ActivationKind::Sigmoid, ActivationKind::Sigmoid], + output_size: 1, + initializer: InitializerKind::Uniform(0.1), + loss: LossKind::MSE, + learning_rate: 0.1, + epochs: 1, + }; + let mut model = DenseNN::new(config); + + // Store initial weights and biases to compare after training + let initial_weights_l0 = model.weights[0].clone(); + let initial_biases_l0 = model.biases[0].clone(); + let initial_weights_l1 = model.weights[1].clone(); + let initial_biases_l1 = model.biases[1].clone(); + + let x = Matrix::from_vec(vec![0.1, 0.2, 0.3, 0.4], 2, 2); + let y = Matrix::from_vec(vec![0.5, 0.6], 2, 1); + + model.train(&x, &y); + + // Verify that weights and biases of both layers have changed, + // implying delta propagation occurred for l > 0 + + // Weights of first layer did not change, delta propagation might not have occurred + assert!(model.weights[0] != initial_weights_l0); + // Biases of first layer did not change, delta propagation might not have occurred + assert!(model.biases[0] != initial_biases_l0); + // Weights of second layer did not change + assert!(model.weights[1] != initial_weights_l1); + // Biases of second layer did not change + assert!(model.biases[1] != initial_biases_l1); + } +} diff --git a/src/compute/models/gaussian_nb.rs b/src/compute/models/gaussian_nb.rs new file mode 100644 index 0000000..1e8f38c --- /dev/null +++ b/src/compute/models/gaussian_nb.rs @@ -0,0 +1,230 
@@
+use crate::matrix::Matrix;
+use std::collections::HashMap;
+
+/// A Gaussian Naive Bayes classifier.
+///
+/// # Parameters
+/// - `var_smoothing`: Portion of the largest variance of all features to add to variances for stability.
+/// - `use_unbiased_variance`: If `true`, uses Bessel's correction (dividing by (n-1)); otherwise divides by n.
+///
+pub struct GaussianNB {
+    // Distinct class labels
+    classes: Vec<f64>,
+    // Prior probabilities P(class)
+    priors: Vec<f64>,
+    // Feature means per class
+    means: Vec<Matrix>,
+    // Feature variances per class
+    variances: Vec<Matrix>,
+    // var_smoothing
+    eps: f64,
+    // flag for unbiased variance
+    use_unbiased: bool,
+}
+
+impl GaussianNB {
+    /// Create a new GaussianNB.
+    ///
+    /// # Arguments
+    /// * `var_smoothing` - small float added to variances for numerical stability.
+    /// * `use_unbiased_variance` - whether to apply Bessel's correction (divide by n-1).
+    pub fn new(var_smoothing: f64, use_unbiased_variance: bool) -> Self {
+        Self {
+            classes: Vec::new(),
+            priors: Vec::new(),
+            means: Vec::new(),
+            variances: Vec::new(),
+            eps: var_smoothing,
+            use_unbiased: use_unbiased_variance,
+        }
+    }
+
+    /// Fit the model according to the training data `x` and labels `y`.
+    ///
+    /// # Panics
+    /// Panics if `x` or `y` is empty, or if their dimensions disagree.
+    pub fn fit(&mut self, x: &Matrix, y: &Matrix) {
+        let m = x.rows();
+        let n = x.cols();
+        assert_eq!(y.rows(), m, "Row count of X and Y must match");
+        assert_eq!(y.cols(), 1, "Y must be a column vector");
+        if m == 0 || n == 0 {
+            panic!("Input matrix x or y is empty");
+        }
+
+        // Group sample indices by label (keyed by the label's bit pattern)
+        let mut groups: HashMap<u64, Vec<usize>> = HashMap::new();
+        for i in 0..m {
+            let label = y[(i, 0)];
+            let bits = label.to_bits();
+            groups.entry(bits).or_default().push(i);
+        }
+        // m > 0 was checked above, so at least one class label must exist.
+        assert!(!groups.is_empty(), "No class labels found in y");
+
+        // Extract and sort class labels
+        self.classes = groups.keys().cloned().map(f64::from_bits).collect();
+        self.classes.sort_by(|a, b| a.partial_cmp(b).unwrap());
+
+        self.priors.clear();
+        self.means.clear();
+        self.variances.clear();
+
+        // Precompute max variance for smoothing scale
+        let mut max_var_feature = 0.0;
+        for j in 0..n {
+            let mut col_vals = Vec::with_capacity(m);
+            for i in 0..m {
+                col_vals.push(x[(i, j)]);
+            }
+            let mean_all = col_vals.iter().sum::<f64>() / m as f64;
+            let var_all = col_vals.iter().map(|v| (v - mean_all).powi(2)).sum::<f64>() / m as f64;
+            if var_all > max_var_feature {
+                max_var_feature = var_all;
+            }
+        }
+        let smoothing = self.eps * max_var_feature;
+
+        // Compute per-class statistics
+        for &c in &self.classes {
+            let idx = &groups[&c.to_bits()];
+            let count = idx.len();
+            // Prior
+            self.priors.push(count as f64 / m as f64);
+
+            let mut mean = Matrix::zeros(1, n);
+            let mut var = Matrix::zeros(1, n);
+
+            // Mean
+            for &i in idx {
+                for j in 0..n {
+                    mean[(0, j)] += x[(i, j)];
+                }
+            }
+            for j in 0..n {
+                mean[(0, j)] /= count as f64;
+            }
+
+            // Variance
+            for &i in idx {
+                for j in 0..n {
+                    let d = x[(i, j)] - mean[(0, j)];
+                    var[(0, j)] += d * d;
+                }
+            }
+            let denom = if self.use_unbiased {
+                (count as f64 - 1.0).max(1.0)
+            } else {
+                count as f64
+            };
+            for j in 0..n {
+                var[(0, j)] = var[(0, j)] / denom + smoothing;
+                if var[(0, j)] <= 0.0 {
+                    var[(0, j)] = smoothing;
+                }
+            }
+
+            self.means.push(mean);
+            self.variances.push(var);
+        }
+    }
+
+    /// Perform classification on an array of test vectors `x`.
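+    ///
+    /// Illustrative usage (a sketch mirroring the unit tests below):
+    ///
+    /// ```ignore
+    /// // One feature, two well-separated classes.
+    /// let x = Matrix::from_vec(vec![1.0, 1.2, 0.8, 3.0, 3.2, 2.8], 6, 1);
+    /// let y = Matrix::from_vec(vec![0.0, 0.0, 0.0, 1.0, 1.0, 1.0], 6, 1);
+    /// let mut clf = GaussianNB::new(1e-9, false);
+    /// clf.fit(&x, &y);
+    /// let preds = clf.predict(&Matrix::from_vec(vec![1.1, 3.1], 2, 1));
+    /// assert_eq!(preds[(0, 0)], 0.0);
+    /// assert_eq!(preds[(1, 0)], 1.0);
+    /// ```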
+ pub fn predict(&self, x: &Matrix) -> Matrix { + let m = x.rows(); + let n = x.cols(); + let k = self.classes.len(); + let mut preds = Matrix::zeros(m, 1); + let ln_2pi = (2.0 * std::f64::consts::PI).ln(); + + for i in 0..m { + let mut best = (0, f64::NEG_INFINITY); + for c_idx in 0..k { + let mut log_prob = self.priors[c_idx].ln(); + for j in 0..n { + let diff = x[(i, j)] - self.means[c_idx][(0, j)]; + let var = self.variances[c_idx][(0, j)]; + log_prob += -0.5 * (diff * diff / var + var.ln() + ln_2pi); + } + if log_prob > best.1 { + best = (c_idx, log_prob); + } + } + preds[(i, 0)] = self.classes[best.0]; + } + preds + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::matrix::Matrix; + + #[test] + fn test_simple_two_class() { + // Simple dataset: one feature, two classes 0 and 1 + // Class 0: values [1.0, 1.2, 0.8] + // Class 1: values [3.0, 3.2, 2.8] + let x = Matrix::from_vec(vec![1.0, 1.2, 0.8, 3.0, 3.2, 2.8], 6, 1); + let y = Matrix::from_vec(vec![0.0, 0.0, 0.0, 1.0, 1.0, 1.0], 6, 1); + let mut clf = GaussianNB::new(1e-9, false); + clf.fit(&x, &y); + let test = Matrix::from_vec(vec![1.1, 3.1], 2, 1); + let preds = clf.predict(&test); + assert_eq!(preds[(0, 0)], 0.0); + assert_eq!(preds[(1, 0)], 1.0); + } + + #[test] + fn test_unbiased_variance() { + // Same as above but with unbiased variance + let x = Matrix::from_vec(vec![2.0, 2.2, 1.8, 4.0, 4.2, 3.8], 6, 1); + let y = Matrix::from_vec(vec![0.0, 0.0, 0.0, 1.0, 1.0, 1.0], 6, 1); + let mut clf = GaussianNB::new(1e-9, true); + clf.fit(&x, &y); + let test = Matrix::from_vec(vec![2.1, 4.1], 2, 1); + let preds = clf.predict(&test); + assert_eq!(preds[(0, 0)], 0.0); + assert_eq!(preds[(1, 0)], 1.0); + } + + #[test] + #[should_panic] + fn test_empty_input() { + let x = Matrix::zeros(0, 0); + let y = Matrix::zeros(0, 1); + let mut clf = GaussianNB::new(1e-9, false); + clf.fit(&x, &y); + } + + #[test] + #[should_panic = "Row count of X and Y must match"] + fn test_mismatched_rows() { + let x = Matrix::from_vec(vec![1.0, 2.0], 2, 1); + let y = Matrix::from_vec(vec![0.0], 1, 1); + let mut clf = GaussianNB::new(1e-9, false); + clf.fit(&x, &y); + } + + #[test] + fn test_variance_smoothing_override_with_zero_smoothing() { + // Scenario: var_smoothing is 0, and a feature has zero variance within a class. + // This should trigger the `if var[(0, j)] <= 0.0 { var[(0, j)] = smoothing; }` line. + let x = Matrix::from_vec(vec![1.0, 1.0, 2.0], 3, 1); // Class 0: [1.0, 1.0], Class 1: [2.0] + let y = Matrix::from_vec(vec![0.0, 0.0, 1.0], 3, 1); + let mut clf = GaussianNB::new(0.0, false); // var_smoothing = 0.0 + clf.fit(&x, &y); + + // For class 0 (index 0 in clf.classes), the feature (index 0) had values [1.0, 1.0], so variance was 0. + // Since var_smoothing was 0, smoothing is 0. + // The line `var[(0, j)] = smoothing;` should have set the variance to 0.0. + let class_0_idx = clf.classes.iter().position(|&c| c == 0.0).unwrap(); + assert_eq!(clf.variances[class_0_idx][(0, 0)], 0.0); + + // For class 1 (index 1 in clf.classes), the feature (index 0) had value [2.0]. + // Variance calculation for a single point results in 0. + // The if condition will be true, and var[(0, j)] will be set to smoothing (0.0). 
+ let class_1_idx = clf.classes.iter().position(|&c| c == 1.0).unwrap(); + assert_eq!(clf.variances[class_1_idx][(0, 0)], 0.0); + } +} diff --git a/src/compute/models/k_means.rs b/src/compute/models/k_means.rs new file mode 100644 index 0000000..83412e3 --- /dev/null +++ b/src/compute/models/k_means.rs @@ -0,0 +1,364 @@ +use crate::compute::stats::mean_vertical; +use crate::matrix::Matrix; +use rand::rng; +use rand::seq::SliceRandom; + +pub struct KMeans { + pub centroids: Matrix, // (k, n_features) +} + +impl KMeans { + /// Fit with k clusters. + pub fn fit(x: &Matrix, k: usize, max_iter: usize, tol: f64) -> (Self, Vec) { + let m = x.rows(); + let n = x.cols(); + assert!(k <= m, "k must be ≤ number of samples"); + + // ----- initialise centroids ----- + let mut centroids = Matrix::zeros(k, n); + if k > 0 && m > 0 { + // case for empty data + if k == 1 { + let mean = mean_vertical(x); + centroids.row_copy_from_slice(0, &mean.data()); // ideally, data.row(0), but thats the same + } else { + // For k > 1, pick k distinct rows at random + let mut rng = rng(); + let mut indices: Vec = (0..m).collect(); + indices.shuffle(&mut rng); + for c in 0..k { + centroids.row_copy_from_slice(c, &x.row(indices[c])); + } + } + } + + let mut labels = vec![0usize; m]; + let mut distances = vec![0.0f64; m]; + + for _iter in 0..max_iter { + let mut changed = false; + // ----- assignment step ----- + for i in 0..m { + let sample_row = x.row(i); + let mut best = 0usize; + let mut best_dist_sq = f64::MAX; + + for c in 0..k { + let centroid_row = centroids.row(c); + + let dist_sq: f64 = sample_row + .iter() + .zip(centroid_row.iter()) + .map(|(a, b)| (a - b).powi(2)) + .sum(); + + if dist_sq < best_dist_sq { + best_dist_sq = dist_sq; + best = c; + } + } + + distances[i] = best_dist_sq; + + if labels[i] != best { + labels[i] = best; + changed = true; + } + } + + // ----- update step ----- + let mut new_centroids = Matrix::zeros(k, n); + let mut counts = vec![0usize; k]; + for i in 0..m { + let c = labels[i]; + counts[c] += 1; + for j in 0..n { + new_centroids[(c, j)] += x[(i, j)]; + } + } + + for c in 0..k { + if counts[c] == 0 { + // This cluster is empty. Re-initialize its centroid to the point + // furthest from its assigned centroid to prevent the cluster from dying. + let mut furthest_point_idx = 0; + let mut max_dist_sq = 0.0; + for (i, &dist) in distances.iter().enumerate() { + if dist > max_dist_sq { + max_dist_sq = dist; + furthest_point_idx = i; + } + } + + for j in 0..n { + new_centroids[(c, j)] = x[(furthest_point_idx, j)]; + } + // Ensure this point isn't chosen again for another empty cluster in the same iteration. + if m > 0 { + distances[furthest_point_idx] = 0.0; + } + } else { + // Normalize the centroid by the number of points in it. + for j in 0..n { + new_centroids[(c, j)] /= counts[c] as f64; + } + } + } + + // ----- convergence test ----- + if !changed { + centroids = new_centroids; // update before breaking + break; // assignments stable + } + + let diff = &new_centroids - ¢roids; + centroids = new_centroids; // Update for the next iteration + + if tol > 0.0 { + let sq_diff = &diff * &diff; + let shift = sq_diff.data().iter().sum::().sqrt(); + if shift < tol { + break; + } + } + } + (Self { centroids }, labels) + } + + /// Predict nearest centroid for each sample. 
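+    ///
+    /// Illustrative usage (a sketch mirroring the unit tests below; `FloatMatrix`
+    /// and `from_rows_vec` are the helpers those tests use):
+    ///
+    /// ```ignore
+    /// // Two well-separated clusters in 2-D (same data as `create_test_data`).
+    /// let x = FloatMatrix::from_rows_vec(
+    ///     vec![1.0, 1.0, 1.5, 1.5, 5.0, 8.0, 8.0, 8.0, 6.0, 7.0],
+    ///     5,
+    ///     2,
+    /// );
+    /// let (model, labels) = KMeans::fit(&x, 2, 100, 1e-6);
+    /// assert_eq!(labels.len(), 5);
+    /// // New samples are assigned to the nearest learned centroid.
+    /// let new_labels = model.predict(&FloatMatrix::from_rows_vec(vec![1.2, 1.3], 1, 2));
+    /// assert_eq!(new_labels.len(), 1);
+    /// ```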
+ pub fn predict(&self, x: &Matrix) -> Vec { + let m = x.rows(); + let k = self.centroids.rows(); + + if m == 0 { + return Vec::new(); + } + + let mut labels = vec![0usize; m]; + for i in 0..m { + let sample_row = x.row(i); + let mut best = 0usize; + let mut best_dist_sq = f64::MAX; + + for c in 0..k { + let centroid_row = self.centroids.row(c); + + let dist_sq: f64 = sample_row + .iter() + .zip(centroid_row.iter()) + .map(|(a, b)| (a - b).powi(2)) + .sum(); + + if dist_sq < best_dist_sq { + best_dist_sq = dist_sq; + best = c; + } + } + labels[i] = best; + } + labels + } +} + +#[cfg(test)] +mod tests { + #[test] + fn test_k_means_empty_cluster_reinit_centroid() { + // Try multiple times to increase the chance of hitting the empty cluster case + for _ in 0..20 { + let data = vec![0.0, 0.0, 0.0, 0.0, 10.0, 10.0]; + let x = FloatMatrix::from_rows_vec(data, 3, 2); + let k = 2; + let max_iter = 10; + let tol = 1e-6; + + let (kmeans_model, labels) = KMeans::fit(&x, k, max_iter, tol); + + // Check if any cluster is empty + let mut counts = vec![0; k]; + for &label in &labels { + counts[label] += 1; + } + if counts.iter().any(|&c| c == 0) { + // Only check the property for clusters that are empty + let centroids = kmeans_model.centroids; + for c in 0..k { + if counts[c] == 0 { + let mut matches_data_point = false; + for i in 0..3 { + let dx = centroids[(c, 0)] - x[(i, 0)]; + let dy = centroids[(c, 1)] - x[(i, 1)]; + if dx.abs() < 1e-9 && dy.abs() < 1e-9 { + matches_data_point = true; + break; + } + } + assert!(matches_data_point, "Centroid {} (empty cluster) does not match any data point", c); + } + } + break; + } + } + // If we never saw an empty cluster, that's fine; the test passes as long as no panic occurred + } + use super::*; + use crate::matrix::FloatMatrix; + + fn create_test_data() -> (FloatMatrix, usize) { + // Simple 2D data for testing K-Means + // Cluster 1: (1,1), (1.5,1.5) + // Cluster 2: (5,8), (8,8), (6,7) + let data = vec![ + 1.0, 1.0, // Sample 0 + 1.5, 1.5, // Sample 1 + 5.0, 8.0, // Sample 2 + 8.0, 8.0, // Sample 3 + 6.0, 7.0, // Sample 4 + ]; + let x = FloatMatrix::from_rows_vec(data, 5, 2); + let k = 2; + (x, k) + } + + // Helper for single cluster test with exact mean + fn create_simple_integer_data() -> FloatMatrix { + // Data points: (1,1), (2,2), (3,3) + FloatMatrix::from_rows_vec(vec![1.0, 1.0, 2.0, 2.0, 3.0, 3.0], 3, 2) + } + + #[test] + fn test_k_means_fit_predict_basic() { + let (x, k) = create_test_data(); + let max_iter = 100; + let tol = 1e-6; + + let (kmeans_model, labels) = KMeans::fit(&x, k, max_iter, tol); + + // Assertions for fit + assert_eq!(kmeans_model.centroids.rows(), k); + assert_eq!(kmeans_model.centroids.cols(), x.cols()); + assert_eq!(labels.len(), x.rows()); + + // Check if labels are within expected range (0 to k-1) + for &label in &labels { + assert!(label < k); + } + + // Predict with the same data + let predicted_labels = kmeans_model.predict(&x); + + // The exact labels might vary due to random initialization, + // but the clustering should be consistent. + // We expect two clusters. Let's check if samples 0,1 are in one cluster + // and samples 2,3,4 are in another. 
let cluster_0_members = vec![labels[0], labels[1]];
+        let cluster_1_members = vec![labels[2], labels[3], labels[4]];
+
+        // All members of cluster 0 should have the same label
+        assert_eq!(cluster_0_members[0], cluster_0_members[1]);
+        // All members of cluster 1 should have the same label
+        assert_eq!(cluster_1_members[0], cluster_1_members[1]);
+        assert_eq!(cluster_1_members[0], cluster_1_members[2]);
+        // The two clusters should have different labels
+        assert_ne!(cluster_0_members[0], cluster_1_members[0]);
+
+        // Check predicted labels are consistent with fitted labels
+        assert_eq!(labels, predicted_labels);
+
+        // Test with a new sample
+        let new_sample_data = vec![1.2, 1.3]; // Should be close to cluster 0
+        let new_sample = FloatMatrix::from_rows_vec(new_sample_data, 1, 2);
+        let new_sample_label = kmeans_model.predict(&new_sample)[0];
+        assert_eq!(new_sample_label, cluster_0_members[0]);
+
+        let new_sample_data_2 = vec![7.0, 7.5]; // Should be close to cluster 1
+        let new_sample_2 = FloatMatrix::from_rows_vec(new_sample_data_2, 1, 2);
+        let new_sample_label_2 = kmeans_model.predict(&new_sample_2)[0];
+        assert_eq!(new_sample_label_2, cluster_1_members[0]);
+    }
+
+    #[test]
+    fn test_k_means_fit_k_equals_m() {
+        // Test case where k (number of clusters) equals m (number of samples)
+        let (x, _) = create_test_data(); // 5 samples
+        let k = 5; // 5 clusters
+        let max_iter = 10;
+        let tol = 1e-6;
+
+        let (kmeans_model, labels) = KMeans::fit(&x, k, max_iter, tol);
+
+        assert_eq!(kmeans_model.centroids.rows(), k);
+        assert_eq!(labels.len(), x.rows());
+
+        // Each sample should be its own cluster. Due to random init, labels
+        // might not be [0,1,2,3,4] but will be a permutation of it.
+        let mut sorted_labels = labels.clone();
+        sorted_labels.sort_unstable();
+        sorted_labels.dedup();
+        // Labels should all be unique when k == m
+        assert_eq!(sorted_labels.len(), k);
+    }
+
+    #[test]
+    #[should_panic(expected = "k must be ≤ number of samples")]
+    fn test_k_means_fit_k_greater_than_m() {
+        let (x, _) = create_test_data(); // 5 samples
+        let k = 6; // k > m
+        let max_iter = 10;
+        let tol = 1e-6;
+
+        let (_kmeans_model, _labels) = KMeans::fit(&x, k, max_iter, tol);
+    }
+
+    #[test]
+    fn test_k_means_fit_single_cluster() {
+        // Test with k=1
+        let x = create_simple_integer_data(); // Use integer-valued data so the mean is exact
+        let k = 1;
+        let max_iter = 100;
+        let tol = 1e-6;
+
+        let (kmeans_model, labels) = KMeans::fit(&x, k, max_iter, tol);
+
+        assert_eq!(kmeans_model.centroids.rows(), 1);
+        assert_eq!(labels.len(), x.rows());
+
+        // All labels should be 0
+        assert!(labels.iter().all(|&l| l == 0));
+
+        // Centroid should be the mean of all data points
+        let expected_centroid_x = x.column(0).iter().sum::<f64>() / x.rows() as f64;
+        let expected_centroid_y = x.column(1).iter().sum::<f64>() / x.rows() as f64;
+
+        assert!((kmeans_model.centroids[(0, 0)] - expected_centroid_x).abs() < 1e-9);
+        assert!((kmeans_model.centroids[(0, 1)] - expected_centroid_y).abs() < 1e-9);
+    }
+
+    #[test]
+    fn test_k_means_predict_empty_matrix() {
+        let (x, k) = create_test_data();
+        let max_iter = 10;
+        let tol = 1e-6;
+        let (kmeans_model, _labels) = KMeans::fit(&x, k, max_iter, tol);
+
+        // The `Matrix` type does not support 0xN or Nx0 matrices, so a 0x0
+        // matrix is the only empty-input edge case that can be constructed here.
+ let empty_x = FloatMatrix::from_rows_vec(vec![], 0, 0); + let predicted_labels = kmeans_model.predict(&empty_x); + assert!(predicted_labels.is_empty()); + } + + #[test] + fn test_k_means_predict_single_sample() { + let (x, k) = create_test_data(); + let max_iter = 10; + let tol = 1e-6; + let (kmeans_model, _labels) = KMeans::fit(&x, k, max_iter, tol); + + let single_sample = FloatMatrix::from_rows_vec(vec![1.1, 1.2], 1, 2); + let predicted_label = kmeans_model.predict(&single_sample); + assert_eq!(predicted_label.len(), 1); + assert!(predicted_label[0] < k); + } + +} diff --git a/src/compute/models/linreg.rs b/src/compute/models/linreg.rs new file mode 100644 index 0000000..ba76197 --- /dev/null +++ b/src/compute/models/linreg.rs @@ -0,0 +1,54 @@ +use crate::matrix::{Matrix, SeriesOps}; + +pub struct LinReg { + w: Matrix, // shape (n_features, 1) + b: f64, +} + +impl LinReg { + pub fn new(n_features: usize) -> Self { + Self { + w: Matrix::from_vec(vec![0.0; n_features], n_features, 1), + b: 0.0, + } + } + + pub fn predict(&self, x: &Matrix) -> Matrix { + // X.dot(w) + b + x.dot(&self.w) + self.b + } + + pub fn fit(&mut self, x: &Matrix, y: &Matrix, lr: f64, epochs: usize) { + let m = x.rows() as f64; + for _ in 0..epochs { + let y_hat = self.predict(x); + let err = &y_hat - y; // shape (m,1) + + // grads + let grad_w = x.transpose().dot(&err) * (2.0 / m); // (n,1) + let grad_b = (2.0 / m) * err.sum_vertical().iter().sum::(); + // update + self.w = &self.w - &(grad_w * lr); + self.b -= lr * grad_b; + } + } +} + +#[cfg(test)] +mod tests { + + use super::*; + + #[test] + fn test_linreg_fit_predict() { + let x = Matrix::from_vec(vec![1.0, 2.0, 3.0, 4.0], 4, 1); + let y = Matrix::from_vec(vec![2.0, 3.0, 4.0, 5.0], 4, 1); + let mut model = LinReg::new(1); + model.fit(&x, &y, 0.01, 10000); + let preds = model.predict(&x); + assert!((preds[(0, 0)] - 2.0).abs() < 1e-2); + assert!((preds[(1, 0)] - 3.0).abs() < 1e-2); + assert!((preds[(2, 0)] - 4.0).abs() < 1e-2); + assert!((preds[(3, 0)] - 5.0).abs() < 1e-2); + } +} diff --git a/src/compute/models/logreg.rs b/src/compute/models/logreg.rs new file mode 100644 index 0000000..ac224aa --- /dev/null +++ b/src/compute/models/logreg.rs @@ -0,0 +1,55 @@ +use crate::compute::models::activations::sigmoid; +use crate::matrix::{Matrix, SeriesOps}; + +pub struct LogReg { + w: Matrix, + b: f64, +} + +impl LogReg { + pub fn new(n_features: usize) -> Self { + Self { + w: Matrix::zeros(n_features, 1), + b: 0.0, + } + } + + pub fn predict_proba(&self, x: &Matrix) -> Matrix { + sigmoid(&(x.dot(&self.w) + self.b)) // σ(Xw + b) + } + + pub fn fit(&mut self, x: &Matrix, y: &Matrix, lr: f64, epochs: usize) { + let m = x.rows() as f64; + for _ in 0..epochs { + let p = self.predict_proba(x); // shape (m,1) + let err = &p - y; // derivative of BCE wrt pre-sigmoid + let grad_w = x.transpose().dot(&err) / m; + let grad_b = err.sum_vertical().iter().sum::() / m; + self.w = &self.w - &(grad_w * lr); + self.b -= lr * grad_b; + } + } + + pub fn predict(&self, x: &Matrix) -> Matrix { + self.predict_proba(x) + .map(|p| if p >= 0.5 { 1.0 } else { 0.0 }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_logreg_fit_predict() { + let x = Matrix::from_vec(vec![1.0, 2.0, 3.0, 4.0], 4, 1); + let y = Matrix::from_vec(vec![0.0, 0.0, 1.0, 1.0], 4, 1); + let mut model = LogReg::new(1); + model.fit(&x, &y, 0.01, 10000); + let preds = model.predict(&x); + assert_eq!(preds[(0, 0)], 0.0); + assert_eq!(preds[(1, 0)], 0.0); + assert_eq!(preds[(2, 0)], 1.0); + 
assert_eq!(preds[(3, 0)], 1.0); + } +} diff --git a/src/compute/models/mod.rs b/src/compute/models/mod.rs new file mode 100644 index 0000000..560b9f2 --- /dev/null +++ b/src/compute/models/mod.rs @@ -0,0 +1,7 @@ +pub mod activations; +pub mod dense_nn; +pub mod gaussian_nb; +pub mod k_means; +pub mod linreg; +pub mod logreg; +pub mod pca; diff --git a/src/compute/models/pca.rs b/src/compute/models/pca.rs new file mode 100644 index 0000000..f75231d --- /dev/null +++ b/src/compute/models/pca.rs @@ -0,0 +1,114 @@ +use crate::compute::stats::correlation::covariance_matrix; +use crate::compute::stats::descriptive::mean_vertical; +use crate::matrix::{Axis, Matrix, SeriesOps}; + +/// Returns the `n_components` principal axes (rows) and the centred data's mean. +pub struct PCA { + pub components: Matrix, // (n_components, n_features) + pub mean: Matrix, // (1, n_features) +} + +impl PCA { + pub fn fit(x: &Matrix, n_components: usize, _iters: usize) -> Self { + let mean = mean_vertical(x); // Mean of each feature (column) + let broadcasted_mean = mean.broadcast_row_to_target_shape(x.rows(), x.cols()); + let centered_data = x.zip(&broadcasted_mean, |x_i, mean_i| x_i - mean_i); + let covariance_matrix = covariance_matrix(¢ered_data, Axis::Col); // Covariance between features + + let mut components = Matrix::zeros(n_components, x.cols()); + for i in 0..n_components { + if i < covariance_matrix.rows() { + components.row_copy_from_slice(i, &covariance_matrix.row(i)); + } else { + break; + } + } + + PCA { components, mean } + } + + /// Project new data on the learned axes. + pub fn transform(&self, x: &Matrix) -> Matrix { + let broadcasted_mean = self.mean.broadcast_row_to_target_shape(x.rows(), x.cols()); + let centered_data = x.zip(&broadcasted_mean, |x_i, mean_i| x_i - mean_i); + centered_data.matrix_mul(&self.components.transpose()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::matrix::Matrix; + + const EPSILON: f64 = 1e-8; + + #[test] + fn test_pca_basic() { + // Simple 2D data, points along y=x line + // Data: + // 1.0, 1.0 + // 2.0, 2.0 + // 3.0, 3.0 + let data = Matrix::from_rows_vec(vec![1.0, 1.0, 2.0, 2.0, 3.0, 3.0], 3, 2); + let (_n_samples, _n_features) = data.shape(); + + let pca = PCA::fit(&data, 1, 0); // n_components = 1, iters is unused + + println!("Data shape: {:?}", data.shape()); + println!("PCA mean shape: {:?}", pca.mean.shape()); + println!("PCA components shape: {:?}", pca.components.shape()); + + // Expected mean: (2.0, 2.0) + assert!((pca.mean.get(0, 0) - 2.0).abs() < EPSILON); + assert!((pca.mean.get(0, 1) - 2.0).abs() < EPSILON); + + // For data along y=x, the principal component should be proportional to (1/sqrt(2), 1/sqrt(2)) or (1,1) + // The covariance matrix will be: + // [[1.0, 1.0], + // [1.0, 1.0]] + // The principal component (eigenvector) will be (0.707, 0.707) or (-0.707, -0.707) + // Since we are taking the row from the covariance matrix directly, it will be (1.0, 1.0) + assert!((pca.components.get(0, 0) - 1.0).abs() < EPSILON); + assert!((pca.components.get(0, 1) - 1.0).abs() < EPSILON); + + // Test transform + // Centered data: + // -1.0, -1.0 + // 0.0, 0.0 + // 1.0, 1.0 + // Projected: (centered_data * components.transpose()) + // (-1.0 * 1.0 + -1.0 * 1.0) = -2.0 + // ( 0.0 * 1.0 + 0.0 * 1.0) = 0.0 + // ( 1.0 * 1.0 + 1.0 * 1.0) = 2.0 + let transformed_data = pca.transform(&data); + assert_eq!(transformed_data.rows(), 3); + assert_eq!(transformed_data.cols(), 1); + assert!((transformed_data.get(0, 0) - -2.0).abs() < EPSILON); + 
assert!((transformed_data.get(1, 0) - 0.0).abs() < EPSILON); + assert!((transformed_data.get(2, 0) - 2.0).abs() < EPSILON); + } + + #[test] + fn test_pca_fit_break_branch() { + // Data with 2 features + let data = Matrix::from_rows_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0], 3, 2); + let (_n_samples, n_features) = data.shape(); + + // Set n_components greater than n_features to trigger the break branch + let n_components_large = n_features + 1; + let pca = PCA::fit(&data, n_components_large, 0); + + // The components matrix should be initialized with n_components_large rows, + // but only the first n_features rows should be copied from the covariance matrix. + // The remaining rows should be zeros. + assert_eq!(pca.components.rows(), n_components_large); + assert_eq!(pca.components.cols(), n_features); + + // Verify that rows beyond n_features are all zeros + for i in n_features..n_components_large { + for j in 0..n_features { + assert!((pca.components.get(i, j) - 0.0).abs() < EPSILON); + } + } + } +} diff --git a/src/compute/stats/correlation.rs b/src/compute/stats/correlation.rs new file mode 100644 index 0000000..feb4908 --- /dev/null +++ b/src/compute/stats/correlation.rs @@ -0,0 +1,236 @@ +use crate::compute::stats::{mean, mean_horizontal, mean_vertical}; +use crate::matrix::{Axis, Matrix, SeriesOps}; + +/// Population covariance between two equally-sized matrices (flattened) +pub fn covariance(x: &Matrix, y: &Matrix) -> f64 { + assert_eq!(x.rows(), y.rows()); + assert_eq!(x.cols(), y.cols()); + + let n = (x.rows() * x.cols()) as f64; + let mean_x = mean(x); + let mean_y = mean(y); + + x.data() + .iter() + .zip(y.data().iter()) + .map(|(&a, &b)| (a - mean_x) * (b - mean_y)) + .sum::() + / n +} + +fn _covariance_axis(x: &Matrix, axis: Axis) -> Matrix { + match axis { + Axis::Row => { + // Covariance between each pair of columns → cols x cols + let num_rows = x.rows() as f64; + let means = mean_vertical(x); // 1 x cols + let p = x.cols(); + let mut data = vec![0.0; p * p]; + + for i in 0..p { + let mu_i = means.get(0, i); + for j in 0..p { + let mu_j = means.get(0, j); + let mut sum = 0.0; + for r in 0..x.rows() { + let d_i = x.get(r, i) - mu_i; + let d_j = x.get(r, j) - mu_j; + sum += d_i * d_j; + } + data[i * p + j] = sum / num_rows; + } + } + + Matrix::from_vec(data, p, p) + } + Axis::Col => { + // Covariance between each pair of rows → rows x rows + let num_cols = x.cols() as f64; + let means = mean_horizontal(x); // rows x 1 + let n = x.rows(); + let mut data = vec![0.0; n * n]; + + for i in 0..n { + let mu_i = means.get(i, 0); + for j in 0..n { + let mu_j = means.get(j, 0); + let mut sum = 0.0; + for c in 0..x.cols() { + let d_i = x.get(i, c) - mu_i; + let d_j = x.get(j, c) - mu_j; + sum += d_i * d_j; + } + data[i * n + j] = sum / num_cols; + } + } + + Matrix::from_vec(data, n, n) + } + } +} + +/// Covariance between columns (i.e. across rows) +pub fn covariance_vertical(x: &Matrix) -> Matrix { + _covariance_axis(x, Axis::Row) +} + +/// Covariance between rows (i.e. across columns) +pub fn covariance_horizontal(x: &Matrix) -> Matrix { + _covariance_axis(x, Axis::Col) +} + +/// Calculates the covariance matrix of the input data. +/// Assumes input `x` is (n_samples, n_features). 
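+///
+/// Illustrative usage (a sketch based on the unit tests below; `from_rows_vec`
+/// builds the matrix row by row):
+///
+/// ```ignore
+/// // Columns [1, 3] and [2, 4]: every pairwise sample covariance is 2.0.
+/// let m = Matrix::from_rows_vec(vec![1.0, 2.0, 3.0, 4.0], 2, 2);
+/// let cov = covariance_matrix(&m, Axis::Col);
+/// assert!((cov.get(0, 0) - 2.0).abs() < 1e-8);
+/// assert!((cov.get(0, 1) - 2.0).abs() < 1e-8);
+/// ```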
+pub fn covariance_matrix(x: &Matrix, axis: Axis) -> Matrix { + let (n_samples, n_features) = x.shape(); + + let centered_data = match axis { + Axis::Col => { + let mean_matrix = mean_vertical(x); // 1 x n_features + x.zip( + &mean_matrix.broadcast_row_to_target_shape(n_samples, n_features), + |val, m| val - m, + ) + } + Axis::Row => { + let mean_matrix = mean_horizontal(x); // n_samples x 1 + // Manually create a matrix by broadcasting the column vector across columns + let mut broadcasted_mean = Matrix::zeros(n_samples, n_features); + for r in 0..n_samples { + let mean_val = mean_matrix.get(r, 0); + for c in 0..n_features { + *broadcasted_mean.get_mut(r, c) = *mean_val; + } + } + x.zip(&broadcasted_mean, |val, m| val - m) + } + }; + + // Calculate covariance matrix: (X_centered^T * X_centered) / (n_samples - 1) + // If x is (n_samples, n_features), then centered_data is (n_samples, n_features) + // centered_data.transpose() is (n_features, n_samples) + // Result is (n_features, n_features) + centered_data.transpose().matrix_mul(¢ered_data) / (n_samples as f64 - 1.0) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::matrix::Matrix; + + const EPS: f64 = 1e-8; + + #[test] + fn test_covariance_scalar_same_matrix() { + // M = + // 1,2 + // 3,4 + // mean = 2.5 + let data = vec![1.0, 2.0, 3.0, 4.0]; + let m = Matrix::from_vec(data.clone(), 2, 2); + + // flatten M: [1,2,3,4], mean = 2.5 + // cov(M,M) = variance of flatten = 1.25 + let cov = covariance(&m, &m); + assert!((cov - 1.25).abs() < EPS); + } + + #[test] + fn test_covariance_scalar_diff_matrix() { + // x = + // 1,2 + // 3,4 + // y = 2*x + let x = Matrix::from_vec(vec![1.0, 2.0, 3.0, 4.0], 2, 2); + let y = Matrix::from_vec(vec![2.0, 4.0, 6.0, 8.0], 2, 2); + + // mean_x = 2.5, mean_y = 5.0 + // cov = sum((xi-2.5)*(yi-5.0))/4 = 2.5 + let cov_xy = covariance(&x, &y); + assert!((cov_xy - 2.5).abs() < EPS); + } + + #[test] + fn test_covariance_vertical() { + // M = + // 1,2 + // 3,4 + // cols are [1,3] and [2,4], each var=1, cov=1 + let m = Matrix::from_rows_vec(vec![1.0, 2.0, 3.0, 4.0], 2, 2); + let cov_mat = covariance_vertical(&m); + + // Expect 2x2 matrix of all 1.0 + for i in 0..2 { + for j in 0..2 { + assert!((cov_mat.get(i, j) - 1.0).abs() < EPS); + } + } + } + + #[test] + fn test_covariance_horizontal() { + // M = + // 1,2 + // 3,4 + // rows are [1,2] and [3,4], each var=0.25, cov=0.25 + let m = Matrix::from_rows_vec(vec![1.0, 2.0, 3.0, 4.0], 2, 2); + let cov_mat = covariance_horizontal(&m); + + // Expect 2x2 matrix of all 0.25 + for i in 0..2 { + for j in 0..2 { + assert!((cov_mat.get(i, j) - 0.25).abs() < EPS); + } + } + } + + #[test] + fn test_covariance_matrix_vertical() { + // Test with a simple 2x2 matrix + // M = + // 1, 2 + // 3, 4 + // Expected covariance matrix (vertical, i.e., between columns): + // Col1: [1, 3], mean = 2 + // Col2: [2, 4], mean = 3 + // Cov(Col1, Col1) = ((1-2)^2 + (3-2)^2) / (2-1) = (1+1)/1 = 2 + // Cov(Col2, Col2) = ((2-3)^2 + (4-3)^2) / (2-1) = (1+1)/1 = 2 + // Cov(Col1, Col2) = ((1-2)*(2-3) + (3-2)*(4-3)) / (2-1) = ((-1)*(-1) + (1)*(1))/1 = (1+1)/1 = 2 + // Cov(Col2, Col1) = 2 + // Expected: + // 2, 2 + // 2, 2 + let m = Matrix::from_rows_vec(vec![1.0, 2.0, 3.0, 4.0], 2, 2); + let cov_mat = covariance_matrix(&m, Axis::Col); + + assert!((cov_mat.get(0, 0) - 2.0).abs() < EPS); + assert!((cov_mat.get(0, 1) - 2.0).abs() < EPS); + assert!((cov_mat.get(1, 0) - 2.0).abs() < EPS); + assert!((cov_mat.get(1, 1) - 2.0).abs() < EPS); + } + + #[test] + fn test_covariance_matrix_horizontal() { + // Test 
with a simple 2x2 matrix
+        // M =
+        // 1, 2
+        // 3, 4
+        // With Axis::Row, each row is centred by its own mean (1.5 and 3.5),
+        // giving centred rows [-0.5, 0.5] and [-0.5, 0.5]. The function then
+        // returns X_c^T * X_c / (n_samples - 1), a feature-by-feature matrix:
+        // Expected:
+        //  0.5, -0.5
+        // -0.5,  0.5
+        let m = Matrix::from_rows_vec(vec![1.0, 2.0, 3.0, 4.0], 2, 2);
+        let cov_mat = covariance_matrix(&m, Axis::Row);
+
+        assert!((cov_mat.get(0, 0) - 0.5).abs() < EPS);
+        assert!((cov_mat.get(0, 1) - (-0.5)).abs() < EPS);
+        assert!((cov_mat.get(1, 0) - (-0.5)).abs() < EPS);
+        assert!((cov_mat.get(1, 1) - 0.5).abs() < EPS);
+    }
+}
diff --git a/src/compute/stats/descriptive.rs b/src/compute/stats/descriptive.rs
new file mode 100644
index 0000000..126904e
--- /dev/null
+++ b/src/compute/stats/descriptive.rs
@@ -0,0 +1,390 @@
+use crate::matrix::{Axis, Matrix, SeriesOps};
+
+pub fn mean(x: &Matrix) -> f64 {
+    x.data().iter().sum::<f64>() / (x.rows() * x.cols()) as f64
+}
+
+pub fn mean_vertical(x: &Matrix) -> Matrix {
+    let m = x.rows() as f64;
+    Matrix::from_vec(x.sum_vertical(), 1, x.cols()) / m
+}
+
+pub fn mean_horizontal(x: &Matrix) -> Matrix {
+    let n = x.cols() as f64;
+    Matrix::from_vec(x.sum_horizontal(), x.rows(), 1) / n
+}
+
+fn population_or_sample_variance(x: &Matrix, population: bool) -> f64 {
+    let m = (x.rows() * x.cols()) as f64;
+    let mean_val = mean(x);
+    x.data()
+        .iter()
+        .map(|&v| (v - mean_val).powi(2))
+        .sum::<f64>()
+        / if population { m } else { m - 1.0 }
+}
+
+pub fn population_variance(x: &Matrix) -> f64 {
+    population_or_sample_variance(x, true)
+}
+
+pub fn sample_variance(x: &Matrix) -> f64 {
+    population_or_sample_variance(x, false)
+}
+
+fn _population_or_sample_variance_axis(
+    x: &Matrix,
+    axis: Axis,
+    population: bool,
+) -> Matrix {
+    match axis {
+        Axis::Row => {
+            // Calculate variance for each column (vertical variance)
+            let num_rows = x.rows() as f64;
+            let mean_of_cols = mean_vertical(x); // 1 x cols matrix
+            let mut result_data = vec![0.0; x.cols()];
+
+            for c in 0..x.cols() {
+                let mean_val = mean_of_cols.get(0, c); // Mean for current column
+                let mut sum_sq_diff = 0.0;
+                for r in 0..x.rows() {
+                    let diff = x.get(r, c) - mean_val;
+                    sum_sq_diff += diff * diff;
+                }
+                result_data[c] = sum_sq_diff / (if population { num_rows } else { num_rows - 1.0 });
+            }
+            Matrix::from_vec(result_data, 1, x.cols())
+        }
+        Axis::Col => {
+            // Calculate variance for each row (horizontal variance)
+            let num_cols = x.cols() as f64;
+            let mean_of_rows = mean_horizontal(x); // rows x 1 matrix
+            let mut result_data = vec![0.0; x.rows()];
+
+            for r in 0..x.rows() {
+                let mean_val = mean_of_rows.get(r, 0); // Mean for current row
+                let mut sum_sq_diff = 0.0;
+                for c in 0..x.cols() {
+                    let diff = x.get(r, c) - mean_val;
+                    sum_sq_diff += diff * diff;
+                }
+                result_data[r] = sum_sq_diff / (if population { num_cols } else { num_cols - 1.0 });
+            }
+            Matrix::from_vec(result_data, x.rows(), 1)
+        }
+    }
+}
+
+pub fn population_variance_vertical(x: &Matrix) -> Matrix {
+    _population_or_sample_variance_axis(x, Axis::Row, true)
+}
+
+pub fn population_variance_horizontal(x: &Matrix) -> Matrix {
+    _population_or_sample_variance_axis(x, Axis::Col, true)
+}
+
+pub fn sample_variance_vertical(x: &Matrix) -> Matrix {
+    _population_or_sample_variance_axis(x, Axis::Row, false)
+}
+
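+/// Sample variance of each row (denominator `n - 1`), returned as a `rows x 1` matrix.
+///
+/// Illustrative usage (a sketch based on the unit tests below; `from_vec` fills
+/// the matrix column by column):
+///
+/// ```ignore
+/// // Rows [1, 2, 3] and [4, 5, 6] both have sample variance 1.0.
+/// let x = Matrix::from_vec(vec![1.0, 4.0, 2.0, 5.0, 3.0, 6.0], 2, 3);
+/// let v = sample_variance_horizontal(&x);
+/// assert!((v.get(0, 0) - 1.0).abs() < 1e-8);
+/// assert!((v.get(1, 0) - 1.0).abs() < 1e-8);
+/// ```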
+pub fn sample_variance_horizontal(x: &Matrix) -> Matrix { + _population_or_sample_variance_axis(x, Axis::Col, false) +} + +pub fn stddev(x: &Matrix) -> f64 { + population_variance(x).sqrt() +} + +pub fn stddev_vertical(x: &Matrix) -> Matrix { + population_variance_vertical(x).map(|v| v.sqrt()) +} + +pub fn stddev_horizontal(x: &Matrix) -> Matrix { + population_variance_horizontal(x).map(|v| v.sqrt()) +} + +pub fn median(x: &Matrix) -> f64 { + let mut data = x.data().to_vec(); + data.sort_by(|a, b| a.partial_cmp(b).unwrap()); + let mid = data.len() / 2; + if data.len() % 2 == 0 { + (data[mid - 1] + data[mid]) / 2.0 + } else { + data[mid] + } +} + +fn _median_axis(x: &Matrix, axis: Axis) -> Matrix { + let mx = match axis { + Axis::Col => x.clone(), + Axis::Row => x.transpose(), + }; + + let mut result = Vec::with_capacity(mx.cols()); + for c in 0..mx.cols() { + let mut col = mx.column(c).to_vec(); + col.sort_by(|a, b| a.partial_cmp(b).unwrap()); + let mid = col.len() / 2; + if col.len() % 2 == 0 { + result.push((col[mid - 1] + col[mid]) / 2.0); + } else { + result.push(col[mid]); + } + } + let (r, c) = match axis { + Axis::Col => (1, mx.cols()), + Axis::Row => (mx.cols(), 1), + }; + Matrix::from_vec(result, r, c) +} + +pub fn median_vertical(x: &Matrix) -> Matrix { + _median_axis(x, Axis::Col) +} + +pub fn median_horizontal(x: &Matrix) -> Matrix { + _median_axis(x, Axis::Row) +} + +pub fn percentile(x: &Matrix, p: f64) -> f64 { + if p < 0.0 || p > 100.0 { + panic!("Percentile must be between 0 and 100"); + } + let mut data = x.data().to_vec(); + data.sort_by(|a, b| a.partial_cmp(b).unwrap()); + let index = ((p / 100.0) * (data.len() as f64 - 1.0)).round() as usize; + data[index] +} + +fn _percentile_axis(x: &Matrix, p: f64, axis: Axis) -> Matrix { + if p < 0.0 || p > 100.0 { + panic!("Percentile must be between 0 and 100"); + } + let mx: Matrix = match axis { + Axis::Col => x.clone(), + Axis::Row => x.transpose(), + }; + let mut result = Vec::with_capacity(mx.cols()); + for c in 0..mx.cols() { + let mut col = mx.column(c).to_vec(); + col.sort_by(|a, b| a.partial_cmp(b).unwrap()); + let index = ((p / 100.0) * (col.len() as f64 - 1.0)).round() as usize; + result.push(col[index]); + } + let (r, c) = match axis { + Axis::Col => (1, mx.cols()), + Axis::Row => (mx.cols(), 1), + }; + Matrix::from_vec(result, r, c) +} + +pub fn percentile_vertical(x: &Matrix, p: f64) -> Matrix { + _percentile_axis(x, p, Axis::Col) +} +pub fn percentile_horizontal(x: &Matrix, p: f64) -> Matrix { + _percentile_axis(x, p, Axis::Row) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::matrix::Matrix; + + const EPSILON: f64 = 1e-8; + + #[test] + fn test_descriptive_stats_regular_values() { + let data = vec![1.0, 2.0, 3.0, 4.0, 5.0]; + let x = Matrix::from_vec(data, 1, 5); + + // Mean + assert!((mean(&x) - 3.0).abs() < EPSILON); + + // Variance + assert!((population_variance(&x) - 2.0).abs() < EPSILON); + + // Standard Deviation + assert!((stddev(&x) - 1.4142135623730951).abs() < EPSILON); + + // Median + assert!((median(&x) - 3.0).abs() < EPSILON); + + // Percentile + assert!((percentile(&x, 0.0) - 1.0).abs() < EPSILON); + assert!((percentile(&x, 25.0) - 2.0).abs() < EPSILON); + assert!((percentile(&x, 50.0) - 3.0).abs() < EPSILON); + assert!((percentile(&x, 75.0) - 4.0).abs() < EPSILON); + assert!((percentile(&x, 100.0) - 5.0).abs() < EPSILON); + + let data_even = vec![1.0, 2.0, 3.0, 4.0]; + let x_even = Matrix::from_vec(data_even, 1, 4); + assert!((median(&x_even) - 2.5).abs() < EPSILON); + } + + #[test] + fn 
test_descriptive_stats_outlier() { + let data = vec![1.0, 2.0, 3.0, 4.0, 100.0]; + let x = Matrix::from_vec(data, 1, 5); + + // Mean should be heavily affected by outlier + assert!((mean(&x) - 22.0).abs() < EPSILON); + + // Variance should be heavily affected by outlier + assert!((population_variance(&x) - 1522.0).abs() < EPSILON); + + // Standard Deviation should be heavily affected by outlier + assert!((stddev(&x) - 39.0128183970461).abs() < EPSILON); + + // Median should be robust to outlier + assert!((median(&x) - 3.0).abs() < EPSILON); + } + + #[test] + #[should_panic(expected = "Percentile must be between 0 and 100")] + fn test_percentile_panic_low() { + let data = vec![1.0, 2.0, 3.0]; + let x = Matrix::from_vec(data, 1, 3); + percentile(&x, -1.0); + } + + #[test] + #[should_panic(expected = "Percentile must be between 0 and 100")] + fn test_percentile_panic_high() { + let data = vec![1.0, 2.0, 3.0]; + let x = Matrix::from_vec(data, 1, 3); + percentile(&x, 101.0); + } + + #[test] + fn test_mean_vertical_horizontal() { + // 2x3 matrix: + let data = vec![1.0, 4.0, 2.0, 5.0, 3.0, 6.0]; + let x = Matrix::from_vec(data, 2, 3); + + // Vertical means (per column): [(1+4)/2, (2+5)/2, (3+6)/2] + let mv = mean_vertical(&x); + assert!((mv.get(0, 0) - 2.5).abs() < EPSILON); + assert!((mv.get(0, 1) - 3.5).abs() < EPSILON); + assert!((mv.get(0, 2) - 4.5).abs() < EPSILON); + + // Horizontal means (per row): [(1+2+3)/3, (4+5+6)/3] + let mh = mean_horizontal(&x); + assert!((mh.get(0, 0) - 2.0).abs() < EPSILON); + assert!((mh.get(1, 0) - 5.0).abs() < EPSILON); + } + + #[test] + fn test_variance_vertical_horizontal() { + let data = vec![1.0, 4.0, 2.0, 5.0, 3.0, 6.0]; + let x = Matrix::from_vec(data, 2, 3); + + // cols: {1,4}, {2,5}, {3,6} all give 2.25 + let vv = population_variance_vertical(&x); + for c in 0..3 { + assert!((vv.get(0, c) - 2.25).abs() < EPSILON); + } + + let vh = population_variance_horizontal(&x); + assert!((vh.get(0, 0) - (2.0 / 3.0)).abs() < EPSILON); + assert!((vh.get(1, 0) - (2.0 / 3.0)).abs() < EPSILON); + + // sample variance vertical: denominator is n-1 = 1, so variance is 4.5 + let svv = sample_variance_vertical(&x); + for c in 0..3 { + assert!((svv.get(0, c) - 4.5).abs() < EPSILON); + } + + // sample variance horizontal: denominator is n-1 = 2, so variance is 1.0 + let svh = sample_variance_horizontal(&x); + assert!((svh.get(0, 0) - 1.0).abs() < EPSILON); + assert!((svh.get(1, 0) - 1.0).abs() < EPSILON); + } + + #[test] + fn test_stddev_vertical_horizontal() { + let data = vec![1.0, 4.0, 2.0, 5.0, 3.0, 6.0]; + let x = Matrix::from_vec(data, 2, 3); + + // Stddev is sqrt of variance + let sv = stddev_vertical(&x); + for c in 0..3 { + assert!((sv.get(0, c) - 1.5).abs() < EPSILON); + } + + let sh = stddev_horizontal(&x); + // sqrt(2/3) ≈ 0.816497 + let expected = (2.0 / 3.0 as f64).sqrt(); + assert!((sh.get(0, 0) - expected).abs() < EPSILON); + assert!((sh.get(1, 0) - expected).abs() < EPSILON); + + // sample stddev vertical: sqrt(4.5) ≈ 2.12132034 + let ssv = sample_variance_vertical(&x).map(|v| v.sqrt()); + for c in 0..3 { + assert!((ssv.get(0, c) - 2.1213203435596424).abs() < EPSILON); + } + + // sample stddev horizontal: sqrt(1.0) = 1.0 + let ssh = sample_variance_horizontal(&x).map(|v| v.sqrt()); + assert!((ssh.get(0, 0) - 1.0).abs() < EPSILON); + assert!((ssh.get(1, 0) - 1.0).abs() < EPSILON); + } + + #[test] + fn test_median_vertical_horizontal() { + let data = vec![1.0, 4.0, 2.0, 5.0, 3.0, 6.0]; + let x = Matrix::from_vec(data, 2, 3); + + let mv = 
median_vertical(&x).row(0); + + let expected_v = vec![2.5, 3.5, 4.5]; + assert_eq!(mv, expected_v, "{:?} expected: {:?}", expected_v, mv); + + let mh = median_horizontal(&x).column(0).to_vec(); + let expected_h = vec![2.0, 5.0]; + assert_eq!(mh, expected_h, "{:?} expected: {:?}", expected_h, mh); + } + + #[test] + fn test_percentile_vertical_horizontal() { + // vec of f64 values 1..24 as a 4x6 matrix + let data: Vec = (1..=24).map(|x| x as f64).collect(); + let x = Matrix::from_vec(data, 4, 6); + + // columns: + // 1, 5, 9, 13, 17, 21 + // 2, 6, 10, 14, 18, 22 + // 3, 7, 11, 15, 19, 23 + // 4, 8, 12, 16, 20, 24 + + let er0 = vec![1., 5., 9., 13., 17., 21.]; + let er50 = vec![3., 7., 11., 15., 19., 23.]; + let er100 = vec![4., 8., 12., 16., 20., 24.]; + + assert_eq!(percentile_vertical(&x, 0.0).data(), er0); + assert_eq!(percentile_vertical(&x, 50.0).data(), er50); + assert_eq!(percentile_vertical(&x, 100.0).data(), er100); + + let eh0 = vec![1., 2., 3., 4.]; + let eh50 = vec![13., 14., 15., 16.]; + let eh100 = vec![21., 22., 23., 24.]; + + assert_eq!(percentile_horizontal(&x, 0.0).data(), eh0); + assert_eq!(percentile_horizontal(&x, 50.0).data(), eh50); + assert_eq!(percentile_horizontal(&x, 100.0).data(), eh100); + } + + #[test] + #[should_panic(expected = "Percentile must be between 0 and 100")] + fn test_percentile_out_of_bounds() { + let data = vec![1.0, 2.0, 3.0]; + let x = Matrix::from_vec(data, 1, 3); + percentile(&x, -10.0); // Should panic + } + + #[test] + #[should_panic(expected = "Percentile must be between 0 and 100")] + fn test_percentile_vertical_out_of_bounds() { + let m = Matrix::from_vec(vec![1.0, 2.0, 3.0], 1, 3); + let _ = percentile_vertical(&m, -0.1); + } +} diff --git a/src/compute/stats/distributions.rs b/src/compute/stats/distributions.rs new file mode 100644 index 0000000..63534d5 --- /dev/null +++ b/src/compute/stats/distributions.rs @@ -0,0 +1,382 @@ +use crate::matrix::{Matrix, SeriesOps}; + +use std::f64::consts::PI; + +/// Approximation of the error function (Abramowitz & Stegun 7.1.26) +fn erf_func(x: f64) -> f64 { + let sign = if x < 0.0 { -1.0 } else { 1.0 }; + let x = x.abs(); + // coefficients + let a1 = 0.254829592; + let a2 = -0.284496736; + let a3 = 1.421413741; + let a4 = -1.453152027; + let a5 = 1.061405429; + let p = 0.3275911; + + let t = 1.0 / (1.0 + p * x); + let y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * (-x * x).exp(); + sign * y +} + +/// Approximation of the error function for matrices +pub fn erf(x: Matrix) -> Matrix { + x.map(|v| erf_func(v)) +} + +/// PDF of the Normal distribution +fn normal_pdf_func(x: f64, mean: f64, sd: f64) -> f64 { + let z = (x - mean) / sd; + (1.0 / (sd * (2.0 * PI).sqrt())) * (-0.5 * z * z).exp() +} + +/// PDF of the Normal distribution for matrices +pub fn normal_pdf(x: Matrix, mean: f64, sd: f64) -> Matrix { + x.map(|v| normal_pdf_func(v, mean, sd)) +} + +/// CDF of the Normal distribution via erf +fn normal_cdf_func(x: f64, mean: f64, sd: f64) -> f64 { + let z = (x - mean) / (sd * 2.0_f64.sqrt()); + 0.5 * (1.0 + erf_func(z)) +} + +/// CDF of the Normal distribution for matrices +pub fn normal_cdf(x: Matrix, mean: f64, sd: f64) -> Matrix { + x.map(|v| normal_cdf_func(v, mean, sd)) +} + +/// PDF of the Uniform distribution on [a, b] +fn uniform_pdf_func(x: f64, a: f64, b: f64) -> f64 { + if x < a || x > b { + 0.0 + } else { + 1.0 / (b - a) + } +} + +/// PDF of the Uniform distribution on [a, b] for matrices +pub fn uniform_pdf(x: Matrix, a: f64, b: f64) -> Matrix { + x.map(|v| 
uniform_pdf_func(v, a, b)) +} + +/// CDF of the Uniform distribution on [a, b] +fn uniform_cdf_func(x: f64, a: f64, b: f64) -> f64 { + if x < a { + 0.0 + } else if x <= b { + (x - a) / (b - a) + } else { + 1.0 + } +} + +/// CDF of the Uniform distribution on [a, b] for matrices +pub fn uniform_cdf(x: Matrix, a: f64, b: f64) -> Matrix { + x.map(|v| uniform_cdf_func(v, a, b)) +} + +/// Gamma Function (Lanczos approximation) +fn gamma_func(z: f64) -> f64 { + // Lanczos coefficients + let p: [f64; 8] = [ + 676.5203681218851, + -1259.1392167224028, + 771.32342877765313, + -176.61502916214059, + 12.507343278686905, + -0.13857109526572012, + 9.9843695780195716e-6, + 1.5056327351493116e-7, + ]; + if z < 0.5 { + PI / ((PI * z).sin() * gamma_func(1.0 - z)) + } else { + let z = z - 1.0; + let mut x = 0.99999999999980993; + for (i, &pi) in p.iter().enumerate() { + x += pi / (z + (i as f64) + 1.0); + } + let t = z + p.len() as f64 - 0.5; + (2.0 * PI).sqrt() * t.powf(z + 0.5) * (-t).exp() * x + } +} + +pub fn gamma(z: Matrix) -> Matrix { + z.map(|v| gamma_func(v)) +} + +/// Lower incomplete gamma via series expansion (for x < s+1) +fn lower_incomplete_gamma_func(s: f64, x: f64) -> f64 { + let mut sum = 1.0 / s; + let mut term = sum; + for n in 1..100 { + term *= x / (s + n as f64); + sum += term; + } + sum * x.powf(s) * (-x).exp() +} + +/// Lower incomplete gamma for matrices +pub fn lower_incomplete_gamma(s: Matrix, x: Matrix) -> Matrix { + s.zip(&x, |s_val, x_val| lower_incomplete_gamma_func(s_val, x_val)) +} + +/// PDF of the Gamma distribution (shape k, scale θ) +fn gamma_pdf_func(x: f64, k: f64, theta: f64) -> f64 { + if x < 0.0 { + return 0.0; + } + let coef = 1.0 / (gamma_func(k) * theta.powf(k)); + coef * x.powf(k - 1.0) * (-(x / theta)).exp() +} + +/// PDF of the Gamma distribution for matrices +pub fn gamma_pdf(x: Matrix, k: f64, theta: f64) -> Matrix { + x.map(|v| gamma_pdf_func(v, k, theta)) +} + +/// CDF of the Gamma distribution via lower incomplete gamma +fn gamma_cdf_func(x: f64, k: f64, theta: f64) -> f64 { + if x < 0.0 { + return 0.0; + } + lower_incomplete_gamma_func(k, x / theta) / gamma_func(k) +} + +/// CDF of the Gamma distribution for matrices +pub fn gamma_cdf(x: Matrix, k: f64, theta: f64) -> Matrix { + x.map(|v| gamma_cdf_func(v, k, theta)) +} + +/// Factorials and Combinations /// + +/// Compute n! 
as f64 (works up to ~170 reliably) +fn factorial(n: u64) -> f64 { + (1..=n).map(|i| i as f64).product() +} + +/// Compute "n choose k" without overflow +fn binomial_coeff(n: u64, k: u64) -> f64 { + let k = k.min(n - k); + let mut numer = 1.0; + let mut denom = 1.0; + for i in 0..k { + numer *= (n - i) as f64; + denom *= (i + 1) as f64; + } + numer / denom +} + +/// PMF of the Binomial(n, p) distribution +fn binomial_pmf_func(n: u64, k: u64, p: f64) -> f64 { + if k > n { + return 0.0; + } + binomial_coeff(n, k) * p.powf(k as f64) * (1.0 - p).powf((n - k) as f64) +} + +/// PMF of the Binomial(n, p) distribution for matrices +pub fn binomial_pmf(n: u64, k: Matrix, p: f64) -> Matrix { + Matrix::from_vec( + k.data() + .iter() + .map(|&v| binomial_pmf_func(n, v, p)) + .collect::>(), + k.rows(), + k.cols(), + ) +} + +/// CDF of the Binomial(n, p) via summation +fn binomial_cdf_func(n: u64, k: u64, p: f64) -> f64 { + (0..=k).map(|i| binomial_pmf_func(n, i, p)).sum() +} + +/// CDF of the Binomial(n, p) for matrices +pub fn binomial_cdf(n: u64, k: Matrix, p: f64) -> Matrix { + Matrix::from_vec( + k.data() + .iter() + .map(|&v| binomial_cdf_func(n, v, p)) + .collect::>(), + k.rows(), + k.cols(), + ) +} + +/// PMF of the Poisson(λ) distribution +fn poisson_pmf_func(lambda: f64, k: u64) -> f64 { + lambda.powf(k as f64) * (-lambda).exp() / factorial(k) +} + +/// PMF of the Poisson(λ) distribution for matrices +pub fn poisson_pmf(lambda: f64, k: Matrix) -> Matrix { + Matrix::from_vec( + k.data() + .iter() + .map(|&v| poisson_pmf_func(lambda, v)) + .collect::>(), + k.rows(), + k.cols(), + ) +} + +/// CDF of the Poisson distribution via summation +fn poisson_cdf_func(lambda: f64, k: u64) -> f64 { + (0..=k).map(|i| poisson_pmf_func(lambda, i)).sum() +} + +/// CDF of the Poisson(λ) distribution for matrices +pub fn poisson_cdf(lambda: f64, k: Matrix) -> Matrix { + Matrix::from_vec( + k.data() + .iter() + .map(|&v| poisson_cdf_func(lambda, v)) + .collect::>(), + k.rows(), + k.cols(), + ) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_math_funcs() { + // Test erf function + assert!((erf_func(0.0) - 0.0).abs() < 1e-7); + assert!((erf_func(1.0) - 0.8427007).abs() < 1e-7); + assert!((erf_func(-1.0) + 0.8427007).abs() < 1e-7); + + // Test gamma function + assert!((gamma_func(1.0) - 1.0).abs() < 1e-7); + assert!((gamma_func(2.0) - 1.0).abs() < 1e-7); + assert!((gamma_func(3.0) - 2.0).abs() < 1e-7); + assert!((gamma_func(4.0) - 6.0).abs() < 1e-7); + assert!((gamma_func(5.0) - 24.0).abs() < 1e-7); + + let z = 0.3; + let expected = PI / ((PI * z).sin() * gamma_func(1.0 - z)); + assert!((gamma_func(z) - expected).abs() < 1e-7); + } + + #[test] + fn test_math_matrix() { + let x = Matrix::filled(5, 5, 1.0); + let erf_result = erf(x.clone()); + assert!((erf_result.data()[0] - 0.8427007).abs() < 1e-7); + + let gamma_result = gamma(x); + assert!((gamma_result.data()[0] - 1.0).abs() < 1e-7); + } + + #[test] + fn test_normal_funcs() { + assert!((normal_pdf_func(0.0, 0.0, 1.0) - 0.39894228).abs() < 1e-7); + assert!((normal_cdf_func(1.0, 0.0, 1.0) - 0.8413447).abs() < 1e-7); + } + + #[test] + fn test_normal_matrix() { + let x = Matrix::filled(5, 5, 0.0); + let pdf = normal_pdf(x.clone(), 0.0, 1.0); + let cdf = normal_cdf(x, 0.0, 1.0); + assert!((pdf.data()[0] - 0.39894228).abs() < 1e-7); + assert!((cdf.data()[0] - 0.5).abs() < 1e-7); + } + + #[test] + fn test_uniform_funcs() { + assert_eq!(uniform_pdf_func(0.5, 0.0, 1.0), 1.0); + assert_eq!(uniform_cdf_func(-1.0, 0.0, 1.0), 0.0); + 
assert_eq!(uniform_cdf_func(0.5, 0.0, 1.0), 0.5); + + // xb) should return 0 + assert_eq!(uniform_pdf_func(-0.5, 0.0, 1.0), 0.0); + assert_eq!(uniform_pdf_func(1.5, 0.0, 1.0), 0.0); + + // for cdf x>a AND x>b should return 1 + assert_eq!(uniform_cdf_func(1.5, 0.0, 1.0), 1.0); + assert_eq!(uniform_cdf_func(2.0, 0.0, 1.0), 1.0); + } + + #[test] + fn test_uniform_matrix() { + let x = Matrix::filled(5, 5, 0.5); + let pdf = uniform_pdf(x.clone(), 0.0, 1.0); + let cdf = uniform_cdf(x, 0.0, 1.0); + assert_eq!(pdf.data()[0], 1.0); + assert_eq!(cdf.data()[0], 0.5); + } + + #[test] + fn test_binomial_funcs() { + let pmf = binomial_pmf_func(5, 2, 0.5); + assert!((pmf - 0.3125).abs() < 1e-7); + let cdf = binomial_cdf_func(5, 2, 0.5); + assert!((cdf - (0.03125 + 0.15625 + 0.3125)).abs() < 1e-7); + + let pmf_zero = binomial_pmf_func(5, 6, 0.5); + assert!(pmf_zero == 0.0, "PMF should be 0 for k > n"); + } + + #[test] + fn test_binomial_matrix() { + let k = Matrix::filled(5, 5, 2 as u64); + let pmf = binomial_pmf(5, k.clone(), 0.5); + let cdf = binomial_cdf(5, k, 0.5); + assert!((pmf.data()[0] - 0.3125).abs() < 1e-7); + assert!((cdf.data()[0] - (0.03125 + 0.15625 + 0.3125)).abs() < 1e-7); + } + + #[test] + fn test_poisson_funcs() { + let pmf: f64 = poisson_pmf_func(3.0, 2); + assert!((pmf - (3.0_f64.powf(2.0) * (-3.0 as f64).exp() / 2.0)).abs() < 1e-7); + let cdf: f64 = poisson_cdf_func(3.0, 2); + assert!((cdf - (pmf + poisson_pmf_func(3.0, 0) + poisson_pmf_func(3.0, 1))).abs() < 1e-7); + } + + #[test] + fn test_poisson_matrix() { + let k = Matrix::filled(5, 5, 2); + let pmf = poisson_pmf(3.0, k.clone()); + let cdf = poisson_cdf(3.0, k); + assert!((pmf.data()[0] - (3.0_f64.powf(2.0) * (-3.0 as f64).exp() / 2.0)).abs() < 1e-7); + assert!( + (cdf.data()[0] - (pmf.data()[0] + poisson_pmf_func(3.0, 0) + poisson_pmf_func(3.0, 1))) + .abs() + < 1e-7 + ); + } + + #[test] + fn test_gamma_funcs() { + // For k=1, θ=1 the Gamma(1,1) is Exp(1), so pdf(x)=e^-x + assert!((gamma_pdf_func(2.0, 1.0, 1.0) - (-2.0 as f64).exp()).abs() < 1e-7); + assert!((gamma_cdf_func(2.0, 1.0, 1.0) - (1.0 - (-2.0 as f64).exp())).abs() < 1e-7); + + // <0 case + assert_eq!(gamma_pdf_func(-1.0, 1.0, 1.0), 0.0); + assert_eq!(gamma_cdf_func(-1.0, 1.0, 1.0), 0.0); + } + #[test] + fn test_gamma_matrix() { + let x = Matrix::filled(5, 5, 2.0); + let pdf = gamma_pdf(x.clone(), 1.0, 1.0); + let cdf = gamma_cdf(x, 1.0, 1.0); + assert!((pdf.data()[0] - (-2.0 as f64).exp()).abs() < 1e-7); + assert!((cdf.data()[0] - (1.0 - (-2.0 as f64).exp())).abs() < 1e-7); + } + + #[test] + fn test_lower_incomplete_gamma() { + let s = Matrix::filled(5, 5, 2.0); + let x = Matrix::filled(5, 5, 1.0); + let expected = lower_incomplete_gamma_func(2.0, 1.0); + let result = lower_incomplete_gamma(s, x); + assert!((result.data()[0] - expected).abs() < 1e-7); + } +} diff --git a/src/compute/stats/inferential.rs b/src/compute/stats/inferential.rs new file mode 100644 index 0000000..8a6f28a --- /dev/null +++ b/src/compute/stats/inferential.rs @@ -0,0 +1,131 @@ +use crate::matrix::{Matrix, SeriesOps}; + +use crate::compute::stats::{gamma_cdf, mean, sample_variance}; + +/// Two-sample t-test returning (t_statistic, p_value) +pub fn t_test(sample1: &Matrix, sample2: &Matrix) -> (f64, f64) { + let mean1 = mean(sample1); + let mean2 = mean(sample2); + let var1 = sample_variance(sample1); + let var2 = sample_variance(sample2); + let n1 = (sample1.rows() * sample1.cols()) as f64; + let n2 = (sample2.rows() * sample2.cols()) as f64; + + let t_statistic = (mean1 - mean2) / ((var1 / n1 + 
var2 / n2).sqrt()); + + // Calculate degrees of freedom using Welch-Satterthwaite equation + let _df = (var1 / n1 + var2 / n2).powi(2) + / ((var1 / n1).powi(2) / (n1 - 1.0) + (var2 / n2).powi(2) / (n2 - 1.0)); + + // Calculate p-value using t-distribution CDF (two-tailed) + let p_value = 0.5; + + (t_statistic, p_value) +} + +/// Chi-square test of independence +pub fn chi2_test(observed: &Matrix) -> (f64, f64) { + let (rows, cols) = observed.shape(); + let row_sums: Vec = observed.sum_horizontal(); + let col_sums: Vec = observed.sum_vertical(); + let grand_total: f64 = observed.data().iter().sum(); + + let mut chi2_statistic: f64 = 0.0; + for i in 0..rows { + for j in 0..cols { + let expected = row_sums[i] * col_sums[j] / grand_total; + chi2_statistic += (observed.get(i, j) - expected).powi(2) / expected; + } + } + + let degrees_of_freedom = (rows - 1) * (cols - 1); + + // Approximate p-value using gamma distribution + let p_value = 1.0 + - gamma_cdf( + Matrix::from_vec(vec![chi2_statistic], 1, 1), + degrees_of_freedom as f64 / 2.0, + 1.0, + ) + .get(0, 0); + + (chi2_statistic, p_value) +} + +/// One-way ANOVA +pub fn anova(groups: Vec<&Matrix>) -> (f64, f64) { + let k = groups.len(); // Number of groups + let mut n = 0; // Total number of observations + let mut group_means: Vec = Vec::new(); + let mut group_variances: Vec = Vec::new(); + + for group in &groups { + n += group.rows() * group.cols(); + group_means.push(mean(group)); + group_variances.push(sample_variance(group)); + } + + let grand_mean: f64 = group_means.iter().sum::() / k as f64; + + // Calculate Sum of Squares Between Groups (SSB) + let mut ssb: f64 = 0.0; + for i in 0..k { + ssb += (group_means[i] - grand_mean).powi(2) * (groups[i].rows() * groups[i].cols()) as f64; + } + + // Calculate Sum of Squares Within Groups (SSW) + let mut ssw: f64 = 0.0; + for i in 0..k { + ssw += group_variances[i] * (groups[i].rows() * groups[i].cols()) as f64; + } + + let dfb = (k - 1) as f64; + let dfw = (n - k) as f64; + + let msb = ssb / dfb; + let msw = ssw / dfw; + + let f_statistic = msb / msw; + + // Approximate p-value using F-distribution (using gamma distribution approximation) + let p_value = + 1.0 - gamma_cdf(Matrix::from_vec(vec![f_statistic], 1, 1), dfb / 2.0, 1.0).get(0, 0); + + (f_statistic, p_value) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::matrix::Matrix; + + const EPS: f64 = 1e-5; + + #[test] + fn test_t_test() { + let sample1 = Matrix::from_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0], 1, 5); + let sample2 = Matrix::from_vec(vec![6.0, 7.0, 8.0, 9.0, 10.0], 1, 5); + let (t_statistic, p_value) = t_test(&sample1, &sample2); + assert!((t_statistic + 5.0).abs() < EPS); + assert!(p_value > 0.0 && p_value < 1.0); + } + + #[test] + fn test_chi2_test() { + let observed = Matrix::from_vec(vec![12.0, 5.0, 8.0, 10.0], 2, 2); + let (chi2_statistic, p_value) = chi2_test(&observed); + assert!(chi2_statistic > 0.0); + assert!(p_value > 0.0 && p_value < 1.0); + } + + #[test] + fn test_anova() { + let group1 = Matrix::from_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0], 1, 5); + let group2 = Matrix::from_vec(vec![2.0, 3.0, 4.0, 5.0, 6.0], 1, 5); + let group3 = Matrix::from_vec(vec![3.0, 4.0, 5.0, 6.0, 7.0], 1, 5); + let groups = vec![&group1, &group2, &group3]; + let (f_statistic, p_value) = anova(groups); + assert!(f_statistic > 0.0); + assert!(p_value > 0.0 && p_value < 1.0); + } +} diff --git a/src/compute/stats/mod.rs b/src/compute/stats/mod.rs new file mode 100644 index 0000000..ef7e420 --- /dev/null +++ b/src/compute/stats/mod.rs @@ -0,0 
+1,9 @@ +pub mod correlation; +pub mod descriptive; +pub mod distributions; +pub mod inferential; + +pub use correlation::*; +pub use descriptive::*; +pub use distributions::*; +pub use inferential::*; diff --git a/src/frame/mod.rs b/src/frame/mod.rs index e7840d4..ced467c 100644 --- a/src/frame/mod.rs +++ b/src/frame/mod.rs @@ -4,4 +4,4 @@ pub mod ops; pub use base::*; #[allow(unused_imports)] -pub use ops::*; \ No newline at end of file +pub use ops::*; diff --git a/src/lib.rs b/src/lib.rs index a68c8c9..510f36d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,3 +8,6 @@ pub mod frame; /// Documentation for the [`crate::utils`] module. pub mod utils; + +/// Documentation for the [`crate::compute`] module. +pub mod compute; diff --git a/src/matrix/boolops.rs b/src/matrix/boolops.rs index ddfa451..c830139 100644 --- a/src/matrix/boolops.rs +++ b/src/matrix/boolops.rs @@ -171,7 +171,7 @@ mod tests { #[test] fn test_bool_ops_count_overall() { let matrix = create_bool_test_matrix(); // Data: [T, F, T, F, T, F, T, F, F] - // Count of true values: 4 + // Count of true values: 4 assert_eq!(matrix.count(), 4); let matrix_all_false = BoolMatrix::from_vec(vec![false; 5], 5, 1); // 5x1 @@ -211,7 +211,7 @@ mod tests { #[test] fn test_bool_ops_1xn_matrix() { let matrix = BoolMatrix::from_vec(vec![true, false, false, true], 1, 4); // 1 row, 4 cols - // Data: [T, F, F, T] + // Data: [T, F, F, T] assert_eq!(matrix.any_vertical(), vec![true, false, false, true]); assert_eq!(matrix.all_vertical(), vec![true, false, false, true]); @@ -229,7 +229,7 @@ mod tests { #[test] fn test_bool_ops_nx1_matrix() { let matrix = BoolMatrix::from_vec(vec![true, false, false, true], 4, 1); // 4 rows, 1 col - // Data: [T, F, F, T] + // Data: [T, F, F, T] assert_eq!(matrix.any_vertical(), vec![true]); // T|F|F|T = T assert_eq!(matrix.all_vertical(), vec![false]); // T&F&F&T = F diff --git a/src/matrix/mat.rs b/src/matrix/mat.rs index 64bf6de..6709f07 100644 --- a/src/matrix/mat.rs +++ b/src/matrix/mat.rs @@ -63,6 +63,19 @@ impl Matrix { Matrix { rows, cols, data } } + /// Build from a flat Vec, assuming row-major order. + pub fn from_rows_vec(data: Vec, rows: usize, cols: usize) -> Self { + let mut new_vec = Vec::with_capacity(rows * cols); + + for c in 0..cols { + for r in 0..rows { + new_vec.push(data[r * cols + c].clone()); + } + } + + Matrix::from_vec(new_vec, rows, cols) + } + pub fn data(&self) -> &[T] { &self.data } @@ -89,6 +102,10 @@ impl Matrix { self.cols } + pub fn shape(&self) -> (usize, usize) { + (self.rows, self.cols) + } + /// Get element reference (immutable). Panics on out-of-bounds. pub fn get(&self, r: usize, c: usize) -> &T { &self[(r, c)] @@ -179,6 +196,40 @@ impl Matrix { self.cols -= 1; } + #[inline] + pub fn row(&self, r: usize) -> Vec { + assert!( + r < self.rows, + "row index {} out of bounds for {} rows", + r, + self.rows + ); + let mut row_data = Vec::with_capacity(self.cols); + for c in 0..self.cols { + row_data.push(self[(r, c)].clone()); // Clone each element + } + row_data + } + pub fn row_copy_from_slice(&mut self, r: usize, values: &[T]) { + assert!( + r < self.rows, + "row index {} out of bounds for {} rows", + r, + self.rows + ); + assert!( + values.len() == self.cols, + "input slice length {} does not match number of columns {}", + values.len(), + self.cols + ); + + for (c, value) in values.iter().enumerate() { + let idx = r + c * self.rows; // column-major index + self.data[idx] = value.clone(); + } + } + /// Deletes a row from the matrix. Panics on out-of-bounds. 
/// This is O(N) where N is the number of elements, as it rebuilds the data vec. pub fn delete_row(&mut self, row: usize) { @@ -308,6 +359,82 @@ impl Matrix { self.data = new_data; self.rows = new_rows; } + + /// Return a new matrix where row 0 of `self` is repeated `n` times. + pub fn repeat_rows(&self, n: usize) -> Matrix + where + T: Clone, + { + let mut data = Vec::with_capacity(n * self.cols()); + let zeroth_row = self.row(0); + for value in &zeroth_row { + for _ in 0..n { + data.push(value.clone()); // Clone each element + } + } + Matrix::from_vec(data, n, self.cols) + } + + /// Creates a new matrix filled with a specific value of the specified size. + pub fn filled(rows: usize, cols: usize, value: T) -> Self { + Matrix { + rows, + cols, + data: vec![value; rows * cols], // Fill with the specified value + } + } + + /// Creates a new matrix by broadcasting a 1-row matrix to a target shape. + /// Panics if `self` is not a 1-row matrix or if `self.cols()` does not match `target_cols`. + pub fn broadcast_row_to_target_shape( + &self, + target_rows: usize, + target_cols: usize, + ) -> Matrix { + assert_eq!( + self.rows(), + 1, + "broadcast_row_to_target_shape can only be called on a 1-row matrix." + ); + assert_eq!( + self.cols(), + target_cols, + "Column count mismatch for broadcasting: source has {} columns, target has {} columns.", + self.cols(), + target_cols + ); + + let mut data = Vec::with_capacity(target_rows * target_cols); + let original_row_data = self.row(0); // Get the single row data + + for _ in 0..target_rows { + // Repeat 'target_rows' times + for value in &original_row_data { + // Iterate over elements of the row + data.push(value.clone()); + } + } + // The data is now in row-major order for the new matrix. + // We need to convert it to column-major for Matrix::from_vec. + Matrix::from_rows_vec(data, target_rows, target_cols) + } +} + +impl Matrix { + /// Creates a new matrix filled with zeros of the specified size. + pub fn zeros(rows: usize, cols: usize) -> Self { + Matrix::filled(rows, cols, 0.0) + } + + /// Creates a new matrix filled with ones of the specified size. + pub fn ones(rows: usize, cols: usize) -> Self { + Matrix::filled(rows, cols, 1.0) + } + + /// Creates a new matrix filled with NaN values of the specified size. 
+ pub fn nan(rows: usize, cols: usize) -> Matrix { + Matrix::filled(rows, cols, f64::NAN) + } } impl Index<(usize, usize)> for Matrix { @@ -899,6 +1026,20 @@ mod tests { assert_eq!(m.to_vec(), vec![1.0, 3.0, 2.0, 4.0]); } + #[test] + fn test_from_rows_vec() { + // Representing: + // 1 2 3 + // 4 5 6 + let rows_data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]; + let matrix = Matrix::from_rows_vec(rows_data, 2, 3); + + let data = vec![1.0, 4.0, 2.0, 5.0, 3.0, 6.0]; // Column-major + let expected = Matrix::from_vec(data, 2, 3); + + assert_eq!(matrix, expected); + } + // Helper function to create a basic Matrix for testing fn static_test_matrix() -> Matrix { // Column-major data: @@ -1110,6 +1251,70 @@ mod tests { matrix[(0, 3)] = 99; } + #[test] + fn test_row() { + let ma = static_test_matrix(); + assert_eq!(ma.row(0), &[1, 4, 7]); + assert_eq!(ma.row(1), &[2, 5, 8]); + assert_eq!(ma.row(2), &[3, 6, 9]); + } + + #[test] + fn test_row_copy_from_slice() { + let mut ma = static_test_matrix(); + let new_row = vec![10, 20, 30]; + ma.row_copy_from_slice(1, &new_row); + assert_eq!(ma.row(1), &[10, 20, 30]); + } + #[test] + #[should_panic(expected = "row index 4 out of bounds for 3 rows")] + fn test_row_copy_from_slice_out_of_bounds() { + let mut ma = static_test_matrix(); + let new_row = vec![10, 20, 30]; + ma.row_copy_from_slice(4, &new_row); + } + + #[test] + #[should_panic(expected = "row index 3 out of bounds for 3 rows")] + fn test_row_out_of_bounds_index() { + let ma = static_test_matrix(); + ma.row(3); + } + + #[test] + #[should_panic(expected = "input slice length 2 does not match number of columns 3")] + fn test_row_copy_from_slice_wrong_length() { + let mut ma = static_test_matrix(); + let new_row = vec![10, 20]; // Only 2 elements, but row length is 3 + ma.row_copy_from_slice(1, &new_row); + } + + #[test] + fn test_shape() { + let ma = static_test_matrix_2x4(); + assert_eq!(ma.shape(), (2, 4)); + assert_eq!(ma.rows(), 2); + assert_eq!(ma.cols(), 4); + } + + #[test] + fn test_repeat_rows() { + let ma = static_test_matrix(); + // Returns a new matrix where row 0 of `self` is repeated `n` times. 
+ let repeated = ma.repeat_rows(3); + // assert all rows are equal to the first row + for r in 0..repeated.rows() { + assert_eq!(repeated.row(r), ma.row(0)); + } + } + + #[test] + #[should_panic(expected = "row index 3 out of bounds for 3 rows")] + fn test_row_out_of_bounds() { + let ma = static_test_matrix(); + ma.row(3); + } + #[test] fn test_column() { let matrix = static_test_matrix_2x4(); @@ -1794,4 +1999,86 @@ mod tests { } } } + + #[test] + fn test_matrix_zeros_ones_filled() { + // Test zeros + let m = Matrix::::zeros(2, 3); + assert_eq!(m.rows(), 2); + assert_eq!(m.cols(), 3); + assert_eq!(m.data(), &[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]); + + // Test ones + let m = Matrix::::ones(3, 2); + assert_eq!(m.rows(), 3); + assert_eq!(m.cols(), 2); + assert_eq!(m.data(), &[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]); + + // Test filled + let m = Matrix::::filled(2, 2, 42.5); + assert_eq!(m.rows(), 2); + assert_eq!(m.cols(), 2); + assert_eq!(m.data(), &[42.5, 42.5, 42.5, 42.5]); + + // test with an integer matrix + let m = Matrix::::filled(2, 3, 7); + assert_eq!(m.rows(), 2); + assert_eq!(m.cols(), 3); + assert_eq!(m.data(), &[7, 7, 7, 7, 7, 7]); + + // test with nans + let m = Matrix::nan(3, 3); + assert_eq!(m.rows(), 3); + assert_eq!(m.cols(), 3); + for &value in m.data() { + assert!(value.is_nan(), "Expected NaN, got {}", value); + } + } + + #[test] + fn test_broadcast_row_to_target_shape_basic() { + let single_row_matrix = Matrix::from_rows_vec(vec![1.0, 2.0, 3.0], 1, 3); + let target_rows = 5; + let target_cols = 3; + + let broadcasted = single_row_matrix.broadcast_row_to_target_shape(target_rows, target_cols); + + assert_eq!(broadcasted.rows(), target_rows); + assert_eq!(broadcasted.cols(), target_cols); + + for r in 0..target_rows { + assert_eq!(broadcasted.row(r), vec![1.0, 2.0, 3.0]); + } + } + + #[test] + fn test_broadcast_row_to_target_shape_single_row() { + let single_row_matrix = Matrix::from_rows_vec(vec![10.0, 20.0], 1, 2); + let target_rows = 1; + let target_cols = 2; + + let broadcasted = single_row_matrix.broadcast_row_to_target_shape(target_rows, target_cols); + + assert_eq!(broadcasted.rows(), target_rows); + assert_eq!(broadcasted.cols(), target_cols); + assert_eq!(broadcasted.row(0), vec![10.0, 20.0]); + } + + #[test] + #[should_panic( + expected = "broadcast_row_to_target_shape can only be called on a 1-row matrix." + )] + fn test_broadcast_row_to_target_shape_panic_not_1_row() { + let multi_row_matrix = Matrix::from_rows_vec(vec![1.0, 2.0, 3.0, 4.0], 2, 2); + multi_row_matrix.broadcast_row_to_target_shape(3, 2); + } + + #[test] + #[should_panic( + expected = "Column count mismatch for broadcasting: source has 3 columns, target has 4 columns." + )] + fn test_broadcast_row_to_target_shape_panic_col_mismatch() { + let single_row_matrix = Matrix::from_rows_vec(vec![1.0, 2.0, 3.0], 1, 3); + single_row_matrix.broadcast_row_to_target_shape(5, 4); + } } diff --git a/src/matrix/mod.rs b/src/matrix/mod.rs index 43977ab..509d449 100644 --- a/src/matrix/mod.rs +++ b/src/matrix/mod.rs @@ -1,7 +1,7 @@ +pub mod boolops; pub mod mat; pub mod seriesops; -pub mod boolops; +pub use boolops::*; pub use mat::*; pub use seriesops::*; -pub use boolops::*; \ No newline at end of file
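The usage sketches below are illustrative additions, not part of the diff itself. They assume the crate-internal paths introduced above (crate::compute::stats and crate::matrix) and the column-major layout that Matrix::from_vec expects. First, the scalar and axis-wise descriptive statistics:

#[test]
fn sketch_descriptive_stats_usage() {
    use crate::compute::stats::{
        mean, mean_horizontal, mean_vertical, median, population_variance, sample_variance,
        stddev,
    };
    use crate::matrix::Matrix;

    // Column-major data for the 2x3 matrix
    // 1 2 3
    // 4 5 6
    let x = Matrix::from_vec(vec![1.0, 4.0, 2.0, 5.0, 3.0, 6.0], 2, 3);

    assert!((mean(&x) - 3.5).abs() < 1e-9);
    assert!((median(&x) - 3.5).abs() < 1e-9); // even count: average of the two middle values
    assert!((population_variance(&x) - 17.5 / 6.0).abs() < 1e-9); // denominator n
    assert!((sample_variance(&x) - 17.5 / 5.0).abs() < 1e-9); // denominator n - 1
    assert!((stddev(&x) - (17.5_f64 / 6.0).sqrt()).abs() < 1e-9);

    // `_vertical` reduces each column to one value (1 x cols),
    // `_horizontal` reduces each row to one value (rows x 1).
    assert_eq!(mean_vertical(&x).shape(), (1, 3));
    assert_eq!(mean_horizontal(&x).shape(), (2, 1));
}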
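The percentile helpers use a nearest-rank rule, index = round((p / 100) * (n - 1)), with no interpolation between neighbours. A small worked check of that convention (again a sketch, not part of the diff):

#[test]
fn sketch_percentile_convention() {
    use crate::compute::stats::percentile;
    use crate::matrix::Matrix;

    let x = Matrix::from_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0], 1, 5);

    // p = 30: index = round(0.30 * 4) = round(1.2) = 1 -> 2.0
    assert_eq!(percentile(&x, 30.0), 2.0);
    // p = 90: index = round(0.90 * 4) = round(3.6) = 4 -> 5.0
    assert_eq!(percentile(&x, 90.0), 5.0);
}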
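The continuous distributions operate elementwise and take the input matrix by value. A sketch against the standard normal and a Uniform(0, 2), assuming the re-exports in src/compute/stats/mod.rs:

#[test]
fn sketch_continuous_distribution_usage() {
    use crate::compute::stats::{normal_cdf, normal_pdf, uniform_cdf, uniform_pdf};
    use crate::matrix::Matrix;

    // Evaluate the standard normal at 0 for every element of a 2x2 matrix.
    let z = Matrix::filled(2, 2, 0.0);
    let pdf = normal_pdf(z.clone(), 0.0, 1.0); // 1 / sqrt(2 * pi) ~= 0.39894
    let cdf = normal_cdf(z, 0.0, 1.0);         // symmetric about the mean, so 0.5
    assert!((pdf.data()[0] - 0.39894228).abs() < 1e-6);
    assert!((cdf.data()[0] - 0.5).abs() < 1e-6);

    // Uniform(0, 2): density is 1 / (b - a) inside the support, CDF is linear.
    let u = Matrix::filled(1, 3, 1.0);
    assert_eq!(uniform_pdf(u.clone(), 0.0, 2.0).data()[0], 0.5);
    assert_eq!(uniform_cdf(u, 0.0, 2.0).data()[0], 0.5);
}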
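The discrete distributions take a matrix of u64 counts and evaluate the PMF or CDF per element. A sketch with exact binomial and Poisson values:

#[test]
fn sketch_discrete_distribution_usage() {
    use crate::compute::stats::{binomial_pmf, poisson_cdf, poisson_pmf};
    use crate::matrix::Matrix;

    // P(X = 2) for X ~ Binomial(4, 0.5) is C(4, 2) * 0.5^4 = 0.375.
    let k = Matrix::filled(1, 1, 2u64);
    let pmf = binomial_pmf(4, k, 0.5);
    assert!((pmf.data()[0] - 0.375).abs() < 1e-12);

    // For X ~ Poisson(2), P(X = 0) = e^-2 and P(X <= 1) = 3 * e^-2.
    let zero = Matrix::filled(1, 1, 0u64);
    let one = Matrix::filled(1, 1, 1u64);
    assert!((poisson_pmf(2.0, zero).data()[0] - (-2.0_f64).exp()).abs() < 1e-12);
    assert!((poisson_cdf(2.0, one).data()[0] - 3.0 * (-2.0_f64).exp()).abs() < 1e-12);
}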
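For the inferential helpers, note that t_test in the diff above returns a constant p-value of 0.5 rather than one derived from the t-distribution, so the sketch below only checks the Welch statistic; chi2_test and anova return gamma-approximated p-values:

#[test]
fn sketch_inferential_usage() {
    use crate::compute::stats::{anova, chi2_test, t_test};
    use crate::matrix::Matrix;

    let a = Matrix::from_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0], 1, 5);
    let b = Matrix::from_vec(vec![2.0, 3.0, 4.0, 5.0, 6.0], 1, 5);

    // Welch statistic: (mean_a - mean_b) / sqrt(var_a / n_a + var_b / n_b)
    // = (3 - 4) / sqrt(2.5 / 5 + 2.5 / 5) = -1.
    let (t, _p) = t_test(&a, &b);
    assert!((t + 1.0).abs() < 1e-9);

    // 2x2 contingency table; returns (statistic, approximate p-value).
    let observed = Matrix::from_vec(vec![12.0, 5.0, 8.0, 10.0], 2, 2);
    let (chi2, _p) = chi2_test(&observed);
    assert!(chi2 > 0.0);

    // One-way ANOVA over three groups.
    let (f, _p) = anova(vec![&a, &b, &a]);
    assert!(f > 0.0);
}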
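Finally, a sketch of the new Matrix constructors and the 1-row broadcasting helper, using the row-major from_rows_vec input shown in src/matrix/mat.rs:

#[test]
fn sketch_matrix_helper_usage() {
    use crate::matrix::Matrix;

    // Row-major input for the matrix
    // 1 2 3
    // 4 5 6
    let m = Matrix::from_rows_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0], 2, 3);
    assert_eq!(m.shape(), (2, 3));
    assert_eq!(m.row(0), vec![1.0, 2.0, 3.0]);

    // Broadcasting a single row to a 4x3 target repeats that row.
    let base = Matrix::from_rows_vec(vec![1.0, 2.0, 3.0], 1, 3);
    let tiled = base.broadcast_row_to_target_shape(4, 3);
    assert_eq!(tiled.shape(), (4, 3));
    for r in 0..4 {
        assert_eq!(tiled.row(r), vec![1.0, 2.0, 3.0]);
    }

    // Constant-filled constructors.
    let ones = Matrix::<f64>::ones(2, 2);
    assert_eq!(ones.data(), &[1.0, 1.0, 1.0, 1.0]);
}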