1 Commits

Author SHA1 Message Date
Palash Tyagi
d79cab35da Merge a08fb546a9 into 11330e464b 2025-07-07 00:02:29 +01:00
17 changed files with 160 additions and 1986 deletions

View File

@@ -1,6 +1,6 @@
[package]
name = "rustframe"
version = "0.0.1-a.20250716"
version = "0.0.1-a.0"
edition = "2021"
license = "GPL-3.0-or-later"
readme = "README.md"
@@ -19,6 +19,9 @@ rand = "^0.9.1"
[features]
bench = ["dep:criterion"]
# [dev-dependencies]
# criterion = { version = "0.5", features = ["html_reports"], optional = true }
[[bench]]
name = "benchmarks"
harness = false

View File

@@ -2,7 +2,7 @@
<!-- # <img align="center" alt="Rustframe" src=".github/rustframe_logo.png" height="50px" /> rustframe -->
<!-- though the centre tag doesn't work as it would normally, it achieves the desired effect -->
<!-- though the centre tag doesn't work as it would noramlly, it achieves the desired effect -->
📚 [Docs](https://magnus167.github.io/rustframe/) | 🐙 [GitHub](https://github.com/Magnus167/rustframe) | 🌐 [Gitea mirror](https://gitea.nulltech.uk/Magnus167/rustframe) | 🦀 [Crates.io](https://crates.io/crates/rustframe) | 🔖 [docs.rs](https://docs.rs/rustframe/latest/rustframe/)
@@ -15,54 +15,26 @@
## Rustframe: _A lightweight dataframe & math toolkit for Rust_
Rustframe provides intuitive dataframe, matrix, and series operations for data analysis and manipulation.
Rustframe provides intuitive dataframe, matrix, and series operations small-to-mid scale data analysis and manipulation.
Rustframe keeps things simple, safe, and readable. It is handy for quick numeric experiments and small analytical tasks as well as for educational purposes. It is designed to be easy to use and understand, with a clean API implemented in 100% safe Rust.
Rustframe is an educational project, and is not intended for production use. It is **not** meant to compete with powerhouse crates like `polars` or `ndarray`. It is a work in progress, and the API is subject to change. There are no guarantees of stability or performance, and it is not optimized for large datasets or high-performance computing.
Rustframe keeps things simple, safe, and readable. It is handy for quick numeric experiments and small analytical tasks, but it is **not** meant to compete with powerhouse crates like `polars` or `ndarray`.
### What it offers
- **Matrix operations** - Element-wise arithmetic, boolean logic, transpose, and more.
- **Math that reads like math** - element-wise `+`, ``, `×`, `÷` on entire frames or scalars.
- **Frames** - Column major data structure for single-type data, with labeled columns and typed row indices.
- **Compute module** - Implements various statistical computations and machine learning models.
- **[Coming Soon]** _DataFrame_ - Multi-type data structure for heterogeneous data, with labeled columns and typed row indices.
- **[Coming Soon]** _Random number utils_ - Random number generation utilities for statistical sampling and simulations. (Currently using the [`rand`](https://crates.io/crates/rand) crate.)
#### Matrix and Frame functionality
- **Matrix operations** - Element-wise arithmetic, boolean logic, transpose, and more.
- **Frame operations** - Column manipulation, sorting, and more.
#### Compute Module
The `compute` module provides implementations for various statistical computations and machine learning models.
**Statistics, Data Analysis, and Machine Learning:**
- Correlation analysis
- Descriptive statistics
- Distributions
- Inferential statistics
- Dense Neural Networks
- Gaussian Naive Bayes
- K-Means Clustering
- Linear Regression
- Logistic Regression
- Principal Component Analysis
- **Math that reads like math** - elementwise `+`, ``, `×`, `÷` on entire frames or scalars.
- **Broadcast & reduce** - sum, product, any/all across rows or columns without boilerplate.
- **Boolean masks made simple** - chain comparisons, combine with `&`/`|`, get a tidy `BoolMatrix` back.
- **Datecentric row index** - businessday ranges and calendar slicing built in.
- **Pure safe Rust** - 100% safe, zero `unsafe`.
### Heads up
- **Not memoryefficient (yet)** - footprint needs work.
- **The feature set is still limited** - expect missing pieces.
- **Feature set still small** - expect missing pieces.
### Somewhere down the line
### On the horizon
- Optional GPU acceleration (Vulkan or similar) for heavier workloads.
- Optional GPU help (Vulkan or similar) for heavier workloads.
- Straightforward Python bindings using `pyo3`.
---
@@ -79,7 +51,7 @@ use rustframe::{
let n_periods = 4;
// Four business days starting 2024-01-02
// Four business days starting 20240102
let dates: Vec<NaiveDate> =
BDatesList::from_n_periods("2024-01-02".to_string(), DateFreq::Daily, n_periods)
.unwrap()
@@ -114,13 +86,13 @@ let result: Matrix<f64> = result / 2.0; // divide by scalar
let check: bool = result.eq_elem(ma.clone()).all();
assert!(check);
// Alternatively:
// The above math can also be written as:
let check: bool = (&(&(&(&ma + 1.0) - 1.0) * 2.0) / 2.0)
.eq_elem(ma.clone())
.all();
assert!(check);
// or even as:
// The above math can also be written as:
let check: bool = ((((ma.clone() + 1.0) - 1.0) * 2.0) / 2.0)
.eq_elem(ma.clone())
.all();
@@ -191,11 +163,3 @@ E.g. to run the `game_of_life` example:
```bash
cargo run --example game_of_life
```
### Running benchmarks
To run the benchmarks, use:
```bash
cargo bench --features "bench"
```

View File

@@ -1,3 +1,4 @@
pub mod models;
pub mod stats;
pub mod stats;

View File

@@ -242,22 +242,6 @@ mod tests {
assert_eq!(preds.cols(), 1);
}
#[test]
#[should_panic(expected = "Number of activation functions must match number of layers")]
fn test_invalid_activation_count() {
let config = DenseNNConfig {
input_size: 2,
hidden_layers: vec![3],
activations: vec![ActivationKind::Relu], // Only one activation for two layers
output_size: 1,
initializer: InitializerKind::Uniform(0.1),
loss: LossKind::MSE,
learning_rate: 0.01,
epochs: 0,
};
let _model = DenseNN::new(config);
}
#[test]
fn test_train_no_epochs_does_nothing() {
let config = DenseNNConfig {
@@ -280,8 +264,10 @@ mod tests {
for i in 0..before.rows() {
for j in 0..before.cols() {
// "prediction changed despite 0 epochs"
assert!((before[(i, j)] - after[(i, j)]).abs() < 1e-12);
assert!(
(before[(i, j)] - after[(i, j)]).abs() < 1e-12,
"prediction changed despite 0 epochs"
);
}
}
}
@@ -344,181 +330,11 @@ mod tests {
let after_preds = model.predict(&x);
let after_loss = mse_loss(&after_preds, &y);
// MSE did not decrease (before: {}, after: {})
assert!(after_loss < before_loss);
}
#[test]
fn test_activation_kind_forward_tanh() {
let input = Matrix::from_vec(vec![-1.0, 0.0, 1.0], 3, 1);
let expected = Matrix::from_vec(vec![-0.76159415595, 0.0, 0.76159415595], 3, 1);
let output = ActivationKind::Tanh.forward(&input);
for i in 0..input.rows() {
for j in 0..input.cols() {
// Tanh forward output mismatch at ({}, {})
assert!((output[(i, j)] - expected[(i, j)]).abs() < 1e-9);
}
}
}
#[test]
fn test_activation_kind_derivative_relu() {
let input = Matrix::from_vec(vec![-1.0, 0.0, 1.0], 3, 1);
let expected = Matrix::from_vec(vec![0.0, 0.0, 1.0], 3, 1);
let output = ActivationKind::Relu.derivative(&input);
for i in 0..input.rows() {
for j in 0..input.cols() {
// "ReLU derivative output mismatch at ({}, {})"
assert!((output[(i, j)] - expected[(i, j)]).abs() < 1e-9);
}
}
}
#[test]
fn test_activation_kind_derivative_tanh() {
let input = Matrix::from_vec(vec![-1.0, 0.0, 1.0], 3, 1);
let expected = Matrix::from_vec(vec![0.41997434161, 1.0, 0.41997434161], 3, 1); // 1 - tanh(x)^2
let output = ActivationKind::Tanh.derivative(&input);
for i in 0..input.rows() {
for j in 0..input.cols() {
// "Tanh derivative output mismatch at ({}, {})"
assert!((output[(i, j)] - expected[(i, j)]).abs() < 1e-9);
}
}
}
#[test]
fn test_initializer_kind_xavier() {
let rows = 10;
let cols = 20;
let initializer = InitializerKind::Xavier;
let matrix = initializer.initialize(rows, cols);
let limit = (6.0 / (rows + cols) as f64).sqrt();
assert_eq!(matrix.rows(), rows);
assert_eq!(matrix.cols(), cols);
for val in matrix.data() {
// Xavier initialized value out of range
assert!(*val >= -limit && *val <= limit);
}
}
#[test]
fn test_initializer_kind_he() {
let rows = 10;
let cols = 20;
let initializer = InitializerKind::He;
let matrix = initializer.initialize(rows, cols);
let limit = (2.0 / rows as f64).sqrt();
assert_eq!(matrix.rows(), rows);
assert_eq!(matrix.cols(), cols);
for val in matrix.data() {
// He initialized value out of range
assert!(*val >= -limit && *val <= limit);
}
}
#[test]
fn test_loss_kind_bce_gradient() {
let y_hat = Matrix::from_vec(vec![0.1, 0.9, 0.4], 3, 1);
let y = Matrix::from_vec(vec![0.0, 1.0, 0.5], 3, 1);
let expected_gradient = Matrix::from_vec(vec![0.1 / 3.0, -0.1 / 3.0, -0.1 / 3.0], 3, 1); // (y_hat - y) * (1.0 / m)
let output_gradient = LossKind::BCE.gradient(&y_hat, &y);
assert_eq!(output_gradient.rows(), expected_gradient.rows());
assert_eq!(output_gradient.cols(), expected_gradient.cols());
for i in 0..output_gradient.rows() {
for j in 0..output_gradient.cols() {
// BCE gradient output mismatch at ({}, {})
assert!((output_gradient[(i, j)] - expected_gradient[(i, j)]).abs() < 1e-9);
}
}
}
#[test]
fn test_training_reduces_bce_loss() {
// Single-layer sigmoid setup; check BCE loss goes down.
let config = DenseNNConfig {
input_size: 1,
hidden_layers: vec![],
activations: vec![ActivationKind::Sigmoid],
output_size: 1,
initializer: InitializerKind::Uniform(0.1),
loss: LossKind::BCE,
learning_rate: 1.0,
epochs: 10,
};
let mut model = DenseNN::new(config);
let x = Matrix::from_vec(vec![0.0, 1.0, 0.5], 3, 1);
let y = Matrix::from_vec(vec![0.0, 1.0, 0.5], 3, 1);
let before_preds = model.predict(&x);
// BCE loss calculation for testing
let before_loss = -1.0 / (y.rows() as f64)
* before_preds
.zip(&y, |yh, yv| yv * yh.ln() + (1.0 - yv) * (1.0 - yh).ln())
.data()
.iter()
.sum::<f64>();
model.train(&x, &y);
let after_preds = model.predict(&x);
let after_loss = -1.0 / (y.rows() as f64)
* after_preds
.zip(&y, |yh, yv| yv * yh.ln() + (1.0 - yv) * (1.0 - yh).ln())
.data()
.iter()
.sum::<f64>();
// BCE did not decrease (before: {}, after: {})
assert!(after_loss < before_loss,);
}
#[test]
fn test_train_backprop_delta_propagation() {
// Network with two layers to test delta propagation to previous layer (l > 0)
let config = DenseNNConfig {
input_size: 2,
hidden_layers: vec![3],
activations: vec![ActivationKind::Sigmoid, ActivationKind::Sigmoid],
output_size: 1,
initializer: InitializerKind::Uniform(0.1),
loss: LossKind::MSE,
learning_rate: 0.1,
epochs: 1,
};
let mut model = DenseNN::new(config);
// Store initial weights and biases to compare after training
let initial_weights_l0 = model.weights[0].clone();
let initial_biases_l0 = model.biases[0].clone();
let initial_weights_l1 = model.weights[1].clone();
let initial_biases_l1 = model.biases[1].clone();
let x = Matrix::from_vec(vec![0.1, 0.2, 0.3, 0.4], 2, 2);
let y = Matrix::from_vec(vec![0.5, 0.6], 2, 1);
model.train(&x, &y);
// Verify that weights and biases of both layers have changed,
// implying delta propagation occurred for l > 0
// Weights of first layer did not change, delta propagation might not have occurred
assert!(model.weights[0] != initial_weights_l0);
// Biases of first layer did not change, delta propagation might not have occurred
assert!(model.biases[0] != initial_biases_l0);
// Weights of second layer did not change
assert!(model.weights[1] != initial_weights_l1);
// Biases of second layer did not change
assert!(model.biases[1] != initial_biases_l1);
assert!(
after_loss < before_loss,
"MSE did not decrease (before: {}, after: {})",
before_loss,
after_loss
);
}
}

View File

@@ -59,7 +59,9 @@ impl GaussianNB {
let bits = label.to_bits();
groups.entry(bits).or_default().push(i);
}
assert!(!groups.is_empty(), "No class labels found in y"); //-- panicked earlier
if groups.is_empty() {
panic!("No class labels found in y");
}
// Extract and sort class labels
self.classes = groups.keys().cloned().map(f64::from_bits).collect();
@@ -88,6 +90,9 @@ impl GaussianNB {
for &c in &self.classes {
let idx = &groups[&c.to_bits()];
let count = idx.len();
if count == 0 {
panic!("Class group for label {} is empty", c);
}
// Prior
self.priors.push(count as f64 / m as f64);
@@ -204,27 +209,6 @@ mod tests {
let y = Matrix::from_vec(vec![0.0], 1, 1);
let mut clf = GaussianNB::new(1e-9, false);
clf.fit(&x, &y);
}
#[test]
fn test_variance_smoothing_override_with_zero_smoothing() {
// Scenario: var_smoothing is 0, and a feature has zero variance within a class.
// This should trigger the `if var[(0, j)] <= 0.0 { var[(0, j)] = smoothing; }` line.
let x = Matrix::from_vec(vec![1.0, 1.0, 2.0], 3, 1); // Class 0: [1.0, 1.0], Class 1: [2.0]
let y = Matrix::from_vec(vec![0.0, 0.0, 1.0], 3, 1);
let mut clf = GaussianNB::new(0.0, false); // var_smoothing = 0.0
clf.fit(&x, &y);
// For class 0 (index 0 in clf.classes), the feature (index 0) had values [1.0, 1.0], so variance was 0.
// Since var_smoothing was 0, smoothing is 0.
// The line `var[(0, j)] = smoothing;` should have set the variance to 0.0.
let class_0_idx = clf.classes.iter().position(|&c| c == 0.0).unwrap();
assert_eq!(clf.variances[class_0_idx][(0, 0)], 0.0);
// For class 1 (index 1 in clf.classes), the feature (index 0) had value [2.0].
// Variance calculation for a single point results in 0.
// The if condition will be true, and var[(0, j)] will be set to smoothing (0.0).
let class_1_idx = clf.classes.iter().position(|&c| c == 1.0).unwrap();
assert_eq!(clf.variances[class_1_idx][(0, 0)], 0.0);
clf.predict(&x);
}
}

View File

@@ -1,6 +1,4 @@
use crate::compute::stats::mean_vertical;
use crate::matrix::Matrix;
use rand::rng;
use rand::seq::SliceRandom;
pub struct KMeans {
@@ -14,52 +12,35 @@ impl KMeans {
let n = x.cols();
assert!(k <= m, "k must be ≤ number of samples");
// ----- initialise centroids -----
// ----- initialise centroids: pick k distinct rows at random -----
let mut rng = rand::rng();
let mut indices: Vec<usize> = (0..m).collect();
indices.shuffle(&mut rng);
let mut centroids = Matrix::zeros(k, n);
if k > 0 && m > 0 {
// case for empty data
if k == 1 {
let mean = mean_vertical(x);
centroids.row_copy_from_slice(0, &mean.data()); // ideally, data.row(0), but thats the same
} else {
// For k > 1, pick k distinct rows at random
let mut rng = rng();
let mut indices: Vec<usize> = (0..m).collect();
indices.shuffle(&mut rng);
for c in 0..k {
centroids.row_copy_from_slice(c, &x.row(indices[c]));
}
for (c, &i) in indices[..k].iter().enumerate() {
for j in 0..n {
centroids[(c, j)] = x[(i, j)];
}
}
let mut labels = vec![0usize; m];
let mut distances = vec![0.0f64; m];
for _iter in 0..max_iter {
let mut changed = false;
for _ in 0..max_iter {
// ----- assignment step -----
let mut changed = false;
for i in 0..m {
let sample_row = x.row(i);
let mut best = 0usize;
let mut best_dist_sq = f64::MAX;
let mut best_dist = f64::MAX;
for c in 0..k {
let centroid_row = centroids.row(c);
let dist_sq: f64 = sample_row
.iter()
.zip(centroid_row.iter())
.map(|(a, b)| (a - b).powi(2))
.sum();
if dist_sq < best_dist_sq {
best_dist_sq = dist_sq;
let mut dist = 0.0;
for j in 0..n {
let d = x[(i, j)] - centroids[(c, j)];
dist += d * d;
}
if dist < best_dist {
best_dist = dist;
best = c;
}
}
distances[i] = best_dist_sq;
if labels[i] != best {
labels[i] = best;
changed = true;
@@ -67,57 +48,37 @@ impl KMeans {
}
// ----- update step -----
let mut new_centroids = Matrix::zeros(k, n);
let mut counts = vec![0usize; k];
let mut centroids = Matrix::zeros(k, n);
for i in 0..m {
let c = labels[i];
counts[c] += 1;
for j in 0..n {
new_centroids[(c, j)] += x[(i, j)];
centroids[(c, j)] += x[(i, j)];
}
}
for c in 0..k {
if counts[c] == 0 {
// This cluster is empty. Re-initialize its centroid to the point
// furthest from its assigned centroid to prevent the cluster from dying.
let mut furthest_point_idx = 0;
let mut max_dist_sq = 0.0;
for (i, &dist) in distances.iter().enumerate() {
if dist > max_dist_sq {
max_dist_sq = dist;
furthest_point_idx = i;
}
}
if counts[c] > 0 {
for j in 0..n {
new_centroids[(c, j)] = x[(furthest_point_idx, j)];
}
// Ensure this point isn't chosen again for another empty cluster in the same iteration.
if m > 0 {
distances[furthest_point_idx] = 0.0;
}
} else {
// Normalize the centroid by the number of points in it.
for j in 0..n {
new_centroids[(c, j)] /= counts[c] as f64;
centroids[(c, j)] /= counts[c] as f64;
}
}
}
// ----- convergence test -----
if !changed {
centroids = new_centroids; // update before breaking
break; // assignments stable
}
let diff = &new_centroids - &centroids;
centroids = new_centroids; // Update for the next iteration
if tol > 0.0 {
let sq_diff = &diff * &diff;
let shift = sq_diff.data().iter().sum::<f64>().sqrt();
if shift < tol {
// optional centroid-shift tolerance
let mut shift: f64 = 0.0;
for c in 0..k {
for j in 0..n {
let d = centroids[(c, j)] - centroids[(c, j)]; // previous stored?
shift += d * d;
}
}
if shift.sqrt() < tol {
break;
}
}
@@ -129,28 +90,19 @@ impl KMeans {
pub fn predict(&self, x: &Matrix<f64>) -> Vec<usize> {
let m = x.rows();
let k = self.centroids.rows();
if m == 0 {
return Vec::new();
}
let n = x.cols();
let mut labels = vec![0usize; m];
for i in 0..m {
let sample_row = x.row(i);
let mut best = 0usize;
let mut best_dist_sq = f64::MAX;
let mut best_dist = f64::MAX;
for c in 0..k {
let centroid_row = self.centroids.row(c);
let dist_sq: f64 = sample_row
.iter()
.zip(centroid_row.iter())
.map(|(a, b)| (a - b).powi(2))
.sum();
if dist_sq < best_dist_sq {
best_dist_sq = dist_sq;
let mut dist = 0.0;
for j in 0..n {
let d = x[(i, j)] - self.centroids[(c, j)];
dist += d * d;
}
if dist < best_dist {
best_dist = dist;
best = c;
}
}
@@ -159,206 +111,3 @@ impl KMeans {
labels
}
}
#[cfg(test)]
mod tests {
#[test]
fn test_k_means_empty_cluster_reinit_centroid() {
// Try multiple times to increase the chance of hitting the empty cluster case
for _ in 0..20 {
let data = vec![0.0, 0.0, 0.0, 0.0, 10.0, 10.0];
let x = FloatMatrix::from_rows_vec(data, 3, 2);
let k = 2;
let max_iter = 10;
let tol = 1e-6;
let (kmeans_model, labels) = KMeans::fit(&x, k, max_iter, tol);
// Check if any cluster is empty
let mut counts = vec![0; k];
for &label in &labels {
counts[label] += 1;
}
if counts.iter().any(|&c| c == 0) {
// Only check the property for clusters that are empty
let centroids = kmeans_model.centroids;
for c in 0..k {
if counts[c] == 0 {
let mut matches_data_point = false;
for i in 0..3 {
let dx = centroids[(c, 0)] - x[(i, 0)];
let dy = centroids[(c, 1)] - x[(i, 1)];
if dx.abs() < 1e-9 && dy.abs() < 1e-9 {
matches_data_point = true;
break;
}
}
assert!(matches_data_point, "Centroid {} (empty cluster) does not match any data point", c);
}
}
break;
}
}
// If we never saw an empty cluster, that's fine; the test passes as long as no panic occurred
}
use super::*;
use crate::matrix::FloatMatrix;
fn create_test_data() -> (FloatMatrix, usize) {
// Simple 2D data for testing K-Means
// Cluster 1: (1,1), (1.5,1.5)
// Cluster 2: (5,8), (8,8), (6,7)
let data = vec![
1.0, 1.0, // Sample 0
1.5, 1.5, // Sample 1
5.0, 8.0, // Sample 2
8.0, 8.0, // Sample 3
6.0, 7.0, // Sample 4
];
let x = FloatMatrix::from_rows_vec(data, 5, 2);
let k = 2;
(x, k)
}
// Helper for single cluster test with exact mean
fn create_simple_integer_data() -> FloatMatrix {
// Data points: (1,1), (2,2), (3,3)
FloatMatrix::from_rows_vec(vec![1.0, 1.0, 2.0, 2.0, 3.0, 3.0], 3, 2)
}
#[test]
fn test_k_means_fit_predict_basic() {
let (x, k) = create_test_data();
let max_iter = 100;
let tol = 1e-6;
let (kmeans_model, labels) = KMeans::fit(&x, k, max_iter, tol);
// Assertions for fit
assert_eq!(kmeans_model.centroids.rows(), k);
assert_eq!(kmeans_model.centroids.cols(), x.cols());
assert_eq!(labels.len(), x.rows());
// Check if labels are within expected range (0 to k-1)
for &label in &labels {
assert!(label < k);
}
// Predict with the same data
let predicted_labels = kmeans_model.predict(&x);
// The exact labels might vary due to random initialization,
// but the clustering should be consistent.
// We expect two clusters. Let's check if samples 0,1 are in one cluster
// and samples 2,3,4 are in another.
let cluster_0_members = vec![labels[0], labels[1]];
let cluster_1_members = vec![labels[2], labels[3], labels[4]];
// All members of cluster 0 should have the same label
assert_eq!(cluster_0_members[0], cluster_0_members[1]);
// All members of cluster 1 should have the same label
assert_eq!(cluster_1_members[0], cluster_1_members[1]);
assert_eq!(cluster_1_members[0], cluster_1_members[2]);
// The two clusters should have different labels
assert_ne!(cluster_0_members[0], cluster_1_members[0]);
// Check predicted labels are consistent with fitted labels
assert_eq!(labels, predicted_labels);
// Test with a new sample
let new_sample_data = vec![1.2, 1.3]; // Should be close to cluster 0
let new_sample = FloatMatrix::from_rows_vec(new_sample_data, 1, 2);
let new_sample_label = kmeans_model.predict(&new_sample)[0];
assert_eq!(new_sample_label, cluster_0_members[0]);
let new_sample_data_2 = vec![7.0, 7.5]; // Should be close to cluster 1
let new_sample_2 = FloatMatrix::from_rows_vec(new_sample_data_2, 1, 2);
let new_sample_label_2 = kmeans_model.predict(&new_sample_2)[0];
assert_eq!(new_sample_label_2, cluster_1_members[0]);
}
#[test]
fn test_k_means_fit_k_equals_m() {
// Test case where k (number of clusters) equals m (number of samples)
let (x, _) = create_test_data(); // 5 samples
let k = 5; // 5 clusters
let max_iter = 10;
let tol = 1e-6;
let (kmeans_model, labels) = KMeans::fit(&x, k, max_iter, tol);
assert_eq!(kmeans_model.centroids.rows(), k);
assert_eq!(labels.len(), x.rows());
// Each sample should be its own cluster. Due to random init, labels
// might not be [0,1,2,3,4] but will be a permutation of it.
let mut sorted_labels = labels.clone();
sorted_labels.sort_unstable();
sorted_labels.dedup();
// Labels should all be unique when k==m
assert_eq!(sorted_labels.len(), k);
}
#[test]
#[should_panic(expected = "k must be ≤ number of samples")]
fn test_k_means_fit_k_greater_than_m() {
let (x, _) = create_test_data(); // 5 samples
let k = 6; // k > m
let max_iter = 10;
let tol = 1e-6;
let (_kmeans_model, _labels) = KMeans::fit(&x, k, max_iter, tol);
}
#[test]
fn test_k_means_fit_single_cluster() {
// Test with k=1
let x = create_simple_integer_data(); // Use integer data
let k = 1;
let max_iter = 100;
let tol = 1e-6;
let (kmeans_model, labels) = KMeans::fit(&x, k, max_iter, tol);
assert_eq!(kmeans_model.centroids.rows(), 1);
assert_eq!(labels.len(), x.rows());
// All labels should be 0
assert!(labels.iter().all(|&l| l == 0));
// Centroid should be the mean of all data points
let expected_centroid_x = x.column(0).iter().sum::<f64>() / x.rows() as f64;
let expected_centroid_y = x.column(1).iter().sum::<f64>() / x.rows() as f64;
assert!((kmeans_model.centroids[(0, 0)] - expected_centroid_x).abs() < 1e-9);
assert!((kmeans_model.centroids[(0, 1)] - expected_centroid_y).abs() < 1e-9);
}
#[test]
fn test_k_means_predict_empty_matrix() {
let (x, k) = create_test_data();
let max_iter = 10;
let tol = 1e-6;
let (kmeans_model, _labels) = KMeans::fit(&x, k, max_iter, tol);
// The `Matrix` type not support 0xN or Nx0 matrices.
// test with a 0x0 matrix is a valid edge case.
let empty_x = FloatMatrix::from_rows_vec(vec![], 0, 0);
let predicted_labels = kmeans_model.predict(&empty_x);
assert!(predicted_labels.is_empty());
}
#[test]
fn test_k_means_predict_single_sample() {
let (x, k) = create_test_data();
let max_iter = 10;
let tol = 1e-6;
let (kmeans_model, _labels) = KMeans::fit(&x, k, max_iter, tol);
let single_sample = FloatMatrix::from_rows_vec(vec![1.1, 1.2], 1, 2);
let predicted_label = kmeans_model.predict(&single_sample);
assert_eq!(predicted_label.len(), 1);
assert!(predicted_label[0] < k);
}
}

View File

@@ -1,7 +1,7 @@
pub mod activations;
pub mod dense_nn;
pub mod gaussian_nb;
pub mod k_means;
pub mod linreg;
pub mod logreg;
pub mod dense_nn;
pub mod k_means;
pub mod pca;
pub mod gaussian_nb;
pub mod activations;

View File

@@ -1,114 +1,85 @@
use crate::compute::stats::correlation::covariance_matrix;
use crate::compute::stats::descriptive::mean_vertical;
use crate::matrix::{Axis, Matrix, SeriesOps};
use crate::matrix::{Matrix, SeriesOps};
use rand;
/// Returns the `n_components` principal axes (rows) and the centred data's mean.
/// Returns the `n_components` principal axes (rows) and the centred datas mean.
pub struct PCA {
pub components: Matrix<f64>, // (n_components, n_features)
pub mean: Matrix<f64>, // (1, n_features)
pub components: Matrix<f64>, // (n_components, n_features)
pub mean: Matrix<f64>, // (1, n_features)
}
impl PCA {
pub fn fit(x: &Matrix<f64>, n_components: usize, _iters: usize) -> Self {
let mean = mean_vertical(x); // Mean of each feature (column)
let broadcasted_mean = mean.broadcast_row_to_target_shape(x.rows(), x.cols());
let centered_data = x.zip(&broadcasted_mean, |x_i, mean_i| x_i - mean_i);
let covariance_matrix = covariance_matrix(&centered_data, Axis::Col); // Covariance between features
pub fn fit(x: &Matrix<f64>, n_components: usize, iters: usize) -> Self {
let m = x.rows();
let n = x.cols();
assert!(n_components <= n);
let mut components = Matrix::zeros(n_components, x.cols());
for i in 0..n_components {
if i < covariance_matrix.rows() {
components.row_copy_from_slice(i, &covariance_matrix.row(i));
} else {
break;
// ----- centre data -----
let mean_vec = {
let mut v = Matrix::zeros(1, n);
for j in 0..n {
let mut s = 0.0;
for i in 0..m {
s += x[(i, j)];
}
v[(0, j)] = s / m as f64;
}
v
};
let x_centered = x - &mean_vec;
// ----- covariance matrix C = Xᵀ·X / (m-1) -----
let cov = x_centered.transpose().dot(&x_centered) * (1.0 / (m as f64 - 1.0));
// ----- power iteration to find top eigenvectors -----
let mut comp = Matrix::zeros(n_components, n);
let mut b = Matrix::zeros(1, n); // current vector
for c in 0..n_components {
// random initial vector
for j in 0..n {
b[(0, j)] = rand::random::<f64>() - 0.5;
}
// subtract projections on previously found components
for prev in 0..c {
// let proj = b.dot(Matrix::from_vec(data, rows, cols).transpose())[(0, 0)];
// let proj = b.dot(&comp.row(prev).transpose())[(0, 0)];
let proj = b.dot(&Matrix::from_vec(comp.row(prev).to_vec(), 1, n).transpose())[(0, 0)];
// subtract projection to maintain orthogonality
for j in 0..n {
b[(0, j)] -= proj * comp[(prev, j)];
}
}
// iterate
for _ in 0..iters {
// b = C·bᵀ
let mut nb = cov.dot(&b.transpose()).transpose();
// subtract projections again to maintain orthogonality
for prev in 0..c {
let proj = nb.dot(&Matrix::from_vec(comp.row(prev).to_vec(), 1, n).transpose())[(0, 0)];
for j in 0..n {
nb[(0, j)] -= proj * comp[(prev, j)];
}
}
// normalise
let norm = nb.data().iter().map(|v| v * v).sum::<f64>().sqrt();
for j in 0..n {
nb[(0, j)] /= norm;
}
b = nb;
}
// store component
for j in 0..n {
comp[(c, j)] = b[(0, j)];
}
}
PCA { components, mean }
Self {
components: comp,
mean: mean_vec,
}
}
/// Project new data on the learned axes.
pub fn transform(&self, x: &Matrix<f64>) -> Matrix<f64> {
let broadcasted_mean = self.mean.broadcast_row_to_target_shape(x.rows(), x.cols());
let centered_data = x.zip(&broadcasted_mean, |x_i, mean_i| x_i - mean_i);
centered_data.matrix_mul(&self.components.transpose())
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::matrix::Matrix;
const EPSILON: f64 = 1e-8;
#[test]
fn test_pca_basic() {
// Simple 2D data, points along y=x line
// Data:
// 1.0, 1.0
// 2.0, 2.0
// 3.0, 3.0
let data = Matrix::from_rows_vec(vec![1.0, 1.0, 2.0, 2.0, 3.0, 3.0], 3, 2);
let (_n_samples, _n_features) = data.shape();
let pca = PCA::fit(&data, 1, 0); // n_components = 1, iters is unused
println!("Data shape: {:?}", data.shape());
println!("PCA mean shape: {:?}", pca.mean.shape());
println!("PCA components shape: {:?}", pca.components.shape());
// Expected mean: (2.0, 2.0)
assert!((pca.mean.get(0, 0) - 2.0).abs() < EPSILON);
assert!((pca.mean.get(0, 1) - 2.0).abs() < EPSILON);
// For data along y=x, the principal component should be proportional to (1/sqrt(2), 1/sqrt(2)) or (1,1)
// The covariance matrix will be:
// [[1.0, 1.0],
// [1.0, 1.0]]
// The principal component (eigenvector) will be (0.707, 0.707) or (-0.707, -0.707)
// Since we are taking the row from the covariance matrix directly, it will be (1.0, 1.0)
assert!((pca.components.get(0, 0) - 1.0).abs() < EPSILON);
assert!((pca.components.get(0, 1) - 1.0).abs() < EPSILON);
// Test transform
// Centered data:
// -1.0, -1.0
// 0.0, 0.0
// 1.0, 1.0
// Projected: (centered_data * components.transpose())
// (-1.0 * 1.0 + -1.0 * 1.0) = -2.0
// ( 0.0 * 1.0 + 0.0 * 1.0) = 0.0
// ( 1.0 * 1.0 + 1.0 * 1.0) = 2.0
let transformed_data = pca.transform(&data);
assert_eq!(transformed_data.rows(), 3);
assert_eq!(transformed_data.cols(), 1);
assert!((transformed_data.get(0, 0) - -2.0).abs() < EPSILON);
assert!((transformed_data.get(1, 0) - 0.0).abs() < EPSILON);
assert!((transformed_data.get(2, 0) - 2.0).abs() < EPSILON);
}
#[test]
fn test_pca_fit_break_branch() {
// Data with 2 features
let data = Matrix::from_rows_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0], 3, 2);
let (_n_samples, n_features) = data.shape();
// Set n_components greater than n_features to trigger the break branch
let n_components_large = n_features + 1;
let pca = PCA::fit(&data, n_components_large, 0);
// The components matrix should be initialized with n_components_large rows,
// but only the first n_features rows should be copied from the covariance matrix.
// The remaining rows should be zeros.
assert_eq!(pca.components.rows(), n_components_large);
assert_eq!(pca.components.cols(), n_features);
// Verify that rows beyond n_features are all zeros
for i in n_features..n_components_large {
for j in 0..n_features {
assert!((pca.components.get(i, j) - 0.0).abs() < EPSILON);
}
}
let x_centered = x - &self.mean;
x_centered.dot(&self.components.transpose())
}
}

View File

@@ -1,251 +0,0 @@
use crate::compute::stats::{mean, mean_horizontal, mean_vertical, stddev};
use crate::matrix::{Axis, Matrix, SeriesOps};
/// Population covariance between two equally-sized matrices (flattened)
pub fn covariance(x: &Matrix<f64>, y: &Matrix<f64>) -> f64 {
assert_eq!(x.rows(), y.rows());
assert_eq!(x.cols(), y.cols());
let n = (x.rows() * x.cols()) as f64;
let mean_x = mean(x);
let mean_y = mean(y);
x.data()
.iter()
.zip(y.data().iter())
.map(|(&a, &b)| (a - mean_x) * (b - mean_y))
.sum::<f64>()
/ n
}
fn _covariance_axis(x: &Matrix<f64>, axis: Axis) -> Matrix<f64> {
match axis {
Axis::Row => {
// Covariance between each pair of columns → cols x cols
let num_rows = x.rows() as f64;
let means = mean_vertical(x); // 1 x cols
let p = x.cols();
let mut data = vec![0.0; p * p];
for i in 0..p {
let mu_i = means.get(0, i);
for j in 0..p {
let mu_j = means.get(0, j);
let mut sum = 0.0;
for r in 0..x.rows() {
let d_i = x.get(r, i) - mu_i;
let d_j = x.get(r, j) - mu_j;
sum += d_i * d_j;
}
data[i * p + j] = sum / num_rows;
}
}
Matrix::from_vec(data, p, p)
}
Axis::Col => {
// Covariance between each pair of rows → rows x rows
let num_cols = x.cols() as f64;
let means = mean_horizontal(x); // rows x 1
let n = x.rows();
let mut data = vec![0.0; n * n];
for i in 0..n {
let mu_i = means.get(i, 0);
for j in 0..n {
let mu_j = means.get(j, 0);
let mut sum = 0.0;
for c in 0..x.cols() {
let d_i = x.get(i, c) - mu_i;
let d_j = x.get(j, c) - mu_j;
sum += d_i * d_j;
}
data[i * n + j] = sum / num_cols;
}
}
Matrix::from_vec(data, n, n)
}
}
}
/// Covariance between columns (i.e. across rows)
pub fn covariance_vertical(x: &Matrix<f64>) -> Matrix<f64> {
_covariance_axis(x, Axis::Row)
}
/// Covariance between rows (i.e. across columns)
pub fn covariance_horizontal(x: &Matrix<f64>) -> Matrix<f64> {
_covariance_axis(x, Axis::Col)
}
/// Calculates the covariance matrix of the input data.
/// Assumes input `x` is (n_samples, n_features).
pub fn covariance_matrix(x: &Matrix<f64>, axis: Axis) -> Matrix<f64> {
let (n_samples, n_features) = x.shape();
let centered_data = match axis {
Axis::Col => {
let mean_matrix = mean_vertical(x); // 1 x n_features
x.zip(
&mean_matrix.broadcast_row_to_target_shape(n_samples, n_features),
|val, m| val - m,
)
}
Axis::Row => {
let mean_matrix = mean_horizontal(x); // n_samples x 1
// Manually create a matrix by broadcasting the column vector across columns
let mut broadcasted_mean = Matrix::zeros(n_samples, n_features);
for r in 0..n_samples {
let mean_val = mean_matrix.get(r, 0);
for c in 0..n_features {
*broadcasted_mean.get_mut(r, c) = *mean_val;
}
}
x.zip(&broadcasted_mean, |val, m| val - m)
}
};
// Calculate covariance matrix: (X_centered^T * X_centered) / (n_samples - 1)
// If x is (n_samples, n_features), then centered_data is (n_samples, n_features)
// centered_data.transpose() is (n_features, n_samples)
// Result is (n_features, n_features)
centered_data.transpose().matrix_mul(&centered_data) / (n_samples as f64 - 1.0)
}
pub fn pearson(x: &Matrix<f64>, y: &Matrix<f64>) -> f64 {
assert_eq!(x.rows(), y.rows());
assert_eq!(x.cols(), y.cols());
let cov = covariance(x, y);
let std_x = stddev(x);
let std_y = stddev(y);
if std_x == 0.0 || std_y == 0.0 {
return 0.0; // Avoid division by zero
}
cov / (std_x * std_y)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::matrix::Matrix;
const EPS: f64 = 1e-8;
#[test]
fn test_covariance_scalar_same_matrix() {
// M =
// 1,2
// 3,4
// mean = 2.5
let data = vec![1.0, 2.0, 3.0, 4.0];
let m = Matrix::from_vec(data.clone(), 2, 2);
// flatten M: [1,2,3,4], mean = 2.5
// cov(M,M) = variance of flatten = 1.25
let cov = covariance(&m, &m);
assert!((cov - 1.25).abs() < EPS);
}
#[test]
fn test_covariance_scalar_diff_matrix() {
// x =
// 1,2
// 3,4
// y = 2*x
let x = Matrix::from_vec(vec![1.0, 2.0, 3.0, 4.0], 2, 2);
let y = Matrix::from_vec(vec![2.0, 4.0, 6.0, 8.0], 2, 2);
// mean_x = 2.5, mean_y = 5.0
// cov = sum((xi-2.5)*(yi-5.0))/4 = 2.5
let cov_xy = covariance(&x, &y);
assert!((cov_xy - 2.5).abs() < EPS);
}
#[test]
fn test_covariance_vertical() {
// M =
// 1,2
// 3,4
// cols are [1,3] and [2,4], each var=1, cov=1
let m = Matrix::from_rows_vec(vec![1.0, 2.0, 3.0, 4.0], 2, 2);
let cov_mat = covariance_vertical(&m);
// Expect 2x2 matrix of all 1.0
for i in 0..2 {
for j in 0..2 {
assert!((cov_mat.get(i, j) - 1.0).abs() < EPS);
}
}
}
#[test]
fn test_covariance_horizontal() {
// M =
// 1,2
// 3,4
// rows are [1,2] and [3,4], each var=0.25, cov=0.25
let m = Matrix::from_rows_vec(vec![1.0, 2.0, 3.0, 4.0], 2, 2);
let cov_mat = covariance_horizontal(&m);
// Expect 2x2 matrix of all 0.25
for i in 0..2 {
for j in 0..2 {
assert!((cov_mat.get(i, j) - 0.25).abs() < EPS);
}
}
}
#[test]
fn test_covariance_matrix_vertical() {
// Test with a simple 2x2 matrix
// M =
// 1, 2
// 3, 4
// Expected covariance matrix (vertical, i.e., between columns):
// Col1: [1, 3], mean = 2
// Col2: [2, 4], mean = 3
// Cov(Col1, Col1) = ((1-2)^2 + (3-2)^2) / (2-1) = (1+1)/1 = 2
// Cov(Col2, Col2) = ((2-3)^2 + (4-3)^2) / (2-1) = (1+1)/1 = 2
// Cov(Col1, Col2) = ((1-2)*(2-3) + (3-2)*(4-3)) / (2-1) = ((-1)*(-1) + (1)*(1))/1 = (1+1)/1 = 2
// Cov(Col2, Col1) = 2
// Expected:
// 2, 2
// 2, 2
let m = Matrix::from_rows_vec(vec![1.0, 2.0, 3.0, 4.0], 2, 2);
let cov_mat = covariance_matrix(&m, Axis::Col);
assert!((cov_mat.get(0, 0) - 2.0).abs() < EPS);
assert!((cov_mat.get(0, 1) - 2.0).abs() < EPS);
assert!((cov_mat.get(1, 0) - 2.0).abs() < EPS);
assert!((cov_mat.get(1, 1) - 2.0).abs() < EPS);
}
#[test]
fn test_covariance_matrix_horizontal() {
// Test with a simple 2x2 matrix
// M =
// 1, 2
// 3, 4
// Expected covariance matrix (horizontal, i.e., between rows):
// Row1: [1, 2], mean = 1.5
// Row2: [3, 4], mean = 3.5
// Cov(Row1, Row1) = ((1-1.5)^2 + (2-1.5)^2) / (2-1) = (0.25+0.25)/1 = 0.5
// Cov(Row2, Row2) = ((3-3.5)^2 + (4-3.5)^2) / (2-1) = (0.25+0.25)/1 = 0.5
// Cov(Row1, Row2) = ((1-1.5)*(3-3.5) + (2-1.5)*(4-3.5)) / (2-1) = ((-0.5)*(-0.5) + (0.5)*(0.5))/1 = (0.25+0.25)/1 = 0.5
// Cov(Row2, Row1) = 0.5
// Expected:
// 0.5, -0.5
// -0.5, 0.5
let m = Matrix::from_rows_vec(vec![1.0, 2.0, 3.0, 4.0], 2, 2);
let cov_mat = covariance_matrix(&m, Axis::Row);
assert!((cov_mat.get(0, 0) - 0.5).abs() < EPS);
assert!((cov_mat.get(0, 1) - (-0.5)).abs() < EPS);
assert!((cov_mat.get(1, 0) - (-0.5)).abs() < EPS);
assert!((cov_mat.get(1, 1) - 0.5).abs() < EPS);
}
}

View File

@@ -1,390 +0,0 @@
use crate::matrix::{Axis, Matrix, SeriesOps};
pub fn mean(x: &Matrix<f64>) -> f64 {
x.data().iter().sum::<f64>() / (x.rows() * x.cols()) as f64
}
pub fn mean_vertical(x: &Matrix<f64>) -> Matrix<f64> {
let m = x.rows() as f64;
Matrix::from_vec(x.sum_vertical(), 1, x.cols()) / m
}
pub fn mean_horizontal(x: &Matrix<f64>) -> Matrix<f64> {
let n = x.cols() as f64;
Matrix::from_vec(x.sum_horizontal(), x.rows(), 1) / n
}
fn population_or_sample_variance(x: &Matrix<f64>, population: bool) -> f64 {
let m = (x.rows() * x.cols()) as f64;
let mean_val = mean(x);
x.data()
.iter()
.map(|&v| (v - mean_val).powi(2))
.sum::<f64>()
/ if population { m } else { m - 1.0 }
}
pub fn population_variance(x: &Matrix<f64>) -> f64 {
population_or_sample_variance(x, true)
}
pub fn sample_variance(x: &Matrix<f64>) -> f64 {
population_or_sample_variance(x, false)
}
fn _population_or_sample_variance_axis(
x: &Matrix<f64>,
axis: Axis,
population: bool,
) -> Matrix<f64> {
match axis {
Axis::Row => {
// Calculate variance for each column (vertical variance)
let num_rows = x.rows() as f64;
let mean_of_cols = mean_vertical(x); // 1 x cols matrix
let mut result_data = vec![0.0; x.cols()];
for c in 0..x.cols() {
let mean_val = mean_of_cols.get(0, c); // Mean for current column
let mut sum_sq_diff = 0.0;
for r in 0..x.rows() {
let diff = x.get(r, c) - mean_val;
sum_sq_diff += diff * diff;
}
result_data[c] = sum_sq_diff / (if population { num_rows } else { num_rows - 1.0 });
}
Matrix::from_vec(result_data, 1, x.cols())
}
Axis::Col => {
// Calculate variance for each row (horizontal variance)
let num_cols = x.cols() as f64;
let mean_of_rows = mean_horizontal(x); // rows x 1 matrix
let mut result_data = vec![0.0; x.rows()];
for r in 0..x.rows() {
let mean_val = mean_of_rows.get(r, 0); // Mean for current row
let mut sum_sq_diff = 0.0;
for c in 0..x.cols() {
let diff = x.get(r, c) - mean_val;
sum_sq_diff += diff * diff;
}
result_data[r] = sum_sq_diff / (if population { num_cols } else { num_cols - 1.0 });
}
Matrix::from_vec(result_data, x.rows(), 1)
}
}
}
pub fn population_variance_vertical(x: &Matrix<f64>) -> Matrix<f64> {
_population_or_sample_variance_axis(x, Axis::Row, true)
}
pub fn population_variance_horizontal(x: &Matrix<f64>) -> Matrix<f64> {
_population_or_sample_variance_axis(x, Axis::Col, true)
}
pub fn sample_variance_vertical(x: &Matrix<f64>) -> Matrix<f64> {
_population_or_sample_variance_axis(x, Axis::Row, false)
}
pub fn sample_variance_horizontal(x: &Matrix<f64>) -> Matrix<f64> {
_population_or_sample_variance_axis(x, Axis::Col, false)
}
pub fn stddev(x: &Matrix<f64>) -> f64 {
population_variance(x).sqrt()
}
pub fn stddev_vertical(x: &Matrix<f64>) -> Matrix<f64> {
population_variance_vertical(x).map(|v| v.sqrt())
}
pub fn stddev_horizontal(x: &Matrix<f64>) -> Matrix<f64> {
population_variance_horizontal(x).map(|v| v.sqrt())
}
pub fn median(x: &Matrix<f64>) -> f64 {
let mut data = x.data().to_vec();
data.sort_by(|a, b| a.partial_cmp(b).unwrap());
let mid = data.len() / 2;
if data.len() % 2 == 0 {
(data[mid - 1] + data[mid]) / 2.0
} else {
data[mid]
}
}
fn _median_axis(x: &Matrix<f64>, axis: Axis) -> Matrix<f64> {
let mx = match axis {
Axis::Col => x.clone(),
Axis::Row => x.transpose(),
};
let mut result = Vec::with_capacity(mx.cols());
for c in 0..mx.cols() {
let mut col = mx.column(c).to_vec();
col.sort_by(|a, b| a.partial_cmp(b).unwrap());
let mid = col.len() / 2;
if col.len() % 2 == 0 {
result.push((col[mid - 1] + col[mid]) / 2.0);
} else {
result.push(col[mid]);
}
}
let (r, c) = match axis {
Axis::Col => (1, mx.cols()),
Axis::Row => (mx.cols(), 1),
};
Matrix::from_vec(result, r, c)
}
pub fn median_vertical(x: &Matrix<f64>) -> Matrix<f64> {
_median_axis(x, Axis::Col)
}
pub fn median_horizontal(x: &Matrix<f64>) -> Matrix<f64> {
_median_axis(x, Axis::Row)
}
pub fn percentile(x: &Matrix<f64>, p: f64) -> f64 {
if p < 0.0 || p > 100.0 {
panic!("Percentile must be between 0 and 100");
}
let mut data = x.data().to_vec();
data.sort_by(|a, b| a.partial_cmp(b).unwrap());
let index = ((p / 100.0) * (data.len() as f64 - 1.0)).round() as usize;
data[index]
}
fn _percentile_axis(x: &Matrix<f64>, p: f64, axis: Axis) -> Matrix<f64> {
if p < 0.0 || p > 100.0 {
panic!("Percentile must be between 0 and 100");
}
let mx: Matrix<f64> = match axis {
Axis::Col => x.clone(),
Axis::Row => x.transpose(),
};
let mut result = Vec::with_capacity(mx.cols());
for c in 0..mx.cols() {
let mut col = mx.column(c).to_vec();
col.sort_by(|a, b| a.partial_cmp(b).unwrap());
let index = ((p / 100.0) * (col.len() as f64 - 1.0)).round() as usize;
result.push(col[index]);
}
let (r, c) = match axis {
Axis::Col => (1, mx.cols()),
Axis::Row => (mx.cols(), 1),
};
Matrix::from_vec(result, r, c)
}
pub fn percentile_vertical(x: &Matrix<f64>, p: f64) -> Matrix<f64> {
_percentile_axis(x, p, Axis::Col)
}
pub fn percentile_horizontal(x: &Matrix<f64>, p: f64) -> Matrix<f64> {
_percentile_axis(x, p, Axis::Row)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::matrix::Matrix;
const EPSILON: f64 = 1e-8;
#[test]
fn test_descriptive_stats_regular_values() {
let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let x = Matrix::from_vec(data, 1, 5);
// Mean
assert!((mean(&x) - 3.0).abs() < EPSILON);
// Variance
assert!((population_variance(&x) - 2.0).abs() < EPSILON);
// Standard Deviation
assert!((stddev(&x) - 1.4142135623730951).abs() < EPSILON);
// Median
assert!((median(&x) - 3.0).abs() < EPSILON);
// Percentile
assert!((percentile(&x, 0.0) - 1.0).abs() < EPSILON);
assert!((percentile(&x, 25.0) - 2.0).abs() < EPSILON);
assert!((percentile(&x, 50.0) - 3.0).abs() < EPSILON);
assert!((percentile(&x, 75.0) - 4.0).abs() < EPSILON);
assert!((percentile(&x, 100.0) - 5.0).abs() < EPSILON);
let data_even = vec![1.0, 2.0, 3.0, 4.0];
let x_even = Matrix::from_vec(data_even, 1, 4);
assert!((median(&x_even) - 2.5).abs() < EPSILON);
}
#[test]
fn test_descriptive_stats_outlier() {
let data = vec![1.0, 2.0, 3.0, 4.0, 100.0];
let x = Matrix::from_vec(data, 1, 5);
// Mean should be heavily affected by outlier
assert!((mean(&x) - 22.0).abs() < EPSILON);
// Variance should be heavily affected by outlier
assert!((population_variance(&x) - 1522.0).abs() < EPSILON);
// Standard Deviation should be heavily affected by outlier
assert!((stddev(&x) - 39.0128183970461).abs() < EPSILON);
// Median should be robust to outlier
assert!((median(&x) - 3.0).abs() < EPSILON);
}
#[test]
#[should_panic(expected = "Percentile must be between 0 and 100")]
fn test_percentile_panic_low() {
let data = vec![1.0, 2.0, 3.0];
let x = Matrix::from_vec(data, 1, 3);
percentile(&x, -1.0);
}
#[test]
#[should_panic(expected = "Percentile must be between 0 and 100")]
fn test_percentile_panic_high() {
let data = vec![1.0, 2.0, 3.0];
let x = Matrix::from_vec(data, 1, 3);
percentile(&x, 101.0);
}
#[test]
fn test_mean_vertical_horizontal() {
// 2x3 matrix:
let data = vec![1.0, 4.0, 2.0, 5.0, 3.0, 6.0];
let x = Matrix::from_vec(data, 2, 3);
// Vertical means (per column): [(1+4)/2, (2+5)/2, (3+6)/2]
let mv = mean_vertical(&x);
assert!((mv.get(0, 0) - 2.5).abs() < EPSILON);
assert!((mv.get(0, 1) - 3.5).abs() < EPSILON);
assert!((mv.get(0, 2) - 4.5).abs() < EPSILON);
// Horizontal means (per row): [(1+2+3)/3, (4+5+6)/3]
let mh = mean_horizontal(&x);
assert!((mh.get(0, 0) - 2.0).abs() < EPSILON);
assert!((mh.get(1, 0) - 5.0).abs() < EPSILON);
}
#[test]
fn test_variance_vertical_horizontal() {
let data = vec![1.0, 4.0, 2.0, 5.0, 3.0, 6.0];
let x = Matrix::from_vec(data, 2, 3);
// cols: {1,4}, {2,5}, {3,6} all give 2.25
let vv = population_variance_vertical(&x);
for c in 0..3 {
assert!((vv.get(0, c) - 2.25).abs() < EPSILON);
}
let vh = population_variance_horizontal(&x);
assert!((vh.get(0, 0) - (2.0 / 3.0)).abs() < EPSILON);
assert!((vh.get(1, 0) - (2.0 / 3.0)).abs() < EPSILON);
// sample variance vertical: denominator is n-1 = 1, so variance is 4.5
let svv = sample_variance_vertical(&x);
for c in 0..3 {
assert!((svv.get(0, c) - 4.5).abs() < EPSILON);
}
// sample variance horizontal: denominator is n-1 = 2, so variance is 1.0
let svh = sample_variance_horizontal(&x);
assert!((svh.get(0, 0) - 1.0).abs() < EPSILON);
assert!((svh.get(1, 0) - 1.0).abs() < EPSILON);
}
#[test]
fn test_stddev_vertical_horizontal() {
let data = vec![1.0, 4.0, 2.0, 5.0, 3.0, 6.0];
let x = Matrix::from_vec(data, 2, 3);
// Stddev is sqrt of variance
let sv = stddev_vertical(&x);
for c in 0..3 {
assert!((sv.get(0, c) - 1.5).abs() < EPSILON);
}
let sh = stddev_horizontal(&x);
// sqrt(2/3) ≈ 0.816497
let expected = (2.0 / 3.0 as f64).sqrt();
assert!((sh.get(0, 0) - expected).abs() < EPSILON);
assert!((sh.get(1, 0) - expected).abs() < EPSILON);
// sample stddev vertical: sqrt(4.5) ≈ 2.12132034
let ssv = sample_variance_vertical(&x).map(|v| v.sqrt());
for c in 0..3 {
assert!((ssv.get(0, c) - 2.1213203435596424).abs() < EPSILON);
}
// sample stddev horizontal: sqrt(1.0) = 1.0
let ssh = sample_variance_horizontal(&x).map(|v| v.sqrt());
assert!((ssh.get(0, 0) - 1.0).abs() < EPSILON);
assert!((ssh.get(1, 0) - 1.0).abs() < EPSILON);
}
#[test]
fn test_median_vertical_horizontal() {
let data = vec![1.0, 4.0, 2.0, 5.0, 3.0, 6.0];
let x = Matrix::from_vec(data, 2, 3);
let mv = median_vertical(&x).row(0);
let expected_v = vec![2.5, 3.5, 4.5];
assert_eq!(mv, expected_v, "{:?} expected: {:?}", expected_v, mv);
let mh = median_horizontal(&x).column(0).to_vec();
let expected_h = vec![2.0, 5.0];
assert_eq!(mh, expected_h, "{:?} expected: {:?}", expected_h, mh);
}
#[test]
fn test_percentile_vertical_horizontal() {
// vec of f64 values 1..24 as a 4x6 matrix
let data: Vec<f64> = (1..=24).map(|x| x as f64).collect();
let x = Matrix::from_vec(data, 4, 6);
// columns:
// 1, 5, 9, 13, 17, 21
// 2, 6, 10, 14, 18, 22
// 3, 7, 11, 15, 19, 23
// 4, 8, 12, 16, 20, 24
let er0 = vec![1., 5., 9., 13., 17., 21.];
let er50 = vec![3., 7., 11., 15., 19., 23.];
let er100 = vec![4., 8., 12., 16., 20., 24.];
assert_eq!(percentile_vertical(&x, 0.0).data(), er0);
assert_eq!(percentile_vertical(&x, 50.0).data(), er50);
assert_eq!(percentile_vertical(&x, 100.0).data(), er100);
let eh0 = vec![1., 2., 3., 4.];
let eh50 = vec![13., 14., 15., 16.];
let eh100 = vec![21., 22., 23., 24.];
assert_eq!(percentile_horizontal(&x, 0.0).data(), eh0);
assert_eq!(percentile_horizontal(&x, 50.0).data(), eh50);
assert_eq!(percentile_horizontal(&x, 100.0).data(), eh100);
}
#[test]
#[should_panic(expected = "Percentile must be between 0 and 100")]
fn test_percentile_out_of_bounds() {
let data = vec![1.0, 2.0, 3.0];
let x = Matrix::from_vec(data, 1, 3);
percentile(&x, -10.0); // Should panic
}
#[test]
#[should_panic(expected = "Percentile must be between 0 and 100")]
fn test_percentile_vertical_out_of_bounds() {
let m = Matrix::from_vec(vec![1.0, 2.0, 3.0], 1, 3);
let _ = percentile_vertical(&m, -0.1);
}
}

View File

@@ -1,382 +0,0 @@
use crate::matrix::{Matrix, SeriesOps};
use std::f64::consts::PI;
/// Approximation of the error function (Abramowitz & Stegun 7.1.26)
fn erf_func(x: f64) -> f64 {
let sign = if x < 0.0 { -1.0 } else { 1.0 };
let x = x.abs();
// coefficients
let a1 = 0.254829592;
let a2 = -0.284496736;
let a3 = 1.421413741;
let a4 = -1.453152027;
let a5 = 1.061405429;
let p = 0.3275911;
let t = 1.0 / (1.0 + p * x);
let y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * (-x * x).exp();
sign * y
}
/// Approximation of the error function for matrices
pub fn erf(x: Matrix<f64>) -> Matrix<f64> {
x.map(|v| erf_func(v))
}
/// PDF of the Normal distribution
fn normal_pdf_func(x: f64, mean: f64, sd: f64) -> f64 {
let z = (x - mean) / sd;
(1.0 / (sd * (2.0 * PI).sqrt())) * (-0.5 * z * z).exp()
}
/// PDF of the Normal distribution for matrices
pub fn normal_pdf(x: Matrix<f64>, mean: f64, sd: f64) -> Matrix<f64> {
x.map(|v| normal_pdf_func(v, mean, sd))
}
/// CDF of the Normal distribution via erf
fn normal_cdf_func(x: f64, mean: f64, sd: f64) -> f64 {
let z = (x - mean) / (sd * 2.0_f64.sqrt());
0.5 * (1.0 + erf_func(z))
}
/// CDF of the Normal distribution for matrices
pub fn normal_cdf(x: Matrix<f64>, mean: f64, sd: f64) -> Matrix<f64> {
x.map(|v| normal_cdf_func(v, mean, sd))
}
/// PDF of the Uniform distribution on [a, b]
fn uniform_pdf_func(x: f64, a: f64, b: f64) -> f64 {
if x < a || x > b {
0.0
} else {
1.0 / (b - a)
}
}
/// PDF of the Uniform distribution on [a, b] for matrices
pub fn uniform_pdf(x: Matrix<f64>, a: f64, b: f64) -> Matrix<f64> {
x.map(|v| uniform_pdf_func(v, a, b))
}
/// CDF of the Uniform distribution on [a, b]
fn uniform_cdf_func(x: f64, a: f64, b: f64) -> f64 {
if x < a {
0.0
} else if x <= b {
(x - a) / (b - a)
} else {
1.0
}
}
/// CDF of the Uniform distribution on [a, b] for matrices
pub fn uniform_cdf(x: Matrix<f64>, a: f64, b: f64) -> Matrix<f64> {
x.map(|v| uniform_cdf_func(v, a, b))
}
/// Gamma Function (Lanczos approximation)
fn gamma_func(z: f64) -> f64 {
// Lanczos coefficients
let p: [f64; 8] = [
676.5203681218851,
-1259.1392167224028,
771.32342877765313,
-176.61502916214059,
12.507343278686905,
-0.13857109526572012,
9.9843695780195716e-6,
1.5056327351493116e-7,
];
if z < 0.5 {
PI / ((PI * z).sin() * gamma_func(1.0 - z))
} else {
let z = z - 1.0;
let mut x = 0.99999999999980993;
for (i, &pi) in p.iter().enumerate() {
x += pi / (z + (i as f64) + 1.0);
}
let t = z + p.len() as f64 - 0.5;
(2.0 * PI).sqrt() * t.powf(z + 0.5) * (-t).exp() * x
}
}
pub fn gamma(z: Matrix<f64>) -> Matrix<f64> {
z.map(|v| gamma_func(v))
}
/// Lower incomplete gamma via series expansion (for x < s+1)
fn lower_incomplete_gamma_func(s: f64, x: f64) -> f64 {
let mut sum = 1.0 / s;
let mut term = sum;
for n in 1..100 {
term *= x / (s + n as f64);
sum += term;
}
sum * x.powf(s) * (-x).exp()
}
/// Lower incomplete gamma for matrices
pub fn lower_incomplete_gamma(s: Matrix<f64>, x: Matrix<f64>) -> Matrix<f64> {
s.zip(&x, |s_val, x_val| lower_incomplete_gamma_func(s_val, x_val))
}
/// PDF of the Gamma distribution (shape k, scale θ)
fn gamma_pdf_func(x: f64, k: f64, theta: f64) -> f64 {
if x < 0.0 {
return 0.0;
}
let coef = 1.0 / (gamma_func(k) * theta.powf(k));
coef * x.powf(k - 1.0) * (-(x / theta)).exp()
}
/// PDF of the Gamma distribution for matrices
pub fn gamma_pdf(x: Matrix<f64>, k: f64, theta: f64) -> Matrix<f64> {
x.map(|v| gamma_pdf_func(v, k, theta))
}
/// CDF of the Gamma distribution via lower incomplete gamma
fn gamma_cdf_func(x: f64, k: f64, theta: f64) -> f64 {
if x < 0.0 {
return 0.0;
}
lower_incomplete_gamma_func(k, x / theta) / gamma_func(k)
}
/// CDF of the Gamma distribution for matrices
pub fn gamma_cdf(x: Matrix<f64>, k: f64, theta: f64) -> Matrix<f64> {
x.map(|v| gamma_cdf_func(v, k, theta))
}
/// Factorials and Combinations ///
/// Compute n! as f64 (works up to ~170 reliably)
fn factorial(n: u64) -> f64 {
(1..=n).map(|i| i as f64).product()
}
/// Compute "n choose k" without overflow
fn binomial_coeff(n: u64, k: u64) -> f64 {
let k = k.min(n - k);
let mut numer = 1.0;
let mut denom = 1.0;
for i in 0..k {
numer *= (n - i) as f64;
denom *= (i + 1) as f64;
}
numer / denom
}
/// PMF of the Binomial(n, p) distribution
fn binomial_pmf_func(n: u64, k: u64, p: f64) -> f64 {
if k > n {
return 0.0;
}
binomial_coeff(n, k) * p.powf(k as f64) * (1.0 - p).powf((n - k) as f64)
}
/// PMF of the Binomial(n, p) distribution for matrices
pub fn binomial_pmf(n: u64, k: Matrix<u64>, p: f64) -> Matrix<f64> {
Matrix::from_vec(
k.data()
.iter()
.map(|&v| binomial_pmf_func(n, v, p))
.collect::<Vec<f64>>(),
k.rows(),
k.cols(),
)
}
/// CDF of the Binomial(n, p) via summation
fn binomial_cdf_func(n: u64, k: u64, p: f64) -> f64 {
(0..=k).map(|i| binomial_pmf_func(n, i, p)).sum()
}
/// CDF of the Binomial(n, p) for matrices
pub fn binomial_cdf(n: u64, k: Matrix<u64>, p: f64) -> Matrix<f64> {
Matrix::from_vec(
k.data()
.iter()
.map(|&v| binomial_cdf_func(n, v, p))
.collect::<Vec<f64>>(),
k.rows(),
k.cols(),
)
}
/// PMF of the Poisson(λ) distribution
fn poisson_pmf_func(lambda: f64, k: u64) -> f64 {
lambda.powf(k as f64) * (-lambda).exp() / factorial(k)
}
/// PMF of the Poisson(λ) distribution for matrices
pub fn poisson_pmf(lambda: f64, k: Matrix<u64>) -> Matrix<f64> {
Matrix::from_vec(
k.data()
.iter()
.map(|&v| poisson_pmf_func(lambda, v))
.collect::<Vec<f64>>(),
k.rows(),
k.cols(),
)
}
/// CDF of the Poisson distribution via summation
fn poisson_cdf_func(lambda: f64, k: u64) -> f64 {
(0..=k).map(|i| poisson_pmf_func(lambda, i)).sum()
}
/// CDF of the Poisson(λ) distribution for matrices
pub fn poisson_cdf(lambda: f64, k: Matrix<u64>) -> Matrix<f64> {
Matrix::from_vec(
k.data()
.iter()
.map(|&v| poisson_cdf_func(lambda, v))
.collect::<Vec<f64>>(),
k.rows(),
k.cols(),
)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_math_funcs() {
// Test erf function
assert!((erf_func(0.0) - 0.0).abs() < 1e-7);
assert!((erf_func(1.0) - 0.8427007).abs() < 1e-7);
assert!((erf_func(-1.0) + 0.8427007).abs() < 1e-7);
// Test gamma function
assert!((gamma_func(1.0) - 1.0).abs() < 1e-7);
assert!((gamma_func(2.0) - 1.0).abs() < 1e-7);
assert!((gamma_func(3.0) - 2.0).abs() < 1e-7);
assert!((gamma_func(4.0) - 6.0).abs() < 1e-7);
assert!((gamma_func(5.0) - 24.0).abs() < 1e-7);
let z = 0.3;
let expected = PI / ((PI * z).sin() * gamma_func(1.0 - z));
assert!((gamma_func(z) - expected).abs() < 1e-7);
}
#[test]
fn test_math_matrix() {
let x = Matrix::filled(5, 5, 1.0);
let erf_result = erf(x.clone());
assert!((erf_result.data()[0] - 0.8427007).abs() < 1e-7);
let gamma_result = gamma(x);
assert!((gamma_result.data()[0] - 1.0).abs() < 1e-7);
}
#[test]
fn test_normal_funcs() {
assert!((normal_pdf_func(0.0, 0.0, 1.0) - 0.39894228).abs() < 1e-7);
assert!((normal_cdf_func(1.0, 0.0, 1.0) - 0.8413447).abs() < 1e-7);
}
#[test]
fn test_normal_matrix() {
let x = Matrix::filled(5, 5, 0.0);
let pdf = normal_pdf(x.clone(), 0.0, 1.0);
let cdf = normal_cdf(x, 0.0, 1.0);
assert!((pdf.data()[0] - 0.39894228).abs() < 1e-7);
assert!((cdf.data()[0] - 0.5).abs() < 1e-7);
}
#[test]
fn test_uniform_funcs() {
assert_eq!(uniform_pdf_func(0.5, 0.0, 1.0), 1.0);
assert_eq!(uniform_cdf_func(-1.0, 0.0, 1.0), 0.0);
assert_eq!(uniform_cdf_func(0.5, 0.0, 1.0), 0.5);
// x<a (or x>b) should return 0
assert_eq!(uniform_pdf_func(-0.5, 0.0, 1.0), 0.0);
assert_eq!(uniform_pdf_func(1.5, 0.0, 1.0), 0.0);
// for cdf x>a AND x>b should return 1
assert_eq!(uniform_cdf_func(1.5, 0.0, 1.0), 1.0);
assert_eq!(uniform_cdf_func(2.0, 0.0, 1.0), 1.0);
}
#[test]
fn test_uniform_matrix() {
let x = Matrix::filled(5, 5, 0.5);
let pdf = uniform_pdf(x.clone(), 0.0, 1.0);
let cdf = uniform_cdf(x, 0.0, 1.0);
assert_eq!(pdf.data()[0], 1.0);
assert_eq!(cdf.data()[0], 0.5);
}
#[test]
fn test_binomial_funcs() {
let pmf = binomial_pmf_func(5, 2, 0.5);
assert!((pmf - 0.3125).abs() < 1e-7);
let cdf = binomial_cdf_func(5, 2, 0.5);
assert!((cdf - (0.03125 + 0.15625 + 0.3125)).abs() < 1e-7);
let pmf_zero = binomial_pmf_func(5, 6, 0.5);
assert!(pmf_zero == 0.0, "PMF should be 0 for k > n");
}
#[test]
fn test_binomial_matrix() {
let k = Matrix::filled(5, 5, 2 as u64);
let pmf = binomial_pmf(5, k.clone(), 0.5);
let cdf = binomial_cdf(5, k, 0.5);
assert!((pmf.data()[0] - 0.3125).abs() < 1e-7);
assert!((cdf.data()[0] - (0.03125 + 0.15625 + 0.3125)).abs() < 1e-7);
}
#[test]
fn test_poisson_funcs() {
let pmf: f64 = poisson_pmf_func(3.0, 2);
assert!((pmf - (3.0_f64.powf(2.0) * (-3.0 as f64).exp() / 2.0)).abs() < 1e-7);
let cdf: f64 = poisson_cdf_func(3.0, 2);
assert!((cdf - (pmf + poisson_pmf_func(3.0, 0) + poisson_pmf_func(3.0, 1))).abs() < 1e-7);
}
#[test]
fn test_poisson_matrix() {
let k = Matrix::filled(5, 5, 2);
let pmf = poisson_pmf(3.0, k.clone());
let cdf = poisson_cdf(3.0, k);
assert!((pmf.data()[0] - (3.0_f64.powf(2.0) * (-3.0 as f64).exp() / 2.0)).abs() < 1e-7);
assert!(
(cdf.data()[0] - (pmf.data()[0] + poisson_pmf_func(3.0, 0) + poisson_pmf_func(3.0, 1)))
.abs()
< 1e-7
);
}
#[test]
fn test_gamma_funcs() {
// For k=1, θ=1 the Gamma(1,1) is Exp(1), so pdf(x)=e^-x
assert!((gamma_pdf_func(2.0, 1.0, 1.0) - (-2.0 as f64).exp()).abs() < 1e-7);
assert!((gamma_cdf_func(2.0, 1.0, 1.0) - (1.0 - (-2.0 as f64).exp())).abs() < 1e-7);
// <0 case
assert_eq!(gamma_pdf_func(-1.0, 1.0, 1.0), 0.0);
assert_eq!(gamma_cdf_func(-1.0, 1.0, 1.0), 0.0);
}
#[test]
fn test_gamma_matrix() {
let x = Matrix::filled(5, 5, 2.0);
let pdf = gamma_pdf(x.clone(), 1.0, 1.0);
let cdf = gamma_cdf(x, 1.0, 1.0);
assert!((pdf.data()[0] - (-2.0 as f64).exp()).abs() < 1e-7);
assert!((cdf.data()[0] - (1.0 - (-2.0 as f64).exp())).abs() < 1e-7);
}
#[test]
fn test_lower_incomplete_gamma() {
let s = Matrix::filled(5, 5, 2.0);
let x = Matrix::filled(5, 5, 1.0);
let expected = lower_incomplete_gamma_func(2.0, 1.0);
let result = lower_incomplete_gamma(s, x);
assert!((result.data()[0] - expected).abs() < 1e-7);
}
}

View File

@@ -1,131 +0,0 @@
use crate::matrix::{Matrix, SeriesOps};
use crate::compute::stats::{gamma_cdf, mean, sample_variance};
/// Two-sample t-test returning (t_statistic, p_value)
pub fn t_test(sample1: &Matrix<f64>, sample2: &Matrix<f64>) -> (f64, f64) {
let mean1 = mean(sample1);
let mean2 = mean(sample2);
let var1 = sample_variance(sample1);
let var2 = sample_variance(sample2);
let n1 = (sample1.rows() * sample1.cols()) as f64;
let n2 = (sample2.rows() * sample2.cols()) as f64;
let t_statistic = (mean1 - mean2) / ((var1 / n1 + var2 / n2).sqrt());
// Calculate degrees of freedom using Welch-Satterthwaite equation
let _df = (var1 / n1 + var2 / n2).powi(2)
/ ((var1 / n1).powi(2) / (n1 - 1.0) + (var2 / n2).powi(2) / (n2 - 1.0));
// Calculate p-value using t-distribution CDF (two-tailed)
let p_value = 0.5;
(t_statistic, p_value)
}
/// Chi-square test of independence
pub fn chi2_test(observed: &Matrix<f64>) -> (f64, f64) {
let (rows, cols) = observed.shape();
let row_sums: Vec<f64> = observed.sum_horizontal();
let col_sums: Vec<f64> = observed.sum_vertical();
let grand_total: f64 = observed.data().iter().sum();
let mut chi2_statistic: f64 = 0.0;
for i in 0..rows {
for j in 0..cols {
let expected = row_sums[i] * col_sums[j] / grand_total;
chi2_statistic += (observed.get(i, j) - expected).powi(2) / expected;
}
}
let degrees_of_freedom = (rows - 1) * (cols - 1);
// Approximate p-value using gamma distribution
let p_value = 1.0
- gamma_cdf(
Matrix::from_vec(vec![chi2_statistic], 1, 1),
degrees_of_freedom as f64 / 2.0,
1.0,
)
.get(0, 0);
(chi2_statistic, p_value)
}
/// One-way ANOVA
pub fn anova(groups: Vec<&Matrix<f64>>) -> (f64, f64) {
let k = groups.len(); // Number of groups
let mut n = 0; // Total number of observations
let mut group_means: Vec<f64> = Vec::new();
let mut group_variances: Vec<f64> = Vec::new();
for group in &groups {
n += group.rows() * group.cols();
group_means.push(mean(group));
group_variances.push(sample_variance(group));
}
let grand_mean: f64 = group_means.iter().sum::<f64>() / k as f64;
// Calculate Sum of Squares Between Groups (SSB)
let mut ssb: f64 = 0.0;
for i in 0..k {
ssb += (group_means[i] - grand_mean).powi(2) * (groups[i].rows() * groups[i].cols()) as f64;
}
// Calculate Sum of Squares Within Groups (SSW)
let mut ssw: f64 = 0.0;
for i in 0..k {
ssw += group_variances[i] * (groups[i].rows() * groups[i].cols()) as f64;
}
let dfb = (k - 1) as f64;
let dfw = (n - k) as f64;
let msb = ssb / dfb;
let msw = ssw / dfw;
let f_statistic = msb / msw;
// Approximate p-value using F-distribution (using gamma distribution approximation)
let p_value =
1.0 - gamma_cdf(Matrix::from_vec(vec![f_statistic], 1, 1), dfb / 2.0, 1.0).get(0, 0);
(f_statistic, p_value)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::matrix::Matrix;
const EPS: f64 = 1e-5;
#[test]
fn test_t_test() {
let sample1 = Matrix::from_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0], 1, 5);
let sample2 = Matrix::from_vec(vec![6.0, 7.0, 8.0, 9.0, 10.0], 1, 5);
let (t_statistic, p_value) = t_test(&sample1, &sample2);
assert!((t_statistic + 5.0).abs() < EPS);
assert!(p_value > 0.0 && p_value < 1.0);
}
#[test]
fn test_chi2_test() {
let observed = Matrix::from_vec(vec![12.0, 5.0, 8.0, 10.0], 2, 2);
let (chi2_statistic, p_value) = chi2_test(&observed);
assert!(chi2_statistic > 0.0);
assert!(p_value > 0.0 && p_value < 1.0);
}
#[test]
fn test_anova() {
let group1 = Matrix::from_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0], 1, 5);
let group2 = Matrix::from_vec(vec![2.0, 3.0, 4.0, 5.0, 6.0], 1, 5);
let group3 = Matrix::from_vec(vec![3.0, 4.0, 5.0, 6.0, 7.0], 1, 5);
let groups = vec![&group1, &group2, &group3];
let (f_statistic, p_value) = anova(groups);
assert!(f_statistic > 0.0);
assert!(p_value > 0.0 && p_value < 1.0);
}
}

View File

@@ -1,9 +0,0 @@
pub mod correlation;
pub mod descriptive;
pub mod distributions;
pub mod inferential;
pub use correlation::*;
pub use descriptive::*;
pub use distributions::*;
pub use inferential::*;

View File

@@ -4,4 +4,4 @@ pub mod ops;
pub use base::*;
#[allow(unused_imports)]
pub use ops::*;
pub use ops::*;

View File

@@ -171,7 +171,7 @@ mod tests {
#[test]
fn test_bool_ops_count_overall() {
let matrix = create_bool_test_matrix(); // Data: [T, F, T, F, T, F, T, F, F]
// Count of true values: 4
// Count of true values: 4
assert_eq!(matrix.count(), 4);
let matrix_all_false = BoolMatrix::from_vec(vec![false; 5], 5, 1); // 5x1
@@ -211,7 +211,7 @@ mod tests {
#[test]
fn test_bool_ops_1xn_matrix() {
let matrix = BoolMatrix::from_vec(vec![true, false, false, true], 1, 4); // 1 row, 4 cols
// Data: [T, F, F, T]
// Data: [T, F, F, T]
assert_eq!(matrix.any_vertical(), vec![true, false, false, true]);
assert_eq!(matrix.all_vertical(), vec![true, false, false, true]);
@@ -229,7 +229,7 @@ mod tests {
#[test]
fn test_bool_ops_nx1_matrix() {
let matrix = BoolMatrix::from_vec(vec![true, false, false, true], 4, 1); // 4 rows, 1 col
// Data: [T, F, F, T]
// Data: [T, F, F, T]
assert_eq!(matrix.any_vertical(), vec![true]); // T|F|F|T = T
assert_eq!(matrix.all_vertical(), vec![false]); // T&F&F&T = F

View File

@@ -63,19 +63,6 @@ impl<T: Clone> Matrix<T> {
Matrix { rows, cols, data }
}
/// Build from a flat Vec, assuming row-major order.
pub fn from_rows_vec(data: Vec<T>, rows: usize, cols: usize) -> Self {
let mut new_vec = Vec::with_capacity(rows * cols);
for c in 0..cols {
for r in 0..rows {
new_vec.push(data[r * cols + c].clone());
}
}
Matrix::from_vec(new_vec, rows, cols)
}
pub fn data(&self) -> &[T] {
&self.data
}
@@ -374,53 +361,17 @@ impl<T: Clone> Matrix<T> {
}
Matrix::from_vec(data, n, self.cols)
}
}
impl Matrix<f64> {
/// Creates a new matrix filled with a specific value of the specified size.
pub fn filled(rows: usize, cols: usize, value: T) -> Self {
pub fn filled(rows: usize, cols: usize, value: f64) -> Self {
Matrix {
rows,
cols,
data: vec![value; rows * cols], // Fill with the specified value
}
}
/// Creates a new matrix by broadcasting a 1-row matrix to a target shape.
/// Panics if `self` is not a 1-row matrix or if `self.cols()` does not match `target_cols`.
pub fn broadcast_row_to_target_shape(
&self,
target_rows: usize,
target_cols: usize,
) -> Matrix<T> {
assert_eq!(
self.rows(),
1,
"broadcast_row_to_target_shape can only be called on a 1-row matrix."
);
assert_eq!(
self.cols(),
target_cols,
"Column count mismatch for broadcasting: source has {} columns, target has {} columns.",
self.cols(),
target_cols
);
let mut data = Vec::with_capacity(target_rows * target_cols);
let original_row_data = self.row(0); // Get the single row data
for _ in 0..target_rows {
// Repeat 'target_rows' times
for value in &original_row_data {
// Iterate over elements of the row
data.push(value.clone());
}
}
// The data is now in row-major order for the new matrix.
// We need to convert it to column-major for Matrix::from_vec.
Matrix::from_rows_vec(data, target_rows, target_cols)
}
}
impl Matrix<f64> {
/// Creates a new matrix filled with zeros of the specified size.
pub fn zeros(rows: usize, cols: usize) -> Self {
Matrix::filled(rows, cols, 0.0)
@@ -430,11 +381,6 @@ impl Matrix<f64> {
pub fn ones(rows: usize, cols: usize) -> Self {
Matrix::filled(rows, cols, 1.0)
}
/// Creates a new matrix filled with NaN values of the specified size.
pub fn nan(rows: usize, cols: usize) -> Matrix<f64> {
Matrix::filled(rows, cols, f64::NAN)
}
}
impl<T> Index<(usize, usize)> for Matrix<T> {
@@ -1026,20 +972,6 @@ mod tests {
assert_eq!(m.to_vec(), vec![1.0, 3.0, 2.0, 4.0]);
}
#[test]
fn test_from_rows_vec() {
// Representing:
// 1 2 3
// 4 5 6
let rows_data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
let matrix = Matrix::from_rows_vec(rows_data, 2, 3);
let data = vec![1.0, 4.0, 2.0, 5.0, 3.0, 6.0]; // Column-major
let expected = Matrix::from_vec(data, 2, 3);
assert_eq!(matrix, expected);
}
// Helper function to create a basic Matrix for testing
fn static_test_matrix() -> Matrix<i32> {
// Column-major data:
@@ -1266,28 +1198,6 @@ mod tests {
ma.row_copy_from_slice(1, &new_row);
assert_eq!(ma.row(1), &[10, 20, 30]);
}
#[test]
#[should_panic(expected = "row index 4 out of bounds for 3 rows")]
fn test_row_copy_from_slice_out_of_bounds() {
let mut ma = static_test_matrix();
let new_row = vec![10, 20, 30];
ma.row_copy_from_slice(4, &new_row);
}
#[test]
#[should_panic(expected = "row index 3 out of bounds for 3 rows")]
fn test_row_out_of_bounds_index() {
let ma = static_test_matrix();
ma.row(3);
}
#[test]
#[should_panic(expected = "input slice length 2 does not match number of columns 3")]
fn test_row_copy_from_slice_wrong_length() {
let mut ma = static_test_matrix();
let new_row = vec![10, 20]; // Only 2 elements, but row length is 3
ma.row_copy_from_slice(1, &new_row);
}
#[test]
fn test_shape() {
@@ -2019,66 +1929,5 @@ mod tests {
assert_eq!(m.rows(), 2);
assert_eq!(m.cols(), 2);
assert_eq!(m.data(), &[42.5, 42.5, 42.5, 42.5]);
// test with an integer matrix
let m = Matrix::<i32>::filled(2, 3, 7);
assert_eq!(m.rows(), 2);
assert_eq!(m.cols(), 3);
assert_eq!(m.data(), &[7, 7, 7, 7, 7, 7]);
// test with nans
let m = Matrix::nan(3, 3);
assert_eq!(m.rows(), 3);
assert_eq!(m.cols(), 3);
for &value in m.data() {
assert!(value.is_nan(), "Expected NaN, got {}", value);
}
}
#[test]
fn test_broadcast_row_to_target_shape_basic() {
let single_row_matrix = Matrix::from_rows_vec(vec![1.0, 2.0, 3.0], 1, 3);
let target_rows = 5;
let target_cols = 3;
let broadcasted = single_row_matrix.broadcast_row_to_target_shape(target_rows, target_cols);
assert_eq!(broadcasted.rows(), target_rows);
assert_eq!(broadcasted.cols(), target_cols);
for r in 0..target_rows {
assert_eq!(broadcasted.row(r), vec![1.0, 2.0, 3.0]);
}
}
#[test]
fn test_broadcast_row_to_target_shape_single_row() {
let single_row_matrix = Matrix::from_rows_vec(vec![10.0, 20.0], 1, 2);
let target_rows = 1;
let target_cols = 2;
let broadcasted = single_row_matrix.broadcast_row_to_target_shape(target_rows, target_cols);
assert_eq!(broadcasted.rows(), target_rows);
assert_eq!(broadcasted.cols(), target_cols);
assert_eq!(broadcasted.row(0), vec![10.0, 20.0]);
}
#[test]
#[should_panic(
expected = "broadcast_row_to_target_shape can only be called on a 1-row matrix."
)]
fn test_broadcast_row_to_target_shape_panic_not_1_row() {
let multi_row_matrix = Matrix::from_rows_vec(vec![1.0, 2.0, 3.0, 4.0], 2, 2);
multi_row_matrix.broadcast_row_to_target_shape(3, 2);
}
#[test]
#[should_panic(
expected = "Column count mismatch for broadcasting: source has 3 columns, target has 4 columns."
)]
fn test_broadcast_row_to_target_shape_panic_col_mismatch() {
let single_row_matrix = Matrix::from_rows_vec(vec![1.0, 2.0, 3.0], 1, 3);
single_row_matrix.broadcast_row_to_target_shape(5, 4);
}
}

View File

@@ -1,7 +1,7 @@
pub mod boolops;
pub mod mat;
pub mod seriesops;
pub mod boolops;
pub use boolops::*;
pub use mat::*;
pub use seriesops::*;
pub use boolops::*;