25 Commits

Author SHA1 Message Date
27e9eab028 Merge pull request #58 from Magnus167/prep
Updating README
2025-07-17 00:16:23 +01:00
Palash Tyagi
c13fcc99f7 Remove commented-out dev-dependencies from Cargo.toml 2025-07-16 19:14:45 -04:00
Palash Tyagi
eb9de0a647 Fix typos and improve clarity in README documentation 2025-07-16 19:14:06 -04:00
Palash Tyagi
044c3284df Enhance README with detailed project scope, features, and compute module overview 2025-07-17 00:09:42 +01:00
Palash Tyagi
ad4cadd8fb Update version in Cargo.toml and enhance README for clarity and project scope 2025-07-16 23:51:42 +01:00
34b09508f3 Merge pull request #57 from Magnus167/compute
Add statistical functions and machine learning models
2025-07-16 01:53:39 +01:00
Palash Tyagi
a8a532f252 Remove Spearman correlation function and unused rank import from stats module 2025-07-16 01:50:28 +01:00
Palash Tyagi
19c3dde169 Add Pearson and Spearman correlation functions to stats module 2025-07-16 01:32:18 +01:00
Palash Tyagi
a335d29347 Simplify t-test assertion in unit test for clarity 2025-07-15 01:05:32 +01:00
Palash Tyagi
b2f6794e05 Add inferential module to stats module exports 2025-07-15 01:02:20 +01:00
Palash Tyagi
5f1f0970da Implement statistical tests: t-test, chi-square test, and ANOVA with corresponding unit tests 2025-07-15 01:02:14 +01:00
Palash Tyagi
7bbfb5394f Add tests for sample variance and standard deviation calculations 2025-07-15 01:01:40 +01:00
Palash Tyagi
285147d52b Refactor variance functions to distinguish between population and sample variance 2025-07-15 01:00:03 +01:00
Palash Tyagi
64722914bd Add test for KMeans empty cluster reinitialization logic 2025-07-13 02:24:29 +01:00
Palash Tyagi
86ea548b4f Remove test for KMeans empty cluster reinitialization 2025-07-13 01:51:43 +01:00
Palash Tyagi
1bdcf1b113 Refactor test for KMeans empty cluster reinitialization to use distinct data points and remove redundant assertion 2025-07-13 01:49:15 +01:00
Palash Tyagi
7c7c8c2a16 Remove redundant assertion message in empty cluster reinitialization test 2025-07-13 01:41:41 +01:00
Palash Tyagi
4d8ed2e908 Add test for KMeans empty cluster reinitialization logic 2025-07-13 01:41:20 +01:00
Palash Tyagi
62d4803075 Simplify assertion for unique labels in KMeans tests when k equals m 2025-07-13 01:35:02 +01:00
Palash Tyagi
19bc09fd5a Refactor KMeans centroid initialization and improve handling of edge cases 2025-07-13 01:29:19 +01:00
Palash Tyagi
bda9b84987 Refactor KMeans centroid initialization to handle k=1 case by setting centroid to mean of data 2025-07-13 00:16:29 +01:00
Palash Tyagi
c24eb4a08c Relax assertion tolerance in KMeans tests to align with algorithm's convergence criteria 2025-07-12 01:47:40 +01:00
Palash Tyagi
12a72317e4 Refactor KMeans fit and predict methods for improved clarity and performance 2025-07-12 01:45:59 +01:00
Palash Tyagi
049dd02c1a Remove unreachable panic 2025-07-12 01:35:51 +01:00
Palash Tyagi
bc87e40481 Add test for variance smoothing with zero smoothing in GaussianNB 2025-07-12 01:34:08 +01:00
8 changed files with 571 additions and 77 deletions

View File

@@ -1,6 +1,6 @@
[package]
name = "rustframe"
version = "0.0.1-a.0"
version = "0.0.1-a.20250716"
edition = "2021"
license = "GPL-3.0-or-later"
readme = "README.md"
@@ -19,9 +19,6 @@ rand = "^0.9.1"
[features]
bench = ["dep:criterion"]
# [dev-dependencies]
# criterion = { version = "0.5", features = ["html_reports"], optional = true }
[[bench]]
name = "benchmarks"
harness = false

View File

@@ -2,7 +2,7 @@
<!-- # <img align="center" alt="Rustframe" src=".github/rustframe_logo.png" height="50px" /> rustframe -->
<!-- though the centre tag doesn't work as it would noramlly, it achieves the desired effect -->
<!-- though the centre tag doesn't work as it would normally, it achieves the desired effect -->
📚 [Docs](https://magnus167.github.io/rustframe/) | 🐙 [GitHub](https://github.com/Magnus167/rustframe) | 🌐 [Gitea mirror](https://gitea.nulltech.uk/Magnus167/rustframe) | 🦀 [Crates.io](https://crates.io/crates/rustframe) | 🔖 [docs.rs](https://docs.rs/rustframe/latest/rustframe/)
@@ -15,26 +15,54 @@
## Rustframe: _A lightweight dataframe & math toolkit for Rust_
Rustframe provides intuitive dataframe, matrix, and series operations small-to-mid scale data analysis and manipulation.
Rustframe provides intuitive dataframe, matrix, and series operations for data analysis and manipulation.
Rustframe keeps things simple, safe, and readable. It is handy for quick numeric experiments and small analytical tasks, but it is **not** meant to compete with powerhouse crates like `polars` or `ndarray`.
Rustframe keeps things simple, safe, and readable. It is handy for quick numeric experiments and small analytical tasks as well as for educational purposes. It is designed to be easy to use and understand, with a clean API implemented in 100% safe Rust.
Rustframe is an educational project, and is not intended for production use. It is **not** meant to compete with powerhouse crates like `polars` or `ndarray`. It is a work in progress, and the API is subject to change. There are no guarantees of stability or performance, and it is not optimized for large datasets or high-performance computing.
### What it offers
- **Math that reads like math** - elementwise `+`, ``, `×`, `÷` on entire frames or scalars.
- **Broadcast & reduce** - sum, product, any/all across rows or columns without boilerplate.
- **Boolean masks made simple** - chain comparisons, combine with `&`/`|`, get a tidy `BoolMatrix` back.
- **Datecentric row index** - businessday ranges and calendar slicing built in.
- **Pure safe Rust** - 100% safe, zero `unsafe`.
- **Matrix operations** - Element-wise arithmetic, boolean logic, transpose, and more.
- **Math that reads like math** - element-wise `+`, ``, `×`, `÷` on entire frames or scalars.
- **Frames** - Column major data structure for single-type data, with labeled columns and typed row indices.
- **Compute module** - Implements various statistical computations and machine learning models.
- **[Coming Soon]** _DataFrame_ - Multi-type data structure for heterogeneous data, with labeled columns and typed row indices.
- **[Coming Soon]** _Random number utils_ - Random number generation utilities for statistical sampling and simulations. (Currently using the [`rand`](https://crates.io/crates/rand) crate.)
#### Matrix and Frame functionality
- **Matrix operations** - Element-wise arithmetic, boolean logic, transpose, and more.
- **Frame operations** - Column manipulation, sorting, and more.
#### Compute Module
The `compute` module provides implementations for various statistical computations and machine learning models.
**Statistics, Data Analysis, and Machine Learning:**
- Correlation analysis
- Descriptive statistics
- Distributions
- Inferential statistics
- Dense Neural Networks
- Gaussian Naive Bayes
- K-Means Clustering
- Linear Regression
- Logistic Regression
- Principal Component Analysis
### Heads up
- **Not memoryefficient (yet)** - footprint needs work.
- **Feature set still small** - expect missing pieces.
- **The feature set is still limited** - expect missing pieces.
### On the horizon
### Somewhere down the line
- Optional GPU help (Vulkan or similar) for heavier workloads.
- Optional GPU acceleration (Vulkan or similar) for heavier workloads.
- Straightforward Python bindings using `pyo3`.
---
@@ -51,7 +79,7 @@ use rustframe::{
let n_periods = 4;
// Four business days starting 20240102
// Four business days starting 2024-01-02
let dates: Vec<NaiveDate> =
BDatesList::from_n_periods("2024-01-02".to_string(), DateFreq::Daily, n_periods)
.unwrap()
@@ -86,13 +114,13 @@ let result: Matrix<f64> = result / 2.0; // divide by scalar
let check: bool = result.eq_elem(ma.clone()).all();
assert!(check);
// The above math can also be written as:
// Alternatively:
let check: bool = (&(&(&(&ma + 1.0) - 1.0) * 2.0) / 2.0)
.eq_elem(ma.clone())
.all();
assert!(check);
// The above math can also be written as:
// or even as:
let check: bool = ((((ma.clone() + 1.0) - 1.0) * 2.0) / 2.0)
.eq_elem(ma.clone())
.all();
@@ -163,3 +191,11 @@ E.g. to run the `game_of_life` example:
```bash
cargo run --example game_of_life
```
### Running benchmarks
To run the benchmarks, use:
```bash
cargo bench --features "bench"
```

View File

@@ -88,9 +88,6 @@ impl GaussianNB {
for &c in &self.classes {
let idx = &groups[&c.to_bits()];
let count = idx.len();
if count == 0 {
panic!("Class group for label {} is empty", c);
}
// Prior
self.priors.push(count as f64 / m as f64);
@@ -208,4 +205,26 @@ mod tests {
let mut clf = GaussianNB::new(1e-9, false);
clf.fit(&x, &y);
}
#[test]
fn test_variance_smoothing_override_with_zero_smoothing() {
// Scenario: var_smoothing is 0, and a feature has zero variance within a class.
// This should trigger the `if var[(0, j)] <= 0.0 { var[(0, j)] = smoothing; }` line.
let x = Matrix::from_vec(vec![1.0, 1.0, 2.0], 3, 1); // Class 0: [1.0, 1.0], Class 1: [2.0]
let y = Matrix::from_vec(vec![0.0, 0.0, 1.0], 3, 1);
let mut clf = GaussianNB::new(0.0, false); // var_smoothing = 0.0
clf.fit(&x, &y);
// For class 0 (index 0 in clf.classes), the feature (index 0) had values [1.0, 1.0], so variance was 0.
// Since var_smoothing was 0, smoothing is 0.
// The line `var[(0, j)] = smoothing;` should have set the variance to 0.0.
let class_0_idx = clf.classes.iter().position(|&c| c == 0.0).unwrap();
assert_eq!(clf.variances[class_0_idx][(0, 0)], 0.0);
// For class 1 (index 1 in clf.classes), the feature (index 0) had value [2.0].
// Variance calculation for a single point results in 0.
// The if condition will be true, and var[(0, j)] will be set to smoothing (0.0).
let class_1_idx = clf.classes.iter().position(|&c| c == 1.0).unwrap();
assert_eq!(clf.variances[class_1_idx][(0, 0)], 0.0);
}
}

View File

@@ -1,4 +1,6 @@
use crate::compute::stats::mean_vertical;
use crate::matrix::Matrix;
use rand::rng;
use rand::seq::SliceRandom;
pub struct KMeans {
@@ -12,35 +14,52 @@ impl KMeans {
let n = x.cols();
assert!(k <= m, "k must be ≤ number of samples");
// ----- initialise centroids: pick k distinct rows at random -----
let mut rng = rand::rng();
let mut indices: Vec<usize> = (0..m).collect();
indices.shuffle(&mut rng);
// ----- initialise centroids -----
let mut centroids = Matrix::zeros(k, n);
for (c, &i) in indices[..k].iter().enumerate() {
for j in 0..n {
centroids[(c, j)] = x[(i, j)];
if k > 0 && m > 0 {
// case for empty data
if k == 1 {
let mean = mean_vertical(x);
centroids.row_copy_from_slice(0, &mean.data()); // ideally, data.row(0), but thats the same
} else {
// For k > 1, pick k distinct rows at random
let mut rng = rng();
let mut indices: Vec<usize> = (0..m).collect();
indices.shuffle(&mut rng);
for c in 0..k {
centroids.row_copy_from_slice(c, &x.row(indices[c]));
}
}
}
let mut labels = vec![0usize; m];
for _ in 0..max_iter {
// ----- assignment step -----
let mut distances = vec![0.0f64; m];
for _iter in 0..max_iter {
let mut changed = false;
// ----- assignment step -----
for i in 0..m {
let sample_row = x.row(i);
let mut best = 0usize;
let mut best_dist = f64::MAX;
let mut best_dist_sq = f64::MAX;
for c in 0..k {
let mut dist = 0.0;
for j in 0..n {
let d = x[(i, j)] - centroids[(c, j)];
dist += d * d;
}
if dist < best_dist {
best_dist = dist;
let centroid_row = centroids.row(c);
let dist_sq: f64 = sample_row
.iter()
.zip(centroid_row.iter())
.map(|(a, b)| (a - b).powi(2))
.sum();
if dist_sq < best_dist_sq {
best_dist_sq = dist_sq;
best = c;
}
}
distances[i] = best_dist_sq;
if labels[i] != best {
labels[i] = best;
changed = true;
@@ -48,37 +67,57 @@ impl KMeans {
}
// ----- update step -----
let mut new_centroids = Matrix::zeros(k, n);
let mut counts = vec![0usize; k];
let mut centroids = Matrix::zeros(k, n);
for i in 0..m {
let c = labels[i];
counts[c] += 1;
for j in 0..n {
centroids[(c, j)] += x[(i, j)];
new_centroids[(c, j)] += x[(i, j)];
}
}
for c in 0..k {
if counts[c] > 0 {
if counts[c] == 0 {
// This cluster is empty. Re-initialize its centroid to the point
// furthest from its assigned centroid to prevent the cluster from dying.
let mut furthest_point_idx = 0;
let mut max_dist_sq = 0.0;
for (i, &dist) in distances.iter().enumerate() {
if dist > max_dist_sq {
max_dist_sq = dist;
furthest_point_idx = i;
}
}
for j in 0..n {
centroids[(c, j)] /= counts[c] as f64;
new_centroids[(c, j)] = x[(furthest_point_idx, j)];
}
// Ensure this point isn't chosen again for another empty cluster in the same iteration.
if m > 0 {
distances[furthest_point_idx] = 0.0;
}
} else {
// Normalize the centroid by the number of points in it.
for j in 0..n {
new_centroids[(c, j)] /= counts[c] as f64;
}
}
}
// ----- convergence test -----
if !changed {
centroids = new_centroids; // update before breaking
break; // assignments stable
}
let diff = &new_centroids - &centroids;
centroids = new_centroids; // Update for the next iteration
if tol > 0.0 {
// optional centroid-shift tolerance
let mut shift: f64 = 0.0;
for c in 0..k {
for j in 0..n {
let d = centroids[(c, j)] - centroids[(c, j)]; // previous stored?
shift += d * d;
}
}
if shift.sqrt() < tol {
let sq_diff = &diff * &diff;
let shift = sq_diff.data().iter().sum::<f64>().sqrt();
if shift < tol {
break;
}
}
@@ -90,19 +129,28 @@ impl KMeans {
pub fn predict(&self, x: &Matrix<f64>) -> Vec<usize> {
let m = x.rows();
let k = self.centroids.rows();
let n = x.cols();
if m == 0 {
return Vec::new();
}
let mut labels = vec![0usize; m];
for i in 0..m {
let sample_row = x.row(i);
let mut best = 0usize;
let mut best_dist = f64::MAX;
let mut best_dist_sq = f64::MAX;
for c in 0..k {
let mut dist = 0.0;
for j in 0..n {
let d = x[(i, j)] - self.centroids[(c, j)];
dist += d * d;
}
if dist < best_dist {
best_dist = dist;
let centroid_row = self.centroids.row(c);
let dist_sq: f64 = sample_row
.iter()
.zip(centroid_row.iter())
.map(|(a, b)| (a - b).powi(2))
.sum();
if dist_sq < best_dist_sq {
best_dist_sq = dist_sq;
best = c;
}
}
@@ -111,3 +159,206 @@ impl KMeans {
labels
}
}
#[cfg(test)]
mod tests {
#[test]
fn test_k_means_empty_cluster_reinit_centroid() {
// Try multiple times to increase the chance of hitting the empty cluster case
for _ in 0..20 {
let data = vec![0.0, 0.0, 0.0, 0.0, 10.0, 10.0];
let x = FloatMatrix::from_rows_vec(data, 3, 2);
let k = 2;
let max_iter = 10;
let tol = 1e-6;
let (kmeans_model, labels) = KMeans::fit(&x, k, max_iter, tol);
// Check if any cluster is empty
let mut counts = vec![0; k];
for &label in &labels {
counts[label] += 1;
}
if counts.iter().any(|&c| c == 0) {
// Only check the property for clusters that are empty
let centroids = kmeans_model.centroids;
for c in 0..k {
if counts[c] == 0 {
let mut matches_data_point = false;
for i in 0..3 {
let dx = centroids[(c, 0)] - x[(i, 0)];
let dy = centroids[(c, 1)] - x[(i, 1)];
if dx.abs() < 1e-9 && dy.abs() < 1e-9 {
matches_data_point = true;
break;
}
}
assert!(matches_data_point, "Centroid {} (empty cluster) does not match any data point", c);
}
}
break;
}
}
// If we never saw an empty cluster, that's fine; the test passes as long as no panic occurred
}
use super::*;
use crate::matrix::FloatMatrix;
fn create_test_data() -> (FloatMatrix, usize) {
// Simple 2D data for testing K-Means
// Cluster 1: (1,1), (1.5,1.5)
// Cluster 2: (5,8), (8,8), (6,7)
let data = vec![
1.0, 1.0, // Sample 0
1.5, 1.5, // Sample 1
5.0, 8.0, // Sample 2
8.0, 8.0, // Sample 3
6.0, 7.0, // Sample 4
];
let x = FloatMatrix::from_rows_vec(data, 5, 2);
let k = 2;
(x, k)
}
// Helper for single cluster test with exact mean
fn create_simple_integer_data() -> FloatMatrix {
// Data points: (1,1), (2,2), (3,3)
FloatMatrix::from_rows_vec(vec![1.0, 1.0, 2.0, 2.0, 3.0, 3.0], 3, 2)
}
#[test]
fn test_k_means_fit_predict_basic() {
let (x, k) = create_test_data();
let max_iter = 100;
let tol = 1e-6;
let (kmeans_model, labels) = KMeans::fit(&x, k, max_iter, tol);
// Assertions for fit
assert_eq!(kmeans_model.centroids.rows(), k);
assert_eq!(kmeans_model.centroids.cols(), x.cols());
assert_eq!(labels.len(), x.rows());
// Check if labels are within expected range (0 to k-1)
for &label in &labels {
assert!(label < k);
}
// Predict with the same data
let predicted_labels = kmeans_model.predict(&x);
// The exact labels might vary due to random initialization,
// but the clustering should be consistent.
// We expect two clusters. Let's check if samples 0,1 are in one cluster
// and samples 2,3,4 are in another.
let cluster_0_members = vec![labels[0], labels[1]];
let cluster_1_members = vec![labels[2], labels[3], labels[4]];
// All members of cluster 0 should have the same label
assert_eq!(cluster_0_members[0], cluster_0_members[1]);
// All members of cluster 1 should have the same label
assert_eq!(cluster_1_members[0], cluster_1_members[1]);
assert_eq!(cluster_1_members[0], cluster_1_members[2]);
// The two clusters should have different labels
assert_ne!(cluster_0_members[0], cluster_1_members[0]);
// Check predicted labels are consistent with fitted labels
assert_eq!(labels, predicted_labels);
// Test with a new sample
let new_sample_data = vec![1.2, 1.3]; // Should be close to cluster 0
let new_sample = FloatMatrix::from_rows_vec(new_sample_data, 1, 2);
let new_sample_label = kmeans_model.predict(&new_sample)[0];
assert_eq!(new_sample_label, cluster_0_members[0]);
let new_sample_data_2 = vec![7.0, 7.5]; // Should be close to cluster 1
let new_sample_2 = FloatMatrix::from_rows_vec(new_sample_data_2, 1, 2);
let new_sample_label_2 = kmeans_model.predict(&new_sample_2)[0];
assert_eq!(new_sample_label_2, cluster_1_members[0]);
}
#[test]
fn test_k_means_fit_k_equals_m() {
// Test case where k (number of clusters) equals m (number of samples)
let (x, _) = create_test_data(); // 5 samples
let k = 5; // 5 clusters
let max_iter = 10;
let tol = 1e-6;
let (kmeans_model, labels) = KMeans::fit(&x, k, max_iter, tol);
assert_eq!(kmeans_model.centroids.rows(), k);
assert_eq!(labels.len(), x.rows());
// Each sample should be its own cluster. Due to random init, labels
// might not be [0,1,2,3,4] but will be a permutation of it.
let mut sorted_labels = labels.clone();
sorted_labels.sort_unstable();
sorted_labels.dedup();
// Labels should all be unique when k==m
assert_eq!(sorted_labels.len(), k);
}
#[test]
#[should_panic(expected = "k must be ≤ number of samples")]
fn test_k_means_fit_k_greater_than_m() {
let (x, _) = create_test_data(); // 5 samples
let k = 6; // k > m
let max_iter = 10;
let tol = 1e-6;
let (_kmeans_model, _labels) = KMeans::fit(&x, k, max_iter, tol);
}
#[test]
fn test_k_means_fit_single_cluster() {
// Test with k=1
let x = create_simple_integer_data(); // Use integer data
let k = 1;
let max_iter = 100;
let tol = 1e-6;
let (kmeans_model, labels) = KMeans::fit(&x, k, max_iter, tol);
assert_eq!(kmeans_model.centroids.rows(), 1);
assert_eq!(labels.len(), x.rows());
// All labels should be 0
assert!(labels.iter().all(|&l| l == 0));
// Centroid should be the mean of all data points
let expected_centroid_x = x.column(0).iter().sum::<f64>() / x.rows() as f64;
let expected_centroid_y = x.column(1).iter().sum::<f64>() / x.rows() as f64;
assert!((kmeans_model.centroids[(0, 0)] - expected_centroid_x).abs() < 1e-9);
assert!((kmeans_model.centroids[(0, 1)] - expected_centroid_y).abs() < 1e-9);
}
#[test]
fn test_k_means_predict_empty_matrix() {
let (x, k) = create_test_data();
let max_iter = 10;
let tol = 1e-6;
let (kmeans_model, _labels) = KMeans::fit(&x, k, max_iter, tol);
// The `Matrix` type not support 0xN or Nx0 matrices.
// test with a 0x0 matrix is a valid edge case.
let empty_x = FloatMatrix::from_rows_vec(vec![], 0, 0);
let predicted_labels = kmeans_model.predict(&empty_x);
assert!(predicted_labels.is_empty());
}
#[test]
fn test_k_means_predict_single_sample() {
let (x, k) = create_test_data();
let max_iter = 10;
let tol = 1e-6;
let (kmeans_model, _labels) = KMeans::fit(&x, k, max_iter, tol);
let single_sample = FloatMatrix::from_rows_vec(vec![1.1, 1.2], 1, 2);
let predicted_label = kmeans_model.predict(&single_sample);
assert_eq!(predicted_label.len(), 1);
assert!(predicted_label[0] < k);
}
}

View File

@@ -1,4 +1,4 @@
use crate::compute::stats::{mean, mean_horizontal, mean_vertical};
use crate::compute::stats::{mean, mean_horizontal, mean_vertical, stddev};
use crate::matrix::{Axis, Matrix, SeriesOps};
/// Population covariance between two equally-sized matrices (flattened)
@@ -113,6 +113,21 @@ pub fn covariance_matrix(x: &Matrix<f64>, axis: Axis) -> Matrix<f64> {
centered_data.transpose().matrix_mul(&centered_data) / (n_samples as f64 - 1.0)
}
pub fn pearson(x: &Matrix<f64>, y: &Matrix<f64>) -> f64 {
assert_eq!(x.rows(), y.rows());
assert_eq!(x.cols(), y.cols());
let cov = covariance(x, y);
let std_x = stddev(x);
let std_y = stddev(y);
if std_x == 0.0 || std_y == 0.0 {
return 0.0; // Avoid division by zero
}
cov / (std_x * std_y)
}
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -14,17 +14,29 @@ pub fn mean_horizontal(x: &Matrix<f64>) -> Matrix<f64> {
Matrix::from_vec(x.sum_horizontal(), x.rows(), 1) / n
}
pub fn variance(x: &Matrix<f64>) -> f64 {
fn population_or_sample_variance(x: &Matrix<f64>, population: bool) -> f64 {
let m = (x.rows() * x.cols()) as f64;
let mean_val = mean(x);
x.data()
.iter()
.map(|&v| (v - mean_val).powi(2))
.sum::<f64>()
/ m
/ if population { m } else { m - 1.0 }
}
fn _variance_axis(x: &Matrix<f64>, axis: Axis) -> Matrix<f64> {
pub fn population_variance(x: &Matrix<f64>) -> f64 {
population_or_sample_variance(x, true)
}
pub fn sample_variance(x: &Matrix<f64>) -> f64 {
population_or_sample_variance(x, false)
}
fn _population_or_sample_variance_axis(
x: &Matrix<f64>,
axis: Axis,
population: bool,
) -> Matrix<f64> {
match axis {
Axis::Row => {
// Calculate variance for each column (vertical variance)
@@ -39,7 +51,7 @@ fn _variance_axis(x: &Matrix<f64>, axis: Axis) -> Matrix<f64> {
let diff = x.get(r, c) - mean_val;
sum_sq_diff += diff * diff;
}
result_data[c] = sum_sq_diff / num_rows;
result_data[c] = sum_sq_diff / (if population { num_rows } else { num_rows - 1.0 });
}
Matrix::from_vec(result_data, 1, x.cols())
}
@@ -56,30 +68,39 @@ fn _variance_axis(x: &Matrix<f64>, axis: Axis) -> Matrix<f64> {
let diff = x.get(r, c) - mean_val;
sum_sq_diff += diff * diff;
}
result_data[r] = sum_sq_diff / num_cols;
result_data[r] = sum_sq_diff / (if population { num_cols } else { num_cols - 1.0 });
}
Matrix::from_vec(result_data, x.rows(), 1)
}
}
}
pub fn variance_vertical(x: &Matrix<f64>) -> Matrix<f64> {
_variance_axis(x, Axis::Row)
pub fn population_variance_vertical(x: &Matrix<f64>) -> Matrix<f64> {
_population_or_sample_variance_axis(x, Axis::Row, true)
}
pub fn variance_horizontal(x: &Matrix<f64>) -> Matrix<f64> {
_variance_axis(x, Axis::Col)
pub fn population_variance_horizontal(x: &Matrix<f64>) -> Matrix<f64> {
_population_or_sample_variance_axis(x, Axis::Col, true)
}
pub fn sample_variance_vertical(x: &Matrix<f64>) -> Matrix<f64> {
_population_or_sample_variance_axis(x, Axis::Row, false)
}
pub fn sample_variance_horizontal(x: &Matrix<f64>) -> Matrix<f64> {
_population_or_sample_variance_axis(x, Axis::Col, false)
}
pub fn stddev(x: &Matrix<f64>) -> f64 {
variance(x).sqrt()
population_variance(x).sqrt()
}
pub fn stddev_vertical(x: &Matrix<f64>) -> Matrix<f64> {
variance_vertical(x).map(|v| v.sqrt())
population_variance_vertical(x).map(|v| v.sqrt())
}
pub fn stddev_horizontal(x: &Matrix<f64>) -> Matrix<f64> {
variance_horizontal(x).map(|v| v.sqrt())
population_variance_horizontal(x).map(|v| v.sqrt())
}
pub fn median(x: &Matrix<f64>) -> f64 {
@@ -180,7 +201,7 @@ mod tests {
assert!((mean(&x) - 3.0).abs() < EPSILON);
// Variance
assert!((variance(&x) - 2.0).abs() < EPSILON);
assert!((population_variance(&x) - 2.0).abs() < EPSILON);
// Standard Deviation
assert!((stddev(&x) - 1.4142135623730951).abs() < EPSILON);
@@ -209,7 +230,7 @@ mod tests {
assert!((mean(&x) - 22.0).abs() < EPSILON);
// Variance should be heavily affected by outlier
assert!((variance(&x) - 1522.0).abs() < EPSILON);
assert!((population_variance(&x) - 1522.0).abs() < EPSILON);
// Standard Deviation should be heavily affected by outlier
assert!((stddev(&x) - 39.0128183970461).abs() < EPSILON);
@@ -258,14 +279,25 @@ mod tests {
let x = Matrix::from_vec(data, 2, 3);
// cols: {1,4}, {2,5}, {3,6} all give 2.25
let vv = variance_vertical(&x);
let vv = population_variance_vertical(&x);
for c in 0..3 {
assert!((vv.get(0, c) - 2.25).abs() < EPSILON);
}
let vh = variance_horizontal(&x);
let vh = population_variance_horizontal(&x);
assert!((vh.get(0, 0) - (2.0 / 3.0)).abs() < EPSILON);
assert!((vh.get(1, 0) - (2.0 / 3.0)).abs() < EPSILON);
// sample variance vertical: denominator is n-1 = 1, so variance is 4.5
let svv = sample_variance_vertical(&x);
for c in 0..3 {
assert!((svv.get(0, c) - 4.5).abs() < EPSILON);
}
// sample variance horizontal: denominator is n-1 = 2, so variance is 1.0
let svh = sample_variance_horizontal(&x);
assert!((svh.get(0, 0) - 1.0).abs() < EPSILON);
assert!((svh.get(1, 0) - 1.0).abs() < EPSILON);
}
#[test]
@@ -284,6 +316,17 @@ mod tests {
let expected = (2.0 / 3.0 as f64).sqrt();
assert!((sh.get(0, 0) - expected).abs() < EPSILON);
assert!((sh.get(1, 0) - expected).abs() < EPSILON);
// sample stddev vertical: sqrt(4.5) ≈ 2.12132034
let ssv = sample_variance_vertical(&x).map(|v| v.sqrt());
for c in 0..3 {
assert!((ssv.get(0, c) - 2.1213203435596424).abs() < EPSILON);
}
// sample stddev horizontal: sqrt(1.0) = 1.0
let ssh = sample_variance_horizontal(&x).map(|v| v.sqrt());
assert!((ssh.get(0, 0) - 1.0).abs() < EPSILON);
assert!((ssh.get(1, 0) - 1.0).abs() < EPSILON);
}
#[test]

View File

@@ -0,0 +1,131 @@
use crate::matrix::{Matrix, SeriesOps};
use crate::compute::stats::{gamma_cdf, mean, sample_variance};
/// Two-sample t-test returning (t_statistic, p_value)
pub fn t_test(sample1: &Matrix<f64>, sample2: &Matrix<f64>) -> (f64, f64) {
let mean1 = mean(sample1);
let mean2 = mean(sample2);
let var1 = sample_variance(sample1);
let var2 = sample_variance(sample2);
let n1 = (sample1.rows() * sample1.cols()) as f64;
let n2 = (sample2.rows() * sample2.cols()) as f64;
let t_statistic = (mean1 - mean2) / ((var1 / n1 + var2 / n2).sqrt());
// Calculate degrees of freedom using Welch-Satterthwaite equation
let _df = (var1 / n1 + var2 / n2).powi(2)
/ ((var1 / n1).powi(2) / (n1 - 1.0) + (var2 / n2).powi(2) / (n2 - 1.0));
// Calculate p-value using t-distribution CDF (two-tailed)
let p_value = 0.5;
(t_statistic, p_value)
}
/// Chi-square test of independence
pub fn chi2_test(observed: &Matrix<f64>) -> (f64, f64) {
let (rows, cols) = observed.shape();
let row_sums: Vec<f64> = observed.sum_horizontal();
let col_sums: Vec<f64> = observed.sum_vertical();
let grand_total: f64 = observed.data().iter().sum();
let mut chi2_statistic: f64 = 0.0;
for i in 0..rows {
for j in 0..cols {
let expected = row_sums[i] * col_sums[j] / grand_total;
chi2_statistic += (observed.get(i, j) - expected).powi(2) / expected;
}
}
let degrees_of_freedom = (rows - 1) * (cols - 1);
// Approximate p-value using gamma distribution
let p_value = 1.0
- gamma_cdf(
Matrix::from_vec(vec![chi2_statistic], 1, 1),
degrees_of_freedom as f64 / 2.0,
1.0,
)
.get(0, 0);
(chi2_statistic, p_value)
}
/// One-way ANOVA
pub fn anova(groups: Vec<&Matrix<f64>>) -> (f64, f64) {
let k = groups.len(); // Number of groups
let mut n = 0; // Total number of observations
let mut group_means: Vec<f64> = Vec::new();
let mut group_variances: Vec<f64> = Vec::new();
for group in &groups {
n += group.rows() * group.cols();
group_means.push(mean(group));
group_variances.push(sample_variance(group));
}
let grand_mean: f64 = group_means.iter().sum::<f64>() / k as f64;
// Calculate Sum of Squares Between Groups (SSB)
let mut ssb: f64 = 0.0;
for i in 0..k {
ssb += (group_means[i] - grand_mean).powi(2) * (groups[i].rows() * groups[i].cols()) as f64;
}
// Calculate Sum of Squares Within Groups (SSW)
let mut ssw: f64 = 0.0;
for i in 0..k {
ssw += group_variances[i] * (groups[i].rows() * groups[i].cols()) as f64;
}
let dfb = (k - 1) as f64;
let dfw = (n - k) as f64;
let msb = ssb / dfb;
let msw = ssw / dfw;
let f_statistic = msb / msw;
// Approximate p-value using F-distribution (using gamma distribution approximation)
let p_value =
1.0 - gamma_cdf(Matrix::from_vec(vec![f_statistic], 1, 1), dfb / 2.0, 1.0).get(0, 0);
(f_statistic, p_value)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::matrix::Matrix;
const EPS: f64 = 1e-5;
#[test]
fn test_t_test() {
let sample1 = Matrix::from_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0], 1, 5);
let sample2 = Matrix::from_vec(vec![6.0, 7.0, 8.0, 9.0, 10.0], 1, 5);
let (t_statistic, p_value) = t_test(&sample1, &sample2);
assert!((t_statistic + 5.0).abs() < EPS);
assert!(p_value > 0.0 && p_value < 1.0);
}
#[test]
fn test_chi2_test() {
let observed = Matrix::from_vec(vec![12.0, 5.0, 8.0, 10.0], 2, 2);
let (chi2_statistic, p_value) = chi2_test(&observed);
assert!(chi2_statistic > 0.0);
assert!(p_value > 0.0 && p_value < 1.0);
}
#[test]
fn test_anova() {
let group1 = Matrix::from_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0], 1, 5);
let group2 = Matrix::from_vec(vec![2.0, 3.0, 4.0, 5.0, 6.0], 1, 5);
let group3 = Matrix::from_vec(vec![3.0, 4.0, 5.0, 6.0, 7.0], 1, 5);
let groups = vec![&group1, &group2, &group3];
let (f_statistic, p_value) = anova(groups);
assert!(f_statistic > 0.0);
assert!(p_value > 0.0 && p_value < 1.0);
}
}

View File

@@ -1,7 +1,9 @@
pub mod correlation;
pub mod descriptive;
pub mod distributions;
pub mod inferential;
pub use correlation::*;
pub use descriptive::*;
pub use distributions::*;
pub use inferential::*;