mirror of
https://github.com/Magnus167/rustframe.git
synced 2025-11-19 20:36:11 +00:00
Compare commits
33 Commits
b687fd4e6b
...
csv
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ef25e77f04 | ||
|
|
4ba5cfea18 | ||
|
|
23367c7ca3 | ||
|
|
df8c1d2a12 | ||
|
|
1381c77eaf | ||
| c56574f0f3 | |||
| c53693fa7b | |||
| 109d39b248 | |||
|
|
18ad6c689a | ||
| 1fead78b69 | |||
|
|
6fb32e743c | ||
| 2cb4e46217 | |||
|
|
a53ba63f30 | ||
|
|
dae60ea1bd | ||
|
|
755dee58e7 | ||
|
|
9e6e22fc37 | ||
| cd3aa84e60 | |||
| 27275e2479 | |||
| 9ef719316a | |||
| 960fd345c2 | |||
| 325e75419c | |||
| b1dc18d05b | |||
| 8cbb957764 | |||
| b937ed1cdf | |||
| 2e071a6974 | |||
| 689169bab2 | |||
| a45a5ecf4e | |||
| 84e1b423f4 | |||
| 197739bc2f | |||
| d2c2ebca0f | |||
| f5f3f2c100 | |||
| 9fcb1ea2cf | |||
|
|
623303cf72 |
@@ -1,11 +1,12 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "rustframe"
|
name = "rustframe"
|
||||||
authors = ["Palash Tyagi (https://github.com/Magnus167)"]
|
authors = ["Palash Tyagi (https://github.com/Magnus167)"]
|
||||||
version = "0.0.1-a.20250716"
|
version = "0.0.1-a.20250805"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
license = "GPL-3.0-or-later"
|
license = "GPL-3.0-or-later"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
description = "A simple dataframe library"
|
description = "A simple dataframe and math toolkit"
|
||||||
|
documentation = "https://magnus167.github.io/rustframe/"
|
||||||
|
|
||||||
[lib]
|
[lib]
|
||||||
name = "rustframe"
|
name = "rustframe"
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# rustframe
|
# rustframe
|
||||||
|
|
||||||
📚 [Docs](https://magnus167.github.io/rustframe/) | 🐙 [GitHub](https://github.com/Magnus167/rustframe) | 🦀 [Crates.io](https://crates.io/crates/rustframe) | 🔖 [docs.rs](https://docs.rs/rustframe/latest/rustframe/)
|
🐙 [GitHub](https://github.com/Magnus167/rustframe) | 📚 [Docs](https://magnus167.github.io/rustframe/) | 📖 [User Guide](https://magnus167.github.io/rustframe/user-guide/) | 🦀 [Crates.io](https://crates.io/crates/rustframe) | 🔖 [docs.rs](https://docs.rs/rustframe/latest/rustframe/)
|
||||||
|
|
||||||
<!-- [](https://github.com/Magnus167/rustframe) -->
|
<!-- [](https://github.com/Magnus167/rustframe) -->
|
||||||
|
|
||||||
|
|||||||
@@ -70,6 +70,77 @@ assert!((corr - 1.0).abs() < 1e-8);
|
|||||||
assert!((cov - 2.5).abs() < 1e-8);
|
assert!((cov - 2.5).abs() < 1e-8);
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Covariance
|
||||||
|
|
||||||
|
### `covariance`
|
||||||
|
|
||||||
|
Computes the population covariance between two equally sized matrices by flattening
|
||||||
|
their values.
|
||||||
|
|
||||||
|
```rust
|
||||||
|
# extern crate rustframe;
|
||||||
|
use rustframe::compute::stats::covariance;
|
||||||
|
use rustframe::matrix::Matrix;
|
||||||
|
|
||||||
|
let x = Matrix::from_vec(vec![1.0, 2.0, 3.0, 4.0], 2, 2);
|
||||||
|
let y = Matrix::from_vec(vec![2.0, 4.0, 6.0, 8.0], 2, 2);
|
||||||
|
let cov = covariance(&x, &y);
|
||||||
|
assert!((cov - 2.5).abs() < 1e-8);
|
||||||
|
```
|
||||||
|
|
||||||
|
### `covariance_vertical`
|
||||||
|
|
||||||
|
Evaluates covariance between columns (i.e. across rows) and returns a matrix of
|
||||||
|
column pair covariances.
|
||||||
|
|
||||||
|
```rust
|
||||||
|
# extern crate rustframe;
|
||||||
|
use rustframe::compute::stats::covariance_vertical;
|
||||||
|
use rustframe::matrix::Matrix;
|
||||||
|
|
||||||
|
let m = Matrix::from_rows_vec(vec![1.0, 2.0, 3.0, 4.0], 2, 2);
|
||||||
|
let cov = covariance_vertical(&m);
|
||||||
|
assert_eq!(cov.shape(), (2, 2));
|
||||||
|
assert!(cov.data().iter().all(|&v| (v - 1.0).abs() < 1e-8));
|
||||||
|
```
|
||||||
|
|
||||||
|
### `covariance_horizontal`
|
||||||
|
|
||||||
|
Computes covariance between rows (i.e. across columns) returning a matrix that
|
||||||
|
describes how each pair of rows varies together.
|
||||||
|
|
||||||
|
```rust
|
||||||
|
# extern crate rustframe;
|
||||||
|
use rustframe::compute::stats::covariance_horizontal;
|
||||||
|
use rustframe::matrix::Matrix;
|
||||||
|
|
||||||
|
let m = Matrix::from_rows_vec(vec![1.0, 2.0, 3.0, 4.0], 2, 2);
|
||||||
|
let cov = covariance_horizontal(&m);
|
||||||
|
assert_eq!(cov.shape(), (2, 2));
|
||||||
|
assert!(cov.data().iter().all(|&v| (v - 0.25).abs() < 1e-8));
|
||||||
|
```
|
||||||
|
|
||||||
|
### `covariance_matrix`
|
||||||
|
|
||||||
|
Builds a covariance matrix either between columns (`Axis::Col`) or rows
|
||||||
|
(`Axis::Row`). Each entry represents how two series co-vary.
|
||||||
|
|
||||||
|
```rust
|
||||||
|
# extern crate rustframe;
|
||||||
|
use rustframe::compute::stats::covariance_matrix;
|
||||||
|
use rustframe::matrix::{Axis, Matrix};
|
||||||
|
|
||||||
|
let data = Matrix::from_rows_vec(vec![1.0, 2.0, 3.0, 4.0], 2, 2);
|
||||||
|
|
||||||
|
// Covariance between columns
|
||||||
|
let cov_cols = covariance_matrix(&data, Axis::Col);
|
||||||
|
assert!((cov_cols.get(0, 0) - 2.0).abs() < 1e-8);
|
||||||
|
|
||||||
|
// Covariance between rows
|
||||||
|
let cov_rows = covariance_matrix(&data, Axis::Row);
|
||||||
|
assert!((cov_rows.get(0, 1) + 0.5).abs() < 1e-8);
|
||||||
|
```
|
||||||
|
|
||||||
## Distributions
|
## Distributions
|
||||||
|
|
||||||
Probability distribution helpers are available for common PDFs and CDFs.
|
Probability distribution helpers are available for common PDFs and CDFs.
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# Introduction
|
# Introduction
|
||||||
|
|
||||||
📚 [Docs](https://magnus167.github.io/rustframe/) | 🐙 [GitHub](https://github.com/Magnus167/rustframe) | 🦀 [Crates.io](https://crates.io/crates/rustframe) | 🔖 [docs.rs](https://docs.rs/rustframe/latest/rustframe/)
|
🐙 [GitHub](https://github.com/Magnus167/rustframe) | 📚 [Docs](https://magnus167.github.io/rustframe/) | 📖 [User Guide](https://magnus167.github.io/rustframe/user-guide/) | 🦀 [Crates.io](https://crates.io/crates/rustframe) | 🔖 [docs.rs](https://docs.rs/rustframe/latest/rustframe/)
|
||||||
|
|
||||||
Welcome to the **Rustframe User Guide**. Rustframe is a lightweight dataframe
|
Welcome to the **Rustframe User Guide**. Rustframe is a lightweight dataframe
|
||||||
and math toolkit for Rust written in 100% safe Rust. It focuses on keeping the
|
and math toolkit for Rust written in 100% safe Rust. It focuses on keeping the
|
||||||
|
|||||||
@@ -41,9 +41,6 @@ let new_point = Matrix::from_vec(vec![0.0, 0.0], 1, 2);
|
|||||||
let cluster = model.predict(&new_point)[0];
|
let cluster = model.predict(&new_point)[0];
|
||||||
```
|
```
|
||||||
|
|
||||||
For helper functions and upcoming modules, visit the
|
|
||||||
[utilities](./utilities.md) section.
|
|
||||||
|
|
||||||
## Logistic Regression
|
## Logistic Regression
|
||||||
|
|
||||||
```rust
|
```rust
|
||||||
@@ -72,7 +69,7 @@ let transformed = pca.transform(&data);
|
|||||||
assert_eq!(transformed.cols(), 1);
|
assert_eq!(transformed.cols(), 1);
|
||||||
```
|
```
|
||||||
|
|
||||||
### Gaussian Naive Bayes
|
## Gaussian Naive Bayes
|
||||||
|
|
||||||
Gaussian Naive Bayes classifier for continuous features:
|
Gaussian Naive Bayes classifier for continuous features:
|
||||||
|
|
||||||
@@ -101,7 +98,7 @@ let predictions = model.predict(&x);
|
|||||||
assert_eq!(predictions.rows(), 4);
|
assert_eq!(predictions.rows(), 4);
|
||||||
```
|
```
|
||||||
|
|
||||||
### Dense Neural Networks
|
## Dense Neural Networks
|
||||||
|
|
||||||
Simple fully connected neural network:
|
Simple fully connected neural network:
|
||||||
|
|
||||||
@@ -142,5 +139,144 @@ let predictions = model.predict(&x);
|
|||||||
assert_eq!(predictions.rows(), 4);
|
assert_eq!(predictions.rows(), 4);
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Real-world Examples
|
||||||
|
|
||||||
|
### Housing Price Prediction
|
||||||
|
|
||||||
|
```rust
|
||||||
|
# extern crate rustframe;
|
||||||
|
use rustframe::compute::models::linreg::LinReg;
|
||||||
|
use rustframe::matrix::Matrix;
|
||||||
|
|
||||||
|
// Features: square feet and bedrooms
|
||||||
|
let features = Matrix::from_rows_vec(vec![
|
||||||
|
2100.0, 3.0,
|
||||||
|
1600.0, 2.0,
|
||||||
|
2400.0, 4.0,
|
||||||
|
1400.0, 2.0,
|
||||||
|
], 4, 2);
|
||||||
|
|
||||||
|
// Sale prices
|
||||||
|
let target = Matrix::from_vec(vec![400_000.0, 330_000.0, 369_000.0, 232_000.0], 4, 1);
|
||||||
|
|
||||||
|
let mut model = LinReg::new(2);
|
||||||
|
model.fit(&features, &target, 1e-8, 10_000);
|
||||||
|
|
||||||
|
// Predict price of a new home
|
||||||
|
let new_home = Matrix::from_vec(vec![2000.0, 3.0], 1, 2);
|
||||||
|
let predicted_price = model.predict(&new_home);
|
||||||
|
println!("Predicted price: ${}", predicted_price.data()[0]);
|
||||||
|
```
|
||||||
|
|
||||||
|
### Spam Detection
|
||||||
|
|
||||||
|
```rust
|
||||||
|
# extern crate rustframe;
|
||||||
|
use rustframe::compute::models::logreg::LogReg;
|
||||||
|
use rustframe::matrix::Matrix;
|
||||||
|
|
||||||
|
// 20 e-mails × 5 features = 100 numbers (row-major, spam first)
|
||||||
|
let x = Matrix::from_rows_vec(
|
||||||
|
vec![
|
||||||
|
// ─────────── spam examples ───────────
|
||||||
|
2.0, 1.0, 1.0, 1.0, 1.0, // "You win a FREE offer - click for money-back bonus!"
|
||||||
|
1.0, 0.0, 1.0, 1.0, 0.0, // "FREE offer! Click now!"
|
||||||
|
0.0, 2.0, 0.0, 1.0, 1.0, // "Win win win - money inside, click…"
|
||||||
|
1.0, 1.0, 0.0, 0.0, 1.0, // "Limited offer to win easy money…"
|
||||||
|
1.0, 0.0, 1.0, 0.0, 1.0, // ...
|
||||||
|
0.0, 1.0, 1.0, 1.0, 0.0, // ...
|
||||||
|
2.0, 0.0, 0.0, 1.0, 1.0, // ...
|
||||||
|
0.0, 1.0, 1.0, 0.0, 1.0, // ...
|
||||||
|
1.0, 1.0, 1.0, 1.0, 0.0, // ...
|
||||||
|
1.0, 0.0, 0.0, 1.0, 1.0, // ...
|
||||||
|
// ─────────── ham examples ───────────
|
||||||
|
0.0, 0.0, 0.0, 0.0, 0.0, // "See you at the meeting tomorrow."
|
||||||
|
0.0, 0.0, 0.0, 1.0, 0.0, // "Here's the Zoom click-link."
|
||||||
|
0.0, 0.0, 0.0, 0.0, 1.0, // "Expense report: money attached."
|
||||||
|
0.0, 0.0, 0.0, 1.0, 1.0, // ...
|
||||||
|
0.0, 1.0, 0.0, 0.0, 0.0, // "Did we win the bid?"
|
||||||
|
0.0, 0.0, 0.0, 0.0, 0.0, // ...
|
||||||
|
0.0, 0.0, 0.0, 1.0, 0.0, // ...
|
||||||
|
1.0, 0.0, 0.0, 0.0, 0.0, // "Special offer for staff lunch."
|
||||||
|
0.0, 0.0, 0.0, 0.0, 0.0, // ...
|
||||||
|
0.0, 0.0, 0.0, 1.0, 0.0,
|
||||||
|
],
|
||||||
|
20,
|
||||||
|
5,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Labels: 1 = spam, 0 = ham
|
||||||
|
let y = Matrix::from_vec(
|
||||||
|
vec![
|
||||||
|
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, // 10 spam
|
||||||
|
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, // 10 ham
|
||||||
|
],
|
||||||
|
20,
|
||||||
|
1,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Train
|
||||||
|
let mut model = LogReg::new(5);
|
||||||
|
model.fit(&x, &y, 0.01, 5000);
|
||||||
|
|
||||||
|
// Predict
|
||||||
|
// e.g. "free money offer"
|
||||||
|
let email_data = vec![1.0, 0.0, 1.0, 0.0, 1.0];
|
||||||
|
let email = Matrix::from_vec(email_data, 1, 5);
|
||||||
|
let prob_spam = model.predict_proba(&email);
|
||||||
|
println!("Probability of spam: {:.4}", prob_spam.data()[0]);
|
||||||
|
```
|
||||||
|
|
||||||
|
### Iris Flower Classification
|
||||||
|
|
||||||
|
```rust
|
||||||
|
# extern crate rustframe;
|
||||||
|
use rustframe::compute::models::gaussian_nb::GaussianNB;
|
||||||
|
use rustframe::matrix::Matrix;
|
||||||
|
|
||||||
|
// Features: sepal length and petal length
|
||||||
|
let x = Matrix::from_rows_vec(vec![
|
||||||
|
5.1, 1.4, // setosa
|
||||||
|
4.9, 1.4, // setosa
|
||||||
|
6.2, 4.5, // versicolor
|
||||||
|
5.9, 5.1, // virginica
|
||||||
|
], 4, 2);
|
||||||
|
|
||||||
|
let y = Matrix::from_vec(vec![0.0, 0.0, 1.0, 2.0], 4, 1);
|
||||||
|
let names = vec!["setosa", "versicolor", "virginica"];
|
||||||
|
|
||||||
|
let mut model = GaussianNB::new(1e-9, true);
|
||||||
|
model.fit(&x, &y);
|
||||||
|
|
||||||
|
let sample = Matrix::from_vec(vec![5.0, 1.5], 1, 2);
|
||||||
|
let predicted_class = model.predict(&sample);
|
||||||
|
let class_name = names[predicted_class.data()[0] as usize];
|
||||||
|
println!("Predicted class: {} ({:?})", class_name, predicted_class.data()[0]);
|
||||||
|
```
|
||||||
|
|
||||||
|
### Customer Segmentation
|
||||||
|
|
||||||
|
```rust
|
||||||
|
# extern crate rustframe;
|
||||||
|
use rustframe::compute::models::k_means::KMeans;
|
||||||
|
use rustframe::matrix::Matrix;
|
||||||
|
|
||||||
|
// Each row: [age, annual_income]
|
||||||
|
let customers = Matrix::from_rows_vec(
|
||||||
|
vec![
|
||||||
|
25.0, 40_000.0, 34.0, 52_000.0, 58.0, 95_000.0, 45.0, 70_000.0,
|
||||||
|
],
|
||||||
|
4,
|
||||||
|
2,
|
||||||
|
);
|
||||||
|
|
||||||
|
let (model, labels) = KMeans::fit(&customers, 2, 20, 1e-4);
|
||||||
|
|
||||||
|
let new_customer = Matrix::from_vec(vec![30.0, 50_000.0], 1, 2);
|
||||||
|
let cluster = model.predict(&new_customer)[0];
|
||||||
|
println!("New customer belongs to cluster: {}", cluster);
|
||||||
|
println!("Cluster labels: {:?}", labels);
|
||||||
|
```
|
||||||
|
|
||||||
For helper functions and upcoming modules, visit the
|
For helper functions and upcoming modules, visit the
|
||||||
[utilities](./utilities.md) section.
|
[utilities](./utilities.md) section.
|
||||||
|
|||||||
411
src/csv/csv_core.rs
Normal file
411
src/csv/csv_core.rs
Normal file
@@ -0,0 +1,411 @@
|
|||||||
|
use chrono::{NaiveDate, NaiveDateTime};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::{self, BufRead, BufReader};
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
/// Target type that the text of a CSV column should be parsed into.
///
/// Each variant names the [`Value`] variant produced when a field parses
/// successfully. The enum is field-less, so it is cheap to copy and can be
/// compared or used as a hash key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DataType {
    /// Signed 64-bit integer ([`Value::Int`]).
    Int,
    /// 64-bit floating point number ([`Value::Float`]).
    Float,
    /// Boolean ([`Value::Bool`]).
    Bool,
    /// Unsigned 64-bit integer ([`Value::UInt`]).
    UInt,
    /// Raw text, kept as-is ([`Value::String`]).
    String,
    /// Calendar date without a time zone ([`Value::Date`]).
    Date,
    /// Date and time without a time zone ([`Value::DateTime`]).
    DateTime,
}
|
||||||
|
|
||||||
|
/// Represents a value parsed from the CSV.
|
||||||
|
#[derive(Debug, Clone, PartialEq)]
|
||||||
|
pub enum Value {
|
||||||
|
Int(i64),
|
||||||
|
Float(f64),
|
||||||
|
Bool(bool),
|
||||||
|
UInt(u64),
|
||||||
|
String(String),
|
||||||
|
Date(NaiveDate),
|
||||||
|
DateTime(NaiveDateTime),
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convenience alias for a parsed CSV record.
|
||||||
|
pub type Record = HashMap<String, Value>;
|
||||||
|
|
||||||
|
/// A simple CSV reader that reads records line by line.
|
||||||
|
pub struct CsvReader<R: BufRead> {
|
||||||
|
reader: R,
|
||||||
|
separators: Vec<char>,
|
||||||
|
headers: Vec<String>,
|
||||||
|
types: Option<HashMap<String, DataType>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Builder for [`CsvReader`] allowing chained configuration of headers, types, and separators.
|
||||||
|
pub struct CsvReaderBuilder<R: BufRead> {
|
||||||
|
reader: R,
|
||||||
|
separators: Vec<char>,
|
||||||
|
headers: Vec<String>,
|
||||||
|
types: Option<HashMap<String, DataType>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<R: BufRead> CsvReader<R> {
|
||||||
|
/// Create a new CSV reader from a [`BufRead`] source.
|
||||||
|
/// The first line is expected to contain headers.
|
||||||
|
/// `separators` is a list of characters considered as field separators.
|
||||||
|
/// `types` optionally maps column names to target data types.
|
||||||
|
pub fn new(
|
||||||
|
mut reader: R,
|
||||||
|
separators: Vec<char>,
|
||||||
|
types: Option<HashMap<String, DataType>>,
|
||||||
|
) -> io::Result<Self> {
|
||||||
|
let mut first_line = String::new();
|
||||||
|
reader.read_line(&mut first_line)?;
|
||||||
|
let headers = parse_line(&first_line, &separators);
|
||||||
|
Ok(Self {
|
||||||
|
reader,
|
||||||
|
separators,
|
||||||
|
headers,
|
||||||
|
types,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a reader with default settings (comma separator, automatic typing).
|
||||||
|
pub fn new_default(reader: R) -> io::Result<Self> {
|
||||||
|
Self::new(reader, vec![','], None)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a reader with default separators and explicit type mapping.
|
||||||
|
pub fn new_with_types(reader: R, types: HashMap<String, DataType>) -> io::Result<Self> {
|
||||||
|
Self::new(reader, vec![','], Some(types))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Start building a reader from a source that lacks headers.
|
||||||
|
pub fn new_with_headers(reader: R, headers: Vec<String>) -> CsvReaderBuilder<R> {
|
||||||
|
CsvReaderBuilder {
|
||||||
|
reader,
|
||||||
|
separators: vec![','],
|
||||||
|
headers,
|
||||||
|
types: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return the headers of the CSV file.
|
||||||
|
pub fn headers(&self) -> &[String] {
|
||||||
|
&self.headers
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read the next record. Returns `Ok(None)` on EOF.
|
||||||
|
pub fn read_record(&mut self) -> io::Result<Option<Record>> {
|
||||||
|
let mut line = String::new();
|
||||||
|
if self.reader.read_line(&mut line)? == 0 {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
let fields = parse_line(&line, &self.separators);
|
||||||
|
let mut record = HashMap::new();
|
||||||
|
|
||||||
|
for (i, header) in self.headers.iter().enumerate() {
|
||||||
|
let field = fields.get(i).cloned().unwrap_or_default();
|
||||||
|
let value = match &self.types {
|
||||||
|
Some(map) => {
|
||||||
|
if let Some(dt) = map.get(header) {
|
||||||
|
parse_with_type(&field, dt)
|
||||||
|
} else {
|
||||||
|
Value::String(field)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None => parse_auto(&field),
|
||||||
|
};
|
||||||
|
record.insert(header.clone(), value);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Some(record))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<R: BufRead> Iterator for CsvReader<R> {
|
||||||
|
type Item = io::Result<Record>;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
match self.read_record() {
|
||||||
|
Ok(Some(rec)) => Some(Ok(rec)),
|
||||||
|
Ok(None) => None,
|
||||||
|
Err(e) => Some(Err(e)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<R: BufRead> CsvReaderBuilder<R> {
|
||||||
|
/// Override field separators for the upcoming reader.
|
||||||
|
pub fn separators(mut self, separators: Vec<char>) -> Self {
|
||||||
|
self.separators = separators;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Finalize the builder with an explicit type mapping.
|
||||||
|
pub fn new_with_types(mut self, types: HashMap<String, DataType>) -> CsvReader<R> {
|
||||||
|
self.types = Some(types);
|
||||||
|
self.build()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Finalize the builder without specifying types.
|
||||||
|
pub fn build(self) -> CsvReader<R> {
|
||||||
|
CsvReader {
|
||||||
|
reader: self.reader,
|
||||||
|
separators: self.separators,
|
||||||
|
headers: self.headers,
|
||||||
|
types: self.types,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<R: BufRead> CsvReader<R> {
|
||||||
|
/// Read all remaining records into a vector.
|
||||||
|
pub fn read_all(&mut self) -> io::Result<Vec<Record>> {
|
||||||
|
let mut records = Vec::new();
|
||||||
|
while let Some(rec) = self.read_record()? {
|
||||||
|
records.push(rec);
|
||||||
|
}
|
||||||
|
Ok(records)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CsvReader<BufReader<File>> {
|
||||||
|
/// Create a [`CsvReader`] from a file path using comma separators and
|
||||||
|
/// automatic type detection.
|
||||||
|
///
|
||||||
|
/// # Examples
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// use rustframe::csv::{CsvReader, Value};
|
||||||
|
/// # let path = std::env::temp_dir().join("from_path_auto.csv");
|
||||||
|
/// # std::fs::write(&path, "a,b\n1,true\n").unwrap();
|
||||||
|
/// let mut reader = CsvReader::from_path_auto(&path).unwrap();
|
||||||
|
/// let rec = reader.next().unwrap().unwrap();
|
||||||
|
/// assert_eq!(rec.get("a"), Some(&Value::Int(1)));
|
||||||
|
/// assert_eq!(rec.get("b"), Some(&Value::Bool(true)));
|
||||||
|
/// # std::fs::remove_file(path).unwrap();
|
||||||
|
/// ```
|
||||||
|
pub fn from_path_auto<P: AsRef<Path>>(path: P) -> io::Result<Self> {
|
||||||
|
let file = File::open(path)?;
|
||||||
|
let reader = BufReader::new(file);
|
||||||
|
CsvReader::new_default(reader)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create an iterator over records from a file path using default settings.
|
||||||
|
pub fn reader<P: AsRef<Path>>(path: P) -> io::Result<CsvReader<BufReader<File>>> {
|
||||||
|
reader_with(path, vec![','], None)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create an iterator over records from a file path with custom separators and type mapping.
|
||||||
|
pub fn reader_with<P: AsRef<Path>>(
|
||||||
|
path: P,
|
||||||
|
separators: Vec<char>,
|
||||||
|
types: Option<HashMap<String, DataType>>,
|
||||||
|
) -> io::Result<CsvReader<BufReader<File>>> {
|
||||||
|
let file = File::open(path)?;
|
||||||
|
let reader = BufReader::new(file);
|
||||||
|
CsvReader::new(reader, separators, types)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read an entire CSV file into memory using default settings.
|
||||||
|
pub fn read_file<P: AsRef<Path>>(path: P) -> io::Result<Vec<Record>> {
|
||||||
|
read_file_with(path, vec![','], None)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read an entire CSV file into memory with custom separators and type mapping.
|
||||||
|
pub fn read_file_with<P: AsRef<Path>>(
|
||||||
|
path: P,
|
||||||
|
separators: Vec<char>,
|
||||||
|
types: Option<HashMap<String, DataType>>,
|
||||||
|
) -> io::Result<Vec<Record>> {
|
||||||
|
let mut reader = reader_with(path, separators, types)?;
|
||||||
|
reader.read_all()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_with_type(s: &str, ty: &DataType) -> Value {
|
||||||
|
match ty {
|
||||||
|
DataType::Int => s
|
||||||
|
.parse::<i64>()
|
||||||
|
.map(Value::Int)
|
||||||
|
.unwrap_or_else(|_| Value::String(s.to_string())),
|
||||||
|
DataType::Float => s
|
||||||
|
.parse::<f64>()
|
||||||
|
.map(Value::Float)
|
||||||
|
.unwrap_or_else(|_| Value::String(s.to_string())),
|
||||||
|
DataType::Bool => s
|
||||||
|
.parse::<bool>()
|
||||||
|
.map(Value::Bool)
|
||||||
|
.unwrap_or_else(|_| Value::String(s.to_string())),
|
||||||
|
DataType::UInt => s
|
||||||
|
.parse::<u64>()
|
||||||
|
.map(Value::UInt)
|
||||||
|
.unwrap_or_else(|_| Value::String(s.to_string())),
|
||||||
|
DataType::String => Value::String(s.to_string()),
|
||||||
|
DataType::Date => s
|
||||||
|
.parse::<NaiveDate>()
|
||||||
|
.map(Value::Date)
|
||||||
|
.unwrap_or_else(|_| Value::String(s.to_string())),
|
||||||
|
DataType::DateTime => NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S")
|
||||||
|
.map(Value::DateTime)
|
||||||
|
.unwrap_or_else(|_| Value::String(s.to_string())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_auto(s: &str) -> Value {
|
||||||
|
if let Ok(i) = s.parse::<i64>() {
|
||||||
|
Value::Int(i)
|
||||||
|
} else if let Ok(f) = s.parse::<f64>() {
|
||||||
|
Value::Float(f)
|
||||||
|
} else if let Ok(b) = s.parse::<bool>() {
|
||||||
|
Value::Bool(b)
|
||||||
|
} else if let Ok(dt) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S") {
|
||||||
|
Value::DateTime(dt)
|
||||||
|
} else if let Ok(d) = NaiveDate::parse_from_str(s, "%Y-%m-%d") {
|
||||||
|
Value::Date(d)
|
||||||
|
} else {
|
||||||
|
Value::String(s.to_string())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Split one line of delimited text into its fields.
///
/// * Any character in `separators` ends the current field.
/// * A field may be wrapped in `"` or `'`. Inside the quotes, separators are
///   kept as data and a doubled quote character stands for one literal quote.
/// * A quote character only opens a quoted section when nothing but
///   whitespace has been collected for the field so far. A quote or
///   apostrophe in the middle of a field (`O'Brien`) is ordinary data, so it
///   cannot swallow the separators that follow it.
/// * Outside quotes, `\r` is dropped and `\n` ends the line.
///
/// The result always holds at least one field; an empty line yields a single
/// empty string.
fn parse_line(line: &str, separators: &[char]) -> Vec<String> {
    let mut fields = Vec::new();
    let mut current = String::new();
    let mut in_quotes: Option<char> = None;
    let mut chars = line.chars().peekable();

    while let Some(c) = chars.next() {
        match in_quotes {
            Some(q) if c == q => {
                if chars.peek() == Some(&q) {
                    // Doubled quote: emit one literal quote and skip its twin.
                    current.push(q);
                    chars.next();
                } else {
                    in_quotes = None;
                }
            }
            Some(_) => current.push(c),
            None if c == '"' || c == '\'' => {
                if current.trim().is_empty() {
                    in_quotes = Some(c);
                } else {
                    // Mid-field quote or apostrophe: keep it as data.
                    current.push(c);
                }
            }
            // Hand the finished field over without cloning it.
            None if separators.contains(&c) => fields.push(std::mem::take(&mut current)),
            None if c == '\r' => {}
            None if c == '\n' => break,
            None => current.push(c),
        }
    }

    fields.push(current);
    fields
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::{NaiveDate, NaiveDateTime};
    use std::io::Cursor;

    /// Build an owned column-type map from `(name, type)` pairs.
    fn type_map(pairs: &[(&str, DataType)]) -> HashMap<String, DataType> {
        pairs
            .iter()
            .map(|(name, ty)| (name.to_string(), ty.clone()))
            .collect()
    }

    /// Pull the next record out of a reader, panicking on EOF or I/O error.
    fn next_record<R: BufRead>(reader: &mut CsvReader<R>) -> Record {
        reader.next().unwrap().unwrap()
    }

    #[test]
    fn test_parse_line() {
        let parsed = parse_line("a,'b,c',\"d\"\"e\",f", &[',']);
        assert_eq!(parsed, vec!["a", "b,c", "d\"e", "f"]);
    }

    #[test]
    fn test_reader_auto() {
        let source = Cursor::new("a,b,c\n1,2.5,true\n4,5.0,false\n");
        let mut reader = CsvReader::new_default(source).unwrap();
        let rec = next_record(&mut reader);
        assert_eq!(rec.get("a"), Some(&Value::Int(1)));
        assert_eq!(rec.get("b"), Some(&Value::Float(2.5)));
        assert_eq!(rec.get("c"), Some(&Value::Bool(true)));
    }

    #[test]
    fn test_reader_with_types() {
        let types = type_map(&[
            ("a", DataType::Int),
            ("b", DataType::Int),
            ("c", DataType::String),
        ]);
        let mut reader = CsvReader::new_with_types(Cursor::new("a,b,c\n1,2,3\n"), types).unwrap();
        let rec = next_record(&mut reader);
        assert_eq!(rec.get("a"), Some(&Value::Int(1)));
        assert_eq!(rec.get("b"), Some(&Value::Int(2)));
        assert_eq!(rec.get("c"), Some(&Value::String("3".to_string())));
    }

    #[test]
    fn test_chain_headers_and_types() {
        let headers = vec!["x".to_string(), "y".to_string()];
        let types = type_map(&[("x", DataType::Int), ("y", DataType::UInt)]);
        let mut reader =
            CsvReader::new_with_headers(Cursor::new("1,2\n3,4\n"), headers).new_with_types(types);
        let rec = next_record(&mut reader);
        assert_eq!(rec.get("x"), Some(&Value::Int(1)));
        assert_eq!(rec.get("y"), Some(&Value::UInt(2)));
    }

    #[test]
    fn test_date_types() {
        let types = type_map(&[("d", DataType::Date), ("dt", DataType::DateTime)]);
        let source = Cursor::new("d,dt\n2024-01-01,2024-01-01 12:00:00\n");
        let mut reader = CsvReader::new_with_types(source, types).unwrap();
        let rec = next_record(&mut reader);

        let date = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap();
        let datetime: NaiveDateTime = date.and_hms_opt(12, 0, 0).unwrap();
        assert_eq!(rec.get("d"), Some(&Value::Date(date)));
        assert_eq!(rec.get("dt"), Some(&Value::DateTime(datetime)));
    }

    #[test]
    fn test_read_file_all() {
        let path = std::env::temp_dir().join("csv_full_test.csv");
        std::fs::write(&path, "a,b\n1,2\n3,4\n").unwrap();
        let records = read_file(&path).unwrap();
        assert_eq!(records.len(), 2);
        assert_eq!(records[1].get("b"), Some(&Value::Int(4)));
        std::fs::remove_file(path).unwrap();
    }

    #[test]
    fn test_reader_from_path() {
        let path = std::env::temp_dir().join("csv_iter_test.csv");
        std::fs::write(&path, "a,b\n5,6\n").unwrap();
        let mut iter = reader(&path).unwrap();
        let rec = next_record(&mut iter);
        assert_eq!(rec.get("a"), Some(&Value::Int(5)));
        assert_eq!(rec.get("b"), Some(&Value::Int(6)));
        std::fs::remove_file(path).unwrap();
    }

    #[test]
    fn test_from_path_auto_method() {
        let path = std::env::temp_dir().join("csv_method_auto.csv");
        std::fs::write(&path, "a,b\n7,true\n").unwrap();
        let mut reader = CsvReader::from_path_auto(&path).unwrap();
        let rec = next_record(&mut reader);
        assert_eq!(rec.get("a"), Some(&Value::Int(7)));
        assert_eq!(rec.get("b"), Some(&Value::Bool(true)));
        std::fs::remove_file(path).unwrap();
    }
}
|
||||||
69
src/csv/mod.rs
Normal file
69
src/csv/mod.rs
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
//! CSV handling utilities.
|
||||||
|
//!
|
||||||
|
//! The [`crate::csv`] module offers a flexible [`CsvReader`] with automatic type
|
||||||
|
//! detection and optional builders for custom headers and types.
|
||||||
|
//!
|
||||||
|
//! # Examples
|
||||||
|
//!
|
||||||
|
//! Read from a file with auto type detection:
|
||||||
|
//!
|
||||||
|
//! ```
|
||||||
|
//! use rustframe::csv::CsvReader;
|
||||||
|
//! # let path = std::env::temp_dir().join("docs_auto.csv");
|
||||||
|
//! # std::fs::write(&path, "a,b\n1,true\n").unwrap();
|
||||||
|
//! let mut reader = CsvReader::from_path_auto(&path).unwrap();
|
||||||
|
//! for rec in reader {
|
||||||
|
//! let rec = rec.unwrap();
|
||||||
|
//! println!("{:?}", rec);
|
||||||
|
//! }
|
||||||
|
//! # std::fs::remove_file(path).unwrap();
|
||||||
|
//! ```
|
||||||
|
//!
|
||||||
|
//! Specify column types explicitly:
|
||||||
|
//!
|
||||||
|
//! ```
|
||||||
|
//! use rustframe::csv::{CsvReader, DataType, Value};
|
||||||
|
//! use std::collections::HashMap;
|
||||||
|
//! use std::io::Cursor;
|
||||||
|
//! let data = "a,b\n1,2\n";
|
||||||
|
//! let mut types = HashMap::new();
|
||||||
|
//! types.insert("a".into(), DataType::Int);
|
||||||
|
//! types.insert("b".into(), DataType::Float);
|
||||||
|
//! let mut reader = CsvReader::new_with_types(Cursor::new(data), types).unwrap();
|
||||||
|
//! let rec = reader.next().unwrap().unwrap();
|
||||||
|
//! assert_eq!(rec.get("b"), Some(&Value::Float(2.0)));
|
||||||
|
//! ```
|
||||||
|
//!
|
||||||
|
//! Building from custom headers and types:
|
||||||
|
//!
|
||||||
|
//! ```
|
||||||
|
//! use rustframe::csv::{CsvReader, DataType, Value};
|
||||||
|
//! use std::collections::HashMap;
|
||||||
|
//! use std::io::Cursor;
|
||||||
|
//! let data = "1,2\n";
|
||||||
|
//! let headers = vec!["x".to_string(), "y".to_string()];
|
||||||
|
//! let mut types = HashMap::new();
|
||||||
|
//! types.insert("x".into(), DataType::Int);
|
||||||
|
//! types.insert("y".into(), DataType::UInt);
|
||||||
|
//! let mut reader = CsvReader::new_with_headers(Cursor::new(data), headers).new_with_types(types);
|
||||||
|
//! let rec = reader.next().unwrap().unwrap();
|
||||||
|
//! assert_eq!(rec.get("y"), Some(&Value::UInt(2)));
|
||||||
|
//! ```
|
||||||
|
//!
|
||||||
|
//! Reading an entire file into memory:
|
||||||
|
//!
|
||||||
|
//! ```
|
||||||
|
//! use rustframe::csv::read_file;
|
||||||
|
//! # let path = std::env::temp_dir().join("docs_full.csv");
|
||||||
|
//! # std::fs::write(&path, "a,b\n1,2\n3,4\n").unwrap();
|
||||||
|
//! let records = read_file(&path).unwrap();
|
||||||
|
//! assert_eq!(records.len(), 2);
|
||||||
|
//! # std::fs::remove_file(path).unwrap();
|
||||||
|
//! ```
|
||||||
|
|
||||||
|
pub mod csv_core;
|
||||||
|
|
||||||
|
pub use csv_core::{
|
||||||
|
CsvReader, CsvReaderBuilder, DataType, Record, Value, reader, reader_with,
|
||||||
|
read_file, read_file_with,
|
||||||
|
};
|
||||||
@@ -14,3 +14,6 @@ pub mod compute;
|
|||||||
|
|
||||||
/// Documentation for the [`crate::random`] module.
|
/// Documentation for the [`crate::random`] module.
|
||||||
pub mod random;
|
pub mod random;
|
||||||
|
|
||||||
|
/// Documentation for the [`crate::csv`] module.
|
||||||
|
pub mod csv;
|
||||||
|
|||||||
Reference in New Issue
Block a user