mirror of
https://github.com/Magnus167/rustframe.git
synced 2025-11-19 17:36:11 +00:00
Compare commits
28 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 811c153eaf | |||
| 39a95e63d9 | |||
| 1de8ba4f2d | |||
| 74bec4b69e | |||
| 58b38311b5 | |||
| 4ed23069fc | |||
|
|
7d7794627b | ||
| d9bdf8ee96 | |||
| a61ff8a4e1 | |||
|
|
26ee580710 | ||
|
|
96934cd89f | ||
|
|
27ab1ac129 | ||
|
|
eb4fefe363 | ||
|
|
60cc97e702 | ||
|
|
7e2a5ec18d | ||
|
|
4038d25b07 | ||
|
|
aa15248b58 | ||
|
|
fa392ec631 | ||
|
|
8b6f16236a | ||
|
|
58acea8467 | ||
|
|
2607d9c3b0 | ||
|
|
57ed06f79b | ||
|
|
01a132264f | ||
|
|
ff4535c56b | ||
| 9b480e8130 | |||
|
|
fe666a4ddb | ||
|
|
b80d5ab381 | ||
|
|
49f7558225 |
127
README.md
127
README.md
@@ -153,6 +153,133 @@ let zipped_matrix = a.zip(&b, |x, y| x + y);
|
|||||||
assert_eq!(zipped_matrix.data(), &[6.0, 8.0, 10.0, 12.0]);
|
assert_eq!(zipped_matrix.data(), &[6.0, 8.0, 10.0, 12.0]);
|
||||||
```
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## DataFrame Usage Example
|
||||||
|
|
||||||
|
```rust
|
||||||
|
use chrono::NaiveDate;
|
||||||
|
use rustframe::dataframe::DataFrame;
|
||||||
|
use rustframe::utils::{BDateFreq, BDatesList};
|
||||||
|
use std::any::TypeId;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
// Helper for NaiveDate
|
||||||
|
fn d(y: i32, m: u32, d: u32) -> NaiveDate {
|
||||||
|
NaiveDate::from_ymd_opt(y, m, d).unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a new DataFrame
|
||||||
|
let mut df = DataFrame::new();
|
||||||
|
|
||||||
|
// Add columns of different types
|
||||||
|
df.add_column("col_int1", vec![1, 2, 3, 4, 5]);
|
||||||
|
df.add_column("col_float1", vec![1.1, 2.2, 3.3, 4.4, 5.5]);
|
||||||
|
df.add_column(
|
||||||
|
"col_string",
|
||||||
|
vec![
|
||||||
|
"apple".to_string(),
|
||||||
|
"banana".to_string(),
|
||||||
|
"cherry".to_string(),
|
||||||
|
"date".to_string(),
|
||||||
|
"elderberry".to_string(),
|
||||||
|
],
|
||||||
|
);
|
||||||
|
df.add_column("col_bool", vec![true, false, true, false, true]);
|
||||||
|
// df.add_column("col_date", vec![d(2023,1,1), d(2023,1,2), d(2023,1,3), d(2023,1,4), d(2023,1,5)]);
|
||||||
|
df.add_column(
|
||||||
|
"col_date",
|
||||||
|
BDatesList::from_n_periods("2023-01-01".to_string(), BDateFreq::Daily, 5)
|
||||||
|
.unwrap()
|
||||||
|
.list()
|
||||||
|
.unwrap(),
|
||||||
|
);
|
||||||
|
|
||||||
|
println!("DataFrame after initial column additions:\n{}", df);
|
||||||
|
|
||||||
|
// Demonstrate frame re-use when adding columns of existing types
|
||||||
|
let initial_frames_count = df.num_internal_frames();
|
||||||
|
println!(
|
||||||
|
"\nInitial number of internal frames: {}",
|
||||||
|
initial_frames_count
|
||||||
|
);
|
||||||
|
|
||||||
|
df.add_column("col_int2", vec![6, 7, 8, 9, 10]);
|
||||||
|
df.add_column("col_float2", vec![6.6, 7.7, 8.8, 9.9, 10.0]);
|
||||||
|
|
||||||
|
let frames_after_reuse = df.num_internal_frames();
|
||||||
|
println!(
|
||||||
|
"Number of internal frames after adding more columns of existing types: {}",
|
||||||
|
frames_after_reuse
|
||||||
|
);
|
||||||
|
assert_eq!(initial_frames_count, frames_after_reuse); // Should be equal, demonstrating re-use
|
||||||
|
|
||||||
|
println!(
|
||||||
|
"\nDataFrame after adding more columns of existing types:\n{}",
|
||||||
|
df
|
||||||
|
);
|
||||||
|
|
||||||
|
// Get number of rows and columns
|
||||||
|
println!("Rows: {}", df.rows()); // Output: Rows: 5
|
||||||
|
println!("Columns: {}", df.cols()); // Output: Columns: 7
|
||||||
|
|
||||||
|
// Get column names
|
||||||
|
println!("Column names: {:?}", df.get_column_names());
|
||||||
|
// Output: Column names: ["col_int1", "col_float1", "col_string", "col_bool", "col_date", "col_int2", "col_float2"]
|
||||||
|
|
||||||
|
// Get a specific column by name and type
|
||||||
|
let int_col = df.get_column::<i32>("col_int1").unwrap();
|
||||||
|
// Output: Integer column: [1, 2, 3, 4, 5]
|
||||||
|
println!("Integer column (col_int1): {:?}", int_col);
|
||||||
|
|
||||||
|
let int_col2 = df.get_column::<i32>("col_int2").unwrap();
|
||||||
|
// Output: Integer column: [6, 7, 8, 9, 10]
|
||||||
|
println!("Integer column (col_int2): {:?}", int_col2);
|
||||||
|
|
||||||
|
let float_col = df.get_column::<f64>("col_float1").unwrap();
|
||||||
|
// Output: Float column: [1.1, 2.2, 3.3, 4.4, 5.5]
|
||||||
|
println!("Float column (col_float1): {:?}", float_col);
|
||||||
|
|
||||||
|
// Attempt to get a column with incorrect type (returns None)
|
||||||
|
let wrong_type_col = df.get_column::<bool>("col_int1");
|
||||||
|
// Output: Wrong type column: None
|
||||||
|
println!("Wrong type column: {:?}", wrong_type_col);
|
||||||
|
|
||||||
|
// Get a row by index
|
||||||
|
let row_0 = df.get_row(0).unwrap();
|
||||||
|
println!("Row 0: {:?}", row_0);
|
||||||
|
// Output: Row 0: {"col_int1": "1", "col_float1": "1.1", "col_string": "apple", "col_bool": "true", "col_date": "2023-01-01", "col_int2": "6", "col_float2": "6.6"}
|
||||||
|
|
||||||
|
let row_2 = df.get_row(2).unwrap();
|
||||||
|
println!("Row 2: {:?}", row_2);
|
||||||
|
// Output: Row 2: {"col_int1": "3", "col_float1": "3.3", "col_string": "cherry", "col_bool": "true", "col_date": "2023-01-03", "col_int2": "8", "col_float2": "8.8"}
|
||||||
|
|
||||||
|
// Attempt to get an out-of-bounds row (returns None)
|
||||||
|
let row_out_of_bounds = df.get_row(10);
|
||||||
|
// Output: Row out of bounds: None
|
||||||
|
println!("Row out of bounds: {:?}", row_out_of_bounds);
|
||||||
|
|
||||||
|
// Drop a column
|
||||||
|
df.drop_column("col_bool");
|
||||||
|
println!("\nDataFrame after dropping 'col_bool':\n{}", df);
|
||||||
|
|
||||||
|
println!("Columns after drop: {}", df.cols());
|
||||||
|
println!("Column names after drop: {:?}", df.get_column_names());
|
||||||
|
|
||||||
|
// Drop another column, ensuring the underlying Frame is removed if empty
|
||||||
|
df.drop_column("col_float1");
|
||||||
|
println!("\nDataFrame after dropping 'col_float1':\n{}", df);
|
||||||
|
|
||||||
|
println!("Columns after second drop: {}", df.cols());
|
||||||
|
println!(
|
||||||
|
"Column names after second drop: {:?}",
|
||||||
|
df.get_column_names()
|
||||||
|
);
|
||||||
|
|
||||||
|
// Attempt to drop a non-existent column (will panic)
|
||||||
|
// df.drop_column("non_existent_col"); // Uncomment to see panic
|
||||||
|
```
|
||||||
|
|
||||||
## More examples
|
## More examples
|
||||||
|
|
||||||
See the [examples](./examples/) directory for some demonstrations of Rustframe's syntax and functionality.
|
See the [examples](./examples/) directory for some demonstrations of Rustframe's syntax and functionality.
|
||||||
|
|||||||
@@ -1,411 +0,0 @@
|
|||||||
use chrono::{NaiveDate, NaiveDateTime};
|
|
||||||
use std::collections::HashMap;
|
|
||||||
use std::fs::File;
|
|
||||||
use std::io::{self, BufRead, BufReader};
|
|
||||||
use std::path::Path;
|
|
||||||
|
|
||||||
/// Represents the target type for a CSV column.
///
/// Used as the value type of the column-name → type map handed to
/// `CsvReader`; each variant selects how `parse_with_type` interprets the raw
/// field text.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DataType {
    /// Signed integer, parsed as `i64`.
    Int,
    /// Floating point number, parsed as `f64`.
    Float,
    /// Boolean, parsed with `str::parse::<bool>` (`true` / `false`).
    Bool,
    /// Unsigned integer, parsed as `u64`.
    UInt,
    /// Raw text; the field is kept unchanged.
    String,
    /// Calendar date, parsed as a `chrono::NaiveDate`.
    Date,
    /// Date and time, parsed with the `%Y-%m-%d %H:%M:%S` format.
    DateTime,
}
|
|
||||||
|
|
||||||
/// Represents a value parsed from the CSV.
|
|
||||||
#[derive(Debug, Clone, PartialEq)]
|
|
||||||
pub enum Value {
|
|
||||||
Int(i64),
|
|
||||||
Float(f64),
|
|
||||||
Bool(bool),
|
|
||||||
UInt(u64),
|
|
||||||
String(String),
|
|
||||||
Date(NaiveDate),
|
|
||||||
DateTime(NaiveDateTime),
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Convenience alias for a parsed CSV record.
|
|
||||||
pub type Record = HashMap<String, Value>;
|
|
||||||
|
|
||||||
/// A simple CSV reader that reads records line by line.
|
|
||||||
pub struct CsvReader<R: BufRead> {
|
|
||||||
reader: R,
|
|
||||||
separators: Vec<char>,
|
|
||||||
headers: Vec<String>,
|
|
||||||
types: Option<HashMap<String, DataType>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Builder for [`CsvReader`] allowing chained configuration of headers, types, and separators.
|
|
||||||
pub struct CsvReaderBuilder<R: BufRead> {
|
|
||||||
reader: R,
|
|
||||||
separators: Vec<char>,
|
|
||||||
headers: Vec<String>,
|
|
||||||
types: Option<HashMap<String, DataType>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<R: BufRead> CsvReader<R> {
|
|
||||||
/// Create a new CSV reader from a [`BufRead`] source.
|
|
||||||
/// The first line is expected to contain headers.
|
|
||||||
/// `separators` is a list of characters considered as field separators.
|
|
||||||
/// `types` optionally maps column names to target data types.
|
|
||||||
pub fn new(
|
|
||||||
mut reader: R,
|
|
||||||
separators: Vec<char>,
|
|
||||||
types: Option<HashMap<String, DataType>>,
|
|
||||||
) -> io::Result<Self> {
|
|
||||||
let mut first_line = String::new();
|
|
||||||
reader.read_line(&mut first_line)?;
|
|
||||||
let headers = parse_line(&first_line, &separators);
|
|
||||||
Ok(Self {
|
|
||||||
reader,
|
|
||||||
separators,
|
|
||||||
headers,
|
|
||||||
types,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a reader with default settings (comma separator, automatic typing).
|
|
||||||
pub fn new_default(reader: R) -> io::Result<Self> {
|
|
||||||
Self::new(reader, vec![','], None)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a reader with default separators and explicit type mapping.
|
|
||||||
pub fn new_with_types(reader: R, types: HashMap<String, DataType>) -> io::Result<Self> {
|
|
||||||
Self::new(reader, vec![','], Some(types))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Start building a reader from a source that lacks headers.
|
|
||||||
pub fn new_with_headers(reader: R, headers: Vec<String>) -> CsvReaderBuilder<R> {
|
|
||||||
CsvReaderBuilder {
|
|
||||||
reader,
|
|
||||||
separators: vec![','],
|
|
||||||
headers,
|
|
||||||
types: None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Return the headers of the CSV file.
|
|
||||||
pub fn headers(&self) -> &[String] {
|
|
||||||
&self.headers
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Read the next record. Returns `Ok(None)` on EOF.
|
|
||||||
pub fn read_record(&mut self) -> io::Result<Option<Record>> {
|
|
||||||
let mut line = String::new();
|
|
||||||
if self.reader.read_line(&mut line)? == 0 {
|
|
||||||
return Ok(None);
|
|
||||||
}
|
|
||||||
let fields = parse_line(&line, &self.separators);
|
|
||||||
let mut record = HashMap::new();
|
|
||||||
|
|
||||||
for (i, header) in self.headers.iter().enumerate() {
|
|
||||||
let field = fields.get(i).cloned().unwrap_or_default();
|
|
||||||
let value = match &self.types {
|
|
||||||
Some(map) => {
|
|
||||||
if let Some(dt) = map.get(header) {
|
|
||||||
parse_with_type(&field, dt)
|
|
||||||
} else {
|
|
||||||
Value::String(field)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
None => parse_auto(&field),
|
|
||||||
};
|
|
||||||
record.insert(header.clone(), value);
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(Some(record))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<R: BufRead> Iterator for CsvReader<R> {
|
|
||||||
type Item = io::Result<Record>;
|
|
||||||
|
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
|
||||||
match self.read_record() {
|
|
||||||
Ok(Some(rec)) => Some(Ok(rec)),
|
|
||||||
Ok(None) => None,
|
|
||||||
Err(e) => Some(Err(e)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<R: BufRead> CsvReaderBuilder<R> {
|
|
||||||
/// Override field separators for the upcoming reader.
|
|
||||||
pub fn separators(mut self, separators: Vec<char>) -> Self {
|
|
||||||
self.separators = separators;
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Finalize the builder with an explicit type mapping.
|
|
||||||
pub fn new_with_types(mut self, types: HashMap<String, DataType>) -> CsvReader<R> {
|
|
||||||
self.types = Some(types);
|
|
||||||
self.build()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Finalize the builder without specifying types.
|
|
||||||
pub fn build(self) -> CsvReader<R> {
|
|
||||||
CsvReader {
|
|
||||||
reader: self.reader,
|
|
||||||
separators: self.separators,
|
|
||||||
headers: self.headers,
|
|
||||||
types: self.types,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<R: BufRead> CsvReader<R> {
|
|
||||||
/// Read all remaining records into a vector.
|
|
||||||
pub fn read_all(&mut self) -> io::Result<Vec<Record>> {
|
|
||||||
let mut records = Vec::new();
|
|
||||||
while let Some(rec) = self.read_record()? {
|
|
||||||
records.push(rec);
|
|
||||||
}
|
|
||||||
Ok(records)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl CsvReader<BufReader<File>> {
|
|
||||||
/// Create a [`CsvReader`] from a file path using comma separators and
|
|
||||||
/// automatic type detection.
|
|
||||||
///
|
|
||||||
/// # Examples
|
|
||||||
///
|
|
||||||
/// ```
|
|
||||||
/// use rustframe::csv::{CsvReader, Value};
|
|
||||||
/// # let path = std::env::temp_dir().join("from_path_auto.csv");
|
|
||||||
/// # std::fs::write(&path, "a,b\n1,true\n").unwrap();
|
|
||||||
/// let mut reader = CsvReader::from_path_auto(&path).unwrap();
|
|
||||||
/// let rec = reader.next().unwrap().unwrap();
|
|
||||||
/// assert_eq!(rec.get("a"), Some(&Value::Int(1)));
|
|
||||||
/// assert_eq!(rec.get("b"), Some(&Value::Bool(true)));
|
|
||||||
/// # std::fs::remove_file(path).unwrap();
|
|
||||||
/// ```
|
|
||||||
pub fn from_path_auto<P: AsRef<Path>>(path: P) -> io::Result<Self> {
|
|
||||||
let file = File::open(path)?;
|
|
||||||
let reader = BufReader::new(file);
|
|
||||||
CsvReader::new_default(reader)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create an iterator over records from a file path using default settings.
|
|
||||||
pub fn reader<P: AsRef<Path>>(path: P) -> io::Result<CsvReader<BufReader<File>>> {
|
|
||||||
reader_with(path, vec![','], None)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create an iterator over records from a file path with custom separators and type mapping.
|
|
||||||
pub fn reader_with<P: AsRef<Path>>(
|
|
||||||
path: P,
|
|
||||||
separators: Vec<char>,
|
|
||||||
types: Option<HashMap<String, DataType>>,
|
|
||||||
) -> io::Result<CsvReader<BufReader<File>>> {
|
|
||||||
let file = File::open(path)?;
|
|
||||||
let reader = BufReader::new(file);
|
|
||||||
CsvReader::new(reader, separators, types)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Read an entire CSV file into memory using default settings.
|
|
||||||
pub fn read_file<P: AsRef<Path>>(path: P) -> io::Result<Vec<Record>> {
|
|
||||||
read_file_with(path, vec![','], None)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Read an entire CSV file into memory with custom separators and type mapping.
|
|
||||||
pub fn read_file_with<P: AsRef<Path>>(
|
|
||||||
path: P,
|
|
||||||
separators: Vec<char>,
|
|
||||||
types: Option<HashMap<String, DataType>>,
|
|
||||||
) -> io::Result<Vec<Record>> {
|
|
||||||
let mut reader = reader_with(path, separators, types)?;
|
|
||||||
reader.read_all()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn parse_with_type(s: &str, ty: &DataType) -> Value {
|
|
||||||
match ty {
|
|
||||||
DataType::Int => s
|
|
||||||
.parse::<i64>()
|
|
||||||
.map(Value::Int)
|
|
||||||
.unwrap_or_else(|_| Value::String(s.to_string())),
|
|
||||||
DataType::Float => s
|
|
||||||
.parse::<f64>()
|
|
||||||
.map(Value::Float)
|
|
||||||
.unwrap_or_else(|_| Value::String(s.to_string())),
|
|
||||||
DataType::Bool => s
|
|
||||||
.parse::<bool>()
|
|
||||||
.map(Value::Bool)
|
|
||||||
.unwrap_or_else(|_| Value::String(s.to_string())),
|
|
||||||
DataType::UInt => s
|
|
||||||
.parse::<u64>()
|
|
||||||
.map(Value::UInt)
|
|
||||||
.unwrap_or_else(|_| Value::String(s.to_string())),
|
|
||||||
DataType::String => Value::String(s.to_string()),
|
|
||||||
DataType::Date => s
|
|
||||||
.parse::<NaiveDate>()
|
|
||||||
.map(Value::Date)
|
|
||||||
.unwrap_or_else(|_| Value::String(s.to_string())),
|
|
||||||
DataType::DateTime => NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S")
|
|
||||||
.map(Value::DateTime)
|
|
||||||
.unwrap_or_else(|_| Value::String(s.to_string())),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn parse_auto(s: &str) -> Value {
|
|
||||||
if let Ok(i) = s.parse::<i64>() {
|
|
||||||
Value::Int(i)
|
|
||||||
} else if let Ok(f) = s.parse::<f64>() {
|
|
||||||
Value::Float(f)
|
|
||||||
} else if let Ok(b) = s.parse::<bool>() {
|
|
||||||
Value::Bool(b)
|
|
||||||
} else if let Ok(dt) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S") {
|
|
||||||
Value::DateTime(dt)
|
|
||||||
} else if let Ok(d) = NaiveDate::parse_from_str(s, "%Y-%m-%d") {
|
|
||||||
Value::Date(d)
|
|
||||||
} else {
|
|
||||||
Value::String(s.to_string())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Split one line of text into its fields.
///
/// Rules, in the order they are applied to each character:
///
/// * Inside a quoted section (opened by `"` or `'`), the same quote character
///   doubled is an escaped quote, a single one closes the section, and every
///   other character, including separators and line breaks, is kept.
/// * Outside quotes, `"` or `'` opens a quoted section (the quote itself is
///   dropped), any character in `separators` ends the current field, `\r` is
///   discarded, and `\n` ends the line.
///
/// The result always contains at least one field; an empty `line` yields a
/// single empty string.
fn parse_line(line: &str, separators: &[char]) -> Vec<String> {
    let mut fields = Vec::new();
    let mut current = String::new();
    let mut in_quotes: Option<char> = None;
    // One character of lookahead is enough to recognise a doubled quote.
    let mut chars = line.chars().peekable();

    while let Some(c) = chars.next() {
        match in_quotes {
            Some(q) if c == q => {
                if chars.peek() == Some(&q) {
                    // Doubled quote: emit one literal quote, skip its twin.
                    current.push(q);
                    chars.next();
                } else {
                    in_quotes = None;
                }
            }
            Some(_) => current.push(c),
            None if c == '"' || c == '\'' => in_quotes = Some(c),
            // Move the finished field out instead of cloning it.
            None if separators.contains(&c) => fields.push(std::mem::take(&mut current)),
            None if c == '\r' => {}
            None if c == '\n' => break,
            None => current.push(c),
        }
    }

    fields.push(current);
    fields
}
|
|
||||||
|
|
||||||
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::{NaiveDate, NaiveDateTime};
    use std::io::Cursor;

    // Quoted separators and doubled quotes are handled inside one line.
    #[test]
    fn test_parse_line() {
        let parsed = parse_line("a,'b,c',\"d\"\"e\",f", &[',']);
        assert_eq!(parsed, vec!["a", "b,c", "d\"e", "f"]);
    }

    // Without a type map each field is auto-detected.
    #[test]
    fn test_reader_auto() {
        let source = Cursor::new("a,b,c\n1,2.5,true\n4,5.0,false\n");
        let mut csv = CsvReader::new_default(source).unwrap();
        let first = csv.next().unwrap().unwrap();
        assert_eq!(first.get("a"), Some(&Value::Int(1)));
        assert_eq!(first.get("b"), Some(&Value::Float(2.5)));
        assert_eq!(first.get("c"), Some(&Value::Bool(true)));
    }

    // An explicit type map overrides auto-detection.
    #[test]
    fn test_reader_with_types() {
        let source = Cursor::new("a,b,c\n1,2,3\n");
        let types: HashMap<String, DataType> = [
            ("a".to_string(), DataType::Int),
            ("b".to_string(), DataType::Int),
            ("c".to_string(), DataType::String),
        ]
        .into_iter()
        .collect();
        let mut csv = CsvReader::new_with_types(source, types).unwrap();
        let first = csv.next().unwrap().unwrap();
        assert_eq!(first.get("a"), Some(&Value::Int(1)));
        assert_eq!(first.get("b"), Some(&Value::Int(2)));
        assert_eq!(first.get("c"), Some(&Value::String("3".to_string())));
    }

    // Headers supplied by the caller, types attached through the builder.
    #[test]
    fn test_chain_headers_and_types() {
        let source = Cursor::new("1,2\n3,4\n");
        let headers = vec!["x".to_string(), "y".to_string()];
        let types: HashMap<String, DataType> = [
            ("x".to_string(), DataType::Int),
            ("y".to_string(), DataType::UInt),
        ]
        .into_iter()
        .collect();
        let mut csv = CsvReader::new_with_headers(source, headers).new_with_types(types);
        let first = csv.next().unwrap().unwrap();
        assert_eq!(first.get("x"), Some(&Value::Int(1)));
        assert_eq!(first.get("y"), Some(&Value::UInt(2)));
    }

    // Date and date-time columns parse into chrono values.
    #[test]
    fn test_date_types() {
        let source = Cursor::new("d,dt\n2024-01-01,2024-01-01 12:00:00\n");
        let types: HashMap<String, DataType> = [
            ("d".to_string(), DataType::Date),
            ("dt".to_string(), DataType::DateTime),
        ]
        .into_iter()
        .collect();
        let mut csv = CsvReader::new_with_types(source, types).unwrap();
        let first = csv.next().unwrap().unwrap();

        let expected_date = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap();
        let expected_datetime: NaiveDateTime = expected_date.and_hms_opt(12, 0, 0).unwrap();
        assert_eq!(first.get("d"), Some(&Value::Date(expected_date)));
        assert_eq!(first.get("dt"), Some(&Value::DateTime(expected_datetime)));
    }

    // `read_file` loads every data row of a file on disk.
    #[test]
    fn test_read_file_all() {
        let path = std::env::temp_dir().join("csv_full_test.csv");
        std::fs::write(&path, "a,b\n1,2\n3,4\n").unwrap();
        let rows = read_file(&path).unwrap();
        assert_eq!(rows.len(), 2);
        assert_eq!(rows[1].get("b"), Some(&Value::Int(4)));
        std::fs::remove_file(path).unwrap();
    }

    // `reader` yields records lazily from a file on disk.
    #[test]
    fn test_reader_from_path() {
        let path = std::env::temp_dir().join("csv_iter_test.csv");
        std::fs::write(&path, "a,b\n5,6\n").unwrap();
        let mut records = reader(&path).unwrap();
        let first = records.next().unwrap().unwrap();
        assert_eq!(first.get("a"), Some(&Value::Int(5)));
        assert_eq!(first.get("b"), Some(&Value::Int(6)));
        std::fs::remove_file(path).unwrap();
    }

    // `CsvReader::from_path_auto` opens a file with default settings.
    #[test]
    fn test_from_path_auto_method() {
        let path = std::env::temp_dir().join("csv_method_auto.csv");
        std::fs::write(&path, "a,b\n7,true\n").unwrap();
        let mut csv = CsvReader::from_path_auto(&path).unwrap();
        let first = csv.next().unwrap().unwrap();
        assert_eq!(first.get("a"), Some(&Value::Int(7)));
        assert_eq!(first.get("b"), Some(&Value::Bool(true)));
        std::fs::remove_file(path).unwrap();
    }
}
|
|
||||||
@@ -1,69 +0,0 @@
|
|||||||
//! CSV handling utilities.
|
|
||||||
//!
|
|
||||||
//! The [`csv`] module offers a flexible [`CsvReader`] with automatic type
|
|
||||||
//! detection and optional builders for custom headers and types.
|
|
||||||
//!
|
|
||||||
//! # Examples
|
|
||||||
//!
|
|
||||||
//! Read from a file with auto type detection:
|
|
||||||
//!
|
|
||||||
//! ```
|
|
||||||
//! use rustframe::csv::CsvReader;
|
|
||||||
//! # let path = std::env::temp_dir().join("docs_auto.csv");
|
|
||||||
//! # std::fs::write(&path, "a,b\n1,true\n").unwrap();
|
|
||||||
//! let mut reader = CsvReader::from_path_auto(&path).unwrap();
|
|
||||||
//! for rec in reader {
|
|
||||||
//! let rec = rec.unwrap();
|
|
||||||
//! println!("{:?}", rec);
|
|
||||||
//! }
|
|
||||||
//! # std::fs::remove_file(path).unwrap();
|
|
||||||
//! ```
|
|
||||||
//!
|
|
||||||
//! Specify column types explicitly:
|
|
||||||
//!
|
|
||||||
//! ```
|
|
||||||
//! use rustframe::csv::{CsvReader, DataType, Value};
|
|
||||||
//! use std::collections::HashMap;
|
|
||||||
//! use std::io::Cursor;
|
|
||||||
//! let data = "a,b\n1,2\n";
|
|
||||||
//! let mut types = HashMap::new();
|
|
||||||
//! types.insert("a".into(), DataType::Int);
|
|
||||||
//! types.insert("b".into(), DataType::Float);
|
|
||||||
//! let mut reader = CsvReader::new_with_types(Cursor::new(data), types).unwrap();
|
|
||||||
//! let rec = reader.next().unwrap().unwrap();
|
|
||||||
//! assert_eq!(rec.get("b"), Some(&Value::Float(2.0)));
|
|
||||||
//! ```
|
|
||||||
//!
|
|
||||||
//! Building from custom headers and types:
|
|
||||||
//!
|
|
||||||
//! ```
|
|
||||||
//! use rustframe::csv::{CsvReader, DataType, Value};
|
|
||||||
//! use std::collections::HashMap;
|
|
||||||
//! use std::io::Cursor;
|
|
||||||
//! let data = "1,2\n";
|
|
||||||
//! let headers = vec!["x".to_string(), "y".to_string()];
|
|
||||||
//! let mut types = HashMap::new();
|
|
||||||
//! types.insert("x".into(), DataType::Int);
|
|
||||||
//! types.insert("y".into(), DataType::UInt);
|
|
||||||
//! let mut reader = CsvReader::new_with_headers(Cursor::new(data), headers).new_with_types(types);
|
|
||||||
//! let rec = reader.next().unwrap().unwrap();
|
|
||||||
//! assert_eq!(rec.get("y"), Some(&Value::UInt(2)));
|
|
||||||
//! ```
|
|
||||||
//!
|
|
||||||
//! Reading an entire file into memory:
|
|
||||||
//!
|
|
||||||
//! ```
|
|
||||||
//! use rustframe::csv::read_file;
|
|
||||||
//! # let path = std::env::temp_dir().join("docs_full.csv");
|
|
||||||
//! # std::fs::write(&path, "a,b\n1,2\n3,4\n").unwrap();
|
|
||||||
//! let records = read_file(&path).unwrap();
|
|
||||||
//! assert_eq!(records.len(), 2);
|
|
||||||
//! # std::fs::remove_file(path).unwrap();
|
|
||||||
//! ```
|
|
||||||
|
|
||||||
pub mod csv_core;
|
|
||||||
|
|
||||||
pub use csv_core::{
|
|
||||||
CsvReader, CsvReaderBuilder, DataType, Record, Value, reader, reader_with,
|
|
||||||
read_file, read_file_with,
|
|
||||||
};
|
|
||||||
659
src/dataframe/df.rs
Normal file
659
src/dataframe/df.rs
Normal file
@@ -0,0 +1,659 @@
|
|||||||
|
use crate::frame::{Frame, RowIndex};
|
||||||
|
use std::any::{Any, TypeId};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::fmt; // Import TypeId
|
||||||
|
|
||||||
|
const DEFAULT_DISPLAY_ROWS: usize = 5;
|
||||||
|
const DEFAULT_DISPLAY_COLS: usize = 10;
|
||||||
|
|
||||||
|
/// Type-erased interface over `Frame<T>` so that a `DataFrame` can store
/// frames with different element types behind `Box<dyn SubFrame>`.
pub trait SubFrame: Send + Sync + fmt::Debug + Any {
    /// Number of rows in the underlying frame.
    fn rows(&self) -> usize;

    /// String rendering of the cell at `physical_row_idx` in column `col_name`.
    fn get_value_as_string(&self, physical_row_idx: usize, col_name: &str) -> String;

    /// Clone the underlying frame into a fresh box.
    fn clone_box(&self) -> Box<dyn SubFrame>;

    /// Remove the column named `col_name` from the underlying frame.
    fn delete_column_from_frame(&mut self, col_name: &str);

    /// Number of columns in the underlying frame.
    fn get_frame_cols(&self) -> usize;

    /// Shared `Any` view, used to downcast back to the concrete `Frame<T>`.
    fn as_any(&self) -> &dyn Any;

    /// Mutable `Any` view, used to downcast back to the concrete `Frame<T>`.
    fn as_any_mut(&mut self) -> &mut dyn Any;
}
|
||||||
|
|
||||||
|
// Implement SubFrame for any Frame<T> that meets the requirements
|
||||||
|
impl<T> SubFrame for Frame<T>
|
||||||
|
where
|
||||||
|
T: Clone + PartialEq + fmt::Display + fmt::Debug + 'static + Send + Sync + Any,
|
||||||
|
{
|
||||||
|
fn rows(&self) -> usize {
|
||||||
|
self.rows()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_value_as_string(&self, physical_row_idx: usize, col_name: &str) -> String {
|
||||||
|
self.get_row(physical_row_idx).get(col_name).to_string()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn clone_box(&self) -> Box<dyn SubFrame> {
|
||||||
|
Box::new(self.clone())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn delete_column_from_frame(&mut self, col_name: &str) {
|
||||||
|
self.delete_column(col_name);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_frame_cols(&self) -> usize {
|
||||||
|
self.cols()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn as_any(&self) -> &dyn Any {
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
fn as_any_mut(&mut self) -> &mut dyn Any {
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct DataFrame {
|
||||||
|
frames_by_type: HashMap<TypeId, Box<dyn SubFrame>>, // Maps TypeId to the Frame holding columns of that type
|
||||||
|
column_to_type: HashMap<String, TypeId>, // Maps column name to its TypeId
|
||||||
|
column_names: Vec<String>,
|
||||||
|
index: RowIndex,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DataFrame {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
DataFrame {
|
||||||
|
frames_by_type: HashMap::new(),
|
||||||
|
column_to_type: HashMap::new(),
|
||||||
|
column_names: Vec::new(),
|
||||||
|
index: RowIndex::Range(0..0), // Initialize with an empty range index
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the number of rows in the DataFrame.
|
||||||
|
pub fn rows(&self) -> usize {
|
||||||
|
self.index.len()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the number of columns in the DataFrame.
|
||||||
|
pub fn cols(&self) -> usize {
|
||||||
|
self.column_names.len()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns a reference to the vector of column names.
|
||||||
|
pub fn get_column_names(&self) -> &Vec<String> {
|
||||||
|
&self.column_names
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the number of internal Frame objects (one per unique data type).
|
||||||
|
pub fn num_internal_frames(&self) -> usize {
|
||||||
|
self.frames_by_type.len()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns a reference to a column of a specific type, if it exists.
|
||||||
|
pub fn get_column<T>(&self, col_name: &str) -> Option<&[T]>
|
||||||
|
where
|
||||||
|
T: Clone + PartialEq + fmt::Display + fmt::Debug + 'static + Send + Sync + Any,
|
||||||
|
{
|
||||||
|
let expected_type_id = TypeId::of::<T>();
|
||||||
|
if let Some(actual_type_id) = self.column_to_type.get(col_name) {
|
||||||
|
if *actual_type_id == expected_type_id {
|
||||||
|
if let Some(sub_frame_box) = self.frames_by_type.get(actual_type_id) {
|
||||||
|
if let Some(frame) = sub_frame_box.as_any().downcast_ref::<Frame<T>>() {
|
||||||
|
return Some(frame.column(col_name));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns a HashMap representing a row, mapping column names to their string values.
|
||||||
|
pub fn get_row(&self, row_idx: usize) -> Option<HashMap<String, String>> {
|
||||||
|
if row_idx >= self.rows() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut row_data = HashMap::new();
|
||||||
|
for col_name in &self.column_names {
|
||||||
|
if let Some(type_id) = self.column_to_type.get(col_name) {
|
||||||
|
if let Some(sub_frame_box) = self.frames_by_type.get(type_id) {
|
||||||
|
let value = sub_frame_box.get_value_as_string(row_idx, col_name);
|
||||||
|
row_data.insert(col_name.clone(), value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Some(row_data)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add_column<T>(&mut self, col_name: &str, data: Vec<T>)
|
||||||
|
where
|
||||||
|
T: Clone + PartialEq + fmt::Display + fmt::Debug + 'static + Send + Sync + Any,
|
||||||
|
{
|
||||||
|
let type_id = TypeId::of::<T>();
|
||||||
|
let col_name_string = col_name.to_string();
|
||||||
|
|
||||||
|
// Check for duplicate column name across the entire DataFrame
|
||||||
|
if self.column_to_type.contains_key(&col_name_string) {
|
||||||
|
panic!(
|
||||||
|
"DataFrame::add_column: duplicate column name: '{}'",
|
||||||
|
col_name_string
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// If this is the first column being added, set the DataFrame's index
|
||||||
|
if self.column_names.is_empty() {
|
||||||
|
self.index = RowIndex::Range(0..data.len());
|
||||||
|
} else {
|
||||||
|
// Ensure new column has the same number of rows as existing columns
|
||||||
|
if data.len() != self.index.len() {
|
||||||
|
panic!(
|
||||||
|
"DataFrame::add_column: new column '{}' has {} rows, but existing columns have {} rows",
|
||||||
|
col_name_string,
|
||||||
|
data.len(),
|
||||||
|
self.index.len()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if a Frame of this type already exists
|
||||||
|
if let Some(sub_frame_box) = self.frames_by_type.get_mut(&type_id) {
|
||||||
|
// Downcast to the concrete Frame<T> and add the column
|
||||||
|
if let Some(frame) = sub_frame_box.as_any_mut().downcast_mut::<Frame<T>>() {
|
||||||
|
frame.add_column(col_name_string.clone(), data);
|
||||||
|
} else {
|
||||||
|
// This should ideally not happen if TypeId matches, but good for safety
|
||||||
|
panic!(
|
||||||
|
"Type mismatch when downcasting existing SubFrame for TypeId {:?}",
|
||||||
|
type_id
|
||||||
|
);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// No Frame of this type exists, create a new one
|
||||||
|
// The Frame::new constructor expects a Matrix and column names.
|
||||||
|
// We create a Matrix from a single column vector.
|
||||||
|
let new_frame = Frame::new(
|
||||||
|
crate::matrix::Matrix::from_cols(vec![data]),
|
||||||
|
vec![col_name_string.clone()],
|
||||||
|
Some(self.index.clone()), // Pass the DataFrame's index to the new Frame
|
||||||
|
);
|
||||||
|
self.frames_by_type.insert(type_id, Box::new(new_frame));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update column mappings and names
|
||||||
|
self.column_to_type.insert(col_name_string.clone(), type_id);
|
||||||
|
self.column_names.push(col_name_string);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Drops a column from the DataFrame.
|
||||||
|
/// Panics if the column does not exist.
|
||||||
|
pub fn drop_column(&mut self, col_name: &str) {
|
||||||
|
let col_name_string = col_name.to_string();
|
||||||
|
|
||||||
|
// 1. Get the TypeId associated with the column
|
||||||
|
let type_id = self
|
||||||
|
.column_to_type
|
||||||
|
.remove(&col_name_string)
|
||||||
|
.unwrap_or_else(|| {
|
||||||
|
panic!(
|
||||||
|
"DataFrame::drop_column: column '{}' not found",
|
||||||
|
col_name_string
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
|
// 2. Remove the column name from the ordered list
|
||||||
|
self.column_names.retain(|name| name != &col_name_string);
|
||||||
|
|
||||||
|
// 3. Find the Frame object and delete the column from it
|
||||||
|
if let Some(sub_frame_box) = self.frames_by_type.get_mut(&type_id) {
|
||||||
|
sub_frame_box.delete_column_from_frame(&col_name_string);
|
||||||
|
|
||||||
|
// 4. If the Frame object for this type becomes empty, remove it from frames_by_type
|
||||||
|
if sub_frame_box.get_frame_cols() == 0 {
|
||||||
|
self.frames_by_type.remove(&type_id);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// This should not happen if column_to_type was consistent
|
||||||
|
panic!(
|
||||||
|
"DataFrame::drop_column: internal error, no frame found for type_id {:?}",
|
||||||
|
type_id
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl fmt::Display for DataFrame {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
// Display column headers
|
||||||
|
for col_name in self.column_names.iter().take(DEFAULT_DISPLAY_COLS) {
|
||||||
|
write!(f, "{:<15}", col_name)?;
|
||||||
|
}
|
||||||
|
if self.column_names.len() > DEFAULT_DISPLAY_COLS {
|
||||||
|
write!(f, "...")?;
|
||||||
|
}
|
||||||
|
writeln!(f)?;
|
||||||
|
|
||||||
|
// Display data rows
|
||||||
|
let mut displayed_rows = 0;
|
||||||
|
for i in 0..self.index.len() {
|
||||||
|
if displayed_rows >= DEFAULT_DISPLAY_ROWS {
|
||||||
|
writeln!(f, "...")?;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
for col_name in self.column_names.iter().take(DEFAULT_DISPLAY_COLS) {
|
||||||
|
if let Some(type_id) = self.column_to_type.get(col_name) {
|
||||||
|
if let Some(sub_frame_box) = self.frames_by_type.get(type_id) {
|
||||||
|
write!(f, "{:<15}", sub_frame_box.get_value_as_string(i, col_name))?;
|
||||||
|
} else {
|
||||||
|
// This case indicates an inconsistency: column_to_type has an entry,
|
||||||
|
// but frames_by_type doesn't have the corresponding Frame.
|
||||||
|
write!(f, "{:<15}", "[ERROR]")?;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// This case indicates an inconsistency: column_names has an entry,
|
||||||
|
// but column_to_type doesn't have the corresponding column.
|
||||||
|
write!(f, "{:<15}", "[ERROR]")?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if self.column_names.len() > DEFAULT_DISPLAY_COLS {
|
||||||
|
write!(f, "...")?;
|
||||||
|
}
|
||||||
|
writeln!(f)?;
|
||||||
|
displayed_rows += 1;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl fmt::Debug for DataFrame {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
f.debug_struct("DataFrame")
|
||||||
|
.field("column_names", &self.column_names)
|
||||||
|
.field("index", &self.index)
|
||||||
|
.field("column_to_type", &self.column_to_type)
|
||||||
|
.field("frames_by_type", &self.frames_by_type)
|
||||||
|
.finish()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::frame::Frame;
    use crate::matrix::Matrix;

    /// Turns string literals into an owned `Vec<String>`.
    fn strings(items: &[&str]) -> Vec<String> {
        items.iter().map(|s| s.to_string()).collect()
    }

    /// Whether the DataFrame currently stores a frame for element type `T`.
    fn has_frame<T: 'static>(df: &DataFrame) -> bool {
        df.frames_by_type.contains_key(&TypeId::of::<T>())
    }

    /// The `TypeId` recorded for `name`, if the column is registered.
    fn column_type(df: &DataFrame, name: &str) -> Option<TypeId> {
        df.column_to_type.get(name).copied()
    }

    /// Borrows the typed frame for `T`, panicking if it is missing or has
    /// the wrong concrete type.
    fn typed_frame<T: Clone + PartialEq + 'static>(df: &DataFrame) -> &Frame<T> {
        df.frames_by_type
            .get(&TypeId::of::<T>())
            .unwrap()
            .as_any()
            .downcast_ref::<Frame<T>>()
            .unwrap()
    }

    /// Three-row DataFrame with one `i32`, one `f64` and one `String` column.
    fn mixed_df() -> DataFrame {
        let mut df = DataFrame::new();
        df.add_column("col_int", vec![1, 2, 3]);
        df.add_column("col_float", vec![1.1, 2.2, 3.3]);
        df.add_column("col_string", strings(&["a", "b", "c"]));
        df
    }

    /// One display line: every cell left-aligned in a 15-character field,
    /// terminated by a newline.
    fn padded_line(cells: &[&str]) -> String {
        let mut line: String = cells.iter().map(|cell| format!("{:<15}", cell)).collect();
        line.push('\n');
        line
    }

    #[test]
    fn test_dataframe_new() {
        let df = DataFrame::new();

        assert_eq!(df.rows(), 0);
        assert_eq!(df.cols(), 0);
        assert!(df.get_column_names().is_empty());
        assert!(df.frames_by_type.is_empty());
        assert!(df.column_to_type.is_empty());
    }

    #[test]
    fn test_dataframe_add_column_initial() {
        let mut df = DataFrame::new();
        df.add_column("col_int", vec![1, 2, 3]);

        assert_eq!(df.rows(), 3);
        assert_eq!(df.cols(), 1);
        assert_eq!(df.get_column_names(), &strings(&["col_int"]));
        assert!(has_frame::<i32>(&df));
        assert_eq!(column_type(&df, "col_int"), Some(TypeId::of::<i32>()));

        // The typed frame behind the column mirrors the DataFrame's shape.
        let frame = typed_frame::<i32>(&df);
        assert_eq!(frame.rows(), 3);
        assert_eq!(frame.cols(), 1);
        assert_eq!(frame.columns(), &strings(&["col_int"]));
    }

    #[test]
    fn test_dataframe_add_column_same_type() {
        let mut df = DataFrame::new();
        df.add_column("col_int1", vec![1, 2, 3]);
        df.add_column("col_int2", vec![4, 5, 6]);

        let both = strings(&["col_int1", "col_int2"]);

        assert_eq!(df.rows(), 3);
        assert_eq!(df.cols(), 2);
        assert_eq!(df.get_column_names(), &both);
        assert!(has_frame::<i32>(&df));
        for name in &both {
            assert_eq!(column_type(&df, name), Some(TypeId::of::<i32>()));
        }

        // Both columns share a single Frame<i32>.
        let frame = typed_frame::<i32>(&df);
        assert_eq!(frame.rows(), 3);
        assert_eq!(frame.cols(), 2);
        assert_eq!(frame.columns(), &both);
    }

    #[test]
    fn test_dataframe_add_column_different_type() {
        let df = mixed_df();

        assert_eq!(df.rows(), 3);
        assert_eq!(df.cols(), 3);
        assert_eq!(
            df.get_column_names(),
            &strings(&["col_int", "col_float", "col_string"])
        );

        assert!(has_frame::<i32>(&df));
        assert!(has_frame::<f64>(&df));
        assert!(has_frame::<String>(&df));

        assert_eq!(column_type(&df, "col_int"), Some(TypeId::of::<i32>()));
        assert_eq!(column_type(&df, "col_float"), Some(TypeId::of::<f64>()));
        assert_eq!(column_type(&df, "col_string"), Some(TypeId::of::<String>()));

        // Each element type gets its own frame holding exactly its column.
        assert_eq!(typed_frame::<i32>(&df).columns(), &strings(&["col_int"]));
        assert_eq!(typed_frame::<f64>(&df).columns(), &strings(&["col_float"]));
        assert_eq!(
            typed_frame::<String>(&df).columns(),
            &strings(&["col_string"])
        );
    }

    #[test]
    fn test_dataframe_get_column() {
        let df = mixed_df();

        // Existing columns requested with their real element type.
        assert_eq!(df.get_column::<i32>("col_int").unwrap(), &[1, 2, 3][..]);
        assert_eq!(
            df.get_column::<f64>("col_float").unwrap(),
            &[1.1, 2.2, 3.3][..]
        );
        assert_eq!(
            df.get_column::<String>("col_string").unwrap(),
            strings(&["a", "b", "c"]).as_slice()
        );

        // Unknown column name.
        assert_eq!(df.get_column::<i32>("non_existent"), None);

        // Known column requested with the wrong element type.
        assert_eq!(df.get_column::<f64>("col_int"), None);
        assert_eq!(df.get_column::<i32>("col_float"), None);
    }

    #[test]
    fn test_dataframe_get_row() {
        let df = mixed_df();

        // (row index, expected col_int, expected col_float, expected col_string)
        let expectations = [(0, "1", "1.1", "a"), (1, "2", "2.2", "b")];
        for &(row_idx, int_text, float_text, string_text) in expectations.iter() {
            let row = df.get_row(row_idx).unwrap();
            assert_eq!(row.get("col_int").map(String::as_str), Some(int_text));
            assert_eq!(row.get("col_float").map(String::as_str), Some(float_text));
            assert_eq!(row.get("col_string").map(String::as_str), Some(string_text));
        }

        // One past the last row.
        assert_eq!(df.get_row(3), None);
    }

    #[test]
    #[should_panic(expected = "DataFrame::add_column: duplicate column name: 'col_int'")]
    fn test_dataframe_add_column_duplicate_name() {
        let mut df = DataFrame::new();
        df.add_column("col_int", vec![1, 2, 3]);
        df.add_column("col_int", vec![4, 5, 6]);
    }

    #[test]
    #[should_panic(
        expected = "DataFrame::add_column: new column 'col_int2' has 2 rows, but existing columns have 3 rows"
    )]
    fn test_dataframe_add_column_mismatched_rows() {
        let mut df = DataFrame::new();
        df.add_column("col_int1", vec![1, 2, 3]);
        df.add_column("col_int2", vec![4, 5]);
    }

    #[test]
    fn test_dataframe_display() {
        let mut df = DataFrame::new();
        df.add_column("col_int", vec![1, 2, 3, 4, 5, 6]);
        df.add_column("col_float", vec![1.1, 2.2, 3.3, 4.4, 5.5, 6.6]);
        df.add_column("col_string", strings(&["a", "b", "c", "d", "e", "f"]));

        // Header, the first five of the six rows, then the truncation marker.
        let mut expected_output = padded_line(&["col_int", "col_float", "col_string"]);
        let shown_rows = [
            ["1", "1.1", "a"],
            ["2", "2.2", "b"],
            ["3", "3.3", "c"],
            ["4", "4.4", "d"],
            ["5", "5.5", "e"],
        ];
        for cells in shown_rows.iter() {
            expected_output.push_str(&padded_line(cells));
        }
        expected_output.push_str("...\n");

        assert_eq!(format!("{}", df), expected_output);
    }

    #[test]
    fn test_dataframe_debug() {
        let mut df = DataFrame::new();
        df.add_column("col_int", vec![1, 2, 3]);
        df.add_column("col_float", vec![1.1, 2.2, 3.3]);

        let debug_output = format!("{:?}", df);
        let fragments = [
            "DataFrame {",
            "column_names: [\"col_int\", \"col_float\"]",
            "index: Range(0..3)",
            "column_to_type: {",
            "frames_by_type: {",
        ];
        for fragment in fragments.iter() {
            assert!(debug_output.contains(fragment));
        }
    }

    #[test]
    fn test_dataframe_drop_column_single_type() {
        let mut df = DataFrame::new();
        df.add_column("col_int1", vec![1, 2, 3]);
        df.add_column("col_int2", vec![4, 5, 6]);
        df.add_column("col_float", vec![1.1, 2.2, 3.3]);

        assert_eq!(df.cols(), 3);
        assert_eq!(
            df.get_column_names(),
            &strings(&["col_int1", "col_int2", "col_float"])
        );
        assert!(has_frame::<i32>(&df));
        assert!(has_frame::<f64>(&df));

        df.drop_column("col_int1");

        assert_eq!(df.cols(), 2);
        assert_eq!(df.get_column_names(), &strings(&["col_int2", "col_float"]));
        assert!(!df.column_to_type.contains_key("col_int1"));
        // One i32 column remains, so Frame<i32> must still be present.
        assert!(has_frame::<i32>(&df));
        assert_eq!(typed_frame::<i32>(&df).columns(), &strings(&["col_int2"]));

        df.drop_column("col_int2");

        assert_eq!(df.cols(), 1);
        assert_eq!(df.get_column_names(), &strings(&["col_float"]));
        assert!(!df.column_to_type.contains_key("col_int2"));
        // The last i32 column is gone, so Frame<i32> must be gone too.
        assert!(!has_frame::<i32>(&df));
        assert!(has_frame::<f64>(&df));
    }

    #[test]
    fn test_dataframe_drop_column_mixed_types() {
        let mut df = mixed_df();

        assert_eq!(df.cols(), 3);
        assert!(has_frame::<i32>(&df));
        assert!(has_frame::<f64>(&df));
        assert!(has_frame::<String>(&df));

        df.drop_column("col_float");

        assert_eq!(df.cols(), 2);
        assert_eq!(df.get_column_names(), &strings(&["col_int", "col_string"]));
        assert!(!df.column_to_type.contains_key("col_float"));
        // The only f64 column was dropped, so Frame<f64> must be removed.
        assert!(!has_frame::<f64>(&df));
        assert!(has_frame::<i32>(&df));
        assert!(has_frame::<String>(&df));

        df.drop_column("col_int");
        df.drop_column("col_string");

        assert_eq!(df.cols(), 0);
        assert!(df.get_column_names().is_empty());
        assert!(df.frames_by_type.is_empty());
        assert!(df.column_to_type.is_empty());
    }

    #[test]
    #[should_panic(expected = "DataFrame::drop_column: column 'non_existent' not found")]
    fn test_dataframe_drop_column_non_existent() {
        let mut df = DataFrame::new();
        df.add_column("col_int", vec![1, 2, 3]);
        df.drop_column("non_existent");
    }

    #[test]
    fn test_dataframe_add_column_reuses_existing_frame() {
        let mut df = DataFrame::new();
        df.add_column("col_int1", vec![1, 2, 3]);
        df.add_column("col_float1", vec![1.1, 2.2, 3.3]);

        // Two element types so far, hence two frames.
        assert_eq!(df.frames_by_type.len(), 2);
        assert!(has_frame::<i32>(&df));
        assert!(has_frame::<f64>(&df));

        // A second i32 column must land in the existing Frame<i32>.
        df.add_column("col_int2", vec![4, 5, 6]);

        assert_eq!(df.frames_by_type.len(), 2);
        assert!(has_frame::<i32>(&df));
        assert!(has_frame::<f64>(&df));

        let int_frame = typed_frame::<i32>(&df);
        assert_eq!(int_frame.columns(), &strings(&["col_int1", "col_int2"]));
        assert_eq!(int_frame.cols(), 2);

        // A second f64 column must land in the existing Frame<f64>.
        df.add_column("col_float2", vec![4.4, 5.5, 6.6]);

        assert_eq!(df.frames_by_type.len(), 2);
        assert!(has_frame::<i32>(&df));
        assert!(has_frame::<f64>(&df));

        let float_frame = typed_frame::<f64>(&df);
        assert_eq!(
            float_frame.columns(),
            &strings(&["col_float1", "col_float2"])
        );
        assert_eq!(float_frame.cols(), 2);
    }
}
|
||||||
4
src/dataframe/mod.rs
Normal file
4
src/dataframe/mod.rs
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
//! This module provides the DataFrame structure for handling tabular data with mixed types.
|
||||||
|
pub mod df;
|
||||||
|
|
||||||
|
pub use df::{DataFrame, SubFrame};
|
||||||
@@ -332,7 +332,7 @@ impl<T: Clone + PartialEq> Frame<T> {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns an immutable slice of the specified column's data.
|
/// Returns an immutable slice of the specified column's data by name.
|
||||||
/// Panics if the column name is not found.
|
/// Panics if the column name is not found.
|
||||||
pub fn column(&self, name: &str) -> &[T] {
|
pub fn column(&self, name: &str) -> &[T] {
|
||||||
let idx = self
|
let idx = self
|
||||||
@@ -341,7 +341,13 @@ impl<T: Clone + PartialEq> Frame<T> {
|
|||||||
self.matrix.column(idx)
|
self.matrix.column(idx)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns a mutable slice of the specified column's data.
|
/// Returns an immutable slice of the specified column's data by its physical index.
|
||||||
|
/// Panics if the index is out of bounds.
|
||||||
|
pub fn column_by_physical_idx(&self, idx: usize) -> &[T] {
|
||||||
|
self.matrix.column(idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns a mutable slice of the specified column's data by name.
|
||||||
/// Panics if the column name is not found.
|
/// Panics if the column name is not found.
|
||||||
pub fn column_mut(&mut self, name: &str) -> &mut [T] {
|
pub fn column_mut(&mut self, name: &str) -> &mut [T] {
|
||||||
let idx = self
|
let idx = self
|
||||||
@@ -350,6 +356,12 @@ impl<T: Clone + PartialEq> Frame<T> {
|
|||||||
self.matrix.column_mut(idx)
|
self.matrix.column_mut(idx)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns a mutable slice of the specified column's data by its physical index.
|
||||||
|
/// Panics if the index is out of bounds.
|
||||||
|
pub fn column_mut_by_physical_idx(&mut self, idx: usize) -> &mut [T] {
|
||||||
|
self.matrix.column_mut(idx)
|
||||||
|
}
|
||||||
|
|
||||||
// Row access methods
|
// Row access methods
|
||||||
|
|
||||||
/// Returns an immutable view of the row for the given integer key.
|
/// Returns an immutable view of the row for the given integer key.
|
||||||
|
|||||||
@@ -1,5 +1,8 @@
|
|||||||
#![doc = include_str!("../README.md")]
|
#![doc = include_str!("../README.md")]
|
||||||
|
|
||||||
|
/// Documentation for the [`crate::dataframe`] module.
|
||||||
|
pub mod dataframe;
|
||||||
|
|
||||||
/// Documentation for the [`crate::matrix`] module.
|
/// Documentation for the [`crate::matrix`] module.
|
||||||
pub mod matrix;
|
pub mod matrix;
|
||||||
|
|
||||||
@@ -14,6 +17,3 @@ pub mod compute;
|
|||||||
|
|
||||||
/// Documentation for the [`crate::random`] module.
|
/// Documentation for the [`crate::random`] module.
|
||||||
pub mod random;
|
pub mod random;
|
||||||
|
|
||||||
/// Documentation for the [`crate::csv`] module.
|
|
||||||
pub mod csv;
|
|
||||||
|
|||||||
Reference in New Issue
Block a user