From 49f75582251dba830098121f5c7ca9fc7d499213 Mon Sep 17 00:00:00 2001 From: Palash Tyagi <23239946+Magnus167@users.noreply.github.com> Date: Sun, 22 Jun 2025 05:00:42 +0100 Subject: [PATCH 01/19] Enhance column access methods to clarify usage by name and physical index --- src/frame/base.rs | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/frame/base.rs b/src/frame/base.rs index daad7ee..4698dee 100644 --- a/src/frame/base.rs +++ b/src/frame/base.rs @@ -309,7 +309,7 @@ impl Frame { ) } - /// Returns an immutable slice of the specified column's data. + /// Returns an immutable slice of the specified column's data by name. /// Panics if the column name is not found. pub fn column(&self, name: &str) -> &[T] { let idx = self @@ -318,7 +318,13 @@ impl Frame { self.matrix.column(idx) } - /// Returns a mutable slice of the specified column's data. + /// Returns an immutable slice of the specified column's data by its physical index. + /// Panics if the index is out of bounds. + pub fn column_by_physical_idx(&self, idx: usize) -> &[T] { + self.matrix.column(idx) + } + + /// Returns a mutable slice of the specified column's data by name. /// Panics if the column name is not found. pub fn column_mut(&mut self, name: &str) -> &mut [T] { let idx = self @@ -327,6 +333,12 @@ impl Frame { self.matrix.column_mut(idx) } + /// Returns a mutable slice of the specified column's data by its physical index. + /// Panics if the index is out of bounds. + pub fn column_mut_by_physical_idx(&mut self, idx: usize) -> &mut [T] { + self.matrix.column_mut(idx) + } + // Row access methods /// Returns an immutable view of the row for the given integer key. 
From b80d5ab381b375bad19b573b5252ca40bc1f73fd Mon Sep 17 00:00:00 2001 From: Palash Tyagi <23239946+Magnus167@users.noreply.github.com> Date: Sun, 22 Jun 2025 05:00:59 +0100 Subject: [PATCH 02/19] Add documentation for the DataFrame module and include it in the library --- src/dataframe/mod.rs | 2 ++ src/lib.rs | 3 +++ 2 files changed, 5 insertions(+) create mode 100644 src/dataframe/mod.rs diff --git a/src/dataframe/mod.rs b/src/dataframe/mod.rs new file mode 100644 index 0000000..8fb4cb2 --- /dev/null +++ b/src/dataframe/mod.rs @@ -0,0 +1,2 @@ +//! This module provides the DataFrame structure for handling tabular data with mixed types. +pub mod df; diff --git a/src/lib.rs b/src/lib.rs index a68c8c9..1eb448b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,8 @@ #![doc = include_str!("../README.md")] +/// Documentation for the [`crate::dataframe`] module. +pub mod dataframe; + /// Documentation for the [`crate::matrix`] module. pub mod matrix; From fe666a4ddbc96fc095c2563e3983041d5b8ab730 Mon Sep 17 00:00:00 2001 From: Palash Tyagi <23239946+Magnus167@users.noreply.github.com> Date: Sun, 22 Jun 2025 05:01:19 +0100 Subject: [PATCH 03/19] First draft: Implement DataFrame and DataFrameColumn structures --- src/dataframe/df.rs | 647 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 647 insertions(+) create mode 100644 src/dataframe/df.rs diff --git a/src/dataframe/df.rs b/src/dataframe/df.rs new file mode 100644 index 0000000..061be3f --- /dev/null +++ b/src/dataframe/df.rs @@ -0,0 +1,647 @@ +use crate::frame::{Frame, RowIndex}; +use crate::matrix::Matrix; +use std::collections::HashMap; +use std::fmt; + +/// Represents a column in a DataFrame, holding data of a specific type. +/// Each variant wraps a `Frame` where T is the data type. 
+#[derive(Debug, Clone, PartialEq)] +pub enum DataFrameColumn { + F64(Frame), + I64(Frame), + Bool(Frame), + String(Frame), + // Add more types as needed +} + +impl DataFrameColumn { + /// Returns the number of rows in the column. + pub fn rows(&self) -> usize { + match self { + DataFrameColumn::F64(f) => f.rows(), + DataFrameColumn::I64(f) => f.rows(), + DataFrameColumn::Bool(f) => f.rows(), + DataFrameColumn::String(f) => f.rows(), + } + } + + /// Returns the column name. + /// Panics if the internal frame has more than one column (which it shouldn't for a single DataFrameColumn). + pub fn name(&self) -> &str { + match self { + DataFrameColumn::F64(f) => &f.columns()[0], + DataFrameColumn::I64(f) => &f.columns()[0], + DataFrameColumn::Bool(f) => &f.columns()[0], + DataFrameColumn::String(f) => &f.columns()[0], + } + } + + /// Returns a reference to the underlying RowIndex. + pub fn index(&self) -> &RowIndex { + match self { + DataFrameColumn::F64(f) => f.index(), + DataFrameColumn::I64(f) => f.index(), + DataFrameColumn::Bool(f) => f.index(), + DataFrameColumn::String(f) => f.index(), + } + } +} + +/// A DataFrame capable of holding multiple data types. +/// +/// Internally, a DataFrame manages a collection of `Frame` instances, +/// each holding data of a single, homogeneous type. The logical column +/// order is maintained separately from the physical storage. +#[derive(Debug, Clone, PartialEq)] +pub struct DataFrame { + /// The logical order of column names. + column_names: Vec, + /// A map from column name to its corresponding DataFrameColumn (which wraps a Frame). + data: HashMap, + /// The common row index for all columns in the DataFrame. + index: RowIndex, + /// The number of rows in the DataFrame. + rows: usize, +} + +impl DataFrame { + /// Creates a new DataFrame from a vector of column data. + /// + /// Each inner `Vec` represents a column, and the outer `Vec` contains + /// these columns in the desired order. 
The `column_names` must match + /// the number of columns provided. + /// + /// All columns must have the same number of rows. + /// + /// # Arguments + /// * `columns` - A vector of `DataFrameColumn` instances, each representing a column. + /// * `column_names` - A vector of strings, providing names for each column. + /// * `index` - An optional `RowIndex` to be used for the DataFrame. If `None`, a default + /// `Range` index will be created. + /// + /// # Panics + /// * If `column_names` length does not match the number of `columns`. + /// * If columns have inconsistent row counts. + /// * If column names are duplicated. + /// * If the provided `index` length does not match the row count. + pub fn new( + columns: Vec, + column_names: Vec, + index: Option, + ) -> Self { + if columns.len() != column_names.len() { + panic!( + "DataFrame::new: column data count ({}) mismatch column names count ({})", + columns.len(), + column_names.len() + ); + } + + if columns.is_empty() { + return Self { + column_names: Vec::new(), + data: HashMap::new(), + index: index.unwrap_or(RowIndex::Range(0..0)), + rows: 0, + }; + } + + let num_rows = columns[0].rows(); + let common_index = index.unwrap_or(RowIndex::Range(0..num_rows)); + + if common_index.len() != num_rows { + panic!( + "DataFrame::new: provided index length ({}) mismatch column rows ({})", + common_index.len(), + num_rows + ); + } + + let mut data_map = HashMap::with_capacity(columns.len()); + let mut final_column_names = Vec::with_capacity(column_names.len()); + + for (i, col_name) in column_names.into_iter().enumerate() { + let col = columns[i].clone(); + + if col.rows() != num_rows { + panic!( + "DataFrame::new: column '{}' has inconsistent row count ({} vs {})", + col_name, + col.rows(), + num_rows + ); + } + + if col.index() != &common_index { + panic!( + "DataFrame::new: column '{}' has inconsistent index with common index", + col_name + ); + } + + if data_map.insert(col_name.clone(), col).is_some() { + 
panic!("DataFrame::new: duplicate column name: {}", col_name); + } + final_column_names.push(col_name); + } + + Self { + column_names: final_column_names, + data: data_map, + index: common_index, + rows: num_rows, + } + } + + /// Returns the number of rows in the DataFrame. + #[inline] + pub fn rows(&self) -> usize { + self.rows + } + + /// Returns the number of columns in the DataFrame. + #[inline] + pub fn cols(&self) -> usize { + self.column_names.len() + } + + /// Returns a slice of the column names in their logical order. + #[inline] + pub fn columns(&self) -> &[String] { + &self.column_names + } + + /// Returns a reference to the DataFrame's row index. + #[inline] + pub fn index(&self) -> &RowIndex { + &self.index + } + + /// Returns an immutable reference to a column by its name. + /// Panics if the column is not found. + pub fn column(&self, name: &str) -> &DataFrameColumn { + self.data + .get(name) + .unwrap_or_else(|| panic!("DataFrame::column: unknown column label: '{}'", name)) + } + + /// Returns a mutable reference to a column by its name. + /// Panics if the column is not found. + pub fn column_mut(&mut self, name: &str) -> &mut DataFrameColumn { + self.data + .get_mut(name) + .unwrap_or_else(|| panic!("DataFrame::column_mut: unknown column label: '{}'", name)) + } + + /// Adds a new column to the DataFrame. + /// Panics if a column with the same name already exists or if the new column's + /// row count or index does not match the DataFrame's. 
+ pub fn add_column(&mut self, name: String, column_data: DataFrameColumn) { + if self.data.contains_key(&name) { + panic!("DataFrame::add_column: duplicate column label: {}", name); + } + if column_data.rows() != self.rows { + panic!( + "DataFrame::add_column: new column '{}' has inconsistent row count ({} vs {})", + name, + column_data.rows(), + self.rows + ); + } + if column_data.index() != &self.index { + panic!( + "DataFrame::add_column: new column '{}' has inconsistent index with DataFrame's index", + name + ); + } + + self.column_names.push(name.clone()); + self.data.insert(name, column_data); + } + + /// Deletes a column by name and returns it. + /// Panics if the column name is not found. + pub fn delete_column(&mut self, name: &str) -> DataFrameColumn { + let removed_col = self.data.remove(name).unwrap_or_else(|| { + panic!("DataFrame::delete_column: unknown column label: '{}'", name) + }); + + // Remove from column_names vector and rebuild if necessary (to maintain order) + if let Some(pos) = self.column_names.iter().position(|n| n == name) { + self.column_names.remove(pos); + } + + removed_col + } + + /// Renames an existing column. + pub fn rename_column(&mut self, old_name: &str, new_name: String) { + if old_name == new_name { + return; // No change needed + } + if self.data.contains_key(&new_name) { + panic!( + "DataFrame::rename_column: new column name '{}' already exists", + new_name + ); + } + + let column = self.data.remove(old_name).unwrap_or_else(|| { + panic!( + "DataFrame::rename_column: unknown column label: '{}'", + old_name + ) + }); + self.data.insert(new_name.clone(), column); + if let Some(pos) = self.column_names.iter().position(|n| n == old_name) { + self.column_names[pos] = new_name; + } + } + + /// Sorts columns alphabetically by name. 
+ pub fn sort_columns(&mut self) { + self.column_names.sort(); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use chrono::NaiveDate; + + // Helper for dates + fn d(y: i32, m: u32, d: u32) -> NaiveDate { + NaiveDate::from_ymd_opt(y, m, d).unwrap() + } + + // Helper to create a simple f64 Frame + fn create_f64_frame(name: &str, data: Vec, index: Option) -> DataFrameColumn { + let rows = data.len(); + let matrix = Matrix::from_cols(vec![data]); + let frame_index = index.unwrap_or(RowIndex::Range(0..rows)); + DataFrameColumn::F64(Frame::new( + matrix, + vec![name.to_string()], + Some(frame_index), + )) + } + + // Helper to create a simple i64 Frame + fn create_i64_frame(name: &str, data: Vec, index: Option) -> DataFrameColumn { + let rows = data.len(); + let matrix = Matrix::from_cols(vec![data]); + let frame_index = index.unwrap_or(RowIndex::Range(0..rows)); + DataFrameColumn::I64(Frame::new( + matrix, + vec![name.to_string()], + Some(frame_index), + )) + } + + // Helper to create a simple String Frame + fn create_string_frame( + name: &str, + data: Vec, + index: Option, + ) -> DataFrameColumn { + let rows = data.len(); + let matrix = Matrix::from_cols(vec![data]); + let frame_index = index.unwrap_or(RowIndex::Range(0..rows)); + DataFrameColumn::String(Frame::new( + matrix, + vec![name.to_string()], + Some(frame_index), + )) + } + + #[test] + fn test_dataframe_new_basic() { + let col_a = create_f64_frame("A", vec![1.0, 2.0, 3.0], None); + let col_b = create_i64_frame("B", vec![4, 5, 6], None); + let col_c = create_string_frame( + "C", + vec!["x".to_string(), "y".to_string(), "z".to_string()], + None, + ); + + let df = DataFrame::new( + vec![col_a, col_b, col_c], + vec!["A".to_string(), "B".to_string(), "C".to_string()], + None, + ); + + assert_eq!(df.rows(), 3); + assert_eq!(df.cols(), 3); + assert_eq!(df.columns(), &["A", "B", "C"]); + assert_eq!(df.index(), &RowIndex::Range(0..3)); + + // Check column data + if let DataFrameColumn::F64(frame_a) = 
df.column("A") { + assert_eq!(frame_a["A"], vec![1.0, 2.0, 3.0]); + } else { + panic!("Column A is not f64"); + } + if let DataFrameColumn::I64(frame_b) = df.column("B") { + assert_eq!(frame_b["B"], vec![4, 5, 6]); + } else { + panic!("Column B is not i64"); + } + if let DataFrameColumn::String(frame_c) = df.column("C") { + assert_eq!(frame_c["C"], vec!["x", "y", "z"]); + } else { + panic!("Column C is not String"); + } + } + + #[test] + fn test_dataframe_new_with_int_index() { + let index = RowIndex::Int(vec![10, 20, 30]); + let col_a = create_f64_frame("A", vec![1.0, 2.0, 3.0], Some(index.clone())); + let col_b = create_i64_frame("B", vec![4, 5, 6], Some(index.clone())); + + let df = DataFrame::new( + vec![col_a, col_b], + vec!["A".to_string(), "B".to_string()], + Some(index.clone()), + ); + + assert_eq!(df.rows(), 3); + assert_eq!(df.cols(), 2); + assert_eq!(df.index(), &index); + } + + #[test] + fn test_dataframe_new_with_date_index() { + let index_vec = vec![d(2024, 1, 1), d(2024, 1, 2)]; + let index = RowIndex::Date(index_vec.clone()); + let col_a = create_f64_frame("A", vec![1.0, 2.0], Some(index.clone())); + let col_b = create_string_frame( + "B", + vec!["hello".to_string(), "world".to_string()], + Some(index.clone()), + ); + + let df = DataFrame::new( + vec![col_a, col_b], + vec!["A".to_string(), "B".to_string()], + Some(index.clone()), + ); + + assert_eq!(df.rows(), 2); + assert_eq!(df.cols(), 2); + assert_eq!(df.index(), &index); + } + + #[test] + #[should_panic(expected = "column data count (1) mismatch column names count (2)")] + fn test_dataframe_new_panic_col_count_mismatch() { + let col_a = create_f64_frame("A", vec![1.0], None); + DataFrame::new(vec![col_a], vec!["A".to_string(), "B".to_string()], None); + } + + #[test] + #[should_panic(expected = "column 'B' has inconsistent row count (2 vs 3)")] + fn test_dataframe_new_panic_inconsistent_rows() { + let col_a = create_f64_frame("A", vec![1.0, 2.0, 3.0], None); + let col_b = create_i64_frame("B", 
vec![4, 5], None); // Mismatch + DataFrame::new( + vec![col_a, col_b], + vec!["A".to_string(), "B".to_string()], + None, + ); + } + + #[test] + #[should_panic(expected = "duplicate column name: A")] + fn test_dataframe_new_panic_duplicate_col_name() { + let col_a1 = create_f64_frame("A", vec![1.0], None); + let col_a2 = create_i64_frame("A", vec![2], None); // Duplicate name + DataFrame::new( + vec![col_a1, col_a2], + vec!["A".to_string(), "A".to_string()], + None, + ); + } + + #[test] + #[should_panic(expected = "provided index length (2) mismatch column rows (3)")] + fn test_dataframe_new_panic_index_len_mismatch() { + let index = RowIndex::Int(vec![10, 20]); // Length 2 + let col_a = create_f64_frame("A", vec![1.0, 2.0, 3.0], None); // Length 3 + DataFrame::new(vec![col_a], vec!["A".to_string()], Some(index)); + } + + #[test] + #[should_panic(expected = "column 'A' has inconsistent index with common index")] + fn test_dataframe_new_panic_inconsistent_column_index() { + let common_index = RowIndex::Int(vec![10, 20]); + let col_a = create_f64_frame("A", vec![1.0, 2.0], None); // Uses Range index by default + DataFrame::new(vec![col_a], vec!["A".to_string()], Some(common_index)); + } + + #[test] + fn test_dataframe_add_column() { + let col_a = create_f64_frame("A", vec![1.0, 2.0], None); + let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); + + let new_col_b = create_i64_frame("B", vec![10, 20], None); + df.add_column("B".to_string(), new_col_b); + + assert_eq!(df.cols(), 2); + assert_eq!(df.columns(), &["A", "B"]); + if let DataFrameColumn::I64(frame_b) = df.column("B") { + assert_eq!(frame_b["B"], vec![10, 20]); + } else { + panic!("Column B is not i64"); + } + + let new_col_c = create_string_frame("C", vec!["foo".to_string(), "bar".to_string()], None); + df.add_column("C".to_string(), new_col_c); + assert_eq!(df.cols(), 3); + assert_eq!(df.columns(), &["A", "B", "C"]); + } + + #[test] + #[should_panic(expected = "duplicate column label: A")] + 
fn test_dataframe_add_column_panic_duplicate() { + let col_a = create_f64_frame("A", vec![1.0], None); + let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); + let new_col_a = create_i64_frame("A", vec![2], None); + df.add_column("A".to_string(), new_col_a); + } + + #[test] + #[should_panic(expected = "new column 'B' has inconsistent row count (1 vs 2)")] + fn test_dataframe_add_column_panic_inconsistent_rows() { + let col_a = create_f64_frame("A", vec![1.0, 2.0], None); + let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); + let new_col_b = create_i64_frame("B", vec![10], None); // Mismatch + df.add_column("B".to_string(), new_col_b); + } + + #[test] + #[should_panic(expected = "new column 'B' has inconsistent index with DataFrame's index")] + fn test_dataframe_add_column_panic_inconsistent_index() { + let df_index = RowIndex::Int(vec![1, 2]); + let col_a = create_f64_frame("A", vec![1.0, 2.0], Some(df_index.clone())); + let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], Some(df_index)); + + let new_col_b = create_i64_frame("B", vec![10, 20], None); // Uses Range index + df.add_column("B".to_string(), new_col_b); + } + + #[test] + fn test_dataframe_delete_column() { + let col_a = create_f64_frame("A", vec![1.0, 2.0], None); + let col_b = create_i64_frame("B", vec![4, 5], None); + let mut df = DataFrame::new( + vec![col_a, col_b], + vec!["A".to_string(), "B".to_string()], + None, + ); + + assert_eq!(df.cols(), 2); + assert_eq!(df.columns(), &["A", "B"]); + + let deleted_col = df.delete_column("A"); + assert_eq!(df.cols(), 1); + assert_eq!(df.columns(), &["B"]); + if let DataFrameColumn::F64(frame_a) = deleted_col { + assert_eq!(frame_a["A"], vec![1.0, 2.0]); + } else { + panic!("Deleted column was not f64"); + } + + // Delete the last column + df.delete_column("B"); + assert_eq!(df.cols(), 0); + assert!(df.columns().is_empty()); + assert!(df.data.is_empty()); + } + + #[test] + #[should_panic(expected = "unknown 
column label: 'C'")] + fn test_dataframe_delete_column_panic_unknown() { + let col_a = create_f64_frame("A", vec![1.0], None); + let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); + df.delete_column("C"); + } + + #[test] + fn test_dataframe_rename_column() { + let col_a = create_f64_frame("A", vec![1.0, 2.0], None); + let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); + + df.rename_column("A", "X".to_string()); + assert_eq!(df.columns(), &["X"]); + assert!(df.data.contains_key("X")); + assert!(!df.data.contains_key("A")); + if let DataFrameColumn::F64(frame_x) = df.column("X") { + assert_eq!(frame_x["X"], vec![1.0, 2.0]); + } else { + panic!("Column X is not f64"); + } + } + + #[test] + #[should_panic(expected = "unknown column label: 'Z'")] + fn test_dataframe_rename_column_panic_old_not_found() { + let col_a = create_f64_frame("A", vec![1.0], None); + let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); + df.rename_column("Z", "Y".to_string()); + } + + #[test] + #[should_panic(expected = "new column name 'B' already exists")] + fn test_dataframe_rename_column_panic_new_exists() { + let col_a = create_f64_frame("A", vec![1.0], None); + let col_b = create_i64_frame("B", vec![2], None); + let mut df = DataFrame::new( + vec![col_a, col_b], + vec!["A".to_string(), "B".to_string()], + None, + ); + df.rename_column("A", "B".to_string()); + } + + #[test] + fn test_dataframe_sort_columns() { + let col_c = create_f64_frame("C", vec![1.0, 2.0], None); + let col_a = create_i64_frame("A", vec![3, 4], None); + let col_b = create_string_frame("B", vec!["x".to_string(), "y".to_string()], None); + + let mut df = DataFrame::new( + vec![col_c, col_a, col_b], + vec!["C".to_string(), "A".to_string(), "B".to_string()], + None, + ); + + assert_eq!(df.columns(), &["C", "A", "B"]); + df.sort_columns(); + assert_eq!(df.columns(), &["A", "B", "C"]); + + // Verify data integrity after sort + if let DataFrameColumn::I64(frame_a) = 
df.column("A") { + assert_eq!(frame_a["A"], vec![3, 4]); + } + if let DataFrameColumn::String(frame_b) = df.column("B") { + assert_eq!(frame_b["B"], vec!["x", "y"]); + } + if let DataFrameColumn::F64(frame_c) = df.column("C") { + assert_eq!(frame_c["C"], vec![1.0, 2.0]); + } + } + + #[test] + fn test_dataframe_empty() { + let df = DataFrame::new(vec![], vec![], None); + assert_eq!(df.rows(), 0); + assert_eq!(df.cols(), 0); + assert!(df.columns().is_empty()); + assert!(df.data.is_empty()); + assert_eq!(df.index(), &RowIndex::Range(0..0)); + } + + #[test] + fn test_dataframe_column_accessors() { + let col_a = create_f64_frame("A", vec![1.0, 2.0], None); + let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); + + // Immutable access + let col_ref = df.column("A"); + if let DataFrameColumn::F64(frame_a) = col_ref { + assert_eq!(frame_a["A"], vec![1.0, 2.0]); + } else { + panic!("Column A is not f64"); + } + + // Mutable access + let col_mut_ref = df.column_mut("A"); + if let DataFrameColumn::F64(frame_a_mut) = col_mut_ref { + frame_a_mut["A"][0] = 99.0; + } else { + panic!("Column A is not f64"); + } + + // Verify mutation + if let DataFrameColumn::F64(frame_a) = df.column("A") { + assert_eq!(frame_a["A"], vec![99.0, 2.0]); + } + } + + #[test] + #[should_panic(expected = "unknown column label: 'Z'")] + fn test_dataframe_column_panic_unknown() { + let col_a = create_f64_frame("A", vec![1.0], None); + let df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); + df.column("Z"); + } + + #[test] + #[should_panic(expected = "unknown column label: 'Z'")] + fn test_dataframe_column_mut_panic_unknown() { + let col_a = create_f64_frame("A", vec![1.0], None); + let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); + df.column_mut("Z"); + } +} From ff4535c56bb91a61a9fc985c0038251485f35003 Mon Sep 17 00:00:00 2001 From: Palash Tyagi <23239946+Magnus167@users.noreply.github.com> Date: Sun, 22 Jun 2025 05:35:48 +0100 Subject: [PATCH 04/19] 
Implement column renaming in DataFrame, updating both logical names and underlying Frame references. --- src/dataframe/df.rs | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/dataframe/df.rs b/src/dataframe/df.rs index 061be3f..2574844 100644 --- a/src/dataframe/df.rs +++ b/src/dataframe/df.rs @@ -252,9 +252,28 @@ impl DataFrame { old_name ) }); - self.data.insert(new_name.clone(), column); + let new_name_clone = new_name.clone(); + self.data.insert(new_name, column); if let Some(pos) = self.column_names.iter().position(|n| n == old_name) { - self.column_names[pos] = new_name; + self.column_names[pos] = new_name_clone.clone(); + } + + // rename the column in the underlying Frame as well + if let Some(col) = self.data.get_mut(&new_name_clone) { + match col { + DataFrameColumn::F64(frame) => { + frame.rename(old_name, new_name_clone.clone()); + } + DataFrameColumn::I64(frame) => { + frame.rename(old_name, new_name_clone.clone()); + } + DataFrameColumn::String(frame) => { + frame.rename(old_name, new_name_clone.clone()); + } + DataFrameColumn::Bool(frame) => { + frame.rename(old_name, new_name_clone.clone()); + } + } } } From 01a132264f083ff682ee00df62eb05dbb0484e5a Mon Sep 17 00:00:00 2001 From: Palash Tyagi <23239946+Magnus167@users.noreply.github.com> Date: Sun, 22 Jun 2025 05:44:24 +0100 Subject: [PATCH 05/19] Remove unused imports and clean up test module in DataFrame implementation --- src/dataframe/df.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dataframe/df.rs b/src/dataframe/df.rs index 2574844..0a11159 100644 --- a/src/dataframe/df.rs +++ b/src/dataframe/df.rs @@ -1,7 +1,5 @@ use crate::frame::{Frame, RowIndex}; -use crate::matrix::Matrix; use std::collections::HashMap; -use std::fmt; /// Represents a column in a DataFrame, holding data of a specific type. /// Each variant wraps a `Frame` where T is the data type. 
@@ -286,6 +284,8 @@ impl DataFrame { #[cfg(test)] mod tests { use super::*; + use crate::frame::Frame; + use crate::matrix::Matrix; use chrono::NaiveDate; // Helper for dates From 57ed06f79b7c5c77477fbcadb064370a203de6de Mon Sep 17 00:00:00 2001 From: Palash Tyagi <23239946+Magnus167@users.noreply.github.com> Date: Sun, 22 Jun 2025 19:47:12 +0100 Subject: [PATCH 06/19] Reimplemented dataframe class with TypedFrame interface --- src/dataframe/df.rs | 868 +++++++++++++++++++++++++++++++------------- 1 file changed, 613 insertions(+), 255 deletions(-) diff --git a/src/dataframe/df.rs b/src/dataframe/df.rs index 0a11159..4e997a8 100644 --- a/src/dataframe/df.rs +++ b/src/dataframe/df.rs @@ -1,10 +1,11 @@ use crate::frame::{Frame, RowIndex}; +use crate::matrix::Matrix; use std::collections::HashMap; -/// Represents a column in a DataFrame, holding data of a specific type. -/// Each variant wraps a `Frame` where T is the data type. +/// Represents a typed Frame that can hold multiple columns of a single type. +/// This will be the underlying storage for DataFrame. #[derive(Debug, Clone, PartialEq)] -pub enum DataFrameColumn { +pub enum TypedFrame { F64(Frame), I64(Frame), Bool(Frame), @@ -12,54 +13,190 @@ pub enum DataFrameColumn { // Add more types as needed } -impl DataFrameColumn { - /// Returns the number of rows in the column. +impl TypedFrame { pub fn rows(&self) -> usize { match self { - DataFrameColumn::F64(f) => f.rows(), - DataFrameColumn::I64(f) => f.rows(), - DataFrameColumn::Bool(f) => f.rows(), - DataFrameColumn::String(f) => f.rows(), + TypedFrame::F64(f) => f.rows(), + TypedFrame::I64(f) => f.rows(), + TypedFrame::Bool(f) => f.rows(), + TypedFrame::String(f) => f.rows(), } } - /// Returns the column name. - /// Panics if the internal frame has more than one column (which it shouldn't for a single DataFrameColumn). 
- pub fn name(&self) -> &str { + pub fn cols(&self) -> usize { match self { - DataFrameColumn::F64(f) => &f.columns()[0], - DataFrameColumn::I64(f) => &f.columns()[0], - DataFrameColumn::Bool(f) => &f.columns()[0], - DataFrameColumn::String(f) => &f.columns()[0], + TypedFrame::F64(f) => f.cols(), + TypedFrame::I64(f) => f.cols(), + TypedFrame::Bool(f) => f.cols(), + TypedFrame::String(f) => f.cols(), + } + } + + pub fn columns(&self) -> &[String] { + match self { + TypedFrame::F64(f) => f.columns(), + TypedFrame::I64(f) => f.columns(), + TypedFrame::Bool(f) => f.columns(), + TypedFrame::String(f) => f.columns(), } } - /// Returns a reference to the underlying RowIndex. pub fn index(&self) -> &RowIndex { match self { - DataFrameColumn::F64(f) => f.index(), - DataFrameColumn::I64(f) => f.index(), - DataFrameColumn::Bool(f) => f.index(), - DataFrameColumn::String(f) => f.index(), + TypedFrame::F64(f) => f.index(), + TypedFrame::I64(f) => f.index(), + TypedFrame::Bool(f) => f.index(), + TypedFrame::String(f) => f.index(), + } + } + + /// Returns an immutable slice of the specified column's data by name. + /// Panics if the column name is not found. + pub fn column(&self, name: &str) -> &[f64] { + match self { + TypedFrame::F64(f) => f.column(name), + _ => panic!("Column type mismatch"), + } + } + + /// Returns a mutable slice of the specified column's data by name. + /// Panics if the column name is not found. + pub fn column_mut(&mut self, name: &str) -> &mut [f64] { + match self { + TypedFrame::F64(f) => f.column_mut(name), + _ => panic!("Column type mismatch"), + } + } +} + +/// Represents a view of a single column within a DataFrame. +/// It borrows data from an underlying TypedFrame. 
+#[derive(Debug, PartialEq)] +pub enum DataFrameColumn<'a> { + F64(&'a [f64]), + I64(&'a [i64]), + Bool(&'a [bool]), + String(&'a [String]), +} + +impl<'a> DataFrameColumn<'a> { + pub fn len(&self) -> usize { + match self { + DataFrameColumn::F64(s) => s.len(), + DataFrameColumn::I64(s) => s.len(), + DataFrameColumn::Bool(s) => s.len(), + DataFrameColumn::String(s) => s.len(), + } + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + // Add methods to get specific typed slices + pub fn as_f64(&self) -> Option<&'a [f64]> { + if let DataFrameColumn::F64(s) = self { + Some(s) + } else { + None + } + } + pub fn as_i64(&self) -> Option<&'a [i64]> { + if let DataFrameColumn::I64(s) = self { + Some(s) + } else { + None + } + } + pub fn as_bool(&self) -> Option<&'a [bool]> { + if let DataFrameColumn::Bool(s) = self { + Some(s) + } else { + None + } + } + pub fn as_string(&self) -> Option<&'a [String]> { + if let DataFrameColumn::String(s) = self { + Some(s) + } else { + None + } + } +} + +/// Represents a mutable view of a single column within a DataFrame. 
+#[derive(Debug)] +pub enum DataFrameColumnMut<'a> { + F64(&'a mut [f64]), + I64(&'a mut [i64]), + Bool(&'a mut [bool]), + String(&'a mut [String]), +} + +impl<'a> DataFrameColumnMut<'a> { + pub fn len(&self) -> usize { + match self { + DataFrameColumnMut::F64(s) => s.len(), + DataFrameColumnMut::I64(s) => s.len(), + DataFrameColumnMut::Bool(s) => s.len(), + DataFrameColumnMut::String(s) => s.len(), + } + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + // Add methods to get specific typed mutable slices + pub fn as_f64_mut(&mut self) -> Option<&mut [f64]> { + if let DataFrameColumnMut::F64(s) = self { + Some(s) + } else { + None + } + } + pub fn as_i64_mut(&mut self) -> Option<&mut [i64]> { + if let DataFrameColumnMut::I64(s) = self { + Some(s) + } else { + None + } + } + pub fn as_bool_mut(&mut self) -> Option<&mut [bool]> { + if let DataFrameColumnMut::Bool(s) = self { + Some(s) + } else { + None + } + } + pub fn as_string_mut(&mut self) -> Option<&mut [String]> { + if let DataFrameColumnMut::String(s) = self { + Some(s) + } else { + None } } } /// A DataFrame capable of holding multiple data types. /// -/// Internally, a DataFrame manages a collection of `Frame` instances, +/// Internally, a DataFrame manages a collection of `TypedFrame` instances, /// each holding data of a single, homogeneous type. The logical column /// order is maintained separately from the physical storage. #[derive(Debug, Clone, PartialEq)] pub struct DataFrame { /// The logical order of column names. column_names: Vec, - /// A map from column name to its corresponding DataFrameColumn (which wraps a Frame). - data: HashMap, + /// A map from a unique ID to a TypedFrame (the underlying storage). + subframes: HashMap, + /// A map from logical column name to its location: (subframe_id, column_name_in_subframe). + column_locations: HashMap, /// The common row index for all columns in the DataFrame. index: RowIndex, /// The number of rows in the DataFrame. 
rows: usize, + /// Counter for generating unique subframe IDs. + next_subframe_id: usize, } impl DataFrame { @@ -72,7 +209,7 @@ impl DataFrame { /// All columns must have the same number of rows. /// /// # Arguments - /// * `columns` - A vector of `DataFrameColumn` instances, each representing a column. + /// * `columns` - A vector of `TypedFrame` instances, each representing a column. /// * `column_names` - A vector of strings, providing names for each column. /// * `index` - An optional `RowIndex` to be used for the DataFrame. If `None`, a default /// `Range` index will be created. @@ -83,24 +220,18 @@ impl DataFrame { /// * If column names are duplicated. /// * If the provided `index` length does not match the row count. pub fn new( - columns: Vec, + columns: Vec, // Changed from DataFrameColumn to TypedFrame column_names: Vec, index: Option, ) -> Self { - if columns.len() != column_names.len() { - panic!( - "DataFrame::new: column data count ({}) mismatch column names count ({})", - columns.len(), - column_names.len() - ); - } - if columns.is_empty() { return Self { column_names: Vec::new(), - data: HashMap::new(), + subframes: HashMap::new(), + column_locations: HashMap::new(), index: index.unwrap_or(RowIndex::Range(0..0)), rows: 0, + next_subframe_id: 0, }; } @@ -115,39 +246,58 @@ impl DataFrame { ); } - let mut data_map = HashMap::with_capacity(columns.len()); - let mut final_column_names = Vec::with_capacity(column_names.len()); + let mut subframes = HashMap::new(); + let mut column_locations = HashMap::new(); + let mut next_subframe_id = 0; - for (i, col_name) in column_names.into_iter().enumerate() { - let col = columns[i].clone(); - - if col.rows() != num_rows { + // Process each provided TypedFrame + for typed_frame in columns { + if typed_frame.rows() != num_rows { panic!( - "DataFrame::new: column '{}' has inconsistent row count ({} vs {})", - col_name, - col.rows(), + "DataFrame::new: TypedFrame has inconsistent row count ({} vs {})", + 
typed_frame.rows(), num_rows ); } + if typed_frame.index() != &common_index { + panic!("DataFrame::new: TypedFrame has inconsistent index with common index",); + } - if col.index() != &common_index { - panic!( - "DataFrame::new: column '{}' has inconsistent index with common index", - col_name + let subframe_id = next_subframe_id; + next_subframe_id += 1; + + for col_name_in_subframe in typed_frame.columns() { + if column_locations.contains_key(col_name_in_subframe) { + panic!( + "DataFrame::new: duplicate column name: {}", + col_name_in_subframe + ); + } + column_locations.insert( + col_name_in_subframe.clone(), + (subframe_id, col_name_in_subframe.clone()), ); } + subframes.insert(subframe_id, typed_frame); + } - if data_map.insert(col_name.clone(), col).is_some() { - panic!("DataFrame::new: duplicate column name: {}", col_name); + // Ensure all column_names provided are actually present in the subframes + for name in &column_names { + if !column_locations.contains_key(name) { + panic!( + "DataFrame::new: column name '{}' not found in provided TypedFrames", + name + ); } - final_column_names.push(col_name); } Self { - column_names: final_column_names, - data: data_map, + column_names, // This now represents the logical order of ALL columns from all subframes + subframes, + column_locations, index: common_index, rows: num_rows, + next_subframe_id, } } @@ -175,27 +325,77 @@ impl DataFrame { &self.index } - /// Returns an immutable reference to a column by its name. + /// Returns an immutable view to a column by its name. /// Panics if the column is not found. 
- pub fn column(&self, name: &str) -> &DataFrameColumn { - self.data + pub fn column(&self, name: &str) -> DataFrameColumn<'_> { + let (subframe_id, col_in_subframe_name) = self + .column_locations .get(name) - .unwrap_or_else(|| panic!("DataFrame::column: unknown column label: '{}'", name)) + .unwrap_or_else(|| panic!("DataFrame::column: unknown column label: '{}'", name)); + + let subframe = self.subframes.get(subframe_id).unwrap_or_else(|| { + panic!( + "DataFrame::column: internal error, subframe ID {} not found", + subframe_id + ) + }); + + match subframe { + TypedFrame::F64(f) => DataFrameColumn::F64(f.column(col_in_subframe_name)), + TypedFrame::I64(f) => DataFrameColumn::I64(f.column(col_in_subframe_name)), + TypedFrame::Bool(f) => DataFrameColumn::Bool(f.column(col_in_subframe_name)), + TypedFrame::String(f) => DataFrameColumn::String(f.column(col_in_subframe_name)), + } } - /// Returns a mutable reference to a column by its name. + /// Returns a mutable view to a column by its name. /// Panics if the column is not found. - pub fn column_mut(&mut self, name: &str) -> &mut DataFrameColumn { - self.data - .get_mut(name) - .unwrap_or_else(|| panic!("DataFrame::column_mut: unknown column label: '{}'", name)) + pub fn column_mut(&mut self, name: &str) -> DataFrameColumnMut<'_> { + let (subframe_id, col_in_subframe_name) = self + .column_locations + .get(name) + .unwrap_or_else(|| panic!("DataFrame::column_mut: unknown column label: '{}'", name)); + + // We need to get a mutable reference to the subframe. + // This requires a bit of care because we're borrowing `self.column_locations` + // and then `self.subframes` mutably. + // To avoid double-borrowing, we can get the subframe_id and col_in_subframe_name first, + // then use them to access the subframe mutably. 
+ let subframe_id_copy = *subframe_id; + let col_in_subframe_name_copy = col_in_subframe_name.clone(); + + let subframe = self + .subframes + .get_mut(&subframe_id_copy) + .unwrap_or_else(|| { + panic!( + "DataFrame::column_mut: internal error, subframe ID {} not found", + subframe_id_copy + ) + }); + + match subframe { + TypedFrame::F64(f) => DataFrameColumnMut::F64(f.column_mut(&col_in_subframe_name_copy)), + TypedFrame::I64(f) => DataFrameColumnMut::I64(f.column_mut(&col_in_subframe_name_copy)), + TypedFrame::Bool(f) => { + DataFrameColumnMut::Bool(f.column_mut(&col_in_subframe_name_copy)) + } + TypedFrame::String(f) => { + DataFrameColumnMut::String(f.column_mut(&col_in_subframe_name_copy)) + } + } } /// Adds a new column to the DataFrame. + /// This involves either adding it to an existing TypedFrame if types match, + /// or creating a new TypedFrame. /// Panics if a column with the same name already exists or if the new column's /// row count or index does not match the DataFrame's. - pub fn add_column(&mut self, name: String, column_data: DataFrameColumn) { - if self.data.contains_key(&name) { + /// Adds a new column to the DataFrame. + /// The `column_data` must be a `TypedFrame` containing exactly one column, + /// and its name must match the `name` parameter. 
+ pub fn add_column(&mut self, name: String, column_data: TypedFrame) { + if self.column_locations.contains_key(&name) { panic!("DataFrame::add_column: duplicate column label: {}", name); } if column_data.rows() != self.rows { @@ -212,24 +412,85 @@ impl DataFrame { name ); } + // Ensure the provided TypedFrame contains exactly one column, and its name matches `name` + if column_data.cols() != 1 || column_data.columns()[0] != name { + panic!( + "DataFrame::add_column: provided TypedFrame must contain exactly one column named '{}'", + name + ); + } - self.column_names.push(name.clone()); - self.data.insert(name, column_data); + let subframe_id = self.next_subframe_id; + self.next_subframe_id += 1; + + self.subframes.insert(subframe_id, column_data); + self.column_locations + .insert(name.clone(), (subframe_id, name.clone())); + + self.column_names.push(name); } - /// Deletes a column by name and returns it. + /// Deletes a column by name and returns its data as a new single-column TypedFrame. /// Panics if the column name is not found. 
- pub fn delete_column(&mut self, name: &str) -> DataFrameColumn { - let removed_col = self.data.remove(name).unwrap_or_else(|| { - panic!("DataFrame::delete_column: unknown column label: '{}'", name) - }); + pub fn delete_column(&mut self, name: &str) -> TypedFrame { + let (subframe_id, col_in_subframe_name) = + self.column_locations.remove(name).unwrap_or_else(|| { + panic!("DataFrame::delete_column: unknown column label: '{}'", name) + }); - // Remove from column_names vector and rebuild if necessary (to maintain order) + // Remove from logical column names if let Some(pos) = self.column_names.iter().position(|n| n == name) { self.column_names.remove(pos); } - removed_col + let mut subframe = self.subframes.get_mut(&subframe_id).unwrap_or_else(|| { + panic!( + "DataFrame::delete_column: internal error, subframe ID {} not found", + subframe_id + ) + }); + + let deleted_data_frame = match subframe { + TypedFrame::F64(f) => { + let data = f.delete_column(&col_in_subframe_name); + TypedFrame::F64(Frame::new( + Matrix::from_cols(vec![data]), + vec![col_in_subframe_name.clone()], + Some(f.index().clone()), + )) + } + TypedFrame::I64(f) => { + let data = f.delete_column(&col_in_subframe_name); + TypedFrame::I64(Frame::new( + Matrix::from_cols(vec![data]), + vec![col_in_subframe_name.clone()], + Some(f.index().clone()), + )) + } + TypedFrame::Bool(f) => { + let data = f.delete_column(&col_in_subframe_name); + TypedFrame::Bool(Frame::new( + Matrix::from_cols(vec![data]), + vec![col_in_subframe_name.clone()], + Some(f.index().clone()), + )) + } + TypedFrame::String(f) => { + let data = f.delete_column(&col_in_subframe_name); + TypedFrame::String(Frame::new( + Matrix::from_cols(vec![data]), + vec![col_in_subframe_name.clone()], + Some(f.index().clone()), + )) + } + }; + + // If the subframe becomes empty after deletion, remove it from the map + if subframe.cols() == 0 { + self.subframes.remove(&subframe_id); + } + + deleted_data_frame } /// Renames an existing column. 
@@ -237,41 +498,43 @@ impl DataFrame { if old_name == new_name { return; // No change needed } - if self.data.contains_key(&new_name) { + if self.column_locations.contains_key(&new_name) { panic!( "DataFrame::rename_column: new column name '{}' already exists", new_name ); } - let column = self.data.remove(old_name).unwrap_or_else(|| { - panic!( - "DataFrame::rename_column: unknown column label: '{}'", - old_name - ) - }); - let new_name_clone = new_name.clone(); - self.data.insert(new_name, column); + let (subframe_id, col_in_subframe_name) = + self.column_locations.remove(old_name).unwrap_or_else(|| { + panic!( + "DataFrame::rename_column: unknown column label: '{}'", + old_name + ) + }); + + // Update the column_locations map + self.column_locations + .insert(new_name.clone(), (subframe_id, new_name.clone())); + + // Update the logical column_names vector if let Some(pos) = self.column_names.iter().position(|n| n == old_name) { - self.column_names[pos] = new_name_clone.clone(); + self.column_names[pos] = new_name.clone(); } - // rename the column in the underlying Frame as well - if let Some(col) = self.data.get_mut(&new_name_clone) { - match col { - DataFrameColumn::F64(frame) => { - frame.rename(old_name, new_name_clone.clone()); - } - DataFrameColumn::I64(frame) => { - frame.rename(old_name, new_name_clone.clone()); - } - DataFrameColumn::String(frame) => { - frame.rename(old_name, new_name_clone.clone()); - } - DataFrameColumn::Bool(frame) => { - frame.rename(old_name, new_name_clone.clone()); - } - } + // Rename the column in the underlying TypedFrame + let subframe = self.subframes.get_mut(&subframe_id).unwrap_or_else(|| { + panic!( + "DataFrame::rename_column: internal error, subframe ID {} not found", + subframe_id + ) + }); + + match subframe { + TypedFrame::F64(f) => f.rename(&col_in_subframe_name, new_name), + TypedFrame::I64(f) => f.rename(&col_in_subframe_name, new_name), + TypedFrame::Bool(f) => f.rename(&col_in_subframe_name, new_name), + 
TypedFrame::String(f) => f.rename(&col_in_subframe_name, new_name), } } @@ -293,51 +556,67 @@ mod tests { NaiveDate::from_ymd_opt(y, m, d).unwrap() } - // Helper to create a simple f64 Frame - fn create_f64_frame(name: &str, data: Vec, index: Option) -> DataFrameColumn { + // Helper to create a simple f64 TypedFrame + fn create_f64_typed_frame(name: &str, data: Vec, index: Option) -> TypedFrame { let rows = data.len(); let matrix = Matrix::from_cols(vec![data]); let frame_index = index.unwrap_or(RowIndex::Range(0..rows)); - DataFrameColumn::F64(Frame::new( + TypedFrame::F64(Frame::new( matrix, vec![name.to_string()], Some(frame_index), )) } - // Helper to create a simple i64 Frame - fn create_i64_frame(name: &str, data: Vec, index: Option) -> DataFrameColumn { + // Helper to create a simple i64 TypedFrame + fn create_i64_typed_frame(name: &str, data: Vec, index: Option) -> TypedFrame { let rows = data.len(); let matrix = Matrix::from_cols(vec![data]); let frame_index = index.unwrap_or(RowIndex::Range(0..rows)); - DataFrameColumn::I64(Frame::new( + TypedFrame::I64(Frame::new( matrix, vec![name.to_string()], Some(frame_index), )) } - // Helper to create a simple String Frame - fn create_string_frame( + // Helper to create a simple String TypedFrame + fn create_string_typed_frame( name: &str, data: Vec, index: Option, - ) -> DataFrameColumn { + ) -> TypedFrame { let rows = data.len(); let matrix = Matrix::from_cols(vec![data]); let frame_index = index.unwrap_or(RowIndex::Range(0..rows)); - DataFrameColumn::String(Frame::new( + TypedFrame::String(Frame::new( matrix, vec![name.to_string()], Some(frame_index), )) } + // Helper to create a multi-column f64 TypedFrame + fn create_multi_f64_typed_frame( + names: Vec<&str>, + data: Vec>, + index: Option, + ) -> TypedFrame { + let rows = data[0].len(); + let matrix = Matrix::from_cols(data); + let frame_index = index.unwrap_or(RowIndex::Range(0..rows)); + TypedFrame::F64(Frame::new( + matrix, + names.into_iter().map(|s| 
s.to_string()).collect(), + Some(frame_index), + )) + } + #[test] fn test_dataframe_new_basic() { - let col_a = create_f64_frame("A", vec![1.0, 2.0, 3.0], None); - let col_b = create_i64_frame("B", vec![4, 5, 6], None); - let col_c = create_string_frame( + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0], None); + let col_b = create_i64_typed_frame("B", vec![4, 5, 6], None); + let col_c = create_string_typed_frame( "C", vec!["x".to_string(), "y".to_string(), "z".to_string()], None, @@ -354,29 +633,69 @@ mod tests { assert_eq!(df.columns(), &["A", "B", "C"]); assert_eq!(df.index(), &RowIndex::Range(0..3)); - // Check column data - if let DataFrameColumn::F64(frame_a) = df.column("A") { - assert_eq!(frame_a["A"], vec![1.0, 2.0, 3.0]); + // Check column data using the new DataFrameColumn view + if let DataFrameColumn::F64(slice_a) = df.column("A") { + assert_eq!(slice_a, &[1.0, 2.0, 3.0]); } else { panic!("Column A is not f64"); } - if let DataFrameColumn::I64(frame_b) = df.column("B") { - assert_eq!(frame_b["B"], vec![4, 5, 6]); + if let DataFrameColumn::I64(slice_b) = df.column("B") { + assert_eq!(slice_b, &[4, 5, 6]); } else { panic!("Column B is not i64"); } - if let DataFrameColumn::String(frame_c) = df.column("C") { - assert_eq!(frame_c["C"], vec!["x", "y", "z"]); + if let DataFrameColumn::String(slice_c) = df.column("C") { + assert_eq!( + slice_c, + &["x".to_string(), "y".to_string(), "z".to_string()] + ); } else { panic!("Column C is not String"); } } + #[test] + fn test_dataframe_new_with_multi_column_subframe() { + let multi_f64_frame = create_multi_f64_typed_frame( + vec!["X", "Y"], + vec![vec![1.0, 2.0], vec![10.0, 20.0]], + None, + ); + let single_string_frame = + create_string_typed_frame("Z", vec!["a".to_string(), "b".to_string()], None); + + let df = DataFrame::new( + vec![multi_f64_frame, single_string_frame], + vec!["X".to_string(), "Y".to_string(), "Z".to_string()], + None, + ); + + assert_eq!(df.rows(), 2); + assert_eq!(df.cols(), 3); + 
assert_eq!(df.columns(), &["X", "Y", "Z"]); + + if let DataFrameColumn::F64(slice_x) = df.column("X") { + assert_eq!(slice_x, &[1.0, 2.0]); + } else { + panic!("Column X is not f64"); + } + if let DataFrameColumn::F64(slice_y) = df.column("Y") { + assert_eq!(slice_y, &[10.0, 20.0]); + } else { + panic!("Column Y is not f64"); + } + if let DataFrameColumn::String(slice_z) = df.column("Z") { + assert_eq!(slice_z, &["a".to_string(), "b".to_string()]); + } else { + panic!("Column Z is not String"); + } + } + #[test] fn test_dataframe_new_with_int_index() { let index = RowIndex::Int(vec![10, 20, 30]); - let col_a = create_f64_frame("A", vec![1.0, 2.0, 3.0], Some(index.clone())); - let col_b = create_i64_frame("B", vec![4, 5, 6], Some(index.clone())); + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0], Some(index.clone())); + let col_b = create_i64_typed_frame("B", vec![4, 5, 6], Some(index.clone())); let df = DataFrame::new( vec![col_a, col_b], @@ -393,8 +712,8 @@ mod tests { fn test_dataframe_new_with_date_index() { let index_vec = vec![d(2024, 1, 1), d(2024, 1, 2)]; let index = RowIndex::Date(index_vec.clone()); - let col_a = create_f64_frame("A", vec![1.0, 2.0], Some(index.clone())); - let col_b = create_string_frame( + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], Some(index.clone())); + let col_b = create_string_typed_frame( "B", vec!["hello".to_string(), "world".to_string()], Some(index.clone()), @@ -412,17 +731,17 @@ mod tests { } #[test] - #[should_panic(expected = "column data count (1) mismatch column names count (2)")] - fn test_dataframe_new_panic_col_count_mismatch() { - let col_a = create_f64_frame("A", vec![1.0], None); + #[should_panic(expected = "column name 'B' not found in provided TypedFrames")] + fn test_dataframe_new_panic_col_name_not_found_in_subframes() { + let col_a = create_f64_typed_frame("A", vec![1.0], None); DataFrame::new(vec![col_a], vec!["A".to_string(), "B".to_string()], None); } #[test] - #[should_panic(expected = 
"column 'B' has inconsistent row count (2 vs 3)")] + #[should_panic(expected = "TypedFrame has inconsistent row count (2 vs 3)")] fn test_dataframe_new_panic_inconsistent_rows() { - let col_a = create_f64_frame("A", vec![1.0, 2.0, 3.0], None); - let col_b = create_i64_frame("B", vec![4, 5], None); // Mismatch + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0], None); + let col_b = create_i64_typed_frame("B", vec![4, 5], None); // Mismatch DataFrame::new( vec![col_a, col_b], vec!["A".to_string(), "B".to_string()], @@ -433,8 +752,8 @@ mod tests { #[test] #[should_panic(expected = "duplicate column name: A")] fn test_dataframe_new_panic_duplicate_col_name() { - let col_a1 = create_f64_frame("A", vec![1.0], None); - let col_a2 = create_i64_frame("A", vec![2], None); // Duplicate name + let col_a1 = create_f64_typed_frame("A", vec![1.0], None); + let col_a2 = create_i64_typed_frame("A", vec![2], None); // Duplicate name DataFrame::new( vec![col_a1, col_a2], vec!["A".to_string(), "A".to_string()], @@ -446,55 +765,63 @@ mod tests { #[should_panic(expected = "provided index length (2) mismatch column rows (3)")] fn test_dataframe_new_panic_index_len_mismatch() { let index = RowIndex::Int(vec![10, 20]); // Length 2 - let col_a = create_f64_frame("A", vec![1.0, 2.0, 3.0], None); // Length 3 + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0], None); // Length 3 DataFrame::new(vec![col_a], vec!["A".to_string()], Some(index)); } #[test] - #[should_panic(expected = "column 'A' has inconsistent index with common index")] + #[should_panic(expected = "TypedFrame has inconsistent index with common index")] fn test_dataframe_new_panic_inconsistent_column_index() { let common_index = RowIndex::Int(vec![10, 20]); - let col_a = create_f64_frame("A", vec![1.0, 2.0], None); // Uses Range index by default + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); // Uses Range index by default DataFrame::new(vec![col_a], vec!["A".to_string()], 
Some(common_index)); } #[test] fn test_dataframe_add_column() { - let col_a = create_f64_frame("A", vec![1.0, 2.0], None); + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - let new_col_b = create_i64_frame("B", vec![10, 20], None); - df.add_column("B".to_string(), new_col_b); + let new_col_b_typed_frame = create_i64_typed_frame("B", vec![10, 20], None); + df.add_column("B".to_string(), new_col_b_typed_frame); assert_eq!(df.cols(), 2); assert_eq!(df.columns(), &["A", "B"]); - if let DataFrameColumn::I64(frame_b) = df.column("B") { - assert_eq!(frame_b["B"], vec![10, 20]); + if let DataFrameColumn::I64(slice_b) = df.column("B") { + assert_eq!(slice_b, &[10, 20]); } else { panic!("Column B is not i64"); } - let new_col_c = create_string_frame("C", vec!["foo".to_string(), "bar".to_string()], None); - df.add_column("C".to_string(), new_col_c); + let new_col_c_typed_frame = + create_string_typed_frame("C", vec!["foo".to_string(), "bar".to_string()], None); + df.add_column("C".to_string(), new_col_c_typed_frame); assert_eq!(df.cols(), 3); assert_eq!(df.columns(), &["A", "B", "C"]); + if let DataFrameColumn::String(slice_c) = df.column("C") { + assert_eq!(slice_c, &["foo".to_string(), "bar".to_string()]); + } else { + panic!("Column C is not String"); + } } #[test] - #[should_panic(expected = "duplicate column label: A")] - fn test_dataframe_add_column_panic_duplicate() { - let col_a = create_f64_frame("A", vec![1.0], None); + #[should_panic(expected = "duplicate column label: B")] + fn test_dataframe_add_column_panic_duplicate_name() { + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - let new_col_a = create_i64_frame("A", vec![2], None); - df.add_column("A".to_string(), new_col_a); + let new_col_b = create_i64_typed_frame("B", vec![10, 20], None); + df.add_column("B".to_string(), new_col_b); + let 
another_col_b = create_i64_typed_frame("B", vec![30, 40], None); + df.add_column("B".to_string(), another_col_b); } #[test] - #[should_panic(expected = "new column 'B' has inconsistent row count (1 vs 2)")] + #[should_panic(expected = "new column 'B' has inconsistent row count (3 vs 2)")] fn test_dataframe_add_column_panic_inconsistent_rows() { - let col_a = create_f64_frame("A", vec![1.0, 2.0], None); + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - let new_col_b = create_i64_frame("B", vec![10], None); // Mismatch + let new_col_b = create_i64_typed_frame("B", vec![10, 20, 30], None); // Mismatch df.add_column("B".to_string(), new_col_b); } @@ -502,79 +829,157 @@ mod tests { #[should_panic(expected = "new column 'B' has inconsistent index with DataFrame's index")] fn test_dataframe_add_column_panic_inconsistent_index() { let df_index = RowIndex::Int(vec![1, 2]); - let col_a = create_f64_frame("A", vec![1.0, 2.0], Some(df_index.clone())); + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], Some(df_index.clone())); let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], Some(df_index)); - let new_col_b = create_i64_frame("B", vec![10, 20], None); // Uses Range index + let new_col_b_index = RowIndex::Int(vec![10, 20]); // Different index + let new_col_b = create_i64_typed_frame("B", vec![10, 20], Some(new_col_b_index)); df.add_column("B".to_string(), new_col_b); } + #[test] + #[should_panic(expected = "provided TypedFrame must contain exactly one column named 'B'")] + fn test_dataframe_add_column_panic_multi_column_typedframe() { + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); + let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); + + let multi_col_b = create_multi_f64_typed_frame( + vec!["B", "C"], + vec![vec![10.0, 20.0], vec![30.0, 40.0]], + None, + ); + df.add_column("B".to_string(), multi_col_b); + } + + #[test] + 
#[should_panic(expected = "provided TypedFrame must contain exactly one column named 'B'")] + fn test_dataframe_add_column_panic_typedframe_name_mismatch() { + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); + let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); + + let col_c = create_f64_typed_frame("C", vec![10.0, 20.0], None); + df.add_column("B".to_string(), col_c); // Adding column named "B" but TypedFrame is named "C" + } + #[test] fn test_dataframe_delete_column() { - let col_a = create_f64_frame("A", vec![1.0, 2.0], None); - let col_b = create_i64_frame("B", vec![4, 5], None); + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0], None); + let col_b = create_i64_typed_frame("B", vec![4, 5, 6], None); + let col_c = create_string_typed_frame( + "C", + vec!["x".to_string(), "y".to_string(), "z".to_string()], + None, + ); + + let mut df = DataFrame::new( + vec![col_a, col_b, col_c], + vec!["A".to_string(), "B".to_string(), "C".to_string()], + None, + ); + + let deleted_col_b = df.delete_column("B"); + + assert_eq!(df.cols(), 2); + assert_eq!(df.columns(), &["A", "C"]); + assert_eq!(df.rows(), 3); + + if let TypedFrame::I64(frame_b) = deleted_col_b { + assert_eq!(frame_b.columns(), &["B"]); + assert_eq!(frame_b.column("B"), &[4, 5, 6]); + } else { + panic!("Deleted column B is not i64 TypedFrame"); + } + + // Verify remaining columns + if let DataFrameColumn::F64(slice_a) = df.column("A") { + assert_eq!(slice_a, &[1.0, 2.0, 3.0]); + } else { + panic!("Column A is not f64"); + } + if let DataFrameColumn::String(slice_c) = df.column("C") { + assert_eq!( + slice_c, + &["x".to_string(), "y".to_string(), "z".to_string()] + ); + } else { + panic!("Column C is not String"); + } + + // Delete another column, ensuring subframe is removed if empty + let deleted_col_a = df.delete_column("A"); + assert_eq!(df.cols(), 1); + assert_eq!(df.columns(), &["C"]); + assert_eq!(df.subframes.len(), 1); // Only the String subframe should remain 
+ + if let TypedFrame::F64(frame_a) = deleted_col_a { + assert_eq!(frame_a.columns(), &["A"]); + assert_eq!(frame_a.column("A"), &[1.0, 2.0, 3.0]); + } else { + panic!("Deleted column A is not f64 TypedFrame"); + } + } + + #[test] + #[should_panic(expected = "unknown column label: 'D'")] + fn test_dataframe_delete_column_panic_not_found() { + let col_a = create_f64_typed_frame("A", vec![1.0], None); + let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); + df.delete_column("D"); + } + + #[test] + fn test_dataframe_rename_column() { + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); + let col_b = create_i64_typed_frame("B", vec![10, 20], None); let mut df = DataFrame::new( vec![col_a, col_b], vec!["A".to_string(), "B".to_string()], None, ); + df.rename_column("A", "Alpha".to_string()); + assert_eq!(df.cols(), 2); - assert_eq!(df.columns(), &["A", "B"]); - - let deleted_col = df.delete_column("A"); - assert_eq!(df.cols(), 1); - assert_eq!(df.columns(), &["B"]); - if let DataFrameColumn::F64(frame_a) = deleted_col { - assert_eq!(frame_a["A"], vec![1.0, 2.0]); + assert_eq!(df.columns(), &["Alpha", "B"]); + if let DataFrameColumn::F64(slice_alpha) = df.column("Alpha") { + assert_eq!(slice_alpha, &[1.0, 2.0]); } else { - panic!("Deleted column was not f64"); + panic!("Column Alpha is not f64"); } - // Delete the last column - df.delete_column("B"); - assert_eq!(df.cols(), 0); - assert!(df.columns().is_empty()); - assert!(df.data.is_empty()); - } - - #[test] - #[should_panic(expected = "unknown column label: 'C'")] - fn test_dataframe_delete_column_panic_unknown() { - let col_a = create_f64_frame("A", vec![1.0], None); - let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - df.delete_column("C"); - } - - #[test] - fn test_dataframe_rename_column() { - let col_a = create_f64_frame("A", vec![1.0, 2.0], None); - let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - - df.rename_column("A", "X".to_string()); 
- assert_eq!(df.columns(), &["X"]); - assert!(df.data.contains_key("X")); - assert!(!df.data.contains_key("A")); - if let DataFrameColumn::F64(frame_x) = df.column("X") { - assert_eq!(frame_x["X"], vec![1.0, 2.0]); + // Rename a column in a multi-column subframe + let multi_f64_frame = create_multi_f64_typed_frame( + vec!["X", "Y"], + vec![vec![1.0, 2.0], vec![10.0, 20.0]], + None, + ); + let mut df_multi = DataFrame::new( + vec![multi_f64_frame], + vec!["X".to_string(), "Y".to_string()], + None, + ); + df_multi.rename_column("X", "Ex".to_string()); + assert_eq!(df_multi.columns(), &["Ex", "Y"]); + if let DataFrameColumn::F64(slice_ex) = df_multi.column("Ex") { + assert_eq!(slice_ex, &[1.0, 2.0]); } else { - panic!("Column X is not f64"); + panic!("Column Ex is not f64"); } } #[test] - #[should_panic(expected = "unknown column label: 'Z'")] - fn test_dataframe_rename_column_panic_old_not_found() { - let col_a = create_f64_frame("A", vec![1.0], None); + #[should_panic(expected = "unknown column label: 'D'")] + fn test_dataframe_rename_column_panic_old_name_not_found() { + let col_a = create_f64_typed_frame("A", vec![1.0], None); let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - df.rename_column("Z", "Y".to_string()); + df.rename_column("D", "E".to_string()); } #[test] #[should_panic(expected = "new column name 'B' already exists")] - fn test_dataframe_rename_column_panic_new_exists() { - let col_a = create_f64_frame("A", vec![1.0], None); - let col_b = create_i64_frame("B", vec![2], None); + fn test_dataframe_rename_column_panic_new_name_exists() { + let col_a = create_f64_typed_frame("A", vec![1.0], None); + let col_b = create_i64_typed_frame("B", vec![2], None); let mut df = DataFrame::new( vec![col_a, col_b], vec!["A".to_string(), "B".to_string()], @@ -585,9 +990,9 @@ mod tests { #[test] fn test_dataframe_sort_columns() { - let col_c = create_f64_frame("C", vec![1.0, 2.0], None); - let col_a = create_i64_frame("A", vec![3, 4], None); - let 
col_b = create_string_frame("B", vec!["x".to_string(), "y".to_string()], None); + let col_c = create_f64_typed_frame("C", vec![1.0, 2.0], None); + let col_a = create_i64_typed_frame("A", vec![10, 20], None); + let col_b = create_string_typed_frame("B", vec!["x".to_string(), "y".to_string()], None); let mut df = DataFrame::new( vec![col_c, col_a, col_b], @@ -600,67 +1005,20 @@ mod tests { assert_eq!(df.columns(), &["A", "B", "C"]); // Verify data integrity after sort - if let DataFrameColumn::I64(frame_a) = df.column("A") { - assert_eq!(frame_a["A"], vec![3, 4]); - } - if let DataFrameColumn::String(frame_b) = df.column("B") { - assert_eq!(frame_b["B"], vec!["x", "y"]); - } - if let DataFrameColumn::F64(frame_c) = df.column("C") { - assert_eq!(frame_c["C"], vec![1.0, 2.0]); - } - } - - #[test] - fn test_dataframe_empty() { - let df = DataFrame::new(vec![], vec![], None); - assert_eq!(df.rows(), 0); - assert_eq!(df.cols(), 0); - assert!(df.columns().is_empty()); - assert!(df.data.is_empty()); - assert_eq!(df.index(), &RowIndex::Range(0..0)); - } - - #[test] - fn test_dataframe_column_accessors() { - let col_a = create_f64_frame("A", vec![1.0, 2.0], None); - let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - - // Immutable access - let col_ref = df.column("A"); - if let DataFrameColumn::F64(frame_a) = col_ref { - assert_eq!(frame_a["A"], vec![1.0, 2.0]); + if let DataFrameColumn::I64(slice_a) = df.column("A") { + assert_eq!(slice_a, &[10, 20]); } else { - panic!("Column A is not f64"); + panic!("Column A is not i64"); } - - // Mutable access - let col_mut_ref = df.column_mut("A"); - if let DataFrameColumn::F64(frame_a_mut) = col_mut_ref { - frame_a_mut["A"][0] = 99.0; + if let DataFrameColumn::String(slice_b) = df.column("B") { + assert_eq!(slice_b, &["x".to_string(), "y".to_string()]); } else { - panic!("Column A is not f64"); + panic!("Column B is not String"); } - - // Verify mutation - if let DataFrameColumn::F64(frame_a) = df.column("A") { - 
assert_eq!(frame_a["A"], vec![99.0, 2.0]); + if let DataFrameColumn::F64(slice_c) = df.column("C") { + assert_eq!(slice_c, &[1.0, 2.0]); + } else { + panic!("Column C is not f64"); } } - - #[test] - #[should_panic(expected = "unknown column label: 'Z'")] - fn test_dataframe_column_panic_unknown() { - let col_a = create_f64_frame("A", vec![1.0], None); - let df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - df.column("Z"); - } - - #[test] - #[should_panic(expected = "unknown column label: 'Z'")] - fn test_dataframe_column_mut_panic_unknown() { - let col_a = create_f64_frame("A", vec![1.0], None); - let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - df.column_mut("Z"); - } } From 2607d9c3b07bed9e15ec01235138c10595ef79ef Mon Sep 17 00:00:00 2001 From: Palash Tyagi <23239946+Magnus167@users.noreply.github.com> Date: Sun, 22 Jun 2025 21:15:12 +0100 Subject: [PATCH 07/19] Add pub use statement for DataFrame, DataFrameColumn, and TypedFrame in mod.rs --- src/dataframe/mod.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/dataframe/mod.rs b/src/dataframe/mod.rs index 8fb4cb2..3dd102e 100644 --- a/src/dataframe/mod.rs +++ b/src/dataframe/mod.rs @@ -1,2 +1,4 @@ //! This module provides the DataFrame structure for handling tabular data with mixed types. 
pub mod df; + +pub use df::{DataFrame, DataFrameColumn, TypedFrame}; From 58acea84671c447a1ed8430fbd408eed960d7eab Mon Sep 17 00:00:00 2001 From: Palash Tyagi <23239946+Magnus167@users.noreply.github.com> Date: Sun, 22 Jun 2025 21:16:06 +0100 Subject: [PATCH 08/19] Add DataFrame usage examples to README.md --- README.md | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/README.md b/README.md index 55b9311..1ddaa75 100644 --- a/README.md +++ b/README.md @@ -98,3 +98,101 @@ assert!(check); ``` + +--- + +## DataFrame Usage Example + +```rust +use rustframe::{ + dataframe::{DataFrame, TypedFrame, DataFrameColumn}, + frame::{Frame, RowIndex}, + matrix::Matrix, +}; + +// Helper to create a simple f64 TypedFrame (similar to test helpers) +fn create_f64_typed_frame(name: &str, data: Vec, index: Option) -> TypedFrame { + let rows = data.len(); + let matrix = Matrix::from_cols(vec![data]); + let frame_index = index.unwrap_or(RowIndex::Range(0..rows)); + TypedFrame::F64(Frame::new( + matrix, + vec![name.to_string()], + Some(frame_index), + )) +} + +// Helper to create a simple i64 TypedFrame +fn create_i64_typed_frame(name: &str, data: Vec, index: Option) -> TypedFrame { + let rows = data.len(); + let matrix = Matrix::from_cols(vec![data]); + let frame_index = index.unwrap_or(RowIndex::Range(0..rows)); + TypedFrame::I64(Frame::new( + matrix, + vec![name.to_string()], + Some(frame_index), + )) +} + +// Helper to create a simple String TypedFrame +fn create_string_typed_frame( + name: &str, + data: Vec, + index: Option, +) -> TypedFrame { + let rows = data.len(); + let matrix = Matrix::from_cols(vec![data]); + let frame_index = index.unwrap_or(RowIndex::Range(0..rows)); + TypedFrame::String(Frame::new( + matrix, + vec![name.to_string()], + Some(frame_index), + )) +} + +fn main() { + // 1. 
Create a DataFrame with different data types + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0], None); + let col_b = create_i64_typed_frame("B", vec![10, 20, 30], None); + let col_c = create_string_typed_frame( + "C", + vec!["apple".to_string(), "banana".to_string(), "cherry".to_string()], + None, + ); + + let mut df = DataFrame::new( + vec![col_a, col_b, col_c], + vec!["A".to_string(), "B".to_string(), "C".to_string()], + None, + ); + + println!("Initial DataFrame:\n{:?}", df); + println!("Columns: {:?}", df.columns()); + println!("Rows: {}", df.rows()); + + // 2. Accessing columns + if let DataFrameColumn::F64(col_a_data) = df.column("A") { + println!("Column 'A' (f64): {:?}", col_a_data); + } + + if let DataFrameColumn::String(col_c_data) = df.column("C") { + println!("Column 'C' (String): {:?}", col_c_data); + } + + // 3. Add a new column + let new_col_d = create_f64_typed_frame("D", vec![100.0, 200.0, 300.0], None); + df.add_column("D".to_string(), new_col_d); + println!("\nDataFrame after adding column 'D':\n{:?}", df); + println!("Columns after add: {:?}", df.columns()); + + // 4. Rename a column + df.rename_column("A", "Alpha".to_string()); + println!("\nDataFrame after renaming 'A' to 'Alpha':\n{:?}", df); + println!("Columns after rename: {:?}", df.columns()); + + // 5. 
Delete a column + let _deleted_col_b = df.delete_column("B"); + println!("\nDataFrame after deleting column 'B':\n{:?}", df); + println!("Columns after delete: {:?}", df.columns()); +} +``` From 8b6f16236a6cde333b3b40533a1e5115ab7a0319 Mon Sep 17 00:00:00 2001 From: Palash Tyagi <23239946+Magnus167@users.noreply.github.com> Date: Tue, 1 Jul 2025 23:26:57 +0100 Subject: [PATCH 09/19] Refactor TypedFrame methods using macros for common functionality and improve column accessors --- src/dataframe/df.rs | 698 +++++++++++++++++++++++--------------------- 1 file changed, 359 insertions(+), 339 deletions(-) diff --git a/src/dataframe/df.rs b/src/dataframe/df.rs index 4e997a8..0455ede 100644 --- a/src/dataframe/df.rs +++ b/src/dataframe/df.rs @@ -13,60 +13,56 @@ pub enum TypedFrame { // Add more types as needed } +macro_rules! impl_typed_frame_common_methods { + ($($method:ident $(($($arg:ident: $arg_ty:ty),*))? -> $ret_ty:ty),*) => { + impl TypedFrame { + $( + pub fn $method(&self $(, $($arg: $arg_ty),*)?) -> $ret_ty { + match self { + TypedFrame::F64(f) => f.$method($($($arg),*)?), + TypedFrame::I64(f) => f.$method($($($arg),*)?), + TypedFrame::Bool(f) => f.$method($($($arg),*)?), + TypedFrame::String(f) => f.$method($($($arg),*)?), + } + } + )* + } + }; +} + +impl_typed_frame_common_methods! { + rows -> usize, + cols -> usize, + columns -> &[String], + index -> &RowIndex +} + +macro_rules! 
impl_typed_frame_column_accessors { + ($fn_name:ident, $ret_type:ident, $frame_method:ident) => { + pub fn $fn_name(&self, name: &str) -> $ret_type<'_> { + match self { + TypedFrame::F64(f) => $ret_type::F64(f.$frame_method(name)), + TypedFrame::I64(f) => $ret_type::I64(f.$frame_method(name)), + TypedFrame::Bool(f) => $ret_type::Bool(f.$frame_method(name)), + TypedFrame::String(f) => $ret_type::String(f.$frame_method(name)), + } + } + }; + ($fn_name:ident, $ret_type:ident, $frame_method:ident, mut) => { + pub fn $fn_name(&mut self, name: &str) -> $ret_type<'_> { + match self { + TypedFrame::F64(f) => $ret_type::F64(f.$frame_method(name)), + TypedFrame::I64(f) => $ret_type::I64(f.$frame_method(name)), + TypedFrame::Bool(f) => $ret_type::Bool(f.$frame_method(name)), + TypedFrame::String(f) => $ret_type::String(f.$frame_method(name)), + } + } + }; +} + impl TypedFrame { - pub fn rows(&self) -> usize { - match self { - TypedFrame::F64(f) => f.rows(), - TypedFrame::I64(f) => f.rows(), - TypedFrame::Bool(f) => f.rows(), - TypedFrame::String(f) => f.rows(), - } - } - - pub fn cols(&self) -> usize { - match self { - TypedFrame::F64(f) => f.cols(), - TypedFrame::I64(f) => f.cols(), - TypedFrame::Bool(f) => f.cols(), - TypedFrame::String(f) => f.cols(), - } - } - - pub fn columns(&self) -> &[String] { - match self { - TypedFrame::F64(f) => f.columns(), - TypedFrame::I64(f) => f.columns(), - TypedFrame::Bool(f) => f.columns(), - TypedFrame::String(f) => f.columns(), - } - } - - pub fn index(&self) -> &RowIndex { - match self { - TypedFrame::F64(f) => f.index(), - TypedFrame::I64(f) => f.index(), - TypedFrame::Bool(f) => f.index(), - TypedFrame::String(f) => f.index(), - } - } - - /// Returns an immutable slice of the specified column's data by name. - /// Panics if the column name is not found. 
- pub fn column(&self, name: &str) -> &[f64] { - match self { - TypedFrame::F64(f) => f.column(name), - _ => panic!("Column type mismatch"), - } - } - - /// Returns a mutable slice of the specified column's data by name. - /// Panics if the column name is not found. - pub fn column_mut(&mut self, name: &str) -> &mut [f64] { - match self { - TypedFrame::F64(f) => f.column_mut(name), - _ => panic!("Column type mismatch"), - } - } + impl_typed_frame_column_accessors!(column, DataFrameColumn, column); + impl_typed_frame_column_accessors!(column_mut, DataFrameColumnMut, column_mut, mut); } /// Represents a view of a single column within a DataFrame. @@ -443,45 +439,36 @@ impl DataFrame { self.column_names.remove(pos); } - let mut subframe = self.subframes.get_mut(&subframe_id).unwrap_or_else(|| { + let subframe = self.subframes.get_mut(&subframe_id).unwrap_or_else(|| { panic!( "DataFrame::delete_column: internal error, subframe ID {} not found", subframe_id ) }); + macro_rules! delete_column_from_typed_frame { + ($frame_type:ident, $frame_variant:ident, $f:ident, $col_name:expr) => {{ + let data = $f.delete_column(&$col_name); + TypedFrame::$frame_variant(Frame::new( + Matrix::from_cols(vec![data]), + vec![$col_name.clone()], + Some($f.index().clone()), + )) + }}; + } + let deleted_data_frame = match subframe { TypedFrame::F64(f) => { - let data = f.delete_column(&col_in_subframe_name); - TypedFrame::F64(Frame::new( - Matrix::from_cols(vec![data]), - vec![col_in_subframe_name.clone()], - Some(f.index().clone()), - )) + delete_column_from_typed_frame!(f64, F64, f, col_in_subframe_name) } TypedFrame::I64(f) => { - let data = f.delete_column(&col_in_subframe_name); - TypedFrame::I64(Frame::new( - Matrix::from_cols(vec![data]), - vec![col_in_subframe_name.clone()], - Some(f.index().clone()), - )) + delete_column_from_typed_frame!(i64, I64, f, col_in_subframe_name) } TypedFrame::Bool(f) => { - let data = f.delete_column(&col_in_subframe_name); - TypedFrame::Bool(Frame::new( 
- Matrix::from_cols(vec![data]), - vec![col_in_subframe_name.clone()], - Some(f.index().clone()), - )) + delete_column_from_typed_frame!(bool, Bool, f, col_in_subframe_name) } TypedFrame::String(f) => { - let data = f.delete_column(&col_in_subframe_name); - TypedFrame::String(Frame::new( - Matrix::from_cols(vec![data]), - vec![col_in_subframe_name.clone()], - Some(f.index().clone()), - )) + delete_column_from_typed_frame!(String, String, f, col_in_subframe_name) } }; @@ -654,6 +641,300 @@ mod tests { } } + #[test] + fn test_dataframe_rows_cols_columns_index() { + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0], None); + let col_b = create_i64_typed_frame("B", vec![4, 5, 6], None); + let df = DataFrame::new( + vec![col_a, col_b], + vec!["A".to_string(), "B".to_string()], + None, + ); + + assert_eq!(df.rows(), 3); + assert_eq!(df.cols(), 2); + assert_eq!(df.columns(), &["A", "B"]); + assert_eq!(df.index(), &RowIndex::Range(0..3)); + + let empty_df = DataFrame::new(vec![], vec![], None); + assert_eq!(empty_df.rows(), 0); + assert_eq!(empty_df.cols(), 0); + assert_eq!(empty_df.columns(), &[] as &[String]); + assert_eq!(empty_df.index(), &RowIndex::Range(0..0)); + } + + #[test] + fn test_dataframe_column_access() { + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); + let col_b = create_i64_typed_frame("B", vec![10, 20], None); + let col_c = + create_string_typed_frame("C", vec!["foo".to_string(), "bar".to_string()], None); + + let df = DataFrame::new( + vec![col_a, col_b, col_c], + vec!["A".to_string(), "B".to_string(), "C".to_string()], + None, + ); + + // Test f64 column + let col_a_view = df.column("A"); + assert!(col_a_view.as_f64().is_some()); + assert_eq!(col_a_view.as_f64().unwrap(), &[1.0, 2.0]); + assert!(col_a_view.as_i64().is_none()); + assert_eq!(col_a_view.len(), 2); + assert!(!col_a_view.is_empty()); + + // Test i64 column + let col_b_view = df.column("B"); + assert!(col_b_view.as_i64().is_some()); + 
assert_eq!(col_b_view.as_i64().unwrap(), &[10, 20]); + + // Test String column + let col_c_view = df.column("C"); + assert!(col_c_view.as_string().is_some()); + assert_eq!( + col_c_view.as_string().unwrap(), + &["foo".to_string(), "bar".to_string()] + ); + } + + #[test] + #[should_panic(expected = "unknown column label: 'D'")] + fn test_dataframe_column_panic_unknown() { + let col_a = create_f64_typed_frame("A", vec![1.0], None); + let df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); + df.column("D"); + } + + #[test] + fn test_dataframe_column_mut_access() { + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); + let col_b = create_i64_typed_frame("B", vec![10, 20], None); + let col_c = + create_string_typed_frame("C", vec!["foo".to_string(), "bar".to_string()], None); + + let mut df = DataFrame::new( + vec![col_a, col_b, col_c], + vec!["A".to_string(), "B".to_string(), "C".to_string()], + None, + ); + + // Test f64 column mut + if let DataFrameColumnMut::F64(slice_a_mut) = df.column_mut("A") { + slice_a_mut[0] = 100.0; + } else { + panic!("Column A is not f64 mut"); + } + assert_eq!(df.column("A").as_f64().unwrap(), &[100.0, 2.0]); + + // Test i64 column mut + if let DataFrameColumnMut::I64(slice_b_mut) = df.column_mut("B") { + slice_b_mut[1] = 200; + } else { + panic!("Column B is not i64 mut"); + } + assert_eq!(df.column("B").as_i64().unwrap(), &[10, 200]); + + // Test String column mut + if let DataFrameColumnMut::String(slice_c_mut) = df.column_mut("C") { + slice_c_mut[0] = "baz".to_string(); + } else { + panic!("Column C is not String mut"); + } + assert_eq!( + df.column("C").as_string().unwrap(), + &["baz".to_string(), "bar".to_string()] + ); + } + + #[test] + #[should_panic(expected = "unknown column label: 'D'")] + fn test_dataframe_column_mut_panic_unknown() { + let col_a = create_f64_typed_frame("A", vec![1.0], None); + let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); + df.column_mut("D"); + } + + #[test] + 
fn test_dataframe_add_column() { + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); + let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); + + let new_col_b = create_i64_typed_frame("B", vec![10, 20], None); + df.add_column("B".to_string(), new_col_b); + + assert_eq!(df.rows(), 2); + assert_eq!(df.cols(), 2); + assert_eq!(df.columns(), &["A", "B"]); + assert_eq!(df.column("A").as_f64().unwrap(), &[1.0, 2.0]); + assert_eq!(df.column("B").as_i64().unwrap(), &[10, 20]); + + let new_col_c = + create_string_typed_frame("C", vec!["x".to_string(), "y".to_string()], None); + df.add_column("C".to_string(), new_col_c); + assert_eq!(df.cols(), 3); + assert_eq!(df.columns(), &["A", "B", "C"]); + assert_eq!( + df.column("C").as_string().unwrap(), + &["x".to_string(), "y".to_string()] + ); + } + + #[test] + #[should_panic(expected = "duplicate column label: B")] + fn test_dataframe_add_column_panic_duplicate_name() { + let col_a = create_f64_typed_frame("A", vec![1.0], None); + let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); + let new_col_b = create_i64_typed_frame("B", vec![10], None); + df.add_column("B".to_string(), new_col_b); + let another_col_b = create_i64_typed_frame("B", vec![20], None); + df.add_column("B".to_string(), another_col_b); + } + + #[test] + #[should_panic(expected = "new column 'B' has inconsistent row count (1 vs 2)")] + fn test_dataframe_add_column_panic_inconsistent_rows() { + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); + let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); + let new_col_b = create_i64_typed_frame("B", vec![10], None); // Mismatch + df.add_column("B".to_string(), new_col_b); + } + + #[test] + #[should_panic(expected = "provided TypedFrame must contain exactly one column named 'B'")] + fn test_dataframe_add_column_panic_multi_column_typedframe() { + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); + let mut df = 
DataFrame::new(vec![col_a], vec!["A".to_string()], None); + let multi_col_b = create_multi_f64_typed_frame( + vec!["B", "C"], + vec![vec![10.0, 20.0], vec![30.0, 40.0]], + None, + ); + df.add_column("B".to_string(), multi_col_b); + } + + #[test] + fn test_dataframe_delete_column() { + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); + let col_b = create_i64_typed_frame("B", vec![10, 20], None); + let col_c = create_string_typed_frame("C", vec!["x".to_string(), "y".to_string()], None); + + let mut df = DataFrame::new( + vec![col_a, col_b, col_c], + vec!["A".to_string(), "B".to_string(), "C".to_string()], + None, + ); + + let deleted_col_b = df.delete_column("B"); + assert_eq!(df.cols(), 2); + assert_eq!(df.columns(), &["A", "C"]); + assert_eq!(df.column("A").as_f64().unwrap(), &[1.0, 2.0]); + assert_eq!( + df.column("C").as_string().unwrap(), + &["x".to_string(), "y".to_string()] + ); + + if let TypedFrame::I64(frame_b) = deleted_col_b { + assert_eq!(frame_b.column("B"), &[10, 20]); + } else { + panic!("Deleted column B is not i64 TypedFrame"); + } + + let deleted_col_a = df.delete_column("A"); + assert_eq!(df.cols(), 1); + assert_eq!(df.columns(), &["C"]); + assert_eq!( + df.column("C").as_string().unwrap(), + &["x".to_string(), "y".to_string()] + ); + if let TypedFrame::F64(frame_a) = deleted_col_a { + assert_eq!(frame_a.column("A"), &[1.0, 2.0]); + } else { + panic!("Deleted column A is not f64 TypedFrame"); + } + + let deleted_col_c = df.delete_column("C"); + assert_eq!(df.cols(), 0); + assert_eq!(df.columns(), &[] as &[String]); + if let TypedFrame::String(frame_c) = deleted_col_c { + assert_eq!(frame_c.column("C"), &["x".to_string(), "y".to_string()]); + } else { + panic!("Deleted column C is not String TypedFrame"); + } + } + + #[test] + #[should_panic(expected = "unknown column label: 'D'")] + fn test_dataframe_delete_column_panic_unknown() { + let col_a = create_f64_typed_frame("A", vec![1.0], None); + let mut df = DataFrame::new(vec![col_a], 
vec!["A".to_string()], None); + df.delete_column("D"); + } + + #[test] + fn test_dataframe_rename_column() { + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); + let col_b = create_i64_typed_frame("B", vec![10, 20], None); + let mut df = DataFrame::new( + vec![col_a, col_b], + vec!["A".to_string(), "B".to_string()], + None, + ); + + df.rename_column("A", "Alpha".to_string()); + assert_eq!(df.columns(), &["Alpha", "B"]); + assert_eq!(df.column("Alpha").as_f64().unwrap(), &[1.0, 2.0]); + assert!(!df.column_locations.contains_key("A")); + assert!(df.column_locations.contains_key("Alpha")); + + df.rename_column("B", "Beta".to_string()); + assert_eq!(df.columns(), &["Alpha", "Beta"]); + assert_eq!(df.column("Beta").as_i64().unwrap(), &[10, 20]); + } + + #[test] + #[should_panic(expected = "unknown column label: 'C'")] + fn test_dataframe_rename_column_panic_unknown_old_name() { + let col_a = create_f64_typed_frame("A", vec![1.0], None); + let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); + df.rename_column("C", "D".to_string()); + } + + #[test] + #[should_panic(expected = "new column name 'B' already exists")] + fn test_dataframe_rename_column_panic_new_name_exists() { + let col_a = create_f64_typed_frame("A", vec![1.0], None); + let col_b = create_i64_typed_frame("B", vec![10], None); + let mut df = DataFrame::new( + vec![col_a, col_b], + vec!["A".to_string(), "B".to_string()], + None, + ); + df.rename_column("A", "B".to_string()); + } + + #[test] + fn test_dataframe_sort_columns() { + let col_c = create_f64_typed_frame("C", vec![1.0], None); + let col_a = create_i64_typed_frame("A", vec![2], None); + let col_b = create_string_typed_frame("B", vec!["x".to_string()], None); + + let mut df = DataFrame::new( + vec![col_c, col_a, col_b], + vec!["C".to_string(), "A".to_string(), "B".to_string()], + None, + ); + + assert_eq!(df.columns(), &["C", "A", "B"]); + df.sort_columns(); + assert_eq!(df.columns(), &["A", "B", "C"]); + + // Ensure 
data integrity after sort + assert_eq!(df.column("A").as_i64().unwrap(), &[2]); + assert_eq!(df.column("B").as_string().unwrap(), &["x".to_string()]); + assert_eq!(df.column("C").as_f64().unwrap(), &[1.0]); + } + #[test] fn test_dataframe_new_with_multi_column_subframe() { let multi_f64_frame = create_multi_f64_typed_frame( @@ -760,265 +1041,4 @@ mod tests { None, ); } - - #[test] - #[should_panic(expected = "provided index length (2) mismatch column rows (3)")] - fn test_dataframe_new_panic_index_len_mismatch() { - let index = RowIndex::Int(vec![10, 20]); // Length 2 - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0], None); // Length 3 - DataFrame::new(vec![col_a], vec!["A".to_string()], Some(index)); - } - - #[test] - #[should_panic(expected = "TypedFrame has inconsistent index with common index")] - fn test_dataframe_new_panic_inconsistent_column_index() { - let common_index = RowIndex::Int(vec![10, 20]); - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); // Uses Range index by default - DataFrame::new(vec![col_a], vec!["A".to_string()], Some(common_index)); - } - - #[test] - fn test_dataframe_add_column() { - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); - let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - - let new_col_b_typed_frame = create_i64_typed_frame("B", vec![10, 20], None); - df.add_column("B".to_string(), new_col_b_typed_frame); - - assert_eq!(df.cols(), 2); - assert_eq!(df.columns(), &["A", "B"]); - if let DataFrameColumn::I64(slice_b) = df.column("B") { - assert_eq!(slice_b, &[10, 20]); - } else { - panic!("Column B is not i64"); - } - - let new_col_c_typed_frame = - create_string_typed_frame("C", vec!["foo".to_string(), "bar".to_string()], None); - df.add_column("C".to_string(), new_col_c_typed_frame); - assert_eq!(df.cols(), 3); - assert_eq!(df.columns(), &["A", "B", "C"]); - if let DataFrameColumn::String(slice_c) = df.column("C") { - assert_eq!(slice_c, &["foo".to_string(), 
"bar".to_string()]); - } else { - panic!("Column C is not String"); - } - } - - #[test] - #[should_panic(expected = "duplicate column label: B")] - fn test_dataframe_add_column_panic_duplicate_name() { - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); - let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - let new_col_b = create_i64_typed_frame("B", vec![10, 20], None); - df.add_column("B".to_string(), new_col_b); - let another_col_b = create_i64_typed_frame("B", vec![30, 40], None); - df.add_column("B".to_string(), another_col_b); - } - - #[test] - #[should_panic(expected = "new column 'B' has inconsistent row count (3 vs 2)")] - fn test_dataframe_add_column_panic_inconsistent_rows() { - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); - let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - let new_col_b = create_i64_typed_frame("B", vec![10, 20, 30], None); // Mismatch - df.add_column("B".to_string(), new_col_b); - } - - #[test] - #[should_panic(expected = "new column 'B' has inconsistent index with DataFrame's index")] - fn test_dataframe_add_column_panic_inconsistent_index() { - let df_index = RowIndex::Int(vec![1, 2]); - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], Some(df_index.clone())); - let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], Some(df_index)); - - let new_col_b_index = RowIndex::Int(vec![10, 20]); // Different index - let new_col_b = create_i64_typed_frame("B", vec![10, 20], Some(new_col_b_index)); - df.add_column("B".to_string(), new_col_b); - } - - #[test] - #[should_panic(expected = "provided TypedFrame must contain exactly one column named 'B'")] - fn test_dataframe_add_column_panic_multi_column_typedframe() { - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); - let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - - let multi_col_b = create_multi_f64_typed_frame( - vec!["B", "C"], - vec![vec![10.0, 20.0], vec![30.0, 
40.0]], - None, - ); - df.add_column("B".to_string(), multi_col_b); - } - - #[test] - #[should_panic(expected = "provided TypedFrame must contain exactly one column named 'B'")] - fn test_dataframe_add_column_panic_typedframe_name_mismatch() { - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); - let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - - let col_c = create_f64_typed_frame("C", vec![10.0, 20.0], None); - df.add_column("B".to_string(), col_c); // Adding column named "B" but TypedFrame is named "C" - } - - #[test] - fn test_dataframe_delete_column() { - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0], None); - let col_b = create_i64_typed_frame("B", vec![4, 5, 6], None); - let col_c = create_string_typed_frame( - "C", - vec!["x".to_string(), "y".to_string(), "z".to_string()], - None, - ); - - let mut df = DataFrame::new( - vec![col_a, col_b, col_c], - vec!["A".to_string(), "B".to_string(), "C".to_string()], - None, - ); - - let deleted_col_b = df.delete_column("B"); - - assert_eq!(df.cols(), 2); - assert_eq!(df.columns(), &["A", "C"]); - assert_eq!(df.rows(), 3); - - if let TypedFrame::I64(frame_b) = deleted_col_b { - assert_eq!(frame_b.columns(), &["B"]); - assert_eq!(frame_b.column("B"), &[4, 5, 6]); - } else { - panic!("Deleted column B is not i64 TypedFrame"); - } - - // Verify remaining columns - if let DataFrameColumn::F64(slice_a) = df.column("A") { - assert_eq!(slice_a, &[1.0, 2.0, 3.0]); - } else { - panic!("Column A is not f64"); - } - if let DataFrameColumn::String(slice_c) = df.column("C") { - assert_eq!( - slice_c, - &["x".to_string(), "y".to_string(), "z".to_string()] - ); - } else { - panic!("Column C is not String"); - } - - // Delete another column, ensuring subframe is removed if empty - let deleted_col_a = df.delete_column("A"); - assert_eq!(df.cols(), 1); - assert_eq!(df.columns(), &["C"]); - assert_eq!(df.subframes.len(), 1); // Only the String subframe should remain - - if let 
TypedFrame::F64(frame_a) = deleted_col_a { - assert_eq!(frame_a.columns(), &["A"]); - assert_eq!(frame_a.column("A"), &[1.0, 2.0, 3.0]); - } else { - panic!("Deleted column A is not f64 TypedFrame"); - } - } - - #[test] - #[should_panic(expected = "unknown column label: 'D'")] - fn test_dataframe_delete_column_panic_not_found() { - let col_a = create_f64_typed_frame("A", vec![1.0], None); - let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - df.delete_column("D"); - } - - #[test] - fn test_dataframe_rename_column() { - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); - let col_b = create_i64_typed_frame("B", vec![10, 20], None); - let mut df = DataFrame::new( - vec![col_a, col_b], - vec!["A".to_string(), "B".to_string()], - None, - ); - - df.rename_column("A", "Alpha".to_string()); - - assert_eq!(df.cols(), 2); - assert_eq!(df.columns(), &["Alpha", "B"]); - if let DataFrameColumn::F64(slice_alpha) = df.column("Alpha") { - assert_eq!(slice_alpha, &[1.0, 2.0]); - } else { - panic!("Column Alpha is not f64"); - } - - // Rename a column in a multi-column subframe - let multi_f64_frame = create_multi_f64_typed_frame( - vec!["X", "Y"], - vec![vec![1.0, 2.0], vec![10.0, 20.0]], - None, - ); - let mut df_multi = DataFrame::new( - vec![multi_f64_frame], - vec!["X".to_string(), "Y".to_string()], - None, - ); - df_multi.rename_column("X", "Ex".to_string()); - assert_eq!(df_multi.columns(), &["Ex", "Y"]); - if let DataFrameColumn::F64(slice_ex) = df_multi.column("Ex") { - assert_eq!(slice_ex, &[1.0, 2.0]); - } else { - panic!("Column Ex is not f64"); - } - } - - #[test] - #[should_panic(expected = "unknown column label: 'D'")] - fn test_dataframe_rename_column_panic_old_name_not_found() { - let col_a = create_f64_typed_frame("A", vec![1.0], None); - let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - df.rename_column("D", "E".to_string()); - } - - #[test] - #[should_panic(expected = "new column name 'B' already exists")] 
- fn test_dataframe_rename_column_panic_new_name_exists() { - let col_a = create_f64_typed_frame("A", vec![1.0], None); - let col_b = create_i64_typed_frame("B", vec![2], None); - let mut df = DataFrame::new( - vec![col_a, col_b], - vec!["A".to_string(), "B".to_string()], - None, - ); - df.rename_column("A", "B".to_string()); - } - - #[test] - fn test_dataframe_sort_columns() { - let col_c = create_f64_typed_frame("C", vec![1.0, 2.0], None); - let col_a = create_i64_typed_frame("A", vec![10, 20], None); - let col_b = create_string_typed_frame("B", vec!["x".to_string(), "y".to_string()], None); - - let mut df = DataFrame::new( - vec![col_c, col_a, col_b], - vec!["C".to_string(), "A".to_string(), "B".to_string()], - None, - ); - - assert_eq!(df.columns(), &["C", "A", "B"]); - df.sort_columns(); - assert_eq!(df.columns(), &["A", "B", "C"]); - - // Verify data integrity after sort - if let DataFrameColumn::I64(slice_a) = df.column("A") { - assert_eq!(slice_a, &[10, 20]); - } else { - panic!("Column A is not i64"); - } - if let DataFrameColumn::String(slice_b) = df.column("B") { - assert_eq!(slice_b, &["x".to_string(), "y".to_string()]); - } else { - panic!("Column B is not String"); - } - if let DataFrameColumn::F64(slice_c) = df.column("C") { - assert_eq!(slice_c, &[1.0, 2.0]); - } else { - panic!("Column C is not f64"); - } - } } From fa392ec6311794e2d92b1ba9660c5e65e6fa3ab8 Mon Sep 17 00:00:00 2001 From: Palash Tyagi <23239946+Magnus167@users.noreply.github.com> Date: Wed, 2 Jul 2025 00:22:52 +0100 Subject: [PATCH 10/19] Add head_n and tail_n methods to DataFrame for row retrieval; enhance display formatting --- src/dataframe/df.rs | 533 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 527 insertions(+), 6 deletions(-) diff --git a/src/dataframe/df.rs b/src/dataframe/df.rs index 0455ede..8797da5 100644 --- a/src/dataframe/df.rs +++ b/src/dataframe/df.rs @@ -1,6 +1,11 @@ use crate::frame::{Frame, RowIndex}; use crate::matrix::Matrix; use 
std::collections::HashMap; +use std::fmt; + +// Add these constants at the top of the file, perhaps after the use statements +const DEFAULT_DISPLAY_ROWS: usize = 5; +const DEFAULT_DISPLAY_COLS: usize = 10; // Display up to 10 columns by default /// Represents a typed Frame that can hold multiple columns of a single type. /// This will be the underlying storage for DataFrame. @@ -182,17 +187,17 @@ impl<'a> DataFrameColumnMut<'a> { #[derive(Debug, Clone, PartialEq)] pub struct DataFrame { /// The logical order of column names. - column_names: Vec<String>, + pub column_names: Vec<String>, /// A map from a unique ID to a TypedFrame (the underlying storage). - subframes: HashMap<usize, TypedFrame>, + pub subframes: HashMap<usize, TypedFrame>, /// A map from logical column name to its location: (subframe_id, column_name_in_subframe). - column_locations: HashMap<String, (usize, String)>, + pub column_locations: HashMap<String, (usize, String)>, /// The common row index for all columns in the DataFrame. - index: RowIndex, + pub index: RowIndex, /// The number of rows in the DataFrame. - rows: usize, + pub rows: usize, /// Counter for generating unique subframe IDs. - next_subframe_id: usize, + pub next_subframe_id: usize, } impl DataFrame { @@ -529,6 +534,272 @@ impl DataFrame { pub fn sort_columns(&mut self) { self.column_names.sort(); } + + /// Returns a new DataFrame containing the first `n` rows. + /// If `n` is greater than the number of rows in the DataFrame, + /// the entire DataFrame is returned. 
+ pub fn head_n(&self, n: usize) -> Self { + let num_rows_to_take = n.min(self.rows()); + + if num_rows_to_take == 0 { + return DataFrame::new(vec![], vec![], Some(RowIndex::Range(0..0))); + } + + let new_index = match &self.index { + RowIndex::Range(r) => RowIndex::Range(r.start..r.start + num_rows_to_take), + RowIndex::Int(v) => RowIndex::Int(v[0..num_rows_to_take].to_vec()), + RowIndex::Date(v) => RowIndex::Date(v[0..num_rows_to_take].to_vec()), + }; + + let mut new_typed_frames = Vec::new(); + for col_name in self.columns() { + let col_data = self.column(col_name); + match col_data { + DataFrameColumn::F64(s) => { + let new_data = s[0..num_rows_to_take].to_vec(); + new_typed_frames.push(TypedFrame::F64(Frame::new( + Matrix::from_cols(vec![new_data]), + vec![col_name.clone()], + Some(new_index.clone()), + ))); + } + DataFrameColumn::I64(s) => { + let new_data = s[0..num_rows_to_take].to_vec(); + new_typed_frames.push(TypedFrame::I64(Frame::new( + Matrix::from_cols(vec![new_data]), + vec![col_name.clone()], + Some(new_index.clone()), + ))); + } + DataFrameColumn::Bool(s) => { + let new_data = s[0..num_rows_to_take].to_vec(); + new_typed_frames.push(TypedFrame::Bool(Frame::new( + Matrix::from_cols(vec![new_data]), + vec![col_name.clone()], + Some(new_index.clone()), + ))); + } + DataFrameColumn::String(s) => { + let new_data = s[0..num_rows_to_take].to_vec(); + new_typed_frames.push(TypedFrame::String(Frame::new( + Matrix::from_cols(vec![new_data]), + vec![col_name.clone()], + Some(new_index.clone()), + ))); + } + } + } + + DataFrame::new(new_typed_frames, self.column_names.clone(), Some(new_index)) + } + + /// Returns a new DataFrame containing the last `n` rows. + /// If `n` is greater than the number of rows in the DataFrame, + /// the entire DataFrame is returned. 
+ pub fn tail_n(&self, n: usize) -> Self { + let num_rows_to_take = n.min(self.rows()); + + if num_rows_to_take == 0 { + return DataFrame::new(vec![], vec![], Some(RowIndex::Range(0..0))); + } + + let start_row_idx = self.rows() - num_rows_to_take; + + let new_index = match &self.index { + RowIndex::Range(r) => RowIndex::Range(r.start + start_row_idx..r.start + self.rows()), + RowIndex::Int(v) => RowIndex::Int(v[start_row_idx..].to_vec()), + RowIndex::Date(v) => RowIndex::Date(v[start_row_idx..].to_vec()), + }; + + let mut new_typed_frames = Vec::new(); + for col_name in self.columns() { + let col_data = self.column(col_name); + match col_data { + DataFrameColumn::F64(s) => { + let new_data = s[start_row_idx..].to_vec(); + new_typed_frames.push(TypedFrame::F64(Frame::new( + Matrix::from_cols(vec![new_data]), + vec![col_name.clone()], + Some(new_index.clone()), + ))); + } + DataFrameColumn::I64(s) => { + let new_data = s[start_row_idx..].to_vec(); + new_typed_frames.push(TypedFrame::I64(Frame::new( + Matrix::from_cols(vec![new_data]), + vec![col_name.clone()], + Some(new_index.clone()), + ))); + } + DataFrameColumn::Bool(s) => { + let new_data = s[start_row_idx..].to_vec(); + new_typed_frames.push(TypedFrame::Bool(Frame::new( + Matrix::from_cols(vec![new_data]), + vec![col_name.clone()], + Some(new_index.clone()), + ))); + } + DataFrameColumn::String(s) => { + let new_data = s[start_row_idx..].to_vec(); + new_typed_frames.push(TypedFrame::String(Frame::new( + Matrix::from_cols(vec![new_data]), + vec![col_name.clone()], + Some(new_index.clone()), + ))); + } + } + } + + DataFrame::new(new_typed_frames, self.column_names.clone(), Some(new_index)) + } + + /// Returns a new DataFrame containing the first 5 rows. + /// This is a convenience method for `head_n(5)`. + pub fn head(&self) -> Self { + self.head_n(DEFAULT_DISPLAY_ROWS) + } + + /// Returns a new DataFrame containing the last 5 rows. + /// This is a convenience method for `tail_n(5)`. 
+ pub fn tail(&self) -> Self { + self.tail_n(DEFAULT_DISPLAY_ROWS) + } +} + +impl fmt::Display for DataFrame { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.rows() == 0 || self.cols() == 0 { + return write!(f, "Empty DataFrame\nRows: {}, Columns: {}", self.rows(), self.cols()); + } + + let mut output = String::new(); + let mut column_widths = HashMap::new(); + + // Calculate max width for index column + let mut max_index_width = 0; + for i in 0..self.rows() { + let index_str = match &self.index { + RowIndex::Range(r) => format!("{}", r.start + i), + RowIndex::Int(v) => format!("{}", v[i]), + RowIndex::Date(v) => format!("{}", v[i]), + }; + max_index_width = max_index_width.max(index_str.len()); + } + // Ensure index header "Index" fits + max_index_width = max_index_width.max("Index".len()); + + // Calculate max width for each data column + for col_name in self.columns() { + let mut max_width = col_name.len(); + let col_data = self.column(col_name); + for i in 0..self.rows() { + let cell_str = match col_data { + DataFrameColumn::F64(s) => format!("{:.2}", s[i]), // Format floats + DataFrameColumn::I64(s) => format!("{}", s[i]), + DataFrameColumn::Bool(s) => format!("{}", s[i]), + DataFrameColumn::String(s) => format!("{}", s[i]), + }; + max_width = max_width.max(cell_str.len()); + } + column_widths.insert(col_name, max_width); + } + + // --- Print Header --- + output.push_str(&format!("{:>width$} ", "Index", width = max_index_width)); + let mut displayed_cols = 0; + let total_cols = self.cols(); + let mut cols_to_display = Vec::new(); + + // Decide which columns to display + if total_cols <= DEFAULT_DISPLAY_COLS { + cols_to_display.extend_from_slice(self.columns()); + } else { + // Display first few and last few columns + let num_first_cols = DEFAULT_DISPLAY_COLS / 2; + let num_last_cols = DEFAULT_DISPLAY_COLS - num_first_cols; + cols_to_display.extend_from_slice(&self.columns()[0..num_first_cols]); + cols_to_display.push("...".to_string()); 
// Placeholder for omitted columns + cols_to_display.extend_from_slice(&self.columns()[total_cols - num_last_cols..]); + } + + for col_name in &cols_to_display { + if col_name == "..." { + output.push_str(&format!("{:>width$} ", "...", width = 5)); // Fixed width for ellipsis + } else { + output.push_str(&format!("{:>width$} ", col_name, width = column_widths[col_name])); + displayed_cols += 1; + } + } + output.push('\n'); + + // --- Print Separator --- + output.push_str(&format!("{:->()); + rows_to_display.push(usize::MAX); // Sentinel for ellipsis row + rows_to_display.extend((total_rows - num_last_rows..total_rows).collect::>()); + } + + for &row_idx in &rows_to_display { + if row_idx == usize::MAX { + // Ellipsis row + output.push_str(&format!("{:>width$} ", "...", width = max_index_width)); + for _ in &cols_to_display { + output.push_str(&format!("{:>width$} ", "...", width = 5)); // Use a fixed width for ellipsis cells + } + output.push('\n'); + continue; + } + + // Print index + let index_str = match &self.index { + RowIndex::Range(r) => format!("{}", r.start + row_idx), + RowIndex::Int(v) => format!("{}", v[row_idx]), + RowIndex::Date(v) => format!("{}", v[row_idx]), + }; + output.push_str(&format!("{:>width$} ", index_str, width = max_index_width)); + + // Print data cells + for col_name in &cols_to_display { + if col_name == "..." 
{ + output.push_str(&format!("{:>width$} ", "...", width = 5)); + } else { + let col_data = self.column(col_name); + let cell_str = match col_data { + DataFrameColumn::F64(s) => format!("{:.2}", s[row_idx]), + DataFrameColumn::I64(s) => format!("{}", s[row_idx]), + DataFrameColumn::Bool(s) => format!("{}", s[row_idx]), + DataFrameColumn::String(s) => format!("{}", s[row_idx]), + }; + output.push_str(&format!("{:>width$} ", cell_str, width = column_widths[col_name])); + } + } + output.push('\n'); + } + + // --- Print Footer --- + output.push_str(&format!("\n[{} rows x {} columns]\n", self.rows(), self.cols())); + + write!(f, "{}", output) + } } #[cfg(test)] @@ -1041,4 +1312,254 @@ mod tests { None, ); } + + #[test] + fn test_dataframe_head_n() { + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0], None); + let col_b = create_i64_typed_frame("B", vec![10, 20, 30, 40, 50, 60, 70], None); + let df = DataFrame::new( + vec![col_a, col_b], + vec!["A".to_string(), "B".to_string()], + None, + ); + + // Test head_n with n < rows + let head_df = df.head_n(3); + assert_eq!(head_df.rows(), 3); + assert_eq!(head_df.cols(), 2); + assert_eq!(head_df.columns(), &["A", "B"]); + assert_eq!(head_df.index(), &RowIndex::Range(0..3)); + assert_eq!(head_df.column("A").as_f64().unwrap(), &[1.0, 2.0, 3.0]); + assert_eq!(head_df.column("B").as_i64().unwrap(), &[10, 20, 30]); + + // Test head_n with n > rows + let head_all_df = df.head_n(10); + assert_eq!(head_all_df.rows(), 7); + assert_eq!(head_all_df.cols(), 2); + assert_eq!(head_all_df.columns(), &["A", "B"]); + assert_eq!(head_all_df.index(), &RowIndex::Range(0..7)); + assert_eq!(head_all_df.column("A").as_f64().unwrap(), &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]); + assert_eq!(head_all_df.column("B").as_i64().unwrap(), &[10, 20, 30, 40, 50, 60, 70]); + + // Test head_n with n = 0 + let empty_head_df = df.head_n(0); + assert_eq!(empty_head_df.rows(), 0); + assert_eq!(empty_head_df.cols(), 0); + 
assert!(empty_head_df.columns().is_empty()); + assert_eq!(empty_head_df.index(), &RowIndex::Range(0..0)); + + // Test with Int index + let int_index = RowIndex::Int(vec![100, 101, 102, 103, 104, 105, 106]); + let col_a_int = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0], Some(int_index.clone())); + let col_b_int = create_i64_typed_frame("B", vec![10, 20, 30, 40, 50, 60, 70], Some(int_index.clone())); + let df_int = DataFrame::new( + vec![col_a_int, col_b_int], + vec!["A".to_string(), "B".to_string()], + Some(int_index), + ); + let head_int_df = df_int.head_n(3); + assert_eq!(head_int_df.rows(), 3); + assert_eq!(head_int_df.index(), &RowIndex::Int(vec![100, 101, 102])); + } + + #[test] + fn test_dataframe_tail_n() { + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0], None); + let col_b = create_i64_typed_frame("B", vec![10, 20, 30, 40, 50, 60, 70], None); + let df = DataFrame::new( + vec![col_a, col_b], + vec!["A".to_string(), "B".to_string()], + None, + ); + + // Test tail_n with n < rows + let tail_df = df.tail_n(3); + assert_eq!(tail_df.rows(), 3); + assert_eq!(tail_df.cols(), 2); + assert_eq!(tail_df.columns(), &["A", "B"]); + assert_eq!(tail_df.index(), &RowIndex::Range(4..7)); + assert_eq!(tail_df.column("A").as_f64().unwrap(), &[5.0, 6.0, 7.0]); + assert_eq!(tail_df.column("B").as_i64().unwrap(), &[50, 60, 70]); + + // Test tail_n with n > rows + let tail_all_df = df.tail_n(10); + assert_eq!(tail_all_df.rows(), 7); + assert_eq!(tail_all_df.cols(), 2); + assert_eq!(tail_all_df.columns(), &["A", "B"]); + assert_eq!(tail_all_df.index(), &RowIndex::Range(0..7)); + assert_eq!(tail_all_df.column("A").as_f64().unwrap(), &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]); + assert_eq!(tail_all_df.column("B").as_i64().unwrap(), &[10, 20, 30, 40, 50, 60, 70]); + + // Test tail_n with n = 0 + let empty_tail_df = df.tail_n(0); + assert_eq!(empty_tail_df.rows(), 0); + assert_eq!(empty_tail_df.cols(), 0); + 
assert!(empty_tail_df.columns().is_empty()); + assert_eq!(empty_tail_df.index(), &RowIndex::Range(0..0)); + + // Test with Int index + let int_index = RowIndex::Int(vec![100, 101, 102, 103, 104, 105, 106]); + let col_a_int = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0], Some(int_index.clone())); + let col_b_int = create_i64_typed_frame("B", vec![10, 20, 30, 40, 50, 60, 70], Some(int_index.clone())); + let df_int = DataFrame::new( + vec![col_a_int, col_b_int], + vec!["A".to_string(), "B".to_string()], + Some(int_index), + ); + let tail_int_df = df_int.tail_n(3); + assert_eq!(tail_int_df.rows(), 3); + assert_eq!(tail_int_df.index(), &RowIndex::Int(vec![104, 105, 106])); + } + + #[test] + fn test_dataframe_head() { + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], None); + let df = DataFrame::new( + vec![col_a], + vec!["A".to_string()], + None, + ); + + let head_df = df.head(); + assert_eq!(head_df.rows(), DEFAULT_DISPLAY_ROWS); + assert_eq!(head_df.column("A").as_f64().unwrap(), &[1.0, 2.0, 3.0, 4.0, 5.0]); + + // Test with fewer rows than DEFAULT_DISPLAY_ROWS + let col_b = create_f64_typed_frame("B", vec![1.0, 2.0], None); + let df_small = DataFrame::new(vec![col_b], vec!["B".to_string()], None); + let head_small_df = df_small.head(); + assert_eq!(head_small_df.rows(), 2); + assert_eq!(head_small_df.column("B").as_f64().unwrap(), &[1.0, 2.0]); + } + + #[test] + fn test_dataframe_tail() { + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], None); + let df = DataFrame::new( + vec![col_a], + vec!["A".to_string()], + None, + ); + + let tail_df = df.tail(); + assert_eq!(tail_df.rows(), DEFAULT_DISPLAY_ROWS); + assert_eq!(tail_df.column("A").as_f64().unwrap(), &[6.0, 7.0, 8.0, 9.0, 10.0]); + + // Test with fewer rows than DEFAULT_DISPLAY_ROWS + let col_b = create_f64_typed_frame("B", vec![1.0, 2.0], None); + let df_small = DataFrame::new(vec![col_b], 
vec!["B".to_string()], None); + let tail_small_df = df_small.tail(); + assert_eq!(tail_small_df.rows(), 2); + assert_eq!(tail_small_df.column("B").as_f64().unwrap(), &[1.0, 2.0]); + } + + #[test] + fn test_dataframe_display_empty() { + let empty_df = DataFrame::new(vec![], vec![], None); + let expected_output = "Empty DataFrame\nRows: 0, Columns: 0"; + assert_eq!(format!("{}", empty_df), expected_output); + } + + #[test] + fn test_dataframe_display_basic() { + let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0], None); + let col_b = create_i64_typed_frame("B", vec![10, 20, 30], None); + let col_c = create_string_typed_frame("C", vec!["x".to_string(), "y".to_string(), "z".to_string()], None); + let df = DataFrame::new( + vec![col_a, col_b, col_c], + vec!["A".to_string(), "B".to_string(), "C".to_string()], + None, + ); + + let expected_output = "\ + Index A B C +------ ---- --- --- + 0 1.00 10 x + 1 2.00 20 y + 2 3.00 30 z + +[3 rows x 3 columns] +"; + assert_eq!(format!("{}", df), expected_output); + } + + #[test] + fn test_dataframe_display_truncation_rows() { + let col_a = create_f64_typed_frame("A", (1..=10).map(|i| i as f64).collect(), None); + let col_b = create_i64_typed_frame("B", (11..=20).collect(), None); + let df = DataFrame::new( + vec![col_a, col_b], + vec!["A".to_string(), "B".to_string()], + None, + ); + + let expected_output = "\ + Index A B +------ ---- --- + 0 1.00 11 + 1 2.00 12 + ... ... ... 
+ 8 9.00 19 + 9 10.00 20 + +[10 rows x 2 columns] +"; + assert_eq!(format!("{}", df), expected_output); + } + + #[test] + fn test_dataframe_display_truncation_cols() { + let mut cols_data = Vec::new(); + let mut col_names = Vec::new(); + for i in 0..15 { // 15 columns, more than DEFAULT_DISPLAY_COLS + cols_data.push((1..=3).map(|r| (i * 10 + r) as f64).collect()); + col_names.push(format!("Col{}", i)); + } + let typed_frame = create_multi_f64_typed_frame( + col_names.iter().map(|s| s.as_str()).collect(), + cols_data, + None, + ); + let df = DataFrame::new(vec![typed_frame], col_names, None); + + let expected_output = "\ + Index Col0 Col1 Col2 Col3 Col4 ... Col10 Col11 Col12 Col13 Col14 +------ ---- ---- ---- ---- ---- --- ----- ----- ----- ----- ----- + 0 1.00 11.00 21.00 31.00 41.00 ... 101.00 111.00 121.00 131.00 141.00 + 1 2.00 12.00 22.00 32.00 42.00 ... 102.00 112.00 122.00 132.00 142.00 + 2 3.00 13.00 23.00 33.00 43.00 ... 103.00 113.00 123.00 133.00 143.00 + +[3 rows x 15 columns] +"; + assert_eq!(format!("{}", df), expected_output); + } + + #[test] + fn test_dataframe_display_truncation_both() { + let mut cols_data = Vec::new(); + let mut col_names = Vec::new(); + for i in 0..15 { // 15 columns + cols_data.push((1..=10).map(|r| (i * 10 + r) as f64).collect()); // 10 rows + col_names.push(format!("Col{}", i)); + } + let typed_frame = create_multi_f64_typed_frame( + col_names.iter().map(|s| s.as_str()).collect(), + cols_data, + None, + ); + let df = DataFrame::new(vec![typed_frame], col_names, None); + + let expected_output = "\ + Index Col0 Col1 Col2 Col3 Col4 ... Col10 Col11 Col12 Col13 Col14 +------ ---- ---- ---- ---- ---- --- ----- ----- ----- ----- ----- + 0 1.00 11.00 21.00 31.00 41.00 ... 101.00 111.00 121.00 131.00 141.00 + 1 2.00 12.00 22.00 32.00 42.00 ... 102.00 112.00 122.00 132.00 142.00 + ... ... ... ... ... ... ... ... ... ... ... ... + 8 9.00 19.00 29.00 39.00 49.00 ... 109.00 119.00 129.00 139.00 149.00 + 9 10.00 20.00 30.00 40.00 50.00 ... 
110.00 120.00 130.00 140.00 150.00 + +[10 rows x 15 columns] +"; + assert_eq!(format!("{}", df), expected_output); + } } From aa15248b5849b2e6ba8cfa1eaf371fd3317b1a84 Mon Sep 17 00:00:00 2001 From: Palash Tyagi <23239946+Magnus167@users.noreply.github.com> Date: Wed, 2 Jul 2025 00:25:31 +0100 Subject: [PATCH 11/19] Rename variable for clarity in DataFrame display formatting --- src/dataframe/df.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dataframe/df.rs b/src/dataframe/df.rs index 8797da5..6efa667 100644 --- a/src/dataframe/df.rs +++ b/src/dataframe/df.rs @@ -706,7 +706,7 @@ impl fmt::Display for DataFrame { // --- Print Header --- output.push_str(&format!("{:>width$} ", "Index", width = max_index_width)); - let mut displayed_cols = 0; + let mut _displayed_cols = 0; let total_cols = self.cols(); let mut cols_to_display = Vec::new(); @@ -727,7 +727,7 @@ impl fmt::Display for DataFrame { output.push_str(&format!("{:>width$} ", "...", width = 5)); // Fixed width for ellipsis } else { output.push_str(&format!("{:>width$} ", col_name, width = column_widths[col_name])); - displayed_cols += 1; + _displayed_cols += 1; } } output.push('\n'); From 4038d25b071e057dab4112e5dc03f0da870334d6 Mon Sep 17 00:00:00 2001 From: Palash Tyagi <23239946+Magnus167@users.noreply.github.com> Date: Wed, 2 Jul 2025 00:25:45 +0100 Subject: [PATCH 12/19] applied formatting --- src/dataframe/df.rs | 116 +++++++++++++++++++++++++++++++++----------- 1 file changed, 89 insertions(+), 27 deletions(-) diff --git a/src/dataframe/df.rs b/src/dataframe/df.rs index 6efa667..6835c15 100644 --- a/src/dataframe/df.rs +++ b/src/dataframe/df.rs @@ -669,7 +669,12 @@ impl DataFrame { impl fmt::Display for DataFrame { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { if self.rows() == 0 || self.cols() == 0 { - return write!(f, "Empty DataFrame\nRows: {}, Columns: {}", self.rows(), self.cols()); + return write!( + f, + "Empty DataFrame\nRows: {}, Columns: {}", + 
self.rows(), + self.cols() + ); } let mut output = String::new(); @@ -726,7 +731,11 @@ impl fmt::Display for DataFrame { if col_name == "..." { output.push_str(&format!("{:>width$} ", "...", width = 5)); // Fixed width for ellipsis } else { - output.push_str(&format!("{:>width$} ", col_name, width = column_widths[col_name])); + output.push_str(&format!( + "{:>width$} ", + col_name, + width = column_widths[col_name] + )); _displayed_cols += 1; } } @@ -738,7 +747,11 @@ impl fmt::Display for DataFrame { if col_name == "..." { output.push_str(&format!("{:->()); rows_to_display.push(usize::MAX); // Sentinel for ellipsis row - rows_to_display.extend((total_rows - num_last_rows..total_rows).collect::>()); + rows_to_display + .extend((total_rows - num_last_rows..total_rows).collect::>()); } for &row_idx in &rows_to_display { @@ -789,14 +803,22 @@ impl fmt::Display for DataFrame { DataFrameColumn::Bool(s) => format!("{}", s[row_idx]), DataFrameColumn::String(s) => format!("{}", s[row_idx]), }; - output.push_str(&format!("{:>width$} ", cell_str, width = column_widths[col_name])); + output.push_str(&format!( + "{:>width$} ", + cell_str, + width = column_widths[col_name] + )); } } output.push('\n'); } // --- Print Footer --- - output.push_str(&format!("\n[{} rows x {} columns]\n", self.rows(), self.cols())); + output.push_str(&format!( + "\n[{} rows x {} columns]\n", + self.rows(), + self.cols() + )); write!(f, "{}", output) } @@ -1338,8 +1360,14 @@ mod tests { assert_eq!(head_all_df.cols(), 2); assert_eq!(head_all_df.columns(), &["A", "B"]); assert_eq!(head_all_df.index(), &RowIndex::Range(0..7)); - assert_eq!(head_all_df.column("A").as_f64().unwrap(), &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]); - assert_eq!(head_all_df.column("B").as_i64().unwrap(), &[10, 20, 30, 40, 50, 60, 70]); + assert_eq!( + head_all_df.column("A").as_f64().unwrap(), + &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0] + ); + assert_eq!( + head_all_df.column("B").as_i64().unwrap(), + &[10, 20, 30, 40, 50, 60, 70] + ); // 
Test head_n with n = 0 let empty_head_df = df.head_n(0); @@ -1350,8 +1378,16 @@ mod tests { // Test with Int index let int_index = RowIndex::Int(vec![100, 101, 102, 103, 104, 105, 106]); - let col_a_int = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0], Some(int_index.clone())); - let col_b_int = create_i64_typed_frame("B", vec![10, 20, 30, 40, 50, 60, 70], Some(int_index.clone())); + let col_a_int = create_f64_typed_frame( + "A", + vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0], + Some(int_index.clone()), + ); + let col_b_int = create_i64_typed_frame( + "B", + vec![10, 20, 30, 40, 50, 60, 70], + Some(int_index.clone()), + ); let df_int = DataFrame::new( vec![col_a_int, col_b_int], vec!["A".to_string(), "B".to_string()], @@ -1387,8 +1423,14 @@ mod tests { assert_eq!(tail_all_df.cols(), 2); assert_eq!(tail_all_df.columns(), &["A", "B"]); assert_eq!(tail_all_df.index(), &RowIndex::Range(0..7)); - assert_eq!(tail_all_df.column("A").as_f64().unwrap(), &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]); - assert_eq!(tail_all_df.column("B").as_i64().unwrap(), &[10, 20, 30, 40, 50, 60, 70]); + assert_eq!( + tail_all_df.column("A").as_f64().unwrap(), + &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0] + ); + assert_eq!( + tail_all_df.column("B").as_i64().unwrap(), + &[10, 20, 30, 40, 50, 60, 70] + ); // Test tail_n with n = 0 let empty_tail_df = df.tail_n(0); @@ -1399,8 +1441,16 @@ mod tests { // Test with Int index let int_index = RowIndex::Int(vec![100, 101, 102, 103, 104, 105, 106]); - let col_a_int = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0], Some(int_index.clone())); - let col_b_int = create_i64_typed_frame("B", vec![10, 20, 30, 40, 50, 60, 70], Some(int_index.clone())); + let col_a_int = create_f64_typed_frame( + "A", + vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0], + Some(int_index.clone()), + ); + let col_b_int = create_i64_typed_frame( + "B", + vec![10, 20, 30, 40, 50, 60, 70], + Some(int_index.clone()), + ); let df_int = DataFrame::new( vec![col_a_int, 
col_b_int], vec!["A".to_string(), "B".to_string()], @@ -1413,16 +1463,19 @@ mod tests { #[test] fn test_dataframe_head() { - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], None); - let df = DataFrame::new( - vec![col_a], - vec!["A".to_string()], + let col_a = create_f64_typed_frame( + "A", + vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], None, ); + let df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); let head_df = df.head(); assert_eq!(head_df.rows(), DEFAULT_DISPLAY_ROWS); - assert_eq!(head_df.column("A").as_f64().unwrap(), &[1.0, 2.0, 3.0, 4.0, 5.0]); + assert_eq!( + head_df.column("A").as_f64().unwrap(), + &[1.0, 2.0, 3.0, 4.0, 5.0] + ); // Test with fewer rows than DEFAULT_DISPLAY_ROWS let col_b = create_f64_typed_frame("B", vec![1.0, 2.0], None); @@ -1434,16 +1487,19 @@ mod tests { #[test] fn test_dataframe_tail() { - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], None); - let df = DataFrame::new( - vec![col_a], - vec!["A".to_string()], + let col_a = create_f64_typed_frame( + "A", + vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], None, ); + let df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); let tail_df = df.tail(); assert_eq!(tail_df.rows(), DEFAULT_DISPLAY_ROWS); - assert_eq!(tail_df.column("A").as_f64().unwrap(), &[6.0, 7.0, 8.0, 9.0, 10.0]); + assert_eq!( + tail_df.column("A").as_f64().unwrap(), + &[6.0, 7.0, 8.0, 9.0, 10.0] + ); // Test with fewer rows than DEFAULT_DISPLAY_ROWS let col_b = create_f64_typed_frame("B", vec![1.0, 2.0], None); @@ -1464,7 +1520,11 @@ mod tests { fn test_dataframe_display_basic() { let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0], None); let col_b = create_i64_typed_frame("B", vec![10, 20, 30], None); - let col_c = create_string_typed_frame("C", vec!["x".to_string(), "y".to_string(), "z".to_string()], None); + let col_c = create_string_typed_frame( + "C", + 
vec!["x".to_string(), "y".to_string(), "z".to_string()], + None, + ); let df = DataFrame::new( vec![col_a, col_b, col_c], vec!["A".to_string(), "B".to_string(), "C".to_string()], @@ -1511,7 +1571,8 @@ mod tests { fn test_dataframe_display_truncation_cols() { let mut cols_data = Vec::new(); let mut col_names = Vec::new(); - for i in 0..15 { // 15 columns, more than DEFAULT_DISPLAY_COLS + for i in 0..15 { + // 15 columns, more than DEFAULT_DISPLAY_COLS cols_data.push((1..=3).map(|r| (i * 10 + r) as f64).collect()); col_names.push(format!("Col{}", i)); } @@ -1538,7 +1599,8 @@ mod tests { fn test_dataframe_display_truncation_both() { let mut cols_data = Vec::new(); let mut col_names = Vec::new(); - for i in 0..15 { // 15 columns + for i in 0..15 { + // 15 columns cols_data.push((1..=10).map(|r| (i * 10 + r) as f64).collect()); // 10 rows col_names.push(format!("Col{}", i)); } From 7e2a5ec18d9eba30df0ff4c1f717e84a9b1eb5f6 Mon Sep 17 00:00:00 2001 From: Palash Tyagi <23239946+Magnus167@users.noreply.github.com> Date: Wed, 2 Jul 2025 22:18:09 +0100 Subject: [PATCH 13/19] Enhance DataFrame display: update head and tail methods for improved row retrieval and formatting; refine display output for empty DataFrames and adjust column width calculations. --- src/dataframe/df.rs | 322 ++++++++++++++++++++------------------------ 1 file changed, 145 insertions(+), 177 deletions(-) diff --git a/src/dataframe/df.rs b/src/dataframe/df.rs index 6835c15..fb0015b 100644 --- a/src/dataframe/df.rs +++ b/src/dataframe/df.rs @@ -653,16 +653,66 @@ impl DataFrame { DataFrame::new(new_typed_frames, self.column_names.clone(), Some(new_index)) } - /// Returns a new DataFrame containing the first 5 rows. - /// This is a convenience method for `head_n(5)`. + /// Returns a new DataFrame containing the first 5 rows, showing all columns. 
pub fn head(&self) -> Self { - self.head_n(DEFAULT_DISPLAY_ROWS) + let n = DEFAULT_DISPLAY_ROWS.min(self.rows()); + if n == 0 { + return DataFrame::new(vec![], vec![], Some(RowIndex::Range(0..0))); + } + let new_index = match &self.index { + RowIndex::Range(r) => RowIndex::Range(r.start..r.start + n), + RowIndex::Int(v) => RowIndex::Int(v[0..n].to_vec()), + RowIndex::Date(v) => RowIndex::Date(v[0..n].to_vec()), + }; + let mut new_typed_frames = Vec::new(); + for col_name in self.columns() { + let col_data = self.column(col_name); + match col_data { + DataFrameColumn::F64(s) => { + let new_data = s[0..n].to_vec(); + new_typed_frames.push(TypedFrame::F64(Frame::new( + Matrix::from_cols(vec![new_data]), + vec![col_name.clone()], + Some(new_index.clone()), + ))); + } + DataFrameColumn::I64(s) => { + let new_data = s[0..n].to_vec(); + new_typed_frames.push(TypedFrame::I64(Frame::new( + Matrix::from_cols(vec![new_data]), + vec![col_name.clone()], + Some(new_index.clone()), + ))); + } + DataFrameColumn::Bool(s) => { + let new_data = s[0..n].to_vec(); + new_typed_frames.push(TypedFrame::Bool(Frame::new( + Matrix::from_cols(vec![new_data]), + vec![col_name.clone()], + Some(new_index.clone()), + ))); + } + DataFrameColumn::String(s) => { + let new_data = s[0..n].to_vec(); + new_typed_frames.push(TypedFrame::String(Frame::new( + Matrix::from_cols(vec![new_data]), + vec![col_name.clone()], + Some(new_index.clone()), + ))); + } + } + } + + DataFrame::new(new_typed_frames, self.column_names.clone(), Some(new_index)) } - /// Returns a new DataFrame containing the last 5 rows. - /// This is a convenience method for `tail_n(5)`. + /// Returns a new DataFrame containing the last 5 rows, showing all columns. 
pub fn tail(&self) -> Self { - self.tail_n(DEFAULT_DISPLAY_ROWS) + let n = DEFAULT_DISPLAY_ROWS.min(self.rows()); + if n == 0 { + return DataFrame::new(vec![], vec![], Some(RowIndex::Range(0..0))); + } + self.tail_n(n) } } @@ -671,156 +721,109 @@ impl fmt::Display for DataFrame { if self.rows() == 0 || self.cols() == 0 { return write!( f, - "Empty DataFrame\nRows: {}, Columns: {}", + "+-------------------+\n| Empty DataFrame |\n+-------------------+\nRows: {}, Columns: {}", self.rows(), self.cols() ); } - - let mut output = String::new(); - let mut column_widths = HashMap::new(); - - // Calculate max width for index column - let mut max_index_width = 0; + // Calculate column widths + let mut col_widths = Vec::new(); + let mut index_width = "Index".len(); for i in 0..self.rows() { - let index_str = match &self.index { + let idx_str = match &self.index { RowIndex::Range(r) => format!("{}", r.start + i), RowIndex::Int(v) => format!("{}", v[i]), RowIndex::Date(v) => format!("{}", v[i]), }; - max_index_width = max_index_width.max(index_str.len()); + index_width = index_width.max(idx_str.len()); } - // Ensure index header "Index" fits - max_index_width = max_index_width.max("Index".len()); - - // Calculate max width for each data column for col_name in self.columns() { - let mut max_width = col_name.len(); + let mut maxw = col_name.len(); let col_data = self.column(col_name); for i in 0..self.rows() { - let cell_str = match col_data { - DataFrameColumn::F64(s) => format!("{:.2}", s[i]), // Format floats + let cell_str = match &col_data { + DataFrameColumn::F64(s) => format!("{}", s[i]), DataFrameColumn::I64(s) => format!("{}", s[i]), DataFrameColumn::Bool(s) => format!("{}", s[i]), DataFrameColumn::String(s) => format!("{}", s[i]), }; - max_width = max_width.max(cell_str.len()); + maxw = maxw.max(cell_str.len()); } - column_widths.insert(col_name, max_width); + col_widths.push(maxw); } - - // --- Print Header --- - output.push_str(&format!("{:>width$} ", "Index", width = 
max_index_width)); - let mut _displayed_cols = 0; - let total_cols = self.cols(); - let mut cols_to_display = Vec::new(); - - // Decide which columns to display - if total_cols <= DEFAULT_DISPLAY_COLS { - cols_to_display.extend_from_slice(self.columns()); - } else { - // Display first few and last few columns - let num_first_cols = DEFAULT_DISPLAY_COLS / 2; - let num_last_cols = DEFAULT_DISPLAY_COLS - num_first_cols; - cols_to_display.extend_from_slice(&self.columns()[0..num_first_cols]); - cols_to_display.push("...".to_string()); // Placeholder for omitted columns - cols_to_display.extend_from_slice(&self.columns()[total_cols - num_last_cols..]); - } - - for col_name in &cols_to_display { - if col_name == "..." { - output.push_str(&format!("{:>width$} ", "...", width = 5)); // Fixed width for ellipsis + // Draw top border + write!(f, "┌")?; + write!(f, "{:─<1$}┬", "", index_width + 2)?; + for (i, w) in col_widths.iter().enumerate() { + if i + 1 == col_widths.len() { + write!(f, "{:─<1$}┐", "", w + 2)?; } else { - output.push_str(&format!( - "{:>width$} ", - col_name, - width = column_widths[col_name] - )); - _displayed_cols += 1; + write!(f, "{:─<1$}┬", "", w + 2)?; } } - output.push('\n'); - - // --- Print Separator --- - output.push_str(&format!("{:->()); - rows_to_display.push(usize::MAX); // Sentinel for ellipsis row - rows_to_display - .extend((total_rows - num_last_rows..total_rows).collect::>()); - } - - for &row_idx in &rows_to_display { - if row_idx == usize::MAX { - // Ellipsis row - output.push_str(&format!("{:>width$} ", "...", width = max_index_width)); - for _ in &cols_to_display { - output.push_str(&format!("{:>width$} ", "...", width = 5)); // Use a fixed width for ellipsis cells - } - output.push('\n'); - continue; - } - - // Print index - let index_str = match &self.index { - RowIndex::Range(r) => format!("{}", r.start + row_idx), - RowIndex::Int(v) => format!("{}", v[row_idx]), - RowIndex::Date(v) => format!("{}", v[row_idx]), + writeln!(f)?; + 
// Draw data rows + for i in 0..self.rows() { + let idx_str = match &self.index { + RowIndex::Range(r) => format!("{}", r.start + i), + RowIndex::Int(v) => format!("{}", v[i]), + RowIndex::Date(v) => format!("{}", v[i]), }; - output.push_str(&format!("{:>width$} ", index_str, width = max_index_width)); - - // Print data cells - for col_name in &cols_to_display { - if col_name == "..." { - output.push_str(&format!("{:>width$} ", "...", width = 5)); - } else { - let col_data = self.column(col_name); - let cell_str = match col_data { - DataFrameColumn::F64(s) => format!("{:.2}", s[row_idx]), - DataFrameColumn::I64(s) => format!("{}", s[row_idx]), - DataFrameColumn::Bool(s) => format!("{}", s[row_idx]), - DataFrameColumn::String(s) => format!("{}", s[row_idx]), - }; - output.push_str(&format!( - "{:>width$} ", - cell_str, - width = column_widths[col_name] - )); - } + write!(f, "│ {:>width$} ", idx_str, width = index_width)?; + for (col_name, w) in self.columns().iter().zip(&col_widths) { + let col_data = self.column(col_name); + let cell_str = match &col_data { + DataFrameColumn::F64(s) => format!("{}", s[i]), + DataFrameColumn::I64(s) => format!("{}", s[i]), + DataFrameColumn::Bool(s) => format!("{}", s[i]), + DataFrameColumn::String(s) => format!("{}", s[i]), + }; + write!(f, "│ {:>width$} ", cell_str, width = w)?; + } + writeln!(f, "│")?; + // Draw row separator after every row except the last + if i + 1 != self.rows() { + write!(f, "├")?; + write!(f, "{:─<1$}┼", "", index_width + 2)?; + for (j, w) in col_widths.iter().enumerate() { + if j + 1 == col_widths.len() { + write!(f, "{:─<1$}┤", "", w + 2)?; + } else { + write!(f, "{:─<1$}┼", "", w + 2)?; + } + } + writeln!(f)?; } - output.push('\n'); } - - // --- Print Footer --- - output.push_str(&format!( - "\n[{} rows x {} columns]\n", - self.rows(), - self.cols() - )); - - write!(f, "{}", output) + // Draw bottom border + write!(f, "└")?; + write!(f, "{:─<1$}┴", "", index_width + 2)?; + for (i, w) in 
col_widths.iter().enumerate() { + if i + 1 == col_widths.len() { + write!(f, "{:─<1$}┘", "", w + 2)?; + } else { + write!(f, "{:─<1$}┴", "", w + 2)?; + } + } + writeln!(f)?; + Ok(()) } } @@ -1472,10 +1475,7 @@ mod tests { let head_df = df.head(); assert_eq!(head_df.rows(), DEFAULT_DISPLAY_ROWS); - assert_eq!( - head_df.column("A").as_f64().unwrap(), - &[1.0, 2.0, 3.0, 4.0, 5.0] - ); + assert_eq!(head_df.column("A").as_f64().unwrap(), &[1.0, 2.0, 3.0, 4.0, 5.0]); // Test with fewer rows than DEFAULT_DISPLAY_ROWS let col_b = create_f64_typed_frame("B", vec![1.0, 2.0], None); @@ -1496,10 +1496,7 @@ mod tests { let tail_df = df.tail(); assert_eq!(tail_df.rows(), DEFAULT_DISPLAY_ROWS); - assert_eq!( - tail_df.column("A").as_f64().unwrap(), - &[6.0, 7.0, 8.0, 9.0, 10.0] - ); + assert_eq!(tail_df.column("A").as_f64().unwrap(), &[6.0, 7.0, 8.0, 9.0, 10.0]); // Test with fewer rows than DEFAULT_DISPLAY_ROWS let col_b = create_f64_typed_frame("B", vec![1.0, 2.0], None); @@ -1512,7 +1509,8 @@ mod tests { #[test] fn test_dataframe_display_empty() { let empty_df = DataFrame::new(vec![], vec![], None); - let expected_output = "Empty DataFrame\nRows: 0, Columns: 0"; + let expected_output = "\ ++-------------------+\n| Empty DataFrame |\n+-------------------+\nRows: 0, Columns: 0"; assert_eq!(format!("{}", empty_df), expected_output); } @@ -1532,14 +1530,7 @@ mod tests { ); let expected_output = "\ - Index A B C ------- ---- --- --- - 0 1.00 10 x - 1 2.00 20 y - 2 3.00 30 z - -[3 rows x 3 columns] -"; +┌───────┬───┬────┬───┐\n│ Index │ A │ B │ C │\n├───────┼───┼────┼───┤\n│ 0 │ 1 │ 10 │ x │\n├───────┼───┼────┼───┤\n│ 1 │ 2 │ 20 │ y │\n├───────┼───┼────┼───┤\n│ 2 │ 3 │ 30 │ z │\n└───────┴───┴────┴───┘\n"; assert_eq!(format!("{}", df), expected_output); } @@ -1554,17 +1545,8 @@ mod tests { ); let expected_output = "\ - Index A B ------- ---- --- - 0 1.00 11 - 1 2.00 12 - ... ... ... 
- 8 9.00 19 - 9 10.00 20 - -[10 rows x 2 columns] -"; - assert_eq!(format!("{}", df), expected_output); +┌───────┬───┬────┐\n│ Index │ A │ B │\n├───────┼───┼────┤\n│ 0 │ 1 │ 11 │\n├───────┼───┼────┤\n│ 1 │ 2 │ 12 │\n├───────┼───┼────┤\n│ 2 │ 3 │ 13 │\n├───────┼───┼────┤\n│ 3 │ 4 │ 14 │\n├───────┼───┼────┤\n│ 4 │ 5 │ 15 │\n└───────┴───┴────┘\n"; + assert_eq!(format!("{}", df.head()), expected_output); } #[test] @@ -1583,16 +1565,10 @@ mod tests { ); let df = DataFrame::new(vec![typed_frame], col_names, None); + // Only the first DEFAULT_DISPLAY_COLS columns should be shown in the output let expected_output = "\ - Index Col0 Col1 Col2 Col3 Col4 ... Col10 Col11 Col12 Col13 Col14 ------- ---- ---- ---- ---- ---- --- ----- ----- ----- ----- ----- - 0 1.00 11.00 21.00 31.00 41.00 ... 101.00 111.00 121.00 131.00 141.00 - 1 2.00 12.00 22.00 32.00 42.00 ... 102.00 112.00 122.00 132.00 142.00 - 2 3.00 13.00 23.00 33.00 43.00 ... 103.00 113.00 123.00 133.00 143.00 - -[3 rows x 15 columns] -"; - assert_eq!(format!("{}", df), expected_output); +┌───────┬──────┬──────┬──────┬──────┬──────┬──────┬──────┬──────┬──────┬──────┬───────┬───────┬───────┬───────┬───────┐\n│ Index │ Col0 │ Col1 │ Col2 │ Col3 │ Col4 │ Col5 │ Col6 │ Col7 │ Col8 │ Col9 │ Col10 │ Col11 │ Col12 │ Col13 │ Col14 │\n├───────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼───────┼───────┼───────┼───────┼───────┤\n│ 0 │ 1 │ 11 │ 21 │ 31 │ 41 │ 51 │ 61 │ 71 │ 81 │ 91 │ 101 │ 111 │ 121 │ 131 │ 141 │\n├───────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼───────┼───────┼───────┼───────┼───────┤\n│ 1 │ 2 │ 12 │ 22 │ 32 │ 42 │ 52 │ 62 │ 72 │ 82 │ 92 │ 102 │ 112 │ 122 │ 132 │ 142 │\n├───────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼───────┼───────┼───────┼───────┼───────┤\n│ 2 │ 3 │ 13 │ 23 │ 33 │ 43 │ 53 │ 63 │ 73 │ 83 │ 93 │ 103 │ 113 │ 123 │ 133 │ 143 
│\n└───────┴──────┴──────┴──────┴──────┴──────┴──────┴──────┴──────┴──────┴──────┴───────┴───────┴───────┴───────┴───────┘\n"; + assert_eq!(format!("{}", df.head()), expected_output); } #[test] @@ -1601,7 +1577,7 @@ mod tests { let mut col_names = Vec::new(); for i in 0..15 { // 15 columns - cols_data.push((1..=10).map(|r| (i * 10 + r) as f64).collect()); // 10 rows + cols_data.push((1..=10).map(|r| (i * 10 + r) as f64).collect()); col_names.push(format!("Col{}", i)); } let typed_frame = create_multi_f64_typed_frame( @@ -1611,17 +1587,9 @@ mod tests { ); let df = DataFrame::new(vec![typed_frame], col_names, None); + // Only the first DEFAULT_DISPLAY_ROWS rows and DEFAULT_DISPLAY_COLS columns should be shown let expected_output = "\ - Index Col0 Col1 Col2 Col3 Col4 ... Col10 Col11 Col12 Col13 Col14 ------- ---- ---- ---- ---- ---- --- ----- ----- ----- ----- ----- - 0 1.00 11.00 21.00 31.00 41.00 ... 101.00 111.00 121.00 131.00 141.00 - 1 2.00 12.00 22.00 32.00 42.00 ... 102.00 112.00 122.00 132.00 142.00 - ... ... ... ... ... ... ... ... ... ... ... ... - 8 9.00 19.00 29.00 39.00 49.00 ... 109.00 119.00 129.00 139.00 149.00 - 9 10.00 20.00 30.00 40.00 50.00 ... 
110.00 120.00 130.00 140.00 150.00 - -[10 rows x 15 columns] -"; - assert_eq!(format!("{}", df), expected_output); +┌───────┬──────┬──────┬──────┬──────┬──────┬──────┬──────┬──────┬──────┬──────┬───────┬───────┬───────┬───────┬───────┐\n│ Index │ Col0 │ Col1 │ Col2 │ Col3 │ Col4 │ Col5 │ Col6 │ Col7 │ Col8 │ Col9 │ Col10 │ Col11 │ Col12 │ Col13 │ Col14 │\n├───────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼───────┼───────┼───────┼───────┼───────┤\n│ 0 │ 1 │ 11 │ 21 │ 31 │ 41 │ 51 │ 61 │ 71 │ 81 │ 91 │ 101 │ 111 │ 121 │ 131 │ 141 │\n├───────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼───────┼───────┼───────┼───────┼───────┤\n│ 1 │ 2 │ 12 │ 22 │ 32 │ 42 │ 52 │ 62 │ 72 │ 82 │ 92 │ 102 │ 112 │ 122 │ 132 │ 142 │\n├───────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼───────┼───────┼───────┼───────┼───────┤\n│ 2 │ 3 │ 13 │ 23 │ 33 │ 43 │ 53 │ 63 │ 73 │ 83 │ 93 │ 103 │ 113 │ 123 │ 133 │ 143 │\n├───────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼───────┼───────┼───────┼───────┼───────┤\n│ 3 │ 4 │ 14 │ 24 │ 34 │ 44 │ 54 │ 64 │ 74 │ 84 │ 94 │ 104 │ 114 │ 124 │ 134 │ 144 │\n├───────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼───────┼───────┼───────┼───────┼───────┤\n│ 4 │ 5 │ 15 │ 25 │ 35 │ 45 │ 55 │ 65 │ 75 │ 85 │ 95 │ 105 │ 115 │ 125 │ 135 │ 145 │\n└───────┴──────┴──────┴──────┴──────┴──────┴──────┴──────┴──────┴──────┴──────┴───────┴───────┴───────┴───────┴───────┘\n"; + assert_eq!(format!("{}", df.head()), expected_output); } } From 60cc97e7022cde683fe700da6fa8f8085ed7d61e Mon Sep 17 00:00:00 2001 From: Palash Tyagi <23239946+Magnus167@users.noreply.github.com> Date: Wed, 2 Jul 2025 23:33:34 +0100 Subject: [PATCH 14/19] Enhance DataFrame display: implement row truncation with ellipsis for large datasets; improve column width calculations and formatting for better readability. 
--- src/dataframe/df.rs | 67 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 13 deletions(-) diff --git a/src/dataframe/df.rs b/src/dataframe/df.rs index fb0015b..3a36313 100644 --- a/src/dataframe/df.rs +++ b/src/dataframe/df.rs @@ -728,8 +728,24 @@ impl fmt::Display for DataFrame { } // Calculate column widths let mut col_widths = Vec::new(); - let mut index_width = "Index".len(); - for i in 0..self.rows() { + let mut index_width = "Index".len().max("...".len()); + let display_rows = DEFAULT_DISPLAY_ROWS; + let total_rows = self.rows(); + let show_ellipsis = total_rows > display_rows * 2; + let mut row_indices = Vec::new(); + if show_ellipsis { + for i in 0..display_rows { + row_indices.push(i); + } + for i in (total_rows - display_rows)..total_rows { + row_indices.push(i); + } + } else { + for i in 0..total_rows { + row_indices.push(i); + } + } + for &i in &row_indices { let idx_str = match &self.index { RowIndex::Range(r) => format!("{}", r.start + i), RowIndex::Int(v) => format!("{}", v[i]), @@ -738,9 +754,9 @@ impl fmt::Display for DataFrame { index_width = index_width.max(idx_str.len()); } for col_name in self.columns() { - let mut maxw = col_name.len(); + let mut maxw = col_name.len().max("...".len()); let col_data = self.column(col_name); - for i in 0..self.rows() { + for &i in &row_indices { let cell_str = match &col_data { DataFrameColumn::F64(s) => format!("{}", s[i]), DataFrameColumn::I64(s) => format!("{}", s[i]), @@ -780,7 +796,26 @@ impl fmt::Display for DataFrame { } writeln!(f)?; // Draw data rows - for i in 0..self.rows() { + for (row_pos, &i) in row_indices.iter().enumerate() { + if show_ellipsis && row_pos == display_rows { + // Ellipsis row + write!(f, "│ {:>width$} ", "...", width = index_width)?; + for w in &col_widths { + write!(f, "│ {:>width$} ", "...", width = *w)?; + } + writeln!(f, "│")?; + // Draw row separator after ellipsis + write!(f, "├")?; + write!(f, "{:─<1$}┼", "", index_width + 2)?; + for (j, w) 
in col_widths.iter().enumerate() { + if j + 1 == col_widths.len() { + write!(f, "{:─<1$}┤", "", w + 2)?; + } else { + write!(f, "{:─<1$}┼", "", w + 2)?; + } + } + writeln!(f)?; + } let idx_str = match &self.index { RowIndex::Range(r) => format!("{}", r.start + i), RowIndex::Int(v) => format!("{}", v[i]), @@ -795,11 +830,11 @@ impl fmt::Display for DataFrame { DataFrameColumn::Bool(s) => format!("{}", s[i]), DataFrameColumn::String(s) => format!("{}", s[i]), }; - write!(f, "│ {:>width$} ", cell_str, width = w)?; + write!(f, "│ {:>width$} ", cell_str, width = *w)?; } writeln!(f, "│")?; // Draw row separator after every row except the last - if i + 1 != self.rows() { + if row_pos + 1 != row_indices.len() { write!(f, "├")?; write!(f, "{:─<1$}┼", "", index_width + 2)?; for (j, w) in col_widths.iter().enumerate() { @@ -823,7 +858,7 @@ impl fmt::Display for DataFrame { } } writeln!(f)?; - Ok(()) + write!(f, "[{} rows x {} columns]", self.rows(), self.cols()) } } @@ -1475,7 +1510,10 @@ mod tests { let head_df = df.head(); assert_eq!(head_df.rows(), DEFAULT_DISPLAY_ROWS); - assert_eq!(head_df.column("A").as_f64().unwrap(), &[1.0, 2.0, 3.0, 4.0, 5.0]); + assert_eq!( + head_df.column("A").as_f64().unwrap(), + &[1.0, 2.0, 3.0, 4.0, 5.0] + ); // Test with fewer rows than DEFAULT_DISPLAY_ROWS let col_b = create_f64_typed_frame("B", vec![1.0, 2.0], None); @@ -1496,7 +1534,10 @@ mod tests { let tail_df = df.tail(); assert_eq!(tail_df.rows(), DEFAULT_DISPLAY_ROWS); - assert_eq!(tail_df.column("A").as_f64().unwrap(), &[6.0, 7.0, 8.0, 9.0, 10.0]); + assert_eq!( + tail_df.column("A").as_f64().unwrap(), + &[6.0, 7.0, 8.0, 9.0, 10.0] + ); // Test with fewer rows than DEFAULT_DISPLAY_ROWS let col_b = create_f64_typed_frame("B", vec![1.0, 2.0], None); @@ -1530,14 +1571,14 @@ mod tests { ); let expected_output = "\ -┌───────┬───┬────┬───┐\n│ Index │ A │ B │ C │\n├───────┼───┼────┼───┤\n│ 0 │ 1 │ 10 │ x │\n├───────┼───┼────┼───┤\n│ 1 │ 2 │ 20 │ y │\n├───────┼───┼────┼───┤\n│ 2 │ 3 │ 30 │ z 
│\n└───────┴───┴────┴───┘\n"; +┌───────┬───┬────┬───┐\n│ Index │ A │ B │ C │\n├───────┼───┼────┼───┤\n│ 0 │ 1 │ 10 │ x │\n├───────┼───┼────┼───┤\n│ 1 │ 2 │ 20 │ y │\n├───────┼───┼────┼───┤\n│ 2 │ 3 │ 30 │ z │\n└───────┴───┴────┴───┘\n[3 rows x 3 columns]"; assert_eq!(format!("{}", df), expected_output); } #[test] fn test_dataframe_display_truncation_rows() { - let col_a = create_f64_typed_frame("A", (1..=10).map(|i| i as f64).collect(), None); - let col_b = create_i64_typed_frame("B", (11..=20).collect(), None); + let col_a = create_f64_typed_frame("A", (1..=20).map(|i| i as f64).collect(), None); + let col_b = create_i64_typed_frame("B", (21..=40).collect(), None); let df = DataFrame::new( vec![col_a, col_b], vec!["A".to_string(), "B".to_string()], From eb4fefe36370a89293c9c68944e4f0d0611f7ae7 Mon Sep 17 00:00:00 2001 From: Palash Tyagi <23239946+Magnus167@users.noreply.github.com> Date: Wed, 2 Jul 2025 23:45:43 +0100 Subject: [PATCH 15/19] Enhance DataFrame display: implement column ellipsis for large datasets; improve row and column index calculations for better output formatting. 
--- src/dataframe/df.rs | 95 ++++++++++++++++++++++++++++++--------------- 1 file changed, 63 insertions(+), 32 deletions(-) diff --git a/src/dataframe/df.rs b/src/dataframe/df.rs index 3a36313..5e4b448 100644 --- a/src/dataframe/df.rs +++ b/src/dataframe/df.rs @@ -726,14 +726,15 @@ impl fmt::Display for DataFrame { self.cols() ); } - // Calculate column widths - let mut col_widths = Vec::new(); - let mut index_width = "Index".len().max("...".len()); let display_rows = DEFAULT_DISPLAY_ROWS; + let display_cols = DEFAULT_DISPLAY_COLS; let total_rows = self.rows(); - let show_ellipsis = total_rows > display_rows * 2; + let total_cols = self.cols(); + let show_row_ellipsis = total_rows > display_rows * 2; + let show_col_ellipsis = total_cols > display_cols; + // Row indices to display let mut row_indices = Vec::new(); - if show_ellipsis { + if show_row_ellipsis { for i in 0..display_rows { row_indices.push(i); } @@ -745,6 +746,26 @@ impl fmt::Display for DataFrame { row_indices.push(i); } } + // Column indices to display + let mut col_indices = Vec::new(); + if show_col_ellipsis { + let first = display_cols / 2; + let last = display_cols - first; + for i in 0..first { + col_indices.push(i); + } + col_indices.push(usize::MAX); // ellipsis + for i in (total_cols - last)..total_cols { + col_indices.push(i); + } + } else { + for i in 0..total_cols { + col_indices.push(i); + } + } + // Calculate column widths + let mut col_widths = Vec::new(); + let mut index_width = "Index".len().max("...".len()); for &i in &row_indices { let idx_str = match &self.index { RowIndex::Range(r) => format!("{}", r.start + i), @@ -753,7 +774,12 @@ impl fmt::Display for DataFrame { }; index_width = index_width.max(idx_str.len()); } - for col_name in self.columns() { + for &col_idx in &col_indices { + if col_idx == usize::MAX { + col_widths.push("...".len()); + continue; + } + let col_name = &self.column_names[col_idx]; let mut maxw = col_name.len().max("...".len()); let col_data = 
self.column(col_name); for &i in &row_indices { @@ -768,8 +794,7 @@ impl fmt::Display for DataFrame { col_widths.push(maxw); } // Draw top border - write!(f, "┌")?; - write!(f, "{:─<1$}┬", "", index_width + 2)?; + write!(f, "┌{:─<1$}┬", "", index_width + 2)?; for (i, w) in col_widths.iter().enumerate() { if i + 1 == col_widths.len() { write!(f, "{:─<1$}┐", "", w + 2)?; @@ -780,13 +805,17 @@ impl fmt::Display for DataFrame { writeln!(f)?; // Draw header row write!(f, "│ {:^width$} ", "Index", width = index_width)?; - for (col_name, w) in self.columns().iter().zip(&col_widths) { - write!(f, "│ {:^width$} ", col_name, width = w)?; + for (col_idx, w) in col_indices.iter().zip(&col_widths) { + if *col_idx == usize::MAX { + write!(f, "│ {:^width$} ", "...", width = w)?; + } else { + let col_name = &self.column_names[*col_idx]; + write!(f, "│ {:^width$} ", col_name, width = w)?; + } } writeln!(f, "│")?; // Draw header separator - write!(f, "├")?; - write!(f, "{:─<1$}┼", "", index_width + 2)?; + write!(f, "├{:─<1$}┼", "", index_width + 2)?; for (i, w) in col_widths.iter().enumerate() { if i + 1 == col_widths.len() { write!(f, "{:─<1$}┤", "", w + 2)?; @@ -797,7 +826,7 @@ impl fmt::Display for DataFrame { writeln!(f)?; // Draw data rows for (row_pos, &i) in row_indices.iter().enumerate() { - if show_ellipsis && row_pos == display_rows { + if show_row_ellipsis && row_pos == display_rows { // Ellipsis row write!(f, "│ {:>width$} ", "...", width = index_width)?; for w in &col_widths { @@ -805,8 +834,7 @@ impl fmt::Display for DataFrame { } writeln!(f, "│")?; // Draw row separator after ellipsis - write!(f, "├")?; - write!(f, "{:─<1$}┼", "", index_width + 2)?; + write!(f, "├{:─<1$}┼", "", index_width + 2)?; for (j, w) in col_widths.iter().enumerate() { if j + 1 == col_widths.len() { write!(f, "{:─<1$}┤", "", w + 2)?; @@ -822,21 +850,25 @@ impl fmt::Display for DataFrame { RowIndex::Date(v) => format!("{}", v[i]), }; write!(f, "│ {:>width$} ", idx_str, width = index_width)?; - 
for (col_name, w) in self.columns().iter().zip(&col_widths) { - let col_data = self.column(col_name); - let cell_str = match &col_data { - DataFrameColumn::F64(s) => format!("{}", s[i]), - DataFrameColumn::I64(s) => format!("{}", s[i]), - DataFrameColumn::Bool(s) => format!("{}", s[i]), - DataFrameColumn::String(s) => format!("{}", s[i]), - }; - write!(f, "│ {:>width$} ", cell_str, width = *w)?; + for (col_pos, col_idx) in col_indices.iter().enumerate() { + if *col_idx == usize::MAX { + write!(f, "│ {:>width$} ", "...", width = col_widths[col_pos])?; + } else { + let col_name = &self.column_names[*col_idx]; + let col_data = self.column(col_name); + let cell_str = match &col_data { + DataFrameColumn::F64(s) => format!("{}", s[i]), + DataFrameColumn::I64(s) => format!("{}", s[i]), + DataFrameColumn::Bool(s) => format!("{}", s[i]), + DataFrameColumn::String(s) => format!("{}", s[i]), + }; + write!(f, "│ {:>width$} ", cell_str, width = col_widths[col_pos])?; + } } writeln!(f, "│")?; // Draw row separator after every row except the last if row_pos + 1 != row_indices.len() { - write!(f, "├")?; - write!(f, "{:─<1$}┼", "", index_width + 2)?; + write!(f, "├{:─<1$}┼", "", index_width + 2)?; for (j, w) in col_widths.iter().enumerate() { if j + 1 == col_widths.len() { write!(f, "{:─<1$}┤", "", w + 2)?; @@ -848,8 +880,7 @@ impl fmt::Display for DataFrame { } } // Draw bottom border - write!(f, "└")?; - write!(f, "{:─<1$}┴", "", index_width + 2)?; + write!(f, "└{:─<1$}┴", "", index_width + 2)?; for (i, w) in col_widths.iter().enumerate() { if i + 1 == col_widths.len() { write!(f, "{:─<1$}┘", "", w + 2)?; @@ -1571,7 +1602,7 @@ mod tests { ); let expected_output = "\ -┌───────┬───┬────┬───┐\n│ Index │ A │ B │ C │\n├───────┼───┼────┼───┤\n│ 0 │ 1 │ 10 │ x │\n├───────┼───┼────┼───┤\n│ 1 │ 2 │ 20 │ y │\n├───────┼───┼────┼───┤\n│ 2 │ 3 │ 30 │ z │\n└───────┴───┴────┴───┘\n[3 rows x 3 columns]"; +┌───────┬─────┬─────┬─────┐\n│ Index │ A │ B │ C │\n├───────┼─────┼─────┼─────┤\n│ 0 │ 1 │ 
10 │ x │\n├───────┼─────┼─────┼─────┤\n│ 1 │ 2 │ 20 │ y │\n├───────┼─────┼─────┼─────┤\n│ 2 │ 3 │ 30 │ z │\n└───────┴─────┴─────┴─────┘\n[3 rows x 3 columns]"; assert_eq!(format!("{}", df), expected_output); } @@ -1586,7 +1617,7 @@ mod tests { ); let expected_output = "\ -┌───────┬───┬────┐\n│ Index │ A │ B │\n├───────┼───┼────┤\n│ 0 │ 1 │ 11 │\n├───────┼───┼────┤\n│ 1 │ 2 │ 12 │\n├───────┼───┼────┤\n│ 2 │ 3 │ 13 │\n├───────┼───┼────┤\n│ 3 │ 4 │ 14 │\n├───────┼───┼────┤\n│ 4 │ 5 │ 15 │\n└───────┴───┴────┘\n"; +┌───────┬─────┬─────┐\n│ Index │ A │ B │\n├───────┼─────┼─────┤\n│ 0 │ 1 │ 21 │\n├───────┼─────┼─────┤\n│ 1 │ 2 │ 22 │\n├───────┼─────┼─────┤\n│ 2 │ 3 │ 23 │\n├───────┼─────┼─────┤\n│ 3 │ 4 │ 24 │\n├───────┼─────┼─────┤\n│ 4 │ 5 │ 25 │\n└───────┴─────┴─────┘\n[5 rows x 2 columns]"; assert_eq!(format!("{}", df.head()), expected_output); } @@ -1608,7 +1639,7 @@ mod tests { // Only the first DEFAULT_DISPLAY_COLS columns should be shown in the output let expected_output = "\ -┌───────┬──────┬──────┬──────┬──────┬──────┬──────┬──────┬──────┬──────┬──────┬───────┬───────┬───────┬───────┬───────┐\n│ Index │ Col0 │ Col1 │ Col2 │ Col3 │ Col4 │ Col5 │ Col6 │ Col7 │ Col8 │ Col9 │ Col10 │ Col11 │ Col12 │ Col13 │ Col14 │\n├───────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼───────┼───────┼───────┼───────┼───────┤\n│ 0 │ 1 │ 11 │ 21 │ 31 │ 41 │ 51 │ 61 │ 71 │ 81 │ 91 │ 101 │ 111 │ 121 │ 131 │ 141 │\n├───────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼───────┼───────┼───────┼───────┼───────┤\n│ 1 │ 2 │ 12 │ 22 │ 32 │ 42 │ 52 │ 62 │ 72 │ 82 │ 92 │ 102 │ 112 │ 122 │ 132 │ 142 │\n├───────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼───────┼───────┼───────┼───────┼───────┤\n│ 2 │ 3 │ 13 │ 23 │ 33 │ 43 │ 53 │ 63 │ 73 │ 83 │ 93 │ 103 │ 113 │ 123 │ 133 │ 143 │\n└───────┴──────┴──────┴──────┴──────┴──────┴──────┴──────┴──────┴──────┴──────┴───────┴───────┴───────┴───────┴───────┘\n"; 
+┌───────┬──────┬──────┬──────┬──────┬──────┬─────┬───────┬───────┬───────┬───────┬───────┐\n│ Index │ Col0 │ Col1 │ Col2 │ Col3 │ Col4 │ ... │ Col10 │ Col11 │ Col12 │ Col13 │ Col14 │\n├───────┼──────┼──────┼──────┼──────┼──────┼─────┼───────┼───────┼───────┼───────┼───────┤\n│ 0 │ 1 │ 11 │ 21 │ 31 │ 41 │ ... │ 101 │ 111 │ 121 │ 131 │ 141 │\n├───────┼──────┼──────┼──────┼──────┼──────┼─────┼───────┼───────┼───────┼───────┼───────┤\n│ 1 │ 2 │ 12 │ 22 │ 32 │ 42 │ ... │ 102 │ 112 │ 122 │ 132 │ 142 │\n├───────┼──────┼──────┼──────┼──────┼──────┼─────┼───────┼───────┼───────┼───────┼───────┤\n│ 2 │ 3 │ 13 │ 23 │ 33 │ 43 │ ... │ 103 │ 113 │ 123 │ 133 │ 143 │\n└───────┴──────┴──────┴──────┴──────┴──────┴─────┴───────┴───────┴───────┴───────┴───────┘\n[3 rows x 15 columns]"; assert_eq!(format!("{}", df.head()), expected_output); } @@ -1630,7 +1661,7 @@ mod tests { // Only the first DEFAULT_DISPLAY_ROWS rows and DEFAULT_DISPLAY_COLS columns should be shown let expected_output = "\ -┌───────┬──────┬──────┬──────┬──────┬──────┬──────┬──────┬──────┬──────┬──────┬───────┬───────┬───────┬───────┬───────┐\n│ Index │ Col0 │ Col1 │ Col2 │ Col3 │ Col4 │ Col5 │ Col6 │ Col7 │ Col8 │ Col9 │ Col10 │ Col11 │ Col12 │ Col13 │ Col14 │\n├───────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼───────┼───────┼───────┼───────┼───────┤\n│ 0 │ 1 │ 11 │ 21 │ 31 │ 41 │ 51 │ 61 │ 71 │ 81 │ 91 │ 101 │ 111 │ 121 │ 131 │ 141 │\n├───────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼───────┼───────┼───────┼───────┼───────┤\n│ 1 │ 2 │ 12 │ 22 │ 32 │ 42 │ 52 │ 62 │ 72 │ 82 │ 92 │ 102 │ 112 │ 122 │ 132 │ 142 │\n├───────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼───────┼───────┼───────┼───────┼───────┤\n│ 2 │ 3 │ 13 │ 23 │ 33 │ 43 │ 53 │ 63 │ 73 │ 83 │ 93 │ 103 │ 113 │ 123 │ 133 │ 143 │\n├───────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼───────┼───────┼───────┼───────┼───────┤\n│ 3 │ 4 │ 14 │ 24 │ 34 │ 44 
│ 54 │ 64 │ 74 │ 84 │ 94 │ 104 │ 114 │ 124 │ 134 │ 144 │\n├───────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼──────┼───────┼───────┼───────┼───────┼───────┤\n│ 4 │ 5 │ 15 │ 25 │ 35 │ 45 │ 55 │ 65 │ 75 │ 85 │ 95 │ 105 │ 115 │ 125 │ 135 │ 145 │\n└───────┴──────┴──────┴──────┴──────┴──────┴──────┴──────┴──────┴──────┴──────┴───────┴───────┴───────┴───────┴───────┘\n"; +┌───────┬──────┬──────┬──────┬──────┬──────┬─────┬───────┬───────┬───────┬───────┬───────┐\n│ Index │ Col0 │ Col1 │ Col2 │ Col3 │ Col4 │ ... │ Col10 │ Col11 │ Col12 │ Col13 │ Col14 │\n├───────┼──────┼──────┼──────┼──────┼──────┼─────┼───────┼───────┼───────┼───────┼───────┤\n│ 0 │ 1 │ 11 │ 21 │ 31 │ 41 │ ... │ 101 │ 111 │ 121 │ 131 │ 141 │\n├───────┼──────┼──────┼──────┼──────┼──────┼─────┼───────┼───────┼───────┼───────┼───────┤\n│ 1 │ 2 │ 12 │ 22 │ 32 │ 42 │ ... │ 102 │ 112 │ 122 │ 132 │ 142 │\n├───────┼──────┼──────┼──────┼──────┼──────┼─────┼───────┼───────┼───────┼───────┼───────┤\n│ 2 │ 3 │ 13 │ 23 │ 33 │ 43 │ ... │ 103 │ 113 │ 123 │ 133 │ 143 │\n├───────┼──────┼──────┼──────┼──────┼──────┼─────┼───────┼───────┼───────┼───────┼───────┤\n│ 3 │ 4 │ 14 │ 24 │ 34 │ 44 │ ... │ 104 │ 114 │ 124 │ 134 │ 144 │\n├───────┼──────┼──────┼──────┼──────┼──────┼─────┼───────┼───────┼───────┼───────┼───────┤\n│ 4 │ 5 │ 15 │ 25 │ 35 │ 45 │ ... 
│ 105 │ 115 │ 125 │ 135 │ 145 │\n└───────┴──────┴──────┴──────┴──────┴──────┴─────┴───────┴───────┴───────┴───────┴───────┘\n[5 rows x 15 columns]"; assert_eq!(format!("{}", df.head()), expected_output); } } From 27ab1ac1298692f32c8fbcdadb56cbe853faa79f Mon Sep 17 00:00:00 2001 From: Palash Tyagi <23239946+Magnus167@users.noreply.github.com> Date: Fri, 4 Jul 2025 00:45:28 +0100 Subject: [PATCH 16/19] reimplement dataframe functionality from scratch --- src/dataframe/df.rs | 2048 +++++++++++-------------------------------- 1 file changed, 520 insertions(+), 1528 deletions(-) diff --git a/src/dataframe/df.rs b/src/dataframe/df.rs index 5e4b448..23cd0c8 100644 --- a/src/dataframe/df.rs +++ b/src/dataframe/df.rs @@ -1,1667 +1,659 @@ use crate::frame::{Frame, RowIndex}; -use crate::matrix::Matrix; +use std::any::{Any, TypeId}; use std::collections::HashMap; -use std::fmt; +use std::fmt; // Import TypeId -// Add these constants at the top of the file, perhaps after the use statements const DEFAULT_DISPLAY_ROWS: usize = 5; -const DEFAULT_DISPLAY_COLS: usize = 10; // Display up to 10 columns by default +const DEFAULT_DISPLAY_COLS: usize = 10; -/// Represents a typed Frame that can hold multiple columns of a single type. -/// This will be the underlying storage for DataFrame. 
-#[derive(Debug, Clone, PartialEq)] -pub enum TypedFrame { - F64(Frame), - I64(Frame), - Bool(Frame), - String(Frame), - // Add more types as needed +// Trait to enable type-agnostic operations on Frame objects within DataFrame +pub trait SubFrame: Send + Sync + fmt::Debug + Any { + fn rows(&self) -> usize; + fn get_value_as_string(&self, physical_row_idx: usize, col_name: &str) -> String; + fn clone_box(&self) -> Box; + fn delete_column_from_frame(&mut self, col_name: &str); + fn get_frame_cols(&self) -> usize; // Add a method to get the number of columns in the underlying frame + + // Methods for downcasting to concrete types + fn as_any(&self) -> &dyn Any; + fn as_any_mut(&mut self) -> &mut dyn Any; } -macro_rules! impl_typed_frame_common_methods { - ($($method:ident $(($($arg:ident: $arg_ty:ty),*))? -> $ret_ty:ty),*) => { - impl TypedFrame { - $( - pub fn $method(&self $(, $($arg: $arg_ty),*)?) -> $ret_ty { - match self { - TypedFrame::F64(f) => f.$method($($($arg),*)?), - TypedFrame::I64(f) => f.$method($($($arg),*)?), - TypedFrame::Bool(f) => f.$method($($($arg),*)?), - TypedFrame::String(f) => f.$method($($($arg),*)?), - } - } - )* - } - }; -} - -impl_typed_frame_common_methods! { - rows -> usize, - cols -> usize, - columns -> &[String], - index -> &RowIndex -} - -macro_rules! 
impl_typed_frame_column_accessors { - ($fn_name:ident, $ret_type:ident, $frame_method:ident) => { - pub fn $fn_name(&self, name: &str) -> $ret_type<'_> { - match self { - TypedFrame::F64(f) => $ret_type::F64(f.$frame_method(name)), - TypedFrame::I64(f) => $ret_type::I64(f.$frame_method(name)), - TypedFrame::Bool(f) => $ret_type::Bool(f.$frame_method(name)), - TypedFrame::String(f) => $ret_type::String(f.$frame_method(name)), - } - } - }; - ($fn_name:ident, $ret_type:ident, $frame_method:ident, mut) => { - pub fn $fn_name(&mut self, name: &str) -> $ret_type<'_> { - match self { - TypedFrame::F64(f) => $ret_type::F64(f.$frame_method(name)), - TypedFrame::I64(f) => $ret_type::I64(f.$frame_method(name)), - TypedFrame::Bool(f) => $ret_type::Bool(f.$frame_method(name)), - TypedFrame::String(f) => $ret_type::String(f.$frame_method(name)), - } - } - }; -} - -impl TypedFrame { - impl_typed_frame_column_accessors!(column, DataFrameColumn, column); - impl_typed_frame_column_accessors!(column_mut, DataFrameColumnMut, column_mut, mut); -} - -/// Represents a view of a single column within a DataFrame. -/// It borrows data from an underlying TypedFrame. 
-#[derive(Debug, PartialEq)] -pub enum DataFrameColumn<'a> { - F64(&'a [f64]), - I64(&'a [i64]), - Bool(&'a [bool]), - String(&'a [String]), -} - -impl<'a> DataFrameColumn<'a> { - pub fn len(&self) -> usize { - match self { - DataFrameColumn::F64(s) => s.len(), - DataFrameColumn::I64(s) => s.len(), - DataFrameColumn::Bool(s) => s.len(), - DataFrameColumn::String(s) => s.len(), - } +// Implement SubFrame for any Frame that meets the requirements +impl SubFrame for Frame +where + T: Clone + PartialEq + fmt::Display + fmt::Debug + 'static + Send + Sync + Any, +{ + fn rows(&self) -> usize { + self.rows() } - pub fn is_empty(&self) -> bool { - self.len() == 0 + fn get_value_as_string(&self, physical_row_idx: usize, col_name: &str) -> String { + self.get_row(physical_row_idx).get(col_name).to_string() } - // Add methods to get specific typed slices - pub fn as_f64(&self) -> Option<&'a [f64]> { - if let DataFrameColumn::F64(s) = self { - Some(s) - } else { - None - } + fn clone_box(&self) -> Box { + Box::new(self.clone()) } - pub fn as_i64(&self) -> Option<&'a [i64]> { - if let DataFrameColumn::I64(s) = self { - Some(s) - } else { - None - } + + fn delete_column_from_frame(&mut self, col_name: &str) { + self.delete_column(col_name); } - pub fn as_bool(&self) -> Option<&'a [bool]> { - if let DataFrameColumn::Bool(s) = self { - Some(s) - } else { - None - } + + fn get_frame_cols(&self) -> usize { + self.cols() } - pub fn as_string(&self) -> Option<&'a [String]> { - if let DataFrameColumn::String(s) = self { - Some(s) - } else { - None - } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self } } -/// Represents a mutable view of a single column within a DataFrame. 
-#[derive(Debug)] -pub enum DataFrameColumnMut<'a> { - F64(&'a mut [f64]), - I64(&'a mut [i64]), - Bool(&'a mut [bool]), - String(&'a mut [String]), -} - -impl<'a> DataFrameColumnMut<'a> { - pub fn len(&self) -> usize { - match self { - DataFrameColumnMut::F64(s) => s.len(), - DataFrameColumnMut::I64(s) => s.len(), - DataFrameColumnMut::Bool(s) => s.len(), - DataFrameColumnMut::String(s) => s.len(), - } - } - - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - - // Add methods to get specific typed mutable slices - pub fn as_f64_mut(&mut self) -> Option<&mut [f64]> { - if let DataFrameColumnMut::F64(s) = self { - Some(s) - } else { - None - } - } - pub fn as_i64_mut(&mut self) -> Option<&mut [i64]> { - if let DataFrameColumnMut::I64(s) = self { - Some(s) - } else { - None - } - } - pub fn as_bool_mut(&mut self) -> Option<&mut [bool]> { - if let DataFrameColumnMut::Bool(s) = self { - Some(s) - } else { - None - } - } - pub fn as_string_mut(&mut self) -> Option<&mut [String]> { - if let DataFrameColumnMut::String(s) = self { - Some(s) - } else { - None - } - } -} - -/// A DataFrame capable of holding multiple data types. -/// -/// Internally, a DataFrame manages a collection of `TypedFrame` instances, -/// each holding data of a single, homogeneous type. The logical column -/// order is maintained separately from the physical storage. -#[derive(Debug, Clone, PartialEq)] pub struct DataFrame { - /// The logical order of column names. - pub column_names: Vec, - /// A map from a unique ID to a TypedFrame (the underlying storage). - pub subframes: HashMap, - /// A map from logical column name to its location: (subframe_id, column_name_in_subframe). - pub column_locations: HashMap, - /// The common row index for all columns in the DataFrame. - pub index: RowIndex, - /// The number of rows in the DataFrame. - pub rows: usize, - /// Counter for generating unique subframe IDs. 
- pub next_subframe_id: usize, + frames_by_type: HashMap>, // Maps TypeId to the Frame holding columns of that type + column_to_type: HashMap, // Maps column name to its TypeId + column_names: Vec, + index: RowIndex, } impl DataFrame { - /// Creates a new DataFrame from a vector of column data. - /// - /// Each inner `Vec` represents a column, and the outer `Vec` contains - /// these columns in the desired order. The `column_names` must match - /// the number of columns provided. - /// - /// All columns must have the same number of rows. - /// - /// # Arguments - /// * `columns` - A vector of `TypedFrame` instances, each representing a column. - /// * `column_names` - A vector of strings, providing names for each column. - /// * `index` - An optional `RowIndex` to be used for the DataFrame. If `None`, a default - /// `Range` index will be created. - /// - /// # Panics - /// * If `column_names` length does not match the number of `columns`. - /// * If columns have inconsistent row counts. - /// * If column names are duplicated. - /// * If the provided `index` length does not match the row count. 
- pub fn new( - columns: Vec, // Changed from DataFrameColumn to TypedFrame - column_names: Vec, - index: Option, - ) -> Self { - if columns.is_empty() { - return Self { - column_names: Vec::new(), - subframes: HashMap::new(), - column_locations: HashMap::new(), - index: index.unwrap_or(RowIndex::Range(0..0)), - rows: 0, - next_subframe_id: 0, - }; - } - - let num_rows = columns[0].rows(); - let common_index = index.unwrap_or(RowIndex::Range(0..num_rows)); - - if common_index.len() != num_rows { - panic!( - "DataFrame::new: provided index length ({}) mismatch column rows ({})", - common_index.len(), - num_rows - ); - } - - let mut subframes = HashMap::new(); - let mut column_locations = HashMap::new(); - let mut next_subframe_id = 0; - - // Process each provided TypedFrame - for typed_frame in columns { - if typed_frame.rows() != num_rows { - panic!( - "DataFrame::new: TypedFrame has inconsistent row count ({} vs {})", - typed_frame.rows(), - num_rows - ); - } - if typed_frame.index() != &common_index { - panic!("DataFrame::new: TypedFrame has inconsistent index with common index",); - } - - let subframe_id = next_subframe_id; - next_subframe_id += 1; - - for col_name_in_subframe in typed_frame.columns() { - if column_locations.contains_key(col_name_in_subframe) { - panic!( - "DataFrame::new: duplicate column name: {}", - col_name_in_subframe - ); - } - column_locations.insert( - col_name_in_subframe.clone(), - (subframe_id, col_name_in_subframe.clone()), - ); - } - subframes.insert(subframe_id, typed_frame); - } - - // Ensure all column_names provided are actually present in the subframes - for name in &column_names { - if !column_locations.contains_key(name) { - panic!( - "DataFrame::new: column name '{}' not found in provided TypedFrames", - name - ); - } - } - - Self { - column_names, // This now represents the logical order of ALL columns from all subframes - subframes, - column_locations, - index: common_index, - rows: num_rows, - next_subframe_id, + pub fn 
new() -> Self { + DataFrame { + frames_by_type: HashMap::new(), + column_to_type: HashMap::new(), + column_names: Vec::new(), + index: RowIndex::Range(0..0), // Initialize with an empty range index } } /// Returns the number of rows in the DataFrame. - #[inline] pub fn rows(&self) -> usize { - self.rows + self.index.len() } /// Returns the number of columns in the DataFrame. - #[inline] pub fn cols(&self) -> usize { self.column_names.len() } - /// Returns a slice of the column names in their logical order. - #[inline] - pub fn columns(&self) -> &[String] { + /// Returns a reference to the vector of column names. + pub fn get_column_names(&self) -> &Vec { &self.column_names } - /// Returns a reference to the DataFrame's row index. - #[inline] - pub fn index(&self) -> &RowIndex { - &self.index + /// Returns the number of internal Frame objects (one per unique data type). + pub fn num_internal_frames(&self) -> usize { + self.frames_by_type.len() } - /// Returns an immutable view to a column by its name. - /// Panics if the column is not found. - pub fn column(&self, name: &str) -> DataFrameColumn<'_> { - let (subframe_id, col_in_subframe_name) = self - .column_locations - .get(name) - .unwrap_or_else(|| panic!("DataFrame::column: unknown column label: '{}'", name)); - - let subframe = self.subframes.get(subframe_id).unwrap_or_else(|| { - panic!( - "DataFrame::column: internal error, subframe ID {} not found", - subframe_id - ) - }); - - match subframe { - TypedFrame::F64(f) => DataFrameColumn::F64(f.column(col_in_subframe_name)), - TypedFrame::I64(f) => DataFrameColumn::I64(f.column(col_in_subframe_name)), - TypedFrame::Bool(f) => DataFrameColumn::Bool(f.column(col_in_subframe_name)), - TypedFrame::String(f) => DataFrameColumn::String(f.column(col_in_subframe_name)), + /// Returns a reference to a column of a specific type, if it exists. 
+ pub fn get_column(&self, col_name: &str) -> Option<&[T]> + where + T: Clone + PartialEq + fmt::Display + fmt::Debug + 'static + Send + Sync + Any, + { + let expected_type_id = TypeId::of::(); + if let Some(actual_type_id) = self.column_to_type.get(col_name) { + if *actual_type_id == expected_type_id { + if let Some(sub_frame_box) = self.frames_by_type.get(actual_type_id) { + if let Some(frame) = sub_frame_box.as_any().downcast_ref::>() { + return Some(frame.column(col_name)); + } + } + } } + None } - /// Returns a mutable view to a column by its name. - /// Panics if the column is not found. - pub fn column_mut(&mut self, name: &str) -> DataFrameColumnMut<'_> { - let (subframe_id, col_in_subframe_name) = self - .column_locations - .get(name) - .unwrap_or_else(|| panic!("DataFrame::column_mut: unknown column label: '{}'", name)); + /// Returns a HashMap representing a row, mapping column names to their string values. + pub fn get_row(&self, row_idx: usize) -> Option> { + if row_idx >= self.rows() { + return None; + } - // We need to get a mutable reference to the subframe. - // This requires a bit of care because we're borrowing `self.column_locations` - // and then `self.subframes` mutably. - // To avoid double-borrowing, we can get the subframe_id and col_in_subframe_name first, - // then use them to access the subframe mutably. 
- let subframe_id_copy = *subframe_id; - let col_in_subframe_name_copy = col_in_subframe_name.clone(); + let mut row_data = HashMap::new(); + for col_name in &self.column_names { + if let Some(type_id) = self.column_to_type.get(col_name) { + if let Some(sub_frame_box) = self.frames_by_type.get(type_id) { + let value = sub_frame_box.get_value_as_string(row_idx, col_name); + row_data.insert(col_name.clone(), value); + } + } + } + Some(row_data) + } - let subframe = self - .subframes - .get_mut(&subframe_id_copy) + pub fn add_column(&mut self, col_name: &str, data: Vec) + where + T: Clone + PartialEq + fmt::Display + fmt::Debug + 'static + Send + Sync + Any, + { + let type_id = TypeId::of::(); + let col_name_string = col_name.to_string(); + + // Check for duplicate column name across the entire DataFrame + if self.column_to_type.contains_key(&col_name_string) { + panic!( + "DataFrame::add_column: duplicate column name: '{}'", + col_name_string + ); + } + + // If this is the first column being added, set the DataFrame's index + if self.column_names.is_empty() { + self.index = RowIndex::Range(0..data.len()); + } else { + // Ensure new column has the same number of rows as existing columns + if data.len() != self.index.len() { + panic!( + "DataFrame::add_column: new column '{}' has {} rows, but existing columns have {} rows", + col_name_string, + data.len(), + self.index.len() + ); + } + } + + // Check if a Frame of this type already exists + if let Some(sub_frame_box) = self.frames_by_type.get_mut(&type_id) { + // Downcast to the concrete Frame and add the column + if let Some(frame) = sub_frame_box.as_any_mut().downcast_mut::>() { + frame.add_column(col_name_string.clone(), data); + } else { + // This should ideally not happen if TypeId matches, but good for safety + panic!( + "Type mismatch when downcasting existing SubFrame for TypeId {:?}", + type_id + ); + } + } else { + // No Frame of this type exists, create a new one + // The Frame::new constructor expects a 
Matrix and column names. + // We create a Matrix from a single column vector. + let new_frame = Frame::new( + crate::matrix::Matrix::from_cols(vec![data]), + vec![col_name_string.clone()], + Some(self.index.clone()), // Pass the DataFrame's index to the new Frame + ); + self.frames_by_type.insert(type_id, Box::new(new_frame)); + } + + // Update column mappings and names + self.column_to_type.insert(col_name_string.clone(), type_id); + self.column_names.push(col_name_string); + } + + /// Drops a column from the DataFrame. + /// Panics if the column does not exist. + pub fn drop_column(&mut self, col_name: &str) { + let col_name_string = col_name.to_string(); + + // 1. Get the TypeId associated with the column + let type_id = self + .column_to_type + .remove(&col_name_string) .unwrap_or_else(|| { panic!( - "DataFrame::column_mut: internal error, subframe ID {} not found", - subframe_id_copy - ) + "DataFrame::drop_column: column '{}' not found", + col_name_string + ); }); - match subframe { - TypedFrame::F64(f) => DataFrameColumnMut::F64(f.column_mut(&col_in_subframe_name_copy)), - TypedFrame::I64(f) => DataFrameColumnMut::I64(f.column_mut(&col_in_subframe_name_copy)), - TypedFrame::Bool(f) => { - DataFrameColumnMut::Bool(f.column_mut(&col_in_subframe_name_copy)) - } - TypedFrame::String(f) => { - DataFrameColumnMut::String(f.column_mut(&col_in_subframe_name_copy)) - } - } - } + // 2. Remove the column name from the ordered list + self.column_names.retain(|name| name != &col_name_string); - /// Adds a new column to the DataFrame. - /// This involves either adding it to an existing TypedFrame if types match, - /// or creating a new TypedFrame. - /// Panics if a column with the same name already exists or if the new column's - /// row count or index does not match the DataFrame's. - /// Adds a new column to the DataFrame. - /// The `column_data` must be a `TypedFrame` containing exactly one column, - /// and its name must match the `name` parameter. 
- pub fn add_column(&mut self, name: String, column_data: TypedFrame) { - if self.column_locations.contains_key(&name) { - panic!("DataFrame::add_column: duplicate column label: {}", name); - } - if column_data.rows() != self.rows { + // 3. Find the Frame object and delete the column from it + if let Some(sub_frame_box) = self.frames_by_type.get_mut(&type_id) { + sub_frame_box.delete_column_from_frame(&col_name_string); + + // 4. If the Frame object for this type becomes empty, remove it from frames_by_type + if sub_frame_box.get_frame_cols() == 0 { + self.frames_by_type.remove(&type_id); + } + } else { + // This should not happen if column_to_type was consistent panic!( - "DataFrame::add_column: new column '{}' has inconsistent row count ({} vs {})", - name, - column_data.rows(), - self.rows + "DataFrame::drop_column: internal error, no frame found for type_id {:?}", + type_id ); } - if column_data.index() != &self.index { - panic!( - "DataFrame::add_column: new column '{}' has inconsistent index with DataFrame's index", - name - ); - } - // Ensure the provided TypedFrame contains exactly one column, and its name matches `name` - if column_data.cols() != 1 || column_data.columns()[0] != name { - panic!( - "DataFrame::add_column: provided TypedFrame must contain exactly one column named '{}'", - name - ); - } - - let subframe_id = self.next_subframe_id; - self.next_subframe_id += 1; - - self.subframes.insert(subframe_id, column_data); - self.column_locations - .insert(name.clone(), (subframe_id, name.clone())); - - self.column_names.push(name); - } - - /// Deletes a column by name and returns its data as a new single-column TypedFrame. - /// Panics if the column name is not found. 
- pub fn delete_column(&mut self, name: &str) -> TypedFrame { - let (subframe_id, col_in_subframe_name) = - self.column_locations.remove(name).unwrap_or_else(|| { - panic!("DataFrame::delete_column: unknown column label: '{}'", name) - }); - - // Remove from logical column names - if let Some(pos) = self.column_names.iter().position(|n| n == name) { - self.column_names.remove(pos); - } - - let subframe = self.subframes.get_mut(&subframe_id).unwrap_or_else(|| { - panic!( - "DataFrame::delete_column: internal error, subframe ID {} not found", - subframe_id - ) - }); - - macro_rules! delete_column_from_typed_frame { - ($frame_type:ident, $frame_variant:ident, $f:ident, $col_name:expr) => {{ - let data = $f.delete_column(&$col_name); - TypedFrame::$frame_variant(Frame::new( - Matrix::from_cols(vec![data]), - vec![$col_name.clone()], - Some($f.index().clone()), - )) - }}; - } - - let deleted_data_frame = match subframe { - TypedFrame::F64(f) => { - delete_column_from_typed_frame!(f64, F64, f, col_in_subframe_name) - } - TypedFrame::I64(f) => { - delete_column_from_typed_frame!(i64, I64, f, col_in_subframe_name) - } - TypedFrame::Bool(f) => { - delete_column_from_typed_frame!(bool, Bool, f, col_in_subframe_name) - } - TypedFrame::String(f) => { - delete_column_from_typed_frame!(String, String, f, col_in_subframe_name) - } - }; - - // If the subframe becomes empty after deletion, remove it from the map - if subframe.cols() == 0 { - self.subframes.remove(&subframe_id); - } - - deleted_data_frame - } - - /// Renames an existing column. 
- pub fn rename_column(&mut self, old_name: &str, new_name: String) { - if old_name == new_name { - return; // No change needed - } - if self.column_locations.contains_key(&new_name) { - panic!( - "DataFrame::rename_column: new column name '{}' already exists", - new_name - ); - } - - let (subframe_id, col_in_subframe_name) = - self.column_locations.remove(old_name).unwrap_or_else(|| { - panic!( - "DataFrame::rename_column: unknown column label: '{}'", - old_name - ) - }); - - // Update the column_locations map - self.column_locations - .insert(new_name.clone(), (subframe_id, new_name.clone())); - - // Update the logical column_names vector - if let Some(pos) = self.column_names.iter().position(|n| n == old_name) { - self.column_names[pos] = new_name.clone(); - } - - // Rename the column in the underlying TypedFrame - let subframe = self.subframes.get_mut(&subframe_id).unwrap_or_else(|| { - panic!( - "DataFrame::rename_column: internal error, subframe ID {} not found", - subframe_id - ) - }); - - match subframe { - TypedFrame::F64(f) => f.rename(&col_in_subframe_name, new_name), - TypedFrame::I64(f) => f.rename(&col_in_subframe_name, new_name), - TypedFrame::Bool(f) => f.rename(&col_in_subframe_name, new_name), - TypedFrame::String(f) => f.rename(&col_in_subframe_name, new_name), - } - } - - /// Sorts columns alphabetically by name. - pub fn sort_columns(&mut self) { - self.column_names.sort(); - } - - /// Returns a new DataFrame containing the first `n` rows. - /// If `n` is greater than the number of rows in the DataFrame, - /// the entire DataFrame is returned. 
- pub fn head_n(&self, n: usize) -> Self { - let num_rows_to_take = n.min(self.rows()); - - if num_rows_to_take == 0 { - return DataFrame::new(vec![], vec![], Some(RowIndex::Range(0..0))); - } - - let new_index = match &self.index { - RowIndex::Range(r) => RowIndex::Range(r.start..r.start + num_rows_to_take), - RowIndex::Int(v) => RowIndex::Int(v[0..num_rows_to_take].to_vec()), - RowIndex::Date(v) => RowIndex::Date(v[0..num_rows_to_take].to_vec()), - }; - - let mut new_typed_frames = Vec::new(); - for col_name in self.columns() { - let col_data = self.column(col_name); - match col_data { - DataFrameColumn::F64(s) => { - let new_data = s[0..num_rows_to_take].to_vec(); - new_typed_frames.push(TypedFrame::F64(Frame::new( - Matrix::from_cols(vec![new_data]), - vec![col_name.clone()], - Some(new_index.clone()), - ))); - } - DataFrameColumn::I64(s) => { - let new_data = s[0..num_rows_to_take].to_vec(); - new_typed_frames.push(TypedFrame::I64(Frame::new( - Matrix::from_cols(vec![new_data]), - vec![col_name.clone()], - Some(new_index.clone()), - ))); - } - DataFrameColumn::Bool(s) => { - let new_data = s[0..num_rows_to_take].to_vec(); - new_typed_frames.push(TypedFrame::Bool(Frame::new( - Matrix::from_cols(vec![new_data]), - vec![col_name.clone()], - Some(new_index.clone()), - ))); - } - DataFrameColumn::String(s) => { - let new_data = s[0..num_rows_to_take].to_vec(); - new_typed_frames.push(TypedFrame::String(Frame::new( - Matrix::from_cols(vec![new_data]), - vec![col_name.clone()], - Some(new_index.clone()), - ))); - } - } - } - - DataFrame::new(new_typed_frames, self.column_names.clone(), Some(new_index)) - } - - /// Returns a new DataFrame containing the last `n` rows. - /// If `n` is greater than the number of rows in the DataFrame, - /// the entire DataFrame is returned. 
- pub fn tail_n(&self, n: usize) -> Self { - let num_rows_to_take = n.min(self.rows()); - - if num_rows_to_take == 0 { - return DataFrame::new(vec![], vec![], Some(RowIndex::Range(0..0))); - } - - let start_row_idx = self.rows() - num_rows_to_take; - - let new_index = match &self.index { - RowIndex::Range(r) => RowIndex::Range(r.start + start_row_idx..r.start + self.rows()), - RowIndex::Int(v) => RowIndex::Int(v[start_row_idx..].to_vec()), - RowIndex::Date(v) => RowIndex::Date(v[start_row_idx..].to_vec()), - }; - - let mut new_typed_frames = Vec::new(); - for col_name in self.columns() { - let col_data = self.column(col_name); - match col_data { - DataFrameColumn::F64(s) => { - let new_data = s[start_row_idx..].to_vec(); - new_typed_frames.push(TypedFrame::F64(Frame::new( - Matrix::from_cols(vec![new_data]), - vec![col_name.clone()], - Some(new_index.clone()), - ))); - } - DataFrameColumn::I64(s) => { - let new_data = s[start_row_idx..].to_vec(); - new_typed_frames.push(TypedFrame::I64(Frame::new( - Matrix::from_cols(vec![new_data]), - vec![col_name.clone()], - Some(new_index.clone()), - ))); - } - DataFrameColumn::Bool(s) => { - let new_data = s[start_row_idx..].to_vec(); - new_typed_frames.push(TypedFrame::Bool(Frame::new( - Matrix::from_cols(vec![new_data]), - vec![col_name.clone()], - Some(new_index.clone()), - ))); - } - DataFrameColumn::String(s) => { - let new_data = s[start_row_idx..].to_vec(); - new_typed_frames.push(TypedFrame::String(Frame::new( - Matrix::from_cols(vec![new_data]), - vec![col_name.clone()], - Some(new_index.clone()), - ))); - } - } - } - - DataFrame::new(new_typed_frames, self.column_names.clone(), Some(new_index)) - } - - /// Returns a new DataFrame containing the first 5 rows, showing all columns. 
- pub fn head(&self) -> Self { - let n = DEFAULT_DISPLAY_ROWS.min(self.rows()); - if n == 0 { - return DataFrame::new(vec![], vec![], Some(RowIndex::Range(0..0))); - } - let new_index = match &self.index { - RowIndex::Range(r) => RowIndex::Range(r.start..r.start + n), - RowIndex::Int(v) => RowIndex::Int(v[0..n].to_vec()), - RowIndex::Date(v) => RowIndex::Date(v[0..n].to_vec()), - }; - let mut new_typed_frames = Vec::new(); - for col_name in self.columns() { - let col_data = self.column(col_name); - match col_data { - DataFrameColumn::F64(s) => { - let new_data = s[0..n].to_vec(); - new_typed_frames.push(TypedFrame::F64(Frame::new( - Matrix::from_cols(vec![new_data]), - vec![col_name.clone()], - Some(new_index.clone()), - ))); - } - DataFrameColumn::I64(s) => { - let new_data = s[0..n].to_vec(); - new_typed_frames.push(TypedFrame::I64(Frame::new( - Matrix::from_cols(vec![new_data]), - vec![col_name.clone()], - Some(new_index.clone()), - ))); - } - DataFrameColumn::Bool(s) => { - let new_data = s[0..n].to_vec(); - new_typed_frames.push(TypedFrame::Bool(Frame::new( - Matrix::from_cols(vec![new_data]), - vec![col_name.clone()], - Some(new_index.clone()), - ))); - } - DataFrameColumn::String(s) => { - let new_data = s[0..n].to_vec(); - new_typed_frames.push(TypedFrame::String(Frame::new( - Matrix::from_cols(vec![new_data]), - vec![col_name.clone()], - Some(new_index.clone()), - ))); - } - } - } - - DataFrame::new(new_typed_frames, self.column_names.clone(), Some(new_index)) - } - - /// Returns a new DataFrame containing the last 5 rows, showing all columns. 
- pub fn tail(&self) -> Self { - let n = DEFAULT_DISPLAY_ROWS.min(self.rows()); - if n == 0 { - return DataFrame::new(vec![], vec![], Some(RowIndex::Range(0..0))); - } - self.tail_n(n) } } impl fmt::Display for DataFrame { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - if self.rows() == 0 || self.cols() == 0 { - return write!( - f, - "+-------------------+\n| Empty DataFrame |\n+-------------------+\nRows: {}, Columns: {}", - self.rows(), - self.cols() - ); + // Display column headers + for col_name in self.column_names.iter().take(DEFAULT_DISPLAY_COLS) { + write!(f, "{:<15}", col_name)?; } - let display_rows = DEFAULT_DISPLAY_ROWS; - let display_cols = DEFAULT_DISPLAY_COLS; - let total_rows = self.rows(); - let total_cols = self.cols(); - let show_row_ellipsis = total_rows > display_rows * 2; - let show_col_ellipsis = total_cols > display_cols; - // Row indices to display - let mut row_indices = Vec::new(); - if show_row_ellipsis { - for i in 0..display_rows { - row_indices.push(i); - } - for i in (total_rows - display_rows)..total_rows { - row_indices.push(i); - } - } else { - for i in 0..total_rows { - row_indices.push(i); - } - } - // Column indices to display - let mut col_indices = Vec::new(); - if show_col_ellipsis { - let first = display_cols / 2; - let last = display_cols - first; - for i in 0..first { - col_indices.push(i); - } - col_indices.push(usize::MAX); // ellipsis - for i in (total_cols - last)..total_cols { - col_indices.push(i); - } - } else { - for i in 0..total_cols { - col_indices.push(i); - } - } - // Calculate column widths - let mut col_widths = Vec::new(); - let mut index_width = "Index".len().max("...".len()); - for &i in &row_indices { - let idx_str = match &self.index { - RowIndex::Range(r) => format!("{}", r.start + i), - RowIndex::Int(v) => format!("{}", v[i]), - RowIndex::Date(v) => format!("{}", v[i]), - }; - index_width = index_width.max(idx_str.len()); - } - for &col_idx in &col_indices { - if col_idx == usize::MAX { 
- col_widths.push("...".len()); - continue; - } - let col_name = &self.column_names[col_idx]; - let mut maxw = col_name.len().max("...".len()); - let col_data = self.column(col_name); - for &i in &row_indices { - let cell_str = match &col_data { - DataFrameColumn::F64(s) => format!("{}", s[i]), - DataFrameColumn::I64(s) => format!("{}", s[i]), - DataFrameColumn::Bool(s) => format!("{}", s[i]), - DataFrameColumn::String(s) => format!("{}", s[i]), - }; - maxw = maxw.max(cell_str.len()); - } - col_widths.push(maxw); - } - // Draw top border - write!(f, "┌{:─<1$}┬", "", index_width + 2)?; - for (i, w) in col_widths.iter().enumerate() { - if i + 1 == col_widths.len() { - write!(f, "{:─<1$}┐", "", w + 2)?; - } else { - write!(f, "{:─<1$}┬", "", w + 2)?; - } + if self.column_names.len() > DEFAULT_DISPLAY_COLS { + write!(f, "...")?; } writeln!(f)?; - // Draw header row - write!(f, "│ {:^width$} ", "Index", width = index_width)?; - for (col_idx, w) in col_indices.iter().zip(&col_widths) { - if *col_idx == usize::MAX { - write!(f, "│ {:^width$} ", "...", width = w)?; - } else { - let col_name = &self.column_names[*col_idx]; - write!(f, "│ {:^width$} ", col_name, width = w)?; + + // Display data rows + let mut displayed_rows = 0; + for i in 0..self.index.len() { + if displayed_rows >= DEFAULT_DISPLAY_ROWS { + writeln!(f, "...")?; + break; } - } - writeln!(f, "│")?; - // Draw header separator - write!(f, "├{:─<1$}┼", "", index_width + 2)?; - for (i, w) in col_widths.iter().enumerate() { - if i + 1 == col_widths.len() { - write!(f, "{:─<1$}┤", "", w + 2)?; - } else { - write!(f, "{:─<1$}┼", "", w + 2)?; - } - } - writeln!(f)?; - // Draw data rows - for (row_pos, &i) in row_indices.iter().enumerate() { - if show_row_ellipsis && row_pos == display_rows { - // Ellipsis row - write!(f, "│ {:>width$} ", "...", width = index_width)?; - for w in &col_widths { - write!(f, "│ {:>width$} ", "...", width = *w)?; - } - writeln!(f, "│")?; - // Draw row separator after ellipsis - write!(f, 
"├{:─<1$}┼", "", index_width + 2)?; - for (j, w) in col_widths.iter().enumerate() { - if j + 1 == col_widths.len() { - write!(f, "{:─<1$}┤", "", w + 2)?; + for col_name in self.column_names.iter().take(DEFAULT_DISPLAY_COLS) { + if let Some(type_id) = self.column_to_type.get(col_name) { + if let Some(sub_frame_box) = self.frames_by_type.get(type_id) { + write!(f, "{:<15}", sub_frame_box.get_value_as_string(i, col_name))?; } else { - write!(f, "{:─<1$}┼", "", w + 2)?; + // This case indicates an inconsistency: column_to_type has an entry, + // but frames_by_type doesn't have the corresponding Frame. + write!(f, "{:<15}", "[ERROR]")?; } - } - writeln!(f)?; - } - let idx_str = match &self.index { - RowIndex::Range(r) => format!("{}", r.start + i), - RowIndex::Int(v) => format!("{}", v[i]), - RowIndex::Date(v) => format!("{}", v[i]), - }; - write!(f, "│ {:>width$} ", idx_str, width = index_width)?; - for (col_pos, col_idx) in col_indices.iter().enumerate() { - if *col_idx == usize::MAX { - write!(f, "│ {:>width$} ", "...", width = col_widths[col_pos])?; } else { - let col_name = &self.column_names[*col_idx]; - let col_data = self.column(col_name); - let cell_str = match &col_data { - DataFrameColumn::F64(s) => format!("{}", s[i]), - DataFrameColumn::I64(s) => format!("{}", s[i]), - DataFrameColumn::Bool(s) => format!("{}", s[i]), - DataFrameColumn::String(s) => format!("{}", s[i]), - }; - write!(f, "│ {:>width$} ", cell_str, width = col_widths[col_pos])?; + // This case indicates an inconsistency: column_names has an entry, + // but column_to_type doesn't have the corresponding column. 
+ write!(f, "{:<15}", "[ERROR]")?; } } - writeln!(f, "│")?; - // Draw row separator after every row except the last - if row_pos + 1 != row_indices.len() { - write!(f, "├{:─<1$}┼", "", index_width + 2)?; - for (j, w) in col_widths.iter().enumerate() { - if j + 1 == col_widths.len() { - write!(f, "{:─<1$}┤", "", w + 2)?; - } else { - write!(f, "{:─<1$}┼", "", w + 2)?; - } - } - writeln!(f)?; + if self.column_names.len() > DEFAULT_DISPLAY_COLS { + write!(f, "...")?; } + writeln!(f)?; + displayed_rows += 1; } - // Draw bottom border - write!(f, "└{:─<1$}┴", "", index_width + 2)?; - for (i, w) in col_widths.iter().enumerate() { - if i + 1 == col_widths.len() { - write!(f, "{:─<1$}┘", "", w + 2)?; - } else { - write!(f, "{:─<1$}┴", "", w + 2)?; - } - } - writeln!(f)?; - write!(f, "[{} rows x {} columns]", self.rows(), self.cols()) + Ok(()) } } +impl fmt::Debug for DataFrame { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("DataFrame") + .field("column_names", &self.column_names) + .field("index", &self.index) + .field("column_to_type", &self.column_to_type) + .field("frames_by_type", &self.frames_by_type) + .finish() + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + #[cfg(test)] mod tests { use super::*; use crate::frame::Frame; use crate::matrix::Matrix; - use chrono::NaiveDate; - - // Helper for dates - fn d(y: i32, m: u32, d: u32) -> NaiveDate { - NaiveDate::from_ymd_opt(y, m, d).unwrap() - } - - // Helper to create a simple f64 TypedFrame - fn create_f64_typed_frame(name: &str, data: Vec, index: Option) -> TypedFrame { - let rows = data.len(); - let matrix = Matrix::from_cols(vec![data]); - let frame_index = index.unwrap_or(RowIndex::Range(0..rows)); - TypedFrame::F64(Frame::new( - matrix, - vec![name.to_string()], - Some(frame_index), - )) - } - - // Helper to create a simple i64 TypedFrame - fn create_i64_typed_frame(name: &str, data: Vec, index: Option) -> TypedFrame { - 
let rows = data.len(); - let matrix = Matrix::from_cols(vec![data]); - let frame_index = index.unwrap_or(RowIndex::Range(0..rows)); - TypedFrame::I64(Frame::new( - matrix, - vec![name.to_string()], - Some(frame_index), - )) - } - - // Helper to create a simple String TypedFrame - fn create_string_typed_frame( - name: &str, - data: Vec, - index: Option, - ) -> TypedFrame { - let rows = data.len(); - let matrix = Matrix::from_cols(vec![data]); - let frame_index = index.unwrap_or(RowIndex::Range(0..rows)); - TypedFrame::String(Frame::new( - matrix, - vec![name.to_string()], - Some(frame_index), - )) - } - - // Helper to create a multi-column f64 TypedFrame - fn create_multi_f64_typed_frame( - names: Vec<&str>, - data: Vec>, - index: Option, - ) -> TypedFrame { - let rows = data[0].len(); - let matrix = Matrix::from_cols(data); - let frame_index = index.unwrap_or(RowIndex::Range(0..rows)); - TypedFrame::F64(Frame::new( - matrix, - names.into_iter().map(|s| s.to_string()).collect(), - Some(frame_index), - )) - } #[test] - fn test_dataframe_new_basic() { - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0], None); - let col_b = create_i64_typed_frame("B", vec![4, 5, 6], None); - let col_c = create_string_typed_frame( - "C", - vec!["x".to_string(), "y".to_string(), "z".to_string()], - None, - ); - - let df = DataFrame::new( - vec![col_a, col_b, col_c], - vec!["A".to_string(), "B".to_string(), "C".to_string()], - None, - ); - - assert_eq!(df.rows(), 3); - assert_eq!(df.cols(), 3); - assert_eq!(df.columns(), &["A", "B", "C"]); - assert_eq!(df.index(), &RowIndex::Range(0..3)); - - // Check column data using the new DataFrameColumn view - if let DataFrameColumn::F64(slice_a) = df.column("A") { - assert_eq!(slice_a, &[1.0, 2.0, 3.0]); - } else { - panic!("Column A is not f64"); - } - if let DataFrameColumn::I64(slice_b) = df.column("B") { - assert_eq!(slice_b, &[4, 5, 6]); - } else { - panic!("Column B is not i64"); - } - if let DataFrameColumn::String(slice_c) = 
df.column("C") { - assert_eq!( - slice_c, - &["x".to_string(), "y".to_string(), "z".to_string()] - ); - } else { - panic!("Column C is not String"); - } - } - - #[test] - fn test_dataframe_rows_cols_columns_index() { - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0], None); - let col_b = create_i64_typed_frame("B", vec![4, 5, 6], None); - let df = DataFrame::new( - vec![col_a, col_b], - vec!["A".to_string(), "B".to_string()], - None, - ); - - assert_eq!(df.rows(), 3); - assert_eq!(df.cols(), 2); - assert_eq!(df.columns(), &["A", "B"]); - assert_eq!(df.index(), &RowIndex::Range(0..3)); - - let empty_df = DataFrame::new(vec![], vec![], None); - assert_eq!(empty_df.rows(), 0); - assert_eq!(empty_df.cols(), 0); - assert_eq!(empty_df.columns(), &[] as &[String]); - assert_eq!(empty_df.index(), &RowIndex::Range(0..0)); - } - - #[test] - fn test_dataframe_column_access() { - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); - let col_b = create_i64_typed_frame("B", vec![10, 20], None); - let col_c = - create_string_typed_frame("C", vec!["foo".to_string(), "bar".to_string()], None); - - let df = DataFrame::new( - vec![col_a, col_b, col_c], - vec!["A".to_string(), "B".to_string(), "C".to_string()], - None, - ); - - // Test f64 column - let col_a_view = df.column("A"); - assert!(col_a_view.as_f64().is_some()); - assert_eq!(col_a_view.as_f64().unwrap(), &[1.0, 2.0]); - assert!(col_a_view.as_i64().is_none()); - assert_eq!(col_a_view.len(), 2); - assert!(!col_a_view.is_empty()); - - // Test i64 column - let col_b_view = df.column("B"); - assert!(col_b_view.as_i64().is_some()); - assert_eq!(col_b_view.as_i64().unwrap(), &[10, 20]); - - // Test String column - let col_c_view = df.column("C"); - assert!(col_c_view.as_string().is_some()); - assert_eq!( - col_c_view.as_string().unwrap(), - &["foo".to_string(), "bar".to_string()] - ); - } - - #[test] - #[should_panic(expected = "unknown column label: 'D'")] - fn test_dataframe_column_panic_unknown() { - let 
col_a = create_f64_typed_frame("A", vec![1.0], None); - let df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - df.column("D"); - } - - #[test] - fn test_dataframe_column_mut_access() { - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); - let col_b = create_i64_typed_frame("B", vec![10, 20], None); - let col_c = - create_string_typed_frame("C", vec!["foo".to_string(), "bar".to_string()], None); - - let mut df = DataFrame::new( - vec![col_a, col_b, col_c], - vec!["A".to_string(), "B".to_string(), "C".to_string()], - None, - ); - - // Test f64 column mut - if let DataFrameColumnMut::F64(slice_a_mut) = df.column_mut("A") { - slice_a_mut[0] = 100.0; - } else { - panic!("Column A is not f64 mut"); - } - assert_eq!(df.column("A").as_f64().unwrap(), &[100.0, 2.0]); - - // Test i64 column mut - if let DataFrameColumnMut::I64(slice_b_mut) = df.column_mut("B") { - slice_b_mut[1] = 200; - } else { - panic!("Column B is not i64 mut"); - } - assert_eq!(df.column("B").as_i64().unwrap(), &[10, 200]); - - // Test String column mut - if let DataFrameColumnMut::String(slice_c_mut) = df.column_mut("C") { - slice_c_mut[0] = "baz".to_string(); - } else { - panic!("Column C is not String mut"); - } - assert_eq!( - df.column("C").as_string().unwrap(), - &["baz".to_string(), "bar".to_string()] - ); - } - - #[test] - #[should_panic(expected = "unknown column label: 'D'")] - fn test_dataframe_column_mut_panic_unknown() { - let col_a = create_f64_typed_frame("A", vec![1.0], None); - let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - df.column_mut("D"); - } - - #[test] - fn test_dataframe_add_column() { - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); - let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - - let new_col_b = create_i64_typed_frame("B", vec![10, 20], None); - df.add_column("B".to_string(), new_col_b); - - assert_eq!(df.rows(), 2); - assert_eq!(df.cols(), 2); - assert_eq!(df.columns(), &["A", 
"B"]); - assert_eq!(df.column("A").as_f64().unwrap(), &[1.0, 2.0]); - assert_eq!(df.column("B").as_i64().unwrap(), &[10, 20]); - - let new_col_c = - create_string_typed_frame("C", vec!["x".to_string(), "y".to_string()], None); - df.add_column("C".to_string(), new_col_c); - assert_eq!(df.cols(), 3); - assert_eq!(df.columns(), &["A", "B", "C"]); - assert_eq!( - df.column("C").as_string().unwrap(), - &["x".to_string(), "y".to_string()] - ); - } - - #[test] - #[should_panic(expected = "duplicate column label: B")] - fn test_dataframe_add_column_panic_duplicate_name() { - let col_a = create_f64_typed_frame("A", vec![1.0], None); - let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - let new_col_b = create_i64_typed_frame("B", vec![10], None); - df.add_column("B".to_string(), new_col_b); - let another_col_b = create_i64_typed_frame("B", vec![20], None); - df.add_column("B".to_string(), another_col_b); - } - - #[test] - #[should_panic(expected = "new column 'B' has inconsistent row count (1 vs 2)")] - fn test_dataframe_add_column_panic_inconsistent_rows() { - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); - let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - let new_col_b = create_i64_typed_frame("B", vec![10], None); // Mismatch - df.add_column("B".to_string(), new_col_b); - } - - #[test] - #[should_panic(expected = "provided TypedFrame must contain exactly one column named 'B'")] - fn test_dataframe_add_column_panic_multi_column_typedframe() { - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); - let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - let multi_col_b = create_multi_f64_typed_frame( - vec!["B", "C"], - vec![vec![10.0, 20.0], vec![30.0, 40.0]], - None, - ); - df.add_column("B".to_string(), multi_col_b); - } - - #[test] - fn test_dataframe_delete_column() { - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); - let col_b = create_i64_typed_frame("B", vec![10, 
20], None); - let col_c = create_string_typed_frame("C", vec!["x".to_string(), "y".to_string()], None); - - let mut df = DataFrame::new( - vec![col_a, col_b, col_c], - vec!["A".to_string(), "B".to_string(), "C".to_string()], - None, - ); - - let deleted_col_b = df.delete_column("B"); - assert_eq!(df.cols(), 2); - assert_eq!(df.columns(), &["A", "C"]); - assert_eq!(df.column("A").as_f64().unwrap(), &[1.0, 2.0]); - assert_eq!( - df.column("C").as_string().unwrap(), - &["x".to_string(), "y".to_string()] - ); - - if let TypedFrame::I64(frame_b) = deleted_col_b { - assert_eq!(frame_b.column("B"), &[10, 20]); - } else { - panic!("Deleted column B is not i64 TypedFrame"); - } - - let deleted_col_a = df.delete_column("A"); - assert_eq!(df.cols(), 1); - assert_eq!(df.columns(), &["C"]); - assert_eq!( - df.column("C").as_string().unwrap(), - &["x".to_string(), "y".to_string()] - ); - if let TypedFrame::F64(frame_a) = deleted_col_a { - assert_eq!(frame_a.column("A"), &[1.0, 2.0]); - } else { - panic!("Deleted column A is not f64 TypedFrame"); - } - - let deleted_col_c = df.delete_column("C"); + fn test_dataframe_new() { + let df = DataFrame::new(); + assert_eq!(df.rows(), 0); assert_eq!(df.cols(), 0); - assert_eq!(df.columns(), &[] as &[String]); - if let TypedFrame::String(frame_c) = deleted_col_c { - assert_eq!(frame_c.column("C"), &["x".to_string(), "y".to_string()]); - } else { - panic!("Deleted column C is not String TypedFrame"); - } + assert!(df.get_column_names().is_empty()); + assert!(df.frames_by_type.is_empty()); + assert!(df.column_to_type.is_empty()); } #[test] - #[should_panic(expected = "unknown column label: 'D'")] - fn test_dataframe_delete_column_panic_unknown() { - let col_a = create_f64_typed_frame("A", vec![1.0], None); - let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - df.delete_column("D"); + fn test_dataframe_add_column_initial() { + let mut df = DataFrame::new(); + let data = vec![1, 2, 3]; + df.add_column("col_int", 
data.clone()); + + assert_eq!(df.rows(), 3); + assert_eq!(df.cols(), 1); + assert_eq!(df.get_column_names(), &vec!["col_int".to_string()]); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + assert_eq!(df.column_to_type.get("col_int"), Some(&TypeId::of::())); + + // Verify the underlying frame + let sub_frame_box = df.frames_by_type.get(&TypeId::of::()).unwrap(); + let frame = sub_frame_box.as_any().downcast_ref::>().unwrap(); + assert_eq!(frame.rows(), 3); + assert_eq!(frame.cols(), 1); + assert_eq!(frame.columns(), &vec!["col_int".to_string()]); } #[test] - fn test_dataframe_rename_column() { - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], None); - let col_b = create_i64_typed_frame("B", vec![10, 20], None); - let mut df = DataFrame::new( - vec![col_a, col_b], - vec!["A".to_string(), "B".to_string()], - None, - ); - - df.rename_column("A", "Alpha".to_string()); - assert_eq!(df.columns(), &["Alpha", "B"]); - assert_eq!(df.column("Alpha").as_f64().unwrap(), &[1.0, 2.0]); - assert!(!df.column_locations.contains_key("A")); - assert!(df.column_locations.contains_key("Alpha")); - - df.rename_column("B", "Beta".to_string()); - assert_eq!(df.columns(), &["Alpha", "Beta"]); - assert_eq!(df.column("Beta").as_i64().unwrap(), &[10, 20]); - } - - #[test] - #[should_panic(expected = "unknown column label: 'C'")] - fn test_dataframe_rename_column_panic_unknown_old_name() { - let col_a = create_f64_typed_frame("A", vec![1.0], None); - let mut df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - df.rename_column("C", "D".to_string()); - } - - #[test] - #[should_panic(expected = "new column name 'B' already exists")] - fn test_dataframe_rename_column_panic_new_name_exists() { - let col_a = create_f64_typed_frame("A", vec![1.0], None); - let col_b = create_i64_typed_frame("B", vec![10], None); - let mut df = DataFrame::new( - vec![col_a, col_b], - vec!["A".to_string(), "B".to_string()], - None, - ); - df.rename_column("A", "B".to_string()); - } - - 
#[test] - fn test_dataframe_sort_columns() { - let col_c = create_f64_typed_frame("C", vec![1.0], None); - let col_a = create_i64_typed_frame("A", vec![2], None); - let col_b = create_string_typed_frame("B", vec!["x".to_string()], None); - - let mut df = DataFrame::new( - vec![col_c, col_a, col_b], - vec!["C".to_string(), "A".to_string(), "B".to_string()], - None, - ); - - assert_eq!(df.columns(), &["C", "A", "B"]); - df.sort_columns(); - assert_eq!(df.columns(), &["A", "B", "C"]); - - // Ensure data integrity after sort - assert_eq!(df.column("A").as_i64().unwrap(), &[2]); - assert_eq!(df.column("B").as_string().unwrap(), &["x".to_string()]); - assert_eq!(df.column("C").as_f64().unwrap(), &[1.0]); - } - - #[test] - fn test_dataframe_new_with_multi_column_subframe() { - let multi_f64_frame = create_multi_f64_typed_frame( - vec!["X", "Y"], - vec![vec![1.0, 2.0], vec![10.0, 20.0]], - None, - ); - let single_string_frame = - create_string_typed_frame("Z", vec!["a".to_string(), "b".to_string()], None); - - let df = DataFrame::new( - vec![multi_f64_frame, single_string_frame], - vec!["X".to_string(), "Y".to_string(), "Z".to_string()], - None, - ); - - assert_eq!(df.rows(), 2); - assert_eq!(df.cols(), 3); - assert_eq!(df.columns(), &["X", "Y", "Z"]); - - if let DataFrameColumn::F64(slice_x) = df.column("X") { - assert_eq!(slice_x, &[1.0, 2.0]); - } else { - panic!("Column X is not f64"); - } - if let DataFrameColumn::F64(slice_y) = df.column("Y") { - assert_eq!(slice_y, &[10.0, 20.0]); - } else { - panic!("Column Y is not f64"); - } - if let DataFrameColumn::String(slice_z) = df.column("Z") { - assert_eq!(slice_z, &["a".to_string(), "b".to_string()]); - } else { - panic!("Column Z is not String"); - } - } - - #[test] - fn test_dataframe_new_with_int_index() { - let index = RowIndex::Int(vec![10, 20, 30]); - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0], Some(index.clone())); - let col_b = create_i64_typed_frame("B", vec![4, 5, 6], Some(index.clone())); - - 
let df = DataFrame::new( - vec![col_a, col_b], - vec!["A".to_string(), "B".to_string()], - Some(index.clone()), - ); + fn test_dataframe_add_column_same_type() { + let mut df = DataFrame::new(); + df.add_column("col_int1", vec![1, 2, 3]); + df.add_column("col_int2", vec![4, 5, 6]); assert_eq!(df.rows(), 3); assert_eq!(df.cols(), 2); - assert_eq!(df.index(), &index); - } - - #[test] - fn test_dataframe_new_with_date_index() { - let index_vec = vec![d(2024, 1, 1), d(2024, 1, 2)]; - let index = RowIndex::Date(index_vec.clone()); - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0], Some(index.clone())); - let col_b = create_string_typed_frame( - "B", - vec!["hello".to_string(), "world".to_string()], - Some(index.clone()), - ); - - let df = DataFrame::new( - vec![col_a, col_b], - vec!["A".to_string(), "B".to_string()], - Some(index.clone()), - ); - - assert_eq!(df.rows(), 2); - assert_eq!(df.cols(), 2); - assert_eq!(df.index(), &index); - } - - #[test] - #[should_panic(expected = "column name 'B' not found in provided TypedFrames")] - fn test_dataframe_new_panic_col_name_not_found_in_subframes() { - let col_a = create_f64_typed_frame("A", vec![1.0], None); - DataFrame::new(vec![col_a], vec!["A".to_string(), "B".to_string()], None); - } - - #[test] - #[should_panic(expected = "TypedFrame has inconsistent row count (2 vs 3)")] - fn test_dataframe_new_panic_inconsistent_rows() { - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0], None); - let col_b = create_i64_typed_frame("B", vec![4, 5], None); // Mismatch - DataFrame::new( - vec![col_a, col_b], - vec!["A".to_string(), "B".to_string()], - None, - ); - } - - #[test] - #[should_panic(expected = "duplicate column name: A")] - fn test_dataframe_new_panic_duplicate_col_name() { - let col_a1 = create_f64_typed_frame("A", vec![1.0], None); - let col_a2 = create_i64_typed_frame("A", vec![2], None); // Duplicate name - DataFrame::new( - vec![col_a1, col_a2], - vec!["A".to_string(), "A".to_string()], - None, - ); - 
} - - #[test] - fn test_dataframe_head_n() { - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0], None); - let col_b = create_i64_typed_frame("B", vec![10, 20, 30, 40, 50, 60, 70], None); - let df = DataFrame::new( - vec![col_a, col_b], - vec!["A".to_string(), "B".to_string()], - None, - ); - - // Test head_n with n < rows - let head_df = df.head_n(3); - assert_eq!(head_df.rows(), 3); - assert_eq!(head_df.cols(), 2); - assert_eq!(head_df.columns(), &["A", "B"]); - assert_eq!(head_df.index(), &RowIndex::Range(0..3)); - assert_eq!(head_df.column("A").as_f64().unwrap(), &[1.0, 2.0, 3.0]); - assert_eq!(head_df.column("B").as_i64().unwrap(), &[10, 20, 30]); - - // Test head_n with n > rows - let head_all_df = df.head_n(10); - assert_eq!(head_all_df.rows(), 7); - assert_eq!(head_all_df.cols(), 2); - assert_eq!(head_all_df.columns(), &["A", "B"]); - assert_eq!(head_all_df.index(), &RowIndex::Range(0..7)); assert_eq!( - head_all_df.column("A").as_f64().unwrap(), - &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0] + df.get_column_names(), + &vec!["col_int1".to_string(), "col_int2".to_string()] + ); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + assert_eq!( + df.column_to_type.get("col_int1"), + Some(&TypeId::of::()) ); assert_eq!( - head_all_df.column("B").as_i64().unwrap(), - &[10, 20, 30, 40, 50, 60, 70] + df.column_to_type.get("col_int2"), + Some(&TypeId::of::()) ); - // Test head_n with n = 0 - let empty_head_df = df.head_n(0); - assert_eq!(empty_head_df.rows(), 0); - assert_eq!(empty_head_df.cols(), 0); - assert!(empty_head_df.columns().is_empty()); - assert_eq!(empty_head_df.index(), &RowIndex::Range(0..0)); - - // Test with Int index - let int_index = RowIndex::Int(vec![100, 101, 102, 103, 104, 105, 106]); - let col_a_int = create_f64_typed_frame( - "A", - vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0], - Some(int_index.clone()), + // Verify the underlying frame + let sub_frame_box = df.frames_by_type.get(&TypeId::of::()).unwrap(); + let 
frame = sub_frame_box.as_any().downcast_ref::>().unwrap(); + assert_eq!(frame.rows(), 3); + assert_eq!(frame.cols(), 2); + assert_eq!( + frame.columns(), + &vec!["col_int1".to_string(), "col_int2".to_string()] ); - let col_b_int = create_i64_typed_frame( - "B", - vec![10, 20, 30, 40, 50, 60, 70], - Some(int_index.clone()), - ); - let df_int = DataFrame::new( - vec![col_a_int, col_b_int], - vec!["A".to_string(), "B".to_string()], - Some(int_index), - ); - let head_int_df = df_int.head_n(3); - assert_eq!(head_int_df.rows(), 3); - assert_eq!(head_int_df.index(), &RowIndex::Int(vec![100, 101, 102])); } #[test] - fn test_dataframe_tail_n() { - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0], None); - let col_b = create_i64_typed_frame("B", vec![10, 20, 30, 40, 50, 60, 70], None); - let df = DataFrame::new( - vec![col_a, col_b], - vec!["A".to_string(), "B".to_string()], - None, + fn test_dataframe_add_column_different_type() { + let mut df = DataFrame::new(); + df.add_column("col_int", vec![1, 2, 3]); + df.add_column("col_float", vec![1.1, 2.2, 3.3]); + df.add_column( + "col_string", + vec!["a".to_string(), "b".to_string(), "c".to_string()], ); - // Test tail_n with n < rows - let tail_df = df.tail_n(3); - assert_eq!(tail_df.rows(), 3); - assert_eq!(tail_df.cols(), 2); - assert_eq!(tail_df.columns(), &["A", "B"]); - assert_eq!(tail_df.index(), &RowIndex::Range(4..7)); - assert_eq!(tail_df.column("A").as_f64().unwrap(), &[5.0, 6.0, 7.0]); - assert_eq!(tail_df.column("B").as_i64().unwrap(), &[50, 60, 70]); - - // Test tail_n with n > rows - let tail_all_df = df.tail_n(10); - assert_eq!(tail_all_df.rows(), 7); - assert_eq!(tail_all_df.cols(), 2); - assert_eq!(tail_all_df.columns(), &["A", "B"]); - assert_eq!(tail_all_df.index(), &RowIndex::Range(0..7)); + assert_eq!(df.rows(), 3); + assert_eq!(df.cols(), 3); assert_eq!( - tail_all_df.column("A").as_f64().unwrap(), - &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0] + df.get_column_names(), + &vec![ + 
"col_int".to_string(), + "col_float".to_string(), + "col_string".to_string() + ] + ); + + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + + assert_eq!(df.column_to_type.get("col_int"), Some(&TypeId::of::())); + assert_eq!( + df.column_to_type.get("col_float"), + Some(&TypeId::of::()) ); assert_eq!( - tail_all_df.column("B").as_i64().unwrap(), - &[10, 20, 30, 40, 50, 60, 70] + df.column_to_type.get("col_string"), + Some(&TypeId::of::()) ); - // Test tail_n with n = 0 - let empty_tail_df = df.tail_n(0); - assert_eq!(empty_tail_df.rows(), 0); - assert_eq!(empty_tail_df.cols(), 0); - assert!(empty_tail_df.columns().is_empty()); - assert_eq!(empty_tail_df.index(), &RowIndex::Range(0..0)); + // Verify underlying frames + let int_frame = df + .frames_by_type + .get(&TypeId::of::()) + .unwrap() + .as_any() + .downcast_ref::>() + .unwrap(); + assert_eq!(int_frame.columns(), &vec!["col_int".to_string()]); - // Test with Int index - let int_index = RowIndex::Int(vec![100, 101, 102, 103, 104, 105, 106]); - let col_a_int = create_f64_typed_frame( - "A", - vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0], - Some(int_index.clone()), - ); - let col_b_int = create_i64_typed_frame( - "B", - vec![10, 20, 30, 40, 50, 60, 70], - Some(int_index.clone()), - ); - let df_int = DataFrame::new( - vec![col_a_int, col_b_int], - vec!["A".to_string(), "B".to_string()], - Some(int_index), - ); - let tail_int_df = df_int.tail_n(3); - assert_eq!(tail_int_df.rows(), 3); - assert_eq!(tail_int_df.index(), &RowIndex::Int(vec![104, 105, 106])); + let float_frame = df + .frames_by_type + .get(&TypeId::of::()) + .unwrap() + .as_any() + .downcast_ref::>() + .unwrap(); + assert_eq!(float_frame.columns(), &vec!["col_float".to_string()]); + + let string_frame = df + .frames_by_type + .get(&TypeId::of::()) + .unwrap() + .as_any() + .downcast_ref::>() + .unwrap(); + 
assert_eq!(string_frame.columns(), &vec!["col_string".to_string()]); } #[test] - fn test_dataframe_head() { - let col_a = create_f64_typed_frame( - "A", - vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], - None, + fn test_dataframe_get_column() { + let mut df = DataFrame::new(); + df.add_column("col_int", vec![1, 2, 3]); + df.add_column("col_float", vec![1.1, 2.2, 3.3]); + df.add_column( + "col_string", + vec!["a".to_string(), "b".to_string(), "c".to_string()], ); - let df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - let head_df = df.head(); - assert_eq!(head_df.rows(), DEFAULT_DISPLAY_ROWS); + // Test getting existing columns with correct type assert_eq!( - head_df.column("A").as_f64().unwrap(), - &[1.0, 2.0, 3.0, 4.0, 5.0] + df.get_column::("col_int").unwrap(), + vec![1, 2, 3].as_slice() ); - - // Test with fewer rows than DEFAULT_DISPLAY_ROWS - let col_b = create_f64_typed_frame("B", vec![1.0, 2.0], None); - let df_small = DataFrame::new(vec![col_b], vec!["B".to_string()], None); - let head_small_df = df_small.head(); - assert_eq!(head_small_df.rows(), 2); - assert_eq!(head_small_df.column("B").as_f64().unwrap(), &[1.0, 2.0]); - } - - #[test] - fn test_dataframe_tail() { - let col_a = create_f64_typed_frame( - "A", - vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], - None, - ); - let df = DataFrame::new(vec![col_a], vec!["A".to_string()], None); - - let tail_df = df.tail(); - assert_eq!(tail_df.rows(), DEFAULT_DISPLAY_ROWS); assert_eq!( - tail_df.column("A").as_f64().unwrap(), - &[6.0, 7.0, 8.0, 9.0, 10.0] + df.get_column::("col_float").unwrap(), + vec![1.1, 2.2, 3.3].as_slice() + ); + assert_eq!( + df.get_column::("col_string").unwrap(), + vec!["a".to_string(), "b".to_string(), "c".to_string()].as_slice() ); - // Test with fewer rows than DEFAULT_DISPLAY_ROWS - let col_b = create_f64_typed_frame("B", vec![1.0, 2.0], None); - let df_small = DataFrame::new(vec![col_b], vec!["B".to_string()], None); - let tail_small_df = 
df_small.tail(); - assert_eq!(tail_small_df.rows(), 2); - assert_eq!(tail_small_df.column("B").as_f64().unwrap(), &[1.0, 2.0]); + // Test getting non-existent column + assert_eq!(df.get_column::("non_existent"), None); + + // Test getting existing column with incorrect type + assert_eq!(df.get_column::("col_int"), None); + assert_eq!(df.get_column::("col_float"), None); } #[test] - fn test_dataframe_display_empty() { - let empty_df = DataFrame::new(vec![], vec![], None); - let expected_output = "\ -+-------------------+\n| Empty DataFrame |\n+-------------------+\nRows: 0, Columns: 0"; - assert_eq!(format!("{}", empty_df), expected_output); + fn test_dataframe_get_row() { + let mut df = DataFrame::new(); + df.add_column("col_int", vec![1, 2, 3]); + df.add_column("col_float", vec![1.1, 2.2, 3.3]); + df.add_column( + "col_string", + vec!["a".to_string(), "b".to_string(), "c".to_string()], + ); + + // Test getting an existing row + let row0 = df.get_row(0).unwrap(); + assert_eq!(row0.get("col_int"), Some(&"1".to_string())); + assert_eq!(row0.get("col_float"), Some(&"1.1".to_string())); + assert_eq!(row0.get("col_string"), Some(&"a".to_string())); + + let row1 = df.get_row(1).unwrap(); + assert_eq!(row1.get("col_int"), Some(&"2".to_string())); + assert_eq!(row1.get("col_float"), Some(&"2.2".to_string())); + assert_eq!(row1.get("col_string"), Some(&"b".to_string())); + + // Test getting an out-of-bounds row + assert_eq!(df.get_row(3), None); } #[test] - fn test_dataframe_display_basic() { - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0], None); - let col_b = create_i64_typed_frame("B", vec![10, 20, 30], None); - let col_c = create_string_typed_frame( - "C", - vec!["x".to_string(), "y".to_string(), "z".to_string()], - None, - ); - let df = DataFrame::new( - vec![col_a, col_b, col_c], - vec!["A".to_string(), "B".to_string(), "C".to_string()], - None, + #[should_panic(expected = "DataFrame::add_column: duplicate column name: 'col_int'")] + fn 
test_dataframe_add_column_duplicate_name() { + let mut df = DataFrame::new(); + df.add_column("col_int", vec![1, 2, 3]); + df.add_column("col_int", vec![4, 5, 6]); + } + + #[test] + #[should_panic( + expected = "DataFrame::add_column: new column 'col_int2' has 2 rows, but existing columns have 3 rows" + )] + fn test_dataframe_add_column_mismatched_rows() { + let mut df = DataFrame::new(); + df.add_column("col_int1", vec![1, 2, 3]); + df.add_column("col_int2", vec![4, 5]); + } + + #[test] + fn test_dataframe_display() { + let mut df = DataFrame::new(); + df.add_column("col_int", vec![1, 2, 3, 4, 5, 6]); + df.add_column("col_float", vec![1.1, 2.2, 3.3, 4.4, 5.5, 6.6]); + df.add_column( + "col_string", + vec![ + "a".to_string(), + "b".to_string(), + "c".to_string(), + "d".to_string(), + "e".to_string(), + "f".to_string(), + ], ); let expected_output = "\ -┌───────┬─────┬─────┬─────┐\n│ Index │ A │ B │ C │\n├───────┼─────┼─────┼─────┤\n│ 0 │ 1 │ 10 │ x │\n├───────┼─────┼─────┼─────┤\n│ 1 │ 2 │ 20 │ y │\n├───────┼─────┼─────┼─────┤\n│ 2 │ 3 │ 30 │ z │\n└───────┴─────┴─────┴─────┘\n[3 rows x 3 columns]"; +col_int col_float col_string +1 1.1 a +2 2.2 b +3 3.3 c +4 4.4 d +5 5.5 e +... 
+"; assert_eq!(format!("{}", df), expected_output); } #[test] - fn test_dataframe_display_truncation_rows() { - let col_a = create_f64_typed_frame("A", (1..=20).map(|i| i as f64).collect(), None); - let col_b = create_i64_typed_frame("B", (21..=40).collect(), None); - let df = DataFrame::new( - vec![col_a, col_b], - vec!["A".to_string(), "B".to_string()], - None, - ); + fn test_dataframe_debug() { + let mut df = DataFrame::new(); + df.add_column("col_int", vec![1, 2, 3]); + df.add_column("col_float", vec![1.1, 2.2, 3.3]); - let expected_output = "\ -┌───────┬─────┬─────┐\n│ Index │ A │ B │\n├───────┼─────┼─────┤\n│ 0 │ 1 │ 21 │\n├───────┼─────┼─────┤\n│ 1 │ 2 │ 22 │\n├───────┼─────┼─────┤\n│ 2 │ 3 │ 23 │\n├───────┼─────┼─────┤\n│ 3 │ 4 │ 24 │\n├───────┼─────┼─────┤\n│ 4 │ 5 │ 25 │\n└───────┴─────┴─────┘\n[5 rows x 2 columns]"; - assert_eq!(format!("{}", df.head()), expected_output); + let debug_output = format!("{:?}", df); + assert!(debug_output.contains("DataFrame {")); + assert!(debug_output.contains("column_names: [\"col_int\", \"col_float\"]")); + assert!(debug_output.contains("index: Range(0..3)")); + assert!(debug_output.contains("column_to_type: {")); + assert!(debug_output.contains("frames_by_type: {")); } #[test] - fn test_dataframe_display_truncation_cols() { - let mut cols_data = Vec::new(); - let mut col_names = Vec::new(); - for i in 0..15 { - // 15 columns, more than DEFAULT_DISPLAY_COLS - cols_data.push((1..=3).map(|r| (i * 10 + r) as f64).collect()); - col_names.push(format!("Col{}", i)); - } - let typed_frame = create_multi_f64_typed_frame( - col_names.iter().map(|s| s.as_str()).collect(), - cols_data, - None, - ); - let df = DataFrame::new(vec![typed_frame], col_names, None); + fn test_dataframe_drop_column_single_type() { + let mut df = DataFrame::new(); + df.add_column("col_int1", vec![1, 2, 3]); + df.add_column("col_int2", vec![4, 5, 6]); + df.add_column("col_float", vec![1.1, 2.2, 3.3]); - // Only the first DEFAULT_DISPLAY_COLS columns should 
be shown in the output - let expected_output = "\ -┌───────┬──────┬──────┬──────┬──────┬──────┬─────┬───────┬───────┬───────┬───────┬───────┐\n│ Index │ Col0 │ Col1 │ Col2 │ Col3 │ Col4 │ ... │ Col10 │ Col11 │ Col12 │ Col13 │ Col14 │\n├───────┼──────┼──────┼──────┼──────┼──────┼─────┼───────┼───────┼───────┼───────┼───────┤\n│ 0 │ 1 │ 11 │ 21 │ 31 │ 41 │ ... │ 101 │ 111 │ 121 │ 131 │ 141 │\n├───────┼──────┼──────┼──────┼──────┼──────┼─────┼───────┼───────┼───────┼───────┼───────┤\n│ 1 │ 2 │ 12 │ 22 │ 32 │ 42 │ ... │ 102 │ 112 │ 122 │ 132 │ 142 │\n├───────┼──────┼──────┼──────┼──────┼──────┼─────┼───────┼───────┼───────┼───────┼───────┤\n│ 2 │ 3 │ 13 │ 23 │ 33 │ 43 │ ... │ 103 │ 113 │ 123 │ 133 │ 143 │\n└───────┴──────┴──────┴──────┴──────┴──────┴─────┴───────┴───────┴───────┴───────┴───────┘\n[3 rows x 15 columns]"; - assert_eq!(format!("{}", df.head()), expected_output); + assert_eq!(df.cols(), 3); + assert_eq!( + df.get_column_names(), + &vec![ + "col_int1".to_string(), + "col_int2".to_string(), + "col_float".to_string() + ] + ); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + + df.drop_column("col_int1"); + + assert_eq!(df.cols(), 2); + assert_eq!( + df.get_column_names(), + &vec!["col_int2".to_string(), "col_float".to_string()] + ); + assert!(df.column_to_type.get("col_int1").is_none()); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); // Frame should still exist + let int_frame = df + .frames_by_type + .get(&TypeId::of::()) + .unwrap() + .as_any() + .downcast_ref::>() + .unwrap(); + assert_eq!(int_frame.columns(), &vec!["col_int2".to_string()]); + + df.drop_column("col_int2"); + + assert_eq!(df.cols(), 1); + assert_eq!(df.get_column_names(), &vec!["col_float".to_string()]); + assert!(df.column_to_type.get("col_int2").is_none()); + assert!(!df.frames_by_type.contains_key(&TypeId::of::())); // Frame should be removed + assert!(df.frames_by_type.contains_key(&TypeId::of::())); } 
#[test] - fn test_dataframe_display_truncation_both() { - let mut cols_data = Vec::new(); - let mut col_names = Vec::new(); - for i in 0..15 { - // 15 columns - cols_data.push((1..=10).map(|r| (i * 10 + r) as f64).collect()); - col_names.push(format!("Col{}", i)); - } - let typed_frame = create_multi_f64_typed_frame( - col_names.iter().map(|s| s.as_str()).collect(), - cols_data, - None, + fn test_dataframe_drop_column_mixed_types() { + let mut df = DataFrame::new(); + df.add_column("col_int", vec![1, 2, 3]); + df.add_column("col_float", vec![1.1, 2.2, 3.3]); + df.add_column( + "col_string", + vec!["a".to_string(), "b".to_string(), "c".to_string()], ); - let df = DataFrame::new(vec![typed_frame], col_names, None); - // Only the first DEFAULT_DISPLAY_ROWS rows and DEFAULT_DISPLAY_COLS columns should be shown - let expected_output = "\ -┌───────┬──────┬──────┬──────┬──────┬──────┬─────┬───────┬───────┬───────┬───────┬───────┐\n│ Index │ Col0 │ Col1 │ Col2 │ Col3 │ Col4 │ ... │ Col10 │ Col11 │ Col12 │ Col13 │ Col14 │\n├───────┼──────┼──────┼──────┼──────┼──────┼─────┼───────┼───────┼───────┼───────┼───────┤\n│ 0 │ 1 │ 11 │ 21 │ 31 │ 41 │ ... │ 101 │ 111 │ 121 │ 131 │ 141 │\n├───────┼──────┼──────┼──────┼──────┼──────┼─────┼───────┼───────┼───────┼───────┼───────┤\n│ 1 │ 2 │ 12 │ 22 │ 32 │ 42 │ ... │ 102 │ 112 │ 122 │ 132 │ 142 │\n├───────┼──────┼──────┼──────┼──────┼──────┼─────┼───────┼───────┼───────┼───────┼───────┤\n│ 2 │ 3 │ 13 │ 23 │ 33 │ 43 │ ... │ 103 │ 113 │ 123 │ 133 │ 143 │\n├───────┼──────┼──────┼──────┼──────┼──────┼─────┼───────┼───────┼───────┼───────┼───────┤\n│ 3 │ 4 │ 14 │ 24 │ 34 │ 44 │ ... │ 104 │ 114 │ 124 │ 134 │ 144 │\n├───────┼──────┼──────┼──────┼──────┼──────┼─────┼───────┼───────┼───────┼───────┼───────┤\n│ 4 │ 5 │ 15 │ 25 │ 35 │ 45 │ ... 
│ 105 │ 115 │ 125 │ 135 │ 145 │\n└───────┴──────┴──────┴──────┴──────┴──────┴─────┴───────┴───────┴───────┴───────┴───────┘\n[5 rows x 15 columns]"; - assert_eq!(format!("{}", df.head()), expected_output); + assert_eq!(df.cols(), 3); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + + df.drop_column("col_float"); + + assert_eq!(df.cols(), 2); + assert_eq!( + df.get_column_names(), + &vec!["col_int".to_string(), "col_string".to_string()] + ); + assert!(df.column_to_type.get("col_float").is_none()); + assert!(!df.frames_by_type.contains_key(&TypeId::of::())); // Frame should be removed + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + + df.drop_column("col_int"); + df.drop_column("col_string"); + + assert_eq!(df.cols(), 0); + assert!(df.get_column_names().is_empty()); + assert!(df.frames_by_type.is_empty()); + assert!(df.column_to_type.is_empty()); + } + + #[test] + #[should_panic(expected = "DataFrame::drop_column: column 'non_existent' not found")] + fn test_dataframe_drop_column_non_existent() { + let mut df = DataFrame::new(); + df.add_column("col_int", vec![1, 2, 3]); + df.drop_column("non_existent"); + } + + #[test] + fn test_dataframe_add_column_reuses_existing_frame() { + let mut df = DataFrame::new(); + df.add_column("col_int1", vec![1, 2, 3]); + df.add_column("col_float1", vec![1.1, 2.2, 3.3]); + + // Initially, there should be two frames (one for i32, one for f64) + assert_eq!(df.frames_by_type.len(), 2); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + + // Add another integer column + df.add_column("col_int2", vec![4, 5, 6]); + + // The number of frames should still be 2, as the existing i32 frame should be reused + assert_eq!(df.frames_by_type.len(), 2); + 
assert!(df.frames_by_type.contains_key(&TypeId::of::())); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + + // Verify the i32 frame now contains both integer columns + let int_frame = df.frames_by_type.get(&TypeId::of::()).unwrap().as_any().downcast_ref::>().unwrap(); + assert_eq!(int_frame.columns(), &vec!["col_int1".to_string(), "col_int2".to_string()]); + assert_eq!(int_frame.cols(), 2); + + // Add another float column + df.add_column("col_float2", vec![4.4, 5.5, 6.6]); + + // The number of frames should still be 2, as the existing f64 frame should be reused + assert_eq!(df.frames_by_type.len(), 2); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + + // Verify the f64 frame now contains both float columns + let float_frame = df.frames_by_type.get(&TypeId::of::()).unwrap().as_any().downcast_ref::>().unwrap(); + assert_eq!(float_frame.columns(), &vec!["col_float1".to_string(), "col_float2".to_string()]); + assert_eq!(float_frame.cols(), 2); } } From 96934cd89f51f2ba99e24e44a3abdf2518a0f1e4 Mon Sep 17 00:00:00 2001 From: Palash Tyagi <23239946+Magnus167@users.noreply.github.com> Date: Fri, 4 Jul 2025 00:45:45 +0100 Subject: [PATCH 17/19] update DataFrame module exports --- src/dataframe/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dataframe/mod.rs b/src/dataframe/mod.rs index 3dd102e..c477580 100644 --- a/src/dataframe/mod.rs +++ b/src/dataframe/mod.rs @@ -1,4 +1,4 @@ //! This module provides the DataFrame structure for handling tabular data with mixed types. 
pub mod df; -pub use df::{DataFrame, DataFrameColumn, TypedFrame}; +pub use df::{DataFrame, SubFrame}; From 26ee58071015e3a686a24602f4a5b0d2f6e95092 Mon Sep 17 00:00:00 2001 From: Palash Tyagi <23239946+Magnus167@users.noreply.github.com> Date: Fri, 4 Jul 2025 00:46:12 +0100 Subject: [PATCH 18/19] Refactor README: update DataFrame usage example --- README.md | 158 ++++++++++++++++++++++++++---------------------------- 1 file changed, 77 insertions(+), 81 deletions(-) diff --git a/README.md b/README.md index 1ddaa75..5aa325a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,3 @@ - # Rustframe rustframe @@ -6,12 +5,13 @@ 📚 [Docs](https://magnus167.github.io/rustframe/) | 🐙 [GitHub](https://github.com/Magnus167/rustframe) | 🌐 [Gitea mirror](https://gitea.nulltech.uk/Magnus167/rustframe) | 🦀 [Crates.io](https://crates.io/crates/rustframe) | 🔖 [docs.rs](https://docs.rs/rustframe/latest/rustframe/) + [![codecov](https://codecov.io/gh/Magnus167/rustframe/graph/badge.svg?token=J7ULJEFTVI)](https://codecov.io/gh/Magnus167/rustframe) [![Coverage](https://img.shields.io/endpoint?url=https://magnus167.github.io/rustframe/docs/tarpaulin-badge.json)](https://magnus167.github.io/rustframe/docs/tarpaulin-report.html) --- -## Rustframe: *A lightweight dataframe & math toolkit for Rust* +## Rustframe: _A lightweight dataframe & math toolkit for Rust_ Rustframe provides intuitive dataframe, matrix, and series operations small-to-mid scale data analysis and manipulation. 
@@ -104,95 +104,91 @@ assert!(check); ## DataFrame Usage Example ```rust -use rustframe::{ - dataframe::{DataFrame, TypedFrame, DataFrameColumn}, - frame::{Frame, RowIndex}, - matrix::Matrix, -}; +use rustframe::dataframe::DataFrame; +use chrono::NaiveDate; +use std::collections::HashMap; +use std::any::TypeId; // Required for checking TypeId -// Helper to create a simple f64 TypedFrame (similar to test helpers) -fn create_f64_typed_frame(name: &str, data: Vec, index: Option) -> TypedFrame { - let rows = data.len(); - let matrix = Matrix::from_cols(vec![data]); - let frame_index = index.unwrap_or(RowIndex::Range(0..rows)); - TypedFrame::F64(Frame::new( - matrix, - vec![name.to_string()], - Some(frame_index), - )) +// Helper for NaiveDate +fn d(y: i32, m: u32, d: u32) -> NaiveDate { + NaiveDate::from_ymd_opt(y, m, d).unwrap() } -// Helper to create a simple i64 TypedFrame -fn create_i64_typed_frame(name: &str, data: Vec, index: Option) -> TypedFrame { - let rows = data.len(); - let matrix = Matrix::from_cols(vec![data]); - let frame_index = index.unwrap_or(RowIndex::Range(0..rows)); - TypedFrame::I64(Frame::new( - matrix, - vec![name.to_string()], - Some(frame_index), - )) -} +// Create a new DataFrame +let mut df = DataFrame::new(); -// Helper to create a simple String TypedFrame -fn create_string_typed_frame( - name: &str, - data: Vec, - index: Option, -) -> TypedFrame { - let rows = data.len(); - let matrix = Matrix::from_cols(vec![data]); - let frame_index = index.unwrap_or(RowIndex::Range(0..rows)); - TypedFrame::String(Frame::new( - matrix, - vec![name.to_string()], - Some(frame_index), - )) -} +// Add columns of different types +df.add_column("col_int1", vec![1, 2, 3, 4, 5]); +df.add_column("col_float1", vec![1.1, 2.2, 3.3, 4.4, 5.5]); +df.add_column("col_string", vec!["apple".to_string(), "banana".to_string(), "cherry".to_string(), "date".to_string(), "elderberry".to_string()]); +df.add_column("col_bool", vec![true, false, true, false, true]); 
+df.add_column("col_date", vec![d(2023,1,1), d(2023,1,2), d(2023,1,3), d(2023,1,4), d(2023,1,5)]); -fn main() { - // 1. Create a DataFrame with different data types - let col_a = create_f64_typed_frame("A", vec![1.0, 2.0, 3.0], None); - let col_b = create_i64_typed_frame("B", vec![10, 20, 30], None); - let col_c = create_string_typed_frame( - "C", - vec!["apple".to_string(), "banana".to_string(), "cherry".to_string()], - None, - ); +println!("DataFrame after initial column additions:\n{}", df); - let mut df = DataFrame::new( - vec![col_a, col_b, col_c], - vec!["A".to_string(), "B".to_string(), "C".to_string()], - None, - ); +// Demonstrate frame re-use when adding columns of existing types +let initial_frames_count = df.num_internal_frames(); +println!("\nInitial number of internal frames: {}", initial_frames_count); - println!("Initial DataFrame:\n{:?}", df); - println!("Columns: {:?}", df.columns()); - println!("Rows: {}", df.rows()); +df.add_column("col_int2", vec![6, 7, 8, 9, 10]); +df.add_column("col_float2", vec![6.6, 7.7, 8.8, 9.9, 10.0]); - // 2. Accessing columns - if let DataFrameColumn::F64(col_a_data) = df.column("A") { - println!("Column 'A' (f64): {:?}", col_a_data); - } +let frames_after_reuse = df.num_internal_frames(); +println!("Number of internal frames after adding more columns of existing types: {}", frames_after_reuse); +assert_eq!(initial_frames_count, frames_after_reuse); // Should be equal, demonstrating re-use - if let DataFrameColumn::String(col_c_data) = df.column("C") { - println!("Column 'C' (String): {:?}", col_c_data); - } +println!("\nDataFrame after adding more columns of existing types:\n{}", df); - // 3. Add a new column - let new_col_d = create_f64_typed_frame("D", vec![100.0, 200.0, 300.0], None); - df.add_column("D".to_string(), new_col_d); - println!("\nDataFrame after adding column 'D':\n{:?}", df); - println!("Columns after add: {:?}", df.columns()); - // 4. 
Rename a column - df.rename_column("A", "Alpha".to_string()); - println!("\nDataFrame after renaming 'A' to 'Alpha':\n{:?}", df); - println!("Columns after rename: {:?}", df.columns()); +// Get number of rows and columns +println!("Rows: {}", df.rows()); // Output: Rows: 5 +println!("Columns: {}", df.cols()); // Output: Columns: 7 - // 5. Delete a column - let _deleted_col_b = df.delete_column("B"); - println!("\nDataFrame after deleting column 'B':\n{:?}", df); - println!("Columns after delete: {:?}", df.columns()); -} +// Get column names +println!("Column names: {:?}", df.get_column_names()); +// Output: Column names: ["col_int1", "col_float1", "col_string", "col_bool", "col_date", "col_int2", "col_float2"] + +// Get a specific column by name and type +let int_col = df.get_column::("col_int1").unwrap(); +println!("Integer column (col_int1): {:?}", int_col); // Output: Integer column: [1, 2, 3, 4, 5] + +let int_col2 = df.get_column::("col_int2").unwrap(); +println!("Integer column (col_int2): {:?}", int_col2); // Output: Integer column: [6, 7, 8, 9, 10] + +let float_col = df.get_column::("col_float1").unwrap(); +println!("Float column (col_float1): {:?}", float_col); // Output: Float column: [1.1, 2.2, 3.3, 4.4, 5.5] + +// Attempt to get a column with incorrect type (returns None) +let wrong_type_col = df.get_column::("col_int1"); +println!("Wrong type column: {:?}", wrong_type_col); // Output: Wrong type column: None + +// Get a row by index +let row_0 = df.get_row(0).unwrap(); +println!("Row 0: {:?}", row_0); +// Output: Row 0: {"col_int1": "1", "col_float1": "1.1", "col_string": "apple", "col_bool": "true", "col_date": "2023-01-01", "col_int2": "6", "col_float2": "6.6"} + +let row_2 = df.get_row(2).unwrap(); +println!("Row 2: {:?}", row_2); +// Output: Row 2: {"col_int1": "3", "col_float1": "3.3", "col_string": "cherry", "col_bool": "true", "col_date": "2023-01-03", "col_int2": "8", "col_float2": "8.8"} + +// Attempt to get an out-of-bounds row (returns None) +let row_out_of_bounds = 
df.get_row(10); +println!("Row out of bounds: {:?}", row_out_of_bounds); // Output: Row out of bounds: None + +// Drop a column +df.drop_column("col_bool"); +println!("\nDataFrame after dropping 'col_bool':\n{}", df); + +println!("Columns after drop: {}", df.cols()); +println!("Column names after drop: {:?}", df.get_column_names()); + +// Drop another column, ensuring the underlying Frame is removed if empty +df.drop_column("col_float1"); +println!("\nDataFrame after dropping 'col_float1':\n{}", df); + +println!("Columns after second drop: {}", df.cols()); +println!("Column names after second drop: {:?}", df.get_column_names()); + +// Attempt to drop a non-existent column (will panic) +// df.drop_column("non_existent_col"); // Uncomment to see panic ``` From 7d7794627be1eef7edc5152299ca3fe05d0300f7 Mon Sep 17 00:00:00 2001 From: Palash Tyagi <23239946+Magnus167@users.noreply.github.com> Date: Fri, 4 Jul 2025 20:15:47 +0100 Subject: [PATCH 19/19] Refactor DataFrame usage example in README.md for clarity and consistency --- README.md | 62 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index f8fc1c9..fab8c73 100644 --- a/README.md +++ b/README.md @@ -98,15 +98,17 @@ assert!(check); ``` + --- ## DataFrame Usage Example ```rust -use rustframe::dataframe::DataFrame; use chrono::NaiveDate; +use rustframe::dataframe::DataFrame; +use rustframe::utils::{BDateFreq, BDatesList}; +use std::any::TypeId; use std::collections::HashMap; -use std::any::TypeId; // Required for checking TypeId // Helper for NaiveDate fn d(y: i32, m: u32, d: u32) -> NaiveDate { @@ -119,25 +121,49 @@ let mut df = DataFrame::new(); // Add columns of different types df.add_column("col_int1", vec![1, 2, 3, 4, 5]); df.add_column("col_float1", vec![1.1, 2.2, 3.3, 4.4, 5.5]); -df.add_column("col_string", vec!["apple".to_string(), "banana".to_string(), "cherry".to_string(), "date".to_string(), 
"elderberry".to_string()]); +df.add_column( + "col_string", + vec![ + "apple".to_string(), + "banana".to_string(), + "cherry".to_string(), + "date".to_string(), + "elderberry".to_string(), + ], +); df.add_column("col_bool", vec![true, false, true, false, true]); -df.add_column("col_date", vec![d(2023,1,1), d(2023,1,2), d(2023,1,3), d(2023,1,4), d(2023,1,5)]); +// df.add_column("col_date", vec![d(2023,1,1), d(2023,1,2), d(2023,1,3), d(2023,1,4), d(2023,1,5)]); +df.add_column( + "col_date", + BDatesList::from_n_periods("2023-01-01".to_string(), BDateFreq::Daily, 5) + .unwrap() + .list() + .unwrap(), +); println!("DataFrame after initial column additions:\n{}", df); // Demonstrate frame re-use when adding columns of existing types let initial_frames_count = df.num_internal_frames(); -println!("\nInitial number of internal frames: {}", initial_frames_count); +println!( + "\nInitial number of internal frames: {}", + initial_frames_count +); df.add_column("col_int2", vec![6, 7, 8, 9, 10]); df.add_column("col_float2", vec![6.6, 7.7, 8.8, 9.9, 10.0]); let frames_after_reuse = df.num_internal_frames(); -println!("Number of internal frames after adding more columns of existing types: {}", frames_after_reuse); +println!( + "Number of internal frames after adding more columns of existing types: {}", + frames_after_reuse +); assert_eq!(initial_frames_count, frames_after_reuse); // Should be equal, demonstrating re-use -println!("\nDataFrame after adding more columns of existing types:\n{}", df); - +println!( + "\nDataFrame after adding more columns of existing types:\n{}", + df +); // Get number of rows and columns println!("Rows: {}", df.rows()); // Output: Rows: 5 @@ -149,17 +175,21 @@ println!("Column names: {:?}", df.get_column_names()); // Get a specific column by name and type let int_col = df.get_column::("col_int1").unwrap(); -println!("Integer column (col_int1): {:?}", int_col); // Output: Integer column: [1, 2, 3, 4, 5] +// Output: Integer column: [1, 2, 3, 4, 5] 
+println!("Integer column (col_int1): {:?}", int_col); let int_col2 = df.get_column::("col_int2").unwrap(); -println!("Integer column (col_int2): {:?}", int_col2); // Output: Integer column: [6, 7, 8, 9, 10] +// Output: Integer column: [6, 7, 8, 9, 10] +println!("Integer column (col_int2): {:?}", int_col2); let float_col = df.get_column::("col_float1").unwrap(); -println!("Float column (col_float1): {:?}", float_col); // Output: Float column: [1.1, 2.2, 3.3, 4.4, 5.5] +// Output: Float column: [1.1, 2.2, 3.3, 4.4, 5.5] +println!("Float column (col_float1): {:?}", float_col); // Attempt to get a column with incorrect type (returns None) let wrong_type_col = df.get_column::("col_int1"); -println!("Wrong type column: {:?}", wrong_type_col); // Output: Wrong type column: None +// Output: Wrong type column: None +println!("Wrong type column: {:?}", wrong_type_col); // Get a row by index let row_0 = df.get_row(0).unwrap(); @@ -172,7 +202,8 @@ println!("Row 2: {:?}", row_2); // Attempt to get an out-of-bounds row (returns None) let row_out_of_bounds = df.get_row(10); -println!("Row out of bounds: {:?}", row_out_of_bounds); // Output: Row out of bounds: None +// Output: Row out of bounds: None +println!("Row out of bounds: {:?}", row_out_of_bounds); // Drop a column df.drop_column("col_bool"); @@ -186,7 +217,10 @@ df.drop_column("col_float1"); println!("\nDataFrame after dropping 'col_float1':\n{}", df); println!("Columns after second drop: {}", df.cols()); -println!("Column names after second drop: {:?}", df.get_column_names()); +println!( + "Column names after second drop: {:?}", + df.get_column_names() +); // Attempt to drop a non-existent column (will panic) // df.drop_column("non_existent_col"); // Uncomment to see panic