diff --git a/README.md b/README.md index c0bc452..5342399 100644 --- a/README.md +++ b/README.md @@ -148,6 +148,133 @@ let zipped_matrix = a.zip(&b, |x, y| x + y); assert_eq!(zipped_matrix.data(), &[6.0, 8.0, 10.0, 12.0]); ``` +--- + +## DataFrame Usage Example + +```rust +use chrono::NaiveDate; +use rustframe::dataframe::DataFrame; +use rustframe::utils::{BDateFreq, BDatesList}; +use std::any::TypeId; +use std::collections::HashMap; + +// Helper for NaiveDate +fn d(y: i32, m: u32, d: u32) -> NaiveDate { + NaiveDate::from_ymd_opt(y, m, d).unwrap() +} + +// Create a new DataFrame +let mut df = DataFrame::new(); + +// Add columns of different types +df.add_column("col_int1", vec![1, 2, 3, 4, 5]); +df.add_column("col_float1", vec![1.1, 2.2, 3.3, 4.4, 5.5]); +df.add_column( + "col_string", + vec![ + "apple".to_string(), + "banana".to_string(), + "cherry".to_string(), + "date".to_string(), + "elderberry".to_string(), + ], +); +df.add_column("col_bool", vec![true, false, true, false, true]); +// df.add_column("col_date", vec![d(2023,1,1), d(2023,1,2), d(2023,1,3), d(2023,1,4), d(2023,1,5)]); +df.add_column( + "col_date", + BDatesList::from_n_periods("2023-01-01".to_string(), BDateFreq::Daily, 5) + .unwrap() + .list() + .unwrap(), +); + +println!("DataFrame after initial column additions:\n{}", df); + +// Demonstrate frame re-use when adding columns of existing types +let initial_frames_count = df.num_internal_frames(); +println!( + "\nInitial number of internal frames: {}", + initial_frames_count +); + +df.add_column("col_int2", vec![6, 7, 8, 9, 10]); +df.add_column("col_float2", vec![6.6, 7.7, 8.8, 9.9, 10.0]); + +let frames_after_reuse = df.num_internal_frames(); +println!( + "Number of internal frames after adding more columns of existing types: {}", + frames_after_reuse +); +assert_eq!(initial_frames_count, frames_after_reuse); // Should be equal, demonstrating re-use + +println!( + "\nDataFrame after adding more columns of existing types:\n{}", + df +); + +// Get 
number of rows and columns +println!("Rows: {}", df.rows()); // Output: Rows: 5 +println!("Columns: {}", df.cols()); // Output: Columns: 7 + +// Get column names +println!("Column names: {:?}", df.get_column_names()); +// Output: Column names: ["col_int1", "col_float1", "col_string", "col_bool", "col_date", "col_int2", "col_float2"] + +// Get a specific column by name and type +let int_col = df.get_column::<i32>("col_int1").unwrap(); +// Output: Integer column (col_int1): [1, 2, 3, 4, 5] +println!("Integer column (col_int1): {:?}", int_col); + +let int_col2 = df.get_column::<i32>("col_int2").unwrap(); +// Output: Integer column (col_int2): [6, 7, 8, 9, 10] +println!("Integer column (col_int2): {:?}", int_col2); + +let float_col = df.get_column::<f64>("col_float1").unwrap(); +// Output: Float column (col_float1): [1.1, 2.2, 3.3, 4.4, 5.5] +println!("Float column (col_float1): {:?}", float_col); + +// Attempt to get a column with incorrect type (returns None) +let wrong_type_col = df.get_column::<f64>("col_int1"); +// Output: Wrong type column: None +println!("Wrong type column: {:?}", wrong_type_col); + +// Get a row by index +let row_0 = df.get_row(0).unwrap(); +println!("Row 0: {:?}", row_0); +// Output (HashMap key order may vary): Row 0: {"col_int1": "1", "col_float1": "1.1", "col_string": "apple", "col_bool": "true", "col_date": "2023-01-01", "col_int2": "6", "col_float2": "6.6"} + +let row_2 = df.get_row(2).unwrap(); +println!("Row 2: {:?}", row_2); +// Output (HashMap key order may vary): Row 2: {"col_int1": "3", "col_float1": "3.3", "col_string": "cherry", "col_bool": "true", "col_date": "2023-01-03", "col_int2": "8", "col_float2": "8.8"} + +// Attempt to get an out-of-bounds row (returns None) +let row_out_of_bounds = df.get_row(10); +// Output: Row out of bounds: None +println!("Row out of bounds: {:?}", row_out_of_bounds); + +// Drop a column +df.drop_column("col_bool"); +println!("\nDataFrame after dropping 'col_bool':\n{}", df); + +println!("Columns after drop: {}", df.cols()); +println!("Column names after drop: {:?}", df.get_column_names()); + +// Drop another column, ensuring the 
underlying Frame is removed if empty +df.drop_column("col_float1"); +println!("\nDataFrame after dropping 'col_float1':\n{}", df); + +println!("Columns after second drop: {}", df.cols()); +println!( + "Column names after second drop: {:?}", + df.get_column_names() +); + +// Attempt to drop a non-existent column (will panic) +// df.drop_column("non_existent_col"); // Uncomment to see panic +``` + ### More examples See the [examples](./examples/) directory for some demonstrations of Rustframe's syntax and functionality. diff --git a/src/dataframe/df.rs b/src/dataframe/df.rs new file mode 100644 index 0000000..23cd0c8 --- /dev/null +++ b/src/dataframe/df.rs @@ -0,0 +1,659 @@ +use crate::frame::{Frame, RowIndex}; +use std::any::{Any, TypeId}; +use std::collections::HashMap; +use std::fmt; // Import TypeId + +const DEFAULT_DISPLAY_ROWS: usize = 5; +const DEFAULT_DISPLAY_COLS: usize = 10; + +// Trait to enable type-agnostic operations on Frame objects within DataFrame +pub trait SubFrame: Send + Sync + fmt::Debug + Any { + fn rows(&self) -> usize; + fn get_value_as_string(&self, physical_row_idx: usize, col_name: &str) -> String; + fn clone_box(&self) -> Box; + fn delete_column_from_frame(&mut self, col_name: &str); + fn get_frame_cols(&self) -> usize; // Add a method to get the number of columns in the underlying frame + + // Methods for downcasting to concrete types + fn as_any(&self) -> &dyn Any; + fn as_any_mut(&mut self) -> &mut dyn Any; +} + +// Implement SubFrame for any Frame that meets the requirements +impl SubFrame for Frame +where + T: Clone + PartialEq + fmt::Display + fmt::Debug + 'static + Send + Sync + Any, +{ + fn rows(&self) -> usize { + self.rows() + } + + fn get_value_as_string(&self, physical_row_idx: usize, col_name: &str) -> String { + self.get_row(physical_row_idx).get(col_name).to_string() + } + + fn clone_box(&self) -> Box { + Box::new(self.clone()) + } + + fn delete_column_from_frame(&mut self, col_name: &str) { + self.delete_column(col_name); 
+ } + + fn get_frame_cols(&self) -> usize { + self.cols() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } +} + +pub struct DataFrame { + frames_by_type: HashMap>, // Maps TypeId to the Frame holding columns of that type + column_to_type: HashMap, // Maps column name to its TypeId + column_names: Vec, + index: RowIndex, +} + +impl DataFrame { + pub fn new() -> Self { + DataFrame { + frames_by_type: HashMap::new(), + column_to_type: HashMap::new(), + column_names: Vec::new(), + index: RowIndex::Range(0..0), // Initialize with an empty range index + } + } + + /// Returns the number of rows in the DataFrame. + pub fn rows(&self) -> usize { + self.index.len() + } + + /// Returns the number of columns in the DataFrame. + pub fn cols(&self) -> usize { + self.column_names.len() + } + + /// Returns a reference to the vector of column names. + pub fn get_column_names(&self) -> &Vec { + &self.column_names + } + + /// Returns the number of internal Frame objects (one per unique data type). + pub fn num_internal_frames(&self) -> usize { + self.frames_by_type.len() + } + + /// Returns a reference to a column of a specific type, if it exists. + pub fn get_column(&self, col_name: &str) -> Option<&[T]> + where + T: Clone + PartialEq + fmt::Display + fmt::Debug + 'static + Send + Sync + Any, + { + let expected_type_id = TypeId::of::(); + if let Some(actual_type_id) = self.column_to_type.get(col_name) { + if *actual_type_id == expected_type_id { + if let Some(sub_frame_box) = self.frames_by_type.get(actual_type_id) { + if let Some(frame) = sub_frame_box.as_any().downcast_ref::>() { + return Some(frame.column(col_name)); + } + } + } + } + None + } + + /// Returns a HashMap representing a row, mapping column names to their string values. 
+ pub fn get_row(&self, row_idx: usize) -> Option> { + if row_idx >= self.rows() { + return None; + } + + let mut row_data = HashMap::new(); + for col_name in &self.column_names { + if let Some(type_id) = self.column_to_type.get(col_name) { + if let Some(sub_frame_box) = self.frames_by_type.get(type_id) { + let value = sub_frame_box.get_value_as_string(row_idx, col_name); + row_data.insert(col_name.clone(), value); + } + } + } + Some(row_data) + } + + pub fn add_column(&mut self, col_name: &str, data: Vec) + where + T: Clone + PartialEq + fmt::Display + fmt::Debug + 'static + Send + Sync + Any, + { + let type_id = TypeId::of::(); + let col_name_string = col_name.to_string(); + + // Check for duplicate column name across the entire DataFrame + if self.column_to_type.contains_key(&col_name_string) { + panic!( + "DataFrame::add_column: duplicate column name: '{}'", + col_name_string + ); + } + + // If this is the first column being added, set the DataFrame's index + if self.column_names.is_empty() { + self.index = RowIndex::Range(0..data.len()); + } else { + // Ensure new column has the same number of rows as existing columns + if data.len() != self.index.len() { + panic!( + "DataFrame::add_column: new column '{}' has {} rows, but existing columns have {} rows", + col_name_string, + data.len(), + self.index.len() + ); + } + } + + // Check if a Frame of this type already exists + if let Some(sub_frame_box) = self.frames_by_type.get_mut(&type_id) { + // Downcast to the concrete Frame and add the column + if let Some(frame) = sub_frame_box.as_any_mut().downcast_mut::>() { + frame.add_column(col_name_string.clone(), data); + } else { + // This should ideally not happen if TypeId matches, but good for safety + panic!( + "Type mismatch when downcasting existing SubFrame for TypeId {:?}", + type_id + ); + } + } else { + // No Frame of this type exists, create a new one + // The Frame::new constructor expects a Matrix and column names. 
+ // We create a Matrix from a single column vector. + let new_frame = Frame::new( + crate::matrix::Matrix::from_cols(vec![data]), + vec![col_name_string.clone()], + Some(self.index.clone()), // Pass the DataFrame's index to the new Frame + ); + self.frames_by_type.insert(type_id, Box::new(new_frame)); + } + + // Update column mappings and names + self.column_to_type.insert(col_name_string.clone(), type_id); + self.column_names.push(col_name_string); + } + + /// Drops a column from the DataFrame. + /// Panics if the column does not exist. + pub fn drop_column(&mut self, col_name: &str) { + let col_name_string = col_name.to_string(); + + // 1. Get the TypeId associated with the column + let type_id = self + .column_to_type + .remove(&col_name_string) + .unwrap_or_else(|| { + panic!( + "DataFrame::drop_column: column '{}' not found", + col_name_string + ); + }); + + // 2. Remove the column name from the ordered list + self.column_names.retain(|name| name != &col_name_string); + + // 3. Find the Frame object and delete the column from it + if let Some(sub_frame_box) = self.frames_by_type.get_mut(&type_id) { + sub_frame_box.delete_column_from_frame(&col_name_string); + + // 4. 
If the Frame object for this type becomes empty, remove it from frames_by_type + if sub_frame_box.get_frame_cols() == 0 { + self.frames_by_type.remove(&type_id); + } + } else { + // This should not happen if column_to_type was consistent + panic!( + "DataFrame::drop_column: internal error, no frame found for type_id {:?}", + type_id + ); + } + } +} + +impl fmt::Display for DataFrame { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // Display column headers + for col_name in self.column_names.iter().take(DEFAULT_DISPLAY_COLS) { + write!(f, "{:<15}", col_name)?; + } + if self.column_names.len() > DEFAULT_DISPLAY_COLS { + write!(f, "...")?; + } + writeln!(f)?; + + // Display data rows + let mut displayed_rows = 0; + for i in 0..self.index.len() { + if displayed_rows >= DEFAULT_DISPLAY_ROWS { + writeln!(f, "...")?; + break; + } + for col_name in self.column_names.iter().take(DEFAULT_DISPLAY_COLS) { + if let Some(type_id) = self.column_to_type.get(col_name) { + if let Some(sub_frame_box) = self.frames_by_type.get(type_id) { + write!(f, "{:<15}", sub_frame_box.get_value_as_string(i, col_name))?; + } else { + // This case indicates an inconsistency: column_to_type has an entry, + // but frames_by_type doesn't have the corresponding Frame. + write!(f, "{:<15}", "[ERROR]")?; + } + } else { + // This case indicates an inconsistency: column_names has an entry, + // but column_to_type doesn't have the corresponding column. 
+ write!(f, "{:<15}", "[ERROR]")?; + } + } + if self.column_names.len() > DEFAULT_DISPLAY_COLS { + write!(f, "...")?; + } + writeln!(f)?; + displayed_rows += 1; + } + Ok(()) + } +} + +impl fmt::Debug for DataFrame { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("DataFrame") + .field("column_names", &self.column_names) + .field("index", &self.index) + .field("column_to_type", &self.column_to_type) + .field("frames_by_type", &self.frames_by_type) + .finish() + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#[cfg(test)] +mod tests { + use super::*; + use crate::frame::Frame; + use crate::matrix::Matrix; + + #[test] + fn test_dataframe_new() { + let df = DataFrame::new(); + assert_eq!(df.rows(), 0); + assert_eq!(df.cols(), 0); + assert!(df.get_column_names().is_empty()); + assert!(df.frames_by_type.is_empty()); + assert!(df.column_to_type.is_empty()); + } + + #[test] + fn test_dataframe_add_column_initial() { + let mut df = DataFrame::new(); + let data = vec![1, 2, 3]; + df.add_column("col_int", data.clone()); + + assert_eq!(df.rows(), 3); + assert_eq!(df.cols(), 1); + assert_eq!(df.get_column_names(), &vec!["col_int".to_string()]); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + assert_eq!(df.column_to_type.get("col_int"), Some(&TypeId::of::())); + + // Verify the underlying frame + let sub_frame_box = df.frames_by_type.get(&TypeId::of::()).unwrap(); + let frame = sub_frame_box.as_any().downcast_ref::>().unwrap(); + assert_eq!(frame.rows(), 3); + assert_eq!(frame.cols(), 1); + assert_eq!(frame.columns(), &vec!["col_int".to_string()]); + } + + #[test] + fn test_dataframe_add_column_same_type() { + let mut df = DataFrame::new(); + df.add_column("col_int1", vec![1, 2, 3]); + df.add_column("col_int2", vec![4, 5, 6]); + + assert_eq!(df.rows(), 3); + assert_eq!(df.cols(), 2); + assert_eq!( + df.get_column_names(), + &vec!["col_int1".to_string(), 
"col_int2".to_string()] + ); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + assert_eq!( + df.column_to_type.get("col_int1"), + Some(&TypeId::of::()) + ); + assert_eq!( + df.column_to_type.get("col_int2"), + Some(&TypeId::of::()) + ); + + // Verify the underlying frame + let sub_frame_box = df.frames_by_type.get(&TypeId::of::()).unwrap(); + let frame = sub_frame_box.as_any().downcast_ref::>().unwrap(); + assert_eq!(frame.rows(), 3); + assert_eq!(frame.cols(), 2); + assert_eq!( + frame.columns(), + &vec!["col_int1".to_string(), "col_int2".to_string()] + ); + } + + #[test] + fn test_dataframe_add_column_different_type() { + let mut df = DataFrame::new(); + df.add_column("col_int", vec![1, 2, 3]); + df.add_column("col_float", vec![1.1, 2.2, 3.3]); + df.add_column( + "col_string", + vec!["a".to_string(), "b".to_string(), "c".to_string()], + ); + + assert_eq!(df.rows(), 3); + assert_eq!(df.cols(), 3); + assert_eq!( + df.get_column_names(), + &vec![ + "col_int".to_string(), + "col_float".to_string(), + "col_string".to_string() + ] + ); + + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + + assert_eq!(df.column_to_type.get("col_int"), Some(&TypeId::of::())); + assert_eq!( + df.column_to_type.get("col_float"), + Some(&TypeId::of::()) + ); + assert_eq!( + df.column_to_type.get("col_string"), + Some(&TypeId::of::()) + ); + + // Verify underlying frames + let int_frame = df + .frames_by_type + .get(&TypeId::of::()) + .unwrap() + .as_any() + .downcast_ref::>() + .unwrap(); + assert_eq!(int_frame.columns(), &vec!["col_int".to_string()]); + + let float_frame = df + .frames_by_type + .get(&TypeId::of::()) + .unwrap() + .as_any() + .downcast_ref::>() + .unwrap(); + assert_eq!(float_frame.columns(), &vec!["col_float".to_string()]); + + let string_frame = df + .frames_by_type + .get(&TypeId::of::()) + .unwrap() + .as_any() + 
.downcast_ref::>() + .unwrap(); + assert_eq!(string_frame.columns(), &vec!["col_string".to_string()]); + } + + #[test] + fn test_dataframe_get_column() { + let mut df = DataFrame::new(); + df.add_column("col_int", vec![1, 2, 3]); + df.add_column("col_float", vec![1.1, 2.2, 3.3]); + df.add_column( + "col_string", + vec!["a".to_string(), "b".to_string(), "c".to_string()], + ); + + // Test getting existing columns with correct type + assert_eq!( + df.get_column::("col_int").unwrap(), + vec![1, 2, 3].as_slice() + ); + assert_eq!( + df.get_column::("col_float").unwrap(), + vec![1.1, 2.2, 3.3].as_slice() + ); + assert_eq!( + df.get_column::("col_string").unwrap(), + vec!["a".to_string(), "b".to_string(), "c".to_string()].as_slice() + ); + + // Test getting non-existent column + assert_eq!(df.get_column::("non_existent"), None); + + // Test getting existing column with incorrect type + assert_eq!(df.get_column::("col_int"), None); + assert_eq!(df.get_column::("col_float"), None); + } + + #[test] + fn test_dataframe_get_row() { + let mut df = DataFrame::new(); + df.add_column("col_int", vec![1, 2, 3]); + df.add_column("col_float", vec![1.1, 2.2, 3.3]); + df.add_column( + "col_string", + vec!["a".to_string(), "b".to_string(), "c".to_string()], + ); + + // Test getting an existing row + let row0 = df.get_row(0).unwrap(); + assert_eq!(row0.get("col_int"), Some(&"1".to_string())); + assert_eq!(row0.get("col_float"), Some(&"1.1".to_string())); + assert_eq!(row0.get("col_string"), Some(&"a".to_string())); + + let row1 = df.get_row(1).unwrap(); + assert_eq!(row1.get("col_int"), Some(&"2".to_string())); + assert_eq!(row1.get("col_float"), Some(&"2.2".to_string())); + assert_eq!(row1.get("col_string"), Some(&"b".to_string())); + + // Test getting an out-of-bounds row + assert_eq!(df.get_row(3), None); + } + + #[test] + #[should_panic(expected = "DataFrame::add_column: duplicate column name: 'col_int'")] + fn test_dataframe_add_column_duplicate_name() { + let mut df = 
DataFrame::new(); + df.add_column("col_int", vec![1, 2, 3]); + df.add_column("col_int", vec![4, 5, 6]); + } + + #[test] + #[should_panic( + expected = "DataFrame::add_column: new column 'col_int2' has 2 rows, but existing columns have 3 rows" + )] + fn test_dataframe_add_column_mismatched_rows() { + let mut df = DataFrame::new(); + df.add_column("col_int1", vec![1, 2, 3]); + df.add_column("col_int2", vec![4, 5]); + } + + #[test] + fn test_dataframe_display() { + let mut df = DataFrame::new(); + df.add_column("col_int", vec![1, 2, 3, 4, 5, 6]); + df.add_column("col_float", vec![1.1, 2.2, 3.3, 4.4, 5.5, 6.6]); + df.add_column( + "col_string", + vec![ + "a".to_string(), + "b".to_string(), + "c".to_string(), + "d".to_string(), + "e".to_string(), + "f".to_string(), + ], + ); + + let expected_output = "\ +col_int col_float col_string +1 1.1 a +2 2.2 b +3 3.3 c +4 4.4 d +5 5.5 e +... +"; + assert_eq!(format!("{}", df), expected_output); + } + + #[test] + fn test_dataframe_debug() { + let mut df = DataFrame::new(); + df.add_column("col_int", vec![1, 2, 3]); + df.add_column("col_float", vec![1.1, 2.2, 3.3]); + + let debug_output = format!("{:?}", df); + assert!(debug_output.contains("DataFrame {")); + assert!(debug_output.contains("column_names: [\"col_int\", \"col_float\"]")); + assert!(debug_output.contains("index: Range(0..3)")); + assert!(debug_output.contains("column_to_type: {")); + assert!(debug_output.contains("frames_by_type: {")); + } + + #[test] + fn test_dataframe_drop_column_single_type() { + let mut df = DataFrame::new(); + df.add_column("col_int1", vec![1, 2, 3]); + df.add_column("col_int2", vec![4, 5, 6]); + df.add_column("col_float", vec![1.1, 2.2, 3.3]); + + assert_eq!(df.cols(), 3); + assert_eq!( + df.get_column_names(), + &vec![ + "col_int1".to_string(), + "col_int2".to_string(), + "col_float".to_string() + ] + ); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + + 
df.drop_column("col_int1"); + + assert_eq!(df.cols(), 2); + assert_eq!( + df.get_column_names(), + &vec!["col_int2".to_string(), "col_float".to_string()] + ); + assert!(df.column_to_type.get("col_int1").is_none()); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); // Frame should still exist + let int_frame = df + .frames_by_type + .get(&TypeId::of::()) + .unwrap() + .as_any() + .downcast_ref::>() + .unwrap(); + assert_eq!(int_frame.columns(), &vec!["col_int2".to_string()]); + + df.drop_column("col_int2"); + + assert_eq!(df.cols(), 1); + assert_eq!(df.get_column_names(), &vec!["col_float".to_string()]); + assert!(df.column_to_type.get("col_int2").is_none()); + assert!(!df.frames_by_type.contains_key(&TypeId::of::())); // Frame should be removed + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + } + + #[test] + fn test_dataframe_drop_column_mixed_types() { + let mut df = DataFrame::new(); + df.add_column("col_int", vec![1, 2, 3]); + df.add_column("col_float", vec![1.1, 2.2, 3.3]); + df.add_column( + "col_string", + vec!["a".to_string(), "b".to_string(), "c".to_string()], + ); + + assert_eq!(df.cols(), 3); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + + df.drop_column("col_float"); + + assert_eq!(df.cols(), 2); + assert_eq!( + df.get_column_names(), + &vec!["col_int".to_string(), "col_string".to_string()] + ); + assert!(df.column_to_type.get("col_float").is_none()); + assert!(!df.frames_by_type.contains_key(&TypeId::of::())); // Frame should be removed + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + + df.drop_column("col_int"); + df.drop_column("col_string"); + + assert_eq!(df.cols(), 0); + assert!(df.get_column_names().is_empty()); + assert!(df.frames_by_type.is_empty()); + assert!(df.column_to_type.is_empty()); + } + + #[test] + 
#[should_panic(expected = "DataFrame::drop_column: column 'non_existent' not found")] + fn test_dataframe_drop_column_non_existent() { + let mut df = DataFrame::new(); + df.add_column("col_int", vec![1, 2, 3]); + df.drop_column("non_existent"); + } + + #[test] + fn test_dataframe_add_column_reuses_existing_frame() { + let mut df = DataFrame::new(); + df.add_column("col_int1", vec![1, 2, 3]); + df.add_column("col_float1", vec![1.1, 2.2, 3.3]); + + // Initially, there should be two frames (one for i32, one for f64) + assert_eq!(df.frames_by_type.len(), 2); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + + // Add another integer column + df.add_column("col_int2", vec![4, 5, 6]); + + // The number of frames should still be 2, as the existing i32 frame should be reused + assert_eq!(df.frames_by_type.len(), 2); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + + // Verify the i32 frame now contains both integer columns + let int_frame = df.frames_by_type.get(&TypeId::of::()).unwrap().as_any().downcast_ref::>().unwrap(); + assert_eq!(int_frame.columns(), &vec!["col_int1".to_string(), "col_int2".to_string()]); + assert_eq!(int_frame.cols(), 2); + + // Add another float column + df.add_column("col_float2", vec![4.4, 5.5, 6.6]); + + // The number of frames should still be 2, as the existing f64 frame should be reused + assert_eq!(df.frames_by_type.len(), 2); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + assert!(df.frames_by_type.contains_key(&TypeId::of::())); + + // Verify the f64 frame now contains both float columns + let float_frame = df.frames_by_type.get(&TypeId::of::()).unwrap().as_any().downcast_ref::>().unwrap(); + assert_eq!(float_frame.columns(), &vec!["col_float1".to_string(), "col_float2".to_string()]); + assert_eq!(float_frame.cols(), 2); + } +} diff --git a/src/dataframe/mod.rs b/src/dataframe/mod.rs 
new file mode 100644 index 0000000..c477580 --- /dev/null +++ b/src/dataframe/mod.rs @@ -0,0 +1,4 @@ +//! This module provides the DataFrame structure for handling tabular data with mixed types. +pub mod df; + +pub use df::{DataFrame, SubFrame}; diff --git a/src/frame/base.rs b/src/frame/base.rs index 97552cc..553062c 100644 --- a/src/frame/base.rs +++ b/src/frame/base.rs @@ -316,7 +316,7 @@ impl Frame { ) } - /// Returns an immutable slice of the specified column's data. + /// Returns an immutable slice of the specified column's data by name. /// Panics if the column name is not found. pub fn column(&self, name: &str) -> &[T] { let idx = self @@ -325,7 +325,13 @@ impl Frame { self.matrix.column(idx) } - /// Returns a mutable slice of the specified column's data. + /// Returns an immutable slice of the specified column's data by its physical index. + /// Panics if the index is out of bounds. + pub fn column_by_physical_idx(&self, idx: usize) -> &[T] { + self.matrix.column(idx) + } + + /// Returns a mutable slice of the specified column's data by name. /// Panics if the column name is not found. pub fn column_mut(&mut self, name: &str) -> &mut [T] { let idx = self @@ -334,6 +340,12 @@ impl Frame { self.matrix.column_mut(idx) } + /// Returns a mutable slice of the specified column's data by its physical index. + /// Panics if the index is out of bounds. + pub fn column_mut_by_physical_idx(&mut self, idx: usize) -> &mut [T] { + self.matrix.column_mut(idx) + } + // Row access methods /// Returns an immutable view of the row for the given integer key. diff --git a/src/lib.rs b/src/lib.rs index 510f36d..68a4755 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,8 @@ #![doc = include_str!("../README.md")] +/// Documentation for the [`crate::dataframe`] module. +pub mod dataframe; + /// Documentation for the [`crate::matrix`] module. pub mod matrix;