From 25192e425d70853c672ec22a93076a1fc40dfde1 Mon Sep 17 00:00:00 2001 From: Palash Tyagi <23239946+Magnus167@users.noreply.github.com> Date: Wed, 16 Apr 2025 01:36:48 +0100 Subject: [PATCH] completed wdf.columns implementation --- src/utils/qdf/blacklist.rs | 166 ++++++++++++++++++++++++++++++++----- 1 file changed, 143 insertions(+), 23 deletions(-) diff --git a/src/utils/qdf/blacklist.rs b/src/utils/qdf/blacklist.rs index d3c2566..fa53b2b 100644 --- a/src/utils/qdf/blacklist.rs +++ b/src/utils/qdf/blacklist.rs @@ -1,24 +1,23 @@ -use crate::utils::bdates::{self, get_bdates_list_with_freq, BDateFreq}; -use crate::utils::dateutils::{get_bdates_series_default_opt, get_min_max_real_dates}; +use crate::utils::bdates::{get_bdates_list_with_freq, BDateFreq}; +use crate::utils::dateutils::get_min_max_real_dates; +use crate::utils::misc::get_cid; use crate::utils::qdf::core::check_quantamental_dataframe; use chrono::NaiveDate; use polars::prelude::*; -use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap}; use std::error::Error; pub fn create_blacklist_from_qdf( df: &DataFrame, + group_by_cid: Option, + blacklist_name: Option, metric: Option, - // ) -> Result, Box> { -) -> Result>, Box> { - // Verify that the DataFrame follows the Quantamental structure. +) -> Result, Box> { check_quantamental_dataframe(df)?; - let mut blacklist: HashMap> = HashMap::new(); - // let mut blacklist: HashMap = HashMap::new(); - let mut blk: HashMap> = HashMap::new(); - // Use the provided metric or default to "value". let metric = metric.unwrap_or_else(|| "value".into()); + let blacklist_name = blacklist_name.unwrap_or_else(|| "BLACKLIST".into()); + let group_by_cid = group_by_cid.unwrap_or(true); let (min_date, max_date) = get_min_max_real_dates(df, "real_date".into())?; let min_date_str = min_date.format("%Y-%m-%d").to_string(); @@ -70,27 +69,118 @@ pub fn create_blacklist_from_qdf( let rdt = get_vec_of_vec_of_dates_from_df(df)?; - // assert!(0 == 1, "{:?}", rdt); + let mut blk: HashMap> = HashMap::new(); for (tkr, dates) in ticker_vec.iter().zip(rdt.iter()) { - blacklist.insert(tkr.to_string(), dates.clone()); + if group_by_cid { + let _cid = get_cid(tkr.clone())?; + if blk.contains_key(&_cid) { + blk.get_mut(&_cid).unwrap().extend(dates.iter().cloned()); + } else { + blk.insert(_cid, dates.clone()); + } + } else { + blk.insert(tkr.to_string(), dates.clone()); + } + } + for (_key, vals) in blk.iter_mut() { + // order is important - dedup depends on the vec being sorted + vals.sort(); + vals.dedup(); } - Ok(blacklist) -} - -fn convert_dates_list_to_date_ranges( - blacklist: HashMap>, - all_bdates: Vec, -) -> HashMap { - let blk = HashMap::new(); - let bdates = all_bdates + let all_bdates_strs = all_bdates .iter() .map(|date| date.format("%Y-%m-%d").to_string()) .collect::>(); - // + let mut blacklist: HashMap = HashMap::new(); + for (tkr, dates) in blk.iter() { + let date_ranges = convert_dates_list_to_date_ranges(dates.clone(), all_bdates_strs.clone()); + for (rng_idx, (start_date, end_date)) in date_ranges.iter() { + let range_key = format!("{}_{}_{}", tkr, blacklist_name.clone(), rng_idx); + blacklist.insert(range_key, (start_date.clone(), end_date.clone())); + } + } + // Ok(blacklist) - blk + let mut btree_map: BTreeMap = BTreeMap::new(); + for (key, (start_date, end_date)) in blacklist.iter() { + btree_map.insert(key.clone(), (start_date.clone(), end_date.clone())); + } + + Ok(btree_map) +} + +fn convert_dates_list_to_date_ranges( + blacklist: Vec, + all_bdates_strs: Vec, +) -> HashMap { + // Step 1: Map every date in all_bdates_strs to its index + let mut all_map: HashMap = HashMap::new(); + for (i, d) in all_bdates_strs.iter().enumerate() { + all_map.insert(d.clone(), i); + } + + // Step 2: Convert each blacklisted date into its index, if it exists + let mut blacklisted_indices: Vec = Vec::new(); + for dt in blacklist { + if let Some(&idx) = all_map.get(&dt) { + blacklisted_indices.push(idx); + } + } + + // Step 3: Sort the blacklisted indices + blacklisted_indices.sort_unstable(); + + // Step 4: Traverse and group consecutive indices into ranges + let mut result: HashMap = HashMap::new(); + let mut string_result: HashMap = HashMap::new(); + + if blacklisted_indices.is_empty() { + return string_result; + } + + let mut range_idx: i64 = 0; + let mut start_idx = blacklisted_indices[0]; + let mut end_idx = start_idx; + + for &cur_idx in blacklisted_indices.iter().skip(1) { + if cur_idx == end_idx + 1 { + // We are still in a contiguous run + end_idx = cur_idx; + } else { + // We hit a break in contiguity, so store the last range + result.insert( + range_idx, + ( + all_bdates_strs[start_idx].clone(), + all_bdates_strs[end_idx].clone(), + ), + ); + range_idx += 1; + + // Start a new range + start_idx = cur_idx; + end_idx = cur_idx; + } + } + + // Don't forget to store the final range after the loop + result.insert( + range_idx, + ( + all_bdates_strs[start_idx].clone(), + all_bdates_strs[end_idx].clone(), + ), + ); + + let max_digits = result.keys().max().unwrap_or(&-1).to_string().len(); + for (key, (start_date, end_date)) in result.iter() { + let new_key = format!("{:0width$}", key, width = max_digits); + string_result.insert(new_key, (start_date.clone(), end_date.clone())); + } + + string_result } fn get_vec_of_vec_of_dates_from_df(df: DataFrame) -> Result>, Box> { @@ -115,6 +205,7 @@ fn get_vec_of_vec_of_dates_from_df(df: DataFrame) -> Result>, Bo Ok(rdt) } +#[allow(dead_code)] fn get_vec_of_vec_of_naivedates_from_df( df: DataFrame, ) -> Result>, Box> { @@ -159,3 +250,32 @@ fn get_vec_of_vec_of_naivedates_from_df( // .collect::>>(); // Ok(rdt) // } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_convert_dates_list_to_date_ranges() { + let all_dates = vec![ + "2023-01-01".to_string(), + "2023-01-02".to_string(), + "2023-01-03".to_string(), + "2023-01-04".to_string(), + "2023-01-05".to_string(), + "2023-01-06".to_string(), + ]; + let blacklist = vec![ + "2023-01-02".to_string(), + "2023-01-03".to_string(), + "2023-01-05".to_string(), + ]; + + let result = convert_dates_list_to_date_ranges(blacklist, all_dates); + // Expect two ranges: + // range 0 => ("2023-01-02", "2023-01-03") + // range 1 => ("2023-01-05", "2023-01-05") + assert_eq!(result["0"], ("2023-01-02".to_string(), "2023-01-03".to_string())); + assert_eq!(result["1"], ("2023-01-05".to_string(), "2023-01-05".to_string())); + } +} \ No newline at end of file