diff --git a/src/utils/qdf/blacklist.rs b/src/utils/qdf/blacklist.rs index fa53b2b..61ac029 100644 --- a/src/utils/qdf/blacklist.rs +++ b/src/utils/qdf/blacklist.rs @@ -7,15 +7,16 @@ use polars::prelude::*; use std::collections::{BTreeMap, HashMap}; use std::error::Error; +use super::get_unique_metrics; + pub fn create_blacklist_from_qdf( df: &DataFrame, group_by_cid: Option, blacklist_name: Option, - metric: Option, + metrics: Option>, ) -> Result, Box> { check_quantamental_dataframe(df)?; - - let metric = metric.unwrap_or_else(|| "value".into()); + let metrics = metrics.unwrap_or_else(|| get_unique_metrics(df).unwrap()); let blacklist_name = blacklist_name.unwrap_or_else(|| "BLACKLIST".into()); let group_by_cid = group_by_cid.unwrap_or(true); @@ -29,19 +30,19 @@ pub fn create_blacklist_from_qdf( BDateFreq::Daily, )?; - // filter df - let null_mask = df.column(metric.as_str())?.is_null(); - let nan_mask = df.column(metric.as_str())?.is_nan()?; - let null_mask = null_mask | nan_mask; - let df = df.filter(&null_mask)?; + let null_mask = get_nan_mask(df, metrics)?; + + let df = df.filter(&null_mask)?.clone(); let df = df + .clone() .lazy() - .filter( - col(metric.as_str()) - .is_null() - .or(col(metric.as_str()).is_nan()), - ) + // .filter(&null_mask) + // .filter( + // col(metric.as_str()) + // .is_null() + // .or(col(metric.as_str()).is_nan()), + // ) .sort( ["cid", "xcat"], SortMultipleOptions::default().with_maintain_order(true), @@ -85,7 +86,7 @@ pub fn create_blacklist_from_qdf( for (_key, vals) in blk.iter_mut() { // order is important - dedup depends on the vec being sorted vals.sort(); - vals.dedup(); + vals.dedup(); } let all_bdates_strs = all_bdates @@ -111,6 +112,25 @@ pub fn create_blacklist_from_qdf( Ok(btree_map) } +fn get_nan_mask( + df: &DataFrame, + metrics: Vec, +) -> Result, Box> { + let null_masks: Vec> = metrics + .iter() + .map(|metric| { + let null_mask = df.column(metric.as_str())?.is_null(); + let nan_mask = df.column(metric.as_str())?.is_nan()?; + Ok(null_mask | nan_mask) + }) + .collect::>>()?; + let null_mask = null_masks + .into_iter() + .reduce(|acc, mask| acc | mask) + .unwrap_or_else(|| BooleanChunked::full_null("null_mask".into(), df.height())); + Ok(null_mask) +} + fn convert_dates_list_to_date_ranges( blacklist: Vec, all_bdates_strs: Vec, @@ -275,7 +295,13 @@ mod tests { // Expect two ranges: // range 0 => ("2023-01-02", "2023-01-03") // range 1 => ("2023-01-05", "2023-01-05") - assert_eq!(result["0"], ("2023-01-02".to_string(), "2023-01-03".to_string())); - assert_eq!(result["1"], ("2023-01-05".to_string(), "2023-01-05".to_string())); + assert_eq!( + result["0"], + ("2023-01-02".to_string(), "2023-01-03".to_string()) + ); + assert_eq!( + result["1"], + ("2023-01-05".to_string(), "2023-01-05".to_string()) + ); } -} \ No newline at end of file +}