diff --git a/src/utils/qdf/core.rs b/src/utils/qdf/core.rs index 38eec71..82a9492 100644 --- a/src/utils/qdf/core.rs +++ b/src/utils/qdf/core.rs @@ -37,21 +37,11 @@ pub fn is_quantamental_dataframe(df: &DataFrame) -> bool { check_quantamental_dataframe(df).is_ok() } -/// Sort the columns of a Quantamental DataFrame. -/// The first columns are `real_date`, `cid`, and `xcat`. -/// These are followed by any available JPMAQS metrics, 'value', 'grading', 'eop_lag', 'mop_lag', -/// (**in that order**), followed by any other metrics (in alphabetical order). -pub fn sort_qdf_columns(qdf: &mut DataFrame) -> Result<(), Box> { +pub fn get_sorted_qdf_columns(columns: Vec) -> Vec { let index_columns = ["real_date", "cid", "xcat"]; let known_metrics = ["value", "grading", "eop_lag", "mop_lag"]; - let df_columns = qdf - .get_column_names() - .into_iter() - .map(|s| s.clone().into_string()) - .collect::>(); - - let mut unknown_metrics: Vec = df_columns + let mut unknown_metrics: Vec = columns .iter() .filter(|&m| !known_metrics.contains(&m.as_str())) .filter(|&m| !index_columns.contains(&m.as_str())) @@ -61,13 +51,30 @@ pub fn sort_qdf_columns(qdf: &mut DataFrame) -> Result<(), Box> { let mut new_columns: Vec = vec![]; new_columns.extend(index_columns.iter().map(|s| s.to_string())); for &colname in &known_metrics { - if df_columns.contains(&colname.into()) { + if columns.contains(&colname.into()) { new_columns.push(colname.to_string()); } } unknown_metrics.sort(); new_columns.extend(unknown_metrics); + + new_columns +} + +/// Sort the columns of a Quantamental DataFrame. +/// The first columns are `real_date`, `cid`, and `xcat`. +/// These are followed by any available JPMAQS metrics, 'value', 'grading', 'eop_lag', 'mop_lag', +/// (**in that order**), followed by any other metrics (in alphabetical order). +pub fn sort_qdf_columns(qdf: &mut DataFrame) -> Result<(), Box> { + let df_columns = qdf + .get_column_names() + .into_iter() + .map(|s| s.to_string()) + .collect::>(); + + let new_columns = get_sorted_qdf_columns(df_columns); + *qdf = qdf .select(new_columns.clone()) .expect("Failed to select columns"); @@ -142,21 +149,23 @@ pub fn get_unique_xcats(df: &DataFrame) -> Result, Box> { get_unique_from_str_column(df, "xcat") } +pub fn get_unique_metrics(df: &DataFrame) -> Result, Box> { + // return a list of all columns that are not 'real_date', 'cid', 'xcat' + let columns = df + .get_column_names() + .iter() + .map(|s| s.as_str().to_string()) + .collect(); + + let sorted_cols = get_sorted_qdf_columns(columns); + + // return sorted_cols[3..].to_vec() + Ok(sorted_cols[3..].to_vec()) +} + /// Get the unique dates as a polars Column from a Quantamental DataFrame. pub fn get_unique_dates(df: &DataFrame) -> Result> { let date_col = df.column("real_date")?; let unique_dates = date_col.unique()?.sort(SortOptions::default())?; Ok(unique_dates) } - - - - - - - - - - - - diff --git a/src/utils/qdf/pivots.rs b/src/utils/qdf/pivots.rs index acf7acf..4699870 100644 --- a/src/utils/qdf/pivots.rs +++ b/src/utils/qdf/pivots.rs @@ -27,8 +27,8 @@ pub fn pivot_dataframe_by_ticker( // set metric to the first non-index column metric = df.get_column_names()[3].to_string(); } + let unique_dates: Column = get_unique_dates(&df)?; - // let mut new_df = df.clone(); // keep only the index columns and the metric column let mut keep_cols = QDF_INDEX_COLUMNS .to_vec() @@ -37,23 +37,24 @@ pub fn pivot_dataframe_by_ticker( .collect::>(); keep_cols.push(metric.clone()); - return Err("Not implemented".into()); + let out_dfs = split_df_by_tickers(&df, Some(vec![metric.clone()]))?; + + // create a new dataframe with the unique dates, and iteratively add add the metric columns + let mut new_df = DataFrame::new(vec![unique_dates])?; + + - // new_df = new_df.select(keep_cols)?; - // let ticker_col = get_ticker_column_for_quantamental_dataframe(&new_df)?; - // new_df.with_column(ticker_col)?; - // // drop the cid and xcat columns - // new_df = new_df.drop_many(&["cid".to_string(), "xcat".to_string()]); - // let dates_col = df.column("real_date")?; - // Ok(df) } /// Splits a dataframe by ticker. #[allow(dead_code)] -fn split_df_by_tickers(df: &DataFrame) -> Result, Box> { +fn split_df_by_tickers( + df: &DataFrame, + metrics: Option>, +) -> Result, Box> { check_quantamental_dataframe(df)?; let mut df_outs = HashMap::new(); @@ -66,6 +67,15 @@ fn split_df_by_tickers(df: &DataFrame) -> Result, Box icids.push(cid); ixcats.push(xcat); } + let metrics = metrics.unwrap_or_else(|| get_unique_metrics(df).unwrap()); + // let keep_cols:Vec = QDF_INDEX_COLUMNS + metrics; + let mut keep_cols = QDF_INDEX_COLUMNS + .to_vec() + .iter() + .map(|s| s.to_string()) + .collect::>(); + + keep_cols.extend(metrics); for (cid, xcat) in icids.iter().zip(ixcats.iter()) { // Apply filter while borrowing df and avoid moving ownership. @@ -74,17 +84,39 @@ fn split_df_by_tickers(df: &DataFrame) -> Result, Box .and(col("xcat").eq(lit(xcat.clone()))); let df_out = df.clone().lazy().filter(filter).collect()?; + // select keep_cols + let df_out = df_out.select(keep_cols.clone())?; df_outs.insert(format!("{}_{}", cid, xcat), df_out); } Ok(df_outs) } -fn single_ticker_qdf_to_timeseries(df: &DataFrame) -> Result, Box> { +fn single_ticker_qdf_to_timeseries(mut df: DataFrame) -> Result, Box> { let mut df_vec = Vec::new(); - // copy the date col - // let date_col = + // Since we own `df`, we can remove the "real_date" column, consuming it. + let date_col = df.drop_in_place("real_date")?; - Ok(vec![df.to_owned()]) + // Assuming `get_unique_metrics` only needs a reference to `df` + let metrics = get_unique_metrics(&df)?; + + for metric in metrics.iter() { + // Remove the metric column from `df`, consuming it. + let metric_col = df.drop_in_place(metric)?; + + // Create a new DataFrame with the date column and the metric column. + let new_df = DataFrame::new(vec![ + date_col.clone(), // Clone because `date_col` is used multiple times + metric_col, // Move the metric column (no clone needed) + ])?; + df_vec.push(new_df); + } + // At this point, `df` has had its metric columns removed and is effectively consumed. + // The "date_col" has been moved out of `df` as well. + + // If you need to use `df` after this function, consider redesigning the function + // to avoid consuming it. + + Ok(df_vec) }