msyrs/src/panel/linear_composite.rs

use crate::utils::qdf::check_quantamental_dataframe;
use crate::utils::qdf::pivots::{pivot_dataframe_by_ticker, pivot_wide_dataframe_to_qdf};
use crate::utils::qdf::reduce_df::reduce_dataframe;
use polars::prelude::*;
use std::collections::HashMap;
const TOLERANCE: f64 = 1e-8;

fn _norm_apply_nan_mask(
    weights_dfw: &DataFrame,
    nan_mask_dfw: &DataFrame,
) -> Result<DataFrame, PolarsError> {
    let mut norm_dfw = DataFrame::new(vec![])?;
    for col in weights_dfw.get_column_names().clone() {
        let weight_col = weights_dfw.column(col)?;
        let nan_mask_col = nan_mask_dfw.column(col)?;
        let nan_mask_series = (!(nan_mask_col.bool()?)).into_column();
        let float_col = nan_mask_series.cast(&DataType::Float64)?;
        let float_series = (weight_col * &float_col)?;
        let sum = float_series.f64()?.sum().unwrap();
        if sum < TOLERANCE {
            // get the sum of the nan_mask_series
            let nan_mask_sum = nan_mask_series
                .cast(&DataType::Float64)?
                .f64()?
                .sum()
                .unwrap();
            // if the sum of the nan_mask_series is close to the len of the series, then ok
            let weight_sum = weight_col.f64()?.sum().unwrap();
            let ratio = (nan_mask_sum - weight_col.len() as f64).abs() / weight_col.len() as f64;
            assert!(
                ratio < TOLERANCE,
                "The sum of the updated series is zero: {:?} w ratio: {:?} and the weights summing to: {:?}",
                sum, nan_mask_sum, weight_sum
            );
        }
        norm_dfw.with_column(float_series)?;
    }
    Ok(norm_dfw)
}

fn normalize_weights_with_nan_mask(
    weights_dfw: DataFrame,
    nan_mask_dfw: DataFrame,
) -> Result<DataFrame, PolarsError> {
    // Ensure dimensions and columns match
    assert_eq!(weights_dfw.shape(), nan_mask_dfw.shape());
    assert!(
        weights_dfw
            .get_column_names()
            .iter()
            .all(|col| nan_mask_dfw.get_column_names().contains(col)),
        "The columns in weights_dfw and nan_mask_dfw do not match."
    );

    let mut norm_dfw = _norm_apply_nan_mask(&weights_dfw, &nan_mask_dfw)?;

    let sums_of_rows = norm_dfw
        .sum_horizontal(polars::frame::NullStrategy::Ignore)?
        .unwrap()
        .into_column();

    for col in weights_dfw.get_column_names() {
        let weight_series = norm_dfw.column(col)?;
        let divided_series = weight_series.divide(&sums_of_rows)?;
        let normalized_series = divided_series.as_series().unwrap().clone();
        norm_dfw.replace(col, normalized_series)?;
    }

    Ok(norm_dfw)
}

fn _form_agg_data_dfw(dfw: &DataFrame, agg_targs: &Vec<String>) -> Result<DataFrame, PolarsError> {
    let mut data_dfw = DataFrame::new(vec![])?;
    for agg_targ in agg_targs {
        match dfw.column(agg_targ) {
            Ok(agg_targ_series) => {
                // If the column exists, clone it and add it to data_dfw
                data_dfw.with_column(agg_targ_series.clone())?;
            }
            Err(_) => {
                // If the column does not exist, create a series full of NaNs
                let nan_series =
                    Series::full_null(agg_targ.into(), dfw.height(), &DataType::Float64);
                data_dfw.with_column(nan_series)?;
            }
        }
    }
    Ok(data_dfw)
}

fn _form_agg_nan_mask_dfw(data_dfw: &DataFrame) -> Result<DataFrame, PolarsError> {
    // Create a NaN mask DataFrame
    let mut nan_mask_dfw = DataFrame::new(vec![])?;
    for column in data_dfw.get_columns() {
        let nan_mask_series = column.is_null().into_series();
        nan_mask_dfw.with_column(nan_mask_series)?;
    }
    Ok(nan_mask_dfw)
}

fn _form_agg_nan_mask_series(nan_mask_dfw: &DataFrame) -> Result<Series, PolarsError> {
    // perform a row-wise OR
    let columns = nan_mask_dfw.get_columns();
    let mut combined = columns[0].bool()?.clone();
    for column in columns.iter().skip(1) {
        combined = &combined | column.bool()?;
    }
    Ok(combined.into_series())
}

/// Form the weights DataFrame
fn _form_agg_weights_dfw(
    agg_weights_map: &HashMap<String, (WeightValue, f64)>,
    dfw: &DataFrame,
) -> Result<DataFrame, PolarsError> {
    let mut weights_dfw = DataFrame::new(vec![])?;
    for (agg_targ, weight_signs) in agg_weights_map.iter() {
        // let wgt = weight_signs[0] * weight_signs[1];
        let wgt_series = match &weight_signs.0 {
            WeightValue::F64(val) => {
                let wgt = val * weight_signs.1;
                Series::new(agg_targ.into(), vec![wgt; dfw.height()])
            }
            WeightValue::Str(vstr) => {
                // vstr column from data_dfw, else raise wieght specification error
                if !dfw.get_column_names().contains(&&PlSmallStr::from(vstr)) {
                    return Err(PolarsError::ComputeError(
                        format!(
                            "The column {} does not exist in the DataFrame. {:?}",
                            vstr, agg_weights_map
                        )
                        .into(),
                    ));
                }
                let vstr_series = dfw.column(vstr)?;
                let multiplied_series = vstr_series * weight_signs.1;
                let mut multiplied_series =
                    multiplied_series.as_series().cloned().ok_or_else(|| {
                        PolarsError::ComputeError(
                            "Failed to convert multiplied_series to Series".into(),
                        )
                    })?;
                multiplied_series.rename(agg_targ.into());
                multiplied_series
            }
        };
        weights_dfw.with_column(wgt_series)?;
    }
    Ok(weights_dfw)
}

fn _assert_weights_and_data_match(
    data_dfw: &DataFrame,
    weights_dfw: &DataFrame,
) -> Result<(), PolarsError> {
    for col in weights_dfw.get_column_names() {
        if !data_dfw.get_column_names().contains(&col) {
            return Err(PolarsError::ComputeError(
                format!(
                    "weights_dfw and data_dfw do not have the same set of columns: {}",
                    col
                )
                .into(),
            ));
        }
    }
    Ok(())
}

fn perform_single_group_agg(
    dfw: &DataFrame,
    agg_on: &String,
    agg_targs: &Vec<String>,
    agg_weights_map: &HashMap<String, (WeightValue, f64)>,
    normalize_weights: bool,
    complete: bool,
) -> Result<Column, PolarsError> {
    let data_dfw = _form_agg_data_dfw(dfw, agg_targs)?;
    let nan_mask_dfw = _form_agg_nan_mask_dfw(&data_dfw)?;
    let nan_mask_series = _form_agg_nan_mask_series(&nan_mask_dfw)?;
    let weights_dfw = _form_agg_weights_dfw(agg_weights_map, dfw)?;
    let weights_dfw = match normalize_weights {
        true => normalize_weights_with_nan_mask(weights_dfw, nan_mask_dfw)?,
        false => weights_dfw,
    };

    println!("weights_dfw: {:?}", weights_dfw);
    // assert weights and data have the same set of columns
    _assert_weights_and_data_match(&data_dfw, &weights_dfw)?;

    // let output_dfw = weights_dfw * data_dfw;
    let mut output_columns = Vec::with_capacity(data_dfw.get_column_names().len());
    for col_name in data_dfw.get_column_names() {
        let data_series = data_dfw.column(col_name)?;
        let weight_series = weights_dfw.column(col_name)?;
        let mut multiplied = (data_series * weight_series)?;
        multiplied.rename(col_name.to_string().into());
        output_columns.push(multiplied);
    }

    let mut sum_series = output_columns[0].clone();
    for i in 1..output_columns.len() {
        let filled_column = output_columns[i].fill_null(FillNullStrategy::Zero)?;
        sum_series = (&sum_series + &filled_column)?;
    }

    if complete {
        sum_series = sum_series.zip_with(
            nan_mask_series.bool()?,
            &Series::full_null(agg_on.clone().into(), sum_series.len(), sum_series.dtype())
                .into_column(),
        )?;
    }
    sum_series.rename(agg_on.clone().into());
    // new_dfw.with_column(sum_series)?;

    Ok(sum_series)
}

fn perform_multiplication(
    dfw: &DataFrame,
    mult_targets: &HashMap<String, Vec<String>>,
    weights_map: &HashMap<String, HashMap<String, (WeightValue, f64)>>,
    complete: bool,
    normalize_weights: bool,
) -> Result<DataFrame, PolarsError> {
    // let real_date = dfw.column("real_date".into())?.clone();
    // let mut new_dfw = DataFrame::new(vec![real_date])?;
    let mut new_dfw = DataFrame::new(vec![])?;
    assert!(!mult_targets.is_empty(), "agg_targs is empty");

    for (agg_on, agg_targs) in mult_targets.iter() {
        // perform_single_group_agg
        let cols_len = new_dfw.get_column_names().len();
        let new_col = perform_single_group_agg(
            dfw,
            agg_on,
            agg_targs,
            &weights_map[agg_on],
            normalize_weights,
            complete,
        )?;
        // assert that the number of columns has grown
        assert!(
            new_col.len() != 0,
            "The new DataFrame is empty after aggregation."
        );
        new_dfw.with_column(new_col)?;
        // if the height of new_dfw is 0,
        let new_cols_len = new_dfw.get_column_names().len();

        assert!(
            new_cols_len > cols_len,
            "The number of columns did not grow after aggregation."
        );
    }
    // get the real_date column from dfw
    let real_date = dfw.column("real_date".into())?.clone();
    new_dfw.with_column(real_date.clone())?;

    // Placeholder logic to return a valid DataFrame for now
    // Replace this with the actual implementation
    Ok(new_dfw)
}

fn get_agg_on_agg_targs(cids: Vec<String>, xcats: Vec<String>) -> (Vec<String>, Vec<String>) {
    let _agg_xcats_for_cid = agg_xcats_for_cid(cids.clone(), xcats.clone());
    let (agg_on, agg_targ) = if _agg_xcats_for_cid {
        (cids.clone(), xcats.clone())
    } else {
        (xcats.clone(), cids.clone())
    };
    // assert that if agg_xcats_for_cid is true, agg_on = cids
    match _agg_xcats_for_cid {
        true => {
            assert_eq!(agg_on, cids);
            assert_eq!(agg_targ, xcats);
        }
        false => {
            assert_eq!(agg_on, xcats);
            assert_eq!(agg_targ, cids);
        }
    }
    (agg_on, agg_targ)
}

/// Get the mapping of aggregation targets for the implied mode of aggregation.
/// # Returns
/// * `HashMap<String, Vec<String>>` - A mapping of cid/xcat to the list of tickers to be aggregated.
fn get_mul_targets(
    cids: Vec<String>,
    xcats: Vec<String>,
) -> Result<HashMap<String, Vec<String>>, Box<dyn std::error::Error>> {
    let _agg_xcats_for_cid = agg_xcats_for_cid(cids.clone(), xcats.clone());
    let mut mul_targets = HashMap::new();

    let (agg_on, agg_targ) = get_agg_on_agg_targs(cids.clone(), xcats.clone());

    for agg_o in agg_on {
        let mut targets = Vec::new();
        for agg_t in &agg_targ {
            let ticker = match _agg_xcats_for_cid {
                true => format!("{}_{}", agg_o, agg_t),
                false => format!("{}_{}", agg_t, agg_o),
            };
            targets.push(ticker);
        }
        if !targets.is_empty() {
            mul_targets.insert(agg_o.clone(), targets);
        }
    }
    // check if mul_targets is empty
    assert!(
        !mul_targets.is_empty(),
        "The mul_targets is empty. Please check the input DataFrame."
    );
    Ok(mul_targets)
}

/// Builds a map of the shape:
/// `HashMap<String, HashMap<String, (WeightValue, f64)>>`
/// where only one of `weights` or `weight_xcats` can be provided.
/// If neither is provided, weights default to 1.0.
/// Each tuple is `(WeightValue, f64) = (weight, sign)`.
fn form_weights_and_signs_map(
    cids: Vec<String>,
    xcats: Vec<String>,
    weights: Option<Vec<f64>>,
    weight_xcat: Option<String>,
    signs: Option<Vec<f64>>,
) -> Result<HashMap<String, HashMap<String, (WeightValue, f64)>>, Box<dyn std::error::Error>> {
    // For demonstration, we pretend to load or infer these from helpers:
    let agg_xcats_for_cid = agg_xcats_for_cid(cids.clone(), xcats.clone());
    let (agg_on, agg_targ) = get_agg_on_agg_targs(cids.clone(), xcats.clone());

    // Determine if each weight option has non-empty values.
    let weights_provided = weights.as_ref().map_or(false, |v| !v.is_empty());
    let weight_xcats_provided = weight_xcat.as_ref().map_or(false, |v| !v.is_empty());

    // Enforce that only one of weights or weight_xcats is specified.
    if weights_provided && weight_xcats_provided {
        return Err("Only one of `weights` and `weight_xcats` may be specified.".into());
    }

    // 1) Build the "actual_weights" vector as WeightValue.
    let actual_weights: Vec<WeightValue> = if weights_provided {
        weights.unwrap().into_iter().map(WeightValue::F64).collect()
    } else if weight_xcats_provided {
        vec![WeightValue::Str(weight_xcat.unwrap()); agg_targ.len()]
    } else {
        // Default to numeric 1.0 if neither is provided
        vec![WeightValue::F64(1.0); agg_targ.len()]
    };

    // 2) Build the "signs" vector; default to 1.0 if not provided
    let signs = signs.unwrap_or_else(|| vec![1.0; agg_targ.len()]);

    // 3) Optional: check lengths & zero values (only numeric weights).
    check_weights_signs_lengths(&actual_weights, &signs, agg_xcats_for_cid, agg_targ.len())?;

    // 4) Build the final nested HashMap
    let mut weights_map: HashMap<String, HashMap<String, (WeightValue, f64)>> = HashMap::new();

    for agg_o in agg_on {
        let mut agg_t_map = HashMap::new();
        for (i, agg_t) in agg_targ.iter().enumerate() {
            // Format the ticker
            let ticker = if agg_xcats_for_cid {
                format!("{}_{}", agg_o, agg_t)
            } else {
                format!("{}_{}", agg_t, agg_o)
            };
            // Build the tuple (WeightValue, f64)
            let weight_sign_tuple = match &actual_weights[i] {
                WeightValue::F64(val) => (WeightValue::F64(*val).clone(), signs[i]),
                WeightValue::Str(vstr) => {
                    let new_str = format!("{}_{}", agg_t, vstr);
                    (WeightValue::Str(new_str), signs[i])
                }
            };
            agg_t_map.insert(ticker, weight_sign_tuple);
        }
        weights_map.insert(agg_o.clone(), agg_t_map);
    }

    Ok(weights_map)
}
/// Checks that the given slices have the expected length and that:
/// - numeric weights are non-zero,
/// - signs are non-zero.
fn check_weights_signs_lengths(
    weights_vec: &[WeightValue],
    signs_vec: &[f64],
    agg_xcats_for_cid: bool,
    agg_targ_len: usize,
) -> Result<(), Box<dyn std::error::Error>> {
    // For diagnostics, decide what to call the dimension
    let agg_targ = if agg_xcats_for_cid { "xcats" } else { "cids" };

    // 1) Check numeric weights for zeroes.
    for (i, weight) in weights_vec.iter().enumerate() {
        if let WeightValue::F64(val) = weight {
            if *val == 0.0 {
                return Err(format!("The weight at index {} is 0.0", i).into());
            }
        }
    }
    // 2) Ensure the weights vector is the expected length.
    if weights_vec.len() != agg_targ_len {
        return Err(format!(
            "The length of weights ({}) does not match the length of {} ({})",
            weights_vec.len(),
            agg_targ,
            agg_targ_len
        )
        .into());
    }

    // 3) Check signs for zero.
    for (i, sign) in signs_vec.iter().enumerate() {
        if *sign == 0.0 {
            return Err(format!("The sign at index {} is 0.0", i).into());
        }
    }
    // 4) Ensure the signs vector is the expected length.
    if signs_vec.len() != agg_targ_len {
        return Err(format!(
            "The length of signs ({}) does not match the length of {} ({})",
            signs_vec.len(),
            agg_targ,
            agg_targ_len
        )
        .into());
    }

    Ok(())
}
fn rename_result_dfw_cols(
    xcats: Vec<String>,
    cids: Vec<String>,
    new_xcat: String,
    new_cid: String,
    dfw: &mut DataFrame,
) -> Result<(), Box<dyn std::error::Error>> {
    let dfw_cols: Vec<String> = dfw
        .get_column_names()
        .iter()
        .filter(|&s| *s != "real_date") // Exclude 'real_date'
        .map(|s| s.to_string())
        .collect();
    let _agg_xcats_for_cid = agg_xcats_for_cid(cids.clone(), xcats.clone());
    Ok(for col in dfw_cols {
        let new_name = match _agg_xcats_for_cid {
            true => format!("{}_{}", col, new_xcat),
            false => format!("{}_{}", new_cid, col),
        };
        // rename the column
        dfw.rename(&col, new_name.into())?;
    })
}

/// Flags if the xcats are aggregated for a given cid.
/// If true, the xcats are aggregated for each cid, creating a new xcat.
/// If false, the cids are aggregated for each xcat, creating a new cid.
#[allow(unused_variables)]
fn agg_xcats_for_cid(cids: Vec<String>, xcats: Vec<String>) -> bool {
    // if there is more than 1 xcat, return xcats.len() > 1
    xcats.len() > 1
}

/// Represents a weight value that can be a string, (float, or integer).
#[derive(Debug, Clone, PartialEq)]
pub enum WeightValue {
    Str(String),
    F64(f64),
}
impl From<String> for WeightValue {
    fn from(s: String) -> Self {
        WeightValue::Str(s)
    }
}

impl<'a> From<&'a str> for WeightValue {
    fn from(s: &'a str) -> Self {
        WeightValue::Str(s.to_string())
    }
}

impl From<f64> for WeightValue {
    fn from(f: f64) -> Self {
        WeightValue::F64(f)
    }
}

impl From<i32> for WeightValue {
    fn from(i: i32) -> Self {
        WeightValue::F64(i as f64)
    }
}

/// Weighted linear combinations of cross sections or categories
/// # Arguments
/// * `df` - QDF DataFrame
/// * `xcats` - List of category names or a single category name
/// * `cids` - List of cross section names or None
/// * `weights` - List of weights or a string indicating a weight `xcat`
/// * `normalize_weights` - Normalize weights to sum to 1 before applying
/// * `signs` - List of signs for each category (+1 or -1)
/// * `start` - Start date for the analysis
/// * `end` - End date for the analysis
/// * `blacklist` - Dictionary of blacklisted categories
/// * `complete_xcats` - If True, complete xcats with missing values
/// * `complete_cids` - If True, complete cids with missing values
/// * `new_xcat` - Name of the new xcat
/// * `new_cid` - Name of the new cid
///
/// # Returns
/// * `DataFrame` - DataFrame with the linear composite
pub fn linear_composite(
    df: &DataFrame,
    xcats: Vec<String>,
    cids: Vec<String>,
    weights: Option<Vec<f64>>,
    signs: Option<Vec<f64>>,
    weight_xcat: Option<String>,
    normalize_weights: bool,
    start: Option<String>,
    end: Option<String>,
    blacklist: Option<HashMap<String, Vec<String>>>,
    complete_xcats: bool,
    complete_cids: bool,
    new_xcat: Option<String>,
    new_cid: Option<String>,
) -> Result<DataFrame, Box<dyn std::error::Error>> {
    // Check if the DataFrame is a Quantamental DataFrame
    check_quantamental_dataframe(df)?;

    if agg_xcats_for_cid(cids.clone(), xcats.clone()) {
        if weight_xcat.is_some() {
            return Err(
                format!(
                    "Using xcats as weights is not supported when aggregating cids for a single xcat. {:?} {:?}",
                    cids, xcats
                )
                .into(),
            );
        }
    }

    let mut rxcats = xcats.clone();
    if weight_xcat.is_some() {
        rxcats.extend(vec![weight_xcat.clone().unwrap()]);
    }

    let rdf = reduce_dataframe(
        df.clone(),
        Some(cids.clone()),
        Some(rxcats.clone()),
        Some(vec!["value".to_string()]),
        start.clone(),
        end.clone(),
        false,
    )
    .unwrap();

    let new_xcat = new_xcat.unwrap_or_else(|| "COMPOSITE".to_string());
    let new_cid = new_cid.unwrap_or_else(|| "GLB".to_string());

    let dfw = pivot_dataframe_by_ticker(rdf, Some("value".to_string())).unwrap();

    let mul_targets = get_mul_targets(cids.clone(), xcats.clone())?;
    let weights_map =
        form_weights_and_signs_map(cids.clone(), xcats.clone(), weights, weight_xcat, signs)?;

    for (ticker, targets) in mul_targets.iter() {
        println!("ticker: {}, targets: {:?}", ticker, targets);
    }
    for (agg_on, agg_t_map) in weights_map.iter() {
        println!("agg_on: {}, agg_t_map: {:?}", agg_on, agg_t_map);
    }
    // assert!(0==1, "Debugging weights_map: {:?}", weights_map);
    // Perform the multiplication
    let complete = complete_xcats || complete_cids;
    let mut dfw = perform_multiplication(
        &dfw,
        &mul_targets,
        &weights_map,
        complete,
        normalize_weights,
    )?;

    rename_result_dfw_cols(xcats, cids, new_xcat, new_cid, &mut dfw)?;

    let dfw = pivot_wide_dataframe_to_qdf(dfw, Some("value".to_string()))?;
    Ok(dfw)
}