Compare commits

...

9 Commits

Author SHA1 Message Date
Palash Tyagi ed9d5d01a2 update notebook 2025-04-15 19:33:29 +01:00
Palash Tyagi dda6e3e12f wip: blacklist util 2025-04-15 19:31:15 +01:00
Palash Tyagi 5ef2c7e6c7 refactor: simplify type annotations in reduce_dataframe function 2025-04-15 00:00:45 +01:00
Palash Tyagi 5c3862c297 refactor: reorganize module exports for clarity 2025-04-14 00:25:23 +01:00
Palash Tyagi 3559a90ad2 adding blacklist ability 2025-04-14 00:25:09 +01:00
Palash Tyagi b7368a366e add weight_xcat capability 2025-04-14 00:24:26 +01:00
Palash Tyagi 24a4176e17 fix: correct import statement for reduce_dataframe in historic_vol.rs 2025-04-14 00:24:02 +01:00
Palash Tyagi 165e1c19e4 updating linear composite binding 2025-04-14 00:22:59 +01:00
Palash Tyagi fefe849394 updating notebook 2025-04-14 00:22:38 +01:00
10 changed files with 2863 additions and 183 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -68,7 +68,7 @@ pub fn get_period_indices_hv(dfw: PyDataFrame, est_freq: &str) -> PyResult<Vec<u
cids,
weights = None,
signs = None,
weight_xcats = None,
weight_xcat = None,
normalize_weights = false,
start = None,
end = None,
@@ -84,7 +84,7 @@ pub fn linear_composite(
cids: Vec<String>,
weights: Option<Vec<f64>>,
signs: Option<Vec<f64>>,
weight_xcats: Option<Vec<String>>,
weight_xcat: Option<String>,
normalize_weights: bool,
start: Option<String>,
end: Option<String>,
@@ -101,7 +101,7 @@ pub fn linear_composite(
cids,
weights,
signs,
weight_xcats,
weight_xcat,
normalize_weights,
start,
end,
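
The hunk above renames the plural `weight_xcats: Option<Vec<String>>` parameter to a single optional `weight_xcat: Option<String>`. A minimal sketch of the new call shape through the Python binding, assuming the extension module imports as `msyrs` (name hypothetical) and that `df` and `xcats` are the leading arguments before the `cids` visible in the signature above:

# weight_xcat now takes a single category name rather than a list
composite = msyrs.linear_composite(
    df,
    xcats=["XR"],
    cids=["USD", "EUR"],
    weight_xcat="WGT",          # was weight_xcats=["WGT"]
    normalize_weights=False,
)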

View File

@@ -1,4 +1,5 @@
use pyo3::prelude::*;
use pyo3::types::IntoPyDict;
use pyo3::{prelude::*, types::PyDict};
use pyo3_polars::{PyDataFrame, PySeries};
/// Python wrapper for [`crate::utils::qdf`] module.
@@ -7,6 +8,7 @@ use pyo3_polars::{PyDataFrame, PySeries};
pub fn utils(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_function(wrap_pyfunction!(get_bdates_series_default_pl, m)?)?;
m.add_function(wrap_pyfunction!(get_bdates_series_default_opt, m)?)?;
m.add_function(wrap_pyfunction!(create_blacklist_from_qdf, m)?)?;
Ok(())
}
@@ -33,3 +35,19 @@ pub fn get_bdates_series_default_opt(
.map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(format!("{}", e)))?,
))
}
#[allow(deprecated)]
#[pyfunction(signature = (df, metric = None))]
pub fn create_blacklist_from_qdf(df: PyDataFrame, metric: Option<String>) -> PyResult<PyObject> {
let result = crate::utils::qdf::blacklist::create_blacklist_from_qdf(&df.into(), metric)
.map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(format!("{}", e)))?;
Python::with_gil(|py| {
let dict = PyDict::new(py);
// for (key, (start_date, end_date)) in result {
// dict.set_item(key, (start_date, end_date))
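// Current return shape: ticker -> Vec<"%Y-%m-%d"> strings; the commented
// lines above reflect an earlier (start_date, end_date) range design.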
for (key, dates) in result {
dict.set_item(key, dates).map_err(|e| PyErr::from(e))?;
}
Ok(dict.into())
})
}

View File

@@ -56,3 +56,5 @@ class utils:
def get_bdates_series_default_pl(*args, **kwargs) -> Series: ...
@staticmethod
def get_bdates_series_default_opt(*args, **kwargs) -> Series: ...
@staticmethod
def create_blacklist_from_qdf(*args, **kwargs) -> dict: ...
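
The stub now advertises the new utility. A hedged usage sketch, again assuming the extension module imports as `msyrs` (name hypothetical) with the `utils` class shown above:

# Maps "CID_XCAT" tickers to the "YYYY-MM-DD" dates where `metric` is null/NaN
blk = msyrs.utils.create_blacklist_from_qdf(df, metric="value")
for ticker, dates in blk.items():
    print(ticker, dates[0], dates[-1])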

View File

@@ -1,6 +1,6 @@
use crate::utils::dateutils::{get_bdates_from_col, get_min_max_real_dates};
use crate::utils::qdf::pivots::*;
use crate::utils::qdf::reduce_df::*;
use crate::utils::qdf::reduce_dataframe;
use chrono::NaiveDate;
use ndarray::{s, Array, Array1, Zip};
use polars::prelude::*;

View File

@@ -1,6 +1,6 @@
use crate::utils::qdf::check_quantamental_dataframe;
use crate::utils::qdf::pivots::*;
use crate::utils::qdf::reduce_df::*;
use crate::utils::qdf::pivots::{pivot_dataframe_by_ticker, pivot_wide_dataframe_to_qdf};
use crate::utils::qdf::reduce_df::reduce_dataframe;
use polars::prelude::*;
use std::collections::HashMap;
const TOLERANCE: f64 = 1e-8;
@@ -108,14 +108,42 @@ fn _form_agg_nan_mask_series(nan_mask_dfw: &DataFrame) -> Result<Series, PolarsE
Ok(combined.into_series())
}
/// Form the weights DataFrame
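/// For each aggregation target, a numeric `WeightValue::F64` weight is
/// broadcast as a constant column scaled by its sign, while a
/// `WeightValue::Str` weight looks up the named column in `dfw` and
/// multiplies it by the sign.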
fn _form_agg_weights_dfw(
agg_weights_map: &HashMap<String, Vec<f64>>,
data_dfw: DataFrame,
agg_weights_map: &HashMap<String, (WeightValue, f64)>,
dfw: &DataFrame,
) -> Result<DataFrame, PolarsError> {
let mut weights_dfw = DataFrame::new(vec![])?;
for (agg_targ, weight_signs) in agg_weights_map.iter() {
let wgt = weight_signs[0] * weight_signs[1];
let wgt_series = Series::new(agg_targ.into(), vec![wgt; data_dfw.height()]);
// let wgt = weight_signs[0] * weight_signs[1];
let wgt_series = match &weight_signs.0 {
WeightValue::F64(val) => {
let wgt = val * weight_signs.1;
Series::new(agg_targ.into(), vec![wgt; dfw.height()])
}
WeightValue::Str(vstr) => {
// look up the `vstr` column in `dfw`; otherwise raise a weight specification error
if !dfw.get_column_names().contains(&&PlSmallStr::from(vstr)) {
return Err(PolarsError::ComputeError(
format!(
"The column {} does not exist in the DataFrame. {:?}",
vstr, agg_weights_map
)
.into(),
));
}
let vstr_series = dfw.column(vstr)?;
let multiplied_series = vstr_series * weight_signs.1;
let mut multiplied_series =
multiplied_series.as_series().cloned().ok_or_else(|| {
PolarsError::ComputeError(
"Failed to convert multiplied_series to Series".into(),
)
})?;
multiplied_series.rename(agg_targ.into());
multiplied_series
}
};
weights_dfw.with_column(wgt_series)?;
}
Ok(weights_dfw)
@@ -143,14 +171,14 @@ fn perform_single_group_agg(
dfw: &DataFrame,
agg_on: &String,
agg_targs: &Vec<String>,
agg_weights_map: &HashMap<String, Vec<f64>>,
agg_weights_map: &HashMap<String, (WeightValue, f64)>,
normalize_weights: bool,
complete: bool,
) -> Result<Column, PolarsError> {
let data_dfw = _form_agg_data_dfw(dfw, agg_targs)?;
let nan_mask_dfw = _form_agg_nan_mask_dfw(&data_dfw)?;
let nan_mask_series = _form_agg_nan_mask_series(&nan_mask_dfw)?;
let weights_dfw = _form_agg_weights_dfw(agg_weights_map, data_dfw.clone())?;
let weights_dfw = _form_agg_weights_dfw(agg_weights_map, dfw)?;
let weights_dfw = match normalize_weights {
true => normalize_weights_with_nan_mask(weights_dfw, nan_mask_dfw)?,
false => weights_dfw,
@@ -192,7 +220,7 @@ fn perform_single_group_agg(
fn perform_multiplication(
dfw: &DataFrame,
mult_targets: &HashMap<String, Vec<String>>,
weights_map: &HashMap<String, HashMap<String, Vec<f64>>>,
weights_map: &HashMap<String, HashMap<String, (WeightValue, f64)>>,
complete: bool,
normalize_weights: bool,
) -> Result<DataFrame, PolarsError> {
@@ -200,6 +228,7 @@ fn perform_multiplication(
// let mut new_dfw = DataFrame::new(vec![real_date])?;
let mut new_dfw = DataFrame::new(vec![])?;
assert!(!mult_targets.is_empty(), "mult_targets is empty");
for (agg_on, agg_targs) in mult_targets.iter() {
// perform_single_group_agg
let cols_len = new_dfw.get_column_names().len();
@@ -288,76 +317,122 @@ fn get_mul_targets(
Ok(mul_targets)
}
/// Builds a map of the shape:
/// `HashMap<String, HashMap<String, (WeightValue, f64)>>`
/// where only one of `weights` or `weight_xcat` can be provided.
/// If neither is provided, weights default to 1.0.
/// Each tuple is `(WeightValue, f64) = (weight, sign)`.
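/// For example (illustrative, and assuming `get_agg_on_agg_targs` yields
/// `agg_on = ["XR"]`, `agg_targ = ["USD", "EUR"]`): with
/// `weight_xcat = Some("WGT")` and no signs, the result maps
/// `"XR" -> { "USD_XR": (Str("USD_WGT"), 1.0), "EUR_XR": (Str("EUR_WGT"), 1.0) }`.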
fn form_weights_and_signs_map(
cids: Vec<String>,
xcats: Vec<String>,
weights: Option<Vec<f64>>,
weight_xcat: Option<String>,
signs: Option<Vec<f64>>,
) -> Result<HashMap<String, HashMap<String, Vec<f64>>>, Box<dyn std::error::Error>> {
let _agg_xcats_for_cid = agg_xcats_for_cid(cids.clone(), xcats.clone());
) -> Result<HashMap<String, HashMap<String, (WeightValue, f64)>>, Box<dyn std::error::Error>> {
// Determine the aggregation orientation and targets via helpers:
let agg_xcats_for_cid = agg_xcats_for_cid(cids.clone(), xcats.clone());
let (agg_on, agg_targ) = get_agg_on_agg_targs(cids.clone(), xcats.clone());
// if weights are None, create a vector of 1s of the same length as agg_targ
let weights = weights.unwrap_or(vec![1.0 / agg_targ.len() as f64; agg_targ.len()]);
let signs = signs.unwrap_or(vec![1.0; agg_targ.len()]);
// Determine if each weight option has non-empty values.
let weights_provided = weights.as_ref().map_or(false, |v| !v.is_empty());
let weight_xcats_provided = weight_xcat.as_ref().map_or(false, |v| !v.is_empty());
// check that the lengths of weights and signs match the length of agg_targ
check_weights_signs_lengths(
weights.clone(),
signs.clone(),
_agg_xcats_for_cid,
agg_targ.len(),
)?;
// Enforce that only one of weights or weight_xcats is specified.
if weights_provided && weight_xcats_provided {
return Err("Only one of `weights` and `weight_xcats` may be specified.".into());
}
let mut weights_map = HashMap::new();
// 1) Build the "actual_weights" vector as WeightValue.
let actual_weights: Vec<WeightValue> = if weights_provided {
weights.unwrap().into_iter().map(WeightValue::F64).collect()
} else if weight_xcats_provided {
vec![WeightValue::Str(weight_xcat.unwrap()); agg_targ.len()]
} else {
// Default to numeric 1.0 if neither is provided
vec![WeightValue::F64(1.0); agg_targ.len()]
};
// 2) Build the "signs" vector; default to 1.0 if not provided
let signs = signs.unwrap_or_else(|| vec![1.0; agg_targ.len()]);
// 3) Check lengths and reject zero values (numeric weights and signs).
check_weights_signs_lengths(&actual_weights, &signs, agg_xcats_for_cid, agg_targ.len())?;
// 4) Build the final nested HashMap
let mut weights_map: HashMap<String, HashMap<String, (WeightValue, f64)>> = HashMap::new();
for agg_o in agg_on {
let mut agg_t_map = HashMap::new();
for (i, agg_t) in agg_targ.iter().enumerate() {
let ticker = match _agg_xcats_for_cid {
true => format!("{}_{}", agg_o, agg_t),
false => format!("{}_{}", agg_t, agg_o),
// Format the ticker
let ticker = if agg_xcats_for_cid {
format!("{}_{}", agg_o, agg_t)
} else {
format!("{}_{}", agg_t, agg_o)
};
let weight_signs = vec![weights[i], signs[i]];
agg_t_map.insert(ticker, weight_signs);
// Build the tuple (WeightValue, f64)
let weight_sign_tuple = match &actual_weights[i] {
WeightValue::F64(val) => (WeightValue::F64(*val), signs[i]),
WeightValue::Str(vstr) => {
let new_str = format!("{}_{}", agg_t, vstr);
(WeightValue::Str(new_str), signs[i])
}
};
agg_t_map.insert(ticker, weight_sign_tuple);
}
weights_map.insert(agg_o.clone(), agg_t_map);
}
Ok(weights_map)
}
/// Checks that the given slices have the expected length and that:
/// - numeric weights are non-zero,
/// - signs are non-zero.
fn check_weights_signs_lengths(
weights_vec: Vec<f64>,
signs_vec: Vec<f64>,
_agg_xcats_for_cid: bool,
weights_vec: &[WeightValue],
signs_vec: &[f64],
agg_xcats_for_cid: bool,
agg_targ_len: usize,
) -> Result<(), Box<dyn std::error::Error>> {
// for vx, vname in ...
let agg_targ = match _agg_xcats_for_cid {
true => "xcats",
false => "cids",
};
for (vx, vname) in vec![
(weights_vec.clone(), "weights"),
(signs_vec.clone(), "signs"),
] {
for (i, v) in vx.iter().enumerate() {
if *v == 0.0 {
return Err(format!("The {} at index {} is 0.0", vname, i).into());
// For diagnostics, decide what to call the dimension
let agg_targ = if agg_xcats_for_cid { "xcats" } else { "cids" };
// 1) Check numeric weights for zeroes.
for (i, weight) in weights_vec.iter().enumerate() {
if let WeightValue::F64(val) = weight {
if *val == 0.0 {
return Err(format!("The weight at index {} is 0.0", i).into());
}
}
if vx.len() != agg_targ_len {
}
// 2) Ensure the weights vector is the expected length.
if weights_vec.len() != agg_targ_len {
return Err(format!(
"The length of {} ({}) does not match the length of {} ({})",
vname,
vx.len(),
"The length of weights ({}) does not match the length of {} ({})",
weights_vec.len(),
agg_targ,
agg_targ_len
)
.into());
}
// 3) Check signs for zero.
for (i, sign) in signs_vec.iter().enumerate() {
if *sign == 0.0 {
return Err(format!("The sign at index {} is 0.0", i).into());
}
}
// 4) Ensure the signs vector is the expected length.
if signs_vec.len() != agg_targ_len {
return Err(format!(
"The length of signs ({}) does not match the length of {} ({})",
signs_vec.len(),
agg_targ,
agg_targ_len
)
.into());
}
Ok(())
}
fn rename_result_dfw_cols(
@@ -393,6 +468,36 @@ fn agg_xcats_for_cid(cids: Vec<String>, xcats: Vec<String>) -> bool {
xcats.len() > 1
}
/// Represents a weight value that can be a string, a float, or an integer.
#[derive(Debug, Clone, PartialEq)]
pub enum WeightValue {
Str(String),
F64(f64),
}
impl From<String> for WeightValue {
fn from(s: String) -> Self {
WeightValue::Str(s)
}
}
impl<'a> From<&'a str> for WeightValue {
fn from(s: &'a str) -> Self {
WeightValue::Str(s.to_string())
}
}
impl From<f64> for WeightValue {
fn from(f: f64) -> Self {
WeightValue::F64(f)
}
}
impl From<i32> for WeightValue {
fn from(i: i32) -> Self {
WeightValue::F64(i as f64)
}
}
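// Illustrative conversions: `WeightValue::from(0.5)` yields `F64(0.5)`,
// `WeightValue::from("WGT")` yields `Str("WGT")`, and
// `WeightValue::from(2_i32)` widens to `F64(2.0)`.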
/// Weighted linear combinations of cross sections or categories
/// # Arguments
/// * `df` - QDF DataFrame
@@ -417,7 +522,7 @@ pub fn linear_composite(
cids: Vec<String>,
weights: Option<Vec<f64>>,
signs: Option<Vec<f64>>,
weight_xcats: Option<Vec<String>>,
weight_xcat: Option<String>,
normalize_weights: bool,
start: Option<String>,
end: Option<String>,
@@ -429,10 +534,28 @@ pub fn linear_composite(
) -> Result<DataFrame, Box<dyn std::error::Error>> {
// Check if the DataFrame is a Quantamental DataFrame
check_quantamental_dataframe(df)?;
if agg_xcats_for_cid(cids.clone(), xcats.clone()) {
if weight_xcat.is_some() {
return Err(
format!(
"Using xcats as weights is not supported when aggregating cids for a single xcat. {:?} {:?}",
cids, xcats
)
.into(),
);
}
}
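// Include the weight category in the reduce step so its series survive
// into the pivoted wide frame used for weighting.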
let mut rxcats = xcats.clone();
if weight_xcat.is_some() {
rxcats.extend(vec![weight_xcat.clone().unwrap()]);
}
let rdf = reduce_dataframe(
df.clone(),
Some(cids.clone()),
Some(xcats.clone()),
Some(rxcats.clone()),
Some(vec!["value".to_string()]),
start.clone(),
end.clone(),
@@ -443,10 +566,11 @@ pub fn linear_composite(
let new_xcat = new_xcat.unwrap_or_else(|| "COMPOSITE".to_string());
let new_cid = new_cid.unwrap_or_else(|| "GLB".to_string());
let dfw = pivot_dataframe_by_ticker(rdf.clone(), Some("value".to_string())).unwrap();
let dfw = pivot_dataframe_by_ticker(rdf, Some("value".to_string())).unwrap();
let mul_targets = get_mul_targets(cids.clone(), xcats.clone())?;
let weights_map = form_weights_and_signs_map(cids.clone(), xcats.clone(), weights, signs)?;
let weights_map =
form_weights_and_signs_map(cids.clone(), xcats.clone(), weights, weight_xcat, signs)?;
for (ticker, targets) in mul_targets.iter() {
println!("ticker: {}, targets: {:?}", ticker, targets);

src/utils/qdf/blacklist.rs (new file, 161 lines)
View File

@@ -0,0 +1,161 @@
use crate::utils::bdates::{self, get_bdates_list_with_freq, BDateFreq};
use crate::utils::dateutils::{get_bdates_series_default_opt, get_min_max_real_dates};
use crate::utils::qdf::core::check_quantamental_dataframe;
use chrono::NaiveDate;
use polars::prelude::*;
use std::collections::HashMap;
use std::error::Error;
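/// Builds a per-ticker blacklist: maps each "CID_XCAT" ticker to the sorted
/// list of business dates (as "%Y-%m-%d" strings) on which `metric` is null
/// or NaN.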
pub fn create_blacklist_from_qdf(
df: &DataFrame,
metric: Option<String>,
// ) -> Result<HashMap<String, (String, String)>, Box<dyn Error>> {
) -> Result<HashMap<String, Vec<String>>, Box<dyn Error>> {
// Verify that the DataFrame follows the Quantamental structure.
check_quantamental_dataframe(df)?;
let mut blacklist: HashMap<String, Vec<String>> = HashMap::new();
// let mut blacklist: HashMap<String, (String, String)> = HashMap::new();
let mut blk: HashMap<String, Vec<NaiveDate>> = HashMap::new();
// Use the provided metric or default to "value".
let metric = metric.unwrap_or_else(|| "value".into());
let (min_date, max_date) = get_min_max_real_dates(df, "real_date".into())?;
let min_date_str = min_date.format("%Y-%m-%d").to_string();
let max_date_str = max_date.format("%Y-%m-%d").to_string();
// let all_bdates = get_bdates_series_default_opt(min_date_str, max_date_str, None)?;
let all_bdates = get_bdates_list_with_freq(
min_date_str.clone().as_str(),
max_date_str.clone().as_str(),
BDateFreq::Daily,
)?;
// filter df
let null_mask = df.column(metric.as_str())?.is_null();
let nan_mask = df.column(metric.as_str())?.is_nan()?;
let null_mask = null_mask | nan_mask;
let df = df.filter(&null_mask)?;
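// Note: the lazy filter below repeats this null/NaN predicate; the eager
// filter here already restricts the frame to the offending rows.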
let df = df
.lazy()
.filter(
col(metric.as_str())
.is_null()
.or(col(metric.as_str()).is_nan()),
)
.sort(
["cid", "xcat"],
SortMultipleOptions::default().with_maintain_order(true),
)
.group_by([col("cid"), col("xcat")])
// .agg([col("real_date").sort(SortOptions::default())])
.agg([col("real_date")
.dt()
.strftime("%Y-%m-%d")
.sort(SortOptions::default())])
.select([
concat_str([col("cid"), col("xcat")], "_", true).alias("ticker"),
col("real_date").alias("real_dates"),
])
.collect()?;
// assert!(0 == 1, "{:?}", df);
let ticker_vec = df
.column("ticker")?
.str()?
.into_iter()
.filter_map(|opt| opt.map(|s| s.to_string()))
.collect::<Vec<String>>();
let rdt = get_vec_of_vec_of_dates_from_df(df)?;
// assert!(0 == 1, "{:?}", rdt);
for (tkr, dates) in ticker_vec.iter().zip(rdt.iter()) {
blacklist.insert(tkr.to_string(), dates.clone());
}
Ok(blacklist)
}
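// WIP (see the "wip: blacklist util" commit): presumably intended to collapse
// each ticker's sorted date list into contiguous (start, end) business-date
// ranges via `all_bdates`; the body below is still a stub.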
fn convert_dates_list_to_date_ranges(
blacklist: HashMap<String, Vec<String>>,
all_bdates: Vec<NaiveDate>,
) -> HashMap<String, (String, String)> {
let blk = HashMap::new();
let bdates = all_bdates
.iter()
.map(|date| date.format("%Y-%m-%d").to_string())
.collect::<Vec<String>>();
//
blk
}
fn get_vec_of_vec_of_dates_from_df(df: DataFrame) -> Result<Vec<Vec<String>>, Box<dyn Error>> {
let rdt = df
.column("real_dates")?
// .clone()
.as_series()
.unwrap()
.list()?
.into_iter()
.filter_map(|opt| opt)
.collect::<Vec<Series>>()
.iter()
.map(|s| {
s.str()
.unwrap()
.into_iter()
.filter_map(|opt| opt.map(|s| s.to_string()))
.collect::<Vec<String>>()
})
.collect::<Vec<Vec<String>>>();
Ok(rdt)
}
fn get_vec_of_vec_of_naivedates_from_df(
df: DataFrame,
) -> Result<Vec<Vec<NaiveDate>>, Box<dyn Error>> {
let rdt = df
.column("real_dates")?
// .clone()
.as_series()
.unwrap()
.list()?
.into_iter()
.filter_map(|opt| opt)
.collect::<Vec<Series>>()
.iter()
.map(|s| {
s.date()
.unwrap()
.into_iter()
// polars Date is days since the Unix epoch, while chrono counts from
// 0001-01-01, so offset by the epoch's day number (719_163).
.filter_map(|opt| {
opt.and_then(|date| NaiveDate::from_num_days_from_ce_opt(date + 719_163))
})
.collect::<Vec<NaiveDate>>()
})
.collect::<Vec<Vec<NaiveDate>>>();
Ok(rdt)
}

View File

@@ -1,11 +1,12 @@
pub mod blacklist;
pub mod core;
pub mod update_df;
pub mod load;
pub mod reduce_df;
pub mod pivots;
pub mod reduce_df;
pub mod update_df;
// Re-export submodules for easier access
pub use core::*;
pub use update_df::*;
pub use load::*;
pub use reduce_df::*;
pub use update_df::*;

View File

@@ -30,12 +30,12 @@ pub fn reduce_dataframe(
let df_size = df.shape();
let mut new_df = df.clone();
let ticker_col: Column = get_ticker_column_for_quantamental_dataframe(&new_df)?;
let ticker_col = get_ticker_column_for_quantamental_dataframe(&new_df)?;
// if cids is not provided, get all unique cids
let u_cids: Vec<String> = get_unique_cids(&new_df)?;
let u_xcats: Vec<String> = get_unique_xcats(&new_df)?;
let u_tickers: Vec<String> = _get_unique_strs_from_str_column_object(&ticker_col)?;
let u_cids = get_unique_cids(&new_df)?;
let u_xcats = get_unique_xcats(&new_df)?;
let u_tickers = _get_unique_strs_from_str_column_object(&ticker_col)?;
let cids_vec = cids.unwrap_or_else(|| u_cids.clone());
let specified_cids: Vec<&str> = cids_vec.iter().map(AsRef::as_ref).collect();