Compare commits

...

18 Commits

SHA1        Date                        Author        Message
ee531deb7a  2025-04-16 19:27:46 +01:00  Palash Tyagi  update create_blacklist_from_qdf function signature to accept metrics as a vector
0443906cc9  2025-04-16 19:27:35 +01:00  Palash Tyagi  feat: refactor create_blacklist_from_qdf to accept multiple metrics and improve null handling
f0a9242d10  2025-04-16 01:37:52 +01:00  Palash Tyagi  reran notebook
df22667d63  2025-04-16 01:37:24 +01:00  Palash Tyagi  updated notebook
7d4c198067  2025-04-16 01:36:56 +01:00  Palash Tyagi  feat: update create_blacklist_from_qdf function signature to include group_by_cid and blacklist_name parameters
25192e425d  2025-04-16 01:36:48 +01:00  Palash Tyagi  completed wdf.columns implementation
e602b8b2b4  2025-04-16 01:35:59 +01:00  Palash Tyagi  fix: change code block syntax to ignore in documentation examples
5d2ff3b88d  2025-04-16 01:35:48 +01:00  Palash Tyagi  fix: improve error messages in check_quantamental_dataframe function
3d2afa01a8  2025-04-15 21:28:09 +01:00  Palash Tyagi  feat: add build and install scripts for maturin integration
ed9d5d01a2  2025-04-15 19:33:29 +01:00  Palash Tyagi  update notebook
dda6e3e12f  2025-04-15 19:31:15 +01:00  Palash Tyagi  wip: blacklist util
5ef2c7e6c7  2025-04-15 00:00:45 +01:00  Palash Tyagi  refactor: simplify type annotations in reduce_dataframe function
5c3862c297  2025-04-14 00:25:23 +01:00  Palash Tyagi  refactor: reorganize module exports for clarity
3559a90ad2  2025-04-14 00:25:09 +01:00  Palash Tyagi  adding blacklist ability
b7368a366e  2025-04-14 00:24:26 +01:00  Palash Tyagi  add weight_xcat capability
24a4176e17  2025-04-14 00:24:02 +01:00  Palash Tyagi  fix: correct import statement for reduce_dataframe in historic_vol.rs
165e1c19e4  2025-04-14 00:22:59 +01:00  Palash Tyagi  updating linear composite binding
fefe849394  2025-04-14 00:22:38 +01:00  Palash Tyagi  updating notebook
14 changed files with 960 additions and 214 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

scripts/unix/build.sh (new file, 33 lines)

@@ -0,0 +1,33 @@
#!/bin/bash
# Exit immediately if a command exits with a non-zero status
set -e
# Run "maturin --help". If it fails, print an error message and exit.
if ! maturin --help > /dev/null 2>&1; then
echo "Failed to run maturin --help" >&2
exit 1
fi
# Delete any existing build directory and create a new one.
rm -rf ./build
mkdir -p ./build
# Copy ./src/msyrs.pyi to ./msyrs.pyi.
cp ./src/msyrs.pyi ./msyrs.pyi
# Build using maturin.
maturin build --release --sdist --out ./build/
# Get the first wheel file found in the build directory.
whl_file=$(ls ./build/*.whl 2>/dev/null | head -n 1)
if [ -z "$whl_file" ]; then
echo "No wheel file found in ./build" >&2
exit 1
fi
# Rename the wheel file from .whl to .zip.
base_name="${whl_file%.whl}"
mv "$whl_file" "${base_name}.zip"
# Delete the temporary .pyi file.
rm ./msyrs.pyi

scripts/unix/install.sh (new file, 20 lines)

@@ -0,0 +1,20 @@
#!/bin/bash
set -e
# Ensure maturin is installed. For example, you can install it via:
# pip install maturin
# Run "maturin --help". If it fails, print an error message and exit.
if ! maturin --help > /dev/null 2>&1; then
echo "Failed to run maturin --help" >&2
exit 1
fi
# Copy ./src/msyrs.pyi to the current directory as msyrs.pyi
cp ./src/msyrs.pyi ./msyrs.pyi
# Run maturin develop in release mode.
maturin develop --release
# Delete the temporary msyrs.pyi file.
rm ./msyrs.pyi


@@ -68,7 +68,7 @@ pub fn get_period_indices_hv(dfw: PyDataFrame, est_freq: &str) -> PyResult<Vec<u
     cids,
     weights = None,
     signs = None,
-    weight_xcats = None,
+    weight_xcat = None,
     normalize_weights = false,
     start = None,
     end = None,
@@ -84,7 +84,7 @@ pub fn linear_composite(
     cids: Vec<String>,
     weights: Option<Vec<f64>>,
     signs: Option<Vec<f64>>,
-    weight_xcats: Option<Vec<String>>,
+    weight_xcat: Option<String>,
     normalize_weights: bool,
     start: Option<String>,
     end: Option<String>,
@@ -101,7 +101,7 @@ pub fn linear_composite(
     cids,
     weights,
     signs,
-    weight_xcats,
+    weight_xcat,
     normalize_weights,
     start,
     end,


@@ -1,4 +1,4 @@
-use pyo3::prelude::*;
+use pyo3::{prelude::*, types::PyDict};
 use pyo3_polars::{PyDataFrame, PySeries};
 /// Python wrapper for [`crate::utils::qdf`] module.
@@ -7,6 +7,7 @@ use pyo3_polars::{PyDataFrame, PySeries};
 pub fn utils(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_function(wrap_pyfunction!(get_bdates_series_default_pl, m)?)?;
     m.add_function(wrap_pyfunction!(get_bdates_series_default_opt, m)?)?;
+    m.add_function(wrap_pyfunction!(create_blacklist_from_qdf, m)?)?;
     Ok(())
 }
@@ -33,3 +34,29 @@ pub fn get_bdates_series_default_opt(
         .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(format!("{}", e)))?,
     ))
 }
+#[allow(deprecated)]
+#[pyfunction(signature = (df, group_by_cid=None, blacklist_name=None, metrics=None))]
+pub fn create_blacklist_from_qdf(
+    df: PyDataFrame,
+    group_by_cid: Option<bool>,
+    blacklist_name: Option<String>,
+    metrics: Option<Vec<String>>,
+) -> PyResult<PyObject> {
+    let result = crate::utils::qdf::blacklist::create_blacklist_from_qdf(
+        &df.into(),
+        group_by_cid,
+        blacklist_name,
+        metrics,
+    )
+    .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(format!("{}", e)))?;
+    Python::with_gil(|py| {
+        let dict = PyDict::new(py);
+        for (key, dates) in result {
+            // set_item already returns a PyResult, so `?` suffices here.
+            dict.set_item(key, dates)?;
+        }
+        Ok(dict.into())
+    })
+}
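
The wrapper above converts the `BTreeMap` returned by the crate-level function into a Python dict. A minimal sketch of calling the underlying Rust function directly (assuming the `msyrs` crate exposes these modules publicly and `df` is a valid Quantamental DataFrame; the key format follows `blacklist.rs` below):

```rust
use msyrs::utils::qdf::blacklist::create_blacklist_from_qdf;
use polars::prelude::DataFrame;

// Sketch only: `df` must be a valid Quantamental DataFrame
// (real_date: Date, cid: String, xcat: String, plus one or more metric columns).
fn print_blacklist(df: &DataFrame) -> Result<(), Box<dyn std::error::Error>> {
    // Defaults: group_by_cid = true, blacklist_name = "BLACKLIST", metrics = all metric columns.
    let ranges = create_blacklist_from_qdf(df, None, None, None)?;
    for (key, (start, end)) in &ranges {
        // Keys look like "<CID or ticker>_BLACKLIST_<zero-padded range index>".
        println!("{}: {} -> {}", key, start, end);
    }
    Ok(())
}
```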

src/download/jpmaqsdownload.rs

@@ -58,7 +58,7 @@ fn all_jpmaq_expressions(expressions: Vec<String>) -> bool {
 ///
 /// Example Usage:
 ///
-/// ```rust
+/// ```ignore
 /// use msyrs::download::jpmaqsdownload::JPMaQSDownloadGetIndicatorArgs;
 /// use msyrs::download::jpmaqsdownload::JPMaQSDownload;
 ///
@@ -102,7 +102,7 @@ impl Default for JPMaQSDownloadGetIndicatorArgs {
 /// Struct for downloading JPMaQS data from the JPMorgan DataQuery API.
 ///
 /// ## Example Usage
-/// ```rust
+/// ```ignore
 /// use msyrs::download::jpmaqsdownload::JPMaQSDownload;
 /// use msyrs::download::jpmaqsdownload::JPMaQSDownloadGetIndicatorArgs;
 /// use polars::prelude::*;
@@ -277,7 +277,7 @@ impl JPMaQSDownload {
 ///
 /// Usage:
 ///
-/// ```rust
+/// ```ignore
 /// use msyrs::download::jpmaqsdownload::JPMaQSDownload;
 /// use msyrs::download::jpmaqsdownload::JPMaQSDownloadGetIndicatorArgs;
 /// let mut jpamqs_download = JPMaQSDownload::default();

src/msyrs.pyi

@@ -56,3 +56,5 @@ class utils:
     def get_bdates_series_default_pl(*args, **kwargs) -> Series: ...
     @staticmethod
     def get_bdates_series_default_opt(*args, **kwargs) -> Series: ...
+    @staticmethod
+    def create_blacklist_from_qdf(*args, **kwargs) -> dict: ...

historic_vol.rs

@@ -1,6 +1,6 @@
 use crate::utils::dateutils::{get_bdates_from_col, get_min_max_real_dates};
 use crate::utils::qdf::pivots::*;
-use crate::utils::qdf::reduce_df::*;
+use crate::utils::qdf::reduce_dataframe;
 use chrono::NaiveDate;
 use ndarray::{s, Array, Array1, Zip};
 use polars::prelude::*;
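
The shorter path works because `utils/qdf/mod.rs` re-exports its submodules (see its diff further down); a small sketch of the two equivalent imports, assuming the modules are public from the crate root:

```rust
// Both lines name the same function, thanks to `pub use reduce_df::*;` in utils/qdf/mod.rs.
use msyrs::utils::qdf::reduce_dataframe;
// use msyrs::utils::qdf::reduce_df::reduce_dataframe; // equivalent direct module path
```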


@@ -1,6 +1,6 @@
 use crate::utils::qdf::check_quantamental_dataframe;
-use crate::utils::qdf::pivots::*;
-use crate::utils::qdf::reduce_df::*;
+use crate::utils::qdf::pivots::{pivot_dataframe_by_ticker, pivot_wide_dataframe_to_qdf};
+use crate::utils::qdf::reduce_df::reduce_dataframe;
 use polars::prelude::*;
 use std::collections::HashMap;
 const TOLERANCE: f64 = 1e-8;
@@ -108,14 +108,42 @@ fn _form_agg_nan_mask_series(nan_mask_dfw: &DataFrame) -> Result<Series, PolarsE
     Ok(combined.into_series())
 }
+/// Form the weights DataFrame.
 fn _form_agg_weights_dfw(
-    agg_weights_map: &HashMap<String, Vec<f64>>,
-    data_dfw: DataFrame,
+    agg_weights_map: &HashMap<String, (WeightValue, f64)>,
+    dfw: &DataFrame,
 ) -> Result<DataFrame, PolarsError> {
     let mut weights_dfw = DataFrame::new(vec![])?;
     for (agg_targ, weight_signs) in agg_weights_map.iter() {
-        let wgt = weight_signs[0] * weight_signs[1];
-        let wgt_series = Series::new(agg_targ.into(), vec![wgt; data_dfw.height()]);
+        let wgt_series = match &weight_signs.0 {
+            WeightValue::F64(val) => {
+                let wgt = val * weight_signs.1;
+                Series::new(agg_targ.into(), vec![wgt; dfw.height()])
+            }
+            WeightValue::Str(vstr) => {
+                // Take the `vstr` column from the DataFrame, else raise a weight specification error.
+                if !dfw.get_column_names().contains(&&PlSmallStr::from(vstr)) {
+                    return Err(PolarsError::ComputeError(
+                        format!(
+                            "The column {} does not exist in the DataFrame. {:?}",
+                            vstr, agg_weights_map
+                        )
+                        .into(),
+                    ));
+                }
+                let vstr_series = dfw.column(vstr)?;
+                let multiplied_series = vstr_series * weight_signs.1;
+                let mut multiplied_series =
+                    multiplied_series.as_series().cloned().ok_or_else(|| {
+                        PolarsError::ComputeError(
+                            "Failed to convert multiplied_series to Series".into(),
+                        )
+                    })?;
+                multiplied_series.rename(agg_targ.into());
+                multiplied_series
+            }
+        };
         weights_dfw.with_column(wgt_series)?;
     }
     Ok(weights_dfw)
@@ -143,14 +171,14 @@
 fn perform_single_group_agg(
     dfw: &DataFrame,
     agg_on: &String,
     agg_targs: &Vec<String>,
-    agg_weights_map: &HashMap<String, Vec<f64>>,
+    agg_weights_map: &HashMap<String, (WeightValue, f64)>,
     normalize_weights: bool,
     complete: bool,
 ) -> Result<Column, PolarsError> {
     let data_dfw = _form_agg_data_dfw(dfw, agg_targs)?;
     let nan_mask_dfw = _form_agg_nan_mask_dfw(&data_dfw)?;
     let nan_mask_series = _form_agg_nan_mask_series(&nan_mask_dfw)?;
-    let weights_dfw = _form_agg_weights_dfw(agg_weights_map, data_dfw.clone())?;
+    let weights_dfw = _form_agg_weights_dfw(agg_weights_map, dfw)?;
     let weights_dfw = match normalize_weights {
         true => normalize_weights_with_nan_mask(weights_dfw, nan_mask_dfw)?,
         false => weights_dfw,
@@ -192,7 +220,7 @@ fn perform_single_group_agg(
 fn perform_multiplication(
     dfw: &DataFrame,
     mult_targets: &HashMap<String, Vec<String>>,
-    weights_map: &HashMap<String, HashMap<String, Vec<f64>>>,
+    weights_map: &HashMap<String, HashMap<String, (WeightValue, f64)>>,
     complete: bool,
     normalize_weights: bool,
 ) -> Result<DataFrame, PolarsError> {
@@ -200,6 +228,7 @@ fn perform_multiplication(
     let mut new_dfw = DataFrame::new(vec![])?;
     assert!(!mult_targets.is_empty(), "mult_targets is empty");
     for (agg_on, agg_targs) in mult_targets.iter() {
         let cols_len = new_dfw.get_column_names().len();
@@ -288,76 +317,122 @@ fn get_mul_targets(
     Ok(mul_targets)
 }
+/// Builds a map of the shape:
+/// `HashMap<String, HashMap<String, (WeightValue, f64)>>`
+/// where only one of `weights` or `weight_xcats` can be provided.
+/// If neither is provided, weights default to 1.0.
+/// Each tuple is `(WeightValue, f64) = (weight, sign)`.
 fn form_weights_and_signs_map(
     cids: Vec<String>,
     xcats: Vec<String>,
     weights: Option<Vec<f64>>,
+    weight_xcat: Option<String>,
     signs: Option<Vec<f64>>,
-) -> Result<HashMap<String, HashMap<String, Vec<f64>>>, Box<dyn std::error::Error>> {
-    let _agg_xcats_for_cid = agg_xcats_for_cid(cids.clone(), xcats.clone());
+) -> Result<HashMap<String, HashMap<String, (WeightValue, f64)>>, Box<dyn std::error::Error>> {
+    let agg_xcats_for_cid = agg_xcats_for_cid(cids.clone(), xcats.clone());
     let (agg_on, agg_targ) = get_agg_on_agg_targs(cids.clone(), xcats.clone());
-    // if weights are None, create a vector of 1s of the same length as agg_targ
-    let weights = weights.unwrap_or(vec![1.0 / agg_targ.len() as f64; agg_targ.len()]);
-    let signs = signs.unwrap_or(vec![1.0; agg_targ.len()]);
-    // check that the lengths of weights and signs match the length of agg_targ
-    check_weights_signs_lengths(
-        weights.clone(),
-        signs.clone(),
-        _agg_xcats_for_cid,
-        agg_targ.len(),
-    )?;
-    let mut weights_map = HashMap::new();
+    // Determine if each weight option has non-empty values.
+    let weights_provided = weights.as_ref().map_or(false, |v| !v.is_empty());
+    let weight_xcats_provided = weight_xcat.as_ref().map_or(false, |v| !v.is_empty());
+    // Enforce that only one of weights or weight_xcats is specified.
+    if weights_provided && weight_xcats_provided {
+        return Err("Only one of `weights` and `weight_xcats` may be specified.".into());
+    }
+    // 1) Build the "actual_weights" vector as WeightValue.
+    let actual_weights: Vec<WeightValue> = if weights_provided {
+        weights.unwrap().into_iter().map(WeightValue::F64).collect()
+    } else if weight_xcats_provided {
+        vec![WeightValue::Str(weight_xcat.unwrap()); agg_targ.len()]
+    } else {
+        // Default to numeric 1.0 if neither is provided
+        vec![WeightValue::F64(1.0); agg_targ.len()]
+    };
+    // 2) Build the "signs" vector; default to 1.0 if not provided
+    let signs = signs.unwrap_or_else(|| vec![1.0; agg_targ.len()]);
+    // 3) Check lengths and zero values (zero checks apply to numeric weights only).
+    check_weights_signs_lengths(&actual_weights, &signs, agg_xcats_for_cid, agg_targ.len())?;
+    // 4) Build the final nested HashMap
+    let mut weights_map: HashMap<String, HashMap<String, (WeightValue, f64)>> = HashMap::new();
     for agg_o in agg_on {
         let mut agg_t_map = HashMap::new();
         for (i, agg_t) in agg_targ.iter().enumerate() {
-            let ticker = match _agg_xcats_for_cid {
-                true => format!("{}_{}", agg_o, agg_t),
-                false => format!("{}_{}", agg_t, agg_o),
+            // Format the ticker
+            let ticker = if agg_xcats_for_cid {
+                format!("{}_{}", agg_o, agg_t)
+            } else {
+                format!("{}_{}", agg_t, agg_o)
             };
-            let weight_signs = vec![weights[i], signs[i]];
-            agg_t_map.insert(ticker, weight_signs);
+            // Build the tuple (WeightValue, f64)
+            let weight_sign_tuple = match &actual_weights[i] {
+                WeightValue::F64(val) => (WeightValue::F64(*val), signs[i]),
+                WeightValue::Str(vstr) => {
+                    let new_str = format!("{}_{}", agg_t, vstr);
+                    (WeightValue::Str(new_str), signs[i])
+                }
+            };
+            agg_t_map.insert(ticker, weight_sign_tuple);
         }
         weights_map.insert(agg_o.clone(), agg_t_map);
     }
     Ok(weights_map)
 }
+/// Checks that the given slices have the expected length and that:
+/// - numeric weights are non-zero,
+/// - signs are non-zero.
 fn check_weights_signs_lengths(
-    weights_vec: Vec<f64>,
-    signs_vec: Vec<f64>,
-    _agg_xcats_for_cid: bool,
+    weights_vec: &[WeightValue],
+    signs_vec: &[f64],
+    agg_xcats_for_cid: bool,
     agg_targ_len: usize,
 ) -> Result<(), Box<dyn std::error::Error>> {
-    // for vx, vname in ...
-    let agg_targ = match _agg_xcats_for_cid {
-        true => "xcats",
-        false => "cids",
-    };
-    for (vx, vname) in vec![
-        (weights_vec.clone(), "weights"),
-        (signs_vec.clone(), "signs"),
-    ] {
-        for (i, v) in vx.iter().enumerate() {
-            if *v == 0.0 {
-                return Err(format!("The {} at index {} is 0.0", vname, i).into());
+    // For diagnostics, decide what to call the dimension
+    let agg_targ = if agg_xcats_for_cid { "xcats" } else { "cids" };
+    // 1) Check numeric weights for zeroes.
+    for (i, weight) in weights_vec.iter().enumerate() {
+        if let WeightValue::F64(val) = weight {
+            if *val == 0.0 {
+                return Err(format!("The weight at index {} is 0.0", i).into());
             }
         }
-        if vx.len() != agg_targ_len {
-            return Err(format!(
-                "The length of {} ({}) does not match the length of {} ({})",
-                vname,
-                vx.len(),
-                agg_targ,
-                agg_targ_len
-            )
-            .into());
-        }
     }
+    // 2) Ensure the weights vector is the expected length.
+    if weights_vec.len() != agg_targ_len {
+        return Err(format!(
+            "The length of weights ({}) does not match the length of {} ({})",
+            weights_vec.len(),
+            agg_targ,
+            agg_targ_len
+        )
+        .into());
+    }
+    // 3) Check signs for zero.
+    for (i, sign) in signs_vec.iter().enumerate() {
+        if *sign == 0.0 {
+            return Err(format!("The sign at index {} is 0.0", i).into());
+        }
+    }
+    // 4) Ensure the signs vector is the expected length.
+    if signs_vec.len() != agg_targ_len {
+        return Err(format!(
+            "The length of signs ({}) does not match the length of {} ({})",
+            signs_vec.len(),
+            agg_targ,
+            agg_targ_len
+        )
+        .into());
+    }
     Ok(())
 }
 fn rename_result_dfw_cols(
@@ -393,6 +468,36 @@ fn agg_xcats_for_cid(cids: Vec<String>, xcats: Vec<String>) -> bool {
     xcats.len() > 1
 }
+/// Represents a weight value that can be a string (a weight category) or a number (float or integer).
+#[derive(Debug, Clone, PartialEq)]
+pub enum WeightValue {
+    Str(String),
+    F64(f64),
+}
+impl From<String> for WeightValue {
+    fn from(s: String) -> Self {
+        WeightValue::Str(s)
+    }
+}
+impl<'a> From<&'a str> for WeightValue {
+    fn from(s: &'a str) -> Self {
+        WeightValue::Str(s.to_string())
+    }
+}
+impl From<f64> for WeightValue {
+    fn from(f: f64) -> Self {
+        WeightValue::F64(f)
+    }
+}
+impl From<i32> for WeightValue {
+    fn from(i: i32) -> Self {
+        WeightValue::F64(i as f64)
+    }
+}
 /// Weighted linear combinations of cross sections or categories
 /// # Arguments
 /// * `df` - QDF DataFrame
@@ -417,7 +522,7 @@ pub fn linear_composite(
     cids: Vec<String>,
     weights: Option<Vec<f64>>,
     signs: Option<Vec<f64>>,
-    weight_xcats: Option<Vec<String>>,
+    weight_xcat: Option<String>,
     normalize_weights: bool,
     start: Option<String>,
     end: Option<String>,
@@ -429,10 +534,28 @@ pub fn linear_composite(
 ) -> Result<DataFrame, Box<dyn std::error::Error>> {
     // Check if the DataFrame is a Quantamental DataFrame
     check_quantamental_dataframe(df)?;
+    if agg_xcats_for_cid(cids.clone(), xcats.clone()) && weight_xcat.is_some() {
+        return Err(format!(
+            "Using xcats as weights is not supported when aggregating multiple xcats for a single cid. {:?} {:?}",
+            cids, xcats
+        )
+        .into());
+    }
+    // Make sure the weight category is retained when reducing the DataFrame.
+    let mut rxcats = xcats.clone();
+    if let Some(wx) = weight_xcat.clone() {
+        rxcats.push(wx);
+    }
     let rdf = reduce_dataframe(
         df.clone(),
         Some(cids.clone()),
-        Some(xcats.clone()),
+        Some(rxcats.clone()),
         Some(vec!["value".to_string()]),
         start.clone(),
         end.clone(),
@@ -443,10 +566,11 @@ pub fn linear_composite(
     let new_xcat = new_xcat.unwrap_or_else(|| "COMPOSITE".to_string());
     let new_cid = new_cid.unwrap_or_else(|| "GLB".to_string());
-    let dfw = pivot_dataframe_by_ticker(rdf.clone(), Some("value".to_string())).unwrap();
+    let dfw = pivot_dataframe_by_ticker(rdf, Some("value".to_string())).unwrap();
     let mul_targets = get_mul_targets(cids.clone(), xcats.clone())?;
-    let weights_map = form_weights_and_signs_map(cids.clone(), xcats.clone(), weights, signs)?;
+    let weights_map =
+        form_weights_and_signs_map(cids.clone(), xcats.clone(), weights, weight_xcat, signs)?;
     for (ticker, targets) in mul_targets.iter() {
         println!("ticker: {}, targets: {:?}", ticker, targets);
src/utils/qdf/blacklist.rs (new file, 307 lines)

@@ -0,0 +1,307 @@
use crate::utils::bdates::{get_bdates_list_with_freq, BDateFreq};
use crate::utils::dateutils::get_min_max_real_dates;
use crate::utils::misc::get_cid;
use crate::utils::qdf::core::check_quantamental_dataframe;
use chrono::NaiveDate;
use polars::prelude::*;
use std::collections::{BTreeMap, HashMap};
use std::error::Error;
use super::get_unique_metrics;
pub fn create_blacklist_from_qdf(
df: &DataFrame,
group_by_cid: Option<bool>,
blacklist_name: Option<String>,
metrics: Option<Vec<String>>,
) -> Result<BTreeMap<String, (String, String)>, Box<dyn Error>> {
check_quantamental_dataframe(df)?;
    // Resolve defaults without panicking: fall back to all metric columns.
    let metrics = match metrics {
        Some(m) => m,
        None => get_unique_metrics(df)?,
    };
    let blacklist_name = blacklist_name.unwrap_or_else(|| "BLACKLIST".into());
    let group_by_cid = group_by_cid.unwrap_or(true);
    let (min_date, max_date) = get_min_max_real_dates(df, "real_date".into())?;
    let min_date_str = min_date.format("%Y-%m-%d").to_string();
    let max_date_str = max_date.format("%Y-%m-%d").to_string();
    let all_bdates = get_bdates_list_with_freq(
        min_date_str.as_str(),
        max_date_str.as_str(),
        BDateFreq::Daily,
    )?;
    // Keep only the rows where at least one of the requested metrics is null/NaN.
    let null_mask = get_nan_mask(df, metrics)?;
    let df = df.filter(&null_mask)?;
    // Collect the sorted null/NaN dates per (cid, xcat) pair as "real_dates" lists.
    let df = df
        .lazy()
        .sort(
            ["cid", "xcat"],
            SortMultipleOptions::default().with_maintain_order(true),
        )
        .group_by([col("cid"), col("xcat")])
        .agg([col("real_date")
            .dt()
            .strftime("%Y-%m-%d")
            .sort(SortOptions::default())])
        .select([
            concat_str([col("cid"), col("xcat")], "_", true).alias("ticker"),
            col("real_date").alias("real_dates"),
        ])
        .collect()?;
let ticker_vec = df
.column("ticker")?
.str()?
.into_iter()
.filter_map(|opt| opt.map(|s| s.to_string()))
.collect::<Vec<String>>();
let rdt = get_vec_of_vec_of_dates_from_df(df)?;
let mut blk: HashMap<String, Vec<String>> = HashMap::new();
    for (tkr, dates) in ticker_vec.iter().zip(rdt.iter()) {
        // Group by cid (e.g. "USD") or by full ticker (e.g. "USD_XR").
        let key = if group_by_cid {
            get_cid(tkr.clone())?
        } else {
            tkr.to_string()
        };
        blk.entry(key).or_default().extend(dates.iter().cloned());
    }
for (_key, vals) in blk.iter_mut() {
// order is important - dedup depends on the vec being sorted
vals.sort();
vals.dedup();
}
let all_bdates_strs = all_bdates
.iter()
.map(|date| date.format("%Y-%m-%d").to_string())
.collect::<Vec<String>>();
    // Build the blacklist directly as a BTreeMap so keys iterate in sorted order.
    let mut blacklist: BTreeMap<String, (String, String)> = BTreeMap::new();
    for (tkr, dates) in blk.iter() {
        let date_ranges = convert_dates_list_to_date_ranges(dates.clone(), all_bdates_strs.clone());
        for (rng_idx, (start_date, end_date)) in date_ranges.iter() {
            let range_key = format!("{}_{}_{}", tkr, blacklist_name, rng_idx);
            blacklist.insert(range_key, (start_date.clone(), end_date.clone()));
        }
    }
    Ok(blacklist)
}
fn get_nan_mask(
df: &DataFrame,
metrics: Vec<String>,
) -> Result<ChunkedArray<BooleanType>, Box<dyn Error>> {
let null_masks: Vec<ChunkedArray<BooleanType>> = metrics
.iter()
.map(|metric| {
let null_mask = df.column(metric.as_str())?.is_null();
let nan_mask = df.column(metric.as_str())?.is_nan()?;
Ok(null_mask | nan_mask)
})
.collect::<Result<_, Box<dyn Error>>>()?;
let null_mask = null_masks
.into_iter()
.reduce(|acc, mask| acc | mask)
.unwrap_or_else(|| BooleanChunked::full_null("null_mask".into(), df.height()));
Ok(null_mask)
}
fn convert_dates_list_to_date_ranges(
blacklist: Vec<String>,
all_bdates_strs: Vec<String>,
) -> HashMap<String, (String, String)> {
// Step 1: Map every date in all_bdates_strs to its index
let mut all_map: HashMap<String, usize> = HashMap::new();
for (i, d) in all_bdates_strs.iter().enumerate() {
all_map.insert(d.clone(), i);
}
// Step 2: Convert each blacklisted date into its index, if it exists
let mut blacklisted_indices: Vec<usize> = Vec::new();
for dt in blacklist {
if let Some(&idx) = all_map.get(&dt) {
blacklisted_indices.push(idx);
}
}
// Step 3: Sort the blacklisted indices
blacklisted_indices.sort_unstable();
// Step 4: Traverse and group consecutive indices into ranges
let mut result: HashMap<i64, (String, String)> = HashMap::new();
let mut string_result: HashMap<String, (String, String)> = HashMap::new();
if blacklisted_indices.is_empty() {
return string_result;
}
let mut range_idx: i64 = 0;
let mut start_idx = blacklisted_indices[0];
let mut end_idx = start_idx;
for &cur_idx in blacklisted_indices.iter().skip(1) {
if cur_idx == end_idx + 1 {
// We are still in a contiguous run
end_idx = cur_idx;
} else {
// We hit a break in contiguity, so store the last range
result.insert(
range_idx,
(
all_bdates_strs[start_idx].clone(),
all_bdates_strs[end_idx].clone(),
),
);
range_idx += 1;
// Start a new range
start_idx = cur_idx;
end_idx = cur_idx;
}
}
// Don't forget to store the final range after the loop
result.insert(
range_idx,
(
all_bdates_strs[start_idx].clone(),
all_bdates_strs[end_idx].clone(),
),
);
let max_digits = result.keys().max().unwrap_or(&-1).to_string().len();
for (key, (start_date, end_date)) in result.iter() {
let new_key = format!("{:0width$}", key, width = max_digits);
string_result.insert(new_key, (start_date.clone(), end_date.clone()));
}
string_result
}
fn get_vec_of_vec_of_dates_from_df(df: DataFrame) -> Result<Vec<Vec<String>>, Box<dyn Error>> {
let rdt = df
.column("real_dates")?
.as_series()
.unwrap()
.list()?
.into_iter()
.filter_map(|opt| opt)
.collect::<Vec<Series>>()
.iter()
.map(|s| {
s.str()
.unwrap()
.into_iter()
.filter_map(|opt| opt.map(|s| s.to_string()))
.collect::<Vec<String>>()
})
.collect::<Vec<Vec<String>>>();
Ok(rdt)
}
#[allow(dead_code)]
fn get_vec_of_vec_of_naivedates_from_df(
df: DataFrame,
) -> Result<Vec<Vec<NaiveDate>>, Box<dyn Error>> {
let rdt = df
.column("real_dates")?
.as_series()
.unwrap()
.list()?
.into_iter()
.filter_map(|opt| opt)
.collect::<Vec<Series>>()
.iter()
        .map(|s| {
            s.date()
                .unwrap()
                .into_iter()
                // Polars dates count days since the Unix epoch, while chrono's
                // from_num_days_from_ce counts from 0001-01-01; 719_163 is the
                // CE day number of 1970-01-01.
                .filter_map(|opt| {
                    opt.and_then(|days| NaiveDate::from_num_days_from_ce_opt(days + 719_163))
                })
                .collect::<Vec<NaiveDate>>()
})
.collect::<Vec<Vec<NaiveDate>>>();
Ok(rdt)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_convert_dates_list_to_date_ranges() {
let all_dates = vec![
"2023-01-01".to_string(),
"2023-01-02".to_string(),
"2023-01-03".to_string(),
"2023-01-04".to_string(),
"2023-01-05".to_string(),
"2023-01-06".to_string(),
];
let blacklist = vec![
"2023-01-02".to_string(),
"2023-01-03".to_string(),
"2023-01-05".to_string(),
];
let result = convert_dates_list_to_date_ranges(blacklist, all_dates);
// Expect two ranges:
// range 0 => ("2023-01-02", "2023-01-03")
// range 1 => ("2023-01-05", "2023-01-05")
assert_eq!(
result["0"],
("2023-01-02".to_string(), "2023-01-03".to_string())
);
assert_eq!(
result["1"],
("2023-01-05".to_string(), "2023-01-05".to_string())
);
}
}
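
One detail of `convert_dates_list_to_date_ranges` worth calling out: range indices are zero-padded to the width of the largest index, so the string keys in the returned map (and in the final `BTreeMap`) sort in numeric order. A quick illustration of the padding logic:

```rust
fn main() {
    // Mirrors the key formatting in convert_dates_list_to_date_ranges.
    let max_digits = 11.to_string().len(); // widest index "11" has 2 digits
    for key in [0i64, 5, 11] {
        println!("{:0width$}", key, width = max_digits); // prints "00", "05", "11"
    }
}
```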

src/utils/qdf/core.rs

@@ -17,14 +17,15 @@ use std::error::Error;
 pub fn check_quantamental_dataframe(df: &DataFrame) -> Result<(), Box<dyn Error>> {
     let expected_cols = ["real_date", "cid", "xcat"];
     let expected_dtype = [DataType::Date, DataType::String, DataType::String];
+    let err = "Quantamental DataFrame must have at least 4 columns: 'real_date', 'cid', 'xcat' and one or more metrics.";
     for (col, dtype) in expected_cols.iter().zip(expected_dtype.iter()) {
         let col = df.column(col);
         if col.is_err() {
-            return Err(format!("Column {:?} not found", col).into());
+            return Err(format!("{} Column {:?} not found", err, col).into());
         }
         let col = col?;
         if col.dtype() != dtype {
-            return Err(format!("Column {:?} has wrong dtype", col).into());
+            return Err(format!("{} Column {:?} has wrong dtype", err, col).into());
         }
     }
     Ok(())
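
For reference, the smallest frame that passes this check has the three index columns plus one metric; a sketch using the polars API seen elsewhere in this diff (dates are days since the Unix epoch cast to `Date`; column values are illustrative):

```rust
use polars::prelude::*;

fn tiny_qdf() -> PolarsResult<DataFrame> {
    // real_date: Date, cid: String, xcat: String, plus at least one metric column.
    let real_date = Series::new("real_date".into(), &[19359i32, 19360]) // 2023-01-02, 2023-01-03
        .cast(&DataType::Date)?;
    let cid = Series::new("cid".into(), &["USD", "USD"]);
    let xcat = Series::new("xcat".into(), &["XR", "XR"]);
    let value = Series::new("value".into(), &[0.1f64, f64::NAN]); // NaN rows feed the blacklist
    DataFrame::new(vec![real_date.into(), cid.into(), xcat.into(), value.into()])
}
```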

src/utils/qdf/mod.rs

@@ -1,11 +1,12 @@
+pub mod blacklist;
 pub mod core;
-pub mod update_df;
 pub mod load;
-pub mod reduce_df;
 pub mod pivots;
+pub mod reduce_df;
+pub mod update_df;
 // Re-export submodules for easier access
 pub use core::*;
-pub use update_df::*;
 pub use load::*;
 pub use reduce_df::*;
+pub use update_df::*;

src/utils/qdf/reduce_df.rs

@@ -30,12 +30,12 @@ pub fn reduce_dataframe(
     let df_size = df.shape();
     let mut new_df = df.clone();
-    let ticker_col: Column = get_ticker_column_for_quantamental_dataframe(&new_df)?;
+    let ticker_col = get_ticker_column_for_quantamental_dataframe(&new_df)?;
     // if cids is not provided, get all unique cids
-    let u_cids: Vec<String> = get_unique_cids(&new_df)?;
-    let u_xcats: Vec<String> = get_unique_xcats(&new_df)?;
-    let u_tickers: Vec<String> = _get_unique_strs_from_str_column_object(&ticker_col)?;
+    let u_cids = get_unique_cids(&new_df)?;
+    let u_xcats = get_unique_xcats(&new_df)?;
+    let u_tickers = _get_unique_strs_from_str_column_object(&ticker_col)?;
     let cids_vec = cids.unwrap_or_else(|| u_cids.clone());
     let specified_cids: Vec<&str> = cids_vec.iter().map(AsRef::as_ref).collect();