Compare commits

..

8 Commits

Author · SHA1 · Message · Date
Palash Tyagi · 19e91cfe47 · Merge branch 'main' into rawdf · 2025-04-13 11:41:00 +01:00
Palash Tyagi · b4d42c1dda · add BSeries struct and conversion implementations for FSeries and ISeries · 2025-04-12 22:06:44 +01:00
Palash Tyagi · 2b969f4eaf · change example usage code blocks to ignore in JPMaQSDownload documentation · 2025-04-12 11:04:28 +01:00
Palash Tyagi · 1d301b45b7 · uncomment README.md documentation line and add core module declaration · 2025-04-12 11:04:24 +01:00
Palash Tyagi · 4f60e31d55 · add core module with df, xseries, and dateseries · 2025-04-12 11:04:12 +01:00
Palash Tyagi · d938d9adc3 · added placeholder df module · 2025-04-11 23:55:42 +01:00
Palash Tyagi · 5a5bd4777d · add first draft for dateseries · 2025-04-11 23:55:29 +01:00
Palash Tyagi · cf2779c5a1 · adding first draft for xseries · 2025-04-11 23:55:15 +01:00
19 changed files with 725 additions and 960 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -1,33 +0,0 @@
#!/bin/bash
# Exit immediately if a command exits with a non-zero status
set -e
# Run "maturin --help". If it fails, print an error message and exit.
if ! maturin --help > /dev/null 2>&1; then
echo "Failed to run maturin --help" >&2
exit 1
fi
# Delete any existing build directory and create a new one.
rm -rf ./build
mkdir -p ./build
# Copy ./src/msyrs.pyi to ./msyrs.pyi.
cp ./src/msyrs.pyi ./msyrs.pyi
# Build using maturin.
maturin build --release --sdist --out ./build/
# Get the first wheel file found in the build directory.
whl_file=$(ls ./build/*.whl 2>/dev/null | head -n 1)
if [ -z "$whl_file" ]; then
echo "No wheel file found in ./build" >&2
exit 1
fi
# Rename the wheel file from .whl to .zip.
base_name="${whl_file%.whl}"
mv "$whl_file" "${base_name}.zip"
# Delete the temporary .pyi file.
rm ./msyrs.pyi

View File

@@ -1,20 +0,0 @@
#!/bin/bash
set -e
# Ensure maturin is installed. For example, you can install it via:
# pip install maturin
# Run "maturin --help". If it fails, print an error message and exit.
if ! maturin --help > /dev/null 2>&1; then
echo "Failed to run maturin --help" >&2
exit 1
fi
# Copy ./src/msyrs.pyi to the current directory as msyrs.pyi
cp ./src/msyrs.pyi ./msyrs.pyi
# Run maturin develop in release mode.
maturin develop --release
# Delete the temporary msyrs.pyi file.
rm ./msyrs.pyi

View File

@@ -68,7 +68,7 @@ pub fn get_period_indices_hv(dfw: PyDataFrame, est_freq: &str) -> PyResult<Vec<u
cids,
weights = None,
signs = None,
weight_xcat = None,
weight_xcats = None,
normalize_weights = false,
start = None,
end = None,
@@ -84,7 +84,7 @@ pub fn linear_composite(
cids: Vec<String>,
weights: Option<Vec<f64>>,
signs: Option<Vec<f64>>,
weight_xcat: Option<String>,
weight_xcats: Option<Vec<String>>,
normalize_weights: bool,
start: Option<String>,
end: Option<String>,
@@ -101,7 +101,7 @@ pub fn linear_composite(
cids,
weights,
signs,
weight_xcat,
weight_xcats,
normalize_weights,
start,
end,

View File

@@ -1,4 +1,4 @@
use pyo3::{prelude::*, types::PyDict};
use pyo3::prelude::*;
use pyo3_polars::{PyDataFrame, PySeries};
/// Python wrapper for [`crate::utils::qdf`] module.
@@ -7,7 +7,6 @@ use pyo3_polars::{PyDataFrame, PySeries};
pub fn utils(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_function(wrap_pyfunction!(get_bdates_series_default_pl, m)?)?;
m.add_function(wrap_pyfunction!(get_bdates_series_default_opt, m)?)?;
m.add_function(wrap_pyfunction!(create_blacklist_from_qdf, m)?)?;
Ok(())
}
@@ -34,29 +33,3 @@ pub fn get_bdates_series_default_opt(
.map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(format!("{}", e)))?,
))
}
#[allow(deprecated)]
#[pyfunction(signature = (df, group_by_cid=None, blacklist_name=None, metrics=None))]
pub fn create_blacklist_from_qdf(
df: PyDataFrame,
group_by_cid: Option<bool>,
blacklist_name: Option<String>,
metrics: Option<Vec<String>>,
) -> PyResult<PyObject> {
let result = crate::utils::qdf::blacklist::create_blacklist_from_qdf(
&df.into(),
group_by_cid,
blacklist_name,
metrics,
)
.map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(format!("{}", e)))?;
Python::with_gil(|py| {
let dict = PyDict::new(py);
// for (key, (start_date, end_date)) in result {
// dict.set_item(key, (start_date, end_date))
for (key, dates) in result {
dict.set_item(key, dates).map_err(|e| PyErr::from(e))?;
}
Ok(dict.into())
})
}

src/core/dateseries.rs (new file, +281 lines)
View File

@@ -0,0 +1,281 @@
//! # DateSeries and BDateSeries Implementations
//!
//! This module provides two date-handling types using the [`chrono`](https://docs.rs/chrono) crate:
//!
//! - [`DateSeries`]: Stores any set of calendar dates and allows adding/subtracting *calendar days*.
//! - [`BDateSeries`]: Stores only Monday–Friday business days and interprets add/sub as *business day* shifts,
//! skipping weekends (e.g., adding 1 to Friday goes to Monday).
//!
//! Both types also provide a [`from_iso8601_range`](#method.from_iso8601_range) constructor
//! that builds a date series (or business-date series) from start/end strings (YYYY-MM-DD).
use chrono::{Datelike, Duration, NaiveDate, ParseResult};
use std::ops::{Add, Sub};
/// Determines if the date is Saturday or Sunday.
fn is_weekend(date: NaiveDate) -> bool {
matches!(date.weekday(), chrono::Weekday::Sat | chrono::Weekday::Sun)
}
/// A `DateSeries` stores a list of [`NaiveDate`] values and shifts by **calendar days**.
///
/// ## Example Usage
///
/// ```
/// use chrono::NaiveDate;
/// use msyrs::core::dateseries::DateSeries;
///
/// // Create from explicit dates
/// let ds = DateSeries::new(vec![
/// NaiveDate::from_ymd_opt(2023, 7, 14).unwrap(), // a Friday
/// NaiveDate::from_ymd_opt(2023, 7, 15).unwrap(), // a Saturday
/// ]);
///
/// // Shift forward by 5 calendar days
/// let ds_plus = ds + 5;
/// // 2023-07-14 + 5 => 2023-07-19 (Wednesday)
/// // 2023-07-15 + 5 => 2023-07-20 (Thursday)
///
/// assert_eq!(ds_plus.data()[0], NaiveDate::from_ymd_opt(2023, 7, 19).unwrap());
/// assert_eq!(ds_plus.data()[1], NaiveDate::from_ymd_opt(2023, 7, 20).unwrap());
/// ```
///
#[derive(Debug, Clone)]
pub struct DateSeries {
data: Vec<NaiveDate>,
}
impl DateSeries {
/// Creates a new `DateSeries` from a vector of [`NaiveDate`] values.
///
/// # Panics
/// - Never: unlike [`BDateSeries`], this type accepts any valid date, including weekends.
pub fn new(data: Vec<NaiveDate>) -> Self {
Self { data }
}
/// Constructs a `DateSeries` by parsing ISO 8601 start/end strings (YYYY-MM-DD)
/// and including **every calendar date** from start to end (inclusive).
///
/// # Errors
/// - Returns a [`chrono::ParseError`](chrono::ParseError) if parsing fails.
/// - Panics if `start` > `end` chronologically.
///
/// # Examples
///
/// ```
/// use msyrs::core::dateseries::DateSeries;
/// # fn main() -> Result<(), chrono::ParseError> {
/// let ds = DateSeries::from_iso8601_range("2023-07-14", "2023-07-16")?;
/// assert_eq!(ds.data().len(), 3);
/// # Ok(())
/// # }
/// ```
pub fn from_iso8601_range(start: &str, end: &str) -> ParseResult<Self> {
let start_date = NaiveDate::parse_from_str(start, "%Y-%m-%d")?;
let end_date = NaiveDate::parse_from_str(end, "%Y-%m-%d")?;
assert!(
start_date <= end_date,
"start date cannot be after end date"
);
let mut dates = Vec::new();
let mut current = start_date;
while current <= end_date {
dates.push(current);
current = current
.checked_add_signed(Duration::days(1))
.expect("Date overflow in from_iso8601_range");
}
Ok(Self::new(dates))
}
/// Returns a reference to the underlying slice of dates.
pub fn data(&self) -> &[NaiveDate] {
&self.data
}
/// Internal helper applying a function to each date.
fn apply<F>(&self, op: F) -> Self
where
F: Fn(NaiveDate) -> NaiveDate,
{
let new_data = self.data.iter().map(|&date| op(date)).collect();
Self { data: new_data }
}
}
/// Implements adding calendar days to each `NaiveDate`.
///
/// If the shifted date goes out of chrono's valid range, it panics.
impl Add<i64> for DateSeries {
type Output = Self;
fn add(self, rhs: i64) -> Self::Output {
self.apply(|date| {
date.checked_add_signed(Duration::days(rhs))
.expect("Overflow in date addition")
})
}
}
/// Implements subtracting calendar days from each `NaiveDate`.
///
/// If the shifted date goes out of chrono's valid range, it panics.
impl Sub<i64> for DateSeries {
type Output = Self;
fn sub(self, rhs: i64) -> Self::Output {
self.apply(|date| {
date.checked_sub_signed(Duration::days(rhs))
.expect("Overflow in date subtraction")
})
}
}
/// A “Business Date Series” for Monday–Friday dates only.
///
/// 1. The constructor disallows weekend dates (panics if any date is Sat/Sun).
/// 2. Adding or subtracting an `i64` interprets that integer as *business days*, skipping weekends.
/// For example, adding 1 to a Friday yields the following Monday.
///
/// ## Example Usage
///
/// ```
/// use chrono::NaiveDate;
/// use msyrs::core::dateseries::BDateSeries;
///
/// // Friday
/// let friday = NaiveDate::from_ymd_opt(2023, 7, 14).unwrap();
/// let mut bds = BDateSeries::new(vec![friday]);
///
/// // Adding 1 “business day” => next Monday, 2023-07-17
/// bds = bds + 1;
/// assert_eq!(bds.data()[0], NaiveDate::from_ymd_opt(2023, 7, 17).unwrap());
/// ```
#[derive(Debug, Clone)]
pub struct BDateSeries {
data: Vec<NaiveDate>,
}
impl BDateSeries {
/// Creates a new `BDateSeries`, panicking if any of the supplied dates is on Saturday/Sunday.
pub fn new(data: Vec<NaiveDate>) -> Self {
for &d in &data {
if is_weekend(d) {
panic!("BDateSeries cannot contain weekend dates: {}", d);
}
}
Self { data }
}
/// Constructs a `BDateSeries` by parsing ISO 8601 start/end strings (YYYY-MM-DD).
///
/// Only Monday–Friday dates within `[start, end]` are included in the series.
///
/// # Errors
/// - Returns a [`chrono::ParseError`](chrono::ParseError) if parsing fails.
/// - Panics if `start` > `end` chronologically.
///
/// # Examples
///
/// ```
/// use msyrs::core::dateseries::BDateSeries;
/// # fn main() -> Result<(), chrono::ParseError> {
/// let bds = BDateSeries::from_iso8601_range("2023-07-14", "2023-07-18")?;
/// // 2023-07-14 (Friday), 2023-07-15 (Saturday) => skipped,
/// // 2023-07-16 (Sunday) => skipped,
/// // 2023-07-17 (Monday), 2023-07-18 (Tuesday)
/// // so total 3 valid business days
/// assert_eq!(bds.data().len(), 3);
/// # Ok(())
/// # }
/// ```
pub fn from_iso8601_range(start: &str, end: &str) -> ParseResult<Self> {
let start_date = NaiveDate::parse_from_str(start, "%Y-%m-%d")?;
let end_date = NaiveDate::parse_from_str(end, "%Y-%m-%d")?;
assert!(
start_date <= end_date,
"start date cannot be after end date"
);
let mut dates = Vec::new();
let mut current = start_date;
while current <= end_date {
if !is_weekend(current) {
dates.push(current);
}
current = current
.checked_add_signed(Duration::days(1))
.expect("Date overflow in from_iso8601_range");
}
Ok(Self::new(dates))
}
/// Returns a reference to the underlying slice of dates.
pub fn data(&self) -> &[NaiveDate] {
&self.data
}
/// Internal helper that shifts a date one day at a time, forward or backward,
/// skipping weekends, until it has moved `delta` business days.
fn shift_business_days(date: NaiveDate, delta: i64) -> NaiveDate {
if delta == 0 {
return date;
}
let step = if delta > 0 { 1 } else { -1 };
let abs_delta = delta.abs();
let mut new_date = date;
for _ in 0..abs_delta {
// Move by 1 day in the correct direction
new_date = new_date
.checked_add_signed(Duration::days(step))
.expect("Overflow in BDateSeries add/sub");
// If we land on weekend, keep moving until Monday..Friday
while is_weekend(new_date) {
new_date = new_date
.checked_add_signed(Duration::days(step))
.expect("Overflow in BDateSeries skipping weekend");
}
}
new_date
}
/// Internal helper to apply a shift of `delta` business days to each date.
fn apply(&self, delta: i64) -> Self {
let new_data = self
.data
.iter()
.map(|&date| Self::shift_business_days(date, delta))
.collect();
Self { data: new_data }
}
}
/// Implements *business day* addition for `BDateSeries`.
///
/// The series is guaranteed to remain Monday–Friday after the shift.
///
/// # Panics
/// - If the resulting date(s) overflow the valid `NaiveDate` range.
impl Add<i64> for BDateSeries {
type Output = Self;
fn add(self, rhs: i64) -> Self::Output {
self.apply(rhs)
}
}
/// Implements *business day* subtraction for `BDateSeries`.
///
/// The series is guaranteed to remain Monday–Friday after the shift.
///
/// # Panics
/// - If the resulting date(s) overflow the valid `NaiveDate` range.
impl Sub<i64> for BDateSeries {
type Output = Self;
fn sub(self, rhs: i64) -> Self::Output {
self.apply(-rhs)
}
}
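As a quick check on the semantics above, here is a minimal sketch contrasting the two shift behaviours (it assumes the crate builds as `msyrs` with the `core` module declared in this branch):

```rust
use chrono::NaiveDate;
use msyrs::core::dateseries::{BDateSeries, DateSeries};

fn main() {
    // 2023-07-14 is a Friday.
    let friday = NaiveDate::from_ymd_opt(2023, 7, 14).unwrap();

    // DateSeries: +1 calendar day lands on Saturday.
    let ds = DateSeries::new(vec![friday]) + 1;
    assert_eq!(ds.data()[0], NaiveDate::from_ymd_opt(2023, 7, 15).unwrap());

    // BDateSeries: +1 business day skips the weekend and lands on Monday.
    let bds = BDateSeries::new(vec![friday]) + 1;
    assert_eq!(bds.data()[0], NaiveDate::from_ymd_opt(2023, 7, 17).unwrap());
}
```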

src/core/df.rs (new file, +0 lines)
View File

src/core/mod.rs (new file, +3 lines)
View File

@@ -0,0 +1,3 @@
pub mod df;
pub mod xseries;
pub mod dateseries;

src/core/xseries.rs (new file, +223 lines)
View File

@@ -0,0 +1,223 @@
use std::ops::{Add, Div, Mul, Sub};
//
// 1) Define a float series: FSeries
//
#[derive(Debug, Clone)]
pub struct FSeries {
data: Vec<f64>,
}
impl FSeries {
/// Create a new FSeries from a vector of f64 values.
pub fn new(data: Vec<f64>) -> Self {
Self { data }
}
pub fn len(&self) -> usize {
self.data.len()
}
/// Elementwise helper applying an operation between two FSeries.
pub fn apply<F>(&self, other: &Self, op: F) -> Self
where
F: Fn(f64, f64) -> f64,
{
assert!(
self.len() == other.len(),
"FSeries must have the same length to apply operations."
);
let data = self
.data
.iter()
.zip(other.data.iter())
.map(|(&a, &b)| op(a, b))
.collect();
FSeries { data }
}
/// Access to the underlying data
pub fn data(&self) -> &[f64] {
&self.data
}
}
// Macros for float series arithmetic (elementwise)
macro_rules! impl_fseries_bin_op {
($trait:ident, $method:ident, $op:tt) => {
impl $trait for FSeries {
type Output = Self;
fn $method(self, rhs: Self) -> Self::Output {
self.apply(&rhs, |a, b| a $op b)
}
}
};
}
impl_fseries_bin_op!(Add, add, +);
impl_fseries_bin_op!(Sub, sub, -);
impl_fseries_bin_op!(Mul, mul, *);
impl_fseries_bin_op!(Div, div, /);
macro_rules! impl_fseries_scalar_op {
($trait:ident, $method:ident, $op:tt) => {
impl $trait<f64> for FSeries {
type Output = Self;
fn $method(mut self, scalar: f64) -> Self::Output {
for x in self.data.iter_mut() {
*x = *x $op scalar;
}
self
}
}
};
}
impl_fseries_scalar_op!(Add, add, +);
impl_fseries_scalar_op!(Sub, sub, -);
impl_fseries_scalar_op!(Mul, mul, *);
impl_fseries_scalar_op!(Div, div, /);
//
// 2) Define an integer series: ISeries
//
#[derive(Debug, Clone)]
pub struct ISeries {
data: Vec<i64>,
}
impl ISeries {
/// Create an ISeries from a vector of i64 values.
pub fn new(data: Vec<i64>) -> Self {
Self { data }
}
pub fn len(&self) -> usize {
self.data.len()
}
pub fn data(&self) -> &[i64] {
&self.data
}
/// Elementwise helper for integer series.
pub fn apply<F>(&self, other: &Self, op: F) -> Self
where
F: Fn(i64, i64) -> i64,
{
assert!(
self.len() == other.len(),
"ISeries must have the same length to apply operations."
);
let data = self
.data
.iter()
.zip(other.data.iter())
.map(|(&a, &b)| op(a, b))
.collect();
ISeries { data }
}
}
// Macros for integer series arithmetic (elementwise)
macro_rules! impl_iseries_bin_op {
($trait:ident, $method:ident, $op:tt) => {
impl $trait for ISeries {
type Output = Self;
fn $method(self, rhs: Self) -> Self::Output {
self.apply(&rhs, |a, b| a $op b)
}
}
};
}
impl_iseries_bin_op!(Add, add, +);
impl_iseries_bin_op!(Sub, sub, -);
impl_iseries_bin_op!(Mul, mul, *);
impl_iseries_bin_op!(Div, div, /); // integer division (truncates toward zero)
// Optional scalar operations (for i64)
macro_rules! impl_iseries_scalar_op {
($trait:ident, $method:ident, $op:tt) => {
impl $trait<i64> for ISeries {
type Output = Self;
fn $method(mut self, scalar: i64) -> Self::Output {
for x in self.data.iter_mut() {
*x = *x $op scalar;
}
self
}
}
};
}
impl_iseries_scalar_op!(Add, add, +);
impl_iseries_scalar_op!(Sub, sub, -);
impl_iseries_scalar_op!(Mul, mul, *);
impl_iseries_scalar_op!(Div, div, /); // truncating division by scalar
/// A boolean series: `BSeries`.
#[derive(Debug, Clone)]
pub struct BSeries {
data: Vec<bool>,
}
impl BSeries {
pub fn new(data: Vec<bool>) -> Self {
Self { data }
}
pub fn len(&self) -> usize {
self.data.len()
}
pub fn data(&self) -> &[bool] {
&self.data
}
}
/// Convert an `FSeries` to an `ISeries` by truncating each value toward zero (`as i64` cast).
impl From<FSeries> for ISeries {
fn from(fseries: FSeries) -> Self {
let data = fseries
.data
.into_iter()
.map(|val| val as i64) // trunc cast
.collect();
ISeries::new(data)
}
}
/// Implement conversion from ISeries to FSeries by casting to f64.
impl From<ISeries> for FSeries {
fn from(iseries: ISeries) -> Self {
let data = iseries.data.into_iter().map(|val| val as f64).collect();
FSeries::new(data)
}
}
/// Convert an ISeries to BSeries by checking if each value is non-zero.
impl From<ISeries> for BSeries {
fn from(iseries: ISeries) -> Self {
let data = iseries.data.into_iter().map(|val| val != 0).collect();
BSeries::new(data)
}
}
impl From<BSeries> for ISeries {
fn from(bseries: BSeries) -> Self {
let data = bseries
.data
.into_iter()
.map(|val| if val { 1 } else { 0 })
.collect();
ISeries::new(data)
}
}
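A minimal usage sketch tying the pieces above together (elementwise and scalar arithmetic plus the new conversion chain); the `msyrs::core::xseries` path assumes the module layout added in this branch:

```rust
use msyrs::core::xseries::{BSeries, FSeries, ISeries};

fn main() {
    let a = FSeries::new(vec![1.5, -2.0, 0.0]);
    let b = FSeries::new(vec![0.5, 2.0, 3.0]);

    // Elementwise and scalar arithmetic.
    let sum = a.clone() + b; // [2.0, 0.0, 3.0]
    let scaled = a * 2.0;    // [3.0, -4.0, 0.0]
    assert_eq!(sum.data(), &[2.0, 0.0, 3.0]);

    // Conversion chain: truncate to i64, then non-zero => true.
    let ints: ISeries = scaled.into(); // [3, -4, 0]
    let flags: BSeries = ints.into();  // [true, true, false]
    assert_eq!(flags.data(), &[true, true, false]);
}
```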

View File

@@ -58,7 +58,7 @@ fn all_jpmaq_expressions(expressions: Vec<String>) -> bool {
///
/// Example Usage:
///
/// ```ignore
/// ```rust
/// use msyrs::download::jpmaqsdownload::JPMaQSDownloadGetIndicatorArgs;
/// use msyrs::download::jpmaqsdownload::JPMaQSDownload;
///
@@ -140,7 +140,7 @@ impl Default for JPMaQSDownloadGetIndicatorArgs {
/// Ok(_) => println!("Saved indicators to disk"),
/// Err(e) => println!("Error saving indicators: {:?}", e),
/// }
///
/// ```
#[derive(Debug, Clone)]
pub struct JPMaQSDownload {
requester: DQRequester,

View File

@@ -1,4 +1,5 @@
// #![doc = include_str!("../README.md")]
// uncomment the above line to include the README.md file in the documentation
//! # msyrs
//!
@@ -18,6 +19,9 @@
/// Documentation and type-stubs for the `msyrs` Python API.
pub mod _py;
/// Implementation for the `core` module.
pub mod core;
/// Implementation for the `download` module.
pub mod download;

View File

@@ -56,5 +56,3 @@ class utils:
def get_bdates_series_default_pl(*args, **kwargs) -> Series: ...
@staticmethod
def get_bdates_series_default_opt(*args, **kwargs) -> Series: ...
@staticmethod
def create_blacklist_from_qdf(*args, **kwargs) -> dict: ...

View File

@@ -1,6 +1,6 @@
use crate::utils::dateutils::{get_bdates_from_col, get_min_max_real_dates};
use crate::utils::qdf::pivots::*;
use crate::utils::qdf::reduce_dataframe;
use crate::utils::qdf::reduce_df::*;
use chrono::NaiveDate;
use ndarray::{s, Array, Array1, Zip};
use polars::prelude::*;

View File

@@ -1,6 +1,6 @@
use crate::utils::qdf::check_quantamental_dataframe;
use crate::utils::qdf::pivots::{pivot_dataframe_by_ticker, pivot_wide_dataframe_to_qdf};
use crate::utils::qdf::reduce_df::reduce_dataframe;
use crate::utils::qdf::pivots::*;
use crate::utils::qdf::reduce_df::*;
use polars::prelude::*;
use std::collections::HashMap;
const TOLERANCE: f64 = 1e-8;
@@ -108,42 +108,14 @@ fn _form_agg_nan_mask_series(nan_mask_dfw: &DataFrame) -> Result<Series, PolarsE
Ok(combined.into_series())
}
/// Form the weights DataFrame
fn _form_agg_weights_dfw(
agg_weights_map: &HashMap<String, (WeightValue, f64)>,
dfw: &DataFrame,
agg_weights_map: &HashMap<String, Vec<f64>>,
data_dfw: DataFrame,
) -> Result<DataFrame, PolarsError> {
let mut weights_dfw = DataFrame::new(vec![])?;
for (agg_targ, weight_signs) in agg_weights_map.iter() {
// let wgt = weight_signs[0] * weight_signs[1];
let wgt_series = match &weight_signs.0 {
WeightValue::F64(val) => {
let wgt = val * weight_signs.1;
Series::new(agg_targ.into(), vec![wgt; dfw.height()])
}
WeightValue::Str(vstr) => {
// vstr column from data_dfw, else raise weight specification error
if !dfw.get_column_names().contains(&&PlSmallStr::from(vstr)) {
return Err(PolarsError::ComputeError(
format!(
"The column {} does not exist in the DataFrame. {:?}",
vstr, agg_weights_map
)
.into(),
));
}
let vstr_series = dfw.column(vstr)?;
let multiplied_series = vstr_series * weight_signs.1;
let mut multiplied_series =
multiplied_series.as_series().cloned().ok_or_else(|| {
PolarsError::ComputeError(
"Failed to convert multiplied_series to Series".into(),
)
})?;
multiplied_series.rename(agg_targ.into());
multiplied_series
}
};
let wgt = weight_signs[0] * weight_signs[1];
let wgt_series = Series::new(agg_targ.into(), vec![wgt; data_dfw.height()]);
weights_dfw.with_column(wgt_series)?;
}
Ok(weights_dfw)
@@ -171,14 +143,14 @@ fn perform_single_group_agg(
dfw: &DataFrame,
agg_on: &String,
agg_targs: &Vec<String>,
agg_weights_map: &HashMap<String, (WeightValue, f64)>,
agg_weights_map: &HashMap<String, Vec<f64>>,
normalize_weights: bool,
complete: bool,
) -> Result<Column, PolarsError> {
let data_dfw = _form_agg_data_dfw(dfw, agg_targs)?;
let nan_mask_dfw = _form_agg_nan_mask_dfw(&data_dfw)?;
let nan_mask_series = _form_agg_nan_mask_series(&nan_mask_dfw)?;
let weights_dfw = _form_agg_weights_dfw(agg_weights_map, dfw)?;
let weights_dfw = _form_agg_weights_dfw(agg_weights_map, data_dfw.clone())?;
let weights_dfw = match normalize_weights {
true => normalize_weights_with_nan_mask(weights_dfw, nan_mask_dfw)?,
false => weights_dfw,
@@ -220,7 +192,7 @@ fn perform_single_group_agg(
fn perform_multiplication(
dfw: &DataFrame,
mult_targets: &HashMap<String, Vec<String>>,
weights_map: &HashMap<String, HashMap<String, (WeightValue, f64)>>,
weights_map: &HashMap<String, HashMap<String, Vec<f64>>>,
complete: bool,
normalize_weights: bool,
) -> Result<DataFrame, PolarsError> {
@@ -228,7 +200,6 @@ fn perform_multiplication(
// let mut new_dfw = DataFrame::new(vec![real_date])?;
let mut new_dfw = DataFrame::new(vec![])?;
assert!(!mult_targets.is_empty(), "agg_targs is empty");
for (agg_on, agg_targs) in mult_targets.iter() {
// perform_single_group_agg
let cols_len = new_dfw.get_column_names().len();
@@ -317,122 +288,76 @@ fn get_mul_targets(
Ok(mul_targets)
}
/// Builds a map of the shape:
/// `HashMap<String, HashMap<String, (WeightValue, f64)>>`
/// where only one of `weights` or `weight_xcats` can be provided.
/// If neither is provided, weights default to 1.0.
/// Each tuple is `(WeightValue, f64) = (weight, sign)`.
fn form_weights_and_signs_map(
cids: Vec<String>,
xcats: Vec<String>,
weights: Option<Vec<f64>>,
weight_xcat: Option<String>,
signs: Option<Vec<f64>>,
) -> Result<HashMap<String, HashMap<String, (WeightValue, f64)>>, Box<dyn std::error::Error>> {
// For demonstration, we pretend to load or infer these from helpers:
let agg_xcats_for_cid = agg_xcats_for_cid(cids.clone(), xcats.clone());
) -> Result<HashMap<String, HashMap<String, Vec<f64>>>, Box<dyn std::error::Error>> {
let _agg_xcats_for_cid = agg_xcats_for_cid(cids.clone(), xcats.clone());
let (agg_on, agg_targ) = get_agg_on_agg_targs(cids.clone(), xcats.clone());
// Determine if each weight option has non-empty values.
let weights_provided = weights.as_ref().map_or(false, |v| !v.is_empty());
let weight_xcats_provided = weight_xcat.as_ref().map_or(false, |v| !v.is_empty());
// if weights are None, create a vector of 1s of the same length as agg_targ
let weights = weights.unwrap_or(vec![1.0 / agg_targ.len() as f64; agg_targ.len()]);
let signs = signs.unwrap_or(vec![1.0; agg_targ.len()]);
// Enforce that only one of weights or weight_xcats is specified.
if weights_provided && weight_xcats_provided {
return Err("Only one of `weights` and `weight_xcats` may be specified.".into());
}
// check that the lengths of weights and signs match the length of agg_targ
check_weights_signs_lengths(
weights.clone(),
signs.clone(),
_agg_xcats_for_cid,
agg_targ.len(),
)?;
// 1) Build the "actual_weights" vector as WeightValue.
let actual_weights: Vec<WeightValue> = if weights_provided {
weights.unwrap().into_iter().map(WeightValue::F64).collect()
} else if weight_xcats_provided {
vec![WeightValue::Str(weight_xcat.unwrap()); agg_targ.len()]
} else {
// Default to numeric 1.0 if neither is provided
vec![WeightValue::F64(1.0); agg_targ.len()]
};
// 2) Build the "signs" vector; default to 1.0 if not provided
let signs = signs.unwrap_or_else(|| vec![1.0; agg_targ.len()]);
// 3) Optional: check lengths & zero values (only numeric weights).
check_weights_signs_lengths(&actual_weights, &signs, agg_xcats_for_cid, agg_targ.len())?;
// 4) Build the final nested HashMap
let mut weights_map: HashMap<String, HashMap<String, (WeightValue, f64)>> = HashMap::new();
let mut weights_map = HashMap::new();
for agg_o in agg_on {
let mut agg_t_map = HashMap::new();
for (i, agg_t) in agg_targ.iter().enumerate() {
// Format the ticker
let ticker = if agg_xcats_for_cid {
format!("{}_{}", agg_o, agg_t)
} else {
format!("{}_{}", agg_t, agg_o)
let ticker = match _agg_xcats_for_cid {
true => format!("{}_{}", agg_o, agg_t),
false => format!("{}_{}", agg_t, agg_o),
};
// Build the tuple (WeightValue, f64)
let weight_sign_tuple = match &actual_weights[i] {
WeightValue::F64(val) => (WeightValue::F64(*val).clone(), signs[i]),
WeightValue::Str(vstr) => {
let new_str = format!("{}_{}", agg_t, vstr);
(WeightValue::Str(new_str), signs[i])
}
};
agg_t_map.insert(ticker, weight_sign_tuple);
let weight_signs = vec![weights[i], signs[i]];
agg_t_map.insert(ticker, weight_signs);
}
weights_map.insert(agg_o.clone(), agg_t_map);
}
Ok(weights_map)
}
/// Checks that the given slices have the expected length and that:
/// - numeric weights are non-zero,
/// - signs are non-zero.
fn check_weights_signs_lengths(
weights_vec: &[WeightValue],
signs_vec: &[f64],
agg_xcats_for_cid: bool,
weights_vec: Vec<f64>,
signs_vec: Vec<f64>,
_agg_xcats_for_cid: bool,
agg_targ_len: usize,
) -> Result<(), Box<dyn std::error::Error>> {
// For diagnostics, decide what to call the dimension
let agg_targ = if agg_xcats_for_cid { "xcats" } else { "cids" };
// 1) Check numeric weights for zeroes.
for (i, weight) in weights_vec.iter().enumerate() {
if let WeightValue::F64(val) = weight {
if *val == 0.0 {
return Err(format!("The weight at index {} is 0.0", i).into());
// for vx, vname in ...
let agg_targ = match _agg_xcats_for_cid {
true => "xcats",
false => "cids",
};
for (vx, vname) in vec![
(weights_vec.clone(), "weights"),
(signs_vec.clone(), "signs"),
] {
for (i, v) in vx.iter().enumerate() {
if *v == 0.0 {
return Err(format!("The {} at index {} is 0.0", vname, i).into());
}
}
}
// 2) Ensure the weights vector is the expected length.
if weights_vec.len() != agg_targ_len {
return Err(format!(
"The length of weights ({}) does not match the length of {} ({})",
weights_vec.len(),
agg_targ,
agg_targ_len
)
.into());
}
// 3) Check signs for zero.
for (i, sign) in signs_vec.iter().enumerate() {
if *sign == 0.0 {
return Err(format!("The sign at index {} is 0.0", i).into());
if vx.len() != agg_targ_len {
return Err(format!(
"The length of {} ({}) does not match the length of {} ({})",
vname,
vx.len(),
agg_targ,
agg_targ_len
)
.into());
}
}
// 4) Ensure the signs vector is the expected length.
if signs_vec.len() != agg_targ_len {
return Err(format!(
"The length of signs ({}) does not match the length of {} ({})",
signs_vec.len(),
agg_targ,
agg_targ_len
)
.into());
}
Ok(())
}
fn rename_result_dfw_cols(
@@ -468,36 +393,6 @@ fn agg_xcats_for_cid(cids: Vec<String>, xcats: Vec<String>) -> bool {
xcats.len() > 1
}
/// Represents a weight value that can be a string, a float, or an integer.
#[derive(Debug, Clone, PartialEq)]
pub enum WeightValue {
Str(String),
F64(f64),
}
impl From<String> for WeightValue {
fn from(s: String) -> Self {
WeightValue::Str(s)
}
}
impl<'a> From<&'a str> for WeightValue {
fn from(s: &'a str) -> Self {
WeightValue::Str(s.to_string())
}
}
impl From<f64> for WeightValue {
fn from(f: f64) -> Self {
WeightValue::F64(f)
}
}
impl From<i32> for WeightValue {
fn from(i: i32) -> Self {
WeightValue::F64(i as f64)
}
}
/// Weighted linear combinations of cross sections or categories
/// # Arguments
/// * `df` - QDF DataFrame
@@ -522,7 +417,7 @@ pub fn linear_composite(
cids: Vec<String>,
weights: Option<Vec<f64>>,
signs: Option<Vec<f64>>,
weight_xcat: Option<String>,
weight_xcats: Option<Vec<String>>,
normalize_weights: bool,
start: Option<String>,
end: Option<String>,
@@ -534,28 +429,10 @@ pub fn linear_composite(
) -> Result<DataFrame, Box<dyn std::error::Error>> {
// Check if the DataFrame is a Quantamental DataFrame
check_quantamental_dataframe(df)?;
if agg_xcats_for_cid(cids.clone(), xcats.clone()) {
if weight_xcat.is_some() {
return Err(
format!(
"Using xcats as weights is not supported when aggregating cids for a single xcat. {:?} {:?}",
cids, xcats
)
.into(),
);
}
}
let mut rxcats = xcats.clone();
if weight_xcat.is_some() {
rxcats.extend(vec![weight_xcat.clone().unwrap()]);
}
let rdf = reduce_dataframe(
df.clone(),
Some(cids.clone()),
Some(rxcats.clone()),
Some(xcats.clone()),
Some(vec!["value".to_string()]),
start.clone(),
end.clone(),
@@ -566,11 +443,10 @@ pub fn linear_composite(
let new_xcat = new_xcat.unwrap_or_else(|| "COMPOSITE".to_string());
let new_cid = new_cid.unwrap_or_else(|| "GLB".to_string());
let dfw = pivot_dataframe_by_ticker(rdf, Some("value".to_string())).unwrap();
let dfw = pivot_dataframe_by_ticker(rdf.clone(), Some("value".to_string())).unwrap();
let mul_targets = get_mul_targets(cids.clone(), xcats.clone())?;
let weights_map =
form_weights_and_signs_map(cids.clone(), xcats.clone(), weights, weight_xcat, signs)?;
let weights_map = form_weights_and_signs_map(cids.clone(), xcats.clone(), weights, signs)?;
for (ticker, targets) in mul_targets.iter() {
println!("ticker: {}, targets: {:?}", ticker, targets);
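For reference, the weight/sign semantics after this refactor reduce to a plain weighted sum; a minimal sketch of that arithmetic (the standalone `composite` helper is illustrative, not part of the crate):

```rust
// Each target value is scaled by weight * sign and the results are summed,
// mirroring `wgt = weight_signs[0] * weight_signs[1]` in the diff above.
fn composite(values: &[f64], weights: &[f64], signs: &[f64]) -> f64 {
    values
        .iter()
        .zip(weights.iter().zip(signs.iter()))
        .map(|(v, (w, s))| v * w * s)
        .sum()
}

fn main() {
    // Two categories with the default equal weights (1/n) and signs +1 / -1.
    let values = [2.0, 1.0];
    let weights = [0.5, 0.5];
    let signs = [1.0, -1.0];
    assert_eq!(composite(&values, &weights, &signs), 0.5);
}
```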

View File

@@ -1,307 +0,0 @@
use crate::utils::bdates::{get_bdates_list_with_freq, BDateFreq};
use crate::utils::dateutils::get_min_max_real_dates;
use crate::utils::misc::get_cid;
use crate::utils::qdf::core::check_quantamental_dataframe;
use chrono::NaiveDate;
use polars::prelude::*;
use std::collections::{BTreeMap, HashMap};
use std::error::Error;
use super::get_unique_metrics;
pub fn create_blacklist_from_qdf(
df: &DataFrame,
group_by_cid: Option<bool>,
blacklist_name: Option<String>,
metrics: Option<Vec<String>>,
) -> Result<BTreeMap<String, (String, String)>, Box<dyn Error>> {
check_quantamental_dataframe(df)?;
let metrics = metrics.unwrap_or_else(|| get_unique_metrics(df).unwrap());
let blacklist_name = blacklist_name.unwrap_or_else(|| "BLACKLIST".into());
let group_by_cid = group_by_cid.unwrap_or(true);
let (min_date, max_date) = get_min_max_real_dates(df, "real_date".into())?;
let min_date_str = min_date.format("%Y-%m-%d").to_string();
let max_date_str = max_date.format("%Y-%m-%d").to_string();
// let all_bdates = get_bdates_series_default_opt(min_date_str, max_date_str, None)?;
let all_bdates = get_bdates_list_with_freq(
min_date_str.clone().as_str(),
max_date_str.clone().as_str(),
BDateFreq::Daily,
)?;
let null_mask = get_nan_mask(df, metrics)?;
let df = df.filter(&null_mask)?.clone();
let df = df
.clone()
.lazy()
// .filter(&null_mask)
// .filter(
// col(metric.as_str())
// .is_null()
// .or(col(metric.as_str()).is_nan()),
// )
.sort(
["cid", "xcat"],
SortMultipleOptions::default().with_maintain_order(true),
)
.group_by([col("cid"), col("xcat")])
// .agg([col("real_date").sort(SortOptions::default())])
.agg([col("real_date")
.dt()
.strftime("%Y-%m-%d")
.sort(SortOptions::default())])
.select([
concat_str([col("cid"), col("xcat")], "_", true).alias("ticker"),
col("real_date").alias("real_dates"),
])
.collect()?;
// assert!(0 == 1, "{:?}", df);
let ticker_vec = df
.column("ticker")?
.str()?
.into_iter()
.filter_map(|opt| opt.map(|s| s.to_string()))
.collect::<Vec<String>>();
let rdt = get_vec_of_vec_of_dates_from_df(df)?;
let mut blk: HashMap<String, Vec<String>> = HashMap::new();
for (tkr, dates) in ticker_vec.iter().zip(rdt.iter()) {
if group_by_cid {
let _cid = get_cid(tkr.clone())?;
if blk.contains_key(&_cid) {
blk.get_mut(&_cid).unwrap().extend(dates.iter().cloned());
} else {
blk.insert(_cid, dates.clone());
}
} else {
blk.insert(tkr.to_string(), dates.clone());
}
}
for (_key, vals) in blk.iter_mut() {
// order is important - dedup depends on the vec being sorted
vals.sort();
vals.dedup();
}
let all_bdates_strs = all_bdates
.iter()
.map(|date| date.format("%Y-%m-%d").to_string())
.collect::<Vec<String>>();
let mut blacklist: HashMap<String, (String, String)> = HashMap::new();
for (tkr, dates) in blk.iter() {
let date_ranges = convert_dates_list_to_date_ranges(dates.clone(), all_bdates_strs.clone());
for (rng_idx, (start_date, end_date)) in date_ranges.iter() {
let range_key = format!("{}_{}_{}", tkr, blacklist_name.clone(), rng_idx);
blacklist.insert(range_key, (start_date.clone(), end_date.clone()));
}
}
// Ok(blacklist)
let mut btree_map: BTreeMap<String, (String, String)> = BTreeMap::new();
for (key, (start_date, end_date)) in blacklist.iter() {
btree_map.insert(key.clone(), (start_date.clone(), end_date.clone()));
}
Ok(btree_map)
}
fn get_nan_mask(
df: &DataFrame,
metrics: Vec<String>,
) -> Result<ChunkedArray<BooleanType>, Box<dyn Error>> {
let null_masks: Vec<ChunkedArray<BooleanType>> = metrics
.iter()
.map(|metric| {
let null_mask = df.column(metric.as_str())?.is_null();
let nan_mask = df.column(metric.as_str())?.is_nan()?;
Ok(null_mask | nan_mask)
})
.collect::<Result<_, Box<dyn Error>>>()?;
let null_mask = null_masks
.into_iter()
.reduce(|acc, mask| acc | mask)
.unwrap_or_else(|| BooleanChunked::full_null("null_mask".into(), df.height()));
Ok(null_mask)
}
fn convert_dates_list_to_date_ranges(
blacklist: Vec<String>,
all_bdates_strs: Vec<String>,
) -> HashMap<String, (String, String)> {
// Step 1: Map every date in all_bdates_strs to its index
let mut all_map: HashMap<String, usize> = HashMap::new();
for (i, d) in all_bdates_strs.iter().enumerate() {
all_map.insert(d.clone(), i);
}
// Step 2: Convert each blacklisted date into its index, if it exists
let mut blacklisted_indices: Vec<usize> = Vec::new();
for dt in blacklist {
if let Some(&idx) = all_map.get(&dt) {
blacklisted_indices.push(idx);
}
}
// Step 3: Sort the blacklisted indices
blacklisted_indices.sort_unstable();
// Step 4: Traverse and group consecutive indices into ranges
let mut result: HashMap<i64, (String, String)> = HashMap::new();
let mut string_result: HashMap<String, (String, String)> = HashMap::new();
if blacklisted_indices.is_empty() {
return string_result;
}
let mut range_idx: i64 = 0;
let mut start_idx = blacklisted_indices[0];
let mut end_idx = start_idx;
for &cur_idx in blacklisted_indices.iter().skip(1) {
if cur_idx == end_idx + 1 {
// We are still in a contiguous run
end_idx = cur_idx;
} else {
// We hit a break in contiguity, so store the last range
result.insert(
range_idx,
(
all_bdates_strs[start_idx].clone(),
all_bdates_strs[end_idx].clone(),
),
);
range_idx += 1;
// Start a new range
start_idx = cur_idx;
end_idx = cur_idx;
}
}
// Don't forget to store the final range after the loop
result.insert(
range_idx,
(
all_bdates_strs[start_idx].clone(),
all_bdates_strs[end_idx].clone(),
),
);
let max_digits = result.keys().max().unwrap_or(&-1).to_string().len();
for (key, (start_date, end_date)) in result.iter() {
let new_key = format!("{:0width$}", key, width = max_digits);
string_result.insert(new_key, (start_date.clone(), end_date.clone()));
}
string_result
}
fn get_vec_of_vec_of_dates_from_df(df: DataFrame) -> Result<Vec<Vec<String>>, Box<dyn Error>> {
let rdt = df
.column("real_dates")?
// .clone()
.as_series()
.unwrap()
.list()?
.into_iter()
.filter_map(|opt| opt)
.collect::<Vec<Series>>()
.iter()
.map(|s| {
s.str()
.unwrap()
.into_iter()
.filter_map(|opt| opt.map(|s| s.to_string()))
.collect::<Vec<String>>()
})
.collect::<Vec<Vec<String>>>();
Ok(rdt)
}
#[allow(dead_code)]
fn get_vec_of_vec_of_naivedates_from_df(
df: DataFrame,
) -> Result<Vec<Vec<NaiveDate>>, Box<dyn Error>> {
let rdt = df
.column("real_dates")?
// .clone()
.as_series()
.unwrap()
.list()?
.into_iter()
.filter_map(|opt| opt)
.collect::<Vec<Series>>()
.iter()
.map(|s| {
s.date()
.unwrap()
.into_iter()
.filter_map(|opt| opt.and_then(|date| NaiveDate::from_num_days_from_ce_opt(date)))
.collect::<Vec<NaiveDate>>()
})
.collect::<Vec<Vec<NaiveDate>>>();
Ok(rdt)
}
// fn get_vec_of_vec_of_dates_from_df(df: DataFrame) -> Result<Vec<Vec<String>>, Box<dyn Error>> {
// let real_dates_column = df.column("real_dates")?.clone();
// let series = real_dates_column.as_series().unwrap().clone();
// let rdt = series.list()?.clone();
// let rdt = rdt
// .into_iter()
// .filter_map(|opt| opt)
// .collect::<Vec<Series>>();
// let rdt = rdt
// .iter()
// .map(|s| {
// s.str()
// .unwrap()
// .into_iter()
// .filter_map(|opt| opt.map(|s| s.to_string()))
// .collect::<Vec<String>>()
// })
// .collect::<Vec<Vec<String>>>();
// Ok(rdt)
// }
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_convert_dates_list_to_date_ranges() {
let all_dates = vec![
"2023-01-01".to_string(),
"2023-01-02".to_string(),
"2023-01-03".to_string(),
"2023-01-04".to_string(),
"2023-01-05".to_string(),
"2023-01-06".to_string(),
];
let blacklist = vec![
"2023-01-02".to_string(),
"2023-01-03".to_string(),
"2023-01-05".to_string(),
];
let result = convert_dates_list_to_date_ranges(blacklist, all_dates);
// Expect two ranges:
// range 0 => ("2023-01-02", "2023-01-03")
// range 1 => ("2023-01-05", "2023-01-05")
assert_eq!(
result["0"],
("2023-01-02".to_string(), "2023-01-03".to_string())
);
assert_eq!(
result["1"],
("2023-01-05".to_string(), "2023-01-05".to_string())
);
}
}

View File

@@ -17,15 +17,14 @@ use std::error::Error;
pub fn check_quantamental_dataframe(df: &DataFrame) -> Result<(), Box<dyn Error>> {
let expected_cols = ["real_date", "cid", "xcat"];
let expected_dtype = [DataType::Date, DataType::String, DataType::String];
let err = "Quantamental DataFrame must have at least 4 columns: 'real_date', 'cid', 'xcat' and one or more metrics.";
for (col, dtype) in expected_cols.iter().zip(expected_dtype.iter()) {
let col = df.column(col);
if col.is_err() {
return Err(format!("{} Column {:?} not found", err, col).into());
return Err(format!("Column {:?} not found", col).into());
}
let col = col?;
if col.dtype() != dtype {
return Err(format!("{} Column {:?} has wrong dtype", err, col).into());
return Err(format!("Column {:?} has wrong dtype", col).into());
}
}
Ok(())

View File

@@ -1,12 +1,11 @@
pub mod blacklist;
pub mod core;
pub mod load;
pub mod pivots;
pub mod reduce_df;
pub mod update_df;
pub mod load;
pub mod reduce_df;
pub mod pivots;
// Re-export submodules for easier access
pub use core::*;
pub use update_df::*;
pub use load::*;
pub use reduce_df::*;
pub use update_df::*;

View File

@@ -30,12 +30,12 @@ pub fn reduce_dataframe(
let df_size = df.shape();
let mut new_df = df.clone();
let ticker_col = get_ticker_column_for_quantamental_dataframe(&new_df)?;
let ticker_col: Column = get_ticker_column_for_quantamental_dataframe(&new_df)?;
// if cids is not provided, get all unique cids
let u_cids = get_unique_cids(&new_df)?;
let u_xcats = get_unique_xcats(&new_df)?;
let u_tickers = _get_unique_strs_from_str_column_object(&ticker_col)?;
let u_cids: Vec<String> = get_unique_cids(&new_df)?;
let u_xcats: Vec<String> = get_unique_xcats(&new_df)?;
let u_tickers: Vec<String> = _get_unique_strs_from_str_column_object(&ticker_col)?;
let cids_vec = cids.unwrap_or_else(|| u_cids.clone());
let specified_cids: Vec<&str> = cids_vec.iter().map(AsRef::as_ref).collect();