msyrs/src/utils/dateutils.rs

241 lines
8.1 KiB
Rust

use chrono::NaiveDate;
use chrono::{Datelike, Weekday};
use polars::prelude::*;
use std::collections::HashMap;
use std::error::Error;
/// Get the minimum and maximum dates from a date column in a DataFrame.
pub fn get_min_max_real_dates(
df: &DataFrame,
date_col: &str,
) -> Result<(NaiveDate, NaiveDate), Box<dyn Error>> {
let date_series = df.column(date_col)?;
if let DataType::Date = date_series.dtype() {
// Convert the `date` series to an i32 (days since 1970-01-01)
let date_as_days = date_series.cast(&DataType::Int32)?;
let min_days = date_as_days.i32()?.min().ok_or("No minimum value found")?;
let max_days = date_as_days.i32()?.max().ok_or("No maximum value found")?;
// Convert the days back to `NaiveDate`
let min_date = NaiveDate::from_ymd_opt(1970, 1, 1)
.unwrap()
.checked_add_signed(chrono::Duration::days(min_days as i64))
.ok_or("Invalid minimum date")?;
let max_date = NaiveDate::from_ymd_opt(1970, 1, 1)
.unwrap()
.checked_add_signed(chrono::Duration::days(max_days as i64))
.ok_or("Invalid maximum date")?;
Ok((min_date, max_date))
} else {
Err(Box::new(polars::error::PolarsError::ComputeError(
"The column is not of Date type".into(),
)))
}
}
/// Get the business dates (Monday through Friday) between two dates.
///
/// Both bounds are inclusive and must be formatted as `%Y-%m-%d`
/// (for example `"2024-01-31"`). When `start_date` is later than `end_date`
/// the returned list is empty.
///
/// # Errors
///
/// Returns an error when either argument cannot be parsed as a
/// `%Y-%m-%d` date.
pub fn get_bdates_list(
    start_date: String,
    end_date: String,
) -> Result<Vec<NaiveDate>, Box<dyn Error>> {
    let start_date = NaiveDate::parse_from_str(&start_date, "%Y-%m-%d")?;
    let end_date = NaiveDate::parse_from_str(&end_date, "%Y-%m-%d")?;

    let mut business_days = Vec::new();
    let mut current_date = start_date;
    while current_date <= end_date {
        // A business day is any day that is not Saturday or Sunday.
        if !matches!(current_date.weekday(), Weekday::Sat | Weekday::Sun) {
            business_days.push(current_date);
        }
        // `succ_opt` only fails on the last representable date. Reaching it
        // means every date up to `end_date` has already been handled, so the
        // walk is complete rather than an error. This also avoids building
        // an error string on every iteration.
        match current_date.succ_opt() {
            Some(next_date) => current_date = next_date,
            None => break,
        }
    }
    Ok(business_days)
}
/// Sampling frequency for business-date series.
///
/// Each variant names a grouping period (day, ISO week, month, quarter,
/// year) together with whether the first or the last business date of the
/// period is kept. The quoted string next to each variant is the code that
/// `BDateFreq::from_str` maps to it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BDateFreq {
    /// `"D"` - every business date.
    Daily,
    /// `"W"` - first business date of each ISO week.
    WeeklyMonday,
    /// `"M"` - first business date of each month.
    MonthStart,
    /// `"Q"` - first business date of each quarter.
    QuarterStart,
    /// `"A"` - first business date of each year.
    YearStart,
    /// `"ME"` - last business date of each month.
    MonthEnd,
    /// `"QE"` - last business date of each quarter.
    QuarterEnd,
    /// `"WF"` - last business date of each ISO week.
    WeeklyFriday,
    /// `"YE"` - last business date of each year.
    YearEnd,
}
impl BDateFreq {
    /// Parse a frequency code into a `BDateFreq`.
    ///
    /// Accepted codes: `"D"`, `"W"`, `"M"`, `"Q"`, `"A"` (start of period)
    /// and `"WF"`, `"ME"`, `"QE"`, `"YE"` (end of period). Matching is exact
    /// and case-sensitive.
    ///
    /// # Errors
    ///
    /// Any other string yields the error `Invalid frequency specified`.
    pub fn from_str(freq: &str) -> Result<Self, Box<dyn Error>> {
        let parsed = match freq {
            "D" => Self::Daily,
            "W" => Self::WeeklyMonday,
            "M" => Self::MonthStart,
            "Q" => Self::QuarterStart,
            "A" => Self::YearStart,
            "ME" => Self::MonthEnd,
            "QE" => Self::QuarterEnd,
            "WF" => Self::WeeklyFriday,
            "YE" => Self::YearEnd,
            _ => return Err("Invalid frequency specified".into()),
        };
        Ok(parsed)
    }

    /// Tell whether this frequency keeps the first or the last date of each
    /// group.
    pub fn agg_type(&self) -> AggregationType {
        // Exhaustive on purpose: a new variant must pick a side explicitly.
        match *self {
            Self::WeeklyFriday | Self::MonthEnd | Self::QuarterEnd | Self::YearEnd => {
                AggregationType::End
            }
            Self::Daily
            | Self::WeeklyMonday
            | Self::MonthStart
            | Self::QuarterStart
            | Self::YearStart => AggregationType::Start,
        }
    }
}
/// Which date of a group is kept when dates are aggregated by period.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AggregationType {
    /// Indicates picking the first date in a group.
    Start,
    /// Indicates picking the last date in a group.
    End,
}
/// Build the string key identifying the period that `d` belongs to under
/// `freq`. Dates sharing a key fall into the same group.
///
/// Key shapes: the date itself (daily), `YYYY-Wnn` using the ISO week and
/// ISO week-year (weekly), `YYYY-Mnn` (monthly), `YYYY-Qn` (quarterly) and
/// `YYYY` (yearly). Start and end variants of a period share one key.
fn compute_group_key(d: NaiveDate, freq: BDateFreq) -> String {
    let year = d.year();
    match freq {
        // Every date forms its own group.
        BDateFreq::Daily => d.to_string(),
        // The ISO week number is paired with the ISO week-year rather than
        // the calendar year.
        BDateFreq::WeeklyMonday | BDateFreq::WeeklyFriday => {
            let week = d.iso_week();
            format!("{}-W{:02}", week.year(), week.week())
        }
        BDateFreq::MonthStart | BDateFreq::MonthEnd => {
            format!("{}-M{:02}", year, d.month())
        }
        // Zero-based months 0..=11 map to quarters 1..=4.
        BDateFreq::QuarterStart | BDateFreq::QuarterEnd => {
            format!("{}-Q{}", year, d.month0() / 3 + 1)
        }
        BDateFreq::YearStart | BDateFreq::YearEnd => year.to_string(),
    }
}
/// Convenience wrapper around `get_bdates_series` that takes the frequency
/// as an optional string code.
///
/// When `freq` is `None` the daily code `"D"` is used. The code is parsed
/// with `BDateFreq::from_str`, whose error is returned for unknown codes.
pub fn get_bdates_series_default(
    start_date: String,
    end_date: String,
    freq: Option<String>,
) -> Result<Series, Box<dyn Error>> {
    let code = freq.as_deref().unwrap_or("D");
    let parsed = BDateFreq::from_str(code)?;
    get_bdates_series(start_date, end_date, parsed)
}
/// Get the business dates between two dates as a Series, sampled at `freq`.
///
/// The business dates from `get_bdates_list` are tagged with a period key
/// from `compute_group_key` and grouped by it. One date per group is kept:
/// the first or the last, as decided by `freq.agg_type()`. The surviving
/// dates are returned sorted in ascending order.
///
/// # Errors
///
/// Propagates date-parsing errors from `get_bdates_list` and any polars
/// error raised while building, aggregating or sorting the frame.
pub fn get_bdates_series(
    start_date: String,
    end_date: String,
    freq: BDateFreq,
) -> Result<Series, Box<dyn Error>> {
    let dates = get_bdates_list(start_date, end_date)?;
    let keys: Vec<String> = dates
        .iter()
        .map(|day| compute_group_key(*day, freq))
        .collect();

    let frame = DataFrame::new(vec![
        Column::new("bdates".into(), dates),
        Column::new("group".into(), keys),
    ])?;

    // Decide which member of each group survives before running the query.
    let pick = match freq.agg_type() {
        AggregationType::Start => col("bdates").first(),
        AggregationType::End => col("bdates").last(),
    };
    let aggregated = frame.lazy().group_by(["group"]).agg([pick]).collect()?;

    let picked = aggregated
        .column("bdates")?
        .as_series()
        .ok_or("Column 'bdates' not found")?
        .clone();

    // Sort ascending so the result does not depend on the order in which
    // the groups came back from the aggregation.
    let ordering = SortOptions {
        descending: false,
        nulls_last: false,
        multithreaded: false,
        maintain_order: false,
    };
    let sorted = picked.sort(ordering)?;
    Ok(sorted)
}
/// Get the business dates from a date column in a DataFrame.
/// Identify business days, bucket them by period, and pick the first available date from each period.
pub fn get_bdates_from_col(date_col: &Series, freq: &str) -> Result<Series, Box<dyn Error>> {
// Ensure the column is of Date type
if date_col.dtype() != &DataType::Date {
return Err("The column is not of Date type".into());
}
// Step 1: Identify business days (exclude weekends)
let date_as_days = date_col.cast(&DataType::Int32)?;
let business_days: Vec<NaiveDate> = date_as_days
.i32()?
.into_iter()
.filter_map(|opt_days| {
opt_days.map(|days| {
NaiveDate::from_ymd_opt(1970, 1, 1).unwrap() + chrono::Duration::days(days as i64)
})
})
.filter(|date| {
// Exclude weekends (Saturday and Sunday)
let weekday = date.weekday();
weekday != Weekday::Sat && weekday != Weekday::Sun
})
.collect();
// Step 2: Bucket dates by period
let mut buckets: HashMap<String, Vec<NaiveDate>> = HashMap::new();
for date in &business_days {
let bucket_key = match freq {
"D" => date.format("%Y-%m-%d").to_string(),
"W" => format!("{}-W{:02}", date.year(), date.iso_week().week()),
"M" => date.format("%Y-%m").to_string(),
"Q" => format!("{}-Q{}", date.year(), (date.month() - 1) / 3 + 1),
"A" => date.year().to_string(),
_ => return Err("Invalid frequency specified".into()),
};
buckets.entry(bucket_key).or_default().push(*date);
}
// Step 3: Pick the first available date from each bucket
let mut selected_dates: Vec<NaiveDate> = Vec::new();
for (_, mut dates) in buckets {
dates.sort(); // Ensure dates are sorted within the bucket
if let Some(first_date) = dates.first() {
selected_dates.push(*first_date);
}
}
// Step 4: Convert selected dates back to a Series of Date type
let bdates_series = Series::new(
"bdates".into(),
selected_dates
.into_iter()
.map(|date| date.format("%Y-%m-%d").to_string()) // Format as strings
.collect::<Vec<String>>(),
)
.cast(&DataType::Date)?; // Cast to Date type
Ok(bdates_series)
}