This commit is contained in:
Palash Tyagi 2024-11-20 01:44:15 +00:00
parent 7ea8aa6dd8
commit fdc5f7d95f
2 changed files with 80 additions and 39 deletions

View File

@ -37,21 +37,11 @@ pub fn is_quantamental_dataframe(df: &DataFrame) -> bool {
check_quantamental_dataframe(df).is_ok() check_quantamental_dataframe(df).is_ok()
} }
/// Sort the columns of a Quantamental DataFrame. pub fn get_sorted_qdf_columns(columns: Vec<String>) -> Vec<String> {
/// The first columns are `real_date`, `cid`, and `xcat`.
/// These are followed by any available JPMAQS metrics, 'value', 'grading', 'eop_lag', 'mop_lag',
/// (**in that order**), followed by any other metrics (in alphabetical order).
pub fn sort_qdf_columns(qdf: &mut DataFrame) -> Result<(), Box<dyn Error>> {
let index_columns = ["real_date", "cid", "xcat"]; let index_columns = ["real_date", "cid", "xcat"];
let known_metrics = ["value", "grading", "eop_lag", "mop_lag"]; let known_metrics = ["value", "grading", "eop_lag", "mop_lag"];
let df_columns = qdf let mut unknown_metrics: Vec<String> = columns
.get_column_names()
.into_iter()
.map(|s| s.clone().into_string())
.collect::<Vec<String>>();
let mut unknown_metrics: Vec<String> = df_columns
.iter() .iter()
.filter(|&m| !known_metrics.contains(&m.as_str())) .filter(|&m| !known_metrics.contains(&m.as_str()))
.filter(|&m| !index_columns.contains(&m.as_str())) .filter(|&m| !index_columns.contains(&m.as_str()))
@ -61,13 +51,30 @@ pub fn sort_qdf_columns(qdf: &mut DataFrame) -> Result<(), Box<dyn Error>> {
let mut new_columns: Vec<String> = vec![]; let mut new_columns: Vec<String> = vec![];
new_columns.extend(index_columns.iter().map(|s| s.to_string())); new_columns.extend(index_columns.iter().map(|s| s.to_string()));
for &colname in &known_metrics { for &colname in &known_metrics {
if df_columns.contains(&colname.into()) { if columns.contains(&colname.into()) {
new_columns.push(colname.to_string()); new_columns.push(colname.to_string());
} }
} }
unknown_metrics.sort(); unknown_metrics.sort();
new_columns.extend(unknown_metrics); new_columns.extend(unknown_metrics);
new_columns
}
/// Sort the columns of a Quantamental DataFrame.
/// The first columns are `real_date`, `cid`, and `xcat`.
/// These are followed by any available JPMAQS metrics, 'value', 'grading', 'eop_lag', 'mop_lag',
/// (**in that order**), followed by any other metrics (in alphabetical order).
pub fn sort_qdf_columns(qdf: &mut DataFrame) -> Result<(), Box<dyn Error>> {
let df_columns = qdf
.get_column_names()
.into_iter()
.map(|s| s.to_string())
.collect::<Vec<String>>();
let new_columns = get_sorted_qdf_columns(df_columns);
*qdf = qdf *qdf = qdf
.select(new_columns.clone()) .select(new_columns.clone())
.expect("Failed to select columns"); .expect("Failed to select columns");
@ -142,21 +149,23 @@ pub fn get_unique_xcats(df: &DataFrame) -> Result<Vec<String>, Box<dyn Error>> {
get_unique_from_str_column(df, "xcat") get_unique_from_str_column(df, "xcat")
} }
/// Lists the metric columns of a Quantamental DataFrame.
///
/// All column names of `df` are passed through [`get_sorted_qdf_columns`], whose
/// output starts with the three index columns `real_date`, `cid` and `xcat`.
/// Those three leading entries are dropped and the remainder is returned, i.e.
/// every column that is not `real_date`, `cid` or `xcat`, in sorted QDF order.
pub fn get_unique_metrics(df: &DataFrame) -> Result<Vec<String>, Box<dyn Error>> {
    let column_names: Vec<String> = df
        .get_column_names()
        .into_iter()
        .map(|name| name.as_str().to_string())
        .collect();

    // The index columns always occupy positions 0..3; keep only what follows them.
    let mut ordered = get_sorted_qdf_columns(column_names);
    Ok(ordered.split_off(3))
}
/// Get the unique dates as a polars Column from a Quantamental DataFrame. /// Get the unique dates as a polars Column from a Quantamental DataFrame.
pub fn get_unique_dates(df: &DataFrame) -> Result<Column, Box<dyn Error>> { pub fn get_unique_dates(df: &DataFrame) -> Result<Column, Box<dyn Error>> {
let date_col = df.column("real_date")?; let date_col = df.column("real_date")?;
let unique_dates = date_col.unique()?.sort(SortOptions::default())?; let unique_dates = date_col.unique()?.sort(SortOptions::default())?;
Ok(unique_dates) Ok(unique_dates)
} }

View File

@ -27,8 +27,8 @@ pub fn pivot_dataframe_by_ticker(
// set metric to the first non-index column // set metric to the first non-index column
metric = df.get_column_names()[3].to_string(); metric = df.get_column_names()[3].to_string();
} }
let unique_dates: Column = get_unique_dates(&df)?;
// let mut new_df = df.clone();
// keep only the index columns and the metric column // keep only the index columns and the metric column
let mut keep_cols = QDF_INDEX_COLUMNS let mut keep_cols = QDF_INDEX_COLUMNS
.to_vec() .to_vec()
@ -37,23 +37,24 @@ pub fn pivot_dataframe_by_ticker(
.collect::<Vec<String>>(); .collect::<Vec<String>>();
keep_cols.push(metric.clone()); keep_cols.push(metric.clone());
return Err("Not implemented".into()); let out_dfs = split_df_by_tickers(&df, Some(vec![metric.clone()]))?;
// create a new dataframe with the unique dates, and iteratively add the metric columns
let mut new_df = DataFrame::new(vec![unique_dates])?;
// new_df = new_df.select(keep_cols)?;
// let ticker_col = get_ticker_column_for_quantamental_dataframe(&new_df)?;
// new_df.with_column(ticker_col)?;
// // drop the cid and xcat columns
// new_df = new_df.drop_many(&["cid".to_string(), "xcat".to_string()]);
// let dates_col = df.column("real_date")?;
// Ok(df)
} }
/// Splits a dataframe by ticker. /// Splits a dataframe by ticker.
#[allow(dead_code)] #[allow(dead_code)]
fn split_df_by_tickers(df: &DataFrame) -> Result<HashMap<String, DataFrame>, Box<dyn Error>> { fn split_df_by_tickers(
df: &DataFrame,
metrics: Option<Vec<String>>,
) -> Result<HashMap<String, DataFrame>, Box<dyn Error>> {
check_quantamental_dataframe(df)?; check_quantamental_dataframe(df)?;
let mut df_outs = HashMap::new(); let mut df_outs = HashMap::new();
@ -66,6 +67,15 @@ fn split_df_by_tickers(df: &DataFrame) -> Result<HashMap<String, DataFrame>, Box
icids.push(cid); icids.push(cid);
ixcats.push(xcat); ixcats.push(xcat);
} }
let metrics = metrics.unwrap_or_else(|| get_unique_metrics(df).unwrap());
// let keep_cols:Vec<String> = QDF_INDEX_COLUMNS + metrics;
let mut keep_cols = QDF_INDEX_COLUMNS
.to_vec()
.iter()
.map(|s| s.to_string())
.collect::<Vec<String>>();
keep_cols.extend(metrics);
for (cid, xcat) in icids.iter().zip(ixcats.iter()) { for (cid, xcat) in icids.iter().zip(ixcats.iter()) {
// Apply filter while borrowing df and avoid moving ownership. // Apply filter while borrowing df and avoid moving ownership.
@ -74,17 +84,39 @@ fn split_df_by_tickers(df: &DataFrame) -> Result<HashMap<String, DataFrame>, Box
.and(col("xcat").eq(lit(xcat.clone()))); .and(col("xcat").eq(lit(xcat.clone())));
let df_out = df.clone().lazy().filter(filter).collect()?; let df_out = df.clone().lazy().filter(filter).collect()?;
// select keep_cols
let df_out = df_out.select(keep_cols.clone())?;
df_outs.insert(format!("{}_{}", cid, xcat), df_out); df_outs.insert(format!("{}_{}", cid, xcat), df_out);
} }
Ok(df_outs) Ok(df_outs)
} }
fn single_ticker_qdf_to_timeseries(df: &DataFrame) -> Result<Vec<DataFrame>, Box<dyn Error>> { fn single_ticker_qdf_to_timeseries(mut df: DataFrame) -> Result<Vec<DataFrame>, Box<dyn Error>> {
let mut df_vec = Vec::new(); let mut df_vec = Vec::new();
// copy the date col // Since we own `df`, we can remove the "real_date" column, consuming it.
// let date_col = let date_col = df.drop_in_place("real_date")?;
Ok(vec![df.to_owned()]) // Assuming `get_unique_metrics` only needs a reference to `df`
let metrics = get_unique_metrics(&df)?;
for metric in metrics.iter() {
// Remove the metric column from `df`, consuming it.
let metric_col = df.drop_in_place(metric)?;
// Create a new DataFrame with the date column and the metric column.
let new_df = DataFrame::new(vec![
date_col.clone(), // Clone because `date_col` is used multiple times
metric_col, // Move the metric column (no clone needed)
])?;
df_vec.push(new_df);
}
// At this point the "real_date" column and every metric column have been moved out of `df`.
// `df` was passed by value, so this function consumes it regardless of what is left in it;
// a caller that still needs the original DataFrame must pass in a clone.
Ok(df_vec)
} }