From f84879119b88d0754aa77291c130d869040d6045 Mon Sep 17 00:00:00 2001
From: Palash Tyagi <23239946+Magnus167@users.noreply.github.com>
Date: Thu, 10 Apr 2025 00:33:28 +0100
Subject: [PATCH] updating linear composites notebook

---
 notebooks/funcwise/linear_composites.ipynb | 395 ++++++++-------------
 1 file changed, 144 insertions(+), 251 deletions(-)
diff --git a/notebooks/funcwise/linear_composites.ipynb b/notebooks/funcwise/linear_composites.ipynb
index 86feb2f..8f6522f 100644
--- a/notebooks/funcwise/linear_composites.ipynb
+++ b/notebooks/funcwise/linear_composites.ipynb
@@ -9,34 +9,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\u001b[2mUsing Python 3.12.4 environment at: E:\\Work\\ruzt\\msyrs\\.venv\u001b[0m\n",
-      "\u001b[2mResolved \u001b[1m34 packages\u001b[0m \u001b[2min 121ms\u001b[0m\u001b[0m\n",
-      "   \u001b[36m\u001b[1mBuilding\u001b[0m\u001b[39m msyrs\u001b[2m @ file:///E:/Work/ruzt/msyrs\u001b[0m\n",
-      "      \u001b[32m\u001b[1mBuilt\u001b[0m\u001b[39m msyrs\u001b[2m @ file:///E:/Work/ruzt/msyrs\u001b[0m\n",
-      "\u001b[2mPrepared \u001b[1m1 package\u001b[0m \u001b[2min 14.72s\u001b[0m\u001b[0m\n",
-      "\u001b[2mUninstalled \u001b[1m1 package\u001b[0m \u001b[2min 4ms\u001b[0m\u001b[0m\n",
-      "\u001b[1m\u001b[33mwarning\u001b[39m\u001b[0m\u001b[1m:\u001b[0m \u001b[1mFailed to hardlink files; falling back to full copy. This may lead to degraded performance.\n",
-      "         If the cache and target directories are on different filesystems, hardlinking may not be supported.\n",
-      "         If this is intentional, set `export UV_LINK_MODE=copy` or use `--link-mode=copy` to suppress this warning.\u001b[0m\n",
-      "\u001b[2mInstalled \u001b[1m1 package\u001b[0m \u001b[2min 30ms\u001b[0m\u001b[0m\n",
-      " \u001b[33m~\u001b[39m \u001b[1mmsyrs\u001b[0m\u001b[2m==0.0.1 (from file:///E:/Work/ruzt/msyrs)\u001b[0m\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "! uv pip install E:\\Work\\ruzt\\msyrs --upgrade"
+    "# ! uv pip install E:\\Work\\ruzt\\msyrs --upgrade"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -44,10 +26,11 @@
     "import pandas as pd\n",
     "import numpy as np\n",
     "import polars as pl\n",
+    "import time\n",
     "import os\n",
     "\n",
     "from macrosynergy.panel import view_timelines\n",
-    "from macrosynergy.management.types import QuantamentalDataFrame\n"
+    "from macrosynergy.management.types import QuantamentalDataFrame"
    ]
   },
   {
@@ -59,7 +42,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -68,83 +51,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "DATA_FOLDER_PATH = \"E:/Work/jpmaqs-data\"\n",
-    "# DATA_FOLDER_PATH = \"C:/Users/PalashTyagi/Code/go-dataquery/jpmaqs-data\"\n",
-    "DQ_CLIENT_ID = os.getenv(\"DQ_CLIENT_ID\")\n",
-    "DQ_CLIENT_SECRET = os.getenv(\"DQ_CLIENT_SECRET\")"
+    "DATA_FOLDER_PATH = \"E:/Work/jpmaqs-data\""
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import time\n",
-    "\n",
-    "nb_start_time = time.time()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Time taken to load qdf: 0.004000425338745117\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div><style>\n",
-       ".dataframe > thead > tr,\n",
-       ".dataframe > tbody > tr {\n",
-       "  text-align: right;\n",
-       "  white-space: pre-wrap;\n",
-       "}\n",
-       "</style>\n",
-       "<small>shape: (5, 7)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>real_date</th><th>cid</th><th>xcat</th><th>value</th><th>grading</th><th>eop_lag</th><th>mop_lag</th></tr><tr><td>date</td><td>str</td><td>str</td><td>f64</td><td>f64</td><td>i64</td><td>i64</td></tr></thead><tbody><tr><td>2010-03-03</td><td>&quot;USD&quot;</td><td>&quot;ADPEMPL_SA_P1M1ML1&quot;</td><td>-0.173806</td><td>3.0</td><td>3</td><td>33</td></tr><tr><td>2010-03-04</td><td>&quot;USD&quot;</td><td>&quot;ADPEMPL_SA_P1M1ML1&quot;</td><td>-0.173806</td><td>3.0</td><td>4</td><td>34</td></tr><tr><td>2010-03-05</td><td>&quot;USD&quot;</td><td>&quot;ADPEMPL_SA_P1M1ML1&quot;</td><td>-0.173806</td><td>3.0</td><td>5</td><td>35</td></tr><tr><td>2010-03-08</td><td>&quot;USD&quot;</td><td>&quot;ADPEMPL_SA_P1M1ML1&quot;</td><td>-0.173806</td><td>3.0</td><td>8</td><td>38</td></tr><tr><td>2010-03-09</td><td>&quot;USD&quot;</td><td>&quot;ADPEMPL_SA_P1M1ML1&quot;</td><td>-0.173806</td><td>3.0</td><td>9</td><td>39</td></tr></tbody></table></div>"
-      ],
-      "text/plain": [
-       "shape: (5, 7)\n",
-       "┌────────────┬─────┬────────────────────┬───────────┬─────────┬─────────┬─────────┐\n",
-       "│ real_date  ┆ cid ┆ xcat               ┆ value     ┆ grading ┆ eop_lag ┆ mop_lag │\n",
-       "│ ---        ┆ --- ┆ ---                ┆ ---       ┆ ---     ┆ ---     ┆ ---     │\n",
-       "│ date       ┆ str ┆ str                ┆ f64       ┆ f64     ┆ i64     ┆ i64     │\n",
-       "╞════════════╪═════╪════════════════════╪═══════════╪═════════╪═════════╪═════════╡\n",
-       "│ 2010-03-03 ┆ USD ┆ ADPEMPL_SA_P1M1ML1 ┆ -0.173806 ┆ 3.0     ┆ 3       ┆ 33      │\n",
-       "│ 2010-03-04 ┆ USD ┆ ADPEMPL_SA_P1M1ML1 ┆ -0.173806 ┆ 3.0     ┆ 4       ┆ 34      │\n",
-       "│ 2010-03-05 ┆ USD ┆ ADPEMPL_SA_P1M1ML1 ┆ -0.173806 ┆ 3.0     ┆ 5       ┆ 35      │\n",
-       "│ 2010-03-08 ┆ USD ┆ ADPEMPL_SA_P1M1ML1 ┆ -0.173806 ┆ 3.0     ┆ 8       ┆ 38      │\n",
-       "│ 2010-03-09 ┆ USD ┆ ADPEMPL_SA_P1M1ML1 ┆ -0.173806 ┆ 3.0     ┆ 9       ┆ 39      │\n",
-       "└────────────┴─────┴────────────────────┴───────────┴─────────┴─────────┴─────────┘"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "dfpath = f\"{DATA_FOLDER_PATH}/data/ADPEMPL_SA_P1M1ML1/USD_ADPEMPL_SA_P1M1ML1.csv\"\n",
-    "\n",
-    "starttime = time.time()\n",
-    "ldf: pl.DataFrame = msyrs.qdf.load_qdf(dfpath)\n",
-    "print(f\"Time taken to load qdf: {time.time() - starttime}\")\n",
-    "ldf.head(5)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -162,22 +78,18 @@
     ")\n",
     "xcats = ecos + mkts\n",
     "\n",
+    "cpi_xcats = \"CPIC_SA_P1M1ML12.CPIC_SJA_P3M3ML3AR.CPIC_SJA_P6M6ML6AR.CPIH_SA_P1M1ML12.CPIH_SJA_P3M3ML3AR.CPIH_SJA_P6M6ML6AR\".split(\n",
+    "    \".\"\n",
+    ")\n",
+    "\n",
     "tickers = [f\"{c}_{x}\" for c in cids for x in xcats]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Time taken to load qdf batch: 1.3058679103851318\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "starttime = time.time()\n",
     "\n",
@@ -185,52 +97,41 @@
     "    folder_path=DATA_FOLDER_PATH,\n",
     "    xcats=xcats,\n",
     ")\n",
-    "print(f\"Time taken to load qdf batch: {time.time() - starttime}\")\n"
+    "print(f\"Time taken to load qdf batch: {time.time() - starttime}\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "286.69339656829834"
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "big_df.estimated_size(\"mb\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "sel_cids = [\"USD\", \"EUR\", \"GBP\", \"AUD\", \"CAD\"]\n",
-    "start = \"1990-01-01\""
+    "start = \"1990-01-01\"\n",
+    "nb_start_time = time.time()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Running with uniform weights, 2 xcats, 2 cids\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Time taken to reduce qdf: 0.9705278873443604\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "fx_xcats = [xc for xc in xcats if xc.startswith(\"FX\")]\n",
     "eq_xcats = [xc for xc in xcats if xc.startswith(\"EQ\")]\n",
@@ -239,7 +140,7 @@
     "eq_df = msyrs.qdf.reduce_dataframe(\n",
     "    df=big_df,\n",
     "    cids=sel_cids,\n",
-    "    xcats=fx_xcats + eq_xcats,\n",
+    "    xcats=fx_xcats + eq_xcats + cpi_xcats,\n",
     "    start=start,\n",
     ")\n",
     "\n",
@@ -248,133 +149,107 @@
     ")\n",
     "new_df: pl.DataFrame = msyrs.qdf.update_dataframe(df=eq_df, df_add=fx_df)\n",
     "\n",
-    "print(f\"Time taken to reduce qdf: {time.time() - starttime}\")\n"
+    "print(f\"Time taken to reduce qdf: {time.time() - starttime}\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Time taken: 2.3365907669067383 seconds\n"
-     ]
-    }
-   ],
-   "source": [
-    "end_time = time.time()\n",
-    "print(f\"Time taken: {end_time - nb_start_time} seconds\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "e:\\Work\\ruzt\\msyrs\\.venv\\Lib\\site-packages\\macrosynergy\\panel\\linear_composite.py:437: UserWarning: USD does not have complete xcat data for ['FXXR_NSA']. These will be filled with NaNs for the calculation.\n",
-      "  warnings.warn(wrn_msg.format(cidx=cidx, missing_xcats=missing_xcats))\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "_cids = [\"USD\", \"CAD\"]\n",
+    "\n",
+    "_df = new_df.to_pandas()\n",
+    "starttime = time.time()\n",
+    "\n",
+    "\n",
     "mx = macrosynergy.panel.linear_composite(\n",
-    "    df=new_df.to_pandas(),\n",
-    "    xcats=[\"EQXR_NSA\", \"FXXR_NSA\"], \n",
+    "\n",
+    "    df=_df,\n",
+    "\n",
+    "    xcats=[\"EQXR_NSA\", \"FXXR_NSA\"],\n",
     "    cids=_cids,\n",
+    "\n",
     "    weights=None,\n",
+    "\n",
     "    signs=None,\n",
+    "\n",
     "    normalize_weights=False,\n",
     "    start=None,\n",
     "    end=None,\n",
+    "\n",
     "    blacklist=None,\n",
+    "\n",
     "    complete_xcats=False,\n",
+    "\n",
     "    complete_cids=False,\n",
+    "\n",
     "    new_xcat=\"COMPOSITE\",\n",
+    "\n",
     "    new_cid=\"GLB\",\n",
+    "\n",
     ")\n",
+    "print(f\"Time taken to run linear composite: {time.time() - starttime}\")\n",
+    "\n",
+    "\n",
     "# view_timelines(QuantamentalDataFrame(mx), cids=_cids)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "_cids = [\"USD\", \"CAD\"]\n",
+    "starttime = time.time()\n",
+    "\n",
     "x = msyrs.panel.linear_composite(\n",
+    "\n",
     "    df=new_df,\n",
+    "\n",
     "    xcats=[\"EQXR_NSA\", \"FXXR_NSA\"],\n",
     "    cids=_cids,\n",
+    "\n",
     "    weights=None,\n",
+    "\n",
     "    signs=None,\n",
+    "\n",
     "    weight_xcats=None,\n",
+    "\n",
     "    normalize_weights=False,\n",
     "    start=None,\n",
     "    end=None,\n",
+    "\n",
     "    blacklist=None,\n",
+    "\n",
     "    complete_xcats=False,\n",
+    "\n",
     "    complete_cids=False,\n",
+    "\n",
     "    new_xcat=\"COMPOSITE\",\n",
+    "\n",
     "    new_cid=\"GLB\",\n",
+    "\n",
     ")\n",
+    "print(f\"Time taken to run linear composite rs: {time.time() - starttime}\")\n",
+    "\n",
     "# view_timelines(QuantamentalDataFrame(x.to_pandas()), cids=_cids)"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 15,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "True"
-      ]
-     },
-     "execution_count": 15,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
    "source": [
-    "mwide = QuantamentalDataFrame(mx).to_wide().sort_index()\n",
-    "rwide = QuantamentalDataFrame(x.to_pandas()).to_wide().sort_index()\n",
-    "np.allclose((mwide - rwide).sum(axis=1), 0)"
+    "### Running with variable weights\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "e:\\Work\\ruzt\\msyrs\\.venv\\Lib\\site-packages\\macrosynergy\\panel\\linear_composite.py:437: UserWarning: USD does not have complete xcat data for ['FXXR_NSA']. These will be filled with NaNs for the calculation.\n",
-      "  warnings.warn(wrn_msg.format(cidx=cidx, missing_xcats=missing_xcats))\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "True"
-      ]
-     },
-     "execution_count": 16,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "_cids = [\"USD\", \"CAD\", \"EUR\", \"AUD\"]\n",
     "_xcats = [\"EQXR_NSA\", \"FXXR_NSA\"]\n",
@@ -397,72 +272,90 @@
     "    new_xcat=\"COMPOSITE\",\n",
     "    new_cid=\"GLB\",\n",
     ")\n",
+    "view_timelines(QuantamentalDataFrame(mx), cids=_cids)\n",
     "mwide = QuantamentalDataFrame(mx).to_wide().sort_index()\n",
     "rwide = QuantamentalDataFrame(x.to_pandas()).to_wide().sort_index()\n",
     "np.allclose((mwide - rwide).sum(axis=1), 0)"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 17,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>real_date</th>\n",
-       "      <th>value</th>\n",
-       "      <th>cid</th>\n",
-       "      <th>xcat</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "Empty DataFrame\n",
-       "Columns: [real_date, value, cid, xcat]\n",
-       "Index: []"
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
    "source": [
-    "\n",
+    "### Running with variable weights, normalized\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "x = msyrs.panel.linear_composite(\n",
     "    df=new_df,\n",
-    "    xcats=_xcats,\n",
+    "    xcats=cpi_xcats,\n",
     "    cids=_cids,\n",
-    "    weights=[1, 9],\n",
+    "    weights=list(range(1, len(cpi_xcats) + 1)),\n",
     "    normalize_weights=True,\n",
     "    new_xcat=\"COMPOSITE\",\n",
     "    new_cid=\"GLB\",\n",
     ")\n",
     "x.to_pandas().dropna(how=\"any\")"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "view_timelines(x.to_pandas().dropna(how=\"any\"), cids=_cids)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mx = macrosynergy.panel.linear_composite(\n",
+    "    df=new_df.to_pandas(),\n",
+    "    xcats=cpi_xcats,\n",
+    "    cids=_cids,\n",
+    "    weights=list(range(1, len(cpi_xcats) + 1)),\n",
+    "    normalize_weights=True,\n",
+    "    new_xcat=\"COMPOSITE\",\n",
+    "    new_cid=\"GLB\",\n",
+    ")\n",
+    "view_timelines(mx.dropna(how=\"any\"), cids=_cids)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mwide = QuantamentalDataFrame(mx).to_wide().sort_index()\n",
+    "rwide = QuantamentalDataFrame(x.to_pandas()).to_wide().sort_index()\n",
+    "np.allclose((mwide - rwide).sum(axis=1), 0)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Running with categorical weights, normalized (TODO: not implemented yet)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raise NotImplementedError(\"Not implemented yet\")"
+   ]
   }
  ],
  "metadata": {

real_date	cid	xcat	value	grading	eop_lag	mop_lag
date	str	str	f64	f64	i64	i64
2010-03-03	"USD"	"ADPEMPL_SA_P1M1ML1"	-0.173806	3.0	3	33
2010-03-04	"USD"	"ADPEMPL_SA_P1M1ML1"	-0.173806	3.0	4	34
2010-03-05	"USD"	"ADPEMPL_SA_P1M1ML1"	-0.173806	3.0	5	35
2010-03-08	"USD"	"ADPEMPL_SA_P1M1ML1"	-0.173806	3.0	8	38
2010-03-09	"USD"	"ADPEMPL_SA_P1M1ML1"	-0.173806	3.0	9	39