From f9ccad855f66f4205737243df7795f43b3abc903 Mon Sep 17 00:00:00 2001 From: Palash Tyagi <23239946+Magnus167@users.noreply.github.com> Date: Fri, 22 Nov 2024 17:17:37 +0000 Subject: [PATCH] add comparisions to macrosynergy package --- notebooks/python-notebook.ipynb | 415 +++++++++++++++++++++++--------- 1 file changed, 304 insertions(+), 111 deletions(-) diff --git a/notebooks/python-notebook.ipynb b/notebooks/python-notebook.ipynb index 8942e28..8335b3c 100644 --- a/notebooks/python-notebook.ipynb +++ b/notebooks/python-notebook.ipynb @@ -8,10 +8,8 @@ "\n", "First patch `pyo3-polars`:\n", "\n", - "\n", "- Use [this diff](https://github.com/pola-rs/pyo3-polars/compare/main...Magnus167:pyo3-polars:main) to make changes to the `pyo3-polars` package.\n", "\n", - "\n", "Install the package:\n", "\n", "```bash\n", @@ -81,7 +79,8 @@ "outputs": [], "source": [ "import time\n", - "startime = time.time()" + "\n", + "nb_start_time = time.time()" ] }, { @@ -89,6 +88,13 @@ "execution_count": 5, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken to load qdf: 0.007575511932373047\n" + ] + }, { "data": { "text/html": [ @@ -124,8 +130,9 @@ "source": [ "dfpath = f\"{DATA_FOLDER_PATH}/data/ADPEMPL_SA_P1M1ML1/USD_ADPEMPL_SA_P1M1ML1.csv\"\n", "\n", - "\n", + "starttime = time.time()\n", "ldf: pl.DataFrame = msyrs.qdf.load_qdf(dfpath)\n", + "print(f\"Time taken to load qdf: {time.time() - starttime}\")\n", "ldf.head(5)" ] }, @@ -171,6 +178,57 @@ "execution_count": 8, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading the JPMAQS catalogue from DataQuery...\n", + "Downloaded JPMAQS catalogue with 18711 tickers.\n", + "Removed 21/600 expressions that are not in the JPMaQS catalogue.\n", + "Downloading data from JPMaQS.\n", + "Timestamp UTC: 2024-11-22 17:13:07\n", + "Connection successful!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Requesting data: 100%|██████████| 29/29 [00:05<00:00, 4.91it/s]\n", + "Downloading data: 100%|██████████| 29/29 [00:22<00:00, 1.26it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Some dates are missing from the downloaded data. \n", + "2 out of 9107 dates are missing.\n" + ] + } + ], + "source": [ + "pddf = macrosynergy.download.JPMaQSDownload().download(\n", + " tickers=tickers,\n", + " get_catalogue=True,\n", + " show_progress=True,\n", + " start_date=\"1990-01-01\",\n", + ")\n", + "pddf = macrosynergy.management.types.QuantamentalDataFrame(pddf)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken to load qdf batch: 1.8986454010009766\n" + ] + }, { "data": { "text/html": [ @@ -198,37 +256,26 @@ "└────────────┴─────┴──────────────────┴──────────┴─────────┴─────────┴─────────┘" ] }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "big_df: pl.DataFrame = msyrs.qdf.load_qdf_from_download_bank(\n", - " folder_path=DATA_FOLDER_PATH, xcats=xcats\n", - " # folder_path=DATA_FOLDER_PATH, cids=cids\n", - ")\n", - "big_df.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "277.8842191696167" - ] - }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "big_df.estimated_size(\"mb\")" + "starttime = time.time()\n", + "\n", + "big_df: pl.DataFrame = msyrs.qdf.load_qdf_from_download_bank(\n", + "\n", + " folder_path=DATA_FOLDER_PATH,\n", + " xcats=xcats,\n", + "\n", + " # folder_path=DATA_FOLDER_PATH, cids=cids\n", + "\n", + ")\n", + "print(f\"Time taken to load qdf batch: {time.time() - starttime}\")\n", + "\n", + "\n", + "big_df.head(5)" ] }, { @@ -239,7 +286,7 @@ { "data": { "text/plain": [ - "877.1654348373413" + "275.89989376068115" ] }, "execution_count": 10, @@ -248,7 +295,7 @@ } ], "source": [ - "big_df.to_pandas().memory_usage(deep=True).sum() / 1024**2" + "big_df.estimated_size(\"mb\")" ] }, { @@ -259,7 +306,7 @@ { "data": { "text/plain": [ - "213.30906867980957" + "871.0723962783813" ] }, "execution_count": 11, @@ -267,6 +314,26 @@ "output_type": "execute_result" } ], + "source": [ + "big_df.to_pandas().memory_usage(deep=True).sum() / 1024**2" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "211.8466453552246" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "macrosynergy.management.types.QuantamentalDataFrame(big_df.to_pandas()).memory_usage(\n", " deep=True\n", @@ -275,7 +342,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -285,9 +352,16 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken to reduce qdf: 0.34674978256225586\n" + ] + }, { "data": { "text/html": [ @@ -321,38 +395,97 @@ "└────────────┴─────┴───────────┴───────────┴─────────┴─────────┴─────────┘" ] }, - "execution_count": 13, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "starttime = time.time()\n", "eq_df = msyrs.qdf.reduce_dataframe(\n", " df=big_df,\n", " cids=sel_cids,\n", " xcats=[\"EQXR_NSA\", \"EQXR_VT10\"],\n", + "\n", " start=start,\n", ")\n", + "print(f\"Time taken to reduce qdf: {time.time() - starttime}\")\n", "eq_df" ] }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "fx_xcats = [xc for xc in xcats if xc.startswith(\"FX\")]\n", - "fx_df = msyrs.qdf.reduce_dataframe(\n", - " df=big_df, cids=sel_cids, start=start, xcats=fx_xcats, intersect=True\n", - ")" - ] - }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken to reduce qdf: 0.13223624229431152\n" + ] + } + ], + "source": [ + "starttime = time.time()\n", + "eq_pd_df = pddf.reduce_df(cids=sel_cids, xcats=[\"EQXR_NSA\", \"EQXR_VT10\"], start=start)\n", + "print(f\"Time taken to reduce qdf: {time.time() - starttime}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken to reduce qdf: 0.3902719020843506\n" + ] + } + ], + "source": [ + "fx_xcats = [xc for xc in xcats if xc.startswith(\"FX\")]\n", + "starttime = time.time()\n", + "\n", + "fx_df = msyrs.qdf.reduce_dataframe(\n", + " df=big_df, cids=sel_cids, start=start, xcats=fx_xcats, intersect=True\n", + ")\n", + "print(f\"Time taken to reduce qdf: {time.time() - starttime}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken to reduce qdf: 0.171736478805542\n" + ] + } + ], + "source": [ + "starttime = time.time()\n", + "fx_pd_df = pddf.reduce_df(cids=sel_cids, xcats=fx_xcats, start=start, intersect=True)\n", + "print(f\"Time taken to reduce qdf: {time.time() - starttime}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken: 0.024325132369995117\n" + ] + }, { "data": { "text/html": [ @@ -385,20 +518,40 @@ "└────────────┴─────┴──────────┴───────────┴─────────┴─────────┴─────────┘" ] }, - "execution_count": 15, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "starttime = time.time()\n", "new_df: pl.DataFrame = msyrs.qdf.update_dataframe(df=eq_df, df_add=fx_df)\n", - "\n", + "print(\"Time taken: \", time.time() - starttime)\n", "new_df.head(10)" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken: 0.8326597213745117\n" + ] + } + ], + "source": [ + "starttime = time.time()\n", + "new_pd_df = pddf.update_df(df_add=eq_pd_df,)\n", + "print(\"Time taken: \", time.time() - starttime)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -433,7 +586,7 @@ "└────────────┴─────┴───────────┴───────────┴─────────┴─────────┴─────────┘" ] }, - "execution_count": 16, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -444,77 +597,117 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (10, 31)
real_dateEUR_EQXR_NSAEUR_FXXR_NSAEUR_FXXR_VT10GBP_FXCRR_NSACAD_FXCRR_NSAGBP_FXTARGETED_NSACAD_FXTARGETED_NSAAUD_FXXR_VT10AUD_EQXR_VT10EUR_FXTARGETED_NSAUSD_EQXR_NSAEUR_FXUNTRADABLE_NSAAUD_EQXR_NSAAUD_FXCRR_NSAUSD_EQXR_VT10CAD_FXXR_NSACAD_EQXR_NSAGBP_EQXR_NSAEUR_FXCRR_NSACAD_FXXR_VT10GBP_FXXR_VT10GBP_FXUNTRADABLE_NSAAUD_FXTARGETED_NSACAD_FXUNTRADABLE_NSAGBP_FXXR_NSAAUD_FXUNTRADABLE_NSAGBP_EQXR_VT10EUR_EQXR_VT10AUD_FXXR_NSACAD_EQXR_VT10
datef64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64
1999-11-151.280812-0.044278-0.047169nullnull0.00.00.162526null0.00.0998860.0nullnull0.042075-0.169111.592920.566529null-0.3992820.7376430.00.00.00.398630.00.2613010.6079960.1471080.782683
1999-11-160.5965160.0673580.071755nullnull0.00.0-0.650292null0.01.9957230.0nullnull0.8406530.100898-0.7839720.746041null0.238229-0.5873430.00.00.0-0.3174070.00.3440980.283164-0.588599-0.385205
1999-11-170.7352940.8608970.917107nullnull0.00.0-0.413651null0.0-0.9084560.0nullnull-0.382666-0.368379-0.054873-0.740517null-0.869773-0.0158750.00.00.0-0.0085790.0-0.341550.349041-0.374409-0.026962
1999-11-181.012479-1.147048-1.221942nullnull0.00.0-0.107377null0.00.7475320.0nullnull0.3148810.2154590.6149120.045676null0.508717-0.0401240.00.00.0-0.0216830.00.0210670.48062-0.0971910.302137
1999-11-190.349650.0772410.082284nullnull0.00.00.067403null0.0-0.0769980.0nullnull-0.0324340.2678140.622067-0.989195null0.632330.4658990.00.00.00.2517770.0-0.4562490.1659770.0610080.305653
1999-11-22-2.0905920.2227420.237285nullnull0.00.0-0.623692null0.0-0.0210160.0nullnull-0.008852-0.029578-0.824295-0.814633null-0.069836-0.0774270.00.00.0-0.0418420.0-0.375735-0.992395-0.564523-0.405018
1999-11-230.54567-0.484609-0.51625nullnull0.00.0-0.1249null0.0-1.0650220.0nullnull-0.448616-0.2349971.0498691.62715null-0.5548470.9074040.00.00.00.4903710.00.7504940.259027-0.1130510.515854
1999-11-240.0-0.846988-0.902291nullnull0.00.0-0.53942null0.00.6728050.0nullnull0.283403-0.0030230.7575760.465081null-0.007137-0.8668310.00.00.0-0.4684450.00.214510.0-0.4882460.372235
1999-11-260.780712-0.353428-0.376504nullnull0.00.0-0.279707null0.0-0.4854030.0nullnull-0.204465-0.1371210.297556-0.044776null-0.323755-0.5539510.00.00.0-0.2993620.0-0.0206520.3706-0.2531720.146204
1999-11-29-0.615174-0.696737-0.742228nullnull0.00.00.224152null0.0-0.2969040.0nullnull-0.125064-0.423601-1.9707570.029864null-1.0001570.5164820.00.00.00.2791130.00.013774-0.2920210.202887-0.968333
" - ], - "text/plain": [ - "shape: (10, 31)\n", - "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n", - "│ real_date ┆ EUR_EQXR_ ┆ EUR_FXXR_ ┆ EUR_FXXR_ ┆ … ┆ GBP_EQXR_ ┆ EUR_EQXR_ ┆ AUD_FXXR_ ┆ CAD_EQXR │\n", - "│ --- ┆ NSA ┆ NSA ┆ VT10 ┆ ┆ VT10 ┆ VT10 ┆ NSA ┆ _VT10 │\n", - "│ date ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", - "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n", - "│ 1999-11-1 ┆ 1.280812 ┆ -0.044278 ┆ -0.047169 ┆ … ┆ 0.261301 ┆ 0.607996 ┆ 0.147108 ┆ 0.782683 │\n", - "│ 5 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 1999-11-1 ┆ 0.596516 ┆ 0.067358 ┆ 0.071755 ┆ … ┆ 0.344098 ┆ 0.283164 ┆ -0.588599 ┆ -0.38520 │\n", - "│ 6 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 5 │\n", - "│ 1999-11-1 ┆ 0.735294 ┆ 0.860897 ┆ 0.917107 ┆ … ┆ -0.34155 ┆ 0.349041 ┆ -0.374409 ┆ -0.02696 │\n", - "│ 7 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 2 │\n", - "│ 1999-11-1 ┆ 1.012479 ┆ -1.147048 ┆ -1.221942 ┆ … ┆ 0.021067 ┆ 0.48062 ┆ -0.097191 ┆ 0.302137 │\n", - "│ 8 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 1999-11-1 ┆ 0.34965 ┆ 0.077241 ┆ 0.082284 ┆ … ┆ -0.456249 ┆ 0.165977 ┆ 0.061008 ┆ 0.305653 │\n", - "│ 9 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 1999-11-2 ┆ -2.090592 ┆ 0.222742 ┆ 0.237285 ┆ … ┆ -0.375735 ┆ -0.992395 ┆ -0.564523 ┆ -0.40501 │\n", - "│ 2 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 8 │\n", - "│ 1999-11-2 ┆ 0.54567 ┆ -0.484609 ┆ -0.51625 ┆ … ┆ 0.750494 ┆ 0.259027 ┆ -0.113051 ┆ 0.515854 │\n", - "│ 3 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 1999-11-2 ┆ 0.0 ┆ -0.846988 ┆ -0.902291 ┆ … ┆ 0.21451 ┆ 0.0 ┆ -0.488246 ┆ 0.372235 │\n", - "│ 4 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 1999-11-2 ┆ 0.780712 ┆ -0.353428 ┆ -0.376504 ┆ … ┆ -0.020652 ┆ 0.3706 ┆ -0.253172 ┆ 0.146204 │\n", - "│ 6 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 1999-11-2 ┆ -0.615174 ┆ -0.696737 ┆ -0.742228 ┆ … ┆ 0.013774 ┆ -0.292021 ┆ 0.202887 ┆ -0.96833 │\n", - "│ 9 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ 3 │\n", - "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "msyrs.qdf.pivot_dataframe_by_ticker(df=new_df).head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Time taken: 5.683506011962891 seconds\n" + "Time taken: 0.0014519691467285156\n", + "Time taken: 0.0\n" + ] + } + ], + "source": [ + "# df: polars::prelude::DataFrame,\n", + "# xcat: String,\n", + "# cids: Option>,\n", + "# lback_periods: Option,\n", + "# lback_method: Option,\n", + "# half_life: Option,\n", + "# start: Option,\n", + "# end: Option,\n", + "# est_freq: Option,\n", + "# remove_zeros: Option,\n", + "# postfix: Option,\n", + "# nan_tolerance: Option,\n", + "\n", + "starttime = time.time()\n", + "hv = msyrs.panel.historic_vol(\n", + " df=new_df,\n", + " xcat=\"EQXR_NSA\",\n", + " cids=None,\n", + " lback_periods=252,\n", + " lback_method=\"calendar\",\n", + " half_life=None,\n", + " start=None,\n", + " end=None,\n", + " est_freq=None,\n", + " remove_zeros=None,\n", + " postfix=\"_HV\",\n", + " nan_tolerance=None,\n", + ")\n", + "print(f\"Time taken: {time.time() - starttime}\")\n", + "\n", + "starttime = time.time()\n", + "a = 1 + 5\n", + "print(\"Time taken: \", time.time() - starttime)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken: 0.054981231689453125\n" + ] + } + ], + "source": [ + "starttime = time.time()\n", + "msyrs.qdf.pivot_dataframe_by_ticker(df=new_df).head(10)\n", + "print(\"Time taken: \", time.time() - starttime)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "new_pd_df = macrosynergy.management.types.QuantamentalDataFrame(new_pd_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken: 2.785749673843384\n" + ] + } + ], + "source": [ + "starttime = time.time()\n", + "new_pd_df.to_wide()\n", + "print(\"Time taken: \", time.time() - starttime)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time taken: 58.54259753227234 seconds\n" ] } ], "source": [ "end_time = time.time()\n", - "print(f\"Time taken: {end_time - startime} seconds\")" + "print(f\"Time taken: {end_time - nb_start_time} seconds\")" ] } ],