From 436b2d3ed4cfb53eccde019dd34dfbc17a84846c Mon Sep 17 00:00:00 2001 From: claude-marie Date: Mon, 9 Mar 2026 16:52:07 +0100 Subject: [PATCH 1/7] Add DHIS2 quality of care configuration to SNT JSON files --- configuration/SNT_config_BFA.json | 1 + configuration/SNT_config_COD.json | 1 + configuration/SNT_config_NER.json | 1 + .../code/snt_quality_of_care.ipynb | 233 ++++++++++++++++++ .../snt_quality_of_care_report.ipynb | 93 +++++++ snt_quality_of_care/.gitignore | 4 + snt_quality_of_care/pipeline.py | 109 ++++++++ snt_quality_of_care/readme.md | 22 ++ snt_quality_of_care/requirements.txt | 2 + 9 files changed, 466 insertions(+) create mode 100644 pipelines/snt_quality_of_care/code/snt_quality_of_care.ipynb create mode 100644 pipelines/snt_quality_of_care/reporting/snt_quality_of_care_report.ipynb create mode 100644 snt_quality_of_care/.gitignore create mode 100644 snt_quality_of_care/pipeline.py create mode 100644 snt_quality_of_care/readme.md create mode 100644 snt_quality_of_care/requirements.txt diff --git a/configuration/SNT_config_BFA.json b/configuration/SNT_config_BFA.json index 75f8b40..dbadf72 100644 --- a/configuration/SNT_config_BFA.json +++ b/configuration/SNT_config_BFA.json @@ -21,6 +21,7 @@ "ERA5_DATASET_CLIMATE": "snt-era5-climate", "SNT_SEASONALITY_RAINFALL": "snt-seasonality-rainfall", "SNT_SEASONALITY_CASES": "snt-seasonality-cases", + "DHIS2_QUALITY_OF_CARE": "snt-dhis2-quality-of-care", "SNT_MAP_EXTRACTS": "snt-map-extracts", "SNT_RESULTS": "snt-results" }, diff --git a/configuration/SNT_config_COD.json b/configuration/SNT_config_COD.json index 6a5570a..39daf5c 100644 --- a/configuration/SNT_config_COD.json +++ b/configuration/SNT_config_COD.json @@ -21,6 +21,7 @@ "ERA5_DATASET_CLIMATE": "snt-era5-climate", "SNT_SEASONALITY_RAINFALL": "snt-seasonality-rainfall", "SNT_SEASONALITY_CASES": "snt-seasonality-cases", + "DHIS2_QUALITY_OF_CARE": "snt-dhis2-quality-of-care", "SNT_MAP_EXTRACTS": "snt-map-extracts", "SNT_RESULTS": "snt-results" }, 
diff --git a/configuration/SNT_config_NER.json b/configuration/SNT_config_NER.json index 7209d1c..11d9426 100644 --- a/configuration/SNT_config_NER.json +++ b/configuration/SNT_config_NER.json @@ -21,6 +21,7 @@ "ERA5_DATASET_CLIMATE": "snt-era5-climate", "SNT_SEASONALITY_RAINFALL": "snt-seasonality-rainfall", "SNT_SEASONALITY_CASES": "snt-seasonality-cases", + "DHIS2_QUALITY_OF_CARE": "snt-dhis2-quality-of-care", "SNT_MAP_EXTRACTS": "snt-map-extracts", "SNT_RESULTS": "snt-results" }, diff --git a/pipelines/snt_quality_of_care/code/snt_quality_of_care.ipynb b/pipelines/snt_quality_of_care/code/snt_quality_of_care.ipynb new file mode 100644 index 0000000..ff2a8f3 --- /dev/null +++ b/pipelines/snt_quality_of_care/code/snt_quality_of_care.ipynb @@ -0,0 +1,233 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fad6c24e", + "metadata": {}, + "source": [ + "## Quality of Care Indicators\n", + "\n", + "Compute district-year quality-of-care indicators from DHIS2 outliers-imputed routine data.\n", + "\n", + "Indicators:\n", + "- testing_rate = TEST / SUSP\n", + "- treatment_rate = MALTREAT / CONF\n", + "- case_fatality_rate = MALDTH / MALADM\n", + "- prop_adm_malaria = MALADM / ALLADM\n", + "- prop_malaria_deaths = MALDTH / ALLDTH\n", + "- non_malaria_all_cause_outpatients = ALLOUT (absolute)\n", + "- presumed_cases = PRES (absolute)\n", + "\n", + "Stock-out indicators are not implemented yet (on hold, NMDR data pending)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "317c4085", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Preliminaries\n", + "options(scipen=999)\n", + "\n", + "ROOT_PATH <- \"~/workspace\"\n", + "CONFIG_PATH <- file.path(ROOT_PATH, \"configuration\")\n", + "CODE_PATH <- file.path(ROOT_PATH, \"code\")\n", + "DATA_PATH <- file.path(ROOT_PATH, \"data\")\n", + "OUTPUT_DATA_PATH <- file.path(DATA_PATH, \"dhis2\", \"quality_of_care\")\n", + "FIGURES_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_quality_of_care\", \"reporting\", \"outputs\", \"figures\")\n", + "\n", + "dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)\n", + "dir.create(FIGURES_PATH, recursive = TRUE, showWarnings = FALSE)\n", + "\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "required_packages <- c(\"jsonlite\", \"data.table\", \"arrow\", \"sf\", \"ggplot2\", \"glue\", \"reticulate\", \"RColorBrewer\", \"writexl\")\n", + "install_and_load(required_packages)\n", + "\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "openhexa <- reticulate::import(\"openhexa.sdk\")\n", + "\n", + "config_json <- jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "DHIS2_FORMATTED_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "OUTLIERS_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98b78bf7", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Fallback parameters for local/dev execution\n", + "if (!exists(\"outlier_imputation_method\")) {\n", + " outlier_imputation_method <- \"mean\"\n", + "}\n", + "\n", + "allowed_methods <- c(\"mean\", \"median\", \"iqr\", \"trend\", \"mg-partial\", \"mg-complete\")\n", + "if (!(outlier_imputation_method %in% 
allowed_methods)) {\n", + " stop(glue::glue(\"Invalid outlier_imputation_method: {outlier_imputation_method}. Allowed: {paste(allowed_methods, collapse=', ')}\"))\n", + "}\n", + "\n", + "routine_filename <- glue::glue(\"{COUNTRY_CODE}_routine_outliers-{outlier_imputation_method}_imputed.parquet\")\n", + "log_msg(glue::glue(\"Loading routine file from DHIS2 outliers dataset: {routine_filename}\"))\n", + "\n", + "routine <- get_latest_dataset_file_in_memory(OUTLIERS_DATASET, routine_filename)\n", + "shapes <- get_latest_dataset_file_in_memory(DHIS2_FORMATTED_DATASET, paste0(COUNTRY_CODE, \"_shapes.geojson\"))\n", + "\n", + "setDT(routine)\n", + "required_cols <- c(\"ADM2_ID\", \"YEAR\", \"TEST\", \"SUSP\", \"MALTREAT\", \"CONF\", \"MALDTH\", \"MALADM\", \"ALLADM\", \"ALLDTH\", \"ALLOUT\", \"PRES\")\n", + "missing_cols <- setdiff(required_cols, names(routine))\n", + "if (length(missing_cols) > 0) {\n", + " stop(glue::glue(\"Missing required columns in routine data: {paste(missing_cols, collapse=', ')}\"))\n", + "}\n", + "\n", + "num_cols <- setdiff(required_cols, c(\"ADM2_ID\", \"YEAR\"))\n", + "routine[, (num_cols) := lapply(.SD, function(x) as.numeric(x)), .SDcols = num_cols]\n", + "routine[, YEAR := as.integer(YEAR)]\n", + "routine[, ADM2_ID := as.character(ADM2_ID)]\n", + "\n", + "qoc <- routine[, .(\n", + " TEST = sum(TEST, na.rm = TRUE),\n", + " SUSP = sum(SUSP, na.rm = TRUE),\n", + " MALTREAT = sum(MALTREAT, na.rm = TRUE),\n", + " CONF = sum(CONF, na.rm = TRUE),\n", + " MALDTH = sum(MALDTH, na.rm = TRUE),\n", + " MALADM = sum(MALADM, na.rm = TRUE),\n", + " ALLADM = sum(ALLADM, na.rm = TRUE),\n", + " ALLDTH = sum(ALLDTH, na.rm = TRUE),\n", + " ALLOUT = sum(ALLOUT, na.rm = TRUE),\n", + " PRES = sum(PRES, na.rm = TRUE)\n", + "), by = .(ADM2_ID, YEAR)]\n", + "\n", + "qoc[, testing_rate := fifelse(SUSP > 0, TEST / SUSP, NA_real_)]\n", + "qoc[, treatment_rate := fifelse(CONF > 0, MALTREAT / CONF, NA_real_)]\n", + "qoc[, case_fatality_rate := fifelse(MALADM > 0, 
MALDTH / MALADM, NA_real_)]\n", + "qoc[, prop_adm_malaria := fifelse(ALLADM > 0, MALADM / ALLADM, NA_real_)]\n", + "qoc[, prop_malaria_deaths := fifelse(ALLDTH > 0, MALDTH / ALLDTH, NA_real_)]\n", + "qoc[, non_malaria_all_cause_outpatients := ALLOUT]\n", + "qoc[, presumed_cases := PRES]\n", + "\n", + "shapes_dt <- as.data.table(sf::st_drop_geometry(shapes))\n", + "if (\"ADM2_ID\" %in% names(shapes_dt) && \"ADM2_NAME\" %in% names(shapes_dt)) {\n", + " shapes_dt[, ADM2_ID := as.character(ADM2_ID)]\n", + " qoc <- merge(qoc, unique(shapes_dt[, .(ADM2_ID, ADM2_NAME)]), by = \"ADM2_ID\", all.x = TRUE)\n", + "}\n", + "\n", + "out_parquet <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_{outlier_imputation_method}.parquet\"))\n", + "out_csv <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_{outlier_imputation_method}.csv\"))\n", + "out_xlsx <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_{outlier_imputation_method}.xlsx\"))\n", + "\n", + "arrow::write_parquet(qoc, out_parquet)\n", + "data.table::fwrite(qoc, out_csv)\n", + "writexl::write_xlsx(list(quality_of_care = as.data.frame(qoc)), out_xlsx)\n", + "\n", + "log_msg(glue::glue(\"Saved outputs: {out_parquet}, {out_csv}, {out_xlsx}\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "984689b0", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Yearly maps by ADM2\n", + "shapes$ADM2_ID <- as.character(shapes$ADM2_ID)\n", + "qoc$ADM2_ID <- as.character(qoc$ADM2_ID)\n", + "\n", + "plot_yearly_map <- function(df, sf_shapes, value_col, title_prefix, filename_prefix, is_rate = TRUE) {\n", + " years <- sort(unique(df$YEAR))\n", + " for (yr in years) {\n", + " df_y <- df[YEAR == yr]\n", + " map_df <- merge(sf_shapes, df_y, by = \"ADM2_ID\", all.x = TRUE)\n", + "\n", + " p <- ggplot(map_df)\n", + "\n", + " if (is_rate) {\n", + " map_df$cat <- cut(\n", + " map_df[[value_col]],\n", + " 
breaks = c(-Inf, 0, 0.2, 0.4, 0.6, 0.8, 1.0, Inf),\n", + " labels = c(\"<0\", \"0-0.2\", \"0.2-0.4\", \"0.4-0.6\", \"0.6-0.8\", \"0.8-1.0\", \">1.0\"),\n", + " include.lowest = TRUE\n", + " )\n", + " p <- p + geom_sf(aes(fill = cat), color = \"grey60\", size = 0.1) +\n", + " scale_fill_brewer(palette = \"YlOrRd\", na.value = \"white\", drop = FALSE)\n", + " } else {\n", + " vals <- map_df[[value_col]]\n", + " finite_vals <- vals[is.finite(vals)]\n", + " if (length(finite_vals) > 4) {\n", + " br <- unique(as.numeric(quantile(finite_vals, probs = seq(0, 1, 0.2), na.rm = TRUE)))\n", + " if (length(br) < 2) {\n", + " map_df$cat <- as.factor(\"all\")\n", + " } else {\n", + " map_df$cat <- cut(vals, breaks = br, include.lowest = TRUE)\n", + " }\n", + " } else {\n", + " map_df$cat <- as.factor(vals)\n", + " }\n", + " p <- p + geom_sf(aes(fill = cat), color = \"grey60\", size = 0.1) +\n", + " scale_fill_brewer(palette = \"Blues\", na.value = \"white\", drop = FALSE)\n", + " }\n", + "\n", + " p <- p +\n", + " theme_void() +\n", + " labs(\n", + " title = paste0(title_prefix, \" - \", yr),\n", + " fill = value_col,\n", + " caption = \"Source: SNT DHIS2 outliers-imputed routine data\"\n", + " ) +\n", + " theme(\n", + " legend.position = \"bottom\",\n", + " plot.title = element_text(face = \"bold\", size = 12)\n", + " )\n", + "\n", + " out_png <- file.path(FIGURES_PATH, glue::glue(\"{filename_prefix}_{yr}.png\"))\n", + " ggsave(out_png, plot = p, width = 9, height = 7, dpi = 300, bg = \"white\")\n", + " }\n", + "}\n", + "\n", + "plot_yearly_map(qoc, shapes, \"testing_rate\", \"Testing rate (TEST / SUSP)\", \"testing_rate\", TRUE)\n", + "plot_yearly_map(qoc, shapes, \"treatment_rate\", \"Treatment rate (MALTREAT / CONF)\", \"treatment_rate\", TRUE)\n", + "plot_yearly_map(qoc, shapes, \"case_fatality_rate\", \"In-hospital case fatality rate (MALDTH / MALADM)\", \"case_fatality_rate\", TRUE)\n", + "plot_yearly_map(qoc, shapes, \"prop_adm_malaria\", \"Proportion admitted for 
malaria (MALADM / ALLADM)\", \"prop_adm_malaria\", TRUE)\n", + "plot_yearly_map(qoc, shapes, \"prop_malaria_deaths\", \"Proportion of malaria deaths (MALDTH / ALLDTH)\", \"prop_malaria_deaths\", TRUE)\n", + "plot_yearly_map(qoc, shapes, \"non_malaria_all_cause_outpatients\", \"Non-malaria all-cause outpatients (ALLOUT)\", \"allout\", FALSE)\n", + "plot_yearly_map(qoc, shapes, \"presumed_cases\", \"Presumed cases (PRES)\", \"presumed_cases\", FALSE)\n", + "\n", + "log_msg(glue::glue(\"Saved yearly maps in: {FIGURES_PATH}\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pipelines/snt_quality_of_care/reporting/snt_quality_of_care_report.ipynb b/pipelines/snt_quality_of_care/reporting/snt_quality_of_care_report.ipynb new file mode 100644 index 0000000..faeee82 --- /dev/null +++ b/pipelines/snt_quality_of_care/reporting/snt_quality_of_care_report.ipynb @@ -0,0 +1,93 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7d246ae9", + "metadata": {}, + "source": [ + "## Quality of Care Report\n", + "\n", + "This report displays a compact year-level summary of quality-of-care indicators and points to generated map outputs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5eaa5bab", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "ROOT_PATH <- \"~/workspace\"\n", + "CONFIG_PATH <- file.path(ROOT_PATH, \"configuration\")\n", + "CODE_PATH <- file.path(ROOT_PATH, \"code\")\n", + "DATA_PATH <- file.path(ROOT_PATH, \"data\", \"dhis2\", \"quality_of_care\")\n", + "FIGURES_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_quality_of_care\", \"reporting\", \"outputs\", \"figures\")\n", + "\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "install_and_load(c(\"jsonlite\", \"data.table\", \"arrow\", \"dplyr\", \"knitr\", \"glue\", \"reticulate\"))\n", + "\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "openhexa <- reticulate::import(\"openhexa.sdk\")\n", + "\n", + "config_json <- jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a8320f8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "files <- list.files(DATA_PATH, pattern = paste0(\"^\", COUNTRY_CODE, \"_quality_of_care_.*\\\\.parquet$\"), full.names = TRUE)\n", + "if (length(files) == 0) {\n", + " stop(glue::glue(\"No quality_of_care parquet found in {DATA_PATH}\"))\n", + "}\n", + "\n", + "latest_file <- files[which.max(file.info(files)$mtime)]\n", + "qoc <- as.data.table(arrow::read_parquet(latest_file))\n", + "\n", + "summary_tbl <- qoc[, .(\n", + " testing_rate = mean(testing_rate, na.rm = TRUE),\n", + " treatment_rate = mean(treatment_rate, na.rm = TRUE),\n", + " case_fatality_rate = mean(case_fatality_rate, na.rm = TRUE),\n", + " prop_adm_malaria = mean(prop_adm_malaria, na.rm = TRUE),\n", + " prop_malaria_deaths = mean(prop_malaria_deaths, na.rm = TRUE),\n", + " non_malaria_all_cause_outpatients = sum(non_malaria_all_cause_outpatients, na.rm = 
TRUE),\n", + " presumed_cases = sum(presumed_cases, na.rm = TRUE)\n", + "), by = .(YEAR)][order(YEAR)]\n", + "\n", + "knitr::kable(summary_tbl, caption = \"Quality of Care - Year-level summary\")\n", + "\n", + "cat(glue::glue(\"\\nLoaded file: {latest_file}\\n\"))\n", + "cat(glue::glue(\"Map outputs folder: {FIGURES_PATH}\\n\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/snt_quality_of_care/.gitignore b/snt_quality_of_care/.gitignore new file mode 100644 index 0000000..43a40cd --- /dev/null +++ b/snt_quality_of_care/.gitignore @@ -0,0 +1,4 @@ +workspace/ +workspace.yaml +.vscode/ +__pycache__ diff --git a/snt_quality_of_care/pipeline.py b/snt_quality_of_care/pipeline.py new file mode 100644 index 0000000..28320cc --- /dev/null +++ b/snt_quality_of_care/pipeline.py @@ -0,0 +1,109 @@ +from pathlib import Path + +from openhexa.sdk import current_run, pipeline, workspace, parameter +from snt_lib.snt_pipeline_utils import ( + add_files_to_dataset, + load_configuration_snt, + validate_config, + run_report_notebook, + run_notebook, + pull_scripts_from_repository, + save_pipeline_parameters, +) + + +@pipeline("snt_quality_of_care") +@parameter( + "outlier_imputation_method", + name="Outlier imputation method", + help="Choose which outliers-imputed routine dataset to use.", + type=str, + choices=["mean", "median", "iqr", "trend", "mg-partial", "mg-complete"], + default="mean", + required=True, +) +@parameter( + "run_report_only", + name="Run reporting only", + help="Skip computations and execute only the reporting notebook.", + type=bool, + default=False, + required=False, +) +@parameter( + "pull_scripts", + name="Pull scripts", + help="Pull the latest pipeline scripts from the 
repository.", + type=bool, + default=False, + required=False, +) +def snt_quality_of_care( + outlier_imputation_method: str, + run_report_only: bool, + pull_scripts: bool, +): + """Compute quality-of-care indicators from outliers-imputed DHIS2 routine data.""" + + root_path = Path(workspace.files_path) + pipeline_path = root_path / "pipelines" / "snt_quality_of_care" + data_path = root_path / "data" / "dhis2" / "quality_of_care" + pipeline_path.mkdir(parents=True, exist_ok=True) + data_path.mkdir(parents=True, exist_ok=True) + + if pull_scripts: + current_run.log_info("Pulling pipeline scripts from repository.") + pull_scripts_from_repository( + pipeline_name="snt_quality_of_care", + report_scripts=["snt_quality_of_care_report.ipynb"], + code_scripts=["snt_quality_of_care.ipynb"], + ) + + snt_config = load_configuration_snt(config_path=root_path / "configuration" / "SNT_config.json") + validate_config(snt_config) + country_code = snt_config["SNT_CONFIG"]["COUNTRY_CODE"] + + nb_parameters = { + "outlier_imputation_method": outlier_imputation_method, + } + + parameters_file = save_pipeline_parameters( + pipeline_name="snt_quality_of_care", + parameters=nb_parameters, + output_path=data_path, + country_code=country_code, + ) + + if not run_report_only: + run_notebook( + nb_path=pipeline_path / "code" / "snt_quality_of_care.ipynb", + out_nb_path=pipeline_path / "papermill_outputs", + kernel_name="ir", + parameters=nb_parameters, + error_label_severity_map={"[ERROR]": "error", "[WARNING]": "warning"}, + country_code=country_code, + ) + + add_files_to_dataset( + dataset_id=snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_QUALITY_OF_CARE"], + country_code=country_code, + file_paths=[ + data_path / f"{country_code}_quality_of_care_{outlier_imputation_method}.parquet", + data_path / f"{country_code}_quality_of_care_{outlier_imputation_method}.csv", + data_path / f"{country_code}_quality_of_care_{outlier_imputation_method}.xlsx", + parameters_file, + ], + ) + else: + 
current_run.log_info("Skipping computations, running only reporting notebook.") + + run_report_notebook( + nb_file=pipeline_path / "reporting" / "snt_quality_of_care_report.ipynb", + nb_output_path=pipeline_path / "reporting" / "outputs", + error_label_severity_map={"[ERROR]": "error", "[WARNING]": "warning"}, + country_code=country_code, + ) + + +if __name__ == "__main__": + snt_quality_of_care() diff --git a/snt_quality_of_care/readme.md b/snt_quality_of_care/readme.md new file mode 100644 index 0000000..60b174f --- /dev/null +++ b/snt_quality_of_care/readme.md @@ -0,0 +1,22 @@ +SNT Quality of Care Pipeline + +Description + +This pipeline computes district-year quality-of-care indicators from DHIS2 outliers-imputed routine data and generates yearly ADM2 maps. + +Parameters + + outlier_imputation_method (String, required) + Name: Outlier imputation method + Description: Select which imputed routine file to load from DHIS2_OUTLIERS_IMPUTATION. + Choices/Default: mean, median, iqr, trend, mg-partial, mg-complete. Default: mean. + + run_report_only (Boolean, optional) + Name: Run reporting only + Description: Skip computations and run only reporting notebook. + Choices/Default: TRUE/FALSE. Default: FALSE. + + pull_scripts (Boolean, optional) + Name: Pull scripts + Description: Pull latest scripts from repository before run. + Choices/Default: TRUE/FALSE. Default: FALSE. diff --git a/snt_quality_of_care/requirements.txt b/snt_quality_of_care/requirements.txt new file mode 100644 index 0000000..e278876 --- /dev/null +++ b/snt_quality_of_care/requirements.txt @@ -0,0 +1,2 @@ +openhexa.toolbox @ git+https://github.com/BLSQ/openhexa-toolbox@main +snt_lib @ git+https://git@github.com/BLSQ/snt_utils.git From bee3260448205e58218eb677a759f63b3bddc33c Mon Sep 17 00:00:00 2001 From: claude-marie Date: Tue, 10 Mar 2026 10:12:27 +0100 Subject: [PATCH 2/7] Enhance SNT Quality of Care pipeline by adding data action parameter for handling imputed or removed outliers. 
Updated output file naming conventions to include data action, and improved parameter descriptions for clarity. Added error handling for invalid parameter values. --- .../code/snt_quality_of_care.ipynb | 18 ++- snt_quality_of_care/pipeline.py | 117 ++++++++++-------- 2 files changed, 80 insertions(+), 55 deletions(-) diff --git a/pipelines/snt_quality_of_care/code/snt_quality_of_care.ipynb b/pipelines/snt_quality_of_care/code/snt_quality_of_care.ipynb index ff2a8f3..98a9df1 100644 --- a/pipelines/snt_quality_of_care/code/snt_quality_of_care.ipynb +++ b/pipelines/snt_quality_of_care/code/snt_quality_of_care.ipynb @@ -73,13 +73,21 @@ "if (!exists(\"outlier_imputation_method\")) {\n", " outlier_imputation_method <- \"mean\"\n", "}\n", + "if (!exists(\"data_action\")) {\n", + " data_action <- \"imputed\"\n", + "}\n", "\n", - "allowed_methods <- c(\"mean\", \"median\", \"iqr\", \"trend\", \"mg-partial\", \"mg-complete\")\n", + "allowed_methods <- c(\"mean\", \"median\", \"iqr\", \"trend\")\n", "if (!(outlier_imputation_method %in% allowed_methods)) {\n", " stop(glue::glue(\"Invalid outlier_imputation_method: {outlier_imputation_method}. Allowed: {paste(allowed_methods, collapse=', ')}\"))\n", "}\n", "\n", - "routine_filename <- glue::glue(\"{COUNTRY_CODE}_routine_outliers-{outlier_imputation_method}_imputed.parquet\")\n", + "allowed_actions <- c(\"imputed\", \"removed\")\n", + "if (!(data_action %in% allowed_actions)) {\n", + " stop(glue::glue(\"Invalid data_action: {data_action}. 
Allowed: {paste(allowed_actions, collapse=', ')}\"))\n", + "}\n", + "\n", + "routine_filename <- glue::glue(\"{COUNTRY_CODE}_routine_outliers-{outlier_imputation_method}_{data_action}.parquet\")\n", "log_msg(glue::glue(\"Loading routine file from DHIS2 outliers dataset: {routine_filename}\"))\n", "\n", "routine <- get_latest_dataset_file_in_memory(OUTLIERS_DATASET, routine_filename)\n", @@ -124,9 +132,9 @@ " qoc <- merge(qoc, unique(shapes_dt[, .(ADM2_ID, ADM2_NAME)]), by = \"ADM2_ID\", all.x = TRUE)\n", "}\n", "\n", - "out_parquet <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_{outlier_imputation_method}.parquet\"))\n", - "out_csv <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_{outlier_imputation_method}.csv\"))\n", - "out_xlsx <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_{outlier_imputation_method}.xlsx\"))\n", + "out_parquet <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_{outlier_imputation_method}_{data_action}.parquet\"))\n", + "out_csv <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_{outlier_imputation_method}_{data_action}.csv\"))\n", + "out_xlsx <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_{outlier_imputation_method}_{data_action}.xlsx\"))\n", "\n", "arrow::write_parquet(qoc, out_parquet)\n", "data.table::fwrite(qoc, out_csv)\n", diff --git a/snt_quality_of_care/pipeline.py b/snt_quality_of_care/pipeline.py index 28320cc..4d363a8 100644 --- a/snt_quality_of_care/pipeline.py +++ b/snt_quality_of_care/pipeline.py @@ -16,12 +16,21 @@ @parameter( "outlier_imputation_method", name="Outlier imputation method", - help="Choose which outliers-imputed routine dataset to use.", + help="Choose which outlier detection/imputation method to use.", type=str, - choices=["mean", "median", "iqr", "trend", "mg-partial", "mg-complete"], + choices=["mean", "median", "iqr", "trend"], default="mean", 
required=True, ) +@parameter( + "data_action", + name="Data action", + help="Choose whether to use imputed data (outliers replaced) or removed data (outliers removed).", + type=str, + choices=["imputed", "removed"], + default="imputed", + required=True, +) @parameter( "run_report_only", name="Run reporting only", @@ -40,69 +49,77 @@ ) def snt_quality_of_care( outlier_imputation_method: str, + data_action: str, run_report_only: bool, pull_scripts: bool, ): """Compute quality-of-care indicators from outliers-imputed DHIS2 routine data.""" + try: + current_run.log_info("Starting SNT Quality of Care pipeline...") + root_path = Path(workspace.files_path) + pipeline_path = root_path / "pipelines" / "snt_quality_of_care" + data_path = root_path / "data" / "dhis2" / "quality_of_care" + pipeline_path.mkdir(parents=True, exist_ok=True) + data_path.mkdir(parents=True, exist_ok=True) - root_path = Path(workspace.files_path) - pipeline_path = root_path / "pipelines" / "snt_quality_of_care" - data_path = root_path / "data" / "dhis2" / "quality_of_care" - pipeline_path.mkdir(parents=True, exist_ok=True) - data_path.mkdir(parents=True, exist_ok=True) - - if pull_scripts: - current_run.log_info("Pulling pipeline scripts from repository.") - pull_scripts_from_repository( - pipeline_name="snt_quality_of_care", - report_scripts=["snt_quality_of_care_report.ipynb"], - code_scripts=["snt_quality_of_care.ipynb"], - ) + if pull_scripts: + current_run.log_info("Pulling pipeline scripts from repository.") + pull_scripts_from_repository( + pipeline_name="snt_quality_of_care", + report_scripts=["snt_quality_of_care_report.ipynb"], + code_scripts=["snt_quality_of_care.ipynb"], + ) - snt_config = load_configuration_snt(config_path=root_path / "configuration" / "SNT_config.json") - validate_config(snt_config) - country_code = snt_config["SNT_CONFIG"]["COUNTRY_CODE"] + snt_config = load_configuration_snt(config_path=root_path / "configuration" / "SNT_config.json") + validate_config(snt_config) + 
country_code = snt_config["SNT_CONFIG"]["COUNTRY_CODE"] - nb_parameters = { - "outlier_imputation_method": outlier_imputation_method, - } + nb_parameters = { + "outlier_imputation_method": outlier_imputation_method, + "data_action": data_action, + } - parameters_file = save_pipeline_parameters( - pipeline_name="snt_quality_of_care", - parameters=nb_parameters, - output_path=data_path, - country_code=country_code, - ) - - if not run_report_only: - run_notebook( - nb_path=pipeline_path / "code" / "snt_quality_of_care.ipynb", - out_nb_path=pipeline_path / "papermill_outputs", - kernel_name="ir", + parameters_file = save_pipeline_parameters( + pipeline_name="snt_quality_of_care", parameters=nb_parameters, - error_label_severity_map={"[ERROR]": "error", "[WARNING]": "warning"}, + output_path=data_path, country_code=country_code, ) - add_files_to_dataset( - dataset_id=snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_QUALITY_OF_CARE"], + if not run_report_only: + run_notebook( + nb_path=pipeline_path / "code" / "snt_quality_of_care.ipynb", + out_nb_path=pipeline_path / "papermill_outputs", + kernel_name="ir", + parameters=nb_parameters, + error_label_severity_map={"[ERROR]": "error", "[WARNING]": "warning"}, + country_code=country_code, + ) + + add_files_to_dataset( + dataset_id=snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_QUALITY_OF_CARE"], + country_code=country_code, + file_paths=[ + data_path / f"{country_code}_quality_of_care_{outlier_imputation_method}_{data_action}.parquet", + data_path / f"{country_code}_quality_of_care_{outlier_imputation_method}_{data_action}.csv", + data_path / f"{country_code}_quality_of_care_{outlier_imputation_method}_{data_action}.xlsx", + parameters_file, + ], + ) + else: + current_run.log_info("Skipping computations, running only reporting notebook.") + + run_report_notebook( + nb_file=pipeline_path / "reporting" / "snt_quality_of_care_report.ipynb", + nb_output_path=pipeline_path / "reporting" / "outputs", + 
error_label_severity_map={"[ERROR]": "error", "[WARNING]": "warning"}, country_code=country_code, - file_paths=[ - data_path / f"{country_code}_quality_of_care_{outlier_imputation_method}.parquet", - data_path / f"{country_code}_quality_of_care_{outlier_imputation_method}.csv", - data_path / f"{country_code}_quality_of_care_{outlier_imputation_method}.xlsx", - parameters_file, - ], ) - else: - current_run.log_info("Skipping computations, running only reporting notebook.") - run_report_notebook( - nb_file=pipeline_path / "reporting" / "snt_quality_of_care_report.ipynb", - nb_output_path=pipeline_path / "reporting" / "outputs", - error_label_severity_map={"[ERROR]": "error", "[WARNING]": "warning"}, - country_code=country_code, - ) + current_run.log_info("Quality of Care pipeline finished successfully.") + except Exception as e: + current_run.log_error(f"Pipeline failed: {e}") + raise if __name__ == "__main__": From 0bc8241a01c05341b743529642c07244170dd12c Mon Sep 17 00:00:00 2001 From: claude-marie Date: Tue, 10 Mar 2026 10:26:56 +0100 Subject: [PATCH 3/7] Add SNT Quality of Care pipeline and reporting notebooks, including configuration, requirements, and .gitignore files. The pipeline computes quality-of-care indicators from DHIS2 data and generates yearly maps. Enhanced documentation and parameter management for improved usability. 
--- .../code/snt_dhis2_quality_of_care.ipynb} | 32 +++++++++---------- .../snt_dhis2_quality_of_care_report.ipynb} | 24 +++++++------- .../.gitignore | 0 .../pipeline.py | 20 ++++++------ .../readme.md | 9 ++++-- .../requirements.txt | 0 6 files changed, 45 insertions(+), 40 deletions(-) rename pipelines/{snt_quality_of_care/code/snt_quality_of_care.ipynb => snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb} (98%) rename pipelines/{snt_quality_of_care/reporting/snt_quality_of_care_report.ipynb => snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb} (94%) rename {snt_quality_of_care => snt_dhis2_quality_of_care}/.gitignore (100%) rename {snt_quality_of_care => snt_dhis2_quality_of_care}/pipeline.py (86%) rename {snt_quality_of_care => snt_dhis2_quality_of_care}/readme.md (63%) rename {snt_quality_of_care => snt_dhis2_quality_of_care}/requirements.txt (100%) diff --git a/pipelines/snt_quality_of_care/code/snt_quality_of_care.ipynb b/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb similarity index 98% rename from pipelines/snt_quality_of_care/code/snt_quality_of_care.ipynb rename to pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb index 98a9df1..ab64375 100644 --- a/pipelines/snt_quality_of_care/code/snt_quality_of_care.ipynb +++ b/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb @@ -2,7 +2,6 @@ "cells": [ { "cell_type": "markdown", - "id": "fad6c24e", "metadata": {}, "source": [ "## Quality of Care Indicators\n", @@ -19,18 +18,16 @@ "- presumed_cases = PRES (absolute)\n", "\n", "Stock-out indicators are not implemented yet (on hold, NMDR data pending)." 
- ] + ], + "id": "fad6c24e" }, { "cell_type": "code", - "execution_count": null, - "id": "317c4085", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Preliminaries\n", "options(scipen=999)\n", @@ -40,7 +37,7 @@ "CODE_PATH <- file.path(ROOT_PATH, \"code\")\n", "DATA_PATH <- file.path(ROOT_PATH, \"data\")\n", "OUTPUT_DATA_PATH <- file.path(DATA_PATH, \"dhis2\", \"quality_of_care\")\n", - "FIGURES_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_quality_of_care\", \"reporting\", \"outputs\", \"figures\")\n", + "FIGURES_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_quality_of_care\", \"reporting\", \"outputs\", \"figures\")\n", "\n", "dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)\n", "dir.create(FIGURES_PATH, recursive = TRUE, showWarnings = FALSE)\n", @@ -56,18 +53,18 @@ "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", "DHIS2_FORMATTED_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", "OUTLIERS_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION" - ] + ], + "execution_count": null, + "outputs": [], + "id": "317c4085" }, { "cell_type": "code", - "execution_count": null, - "id": "98b78bf7", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Fallback parameters for local/dev execution\n", "if (!exists(\"outlier_imputation_method\")) {\n", @@ -141,18 +138,18 @@ "writexl::write_xlsx(list(quality_of_care = as.data.frame(qoc)), out_xlsx)\n", "\n", "log_msg(glue::glue(\"Saved outputs: {out_parquet}, {out_csv}, {out_xlsx}\"))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "98b78bf7" }, { "cell_type": "code", - "execution_count": null, - "id": "984689b0", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "# Yearly maps by ADM2\n", "shapes$ADM2_ID <- as.character(shapes$ADM2_ID)\n", @@ -218,7 +215,10 @@ "plot_yearly_map(qoc, shapes, \"presumed_cases\", \"Presumed cases (PRES)\", 
\"presumed_cases\", FALSE)\n", "\n", "log_msg(glue::glue(\"Saved yearly maps in: {FIGURES_PATH}\"))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "984689b0" } ], "metadata": { @@ -238,4 +238,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/pipelines/snt_quality_of_care/reporting/snt_quality_of_care_report.ipynb b/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb similarity index 94% rename from pipelines/snt_quality_of_care/reporting/snt_quality_of_care_report.ipynb rename to pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb index faeee82..9c973fc 100644 --- a/pipelines/snt_quality_of_care/reporting/snt_quality_of_care_report.ipynb +++ b/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb @@ -2,30 +2,27 @@ "cells": [ { "cell_type": "markdown", - "id": "7d246ae9", "metadata": {}, "source": [ "## Quality of Care Report\n", "\n", "This report displays a compact year-level summary of quality-of-care indicators and points to generated map outputs." 
- ] + ], + "id": "7d246ae9" }, { "cell_type": "code", - "execution_count": null, - "id": "5eaa5bab", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "ROOT_PATH <- \"~/workspace\"\n", "CONFIG_PATH <- file.path(ROOT_PATH, \"configuration\")\n", "CODE_PATH <- file.path(ROOT_PATH, \"code\")\n", "DATA_PATH <- file.path(ROOT_PATH, \"data\", \"dhis2\", \"quality_of_care\")\n", - "FIGURES_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_quality_of_care\", \"reporting\", \"outputs\", \"figures\")\n", + "FIGURES_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_quality_of_care\", \"reporting\", \"outputs\", \"figures\")\n", "\n", "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", "install_and_load(c(\"jsonlite\", \"data.table\", \"arrow\", \"dplyr\", \"knitr\", \"glue\", \"reticulate\"))\n", @@ -35,18 +32,18 @@ "\n", "config_json <- jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE" - ] + ], + "execution_count": null, + "outputs": [], + "id": "5eaa5bab" }, { "cell_type": "code", - "execution_count": null, - "id": "1a8320f8", "metadata": { "vscode": { "languageId": "r" } }, - "outputs": [], "source": [ "files <- list.files(DATA_PATH, pattern = paste0(\"^\", COUNTRY_CODE, \"_quality_of_care_.*\\\\.parquet$\"), full.names = TRUE)\n", "if (length(files) == 0) {\n", @@ -70,7 +67,10 @@ "\n", "cat(glue::glue(\"\\nLoaded file: {latest_file}\\n\"))\n", "cat(glue::glue(\"Map outputs folder: {FIGURES_PATH}\\n\"))" - ] + ], + "execution_count": null, + "outputs": [], + "id": "1a8320f8" } ], "metadata": { @@ -90,4 +90,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/snt_quality_of_care/.gitignore b/snt_dhis2_quality_of_care/.gitignore similarity index 100% rename from snt_quality_of_care/.gitignore rename to snt_dhis2_quality_of_care/.gitignore diff --git a/snt_quality_of_care/pipeline.py b/snt_dhis2_quality_of_care/pipeline.py similarity 
index 86% rename from snt_quality_of_care/pipeline.py rename to snt_dhis2_quality_of_care/pipeline.py index 4d363a8..e5f7e6d 100644 --- a/snt_quality_of_care/pipeline.py +++ b/snt_dhis2_quality_of_care/pipeline.py @@ -12,7 +12,7 @@ ) -@pipeline("snt_quality_of_care") +@pipeline("snt_dhis2_quality_of_care") @parameter( "outlier_imputation_method", name="Outlier imputation method", @@ -47,7 +47,7 @@ default=False, required=False, ) -def snt_quality_of_care( +def snt_dhis2_quality_of_care( outlier_imputation_method: str, data_action: str, run_report_only: bool, @@ -57,7 +57,7 @@ def snt_quality_of_care( try: current_run.log_info("Starting SNT Quality of Care pipeline...") root_path = Path(workspace.files_path) - pipeline_path = root_path / "pipelines" / "snt_quality_of_care" + pipeline_path = root_path / "pipelines" / "snt_dhis2_quality_of_care" data_path = root_path / "data" / "dhis2" / "quality_of_care" pipeline_path.mkdir(parents=True, exist_ok=True) data_path.mkdir(parents=True, exist_ok=True) @@ -65,9 +65,9 @@ def snt_quality_of_care( if pull_scripts: current_run.log_info("Pulling pipeline scripts from repository.") pull_scripts_from_repository( - pipeline_name="snt_quality_of_care", - report_scripts=["snt_quality_of_care_report.ipynb"], - code_scripts=["snt_quality_of_care.ipynb"], + pipeline_name="snt_dhis2_quality_of_care", + report_scripts=["snt_dhis2_quality_of_care_report.ipynb"], + code_scripts=["snt_dhis2_quality_of_care.ipynb"], ) snt_config = load_configuration_snt(config_path=root_path / "configuration" / "SNT_config.json") @@ -80,7 +80,7 @@ def snt_quality_of_care( } parameters_file = save_pipeline_parameters( - pipeline_name="snt_quality_of_care", + pipeline_name="snt_dhis2_quality_of_care", parameters=nb_parameters, output_path=data_path, country_code=country_code, @@ -88,7 +88,7 @@ def snt_quality_of_care( if not run_report_only: run_notebook( - nb_path=pipeline_path / "code" / "snt_quality_of_care.ipynb", + nb_path=pipeline_path / "code" / 
"snt_dhis2_quality_of_care.ipynb", out_nb_path=pipeline_path / "papermill_outputs", kernel_name="ir", parameters=nb_parameters, @@ -110,7 +110,7 @@ def snt_quality_of_care( current_run.log_info("Skipping computations, running only reporting notebook.") run_report_notebook( - nb_file=pipeline_path / "reporting" / "snt_quality_of_care_report.ipynb", + nb_file=pipeline_path / "reporting" / "snt_dhis2_quality_of_care_report.ipynb", nb_output_path=pipeline_path / "reporting" / "outputs", error_label_severity_map={"[ERROR]": "error", "[WARNING]": "warning"}, country_code=country_code, @@ -123,4 +123,4 @@ def snt_quality_of_care( if __name__ == "__main__": - snt_quality_of_care() + snt_dhis2_quality_of_care() diff --git a/snt_quality_of_care/readme.md b/snt_dhis2_quality_of_care/readme.md similarity index 63% rename from snt_quality_of_care/readme.md rename to snt_dhis2_quality_of_care/readme.md index 60b174f..5e2b61e 100644 --- a/snt_quality_of_care/readme.md +++ b/snt_dhis2_quality_of_care/readme.md @@ -8,8 +8,13 @@ Parameters outlier_imputation_method (String, required) Name: Outlier imputation method - Description: Select which imputed routine file to load from DHIS2_OUTLIERS_IMPUTATION. - Choices/Default: mean, median, iqr, trend, mg-partial, mg-complete. Default: mean. + Description: Select which outlier detection/imputation method to use. + Choices/Default: mean, median, iqr, trend. Default: mean. + + data_action (String, required) + Name: Data action + Description: Choose whether to use imputed data (outliers replaced) or removed data (outliers removed). + Choices/Default: imputed, removed. Default: imputed. 
run_report_only (Boolean, optional) Name: Run reporting only diff --git a/snt_quality_of_care/requirements.txt b/snt_dhis2_quality_of_care/requirements.txt similarity index 100% rename from snt_quality_of_care/requirements.txt rename to snt_dhis2_quality_of_care/requirements.txt From a6022f029df3a2b1cf042bd2aa20e4d20dcbb19f Mon Sep 17 00:00:00 2001 From: claude-marie Date: Tue, 10 Mar 2026 12:06:20 +0100 Subject: [PATCH 4/7] fix other pipeline so they are able to append other file --- .../code/snt_dhis2_quality_of_care.ipynb | 178 +++++++++++++----- .../snt_dhis2_quality_of_care_report.ipynb | 22 +-- snt_dhis2_outliers_imputation_iqr/pipeline.py | 74 +++++++- .../pipeline.py | 81 +++++++- .../pipeline.py | 75 +++++++- .../pipeline.py | 74 +++++++- .../pipeline.py | 80 +++++++- snt_dhis2_quality_of_care/pipeline.py | 2 +- 8 files changed, 500 insertions(+), 86 deletions(-) diff --git a/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb b/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb index ab64375..6d8db1b 100644 --- a/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb +++ b/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "fad6c24e", "metadata": {}, "source": [ "## Quality of Care Indicators\n", @@ -18,16 +19,18 @@ "- presumed_cases = PRES (absolute)\n", "\n", "Stock-out indicators are not implemented yet (on hold, NMDR data pending)." 
- ], - "id": "fad6c24e" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "317c4085", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Preliminaries\n", "options(scipen=999)\n", @@ -53,18 +56,18 @@ "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", "DHIS2_FORMATTED_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", "OUTLIERS_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION" - ], - "execution_count": null, - "outputs": [], - "id": "317c4085" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "98b78bf7", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Fallback parameters for local/dev execution\n", "if (!exists(\"outlier_imputation_method\")) {\n", @@ -74,7 +77,7 @@ " data_action <- \"imputed\"\n", "}\n", "\n", - "allowed_methods <- c(\"mean\", \"median\", \"iqr\", \"trend\")\n", + "allowed_methods <- c(\"mean\", \"median\", \"iqr\", \"trend\", \"mg-partial\", \"mg-complete\")\n", "if (!(outlier_imputation_method %in% allowed_methods)) {\n", " stop(glue::glue(\"Invalid outlier_imputation_method: {outlier_imputation_method}. Allowed: {paste(allowed_methods, collapse=', ')}\"))\n", "}\n", @@ -84,44 +87,105 @@ " stop(glue::glue(\"Invalid data_action: {data_action}. 
Allowed: {paste(allowed_actions, collapse=', ')}\"))\n", "}\n", "\n", - "routine_filename <- glue::glue(\"{COUNTRY_CODE}_routine_outliers-{outlier_imputation_method}_{data_action}.parquet\")\n", - "log_msg(glue::glue(\"Loading routine file from DHIS2 outliers dataset: {routine_filename}\"))\n", + "# Magic Glasses uses different file naming convention\n", + "is_mg <- outlier_imputation_method %in% c(\"mg-partial\", \"mg-complete\")\n", + "\n", + "if (is_mg) {\n", + " # For Magic Glasses, use formatted routine data (no imputed files exist)\n", + " # Note: Magic Glasses produces outlier flags, not imputed routine files\n", + " # We use the base formatted routine data\n", + " routine_filename <- glue::glue(\"{COUNTRY_CODE}_routine.parquet\")\n", + " log_msg(glue::glue(\"Magic Glasses method selected. Loading base routine file: {routine_filename}\"))\n", + " log_msg(\"[WARNING] Magic Glasses does not produce imputed routine files. Using base formatted routine data.\")\n", + " routine <- get_latest_dataset_file_in_memory(DHIS2_FORMATTED_DATASET, routine_filename)\n", + "} else {\n", + " # Standard methods: use outliers-imputed files\n", + " routine_filename <- glue::glue(\"{COUNTRY_CODE}_routine_outliers-{outlier_imputation_method}_{data_action}.parquet\")\n", + " log_msg(glue::glue(\"Loading routine file from DHIS2 outliers dataset: {routine_filename}\"))\n", + " routine <- get_latest_dataset_file_in_memory(OUTLIERS_DATASET, routine_filename)\n", + "}\n", "\n", - "routine <- get_latest_dataset_file_in_memory(OUTLIERS_DATASET, routine_filename)\n", "shapes <- get_latest_dataset_file_in_memory(DHIS2_FORMATTED_DATASET, paste0(COUNTRY_CODE, \"_shapes.geojson\"))\n", "\n", "setDT(routine)\n", - "required_cols <- c(\"ADM2_ID\", \"YEAR\", \"TEST\", \"SUSP\", \"MALTREAT\", \"CONF\", \"MALDTH\", \"MALADM\", \"ALLADM\", \"ALLDTH\", \"ALLOUT\", \"PRES\")\n", - "missing_cols <- setdiff(required_cols, names(routine))\n", + "\n", + "# Core required columns (must exist)\n", + 
"core_cols <- c(\"ADM2_ID\", \"YEAR\")\n", + "core_missing <- setdiff(core_cols, names(routine))\n", + "if (length(core_missing) > 0) {\n", + " stop(glue::glue(\"Missing core required columns in routine data: {paste(core_missing, collapse=', ')}\"))\n", + "}\n", + "\n", + "# Optional indicator columns (will be checked and handled gracefully)\n", + "indicator_cols <- c(\"TEST\", \"SUSP\", \"MALTREAT\", \"CONF\", \"MALDTH\", \"MALADM\", \"ALLADM\", \"ALLDTH\", \"ALLOUT\", \"PRES\")\n", + "available_cols <- intersect(indicator_cols, names(routine))\n", + "missing_cols <- setdiff(indicator_cols, names(routine))\n", + "\n", "if (length(missing_cols) > 0) {\n", - " stop(glue::glue(\"Missing required columns in routine data: {paste(missing_cols, collapse=', ')}\"))\n", + " log_msg(glue::glue(\"[WARNING] Some indicator columns are missing: {paste(missing_cols, collapse=', ')}. These indicators will not be calculated.\"), level = \"warning\")\n", "}\n", "\n", - "num_cols <- setdiff(required_cols, c(\"ADM2_ID\", \"YEAR\"))\n", - "routine[, (num_cols) := lapply(.SD, function(x) as.numeric(x)), .SDcols = num_cols]\n", + "# Convert available numeric columns\n", + "num_cols <- intersect(available_cols, names(routine))\n", + "if (length(num_cols) > 0) {\n", + " routine[, (num_cols) := lapply(.SD, function(x) as.numeric(x)), .SDcols = num_cols]\n", + "}\n", "routine[, YEAR := as.integer(YEAR)]\n", "routine[, ADM2_ID := as.character(ADM2_ID)]\n", "\n", - "qoc <- routine[, .(\n", - " TEST = sum(TEST, na.rm = TRUE),\n", - " SUSP = sum(SUSP, na.rm = TRUE),\n", - " MALTREAT = sum(MALTREAT, na.rm = TRUE),\n", - " CONF = sum(CONF, na.rm = TRUE),\n", - " MALDTH = sum(MALDTH, na.rm = TRUE),\n", - " MALADM = sum(MALADM, na.rm = TRUE),\n", - " ALLADM = sum(ALLADM, na.rm = TRUE),\n", - " ALLDTH = sum(ALLDTH, na.rm = TRUE),\n", - " ALLOUT = sum(ALLOUT, na.rm = TRUE),\n", - " PRES = sum(PRES, na.rm = TRUE)\n", - "), by = .(ADM2_ID, YEAR)]\n", - "\n", - "qoc[, testing_rate := fifelse(SUSP > 0, 
TEST / SUSP, NA_real_)]\n", - "qoc[, treatment_rate := fifelse(CONF > 0, MALTREAT / CONF, NA_real_)]\n", - "qoc[, case_fatality_rate := fifelse(MALADM > 0, MALDTH / MALADM, NA_real_)]\n", - "qoc[, prop_adm_malaria := fifelse(ALLADM > 0, MALADM / ALLADM, NA_real_)]\n", - "qoc[, prop_malaria_deaths := fifelse(ALLDTH > 0, MALDTH / ALLDTH, NA_real_)]\n", - "qoc[, non_malaria_all_cause_outpatients := ALLOUT]\n", - "qoc[, presumed_cases := PRES]\n", + "# Aggregate available columns only using lapply\n", + "if (length(available_cols) > 0) {\n", + " qoc <- routine[, lapply(.SD, function(x) sum(x, na.rm = TRUE)), \n", + " .SDcols = available_cols, \n", + " by = .(ADM2_ID, YEAR)]\n", + "} else {\n", + " # If no indicator columns available, create empty structure\n", + " qoc <- routine[, .(ADM2_ID, YEAR)]\n", + " qoc <- unique(qoc)\n", + "}\n", + "\n", + "# Calculate indicators only if required columns are available\n", + "if (\"TEST\" %in% names(qoc) && \"SUSP\" %in% names(qoc)) {\n", + " qoc[, testing_rate := fifelse(SUSP > 0, TEST / SUSP, NA_real_)]\n", + "} else {\n", + " log_msg(\"[WARNING] Cannot calculate testing_rate: missing TEST or SUSP columns\", level = \"warning\")\n", + "}\n", + "\n", + "if (\"MALTREAT\" %in% names(qoc) && \"CONF\" %in% names(qoc)) {\n", + " qoc[, treatment_rate := fifelse(CONF > 0, MALTREAT / CONF, NA_real_)]\n", + "} else {\n", + " log_msg(\"[WARNING] Cannot calculate treatment_rate: missing MALTREAT or CONF columns\", level = \"warning\")\n", + "}\n", + "\n", + "if (\"MALDTH\" %in% names(qoc) && \"MALADM\" %in% names(qoc)) {\n", + " qoc[, case_fatality_rate := fifelse(MALADM > 0, MALDTH / MALADM, NA_real_)]\n", + "} else {\n", + " log_msg(\"[WARNING] Cannot calculate case_fatality_rate: missing MALDTH or MALADM columns\", level = \"warning\")\n", + "}\n", + "\n", + "if (\"MALADM\" %in% names(qoc) && \"ALLADM\" %in% names(qoc)) {\n", + " qoc[, prop_adm_malaria := fifelse(ALLADM > 0, MALADM / ALLADM, NA_real_)]\n", + "} else {\n", + " 
log_msg(\"[WARNING] Cannot calculate prop_adm_malaria: missing MALADM or ALLADM columns\", level = \"warning\")\n", + "}\n", + "\n", + "if (\"MALDTH\" %in% names(qoc) && \"ALLDTH\" %in% names(qoc)) {\n", + " qoc[, prop_malaria_deaths := fifelse(ALLDTH > 0, MALDTH / ALLDTH, NA_real_)]\n", + "} else {\n", + " log_msg(\"[WARNING] Cannot calculate prop_malaria_deaths: missing MALDTH or ALLDTH columns\", level = \"warning\")\n", + "}\n", + "\n", + "if (\"ALLOUT\" %in% names(qoc)) {\n", + " qoc[, non_malaria_all_cause_outpatients := ALLOUT]\n", + "} else {\n", + " log_msg(\"[WARNING] Cannot calculate non_malaria_all_cause_outpatients: missing ALLOUT column\", level = \"warning\")\n", + "}\n", + "\n", + "if (\"PRES\" %in% names(qoc)) {\n", + " qoc[, presumed_cases := PRES]\n", + "} else {\n", + " log_msg(\"[WARNING] Cannot calculate presumed_cases: missing PRES column\", level = \"warning\")\n", + "}\n", "\n", "shapes_dt <- as.data.table(sf::st_drop_geometry(shapes))\n", "if (\"ADM2_ID\" %in% names(shapes_dt) && \"ADM2_NAME\" %in% names(shapes_dt)) {\n", @@ -138,18 +202,18 @@ "writexl::write_xlsx(list(quality_of_care = as.data.frame(qoc)), out_xlsx)\n", "\n", "log_msg(glue::glue(\"Saved outputs: {out_parquet}, {out_csv}, {out_xlsx}\"))" - ], - "execution_count": null, - "outputs": [], - "id": "98b78bf7" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "984689b0", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "# Yearly maps by ADM2\n", "shapes$ADM2_ID <- as.character(shapes$ADM2_ID)\n", @@ -206,19 +270,31 @@ " }\n", "}\n", "\n", - "plot_yearly_map(qoc, shapes, \"testing_rate\", \"Testing rate (TEST / SUSP)\", \"testing_rate\", TRUE)\n", - "plot_yearly_map(qoc, shapes, \"treatment_rate\", \"Treatment rate (MALTREAT / CONF)\", \"treatment_rate\", TRUE)\n", - "plot_yearly_map(qoc, shapes, \"case_fatality_rate\", \"In-hospital case fatality rate (MALDTH / MALADM)\", \"case_fatality_rate\", TRUE)\n", - "plot_yearly_map(qoc, 
shapes, \"prop_adm_malaria\", \"Proportion admitted for malaria (MALADM / ALLADM)\", \"prop_adm_malaria\", TRUE)\n", - "plot_yearly_map(qoc, shapes, \"prop_malaria_deaths\", \"Proportion of malaria deaths (MALDTH / ALLDTH)\", \"prop_malaria_deaths\", TRUE)\n", - "plot_yearly_map(qoc, shapes, \"non_malaria_all_cause_outpatients\", \"Non-malaria all-cause outpatients (ALLOUT)\", \"allout\", FALSE)\n", - "plot_yearly_map(qoc, shapes, \"presumed_cases\", \"Presumed cases (PRES)\", \"presumed_cases\", FALSE)\n", + "# Plot only indicators that were calculated (columns exist)\n", + "if (\"testing_rate\" %in% names(qoc)) {\n", + " plot_yearly_map(qoc, shapes, \"testing_rate\", \"Testing rate (TEST / SUSP)\", \"testing_rate\", TRUE)\n", + "}\n", + "if (\"treatment_rate\" %in% names(qoc)) {\n", + " plot_yearly_map(qoc, shapes, \"treatment_rate\", \"Treatment rate (MALTREAT / CONF)\", \"treatment_rate\", TRUE)\n", + "}\n", + "if (\"case_fatality_rate\" %in% names(qoc)) {\n", + " plot_yearly_map(qoc, shapes, \"case_fatality_rate\", \"In-hospital case fatality rate (MALDTH / MALADM)\", \"case_fatality_rate\", TRUE)\n", + "}\n", + "if (\"prop_adm_malaria\" %in% names(qoc)) {\n", + " plot_yearly_map(qoc, shapes, \"prop_adm_malaria\", \"Proportion admitted for malaria (MALADM / ALLADM)\", \"prop_adm_malaria\", TRUE)\n", + "}\n", + "if (\"prop_malaria_deaths\" %in% names(qoc)) {\n", + " plot_yearly_map(qoc, shapes, \"prop_malaria_deaths\", \"Proportion of malaria deaths (MALDTH / ALLDTH)\", \"prop_malaria_deaths\", TRUE)\n", + "}\n", + "if (\"non_malaria_all_cause_outpatients\" %in% names(qoc)) {\n", + " plot_yearly_map(qoc, shapes, \"non_malaria_all_cause_outpatients\", \"Non-malaria all-cause outpatients (ALLOUT)\", \"allout\", FALSE)\n", + "}\n", + "if (\"presumed_cases\" %in% names(qoc)) {\n", + " plot_yearly_map(qoc, shapes, \"presumed_cases\", \"Presumed cases (PRES)\", \"presumed_cases\", FALSE)\n", + "}\n", "\n", "log_msg(glue::glue(\"Saved yearly maps in: 
{FIGURES_PATH}\"))" - ], - "execution_count": null, - "outputs": [], - "id": "984689b0" + ] } ], "metadata": { @@ -238,4 +314,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb b/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb index 9c973fc..f737d85 100644 --- a/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb +++ b/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb @@ -2,21 +2,24 @@ "cells": [ { "cell_type": "markdown", + "id": "7d246ae9", "metadata": {}, "source": [ "## Quality of Care Report\n", "\n", "This report displays a compact year-level summary of quality-of-care indicators and points to generated map outputs." - ], - "id": "7d246ae9" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "5eaa5bab", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "ROOT_PATH <- \"~/workspace\"\n", "CONFIG_PATH <- file.path(ROOT_PATH, \"configuration\")\n", @@ -32,18 +35,18 @@ "\n", "config_json <- jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))\n", "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE" - ], - "execution_count": null, - "outputs": [], - "id": "5eaa5bab" + ] }, { "cell_type": "code", + "execution_count": null, + "id": "1a8320f8", "metadata": { "vscode": { "languageId": "r" } }, + "outputs": [], "source": [ "files <- list.files(DATA_PATH, pattern = paste0(\"^\", COUNTRY_CODE, \"_quality_of_care_.*\\\\.parquet$\"), full.names = TRUE)\n", "if (length(files) == 0) {\n", @@ -67,10 +70,7 @@ "\n", "cat(glue::glue(\"\\nLoaded file: {latest_file}\\n\"))\n", "cat(glue::glue(\"Map outputs folder: {FIGURES_PATH}\\n\"))" - ], - "execution_count": null, - "outputs": [], - "id": "1a8320f8" + ] } ], "metadata": { @@ -90,4 +90,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end 
of file +} diff --git a/snt_dhis2_outliers_imputation_iqr/pipeline.py b/snt_dhis2_outliers_imputation_iqr/pipeline.py index 4849fb0..94dfb48 100644 --- a/snt_dhis2_outliers_imputation_iqr/pipeline.py +++ b/snt_dhis2_outliers_imputation_iqr/pipeline.py @@ -1,4 +1,5 @@ from pathlib import Path +import tempfile from openhexa.sdk import current_run, parameter, pipeline, workspace from snt_lib.snt_pipeline_utils import ( @@ -13,6 +14,68 @@ ) +def preserve_and_add_files_to_dataset( + dataset_id: str, + country_code: str, + new_files: list[Path], + method_prefix: str, +): + """ + Add new files to dataset while preserving existing files from other methods. + + Args: + dataset_id: Dataset identifier + country_code: Country code + new_files: List of new file paths to add + method_prefix: Prefix pattern to identify files from this method (e.g., "mean", "median", "magic_glasses") + """ + try: + dataset = workspace.get_dataset(dataset_id) + latest_version = dataset.latest_version + existing_files = latest_version.list_files() + + # Filter out files from this method but keep others + preserved_files = [] + for file_obj in existing_files: + filename = file_obj.name + + # Determine if this file belongs to the current method + is_current_method = False + if method_prefix == "magic_glasses": + # Magic Glasses files: flagged_outliers_magic_glasses.parquet, outlier_magic_glasses_*.parquet + is_current_method = ( + filename == f"{country_code}_flagged_outliers_magic_glasses.parquet" or + filename.startswith(f"{country_code}_outlier_magic_glasses_") + ) + else: + # Other methods: routine_outliers-{method}*.parquet + is_current_method = filename.startswith(f"{country_code}_routine_outliers-{method_prefix}") + + # Preserve files from other methods + if not is_current_method: + try: + with tempfile.NamedTemporaryFile(delete=False, suffix=Path(filename).suffix) as tmp_file: + tmp_path = Path(tmp_file.name) + file_obj.download(tmp_path) + preserved_files.append(tmp_path) + 
current_run.log_info(f"Preserving existing file: {filename}") + except Exception as e: + current_run.log_warning(f"Could not preserve file {filename}: {e}") + + # Combine preserved files with new files + all_files = preserved_files + new_files + current_run.log_info(f"Adding {len(new_files)} new files and preserving {len(preserved_files)} existing files") + except Exception as e: + current_run.log_warning(f"Could not preserve existing files, adding only new files: {e}") + all_files = new_files + + add_files_to_dataset( + dataset_id=dataset_id, + country_code=country_code, + file_paths=all_files, + ) + + @pipeline("snt_dhis2_outliers_imputation_iqr") @parameter( "deviation_iqr", @@ -100,10 +163,15 @@ def snt_dhis2_outliers_imputation_iqr( ) iqr_files = list(data_path.glob(f"{country_code}_routine_outliers-iqr*.parquet")) - add_files_to_dataset( - dataset_id=snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_OUTLIERS_IMPUTATION"], + new_files = [*iqr_files, parameters_file] + + # Preserve existing files from other methods and add new ones + dataset_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_OUTLIERS_IMPUTATION"] + preserve_and_add_files_to_dataset( + dataset_id=dataset_id, country_code=country_code, - file_paths=[*iqr_files, parameters_file], + new_files=new_files, + method_prefix="iqr", ) if push_db: diff --git a/snt_dhis2_outliers_imputation_magic_glasses/pipeline.py b/snt_dhis2_outliers_imputation_magic_glasses/pipeline.py index 0f34f0a..94e5bde 100644 --- a/snt_dhis2_outliers_imputation_magic_glasses/pipeline.py +++ b/snt_dhis2_outliers_imputation_magic_glasses/pipeline.py @@ -1,5 +1,6 @@ from pathlib import Path import time +import tempfile from openhexa.sdk import current_run, parameter, pipeline, workspace from snt_lib.snt_pipeline_utils import ( @@ -14,6 +15,68 @@ ) +def preserve_and_add_files_to_dataset( + dataset_id: str, + country_code: str, + new_files: list[Path], + method_prefix: str, +): + """ + Add new files to dataset while preserving existing files 
from other methods. + + Args: + dataset_id: Dataset identifier + country_code: Country code + new_files: List of new file paths to add + method_prefix: Prefix pattern to identify files from this method (e.g., "mean", "median", "magic_glasses") + """ + try: + dataset = workspace.get_dataset(dataset_id) + latest_version = dataset.latest_version + existing_files = latest_version.list_files() + + # Filter out files from this method but keep others + preserved_files = [] + for file_obj in existing_files: + filename = file_obj.name + + # Determine if this file belongs to the current method + is_current_method = False + if method_prefix == "magic_glasses": + # Magic Glasses files: flagged_outliers_magic_glasses.parquet, outlier_magic_glasses_*.parquet + is_current_method = ( + filename == f"{country_code}_flagged_outliers_magic_glasses.parquet" or + filename.startswith(f"{country_code}_outlier_magic_glasses_") + ) + else: + # Other methods: routine_outliers-{method}*.parquet + is_current_method = filename.startswith(f"{country_code}_routine_outliers-{method_prefix}") + + # Preserve files from other methods + if not is_current_method: + try: + with tempfile.NamedTemporaryFile(delete=False, suffix=Path(filename).suffix) as tmp_file: + tmp_path = Path(tmp_file.name) + file_obj.download(tmp_path) + preserved_files.append(tmp_path) + current_run.log_info(f"Preserving existing file: {filename}") + except Exception as e: + current_run.log_warning(f"Could not preserve file {filename}: {e}") + + # Combine preserved files with new files + all_files = preserved_files + new_files + current_run.log_info(f"Adding {len(new_files)} new files and preserving {len(preserved_files)} existing files") + except Exception as e: + current_run.log_warning(f"Could not preserve existing files, adding only new files: {e}") + all_files = new_files + + add_files_to_dataset( + dataset_id=dataset_id, + country_code=country_code, + file_paths=all_files, + ) + + 
@pipeline("snt_dhis2_outliers_imputation_magic_glasses") @parameter( "mode", @@ -142,14 +205,18 @@ def snt_dhis2_outliers_imputation_magic_glasses( country_code=country_code, ) - add_files_to_dataset( - dataset_id=snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_OUTLIERS_IMPUTATION"], + # Get new files for Magic Glasses + mg_files = list(data_path.glob(f"{country_code}_flagged_outliers_magic_glasses.parquet")) + mg_files.extend(data_path.glob(f"{country_code}_outlier_magic_glasses_*.parquet")) + new_files = [*mg_files, parameters_file] + + # Preserve existing files from other methods and add new ones + dataset_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_OUTLIERS_IMPUTATION"] + preserve_and_add_files_to_dataset( + dataset_id=dataset_id, country_code=country_code, - file_paths=[ - *data_path.glob(f"{country_code}_flagged_outliers_magic_glasses.parquet"), - *data_path.glob(f"{country_code}_outlier_magic_glasses_*.parquet"), - parameters_file, - ], + new_files=new_files, + method_prefix="magic_glasses", ) if push_db: diff --git a/snt_dhis2_outliers_imputation_mean/pipeline.py b/snt_dhis2_outliers_imputation_mean/pipeline.py index 0cf3a88..2b65639 100644 --- a/snt_dhis2_outliers_imputation_mean/pipeline.py +++ b/snt_dhis2_outliers_imputation_mean/pipeline.py @@ -1,4 +1,5 @@ from pathlib import Path +import tempfile from openhexa.sdk import current_run, parameter, pipeline, workspace from snt_lib.snt_pipeline_utils import ( @@ -13,6 +14,68 @@ ) +def preserve_and_add_files_to_dataset( + dataset_id: str, + country_code: str, + new_files: list[Path], + method_prefix: str, +): + """ + Add new files to dataset while preserving existing files from other methods. 
+ + Args: + dataset_id: Dataset identifier + country_code: Country code + new_files: List of new file paths to add + method_prefix: Prefix pattern to identify files from this method (e.g., "mean", "median", "magic_glasses") + """ + try: + dataset = workspace.get_dataset(dataset_id) + latest_version = dataset.latest_version + existing_files = latest_version.list_files() + + # Filter out files from this method but keep others + preserved_files = [] + for file_obj in existing_files: + filename = file_obj.name + + # Determine if this file belongs to the current method + is_current_method = False + if method_prefix == "magic_glasses": + # Magic Glasses files: flagged_outliers_magic_glasses.parquet, outlier_magic_glasses_*.parquet + is_current_method = ( + filename == f"{country_code}_flagged_outliers_magic_glasses.parquet" or + filename.startswith(f"{country_code}_outlier_magic_glasses_") + ) + else: + # Other methods: routine_outliers-{method}*.parquet + is_current_method = filename.startswith(f"{country_code}_routine_outliers-{method_prefix}") + + # Preserve files from other methods + if not is_current_method: + try: + with tempfile.NamedTemporaryFile(delete=False, suffix=Path(filename).suffix) as tmp_file: + tmp_path = Path(tmp_file.name) + file_obj.download(tmp_path) + preserved_files.append(tmp_path) + current_run.log_info(f"Preserving existing file: {filename}") + except Exception as e: + current_run.log_warning(f"Could not preserve file {filename}: {e}") + + # Combine preserved files with new files + all_files = preserved_files + new_files + current_run.log_info(f"Adding {len(new_files)} new files and preserving {len(preserved_files)} existing files") + except Exception as e: + current_run.log_warning(f"Could not preserve existing files, adding only new files: {e}") + all_files = new_files + + add_files_to_dataset( + dataset_id=dataset_id, + country_code=country_code, + file_paths=all_files, + ) + + @pipeline("snt_dhis2_outliers_imputation_mean") @parameter( 
"deviation_mean", @@ -99,11 +162,17 @@ def snt_dhis2_outliers_imputation_mean( country_code=country_code, ) + # Get new files for this method mean_files = list(data_path.glob(f"{country_code}_routine_outliers-mean*.parquet")) - add_files_to_dataset( - dataset_id=snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_OUTLIERS_IMPUTATION"], + new_files = [*mean_files, parameters_file] + + # Preserve existing files from other methods and add new ones + dataset_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_OUTLIERS_IMPUTATION"] + preserve_and_add_files_to_dataset( + dataset_id=dataset_id, country_code=country_code, - file_paths=[*mean_files, parameters_file], + new_files=new_files, + method_prefix="mean", ) if push_db: diff --git a/snt_dhis2_outliers_imputation_median/pipeline.py b/snt_dhis2_outliers_imputation_median/pipeline.py index cf49481..d5e0534 100644 --- a/snt_dhis2_outliers_imputation_median/pipeline.py +++ b/snt_dhis2_outliers_imputation_median/pipeline.py @@ -1,4 +1,5 @@ from pathlib import Path +import tempfile from openhexa.sdk import current_run, parameter, pipeline, workspace from snt_lib.snt_pipeline_utils import ( @@ -13,6 +14,68 @@ ) +def preserve_and_add_files_to_dataset( + dataset_id: str, + country_code: str, + new_files: list[Path], + method_prefix: str, +): + """ + Add new files to dataset while preserving existing files from other methods. 
+ + Args: + dataset_id: Dataset identifier + country_code: Country code + new_files: List of new file paths to add + method_prefix: Prefix pattern to identify files from this method (e.g., "mean", "median", "magic_glasses") + """ + try: + dataset = workspace.get_dataset(dataset_id) + latest_version = dataset.latest_version + existing_files = latest_version.list_files() + + # Filter out files from this method but keep others + preserved_files = [] + for file_obj in existing_files: + filename = file_obj.name + + # Determine if this file belongs to the current method + is_current_method = False + if method_prefix == "magic_glasses": + # Magic Glasses files: flagged_outliers_magic_glasses.parquet, outlier_magic_glasses_*.parquet + is_current_method = ( + filename == f"{country_code}_flagged_outliers_magic_glasses.parquet" or + filename.startswith(f"{country_code}_outlier_magic_glasses_") + ) + else: + # Other methods: routine_outliers-{method}*.parquet + is_current_method = filename.startswith(f"{country_code}_routine_outliers-{method_prefix}") + + # Preserve files from other methods + if not is_current_method: + try: + with tempfile.NamedTemporaryFile(delete=False, suffix=Path(filename).suffix) as tmp_file: + tmp_path = Path(tmp_file.name) + file_obj.download(tmp_path) + preserved_files.append(tmp_path) + current_run.log_info(f"Preserving existing file: {filename}") + except Exception as e: + current_run.log_warning(f"Could not preserve file {filename}: {e}") + + # Combine preserved files with new files + all_files = preserved_files + new_files + current_run.log_info(f"Adding {len(new_files)} new files and preserving {len(preserved_files)} existing files") + except Exception as e: + current_run.log_warning(f"Could not preserve existing files, adding only new files: {e}") + all_files = new_files + + add_files_to_dataset( + dataset_id=dataset_id, + country_code=country_code, + file_paths=all_files, + ) + + @pipeline("snt_dhis2_outliers_imputation_median") @parameter( 
"deviation_median", @@ -100,10 +163,15 @@ def snt_dhis2_outliers_imputation_median( ) median_files = list(data_path.glob(f"{country_code}_routine_outliers-median*.parquet")) - add_files_to_dataset( - dataset_id=snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_OUTLIERS_IMPUTATION"], + new_files = [*median_files, parameters_file] + + # Preserve existing files from other methods and add new ones + dataset_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_OUTLIERS_IMPUTATION"] + preserve_and_add_files_to_dataset( + dataset_id=dataset_id, country_code=country_code, - file_paths=[*median_files, parameters_file], + new_files=new_files, + method_prefix="median", ) if push_db: diff --git a/snt_dhis2_outliers_imputation_path/pipeline.py b/snt_dhis2_outliers_imputation_path/pipeline.py index 1f98651..186aedc 100644 --- a/snt_dhis2_outliers_imputation_path/pipeline.py +++ b/snt_dhis2_outliers_imputation_path/pipeline.py @@ -1,4 +1,5 @@ from pathlib import Path +import tempfile from openhexa.sdk import current_run, parameter, pipeline, workspace from snt_lib.snt_pipeline_utils import ( @@ -13,6 +14,68 @@ ) +def preserve_and_add_files_to_dataset( + dataset_id: str, + country_code: str, + new_files: list[Path], + method_prefix: str, +): + """ + Add new files to dataset while preserving existing files from other methods. 
+ + Args: + dataset_id: Dataset identifier + country_code: Country code + new_files: List of new file paths to add + method_prefix: Prefix pattern to identify files from this method (e.g., "mean", "median", "magic_glasses") + """ + try: + dataset = workspace.get_dataset(dataset_id) + latest_version = dataset.latest_version + existing_files = latest_version.list_files() + + # Filter out files from this method but keep others + preserved_files = [] + for file_obj in existing_files: + filename = file_obj.name + + # Determine if this file belongs to the current method + is_current_method = False + if method_prefix == "magic_glasses": + # Magic Glasses files: flagged_outliers_magic_glasses.parquet, outlier_magic_glasses_*.parquet + is_current_method = ( + filename == f"{country_code}_flagged_outliers_magic_glasses.parquet" or + filename.startswith(f"{country_code}_outlier_magic_glasses_") + ) + else: + # Other methods: routine_outliers-{method}*.parquet + is_current_method = filename.startswith(f"{country_code}_routine_outliers-{method_prefix}") + + # Preserve files from other methods + if not is_current_method: + try: + with tempfile.NamedTemporaryFile(delete=False, suffix=Path(filename).suffix) as tmp_file: + tmp_path = Path(tmp_file.name) + file_obj.download(tmp_path) + preserved_files.append(tmp_path) + current_run.log_info(f"Preserving existing file: {filename}") + except Exception as e: + current_run.log_warning(f"Could not preserve file {filename}: {e}") + + # Combine preserved files with new files + all_files = preserved_files + new_files + current_run.log_info(f"Adding {len(new_files)} new files and preserving {len(preserved_files)} existing files") + except Exception as e: + current_run.log_warning(f"Could not preserve existing files, adding only new files: {e}") + all_files = new_files + + add_files_to_dataset( + dataset_id=dataset_id, + country_code=country_code, + file_paths=all_files, + ) + + @pipeline("snt_dhis2_outliers_imputation_path") @parameter( 
"deviation_mean", @@ -101,14 +164,17 @@ def snt_dhis2_outliers_imputation_path( country_code=country_code, ) - # Add files to Dataset - add_files_to_dataset( - dataset_id=snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_OUTLIERS_IMPUTATION"], + # Get new files for this method (trend) + trend_files = list(data_path.glob(f"{country_code}_routine_outliers-trend*.parquet")) + new_files = [*trend_files, parameters_file] + + # Preserve existing files from other methods and add new ones + dataset_id = snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_OUTLIERS_IMPUTATION"] + preserve_and_add_files_to_dataset( + dataset_id=dataset_id, country_code=country_code, - file_paths=[ - *data_path.glob(f"{country_code}_routine_outliers*.parquet"), - parameters_file, - ], + new_files=new_files, + method_prefix="trend", ) # Create consolidated outliers DB table diff --git a/snt_dhis2_quality_of_care/pipeline.py b/snt_dhis2_quality_of_care/pipeline.py index e5f7e6d..fe07855 100644 --- a/snt_dhis2_quality_of_care/pipeline.py +++ b/snt_dhis2_quality_of_care/pipeline.py @@ -18,7 +18,7 @@ name="Outlier imputation method", help="Choose which outlier detection/imputation method to use.", type=str, - choices=["mean", "median", "iqr", "trend"], + choices=["mean", "median", "iqr", "trend", "mg-partial", "mg-complete"], default="mean", required=True, ) From b06168149a63ccd755fa50bcb29bada069223776 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Wed, 11 Mar 2026 12:19:33 +0100 Subject: [PATCH 5/7] Enhance SNT Quality of Care pipeline and reporting notebooks by adding dplyr to required packages, improving file handling for missing outputs, and refining summary table generation. Updated error handling for missing indicators and ensured compatibility with existing output formats. 
--- .../code/snt_dhis2_quality_of_care.ipynb | 256 ++++++-- .../snt_dhis2_quality_of_care_report.ipynb | 579 +++++++++++++++++- snt_dhis2_quality_of_care/pipeline.py | 32 +- 3 files changed, 786 insertions(+), 81 deletions(-) diff --git a/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb b/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb index 6d8db1b..210dd16 100644 --- a/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb +++ b/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb @@ -46,7 +46,7 @@ "dir.create(FIGURES_PATH, recursive = TRUE, showWarnings = FALSE)\n", "\n", "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "required_packages <- c(\"jsonlite\", \"data.table\", \"arrow\", \"sf\", \"ggplot2\", \"glue\", \"reticulate\", \"RColorBrewer\", \"writexl\")\n", + "required_packages <- c(\"jsonlite\", \"data.table\", \"arrow\", \"sf\", \"ggplot2\", \"glue\", \"reticulate\", \"RColorBrewer\", \"writexl\", \"dplyr\")\n", "install_and_load(required_packages)\n", "\n", "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", @@ -69,42 +69,59 @@ }, "outputs": [], "source": [ - "# Fallback parameters for local/dev execution\n", - "if (!exists(\"outlier_imputation_method\")) {\n", - " outlier_imputation_method <- \"mean\"\n", - "}\n", + "# Validate data_action parameter\n", "if (!exists(\"data_action\")) {\n", " data_action <- \"imputed\"\n", "}\n", "\n", - "allowed_methods <- c(\"mean\", \"median\", \"iqr\", \"trend\", \"mg-partial\", \"mg-complete\")\n", - "if (!(outlier_imputation_method %in% allowed_methods)) {\n", - " stop(glue::glue(\"Invalid outlier_imputation_method: {outlier_imputation_method}. Allowed: {paste(allowed_methods, collapse=', ')}\"))\n", - "}\n", - "\n", "allowed_actions <- c(\"imputed\", \"removed\")\n", "if (!(data_action %in% allowed_actions)) {\n", " stop(glue::glue(\"Invalid data_action: {data_action}. 
Allowed: {paste(allowed_actions, collapse=', ')}\"))\n", "}\n", "\n", - "# Magic Glasses uses different file naming convention\n", - "is_mg <- outlier_imputation_method %in% c(\"mg-partial\", \"mg-complete\")\n", + "# Automatically find the latest routine outliers-imputed file in the dataset\n", + "# Pattern: {COUNTRY_CODE}_routine_outliers-*_{data_action}.parquet\n", + "log_msg(glue::glue(\"Searching for latest routine outliers-imputed file in dataset (data_action: {data_action})...\"))\n", "\n", - "if (is_mg) {\n", - " # For Magic Glasses, use formatted routine data (no imputed files exist)\n", - " # Note: Magic Glasses produces outlier flags, not imputed routine files\n", - " # We use the base formatted routine data\n", - " routine_filename <- glue::glue(\"{COUNTRY_CODE}_routine.parquet\")\n", - " log_msg(glue::glue(\"Magic Glasses method selected. Loading base routine file: {routine_filename}\"))\n", - " log_msg(\"[WARNING] Magic Glasses does not produce imputed routine files. Using base formatted routine data.\")\n", - " routine <- get_latest_dataset_file_in_memory(DHIS2_FORMATTED_DATASET, routine_filename)\n", - "} else {\n", - " # Standard methods: use outliers-imputed files\n", - " routine_filename <- glue::glue(\"{COUNTRY_CODE}_routine_outliers-{outlier_imputation_method}_{data_action}.parquet\")\n", - " log_msg(glue::glue(\"Loading routine file from DHIS2 outliers dataset: {routine_filename}\"))\n", - " routine <- get_latest_dataset_file_in_memory(OUTLIERS_DATASET, routine_filename)\n", + "dataset_last_version <- openhexa$workspace$get_dataset(OUTLIERS_DATASET)$latest_version\n", + "if (is.null(dataset_last_version)) {\n", + " stop(glue::glue(\"[ERROR] No version available in dataset `{OUTLIERS_DATASET}`. 
Process stopped.\"))\n", + "}\n", + "\n", + "# Pattern to match: {COUNTRY_CODE}_routine_outliers-*_{data_action}.parquet\n", + "pattern_prefix <- glue::glue(\"{COUNTRY_CODE}_routine_outliers-\")\n", + "pattern_suffix <- glue::glue(\"_{data_action}.parquet\")\n", + "routine_filename <- NULL\n", + "files_list <- reticulate::iterate(dataset_last_version$files)\n", + "\n", + "# Find all matching files and select the latest one\n", + "matching_files <- c()\n", + "for (file in files_list) {\n", + " filename <- file$filename\n", + " if (startsWith(filename, pattern_prefix) && endsWith(filename, pattern_suffix)) {\n", + " matching_files <- c(matching_files, filename)\n", + " }\n", "}\n", "\n", + "if (length(matching_files) == 0) {\n", + " stop(glue::glue(\"[ERROR] No file matching pattern `{pattern_prefix}*{pattern_suffix}` found in dataset `{OUTLIERS_DATASET}`. \",\n", + " \"Please run an outlier imputation pipeline first (e.g., snt_dhis2_outliers_imputation_mean) with `data_action=\\\"{data_action}\\\"`.\"))\n", + "}\n", + "\n", + "# Select the latest file (alphabetically sorted, which should correspond to most recent method)\n", + "routine_filename <- sort(matching_files, decreasing = TRUE)[1]\n", + "\n", + "log_msg(glue::glue(\"Found {length(matching_files)} matching file(s). Using latest: {routine_filename}\"))\n", + "\n", + "# Load the routine file\n", + "routine <- tryCatch({\n", + " get_latest_dataset_file_in_memory(OUTLIERS_DATASET, routine_filename)\n", + "}, error = function(e) {\n", + " msg <- paste0(\"[ERROR] 🛑 Error while loading DHIS2 routine data file `\", routine_filename, \n", + " \"` from `\", OUTLIERS_DATASET, \"`. 
[ERROR DETAILS] \", conditionMessage(e))\n", + " stop(msg)\n", + "})\n", + "\n", "shapes <- get_latest_dataset_file_in_memory(DHIS2_FORMATTED_DATASET, paste0(COUNTRY_CODE, \"_shapes.geojson\"))\n", "\n", "setDT(routine)\n", @@ -126,9 +143,15 @@ "}\n", "\n", "# Convert available numeric columns\n", + "# Handle \"-\" and other non-numeric values by converting them to NA first\n", "num_cols <- intersect(available_cols, names(routine))\n", "if (length(num_cols) > 0) {\n", - " routine[, (num_cols) := lapply(.SD, function(x) as.numeric(x)), .SDcols = num_cols]\n", + " for (col in num_cols) {\n", + " # First convert to character to handle \"-\" strings, then replace with NA, then convert to numeric\n", + " col_vals <- as.character(routine[[col]])\n", + " col_vals[is.na(col_vals) | col_vals == \"\" | col_vals == \"-\"] <- NA_character_\n", + " routine[, (col) := as.numeric(col_vals)]\n", + " }\n", "}\n", "routine[, YEAR := as.integer(YEAR)]\n", "routine[, ADM2_ID := as.character(ADM2_ID)]\n", @@ -171,6 +194,8 @@ "\n", "if (\"MALDTH\" %in% names(qoc) && \"ALLDTH\" %in% names(qoc)) {\n", " qoc[, prop_malaria_deaths := fifelse(ALLDTH > 0, MALDTH / ALLDTH, NA_real_)]\n", + " # Compatibility alias to match historical notebook export naming\n", + " qoc[, prop_deaths_malaria := prop_malaria_deaths]\n", "} else {\n", " log_msg(\"[WARNING] Cannot calculate prop_malaria_deaths: missing MALDTH or ALLDTH columns\", level = \"warning\")\n", "}\n", @@ -193,15 +218,53 @@ " qoc <- merge(qoc, unique(shapes_dt[, .(ADM2_ID, ADM2_NAME)]), by = \"ADM2_ID\", all.x = TRUE)\n", "}\n", "\n", - "out_parquet <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_{outlier_imputation_method}_{data_action}.parquet\"))\n", - "out_csv <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_{outlier_imputation_method}_{data_action}.csv\"))\n", - "out_xlsx <- file.path(OUTPUT_DATA_PATH, 
glue::glue(\"{COUNTRY_CODE}_quality_of_care_{outlier_imputation_method}_{data_action}.xlsx\"))\n", + "# Main district-year outputs\n", + "out_parquet <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_{data_action}.parquet\"))\n", + "out_csv <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_{data_action}.csv\"))\n", + "out_xlsx <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_{data_action}.xlsx\"))\n", + "\n", + "# Explicit district-year naming (requested format style)\n", + "out_district_parquet <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_district_year_{data_action}.parquet\"))\n", + "out_district_csv <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_district_year_{data_action}.csv\"))\n", + "\n", + "# Build a compact year-level summary from computed indicators\n", + "summary_tbl <- unique(qoc[, .(YEAR)])\n", + "if (\"testing_rate\" %in% names(qoc)) {\n", + " summary_tbl <- merge(summary_tbl, qoc[, .(testing_rate = mean(testing_rate, na.rm = TRUE)), by = .(YEAR)], by = \"YEAR\", all.x = TRUE)\n", + "}\n", + "if (\"treatment_rate\" %in% names(qoc)) {\n", + " summary_tbl <- merge(summary_tbl, qoc[, .(treatment_rate = mean(treatment_rate, na.rm = TRUE)), by = .(YEAR)], by = \"YEAR\", all.x = TRUE)\n", + "}\n", + "if (\"case_fatality_rate\" %in% names(qoc)) {\n", + " summary_tbl <- merge(summary_tbl, qoc[, .(case_fatality_rate = mean(case_fatality_rate, na.rm = TRUE)), by = .(YEAR)], by = \"YEAR\", all.x = TRUE)\n", + "}\n", + "if (\"prop_adm_malaria\" %in% names(qoc)) {\n", + " summary_tbl <- merge(summary_tbl, qoc[, .(prop_adm_malaria = mean(prop_adm_malaria, na.rm = TRUE)), by = .(YEAR)], by = \"YEAR\", all.x = TRUE)\n", + "}\n", + "if (\"prop_malaria_deaths\" %in% names(qoc)) {\n", + " summary_tbl <- merge(summary_tbl, qoc[, .(prop_malaria_deaths = mean(prop_malaria_deaths, na.rm = TRUE)), by = .(YEAR)], by = \"YEAR\", all.x = 
TRUE)\n", + "}\n", + "if (\"non_malaria_all_cause_outpatients\" %in% names(qoc)) {\n", + " summary_tbl <- merge(summary_tbl, qoc[, .(non_malaria_all_cause_outpatients = sum(non_malaria_all_cause_outpatients, na.rm = TRUE)), by = .(YEAR)], by = \"YEAR\", all.x = TRUE)\n", + "}\n", + "if (\"presumed_cases\" %in% names(qoc)) {\n", + " summary_tbl <- merge(summary_tbl, qoc[, .(presumed_cases = sum(presumed_cases, na.rm = TRUE)), by = .(YEAR)], by = \"YEAR\", all.x = TRUE)\n", + "}\n", + "summary_tbl <- summary_tbl[order(YEAR)]\n", + "\n", + "out_summary_parquet <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_year_summary_{data_action}.parquet\"))\n", + "out_summary_csv <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_year_summary_{data_action}.csv\"))\n", "\n", + "# Persist all outputs\n", "arrow::write_parquet(qoc, out_parquet)\n", "data.table::fwrite(qoc, out_csv)\n", "writexl::write_xlsx(list(quality_of_care = as.data.frame(qoc)), out_xlsx)\n", + "arrow::write_parquet(qoc, out_district_parquet)\n", + "data.table::fwrite(qoc, out_district_csv)\n", + "arrow::write_parquet(summary_tbl, out_summary_parquet)\n", + "data.table::fwrite(summary_tbl, out_summary_csv)\n", "\n", - "log_msg(glue::glue(\"Saved outputs: {out_parquet}, {out_csv}, {out_xlsx}\"))" + "log_msg(glue::glue(\"Saved outputs: {out_parquet}, {out_csv}, {out_xlsx}, {out_district_parquet}, {out_district_csv}, {out_summary_parquet}, {out_summary_csv}\"))" ] }, { @@ -216,42 +279,124 @@ "outputs": [], "source": [ "# Yearly maps by ADM2\n", + "# Ensure ADM2_ID is character in both objects (do this once before the function)\n", "shapes$ADM2_ID <- as.character(shapes$ADM2_ID)\n", "qoc$ADM2_ID <- as.character(qoc$ADM2_ID)\n", "\n", "plot_yearly_map <- function(df, sf_shapes, value_col, title_prefix, filename_prefix, is_rate = TRUE) {\n", + " # Check if value_col exists in df\n", + " if (!(value_col %in% names(df))) {\n", + " log_msg(glue::glue(\"[WARNING] Column 
'{value_col}' not found in data. Skipping map generation.\"), level = \"warning\")\n", + " return(invisible(NULL))\n", + " }\n", + " \n", + " # Create a local copy of sf_shapes to avoid modifying the original\n", + " sf_shapes_local <- sf_shapes\n", + " if (!is.character(sf_shapes_local$ADM2_ID)) {\n", + " sf_shapes_local$ADM2_ID <- as.character(sf_shapes_local$ADM2_ID)\n", + " }\n", + " \n", " years <- sort(unique(df$YEAR))\n", " for (yr in years) {\n", " df_y <- df[YEAR == yr]\n", - " map_df <- merge(sf_shapes, df_y, by = \"ADM2_ID\", all.x = TRUE)\n", + " \n", + " # Check if df_y has any rows\n", + " if (nrow(df_y) == 0) {\n", + " log_msg(glue::glue(\"[WARNING] No data for '{value_col}' in year {yr}. Skipping map.\"), level = \"warning\")\n", + " next\n", + " }\n", + " \n", + " # Ensure ADM2_ID is character in df_y\n", + " df_y$ADM2_ID <- as.character(df_y$ADM2_ID)\n", + " \n", + " # Use dplyr::left_join for sf objects to preserve geometry (use local copy)\n", + " map_df <- dplyr::left_join(sf_shapes_local, df_y, by = \"ADM2_ID\")\n", + "\n", + " # Check if value_col exists in map_df after merge\n", + " if (!(value_col %in% names(map_df))) {\n", + " log_msg(glue::glue(\"[WARNING] Column '{value_col}' not found after merge for year {yr}. Skipping map.\"), level = \"warning\")\n", + " next\n", + " }\n", "\n", - " p <- ggplot(map_df)\n", + " vals <- map_df[[value_col]]\n", + " finite_vals <- vals[is.finite(vals) & !is.na(vals)]\n", + " \n", + " # If no valid values, skip this map\n", + " if (length(finite_vals) == 0) {\n", + " log_msg(glue::glue(\"[WARNING] No valid values for '{value_col}' in year {yr}. 
Skipping map.\"), level = \"warning\")\n", + " next\n", + " }\n", "\n", + " # Create cat column BEFORE creating the plot\n", + " cat_vals <- NULL\n", + " fill_palette <- NULL\n", + " \n", " if (is_rate) {\n", - " map_df$cat <- cut(\n", - " map_df[[value_col]],\n", - " breaks = c(-Inf, 0, 0.2, 0.4, 0.6, 0.8, 1.0, Inf),\n", - " labels = c(\"<0\", \"0-0.2\", \"0.2-0.4\", \"0.4-0.6\", \"0.6-0.8\", \"0.8-1.0\", \">1.0\"),\n", - " include.lowest = TRUE\n", - " )\n", - " p <- p + geom_sf(aes(fill = cat), color = \"grey60\", size = 0.1) +\n", - " scale_fill_brewer(palette = \"YlOrRd\", na.value = \"white\", drop = FALSE)\n", + " # Create cat column with proper handling of NA values\n", + " cat_result <- tryCatch({\n", + " cat_vals <- cut(\n", + " vals,\n", + " breaks = c(-Inf, 0, 0.2, 0.4, 0.6, 0.8, 1.0, Inf),\n", + " labels = c(\"<0\", \"0-0.2\", \"0.2-0.4\", \"0.4-0.6\", \"0.6-0.8\", \"0.8-1.0\", \">1.0\"),\n", + " include.lowest = TRUE\n", + " )\n", + " fill_palette <- \"YlOrRd\"\n", + " TRUE # Success\n", + " }, error = function(e) {\n", + " log_msg(glue::glue(\"[WARNING] Failed to create categories for '{value_col}' year {yr}: {conditionMessage(e)}\"), level = \"warning\")\n", + " FALSE # Failure\n", + " })\n", + " if (!cat_result) {\n", + " next\n", + " }\n", " } else {\n", - " vals <- map_df[[value_col]]\n", - " finite_vals <- vals[is.finite(vals)]\n", - " if (length(finite_vals) > 4) {\n", - " br <- unique(as.numeric(quantile(finite_vals, probs = seq(0, 1, 0.2), na.rm = TRUE)))\n", - " if (length(br) < 2) {\n", - " map_df$cat <- as.factor(\"all\")\n", + " cat_result <- tryCatch({\n", + " if (length(finite_vals) > 4) {\n", + " br <- unique(as.numeric(quantile(finite_vals, probs = seq(0, 1, 0.2), na.rm = TRUE)))\n", + " if (length(br) < 2) {\n", + " cat_vals <- as.factor(rep(\"all\", nrow(map_df)))\n", + " } else {\n", + " cat_vals <- cut(vals, breaks = br, include.lowest = TRUE)\n", + " }\n", " } else {\n", - " map_df$cat <- cut(vals, breaks = br, include.lowest = 
TRUE)\n", + " cat_vals <- as.factor(vals)\n", " }\n", - " } else {\n", - " map_df$cat <- as.factor(vals)\n", + " fill_palette <- \"Blues\"\n", + " TRUE # Success\n", + " }, error = function(e) {\n", + " log_msg(glue::glue(\"[WARNING] Failed to create categories for '{value_col}' year {yr}: {conditionMessage(e)}\"), level = \"warning\")\n", + " FALSE # Failure\n", + " })\n", + " if (!cat_result) {\n", + " next\n", " }\n", - " p <- p + geom_sf(aes(fill = cat), color = \"grey60\", size = 0.1) +\n", - " scale_fill_brewer(palette = \"Blues\", na.value = \"white\", drop = FALSE)\n", " }\n", + " \n", + " # Check if cat_vals was created successfully\n", + " if (is.null(cat_vals) || length(cat_vals) != nrow(map_df)) {\n", + " log_msg(glue::glue(\"[WARNING] Failed to create 'cat' column for '{value_col}' in year {yr}. Skipping map.\"), level = \"warning\")\n", + " next\n", + " }\n", + " \n", + " # Check if all values are NA (cut failed) - but allow some NA values\n", + " if (all(is.na(cat_vals))) {\n", + " log_msg(glue::glue(\"[WARNING] All 'cat' values are NA for '{value_col}' in year {yr}. Skipping map.\"), level = \"warning\")\n", + " next\n", + " }\n", + " \n", + " # Add cat column using dplyr::mutate to ensure it's properly added to sf object\n", + " map_df <- dplyr::mutate(map_df, cat = as.factor(cat_vals))\n", + " \n", + " # Verify cat column exists before creating plot\n", + " if (!(\"cat\" %in% names(map_df))) {\n", + " log_msg(glue::glue(\"[WARNING] Failed to add 'cat' column to map_df for '{value_col}' in year {yr}. 
Skipping map.\"), level = \"warning\")\n", + " next\n", + " }\n", + " \n", + " # Create plot AFTER cat column is added\n", + " p <- ggplot(map_df) +\n", + " geom_sf(aes(fill = cat), color = \"grey60\", size = 0.1) +\n", + " scale_fill_brewer(palette = fill_palette, na.value = \"white\", drop = FALSE)\n", "\n", " p <- p +\n", " theme_void() +\n", @@ -266,7 +411,14 @@ " )\n", "\n", " out_png <- file.path(FIGURES_PATH, glue::glue(\"{filename_prefix}_{yr}.png\"))\n", - " ggsave(out_png, plot = p, width = 9, height = 7, dpi = 300, bg = \"white\")\n", + " \n", + " # Try to save the plot, catch any errors\n", + " tryCatch({\n", + " ggsave(out_png, plot = p, width = 9, height = 7, dpi = 300, bg = \"white\")\n", + " log_msg(glue::glue(\"Saved map: {out_png}\"))\n", + " }, error = function(e) {\n", + " log_msg(glue::glue(\"[WARNING] Failed to save map for '{value_col}' year {yr}: {conditionMessage(e)}\"), level = \"warning\")\n", + " })\n", " }\n", "}\n", "\n", diff --git a/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb b/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb index f737d85..12e3616 100644 --- a/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb +++ b/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb @@ -28,7 +28,12 @@ "FIGURES_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_quality_of_care\", \"reporting\", \"outputs\", \"figures\")\n", "\n", "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "install_and_load(c(\"jsonlite\", \"data.table\", \"arrow\", \"dplyr\", \"knitr\", \"glue\", \"reticulate\"))\n", + "install_and_load(c(\"jsonlite\", \"data.table\", \"arrow\", \"dplyr\", \"knitr\", \"glue\", \"reticulate\", \"writexl\", \"ggplot2\", \"scales\", \"gridExtra\", \"sf\"))\n", + "\n", + "# Create output directories\n", + "REPORT_OUTPUTS_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_quality_of_care\", 
\"reporting\", \"outputs\")\n", + "dir.create(REPORT_OUTPUTS_PATH, recursive = TRUE, showWarnings = FALSE)\n", + "dir.create(FIGURES_PATH, recursive = TRUE, showWarnings = FALSE)\n", "\n", "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", "openhexa <- reticulate::import(\"openhexa.sdk\")\n", @@ -48,7 +53,8 @@ }, "outputs": [], "source": [ - "files <- list.files(DATA_PATH, pattern = paste0(\"^\", COUNTRY_CODE, \"_quality_of_care_.*\\\\.parquet$\"), full.names = TRUE)\n", + "# Use only the main district-year file for the selected run (exclude summary side files)\n", + "files <- list.files(DATA_PATH, pattern = paste0(\"^\", COUNTRY_CODE, \"_quality_of_care_(imputed|removed)\\\\.parquet$\"), full.names = TRUE)\n", "if (length(files) == 0) {\n", " stop(glue::glue(\"No quality_of_care parquet found in {DATA_PATH}\"))\n", "}\n", @@ -56,21 +62,570 @@ "latest_file <- files[which.max(file.info(files)$mtime)]\n", "qoc <- as.data.table(arrow::read_parquet(latest_file))\n", "\n", - "summary_tbl <- qoc[, .(\n", - " testing_rate = mean(testing_rate, na.rm = TRUE),\n", - " treatment_rate = mean(treatment_rate, na.rm = TRUE),\n", - " case_fatality_rate = mean(case_fatality_rate, na.rm = TRUE),\n", - " prop_adm_malaria = mean(prop_adm_malaria, na.rm = TRUE),\n", - " prop_malaria_deaths = mean(prop_malaria_deaths, na.rm = TRUE),\n", - " non_malaria_all_cause_outpatients = sum(non_malaria_all_cause_outpatients, na.rm = TRUE),\n", - " presumed_cases = sum(presumed_cases, na.rm = TRUE)\n", - "), by = .(YEAR)][order(YEAR)]\n", + "# Build summary table with only available columns\n", + "# Start with unique YEAR values\n", + "summary_tbl <- unique(qoc[, .(YEAR)])\n", + "\n", + "# Add rate indicators (mean) - merge one by one\n", + "if (\"testing_rate\" %in% names(qoc)) {\n", + " summary_tbl <- merge(summary_tbl, \n", + " qoc[, .(testing_rate = mean(testing_rate, na.rm = TRUE)), by = .(YEAR)], \n", + " by = \"YEAR\", all.x = TRUE)\n", + "}\n", + "if (\"treatment_rate\" %in% 
names(qoc)) {\n", + " summary_tbl <- merge(summary_tbl, \n", + " qoc[, .(treatment_rate = mean(treatment_rate, na.rm = TRUE)), by = .(YEAR)], \n", + " by = \"YEAR\", all.x = TRUE)\n", + "}\n", + "if (\"case_fatality_rate\" %in% names(qoc)) {\n", + " summary_tbl <- merge(summary_tbl, \n", + " qoc[, .(case_fatality_rate = mean(case_fatality_rate, na.rm = TRUE)), by = .(YEAR)], \n", + " by = \"YEAR\", all.x = TRUE)\n", + "}\n", + "if (\"prop_adm_malaria\" %in% names(qoc)) {\n", + " summary_tbl <- merge(summary_tbl, \n", + " qoc[, .(prop_adm_malaria = mean(prop_adm_malaria, na.rm = TRUE)), by = .(YEAR)], \n", + " by = \"YEAR\", all.x = TRUE)\n", + "}\n", + "if (\"prop_malaria_deaths\" %in% names(qoc)) {\n", + " summary_tbl <- merge(summary_tbl, \n", + " qoc[, .(prop_malaria_deaths = mean(prop_malaria_deaths, na.rm = TRUE)), by = .(YEAR)], \n", + " by = \"YEAR\", all.x = TRUE)\n", + "}\n", + "\n", + "# Add absolute indicators (sum)\n", + "if (\"non_malaria_all_cause_outpatients\" %in% names(qoc)) {\n", + " summary_tbl <- merge(summary_tbl, \n", + " qoc[, .(non_malaria_all_cause_outpatients = sum(non_malaria_all_cause_outpatients, na.rm = TRUE)), by = .(YEAR)], \n", + " by = \"YEAR\", all.x = TRUE)\n", + "}\n", + "if (\"presumed_cases\" %in% names(qoc)) {\n", + " summary_tbl <- merge(summary_tbl, \n", + " qoc[, .(presumed_cases = sum(presumed_cases, na.rm = TRUE)), by = .(YEAR)], \n", + " by = \"YEAR\", all.x = TRUE)\n", + "}\n", + "\n", + "summary_tbl <- summary_tbl[order(YEAR)]\n", + "\n", + "# Explicitly list missing indicators so report is self-explanatory\n", + "expected_indicators <- c(\n", + " \"testing_rate\",\n", + " \"treatment_rate\",\n", + " \"case_fatality_rate\",\n", + " \"prop_adm_malaria\",\n", + " \"prop_malaria_deaths\",\n", + " \"non_malaria_all_cause_outpatients\",\n", + " \"presumed_cases\"\n", + ")\n", + "missing_indicators <- setdiff(expected_indicators, names(qoc))\n", + "if (length(missing_indicators) > 0) {\n", + " log_msg(glue::glue(\"[WARNING] 
Missing indicators in input file: {paste(missing_indicators, collapse=', ')}\"), level = \"warning\")\n", + " cat(glue::glue(\"\\nMissing indicators in this run: {paste(missing_indicators, collapse=', ')}\\n\"))\n", + " cat(\"Reason: required source columns are absent in the selected outliers file.\\n\")\n", + "}\n", + "\n", + "# Save summary data (parquet, csv, xlsx) - following other pipelines pattern\n", + "summary_parquet <- file.path(REPORT_OUTPUTS_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_summary.parquet\"))\n", + "summary_csv <- file.path(REPORT_OUTPUTS_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_summary.csv\"))\n", + "summary_xlsx <- file.path(REPORT_OUTPUTS_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_summary.xlsx\"))\n", + "\n", + "# Save as parquet (primary format, like other pipelines)\n", + "arrow::write_parquet(summary_tbl, summary_parquet)\n", + "\n", + "# Save as csv and xlsx for compatibility\n", + "data.table::fwrite(summary_tbl, summary_csv)\n", + "writexl::write_xlsx(list(summary = as.data.frame(summary_tbl)), summary_xlsx)\n", + "\n", + "log_msg(glue::glue(\"Summary data saved to: {summary_parquet}, {summary_csv}, {summary_xlsx}\"))\n", "\n", "knitr::kable(summary_tbl, caption = \"Quality of Care - Year-level summary\")\n", "\n", "cat(glue::glue(\"\\nLoaded file: {latest_file}\\n\"))\n", - "cat(glue::glue(\"Map outputs folder: {FIGURES_PATH}\\n\"))" + "cat(glue::glue(\"Map outputs folder: {FIGURES_PATH}\\n\"))\n", + "cat(glue::glue(\"Summary data saved to: {summary_parquet}, {summary_csv}, {summary_xlsx}\\n\"))" + ] + }, + { + "cell_type": "markdown", + "id": "3dc318ac", + "metadata": {}, + "source": [ + "## Graphs by Year" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e86bb0a", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Create bar charts by year (same as original notebook - 4x2 grid layout)\n", + "# Prepare data - convert rates to percentages\n", 
+ "plot_data <- copy(summary_tbl)\n", + "\n", + "# Create the same 4x2 subplot layout as original notebook\n", + "if (nrow(plot_data) > 0) {\n", + " # Create a list to store individual plots (in order: 4x2 grid)\n", + " plots_list <- list()\n", + " \n", + " # Row 0, Col 0: Testing rate\n", + " if (\"testing_rate\" %in% names(plot_data)) {\n", + " p <- ggplot(plot_data, aes(x = factor(YEAR), y = testing_rate * 100)) +\n", + " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", + " geom_text(aes(label = paste0(round(testing_rate * 100, 1), \"%\")), \n", + " vjust = -0.5, size = 2.5) +\n", + " labs(title = \"Testing rate (TEST / SUSP)\", x = \"Année\", y = \"%\") +\n", + " theme_minimal() +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\", size = 10),\n", + " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", + " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", + " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", + " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", + " plot.margin = margin(5, 5, 5, 5)\n", + " ) +\n", + " scale_y_continuous(expand = expansion(mult = c(0, 0.1)))\n", + " plots_list[[\"testing_rate\"]] <- p\n", + " }\n", + " \n", + " # Row 0, Col 1: Treatment rate\n", + " if (\"treatment_rate\" %in% names(plot_data)) {\n", + " p <- ggplot(plot_data, aes(x = factor(YEAR), y = treatment_rate * 100)) +\n", + " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", + " geom_text(aes(label = paste0(round(treatment_rate * 100, 1), \"%\")), \n", + " vjust = -0.5, size = 2.5) +\n", + " labs(title = \"Treatment rate (MALTREAT / CONF)\", x = \"Année\", y = \"%\") +\n", + " theme_minimal() +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\", size = 10),\n", + " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", + " panel.grid.major.y = 
element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", + " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", + " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", + " plot.margin = margin(5, 5, 5, 5)\n", + " ) +\n", + " scale_y_continuous(expand = expansion(mult = c(0, 0.1)))\n", + " plots_list[[\"treatment_rate\"]] <- p\n", + " }\n", + " \n", + " # Row 1, Col 0: Case fatality rate\n", + " if (\"case_fatality_rate\" %in% names(plot_data)) {\n", + " p <- ggplot(plot_data, aes(x = factor(YEAR), y = case_fatality_rate * 100)) +\n", + " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", + " geom_text(aes(label = paste0(round(case_fatality_rate * 100, 1), \"%\")), \n", + " vjust = -0.5, size = 2.5) +\n", + " labs(title = \"Case fatality rate (MALDTH / MALADM)\", x = \"Année\", y = \"%\") +\n", + " theme_minimal() +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\", size = 10),\n", + " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", + " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", + " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", + " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", + " plot.margin = margin(5, 5, 5, 5)\n", + " ) +\n", + " scale_y_continuous(expand = expansion(mult = c(0, 0.1)))\n", + " plots_list[[\"case_fatality_rate\"]] <- p\n", + " }\n", + " \n", + " # Row 1, Col 1: Proportion admissions malaria\n", + " if (\"prop_adm_malaria\" %in% names(plot_data)) {\n", + " p <- ggplot(plot_data, aes(x = factor(YEAR), y = prop_adm_malaria * 100)) +\n", + " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", + " geom_text(aes(label = paste0(round(prop_adm_malaria * 100, 1), \"%\")), \n", + " vjust = -0.5, size = 2.5) +\n", + " labs(title = \"Prop. 
admissions paludisme (MALADM / ALLADM)\", x = \"Année\", y = \"%\") +\n", + " theme_minimal() +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\", size = 10),\n", + " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", + " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", + " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", + " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", + " plot.margin = margin(5, 5, 5, 5)\n", + " ) +\n", + " scale_y_continuous(expand = expansion(mult = c(0, 0.1)))\n", + " plots_list[[\"prop_adm_malaria\"]] <- p\n", + " }\n", + " \n", + " # Row 2, Col 0: Proportion deaths malaria\n", + " if (\"prop_malaria_deaths\" %in% names(plot_data)) {\n", + " p <- ggplot(plot_data, aes(x = factor(YEAR), y = prop_malaria_deaths * 100)) +\n", + " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", + " geom_text(aes(label = paste0(round(prop_malaria_deaths * 100, 1), \"%\")), \n", + " vjust = -0.5, size = 2.5) +\n", + " labs(title = \"Prop. 
décès paludisme (MALDTH / ALLDTH)\", x = \"Année\", y = \"%\") +\n", + " theme_minimal() +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\", size = 10),\n", + " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", + " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", + " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", + " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", + " plot.margin = margin(5, 5, 5, 5)\n", + " ) +\n", + " scale_y_continuous(expand = expansion(mult = c(0, 0.1)))\n", + " plots_list[[\"prop_malaria_deaths\"]] <- p\n", + " }\n", + " \n", + " # Row 2, Col 1: Presumed cases (absolute)\n", + " if (\"presumed_cases\" %in% names(plot_data)) {\n", + " format_label <- function(v) {\n", + " ifelse(is.na(v) | v == 0, \"0\",\n", + " ifelse(v >= 1e6, paste0(round(v/1e6, 2), \"M\"),\n", + " format(round(v), big.mark = \" \", scientific = FALSE)\n", + " )\n", + " )\n", + " }\n", + " p <- ggplot(plot_data, aes(x = factor(YEAR), y = presumed_cases)) +\n", + " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", + " geom_text(aes(label = format_label(presumed_cases)), \n", + " vjust = -0.5, size = 2.5) +\n", + " labs(title = \"Cas présumés (PRES)\", x = \"Année\", y = \"Nombre\") +\n", + " theme_minimal() +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\", size = 10),\n", + " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", + " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", + " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", + " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", + " plot.margin = margin(5, 5, 5, 5)\n", + " ) +\n", + " scale_y_continuous(labels = scales::comma, expand = expansion(mult = c(0, 0.1)))\n", + " plots_list[[\"presumed_cases\"]] <- p\n", + " }\n", + " \n", + " # Row 
3, Col 0: Non-malaria all-cause outpatients (absolute)\n", + " if (\"non_malaria_all_cause_outpatients\" %in% names(plot_data)) {\n", + " format_label <- function(v) {\n", + " ifelse(is.na(v) | v == 0, \"0\",\n", + " ifelse(v >= 1e6, paste0(round(v/1e6, 2), \"M\"),\n", + " format(round(v), big.mark = \" \", scientific = FALSE)\n", + " )\n", + " )\n", + " }\n", + " p <- ggplot(plot_data, aes(x = factor(YEAR), y = non_malaria_all_cause_outpatients)) +\n", + " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", + " geom_text(aes(label = format_label(non_malaria_all_cause_outpatients)), \n", + " vjust = -0.5, size = 2.5) +\n", + " labs(title = \"Consultations externes non-paludisme (ALLOUT)\", x = \"Année\", y = \"Nombre\") +\n", + " theme_minimal() +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\", size = 10),\n", + " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", + " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", + " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", + " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", + " plot.margin = margin(5, 5, 5, 5)\n", + " ) +\n", + " scale_y_continuous(labels = scales::comma, expand = expansion(mult = c(0, 0.1)))\n", + " plots_list[[\"non_malaria_all_cause_outpatients\"]] <- p\n", + " }\n", + " \n", + " # Create and display combined plot (dynamic grid for readability)\n", + " if (length(plots_list) > 0) {\n", + " # Order plots as in original\n", + " plot_order <- c(\"testing_rate\", \"treatment_rate\", \"case_fatality_rate\", \"prop_adm_malaria\", \n", + " \"prop_malaria_deaths\", \"presumed_cases\", \"non_malaria_all_cause_outpatients\")\n", + " available_plots <- plots_list[intersect(plot_order, names(plots_list))]\n", + "\n", + " if (length(available_plots) > 0) {\n", + " n_plots <- length(available_plots)\n", + " ncol_layout <- 2\n", + " nrow_layout <- 
ceiling(n_plots / ncol_layout)\n", + "\n", + " # Bigger display in report so labels are readable\n", + " options(repr.plot.width = 14, repr.plot.height = max(7, 4.8 * nrow_layout))\n", + "\n", + " combined_plot <- do.call(grid.arrange, c(available_plots, ncol = ncol_layout, nrow = nrow_layout))\n", + " print(combined_plot)\n", + "\n", + " # Save at larger size for presentation readability\n", + " combined_file <- file.path(FIGURES_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_by_year.png\"))\n", + " ggsave(\n", + " combined_file,\n", + " plot = combined_plot,\n", + " width = 18,\n", + " height = max(8, 5.2 * nrow_layout),\n", + " dpi = 300,\n", + " bg = \"white\",\n", + " units = \"in\"\n", + " )\n", + " log_msg(glue::glue(\"Combined bar charts saved: {combined_file}\"))\n", + " }\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "3b625d36", + "metadata": {}, + "source": [ + "## Maps by District and Year\n", + "\n", + "Maps are generated directly from the quality-of-care data and district shapes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6056a979", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Load shapes geojson from dataset (like seasonality pipeline)\n", + "DHIS2_FORMATTED_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "\n", + "shapes <- tryCatch({\n", + " get_latest_dataset_file_in_memory(DHIS2_FORMATTED_DATASET, paste0(COUNTRY_CODE, \"_shapes.geojson\"))\n", + "}, error = function(e) {\n", + " msg <- paste0(\"Error while loading DHIS2 Shapes data for: \", COUNTRY_CODE, \". 
\", conditionMessage(e))\n", + " log_msg(msg, level = \"error\")\n", + " stop(msg)\n", + "})\n", + "\n", + "# Ensure ADM2_ID is character in both datasets\n", + "shapes$ADM2_ID <- as.character(shapes$ADM2_ID)\n", + "qoc$ADM2_ID <- as.character(qoc$ADM2_ID)\n", + "\n", + "# Merge shapes with quality-of-care data\n", + "qoc_sf <- shapes %>%\n", + " dplyr::left_join(qoc, by = \"ADM2_ID\")\n", + "\n", + "# Helper to build readable interval labels for legends\n", + "format_interval_labels <- function(breaks_vec) {\n", + " labels <- c()\n", + " for (i in seq_len(length(breaks_vec) - 1)) {\n", + " a <- breaks_vec[i]\n", + " b <- breaks_vec[i + 1]\n", + " labels <- c(labels, paste0(scales::comma(round(a)), \" - \", scales::comma(round(b))))\n", + " }\n", + " labels\n", + "}\n", + "\n", + "# Function to plot yearly maps (similar to code notebook but inline in report)\n", + "plot_yearly_map_report <- function(sf_data, value_col, title_prefix, is_rate = TRUE) {\n", + " if (!(value_col %in% names(sf_data))) {\n", + " log_msg(glue::glue(\"[WARNING] Column '{value_col}' not found. Skipping map generation.\"), level = \"warning\")\n", + " return(invisible(NULL))\n", + " }\n", + " \n", + " years <- sort(unique(sf_data$YEAR[!is.na(sf_data$YEAR)]))\n", + " if (length(years) == 0) {\n", + " log_msg(glue::glue(\"[WARNING] No valid years for '{value_col}'. 
Skipping map.\"), level = \"warning\")\n", + " return(invisible(NULL))\n", + " }\n", + " \n", + " # Create plots for each year\n", + " plot_list <- list()\n", + " base_shapes <- sf_data %>% dplyr::select(ADM2_ID, geometry) %>% dplyr::distinct()\n", + "\n", + " for (yr in years) {\n", + " # Keep all districts on map, then join year values\n", + " year_vals <- sf_data[sf_data$YEAR == yr, c(\"ADM2_ID\", value_col), drop = FALSE]\n", + " year_vals <- sf::st_drop_geometry(year_vals)\n", + " year_vals <- year_vals[!duplicated(year_vals$ADM2_ID), , drop = FALSE]\n", + " sf_y <- dplyr::left_join(base_shapes, year_vals, by = \"ADM2_ID\")\n", + "\n", + " vals <- sf_y[[value_col]]\n", + " finite_vals <- vals[is.finite(vals) & !is.na(vals)]\n", + "\n", + " if (length(finite_vals) == 0) {\n", + " next\n", + " }\n", + "\n", + " # Create categories\n", + " if (is_rate) {\n", + " cat_vals <- cut(\n", + " vals,\n", + " breaks = c(-Inf, 0, 0.2, 0.4, 0.6, 0.8, 1.0, Inf),\n", + " labels = c(\"<0\", \"0-0.2\", \"0.2-0.4\", \"0.4-0.6\", \"0.6-0.8\", \"0.8-1.0\", \">1.0\"),\n", + " include.lowest = TRUE\n", + " )\n", + " fill_palette <- \"YlOrRd\"\n", + " } else {\n", + " # Use readable fixed-count classes for absolute values\n", + " n_classes <- 5\n", + " br <- unique(as.numeric(quantile(finite_vals, probs = seq(0, 1, length.out = n_classes + 1), na.rm = TRUE)))\n", + " br <- sort(br)\n", + " if (length(br) < 2) {\n", + " br <- c(min(finite_vals, na.rm = TRUE), max(finite_vals, na.rm = TRUE) + 1)\n", + " }\n", + " if (length(unique(br)) < 2) {\n", + " cat_vals <- as.factor(rep(\"single value\", nrow(sf_y)))\n", + " } else {\n", + " labels_abs <- format_interval_labels(br)\n", + " cat_vals <- cut(vals, breaks = br, include.lowest = TRUE, labels = labels_abs)\n", + " }\n", + " fill_palette <- \"Blues\"\n", + " }\n", + "\n", + " sf_y$cat <- as.factor(cat_vals)\n", + "\n", + " p <- ggplot(sf_y) +\n", + " geom_sf(aes(fill = cat), color = \"grey60\", size = 0.12) +\n", + " 
scale_fill_brewer(palette = fill_palette, na.value = \"#f3f4f6\", drop = FALSE) +\n", + " theme_void() +\n", + " labs(\n", + " title = paste0(title_prefix, \" - \", yr),\n", + " fill = ifelse(is_rate, \"Rate class\", \"Value class\")\n", + " ) +\n", + " guides(fill = guide_legend(nrow = 2, byrow = TRUE)) +\n", + " theme(\n", + " legend.position = \"bottom\",\n", + " legend.text = element_text(size = 9),\n", + " legend.title = element_text(size = 10, face = \"bold\"),\n", + " plot.title = element_text(face = \"bold\", size = 13)\n", + " )\n", + "\n", + " plot_list[[as.character(yr)]] <- p\n", + " }\n", + " \n", + " # Display all plots\n", + " if (length(plot_list) > 0) {\n", + " options(repr.plot.width = 10, repr.plot.height = 8)\n", + " for (yr_name in names(plot_list)) {\n", + " print(plot_list[[yr_name]])\n", + " }\n", + " }\n", + "}\n", + "\n", + "# Generate maps for each available indicator\n", + "cat(\"### Testing Rate\\n\")\n", + "if (\"testing_rate\" %in% names(qoc_sf)) {\n", + " plot_yearly_map_report(qoc_sf, \"testing_rate\", \"Testing rate (TEST / SUSP)\", TRUE)\n", + "}\n", + "\n", + "cat(\"\\n### Treatment Rate\\n\")\n", + "if (\"treatment_rate\" %in% names(qoc_sf)) {\n", + " plot_yearly_map_report(qoc_sf, \"treatment_rate\", \"Treatment rate (MALTREAT / CONF)\", TRUE)\n", + "}\n", + "\n", + "cat(\"\\n### Case Fatality Rate\\n\")\n", + "if (\"case_fatality_rate\" %in% names(qoc_sf)) {\n", + " plot_yearly_map_report(qoc_sf, \"case_fatality_rate\", \"In-hospital case fatality rate (MALDTH / MALADM)\", TRUE)\n", + "}\n", + "\n", + "cat(\"\\n### Proportion Admissions Malaria\\n\")\n", + "if (\"prop_adm_malaria\" %in% names(qoc_sf)) {\n", + " plot_yearly_map_report(qoc_sf, \"prop_adm_malaria\", \"Proportion admitted for malaria (MALADM / ALLADM)\", TRUE)\n", + "}\n", + "\n", + "cat(\"\\n### Proportion Malaria Deaths\\n\")\n", + "if (\"prop_malaria_deaths\" %in% names(qoc_sf)) {\n", + " plot_yearly_map_report(qoc_sf, \"prop_malaria_deaths\", \"Proportion of 
malaria deaths (MALDTH / ALLDTH)\", TRUE)\n", + "}\n", + "\n", + "cat(\"\\n### Non-malaria All-cause Outpatients\\n\")\n", + "if (\"non_malaria_all_cause_outpatients\" %in% names(qoc_sf)) {\n", + " plot_yearly_map_report(qoc_sf, \"non_malaria_all_cause_outpatients\", \"Non-malaria all-cause outpatients (ALLOUT)\", FALSE)\n", + "}\n", + "\n", + "cat(\"\\n### Presumed Cases\\n\")\n", + "if (\"presumed_cases\" %in% names(qoc_sf)) {\n", + " plot_yearly_map_report(qoc_sf, \"presumed_cases\", \"Presumed cases (PRES)\", FALSE)\n", + "}" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b31e4c8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "8229c37e", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07324c1c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "7c084da7", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9f52975", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "006866ce", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "f7225165", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "420ed27f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67ddb838", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/snt_dhis2_quality_of_care/pipeline.py b/snt_dhis2_quality_of_care/pipeline.py index fe07855..147cc85 100644 --- 
a/snt_dhis2_quality_of_care/pipeline.py +++ b/snt_dhis2_quality_of_care/pipeline.py @@ -13,15 +13,6 @@ @pipeline("snt_dhis2_quality_of_care") -@parameter( - "outlier_imputation_method", - name="Outlier imputation method", - help="Choose which outlier detection/imputation method to use.", - type=str, - choices=["mean", "median", "iqr", "trend", "mg-partial", "mg-complete"], - default="mean", - required=True, -) @parameter( "data_action", name="Data action", @@ -48,7 +39,6 @@ required=False, ) def snt_dhis2_quality_of_care( - outlier_imputation_method: str, data_action: str, run_report_only: bool, pull_scripts: bool, @@ -75,7 +65,6 @@ def snt_dhis2_quality_of_care( country_code = snt_config["SNT_CONFIG"]["COUNTRY_CODE"] nb_parameters = { - "outlier_imputation_method": outlier_imputation_method, "data_action": data_action, } @@ -96,15 +85,24 @@ def snt_dhis2_quality_of_care( country_code=country_code, ) + files_to_dataset = [ + data_path / f"{country_code}_quality_of_care_{data_action}.parquet", + data_path / f"{country_code}_quality_of_care_{data_action}.csv", + data_path / f"{country_code}_quality_of_care_district_year_{data_action}.parquet", + data_path / f"{country_code}_quality_of_care_district_year_{data_action}.csv", + data_path / f"{country_code}_quality_of_care_year_summary_{data_action}.parquet", + data_path / f"{country_code}_quality_of_care_year_summary_{data_action}.csv", + parameters_file, + ] + existing_files = [f for f in files_to_dataset if f.exists()] + missing_files = [f for f in files_to_dataset if not f.exists()] + for missing in missing_files: + current_run.log_warning(f"Output file not found, skipped for dataset upload: {missing}") + add_files_to_dataset( dataset_id=snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_QUALITY_OF_CARE"], country_code=country_code, - file_paths=[ - data_path / f"{country_code}_quality_of_care_{outlier_imputation_method}_{data_action}.parquet", - data_path / 
f"{country_code}_quality_of_care_{outlier_imputation_method}_{data_action}.csv", - data_path / f"{country_code}_quality_of_care_{outlier_imputation_method}_{data_action}.xlsx", - parameters_file, - ], + file_paths=existing_files, ) else: current_run.log_info("Skipping computations, running only reporting notebook.") From 64d2c8f4c3392da0c879f16dda28a97d4b6b126a Mon Sep 17 00:00:00 2001 From: claude-marie Date: Wed, 11 Mar 2026 17:08:46 +0100 Subject: [PATCH 6/7] Refactor SNT Quality of Care outputs to focus on district-year data. Updated file naming conventions and logging to reflect changes in output handling, removing unnecessary summary outputs. Adjusted reporting notebook to align with new output structure. --- .../code/snt_dhis2_quality_of_care.ipynb | 47 ++----------------- .../snt_dhis2_quality_of_care_report.ipynb | 4 +- snt_dhis2_quality_of_care/pipeline.py | 8 +--- 3 files changed, 8 insertions(+), 51 deletions(-) diff --git a/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb b/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb index 210dd16..accb2d3 100644 --- a/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb +++ b/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb @@ -218,53 +218,14 @@ " qoc <- merge(qoc, unique(shapes_dt[, .(ADM2_ID, ADM2_NAME)]), by = \"ADM2_ID\", all.x = TRUE)\n", "}\n", "\n", - "# Main district-year outputs\n", - "out_parquet <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_{data_action}.parquet\"))\n", - "out_csv <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_{data_action}.csv\"))\n", - "out_xlsx <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_{data_action}.xlsx\"))\n", + "# Persist only district-year outputs (requested)\n", + "out_district_parquet <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_district_year_imputed.parquet\"))\n", + 
"out_district_csv <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_district_year_imputed.csv\"))\n", "\n", - "# Explicit district-year naming (requested format style)\n", - "out_district_parquet <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_district_year_{data_action}.parquet\"))\n", - "out_district_csv <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_district_year_{data_action}.csv\"))\n", - "\n", - "# Build a compact year-level summary from computed indicators\n", - "summary_tbl <- unique(qoc[, .(YEAR)])\n", - "if (\"testing_rate\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, qoc[, .(testing_rate = mean(testing_rate, na.rm = TRUE)), by = .(YEAR)], by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "if (\"treatment_rate\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, qoc[, .(treatment_rate = mean(treatment_rate, na.rm = TRUE)), by = .(YEAR)], by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "if (\"case_fatality_rate\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, qoc[, .(case_fatality_rate = mean(case_fatality_rate, na.rm = TRUE)), by = .(YEAR)], by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "if (\"prop_adm_malaria\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, qoc[, .(prop_adm_malaria = mean(prop_adm_malaria, na.rm = TRUE)), by = .(YEAR)], by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "if (\"prop_malaria_deaths\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, qoc[, .(prop_malaria_deaths = mean(prop_malaria_deaths, na.rm = TRUE)), by = .(YEAR)], by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "if (\"non_malaria_all_cause_outpatients\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, qoc[, .(non_malaria_all_cause_outpatients = sum(non_malaria_all_cause_outpatients, na.rm = TRUE)), by = .(YEAR)], by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "if (\"presumed_cases\" %in% names(qoc)) {\n", - " summary_tbl <- 
merge(summary_tbl, qoc[, .(presumed_cases = sum(presumed_cases, na.rm = TRUE)), by = .(YEAR)], by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "summary_tbl <- summary_tbl[order(YEAR)]\n", - "\n", - "out_summary_parquet <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_year_summary_{data_action}.parquet\"))\n", - "out_summary_csv <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_year_summary_{data_action}.csv\"))\n", - "\n", - "# Persist all outputs\n", - "arrow::write_parquet(qoc, out_parquet)\n", - "data.table::fwrite(qoc, out_csv)\n", - "writexl::write_xlsx(list(quality_of_care = as.data.frame(qoc)), out_xlsx)\n", "arrow::write_parquet(qoc, out_district_parquet)\n", "data.table::fwrite(qoc, out_district_csv)\n", - "arrow::write_parquet(summary_tbl, out_summary_parquet)\n", - "data.table::fwrite(summary_tbl, out_summary_csv)\n", "\n", - "log_msg(glue::glue(\"Saved outputs: {out_parquet}, {out_csv}, {out_xlsx}, {out_district_parquet}, {out_district_csv}, {out_summary_parquet}, {out_summary_csv}\"))" + "log_msg(glue::glue(\"Saved outputs: {out_district_parquet}, {out_district_csv}\"))" ] }, { diff --git a/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb b/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb index 12e3616..4021110 100644 --- a/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb +++ b/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb @@ -53,8 +53,8 @@ }, "outputs": [], "source": [ - "# Use only the main district-year file for the selected run (exclude summary side files)\n", - "files <- list.files(DATA_PATH, pattern = paste0(\"^\", COUNTRY_CODE, \"_quality_of_care_(imputed|removed)\\\\.parquet$\"), full.names = TRUE)\n", + "# Use district-year output file\n", + "files <- list.files(DATA_PATH, pattern = paste0(\"^\", COUNTRY_CODE, 
\"_quality_of_care_district_year_imputed\\\\.parquet$\"), full.names = TRUE)\n", "if (length(files) == 0) {\n", " stop(glue::glue(\"No quality_of_care parquet found in {DATA_PATH}\"))\n", "}\n", diff --git a/snt_dhis2_quality_of_care/pipeline.py b/snt_dhis2_quality_of_care/pipeline.py index 147cc85..b20f805 100644 --- a/snt_dhis2_quality_of_care/pipeline.py +++ b/snt_dhis2_quality_of_care/pipeline.py @@ -86,12 +86,8 @@ def snt_dhis2_quality_of_care( ) files_to_dataset = [ - data_path / f"{country_code}_quality_of_care_{data_action}.parquet", - data_path / f"{country_code}_quality_of_care_{data_action}.csv", - data_path / f"{country_code}_quality_of_care_district_year_{data_action}.parquet", - data_path / f"{country_code}_quality_of_care_district_year_{data_action}.csv", - data_path / f"{country_code}_quality_of_care_year_summary_{data_action}.parquet", - data_path / f"{country_code}_quality_of_care_year_summary_{data_action}.csv", + data_path / f"{country_code}_quality_of_care_district_year_imputed.parquet", + data_path / f"{country_code}_quality_of_care_district_year_imputed.csv", parameters_file, ] existing_files = [f for f in files_to_dataset if f.exists()] From d079e3e85e3f589e0ff0664711cdab1b609bd827 Mon Sep 17 00:00:00 2001 From: claude-marie Date: Thu, 12 Mar 2026 09:13:36 +0100 Subject: [PATCH 7/7] last push --- .../code/snt_dhis2_quality_of_care.ipynb | 850 +++++------ .../snt_dhis2_quality_of_care_report.ipynb | 1286 ++++++++--------- snt_dhis2_quality_of_care/pipeline.py | 17 +- 3 files changed, 1078 insertions(+), 1075 deletions(-) diff --git a/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb b/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb index accb2d3..3d650ea 100644 --- a/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb +++ b/pipelines/snt_dhis2_quality_of_care/code/snt_dhis2_quality_of_care.ipynb @@ -1,430 +1,430 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": 
"fad6c24e", - "metadata": {}, - "source": [ - "## Quality of Care Indicators\n", - "\n", - "Compute district-year quality-of-care indicators from DHIS2 outliers-imputed routine data.\n", - "\n", - "Indicators:\n", - "- testing_rate = TEST / SUSP\n", - "- treatment_rate = MALTREAT / CONF\n", - "- case_fatality_rate = MALDTH / MALADM\n", - "- prop_adm_malaria = MALADM / ALLADM\n", - "- prop_malaria_deaths = MALDTH / ALLDTH\n", - "- non_malaria_all_cause_outpatients = ALLOUT (absolute)\n", - "- presumed_cases = PRES (absolute)\n", - "\n", - "Stock-out indicators are not implemented yet (on hold, NMDR data pending)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "317c4085", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Preliminaries\n", - "options(scipen=999)\n", - "\n", - "ROOT_PATH <- \"~/workspace\"\n", - "CONFIG_PATH <- file.path(ROOT_PATH, \"configuration\")\n", - "CODE_PATH <- file.path(ROOT_PATH, \"code\")\n", - "DATA_PATH <- file.path(ROOT_PATH, \"data\")\n", - "OUTPUT_DATA_PATH <- file.path(DATA_PATH, \"dhis2\", \"quality_of_care\")\n", - "FIGURES_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_quality_of_care\", \"reporting\", \"outputs\", \"figures\")\n", - "\n", - "dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)\n", - "dir.create(FIGURES_PATH, recursive = TRUE, showWarnings = FALSE)\n", - "\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "required_packages <- c(\"jsonlite\", \"data.table\", \"arrow\", \"sf\", \"ggplot2\", \"glue\", \"reticulate\", \"RColorBrewer\", \"writexl\", \"dplyr\")\n", - "install_and_load(required_packages)\n", - "\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "openhexa <- reticulate::import(\"openhexa.sdk\")\n", - "\n", - "config_json <- jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", - "DHIS2_FORMATTED_DATASET <- 
config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "OUTLIERS_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "98b78bf7", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Validate data_action parameter\n", - "if (!exists(\"data_action\")) {\n", - " data_action <- \"imputed\"\n", - "}\n", - "\n", - "allowed_actions <- c(\"imputed\", \"removed\")\n", - "if (!(data_action %in% allowed_actions)) {\n", - " stop(glue::glue(\"Invalid data_action: {data_action}. Allowed: {paste(allowed_actions, collapse=', ')}\"))\n", - "}\n", - "\n", - "# Automatically find the latest routine outliers-imputed file in the dataset\n", - "# Pattern: {COUNTRY_CODE}_routine_outliers-*_{data_action}.parquet\n", - "log_msg(glue::glue(\"Searching for latest routine outliers-imputed file in dataset (data_action: {data_action})...\"))\n", - "\n", - "dataset_last_version <- openhexa$workspace$get_dataset(OUTLIERS_DATASET)$latest_version\n", - "if (is.null(dataset_last_version)) {\n", - " stop(glue::glue(\"[ERROR] No version available in dataset `{OUTLIERS_DATASET}`. 
Process stopped.\"))\n", - "}\n", - "\n", - "# Pattern to match: {COUNTRY_CODE}_routine_outliers-*_{data_action}.parquet\n", - "pattern_prefix <- glue::glue(\"{COUNTRY_CODE}_routine_outliers-\")\n", - "pattern_suffix <- glue::glue(\"_{data_action}.parquet\")\n", - "routine_filename <- NULL\n", - "files_list <- reticulate::iterate(dataset_last_version$files)\n", - "\n", - "# Find all matching files and select the latest one\n", - "matching_files <- c()\n", - "for (file in files_list) {\n", - " filename <- file$filename\n", - " if (startsWith(filename, pattern_prefix) && endsWith(filename, pattern_suffix)) {\n", - " matching_files <- c(matching_files, filename)\n", - " }\n", - "}\n", - "\n", - "if (length(matching_files) == 0) {\n", - " stop(glue::glue(\"[ERROR] No file matching pattern `{pattern_prefix}*{pattern_suffix}` found in dataset `{OUTLIERS_DATASET}`. \",\n", - " \"Please run an outlier imputation pipeline first (e.g., snt_dhis2_outliers_imputation_mean) with `data_action=\\\"{data_action}\\\"`.\"))\n", - "}\n", - "\n", - "# Select the latest file (alphabetically sorted, which should correspond to most recent method)\n", - "routine_filename <- sort(matching_files, decreasing = TRUE)[1]\n", - "\n", - "log_msg(glue::glue(\"Found {length(matching_files)} matching file(s). Using latest: {routine_filename}\"))\n", - "\n", - "# Load the routine file\n", - "routine <- tryCatch({\n", - " get_latest_dataset_file_in_memory(OUTLIERS_DATASET, routine_filename)\n", - "}, error = function(e) {\n", - " msg <- paste0(\"[ERROR] 🛑 Error while loading DHIS2 routine data file `\", routine_filename, \n", - " \"` from `\", OUTLIERS_DATASET, \"`. 
[ERROR DETAILS] \", conditionMessage(e))\n", - " stop(msg)\n", - "})\n", - "\n", - "shapes <- get_latest_dataset_file_in_memory(DHIS2_FORMATTED_DATASET, paste0(COUNTRY_CODE, \"_shapes.geojson\"))\n", - "\n", - "setDT(routine)\n", - "\n", - "# Core required columns (must exist)\n", - "core_cols <- c(\"ADM2_ID\", \"YEAR\")\n", - "core_missing <- setdiff(core_cols, names(routine))\n", - "if (length(core_missing) > 0) {\n", - " stop(glue::glue(\"Missing core required columns in routine data: {paste(core_missing, collapse=', ')}\"))\n", - "}\n", - "\n", - "# Optional indicator columns (will be checked and handled gracefully)\n", - "indicator_cols <- c(\"TEST\", \"SUSP\", \"MALTREAT\", \"CONF\", \"MALDTH\", \"MALADM\", \"ALLADM\", \"ALLDTH\", \"ALLOUT\", \"PRES\")\n", - "available_cols <- intersect(indicator_cols, names(routine))\n", - "missing_cols <- setdiff(indicator_cols, names(routine))\n", - "\n", - "if (length(missing_cols) > 0) {\n", - " log_msg(glue::glue(\"[WARNING] Some indicator columns are missing: {paste(missing_cols, collapse=', ')}. 
These indicators will not be calculated.\"), level = \"warning\")\n", - "}\n", - "\n", - "# Convert available numeric columns\n", - "# Handle \"-\" and other non-numeric values by converting them to NA first\n", - "num_cols <- intersect(available_cols, names(routine))\n", - "if (length(num_cols) > 0) {\n", - " for (col in num_cols) {\n", - " # First convert to character to handle \"-\" strings, then replace with NA, then convert to numeric\n", - " col_vals <- as.character(routine[[col]])\n", - " col_vals[is.na(col_vals) | col_vals == \"\" | col_vals == \"-\"] <- NA_character_\n", - " routine[, (col) := as.numeric(col_vals)]\n", - " }\n", - "}\n", - "routine[, YEAR := as.integer(YEAR)]\n", - "routine[, ADM2_ID := as.character(ADM2_ID)]\n", - "\n", - "# Aggregate available columns only using lapply\n", - "if (length(available_cols) > 0) {\n", - " qoc <- routine[, lapply(.SD, function(x) sum(x, na.rm = TRUE)), \n", - " .SDcols = available_cols, \n", - " by = .(ADM2_ID, YEAR)]\n", - "} else {\n", - " # If no indicator columns available, create empty structure\n", - " qoc <- routine[, .(ADM2_ID, YEAR)]\n", - " qoc <- unique(qoc)\n", - "}\n", - "\n", - "# Calculate indicators only if required columns are available\n", - "if (\"TEST\" %in% names(qoc) && \"SUSP\" %in% names(qoc)) {\n", - " qoc[, testing_rate := fifelse(SUSP > 0, TEST / SUSP, NA_real_)]\n", - "} else {\n", - " log_msg(\"[WARNING] Cannot calculate testing_rate: missing TEST or SUSP columns\", level = \"warning\")\n", - "}\n", - "\n", - "if (\"MALTREAT\" %in% names(qoc) && \"CONF\" %in% names(qoc)) {\n", - " qoc[, treatment_rate := fifelse(CONF > 0, MALTREAT / CONF, NA_real_)]\n", - "} else {\n", - " log_msg(\"[WARNING] Cannot calculate treatment_rate: missing MALTREAT or CONF columns\", level = \"warning\")\n", - "}\n", - "\n", - "if (\"MALDTH\" %in% names(qoc) && \"MALADM\" %in% names(qoc)) {\n", - " qoc[, case_fatality_rate := fifelse(MALADM > 0, MALDTH / MALADM, NA_real_)]\n", - "} else {\n", - " 
log_msg(\"[WARNING] Cannot calculate case_fatality_rate: missing MALDTH or MALADM columns\", level = \"warning\")\n", - "}\n", - "\n", - "if (\"MALADM\" %in% names(qoc) && \"ALLADM\" %in% names(qoc)) {\n", - " qoc[, prop_adm_malaria := fifelse(ALLADM > 0, MALADM / ALLADM, NA_real_)]\n", - "} else {\n", - " log_msg(\"[WARNING] Cannot calculate prop_adm_malaria: missing MALADM or ALLADM columns\", level = \"warning\")\n", - "}\n", - "\n", - "if (\"MALDTH\" %in% names(qoc) && \"ALLDTH\" %in% names(qoc)) {\n", - " qoc[, prop_malaria_deaths := fifelse(ALLDTH > 0, MALDTH / ALLDTH, NA_real_)]\n", - " # Compatibility alias to match historical notebook export naming\n", - " qoc[, prop_deaths_malaria := prop_malaria_deaths]\n", - "} else {\n", - " log_msg(\"[WARNING] Cannot calculate prop_malaria_deaths: missing MALDTH or ALLDTH columns\", level = \"warning\")\n", - "}\n", - "\n", - "if (\"ALLOUT\" %in% names(qoc)) {\n", - " qoc[, non_malaria_all_cause_outpatients := ALLOUT]\n", - "} else {\n", - " log_msg(\"[WARNING] Cannot calculate non_malaria_all_cause_outpatients: missing ALLOUT column\", level = \"warning\")\n", - "}\n", - "\n", - "if (\"PRES\" %in% names(qoc)) {\n", - " qoc[, presumed_cases := PRES]\n", - "} else {\n", - " log_msg(\"[WARNING] Cannot calculate presumed_cases: missing PRES column\", level = \"warning\")\n", - "}\n", - "\n", - "shapes_dt <- as.data.table(sf::st_drop_geometry(shapes))\n", - "if (\"ADM2_ID\" %in% names(shapes_dt) && \"ADM2_NAME\" %in% names(shapes_dt)) {\n", - " shapes_dt[, ADM2_ID := as.character(ADM2_ID)]\n", - " qoc <- merge(qoc, unique(shapes_dt[, .(ADM2_ID, ADM2_NAME)]), by = \"ADM2_ID\", all.x = TRUE)\n", - "}\n", - "\n", - "# Persist only district-year outputs (requested)\n", - "out_district_parquet <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_district_year_imputed.parquet\"))\n", - "out_district_csv <- file.path(OUTPUT_DATA_PATH, 
glue::glue(\"{COUNTRY_CODE}_quality_of_care_district_year_imputed.csv\"))\n", - "\n", - "arrow::write_parquet(qoc, out_district_parquet)\n", - "data.table::fwrite(qoc, out_district_csv)\n", - "\n", - "log_msg(glue::glue(\"Saved outputs: {out_district_parquet}, {out_district_csv}\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "984689b0", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Yearly maps by ADM2\n", - "# Ensure ADM2_ID is character in both objects (do this once before the function)\n", - "shapes$ADM2_ID <- as.character(shapes$ADM2_ID)\n", - "qoc$ADM2_ID <- as.character(qoc$ADM2_ID)\n", - "\n", - "plot_yearly_map <- function(df, sf_shapes, value_col, title_prefix, filename_prefix, is_rate = TRUE) {\n", - " # Check if value_col exists in df\n", - " if (!(value_col %in% names(df))) {\n", - " log_msg(glue::glue(\"[WARNING] Column '{value_col}' not found in data. Skipping map generation.\"), level = \"warning\")\n", - " return(invisible(NULL))\n", - " }\n", - " \n", - " # Create a local copy of sf_shapes to avoid modifying the original\n", - " sf_shapes_local <- sf_shapes\n", - " if (!is.character(sf_shapes_local$ADM2_ID)) {\n", - " sf_shapes_local$ADM2_ID <- as.character(sf_shapes_local$ADM2_ID)\n", - " }\n", - " \n", - " years <- sort(unique(df$YEAR))\n", - " for (yr in years) {\n", - " df_y <- df[YEAR == yr]\n", - " \n", - " # Check if df_y has any rows\n", - " if (nrow(df_y) == 0) {\n", - " log_msg(glue::glue(\"[WARNING] No data for '{value_col}' in year {yr}. 
Skipping map.\"), level = \"warning\")\n", - " next\n", - " }\n", - " \n", - " # Ensure ADM2_ID is character in df_y\n", - " df_y$ADM2_ID <- as.character(df_y$ADM2_ID)\n", - " \n", - " # Use dplyr::left_join for sf objects to preserve geometry (use local copy)\n", - " map_df <- dplyr::left_join(sf_shapes_local, df_y, by = \"ADM2_ID\")\n", - "\n", - " # Check if value_col exists in map_df after merge\n", - " if (!(value_col %in% names(map_df))) {\n", - " log_msg(glue::glue(\"[WARNING] Column '{value_col}' not found after merge for year {yr}. Skipping map.\"), level = \"warning\")\n", - " next\n", - " }\n", - "\n", - " vals <- map_df[[value_col]]\n", - " finite_vals <- vals[is.finite(vals) & !is.na(vals)]\n", - " \n", - " # If no valid values, skip this map\n", - " if (length(finite_vals) == 0) {\n", - " log_msg(glue::glue(\"[WARNING] No valid values for '{value_col}' in year {yr}. Skipping map.\"), level = \"warning\")\n", - " next\n", - " }\n", - "\n", - " # Create cat column BEFORE creating the plot\n", - " cat_vals <- NULL\n", - " fill_palette <- NULL\n", - " \n", - " if (is_rate) {\n", - " # Create cat column with proper handling of NA values\n", - " cat_result <- tryCatch({\n", - " cat_vals <- cut(\n", - " vals,\n", - " breaks = c(-Inf, 0, 0.2, 0.4, 0.6, 0.8, 1.0, Inf),\n", - " labels = c(\"<0\", \"0-0.2\", \"0.2-0.4\", \"0.4-0.6\", \"0.6-0.8\", \"0.8-1.0\", \">1.0\"),\n", - " include.lowest = TRUE\n", - " )\n", - " fill_palette <- \"YlOrRd\"\n", - " TRUE # Success\n", - " }, error = function(e) {\n", - " log_msg(glue::glue(\"[WARNING] Failed to create categories for '{value_col}' year {yr}: {conditionMessage(e)}\"), level = \"warning\")\n", - " FALSE # Failure\n", - " })\n", - " if (!cat_result) {\n", - " next\n", - " }\n", - " } else {\n", - " cat_result <- tryCatch({\n", - " if (length(finite_vals) > 4) {\n", - " br <- unique(as.numeric(quantile(finite_vals, probs = seq(0, 1, 0.2), na.rm = TRUE)))\n", - " if (length(br) < 2) {\n", - " cat_vals <- 
as.factor(rep(\"all\", nrow(map_df)))\n", - " } else {\n", - " cat_vals <- cut(vals, breaks = br, include.lowest = TRUE)\n", - " }\n", - " } else {\n", - " cat_vals <- as.factor(vals)\n", - " }\n", - " fill_palette <- \"Blues\"\n", - " TRUE # Success\n", - " }, error = function(e) {\n", - " log_msg(glue::glue(\"[WARNING] Failed to create categories for '{value_col}' year {yr}: {conditionMessage(e)}\"), level = \"warning\")\n", - " FALSE # Failure\n", - " })\n", - " if (!cat_result) {\n", - " next\n", - " }\n", - " }\n", - " \n", - " # Check if cat_vals was created successfully\n", - " if (is.null(cat_vals) || length(cat_vals) != nrow(map_df)) {\n", - " log_msg(glue::glue(\"[WARNING] Failed to create 'cat' column for '{value_col}' in year {yr}. Skipping map.\"), level = \"warning\")\n", - " next\n", - " }\n", - " \n", - " # Check if all values are NA (cut failed) - but allow some NA values\n", - " if (all(is.na(cat_vals))) {\n", - " log_msg(glue::glue(\"[WARNING] All 'cat' values are NA for '{value_col}' in year {yr}. Skipping map.\"), level = \"warning\")\n", - " next\n", - " }\n", - " \n", - " # Add cat column using dplyr::mutate to ensure it's properly added to sf object\n", - " map_df <- dplyr::mutate(map_df, cat = as.factor(cat_vals))\n", - " \n", - " # Verify cat column exists before creating plot\n", - " if (!(\"cat\" %in% names(map_df))) {\n", - " log_msg(glue::glue(\"[WARNING] Failed to add 'cat' column to map_df for '{value_col}' in year {yr}. 
Skipping map.\"), level = \"warning\")\n", - " next\n", - " }\n", - " \n", - " # Create plot AFTER cat column is added\n", - " p <- ggplot(map_df) +\n", - " geom_sf(aes(fill = cat), color = \"grey60\", size = 0.1) +\n", - " scale_fill_brewer(palette = fill_palette, na.value = \"white\", drop = FALSE)\n", - "\n", - " p <- p +\n", - " theme_void() +\n", - " labs(\n", - " title = paste0(title_prefix, \" - \", yr),\n", - " fill = value_col,\n", - " caption = \"Source: SNT DHIS2 outliers-imputed routine data\"\n", - " ) +\n", - " theme(\n", - " legend.position = \"bottom\",\n", - " plot.title = element_text(face = \"bold\", size = 12)\n", - " )\n", - "\n", - " out_png <- file.path(FIGURES_PATH, glue::glue(\"{filename_prefix}_{yr}.png\"))\n", - " \n", - " # Try to save the plot, catch any errors\n", - " tryCatch({\n", - " ggsave(out_png, plot = p, width = 9, height = 7, dpi = 300, bg = \"white\")\n", - " log_msg(glue::glue(\"Saved map: {out_png}\"))\n", - " }, error = function(e) {\n", - " log_msg(glue::glue(\"[WARNING] Failed to save map for '{value_col}' year {yr}: {conditionMessage(e)}\"), level = \"warning\")\n", - " })\n", - " }\n", - "}\n", - "\n", - "# Plot only indicators that were calculated (columns exist)\n", - "if (\"testing_rate\" %in% names(qoc)) {\n", - " plot_yearly_map(qoc, shapes, \"testing_rate\", \"Testing rate (TEST / SUSP)\", \"testing_rate\", TRUE)\n", - "}\n", - "if (\"treatment_rate\" %in% names(qoc)) {\n", - " plot_yearly_map(qoc, shapes, \"treatment_rate\", \"Treatment rate (MALTREAT / CONF)\", \"treatment_rate\", TRUE)\n", - "}\n", - "if (\"case_fatality_rate\" %in% names(qoc)) {\n", - " plot_yearly_map(qoc, shapes, \"case_fatality_rate\", \"In-hospital case fatality rate (MALDTH / MALADM)\", \"case_fatality_rate\", TRUE)\n", - "}\n", - "if (\"prop_adm_malaria\" %in% names(qoc)) {\n", - " plot_yearly_map(qoc, shapes, \"prop_adm_malaria\", \"Proportion admitted for malaria (MALADM / ALLADM)\", \"prop_adm_malaria\", TRUE)\n", - "}\n", - "if 
(\"prop_malaria_deaths\" %in% names(qoc)) {\n", - " plot_yearly_map(qoc, shapes, \"prop_malaria_deaths\", \"Proportion of malaria deaths (MALDTH / ALLDTH)\", \"prop_malaria_deaths\", TRUE)\n", - "}\n", - "if (\"non_malaria_all_cause_outpatients\" %in% names(qoc)) {\n", - " plot_yearly_map(qoc, shapes, \"non_malaria_all_cause_outpatients\", \"Non-malaria all-cause outpatients (ALLOUT)\", \"allout\", FALSE)\n", - "}\n", - "if (\"presumed_cases\" %in% names(qoc)) {\n", - " plot_yearly_map(qoc, shapes, \"presumed_cases\", \"Presumed cases (PRES)\", \"presumed_cases\", FALSE)\n", - "}\n", - "\n", - "log_msg(glue::glue(\"Saved yearly maps in: {FIGURES_PATH}\"))" - ] + "cells": [ + { + "cell_type": "markdown", + "id": "fad6c24e", + "metadata": {}, + "source": [ + "## Quality of Care Indicators\n", + "\n", + "Compute district-year quality-of-care indicators from DHIS2 outliers-imputed routine data.\n", + "\n", + "Indicators:\n", + "- testing_rate = TEST / SUSP\n", + "- treatment_rate = MALTREAT / CONF\n", + "- case_fatality_rate = MALDTH / MALADM\n", + "- prop_adm_malaria = MALADM / ALLADM\n", + "- prop_malaria_deaths = MALDTH / ALLDTH\n", + "- non_malaria_all_cause_outpatients = ALLOUT (absolute)\n", + "- presumed_cases = PRES (absolute)\n", + "\n", + "Stock-out indicators are not implemented yet (on hold, NMDR data pending)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "317c4085", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Preliminaries\n", + "options(scipen=999)\n", + "\n", + "ROOT_PATH <- \"~/workspace\"\n", + "CONFIG_PATH <- file.path(ROOT_PATH, \"configuration\")\n", + "CODE_PATH <- file.path(ROOT_PATH, \"code\")\n", + "DATA_PATH <- file.path(ROOT_PATH, \"data\")\n", + "OUTPUT_DATA_PATH <- file.path(DATA_PATH, \"dhis2\", \"quality_of_care\")\n", + "FIGURES_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_quality_of_care\", \"reporting\", \"outputs\", \"figures\")\n", + "\n", + "dir.create(OUTPUT_DATA_PATH, recursive = TRUE, showWarnings = FALSE)\n", + "dir.create(FIGURES_PATH, recursive = TRUE, showWarnings = FALSE)\n", + "\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "required_packages <- c(\"jsonlite\", \"data.table\", \"arrow\", \"sf\", \"ggplot2\", \"glue\", \"reticulate\", \"RColorBrewer\", \"dplyr\")\n", + "install_and_load(required_packages)\n", + "\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "openhexa <- reticulate::import(\"openhexa.sdk\")\n", + "\n", + "config_json <- jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE\n", + "DHIS2_FORMATTED_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", + "OUTLIERS_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_OUTLIERS_IMPUTATION" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98b78bf7", + "metadata": { + "vscode": { + "languageId": "r" } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" + }, + "outputs": [], + "source": [ + "# Validate data_action parameter\n", + "if 
(!exists(\"data_action\")) {\n", + " data_action <- \"imputed\"\n", + "}\n", + "\n", + "allowed_actions <- c(\"imputed\", \"removed\")\n", + "if (!(data_action %in% allowed_actions)) {\n", + " stop(glue::glue(\"Invalid data_action: {data_action}. Allowed: {paste(allowed_actions, collapse=', ')}\"))\n", + "}\n", + "\n", + "# Automatically find the latest routine outliers-imputed file in the dataset\n", + "# Pattern: {COUNTRY_CODE}_routine_outliers-*_{data_action}.parquet\n", + "log_msg(glue::glue(\"Searching for latest routine outliers-imputed file in dataset (data_action: {data_action})...\"))\n", + "\n", + "dataset_last_version <- openhexa$workspace$get_dataset(OUTLIERS_DATASET)$latest_version\n", + "if (is.null(dataset_last_version)) {\n", + " stop(glue::glue(\"[ERROR] No version available in dataset `{OUTLIERS_DATASET}`. Process stopped.\"))\n", + "}\n", + "\n", + "# Pattern to match: {COUNTRY_CODE}_routine_outliers-*_{data_action}.parquet\n", + "pattern_prefix <- glue::glue(\"{COUNTRY_CODE}_routine_outliers-\")\n", + "pattern_suffix <- glue::glue(\"_{data_action}.parquet\")\n", + "routine_filename <- NULL\n", + "files_list <- reticulate::iterate(dataset_last_version$files)\n", + "\n", + "# Find all matching files and select the latest one\n", + "matching_files <- c()\n", + "for (file in files_list) {\n", + " filename <- file$filename\n", + " if (startsWith(filename, pattern_prefix) && endsWith(filename, pattern_suffix)) {\n", + " matching_files <- c(matching_files, filename)\n", + " }\n", + "}\n", + "\n", + "if (length(matching_files) == 0) {\n", + " stop(glue::glue(\"[ERROR] No file matching pattern `{pattern_prefix}*{pattern_suffix}` found in dataset `{OUTLIERS_DATASET}`. 
\",\n", + " \"Please run an outlier imputation pipeline first (e.g., snt_dhis2_outliers_imputation_mean) with `data_action=\\\"{data_action}\\\"`.\"))\n", + "}\n", + "\n", + "# Select the latest file (alphabetically sorted, which should correspond to most recent method)\n", + "routine_filename <- sort(matching_files, decreasing = TRUE)[1]\n", + "\n", + "log_msg(glue::glue(\"Found {length(matching_files)} matching file(s). Using latest: {routine_filename}\"))\n", + "\n", + "# Load the routine file\n", + "routine <- tryCatch({\n", + " get_latest_dataset_file_in_memory(OUTLIERS_DATASET, routine_filename)\n", + "}, error = function(e) {\n", + " msg <- paste0(\"[ERROR] 🛑 Error while loading DHIS2 routine data file `\", routine_filename, \n", + " \"` from `\", OUTLIERS_DATASET, \"`. [ERROR DETAILS] \", conditionMessage(e))\n", + " stop(msg)\n", + "})\n", + "\n", + "shapes <- get_latest_dataset_file_in_memory(DHIS2_FORMATTED_DATASET, paste0(COUNTRY_CODE, \"_shapes.geojson\"))\n", + "\n", + "setDT(routine)\n", + "\n", + "# Core required columns (must exist)\n", + "core_cols <- c(\"ADM2_ID\", \"YEAR\")\n", + "core_missing <- setdiff(core_cols, names(routine))\n", + "if (length(core_missing) > 0) {\n", + " stop(glue::glue(\"Missing core required columns in routine data: {paste(core_missing, collapse=', ')}\"))\n", + "}\n", + "\n", + "# Optional indicator columns (will be checked and handled gracefully)\n", + "indicator_cols <- c(\"TEST\", \"SUSP\", \"MALTREAT\", \"CONF\", \"MALDTH\", \"MALADM\", \"ALLADM\", \"ALLDTH\", \"ALLOUT\", \"PRES\")\n", + "available_cols <- intersect(indicator_cols, names(routine))\n", + "missing_cols <- setdiff(indicator_cols, names(routine))\n", + "\n", + "if (length(missing_cols) > 0) {\n", + " log_msg(glue::glue(\"[WARNING] Some indicator columns are missing: {paste(missing_cols, collapse=', ')}. 
These indicators will not be calculated.\"), level = \"warning\")\n", + "}\n", + "\n", + "# Convert available numeric columns\n", + "# Handle \"-\" and other non-numeric values by converting them to NA first\n", + "num_cols <- intersect(available_cols, names(routine))\n", + "if (length(num_cols) > 0) {\n", + " for (col in num_cols) {\n", + " # First convert to character to handle \"-\" strings, then replace with NA, then convert to numeric\n", + " col_vals <- as.character(routine[[col]])\n", + " col_vals[is.na(col_vals) | col_vals == \"\" | col_vals == \"-\"] <- NA_character_\n", + " routine[, (col) := as.numeric(col_vals)]\n", + " }\n", + "}\n", + "routine[, YEAR := as.integer(YEAR)]\n", + "routine[, ADM2_ID := as.character(ADM2_ID)]\n", + "\n", + "# Aggregate available columns only using lapply\n", + "if (length(available_cols) > 0) {\n", + " qoc <- routine[, lapply(.SD, function(x) sum(x, na.rm = TRUE)), \n", + " .SDcols = available_cols, \n", + " by = .(ADM2_ID, YEAR)]\n", + "} else {\n", + " # If no indicator columns available, create empty structure\n", + " qoc <- routine[, .(ADM2_ID, YEAR)]\n", + " qoc <- unique(qoc)\n", + "}\n", + "\n", + "# Calculate indicators only if required columns are available\n", + "if (\"TEST\" %in% names(qoc) && \"SUSP\" %in% names(qoc)) {\n", + " qoc[, testing_rate := fifelse(SUSP > 0, TEST / SUSP, NA_real_)]\n", + "} else {\n", + " log_msg(\"[WARNING] Cannot calculate testing_rate: missing TEST or SUSP columns\", level = \"warning\")\n", + "}\n", + "\n", + "if (\"MALTREAT\" %in% names(qoc) && \"CONF\" %in% names(qoc)) {\n", + " qoc[, treatment_rate := fifelse(CONF > 0, MALTREAT / CONF, NA_real_)]\n", + "} else {\n", + " log_msg(\"[WARNING] Cannot calculate treatment_rate: missing MALTREAT or CONF columns\", level = \"warning\")\n", + "}\n", + "\n", + "if (\"MALDTH\" %in% names(qoc) && \"MALADM\" %in% names(qoc)) {\n", + " qoc[, case_fatality_rate := fifelse(MALADM > 0, MALDTH / MALADM, NA_real_)]\n", + "} else {\n", + " 
log_msg(\"[WARNING] Cannot calculate case_fatality_rate: missing MALDTH or MALADM columns\", level = \"warning\")\n", + "}\n", + "\n", + "if (\"MALADM\" %in% names(qoc) && \"ALLADM\" %in% names(qoc)) {\n", + " qoc[, prop_adm_malaria := fifelse(ALLADM > 0, MALADM / ALLADM, NA_real_)]\n", + "} else {\n", + " log_msg(\"[WARNING] Cannot calculate prop_adm_malaria: missing MALADM or ALLADM columns\", level = \"warning\")\n", + "}\n", + "\n", + "if (\"MALDTH\" %in% names(qoc) && \"ALLDTH\" %in% names(qoc)) {\n", + " qoc[, prop_malaria_deaths := fifelse(ALLDTH > 0, MALDTH / ALLDTH, NA_real_)]\n", + " # Compatibility alias to match historical notebook export naming\n", + " qoc[, prop_deaths_malaria := prop_malaria_deaths]\n", + "} else {\n", + " log_msg(\"[WARNING] Cannot calculate prop_malaria_deaths: missing MALDTH or ALLDTH columns\", level = \"warning\")\n", + "}\n", + "\n", + "if (\"ALLOUT\" %in% names(qoc)) {\n", + " qoc[, non_malaria_all_cause_outpatients := ALLOUT]\n", + "} else {\n", + " log_msg(\"[WARNING] Cannot calculate non_malaria_all_cause_outpatients: missing ALLOUT column\", level = \"warning\")\n", + "}\n", + "\n", + "if (\"PRES\" %in% names(qoc)) {\n", + " qoc[, presumed_cases := PRES]\n", + "} else {\n", + " log_msg(\"[WARNING] Cannot calculate presumed_cases: missing PRES column\", level = \"warning\")\n", + "}\n", + "\n", + "shapes_dt <- as.data.table(sf::st_drop_geometry(shapes))\n", + "if (\"ADM2_ID\" %in% names(shapes_dt) && \"ADM2_NAME\" %in% names(shapes_dt)) {\n", + " shapes_dt[, ADM2_ID := as.character(ADM2_ID)]\n", + " qoc <- merge(qoc, unique(shapes_dt[, .(ADM2_ID, ADM2_NAME)]), by = \"ADM2_ID\", all.x = TRUE)\n", + "}\n", + "\n", + "# Persist only district-year outputs (requested)\n", + "out_district_parquet <- file.path(OUTPUT_DATA_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_district_year_{data_action}.parquet\"))\n", + "out_district_csv <- file.path(OUTPUT_DATA_PATH, 
glue::glue(\"{COUNTRY_CODE}_quality_of_care_district_year_{data_action}.csv\"))\n", + "\n", + "arrow::write_parquet(qoc, out_district_parquet)\n", + "data.table::fwrite(qoc, out_district_csv)\n", + "\n", + "log_msg(glue::glue(\"Saved outputs: {out_district_parquet}, {out_district_csv}\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "984689b0", + "metadata": { + "vscode": { + "languageId": "r" } + }, + "outputs": [], + "source": [ + "# Yearly maps by ADM2\n", + "# Ensure ADM2_ID is character in both objects (do this once before the function)\n", + "shapes$ADM2_ID <- as.character(shapes$ADM2_ID)\n", + "qoc$ADM2_ID <- as.character(qoc$ADM2_ID)\n", + "\n", + "plot_yearly_map <- function(df, sf_shapes, value_col, title_prefix, filename_prefix, is_rate = TRUE) {\n", + " # Check if value_col exists in df\n", + " if (!(value_col %in% names(df))) {\n", + " log_msg(glue::glue(\"[WARNING] Column '{value_col}' not found in data. Skipping map generation.\"), level = \"warning\")\n", + " return(invisible(NULL))\n", + " }\n", + " \n", + " # Create a local copy of sf_shapes to avoid modifying the original\n", + " sf_shapes_local <- sf_shapes\n", + " if (!is.character(sf_shapes_local$ADM2_ID)) {\n", + " sf_shapes_local$ADM2_ID <- as.character(sf_shapes_local$ADM2_ID)\n", + " }\n", + " \n", + " years <- sort(unique(df$YEAR))\n", + " for (yr in years) {\n", + " df_y <- df[YEAR == yr]\n", + " \n", + " # Check if df_y has any rows\n", + " if (nrow(df_y) == 0) {\n", + " log_msg(glue::glue(\"[WARNING] No data for '{value_col}' in year {yr}. 
Skipping map.\"), level = \"warning\")\n", + " next\n", + " }\n", + " \n", + " # Ensure ADM2_ID is character in df_y\n", + " df_y$ADM2_ID <- as.character(df_y$ADM2_ID)\n", + " \n", + " # Use dplyr::left_join for sf objects to preserve geometry (use local copy)\n", + " map_df <- dplyr::left_join(sf_shapes_local, df_y, by = \"ADM2_ID\")\n", + "\n", + " # Check if value_col exists in map_df after merge\n", + " if (!(value_col %in% names(map_df))) {\n", + " log_msg(glue::glue(\"[WARNING] Column '{value_col}' not found after merge for year {yr}. Skipping map.\"), level = \"warning\")\n", + " next\n", + " }\n", + "\n", + " vals <- map_df[[value_col]]\n", + " finite_vals <- vals[is.finite(vals) & !is.na(vals)]\n", + " \n", + " # If no valid values, skip this map\n", + " if (length(finite_vals) == 0) {\n", + " log_msg(glue::glue(\"[WARNING] No valid values for '{value_col}' in year {yr}. Skipping map.\"), level = \"warning\")\n", + " next\n", + " }\n", + "\n", + " # Create cat column BEFORE creating the plot\n", + " cat_vals <- NULL\n", + " fill_palette <- NULL\n", + " \n", + " if (is_rate) {\n", + " # Create cat column with proper handling of NA values\n", + " cat_result <- tryCatch({\n", + " cat_vals <- cut(\n", + " vals,\n", + " breaks = c(-Inf, 0, 0.2, 0.4, 0.6, 0.8, 1.0, Inf),\n", + " labels = c(\"<0\", \"0-0.2\", \"0.2-0.4\", \"0.4-0.6\", \"0.6-0.8\", \"0.8-1.0\", \">1.0\"),\n", + " include.lowest = TRUE\n", + " )\n", + " fill_palette <- \"YlOrRd\"\n", + " TRUE # Success\n", + " }, error = function(e) {\n", + " log_msg(glue::glue(\"[WARNING] Failed to create categories for '{value_col}' year {yr}: {conditionMessage(e)}\"), level = \"warning\")\n", + " FALSE # Failure\n", + " })\n", + " if (!cat_result) {\n", + " next\n", + " }\n", + " } else {\n", + " cat_result <- tryCatch({\n", + " if (length(finite_vals) > 4) {\n", + " br <- unique(as.numeric(quantile(finite_vals, probs = seq(0, 1, 0.2), na.rm = TRUE)))\n", + " if (length(br) < 2) {\n", + " cat_vals <- 
as.factor(rep(\"all\", nrow(map_df)))\n", + " } else {\n", + " cat_vals <- cut(vals, breaks = br, include.lowest = TRUE)\n", + " }\n", + " } else {\n", + " cat_vals <- as.factor(vals)\n", + " }\n", + " fill_palette <- \"Blues\"\n", + " TRUE # Success\n", + " }, error = function(e) {\n", + " log_msg(glue::glue(\"[WARNING] Failed to create categories for '{value_col}' year {yr}: {conditionMessage(e)}\"), level = \"warning\")\n", + " FALSE # Failure\n", + " })\n", + " if (!cat_result) {\n", + " next\n", + " }\n", + " }\n", + " \n", + " # Check if cat_vals was created successfully\n", + " if (is.null(cat_vals) || length(cat_vals) != nrow(map_df)) {\n", + " log_msg(glue::glue(\"[WARNING] Failed to create 'cat' column for '{value_col}' in year {yr}. Skipping map.\"), level = \"warning\")\n", + " next\n", + " }\n", + " \n", + " # Check if all values are NA (cut failed) - but allow some NA values\n", + " if (all(is.na(cat_vals))) {\n", + " log_msg(glue::glue(\"[WARNING] All 'cat' values are NA for '{value_col}' in year {yr}. Skipping map.\"), level = \"warning\")\n", + " next\n", + " }\n", + " \n", + " # Add cat column using dplyr::mutate to ensure it's properly added to sf object\n", + " map_df <- dplyr::mutate(map_df, cat = as.factor(cat_vals))\n", + " \n", + " # Verify cat column exists before creating plot\n", + " if (!(\"cat\" %in% names(map_df))) {\n", + " log_msg(glue::glue(\"[WARNING] Failed to add 'cat' column to map_df for '{value_col}' in year {yr}. 
Skipping map.\"), level = \"warning\")\n", + " next\n", + " }\n", + " \n", + " # Create plot AFTER cat column is added\n", + " p <- ggplot(map_df) +\n", + " geom_sf(aes(fill = cat), color = \"grey60\", size = 0.1) +\n", + " scale_fill_brewer(palette = fill_palette, na.value = \"white\", drop = FALSE)\n", + "\n", + " p <- p +\n", + " theme_void() +\n", + " labs(\n", + " title = paste0(title_prefix, \" - \", yr),\n", + " fill = value_col,\n", + " caption = \"Source: SNT DHIS2 outliers-imputed routine data\"\n", + " ) +\n", + " theme(\n", + " legend.position = \"bottom\",\n", + " plot.title = element_text(face = \"bold\", size = 12)\n", + " )\n", + "\n", + " out_png <- file.path(FIGURES_PATH, glue::glue(\"{filename_prefix}_{yr}.png\"))\n", + " \n", + " # Try to save the plot, catch any errors\n", + " tryCatch({\n", + " ggsave(out_png, plot = p, width = 9, height = 7, dpi = 300, bg = \"white\")\n", + " log_msg(glue::glue(\"Saved map: {out_png}\"))\n", + " }, error = function(e) {\n", + " log_msg(glue::glue(\"[WARNING] Failed to save map for '{value_col}' year {yr}: {conditionMessage(e)}\"), level = \"warning\")\n", + " })\n", + " }\n", + "}\n", + "\n", + "# Plot only indicators that were calculated (columns exist)\n", + "if (\"testing_rate\" %in% names(qoc)) {\n", + " plot_yearly_map(qoc, shapes, \"testing_rate\", \"Testing rate (TEST / SUSP)\", \"testing_rate\", TRUE)\n", + "}\n", + "if (\"treatment_rate\" %in% names(qoc)) {\n", + " plot_yearly_map(qoc, shapes, \"treatment_rate\", \"Treatment rate (MALTREAT / CONF)\", \"treatment_rate\", TRUE)\n", + "}\n", + "if (\"case_fatality_rate\" %in% names(qoc)) {\n", + " plot_yearly_map(qoc, shapes, \"case_fatality_rate\", \"In-hospital case fatality rate (MALDTH / MALADM)\", \"case_fatality_rate\", TRUE)\n", + "}\n", + "if (\"prop_adm_malaria\" %in% names(qoc)) {\n", + " plot_yearly_map(qoc, shapes, \"prop_adm_malaria\", \"Proportion admitted for malaria (MALADM / ALLADM)\", \"prop_adm_malaria\", TRUE)\n", + "}\n", + "if 
(\"prop_malaria_deaths\" %in% names(qoc)) {\n", + " plot_yearly_map(qoc, shapes, \"prop_malaria_deaths\", \"Proportion of malaria deaths (MALDTH / ALLDTH)\", \"prop_malaria_deaths\", TRUE)\n", + "}\n", + "if (\"non_malaria_all_cause_outpatients\" %in% names(qoc)) {\n", + " plot_yearly_map(qoc, shapes, \"non_malaria_all_cause_outpatients\", \"Non-malaria all-cause outpatients (ALLOUT)\", \"allout\", FALSE)\n", + "}\n", + "if (\"presumed_cases\" %in% names(qoc)) {\n", + " plot_yearly_map(qoc, shapes, \"presumed_cases\", \"Presumed cases (PRES)\", \"presumed_cases\", FALSE)\n", + "}\n", + "\n", + "log_msg(glue::glue(\"Saved yearly maps in: {FIGURES_PATH}\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" }, - "nbformat": 4, - "nbformat_minor": 5 + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb b/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb index 4021110..045eb65 100644 --- a/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb +++ b/pipelines/snt_dhis2_quality_of_care/reporting/snt_dhis2_quality_of_care_report.ipynb @@ -1,648 +1,648 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "7d246ae9", - "metadata": {}, - "source": [ - "## Quality of Care Report\n", - "\n", - "This report displays a compact year-level summary of quality-of-care indicators and points to generated map outputs." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5eaa5bab", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "ROOT_PATH <- \"~/workspace\"\n", - "CONFIG_PATH <- file.path(ROOT_PATH, \"configuration\")\n", - "CODE_PATH <- file.path(ROOT_PATH, \"code\")\n", - "DATA_PATH <- file.path(ROOT_PATH, \"data\", \"dhis2\", \"quality_of_care\")\n", - "FIGURES_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_quality_of_care\", \"reporting\", \"outputs\", \"figures\")\n", - "\n", - "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", - "install_and_load(c(\"jsonlite\", \"data.table\", \"arrow\", \"dplyr\", \"knitr\", \"glue\", \"reticulate\", \"writexl\", \"ggplot2\", \"scales\", \"gridExtra\", \"sf\"))\n", - "\n", - "# Create output directories\n", - "REPORT_OUTPUTS_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_quality_of_care\", \"reporting\", \"outputs\")\n", - "dir.create(REPORT_OUTPUTS_PATH, recursive = TRUE, showWarnings = FALSE)\n", - "dir.create(FIGURES_PATH, recursive = TRUE, showWarnings = FALSE)\n", - "\n", - "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", - "openhexa <- reticulate::import(\"openhexa.sdk\")\n", - "\n", - "config_json <- jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))\n", - "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1a8320f8", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Use district-year output file\n", - "files <- list.files(DATA_PATH, pattern = paste0(\"^\", COUNTRY_CODE, \"_quality_of_care_district_year_imputed\\\\.parquet$\"), full.names = TRUE)\n", - "if (length(files) == 0) {\n", - " stop(glue::glue(\"No quality_of_care parquet found in {DATA_PATH}\"))\n", - "}\n", - "\n", - "latest_file <- files[which.max(file.info(files)$mtime)]\n", - "qoc <- as.data.table(arrow::read_parquet(latest_file))\n", - "\n", - 
"# Build summary table with only available columns\n", - "# Start with unique YEAR values\n", - "summary_tbl <- unique(qoc[, .(YEAR)])\n", - "\n", - "# Add rate indicators (mean) - merge one by one\n", - "if (\"testing_rate\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, \n", - " qoc[, .(testing_rate = mean(testing_rate, na.rm = TRUE)), by = .(YEAR)], \n", - " by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "if (\"treatment_rate\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, \n", - " qoc[, .(treatment_rate = mean(treatment_rate, na.rm = TRUE)), by = .(YEAR)], \n", - " by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "if (\"case_fatality_rate\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, \n", - " qoc[, .(case_fatality_rate = mean(case_fatality_rate, na.rm = TRUE)), by = .(YEAR)], \n", - " by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "if (\"prop_adm_malaria\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, \n", - " qoc[, .(prop_adm_malaria = mean(prop_adm_malaria, na.rm = TRUE)), by = .(YEAR)], \n", - " by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "if (\"prop_malaria_deaths\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, \n", - " qoc[, .(prop_malaria_deaths = mean(prop_malaria_deaths, na.rm = TRUE)), by = .(YEAR)], \n", - " by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "\n", - "# Add absolute indicators (sum)\n", - "if (\"non_malaria_all_cause_outpatients\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, \n", - " qoc[, .(non_malaria_all_cause_outpatients = sum(non_malaria_all_cause_outpatients, na.rm = TRUE)), by = .(YEAR)], \n", - " by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "if (\"presumed_cases\" %in% names(qoc)) {\n", - " summary_tbl <- merge(summary_tbl, \n", - " qoc[, .(presumed_cases = sum(presumed_cases, na.rm = TRUE)), by = .(YEAR)], \n", - " by = \"YEAR\", all.x = TRUE)\n", - "}\n", - "\n", - "summary_tbl <- summary_tbl[order(YEAR)]\n", - "\n", - "# Explicitly list missing 
indicators so report is self-explanatory\n", - "expected_indicators <- c(\n", - " \"testing_rate\",\n", - " \"treatment_rate\",\n", - " \"case_fatality_rate\",\n", - " \"prop_adm_malaria\",\n", - " \"prop_malaria_deaths\",\n", - " \"non_malaria_all_cause_outpatients\",\n", - " \"presumed_cases\"\n", - ")\n", - "missing_indicators <- setdiff(expected_indicators, names(qoc))\n", - "if (length(missing_indicators) > 0) {\n", - " log_msg(glue::glue(\"[WARNING] Missing indicators in input file: {paste(missing_indicators, collapse=', ')}\"), level = \"warning\")\n", - " cat(glue::glue(\"\\nMissing indicators in this run: {paste(missing_indicators, collapse=', ')}\\n\"))\n", - " cat(\"Reason: required source columns are absent in the selected outliers file.\\n\")\n", - "}\n", - "\n", - "# Save summary data (parquet, csv, xlsx) - following other pipelines pattern\n", - "summary_parquet <- file.path(REPORT_OUTPUTS_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_summary.parquet\"))\n", - "summary_csv <- file.path(REPORT_OUTPUTS_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_summary.csv\"))\n", - "summary_xlsx <- file.path(REPORT_OUTPUTS_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_summary.xlsx\"))\n", - "\n", - "# Save as parquet (primary format, like other pipelines)\n", - "arrow::write_parquet(summary_tbl, summary_parquet)\n", - "\n", - "# Save as csv and xlsx for compatibility\n", - "data.table::fwrite(summary_tbl, summary_csv)\n", - "writexl::write_xlsx(list(summary = as.data.frame(summary_tbl)), summary_xlsx)\n", - "\n", - "log_msg(glue::glue(\"Summary data saved to: {summary_parquet}, {summary_csv}, {summary_xlsx}\"))\n", - "\n", - "knitr::kable(summary_tbl, caption = \"Quality of Care - Year-level summary\")\n", - "\n", - "cat(glue::glue(\"\\nLoaded file: {latest_file}\\n\"))\n", - "cat(glue::glue(\"Map outputs folder: {FIGURES_PATH}\\n\"))\n", - "cat(glue::glue(\"Summary data saved to: {summary_parquet}, {summary_csv}, {summary_xlsx}\\n\"))" - ] - }, - { - 
"cell_type": "markdown", - "id": "3dc318ac", - "metadata": {}, - "source": [ - "## Graphs by Year" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0e86bb0a", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Create bar charts by year (same as original notebook - 4x2 grid layout)\n", - "# Prepare data - convert rates to percentages\n", - "plot_data <- copy(summary_tbl)\n", - "\n", - "# Create the same 4x2 subplot layout as original notebook\n", - "if (nrow(plot_data) > 0) {\n", - " # Create a list to store individual plots (in order: 4x2 grid)\n", - " plots_list <- list()\n", - " \n", - " # Row 0, Col 0: Testing rate\n", - " if (\"testing_rate\" %in% names(plot_data)) {\n", - " p <- ggplot(plot_data, aes(x = factor(YEAR), y = testing_rate * 100)) +\n", - " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", - " geom_text(aes(label = paste0(round(testing_rate * 100, 1), \"%\")), \n", - " vjust = -0.5, size = 2.5) +\n", - " labs(title = \"Testing rate (TEST / SUSP)\", x = \"Année\", y = \"%\") +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 10),\n", - " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", - " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", - " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " plot.margin = margin(5, 5, 5, 5)\n", - " ) +\n", - " scale_y_continuous(expand = expansion(mult = c(0, 0.1)))\n", - " plots_list[[\"testing_rate\"]] <- p\n", - " }\n", - " \n", - " # Row 0, Col 1: Treatment rate\n", - " if (\"treatment_rate\" %in% names(plot_data)) {\n", - " p <- ggplot(plot_data, aes(x = factor(YEAR), y = treatment_rate * 100)) +\n", - " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", - " 
geom_text(aes(label = paste0(round(treatment_rate * 100, 1), \"%\")), \n", - " vjust = -0.5, size = 2.5) +\n", - " labs(title = \"Treatment rate (MALTREAT / CONF)\", x = \"Année\", y = \"%\") +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 10),\n", - " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", - " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", - " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " plot.margin = margin(5, 5, 5, 5)\n", - " ) +\n", - " scale_y_continuous(expand = expansion(mult = c(0, 0.1)))\n", - " plots_list[[\"treatment_rate\"]] <- p\n", - " }\n", - " \n", - " # Row 1, Col 0: Case fatality rate\n", - " if (\"case_fatality_rate\" %in% names(plot_data)) {\n", - " p <- ggplot(plot_data, aes(x = factor(YEAR), y = case_fatality_rate * 100)) +\n", - " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", - " geom_text(aes(label = paste0(round(case_fatality_rate * 100, 1), \"%\")), \n", - " vjust = -0.5, size = 2.5) +\n", - " labs(title = \"Case fatality rate (MALDTH / MALADM)\", x = \"Année\", y = \"%\") +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 10),\n", - " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", - " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", - " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " plot.margin = margin(5, 5, 5, 5)\n", - " ) +\n", - " scale_y_continuous(expand = expansion(mult = c(0, 0.1)))\n", - " plots_list[[\"case_fatality_rate\"]] <- p\n", - " }\n", - " \n", - " # Row 1, Col 1: Proportion admissions malaria\n", - " if (\"prop_adm_malaria\" %in% 
names(plot_data)) {\n", - " p <- ggplot(plot_data, aes(x = factor(YEAR), y = prop_adm_malaria * 100)) +\n", - " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", - " geom_text(aes(label = paste0(round(prop_adm_malaria * 100, 1), \"%\")), \n", - " vjust = -0.5, size = 2.5) +\n", - " labs(title = \"Prop. admissions paludisme (MALADM / ALLADM)\", x = \"Année\", y = \"%\") +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 10),\n", - " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", - " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", - " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " plot.margin = margin(5, 5, 5, 5)\n", - " ) +\n", - " scale_y_continuous(expand = expansion(mult = c(0, 0.1)))\n", - " plots_list[[\"prop_adm_malaria\"]] <- p\n", - " }\n", - " \n", - " # Row 2, Col 0: Proportion deaths malaria\n", - " if (\"prop_malaria_deaths\" %in% names(plot_data)) {\n", - " p <- ggplot(plot_data, aes(x = factor(YEAR), y = prop_malaria_deaths * 100)) +\n", - " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", - " geom_text(aes(label = paste0(round(prop_malaria_deaths * 100, 1), \"%\")), \n", - " vjust = -0.5, size = 2.5) +\n", - " labs(title = \"Prop. 
décès paludisme (MALDTH / ALLDTH)\", x = \"Année\", y = \"%\") +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 10),\n", - " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", - " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", - " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " plot.margin = margin(5, 5, 5, 5)\n", - " ) +\n", - " scale_y_continuous(expand = expansion(mult = c(0, 0.1)))\n", - " plots_list[[\"prop_malaria_deaths\"]] <- p\n", - " }\n", - " \n", - " # Row 2, Col 1: Presumed cases (absolute)\n", - " if (\"presumed_cases\" %in% names(plot_data)) {\n", - " format_label <- function(v) {\n", - " ifelse(is.na(v) | v == 0, \"0\",\n", - " ifelse(v >= 1e6, paste0(round(v/1e6, 2), \"M\"),\n", - " format(round(v), big.mark = \" \", scientific = FALSE)\n", - " )\n", - " )\n", - " }\n", - " p <- ggplot(plot_data, aes(x = factor(YEAR), y = presumed_cases)) +\n", - " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", - " geom_text(aes(label = format_label(presumed_cases)), \n", - " vjust = -0.5, size = 2.5) +\n", - " labs(title = \"Cas présumés (PRES)\", x = \"Année\", y = \"Nombre\") +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 10),\n", - " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", - " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", - " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " plot.margin = margin(5, 5, 5, 5)\n", - " ) +\n", - " scale_y_continuous(labels = scales::comma, expand = expansion(mult = c(0, 0.1)))\n", - " plots_list[[\"presumed_cases\"]] <- p\n", - " }\n", - " \n", - " # Row 
3, Col 0: Non-malaria all-cause outpatients (absolute)\n", - " if (\"non_malaria_all_cause_outpatients\" %in% names(plot_data)) {\n", - " format_label <- function(v) {\n", - " ifelse(is.na(v) | v == 0, \"0\",\n", - " ifelse(v >= 1e6, paste0(round(v/1e6, 2), \"M\"),\n", - " format(round(v), big.mark = \" \", scientific = FALSE)\n", - " )\n", - " )\n", - " }\n", - " p <- ggplot(plot_data, aes(x = factor(YEAR), y = non_malaria_all_cause_outpatients)) +\n", - " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", - " geom_text(aes(label = format_label(non_malaria_all_cause_outpatients)), \n", - " vjust = -0.5, size = 2.5) +\n", - " labs(title = \"Consultations externes non-paludisme (ALLOUT)\", x = \"Année\", y = \"Nombre\") +\n", - " theme_minimal() +\n", - " theme(\n", - " plot.title = element_text(face = \"bold\", size = 10),\n", - " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", - " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", - " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", - " plot.margin = margin(5, 5, 5, 5)\n", - " ) +\n", - " scale_y_continuous(labels = scales::comma, expand = expansion(mult = c(0, 0.1)))\n", - " plots_list[[\"non_malaria_all_cause_outpatients\"]] <- p\n", - " }\n", - " \n", - " # Create and display combined plot (dynamic grid for readability)\n", - " if (length(plots_list) > 0) {\n", - " # Order plots as in original\n", - " plot_order <- c(\"testing_rate\", \"treatment_rate\", \"case_fatality_rate\", \"prop_adm_malaria\", \n", - " \"prop_malaria_deaths\", \"presumed_cases\", \"non_malaria_all_cause_outpatients\")\n", - " available_plots <- plots_list[intersect(plot_order, names(plots_list))]\n", - "\n", - " if (length(available_plots) > 0) {\n", - " n_plots <- length(available_plots)\n", - " ncol_layout <- 2\n", - " nrow_layout <- 
ceiling(n_plots / ncol_layout)\n", - "\n", - " # Bigger display in report so labels are readable\n", - " options(repr.plot.width = 14, repr.plot.height = max(7, 4.8 * nrow_layout))\n", - "\n", - " combined_plot <- do.call(grid.arrange, c(available_plots, ncol = ncol_layout, nrow = nrow_layout))\n", - " print(combined_plot)\n", - "\n", - " # Save at larger size for presentation readability\n", - " combined_file <- file.path(FIGURES_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_by_year.png\"))\n", - " ggsave(\n", - " combined_file,\n", - " plot = combined_plot,\n", - " width = 18,\n", - " height = max(8, 5.2 * nrow_layout),\n", - " dpi = 300,\n", - " bg = \"white\",\n", - " units = \"in\"\n", - " )\n", - " log_msg(glue::glue(\"Combined bar charts saved: {combined_file}\"))\n", - " }\n", - " }\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "3b625d36", - "metadata": {}, - "source": [ - "## Maps by District and Year\n", - "\n", - "Maps are generated directly from the quality-of-care data and district shapes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6056a979", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [ - "# Load shapes geojson from dataset (like seasonality pipeline)\n", - "DHIS2_FORMATTED_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n", - "\n", - "shapes <- tryCatch({\n", - " get_latest_dataset_file_in_memory(DHIS2_FORMATTED_DATASET, paste0(COUNTRY_CODE, \"_shapes.geojson\"))\n", - "}, error = function(e) {\n", - " msg <- paste0(\"Error while loading DHIS2 Shapes data for: \", COUNTRY_CODE, \". 
\", conditionMessage(e))\n", - " log_msg(msg, level = \"error\")\n", - " stop(msg)\n", - "})\n", - "\n", - "# Ensure ADM2_ID is character in both datasets\n", - "shapes$ADM2_ID <- as.character(shapes$ADM2_ID)\n", - "qoc$ADM2_ID <- as.character(qoc$ADM2_ID)\n", - "\n", - "# Merge shapes with quality-of-care data\n", - "qoc_sf <- shapes %>%\n", - " dplyr::left_join(qoc, by = \"ADM2_ID\")\n", - "\n", - "# Helper to build readable interval labels for legends\n", - "format_interval_labels <- function(breaks_vec) {\n", - " labels <- c()\n", - " for (i in seq_len(length(breaks_vec) - 1)) {\n", - " a <- breaks_vec[i]\n", - " b <- breaks_vec[i + 1]\n", - " labels <- c(labels, paste0(scales::comma(round(a)), \" - \", scales::comma(round(b))))\n", - " }\n", - " labels\n", - "}\n", - "\n", - "# Function to plot yearly maps (similar to code notebook but inline in report)\n", - "plot_yearly_map_report <- function(sf_data, value_col, title_prefix, is_rate = TRUE) {\n", - " if (!(value_col %in% names(sf_data))) {\n", - " log_msg(glue::glue(\"[WARNING] Column '{value_col}' not found. Skipping map generation.\"), level = \"warning\")\n", - " return(invisible(NULL))\n", - " }\n", - " \n", - " years <- sort(unique(sf_data$YEAR[!is.na(sf_data$YEAR)]))\n", - " if (length(years) == 0) {\n", - " log_msg(glue::glue(\"[WARNING] No valid years for '{value_col}'. 
Skipping map.\"), level = \"warning\")\n", - " return(invisible(NULL))\n", - " }\n", - " \n", - " # Create plots for each year\n", - " plot_list <- list()\n", - " base_shapes <- sf_data %>% dplyr::select(ADM2_ID, geometry) %>% dplyr::distinct()\n", - "\n", - " for (yr in years) {\n", - " # Keep all districts on map, then join year values\n", - " year_vals <- sf_data[sf_data$YEAR == yr, c(\"ADM2_ID\", value_col), drop = FALSE]\n", - " year_vals <- sf::st_drop_geometry(year_vals)\n", - " year_vals <- year_vals[!duplicated(year_vals$ADM2_ID), , drop = FALSE]\n", - " sf_y <- dplyr::left_join(base_shapes, year_vals, by = \"ADM2_ID\")\n", - "\n", - " vals <- sf_y[[value_col]]\n", - " finite_vals <- vals[is.finite(vals) & !is.na(vals)]\n", - "\n", - " if (length(finite_vals) == 0) {\n", - " next\n", - " }\n", - "\n", - " # Create categories\n", - " if (is_rate) {\n", - " cat_vals <- cut(\n", - " vals,\n", - " breaks = c(-Inf, 0, 0.2, 0.4, 0.6, 0.8, 1.0, Inf),\n", - " labels = c(\"<0\", \"0-0.2\", \"0.2-0.4\", \"0.4-0.6\", \"0.6-0.8\", \"0.8-1.0\", \">1.0\"),\n", - " include.lowest = TRUE\n", - " )\n", - " fill_palette <- \"YlOrRd\"\n", - " } else {\n", - " # Use readable fixed-count classes for absolute values\n", - " n_classes <- 5\n", - " br <- unique(as.numeric(quantile(finite_vals, probs = seq(0, 1, length.out = n_classes + 1), na.rm = TRUE)))\n", - " br <- sort(br)\n", - " if (length(br) < 2) {\n", - " br <- c(min(finite_vals, na.rm = TRUE), max(finite_vals, na.rm = TRUE) + 1)\n", - " }\n", - " if (length(unique(br)) < 2) {\n", - " cat_vals <- as.factor(rep(\"single value\", nrow(sf_y)))\n", - " } else {\n", - " labels_abs <- format_interval_labels(br)\n", - " cat_vals <- cut(vals, breaks = br, include.lowest = TRUE, labels = labels_abs)\n", - " }\n", - " fill_palette <- \"Blues\"\n", - " }\n", - "\n", - " sf_y$cat <- as.factor(cat_vals)\n", - "\n", - " p <- ggplot(sf_y) +\n", - " geom_sf(aes(fill = cat), color = \"grey60\", size = 0.12) +\n", - " 
scale_fill_brewer(palette = fill_palette, na.value = \"#f3f4f6\", drop = FALSE) +\n", - " theme_void() +\n", - " labs(\n", - " title = paste0(title_prefix, \" - \", yr),\n", - " fill = ifelse(is_rate, \"Rate class\", \"Value class\")\n", - " ) +\n", - " guides(fill = guide_legend(nrow = 2, byrow = TRUE)) +\n", - " theme(\n", - " legend.position = \"bottom\",\n", - " legend.text = element_text(size = 9),\n", - " legend.title = element_text(size = 10, face = \"bold\"),\n", - " plot.title = element_text(face = \"bold\", size = 13)\n", - " )\n", - "\n", - " plot_list[[as.character(yr)]] <- p\n", - " }\n", - " \n", - " # Display all plots\n", - " if (length(plot_list) > 0) {\n", - " options(repr.plot.width = 10, repr.plot.height = 8)\n", - " for (yr_name in names(plot_list)) {\n", - " print(plot_list[[yr_name]])\n", - " }\n", - " }\n", - "}\n", - "\n", - "# Generate maps for each available indicator\n", - "cat(\"### Testing Rate\\n\")\n", - "if (\"testing_rate\" %in% names(qoc_sf)) {\n", - " plot_yearly_map_report(qoc_sf, \"testing_rate\", \"Testing rate (TEST / SUSP)\", TRUE)\n", - "}\n", - "\n", - "cat(\"\\n### Treatment Rate\\n\")\n", - "if (\"treatment_rate\" %in% names(qoc_sf)) {\n", - " plot_yearly_map_report(qoc_sf, \"treatment_rate\", \"Treatment rate (MALTREAT / CONF)\", TRUE)\n", - "}\n", - "\n", - "cat(\"\\n### Case Fatality Rate\\n\")\n", - "if (\"case_fatality_rate\" %in% names(qoc_sf)) {\n", - " plot_yearly_map_report(qoc_sf, \"case_fatality_rate\", \"In-hospital case fatality rate (MALDTH / MALADM)\", TRUE)\n", - "}\n", - "\n", - "cat(\"\\n### Proportion Admissions Malaria\\n\")\n", - "if (\"prop_adm_malaria\" %in% names(qoc_sf)) {\n", - " plot_yearly_map_report(qoc_sf, \"prop_adm_malaria\", \"Proportion admitted for malaria (MALADM / ALLADM)\", TRUE)\n", - "}\n", - "\n", - "cat(\"\\n### Proportion Malaria Deaths\\n\")\n", - "if (\"prop_malaria_deaths\" %in% names(qoc_sf)) {\n", - " plot_yearly_map_report(qoc_sf, \"prop_malaria_deaths\", \"Proportion of 
malaria deaths (MALDTH / ALLDTH)\", TRUE)\n", - "}\n", - "\n", - "cat(\"\\n### Non-malaria All-cause Outpatients\\n\")\n", - "if (\"non_malaria_all_cause_outpatients\" %in% names(qoc_sf)) {\n", - " plot_yearly_map_report(qoc_sf, \"non_malaria_all_cause_outpatients\", \"Non-malaria all-cause outpatients (ALLOUT)\", FALSE)\n", - "}\n", - "\n", - "cat(\"\\n### Presumed Cases\\n\")\n", - "if (\"presumed_cases\" %in% names(qoc_sf)) {\n", - " plot_yearly_map_report(qoc_sf, \"presumed_cases\", \"Presumed cases (PRES)\", FALSE)\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b31e4c8", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "8229c37e", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "07324c1c", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "7c084da7", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c9f52975", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "006866ce", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "f7225165", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "420ed27f", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "67ddb838", - "metadata": { - "vscode": { - "languageId": "r" - } - }, - "outputs": [], - "source": [] + "cells": [ + { + "cell_type": "markdown", + "id": "7d246ae9", + "metadata": {}, + "source": [ + "## Quality of Care Report\n", + "\n", + "This report displays a compact 
year-level summary of quality-of-care indicators and points to generated map outputs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5eaa5bab", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "ROOT_PATH <- \"~/workspace\"\n", + "CONFIG_PATH <- file.path(ROOT_PATH, \"configuration\")\n", + "CODE_PATH <- file.path(ROOT_PATH, \"code\")\n", + "DATA_PATH <- file.path(ROOT_PATH, \"data\", \"dhis2\", \"quality_of_care\")\n", + "FIGURES_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_quality_of_care\", \"reporting\", \"outputs\", \"figures\")\n", + "\n", + "source(file.path(CODE_PATH, \"snt_utils.r\"))\n", + "install_and_load(c(\"jsonlite\", \"data.table\", \"arrow\", \"dplyr\", \"knitr\", \"glue\", \"reticulate\", \"writexl\", \"ggplot2\", \"scales\", \"gridExtra\", \"sf\"))\n", + "\n", + "# Create output directories\n", + "REPORT_OUTPUTS_PATH <- file.path(ROOT_PATH, \"pipelines\", \"snt_dhis2_quality_of_care\", \"reporting\", \"outputs\")\n", + "dir.create(REPORT_OUTPUTS_PATH, recursive = TRUE, showWarnings = FALSE)\n", + "dir.create(FIGURES_PATH, recursive = TRUE, showWarnings = FALSE)\n", + "\n", + "Sys.setenv(RETICULATE_PYTHON = \"/opt/conda/bin/python\")\n", + "openhexa <- reticulate::import(\"openhexa.sdk\")\n", + "\n", + "config_json <- jsonlite::fromJSON(file.path(CONFIG_PATH, \"SNT_config.json\"))\n", + "COUNTRY_CODE <- config_json$SNT_CONFIG$COUNTRY_CODE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a8320f8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Use district-year output file (latest action)\n", + "files <- list.files(DATA_PATH, pattern = paste0(\"^\", COUNTRY_CODE, \"_quality_of_care_district_year_(imputed|removed)\\\\.parquet$\"), full.names = TRUE)\n", + "if (length(files) == 0) {\n", + " stop(glue::glue(\"No quality_of_care parquet found in {DATA_PATH}\"))\n", + "}\n", + "\n", + "latest_file <- 
files[which.max(file.info(files)$mtime)]\n", + "qoc <- as.data.table(arrow::read_parquet(latest_file))\n", + "\n", + "# Build summary table with only available columns\n", + "# Start with unique YEAR values\n", + "summary_tbl <- unique(qoc[, .(YEAR)])\n", + "\n", + "# Add rate indicators (mean) - merge one by one\n", + "if (\"testing_rate\" %in% names(qoc)) {\n", + " summary_tbl <- merge(summary_tbl, \n", + " qoc[, .(testing_rate = mean(testing_rate, na.rm = TRUE)), by = .(YEAR)], \n", + " by = \"YEAR\", all.x = TRUE)\n", + "}\n", + "if (\"treatment_rate\" %in% names(qoc)) {\n", + " summary_tbl <- merge(summary_tbl, \n", + " qoc[, .(treatment_rate = mean(treatment_rate, na.rm = TRUE)), by = .(YEAR)], \n", + " by = \"YEAR\", all.x = TRUE)\n", + "}\n", + "if (\"case_fatality_rate\" %in% names(qoc)) {\n", + " summary_tbl <- merge(summary_tbl, \n", + " qoc[, .(case_fatality_rate = mean(case_fatality_rate, na.rm = TRUE)), by = .(YEAR)], \n", + " by = \"YEAR\", all.x = TRUE)\n", + "}\n", + "if (\"prop_adm_malaria\" %in% names(qoc)) {\n", + " summary_tbl <- merge(summary_tbl, \n", + " qoc[, .(prop_adm_malaria = mean(prop_adm_malaria, na.rm = TRUE)), by = .(YEAR)], \n", + " by = \"YEAR\", all.x = TRUE)\n", + "}\n", + "if (\"prop_malaria_deaths\" %in% names(qoc)) {\n", + " summary_tbl <- merge(summary_tbl, \n", + " qoc[, .(prop_malaria_deaths = mean(prop_malaria_deaths, na.rm = TRUE)), by = .(YEAR)], \n", + " by = \"YEAR\", all.x = TRUE)\n", + "}\n", + "\n", + "# Add absolute indicators (sum)\n", + "if (\"non_malaria_all_cause_outpatients\" %in% names(qoc)) {\n", + " summary_tbl <- merge(summary_tbl, \n", + " qoc[, .(non_malaria_all_cause_outpatients = sum(non_malaria_all_cause_outpatients, na.rm = TRUE)), by = .(YEAR)], \n", + " by = \"YEAR\", all.x = TRUE)\n", + "}\n", + "if (\"presumed_cases\" %in% names(qoc)) {\n", + " summary_tbl <- merge(summary_tbl, \n", + " qoc[, .(presumed_cases = sum(presumed_cases, na.rm = TRUE)), by = .(YEAR)], \n", + " by = \"YEAR\", all.x = 
TRUE)\n", + "}\n", + "\n", + "summary_tbl <- summary_tbl[order(YEAR)]\n", + "\n", + "# Explicitly list missing indicators so report is self-explanatory\n", + "expected_indicators <- c(\n", + " \"testing_rate\",\n", + " \"treatment_rate\",\n", + " \"case_fatality_rate\",\n", + " \"prop_adm_malaria\",\n", + " \"prop_malaria_deaths\",\n", + " \"non_malaria_all_cause_outpatients\",\n", + " \"presumed_cases\"\n", + ")\n", + "missing_indicators <- setdiff(expected_indicators, names(qoc))\n", + "if (length(missing_indicators) > 0) {\n", + " log_msg(glue::glue(\"[WARNING] Missing indicators in input file: {paste(missing_indicators, collapse=', ')}\"), level = \"warning\")\n", + " cat(glue::glue(\"\\nMissing indicators in this run: {paste(missing_indicators, collapse=', ')}\\n\"))\n", + " cat(\"Reason: required source columns are absent in the selected outliers file.\\n\")\n", + "}\n", + "\n", + "# Save summary data (parquet, csv, xlsx) - following other pipelines pattern\n", + "summary_parquet <- file.path(REPORT_OUTPUTS_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_summary.parquet\"))\n", + "summary_csv <- file.path(REPORT_OUTPUTS_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_summary.csv\"))\n", + "summary_xlsx <- file.path(REPORT_OUTPUTS_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_summary.xlsx\"))\n", + "\n", + "# Save as parquet (primary format, like other pipelines)\n", + "arrow::write_parquet(summary_tbl, summary_parquet)\n", + "\n", + "# Save as csv and xlsx for compatibility\n", + "data.table::fwrite(summary_tbl, summary_csv)\n", + "writexl::write_xlsx(list(summary = as.data.frame(summary_tbl)), summary_xlsx)\n", + "\n", + "log_msg(glue::glue(\"Summary data saved to: {summary_parquet}, {summary_csv}, {summary_xlsx}\"))\n", + "\n", + "knitr::kable(summary_tbl, caption = \"Quality of Care - Year-level summary\")\n", + "\n", + "cat(glue::glue(\"\\nLoaded file: {latest_file}\\n\"))\n", + "cat(glue::glue(\"Map outputs folder: {FIGURES_PATH}\\n\"))\n", + 
"cat(glue::glue(\"Summary data saved to: {summary_parquet}, {summary_csv}, {summary_xlsx}\\n\"))" + ] + }, + { + "cell_type": "markdown", + "id": "3dc318ac", + "metadata": {}, + "source": [ + "## Graphs by Year" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e86bb0a", + "metadata": { + "vscode": { + "languageId": "r" } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "4.4.3" + }, + "outputs": [], + "source": [ + "# Create bar charts by year (same as original notebook - 4x2 grid layout)\n", + "# Prepare data - convert rates to percentages\n", + "plot_data <- copy(summary_tbl)\n", + "\n", + "# Create the same 4x2 subplot layout as original notebook\n", + "if (nrow(plot_data) > 0) {\n", + " # Create a list to store individual plots (in order: 4x2 grid)\n", + " plots_list <- list()\n", + " \n", + " # Row 0, Col 0: Testing rate\n", + " if (\"testing_rate\" %in% names(plot_data)) {\n", + " p <- ggplot(plot_data, aes(x = factor(YEAR), y = testing_rate * 100)) +\n", + " geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n", + " geom_text(aes(label = paste0(round(testing_rate * 100, 1), \"%\")), \n", + " vjust = -0.5, size = 2.5) +\n", + " labs(title = \"Testing rate (TEST / SUSP)\", x = \"Année\", y = \"%\") +\n", + " theme_minimal() +\n", + " theme(\n", + " plot.title = element_text(face = \"bold\", size = 10),\n", + " axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n", + " panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n", + " plot.background = element_rect(fill = \"#fafafa\", color = NA),\n", + " panel.background = element_rect(fill = \"#fafafa\", color = NA),\n", + " plot.margin = margin(5, 5, 5, 5)\n", + " ) +\n", + " 
scale_y_continuous(expand = expansion(mult = c(0, 0.1)))\n",
+ "        plots_list[[\"testing_rate\"]] <- p\n",
+ "    }\n",
+ "\n",
+ "    # Every remaining chart shares one layout, so a single helper builds them instead of\n",
+ "    # one copy-pasted ggplot chain per indicator.\n",
+ "\n",
+ "    # Compact count labels: \"0\" for NA/zero, \"x.xxM\" from one million up, otherwise an\n",
+ "    # integer with thousands separated by a space. trim = TRUE keeps format() from padding\n",
+ "    # every label with leading blanks up to the width of the widest one.\n",
+ "    format_label <- function(v) {\n",
+ "        ifelse(is.na(v) | v == 0, \"0\",\n",
+ "            ifelse(v >= 1e6, paste0(round(v / 1e6, 2), \"M\"),\n",
+ "                format(round(v), big.mark = \" \", scientific = FALSE, trim = TRUE)\n",
+ "            )\n",
+ "        )\n",
+ "    }\n",
+ "\n",
+ "    # Build one yearly bar chart from plot_data.\n",
+ "    #   col     : name of the indicator column in plot_data\n",
+ "    #   title   : chart title\n",
+ "    #   is_rate : TRUE  -> value is multiplied by 100 and labelled as a percentage\n",
+ "    #             FALSE -> value is an absolute count labelled with format_label()\n",
+ "    # Returns a ggplot object.\n",
+ "    build_year_bar <- function(col, title, is_rate = TRUE) {\n",
+ "        raw <- plot_data[[col]]\n",
+ "        bars <- data.frame(year = factor(plot_data$YEAR))\n",
+ "        if (is_rate) {\n",
+ "            bars$value <- raw * 100\n",
+ "            bars$label <- paste0(round(raw * 100, 1), \"%\")\n",
+ "            y_title <- \"%\"\n",
+ "            y_scale <- scale_y_continuous(expand = expansion(mult = c(0, 0.1)))\n",
+ "        } else {\n",
+ "            bars$value <- raw\n",
+ "            bars$label <- format_label(raw)\n",
+ "            y_title <- \"Nombre\"\n",
+ "            y_scale <- scale_y_continuous(labels = scales::comma, expand = expansion(mult = c(0, 0.1)))\n",
+ "        }\n",
+ "\n",
+ "        ggplot(bars, aes(x = year, y = value)) +\n",
+ "            geom_bar(stat = \"identity\", fill = \"#2563eb\", color = \"#1e40af\", width = 0.7) +\n",
+ "            geom_text(aes(label = label), vjust = -0.5, size = 2.5) +\n",
+ "            labs(title = title, x = \"Année\", y = y_title) +\n",
+ "            theme_minimal() +\n",
+ "            theme(\n",
+ "                plot.title = element_text(face = \"bold\", size = 10),\n",
+ "                axis.text.x = element_text(angle = 45, hjust = 1, size = 9),\n",
+ "                panel.grid.major.y = element_line(linetype = \"dashed\", color = scales::alpha(\"grey\", 0.7)),\n",
+ "                plot.background = element_rect(fill = \"#fafafa\", color = NA),\n",
+ "                panel.background = element_rect(fill = \"#fafafa\", color = NA),\n",
+ "                plot.margin = margin(5, 5, 5, 5)\n",
+ "            ) +\n",
+ "            y_scale\n",
+ "    }\n",
+ "\n",
+ "    # One entry per chart; a chart is only built when its column exists in plot_data.\n",
+ "    bar_specs <- list(\n",
+ "        # Row 0, Col 1: Treatment rate\n",
+ "        list(col = \"treatment_rate\", title = \"Treatment rate (MALTREAT / CONF)\", is_rate = TRUE),\n",
+ "        # Row 1, Col 0: Case fatality rate\n",
+ "        list(col = \"case_fatality_rate\", title = \"Case fatality rate (MALDTH / MALADM)\", is_rate = TRUE),\n",
+ "        # Row 1, Col 1: Proportion admissions malaria\n",
+ "        list(col = \"prop_adm_malaria\", title = \"Prop. admissions paludisme (MALADM / ALLADM)\", is_rate = TRUE),\n",
+ "        # Row 2, Col 0: Proportion deaths malaria\n",
+ "        list(col = \"prop_malaria_deaths\", title = \"Prop. décès paludisme (MALDTH / ALLDTH)\", is_rate = TRUE),\n",
+ "        # Row 2, Col 1: Presumed cases (absolute)\n",
+ "        list(col = \"presumed_cases\", title = \"Cas présumés (PRES)\", is_rate = FALSE),\n",
+ "        # Row 3, Col 0: Non-malaria all-cause outpatients (absolute)\n",
+ "        list(col = \"non_malaria_all_cause_outpatients\", title = \"Consultations externes non-paludisme (ALLOUT)\", is_rate = FALSE)\n",
+ "    )\n",
+ "    for (spec in bar_specs) {\n",
+ "        if (spec$col %in% names(plot_data)) {\n",
+ "            plots_list[[spec$col]] <- build_year_bar(spec$col, spec$title, spec$is_rate)\n",
+ "        }\n",
+ "    }\n",
+ "\n",
+ "    # Create and display combined plot (dynamic grid for readability)\n",
+ "    if (length(plots_list) > 0) {\n",
+ "        # Fixed display order, independent of the order in which the charts were built\n",
+ "        plot_order <- c(\"testing_rate\", \"treatment_rate\", \"case_fatality_rate\", \"prop_adm_malaria\", \n",
+ "                        \"prop_malaria_deaths\", \"presumed_cases\", \"non_malaria_all_cause_outpatients\")\n",
+ "        available_plots <- plots_list[intersect(plot_order, names(plots_list))]\n",
+ "\n",
+ "        if (length(available_plots) > 0) {\n",
+ "            # Two charts per row; the row count follows from how many charts exist\n",
+ "            n_plots <- length(available_plots)\n",
+ "            ncol_layout <- 2\n",
+ "            nrow_layout <-
ceiling(n_plots / ncol_layout)\n",
+ "\n",
+ "            # Bigger display in report so labels are readable\n",
+ "            options(repr.plot.width = 14, repr.plot.height = max(7, 4.8 * nrow_layout))\n",
+ "\n",
+ "            # grid.arrange() draws the grid on the active device itself and returns the gtable.\n",
+ "            # print() on a gtable only writes its text layout into the report, so the object is\n",
+ "            # kept for ggsave() below and not printed.\n",
+ "            combined_plot <- do.call(grid.arrange, c(available_plots, ncol = ncol_layout, nrow = nrow_layout))\n",
+ "\n",
+ "            # Save at larger size for presentation readability\n",
+ "            combined_file <- file.path(FIGURES_PATH, glue::glue(\"{COUNTRY_CODE}_quality_of_care_by_year.png\"))\n",
+ "            ggsave(\n",
+ "                combined_file,\n",
+ "                plot = combined_plot,\n",
+ "                width = 18,\n",
+ "                height = max(8, 5.2 * nrow_layout),\n",
+ "                dpi = 300,\n",
+ "                bg = \"white\",\n",
+ "                units = \"in\"\n",
+ "            )\n",
+ "            log_msg(glue::glue(\"Combined bar charts saved: {combined_file}\"))\n",
+ "        }\n",
+ "    }\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3b625d36",
+ "metadata": {},
+ "source": [
+ "## Maps by District and Year\n",
+ "\n",
+ "Maps are generated directly from the quality-of-care data and district shapes."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6056a979",
+ "metadata": {
+ "vscode": {
+ "languageId": "r"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Load shapes geojson from dataset (like seasonality pipeline)\n",
+ "DHIS2_FORMATTED_DATASET <- config_json$SNT_DATASET_IDENTIFIERS$DHIS2_DATASET_FORMATTED\n",
+ "\n",
+ "# A missing shapes file is fatal for this cell: the error is logged, then re-raised.\n",
+ "shapes <- tryCatch({\n",
+ "    get_latest_dataset_file_in_memory(DHIS2_FORMATTED_DATASET, paste0(COUNTRY_CODE, \"_shapes.geojson\"))\n",
+ "}, error = function(e) {\n",
+ "    msg <- paste0(\"Error while loading DHIS2 Shapes data for: \", COUNTRY_CODE, \". \", conditionMessage(e))\n",
+ "    log_msg(msg, level = \"error\")\n",
+ "    stop(msg)\n",
+ "})\n",
+ "\n",
+ "# Ensure ADM2_ID is character in both datasets\n",
+ "shapes$ADM2_ID <- as.character(shapes$ADM2_ID)\n",
+ "qoc$ADM2_ID <- as.character(qoc$ADM2_ID)\n",
+ "\n",
+ "# Merge shapes with quality-of-care data (districts without data keep NA indicator values)\n",
+ "qoc_sf <- shapes %>%\n",
+ "    dplyr::left_join(qoc, by = \"ADM2_ID\")\n",
+ "\n",
+ "# Build one \"lower - upper\" legend label per interval between consecutive breaks.\n",
+ "#   breaks_vec : numeric vector of class breaks\n",
+ "# Returns a character vector with length(breaks_vec) - 1 labels, or NULL when fewer than\n",
+ "# two breaks are given (there is no interval to label).\n",
+ "format_interval_labels <- function(breaks_vec) {\n",
+ "    n_intervals <- length(breaks_vec) - 1\n",
+ "    if (n_intervals < 1) {\n",
+ "        return(NULL)\n",
+ "    }\n",
+ "    # Bounds are rounded and formatted one at a time, as before, so the labels stay unchanged\n",
+ "    vapply(seq_len(n_intervals), function(i) {\n",
+ "        paste0(scales::comma(round(breaks_vec[i])), \" - \", scales::comma(round(breaks_vec[i + 1])))\n",
+ "    }, character(1))\n",
+ "}\n",
+ "\n",
+ "# Print one choropleth map per year for a single indicator.\n",
+ "#   sf_data      : sf object holding ADM2_ID, YEAR, geometry and the indicator column\n",
+ "#   value_col    : name of the indicator column to map\n",
+ "#   title_prefix : text placed before \" - <year>\" in each map title\n",
+ "#   is_rate      : TRUE  -> fixed 0.2-wide classes plus \"<0\" and \">1.0\" (palette YlOrRd)\n",
+ "#                  FALSE -> up to 5 quantile classes computed per year (palette Blues)\n",
+ "# Returns NULL invisibly when the column or all years are missing; otherwise the maps are\n",
+ "# printed as a side effect. Years without any finite value are skipped.\n",
+ "plot_yearly_map_report <- function(sf_data, value_col, title_prefix, is_rate = TRUE) {\n",
+ "    if (!(value_col %in% names(sf_data))) {\n",
+ "        log_msg(glue::glue(\"[WARNING] Column '{value_col}' not found. Skipping map generation.\"), level = \"warning\")\n",
+ "        return(invisible(NULL))\n",
+ "    }\n",
+ "\n",
+ "    years <- sort(unique(sf_data$YEAR[!is.na(sf_data$YEAR)]))\n",
+ "    if (length(years) == 0) {\n",
+ "        log_msg(glue::glue(\"[WARNING] No valid years for '{value_col}'. Skipping map.\"), level = \"warning\")\n",
+ "        return(invisible(NULL))\n",
+ "    }\n",
+ "\n",
+ "    rate_levels <- c(\"<0\", \"0-0.2\", \"0.2-0.4\", \"0.4-0.6\", \"0.6-0.8\", \"0.8-1.0\", \">1.0\")\n",
+ "\n",
+ "    # One geometry per district, and a plain attribute table built once for the yearly lookups\n",
+ "    plot_list <- list()\n",
+ "    base_shapes <- sf_data %>% dplyr::select(ADM2_ID, geometry) %>% dplyr::distinct()\n",
+ "    attr_tbl <- sf::st_drop_geometry(sf_data)\n",
+ "\n",
+ "    for (yr in years) {\n",
+ "        # Keep all districts on map, then join year values.\n",
+ "        # which() drops rows whose YEAR is NA (districts without data) instead of indexing with NA.\n",
+ "        year_vals <- attr_tbl[which(attr_tbl$YEAR == yr), c(\"ADM2_ID\", value_col), drop = FALSE]\n",
+ "        year_vals <- year_vals[!duplicated(year_vals$ADM2_ID), , drop = FALSE]\n",
+ "        sf_y <- dplyr::left_join(base_shapes, year_vals, by = \"ADM2_ID\")\n",
+ "\n",
+ "        vals <- sf_y[[value_col]]\n",
+ "        finite_vals <- vals[is.finite(vals) & !is.na(vals)]\n",
+ "\n",
+ "        if (length(finite_vals) == 0) {\n",
+ "            next\n",
+ "        }\n",
+ "\n",
+ "        # Create categories\n",
+ "        if (is_rate) {\n",
+ "            # Classes inside [0, 1] are right-closed with the lowest bound included, so a rate\n",
+ "            # of exactly 0 lands in \"0-0.2\" (it used to be labelled \"<0\") and exactly 1 in\n",
+ "            # \"0.8-1.0\". Out-of-range values get their own class explicitly.\n",
+ "            rate_class <- as.character(cut(\n",
+ "                vals,\n",
+ "                breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1.0),\n",
+ "                labels = rate_levels[2:6],\n",
+ "                include.lowest = TRUE\n",
+ "            ))\n",
+ "            rate_class[!is.na(vals) & vals < 0] <- \"<0\"\n",
+ "            rate_class[!is.na(vals) & vals > 1] <- \">1.0\"\n",
+ "            cat_vals <- factor(rate_class, levels = rate_levels)\n",
+ "            fill_palette <- \"YlOrRd\"\n",
+ "        } else {\n",
+ "            # Use readable fixed-count classes for absolute values\n",
+ "            n_classes <- 5\n",
+ "            br <- unique(as.numeric(quantile(finite_vals, probs = seq(0, 1, length.out = n_classes + 1), na.rm = TRUE)))\n",
+ "            br <- sort(br)\n",
+ "            if (length(br) < 2) {\n",
+ "                # All districts share one value: widen to a single one-unit class so cut() has\n",
+ "                # two distinct breaks.\n",
+ "                br <- c(min(finite_vals, na.rm = TRUE), max(finite_vals, na.rm = TRUE) + 1)\n",
+ "            }\n",
+ "            labels_abs <- format_interval_labels(br)\n",
+ "            cat_vals <- cut(vals, breaks = br, include.lowest = TRUE, labels = labels_abs)\n",
+ "            fill_palette <- \"Blues\"\n",
+ "        }\n",
+ "\n",
+ "        sf_y$cat <- as.factor(cat_vals)\n",
+ "\n",
+ "        # NOTE(review): recent ggplot2 releases use `linewidth` for polygon outlines; confirm that\n",
+ "        # `size = 0.12` still gives the intended border width with the installed version.\n",
+ "        p <- ggplot(sf_y) +\n",
+ "            geom_sf(aes(fill = cat), color = \"grey60\", size = 0.12) +\n",
+ "            scale_fill_brewer(palette = fill_palette, na.value = \"#f3f4f6\", drop = FALSE) +\n",
+ "            theme_void() +\n",
+ "            labs(\n",
+ "                title = paste0(title_prefix, \" - \", yr),\n",
+ "                fill = ifelse(is_rate, \"Rate class\", \"Value class\")\n",
+ "            ) +\n",
+ "            guides(fill = guide_legend(nrow = 2, byrow = TRUE)) +\n",
+ "            theme(\n",
+ "                legend.position = \"bottom\",\n",
+ "                legend.text = element_text(size = 9),\n",
+ "                legend.title = element_text(size = 10, face = \"bold\"),\n",
+ "                plot.title = element_text(face = \"bold\", size = 13)\n",
+ "            )\n",
+ "\n",
+ "        plot_list[[as.character(yr)]] <- p\n",
+ "    }\n",
+ "\n",
+ "    # Display all plots\n",
+ "    if (length(plot_list) > 0) {\n",
+ "        options(repr.plot.width = 10, repr.plot.height = 8)\n",
+ "        for (yr_name in names(plot_list)) {\n",
+ "            print(plot_list[[yr_name]])\n",
+ "        }\n",
+ "    }\n",
+ "}\n",
+ "\n",
+ "# Generate maps for each available indicator\n",
+ "cat(\"### Testing Rate\\n\")\n",
+ "if (\"testing_rate\" %in% names(qoc_sf)) {\n",
+ "    plot_yearly_map_report(qoc_sf, \"testing_rate\", \"Testing rate (TEST / SUSP)\", TRUE)\n",
+ "}\n",
+ "\n",
+ "cat(\"\\n### Treatment Rate\\n\")\n",
+ "if (\"treatment_rate\" %in% names(qoc_sf)) {\n",
+ "    plot_yearly_map_report(qoc_sf, \"treatment_rate\", \"Treatment rate (MALTREAT / CONF)\", TRUE)\n",
+ "}\n",
+ "\n",
+ "cat(\"\\n### Case Fatality Rate\\n\")\n",
+ "if (\"case_fatality_rate\" %in% names(qoc_sf)) {\n",
+ "    plot_yearly_map_report(qoc_sf, \"case_fatality_rate\", \"In-hospital case fatality rate (MALDTH / MALADM)\", TRUE)\n",
+ "}\n",
+ "\n",
+ "cat(\"\\n### Proportion Admissions Malaria\\n\")\n",
+ "if (\"prop_adm_malaria\" %in% names(qoc_sf)) {\n",
+ "    plot_yearly_map_report(qoc_sf, \"prop_adm_malaria\", \"Proportion admitted for malaria (MALADM / ALLADM)\", TRUE)\n",
+ "}\n",
+ "\n",
+ "cat(\"\\n### Proportion Malaria Deaths\\n\")\n",
+ "if (\"prop_malaria_deaths\" %in% names(qoc_sf)) {\n",
+ "    plot_yearly_map_report(qoc_sf, \"prop_malaria_deaths\", \"Proportion of
malaria deaths (MALDTH / ALLDTH)\", TRUE)\n", + "}\n", + "\n", + "cat(\"\\n### Non-malaria All-cause Outpatients\\n\")\n", + "if (\"non_malaria_all_cause_outpatients\" %in% names(qoc_sf)) {\n", + " plot_yearly_map_report(qoc_sf, \"non_malaria_all_cause_outpatients\", \"Non-malaria all-cause outpatients (ALLOUT)\", FALSE)\n", + "}\n", + "\n", + "cat(\"\\n### Presumed Cases\\n\")\n", + "if (\"presumed_cases\" %in% names(qoc_sf)) {\n", + " plot_yearly_map_report(qoc_sf, \"presumed_cases\", \"Presumed cases (PRES)\", FALSE)\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b31e4c8", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "8229c37e", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07324c1c", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "7c084da7", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9f52975", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "006866ce", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "f7225165", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "420ed27f", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67ddb838", + "metadata": { + "vscode": { + "languageId": "r" } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" }, - "nbformat": 4, - "nbformat_minor": 5 + "language_info": { + 
"codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/snt_dhis2_quality_of_care/pipeline.py b/snt_dhis2_quality_of_care/pipeline.py index b20f805..5c066ae 100644 --- a/snt_dhis2_quality_of_care/pipeline.py +++ b/snt_dhis2_quality_of_care/pipeline.py @@ -86,8 +86,8 @@ def snt_dhis2_quality_of_care( ) files_to_dataset = [ - data_path / f"{country_code}_quality_of_care_district_year_imputed.parquet", - data_path / f"{country_code}_quality_of_care_district_year_imputed.csv", + data_path / f"{country_code}_quality_of_care_district_year_{data_action}.parquet", + data_path / f"{country_code}_quality_of_care_district_year_{data_action}.csv", parameters_file, ] existing_files = [f for f in files_to_dataset if f.exists()] @@ -95,11 +95,14 @@ def snt_dhis2_quality_of_care( for missing in missing_files: current_run.log_warning(f"Output file not found, skipped for dataset upload: {missing}") - add_files_to_dataset( - dataset_id=snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_QUALITY_OF_CARE"], - country_code=country_code, - file_paths=existing_files, - ) + if existing_files: + add_files_to_dataset( + dataset_id=snt_config["SNT_DATASET_IDENTIFIERS"]["DHIS2_QUALITY_OF_CARE"], + country_code=country_code, + file_paths=existing_files, + ) + else: + current_run.log_warning("No output files found for dataset upload.") else: current_run.log_info("Skipping computations, running only reporting notebook.")