Source code for medicaid_utils.adapted_algorithms.py_nyu_billings.billings_ed

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

""" Python implementation of The New York University (NYU) Emergency Department (ED) visit algorithm.
    This algorithm is the most widely used tool for retrospectively assessing the probability
    that ED visits are urgent, preventable, or optimally treated in an ED,
    using administrative data (Billings, Parikh, and Mijanovich 2000b; Feldman 2010).
"""
__author__ = "Manoradhan Murugesan"
__email__ = "manorathan@uchicago.edu"

import re
import os
from ast import literal_eval
from typing import Tuple

import pandas as pd
import dask.dataframe as dd



[docs]
class BillingsED:
    """Classify ED visit diagnosis codes with the NYU/Billings algorithm.

    All lookup tables are class attributes that are built while the class
    body executes (i.e. when the module is imported) from files located in
    the ``data`` folder next to this module:

    * ``recode.csv`` -> ``pdf_recode``, ``pdf_startswith_recode``,
      ``pdf_nonstartswith_recode`` and ``dct_recode_non_startswith``
    * ``<category>_check.csv`` -> ``dct_category_dx_codes``
    * ``eddxs.sas7bdat`` -> ``df_eddxs``
    """

    logger_name = __name__
    # The lookup files are resolved relative to this module's own location.
    package_folder, filename = os.path.split(__file__)
    data_folder = os.path.join(package_folder, "data")

    # Recoding diagnosis code lookups
    # Load recode csv file that has information for
    # recoding principal diagnosis
    pdf_recode = pd.read_csv(
        os.path.join(data_folder, "recode.csv"),
        dtype={
            "origin": "str",
            "target": "str",
            "comments": "str",
            "updates": "str",
            "deleted": "int",
            "starts_with": "int",
        },
    )

    # Drop the rules flagged as deleted.
    pdf_recode = pdf_recode.loc[
        pdf_recode["deleted"] != 1,
    ]
    # ``origin`` is stored as text holding a Python literal; parse it into
    # the corresponding object (presumably a list of codes, since it is
    # exploded below -- confirm against recode.csv).
    pdf_recode = pdf_recode.assign(
        origin=pdf_recode.origin.apply(literal_eval)
    )

    # Prefix rules (starts_with == 1): one row per origin value, keeping the
    # first rule when the same origin appears more than once.
    pdf_startswith_recode = pdf_recode.loc[pdf_recode["starts_with"] == 1]
    pdf_startswith_recode = (
        pdf_startswith_recode[["origin", "target"]]
        .explode("origin")
        .drop_duplicates(["origin"], keep="first")
    )

    # Exact-match rules (starts_with == 0): same flattening and
    # de-duplication, then turned into an ``origin -> target`` dictionary.
    pdf_nonstartswith_recode = pdf_recode.loc[pdf_recode["starts_with"] == 0][
        ["origin", "target"]
    ].explode("origin")
    pdf_nonstartswith_recode = pdf_nonstartswith_recode.drop_duplicates(
        ["origin"], keep="first"
    )
    dct_recode_non_startswith = pdf_nonstartswith_recode.set_index(
        "origin"
    ).to_dict()["target"]

    # Special categories lookups
    lst_special_ed_categories = ["acs", "psych", "drug", "alcohol", "injury"]
    # Load category data files to a dictionary that maps each category name
    # to the list of normalised codes read from "<category>_check.csv".
    # NOTE: the loop variable ``cat`` stays bound in the class namespace after
    # the loop, because this is a plain ``for`` statement in a class body.
    dct_category_dx_codes = {}
    for cat in lst_special_ed_categories:
        dct_category_dx_codes[cat] = (
            pd.read_csv(
                os.path.join(data_folder, cat + "_check.csv"),
                dtype={
                    "start_index": "int",
                    "end_index": "float",
                    "code": "str",
                    "comments": "str",
                },
            )
            # Per row: keep the characters up to and including position
            # ``end_index`` (the whole code when ``end_index`` is missing),
            # upper-case the result and strip every non-alphanumeric
            # character. ``start_index`` is read but not used here.
            .apply(
                lambda row: re.sub(
                    r"[^a-zA-Z0-9]+",
                    "",
                    row.code[
                        : (int(row.end_index) + 1)
                        if pd.notnull(row.end_index)
                        else len(row.code)
                    ].upper(),
                ),
                axis=1,
            )
            .tolist()
        )

    # Probabilities lookup
    df_eddxs = pd.read_sas(os.path.join(data_folder, "eddxs.sas7bdat"))
    # ``prindx`` is decoded from bytes, then normalised the same way as the
    # category codes above: trimmed, upper-cased, non-alphanumerics removed.
    df_eddxs = df_eddxs.assign(
        prindx=df_eddxs.prindx.str.decode("utf-8")
        .str.strip()
        .str.upper()
        .str.replace("[^a-zA-Z0-9]+", "", regex=True)
    )
    # Keep a single row per normalised code (the first one encountered).
    df_eddxs = df_eddxs.drop_duplicates(["prindx"], keep="first")


[docs]
    @classmethod
    def recode_diag_code(cls, dx_code: str) -> str:
        """
        Recodes diagnosis code

        Exact-match rules (``dct_recode_non_startswith``) take precedence.
        When no exact rule exists, the first row of ``pdf_startswith_recode``
        whose ``origin`` is a prefix of ``dx_code`` supplies the target.
        A code that matches neither kind of rule is returned unchanged.

        Parameters
        ----------
        dx_code : str
            Diagnosis code

        Returns
        -------
        str
            Recoded diagnosis code.

        Examples
        --------
        >>> BillingsED.recode_diag_code('4659')  # doctest: +SKIP
        '4659'

        """
        exact_rules = cls.dct_recode_non_startswith
        # A membership test (rather than ``dict.get(key, <fallback>)``) keeps
        # the prefix scan below from running when an exact rule already
        # answers the lookup: ``get`` evaluates its default eagerly.
        if dx_code in exact_rules:
            return exact_rules[dx_code]

        prefix_rules = cls.pdf_startswith_recode
        # Walk the two columns in row order; the first matching prefix wins.
        for prefix, target in zip(
            prefix_rules["origin"], prefix_rules["target"]
        ):
            if dx_code.startswith(prefix):
                return target
        return dx_code



[docs]
    @classmethod
    def get_special_categories(cls, dx_code: str) -> dict:
        """
        Flag which special ED categories the diagnosis code belongs to.

        A category is flagged when ``dx_code`` starts with any of the codes
        held for that category in ``dct_category_dx_codes``.

        ===============   ==============================================
        category_code     description
        ===============   ==============================================
          injury          Injury
          psych           Mental Health Related
          alcohol         Alcohol Related
          drug            Drug Related (excluding alcohol)
          acs             Code listed in the ``acs`` lookup
        ===============   ==============================================

        Parameters
        ----------
        dx_code : str
            Diagnosis code

        Returns
        -------
        dict
            Dictionary with keys 'acs', 'psych', 'drug', 'alcohol', 'injury'
            (in the order of ``lst_special_ed_categories``) and integer
            values (0 or 1).

        Examples
        --------
        >>> BillingsED.get_special_categories('E8600')  # doctest: +SKIP
        {'acs': 0, 'psych': 0, 'drug': 0, 'alcohol': 1, 'injury': 0}

        """
        category_flags = {}
        for category in cls.lst_special_ed_categories:
            # ``str.startswith`` accepts a tuple of candidate prefixes.
            category_prefixes = tuple(cls.dct_category_dx_codes[category])
            category_flags[category] = (
                1 if dx_code.startswith(category_prefixes) else 0
            )
        return category_flags



[docs]
    @classmethod
    def get_nyu_ed_proba_for_dx_code(cls, dx_code: str) -> dict:
        """
        Look up the EDDXS probabilities attached to a diagnosis code.

        ============   =====================================================
        ed_category    description
        ============   =====================================================
        nonemerg       Non-Emergent
        emergpc        Emergent, Primary Care Treatable
        emedpa         Emergent, ED Care Needed, Preventable/Avoidable
        emednpa        Emergent, ED Care Needed, Not Preventable/Avoidable
        ============   =====================================================

        The ``prindx``-keyed lookup is built from ``df_eddxs`` on first use
        and cached on the class, instead of being rebuilt for every code.
        The cache is tied to the identity of ``df_eddxs``, so rebinding that
        attribute to another dataframe triggers a rebuild.

        Parameters
        ----------
        dx_code : str
            Diagnosis code

        Returns
        -------
        dict
            Dictionary mapping every non-``prindx`` column of ``df_eddxs``
            to its value for the matching row (``get_nyu_ed_categories``
            reads 'nonemerg', 'emergpc', 'emedpa' and 'emednpa'), or an
            empty dict if the code is not found. A fresh dict is returned on
            every call, so callers may modify it freely.

        Examples
        --------
        >>> BillingsED.get_nyu_ed_proba_for_dx_code('4659')  # doctest: +SKIP

        """
        df_eddxs = cls.df_eddxs
        cached = getattr(cls, "_eddxs_lookup_cache", None)
        if cached is None or cached[0] is not df_eddxs:
            # Build the {prindx: {column: value}} mapping once per dataframe.
            cached = (
                df_eddxs,
                df_eddxs.set_index("prindx").to_dict(orient="index"),
            )
            cls._eddxs_lookup_cache = cached

        probabilities = cached[1].get(dx_code)
        if probabilities is None:
            return {}
        # Hand out a copy so the cached row cannot be altered by callers.
        return dict(probabilities)



[docs]
    @classmethod
    def get_nyu_ed_categories(
        cls, dx_code: str
    ) -> Tuple[int, int, int, int, int, int, float, float, float, float]:
        """
        Returns the NYU ED category flags and probabilities for a diagnosis code

        The pediatric ACS check is made on the code as passed in; every other
        lookup uses the recoded diagnosis code. Whenever the code falls in
        one of the injury, drug, psych or alcohol categories, all four
        probabilities are multiplied by zero.

        Parameters
        ----------
        dx_code : str
            Diagnosis code

        Returns
        -------
        unclassified : int
            The code did not meet any NYU/Billings category
        injury : int
            Injury
        drug : int
            Drug Related (excluding alcohol)
        psych : int
            Mental Health Related
        alcohol : int
            Alcohol Related
        peds_acs_ed : int
            Pediatric ambulatory care sensitive ED visit. This is the larger
            of the pediatric ACS flag and ``ne + epct + edcnpa``, so it can
            be a float.
        ne : float
            Non-Emergent probability
        epct : float
            Emergent, Primary Care Treatable probability
        edcnpa : float
            Emergent, ED Care Needed, Preventable/Avoidable
        edcnnpa : float
            Emergent, ED Care Needed, Not Preventable/Avoidable

        Examples
        --------
        >>> BillingsED.get_nyu_ed_categories('4659')  # doctest: +SKIP

        """
        peds_flag = cls.is_peds_acsed(dx_code)
        recoded_code = cls.recode_diag_code(dx_code)
        special = cls.get_special_categories(recoded_code)
        proba = cls.get_nyu_ed_proba_for_dx_code(recoded_code)

        in_special_category = any(
            special[name] for name in ["injury", "drug", "psych", "alcohol"]
        )
        # 1 when none of the special categories applies, 0 otherwise; used
        # as a multiplier that blanks out the probabilities.
        proba_gate = int(not in_special_category)

        # Unclassified: no special category and no probabilities on file.
        unclassified = int(not (in_special_category or bool(proba)))

        ne = proba_gate * proba.get("nonemerg", 0)
        epct = proba_gate * proba.get("emergpc", 0)

        # The combined "ED care needed" mass is assigned wholly to one of
        # the two ED-care-needed outputs, depending on the acs flag.
        ed_care_needed = sum(
            [
                proba.get("emedpa", 0),
                proba.get("emednpa", 0),
            ]
        )
        edcnpa = proba_gate * (special["acs"] * ed_care_needed)
        edcnnpa = proba_gate * (int(special["acs"] != 1) * ed_care_needed)

        peds_flag = max(peds_flag, sum([ne, epct, edcnpa]))

        return (
            unclassified,
            special["injury"],
            special["drug"],
            special["psych"],
            special["alcohol"],
            peds_flag,
            ne,
            epct,
            edcnpa,
            edcnnpa,
        )



[docs]
    @classmethod
    def is_peds_acsed(cls, dx_code: str) -> int:
        """
        Checks if the diagnosis code meets pediatric ACS ED visit criteria

        Parameters
        ----------
        dx_code : str
            Diagnosis code

        Returns
        -------
        int
            1 if the code meets pediatric ACS ED criteria, 0 otherwise.

        Examples
        --------
        >>> BillingsED.is_peds_acsed('493')  # doctest: +SKIP
        1
        >>> BillingsED.is_peds_acsed('999')  # doctest: +SKIP
        0

        """
        return int(
            dx_code.startswith(
                (
                    "493",
                    "079",
                    "480",
                    "487",
                    "780",
                    "381",
                    "382",
                    "384",
                    "385",
                    "471",
                    "472",
                    "477",
                    "690",
                    "691",
                    "692",
                    "693",
                    "695",
                    "V03",
                    "V04",
                    "V05",
                    "V06",
                    "V07",
                    "V20",
                    "V67",
                    "V68",
                    "V69",
                    "V70",
                    "840",
                    "841",
                    "842",
                    "843",
                    "844",
                    "845",
                    "846",
                    "847",
                    "848",
                    "910",
                    "911",
                    "913",
                    "914",
                    "915",
                    "916",
                    "917",
                    "918",
                    "919",
                    "923",
                    "924",
                    "955",
                    "956",
                )
            )
        )





[docs]
def get_nyu_ed_proba(
    df: dd.DataFrame, date_col: str, index_col: str, cms_format: str = "MAX"
) -> pd.DataFrame:
    """
    Count, for each ``index_col`` value, the ED visits falling in each
    NYU/Billings visit class (Billings, Parikh, and Mijanovich 2000b;
    Feldman 2010).

    Processing steps:

    1. The principal diagnosis code of every row (``DIAG_CD_1`` when
       ``cms_format`` is 'MAX', ``DGNS_CD_1`` for any other value) is passed
       to ``BillingsED.get_nyu_ed_categories`` and the ten resulting values
       are appended as columns.
    2. Within each partition, rows are collapsed to one row per
       (``index_col``, ``date_col``) pair: maximum of 'injury', 'drug',
       'psych', 'alcohol' and 'adult', mean of the four probabilities,
       minimum of 'peds_acs_ed'.
    3. Visit-level indicators are derived from the collapsed values with
       the 0.75 / 0.5 cut-offs used below.
    4. The indicators are summed per ``index_col`` and the result is
       computed.

    NOTE: step 2 runs partition by partition, so rows of one
    (``index_col``, ``date_col``) pair that sit in different partitions are
    collapsed separately.

    Parameters
    ----------
    df : dask.DataFrame
        dataframe with dx codes. Besides the principal diagnosis column it
        must provide ``index_col``, ``date_col`` and an 'adult' column.
    date_col : str
        Date column name
    index_col : str
        Index column name
    cms_format : {'MAX', 'TAF'}
        CMS file format

    Returns
    -------
    pd.DataFrame
        One row per ``index_col`` value with the summed columns 'injury',
        'drug', 'psych', 'alcohol', 'peds_acs_ed_visit',
        'non_emergent_visit', 'emergent_visit',
        'non_emergent_or_pct_visit' and 'indeterminate_ed_visit'.

    Examples
    --------
    >>> # Requires a dask DataFrame with ED claims and diagnosis columns
    >>> pdf = get_nyu_ed_proba(df, 'srvc_bgn_date', 'MSIS_ID')  # doctest: +SKIP

    """
    if cms_format == "MAX":
        principal_diag_col_name = "DIAG_CD_1"
    else:
        principal_diag_col_name = "DGNS_CD_1"

    special_cols = ["injury", "drug", "psych", "alcohol"]
    proba_cols = ["ne", "epct", "edcnpa", "edcnnpa"]
    # Same order as the tuple returned by BillingsED.get_nyu_ed_categories.
    category_cols = (
        ["unclassified"] + special_cols + ["peds_acs_ed"] + proba_cols
    )

    def _append_categories(pdf):
        # Classify every principal diagnosis code of the partition and glue
        # the results next to the original columns.
        category_rows = (
            pdf[principal_diag_col_name]
            .apply(BillingsED.get_nyu_ed_categories)
            .tolist()
        )
        pdf_categories = pd.DataFrame(
            category_rows, columns=category_cols, index=pdf.index
        )
        return pd.concat([pdf, pdf_categories], axis=1)

    classified_meta = df._meta.assign(**{col: 0.0 for col in category_cols})
    df = df.map_partitions(_append_categories, meta=classified_meta)

    # Column order of the collapsed frame follows the insertion order here.
    visit_aggs = {col: "max" for col in special_cols + ["adult"]}
    visit_aggs.update({col: "mean" for col in proba_cols})
    visit_aggs["peds_acs_ed"] = "min"

    visit_meta = pd.DataFrame(
        {col: pd.Series(dtype="float64") for col in visit_aggs},
        index=pd.MultiIndex.from_tuples([], names=[index_col, date_col]),
    )

    def _collapse_to_visits(pdf):
        return pdf.groupby([index_col, date_col]).agg(visit_aggs)

    df = df.map_partitions(_collapse_to_visits, meta=visit_meta)

    non_emergent_proba = df[["ne", "epct"]].sum(axis=1)
    emergent_proba = df[["edcnpa", "edcnnpa"]].sum(axis=1)
    non_emergent_or_pct_proba = df[["ne", "epct", "edcnpa"]].sum(axis=1)
    is_peds_acs_visit = (df["adult"] == 0) & (df["peds_acs_ed"] > 0.75)
    is_indeterminate = (non_emergent_proba == 0.5) | (emergent_proba == 0.5)

    df = df.assign(
        peds_acs_ed_visit=is_peds_acs_visit.astype(int),
        non_emergent_visit=(non_emergent_proba > 0.5).astype(int),
        emergent_visit=(emergent_proba > 0.5).astype(int),
        non_emergent_or_pct_visit=(non_emergent_or_pct_proba > 0.5).astype(
            int
        ),
        indeterminate_ed_visit=is_indeterminate.astype(int),
    )

    count_cols = special_cols + [
        "peds_acs_ed_visit",
        "non_emergent_visit",
        "emergent_visit",
        "non_emergent_or_pct_visit",
        "indeterminate_ed_visit",
    ]
    visit_counts = df.groupby(index_col).agg(
        {col: "sum" for col in count_cols}
    )
    return visit_counts.compute()