Source code for medicaid_utils.adapted_algorithms.py_betos.betos_proc_codes

#!/usr/bin/env python

"""Python module to apply Berenson-Eggers Type of Service (BETOS) categorization for claims"""
__author__ = "Manoradhan Murugesan"
__email__ = "manorathan@uchicago.edu"

import os

import numpy as np
import pandas as pd
import dask.dataframe as dd


class BetosProcCodes:
    """Berenson-Eggers Type of Service (BETOS) lookups for claim procedure codes.

    All reference files are read from the ``data`` folder that sits next to
    this module.
    """

    package_folder, filename = os.path.split(__file__)
    data_folder = os.path.join(package_folder, "data")

    @classmethod
    def get_betos_cpt_crosswalk(cls, year: int) -> pd.DataFrame:
        """
        Get CPT x Betos code crosswalk, with betos code and betos category
        information.

        The returned CPT x Betos code crosswalk dataframe has the below
        columns:

        * cpt_code - HCPCS codes A0010-V9999 AMA/CPT-4 codes 00100-99999
        * betos_code - BETOS codes D1A-Z2
        * betos_cat - BETOS category abbrevation, with the below 7 values:

            * EVALUATION AND MANAGEMENT - betos_eval
            * PROCEDURES - betos_proc
            * IMAGING - betos_img
            * TESTS - betos_test
            * DURABLE MEDICAL EQUIPMENT - betos_dme
            * OTHER - betos_oth
            * EXCEPTIONS/UNCLASSIFIED - betos_uncla

        * betos_code_name - BETOS code description

        For ``year == 2020`` the crosswalk is built from ``betpuf20.txt``
        together with ``betos2_cat.csv``, ``betos2_cat_lvl2.csv`` and
        ``betfam20.txt``, and two more columns are returned:

        * betos_cat_lvl2 - abbreviation looked up from the first two
          characters of the BETOS code
        * betos_fam - abbreviation (``betos_fam_<family>``) looked up from
          characters 3-4 of the BETOS code

        Every other year is built from ``betpuf<YY>.txt``,
        ``r-me-bet<YY>.txt`` and ``betos_cat.csv``.

        Parameters
        ----------
        year : int
            Public use file year

        Returns
        -------
        pandas.DataFrame

        Raises
        ------
        KeyError
            If a BETOS code prefix is absent from the category lookup files.

        Examples
        --------
        >>> # Requires BETOS public use crosswalk files on disk
        >>> pdf = BetosProcCodes.get_betos_cpt_crosswalk(2018)  # doctest: +SKIP
        >>> 'cpt_code' in pdf.columns  # doctest: +SKIP
        True
        """
        betpuf_path = os.path.join(
            cls.data_folder, f"betpuf{str(year)[-2:]}.txt"
        )
        if year != 2020:
            return cls._load_betos1_crosswalk(betpuf_path, year)
        return cls._load_betos2_crosswalk(betpuf_path)

    @classmethod
    def _load_betos1_crosswalk(
        cls, betpuf_path: str, year: int
    ) -> pd.DataFrame:
        """Build the crosswalk for every public use file year except 2020."""
        pdf_raw = pd.read_csv(betpuf_path, header=None, names=["tmp"])
        fields = pdf_raw["tmp"].str.split(" ")
        pdf_crosswalk = pdf_raw.assign(
            cpt_code=fields.str[0],
            # Only the first three characters of the second field are the
            # BETOS code; a blank field means the code has no BETOS mapping.
            betos_code=fields.str[1].str[:3].replace("", np.nan),
        )
        # Explicit copy so the column assignment below never writes to a
        # view of the raw frame (avoids SettingWithCopyWarning).
        pdf_crosswalk = pdf_crosswalk.loc[
            pdf_crosswalk["betos_code"].notna(), ["cpt_code", "betos_code"]
        ].copy()

        pdf_code_lookup = pd.read_csv(
            os.path.join(cls.data_folder, f"r-me-bet{str(year)[-2:]}.txt"),
            header=None,
            sep="=",
            on_bad_lines="skip",
            skiprows=54,
            names=["betos_code", "betos_code_name"],
        )
        # Rows without a "code = name" pair have no name and are dropped.
        pdf_code_lookup = (
            pdf_code_lookup.loc[pdf_code_lookup["betos_code_name"].notna()]
            .astype(str)
            .apply(lambda col: col.str.strip())
            .replace("", np.nan)
            .dropna()
        )

        pdf_cat = pd.read_csv(os.path.join(cls.data_folder, "betos_cat.csv"))
        dct_betos_cat = dict(
            zip(pdf_cat["betos_code_start"], pdf_cat["betos_code_abbr"])
        )
        pdf_crosswalk["betos_cat"] = pdf_crosswalk["betos_code"].apply(
            lambda code: dct_betos_cat[code[0]]
        )
        return pdf_crosswalk.merge(
            pdf_code_lookup, on="betos_code", how="left"
        )

    @classmethod
    def _load_betos2_crosswalk(cls, betpuf_path: str) -> pd.DataFrame:
        """Build the crosswalk from the 2020 files."""
        pdf_raw = pd.read_csv(
            betpuf_path, header=None, names=["tmp"], sep="\t"
        )
        fields = pdf_raw["tmp"].str.split(" ")
        pdf_crosswalk = pdf_raw.assign(
            cpt_code=fields.str[0],
            betos_code=fields.str[1],
            # The description is the double-quoted part of the line.
            betos_code_name=pdf_raw["tmp"].str.extract(r'"(.+)"')[0],
        ).drop(columns=["tmp"])
        # Explicit copy so the column assignments below never write to a
        # view of the unfiltered frame (avoids SettingWithCopyWarning).
        pdf_crosswalk = pdf_crosswalk.loc[
            pdf_crosswalk["betos_code"].notna()
        ].copy()

        pdf_cat = pd.read_csv(os.path.join(cls.data_folder, "betos2_cat.csv"))
        pdf_cat_lvl2 = pd.read_csv(
            os.path.join(cls.data_folder, "betos2_cat_lvl2.csv"),
            na_filter=False,
        )
        pdf_fam = pd.read_csv(
            os.path.join(cls.data_folder, "betfam20.txt"), dtype="object"
        )
        dct_betos_cat = dict(
            zip(pdf_cat["betos_code_start"], pdf_cat["betos_code_abbr"])
        )
        dct_betos_cat_lvl2 = dict(
            zip(
                pdf_cat_lvl2["betos_code_start"],
                pdf_cat_lvl2["betos_code_abbr"],
            )
        )
        dct_betos_fam = dict(
            zip(pdf_fam["family"], "betos_fam_" + pdf_fam["family"])
        )

        betos_codes = pdf_crosswalk["betos_code"]
        pdf_crosswalk["betos_cat"] = betos_codes.apply(
            lambda code: dct_betos_cat[code[0]]
        )
        pdf_crosswalk["betos_cat_lvl2"] = betos_codes.apply(
            lambda code: dct_betos_cat_lvl2[code[:2]]
        )
        pdf_crosswalk["betos_fam"] = betos_codes.apply(
            lambda code: dct_betos_fam[code[2:4]]
        )
        return pdf_crosswalk

    @staticmethod
    def _build_prefix_lookup(
        pdf_crosswalk: pd.DataFrame, label_col: str
    ) -> dict:
        """Map each value of ``label_col`` to the tuple of its CPT codes."""
        # Missing or empty CPT codes are left out: a non-string entry makes
        # str.startswith raise and an empty prefix would match every code.
        return {
            label: tuple(
                code for code in codes if isinstance(code, str) and code
            )
            for label, codes in pdf_crosswalk.groupby(label_col)["cpt_code"]
        }

    @staticmethod
    def _match_labels(pdf_codes: pd.DataFrame, dct_lookup: dict) -> pd.Series:
        """Return, per row, the comma separated labels matched by its codes.

        A procedure code matches a label when it starts with any CPT code
        listed for that label in ``dct_lookup``. Cells may hold several comma
        separated codes; missing cells count as empty. Labels are sorted so
        the output is the same on every run.
        """
        dct_cache = {}

        def labels_for(code):
            # Each distinct code is scanned against the lookup only once.
            if code not in dct_cache:
                dct_cache[code] = [
                    label
                    for label, prefixes in dct_lookup.items()
                    if code.startswith(prefixes)
                ]
            return dct_cache[code]

        def row_labels(row):
            found = set()
            for cell in row:
                for code in cell.split(","):
                    found.update(labels_for(code.strip()))
            return ",".join(sorted(found))

        arr_codes = (
            pdf_codes.astype(object).fillna("").astype(str).to_numpy()
        )
        lst_labels = [row_labels(row) for row in arr_codes]
        return pd.Series(
            lst_labels,
            index=pdf_codes.index,
            # Pin the dtype only for empty partitions, where there is
            # nothing to infer it from.
            dtype=None if lst_labels else object,
        )

    # The dask annotations below are quoted forward references, so they are
    # not evaluated when the class body is executed.
    @classmethod
    def get_betos_cat(
        cls,
        df: "dd.DataFrame",
        pdf_crosswalk: pd.DataFrame,
        claim_type: str = "medicaid",
        proc_code_prefix: str = "PRCDR_CD",
    ) -> "dd.DataFrame":
        """
        Get claimwise Betos codes & categories related to CPT procedure codes
        in claim.

        Two columns are added to the claims:

        * lst_betos_code - Comma separated, sorted BETOS codes
        * lst_betos_cat - Comma separated, sorted BETOS categories

        For Medicaid claims, only procedure codes whose matching
        ``<proc_code_prefix>_SYS*`` column holds 1 (CPT) or 6 (HCPCS) are
        used; they are stripped, upper-cased and cleared of ``.`` before
        matching.

        Parameters
        ----------
        df : dask.DataFrame
            Claim dask dataframe
        pdf_crosswalk : pandas.DataFrame
            CPT x Betos code crosswalk, with betos code and betos category
            information
        claim_type : {'medicaid', 'medicare'}
            Medicaid or Medicare claim type
        proc_code_prefix : str, default='PRCDR_CD'
            Column name prefix for procedure code columns

        Returns
        -------
        dask.DataFrame

        Examples
        --------
        >>> # Requires BETOS crosswalk data and claim data
        >>> BetosProcCodes.get_betos_cat(df, pdf_crosswalk)  # doctest: +SKIP
        """
        dct_code_lookup = cls._build_prefix_lookup(pdf_crosswalk, "betos_code")
        dct_cat_lookup = cls._build_prefix_lookup(pdf_crosswalk, "betos_cat")

        lst_col_to_delete = []
        if claim_type == "medicaid":
            sys_col_prefix = str(proc_code_prefix) + "_SYS"
            # Code system column -> name of the temporary cleaned code column
            dct_valid_cols = {
                col: "VALID_" + col.replace("_SYS", "")
                for col in df.columns
                if col.startswith(sys_col_prefix)
            }

            def keep_cpt_hcpcs(pdf):
                # Filtering procedure codes with sys code = 1 (CPT) &
                # 6 (HCPCS)
                return pdf.assign(
                    **{
                        valid_col: pdf[sys_col.replace("_SYS", "")]
                        .where(
                            pd.to_numeric(pdf[sys_col], errors="coerce").isin(
                                [1, 6]
                            ),
                            "",
                        )
                        .str.strip()
                        .str.upper()
                        # regex=False: "." must be a literal dot, not the
                        # regex wildcard.
                        .str.replace(".", "", regex=False)
                        for sys_col, valid_col in dct_valid_cols.items()
                    }
                )

            df = df.map_partitions(
                keep_cpt_hcpcs,
                meta=df._meta.assign(
                    **{valid_col: "" for valid_col in dct_valid_cols.values()}
                ),
            )
            lst_col_to_delete.extend(dct_valid_cols.values())
            proc_code_prefix = "VALID_" + proc_code_prefix

        def add_betos_columns(pdf):
            pdf_codes = pdf[
                [
                    col
                    for col in pdf.columns
                    if col.startswith(proc_code_prefix)
                ]
            ]
            return pdf.assign(
                lst_betos_code=cls._match_labels(pdf_codes, dct_code_lookup),
                lst_betos_cat=cls._match_labels(pdf_codes, dct_cat_lookup),
            )

        df = df.map_partitions(
            add_betos_columns,
            meta=df._meta.assign(lst_betos_code="", lst_betos_cat=""),
        )
        df = df[[col for col in df.columns if col not in lst_col_to_delete]]
        return df
def assign_betos_cat(
    df: dd.DataFrame,
    year: int,
    claim_type: str = "medicaid",
    proc_code_prefix: str = "PRCDR_CD",
) -> dd.DataFrame:
    """
    Add claimwise BETOS codes & categories for the CPT procedure codes in
    each claim.

    The crosswalk for ``year`` is loaded with
    ``BetosProcCodes.get_betos_cpt_crosswalk`` and applied to the claims with
    ``BetosProcCodes.get_betos_cat``.

    Columns added to the output dataframe:

    * lst_betos_code - Comma separated BETOS codes
    * lst_betos_cat - Comma separated BETOS categories

    Parameters
    ----------
    df : dask.DataFrame
        Claim dask dataframe
    year : int
        Public use file year
    claim_type : {'medicaid', 'medicare'}
        Medicaid or Medicare claim type
    proc_code_prefix : str, default='PRCDR_CD'
        Column name prefix for procedure code columns

    Returns
    -------
    dask.DataFrame

    Examples
    --------
    >>> # Requires BETOS public use files and claim data
    >>> assign_betos_cat(df, 2018)  # doctest: +SKIP
    """
    return BetosProcCodes.get_betos_cat(
        df,
        BetosProcCodes.get_betos_cpt_crosswalk(year),
        claim_type,
        proc_code_prefix,
    )