Source code for medicaid_utils.preprocessing.taf_ot

"""This module has TAFOT class which wraps together cleaning/ preprocessing routines specific for TAF OT files"""
from typing import Optional

from medicaid_utils.preprocessing import taf_file


[docs] class TAFOT(taf_file.TAFFile): def __init__( self, year: int, state: str, data_root: str, index_col: str = "BENE_MSIS", clean: bool = True, preprocess: bool = True, tmp_folder: Optional[str] = None, pq_engine: str = "pyarrow", ) -> None: """ Initializes TAF OT file object by preloading and preprocessing(if opted in) the associated files Parameters ---------- year : int Year of claim file state : str State of claim file data_root : str Root folder of raw claim files index_col : str, default='BENE_MSIS' Index column name. Eg. BENE_MSIS or MSIS_ID. The raw file is expected to be already sorted with index column clean : bool, default=True Should the associated files be cleaned? preprocess : bool, default=True Should the associated files be preprocessed? tmp_folder : str, default=None Folder location to use for caching intermediate results. Can be turned off by not passing this argument. pq_engine : str, default='pyarrow' Parquet engine to use Examples -------- >>> from medicaid_utils.preprocessing.taf_ot import TAFOT # doctest: +SKIP >>> ot = TAFOT(2019, 'AL', '/data/cms') # doctest: +SKIP """ super().__init__( "ot", year=year, state=state, data_root=data_root, index_col=index_col, clean=False, preprocess=False, tmp_folder=tmp_folder, pq_engine=pq_engine, ) self.dct_default_filters = {"missing_dob": 0, "duplicated": 0} if clean: self.clean() if preprocess: self.preprocess()
[docs] def clean(self) -> None: """Cleaning routines to clean diagnosis & procedure code columns, processes date and gender columns, and add duplicate check flags. Examples -------- >>> from medicaid_utils.preprocessing.taf_ot import TAFOT # doctest: +SKIP >>> ot = TAFOT(2019, 'AL', '/data/cms', clean=False) # doctest: +SKIP >>> ot.clean() # doctest: +SKIP """ super().clean() self.clean_codes() self.flag_common_exclusions() self.cache_results()
[docs] def preprocess(self) -> None: """Add basic constructed variables. Examples -------- >>> from medicaid_utils.preprocessing.taf_ot import TAFOT # doctest: +SKIP >>> ot = TAFOT(2019, 'AL', '/data/cms', preprocess=False) # doctest: +SKIP >>> ot.preprocess() # doctest: +SKIP """
[docs] def flag_common_exclusions(self) -> None: """ Adds commonly used IP claim exclusion flag columns. New Columns: - ffs_or_encounter_claim, 0 or 1, 1 when base claim is an FFS or Encounter claim - excl_missing_dob, 0 or 1, 1 when base claim does not have birth date - excl_missing_srvc_bgn_date, 0 or 1, 1 when base claim does not have service begin date Examples -------- >>> from medicaid_utils.preprocessing.taf_ot import TAFOT # doctest: +SKIP >>> ot = TAFOT(2019, 'AL', '/data/cms', clean=False) # doctest: +SKIP >>> ot.flag_common_exclusions() # doctest: +SKIP """ self.flag_ffs_and_encounter_claims() df = self.dct_files["base"] df = df.map_partitions( lambda pdf: pdf.assign( excl_missing_dob=pdf["birth_date"].isnull().astype(int), excl_missing_srvc_bgn_date=pdf["srvc_bgn_date"] .isnull() .astype(int), ) ) self.dct_files["base"] = df