Source code for medicaid_utils.preprocessing.taf_rx

"""This module has TAFRX class which wraps together cleaning/ preprocessing
routines specific for TAF Pharmacy files"""
from typing import Optional

from medicaid_utils.preprocessing import taf_file


[docs] class TAFRX(taf_file.TAFFile): def __init__( self, year: int, state: str, data_root: str, index_col: str = "BENE_MSIS", clean: bool = True, preprocess: bool = True, tmp_folder: Optional[str] = None, pq_engine: str = "pyarrow", ) -> None: """ Initializes TAF RX file object by preloading and preprocessing(if opted in) the associated files Parameters ---------- year : int Year of claim file state : str State of claim file data_root : str Root folder of raw claim files index_col : str, default='BENE_MSIS' Index column name. Eg. BENE_MSIS or MSIS_ID. The raw file is expected to be already sorted with index column clean : bool, default=True Should the associated files be cleaned? preprocess : bool, default=True Should the associated files be preprocessed? tmp_folder : str, default=None Folder location to use for caching intermediate results. Can be turned off by not passing this argument. pq_engine : str, default='pyarrow' Parquet engine to use Examples -------- >>> from medicaid_utils.preprocessing.taf_rx import TAFRX # doctest: +SKIP >>> rx = TAFRX(2019, 'AL', '/data/cms') # doctest: +SKIP """ super().__init__( "rx", year=year, state=state, data_root=data_root, index_col=index_col, clean=False, preprocess=False, tmp_folder=tmp_folder, pq_engine=pq_engine, ) self.dct_default_filters = {"missing_dob": 0, "duplicated": 0} if clean: self.clean() if preprocess: self.preprocess()
[docs] def clean(self) -> None: """Cleaning routines to clean diagnosis & procedure code columns, processes date and gender columns, and add duplicate check flags. Examples -------- >>> from medicaid_utils.preprocessing.taf_rx import TAFRX # doctest: +SKIP >>> rx = TAFRX(2019, 'AL', '/data/cms', clean=False) # doctest: +SKIP >>> rx.clean() # doctest: +SKIP """ super().clean() self.clean_ndc_codes() self.flag_duplicates()