Source code for medicaid_utils.preprocessing.max_cc
"""This module provides the MAXCC class, which wraps together the cleaning/preprocessing routines specific to MAX CC files"""
from typing import List, Optional
import dask.dataframe as dd
import pandas as pd
import numpy as np
from medicaid_utils.preprocessing import max_file
[docs]
class MAXCC(max_file.MAXFile):
    """Scripts to preprocess CC file"""

    def __init__(
        self,
        year: int,
        state: str,
        data_root: str,
        index_col: str = "BENE_MSIS",
        clean: bool = True,
        preprocess: bool = True,
        tmp_folder: Optional[str] = None,
        pq_engine: str = "pyarrow",
    ) -> None:
        """
        Initializes MAX CC file object by preloading and preprocessing (if opted in) the file

        Parameters
        ----------
        year : int
            Year of claim file
        state : str
            State of claim file
        data_root : str
            Root folder of raw claim files
        index_col : str, default='BENE_MSIS'
            Index column name. Eg. BENE_MSIS or MSIS_ID. The raw file is expected to be already
            sorted with index column
        clean : bool, default=True
            Should the associated files be cleaned?
            NOTE(review): this constructor currently does not act on this flag - the parent
            initializer is always called with ``clean=False`` and no cleaning step follows.
            Confirm whether that is intended.
        preprocess : bool, default=True
            Should the associated files be preprocessed? When True, every condition that has a
            ``<CONDITION>_COMBINED`` column is aggregated into a ``<condition>_combined``
            indicator column via :meth:`agg_conditions`.
        tmp_folder : str, default=None
            Folder location to use for caching intermediate results. Can be turned off by not passing this argument.
        pq_engine : str, default='pyarrow'
            Parquet engine to use

        """
        # The claim type is fixed to "cc". The parent's own clean/preprocess steps are switched
        # off; the CC-specific preprocessing is applied below instead.
        super().__init__(
            "cc",
            year=year,
            state=state,
            data_root=data_root,
            index_col=index_col,
            clean=False,
            preprocess=False,
            tmp_folder=tmp_folder,
            pq_engine=pq_engine,
        )
        if preprocess:
            self.agg_conditions(self._combined_condition_names())

    def _combined_condition_names(self) -> List[str]:
        """Return the lowercased condition name of every ``*_COMBINED`` column in ``self.df``"""
        return [
            col.removesuffix("_COMBINED").lower()
            for col in self.df.columns
            if col.endswith("_COMBINED")
        ]

    def agg_conditions(self, lst_conditions: List[str]) -> None:
        """
        Aggregate condition indicators across payer levels

        For each condition, a ``<condition>_combined`` column is added to ``self.df``. It is 1 when
        any of the ``<CONDITION>_MEDICAID``, ``<CONDITION>_MEDICARE`` or ``<CONDITION>_COMBINED``
        columns holds the value 1 or 3 (after numeric coercion; unparseable values count as no
        match), and 0 otherwise. ``self.df`` is replaced by the resulting dataframe.

        Parameters
        ----------
        lst_conditions : list of str
            List of condition columns to aggregate

        """
        # Snapshot the names: map_partitions is evaluated lazily, so a caller mutating the list
        # before compute time must not change which columns get produced.
        conditions = tuple(lst_conditions)
        if not conditions:
            # Nothing to aggregate; avoid adding a no-op layer to the task graph.
            return
        payer_types = ("MEDICAID", "MEDICARE", "COMBINED")

        def _flag_conditions(pdf: pd.DataFrame) -> pd.DataFrame:
            """Add one 0/1 ``<condition>_combined`` column per condition to a partition"""
            new_cols = {}
            for condn in conditions:
                payer_flags = [
                    pd.to_numeric(
                        pdf[f"{condn.upper()}_{payer_type}"],
                        errors="coerce",
                    ).isin([1, 3])
                    for payer_type in payer_types
                ]
                # Row-wise OR over the three payer-level flags
                new_cols[condn + "_combined"] = (
                    np.column_stack(payer_flags).any(axis=1).astype(int)
                )
            return pdf.assign(**new_cols)

        self.df = self.df.map_partitions(_flag_conditions)

    def get_chronic_conditions(
        self, lst_conditions: Optional[List[str]] = None
    ) -> dd.DataFrame:
        """
        Get chronic condition indicators

        If any of the requested ``<condition>_combined`` columns is not yet present in
        ``self.df``, :meth:`agg_conditions` is called first (which updates ``self.df``).

        Parameters
        ----------
        lst_conditions : list of str, default=None
            List of condition columns to check. When None, every condition that has a
            ``<CONDITION>_COMBINED`` column in ``self.df`` is used.

        Returns
        -------
        dd.DataFrame
            ``self.df`` restricted to the ``<condition>_combined`` columns of the requested
            conditions

        """
        if lst_conditions is None:
            lst_conditions = self._combined_condition_names()
        combined_cols = [condn + "_combined" for condn in lst_conditions]
        if not set(combined_cols).issubset(self.df.columns):
            self.agg_conditions(lst_conditions)
        return self.df[combined_cols]