Common Recipes
Copy-paste code examples for frequently needed operations.
Load Claims and Compute Statistics
from medicaid_utils.preprocessing import taf_ip, taf_ps

# Load the 2019 Alabama TAF IP and PS files from the CMS data root.
ip = taf_ip.TAFIP(year=2019, state="AL", data_root="/data/cms")
ps = taf_ps.TAFPS(year=2019, state="AL", data_root="/data/cms")

# Both statistics below are taken from the IP "base" table, so look it up once.
df_ip_base = ip.dct_files["base"]

# Number of unique beneficiaries with IP claims: distinct index values of the
# base table (.compute() evaluates the deferred result).
n_benes = df_ip_base.index.nunique().compute()
print(f"Unique beneficiaries: {n_benes}")

# Total number of claims: row count of the base table.
n_claims = len(df_ip_base)
print(f"Total IP claims: {n_claims}")
Flag Multiple Conditions
# Flag IP claims for several conditions in a single call.
from medicaid_utils.filters.claims import dx_and_proc
# One entry per condition. The top-level key is the condition name; it reappears
# in output column names in the next recipe (e.g. "ip_diag_diabetes_t2").
# NOTE(review): "incl" presumably lists diagnosis-code prefixes to include and the
# key 10 presumably selects ICD-10 — confirm against the library's API reference.
dct_codes = {
"diabetes_t2": {"incl": {10: ["E11"]}},
"hypertension": {"incl": {10: ["I10"]}},
"ckd": {"incl": {10: ["N18"]}},
"depression": {"incl": {10: ["F32", "F33"]}},
}
# Only diagnosis codes are supplied here; the procedure-code mapping is empty.
# `ip` is the TAFIP object created in "Load Claims and Compute Statistics".
df_flagged = dx_and_proc.flag_diagnoses_and_procedures(
dct_diag_codes=dct_codes,
dct_proc_codes={},
df_claims=ip.dct_files["base"],
cms_format="TAF",
)
Identify Patients with Conditions
# Identify beneficiaries with any of the conditions in `dct_codes` (built in
# "Flag Multiple Conditions") across both the IP and OT claim files.
from medicaid_utils.preprocessing import taf_ot

# The earlier recipes load only IP and PS, so load the OT claims here with the
# same year/state/data root as `ip`; otherwise `ot` below is undefined.
ot = taf_ot.TAFOT(year=2019, state="AL", data_root="/data/cms")

# Returns a patient-level frame plus a dict of summary statistics. Each claim
# type is passed as a keyword (ip=..., ot=...); the result has one flag column
# per claim type and condition, e.g. "ip_diag_diabetes_t2".
pdf_patients, dct_stats = dx_and_proc.get_patient_ids_with_conditions(
    dct_diag_codes=dct_codes,
    dct_proc_codes={},
    cms_format="TAF",
    ip=ip.dct_files["base"],
    ot=ot.dct_files["base"],
)

# Patients with T2D in either IP or OT
has_t2d = (pdf_patients["ip_diag_diabetes_t2"] == 1) | (
    pdf_patients["ot_diag_diabetes_t2"] == 1
)
t2d_patients = pdf_patients.loc[has_t2d]
Flag Pharmacy Claims by NDC
from medicaid_utils.filters.claims import rx as rx_filter
from medicaid_utils.preprocessing import taf_rx

# The earlier recipes never load pharmacy claims, so load the RX file here;
# otherwise `rx` below is undefined. The filter module is imported as
# `rx_filter` so that it does not collide with this claims object.
rx = taf_rx.TAFRX(year=2019, state="AL", data_root="/data/cms")

# Drug name -> list of 11-digit NDC strings to flag.
dct_ndc = {
    "buprenorphine": [
        "00378451905", "00378451993", "00378617005",
        # ... full NDC list
    ],
}

df_rx_flagged = rx_filter.flag_prescriptions(
    dct_ndc_codes=dct_ndc,
    df_claims=rx.dct_files["base"],
)
Add Elixhauser Comorbidity Scores
from medicaid_utils.adapted_algorithms.py_elixhauser.elixhauser_comorbidity import score


def _join_diag_codes(row):
    """Return the row's non-blank diagnosis codes joined with commas.

    Only ``str`` values are kept. A missing value stored as NaN is a truthy
    float and would make ``",".join`` raise ``TypeError``, so non-string
    entries (NaN, None, pd.NA) are skipped along with blank strings.
    """
    return ",".join(v for v in row if isinstance(v, str) and v.strip())


# The two sections below are alternatives: run the one that matches the format
# of the claims object bound to `ip`.

# For MAX data — first construct LST_DIAG_CD from individual diagnosis columns
# (here `ip` must be a MAX claims object exposing `.df`).
diag_cols = [c for c in ip.df.columns if c.startswith("DIAG_CD_")]
ip.df = ip.df.map_partitions(
    lambda pdf: pdf.assign(
        LST_DIAG_CD=pdf[diag_cols].apply(_join_diag_codes, axis=1)
    )
)
df_with_elix = score(ip.df, lst_diag_col_name="LST_DIAG_CD", cms_format="MAX")

# For TAF data — gather diagnosis codes (creates LST_DIAG_CD on "base_diag_codes")
ip.gather_bene_level_diag_ndc_codes()
df_with_elix = score(
    ip.dct_files["base_diag_codes"],
    lst_diag_col_name="LST_DIAG_CD",
    cms_format="TAF",
)
Classify ED Visits by Severity
from medicaid_utils.adapted_algorithms.py_nyu_billings.billings_ed import get_nyu_ed_proba
# NOTE(review): `ot` is not created by any of the visible recipes. Given
# cms_format="MAX" below, it presumably must be a MAX OT claims object exposing
# `.df` with an "ed_use" flag column — confirm and load it before this step.
# Filter OT claims to ED visits
df_ed = ot.df[ot.df["ed_use"] == 1]
# Classify using NYU/Billings algorithm
pdf_nyu = get_nyu_ed_proba(
df_ed, date_col="srvc_bgn_date", index_col="MSIS_ID", cms_format="MAX"
)
Identify FQHC Providers
from medicaid_utils.other_datasets import fqhc
# Get FQHC NPI crosswalk
# NOTE(review): start_year=2016 presumably sets the first year the crosswalk
# covers — confirm against the function's documentation.
pdf_fqhc = fqhc.get_fqhc_crosswalk(start_year=2016)
Export Processed Claims
# `ip` is a claims object loaded in an earlier recipe. The two calls below are
# alternatives and are given the same output path.
# To Parquet (recommended for large datasets)
ip.export("/output/processed/", output_format="parquet", repartition=True)
# To CSV (single file per claim type)
ip.export("/output/processed/", output_format="csv")
Run Multi-State Analysis
import gc

import pandas as pd

from medicaid_utils.preprocessing import taf_ip

# Collect one summary row (state, year, IP claim count) per state-year.
results = []
for state in ["AL", "IL", "CA", "NY", "TX"]:
    for year in [2016, 2017, 2018]:
        ip = taf_ip.TAFIP(year=year, state=state, data_root="/data/cms")
        n_claims = len(ip.dct_files["base"])
        results.append({"state": state, "year": year, "n_claims": n_claims})
        # Release this state-year's claims object before loading the next one
        # so that memory use does not accumulate across iterations.
        del ip
        gc.collect()

# Build and print the summary table once every state-year has been processed.
df_summary = pd.DataFrame(results)
print(df_summary)
See also
"Scaling with Dask" for performance optimization; "Geographic Data & Rural-Urban Classification" for geographic crosswalks; and "MAX vs TAF: CMS File Formats" for format-specific patterns.