Source code for medicaid_utils.other_datasets.fqhc

import os
from datetime import datetime
import numpy as np
import pandas as pd
import dask.dataframe as dd


# All data paths below are resolved relative to the directory of this module.
_module_dir = os.path.dirname(__file__)
_data_dir = os.path.join(_module_dir, "data")
_lookups_dir = os.path.join(_data_dir, "lookups")
_deliveries_dir = os.path.join(_data_dir, "deliveries")

# <module dir>/data/constructed
constructed_folder = os.path.join(_data_dir, "constructed")
# <module dir>/data/lookups/nppes_taxonomy
nppes_lookup_folder = os.path.join(_lookups_dir, "nppes_taxonomy")
# NOTE(review): resolves to the same directory as nppes_lookup_folder;
# confirm whether a dedicated FARA lookup directory was intended.
fara_folder = os.path.join(_lookups_dir, "nppes_taxonomy")
# <module dir>/data/lookups/fqhc
fqhc_lookup_folder = os.path.join(_lookups_dir, "fqhc")
# <module dir>/data/deliveries/adults
delivery_folder = os.path.join(_deliveries_dir, "adults")
# <module dir>/data/deliveries/uds_factors
uds_delivery_folder = os.path.join(_deliveries_dir, "uds_factors")



[docs]
def generate_oscar_fqhc_npis(lst_year=None, pq_engine="fastparquet"):
    """Pickle the NPPES provider rows whose provider id falls in the FQHC range.

    Reads the per-year ``npi_provider_parquet_cleaned`` datasets under
    ``nppes_lookup_folder``, keeps rows whose ``provider_id`` ends in a number
    within 1800 - 1989 or 1000 - 1199 and whose ``provider_id_type`` is 6, and
    writes them to ``nppes_fqhc_range_npis.pickle`` in ``nppes_lookup_folder``.

    Parameters
    ----------
    lst_year : iterable of int, optional
        Years whose NPPES parquet datasets are read. When None, every year
        from 2009 through the current calendar year is used.
    pq_engine : str, default "fastparquet"
        Parquet engine passed to ``dask.dataframe.read_parquet``.

    Returns
    -------
    None
        The result is written to disk rather than returned.
    """
    years = (
        lst_year
        if lst_year is not None
        else list(range(2009, datetime.now().year + 1))
    )

    # One lazy dask dataframe per year, stacked into a single dataframe.
    yearly_frames = []
    for year in years:
        parquet_dir = os.path.join(
            nppes_lookup_folder, str(year), "npi_provider_parquet_cleaned"
        )
        yearly_frames.append(
            dd.read_parquet(parquet_dir, engine=pq_engine, index=False)
        )
    ddf_providers = dd.concat(yearly_frames)

    # Last four characters of the (whitespace-stripped) provider id, converted
    # to a number; values that are not numeric become NaN.
    ddf_providers["ccn"] = ddf_providers["provider_id"].str.strip().str[-4:]
    ddf_providers = ddf_providers.map_partitions(
        lambda part: part.assign(
            ccn=pd.to_numeric(part["ccn"], errors="coerce")
        )
    )

    # Oscar provider numbers for FQHCs end in the range 1800 - 1989 or 1000 - 1199
    ccn = ddf_providers["ccn"]
    in_fqhc_range = ccn.between(1800, 1989, inclusive="both") | ccn.between(
        1000, 1199, inclusive="both"
    )
    pdf_in_range = ddf_providers.loc[in_fqhc_range].compute()

    # Only rows with provider_id_type 6 are saved (presumably the code for
    # OSCAR identifiers -- verify against the NPPES code list).
    is_type_six = pdf_in_range["provider_id_type"] == 6
    output_path = os.path.join(
        nppes_lookup_folder, "nppes_fqhc_range_npis.pickle"
    )
    pdf_in_range.loc[is_type_six].to_pickle(output_path)




[docs]
def get_file_name_dict(source):
    """Return the mapping from dataset keys to their file names.

    Parameters
    ----------
    source
        Value interpolated into the source-specific file names (for example
        ``"uds"`` yields ``"uds_perfect_matches.pickle"``).

    Returns
    -------
    dict
        File names keyed by dataset key. Entries whose name does not contain
        ``source`` are the same for every call.
    """
    # (key, file-name template) pairs; ``{source}`` is filled in below.
    name_templates = (
        ("uds", "uds_all_years.pickle"),
        ("hcris", "hcris_all_years.pickle"),
        ("nppes_matches", "{source}_nppes_based_matches.pickle"),
        ("api_matches", "{source}_api_perfect_matches.pickle"),
        (
            "api_and_nppes_matches",
            "{source}_api_and_nppes_perfect_matches.pickle",
        ),
        (
            "api_nppes_state_relaxed_matches",
            "{source}_api_and_nppes_perfect_matches_with_state_relaxed.pickle",
        ),
        ("text_merged", "nppes_{source}_text_merged.pickle"),
        (
            "text_merged_with_match_purity",
            "nppes_{source}_text_merged_with_match_purity.pickle",
        ),
        ("text_matches", "{source}_text_based_perfect_matches.pickle"),
        ("perfect_matches", "{source}_perfect_matches.pickle"),
        ("fuzzy_matches", "{source}_fuzzy_matches.pickle"),
        (
            "no_leading_zeros_matches",
            "{source}_nppes_based_matches_no_leading_zeros.pickle",
        ),
        ("bhcmisid_perfect_matches", "bhcmisid_npi_perfect_matches.pickle"),
        ("bhcmisid_fuzzy_matches", "bhcmisid_npi_fuzzy_matches.pickle"),
        ("bhcmisid_x_npi", "bhcmisid_x_npi.pickle"),
        ("fqhc_x_npi", "fqhc_x_npi.ftr"),
    )
    return {
        key: template.format(source=source) for key, template in name_templates
    }




[docs]
def get_fqhc_crosswalk(start_year, data_folder=fqhc_lookup_folder):
    """Returns FQHC cross walk with FQHC NPI's seen in UDS datasets till the start_year

    Loads the ``fqhc_x_npi`` feather file from ``data_folder``, collapses it to
    one row per NPI (the row with the smallest ``start_year``), and stores all
    ``bhcmisid`` values recorded for that NPI as a comma separated string.

    Parameters
    ----------
    start_year : int
        Only NPIs whose retained row has ``start_year <= start_year`` are
        returned.
    data_folder : str, optional
        Folder containing the crosswalk feather file. Defaults to
        ``fqhc_lookup_folder``.

    Returns
    -------
    pandas.DataFrame
        One row per NPI. ``bhcmisid`` holds the comma separated ids of the NPI
        (NaN when the NPI has none); ``perfect_fqhc`` / ``fuzzy_fqhc`` are
        renamed to ``taxonomy_perfect_fqhc`` / ``taxonomy_fuzzy_fqhc``.
    """
    pdf_fqhc_crosswalk = pd.read_feather(
        os.path.join(data_folder, get_file_name_dict("uds")["fqhc_x_npi"])
    )
    # Sort first so that drop_duplicates below keeps the earliest start_year
    # row of every NPI.
    pdf_fqhc_crosswalk = pdf_fqhc_crosswalk.sort_values(["start_year"])
    # str.join cannot handle NaN, so missing ids become empty strings.
    pdf_fqhc_crosswalk = pdf_fqhc_crosswalk.assign(
        bhcmisid=pdf_fqhc_crosswalk["bhcmisid"].fillna("")
    )
    # Join only the non-empty ids of each NPI. Joining the empty placeholders
    # as well would yield stray separators (",", ",X", "X,"), and an NPI with
    # several rows but no id at all would end up as "," instead of NaN.
    pdf_fqhc_crosswalk = pdf_fqhc_crosswalk.assign(
        bhcmisid=pdf_fqhc_crosswalk.groupby(["npi"])["bhcmisid"].transform(
            lambda ids: ",".join(
                bhcmisid for bhcmisid in ids if bhcmisid != ""
            )
        )
    )
    # NPIs without any id are reported as missing rather than as "".
    pdf_fqhc_crosswalk = pdf_fqhc_crosswalk.assign(
        bhcmisid=pdf_fqhc_crosswalk["bhcmisid"].replace("", np.nan)
    )
    pdf_fqhc_crosswalk = pdf_fqhc_crosswalk.drop_duplicates(["npi"])
    pdf_fqhc_crosswalk = pdf_fqhc_crosswalk.rename(
        columns={
            "perfect_fqhc": "taxonomy_perfect_fqhc",
            "fuzzy_fqhc": "taxonomy_fuzzy_fqhc",
        }
    )
    return pdf_fqhc_crosswalk.loc[
        pdf_fqhc_crosswalk["start_year"] <= start_year
    ]