Source code for medicaid_utils.other_datasets.fqhc

import os
from datetime import datetime
import numpy as np
import pandas as pd
import dask.dataframe as dd


# All lookup/delivery folders live under the "data" directory that sits next
# to this module.
_data_root = os.path.join(os.path.dirname(__file__), "data")
_lookups_root = os.path.join(_data_root, "lookups")
_deliveries_root = os.path.join(_data_root, "deliveries")

constructed_folder = os.path.join(_data_root, "constructed")
nppes_lookup_folder = os.path.join(_lookups_root, "nppes_taxonomy")
# NOTE(review): fara_folder resolves to the same directory as
# nppes_lookup_folder - confirm whether a dedicated folder was intended.
fara_folder = os.path.join(_lookups_root, "nppes_taxonomy")
fqhc_lookup_folder = os.path.join(_lookups_root, "fqhc")
delivery_folder = os.path.join(_deliveries_root, "adults")
uds_delivery_folder = os.path.join(_deliveries_root, "uds_factors")


def generate_oscar_fqhc_npis(lst_year=None, pq_engine="fastparquet"):
    """Pickle the NPI rows whose OSCAR provider id falls in the FQHC ranges.

    Reads the yearly ``npi_provider_parquet_cleaned`` datasets under
    ``nppes_lookup_folder``, keeps rows whose ``provider_id`` ends in a
    4-digit number within 1800-1989 or 1000-1199 (both ends inclusive), and
    writes the subset with ``provider_id_type == 6`` to
    ``nppes_fqhc_range_npis.pickle`` in ``nppes_lookup_folder``.

    Parameters
    ----------
    lst_year : list of int, optional
        Years to load. Defaults to every year from 2009 through the current
        calendar year.
    pq_engine : str, default "fastparquet"
        Parquet engine passed to ``dask.dataframe.read_parquet``.

    Returns
    -------
    None
        The result is written to disk.
    """
    if lst_year is None:
        years = list(range(2009, datetime.now().year + 1))
    else:
        years = lst_year

    yearly_frames = []
    for yr in years:
        parquet_path = os.path.join(
            nppes_lookup_folder, str(yr), "npi_provider_parquet_cleaned"
        )
        yearly_frames.append(
            dd.read_parquet(parquet_path, engine=pq_engine, index=False)
        )
    ddf_providers = dd.concat(yearly_frames)

    # The last four characters of the (whitespace-trimmed) provider id carry
    # the number that is compared against the FQHC ranges.
    ddf_providers = ddf_providers.assign(
        ccn=ddf_providers["provider_id"].str.strip().str[-4:]
    )

    def _ccn_to_numeric(pdf):
        # Non-numeric suffixes become NaN and so never match a range below.
        return pdf.assign(ccn=pd.to_numeric(pdf["ccn"], errors="coerce"))

    ddf_providers = ddf_providers.map_partitions(_ccn_to_numeric)

    # Oscar provider numbers for FQHCs end in the range 1800 - 1989 or
    # 1000 - 1199
    ccn = ddf_providers["ccn"]
    in_fqhc_range = ccn.between(1800, 1989, inclusive="both") | ccn.between(
        1000, 1199, inclusive="both"
    )
    pdf_in_range = ddf_providers.loc[in_fqhc_range].compute()

    output_path = os.path.join(
        nppes_lookup_folder, "nppes_fqhc_range_npis.pickle"
    )
    is_type_six = pdf_in_range["provider_id_type"] == 6
    pdf_in_range.loc[is_type_six].to_pickle(output_path)
def get_file_name_dict(source):
    """Return the mapping from dataset/match-stage keys to file names.

    Parameters
    ----------
    source : str
        Source label interpolated into the source-specific file names
        (for example ``"uds"`` yields ``"uds_perfect_matches.pickle"``).

    Returns
    -------
    dict
        Keys are stage identifiers; values are file names. Entries without a
        ``{source}`` placeholder are the same for every source.
    """
    # (key, file-name template) pairs, kept in the order callers iterate them.
    name_templates = (
        ("uds", "uds_all_years.pickle"),
        ("hcris", "hcris_all_years.pickle"),
        ("nppes_matches", "{source}_nppes_based_matches.pickle"),
        ("api_matches", "{source}_api_perfect_matches.pickle"),
        (
            "api_and_nppes_matches",
            "{source}_api_and_nppes_perfect_matches.pickle",
        ),
        (
            "api_nppes_state_relaxed_matches",
            "{source}_api_and_nppes_perfect_matches_with_state_relaxed.pickle",
        ),
        ("text_merged", "nppes_{source}_text_merged.pickle"),
        (
            "text_merged_with_match_purity",
            "nppes_{source}_text_merged_with_match_purity.pickle",
        ),
        ("text_matches", "{source}_text_based_perfect_matches.pickle"),
        ("perfect_matches", "{source}_perfect_matches.pickle"),
        ("fuzzy_matches", "{source}_fuzzy_matches.pickle"),
        (
            "no_leading_zeros_matches",
            "{source}_nppes_based_matches_no_leading_zeros.pickle",
        ),
        ("bhcmisid_perfect_matches", "bhcmisid_npi_perfect_matches.pickle"),
        ("bhcmisid_fuzzy_matches", "bhcmisid_npi_fuzzy_matches.pickle"),
        ("bhcmisid_x_npi", "bhcmisid_x_npi.pickle"),
        ("fqhc_x_npi", "fqhc_x_npi.ftr"),
    )
    return {
        key: template.format(source=source) for key, template in name_templates
    }
def get_fqhc_crosswalk(start_year, data_folder=fqhc_lookup_folder):
    """Return the FQHC x NPI crosswalk limited to NPIs first seen by a year.

    Loads the ``fqhc_x_npi`` feather file from ``data_folder``, collapses it
    to one row per ``npi`` (the row with the earliest ``start_year``), and
    stores in ``bhcmisid`` a comma-separated list of every non-missing
    bhcmisid recorded for that NPI.

    Parameters
    ----------
    start_year : int
        Only NPIs whose earliest ``start_year`` is less than or equal to this
        value are returned.
    data_folder : str, optional
        Folder containing the crosswalk file. Defaults to
        ``fqhc_lookup_folder``.

    Returns
    -------
    pandas.DataFrame
        One row per NPI. ``perfect_fqhc`` / ``fuzzy_fqhc`` columns are renamed
        to ``taxonomy_perfect_fqhc`` / ``taxonomy_fuzzy_fqhc``. ``bhcmisid``
        is NaN when the NPI has no non-missing bhcmisid.
    """
    crosswalk_path = os.path.join(
        data_folder, get_file_name_dict("uds")["fqhc_x_npi"]
    )
    # Sort first so that drop_duplicates below keeps each NPI's earliest row.
    pdf_fqhc_crosswalk = pd.read_feather(crosswalk_path).sort_values(
        ["start_year"]
    )

    # Missing ids become "" so they can be filtered out inside the join.
    pdf_fqhc_crosswalk = pdf_fqhc_crosswalk.assign(
        bhcmisid=pdf_fqhc_crosswalk["bhcmisid"].fillna("")
    )
    # Join only the non-empty ids. Joining the "" placeholders as well would
    # yield stray separators (",A") and turn an all-missing multi-row group
    # into "," instead of the "" that is mapped back to NaN below.
    # NOTE(review): the list is built before the start_year filter, so it can
    # include ids from rows later than start_year - confirm this is intended.
    pdf_fqhc_crosswalk = pdf_fqhc_crosswalk.assign(
        bhcmisid=pdf_fqhc_crosswalk.groupby(["npi"])["bhcmisid"].transform(
            lambda ids: ",".join(bhcmisid for bhcmisid in ids if bhcmisid)
        )
    )
    pdf_fqhc_crosswalk = pdf_fqhc_crosswalk.assign(
        bhcmisid=pdf_fqhc_crosswalk["bhcmisid"].replace("", np.nan)
    )

    pdf_fqhc_crosswalk = pdf_fqhc_crosswalk.drop_duplicates(["npi"])
    pdf_fqhc_crosswalk = pdf_fqhc_crosswalk.rename(
        columns={
            "perfect_fqhc": "taxonomy_perfect_fqhc",
            "fuzzy_fqhc": "taxonomy_fuzzy_fqhc",
        }
    )
    return pdf_fqhc_crosswalk.loc[
        pdf_fqhc_crosswalk["start_year"] <= start_year
    ]