import os
from operator import mul
from functools import reduce
from typing import Any, Dict, List, Optional, Tuple
import scipy.stats as st
import numpy as np
import pandas as pd
import dask.dataframe as dd
from pandas.api.types import is_numeric_dtype
try:
from holoviews import opts
except ImportError:
opts = None
# [docs]  (Sphinx doc-link extraction artifact; kept as comment)
def cramers_corrected_stat(confusion_matrix: pd.DataFrame) -> float:
    """Bias-corrected Cramer's V for a categorical-categorical association.

    Uses the correction from Bergsma and Wicher,
    Journal of the Korean Statistical Society 42 (2013): 323-328.

    Args:
        confusion_matrix: contingency table of observed counts
            (rows x columns of the two categorical variables).

    Returns:
        The corrected Cramer's V statistic.
    """
    total = confusion_matrix.sum().sum()
    n_rows, n_cols = confusion_matrix.shape
    # Chi-square statistic (scipy applies Yates' correction for 2x2 tables).
    chi2_stat = st.chi2_contingency(confusion_matrix)[0]
    phi2 = chi2_stat / total
    # Shrink phi^2 toward zero and correct the effective table dimensions.
    phi2_corrected = max(
        0, phi2 - ((n_cols - 1) * (n_rows - 1)) / (total - 1)
    )
    rows_corrected = n_rows - ((n_rows - 1) ** 2) / (total - 1)
    cols_corrected = n_cols - ((n_cols - 1) ** 2) / (total - 1)
    smallest_dim = min(cols_corrected - 1, rows_corrected - 1)
    return np.sqrt(phi2_corrected / smallest_dim)
# [docs]  (Sphinx doc-link extraction artifact; kept as comment)
def get_phi(pdf_x: pd.DataFrame) -> float:
    """Phi coefficient from a (square) contingency table.

    Computed as (product of diagonal counts - product of off-diagonal
    counts) / sqrt(product of all row and column margins). For a 2x2
    table this is the classic phi = (ad - bc) / sqrt(r1*r2*c1*c2).
    """
    n_rows, n_cols = pdf_x.shape
    diag_entries = [pdf_x.iloc[idx, idx] for idx in range(n_rows)]
    # Gather every off-diagonal cell (as native Python values, matching
    # the .values.tolist() conversion).
    offdiag_entries = []
    for row_idx in range(n_rows):
        other_cols = [c for c in range(n_cols) if c != row_idx]
        offdiag_entries.extend(
            pdf_x.iloc[row_idx, other_cols].values.tolist()
        )
    # Row margins followed by column margins.
    margin_totals = [pdf_x.iloc[r, :].values.sum() for r in range(n_rows)]
    margin_totals += [pdf_x.iloc[:, c].values.sum() for c in range(n_cols)]
    numerator = reduce(mul, diag_entries, 1) - reduce(mul, offdiag_entries, 1)
    return numerator / np.sqrt(reduce(mul, margin_totals, 1))
# [docs]  (Sphinx doc-link extraction artifact; kept as comment)
def _to_numeric_or_original(series: pd.Series) -> pd.Series:
    """Parse ``series`` as numeric; return it unchanged if parsing fails.

    Behavior-compatible replacement for the deprecated
    ``pd.to_numeric(..., errors="ignore")`` (removed in newer pandas).
    """
    try:
        return pd.to_numeric(series)
    except (ValueError, TypeError):
        return series


def get_contingency_table(
    pdf_dataset: pd.DataFrame,
    lst_categorical_metrics: List[str],
    pop_col_name: str,
    lst_numeric_col_to_binarize: List[str],
    dct_labels: Dict[int, str],
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Build population-vs-metric contingency tables with association stats.

    For each metric (categorical metrics as-is; columns listed in
    ``lst_numeric_col_to_binarize`` binarized as ``value > 0``), cross
    tabulates the metric against the population column and attaches the
    chi-square p-value, bias-corrected Cramer's V and — when there are
    exactly two populations and the metric is numeric — the phi
    (correlation) coefficient.

    Args:
        pdf_dataset: one row per subject; must contain ``pop_col_name``
            and every metric column.
        lst_categorical_metrics: columns treated as categorical levels.
        pop_col_name: integer-coded population column.
        lst_numeric_col_to_binarize: numeric count columns reduced to a
            0/1 "any" indicator before tabulation.
        dct_labels: maps population codes to display labels; codes absent
            on either side are dropped.

    Returns:
        Tuple of (stacked raw crosstab table indexed by metric name, a
        display table with per-population (N, %, denominator) columns
        plus p_val / phi / cramers_v / pop_size).
    """
    # Population codes actually present (NaNs dropped), intersected with
    # the label mapping so codes and labels agree.
    lst_pop_val = sorted(
        pdf_dataset.loc[~pdf_dataset[pop_col_name].isna()][pop_col_name]
        .astype(int)
        .unique()
        .tolist()
    )
    dct_labels = {
        val: dct_labels[val] for val in dct_labels if (val in lst_pop_val)
    }
    lst_pop_val = sorted([val for val in lst_pop_val if val in dct_labels])
    lst_labels = [dct_labels[val] for val in lst_pop_val]
    lst_crosstab = []
    # Hard-coerce binarization candidates to numeric (unparseable -> NaN).
    pdf_dataset = pdf_dataset.assign(
        **{
            col: pd.to_numeric(pdf_dataset[col], errors="coerce")
            for col in lst_numeric_col_to_binarize
        }
    )
    # Opportunistically parse categorical metrics, keeping non-numeric
    # columns unchanged.
    pdf_dataset = pdf_dataset.assign(
        **{
            col: _to_numeric_or_original(pdf_dataset[col])
            for col in lst_categorical_metrics
        }
    )
    lstpdf_crosstab_pretty = []
    for col in lst_categorical_metrics + lst_numeric_col_to_binarize:
        pdf_temp = pdf_dataset.copy()[[pop_col_name, col]]
        pdf_temp = pdf_temp.loc[pd.notnull(pdf_temp[col])]
        # Per-population denominators: rows with a non-null metric value.
        lst_metric_pop_size = [
            pdf_temp.loc[pdf_temp[pop_col_name] == pop_val].shape[0]
            for pop_val in lst_pop_val
        ]
        if col in lst_numeric_col_to_binarize:
            pdf_temp = pdf_temp.assign(
                **{col: (pdf_temp[col] > 0).astype(int)}
            )
        pdf_temp = pdf_temp.sort_values(by=[pop_col_name, col])
        # phi is a 2x2 correlation DataFrame when there are exactly two
        # populations and the metric is numeric; NaN otherwise.
        phi = (
            np.nan
            if (
                (len(lst_pop_val) > 2) or (not is_numeric_dtype(pdf_temp[col]))
            )
            else pdf_temp[[pop_col_name] + [col]].corr()
        )
        # FIX: the original guard `pd.notna(phi) and phi.shape[1] > 2`
        # raised "truth value of a DataFrame is ambiguous" whenever phi
        # was a correlation matrix (bool() on a DataFrame), and the shape
        # test could never hold for the 2x2 result, so the coefficient
        # was never extracted.
        phi = phi.iloc[0, 1] if isinstance(phi, pd.DataFrame) else np.nan
        pdf_crosstab = None
        if pdf_temp[col].nunique() > 1:
            pdf_crosstab = pd.crosstab(pdf_temp[pop_col_name], pdf_temp[col])
            p_val = st.chi2_contingency(pdf_crosstab)[1]
            cramers_v = cramers_corrected_stat(pdf_crosstab)
            # For a pure 0/1 metric keep only the "1" row (transposed);
            # otherwise keep the whole transposed table (one row per level).
            pdf_crosstab = (
                pd.DataFrame(pdf_crosstab.T.loc[1, :]).T
                if (
                    len(set(pdf_crosstab.columns.tolist()).difference({0, 1}))
                    == 0
                )
                else pd.DataFrame(pdf_crosstab.T)
            )
            pdf_crosstab.index = (
                [col]
                if (pdf_crosstab.shape[0] <= 1)
                else [
                    col + "_" + str(lev) for lev in pdf_crosstab.index.tolist()
                ]
            )
            pdf_crosstab["p_val"] = p_val
            # phi only meaningful for a single (binary) metric row.
            pdf_crosstab["phi"] = (
                phi if (len(pdf_crosstab.index) == 1) else np.nan
            )
            pdf_crosstab["cramers_v"] = cramers_v
            pdf_crosstab.index.name = "metric"
            lst_crosstab.append(pdf_crosstab)
        else:
            # Degenerate metric (single level): no crosstab / chi-square.
            pdf_crosstab = pd.DataFrame({"phi": phi}, index=[col])
            pdf_crosstab.index.name = "metric"
            lst_crosstab.append(pdf_crosstab)
        pdf_pretty = pdf_crosstab.rename(
            columns=dict(zip(lst_pop_val, lst_labels))
        )
        # Make sure every expected column exists before formatting.
        pdf_pretty = pdf_pretty.assign(
            **{
                missing_col: np.nan
                for missing_col in lst_labels
                + ["p_val", "phi", "cramers_v", "pop_size"]
                if missing_col not in pdf_pretty.columns
            }
        )
        pdf_pretty = pdf_pretty.assign(
            **{
                # Counts per population label.
                **{
                    tpl_label_n[0]
                    + "_N": pdf_pretty[tpl_label_n[0]].fillna(0).astype(int)
                    for tpl_label_n in zip(lst_labels, lst_metric_pop_size)
                },
                # Percentages of the per-population denominator.
                **{
                    tpl_label_perc[0]
                    + "_%": (
                        pdf_pretty[tpl_label_perc[0]].fillna(0)
                        * 100
                        / tpl_label_perc[1]
                    )
                    if (tpl_label_perc[1] != 0)
                    else np.nan
                    for tpl_label_perc in zip(lst_labels, lst_metric_pop_size)
                },
                **{
                    tpl_label_denom[0] + "_denom": tpl_label_denom[1]
                    for tpl_label_denom in zip(lst_labels, lst_metric_pop_size)
                },
                **{"pop_size": sum(lst_metric_pop_size)},
            }
        )
        # Regroup the flat "<label>_N/_%/_denom" columns under a two-level
        # (label, stat) header.
        pdf_lbls = {}
        for lbl in lst_labels:
            pdf_lbl = pdf_pretty[
                [lbl + "_N", lbl + "_%", lbl + "_denom"]
            ].rename(
                columns=dict(
                    [
                        (lbl + "_N", "N"),
                        (lbl + "_%", "%"),
                        (lbl + "_denom", "denominator"),
                    ]
                )
            )
            pdf_lbls[lbl] = pdf_lbl
        pdf_lbls = pd.concat(pdf_lbls, axis=1)
        pdf_lbls = pdf_lbls.assign(
            **{
                stat_col: pdf_pretty[stat_col]
                for stat_col in ["p_val", "phi", "cramers_v", "pop_size"]
            }
        )
        lstpdf_crosstab_pretty.append(pdf_lbls)
    pdf_crosstab = pd.concat(lst_crosstab)
    pdf_crosstab_pretty = pd.concat(lstpdf_crosstab_pretty)
    return pdf_crosstab, pdf_crosstab_pretty
# [docs]  (Sphinx doc-link extraction artifact; kept as comment)
def get_ranksum_table(
    pdf_dataset: pd.DataFrame,
    lst_metrics: List[str],
    pop_col_name: str,
    dct_labels: Dict[int, str],
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Compare metric distributions across populations with rank-based tests.

    For each metric, computes per-population mean / std / 95% t-interval,
    a Wilcoxon rank-sum test (exactly two populations) or Kruskal-Wallis
    test (more than two), and — only for two populations — the Pearson
    correlation between the metric and the population indicator column.

    Args:
        pdf_dataset: one row per subject; must contain ``pop_col_name``
            and every column in ``lst_metrics``.
        lst_metrics: numeric metric columns to compare.
        pop_col_name: integer-coded population column.
        dct_labels: maps population codes to display labels; codes absent
            on either side (data or mapping) are dropped.

    Returns:
        Tuple of (machine-readable stats table, display-formatted table
        indexed by metric name).
    """
    # Population codes actually present in the data (NaNs excluded).
    lst_pop_val = sorted(
        pdf_dataset.loc[~pdf_dataset[pop_col_name].isna()][pop_col_name]
        .astype(int)
        .unique()
        .tolist()
    )
    # Intersect the label mapping with the codes present in the data.
    dct_labels = {
        val: dct_labels[val]
        for val in list(dct_labels.keys())
        if (val in lst_pop_val)
    }
    lst_pop_val = sorted([val for val in lst_pop_val if val in dct_labels])
    lst_labels = [dct_labels[val] for val in lst_pop_val]
    # NOTE(review): these sizes count ALL rows per population, including
    # rows where a given metric is NaN, yet they are used as the CI degrees
    # of freedom below while mean/sem skip NaNs — confirm intended.
    lst_pop_size = [
        pdf_dataset.loc[pdf_dataset[pop_col_name] == pop_val].shape[0]
        for pop_val in lst_pop_val
    ]
    lst_ranksum = []
    lst_ranksum_pretty = []
    for col in lst_metrics:
        # Point-biserial-style correlation; only meaningful for 2 groups.
        corr = np.nan
        if len(lst_pop_val) == 2:
            corr = pdf_dataset[[pop_col_name] + [col]].corr().iloc[0, 1]
        lst_pop_metric_mean = [
            pdf_dataset[pdf_dataset[pop_col_name] == pop_val][col].mean()
            for pop_val in lst_pop_val
        ]
        lst_pop_metric_std = [
            pdf_dataset[pdf_dataset[pop_col_name] == pop_val][col].std()
            for pop_val in lst_pop_val
        ]
        lst_pop_metric_sem = [
            pdf_dataset[pdf_dataset[pop_col_name] == pop_val][col].sem()
            for pop_val in lst_pop_val
        ]
        # 95% t-interval around the mean using the SEM; df = n - 1.
        lst_pop_metric_ci = [
            st.t.interval(
                0.95, tpl_val[0] - 1, loc=tpl_val[1], scale=tpl_val[2]
            )
            for tpl_val in zip(
                lst_pop_size, lst_pop_metric_mean, lst_pop_metric_sem
            )
        ]
        z = np.nan
        p = np.nan
        # Rank-sum for 2 populations, Kruskal-Wallis for >2. Any failure
        # (e.g. empty groups, identical values) leaves z/p as NaN.
        try:
            if len(lst_pop_val) == 2:
                z, p = st.ranksums(
                    *[
                        pdf_dataset.loc[pdf_dataset[pop_col_name] == pop_val][
                            col
                        ].dropna()
                        for pop_val in lst_pop_val
                    ]
                )
            if len(lst_pop_val) > 2:
                z, p = st.kruskal(
                    *[
                        pdf_dataset.loc[pdf_dataset[pop_col_name] == pop_val][
                            col
                        ].dropna()
                        for pop_val in lst_pop_val
                    ]
                )
        except Exception:  # pylint: disable=broad-exception-caught
            pass
        # Machine-readable row: one column per (population, statistic).
        pdf_ranksum = pd.DataFrame(
            dict(
                [("metric", col)]
                + [
                    (f"pop_{tpl_val[0]}_mean", tpl_val[1])
                    for tpl_val in zip(lst_pop_val, lst_pop_metric_mean)
                ]
                + [
                    (f"pop_{tpl_val[0]}_std", tpl_val[1])
                    for tpl_val in zip(lst_pop_val, lst_pop_metric_std)
                ]
                + [
                    (
                        f"pop_{tpl_val[0]}_ci",
                        f"({tpl_val[1][0]}, {tpl_val[1][1]})",
                    )
                    for tpl_val in zip(lst_pop_val, lst_pop_metric_ci)
                ]
                + [("rho", corr), ("z-statistic", z), ("p-value", p)]
            ),
            index=[0],
        )
        lst_ranksum.append(pdf_ranksum)
        # Human-readable row: "mean (STD: ..) (CI: .., ..)" per label.
        pdf_ranksum_pretty = pd.DataFrame(
            dict(
                [
                    (
                        tpl_val[0] + f" (N: {tpl_val[1]})",
                        f"{tpl_val[2]:0.2f} (STD: {tpl_val[3]:0.2f}) (CI: {tpl_val[4][0]:0.2f}, {tpl_val[4][1]:0.2f})",
                    )
                    for tpl_val in zip(
                        lst_labels,
                        lst_pop_size,
                        lst_pop_metric_mean,
                        lst_pop_metric_std,
                        lst_pop_metric_ci,
                    )
                ]
                + [
                    ("Z score", f"{z} (p: {p})"),
                    ("Correlation with indicator variable", corr),
                ]
            ),
            index=[col],
        )
        pdf_ranksum_pretty.index.name = "metric"
        lst_ranksum_pretty.append(pdf_ranksum_pretty)
    pdf_ranksum = pd.concat(lst_ranksum, ignore_index=True)
    pdf_ranksum_pretty = pd.concat(lst_ranksum_pretty)
    return pdf_ranksum, pdf_ranksum_pretty
# [docs]  (Sphinx doc-link extraction artifact; kept as comment)
def get_cont_table_statewise(
    pdf_included: pd.DataFrame,
    lst_metrics: List[str],
    pop_col_name: str,
    lst_count_metrics: List[str],
    dct_labels: Dict[int, str],
    lst_st: List[str],
    state_col_name: str,
) -> pd.DataFrame:
    """Contingency tables per state plus an "All States" slice, combined
    column-wise under a (state, ...) column hierarchy."""
    results = {}
    for state in ["All States"] + sorted(lst_st):
        if state == "All States":
            pdf_subset = pdf_included
        else:
            pdf_subset = pdf_included.loc[
                pdf_included[state_col_name] == state
            ]
        # Only the display-formatted table is kept per state.
        _, results[state] = get_contingency_table(
            pdf_subset,
            lst_metrics,
            pop_col_name,
            lst_count_metrics,
            dct_labels,
        )
    return pd.concat(results, axis=1)
# [docs]  (Sphinx doc-link extraction artifact; kept as comment)
def get_ranksum_table_statewise(
    pdf_included: pd.DataFrame,
    lst_metrics: List[str],
    pop_col_name: str,
    dct_labels: Dict[int, str],
    lst_st: List[str],
    state_col_name: str,
) -> pd.DataFrame:
    """Rank-sum tables per state plus an "All States" slice, combined
    column-wise under a (state, ...) column hierarchy."""
    results = {}
    for state in ["All States"] + sorted(lst_st):
        if state == "All States":
            pdf_subset = pdf_included
        else:
            pdf_subset = pdf_included.loc[
                pdf_included[state_col_name] == state
            ]
        # Only the display-formatted table is kept per state.
        _, results[state] = get_ranksum_table(
            pdf_subset, lst_metrics, pop_col_name, dct_labels
        )
    return pd.concat(results, axis=1)
# [docs]  (Sphinx doc-link extraction artifact; kept as comment)
def color_positive_green(x: pd.DataFrame) -> pd.DataFrame:
    """Styler helper: highlight positive values in "Correlation with ..."
    numeric columns.

    Intended for ``DataFrame.style.apply(..., axis=None)``: returns a
    DataFrame of CSS strings with the same shape/index/columns as ``x``.

    FIX: the original called ``col.lower()`` directly, which raises
    AttributeError when ``x`` has MultiIndex (tuple) columns — exactly
    what the statewise tables produce. We now match on the innermost
    label, which is backward compatible for plain string columns.
    """
    highlight = "background-color: #d1cfa1"
    no_style = ""

    def _is_corr_col(col: Any) -> bool:
        # Columns may be plain strings or tuples (MultiIndex); match the
        # innermost label either way.
        label = col[-1] if isinstance(col, tuple) else col
        return isinstance(label, str) and label.lower().startswith(
            "correlation with"
        )

    numeric_cols = x.select_dtypes(np.number).columns
    cols = [col for col in numeric_cols if _is_corr_col(col)]
    styles = pd.DataFrame(no_style, index=x.index, columns=x.columns)
    styles[cols] = np.where(x[cols] > 0, highlight, no_style)
    return styles
# [docs]  (Sphinx doc-link extraction artifact; kept as comment)
def get_descriptives(
    pdf: pd.DataFrame,
    lst_st: List[str],
    lst_col: List[str],
    state_col_name: str,
) -> pd.DataFrame:
    """Summary statistics for ``lst_col`` per state, plus a "Σ All States"
    slice computed over the whole frame.

    Returns a long table indexed by metric name, one row per
    (metric, state), with describe() percentiles 1/25/50/75/95/99.
    """
    frames = []
    for state in lst_st + ["Σ All States"]:
        if state == "Σ All States":
            subset = pdf
        else:
            subset = pdf.loc[pdf[state_col_name] == state]
        desc = (
            subset[lst_col]
            .describe(percentiles=[0.01, 0.25, 0.5, 0.75, 0.95, 0.99])
            .T.reset_index()
            .rename(columns={"index": "metric"})
        )
        desc[state_col_name] = state
        # Put the state and metric columns first.
        leading = [state_col_name, "metric"]
        desc = desc[leading + [c for c in desc.columns if c not in leading]]
        frames.append(desc)
    combined = pd.concat(frames, ignore_index=True)
    return combined.sort_values(by=["metric", state_col_name]).set_index(
        "metric"
    )
# [docs]  (Sphinx doc-link extraction artifact; kept as comment)
def get_missingness_stats(df: dd.DataFrame, outputfname: str) -> pd.DataFrame:
    """Per-state missing-data counts over a dask DataFrame, saved to Excel.

    Each partition is grouped by STATE_CD and reduced to one row of counts;
    partial counts from different partitions are then summed per state, and
    an "ALL STATES" total row is appended. The result is written to
    ``outputfname`` and returned.

    NOTE(review): assumes the columns STATE_CD, ruca_code, out_of_state,
    distance_to_fqhc, Female, race_hrsa, LST_DIAG_CD and LST_NDC exist —
    confirm against the upstream schema.
    """
    df_missing = df.map_partitions(
        lambda pdf: pdf.groupby("STATE_CD").apply(
            lambda gpdf: pd.DataFrame(
                {
                    "n_benes": gpdf.shape[0],
                    # RUCA code missing if unparseable or the -80 sentinel.
                    "n_missing_ruca_codes": gpdf.loc[
                        pd.to_numeric(
                            gpdf["ruca_code"], errors="coerce"
                        ).isna()
                        | (
                            pd.to_numeric(gpdf["ruca_code"], errors="coerce")
                            == -80
                        )
                    ].shape[0],
                    # NaN out_of_state flag means the zip code never matched.
                    "n_unmatched_zip_codes": gpdf.loc[
                        gpdf["out_of_state"].isna()
                    ].shape[0],
                    "n_out_of_state_benes": gpdf.loc[
                        gpdf["out_of_state"] == 1
                    ].shape[0],
                    # ~(x >= 0) also catches NaN, not just negatives.
                    "n_missing_distance_to_fqhc": gpdf.loc[
                        ~(
                            pd.to_numeric(
                                gpdf["distance_to_fqhc"], errors="coerce"
                            )
                            >= 0
                        )
                    ].shape[0],
                    "n_missing_gender": gpdf.loc[gpdf["Female"].isna()].shape[
                        0
                    ],
                    # 0 / -1 appear to be "unknown race" sentinels; note the
                    # column name differs from the n_missing_* convention —
                    # TODO confirm intended.
                    "race_hrsa": gpdf.loc[
                        gpdf["race_hrsa"].isin([0, -1])
                    ].shape[0],
                    "n_benes_with_no_diag_codes": gpdf.loc[
                        gpdf["LST_DIAG_CD"].isna()
                        | (gpdf["LST_DIAG_CD"].str.strip() == "")
                    ].shape[0],
                    "n_benes_with_no_ndc": gpdf.loc[
                        gpdf["LST_NDC"].isna()
                        | (gpdf["LST_NDC"].str.strip() == "")
                    ].shape[0],
                },
                # One summary row per state, indexed by the state code.
                index=[gpdf["STATE_CD"].values[0]],
            )
        )
    ).compute()
    # Drop the inner apply() index level, then sum partial per-partition
    # counts into one row per state.
    df_missing = df_missing.reset_index()
    del df_missing["level_1"]
    df_missing = df_missing.set_index(["STATE_CD"])
    df_missing = df_missing.groupby("STATE_CD").sum()
    # Append an overall total row.
    df_missing_with_total = pd.concat([
        df_missing.reset_index(drop=False),
        pd.DataFrame(
            {**{"STATE_CD": "ALL STATES"}, **df_missing.sum(axis=0).to_dict()},
            index=[0],
        )]
    )
    df_missing_with_total.to_excel(outputfname, index=False)
    return df_missing_with_total
# [docs]  (Sphinx doc-link extraction artifact; kept as comment)
def get_utilisation_histograms(pdf: pd.DataFrame, lst_covar: List[str]) -> Any:
    """One histogram per utilisation covariate, stacked in a single column.

    Known visit/cost columns get friendly axis labels and titles; unknown
    covariates fall back to their column name. Returns a holoviews layout.
    """
    dct_new_util_vist_names = {
        "nhrsapcvst": "No. PC Visits",
        "nhrsa_nonfqhc_pcvst": "No. PC Visits (excluding FQHC)",
        "nhrsaotvst": "No. OT Visits",
        "notfqhcvst": "No. FQHC Visits",
        "nipvst": "No. IP Visits",
        "nedvst": "No. ED Visits",
        "nrxvst": "No. Rx Visits",
    }
    dct_new_util_cost_names = {
        "hrsapccost": "PC Cost",
        "hrsa_nonfqhc_pccost": "PC Cost (excluding FQHC)",
        "hrsaotcost": "OT Cost",
        "otfqhccost": "FQHC Cost",
        "ipcost": "IP Cost",
        "edcost": "ED Cost",
        "rxcost": "Rx Cost",
    }
    titles = {**dct_new_util_vist_names, **dct_new_util_cost_names}
    # Accumulate subplots with holoviews "+" composition.
    layout = ()
    for covar in lst_covar:
        label = titles.get(covar, covar)
        layout = layout + pdf.hvplot.hist(
            covar,
            xlabel=label,
            ylabel="No. Benes",
            title=label,
        )
    return layout.opts(opts.Layout(shared_axes=False)).cols(1)
# [docs]  (Sphinx doc-link extraction artifact; kept as comment)
def get_covar_plots(
    pdf: pd.DataFrame,
    lst_covar: List[str],
    lst_hist_covar: List[str],
    cut_outliers: bool = False,
) -> Any:
    """Build a single-column holoviews layout of covariate plots.

    Special cases: "age" gets a bar chart (plus an age-in-months chart when
    there are under-1-year-olds) and "hrsa_fqhc_ratio" gets two histograms
    (all benes, and benes with any FQHC visit). Every other covariate gets a
    histogram if listed in ``lst_hist_covar``, else a bar chart of counts.

    Args:
        pdf: per-beneficiary data with the covariate columns.
        lst_covar: covariates to plot.
        lst_hist_covar: subset of covariates plotted as histograms.
        cut_outliers: if True, mask histogram covariates above their 99th
            percentile before plotting.
    """
    # Friendly titles for known covariates; unknown ones use their name.
    dct_known_titles = {
        "TotElMon": "Total Eligible Months",
        "total_ffs_month": "Total FFS Months",
        "total_mc_month": "Total MC Months",
        "hrsa_fqhc_ratio": "% of FQHC Visits (in Benes who have FQHC visits)",
        "elixhauser_score": "Elixhauser Score",
        "cdps_mrx_score": "CDPS MRX Score",
        "pmca_cond_less": "PMCA Score",
        "pmca_cond_more": "PMCA Score (conservative)",
    }
    plots = None
    if "age" in lst_covar:
        # Bar chart of beneficiary counts per age (in years).
        age_plots = (
            pdf.groupby("age")
            .size()
            .reset_index()
            .rename(columns={0: "No. Benes"})
            .hvplot.bar("age", title="Age")
        )
        # Extra month-level resolution for infants, when present.
        if pdf.loc[pdf["age"] < 1].shape[0] > 0:
            age_plots = age_plots + pdf.loc[pdf["age"] < 1].groupby(
                "age_in_months"
            ).size().reset_index().rename(columns={0: "No. Benes"}).hvplot.bar(
                "age_in_months", title="Age in months (<1 year old benes)"
            )
        plots = age_plots if (plots is None) else (plots + age_plots)
    if "hrsa_fqhc_ratio" in lst_covar:
        # All benes vs benes with at least one FQHC visit.
        fqhc_plots = pdf.hvplot.hist(
            "hrsa_fqhc_ratio", ylabel="No. Benes", title="% of FQHC Visits"
        ) + pdf.loc[pdf["hrsa_fqhc_ratio"] > 0].hvplot.hist(
            "hrsa_fqhc_ratio",
            ylabel="No. Benes",
            title=dct_known_titles["hrsa_fqhc_ratio"],
        )
        plots = fqhc_plots if (plots is None) else (plots + fqhc_plots)
    pdf_temp = pdf.copy()
    if cut_outliers:
        # Mask values above the 99th percentile (they become NaN and are
        # dropped by the histogram) for the histogram covariates only.
        pdf_temp = pdf.assign(
            **{
                covar: pdf[covar].where(
                    pdf[covar] <= pdf[covar].quantile(0.99)
                )
                for covar in lst_hist_covar
            }
        )
    for covar in lst_covar:
        # Age-related covariates were already handled above.
        if covar not in ["age", "hrsa_fqhc_ratio", "ageday", "age_in_months"]:
            subplot = (
                pdf_temp.groupby(covar)
                .size()
                .reset_index()
                .rename(columns={0: "No. Benes"})
                .hvplot.bar(covar, title=dct_known_titles.get(covar, covar))
                if (covar not in lst_hist_covar)
                else pdf_temp.hvplot.hist(
                    covar,
                    ylabel="No. Benes",
                    title=dct_known_titles.get(covar, covar),
                )
            )
            plots = subplot if (plots is None) else (plots + subplot)
    # NOTE: raises AttributeError if lst_covar produced no plots at all.
    return plots.opts(opts.Layout(shared_axes=False)).cols(1)
# [docs]  (Sphinx doc-link extraction artifact; kept as comment)
def compute_descriptives(
    pdf: pd.DataFrame,
    lst_states: List[str],
    lst_metrics: List[str],
    output_fname: str,
    state_col_name: str = "STATE_CD",
) -> pd.DataFrame:
    """Compute statewise descriptives, save the full table to Excel, and
    return only the all-states rows (state column dropped)."""
    descriptives = get_descriptives(
        pdf, lst_states, lst_metrics, state_col_name
    )
    descriptives.to_excel(output_fname)
    print(f"Statewise stats is saved at {os.path.abspath(output_fname)}")
    # Keep only the "Σ All States" slice for display.
    all_states_mask = descriptives[state_col_name].str.contains("All")
    descriptives = descriptives.loc[all_states_mask]
    del descriptives[state_col_name]
    return descriptives
# [docs]  (Sphinx doc-link extraction artifact; kept as comment)
def compute_t_stats(
    pdf: pd.DataFrame,
    lst_states: List[str],
    lst_metrics: List[str],
    output_fname: str,
    pop_col_name: str = "gt_50pc_hrsa_fqhc",
    dct_labels: Optional[Dict[int, str]] = None,
    state_col_name: str = "STATE_CD",
) -> "pd.io.formats.style.Styler":
    """Compute statewise rank-sum stats, save them to Excel, and return a
    styled all-states view (positive correlations highlighted).

    NOTE(review): the statewise table has (state, field) MultiIndex
    columns; selecting ``[state_col_name]`` presumes that column is
    addressable on the combined frame — confirm against
    get_ranksum_table_statewise output.
    """
    labels = {0: "non-FQHC", 1: "FQHC"} if dct_labels is None else dct_labels
    stats_table = get_ranksum_table_statewise(
        pdf, lst_metrics, pop_col_name, labels, lst_states, state_col_name
    )
    stats_table.to_excel(output_fname)
    print(f"Statewise T stats is saved at {os.path.abspath(output_fname)}")
    all_states_mask = stats_table[state_col_name].str.contains("All")
    stats_table = stats_table.loc[all_states_mask]
    print(
        f"T stats for {stats_table[state_col_name].values[0].replace('Σ', '')}"
    )
    del stats_table[state_col_name]
    return stats_table.style.apply(color_positive_green, axis=None)
# [docs]  (Sphinx doc-link extraction artifact; kept as comment)
def compute_contingency_table(
    pdf: pd.DataFrame,
    lst_states: List[str],
    lst_metrics: List[str],
    lst_count_metrics: List[str],
    output_fname: str,
    pop_col_name: str = "gt_50pc_hrsa_fqhc",
    dct_labels: Optional[Dict[int, str]] = None,
    state_col_name: str = "STATE_CD",
) -> "pd.io.formats.style.Styler":
    """Build the statewise contingency table, save it to Excel, and return
    a styled all-states view (positive correlations highlighted)."""
    labels = {0: "non-FQHC", 1: "FQHC"} if dct_labels is None else dct_labels
    crosstab = get_cont_table_statewise(
        pdf,
        lst_metrics,
        pop_col_name,
        lst_count_metrics,
        labels,
        lst_states,
        state_col_name,
    )
    crosstab.to_excel(output_fname)
    print(
        f"Statewise contingency table is saved at {os.path.abspath(output_fname)}"
    )
    # Keep only the "All States" slice for display.
    all_states_mask = crosstab[state_col_name].str.contains("All")
    crosstab = crosstab.loc[all_states_mask]
    print(
        f"Contingency table for {crosstab[state_col_name].values[0].replace('Σ', '')}"
    )
    del crosstab[state_col_name]
    return crosstab.style.apply(color_positive_green, axis=None)
# [docs]  (Sphinx doc-link extraction artifact; kept as comment)
def compute_missing_stats(
    df: dd.DataFrame,
    output_fname: str,
    state_col_name: str = "STATE_CD",
) -> pd.DataFrame:
    """Per-state missing-data counts over a dask DataFrame; Excel + summary.

    Each partition is reduced to per-state counts of missing/sentinel
    values, partial counts are summed across partitions, an "All States"
    total row is appended, the full table is saved to ``output_fname``,
    and only the transposed all-states row is returned for display.

    NOTE(review): assumes the columns MSIS_ID, ruca_code, out_of_state,
    distance_to_fqhc, Female, race_hrsa, LST_DIAG_CD and LST_NDC exist —
    confirm against the upstream schema.
    """
    pdf_missing_stats = df.map_partitions(
        lambda pdf: pd.concat(
            [
                pdf.groupby(state_col_name).agg(
                    **{"n_benes": ("MSIS_ID", "count")}
                ),
                # RUCA missing if unparseable or the -80 sentinel.
                pdf.loc[
                    pd.to_numeric(pdf["ruca_code"], errors="coerce").isna()
                    | (pd.to_numeric(pdf["ruca_code"], errors="coerce") == -80)
                ]
                .groupby(state_col_name)
                .agg(**{"n_missing_ruca_codes": ("MSIS_ID", "count")}),
                pdf.loc[pdf["out_of_state"].isna()]
                .groupby(state_col_name)
                .agg(**{"n_out_of_state_benes": ("MSIS_ID", "count")}),
                # ~(x >= 0) also catches NaN, not just negative distances.
                pdf.loc[
                    ~(
                        pd.to_numeric(pdf["distance_to_fqhc"], errors="coerce")
                        >= 0
                    )
                ]
                .groupby(state_col_name)
                .agg(**{"n_missing_distance_to_fqhc": ("MSIS_ID", "count")}),
                pdf.loc[pdf["Female"].isna()]
                .groupby(state_col_name)
                .agg(**{"n_missing_gender": ("MSIS_ID", "count")}),
                # 0 / -1 appear to be "unknown race" sentinels — TODO confirm.
                pdf.loc[pdf["race_hrsa"].isin([0, -1])]
                .groupby(state_col_name)
                .agg(**{"n_missing_race": ("MSIS_ID", "count")}),
                pdf.loc[
                    pdf["LST_DIAG_CD"].isna()
                    | (pdf["LST_DIAG_CD"].str.strip() == "")
                ]
                .groupby(state_col_name)
                .agg(**{"n_benes_with_no_diag_codes": ("MSIS_ID", "count")}),
                pdf.loc[
                    pdf["LST_NDC"].isna() | (pdf["LST_NDC"].str.strip() == "")
                ]
                .groupby(state_col_name)
                .agg(**{"n_benes_with_no_ndc_codes": ("MSIS_ID", "count")}),
            ],
            axis=1,
        )
    ).compute()
    # Sum partial per-partition counts into one row per state.
    pdf_missing_stats = pdf_missing_stats.groupby(
        pdf_missing_stats.index
    ).sum()
    # Append an "All States" total row and normalize counts to int.
    pdf_missing_stats = (
        pd.concat([
            pdf_missing_stats.reset_index(drop=False)
            .rename(columns={"index": state_col_name}),
            pd.DataFrame(
                {
                    **{state_col_name: "All States"},
                    **pdf_missing_stats.sum(axis=0).to_dict(),
                },
                index=[0],
            )
        ])
        .set_index(state_col_name)
        .astype(int)
        .reset_index(drop=False)
    )
    pdf_missing_stats.to_excel(output_fname)
    print(
        f"Statewise missing stats is saved at {os.path.abspath(output_fname)}"
    )
    # Keep only the totals row; return transposed (one stat per row).
    pdf_missing_stats = pdf_missing_stats.loc[
        pdf_missing_stats[state_col_name].str.contains("All")
    ]
    print(
        f"Missing stats for {pdf_missing_stats[state_col_name].values[0].replace('Σ', '')}"
    )
    del pdf_missing_stats[state_col_name]
    return pdf_missing_stats.T