SNF Model Ablation And ML Value

Modified

May 15, 2026

SNF Model Ablation And ML Value

Executive takeaway: The business notebook shows that automated ranking improves the SNF payroll approval workflow. This notebook tests whether the modeling stack earns its complexity by comparing manual thresholds, deterministic rules, robust statistics, unsupervised ML, and the hybrid ranking under approval-budget, dollar-capture, temporal, uncertainty, and robustness views.

Synthetic labels and injected anomaly dollars are used here only for evaluation. They are not scoring features and they should not appear in administrator-facing queues.

import polars as pl
from common.execution import notebook_fast_mode
from common.plots import (
    LetsPlot,
    aes,
    geom_bar,
    geom_line,
    geom_point,
    geom_tile,
    ggplot,
    labs,
    rotated_x_labels,
    theme_minimal,
)

from payroll_anomaly_ranking.columns import (
    AggregateCol,
    MetricCol,
    PayrollCol,
    ReviewCol,
    ScoreCol,
)
from payroll_anomaly_ranking.config import PayrollConfig
from payroll_anomaly_ranking.evaluation import evaluate_scores
from payroll_anomaly_ranking.pipeline import PipelineIncludeConfig, run_pipeline
from payroll_anomaly_ranking.scenarios import diagnostic_scenario_presets

# Prepare the plotting backend for HTML output before any chart is built.
LetsPlot.setup_html()

# Full-size synthetic run, plus a smaller variant used when the notebook is
# executed in fast mode.
config = PayrollConfig(
    employee_count=200,
    pay_periods=14,
    review_budgets=(10, 25, 50),
)
FAST_CONFIG = PayrollConfig(
    employee_count=90,
    pay_periods=10,
    review_budgets=(10, 25),
)
NOTEBOOK_FAST = notebook_fast_mode()

if NOTEBOOK_FAST:
    # Fast mode: smaller dataset, with the validation and aggregation stages
    # switched off and every other listed stage kept on.
    active_config = FAST_CONFIG
    active_include = PipelineIncludeConfig(
        validation=False,
        aggregations=False,
        evaluation=True,
        backtest=True,
        rolling_origin=True,
        review_queues=True,
        leakage_checks=True,
    )
else:
    active_config = config
    active_include = PipelineIncludeConfig.all()

# Run the pipeline on the single "premium-mismatch" diagnostic scenario.
scenario = diagnostic_scenario_presets(("premium-mismatch",))["premium-mismatch"]
results = run_pipeline(active_config, scenario=scenario, include=active_include)

# Score columns compared in the ablation. Each entry is
# (display label, score column, method family); the three fields are unpacked
# in that order by _score_method_budget_metrics.
SCORE_METHODS = [
    ("rule score", ScoreCol.RULE_SCORE, "deterministic rules"),
    ("robust statistical score", ScoreCol.STATISTICAL_SCORE, "robust statistics"),
    (
        "schedule/timeclock score",
        ScoreCol.SCHEDULE_TIMECLOCK_SCORE,
        "operational context",
    ),
    (
        "premium eligibility score",
        ScoreCol.PREMIUM_ELIGIBILITY_SCORE,
        "pay policy context",
    ),
    ("ML score", ScoreCol.ML_SCORE, "unsupervised ML"),
    ("hybrid score", ScoreCol.FINAL_ANOMALY_SCORE, "hybrid ranking"),
]
# Manual-threshold baselines as (display label, flag column). The flag columns
# are compared against 1 (flagged) and 0 (not flagged) in
# _threshold_error_summary.
THRESHOLD_FLAGS = [
    ("gross pay threshold", ScoreCol.THRESHOLD_GROSS_PAY_FLAG),
    ("total hours threshold", ScoreCol.THRESHOLD_TOTAL_HOURS_FLAG),
    ("overtime threshold", ScoreCol.THRESHOLD_OVERTIME_HOURS_FLAG),
    ("premium dollars threshold", ScoreCol.THRESHOLD_PREMIUM_DOLLARS_FLAG),
    ("paid-vs-scheduled threshold", ScoreCol.THRESHOLD_PAID_VS_SCHEDULED_FLAG),
]


def _score_method_budget_metrics(scored: pl.DataFrame, budget: int) -> pl.DataFrame:
    """Return approval-budget metrics for every entry in ``SCORE_METHODS``.

    For each score column the review queue is the ``budget`` highest-scoring
    rows of every pay period. Precision, recall and dollar capture are then
    measured against the evaluation-only anomaly label and anomaly dollars.

    Args:
        scored: Scored payroll records, including the pay-period index, the
            score columns, the estimated exposure, and the evaluation labels.
        budget: Number of records taken per pay period.

    Returns:
        One row per score method, with ``MetricCol.K`` stored as a float.
    """
    is_anomaly = pl.col(PayrollCol.IS_ANOMALY) == 1
    anomaly_dollar_sum = pl.sum(PayrollCol.ANOMALY_DOLLARS)

    # Denominators shared by every method: all labelled anomalies and their dollars.
    labelled = scored.filter(is_anomaly)
    anomaly_count = labelled.height
    anomaly_dollar_total = float(labelled.select(anomaly_dollar_sum).item() or 0.0)

    def _metrics_for(method: str, score_col: str, method_type: str) -> dict:
        # Sort by period, then by descending score, so the head of each
        # period group is that period's review queue.
        queue = (
            scored.sort(
                [PayrollCol.PAY_PERIOD_INDEX, score_col],
                descending=[False, True],
            )
            .group_by(PayrollCol.PAY_PERIOD_INDEX)
            .head(budget)
        )
        hits = queue.filter(is_anomaly)
        hit_count = hits.height
        hit_dollars = float(hits.select(anomaly_dollar_sum).item() or 0.0)
        queue_exposure = float(
            queue.select(pl.sum(ScoreCol.ESTIMATED_EXPOSURE)).item() or 0.0,
        )
        if anomaly_dollar_total:
            capture_rate = hit_dollars / anomaly_dollar_total
        else:
            capture_rate = 0.0
        return {
            "method": method,
            "method_type": method_type,
            MetricCol.K: float(budget),
            MetricCol.REVIEW_VOLUME: queue.height,
            MetricCol.PRECISION_AT_K: hit_count / max(queue.height, 1),
            MetricCol.RECALL_AT_K: hit_count / max(anomaly_count, 1),
            MetricCol.EXPOSURE_CAPTURED_AT_K: queue_exposure,
            MetricCol.DOLLARS_CAPTURED_AT_K: hit_dollars,
            MetricCol.DOLLAR_CAPTURE_RATE: capture_rate,
        }

    return pl.DataFrame([_metrics_for(*spec) for spec in SCORE_METHODS])


def _method_ladder(scored: pl.DataFrame) -> pl.DataFrame:
    """Stack manual-threshold baselines and score-method metrics into one table.

    Score-method rows are produced for every budget in
    ``active_config.review_budgets``. Manual-threshold rows come from
    ``evaluate_scores(...).threshold_baseline_metrics``; they carry no budget
    of their own, so they are stamped with the first configured budget.

    Args:
        scored: Scored payroll records with evaluation-only labels.

    Returns:
        Threshold rows followed by score-method rows, sharing the score-method
        column layout.
    """
    score_rows = pl.concat(
        [
            _score_method_budget_metrics(scored, budget)
            for budget in active_config.review_budgets
        ],
        how="diagonal",
    )

    total_dollars = float(
        scored.filter(pl.col(PayrollCol.IS_ANOMALY) == 1)
        .select(pl.sum(PayrollCol.ANOMALY_DOLLARS))
        .item()
        or 0.0,
    )
    # Use the same zero guard as _score_method_budget_metrics. Flooring the
    # denominator at 1.0 would understate the rate whenever total anomaly
    # dollars fall between 0 and 1.
    if total_dollars:
        capture_rate = pl.col(MetricCol.DOLLARS_CAPTURED_AT_K) / total_dollars
    else:
        capture_rate = pl.lit(0.0)

    threshold_rows = evaluate_scores(scored, active_config).threshold_baseline_metrics
    threshold_rows = threshold_rows.with_columns(
        # replace_all, not replace: str.replace only rewrites the first match,
        # which left labels such as "manual threshold_pack" in the table.
        pl.col("baseline").str.replace_all("_", " ", literal=True).alias("method"),
        pl.lit("manual threshold").alias("method_type"),
        pl.lit(float(active_config.review_budgets[0])).alias(MetricCol.K),
        capture_rate.alias(MetricCol.DOLLAR_CAPTURE_RATE),
    ).select(score_rows.columns)
    return pl.concat([threshold_rows, score_rows], how="diagonal")


def _threshold_error_summary(scored: pl.DataFrame) -> pl.DataFrame:
    """Summarise the error profile of each manual threshold in ``THRESHOLD_FLAGS``.

    Uses the evaluation-only anomaly label to count, per threshold flag, how
    many flagged records are true anomalies or false positives, and how many
    anomalies (and anomaly dollars) the flag leaves unflagged.

    Args:
        scored: Scored payroll records with threshold flag columns and labels.

    Returns:
        One row per threshold.
    """
    is_anomaly = pl.col(PayrollCol.IS_ANOMALY) == 1
    anomaly_count = scored.filter(is_anomaly).height

    def _summarise(method: str, flag: str) -> dict:
        flagged = scored.filter(pl.col(flag) == 1)
        # Anomalies the threshold did not flag.
        missed = scored.filter((pl.col(flag) == 0) & is_anomaly)
        missed_count = missed.height
        missed_dollars = float(
            missed.select(pl.sum(PayrollCol.ANOMALY_DOLLARS)).item() or 0.0,
        )
        return {
            "method": method,
            MetricCol.REVIEW_VOLUME: flagged.height,
            AggregateCol.TRUE_ANOMALIES: flagged.filter(is_anomaly).height,
            "false_positives": flagged.filter(
                pl.col(PayrollCol.IS_ANOMALY) == 0,
            ).height,
            "false_negatives": missed_count,
            "missed_anomaly_rate": missed_count / max(anomaly_count, 1),
            "missed_synthetic_dollars": missed_dollars,
        }

    return pl.DataFrame([_summarise(label, flag) for label, flag in THRESHOLD_FLAGS])


def _component_heatmap_data(scored: pl.DataFrame, *, top_n: int = 20) -> pl.DataFrame:
    """Return long-format component scores for the top of the latest-period queue.

    Args:
        scored: Scored payroll records.
        top_n: Number of highest final-score records to keep.

    Returns:
        A frame with columns ``queue_row`` (1-based rank by final anomaly
        score), ``component`` (score column name) and ``score``.
    """
    component_columns = [
        ScoreCol.RULE_SCORE,
        ScoreCol.STATISTICAL_SCORE,
        ScoreCol.SCHEDULE_TIMECLOCK_SCORE,
        ScoreCol.PREMIUM_ELIGIBILITY_SCORE,
        ScoreCol.ML_SCORE,
        ScoreCol.EXPOSURE_SCORE,
        ScoreCol.FINAL_ANOMALY_SCORE,
    ]

    newest_period = scored.select(pl.max(PayrollCol.PAY_PERIOD_INDEX)).item()
    newest_rows = scored.filter(pl.col(PayrollCol.PAY_PERIOD_INDEX) == newest_period)

    # Rank by the hybrid score; the row index becomes the queue position.
    ranked = newest_rows.sort(ScoreCol.FINAL_ANOMALY_SCORE, descending=True)
    queue = ranked.with_row_index("queue_row", offset=1).head(top_n)
    wide = queue.select(["queue_row", *component_columns])

    return wide.unpivot(
        index="queue_row",
        variable_name="component",
        value_name="score",
    )

Evaluation Design

The ablation uses later-period synthetic labels only for evaluation. Scoring features remain leakage-safe and exclude injected labels, anomaly categories, and anomaly dollars. The operational metric is approval-budget ranking: what the system captures inside the number of records an administrator can realistically review each payroll cycle.

# Display the pipeline's leakage checks: one row per named check with a pass flag.
results.leakage_checks

shape: (3, 2)

check	passed
str	bool
"model_features_exclude_labels"	true
"analyst_queue_excludes_labels"	true
"scoring_features_exclude_anoma…	true

Method-Complexity Ladder

The ladder below separates the value of each method family. Manual thresholds are the baseline workflow; rules add deterministic SNF approval logic; robust statistics add peer/history context; ML adds multivariate unusualness; hybrid ranking combines those signals with exposure so the queue is useful under constrained review capacity.

# Build the ladder once; later cells reuse `method_ladder`.
method_ladder = _method_ladder(results.scored)
# Display only: sort() returns a new frame, so `method_ladder` itself keeps its
# original row order. Shown by ascending budget, then best dollar capture first.
method_ladder.sort(
    [MetricCol.K, MetricCol.DOLLAR_CAPTURE_RATE],
    descending=[False, True],
)

shape: (25, 9)

method	method_type	k	review_volume	precision_at_k	recall_at_k	exposure_captured_at_k	dollars_captured_at_k	dollar_capture_rate
str	str	f64	i64	f64	f64	f64	f64	f64
"rule score"	"deterministic rules"	10.0	140	0.892857	0.961538	19457.682525	2630.74	0.960531
"premium eligibility score"	"pay policy context"	10.0	140	0.835714	0.9	16415.8028	2437.18	0.889858
"manual threshold_pack"	"manual threshold"	10.0	1672	0.046651	0.6	210852.845442	1832.1	0.668933
"facility payroll_variance_thre…	"manual threshold"	10.0	2588	0.016615	0.330769	318221.704375	915.21	0.33416
"hybrid score"	"hybrid ranking"	10.0	140	0.1	0.107692	45905.20535	344.58	0.125812
…	…	…	…	…	…	…	…	…
"rule score"	"deterministic rules"	50.0	700	0.185714	1.0	79998.595028	2738.84	1.0
"hybrid score"	"hybrid ranking"	50.0	700	0.175714	0.946154	134582.36125	2610.24	0.953046
"ML score"	"unsupervised ML"	50.0	700	0.052857	0.284615	132068.855756	853.37	0.311581
"robust statistical score"	"robust statistics"	50.0	700	0.021429	0.115385	138951.437431	368.24	0.134451
"schedule/timeclock score"	"operational context"	50.0	700	0.0	0.0	54056.884232	0.0	0.0

# NOTE: despite the name, top_budget is the FIRST configured review budget
# (review_budgets[0]). That is also the K value _method_ladder stamps on the
# manual-threshold rows, so those baselines are included in this comparison.
top_budget = float(active_config.review_budgets[0])
budget_ladder = method_ladder.filter(pl.col(MetricCol.K) == top_budget).with_columns(
    (
        pl.col(MetricCol.EXPOSURE_CAPTURED_AT_K)
        # Floor the denominator at 1 to avoid dividing by a zero review volume.
        / pl.max_horizontal(pl.col(MetricCol.REVIEW_VOLUME), pl.lit(1))
    ).alias("exposure_per_review"),
)
# Bar chart: estimated exposure per reviewed record, coloured by method family.
(
    ggplot(budget_ladder, aes("method", "exposure_per_review"))
    + geom_bar(aes(fill="method_type"), stat="identity")
    + labs(
        title="Incremental method value at the first approval budget",
        x="Method",
        y="Estimated exposure per reviewed record",
        fill="Method family",
    )
    + theme_minimal()
    + rotated_x_labels()
)

# Dollar capture rate against review budget, one coloured series per method.
# Manual-threshold rows exist at a single K (the first budget), so each of
# them contributes one point rather than a trend.
(
    ggplot(method_ladder, aes(MetricCol.K, MetricCol.DOLLAR_CAPTURE_RATE))
    + geom_line(aes(color="method"))
    + geom_point(aes(color="method"), size=3)
    + labs(
        title="Dollar capture improves or degrades as review budget changes",
        x="Review budget per pay period",
        y="Synthetic dollar capture rate",
        color="Method",
    )
    + theme_minimal()
)

Component Contribution Heatmap

The heatmap makes hybrid ranking explainable at the record level. Rows are the top latest-period queue records. Columns show the major component scores that contribute to the final approval exception score.

# Long-format component scores for the top latest-period records (default top_n=20).
component_heatmap = _component_heatmap_data(results.scored)
# One tile per (score component, queue row), filled by the score value.
(
    ggplot(component_heatmap, aes("component", "queue_row", fill="score"))
    + geom_tile()
    + labs(
        title="Top queue records by score component",
        x="Score component",
        y="Latest-period queue row",
        fill="Score",
    )
    + theme_minimal()
    + rotated_x_labels()
)

Manual Threshold Error Modes

This section intentionally uses evaluation-only synthetic labels. It shows why threshold review is fragile: a broad threshold can create many false positives, while a narrow threshold can leave synthetic anomaly dollars outside the review queue.

# Per-threshold error counts, computed with evaluation-only labels.
threshold_errors = _threshold_error_summary(results.scored)
# Display only: sort() returns a new frame; `threshold_errors` keeps its original order.
threshold_errors.sort("missed_synthetic_dollars", descending=True)

shape: (5, 7)

method	review_volume	true_anomalies	false_positives	false_negatives	missed_anomaly_rate	missed_synthetic_dollars
str	i64	i64	i64	i64	f64	f64
"gross pay threshold"	0	0	0	130	1.0	2738.84
"paid-vs-scheduled threshold"	307	0	307	130	1.0	2738.84
"premium dollars threshold"	1	1	0	129	0.992308	2703.6
"total hours threshold"	25	1	24	129	0.992308	2702.37
"overtime threshold"	25	1	24	129	0.992308	2702.37

# Bar chart of the synthetic anomaly dollars each manual threshold fails to flag.
(
    ggplot(threshold_errors, aes("method", "missed_synthetic_dollars"))
    + geom_bar(stat="identity", fill="#b45309")
    + labs(
        title="Evaluation-only synthetic dollars missed by manual thresholds",
        x="Manual threshold",
        y="Missed synthetic anomaly dollars",
    )
    + theme_minimal()
    + rotated_x_labels()
)

Temporal, Uncertainty, And Calibration Context

A stronger data-science proof does not stop at one aggregate table. The views below check whether selected thresholds transfer over time, whether uncertainty buckets carry useful signal, and whether expected-pay intervals behave plausibly on synthetic evaluation records.

# Display per-score ranking metrics: precision/recall/F1 at k, rank statistics, and PR AUC.
results.model_comparison

shape: (6, 8)

model	k	precision_at_k	recall_at_k	f1_at_k	average_anomaly_rank	mean_reciprocal_rank	pr_auc
str	f64	f64	f64	f64	f64	f64	f64
"rule_score"	10.0	0.892857	0.961538	0.925926	5.307692	0.306057	1.0
"statistical_score"	10.0	0.028571	0.030769	0.02963	398.669231	0.01279	0.013481
"schedule_timeclock_score"	10.0	0.0	0.0	0.0	484.469231	0.003116	0.009075
"premium_eligibility_score"	10.0	0.835714	0.9	0.866667	5.723077	0.291564	0.872483
"ml_score"	10.0	0.028571	0.030769	0.02963	60.953846	0.048181	0.084428
"hybrid_score"	10.0	0.1	0.107692	0.103704	30.438462	0.094971	0.17436

# Display rolling-origin results: one row per origin with its train/validation/test
# periods, the selected threshold, and the budget metrics on the test period.
results.rolling_origin_metrics

shape: (9, 20)

origin	train_start_period	train_end_period	validation_period	test_period	selected_threshold	validation_f1	threshold_precision	threshold_recall	k	review_volume	native_review_burden	precision_at_k	recall_at_k	f1_at_k	exposure_captured_at_k	exposure_per_review	dollars_captured_at_k	dollar_capture_rate	test_score_mean
i64	i64	i64	i64	i64	f64	f64	f64	f64	f64	f64	f64	f64	f64	f64	f64	f64	f64	f64	f64
1	1	3	4	5	0.35	0.205882	0.140845	1.0	10.0	60.0	60.0	0.166667	1.0	0.285714	11652.791925	194.213199	209.38	1.0	0.166859
2	1	4	5	6	0.5	0.266667	0.137931	0.444444	10.0	60.0	60.0	0.15	1.0	0.26087	10565.449548	176.090826	171.59	1.0	0.160688
3	1	5	6	7	0.35	0.253521	0.118644	1.0	10.0	60.0	60.0	0.116667	1.0	0.208955	9546.391323	159.106522	165.2	1.0	0.160212
4	1	6	7	8	0.5	0.307692	0.166667	0.777778	10.0	60.0	60.0	0.116667	0.777778	0.202899	10111.77825	168.529637	129.04	0.787886	0.163613
5	1	7	8	9	0.5	0.27451	0.171429	0.666667	10.0	60.0	60.0	0.133333	0.888889	0.231884	10175.630775	169.593846	166.74	0.910849	0.158444
6	1	8	9	10	0.35	0.290323	0.177419	1.0	10.0	60.0	60.0	0.166667	0.909091	0.28169	10752.960125	179.216002	234.17	0.937242	0.162808
7	1	9	10	11	0.35	0.30137	0.177419	1.0	10.0	60.0	60.0	0.133333	0.727273	0.225352	12212.705525	203.545092	185.92	0.752438	0.161991
8	1	10	11	12	0.35	0.30137	0.138462	1.0	10.0	60.0	60.0	0.15	1.0	0.26087	10103.247375	168.387456	219.68	1.0	0.158278
9	1	11	12	13	0.5	0.341463	0.135135	0.5	10.0	60.0	60.0	0.166667	1.0	0.285714	9554.4226	159.240377	194.97	1.0	0.164788

# Precision at the review budget for each rolling-origin test period.
(
    ggplot(results.rolling_origin_metrics, aes("test_period", MetricCol.PRECISION_AT_K))
    + geom_line()
    + geom_point(size=3)
    + labs(
        title="Rolling-origin approval-budget precision over time",
        x="Test pay period",
        y="Precision at selected budget",
    )
    + theme_minimal()
)

# Display record counts, true anomalies, and anomaly rate per uncertainty bucket.
results.uncertainty_bucket_metrics

shape: (2, 5)

uncertainty_bucket	records	true_anomalies	anomaly_rate	avg_uncertainty
str	u32	i64	f64	f64
"Low"	11074	93	0.008398	0.229317
"Medium"	1291	37	0.02866	0.388043

# Synthetic anomaly rate per uncertainty bucket, as a bar chart.
(
    ggplot(
        results.uncertainty_bucket_metrics,
        aes(ReviewCol.UNCERTAINTY_BUCKET, MetricCol.ANOMALY_RATE),
    )
    + geom_bar(stat="identity", fill="#2563eb")
    + labs(
        title="Evaluation anomaly rate by uncertainty bucket",
        x="Uncertainty bucket",
        y="Synthetic anomaly rate",
    )
    + theme_minimal()
)

# Display the risk-coverage table: coverage level, retained and abstained record
# counts, and review precision at the review budget.
results.risk_coverage_analysis

shape: (5, 5)

coverage	records	abstained_records	review_precision	review_budget
f64	i64	i64	f64	i64
1.0	12365	0	0.3	10
0.9	11128	1237	0.4	10
0.8	9892	2473	0.7	10
0.7	8655	3710	0.3	10
0.6	7419	4946	0.3	10

# Review precision plotted against the coverage level.
(
    ggplot(
        results.risk_coverage_analysis,
        aes(MetricCol.COVERAGE, MetricCol.REVIEW_PRECISION),
    )
    + geom_line()
    + geom_point(size=3)
    + labs(
        title="Risk-coverage diagnostic",
        x="Share of least-uncertain records retained",
        y="Review precision",
    )
    + theme_minimal()
)

# Display summary metrics for the expected gross pay interval: coverage on normal
# records, how often anomalies exceed p90, interval width, and anomaly excess vs p90.
results.expected_gross_pay_interval_metrics

shape: (1, 4)

normal_interval_coverage	anomaly_exceeds_p90_rate	avg_interval_width	avg_anomaly_excess_vs_p90
f64	f64	f64	f64
0.721782	0.346154	79.618719	14.431411

What This Proves

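The ML score is useful because it can rank unusual multivariate combinations that do not reduce to one raw threshold. The hybrid score is the stronger product signal because payroll approval also needs deterministic policy rules, robust peer/history context, premium eligibility, schedule/timeclock evidence, uncertainty context, and estimated exposure. Simpler components remain valuable: rule and premium signals are strong in premium-heavy scenarios. In the premium-mismatch scenario shown here, the rule and premium eligibility scores clearly outrank the hybrid score at the smallest budget (precision 0.89 and 0.84 versus 0.10 at k=10), and the hybrid score only approaches the rule score's dollar capture at k=50 (0.95 versus 1.00), so this run on its own does not show the hybrid beating its components on detection. The case for hybrid ranking as the deployable approval queue is that it balances detection, actionability, and business impact: at k=10 it captures more estimated exposure than the rule score (about 45.9k versus 19.5k).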