SNF Model Ablation And ML Value

Modified

May 15, 2026

SNF Model Ablation And ML Value

Executive takeaway: The business notebook shows that automated ranking improves the SNF payroll approval workflow. This notebook tests whether the modeling stack earns its complexity by comparing manual thresholds, deterministic rules, robust statistics, unsupervised ML, and the hybrid ranking under approval-budget, dollar-capture, temporal, uncertainty, and robustness views.

Synthetic labels and injected anomaly dollars are used here only for evaluation. They are not scoring features and they should not appear in administrator-facing queues.

import polars as pl
from common.execution import notebook_fast_mode
from common.plots import (
    LetsPlot,
    aes,
    geom_bar,
    geom_line,
    geom_point,
    geom_tile,
    ggplot,
    labs,
    rotated_x_labels,
    theme_minimal,
)

from payroll_anomaly_ranking.columns import (
    AggregateCol,
    MetricCol,
    PayrollCol,
    ReviewCol,
    ScoreCol,
)
from payroll_anomaly_ranking.config import PayrollConfig
from payroll_anomaly_ranking.evaluation import evaluate_scores
from payroll_anomaly_ranking.pipeline import PipelineIncludeConfig, run_pipeline
from payroll_anomaly_ranking.scenarios import diagnostic_scenario_presets

# Enable lets-plot HTML rendering before any chart cell runs.
LetsPlot.setup_html()

# Two synthetic payroll configurations: the full-size run and a reduced one
# that is selected when the notebook is executed in fast mode.
config = PayrollConfig(employee_count=200, pay_periods=14, review_budgets=(10, 25, 50))
FAST_CONFIG = PayrollConfig(employee_count=90, pay_periods=10, review_budgets=(10, 25))
NOTEBOOK_FAST = notebook_fast_mode()

if NOTEBOOK_FAST:
    # Fast mode: smaller data set, and validation/aggregation stages switched off.
    active_config = FAST_CONFIG
    active_include = PipelineIncludeConfig(
        validation=False,
        aggregations=False,
        evaluation=True,
        backtest=True,
        rolling_origin=True,
        review_queues=True,
        leakage_checks=True,
    )
else:
    # Full mode: full-size data set with every pipeline stage enabled.
    active_config = config
    active_include = PipelineIncludeConfig.all()

# Run the pipeline once on the "premium-mismatch" diagnostic preset; every
# later cell reads from `results`.
scenario = diagnostic_scenario_presets(("premium-mismatch",))["premium-mismatch"]
results = run_pipeline(active_config, scenario=scenario, include=active_include)

# Ranked score columns compared in the ablation. Each entry is
# (display label, score column, method-family label); the helpers below unpack
# the tuples in exactly that order.
SCORE_METHODS = [
    ("rule score", ScoreCol.RULE_SCORE, "deterministic rules"),
    ("robust statistical score", ScoreCol.STATISTICAL_SCORE, "robust statistics"),
    (
        "schedule/timeclock score",
        ScoreCol.SCHEDULE_TIMECLOCK_SCORE,
        "operational context",
    ),
    (
        "premium eligibility score",
        ScoreCol.PREMIUM_ELIGIBILITY_SCORE,
        "pay policy context",
    ),
    ("ML score", ScoreCol.ML_SCORE, "unsupervised ML"),
    ("hybrid score", ScoreCol.FINAL_ANOMALY_SCORE, "hybrid ranking"),
]
# Manual threshold baselines. Each entry is (display label, flag column); the
# error summary treats a flag value of 1 as "sent to review" and 0 as "not
# reviewed".
THRESHOLD_FLAGS = [
    ("gross pay threshold", ScoreCol.THRESHOLD_GROSS_PAY_FLAG),
    ("total hours threshold", ScoreCol.THRESHOLD_TOTAL_HOURS_FLAG),
    ("overtime threshold", ScoreCol.THRESHOLD_OVERTIME_HOURS_FLAG),
    ("premium dollars threshold", ScoreCol.THRESHOLD_PREMIUM_DOLLARS_FLAG),
    ("paid-vs-scheduled threshold", ScoreCol.THRESHOLD_PAID_VS_SCHEDULED_FLAG),
]


def _score_method_budget_metrics(scored: pl.DataFrame, budget: int) -> pl.DataFrame:
    """Summarise every score method at one per-period review budget.

    For each entry in ``SCORE_METHODS`` the ``budget`` highest-scoring rows of
    every pay period are selected, and the selection is compared against the
    evaluation labels.

    Args:
        scored: Scored records carrying the label, anomaly-dollar, exposure,
            pay-period and score columns referenced below.
        budget: Number of rows taken from each pay period.

    Returns:
        One row per score method with the budget (stored as a float), review
        volume, precision, recall, summed estimated exposure, captured anomaly
        dollars and dollar-capture rate.
    """
    is_anomaly = pl.col(PayrollCol.IS_ANOMALY) == 1
    anomaly_dollar_sum = pl.sum(PayrollCol.ANOMALY_DOLLARS)

    # Population totals are independent of the method, so compute them once.
    labelled = scored.filter(is_anomaly)
    total_anomalies = labelled.height
    total_dollars = float(labelled.select(anomaly_dollar_sum).item() or 0.0)

    def _metrics_row(method: str, score_col: str, method_type: str) -> dict:
        # Order by period, then by descending score, so that taking the head
        # of each period group yields that period's top-`budget` rows.
        ordered = scored.sort(
            [PayrollCol.PAY_PERIOD_INDEX, score_col],
            descending=[False, True],
        )
        selected = ordered.group_by(PayrollCol.PAY_PERIOD_INDEX).head(budget)

        hits = selected.filter(is_anomaly)
        hit_count = hits.height
        hit_dollars = float(hits.select(anomaly_dollar_sum).item() or 0.0)
        exposure = selected.select(pl.sum(ScoreCol.ESTIMATED_EXPOSURE)).item()

        return {
            "method": method,
            "method_type": method_type,
            MetricCol.K: float(budget),
            MetricCol.REVIEW_VOLUME: selected.height,
            # Denominators are clamped to 1 so empty selections yield 0.0.
            MetricCol.PRECISION_AT_K: hit_count / max(selected.height, 1),
            MetricCol.RECALL_AT_K: hit_count / max(total_anomalies, 1),
            MetricCol.EXPOSURE_CAPTURED_AT_K: float(exposure or 0.0),
            MetricCol.DOLLARS_CAPTURED_AT_K: hit_dollars,
            MetricCol.DOLLAR_CAPTURE_RATE: hit_dollars / total_dollars
            if total_dollars
            else 0.0,
        }

    return pl.DataFrame([_metrics_row(*spec) for spec in SCORE_METHODS])


def _method_ladder(scored: pl.DataFrame) -> pl.DataFrame:
    """Stack manual-threshold baselines and score-method metrics in one table.

    Score-method rows are produced for every budget in
    ``active_config.review_budgets``. Threshold-baseline rows come from
    ``evaluate_scores(...).threshold_baseline_metrics`` and are relabelled so
    they share the score rows' columns; they are tagged with the first
    configured budget as their ``k`` value.

    Args:
        scored: Scored records with evaluation labels and anomaly dollars.

    Returns:
        Threshold rows followed by score-method rows, with the score-method
        column set.
    """
    score_rows = pl.concat(
        [
            _score_method_budget_metrics(scored, budget)
            for budget in active_config.review_budgets
        ],
        how="diagonal",
    )

    # Total labelled anomaly dollars: the denominator of the capture rate.
    total_dollars = float(
        scored.filter(pl.col(PayrollCol.IS_ANOMALY) == 1)
        .select(pl.sum(PayrollCol.ANOMALY_DOLLARS))
        .item()
        or 0.0,
    )
    # Use the same rule as the score-method rows: divide by the true total and
    # report 0.0 when there are no anomaly dollars. Clamping the divisor to 1.0
    # would understate the rate whenever the total lies between 0 and 1.
    capture_rate = (
        pl.col(MetricCol.DOLLARS_CAPTURED_AT_K) / total_dollars
        if total_dollars
        else pl.lit(0.0)
    )

    threshold_rows = evaluate_scores(scored, active_config).threshold_baseline_metrics
    threshold_rows = threshold_rows.with_columns(
        # replace_all, not replace: str.replace only rewrites the first match,
        # which left labels such as "manual threshold_pack" in the table.
        pl.col("baseline")
        .str.replace_all("_", " ", literal=True)
        .alias("method"),
        pl.lit("manual threshold").alias("method_type"),
        pl.lit(float(active_config.review_budgets[0])).alias(MetricCol.K),
        capture_rate.alias(MetricCol.DOLLAR_CAPTURE_RATE),
    ).select(score_rows.columns)
    return pl.concat([threshold_rows, score_rows], how="diagonal")


def _threshold_error_summary(scored: pl.DataFrame) -> pl.DataFrame:
    """Tabulate the error modes of each manual threshold flag.

    For every entry in ``THRESHOLD_FLAGS``, rows whose flag equals 1 count as
    reviewed, and labelled anomalies whose flag equals 0 count as missed.

    Args:
        scored: Scored records carrying the threshold flag columns, the
            evaluation label and the anomaly-dollar column.

    Returns:
        One row per threshold with review volume, true anomalies, false
        positives, false negatives, missed-anomaly rate and missed dollars.
    """
    labelled_anomaly = pl.col(PayrollCol.IS_ANOMALY) == 1
    labelled_normal = pl.col(PayrollCol.IS_ANOMALY) == 0
    # Clamp to 1 so the miss rate is 0.0 rather than an error with no anomalies.
    rate_denominator = max(scored.filter(labelled_anomaly).height, 1)

    summary = []
    for label, flag_col in THRESHOLD_FLAGS:
        flagged = scored.filter(pl.col(flag_col) == 1)
        missed = scored.filter((pl.col(flag_col) == 0) & labelled_anomaly)
        missed_count = missed.height
        missed_dollars = missed.select(pl.sum(PayrollCol.ANOMALY_DOLLARS)).item()
        summary.append(
            {
                "method": label,
                MetricCol.REVIEW_VOLUME: flagged.height,
                AggregateCol.TRUE_ANOMALIES: flagged.filter(labelled_anomaly).height,
                "false_positives": flagged.filter(labelled_normal).height,
                "false_negatives": missed_count,
                "missed_anomaly_rate": missed_count / rate_denominator,
                "missed_synthetic_dollars": float(missed_dollars or 0.0),
            },
        )
    return pl.DataFrame(summary)


def _component_heatmap_data(scored: pl.DataFrame, *, top_n: int = 20) -> pl.DataFrame:
    """Build long-format component scores for the top latest-period records.

    Args:
        scored: Scored records carrying the pay-period index and the component
            score columns listed below.
        top_n: Number of highest final-score rows to keep from the latest
            pay period.

    Returns:
        A frame with ``queue_row`` (1-based rank by final anomaly score),
        ``component`` (source column name) and ``score`` columns.
    """
    component_columns = [
        ScoreCol.RULE_SCORE,
        ScoreCol.STATISTICAL_SCORE,
        ScoreCol.SCHEDULE_TIMECLOCK_SCORE,
        ScoreCol.PREMIUM_ELIGIBILITY_SCORE,
        ScoreCol.ML_SCORE,
        ScoreCol.EXPOSURE_SCORE,
        ScoreCol.FINAL_ANOMALY_SCORE,
    ]

    newest_period = scored.select(pl.max(PayrollCol.PAY_PERIOD_INDEX)).item()
    newest_rows = scored.filter(pl.col(PayrollCol.PAY_PERIOD_INDEX) == newest_period)
    ranked = newest_rows.sort(ScoreCol.FINAL_ANOMALY_SCORE, descending=True)
    # Number the rows after sorting so queue_row is the rank in the queue.
    queue = ranked.with_row_index("queue_row", offset=1).head(top_n)
    wide = queue.select(["queue_row", *component_columns])

    # One output row per (queue_row, component) pair, ready for geom_tile.
    return wide.unpivot(
        index="queue_row",
        variable_name="component",
        value_name="score",
    )

Evaluation Design

The ablation uses later-period synthetic labels only for evaluation. Scoring features remain leakage-safe and exclude injected labels, anomaly categories, and anomaly dollars. The operational metric is approval-budget ranking: what the system captures inside the number of records an administrator can realistically review each payroll cycle.

# Display the pipeline's leakage-check table (one row per check with a pass flag).
results.leakage_checks
shape: (3, 2)
check passed
str bool
"model_features_exclude_labels" true
"analyst_queue_excludes_labels" true
"scoring_features_exclude_anoma… true

Method-Complexity Ladder

The ladder below separates the value of each method family. Manual thresholds are the baseline workflow; rules add deterministic SNF approval logic; robust statistics add peer/history context; ML adds multivariate unusualness; hybrid ranking combines those signals with exposure so the queue is useful under constrained review capacity.

# Combine manual-threshold baselines and score-method metrics into one table.
method_ladder = _method_ladder(results.scored)
# Display ordered by budget k (ascending), then dollar-capture rate (descending).
# sort() returns a new frame, so method_ladder itself keeps its original order.
method_ladder.sort(
    [MetricCol.K, MetricCol.DOLLAR_CAPTURE_RATE],
    descending=[False, True],
)
shape: (25, 9)
method method_type k review_volume precision_at_k recall_at_k exposure_captured_at_k dollars_captured_at_k dollar_capture_rate
str str f64 i64 f64 f64 f64 f64 f64
"rule score" "deterministic rules" 10.0 140 0.892857 0.961538 19457.682525 2630.74 0.960531
"premium eligibility score" "pay policy context" 10.0 140 0.835714 0.9 16415.8028 2437.18 0.889858
"manual threshold_pack" "manual threshold" 10.0 1672 0.046651 0.6 210852.845442 1832.1 0.668933
"facility payroll_variance_thre… "manual threshold" 10.0 2588 0.016615 0.330769 318221.704375 915.21 0.33416
"hybrid score" "hybrid ranking" 10.0 140 0.1 0.107692 45905.20535 344.58 0.125812
"rule score" "deterministic rules" 50.0 700 0.185714 1.0 79998.595028 2738.84 1.0
"hybrid score" "hybrid ranking" 50.0 700 0.175714 0.946154 134582.36125 2610.24 0.953046
"ML score" "unsupervised ML" 50.0 700 0.052857 0.284615 132068.855756 853.37 0.311581
"robust statistical score" "robust statistics" 50.0 700 0.021429 0.115385 138951.437431 368.24 0.134451
"schedule/timeclock score" "operational context" 50.0 700 0.0 0.0 54056.884232 0.0 0.0
# The bar chart is limited to the first configured review budget.
top_budget = float(active_config.review_budgets[0])
# Exposure per reviewed record; the divisor is clamped to at least 1 so a
# method with zero reviewed records does not divide by zero.
budget_ladder = method_ladder.filter(pl.col(MetricCol.K) == top_budget).with_columns(
    (
        pl.col(MetricCol.EXPOSURE_CAPTURED_AT_K)
        / pl.max_horizontal(pl.col(MetricCol.REVIEW_VOLUME), pl.lit(1))
    ).alias("exposure_per_review"),
)
# Bar chart: estimated exposure per reviewed record by method, coloured by family.
(
    ggplot(budget_ladder, aes("method", "exposure_per_review"))
    + geom_bar(aes(fill="method_type"), stat="identity")
    + labs(
        title="Incremental method value at the first approval budget",
        x="Method",
        y="Estimated exposure per reviewed record",
        fill="Method family",
    )
    + theme_minimal()
    + rotated_x_labels()
)
# Line chart: dollar-capture rate against review budget, one series per method.
# Manual-threshold rows carry only the first budget as k (set in _method_ladder),
# so each of them contributes a single point rather than a line.
(
    ggplot(method_ladder, aes(MetricCol.K, MetricCol.DOLLAR_CAPTURE_RATE))
    + geom_line(aes(color="method"))
    + geom_point(aes(color="method"), size=3)
    + labs(
        title="Dollar capture improves or degrades as review budget changes",
        x="Review budget per pay period",
        y="Synthetic dollar capture rate",
        color="Method",
    )
    + theme_minimal()
)

Component Contribution Heatmap

The heatmap makes hybrid ranking explainable at the record level. Rows are the top latest-period queue records. Columns show the major component scores that contribute to the final approval exception score.

# Long-format component scores for the top latest-period records (default top_n=20).
component_heatmap = _component_heatmap_data(results.scored)
# Heatmap: one tile per (component, queue row), filled by the component score.
(
    ggplot(component_heatmap, aes("component", "queue_row", fill="score"))
    + geom_tile()
    + labs(
        title="Top queue records by score component",
        x="Score component",
        y="Latest-period queue row",
        fill="Score",
    )
    + theme_minimal()
    + rotated_x_labels()
)

Manual Threshold Error Modes

This section intentionally uses evaluation-only synthetic labels. It shows why threshold review is fragile: a broad threshold can create many false positives, while a narrow threshold can leave synthetic anomaly dollars outside the review queue.

# Per-threshold error summary computed from the evaluation-only labels.
threshold_errors = _threshold_error_summary(results.scored)
# Display with the thresholds that miss the most synthetic dollars first.
threshold_errors.sort("missed_synthetic_dollars", descending=True)
shape: (5, 7)
method review_volume true_anomalies false_positives false_negatives missed_anomaly_rate missed_synthetic_dollars
str i64 i64 i64 i64 f64 f64
"gross pay threshold" 0 0 0 130 1.0 2738.84
"paid-vs-scheduled threshold" 307 0 307 130 1.0 2738.84
"premium dollars threshold" 1 1 0 129 0.992308 2703.6
"total hours threshold" 25 1 24 129 0.992308 2702.37
"overtime threshold" 25 1 24 129 0.992308 2702.37
# Bar chart: synthetic anomaly dollars left outside review by each manual threshold.
(
    ggplot(threshold_errors, aes("method", "missed_synthetic_dollars"))
    + geom_bar(stat="identity", fill="#b45309")
    + labs(
        title="Evaluation-only synthetic dollars missed by manual thresholds",
        x="Manual threshold",
        y="Missed synthetic anomaly dollars",
    )
    + theme_minimal()
    + rotated_x_labels()
)

Temporal, Uncertainty, And Calibration Context

A stronger data-science proof does not stop at one aggregate table. The views below check whether selected thresholds transfer over time, whether uncertainty buckets carry useful signal, and whether expected-pay intervals behave plausibly on synthetic evaluation records.

# Display the pipeline's per-model comparison table.
results.model_comparison
shape: (6, 8)
model k precision_at_k recall_at_k f1_at_k average_anomaly_rank mean_reciprocal_rank pr_auc
str f64 f64 f64 f64 f64 f64 f64
"rule_score" 10.0 0.892857 0.961538 0.925926 5.307692 0.306057 1.0
"statistical_score" 10.0 0.028571 0.030769 0.02963 398.669231 0.01279 0.013481
"schedule_timeclock_score" 10.0 0.0 0.0 0.0 484.469231 0.003116 0.009075
"premium_eligibility_score" 10.0 0.835714 0.9 0.866667 5.723077 0.291564 0.872483
"ml_score" 10.0 0.028571 0.030769 0.02963 60.953846 0.048181 0.084428
"hybrid_score" 10.0 0.1 0.107692 0.103704 30.438462 0.094971 0.17436
# Display the pipeline's rolling-origin evaluation table (one row per origin).
results.rolling_origin_metrics
shape: (9, 20)
origin train_start_period train_end_period validation_period test_period selected_threshold validation_f1 threshold_precision threshold_recall k review_volume native_review_burden precision_at_k recall_at_k f1_at_k exposure_captured_at_k exposure_per_review dollars_captured_at_k dollar_capture_rate test_score_mean
i64 i64 i64 i64 i64 f64 f64 f64 f64 f64 f64 f64 f64 f64 f64 f64 f64 f64 f64 f64
1 1 3 4 5 0.35 0.205882 0.140845 1.0 10.0 60.0 60.0 0.166667 1.0 0.285714 11652.791925 194.213199 209.38 1.0 0.166859
2 1 4 5 6 0.5 0.266667 0.137931 0.444444 10.0 60.0 60.0 0.15 1.0 0.26087 10565.449548 176.090826 171.59 1.0 0.160688
3 1 5 6 7 0.35 0.253521 0.118644 1.0 10.0 60.0 60.0 0.116667 1.0 0.208955 9546.391323 159.106522 165.2 1.0 0.160212
4 1 6 7 8 0.5 0.307692 0.166667 0.777778 10.0 60.0 60.0 0.116667 0.777778 0.202899 10111.77825 168.529637 129.04 0.787886 0.163613
5 1 7 8 9 0.5 0.27451 0.171429 0.666667 10.0 60.0 60.0 0.133333 0.888889 0.231884 10175.630775 169.593846 166.74 0.910849 0.158444
6 1 8 9 10 0.35 0.290323 0.177419 1.0 10.0 60.0 60.0 0.166667 0.909091 0.28169 10752.960125 179.216002 234.17 0.937242 0.162808
7 1 9 10 11 0.35 0.30137 0.177419 1.0 10.0 60.0 60.0 0.133333 0.727273 0.225352 12212.705525 203.545092 185.92 0.752438 0.161991
8 1 10 11 12 0.35 0.30137 0.138462 1.0 10.0 60.0 60.0 0.15 1.0 0.26087 10103.247375 168.387456 219.68 1.0 0.158278
9 1 11 12 13 0.5 0.341463 0.135135 0.5 10.0 60.0 60.0 0.166667 1.0 0.285714 9554.4226 159.240377 194.97 1.0 0.164788
# Line chart: precision at the review budget for each rolling-origin test period.
(
    ggplot(results.rolling_origin_metrics, aes("test_period", MetricCol.PRECISION_AT_K))
    + geom_line()
    + geom_point(size=3)
    + labs(
        title="Rolling-origin approval-budget precision over time",
        x="Test pay period",
        y="Precision at selected budget",
    )
    + theme_minimal()
)
# Display the pipeline's per-uncertainty-bucket summary table.
results.uncertainty_bucket_metrics
shape: (2, 5)
uncertainty_bucket records true_anomalies anomaly_rate avg_uncertainty
str u32 i64 f64 f64
"Low" 11074 93 0.008398 0.229317
"Medium" 1291 37 0.02866 0.388043
# Bar chart: synthetic anomaly rate within each uncertainty bucket.
(
    ggplot(
        results.uncertainty_bucket_metrics,
        aes(ReviewCol.UNCERTAINTY_BUCKET, MetricCol.ANOMALY_RATE),
    )
    + geom_bar(stat="identity", fill="#2563eb")
    + labs(
        title="Evaluation anomaly rate by uncertainty bucket",
        x="Uncertainty bucket",
        y="Synthetic anomaly rate",
    )
    + theme_minimal()
)
# Display the pipeline's risk-coverage table.
results.risk_coverage_analysis
shape: (5, 5)
coverage records abstained_records review_precision review_budget
f64 i64 i64 f64 i64
1.0 12365 0 0.3 10
0.9 11128 1237 0.4 10
0.8 9892 2473 0.7 10
0.7 8655 3710 0.3 10
0.6 7419 4946 0.3 10
# Line chart: review precision against the coverage level from the risk-coverage table.
(
    ggplot(
        results.risk_coverage_analysis,
        aes(MetricCol.COVERAGE, MetricCol.REVIEW_PRECISION),
    )
    + geom_line()
    + geom_point(size=3)
    + labs(
        title="Risk-coverage diagnostic",
        x="Share of least-uncertain records retained",
        y="Review precision",
    )
    + theme_minimal()
)
# Display the pipeline's expected-gross-pay interval metrics.
results.expected_gross_pay_interval_metrics
shape: (1, 4)
normal_interval_coverage anomaly_exceeds_p90_rate avg_interval_width avg_anomaly_excess_vs_p90
f64 f64 f64 f64
0.721782 0.346154 79.618719 14.431411

What This Proves

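The ML score is useful because it can rank unusual multivariate combinations that do not reduce to one raw threshold. The hybrid score is the stronger product signal because payroll approval also needs deterministic policy rules, robust peer/history context, premium eligibility, schedule/timeclock evidence, uncertainty context, and estimated exposure. Simpler components remain valuable: rule and premium signals are strong in premium-heavy scenarios, while hybrid ranking is the deployable approval queue because it balances detection, actionability, and business impact. Caveat for this premium-mismatch run: at a budget of 10 the rule score captures about 96% of synthetic anomaly dollars versus about 13% for the hybrid score, and the hybrid only approaches the rule score at a budget of 50 (about 95% versus 100%). In the tables above, the hybrid's advantage is estimated exposure captured (45,905 versus 19,458 at a budget of 10), not label detection.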