NDIS Data Pipeline

Processing and Quality Control of NDIS Data

Author

Tina Lasisi | Virum Ranka

Published

February 1, 2026

1. Create Intermediate Data from Raw Data

This section processes raw NDIS data to an intermediate stage. Steps:

  • Load raw data
  • Clean and aggregate jurisdiction names
  • Parse timestamps and create derived fields
  • Save intermediate dataset
Show setup code
# Standard-library and third-party imports used throughout the pipeline.
from pathlib import Path
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

# Define paths using main data folder structure
# (relative to this notebook; raw wayback snapshots live under data/ndis/raw).
raw_path = Path("../data/ndis/raw/ndis_data_raw.csv")

print(f"Raw data path: {raw_path}")
Raw data path: ../data/ndis/raw/ndis_data_raw.csv

1.1 Load Raw Data

Code
# Load raw data: read everything as strings first, then coerce numbers below
# so malformed cells become NaN instead of raising.
raw_df = pd.read_csv(raw_path, dtype=str)
raw_df.insert(0, "row_id", np.arange(len(raw_df)))

# Count metrics that must be numeric for downstream aggregation.
numeric_cols = [
    "offender_profiles",
    "arrestee",
    "forensic_profiles",
    "ndis_labs",
    "investigations_aided",
]
# The report month/year fields get the same coercion treatment.
for col in numeric_cols + ["report_month", "report_year"]:
    raw_df[col] = pd.to_numeric(raw_df[col], errors="coerce")

print(f"Loaded {len(raw_df):,} rows")
Loaded 32,009 rows
Code
print(f"Columns: {list(raw_df.columns)}")
Columns: ['row_id', 'timestamp', 'report_month', 'report_year', 'jurisdiction', 'offender_profiles', 'arrestee', 'forensic_profiles', 'ndis_labs', 'investigations_aided', 'parser_used', 'source_file', 'capture_date', 'capture_year']
Code
raw_df.head()
row_id timestamp report_month report_year jurisdiction offender_profiles arrestee forensic_profiles ndis_labs investigations_aided parser_used source_file capture_date capture_year
0 0 20010715040342 NaN NaN Pennsylvania 7099 0 883 3 68 pre2007 20010715040342_pa.html 2001-07-15 04:03:42 2001
1 1 20010715040408 NaN NaN North Carolina 21246 0 727 2 40 pre2007 20010715040408_nc.html 2001-07-15 04:04:08 2001
2 2 20010715040546 NaN NaN Connecticut 4193 0 138 1 19 pre2007 20010715040546_ct.html 2001-07-15 04:05:46 2001
3 3 20010715040559 NaN NaN West Virginia 0 0 0 1 0 pre2007 20010715040559_wv.html 2001-07-15 04:05:59 2001
4 4 20010715040600 NaN NaN Kansas 1023 0 200 4 0 pre2007 20010715040600_ks.html 2001-07-15 04:06:00 2001

1.2 Clean Jurisdiction Names

Show jurisdiction cleaning code
# Keep original jurisdiction for reference (the cleaning step below
# overwrites the "jurisdiction" column in place).
raw_df["jurisdiction_raw"] = raw_df["jurisdiction"]

# Jurisdiction cleaning patterns: each (regex, canonical_name) pair maps raw
# page titles (e.g. "Texas Stats") to a canonical jurisdiction name.
# ORDER MATTERS: patterns are tried in sequence and the first match wins,
# e.g. "West Virginia" must precede "Virginia", and the state patterns must
# precede the bare "Lab"/"DC" alternatives near the end of the list.
JURISDICTION_PATTERNS = [
    (r"Alabama$|Alabama Stats", "Alabama"),
    (r"Alaska$|Alaska Stats", "Alaska"),
    (r"Arizona$|Arizona Stats", "Arizona"),
    (r"Arkansas$|Arkansas Stats", "Arkansas"),
    (r"California$|California Stats", "California"),
    (r"Colorado$|Colorado Stats", "Colorado"),
    (r"Connecticut$|Connecticut Stats", "Connecticut"),
    (r"Delaware$|Delaware Stats", "Delaware"),
    (r"Florida$|Florida Stats", "Florida"),
    (r"Georgia$|Georgia Stats", "Georgia"),
    (r"Hawaii$|Hawaii Stats", "Hawaii"),
    (r"Idaho$|Idaho Stats", "Idaho"),
    (r"Illinois$|Illinois Stats", "Illinois"),
    (r"Indiana$|Indiana Stats", "Indiana"),
    (r"Iowa$|Iowa Stats", "Iowa"),
    (r"Kansas$|Kansas Stats", "Kansas"),
    (r"Kentucky$|Kentucky Stats", "Kentucky"),
    (r"Louisiana$|Louisiana Stats", "Louisiana"),
    (r"Maine$|Maine Stats", "Maine"),
    (r"Maryland$|Maryland Stats", "Maryland"),
    (r"Massachusetts$|Massachusetts Stats", "Massachusetts"),
    (r"Michigan$|Michigan Stats", "Michigan"),
    (r"Minnesota$|Minnesota Stats", "Minnesota"),
    (r"Mississippi$|Mississippi Stats", "Mississippi"),
    (r"Missouri$|Missouri Stats", "Missouri"),
    (r"Montana$|Montana Stats", "Montana"),
    (r"Nebraska$|Nebraska Stats", "Nebraska"),
    (r"Nevada$|Nevada Stats", "Nevada"),
    (r"New Hampshire$|New Hampshire Stats", "New Hampshire"),
    (r"New Jersey$|New Jersey Stats", "New Jersey"),
    (r"New Mexico$|New Mexico Stats|Mexico Stats", "New Mexico"),
    (r"New York$|New York Stats", "New York"),
    (r"North Carolina$|North Carolina Stats", "North Carolina"),
    (r"North Dakota$|North Dakota Stats", "North Dakota"),
    (r"Ohio$|Ohio Stats", "Ohio"),
    (r"Oklahoma$|Oklahoma Stats", "Oklahoma"),
    (r"Oregon$|Oregon Stats", "Oregon"),
    (r"Pennsylvania$|Pennsylvania Stats", "Pennsylvania"),
    (r"Rhode Island$|Rhode Island Stats", "Rhode Island"),
    (r"South Carolina$|South Carolina Stats", "South Carolina"),
    (r"South Dakota$|South Dakota Stats", "South Dakota"),
    (r"Tennessee$|Tennessee Stats", "Tennessee"),
    (r"Texas$|Texas Stats", "Texas"),
    (r"Utah$|Utah Stats", "Utah"),
    (r"Vermont$|Vermont Stats", "Vermont"),
    (r"West Virginia$|West Virginia Stats", "West Virginia"),
    (r"Virginia$|Virginia Stats", "Virginia"),
    (r"Washington$|Washington State Stats", "Washington"),
    (r"Wisconsin$|Wisconsin Stats", "Wisconsin"),
    (r"Wyoming$|Wyoming Stats", "Wyoming"),
    (r"DC/FBI|Washington DC Stats|Lab", "DC/FBI Lab"),
    (r"DC/Metro|DC", "DC/Metro PD"),
    (r"^Army$|U\.S\. Army$|U\.S\. Army Stats", "U.S. Army"),
    (r"^PR$|Puerto Rico$|Puerto Rico Stats", "Puerto Rico"),
    (r"Tables by NDIS Participant", "Alabama"),
]

# Compile the patterns once (case-insensitive, as before) so the per-row
# apply() over ~32k rows does not re-scan raw pattern strings each call.
_COMPILED_JURISDICTION_PATTERNS = [
    (re.compile(pattern, re.IGNORECASE), replacement)
    for pattern, replacement in JURISDICTION_PATTERNS
]


def clean_jurisdiction(value: str) -> str:
    """Clean jurisdiction name using pattern matching.

    Returns:
    - the input unchanged when it is None or a float NaN (missing values
      pass through untouched),
    - np.nan when the stripped text is empty or the literal string "nan",
    - the canonical name of the FIRST matching pattern, or
    - the stripped text itself when no pattern matches.
    """
    if value is None or (isinstance(value, float) and np.isnan(value)):
        return value
    text = str(value).strip()
    if not text or text.lower() == "nan":
        return np.nan
    for regex, replacement in _COMPILED_JURISDICTION_PATTERNS:
        if regex.search(text):
            return replacement
    return text


# Normalize every jurisdiction value in place (original kept in jurisdiction_raw).
raw_df["jurisdiction"] = raw_df["jurisdiction"].apply(clean_jurisdiction)

1.3 Parse Timestamps and Create Derived Fields

Code
# Parse capture datetime from the 14-digit YYYYMMDDHHMMSS wayback timestamp;
# unparseable values become NaT.
raw_df["capture_datetime"] = pd.to_datetime(
    raw_df["timestamp"], format="%Y%m%d%H%M%S", errors="coerce"
)

# Total profiles = offender + arrestee + forensic. Plain addition is used
# deliberately so a NaN in any component yields a NaN total.
raw_df["total_profiles"] = (
    raw_df["offender_profiles"]
    + raw_df["arrestee"]
    + raw_df["forensic_profiles"]
)

# Build an as-of date (first of the report month) from the report year/month;
# rows without report fields (pre-2007 pages) become NaT.
report_date_parts = {
    "year": raw_df["report_year"],
    "month": raw_df["report_month"],
    "day": 1,
}
raw_df["asof_date"] = pd.to_datetime(report_date_parts, errors="coerce")

print(f"Date range: {raw_df['capture_datetime'].min()} to {raw_df['capture_datetime'].max()}")
Date range: 2001-07-15 04:03:42 to 2025-08-15 00:20:50
Code
raw_df[["timestamp", "capture_datetime", "asof_date", "total_profiles"]].head()
timestamp capture_datetime asof_date total_profiles
0 20010715040342 2001-07-15 04:03:42 NaT 7982
1 20010715040408 2001-07-15 04:04:08 NaT 21973
2 20010715040546 2001-07-15 04:05:46 NaT 4331
3 20010715040559 2001-07-15 04:05:59 NaT 0
4 20010715040600 2001-07-15 04:06:00 NaT 1223

1.4 Save Intermediate Dataset

Code
# Sort by jurisdiction and capture datetime so each jurisdiction's snapshots
# form a contiguous, chronologically ordered run.
raw_df = raw_df.sort_values(["jurisdiction", "capture_datetime"])
print(f"Total rows: {len(raw_df):,}")
Total rows: 32,009
Code
# Columns retained in the intermediate dataset, in output order.
intermediate_cols = [
    "capture_datetime",
    "asof_date",
    "jurisdiction",
    "offender_profiles",
    "arrestee",
    "forensic_profiles",
    "total_profiles",
    "ndis_labs",
    "investigations_aided",
    "source_file",
]

# Take an independent copy so later edits cannot mutate raw_df.
ndis_intermediate = raw_df.loc[:, intermediate_cols].copy()

print(f"Shape: {ndis_intermediate.shape}")
Shape: (32009, 10)
Code
print(f"\nJurisdictions ({ndis_intermediate['jurisdiction'].nunique()}):")

Jurisdictions (54):
Code
print(sorted(ndis_intermediate["jurisdiction"].dropna().unique()))
['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'DC/FBI Lab', 'DC/Metro PD', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Puerto Rico', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'U.S. Army', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']
Code
ndis_intermediate.head()
capture_datetime asof_date jurisdiction offender_profiles arrestee forensic_profiles total_profiles ndis_labs investigations_aided source_file
21 2001-07-15 04:15:59 NaT Alabama 0 0 0 0 4 88 20010715041559_al.html
58 2001-08-22 11:55:31 NaT Alabama 0 0 0 0 4 88 20010822115531_al.html
84 2001-09-13 00:17:54 NaT Alabama 0 0 0 0 4 88 20010913001754_al.html
121 2001-09-13 04:17:42 NaT Alabama 0 0 0 0 4 88 20010913041742_al.html
156 2001-09-13 05:46:14 NaT Alabama 0 0 0 0 4 88 20010913054614_al.html

1.5 Summary

Code
# Summary statistics
print(" Intermediate Data Summary ")
 Intermediate Data Summary 
Code
print(f"Total rows: {len(ndis_intermediate):,}")
Total rows: 32,009
Code
print(f"Unique jurisdictions: {ndis_intermediate['jurisdiction'].nunique()}")
Unique jurisdictions: 54
Code
print(f"Date range: {ndis_intermediate['capture_datetime'].min().year} to {ndis_intermediate['capture_datetime'].max().year}")
Date range: 2001 to 2025

2. Time Gaps Analysis

This section visualizes the time gaps between snapshots captured from the Internet Archive's Wayback Machine.

Show time gaps visualization code
import matplotlib.pyplot as plt

# Use intermediate data for visualization
df = ndis_intermediate.copy()

# Combined figure with aligned x-axes (a,b) and labeled outliers in (c)
fig = plt.figure(figsize=(14, 12))

# Create GridSpec: 3 rows, 2 columns
# Panels (a) and (b) span both columns; the third row holds (c) and (d).
gs = fig.add_gridspec(3, 2, height_ratios=[1, 1, 1], hspace=0.35, wspace=0.25)

# ============ Prepare shared data for x-axis alignment ============
work_shared = df.copy()
work_shared["capture_datetime"] = pd.to_datetime(work_shared["capture_datetime"], errors="coerce")
work_shared = work_shared[work_shared["capture_datetime"].notna()]

# Determine global date range for consistent x-axis
date_min = work_shared["capture_datetime"].min()
date_max = work_shared["capture_datetime"].max()

# ============ (a) Monthly Snapshot Counts - All Jurisdictions ============
ax_a = fig.add_subplot(gs[0, :])

work_a = work_shared.copy()
# Round-trip through Period -> str -> datetime to bucket snapshots by
# calendar month and get a plottable month-start timestamp for the x-axis.
work_a["capture_month"] = work_a["capture_datetime"].dt.to_period("M").astype(str)

monthly_counts_a = (
    work_a.groupby("capture_month")
        .size()
        .reset_index(name="snapshot_count")
        .sort_values("capture_month")
)
monthly_counts_a["capture_month"] = pd.to_datetime(monthly_counts_a["capture_month"], format="%Y-%m")

ax_a.plot(monthly_counts_a["capture_month"], monthly_counts_a["snapshot_count"],
          color="#4C78A8", linewidth=1.5)
ax_a.set_title("(a) Monthly Snapshot Counts (All Jurisdictions)", fontsize=12, fontweight='bold', loc='left')
ax_a.set_xlabel("")
ax_a.set_ylabel("Snapshot count")
ax_a.grid(axis="y", color="0.9", linestyle="--", linewidth=0.7)
ax_a.set_xlim(date_min, date_max)
(np.float64(11518.16923611111), np.float64(20315.01446759259))
Show time gaps visualization code
# ============ (b) Yearly Variance in Snapshot Counts ============
# sharex with panel (a) keeps the two time axes aligned.
ax_b = fig.add_subplot(gs[1, :], sharex=ax_a)

work_b = work_shared.copy()
work_b["capture_year"] = work_b["capture_datetime"].dt.year

# Snapshot count per (jurisdiction, year)...
yearly_counts_b = (
    work_b.groupby(["jurisdiction", "capture_year"])
        .size()
        .reset_index(name="snapshot_count")
)

# ...then, for each year, the variance of those counts across jurisdictions.
variance_by_year_b = (
    yearly_counts_b.groupby("capture_year")["snapshot_count"]
    .var()
    .reset_index(name="variance")
    .sort_values("capture_year")
)

# Convert years to datetime (mid-year) for alignment with plot (a)
variance_by_year_b["year_date"] = pd.to_datetime(variance_by_year_b["capture_year"].astype(str) + "-07-01")

ax_b.plot(variance_by_year_b["year_date"], variance_by_year_b["variance"],
          color="#4C78A8", linewidth=2)
ax_b.scatter(variance_by_year_b["year_date"], variance_by_year_b["variance"],
             color="#4C78A8", s=25)
ax_b.set_title("(b) Yearly Variance in Snapshot Counts Across Jurisdictions", fontsize=12, fontweight='bold', loc='left')
ax_b.set_xlabel("Date")
ax_b.set_ylabel("Variance of snapshot count")
ax_b.grid(axis="y", color="0.9", linestyle="--", linewidth=0.7)

# ============ (c) Gap Distribution WITH Outliers ============
ax_c = fig.add_subplot(gs[2, 0])

work_c = df.copy()
work_c["capture_datetime"] = pd.to_datetime(work_c["capture_datetime"], errors="coerce")
work_c = work_c[work_c["capture_datetime"].notna()]
work_c = work_c.sort_values(["jurisdiction", "capture_datetime"])
# Gap in days between consecutive snapshots WITHIN each jurisdiction; the
# first snapshot of every jurisdiction has no predecessor and yields NaN.
work_c["gap_days"] = work_c.groupby("jurisdiction")["capture_datetime"].diff().dt.days

gaps_c = work_c.dropna(subset=["gap_days"])
all_gap_vals_c = gaps_c["gap_days"].values

min_gap_c = all_gap_vals_c.min()
max_gap_c = all_gap_vals_c.max()
mean_gap_c = all_gap_vals_c.mean()
median_gap_c = np.median(all_gap_vals_c)

# Identify top outliers for labeling
gap_threshold_c = 100
outlier_df_c = gaps_c[gaps_c["gap_days"] > gap_threshold_c].copy()
top_outliers_c = outlier_df_c.nlargest(5, "gap_days")[["jurisdiction", "gap_days", "capture_datetime"]]

ax_c.hist(all_gap_vals_c, bins=50, color="#4C78A8", alpha=0.8, edgecolor="white")
ax_c.axvline(min_gap_c, color="#2ca02c", linewidth=2, linestyle="--", label=f"Min: {min_gap_c:.0f}d")
ax_c.axvline(max_gap_c, color="#d62728", linewidth=2, linestyle="--", label=f"Max: {max_gap_c:.0f}d")
ax_c.axvline(mean_gap_c, color="#ff7f0e", linewidth=2, linestyle="-", label=f"Mean: {mean_gap_c:.1f}d")
ax_c.axvline(median_gap_c, color="#9467bd", linewidth=2, linestyle="-", label=f"Median: {median_gap_c:.1f}d")

# Annotate top outliers
for i, (_, row) in enumerate(top_outliers_c.iterrows()):
    if i < 3:  # Label top 3 outliers
        ax_c.annotate(
            f"{row['jurisdiction']}\n({row['gap_days']:.0f}d)",
            xy=(row['gap_days'], 0),
            # Offset labels left of the bar and stagger them vertically
            # so the three annotations do not overlap each other.
            xytext=(row['gap_days'] - 200, 5000 + i * 3000),
            fontsize=8,
            ha='center',
            arrowprops=dict(arrowstyle='->', color='#d62728', lw=0.8),
            color='#d62728'
        )
Text(2328.0, 5000, 'Puerto Rico\n(2528d)')
Text(283.0, 8000, 'Oregon\n(483d)')
Text(231.0, 11000, 'Kansas\n(431d)')
Show time gaps visualization code
ax_c.set_title("(c) Gap Distribution: WITH Outliers", fontsize=11, fontweight='bold', loc='left')
Text(0.0, 1.0, '(c) Gap Distribution: WITH Outliers')
Show time gaps visualization code
ax_c.set_xlabel("Days between snapshots")
Text(0.5, 0, 'Days between snapshots')
Show time gaps visualization code
ax_c.set_ylabel("Count")
Text(0, 0.5, 'Count')
Show time gaps visualization code
ax_c.legend(fontsize=8, loc='upper right')
<matplotlib.legend.Legend object at 0x10d496e40>
Show time gaps visualization code
# ============ (d) Gap Distribution WITHOUT Outliers ============
ax_d = fig.add_subplot(gs[2, 1])

# Reuse the gap values computed for panel (c), keeping only gaps within the
# 100-day threshold so the bulk of the distribution is visible.
gap_threshold_d = 100
filtered_gaps_d = all_gap_vals_c[all_gap_vals_c <= gap_threshold_d]
filtered_mean_d = filtered_gaps_d.mean()
filtered_median_d = np.median(filtered_gaps_d)

ax_d.hist(filtered_gaps_d, bins=50, color="#4C78A8", alpha=0.8, edgecolor="white")
(array([7.919e+03, 3.533e+03, 2.623e+03, 5.872e+03, 1.401e+03, 7.600e+02,
       1.196e+03, 9.330e+02, 4.350e+02, 6.410e+02, 5.220e+02, 4.370e+02,
       2.650e+02, 6.480e+02, 7.940e+02, 2.800e+02, 3.250e+02, 4.790e+02,
       2.210e+02, 2.540e+02, 2.890e+02, 2.700e+02, 1.300e+02, 2.240e+02,
       2.000e+00, 3.300e+01, 5.000e+01, 1.900e+01, 1.180e+02, 1.240e+02,
       3.900e+01, 0.000e+00, 6.000e+01, 1.060e+02, 0.000e+00, 1.500e+01,
       5.400e+01, 5.300e+01, 1.000e+00, 1.070e+02, 0.000e+00, 1.200e+01,
       5.500e+01, 5.600e+01, 0.000e+00, 5.400e+01, 1.000e+00, 8.000e+00,
       4.000e+00, 1.000e+00]), array([ 0.  ,  1.96,  3.92,  5.88,  7.84,  9.8 , 11.76, 13.72, 15.68,
       17.64, 19.6 , 21.56, 23.52, 25.48, 27.44, 29.4 , 31.36, 33.32,
       35.28, 37.24, 39.2 , 41.16, 43.12, 45.08, 47.04, 49.  , 50.96,
       52.92, 54.88, 56.84, 58.8 , 60.76, 62.72, 64.68, 66.64, 68.6 ,
       70.56, 72.52, 74.48, 76.44, 78.4 , 80.36, 82.32, 84.28, 86.24,
       88.2 , 90.16, 92.12, 94.08, 96.04, 98.  ]), <BarContainer object of 50 artists>)
Show time gaps visualization code
ax_d.axvline(filtered_gaps_d.min(), color="#2ca02c", linewidth=2, linestyle="--", label=f"Min: {filtered_gaps_d.min():.0f}d")
<matplotlib.lines.Line2D object at 0x10d611010>
Show time gaps visualization code
ax_d.axvline(filtered_gaps_d.max(), color="#d62728", linewidth=2, linestyle="--", label=f"Max: {filtered_gaps_d.max():.0f}d")
<matplotlib.lines.Line2D object at 0x10d6112b0>
Show time gaps visualization code
ax_d.axvline(filtered_mean_d, color="#ff7f0e", linewidth=2, linestyle="-", label=f"Mean: {filtered_mean_d:.1f}d")
<matplotlib.lines.Line2D object at 0x10d611400>
Show time gaps visualization code
ax_d.axvline(filtered_median_d, color="#9467bd", linewidth=2, linestyle="-", label=f"Median: {filtered_median_d:.1f}d")
<matplotlib.lines.Line2D object at 0x10d611550>
Show time gaps visualization code
ax_d.set_title(f"(d) Gap Distribution: WITHOUT Outliers (≤{gap_threshold_d}d)", fontsize=11, fontweight='bold', loc='left')
Text(0.0, 1.0, '(d) Gap Distribution: WITHOUT Outliers (≤100d)')
Show time gaps visualization code
ax_d.set_xlabel("Days between snapshots")
Text(0.5, 0, 'Days between snapshots')
Show time gaps visualization code
ax_d.set_ylabel("Count")
Text(0, 0.5, 'Count')
Show time gaps visualization code
ax_d.legend(fontsize=8, loc='upper right')
<matplotlib.legend.Legend object at 0x10d6116a0>
Show time gaps visualization code
plt.tight_layout()

# Save figure in both raster (PNG, 300 dpi) and vector (PDF) formats.
fig_output_dir = Path("../output/figures/manuscript_figures")
fig_output_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(fig_output_dir / "fig6_time_gap_analysis.png", dpi=300, bbox_inches='tight')
fig.savefig(fig_output_dir / "fig6_time_gap_analysis.pdf", bbox_inches='tight')
print(f"Saved figure to {fig_output_dir}")
Saved figure to ../output/figures/manuscript_figures
Show time gaps visualization code
# Summary statistics: how many gaps fall inside vs. outside the 100-day threshold.
outlier_count_d = len(all_gap_vals_c) - len(filtered_gaps_d)
print(f"Total gaps: {len(all_gap_vals_c):,}")
Total gaps: 31,955
Show time gaps visualization code
print(f"Gaps ≤{gap_threshold_d} days: {len(filtered_gaps_d):,} ({100*len(filtered_gaps_d)/len(all_gap_vals_c):.1f}%)")
Gaps ≤100 days: 31,423 (98.3%)
Show time gaps visualization code
print(f"Outliers (>{gap_threshold_d} days): {outlier_count_d:,} ({100*outlier_count_d/len(all_gap_vals_c):.1f}%)")
Outliers (>100 days): 532 (1.7%)

3. Create Stacked (Long) Format

Code
# Create stacked (long) format from intermediate data
# ID columns: columns that identify each observation
id_cols = ["capture_datetime", "asof_date", "jurisdiction", "source_file"]

# Value columns: columns to be stacked into rows
value_cols = ["offender_profiles", "arrestee", "forensic_profiles", "ndis_labs", "investigations_aided"]

# Reshape wide -> long (one row per observation/metric pair), dropping the
# derived total_profiles column, then order for easier viewing.
intermediate_stacked = (
    ndis_intermediate
    .drop(columns=["total_profiles"])
    .melt(
        id_vars=id_cols,
        value_vars=value_cols,
        var_name="metric_type",
        value_name="value",
    )
    .sort_values(["jurisdiction", "capture_datetime", "metric_type"])
    .reset_index(drop=True)
)

# Save stacked (long) data as intermediate file
intermediate_path = Path("../data/ndis/intermediate/ndis_intermediate.csv")
intermediate_path.parent.mkdir(parents=True, exist_ok=True)
intermediate_stacked.to_csv(intermediate_path, index=False)

print(f"Original shape: {ndis_intermediate.shape}")

print(f"Original shape: {ndis_intermediate.shape}")
Original shape: (32009, 10)
Code
print(f"Stacked shape: {intermediate_stacked.shape}")
Stacked shape: (160045, 6)
Code
print(f"Saved to: {intermediate_path}")
Saved to: ../data/ndis/intermediate/ndis_intermediate.csv
Code
print(f"\nMetric types: {intermediate_stacked['metric_type'].unique().tolist()}")

Metric types: ['arrestee', 'forensic_profiles', 'investigations_aided', 'ndis_labs', 'offender_profiles']
Code
intermediate_stacked.head(10)
     capture_datetime asof_date  ...           metric_type value
0 2001-07-15 04:15:59       NaT  ...              arrestee     0
1 2001-07-15 04:15:59       NaT  ...     forensic_profiles     0
2 2001-07-15 04:15:59       NaT  ...  investigations_aided    88
3 2001-07-15 04:15:59       NaT  ...             ndis_labs     4
4 2001-07-15 04:15:59       NaT  ...     offender_profiles     0
5 2001-08-22 11:55:31       NaT  ...              arrestee     0
6 2001-08-22 11:55:31       NaT  ...     forensic_profiles     0
7 2001-08-22 11:55:31       NaT  ...  investigations_aided    88
8 2001-08-22 11:55:31       NaT  ...             ndis_labs     4
9 2001-08-22 11:55:31       NaT  ...     offender_profiles     0

[10 rows x 6 columns]

4. Apply Flagging Rules

Apply the following flagging rules to detect data quality issues. Rules are applied in sequence, with each subsequent rule aware of flags from previous rules to avoid double-counting. All comparisons are made against previous UNFLAGGED values only.

Recovery Window

All spike/dip rules use a recovery window of at least 6 months, extended if needed to ensure at least 10 data points. This dual threshold ensures:

  • 6 months minimum: Transient errors (typos, decimal mistakes, stale cache) should be corrected within this timeframe as the website is refreshed. If a value persists 6+ months, it’s likely real.
  • 10 points minimum: Ensures sufficient sample size to reliably detect the “spike-then-recover” pattern, especially in data-sparse early periods where 6 months might contain only 2-3 snapshots.

Rule Sequence:

  1. decimal_spike_dip (8x / 0.125x threshold)
    • Applies to: ALL metrics
    • Logic: Catches decimal place entry errors (e.g., 100,000 entered as 10,000 or 1,000,000)
  2. spike_dip (2x / 0.5x threshold)
    • Applies to: offender_profiles, forensic_profiles, arrestee, investigations_aided
    • Logic: Catches doubling/halving typos or transient errors
  3. ndis_labs_spike (3x / 0.33x threshold)
    • Applies to: ndis_labs ONLY
    • Logic: Lower threshold for small numbers (labs typically range 1-20)
  4. stale_exact_reappearance (no recovery window needed)
    • Applies to: offender_profiles, forensic_profiles, arrestee
    • Logic: Flags when an EXACT value reappears after higher values were recorded (cached stale data)
    • Condition: Value must be below current max AND be an exact match to a previously seen value
    • Exception: Consecutive identical values are NOT flagged (indicates no change, not stale cache)
    • Key insight: Real corrections would produce NEW lower values, not exact repeats of old values
Show flagging rules code
# Load stacked data
df = intermediate_stacked.copy()
df["capture_datetime"] = pd.to_datetime(df["capture_datetime"], errors="coerce")
# Sort so each (jurisdiction, metric) series is contiguous and chronological —
# the flagging rules below scan each series in this order.
df = df.sort_values(["jurisdiction", "metric_type", "capture_datetime"]).reset_index(drop=True)

# ===== CORE SPIKE/DIP DETECTION FUNCTION =====
def apply_spike_dip_flags(df, spike_thresh, dip_thresh, flag_name, 
                           max_days=180, min_points=10,
                           applicable_metrics=None, existing_flag_cols=None):
    """
    Detect spike/dip anomalies with recovery check.
    
    Parameters:
    - spike_thresh: ratio threshold for spikes (e.g., 8 for 8x increase)
    - dip_thresh: ratio threshold for dips (e.g., 0.125 for 1/8 decrease)
    - flag_name: name of the flag column to create
    - max_days: time window for recovery (days) - default 180 (6 months)
    - min_points: minimum data points to check - default 10
    - applicable_metrics: list of metrics to apply to (None = all)
    - existing_flag_cols: previous flag columns to consider when finding baseline
    
    Logic:
    1. Compare each value to LAST UNFLAGGED value (baseline)
    2. If ratio exceeds threshold (spike OR dip), look for recovery
    3. Recovery = value returns to within 50% of baseline (for spike) or ~67% for dip
    4. Window = 6 months OR 10 data points, whichever is LARGER
    5. Only flag if recovery occurs within window
    6. If no recovery within window, treat as real change (update baseline)

    Returns:
    - df: the same DataFrame, mutated in place with a new boolean column
      `flag_name` (returned as well so calls can be chained/reassigned).
    """
    df[flag_name] = False
    
    # Each (jurisdiction, metric) pair is treated as an independent time series.
    for (jurisdiction, metric_type), group_idx in df.groupby(["jurisdiction", "metric_type"]).groups.items():
        # Skip if not in applicable metrics
        if applicable_metrics and metric_type not in applicable_metrics:
            continue
        
        group_indices = list(group_idx)
        n = len(group_indices)
        
        values = df.loc[group_indices, "value"].values
        datetimes = df.loc[group_indices, "capture_datetime"].values
        
        # Get existing flags from previous rules (these are already flagged)
        existing_flags = [False] * n
        if existing_flag_cols:
            for col in existing_flag_cols:
                if col in df.columns:
                    col_flags = df.loc[group_indices, col].values
                    for k in range(n):
                        if col_flags[k]:
                            existing_flags[k] = True
        
        local_flags = [False] * n
        last_good_value = None  # baseline: last unflagged, non-zero, non-NaN value
        
        i = 0
        while i < n:
            # Skip already flagged values - don't use them as baseline
            if existing_flags[i] or local_flags[i]:
                i += 1
                continue
            
            curr_val = values[i]
            
            # Skip invalid values (zeros are treated as missing, never as baseline)
            if curr_val == 0 or pd.isna(curr_val):
                i += 1
                continue
            
            # Initialize baseline with first valid unflagged value
            if last_good_value is None:
                last_good_value = curr_val
                i += 1
                continue
            
            # Compare to baseline (last unflagged good value)
            ratio = curr_val / last_good_value
            is_spike = ratio >= spike_thresh
            is_dip = ratio <= dip_thresh
            
            if is_spike or is_dip:
                baseline = last_good_value
                entry_datetime = datetimes[i]
                region_pos = [i]  # positions forming the candidate anomalous region
                region_closed = False
                
                # Look ahead for recovery
                # Window = 6 months OR 10 data points, whichever is LARGER
                # (i.e., continue until BOTH conditions are exceeded)
                points_checked = 0
                j = i + 1
                while j < n:
                    # Skip already flagged points when looking for recovery
                    if existing_flags[j] or local_flags[j]:
                        j += 1
                        continue
                    
                    points_checked += 1
                    days_elapsed = (datetimes[j] - entry_datetime) / np.timedelta64(1, 'D')
                    
                    # Stop only when BOTH: past 6 months AND checked at least 10 points
                    if days_elapsed > max_days and points_checked > min_points:
                        break
                    
                    next_val = values[j]
                    if next_val == 0 or pd.isna(next_val):
                        j += 1
                        continue
                    
                    # Check for recovery (value returns near baseline)
                    if is_spike:
                        # Recovery from spike: value drops back down near baseline
                        if next_val <= baseline * 1.5:  # Within 50% of baseline
                            region_closed = True
                            break
                    else:  # is_dip
                        # Recovery from dip: value rises back up near baseline
                        if next_val >= baseline * 0.67:  # Within 33% of baseline
                            region_closed = True
                            break
                    
                    # This point is still anomalous, add to region
                    region_pos.append(j)
                    j += 1
                
                if region_closed:
                    # Flag all points in the anomalous region
                    for pos in region_pos:
                        local_flags[pos] = True
                    # Recovery point becomes the new baseline
                    last_good_value = values[j] if j < n else last_good_value
                    i = j + 1
                else:
                    # No recovery = real change, update baseline to current value
                    last_good_value = curr_val
                    i += 1
            else:
                # Normal value, update baseline
                last_good_value = curr_val
                i += 1
        
        # Write this group's flags back into the full DataFrame column.
        for pos, flag in enumerate(local_flags):
            if flag:
                df.loc[group_indices[pos], flag_name] = True
    
    return df


# ===== RULE 1: decimal_spike_dip (8x / 0.125x) - ALL metrics =====
# Rules run in sequence; later rules receive the earlier flag columns via
# existing_flag_cols so already-flagged rows are never reused as baselines.
df = apply_spike_dip_flags(
    df, 
    spike_thresh=8, 
    dip_thresh=0.125, 
    flag_name="flag_decimal_spike_dip", 
    max_days=180,  # 6 months
    min_points=10,  # at least 10 points
    applicable_metrics=None,  # All metrics
    existing_flag_cols=None
)

# ===== RULE 2: spike_dip (2x / 0.5x) - specific metrics =====
df = apply_spike_dip_flags(
    df, 
    spike_thresh=2, 
    dip_thresh=0.5, 
    flag_name="flag_spike_dip", 
    max_days=180,  # 6 months
    min_points=10,  # at least 10 points
    applicable_metrics=["offender_profiles", "forensic_profiles", "arrestee", "investigations_aided"],
    existing_flag_cols=["flag_decimal_spike_dip"]
)

# ===== RULE 3: ndis_labs_spike (3x / 0.33x) - ndis_labs only =====
# Tighter multiplier than Rule 1 because lab counts are small numbers,
# where a 3x jump is already implausible.
df = apply_spike_dip_flags(
    df, 
    spike_thresh=3, 
    dip_thresh=0.33, 
    flag_name="flag_ndis_labs_spike", 
    max_days=180,  # 6 months
    min_points=10,  # at least 10 points
    applicable_metrics=["ndis_labs"],
    existing_flag_cols=["flag_decimal_spike_dip"]
)

# ===== RULE 4: stale_exact_reappearance - cumulative metrics =====
def apply_stale_exact_reappearance_flags(df, existing_flag_cols=None):
    """
    Flag exact reappearances of earlier, smaller values in cumulative metrics.

    Rationale: a genuinely corrected figure would be a NEW number, while a
    stale cached page re-serves a value byte-for-byte identical to one seen
    earlier in the series. So a value that (a) sits below the running maximum
    of unflagged values, (b) exactly matches some earlier unflagged value,
    and (c) is not simply a repeat of the immediately preceding observation
    (consecutive repeats mean "no change") is treated as stale cache.
    No recovery window is needed: the exact match itself is the evidence.

    Rows already marked in any of ``existing_flag_cols`` are skipped, and
    flagged rows never feed back into the baseline, history, or previous
    value. Applies only to the cumulative metrics listed below; other
    groups keep the default ``False``.
    """
    flag_col = "flag_stale_exact_reappearance"
    df[flag_col] = False

    cumulative_metrics = {"offender_profiles", "forensic_profiles", "arrestee"}

    for (_, metric_type), sub in df.groupby(["jurisdiction", "metric_type"]):
        if metric_type not in cumulative_metrics:
            continue

        vals = sub["value"].tolist()
        size = len(vals)

        # Combine upstream flags so already-explained rows are skipped.
        skip = [False] * size
        for col in (existing_flag_cols or []):
            if col in df.columns:
                skip = [s or bool(f) for s, f in zip(skip, sub[col].tolist())]

        stale = [False] * size
        running_max = None   # highest unflagged value seen so far
        history = set()      # every unflagged value seen so far
        previous = None      # last unflagged value (consecutive-repeat guard)

        for pos, v in enumerate(vals):
            if pd.isna(v) or skip[pos] or v == 0:
                continue
            if running_max is None:
                # First usable observation establishes the baseline.
                running_max = v
                history.add(v)
                previous = v
                continue
            if v < running_max and v in history and v != previous:
                # Stale cached value: do NOT update baseline/history/previous.
                stale[pos] = True
            else:
                # New max, new lower value, or consecutive repeat: all are
                # legitimate observations and extend the history.
                if v >= running_max:
                    running_max = v
                history.add(v)
                previous = v

        flagged_labels = [label for label, hit in zip(sub.index, stale) if hit]
        if flagged_labels:
            df.loc[flagged_labels, flag_col] = True

    return df

# ===== RULE 4 (applied): rows already caught by the ratio rules are excluded =====
df = apply_stale_exact_reappearance_flags(
    df, 
    existing_flag_cols=["flag_decimal_spike_dip", "flag_spike_dip"]
)

# ===== SUMMARY =====
print("=== Flagging Summary ===")
=== Flagging Summary ===
Show flagging rules code
print(f"Total rows: {len(df):,}")
Total rows: 160,045
Show flagging rules code
print(f"flag_decimal_spike_dip: {df['flag_decimal_spike_dip'].sum():,} ({100*df['flag_decimal_spike_dip'].mean():.2f}%)")
flag_decimal_spike_dip: 68 (0.04%)
Show flagging rules code
print(f"flag_spike_dip: {df['flag_spike_dip'].sum():,} ({100*df['flag_spike_dip'].mean():.2f}%)")
flag_spike_dip: 77 (0.05%)
Show flagging rules code
print(f"flag_ndis_labs_spike: {df['flag_ndis_labs_spike'].sum():,} ({100*df['flag_ndis_labs_spike'].mean():.2f}%)")
flag_ndis_labs_spike: 10 (0.01%)
Show flagging rules code
print(f"flag_stale_exact_reappearance: {df['flag_stale_exact_reappearance'].sum():,} ({100*df['flag_stale_exact_reappearance'].mean():.2f}%)")
flag_stale_exact_reappearance: 743 (0.46%)
Show flagging rules code
# Combined any-flag count
any_flag = (
    df['flag_decimal_spike_dip'] | 
    df['flag_spike_dip'] | 
    df['flag_ndis_labs_spike'] |
    df['flag_stale_exact_reappearance']
)
print(f"\nANY FLAG: {any_flag.sum():,} ({100*any_flag.mean():.2f}%)")

ANY FLAG: 898 (0.56%)
Show flagging rules code
# Save flagged data as the final time series file
final_path = Path("../data/ndis/final/ndis_time_series.csv")
final_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(final_path, index=False)
print(f"\nSaved final time series to: {final_path}")

Saved final time series to: ../data/ndis/final/ndis_time_series.csv
Show flagging rules code
# ===== CREATE ANOMALY LOG =====
# Create anomaly_log.csv with all flagged rows for downstream analysis
metric_display_names = {
    "offender_profiles": "Offender Profiles",
    "forensic_profiles": "Forensic Profiles",
    "arrestee": "Arrestee Profiles",
    "investigations_aided": "Investigations Aided",
    "ndis_labs": "NDIS Labs"
}

# Get all flagged rows
flagged_rows = df[any_flag].copy()

# Determine flag type for each row (use first matching flag)
def get_flag_type(row):
    """Return the highest-priority flag type set on *row* ('unknown' if none)."""
    # Priority order mirrors the rule application order: decimal > spike >
    # ndis_labs > stale. The first matching flag wins.
    priority = (
        ('flag_decimal_spike_dip', 'decimal_spike_dip'),
        ('flag_spike_dip', 'spike_dip'),
        ('flag_ndis_labs_spike', 'ndis_labs_spike'),
        ('flag_stale_exact_reappearance', 'stale_exact_reappearance'),
    )
    for column, label in priority:
        if row[column]:
            return label
    return 'unknown'

flagged_rows['flag_type'] = flagged_rows.apply(get_flag_type, axis=1)
flagged_rows['metric'] = flagged_rows['metric_type'].map(metric_display_names)

# Select columns for anomaly log
anomaly_log = flagged_rows[[
    'jurisdiction', 'metric', 'metric_type', 'capture_datetime', 'value',
    'flag_type', 'flag_decimal_spike_dip', 'flag_spike_dip', 
    'flag_ndis_labs_spike', 'flag_stale_exact_reappearance'
]].copy()

# Save anomaly log
anomaly_log_path = Path("../data/ndis/intermediate/anomaly_log.csv")
anomaly_log.to_csv(anomaly_log_path, index=False)
print(f"\nSaved anomaly log to: {anomaly_log_path}")

Saved anomaly log to: ../data/ndis/intermediate/anomaly_log.csv
Show flagging rules code
print(f"Total anomalies logged: {len(anomaly_log):,}")
Total anomalies logged: 898
Show flagging rules code
# Summary by metric
print("\nAnomalies by metric:")

Anomalies by metric:
Show flagging rules code
print(anomaly_log.groupby('metric').size().sort_values(ascending=False))
metric
Offender Profiles       420
Forensic Profiles       379
Investigations Aided     49
Arrestee Profiles        40
NDIS Labs                10
dtype: int64
Show flagging rules code
df.head(10)
     capture_datetime  ... flag_stale_exact_reappearance
0 2001-07-15 04:15:59  ...                         False
1 2001-08-22 11:55:31  ...                         False
2 2001-09-13 00:17:54  ...                         False
3 2001-09-13 04:17:42  ...                         False
4 2001-09-13 05:46:14  ...                         False
5 2001-09-13 12:18:54  ...                         False
6 2001-09-13 19:14:31  ...                         False
7 2001-09-13 22:48:20  ...                         False
8 2001-09-14 19:23:39  ...                         False
9 2001-09-14 21:16:11  ...                         False

[10 rows x 10 columns]

5. Visualization: Flagged Time Series

Visualize time series for ALL jurisdictions, organized by metric.

Color Legend:

  • Blue: Normal (unflagged) points
  • Red: decimal_spike_dip (8x/0.125x errors)
  • Orange: spike_dip (2x/0.5x errors)
  • Green: ndis_labs_spike (3x errors for small numbers)
  • Yellow-green (olive): stale_exact_reappearance (cached stale values)
Show visualization setup code
# Get all jurisdictions alphabetically
jurisdictions_sorted = sorted(df["jurisdiction"].dropna().unique())

# Calculate global date range across ALL metrics for consistent x-axes
global_date_min = df["capture_datetime"].min()
global_date_max = df["capture_datetime"].max()
print(f"Global date range: {global_date_min} to {global_date_max}")
Global date range: 2001-07-15 04:03:42 to 2025-08-15 00:20:50
Show visualization setup code
print(f"Jurisdictions: {len(jurisdictions_sorted)} total")
Jurisdictions: 54 total
Show visualization setup code
# Color scheme - each flag gets a unique color
COLORS = {
    "normal": "#4C78A8",               # Blue
    "decimal_spike_dip": "#D62728",    # Red
    "spike_dip": "#FF7F0E",            # Orange
    "ndis_labs_spike": "#2CA02C",      # Green
    "stale_exact": "#BCBD22"           # Yellow-green
}

# Human-readable number formatter for y-axis
from matplotlib.ticker import FuncFormatter

def human_format(x, pos):
    """Format a tick value as a short human-readable string (K, M, B).

    Parameters
    ----------
    x : float
        Tick value supplied by matplotlib.
    pos : int
        Tick position (unused, but required by ``FuncFormatter``).

    Returns
    -------
    str
        Abbreviated number, e.g. ``1.5B``, ``2.0M``, ``3K``, ``999``.

    Fix: the original compared ``x`` directly against the thresholds, so
    negative values always fell through to the raw-digits branch. Working
    on the magnitude and re-attaching the sign generalizes the formatter
    while producing identical output for the nonnegative values used here.
    """
    sign = '-' if x < 0 else ''
    mag = abs(x)
    if mag >= 1e9:
        return f'{sign}{mag/1e9:.1f}B'
    elif mag >= 1e6:
        return f'{sign}{mag/1e6:.1f}M'
    elif mag >= 1e3:
        return f'{sign}{mag/1e3:.0f}K'
    else:
        return f'{sign}{mag:.0f}'

human_formatter = FuncFormatter(human_format)

def plot_metric_jurisdiction(df, metric, jurisdiction, COLORS, human_formatter, global_date_min, global_date_max):
    """Plot a single metric-jurisdiction combination with flagged vs clean comparison.

    Draws a side-by-side figure: the left panel shows the full series with
    flagged points colored per rule, the right panel shows the same series
    with all flagged points removed. Both panels share y-limits (starting
    at 0) and the global x-range so panels are visually comparable.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format series with 'jurisdiction', 'metric_type', 'value',
        'capture_datetime' and the four boolean flag columns.
    metric, jurisdiction : str
        Which series to select and plot.
    COLORS : dict
        Color per display category ('normal', 'decimal_spike_dip', ...).
    human_formatter : matplotlib.ticker.FuncFormatter
        Y-axis tick formatter (K/M/B abbreviations).
    global_date_min, global_date_max : datetime-like
        Shared x-axis limits applied to every plot for comparability.

    Returns
    -------
    int or None
        Number of flagged points displayed, or None if the series is empty.
    """
    mask = (df["jurisdiction"] == jurisdiction) & (df["metric_type"] == metric)
    plot_df = df[mask].copy()
    
    if len(plot_df) == 0:
        return None
    
    # Create masks for flags (priority order for display: decimal > spike > ndis_labs > stale)
    # Each point gets exactly ONE display color: earlier rules mask out later ones.
    decimal_spike_dip_mask = plot_df["flag_decimal_spike_dip"].fillna(False).astype(bool)
    spike_dip_mask = plot_df["flag_spike_dip"].fillna(False).astype(bool) & ~decimal_spike_dip_mask
    ndis_labs_spike_mask = plot_df["flag_ndis_labs_spike"].fillna(False).astype(bool) & ~decimal_spike_dip_mask & ~spike_dip_mask
    stale_exact_mask = plot_df["flag_stale_exact_reappearance"].fillna(False).astype(bool) & ~decimal_spike_dip_mask & ~spike_dip_mask & ~ndis_labs_spike_mask
    
    # Calculate any_flag_mask
    any_flag_mask = decimal_spike_dip_mask | spike_dip_mask | ndis_labs_spike_mask | stale_exact_mask
    normal_mask = ~any_flag_mask
    
    # Create side-by-side figure (flagged vs clean)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 4))
    marker_size = 30
    
    # ===== LEFT PLOT: With flags =====
    # Connecting line drawn first (zorder=1) so scatter points sit on top.
    ax1.plot(plot_df["capture_datetime"], plot_df["value"], 
            color=COLORS["normal"], linewidth=0.8, alpha=0.5, zorder=1)
    
    # Each category is only drawn (and legended) when it has points.
    if normal_mask.sum() > 0:
        ax1.scatter(plot_df.loc[normal_mask, "capture_datetime"], 
                   plot_df.loc[normal_mask, "value"],
                   color=COLORS["normal"], s=15, alpha=0.6, label="Normal", zorder=2)
    
    if decimal_spike_dip_mask.sum() > 0:
        ax1.scatter(plot_df.loc[decimal_spike_dip_mask, "capture_datetime"], 
                   plot_df.loc[decimal_spike_dip_mask, "value"],
                   color=COLORS["decimal_spike_dip"], s=marker_size, label="decimal_spike_dip", zorder=3)
    
    if spike_dip_mask.sum() > 0:
        ax1.scatter(plot_df.loc[spike_dip_mask, "capture_datetime"], 
                   plot_df.loc[spike_dip_mask, "value"],
                   color=COLORS["spike_dip"], s=marker_size, label="spike_dip", zorder=3)
    
    if ndis_labs_spike_mask.sum() > 0:
        ax1.scatter(plot_df.loc[ndis_labs_spike_mask, "capture_datetime"], 
                   plot_df.loc[ndis_labs_spike_mask, "value"],
                   color=COLORS["ndis_labs_spike"], s=marker_size, label="ndis_labs_spike", zorder=3)
    
    if stale_exact_mask.sum() > 0:
        ax1.scatter(plot_df.loc[stale_exact_mask, "capture_datetime"], 
                   plot_df.loc[stale_exact_mask, "value"],
                   color=COLORS["stale_exact"], s=marker_size, label="stale_exact_reappearance", zorder=3)
    
    ax1.set_title(f"{metric} - {jurisdiction} (WITH FLAGS)", fontsize=11, fontweight="bold")
    ax1.set_xlabel("Date")
    ax1.set_ylabel("Value")
    ax1.legend(loc="upper left", fontsize=7, ncol=2)
    ax1.grid(axis="y", alpha=0.3)
    
    # ===== RIGHT PLOT: Clean (flags removed) =====
    clean_df = plot_df[~any_flag_mask]
    
    ax2.plot(clean_df["capture_datetime"], clean_df["value"], 
            color=COLORS["normal"], linewidth=0.8, alpha=0.5, zorder=1)
    ax2.scatter(clean_df["capture_datetime"], clean_df["value"],
               color=COLORS["normal"], s=15, alpha=0.6, label="Clean data", zorder=2)
    
    ax2.set_title(f"{metric} - {jurisdiction} (FLAGS REMOVED)", fontsize=11, fontweight="bold")
    ax2.set_xlabel("Date")
    ax2.set_ylabel("Value")
    ax2.legend(loc="upper left", fontsize=7)
    ax2.grid(axis="y", alpha=0.3)
    
    # Match y-axis scales for comparison - start at 0
    y_max = max(ax1.get_ylim()[1], ax2.get_ylim()[1])
    ax1.set_ylim(0, y_max)
    ax2.set_ylim(0, y_max)
    
    # Human-readable y-axis formatting
    ax1.yaxis.set_major_formatter(human_formatter)
    ax2.yaxis.set_major_formatter(human_formatter)
    
    # Set consistent x-axis (full timeline across all metrics)
    ax1.set_xlim(global_date_min, global_date_max)
    ax2.set_xlim(global_date_min, global_date_max)
    
    plt.tight_layout()
    plt.show()
    
    # Count of flagged points displayed (after priority de-duplication above).
    return any_flag_mask.sum()

print("Visualization functions defined.")
Visualization functions defined.

Offender Profiles

Show offender profiles visualization code
metric = "offender_profiles"
print(f"\n{'='*60}")

============================================================
Show offender profiles visualization code
print(f"METRIC: {metric}")
METRIC: offender_profiles
Show offender profiles visualization code
print(f"{'='*60}")
============================================================
Show offender profiles visualization code
for jurisdiction in jurisdictions_sorted:
    try:
        n_flags = plot_metric_jurisdiction(df, metric, jurisdiction, COLORS, human_formatter, global_date_min, global_date_max)
        if n_flags is not None:
            print(f"  {jurisdiction}: {n_flags} flags")
    except Exception as e:
        print(f"  ERROR plotting {jurisdiction} - {metric}: {e}")
        plt.close('all')
  Alabama: 7 flags
  Alaska: 4 flags
  Arizona: 5 flags
  Arkansas: 2 flags
  California: 18 flags
  Colorado: 15 flags
  Connecticut: 2 flags
  DC/FBI Lab: 6 flags
  DC/Metro PD: 0 flags
  Delaware: 1 flags
  Florida: 4 flags
  Georgia: 1 flags
  Hawaii: 2 flags
  Idaho: 0 flags
  Illinois: 4 flags
  Indiana: 2 flags
  Iowa: 2 flags
  Kansas: 15 flags
  Kentucky: 2 flags
  Louisiana: 1 flags
  Maine: 2 flags
  Maryland: 2 flags
  Massachusetts: 26 flags
  Michigan: 14 flags
  Minnesota: 7 flags
  Mississippi: 3 flags
  Missouri: 2 flags
  Montana: 15 flags
  Nebraska: 15 flags
  Nevada: 2 flags
  New Hampshire: 2 flags
  New Jersey: 21 flags
  New Mexico: 15 flags
  New York: 3 flags
  North Carolina: 5 flags
  North Dakota: 25 flags
  Ohio: 2 flags
  Oklahoma: 17 flags
  Oregon: 15 flags
  Pennsylvania: 8 flags
  Puerto Rico: 18 flags
  Rhode Island: 2 flags
  South Carolina: 4 flags
  South Dakota: 15 flags
  Tennessee: 4 flags
  Texas: 3 flags
  U.S. Army: 2 flags
  Utah: 2 flags
  Vermont: 2 flags
  Virginia: 2 flags
  Washington: 4 flags
  West Virginia: 6 flags
  Wisconsin: 2 flags
  Wyoming: 60 flags

Arrestee Profiles

Show arrestee profiles visualization code
metric = "arrestee"
print(f"\n{'='*60}")

============================================================
Show arrestee profiles visualization code
print(f"METRIC: {metric}")
METRIC: arrestee
Show arrestee profiles visualization code
print(f"{'='*60}")
============================================================
Show arrestee profiles visualization code
for jurisdiction in jurisdictions_sorted:
    try:
        n_flags = plot_metric_jurisdiction(df, metric, jurisdiction, COLORS, human_formatter, global_date_min, global_date_max)
        if n_flags is not None:
            print(f"  {jurisdiction}: {n_flags} flags")
    except Exception as e:
        print(f"  ERROR plotting {jurisdiction} - {metric}: {e}")
        plt.close('all')
  Alabama: 0 flags
  Alaska: 0 flags
  Arizona: 0 flags
  Arkansas: 0 flags
  California: 0 flags
  Colorado: 0 flags
  Connecticut: 0 flags
  DC/FBI Lab: 0 flags
  DC/Metro PD: 0 flags
  Delaware: 0 flags
  Florida: 0 flags
  Georgia: 0 flags
  Hawaii: 0 flags
  Idaho: 0 flags
  Illinois: 0 flags
  Indiana: 0 flags
  Iowa: 0 flags
  Kansas: 0 flags
  Kentucky: 0 flags
  Louisiana: 0 flags
  Maine: 0 flags
  Maryland: 0 flags
  Massachusetts: 0 flags
  Michigan: 0 flags
  Minnesota: 0 flags
  Mississippi: 0 flags
  Missouri: 0 flags
  Montana: 0 flags
  Nebraska: 0 flags
  Nevada: 0 flags
  New Hampshire: 0 flags
  New Jersey: 0 flags
  New Mexico: 0 flags
  New York: 0 flags
  North Carolina: 0 flags
  North Dakota: 0 flags
  Ohio: 0 flags
  Oklahoma: 0 flags
  Oregon: 0 flags
  Pennsylvania: 0 flags
  Puerto Rico: 0 flags
  Rhode Island: 10 flags
  South Carolina: 0 flags
  South Dakota: 0 flags
  Tennessee: 0 flags
  Texas: 0 flags
  U.S. Army: 0 flags
  Utah: 8 flags
  Vermont: 0 flags
  Virginia: 22 flags
  Washington: 0 flags
  West Virginia: 0 flags
  Wisconsin: 0 flags
  Wyoming: 0 flags

Forensic Profiles

Show forensic profiles visualization code
metric = "forensic_profiles"
print(f"\n{'='*60}")

============================================================
Show forensic profiles visualization code
print(f"METRIC: {metric}")
METRIC: forensic_profiles
Show forensic profiles visualization code
print(f"{'='*60}")
============================================================
Show forensic profiles visualization code
for jurisdiction in jurisdictions_sorted:
    try:
        n_flags = plot_metric_jurisdiction(df, metric, jurisdiction, COLORS, human_formatter, global_date_min, global_date_max)
        if n_flags is not None:
            print(f"  {jurisdiction}: {n_flags} flags")
    except Exception as e:
        print(f"  ERROR plotting {jurisdiction} - {metric}: {e}")
        plt.close('all')
  Alabama: 5 flags
  Alaska: 17 flags
  Arizona: 5 flags
  Arkansas: 2 flags
  California: 18 flags
  Colorado: 15 flags
  Connecticut: 2 flags
  DC/FBI Lab: 2 flags
  DC/Metro PD: 0 flags
  Delaware: 8 flags
  Florida: 4 flags
  Georgia: 2 flags
  Hawaii: 2 flags
  Idaho: 2 flags
  Illinois: 4 flags
  Indiana: 3 flags
  Iowa: 2 flags
  Kansas: 15 flags
  Kentucky: 2 flags
  Louisiana: 2 flags
  Maine: 6 flags
  Maryland: 2 flags
  Massachusetts: 4 flags
  Michigan: 10 flags
  Minnesota: 12 flags
  Mississippi: 2 flags
  Missouri: 2 flags
  Montana: 15 flags
  Nebraska: 30 flags
  Nevada: 2 flags
  New Hampshire: 2 flags
  New Jersey: 21 flags
  New Mexico: 16 flags
  New York: 3 flags
  North Carolina: 3 flags
  North Dakota: 15 flags
  Ohio: 2 flags
  Oklahoma: 17 flags
  Oregon: 15 flags
  Pennsylvania: 2 flags
  Puerto Rico: 0 flags
  Rhode Island: 3 flags
  South Carolina: 4 flags
  South Dakota: 12 flags
  Tennessee: 14 flags
  Texas: 3 flags
  U.S. Army: 6 flags
  Utah: 15 flags
  Vermont: 2 flags
  Virginia: 2 flags
  Washington: 4 flags
  West Virginia: 7 flags
  Wisconsin: 2 flags
  Wyoming: 12 flags

Investigations Aided

Show investigations aided visualization code
metric = "investigations_aided"
print(f"\n{'='*60}")

============================================================
Show investigations aided visualization code
print(f"METRIC: {metric}")
METRIC: investigations_aided
Show investigations aided visualization code
print(f"{'='*60}")
============================================================
Show investigations aided visualization code
for jurisdiction in jurisdictions_sorted:
    try:
        n_flags = plot_metric_jurisdiction(df, metric, jurisdiction, COLORS, human_formatter, global_date_min, global_date_max)
        if n_flags is not None:
            print(f"  {jurisdiction}: {n_flags} flags")
    except Exception as e:
        print(f"  ERROR plotting {jurisdiction} - {metric}: {e}")
        plt.close('all')
  Alabama: 0 flags
  Alaska: 0 flags
  Arizona: 0 flags
  Arkansas: 0 flags
  California: 2 flags
  Colorado: 0 flags
  Connecticut: 0 flags
  DC/FBI Lab: 0 flags
  DC/Metro PD: 4 flags
  Delaware: 4 flags
  Florida: 0 flags
  Georgia: 0 flags
  Hawaii: 1 flags
  Idaho: 0 flags
  Illinois: 0 flags
  Indiana: 3 flags
  Iowa: 0 flags
  Kansas: 0 flags
  Kentucky: 0 flags
  Louisiana: 0 flags
  Maine: 0 flags
  Maryland: 0 flags
  Massachusetts: 0 flags
  Michigan: 7 flags
  Minnesota: 1 flags
  Mississippi: 1 flags
  Missouri: 0 flags
  Montana: 6 flags
  Nebraska: 0 flags
  Nevada: 0 flags
  New Hampshire: 0 flags
  New Jersey: 0 flags
  New Mexico: 0 flags
  New York: 0 flags
  North Carolina: 0 flags
  North Dakota: 1 flags
  Ohio: 4 flags
  Oklahoma: 3 flags
  Oregon: 0 flags
  Pennsylvania: 0 flags
  Puerto Rico: 1 flags
  Rhode Island: 1 flags
  South Carolina: 0 flags
  South Dakota: 1 flags
  Tennessee: 0 flags
  Texas: 0 flags
  U.S. Army: 2 flags
  Utah: 0 flags
  Vermont: 0 flags
  Virginia: 0 flags
  Washington: 0 flags
  West Virginia: 6 flags
  Wisconsin: 1 flags
  Wyoming: 0 flags

NDIS Labs

Show NDIS labs visualization code
metric = "ndis_labs"
print(f"\n{'='*60}")

============================================================
Show NDIS labs visualization code
print(f"METRIC: {metric}")
METRIC: ndis_labs
Show NDIS labs visualization code
print(f"{'='*60}")
============================================================
Show NDIS labs visualization code
for jurisdiction in jurisdictions_sorted:
    try:
        n_flags = plot_metric_jurisdiction(df, metric, jurisdiction, COLORS, human_formatter, global_date_min, global_date_max)
        if n_flags is not None:
            print(f"  {jurisdiction}: {n_flags} flags")
    except Exception as e:
        print(f"  ERROR plotting {jurisdiction} - {metric}: {e}")
        plt.close('all')
  Alabama: 0 flags
  Alaska: 0 flags
  Arizona: 0 flags
  Arkansas: 0 flags
  California: 0 flags
  Colorado: 0 flags
  Connecticut: 0 flags
  DC/FBI Lab: 0 flags
  DC/Metro PD: 0 flags
  Delaware: 0 flags
  Florida: 0 flags
  Georgia: 0 flags
  Hawaii: 0 flags
  Idaho: 0 flags
  Illinois: 0 flags
  Indiana: 0 flags
  Iowa: 0 flags
  Kansas: 0 flags
  Kentucky: 0 flags
  Louisiana: 0 flags
  Maine: 0 flags
  Maryland: 0 flags
  Massachusetts: 0 flags
  Michigan: 7 flags
  Minnesota: 0 flags
  Mississippi: 0 flags
  Missouri: 0 flags
  Montana: 0 flags
  Nebraska: 0 flags
  Nevada: 0 flags
  New Hampshire: 0 flags
  New Jersey: 0 flags
  New Mexico: 0 flags
  New York: 0 flags
  North Carolina: 0 flags
  North Dakota: 0 flags
  Ohio: 0 flags
  Oklahoma: 3 flags
  Oregon: 0 flags
  Pennsylvania: 0 flags
  Puerto Rico: 0 flags
  Rhode Island: 0 flags
  South Carolina: 0 flags
  South Dakota: 0 flags
  Tennessee: 0 flags
  Texas: 0 flags
  U.S. Army: 0 flags
  Utah: 0 flags
  Vermont: 0 flags
  Virginia: 0 flags
  Washington: 0 flags
  West Virginia: 0 flags
  Wisconsin: 0 flags
  Wyoming: 0 flags

6. Anomaly Summary and Statistics

This section summarizes the anomalies detected by our flagging rules and provides statistics for reporting.

Show anomaly statistics code
library(tidyverse)
library(here)

# Load anomaly log
anomaly_log <- read_csv(here("data", "ndis", "intermediate", "anomaly_log.csv"), show_col_types = FALSE)

# Define metric order and colors
metric_order <- c("Offender Profiles", "Forensic Profiles", "Arrestee Profiles",
                  "Investigations Aided", "NDIS Labs")

metric_colors <- c(
  "Offender Profiles" = "#31688e",
  "Forensic Profiles" = "#440154",
  "Arrestee Profiles" = "#35b779",
  "Investigations Aided" = "#fde725",
  "NDIS Labs" = "#21918c"
)

# ===== SUMMARY STATISTICS =====
total_anomalies <- nrow(anomaly_log)

# By flag type
flag_type_counts <- anomaly_log %>%
  count(flag_type) %>%
  arrange(desc(n))

# Map flag types to descriptive names
flag_type_labels <- c(
  "decimal_spike_dip" = "decimal place errors",
  "spike_dip" = "spike–dip events",
  "ndis_labs_spike" = "NDIS labs spikes",
  "stale_exact_reappearance" = "stale cache reappearances"
)

# By metric
metric_counts <- anomaly_log %>%
  count(metric) %>%
  arrange(desc(n))

cat("===== ANOMALY SUMMARY STATISTICS =====\n\n")
===== ANOMALY SUMMARY STATISTICS =====
Show anomaly statistics code
cat(sprintf("Total anomalies detected: %d\n\n", total_anomalies))
Total anomalies detected: 898
Show anomaly statistics code
cat("By flag type:\n")
By flag type:
Show anomaly statistics code
for (i in 1:nrow(flag_type_counts)) {
  label <- flag_type_labels[flag_type_counts$flag_type[i]]
  if (is.na(label)) label <- flag_type_counts$flag_type[i]
  cat(sprintf("  - %s: %d\n", label, flag_type_counts$n[i]))
}
  - stale cache reappearances: 743
  - spike–dip events: 77
  - decimal place errors: 68
  - NDIS labs spikes: 10
Show anomaly statistics code
cat("\nBy metric:\n")

By metric:
Show anomaly statistics code
for (i in 1:nrow(metric_counts)) {
  cat(sprintf("  - %s: %d\n", metric_counts$metric[i], metric_counts$n[i]))
}
  - Offender Profiles: 420
  - Forensic Profiles: 379
  - Investigations Aided: 49
  - Arrestee Profiles: 40
  - NDIS Labs: 10
Show anomaly statistics code
# By jurisdiction (top 10)
jurisdiction_counts <- anomaly_log %>%
  count(jurisdiction) %>%
  arrange(desc(n)) %>%
  head(10)

cat("\nTop 10 jurisdictions by anomaly count:\n")

Top 10 jurisdictions by anomaly count:
Show anomaly statistics code
for (i in 1:nrow(jurisdiction_counts)) {
  cat(sprintf("  - %s: %d\n", jurisdiction_counts$jurisdiction[i], jurisdiction_counts$n[i]))
}
  - Wyoming: 72
  - Nebraska: 45
  - New Jersey: 42
  - North Dakota: 41
  - Oklahoma: 40
  - California: 38
  - Michigan: 38
  - Montana: 36
  - New Mexico: 31
  - Colorado: 30
Show anomaly distribution figure code
library(patchwork)

# Use viridis-inspired colors for flag types (matching the metric color scheme)
flag_colors <- c(
  "decimal_spike_dip" = "#440154",        # Deep purple (viridis)
  "spike_dip" = "#31688e",                 # Blue (viridis)
  "ndis_labs_spike" = "#35b779",           # Green (viridis)
  "stale_exact_reappearance" = "#fde725"   # Yellow (viridis)
)

# Flag type display labels
flag_labels <- c(
  "decimal_spike_dip" = "Decimal Error",
  "spike_dip" = "Spike/Dip",
  "ndis_labs_spike" = "Labs Spike",
  "stale_exact_reappearance" = "Stale Cache"
)

# Get ALL jurisdictions alphabetically (for consistent x-axis across panels)
all_jurisdictions <- anomaly_log %>%
  pull(jurisdiction) %>%
  unique() %>%
  sort()

# Prepare data: count by metric, jurisdiction, and flag type
anomaly_plot_data <- anomaly_log %>%
  group_by(metric, jurisdiction, flag_type) %>%
  summarise(count = n(), .groups = "drop") %>%
  mutate(
    metric = factor(metric, levels = metric_order),
    flag_type = factor(flag_type, levels = names(flag_colors)),
    jurisdiction = factor(jurisdiction, levels = all_jurisdictions)
  )

# Calculate global y-axis max (sum of all flag types per jurisdiction per metric)
y_max_global <- anomaly_plot_data %>%
  group_by(metric, jurisdiction) %>%
  summarise(total = sum(count), .groups = "drop") %>%
  pull(total) %>%
  max(na.rm = TRUE)

# Add 10% padding
y_max_global <- ceiling(y_max_global * 1.1)

# Common theme for panels (legend will be collected at bottom)
theme_panel <- theme_minimal(base_size = 10) +
  theme(
    panel.grid = element_blank(),
    axis.line = element_line(color = "black", linewidth = 0.4),
    axis.ticks = element_line(color = "black", linewidth = 0.4),
    axis.text = element_text(color = "black", size = 8),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 7),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 10, face = "bold", margin = margin(r = 5)),
    legend.position = "bottom",
    legend.title = element_text(face = "bold", size = 12),
    legend.text = element_text(size = 11),
    legend.key.size = unit(0.6, "cm"),
    plot.title = element_text(face = "bold", size = 11),
    plot.margin = margin(t = 5, r = 5, b = 2, l = 5)
  )

# Create individual panels for each metric with consistent y-axis
# Build one stacked-anomaly-count bar panel for a single metric.
#
# Args:
#   metric_name:  display name of the metric (matches anomaly_log$metric)
#   panel_label:  panel tag such as "(A)" prepended to the title
#   y_max:        shared upper y-limit so all panels are directly comparable
#
# Returns a ggplot object. drop = FALSE on both discrete scales keeps the
# full jurisdiction x-axis and the full flag-type legend even when a level
# has no anomalies in this metric.
create_metric_panel <- function(metric_name, panel_label, y_max) {
  plot_data <- anomaly_plot_data %>%
    filter(metric == metric_name)
  
  ggplot(plot_data, aes(x = jurisdiction, y = count, fill = flag_type)) +
    geom_bar(stat = "identity", position = "stack", color = "black",
             linewidth = 0.2, width = 0.8) +
    scale_fill_manual(name = "Anomaly Type", values = flag_colors, labels = flag_labels,
                      drop = FALSE) +
    scale_x_discrete(limits = all_jurisdictions, drop = FALSE) +
    scale_y_continuous(limits = c(0, y_max), expand = expansion(mult = c(0, 0))) +
    labs(title = paste0(panel_label, " ", metric_name), y = "Count") +
    theme_panel +
    guides(fill = guide_legend(nrow = 1))
}

# Create panels with consistent y-axis
# NOTE(review): metric_order includes "NDIS Labs" but no panel is built for
# it here (only A-D) — confirm the omission is intentional (it has only a
# handful of anomalies) or add a fifth panel.
p_offender <- create_metric_panel("Offender Profiles", "(A)", y_max_global)
p_forensic <- create_metric_panel("Forensic Profiles", "(B)", y_max_global)
p_arrestee <- create_metric_panel("Arrestee Profiles", "(C)", y_max_global)
p_investigations <- create_metric_panel("Investigations Aided", "(D)", y_max_global)

# Combine panels vertically with shared legend at bottom using patchwork
final_plot <- (p_offender / p_forensic / p_arrestee / p_investigations) +
  plot_layout(heights = c(1, 1, 1, 1), guides = "collect") +
  plot_annotation(
    title = "Anomaly Distribution by Metric and Jurisdiction",
    theme = theme(
      plot.title = element_text(face = "bold", size = 14, hjust = 0.5)
    )
  ) &
  theme(legend.position = "bottom")

final_plot

# Save figure
fig_output_dir <- here("output", "figures", "manuscript_figures")
dir.create(fig_output_dir, recursive = TRUE, showWarnings = FALSE)

ggsave(
  filename = file.path(fig_output_dir, "fig5_anomaly_distribution.pdf"),
  plot = final_plot,
  width = 14,
  height = 16,
  dpi = 300,
  units = "in",
  device = cairo_pdf
)

ggsave(
  filename = file.path(fig_output_dir, "fig5_anomaly_distribution.png"),
  plot = final_plot,
  width = 14,
  height = 16,
  dpi = 300,
  units = "in"
)

cat(sprintf("\nSaved figure to: %s\n", fig_output_dir))

Saved figure to: /Users/tlasisi/GitHub/PODFRIDGE-Databases/output/figures/manuscript_figures
Figure 1

7. Comparison with Peer-Reviewed Literature

As an additional validation check, we compare corrected national aggregates against published NDIS totals from FBI press releases and peer-reviewed articles. This comparison helps verify the technical quality of our NDIS time series reconstruction.

Show yearly aggregate computation code
# Compute yearly aggregates from our cleaned data
# For each jurisdiction-year, take the maximum value (end-of-year snapshot)
# NOTE: the yearly max is a reasonable end-of-year proxy only because these
# profile counts are cumulative; for a non-cumulative metric it would
# overstate the year-end value.
df_clean = df[~(df["flag_decimal_spike_dip"] | df["flag_spike_dip"] | 
                df["flag_ndis_labs_spike"] | df["flag_stale_exact_reappearance"])].copy()

df_clean["year"] = df_clean["capture_datetime"].dt.year

# Pivot to wide format and get max per jurisdiction-year
yearly_by_jurisdiction = df_clean.groupby(["jurisdiction", "year", "metric_type"])["value"].max().reset_index()
# (jurisdiction, year, metric_type) is unique after the max() above, so
# pivot_table's default aggfunc never has to combine rows here.
yearly_by_jurisdiction = yearly_by_jurisdiction.pivot_table(
    index=["jurisdiction", "year"], 
    columns="metric_type", 
    values="value"
).reset_index()

# Sum across all jurisdictions for national totals
growth_data_yearly = yearly_by_jurisdiction.groupby("year").agg({
    "offender_profiles": "sum",
    "arrestee": "sum",
    "forensic_profiles": "sum",
    "investigations_aided": "sum",
    "ndis_labs": "sum"
}).reset_index()

# Fill NaN with 0 and compute total
# (e.g. arrestee counts are absent before 2012 and become 0 here, which is
# why total_profiles equals offender + forensic alone in early years)
growth_data_yearly = growth_data_yearly.fillna(0)
growth_data_yearly["total_profiles"] = (
    growth_data_yearly["offender_profiles"] + 
    growth_data_yearly["arrestee"] + 
    growth_data_yearly["forensic_profiles"]
)

# Create date column (mid-year)
growth_data_yearly["date"] = pd.to_datetime(growth_data_yearly["year"].astype(str) + "-06-01")

print("Yearly national aggregates (cleaned data):")
Yearly national aggregates (cleaned data):
Show yearly aggregate computation code
print(growth_data_yearly[["year", "offender_profiles", "arrestee", "forensic_profiles", "total_profiles", "investigations_aided"]].to_string())
metric_type  year  offender_profiles   arrestee  forensic_profiles  total_profiles  investigations_aided
0            2001           655293.0        0.0            24618.0        679911.0                2825.0
1            2002          1036328.0        0.0            38022.0       1074350.0                5207.0
2            2003          1446640.0        0.0            67295.0       1513935.0               10326.0
3            2004          1907536.0        0.0            93798.0       2001334.0               20439.0
4            2005          2762361.0        0.0           124488.0       2886849.0               30298.0
5            2006          3720741.0        0.0           154072.0       3874813.0               40263.0
6            2007          5070961.0        0.0           195433.0       5266394.0               59754.0
7            2008          6249619.0        0.0           241881.0       6491500.0               78100.0
8            2009          7008145.0        0.0           277159.0       7285304.0               92996.0
9            2010          8229049.0        0.0           332792.0       8561841.0              121974.0
10           2011          9316214.0        0.0           399048.0       9715262.0              158053.0
11           2012          9930784.0  1245961.0           451747.0      11628492.0              182828.0
12           2013         10692447.0  1712547.0           527076.0      12932070.0              219307.0
13           2014         11219306.0  2065981.0           589556.0      13874843.0              252248.0
14           2015         11962455.0  2325904.0           656701.0      14945060.0              282113.0
15           2016         12255893.0  2298808.0           690195.0      15244896.0              313502.0
16           2017         13084146.0  2891857.0           811242.0      16787245.0              380763.0
17           2018         13566718.0  3324282.0           895228.0      17786228.0              428862.0
18           2019         14013946.0  3760236.0           979841.0      18754023.0              477812.0
19           2020         14377412.0  4181569.0          1069155.0      19628136.0              528887.0
20           2021         14836566.0  4513955.0          1144266.0      20494787.0              576427.0
21           2022         14836490.0  4513955.0          1144255.0      20494700.0              574343.0
22           2023         16532333.0  5190629.0          1282432.0      23005394.0              659223.0
23           2024         17004068.0  5392101.0          1322543.0      23718712.0              680122.0
24           2025         18648655.0  5954756.0          1421751.0      26025162.0              739456.0
Show cross-reference validation plots
# Libraries: plotting, repelled labels, axis formatting, multi-panel layout,
# project-relative paths, date arithmetic, and palette generation.
library(tidyverse)
library(ggrepel)
library(scales)
library(patchwork)
library(here)
library(lubridate)
library(qualpalr)

# Load literature data
# Published NDIS totals (FBI brochures / peer-reviewed papers), keyed by the
# "as of" date each published figure refers to.
literature_data <- read_csv(here("data", "ndis_crossref", "literature_data.csv"), show_col_types = FALSE) |>
  mutate(asof_date = as.Date(asof_date))

# Get yearly aggregates from Python via reticulate
# Convert pandas DataFrame to R data.frame
# NOTE(review): reads each pandas column's .values array individually;
# assumes the Python chunk defining growth_data_yearly has already run.
py_df <- py$growth_data_yearly
growth_data <- data.frame(
  year = as.numeric(py_df$year$values),
  offender_profiles = as.numeric(py_df$offender_profiles$values),
  arrestee = as.numeric(py_df$arrestee$values),
  forensic_profiles = as.numeric(py_df$forensic_profiles$values),
  investigations_aided = as.numeric(py_df$investigations_aided$values),
  ndis_labs = as.numeric(py_df$ndis_labs$values),
  total_profiles = as.numeric(py_df$total_profiles$values)
) |>
  mutate(date = as.Date(paste0(year, "-06-01")))  # mid-year stamp, matching the Python side

# Qualitative palette shared with manuscript_figures.qmd: 10 colours sampled
# from a constrained hue/saturation/lightness region.
qualpal_palette <- qualpal(
  n = 10,
  list(h = c(190, 330), s = c(0.35, 0.75), l = c(0.45, 0.85))
)
qp_hex <- qualpal_palette$hex

# One colour per metric panel (indices chosen to match manuscript_figures.qmd).
offender_color       <- qp_hex[[1]]
arrestee_color       <- qp_hex[[3]]
forensic_color       <- qp_hex[[4]]
total_color          <- qp_hex[[5]]
investigations_color <- qp_hex[[6]]

# Pad the x-axis by one year on each side so every panel shares the same limits.
date_span <- range(growth_data$date)
extended_date_range <- c(date_span[1] - years(1), date_span[2] + years(1))
# Axis formatter
# Compact axis labels: values >= 1e6 rendered as "<x>M", values >= 1e3 as
# "<x>K", smaller values passed through unchanged.
scale_axis <- function(values) {
  thousands <- ifelse(values >= 1e3, paste0(values / 1e3, "K"), values)
  ifelse(values >= 1e6, paste0(values / 1e6, "M"), thousands)
}

# Convert a 10 pt font size to ggplot's geom-text size units.
label_size <- 10 / .pt

# Common theme
# Shared minimal theme: horizontal gridlines only, black axis lines/ticks,
# angled x labels, no legend (colours are explained by the panel titles).
theme_crossref <- theme_minimal(base_size = 11) +
  theme(
    plot.title = element_text(face = "bold", size = 13),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_line(color = "gray90", linewidth = 0.3),
    axis.line = element_line(color = "black", linewidth = 0.3),
    axis.ticks = element_line(color = "black", linewidth = 0.3),
    axis.text = element_text(color = "black", size = 10),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

# 1. Offender Profiles
# Panel A: our offender-profile series (line + points) overlaid with
# published values (X markers). Labels split into pre-/post-2020 groups so
# each can take a different vertical nudge to avoid crowding.
offender_lit <- filter(literature_data, !is.na(offender_profiles))
p1 <- ggplot() +
  geom_line(data = growth_data, aes(x = date, y = offender_profiles), 
            color = offender_color, linewidth = 0.8) +
  geom_point(data = growth_data, aes(x = date, y = offender_profiles), 
             color = offender_color, size = 1.5) +
  geom_point(data = offender_lit, aes(x = asof_date, y = offender_profiles), 
             shape = 4, size = 2.5, stroke = 1, color = offender_color) +
  geom_label_repel(
    data = filter(offender_lit, year(asof_date) < 2020),
    aes(x = asof_date, y = offender_profiles, label = short_label),
    size = label_size, nudge_y = 4e6,
    box.padding = 0.3, point.padding = 0.3, min.segment.length = 0.3,
    segment.color = "gray50", max.overlaps = Inf,
    fill = "white", label.size = 0.2
  ) +
  geom_label_repel(
    data = filter(offender_lit, year(asof_date) >= 2020),
    aes(x = asof_date, y = offender_profiles, label = short_label),
    size = label_size, nudge_y = 1e6,
    box.padding = 0.3, point.padding = 0.3, min.segment.length = 0.3,
    segment.color = "gray50", max.overlaps = Inf,
    fill = "white", label.size = 0.2
  ) +
  scale_x_date(date_breaks = "2 years", date_labels = "%Y", limits = extended_date_range) +
  scale_y_continuous(labels = scale_axis, limits = c(0, max(offender_lit$offender_profiles, na.rm = TRUE) * 1.15),
                     expand = expansion(mult = c(0, 0.05))) +
  labs(title = "(A) Offender Profiles", x = NULL, y = "Count") +
  theme_crossref

# 2. Arrestee Profiles  
# Panel B: arrestee profiles; a single label group suffices here, nudged
# upward by 500K to clear the series.
arrestee_lit <- filter(literature_data, !is.na(arrestee_profiles))
p2 <- ggplot() +
  geom_line(data = growth_data, aes(x = date, y = arrestee), 
            color = arrestee_color, linewidth = 0.8) +
  geom_point(data = growth_data, aes(x = date, y = arrestee), 
             color = arrestee_color, size = 1.5) +
  geom_point(data = arrestee_lit, aes(x = asof_date, y = arrestee_profiles), 
             shape = 4, size = 2.5, stroke = 1, color = arrestee_color) +
  geom_label_repel(
    data = arrestee_lit,
    aes(x = asof_date, y = arrestee_profiles, label = short_label),
    size = label_size, nudge_y = 5e5,
    box.padding = 0.3, point.padding = 0.3, min.segment.length = 0.3,
    segment.color = "gray50", max.overlaps = Inf,
    fill = "white", label.size = 0.2
  ) +
  scale_x_date(date_breaks = "2 years", date_labels = "%Y", limits = extended_date_range) +
  scale_y_continuous(labels = scale_axis, limits = c(0, max(arrestee_lit$arrestee_profiles, na.rm = TRUE) * 1.15),
                     expand = expansion(mult = c(0, 0.05))) +
  labs(title = "(B) Arrestee Profiles", x = NULL, y = "Count") +
  theme_crossref

# 3. Forensic Profiles
# Panel C: forensic profiles; same pre-/post-2020 label split as panel A,
# with nudges scaled to this metric's smaller magnitude.
forensic_lit <- filter(literature_data, !is.na(forensic_profiles))
p3 <- ggplot() +
  geom_line(data = growth_data, aes(x = date, y = forensic_profiles), 
            color = forensic_color, linewidth = 0.8) +
  geom_point(data = growth_data, aes(x = date, y = forensic_profiles), 
             color = forensic_color, size = 1.5) +
  geom_point(data = forensic_lit, aes(x = asof_date, y = forensic_profiles), 
             shape = 4, size = 2.5, stroke = 1, color = forensic_color) +
  geom_label_repel(
    data = filter(forensic_lit, year(asof_date) < 2020),
    aes(x = asof_date, y = forensic_profiles, label = short_label),
    size = label_size, nudge_y = 4e5,
    box.padding = 0.3, point.padding = 0.3, min.segment.length = 0.3,
    segment.color = "gray50", max.overlaps = Inf,
    fill = "white", label.size = 0.2
  ) +
  geom_label_repel(
    data = filter(forensic_lit, year(asof_date) >= 2020),
    aes(x = asof_date, y = forensic_profiles, label = short_label),
    size = label_size, nudge_y = 1e5,
    box.padding = 0.3, point.padding = 0.3, min.segment.length = 0.3,
    segment.color = "gray50", max.overlaps = Inf,
    fill = "white", label.size = 0.2
  ) +
  scale_x_date(date_breaks = "2 years", date_labels = "%Y", limits = extended_date_range) +
  scale_y_continuous(labels = scale_axis, limits = c(0, max(forensic_lit$forensic_profiles, na.rm = TRUE) * 1.15),
                     expand = expansion(mult = c(0, 0.05))) +
  labs(title = "(C) Forensic Profiles", x = NULL, y = "Count") +
  theme_crossref

# 4. Total Profiles
# Panel D: total profiles; this panel has the densest label set, so labels
# are split into three era groups (<2015, 2015-2019, >=2020), each with its
# own nudge direction/magnitude to keep them legible.
total_lit <- filter(literature_data, !is.na(total_profiles))
p4 <- ggplot() +
  geom_line(data = growth_data, aes(x = date, y = total_profiles), 
            color = total_color, linewidth = 0.8) +
  geom_point(data = growth_data, aes(x = date, y = total_profiles), 
             color = total_color, size = 1.5) +
  geom_point(data = total_lit, aes(x = asof_date, y = total_profiles), 
             shape = 4, size = 2.5, stroke = 1, color = total_color) +
  geom_label_repel(
    data = filter(total_lit, year(asof_date) < 2015),
    aes(x = asof_date, y = total_profiles, label = short_label),
    size = label_size, nudge_y = 6e6, nudge_x = -200,
    box.padding = 0.5, point.padding = 0.5, min.segment.length = 0.3,
    segment.color = "gray50", max.overlaps = Inf, direction = "both",
    fill = "white", label.size = 0.2
  ) +
  geom_label_repel(
    data = filter(total_lit, year(asof_date) >= 2015 & year(asof_date) < 2020),
    aes(x = asof_date, y = total_profiles, label = short_label),
    size = label_size, nudge_y = 4e6,
    box.padding = 0.5, point.padding = 0.5, min.segment.length = 0.3,
    segment.color = "gray50", max.overlaps = Inf, direction = "both",
    fill = "white", label.size = 0.2
  ) +
  geom_label_repel(
    data = filter(total_lit, year(asof_date) >= 2020),
    aes(x = asof_date, y = total_profiles, label = short_label),
    size = label_size, nudge_y = 2e6, nudge_x = -300,
    box.padding = 0.5, point.padding = 0.5, min.segment.length = 0.3,
    segment.color = "gray50", max.overlaps = Inf, direction = "x",
    fill = "white", label.size = 0.2
  ) +
  scale_x_date(date_breaks = "2 years", date_labels = "%Y", limits = extended_date_range) +
  scale_y_continuous(labels = scale_axis, limits = c(0, max(total_lit$total_profiles, na.rm = TRUE) * 1.2),
                     expand = expansion(mult = c(0, 0.05))) +
  labs(title = "(D) Total Profiles", x = NULL, y = "Count") +
  theme_crossref

# 5. Investigations Aided (full width at bottom)
# Panel E: investigations aided; labels split into <2021 / ==2021 / >2021
# groups with per-group nudges because several citations cluster near 2021.
investigations_lit <- filter(literature_data, !is.na(investigations_aided))
p5 <- ggplot() +
  geom_line(data = growth_data, aes(x = date, y = investigations_aided), 
            color = investigations_color, linewidth = 0.8) +
  geom_point(data = growth_data, aes(x = date, y = investigations_aided), 
             color = investigations_color, size = 1.5) +
  geom_point(data = investigations_lit, aes(x = asof_date, y = investigations_aided), 
             shape = 4, size = 2.5, stroke = 1, color = investigations_color) +
  geom_label_repel(
    data = filter(investigations_lit, year(asof_date) < 2021),
    aes(x = asof_date, y = investigations_aided, label = short_label),
    size = label_size, nudge_y = 80000,
    box.padding = 0.3, point.padding = 0.3, min.segment.length = 0.3,
    segment.color = "gray50", direction = "both", max.overlaps = Inf,
    fill = "white", label.size = 0.2
  ) +
  geom_label_repel(
    data = filter(investigations_lit, year(asof_date) == 2021),
    aes(x = asof_date, y = investigations_aided, label = short_label),
    size = label_size, nudge_y = 10000, nudge_x = 400,
    box.padding = 0.3, point.padding = 0.3, min.segment.length = 0.3,
    segment.color = "gray50", max.overlaps = Inf,
    fill = "white", label.size = 0.2
  ) +
  geom_label_repel(
    data = filter(investigations_lit, year(asof_date) > 2021),
    aes(x = asof_date, y = investigations_aided, label = short_label),
    size = label_size, nudge_y = 15000, nudge_x = -300, direction = "x",
    box.padding = 0.3, point.padding = 0.3, min.segment.length = 0.3,
    segment.color = "gray50", max.overlaps = Inf,
    fill = "white", label.size = 0.2
  ) +
  scale_x_date(date_breaks = "2 years", date_labels = "%Y", limits = extended_date_range) +
  scale_y_continuous(labels = scale_axis, limits = c(0, max(investigations_lit$investigations_aided, na.rm = TRUE) * 1.15),
                     expand = expansion(mult = c(0, 0.05))) +
  labs(title = "(E) Investigations Aided", x = "Year", y = "Count") +
  theme_crossref

# Combine plots - 3 rows: top row 2 plots, middle row 2 plots, bottom row full width
# patchwork operators: `+` places panels side by side, `/` stacks rows.
fig7_verification <- (p1 + p2) / (p3 + p4) / p5 +
  plot_layout(heights = c(1, 1, 1)) +
  plot_annotation(
    title = "Cross-Validation: Our Data vs. Published Literature",
    subtitle = "X markers show published values from FBI brochures and peer-reviewed papers",
    theme = theme(plot.title = element_text(face = "bold", size = 14))
  )

# Render the combined figure inline.
fig7_verification

Show cross-reference validation plots
# Persist the cross-validation grid in both vector (PDF via cairo, for font
# embedding) and raster (PNG) formats at identical dimensions.
fig_output_dir <- here("output", "figures", "manuscript_figures")
dir.create(fig_output_dir, recursive = TRUE, showWarnings = FALSE)

# Shared save helper: only the extension and (optionally) the device differ.
save_fig7 <- function(ext, ...) {
  ggsave(
    filename = file.path(fig_output_dir, paste0("fig7_verification_grid.", ext)),
    plot = fig7_verification,
    width = 14,
    height = 14,
    dpi = 300,
    units = "in",
    ...
  )
}

save_fig7("pdf", device = cairo_pdf)
save_fig7("png")

cat(sprintf("\nSaved fig7_verification_grid to: %s\n", fig_output_dir))

Saved fig7_verification_grid to: /Users/tlasisi/GitHub/PODFRIDGE-Databases/output/figures/manuscript_figures