Worked example: Chicago backtest (committed fixture)

This notebook uses a committed sample dataset at docs/tutorials/data/chicago_sample_2024_2025.parquet so local vignette rendering is fast and reproducible without live API calls.

If you want to refresh data from the Socrata API yourself, use:

uv run motac fetch chicago --start-year 2024 --end-year 2025 --out-dir docs/tutorials/_local_data

The analysis below intentionally runs against the checked-in fixture.

[1]:
from __future__ import annotations

from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as sp


def find_repo_root(start: Path) -> Path:
    for parent in [start] + list(start.parents):
        if (parent / "pyproject.toml").exists():
            return parent
    raise RuntimeError("Could not find repo root")


root = find_repo_root(Path.cwd())

from motac.data import load_events_csv  # noqa: E402
from motac.models.fit import fit_road_hawkes_mle  # noqa: E402
from motac.models.forecast import forecast_probabilistic_horizon  # noqa: E402
from motac.models.metrics import mean_negative_log_likelihood  # noqa: E402
from motac.spatial.grid_builder import LonLatBounds, build_regular_grid  # noqa: E402


def scan_event_table(
    path: Path,
    *,
    lat_col: str,
    lon_col: str,
    mark_col: str | None,
    date_col: str,
) -> tuple[tuple[float, float, float, float], Counter, list[str]]:
    if path.suffix.lower() in {".parquet", ".pq"}:
        frame = pd.read_parquet(path)
    else:
        frame = pd.read_csv(path)

    frame[lat_col] = pd.to_numeric(frame[lat_col], errors="coerce")
    frame[lon_col] = pd.to_numeric(frame[lon_col], errors="coerce")
    frame = frame.dropna(subset=[lat_col, lon_col])

    lats = frame[lat_col].astype(float).to_numpy()
    lons = frame[lon_col].astype(float).to_numpy()

    marks: Counter = Counter()
    if mark_col and mark_col in frame.columns:
        marks.update(str(m) for m in frame[mark_col].dropna())

    dates = (
        pd.to_datetime(frame[date_col], errors="coerce")
        .dropna()
        .dt.strftime("%Y-%m-%d")
        .tolist()
    )

    if lats.size == 0 or lons.size == 0:
        raise ValueError("No valid lat/lon values found in source file")

    bounds = (float(np.min(lons)), float(np.max(lons)), float(np.min(lats)), float(np.max(lats)))
    return bounds, marks, dates


def build_travel_time(
    lat: np.ndarray,
    lon: np.ndarray,
    k: int = 6,
    speed_kph: float = 30.0,
) -> sp.csr_matrix:
    lat = np.asarray(lat, dtype=float)
    lon = np.asarray(lon, dtype=float)
    n = int(lat.shape[0])
    lat_rad = np.radians(lat)
    lon_rad = np.radians(lon)
    mean_lat = float(np.mean(lat_rad))
    dx = (lon_rad[None, :] - lon_rad[:, None]) * np.cos(mean_lat)
    dy = lat_rad[None, :] - lat_rad[:, None]
    dist = 6_371_000.0 * np.sqrt(dx**2 + dy**2)
    np.fill_diagonal(dist, np.inf)

    k = min(int(k), max(n - 1, 1))
    idx = np.argpartition(dist, kth=k, axis=1)[:, :k]
    rows = np.repeat(np.arange(n), k)
    cols = idx.reshape(-1)
    data = dist[np.arange(n)[:, None], idx].reshape(-1)

    speed_mps = speed_kph * 1000.0 / 3600.0
    travel_time = data / speed_mps

    rows_sym = np.concatenate([rows, cols, np.arange(n)])
    cols_sym = np.concatenate([cols, rows, np.arange(n)])
    data_sym = np.concatenate([travel_time, travel_time, np.zeros(n)])
    return sp.csr_matrix((data_sym, (rows_sym, cols_sym)), shape=(n, n))


fixture_path = root / "docs" / "tutorials" / "data" / "chicago_sample_2024_2025.parquet"
if not fixture_path.exists():
    raise FileNotFoundError(
        "Committed Chicago tutorial fixture not found at "
        "docs/tutorials/data/chicago_sample_2024_2025.parquet"
    )

bounds_tuple, mark_counts, dates = scan_event_table(
    fixture_path,
    lat_col="latitude",
    lon_col="longitude",
    mark_col="primary_type",
    date_col="date",
)

mark_order = [m for m, _ in mark_counts.most_common(3)]
min_date = np.datetime64(min(dates), "D")
max_date = np.datetime64(max(dates), "D")
start_date = str(min_date)
end_date = str(max_date)

bounds = LonLatBounds(
    lon_min=bounds_tuple[0],
    lon_max=bounds_tuple[1],
    lat_min=bounds_tuple[2],
    lat_max=bounds_tuple[3],
)

grid = build_regular_grid(bounds, cell_size_m=1500.0)
dataset = load_events_csv(
    path=fixture_path,
    grid=grid,
    date_col="date",
    lat_col="latitude",
    lon_col="longitude",
    mark_col="primary_type",
    event_id_col="id",
    start_date=start_date,
    end_date=end_date,
    mark_order=mark_order,
)

counts = np.asarray(dataset.counts)
if counts.ndim == 3:
    counts = counts.sum(axis=1)

active_mask = counts.sum(axis=1) > 0
active_idx = np.where(active_mask)[0]
print("Active cells:", active_idx.shape[0])

counts = counts[active_idx]
lat = grid.lat[active_idx]
lon = grid.lon[active_idx]

print("Counts shape:", counts.shape)

travel_time_s = build_travel_time(lat, lon)

n_days = counts.shape[1]
horizon = min(14, max(1, n_days // 5))
n_train = n_days - horizon
if n_train < 2:
    raise ValueError("Not enough days for backtest")

y_train = counts[:, :n_train]
y_test = counts[:, n_train : n_train + horizon]

kernel = np.array([0.6, 0.3, 0.1])
fit = fit_road_hawkes_mle(
    travel_time_s=travel_time_s,
    kernel=kernel,
    y=y_train,
    maxiter=60,
    stability_mode="penalty",
    mu_ridge=1e-4,
)

prob = forecast_probabilistic_horizon(
    travel_time_s=travel_time_s,
    mu=np.asarray(fit["mu"], dtype=float),
    alpha=float(fit["alpha"]),
    beta=float(fit["beta"]),
    kernel=kernel,
    y_history=y_train,
    horizon=horizon,
    n_paths=200,
    seed=0,
)

lam = np.asarray(prob["count_mean"])
q = np.asarray(prob["count_quantiles"])
rmse = float(np.sqrt(np.mean((y_test - lam) ** 2)))
mae = float(np.mean(np.abs(y_test - lam)))
nll = mean_negative_log_likelihood(y=y_test, mean=lam)
coverage = float(np.mean((y_test >= q[0]) & (y_test <= q[-1])))

print("Backtest metrics:")
print("  nll:", round(nll, 4))
print("  rmse:", round(rmse, 4))
print("  mae:", round(mae, 4))
print("  coverage:", round(coverage, 4))
Active cells: 318
Counts shape: (318, 731)
Backtest metrics:
  nll: 1.0758
  rmse: 1.2192
  mae: 0.7499
  coverage: 0.9717
[2]:
daily_total = counts.sum(axis=0)

plt.figure(figsize=(8, 3))
plt.plot(daily_total, color="#3b4cc0")
plt.title("Daily event totals")
plt.xlabel("Day index")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

forecast_total = lam.sum(axis=0)
actual_total = y_test.sum(axis=0)

plt.figure(figsize=(6, 3))
plt.plot(actual_total, label="actual", color="#dd8452")
plt.plot(forecast_total, label="forecast", color="#55a868")
plt.title("Held-out totals")
plt.xlabel("Horizon step")
plt.ylabel("Count")
plt.legend()
plt.tight_layout()
plt.show()
../_images/tutorials_04_chicago_worked_example_2_0.png
../_images/tutorials_04_chicago_worked_example_2_1.png