Worked example: Gaza ACLED backtest (local cache)

This notebook loads ACLED events from a local cache under docs/tutorials/_local_data/, focuses on Gaza, builds a regular grid, aggregates to daily counts, fits a small road-kernel Hawkes model, and reports basic forecast accuracy metrics.

Run this once before rendering the notebook locally (credentials handled by acled-viz):

uv run motac fetch acled --start 2024-01-01 --end 2025-12-31 --out-dir docs/tutorials/_local_data
[1]:
from __future__ import annotations

from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as sp


def find_repo_root(start: Path) -> Path:
    """Walk upward from ``start`` until a directory containing pyproject.toml is found.

    Checks ``start`` itself first, then each ancestor in order; raises
    RuntimeError if no pyproject.toml exists anywhere on the path to the
    filesystem root.
    """
    for candidate in (start, *start.parents):
        if (candidate / "pyproject.toml").exists():
            return candidate
    raise RuntimeError("Could not find repo root")


# Resolve the repository root once so all data paths below are repo-relative,
# regardless of which directory the notebook kernel was started in.
root = find_repo_root(Path.cwd())

from motac.data import load_events_csv  # noqa: E402
from motac.models.fit import fit_road_hawkes_mle  # noqa: E402
from motac.models.forecast import forecast_probabilistic_horizon  # noqa: E402
from motac.models.metrics import mean_negative_log_likelihood  # noqa: E402
from motac.spatial.grid_builder import LonLatBounds, build_regular_grid  # noqa: E402


def scan_event_table(
    path: Path,
    *,
    lat_col: str,
    lon_col: str,
    mark_col: str | None,
    date_col: str,
    bbox: tuple[float, float, float, float],
) -> tuple[tuple[float, float, float, float], Counter, list[str]]:
    if path.suffix.lower() in {".parquet", ".pq"}:
        frame = pd.read_parquet(path)
    else:
        frame = pd.read_csv(path)

    frame[lat_col] = pd.to_numeric(frame[lat_col], errors="coerce")
    frame[lon_col] = pd.to_numeric(frame[lon_col], errors="coerce")
    frame = frame.dropna(subset=[lat_col, lon_col])

    lon_min, lon_max, lat_min, lat_max = bbox
    frame = frame[
        (frame[lon_col] >= lon_min)
        & (frame[lon_col] <= lon_max)
        & (frame[lat_col] >= lat_min)
        & (frame[lat_col] <= lat_max)
    ]

    if frame.empty:
        raise ValueError("No valid lat/lon values found in source file")

    marks: Counter = Counter()
    if mark_col and mark_col in frame.columns:
        marks.update(str(m) for m in frame[mark_col].dropna())

    dates = (
        pd.to_datetime(frame[date_col], errors="coerce")
        .dropna()
        .dt.strftime("%Y-%m-%d")
        .tolist()
    )

    lats = frame[lat_col].astype(float).to_numpy()
    lons = frame[lon_col].astype(float).to_numpy()
    bounds = (float(np.min(lons)), float(np.max(lons)), float(np.min(lats)), float(np.max(lats)))
    return bounds, marks, dates


def build_travel_time(
    lat: np.ndarray,
    lon: np.ndarray,
    k: int = 6,
    speed_kph: float = 30.0,
) -> sp.csr_matrix:
    """Build a symmetric k-nearest-neighbour travel-time matrix in seconds.

    Distances use an equirectangular approximation around the mean latitude,
    adequate for a small region. Each point is linked to its ``k`` nearest
    neighbours and the graph is symmetrised so every stored edge carries the
    same travel time in both directions.

    Parameters
    ----------
    lat, lon : point coordinates in degrees.
    k : neighbours per point (clamped to n - 1).
    speed_kph : assumed road speed for converting metres to seconds.

    Returns
    -------
    (n, n) CSR matrix of travel times with an explicit zero diagonal.
    """
    lat = np.asarray(lat, dtype=float)
    lon = np.asarray(lon, dtype=float)
    n = int(lat.shape[0])
    if n <= 1:
        return sp.csr_matrix((n, n), dtype=float)

    lat_rad = np.radians(lat)
    lon_rad = np.radians(lon)
    mean_lat = float(np.mean(lat_rad))
    dx = (lon_rad[None, :] - lon_rad[:, None]) * np.cos(mean_lat)
    dy = lat_rad[None, :] - lat_rad[:, None]
    dist = 6_371_000.0 * np.sqrt(dx**2 + dy**2)
    np.fill_diagonal(dist, np.inf)  # exclude self-matches from kNN selection

    k = min(int(k), max(n - 1, 1))
    idx = np.argpartition(dist, kth=k, axis=1)[:, :k]
    rows = np.repeat(np.arange(n), k)
    cols = idx.reshape(-1)

    speed_mps = speed_kph * 1000.0 / 3600.0
    travel_time = dist[np.arange(n)[:, None], idx].reshape(-1) / speed_mps

    # BUG FIX: building the COO matrix from [edges, reversed edges] SUMS
    # duplicate entries, so whenever i and j were each other's neighbours the
    # edge (i, j) appeared twice and its travel time was doubled. Keep one
    # representative per undirected edge before symmetrising instead.
    undirected = np.stack([np.minimum(rows, cols), np.maximum(rows, cols)], axis=1)
    unique_edges, first_idx = np.unique(undirected, axis=0, return_index=True)
    i, j = unique_edges[:, 0], unique_edges[:, 1]
    # dist is exactly symmetric, so either direction's time is valid.
    edge_time = travel_time[first_idx]

    diag = np.arange(n)
    rows_sym = np.concatenate([i, j, diag])
    cols_sym = np.concatenate([j, i, diag])
    # Explicit zeros keep the diagonal stored, matching the previous layout.
    data_sym = np.concatenate([edge_time, edge_time, np.zeros(n)])
    return sp.csr_matrix((data_sym, (rows_sym, cols_sym)), shape=(n, n))


# Path to the cached ACLED export produced by `motac fetch acled`.
fixture_path = root.joinpath(
    "docs",
    "tutorials",
    "_local_data",
    "acled_gaza_2024-01-01_2025-12-31",
    "events.parquet",
)
if not fixture_path.exists():
    # Fail fast with the exact fetch command needed to build the cache.
    raise FileNotFoundError(
        "Local ACLED tutorial cache not found. Run: "
        "uv run motac fetch acled --start 2024-01-01 --end 2025-12-31 "
        "--out-dir docs/tutorials/_local_data"
    )

# Initial broad bbox, then tighten to observed extent.
# bbox order throughout: (lon_min, lon_max, lat_min, lat_max).
bbox = (-180.0, 180.0, -90.0, 90.0)

# First pass over the cache with a global bbox, just to discover the data's
# spatial extent, the common event types, and the observed date span.
bounds_tuple, mark_counts, dates = scan_event_table(
    fixture_path,
    lat_col="latitude",
    lon_col="longitude",
    mark_col="event_type",
    date_col="event_date",
    bbox=bbox,
)

# Tighten the bbox to the observed extent; keep the 3 most frequent event
# types as the model's mark categories.
bbox = bounds_tuple
mark_order = [m for m, _ in mark_counts.most_common(3)]
# Dates are "YYYY-MM-DD" strings, so lexicographic min/max is chronological.
min_date = np.datetime64(min(dates), "D")
max_date = np.datetime64(max(dates), "D")

bounds = LonLatBounds(
    lon_min=bounds_tuple[0],
    lon_max=bounds_tuple[1],
    lat_min=bounds_tuple[2],
    lat_max=bounds_tuple[3],
)

# 200 m regular grid over the observed extent; grid cells are the spatial
# units for all counts and forecasts below.
grid = build_regular_grid(bounds, cell_size_m=200.0)
# Second pass: load events onto the grid as per-cell daily counts.
# NOTE(review): despite its name, load_events_csv is handed the parquet cache
# path here — presumably it dispatches on the file suffix; confirm upstream.
dataset = load_events_csv(
    path=fixture_path,
    grid=grid,
    date_col="event_date",
    lat_col="latitude",
    lon_col="longitude",
    mark_col="event_type",
    event_id_col="event_id",
    value_col="fatalities",
    start_date=str(min_date),
    end_date=str(max_date),
    mark_order=mark_order,
    bbox=bbox,
)

counts = np.asarray(dataset.counts)
if counts.ndim == 3:
    # Collapse the mark dimension so counts is (cells, days).
    counts = counts.sum(axis=1)

# Keep only cells that recorded at least one event over the whole period.
active_idx = np.flatnonzero(counts.sum(axis=1) > 0)
print("Active cells:", active_idx.shape[0])

counts = counts[active_idx]
lat = grid.lat[active_idx]
lon = grid.lon[active_idx]

print("Counts shape:", counts.shape)

travel_time_s = build_travel_time(lat, lon)

n_days = counts.shape[1]
if n_days < 2:
    raise ValueError("Need at least two days of data")

# Hold out roughly a sixth of the days (capped at 21) as the test horizon;
# very short series fall back to a single-day horizon.
horizon = 1 if n_days < 7 else min(21, max(1, n_days // 6))
n_train = max(1, n_days - horizon)

y_train = counts[:, :n_train]
y_test = counts[:, n_train : n_train + horizon]

# Fixed 3-lag temporal kernel weights, most recent lag weighted highest.
kernel = np.array([0.6, 0.3, 0.1])
# Fit the road-kernel Hawkes model on the training window.
# NOTE(review): stability_mode="penalty" and mu_ridge presumably regularise
# the MLE; see fit_road_hawkes_mle for the exact semantics.
fit = fit_road_hawkes_mle(
    travel_time_s=travel_time_s,
    kernel=kernel,
    y=y_train,
    maxiter=80,
    stability_mode="penalty",
    mu_ridge=1e-4,
)

# Simulate 300 forecast paths over the held-out horizon; fixed seed keeps
# the notebook's reported metrics reproducible.
prob = forecast_probabilistic_horizon(
    travel_time_s=travel_time_s,
    mu=np.asarray(fit["mu"], dtype=float),
    alpha=float(fit["alpha"]),
    beta=float(fit["beta"]),
    kernel=kernel,
    y_history=y_train,
    horizon=horizon,
    n_paths=300,
    seed=0,
)

# lam: mean simulated counts (point forecast); q: stacked count quantiles.
lam = np.asarray(prob["count_mean"])
q = np.asarray(prob["count_quantiles"])
rmse = float(np.sqrt(np.mean((y_test - lam) ** 2)))
mae = float(np.mean(np.abs(y_test - lam)))
nll = mean_negative_log_likelihood(y=y_test, mean=lam)
# Empirical coverage of the outermost quantile band (q[0] .. q[-1]).
coverage = float(np.mean((y_test >= q[0]) & (y_test <= q[-1])))

print("Backtest metrics:")
print("  nll:", round(nll, 4))
print("  rmse:", round(rmse, 4))
print("  mae:", round(mae, 4))
print("  coverage:", round(coverage, 4))
Active cells: 1736
Counts shape: (1736, 731)
Backtest metrics:
  nll: 0.0931
  rmse: 0.0899
  mae: 0.007
  coverage: 0.9983
[2]:
# Time series of total events per day across all active cells.
daily_total = counts.sum(axis=0)

fig, ax = plt.subplots(figsize=(8, 3))
ax.plot(daily_total, color="#4c72b0")
ax.set_title("Daily event totals (Gaza)")
ax.set_xlabel("Day index")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

# Compare forecast vs actual totals over the held-out horizon.
forecast_total = lam.sum(axis=0)
actual_total = y_test.sum(axis=0)

fig, ax = plt.subplots(figsize=(6, 3))
ax.plot(actual_total, label="actual", color="#c44e52")
ax.plot(forecast_total, label="forecast", color="#8172b2")
ax.set_title("Held-out totals")
ax.set_xlabel("Horizon step")
ax.set_ylabel("Count")
ax.legend()
fig.tight_layout()
plt.show()
../_images/tutorials_05_gaza_acled_worked_example_2_0.png
../_images/tutorials_05_gaza_acled_worked_example_2_1.png