Worked example: Gaza ACLED backtest (local cache)
This notebook loads ACLED events from a local cache under docs/tutorials/_local_data/, focuses on Gaza, builds a regular grid, aggregates to daily counts, fits a small road-kernel Hawkes model, and reports basic forecast accuracy metrics.
Run the following command once before rendering the notebook locally (ACLED API credentials are handled by acled-viz):
uv run motac fetch acled --start 2024-01-01 --end 2025-12-31 --out-dir docs/tutorials/_local_data
[1]:
from __future__ import annotations
from collections import Counter
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as sp
def find_repo_root(start: Path) -> Path:
    """Walk upward from ``start`` and return the first directory containing
    a ``pyproject.toml`` file (taken to be the repository root).

    Raises RuntimeError when no ancestor (up to the filesystem root) qualifies.
    """
    for candidate in (start, *start.parents):
        if (candidate / "pyproject.toml").exists():
            return candidate
    raise RuntimeError("Could not find repo root")
# Locate the repository root so paths below resolve regardless of the
# directory the notebook kernel was started from.
root = find_repo_root(Path.cwd())

# Project imports come after the root lookup, hence the E402 suppressions.
# NOTE(review): no sys.path manipulation happens here — presumably `motac`
# is installed in the active environment; confirm before moving these up.
from motac.data import load_events_csv  # noqa: E402
from motac.models.fit import fit_road_hawkes_mle  # noqa: E402
from motac.models.forecast import forecast_probabilistic_horizon  # noqa: E402
from motac.models.metrics import mean_negative_log_likelihood  # noqa: E402
from motac.spatial.grid_builder import LonLatBounds, build_regular_grid  # noqa: E402
def scan_event_table(
    path: Path,
    *,
    lat_col: str,
    lon_col: str,
    mark_col: str | None,
    date_col: str,
    bbox: tuple[float, float, float, float],
) -> tuple[tuple[float, float, float, float], Counter, list[str]]:
    """Summarize the spatial and temporal extent of an event table.

    Reads ``path`` (parquet when the suffix is .parquet/.pq, otherwise CSV),
    coerces coordinates to numeric, drops rows without valid coordinates, and
    keeps rows inside ``bbox`` (lon_min, lon_max, lat_min, lat_max, inclusive).

    Returns a tuple of: the tight (lon_min, lon_max, lat_min, lat_max) bounds
    of the surviving rows, a Counter of mark values (empty when ``mark_col``
    is None or absent), and the parsable event dates as YYYY-MM-DD strings.

    Raises ValueError when no row survives the coordinate filtering.
    """
    reader = pd.read_parquet if path.suffix.lower() in {".parquet", ".pq"} else pd.read_csv
    table = reader(path)
    table[lat_col] = pd.to_numeric(table[lat_col], errors="coerce")
    table[lon_col] = pd.to_numeric(table[lon_col], errors="coerce")
    table = table.dropna(subset=[lat_col, lon_col])
    lon_min, lon_max, lat_min, lat_max = bbox
    inside = table[lon_col].between(lon_min, lon_max) & table[lat_col].between(
        lat_min, lat_max
    )
    table = table[inside]
    if table.empty:
        raise ValueError("No valid lat/lon values found in source file")
    marks: Counter = Counter()
    if mark_col and mark_col in table.columns:
        marks.update(str(value) for value in table[mark_col].dropna())
    dates = (
        pd.to_datetime(table[date_col], errors="coerce")
        .dropna()
        .dt.strftime("%Y-%m-%d")
        .tolist()
    )
    lat_values = table[lat_col].astype(float).to_numpy()
    lon_values = table[lon_col].astype(float).to_numpy()
    extent = (
        float(lon_values.min()),
        float(lon_values.max()),
        float(lat_values.min()),
        float(lat_values.max()),
    )
    return extent, marks, dates
def build_travel_time(
    lat: np.ndarray,
    lon: np.ndarray,
    k: int = 6,
    speed_kph: float = 30.0,
) -> sp.csr_matrix:
    """Build a symmetric k-nearest-neighbour travel-time matrix in seconds.

    Distances use an equirectangular approximation around the mean latitude,
    adequate for the small regional extents this notebook targets.

    Parameters
    ----------
    lat, lon : point coordinates in degrees.
    k : neighbours per point (capped at n - 1).
    speed_kph : assumed travel speed used to convert metres to seconds.

    Returns
    -------
    (n, n) CSR matrix where entry (i, j) is the travel time when j is among
    i's k nearest neighbours (or vice versa); the diagonal is zero.
    """
    lat = np.asarray(lat, dtype=float)
    lon = np.asarray(lon, dtype=float)
    n = int(lat.shape[0])
    if n <= 1:
        return sp.csr_matrix((n, n), dtype=float)
    lat_rad = np.radians(lat)
    lon_rad = np.radians(lon)
    mean_lat = float(np.mean(lat_rad))
    dx = (lon_rad[None, :] - lon_rad[:, None]) * np.cos(mean_lat)
    dy = lat_rad[None, :] - lat_rad[:, None]
    dist = 6_371_000.0 * np.sqrt(dx**2 + dy**2)
    np.fill_diagonal(dist, np.inf)  # exclude self from the neighbour search
    k = min(int(k), max(n - 1, 1))
    idx = np.argpartition(dist, kth=k, axis=1)[:, :k]
    rows = np.repeat(np.arange(n), k)
    cols = idx.reshape(-1)
    speed_mps = speed_kph * 1000.0 / 3600.0
    travel_time = dist[np.arange(n)[:, None], idx].reshape(-1) / speed_mps
    # BUG FIX: the previous version concatenated (i, j) and (j, i) coordinate
    # lists and built a CSR matrix directly; duplicate coordinates are SUMMED
    # during CSR construction, so any pair of mutual nearest neighbours ended
    # up with a doubled travel time. Build the directed kNN matrix first (no
    # duplicates: each row has k distinct columns), then symmetrize with an
    # elementwise maximum (equal values on both sides, so max == the true
    # travel time). The diagonal is implicitly zero.
    directed = sp.coo_matrix((travel_time, (rows, cols)), shape=(n, n)).tocsr()
    return directed.maximum(directed.T).tocsr()
# Local ACLED cache produced by the `motac fetch acled` command shown above.
fixture_path = root.joinpath(
    "docs",
    "tutorials",
    "_local_data",
    "acled_gaza_2024-01-01_2025-12-31",
    "events.parquet",
)
# Fail fast with the exact command to run when the cache is missing.
if not fixture_path.exists():
    raise FileNotFoundError(
        "Local ACLED tutorial cache not found. Run: "
        "uv run motac fetch acled --start 2024-01-01 --end 2025-12-31 "
        "--out-dir docs/tutorials/_local_data"
    )
# Start from a whole-globe bbox, then shrink to the observed event extent.
bbox = (-180.0, 180.0, -90.0, 90.0)
bounds_tuple, mark_counts, dates = scan_event_table(
    fixture_path,
    lat_col="latitude",
    lon_col="longitude",
    mark_col="event_type",
    date_col="event_date",
    bbox=bbox,
)
bbox = bounds_tuple
lon_min, lon_max, lat_min, lat_max = bounds_tuple
# Keep only the three most frequent event types as model marks.
mark_order = [mark for mark, _count in mark_counts.most_common(3)]
min_date = np.datetime64(min(dates), "D")
max_date = np.datetime64(max(dates), "D")
bounds = LonLatBounds(
    lon_min=lon_min,
    lon_max=lon_max,
    lat_min=lat_min,
    lat_max=lat_max,
)
# Regular grid over the tightened bounds with 200 m cells (tutorial default).
grid = build_regular_grid(bounds, cell_size_m=200.0)
# Map events onto the grid within the observed date window.
# NOTE(review): despite its name, load_events_csv is handed a parquet path
# here — presumably it dispatches on the file suffix; confirm in motac.data.
dataset = load_events_csv(
    path=fixture_path,
    grid=grid,
    date_col="event_date",
    lat_col="latitude",
    lon_col="longitude",
    mark_col="event_type",
    event_id_col="event_id",
    value_col="fatalities",
    start_date=str(min_date),
    end_date=str(max_date),
    mark_order=mark_order,
    bbox=bbox,
)
counts = np.asarray(dataset.counts)
# Collapse a possible 3-D layout down to (cell, day) by summing axis 1
# (presumably the mark axis — TODO confirm against dataset.counts docs).
if counts.ndim == 3:
    counts = counts.sum(axis=1)
# Restrict to cells that recorded at least one event over the whole period.
active_idx = np.flatnonzero(counts.sum(axis=1) > 0)
print("Active cells:", active_idx.shape[0])
counts = counts[active_idx]
lat = grid.lat[active_idx]
lon = grid.lon[active_idx]
print("Counts shape:", counts.shape)
travel_time_s = build_travel_time(lat, lon)
n_days = counts.shape[1]
if n_days < 2:
    raise ValueError("Need at least two days of data")
# Hold out roughly the last sixth of the series, capped at 21 days
# (single-step horizon for very short series).
horizon = 1 if n_days < 7 else min(21, max(1, n_days // 6))
n_train = max(1, n_days - horizon)
y_train = counts[:, :n_train]
y_test = counts[:, n_train : n_train + horizon]
# Three-day lag kernel; weights sum to 1 and decay with lag.
kernel = np.array([0.6, 0.3, 0.1])
fit = fit_road_hawkes_mle(
    travel_time_s=travel_time_s,
    kernel=kernel,
    y=y_train,
    maxiter=80,
    stability_mode="penalty",
    mu_ridge=1e-4,
)
# Simulated forecast paths over the held-out horizon (fixed seed for
# reproducibility across notebook re-runs).
prob = forecast_probabilistic_horizon(
    travel_time_s=travel_time_s,
    mu=np.asarray(fit["mu"], dtype=float),
    alpha=float(fit["alpha"]),
    beta=float(fit["beta"]),
    kernel=kernel,
    y_history=y_train,
    horizon=horizon,
    n_paths=300,
    seed=0,
)
lam = np.asarray(prob["count_mean"])
quantiles = np.asarray(prob["count_quantiles"])
forecast_error = y_test - lam
rmse = float(np.sqrt(np.mean(forecast_error**2)))
mae = float(np.mean(np.abs(forecast_error)))
nll = mean_negative_log_likelihood(y=y_test, mean=lam)
# Share of held-out counts inside the outermost quantile band.
# NOTE(review): assumes count_quantiles is ordered low-to-high on axis 0.
coverage = float(np.mean((y_test >= quantiles[0]) & (y_test <= quantiles[-1])))
print("Backtest metrics:")
print(" nll:", round(nll, 4))
print(" rmse:", round(rmse, 4))
print(" mae:", round(mae, 4))
print(" coverage:", round(coverage, 4))
Active cells: 1736
Counts shape: (1736, 731)
Backtest metrics:
nll: 0.0931
rmse: 0.0899
mae: 0.007
coverage: 0.9983
[2]:
# Daily totals across all active cells — a quick sanity view of the series.
# Use the explicit object-oriented Axes API instead of the pyplot state
# machine so each figure is self-contained and reusable.
daily_total = counts.sum(axis=0)
fig, ax = plt.subplots(figsize=(8, 3))
ax.plot(daily_total, color="#4c72b0")
ax.set_title("Daily event totals (Gaza)")
ax.set_xlabel("Day index")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

# Compare forecast vs. actual totals over the held-out horizon.
forecast_total = lam.sum(axis=0)
actual_total = y_test.sum(axis=0)
fig, ax = plt.subplots(figsize=(6, 3))
ax.plot(actual_total, label="actual", color="#c44e52")
ax.plot(forecast_total, label="forecast", color="#8172b2")
ax.set_title("Held-out totals")
ax.set_xlabel("Horizon step")
ax.set_ylabel("Count")
ax.legend()
fig.tight_layout()
plt.show()