Add duplicate publisher determinism proof
Some checks failed
deploy-cloudflare / checks (push) Failing after 3s
ci-gates / checks (push) Failing after 5s
deploy-cloudflare / deploy (push) Has been skipped

This commit is contained in:
every.channel 2026-06-10 03:28:55 -07:00
parent 5d0f3077d3
commit 91dad67fc2
No known key found for this signature in database
18 changed files with 21569 additions and 595 deletions

View file

@ -109,6 +109,35 @@ jobs:
fi
cargo test -p ec-core -p ec-crypto -p ec-moq -p ec-iroh -p ec-linux-iptv
- name: Duplicate publisher proof gates
shell: bash
run: |
set -euo pipefail
cd .repo
if [[ -f "$HOME/.cargo/env" ]]; then
. "$HOME/.cargo/env"
fi
cargo test -p ec-node publisher_proof
cargo test -p ec-node archive_convergence
- name: Distributed simulation gates
shell: bash
run: |
set -euo pipefail
cd .repo
if [[ -f "$HOME/.cargo/env" ]]; then
. "$HOME/.cargo/env"
fi
cargo test -p ec-node sim_system_
cargo run -p ec-node -- sim-system \
--fault-profile foundationdb \
--seed 1 \
--iterations 1024 \
--max-system-complete-ms 6000 \
--failure-artifact /tmp/ec-sim-system-foundationdb-failure.json \
--pretty \
> /tmp/ec-sim-system-foundationdb.json
- name: Build web (apps/web)
shell: bash
run: |

291
Cargo.lock generated
View file

@ -1038,15 +1038,6 @@ dependencies = [
"alloc-stdlib",
]
[[package]]
name = "buf-list"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6b175f9cf8fffedd4c4b18bcfef092356e952b81f596e148f18e98280994593"
dependencies = [
"bytes",
]
[[package]]
name = "bumpalo"
version = "3.19.1"
@ -1375,6 +1366,15 @@ dependencies = [
"crossbeam-utils",
]
[[package]]
name = "conducer"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8d2cb64e61144d6960a830d3e6f2ba3a61d5c0ca689e87e11dc9effb96dcfff5"
dependencies = [
"smallvec",
]
[[package]]
name = "const-hex"
version = "1.18.1"
@ -2251,9 +2251,12 @@ dependencies = [
"hex",
"iroh",
"just-webrtc",
"moq-lite 0.14.0",
"moq-lite 0.16.0",
"moq-mux",
"moq-native",
"opentelemetry",
"opentelemetry-otlp",
"opentelemetry_sdk",
"quinn",
"reqwest",
"rustls",
@ -2261,9 +2264,10 @@ dependencies = [
"serde",
"serde_json",
"tokio",
"tokio-tungstenite",
"tokio-tungstenite 0.24.0",
"tokio-util",
"tracing",
"tracing-opentelemetry",
"tracing-subscriber",
"url",
"urlencoding",
@ -3264,24 +3268,20 @@ checksum = "253b313319f7109de64e480ffb606f89475cd758bae82e096e00c5d95341d30e"
[[package]]
name = "hang"
version = "0.14.0"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f548f7cdc8ec3b9eae085f7b61ff9603d6dc9f09192c5f4b0db4c02577786070"
checksum = "59435f843c8a41ac499ce68828d16c575438e34ffa85b1ea46ba2529bb2a5b16"
dependencies = [
"buf-list",
"bytes",
"derive_more 2.1.1",
"futures",
"hex",
"lazy_static",
"moq-lite 0.14.0",
"moq-lite 0.16.0",
"regex",
"serde",
"serde_json",
"serde_with",
"thiserror 2.0.18",
"tokio",
"tracing",
"url",
]
@ -4494,9 +4494,9 @@ checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
[[package]]
name = "m3u8-rs"
version = "5.0.5"
version = "6.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c1d7ba86f7ea62f17f4310c55e93244619ddc7dadfc7e565de1967e4e41e6e7"
checksum = "f03cd3335fb5f2447755d45cda9c70f76013626a9db44374973791b0926a86c3"
dependencies = [
"chrono",
"nom",
@ -4707,14 +4707,13 @@ dependencies = [
[[package]]
name = "moq-lite"
version = "0.14.0"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8a4c4e66081bc21067488da13f4131540b38b1cb79fb5176ef4ddacd104786b"
checksum = "15b02845fa5cef29b516e0ed60dc95f5904502bf001a8a2790d543fae6571a94"
dependencies = [
"async-channel",
"bytes",
"conducer",
"futures",
"hex",
"num_enum",
"rand 0.9.2",
"serde",
@ -4726,24 +4725,38 @@ dependencies = [
]
[[package]]
name = "moq-mux"
version = "0.2.1"
name = "moq-msf"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73e2570aa39feef3aa00fa0990862dcdfb44937d3eb9c448c3a4eb1fb8ff43d3"
checksum = "2d61b0d5ce8285c75ed59343934aae278c4c49b1dedf41f1356939b40fab4d29"
dependencies = [
"serde",
"serde_json",
"serde_with",
]
[[package]]
name = "moq-mux"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fd5f397f0d147ca8920434a74f092e0846ce23bb1cb5411253123913a3e7576"
dependencies = [
"anyhow",
"buf-list",
"base64 0.22.1",
"bytes",
"conducer",
"derive_more 2.1.1",
"h264-parser",
"hang",
"m3u8-rs",
"moq-lite 0.14.0",
"moq-lite 0.16.0",
"moq-msf",
"mp4-atom",
"num_enum",
"reqwest",
"scuffle-av1",
"scuffle-h265",
"thiserror 2.0.18",
"tokio",
"tracing",
"url",
@ -4751,9 +4764,9 @@ dependencies = [
[[package]]
name = "moq-native"
version = "0.13.1"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9848c21bf5db3f8ff5e5a7d89bf2c567f0eb526390c26d5f66f3fec99a6751a5"
checksum = "6942bac34d380bbab511e10069bc0f9615f20109807dab01b52d45e0812dc571"
dependencies = [
"anyhow",
"clap",
@ -4761,8 +4774,9 @@ dependencies = [
"hex",
"humantime",
"humantime-serde",
"moq-lite 0.14.0",
"moq-lite 0.16.0",
"parking_lot",
"qmux",
"quinn",
"rand 0.9.2",
"rcgen 0.14.7",
@ -4779,7 +4793,6 @@ dependencies = [
"tracing-subscriber",
"url",
"web-transport-quinn",
"web-transport-ws",
]
[[package]]
@ -5519,6 +5532,78 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"
[[package]]
name = "opentelemetry"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b84bcd6ae87133e903af7ef497404dda70c60d0ea14895fc8a5e6722754fc2a0"
dependencies = [
"futures-core",
"futures-sink",
"js-sys",
"pin-project-lite",
"thiserror 2.0.18",
"tracing",
]
[[package]]
name = "opentelemetry-http"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d7a6d09a73194e6b66df7c8f1b680f156d916a1a942abf2de06823dd02b7855d"
dependencies = [
"async-trait",
"bytes",
"http",
"opentelemetry",
"reqwest",
]
[[package]]
name = "opentelemetry-otlp"
version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f69cd6acbb9af919df949cd1ec9e5e7fdc2ef15d234b6b795aaa525cc02f71f"
dependencies = [
"flate2",
"http",
"opentelemetry",
"opentelemetry-http",
"opentelemetry-proto",
"opentelemetry_sdk",
"prost",
"reqwest",
"thiserror 2.0.18",
]
[[package]]
name = "opentelemetry-proto"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7175df06de5eaee9909d4805a3d07e28bb752c34cab57fa9cff549da596b30f"
dependencies = [
"opentelemetry",
"opentelemetry_sdk",
"prost",
"tonic",
"tonic-prost",
]
[[package]]
name = "opentelemetry_sdk"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e14ae4f5991976fd48df6d843de219ca6d31b01daaab2dad5af2badeded372bd"
dependencies = [
"futures-channel",
"futures-executor",
"futures-util",
"opentelemetry",
"percent-encoding",
"rand 0.9.2",
"thiserror 2.0.18",
]
[[package]]
name = "option-ext"
version = "0.2.0"
@ -6206,6 +6291,47 @@ dependencies = [
"unarray",
]
[[package]]
name = "prost"
version = "0.14.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568"
dependencies = [
"bytes",
"prost-derive",
]
[[package]]
name = "prost-derive"
version = "0.14.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b"
dependencies = [
"anyhow",
"itertools 0.13.0",
"proc-macro2",
"quote",
"syn 2.0.114",
]
[[package]]
name = "qmux"
version = "0.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a87859012c43a1e38dda29f2464e0ee39b0e96d0f95f870a73610bc6f2c3c2"
dependencies = [
"bytes",
"futures",
"rustls",
"thiserror 2.0.18",
"tokio",
"tokio-rustls",
"tokio-tungstenite 0.28.0",
"tracing",
"web-transport-proto 0.6.0",
"web-transport-trait",
]
[[package]]
name = "quick-error"
version = "1.2.3"
@ -8315,6 +8441,22 @@ dependencies = [
"webpki-roots 0.26.11",
]
[[package]]
name = "tokio-tungstenite"
version = "0.28.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d25a406cddcc431a75d3d9afc6a7c0f7428d4891dd973e4d54c56b46127bf857"
dependencies = [
"futures-util",
"log",
"rustls",
"rustls-native-certs",
"rustls-pki-types",
"tokio",
"tokio-rustls",
"tungstenite 0.28.0",
]
[[package]]
name = "tokio-util"
version = "0.7.18"
@ -8447,6 +8589,38 @@ version = "1.0.6+spec-1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab16f14aed21ee8bfd8ec22513f7287cd4a91aa92e44edfe2c17ddd004e92607"
[[package]]
name = "tonic"
version = "0.14.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac2a5518c70fa84342385732db33fb3f44bc4cc748936eb5833d2df34d6445ef"
dependencies = [
"async-trait",
"base64 0.22.1",
"bytes",
"http",
"http-body",
"http-body-util",
"percent-encoding",
"pin-project",
"sync_wrapper",
"tokio-stream",
"tower-layer",
"tower-service",
"tracing",
]
[[package]]
name = "tonic-prost"
version = "0.14.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50849f68853be452acf590cde0b146665b8d507b3b8af17261df47e02c209ea0"
dependencies = [
"bytes",
"prost",
"tonic",
]
[[package]]
name = "tower"
version = "0.5.3"
@ -8567,6 +8741,22 @@ dependencies = [
"tracing-core",
]
[[package]]
name = "tracing-opentelemetry"
version = "0.32.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ac28f2d093c6c477eaa76b23525478f38de514fa9aeb1285738d4b97a9552fc"
dependencies = [
"js-sys",
"opentelemetry",
"smallvec",
"tracing",
"tracing-core",
"tracing-log",
"tracing-subscriber",
"web-time",
]
[[package]]
name = "tracing-subscriber"
version = "0.3.22"
@ -8656,6 +8846,8 @@ dependencies = [
"httparse",
"log",
"rand 0.9.2",
"rustls",
"rustls-pki-types",
"sha1",
"thiserror 2.0.18",
"utf-8",
@ -9136,9 +9328,9 @@ dependencies = [
[[package]]
name = "web-async"
version = "0.1.1"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6b2260b739b0e95cf9b78f22a64704af7ed9760ea12baa3745b4b97899dc89a"
checksum = "f5414b65d9a5094649bb99987bb74db71febfdfa3677b7954a0a05c99d0424e8"
dependencies = [
"tokio",
"tracing",
@ -9198,7 +9390,9 @@ dependencies = [
[[package]]
name = "web-transport-proto"
version = "0.5.2"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0225d295c8ac00a2e9a498aefeaf3f3c6186da12a251c938189b15b82ea22808"
dependencies = [
"bytes",
"http",
@ -9210,9 +9404,9 @@ dependencies = [
[[package]]
name = "web-transport-quinn"
version = "0.11.4"
version = "0.11.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96b195557749e84091d7b912a25e190e9606283b5121d041faf538b0b55f40d7"
checksum = "cac11b6caf163be7f980442a26fcba15e8074a5f22e85fbb71f0f77d11cecf60"
dependencies = [
"bytes",
"futures",
@ -9224,34 +9418,19 @@ dependencies = [
"tokio",
"tracing",
"url",
"web-transport-proto 0.5.2",
"web-transport-proto 0.6.0",
"web-transport-trait",
]
[[package]]
name = "web-transport-trait"
version = "0.3.3"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "802d6aa508f2c63c9050ceabc17265bbf90ed4d6f4e4357e987583883628e79c"
checksum = "cb67841c4a481ca3c1412ee4c9f463987401991e1ddc000903df2124f3dc85e9"
dependencies = [
"bytes",
]
[[package]]
name = "web-transport-ws"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7b1cd89c36a28eae759329839e85f7dbca733896f048a6daaf5f8fc80f3bcba"
dependencies = [
"bytes",
"futures",
"thiserror 2.0.18",
"tokio",
"tokio-tungstenite",
"web-transport-proto 0.5.2",
"web-transport-trait",
]
[[package]]
name = "webkit2gtk"
version = "2.0.1"

View file

@ -33,12 +33,9 @@ blake3 = "1"
clap = { version = "4", features = ["derive"] }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
opentelemetry = { version = "0.31", features = ["trace"] }
opentelemetry-otlp = { version = "0.31", default-features = false, features = ["http-proto", "reqwest-client", "trace", "gzip-http"] }
opentelemetry_sdk = { version = "0.31", features = ["trace"] }
tracing = "0.1"
tracing-opentelemetry = "0.32"
tracing-subscriber = "0.3"
[patch.crates-io]
# Cloudflare's relay uses standard WebTransport subprotocol negotiation. The upstream
# `web-transport-proto` crate (used by `web-transport-quinn`) currently uses legacy
# header names (`wt-available-protocols` / `wt-protocol`), which prevents negotiating
# `moqt-*` and causes the relay to close after MoQ SETUP.
web-transport-proto = { path = "third_party/web-transport-proto" }

View file

@ -12,6 +12,7 @@ use ec_core::{
};
use ec_ts::{SectionAssembler, TimeSyncEngine, TimeSyncUpdate, TsReader};
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use std::fs;
use std::io::{Read, Write};
use std::path::{Path, PathBuf};
@ -299,12 +300,55 @@ pub fn chunk_ts_stream<T: Read>(
})
}
pub fn chunk_ts_stream_with_preroll<T: Read>(
stream: T,
output_dir: &Path,
chunk_duration_ms: u64,
max_chunks: Option<usize>,
preroll_packets: usize,
) -> Result<TsChunkManifest> {
let mut chunks = Vec::new();
chunk_ts_stream_live_with_preroll(
stream,
output_dir,
chunk_duration_ms,
max_chunks,
preroll_packets,
|chunk| {
chunks.push(chunk);
Ok(())
},
)?;
Ok(TsChunkManifest {
output_dir: output_dir.to_path_buf(),
chunks,
})
}
/// Chunks `stream` into `output_dir`, forwarding each reported chunk to
/// `on_chunk`.
///
/// Thin wrapper over `chunk_ts_stream_live_with_preroll` with the preroll
/// packet count fixed at zero.
pub fn chunk_ts_stream_live<T: Read, F: FnMut(TsChunk) -> Result<()>>(
    stream: T,
    output_dir: &Path,
    chunk_duration_ms: u64,
    max_chunks: Option<usize>,
    mut on_chunk: F,
) -> Result<()> {
    const NO_PREROLL: usize = 0;
    // `&mut F` is itself `FnMut`, so the callback can be lent to the delegate.
    chunk_ts_stream_live_with_preroll(
        stream,
        output_dir,
        chunk_duration_ms,
        max_chunks,
        NO_PREROLL,
        &mut on_chunk,
    )
}
pub fn chunk_ts_stream_live_with_preroll<T: Read, F: FnMut(TsChunk) -> Result<()>>(
stream: T,
output_dir: &Path,
chunk_duration_ms: u64,
max_chunks: Option<usize>,
preroll_packets: usize,
mut on_chunk: F,
) -> Result<()> {
fs::create_dir_all(output_dir)
.with_context(|| format!("failed to create {}", output_dir.display()))?;
@ -317,6 +361,7 @@ pub fn chunk_ts_stream_live<T: Read, F: FnMut(TsChunk) -> Result<()>>(
let mut current_file: Option<std::fs::File> = None;
let mut current_timing: Option<ChunkTiming> = None;
let mut emitted = 0usize;
let mut preroll = VecDeque::<[u8; ec_ts::TS_PACKET_SIZE]>::with_capacity(preroll_packets);
let mut close_and_emit =
|index: u64, timing: ChunkTiming, file: std::fs::File| -> Result<bool> {
@ -332,6 +377,7 @@ pub fn chunk_ts_stream_live<T: Read, F: FnMut(TsChunk) -> Result<()>>(
};
while let Some(packet) = reader.read_packet()? {
let packet_bytes = *packet.as_bytes();
let updates = engine.ingest_packet(&packet, &mut assembler);
for update in updates {
if update.discontinuity {
@ -344,6 +390,7 @@ pub fn chunk_ts_stream_live<T: Read, F: FnMut(TsChunk) -> Result<()>>(
return Ok(());
}
}
preroll.clear();
}
if let Some(index) = update.chunk_index {
@ -359,8 +406,11 @@ pub fn chunk_ts_stream_live<T: Read, F: FnMut(TsChunk) -> Result<()>>(
}
let path = chunk_path(output_dir, index);
let file = std::fs::File::create(&path)
let mut file = std::fs::File::create(&path)
.with_context(|| format!("failed to create {}", path.display()))?;
for bytes in &preroll {
file.write_all(bytes)?;
}
current_file = Some(file);
current_index = Some(index);
current_timing = Some(ChunkTiming {
@ -381,6 +431,13 @@ pub fn chunk_ts_stream_live<T: Read, F: FnMut(TsChunk) -> Result<()>>(
if let Some(file) = current_file.as_mut() {
file.write_all(packet.as_bytes())?;
}
if preroll_packets > 0 {
preroll.push_back(packet_bytes);
while preroll.len() > preroll_packets {
preroll.pop_front();
}
}
}
if let (Some(index), Some(timing), Some(file)) = (
@ -388,7 +445,7 @@ pub fn chunk_ts_stream_live<T: Read, F: FnMut(TsChunk) -> Result<()>>(
current_timing.take(),
current_file.take(),
) {
let _ = close_and_emit(index, timing, file);
close_and_emit(index, timing, file)?;
}
Ok(())
@ -929,6 +986,43 @@ mod tests {
let _ = fs::remove_dir_all(&dir);
}
#[test]
fn chunk_ts_stream_with_preroll_prepends_previous_packets() {
    // With a preroll of one packet, every chunk after the first is expected to
    // start with the packet that immediately preceded it in the stream.
    let chunk_ms = 1000u64;
    let dir_name = format!("ec-chopper-chunks-preroll-{}", std::process::id());
    let dir = std::env::temp_dir().join(dir_name);
    let _ = fs::remove_dir_all(&dir);
    fs::create_dir_all(&dir).unwrap();

    let packets = [
        ts_packet_with_pcr(0x0100, 0, 0),
        ts_packet_with_pcr(0x0100, 1, 27_000_000),
        ts_packet_with_pcr(0x0100, 2, 54_000_000),
    ];
    let input = packets.concat();

    let manifest =
        chunk_ts_stream_with_preroll(Cursor::new(input), &dir, chunk_ms, None, 1).unwrap();

    let indices: Vec<_> = manifest.chunks.iter().map(|chunk| chunk.index).collect();
    assert_eq!(indices, vec![0, 1, 2]);

    // Chunk 0 has nothing before it, so it holds only its own packet.
    let first_chunk = fs::read(&manifest.chunks[0].path).unwrap();
    assert_eq!(first_chunk, packets[0].to_vec());

    let second_chunk = fs::read(&manifest.chunks[1].path).unwrap();
    assert_eq!(second_chunk, [packets[0], packets[1]].concat());

    let third_chunk = fs::read(&manifest.chunks[2].path).unwrap();
    assert_eq!(third_chunk, [packets[1], packets[2]].concat());

    let _ = fs::remove_dir_all(&dir);
}
#[test]
fn hashed_manifest_merkle_root_matches_core() {
let dir = std::env::temp_dir().join(format!("ec-chopper-merkle-{}", std::process::id()));

File diff suppressed because it is too large Load diff

2937
crates/ec-core/src/sim.rs Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,986 @@
use ec_core::sim::{
check_control_plane_propagation_invariants, check_duplicate_publisher_invariants,
check_system_duplicate_publisher_invariants, run_control_plane_propagation_campaign,
run_control_plane_propagation_simulation, run_duplicate_publisher_campaign,
run_duplicate_publisher_simulation, run_seeded_simulation_campaign,
run_system_duplicate_publisher_campaign, run_system_duplicate_publisher_simulation,
shrink_duplicate_publisher_failure, simulated_media_hash,
ControlPlanePropagationInvariantConfig, ControlPlanePropagationScenario,
ControlPlaneTraceEvent, DeterministicSimulation, DuplicatePublisherInvariantConfig,
DuplicatePublisherScenario, EncoderDriftFault, FoundationStyleSystemScenarioConfig,
PublisherSequenceClock, SimulationOutage, SimulationPartition, SimulationSeed,
SystemDuplicatePublisherInvariantConfig, SystemDuplicatePublisherScenario,
};
// Stream, rendition and track identifiers shared by the tests in this file;
// they are passed to `simulated_media_hash`, `schedule_observation` and the
// scenario constructors.
const STREAM: &str = "la-kcop";
const RENDITION: &str = "720p";
const TRACK: &str = "0.m4s";
// Encoder profile label fed into `simulated_media_hash`. Tests swap in a
// different label (e.g. "x264-hd3-drift") when they need a diverging hash.
const PROFILE: &str = "x264-hd3-v1";
/// Schedules one observation from `node` for every sequence in
/// `start_sequence..end_sequence`, hashing each sequence with `profile`.
///
/// The first observation is scheduled at `first_delivery_ms`; each following
/// sequence is scheduled `step_ms` later than the one before it.
fn schedule_publisher_window(
    sim: &mut DeterministicSimulation,
    node: &str,
    start_sequence: u64,
    end_sequence: u64,
    first_delivery_ms: u64,
    step_ms: u64,
    profile: &str,
) {
    for sequence in start_sequence..end_sequence {
        let media_hash = simulated_media_hash(STREAM, RENDITION, TRACK, sequence, profile);
        // Position of this sequence inside the window, counted from zero.
        let slot = sequence - start_sequence;
        let deliver_at_ms = first_delivery_ms + slot * step_ms;
        sim.schedule_observation(
            deliver_at_ms,
            node,
            STREAM,
            RENDITION,
            TRACK,
            sequence,
            &media_hash,
        );
    }
}
#[test]
fn duplicate_publishers_converge_after_delayed_backfill() {
    let mut sim = DeterministicSimulation::new();
    // (node, first sequence, end sequence, first delivery in ms); nuc-b sends
    // its second window much later than its first.
    let windows = [
        ("nuc-a", 0, 12, 0),
        ("nuc-b", 0, 4, 30),
        ("nuc-b", 4, 12, 500),
    ];
    for (node, start, end, first_delivery_ms) in windows {
        schedule_publisher_window(&mut sim, node, start, end, first_delivery_ms, 10, PROFILE);
    }
    let no_sequences: Vec<u64> = Vec::new();

    // Snapshot taken before nuc-b's late window has been delivered.
    sim.run_until(250);
    let partial = sim.convergence().summarize(STREAM, RENDITION, TRACK, 0, 12);
    assert_eq!(partial.expected_sequences, 12);
    assert_eq!(partial.missing_sequences, no_sequences);
    assert_eq!(partial.matching_duplicate_sequences, vec![0, 1, 2, 3]);
    assert!(partial.ok());

    // Drain every remaining event and re-check the same window.
    sim.run_to_idle();
    let settled = sim.convergence().summarize(STREAM, RENDITION, TRACK, 0, 12);
    let completed_at_ms = sim
        .convergence()
        .duplicate_complete_at_ms(STREAM, RENDITION, TRACK, 0, 12);
    let every_sequence: Vec<u64> = (0_u64..12).collect();
    assert_eq!(settled.missing_sequences, no_sequences);
    assert_eq!(settled.divergent_sequences, no_sequences);
    assert_eq!(settled.matching_duplicate_sequences, every_sequence);
    assert_eq!(settled.duplicate_source_records, 24);
    assert_eq!(completed_at_ms, Some(570));
    assert_eq!(sim.trace().len(), 24);
    assert!(
        sim.trace().windows(2).all(|adjacent| {
            let earlier = (adjacent[0].at_ms, adjacent[0].order);
            let later = (adjacent[1].at_ms, adjacent[1].order);
            earlier <= later
        }),
        "trace should preserve deterministic event order"
    );
    assert!(settled.ok());
}
#[test]
fn media_convergence_can_summarize_sparse_observed_sequences() {
    // Two widely separated sequence numbers, each observed by both publishers.
    const FIRST_SEQUENCE: u64 = 7_287_381_184_512;
    const SECOND_SEQUENCE: u64 = 7_287_381_188_608;
    const WINDOW_END: u64 = SECOND_SEQUENCE + 1;

    let mut sim = DeterministicSimulation::new();
    for sequence in [FIRST_SEQUENCE, SECOND_SEQUENCE] {
        let media_hash = simulated_media_hash(STREAM, RENDITION, TRACK, sequence, PROFILE);
        sim.schedule_observation(0, "nuc-a", STREAM, RENDITION, TRACK, sequence, &media_hash);
        sim.schedule_observation(1, "nuc-b", STREAM, RENDITION, TRACK, sequence, &media_hash);
    }
    sim.run_to_idle();

    // The dense summary treats every number in the range as expected; the
    // sparse summary is asserted below to expect only the two observed ones.
    let dense = sim
        .convergence()
        .summarize(STREAM, RENDITION, TRACK, FIRST_SEQUENCE, WINDOW_END);
    let sparse = sim.convergence().summarize_observed_sequences(
        STREAM,
        RENDITION,
        TRACK,
        FIRST_SEQUENCE,
        WINDOW_END,
    );

    let no_sequences: Vec<u64> = Vec::new();
    assert!(!dense.missing_sequences.is_empty());
    assert_eq!(sparse.expected_sequences, 2);
    assert_eq!(sparse.missing_sequences, no_sequences);
    assert_eq!(
        sparse.matching_duplicate_sequences,
        vec![FIRST_SEQUENCE, SECOND_SEQUENCE]
    );
    assert!(sparse.ok());
}
#[test]
fn duplicate_publisher_simulation_detects_encoder_drift() {
    let mut sim = DeterministicSimulation::new();
    for (node, first_delivery_ms) in [("nuc-a", 0), ("nuc-b", 5)] {
        schedule_publisher_window(&mut sim, node, 0, 8, first_delivery_ms, 10, PROFILE);
    }

    // A second nuc-b observation of sequence 4, hashed under another profile.
    let drifted_hash = simulated_media_hash(STREAM, RENDITION, TRACK, 4, "x264-hd3-drift");
    sim.schedule_observation(90, "nuc-b", STREAM, RENDITION, TRACK, 4, &drifted_hash);
    sim.run_to_idle();

    let report = sim.convergence().summarize(STREAM, RENDITION, TRACK, 0, 8);
    let no_sequences: Vec<u64> = Vec::new();
    assert_eq!(report.missing_sequences, no_sequences);
    assert_eq!(report.divergent_sequences, vec![4]);
    assert!(!report.ok());
}
#[test]
fn duplicate_publisher_fault_schedule_replays_from_seed() {
    // Running the same seeded scenario twice must give identical reports.
    let seed = SimulationSeed::new(0x6d6f_712d_6475_7021);
    let scenario = faulted_duplicate_scenario(seed);
    let initial = run_duplicate_publisher_simulation(&scenario);
    let replayed = run_duplicate_publisher_simulation(&scenario);

    assert_eq!(initial, replayed);
    assert!(initial.duplicate_complete(), "replay {}", initial.replay_hint);
    let matched = initial.summary.matching_duplicate_sequences.len();
    assert_eq!(matched, 48);
    assert_eq!(
        initial.trace, replayed.trace,
        "replayed reports should carry the same event history"
    );
}
#[test]
fn duplicate_publisher_many_seed_fault_schedules_converge() {
    // Track whether each fault kind showed up in at least one seed, so the
    // suite cannot pass without exercising them.
    let mut drops_seen = false;
    let mut partitions_seen = false;
    let mut outages_seen = false;
    let no_sequences: Vec<u64> = Vec::new();

    for seed in 1..=96 {
        let scenario = faulted_duplicate_scenario(SimulationSeed::new(seed));
        let report = run_duplicate_publisher_simulation(&scenario);
        let stats = &report.fault_stats;
        if stats.transient_dropped_observations > 0 {
            drops_seen = true;
        }
        if stats.partition_delayed_observations > 0 {
            partitions_seen = true;
        }
        if stats.publisher_outage_observations > 0 {
            outages_seen = true;
        }

        assert!(
            report.duplicate_complete(),
            "duplicate publisher convergence failed for {}: {:?}",
            report.replay_hint,
            report.summary
        );
        assert_eq!(report.summary.missing_sequences, no_sequences);
        assert_eq!(report.summary.divergent_sequences, no_sequences);
        assert_eq!(report.summary.duplicate_source_records, 96);
    }

    assert!(drops_seen, "fault suite did not exercise transient drops");
    assert!(partitions_seen, "fault suite did not exercise partitions");
    assert!(
        outages_seen,
        "fault suite did not exercise publisher outages"
    );
}
#[test]
fn seeded_fault_scenario_detects_encoder_drift() {
    // Inject one drift fault for nuc-b at sequence 17 into the faulted scenario.
    let drift = EncoderDriftFault::new("nuc-b", 17, "x264-hd3-drift");
    let mut scenario = faulted_duplicate_scenario(SimulationSeed::new(0x6472_6966_7421));
    scenario.encoder_drifts.push(drift);

    let outcome = run_duplicate_publisher_simulation(&scenario);

    assert!(!outcome.duplicate_complete());
    assert_eq!(outcome.summary.divergent_sequences, vec![17]);
    assert_eq!(outcome.duplicate_complete_at_ms, None);
    assert_eq!(outcome.fault_stats.encoder_drift_observations, 1);
}
#[test]
fn duplicate_publisher_simulation_detects_unaligned_publisher_phase() {
    let publishers: Vec<String> = ["nuc-a", "nuc-b"]
        .iter()
        .map(|name| name.to_string())
        .collect();
    let mut scenario = DuplicatePublisherScenario::new(
        SimulationSeed::new(0x7068_6173_652d_6275),
        publishers,
        STREAM,
        RENDITION,
        TRACK,
        PROFILE,
        0,
        8,
    );
    // Remove network delay and jitter so the sequence offset is the only fault.
    scenario.base_network_delay_ms = 0;
    scenario.max_jitter_ms = 0;
    scenario
        .publisher_sequence_offsets
        .insert(String::from("nuc-b"), 3);

    let outcome = run_duplicate_publisher_simulation(&scenario);
    let deadline = DuplicatePublisherInvariantConfig::duplicate_complete_with_deadline(1_000);
    let verdict = check_duplicate_publisher_invariants(&outcome, &deadline);

    let no_sequences: Vec<u64> = Vec::new();
    let every_sequence: Vec<u64> = (0_u64..8).collect();
    assert!(!outcome.duplicate_complete());
    assert_eq!(outcome.summary.missing_sequences, no_sequences);
    assert_eq!(outcome.summary.matching_duplicate_sequences, no_sequences);
    assert_eq!(outcome.summary.divergent_sequences, every_sequence);
    assert_eq!(outcome.fault_stats.publisher_phase_offset_observations, 8);

    let expected_failures: Vec<String> = [
        "divergent_sequences",
        "media_timing_conflict_sequences",
        "duplicate_incomplete",
        "duplicate_complete_deadline_unreached",
    ]
    .iter()
    .map(|label| label.to_string())
    .collect();
    assert_eq!(verdict.failures, expected_failures);
}
#[test]
fn duplicate_publisher_simulation_rejects_missing_media_timing() {
    let publishers: Vec<String> = ["nuc-a", "nuc-b"]
        .iter()
        .map(|name| name.to_string())
        .collect();
    let mut scenario = DuplicatePublisherScenario::new(
        SimulationSeed::new(0x7469_6d65_2d6d_6973),
        publishers,
        STREAM,
        RENDITION,
        TRACK,
        PROFILE,
        0,
        6,
    );
    // Remove network delay and jitter; nuc-b is the only publisher flagged as
    // missing media timing.
    scenario.base_network_delay_ms = 0;
    scenario.max_jitter_ms = 0;
    scenario
        .missing_media_timing_publishers
        .insert(String::from("nuc-b"));

    let outcome = run_duplicate_publisher_simulation(&scenario);
    let deadline = DuplicatePublisherInvariantConfig::duplicate_complete_with_deadline(1_000);
    let verdict = check_duplicate_publisher_invariants(&outcome, &deadline);

    let no_sequences: Vec<u64> = Vec::new();
    assert_eq!(outcome.summary.divergent_sequences, no_sequences);
    assert_eq!(outcome.summary.media_timing_missing_records, 6);

    let expected_failures: Vec<String> = [
        "media_timing_missing_records",
        "duplicate_incomplete",
        "duplicate_complete_deadline_unreached",
    ]
    .iter()
    .map(|label| label.to_string())
    .collect();
    assert_eq!(verdict.failures, expected_failures);
}
#[test]
fn duplicate_publisher_simulation_rejects_conflicting_media_timing() {
    let publishers: Vec<String> = ["nuc-a", "nuc-b"]
        .iter()
        .map(|name| name.to_string())
        .collect();
    let mut scenario = DuplicatePublisherScenario::new(
        SimulationSeed::new(0x7469_6d65_2d73_6b65),
        publishers,
        STREAM,
        RENDITION,
        TRACK,
        PROFILE,
        0,
        6,
    );
    // Remove network delay and jitter; give nuc-b a 17 ms media-time offset.
    scenario.base_network_delay_ms = 0;
    scenario.max_jitter_ms = 0;
    scenario
        .publisher_media_time_offsets_ms
        .insert(String::from("nuc-b"), 17);

    let outcome = run_duplicate_publisher_simulation(&scenario);
    let deadline = DuplicatePublisherInvariantConfig::duplicate_complete_with_deadline(1_000);
    let verdict = check_duplicate_publisher_invariants(&outcome, &deadline);

    let no_sequences: Vec<u64> = Vec::new();
    let every_sequence: Vec<u64> = (0_u64..6).collect();
    assert_eq!(outcome.summary.divergent_sequences, no_sequences);
    assert_eq!(
        outcome.summary.media_timing_conflict_sequences,
        every_sequence
    );

    let expected_failures: Vec<String> = [
        "media_timing_conflict_sequences",
        "duplicate_incomplete",
        "duplicate_complete_deadline_unreached",
    ]
    .iter()
    .map(|label| label.to_string())
    .collect();
    assert_eq!(verdict.failures, expected_failures);
}
#[test]
fn duplicate_publisher_simulation_rejects_independent_source_material() {
    let publishers: Vec<String> = ["nuc-a", "nuc-b"]
        .iter()
        .map(|name| name.to_string())
        .collect();
    let mut scenario = DuplicatePublisherScenario::new(
        SimulationSeed::new(0x736f_7572_6365_6d61),
        publishers,
        STREAM,
        RENDITION,
        TRACK,
        PROFILE,
        0,
        6,
    );
    // Remove network delay and jitter; assign nuc-b its own source material.
    scenario.base_network_delay_ms = 0;
    scenario.max_jitter_ms = 0;
    scenario.publisher_source_material.insert(
        String::from("nuc-b"),
        String::from("independent-rf-window"),
    );

    let outcome = run_duplicate_publisher_simulation(&scenario);
    let deadline = DuplicatePublisherInvariantConfig::duplicate_complete_with_deadline(1_000);
    let verdict = check_duplicate_publisher_invariants(&outcome, &deadline);

    let no_sequences: Vec<u64> = Vec::new();
    let every_sequence: Vec<u64> = (0_u64..6).collect();
    assert_eq!(outcome.summary.divergent_sequences, every_sequence);
    assert_eq!(outcome.summary.media_timing_conflict_sequences, no_sequences);
    assert_eq!(
        outcome.fault_stats.source_material_mismatch_observations,
        12
    );

    let expected_failures: Vec<String> = [
        "divergent_sequences",
        "source_material_mismatch_observations",
        "duplicate_incomplete",
        "duplicate_complete_deadline_unreached",
    ]
    .iter()
    .map(|label| label.to_string())
    .collect();
    assert_eq!(verdict.failures, expected_failures);
}
#[test]
fn duplicate_publisher_outage_backfills_after_restart() {
    // Strip partitions and transient drops so a single nuc-b outage is the
    // only fault left in the scenario.
    let mut scenario = faulted_duplicate_scenario(SimulationSeed::new(0x6f75_7461_6765));
    scenario.partitions.clear();
    scenario.transient_drop_per_million = 0;
    scenario.publisher_outages = vec![SimulationOutage::new("nuc-b", 320, 760, 180)];

    let outcome = run_duplicate_publisher_simulation(&scenario);

    assert!(
        outcome.duplicate_complete(),
        "{} {:?}",
        outcome.replay_hint,
        outcome.summary
    );
    let stats = &outcome.fault_stats;
    assert!(stats.publisher_outage_observations > 0);
    // Every observation lost to the outage must come back as a backfill.
    assert_eq!(
        stats.backfill_observations,
        stats.publisher_outage_observations
    );
    let completed_at_ms = outcome.duplicate_complete_at_ms.unwrap();
    assert!(
        completed_at_ms >= 940,
        "outage restart should move convergence later than the live path"
    );
    assert!(completed_at_ms <= 3_000);
}
#[test]
fn duplicate_publisher_simulation_checks_convergence_deadline() {
    let scenario = faulted_duplicate_scenario(SimulationSeed::new(0x6465_6164_6c69_6e65));
    let outcome = run_duplicate_publisher_simulation(&scenario);
    let deadline = DuplicatePublisherInvariantConfig::duplicate_complete_with_deadline(3_000);
    let verdict = check_duplicate_publisher_invariants(&outcome, &deadline);

    assert!(
        verdict.ok,
        "{} {:?}",
        verdict.replay_hint, verdict.failures
    );
    assert!(verdict.duplicate_complete_at_ms.is_some());
    // Re-check the deadline directly on the reported completion time.
    let completed_at_ms = verdict.duplicate_complete_at_ms.unwrap();
    assert!(
        completed_at_ms <= 3_000,
        "{} completed too late: {:?}",
        verdict.replay_hint,
        verdict.duplicate_complete_at_ms
    );
}
#[test]
fn seeded_simulation_campaign_preserves_first_failure() {
    // The check reports a failure (its replay hint) only for seed value 44;
    // the hint is still computed for every seed, as in a `then_some` call.
    let fail_on_seed_44 = |seed| {
        let is_failing_seed = seed.0 == 44;
        let hint = seed.replay_hint();
        if is_failing_seed {
            Some(hint)
        } else {
            None
        }
    };
    let outcome = run_seeded_simulation_campaign(
        "generic-seeded-campaign",
        SimulationSeed::new(40),
        8,
        fail_on_seed_44,
    );

    assert!(!outcome.all_passed());
    assert_eq!(outcome.passed, 7);
    assert_eq!(outcome.failed, 1);
    let first_failure = outcome.first_failure.as_deref();
    assert_eq!(first_failure, Some("EC_SIM_SEED=000000000000002c"));
}
#[test]
fn control_plane_propagation_replays_from_seed() {
    // Running the same seeded scenario twice must give identical reports.
    let seed = SimulationSeed::new(0x6374_726c_7265_706c);
    let scenario = faulted_control_plane_scenario(seed);
    let initial = run_control_plane_propagation_simulation(&scenario);
    let replayed = run_control_plane_propagation_simulation(&scenario);

    assert_eq!(initial, replayed);
    assert!(
        initial.propagation_complete(),
        "control propagation failed for {}: {:?}",
        initial.replay_hint,
        initial.missing_nodes
    );
    let node_count = scenario.nodes.len() as u64;
    assert_eq!(initial.known_count, node_count);
    assert_eq!(
        initial.trace, replayed.trace,
        "replayed control-plane schedules should carry identical traces"
    );

    // The trace must contain at least one of each of these event kinds.
    let saw_scheduled = initial.trace.iter().any(|entry| {
        matches!(
            entry.event,
            ControlPlaneTraceEvent::MessageScheduled { .. }
        )
    });
    assert!(saw_scheduled);
    let saw_learned = initial
        .trace
        .iter()
        .any(|entry| matches!(entry.event, ControlPlaneTraceEvent::NodeLearned { .. }));
    assert!(saw_learned);
}
#[test]
fn control_plane_campaign_runs_many_fault_schedules() {
    const ITERATIONS: u64 = 512;
    let deadline = ControlPlanePropagationInvariantConfig::complete_with_deadline(7, 900);
    let outcome = run_control_plane_propagation_campaign(
        "control-plane-gossip-fault-campaign",
        SimulationSeed::new(1),
        ITERATIONS,
        &deadline,
        faulted_control_plane_scenario,
    );

    assert!(
        outcome.all_passed(),
        "campaign failed: {:?}",
        outcome.first_failure
    );
    assert_eq!(outcome.passed, ITERATIONS);
    assert_eq!(outcome.failed, 0);
    // Each fault kind must have occurred somewhere in the campaign.
    assert!(outcome.total_transient_dropped_messages > 0);
    assert!(outcome.total_partition_delayed_messages > 0);
    assert!(outcome.total_node_outage_delayed_messages > 0);
    assert!(outcome.total_duplicate_messages > 0);
    assert!(outcome.max_propagation_complete_ms_observed <= 900);
}
#[test]
fn control_plane_simulation_detects_dead_fanout() {
    // Fanout is set to zero with every other fault removed.
    let mut scenario = faulted_control_plane_scenario(SimulationSeed::new(0x6661_6e6f_7574));
    scenario.fanout = 0;
    scenario.transient_drop_per_million = 0;
    scenario.partitions.clear();
    scenario.node_outages.clear();

    let outcome = run_control_plane_propagation_simulation(&scenario);
    let deadline = ControlPlanePropagationInvariantConfig::complete_with_deadline(7, 900);
    let verdict = check_control_plane_propagation_invariants(&outcome, &deadline);

    assert!(!outcome.propagation_complete());
    assert_eq!(outcome.known_nodes, vec![String::from("nuc-a")]);
    assert_eq!(outcome.missing_nodes.len(), 6);

    let expected_failures: Vec<String> = [
        "propagation_incomplete",
        "propagation_deadline_unreached",
    ]
    .iter()
    .map(|label| label.to_string())
    .collect();
    assert_eq!(verdict.failures, expected_failures);
}
/// Runs a 512-iteration duplicate-publisher campaign starting at seed 1 and
/// expects every run to pass within the 3000 ms deadline while drops,
/// partition delays, publisher outages and backfill are each observed.
#[test]
fn duplicate_publisher_campaign_runs_many_seed_schedules() {
    let limits = DuplicatePublisherInvariantConfig::duplicate_complete_with_deadline(3_000);
    let outcome = run_duplicate_publisher_campaign(
        "duplicate-publisher-fault-campaign",
        SimulationSeed::new(1),
        512,
        &limits,
        faulted_duplicate_scenario,
    );

    assert!(
        outcome.all_passed(),
        "campaign failed: {:?}",
        outcome.first_failure
    );
    assert_eq!(outcome.passed, 512);
    assert_eq!(outcome.failed, 0);

    // Each fault class must have been hit at least once across the campaign.
    assert!(outcome.total_transient_dropped_observations > 0);
    assert!(outcome.total_partition_delayed_observations > 0);
    assert!(outcome.total_publisher_outage_observations > 0);
    assert!(outcome.total_backfill_observations > 0);

    assert!(outcome.max_duplicate_complete_ms_observed <= 3_000);
}
/// Adds an encoder drift fault for `nuc-b` to the seed-19 scenario and expects
/// the shrinker to strip the unrelated faults (partitions, outages, drops,
/// jitter, base delay) while keeping the divergence at sequence 17.
#[test]
fn duplicate_publisher_shrinker_minimizes_noisy_drift_failure() {
    let limits = DuplicatePublisherInvariantConfig::duplicate_complete_with_deadline(3_000);
    let mut noisy = faulted_duplicate_scenario(SimulationSeed::new(19));
    let drift = EncoderDriftFault::new("nuc-b", 17, "x264-hd3-drift");
    noisy.encoder_drifts.push(drift);

    let minimized = shrink_duplicate_publisher_failure(&noisy, &limits)
        .expect("drift should fail and be shrinkable");

    // The seed is preserved; everything not needed to reproduce is zeroed.
    assert_eq!(minimized.seed, SimulationSeed::new(19));
    assert_eq!(minimized.scenario.expected_sequences(), 18);
    assert_eq!(minimized.scenario.partitions.len(), 0);
    assert_eq!(minimized.scenario.publisher_outages.len(), 0);
    assert_eq!(minimized.scenario.transient_drop_per_million, 0);
    assert_eq!(minimized.scenario.max_jitter_ms, 0);
    assert_eq!(minimized.scenario.base_network_delay_ms, 0);
    assert_eq!(minimized.report.summary.divergent_sequences, vec![17]);

    let expected_failures: Vec<String> = [
        "divergent_sequences",
        "duplicate_incomplete",
        "duplicate_complete_deadline_unreached",
    ]
    .iter()
    .map(|name| name.to_string())
    .collect();
    assert_eq!(minimized.invariant.failures, expected_failures);

    let recorded_window_shrink = minimized
        .steps
        .iter()
        .any(|step| step.dimension == "sequence_count" && step.after == "18");
    assert!(
        recorded_window_shrink,
        "shrink steps should record the minimized failing media window"
    );
}
/// Injects an encoder drift only for seed 19 in a 32-iteration campaign that
/// starts at seed 10, then checks that the campaign keeps that failure, that
/// rebuilding the same scenario by hand reproduces the stored report, and that
/// a shrunk replay is attached.
#[test]
fn duplicate_publisher_campaign_keeps_first_replayable_failure() {
    let limits = DuplicatePublisherInvariantConfig::duplicate_complete_with_deadline(3_000);
    // Scenario factory: identical to the faulted baseline except for seed 19.
    let scenario_for_seed = |seed: SimulationSeed| {
        let mut scenario = faulted_duplicate_scenario(seed);
        if seed.0 == 19 {
            let drift = EncoderDriftFault::new("nuc-b", 17, "x264-hd3-drift");
            scenario.encoder_drifts.push(drift);
        }
        scenario
    };
    let outcome = run_duplicate_publisher_campaign(
        "duplicate-publisher-replayable-failure",
        SimulationSeed::new(10),
        32,
        &limits,
        scenario_for_seed,
    );

    let failure = outcome
        .first_failure
        .as_ref()
        .expect("campaign should preserve first failure");
    let minimized = failure
        .shrunk_failure
        .as_ref()
        .expect("campaign should preserve a shrunk replay");

    assert_eq!(failure.seed, SimulationSeed::new(19));
    let expected_failures: Vec<String> = [
        "divergent_sequences",
        "duplicate_incomplete",
        "duplicate_complete_deadline_unreached",
    ]
    .iter()
    .map(|name| name.to_string())
    .collect();
    assert_eq!(failure.invariant.failures, expected_failures);

    // Rebuild the failing scenario from the stored seed and re-run it.
    let mut replay = faulted_duplicate_scenario(failure.seed);
    replay
        .encoder_drifts
        .push(EncoderDriftFault::new("nuc-b", 17, "x264-hd3-drift"));
    let replayed_report = run_duplicate_publisher_simulation(&replay);
    assert_eq!(replayed_report, failure.report);

    assert_eq!(minimized.scenario.expected_sequences(), 18);
    assert_eq!(minimized.report.summary.divergent_sequences, vec![17]);
}
/// With the global sequence clock, the combined control + media simulation
/// must complete, pass its invariants, show no divergent sequences, match
/// every expected sequence, and activate `nuc-b` later than `nuc-a`.
#[test]
fn system_duplicate_publishers_converge_with_global_sequence_clock() {
    let seed = SimulationSeed::new(0x7379_7374_656d_676c);
    let scenario = system_duplicate_scenario(seed, PublisherSequenceClock::Global);
    let report = run_system_duplicate_publisher_simulation(&scenario);
    let limits = SystemDuplicatePublisherInvariantConfig::complete_with_deadline(3_500);
    let verdict = check_system_duplicate_publisher_invariants(&report, &limits);

    assert!(
        report.system_complete(),
        "{} control={:?} media={:?}",
        report.replay_hint,
        report.control.missing_nodes,
        report.media.summary
    );
    assert!(verdict.ok, "{} {:?}", verdict.replay_hint, verdict.failures);

    let no_divergence: Vec<u64> = Vec::new();
    assert_eq!(report.media.summary.divergent_sequences, no_divergence);
    let matched = report.media.summary.matching_duplicate_sequences.len() as u64;
    assert_eq!(matched, scenario.media.expected_sequences());

    // A publisher without a recorded activation time falls back to the
    // type's default value, exactly as in the direct lookup.
    let activation_of = |node: &str| {
        report
            .publisher_activation_ms
            .get(node)
            .copied()
            .unwrap_or_default()
    };
    assert!(
        activation_of("nuc-b") > activation_of("nuc-a"),
        "faulted control plane should activate nuc-b later than nuc-a"
    );
}
/// With the local-activation sequence clock, control propagation still
/// completes but the media side must diverge, and the invariant check must
/// list the four expected failure codes in order.
#[test]
fn system_duplicate_publishers_reject_local_activation_sequence_clock() {
    let seed = SimulationSeed::new(0x7379_7374_656d_6c6f);
    let scenario = system_duplicate_scenario(seed, PublisherSequenceClock::LocalActivation);
    let report = run_system_duplicate_publisher_simulation(&scenario);
    let limits = SystemDuplicatePublisherInvariantConfig::complete_with_deadline(3_500);
    let verdict = check_system_duplicate_publisher_invariants(&report, &limits);

    assert!(report.control.propagation_complete());
    assert!(!report.media.duplicate_complete());
    let diverged = !report.media.summary.divergent_sequences.is_empty();
    assert!(
        diverged,
        "local activation clock should cause same advertised sequence to hash differently"
    );

    let expected_failures: Vec<String> = [
        "media_divergent_sequences",
        "media_timing_conflict_sequences",
        "media_duplicate_incomplete",
        "system_complete_deadline_unreached",
    ]
    .iter()
    .map(|name| name.to_string())
    .collect();
    assert_eq!(verdict.failures, expected_failures);
}
/// Runs a 256-iteration system campaign with the global sequence clock and
/// checks pass counts, timing bounds, trace accounting, fault coverage, the
/// ordering of the slowest-run list, and that no publisher phase offsets were
/// counted.
#[test]
fn system_duplicate_publisher_campaign_runs_many_seed_schedules() {
    let limits = SystemDuplicatePublisherInvariantConfig::complete_with_deadline(3_500);
    let outcome = run_system_duplicate_publisher_campaign(
        "system-duplicate-publisher-fault-campaign",
        SimulationSeed::new(1),
        256,
        &limits,
        |seed| system_duplicate_scenario(seed, PublisherSequenceClock::Global),
    );

    assert!(
        outcome.all_passed(),
        "campaign failed: {:?}",
        outcome.first_failure
    );
    assert_eq!(outcome.passed, 256);
    assert_eq!(outcome.failed, 0);

    // Timing aggregates.
    assert!(outcome.max_control_propagation_ms_observed > 0);
    assert!(outcome.max_media_duplicate_complete_ms_observed > 0);
    assert!(outcome.max_system_complete_ms_observed <= 3_500);
    assert!(outcome.total_system_complete_ms_observed > 0);

    // Trace accounting: the total is the sum of the control and media parts.
    assert!(outcome.total_control_trace_events > 0);
    assert!(outcome.total_media_trace_events > 0);
    let summed_trace_events =
        outcome.total_control_trace_events + outcome.total_media_trace_events;
    assert_eq!(outcome.total_trace_events, summed_trace_events);

    // Fault coverage, by total count and by number of seeds affected.
    assert!(outcome.total_control_transient_drops > 0);
    assert!(outcome.total_media_transient_drops > 0);
    assert!(outcome.total_media_backfill_observations > 0);
    assert!(outcome.seeds_with_system_convergence_time > 0);
    assert!(outcome.seeds_with_control_transient_drops > 0);
    assert!(outcome.seeds_with_media_transient_drops > 0);
    assert!(outcome.seeds_with_media_backfill_observations > 0);

    // Slowest-run list: non-empty, at most 16 entries, and non-increasing by
    // completion time (a missing time is treated as u64::MAX).
    assert!(!outcome.slowest_system_runs.is_empty());
    assert!(outcome.slowest_system_runs.len() <= 16);
    let ordered_slowest_first = outcome.slowest_system_runs.windows(2).all(|pair| {
        let earlier = pair[0].system_complete_at_ms.unwrap_or(u64::MAX);
        let later = pair[1].system_complete_at_ms.unwrap_or(u64::MAX);
        earlier >= later
    });
    assert!(ordered_slowest_first);

    assert_eq!(outcome.total_media_publisher_phase_offsets, 0);
}
/// Runs a 512-iteration campaign over the foundation-style scenario generator
/// with its default config and a 6000 ms deadline, then checks pass counts,
/// trace accounting, and that every control and media fault class was
/// observed both in total and in at least one seed.
#[test]
fn foundation_style_system_campaign_runs_replayable_fault_schedules() {
    let limits = SystemDuplicatePublisherInvariantConfig::complete_with_deadline(6_000);
    let config = FoundationStyleSystemScenarioConfig::default();
    let outcome = run_system_duplicate_publisher_campaign(
        "foundation-style-system-campaign",
        SimulationSeed::new(1),
        512,
        &limits,
        |seed| ec_core::sim::foundation_style_system_duplicate_publisher_scenario(seed, &config),
    );

    assert!(
        outcome.all_passed(),
        "campaign failed: {:?}",
        outcome.first_failure
    );
    assert_eq!(outcome.passed, 512);
    assert_eq!(outcome.failed, 0);
    assert!(outcome.max_system_complete_ms_observed <= 6_000);
    assert!(outcome.total_system_complete_ms_observed > 0);

    // Trace accounting: the total is the sum of the control and media parts.
    assert!(outcome.total_control_trace_events > 0);
    assert!(outcome.total_media_trace_events > 0);
    let summed_trace_events =
        outcome.total_control_trace_events + outcome.total_media_trace_events;
    assert_eq!(outcome.total_trace_events, summed_trace_events);

    // Fault coverage by total count.
    assert!(outcome.total_control_transient_drops > 0);
    assert!(outcome.total_control_partition_delays > 0);
    assert!(outcome.total_control_node_outage_delays > 0);
    assert!(outcome.total_control_duplicate_messages > 0);
    assert!(outcome.total_media_transient_drops > 0);
    assert!(outcome.total_media_partition_delays > 0);
    assert!(outcome.total_media_publisher_outages > 0);
    assert!(outcome.total_media_backfill_observations > 0);

    // Fault coverage by number of seeds that observed each class.
    assert!(outcome.seeds_with_system_convergence_time > 0);
    assert!(outcome.seeds_with_control_propagation_time > 0);
    assert!(outcome.seeds_with_media_duplicate_convergence_time > 0);
    assert!(outcome.seeds_with_control_transient_drops > 0);
    assert!(outcome.seeds_with_control_partition_delays > 0);
    assert!(outcome.seeds_with_control_node_outage_delays > 0);
    assert!(outcome.seeds_with_control_duplicate_messages > 0);
    assert!(outcome.seeds_with_media_transient_drops > 0);
    assert!(outcome.seeds_with_media_partition_delays > 0);
    assert!(outcome.seeds_with_media_publisher_outages > 0);
    assert!(outcome.seeds_with_media_backfill_observations > 0);
    assert!(outcome.fault_coverage_ok());

    assert!(!outcome.slowest_system_runs.is_empty());
    assert!(outcome.slowest_system_runs.len() <= 16);
    assert_eq!(outcome.total_media_publisher_phase_offsets, 0);
}
/// Switches the foundation-style config to the local-activation sequence
/// clock and expects a 32-iteration campaign to record a failure that
/// includes `media_divergent_sequences` and publisher phase offsets.
#[test]
fn foundation_style_system_campaign_rejects_local_activation_sequence_clock() {
    let limits = SystemDuplicatePublisherInvariantConfig::complete_with_deadline(6_000);
    let mut config = FoundationStyleSystemScenarioConfig::default();
    config.sequence_clock = PublisherSequenceClock::LocalActivation;

    let outcome = run_system_duplicate_publisher_campaign(
        "foundation-style-local-activation-failure",
        SimulationSeed::new(1),
        32,
        &limits,
        |seed| ec_core::sim::foundation_style_system_duplicate_publisher_scenario(seed, &config),
    );

    let failure = outcome
        .first_failure
        .as_ref()
        .expect("local activation clock should fail under foundation-style faults");
    assert!(!outcome.all_passed());

    let divergence_code = "media_divergent_sequences".to_string();
    assert!(failure.invariant.failures.contains(&divergence_code));
    assert!(!failure.report.media.summary.divergent_sequences.is_empty());

    // The stored failure and the campaign totals must both show phase offsets.
    let media_faults = &failure.report.media.fault_stats;
    assert!(media_faults.publisher_phase_offset_observations > 0);
    assert!(outcome.total_media_publisher_phase_offsets > 0);
    assert!(outcome.seeds_with_media_publisher_phase_offsets > 0);
}
/// Gives `nuc-b` a different source-material label in a single-iteration
/// campaign and expects the failure to carry the source-material mismatch
/// code, divergent sequences, and matching campaign counters.
#[test]
fn system_duplicate_publisher_campaign_classifies_source_material_mismatch() {
    let limits = SystemDuplicatePublisherInvariantConfig::complete_with_deadline(3_500);
    let outcome = run_system_duplicate_publisher_campaign(
        "system-source-material-failure",
        SimulationSeed::new(1),
        1,
        &limits,
        |seed| {
            let mut scenario = system_duplicate_scenario(seed, PublisherSequenceClock::Global);
            // Override only nuc-b's source material.
            let publisher = "nuc-b".to_string();
            let material = "independent-rf-window".to_string();
            scenario
                .media
                .publisher_source_material
                .insert(publisher, material);
            scenario
        },
    );

    let failure = outcome
        .first_failure
        .as_ref()
        .expect("source material mismatch should fail");
    assert!(!outcome.all_passed());

    let mismatch_code = "media_source_material_mismatch_observations".to_string();
    assert!(failure.invariant.failures.contains(&mismatch_code));
    assert!(!failure.report.media.summary.divergent_sequences.is_empty());

    let media_faults = &failure.report.media.fault_stats;
    assert!(media_faults.source_material_mismatch_observations > 0);
    assert!(outcome.total_media_source_material_mismatches > 0);
    assert_eq!(outcome.seeds_with_media_source_material_mismatches, 1);
}
/// Builds the two-publisher (`nuc-a`, `nuc-b`) media scenario shared by the
/// duplicate-publisher tests: 40 ms segment step, 5 ms base delay, up to 75 ms
/// jitter, 275,000-per-million transient drops, backfill after 600 ms, two
/// partitions and one `nuc-b` outage.
fn faulted_duplicate_scenario(seed: SimulationSeed) -> DuplicatePublisherScenario {
    let publishers: Vec<String> = ["nuc-a", "nuc-b"]
        .iter()
        .map(|name| name.to_string())
        .collect();
    let mut scenario = DuplicatePublisherScenario::new(
        seed, publishers, STREAM, RENDITION, TRACK, PROFILE, 0, 48,
    );

    // Timing and loss parameters.
    scenario.segment_step_ms = 40;
    scenario.base_network_delay_ms = 5;
    scenario.max_jitter_ms = 75;
    scenario.transient_drop_per_million = 275_000;
    scenario.backfill_after_ms = 600;

    // Scheduled faults.
    let nuc_b_partition = SimulationPartition::new("nuc-b", 120, 520, 140);
    let nuc_a_partition = SimulationPartition::new("nuc-a", 940, 1_260, 90);
    scenario.partitions = vec![nuc_b_partition, nuc_a_partition];
    let nuc_b_outage = SimulationOutage::new("nuc-b", 1_360, 1_520, 220);
    scenario.publisher_outages = vec![nuc_b_outage];

    scenario
}
/// Builds the seven-node control-plane gossip scenario shared by the
/// control-plane tests, originating at `nuc-a`, with fanout 3, a 35 ms gossip
/// interval, up to 12 rounds, 120,000-per-million transient drops, two
/// partitions and one `relay-nyc` outage.
fn faulted_control_plane_scenario(seed: SimulationSeed) -> ControlPlanePropagationScenario {
    let nodes: Vec<String> = [
        "nuc-a",
        "nuc-b",
        "tower",
        "forge",
        "relay-lax",
        "relay-nyc",
        "relay-hel",
    ]
    .iter()
    .map(|name| name.to_string())
    .collect();
    let mut scenario = ControlPlanePropagationScenario::new(
        seed,
        nodes,
        "nuc-a",
        "ec.control.broadcast.la-kcop",
        "la-kcop@42",
    );

    // Gossip shape.
    scenario.fanout = 3;
    scenario.gossip_interval_ms = 35;
    scenario.max_gossip_rounds = 12;

    // Network timing and loss.
    scenario.base_network_delay_ms = 6;
    scenario.max_jitter_ms = 45;
    scenario.transient_drop_per_million = 120_000;

    // Scheduled faults.
    let relay_hel_partition = SimulationPartition::new("relay-hel", 70, 190, 55);
    let tower_partition = SimulationPartition::new("tower", 220, 310, 40);
    scenario.partitions = vec![relay_hel_partition, tower_partition];
    let relay_nyc_outage = SimulationOutage::new("relay-nyc", 105, 205, 45);
    scenario.node_outages = vec![relay_nyc_outage];

    scenario
}
/// Builds the combined system scenario: a seven-node control plane
/// originating at `forge`, plus a two-publisher media scenario whose seed is
/// the system seed XOR `0x6d65_6469_6121`, joined with a 25 ms activation
/// delay, a 180 ms backfill delay and the requested sequence clock.
fn system_duplicate_scenario(
    seed: SimulationSeed,
    sequence_clock: PublisherSequenceClock,
) -> SystemDuplicatePublisherScenario {
    // --- Control plane ---
    let control_nodes: Vec<String> = [
        "forge",
        "nuc-a",
        "nuc-b",
        "tower",
        "relay-lax",
        "relay-nyc",
        "relay-hel",
    ]
    .iter()
    .map(|name| name.to_string())
    .collect();
    let mut control = ControlPlanePropagationScenario::new(
        seed,
        control_nodes,
        "forge",
        "ec.control.broadcast.la-kcop",
        "la-kcop@42",
    );
    control.fanout = 3;
    control.gossip_interval_ms = 35;
    control.max_gossip_rounds = 12;
    control.base_network_delay_ms = 6;
    control.max_jitter_ms = 45;
    control.transient_drop_per_million = 120_000;
    let nuc_b_partition = SimulationPartition::new("nuc-b", 0, 180, 40);
    let relay_hel_partition = SimulationPartition::new("relay-hel", 70, 190, 55);
    control.partitions = vec![nuc_b_partition, relay_hel_partition];
    control.node_outages = vec![SimulationOutage::new("relay-nyc", 105, 205, 45)];

    // --- Media plane ---
    // The media seed is derived from the system seed, not reused directly.
    let media_seed = SimulationSeed::new(seed.0 ^ 0x6d65_6469_6121);
    let publishers: Vec<String> = ["nuc-a", "nuc-b"]
        .iter()
        .map(|name| name.to_string())
        .collect();
    let mut media = DuplicatePublisherScenario::new(
        media_seed, publishers, STREAM, RENDITION, TRACK, PROFILE, 0, 48,
    );
    media.segment_step_ms = 40;
    media.base_network_delay_ms = 5;
    media.max_jitter_ms = 75;
    media.transient_drop_per_million = 275_000;
    media.backfill_after_ms = 600;
    media.partitions = vec![SimulationPartition::new("nuc-a", 940, 1_260, 90)];
    media.publisher_outages = vec![SimulationOutage::new("nuc-b", 1_360, 1_520, 220)];

    // --- Combined system ---
    let mut system = SystemDuplicatePublisherScenario::new(seed, control, media);
    system.publisher_activation_delay_ms = 25;
    system.publisher_backfill_delay_ms = 180;
    system.sequence_clock = sequence_clock;
    system
}

View file

@ -29,17 +29,21 @@ rustls-native-certs = "0.8.3"
urlencoding = "2"
serde.workspace = true
serde_json.workspace = true
opentelemetry.workspace = true
opentelemetry-otlp.workspace = true
opentelemetry_sdk.workspace = true
tokio = { version = "1", features = ["full"] }
tokio-tungstenite = { version = "0.24", default-features = false, features = ["connect", "rustls-tls-webpki-roots"] }
futures-util = "0.3"
tracing.workspace = true
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
web-transport-quinn = "0.11.4"
web-transport-trait = "0.3.3"
hang = "0.14.0"
moq-mux = "0.2.1"
moq-lite = "0.14.0"
moq-native = { version = "0.13.1", default-features = true }
web-transport-quinn = "0.11.9"
web-transport-trait = "0.3.4"
hang = "0.16.0"
moq-mux = "0.4.0"
moq-lite = "0.16.0"
moq-native = { version = "0.14.0", default-features = true }
headless_chrome = "1"
tokio-util = "0.7"
url = "2"

File diff suppressed because it is too large Load diff

View file

@ -2,7 +2,7 @@ use anyhow::{anyhow, Context, Result};
use headless_chrome::protocol::cdp::Page;
use headless_chrome::{Browser, Tab};
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::collections::{HashMap, HashSet};
use std::env;
use std::fs;
use std::io::{BufRead, BufReader, Cursor, Read, Write};
@ -65,11 +65,15 @@ pub struct BootstrapResult {
pub page_url: String,
pub interactive_auth_required: bool,
pub authorized: bool,
pub video_ready: bool,
pub current_time: f64,
pub width: u64,
pub height: u64,
pub screenshot_path: Option<PathBuf>,
}
#[derive(Debug)]
struct WaitOutcome {
tab: Arc<Tab>,
state: NbcVideoState,
trace: NbcTraceState,
interactive_auth_required: bool,
@ -199,6 +203,14 @@ fn nbc_bootstrap_timeout() -> Duration {
.unwrap_or_else(|| Duration::from_secs(1800))
}
/// Returns the profile sign-in gate timeout.
///
/// The value is read, in whole seconds, from the
/// `EVERY_CHANNEL_NBC_PROFILE_SIGNIN_GATE_TIMEOUT_SECS` environment variable.
/// Surrounding whitespace is ignored, so a value such as `"8\n"` is accepted
/// instead of silently falling back. An unset, non-Unicode, or non-numeric
/// value yields the default of 8 seconds.
fn nbc_profile_signin_gate_timeout() -> Duration {
    const DEFAULT_SECS: u64 = 8;
    let secs = env::var("EVERY_CHANNEL_NBC_PROFILE_SIGNIN_GATE_TIMEOUT_SECS")
        .ok()
        // Trim before parsing: `str::parse::<u64>` rejects padded input.
        .and_then(|value| value.trim().parse::<u64>().ok())
        .unwrap_or(DEFAULT_SECS);
    Duration::from_secs(secs)
}
fn nbc_env_flag(name: &str) -> Option<bool> {
env::var(name).ok().map(|value| {
let value = value.trim().to_ascii_lowercase();
@ -403,6 +415,10 @@ pub fn bootstrap_nbc_auth(
)?;
Ok(BootstrapResult {
video_ready: nbc_video_state_has_decoded_frame(&outcome.state),
current_time: outcome.state.current_time,
width: outcome.state.width,
height: outcome.state.height,
title: outcome.state.title,
page_url: outcome.state.page_url,
interactive_auth_required: outcome.interactive_auth_required,
@ -619,7 +635,7 @@ fn run_nbc_capture_loop(
register_nbc_trace_handlers(&tab, trace.clone())?;
tab.navigate_to(&url)?;
tab.wait_until_navigated()?;
wait_for_nbc_playback(
let outcome = wait_for_nbc_playback(
chrome.browser(),
&tab,
&url,
@ -627,31 +643,34 @@ fn run_nbc_capture_loop(
AuthMode::Forbidden,
None,
)?;
let capture_tab = outcome.tab;
let frame_interval = Duration::from_millis(1000 / nbc_capture_fps().max(1));
let quality = nbc_capture_quality();
let mut first_frame = true;
loop {
kick_nbc_player(&tab).ok();
let frame = tab
kick_nbc_player(&capture_tab).ok();
let state = probe_nbc_video(&capture_tab).unwrap_or_default();
if !nbc_video_state_has_decoded_frame(&state) {
return Err(anyhow!(
"NBC capture tab lost decoded video (title='{}', page_url='{}', current_time={}, ready_state={}, has_video={})",
state.title,
state.page_url,
state.current_time,
state.ready_state,
state.has_video,
));
}
let video = capture_tab
.find_element("video")
.and_then(|video| {
video.parent.capture_screenshot(
Page::CaptureScreenshotFormatOption::Jpeg,
Some(quality),
Some(video.get_box_model()?.content_viewport()),
true,
)
})
.or_else(|_| {
tab.capture_screenshot(
Page::CaptureScreenshotFormatOption::Jpeg,
Some(quality),
None,
true,
)
})?;
.context("NBC capture tab has no video element after playback readiness")?;
let frame = video.parent.capture_screenshot(
Page::CaptureScreenshotFormatOption::Jpeg,
Some(quality),
Some(video.get_box_model()?.content_viewport()),
true,
)?;
if first_frame {
first_frame = false;
@ -785,6 +804,15 @@ fn nbc_url_is_provider_linked(url: &str) -> bool {
(host.ends_with("nbc.com") || host.ends_with(".nbc.com")) && path.contains("provider-linked")
}
/// Returns `true` when `url` parses, its lowercased host ends with `nbc.com`,
/// and its lowercased path contains `mvpd-complete`. Unparseable input and
/// URLs without a host yield `false`.
fn nbc_url_is_mvpd_complete(url: &str) -> bool {
    let parsed = match Url::parse(url) {
        Ok(parsed) => parsed,
        Err(_) => return false,
    };
    let host = parsed.host_str().unwrap_or_default().to_ascii_lowercase();
    // NOTE(review): the first suffix test already covers the second, and it
    // also accepts hosts such as `examplenbc.com` — confirm whether an exact
    // `nbc.com` / `.nbc.com` match was intended.
    let on_nbc_host = host.ends_with("nbc.com") || host.ends_with(".nbc.com");
    if !on_nbc_host {
        return false;
    }
    parsed.path().to_ascii_lowercase().contains("mvpd-complete")
}
fn nbc_url_is_optional_profile_signin(url: &str) -> bool {
let Ok(url) = Url::parse(url) else {
return false;
@ -812,6 +840,7 @@ fn nbc_page_is_watch_surface(url: &str) -> bool {
(host.ends_with("nbc.com") || host.ends_with(".nbc.com"))
&& !nbc_url_is_optional_profile_signin(url.as_str())
&& !nbc_url_is_provider_linked(url.as_str())
&& !nbc_url_is_mvpd_complete(url.as_str())
}
fn nbc_title_looks_like_verizon_popup(title: &str) -> bool {
@ -836,6 +865,14 @@ fn nbc_state_is_optional_profile_signin(state: &NbcVideoState) -> bool {
|| nbc_title_looks_like_optional_profile_signin(&state.title)
}
/// Returns `true` when the page body text, compared case-insensitively
/// (ASCII), contains any of the known "outside of the US" phrases.
fn nbc_clues_look_geo_blocked(clues: &NbcPageClues) -> bool {
    // Both "US" and "U.S." spellings are listed explicitly.
    const GEO_BLOCK_PHRASES: [&str; 4] = [
        "not authorized to access this content from outside of the us",
        "not authorized to access this content from outside of the u.s.",
        "outside of the us and its territories",
        "outside of the u.s. and its territories",
    ];
    let haystack = clues.body_text.to_ascii_lowercase();
    GEO_BLOCK_PHRASES
        .iter()
        .any(|phrase| haystack.contains(*phrase))
}
fn browser_tabs(browser: &Browser) -> Vec<Arc<Tab>> {
browser.register_missing_tabs();
browser.get_tabs().lock().unwrap().iter().cloned().collect()
@ -877,20 +914,20 @@ fn find_primary_tab_state<'a>(
.find(|candidate| candidate.tab.get_target_id() == target_id)
}
fn find_playing_tab_state(tabs: &[BrowserTabState]) -> Option<&BrowserTabState> {
tabs.iter().find(|candidate| {
candidate.state.has_video
&& candidate.state.width > 0
&& candidate.state.height > 0
&& !candidate.state.paused
&& (candidate.state.current_time > 0.0 || candidate.state.ready_state >= 2)
})
/// Returns `true` only when the probed state reports a video element with
/// non-zero dimensions that is not paused, has a playback position above
/// zero, and has a ready state of at least 2.
fn nbc_video_state_has_decoded_frame(state: &NbcVideoState) -> bool {
    let has_dimensions = state.width > 0 && state.height > 0;
    let unpaused_past_start = !state.paused && state.current_time > 0.0;
    // NOTE(review): 2 presumably corresponds to the media element's
    // HAVE_CURRENT_DATA ready state — confirm against the probe script.
    let ready_enough = state.ready_state >= 2;
    state.has_video && has_dimensions && unpaused_past_start && ready_enough
}
fn find_provider_linked_tab_state(tabs: &[BrowserTabState]) -> Option<&BrowserTabState> {
tabs.iter().find(|candidate| {
nbc_title_looks_like_provider_linked(&candidate.state.title)
|| nbc_url_is_provider_linked(&candidate.state.page_url)
|| nbc_url_is_mvpd_complete(&candidate.state.page_url)
})
}
@ -1038,25 +1075,40 @@ fn advance_nbc_auth_flow(tab: &Arc<Tab>) -> Result<Option<NbcAuthAdvanceResult>>
}};
const actions = [];
const url = window.location.href || "";
let host = "";
try {{
host = new URL(url).hostname.toLowerCase();
}} catch (_err) {{}}
const title = document.title || "";
const titleText = `${{title}} ${{url}}`.toLowerCase();
const looksLikeOptionalNbcProfile =
(host.endsWith("nbc.com") || host.endsWith(".nbc.com")) &&
(url.includes("/sign-in") ||
url.includes("/login") ||
titleText.includes("nbc account sign in") ||
titleText.includes("nbcuniversal profile") ||
titleText.includes("nbc profile"));
if (looksLikeOptionalNbcProfile) {{
return {{ pageUrl: url, title, actions }};
}}
const candidates = Array.from(
document.querySelectorAll(
"button,a,[role='button'],[role='option'],label,li,[data-provider-name],[data-provider-id],[data-provider]"
)
);
const providerCta = candidates.find((node) => {{
const text = textOf(node);
return visible(node) &&
(
text === "link tv provider" ||
text === "link provider" ||
text.startsWith("link tv provider ") ||
text.startsWith("link provider ")
);
}});
clickNode(providerCta, "click:link-provider");
if (url.includes("mvpd")) {{
const providerCta = candidates.find((node) => {{
const text = textOf(node);
return visible(node) &&
(
text === "link tv provider" ||
text === "link provider" ||
text.startsWith("link tv provider ") ||
text.startsWith("link provider ")
);
}});
clickNode(providerCta, "click:link-provider");
const fullListNode = candidates.find((node) => {{
const text = textOf(node);
return visible(node) && (text === "full list" || text.startsWith("full list "));
@ -1112,7 +1164,7 @@ fn advance_nbc_auth_flow(tab: &Arc<Tab>) -> Result<Option<NbcAuthAdvanceResult>>
return {{
pageUrl: url,
title: document.title || "",
title,
actions,
}};
}})())
@ -1226,16 +1278,30 @@ fn advance_mvpd_login_flow(tab: &Arc<Tab>) -> Result<Option<NbcAuthAdvanceResult
titleText.includes("nbc profile"));
if (looksLikeOptionalNbcProfile) {{
const profileButtons = Array.from(document.querySelectorAll("button,a,[role='button'],input[type='submit'],input[type='button']"));
const providerLink = profileButtons.find((node) => {{
const dismissButton = profileButtons.find((node) => {{
const text = textOf(node);
return visible(node) && (
text === "link tv provider" ||
text === "link provider" ||
text.startsWith("link tv provider ") ||
text.startsWith("link provider ")
text === "skip" ||
text.startsWith("skip ") ||
text === "skip for now" ||
text === "maybe later" ||
text === "not now" ||
text === "no thanks" ||
text === "close" ||
text === "continue watching" ||
text.startsWith("continue watching ") ||
text === "continue without signing in" ||
text === "continue without profile" ||
text === "continue as guest" ||
text === "watch live" ||
text === "watch now" ||
text.startsWith("watch live ") ||
text.startsWith("watch now ")
);
}});
clickNode(providerLink, "click:profile-link-provider");
if (dismissButton) {{
clickNode(dismissButton, `click:profile-dismiss:${{textOf(dismissButton).slice(0, 120)}}`);
}}
return {{ pageUrl: url, title, actions }};
}}
if (!looksLikeProviderLogin) {{
@ -1333,8 +1399,15 @@ fn advance_nbc_post_auth_flow(tab: &Arc<Tab>) -> Result<Option<NbcAuthAdvanceRes
const bodyText = normalize(document.body?.innerText || "");
const looksLinked = title.toLowerCase().includes("tv provider linked")
|| url.includes("provider-linked")
|| url.includes("mvpd-complete")
|| bodyText.includes("tv provider linked");
if (!looksLinked) {
const looksOptionalProfile =
(url.includes("/sign-in") ||
url.includes("/login") ||
normalize(title).includes("nbc account sign in") ||
normalize(title).includes("nbcuniversal profile") ||
normalize(title).includes("nbc profile"));
if (!looksLinked && !looksOptionalProfile) {
return { pageUrl: url, title, actions };
}
@ -1344,8 +1417,22 @@ fn advance_nbc_post_auth_flow(tab: &Arc<Tab>) -> Result<Option<NbcAuthAdvanceRes
return visible(node) && (
text === "skip" ||
text.startsWith("skip ") ||
text === "continue" ||
text.startsWith("continue watching")
text === "skip for now" ||
(looksLinked && text === "continue") ||
(looksLinked && text.startsWith("continue ")) ||
text === "maybe later" ||
text === "not now" ||
text === "no thanks" ||
text === "close" ||
text === "continue watching" ||
text.startsWith("continue watching") ||
text === "continue without signing in" ||
text === "continue without profile" ||
text === "continue as guest" ||
text === "watch live" ||
text === "watch now" ||
text.startsWith("watch live ") ||
text.startsWith("watch now ")
);
});
if (skipButton) {
@ -1533,6 +1620,7 @@ fn wait_for_nbc_playback(
screenshot_out: Option<PathBuf>,
) -> Result<WaitOutcome> {
let deadline = Instant::now() + nbc_capture_timeout();
let auth_forbidden = matches!(&auth_mode, AuthMode::Forbidden);
let mut interactive_deadline = None::<Instant>;
let mut interactive_auth_required = false;
let mut screenshot_path = None::<PathBuf>;
@ -1540,11 +1628,13 @@ fn wait_for_nbc_playback(
let mut last_trace_state = None::<NbcTraceState>;
let mut last_log = Instant::now() - Duration::from_secs(10);
let mut last_clue_log = Instant::now() - Duration::from_secs(30);
let mut playback_samples = HashMap::<String, (f64, Instant)>::new();
let mut resumed_after_background_login = false;
let mut resumed_after_authenticated_surface = false;
let mut optional_profile_signin_recoveries = 0_u8;
let mut last_optional_profile_signin_retry = None::<Instant>;
let mut watch_surface_seen_at = None::<Instant>;
let mut optional_profile_signin_seen_at = None::<Instant>;
let mut tracked_tabs = HashSet::new();
let mut provider_linked_completed = false;
@ -1554,13 +1644,33 @@ fn wait_for_nbc_playback(
let primary_state = find_primary_tab_state(&tab_states, tab)
.map(|value| value.state.clone())
.unwrap_or_else(|| probe_nbc_video(tab).unwrap_or_default());
if let Some(playing_tab) = find_playing_tab_state(&tab_states) {
return Ok(WaitOutcome {
state: playing_tab.state.clone(),
trace: trace.lock().map(|state| state.clone()).unwrap_or_default(),
interactive_auth_required,
screenshot_path,
});
let now = Instant::now();
for playing_tab in tab_states
.iter()
.filter(|candidate| nbc_video_state_has_decoded_frame(&candidate.state))
{
let target_id = playing_tab.tab.get_target_id().to_string();
if let Some((previous_time, first_seen)) = playback_samples.get(&target_id) {
if playing_tab.state.current_time >= *previous_time + 0.25
&& first_seen.elapsed() >= Duration::from_millis(500)
{
return Ok(WaitOutcome {
tab: playing_tab.tab.clone(),
state: playing_tab.state.clone(),
trace: trace.lock().map(|state| state.clone()).unwrap_or_default(),
interactive_auth_required,
screenshot_path,
});
}
}
playback_samples
.entry(target_id)
.and_modify(|(previous_time, _)| {
if playing_tab.state.current_time < *previous_time {
*previous_time = playing_tab.state.current_time;
}
})
.or_insert((playing_tab.state.current_time, now));
}
let interaction_tab = find_interaction_tab_state(&tab_states, tab)
@ -1571,6 +1681,7 @@ fn wait_for_nbc_playback(
let pre_state = probe_nbc_video(&interaction_tab).unwrap_or_default();
if nbc_title_looks_like_provider_linked(&pre_state.title)
|| nbc_url_is_provider_linked(&pre_state.page_url)
|| nbc_url_is_mvpd_complete(&pre_state.page_url)
{
provider_linked_completed = true;
}
@ -1579,6 +1690,7 @@ fn wait_for_nbc_playback(
if let Some(progress) = advance_nbc_post_auth_flow(&interaction_tab).ok().flatten() {
if nbc_title_looks_like_provider_linked(&progress.title)
|| nbc_url_is_provider_linked(&progress.page_url)
|| nbc_url_is_mvpd_complete(&progress.page_url)
|| progress
.actions
.iter()
@ -1614,6 +1726,7 @@ fn wait_for_nbc_playback(
let state = probe_nbc_video(&interaction_tab).unwrap_or_default();
if nbc_title_looks_like_provider_linked(&state.title)
|| nbc_url_is_provider_linked(&state.page_url)
|| nbc_url_is_mvpd_complete(&state.page_url)
{
provider_linked_completed = true;
}
@ -1621,6 +1734,19 @@ fn wait_for_nbc_playback(
let authorized = nbc_trace_is_authorized(&trace_state) || provider_linked_completed;
let recent_media_activity = nbc_trace_has_recent_media_activity(&trace_state);
if !authorized && nbc_state_is_optional_profile_signin(&state) && !state.has_video {
let first_seen = *optional_profile_signin_seen_at.get_or_insert_with(Instant::now);
if auth_forbidden && first_seen.elapsed() >= nbc_profile_signin_gate_timeout() {
return Err(anyhow!(
"NBC account sign-in gate reached before TV-provider auth; refusing non-interactive retry loop without decoded video (title='{}', page_url='{}')",
state.title,
state.page_url,
));
}
} else {
optional_profile_signin_seen_at = None;
}
if last_log.elapsed() >= Duration::from_secs(5) {
last_log = Instant::now();
tracing::info!(
@ -1661,8 +1787,9 @@ fn wait_for_nbc_playback(
}
}
if (trace_state.background_login_complete
|| nbc_url_is_background_login_complete(&state.page_url))
let auth_completion_page = nbc_url_is_background_login_complete(&state.page_url)
|| nbc_url_is_mvpd_complete(&state.page_url);
if (trace_state.background_login_complete || auth_completion_page)
&& !resumed_after_background_login
{
resumed_after_background_login = true;
@ -1673,41 +1800,49 @@ fn wait_for_nbc_playback(
);
close_auxiliary_browser_tabs(browser, tab);
let _ = tab.activate();
let _ = tab.evaluate("window.location.reload()", true);
if nbc_url_is_mvpd_complete(&state.page_url) {
tab.navigate_to(source_url)?;
tab.wait_until_navigated()?;
} else {
let _ = tab.evaluate("window.location.reload()", true);
}
std::thread::sleep(Duration::from_secs(2));
continue;
}
if authorized
&& nbc_state_is_optional_profile_signin(&state)
&& !recent_media_activity
&& optional_profile_signin_recoveries < 3
&& last_optional_profile_signin_retry
.map(|instant| instant.elapsed() >= Duration::from_secs(3))
.unwrap_or(true)
{
optional_profile_signin_recoveries += 1;
last_optional_profile_signin_retry = Some(Instant::now());
tracing::info!(
title = %state.title,
page_url = %state.page_url,
authorized,
source_url,
optional_profile_signin_recoveries,
"NBC profile sign-in surface detected after authorization; returning to the live source URL"
);
close_auxiliary_browser_tabs(browser, tab);
let _ = tab.activate();
tab.navigate_to(source_url)?;
tab.wait_until_navigated()?;
std::thread::sleep(Duration::from_secs(2));
continue;
if authorized && nbc_state_is_optional_profile_signin(&state) && !state.has_video {
if optional_profile_signin_recoveries == 0
&& last_optional_profile_signin_retry
.map(|instant| instant.elapsed() >= Duration::from_secs(3))
.unwrap_or(true)
{
optional_profile_signin_recoveries += 1;
last_optional_profile_signin_retry = Some(Instant::now());
tracing::info!(
title = %state.title,
page_url = %state.page_url,
authorized,
source_url,
"NBC account sign-in gate detected after provider authorization; trying one live-url recovery"
);
close_auxiliary_browser_tabs(browser, tab);
let _ = tab.activate();
tab.navigate_to(source_url)?;
tab.wait_until_navigated()?;
std::thread::sleep(Duration::from_secs(2));
continue;
}
return Err(anyhow!(
"NBC account sign-in gate reached after TV-provider auth; refusing retry loop without decoded video (title='{}', page_url='{}')",
state.title,
state.page_url,
));
}
if authorized && nbc_state_is_optional_profile_signin(&state) && recent_media_activity {
if authorized && nbc_state_is_optional_profile_signin(&state) && state.has_video {
tracing::debug!(
title = %state.title,
page_url = %state.page_url,
"NBC optional profile sign-in is visible but media activity is already in flight; staying on the page"
"NBC optional profile sign-in is visible but a video element is already present; staying on the page"
);
}
@ -1733,6 +1868,13 @@ fn wait_for_nbc_playback(
body_text = %clues.body_text,
"NBC watch surface clues"
);
if nbc_clues_look_geo_blocked(&clues) {
return Err(anyhow!(
"NBC geo-blocked current egress; page says this content is not authorized outside the US/territories (title='{}', page_url='{}')",
primary_state.title,
primary_state.page_url,
));
}
}
}
if fully_loaded_watch_surface && !primary_state.has_video {
@ -1862,6 +2004,9 @@ mod tests {
assert!(nbc_url_is_provider_linked(
"https://www.nbc.com/provider-linked"
));
assert!(nbc_url_is_mvpd_complete(
"https://www.nbc.com/mvpd-complete"
));
assert!(nbc_title_looks_like_provider_linked("TV Provider Linked"));
assert!(!nbc_url_is_provider_linked(
"https://www.nbc.com/live?brand=nbc-sports-philadelphia"
@ -1884,11 +2029,31 @@ mod tests {
#[test]
fn optional_profile_signin_is_not_treated_as_watch_surface() {
    // Neither the sign-in page nor the MVPD completion page may be classified
    // as a watch surface.
    for non_watch_url in [
        "https://www.nbc.com/sign-in",
        "https://www.nbc.com/mvpd-complete",
    ] {
        assert!(!nbc_page_is_watch_surface(non_watch_url));
    }

    // A live brand URL is classified as a watch surface.
    let live_url = "https://www.nbc.com/live?brand=nbc-sports-philadelphia";
    assert!(nbc_page_is_watch_surface(live_url));
}
#[test]
fn geo_block_clues_fail_closed() {
    // Body text carrying the "outside of the US" authorization message must be
    // flagged as geo-blocked.
    let blocked_page = NbcPageClues {
        body_text: String::from(
            "We're sorry. You are not authorized to access this content from outside of the US and its territories.",
        ),
        ..NbcPageClues::default()
    };
    assert!(nbc_clues_look_geo_blocked(&blocked_page));

    // Ordinary schedule text must not be flagged.
    let playable_page = NbcPageClues {
        body_text: String::from("NBC News NOW ON NOW until 7:00 AM"),
        ..NbcPageClues::default()
    };
    assert!(!nbc_clues_look_geo_blocked(&playable_page));
}
#[test]
fn cssott_media_requests_mark_recent_media_activity() {
let mut trace = NbcTraceState::default();
@ -1899,4 +2064,25 @@ mod tests {
assert!(trace.media_activity_seen);
assert!(nbc_trace_has_recent_media_activity(&trace));
}
#[test]
fn decoded_frame_detection_requires_advancing_video_surface() {
// Baseline fixture: a video element with non-zero dimensions, not paused,
// ready_state 2 and a positive current_time. This must be accepted.
let mut state = NbcVideoState {
has_video: true,
width: 1920,
height: 1080,
paused: false,
ready_state: 2,
current_time: 1.0,
..NbcVideoState::default()
};
assert!(nbc_video_state_has_decoded_frame(&state));
// Same fixture with the playhead at 0.0 must be rejected.
state.current_time = 0.0;
assert!(!nbc_video_state_has_decoded_frame(&state));
// Restoring the playhead but zeroing the width must also be rejected.
state.current_time = 1.0;
state.width = 0;
assert!(!nbc_video_state_has_decoded_frame(&state));
}
}

View file

@ -1,3 +1,4 @@
use std::collections::BTreeMap;
use std::io::{BufRead, BufReader};
use std::path::Path;
use std::process::{Command, Stdio};
@ -46,6 +47,15 @@ fn blake3_hex(path: &Path) -> anyhow::Result<String> {
Ok(blake3::hash(&bytes).to_hex().to_string())
}
/// Reports whether the external command `name` is usable.
///
/// Runs `name -version` with all standard streams detached and returns `true`
/// only when the process could be spawned *and* exited successfully. A binary
/// that is missing, not executable, or that exits with a non-zero status is
/// reported as unavailable, so callers can skip work that depends on it.
fn command_available(name: &str) -> bool {
    Command::new(name)
        .arg("-version")
        // Detach stdin as well so the probe can never block on inherited input.
        .stdin(Stdio::null())
        .stdout(Stdio::null())
        .stderr(Stdio::null())
        .status()
        // `status()` being `Ok` only proves the spawn worked; require a
        // successful exit code before treating the tool as available.
        .map(|status| status.success())
        .unwrap_or(false)
}
fn concat_init_and_segment(init: &Path, seg: &Path, out: &Path) -> anyhow::Result<()> {
let init_bytes = std::fs::read(init)?;
let seg_bytes = std::fs::read(seg)?;
@ -157,11 +167,15 @@ fn write_deterministic_ts(out_path: &Path) -> anyhow::Result<()> {
Ok(())
}
fn run_ladder(ec_node: &Path, input_ts: &Path, out_dir: &Path) -> anyhow::Result<()> {
fn run_ladder_with_identity(
ec_node: &Path,
input_ts: &Path,
out_dir: &Path,
stream_id: &str,
broadcast_name: &str,
) -> anyhow::Result<()> {
let signing_key = "11".repeat(32);
let network_secret = "22".repeat(32);
let stream_id = "every.channel/determinism/cmaf-ladder";
let broadcast_name = "every.channel/determinism/cmaf-ladder";
let mut cmd = Command::new(ec_node);
cmd.env("EVERY_CHANNEL_MANIFEST_SIGNING_KEY", &signing_key)
@ -210,6 +224,40 @@ fn run_ladder(ec_node: &Path, input_ts: &Path, out_dir: &Path) -> anyhow::Result
Ok(())
}
/// Runs the ladder via `run_ladder_with_identity` using the fixed determinism
/// identity for both the stream id and the broadcast name.
fn run_ladder(ec_node: &Path, input_ts: &Path, out_dir: &Path) -> anyhow::Result<()> {
    // The default path uses one identity string for both arguments.
    let identity = "every.channel/determinism/cmaf-ladder";
    run_ladder_with_identity(ec_node, input_ts, out_dir, identity, identity)
}
/// Collects `blake3_hex` digests for the init file and the first three media
/// segments of each ladder variant under `root/cmaf-ladder`.
///
/// Keys have the form `"<variant>/<file name>"`. Panics if any expected file
/// is missing or cannot be hashed.
fn ladder_artifact_hashes(root: &Path) -> BTreeMap<String, String> {
    let ladder_dir = root.join("cmaf-ladder");
    let mut digests = BTreeMap::new();
    for variant in ["1080p", "720p", "480p"] {
        let rung_dir = ladder_dir.join(variant);
        // `moq-publish --max-chunks 3` publishes init plus segments 0..=2.
        // ffmpeg can race ahead and leave an unpublished tail segment before it is killed.
        let init_path = rung_dir.join("init.mp4");
        assert!(init_path.exists(), "missing init for {variant}");
        digests.insert(
            format!("{variant}/init.mp4"),
            blake3_hex(&init_path).unwrap(),
        );

        let segment_names = (0..3).map(|idx| format!("segment_{idx:06}.m4s"));
        for name in segment_names {
            let segment_path = rung_dir.join(&name);
            assert!(segment_path.exists(), "missing {name} for {variant}");
            digests.insert(
                format!("{variant}/{name}"),
                blake3_hex(&segment_path).unwrap(),
            );
        }
    }
    digests
}
/// Asserts that the two ladder output roots yield identical artifact hash maps.
fn assert_ladder_bytes_match(left: &Path, right: &Path) {
    let left_hashes = ladder_artifact_hashes(left);
    let right_hashes = ladder_artifact_hashes(right);
    assert_eq!(left_hashes, right_hashes);
}
#[test]
#[ignore]
fn deterministic_cmaf_ladder_outputs_match_across_runs() {
@ -235,36 +283,53 @@ fn deterministic_cmaf_ladder_outputs_match_across_runs() {
run_ladder(&ec_node, &input_ts, &run1).expect("run ladder 1");
run_ladder(&ec_node, &input_ts, &run2).expect("run ladder 2");
for variant in ["1080p", "720p", "480p"] {
let v1 = run1.join("cmaf-ladder").join(variant);
let v2 = run2.join("cmaf-ladder").join(variant);
assert_ladder_bytes_match(&run1, &run2);
}
let init1 = v1.join("init.mp4");
let init2 = v2.join("init.mp4");
assert!(
init1.exists() && init2.exists(),
"missing init for {variant}"
);
assert_eq!(
blake3_hex(&init1).unwrap(),
blake3_hex(&init2).unwrap(),
"init differs for {variant}"
);
for idx in 0..3 {
let s1 = v1.join(format!("segment_{idx:06}.m4s"));
let s2 = v2.join(format!("segment_{idx:06}.m4s"));
assert!(
s1.exists() && s2.exists(),
"missing segment {idx} for {variant}"
);
assert_eq!(
blake3_hex(&s1).unwrap(),
blake3_hex(&s2).unwrap(),
"segment {idx} differs for {variant}"
);
}
// Runs the ladder twice over the same deterministic input, once per publisher
// identity (different stream id and broadcast name), and asserts that the
// resulting ladder artifact hashes are identical.
// Not marked `#[ignore]`; it returns early when `command_available("ffmpeg")`
// reports false.
#[test]
fn duplicate_publishers_same_input_produce_identical_cmaf_ladder_bytes() {
if !command_available("ffmpeg") {
eprintln!("skipping duplicate publisher CMAF ladder determinism test: ffmpeg unavailable");
return;
}
let ec_node = ec_node_path();
// Millisecond timestamp keeps the scratch directory name distinct between runs.
let ts = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_millis();
let tmp = std::env::temp_dir().join(format!("ec-duplicate-publisher-cmaf-ladder-{ts}"));
// Best effort: a failure here surfaces when the input file is written below.
let _ = std::fs::create_dir_all(&tmp);
// Both publishers read the same generated input file.
let input_ts = tmp.join("input.ts");
write_deterministic_ts(&input_ts).expect("write deterministic TS");
let publisher_a = tmp.join("publisher-a");
let publisher_b = tmp.join("publisher-b");
// Start from empty output directories; removal errors (e.g. not found) are ignored.
let _ = std::fs::remove_dir_all(&publisher_a);
let _ = std::fs::remove_dir_all(&publisher_b);
std::fs::create_dir_all(&publisher_a).unwrap();
std::fs::create_dir_all(&publisher_b).unwrap();
// Publisher A: its own stream id and broadcast name.
run_ladder_with_identity(
&ec_node,
&input_ts,
&publisher_a,
"every.channel/determinism/duplicate/publisher-a/la-kcop",
"publisher-a-la-kcop",
)
.expect("run duplicate publisher a");
// Publisher B: same input, different stream id and broadcast name.
run_ladder_with_identity(
&ec_node,
&input_ts,
&publisher_b,
"every.channel/determinism/duplicate/publisher-b/la-kcop",
"publisher-b-la-kcop",
)
.expect("run duplicate publisher b");
// Differing identities must not change the hashed media artifacts.
assert_ladder_bytes_match(&publisher_a, &publisher_b);
}
#[test]

View file

@ -1,4 +1,5 @@
use std::ffi::OsStr;
use std::process::{Command, Stdio};
use std::time::{Duration, Instant};
fn which(cmd: &str) -> Option<std::path::PathBuf> {
@ -16,6 +17,24 @@ fn chrome_path() -> Option<std::path::PathBuf> {
.or_else(|| which("chromium"))
}
/// Resolves the path of the `ec-node` binary used by these tests.
///
/// The first environment variable that is set wins, in this order:
/// `EC_NODE_BIN`, `CARGO_BIN_EXE_ec_node`, `CARGO_BIN_EXE_ec-node`. Otherwise
/// the path is `ec-node` inside the grandparent directory of the current
/// executable. Panics if the current executable or that directory cannot be
/// determined.
fn ec_node_path() -> std::path::PathBuf {
    for var_name in [
        "EC_NODE_BIN",
        "CARGO_BIN_EXE_ec_node",
        "CARGO_BIN_EXE_ec-node",
    ] {
        if let Ok(configured) = std::env::var(var_name) {
            return std::path::PathBuf::from(configured);
        }
    }

    // Fallback: step up two directories from the running test binary.
    let test_binary = std::env::current_exe().expect("current_exe");
    let deps_dir = test_binary.parent();
    let profile_dir = deps_dir
        .and_then(|dir| dir.parent())
        .expect("expected target/debug/deps");
    profile_dir.join("ec-node")
}
fn wait_for_canvas_element(tab: &headless_chrome::Tab, timeout: Duration) -> anyhow::Result<()> {
let deadline = Instant::now() + timeout;
while Instant::now() < deadline {
@ -46,14 +65,41 @@ fn wait_for_moq_watch_element(tab: &headless_chrome::Tab, timeout: Duration) ->
anyhow::bail!("timed out waiting for <moq-watch> element");
}
/// Polls the page until either a `moq-watch` element or a `video.archiveVideo`
/// element exists, checking every 200 ms.
///
/// Returns an error if evaluating the probe script fails or if `timeout`
/// elapses before either element appears.
fn wait_for_live_or_archive_player(
    tab: &headless_chrome::Tab,
    timeout: Duration,
) -> anyhow::Result<()> {
    const PLAYER_PRESENT_JS: &str = r#"(function() {
return !!document.querySelector('moq-watch, video.archiveVideo');
})();"#;

    let deadline = Instant::now() + timeout;
    loop {
        if Instant::now() >= deadline {
            anyhow::bail!("timed out waiting for live or archive player");
        }
        let probe = tab.evaluate(PLAYER_PRESENT_JS, false)?;
        // A missing or non-boolean result counts as "not present yet".
        if let Some(true) = probe.value.and_then(|found| found.as_bool()) {
            return Ok(());
        }
        std::thread::sleep(Duration::from_millis(200));
    }
}
fn debug_player_state(tab: &headless_chrome::Tab) -> anyhow::Result<String> {
let js = r#"(function() {
let watch = document.querySelector('moq-watch');
let canvas = document.querySelector('moq-watch canvas');
let video = document.querySelector('video.archiveVideo');
let placeholder = document.querySelector('.placeholder');
let placeholderText = placeholder ? (placeholder.innerText || '') : null;
let status = document.querySelector('.source-status');
let statusText = status ? (status.innerText || '') : null;
let statusLine = document.querySelector('#statusLine');
let statusLineText = statusLine ? (statusLine.innerText || '') : null;
let catalog = watch && watch.broadcast && watch.broadcast.catalog && watch.broadcast.catalog.peek
? watch.broadcast.catalog.peek()
: null;
let established = watch && watch.connection && watch.connection.established && watch.connection.established.peek
? watch.connection.established.peek()
: null;
let sources = Array.from(document.querySelectorAll('button[data-testid="global-watch"]')).length;
let hint = document.querySelector('#hint');
let hintText = hint ? (hint.innerText || '') : null;
@ -62,8 +108,27 @@ fn debug_player_state(tab: &headless_chrome::Tab) -> anyhow::Result<String> {
hasCanvas: !!canvas,
canvasWidth: canvas ? canvas.width : null,
canvasHeight: canvas ? canvas.height : null,
hasArchiveVideo: !!video,
videoCurrentTime: video ? video.currentTime : null,
videoDuration: video ? video.duration : null,
videoPaused: video ? video.paused : null,
videoReadyState: video ? video.readyState : null,
videoMuted: video ? video.muted : null,
videoVolume: video ? video.volume : null,
videoSrc: video ? (video.currentSrc || video.src || '') : null,
muted: watch ? watch.muted : null,
volume: watch ? watch.volume : null,
connectionStatus: watch?.connection?.status?.peek ? watch.connection.status.peek() : null,
connectionKind: established ? established.constructor?.name || null : null,
broadcastStatus: watch?.broadcast?.status?.peek ? watch.broadcast.status.peek() : null,
paused: watch?.backend?.paused?.peek ? watch.backend.paused.peek() : null,
audioMuted: watch?.backend?.audio?.muted?.peek ? watch.backend.audio.muted.peek() : null,
audioVolume: watch?.backend?.audio?.volume?.peek ? watch.backend.audio.volume.peek() : null,
catalogSeen: !!catalog,
catalogHasVideo: !!(catalog?.video?.renditions),
catalogHasAudio: !!(catalog?.audio?.renditions),
metrics: window.__ecPlaybackMetrics || null,
statusLineText,
hintText,
placeholderText,
statusText,
@ -110,23 +175,120 @@ fn canvas_motion_sample(tab: &headless_chrome::Tab) -> anyhow::Result<Option<(f6
Ok(Some((current_time, hash)))
}
fn wait_for_canvas_motion(tab: &headless_chrome::Tab, timeout: Duration) -> anyhow::Result<()> {
/// Samples the `video.archiveVideo` element, if present.
///
/// The page script calls `play()` on a paused video and returns a JSON string
/// describing the element (wall time, current time, ready state, paused,
/// ended, muted, volume, source). Returns `Ok(None)` when the script yields no
/// string (the script returns `null` when the element is absent), and an error
/// if evaluation or JSON parsing fails.
fn archive_video_motion_sample(
    tab: &headless_chrome::Tab,
) -> anyhow::Result<Option<serde_json::Value>> {
    const SAMPLE_JS: &str = r#"(function() {
let video = document.querySelector('video.archiveVideo');
if (!video) return null;
if (video.paused) video.play().catch(() => {});
return JSON.stringify({
wallTime: performance.now() / 1000,
currentTime: video.currentTime || 0,
readyState: video.readyState || 0,
paused: !!video.paused,
ended: !!video.ended,
muted: !!video.muted,
volume: video.volume || 0,
src: video.currentSrc || video.src || ''
});
})();"#;

    let evaluated = tab.evaluate(SAMPLE_JS, false)?;
    match evaluated.value.as_ref().and_then(|raw| raw.as_str()) {
        None => Ok(None),
        Some(encoded) => {
            let sample = serde_json::from_str(encoded)?;
            Ok(Some(sample))
        }
    }
}
fn wait_for_canvas_or_archive_motion(
tab: &headless_chrome::Tab,
timeout: Duration,
) -> anyhow::Result<String> {
let deadline = Instant::now() + timeout;
let mut first: Option<(f64, u32)> = None;
let mut first_canvas: Option<(f64, u32)> = None;
let mut first_video_time: Option<f64> = None;
while Instant::now() < deadline {
if let Some(sample) = canvas_motion_sample(tab)? {
if let Some((first_time, first_hash)) = first {
if let Some((first_time, first_hash)) = first_canvas {
if sample.0 > first_time + 0.5 && sample.1 != first_hash {
return Ok(());
return Ok("moq-canvas".to_string());
}
} else {
first = Some(sample);
first_canvas = Some(sample);
}
}
if let Some(sample) = archive_video_motion_sample(tab)? {
let current_time = sample
.get("currentTime")
.and_then(|v| v.as_f64())
.unwrap_or_default();
let ready_state = sample
.get("readyState")
.and_then(|v| v.as_u64())
.unwrap_or_default();
let ended = sample
.get("ended")
.and_then(|v| v.as_bool())
.unwrap_or(false);
if ready_state >= 2 && !ended {
if let Some(first) = first_video_time {
if current_time > first + 0.5 {
return Ok("archive-video".to_string());
}
} else {
first_video_time = Some(current_time);
}
}
}
std::thread::sleep(Duration::from_millis(500));
}
let st = debug_player_state(tab).unwrap_or_default();
anyhow::bail!("timed out waiting for changing canvas frames\nplayer_state={st}");
anyhow::bail!("timed out waiting for live or archive motion\nplayer_state={st}");
}
/// Polls `window.__ecPlaybackMetrics` every 250 ms until it reports healthy
/// playback, and returns the metrics JSON string that satisfied the check.
///
/// Healthy means: `ok` is true, `samples >= 8`, `changed_samples >= 2` and
/// `longest_same_hash_ms < 5000`; missing fields count as `false` / `0`.
/// Returns an error if script evaluation or JSON parsing fails, or if
/// `timeout` elapses first (the error includes the player state and the last
/// metrics seen).
fn wait_for_playback_probe_ok(
    tab: &headless_chrome::Tab,
    timeout: Duration,
) -> anyhow::Result<String> {
    const METRICS_JS: &str = r#"(function() {
const metrics = window.__ecPlaybackMetrics || null;
return metrics ? JSON.stringify(metrics) : "";
})();"#;

    let deadline = Instant::now() + timeout;
    let mut last_metrics = String::new();
    while Instant::now() < deadline {
        let evaluated = tab.evaluate(METRICS_JS, false)?;
        last_metrics = match evaluated.value.as_ref().and_then(|raw| raw.as_str()) {
            Some(text) => text.to_owned(),
            None => String::new(),
        };

        if !last_metrics.is_empty() {
            let metrics: serde_json::Value = serde_json::from_str(&last_metrics)?;
            // Reads an unsigned counter from the metrics object, defaulting to 0.
            let counter = |key: &str| {
                metrics
                    .get(key)
                    .and_then(|field| field.as_u64())
                    .unwrap_or_default()
            };
            let probe_ok = metrics
                .get("ok")
                .and_then(|field| field.as_bool())
                .unwrap_or(false);
            let healthy = probe_ok
                && counter("samples") >= 8
                && counter("changed_samples") >= 2
                && counter("longest_same_hash_ms") < 5_000;
            if healthy {
                return Ok(last_metrics);
            }
        }
        std::thread::sleep(Duration::from_millis(250));
    }

    let st = debug_player_state(tab).unwrap_or_default();
    anyhow::bail!(
        "timed out waiting for playback probe ok\nplayer_state={st}\nmetrics={last_metrics}"
    );
}
fn wait_for_unmuted_player(tab: &headless_chrome::Tab, timeout: Duration) -> anyhow::Result<()> {
@ -134,7 +296,9 @@ fn wait_for_unmuted_player(tab: &headless_chrome::Tab, timeout: Duration) -> any
while Instant::now() < deadline {
let js = r#"(function() {
let watch = document.querySelector('moq-watch');
return !!watch && watch.muted === false && watch.volume > 0 && !watch.hasAttribute('muted');
let video = document.querySelector('video.archiveVideo');
return (!!watch && watch.muted === false && watch.volume > 0 && !watch.hasAttribute('muted')) ||
(!!video && video.muted === false && video.volume > 0);
})();"#;
let v = tab.evaluate(js, false)?;
if v.value.and_then(|v| v.as_bool()).unwrap_or(false) {
@ -146,13 +310,21 @@ fn wait_for_unmuted_player(tab: &headless_chrome::Tab, timeout: Duration) -> any
anyhow::bail!("timed out waiting for unmuted player\nplayer_state={st}");
}
fn watch_url(site_url: &str, relay_url: &str, stream_id: &str) -> anyhow::Result<String> {
/// Builds the `/watch` page URL on `site_url` for the given relay and stream.
///
/// Any existing query on `site_url` is replaced by `url=<relay_url>` and
/// `name=<stream_id>`; `verify=1` is appended when `verify` is true.
/// Returns an error if `site_url` cannot be parsed.
fn watch_url(
    site_url: &str,
    relay_url: &str,
    stream_id: &str,
    verify: bool,
) -> anyhow::Result<String> {
    let mut target = url::Url::parse(site_url)?;
    target.set_path("/watch");
    {
        // Scope the serializer so it is finished before the URL is rendered.
        let mut query = target.query_pairs_mut();
        query.clear();
        query.append_pair("url", relay_url);
        query.append_pair("name", stream_id);
        if verify {
            query.append_pair("verify", "1");
        }
    }
    Ok(target.to_string())
}
@ -190,23 +362,104 @@ fn e2e_remote_website_watch_existing_stream_id() -> anyhow::Result<()> {
.unwrap();
let browser = headless_chrome::Browser::new(launch_options)?;
let tab = browser.new_tab()?;
tab.navigate_to(&watch_url(&site_url, &relay_url, &stream_id)?)?;
tab.navigate_to(&watch_url(&site_url, &relay_url, &stream_id, false)?)?;
tab.wait_until_navigated()?;
// Ensure the player is instantiated.
if let Err(err) = wait_for_moq_watch_element(&tab, Duration::from_secs(90)) {
// Ensure either the native MoQ player or the archive live-edge fallback is instantiated.
if let Err(err) = wait_for_live_or_archive_player(&tab, Duration::from_secs(90)) {
let st = debug_player_state(&tab).unwrap_or_default();
anyhow::bail!("{err}\nplayer_state={st}");
}
if let Err(err) = wait_for_canvas_element(&tab, Duration::from_secs(90)) {
let st = debug_player_state(&tab).unwrap_or_default();
anyhow::bail!("{err}\nplayer_state={st}");
}
tab.wait_for_element("moq-watch canvas")?.click()?;
tab.evaluate(
r#"(function() {
const canvas = document.querySelector('moq-watch canvas');
if (canvas) canvas.click();
const audioButton = document.querySelector('#audioBtn');
if (audioButton && audioButton.getAttribute('aria-pressed') !== 'true') {
audioButton.click();
}
})();"#,
false,
)?;
wait_for_unmuted_player(&tab, Duration::from_secs(10))?;
wait_for_canvas_motion(&tab, Duration::from_secs(30))?;
let playback_path = wait_for_canvas_or_archive_motion(&tab, Duration::from_secs(60))?;
eprintln!("playback path: {playback_path}");
Ok(())
}
// Ignored-by-default end-to-end test: spawns `ec-node wt-publish` with a
// synthetic lavfi `testsrc2` input under a fresh stream name, opens the watch
// page for that stream (with `verify=1`) in headless Chrome, and waits for the
// page's playback probe to report healthy metrics.
// Returns Ok(()) without testing anything when ffmpeg or Chrome is not found.
#[test]
#[ignore]
fn e2e_remote_website_watch_synthetic_relay_stream() -> anyhow::Result<()> {
if which("ffmpeg").is_none() {
return Ok(()); // skip
}
let chrome = match chrome_path() {
Some(p) => p,
None => return Ok(()), // skip
};
// Site and relay endpoints can be overridden through the environment.
let site_url = std::env::var("EVERY_CHANNEL_SITE_URL")
.unwrap_or_else(|_| "https://every.channel/".to_string());
let relay_url = std::env::var("EVERY_CHANNEL_RELAY_URL")
.unwrap_or_else(|_| "https://relay.every.channel/anon".to_string());
// TLS verification is disabled unless the variable is "0" or "false"
// (case-insensitive); an unset variable also disables verification.
let tls_disable_verify = std::env::var("EVERY_CHANNEL_RELAY_TLS_DISABLE_VERIFY")
.map(|v| v != "0" && v.to_lowercase() != "false")
.unwrap_or(true);
// Millisecond timestamp gives each run its own stream name.
let ts = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_millis();
let stream_id = format!("e2e-synthetic-{ts}");
let ec_node = ec_node_path();
let mut publisher = Command::new(&ec_node);
publisher
.arg("wt-publish")
.arg("--url")
.arg(&relay_url)
.arg("--name")
.arg(&stream_id)
.arg("--realtime-input")
.arg("--input-format")
.arg("lavfi")
.arg("--input")
.arg("testsrc2=size=1280x720:rate=30")
.stdout(Stdio::null())
.stderr(Stdio::inherit());
if tls_disable_verify {
publisher.arg("--tls-disable-verify");
}
let mut publisher = publisher.spawn()?;
// Run the browser half inside a closure so that an early `?` return cannot
// skip the publisher cleanup below.
let test_result = (|| -> anyhow::Result<()> {
let launch_options = headless_chrome::LaunchOptionsBuilder::default()
.path(Some(chrome))
.headless(true)
.args(vec![
OsStr::new("--autoplay-policy=no-user-gesture-required"),
OsStr::new("--disable-application-cache"),
OsStr::new("--disable-service-worker"),
OsStr::new("--disk-cache-size=0"),
OsStr::new("--mute-audio"),
])
.build()
.unwrap();
let browser = headless_chrome::Browser::new(launch_options)?;
let tab = browser.new_tab()?;
tab.navigate_to(&watch_url(&site_url, &relay_url, &stream_id, true)?)?;
tab.wait_until_navigated()?;
wait_for_moq_watch_element(&tab, Duration::from_secs(90))?;
wait_for_canvas_element(&tab, Duration::from_secs(90))?;
let metrics = wait_for_playback_probe_ok(&tab, Duration::from_secs(60))?;
eprintln!("playback metrics: {metrics}");
Ok(())
})();
// Always stop and reap the publisher, whatever the browser half returned.
let _ = publisher.kill();
let _ = publisher.wait();
test_result
}

View file

@ -0,0 +1,334 @@
# ECP-0156: Duplicate Publisher Deterministic Data Layer
Status: Draft
## Context
Two publisher nodes may broadcast the same logical channel at the same time. The archive and relay
layers need this for resilience, but duplicate publishers currently risk looking like conflicting
streams instead of convergent copies of the same media.
## Decision
Duplicate publishers are valid for a published channel. The data layer dedupes and verifies media by
content identity, not by publisher envelope identity:
- CMAF init and media segment bytes for the same input, ladder profile, and chunk cadence must be
byte-for-byte identical.
- BLAKE3 media hashes and per-rung Merkle roots are the shared data identity.
- Publisher manifests may carry different `stream_id`, `epoch_id`, `created_unix_ms`, signatures,
locators, and manifest ids.
- The archive must treat matching media hashes from different publishers as corroborating sources.
- Archive records must carry source identity. Two copied buffers with the same `source_node` are not
duplicate-publisher proof, even when their BLAKE3 hashes match.
- Divergent hashes for the same logical channel, rendition, and media time are misses that must be
measured before the data is promoted as redundant.
## Verification
The proof path has two stages:
1. Single-node duplicate-publisher tests produce the same ladder twice with different publisher
identities and assert byte-for-byte BLAKE3 equality for every generated init and media segment.
The `duplicate_publishers_same_input_produce_identical_cmaf_ladder_bytes` test is part of the
default Rust test path when ffmpeg is present; it is not an ignored E2E.
2. Production verification runs the same channel on two real publishers long enough to measure
duplicate media convergence, hash divergence, missing objects, and backfill behavior in Grafana.
The goal is not just "two publishers are online." Success requires elapsed production time behind
the numbers, plus dashboards that show duplicate hits, misses, and archive repair.
## Consequences
Manifest ids cannot be used as the archive dedupe key for duplicate publishers. Operators get a
clear signal when two publishers produce identical bytes versus merely announcing the same channel.
If encoder determinism changes, the single-node test fails before production redundancy silently
degrades.
## Alternatives considered
- Dedupe by manifest id. This preserves envelope identity but misses the resilience property because
duplicate publishers necessarily produce different envelopes.
- Dedupe by logical channel and time only. This can hide encoder divergence and promote bad
redundancy before byte-level media equality is proven.
- Disable duplicate publishers until the scheduler is perfect. This avoids conflict handling but
weakens live resilience and leaves the archive data layer untested.
## Rollout/teardown
Roll forward by landing the local deterministic test, adding miss/duplicate metrics to the archive
scrape surface, then running two publishers for one logical channel in production. Roll back by
disabling duplicate scheduling for that channel; existing content-addressed archive objects remain
valid.
## Implementation notes
The node-agent archive scrape now exposes duplicate-source and miss gauges without placing hashes in
labels. Per node, role, broadcast, rendition, and track it reports duplicate matching hash sources,
duplicate hash sequences, divergent hash sequences, and missing hash records. Grafana shows those
next to archive ladder coverage so the production duplicate-publisher run has an operator-visible
convergence and miss signal.
`ec-node archive-convergence` is the primary proof surface for duplicate media identity. It compares
named archive manifest roots directly inside the Rust node binary, groups records by logical stream,
rendition, track, and sequence, and only returns `ok` when every expected sequence has matching
duplicate source hashes with no missing or divergent sequence. It also requires archive records to
carry at least two distinct `source_node` values, so mirrored global-origin manifests cannot pass as
independent publishers. This keeps the media-data invariant in the already-shipped Rust artifact
instead of extending the Python node-agent. Rollout gates should use
`ec-node archive-convergence --require-ok`; the command emits the JSON report either way, but
`--require-ok` exits non-zero unless duplicate convergence is actually proven.
`ec-node archive-convergence --prometheus` renders the same Rust convergence report as scrapeable
`every_channel_archive_*` gauges for duplicate source records, duplicate sequences, divergent
sequences, source-local divergence, missing hashes, missing source identity, media timing conflicts,
record source count, and pass/fail state. This gives Grafana a Rust-owned proof metric path while
the older node-agent ladder metrics remain available during migration.
`ec-node archive-convergence-serve` keeps that proof path live for Prometheus: it serves `/health`
and `/metrics`, recomputes convergence on each scrape, and emits `scrape_ok=0` metrics instead of
disappearing when manifests are missing or not ready. Production Grafana can therefore distinguish a
healthy metrics target from an unproven duplicate-publisher run.
The Nix `services.every-channel.ec-node.archive.convergence.proofs` option turns those Rust proof
servers into named systemd units. Each proof must name at least two `NAME=PATH` sources and gets a
dedicated listen address, so operators can add one Prometheus scrape target per duplicate channel
without resurrecting the Python node-agent as the proof oracle.
Forge enables an initial `la-kcop-publisher-origin` proof target on `127.0.0.1:7812` and Prometheus
scrapes it alongside the other local every.channel targets. Until two real publisher manifest roots
are mounted or fetched into Forge, the target intentionally uses the Forge manifest root as a
placeholder peer and must report unproven convergence rather than green duplicate-publisher proof.
Forge also exposes a static two-NUC `la-kcet-remote-publisher-origin` proof target once that channel
is the live converged duplicate sample. Dynamic Headscale file-SD remains useful for discovery, but
it can include relays and stale nodes; duplicate-publisher proof should use an explicit publisher
pair or future scheduler group labels so unrelated agents do not turn a passing channel red.
This static proof exports its own Rust convergence gauges rather than gating on broad legacy
Prometheus aggregates, because older node-agent archive metrics do not yet carry enough proof-role
labels to avoid summing stale divergence from unrelated scrape targets.
`ec-node archive-convergence-measure` is the primary production proof harness. It fetches named
node-agent `/v1/archive-manifest` samples or direct manifest JSONL URLs, writes bounded temporary
manifest roots, reuses the Rust `archive-convergence` report, and optionally queries Prometheus for
the Grafana-facing duplicate/miss series. A production run only counts as complete when the report
has elapsed samples, matching duplicate media hashes, zero divergent hash sequences, and live
Prometheus series for the duplicate/miss gauges. The measurement groups records by archive record
source identity, not by the URL used to fetch a manifest, and reports source identity failures when
the sample is too weak to prove independent publisher data. The older
`scripts/measure-duplicate-publishers.py` stays compatibility-only until live operators and Forge
jobs are switched to the Rust command.
The convergence report carries bounded divergent-sequence samples with per-source hash, byte size,
receive time, source node/session, CAS path, and media timing when present, so a red proof is
immediately actionable without fetching full manifests by hand.
It also reports a non-blocking media-timing-missing count and Prometheus gauge; hash equality can
still prove duplicate bytes, but missing timing means a divergent proof cannot yet classify whether
the mismatch is a phase/windowing problem or an encoder byte problem.
Publisher service builders must pass proof cadence explicitly. Both the node-agent publisher
supervisor and the Nix systemd publisher module set `--publisher-archive-segment-duration-ms` and
`--publisher-start-boundary-ms` by default, so netbooted NUCs do not depend on stale hotpatch CLI
defaults when aligning duplicate publisher proof windows.
`ec-node archive-convergence-measure-serve` turns that production proof harness into a live
Prometheus target. Each `/metrics` scrape fetches one fresh sample from node-agent or direct JSONL
manifest URLs, keeps a bounded in-memory sample window, and only reports measurement `ok` after the
configured elapsed window has passed. This avoids blocking Prometheus scrapes for the measurement
duration while still preventing two immediate samples from looking like a real production run.
The service emits measurement-level gauges for fetch success, source record counts, invalid records,
elapsed seconds, Prometheus series presence, and reasons, then appends the same
`every_channel_archive_*` convergence gauges from the latest sample. The service can also read
Prometheus file-SD JSON from Forge's Headscale node-agent discovery and turn each discovered target
into a sampled node-agent manifest source. The Nix
`services.every-channel.ec-node.archive.convergence.remoteProofs` option creates these remote proof
services as systemd units from either static `NAME=URL` endpoints or dynamic file-SD inputs. Forge
now exposes `la-kcop-remote-publisher-origin` on `127.0.0.1:7813` using the live
`/var/lib/prometheus/every-channel-node-agents.json` inventory. It must stay red until that
inventory contains at least two independent publisher node-agents whose `publisher.m4s` records
converge.
When archive-serve ports are not reachable from the proof runner, the node-agent exposes a bounded,
tailnet-authenticated `/v1/archive-manifest` sample endpoint. The harness can use that endpoint for
each named publisher, compare local manifest records directly, and still require at least two elapsed
samples before declaring success.
Production duplicate proof also requires archive-buffer freshness on each participating publisher.
During mixed-generation rollouts, the current node-agent may supervise an older installed
`archive-hot-sync` helper. The agent must probe helper flag support and omit optional arguments such
as `--link-mode` when an older helper lacks them, because a silently failing archive-buffer sync can
leave one publisher with healthy live streams but stale manifests.
The publisher buffer refresh is freshness-first: the node-managed sync must mirror full manifests
without origin object fetch before running the slower cache fill/prune pass. This lets convergence
checks, Grafana scrape surfaces, and demand fetch see current BLAKE3 indexes even when proactive CAS
object backfill is still catching up.
`wt-archive` stamps each archive index record with `source_node` and `source_session`. The Nix
archive launcher passes the runtime hostname as `--source-node`; explicit CLI users can override it.
Older records without this identity continue to parse, but proof commands and production measurement
mark them incomplete instead of accepting them as independent publisher evidence.
Publisher-origin proof must be captured before relay/archive mirroring can collapse source identity.
When node-agent archive buffering is enabled, supervised `wt-publish` processes pass
`--publisher-archive-output-dir`, `--publisher-archive-manifest-dir`, and
`--publisher-archive-source-node`. `wt-publish` now supervises the Rust
`publisher-proof-archive-source` worker for that archive track. The worker splits the MPEG-TS source
by source-clock windows, fresh-encodes each bounded window with the deterministic proof profile,
stores the resulting media fragments under `publisher.m4s` in the same CAS/index format, and stamps
them with node-agent source identity. The relay playback encoder remains continuous for watchability,
but it is no longer the BLAKE3 data identity for duplicate-publisher proof. The source identity is
explicit override first, then hostname plus a short hash of machine-id, with boot-id only as a
fallback; hostname alone is not enough because publisher images can share names like `ec-node`.
Production duplicate verification can therefore compare `publisher.m4s` from two publisher buffers
without treating copied relay-origin manifests as independent sources.
Proof tooling defaults to `publisher.m4s`. The relay video track `0.m4s` is useful playback data,
but it is not duplicate-publisher proof: a publisher buffer may hold relay/cache records on `0.m4s`
that have no publisher source identity. Production convergence checks that sample `0.m4s` should be
treated as playback/archive-cache diagnostics, not byte-for-byte duplicate publisher evidence.
The first live publisher-origin measurements on 2026-06-08 showed correct distinct source labels but
zero matching duplicate sequences for `la-nbc4`, `la-pbs-socal`, and `la-kcet`. The failure is
useful: independent `wt-publish` processes currently start their fragment sequence and encoder chunk
phase at local process start, so sequence `0` from two publishers is not necessarily the same
broadcast moment. Duplicate-publisher proof therefore requires a shared chunk clock or
scheduler-controlled aligned encoder phase before byte-for-byte archive convergence can pass in
production.
Publisher-origin `publisher.m4s` records now require timed fMP4 fragments for global proof and map
those fragments onto observed wall-clock epoch buckets instead of local process counters. The Rust
writer learns track timescales from the init `moov` box, reads fragment
`moof/traf/tfhd+tfdt` decode timestamps to reject untimed proof when possible, then assigns
`group_sequence = observed_epoch_bucket * bucket_stride + fragment_slot`. Fragments that lack usable
timing still fall back to the previous local counter so publishing does not fail hard on malformed
metadata, but duplicate-publisher proof should use timed fragments. The `wt-publish` ffmpeg path
also preserves source timestamps and uses closed-GOP, single-threaded x264 settings with forced
keyframe cadence so independent publishers have a real chance of producing identical bytes for the
same media time window.
A later live run on 2026-06-08 found a stricter local invariant before cross-publisher byte equality:
each publisher must produce at most one hash for a given `source_node` and `group_sequence`.
Production `publisher.m4s` samples for `la-kcop` and `la-ktla` showed multiple hashes from the same
source in the same sequence bucket because real fMP4 fragments can arrive faster than the configured
proof segment duration, and the writer rounded decode time into repeated buckets. The writer now
uses a fixed per-epoch bucket stride and increments an in-bucket fragment slot when multiple timed
fragments arrive inside the same proof duration. This keeps source-local manifests unique while
allowing independently restarted publishers to align on the same observed wall-clock bucket.
`ec-node archive-convergence` reports this separately as `source_local_divergent_sequences` so
operator tooling can distinguish a self-contradicting publisher from two publishers that simply
disagree about the same sequence.
Because bucket-strided proof sequences intentionally leave numeric gaps, archive convergence uses
the observed sparse sequence union for publisher-origin manifests. Dense contiguous sequence ranges
remain available in the simulation layer when a model explicitly expects every integer sequence.
The 2026-06-08 live `la-kcet/publisher.m4s` sample from Forge confirmed that both publishers now
emit distinct source identities (`ec-node-c3546fa5abc3` and `ec-node-72cf1c3aa196`) with no missing
source identity records on the sampled publisher-origin manifests. It also confirmed the remaining
bug: 156 shared publisher-origin sequences had zero byte-for-byte BLAKE3 matches and 156 divergent
hashes. The next production fix must align the publisher chunk clock and encoded fMP4 byte stream,
not merely improve scrape or Grafana plumbing.
After the wall-clock bucket hotpatch, the same live proof no longer has fake sparse-range missing
IDs: `la-kcet/publisher.m4s` reported 376 observed proof sequences, zero missing source identities,
zero source-local divergent sequences, and 234 divergent shared sequences. A byte-level sample for
sequence `7287381184512` had different sizes, different BLAKE3 hashes, different `tfdt`
base-media-decode-times (`210210` versus `0`), and different `mdat` payload prefixes. Across that
sampled window there were zero common fragment hashes even when sequence IDs were ignored, proving
that the remaining failure was independent-encoder media phase and fMP4 payload determinism, not an
archive manifest identity bug.
A later `la-kcop/publisher.m4s` sample exposed a stricter live-source bug: source-window proof
records were using unsynced MPEG-TS PCR chunk indexes as `group_sequence` when the OTA UTC clock was
unavailable, causing restart-dependent jumps such as 93M, 135M, 341M, and 390M. The source-proof
writer now uses the chunk UTC start only when the chopper reports synced timing, otherwise it falls
back to the local wall-clock window start, and rewrites fMP4 `tfdt` onto that shared window before
hashing. The live HTTP proof worker also retries transient source opens/reader failures in unbounded
live mode, so a tuner `503` or malformed TS burst is skipped/retried instead of killing the
publisher proof process.
The synced source-window clock must use the chopper's exact global chunk index, not integer UTC
seconds. A 1001 ms proof cadence makes whole-second UTC start metadata lossy: adjacent source
windows can share the same `utc_start_unix`, which caused one publisher to write several different
hashes under the same source-local `group_sequence`. Synced chunks therefore use
`ChunkTiming.chunk_index` directly; only unsynced chunks fall back to local wall-clock receipt.
The live source-window proof writer also keeps subfragment slot allocation as stream state instead
of per-chunk state. Real source windows can be emitted in more than one proof chunk for the same
media timing sequence; resetting the slot counter for every chunk reused the same
`group_sequence` and made one healthy publisher look self-divergent. The counter is bounded so the
long-running live worker's state does not grow without bound.
`wt-publish` now has an explicit Unix-epoch start boundary, defaulting to the publisher-origin proof
cadence. After relay setup and immediately before spawning ffmpeg it waits until the next boundary,
so a newly restarted duplicate publisher starts its forced-keyframe clock on the same global cadence
as already-running publishers.
This does not by itself prove byte equality; it removes the local-process-start phase error from the
live publisher path and gives rollout measurement a deterministic knob (`--publisher-start-boundary-ms
0` disables it). The live ffmpeg argument plan is factored into a Rust unit-testable helper so
future timestamp/keyframe changes are pinned in `ec-node` instead of being inferred from node-agent
process strings or production samples.
The first post-start-clock live sample still failed duplicate byte identity: both publishers landed
in the same wall-clock proof bucket, but one fragment carried `tfdt=390390` while the other carried
`tfdt=30030`, matching the staggered restart gap. Their `mdat` prefixes differed too, which means a
continuous x264 encoder keeps enough local history that a later restart cannot prove byte equality
merely by joining the same wall-clock cadence. The live profile therefore enables x264
`stitchable=1` alongside closed GOP, no scenecut, no B-frames, no lookahead, and one thread. If that
still does not converge in production, the next fix is a deliberately stateless per-fragment encode
or a Rust-owned media clock/segmenter that resets encoder history at each proof boundary.
The follow-up production hotpatch moved the start-boundary wait to immediately before ffmpeg spawn,
enabled `stitchable=1`, and restarted both publisher nodes in the same batch. The latest `la-kcet`
sample still reported zero matching duplicate hashes with no missing source identity and no
source-local divergence. A final sampled shared sequence differed by hundreds of milliseconds of
receive time and by media size (`439737` versus `270283` bytes for the video fragment), so the
remaining mismatch is not just MP4 timestamp metadata. Production duplicate proof now needs a
stateless fragment boundary: either encode each proof segment from the same bounded source window
with fresh encoder state, or make the Rust media pipeline own exact frame-window capture before
calling ffmpeg/x264.
Archive manifests now carry optional fMP4 media timing for publisher-origin fragments. The
`archive-convergence` gate treats equal archive group sequence IDs with different media sequence or
decode-time metadata as `media_sequence_conflict`, even if the byte hash happens to match. This keeps
production proof aligned with the Rust simulation model: a duplicate publisher only proves the same
broadcast moment when the archive sequence and media window agree.
The first stateless proof primitives are now in `ec-node`. `publisher-proof-segment` takes one
bounded MPEG-TS source-clock window, runs a fresh deterministic x264/AAC fMP4 encode, splits the
result into init bytes and media fragments, and emits BLAKE3 hashes for each. `publisher-proof-windows`
uses the Rust MPEG-TS source-clock splitter first, then fresh-encodes each bounded window and reports
per-window source TS, init, and media hashes. Proof windows carry explicit MPEG-TS decoder context
with `--preroll-packets`, defaulting to the repo-owned `WT_PUBLISH_PROOF_PREROLL_PACKETS` budget, so
mid-GOP windows do not silently depend on best-effort decoder recovery. Focused Rust tests
fresh-encode the same bounded input and the same finite source-window campaign twice and assert
byte-for-byte identical proof hashes.
`publisher-proof-duplicates` is the single-node duplicate-publisher gate for the stateless path. It
runs `publisher-proof-windows` independently under at least two publisher identity labels, defaults
to `publisher-a` and `publisher-b`, and compares source TS, init, and media fragment BLAKE3 hashes
for every source-clock window. `--require-ok` exits non-zero unless every compared window matches,
and duplicate publisher labels are rejected so the proof cannot accidentally collapse to one source
identity. `publisher-proof-compare` is the cross-machine stateless proof gate: each publisher can run
`publisher-proof-windows` against the same bounded source TS file locally, copy the JSON report back
to the operator host, and compare the reports by named publisher. It rejects mismatched chunk cadence,
missing windows, source TS hash mismatches, init hash mismatches, media fragment hash mismatches, and
empty media windows.
`publisher-proof-remote-compare` is the production operator harness for that cross-machine gate. It
copies one bounded `.ts` proof input to each named SSH target, runs `ec-node publisher-proof-windows`
on the target, stores each returned JSON report under the local output directory, writes a
`compare.json`, and returns the existing compare report with upload/proof timing. Remote labels use
the same single-component validation as publisher identities, remote proof roots are constrained to
`/tmp/every-channel-*`, and cleanup is opt-in so the generated proof files remain inspectable unless
the operator explicitly requests removal. This keeps the live proof path in Rust without making the
Python node-agent a new oracle. It proves the machine/runtime/compiler boundary without requiring
the two NUCs to share a live tuner at the exact same instant.
`publisher-proof-archive-source` is the live archive implementation of the same proof model. It can
read local source files directly, read plain HTTP MPEG-TS bodies directly for HDHomeRun-style
sources, or fall back to an ffmpeg MPEG-TS copy reader for other inputs. Each emitted source-clock
window is encoded with fresh proof state, archived as CAS-backed `publisher.m4s` records, and mapped
to source-clock group sequences with explicit media timing metadata. A focused Rust regression test now
archives the same bounded TS input as two source nodes, then runs `archive-convergence` against the
two manifest roots and requires full duplicate convergence with zero divergent or source-local
divergent sequences.
Forge `ci-gates` now runs the `publisher_proof` and `archive_convergence` Rust filters before the
distributed simulator campaign, so single-node byte-for-byte determinism, source-window archive
proof semantics, and duplicate archive convergence are checked before production rollout evidence is
considered. The next production step is to deploy the updated node binary and let fresh
`publisher.m4s` source-window records age into the Grafana scrape window so live duplicate metrics
can replace the older continuous-encoder divergence.

View file

@ -0,0 +1,158 @@
# ECP-0157: Rust Simulation Testing
Status: Draft
## Context
Production is now fast enough to expose distributed bugs quickly, but it is still the wrong first
place to discover scheduler, archive, and duplicate-publisher invariants. The Python node-agent also
made this worse by putting core control behavior outside the already-built Rust node binary.
## Decision
Add a small deterministic simulation layer in `ec-core` and use it for distributed media invariants:
- `ec-node` remains the runtime owner for node behavior.
- Tests model logical time, delayed delivery, backfill, duplicate publishers, and archive
convergence in Rust.
- Simulation scenarios are seed-replayable and include deterministic jitter, transient drops,
partition windows, publisher outage/restart windows, backfill retries, and encoder drift faults.
- A failing simulation must print or carry a replay hint so the exact schedule can be rerun.
- Simulation reports include deterministic execution history so a failure has an ordered event trace,
not only a final assertion.
- Simulation campaigns run many seed schedules in one fast test and preserve the first failing seed,
invariant report, and final state as the failure artifact.
- Campaign execution has a reusable seeded runner so new models can share replay/failure accounting
instead of copying bespoke loops.
- First failures are automatically shrunk where the model supports it. For duplicate publishers the
shrinker removes irrelevant partitions, publisher outages, timing jitter, transient drops, and
excess media sequence range while keeping the original invariant unchanged.
- Invariants are explicit checks, not implicit test prose: duplicate source count, missing
sequences, divergent hashes, missing media timing, conflicting media timing, complete duplicate
coverage, and convergence-deadline budgets.
- Media identity is checked by BLAKE3 hashes for stream, rendition, track, sequence, profile, and
source-material identity.
- Media timing is part of the proof model. Matching hashes are not considered a complete duplicate
proof unless both publishers also expose a shared logical media clock for the chunk.
- Source-material identity is separate from stream metadata. Two publishers can advertise the same
channel, sequence, timing, and encoder profile while still encoding different RF/source windows;
that must fail in simulation before production archive comparisons burn wall-clock time.
- Publisher-origin archive `group_sequence` is derived from parsed media-time identity plus stable
track id, not local receive time. Receive time is telemetry; it is not proof that two publishers
archived the same broadcast moment.
- Live publisher archive proof normalizes fMP4 `tfdt` to the Unix media slot before hashing a
fragment. The first fragment for each track anchors the process-local media clock to wall-clock
time; later fragments preserve ffmpeg's media cadence from that origin. ffmpeg still runs with
wall-clock timestamp input enabled where possible, but the Rust archive writer is the authority
for the proof clock when source MPEG-TS timestamps are process-relative.
- Archive `group_sequence` includes a stable subfragment slot inside each `(track_id,
media_sequence)` pair, because audio can legitimately emit multiple fragments within one media
slot and those must compare in order instead of colliding as source-local divergences.
- Duplicate-publisher scenarios model publisher content phase separately from advertised archive
sequence. A publisher that starts its local encoder at a different content phase must fail fast in
simulation, because production fragments with the same local sequence are not proof of the same
broadcast moment unless the chunk clock is shared.
- `ec-node sim-duplicate-publishers` runs the same campaign model from the compiled Rust binary and
emits JSON suitable for CI artifacts and rollout gates.
- `ec-node sim-duplicate-publishers --failure-artifact <path>` writes the first failing campaign as
a replayable JSON artifact with the shrunk scenario, invariant report, event trace, shrink steps,
and a command hint for replaying `replay_scenario` through `--scenario-json -`.
- `ec-node sim-duplicate-publishers --scenario-json <path-or->` replays an exact serialized
`DuplicatePublisherScenario`, so a shrunk failure from CI or production investigation can be rerun
without reconstructing command-line flags.
- `ec-node sim-duplicate-publishers` can inject timing faults directly with
`--missing-media-timing-publisher NODE` and `--publisher-media-time-offset NODE:OFFSET_MS`, so
the current production proof class can be reproduced without hand-writing scenario JSON.
- `ec-node sim-duplicate-publishers` and `ec-node sim-system` can inject source-window faults with
`--publisher-source-material NODE:MATERIAL_ID`. Any campaign with multiple source-material ids
reports source-material mismatch observations instead of leaving operators to infer that class
from divergent hashes.
- `ec-node archive-convergence` reads existing archive manifest JSONL and applies the same
convergence semantics to real duplicate publisher outputs.
- Control-plane simulation models logical nodes, seeded gossip fanout, delivery jitter, transient
drops, node-specific partitions, node outages, duplicate deliveries, and propagation deadlines.
- `ec-node sim-control-plane` runs the control-plane model from the compiled Rust binary and emits
replayable JSON with the first failing seed, scenario, invariant report, and ordered trace.
- Control-plane campaign reports track max propagation time, max delivery time, dropped messages,
partition-delayed messages, outage-delayed messages, and duplicate messages, so prod rollout
measurements have a fast simulation baseline.
- System simulation composes control-plane propagation with duplicate-publisher media production.
Control gossip produces per-publisher activation times; the media workload then proves that delayed
schedule propagation still converges when publishers use the global media sequence clock and fails
when they derive chunk identity from local activation time.
- `ec-node sim-system` runs that composed workload from the deployed node binary. Its default
campaign models the current publisher topology class and can switch `--sequence-clock` between
`global` and `local-activation` to reproduce the exact class of duplicate-publisher phase bug
before waiting for production samples.
- `ec-node sim-system --fault-profile foundationdb` uses a FoundationDB-style fault profile: each
seed generates a different but replayable cluster schedule with randomized control partitions, node
outages, transient gossip drops, duplicate messages, media partitions, publisher outages, and
archive backfill pressure.
- The FoundationDB-style profile must also have an explicit negative regression for
`local-activation` sequence clocks, so the model proves the current production failure class is
caught in Rust before any rollout waits for live fragments.
- `ec-node sim-system --failure-artifact <path>` writes the first failing composed system schedule
as replayable JSON, including the exact control/media scenario, invariant report, ordered trace,
and command hint for rerunning `--scenario-json -`.
- System campaign reports must include fault coverage counters, not just pass/fail. A fast campaign
is only useful if it proves that the simulated run actually exercised the failure modes operators
care about.
- System campaign reports also aggregate publisher phase-offset observations. A production-like
divergence caused by local activation clocks should identify itself as a phase bug in the campaign
JSON instead of requiring operators to infer that from divergent hashes alone.
- System campaign reports also aggregate source-material mismatch observations. A production-like
divergence caused by independent tuner/source windows should identify itself as a source-material
bug in the campaign JSON instead of being confused with codec nondeterminism.
- System and duplicate-publisher reports aggregate missing media-timing records and media-timing
conflicts, so the live failure class where fragments arrive without a usable media clock is visible
in fast Rust simulation output.
- FoundationDB-profile `sim-system` campaigns require that coverage by default: control transient
drops, partition delays, node outage delays, duplicate messages, media transient drops, media
partition delays, publisher outages, backfill, and observed convergence timing must all appear in
the campaign report. A campaign that passes invariants but misses these classes is reported as a
weak simulation, not a green rollout gate.
- FoundationDB-profile coverage is breadth-gated, not only boolean-gated. By default at least
`max(2, iterations / 32)` seeds must exercise every required distributed fault class; operators
can raise that floor with `--min-fault-seed-coverage` for longer scientific campaigns.
- Campaign reports track both event totals and seed counts per fault class, plus a bounded list of
the slowest system schedules with replay hints. This makes green runs inspectable: operators can
see how broadly the randomized schedule space was exercised and which seeds define the current
latency tail.
- System campaign reports also aggregate deterministic simulated convergence time and trace event
counts. `ec-node sim-system` stamps wall-clock execution telemetry around the campaign so a run
reports iterations per second, simulated system seconds per wall second, and trace events per
second without putting wall-clock data into the replayed scenario itself.
- `sim-system --failure-artifact <path>` writes an artifact for weak coverage as well as invariant
failures, so CI can preserve evidence when a campaign was too small or too narrow to exercise the
required distributed faults.
- Forge `ci-gates` runs the Rust system simulator tests and a 1024-seed
`sim-system --fault-profile foundationdb` campaign from the compiled `ec-node` binary before web
build/deploy gates. This keeps the fast randomized check ahead of production rollout evidence.
- Simulation failures must be actionable before any matching production rollout is considered
healthy.
## Consequences
We get FoundationDB-style pressure in a much smaller shape: many deterministic failure schedules can
run as normal Rust tests without booting machines. The first media model covers duplicate publisher
convergence, network partitions, transient loss, publisher restart/backfill, convergence latency,
encoder drift, and publisher phase alignment, and the first runtime command applies it to archive
manifests. The first control model covers gossip propagation across relays and nodes under dropped,
delayed, duplicated, partitioned, and outage-delayed control messages. The shrink/replay path makes
supported failures small enough to debug before they become production event archaeology; exact
scenario JSON is the replay contract. Later models can add tuner scheduling, relay cache eviction,
and image rollout state machines. The composed system model is the first workload-level step: it
checks the boundary between control-plane speed and media determinism, which is where production
duplicate publishers are currently most fragile.
## Alternatives considered
- Keep writing production probes only. Rejected because probes prove what happened once, not what
should happen across many fault schedules.
- Extend the Python node-agent as the simulation oracle. Rejected because the image should get
thinner and the runtime behavior belongs in the Rust node.
## Rollout/teardown
Roll forward by adding simulation tests next to each new distributed invariant. Roll back by keeping
the production probes; the simulation module is library-only and has no runtime service impact.

File diff suppressed because it is too large Load diff

View file

@ -7,25 +7,38 @@
}:
let
# Keep the build input stable and small; avoid copying `target/`, `tmp/`, etc. into the Nix store.
root = ../../.;
# Keep the build input stable and small. NixOS, infra, docs, and script-only
# changes should not perturb the Rust source hash for config-only deploys.
src = lib.cleanSourceWith {
src = ../../.;
src = root;
filter = path: type:
let
base = baseNameOf path;
rel = lib.removePrefix "${toString root}/" (toString path);
in
# Skip typical build outputs and large scratch dirs.
!(base == "target"
|| base == ".git"
|| base == ".direnv"
|| base == "tmp"
|| base == "node_modules"
|| base == "out"
|| base == "test-results"
|| base == "deploy"
|| base == "intake"
|| base == "cache"
|| base == ".tower-minimal");
rel == ""
|| rel == "Cargo.toml"
|| rel == "Cargo.lock"
|| rel == "crates"
|| lib.hasPrefix "crates/" rel
|| rel == "third_party"
|| rel == "third_party/iroh-org"
|| rel == "third_party/iroh-org/iroh-gossip"
|| lib.hasPrefix "third_party/iroh-org/iroh-gossip/" rel
|| rel == "third_party/iroh-live"
|| rel == "third_party/iroh-live/iroh-moq"
|| lib.hasPrefix "third_party/iroh-live/iroh-moq/" rel
|| rel == "third_party/iroh-live/web-transport-iroh"
|| lib.hasPrefix "third_party/iroh-live/web-transport-iroh/" rel
|| rel == "apps"
|| rel == "apps/tauri"
|| rel == "apps/tauri/Cargo.toml"
|| rel == "apps/tauri/build.rs"
|| rel == "apps/tauri/tauri.conf.json"
|| rel == "apps/tauri/gen"
|| lib.hasPrefix "apps/tauri/gen/" rel
|| rel == "apps/tauri/src"
|| lib.hasPrefix "apps/tauri/src/" rel;
};
in
rustPlatform.buildRustPackage {
@ -52,7 +65,7 @@ rustPlatform.buildRustPackage {
doCheck = false;
meta = with lib; {
description = "every.channel node runner (ingest + chunk + MoQ publish)";
description = "every.channel node (ingest + chunk + MoQ publish)";
mainProgram = "ec-node";
platforms = platforms.unix;
license = licenses.agpl3Only;

View file

@ -0,0 +1,320 @@
#!/usr/bin/env python3
from __future__ import annotations
import importlib.util
import json
import sys
import unittest
from pathlib import Path
# Directory one level above the folder that contains this test file.
REPO = Path(__file__).resolve().parents[1]
# Script under test; loaded by file path in load_module() rather than imported by name.
SCRIPT = REPO / "scripts" / "measure-duplicate-publishers.py"
def load_module():
    """Load the measurement script from ``SCRIPT`` and return it as a module.

    The module is registered in ``sys.modules`` under the spec name before its
    body is executed.

    Raises:
        RuntimeError: if no import spec or loader can be built for ``SCRIPT``.
    """
    spec = importlib.util.spec_from_file_location("measure_duplicate_publishers", SCRIPT)
    loader = getattr(spec, "loader", None)
    if loader is None:
        raise RuntimeError(f"unable to load {SCRIPT}")
    loaded = importlib.util.module_from_spec(spec)
    sys.modules[spec.name] = loaded
    loader.exec_module(loaded)
    return loaded
class MeasureDuplicatePublishersTest(unittest.TestCase):
    """Unit tests for the pure helpers of the duplicate-publisher measurement script.

    Every test loads a fresh copy of the script through ``load_module()`` and
    feeds it literal fixtures; nothing here performs network I/O.
    """

    def test_manifest_hash_stats_counts_duplicates_divergence_and_missing_hashes(self) -> None:
        """Check the per-manifest counters on a mix of matching, divergent and hashless records."""
        module = load_module()
        records = [
            {"group_sequence": 10, "received_unix_ms": 1_000, "blake3": "same", "source_node": "nuc-a"},
            {"group_sequence": 10, "received_unix_ms": 1_001, "blake3": "same", "source_node": "nuc-b"},
            {"group_sequence": 11, "received_unix_ms": 2_000, "blake3": "left", "source_node": "nuc-a"},
            {"group_sequence": 11, "received_unix_ms": 2_001, "blake3": "right", "source_node": "nuc-b"},
            # No hash and no source identity: counted as missing on both axes.
            {"group_sequence": 12, "received_unix_ms": 3_000},
        ]
        stats = module.manifest_hash_stats(records, invalid_lines=2)
        self.assertEqual(5, stats["record_count"])
        self.assertEqual(2, stats["invalid_lines"])
        self.assertEqual(2, stats["sequence_count"])
        self.assertEqual(2, stats["source_identity_count"])
        self.assertEqual(["nuc-a", "nuc-b"], stats["source_identities"])
        self.assertEqual(1, stats["missing_source_identity_records"])
        self.assertEqual(1, stats["duplicate_hash_source_records"])
        self.assertEqual(1, stats["duplicate_hash_sequences"])
        self.assertEqual(1, stats["hash_divergent_sequences"])
        self.assertEqual(1, stats["missing_hash_records"])
        self.assertEqual(1_000, stats["first_received_unix_ms"])
        self.assertEqual(3_000, stats["latest_received_unix_ms"])

    def test_compare_manifest_hashes_proves_byte_for_byte_matches(self) -> None:
        """Two distinct sources with identical hashes on every sequence count as a full match."""
        module = load_module()
        comparison = module.compare_manifest_hashes(
            {
                "publisher-a": [
                    {"group_sequence": 1, "blake3": "a", "source_node": "publisher-a"},
                    {"group_sequence": 2, "blake3": "b", "source_node": "publisher-a"},
                ],
                "publisher-b": [
                    {"group_sequence": 1, "blake3": "a", "source_node": "publisher-b"},
                    {"group_sequence": 2, "blake3": "b", "source_node": "publisher-b"},
                ],
            }
        )
        self.assertTrue(comparison["byte_for_byte_hash_match"])
        self.assertTrue(comparison["source_identity_ok"])
        self.assertEqual(["publisher-a", "publisher-b"], comparison["source_identities"])
        self.assertEqual(2, comparison["matching_sequence_count"])
        self.assertEqual(0, comparison["divergent_sequence_count"])
        self.assertEqual(0, comparison["missing_sequence_count"])

    def test_compare_manifest_hashes_reports_divergent_sequences(self) -> None:
        """A differing hash is reported as divergent and an unshared sequence as missing."""
        module = load_module()
        comparison = module.compare_manifest_hashes(
            {
                "publisher-a": [
                    {"group_sequence": 1, "blake3": "a", "source_node": "publisher-a"},
                    {"group_sequence": 2, "blake3": "b", "source_node": "publisher-a"},
                ],
                "publisher-b": [
                    {"group_sequence": 1, "blake3": "a", "source_node": "publisher-b"},
                    {"group_sequence": 2, "blake3": "different", "source_node": "publisher-b"},
                    {"group_sequence": 3, "blake3": "extra", "source_node": "publisher-b"},
                ],
            }
        )
        self.assertFalse(comparison["byte_for_byte_hash_match"])
        self.assertEqual(1, comparison["matching_sequence_count"])
        self.assertEqual(1, comparison["divergent_sequence_count"])
        self.assertEqual(1, comparison["missing_sequence_count"])
        self.assertEqual(2, comparison["divergent_examples"][0]["sequence"])
        self.assertEqual(["different"], comparison["divergent_examples"][0]["hashes"]["publisher-b"])

    def test_compare_manifest_hashes_rejects_intra_manifest_divergence(self) -> None:
        """One source reporting two hashes for the same sequence makes that sequence divergent."""
        module = load_module()
        comparison = module.compare_manifest_hashes(
            {
                "publisher-a": [
                    {"group_sequence": 1, "blake3": "same", "source_node": "publisher-a"},
                ],
                "publisher-b": [
                    {"group_sequence": 1, "blake3": "same", "source_node": "publisher-b"},
                    {"group_sequence": 1, "blake3": "different", "source_node": "publisher-b"},
                ],
            }
        )
        self.assertFalse(comparison["byte_for_byte_hash_match"])
        self.assertEqual(0, comparison["matching_sequence_count"])
        self.assertEqual(1, comparison["divergent_sequence_count"])
        self.assertEqual(["different", "same"], comparison["divergent_examples"][0]["hashes"]["publisher-b"])

    def test_compare_manifest_hashes_rejects_mirrored_same_source_records(self) -> None:
        """Two manifests whose records share one source identity are not accepted as proof."""
        module = load_module()
        comparison = module.compare_manifest_hashes(
            {
                "nuc-a-buffer": [
                    {"group_sequence": 1, "blake3": "same", "source_node": "archive-origin"},
                ],
                "nuc-b-buffer": [
                    {"group_sequence": 1, "blake3": "same", "source_node": "archive-origin"},
                ],
            }
        )
        self.assertFalse(comparison["byte_for_byte_hash_match"])
        self.assertFalse(comparison["source_identity_ok"])
        self.assertEqual(["archive-origin"], comparison["source_identities"])

    def test_summary_requires_manifest_comparison_and_prometheus_series(self) -> None:
        """Two healthy samples 30 s apart with matching comparisons summarize as ok."""
        module = load_module()
        summary = module.summarize(
            [
                {
                    "sample_unix_ms": 1_000,
                    "publishers": {
                        "a": {"health_ok": True, "metrics_ok": True, "duplicate_metrics_present": True},
                        "b": {"health_ok": True, "metrics_ok": True, "duplicate_metrics_present": True},
                    },
                    "manifest_comparison": {
                        "source_identity_ok": True,
                        "matching_sequence_count": 2,
                        "divergent_sequence_count": 0,
                        "byte_for_byte_hash_match": True,
                    },
                    "prometheus": [
                        {
                            "metric": "every_channel_ladder_archive_duplicate_hash_source_records",
                            "ok": True,
                            "series_present": True,
                            "value": 2,
                        },
                        {
                            "metric": "every_channel_ladder_archive_hash_divergent_sequences",
                            "ok": True,
                            "series_present": True,
                            "value": 0,
                        },
                    ],
                },
                {
                    "sample_unix_ms": 31_000,
                    "publishers": {
                        "a": {"health_ok": True, "metrics_ok": True, "duplicate_metrics_present": True},
                        "b": {"health_ok": True, "metrics_ok": True, "duplicate_metrics_present": True},
                    },
                    "manifest_comparison": {
                        "source_identity_ok": True,
                        "matching_sequence_count": 4,
                        "divergent_sequence_count": 0,
                        "byte_for_byte_hash_match": True,
                    },
                    "prometheus": [
                        {
                            "metric": "every_channel_ladder_archive_duplicate_hash_source_records",
                            "ok": True,
                            "series_present": True,
                            "value": 4,
                        },
                        {
                            "metric": "every_channel_ladder_archive_hash_divergent_sequences",
                            "ok": True,
                            "series_present": True,
                            "value": 0,
                        },
                    ],
                },
            ]
        )
        self.assertTrue(summary["ok"])
        self.assertEqual(30_000, summary["elapsed_ms"])
        self.assertEqual(2, summary["sample_count"])
        self.assertEqual(4, summary["latest_manifest_comparison"]["matching_sequence_count"])

    def test_summary_rejects_single_sample_and_manifest_hash_errors(self) -> None:
        """A lone sample whose manifest row carries hash errors fails with one reason per problem."""
        module = load_module()
        summary = module.summarize(
            [
                {
                    "sample_unix_ms": 1_000,
                    "publishers": {
                        "a": {"health_ok": True, "metrics_ok": True, "duplicate_metrics_present": True},
                        "b": {"health_ok": True, "metrics_ok": True, "duplicate_metrics_present": True},
                    },
                    "manifests": {
                        "a": {
                            "ok": True,
                            "hash_divergent_sequences": 1,
                            "missing_hash_records": 1,
                            "invalid_lines": 1,
                        },
                    },
                    "manifest_comparison": {
                        "source_identity_ok": True,
                        "matching_sequence_count": 2,
                        "divergent_sequence_count": 0,
                        "byte_for_byte_hash_match": True,
                    },
                    "prometheus": [
                        {
                            "metric": "every_channel_ladder_archive_duplicate_hash_source_records",
                            "ok": True,
                            "series_present": True,
                            "value": 2,
                        },
                    ],
                },
            ]
        )
        self.assertFalse(summary["ok"])
        self.assertIn("insufficient_elapsed_samples", summary["reasons"])
        self.assertIn("manifest_hash_divergence_observed", summary["reasons"])
        self.assertIn("manifest_hash_missing_records", summary["reasons"])
        self.assertIn("manifest_invalid_lines", summary["reasons"])

    def test_summary_rejects_missing_or_non_diverse_source_identity(self) -> None:
        """A failed source-identity check and a nonzero missing-identity metric both yield reasons."""
        module = load_module()
        summary = module.summarize(
            [
                {
                    "sample_unix_ms": 1_000,
                    "manifest_comparison": {
                        "source_identity_ok": False,
                        "matching_sequence_count": 2,
                        "divergent_sequence_count": 0,
                        "byte_for_byte_hash_match": False,
                    },
                },
                {
                    "sample_unix_ms": 31_000,
                    "manifest_comparison": {
                        "source_identity_ok": False,
                        "matching_sequence_count": 2,
                        "divergent_sequence_count": 0,
                        "byte_for_byte_hash_match": False,
                    },
                    "prometheus": [
                        {
                            "metric": "every_channel_archive_missing_source_identity_records",
                            "ok": True,
                            "series_present": True,
                            "value": 2,
                        },
                    ],
                },
            ]
        )
        self.assertFalse(summary["ok"])
        self.assertIn("manifest_source_identity_missing_or_not_diverse", summary["reasons"])
        self.assertIn("prometheus_source_identity_missing_nonzero", summary["reasons"])

    def test_agent_manifest_url_builds_bounded_tailnet_endpoint(self) -> None:
        """The agent URL drops the trailing slash and encodes broadcast, track, max_bytes, role in that order."""
        module = load_module()
        url = module.agent_manifest_url(
            "http://100.64.0.5:7799/",
            broadcast="la-kcop",
            track="0.m4s",
            role="publisher-buffer",
            max_bytes=4096,
        )
        self.assertEqual(
            "http://100.64.0.5:7799/v1/archive-manifest?broadcast=la-kcop&track=0.m4s&max_bytes=4096&role=publisher-buffer",
            url,
        )

    def test_parser_defaults_to_publisher_origin_proof_track(self) -> None:
        """With no CLI arguments the parsed track is ``publisher.m4s``."""
        module = load_module()
        args = module.build_parser().parse_args([])
        self.assertEqual("publisher.m4s", args.track)

    def test_parse_manifest_jsonl_tolerates_partial_first_tail_line(self) -> None:
        """An unparseable first line is skipped without being counted as invalid."""
        module = load_module()
        body = 'not-json-prefix{"group_sequence":1}\n{"group_sequence":2,"blake3":"b"}\n'
        records, invalid = module.parse_manifest_jsonl(body)
        self.assertEqual(0, invalid)
        self.assertEqual([2], [record["group_sequence"] for record in records])
# Allow running this file directly as a script; unittest discovers the tests above.
if __name__ == "__main__":
    unittest.main()

View file

@ -0,0 +1,581 @@
#!/usr/bin/env python3
"""Measure duplicate publisher media-hash convergence in production."""
from __future__ import annotations
import argparse
import json
import sys
import time
import urllib.parse
import urllib.request
from dataclasses import dataclass
from typing import Any, Callable
# Sent as the User-Agent header on every request made by fetch_text().
USER_AGENT = "every-channel-measure-duplicate-publishers/1"
# Metric names queried from Prometheus (one sum() per name) and also searched
# for as substrings in each publisher's /metrics body.
DUPLICATE_PROMETHEUS_METRICS = [
    "every_channel_ladder_archive_duplicate_hash_source_records",
    "every_channel_ladder_archive_duplicate_hash_sequences",
    "every_channel_ladder_archive_hash_divergent_sequences",
    "every_channel_ladder_archive_missing_hash_records",
    "every_channel_ladder_archive_missing_source_identity_records",
    "every_channel_archive_duplicate_hash_source_records",
    "every_channel_archive_duplicate_hash_sequences",
    "every_channel_archive_hash_divergent_sequences",
    "every_channel_archive_missing_hash_records",
    "every_channel_archive_missing_source_identity_records",
]
# Record keys that may carry the producing source's identity, checked in this
# order; the first non-blank string value wins (see record_source_identity()).
SOURCE_IDENTITY_KEYS = ("source_node", "publisher_node", "source_id")
@dataclass
class FetchResult:
url: str
status: int
body: str
elapsed_ms: int
error: str | None = None
@property
def ok(self) -> bool:
return self.error is None and 200 <= self.status < 300
def now_ms() -> int:
    """Return the current wall-clock Unix time in whole milliseconds."""
    seconds = time.time()
    return int(seconds * 1000)
def fetch_text(url: str, timeout: float, max_bytes: int = 4 * 1024 * 1024) -> FetchResult:
    """GET ``url`` and return its body as text, never raising.

    When ``max_bytes`` is positive, a suffix ``Range`` header asks the server
    for only the last ``max_bytes`` bytes and the local read is capped at
    ``max_bytes + 1`` bytes. A non-positive ``max_bytes`` reads everything.

    Args:
        url: Address to fetch.
        timeout: Timeout passed to ``urlopen``.
        max_bytes: Upper bound on the returned body size in bytes.

    Returns:
        A ``FetchResult``. Any exception is captured as ``status=0``, empty
        body and ``error=str(exception)``.
    """
    started = now_ms()
    bounded = max_bytes > 0
    headers = {"User-Agent": USER_AGENT}
    if bounded:
        headers["Range"] = f"bytes=-{max_bytes}"
    request = urllib.request.Request(url, headers=headers)
    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:
            raw = response.read(max_bytes + 1) if bounded else response.read(-1)
            # NOTE(review): if the server ignores Range, `raw` is the first
            # max_bytes + 1 bytes of the response, so this trim keeps bytes
            # from the head, not the true tail - confirm that is acceptable.
            if bounded and len(raw) > max_bytes:
                raw = raw[-max_bytes:]
            status = int(response.status)
            text = raw.decode("utf-8", "replace")
            return FetchResult(url, status, text, now_ms() - started)
    except Exception as err:  # noqa: BLE001 - measurements preserve transport failures.
        return FetchResult(url, 0, "", now_ms() - started, str(err))
def fetch_json(url: str, timeout: float, max_bytes: int = 1024 * 1024) -> tuple[FetchResult, Any | None]:
    """Fetch ``url`` with ``fetch_text`` and decode the body as JSON.

    Returns:
        ``(result, payload)``. ``payload`` is None when the fetch failed or
        the body is not valid JSON; in the latter case ``result.error`` is set
        to an ``"invalid json: ..."`` message.
    """
    result = fetch_text(url, timeout, max_bytes=max_bytes)
    payload = None
    if result.ok:
        try:
            payload = json.loads(result.body)
        except json.JSONDecodeError as err:
            # Record the decode failure on the result so `.ok` turns False.
            result.error = f"invalid json: {err}"
    return result, payload
def parse_named_url(value: str) -> tuple[str, str]:
    """Split a ``NAME=URL`` argument at its first ``=``.

    Both halves are whitespace-stripped.

    Raises:
        ValueError: if there is no ``=`` or either half is empty after
            stripping.
    """
    raw_name, separator, raw_url = value.partition("=")
    name = raw_name.strip()
    url = raw_url.strip()
    if not separator or not name or not url:
        raise ValueError(f"expected NAME=URL: {value}")
    return name, url
def manifest_url(origin: str, broadcast: str, track: str) -> str:
    """Build the archive manifest URL ``<origin>/manifests/<broadcast>/<track>.jsonl``.

    ``origin`` is normalized to end with exactly one ``/`` before joining, so
    its last path segment is kept.
    """
    root = f"{origin.rstrip('/')}/"
    relative = f"manifests/{broadcast}/{track}.jsonl"
    return urllib.parse.urljoin(root, relative)
def parse_manifest_jsonl(body: str) -> tuple[list[dict[str, Any]], int]:
    """Parse a JSONL manifest body into records plus a count of bad lines.

    Lines are separated on ``"\\n"`` only. ``str.splitlines()`` would also
    split on characters such as U+2028, U+2029 and U+0085, which may appear
    unescaped inside JSON strings and would tear a valid record apart.

    Args:
        body: Manifest text, possibly the tail of a larger file.

    Returns:
        ``(records, invalid_lines)``. ``records`` holds every line that decoded
        to a JSON object, in order. ``invalid_lines`` counts lines that failed
        to decode or decoded to a non-object. An undecodable first line is
        skipped without being counted.
    """
    records: list[dict[str, Any]] = []
    invalid_lines = 0
    for index, line in enumerate(body.split("\n")):
        # strip() also removes the "\r" left behind by CRLF line endings.
        raw = line.strip()
        if not raw:
            continue
        try:
            record = json.loads(raw)
        except json.JSONDecodeError:
            # Tail range reads may start in the middle of a JSON line.
            if index == 0:
                continue
            invalid_lines += 1
            continue
        if isinstance(record, dict):
            records.append(record)
        else:
            invalid_lines += 1
    return records, invalid_lines
def int_or_none(value: Any) -> int | None:
if isinstance(value, bool):
return None
if isinstance(value, int):
return value
try:
return int(str(value))
except (TypeError, ValueError):
return None
def record_source_identity(record: dict[str, Any]) -> str | None:
    """Return the record's source identity, or None when it has none.

    Keys are tried in ``SOURCE_IDENTITY_KEYS`` order; the first value that is
    a non-blank string is returned with surrounding whitespace removed.
    """
    candidates = (record.get(key) for key in SOURCE_IDENTITY_KEYS)
    return next(
        (
            candidate.strip()
            for candidate in candidates
            if isinstance(candidate, str) and candidate.strip()
        ),
        None,
    )
def manifest_hash_stats(records: list[dict[str, Any]], invalid_lines: int = 0) -> dict[str, Any]:
    """Summarize hash agreement and data quality for one manifest's records.

    Records without an integer ``group_sequence`` contribute only their
    ``received_unix_ms``. Records without a usable ``blake3`` are counted as
    missing hashes after their source identity has been tallied.

    Args:
        records: Parsed manifest records.
        invalid_lines: Count of unparseable lines, passed through unchanged.

    Returns:
        A dict of counters (record, sequence, source-identity, duplicate,
        divergent and missing counts) plus the first and latest
        ``received_unix_ms`` seen, or None for those when none was present.
    """
    digests_per_sequence: dict[int, set[str]] = {}
    sources_per_digest: dict[int, dict[str, set[str]]] = {}
    identities: set[str] = set()
    received: list[int] = []
    without_hash = 0
    without_identity = 0

    for record in records:
        received_ms = int_or_none(record.get("received_unix_ms"))
        if received_ms is not None:
            received.append(received_ms)

        sequence = int_or_none(record.get("group_sequence"))
        if sequence is None:
            continue

        identity = record_source_identity(record)
        if identity:
            identities.add(identity)
        else:
            without_identity += 1

        digest = record.get("blake3")
        if not isinstance(digest, str) or not digest.strip():
            without_hash += 1
            continue

        cleaned = digest.strip()
        digests_per_sequence.setdefault(sequence, set()).add(cleaned)
        if identity:
            by_digest = sources_per_digest.setdefault(sequence, {})
            by_digest.setdefault(cleaned, set()).add(identity)

    # A digest reported by k distinct sources contributes k - 1 duplicate
    # records; a sequence is "duplicate" when any of its digests has k > 1.
    duplicate_source_records = 0
    duplicate_sequences = 0
    for by_digest in sources_per_digest.values():
        extras = [len(sources) - 1 for sources in by_digest.values() if len(sources) > 1]
        duplicate_source_records += sum(extras)
        if extras:
            duplicate_sequences += 1

    divergent_sequences = len(
        [digests for digests in digests_per_sequence.values() if len(digests) > 1]
    )

    return {
        "record_count": len(records),
        "invalid_lines": invalid_lines,
        "sequence_count": len(digests_per_sequence),
        "source_identity_count": len(identities),
        "source_identities": sorted(identities),
        "missing_source_identity_records": without_identity,
        "duplicate_hash_source_records": duplicate_source_records,
        "duplicate_hash_sequences": duplicate_sequences,
        "hash_divergent_sequences": divergent_sequences,
        "missing_hash_records": without_hash,
        "first_received_unix_ms": min(received) if received else None,
        "latest_received_unix_ms": max(received) if received else None,
    }
def first_hash_by_sequence(records: list[dict[str, Any]]) -> dict[int, str]:
    """Map each ``group_sequence`` to the first non-blank ``blake3`` seen for it.

    Records lacking an integer sequence or a non-blank string digest are
    ignored; later digests for an already-seen sequence are discarded.
    """
    first_seen: dict[int, str] = {}
    for record in records:
        digest = record.get("blake3")
        if not isinstance(digest, str):
            continue
        cleaned = digest.strip()
        sequence = int_or_none(record.get("group_sequence"))
        if sequence is None or not cleaned:
            continue
        if sequence not in first_seen:
            first_seen[sequence] = cleaned
    return first_seen
def hash_sets_by_sequence(records: list[dict[str, Any]]) -> dict[int, set[str]]:
    """Group every distinct non-blank ``blake3`` digest by ``group_sequence``.

    Records lacking an integer sequence or a non-blank string digest are
    ignored.
    """
    grouped: dict[int, set[str]] = {}
    for record in records:
        digest = record.get("blake3")
        if not isinstance(digest, str):
            continue
        cleaned = digest.strip()
        sequence = int_or_none(record.get("group_sequence"))
        if sequence is None or not cleaned:
            continue
        if sequence in grouped:
            grouped[sequence].add(cleaned)
        else:
            grouped[sequence] = {cleaned}
    return grouped
def compare_manifest_hashes(named_records: dict[str, list[dict[str, Any]]]) -> dict[str, Any]:
    """Compare per-sequence hashes across sources drawn from several manifests.

    Records are regrouped by their own source identity, not by the manifest
    they came from. A record without an identity is counted as missing and
    filed under ``manifest:<manifest name>``. A sequence matches only when
    every source has exactly one digest for it and all those digests agree.

    Args:
        named_records: Manifest name mapped to that manifest's records.

    Returns:
        A dict with source names, sequence counts (total, shared, matching,
        divergent, missing), up to five divergent examples, and
        ``byte_for_byte_hash_match``. That flag is True only when the source
        identity check passes, at least one sequence is shared, and every
        shared sequence matches.
    """
    input_manifest_count = len(named_records)
    unidentified = 0
    by_source: dict[str, list[dict[str, Any]]] = {}
    for manifest_name, records in named_records.items():
        for record in records:
            identity = record_source_identity(record)
            if identity is None:
                unidentified += 1
                identity = f"manifest:{manifest_name}"
            by_source.setdefault(identity, []).append(record)

    names = sorted(by_source)
    per_name = {name: hash_sets_by_sequence(records) for name, records in by_source.items()}

    seen_sequences: set[int] = set()
    for by_sequence in per_name.values():
        seen_sequences.update(by_sequence)
    all_sequences = sorted(seen_sequences)
    shared_sequences = [
        sequence
        for sequence in all_sequences
        if all(sequence in per_name[name] for name in names)
    ]

    matching = 0
    divergent = 0
    examples: list[dict[str, Any]] = []
    for sequence in shared_sequences:
        digests_by_name = {name: per_name[name][sequence] for name in names}
        every_source_unambiguous = all(len(digests) == 1 for digests in digests_by_name.values())
        distinct = set().union(*digests_by_name.values())
        if every_source_unambiguous and len(distinct) == 1:
            matching += 1
            continue
        divergent += 1
        # Keep the report small: only the first five divergences are detailed.
        if len(examples) < 5:
            examples.append(
                {
                    "sequence": sequence,
                    "hashes": {name: sorted(digests) for name, digests in digests_by_name.items()},
                }
            )

    # Proof needs at least two distinct, fully identified sources.
    source_identity_ok = unidentified == 0 and len(names) >= 2
    all_shared_match = bool(
        source_identity_ok and shared_sequences and divergent == 0 and matching == len(shared_sequences)
    )
    return {
        "publisher_count": len(names),
        "publishers": names,
        "input_manifest_count": input_manifest_count,
        "source_identity_count": len(names),
        "source_identities": names,
        "missing_source_identity_records": unidentified,
        "source_identity_ok": source_identity_ok,
        "sequence_count": len(all_sequences),
        "shared_sequence_count": len(shared_sequences),
        "matching_sequence_count": matching,
        "divergent_sequence_count": divergent,
        "missing_sequence_count": max(0, len(all_sequences) - len(shared_sequences)),
        "divergent_examples": examples,
        "byte_for_byte_hash_match": all_shared_match,
    }
def prometheus_query_url(prometheus_url: str, expr: str) -> str:
    """Build the instant-query URL ``<base>/api/v1/query?query=<expr>``.

    Trailing whitespace and then trailing slashes are removed from
    ``prometheus_url``; ``expr`` is URL-encoded.
    """
    base = prometheus_url.rstrip().rstrip("/")
    encoded = urllib.parse.urlencode({"query": expr})
    return f"{base}/api/v1/query?{encoded}"
def prometheus_metric_sum(
    prometheus_url: str,
    metric: str,
    *,
    broadcast: str,
    timeout: float,
    fetcher: Callable[[str, float, int], FetchResult] = fetch_text,
) -> dict[str, Any]:
    """Query Prometheus for ``sum(<metric>{broadcast="<broadcast>"})``.

    Args:
        prometheus_url: Base URL of the Prometheus server.
        metric: Metric name to sum.
        broadcast: Value for the ``broadcast`` label selector, inserted as-is.
        timeout: Timeout handed to ``fetcher``.
        fetcher: Callable used for the HTTP request; replaceable in tests.

    Returns:
        A row with ``metric``, ``ok`` and ``value``. On success it also has
        ``series_present`` (False with ``value=None`` when the result is
        empty). On a failed fetch or malformed reply it has ``error`` instead.
    """
    expr = f'sum({metric}{{broadcast="{broadcast}"}})'
    fetched = fetcher(prometheus_query_url(prometheus_url, expr), timeout, 1024 * 1024)
    if not fetched.ok:
        return {"metric": metric, "ok": False, "value": None, "error": fetched.error}
    try:
        series = json.loads(fetched.body).get("data", {}).get("result", [])
        # Each sample is a [timestamp, value] pair; only the value is used.
        value = float(series[0].get("value", [None, None])[1]) if series else None
    except Exception as err:  # noqa: BLE001 - preserve malformed Prometheus replies.
        return {"metric": metric, "ok": False, "value": None, "error": f"invalid prometheus response: {err}"}
    if value is None:
        return {"metric": metric, "ok": True, "value": None, "series_present": False}
    return {"metric": metric, "ok": True, "value": value, "series_present": True}
def agent_manifest_url(base_url: str, *, broadcast: str, track: str, role: str, max_bytes: int) -> str:
    """Build a node-agent ``/v1/archive-manifest`` URL.

    Query parameters appear in the order broadcast, track, max_bytes, then
    role; ``role`` is left out when it is empty.
    """
    params = {"broadcast": broadcast, "track": track, "max_bytes": str(max_bytes)}
    if role:
        params["role"] = role
    endpoint = f"{base_url.rstrip('/')}/v1/archive-manifest"
    return f"{endpoint}?{urllib.parse.urlencode(params)}"
def sample_publishers(
    publisher_urls: dict[str, str],
    *,
    timeout: float,
    fetcher: Callable[[str, float, int], FetchResult] = fetch_text,
) -> dict[str, Any]:
    """Probe each publisher agent's ``/health`` and ``/metrics`` endpoints.

    Args:
        publisher_urls: Publisher name mapped to the agent base URL.
        timeout: Timeout handed to ``fetcher`` for every request.
        fetcher: Callable used for the HTTP requests; replaceable in tests.

    Returns:
        One row per publisher with fetch status and errors, ``node_modes``,
        ``unhealthy_processes``, and whether any duplicate metric name appears
        in the metrics body. ``hostname`` is present only when the health body
        is a JSON object, and ``metrics_bytes`` only when the metrics fetch
        succeeded. A health body that is not a JSON object sets
        ``health_error`` to ``"invalid health json"`` instead of raising.
    """
    out: dict[str, Any] = {}
    for name, base_url in publisher_urls.items():
        base = base_url.rstrip("/")
        health = fetcher(f"{base}/health", timeout, 1024 * 1024)
        metrics = fetcher(f"{base}/metrics", timeout, 2 * 1024 * 1024)
        row: dict[str, Any] = {
            "agent_url": base,
            "health_ok": health.ok,
            "metrics_ok": metrics.ok,
            "health_error": health.error,
            "metrics_error": metrics.error,
            "duplicate_metrics_present": False,
            "node_modes": [],
            "unhealthy_processes": [],
        }
        if health.ok:
            try:
                payload = json.loads(health.body)
            except json.JSONDecodeError:
                payload = None
            if isinstance(payload, dict):
                node_modes = payload.get("node_modes")
                unhealthy = payload.get("unhealthy_processes")
                system = payload.get("system")
                row["node_modes"] = node_modes if isinstance(node_modes, list) else []
                row["unhealthy_processes"] = unhealthy if isinstance(unhealthy, list) else []
                if not isinstance(system, dict):
                    system = {}
                # Prefer the nested system hostname, falling back to top level.
                row["hostname"] = system.get("hostname") or payload.get("hostname")
            else:
                # Undecodable bodies and valid-but-non-object JSON (list,
                # string, null) are both unusable as a health report.
                row["health_error"] = "invalid health json"
        if metrics.ok:
            row["duplicate_metrics_present"] = any(metric in metrics.body for metric in DUPLICATE_PROMETHEUS_METRICS)
            row["metrics_bytes"] = len(metrics.body.encode("utf-8"))
        out[name] = row
    return out
def sample_once(args: argparse.Namespace) -> dict[str, Any]:
    """Collect one measurement sample from every configured source.

    Reads manifests by URL (or one derived from ``archive_origin`` when no
    explicit manifest is given), manifests served by node agents, publisher
    health/metrics, and Prometheus metric sums, as selected by ``args``.

    Returns:
        A dict with the sample timestamp, broadcast, track, per-publisher
        rows, per-manifest stats, a cross-manifest comparison (None unless at
        least two manifests were read), and the Prometheus rows.

    Raises:
        ValueError: if any ``NAME=URL`` argument is malformed.
    """
    manifest_targets: dict[str, str] = dict(map(parse_named_url, args.manifest))
    if not manifest_targets and args.archive_origin and args.broadcast and args.track:
        manifest_targets["archive-origin"] = manifest_url(args.archive_origin, args.broadcast, args.track)
    # Parse every NAME=URL argument up front so bad input fails before any I/O.
    publisher_urls: dict[str, str] = dict(map(parse_named_url, args.publisher))
    agent_targets: dict[str, str] = dict(map(parse_named_url, args.agent_manifest))

    fetched_records: dict[str, list[dict[str, Any]]] = {}
    manifest_stats: dict[str, Any] = {}

    for name, url in manifest_targets.items():
        fetched = fetch_text(url, args.timeout, max_bytes=args.max_manifest_bytes)
        if fetched.ok:
            records, invalid_lines = parse_manifest_jsonl(fetched.body)
            fetched_records[name] = records
            manifest_stats[name] = {
                "url": url,
                "ok": True,
                "fetch_elapsed_ms": fetched.elapsed_ms,
                **manifest_hash_stats(records, invalid_lines),
            }
        else:
            manifest_stats[name] = {"url": url, "ok": False, "error": fetched.error}

    if agent_targets and args.broadcast and args.track:
        for name, base_url in agent_targets.items():
            url = agent_manifest_url(
                base_url,
                broadcast=args.broadcast,
                track=args.track,
                role=args.agent_manifest_role,
                max_bytes=args.max_manifest_bytes,
            )
            # The JSON envelope wraps the records, so allow 1 MiB on top of
            # the manifest byte budget.
            fetched, payload = fetch_json(url, args.timeout, max_bytes=args.max_manifest_bytes + 1024 * 1024)
            payload_is_dict = isinstance(payload, dict)
            if not fetched.ok or not payload_is_dict or payload.get("ok") is not True:
                fallback_error = payload.get("error") if payload_is_dict else "invalid response"
                manifest_stats[name] = {
                    "url": url,
                    "ok": False,
                    "source": "node-agent",
                    "error": fetched.error or fallback_error,
                }
                continue
            raw_records = payload.get("records")
            records = (
                [record for record in raw_records if isinstance(record, dict)]
                if isinstance(raw_records, list)
                else []
            )
            fetched_records[name] = records
            invalid_lines = int_or_none(payload.get("invalid_lines")) or 0
            agent_stats = payload.get("stats")
            if not isinstance(agent_stats, dict):
                agent_stats = {}
            manifest_stats[name] = {
                "url": url,
                "ok": True,
                "source": "node-agent",
                "fetch_elapsed_ms": fetched.elapsed_ms,
                "role": payload.get("role"),
                "file_bytes": int_or_none(payload.get("file_bytes")),
                "partial_scan": payload.get("partial_scan") is True,
                **manifest_hash_stats(records, invalid_lines),
                "node_agent_stats": agent_stats,
            }

    prometheus_rows: list[dict[str, Any]] = []
    if args.prometheus_url and args.broadcast:
        prometheus_rows = [
            prometheus_metric_sum(args.prometheus_url, metric, broadcast=args.broadcast, timeout=args.timeout)
            for metric in DUPLICATE_PROMETHEUS_METRICS
        ]

    # The timestamp is taken before publishers are probed.
    sampled_at = now_ms()
    publishers = sample_publishers(publisher_urls, timeout=args.timeout) if publisher_urls else {}
    comparison = compare_manifest_hashes(fetched_records) if len(fetched_records) >= 2 else None
    return {
        "sample_unix_ms": sampled_at,
        "broadcast": args.broadcast,
        "track": args.track,
        "publishers": publishers,
        "manifests": manifest_stats,
        "manifest_comparison": comparison,
        "prometheus": prometheus_rows,
    }
def summarize(samples: list[dict[str, Any]]) -> dict[str, Any]:
    """Collapse the collected samples into a verdict and the reasons it failed.

    Args:
        samples: Sample dicts in collection order. Each must carry
            ``sample_unix_ms``; the ``publishers``, ``manifests``,
            ``manifest_comparison`` and ``prometheus`` entries are optional.

    Returns:
        ``{"ok": False, "reasons": ["no_samples"]}`` when ``samples`` is empty.
        Otherwise a dict whose ``ok`` is true only when ``reasons`` is empty,
        together with the elapsed time between the first and last sample, the
        sample count, the most recent dict-valued manifest comparison, the
        number of Prometheus rows that reported a series, and the publisher
        count of the last sample.
    """
    if not samples:
        return {"ok": False, "reasons": ["no_samples"]}

    newest, oldest = samples[-1], samples[0]
    elapsed_ms = max(0, int(newest["sample_unix_ms"]) - int(oldest["sample_unix_ms"]))
    reasons: list[str] = []
    if len(samples) < 2 or elapsed_ms <= 0:
        reasons.append("insufficient_elapsed_samples")

    # --- Publisher rows, pooled across every sample ------------------------
    publisher_rows = []
    for sample in samples:
        for row in (sample.get("publishers") or {}).values():
            if isinstance(row, dict):
                publisher_rows.append(row)
    if publisher_rows:
        # Health must hold for every row; the metric flags only need one row.
        if any(row.get("health_ok") is not True for row in publisher_rows):
            reasons.append("publisher_health_missing")
        for flag_key, reason in (
            ("metrics_ok", "publisher_metrics_missing"),
            ("duplicate_metrics_present", "duplicate_metrics_not_deployed_to_publishers"),
        ):
            if not any(row.get(flag_key) is True for row in publisher_rows):
                reasons.append(reason)

    # --- Most recent manifest comparison -----------------------------------
    # Only dict-valued comparisons count; an empty dict is still a comparison.
    latest_comparison = next(
        (
            sample.get("manifest_comparison")
            for sample in reversed(samples)
            if isinstance(sample.get("manifest_comparison"), dict)
        ),
        None,
    )
    # At most one comparison reason is reported: the first check that fails.
    if latest_comparison is None:
        reasons.append("manifest_comparison_missing")
    elif latest_comparison.get("source_identity_ok") is not True:
        reasons.append("manifest_source_identity_missing_or_not_diverse")
    elif latest_comparison.get("matching_sequence_count", 0) <= 0:
        reasons.append("no_matching_duplicate_sequences")
    elif latest_comparison.get("divergent_sequence_count", 0) > 0:
        reasons.append("duplicate_hash_divergence_observed")

    # --- Per-manifest rows, pooled across every sample ---------------------
    manifest_rows = []
    for sample in samples:
        for row in (sample.get("manifests") or {}).values():
            if isinstance(row, dict):
                manifest_rows.append(row)
    if manifest_rows:
        if not all(row.get("ok") is True for row in manifest_rows):
            reasons.append("manifest_fetch_missing")
        # Any non-zero counter on any row raises the matching reason.
        for counter_key, reason in (
            ("hash_divergent_sequences", "manifest_hash_divergence_observed"),
            ("missing_hash_records", "manifest_hash_missing_records"),
            ("missing_source_identity_records", "manifest_source_identity_missing"),
            ("invalid_lines", "manifest_invalid_lines"),
        ):
            if any(int_or_none(row.get(counter_key)) or 0 for row in manifest_rows):
                reasons.append(reason)

    # --- Prometheus rows, pooled across every sample -----------------------
    prom_rows = []
    for sample in samples:
        for row in sample.get("prometheus") or []:
            if isinstance(row, dict):
                prom_rows.append(row)
    prom_series = [row for row in prom_rows if row.get("series_present") is True]
    if prom_rows and not prom_series:
        reasons.append("prometheus_duplicate_series_missing")

    def series_values(metric_suffix: str) -> list[float]:
        """Return the values of present series whose metric name ends with the suffix."""
        return [
            float(row.get("value") or 0)
            for row in prom_series
            if str(row.get("metric", "")).endswith(metric_suffix)
        ]

    for metric_suffix, reason in (
        ("hash_divergent_sequences", "prometheus_hash_divergence_nonzero"),
        ("missing_source_identity_records", "prometheus_source_identity_missing_nonzero"),
    ):
        if any(value > 0 for value in series_values(metric_suffix)):
            reasons.append(reason)

    return {
        "ok": not reasons,
        "elapsed_ms": elapsed_ms,
        "sample_count": len(samples),
        "reasons": reasons,
        "latest_manifest_comparison": latest_comparison,
        "prometheus_series_present_count": len(prom_series),
        "publisher_count": len(newest.get("publishers") or {}),
    }
def measure(args: argparse.Namespace) -> dict[str, Any]:
    """Collect samples for the requested window and attach their summary.

    One sample is always taken. When ``args.duration_seconds`` is positive,
    sampling repeats with ``args.poll_interval_seconds`` of sleep between
    samples until the monotonic clock shows the duration has elapsed. The
    sleep is never shortened, so a run can outlast the requested duration by
    up to one poll interval plus the time the final sample takes.

    Args:
        args: Parsed command-line options.

    Returns:
        A report dict with ``started_unix_ms``, ``duration_seconds``,
        ``samples`` and ``summary``.
    """
    collected: list[dict[str, Any]] = []
    began = time.monotonic()
    while True:
        collected.append(sample_once(args))
        # A non-positive duration means "single shot"; otherwise stop once the
        # window has been used up.
        if args.duration_seconds <= 0 or time.monotonic() - began >= args.duration_seconds:
            break
        time.sleep(args.poll_interval_seconds)
    return {
        "started_unix_ms": collected[0]["sample_unix_ms"] if collected else now_ms(),
        "duration_seconds": args.duration_seconds,
        "samples": collected,
        "summary": summarize(collected),
    }
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for this tool.

    The module docstring is used as the parser description. The repeatable
    options (``--publisher``, ``--manifest``, ``--agent-manifest``) accumulate
    into lists that default to empty.

    ``--poll-interval-seconds`` is validated at parse time: the value is handed
    to ``time.sleep`` between samples, which raises on negative or non-finite
    input. Without the check, that failure would only surface after the first
    sample had already been collected, and the run would be lost.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """

    def poll_interval(text: str) -> float:
        """Parse a finite, non-negative number of seconds."""
        try:
            seconds = float(text)
        except ValueError:
            raise argparse.ArgumentTypeError(f"invalid float value: {text!r}") from None
        # The chained comparison is false for NaN, negatives and +inf alike.
        if not 0 <= seconds < float("inf"):
            raise argparse.ArgumentTypeError(
                f"must be a finite, non-negative number of seconds, got {text!r}"
            )
        return seconds

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--publisher", action="append", default=[], help="Named node-agent URL, NAME=http://IP:7799.")
    parser.add_argument("--manifest", action="append", default=[], help="Named archive JSONL URL, NAME=https://...")
    parser.add_argument(
        "--agent-manifest",
        action="append",
        default=[],
        help="Named node-agent URL to sample /v1/archive-manifest from, NAME=http://IP:7799.",
    )
    parser.add_argument("--agent-manifest-role", default="publisher-buffer")
    parser.add_argument("--archive-origin", default="", help="Archive origin root for manifests/<broadcast>/<track>.jsonl.")
    parser.add_argument("--prometheus-url", default="", help="Prometheus base URL for Grafana-facing metrics.")
    parser.add_argument("--broadcast", default="", help="Logical broadcast name to measure.")
    parser.add_argument(
        "--track",
        default="publisher.m4s",
        help="Track name to compare. Defaults to publisher-origin proof fragments, not relay video.",
    )
    # A non-positive duration means a single sample is taken.
    parser.add_argument("--duration-seconds", type=float, default=0.0)
    parser.add_argument("--poll-interval-seconds", type=poll_interval, default=30.0)
    parser.add_argument("--timeout", type=float, default=10.0)
    parser.add_argument("--max-manifest-bytes", type=int, default=4 * 1024 * 1024)
    parser.add_argument("--pretty", action="store_true")
    parser.add_argument("--require-ok", action="store_true", help="Exit non-zero unless summary.ok is true.")
    return parser
def main(argv: list[str] | None = None) -> int:
    """Run the measurement from the command line and print the report as JSON.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``1`` when the measurement raised (a JSON error object is written to
        stderr), ``2`` when ``--require-ok`` was given and the summary is not
        ok, otherwise ``0``.
    """
    args = build_parser().parse_args(argv)
    try:
        report = measure(args)
    except Exception as err:  # noqa: BLE001 - command-line tool should preserve exact failure.
        failure = {"ok": False, "error": str(err)}
        print(json.dumps(failure, sort_keys=True), file=sys.stderr)
        return 1

    # indent=None is json.dumps' default, i.e. the compact single-line form.
    print(json.dumps(report, indent=2 if args.pretty else None, sort_keys=True))

    summary_failed = args.require_ok and not report.get("summary", {}).get("ok")
    return 2 if summary_failed else 0
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())