From 91dad67fc21ea230683a68fa542be55223538635 Mon Sep 17 00:00:00 2001 From: "every.channel" Date: Wed, 10 Jun 2026 03:28:55 -0700 Subject: [PATCH] Add duplicate publisher determinism proof --- .forgejo/workflows/ci-gates.yml | 29 + Cargo.lock | 291 +- Cargo.toml | 11 +- crates/ec-chopper/src/lib.rs | 98 +- crates/ec-core/src/lib.rs | 1396 ++ crates/ec-core/src/sim.rs | 2937 ++++ crates/ec-core/tests/simulation.rs | 986 ++ crates/ec-node/Cargo.toml | 16 +- crates/ec-node/src/main.rs | 13310 +++++++++++++++- crates/ec-node/src/nbc.rs | 360 +- .../ec-node/tests/determinism_cmaf_ladder.rs | 127 +- .../e2e_remote_website_watch_existing.rs | 289 +- ...cate-publisher-deterministic-data-layer.md | 334 + .../ECP-0157-rust-simulation-testing.md | 158 + nix/modules/ec-node.nix | 876 +- nix/pkgs/ec-node.nix | 45 +- scripts/measure-duplicate-publishers-test.py | 320 + scripts/measure-duplicate-publishers.py | 581 + 18 files changed, 21569 insertions(+), 595 deletions(-) create mode 100644 crates/ec-core/src/sim.rs create mode 100644 crates/ec-core/tests/simulation.rs create mode 100644 evolution/proposals/ECP-0156-duplicate-publisher-deterministic-data-layer.md create mode 100644 evolution/proposals/ECP-0157-rust-simulation-testing.md create mode 100644 scripts/measure-duplicate-publishers-test.py create mode 100755 scripts/measure-duplicate-publishers.py diff --git a/.forgejo/workflows/ci-gates.yml b/.forgejo/workflows/ci-gates.yml index e37fbdd..c3d2c67 100644 --- a/.forgejo/workflows/ci-gates.yml +++ b/.forgejo/workflows/ci-gates.yml @@ -109,6 +109,35 @@ jobs: fi cargo test -p ec-core -p ec-crypto -p ec-moq -p ec-iroh -p ec-linux-iptv + - name: Duplicate publisher proof gates + shell: bash + run: | + set -euo pipefail + cd .repo + if [[ -f "$HOME/.cargo/env" ]]; then + . 
"$HOME/.cargo/env" + fi + cargo test -p ec-node publisher_proof + cargo test -p ec-node archive_convergence + + - name: Distributed simulation gates + shell: bash + run: | + set -euo pipefail + cd .repo + if [[ -f "$HOME/.cargo/env" ]]; then + . "$HOME/.cargo/env" + fi + cargo test -p ec-node sim_system_ + cargo run -p ec-node -- sim-system \ + --fault-profile foundationdb \ + --seed 1 \ + --iterations 1024 \ + --max-system-complete-ms 6000 \ + --failure-artifact /tmp/ec-sim-system-foundationdb-failure.json \ + --pretty \ + > /tmp/ec-sim-system-foundationdb.json + - name: Build web (apps/web) shell: bash run: | diff --git a/Cargo.lock b/Cargo.lock index b559277..357429c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1038,15 +1038,6 @@ dependencies = [ "alloc-stdlib", ] -[[package]] -name = "buf-list" -version = "1.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6b175f9cf8fffedd4c4b18bcfef092356e952b81f596e148f18e98280994593" -dependencies = [ - "bytes", -] - [[package]] name = "bumpalo" version = "3.19.1" @@ -1375,6 +1366,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "conducer" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d2cb64e61144d6960a830d3e6f2ba3a61d5c0ca689e87e11dc9effb96dcfff5" +dependencies = [ + "smallvec", +] + [[package]] name = "const-hex" version = "1.18.1" @@ -2251,9 +2251,12 @@ dependencies = [ "hex", "iroh", "just-webrtc", - "moq-lite 0.14.0", + "moq-lite 0.16.0", "moq-mux", "moq-native", + "opentelemetry", + "opentelemetry-otlp", + "opentelemetry_sdk", "quinn", "reqwest", "rustls", @@ -2261,9 +2264,10 @@ dependencies = [ "serde", "serde_json", "tokio", - "tokio-tungstenite", + "tokio-tungstenite 0.24.0", "tokio-util", "tracing", + "tracing-opentelemetry", "tracing-subscriber", "url", "urlencoding", @@ -3264,24 +3268,20 @@ checksum = "253b313319f7109de64e480ffb606f89475cd758bae82e096e00c5d95341d30e" [[package]] name = "hang" -version = 
"0.14.0" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f548f7cdc8ec3b9eae085f7b61ff9603d6dc9f09192c5f4b0db4c02577786070" +checksum = "59435f843c8a41ac499ce68828d16c575438e34ffa85b1ea46ba2529bb2a5b16" dependencies = [ - "buf-list", "bytes", "derive_more 2.1.1", - "futures", "hex", "lazy_static", - "moq-lite 0.14.0", + "moq-lite 0.16.0", "regex", "serde", "serde_json", "serde_with", "thiserror 2.0.18", - "tokio", - "tracing", "url", ] @@ -4494,9 +4494,9 @@ checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" [[package]] name = "m3u8-rs" -version = "5.0.5" +version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c1d7ba86f7ea62f17f4310c55e93244619ddc7dadfc7e565de1967e4e41e6e7" +checksum = "f03cd3335fb5f2447755d45cda9c70f76013626a9db44374973791b0926a86c3" dependencies = [ "chrono", "nom", @@ -4707,14 +4707,13 @@ dependencies = [ [[package]] name = "moq-lite" -version = "0.14.0" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8a4c4e66081bc21067488da13f4131540b38b1cb79fb5176ef4ddacd104786b" +checksum = "15b02845fa5cef29b516e0ed60dc95f5904502bf001a8a2790d543fae6571a94" dependencies = [ - "async-channel", "bytes", + "conducer", "futures", - "hex", "num_enum", "rand 0.9.2", "serde", @@ -4726,24 +4725,38 @@ dependencies = [ ] [[package]] -name = "moq-mux" -version = "0.2.1" +name = "moq-msf" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73e2570aa39feef3aa00fa0990862dcdfb44937d3eb9c448c3a4eb1fb8ff43d3" +checksum = "2d61b0d5ce8285c75ed59343934aae278c4c49b1dedf41f1356939b40fab4d29" +dependencies = [ + "serde", + "serde_json", + "serde_with", +] + +[[package]] +name = "moq-mux" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fd5f397f0d147ca8920434a74f092e0846ce23bb1cb5411253123913a3e7576" dependencies = [ 
"anyhow", - "buf-list", + "base64 0.22.1", "bytes", + "conducer", "derive_more 2.1.1", "h264-parser", "hang", "m3u8-rs", - "moq-lite 0.14.0", + "moq-lite 0.16.0", + "moq-msf", "mp4-atom", "num_enum", "reqwest", "scuffle-av1", "scuffle-h265", + "thiserror 2.0.18", "tokio", "tracing", "url", @@ -4751,9 +4764,9 @@ dependencies = [ [[package]] name = "moq-native" -version = "0.13.1" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9848c21bf5db3f8ff5e5a7d89bf2c567f0eb526390c26d5f66f3fec99a6751a5" +checksum = "6942bac34d380bbab511e10069bc0f9615f20109807dab01b52d45e0812dc571" dependencies = [ "anyhow", "clap", @@ -4761,8 +4774,9 @@ dependencies = [ "hex", "humantime", "humantime-serde", - "moq-lite 0.14.0", + "moq-lite 0.16.0", "parking_lot", + "qmux", "quinn", "rand 0.9.2", "rcgen 0.14.7", @@ -4779,7 +4793,6 @@ dependencies = [ "tracing-subscriber", "url", "web-transport-quinn", - "web-transport-ws", ] [[package]] @@ -5519,6 +5532,78 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" +[[package]] +name = "opentelemetry" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b84bcd6ae87133e903af7ef497404dda70c60d0ea14895fc8a5e6722754fc2a0" +dependencies = [ + "futures-core", + "futures-sink", + "js-sys", + "pin-project-lite", + "thiserror 2.0.18", + "tracing", +] + +[[package]] +name = "opentelemetry-http" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7a6d09a73194e6b66df7c8f1b680f156d916a1a942abf2de06823dd02b7855d" +dependencies = [ + "async-trait", + "bytes", + "http", + "opentelemetry", + "reqwest", +] + +[[package]] +name = "opentelemetry-otlp" +version = "0.31.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f69cd6acbb9af919df949cd1ec9e5e7fdc2ef15d234b6b795aaa525cc02f71f" 
+dependencies = [ + "flate2", + "http", + "opentelemetry", + "opentelemetry-http", + "opentelemetry-proto", + "opentelemetry_sdk", + "prost", + "reqwest", + "thiserror 2.0.18", +] + +[[package]] +name = "opentelemetry-proto" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7175df06de5eaee9909d4805a3d07e28bb752c34cab57fa9cff549da596b30f" +dependencies = [ + "opentelemetry", + "opentelemetry_sdk", + "prost", + "tonic", + "tonic-prost", +] + +[[package]] +name = "opentelemetry_sdk" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e14ae4f5991976fd48df6d843de219ca6d31b01daaab2dad5af2badeded372bd" +dependencies = [ + "futures-channel", + "futures-executor", + "futures-util", + "opentelemetry", + "percent-encoding", + "rand 0.9.2", + "thiserror 2.0.18", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -6206,6 +6291,47 @@ dependencies = [ "unarray", ] +[[package]] +name = "prost" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-derive" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" +dependencies = [ + "anyhow", + "itertools 0.13.0", + "proc-macro2", + "quote", + "syn 2.0.114", +] + +[[package]] +name = "qmux" +version = "0.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a87859012c43a1e38dda29f2464e0ee39b0e96d0f95f870a73610bc6f2c3c2" +dependencies = [ + "bytes", + "futures", + "rustls", + "thiserror 2.0.18", + "tokio", + "tokio-rustls", + "tokio-tungstenite 0.28.0", + "tracing", + "web-transport-proto 0.6.0", + "web-transport-trait", +] + [[package]] name = "quick-error" version = "1.2.3" @@ -8315,6 +8441,22 @@ 
dependencies = [ "webpki-roots 0.26.11", ] +[[package]] +name = "tokio-tungstenite" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d25a406cddcc431a75d3d9afc6a7c0f7428d4891dd973e4d54c56b46127bf857" +dependencies = [ + "futures-util", + "log", + "rustls", + "rustls-native-certs", + "rustls-pki-types", + "tokio", + "tokio-rustls", + "tungstenite 0.28.0", +] + [[package]] name = "tokio-util" version = "0.7.18" @@ -8447,6 +8589,38 @@ version = "1.0.6+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab16f14aed21ee8bfd8ec22513f7287cd4a91aa92e44edfe2c17ddd004e92607" +[[package]] +name = "tonic" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac2a5518c70fa84342385732db33fb3f44bc4cc748936eb5833d2df34d6445ef" +dependencies = [ + "async-trait", + "base64 0.22.1", + "bytes", + "http", + "http-body", + "http-body-util", + "percent-encoding", + "pin-project", + "sync_wrapper", + "tokio-stream", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-prost" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50849f68853be452acf590cde0b146665b8d507b3b8af17261df47e02c209ea0" +dependencies = [ + "bytes", + "prost", + "tonic", +] + [[package]] name = "tower" version = "0.5.3" @@ -8567,6 +8741,22 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "tracing-opentelemetry" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ac28f2d093c6c477eaa76b23525478f38de514fa9aeb1285738d4b97a9552fc" +dependencies = [ + "js-sys", + "opentelemetry", + "smallvec", + "tracing", + "tracing-core", + "tracing-log", + "tracing-subscriber", + "web-time", +] + [[package]] name = "tracing-subscriber" version = "0.3.22" @@ -8656,6 +8846,8 @@ dependencies = [ "httparse", "log", "rand 0.9.2", + "rustls", + "rustls-pki-types", "sha1", "thiserror 
2.0.18", "utf-8", @@ -9136,9 +9328,9 @@ dependencies = [ [[package]] name = "web-async" -version = "0.1.1" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6b2260b739b0e95cf9b78f22a64704af7ed9760ea12baa3745b4b97899dc89a" +checksum = "f5414b65d9a5094649bb99987bb74db71febfdfa3677b7954a0a05c99d0424e8" dependencies = [ "tokio", "tracing", @@ -9198,7 +9390,9 @@ dependencies = [ [[package]] name = "web-transport-proto" -version = "0.5.2" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0225d295c8ac00a2e9a498aefeaf3f3c6186da12a251c938189b15b82ea22808" dependencies = [ "bytes", "http", @@ -9210,9 +9404,9 @@ dependencies = [ [[package]] name = "web-transport-quinn" -version = "0.11.4" +version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96b195557749e84091d7b912a25e190e9606283b5121d041faf538b0b55f40d7" +checksum = "cac11b6caf163be7f980442a26fcba15e8074a5f22e85fbb71f0f77d11cecf60" dependencies = [ "bytes", "futures", @@ -9224,34 +9418,19 @@ dependencies = [ "tokio", "tracing", "url", - "web-transport-proto 0.5.2", + "web-transport-proto 0.6.0", "web-transport-trait", ] [[package]] name = "web-transport-trait" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "802d6aa508f2c63c9050ceabc17265bbf90ed4d6f4e4357e987583883628e79c" +checksum = "cb67841c4a481ca3c1412ee4c9f463987401991e1ddc000903df2124f3dc85e9" dependencies = [ "bytes", ] -[[package]] -name = "web-transport-ws" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7b1cd89c36a28eae759329839e85f7dbca733896f048a6daaf5f8fc80f3bcba" -dependencies = [ - "bytes", - "futures", - "thiserror 2.0.18", - "tokio", - "tokio-tungstenite", - "web-transport-proto 0.5.2", - "web-transport-trait", -] - [[package]] name = "webkit2gtk" version = "2.0.1" diff --git a/Cargo.toml b/Cargo.toml 
index ad99b8f..4c430f3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,12 +33,9 @@ blake3 = "1" clap = { version = "4", features = ["derive"] } serde = { version = "1", features = ["derive"] } serde_json = "1" +opentelemetry = { version = "0.31", features = ["trace"] } +opentelemetry-otlp = { version = "0.31", default-features = false, features = ["http-proto", "reqwest-client", "trace", "gzip-http"] } +opentelemetry_sdk = { version = "0.31", features = ["trace"] } tracing = "0.1" +tracing-opentelemetry = "0.32" tracing-subscriber = "0.3" - -[patch.crates-io] -# Cloudflare's relay uses standard WebTransport subprotocol negotiation. The upstream -# `web-transport-proto` crate (used by `web-transport-quinn`) currently uses legacy -# header names (`wt-available-protocols` / `wt-protocol`), which prevents negotiating -# `moqt-*` and causes the relay to close after MoQ SETUP. -web-transport-proto = { path = "third_party/web-transport-proto" } diff --git a/crates/ec-chopper/src/lib.rs b/crates/ec-chopper/src/lib.rs index 098d241..5292bfa 100644 --- a/crates/ec-chopper/src/lib.rs +++ b/crates/ec-chopper/src/lib.rs @@ -12,6 +12,7 @@ use ec_core::{ }; use ec_ts::{SectionAssembler, TimeSyncEngine, TimeSyncUpdate, TsReader}; use serde::{Deserialize, Serialize}; +use std::collections::VecDeque; use std::fs; use std::io::{Read, Write}; use std::path::{Path, PathBuf}; @@ -299,12 +300,55 @@ pub fn chunk_ts_stream( }) } +pub fn chunk_ts_stream_with_preroll( + stream: T, + output_dir: &Path, + chunk_duration_ms: u64, + max_chunks: Option, + preroll_packets: usize, +) -> Result { + let mut chunks = Vec::new(); + chunk_ts_stream_live_with_preroll( + stream, + output_dir, + chunk_duration_ms, + max_chunks, + preroll_packets, + |chunk| { + chunks.push(chunk); + Ok(()) + }, + )?; + Ok(TsChunkManifest { + output_dir: output_dir.to_path_buf(), + chunks, + }) +} + pub fn chunk_ts_stream_live Result<()>>( stream: T, output_dir: &Path, chunk_duration_ms: u64, max_chunks: Option, mut 
on_chunk: F, +) -> Result<()> { + chunk_ts_stream_live_with_preroll( + stream, + output_dir, + chunk_duration_ms, + max_chunks, + 0, + |chunk| on_chunk(chunk), + ) +} + +pub fn chunk_ts_stream_live_with_preroll Result<()>>( + stream: T, + output_dir: &Path, + chunk_duration_ms: u64, + max_chunks: Option, + preroll_packets: usize, + mut on_chunk: F, ) -> Result<()> { fs::create_dir_all(output_dir) .with_context(|| format!("failed to create {}", output_dir.display()))?; @@ -317,6 +361,7 @@ pub fn chunk_ts_stream_live Result<()>>( let mut current_file: Option = None; let mut current_timing: Option = None; let mut emitted = 0usize; + let mut preroll = VecDeque::<[u8; ec_ts::TS_PACKET_SIZE]>::with_capacity(preroll_packets); let mut close_and_emit = |index: u64, timing: ChunkTiming, file: std::fs::File| -> Result { @@ -332,6 +377,7 @@ pub fn chunk_ts_stream_live Result<()>>( }; while let Some(packet) = reader.read_packet()? { + let packet_bytes = *packet.as_bytes(); let updates = engine.ingest_packet(&packet, &mut assembler); for update in updates { if update.discontinuity { @@ -344,6 +390,7 @@ pub fn chunk_ts_stream_live Result<()>>( return Ok(()); } } + preroll.clear(); } if let Some(index) = update.chunk_index { @@ -359,8 +406,11 @@ pub fn chunk_ts_stream_live Result<()>>( } let path = chunk_path(output_dir, index); - let file = std::fs::File::create(&path) + let mut file = std::fs::File::create(&path) .with_context(|| format!("failed to create {}", path.display()))?; + for bytes in &preroll { + file.write_all(bytes)?; + } current_file = Some(file); current_index = Some(index); current_timing = Some(ChunkTiming { @@ -381,6 +431,13 @@ pub fn chunk_ts_stream_live Result<()>>( if let Some(file) = current_file.as_mut() { file.write_all(packet.as_bytes())?; } + + if preroll_packets > 0 { + preroll.push_back(packet_bytes); + while preroll.len() > preroll_packets { + preroll.pop_front(); + } + } } if let (Some(index), Some(timing), Some(file)) = ( @@ -388,7 +445,7 @@ pub fn 
chunk_ts_stream_live Result<()>>( current_timing.take(), current_file.take(), ) { - let _ = close_and_emit(index, timing, file); + close_and_emit(index, timing, file)?; } Ok(()) @@ -929,6 +986,43 @@ mod tests { let _ = fs::remove_dir_all(&dir); } + #[test] + fn chunk_ts_stream_with_preroll_prepends_previous_packets() { + let chunk_ms = 1000u64; + let dir = + std::env::temp_dir().join(format!("ec-chopper-chunks-preroll-{}", std::process::id())); + let _ = fs::remove_dir_all(&dir); + fs::create_dir_all(&dir).unwrap(); + + let packet0 = ts_packet_with_pcr(0x0100, 0, 0); + let packet1 = ts_packet_with_pcr(0x0100, 1, 27_000_000); + let packet2 = ts_packet_with_pcr(0x0100, 2, 54_000_000); + let mut bytes = Vec::new(); + bytes.extend_from_slice(&packet0); + bytes.extend_from_slice(&packet1); + bytes.extend_from_slice(&packet2); + + let manifest = + chunk_ts_stream_with_preroll(Cursor::new(bytes), &dir, chunk_ms, None, 1).unwrap(); + let indices = manifest.chunks.iter().map(|c| c.index).collect::>(); + assert_eq!(indices, vec![0, 1, 2]); + + assert_eq!( + fs::read(&manifest.chunks[0].path).unwrap(), + packet0.to_vec() + ); + assert_eq!( + fs::read(&manifest.chunks[1].path).unwrap(), + [packet0, packet1].concat() + ); + assert_eq!( + fs::read(&manifest.chunks[2].path).unwrap(), + [packet1, packet2].concat() + ); + + let _ = fs::remove_dir_all(&dir); + } + #[test] fn hashed_manifest_merkle_root_matches_core() { let dir = std::env::temp_dir().join(format!("ec-chopper-merkle-{}", std::process::id())); diff --git a/crates/ec-core/src/lib.rs b/crates/ec-core/src/lib.rs index 585db12..93302f4 100644 --- a/crates/ec-core/src/lib.rs +++ b/crates/ec-core/src/lib.rs @@ -1,11 +1,40 @@ //! Core types shared across every.channel. 
+pub mod sim; + use serde::{Deserialize, Serialize}; use sha3::{Digest, Keccak256}; +use std::collections::BTreeMap; use std::fmt; pub const MANIFEST_ID_ALG_BLAKE3: &str = "blake3"; pub const MANIFEST_ID_ALG_KECCAK256: &str = "keccak256"; +pub const RECORD_ID_ALG_BLAKE3: &str = "blake3"; +pub const RECORD_ID_ALG_KECCAK256: &str = "keccak256"; +pub const RECORD_TYPE_COMMERCIAL_RANGE_SET: &str = "commercial_range_set"; +pub const RECORD_TYPE_CLIP: &str = "clip"; +pub const RECORD_TYPE_CHANNEL_METADATA: &str = "channel_metadata"; +pub const RECORD_TYPE_FRIEND_COMMENT: &str = "friend_comment"; +pub const RECORD_TYPE_OVERLAY: &str = "overlay"; +pub const RECORD_TYPE_SOURCE_ATTESTATION: &str = "source_attestation"; +pub const RECORD_VISIBILITY_PRIVATE: &str = "private"; +pub const RECORD_VISIBILITY_FRIENDS: &str = "friends"; +pub const RECORD_VISIBILITY_UNLISTED: &str = "unlisted"; +pub const RECORD_VISIBILITY_PUBLIC: &str = "public"; +pub const RECORD_CONTENT_HASH_BLAKE3: &str = "blake3"; +pub const RECORD_CONTENT_HASH_SHA256: &str = "sha256"; +pub const RECORD_CONTENT_HASH_IPFS_CID_V1: &str = "ipfs-cid-v1"; +pub const RECORD_CONTENT_ADDRESS_THRESHOLD_BYTES: usize = 16 * 1024; +pub const RECORD_NFT_PROJECTION_ERC721: &str = "erc721"; +pub const RECORD_NFT_PROJECTION_ERC1155: &str = "erc1155"; +pub const RECORD_UI_RECOGNITION_COMMERCIAL_RANGES: &str = "commercial-ranges"; +pub const RECORD_UI_RECOGNITION_CLIP: &str = "clip"; +pub const RECORD_UI_RECOGNITION_CHANNEL_METADATA: &str = "channel-metadata"; +pub const RECORD_UI_RECOGNITION_FRIEND_COMMENT: &str = "friend-comment"; +pub const RECORD_UI_RECOGNITION_OVERLAY: &str = "overlay"; +pub const RECORD_UI_RECOGNITION_SOURCE_ATTESTATION: &str = "source-attestation"; +pub const RECORD_SIG_ALG_SECP256K1_EIP712_BODY_V1: &str = "secp256k1-eip712-signed-record-body-v1"; +pub const RECORD_ACTIVITYPUB_INLINE_CONTENT_MAX_BYTES: usize = 2 * 1024; pub const MERKLE_PROOF_ALG_BLAKE3: &str = "merkle+blake3"; pub const 
MERKLE_PROOF_ALG_KECCAK256: &str = "merkle+keccak256"; @@ -25,6 +54,835 @@ pub struct ChainCommitment { pub digest: String, } +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct RecordTimeRange { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub stream_id: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub rendition_id: Option, + pub start_unix_ms: u64, + pub end_unix_ms: u64, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RecordContentHash { + pub alg: String, + pub digest: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub uri: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SignedRecordBody { + pub record_type: String, + pub subject: String, + #[serde(default)] + pub time_ranges: Vec, + #[serde(default)] + pub content_hashes: Vec, + pub source: String, + pub visibility: String, + pub created_unix_ms: u64, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub valid_from_unix_ms: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub valid_until_unix_ms: Option, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub supersedes: Vec, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub revokes: Vec, + #[serde(default)] + pub metadata: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SignedRecordSignature { + pub signer_id: String, + pub alg: String, + pub signature: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SignedRecord { + pub body: SignedRecordBody, + pub record_id: String, + #[serde(default)] + pub signatures: Vec, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub commitments: Vec, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct RecordTypePolicy { + pub record_type: &'static str, + pub nft_projection: &'static str, + pub ui_recognition: &'static str, + pub requires_time_range: bool, + pub 
default_visibility: &'static str, + pub content_address_threshold_bytes: usize, + pub activitypub_object_type: Option<&'static str>, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum RecordBlobPlacement { + InlineRecord, + ContentAddressedBlob, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct SignedRecordProjection { + pub record_type: String, + pub subject: String, + pub visibility: String, + pub nft_projection: String, + pub ui_recognition: String, + pub default_visibility: String, + pub uses_default_visibility: bool, + pub requires_time_range: bool, + pub time_range_count: usize, + pub content_hash_count: usize, + pub primary_content_uri: Option, + pub primary_ipfs_uri: Option, + pub metadata_inline_bytes: usize, + pub metadata_blob_placement: RecordBlobPlacement, + pub has_content_addressed_blob: bool, + pub has_ipfs_reference: bool, + pub activitypub_object_type: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct SignedRecordActivityPubObject { + pub id: String, + #[serde(rename = "type")] + pub object_type: String, + pub attributed_to: String, + pub audience: String, + pub subject: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub name: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub content: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub url: Option, + pub published_unix_ms: u64, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub time_ranges: Vec, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct SignedRecordIndexCount { + pub name: String, + pub count: usize, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct SignedRecordIndexEntry { + pub source_index: usize, + pub record_id: String, + pub record_type: String, + pub subject: 
String, + pub source: String, + pub visibility: String, + pub created_unix_ms: u64, + pub valid_from_unix_ms: Option, + pub valid_until_unix_ms: Option, + pub supersedes: Vec, + pub revokes: Vec, + pub signature_count: usize, + pub commitment_count: usize, + pub projection: SignedRecordProjection, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub activitypub_object: Option, + pub requires_content_addressed_blob: bool, + pub needs_content_addressed_blob: bool, + pub needs_ipfs_reference: bool, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct SignedRecordIndexRejection { + pub source_index: usize, + pub record_id: Option, + pub record_type: Option, + pub subject: Option, + pub reason: String, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct SignedRecordIndexSummary { + pub total_records: usize, + pub accepted_records: usize, + pub rejected_records: usize, + pub public_records: usize, + pub activitypub_records: usize, + pub records_requiring_content_addressed_blob: usize, + pub records_missing_required_content_addressed_blob: usize, + pub records_missing_ipfs_reference: usize, + pub by_record_type: Vec, + pub by_nft_projection: Vec, + pub by_ui_recognition: Vec, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct SignedRecordIndex { + pub schema: String, + pub generated_unix_ms: u64, + pub summary: SignedRecordIndexSummary, + pub entries: Vec, + pub rejected: Vec, +} + +impl SignedRecordBody { + pub fn record_id(&self) -> Result { + self.record_id_blake3() + } + + pub fn record_id_blake3(&self) -> Result { + let bytes = serde_json::to_vec(self)?; + Ok(blake3::hash(&bytes).to_hex().to_string()) + } + + pub fn record_id_keccak256(&self) -> Result { + let bytes = serde_json::to_vec(self)?; + Ok(hex::encode(keccak256(&bytes))) + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum SignedRecordValidationError { + EmptyRecordType, + UnknownRecordType(String), + 
EmptySubject, + EmptySource, + UnknownVisibility(String), + MissingTimeRange(String), + InvalidTimeRange { + start_unix_ms: u64, + end_unix_ms: u64, + }, + EmptyContentHashAlg, + UnknownContentHashAlg(String), + InvalidContentHashDigest { + alg: String, + digest: String, + }, + InvalidContentHashUri { + alg: String, + uri: String, + }, + InvalidValidityWindow { + valid_from_unix_ms: u64, + valid_until_unix_ms: u64, + }, + RecordIdEncoding(String), + InvalidRecordId { + expected: String, + actual: String, + }, + MissingSignature, + EmptySignerId, + InvalidSignatureAlg(String), + InvalidSignatureHex(String), +} + +impl fmt::Display for SignedRecordValidationError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + SignedRecordValidationError::EmptyRecordType => write!(f, "record_type is empty"), + SignedRecordValidationError::UnknownRecordType(value) => { + write!(f, "unknown record_type: {value}") + } + SignedRecordValidationError::EmptySubject => write!(f, "subject is empty"), + SignedRecordValidationError::EmptySource => write!(f, "source is empty"), + SignedRecordValidationError::UnknownVisibility(value) => { + write!(f, "unknown visibility: {value}") + } + SignedRecordValidationError::MissingTimeRange(record_type) => { + write!(f, "{record_type} records require at least one time range") + } + SignedRecordValidationError::InvalidTimeRange { + start_unix_ms, + end_unix_ms, + } => write!( + f, + "invalid time range: end_unix_ms {end_unix_ms} must be after start_unix_ms {start_unix_ms}" + ), + SignedRecordValidationError::EmptyContentHashAlg => { + write!(f, "content hash algorithm is empty") + } + SignedRecordValidationError::UnknownContentHashAlg(value) => { + write!(f, "unknown content hash algorithm: {value}") + } + SignedRecordValidationError::InvalidContentHashDigest { alg, digest } => { + write!(f, "invalid {alg} content hash digest: {digest}") + } + SignedRecordValidationError::InvalidContentHashUri { alg, uri } => { + write!(f, 
"invalid {alg} content hash uri: {uri}") + } + SignedRecordValidationError::InvalidValidityWindow { + valid_from_unix_ms, + valid_until_unix_ms, + } => write!( + f, + "invalid validity window: valid_until_unix_ms {valid_until_unix_ms} must be after valid_from_unix_ms {valid_from_unix_ms}" + ), + SignedRecordValidationError::RecordIdEncoding(value) => { + write!(f, "record ID encoding failed: {value}") + } + SignedRecordValidationError::InvalidRecordId { expected, actual } => { + write!(f, "invalid record_id: expected {expected}, got {actual}") + } + SignedRecordValidationError::MissingSignature => { + write!(f, "signed record has no signatures") + } + SignedRecordValidationError::EmptySignerId => write!(f, "signature signer_id is empty"), + SignedRecordValidationError::InvalidSignatureAlg(value) => { + write!(f, "unsupported signature algorithm: {value}") + } + SignedRecordValidationError::InvalidSignatureHex(value) => { + write!(f, "invalid signature hex: {value}") + } + } + } +} + +impl std::error::Error for SignedRecordValidationError {} + +pub fn is_known_record_type(value: &str) -> bool { + matches!( + value, + RECORD_TYPE_COMMERCIAL_RANGE_SET + | RECORD_TYPE_CLIP + | RECORD_TYPE_CHANNEL_METADATA + | RECORD_TYPE_FRIEND_COMMENT + | RECORD_TYPE_OVERLAY + | RECORD_TYPE_SOURCE_ATTESTATION + ) +} + +pub fn is_known_record_visibility(value: &str) -> bool { + matches!( + value, + RECORD_VISIBILITY_PRIVATE + | RECORD_VISIBILITY_FRIENDS + | RECORD_VISIBILITY_UNLISTED + | RECORD_VISIBILITY_PUBLIC + ) +} + +pub fn record_type_policy(record_type: &str) -> Option { + let requires_time_range = record_type_requires_time_range(record_type); + let content_address_threshold_bytes = RECORD_CONTENT_ADDRESS_THRESHOLD_BYTES; + match record_type { + RECORD_TYPE_COMMERCIAL_RANGE_SET => Some(RecordTypePolicy { + record_type: RECORD_TYPE_COMMERCIAL_RANGE_SET, + nft_projection: RECORD_NFT_PROJECTION_ERC1155, + ui_recognition: RECORD_UI_RECOGNITION_COMMERCIAL_RANGES, + 
requires_time_range, + default_visibility: RECORD_VISIBILITY_PUBLIC, + content_address_threshold_bytes, + activitypub_object_type: None, + }), + RECORD_TYPE_CLIP => Some(RecordTypePolicy { + record_type: RECORD_TYPE_CLIP, + nft_projection: RECORD_NFT_PROJECTION_ERC721, + ui_recognition: RECORD_UI_RECOGNITION_CLIP, + requires_time_range, + default_visibility: RECORD_VISIBILITY_UNLISTED, + content_address_threshold_bytes, + activitypub_object_type: Some("Video"), + }), + RECORD_TYPE_CHANNEL_METADATA => Some(RecordTypePolicy { + record_type: RECORD_TYPE_CHANNEL_METADATA, + nft_projection: RECORD_NFT_PROJECTION_ERC721, + ui_recognition: RECORD_UI_RECOGNITION_CHANNEL_METADATA, + requires_time_range, + default_visibility: RECORD_VISIBILITY_PUBLIC, + content_address_threshold_bytes, + activitypub_object_type: None, + }), + RECORD_TYPE_FRIEND_COMMENT => Some(RecordTypePolicy { + record_type: RECORD_TYPE_FRIEND_COMMENT, + nft_projection: RECORD_NFT_PROJECTION_ERC721, + ui_recognition: RECORD_UI_RECOGNITION_FRIEND_COMMENT, + requires_time_range, + default_visibility: RECORD_VISIBILITY_FRIENDS, + content_address_threshold_bytes, + activitypub_object_type: Some("Note"), + }), + RECORD_TYPE_OVERLAY => Some(RecordTypePolicy { + record_type: RECORD_TYPE_OVERLAY, + nft_projection: RECORD_NFT_PROJECTION_ERC1155, + ui_recognition: RECORD_UI_RECOGNITION_OVERLAY, + requires_time_range, + default_visibility: RECORD_VISIBILITY_PUBLIC, + content_address_threshold_bytes, + activitypub_object_type: Some("Object"), + }), + RECORD_TYPE_SOURCE_ATTESTATION => Some(RecordTypePolicy { + record_type: RECORD_TYPE_SOURCE_ATTESTATION, + nft_projection: RECORD_NFT_PROJECTION_ERC721, + ui_recognition: RECORD_UI_RECOGNITION_SOURCE_ATTESTATION, + requires_time_range, + default_visibility: RECORD_VISIBILITY_PUBLIC, + content_address_threshold_bytes, + activitypub_object_type: None, + }), + _ => None, + } +} + +pub fn record_payload_blob_placement( + record_type: &str, + payload_bytes: usize, +) -> Option 
{ + let policy = record_type_policy(record_type)?; + if payload_bytes > policy.content_address_threshold_bytes { + Some(RecordBlobPlacement::ContentAddressedBlob) + } else { + Some(RecordBlobPlacement::InlineRecord) + } +} + +pub fn signed_record_inline_metadata_bytes(body: &SignedRecordBody) -> usize { + body.metadata + .iter() + .map(|item| item.key.len() + item.value.len()) + .sum() +} + +pub fn signed_record_metadata_blob_placement( + body: &SignedRecordBody, +) -> Option { + record_payload_blob_placement(&body.record_type, signed_record_inline_metadata_bytes(body)) +} + +pub fn signed_record_has_content_addressed_blob(body: &SignedRecordBody) -> bool { + !body.content_hashes.is_empty() +} + +pub fn signed_record_has_ipfs_reference(body: &SignedRecordBody) -> bool { + body.content_hashes.iter().any(|hash| { + hash.alg == RECORD_CONTENT_HASH_IPFS_CID_V1 + || hash + .uri + .as_deref() + .map(|uri| uri.trim_start().starts_with("ipfs://")) + .unwrap_or(false) + }) +} + +pub fn signed_record_primary_content_uri(body: &SignedRecordBody) -> Option { + body.content_hashes.iter().find_map(record_hash_content_uri) +} + +pub fn signed_record_primary_ipfs_uri(body: &SignedRecordBody) -> Option { + body.content_hashes.iter().find_map(record_hash_ipfs_uri) +} + +pub fn signed_record_projection(body: &SignedRecordBody) -> Option { + let policy = record_type_policy(&body.record_type)?; + let metadata_inline_bytes = signed_record_inline_metadata_bytes(body); + let metadata_blob_placement = + record_payload_blob_placement(&body.record_type, metadata_inline_bytes)?; + let primary_ipfs_uri = signed_record_primary_ipfs_uri(body); + Some(SignedRecordProjection { + record_type: policy.record_type.to_string(), + subject: body.subject.clone(), + visibility: body.visibility.clone(), + nft_projection: policy.nft_projection.to_string(), + ui_recognition: policy.ui_recognition.to_string(), + default_visibility: policy.default_visibility.to_string(), + uses_default_visibility: 
body.visibility == policy.default_visibility, + requires_time_range: policy.requires_time_range, + time_range_count: body.time_ranges.len(), + content_hash_count: body.content_hashes.len(), + primary_content_uri: signed_record_primary_content_uri(body), + primary_ipfs_uri: primary_ipfs_uri.clone(), + metadata_inline_bytes, + metadata_blob_placement, + has_content_addressed_blob: signed_record_has_content_addressed_blob(body) + || metadata_blob_placement == RecordBlobPlacement::ContentAddressedBlob, + has_ipfs_reference: primary_ipfs_uri.is_some() || signed_record_has_ipfs_reference(body), + activitypub_object_type: policy.activitypub_object_type.map(str::to_string), + }) +} + +fn signed_record_metadata_value<'a>(body: &'a SignedRecordBody, keys: &[&str]) -> Option<&'a str> { + body.metadata.iter().find_map(|item| { + let key = item.key.trim(); + if keys + .iter() + .any(|candidate| key.eq_ignore_ascii_case(candidate)) + { + let value = item.value.trim(); + if !value.is_empty() { + return Some(value); + } + } + None + }) +} + +fn truncate_utf8_bytes(value: &str, max_bytes: usize) -> String { + if value.len() <= max_bytes { + return value.to_string(); + } + let mut end = max_bytes; + while end > 0 && !value.is_char_boundary(end) { + end -= 1; + } + value[..end].to_string() +} + +pub fn signed_record_activitypub_object( + record: &SignedRecord, +) -> Option { + let projection = signed_record_projection(&record.body)?; + let object_type = projection.activitypub_object_type.clone()?; + let content = signed_record_metadata_value(&record.body, &["text", "comment", "content"]) + .map(|value| truncate_utf8_bytes(value, RECORD_ACTIVITYPUB_INLINE_CONTENT_MAX_BYTES)); + let name = signed_record_metadata_value(&record.body, &["title", "name", "label"]) + .map(|value| truncate_utf8_bytes(value, 256)); + + Some(SignedRecordActivityPubObject { + id: format!("urn:every-channel:signed-record:{}", record.record_id), + object_type, + attributed_to: record.body.source.clone(), + 
audience: record.body.visibility.clone(), + subject: record.body.subject.clone(), + name, + content, + url: projection.primary_content_uri, + published_unix_ms: record.body.created_unix_ms, + time_ranges: record.body.time_ranges.clone(), + }) +} + +fn count_rows(counts: BTreeMap) -> Vec { + counts + .into_iter() + .map(|(name, count)| SignedRecordIndexCount { name, count }) + .collect() +} + +fn signed_record_index_entry( + source_index: usize, + record: &SignedRecord, + projection: SignedRecordProjection, +) -> SignedRecordIndexEntry { + let requires_content_addressed_blob = + projection.metadata_blob_placement == RecordBlobPlacement::ContentAddressedBlob; + let has_explicit_content_addressed_blob = + signed_record_has_content_addressed_blob(&record.body); + let has_ipfs_reference = projection.has_ipfs_reference; + SignedRecordIndexEntry { + source_index, + record_id: record.record_id.clone(), + record_type: record.body.record_type.clone(), + subject: record.body.subject.clone(), + source: record.body.source.clone(), + visibility: record.body.visibility.clone(), + created_unix_ms: record.body.created_unix_ms, + valid_from_unix_ms: record.body.valid_from_unix_ms, + valid_until_unix_ms: record.body.valid_until_unix_ms, + supersedes: record.body.supersedes.clone(), + revokes: record.body.revokes.clone(), + signature_count: record.signatures.len(), + commitment_count: record.commitments.len(), + activitypub_object: signed_record_activitypub_object(record), + projection, + requires_content_addressed_blob, + needs_content_addressed_blob: requires_content_addressed_blob + && !has_explicit_content_addressed_blob, + needs_ipfs_reference: has_explicit_content_addressed_blob && !has_ipfs_reference, + } +} + +fn signed_record_index_rejection( + source_index: usize, + record: &SignedRecord, + reason: impl Into, +) -> SignedRecordIndexRejection { + let record_id = (!record.record_id.trim().is_empty()).then(|| record.record_id.clone()); + let record_type = + 
(!record.body.record_type.trim().is_empty()).then(|| record.body.record_type.clone()); + let subject = (!record.body.subject.trim().is_empty()).then(|| record.body.subject.clone()); + SignedRecordIndexRejection { + source_index, + record_id, + record_type, + subject, + reason: reason.into(), + } +} + +pub fn signed_record_index(records: &[SignedRecord], generated_unix_ms: u64) -> SignedRecordIndex { + let mut entries = Vec::new(); + let mut rejected = Vec::new(); + + for (source_index, record) in records.iter().enumerate() { + if let Err(err) = validate_signed_record(record) { + rejected.push(signed_record_index_rejection( + source_index, + record, + err.to_string(), + )); + continue; + } + let Some(projection) = signed_record_projection(&record.body) else { + rejected.push(signed_record_index_rejection( + source_index, + record, + "record does not project", + )); + continue; + }; + entries.push(signed_record_index_entry(source_index, record, projection)); + } + + entries.sort_by(|a, b| { + a.created_unix_ms + .cmp(&b.created_unix_ms) + .then_with(|| a.record_id.cmp(&b.record_id)) + .then_with(|| a.source_index.cmp(&b.source_index)) + }); + + let mut by_record_type = BTreeMap::::new(); + let mut by_nft_projection = BTreeMap::::new(); + let mut by_ui_recognition = BTreeMap::::new(); + let mut public_records = 0; + let mut activitypub_records = 0; + let mut records_requiring_content_addressed_blob = 0; + let mut records_missing_required_content_addressed_blob = 0; + let mut records_missing_ipfs_reference = 0; + + for entry in &entries { + *by_record_type.entry(entry.record_type.clone()).or_default() += 1; + *by_nft_projection + .entry(entry.projection.nft_projection.clone()) + .or_default() += 1; + *by_ui_recognition + .entry(entry.projection.ui_recognition.clone()) + .or_default() += 1; + if entry.visibility == RECORD_VISIBILITY_PUBLIC { + public_records += 1; + } + if entry.projection.activitypub_object_type.is_some() { + activitypub_records += 1; + } + if 
entry.requires_content_addressed_blob { + records_requiring_content_addressed_blob += 1; + } + if entry.needs_content_addressed_blob { + records_missing_required_content_addressed_blob += 1; + } + if entry.needs_ipfs_reference { + records_missing_ipfs_reference += 1; + } + } + + SignedRecordIndex { + schema: "https://every.channel/schemas/signed-record-index/v1".to_string(), + generated_unix_ms, + summary: SignedRecordIndexSummary { + total_records: records.len(), + accepted_records: entries.len(), + rejected_records: rejected.len(), + public_records, + activitypub_records, + records_requiring_content_addressed_blob, + records_missing_required_content_addressed_blob, + records_missing_ipfs_reference, + by_record_type: count_rows(by_record_type), + by_nft_projection: count_rows(by_nft_projection), + by_ui_recognition: count_rows(by_ui_recognition), + }, + entries, + rejected, + } +} + +fn record_hash_content_uri(hash: &RecordContentHash) -> Option { + hash.uri + .as_deref() + .map(str::trim) + .filter(|uri| !uri.is_empty()) + .map(str::to_string) + .or_else(|| record_hash_ipfs_uri(hash)) +} + +fn record_hash_ipfs_uri(hash: &RecordContentHash) -> Option { + if let Some(uri) = hash + .uri + .as_deref() + .map(str::trim) + .filter(|uri| uri.starts_with("ipfs://")) + { + return Some(uri.to_string()); + } + if hash.alg == RECORD_CONTENT_HASH_IPFS_CID_V1 { + let digest = hash.digest.trim(); + if !digest.is_empty() { + return Some(format!("ipfs://{digest}")); + } + } + None +} + +pub fn validate_record_content_hash( + hash: &RecordContentHash, +) -> Result<(), SignedRecordValidationError> { + let alg = hash.alg.as_str(); + if alg.trim().is_empty() { + return Err(SignedRecordValidationError::EmptyContentHashAlg); + } + match alg { + RECORD_CONTENT_HASH_BLAKE3 | RECORD_CONTENT_HASH_SHA256 => { + if !is_hex_bytes(&hash.digest, 32) { + return Err(SignedRecordValidationError::InvalidContentHashDigest { + alg: alg.to_string(), + digest: hash.digest.clone(), + }); + } + } + 
RECORD_CONTENT_HASH_IPFS_CID_V1 => { + let digest = hash.digest.trim(); + if digest.is_empty() + || digest.chars().any(char::is_whitespace) + || !(digest.starts_with('b') || digest.starts_with('z')) + { + return Err(SignedRecordValidationError::InvalidContentHashDigest { + alg: alg.to_string(), + digest: hash.digest.clone(), + }); + } + if let Some(uri) = &hash.uri { + let trimmed = uri.trim(); + if trimmed.is_empty() || !trimmed.starts_with("ipfs://") { + return Err(SignedRecordValidationError::InvalidContentHashUri { + alg: alg.to_string(), + uri: uri.clone(), + }); + } + } + } + _ => { + return Err(SignedRecordValidationError::UnknownContentHashAlg( + alg.to_string(), + )) + } + } + Ok(()) +} + +pub fn validate_signed_record_body( + body: &SignedRecordBody, +) -> Result<(), SignedRecordValidationError> { + let record_type = body.record_type.as_str(); + if record_type.trim().is_empty() { + return Err(SignedRecordValidationError::EmptyRecordType); + } + if !is_known_record_type(record_type) { + return Err(SignedRecordValidationError::UnknownRecordType( + body.record_type.clone(), + )); + } + if body.subject.trim().is_empty() { + return Err(SignedRecordValidationError::EmptySubject); + } + if body.source.trim().is_empty() { + return Err(SignedRecordValidationError::EmptySource); + } + let visibility = body.visibility.as_str(); + if !is_known_record_visibility(visibility) { + return Err(SignedRecordValidationError::UnknownVisibility( + body.visibility.clone(), + )); + } + if record_type_requires_time_range(record_type) && body.time_ranges.is_empty() { + return Err(SignedRecordValidationError::MissingTimeRange( + record_type.to_string(), + )); + } + for range in &body.time_ranges { + if range.end_unix_ms <= range.start_unix_ms { + return Err(SignedRecordValidationError::InvalidTimeRange { + start_unix_ms: range.start_unix_ms, + end_unix_ms: range.end_unix_ms, + }); + } + } + for hash in &body.content_hashes { + validate_record_content_hash(hash)?; + } + if let 
(Some(valid_from_unix_ms), Some(valid_until_unix_ms)) = + (body.valid_from_unix_ms, body.valid_until_unix_ms) + { + if valid_until_unix_ms <= valid_from_unix_ms { + return Err(SignedRecordValidationError::InvalidValidityWindow { + valid_from_unix_ms, + valid_until_unix_ms, + }); + } + } + Ok(()) +} + +pub fn validate_signed_record(record: &SignedRecord) -> Result<(), SignedRecordValidationError> { + validate_signed_record_body(&record.body)?; + let expected = record + .body + .record_id() + .map_err(|err| SignedRecordValidationError::RecordIdEncoding(err.to_string()))?; + if record.record_id != expected { + return Err(SignedRecordValidationError::InvalidRecordId { + expected, + actual: record.record_id.clone(), + }); + } + if record.signatures.is_empty() { + return Err(SignedRecordValidationError::MissingSignature); + } + for signature in &record.signatures { + if signature.signer_id.trim().is_empty() { + return Err(SignedRecordValidationError::EmptySignerId); + } + if signature.alg != RECORD_SIG_ALG_SECP256K1_EIP712_BODY_V1 { + return Err(SignedRecordValidationError::InvalidSignatureAlg( + signature.alg.clone(), + )); + } + if !is_hex_bytes(&signature.signature, 65) { + return Err(SignedRecordValidationError::InvalidSignatureHex( + signature.signature.clone(), + )); + } + } + Ok(()) +} + +fn record_type_requires_time_range(record_type: &str) -> bool { + matches!( + record_type, + RECORD_TYPE_COMMERCIAL_RANGE_SET + | RECORD_TYPE_CLIP + | RECORD_TYPE_FRIEND_COMMENT + | RECORD_TYPE_OVERLAY + ) +} + +fn is_hex_bytes(value: &str, expected_bytes: usize) -> bool { + let trimmed = value.trim(); + let hex_value = trimmed.strip_prefix("0x").unwrap_or(trimmed); + hex_value.len() == expected_bytes * 2 + && hex_value + .as_bytes() + .iter() + .all(|byte| byte.is_ascii_hexdigit()) +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct StreamDescriptor { pub id: StreamId, @@ -700,6 +1558,544 @@ mod tests { ); } + #[test] + fn 
signed_record_id_defaults_to_blake3_and_changes_with_body() { + let body = SignedRecordBody { + record_type: "friend_comment".to_string(), + subject: "channel:la-kcop".to_string(), + time_ranges: vec![RecordTimeRange { + stream_id: Some(StreamId("la-kcop".to_string())), + rendition_id: Some("720p".to_string()), + start_unix_ms: 1_771_000_000_000, + end_unix_ms: 1_771_000_010_000, + }], + content_hashes: vec![RecordContentHash { + alg: "ipfs-cid-v1".to_string(), + digest: "bafybeigdyrzt".to_string(), + uri: Some("ipfs://bafybeigdyrzt".to_string()), + }], + source: "did:key:z6MkFriend".to_string(), + visibility: "friends".to_string(), + created_unix_ms: 1_771_000_020_000, + valid_from_unix_ms: None, + valid_until_unix_ms: None, + supersedes: Vec::new(), + revokes: Vec::new(), + metadata: vec![StreamMetadata { + key: "text".to_string(), + value: "funny moment".to_string(), + }], + }; + + let id1 = body.record_id().unwrap(); + assert_eq!(id1, body.record_id_blake3().unwrap()); + assert_ne!(id1, body.record_id_keccak256().unwrap()); + + let mut changed = body.clone(); + changed.visibility = "public".to_string(); + assert_ne!(id1, changed.record_id().unwrap()); + } + + fn sample_signed_record_body(record_type: &str) -> SignedRecordBody { + SignedRecordBody { + record_type: record_type.to_string(), + subject: "channel:la-kcop".to_string(), + time_ranges: vec![RecordTimeRange { + stream_id: Some(StreamId("la-kcop".to_string())), + rendition_id: Some("720p".to_string()), + start_unix_ms: 1_771_000_000_000, + end_unix_ms: 1_771_000_010_000, + }], + content_hashes: vec![RecordContentHash { + alg: RECORD_CONTENT_HASH_BLAKE3.to_string(), + digest: blake3::hash(b"clip").to_hex().to_string(), + uri: Some("garage://every-channel-archive/la-kcop/clip.car".to_string()), + }], + source: "did:key:z6MkFriend".to_string(), + visibility: RECORD_VISIBILITY_PUBLIC.to_string(), + created_unix_ms: 1_771_000_020_000, + valid_from_unix_ms: Some(1_771_000_000_000), + valid_until_unix_ms: 
Some(1_771_100_000_000), + supersedes: Vec::new(), + revokes: Vec::new(), + metadata: vec![StreamMetadata { + key: "text".to_string(), + value: "funny moment".to_string(), + }], + } + } + + fn sample_signed_record(record_type: &str) -> SignedRecord { + let body = sample_signed_record_body(record_type); + let record_id = body.record_id().unwrap(); + SignedRecord { + body, + record_id, + signatures: vec![SignedRecordSignature { + signer_id: "did:key:z6MkFriend".to_string(), + alg: RECORD_SIG_ALG_SECP256K1_EIP712_BODY_V1.to_string(), + signature: format!("0x{}", "11".repeat(65)), + }], + commitments: Vec::new(), + } + } + + #[test] + fn record_type_policy_maps_known_types_to_nft_and_ui_shapes() { + let commercial = record_type_policy(RECORD_TYPE_COMMERCIAL_RANGE_SET).unwrap(); + assert_eq!(commercial.nft_projection, RECORD_NFT_PROJECTION_ERC1155); + assert_eq!( + commercial.ui_recognition, + RECORD_UI_RECOGNITION_COMMERCIAL_RANGES + ); + assert!(commercial.requires_time_range); + assert_eq!(commercial.default_visibility, RECORD_VISIBILITY_PUBLIC); + assert_eq!(commercial.activitypub_object_type, None); + + let clip = record_type_policy(RECORD_TYPE_CLIP).unwrap(); + assert_eq!(clip.nft_projection, RECORD_NFT_PROJECTION_ERC721); + assert_eq!(clip.default_visibility, RECORD_VISIBILITY_UNLISTED); + assert_eq!(clip.activitypub_object_type, Some("Video")); + + let metadata = record_type_policy(RECORD_TYPE_CHANNEL_METADATA).unwrap(); + assert_eq!(metadata.nft_projection, RECORD_NFT_PROJECTION_ERC721); + assert!(!metadata.requires_time_range); + assert_eq!( + metadata.ui_recognition, + RECORD_UI_RECOGNITION_CHANNEL_METADATA + ); + + let comment = record_type_policy(RECORD_TYPE_FRIEND_COMMENT).unwrap(); + assert_eq!(comment.default_visibility, RECORD_VISIBILITY_FRIENDS); + assert_eq!(comment.activitypub_object_type, Some("Note")); + + let overlay = record_type_policy(RECORD_TYPE_OVERLAY).unwrap(); + assert_eq!(overlay.nft_projection, RECORD_NFT_PROJECTION_ERC1155); + 
assert_eq!(overlay.activitypub_object_type, Some("Object")); + + let source = record_type_policy(RECORD_TYPE_SOURCE_ATTESTATION).unwrap(); + assert_eq!( + source.ui_recognition, + RECORD_UI_RECOGNITION_SOURCE_ATTESTATION + ); + assert!(!source.requires_time_range); + + assert_eq!(record_type_policy("unknown"), None); + } + + #[test] + fn record_blob_policy_requires_content_address_for_large_payloads() { + assert_eq!( + record_payload_blob_placement( + RECORD_TYPE_FRIEND_COMMENT, + RECORD_CONTENT_ADDRESS_THRESHOLD_BYTES + ), + Some(RecordBlobPlacement::InlineRecord) + ); + assert_eq!( + record_payload_blob_placement( + RECORD_TYPE_FRIEND_COMMENT, + RECORD_CONTENT_ADDRESS_THRESHOLD_BYTES + 1 + ), + Some(RecordBlobPlacement::ContentAddressedBlob) + ); + assert_eq!( + record_payload_blob_placement("unknown", RECORD_CONTENT_ADDRESS_THRESHOLD_BYTES + 1), + None + ); + } + + #[test] + fn signed_record_helpers_detect_ipfs_references_and_metadata_size() { + let mut body = sample_signed_record_body(RECORD_TYPE_FRIEND_COMMENT); + body.content_hashes.clear(); + body.metadata = vec![ + StreamMetadata { + key: "text".to_string(), + value: "funny moment".to_string(), + }, + StreamMetadata { + key: "author".to_string(), + value: "friend".to_string(), + }, + ]; + + assert_eq!( + signed_record_inline_metadata_bytes(&body), + "text".len() + "funny moment".len() + "author".len() + "friend".len() + ); + assert_eq!( + signed_record_metadata_blob_placement(&body), + Some(RecordBlobPlacement::InlineRecord) + ); + assert!(!signed_record_has_content_addressed_blob(&body)); + assert!(!signed_record_has_ipfs_reference(&body)); + + body.metadata = vec![StreamMetadata { + key: "payload".to_string(), + value: "x".repeat(RECORD_CONTENT_ADDRESS_THRESHOLD_BYTES + 1), + }]; + assert_eq!( + signed_record_metadata_blob_placement(&body), + Some(RecordBlobPlacement::ContentAddressedBlob) + ); + + body.content_hashes.push(RecordContentHash { + alg: RECORD_CONTENT_HASH_IPFS_CID_V1.to_string(), + digest: 
"bafybeigdyrzt".to_string(), + uri: Some("ipfs://bafybeigdyrzt".to_string()), + }); + assert!(signed_record_has_content_addressed_blob(&body)); + assert!(signed_record_has_ipfs_reference(&body)); + } + + #[test] + fn signed_record_projection_summarizes_ui_nft_and_ipfs_refs() { + let mut body = sample_signed_record_body(RECORD_TYPE_FRIEND_COMMENT); + body.visibility = RECORD_VISIBILITY_FRIENDS.to_string(); + body.content_hashes = vec![RecordContentHash { + alg: RECORD_CONTENT_HASH_IPFS_CID_V1.to_string(), + digest: "bafybeigdyrzt".to_string(), + uri: None, + }]; + + let projection = signed_record_projection(&body).unwrap(); + + assert_eq!(projection.record_type, RECORD_TYPE_FRIEND_COMMENT); + assert_eq!(projection.subject, "channel:la-kcop"); + assert_eq!(projection.visibility, RECORD_VISIBILITY_FRIENDS); + assert_eq!(projection.nft_projection, RECORD_NFT_PROJECTION_ERC721); + assert_eq!( + projection.ui_recognition, + RECORD_UI_RECOGNITION_FRIEND_COMMENT + ); + assert_eq!(projection.default_visibility, RECORD_VISIBILITY_FRIENDS); + assert!(projection.uses_default_visibility); + assert!(projection.requires_time_range); + assert_eq!(projection.time_range_count, 1); + assert_eq!(projection.content_hash_count, 1); + assert_eq!( + projection.primary_content_uri.as_deref(), + Some("ipfs://bafybeigdyrzt") + ); + assert_eq!( + projection.primary_ipfs_uri.as_deref(), + Some("ipfs://bafybeigdyrzt") + ); + assert_eq!( + projection.metadata_blob_placement, + RecordBlobPlacement::InlineRecord + ); + assert!(projection.has_content_addressed_blob); + assert!(projection.has_ipfs_reference); + assert_eq!(projection.activitypub_object_type.as_deref(), Some("Note")); + } + + #[test] + fn signed_record_activitypub_object_projects_bounded_friend_comment() { + let mut record = sample_signed_record(RECORD_TYPE_FRIEND_COMMENT); + record.body.visibility = RECORD_VISIBILITY_FRIENDS.to_string(); + record.body.content_hashes = vec![RecordContentHash { + alg: 
RECORD_CONTENT_HASH_IPFS_CID_V1.to_string(), + digest: "bafybeigdyrzt".to_string(), + uri: None, + }]; + record.body.metadata = vec![ + StreamMetadata { + key: "title".to_string(), + value: "KCOP clip".to_string(), + }, + StreamMetadata { + key: "text".to_string(), + value: "x".repeat(RECORD_ACTIVITYPUB_INLINE_CONTENT_MAX_BYTES + 16), + }, + ]; + record.record_id = record.body.record_id().unwrap(); + + let object = signed_record_activitypub_object(&record).unwrap(); + + assert_eq!( + object.id, + format!("urn:every-channel:signed-record:{}", record.record_id) + ); + assert_eq!(object.object_type, "Note"); + assert_eq!(object.attributed_to, "did:key:z6MkFriend"); + assert_eq!(object.audience, RECORD_VISIBILITY_FRIENDS); + assert_eq!(object.subject, "channel:la-kcop"); + assert_eq!(object.name.as_deref(), Some("KCOP clip")); + assert_eq!( + object.content.as_ref().map(|value| value.len()), + Some(RECORD_ACTIVITYPUB_INLINE_CONTENT_MAX_BYTES) + ); + assert_eq!(object.url.as_deref(), Some("ipfs://bafybeigdyrzt")); + assert_eq!(object.published_unix_ms, record.body.created_unix_ms); + assert_eq!(object.time_ranges.len(), 1); + + let json = serde_json::to_value(&object).unwrap(); + assert_eq!(json["type"], "Note"); + assert_eq!(json["attributedTo"], "did:key:z6MkFriend"); + } + + #[test] + fn signed_record_activitypub_object_skips_non_activitypub_records() { + let record = sample_signed_record(RECORD_TYPE_CHANNEL_METADATA); + assert_eq!(signed_record_activitypub_object(&record), None); + } + + #[test] + fn signed_record_projection_marks_large_metadata_for_blob_storage() { + let mut body = sample_signed_record_body(RECORD_TYPE_CHANNEL_METADATA); + body.content_hashes.clear(); + body.metadata = vec![StreamMetadata { + key: "payload".to_string(), + value: "x".repeat(RECORD_CONTENT_ADDRESS_THRESHOLD_BYTES + 1), + }]; + + let projection = signed_record_projection(&body).unwrap(); + + assert_eq!(projection.record_type, RECORD_TYPE_CHANNEL_METADATA); + 
assert!(!projection.requires_time_range); + assert_eq!( + projection.metadata_blob_placement, + RecordBlobPlacement::ContentAddressedBlob + ); + assert!(projection.has_content_addressed_blob); + assert!(!projection.has_ipfs_reference); + assert_eq!(projection.primary_content_uri, None); + assert_eq!(projection.primary_ipfs_uri, None); + } + + #[test] + fn signed_record_projection_rejects_unknown_record_types() { + let body = sample_signed_record_body("unknown"); + assert_eq!(signed_record_projection(&body), None); + } + + fn index_count<'a>( + counts: &'a [SignedRecordIndexCount], + name: &str, + ) -> Option<&'a SignedRecordIndexCount> { + counts.iter().find(|row| row.name == name) + } + + #[test] + fn signed_record_index_sorts_accepts_rejects_and_counts_policy_shapes() { + let mut clip = sample_signed_record(RECORD_TYPE_CLIP); + clip.body.created_unix_ms = 30; + clip.record_id = clip.body.record_id().unwrap(); + + let mut metadata = sample_signed_record(RECORD_TYPE_CHANNEL_METADATA); + metadata.body.created_unix_ms = 10; + metadata.body.time_ranges.clear(); + metadata.body.content_hashes.clear(); + metadata.record_id = metadata.body.record_id().unwrap(); + + let mut comment = sample_signed_record(RECORD_TYPE_FRIEND_COMMENT); + comment.body.created_unix_ms = 20; + comment.body.visibility = RECORD_VISIBILITY_FRIENDS.to_string(); + comment.body.content_hashes = vec![RecordContentHash { + alg: RECORD_CONTENT_HASH_IPFS_CID_V1.to_string(), + digest: "bafybeigdyrzt".to_string(), + uri: None, + }]; + comment.record_id = comment.body.record_id().unwrap(); + + let mut rejected = sample_signed_record(RECORD_TYPE_OVERLAY); + rejected.body.time_ranges.clear(); + rejected.record_id = rejected.body.record_id().unwrap(); + + let index = signed_record_index(&[clip, metadata, rejected, comment], 1234); + + assert_eq!( + index.schema, + "https://every.channel/schemas/signed-record-index/v1" + ); + assert_eq!(index.generated_unix_ms, 1234); + assert_eq!(index.summary.total_records, 
4); + assert_eq!(index.summary.accepted_records, 3); + assert_eq!(index.summary.rejected_records, 1); + assert_eq!(index.summary.public_records, 2); + assert_eq!(index.summary.activitypub_records, 2); + assert_eq!( + index + .entries + .iter() + .map(|entry| entry.source_index) + .collect::>(), + vec![1, 3, 0] + ); + assert_eq!( + index.entries[1] + .activitypub_object + .as_ref() + .map(|object| object.object_type.as_str()), + Some("Note") + ); + assert_eq!( + index.entries[1] + .activitypub_object + .as_ref() + .and_then(|object| object.url.as_deref()), + Some("ipfs://bafybeigdyrzt") + ); + assert_eq!(index.entries[0].activitypub_object, None); + + assert_eq!( + index_count(&index.summary.by_record_type, RECORD_TYPE_CHANNEL_METADATA) + .map(|row| row.count), + Some(1) + ); + assert_eq!( + index_count(&index.summary.by_record_type, RECORD_TYPE_FRIEND_COMMENT) + .map(|row| row.count), + Some(1) + ); + assert_eq!( + index_count( + &index.summary.by_nft_projection, + RECORD_NFT_PROJECTION_ERC721 + ) + .map(|row| row.count), + Some(3) + ); + assert_eq!( + index_count( + &index.summary.by_ui_recognition, + RECORD_UI_RECOGNITION_CHANNEL_METADATA + ) + .map(|row| row.count), + Some(1) + ); + + assert_eq!(index.rejected.len(), 1); + assert_eq!(index.rejected[0].source_index, 2); + assert_eq!( + index.rejected[0].record_type.as_deref(), + Some(RECORD_TYPE_OVERLAY) + ); + assert!(index.rejected[0] + .reason + .contains("records require at least one time range")); + } + + #[test] + fn signed_record_index_flags_content_address_and_ipfs_followups() { + let mut inline = sample_signed_record(RECORD_TYPE_CHANNEL_METADATA); + inline.body.created_unix_ms = 10; + inline.body.time_ranges.clear(); + inline.body.content_hashes.clear(); + inline.record_id = inline.body.record_id().unwrap(); + + let mut large_metadata_without_blob = sample_signed_record(RECORD_TYPE_CHANNEL_METADATA); + large_metadata_without_blob.body.created_unix_ms = 20; + 
large_metadata_without_blob.body.time_ranges.clear(); + large_metadata_without_blob.body.content_hashes.clear(); + large_metadata_without_blob.body.metadata = vec![StreamMetadata { + key: "payload".to_string(), + value: "x".repeat(RECORD_CONTENT_ADDRESS_THRESHOLD_BYTES + 1), + }]; + large_metadata_without_blob.record_id = + large_metadata_without_blob.body.record_id().unwrap(); + + let mut garage_blob_without_ipfs = sample_signed_record(RECORD_TYPE_CHANNEL_METADATA); + garage_blob_without_ipfs.body.created_unix_ms = 30; + garage_blob_without_ipfs.body.time_ranges.clear(); + garage_blob_without_ipfs.body.content_hashes = vec![RecordContentHash { + alg: RECORD_CONTENT_HASH_BLAKE3.to_string(), + digest: blake3::hash(b"metadata-car").to_hex().to_string(), + uri: Some("garage://every-channel-records/channel-metadata.car".to_string()), + }]; + garage_blob_without_ipfs.record_id = garage_blob_without_ipfs.body.record_id().unwrap(); + + let index = signed_record_index( + &[ + inline, + large_metadata_without_blob, + garage_blob_without_ipfs, + ], + 5678, + ); + + assert_eq!(index.summary.accepted_records, 3); + assert_eq!(index.summary.rejected_records, 0); + assert_eq!(index.summary.records_requiring_content_addressed_blob, 1); + assert_eq!( + index + .summary + .records_missing_required_content_addressed_blob, + 1 + ); + assert_eq!(index.summary.records_missing_ipfs_reference, 1); + + let large = &index.entries[1]; + assert!(large.requires_content_addressed_blob); + assert!(large.needs_content_addressed_blob); + assert!(!large.needs_ipfs_reference); + + let garage = &index.entries[2]; + assert!(!garage.requires_content_addressed_blob); + assert!(!garage.needs_content_addressed_blob); + assert!(garage.needs_ipfs_reference); + } + + #[test] + fn signed_record_validation_accepts_public_clip_with_blob_hash() { + let record = sample_signed_record(RECORD_TYPE_CLIP); + validate_signed_record(&record).unwrap(); + } + + #[test] + fn 
signed_record_validation_rejects_bad_time_ranges() { + let mut body = sample_signed_record_body(RECORD_TYPE_COMMERCIAL_RANGE_SET); + body.time_ranges[0].end_unix_ms = body.time_ranges[0].start_unix_ms; + assert!(matches!( + validate_signed_record_body(&body), + Err(SignedRecordValidationError::InvalidTimeRange { .. }) + )); + } + + #[test] + fn signed_record_validation_rejects_mismatched_record_id() { + let mut record = sample_signed_record(RECORD_TYPE_FRIEND_COMMENT); + record.record_id = "00".repeat(32); + assert!(matches!( + validate_signed_record(&record), + Err(SignedRecordValidationError::InvalidRecordId { .. }) + )); + } + + #[test] + fn signed_record_validation_rejects_unknown_hash_alg() { + let mut body = sample_signed_record_body(RECORD_TYPE_CLIP); + body.content_hashes[0].alg = "md5".to_string(); + assert!(matches!( + validate_signed_record_body(&body), + Err(SignedRecordValidationError::UnknownContentHashAlg(value)) + if value == "md5" + )); + } + + #[test] + fn signed_record_validation_rejects_noncanonical_record_type() { + let mut body = sample_signed_record_body(RECORD_TYPE_CLIP); + body.record_type = format!(" {RECORD_TYPE_CLIP} "); + assert!(matches!( + validate_signed_record_body(&body), + Err(SignedRecordValidationError::UnknownRecordType(value)) + if value == " clip " + )); + } + + #[test] + fn signed_record_validation_rejects_missing_signature() { + let mut record = sample_signed_record(RECORD_TYPE_CLIP); + record.signatures.clear(); + assert!(matches!( + validate_signed_record(&record), + Err(SignedRecordValidationError::MissingSignature) + )); + } + #[test] fn merkle_root_single_is_leaf() { let leaf = blake3::hash(b"leaf").to_hex().to_string(); diff --git a/crates/ec-core/src/sim.rs b/crates/ec-core/src/sim.rs new file mode 100644 index 0000000..cf1cba6 --- /dev/null +++ b/crates/ec-core/src/sim.rs @@ -0,0 +1,2937 @@ +//! Deterministic simulation helpers for distributed media invariants. +//! +//! 
The simulator is intentionally small and synchronous. It gives tests a fast +//! way to model duplicate publishers, delayed delivery, backfill, and archive +//! convergence without booting nodes or depending on wall-clock time. + +use serde::{Deserialize, Serialize}; +use std::collections::{BTreeMap, BTreeSet}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct SimulationSeed(pub u64); + +impl SimulationSeed { + pub const fn new(value: u64) -> Self { + Self(value) + } + + pub fn replay_hint(self) -> String { + format!("EC_SIM_SEED={:016x}", self.0) + } +} + +#[derive(Debug, Clone)] +pub struct SimulationRng { + state: u64, +} + +impl SimulationRng { + pub fn new(seed: SimulationSeed) -> Self { + let state = if seed.0 == 0 { + 0x9e37_79b9_7f4a_7c15 + } else { + seed.0 + }; + Self { state } + } + + pub fn next_u64(&mut self) -> u64 { + let mut x = self.state; + x ^= x >> 12; + x ^= x << 25; + x ^= x >> 27; + self.state = x; + x.wrapping_mul(0x2545_f491_4f6c_dd1d) + } + + pub fn range_inclusive(&mut self, min: u64, max: u64) -> u64 { + if min >= max { + return min; + } + min + (self.next_u64() % (max - min + 1)) + } + + pub fn chance_per_million(&mut self, threshold: u32) -> bool { + threshold > 0 && self.range_inclusive(0, 999_999) < u64::from(threshold) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct SeededSimulationCampaignReport { + pub name: String, + pub seed_start: SimulationSeed, + pub iterations: u64, + pub passed: u64, + pub failed: u64, + pub first_failure: Option, +} + +impl SeededSimulationCampaignReport { + pub fn all_passed(&self) -> bool { + self.failed == 0 && self.passed == self.iterations && self.first_failure.is_none() + } +} + +pub fn run_seeded_simulation_campaign( + name: &str, + seed_start: SimulationSeed, + iterations: u64, + mut run_one: RunOne, +) -> SeededSimulationCampaignReport +where + RunOne: FnMut(SimulationSeed) -> Option, +{ + let mut 
campaign = SeededSimulationCampaignReport { + name: name.to_string(), + seed_start, + iterations, + passed: 0, + failed: 0, + first_failure: None, + }; + + for offset in 0..iterations { + let seed = SimulationSeed(seed_start.0.wrapping_add(offset)); + if let Some(failure) = run_one(seed) { + campaign.failed += 1; + if campaign.first_failure.is_none() { + campaign.first_failure = Some(failure); + } + } else { + campaign.passed += 1; + } + } + + campaign +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct MediaKey { + pub stream_id: String, + pub rendition_id: String, + pub track_name: String, + pub sequence: u64, +} + +impl MediaKey { + pub fn new(stream_id: &str, rendition_id: &str, track_name: &str, sequence: u64) -> Self { + Self { + stream_id: stream_id.to_string(), + rendition_id: rendition_id.to_string(), + track_name: track_name.to_string(), + sequence, + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct MediaObservation { + pub key: MediaKey, + pub source_node: String, + pub blake3_hash: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub media_time_ms: Option, + pub observed_ms: u64, +} + +impl MediaObservation { + pub fn new(key: MediaKey, source_node: &str, blake3_hash: &str, observed_ms: u64) -> Self { + Self::new_with_media_time_ms(key, source_node, blake3_hash, None, observed_ms) + } + + pub fn new_with_media_time_ms( + key: MediaKey, + source_node: &str, + blake3_hash: &str, + media_time_ms: Option, + observed_ms: u64, + ) -> Self { + Self { + key, + source_node: source_node.to_string(), + blake3_hash: blake3_hash.to_string(), + media_time_ms, + observed_ms, + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct MediaConvergenceSummary { + pub stream_id: String, + pub rendition_id: String, + pub track_name: String, + pub start_sequence: u64, + pub end_sequence: u64, + pub expected_sequences: u64, + pub 
observed_sequences: u64, + pub missing_sequences: Vec, + pub matching_duplicate_sequences: Vec, + pub divergent_sequences: Vec, + pub source_local_divergent_sequences: Vec, + #[serde(default)] + pub media_timing_missing_records: u64, + #[serde(default)] + pub media_timing_conflict_sequences: Vec, + pub duplicate_source_records: u64, +} + +impl MediaConvergenceSummary { + pub fn ok(&self) -> bool { + self.missing_sequences.is_empty() + && self.divergent_sequences.is_empty() + && self.media_timing_missing_records == 0 + && self.media_timing_conflict_sequences.is_empty() + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct MediaObservationEntry { + first_observed_ms: u64, + media_time_ms: Option, +} + +#[derive(Debug, Clone, Default)] +pub struct MediaConvergenceIndex { + by_key: BTreeMap>>, +} + +impl MediaConvergenceIndex { + pub fn observe(&mut self, observation: MediaObservation) { + self.by_key + .entry(observation.key) + .or_default() + .entry(observation.blake3_hash) + .or_default() + .entry(observation.source_node) + .and_modify(|entry| { + entry.first_observed_ms = entry.first_observed_ms.min(observation.observed_ms); + if entry.media_time_ms.is_none() { + entry.media_time_ms = observation.media_time_ms; + } + }) + .or_insert(MediaObservationEntry { + first_observed_ms: observation.observed_ms, + media_time_ms: observation.media_time_ms, + }); + } + + pub fn summarize( + &self, + stream_id: &str, + rendition_id: &str, + track_name: &str, + start_sequence: u64, + end_sequence: u64, + ) -> MediaConvergenceSummary { + self.summarize_sequence_set( + stream_id, + rendition_id, + track_name, + start_sequence, + end_sequence, + start_sequence..end_sequence, + ) + } + + pub fn summarize_observed_sequences( + &self, + stream_id: &str, + rendition_id: &str, + track_name: &str, + start_sequence: u64, + end_sequence: u64, + ) -> MediaConvergenceSummary { + let observed_sequences = self + .by_key + .keys() + .filter(|key| { + key.stream_id == stream_id + && 
key.rendition_id == rendition_id + && key.track_name == track_name + && key.sequence >= start_sequence + && key.sequence < end_sequence + }) + .map(|key| key.sequence) + .collect::>(); + self.summarize_sequence_set( + stream_id, + rendition_id, + track_name, + start_sequence, + end_sequence, + observed_sequences, + ) + } + + fn summarize_sequence_set( + &self, + stream_id: &str, + rendition_id: &str, + track_name: &str, + start_sequence: u64, + end_sequence: u64, + sequences: I, + ) -> MediaConvergenceSummary + where + I: IntoIterator, + { + let mut observed_sequences = 0; + let mut missing_sequences = Vec::new(); + let mut matching_duplicate_sequences = Vec::new(); + let mut divergent_sequences = Vec::new(); + let mut source_local_divergent_sequences = Vec::new(); + let mut media_timing_missing_records = 0; + let mut media_timing_conflict_sequences = Vec::new(); + let mut duplicate_source_records = 0; + let mut expected_sequences = 0; + + for sequence in sequences { + expected_sequences += 1; + let key = MediaKey::new(stream_id, rendition_id, track_name, sequence); + let Some(hashes) = self.by_key.get(&key) else { + missing_sequences.push(sequence); + continue; + }; + observed_sequences += 1; + + let duplicate_sources_for_sequence = hashes + .values() + .filter(|sources| sources.len() > 1) + .map(|sources| sources.len() as u64) + .sum::(); + duplicate_source_records += duplicate_sources_for_sequence; + + let mut hash_count_by_source = BTreeMap::<&str, u64>::new(); + let mut media_times = BTreeSet::::new(); + for sources in hashes.values() { + for (source, entry) in sources { + *hash_count_by_source.entry(source.as_str()).or_default() += 1; + if let Some(media_time_ms) = entry.media_time_ms { + media_times.insert(media_time_ms); + } else { + media_timing_missing_records += 1; + } + } + } + if hash_count_by_source + .values() + .any(|hash_count| *hash_count > 1) + { + source_local_divergent_sequences.push(sequence); + } + if media_times.len() > 1 { + 
media_timing_conflict_sequences.push(sequence); + } + + if hashes.len() > 1 { + divergent_sequences.push(sequence); + } else if duplicate_sources_for_sequence > 0 { + matching_duplicate_sequences.push(sequence); + } + } + + MediaConvergenceSummary { + stream_id: stream_id.to_string(), + rendition_id: rendition_id.to_string(), + track_name: track_name.to_string(), + start_sequence, + end_sequence, + expected_sequences, + observed_sequences, + missing_sequences, + matching_duplicate_sequences, + divergent_sequences, + source_local_divergent_sequences, + media_timing_missing_records, + media_timing_conflict_sequences, + duplicate_source_records, + } + } + + pub fn duplicate_complete_at_ms( + &self, + stream_id: &str, + rendition_id: &str, + track_name: &str, + start_sequence: u64, + end_sequence: u64, + ) -> Option { + let mut complete_at_ms = 0; + + for sequence in start_sequence..end_sequence { + let key = MediaKey::new(stream_id, rendition_id, track_name, sequence); + let hashes = self.by_key.get(&key)?; + if hashes.len() != 1 { + return None; + } + let sources = hashes.values().next()?; + if sources.len() < 2 { + return None; + } + if sources.values().any(|entry| entry.media_time_ms.is_none()) { + return None; + } + let media_times = sources + .values() + .filter_map(|entry| entry.media_time_ms) + .collect::>(); + if media_times.len() != 1 { + return None; + } + let sequence_complete_at_ms = sources + .values() + .map(|entry| entry.first_observed_ms) + .max()?; + complete_at_ms = complete_at_ms.max(sequence_complete_at_ms); + } + + Some(complete_at_ms) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum SimEvent { + Observe(MediaObservation), +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct SimTraceEntry { + pub at_ms: u64, + pub order: u64, + pub event: SimEvent, +} + +#[derive(Debug, Clone, Default)] +pub struct DeterministicSimulation { + now_ms: u64, + next_order: u64, + queue: BTreeMap<(u64, u64), 
SimEvent>, + convergence: MediaConvergenceIndex, + trace: Vec, +} + +impl DeterministicSimulation { + pub fn new() -> Self { + Self::default() + } + + pub fn now_ms(&self) -> u64 { + self.now_ms + } + + pub fn convergence(&self) -> &MediaConvergenceIndex { + &self.convergence + } + + pub fn trace(&self) -> &[SimTraceEntry] { + &self.trace + } + + pub fn schedule(&mut self, at_ms: u64, event: SimEvent) { + let order = self.next_order; + self.next_order += 1; + self.queue.insert((at_ms, order), event); + } + + pub fn schedule_observation( + &mut self, + at_ms: u64, + source_node: &str, + stream_id: &str, + rendition_id: &str, + track_name: &str, + sequence: u64, + blake3_hash: &str, + ) { + self.schedule_observation_with_media_time_ms( + at_ms, + source_node, + stream_id, + rendition_id, + track_name, + sequence, + Some(sequence), + blake3_hash, + ); + } + + #[allow(clippy::too_many_arguments)] + pub fn schedule_observation_with_media_time_ms( + &mut self, + at_ms: u64, + source_node: &str, + stream_id: &str, + rendition_id: &str, + track_name: &str, + sequence: u64, + media_time_ms: Option, + blake3_hash: &str, + ) { + self.schedule( + at_ms, + SimEvent::Observe(MediaObservation::new_with_media_time_ms( + MediaKey::new(stream_id, rendition_id, track_name, sequence), + source_node, + blake3_hash, + media_time_ms, + at_ms, + )), + ); + } + + pub fn run_until(&mut self, deadline_ms: u64) { + while let Some((&(at_ms, order), _)) = self.queue.iter().next() { + if at_ms > deadline_ms { + break; + } + let event = self + .queue + .remove(&(at_ms, order)) + .expect("event existed while iterating"); + self.now_ms = at_ms; + self.trace.push(SimTraceEntry { + at_ms, + order, + event: event.clone(), + }); + self.apply(event); + } + self.now_ms = self.now_ms.max(deadline_ms); + } + + pub fn run_to_idle(&mut self) { + while let Some((&(at_ms, order), _)) = self.queue.iter().next() { + let event = self + .queue + .remove(&(at_ms, order)) + .expect("event existed while iterating"); 
+ self.now_ms = at_ms; + self.trace.push(SimTraceEntry { + at_ms, + order, + event: event.clone(), + }); + self.apply(event); + } + } + + fn apply(&mut self, event: SimEvent) { + match event { + SimEvent::Observe(observation) => self.convergence.observe(observation), + } + } +} + +pub fn simulated_media_hash( + stream_id: &str, + rendition_id: &str, + track_name: &str, + sequence: u64, + profile_id: &str, +) -> String { + simulated_media_hash_with_source_material( + stream_id, + rendition_id, + track_name, + sequence, + profile_id, + "shared-source-window", + ) +} + +pub fn simulated_media_hash_with_source_material( + stream_id: &str, + rendition_id: &str, + track_name: &str, + sequence: u64, + profile_id: &str, + source_material: &str, +) -> String { + let mut hasher = blake3::Hasher::new(); + hasher.update(stream_id.as_bytes()); + hasher.update(b"\0"); + hasher.update(rendition_id.as_bytes()); + hasher.update(b"\0"); + hasher.update(track_name.as_bytes()); + hasher.update(b"\0"); + hasher.update(&sequence.to_le_bytes()); + hasher.update(b"\0"); + hasher.update(profile_id.as_bytes()); + hasher.update(b"\0"); + hasher.update(source_material.as_bytes()); + hasher.finalize().to_hex().to_string() +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct SimulationPartition { + pub source_node: String, + pub start_ms: u64, + pub end_ms: u64, + pub release_delay_ms: u64, +} + +impl SimulationPartition { + pub fn new(source_node: &str, start_ms: u64, end_ms: u64, release_delay_ms: u64) -> Self { + Self { + source_node: source_node.to_string(), + start_ms, + end_ms, + release_delay_ms, + } + } + + fn contains(&self, source_node: &str, at_ms: u64) -> bool { + self.source_node == source_node && at_ms >= self.start_ms && at_ms < self.end_ms + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct SimulationOutage { + pub source_node: String, + pub start_ms: u64, + pub end_ms: u64, + pub backfill_delay_ms: u64, +} + +impl 
SimulationOutage { + pub fn new(source_node: &str, start_ms: u64, end_ms: u64, backfill_delay_ms: u64) -> Self { + Self { + source_node: source_node.to_string(), + start_ms, + end_ms, + backfill_delay_ms, + } + } + + fn contains_production(&self, source_node: &str, produced_ms: u64) -> bool { + self.source_node == source_node && produced_ms >= self.start_ms && produced_ms < self.end_ms + } + + fn contains_time(&self, source_node: &str, at_ms: u64) -> bool { + self.source_node == source_node && at_ms >= self.start_ms && at_ms < self.end_ms + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct EncoderDriftFault { + pub source_node: String, + pub sequence: u64, + pub profile_id: String, +} + +impl EncoderDriftFault { + pub fn new(source_node: &str, sequence: u64, profile_id: &str) -> Self { + Self { + source_node: source_node.to_string(), + sequence, + profile_id: profile_id.to_string(), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct DuplicatePublisherScenario { + pub seed: SimulationSeed, + pub stream_id: String, + pub rendition_id: String, + pub track_name: String, + pub profile_id: String, + pub publisher_nodes: Vec, + pub start_sequence: u64, + pub end_sequence: u64, + pub segment_step_ms: u64, + pub base_network_delay_ms: u64, + pub max_jitter_ms: u64, + pub transient_drop_per_million: u32, + pub backfill_after_ms: u64, + #[serde(default)] + pub publisher_sequence_offsets: BTreeMap, + #[serde(default)] + pub publisher_media_time_offsets_ms: BTreeMap, + #[serde(default)] + pub missing_media_timing_publishers: BTreeSet, + #[serde(default)] + pub publisher_source_material: BTreeMap, + pub partitions: Vec, + pub publisher_outages: Vec, + pub encoder_drifts: Vec, +} + +impl DuplicatePublisherScenario { + pub fn new( + seed: SimulationSeed, + publisher_nodes: Vec, + stream_id: &str, + rendition_id: &str, + track_name: &str, + profile_id: &str, + start_sequence: u64, + end_sequence: u64, + ) -> 
Self { + Self { + seed, + stream_id: stream_id.to_string(), + rendition_id: rendition_id.to_string(), + track_name: track_name.to_string(), + profile_id: profile_id.to_string(), + publisher_nodes, + start_sequence, + end_sequence, + segment_step_ms: 1_000, + base_network_delay_ms: 25, + max_jitter_ms: 250, + transient_drop_per_million: 0, + backfill_after_ms: 5_000, + publisher_sequence_offsets: BTreeMap::new(), + publisher_media_time_offsets_ms: BTreeMap::new(), + missing_media_timing_publishers: BTreeSet::new(), + publisher_source_material: BTreeMap::new(), + partitions: Vec::new(), + publisher_outages: Vec::new(), + encoder_drifts: Vec::new(), + } + } + + pub fn expected_sequences(&self) -> u64 { + self.end_sequence.saturating_sub(self.start_sequence) + } + + fn profile_for(&self, source_node: &str, sequence: u64) -> &str { + self.encoder_drifts + .iter() + .find(|fault| fault.source_node == source_node && fault.sequence == sequence) + .map(|fault| fault.profile_id.as_str()) + .unwrap_or(self.profile_id.as_str()) + } + + fn content_sequence_for(&self, source_node: &str, sequence: u64) -> u64 { + sequence.saturating_add( + self.publisher_sequence_offsets + .get(source_node) + .copied() + .unwrap_or(0), + ) + } + + fn media_time_for(&self, source_node: &str, content_sequence: u64) -> Option { + if self.missing_media_timing_publishers.contains(source_node) { + return None; + } + let base_ms = content_sequence + .saturating_sub(self.start_sequence) + .saturating_mul(self.segment_step_ms); + Some( + base_ms.saturating_add( + self.publisher_media_time_offsets_ms + .get(source_node) + .copied() + .unwrap_or(0), + ), + ) + } + + fn source_material_for(&self, source_node: &str) -> &str { + self.publisher_source_material + .get(source_node) + .map(String::as_str) + .unwrap_or("shared-source-window") + } + + fn delivery_after_partitions(&self, source_node: &str, at_ms: u64) -> (u64, u64) { + let mut adjusted_ms = at_ms; + let mut applied = 0; + + loop { + let 
Some(partition) = self + .partitions + .iter() + .find(|partition| partition.contains(source_node, adjusted_ms)) + else { + return (adjusted_ms, applied); + }; + adjusted_ms = partition.end_ms.saturating_add(partition.release_delay_ms); + applied += 1; + } + } + + fn publisher_outage_at( + &self, + source_node: &str, + produced_ms: u64, + ) -> Option<&SimulationOutage> { + self.publisher_outages + .iter() + .find(|outage| outage.contains_production(source_node, produced_ms)) + } +} + +#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct SimulationFaultStats { + pub scheduled_observations: u64, + pub transient_dropped_observations: u64, + pub backfill_observations: u64, + pub partition_delayed_observations: u64, + pub publisher_outage_observations: u64, + pub encoder_drift_observations: u64, + pub publisher_phase_offset_observations: u64, + #[serde(default)] + pub source_material_mismatch_observations: u64, + pub max_delivery_ms: u64, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct DuplicatePublisherSimulationReport { + pub seed: SimulationSeed, + pub replay_hint: String, + pub source_count: u64, + pub summary: MediaConvergenceSummary, + pub duplicate_complete_at_ms: Option, + pub fault_stats: SimulationFaultStats, + pub trace: Vec, +} + +impl DuplicatePublisherSimulationReport { + pub fn duplicate_complete(&self) -> bool { + self.summary.ok() + && self.summary.matching_duplicate_sequences.len() as u64 + == self.summary.expected_sequences + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct DuplicatePublisherInvariantConfig { + pub require_source_count: u64, + pub require_duplicate_complete: bool, + #[serde(default = "default_true")] + pub require_media_timing: bool, + pub max_duplicate_complete_ms: Option, +} + +impl DuplicatePublisherInvariantConfig { + pub fn duplicate_complete_with_deadline(max_duplicate_complete_ms: u64) -> Self { + Self { + require_source_count: 2, 
+ require_duplicate_complete: true, + require_media_timing: true, + max_duplicate_complete_ms: Some(max_duplicate_complete_ms), + } + } +} + +impl Default for DuplicatePublisherInvariantConfig { + fn default() -> Self { + Self { + require_source_count: 2, + require_duplicate_complete: true, + require_media_timing: true, + max_duplicate_complete_ms: None, + } + } +} + +const fn default_true() -> bool { + true +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct DuplicatePublisherInvariantReport { + pub ok: bool, + pub replay_hint: String, + pub failures: Vec, + pub duplicate_complete_at_ms: Option, + pub max_duplicate_complete_ms: Option, +} + +pub fn check_duplicate_publisher_invariants( + report: &DuplicatePublisherSimulationReport, + config: &DuplicatePublisherInvariantConfig, +) -> DuplicatePublisherInvariantReport { + let mut failures = Vec::new(); + + if report.source_count < config.require_source_count { + failures.push("insufficient_source_count".to_string()); + } + if !report.summary.missing_sequences.is_empty() { + failures.push("missing_sequences".to_string()); + } + if !report.summary.divergent_sequences.is_empty() { + failures.push("divergent_sequences".to_string()); + } + if report.fault_stats.source_material_mismatch_observations > 0 { + failures.push("source_material_mismatch_observations".to_string()); + } + if config.require_media_timing && report.summary.media_timing_missing_records > 0 { + failures.push("media_timing_missing_records".to_string()); + } + if config.require_media_timing && !report.summary.media_timing_conflict_sequences.is_empty() { + failures.push("media_timing_conflict_sequences".to_string()); + } + if config.require_duplicate_complete && !report.duplicate_complete() { + failures.push("duplicate_incomplete".to_string()); + } + if let Some(max_duplicate_complete_ms) = config.max_duplicate_complete_ms { + match report.duplicate_complete_at_ms { + Some(duplicate_complete_at_ms) + if 
duplicate_complete_at_ms <= max_duplicate_complete_ms => {} + Some(_) => failures.push("duplicate_complete_deadline_exceeded".to_string()), + None => failures.push("duplicate_complete_deadline_unreached".to_string()), + } + } + + DuplicatePublisherInvariantReport { + ok: failures.is_empty(), + replay_hint: report.replay_hint.clone(), + failures, + duplicate_complete_at_ms: report.duplicate_complete_at_ms, + max_duplicate_complete_ms: config.max_duplicate_complete_ms, + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct DuplicatePublisherShrinkStep { + pub dimension: String, + pub before: String, + pub after: String, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct DuplicatePublisherShrunkFailure { + pub seed: SimulationSeed, + pub replay_hint: String, + pub attempts: u64, + pub steps: Vec, + pub scenario: DuplicatePublisherScenario, + pub invariant: DuplicatePublisherInvariantReport, + pub report: DuplicatePublisherSimulationReport, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct DuplicatePublisherCampaignFailure { + pub seed: SimulationSeed, + pub replay_hint: String, + pub scenario: DuplicatePublisherScenario, + pub invariant: DuplicatePublisherInvariantReport, + pub report: DuplicatePublisherSimulationReport, + pub shrunk_failure: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct DuplicatePublisherCampaignReport { + pub name: String, + pub seed_start: SimulationSeed, + pub iterations: u64, + pub passed: u64, + pub failed: u64, + pub first_failure: Option, + pub max_duplicate_complete_ms_observed: u64, + pub max_delivery_ms_observed: u64, + pub total_transient_dropped_observations: u64, + pub total_partition_delayed_observations: u64, + pub total_publisher_outage_observations: u64, + pub total_backfill_observations: u64, + #[serde(default)] + pub total_publisher_phase_offset_observations: u64, + #[serde(default)] + pub 
total_source_material_mismatch_observations: u64, + #[serde(default)] + pub total_media_timing_missing_records: u64, + #[serde(default)] + pub total_media_timing_conflict_sequences: u64, +} + +impl DuplicatePublisherCampaignReport { + pub fn all_passed(&self) -> bool { + self.failed == 0 && self.passed == self.iterations && self.first_failure.is_none() + } +} + +pub fn run_duplicate_publisher_simulation( + scenario: &DuplicatePublisherScenario, +) -> DuplicatePublisherSimulationReport { + let mut rng = SimulationRng::new(scenario.seed); + let mut sim = DeterministicSimulation::new(); + let mut stats = SimulationFaultStats::default(); + let source_materials = scenario + .publisher_nodes + .iter() + .map(|node| scenario.source_material_for(node).to_string()) + .collect::>(); + let source_material_mismatch_active = source_materials.len() > 1; + + for sequence in scenario.start_sequence..scenario.end_sequence { + let media_offset_ms = (sequence - scenario.start_sequence) * scenario.segment_step_ms; + + for source_node in &scenario.publisher_nodes { + let initial_delivery_ms = media_offset_ms + .saturating_add(scenario.base_network_delay_ms) + .saturating_add(rng.range_inclusive(0, scenario.max_jitter_ms)); + let profile_id = scenario.profile_for(source_node, sequence); + let content_sequence = scenario.content_sequence_for(source_node, sequence); + let media_time_ms = scenario.media_time_for(source_node, content_sequence); + let source_material = scenario.source_material_for(source_node); + let hash = simulated_media_hash_with_source_material( + &scenario.stream_id, + &scenario.rendition_id, + &scenario.track_name, + content_sequence, + profile_id, + source_material, + ); + + if profile_id != scenario.profile_id { + stats.encoder_drift_observations += 1; + } + if content_sequence != sequence { + stats.publisher_phase_offset_observations += 1; + } + if source_material_mismatch_active { + stats.source_material_mismatch_observations += 1; + } + + if let Some(outage) = 
scenario.publisher_outage_at(source_node, media_offset_ms) { + stats.publisher_outage_observations += 1; + stats.backfill_observations += 1; + let retry_ms = outage + .end_ms + .saturating_add(outage.backfill_delay_ms) + .saturating_add(rng.range_inclusive(0, scenario.max_jitter_ms)); + let (retry_ms, retry_partition_delays) = + scenario.delivery_after_partitions(source_node, retry_ms); + stats.partition_delayed_observations += retry_partition_delays; + stats.max_delivery_ms = stats.max_delivery_ms.max(retry_ms); + stats.scheduled_observations += 1; + sim.schedule_observation_with_media_time_ms( + retry_ms, + source_node, + &scenario.stream_id, + &scenario.rendition_id, + &scenario.track_name, + sequence, + media_time_ms, + &hash, + ); + continue; + } + + let (scheduled_ms, partition_delays) = + scenario.delivery_after_partitions(source_node, initial_delivery_ms); + stats.partition_delayed_observations += partition_delays; + + if rng.chance_per_million(scenario.transient_drop_per_million) { + stats.transient_dropped_observations += 1; + stats.backfill_observations += 1; + let retry_ms = scheduled_ms + .saturating_add(scenario.backfill_after_ms) + .saturating_add(rng.range_inclusive(0, scenario.max_jitter_ms)); + let (retry_ms, retry_partition_delays) = + scenario.delivery_after_partitions(source_node, retry_ms); + stats.partition_delayed_observations += retry_partition_delays; + stats.max_delivery_ms = stats.max_delivery_ms.max(retry_ms); + stats.scheduled_observations += 1; + sim.schedule_observation_with_media_time_ms( + retry_ms, + source_node, + &scenario.stream_id, + &scenario.rendition_id, + &scenario.track_name, + sequence, + media_time_ms, + &hash, + ); + continue; + } + + stats.scheduled_observations += 1; + stats.max_delivery_ms = stats.max_delivery_ms.max(scheduled_ms); + sim.schedule_observation_with_media_time_ms( + scheduled_ms, + source_node, + &scenario.stream_id, + &scenario.rendition_id, + &scenario.track_name, + sequence, + media_time_ms, + 
&hash, + ); + } + } + + sim.run_to_idle(); + let summary = sim.convergence().summarize( + &scenario.stream_id, + &scenario.rendition_id, + &scenario.track_name, + scenario.start_sequence, + scenario.end_sequence, + ); + let duplicate_complete_at_ms = sim.convergence().duplicate_complete_at_ms( + &scenario.stream_id, + &scenario.rendition_id, + &scenario.track_name, + scenario.start_sequence, + scenario.end_sequence, + ); + + DuplicatePublisherSimulationReport { + seed: scenario.seed, + replay_hint: scenario.seed.replay_hint(), + source_count: scenario.publisher_nodes.len() as u64, + summary, + duplicate_complete_at_ms, + fault_stats: stats, + trace: sim.trace().to_vec(), + } +} + +fn evaluate_duplicate_publisher_failure( + scenario: &DuplicatePublisherScenario, + invariant_config: &DuplicatePublisherInvariantConfig, + attempts: &mut u64, +) -> Option<( + DuplicatePublisherSimulationReport, + DuplicatePublisherInvariantReport, +)> { + *attempts += 1; + let report = run_duplicate_publisher_simulation(scenario); + let invariant = check_duplicate_publisher_invariants(&report, invariant_config); + (!invariant.ok).then_some((report, invariant)) +} + +#[allow(clippy::too_many_arguments)] +fn accept_duplicate_publisher_shrink( + current: &mut DuplicatePublisherScenario, + current_report: &mut DuplicatePublisherSimulationReport, + current_invariant: &mut DuplicatePublisherInvariantReport, + steps: &mut Vec, + attempts: &mut u64, + invariant_config: &DuplicatePublisherInvariantConfig, + dimension: &str, + before: String, + after: String, + candidate: DuplicatePublisherScenario, +) -> bool { + let Some((report, invariant)) = + evaluate_duplicate_publisher_failure(&candidate, invariant_config, attempts) + else { + return false; + }; + + *current = candidate; + *current_report = report; + *current_invariant = invariant; + steps.push(DuplicatePublisherShrinkStep { + dimension: dimension.to_string(), + before, + after, + }); + true +} + +pub fn shrink_duplicate_publisher_failure( 
+ scenario: &DuplicatePublisherScenario, + invariant_config: &DuplicatePublisherInvariantConfig, +) -> Option { + let mut attempts = 0; + let (mut current_report, mut current_invariant) = + evaluate_duplicate_publisher_failure(scenario, invariant_config, &mut attempts)?; + let mut current = scenario.clone(); + let mut steps = Vec::new(); + + let mut partition_index = 0; + while partition_index < current.partitions.len() { + let mut candidate = current.clone(); + candidate.partitions.remove(partition_index); + let before = current.partitions.len().to_string(); + let after = candidate.partitions.len().to_string(); + if !accept_duplicate_publisher_shrink( + &mut current, + &mut current_report, + &mut current_invariant, + &mut steps, + &mut attempts, + invariant_config, + "partitions.len", + before, + after, + candidate, + ) { + partition_index += 1; + } + } + + let mut drift_index = 0; + while drift_index < current.encoder_drifts.len() { + let mut candidate = current.clone(); + candidate.encoder_drifts.remove(drift_index); + let before = current.encoder_drifts.len().to_string(); + let after = candidate.encoder_drifts.len().to_string(); + if !accept_duplicate_publisher_shrink( + &mut current, + &mut current_report, + &mut current_invariant, + &mut steps, + &mut attempts, + invariant_config, + "encoder_drifts.len", + before, + after, + candidate, + ) { + drift_index += 1; + } + } + + if !current.publisher_sequence_offsets.is_empty() { + let mut candidate = current.clone(); + candidate.publisher_sequence_offsets.clear(); + accept_duplicate_publisher_shrink( + &mut current, + &mut current_report, + &mut current_invariant, + &mut steps, + &mut attempts, + invariant_config, + "publisher_sequence_offsets.len", + scenario.publisher_sequence_offsets.len().to_string(), + "0".to_string(), + candidate, + ); + } + + if !current.publisher_source_material.is_empty() { + let mut candidate = current.clone(); + candidate.publisher_source_material.clear(); + 
accept_duplicate_publisher_shrink( + &mut current, + &mut current_report, + &mut current_invariant, + &mut steps, + &mut attempts, + invariant_config, + "publisher_source_material.len", + scenario.publisher_source_material.len().to_string(), + "0".to_string(), + candidate, + ); + } + + let mut outage_index = 0; + while outage_index < current.publisher_outages.len() { + let mut candidate = current.clone(); + candidate.publisher_outages.remove(outage_index); + let before = current.publisher_outages.len().to_string(); + let after = candidate.publisher_outages.len().to_string(); + if !accept_duplicate_publisher_shrink( + &mut current, + &mut current_report, + &mut current_invariant, + &mut steps, + &mut attempts, + invariant_config, + "publisher_outages.len", + before, + after, + candidate, + ) { + outage_index += 1; + } + } + + if current.transient_drop_per_million > 0 { + let mut candidate = current.clone(); + candidate.transient_drop_per_million = 0; + accept_duplicate_publisher_shrink( + &mut current, + &mut current_report, + &mut current_invariant, + &mut steps, + &mut attempts, + invariant_config, + "transient_drop_per_million", + scenario.transient_drop_per_million.to_string(), + "0".to_string(), + candidate, + ); + } + + if current.max_jitter_ms > 0 { + let mut candidate = current.clone(); + candidate.max_jitter_ms = 0; + let before = current.max_jitter_ms.to_string(); + accept_duplicate_publisher_shrink( + &mut current, + &mut current_report, + &mut current_invariant, + &mut steps, + &mut attempts, + invariant_config, + "max_jitter_ms", + before, + "0".to_string(), + candidate, + ); + } + + if current.base_network_delay_ms > 0 { + let mut candidate = current.clone(); + candidate.base_network_delay_ms = 0; + let before = current.base_network_delay_ms.to_string(); + accept_duplicate_publisher_shrink( + &mut current, + &mut current_report, + &mut current_invariant, + &mut steps, + &mut attempts, + invariant_config, + "base_network_delay_ms", + before, + 
"0".to_string(), + candidate, + ); + } + + let mut low_count = 1; + let mut high_count = current.expected_sequences(); + while low_count < high_count { + let mid_count = low_count + ((high_count - low_count) / 2); + let mut candidate = current.clone(); + candidate.end_sequence = candidate.start_sequence.saturating_add(mid_count); + let before = current.expected_sequences().to_string(); + let after = candidate.expected_sequences().to_string(); + if accept_duplicate_publisher_shrink( + &mut current, + &mut current_report, + &mut current_invariant, + &mut steps, + &mut attempts, + invariant_config, + "sequence_count", + before, + after, + candidate, + ) { + high_count = mid_count; + } else { + low_count = mid_count + 1; + } + } + + Some(DuplicatePublisherShrunkFailure { + seed: current.seed, + replay_hint: current.seed.replay_hint(), + attempts, + steps, + scenario: current, + invariant: current_invariant, + report: current_report, + }) +} + +pub fn run_duplicate_publisher_campaign( + name: &str, + seed_start: SimulationSeed, + iterations: u64, + invariant_config: &DuplicatePublisherInvariantConfig, + mut build_scenario: F, +) -> DuplicatePublisherCampaignReport +where + F: FnMut(SimulationSeed) -> DuplicatePublisherScenario, +{ + let mut campaign = DuplicatePublisherCampaignReport { + name: name.to_string(), + seed_start, + iterations, + passed: 0, + failed: 0, + first_failure: None, + max_duplicate_complete_ms_observed: 0, + max_delivery_ms_observed: 0, + total_transient_dropped_observations: 0, + total_partition_delayed_observations: 0, + total_publisher_outage_observations: 0, + total_backfill_observations: 0, + total_publisher_phase_offset_observations: 0, + total_source_material_mismatch_observations: 0, + total_media_timing_missing_records: 0, + total_media_timing_conflict_sequences: 0, + }; + + let generic = run_seeded_simulation_campaign(name, seed_start, iterations, |seed| { + let scenario = build_scenario(seed); + let report = 
run_duplicate_publisher_simulation(&scenario); + let invariant = check_duplicate_publisher_invariants(&report, invariant_config); + + campaign.max_delivery_ms_observed = campaign + .max_delivery_ms_observed + .max(report.fault_stats.max_delivery_ms); + if let Some(duplicate_complete_at_ms) = report.duplicate_complete_at_ms { + campaign.max_duplicate_complete_ms_observed = campaign + .max_duplicate_complete_ms_observed + .max(duplicate_complete_at_ms); + } + campaign.total_transient_dropped_observations += + report.fault_stats.transient_dropped_observations; + campaign.total_partition_delayed_observations += + report.fault_stats.partition_delayed_observations; + campaign.total_publisher_outage_observations += + report.fault_stats.publisher_outage_observations; + campaign.total_backfill_observations += report.fault_stats.backfill_observations; + campaign.total_publisher_phase_offset_observations += + report.fault_stats.publisher_phase_offset_observations; + campaign.total_source_material_mismatch_observations += + report.fault_stats.source_material_mismatch_observations; + campaign.total_media_timing_missing_records += report.summary.media_timing_missing_records; + campaign.total_media_timing_conflict_sequences += + report.summary.media_timing_conflict_sequences.len() as u64; + + if invariant.ok { + None + } else { + let shrunk_failure = shrink_duplicate_publisher_failure(&scenario, invariant_config); + Some(DuplicatePublisherCampaignFailure { + seed, + replay_hint: report.replay_hint.clone(), + scenario, + invariant, + report, + shrunk_failure, + }) + } + }); + + campaign.passed = generic.passed; + campaign.failed = generic.failed; + campaign.first_failure = generic.first_failure; + + campaign +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct ControlPlanePropagationScenario { + pub seed: SimulationSeed, + pub nodes: Vec, + pub origin_node: String, + pub topic: String, + pub announcement_id: String, + pub fanout: u64, + pub 
gossip_interval_ms: u64, + pub max_gossip_rounds: u64, + pub base_network_delay_ms: u64, + pub max_jitter_ms: u64, + pub transient_drop_per_million: u32, + pub partitions: Vec, + pub node_outages: Vec, +} + +impl ControlPlanePropagationScenario { + pub fn new( + seed: SimulationSeed, + nodes: Vec, + origin_node: &str, + topic: &str, + announcement_id: &str, + ) -> Self { + Self { + seed, + nodes, + origin_node: origin_node.to_string(), + topic: topic.to_string(), + announcement_id: announcement_id.to_string(), + fanout: 3, + gossip_interval_ms: 35, + max_gossip_rounds: 10, + base_network_delay_ms: 8, + max_jitter_ms: 40, + transient_drop_per_million: 100_000, + partitions: Vec::new(), + node_outages: Vec::new(), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct ControlPlaneMessage { + pub from_node: String, + pub to_node: String, + pub topic: String, + pub announcement_id: String, + pub sent_ms: u64, + pub round: u64, +} + +impl ControlPlaneMessage { + fn new( + scenario: &ControlPlanePropagationScenario, + from_node: &str, + to_node: &str, + sent_ms: u64, + round: u64, + ) -> Self { + Self { + from_node: from_node.to_string(), + to_node: to_node.to_string(), + topic: scenario.topic.clone(), + announcement_id: scenario.announcement_id.clone(), + sent_ms, + round, + } + } +} + +#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct ControlPlaneFaultStats { + pub scheduled_messages: u64, + pub delivered_messages: u64, + pub transient_dropped_messages: u64, + pub duplicate_messages: u64, + pub partition_delayed_messages: u64, + pub node_outage_delayed_messages: u64, + pub node_outage_delayed_rounds: u64, + pub max_delivery_ms: u64, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum ControlPlaneTraceEvent { + NodeLearned { + node: String, + from_node: Option, + announcement_id: String, + }, + GossipRound { + node: String, + round: u64, + }, + GossipRoundDelayed { + node: 
String, + round: u64, + delayed_until_ms: u64, + }, + MessageScheduled { + message: ControlPlaneMessage, + deliver_at_ms: u64, + }, + MessageDropped { + message: ControlPlaneMessage, + }, + MessageDelayed { + message: ControlPlaneMessage, + delayed_until_ms: u64, + partition_delays: u64, + node_outage_delays: u64, + }, + MessageDelivered { + message: ControlPlaneMessage, + }, + DuplicateIgnored { + message: ControlPlaneMessage, + }, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct ControlPlaneTraceEntry { + pub at_ms: u64, + pub order: u64, + pub event: ControlPlaneTraceEvent, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +enum ControlPlaneQueuedEvent { + GossipRound { node: String, round: u64 }, + DeliverMessage(ControlPlaneMessage), +} + +#[derive(Debug, Default)] +struct ControlPlaneSimulationState { + now_ms: u64, + next_queue_order: u64, + next_trace_order: u64, + queue: BTreeMap<(u64, u64), ControlPlaneQueuedEvent>, + known_at: BTreeMap, + trace: Vec, + stats: ControlPlaneFaultStats, +} + +impl ControlPlaneSimulationState { + fn schedule(&mut self, at_ms: u64, event: ControlPlaneQueuedEvent) { + let order = self.next_queue_order; + self.next_queue_order += 1; + self.queue.insert((at_ms, order), event); + } + + fn record(&mut self, at_ms: u64, event: ControlPlaneTraceEvent) { + let order = self.next_trace_order; + self.next_trace_order += 1; + self.trace.push(ControlPlaneTraceEntry { + at_ms, + order, + event, + }); + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct ControlPlanePropagationReport { + pub seed: SimulationSeed, + pub replay_hint: String, + pub node_count: u64, + pub known_count: u64, + pub known_nodes: Vec, + pub known_at_by_node: BTreeMap, + pub missing_nodes: Vec, + pub propagation_complete_at_ms: Option, + pub fault_stats: ControlPlaneFaultStats, + pub trace: Vec, +} + +impl ControlPlanePropagationReport { + pub fn propagation_complete(&self) -> bool { + 
self.missing_nodes.is_empty() + && self.known_count == self.node_count + && self.propagation_complete_at_ms.is_some() + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct ControlPlanePropagationInvariantConfig { + pub require_node_count: u64, + pub require_complete: bool, + pub max_propagation_complete_ms: Option, +} + +impl ControlPlanePropagationInvariantConfig { + pub fn complete_with_deadline( + require_node_count: u64, + max_propagation_complete_ms: u64, + ) -> Self { + Self { + require_node_count, + require_complete: true, + max_propagation_complete_ms: Some(max_propagation_complete_ms), + } + } +} + +impl Default for ControlPlanePropagationInvariantConfig { + fn default() -> Self { + Self { + require_node_count: 2, + require_complete: true, + max_propagation_complete_ms: None, + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct ControlPlanePropagationInvariantReport { + pub ok: bool, + pub replay_hint: String, + pub failures: Vec, + pub propagation_complete_at_ms: Option, + pub max_propagation_complete_ms: Option, +} + +pub fn check_control_plane_propagation_invariants( + report: &ControlPlanePropagationReport, + config: &ControlPlanePropagationInvariantConfig, +) -> ControlPlanePropagationInvariantReport { + let mut failures = Vec::new(); + + if report.node_count < config.require_node_count { + failures.push("insufficient_node_count".to_string()); + } + if config.require_complete && !report.propagation_complete() { + failures.push("propagation_incomplete".to_string()); + } + if let Some(max_propagation_complete_ms) = config.max_propagation_complete_ms { + match report.propagation_complete_at_ms { + Some(propagation_complete_at_ms) + if propagation_complete_at_ms <= max_propagation_complete_ms => {} + Some(_) => failures.push("propagation_deadline_exceeded".to_string()), + None => failures.push("propagation_deadline_unreached".to_string()), + } + } + + ControlPlanePropagationInvariantReport 
{ + ok: failures.is_empty(), + replay_hint: report.replay_hint.clone(), + failures, + propagation_complete_at_ms: report.propagation_complete_at_ms, + max_propagation_complete_ms: config.max_propagation_complete_ms, + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct ControlPlanePropagationCampaignFailure { + pub seed: SimulationSeed, + pub replay_hint: String, + pub scenario: ControlPlanePropagationScenario, + pub invariant: ControlPlanePropagationInvariantReport, + pub report: ControlPlanePropagationReport, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct ControlPlanePropagationCampaignReport { + pub name: String, + pub seed_start: SimulationSeed, + pub iterations: u64, + pub passed: u64, + pub failed: u64, + pub first_failure: Option, + pub max_propagation_complete_ms_observed: u64, + pub max_delivery_ms_observed: u64, + pub total_transient_dropped_messages: u64, + pub total_partition_delayed_messages: u64, + pub total_node_outage_delayed_messages: u64, + pub total_duplicate_messages: u64, +} + +impl ControlPlanePropagationCampaignReport { + pub fn all_passed(&self) -> bool { + self.failed == 0 && self.passed == self.iterations && self.first_failure.is_none() + } +} + +pub fn run_control_plane_propagation_simulation( + scenario: &ControlPlanePropagationScenario, +) -> ControlPlanePropagationReport { + let mut rng = SimulationRng::new(scenario.seed); + let mut state = ControlPlaneSimulationState::default(); + let node_set = scenario.nodes.iter().cloned().collect::>(); + + if node_set.contains(&scenario.origin_node) { + state.known_at.insert(scenario.origin_node.clone(), 0); + state.record( + 0, + ControlPlaneTraceEvent::NodeLearned { + node: scenario.origin_node.clone(), + from_node: None, + announcement_id: scenario.announcement_id.clone(), + }, + ); + state.schedule( + 0, + ControlPlaneQueuedEvent::GossipRound { + node: scenario.origin_node.clone(), + round: 0, + }, + ); + } + + while let 
Some((&(at_ms, order), _)) = state.queue.iter().next() { + let event = state + .queue + .remove(&(at_ms, order)) + .expect("event existed while iterating"); + state.now_ms = at_ms; + match event { + ControlPlaneQueuedEvent::GossipRound { node, round } => { + run_control_plane_gossip_round(&mut state, scenario, &mut rng, &node, round, at_ms); + } + ControlPlaneQueuedEvent::DeliverMessage(message) => { + run_control_plane_message_delivery(&mut state, scenario, &message, at_ms); + } + } + } + + let known_nodes = scenario + .nodes + .iter() + .filter(|node| state.known_at.contains_key(*node)) + .cloned() + .collect::>(); + let missing_nodes = scenario + .nodes + .iter() + .filter(|node| !state.known_at.contains_key(*node)) + .cloned() + .collect::>(); + let propagation_complete_at_ms = missing_nodes + .is_empty() + .then(|| state.known_at.values().copied().max()) + .flatten(); + + ControlPlanePropagationReport { + seed: scenario.seed, + replay_hint: scenario.seed.replay_hint(), + node_count: scenario.nodes.len() as u64, + known_count: known_nodes.len() as u64, + known_nodes, + known_at_by_node: state.known_at, + missing_nodes, + propagation_complete_at_ms, + fault_stats: state.stats, + trace: state.trace, + } +} + +fn run_control_plane_gossip_round( + state: &mut ControlPlaneSimulationState, + scenario: &ControlPlanePropagationScenario, + rng: &mut SimulationRng, + node: &str, + round: u64, + at_ms: u64, +) { + if !state.known_at.contains_key(node) { + return; + } + if round >= scenario.max_gossip_rounds { + return; + } + + let (available_at_ms, outage_delays) = + control_node_available_after_faults(scenario, node, at_ms); + if available_at_ms > at_ms { + state.stats.node_outage_delayed_rounds += outage_delays; + state.record( + at_ms, + ControlPlaneTraceEvent::GossipRoundDelayed { + node: node.to_string(), + round, + delayed_until_ms: available_at_ms, + }, + ); + state.schedule( + available_at_ms, + ControlPlaneQueuedEvent::GossipRound { + node: node.to_string(), + 
round, + }, + ); + return; + } + + state.record( + at_ms, + ControlPlaneTraceEvent::GossipRound { + node: node.to_string(), + round, + }, + ); + + for peer in control_plane_fanout_peers(scenario, rng, node) { + schedule_control_plane_message(state, scenario, rng, node, &peer, at_ms, round); + } + + let next_round = round.saturating_add(1); + if next_round < scenario.max_gossip_rounds { + state.schedule( + at_ms.saturating_add(scenario.gossip_interval_ms), + ControlPlaneQueuedEvent::GossipRound { + node: node.to_string(), + round: next_round, + }, + ); + } +} + +fn run_control_plane_message_delivery( + state: &mut ControlPlaneSimulationState, + scenario: &ControlPlanePropagationScenario, + message: &ControlPlaneMessage, + at_ms: u64, +) { + state.stats.delivered_messages += 1; + state.record( + at_ms, + ControlPlaneTraceEvent::MessageDelivered { + message: message.clone(), + }, + ); + + if state.known_at.contains_key(&message.to_node) { + state.stats.duplicate_messages += 1; + state.record( + at_ms, + ControlPlaneTraceEvent::DuplicateIgnored { + message: message.clone(), + }, + ); + return; + } + + state.known_at.insert(message.to_node.clone(), at_ms); + state.record( + at_ms, + ControlPlaneTraceEvent::NodeLearned { + node: message.to_node.clone(), + from_node: Some(message.from_node.clone()), + announcement_id: scenario.announcement_id.clone(), + }, + ); + state.schedule( + at_ms, + ControlPlaneQueuedEvent::GossipRound { + node: message.to_node.clone(), + round: 0, + }, + ); +} + +fn schedule_control_plane_message( + state: &mut ControlPlaneSimulationState, + scenario: &ControlPlanePropagationScenario, + rng: &mut SimulationRng, + from_node: &str, + to_node: &str, + at_ms: u64, + round: u64, +) { + let message = ControlPlaneMessage::new(scenario, from_node, to_node, at_ms, round); + state.stats.scheduled_messages += 1; + + if rng.chance_per_million(scenario.transient_drop_per_million) { + state.stats.transient_dropped_messages += 1; + state.record( + at_ms, + 
ControlPlaneTraceEvent::MessageDropped { + message: message.clone(), + }, + ); + return; + } + + let initial_delivery_ms = at_ms + .saturating_add(scenario.base_network_delay_ms) + .saturating_add(rng.range_inclusive(0, scenario.max_jitter_ms)); + let (deliver_at_ms, partition_delays, node_outage_delays) = + control_message_delivery_after_faults(scenario, from_node, to_node, initial_delivery_ms); + + state.stats.partition_delayed_messages += partition_delays; + state.stats.node_outage_delayed_messages += node_outage_delays; + state.stats.max_delivery_ms = state.stats.max_delivery_ms.max(deliver_at_ms); + + if deliver_at_ms > initial_delivery_ms { + state.record( + at_ms, + ControlPlaneTraceEvent::MessageDelayed { + message: message.clone(), + delayed_until_ms: deliver_at_ms, + partition_delays, + node_outage_delays, + }, + ); + } + state.record( + at_ms, + ControlPlaneTraceEvent::MessageScheduled { + message: message.clone(), + deliver_at_ms, + }, + ); + state.schedule( + deliver_at_ms, + ControlPlaneQueuedEvent::DeliverMessage(message), + ); +} + +fn control_plane_fanout_peers( + scenario: &ControlPlanePropagationScenario, + rng: &mut SimulationRng, + node: &str, +) -> Vec { + let candidates = scenario + .nodes + .iter() + .filter(|candidate| candidate.as_str() != node) + .cloned() + .collect::>(); + if candidates.is_empty() || scenario.fanout == 0 { + return Vec::new(); + } + let fanout = (scenario.fanout as usize).min(candidates.len()); + if fanout == candidates.len() { + return candidates; + } + + let start = rng.range_inclusive(0, candidates.len() as u64 - 1) as usize; + (0..fanout) + .map(|offset| candidates[(start + offset) % candidates.len()].clone()) + .collect() +} + +fn control_message_delivery_after_faults( + scenario: &ControlPlanePropagationScenario, + from_node: &str, + to_node: &str, + at_ms: u64, +) -> (u64, u64, u64) { + let mut adjusted_ms = at_ms; + let mut partition_delays = 0; + let mut node_outage_delays = 0; + + loop { + if let 
Some(partition) = scenario.partitions.iter().find(|partition| { + (partition.source_node == from_node || partition.source_node == to_node) + && partition.contains(&partition.source_node, adjusted_ms) + }) { + adjusted_ms = partition.end_ms.saturating_add(partition.release_delay_ms); + partition_delays += 1; + continue; + } + if let Some(outage) = scenario.node_outages.iter().find(|outage| { + (outage.source_node == from_node || outage.source_node == to_node) + && outage.contains_time(&outage.source_node, adjusted_ms) + }) { + adjusted_ms = outage.end_ms.saturating_add(outage.backfill_delay_ms); + node_outage_delays += 1; + continue; + } + return (adjusted_ms, partition_delays, node_outage_delays); + } +} + +fn control_node_available_after_faults( + scenario: &ControlPlanePropagationScenario, + node: &str, + at_ms: u64, +) -> (u64, u64) { + let mut adjusted_ms = at_ms; + let mut node_outage_delays = 0; + + loop { + if let Some(outage) = scenario + .node_outages + .iter() + .find(|outage| outage.contains_time(node, adjusted_ms)) + { + adjusted_ms = outage.end_ms.saturating_add(outage.backfill_delay_ms); + node_outage_delays += 1; + continue; + } + return (adjusted_ms, node_outage_delays); + } +} + +pub fn run_control_plane_propagation_campaign( + name: &str, + seed_start: SimulationSeed, + iterations: u64, + invariant_config: &ControlPlanePropagationInvariantConfig, + mut build_scenario: F, +) -> ControlPlanePropagationCampaignReport +where + F: FnMut(SimulationSeed) -> ControlPlanePropagationScenario, +{ + let mut campaign = ControlPlanePropagationCampaignReport { + name: name.to_string(), + seed_start, + iterations, + passed: 0, + failed: 0, + first_failure: None, + max_propagation_complete_ms_observed: 0, + max_delivery_ms_observed: 0, + total_transient_dropped_messages: 0, + total_partition_delayed_messages: 0, + total_node_outage_delayed_messages: 0, + total_duplicate_messages: 0, + }; + + let generic = run_seeded_simulation_campaign(name, seed_start, 
iterations, |seed| { + let scenario = build_scenario(seed); + let report = run_control_plane_propagation_simulation(&scenario); + let invariant = check_control_plane_propagation_invariants(&report, invariant_config); + + campaign.max_delivery_ms_observed = campaign + .max_delivery_ms_observed + .max(report.fault_stats.max_delivery_ms); + if let Some(propagation_complete_at_ms) = report.propagation_complete_at_ms { + campaign.max_propagation_complete_ms_observed = campaign + .max_propagation_complete_ms_observed + .max(propagation_complete_at_ms); + } + campaign.total_transient_dropped_messages += report.fault_stats.transient_dropped_messages; + campaign.total_partition_delayed_messages += report.fault_stats.partition_delayed_messages; + campaign.total_node_outage_delayed_messages += + report.fault_stats.node_outage_delayed_messages; + campaign.total_duplicate_messages += report.fault_stats.duplicate_messages; + + if invariant.ok { + None + } else { + Some(ControlPlanePropagationCampaignFailure { + seed, + replay_hint: report.replay_hint.clone(), + scenario, + invariant, + report, + }) + } + }); + + campaign.passed = generic.passed; + campaign.failed = generic.failed; + campaign.first_failure = generic.first_failure; + + campaign +} + +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum PublisherSequenceClock { + #[default] + Global, + LocalActivation, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct SystemDuplicatePublisherScenario { + pub seed: SimulationSeed, + pub control: ControlPlanePropagationScenario, + pub media: DuplicatePublisherScenario, + pub publisher_activation_delay_ms: u64, + pub publisher_backfill_delay_ms: u64, + pub sequence_clock: PublisherSequenceClock, +} + +impl SystemDuplicatePublisherScenario { + pub fn new( + seed: SimulationSeed, + control: ControlPlanePropagationScenario, + media: DuplicatePublisherScenario, + ) -> Self { + Self { + 
seed, + control, + media, + publisher_activation_delay_ms: 0, + publisher_backfill_delay_ms: 120, + sequence_clock: PublisherSequenceClock::Global, + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct SystemDuplicatePublisherSimulationReport { + pub seed: SimulationSeed, + pub replay_hint: String, + pub publisher_activation_ms: BTreeMap, + pub control: ControlPlanePropagationReport, + pub media: DuplicatePublisherSimulationReport, + pub system_complete_at_ms: Option, +} + +impl SystemDuplicatePublisherSimulationReport { + pub fn system_complete(&self) -> bool { + self.control.propagation_complete() + && self.media.duplicate_complete() + && self.system_complete_at_ms.is_some() + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct SystemDuplicatePublisherInvariantConfig { + pub require_control_complete: bool, + pub require_media_duplicate_complete: bool, + #[serde(default = "default_true")] + pub require_media_timing: bool, + pub max_system_complete_ms: Option, +} + +impl SystemDuplicatePublisherInvariantConfig { + pub fn complete_with_deadline(max_system_complete_ms: u64) -> Self { + Self { + require_control_complete: true, + require_media_duplicate_complete: true, + require_media_timing: true, + max_system_complete_ms: Some(max_system_complete_ms), + } + } +} + +impl Default for SystemDuplicatePublisherInvariantConfig { + fn default() -> Self { + Self { + require_control_complete: true, + require_media_duplicate_complete: true, + require_media_timing: true, + max_system_complete_ms: None, + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct SystemDuplicatePublisherInvariantReport { + pub ok: bool, + pub replay_hint: String, + pub failures: Vec, + pub system_complete_at_ms: Option, + pub max_system_complete_ms: Option, +} + +pub fn check_system_duplicate_publisher_invariants( + report: &SystemDuplicatePublisherSimulationReport, + config: 
&SystemDuplicatePublisherInvariantConfig, +) -> SystemDuplicatePublisherInvariantReport { + let mut failures = Vec::new(); + + if config.require_control_complete && !report.control.propagation_complete() { + failures.push("control_propagation_incomplete".to_string()); + } + if !report.media.summary.missing_sequences.is_empty() { + failures.push("media_missing_sequences".to_string()); + } + if !report.media.summary.divergent_sequences.is_empty() { + failures.push("media_divergent_sequences".to_string()); + } + if report + .media + .fault_stats + .source_material_mismatch_observations + > 0 + { + failures.push("media_source_material_mismatch_observations".to_string()); + } + if !report + .media + .summary + .source_local_divergent_sequences + .is_empty() + { + failures.push("media_source_local_divergent_sequences".to_string()); + } + if config.require_media_timing && report.media.summary.media_timing_missing_records > 0 { + failures.push("media_timing_missing_records".to_string()); + } + if config.require_media_timing + && !report + .media + .summary + .media_timing_conflict_sequences + .is_empty() + { + failures.push("media_timing_conflict_sequences".to_string()); + } + if config.require_media_duplicate_complete && !report.media.duplicate_complete() { + failures.push("media_duplicate_incomplete".to_string()); + } + if let Some(max_system_complete_ms) = config.max_system_complete_ms { + match report.system_complete_at_ms { + Some(system_complete_at_ms) if system_complete_at_ms <= max_system_complete_ms => {} + Some(_) => failures.push("system_complete_deadline_exceeded".to_string()), + None => failures.push("system_complete_deadline_unreached".to_string()), + } + } + + SystemDuplicatePublisherInvariantReport { + ok: failures.is_empty(), + replay_hint: report.replay_hint.clone(), + failures, + system_complete_at_ms: report.system_complete_at_ms, + max_system_complete_ms: config.max_system_complete_ms, + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, 
Deserialize)] +pub struct SystemDuplicatePublisherCampaignFailure { + pub seed: SimulationSeed, + pub replay_hint: String, + pub scenario: SystemDuplicatePublisherScenario, + pub invariant: SystemDuplicatePublisherInvariantReport, + pub report: SystemDuplicatePublisherSimulationReport, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct SystemDuplicatePublisherCampaignRunSummary { + pub seed: SimulationSeed, + pub replay_hint: String, + pub system_complete_at_ms: Option, + pub control_propagation_complete_at_ms: Option, + pub media_duplicate_complete_at_ms: Option, + pub control_trace_events: u64, + pub media_trace_events: u64, + pub total_trace_events: u64, + pub control_transient_dropped_messages: u64, + pub control_partition_delayed_messages: u64, + pub control_node_outage_delays: u64, + pub control_duplicate_messages: u64, + pub media_transient_dropped_observations: u64, + pub media_partition_delayed_observations: u64, + pub media_publisher_outage_observations: u64, + pub media_backfill_observations: u64, + pub media_publisher_phase_offset_observations: u64, + pub media_source_material_mismatch_observations: u64, + pub media_timing_missing_records: u64, + pub media_timing_conflict_sequences: u64, +} + +impl SystemDuplicatePublisherCampaignRunSummary { + fn from_report(report: &SystemDuplicatePublisherSimulationReport) -> Self { + Self { + seed: report.seed, + replay_hint: report.replay_hint.clone(), + system_complete_at_ms: report.system_complete_at_ms, + control_propagation_complete_at_ms: report.control.propagation_complete_at_ms, + media_duplicate_complete_at_ms: report.media.duplicate_complete_at_ms, + control_trace_events: report.control.trace.len() as u64, + media_trace_events: report.media.trace.len() as u64, + total_trace_events: report.control.trace.len() as u64 + report.media.trace.len() as u64, + control_transient_dropped_messages: report + .control + .fault_stats + .transient_dropped_messages, + 
control_partition_delayed_messages: report + .control + .fault_stats + .partition_delayed_messages, + control_node_outage_delays: report.control.fault_stats.node_outage_delayed_messages + + report.control.fault_stats.node_outage_delayed_rounds, + control_duplicate_messages: report.control.fault_stats.duplicate_messages, + media_transient_dropped_observations: report + .media + .fault_stats + .transient_dropped_observations, + media_partition_delayed_observations: report + .media + .fault_stats + .partition_delayed_observations, + media_publisher_outage_observations: report + .media + .fault_stats + .publisher_outage_observations, + media_backfill_observations: report.media.fault_stats.backfill_observations, + media_publisher_phase_offset_observations: report + .media + .fault_stats + .publisher_phase_offset_observations, + media_source_material_mismatch_observations: report + .media + .fault_stats + .source_material_mismatch_observations, + media_timing_missing_records: report.media.summary.media_timing_missing_records, + media_timing_conflict_sequences: report + .media + .summary + .media_timing_conflict_sequences + .len() as u64, + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct SystemDuplicatePublisherCampaignReport { + pub name: String, + pub seed_start: SimulationSeed, + pub iterations: u64, + pub passed: u64, + pub failed: u64, + pub first_failure: Option, + pub max_system_complete_ms_observed: u64, + pub max_control_propagation_ms_observed: u64, + pub max_media_duplicate_complete_ms_observed: u64, + #[serde(default)] + pub total_system_complete_ms_observed: u64, + #[serde(default)] + pub total_control_propagation_ms_observed: u64, + #[serde(default)] + pub total_media_duplicate_complete_ms_observed: u64, + #[serde(default)] + pub total_control_trace_events: u64, + #[serde(default)] + pub total_media_trace_events: u64, + #[serde(default)] + pub total_trace_events: u64, + pub total_control_transient_drops: u64, + 
#[serde(default)] + pub total_control_partition_delays: u64, + #[serde(default)] + pub total_control_node_outage_delays: u64, + #[serde(default)] + pub total_control_duplicate_messages: u64, + pub total_media_transient_drops: u64, + #[serde(default)] + pub total_media_partition_delays: u64, + #[serde(default)] + pub total_media_publisher_outages: u64, + pub total_media_backfill_observations: u64, + #[serde(default)] + pub total_media_publisher_phase_offsets: u64, + #[serde(default)] + pub total_media_source_material_mismatches: u64, + #[serde(default)] + pub total_media_timing_missing_records: u64, + #[serde(default)] + pub total_media_timing_conflict_sequences: u64, + #[serde(default)] + pub seeds_with_system_convergence_time: u64, + #[serde(default)] + pub seeds_with_control_propagation_time: u64, + #[serde(default)] + pub seeds_with_media_duplicate_convergence_time: u64, + #[serde(default)] + pub seeds_with_control_transient_drops: u64, + #[serde(default)] + pub seeds_with_control_partition_delays: u64, + #[serde(default)] + pub seeds_with_control_node_outage_delays: u64, + #[serde(default)] + pub seeds_with_control_duplicate_messages: u64, + #[serde(default)] + pub seeds_with_media_transient_drops: u64, + #[serde(default)] + pub seeds_with_media_partition_delays: u64, + #[serde(default)] + pub seeds_with_media_publisher_outages: u64, + #[serde(default)] + pub seeds_with_media_backfill_observations: u64, + #[serde(default)] + pub seeds_with_media_publisher_phase_offsets: u64, + #[serde(default)] + pub seeds_with_media_source_material_mismatches: u64, + #[serde(default)] + pub seeds_with_media_timing_missing_records: u64, + #[serde(default)] + pub seeds_with_media_timing_conflict_sequences: u64, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub slowest_system_runs: Vec, +} + +impl SystemDuplicatePublisherCampaignReport { + pub fn all_passed(&self) -> bool { + self.failed == 0 && self.passed == self.iterations && self.first_failure.is_none() + } + + 
pub fn fault_coverage_failures(&self) -> Vec { + let mut failures = Vec::new(); + if self.max_system_complete_ms_observed == 0 { + failures.push("system_convergence_time_unobserved".to_string()); + } + if self.max_control_propagation_ms_observed == 0 { + failures.push("control_propagation_time_unobserved".to_string()); + } + if self.max_media_duplicate_complete_ms_observed == 0 { + failures.push("media_duplicate_convergence_time_unobserved".to_string()); + } + if self.seeds_with_control_transient_drops == 0 { + failures.push("control_transient_drops_uncovered".to_string()); + } + if self.seeds_with_control_partition_delays == 0 { + failures.push("control_partition_delays_uncovered".to_string()); + } + if self.seeds_with_control_node_outage_delays == 0 { + failures.push("control_node_outage_delays_uncovered".to_string()); + } + if self.seeds_with_control_duplicate_messages == 0 { + failures.push("control_duplicate_messages_uncovered".to_string()); + } + if self.seeds_with_media_transient_drops == 0 { + failures.push("media_transient_drops_uncovered".to_string()); + } + if self.seeds_with_media_partition_delays == 0 { + failures.push("media_partition_delays_uncovered".to_string()); + } + if self.seeds_with_media_publisher_outages == 0 { + failures.push("media_publisher_outages_uncovered".to_string()); + } + if self.seeds_with_media_backfill_observations == 0 { + failures.push("media_backfill_uncovered".to_string()); + } + failures + } + + pub fn fault_coverage_ok(&self) -> bool { + self.fault_coverage_failures().is_empty() + } +} + +pub fn run_system_duplicate_publisher_simulation( + scenario: &SystemDuplicatePublisherScenario, +) -> SystemDuplicatePublisherSimulationReport { + let control = run_control_plane_propagation_simulation(&scenario.control); + let mut media_scenario = scenario.media.clone(); + let media_window_ms = media_scenario + .expected_sequences() + .saturating_mul(media_scenario.segment_step_ms); + let mut publisher_activation_ms = BTreeMap::new(); + 
let mut active_publishers = Vec::new(); + + for publisher in &scenario.media.publisher_nodes { + let Some(learned_at_ms) = control.known_at_by_node.get(publisher).copied() else { + continue; + }; + let activation_ms = learned_at_ms.saturating_add(scenario.publisher_activation_delay_ms); + publisher_activation_ms.insert(publisher.clone(), activation_ms); + active_publishers.push(publisher.clone()); + + if activation_ms > 0 { + media_scenario.publisher_outages.push(SimulationOutage::new( + publisher, + 0, + activation_ms.min(media_window_ms.saturating_add(activation_ms)), + scenario.publisher_backfill_delay_ms, + )); + } + + if scenario.sequence_clock == PublisherSequenceClock::LocalActivation { + let sequence_offset = ceil_div_u64(activation_ms, media_scenario.segment_step_ms); + if sequence_offset > 0 { + media_scenario + .publisher_sequence_offsets + .insert(publisher.clone(), sequence_offset); + } + } + } + + media_scenario.publisher_nodes = active_publishers; + let media = run_duplicate_publisher_simulation(&media_scenario); + let system_complete_at_ms = match ( + control.propagation_complete_at_ms, + media.duplicate_complete_at_ms, + ) { + (Some(control_ms), Some(media_ms)) + if control.propagation_complete() && media.duplicate_complete() => + { + Some(control_ms.max(media_ms)) + } + _ => None, + }; + + SystemDuplicatePublisherSimulationReport { + seed: scenario.seed, + replay_hint: scenario.seed.replay_hint(), + publisher_activation_ms, + control, + media, + system_complete_at_ms, + } +} + +pub fn run_system_duplicate_publisher_campaign( + name: &str, + seed_start: SimulationSeed, + iterations: u64, + invariant_config: &SystemDuplicatePublisherInvariantConfig, + mut build_scenario: F, +) -> SystemDuplicatePublisherCampaignReport +where + F: FnMut(SimulationSeed) -> SystemDuplicatePublisherScenario, +{ + let mut campaign = SystemDuplicatePublisherCampaignReport { + name: name.to_string(), + seed_start, + iterations, + passed: 0, + failed: 0, + first_failure: 
None, + max_system_complete_ms_observed: 0, + max_control_propagation_ms_observed: 0, + max_media_duplicate_complete_ms_observed: 0, + total_system_complete_ms_observed: 0, + total_control_propagation_ms_observed: 0, + total_media_duplicate_complete_ms_observed: 0, + total_control_trace_events: 0, + total_media_trace_events: 0, + total_trace_events: 0, + total_control_transient_drops: 0, + total_control_partition_delays: 0, + total_control_node_outage_delays: 0, + total_control_duplicate_messages: 0, + total_media_transient_drops: 0, + total_media_partition_delays: 0, + total_media_publisher_outages: 0, + total_media_backfill_observations: 0, + total_media_publisher_phase_offsets: 0, + total_media_source_material_mismatches: 0, + total_media_timing_missing_records: 0, + total_media_timing_conflict_sequences: 0, + seeds_with_system_convergence_time: 0, + seeds_with_control_propagation_time: 0, + seeds_with_media_duplicate_convergence_time: 0, + seeds_with_control_transient_drops: 0, + seeds_with_control_partition_delays: 0, + seeds_with_control_node_outage_delays: 0, + seeds_with_control_duplicate_messages: 0, + seeds_with_media_transient_drops: 0, + seeds_with_media_partition_delays: 0, + seeds_with_media_publisher_outages: 0, + seeds_with_media_backfill_observations: 0, + seeds_with_media_publisher_phase_offsets: 0, + seeds_with_media_source_material_mismatches: 0, + seeds_with_media_timing_missing_records: 0, + seeds_with_media_timing_conflict_sequences: 0, + slowest_system_runs: Vec::new(), + }; + + let generic = run_seeded_simulation_campaign(name, seed_start, iterations, |seed| { + let scenario = build_scenario(seed); + let report = run_system_duplicate_publisher_simulation(&scenario); + let invariant = check_system_duplicate_publisher_invariants(&report, invariant_config); + let run_summary = SystemDuplicatePublisherCampaignRunSummary::from_report(&report); + + if let Some(system_complete_at_ms) = report.system_complete_at_ms { + 
campaign.seeds_with_system_convergence_time += 1; + campaign.total_system_complete_ms_observed = campaign + .total_system_complete_ms_observed + .saturating_add(system_complete_at_ms); + campaign.max_system_complete_ms_observed = campaign + .max_system_complete_ms_observed + .max(system_complete_at_ms); + } + if let Some(control_ms) = report.control.propagation_complete_at_ms { + campaign.seeds_with_control_propagation_time += 1; + campaign.total_control_propagation_ms_observed = campaign + .total_control_propagation_ms_observed + .saturating_add(control_ms); + campaign.max_control_propagation_ms_observed = + campaign.max_control_propagation_ms_observed.max(control_ms); + } + if let Some(media_ms) = report.media.duplicate_complete_at_ms { + campaign.seeds_with_media_duplicate_convergence_time += 1; + campaign.total_media_duplicate_complete_ms_observed = campaign + .total_media_duplicate_complete_ms_observed + .saturating_add(media_ms); + campaign.max_media_duplicate_complete_ms_observed = campaign + .max_media_duplicate_complete_ms_observed + .max(media_ms); + } + campaign.total_control_trace_events = campaign + .total_control_trace_events + .saturating_add(report.control.trace.len() as u64); + campaign.total_media_trace_events = campaign + .total_media_trace_events + .saturating_add(report.media.trace.len() as u64); + campaign.total_trace_events = campaign + .total_trace_events + .saturating_add(report.control.trace.len() as u64 + report.media.trace.len() as u64); + campaign.total_control_transient_drops += + report.control.fault_stats.transient_dropped_messages; + campaign.total_control_partition_delays += + report.control.fault_stats.partition_delayed_messages; + campaign.total_control_node_outage_delays += + report.control.fault_stats.node_outage_delayed_messages + + report.control.fault_stats.node_outage_delayed_rounds; + campaign.total_control_duplicate_messages += report.control.fault_stats.duplicate_messages; + campaign.total_media_transient_drops += + 
report.media.fault_stats.transient_dropped_observations; + campaign.total_media_partition_delays += + report.media.fault_stats.partition_delayed_observations; + campaign.total_media_publisher_outages += + report.media.fault_stats.publisher_outage_observations; + campaign.total_media_backfill_observations += + report.media.fault_stats.backfill_observations; + campaign.total_media_publisher_phase_offsets += + report.media.fault_stats.publisher_phase_offset_observations; + campaign.total_media_source_material_mismatches += report + .media + .fault_stats + .source_material_mismatch_observations; + campaign.total_media_timing_missing_records += + report.media.summary.media_timing_missing_records; + campaign.total_media_timing_conflict_sequences += + report.media.summary.media_timing_conflict_sequences.len() as u64; + + if report.control.fault_stats.transient_dropped_messages > 0 { + campaign.seeds_with_control_transient_drops += 1; + } + if report.control.fault_stats.partition_delayed_messages > 0 { + campaign.seeds_with_control_partition_delays += 1; + } + if report.control.fault_stats.node_outage_delayed_messages + + report.control.fault_stats.node_outage_delayed_rounds + > 0 + { + campaign.seeds_with_control_node_outage_delays += 1; + } + if report.control.fault_stats.duplicate_messages > 0 { + campaign.seeds_with_control_duplicate_messages += 1; + } + if report.media.fault_stats.transient_dropped_observations > 0 { + campaign.seeds_with_media_transient_drops += 1; + } + if report.media.fault_stats.partition_delayed_observations > 0 { + campaign.seeds_with_media_partition_delays += 1; + } + if report.media.fault_stats.publisher_outage_observations > 0 { + campaign.seeds_with_media_publisher_outages += 1; + } + if report.media.fault_stats.backfill_observations > 0 { + campaign.seeds_with_media_backfill_observations += 1; + } + if report.media.fault_stats.publisher_phase_offset_observations > 0 { + campaign.seeds_with_media_publisher_phase_offsets += 1; + } + if report 
+ .media + .fault_stats + .source_material_mismatch_observations + > 0 + { + campaign.seeds_with_media_source_material_mismatches += 1; + } + if report.media.summary.media_timing_missing_records > 0 { + campaign.seeds_with_media_timing_missing_records += 1; + } + if !report + .media + .summary + .media_timing_conflict_sequences + .is_empty() + { + campaign.seeds_with_media_timing_conflict_sequences += 1; + } + insert_slowest_system_campaign_run(&mut campaign.slowest_system_runs, run_summary); + + if invariant.ok { + None + } else { + Some(SystemDuplicatePublisherCampaignFailure { + seed, + replay_hint: report.replay_hint.clone(), + scenario, + invariant, + report, + }) + } + }); + + campaign.passed = generic.passed; + campaign.failed = generic.failed; + campaign.first_failure = generic.first_failure; + + campaign +} + +fn insert_slowest_system_campaign_run( + runs: &mut Vec, + run: SystemDuplicatePublisherCampaignRunSummary, +) { + const LIMIT: usize = 16; + + runs.push(run); + runs.sort_by(|left, right| { + let left_complete_ms = left.system_complete_at_ms.unwrap_or(u64::MAX); + let right_complete_ms = right.system_complete_at_ms.unwrap_or(u64::MAX); + right_complete_ms + .cmp(&left_complete_ms) + .then_with(|| left.seed.cmp(&right.seed)) + }); + runs.truncate(LIMIT); +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct FoundationStyleSystemScenarioConfig { + pub nodes: Vec, + pub publisher_nodes: Vec, + pub origin_node: String, + pub topic: String, + pub announcement_id: String, + pub stream_id: String, + pub rendition_id: String, + pub track_name: String, + pub profile_id: String, + pub sequence_clock: PublisherSequenceClock, +} + +impl Default for FoundationStyleSystemScenarioConfig { + fn default() -> Self { + Self { + nodes: vec![ + "forge".to_string(), + "nuc-a".to_string(), + "nuc-b".to_string(), + "tower".to_string(), + "relay-lax".to_string(), + "relay-nyc".to_string(), + "relay-hel".to_string(), + ], + publisher_nodes: 
/// Builds a randomized system scenario (control plane + media) from `seed`
/// and `config`.
///
/// An empty `config.nodes` or `config.publisher_nodes` is replaced by the
/// corresponding list from `FoundationStyleSystemScenarioConfig::default()`.
/// If `config.origin_node` is not in the node list, the first node is used,
/// or `"forge"` when the list is empty.
///
/// All tunables are drawn from a single `SimulationRng` seeded with
/// `seed.0 ^ 0x6664_622d_7374_796c`; the media scenario gets its own seed,
/// `seed.0 ^ 0x6d65_6469_6121`.
///
/// NOTE(review): assuming `SimulationRng` is a sequential deterministic
/// generator, the order of the `rng` draws below (including the draws made
/// inside the `foundation_style_*` helpers) is part of the seed-to-scenario
/// mapping; reordering them would change what a given seed replays.
pub fn foundation_style_system_duplicate_publisher_scenario(
    seed: SimulationSeed,
    config: &FoundationStyleSystemScenarioConfig,
) -> SystemDuplicatePublisherScenario {
    let mut rng = SimulationRng::new(SimulationSeed::new(seed.0 ^ 0x6664_622d_7374_796c));
    let nodes = if config.nodes.is_empty() {
        FoundationStyleSystemScenarioConfig::default().nodes
    } else {
        config.nodes.clone()
    };
    let publisher_nodes = if config.publisher_nodes.is_empty() {
        FoundationStyleSystemScenarioConfig::default().publisher_nodes
    } else {
        config.publisher_nodes.clone()
    };
    // The origin must be one of the simulated nodes; otherwise fall back.
    let origin_node = if nodes.contains(&config.origin_node) {
        config.origin_node.clone()
    } else {
        nodes
            .first()
            .cloned()
            .unwrap_or_else(|| "forge".to_string())
    };
    let mut control = ControlPlanePropagationScenario::new(
        seed,
        nodes.clone(),
        &origin_node,
        &config.topic,
        &config.announcement_id,
    );
    // Fanout is drawn from 3..=5 but capped at the number of other nodes.
    control.fanout = rng
        .range_inclusive(3, 5)
        .min(nodes.len().saturating_sub(1) as u64);
    control.gossip_interval_ms = rng.range_inclusive(24, 42);
    control.max_gossip_rounds = rng.range_inclusive(12, 16);
    control.base_network_delay_ms = rng.range_inclusive(4, 10);
    control.max_jitter_ms = rng.range_inclusive(25, 65);
    control.transient_drop_per_million = rng.range_inclusive(45_000, 95_000) as u32;
    control.partitions = foundation_style_control_partitions(&mut rng, &nodes, &publisher_nodes);
    control.node_outages = foundation_style_node_outages(&mut rng, &nodes, &origin_node);

    let sequence_count = rng.range_inclusive(40, 72);
    let media_seed = SimulationSeed::new(seed.0 ^ 0x6d65_6469_6121);
    let mut media = DuplicatePublisherScenario::new(
        media_seed,
        publisher_nodes.clone(),
        &config.stream_id,
        &config.rendition_id,
        &config.track_name,
        &config.profile_id,
        0,
        sequence_count,
    );
    media.segment_step_ms = rng.range_inclusive(35, 55);
    media.base_network_delay_ms = rng.range_inclusive(3, 9);
    media.max_jitter_ms = rng.range_inclusive(45, 95);
    media.transient_drop_per_million = rng.range_inclusive(180_000, 320_000) as u32;
    media.backfill_after_ms = rng.range_inclusive(420, 760);
    // Length of the media window: sequence count times the per-segment step.
    let media_window_ms = sequence_count.saturating_mul(media.segment_step_ms);
    media.partitions =
        foundation_style_media_partitions(&mut rng, &publisher_nodes, media_window_ms);
    media.publisher_outages =
        foundation_style_publisher_outages(&mut rng, &publisher_nodes, media_window_ms);

    let mut scenario = SystemDuplicatePublisherScenario::new(seed, control, media);
    scenario.publisher_activation_delay_ms = rng.range_inclusive(10, 55);
    scenario.publisher_backfill_delay_ms = rng.range_inclusive(120, 260);
    scenario.sequence_clock = config.sequence_clock;
    scenario
}
rng.range_inclusive(25, 80), + )); + } + partitions +} + +fn foundation_style_node_outages( + rng: &mut SimulationRng, + nodes: &[String], + origin_node: &str, +) -> Vec { + let outage_candidates = nodes + .iter() + .filter(|node| node.as_str() != origin_node) + .cloned() + .collect::>(); + let Some(node) = pick_sim_item(rng, &outage_candidates) else { + return Vec::new(); + }; + let start_ms = rng.range_inclusive(80, 260); + let end_ms = start_ms.saturating_add(rng.range_inclusive(60, 180)); + vec![SimulationOutage::new( + node, + start_ms, + end_ms, + rng.range_inclusive(25, 95), + )] +} + +fn foundation_style_media_partitions( + rng: &mut SimulationRng, + publisher_nodes: &[String], + media_window_ms: u64, +) -> Vec { + let Some(publisher) = pick_sim_item(rng, publisher_nodes) else { + return Vec::new(); + }; + let start_ceiling = media_window_ms.saturating_div(2).max(1); + let start_ms = rng.range_inclusive(0, start_ceiling); + let duration_ms = rng.range_inclusive(180, 520).min(media_window_ms.max(180)); + let end_ms = start_ms.saturating_add(duration_ms); + vec![SimulationPartition::new( + publisher, + start_ms, + end_ms, + rng.range_inclusive(80, 240), + )] +} + +fn foundation_style_publisher_outages( + rng: &mut SimulationRng, + publisher_nodes: &[String], + media_window_ms: u64, +) -> Vec { + let Some(publisher) = pick_sim_item(rng, publisher_nodes) else { + return Vec::new(); + }; + let start_floor = media_window_ms.saturating_div(5); + let start_ceiling = media_window_ms + .saturating_mul(4) + .saturating_div(5) + .max(start_floor); + let start_ms = rng.range_inclusive(start_floor, start_ceiling); + let duration_ms = rng.range_inclusive(140, 420).min(media_window_ms.max(140)); + let end_ms = start_ms.saturating_add(duration_ms); + vec![SimulationOutage::new( + publisher, + start_ms, + end_ms, + rng.range_inclusive(140, 360), + )] +} + +fn pick_sim_item<'a>(rng: &mut SimulationRng, items: &'a [String]) -> Option<&'a str> { + if items.is_empty() { + return 
/// Divides `value` by `divisor`, rounding the quotient up.
///
/// A zero `divisor` returns `value` unchanged instead of panicking.
///
/// The ceiling is computed from the quotient and remainder rather than by
/// adding `divisor - 1` first: that addition saturates for values near
/// `u64::MAX` and then yields a quotient that is one too small.
fn ceil_div_u64(value: u64, divisor: u64) -> u64 {
    if divisor == 0 {
        return value;
    }
    let quotient = value / divisor;
    if value % divisor == 0 {
        quotient
    } else {
        // Cannot overflow: a non-zero remainder implies `divisor >= 2`,
        // so `quotient <= u64::MAX / 2`.
        quotient + 1
    }
}
duplicate_publishers_converge_after_delayed_backfill() { + let mut sim = DeterministicSimulation::new(); + + schedule_publisher_window(&mut sim, "nuc-a", 0, 12, 0, 10, PROFILE); + schedule_publisher_window(&mut sim, "nuc-b", 0, 4, 30, 10, PROFILE); + schedule_publisher_window(&mut sim, "nuc-b", 4, 12, 500, 10, PROFILE); + + sim.run_until(250); + let before_backfill = sim.convergence().summarize(STREAM, RENDITION, TRACK, 0, 12); + assert_eq!(before_backfill.expected_sequences, 12); + assert_eq!(before_backfill.missing_sequences, Vec::::new()); + assert_eq!( + before_backfill.matching_duplicate_sequences, + vec![0, 1, 2, 3] + ); + assert!(before_backfill.ok()); + + sim.run_to_idle(); + let after_backfill = sim.convergence().summarize(STREAM, RENDITION, TRACK, 0, 12); + let duplicate_complete_at_ms = sim + .convergence() + .duplicate_complete_at_ms(STREAM, RENDITION, TRACK, 0, 12); + assert_eq!(after_backfill.missing_sequences, Vec::::new()); + assert_eq!(after_backfill.divergent_sequences, Vec::::new()); + assert_eq!( + after_backfill.matching_duplicate_sequences, + (0_u64..12).collect::>() + ); + assert_eq!(after_backfill.duplicate_source_records, 24); + assert_eq!(duplicate_complete_at_ms, Some(570)); + assert_eq!(sim.trace().len(), 24); + assert!( + sim.trace() + .windows(2) + .all(|pair| (pair[0].at_ms, pair[0].order) <= (pair[1].at_ms, pair[1].order)), + "trace should preserve deterministic event order" + ); + assert!(after_backfill.ok()); +} + +#[test] +fn media_convergence_can_summarize_sparse_observed_sequences() { + let mut sim = DeterministicSimulation::new(); + for sequence in [7_287_381_184_512, 7_287_381_188_608] { + let hash = simulated_media_hash(STREAM, RENDITION, TRACK, sequence, PROFILE); + sim.schedule_observation(0, "nuc-a", STREAM, RENDITION, TRACK, sequence, &hash); + sim.schedule_observation(1, "nuc-b", STREAM, RENDITION, TRACK, sequence, &hash); + } + + sim.run_to_idle(); + let dense = sim.convergence().summarize( + STREAM, + RENDITION, + 
/// A second, differently-hashed observation of sequence 4 from `nuc-b` must
/// show up as a divergent sequence and make the summary not OK.
#[test]
fn duplicate_publisher_simulation_detects_encoder_drift() {
    const DRIFTED_SEQUENCE: u64 = 4;

    let mut sim = DeterministicSimulation::new();

    // Both publishers deliver the whole 0..8 window with the shared profile.
    for (node, first_delivery_ms) in [("nuc-a", 0), ("nuc-b", 5)] {
        schedule_publisher_window(&mut sim, node, 0, 8, first_delivery_ms, 10, PROFILE);
    }

    // Then nuc-b reports one sequence again, hashed under a different profile.
    let drifted_hash = simulated_media_hash(
        STREAM,
        RENDITION,
        TRACK,
        DRIFTED_SEQUENCE,
        "x264-hd3-drift",
    );
    sim.schedule_observation(
        90,
        "nuc-b",
        STREAM,
        RENDITION,
        TRACK,
        DRIFTED_SEQUENCE,
        &drifted_hash,
    );

    sim.run_to_idle();
    let outcome = sim.convergence().summarize(STREAM, RENDITION, TRACK, 0, 8);

    assert_eq!(outcome.missing_sequences, Vec::<u64>::new());
    assert_eq!(outcome.divergent_sequences, vec![DRIFTED_SEQUENCE]);
    assert!(!outcome.ok());
}
/// Injecting a single encoder-drift fault into the seeded fault scenario must
/// leave exactly that sequence divergent and prevent duplicate completion.
#[test]
fn seeded_fault_scenario_detects_encoder_drift() {
    let drifted_scenario = {
        let mut base = faulted_duplicate_scenario(SimulationSeed::new(0x6472_6966_7421));
        let drift = EncoderDriftFault::new("nuc-b", 17, "x264-hd3-drift");
        base.encoder_drifts.push(drift);
        base
    };

    let outcome = run_duplicate_publisher_simulation(&drifted_scenario);

    assert!(!outcome.duplicate_complete());
    assert_eq!(outcome.summary.divergent_sequences, vec![17]);
    assert_eq!(outcome.duplicate_complete_at_ms, None);
    assert_eq!(outcome.fault_stats.encoder_drift_observations, 1);
}
/// A publisher flagged as missing media timing must produce missing-timing
/// records (and no divergence), and the invariant check must report it.
#[test]
fn duplicate_publisher_simulation_rejects_missing_media_timing() {
    let publishers = vec![String::from("nuc-a"), String::from("nuc-b")];
    let mut scenario = DuplicatePublisherScenario::new(
        SimulationSeed::new(0x7469_6d65_2d6d_6973),
        publishers,
        STREAM,
        RENDITION,
        TRACK,
        PROFILE,
        0,
        6,
    );
    // Zero out the network delay and jitter settings for this run.
    scenario.base_network_delay_ms = 0;
    scenario.max_jitter_ms = 0;
    scenario
        .missing_media_timing_publishers
        .insert(String::from("nuc-b"));

    let deadline = DuplicatePublisherInvariantConfig::duplicate_complete_with_deadline(1_000);
    let report = run_duplicate_publisher_simulation(&scenario);
    let verdict = check_duplicate_publisher_invariants(&report, &deadline);

    assert_eq!(report.summary.divergent_sequences, Vec::<u64>::new());
    assert_eq!(report.summary.media_timing_missing_records, 6);

    let expected_failures: Vec<String> = [
        "media_timing_missing_records",
        "duplicate_incomplete",
        "duplicate_complete_deadline_unreached",
    ]
    .iter()
    .map(|name| name.to_string())
    .collect();
    assert_eq!(verdict.failures, expected_failures);
}
/// A publisher fed from different source material must diverge on every
/// sequence without any media-timing conflict, and the invariant check must
/// flag the source-material mismatch.
#[test]
fn duplicate_publisher_simulation_rejects_independent_source_material() {
    let publishers = vec![String::from("nuc-a"), String::from("nuc-b")];
    let mut scenario = DuplicatePublisherScenario::new(
        SimulationSeed::new(0x736f_7572_6365_6d61),
        publishers,
        STREAM,
        RENDITION,
        TRACK,
        PROFILE,
        0,
        6,
    );
    scenario.base_network_delay_ms = 0;
    scenario.max_jitter_ms = 0;
    scenario.publisher_source_material.insert(
        String::from("nuc-b"),
        String::from("independent-rf-window"),
    );

    let deadline = DuplicatePublisherInvariantConfig::duplicate_complete_with_deadline(1_000);
    let report = run_duplicate_publisher_simulation(&scenario);
    let verdict = check_duplicate_publisher_invariants(&report, &deadline);

    let every_sequence: Vec<u64> = (0_u64..6).collect();
    assert_eq!(report.summary.divergent_sequences, every_sequence);
    assert_eq!(
        report.summary.media_timing_conflict_sequences,
        Vec::<u64>::new()
    );
    assert_eq!(report.fault_stats.source_material_mismatch_observations, 12);

    let expected_failures: Vec<String> = [
        "divergent_sequences",
        "source_material_mismatch_observations",
        "duplicate_incomplete",
        "duplicate_complete_deadline_unreached",
    ]
    .iter()
    .map(|name| name.to_string())
    .collect();
    assert_eq!(verdict.failures, expected_failures);
}
/// The generic campaign runner must count passes and failures and keep the
/// failure returned for the one failing seed (44 == 0x2c).
#[test]
fn seeded_simulation_campaign_preserves_first_failure() {
    // Fails only for seed 44; the hint is computed for every seed, exactly as
    // the eager `then_some` argument was.
    let fail_on_seed_44 = |seed: SimulationSeed| {
        let hint = seed.replay_hint();
        if seed.0 == 44 {
            Some(hint)
        } else {
            None
        }
    };

    let outcome = run_seeded_simulation_campaign(
        "generic-seeded-campaign",
        SimulationSeed::new(40),
        8,
        fail_on_seed_44,
    );

    assert!(!outcome.all_passed());
    assert_eq!(outcome.passed, 7);
    assert_eq!(outcome.failed, 1);
    assert_eq!(
        outcome.first_failure.as_deref(),
        Some("EC_SIM_SEED=000000000000002c")
    );
}
/// Runs 512 seeded control-plane fault schedules. All must satisfy the
/// invariant, every fault counter checked here must be non-zero, and the
/// slowest observed propagation must stay within the 900 ms deadline.
#[test]
fn control_plane_campaign_runs_many_fault_schedules() {
    let deadline = ControlPlanePropagationInvariantConfig::complete_with_deadline(7, 900);
    let outcome = run_control_plane_propagation_campaign(
        "control-plane-gossip-fault-campaign",
        SimulationSeed::new(1),
        512,
        &deadline,
        faulted_control_plane_scenario,
    );

    // Pass/fail accounting.
    assert!(
        outcome.all_passed(),
        "campaign failed: {:?}",
        outcome.first_failure
    );
    assert_eq!(outcome.passed, 512);
    assert_eq!(outcome.failed, 0);

    // Fault coverage: each kind of fault must have fired at least once.
    assert!(outcome.total_transient_dropped_messages > 0);
    assert!(outcome.total_partition_delayed_messages > 0);
    assert!(outcome.total_node_outage_delayed_messages > 0);
    assert!(outcome.total_duplicate_messages > 0);

    // Slowest observed run.
    assert!(outcome.max_propagation_complete_ms_observed <= 900);
}
/// Runs 512 seeded duplicate-publisher fault schedules. All must satisfy the
/// invariant, every fault counter checked here must be non-zero, and the
/// slowest observed completion must stay within the 3000 ms deadline.
#[test]
fn duplicate_publisher_campaign_runs_many_seed_schedules() {
    let deadline = DuplicatePublisherInvariantConfig::duplicate_complete_with_deadline(3_000);
    let outcome = run_duplicate_publisher_campaign(
        "duplicate-publisher-fault-campaign",
        SimulationSeed::new(1),
        512,
        &deadline,
        faulted_duplicate_scenario,
    );

    // Pass/fail accounting.
    assert!(
        outcome.all_passed(),
        "campaign failed: {:?}",
        outcome.first_failure
    );
    assert_eq!(outcome.passed, 512);
    assert_eq!(outcome.failed, 0);

    // Fault coverage: each kind of fault must have fired at least once.
    assert!(outcome.total_transient_dropped_observations > 0);
    assert!(outcome.total_partition_delayed_observations > 0);
    assert!(outcome.total_publisher_outage_observations > 0);
    assert!(outcome.total_backfill_observations > 0);

    // Slowest observed run.
    assert!(outcome.max_duplicate_complete_ms_observed <= 3_000);
}
assert_eq!(shrunk.scenario.base_network_delay_ms, 0); + assert_eq!(shrunk.report.summary.divergent_sequences, vec![17]); + assert_eq!( + shrunk.invariant.failures, + vec![ + "divergent_sequences".to_string(), + "duplicate_incomplete".to_string(), + "duplicate_complete_deadline_unreached".to_string(), + ] + ); + assert!( + shrunk + .steps + .iter() + .any(|step| step.dimension == "sequence_count" && step.after == "18"), + "shrink steps should record the minimized failing media window" + ); +} + +#[test] +fn duplicate_publisher_campaign_keeps_first_replayable_failure() { + let invariant = DuplicatePublisherInvariantConfig::duplicate_complete_with_deadline(3_000); + let campaign = run_duplicate_publisher_campaign( + "duplicate-publisher-replayable-failure", + SimulationSeed::new(10), + 32, + &invariant, + |seed| { + let mut scenario = faulted_duplicate_scenario(seed); + if seed.0 == 19 { + scenario + .encoder_drifts + .push(EncoderDriftFault::new("nuc-b", 17, "x264-hd3-drift")); + } + scenario + }, + ); + + let failure = campaign + .first_failure + .as_ref() + .expect("campaign should preserve first failure"); + let shrunk = failure + .shrunk_failure + .as_ref() + .expect("campaign should preserve a shrunk replay"); + assert_eq!(failure.seed, SimulationSeed::new(19)); + assert_eq!( + failure.invariant.failures, + vec![ + "divergent_sequences".to_string(), + "duplicate_incomplete".to_string(), + "duplicate_complete_deadline_unreached".to_string(), + ] + ); + + let mut replay = faulted_duplicate_scenario(failure.seed); + replay + .encoder_drifts + .push(EncoderDriftFault::new("nuc-b", 17, "x264-hd3-drift")); + let replayed_report = run_duplicate_publisher_simulation(&replay); + assert_eq!(replayed_report, failure.report); + assert_eq!(shrunk.scenario.expected_sequences(), 18); + assert_eq!(shrunk.report.summary.divergent_sequences, vec![17]); +} + +#[test] +fn system_duplicate_publishers_converge_with_global_sequence_clock() { + let scenario = system_duplicate_scenario( 
+ SimulationSeed::new(0x7379_7374_656d_676c), + PublisherSequenceClock::Global, + ); + + let report = run_system_duplicate_publisher_simulation(&scenario); + let invariant = check_system_duplicate_publisher_invariants( + &report, + &SystemDuplicatePublisherInvariantConfig::complete_with_deadline(3_500), + ); + + assert!( + report.system_complete(), + "{} control={:?} media={:?}", + report.replay_hint, + report.control.missing_nodes, + report.media.summary + ); + assert!( + invariant.ok, + "{} {:?}", + invariant.replay_hint, invariant.failures + ); + assert_eq!(report.media.summary.divergent_sequences, Vec::::new()); + assert_eq!( + report.media.summary.matching_duplicate_sequences.len() as u64, + scenario.media.expected_sequences() + ); + assert!( + report + .publisher_activation_ms + .get("nuc-b") + .copied() + .unwrap_or_default() + > report + .publisher_activation_ms + .get("nuc-a") + .copied() + .unwrap_or_default(), + "faulted control plane should activate nuc-b later than nuc-a" + ); +} + +#[test] +fn system_duplicate_publishers_reject_local_activation_sequence_clock() { + let scenario = system_duplicate_scenario( + SimulationSeed::new(0x7379_7374_656d_6c6f), + PublisherSequenceClock::LocalActivation, + ); + + let report = run_system_duplicate_publisher_simulation(&scenario); + let invariant = check_system_duplicate_publisher_invariants( + &report, + &SystemDuplicatePublisherInvariantConfig::complete_with_deadline(3_500), + ); + + assert!(report.control.propagation_complete()); + assert!(!report.media.duplicate_complete()); + assert!( + !report.media.summary.divergent_sequences.is_empty(), + "local activation clock should cause same advertised sequence to hash differently" + ); + assert_eq!( + invariant.failures, + vec![ + "media_divergent_sequences".to_string(), + "media_timing_conflict_sequences".to_string(), + "media_duplicate_incomplete".to_string(), + "system_complete_deadline_unreached".to_string(), + ] + ); +} + +#[test] +fn 
system_duplicate_publisher_campaign_runs_many_seed_schedules() { + let invariant = SystemDuplicatePublisherInvariantConfig::complete_with_deadline(3_500); + let campaign = run_system_duplicate_publisher_campaign( + "system-duplicate-publisher-fault-campaign", + SimulationSeed::new(1), + 256, + &invariant, + |seed| system_duplicate_scenario(seed, PublisherSequenceClock::Global), + ); + + assert!( + campaign.all_passed(), + "campaign failed: {:?}", + campaign.first_failure + ); + assert_eq!(campaign.passed, 256); + assert_eq!(campaign.failed, 0); + assert!(campaign.max_control_propagation_ms_observed > 0); + assert!(campaign.max_media_duplicate_complete_ms_observed > 0); + assert!(campaign.max_system_complete_ms_observed <= 3_500); + assert!(campaign.total_system_complete_ms_observed > 0); + assert!(campaign.total_control_trace_events > 0); + assert!(campaign.total_media_trace_events > 0); + assert_eq!( + campaign.total_trace_events, + campaign.total_control_trace_events + campaign.total_media_trace_events + ); + assert!(campaign.total_control_transient_drops > 0); + assert!(campaign.total_media_transient_drops > 0); + assert!(campaign.total_media_backfill_observations > 0); + assert!(campaign.seeds_with_system_convergence_time > 0); + assert!(campaign.seeds_with_control_transient_drops > 0); + assert!(campaign.seeds_with_media_transient_drops > 0); + assert!(campaign.seeds_with_media_backfill_observations > 0); + assert!(!campaign.slowest_system_runs.is_empty()); + assert!(campaign.slowest_system_runs.len() <= 16); + assert!(campaign + .slowest_system_runs + .windows(2) + .all(|pair| pair[0].system_complete_at_ms.unwrap_or(u64::MAX) + >= pair[1].system_complete_at_ms.unwrap_or(u64::MAX))); + assert_eq!(campaign.total_media_publisher_phase_offsets, 0); +} + +#[test] +fn foundation_style_system_campaign_runs_replayable_fault_schedules() { + let invariant = SystemDuplicatePublisherInvariantConfig::complete_with_deadline(6_000); + let config = 
FoundationStyleSystemScenarioConfig::default(); + let campaign = run_system_duplicate_publisher_campaign( + "foundation-style-system-campaign", + SimulationSeed::new(1), + 512, + &invariant, + |seed| ec_core::sim::foundation_style_system_duplicate_publisher_scenario(seed, &config), + ); + + assert!( + campaign.all_passed(), + "campaign failed: {:?}", + campaign.first_failure + ); + assert_eq!(campaign.passed, 512); + assert_eq!(campaign.failed, 0); + assert!(campaign.max_system_complete_ms_observed <= 6_000); + assert!(campaign.total_system_complete_ms_observed > 0); + assert!(campaign.total_control_trace_events > 0); + assert!(campaign.total_media_trace_events > 0); + assert_eq!( + campaign.total_trace_events, + campaign.total_control_trace_events + campaign.total_media_trace_events + ); + assert!(campaign.total_control_transient_drops > 0); + assert!(campaign.total_control_partition_delays > 0); + assert!(campaign.total_control_node_outage_delays > 0); + assert!(campaign.total_control_duplicate_messages > 0); + assert!(campaign.total_media_transient_drops > 0); + assert!(campaign.total_media_partition_delays > 0); + assert!(campaign.total_media_publisher_outages > 0); + assert!(campaign.total_media_backfill_observations > 0); + assert!(campaign.seeds_with_system_convergence_time > 0); + assert!(campaign.seeds_with_control_propagation_time > 0); + assert!(campaign.seeds_with_media_duplicate_convergence_time > 0); + assert!(campaign.seeds_with_control_transient_drops > 0); + assert!(campaign.seeds_with_control_partition_delays > 0); + assert!(campaign.seeds_with_control_node_outage_delays > 0); + assert!(campaign.seeds_with_control_duplicate_messages > 0); + assert!(campaign.seeds_with_media_transient_drops > 0); + assert!(campaign.seeds_with_media_partition_delays > 0); + assert!(campaign.seeds_with_media_publisher_outages > 0); + assert!(campaign.seeds_with_media_backfill_observations > 0); + assert!(campaign.fault_coverage_ok()); + 
assert!(!campaign.slowest_system_runs.is_empty()); + assert!(campaign.slowest_system_runs.len() <= 16); + assert_eq!(campaign.total_media_publisher_phase_offsets, 0); +} + +#[test] +fn foundation_style_system_campaign_rejects_local_activation_sequence_clock() { + let invariant = SystemDuplicatePublisherInvariantConfig::complete_with_deadline(6_000); + let mut config = FoundationStyleSystemScenarioConfig::default(); + config.sequence_clock = PublisherSequenceClock::LocalActivation; + let campaign = run_system_duplicate_publisher_campaign( + "foundation-style-local-activation-failure", + SimulationSeed::new(1), + 32, + &invariant, + |seed| ec_core::sim::foundation_style_system_duplicate_publisher_scenario(seed, &config), + ); + + let failure = campaign + .first_failure + .as_ref() + .expect("local activation clock should fail under foundation-style faults"); + + assert!(!campaign.all_passed()); + assert!(failure + .invariant + .failures + .contains(&"media_divergent_sequences".to_string())); + assert!(!failure.report.media.summary.divergent_sequences.is_empty()); + assert!( + failure + .report + .media + .fault_stats + .publisher_phase_offset_observations + > 0 + ); + assert!(campaign.total_media_publisher_phase_offsets > 0); + assert!(campaign.seeds_with_media_publisher_phase_offsets > 0); +} + +#[test] +fn system_duplicate_publisher_campaign_classifies_source_material_mismatch() { + let invariant = SystemDuplicatePublisherInvariantConfig::complete_with_deadline(3_500); + let campaign = run_system_duplicate_publisher_campaign( + "system-source-material-failure", + SimulationSeed::new(1), + 1, + &invariant, + |seed| { + let mut scenario = system_duplicate_scenario(seed, PublisherSequenceClock::Global); + scenario + .media + .publisher_source_material + .insert("nuc-b".to_string(), "independent-rf-window".to_string()); + scenario + }, + ); + + let failure = campaign + .first_failure + .as_ref() + .expect("source material mismatch should fail"); + + 
assert!(!campaign.all_passed()); + assert!(failure + .invariant + .failures + .contains(&"media_source_material_mismatch_observations".to_string())); + assert!(!failure.report.media.summary.divergent_sequences.is_empty()); + assert!( + failure + .report + .media + .fault_stats + .source_material_mismatch_observations + > 0 + ); + assert!(campaign.total_media_source_material_mismatches > 0); + assert_eq!(campaign.seeds_with_media_source_material_mismatches, 1); +} + +fn faulted_duplicate_scenario(seed: SimulationSeed) -> DuplicatePublisherScenario { + let mut scenario = DuplicatePublisherScenario::new( + seed, + vec!["nuc-a".to_string(), "nuc-b".to_string()], + STREAM, + RENDITION, + TRACK, + PROFILE, + 0, + 48, + ); + scenario.segment_step_ms = 40; + scenario.base_network_delay_ms = 5; + scenario.max_jitter_ms = 75; + scenario.transient_drop_per_million = 275_000; + scenario.backfill_after_ms = 600; + scenario.partitions = vec![ + SimulationPartition::new("nuc-b", 120, 520, 140), + SimulationPartition::new("nuc-a", 940, 1_260, 90), + ]; + scenario.publisher_outages = vec![SimulationOutage::new("nuc-b", 1_360, 1_520, 220)]; + scenario +} + +fn faulted_control_plane_scenario(seed: SimulationSeed) -> ControlPlanePropagationScenario { + let mut scenario = ControlPlanePropagationScenario::new( + seed, + vec![ + "nuc-a".to_string(), + "nuc-b".to_string(), + "tower".to_string(), + "forge".to_string(), + "relay-lax".to_string(), + "relay-nyc".to_string(), + "relay-hel".to_string(), + ], + "nuc-a", + "ec.control.broadcast.la-kcop", + "la-kcop@42", + ); + scenario.fanout = 3; + scenario.gossip_interval_ms = 35; + scenario.max_gossip_rounds = 12; + scenario.base_network_delay_ms = 6; + scenario.max_jitter_ms = 45; + scenario.transient_drop_per_million = 120_000; + scenario.partitions = vec![ + SimulationPartition::new("relay-hel", 70, 190, 55), + SimulationPartition::new("tower", 220, 310, 40), + ]; + scenario.node_outages = vec![SimulationOutage::new("relay-nyc", 105, 205, 
45)]; + scenario +} + +fn system_duplicate_scenario( + seed: SimulationSeed, + sequence_clock: PublisherSequenceClock, +) -> SystemDuplicatePublisherScenario { + let mut control = ControlPlanePropagationScenario::new( + seed, + vec![ + "forge".to_string(), + "nuc-a".to_string(), + "nuc-b".to_string(), + "tower".to_string(), + "relay-lax".to_string(), + "relay-nyc".to_string(), + "relay-hel".to_string(), + ], + "forge", + "ec.control.broadcast.la-kcop", + "la-kcop@42", + ); + control.fanout = 3; + control.gossip_interval_ms = 35; + control.max_gossip_rounds = 12; + control.base_network_delay_ms = 6; + control.max_jitter_ms = 45; + control.transient_drop_per_million = 120_000; + control.partitions = vec![ + SimulationPartition::new("nuc-b", 0, 180, 40), + SimulationPartition::new("relay-hel", 70, 190, 55), + ]; + control.node_outages = vec![SimulationOutage::new("relay-nyc", 105, 205, 45)]; + + let mut media = DuplicatePublisherScenario::new( + SimulationSeed::new(seed.0 ^ 0x6d65_6469_6121), + vec!["nuc-a".to_string(), "nuc-b".to_string()], + STREAM, + RENDITION, + TRACK, + PROFILE, + 0, + 48, + ); + media.segment_step_ms = 40; + media.base_network_delay_ms = 5; + media.max_jitter_ms = 75; + media.transient_drop_per_million = 275_000; + media.backfill_after_ms = 600; + media.partitions = vec![SimulationPartition::new("nuc-a", 940, 1_260, 90)]; + media.publisher_outages = vec![SimulationOutage::new("nuc-b", 1_360, 1_520, 220)]; + + let mut scenario = SystemDuplicatePublisherScenario::new(seed, control, media); + scenario.publisher_activation_delay_ms = 25; + scenario.publisher_backfill_delay_ms = 180; + scenario.sequence_clock = sequence_clock; + scenario +} diff --git a/crates/ec-node/Cargo.toml b/crates/ec-node/Cargo.toml index c08cb36..d0fb0bb 100644 --- a/crates/ec-node/Cargo.toml +++ b/crates/ec-node/Cargo.toml @@ -29,17 +29,21 @@ rustls-native-certs = "0.8.3" urlencoding = "2" serde.workspace = true serde_json.workspace = true +opentelemetry.workspace = true 
+opentelemetry-otlp.workspace = true +opentelemetry_sdk.workspace = true tokio = { version = "1", features = ["full"] } tokio-tungstenite = { version = "0.24", default-features = false, features = ["connect", "rustls-tls-webpki-roots"] } futures-util = "0.3" tracing.workspace = true +tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true -web-transport-quinn = "0.11.4" -web-transport-trait = "0.3.3" -hang = "0.14.0" -moq-mux = "0.2.1" -moq-lite = "0.14.0" -moq-native = { version = "0.13.1", default-features = true } +web-transport-quinn = "0.11.9" +web-transport-trait = "0.3.4" +hang = "0.16.0" +moq-mux = "0.4.0" +moq-lite = "0.16.0" +moq-native = { version = "0.14.0", default-features = true } headless_chrome = "1" tokio-util = "0.7" url = "2" diff --git a/crates/ec-node/src/main.rs b/crates/ec-node/src/main.rs index 8d5f842..afaf880 100644 --- a/crates/ec-node/src/main.rs +++ b/crates/ec-node/src/main.rs @@ -7,14 +7,15 @@ mod source; use anyhow::{anyhow, Context, Result}; use blake3; use blockchain::{ManifestObservationLocator, ObservationSink, ObservationSinkOptions}; +use bytes::BytesMut; use clap::ValueEnum; use clap::{Parser, Subcommand}; use ec_chopper::{build_manifest_body_for_chunks, TsChunk}; use ec_core::{ - merkle_proof_for_index, verify_merkle_proof, Manifest, ManifestBody, ManifestSummary, - ManifestVariant, MoqStreamDescriptor, StreamCatalogEntry, StreamControlAnnouncement, - StreamDescriptor, StreamEncryptionInfo, StreamId, StreamKey, StreamMetadata, - StreamTransportDescriptor, MERKLE_PROOF_ALG_BLAKE3, + merkle_proof_for_index, signed_record_index, verify_merkle_proof, Manifest, ManifestBody, + ManifestSummary, ManifestVariant, MoqStreamDescriptor, SignedRecord, StreamCatalogEntry, + StreamControlAnnouncement, StreamDescriptor, StreamEncryptionInfo, StreamId, StreamKey, + StreamMetadata, StreamTransportDescriptor, MERKLE_PROOF_ALG_BLAKE3, }; use ec_crypto::{ decrypt_stream_data, encrypt_stream_data, 
load_ethereum_manifest_keypair_from_env, @@ -32,7 +33,7 @@ use ec_moq::{ MoqNode, MoqPublishSet, ObjectId, ObjectMeta, ObjectPayload, TimingMeta, TrackName, DEFAULT_MANIFEST_TRACK_NAME, DEFAULT_TRACK_NAME, }; -use futures_util::{SinkExt, StreamExt}; +use futures_util::{future::join_all, SinkExt, StreamExt}; use iroh::Watcher; use just_webrtc::types::{DataChannelOptions, ICEServer, PeerConfiguration, PeerConnectionState}; use just_webrtc::{DataChannelExt, PeerConnectionBuilder, PeerConnectionExt}; @@ -40,22 +41,29 @@ use nbc::{ bootstrap_nbc_auth, nbc_capture_fps, resolve_nbc_chrome_path, resolve_nbc_profile_dir, spawn_nbc_frame_reader, }; +use opentelemetry::trace::TracerProvider as _; +use opentelemetry::KeyValue; +use opentelemetry_otlp::{Protocol, SpanExporter, WithExportConfig}; +use opentelemetry_sdk::{trace::SdkTracerProvider, Resource}; use source::{HdhrSource, HlsMode, HlsSource, LinuxDvbSource, StreamSource, TsSource}; -use std::collections::{BTreeMap, HashMap, HashSet}; +use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet, VecDeque}; +use std::ffi::OsString; use std::fs; use std::fs::File; use std::future::Future; -use std::io::{Read, Write}; +use std::io::{BufRead, BufReader, Read, Seek, SeekFrom, Write}; +use std::net::TcpStream as StdTcpStream; use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; use std::str::FromStr; -use std::sync::{Arc, Mutex}; +use std::sync::{Arc, Mutex, OnceLock}; use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; -use tokio::io::{AsyncReadExt, AsyncWriteExt}; +use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; use tokio::net::TcpListener; use tokio::process::Command as TokioCommand; use tokio_tungstenite::tungstenite::Message as WsMessage; use tokio_util::io::SyncIoBridge; +use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt}; use url::Url; const DIRECT_WIRE_TAG_FRAME: u8 = 0x00; @@ -64,10 +72,41 @@ const DIRECT_WIRE_TAG_PING: u8 = 0x02; const DIRECT_HEARTBEAT_TIMEOUT: 
Duration = Duration::from_secs(8); // Conservatively under typical SCTP data channel max message sizes. const DIRECT_WIRE_CHUNK_BYTES: usize = 16 * 1024; -const WT_ARCHIVE_DEFAULT_TRACKS: &[&str] = - &["catalog.json", "init.mp4", "video0.m4s", "audio0.m4s"]; -const WT_PUBLISH_GOP_FRAMES: u32 = 1; -const WT_PUBLISH_VIDEO_FILTER: &str = "fps=6"; +const WT_PUBLISH_PRIMARY_VIDEO_TRACK: &str = "0.m4s"; +const WT_PUBLISH_PRIMARY_AUDIO_TRACK: &str = "1.m4s"; +const WT_PUBLISH_INIT_TRACK: &str = "init.mp4"; +const WT_PUBLISH_INIT_CAPTURE_MAX_BYTES: usize = 4 * 1024 * 1024; +const WT_PUBLISHER_ORIGIN_TRACK: &str = "publisher.m4s"; +const WT_ARCHIVE_HLS_DEFAULT_LIMIT: usize = 900; +const WT_ARCHIVE_TIMELINE_FULL_SCAN_MAX_BYTES: u64 = 128 * 1024 * 1024; +const WT_ARCHIVE_DEFAULT_RETENTION_SECONDS: u64 = 30 * 24 * 60 * 60; +const WT_LADDER_PRIMARY_RENDITION: &str = "720p"; +const WT_LADDER_RENDITIONS: &[&str] = &["480p", "720p", "1080p"]; +const ARCHIVE_CONVERGENCE_PROMETHEUS_METRICS: &[&str] = &[ + "every_channel_archive_convergence_ok", + "every_channel_archive_duplicate_complete", + "every_channel_archive_duplicate_hash_source_records", + "every_channel_archive_duplicate_hash_sequences", + "every_channel_archive_hash_divergent_sequences", + "every_channel_archive_source_local_hash_divergent_sequences", + "every_channel_archive_missing_hash_records", + "every_channel_archive_missing_source_identity_records", + "every_channel_archive_media_timing_conflict_sequences", +]; +const WT_ARCHIVE_DEFAULT_TRACKS: &[&str] = &[ + "catalog.json", + "catalog", + WT_PUBLISH_INIT_TRACK, + WT_PUBLISH_PRIMARY_VIDEO_TRACK, + WT_PUBLISH_PRIMARY_AUDIO_TRACK, +]; +const WT_PUBLISH_GOP_FRAMES: u32 = 30; +const WT_PUBLISH_VIDEO_PRESET: &str = "medium"; +const WT_PUBLISH_VIDEO_CRF: u8 = 23; +const WT_PUBLISH_VIDEO_FILTER: &str = + "yadif=mode=send_frame:parity=auto:deint=interlaced,fps=30000/1001"; +const WT_PUBLISH_ARCHIVE_SEGMENT_DURATION_MS: u64 = 1001; +const WT_PUBLISH_PROOF_PREROLL_PACKETS: 
usize = 2048; const WT_PUBLISH_MOVFLAGS: &str = "empty_moov+frag_keyframe+separate_moof+omit_tfhd_offset"; use tokio::sync::mpsc; use tokio::sync::oneshot; @@ -108,6 +147,7 @@ enum Commands { /// Subscribe to a relay broadcast over WebTransport/MoQ and archive groups into CAS. WtArchive(WtArchiveArgs), /// Serve archived relay groups as DVR-style HLS playlists + object endpoints. + #[command(alias = "archive-serve")] WtArchiveServe(WtArchiveServeArgs), /// Announce stream transport availability over iroh gossip control topic. ControlAnnounce(ControlAnnounceArgs), @@ -117,6 +157,34 @@ enum Commands { ControlResolve(ControlResolveArgs), /// Bridge iroh control announcements into the website public stream directory. ControlBridgeWeb(ControlBridgeWebArgs), + /// Encode one bounded MPEG-TS window with fresh deterministic publisher proof state. + PublisherProofSegment(PublisherProofSegmentArgs), + /// Split MPEG-TS by source clock and encode each window with fresh proof state. + PublisherProofWindows(PublisherProofWindowsArgs), + /// Prove duplicate publisher identities produce identical stateless proof windows. + PublisherProofDuplicates(PublisherProofDuplicatesArgs), + /// Compare stateless proof-window JSON reports produced on separate publishers. + PublisherProofCompare(PublisherProofCompareArgs), + /// Run stateless proof windows on remote publishers and compare the JSON reports. + PublisherProofRemoteCompare(PublisherProofRemoteCompareArgs), + /// Archive live source-clock proof windows into publisher-origin CAS/index records. + PublisherProofArchiveSource(PublisherProofArchiveSourceArgs), + /// Compare archive manifests from duplicate publishers by media BLAKE3 identity. + ArchiveConvergence(ArchiveConvergenceArgs), + /// Fetch remote archive manifests repeatedly and measure duplicate convergence. + ArchiveConvergenceMeasure(ArchiveConvergenceMeasureArgs), + /// Serve remote duplicate archive convergence measurement metrics over HTTP. 
+ ArchiveConvergenceMeasureServe(ArchiveConvergenceMeasureServeArgs), + /// Serve duplicate archive convergence proof metrics over HTTP. + ArchiveConvergenceServe(ArchiveConvergenceServeArgs), + /// Run deterministic control-plane propagation simulations without booting nodes. + SimControlPlane(SimControlPlaneArgs), + /// Run deterministic distributed-system simulations without booting nodes. + SimSystem(SimSystemArgs), + /// Run deterministic duplicate-publisher simulations without booting nodes. + SimDuplicatePublishers(SimDuplicatePublishersArgs), + /// Build a deterministic off-chain signed-record index JSON document. + SignedRecordIndex(SignedRecordIndexArgs), } #[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)] @@ -469,15 +537,38 @@ struct WtPublishArgs { /// Input URL or file for ffmpeg (e.g. HDHomeRun `http://hdhomerun.local/auto/v4.1`). #[arg(long)] input: String, + /// Optional ffmpeg input format passed before `--input` (for example `lavfi` for test sources). + #[arg(long)] + input_format: Option, + /// Read the input at native rate. Useful for files and lavfi sources used as live publishers. + #[arg(long, default_value_t = false)] + realtime_input: bool, /// If set, transcode to H.264/AAC before fragmenting to fMP4. #[arg(long, default_value_t = true, action = clap::ArgAction::Set)] transcode: bool, /// ffmpeg video filter used by the transcode path. #[arg(long, default_value = WT_PUBLISH_VIDEO_FILTER)] video_filter: String, + /// ntsc-rs preset JSON to render before ffmpeg publishing. + /// + /// This is an explicit file preprocessor: pair it with `--ntsc-rs-output`. + #[arg(long)] + ntsc_rs_preset: Option, + /// ntsc-rs CLI binary path. Only used when `--ntsc-rs-preset` is set. + #[arg(long, default_value = "ntsc-rs-cli")] + ntsc_rs_cli: PathBuf, + /// Output file written by ntsc-rs and then read by ffmpeg. + #[arg(long)] + ntsc_rs_output: Option, /// H.264 GOP/keyframe interval in frames for the transcode path. 
#[arg(long, default_value_t = WT_PUBLISH_GOP_FRAMES)] gop_frames: u32, + /// x264 preset for the transcode path. Slower presets usually keep quality at lower bitrate. + #[arg(long, default_value = WT_PUBLISH_VIDEO_PRESET)] + video_preset: String, + /// x264 CRF for the transcode path. + #[arg(long, default_value_t = WT_PUBLISH_VIDEO_CRF)] + video_crf: u8, /// fMP4 movflags used for WebTransport publishing. #[arg(long, default_value = WT_PUBLISH_MOVFLAGS)] movflags: String, @@ -497,6 +588,50 @@ struct WtPublishArgs { /// Control gossip announce interval (ms). #[arg(long, default_value_t = 5000)] control_interval_ms: u64, + /// Exit if the relay `/announced` endpoint stops listing this broadcast for this long. + /// + /// This turns wedged relay sessions into supervised restarts instead of leaving a running + /// process that relays and the public directory consider offline. Set 0 to disable. + #[arg(long, default_value_t = 90000)] + relay_announced_watchdog_ms: u64, + /// Interval for the relay announcement watchdog. + #[arg(long, default_value_t = 10000)] + relay_announced_watchdog_interval_ms: u64, + /// Optional local archive-buffer output dir for publisher-origin fMP4 fragment proof. + #[arg(long)] + publisher_archive_output_dir: Option, + /// Optional manifest dir for publisher-origin proof records. + /// + /// Defaults to `/manifests`. + #[arg(long)] + publisher_archive_manifest_dir: Option, + /// Optional source node stamped into publisher-origin proof records. + /// + /// Defaults to EVERY_CHANNEL_ARCHIVE_SOURCE_NODE, EVERY_CHANNEL_NODE_NAME, HOSTNAME, then + /// `unknown`. + #[arg(long)] + publisher_archive_source_node: Option, + /// Track name for publisher-origin fMP4 fragment proof records. + #[arg(long, default_value = WT_PUBLISHER_ORIGIN_TRACK)] + publisher_archive_track: String, + /// Segment duration used to map fMP4 decode timestamps onto archive group_sequence. 
+ /// + /// When fragments carry `tfdt` timestamps, publisher-origin proof uses this media-time cadence + /// instead of a local process counter. Missing timestamps fall back to the local counter. + #[arg(long, default_value_t = WT_PUBLISH_ARCHIVE_SEGMENT_DURATION_MS)] + publisher_archive_segment_duration_ms: u64, + /// Delay ffmpeg startup until the next Unix-epoch media boundary. + /// + /// This keeps restarted duplicate publishers on the same fragment/keyframe phase. Set 0 to + /// start immediately. + #[arg(long, default_value_t = WT_PUBLISH_ARCHIVE_SEGMENT_DURATION_MS)] + publisher_start_boundary_ms: u64, + /// Use wall-clock input timestamps for live publisher output. + /// + /// Duplicate publishers need fMP4 `tfdt` values to share a global media-time origin. Set false + /// only for file/replay inputs that must keep source timestamps. + #[arg(long, default_value_t = true, action = clap::ArgAction::Set)] + publisher_wallclock_timestamps: bool, /// Optional iroh secret key (hex) for control gossip endpoint identity. #[arg(long)] iroh_secret: Option, @@ -583,7 +718,7 @@ struct NbcWtPublishArgs { #[derive(Parser, Debug)] struct WtArchiveArgs { /// Relay URL (WebTransport) to connect to. - #[arg(long, default_value = "https://cdn.moq.dev/anon")] + #[arg(long, default_value = "https://relay.every.channel/anon")] url: String, /// Broadcast name to subscribe to. #[arg(long)] @@ -599,6 +734,12 @@ struct WtArchiveArgs { /// When omitted, defaults to catalog+init+primary audio/video tracks. #[arg(long)] track: Vec, + /// Source identity stamped into archive records. + /// + /// Defaults to EVERY_CHANNEL_ARCHIVE_SOURCE_NODE, EVERY_CHANNEL_NODE_NAME, HOSTNAME, then + /// `unknown`. Production duplicate proof requires at least two distinct source identities. + #[arg(long)] + source_node: Option, /// Danger: disable TLS verification for the relay. 
#[arg(long, default_value_t = false)] tls_disable_verify: bool, @@ -616,6 +757,24 @@ struct WtArchiveServeArgs { /// TCP listen address for HTTP replay endpoints. #[arg(long, default_value = "0.0.0.0:7788")] listen: String, + /// Optional archive root URL to fetch missing CAS objects from. + /// The root is expected to contain `objects/blake3//.bin`. + #[arg(long)] + archive_origin_url: Option, + /// Optional CAS root URL to fetch missing objects from. + /// The root is expected to contain `/.bin`. + #[arg(long)] + archive_cas_origin_url: Option, + /// Maximum object size fetched from an archive origin. + #[arg(long, default_value_t = 64 * 1024 * 1024)] + archive_origin_max_bytes: usize, + /// Access marker root used by hot-cache pruning to retain demand-fetched objects. + /// Defaults to `/cache-access/blake3`. + #[arg(long)] + cache_access_dir: Option, + /// Retention target reported by archive status endpoints. + #[arg(long, default_value_t = WT_ARCHIVE_DEFAULT_RETENTION_SECONDS)] + archive_retention_seconds: u64, } #[derive(Parser, Debug)] @@ -639,7 +798,7 @@ struct ControlAnnounceArgs { #[arg(long)] relay_broadcast: Option, /// Relay track name for relay transport advertisement. - #[arg(long, default_value = "video0.m4s")] + #[arg(long, default_value = "0.m4s")] relay_track: String, /// Direct iroh endpoint address/id for direct transport advertisement. /// Defaults to this process endpoint id when `--direct-broadcast` is set. @@ -751,6 +910,714 @@ struct ControlBridgeWebArgs { gossip_peer: Vec, } +#[derive(Parser, Debug)] +struct PublisherProofSegmentArgs { + /// Bounded MPEG-TS source-clock window to encode. + #[arg(long)] + input_ts: PathBuf, + /// Output fMP4 proof file. + #[arg(long)] + output_mp4: PathBuf, + /// If set, transcode to H.264/AAC before fragmenting to fMP4. + #[arg(long, default_value_t = true, action = clap::ArgAction::Set)] + transcode: bool, + /// ffmpeg video filter used by the transcode path. 
+ #[arg(long, default_value = WT_PUBLISH_VIDEO_FILTER)] + video_filter: String, + /// H.264 GOP/keyframe interval in frames for the transcode path. + #[arg(long, default_value_t = WT_PUBLISH_GOP_FRAMES)] + gop_frames: u32, + /// x264 preset for the transcode path. + #[arg(long, default_value = WT_PUBLISH_VIDEO_PRESET)] + video_preset: String, + /// x264 CRF for the transcode path. + #[arg(long, default_value_t = WT_PUBLISH_VIDEO_CRF)] + video_crf: u8, + /// fMP4 movflags used for proof encoding. + #[arg(long, default_value = WT_PUBLISH_MOVFLAGS)] + movflags: String, + /// Pretty-print output JSON. + #[arg(long, default_value_t = false)] + pretty: bool, +} + +#[derive(Parser, Debug)] +struct PublisherProofWindowsArgs { + /// MPEG-TS input to split by source clock. + #[arg(long)] + input_ts: PathBuf, + /// Output directory for source windows and fMP4 proof files. + #[arg(long)] + output_dir: PathBuf, + /// Source-clock chunk duration. + #[arg(long, default_value_t = WT_PUBLISH_ARCHIVE_SEGMENT_DURATION_MS)] + chunk_ms: u64, + /// Maximum source windows to encode. + #[arg(long)] + max_chunks: Option, + /// MPEG-TS packets prepended from before each source-clock window for decoder context. + #[arg(long, default_value_t = WT_PUBLISH_PROOF_PREROLL_PACKETS)] + preroll_packets: usize, + /// If set, transcode to H.264/AAC before fragmenting to fMP4. + #[arg(long, default_value_t = true, action = clap::ArgAction::Set)] + transcode: bool, + /// ffmpeg video filter used by the transcode path. + #[arg(long, default_value = WT_PUBLISH_VIDEO_FILTER)] + video_filter: String, + /// H.264 GOP/keyframe interval in frames for the transcode path. + #[arg(long, default_value_t = WT_PUBLISH_GOP_FRAMES)] + gop_frames: u32, + /// x264 preset for the transcode path. + #[arg(long, default_value = WT_PUBLISH_VIDEO_PRESET)] + video_preset: String, + /// x264 CRF for the transcode path. 
+ #[arg(long, default_value_t = WT_PUBLISH_VIDEO_CRF)] + video_crf: u8, + /// fMP4 movflags used for proof encoding. + #[arg(long, default_value = WT_PUBLISH_MOVFLAGS)] + movflags: String, + /// Pretty-print output JSON. + #[arg(long, default_value_t = false)] + pretty: bool, +} + +#[derive(Parser, Debug)] +struct PublisherProofDuplicatesArgs { + /// MPEG-TS input to split and encode independently for each publisher identity. + #[arg(long)] + input_ts: PathBuf, + /// Output directory containing per-publisher source windows and fMP4 proof files. + #[arg(long)] + output_dir: PathBuf, + /// Publisher identity labels to compare. Defaults to publisher-a and publisher-b. + #[arg(long, action = clap::ArgAction::Append)] + publisher: Vec, + /// Source-clock chunk duration. + #[arg(long, default_value_t = WT_PUBLISH_ARCHIVE_SEGMENT_DURATION_MS)] + chunk_ms: u64, + /// Maximum source windows to encode per publisher. + #[arg(long)] + max_chunks: Option, + /// MPEG-TS packets prepended from before each source-clock window for decoder context. + #[arg(long, default_value_t = WT_PUBLISH_PROOF_PREROLL_PACKETS)] + preroll_packets: usize, + /// If set, transcode to H.264/AAC before fragmenting to fMP4. + #[arg(long, default_value_t = true, action = clap::ArgAction::Set)] + transcode: bool, + /// ffmpeg video filter used by the transcode path. + #[arg(long, default_value = WT_PUBLISH_VIDEO_FILTER)] + video_filter: String, + /// H.264 GOP/keyframe interval in frames for the transcode path. + #[arg(long, default_value_t = WT_PUBLISH_GOP_FRAMES)] + gop_frames: u32, + /// x264 preset for the transcode path. + #[arg(long, default_value = WT_PUBLISH_VIDEO_PRESET)] + video_preset: String, + /// x264 CRF for the transcode path. + #[arg(long, default_value_t = WT_PUBLISH_VIDEO_CRF)] + video_crf: u8, + /// fMP4 movflags used for proof encoding. + #[arg(long, default_value = WT_PUBLISH_MOVFLAGS)] + movflags: String, + /// Pretty-print output JSON. 
// CLI arguments for comparing previously generated, named proof-window JSON reports.
//
// Plain `//` comments are used on purpose: clap's derive macros turn `///` doc comments into
// user-visible help text, so the existing `///` lines are part of the program's output and are
// left exactly as written.
//
// NOTE(review): the element type of `report` is missing in this copy of the source (bare
// `Vec`). Presumably `Vec<String>` carrying `NAME=PATH` pairs as the help text describes —
// confirm against the repository before relying on it.
#[derive(Parser, Debug)]
struct PublisherProofCompareArgs {
    /// Named proof-window JSON report, e.g. `nuc-a=/tmp/nuc-a-proof.json`. Repeatable.
    #[arg(long = "report", action = clap::ArgAction::Append)]
    report: Vec,
    /// Pretty-print output JSON.
    #[arg(long, default_value_t = false)]
    pretty: bool,
    /// Exit non-zero unless every compared source/init/media hash matches.
    #[arg(long, default_value_t = false)]
    require_ok: bool,
}
+ #[arg(long, default_value_t = true, action = clap::ArgAction::Set)] + transcode: bool, + /// ffmpeg video filter used by the transcode path. + #[arg(long, default_value = WT_PUBLISH_VIDEO_FILTER)] + video_filter: String, + /// H.264 GOP/keyframe interval in frames for the transcode path. + #[arg(long, default_value_t = WT_PUBLISH_GOP_FRAMES)] + gop_frames: u32, + /// x264 preset for the transcode path. + #[arg(long, default_value = WT_PUBLISH_VIDEO_PRESET)] + video_preset: String, + /// x264 CRF for the transcode path. + #[arg(long, default_value_t = WT_PUBLISH_VIDEO_CRF)] + video_crf: u8, + /// fMP4 movflags used for proof encoding. + #[arg(long, default_value = WT_PUBLISH_MOVFLAGS)] + movflags: String, + /// Remove generated remote proof directories after reports are collected. + #[arg(long, default_value_t = false)] + cleanup_remote: bool, + /// Pretty-print output JSON. + #[arg(long, default_value_t = false)] + pretty: bool, + /// Exit non-zero unless every compared source/init/media hash matches. + #[arg(long, default_value_t = false)] + require_ok: bool, +} + +#[derive(Parser, Debug, Clone)] +struct PublisherProofArchiveSourceArgs { + /// MPEG-TS source input path or URL to split by source clock. + #[arg(long)] + input: String, + /// Optional input format passed through for non-standard source inputs. + #[arg(long)] + input_format: Option, + /// Local archive-buffer output dir for publisher-origin proof fragments. + #[arg(long)] + output_dir: PathBuf, + /// Optional manifest/index output dir. Defaults to `/manifests`. + #[arg(long)] + manifest_dir: Option, + /// Relay URL stamped into publisher-origin proof records. + #[arg(long)] + relay_url: String, + /// Broadcast name stamped into publisher-origin proof records. + #[arg(long)] + name: String, + /// Track name for publisher-origin proof records. + #[arg(long, default_value = WT_PUBLISHER_ORIGIN_TRACK)] + track: String, + /// Optional source node stamped into publisher-origin proof records. 
+ #[arg(long)] + source_node: Option, + /// Source-clock chunk duration. + #[arg(long, default_value_t = WT_PUBLISH_ARCHIVE_SEGMENT_DURATION_MS)] + chunk_ms: u64, + /// Maximum source windows to encode. Omit for live continuous operation. + #[arg(long)] + max_chunks: Option, + /// MPEG-TS packets prepended from before each source-clock window for decoder context. + #[arg(long, default_value_t = WT_PUBLISH_PROOF_PREROLL_PACKETS)] + preroll_packets: usize, + /// If set, transcode to H.264/AAC before fragmenting to fMP4. + #[arg(long, default_value_t = true, action = clap::ArgAction::Set)] + transcode: bool, + /// ffmpeg video filter used by the transcode path. + #[arg(long, default_value = WT_PUBLISH_VIDEO_FILTER)] + video_filter: String, + /// H.264 GOP/keyframe interval in frames for the transcode path. + #[arg(long, default_value_t = WT_PUBLISH_GOP_FRAMES)] + gop_frames: u32, + /// x264 preset for the transcode path. + #[arg(long, default_value = WT_PUBLISH_VIDEO_PRESET)] + video_preset: String, + /// x264 CRF for the transcode path. + #[arg(long, default_value_t = WT_PUBLISH_VIDEO_CRF)] + video_crf: u8, + /// fMP4 movflags used for proof encoding. + #[arg(long, default_value = WT_PUBLISH_MOVFLAGS)] + movflags: String, + /// Remove temporary source-window and fMP4 proof files after CAS/index records are written. + #[arg(long, default_value_t = true, action = clap::ArgAction::Set)] + cleanup_temp: bool, + /// Pretty-print a final JSON summary on normal EOF. + #[arg(long, default_value_t = false)] + pretty: bool, +} + +#[derive(Parser, Debug)] +struct ArchiveConvergenceArgs { + /// Named archive manifest root, e.g. `nuc-a=/srv/every-channel/archive-buffer/manifests`. + #[arg(long, action = clap::ArgAction::Append)] + source: Vec, + /// Logical broadcast name under each manifest root. + #[arg(long)] + broadcast: String, + /// Track to compare. + #[arg(long, default_value = WT_PUBLISHER_ORIGIN_TRACK)] + track: String, + /// Logical stream id for the comparison. 
Defaults to `--broadcast` without ladder suffix. + #[arg(long)] + stream_id: Option, + /// Logical rendition id. Defaults to track or broadcast ladder suffix, then `primary`. + #[arg(long)] + rendition: Option, + /// First sequence to include. Defaults to the earliest observed sequence. + #[arg(long)] + start_sequence: Option, + /// Exclusive end sequence. Defaults to latest observed sequence + 1. + #[arg(long)] + end_sequence: Option, + /// Pretty-print output JSON. + #[arg(long, default_value_t = false)] + pretty: bool, + /// Render Prometheus text metrics instead of JSON. + #[arg(long, default_value_t = false)] + prometheus: bool, + /// Node label used by `--prometheus` output. + #[arg(long)] + metrics_node: Option, + /// Role label used by `--prometheus` output. + #[arg(long, default_value = "duplicate-proof")] + metrics_role: String, + /// Exit non-zero unless duplicate convergence is proven. + #[arg(long, default_value_t = false)] + require_ok: bool, +} + +#[derive(Parser, Debug, Clone)] +struct ArchiveConvergenceMeasureArgs { + /// Named node-agent base URL, e.g. `nuc-a=http://100.x.y.z:7799`. + #[arg(long = "agent-manifest", action = clap::ArgAction::Append)] + agent_manifest: Vec, + /// Prometheus file-SD JSON containing node-agent scrape targets. + #[arg(long = "agent-prometheus-sd", action = clap::ArgAction::Append)] + agent_prometheus_sd: Vec, + /// Label filter for Prometheus file-SD groups, e.g. `headscale_user=node`. + #[arg(long = "agent-prometheus-sd-label", action = clap::ArgAction::Append)] + agent_prometheus_sd_label: Vec, + /// Role query parameter for node-agent `/v1/archive-manifest`. + #[arg(long, default_value = "publisher-buffer")] + agent_manifest_role: String, + /// Named manifest JSONL URL, e.g. `nuc-a=https://host/manifests/la-kcop/publisher.m4s.jsonl`. + #[arg(long, action = clap::ArgAction::Append)] + manifest: Vec, + /// Logical broadcast name under each manifest root. + #[arg(long)] + broadcast: String, + /// Track to compare. 
+ #[arg(long, default_value = WT_PUBLISHER_ORIGIN_TRACK)] + track: String, + /// Logical stream id for the comparison. Defaults to `--broadcast` without ladder suffix. + #[arg(long)] + stream_id: Option, + /// Logical rendition id. Defaults to track or broadcast ladder suffix, then `primary`. + #[arg(long)] + rendition: Option, + /// First sequence to include. Defaults to the earliest observed sequence. + #[arg(long)] + start_sequence: Option, + /// Exclusive end sequence. Defaults to latest observed sequence + 1. + #[arg(long)] + end_sequence: Option, + /// Prometheus base URL used to check Grafana-facing duplicate proof series. + #[arg(long)] + prometheus_url: Option, + /// Total wall-clock measurement duration. Set 0 for one sample. + #[arg(long, default_value_t = 0.0)] + duration_seconds: f64, + /// Delay between samples when duration is non-zero. + #[arg(long, default_value_t = 30.0)] + poll_interval_seconds: f64, + /// HTTP request timeout in milliseconds. + #[arg(long, default_value_t = 10000)] + timeout_ms: u64, + /// Maximum manifest bytes fetched per source. + #[arg(long, default_value_t = 4 * 1024 * 1024)] + max_manifest_bytes: usize, + /// Pretty-print output JSON. + #[arg(long, default_value_t = false)] + pretty: bool, + /// Exit non-zero unless elapsed production duplicate convergence is proven. + #[arg(long, default_value_t = false)] + require_ok: bool, +} + +#[derive(Parser, Debug, Clone)] +struct ArchiveConvergenceMeasureServeArgs { + /// Named node-agent base URL, e.g. `nuc-a=http://100.x.y.z:7799`. + #[arg(long = "agent-manifest", action = clap::ArgAction::Append)] + agent_manifest: Vec, + /// Prometheus file-SD JSON containing node-agent scrape targets. + #[arg(long = "agent-prometheus-sd", action = clap::ArgAction::Append)] + agent_prometheus_sd: Vec, + /// Label filter for Prometheus file-SD groups, e.g. `headscale_user=node`. 
+ #[arg(long = "agent-prometheus-sd-label", action = clap::ArgAction::Append)] + agent_prometheus_sd_label: Vec, + /// Role query parameter for node-agent `/v1/archive-manifest`. + #[arg(long, default_value = "publisher-buffer")] + agent_manifest_role: String, + /// Named manifest JSONL URL, e.g. `nuc-a=https://host/manifests/la-kcop/publisher.m4s.jsonl`. + #[arg(long, action = clap::ArgAction::Append)] + manifest: Vec, + /// Logical broadcast name under each manifest root. + #[arg(long)] + broadcast: String, + /// Track to compare. + #[arg(long, default_value = WT_PUBLISHER_ORIGIN_TRACK)] + track: String, + /// Logical stream id for the comparison. Defaults to `--broadcast` without ladder suffix. + #[arg(long)] + stream_id: Option, + /// Logical rendition id. Defaults to track or broadcast ladder suffix, then `primary`. + #[arg(long)] + rendition: Option, + /// First sequence to include. Defaults to the earliest observed sequence. + #[arg(long)] + start_sequence: Option, + /// Exclusive end sequence. Defaults to latest observed sequence + 1. + #[arg(long)] + end_sequence: Option, + /// Prometheus base URL used to check Grafana-facing duplicate proof series. + #[arg(long)] + prometheus_url: Option, + /// HTTP request timeout in milliseconds. + #[arg(long, default_value_t = 10000)] + timeout_ms: u64, + /// Maximum manifest bytes fetched per source. + #[arg(long, default_value_t = 4 * 1024 * 1024)] + max_manifest_bytes: usize, + /// Listen address for this proof service's `/health` and `/metrics` endpoints. + #[arg(long, default_value = "127.0.0.1:7813")] + listen: String, + /// Maximum recent scrape samples retained for elapsed convergence proof. + #[arg(long, default_value_t = 16)] + max_samples: usize, + /// Minimum elapsed sample window required before this service reports measurement ok. + #[arg(long, default_value_t = 30.0)] + min_elapsed_seconds: f64, + /// Node label used by emitted Prometheus metrics. 
// CLI arguments for the long-running archive-convergence service that exposes Prometheus
// `/metrics` on `listen` (default `127.0.0.1:7812`).
//
// The comparison inputs (`source`, `broadcast`, `track`, `stream_id`, `rendition`,
// `start_sequence`, `end_sequence`) and the metric labels (`metrics_node`, `metrics_role`)
// carry the same help text and defaults as the like-named fields of `ArchiveConvergenceArgs`.
//
// Plain `//` comments are used on purpose: clap's derive macros turn `///` doc comments into
// user-visible help text, so the existing `///` lines are left exactly as written.
//
// NOTE(review): the type arguments of every `Vec` / `Option` field are missing in this copy of
// the source. Presumably `Vec<String>` for `source`, `Option<String>` for the id/label fields
// and an unsigned integer for the sequence bounds — confirm against the repository.
#[derive(Parser, Debug, Clone)]
struct ArchiveConvergenceServeArgs {
    /// Named archive manifest root, e.g. `nuc-a=/srv/every-channel/archive-buffer/manifests`.
    #[arg(long, action = clap::ArgAction::Append)]
    source: Vec,
    /// Logical broadcast name under each manifest root.
    #[arg(long)]
    broadcast: String,
    /// Track to compare.
    #[arg(long, default_value = WT_PUBLISHER_ORIGIN_TRACK)]
    track: String,
    /// Logical stream id for the comparison. Defaults to `--broadcast` without ladder suffix.
    #[arg(long)]
    stream_id: Option,
    /// Logical rendition id. Defaults to track or broadcast ladder suffix, then `primary`.
    #[arg(long)]
    rendition: Option,
    /// First sequence to include. Defaults to the earliest observed sequence.
    #[arg(long)]
    start_sequence: Option,
    /// Exclusive end sequence. Defaults to latest observed sequence + 1.
    #[arg(long)]
    end_sequence: Option,
    /// TCP listen address for Prometheus `/metrics`.
    #[arg(long, default_value = "127.0.0.1:7812")]
    listen: String,
    /// Node label used by Prometheus output.
    #[arg(long)]
    metrics_node: Option,
    /// Role label used by Prometheus output.
    #[arg(long, default_value = "duplicate-proof")]
    metrics_role: String,
}
+ #[arg(long, default_value = "nuc-a")] + origin_node: String, + /// Logical gossip topic under test. + #[arg(long, default_value = "ec.control.broadcast.la-kcop")] + topic: String, + /// Announcement id under test. + #[arg(long, default_value = "la-kcop@42")] + announcement_id: String, + /// Number of peers each known node gossips to per round. + #[arg(long, default_value_t = 3)] + fanout: u64, + /// Logical milliseconds between gossip rounds for a node that knows the message. + #[arg(long, default_value_t = 35)] + gossip_interval_ms: u64, + /// Maximum gossip rounds per node after it learns the announcement. + #[arg(long, default_value_t = 12)] + max_gossip_rounds: u64, + /// Base network delivery delay per control message. + #[arg(long, default_value_t = 6)] + base_network_delay_ms: u64, + /// Maximum deterministic jitter added to each control message. + #[arg(long, default_value_t = 45)] + max_jitter_ms: u64, + /// Probability of transient delivery drop, expressed per million messages. + #[arg(long, default_value_t = 120_000)] + transient_drop_per_million: u32, + /// Required propagation deadline. Set 0 to disable the deadline gate. + #[arg(long, default_value_t = 900)] + max_propagation_complete_ms: u64, + /// Partition fault as NODE:START_MS:END_MS:RELEASE_DELAY_MS. Repeatable. + #[arg(long = "partition", action = clap::ArgAction::Append)] + partitions: Vec, + /// Node outage as NODE:START_MS:END_MS:RECOVERY_DELAY_MS. Repeatable. + #[arg(long = "node-outage", action = clap::ArgAction::Append)] + node_outages: Vec, + /// Pretty-print output JSON. + #[arg(long, default_value_t = false)] + pretty: bool, + /// Emit failure JSON without returning a non-zero exit status. 
+ #[arg(long, default_value_t = false)] + allow_failure: bool, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)] +enum SimPublisherSequenceClock { + Global, + LocalActivation, +} + +impl From for ec_core::sim::PublisherSequenceClock { + fn from(value: SimPublisherSequenceClock) -> Self { + match value { + SimPublisherSequenceClock::Global => ec_core::sim::PublisherSequenceClock::Global, + SimPublisherSequenceClock::LocalActivation => { + ec_core::sim::PublisherSequenceClock::LocalActivation + } + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)] +enum SimSystemFaultProfile { + Fixed, + #[value(name = "foundationdb")] + FoundationDb, +} + +#[derive(Parser, Debug)] +struct SimSystemArgs { + /// Replay an exact SystemDuplicatePublisherScenario JSON. Use `-` to read stdin. + #[arg(long)] + scenario_json: Option, + /// First deterministic seed for the campaign. + #[arg(long, default_value_t = 1)] + seed: u64, + /// Number of consecutive seed schedules to run. + #[arg(long, default_value_t = 1024)] + iterations: u64, + /// Logical node name. Repeat to override the default topology. + #[arg(long = "node", action = clap::ArgAction::Append)] + nodes: Vec, + /// Publisher node name. Repeat for more than two publishers. + #[arg(long = "publisher", action = clap::ArgAction::Append)] + publishers: Vec, + /// Control-plane origin node. + #[arg(long, default_value = "forge")] + origin_node: String, + /// Control topic under test. + #[arg(long, default_value = "ec.control.broadcast.la-kcop")] + topic: String, + /// Control announcement id under test. + #[arg(long, default_value = "la-kcop@42")] + announcement_id: String, + /// Media sequence clock model. + #[arg(long, value_enum, default_value_t = SimPublisherSequenceClock::Global)] + sequence_clock: SimPublisherSequenceClock, + /// Fault scenario profile to generate for each deterministic seed. 
+ #[arg(long, value_enum, default_value_t = SimSystemFaultProfile::Fixed)] + fault_profile: SimSystemFaultProfile, + /// Require the campaign to exercise every built-in distributed fault class. + #[arg(long, default_value_t = false)] + require_fault_coverage: bool, + /// Minimum seeds that must exercise each required fault class. Set 0 for profile default. + #[arg(long, default_value_t = 0)] + min_fault_seed_coverage: u64, + /// Logical stream id under test. + #[arg(long, default_value = "la-kcop")] + stream_id: String, + /// Logical rendition id under test. + #[arg(long, default_value = WT_LADDER_PRIMARY_RENDITION)] + rendition: String, + /// Track name under test. + #[arg(long, default_value = WT_PUBLISH_PRIMARY_VIDEO_TRACK)] + track: String, + /// Encoder/profile identity expected to produce byte-identical chunks. + #[arg(long, default_value = "x264-hd3-v1")] + profile: String, + /// Number of media sequences to model. + #[arg(long, default_value_t = 48)] + sequence_count: u64, + /// Logical milliseconds between produced media sequences. + #[arg(long, default_value_t = 40)] + segment_step_ms: u64, + /// Additional delay after a publisher learns the control message before it can publish. + #[arg(long, default_value_t = 25)] + publisher_activation_delay_ms: u64, + /// Delay before pre-activation media is backfilled. + #[arg(long, default_value_t = 180)] + publisher_backfill_delay_ms: u64, + /// Source material identity as NODE:MATERIAL_ID. Different ids model independent captures that cannot be byte-identical. + #[arg(long = "publisher-source-material", action = clap::ArgAction::Append)] + publisher_source_material: Vec, + /// Required system convergence deadline. Set 0 to disable the deadline gate. + #[arg(long, default_value_t = 3_500)] + max_system_complete_ms: u64, + /// Write the first failing system simulation artifact to this JSON path. + #[arg(long)] + failure_artifact: Option, + /// Pretty-print output JSON. 
+ #[arg(long, default_value_t = false)] + pretty: bool, + /// Emit failure JSON without returning a non-zero exit status. + #[arg(long, default_value_t = false)] + allow_failure: bool, +} + +#[derive(Parser, Debug)] +struct SimDuplicatePublishersArgs { + /// Replay an exact DuplicatePublisherScenario JSON. Use `-` to read stdin. + #[arg(long)] + scenario_json: Option, + /// First deterministic seed for the campaign. + #[arg(long, default_value_t = 1)] + seed: u64, + /// Number of consecutive seed schedules to run. + #[arg(long, default_value_t = 1024)] + iterations: u64, + /// Publisher node name. Repeat for more than two publishers. + #[arg(long = "publisher", action = clap::ArgAction::Append)] + publishers: Vec, + /// Logical stream id under test. + #[arg(long, default_value = "la-kcop")] + stream_id: String, + /// Logical rendition id under test. + #[arg(long, default_value = WT_LADDER_PRIMARY_RENDITION)] + rendition: String, + /// Track name under test. + #[arg(long, default_value = WT_PUBLISH_PRIMARY_VIDEO_TRACK)] + track: String, + /// Encoder/profile identity expected to produce byte-identical chunks. + #[arg(long, default_value = "x264-hd3-v1")] + profile: String, + /// First media sequence. + #[arg(long, default_value_t = 0)] + start_sequence: u64, + /// Number of media sequences to model. + #[arg(long, default_value_t = 48)] + sequence_count: u64, + /// Logical milliseconds between produced media sequences. + #[arg(long, default_value_t = 40)] + segment_step_ms: u64, + /// Base network delivery delay per observation. + #[arg(long, default_value_t = 5)] + base_network_delay_ms: u64, + /// Maximum deterministic jitter added to each delivery. + #[arg(long, default_value_t = 75)] + max_jitter_ms: u64, + /// Probability of transient delivery drop, expressed per million observations. + #[arg(long, default_value_t = 275_000)] + transient_drop_per_million: u32, + /// Delay before a dropped observation is backfilled. 
+ #[arg(long, default_value_t = 600)] + backfill_after_ms: u64, + /// Required duplicate convergence deadline. Set 0 to disable the deadline gate. + #[arg(long, default_value_t = 3_000)] + max_duplicate_complete_ms: u64, + /// Partition fault as NODE:START_MS:END_MS:RELEASE_DELAY_MS. Repeatable. + #[arg(long = "partition", action = clap::ArgAction::Append)] + partitions: Vec, + /// Publisher outage as NODE:START_MS:END_MS:BACKFILL_DELAY_MS. Repeatable. + #[arg(long = "publisher-outage", action = clap::ArgAction::Append)] + publisher_outages: Vec, + /// Publisher content phase offset as NODE:SEQUENCE_OFFSET. Repeatable. + #[arg(long = "publisher-sequence-offset", action = clap::ArgAction::Append)] + publisher_sequence_offsets: Vec, + /// Publisher media clock offset as NODE:OFFSET_MS. Repeatable. + #[arg(long = "publisher-media-time-offset", action = clap::ArgAction::Append)] + publisher_media_time_offsets_ms: Vec, + /// Publisher that emits archive records without media timing. Repeatable. + #[arg(long = "missing-media-timing-publisher", action = clap::ArgAction::Append)] + missing_media_timing_publishers: Vec, + /// Encoder drift fault as NODE:SEQUENCE:PROFILE_ID. Repeatable. + #[arg(long = "encoder-drift", action = clap::ArgAction::Append)] + encoder_drifts: Vec, + /// Source material identity as NODE:MATERIAL_ID. Different ids model independent captures that cannot be byte-identical. + #[arg(long = "publisher-source-material", action = clap::ArgAction::Append)] + publisher_source_material: Vec, + /// Write the first failing simulation artifact to this JSON path. + #[arg(long)] + failure_artifact: Option, + /// Pretty-print output JSON. + #[arg(long, default_value_t = false)] + pretty: bool, + /// Emit failure JSON without returning a non-zero exit status. + #[arg(long, default_value_t = false)] + allow_failure: bool, +} + +#[derive(Parser, Debug)] +struct SignedRecordIndexArgs { + /// JSON input path. Use `-` to read from stdin. 
+ /// + /// Accepted shapes are `[SignedRecord, ...]`, `{ "records": [...] }`, or a single SignedRecord. + #[arg(long, default_value = "-")] + input: PathBuf, + /// Override the generated_unix_ms timestamp in the output index. + #[arg(long)] + generated_unix_ms: Option, + /// Pretty-print output JSON. + #[arg(long, default_value_t = false)] + pretty: bool, +} + +#[derive(Debug, serde::Deserialize)] +#[serde(untagged)] +enum SignedRecordIndexInput { + Records(Vec), + Wrapped { records: Vec }, + Single(SignedRecord), +} + #[derive(Subcommand, Debug)] enum IngestSource { /// Ingest from an HDHomeRun device. @@ -809,13 +1676,7 @@ fn main() -> Result<()> { let _ = rustls::crypto::ring::default_provider().install_default(); } - // Keep stdout reserved for machine-readable output (endpoint addr, etc). - let filter = tracing_subscriber::EnvFilter::try_from_default_env() - .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")); - tracing_subscriber::fmt() - .with_writer(std::io::stderr) - .with_env_filter(filter) - .init(); + let _telemetry = init_tracing()?; let cli = Cli::parse(); match cli.command { @@ -836,11 +1697,154 @@ fn main() -> Result<()> { Commands::ControlListen(args) => run_async(control_listen(args))?, Commands::ControlResolve(args) => run_async(control_resolve(args))?, Commands::ControlBridgeWeb(args) => run_async(control_bridge_web(args))?, + Commands::PublisherProofSegment(args) => publisher_proof_segment_command(args)?, + Commands::PublisherProofWindows(args) => publisher_proof_windows_command(args)?, + Commands::PublisherProofDuplicates(args) => publisher_proof_duplicates_command(args)?, + Commands::PublisherProofCompare(args) => publisher_proof_compare_command(args)?, + Commands::PublisherProofRemoteCompare(args) => { + publisher_proof_remote_compare_command(args)? + } + Commands::PublisherProofArchiveSource(args) => { + publisher_proof_archive_source_command(args)? 
+ } + Commands::ArchiveConvergence(args) => archive_convergence_command(args)?, + Commands::ArchiveConvergenceMeasure(args) => { + run_async(archive_convergence_measure_command(args))? + } + Commands::ArchiveConvergenceMeasureServe(args) => { + run_async(archive_convergence_measure_serve(args))? + } + Commands::ArchiveConvergenceServe(args) => run_async(archive_convergence_serve(args))?, + Commands::SimControlPlane(args) => sim_control_plane_command(args)?, + Commands::SimSystem(args) => sim_system_command(args)?, + Commands::SimDuplicatePublishers(args) => sim_duplicate_publishers_command(args)?, + Commands::SignedRecordIndex(args) => signed_record_index_command(args)?, } Ok(()) } +fn signed_record_index_command(args: SignedRecordIndexArgs) -> Result<()> { + let input = read_signed_record_index_input(&args.input)?; + let records = parse_signed_record_index_input(&input)?; + let generated_unix_ms = args.generated_unix_ms.unwrap_or_else(now_unix_ms); + let index = signed_record_index(&records, generated_unix_ms); + let stdout = std::io::stdout(); + let mut out = stdout.lock(); + if args.pretty { + serde_json::to_writer_pretty(&mut out, &index) + .context("failed to write signed-record index")?; + } else { + serde_json::to_writer(&mut out, &index).context("failed to write signed-record index")?; + } + out.write_all(b"\n") + .context("failed to newline-terminate signed-record index")?; + Ok(()) +} + +fn read_signed_record_index_input(path: &Path) -> Result> { + if path == Path::new("-") { + let mut input = Vec::new(); + std::io::stdin() + .read_to_end(&mut input) + .context("failed to read signed-record input from stdin")?; + return Ok(input); + } + fs::read(path).with_context(|| format!("failed to read signed-record input {}", path.display())) +} + +fn parse_signed_record_index_input(input: &[u8]) -> Result> { + match serde_json::from_slice::(input) + .context("failed to parse signed-record input JSON")? 
+ { + SignedRecordIndexInput::Records(records) => Ok(records), + SignedRecordIndexInput::Wrapped { records } => Ok(records), + SignedRecordIndexInput::Single(record) => Ok(vec![record]), + } +} + +struct TelemetryGuard { + tracer_provider: Option, +} + +impl Drop for TelemetryGuard { + fn drop(&mut self) { + if let Some(provider) = self.tracer_provider.take() { + let _ = provider.shutdown(); + } + } +} + +fn init_tracing() -> Result { + // Keep stdout reserved for machine-readable output (endpoint addr, etc). + let filter = tracing_subscriber::EnvFilter::try_from_default_env() + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")); + let fmt_layer = tracing_subscriber::fmt::layer().with_writer(std::io::stderr); + + if let Some(endpoint) = otel_traces_endpoint() { + let service_name = + std::env::var("OTEL_SERVICE_NAME").unwrap_or_else(|_| "ec-node".to_string()); + let service_instance = std::env::var("EVERY_CHANNEL_NODE_NAME") + .or_else(|_| std::env::var("HOSTNAME")) + .unwrap_or_else(|_| "unknown".to_string()); + let exporter = SpanExporter::builder() + .with_http() + .with_protocol(Protocol::HttpBinary) + .with_endpoint(endpoint) + .build() + .context("failed to build OTLP span exporter")?; + let tracer_provider = SdkTracerProvider::builder() + .with_batch_exporter(exporter) + .with_resource( + Resource::builder() + .with_service_name(service_name.clone()) + .with_attribute(KeyValue::new("service.namespace", "every.channel")) + .with_attribute(KeyValue::new("service.instance.id", service_instance)) + .build(), + ) + .build(); + let tracer = tracer_provider.tracer("ec-node"); + let otel_layer = tracing_opentelemetry::layer().with_tracer(tracer); + tracing_subscriber::registry() + .with(filter) + .with(fmt_layer) + .with(otel_layer) + .init(); + Ok(TelemetryGuard { + tracer_provider: Some(tracer_provider), + }) + } else { + tracing_subscriber::registry() + .with(filter) + .with(fmt_layer) + .init(); + Ok(TelemetryGuard { + tracer_provider: None, + }) + 
/// Resolve the OTLP/HTTP traces endpoint from the process environment.
///
/// `EVERY_CHANNEL_OTEL_TRACES_ENDPOINT`, `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` and
/// `OTEL_EXPORTER_OTLP_ENDPOINT` are consulted in that order, and the first one holding a usable
/// value wins. A value is normalised by trimming surrounding whitespace and then trailing `/`
/// characters; `/v1/traces` is appended unless the value already ends with it.
///
/// Returns `None` when every variable is unset, not valid Unicode, or empty after
/// normalisation. `init_tracing` only installs the OTLP layer when this returns `Some`.
fn otel_traces_endpoint() -> Option<String> {
    for name in [
        "EVERY_CHANNEL_OTEL_TRACES_ENDPOINT",
        "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT",
        "OTEL_EXPORTER_OTLP_ENDPOINT",
    ] {
        // Unset and non-Unicode values are both treated as "not configured".
        let Ok(raw) = std::env::var(name) else {
            continue;
        };
        let trimmed = raw.trim().trim_end_matches('/');
        if trimmed.is_empty() {
            // A blank variable must not mask a lower-priority one that is actually set.
            continue;
        }
        if trimmed.ends_with("/v1/traces") {
            return Some(trimmed.to_string());
        }
        return Some(format!("{trimmed}/v1/traces"));
    }
    None
}
arg.to_string_lossy().into_owned()) + .collect::>(); + + assert_eq!( + ffmpeg_args, + vec![ + "-hide_banner", + "-loglevel", + "error", + "-nostats", + "-fflags", + "+nobuffer+bitexact", + "-flags", + "low_delay", + "-copyts", + "-use_wallclock_as_timestamps", + "1", + "-i", + "/tmp/source.ts", + "-map", + "0:v:0", + "-map", + "0:a:0?", + "-c:v", + "libx264", + "-vf", + WT_PUBLISH_VIDEO_FILTER, + "-preset", + WT_PUBLISH_VIDEO_PRESET, + "-tune", + "zerolatency", + "-crf", + "23", + "-pix_fmt", + "yuv420p", + "-profile:v", + "main", + "-g", + "30", + "-keyint_min", + "30", + "-force_key_frames", + "expr:if(isnan(prev_forced_t),gte(t,0),gte(t,prev_forced_t+1.001))", + "-sc_threshold", + "0", + "-bf", + "0", + "-x264-params", + "open-gop=0:scenecut=0:rc-lookahead=0:sync-lookahead=0:stitchable=1", + "-threads", + "1", + "-c:a", + "aac", + "-profile:a", + "aac_low", + "-b:a", + "160k", + "-ac", + "2", + "-ar", + "48000", + "-af", + ec_chopper::LIVE_AUDIO_RESAMPLE_FILTER, + "-max_muxing_queue_size", + "2048", + "-avoid_negative_ts", + "disabled", + "-f", + "mp4", + "-movflags", + WT_PUBLISH_MOVFLAGS, + "pipe:1", + ] + ); + } + + #[test] + fn wt_publish_stateless_proof_plan_uses_fresh_bitexact_encoder() { + let args = WtPublishArgs::parse_from([ + "wt-publish", + "--url", + "https://lax.relay.every.channel/anon", + "--name", + "la-test", + "--input", + "/tmp/source.ts", + ]); + + let ffmpeg_args = wt_publish_stateless_proof_ffmpeg_args( + &args, + Path::new("/tmp/source-window.ts"), + Path::new("/tmp/proof.mp4"), + ) + .into_iter() + .map(|arg| arg.to_string_lossy().into_owned()) + .collect::>(); + + assert!(ffmpeg_args + .windows(2) + .any(|window| window == ["-map_metadata", "-1"])); + assert!(ffmpeg_args + .windows(2) + .any(|window| window == ["-filter_threads", "1"])); + assert!(ffmpeg_args + .windows(2) + .any(|window| window == ["-filter_complex_threads", "1"])); + assert!(ffmpeg_args + .windows(2) + .any(|window| window == ["-force_key_frames", "expr:gte(t,0)"])); 
+ assert!(ffmpeg_args + .windows(2) + .any(|window| window == ["-avoid_negative_ts", "make_zero"])); + assert!(ffmpeg_args + .windows(2) + .any(|window| window == ["-flags:v", "+bitexact"])); + assert!(ffmpeg_args + .windows(2) + .any(|window| window == ["-flags:a", "+bitexact"])); + assert_eq!( + ffmpeg_args.last().map(String::as_str), + Some("/tmp/proof.mp4") + ); + assert!(!ffmpeg_args.iter().any(|arg| arg == "pipe:1")); + } + + fn test_command_available(name: &str) -> bool { + Command::new(name) + .arg("-version") + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .status() + .map(|status| status.success()) + .unwrap_or(false) + } + + fn write_short_deterministic_ts(out_path: &Path) -> Result<()> { + let status = Command::new("ffmpeg") + .arg("-hide_banner") + .arg("-loglevel") + .arg("error") + .arg("-nostdin") + .arg("-y") + .arg("-f") + .arg("lavfi") + .arg("-i") + .arg("testsrc2=size=320x180:rate=30000/1001") + .arg("-f") + .arg("lavfi") + .arg("-i") + .arg("sine=frequency=1000:sample_rate=48000") + .arg("-t") + .arg("1.25") + .arg("-map") + .arg("0:v:0") + .arg("-map") + .arg("1:a:0") + .arg("-c:v") + .arg("libx264") + .arg("-pix_fmt") + .arg("yuv420p") + .arg("-g") + .arg("30") + .arg("-keyint_min") + .arg("30") + .arg("-sc_threshold") + .arg("0") + .arg("-bf") + .arg("0") + .arg("-x264-params") + .arg("repeat-headers=1") + .arg("-threads") + .arg("1") + .arg("-fflags") + .arg("+bitexact") + .arg("-flags:v") + .arg("+bitexact") + .arg("-c:a") + .arg("aac") + .arg("-b:a") + .arg("128k") + .arg("-ac") + .arg("2") + .arg("-ar") + .arg("48000") + .arg("-flags:a") + .arg("+bitexact") + .arg("-f") + .arg("mpegts") + .arg(out_path) + .status() + .context("failed to run ffmpeg synthetic TS writer")?; + if !status.success() { + return Err(anyhow!( + "ffmpeg synthetic TS generation failed with {status}" + )); + } + Ok(()) + } + + fn encode_stateless_proof_segment( + args: &WtPublishArgs, + input_ts: &Path, + output_mp4: &Path, + ) -> Result 
{ + let status = Command::new("ffmpeg") + .args(wt_publish_stateless_proof_ffmpeg_args( + args, input_ts, output_mp4, + )) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::inherit()) + .status() + .context("failed to run stateless proof ffmpeg")?; + if !status.success() { + return Err(anyhow!("stateless proof ffmpeg exited with {status}")); + } + split_fmp4_init_and_media(&fs::read(output_mp4)?) + } + + #[test] + fn stateless_publisher_proof_segment_encodes_same_window_byte_identically() -> Result<()> { + if !test_command_available("ffmpeg") { + eprintln!("skipping stateless publisher proof determinism test: ffmpeg unavailable"); + return Ok(()); + } + + let root = std::env::temp_dir().join(format!( + "ec-stateless-publisher-proof-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + fs::create_dir_all(&root)?; + let input_ts = root.join("window.ts"); + write_short_deterministic_ts(&input_ts)?; + + let args = WtPublishArgs::parse_from([ + "wt-publish", + "--url", + "https://lax.relay.every.channel/anon", + "--name", + "la-test", + "--input", + input_ts.to_str().unwrap_or("/tmp/window.ts"), + "--publisher-archive-segment-duration-ms", + "1001", + ]); + + let first = encode_stateless_proof_segment(&args, &input_ts, &root.join("first.mp4"))?; + let second = encode_stateless_proof_segment(&args, &input_ts, &root.join("second.mp4"))?; + + assert_eq!(first.init, second.init); + assert_eq!(first.media.len(), second.media.len()); + assert!(!first.media.is_empty()); + for (left, right) in first.media.iter().zip(second.media.iter()) { + assert_eq!(blake3::hash(left), blake3::hash(right)); + assert_eq!(left, right); + } + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + fn comparable_proof_windows( + report: &PublisherProofWindowsReport, + ) -> Vec<(u64, String, String, Vec)> { + report + .windows + .iter() + .map(|window| { + ( + window.chunk_index, + 
window.source_ts_blake3.clone(), + window.init_blake3.clone(), + window.media_fragment_blake3.clone(), + ) + }) + .collect() + } + + #[test] + fn publisher_proof_windows_same_input_produces_identical_hashes() -> Result<()> { + if !test_command_available("ffmpeg") { + eprintln!("skipping publisher proof windows determinism test: ffmpeg unavailable"); + return Ok(()); + } + + let root = std::env::temp_dir().join(format!( + "ec-publisher-proof-windows-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + fs::create_dir_all(&root)?; + let input_ts = root.join("input.ts"); + write_short_deterministic_ts(&input_ts)?; + + let args_a = PublisherProofWindowsArgs::parse_from([ + "publisher-proof-windows", + "--input-ts", + input_ts.to_str().unwrap_or("/tmp/input.ts"), + "--output-dir", + root.join("publisher-a") + .to_str() + .unwrap_or("/tmp/publisher-a"), + "--chunk-ms", + "1001", + "--max-chunks", + "2", + ]); + let args_b = PublisherProofWindowsArgs::parse_from([ + "publisher-proof-windows", + "--input-ts", + input_ts.to_str().unwrap_or("/tmp/input.ts"), + "--output-dir", + root.join("publisher-b") + .to_str() + .unwrap_or("/tmp/publisher-b"), + "--chunk-ms", + "1001", + "--max-chunks", + "2", + ]); + + let report_a = publisher_proof_windows_report(&args_a)?; + let report_b = publisher_proof_windows_report(&args_b)?; + + assert_eq!(report_a.preroll_packets, WT_PUBLISH_PROOF_PREROLL_PACKETS); + assert_eq!(report_b.preroll_packets, WT_PUBLISH_PROOF_PREROLL_PACKETS); + assert_eq!(report_a.window_count, 2); + assert_eq!(report_b.window_count, 2); + assert_eq!( + comparable_proof_windows(&report_a), + comparable_proof_windows(&report_b) + ); + assert!(report_a + .windows + .iter() + .all(|window| window.media_fragment_count > 0)); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn publisher_proof_duplicates_same_input_proves_identical_windows() -> Result<()> { + if 
!test_command_available("ffmpeg") { + eprintln!("skipping publisher proof duplicate comparison test: ffmpeg unavailable"); + return Ok(()); + } + + let root = std::env::temp_dir().join(format!( + "ec-publisher-proof-duplicates-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + fs::create_dir_all(&root)?; + let input_ts = root.join("input.ts"); + write_short_deterministic_ts(&input_ts)?; + + let args = PublisherProofDuplicatesArgs::parse_from([ + "publisher-proof-duplicates", + "--input-ts", + input_ts.to_str().unwrap_or("/tmp/input.ts"), + "--output-dir", + root.join("duplicate-proof") + .to_str() + .unwrap_or("/tmp/duplicate-proof"), + "--chunk-ms", + "1001", + "--max-chunks", + "2", + ]); + + let report = publisher_proof_duplicates_report(&args)?; + + assert!(report.ok, "{:?}", report.reasons); + assert_eq!(report.publisher_count, 2); + assert_eq!(report.compared_window_count, 2); + assert_eq!(report.matching_window_count, 2); + assert_eq!(report.divergent_window_count, 0); + assert!(report.divergences.is_empty()); + assert_eq!( + report + .publishers + .iter() + .map(|publisher| publisher.publisher.as_str()) + .collect::>(), + vec!["publisher-a", "publisher-b"] + ); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn publisher_proof_compare_reports_from_same_input_passes() -> Result<()> { + if !test_command_available("ffmpeg") { + eprintln!("skipping publisher proof report comparison test: ffmpeg unavailable"); + return Ok(()); + } + + let root = std::env::temp_dir().join(format!( + "ec-publisher-proof-report-compare-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + fs::create_dir_all(&root)?; + let input_ts = root.join("input.ts"); + write_short_deterministic_ts(&input_ts)?; + + let report_a = publisher_proof_windows_report(&PublisherProofWindowsArgs::parse_from([ + 
"publisher-proof-windows", + "--input-ts", + input_ts.to_str().unwrap_or("/tmp/input.ts"), + "--output-dir", + root.join("nuc-a").to_str().unwrap_or("/tmp/nuc-a"), + "--chunk-ms", + "1001", + "--max-chunks", + "2", + ]))?; + let report_b = publisher_proof_windows_report(&PublisherProofWindowsArgs::parse_from([ + "publisher-proof-windows", + "--input-ts", + input_ts.to_str().unwrap_or("/tmp/input.ts"), + "--output-dir", + root.join("nuc-b").to_str().unwrap_or("/tmp/nuc-b"), + "--chunk-ms", + "1001", + "--max-chunks", + "2", + ]))?; + let report_a_path = root.join("nuc-a.json"); + let report_b_path = root.join("nuc-b.json"); + fs::write(&report_a_path, serde_json::to_vec_pretty(&report_a)?)?; + fs::write(&report_b_path, serde_json::to_vec_pretty(&report_b)?)?; + + let args = PublisherProofCompareArgs::parse_from(vec![ + "publisher-proof-compare".to_string(), + "--report".to_string(), + format!("nuc-a={}", report_a_path.display()), + "--report".to_string(), + format!("nuc-b={}", report_b_path.display()), + "--require-ok".to_string(), + ]); + let compare = publisher_proof_compare_report(&args)?; + + assert!(compare.ok, "{:?}", compare.reasons); + assert_eq!(compare.publisher_count, 2); + assert_eq!(compare.compared_window_count, 2); + assert_eq!(compare.matching_window_count, 2); + assert_eq!(compare.divergent_window_count, 0); + assert!(compare.divergences.is_empty()); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn publisher_proof_compare_reports_detects_media_hash_mismatch() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-publisher-proof-report-mismatch-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + fs::create_dir_all(&root)?; + let base_report = PublisherProofWindowsReport { + input_ts: root.join("input.ts"), + output_dir: root.join("proof-a"), + chunk_ms: 1001, + preroll_packets: WT_PUBLISH_PROOF_PREROLL_PACKETS, + window_count: 1, + 
windows: vec![PublisherProofWindowReport { + chunk_index: 0, + chunk_start_27mhz: Some(0), + chunk_duration_27mhz: 27_027_000, + sync_status: "aligned".to_string(), + source_ts: root.join("source.ts"), + source_ts_blake3: "source".to_string(), + output_mp4: root.join("proof.mp4"), + size_bytes: 10, + init_size_bytes: 4, + init_blake3: "init".to_string(), + media_fragment_count: 1, + media_fragment_blake3: vec!["media-a".to_string()], + }], + }; + let mut divergent_report = PublisherProofWindowsReport { + output_dir: root.join("proof-b"), + ..base_report.clone() + }; + divergent_report.windows[0].media_fragment_blake3 = vec!["media-b".to_string()]; + + let report_a_path = root.join("nuc-a.json"); + let report_b_path = root.join("nuc-b.json"); + fs::write(&report_a_path, serde_json::to_vec_pretty(&base_report)?)?; + fs::write( + &report_b_path, + serde_json::to_vec_pretty(&divergent_report)?, + )?; + + let args = PublisherProofCompareArgs::parse_from(vec![ + "publisher-proof-compare".to_string(), + "--report".to_string(), + format!("nuc-a={}", report_a_path.display()), + "--report".to_string(), + format!("nuc-b={}", report_b_path.display()), + ]); + let compare = publisher_proof_compare_report(&args)?; + + assert!(!compare.ok); + assert_eq!(compare.compared_window_count, 1); + assert_eq!(compare.matching_window_count, 0); + assert_eq!(compare.divergent_window_count, 1); + assert!(compare.reasons.contains(&"divergent_windows".to_string())); + assert!(compare.divergences[0] + .reasons + .contains(&"media_fragment_hash_mismatch".to_string())); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn publisher_proof_duplicates_rejects_duplicate_publisher_identity() { + let args = PublisherProofDuplicatesArgs::parse_from([ + "publisher-proof-duplicates", + "--input-ts", + "/tmp/input.ts", + "--output-dir", + "/tmp/proof", + "--publisher", + "nuc-a", + "--publisher", + "nuc-a", + ]); + + let err = publisher_proof_duplicates_report(&args).expect_err("duplicate 
should fail"); + + assert!( + err.to_string().contains("duplicated"), + "unexpected error: {err:#}" + ); + } + + #[test] + fn publisher_proof_remote_compare_parses_distinct_safe_targets() -> Result<()> { + let targets = parse_publisher_proof_remote_targets(&[ + "nuc-a=100.64.0.44".to_string(), + "nuc-b=node@example.internal".to_string(), + ])?; + + assert_eq!( + targets, + vec![ + PublisherProofRemoteTarget { + publisher: "nuc-a".to_string(), + host: "100.64.0.44".to_string(), + }, + PublisherProofRemoteTarget { + publisher: "nuc-b".to_string(), + host: "node@example.internal".to_string(), + }, + ] + ); + Ok(()) + } + + #[test] + fn publisher_proof_remote_compare_rejects_duplicate_target_labels() { + let err = parse_publisher_proof_remote_targets(&[ + "nuc-a=100.64.0.44".to_string(), + "nuc-a=100.64.0.45".to_string(), + ]) + .expect_err("duplicate remote label should fail"); + + assert!( + err.to_string().contains("duplicated"), + "unexpected error: {err:#}" + ); + } + + #[test] + fn publisher_proof_remote_compare_rejects_unsafe_remote_shell_args() { + let args = PublisherProofRemoteCompareArgs::parse_from([ + "publisher-proof-remote-compare", + "--input-ts", + "/tmp/input.ts", + "--output-dir", + "/tmp/proof", + "--remote", + "nuc-a=100.64.0.44", + "--remote", + "nuc-b=100.64.0.45", + "--remote-root", + "/tmp/every-channel-proof", + "--video-filter", + "fps=30;touch/tmp/bad", + ]); + + let err = publisher_proof_remote_compare_report(&args) + .expect_err("unsafe video filter should fail before ssh"); + + assert!( + err.to_string().contains("unsupported shell characters"), + "unexpected error: {err:#}" + ); + } + + #[test] + fn publisher_proof_remote_compare_command_uses_shared_input_window() { + let args = PublisherProofRemoteCompareArgs::parse_from([ + "publisher-proof-remote-compare", + "--input-ts", + "/tmp/input.ts", + "--output-dir", + "/tmp/proof", + "--remote", + "nuc-a=100.64.0.44", + "--remote", + "nuc-b=100.64.0.45", + "--max-chunks", + "3", + ]); + + let 
command = publisher_proof_remote_command_args( + &args, + "ec-node", + "/tmp/every-channel-proof/nuc-a/source.ts", + "/tmp/every-channel-proof/nuc-a/proof", + ); + + assert_eq!(command[0], "ec-node"); + assert!(command + .windows(2) + .any(|pair| pair[0] == "--input-ts" + && pair[1] == "/tmp/every-channel-proof/nuc-a/source.ts")); + assert!(command + .windows(2) + .any(|pair| pair[0] == "--output-dir" + && pair[1] == "/tmp/every-channel-proof/nuc-a/proof")); + assert!(command + .windows(2) + .any(|pair| pair[0] == "--max-chunks" && pair[1] == "3")); + assert!(command.windows(2).any(|pair| pair[0] == "--preroll-packets" + && pair[1] == WT_PUBLISH_PROOF_PREROLL_PACKETS.to_string())); + } + + #[test] + fn wt_publish_defaults_restart_when_relay_stops_announcing() { + let args = WtPublishArgs::parse_from([ + "wt-publish", + "--url", + "https://lax.relay.every.channel/anon", + "--name", + "la-test", + "--input", + "/tmp/source.mp4", + ]); + + assert_eq!(90_000, args.relay_announced_watchdog_ms); + assert_eq!(10_000, args.relay_announced_watchdog_interval_ms); + } + + fn signed_record_fixture(record_type: &str, created_unix_ms: u64) -> SignedRecord { + let body = ec_core::SignedRecordBody { + record_type: record_type.to_string(), + subject: "channel:la-kcop".to_string(), + time_ranges: vec![ec_core::RecordTimeRange { + stream_id: Some(StreamId("la-kcop".to_string())), + rendition_id: Some("720p".to_string()), + start_unix_ms: created_unix_ms, + end_unix_ms: created_unix_ms + 10_000, + }], + content_hashes: vec![ec_core::RecordContentHash { + alg: ec_core::RECORD_CONTENT_HASH_BLAKE3.to_string(), + digest: blake3::hash(b"record-payload").to_hex().to_string(), + uri: Some("garage://every-channel-records/la-kcop/record.json".to_string()), + }], + source: "did:key:z6MkOperator".to_string(), + visibility: ec_core::RECORD_VISIBILITY_PUBLIC.to_string(), + created_unix_ms, + valid_from_unix_ms: None, + valid_until_unix_ms: None, + supersedes: Vec::new(), + revokes: Vec::new(), + 
metadata: vec![StreamMetadata { + key: "text".to_string(), + value: "operator note".to_string(), + }], + }; + let record_id = body.record_id().unwrap(); + SignedRecord { + body, + record_id, + signatures: vec![ec_core::SignedRecordSignature { + signer_id: "did:key:z6MkOperator".to_string(), + alg: ec_core::RECORD_SIG_ALG_SECP256K1_EIP712_BODY_V1.to_string(), + signature: format!("0x{}", "11".repeat(65)), + }], + commitments: Vec::new(), + } + } + + #[test] + fn signed_record_index_args_default_to_stdin_and_compact_json() { + let args = SignedRecordIndexArgs::parse_from(["signed-record-index"]); + + assert_eq!(args.input, PathBuf::from("-")); + assert_eq!(args.generated_unix_ms, None); + assert!(!args.pretty); + + let explicit = SignedRecordIndexArgs::parse_from([ + "signed-record-index", + "--input", + "/tmp/records.json", + "--generated-unix-ms", + "42", + "--pretty", + ]); + assert_eq!(explicit.input, PathBuf::from("/tmp/records.json")); + assert_eq!(explicit.generated_unix_ms, Some(42)); + assert!(explicit.pretty); + } + + #[test] + fn signed_record_index_input_accepts_array_wrapped_and_single_record() { + let record = signed_record_fixture(ec_core::RECORD_TYPE_FRIEND_COMMENT, 1_771_000_000_000); + + let array = serde_json::to_vec(&vec![record.clone()]).unwrap(); + let wrapped = + serde_json::to_vec(&serde_json::json!({ "records": [record.clone()] })).unwrap(); + let single = serde_json::to_vec(&record).unwrap(); + + assert_eq!( + parse_signed_record_index_input(&array).unwrap()[0].record_id, + record.record_id + ); + assert_eq!( + parse_signed_record_index_input(&wrapped).unwrap()[0].record_id, + record.record_id + ); + assert_eq!( + parse_signed_record_index_input(&single).unwrap()[0].record_id, + record.record_id + ); + } + + #[test] + fn relay_announced_watchdog_uses_relay_announced_endpoint() { + let url = relay_announced_url("https://lax.relay.every.channel/anon?x=1").unwrap(); + + assert_eq!("https://lax.relay.every.channel/announced", url.as_str()); + } 
+ + #[test] + fn relay_announced_watchdog_matches_prefixed_and_plain_names() { + assert!(announced_body_contains_broadcast( + "anon/la-kcbs\nanon/la-nbc4\n", + "la-nbc4" + )); + assert!(announced_body_contains_broadcast("la-kcet", "la-kcet")); + assert!(!announced_body_contains_broadcast( + "anon/la-kcet", + "la-kcbs" + )); + } + + #[test] + fn wt_publish_ntsc_rs_plan_builds_cli_invocation() { + let args = WtPublishArgs::parse_from([ + "wt-publish", + "--url", + "https://relay.every.channel/anon", + "--name", + "la-test", + "--input", + "/tmp/source.mp4", + "--ntsc-rs-preset", + "/etc/every-channel/ntsc/vhs.json", + "--ntsc-rs-output", + "/tmp/every-channel-ntsc-rs/la-test.mp4", + "--ntsc-rs-cli", + "/opt/ntsc-rs-cli", + ]); + + let plan = ntsc_rs_preprocess_plan(&args).unwrap().unwrap(); + + assert_eq!(PathBuf::from("/opt/ntsc-rs-cli"), plan.cli); + assert_eq!( + PathBuf::from("/tmp/every-channel-ntsc-rs/la-test.mp4"), + plan.output + ); + assert_eq!( + vec![ + OsString::from("-i"), + OsString::from("/tmp/source.mp4"), + OsString::from("-o"), + OsString::from("/tmp/every-channel-ntsc-rs/la-test.mp4"), + OsString::from("-p"), + OsString::from("/etc/every-channel/ntsc/vhs.json"), + OsString::from("--overwrite"), + ], + plan.args + ); + } + + #[test] + fn wt_publish_ntsc_rs_requires_explicit_output() { + let args = WtPublishArgs::parse_from([ + "wt-publish", + "--url", + "https://relay.every.channel/anon", + "--name", + "la-test", + "--input", + "/tmp/source.mp4", + "--ntsc-rs-preset", + "/etc/every-channel/ntsc/vhs.json", + ]); + + let err = ntsc_rs_preprocess_plan(&args).unwrap_err(); + + assert!(err.to_string().contains("--ntsc-rs-output")); + } + #[test] fn parse_manifest_allowlist_splits_and_trims() { let set = parse_manifest_allowlist(Some(" a,b ; c\t d ")).unwrap(); @@ -1966,7 +3862,7 @@ mod tests { vec![StreamTransportDescriptor::IrohDirect { endpoint: "ed25519:node".to_string(), broadcast_name: "kcbs".to_string(), - track_name: "video0.m4s".to_string(), + 
track_name: WT_PUBLISH_PRIMARY_VIDEO_TRACK.to_string(), }], 5_000, ); @@ -1990,6 +3886,97 @@ mod tests { assert!(manifest_hash_for_chunk(&manifest, sid, 12).is_none()); } + #[test] + fn duplicate_publisher_manifests_share_media_roots_not_envelopes() { + let variants = cmaf_ladder_variants(CmafLadderPreset::Hd3); + let per_variant_hash = vec![ + ( + "1080p".to_string(), + blake3::hash(b"same-1080p-segment").to_hex().to_string(), + ), + ( + "720p".to_string(), + blake3::hash(b"same-720p-segment").to_hex().to_string(), + ), + ( + "480p".to_string(), + blake3::hash(b"same-480p-segment").to_hex().to_string(), + ), + ]; + let metadata = vec![StreamMetadata { + key: "source_kind".to_string(), + value: "hdhomerun".to_string(), + }]; + + let publisher_a = build_multi_variant_manifest( + StreamId("publisher-a/la-kcop".to_string()), + "epoch-a".to_string(), + 2_000, + 42, + "x264-hd3".to_string(), + 1_000, + metadata.clone(), + &variants, + 42, + per_variant_hash.clone(), + ) + .unwrap(); + let publisher_b = build_multi_variant_manifest( + StreamId("publisher-b/la-kcop".to_string()), + "epoch-b".to_string(), + 2_000, + 42, + "x264-hd3".to_string(), + 2_000, + metadata, + &variants, + 42, + per_variant_hash, + ) + .unwrap(); + + assert!(validate_manifest(&publisher_a, None)); + assert!(validate_manifest(&publisher_b, None)); + assert_ne!(publisher_a.manifest_id, publisher_b.manifest_id); + assert_ne!(publisher_a.body.stream_id.0, publisher_b.body.stream_id.0); + assert_ne!(publisher_a.body.epoch_id, publisher_b.body.epoch_id); + assert_ne!( + publisher_a.body.created_unix_ms, + publisher_b.body.created_unix_ms + ); + + assert_eq!(publisher_a.body.chunk_hashes.len(), 0); + assert_eq!(publisher_b.body.chunk_hashes.len(), 0); + assert_eq!(publisher_a.body.merkle_root, publisher_b.body.merkle_root); + + let variants_a = publisher_a.body.variants.as_ref().unwrap(); + let variants_b = publisher_b.body.variants.as_ref().unwrap(); + assert_eq!(variants_a.len(), variants_b.len()); + 
for (a, b) in variants_a.iter().zip(variants_b.iter()) { + assert_eq!(a.variant_id, b.variant_id); + assert_ne!(a.stream_id.0, b.stream_id.0); + assert_eq!(a.chunk_start_index, b.chunk_start_index); + assert_eq!(a.total_chunks, b.total_chunks); + assert_eq!(a.chunk_hashes, b.chunk_hashes); + assert_eq!(a.merkle_root, b.merkle_root); + assert_eq!(a.metadata.len(), b.metadata.len()); + for (left, right) in a.metadata.iter().zip(b.metadata.iter()) { + assert_eq!(left.key, right.key); + assert_eq!(left.value, right.value); + } + assert_eq!( + manifest_hash_for_chunk(&publisher_a, &a.stream_id.0, a.chunk_start_index) + .as_deref(), + Some(a.chunk_hashes[0].as_str()) + ); + assert_eq!( + manifest_hash_for_chunk(&publisher_b, &b.stream_id.0, b.chunk_start_index) + .as_deref(), + Some(b.chunk_hashes[0].as_str()) + ); + } + } + #[test] fn decode_archive_group_bytes_unwraps_concatenated_object_frames() { let meta_a = ObjectMeta { @@ -2031,6 +4018,3226 @@ mod tests { assert_eq!(decoded, raw); } + #[test] + fn archive_origin_object_urls_support_archive_and_cas_roots() { + let digest = "a".repeat(64); + let state = ArchiveReplayState { + cas_root: PathBuf::from("/cache/objects/blake3"), + manifest_root: PathBuf::from("/cache/manifests"), + archive_origin_url: Some(Url::parse("https://archive.example/root").unwrap()), + archive_cas_origin_url: Some( + Url::parse("https://garage.example/bucket/objects/blake3").unwrap(), + ), + archive_origin_max_bytes: 1024, + archive_retention_seconds: WT_ARCHIVE_DEFAULT_RETENTION_SECONDS, + cache_access_root: PathBuf::from("/cache/cache-access/blake3"), + http_client: reqwest::Client::new(), + }; + + let urls = archive_origin_object_urls(&state, &digest) + .unwrap() + .into_iter() + .map(|url| url.to_string()) + .collect::>(); + + assert_eq!( + urls, + vec![ + format!("https://garage.example/bucket/objects/blake3/aa/{digest}.bin"), + format!("https://archive.example/root/objects/blake3/aa/{digest}.bin"), + ] + ); + } + + #[test] + fn 
archive_replay_reads_legacy_video_audio_track_names() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-archive-legacy-track-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + let manifest_root = root.join("manifests"); + let broadcast_dir = manifest_root.join("legacy"); + fs::create_dir_all(&broadcast_dir)?; + + let video_hash = "b".repeat(64); + let audio_hash = "c".repeat(64); + let video_record = ArchiveIndexRecord { + received_unix_ms: 1_000, + relay_url: "https://relay.every.channel/anon".to_string(), + source_node: Some("nuc-a".to_string()), + source_session: Some("nuc-a-test".to_string()), + media_timing: None, + broadcast_name: "legacy".to_string(), + track_name: "video0.m4s".to_string(), + group_sequence: 7, + frame_count: 1, + size_bytes: 4, + blake3: video_hash.clone(), + cas_path: format!("objects/blake3/bb/{video_hash}.bin"), + }; + let audio_record = ArchiveIndexRecord { + received_unix_ms: 1_010, + relay_url: "https://relay.every.channel/anon".to_string(), + source_node: Some("nuc-a".to_string()), + source_session: Some("nuc-a-test".to_string()), + media_timing: None, + broadcast_name: "legacy".to_string(), + track_name: "audio0.m4s".to_string(), + group_sequence: 8, + frame_count: 1, + size_bytes: 4, + blake3: audio_hash.clone(), + cas_path: format!("objects/blake3/cc/{audio_hash}.bin"), + }; + fs::write( + broadcast_dir.join("video0.m4s.jsonl"), + format!("{}\n", serde_json::to_string(&video_record)?), + )?; + fs::write( + broadcast_dir.join("audio0.m4s.jsonl"), + format!("{}\n", serde_json::to_string(&audio_record)?), + )?; + + let video = parse_archive_track( + &manifest_root, + "legacy", + WT_PUBLISH_PRIMARY_VIDEO_TRACK, + None, + None, + )?; + let audio = parse_archive_track( + &manifest_root, + "legacy", + WT_PUBLISH_PRIMARY_AUDIO_TRACK, + None, + None, + )?; + + assert_eq!(video.len(), 1); + assert_eq!(video[0].track_name, "video0.m4s"); + 
assert_eq!(audio.len(), 1); + assert_eq!(audio[0].track_name, "audio0.m4s"); + assert_eq!( + latest_init_hash(&manifest_root, "legacy", WT_PUBLISH_PRIMARY_VIDEO_TRACK)?, + None + ); + assert_eq!( + latest_init_hash(&manifest_root, "legacy", WT_PUBLISH_PRIMARY_AUDIO_TRACK)?, + None + ); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn archive_convergence_compares_duplicate_publisher_manifest_roots() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-archive-convergence-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + let publisher_a = root.join("publisher-a").join("manifests"); + let publisher_b = root.join("publisher-b").join("manifests"); + for manifest_root in [&publisher_a, &publisher_b] { + fs::create_dir_all(manifest_root.join("la-kcop"))?; + } + + for manifest_root in [&publisher_a, &publisher_b] { + let mut lines = String::new(); + for sequence in 0..4u64 { + let hash = blake3::hash(format!("la-kcop-720p-{sequence}").as_bytes()) + .to_hex() + .to_string(); + let record = ArchiveIndexRecord { + received_unix_ms: 1_000 + sequence * 1_000, + relay_url: "https://lax.relay.every.channel/anon".to_string(), + source_node: Some( + manifest_root + .parent() + .and_then(|path| path.file_name()) + .and_then(|name| name.to_str()) + .unwrap_or("unknown") + .to_string(), + ), + source_session: Some(format!("test-session-{sequence}")), + media_timing: Some(ArchiveRecordMediaTiming { + track_id: 7, + timescale: 90_000, + decode_time: sequence * 90_090, + sequence, + }), + broadcast_name: "la-kcop".to_string(), + track_name: WT_PUBLISH_PRIMARY_VIDEO_TRACK.to_string(), + group_sequence: sequence, + frame_count: 30, + size_bytes: 1024, + blake3: hash.clone(), + cas_path: format!("objects/blake3/{}/{hash}.bin", &hash[0..2]), + }; + lines.push_str(&serde_json::to_string(&record)?); + lines.push('\n'); + } + 
fs::write(manifest_root.join("la-kcop").join("0.m4s.jsonl"), lines)?; + } + + let report = archive_convergence_report(&ArchiveConvergenceArgs { + source: vec![ + format!("nuc-a={}", publisher_a.display()), + format!("nuc-b={}", publisher_b.display()), + ], + broadcast: "la-kcop".to_string(), + track: WT_PUBLISH_PRIMARY_VIDEO_TRACK.to_string(), + stream_id: None, + rendition: Some("720p".to_string()), + start_sequence: None, + end_sequence: None, + pretty: false, + prometheus: false, + metrics_node: None, + metrics_role: "duplicate-proof".to_string(), + require_ok: false, + })?; + + assert!(report.ok); + assert!(report.duplicate_complete); + assert!(report.reasons.is_empty()); + assert_eq!(report.source_count, 2); + assert_eq!(report.record_source_count, 2); + assert!(report.record_source_identity_ok); + assert_eq!( + report.record_source_nodes, + vec!["publisher-a".to_string(), "publisher-b".to_string()] + ); + assert_eq!(report.summary.expected_sequences, 4); + assert_eq!(report.summary.missing_sequences, Vec::::new()); + assert_eq!(report.summary.divergent_sequences, Vec::::new()); + assert_eq!( + report.summary.matching_duplicate_sequences, + vec![0, 1, 2, 3] + ); + assert_eq!(report.summary.duplicate_source_records, 8); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn archive_convergence_rejects_matching_hashes_with_conflicting_media_timing() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-archive-convergence-media-timing-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + let publisher_a = root.join("publisher-a").join("manifests"); + let publisher_b = root.join("publisher-b").join("manifests"); + fs::create_dir_all(publisher_a.join("la-kcop"))?; + fs::create_dir_all(publisher_b.join("la-kcop"))?; + + let hash = blake3::hash(b"same-bytes").to_hex().to_string(); + for (manifest_root, source_node, media_sequence, decode_time) in [ + 
(&publisher_a, "publisher-a", 200_u64, 180_000_u64), + (&publisher_b, "publisher-b", 201_u64, 270_000_u64), + ] { + let record = ArchiveIndexRecord { + received_unix_ms: 1_000, + relay_url: "https://lax.relay.every.channel/anon".to_string(), + source_node: Some(source_node.to_string()), + source_session: Some(format!("{source_node}-test")), + media_timing: Some(ArchiveRecordMediaTiming { + track_id: 7, + timescale: 90_000, + decode_time, + sequence: media_sequence, + }), + broadcast_name: "la-kcop".to_string(), + track_name: WT_PUBLISHER_ORIGIN_TRACK.to_string(), + group_sequence: 100, + frame_count: 1, + size_bytes: 1024, + blake3: hash.clone(), + cas_path: format!("objects/blake3/{}/{hash}.bin", &hash[0..2]), + }; + append_archive_index_record( + &manifest_root + .join("la-kcop") + .join(format!("{WT_PUBLISHER_ORIGIN_TRACK}.jsonl")), + &record, + )?; + } + + let report = archive_convergence_report(&ArchiveConvergenceArgs { + source: vec![ + format!("nuc-a={}", publisher_a.display()), + format!("nuc-b={}", publisher_b.display()), + ], + broadcast: "la-kcop".to_string(), + track: WT_PUBLISHER_ORIGIN_TRACK.to_string(), + stream_id: None, + rendition: Some("primary".to_string()), + start_sequence: None, + end_sequence: None, + pretty: false, + prometheus: false, + metrics_node: None, + metrics_role: "duplicate-proof".to_string(), + require_ok: false, + })?; + + assert!(!report.ok); + assert!(report.duplicate_complete); + assert_eq!(report.summary.divergent_sequences, Vec::::new()); + assert_eq!(report.summary.matching_duplicate_sequences, vec![100]); + assert_eq!(report.media_timing_conflict_count, 1); + assert_eq!(report.media_timing_conflicts.len(), 1); + assert_eq!(report.media_timing_conflicts[0].group_sequence, 100); + assert_eq!( + report.media_timing_conflicts[0].media_sequences, + vec![200, 201] + ); + assert_eq!( + report.media_timing_conflicts[0].media_decode_times, + vec![180_000, 270_000] + ); + assert!(report + .reasons + 
.contains(&"media_sequence_conflict".to_string())); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn archive_convergence_reports_divergent_sequence_samples() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-archive-convergence-divergence-sample-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + let publisher_a = root.join("publisher-a").join("manifests"); + let publisher_b = root.join("publisher-b").join("manifests"); + fs::create_dir_all(publisher_a.join("la-kcop"))?; + fs::create_dir_all(publisher_b.join("la-kcop"))?; + + let hash_a = blake3::hash(b"publisher-a-bytes").to_hex().to_string(); + let hash_b = blake3::hash(b"publisher-b-bytes").to_hex().to_string(); + for (manifest_root, source_node, source_session, received_unix_ms, size_bytes, hash) in [ + ( + &publisher_a, + "publisher-a", + "publisher-a-session", + 1_000_u64, + 111_usize, + hash_a.as_str(), + ), + ( + &publisher_b, + "publisher-b", + "publisher-b-session", + 1_030_u64, + 222_usize, + hash_b.as_str(), + ), + ] { + let record = ArchiveIndexRecord { + received_unix_ms, + relay_url: "https://lax.relay.every.channel/anon".to_string(), + source_node: Some(source_node.to_string()), + source_session: Some(source_session.to_string()), + media_timing: None, + broadcast_name: "la-kcop".to_string(), + track_name: WT_PUBLISHER_ORIGIN_TRACK.to_string(), + group_sequence: 42, + frame_count: 1, + size_bytes, + blake3: hash.to_string(), + cas_path: format!("objects/blake3/{}/{hash}.bin", &hash[0..2]), + }; + append_archive_index_record( + &manifest_root + .join("la-kcop") + .join(format!("{WT_PUBLISHER_ORIGIN_TRACK}.jsonl")), + &record, + )?; + } + + let report = archive_convergence_report(&ArchiveConvergenceArgs { + source: vec![ + format!("nuc-a={}", publisher_a.display()), + format!("nuc-b={}", publisher_b.display()), + ], + broadcast: "la-kcop".to_string(), + track: 
WT_PUBLISHER_ORIGIN_TRACK.to_string(), + stream_id: None, + rendition: Some("primary".to_string()), + start_sequence: None, + end_sequence: None, + pretty: false, + prometheus: false, + metrics_node: None, + metrics_role: "duplicate-proof".to_string(), + require_ok: false, + })?; + + assert!(!report.ok); + assert_eq!(report.media_timing_missing_records, 2); + assert_eq!(report.summary.divergent_sequences, vec![42]); + assert_eq!(report.divergent_sequence_samples.len(), 1); + let sample = &report.divergent_sequence_samples[0]; + assert_eq!(sample.group_sequence, 42); + assert_eq!(sample.record_count, 2); + assert_eq!(sample.hash_count, 2); + assert_eq!( + sample.sources, + vec!["publisher-a".to_string(), "publisher-b".to_string()] + ); + assert_eq!(sample.records.len(), 2); + assert_eq!(sample.records[0].manifest_source, "nuc-a"); + assert_eq!( + sample.records[0].source_node.as_deref(), + Some("publisher-a") + ); + assert_eq!( + sample.records[0].source_session.as_deref(), + Some("publisher-a-session") + ); + assert_eq!(sample.records[0].size_bytes, 111); + assert_eq!(sample.records[0].received_unix_ms, 1_000); + assert_eq!(sample.records[0].blake3, hash_a); + assert_eq!(sample.records[1].manifest_source, "nuc-b"); + assert_eq!( + sample.records[1].source_node.as_deref(), + Some("publisher-b") + ); + assert_eq!( + sample.records[1].source_session.as_deref(), + Some("publisher-b-session") + ); + assert_eq!(sample.records[1].size_bytes, 222); + assert_eq!(sample.records[1].received_unix_ms, 1_030); + assert_eq!(sample.records[1].blake3, hash_b); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn archive_convergence_prometheus_metrics_include_duplicate_and_miss_gauges() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-archive-convergence-prometheus-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + let publisher_a = 
root.join("publisher-a").join("manifests"); + let publisher_b = root.join("publisher-b").join("manifests"); + fs::create_dir_all(publisher_a.join("la-kcop"))?; + fs::create_dir_all(publisher_b.join("la-kcop"))?; + + let hash = blake3::hash(b"same-window").to_hex().to_string(); + for (manifest_root, source_node, sequence, digest) in [ + (&publisher_a, "publisher-a", 0_u64, hash.as_str()), + (&publisher_b, "publisher-b", 0_u64, hash.as_str()), + (&publisher_a, "publisher-a", 1_u64, ""), + ] { + let record = ArchiveIndexRecord { + received_unix_ms: 1_000 + sequence * 1_000, + relay_url: "https://lax.relay.every.channel/anon".to_string(), + source_node: Some(source_node.to_string()), + source_session: Some(format!("{source_node}-test")), + media_timing: Some(ArchiveRecordMediaTiming { + track_id: 7, + timescale: 90_000, + decode_time: 0, + sequence: 0, + }), + broadcast_name: "la-kcop".to_string(), + track_name: WT_PUBLISHER_ORIGIN_TRACK.to_string(), + group_sequence: sequence, + frame_count: 1, + size_bytes: 1024, + blake3: digest.to_string(), + cas_path: if digest.is_empty() { + String::new() + } else { + format!("objects/blake3/{}/{digest}.bin", &digest[0..2]) + }, + }; + append_archive_index_record( + &manifest_root + .join("la-kcop") + .join(format!("{WT_PUBLISHER_ORIGIN_TRACK}.jsonl")), + &record, + )?; + } + + let args = ArchiveConvergenceArgs { + source: vec![ + format!("nuc-a={}", publisher_a.display()), + format!("nuc-b={}", publisher_b.display()), + ], + broadcast: "la-kcop".to_string(), + track: WT_PUBLISHER_ORIGIN_TRACK.to_string(), + stream_id: None, + rendition: Some("primary".to_string()), + start_sequence: None, + end_sequence: None, + pretty: false, + prometheus: true, + metrics_node: Some("nuc\"a\nb".to_string()), + metrics_role: "publisher-buffer".to_string(), + require_ok: false, + }; + let report = archive_convergence_report(&args)?; + let text = archive_convergence_prometheus_metrics(&report, &args); + + assert!(!report.ok); + 
assert_eq!(report.missing_hash_records, 1); + assert!(report.reasons.contains(&"hash_missing".to_string())); + assert!(text.contains("node=\"nuc\\\"a\\nb\"")); + assert!(text.contains("role=\"publisher-buffer\"")); + assert!(text.contains("every_channel_archive_convergence_scrape_ok")); + assert!(text.contains("every_channel_archive_convergence_ok")); + assert!(text.contains("every_channel_archive_duplicate_hash_source_records")); + assert!(text.contains("every_channel_archive_duplicate_hash_sequences")); + assert!(text.contains("every_channel_archive_missing_hash_records")); + assert!(text.contains("every_channel_archive_missing_source_identity_records")); + assert!(text.contains("every_channel_archive_media_timing_missing_records")); + assert!(text.contains("every_channel_archive_media_timing_conflict_sequences")); + assert!(text.contains("every_channel_archive_duplicate_hash_sequences{")); + assert!(text.contains(" 1.000000\n")); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn archive_convergence_serve_metrics_response_uses_rust_report() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-archive-convergence-serve-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + let publisher_a = root.join("publisher-a").join("manifests"); + let publisher_b = root.join("publisher-b").join("manifests"); + fs::create_dir_all(publisher_a.join("la-kcop"))?; + fs::create_dir_all(publisher_b.join("la-kcop"))?; + + let hash = blake3::hash(b"same-window").to_hex().to_string(); + for (manifest_root, source_node) in + [(&publisher_a, "publisher-a"), (&publisher_b, "publisher-b")] + { + let record = ArchiveIndexRecord { + received_unix_ms: 1_000, + relay_url: "https://lax.relay.every.channel/anon".to_string(), + source_node: Some(source_node.to_string()), + source_session: Some(format!("{source_node}-test")), + media_timing: None, + broadcast_name: "la-kcop".to_string(), + 
track_name: WT_PUBLISHER_ORIGIN_TRACK.to_string(), + group_sequence: 0, + frame_count: 1, + size_bytes: 1024, + blake3: hash.clone(), + cas_path: format!("objects/blake3/{}/{hash}.bin", &hash[0..2]), + }; + append_archive_index_record( + &manifest_root + .join("la-kcop") + .join(format!("{WT_PUBLISHER_ORIGIN_TRACK}.jsonl")), + &record, + )?; + } + + let args = ArchiveConvergenceServeArgs { + source: vec![ + format!("nuc-a={}", publisher_a.display()), + format!("nuc-b={}", publisher_b.display()), + ], + broadcast: "la-kcop".to_string(), + track: WT_PUBLISHER_ORIGIN_TRACK.to_string(), + stream_id: None, + rendition: Some("primary".to_string()), + start_sequence: None, + end_sequence: None, + listen: "127.0.0.1:0".to_string(), + metrics_node: Some("forge".to_string()), + metrics_role: "duplicate-proof".to_string(), + }; + + let response = archive_convergence_serve_response(&args, "GET", "/metrics"); + let text = String::from_utf8(response.body)?; + + assert_eq!(response.status, 200); + assert!(text.contains("every_channel_archive_convergence_scrape_ok")); + assert!(text.contains("every_channel_archive_convergence_ok")); + assert!(text.contains("every_channel_archive_duplicate_hash_sequences")); + assert!(text.contains("node=\"forge\"")); + assert!(text.contains(" 1.000000\n")); + + let health = archive_convergence_serve_response(&args, "GET", "/health"); + assert_eq!(health.status, 200); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn archive_convergence_serve_metrics_response_reports_scrape_failure() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-archive-convergence-serve-empty-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + let publisher_a = root.join("publisher-a").join("manifests"); + let publisher_b = root.join("publisher-b").join("manifests"); + fs::create_dir_all(&publisher_a)?; + fs::create_dir_all(&publisher_b)?; + + let args = 
ArchiveConvergenceServeArgs { + source: vec![ + format!("nuc-a={}", publisher_a.display()), + format!("nuc-b={}", publisher_b.display()), + ], + broadcast: "la-kcop".to_string(), + track: WT_PUBLISHER_ORIGIN_TRACK.to_string(), + stream_id: None, + rendition: Some("primary".to_string()), + start_sequence: None, + end_sequence: None, + listen: "127.0.0.1:0".to_string(), + metrics_node: None, + metrics_role: "duplicate-proof".to_string(), + }; + + let response = archive_convergence_serve_response(&args, "GET", "/metrics"); + let text = String::from_utf8(response.body)?; + + assert_eq!(response.status, 200); + assert!(text.contains("# archive_convergence_error")); + assert!(text.contains("every_channel_archive_convergence_scrape_ok")); + assert!(text.contains(" 0.000000\n")); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn archive_convergence_measure_parses_agent_manifest_records() -> Result<()> { + let hash = blake3::hash(b"agent-record").to_hex().to_string(); + let record = ArchiveIndexRecord { + received_unix_ms: 1_000, + relay_url: "https://lax.relay.every.channel/anon".to_string(), + source_node: Some("publisher-a".to_string()), + source_session: Some("publisher-a-session".to_string()), + media_timing: None, + broadcast_name: "la-kcop".to_string(), + track_name: WT_PUBLISHER_ORIGIN_TRACK.to_string(), + group_sequence: 7, + frame_count: 1, + size_bytes: 512, + blake3: hash.clone(), + cas_path: format!("objects/blake3/{}/{hash}.bin", &hash[0..2]), + }; + let body = serde_json::json!({ + "ok": true, + "invalid_lines": 1, + "records": [ + record, + {"not": "an archive index record"} + ], + }) + .to_string(); + + let (records, invalid) = parse_agent_archive_records(&body)?; + + assert_eq!(records.len(), 1); + assert_eq!(invalid, 2); + assert_eq!(records[0].source_node.as_deref(), Some("publisher-a")); + assert_eq!(records[0].group_sequence, 7); + Ok(()) + } + + #[test] + fn archive_convergence_measure_ignores_partial_tail_line() -> Result<()> { + 
let hash = blake3::hash(b"jsonl-record").to_hex().to_string(); + let record = ArchiveIndexRecord { + received_unix_ms: 2_000, + relay_url: "https://lax.relay.every.channel/anon".to_string(), + source_node: Some("publisher-b".to_string()), + source_session: Some("publisher-b-session".to_string()), + media_timing: None, + broadcast_name: "la-kcop".to_string(), + track_name: WT_PUBLISHER_ORIGIN_TRACK.to_string(), + group_sequence: 11, + frame_count: 1, + size_bytes: 1024, + blake3: hash.clone(), + cas_path: format!("objects/blake3/{}/{hash}.bin", &hash[0..2]), + }; + let body = format!( + "partial-json-tail\n{}\n{{\"bad\": true}}\n", + serde_json::to_string(&record)? + ); + + let (records, invalid) = parse_archive_records_jsonl(&body); + + assert_eq!(records.len(), 1); + assert_eq!(invalid, 1); + assert_eq!(records[0].source_node.as_deref(), Some("publisher-b")); + assert_eq!(records[0].group_sequence, 11); + Ok(()) + } + + #[test] + fn archive_convergence_measure_requires_elapsed_samples() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-archive-convergence-measure-summary-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + let publisher_a = root.join("publisher-a").join("manifests"); + let publisher_b = root.join("publisher-b").join("manifests"); + fs::create_dir_all(publisher_a.join("la-kcop"))?; + fs::create_dir_all(publisher_b.join("la-kcop"))?; + + let hash = blake3::hash(b"elapsed-perfect-match").to_hex().to_string(); + for (manifest_root, source_node) in + [(&publisher_a, "publisher-a"), (&publisher_b, "publisher-b")] + { + let record = ArchiveIndexRecord { + received_unix_ms: 1_000, + relay_url: "https://lax.relay.every.channel/anon".to_string(), + source_node: Some(source_node.to_string()), + source_session: Some(format!("{source_node}-session")), + media_timing: Some(ArchiveRecordMediaTiming { + track_id: 7, + timescale: 90_000, + decode_time: 0, + sequence: 
0, + }), + broadcast_name: "la-kcop".to_string(), + track_name: WT_PUBLISHER_ORIGIN_TRACK.to_string(), + group_sequence: 0, + frame_count: 1, + size_bytes: 1024, + blake3: hash.clone(), + cas_path: format!("objects/blake3/{}/{hash}.bin", &hash[0..2]), + }; + append_archive_index_record( + &manifest_root + .join("la-kcop") + .join(format!("{WT_PUBLISHER_ORIGIN_TRACK}.jsonl")), + &record, + )?; + } + + let convergence = archive_convergence_report(&ArchiveConvergenceArgs { + source: vec![ + format!("nuc-a={}", publisher_a.display()), + format!("nuc-b={}", publisher_b.display()), + ], + broadcast: "la-kcop".to_string(), + track: WT_PUBLISHER_ORIGIN_TRACK.to_string(), + stream_id: None, + rendition: Some("primary".to_string()), + start_sequence: None, + end_sequence: None, + pretty: false, + prometheus: false, + metrics_node: None, + metrics_role: "duplicate-proof".to_string(), + require_ok: false, + })?; + assert!(convergence.ok); + + let summary = archive_convergence_measure_summary(&[ArchiveConvergenceMeasureSample { + sample_unix_ms: 1_000, + sources: vec![ + ArchiveConvergenceMeasureFetch { + name: "nuc-a".to_string(), + kind: "manifest".to_string(), + url: "https://example.invalid/a.jsonl".to_string(), + ok: true, + status: Some(200), + elapsed_ms: 1, + record_count: 1, + invalid_record_count: 0, + error: None, + }, + ArchiveConvergenceMeasureFetch { + name: "nuc-b".to_string(), + kind: "manifest".to_string(), + url: "https://example.invalid/b.jsonl".to_string(), + ok: true, + status: Some(200), + elapsed_ms: 1, + record_count: 1, + invalid_record_count: 0, + error: None, + }, + ], + convergence: Some(convergence), + convergence_error: None, + prometheus: Vec::new(), + }]); + + assert!(!summary.ok); + assert_eq!(summary.sample_count, 1); + assert_eq!(summary.elapsed_ms, 0); + assert_eq!(summary.latest_matching_duplicate_sequences, Some(1)); + assert_eq!(summary.latest_divergent_sequences, Some(0)); + assert_eq!(summary.reasons, 
vec!["insufficient_elapsed_samples"]); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn archive_convergence_measure_reads_prometheus_file_sd_agents() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-archive-convergence-file-sd-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + fs::create_dir_all(&root)?; + let sd_path = root.join("node-agents.json"); + fs::write( + &sd_path, + serde_json::json!([ + { + "targets": ["100.64.0.5:7799"], + "labels": { + "headscale_name": "nuc-a", + "headscale_user": "node" + } + }, + { + "targets": ["127.0.0.1:7799"], + "labels": { + "headscale_name": "ecp-forge", + "headscale_user": "forge" + } + }, + { + "targets": ["http://100.64.0.6:7799/"], + "labels": { + "headscale_name": "nuc-b", + "headscale_user": "node" + } + } + ]) + .to_string(), + )?; + let args = ArchiveConvergenceMeasureArgs::parse_from([ + "archive-convergence-measure", + "--agent-prometheus-sd", + sd_path.to_str().expect("utf-8 temp path"), + "--agent-prometheus-sd-label", + "headscale_user=node", + "--broadcast", + "la-kcop", + ]); + + let (entries, errors) = archive_measure_prometheus_sd_agent_manifests(&args); + + assert!(errors.is_empty(), "{errors:?}"); + assert_eq!( + entries, + vec![ + "nuc-a=http://100.64.0.5:7799".to_string(), + "nuc-b=http://100.64.0.6:7799".to_string() + ] + ); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn archive_convergence_measure_serve_metrics_report_elapsed_gate() { + let args = ArchiveConvergenceMeasureServeArgs { + agent_manifest: vec![ + "nuc-a=http://nuc-a.example.invalid:7799".to_string(), + "nuc-b=http://nuc-b.example.invalid:7799".to_string(), + ], + agent_prometheus_sd: Vec::new(), + agent_prometheus_sd_label: Vec::new(), + agent_manifest_role: "publisher-buffer".to_string(), + manifest: Vec::new(), + broadcast: "la-kcop".to_string(), + track: WT_PUBLISHER_ORIGIN_TRACK.to_string(), + 
stream_id: None, + rendition: Some("primary".to_string()), + start_sequence: None, + end_sequence: None, + prometheus_url: None, + timeout_ms: 1000, + max_manifest_bytes: 1024 * 1024, + listen: "127.0.0.1:7813".to_string(), + max_samples: 16, + min_elapsed_seconds: 30.0, + metrics_node: Some("forge".to_string()), + metrics_role: "duplicate-proof".to_string(), + }; + let sample = ArchiveConvergenceMeasureSample { + sample_unix_ms: 1_000, + sources: vec![ + ArchiveConvergenceMeasureFetch { + name: "nuc-a".to_string(), + kind: "agent-manifest".to_string(), + url: "http://nuc-a.example.invalid:7799/v1/archive-manifest".to_string(), + ok: true, + status: Some(200), + elapsed_ms: 10, + record_count: 3, + invalid_record_count: 0, + error: None, + }, + ArchiveConvergenceMeasureFetch { + name: "nuc-b".to_string(), + kind: "agent-manifest".to_string(), + url: "http://nuc-b.example.invalid:7799/v1/archive-manifest".to_string(), + ok: true, + status: Some(200), + elapsed_ms: 11, + record_count: 3, + invalid_record_count: 0, + error: None, + }, + ], + convergence: None, + convergence_error: Some("synthetic convergence missing".to_string()), + prometheus: Vec::new(), + }; + let mut next_sample = sample.clone(); + next_sample.sample_unix_ms = 1_100; + let report = + archive_convergence_measure_report_from_samples(&args, vec![sample, next_sample]); + + assert!(!report.ok); + assert_eq!(report.summary.elapsed_ms, 100); + assert!(report + .summary + .reasons + .iter() + .any(|reason| reason == "elapsed_window_too_short")); + + let metrics = archive_convergence_measure_prometheus_metrics(&report, &args); + assert!(metrics.contains("every_channel_archive_convergence_measure_ok")); + assert!(metrics.contains("reason=\"elapsed_window_too_short\"")); + assert!(metrics.contains("source=\"nuc-a\"")); + assert!(metrics.contains("every_channel_archive_convergence_scrape_ok")); + } + + #[test] + fn archive_convergence_defaults_to_publisher_origin_track() { + let args = 
ArchiveConvergenceArgs::parse_from([ + "archive-convergence", + "--source", + "nuc-a=/tmp/a", + "--source", + "nuc-b=/tmp/b", + "--broadcast", + "la-kcop", + ]); + + assert_eq!(args.track, WT_PUBLISHER_ORIGIN_TRACK); + } + + #[test] + fn archive_convergence_require_ok_fails_on_incomplete_proof() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-archive-convergence-require-ok-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + let publisher_a = root.join("publisher-a").join("manifests"); + let publisher_b = root.join("publisher-b").join("manifests"); + fs::create_dir_all(publisher_a.join("la-kcop"))?; + fs::create_dir_all(publisher_b.join("la-kcop"))?; + + let hash = blake3::hash(b"single-source").to_hex().to_string(); + let record = ArchiveIndexRecord { + received_unix_ms: 1_000, + relay_url: "https://lax.relay.every.channel/anon".to_string(), + source_node: Some("publisher-a".to_string()), + source_session: Some("publisher-a-test".to_string()), + media_timing: None, + broadcast_name: "la-kcop".to_string(), + track_name: WT_PUBLISHER_ORIGIN_TRACK.to_string(), + group_sequence: 0, + frame_count: 1, + size_bytes: 1024, + blake3: hash.clone(), + cas_path: format!("objects/blake3/{}/{hash}.bin", &hash[0..2]), + }; + append_archive_index_record( + &publisher_a + .join("la-kcop") + .join(format!("{WT_PUBLISHER_ORIGIN_TRACK}.jsonl")), + &record, + )?; + + let err = archive_convergence_command(ArchiveConvergenceArgs { + source: vec![ + format!("nuc-a={}", publisher_a.display()), + format!("nuc-b={}", publisher_b.display()), + ], + broadcast: "la-kcop".to_string(), + track: WT_PUBLISHER_ORIGIN_TRACK.to_string(), + stream_id: None, + rendition: Some("primary".to_string()), + start_sequence: None, + end_sequence: None, + pretty: false, + prometheus: false, + metrics_node: None, + metrics_role: "duplicate-proof".to_string(), + require_ok: true, + }) + .expect_err("require-ok 
should fail incomplete duplicate proof"); + + assert!(err + .to_string() + .contains("archive convergence failed for la-kcop/publisher.m4s")); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn archive_convergence_rejects_single_source_coverage_as_duplicate_proof() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-archive-convergence-incomplete-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + let publisher_a = root.join("publisher-a").join("manifests"); + let publisher_b = root.join("publisher-b").join("manifests"); + fs::create_dir_all(publisher_a.join("la-kcop"))?; + fs::create_dir_all(publisher_b.join("la-kcop"))?; + + let mut lines = String::new(); + for sequence in 0..2u64 { + let hash = blake3::hash(format!("la-kcop-only-a-{sequence}").as_bytes()) + .to_hex() + .to_string(); + let record = ArchiveIndexRecord { + received_unix_ms: 1_000 + sequence * 1_000, + relay_url: "https://lax.relay.every.channel/anon".to_string(), + source_node: Some("publisher-a".to_string()), + source_session: Some("publisher-a-test".to_string()), + media_timing: None, + broadcast_name: "la-kcop".to_string(), + track_name: WT_PUBLISH_PRIMARY_VIDEO_TRACK.to_string(), + group_sequence: sequence, + frame_count: 30, + size_bytes: 1024, + blake3: hash.clone(), + cas_path: format!("objects/blake3/{}/{hash}.bin", &hash[0..2]), + }; + lines.push_str(&serde_json::to_string(&record)?); + lines.push('\n'); + } + fs::write(publisher_a.join("la-kcop").join("0.m4s.jsonl"), lines)?; + + let report = archive_convergence_report(&ArchiveConvergenceArgs { + source: vec![ + format!("nuc-a={}", publisher_a.display()), + format!("nuc-b={}", publisher_b.display()), + ], + broadcast: "la-kcop".to_string(), + track: WT_PUBLISH_PRIMARY_VIDEO_TRACK.to_string(), + stream_id: None, + rendition: Some("720p".to_string()), + start_sequence: None, + end_sequence: None, + pretty: false, + prometheus: 
false, + metrics_node: None, + metrics_role: "duplicate-proof".to_string(), + require_ok: false, + })?; + + assert!(!report.ok); + assert!(!report.duplicate_complete); + assert_eq!(report.summary.expected_sequences, 2); + assert_eq!(report.summary.missing_sequences, Vec::::new()); + assert_eq!( + report.summary.matching_duplicate_sequences, + Vec::::new() + ); + assert_eq!( + report.reasons, + vec![ + "duplicate_sequences_incomplete".to_string(), + "media_timing_missing".to_string(), + "source_identity_not_diverse".to_string(), + ] + ); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn archive_convergence_rejects_legacy_records_without_source_identity() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-archive-convergence-no-source-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + let publisher_a = root.join("publisher-a").join("manifests"); + let publisher_b = root.join("publisher-b").join("manifests"); + for manifest_root in [&publisher_a, &publisher_b] { + fs::create_dir_all(manifest_root.join("la-kcop"))?; + let mut lines = String::new(); + for sequence in 0..2u64 { + let hash = blake3::hash(format!("legacy-source-{sequence}").as_bytes()) + .to_hex() + .to_string(); + lines.push_str( + &serde_json::json!({ + "received_unix_ms": 1_000 + sequence * 1_000, + "relay_url": "https://lax.relay.every.channel/anon", + "broadcast_name": "la-kcop", + "track_name": WT_PUBLISH_PRIMARY_VIDEO_TRACK, + "group_sequence": sequence, + "frame_count": 30, + "size_bytes": 1024, + "blake3": hash, + "cas_path": format!("objects/blake3/{}/{hash}.bin", &hash[0..2]), + }) + .to_string(), + ); + lines.push('\n'); + } + fs::write(manifest_root.join("la-kcop").join("0.m4s.jsonl"), lines)?; + } + + let report = archive_convergence_report(&ArchiveConvergenceArgs { + source: vec![ + format!("nuc-a={}", publisher_a.display()), + format!("nuc-b={}", publisher_b.display()), + ], + 
broadcast: "la-kcop".to_string(), + track: WT_PUBLISH_PRIMARY_VIDEO_TRACK.to_string(), + stream_id: None, + rendition: Some("720p".to_string()), + start_sequence: None, + end_sequence: None, + pretty: false, + prometheus: false, + metrics_node: None, + metrics_role: "duplicate-proof".to_string(), + require_ok: false, + })?; + + assert!(!report.ok); + assert_eq!(report.missing_source_identity_records, 4); + assert!(!report.record_source_identity_ok); + assert!(report + .reasons + .contains(&"source_identity_missing".to_string())); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn archive_convergence_rejects_mirrored_same_source_records() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-archive-convergence-mirrored-source-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + let publisher_a = root.join("publisher-a").join("manifests"); + let publisher_b = root.join("publisher-b").join("manifests"); + for manifest_root in [&publisher_a, &publisher_b] { + fs::create_dir_all(manifest_root.join("la-kcop"))?; + let mut lines = String::new(); + for sequence in 0..2u64 { + let hash = blake3::hash(format!("mirrored-source-{sequence}").as_bytes()) + .to_hex() + .to_string(); + let record = ArchiveIndexRecord { + received_unix_ms: 1_000 + sequence * 1_000, + relay_url: "https://lax.relay.every.channel/anon".to_string(), + source_node: Some("archive-origin".to_string()), + source_session: Some("archive-origin-session".to_string()), + media_timing: None, + broadcast_name: "la-kcop".to_string(), + track_name: WT_PUBLISH_PRIMARY_VIDEO_TRACK.to_string(), + group_sequence: sequence, + frame_count: 30, + size_bytes: 1024, + blake3: hash.clone(), + cas_path: format!("objects/blake3/{}/{hash}.bin", &hash[0..2]), + }; + lines.push_str(&serde_json::to_string(&record)?); + lines.push('\n'); + } + fs::write(manifest_root.join("la-kcop").join("0.m4s.jsonl"), lines)?; + } + + 
let report = archive_convergence_report(&ArchiveConvergenceArgs { + source: vec![ + format!("nuc-a={}", publisher_a.display()), + format!("nuc-b={}", publisher_b.display()), + ], + broadcast: "la-kcop".to_string(), + track: WT_PUBLISH_PRIMARY_VIDEO_TRACK.to_string(), + stream_id: None, + rendition: Some("720p".to_string()), + start_sequence: None, + end_sequence: None, + pretty: false, + prometheus: false, + metrics_node: None, + metrics_role: "duplicate-proof".to_string(), + require_ok: false, + })?; + + assert!(!report.ok); + assert_eq!( + report.record_source_nodes, + vec!["archive-origin".to_string()] + ); + assert_eq!(report.record_source_count, 1); + assert_eq!( + report.reasons, + vec![ + "duplicate_sequences_incomplete".to_string(), + "media_timing_missing".to_string(), + "source_identity_not_diverse".to_string(), + ] + ); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn archive_convergence_reports_source_local_hash_divergence() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-archive-convergence-local-divergence-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + let publisher_a = root.join("publisher-a").join("manifests"); + let publisher_b = root.join("publisher-b").join("manifests"); + for manifest_root in [&publisher_a, &publisher_b] { + fs::create_dir_all(manifest_root.join("la-kcop"))?; + } + + let write_record = |manifest_root: &Path, + source_node: &str, + sequence: u64, + hash_material: &str| { + let hash = blake3::hash(hash_material.as_bytes()).to_hex().to_string(); + let record = ArchiveIndexRecord { + received_unix_ms: 1_000 + sequence * 1_000, + relay_url: "https://lax.relay.every.channel/anon".to_string(), + source_node: Some(source_node.to_string()), + source_session: Some(format!("{source_node}-test")), + media_timing: None, + broadcast_name: "la-kcop".to_string(), + track_name: WT_PUBLISH_PRIMARY_VIDEO_TRACK.to_string(), + 
group_sequence: sequence, + frame_count: 30, + size_bytes: 1024, + blake3: hash.clone(), + cas_path: format!("objects/blake3/{}/{hash}.bin", &hash[0..2]), + }; + append_archive_index_record(&manifest_root.join("la-kcop").join("0.m4s.jsonl"), &record) + }; + + write_record(&publisher_a, "publisher-a", 0, "same")?; + write_record(&publisher_a, "publisher-a", 0, "different")?; + write_record(&publisher_b, "publisher-b", 0, "same")?; + + let report = archive_convergence_report(&ArchiveConvergenceArgs { + source: vec![ + format!("nuc-a={}", publisher_a.display()), + format!("nuc-b={}", publisher_b.display()), + ], + broadcast: "la-kcop".to_string(), + track: WT_PUBLISH_PRIMARY_VIDEO_TRACK.to_string(), + stream_id: None, + rendition: Some("720p".to_string()), + start_sequence: None, + end_sequence: None, + pretty: false, + prometheus: false, + metrics_node: None, + metrics_role: "duplicate-proof".to_string(), + require_ok: false, + })?; + + assert!(!report.ok); + assert_eq!(report.summary.source_local_divergent_sequences, vec![0]); + assert!(report + .reasons + .contains(&"source_local_divergent_sequences".to_string())); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn archive_serve_alias_parses_for_node_agent_supervision() { + let cli = Cli::parse_from([ + "ec-node", + "archive-serve", + "--output-dir", + "/tmp/ec-archive", + "--manifest-dir", + "/tmp/ec-archive/manifests", + "--listen", + "127.0.0.1:7789", + ]); + + assert!(matches!(cli.command, Commands::WtArchiveServe(_))); + } + + #[test] + fn sim_duplicate_publishers_default_campaign_passes() -> Result<()> { + let args = SimDuplicatePublishersArgs::parse_from([ + "sim-duplicate-publishers", + "--seed", + "1", + "--iterations", + "128", + ]); + + let output = sim_duplicate_publishers_output(&args)?; + + assert!(output.ok, "{:?}", output.campaign.first_failure); + assert_eq!(output.campaign.passed, 128); + assert_eq!(output.campaign.failed, 0); + 
assert!(output.campaign.max_duplicate_complete_ms_observed <= 3_000); + assert!(output.campaign.total_transient_dropped_observations > 0); + assert!(output.campaign.total_partition_delayed_observations > 0); + assert!(output.campaign.total_publisher_outage_observations > 0); + assert!(output.campaign.total_backfill_observations > 0); + Ok(()) + } + + #[test] + fn sim_control_plane_default_campaign_passes() -> Result<()> { + let args = SimControlPlaneArgs::parse_from([ + "sim-control-plane", + "--seed", + "1", + "--iterations", + "128", + ]); + + let output = sim_control_plane_output(&args)?; + + assert!(output.ok, "{:?}", output.campaign.first_failure); + assert_eq!(output.campaign.passed, 128); + assert_eq!(output.campaign.failed, 0); + assert!(output.campaign.max_propagation_complete_ms_observed <= 900); + assert!(output.campaign.total_transient_dropped_messages > 0); + assert!(output.campaign.total_partition_delayed_messages > 0); + assert!(output.campaign.total_node_outage_delayed_messages > 0); + assert!(output.campaign.total_duplicate_messages > 0); + Ok(()) + } + + #[test] + fn sim_control_plane_preserves_replayable_failure() -> Result<()> { + let args = SimControlPlaneArgs::parse_from([ + "sim-control-plane", + "--seed", + "1", + "--iterations", + "1", + "--fanout", + "0", + "--allow-failure", + ]); + + let output = sim_control_plane_output(&args)?; + let failure = output + .campaign + .first_failure + .as_ref() + .expect("zero fanout should produce a replayable failure"); + + assert!(!output.ok); + assert_eq!(failure.replay_hint, "EC_SIM_SEED=0000000000000001"); + assert_eq!(failure.report.known_nodes, vec!["nuc-a".to_string()]); + assert_eq!(failure.report.missing_nodes.len(), 6); + assert_eq!( + failure.invariant.failures, + vec![ + "propagation_incomplete".to_string(), + "propagation_deadline_unreached".to_string(), + ] + ); + assert!(!failure.report.trace.is_empty()); + Ok(()) + } + + #[test] + fn sim_control_plane_replays_exact_scenario_json() -> 
Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-sim-control-plane-replay-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + fs::create_dir_all(&root)?; + let scenario_path = root.join("scenario.json"); + let mut scenario = ec_core::sim::ControlPlanePropagationScenario::new( + ec_core::sim::SimulationSeed::new(44), + vec![ + "nuc-a".to_string(), + "nuc-b".to_string(), + "tower".to_string(), + "forge".to_string(), + ], + "nuc-a", + "ec.control.broadcast.la-kcop", + "la-kcop@44", + ); + scenario.fanout = 0; + scenario.transient_drop_per_million = 0; + fs::write(&scenario_path, serde_json::to_vec_pretty(&scenario)?)?; + + let args = SimControlPlaneArgs::parse_from(vec![ + "sim-control-plane".to_string(), + "--scenario-json".to_string(), + scenario_path.display().to_string(), + "--allow-failure".to_string(), + ]); + let output = sim_control_plane_output(&args)?; + let direct_report = ec_core::sim::run_control_plane_propagation_simulation(&scenario); + let failure = output + .campaign + .first_failure + .as_ref() + .expect("zero-fanout scenario should fail"); + + assert!(!output.ok); + assert_eq!(output.scenario_template, scenario); + assert_eq!( + output.campaign.seed_start, + ec_core::sim::SimulationSeed::new(44) + ); + assert_eq!(output.campaign.iterations, 1); + assert_eq!(failure.report, direct_report); + assert_eq!(failure.report.known_nodes, vec!["nuc-a".to_string()]); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn sim_control_plane_rejects_unknown_origin() { + let args = SimControlPlaneArgs::parse_from([ + "sim-control-plane", + "--node", + "nuc-a", + "--node", + "nuc-b", + "--origin-node", + "forge", + ]); + + let err = sim_control_plane_output(&args).expect_err("unknown origin should fail"); + + assert!(err.to_string().contains("origin node"), "{err:#}"); + } + + #[test] + fn sim_system_default_campaign_passes_with_global_sequence_clock() -> Result<()> { + let args = 
SimSystemArgs::parse_from(["sim-system", "--seed", "1", "--iterations", "128"]); + + let output = sim_system_output(&args)?; + + assert!(output.ok, "{:?}", output.campaign.first_failure); + assert_eq!(output.campaign.passed, 128); + assert_eq!(output.campaign.failed, 0); + assert!(output.campaign.max_system_complete_ms_observed <= 3_500); + assert!(output.campaign.max_control_propagation_ms_observed > 0); + assert!(output.campaign.max_media_duplicate_complete_ms_observed > 0); + assert!(output.campaign.total_control_transient_drops > 0); + assert!(output.campaign.total_media_transient_drops > 0); + assert!(output.campaign.total_media_backfill_observations > 0); + assert!(output.campaign.seeds_with_control_transient_drops > 0); + assert!(output.campaign.seeds_with_media_transient_drops > 0); + assert!(output.campaign.seeds_with_media_backfill_observations > 0); + assert!(!output.campaign.slowest_system_runs.is_empty()); + Ok(()) + } + + #[test] + fn sim_system_foundationdb_profile_exercises_fault_coverage() -> Result<()> { + let args = SimSystemArgs::parse_from([ + "sim-system", + "--fault-profile", + "foundationdb", + "--seed", + "1", + "--iterations", + "128", + "--max-system-complete-ms", + "6000", + ]); + + let output = sim_system_output(&args)?; + + assert!(output.ok, "{:?}", output.campaign.first_failure); + assert!(output.fault_coverage.required); + assert!(output.fault_coverage.ok, "{:?}", output.fault_coverage); + assert_eq!(output.fault_coverage.min_seed_coverage, 4); + assert!(output.runtime.wall_elapsed_ms >= 0.0); + assert!(output.runtime.iterations_per_second > 0.0); + assert!(output.runtime.simulated_system_seconds_per_wall_second > 0.0); + assert!(output.runtime.trace_events_per_second > 0.0); + assert_eq!(output.campaign.passed, 128); + assert_eq!(output.campaign.failed, 0); + assert!(output.campaign.fault_coverage_ok()); + assert!(output.campaign.total_system_complete_ms_observed > 0); + assert!(output.campaign.total_trace_events > 0); + 
assert!(output.campaign.total_control_partition_delays > 0); + assert!(output.campaign.total_control_node_outage_delays > 0); + assert!(output.campaign.total_control_duplicate_messages > 0); + assert!(output.campaign.total_media_partition_delays > 0); + assert!(output.campaign.total_media_publisher_outages > 0); + assert!(output.campaign.total_media_backfill_observations > 0); + assert!(output.campaign.seeds_with_control_partition_delays > 0); + assert!(output.campaign.seeds_with_control_node_outage_delays > 0); + assert!(output.campaign.seeds_with_control_duplicate_messages > 0); + assert!(output.campaign.seeds_with_media_partition_delays > 0); + assert!(output.campaign.seeds_with_media_publisher_outages > 0); + assert!(output.campaign.seeds_with_media_backfill_observations > 0); + assert!(!output.campaign.slowest_system_runs.is_empty()); + Ok(()) + } + + #[test] + fn sim_system_foundationdb_profile_rejects_weak_seed_coverage() -> Result<()> { + let args = SimSystemArgs::parse_from([ + "sim-system", + "--fault-profile", + "foundationdb", + "--seed", + "1", + "--iterations", + "32", + "--max-system-complete-ms", + "6000", + "--min-fault-seed-coverage", + "33", + "--allow-failure", + ]); + + let output = sim_system_output(&args)?; + + assert!(!output.ok); + assert!(output.campaign.all_passed()); + assert!(output.fault_coverage.required); + assert_eq!(output.fault_coverage.min_seed_coverage, 33); + assert!(output + .fault_coverage + .failures + .contains(&"control_transient_drops_seed_coverage_below_min".to_string())); + assert!(output + .fault_coverage + .failures + .contains(&"media_backfill_seed_coverage_below_min".to_string())); + Ok(()) + } + + #[test] + fn sim_system_fault_coverage_gate_rejects_weak_campaign() -> Result<()> { + let args = SimSystemArgs::parse_from([ + "sim-system", + "--seed", + "1", + "--iterations", + "1", + "--node", + "forge", + "--node", + "nuc-a", + "--node", + "nuc-b", + "--require-fault-coverage", + "--allow-failure", + ]); + + let 
output = sim_system_output(&args)?; + + assert!(!output.ok); + assert!(output.campaign.all_passed()); + assert!(output.campaign.first_failure.is_none()); + assert!(output.fault_coverage.required); + assert!(!output.fault_coverage.ok); + assert!(output + .fault_coverage + .failures + .contains(&"control_node_outage_delays_uncovered".to_string())); + Ok(()) + } + + #[test] + fn sim_system_writes_weak_coverage_artifact() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-sim-system-weak-coverage-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + fs::create_dir_all(&root)?; + let artifact_path = root.join("weak-coverage.json"); + let args = SimSystemArgs::parse_from(vec![ + "sim-system".to_string(), + "--seed".to_string(), + "1".to_string(), + "--iterations".to_string(), + "1".to_string(), + "--node".to_string(), + "forge".to_string(), + "--node".to_string(), + "nuc-a".to_string(), + "--node".to_string(), + "nuc-b".to_string(), + "--require-fault-coverage".to_string(), + "--failure-artifact".to_string(), + artifact_path.display().to_string(), + "--allow-failure".to_string(), + ]); + + sim_system_command(args)?; + let artifact: serde_json::Value = serde_json::from_slice(&fs::read(&artifact_path)?)?; + assert_eq!( + artifact["artifact_type"], + "every.channel.sim.system_duplicate_publishers.weak_coverage.v1" + ); + assert!(artifact["rerun_command"] + .as_str() + .unwrap_or_default() + .contains("--require-fault-coverage")); + assert_eq!(artifact["campaign"]["iterations"], 1); + assert!(artifact["fault_coverage"]["required"] + .as_bool() + .unwrap_or(false)); + assert!(!artifact["fault_coverage"]["ok"].as_bool().unwrap_or(true)); + let failures = artifact["fault_coverage"]["failures"] + .as_array() + .expect("weak coverage artifact has failure array"); + assert!(failures + .iter() + .any(|failure| failure == "control_node_outage_delays_uncovered")); + assert!(artifact.get("scenario_template").is_some()); 
+ + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn sim_system_rejects_local_activation_sequence_clock() -> Result<()> { + let args = SimSystemArgs::parse_from([ + "sim-system", + "--seed", + "1", + "--iterations", + "1", + "--sequence-clock", + "local-activation", + "--allow-failure", + ]); + + let output = sim_system_output(&args)?; + let failure = output + .campaign + .first_failure + .as_ref() + .expect("local activation clock should produce a replayable failure"); + + assert!(!output.ok); + assert_eq!(failure.replay_hint, "EC_SIM_SEED=0000000000000001"); + assert!(failure + .invariant + .failures + .contains(&"media_divergent_sequences".to_string())); + assert!(!failure.report.media.summary.divergent_sequences.is_empty()); + Ok(()) + } + + #[test] + fn sim_system_writes_replayable_foundationdb_failure_artifact() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-sim-system-artifact-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + fs::create_dir_all(&root)?; + let artifact_path = root.join("failure.json"); + let args = SimSystemArgs::parse_from(vec![ + "sim-system".to_string(), + "--fault-profile".to_string(), + "foundationdb".to_string(), + "--seed".to_string(), + "1".to_string(), + "--iterations".to_string(), + "256".to_string(), + "--sequence-clock".to_string(), + "local-activation".to_string(), + "--max-system-complete-ms".to_string(), + "6000".to_string(), + "--failure-artifact".to_string(), + artifact_path.display().to_string(), + "--allow-failure".to_string(), + ]); + + sim_system_command(args)?; + let artifact: serde_json::Value = serde_json::from_slice(&fs::read(&artifact_path)?)?; + assert_eq!( + artifact["artifact_type"], + "every.channel.sim.system_duplicate_publishers.failure.v1" + ); + assert_eq!(artifact["replay_scenario_pointer"], "/replay_scenario"); + assert!(artifact["replay_command"] + .as_str() + .unwrap_or_default() + .contains("sim-system --scenario-json 
-")); + assert!(artifact["replay_command"] + .as_str() + .unwrap_or_default() + .contains("--max-system-complete-ms 6000")); + + let replay_scenario: ec_core::sim::SystemDuplicatePublisherScenario = + serde_json::from_value(artifact["replay_scenario"].clone())?; + let report = ec_core::sim::run_system_duplicate_publisher_simulation(&replay_scenario); + let invariant = ec_core::sim::check_system_duplicate_publisher_invariants( + &report, + &ec_core::sim::SystemDuplicatePublisherInvariantConfig::complete_with_deadline(6_000), + ); + assert!(!invariant.ok); + assert!(invariant + .failures + .contains(&"media_divergent_sequences".to_string())); + assert!(!report.media.summary.divergent_sequences.is_empty()); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn sim_duplicate_publishers_preserves_replayable_failure() -> Result<()> { + let args = SimDuplicatePublishersArgs::parse_from([ + "sim-duplicate-publishers", + "--seed", + "19", + "--iterations", + "1", + "--encoder-drift", + "nuc-b:17:x264-hd3-drift", + "--allow-failure", + ]); + + let output = sim_duplicate_publishers_output(&args)?; + let failure = output + .campaign + .first_failure + .as_ref() + .expect("drift should produce a replayable failure"); + let shrunk = failure + .shrunk_failure + .as_ref() + .expect("drift should produce a shrunk replay"); + + assert!(!output.ok); + assert_eq!(failure.replay_hint, "EC_SIM_SEED=0000000000000013"); + assert_eq!(failure.report.summary.divergent_sequences, vec![17]); + assert_eq!(failure.report.duplicate_complete_at_ms, None); + assert!(!failure.report.trace.is_empty()); + assert_eq!( + failure.invariant.failures, + vec![ + "divergent_sequences".to_string(), + "duplicate_incomplete".to_string(), + "duplicate_complete_deadline_unreached".to_string(), + ] + ); + assert_eq!(shrunk.scenario.expected_sequences(), 18); + assert_eq!(shrunk.report.summary.divergent_sequences, vec![17]); + Ok(()) + } + + #[test] + fn 
sim_duplicate_publishers_writes_replayable_failure_artifact() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-sim-duplicate-publisher-artifact-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + fs::create_dir_all(&root)?; + let artifact_path = root.join("failure.json"); + let args = SimDuplicatePublishersArgs::parse_from(vec![ + "sim-duplicate-publishers".to_string(), + "--seed".to_string(), + "19".to_string(), + "--iterations".to_string(), + "1".to_string(), + "--encoder-drift".to_string(), + "nuc-b:17:x264-hd3-drift".to_string(), + "--failure-artifact".to_string(), + artifact_path.display().to_string(), + "--allow-failure".to_string(), + ]); + + sim_duplicate_publishers_command(args)?; + let artifact: serde_json::Value = serde_json::from_slice(&fs::read(&artifact_path)?)?; + assert_eq!( + artifact["artifact_type"], + "every.channel.sim.duplicate_publishers.failure.v1" + ); + assert_eq!(artifact["shrunk"], true); + assert_eq!(artifact["replay_scenario_pointer"], "/replay_scenario"); + assert!(artifact["replay_command"] + .as_str() + .unwrap_or_default() + .contains("--scenario-json -")); + + let replay_scenario: ec_core::sim::DuplicatePublisherScenario = + serde_json::from_value(artifact["replay_scenario"].clone())?; + assert_eq!(replay_scenario.expected_sequences(), 18); + let report = ec_core::sim::run_duplicate_publisher_simulation(&replay_scenario); + let invariant = ec_core::sim::check_duplicate_publisher_invariants( + &report, + &ec_core::sim::DuplicatePublisherInvariantConfig::duplicate_complete_with_deadline( + 3_000, + ), + ); + assert!(!invariant.ok); + assert_eq!(report.summary.divergent_sequences, vec![17]); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn sim_duplicate_publishers_detects_unaligned_publisher_sequence_phase() -> Result<()> { + let args = SimDuplicatePublishersArgs::parse_from([ + "sim-duplicate-publishers", + "--seed", + "1", + "--iterations", + 
"1", + "--partition", + "nuc-b:1000:1100:0", + "--publisher-outage", + "nuc-b:2000:2100:0", + "--transient-drop-per-million", + "0", + "--max-jitter-ms", + "0", + "--publisher-sequence-offset", + "nuc-b:3", + "--allow-failure", + ]); + + let output = sim_duplicate_publishers_output(&args)?; + let failure = output + .campaign + .first_failure + .as_ref() + .expect("phase offset should produce a replayable failure"); + + assert!(!output.ok); + assert_eq!( + output + .scenario_template + .publisher_sequence_offsets + .get("nuc-b"), + Some(&3) + ); + assert_eq!(failure.report.summary.matching_duplicate_sequences.len(), 0); + assert_eq!( + failure + .report + .fault_stats + .publisher_phase_offset_observations, + 48 + ); + assert!(failure + .invariant + .failures + .contains(&"divergent_sequences".to_string())); + Ok(()) + } + + #[test] + fn sim_duplicate_publishers_detects_missing_media_timing() -> Result<()> { + let args = SimDuplicatePublishersArgs::parse_from([ + "sim-duplicate-publishers", + "--seed", + "1", + "--iterations", + "1", + "--transient-drop-per-million", + "0", + "--max-jitter-ms", + "0", + "--missing-media-timing-publisher", + "nuc-b", + "--allow-failure", + ]); + + let output = sim_duplicate_publishers_output(&args)?; + let failure = output + .campaign + .first_failure + .as_ref() + .expect("missing media timing should produce a replayable failure"); + + assert!(!output.ok); + assert!(output + .scenario_template + .missing_media_timing_publishers + .contains("nuc-b")); + assert!(failure + .invariant + .failures + .contains(&"media_timing_missing_records".to_string())); + assert_eq!( + failure.report.summary.media_timing_missing_records, + output.scenario_template.expected_sequences() + ); + Ok(()) + } + + #[test] + fn sim_duplicate_publishers_detects_conflicting_media_timing() -> Result<()> { + let args = SimDuplicatePublishersArgs::parse_from([ + "sim-duplicate-publishers", + "--seed", + "1", + "--iterations", + "1", + "--transient-drop-per-million", 
+ "0", + "--max-jitter-ms", + "0", + "--publisher-media-time-offset", + "nuc-b:17", + "--allow-failure", + ]); + + let output = sim_duplicate_publishers_output(&args)?; + let failure = output + .campaign + .first_failure + .as_ref() + .expect("conflicting media timing should produce a replayable failure"); + + assert!(!output.ok); + assert_eq!( + output + .scenario_template + .publisher_media_time_offsets_ms + .get("nuc-b"), + Some(&17) + ); + assert!(failure + .invariant + .failures + .contains(&"media_timing_conflict_sequences".to_string())); + assert_eq!( + failure.report.summary.media_timing_conflict_sequences.len() as u64, + output.scenario_template.expected_sequences() + ); + Ok(()) + } + + #[test] + fn sim_duplicate_publishers_detects_independent_source_material() -> Result<()> { + let args = SimDuplicatePublishersArgs::parse_from([ + "sim-duplicate-publishers", + "--seed", + "1", + "--iterations", + "1", + "--transient-drop-per-million", + "0", + "--max-jitter-ms", + "0", + "--publisher-source-material", + "nuc-b:independent-rf-window", + "--allow-failure", + ]); + + let output = sim_duplicate_publishers_output(&args)?; + let failure = output + .campaign + .first_failure + .as_ref() + .expect("source material mismatch should produce a replayable failure"); + + assert!(!output.ok); + assert_eq!( + output + .scenario_template + .publisher_source_material + .get("nuc-b") + .map(String::as_str), + Some("independent-rf-window") + ); + assert!(failure + .invariant + .failures + .contains(&"source_material_mismatch_observations".to_string())); + assert_eq!( + failure + .report + .fault_stats + .source_material_mismatch_observations, + output.scenario_template.expected_sequences() * 2 + ); + assert!(!failure.report.summary.divergent_sequences.is_empty()); + Ok(()) + } + + #[test] + fn sim_system_detects_independent_source_material() -> Result<()> { + let args = SimSystemArgs::parse_from([ + "sim-system", + "--seed", + "1", + "--iterations", + "1", + 
"--publisher-source-material", + "nuc-b:independent-rf-window", + "--allow-failure", + ]); + + let output = sim_system_output(&args)?; + let failure = output + .campaign + .first_failure + .as_ref() + .expect("source material mismatch should fail the composed system model"); + + assert!(!output.ok); + assert_eq!( + output + .scenario_template + .media + .publisher_source_material + .get("nuc-b") + .map(String::as_str), + Some("independent-rf-window") + ); + assert!(failure + .invariant + .failures + .contains(&"media_source_material_mismatch_observations".to_string())); + assert!(output.campaign.total_media_source_material_mismatches > 0); + assert_eq!( + output.campaign.seeds_with_media_source_material_mismatches, + 1 + ); + Ok(()) + } + + #[test] + fn sim_duplicate_publishers_replays_exact_scenario_json() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-sim-duplicate-publisher-replay-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + fs::create_dir_all(&root)?; + let scenario_path = root.join("scenario.json"); + let mut scenario = ec_core::sim::DuplicatePublisherScenario::new( + ec_core::sim::SimulationSeed::new(19), + vec!["nuc-a".to_string(), "nuc-b".to_string()], + "la-kcop", + WT_LADDER_PRIMARY_RENDITION, + WT_PUBLISH_PRIMARY_VIDEO_TRACK, + "x264-hd3-v1", + 0, + 48, + ); + scenario.segment_step_ms = 40; + scenario.base_network_delay_ms = 5; + scenario.max_jitter_ms = 75; + scenario.transient_drop_per_million = 275_000; + scenario.backfill_after_ms = 600; + scenario.partitions = vec![ + ec_core::sim::SimulationPartition::new("nuc-b", 120, 520, 140), + ec_core::sim::SimulationPartition::new("nuc-a", 940, 1_260, 90), + ]; + scenario.publisher_outages = vec![ec_core::sim::SimulationOutage::new( + "nuc-b", 1_360, 1_520, 220, + )]; + scenario.encoder_drifts = vec![ec_core::sim::EncoderDriftFault::new( + "nuc-b", + 17, + "x264-hd3-drift", + )]; + fs::write(&scenario_path, 
serde_json::to_vec_pretty(&scenario)?)?; + + let args = SimDuplicatePublishersArgs::parse_from(vec![ + "sim-duplicate-publishers".to_string(), + "--scenario-json".to_string(), + scenario_path.display().to_string(), + "--allow-failure".to_string(), + ]); + let output = sim_duplicate_publishers_output(&args)?; + let direct_report = ec_core::sim::run_duplicate_publisher_simulation(&scenario); + let failure = output + .campaign + .first_failure + .as_ref() + .expect("drift scenario should fail"); + + assert!(!output.ok); + assert_eq!(output.scenario_template, scenario); + assert_eq!( + output.campaign.seed_start, + ec_core::sim::SimulationSeed::new(19) + ); + assert_eq!(output.campaign.iterations, 1); + assert_eq!(failure.report, direct_report); + assert_eq!(failure.report.summary.divergent_sequences, vec![17]); + assert!(failure.shrunk_failure.is_some()); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn sim_duplicate_publishers_rejects_invalid_partition() { + let args = SimDuplicatePublishersArgs::parse_from([ + "sim-duplicate-publishers", + "--partition", + "nuc-a:100:50:10", + ]); + + let err = sim_duplicate_publishers_output(&args).expect_err("partition should fail"); + + assert!( + err.to_string().contains("end_ms must be greater"), + "{err:#}" + ); + } + + #[test] + fn sim_duplicate_publishers_rejects_invalid_publisher_outage() { + let args = SimDuplicatePublishersArgs::parse_from([ + "sim-duplicate-publishers", + "--publisher-outage", + "nuc-a:100:50:10", + ]); + + let err = sim_duplicate_publishers_output(&args).expect_err("outage should fail"); + + assert!( + err.to_string().contains("end_ms must be greater"), + "{err:#}" + ); + } + + #[test] + fn sim_duplicate_publishers_rejects_unknown_sequence_offset_publisher() { + let args = SimDuplicatePublishersArgs::parse_from([ + "sim-duplicate-publishers", + "--publisher-sequence-offset", + "nuc-z:3", + ]); + + let err = sim_duplicate_publishers_output(&args).expect_err("offset should fail"); + + 
assert!(err.to_string().contains("unknown publisher"), "{err:#}"); + } + + #[test] + fn wt_archive_defaults_include_real_init_track() { + assert!(WT_ARCHIVE_DEFAULT_TRACKS.contains(&WT_PUBLISH_INIT_TRACK)); + } + + #[test] + fn fmp4_init_prefix_scanner_finds_first_moof() -> Result<()> { + let mut data = Vec::new(); + data.extend_from_slice(&16u32.to_be_bytes()); + data.extend_from_slice(b"ftyp"); + data.extend_from_slice(&[0u8; 8]); + data.extend_from_slice(&24u32.to_be_bytes()); + data.extend_from_slice(b"moov"); + data.extend_from_slice(&[1u8; 16]); + let expected = data.len(); + data.extend_from_slice(&8u32.to_be_bytes()); + data.extend_from_slice(b"moof"); + assert_eq!(top_level_moof_offset(&data)?, Some(expected)); + assert_eq!(top_level_moof_offset(&data[..expected])?, None); + Ok(()) + } + + fn test_mp4_box(kind: &[u8; 4], payload: &[u8]) -> Vec { + let mut data = Vec::with_capacity(8 + payload.len()); + data.extend_from_slice(&(8u32 + payload.len() as u32).to_be_bytes()); + data.extend_from_slice(kind); + data.extend_from_slice(payload); + data + } + + fn test_mp4_full_box(kind: &[u8; 4], version: u8, payload: &[u8]) -> Vec { + let mut full_payload = Vec::with_capacity(4 + payload.len()); + full_payload.push(version); + full_payload.extend_from_slice(&[0, 0, 0]); + full_payload.extend_from_slice(payload); + test_mp4_box(kind, &full_payload) + } + + fn test_mp4_moov_with_track(track_id: u32, timescale: u32) -> Vec { + let mut tkhd_payload = Vec::new(); + tkhd_payload.extend_from_slice(&0u32.to_be_bytes()); + tkhd_payload.extend_from_slice(&0u32.to_be_bytes()); + tkhd_payload.extend_from_slice(&track_id.to_be_bytes()); + let tkhd = test_mp4_full_box(b"tkhd", 0, &tkhd_payload); + + let mut mdhd_payload = Vec::new(); + mdhd_payload.extend_from_slice(&0u32.to_be_bytes()); + mdhd_payload.extend_from_slice(&0u32.to_be_bytes()); + mdhd_payload.extend_from_slice(×cale.to_be_bytes()); + let mdhd = test_mp4_full_box(b"mdhd", 0, &mdhd_payload); + let mdia = 
test_mp4_box(b"mdia", &mdhd); + + let mut trak_payload = Vec::new(); + trak_payload.extend_from_slice(&tkhd); + trak_payload.extend_from_slice(&mdia); + let trak = test_mp4_box(b"trak", &trak_payload); + + test_mp4_box(b"moov", &trak) + } + + fn test_mp4_moof_with_tfdt(track_id: u32, base_media_decode_time: u64) -> Vec { + let tfhd = test_mp4_full_box(b"tfhd", 0, &track_id.to_be_bytes()); + let tfdt = test_mp4_full_box(b"tfdt", 1, &base_media_decode_time.to_be_bytes()); + let mut traf_payload = Vec::new(); + traf_payload.extend_from_slice(&tfhd); + traf_payload.extend_from_slice(&tfdt); + let traf = test_mp4_box(b"traf", &traf_payload); + test_mp4_box(b"moof", &traf) + } + + #[test] + fn split_fmp4_init_and_media_groups_top_level_fragments() -> Result<()> { + let mut bytes = Vec::new(); + let ftyp = test_mp4_box(b"ftyp", b"brand"); + let moov = test_mp4_box(b"moov", b"init"); + let moof_a = test_mp4_box(b"moof", b"frag-a-header"); + let mdat_a = test_mp4_box(b"mdat", b"frag-a-data"); + let moof_b = test_mp4_box(b"moof", b"frag-b-header"); + let mdat_b = test_mp4_box(b"mdat", b"frag-b-data"); + bytes.extend_from_slice(&ftyp); + bytes.extend_from_slice(&moov); + bytes.extend_from_slice(&moof_a); + bytes.extend_from_slice(&mdat_a); + bytes.extend_from_slice(&moof_b); + bytes.extend_from_slice(&mdat_b); + + let split = split_fmp4_init_and_media(&bytes)?; + + assert_eq!(split.init, [ftyp, moov].concat()); + assert_eq!( + split.media, + vec![[moof_a, mdat_a].concat(), [moof_b, mdat_b].concat()] + ); + Ok(()) + } + + #[test] + fn publisher_archive_writer_records_source_stamped_fmp4_fragments() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-publisher-archive-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + let output_dir = root.join("archive-buffer"); + let manifest_dir = output_dir.join("manifests"); + let mut writer = PublisherArchiveWriter::new( + &output_dir, + 
Some(&manifest_dir), + "https://relay.every.channel/anon", + "la-kcop", + WT_PUBLISHER_ORIGIN_TRACK, + 1000, + "nuc-a".to_string(), + ); + + let mut stream = Vec::new(); + stream.extend(test_mp4_box(b"ftyp", b"init-a")); + stream.extend(test_mp4_box(b"moov", b"init-b")); + stream.extend(test_mp4_box(b"moof", b"fragment-a-header")); + stream.extend(test_mp4_box(b"mdat", b"fragment-a-data")); + stream.extend(test_mp4_box(b"moof", b"fragment-b-header")); + stream.extend(test_mp4_box(b"mdat", b"fragment-b-data")); + + writer.observe_bytes(&stream[..17])?; + writer.observe_bytes(&stream[17..])?; + writer.finish()?; + + let records = read_archive_records_from_path(&archive_index_path( + &manifest_dir, + "la-kcop", + WT_PUBLISHER_ORIGIN_TRACK, + ))?; + assert_eq!(records.len(), 2); + assert_eq!(records[0].source_node.as_deref(), Some("nuc-a")); + assert_eq!( + records[0].source_session.as_deref(), + Some(writer.source_session.as_str()) + ); + assert_eq!(records[0].broadcast_name, "la-kcop"); + assert_eq!(records[0].track_name, WT_PUBLISHER_ORIGIN_TRACK); + assert_eq!(records[0].group_sequence, 0); + assert_eq!(records[1].group_sequence, 1); + for record in &records { + assert!(output_dir.join(&record.cas_path).exists()); + } + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn publisher_archive_writer_maps_fmp4_fragments_to_media_time_slots() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-publisher-archive-tfdt-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + let output_dir = root.join("archive-buffer"); + let manifest_dir = output_dir.join("manifests"); + let mut writer = PublisherArchiveWriter::new( + &output_dir, + Some(&manifest_dir), + "https://relay.every.channel/anon", + "la-kcop", + WT_PUBLISHER_ORIGIN_TRACK, + 1000, + "nuc-a".to_string(), + ); + + let mut stream = Vec::new(); + stream.extend(test_mp4_box(b"ftyp", b"init-a")); + 
stream.extend(test_mp4_moov_with_track(7, 90_000)); + stream.extend(test_mp4_moof_with_tfdt(7, 180_000)); + stream.extend(test_mp4_box(b"mdat", b"fragment-a-data")); + stream.extend(test_mp4_moof_with_tfdt(7, 270_000)); + stream.extend(test_mp4_box(b"mdat", b"fragment-b-data")); + + writer.observe_bytes(&stream)?; + writer.finish()?; + + let records = read_archive_records_from_path(&archive_index_path( + &manifest_dir, + "la-kcop", + WT_PUBLISHER_ORIGIN_TRACK, + ))?; + assert_eq!(records.len(), 2); + let first_origin = (records[0].received_unix_ms / 1000).saturating_sub(2); + for (record, expected_sequence) in records.iter().zip([first_origin + 2, first_origin + 3]) + { + let timing = record.media_timing.as_ref().expect("media timing"); + assert_eq!(timing.track_id, 7); + assert_eq!(timing.sequence, expected_sequence); + assert_eq!( + timing.decode_time, + fmp4_decode_time_for_media_sequence(expected_sequence, timing.timescale, 1000) + .expect("decode time") + ); + assert_eq!( + record.group_sequence, + PublisherArchiveWriter::group_sequence_for_media_timing_subfragment(timing, 0) + ); + } + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn publisher_archive_writer_coalesces_subsegment_timed_fragments() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-publisher-archive-fast-tfdt-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + let output_dir = root.join("archive-buffer"); + let manifest_dir = output_dir.join("manifests"); + let mut writer = PublisherArchiveWriter::new( + &output_dir, + Some(&manifest_dir), + "https://relay.every.channel/anon", + "la-kcop", + WT_PUBLISHER_ORIGIN_TRACK, + 1000, + "nuc-a".to_string(), + ); + + let mut stream = Vec::new(); + stream.extend(test_mp4_box(b"ftyp", b"init-a")); + stream.extend(test_mp4_moov_with_track(7, 90_000)); + for decode_time in [0, 30_000, 60_000, 90_000] { + stream.extend(test_mp4_moof_with_tfdt(7, 
decode_time)); + stream.extend(test_mp4_box(b"mdat", &decode_time.to_be_bytes())); + } + + writer.observe_bytes(&stream)?; + writer.finish()?; + + let records = read_archive_records_from_path(&archive_index_path( + &manifest_dir, + "la-kcop", + WT_PUBLISHER_ORIGIN_TRACK, + ))?; + assert_eq!(records.len(), 4); + let first_origin = records[0].received_unix_ms / 1000; + let mut subfragment_counts = BTreeMap::<(u32, u64), u64>::new(); + for (record, expected_sequence) in records.iter().zip([ + first_origin, + first_origin, + first_origin + 1, + first_origin + 1, + ]) { + let timing = record.media_timing.as_ref().expect("media timing"); + assert_eq!(timing.track_id, 7); + assert_eq!(timing.sequence, expected_sequence); + let subfragment_index = subfragment_counts + .entry((timing.track_id, timing.sequence)) + .or_default(); + let expected_group_sequence = + PublisherArchiveWriter::group_sequence_for_media_timing_subfragment( + timing, + *subfragment_index, + ); + *subfragment_index = subfragment_index.saturating_add(1); + assert_eq!(record.group_sequence, expected_group_sequence); + } + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn rewrite_fmp4_fragment_decode_time_replaces_tfdt() -> Result<()> { + let mut fragment = Vec::new(); + fragment.extend(test_mp4_moof_with_tfdt(7, 180_000)); + fragment.extend(test_mp4_box(b"mdat", b"fragment-a-data")); + + assert!(rewrite_fmp4_fragment_decode_time( + &mut fragment, + 53_407_213_650_000 + )?); + let (_, decode_time) = fmp4_fragment_decode_time(&fragment)?.expect("decode time"); + + assert_eq!(decode_time, 53_407_213_650_000); + Ok(()) + } + + #[test] + fn publisher_archive_writer_aligns_timed_fragments_by_media_identity() { + let publisher_a_timing = ArchiveRecordMediaTiming { + track_id: 1, + timescale: 30_000, + decode_time: 53_407_213_650_000, + sequence: 1_780_240_988, + }; + let publisher_b_same_timing = ArchiveRecordMediaTiming { + track_id: 1, + timescale: 30_000, + decode_time: 53_407_213_650_000, + 
sequence: 1_780_240_988, + }; + let publisher_b_late_timing = ArchiveRecordMediaTiming { + track_id: 1, + timescale: 30_000, + decode_time: 53_407_216_653_000, + sequence: 1_780_240_989, + }; + let publisher_b_audio_timing = ArchiveRecordMediaTiming { + track_id: 2, + timescale: 48_000, + decode_time: 85_451_541_840_000, + sequence: 1_780_240_988, + }; + assert_eq!( + PublisherArchiveWriter::group_sequence_for_media_timing_subfragment( + &publisher_a_timing, + 0 + ), + PublisherArchiveWriter::group_sequence_for_media_timing_subfragment( + &publisher_b_same_timing, + 0 + ) + ); + assert_ne!( + PublisherArchiveWriter::group_sequence_for_media_timing_subfragment( + &publisher_a_timing, + 0 + ), + PublisherArchiveWriter::group_sequence_for_media_timing_subfragment( + &publisher_b_late_timing, + 0 + ) + ); + assert_ne!( + PublisherArchiveWriter::group_sequence_for_media_timing_subfragment( + &publisher_a_timing, + 0 + ), + PublisherArchiveWriter::group_sequence_for_media_timing_subfragment( + &publisher_b_audio_timing, + 0 + ) + ); + } + + #[test] + fn wt_publish_publisher_archive_spawns_source_proof_worker() -> Result<()> { + let args = WtPublishArgs::parse_from([ + "wt-publish", + "--url", + "https://lax.relay.every.channel/anon", + "--name", + "la-kcop", + "--input", + "http://hdhr/auto/v13.1", + "--publisher-archive-output-dir", + "/tmp/ec-publisher-archive", + "--publisher-archive-manifest-dir", + "/tmp/ec-publisher-archive/manifests", + "--publisher-archive-source-node", + "nuc-a", + "--publisher-archive-segment-duration-ms", + "1001", + ]); + + let child_args = publisher_proof_archive_source_child_args(&args, &args.input)? 
+ .expect("publisher archive child args"); + let child_args = child_args + .into_iter() + .map(|arg| arg.to_string_lossy().into_owned()) + .collect::>(); + + assert_eq!( + child_args.first().map(String::as_str), + Some("publisher-proof-archive-source") + ); + assert!(child_args + .windows(2) + .any(|window| window == ["--input", "http://hdhr/auto/v13.1"])); + assert!(child_args + .windows(2) + .any(|window| window == ["--track", WT_PUBLISHER_ORIGIN_TRACK])); + assert!(child_args + .windows(2) + .any(|window| window == ["--source-node", "nuc-a"])); + assert!(child_args.iter().any(|arg| arg == "--transcode=true")); + Ok(()) + } + + #[test] + fn publisher_proof_archive_source_converges_two_publishers() -> Result<()> { + if !test_command_available("ffmpeg") { + eprintln!( + "skipping publisher source proof archive convergence test: ffmpeg unavailable" + ); + return Ok(()); + } + + let root = std::env::temp_dir().join(format!( + "ec-source-proof-archive-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + fs::create_dir_all(&root)?; + let input_ts = root.join("source.ts"); + write_short_deterministic_ts(&input_ts)?; + + let archive_a = root.join("nuc-a"); + let archive_b = root.join("nuc-b"); + let report_a = + publisher_proof_archive_source_report(&PublisherProofArchiveSourceArgs::parse_from([ + "publisher-proof-archive-source", + "--input", + input_ts.to_str().unwrap_or("/tmp/source.ts"), + "--output-dir", + archive_a.to_str().unwrap_or("/tmp/nuc-a"), + "--relay-url", + "https://lax.relay.every.channel/anon", + "--name", + "la-kcop", + "--source-node", + "nuc-a", + "--chunk-ms", + "1001", + "--max-chunks", + "1", + ]))?; + let report_b = + publisher_proof_archive_source_report(&PublisherProofArchiveSourceArgs::parse_from([ + "publisher-proof-archive-source", + "--input", + input_ts.to_str().unwrap_or("/tmp/source.ts"), + "--output-dir", + archive_b.to_str().unwrap_or("/tmp/nuc-b"), + 
"--relay-url", + "https://lax.relay.every.channel/anon", + "--name", + "la-kcop", + "--source-node", + "nuc-b", + "--chunk-ms", + "1001", + "--max-chunks", + "1", + ]))?; + + assert_eq!(report_a.archived_windows, 1); + assert_eq!(report_b.archived_windows, 1); + assert!(report_a.archived_fragments > 0); + assert_eq!(report_a.archived_fragments, report_b.archived_fragments); + + let convergence = archive_convergence_report(&ArchiveConvergenceArgs { + source: vec![ + format!("nuc-a={}", archive_a.join("manifests").display()), + format!("nuc-b={}", archive_b.join("manifests").display()), + ], + broadcast: "la-kcop".to_string(), + track: WT_PUBLISHER_ORIGIN_TRACK.to_string(), + stream_id: None, + rendition: None, + start_sequence: None, + end_sequence: None, + pretty: false, + prometheus: false, + metrics_node: None, + metrics_role: "duplicate-proof".to_string(), + require_ok: false, + })?; + + assert!(convergence.ok, "{:#?}", convergence.reasons); + assert_eq!(convergence.record_source_count, 2); + assert_eq!(convergence.summary.divergent_sequences.len(), 0); + assert_eq!( + convergence.summary.source_local_divergent_sequences.len(), + 0 + ); + assert_eq!( + convergence.summary.matching_duplicate_sequences.len() as u64, + convergence.summary.expected_sequences + ); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn publisher_proof_archive_source_skips_bad_ts_window() -> Result<()> { + if !test_command_available("ffmpeg") { + eprintln!("skipping bad source proof window test: ffmpeg unavailable"); + return Ok(()); + } + + let root = std::env::temp_dir().join(format!( + "ec-source-proof-bad-window-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + fs::create_dir_all(&root)?; + let input_ts = root.join("bad-window.ts"); + fs::write(&input_ts, b"not a valid mpeg transport stream")?; + + let args = PublisherProofArchiveSourceArgs::parse_from([ + "publisher-proof-archive-source", 
+ "--input", + input_ts.to_str().unwrap_or("/tmp/bad-window.ts"), + "--output-dir", + root.join("archive").to_str().unwrap_or("/tmp/archive"), + "--relay-url", + "https://lax.relay.every.channel/anon", + "--name", + "la-kcop", + "--source-node", + "nuc-a", + "--chunk-ms", + "1001", + ]); + let config = PublisherProofArchiveSourceConfig::from_args(&args)?; + fs::create_dir_all(&config.source_window_dir)?; + fs::create_dir_all(&config.proof_dir)?; + fs::create_dir_all(&config.cas_root)?; + let chunk_path = config.source_window_dir.join("bad-window.ts"); + fs::copy(&input_ts, &chunk_path)?; + + let mut stats = PublisherProofArchiveSourceStats::default(); + archive_publisher_source_proof_chunk( + &config, + &mut stats, + TsChunk { + index: 1, + path: chunk_path, + timing: ec_chopper::ChunkTiming { + chunk_index: 1, + chunk_start_27mhz: None, + chunk_duration_27mhz: 27_000_000, + utc_start_unix: None, + sync_status: "test-invalid".to_string(), + }, + }, + )?; + + assert_eq!(stats.archived_windows, 0); + assert_eq!(stats.archived_fragments, 0); + assert!(!config.manifest_path.exists()); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn publisher_source_proof_window_sequence_uses_synced_utc_or_wallclock_start() { + let synced = TsChunk { + index: 390_545_408, + path: PathBuf::from("unused.ts"), + timing: ec_chopper::ChunkTiming { + chunk_index: 390_545_408, + chunk_start_27mhz: Some(123), + chunk_duration_27mhz: 54_000_000, + utc_start_unix: Some(10), + sync_status: "synced".to_string(), + }, + }; + assert_eq!( + publisher_source_proof_window_sequence(&synced, 12_345, 2_000), + 390_545_408 + ); + + let unsynced = TsChunk { + index: 390_545_408, + path: PathBuf::from("unused.ts"), + timing: ec_chopper::ChunkTiming { + chunk_index: 390_545_408, + chunk_start_27mhz: Some(123), + chunk_duration_27mhz: 54_000_000, + utc_start_unix: Some(10), + sync_status: "unsynced".to_string(), + }, + }; + assert_eq!( + publisher_source_proof_window_sequence(&unsynced, 
12_345, 2_000), + 5 + ); + } + + #[test] + fn publisher_source_proof_stats_keeps_subfragment_slots_across_chunks() { + let mut stats = PublisherProofArchiveSourceStats::default(); + let timing = ArchiveRecordMediaTiming { + track_id: 1, + timescale: 90_000, + decode_time: 178_095_128_8116, + sequence: 1_779_171_116, + }; + + let first = stats.next_source_proof_group_sequence(&timing); + let second = stats.next_source_proof_group_sequence(&timing); + + assert_ne!(first, second); + assert_eq!(second, first + 1); + assert_eq!(stats.archived_fragments, 2); + assert_eq!(stats.first_group_sequence, Some(first)); + assert_eq!(stats.latest_group_sequence, Some(second)); + } + + #[test] + fn publisher_source_proof_fragment_timing_rewrites_to_window_clock() -> Result<()> { + let mut fragment = Vec::new(); + fragment.extend(test_mp4_moof_with_tfdt(7, 90_000)); + fragment.extend(test_mp4_box(b"mdat", b"fragment")); + let mut track_timescales = BTreeMap::new(); + track_timescales.insert(7, 90_000); + + let timing = + publisher_source_proof_fragment_timing(&mut fragment, &track_timescales, 1_000, 1_000)? 
+ .expect("timed fragment"); + + assert_eq!(timing.track_id, 7); + assert_eq!(timing.timescale, 90_000); + assert_eq!(timing.sequence, 1_001); + assert_eq!( + timing.decode_time, + fmp4_decode_time_for_media_sequence(1_001, 90_000, 1_000).expect("decode time") + ); + let (_, rewritten_decode_time) = + fmp4_fragment_decode_time(&fragment)?.expect("rewritten tfdt"); + assert_eq!(rewritten_decode_time, timing.decode_time); + Ok(()) + } + + #[test] + fn archive_track_limits_tail_or_seek_window() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-archive-limit-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + let manifest_root = root.join("manifests"); + let broadcast_dir = manifest_root.join("limited"); + fs::create_dir_all(&broadcast_dir)?; + + let mut lines = String::new(); + for sequence in 0..10u64 { + let hash = format!("{sequence:064x}"); + let record = ArchiveIndexRecord { + received_unix_ms: 1_000 + sequence * 1_000, + relay_url: "https://relay.every.channel/anon".to_string(), + source_node: Some("nuc-a".to_string()), + source_session: Some("nuc-a-test".to_string()), + media_timing: None, + broadcast_name: "limited".to_string(), + track_name: WT_PUBLISH_PRIMARY_VIDEO_TRACK.to_string(), + group_sequence: sequence, + frame_count: 1, + size_bytes: 4, + blake3: hash.clone(), + cas_path: format!("objects/blake3/{}/{hash}.bin", &hash[0..2]), + }; + lines.push_str(&serde_json::to_string(&record)?); + lines.push('\n'); + } + fs::write(broadcast_dir.join("0.m4s.jsonl"), lines)?; + + let tail = parse_archive_track( + &manifest_root, + "limited", + WT_PUBLISH_PRIMARY_VIDEO_TRACK, + None, + Some(3), + )?; + assert_eq!( + tail.iter() + .map(|record| record.group_sequence) + .collect::>(), + vec![7, 8, 9] + ); + + let seek_window = parse_archive_track( + &manifest_root, + "limited", + WT_PUBLISH_PRIMARY_VIDEO_TRACK, + Some(4_000), + Some(2), + )?; + assert_eq!( + seek_window + 
.iter() + .map(|record| record.group_sequence) + .collect::>(), + vec![3, 4] + ); + + let timeline = summarize_archive_track( + &manifest_root, + "limited", + WT_PUBLISH_PRIMARY_VIDEO_TRACK, + None, + None, + )?; + assert_eq!(timeline.start_unix_ms, Some(1_000)); + assert_eq!(timeline.end_unix_ms, Some(10_000)); + assert_eq!(timeline.segments, 10); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn archive_ladder_status_groups_directory_and_track_renditions() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-archive-ladder-status-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + let manifest_root = root.join("manifests"); + + let write_record = |broadcast_name: &str, + track_name: &str, + sequence: u64, + received_unix_ms: u64, + frame_count: u64, + size_bytes: usize| + -> Result<()> { + let hash = format!("{sequence:064x}"); + let record = ArchiveIndexRecord { + received_unix_ms, + relay_url: "https://relay.every.channel/anon".to_string(), + source_node: Some("nuc-a".to_string()), + source_session: Some("nuc-a-test".to_string()), + media_timing: None, + broadcast_name: broadcast_name.to_string(), + track_name: track_name.to_string(), + group_sequence: sequence, + frame_count, + size_bytes, + blake3: hash.clone(), + cas_path: format!("objects/blake3/{}/{hash}.bin", &hash[0..2]), + }; + append_archive_index_record( + &archive_index_path(&manifest_root, broadcast_name, track_name), + &record, + ) + }; + + write_record( + "west-cbs", + WT_PUBLISH_PRIMARY_VIDEO_TRACK, + 10, + 1_000, + 30, + 100, + )?; + write_record( + "west-cbs", + WT_PUBLISH_PRIMARY_VIDEO_TRACK, + 11, + 2_000, + 30, + 200, + )?; + write_record( + "west-cbs-480p", + WT_PUBLISH_PRIMARY_VIDEO_TRACK, + 12, + 1_500, + 30, + 120, + )?; + write_record("west-cbs", "0.m4s/1080p", 13, 2_500, 30, 800)?; + write_record("west-cbs", "init/1080p", 14, 900, 1, 64)?; + + let status = 
archive_ladder_status(&manifest_root, "west-cbs", 3_601_000, 3_600)?; + assert_eq!(status.base_broadcast_name, "west-cbs"); + assert_eq!(status.retention_seconds, 3_600); + assert_eq!(status.retention_window_start_unix_ms, Some(1_000)); + assert_eq!( + status + .renditions + .iter() + .map(|item| item.rendition_id.as_str()) + .collect::>(), + vec!["1080p", "480p", "720p"] + ); + + let track_1080 = status + .tracks + .iter() + .find(|track| track.track_name == "0.m4s/1080p") + .expect("1080p video track"); + assert_eq!(track_1080.media_kind, "video"); + assert_eq!(track_1080.rendition_id, "1080p"); + assert_eq!(track_1080.total_bytes, Some(800)); + assert_eq!(track_1080.total_frames, Some(30)); + + let summary_720 = status + .renditions + .iter() + .find(|item| item.rendition_id == "720p") + .expect("720p rendition summary"); + assert_eq!(summary_720.segments, 2); + assert_eq!(summary_720.total_bytes, Some(300)); + assert_eq!(summary_720.total_frames, Some(60)); + + let track_480 = status + .tracks + .iter() + .find(|track| track.rendition_id == "480p") + .expect("480p track"); + assert_eq!(track_480.broadcast_name, "west-cbs-480p"); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn archive_hls_master_playlist_lists_ladder_variant_tracks() -> Result<()> { + let root = std::env::temp_dir().join(format!( + "ec-archive-ladder-master-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + let manifest_root = root.join("manifests"); + + let write_record = |track_name: &str, + sequence: u64, + received_unix_ms: u64, + frame_count: u64, + size_bytes: usize| + -> Result { + let hash = format!("{sequence:064x}"); + let record = ArchiveIndexRecord { + received_unix_ms, + relay_url: "https://relay.every.channel/anon".to_string(), + source_node: Some("nuc-a".to_string()), + source_session: Some("nuc-a-test".to_string()), + media_timing: None, + broadcast_name: "west-cbs".to_string(), + 
track_name: track_name.to_string(), + group_sequence: sequence, + frame_count, + size_bytes, + blake3: hash.clone(), + cas_path: format!("objects/blake3/{}/{hash}.bin", &hash[0..2]), + }; + append_archive_index_record( + &archive_index_path(&manifest_root, "west-cbs", track_name), + &record, + )?; + Ok(hash) + }; + + let init_480 = write_record("init.mp4/480p", 1, 900, 1, 64)?; + write_record("0.m4s/480p", 10, 1_000, 30, 100)?; + write_record("0.m4s/480p", 11, 2_000, 30, 100)?; + write_record("init.mp4/720p", 2, 900, 1, 64)?; + write_record("0.m4s/720p", 12, 1_000, 30, 200)?; + write_record("init.mp4/1080p", 3, 900, 1, 64)?; + write_record("0.m4s/1080p", 13, 1_000, 30, 400)?; + + let req_url = + Url::parse("http://localhost/archive/west-cbs/master.m3u8?from_ms=1000&limit=12")?; + let playlist = archive_hls_master_playlist( + &manifest_root, + "west-cbs", + &req_url, + 3_000, + WT_ARCHIVE_DEFAULT_RETENTION_SECONDS, + )? + .expect("ladder playlist"); + + assert!(playlist.contains("#EXT-X-INDEPENDENT-SEGMENTS")); + assert!(playlist.contains("BANDWIDTH=1400000")); + assert!(playlist.contains("RESOLUTION=854x480")); + assert!(playlist.contains("BANDWIDTH=3200000")); + assert!(playlist.contains("RESOLUTION=1280x720")); + assert!(playlist.contains("BANDWIDTH=6300000")); + assert!(playlist.contains("RESOLUTION=1920x1080")); + assert!(playlist.contains( + "/archive/west-cbs/track.m3u8?broadcast=west-cbs&track=0.m4s%2F480p&from_ms=1000&limit=12" + )); + assert!(playlist.contains( + "/archive/west-cbs/track.m3u8?broadcast=west-cbs&track=0.m4s%2F720p&from_ms=1000&limit=12" + )); + assert!(playlist.contains( + "/archive/west-cbs/track.m3u8?broadcast=west-cbs&track=0.m4s%2F1080p&from_ms=1000&limit=12" + )); + assert_eq!( + latest_init_hash(&manifest_root, "west-cbs", "0.m4s/480p")?, + Some(init_480) + ); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[test] + fn archive_hls_master_playlist_falls_back_for_legacy_single_video() -> Result<()> { + let root = 
std::env::temp_dir().join(format!( + "ec-archive-legacy-master-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + let manifest_root = root.join("manifests"); + + for (track_name, sequence) in [ + (WT_PUBLISH_INIT_TRACK, 1), + (WT_PUBLISH_PRIMARY_VIDEO_TRACK, 2), + ] { + let hash = format!("{sequence:064x}"); + let record = ArchiveIndexRecord { + received_unix_ms: sequence * 1000, + relay_url: "https://relay.every.channel/anon".to_string(), + source_node: Some("nuc-a".to_string()), + source_session: Some("nuc-a-test".to_string()), + media_timing: None, + broadcast_name: "legacy".to_string(), + track_name: track_name.to_string(), + group_sequence: sequence, + frame_count: 1, + size_bytes: 64, + blake3: hash.clone(), + cas_path: format!("objects/blake3/{}/{hash}.bin", &hash[0..2]), + }; + append_archive_index_record( + &archive_index_path(&manifest_root, "legacy", track_name), + &record, + )?; + } + + let req_url = Url::parse("http://localhost/archive/legacy/master.m3u8")?; + assert!(archive_hls_master_playlist( + &manifest_root, + "legacy", + &req_url, + 3_000, + WT_ARCHIVE_DEFAULT_RETENTION_SECONDS, + )? 
+ .is_none()); + + let _ = fs::remove_dir_all(root); + Ok(()) + } + + #[tokio::test] + async fn read_archive_object_fetches_validates_and_caches_origin_object() -> Result<()> { + let payload = b"origin-cas-object".to_vec(); + let digest = blake3::hash(&payload).to_hex().to_string(); + let root = std::env::temp_dir().join(format!( + "ec-archive-origin-test-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + let cas_root = root.join("objects").join("blake3"); + let cache_access_root = root.join("cache-access").join("blake3"); + + let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await?; + let origin_addr = listener.local_addr()?; + let expected_path = format!( + "/archive-root/objects/blake3/{}/{}.bin", + &digest[0..2], + digest + ); + let response_payload = payload.clone(); + let server = tokio::spawn(async move { + let (mut socket, _) = listener.accept().await.unwrap(); + let mut req = vec![0u8; 1024]; + let read = socket.read(&mut req).await.unwrap(); + let request = String::from_utf8_lossy(&req[..read]); + assert!(request.starts_with(&format!("GET {expected_path} "))); + let head = format!( + "HTTP/1.1 200 OK\r\nContent-Length: {}\r\nConnection: close\r\n\r\n", + response_payload.len() + ); + socket.write_all(head.as_bytes()).await.unwrap(); + socket.write_all(&response_payload).await.unwrap(); + }); + + let state = ArchiveReplayState { + cas_root: cas_root.clone(), + manifest_root: root.join("manifests"), + archive_origin_url: Some(Url::parse(&format!("http://{origin_addr}/archive-root"))?), + archive_cas_origin_url: None, + archive_origin_max_bytes: 1024, + archive_retention_seconds: WT_ARCHIVE_DEFAULT_RETENTION_SECONDS, + cache_access_root: cache_access_root.clone(), + http_client: reqwest::Client::new(), + }; + + let fetched = read_archive_object(&state, &digest).await?.unwrap(); + assert_eq!(fetched, payload); + assert_eq!(fs::read(cas_path_for_hash(&cas_root, &digest)?)?, 
payload); + assert!(cache_access_root + .join(&digest[0..2]) + .join(format!("{digest}.access")) + .exists()); + + server.await?; + let _ = fs::remove_dir_all(root); + Ok(()) + } + #[derive(Clone)] struct DummySource { source_id: ec_core::SourceId, @@ -5151,33 +10358,20 @@ async fn control_resolve(args: ControlResolveArgs) -> Result<()> { )) } -fn relay_transports_for_web(transports: &[StreamTransportDescriptor]) -> Vec { - transports - .iter() - .filter_map(|transport| { - if let StreamTransportDescriptor::RelayMoq { - url, - broadcast_name, - track_name, - } = transport - { - Some(WebStreamRelay { - relay_url: url.clone(), - broadcast_name: broadcast_name.clone(), - track_name: track_name.clone(), - }) - } else { - None - } - }) - .collect() -} - -#[derive(Debug, Clone, serde::Serialize)] -struct WebStreamRelay { - relay_url: String, - broadcast_name: String, - track_name: String, +fn select_relay_transport_for_web( + transports: &[StreamTransportDescriptor], +) -> Option<(String, String, String)> { + for transport in transports { + if let StreamTransportDescriptor::RelayMoq { + url, + broadcast_name, + track_name, + } = transport + { + return Some((url.clone(), broadcast_name.clone(), track_name.clone())); + } + } + None } #[derive(Debug, serde::Serialize)] @@ -5187,7 +10381,6 @@ struct WebStreamUpsertReq<'a> { relay_url: &'a str, broadcast_name: &'a str, track_name: &'a str, - relays: &'a [WebStreamRelay], expires_ms: u64, } @@ -5256,11 +10449,11 @@ async fn control_bridge_web(args: ControlBridgeWebArgs) -> Result<()> { } } - let relays = relay_transports_for_web(&announcement.transports); - if relays.is_empty() { + let Some((relay_url, broadcast_name, track_name)) = + select_relay_transport_for_web(&announcement.transports) + else { continue; - } - let primary_relay = &relays[0]; + }; if last_upserted_unix_ms .get(&stream_id) @@ -5273,10 +10466,9 @@ async fn control_bridge_web(args: ControlBridgeWebArgs) -> Result<()> { let payload = WebStreamUpsertReq { 
stream_id: &stream_id, title: &announcement.stream.title, - relay_url: &primary_relay.relay_url, - broadcast_name: &primary_relay.broadcast_name, - track_name: &primary_relay.track_name, - relays: &relays, + relay_url: &relay_url, + broadcast_name: &broadcast_name, + track_name: &track_name, expires_ms: now_unix_ms().saturating_add(ttl_ms), }; @@ -5303,9 +10495,8 @@ async fn control_bridge_web(args: ControlBridgeWebArgs) -> Result<()> { last_upserted_unix_ms.insert(stream_id.clone(), announcement.updated_unix_ms); tracing::info!( stream = %stream_id, - relay = %primary_relay.relay_url, - broadcast = %primary_relay.broadcast_name, - relay_count = relays.len(), + relay = %relay_url, + broadcast = %broadcast_name, "web stream upserted" ); if args.once { @@ -5348,10 +10539,24 @@ fn wait_for_stable_file(path: &Path, timeout: Duration) -> Result<()> { )) } +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +struct ArchiveRecordMediaTiming { + track_id: u32, + timescale: u32, + decode_time: u64, + sequence: u64, +} + #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] struct ArchiveIndexRecord { received_unix_ms: u64, relay_url: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + source_node: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + source_session: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + media_timing: Option, broadcast_name: String, track_name: String, group_sequence: u64, @@ -5384,6 +10589,58 @@ fn default_wt_archive_tracks() -> Vec { .collect() } +fn non_empty_trimmed_string(value: &str) -> Option { + let trimmed = value.trim(); + (!trimmed.is_empty()).then(|| trimmed.to_string()) +} + +fn archive_machine_identity_suffix() -> Option { + [ + "/etc/machine-id", + "/var/lib/dbus/machine-id", + "/proc/sys/kernel/random/boot_id", + ] + .into_iter() + .find_map(|path| { + let value = fs::read_to_string(path).ok()?; + let trimmed = value.trim(); + if trimmed.is_empty() { + 
return None; + } + let hash = blake3::hash(trimmed.as_bytes()).to_hex().to_string(); + Some(hash[..12].to_string()) + }) +} + +fn archive_host_source_node(hostname: &str) -> Option { + let hostname = non_empty_trimmed_string(hostname)?; + let sanitized = sanitize_path_component(&hostname); + archive_machine_identity_suffix() + .map(|suffix| format!("{sanitized}-{suffix}")) + .or(Some(sanitized)) +} + +fn archive_source_node(explicit: Option<&str>) -> String { + explicit + .and_then(non_empty_trimmed_string) + .or_else(|| std::env::var("EVERY_CHANNEL_ARCHIVE_SOURCE_NODE").ok()) + .and_then(|value| non_empty_trimmed_string(&value)) + .or_else(|| std::env::var("EVERY_CHANNEL_NODE_NAME").ok()) + .and_then(|value| archive_host_source_node(&value)) + .or_else(|| std::env::var("HOSTNAME").ok()) + .and_then(|value| archive_host_source_node(&value)) + .unwrap_or_else(|| "unknown".to_string()) +} + +fn archive_source_session(source_node: &str) -> String { + format!( + "{}-{}-{}", + sanitize_path_component(source_node), + std::process::id(), + now_unix_ms() + ) +} + fn cas_store_blob(cas_root: &Path, data: &[u8]) -> Result<(String, PathBuf, bool)> { let hash = blake3::hash(data).to_hex().to_string(); let shard = &hash[0..2]; @@ -5458,9 +10715,1285 @@ fn append_archive_index_record(path: &Path, record: &ArchiveIndexRecord) -> Resu Ok(()) } +#[derive(Debug, Clone, PartialEq, Eq)] +struct Mp4TopLevelBox { + kind: [u8; 4], + size: usize, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct Mp4BoxSpan { + kind: [u8; 4], + start: usize, + size: usize, + header_size: usize, +} + +impl Mp4BoxSpan { + fn payload_range(&self) -> std::ops::Range { + (self.start + self.header_size)..(self.start + self.size) + } +} + +fn mp4_box_at(data: &[u8], offset: usize) -> Result> { + if data.len().saturating_sub(offset) < 8 { + return Ok(None); + } + let size32 = u32::from_be_bytes([ + data[offset], + data[offset + 1], + data[offset + 2], + data[offset + 3], + ]); + let mut header_size = 8usize; + 
let box_size = if size32 == 1 { + if data.len().saturating_sub(offset) < 16 { + return Ok(None); + } + header_size = 16; + u64::from_be_bytes([ + data[offset + 8], + data[offset + 9], + data[offset + 10], + data[offset + 11], + data[offset + 12], + data[offset + 13], + data[offset + 14], + data[offset + 15], + ]) + } else if size32 == 0 { + return Ok(None); + } else { + u64::from(size32) + }; + if box_size < header_size as u64 { + return Err(anyhow!("invalid MP4 box size")); + } + let size = usize::try_from(box_size).context("MP4 box too large")?; + let end = offset + .checked_add(size) + .ok_or_else(|| anyhow!("MP4 box offset overflow"))?; + if data.len() < end { + return Ok(None); + } + Ok(Some(Mp4BoxSpan { + kind: [ + data[offset + 4], + data[offset + 5], + data[offset + 6], + data[offset + 7], + ], + start: offset, + size, + header_size, + })) +} + +fn mp4_top_level_box(data: &[u8]) -> Result> { + Ok(mp4_box_at(data, 0)?.map(|span| Mp4TopLevelBox { + kind: span.kind, + size: span.size, + })) +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct Fmp4InitAndMedia { + init: Vec, + media: Vec>, +} + +fn split_fmp4_init_and_media(data: &[u8]) -> Result { + let mut offset = 0usize; + let mut init_end = None::; + let mut media = Vec::new(); + + while offset < data.len() { + let Some(span) = mp4_box_at(data, offset)? else { + break; + }; + let end = span.start + span.size; + if span.kind == *b"moof" { + init_end.get_or_insert(span.start); + let mut fragment = data[span.start..end].to_vec(); + offset = end; + while offset < data.len() { + let Some(next) = mp4_box_at(data, offset)? 
else { + break; + }; + let next_end = next.start + next.size; + if next.kind == *b"moof" { + break; + } + fragment.extend_from_slice(&data[next.start..next_end]); + offset = next_end; + if next.kind == *b"mdat" { + break; + } + } + media.push(fragment); + continue; + } + offset = end; + } + + let init_end = init_end.ok_or_else(|| anyhow!("fMP4 output has no media fragment"))?; + if init_end == 0 { + return Err(anyhow!("fMP4 output has media before init")); + } + if media.is_empty() { + return Err(anyhow!("fMP4 output has no media fragments")); + } + + Ok(Fmp4InitAndMedia { + init: data[..init_end].to_vec(), + media, + }) +} + +fn mp4_children(data: &[u8], range: std::ops::Range) -> Result> { + let mut children = Vec::new(); + let mut offset = range.start; + while offset < range.end { + let Some(child) = mp4_box_at(data, offset)? else { + break; + }; + let child_end = child.start + child.size; + if child_end > range.end { + return Err(anyhow!("nested MP4 box exceeds parent bounds")); + } + offset = child_end; + children.push(child); + } + Ok(children) +} + +fn read_be_u32(data: &[u8], offset: usize) -> Option { + (data.len().saturating_sub(offset) >= 4).then(|| { + u32::from_be_bytes([ + data[offset], + data[offset + 1], + data[offset + 2], + data[offset + 3], + ]) + }) +} + +fn read_be_u64(data: &[u8], offset: usize) -> Option { + (data.len().saturating_sub(offset) >= 8).then(|| { + u64::from_be_bytes([ + data[offset], + data[offset + 1], + data[offset + 2], + data[offset + 3], + data[offset + 4], + data[offset + 5], + data[offset + 6], + data[offset + 7], + ]) + }) +} + +fn write_be_u32(data: &mut [u8], offset: usize, value: u32) -> bool { + if data.len().saturating_sub(offset) < 4 { + return false; + } + data[offset..offset + 4].copy_from_slice(&value.to_be_bytes()); + true +} + +fn write_be_u64(data: &mut [u8], offset: usize, value: u64) -> bool { + if data.len().saturating_sub(offset) < 8 { + return false; + } + data[offset..offset + 
8].copy_from_slice(&value.to_be_bytes()); + true +} + +fn mp4_full_box_version(data: &[u8], span: &Mp4BoxSpan) -> Option { + data.get(span.payload_range().start).copied() +} + +fn mp4_tkhd_track_id(data: &[u8], tkhd: &Mp4BoxSpan) -> Option { + let payload_start = tkhd.payload_range().start; + let version = mp4_full_box_version(data, tkhd)?; + let track_id_offset = match version { + 0 => payload_start + 12, + 1 => payload_start + 20, + _ => return None, + }; + read_be_u32(data, track_id_offset) +} + +fn mp4_mdhd_timescale(data: &[u8], mdhd: &Mp4BoxSpan) -> Option { + let payload_start = mdhd.payload_range().start; + let version = mp4_full_box_version(data, mdhd)?; + let timescale_offset = match version { + 0 => payload_start + 12, + 1 => payload_start + 20, + _ => return None, + }; + read_be_u32(data, timescale_offset).filter(|timescale| *timescale > 0) +} + +fn mp4_init_track_timescales(moov: &[u8]) -> Result> { + let moov_span = mp4_box_at(moov, 0)?.ok_or_else(|| anyhow!("missing moov box"))?; + if moov_span.kind != *b"moov" { + return Ok(BTreeMap::new()); + } + let mut timescales = BTreeMap::new(); + for trak in mp4_children(moov, moov_span.payload_range())? { + if trak.kind != *b"trak" { + continue; + } + let mut track_id = None; + let mut timescale = None; + for trak_child in mp4_children(moov, trak.payload_range())? { + match &trak_child.kind { + b"tkhd" => track_id = mp4_tkhd_track_id(moov, &trak_child), + b"mdia" => { + for mdia_child in mp4_children(moov, trak_child.payload_range())? 
{ + if mdia_child.kind == *b"mdhd" { + timescale = mp4_mdhd_timescale(moov, &mdia_child); + } + } + } + _ => {} + } + } + if let (Some(track_id), Some(timescale)) = (track_id, timescale) { + timescales.insert(track_id, timescale); + } + } + Ok(timescales) +} + +fn mp4_tfhd_track_id(data: &[u8], tfhd: &Mp4BoxSpan) -> Option { + read_be_u32(data, tfhd.payload_range().start + 4) +} + +fn mp4_tfdt_base_media_decode_time(data: &[u8], tfdt: &Mp4BoxSpan) -> Option { + let payload_start = tfdt.payload_range().start; + let version = mp4_full_box_version(data, tfdt)?; + match version { + 0 => read_be_u32(data, payload_start + 4).map(u64::from), + 1 => read_be_u64(data, payload_start + 4), + _ => None, + } +} + +fn fmp4_fragment_decode_time(bytes: &[u8]) -> Result> { + let Some(moof) = mp4_box_at(bytes, 0)? else { + return Ok(None); + }; + if moof.kind != *b"moof" { + return Ok(None); + } + for moof_child in mp4_children(bytes, moof.payload_range())? { + if moof_child.kind != *b"traf" { + continue; + } + let mut track_id = None; + let mut base_media_decode_time = None; + for traf_child in mp4_children(bytes, moof_child.payload_range())? 
{ + match &traf_child.kind { + b"tfhd" => track_id = mp4_tfhd_track_id(bytes, &traf_child), + b"tfdt" => { + base_media_decode_time = mp4_tfdt_base_media_decode_time(bytes, &traf_child) + } + _ => {} + } + } + if let (Some(track_id), Some(base_media_decode_time)) = (track_id, base_media_decode_time) { + return Ok(Some((track_id, base_media_decode_time))); + } + } + Ok(None) +} + +fn fmp4_media_time_sequence( + base_media_decode_time: u64, + timescale: u32, + segment_duration_ms: u64, +) -> Option { + if timescale == 0 || segment_duration_ms == 0 { + return None; + } + let denominator = u128::from(timescale) * u128::from(segment_duration_ms); + if denominator == 0 { + return None; + } + let numerator = u128::from(base_media_decode_time) * 1000; + let rounded = (numerator + (denominator / 2)) / denominator; + u64::try_from(rounded).ok() +} + +fn fmp4_decode_time_for_media_sequence( + media_sequence: u64, + timescale: u32, + segment_duration_ms: u64, +) -> Option { + if timescale == 0 || segment_duration_ms == 0 { + return None; + } + let numerator = u128::from(media_sequence) + .saturating_mul(u128::from(timescale)) + .saturating_mul(u128::from(segment_duration_ms)); + u64::try_from((numerator + 500) / 1000).ok() +} + +fn rewrite_fmp4_fragment_decode_time( + bytes: &mut [u8], + replacement_decode_time: u64, +) -> Result { + let Some(moof) = mp4_box_at(bytes, 0)? else { + return Ok(false); + }; + if moof.kind != *b"moof" { + return Ok(false); + } + for moof_child in mp4_children(bytes, moof.payload_range())? { + if moof_child.kind != *b"traf" { + continue; + } + for traf_child in mp4_children(bytes, moof_child.payload_range())? 
{ + if traf_child.kind != *b"tfdt" { + continue; + } + let payload_start = traf_child.payload_range().start; + let Some(version) = mp4_full_box_version(bytes, &traf_child) else { + return Ok(false); + }; + return Ok(match version { + 0 => u32::try_from(replacement_decode_time) + .ok() + .is_some_and(|value| write_be_u32(bytes, payload_start + 4, value)), + 1 => write_be_u64(bytes, payload_start + 4, replacement_decode_time), + _ => false, + }); + } + } + Ok(false) +} + +const PUBLISHER_ARCHIVE_EPOCH_BUCKET_STRIDE: u64 = 4096; +const PUBLISHER_PROOF_SUBFRAGMENT_SLOT_RETENTION: usize = 8192; + +#[derive(Debug)] +struct PublisherArchiveWriter { + relay_url: String, + broadcast_name: String, + track_name: String, + source_node: String, + source_session: String, + cas_root: PathBuf, + manifest_path: PathBuf, + buffer: Vec, + pending_moof: Option>, + track_timescales: BTreeMap, + media_sequence_origins: BTreeMap, + media_sequence_fragment_counts: BTreeMap<(u32, u64), u64>, + segment_duration_ms: u64, + next_sequence: u64, + archived_fragments: u64, +} + +impl PublisherArchiveWriter { + #[cfg(test)] + fn new( + output_dir: &Path, + manifest_dir: Option<&Path>, + relay_url: &str, + broadcast_name: &str, + track_name: &str, + segment_duration_ms: u64, + source_node: String, + ) -> Self { + let manifest_root = manifest_dir + .map(Path::to_path_buf) + .unwrap_or_else(|| output_dir.join("manifests")); + let source_session = archive_source_session(&source_node); + Self { + relay_url: relay_url.to_string(), + broadcast_name: broadcast_name.to_string(), + track_name: track_name.to_string(), + source_node, + source_session, + cas_root: output_dir.join("objects").join("blake3"), + manifest_path: archive_index_path(&manifest_root, broadcast_name, track_name), + buffer: Vec::new(), + pending_moof: None, + track_timescales: BTreeMap::new(), + media_sequence_origins: BTreeMap::new(), + media_sequence_fragment_counts: BTreeMap::new(), + segment_duration_ms, + next_sequence: 0, + 
archived_fragments: 0, + } + } + + fn observe_bytes(&mut self, bytes: &[u8]) -> Result<()> { + self.buffer.extend_from_slice(bytes); + + while let Some(top_box) = mp4_top_level_box(&self.buffer)? { + let raw = self.buffer.drain(..top_box.size).collect::>(); + match &top_box.kind { + b"moof" => { + if self.pending_moof.replace(raw).is_some() { + return Err(anyhow!( + "publisher-origin fMP4 saw moof before matching mdat" + )); + } + } + b"mdat" => { + let Some(mut moof) = self.pending_moof.take() else { + continue; + }; + moof.extend_from_slice(&raw); + self.archive_fragment(moof)?; + } + b"moov" => { + let timescales = mp4_init_track_timescales(&raw)?; + if !timescales.is_empty() { + self.track_timescales.extend(timescales); + } + } + _ => {} + } + } + + Ok(()) + } + + fn fragment_media_sequence(&self, bytes: &[u8]) -> Result> { + let Some((track_id, base_media_decode_time)) = fmp4_fragment_decode_time(bytes)? else { + return Ok(None); + }; + let Some(timescale) = self.track_timescales.get(&track_id).copied() else { + return Ok(None); + }; + Ok( + fmp4_media_time_sequence(base_media_decode_time, timescale, self.segment_duration_ms) + .map(|sequence| ArchiveRecordMediaTiming { + track_id, + timescale, + decode_time: base_media_decode_time, + sequence, + }), + ) + } + + fn group_sequence_for_media_timing_subfragment( + timing: &ArchiveRecordMediaTiming, + subfragment_index: u64, + ) -> u64 { + let track_slot_base = (u64::from(timing.track_id) % 16) * 256; + let track_slot = (track_slot_base + subfragment_index.min(255)) + .min(PUBLISHER_ARCHIVE_EPOCH_BUCKET_STRIDE - 1); + timing + .sequence + .saturating_mul(PUBLISHER_ARCHIVE_EPOCH_BUCKET_STRIDE) + .saturating_add(track_slot) + } + + fn wallclock_media_sequence_for_received(&self, received_unix_ms: u64) -> Option { + (self.segment_duration_ms > 0).then(|| received_unix_ms / self.segment_duration_ms) + } + + fn normalize_fragment_media_timing( + &mut self, + bytes: &mut [u8], + media_timing: Option, + received_unix_ms: 
u64, + ) -> Result> { + let Some(mut timing) = media_timing else { + return Ok(None); + }; + let Some(observed_sequence) = self.wallclock_media_sequence_for_received(received_unix_ms) + else { + return Ok(Some(timing)); + }; + let original_sequence = timing.sequence; + let origin_sequence = self + .media_sequence_origins + .entry(timing.track_id) + .or_insert_with(|| observed_sequence.saturating_sub(original_sequence)); + let sequence = origin_sequence.saturating_add(original_sequence); + let Some(decode_time) = fmp4_decode_time_for_media_sequence( + sequence, + timing.timescale, + self.segment_duration_ms, + ) else { + return Ok(Some(timing)); + }; + if rewrite_fmp4_fragment_decode_time(bytes, decode_time)? { + timing.sequence = sequence; + timing.decode_time = decode_time; + } + Ok(Some(timing)) + } + + fn archive_fragment(&mut self, bytes: Vec) -> Result<()> { + let mut bytes = bytes; + let media_sequence = self.fragment_media_sequence(&bytes)?; + let received_unix_ms = now_unix_ms(); + let media_sequence = + self.normalize_fragment_media_timing(&mut bytes, media_sequence, received_unix_ms)?; + let candidate_sequence = if let Some(timing) = media_sequence.as_ref() { + let counter = self + .media_sequence_fragment_counts + .entry((timing.track_id, timing.sequence)) + .or_default(); + let group_sequence = + Self::group_sequence_for_media_timing_subfragment(timing, *counter); + *counter = counter.saturating_add(1); + group_sequence + } else { + self.next_sequence + }; + let group_sequence = if media_sequence.is_some() { + candidate_sequence + } else { + candidate_sequence.max(self.next_sequence) + }; + self.next_sequence = group_sequence.saturating_add(1); + let size_bytes = bytes.len(); + let (hash, rel_path, inserted) = cas_store_blob(&self.cas_root, &bytes)?; + let record = ArchiveIndexRecord { + received_unix_ms, + relay_url: self.relay_url.clone(), + source_node: Some(self.source_node.clone()), + source_session: Some(self.source_session.clone()), + 
media_timing: media_sequence.clone(), + broadcast_name: self.broadcast_name.clone(), + track_name: self.track_name.clone(), + group_sequence, + frame_count: 1, + size_bytes, + blake3: hash.clone(), + cas_path: rel_path.display().to_string(), + }; + append_archive_index_record(&self.manifest_path, &record)?; + self.archived_fragments += 1; + if self.archived_fragments % 50 == 0 { + tracing::info!( + relay = %self.relay_url, + source_node = %self.source_node, + source_session = %self.source_session, + broadcast = %self.broadcast_name, + track = %self.track_name, + group_sequence, + media_timescale = ?media_sequence.as_ref().map(|timing| timing.timescale), + media_decode_time = ?media_sequence.as_ref().map(|timing| timing.decode_time), + media_sequence = ?media_sequence.as_ref().map(|timing| timing.sequence), + archived_fragments = self.archived_fragments, + inserted, + size_bytes, + hash = %hash, + "archived publisher-origin fMP4 fragment" + ); + } + Ok(()) + } + + fn finish(&self) -> Result<()> { + if self.pending_moof.is_some() { + return Err(anyhow!( + "publisher-origin fMP4 stream ended with moof but no mdat" + )); + } + Ok(()) + } +} + +#[derive(Debug, serde::Serialize)] +struct PublisherProofArchiveSourceReport { + input: String, + output_dir: PathBuf, + manifest_path: PathBuf, + broadcast_name: String, + track_name: String, + source_node: String, + source_session: String, + chunk_ms: u64, + archived_windows: u64, + archived_fragments: u64, + first_group_sequence: Option, + latest_group_sequence: Option, +} + +#[derive(Debug, Clone)] +struct PublisherProofArchiveSourceConfig { + input: String, + input_format: Option, + output_dir: PathBuf, + relay_url: String, + broadcast_name: String, + track_name: String, + source_node: String, + source_session: String, + cas_root: PathBuf, + manifest_path: PathBuf, + source_window_dir: PathBuf, + proof_dir: PathBuf, + chunk_ms: u64, + max_chunks: Option, + preroll_packets: usize, + cleanup_temp: bool, + profile: 
StatelessProofEncodeProfile, +} + +#[derive(Debug, Default)] +struct PublisherProofArchiveSourceStats { + archived_windows: u64, + archived_fragments: u64, + first_group_sequence: Option, + latest_group_sequence: Option, + source_proof_subfragment_counts: BTreeMap<(u64, u32), u64>, +} + +impl PublisherProofArchiveSourceStats { + fn observe_group_sequence(&mut self, group_sequence: u64) { + self.archived_fragments = self.archived_fragments.saturating_add(1); + self.first_group_sequence = Some( + self.first_group_sequence + .map(|value| value.min(group_sequence)) + .unwrap_or(group_sequence), + ); + self.latest_group_sequence = Some( + self.latest_group_sequence + .map(|value| value.max(group_sequence)) + .unwrap_or(group_sequence), + ); + } + + fn next_source_proof_group_sequence(&mut self, timing: &ArchiveRecordMediaTiming) -> u64 { + let subfragment_index = { + let counter = self + .source_proof_subfragment_counts + .entry((timing.sequence, timing.track_id)) + .or_default(); + let subfragment_index = *counter; + *counter = counter.saturating_add(1); + subfragment_index + }; + while self.source_proof_subfragment_counts.len() + > PUBLISHER_PROOF_SUBFRAGMENT_SLOT_RETENTION + { + let Some(oldest_key) = self.source_proof_subfragment_counts.keys().next().copied() + else { + break; + }; + self.source_proof_subfragment_counts.remove(&oldest_key); + } + let group_sequence = PublisherArchiveWriter::group_sequence_for_media_timing_subfragment( + timing, + subfragment_index, + ); + self.observe_group_sequence(group_sequence); + group_sequence + } +} + +impl PublisherProofArchiveSourceConfig { + fn from_args(args: &PublisherProofArchiveSourceArgs) -> Result { + if args.chunk_ms == 0 { + return Err(anyhow!("--chunk-ms must be greater than 0")); + } + let source_node = archive_source_node(args.source_node.as_deref()); + let source_session = archive_source_session(&source_node); + let manifest_root = args + .manifest_dir + .clone() + .unwrap_or_else(|| 
args.output_dir.join("manifests")); + let proof_root = args + .output_dir + .join("publisher-proof") + .join(sanitize_path_component(&args.name)) + .join(sanitize_path_component(&source_session)); + Ok(Self { + input: args.input.clone(), + input_format: args.input_format.clone(), + output_dir: args.output_dir.clone(), + relay_url: args.relay_url.clone(), + broadcast_name: args.name.clone(), + track_name: args.track.clone(), + source_node, + source_session, + cas_root: args.output_dir.join("objects").join("blake3"), + manifest_path: archive_index_path(&manifest_root, &args.name, &args.track), + source_window_dir: proof_root.join("source-windows"), + proof_dir: proof_root.join("proof-mp4"), + chunk_ms: args.chunk_ms, + max_chunks: args.max_chunks, + preroll_packets: args.preroll_packets, + cleanup_temp: args.cleanup_temp, + profile: StatelessProofEncodeProfile::from_publisher_proof_archive_source_args(args), + }) + } + + fn report(&self, stats: PublisherProofArchiveSourceStats) -> PublisherProofArchiveSourceReport { + PublisherProofArchiveSourceReport { + input: self.input.clone(), + output_dir: self.output_dir.clone(), + manifest_path: self.manifest_path.clone(), + broadcast_name: self.broadcast_name.clone(), + track_name: self.track_name.clone(), + source_node: self.source_node.clone(), + source_session: self.source_session.clone(), + chunk_ms: self.chunk_ms, + archived_windows: stats.archived_windows, + archived_fragments: stats.archived_fragments, + first_group_sequence: stats.first_group_sequence, + latest_group_sequence: stats.latest_group_sequence, + } + } +} + +fn publisher_proof_archive_source_command(args: PublisherProofArchiveSourceArgs) -> Result<()> { + let pretty = args.pretty; + let report = publisher_proof_archive_source_report(&args)?; + if pretty { + println!("{}", serde_json::to_string_pretty(&report)?); + } else { + println!("{}", serde_json::to_string(&report)?); + } + Ok(()) +} + +fn publisher_proof_archive_source_report( + args: 
&PublisherProofArchiveSourceArgs, +) -> Result { + let config = PublisherProofArchiveSourceConfig::from_args(args)?; + run_publisher_proof_archive_source(config) +} + +fn run_publisher_proof_archive_source( + config: PublisherProofArchiveSourceConfig, +) -> Result { + fs::create_dir_all(&config.source_window_dir).with_context(|| { + format!( + "failed to create source proof dir {}", + config.source_window_dir.display() + ) + })?; + fs::create_dir_all(&config.proof_dir) + .with_context(|| format!("failed to create proof dir {}", config.proof_dir.display()))?; + fs::create_dir_all(&config.cas_root) + .with_context(|| format!("failed to create CAS dir {}", config.cas_root.display()))?; + + if let Some(path) = publisher_proof_archive_source_file_input(&config) { + let file = File::open(&path) + .with_context(|| format!("failed to open source {}", path.display()))?; + return run_publisher_proof_archive_source_reader(&config, file); + } + + if publisher_proof_archive_source_live_http_input(&config) && config.max_chunks.is_none() { + return run_publisher_proof_archive_live_http_source(config); + } + + if let Some(reader) = open_plain_http_body_reader(&config)? 
{ + return run_publisher_proof_archive_source_reader(&config, reader); + } + + let mut child = Command::new("ffmpeg") + .args(publisher_proof_archive_source_ffmpeg_args(&config)) + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::inherit()) + .spawn() + .context("failed to spawn ffmpeg source proof reader")?; + let stdout = child + .stdout + .take() + .ok_or_else(|| anyhow!("ffmpeg source proof stdout unavailable"))?; + let report = run_publisher_proof_archive_source_reader(&config, stdout); + let status = child + .wait() + .context("failed to wait for ffmpeg source proof reader")?; + match report { + Ok(report) if status.success() => Ok(report), + Ok(_) => Err(anyhow!("ffmpeg source proof reader exited with {status}")), + Err(err) => Err(err), + } +} + +fn publisher_proof_archive_source_live_http_input( + config: &PublisherProofArchiveSourceConfig, +) -> bool { + if config.input_format.is_some() { + return false; + } + Url::parse(&config.input) + .ok() + .is_some_and(|url| url.scheme() == "http") +} + +fn run_publisher_proof_archive_live_http_source( + config: PublisherProofArchiveSourceConfig, +) -> Result { + let mut total_stats = PublisherProofArchiveSourceStats::default(); + loop { + let reader = match open_plain_http_body_reader(&config) { + Ok(Some(reader)) => reader, + Ok(None) => return Ok(config.report(total_stats)), + Err(err) => { + tracing::warn!( + input = %config.input, + broadcast = %config.broadcast_name, + track = %config.track_name, + source_node = %config.source_node, + error = %err, + "publisher proof live HTTP source open failed; retrying" + ); + std::thread::sleep(Duration::from_secs(1)); + continue; + } + }; + match run_publisher_proof_archive_source_reader_with_stats( + &config, + reader, + &mut total_stats, + ) { + Ok(()) => { + tracing::warn!( + input = %config.input, + broadcast = %config.broadcast_name, + track = %config.track_name, + source_node = %config.source_node, + "publisher proof live HTTP source ended; retrying" 
+ ); + } + Err(err) => { + tracing::warn!( + input = %config.input, + broadcast = %config.broadcast_name, + track = %config.track_name, + source_node = %config.source_node, + error = %err, + "publisher proof live HTTP source reader failed; retrying" + ); + } + } + std::thread::sleep(Duration::from_secs(1)); + } +} + +fn publisher_proof_archive_source_file_input( + config: &PublisherProofArchiveSourceConfig, +) -> Option { + if config.input_format.is_some() || Url::parse(&config.input).is_ok() { + return None; + } + let path = PathBuf::from(&config.input); + path.exists().then_some(path) +} + +fn open_plain_http_body_reader( + config: &PublisherProofArchiveSourceConfig, +) -> Result>> { + if config.input_format.is_some() { + return Ok(None); + } + let Ok(url) = Url::parse(&config.input) else { + return Ok(None); + }; + if url.scheme() != "http" { + return Ok(None); + } + let host = url + .host_str() + .ok_or_else(|| anyhow!("HTTP source URL has no host: {}", config.input))?; + let port = url.port_or_known_default().unwrap_or(80); + let mut stream = StdTcpStream::connect((host, port)) + .with_context(|| format!("failed to connect to HTTP source {host}:{port}"))?; + let mut path = if url.path().is_empty() { + "/".to_string() + } else { + url.path().to_string() + }; + if let Some(query) = url.query() { + path.push('?'); + path.push_str(query); + } + write!( + stream, + "GET {path} HTTP/1.1\r\nHost: {host}\r\nUser-Agent: every-channel-ec-node\r\nAccept: */*\r\nConnection: close\r\n\r\n" + ) + .context("failed to write HTTP source request")?; + let mut reader = BufReader::new(stream); + let mut line = String::new(); + reader + .read_line(&mut line) + .context("failed to read HTTP source status line")?; + let status = line.split_whitespace().nth(1).unwrap_or(""); + if !status.starts_with('2') { + return Err(anyhow!( + "HTTP source {} returned non-success status line {:?}", + config.input, + line.trim_end() + )); + } + loop { + line.clear(); + let n = reader + 
.read_line(&mut line) + .context("failed to read HTTP source headers")?; + if n == 0 { + return Err(anyhow!("HTTP source ended before response body")); + } + let trimmed = line.trim_end_matches(['\r', '\n']); + if trimmed.is_empty() { + break; + } + let lower = trimmed.to_ascii_lowercase(); + if lower.starts_with("transfer-encoding:") && lower.contains("chunked") { + return Err(anyhow!( + "HTTP chunked transfer sources are not supported for byte-exact TS proof input" + )); + } + } + Ok(Some(reader)) +} + +fn publisher_proof_archive_source_ffmpeg_args( + config: &PublisherProofArchiveSourceConfig, +) -> Vec { + let mut args = vec![ + OsString::from("-hide_banner"), + OsString::from("-loglevel"), + OsString::from("error"), + OsString::from("-nostdin"), + OsString::from("-copyts"), + ]; + if let Some(format) = config.input_format.as_deref() { + args.push(OsString::from("-f")); + args.push(OsString::from(format)); + } + args.extend([ + OsString::from("-i"), + OsString::from(config.input.as_str()), + OsString::from("-map"), + OsString::from("0"), + OsString::from("-c"), + OsString::from("copy"), + OsString::from("-f"), + OsString::from("mpegts"), + OsString::from("-mpegts_copyts"), + OsString::from("1"), + OsString::from("pipe:1"), + ]); + args +} + +fn run_publisher_proof_archive_source_reader( + config: &PublisherProofArchiveSourceConfig, + reader: R, +) -> Result { + let mut stats = PublisherProofArchiveSourceStats::default(); + run_publisher_proof_archive_source_reader_with_stats(config, reader, &mut stats)?; + Ok(config.report(stats)) +} + +fn run_publisher_proof_archive_source_reader_with_stats( + config: &PublisherProofArchiveSourceConfig, + reader: R, + stats: &mut PublisherProofArchiveSourceStats, +) -> Result<()> { + ec_chopper::chunk_ts_stream_live_with_preroll( + reader, + &config.source_window_dir, + config.chunk_ms, + config.max_chunks, + config.preroll_packets, + |chunk| archive_publisher_source_proof_chunk(config, stats, chunk), + )?; + Ok(()) +} + +fn 
publisher_source_proof_window_sequence( + chunk: &TsChunk, + received_unix_ms: u64, + chunk_ms: u64, +) -> u64 { + if chunk.timing.sync_status == "synced" { + return chunk.timing.chunk_index; + } + received_unix_ms.saturating_sub(chunk_ms) / chunk_ms.max(1) +} + +fn publisher_source_proof_fragment_timing( + fragment: &mut [u8], + track_timescales: &BTreeMap, + window_sequence: u64, + chunk_ms: u64, +) -> Result> { + let Some((track_id, base_media_decode_time)) = fmp4_fragment_decode_time(fragment)? else { + return Ok(None); + }; + let Some(timescale) = track_timescales.get(&track_id).copied() else { + return Ok(None); + }; + let fragment_offset = + fmp4_media_time_sequence(base_media_decode_time, timescale, chunk_ms).unwrap_or(0); + let sequence = window_sequence.saturating_add(fragment_offset); + let Some(decode_time) = fmp4_decode_time_for_media_sequence(sequence, timescale, chunk_ms) + else { + return Ok(Some(ArchiveRecordMediaTiming { + track_id, + timescale, + decode_time: base_media_decode_time, + sequence, + })); + }; + if rewrite_fmp4_fragment_decode_time(fragment, decode_time)? 
{ + return Ok(Some(ArchiveRecordMediaTiming { + track_id, + timescale, + decode_time, + sequence, + })); + } + Ok(Some(ArchiveRecordMediaTiming { + track_id, + timescale, + decode_time: base_media_decode_time, + sequence, + })) +} + +fn archive_publisher_source_proof_chunk( + config: &PublisherProofArchiveSourceConfig, + stats: &mut PublisherProofArchiveSourceStats, + chunk: TsChunk, +) -> Result<()> { + let output_mp4 = config + .proof_dir + .join(format!("proof_{:010}.mp4", chunk.index)); + let status = match Command::new("ffmpeg") + .args(stateless_proof_ffmpeg_args( + &config.profile, + &chunk.path, + &output_mp4, + )) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::inherit()) + .status() + { + Ok(status) => status, + Err(err) => { + tracing::warn!( + broadcast = %config.broadcast_name, + track = %config.track_name, + source_node = %config.source_node, + source_session = %config.source_session, + chunk_index = chunk.index, + error = %err, + "skipping source-window publisher proof chunk after ffmpeg spawn failure" + ); + if config.cleanup_temp { + let _ = fs::remove_file(&output_mp4); + let _ = fs::remove_file(&chunk.path); + } + return Ok(()); + } + }; + if !status.success() { + tracing::warn!( + broadcast = %config.broadcast_name, + track = %config.track_name, + source_node = %config.source_node, + source_session = %config.source_session, + chunk_index = chunk.index, + %status, + "skipping source-window publisher proof chunk after ffmpeg failure" + ); + if config.cleanup_temp { + let _ = fs::remove_file(&output_mp4); + let _ = fs::remove_file(&chunk.path); + } + return Ok(()); + } + + let bytes = match fs::read(&output_mp4) { + Ok(bytes) => bytes, + Err(err) => { + tracing::warn!( + broadcast = %config.broadcast_name, + track = %config.track_name, + source_node = %config.source_node, + source_session = %config.source_session, + chunk_index = chunk.index, + output = %output_mp4.display(), + error = %err, + "skipping source-window publisher 
proof chunk after output read failure" + ); + if config.cleanup_temp { + let _ = fs::remove_file(&output_mp4); + let _ = fs::remove_file(&chunk.path); + } + return Ok(()); + } + }; + let split = match split_fmp4_init_and_media(&bytes) { + Ok(split) => split, + Err(err) => { + tracing::warn!( + broadcast = %config.broadcast_name, + track = %config.track_name, + source_node = %config.source_node, + source_session = %config.source_session, + chunk_index = chunk.index, + error = %err, + "skipping source-window publisher proof chunk after fMP4 split failure" + ); + if config.cleanup_temp { + let _ = fs::remove_file(&output_mp4); + let _ = fs::remove_file(&chunk.path); + } + return Ok(()); + } + }; + let received_unix_ms = now_unix_ms(); + let window_sequence = + publisher_source_proof_window_sequence(&chunk, received_unix_ms, config.chunk_ms); + let track_timescales = mp4_init_track_timescales(&split.init).unwrap_or_default(); + + for mut fragment in split.media { + let timing = publisher_source_proof_fragment_timing( + &mut fragment, + &track_timescales, + window_sequence, + config.chunk_ms, + )? 
+ .unwrap_or_else(|| ArchiveRecordMediaTiming { + track_id: 0, + timescale: 1000, + decode_time: window_sequence.saturating_mul(config.chunk_ms), + sequence: window_sequence, + }); + let group_sequence = stats.next_source_proof_group_sequence(&timing); + let size_bytes = fragment.len(); + let (hash, rel_path, inserted) = cas_store_blob(&config.cas_root, &fragment)?; + let record = ArchiveIndexRecord { + received_unix_ms, + relay_url: config.relay_url.clone(), + source_node: Some(config.source_node.clone()), + source_session: Some(config.source_session.clone()), + media_timing: Some(timing.clone()), + broadcast_name: config.broadcast_name.clone(), + track_name: config.track_name.clone(), + group_sequence, + frame_count: 1, + size_bytes, + blake3: hash.clone(), + cas_path: rel_path.display().to_string(), + }; + append_archive_index_record(&config.manifest_path, &record)?; + tracing::debug!( + broadcast = %config.broadcast_name, + track = %config.track_name, + source_node = %config.source_node, + source_session = %config.source_session, + chunk_index = chunk.index, + chunk_sync_status = %chunk.timing.sync_status, + chunk_utc_start_unix = ?chunk.timing.utc_start_unix, + window_sequence, + media_track_id = timing.track_id, + media_sequence = timing.sequence, + group_sequence, + inserted, + size_bytes, + hash = %hash, + "archived source-window publisher proof fragment" + ); + } + + stats.archived_windows = stats.archived_windows.saturating_add(1); + if config.cleanup_temp { + let _ = fs::remove_file(&output_mp4); + let _ = fs::remove_file(&chunk.path); + } + Ok(()) +} + +fn publisher_proof_archive_source_child_args( + args: &WtPublishArgs, + publish_input: &str, +) -> Result>> { + let Some(output_dir) = args.publisher_archive_output_dir.as_ref() else { + return Ok(None); + }; + + let mut out = vec![ + OsString::from("publisher-proof-archive-source"), + OsString::from("--input"), + OsString::from(publish_input), + OsString::from("--output-dir"), + 
output_dir.as_os_str().to_os_string(), + OsString::from("--relay-url"), + OsString::from(args.url.as_str()), + OsString::from("--name"), + OsString::from(args.name.as_str()), + OsString::from("--track"), + OsString::from(args.publisher_archive_track.as_str()), + OsString::from("--chunk-ms"), + OsString::from(args.publisher_archive_segment_duration_ms.to_string()), + OsString::from("--preroll-packets"), + OsString::from(WT_PUBLISH_PROOF_PREROLL_PACKETS.to_string()), + OsString::from(format!("--transcode={}", args.transcode)), + OsString::from("--video-filter"), + OsString::from(args.video_filter.as_str()), + OsString::from("--gop-frames"), + OsString::from(args.gop_frames.to_string()), + OsString::from("--video-preset"), + OsString::from(args.video_preset.as_str()), + OsString::from("--video-crf"), + OsString::from(args.video_crf.to_string()), + OsString::from("--movflags"), + OsString::from(args.movflags.as_str()), + ]; + + if let Some(input_format) = args.input_format.as_deref() { + out.push(OsString::from("--input-format")); + out.push(OsString::from(input_format)); + } + if let Some(manifest_dir) = args.publisher_archive_manifest_dir.as_ref() { + out.push(OsString::from("--manifest-dir")); + out.push(manifest_dir.as_os_str().to_os_string()); + } + if let Some(source_node) = args.publisher_archive_source_node.as_deref() { + out.push(OsString::from("--source-node")); + out.push(OsString::from(source_node)); + } + + Ok(Some(out)) +} + +fn spawn_publisher_proof_archive_source_child( + args: &WtPublishArgs, + publish_input: &str, +) -> Result> { + let Some(child_args) = publisher_proof_archive_source_child_args(args, publish_input)? 
else { + return Ok(None); + }; + let current_exe = + std::env::current_exe().context("failed to locate current ec-node executable")?; + let mut cmd = TokioCommand::new(current_exe); + cmd.args(child_args) + .stdin(Stdio::null()) + .stdout(Stdio::inherit()) + .stderr(Stdio::inherit()); + Ok(Some(cmd.spawn().context( + "failed to spawn publisher source proof archive worker", + )?)) +} + async fn archive_track_loop( mut track: moq_lite::TrackConsumer, relay_url: String, + source_node: String, + source_session: String, broadcast_name: String, track_name: String, cas_root: PathBuf, @@ -5483,7 +12016,7 @@ async fn archive_track_loop( )); }; - let group_sequence = group.info.sequence; + let group_sequence = group.sequence; let mut frame_count: u64 = 0; let mut bytes = Vec::new(); loop { @@ -5512,6 +12045,9 @@ async fn archive_track_loop( let record = ArchiveIndexRecord { received_unix_ms: now_unix_ms(), relay_url: relay_url.clone(), + source_node: Some(source_node.clone()), + source_session: Some(source_session.clone()), + media_timing: None, broadcast_name: broadcast_name.clone(), track_name: track_name.clone(), group_sequence, @@ -5526,6 +12062,8 @@ async fn archive_track_loop( if archived_groups % 50 == 0 { tracing::info!( relay = %relay_url, + source_node = %source_node, + source_session = %source_session, broadcast = %broadcast_name, track = %track_name, archived_groups, @@ -5557,12 +12095,14 @@ async fn wt_archive(args: WtArchiveArgs) -> Result<()> { let manifest_root = args .manifest_dir .unwrap_or_else(|| args.output_dir.join("manifests")); + let source_node = archive_source_node(args.source_node.as_deref()); + let source_session = archive_source_session(&source_node); fs::create_dir_all(&cas_root) .with_context(|| format!("failed to create CAS root {}", cas_root.display()))?; fs::create_dir_all(&manifest_root) .with_context(|| format!("failed to create manifest root {}", manifest_root.display()))?; - let consume = moq_lite::Origin::produce(); + let consume = 
moq_lite::Origin::random().produce(); let mut consume_updates = consume.consume(); #[derive(Clone)] @@ -5795,10 +12335,16 @@ async fn wt_archive(args: WtArchiveArgs) -> Result<()> { Err(last_err.unwrap_or_else(|| anyhow!("failed to connect"))).context("failed MoQ SETUP") } - tracing::info!(url=%relay_url, name=%args.name, "connecting to relay for archival"); + tracing::info!( + url=%relay_url, + name=%args.name, + source_node=%source_node, + source_session=%source_session, + "connecting to relay for archival" + ); let session = connect_moq_session(&relay_url, consume.clone(), args.tls_disable_verify).await?; - let broadcast = if let Some(active) = consume.consume_broadcast(args.name.as_str()) { + let broadcast = if let Some(active) = consume_updates.get_broadcast(args.name.as_str()) { active } else { tracing::info!(name=%args.name, "waiting for relay broadcast announcement"); @@ -5818,7 +12364,9 @@ async fn wt_archive(args: WtArchiveArgs) -> Result<()> { let broadcast_dir = sanitize_path_component(&args.name); let mut tasks = tokio::task::JoinSet::new(); for track_name in tracks { - let track = broadcast.subscribe_track(&moq_lite::Track::new(&track_name)); + let track = broadcast + .subscribe_track(&moq_lite::Track::new(&track_name)) + .with_context(|| format!("failed to subscribe to relay track {track_name}"))?; let track_file = sanitize_path_component(&track_name); let manifest_path = manifest_root .join(&broadcast_dir) @@ -5826,6 +12374,8 @@ async fn wt_archive(args: WtArchiveArgs) -> Result<()> { tasks.spawn(archive_track_loop( track, relay_url_str.clone(), + source_node.clone(), + source_session.clone(), args.name.clone(), track_name.clone(), cas_root.clone(), @@ -5833,6 +12383,8 @@ async fn wt_archive(args: WtArchiveArgs) -> Result<()> { )); tracing::info!( relay = %relay_url_str, + source_node = %source_node, + source_session = %source_session, broadcast = %args.name, track = %track_name, "archival track subscribed" @@ -5872,6 +12424,12 @@ async fn 
wt_archive(args: WtArchiveArgs) -> Result<()> { struct ArchiveReplayState { cas_root: PathBuf, manifest_root: PathBuf, + archive_origin_url: Option, + archive_cas_origin_url: Option, + archive_origin_max_bytes: usize, + archive_retention_seconds: u64, + cache_access_root: PathBuf, + http_client: reqwest::Client, } #[derive(Debug, serde::Serialize)] @@ -5883,6 +12441,47 @@ struct ArchiveTimelineResponse { audio_segments: usize, } +#[derive(Debug, serde::Serialize)] +struct ArchiveLadderResponse { + broadcast_name: String, + base_broadcast_name: String, + generated_unix_ms: u64, + retention_seconds: u64, + retention_window_start_unix_ms: Option, + renditions: Vec, + tracks: Vec, +} + +#[derive(Debug, serde::Serialize)] +struct ArchiveLadderRenditionSummary { + rendition_id: String, + start_unix_ms: Option, + end_unix_ms: Option, + latest_age_ms: Option, + segments: usize, + total_bytes: Option, + total_frames: Option, + retention_coverage: f64, + track_count: usize, +} + +#[derive(Debug, Clone, serde::Serialize)] +struct ArchiveLadderTrackStatus { + broadcast_name: String, + track_name: String, + media_kind: String, + rendition_id: String, + start_unix_ms: Option, + end_unix_ms: Option, + latest_age_ms: Option, + segments: usize, + total_bytes: Option, + total_frames: Option, + span_seconds: f64, + retention_coverage: f64, + partial_summary: bool, +} + #[derive(Debug, Clone)] struct ArchiveHlsSegment { sequence: u64, @@ -5890,6 +12489,114 @@ struct ArchiveHlsSegment { hash: String, } +#[derive(Debug, Clone)] +struct ArchiveHlsVariant { + broadcast_name: String, + track_name: String, + rendition_id: String, + bandwidth: u64, + resolution: Option<(u32, u32)>, + segments: usize, + latest_age_ms: Option, +} + +#[derive(Debug, Clone, Default)] +struct ArchiveTrackSummary { + start_unix_ms: Option, + end_unix_ms: Option, + segments: usize, + total_bytes: Option, + total_frames: Option, + partial_summary: bool, +} + +#[derive(Debug, Clone)] +struct 
ArchiveTrackSummaryCacheEntry { + size: u64, + modified: Option, + summary: ArchiveTrackSummary, +} + +#[derive(Debug, Clone, serde::Serialize)] +struct ArchiveConvergenceSourceSummary { + name: String, + manifest_root: PathBuf, + manifest_path: Option, + record_count: usize, + source_nodes: Vec, + source_sessions: Vec, + missing_source_identity_records: usize, + first_sequence: Option, + latest_sequence: Option, + first_received_unix_ms: Option, + latest_received_unix_ms: Option, +} + +#[derive(Debug, Clone, serde::Serialize)] +struct ArchiveMediaTimingConflict { + group_sequence: u64, + sources: Vec, + media_track_ids: Vec, + media_timescales: Vec, + media_sequences: Vec, + media_decode_times: Vec, +} + +#[derive(Debug, Clone)] +struct ArchiveMediaTimingObservation { + source: String, + timing: ArchiveRecordMediaTiming, +} + +#[derive(Debug, Clone, serde::Serialize)] +struct ArchiveDivergentSequenceRecord { + manifest_source: String, + observation_source: String, + source_node: Option, + source_session: Option, + received_unix_ms: u64, + size_bytes: usize, + blake3: String, + cas_path: String, + media_timing: Option, +} + +#[derive(Debug, Clone, serde::Serialize)] +struct ArchiveDivergentSequenceSample { + group_sequence: u64, + record_count: usize, + hash_count: usize, + sources: Vec, + records: Vec, +} + +#[derive(Debug, Clone, serde::Serialize)] +struct ArchiveConvergenceReport { + ok: bool, + reasons: Vec, + broadcast: String, + track: String, + stream_id: String, + rendition_id: String, + source_count: usize, + record_source_count: usize, + record_source_nodes: Vec, + missing_hash_records: usize, + missing_source_identity_records: usize, + media_timing_missing_records: usize, + record_source_identity_ok: bool, + sources: Vec, + duplicate_complete: bool, + media_timing_conflict_count: usize, + media_timing_conflicts: Vec, + divergent_sequence_samples: Vec, + summary: ec_core::sim::MediaConvergenceSummary, +} + +static ARCHIVE_TRACK_SUMMARY_CACHE: OnceLock< + 
/// Returns `bytes` with leading and trailing ASCII whitespace removed.
///
/// "Whitespace" is whatever `u8::is_ascii_whitespace` accepts. The result
/// borrows from the input; an empty or all-whitespace input yields an empty
/// slice.
fn trim_ascii_whitespace(bytes: &[u8]) -> &[u8] {
    // First byte worth keeping; when there is none, nothing survives the trim.
    let Some(start) = bytes
        .iter()
        .position(|byte| !byte.is_ascii_whitespace())
    else {
        return &[];
    };
    // A non-whitespace byte exists, so the reverse search always succeeds;
    // `map_or` just avoids an unwrap.
    let end = bytes
        .iter()
        .rposition(|byte| !byte.is_ascii_whitespace())
        .map_or(start, |last| last + 1);
    &bytes[start..end]
}
%err, - "failed to parse archive index line" - ); - } + let file = File::open(path) + .with_context(|| format!("failed to open archive index {}", path.display()))?; + let reader = BufReader::new(file); + for line in reader.split(b'\n') { + let line = + line.with_context(|| format!("failed to read archive index {}", path.display()))?; + if let Some(record) = parse_archive_record_line(path, &line) { + out.push(record); } } Ok(out) } +fn parse_named_path(value: &str) -> Result<(String, PathBuf)> { + let (name, path) = value + .split_once('=') + .ok_or_else(|| anyhow!("expected NAME=PATH for --source, got {value:?}"))?; + let name = name.trim(); + let path = path.trim(); + if name.is_empty() || path.is_empty() { + return Err(anyhow!("expected NAME=PATH for --source, got {value:?}")); + } + Ok((name.to_string(), PathBuf::from(path))) +} + +fn archive_convergence_rendition(broadcast: &str, track: &str, explicit: Option<&str>) -> String { + if let Some(value) = explicit { + let trimmed = value.trim(); + if !trimmed.is_empty() { + return trimmed.to_string(); + } + } + if let Some(rendition) = track_rendition(track) { + return rendition.to_string(); + } + if let (_, Some(rendition)) = strip_ladder_rendition_suffix(broadcast) { + return rendition.to_string(); + } + "primary".to_string() +} + +fn archive_convergence_stream_id(broadcast: &str, explicit: Option<&str>) -> String { + if let Some(value) = explicit { + let trimmed = value.trim(); + if !trimmed.is_empty() { + return trimmed.to_string(); + } + } + strip_ladder_rendition_suffix(broadcast).0.to_string() +} + +const ARCHIVE_MEDIA_TIMING_CONFLICT_REPORT_LIMIT: usize = 32; +const ARCHIVE_DIVERGENT_SEQUENCE_REPORT_LIMIT: usize = 16; + +fn archive_media_timing_conflicts( + observations_by_group: &BTreeMap>, + start_sequence: u64, + end_sequence: u64, +) -> (usize, Vec) { + let mut conflict_count = 0usize; + let mut conflicts = Vec::new(); + + for (group_sequence, observations) in observations_by_group { + if *group_sequence 
< start_sequence || *group_sequence >= end_sequence { + continue; + } + if observations.len() < 2 { + continue; + } + + let sources = observations + .iter() + .map(|observation| observation.source.clone()) + .collect::>() + .into_iter() + .collect::>(); + let media_track_ids = observations + .iter() + .map(|observation| observation.timing.track_id) + .collect::>() + .into_iter() + .collect::>(); + let media_timescales = observations + .iter() + .map(|observation| observation.timing.timescale) + .collect::>() + .into_iter() + .collect::>(); + let media_sequences = observations + .iter() + .map(|observation| observation.timing.sequence) + .collect::>() + .into_iter() + .collect::>(); + let media_decode_times = observations + .iter() + .map(|observation| observation.timing.decode_time) + .collect::>() + .into_iter() + .collect::>(); + + let conflicting_window = media_sequences.len() > 1 || media_decode_times.len() > 1; + if !conflicting_window { + continue; + } + + conflict_count += 1; + if conflicts.len() < ARCHIVE_MEDIA_TIMING_CONFLICT_REPORT_LIMIT { + conflicts.push(ArchiveMediaTimingConflict { + group_sequence: *group_sequence, + sources, + media_track_ids, + media_timescales, + media_sequences, + media_decode_times, + }); + } + } + + (conflict_count, conflicts) +} + +fn archive_divergent_sequence_samples( + divergent_sequences: &[u64], + records_by_group: &BTreeMap>, +) -> Vec { + divergent_sequences + .iter() + .take(ARCHIVE_DIVERGENT_SEQUENCE_REPORT_LIMIT) + .filter_map(|group_sequence| { + let records = records_by_group.get(group_sequence)?; + let hash_count = records + .iter() + .map(|record| record.blake3.as_str()) + .collect::>() + .len(); + let sources = records + .iter() + .map(|record| record.observation_source.clone()) + .collect::>() + .into_iter() + .collect::>(); + Some(ArchiveDivergentSequenceSample { + group_sequence: *group_sequence, + record_count: records.len(), + hash_count, + sources, + records: records.clone(), + }) + }) + .collect() +} + +fn 
archive_convergence_report(args: &ArchiveConvergenceArgs) -> Result { + if args.source.len() < 2 { + return Err(anyhow!( + "archive-convergence requires at least two --source NAME=PATH entries" + )); + } + + let sources = args + .source + .iter() + .map(|item| parse_named_path(item)) + .collect::>>()?; + let stream_id = archive_convergence_stream_id(&args.broadcast, args.stream_id.as_deref()); + let rendition_id = + archive_convergence_rendition(&args.broadcast, &args.track, args.rendition.as_deref()); + let mut index = ec_core::sim::MediaConvergenceIndex::default(); + let mut source_summaries = Vec::with_capacity(sources.len()); + let mut first_observed_sequence: Option = None; + let mut latest_observed_sequence: Option = None; + let mut all_record_source_nodes = BTreeSet::::new(); + let mut missing_hash_records = 0usize; + let mut missing_source_identity_records = 0usize; + let mut media_timing_missing_records = 0usize; + let mut media_timing_observations = BTreeMap::>::new(); + let mut divergent_records_by_group = + BTreeMap::>::new(); + + for (name, manifest_root) in sources { + let manifest_path = + archive_index_path_with_aliases(&manifest_root, &args.broadcast, &args.track); + let records = + read_archive_records_with_aliases(&manifest_root, &args.broadcast, &args.track) + .with_context(|| { + format!( + "failed to read archive records for source {name} at {}", + manifest_root.display() + ) + })?; + let mut source_nodes = BTreeSet::::new(); + let mut source_sessions = BTreeSet::::new(); + let mut source_missing_records = 0usize; + + for record in &records { + first_observed_sequence = Some( + first_observed_sequence + .map(|value| value.min(record.group_sequence)) + .unwrap_or(record.group_sequence), + ); + latest_observed_sequence = Some( + latest_observed_sequence + .map(|value| value.max(record.group_sequence)) + .unwrap_or(record.group_sequence), + ); + let record_source_node = record + .source_node + .as_deref() + .and_then(non_empty_trimmed_string); + 
if let Some(source_node) = record_source_node.as_ref() { + source_nodes.insert(source_node.clone()); + all_record_source_nodes.insert(source_node.clone()); + } else { + source_missing_records += 1; + missing_source_identity_records += 1; + } + if let Some(source_session) = record + .source_session + .as_deref() + .and_then(non_empty_trimmed_string) + { + source_sessions.insert(source_session); + } + if record.blake3.trim().is_empty() { + missing_hash_records += 1; + } else { + let observation_source = record_source_node.clone().unwrap_or_else(|| name.clone()); + divergent_records_by_group + .entry(record.group_sequence) + .or_default() + .push(ArchiveDivergentSequenceRecord { + manifest_source: name.clone(), + observation_source: observation_source.clone(), + source_node: record.source_node.clone(), + source_session: record.source_session.clone(), + received_unix_ms: record.received_unix_ms, + size_bytes: record.size_bytes, + blake3: record.blake3.trim().to_string(), + cas_path: record.cas_path.clone(), + media_timing: record.media_timing.clone(), + }); + if let Some(timing) = record.media_timing.clone() { + media_timing_observations + .entry(record.group_sequence) + .or_default() + .push(ArchiveMediaTimingObservation { + source: observation_source.clone(), + timing, + }); + } else { + media_timing_missing_records += 1; + } + index.observe(ec_core::sim::MediaObservation::new_with_media_time_ms( + ec_core::sim::MediaKey::new( + &stream_id, + &rendition_id, + &args.track, + record.group_sequence, + ), + &observation_source, + record.blake3.trim(), + record.media_timing.as_ref().map(|timing| timing.sequence), + record.received_unix_ms, + )); + } + } + + source_summaries.push(ArchiveConvergenceSourceSummary { + name, + manifest_root, + manifest_path, + record_count: records.len(), + source_nodes: source_nodes.into_iter().collect(), + source_sessions: source_sessions.into_iter().collect(), + missing_source_identity_records: source_missing_records, + first_sequence: 
records.iter().map(|record| record.group_sequence).min(), + latest_sequence: records.iter().map(|record| record.group_sequence).max(), + first_received_unix_ms: records.iter().map(|record| record.received_unix_ms).min(), + latest_received_unix_ms: records.iter().map(|record| record.received_unix_ms).max(), + }); + } + + let start_sequence = args + .start_sequence + .or(first_observed_sequence) + .ok_or_else(|| anyhow!("no archive records found for {}", args.broadcast))?; + let end_sequence = args + .end_sequence + .or_else(|| latest_observed_sequence.map(|value| value.saturating_add(1))) + .ok_or_else(|| anyhow!("no archive records found for {}", args.broadcast))?; + if end_sequence <= start_sequence { + return Err(anyhow!( + "end sequence {end_sequence} must be greater than start sequence {start_sequence}" + )); + } + + let summary = index.summarize_observed_sequences( + &stream_id, + &rendition_id, + &args.track, + start_sequence, + end_sequence, + ); + let duplicate_complete = summary.expected_sequences > 0 + && summary.matching_duplicate_sequences.len() as u64 == summary.expected_sequences; + let (media_timing_conflict_count, media_timing_conflicts) = + archive_media_timing_conflicts(&media_timing_observations, start_sequence, end_sequence); + let divergent_sequence_samples = archive_divergent_sequence_samples( + &summary.divergent_sequences, + &divergent_records_by_group, + ); + let mut reasons = Vec::new(); + if !summary.missing_sequences.is_empty() { + reasons.push("missing_sequences".to_string()); + } + if !summary.divergent_sequences.is_empty() { + reasons.push("divergent_sequences".to_string()); + } + if !summary.source_local_divergent_sequences.is_empty() { + reasons.push("source_local_divergent_sequences".to_string()); + } + if !duplicate_complete { + reasons.push("duplicate_sequences_incomplete".to_string()); + } + if media_timing_conflict_count > 0 { + reasons.push("media_sequence_conflict".to_string()); + } + if media_timing_missing_records > 0 { + 
/// Escapes `value` for use inside a double-quoted Prometheus label value.
///
/// Backslash becomes `\\`, double quote becomes `\"`, and a line feed becomes
/// the two characters `\n`; every other character is passed through.
fn prometheus_escape_label_value(value: &str) -> String {
    // Backslashes must be doubled first, otherwise the backslashes introduced
    // by the two later replacements would be escaped a second time.
    value
        .replace('\\', "\\\\")
        .replace('"', "\\\"")
        .replace('\n', "\\n")
}
let mut labels = BTreeMap::from([ + ("broadcast".to_string(), broadcast.to_string()), + ("rendition".to_string(), rendition_id.to_string()), + ("role".to_string(), metrics_role.to_string()), + ("stream".to_string(), stream_id.to_string()), + ("track".to_string(), track.to_string()), + ]); + if let Some(node) = metrics_node.and_then(non_empty_trimmed_string) { + labels.insert("node".to_string(), node); + } + labels +} + +fn push_prometheus_gauge( + output: &mut String, + metric: &str, + labels: &BTreeMap, + value: f64, +) { + output.push_str(metric); + output.push_str(&prometheus_label_set(labels)); + output.push(' '); + output.push_str(&format!("{value:.6}")); + output.push('\n'); +} + +fn archive_convergence_prometheus_metrics( + report: &ArchiveConvergenceReport, + args: &ArchiveConvergenceArgs, +) -> String { + let labels = archive_convergence_prometheus_labels( + &report.broadcast, + &report.track, + &report.stream_id, + &report.rendition_id, + args.metrics_node.as_deref(), + &args.metrics_role, + ); + + let mut output = String::new(); + push_prometheus_gauge( + &mut output, + "every_channel_archive_convergence_scrape_ok", + &labels, + 1.0, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_convergence_ok", + &labels, + if report.ok { 1.0 } else { 0.0 }, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_duplicate_complete", + &labels, + if report.duplicate_complete { 1.0 } else { 0.0 }, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_record_source_identity_ok", + &labels, + if report.record_source_identity_ok { + 1.0 + } else { + 0.0 + }, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_duplicate_hash_source_records", + &labels, + report.summary.duplicate_source_records as f64, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_duplicate_hash_sequences", + &labels, + report.summary.matching_duplicate_sequences.len() as f64, + ); + push_prometheus_gauge( + &mut 
output, + "every_channel_archive_hash_divergent_sequences", + &labels, + report.summary.divergent_sequences.len() as f64, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_source_local_hash_divergent_sequences", + &labels, + report.summary.source_local_divergent_sequences.len() as f64, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_missing_hash_records", + &labels, + report.missing_hash_records as f64, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_missing_source_identity_records", + &labels, + report.missing_source_identity_records as f64, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_media_timing_missing_records", + &labels, + report.media_timing_missing_records as f64, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_media_timing_conflict_sequences", + &labels, + report.media_timing_conflict_count as f64, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_expected_sequences", + &labels, + report.summary.expected_sequences as f64, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_observed_sequences", + &labels, + report.summary.observed_sequences as f64, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_record_source_count", + &labels, + report.record_source_count as f64, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_manifest_source_count", + &labels, + report.source_count as f64, + ); + output +} + +fn archive_convergence_prometheus_error_metrics( + args: &ArchiveConvergenceArgs, + err: &anyhow::Error, +) -> String { + let stream_id = archive_convergence_stream_id(&args.broadcast, args.stream_id.as_deref()); + let rendition_id = + archive_convergence_rendition(&args.broadcast, &args.track, args.rendition.as_deref()); + let labels = archive_convergence_prometheus_labels( + &args.broadcast, + &args.track, + &stream_id, + &rendition_id, + args.metrics_node.as_deref(), + 
&args.metrics_role, + ); + let mut output = String::new(); + output.push_str("# archive_convergence_error "); + output.push_str(&prometheus_escape_label_value(&err.to_string())); + output.push('\n'); + push_prometheus_gauge( + &mut output, + "every_channel_archive_convergence_scrape_ok", + &labels, + 0.0, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_convergence_ok", + &labels, + 0.0, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_duplicate_complete", + &labels, + 0.0, + ); + output +} + +fn archive_convergence_command(args: ArchiveConvergenceArgs) -> Result<()> { + let report = archive_convergence_report(&args)?; + if args.prometheus { + print!("{}", archive_convergence_prometheus_metrics(&report, &args)); + } else if args.pretty { + println!("{}", serde_json::to_string_pretty(&report)?); + } else { + println!("{}", serde_json::to_string(&report)?); + } + if args.require_ok && !report.ok { + return Err(anyhow!( + "archive convergence failed for {}/{}: {}", + report.broadcast, + report.track, + if report.reasons.is_empty() { + "unknown".to_string() + } else { + report.reasons.join(",") + } + )); + } + Ok(()) +} + +#[derive(Debug, Clone, serde::Serialize)] +struct ArchiveConvergenceMeasureFetch { + name: String, + kind: String, + url: String, + ok: bool, + status: Option, + elapsed_ms: u64, + record_count: usize, + invalid_record_count: usize, + error: Option, +} + +#[derive(Debug, Clone, serde::Serialize)] +struct ArchiveConvergenceMeasurePrometheusSample { + metric: String, + ok: bool, + series_present: bool, + value: Option, + error: Option, +} + +#[derive(Debug, Clone, serde::Serialize)] +struct ArchiveConvergenceMeasureSample { + sample_unix_ms: u64, + sources: Vec, + convergence: Option, + convergence_error: Option, + prometheus: Vec, +} + +#[derive(Debug, Clone, serde::Serialize)] +struct ArchiveConvergenceMeasureSummary { + ok: bool, + elapsed_ms: u64, + sample_count: usize, + reasons: Vec, + 
prometheus_series_present_count: usize, + latest_matching_duplicate_sequences: Option, + latest_divergent_sequences: Option, +} + +#[derive(Debug, Clone, serde::Serialize)] +struct ArchiveConvergenceMeasureReport { + ok: bool, + broadcast: String, + track: String, + duration_seconds: f64, + summary: ArchiveConvergenceMeasureSummary, + samples: Vec, +} + +struct ArchiveConvergenceMeasureFetchedBody { + status: Option, + elapsed_ms: u64, + body: Option, + error: Option, +} + +#[derive(Debug, Clone)] +struct ArchiveConvergenceMeasureFetchSpec { + name: String, + kind: String, + url: String, + tail_range: bool, +} + +fn parse_named_endpoint(value: &str) -> Result<(String, String)> { + let (name, endpoint) = value + .split_once('=') + .ok_or_else(|| anyhow!("expected NAME=URL, got {value:?}"))?; + let name = name.trim(); + let endpoint = endpoint.trim(); + if name.is_empty() || endpoint.is_empty() { + return Err(anyhow!("expected NAME=URL, got {value:?}")); + } + Ok((name.to_string(), endpoint.to_string())) +} + +fn parse_label_filter(value: &str) -> Result<(String, String)> { + let (name, expected) = value + .split_once('=') + .ok_or_else(|| anyhow!("expected KEY=VALUE for label filter, got {value:?}"))?; + let name = name.trim(); + let expected = expected.trim(); + if name.is_empty() || expected.is_empty() { + return Err(anyhow!( + "expected KEY=VALUE for label filter, got {value:?}" + )); + } + Ok((name.to_string(), expected.to_string())) +} + +#[derive(Debug, serde::Deserialize)] +struct PrometheusFileSdGroup { + targets: Vec, + #[serde(default)] + labels: BTreeMap, +} + +fn archive_measure_prometheus_sd_agent_manifests( + args: &ArchiveConvergenceMeasureArgs, +) -> (Vec, Vec) { + let mut filters = Vec::new(); + let mut discovery_errors = Vec::new(); + for filter in &args.agent_prometheus_sd_label { + match parse_label_filter(filter) { + Ok(filter) => filters.push(filter), + Err(err) => discovery_errors.push(ArchiveConvergenceMeasureFetch { + name: filter.clone(), + 
kind: "agent-prometheus-sd".to_string(), + url: String::new(), + ok: false, + status: None, + elapsed_ms: 0, + record_count: 0, + invalid_record_count: 0, + error: Some(err.to_string()), + }), + } + } + + let mut entries = Vec::new(); + let mut seen_targets = BTreeSet::::new(); + let mut used_names = HashMap::::new(); + for path in &args.agent_prometheus_sd { + let started = Instant::now(); + let body = match fs::read_to_string(path) { + Ok(body) => body, + Err(err) => { + discovery_errors.push(ArchiveConvergenceMeasureFetch { + name: path.display().to_string(), + kind: "agent-prometheus-sd".to_string(), + url: path.display().to_string(), + ok: false, + status: None, + elapsed_ms: started.elapsed().as_millis() as u64, + record_count: 0, + invalid_record_count: 0, + error: Some(err.to_string()), + }); + continue; + } + }; + let groups = match serde_json::from_str::>(&body) { + Ok(groups) => groups, + Err(err) => { + discovery_errors.push(ArchiveConvergenceMeasureFetch { + name: path.display().to_string(), + kind: "agent-prometheus-sd".to_string(), + url: path.display().to_string(), + ok: false, + status: None, + elapsed_ms: started.elapsed().as_millis() as u64, + record_count: 0, + invalid_record_count: 0, + error: Some(format!("invalid prometheus file-SD JSON: {err}")), + }); + continue; + } + }; + for group in groups { + if filters + .iter() + .any(|(key, expected)| group.labels.get(key) != Some(expected)) + { + continue; + } + let base_name = group + .labels + .get("headscale_name") + .or_else(|| group.labels.get("node")) + .or_else(|| group.labels.get("instance")) + .map(|value| value.trim()) + .filter(|value| !value.is_empty()); + for target in group.targets { + let target = target.trim(); + if target.is_empty() || !seen_targets.insert(target.to_string()) { + continue; + } + let url = if target.starts_with("http://") || target.starts_with("https://") { + target.trim_end_matches('/').to_string() + } else { + format!("http://{}", target.trim_end_matches('/')) + 
}; + let fallback_name = target + .replace("://", "-") + .replace(['/', ':', '.'], "-") + .trim_matches('-') + .to_string(); + let base_name = base_name.unwrap_or(&fallback_name); + let count = used_names.entry(base_name.to_string()).or_insert(0); + let name = if *count == 0 { + base_name.to_string() + } else { + format!("{base_name}-{}", *count + 1) + }; + *count += 1; + entries.push(format!("{name}={url}")); + } + } + } + (entries, discovery_errors) +} + +async fn archive_measure_fetch_body( + client: &reqwest::Client, + url: &str, + timeout: Duration, + max_bytes: usize, + tail_range: bool, +) -> ArchiveConvergenceMeasureFetchedBody { + let started = Instant::now(); + let mut request = client.get(url).timeout(timeout); + if tail_range && max_bytes > 0 { + request = request.header(reqwest::header::RANGE, format!("bytes=-{max_bytes}")); + } + let response = match request.send().await { + Ok(response) => response, + Err(err) => { + return ArchiveConvergenceMeasureFetchedBody { + status: None, + elapsed_ms: started.elapsed().as_millis() as u64, + body: None, + error: Some(err.to_string()), + }; + } + }; + let status = response.status(); + let status_u16 = status.as_u16(); + let bytes = match response.bytes().await { + Ok(bytes) => bytes, + Err(err) => { + return ArchiveConvergenceMeasureFetchedBody { + status: Some(status_u16), + elapsed_ms: started.elapsed().as_millis() as u64, + body: None, + error: Some(err.to_string()), + }; + } + }; + let body = if max_bytes > 0 && bytes.len() > max_bytes { + String::from_utf8_lossy(&bytes[bytes.len() - max_bytes..]).into_owned() + } else { + String::from_utf8_lossy(&bytes).into_owned() + }; + ArchiveConvergenceMeasureFetchedBody { + status: Some(status_u16), + elapsed_ms: started.elapsed().as_millis() as u64, + body: status.is_success().then_some(body), + error: (!status.is_success()).then(|| format!("http status {status_u16}")), + } +} + +fn parse_archive_records_jsonl(body: &str) -> (Vec, usize) { + let mut records = 
Vec::new(); + let mut invalid = 0usize; + for (index, line) in body.lines().enumerate() { + let raw = line.trim(); + if raw.is_empty() { + continue; + } + match serde_json::from_str::(raw) { + Ok(record) => records.push(record), + Err(_) if index == 0 => {} + Err(_) => invalid += 1, + } + } + (records, invalid) +} + +fn parse_agent_archive_records(body: &str) -> Result<(Vec, usize)> { + let payload = serde_json::from_str::(body) + .context("failed to parse node-agent archive manifest response")?; + if payload.get("ok").and_then(|value| value.as_bool()) == Some(false) { + return Err(anyhow!( + "node-agent archive manifest response was not ok: {}", + payload + .get("error") + .and_then(|value| value.as_str()) + .unwrap_or("unknown") + )); + } + let mut invalid = payload + .get("invalid_lines") + .and_then(|value| value.as_u64()) + .unwrap_or(0) as usize; + let records = payload + .get("records") + .and_then(|value| value.as_array()) + .ok_or_else(|| anyhow!("node-agent archive manifest response missing records array"))?; + let mut parsed = Vec::with_capacity(records.len()); + for record in records { + match serde_json::from_value::(record.clone()) { + Ok(record) => parsed.push(record), + Err(_) => invalid += 1, + } + } + Ok((parsed, invalid)) +} + +fn write_measure_records( + manifest_root: &Path, + broadcast: &str, + track: &str, + records: &[ArchiveIndexRecord], +) -> Result<()> { + let path = archive_index_path(manifest_root, broadcast, track); + let parent = path + .parent() + .ok_or_else(|| anyhow!("invalid manifest path: {}", path.display()))?; + fs::create_dir_all(parent).with_context(|| format!("failed to create {}", parent.display()))?; + let mut file = + File::create(&path).with_context(|| format!("failed to create {}", path.display()))?; + for record in records { + serde_json::to_writer(&mut file, record) + .with_context(|| format!("failed to write {}", path.display()))?; + file.write_all(b"\n") + .with_context(|| format!("failed to write {}", 
path.display()))?; + } + Ok(()) +} + +fn archive_measure_agent_url(args: &ArchiveConvergenceMeasureArgs, base_url: &str) -> String { + let mut query = vec![ + ( + "broadcast", + urlencoding::encode(&args.broadcast).into_owned(), + ), + ("track", urlencoding::encode(&args.track).into_owned()), + ("max_bytes", args.max_manifest_bytes.to_string()), + ]; + if !args.agent_manifest_role.trim().is_empty() { + query.push(( + "role", + urlencoding::encode(args.agent_manifest_role.trim()).into_owned(), + )); + } + let query = query + .into_iter() + .map(|(key, value)| format!("{key}={value}")) + .collect::>() + .join("&"); + format!( + "{}/v1/archive-manifest?{query}", + base_url.trim_end_matches('/') + ) +} + +async fn archive_convergence_measure_sample( + args: &ArchiveConvergenceMeasureArgs, + client: &reqwest::Client, + sample_index: usize, +) -> ArchiveConvergenceMeasureSample { + let sample_unix_ms = now_unix_ms(); + let timeout = Duration::from_millis(args.timeout_ms.max(1)); + let temp_root = std::env::temp_dir().join(format!( + "ec-archive-convergence-measure-{}-{}-{}", + std::process::id(), + sample_unix_ms, + sample_index + )); + let mut fetches = Vec::new(); + let mut source_args = Vec::new(); + let (sd_agent_manifests, mut discovery_fetches) = + archive_measure_prometheus_sd_agent_manifests(args); + fetches.append(&mut discovery_fetches); + + let mut specs = Vec::::new(); + for item in args.agent_manifest.iter().chain(sd_agent_manifests.iter()) { + let (name, base_url) = match parse_named_endpoint(item) { + Ok(parsed) => parsed, + Err(err) => { + fetches.push(ArchiveConvergenceMeasureFetch { + name: item.clone(), + kind: "agent-manifest".to_string(), + url: String::new(), + ok: false, + status: None, + elapsed_ms: 0, + record_count: 0, + invalid_record_count: 0, + error: Some(err.to_string()), + }); + continue; + } + }; + let url = archive_measure_agent_url(args, &base_url); + specs.push(ArchiveConvergenceMeasureFetchSpec { + name, + kind: 
"agent-manifest".to_string(), + url, + tail_range: false, + }); + } + + for item in &args.manifest { + let (name, url) = match parse_named_endpoint(item) { + Ok(parsed) => parsed, + Err(err) => { + fetches.push(ArchiveConvergenceMeasureFetch { + name: item.clone(), + kind: "manifest".to_string(), + url: String::new(), + ok: false, + status: None, + elapsed_ms: 0, + record_count: 0, + invalid_record_count: 0, + error: Some(err.to_string()), + }); + continue; + } + }; + specs.push(ArchiveConvergenceMeasureFetchSpec { + name, + kind: "manifest".to_string(), + url, + tail_range: true, + }); + } + + let fetched_bodies = join_all(specs.iter().map(|spec| { + let response_max_bytes = if spec.kind == "agent-manifest" { + 0 + } else { + args.max_manifest_bytes + }; + archive_measure_fetch_body( + client, + &spec.url, + timeout, + response_max_bytes, + spec.tail_range, + ) + })) + .await; + + for (spec, fetched) in specs.into_iter().zip(fetched_bodies) { + let mut record_count = 0usize; + let mut invalid_record_count = 0usize; + let mut error = fetched.error.clone(); + if let Some(body) = fetched.body.as_deref() { + let parsed = if spec.kind == "agent-manifest" { + parse_agent_archive_records(body) + } else { + let (records, invalid) = parse_archive_records_jsonl(body); + Ok((records, invalid)) + }; + match parsed { + Ok((records, invalid)) => { + record_count = records.len(); + invalid_record_count = invalid; + let manifest_root = temp_root.join(sanitize_path_component(&spec.name)); + if let Err(err) = write_measure_records( + &manifest_root, + &args.broadcast, + &args.track, + &records, + ) { + error = Some(err.to_string()); + } else { + source_args.push(format!("{}={}", spec.name, manifest_root.display())); + } + } + Err(err) => error = Some(err.to_string()), + } + } + fetches.push(ArchiveConvergenceMeasureFetch { + name: spec.name, + kind: spec.kind, + url: spec.url, + ok: error.is_none(), + status: fetched.status, + elapsed_ms: fetched.elapsed_ms, + record_count, + 
invalid_record_count, + error, + }); + } + + let (convergence, convergence_error) = if source_args.len() >= 2 { + let report = archive_convergence_report(&ArchiveConvergenceArgs { + source: source_args, + broadcast: args.broadcast.clone(), + track: args.track.clone(), + stream_id: args.stream_id.clone(), + rendition: args.rendition.clone(), + start_sequence: args.start_sequence, + end_sequence: args.end_sequence, + pretty: false, + prometheus: false, + metrics_node: None, + metrics_role: "duplicate-proof".to_string(), + require_ok: false, + }); + match report { + Ok(report) => (Some(report), None), + Err(err) => (None, Some(err.to_string())), + } + } else { + ( + None, + Some("fewer than two fetched manifest sources".to_string()), + ) + }; + let prometheus = match args.prometheus_url.as_deref() { + Some(url) if !url.trim().is_empty() => { + archive_convergence_measure_prometheus(client, url, &args.broadcast, timeout).await + } + _ => Vec::new(), + }; + let _ = fs::remove_dir_all(&temp_root); + + ArchiveConvergenceMeasureSample { + sample_unix_ms, + sources: fetches, + convergence, + convergence_error, + prometheus, + } +} + +async fn archive_convergence_measure_prometheus( + client: &reqwest::Client, + prometheus_url: &str, + broadcast: &str, + timeout: Duration, +) -> Vec { + let base = prometheus_url.trim_end_matches('/'); + let queries = ARCHIVE_CONVERGENCE_PROMETHEUS_METRICS + .iter() + .map(|metric| { + let expr = format!("sum({metric}{{broadcast=\"{broadcast}\"}})"); + let url = format!("{base}/api/v1/query?query={}", urlencoding::encode(&expr)); + ((*metric).to_string(), url) + }) + .collect::>(); + let fetched = join_all( + queries + .iter() + .map(|(_, url)| archive_measure_fetch_body(client, url, timeout, 1024 * 1024, false)), + ) + .await; + let mut rows = Vec::with_capacity(queries.len()); + for ((metric, _), fetched) in queries.into_iter().zip(fetched) { + let mut row = ArchiveConvergenceMeasurePrometheusSample { + metric, + ok: false, + 
series_present: false, + value: None, + error: fetched.error.clone(), + }; + if let Some(body) = fetched.body.as_deref() { + match serde_json::from_str::(body) { + Ok(payload) => { + let result = payload + .get("data") + .and_then(|data| data.get("result")) + .and_then(|result| result.as_array()) + .cloned() + .unwrap_or_default(); + row.ok = true; + row.series_present = !result.is_empty(); + row.value = result + .first() + .and_then(|first| first.get("value")) + .and_then(|value| value.as_array()) + .and_then(|value| value.get(1)) + .and_then(|value| value.as_str()) + .and_then(|value| value.parse::().ok()); + } + Err(err) => row.error = Some(format!("invalid prometheus response: {err}")), + } + } + rows.push(row); + } + rows +} + +fn archive_convergence_measure_summary( + samples: &[ArchiveConvergenceMeasureSample], +) -> ArchiveConvergenceMeasureSummary { + archive_convergence_measure_summary_with_min_elapsed(samples, 0) +} + +fn archive_convergence_measure_summary_with_min_elapsed( + samples: &[ArchiveConvergenceMeasureSample], + min_elapsed_ms: u64, +) -> ArchiveConvergenceMeasureSummary { + let mut reasons = Vec::new(); + if samples.is_empty() { + return ArchiveConvergenceMeasureSummary { + ok: false, + elapsed_ms: 0, + sample_count: 0, + reasons: vec!["no_samples".to_string()], + prometheus_series_present_count: 0, + latest_matching_duplicate_sequences: None, + latest_divergent_sequences: None, + }; + } + let elapsed_ms = samples + .last() + .and_then(|last| { + samples + .first() + .map(|first| last.sample_unix_ms.saturating_sub(first.sample_unix_ms)) + }) + .unwrap_or(0); + if samples.len() < 2 || elapsed_ms == 0 { + reasons.push("insufficient_elapsed_samples".to_string()); + } + if min_elapsed_ms > 0 && elapsed_ms < min_elapsed_ms { + reasons.push("elapsed_window_too_short".to_string()); + } + let latest = samples.last().expect("checked non-empty samples"); + if latest.sources.iter().any(|source| !source.ok) { + 
reasons.push("source_fetch_failed".to_string()); + } + let (matching, divergent) = match latest.convergence.as_ref() { + Some(report) => { + if !report.ok { + reasons.push("archive_convergence_failed".to_string()); + reasons.extend( + report + .reasons + .iter() + .map(|reason| format!("archive_{reason}")), + ); + } + let matching = report.summary.matching_duplicate_sequences.len(); + if matching == 0 { + reasons.push("no_matching_duplicate_sequences".to_string()); + } + let divergent = report.summary.divergent_sequences.len(); + (Some(matching), Some(divergent)) + } + None => { + reasons.push("archive_convergence_missing".to_string()); + (None, None) + } + }; + if !latest.prometheus.is_empty() { + let series_count = latest + .prometheus + .iter() + .filter(|row| row.series_present) + .count(); + if series_count == 0 { + reasons.push("prometheus_duplicate_series_missing".to_string()); + } + if latest.prometheus.iter().any(|row| !row.ok) { + reasons.push("prometheus_query_failed".to_string()); + } + if latest.prometheus.iter().any(|row| { + row.metric.ends_with("hash_divergent_sequences") && row.value.unwrap_or(0.0) > 0.0 + }) { + reasons.push("prometheus_hash_divergence_nonzero".to_string()); + } + if latest.prometheus.iter().any(|row| { + row.metric.ends_with("missing_source_identity_records") + && row.value.unwrap_or(0.0) > 0.0 + }) { + reasons.push("prometheus_source_identity_missing_nonzero".to_string()); + } + } + reasons.sort(); + reasons.dedup(); + ArchiveConvergenceMeasureSummary { + ok: reasons.is_empty(), + elapsed_ms, + sample_count: samples.len(), + reasons, + prometheus_series_present_count: latest + .prometheus + .iter() + .filter(|row| row.series_present) + .count(), + latest_matching_duplicate_sequences: matching, + latest_divergent_sequences: divergent, + } +} + +async fn archive_convergence_measure_report( + args: &ArchiveConvergenceMeasureArgs, +) -> Result { + if args.agent_manifest.is_empty() + && args.agent_prometheus_sd.is_empty() + && 
args.manifest.is_empty() + { + return Err(anyhow!( + "archive-convergence-measure requires at least two --agent-manifest or --manifest sources" + )); + } + if args.agent_prometheus_sd.is_empty() && args.agent_manifest.len() + args.manifest.len() < 2 { + return Err(anyhow!( + "archive-convergence-measure requires at least two manifest sources" + )); + } + if args.timeout_ms == 0 { + return Err(anyhow!("--timeout-ms must be greater than 0")); + } + if args.max_manifest_bytes == 0 { + return Err(anyhow!("--max-manifest-bytes must be greater than 0")); + } + let client = reqwest::Client::builder() + .user_agent("every-channel-archive-convergence-measure/1") + .build() + .context("failed to build HTTP client")?; + let started = Instant::now(); + let mut samples = Vec::new(); + loop { + let sample = archive_convergence_measure_sample(args, &client, samples.len()).await; + samples.push(sample); + if args.duration_seconds <= 0.0 || started.elapsed().as_secs_f64() >= args.duration_seconds + { + break; + } + tokio::time::sleep(Duration::from_secs_f64( + args.poll_interval_seconds.max(0.001), + )) + .await; + } + let summary = archive_convergence_measure_summary(&samples); + Ok(ArchiveConvergenceMeasureReport { + ok: summary.ok, + broadcast: args.broadcast.clone(), + track: args.track.clone(), + duration_seconds: args.duration_seconds, + summary, + samples, + }) +} + +async fn archive_convergence_measure_command(args: ArchiveConvergenceMeasureArgs) -> Result<()> { + let report = archive_convergence_measure_report(&args).await?; + if args.pretty { + println!("{}", serde_json::to_string_pretty(&report)?); + } else { + println!("{}", serde_json::to_string(&report)?); + } + if args.require_ok && !report.ok { + return Err(anyhow!( + "archive convergence measurement failed for {}/{}: {}", + report.broadcast, + report.track, + report.summary.reasons.join(",") + )); + } + Ok(()) +} + +#[derive(Debug, Default)] +struct ArchiveConvergenceMeasureServeState { + samples: VecDeque, + 
next_sample_index: usize, +} + +fn archive_convergence_measure_args_from_serve( + args: &ArchiveConvergenceMeasureServeArgs, +) -> ArchiveConvergenceMeasureArgs { + ArchiveConvergenceMeasureArgs { + agent_manifest: args.agent_manifest.clone(), + agent_prometheus_sd: args.agent_prometheus_sd.clone(), + agent_prometheus_sd_label: args.agent_prometheus_sd_label.clone(), + agent_manifest_role: args.agent_manifest_role.clone(), + manifest: args.manifest.clone(), + broadcast: args.broadcast.clone(), + track: args.track.clone(), + stream_id: args.stream_id.clone(), + rendition: args.rendition.clone(), + start_sequence: args.start_sequence, + end_sequence: args.end_sequence, + prometheus_url: args.prometheus_url.clone(), + duration_seconds: 0.0, + poll_interval_seconds: 30.0, + timeout_ms: args.timeout_ms, + max_manifest_bytes: args.max_manifest_bytes, + pretty: false, + require_ok: false, + } +} + +fn archive_convergence_args_from_measure_serve( + args: &ArchiveConvergenceMeasureServeArgs, +) -> ArchiveConvergenceArgs { + ArchiveConvergenceArgs { + source: Vec::new(), + broadcast: args.broadcast.clone(), + track: args.track.clone(), + stream_id: args.stream_id.clone(), + rendition: args.rendition.clone(), + start_sequence: args.start_sequence, + end_sequence: args.end_sequence, + pretty: false, + prometheus: true, + metrics_node: args.metrics_node.clone(), + metrics_role: args.metrics_role.clone(), + require_ok: false, + } +} + +fn archive_convergence_measure_report_from_samples( + args: &ArchiveConvergenceMeasureServeArgs, + samples: Vec, +) -> ArchiveConvergenceMeasureReport { + let min_elapsed_ms = + Duration::from_secs_f64(args.min_elapsed_seconds.max(0.0)).as_millis() as u64; + let summary = archive_convergence_measure_summary_with_min_elapsed(&samples, min_elapsed_ms); + ArchiveConvergenceMeasureReport { + ok: summary.ok, + broadcast: args.broadcast.clone(), + track: args.track.clone(), + duration_seconds: args.min_elapsed_seconds, + summary, + samples, + } +} + +fn 
archive_convergence_measure_prometheus_metrics( + report: &ArchiveConvergenceMeasureReport, + args: &ArchiveConvergenceMeasureServeArgs, +) -> String { + let latest = report.samples.last(); + let latest_convergence = latest.and_then(|sample| sample.convergence.as_ref()); + let stream_id = latest_convergence + .map(|convergence| convergence.stream_id.clone()) + .unwrap_or_else(|| { + archive_convergence_stream_id(&args.broadcast, args.stream_id.as_deref()) + }); + let rendition_id = latest_convergence + .map(|convergence| convergence.rendition_id.clone()) + .unwrap_or_else(|| { + archive_convergence_rendition(&args.broadcast, &args.track, args.rendition.as_deref()) + }); + let labels = archive_convergence_prometheus_labels( + &report.broadcast, + &report.track, + &stream_id, + &rendition_id, + args.metrics_node.as_deref(), + &args.metrics_role, + ); + + let mut output = String::new(); + push_prometheus_gauge( + &mut output, + "every_channel_archive_convergence_measure_scrape_ok", + &labels, + 1.0, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_convergence_measure_ok", + &labels, + if report.ok { 1.0 } else { 0.0 }, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_convergence_measure_elapsed_seconds", + &labels, + report.summary.elapsed_ms as f64 / 1000.0, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_convergence_measure_min_elapsed_seconds", + &labels, + args.min_elapsed_seconds.max(0.0), + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_convergence_measure_samples", + &labels, + report.summary.sample_count as f64, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_convergence_measure_prometheus_series_present", + &labels, + report.summary.prometheus_series_present_count as f64, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_convergence_measure_matching_duplicate_sequences", + &labels, + report + .summary + 
.latest_matching_duplicate_sequences + .unwrap_or(0) as f64, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_convergence_measure_divergent_sequences", + &labels, + report.summary.latest_divergent_sequences.unwrap_or(0) as f64, + ); + + for reason in &report.summary.reasons { + let mut reason_labels = labels.clone(); + reason_labels.insert("reason".to_string(), reason.clone()); + push_prometheus_gauge( + &mut output, + "every_channel_archive_convergence_measure_reason", + &reason_labels, + 1.0, + ); + } + + if let Some(sample) = latest { + for source in &sample.sources { + let mut source_labels = labels.clone(); + source_labels.insert("source".to_string(), source.name.clone()); + source_labels.insert("kind".to_string(), source.kind.clone()); + push_prometheus_gauge( + &mut output, + "every_channel_archive_convergence_measure_source_fetch_ok", + &source_labels, + if source.ok { 1.0 } else { 0.0 }, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_convergence_measure_source_fetch_elapsed_seconds", + &source_labels, + source.elapsed_ms as f64 / 1000.0, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_convergence_measure_source_records", + &source_labels, + source.record_count as f64, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_convergence_measure_source_invalid_records", + &source_labels, + source.invalid_record_count as f64, + ); + push_prometheus_gauge( + &mut output, + "every_channel_archive_convergence_measure_source_http_status", + &source_labels, + source.status.unwrap_or(0) as f64, + ); + } + + for prometheus in &sample.prometheus { + let mut prometheus_labels = labels.clone(); + prometheus_labels.insert("metric".to_string(), prometheus.metric.clone()); + push_prometheus_gauge( + &mut output, + "every_channel_archive_convergence_measure_prometheus_query_ok", + &prometheus_labels, + if prometheus.ok { 1.0 } else { 0.0 }, + ); + push_prometheus_gauge( + &mut output, + 
"every_channel_archive_convergence_measure_prometheus_query_series_present", + &prometheus_labels, + if prometheus.series_present { 1.0 } else { 0.0 }, + ); + if let Some(value) = prometheus.value { + push_prometheus_gauge( + &mut output, + "every_channel_archive_convergence_measure_prometheus_query_value", + &prometheus_labels, + value, + ); + } + } + + if let Some(err) = sample.convergence_error.as_deref() { + output.push_str("# archive_convergence_measure_error "); + output.push_str(&prometheus_escape_label_value(err)); + output.push('\n'); + } + } + + let convergence_args = archive_convergence_args_from_measure_serve(args); + if let Some(convergence) = latest_convergence { + output.push_str(&archive_convergence_prometheus_metrics( + convergence, + &convergence_args, + )); + } else { + let err = anyhow!( + "latest archive convergence measurement sample did not produce a convergence report" + ); + output.push_str(&archive_convergence_prometheus_error_metrics( + &convergence_args, + &err, + )); + } + + output +} + +async fn archive_convergence_measure_serve_response( + args: &ArchiveConvergenceMeasureServeArgs, + state: &Arc>, + client: &reqwest::Client, + method: &str, + target: &str, +) -> ArchiveHttpResponse { + let path = target.split('?').next().unwrap_or(target); + if method != "GET" && method != "HEAD" { + return archive_error(405, "method not allowed"); + } + match path { + "/health" => archive_response( + 200, + "application/json", + br#"{"ok":true,"service":"archive-convergence-measure-serve"}"#.to_vec(), + ), + "/metrics" => { + let measure_args = archive_convergence_measure_args_from_serve(args); + let sample_index = { + let mut guard = state + .lock() + .expect("archive convergence measure state poisoned"); + let index = guard.next_sample_index; + guard.next_sample_index = guard.next_sample_index.saturating_add(1); + index + }; + let sample = + archive_convergence_measure_sample(&measure_args, client, sample_index).await; + let samples = { + let mut 
guard = state + .lock() + .expect("archive convergence measure state poisoned"); + guard.samples.push_back(sample); + while guard.samples.len() > args.max_samples { + guard.samples.pop_front(); + } + guard.samples.iter().cloned().collect::>() + }; + let report = archive_convergence_measure_report_from_samples(args, samples); + let body = archive_convergence_measure_prometheus_metrics(&report, args); + archive_response( + 200, + "text/plain; version=0.0.4; charset=utf-8", + body.into_bytes(), + ) + } + _ => archive_error(404, "not found"), + } +} + +async fn handle_archive_convergence_measure_serve_connection( + mut stream: tokio::net::TcpStream, + args: Arc, + state: Arc>, + client: reqwest::Client, +) -> Result<()> { + let mut buffer = vec![0_u8; 4096]; + let read = stream + .read(&mut buffer) + .await + .context("failed to read archive convergence measure HTTP request")?; + if read == 0 { + return Ok(()); + } + let request = String::from_utf8_lossy(&buffer[..read]); + let first_line = request.lines().next().unwrap_or_default(); + let mut parts = first_line.split_whitespace(); + let method = parts.next().unwrap_or_default(); + let target = parts.next().unwrap_or("/"); + let response = + archive_convergence_measure_serve_response(&args, &state, &client, method, target).await; + let include_body = method != "HEAD"; + let body_len = if include_body { response.body.len() } else { 0 }; + let status_text = match response.status { + 200 => "OK", + 404 => "Not Found", + 405 => "Method Not Allowed", + _ => "Error", + }; + let header = format!( + "HTTP/1.1 {} {}\r\nContent-Type: {}\r\nContent-Length: {}\r\nConnection: close\r\n\r\n", + response.status, status_text, response.content_type, body_len + ); + stream.write_all(header.as_bytes()).await?; + if include_body { + stream.write_all(&response.body).await?; + } + Ok(()) +} + +async fn archive_convergence_measure_serve(args: ArchiveConvergenceMeasureServeArgs) -> Result<()> { + if args.agent_manifest.is_empty() + && 
args.agent_prometheus_sd.is_empty() + && args.manifest.is_empty() + { + return Err(anyhow!( + "archive-convergence-measure-serve requires at least two --agent-manifest or --manifest sources" + )); + } + if args.agent_prometheus_sd.is_empty() && args.agent_manifest.len() + args.manifest.len() < 2 { + return Err(anyhow!( + "archive-convergence-measure-serve requires at least two manifest sources" + )); + } + if args.timeout_ms == 0 { + return Err(anyhow!("--timeout-ms must be greater than 0")); + } + if args.max_manifest_bytes == 0 { + return Err(anyhow!("--max-manifest-bytes must be greater than 0")); + } + if args.max_samples < 2 { + return Err(anyhow!("--max-samples must be at least 2")); + } + if !args.min_elapsed_seconds.is_finite() || args.min_elapsed_seconds < 0.0 { + return Err(anyhow!( + "--min-elapsed-seconds must be a finite non-negative value" + )); + } + for source in args.agent_manifest.iter().chain(args.manifest.iter()) { + parse_named_endpoint(source)?; + } + for filter in &args.agent_prometheus_sd_label { + parse_label_filter(filter)?; + } + + let listener = TcpListener::bind(&args.listen) + .await + .with_context(|| format!("failed to bind {}", args.listen))?; + let client = reqwest::Client::builder() + .user_agent("every-channel-archive-convergence-measure-serve/1") + .build() + .context("failed to build HTTP client")?; + tracing::info!( + listen = %args.listen, + broadcast = %args.broadcast, + track = %args.track, + "serving remote archive convergence measurement metrics" + ); + let args = Arc::new(args); + let state = Arc::new(Mutex::new(ArchiveConvergenceMeasureServeState::default())); + loop { + let (stream, peer) = listener + .accept() + .await + .context("failed to accept archive convergence measurement metrics connection")?; + let args_clone = args.clone(); + let state_clone = state.clone(); + let client_clone = client.clone(); + tokio::spawn(async move { + if let Err(err) = handle_archive_convergence_measure_serve_connection( + stream, + 
args_clone, + state_clone, + client_clone, + ) + .await + { + tracing::debug!(peer = %peer, err = %err, "archive convergence measurement metrics request failed"); + } + }); + } +} + +fn archive_convergence_args_from_serve( + args: &ArchiveConvergenceServeArgs, +) -> ArchiveConvergenceArgs { + ArchiveConvergenceArgs { + source: args.source.clone(), + broadcast: args.broadcast.clone(), + track: args.track.clone(), + stream_id: args.stream_id.clone(), + rendition: args.rendition.clone(), + start_sequence: args.start_sequence, + end_sequence: args.end_sequence, + pretty: false, + prometheus: true, + metrics_node: args.metrics_node.clone(), + metrics_role: args.metrics_role.clone(), + require_ok: false, + } +} + +fn archive_convergence_serve_response( + args: &ArchiveConvergenceServeArgs, + method: &str, + target: &str, +) -> ArchiveHttpResponse { + let path = target.split('?').next().unwrap_or(target); + if method != "GET" && method != "HEAD" { + return archive_error(405, "method not allowed"); + } + match path { + "/health" => archive_response( + 200, + "application/json", + br#"{"ok":true,"service":"archive-convergence-serve"}"#.to_vec(), + ), + "/metrics" => { + let convergence_args = archive_convergence_args_from_serve(args); + let body = match archive_convergence_report(&convergence_args) { + Ok(report) => archive_convergence_prometheus_metrics(&report, &convergence_args), + Err(err) => archive_convergence_prometheus_error_metrics(&convergence_args, &err), + }; + archive_response( + 200, + "text/plain; version=0.0.4; charset=utf-8", + body.into_bytes(), + ) + } + _ => archive_error(404, "not found"), + } +} + +async fn handle_archive_convergence_serve_connection( + mut stream: tokio::net::TcpStream, + args: Arc, +) -> Result<()> { + let mut buffer = vec![0_u8; 4096]; + let read = stream + .read(&mut buffer) + .await + .context("failed to read archive convergence HTTP request")?; + if read == 0 { + return Ok(()); + } + let request = 
String::from_utf8_lossy(&buffer[..read]); + let first_line = request.lines().next().unwrap_or_default(); + let mut parts = first_line.split_whitespace(); + let method = parts.next().unwrap_or_default(); + let target = parts.next().unwrap_or("/"); + let response = archive_convergence_serve_response(&args, method, target); + let include_body = method != "HEAD"; + let body_len = if include_body { response.body.len() } else { 0 }; + let status_text = match response.status { + 200 => "OK", + 404 => "Not Found", + 405 => "Method Not Allowed", + _ => "Error", + }; + let header = format!( + "HTTP/1.1 {} {}\r\nContent-Type: {}\r\nContent-Length: {}\r\nConnection: close\r\n\r\n", + response.status, status_text, response.content_type, body_len + ); + stream.write_all(header.as_bytes()).await?; + if include_body { + stream.write_all(&response.body).await?; + } + Ok(()) +} + +async fn archive_convergence_serve(args: ArchiveConvergenceServeArgs) -> Result<()> { + if args.source.len() < 2 { + return Err(anyhow!( + "archive-convergence-serve requires at least two --source NAME=PATH entries" + )); + } + for source in &args.source { + parse_named_path(source)?; + } + + let listener = TcpListener::bind(&args.listen) + .await + .with_context(|| format!("failed to bind {}", args.listen))?; + tracing::info!( + listen = %args.listen, + broadcast = %args.broadcast, + track = %args.track, + "serving archive convergence metrics" + ); + let args = Arc::new(args); + loop { + let (stream, peer) = listener + .accept() + .await + .context("failed to accept archive convergence metrics connection")?; + let args_clone = args.clone(); + tokio::spawn(async move { + if let Err(err) = handle_archive_convergence_serve_connection(stream, args_clone).await + { + tracing::debug!(peer = %peer, err = %err, "archive convergence metrics request failed"); + } + }); + } +} + +#[derive(Debug, serde::Serialize)] +struct SimControlPlaneOutput { + ok: bool, + invariant: 
ec_core::sim::ControlPlanePropagationInvariantConfig, + scenario_template: ec_core::sim::ControlPlanePropagationScenario, + campaign: ec_core::sim::ControlPlanePropagationCampaignReport, +} + +fn sim_control_plane_output(args: &SimControlPlaneArgs) -> Result { + if let Some(path) = &args.scenario_json { + return sim_control_plane_replay_output(args, path); + } + if args.iterations == 0 { + return Err(anyhow!("sim-control-plane requires --iterations > 0")); + } + if args.transient_drop_per_million > 1_000_000 { + return Err(anyhow!( + "--transient-drop-per-million must be <= 1000000, got {}", + args.transient_drop_per_million + )); + } + + let nodes = sim_control_plane_nodes(args)?; + let partitions = if args.partitions.is_empty() { + vec![ + ec_core::sim::SimulationPartition::new("relay-hel", 70, 190, 55), + ec_core::sim::SimulationPartition::new("tower", 220, 310, 40), + ] + } else { + args.partitions + .iter() + .map(|value| parse_sim_partition(value)) + .collect::>>()? + }; + let node_outages = if args.node_outages.is_empty() { + vec![ec_core::sim::SimulationOutage::new( + "relay-nyc", + 105, + 205, + 45, + )] + } else { + args.node_outages + .iter() + .map(|value| parse_sim_node_outage(value)) + .collect::>>()? 
+ }; + + let invariant = ec_core::sim::ControlPlanePropagationInvariantConfig { + require_node_count: nodes.len() as u64, + require_complete: true, + max_propagation_complete_ms: if args.max_propagation_complete_ms == 0 { + None + } else { + Some(args.max_propagation_complete_ms) + }, + }; + + let build_scenario = |seed| { + let mut scenario = ec_core::sim::ControlPlanePropagationScenario::new( + seed, + nodes.clone(), + &args.origin_node, + &args.topic, + &args.announcement_id, + ); + scenario.fanout = args.fanout; + scenario.gossip_interval_ms = args.gossip_interval_ms; + scenario.max_gossip_rounds = args.max_gossip_rounds; + scenario.base_network_delay_ms = args.base_network_delay_ms; + scenario.max_jitter_ms = args.max_jitter_ms; + scenario.transient_drop_per_million = args.transient_drop_per_million; + scenario.partitions = partitions.clone(); + scenario.node_outages = node_outages.clone(); + scenario + }; + + let seed_start = ec_core::sim::SimulationSeed::new(args.seed); + let scenario_template = build_scenario(seed_start); + let campaign = ec_core::sim::run_control_plane_propagation_campaign( + "control-plane-propagation-fault-campaign", + seed_start, + args.iterations, + &invariant, + build_scenario, + ); + let ok = campaign.all_passed(); + + Ok(SimControlPlaneOutput { + ok, + invariant, + scenario_template, + campaign, + }) +} + +fn sim_control_plane_replay_output( + args: &SimControlPlaneArgs, + path: &Path, +) -> Result { + let scenario = read_sim_control_plane_scenario(path)?; + validate_sim_control_plane_scenario(&scenario)?; + + let invariant = ec_core::sim::ControlPlanePropagationInvariantConfig { + require_node_count: scenario.nodes.len() as u64, + require_complete: true, + max_propagation_complete_ms: if args.max_propagation_complete_ms == 0 { + None + } else { + Some(args.max_propagation_complete_ms) + }, + }; + let seed_start = scenario.seed; + let scenario_template = scenario.clone(); + let campaign = 
ec_core::sim::run_control_plane_propagation_campaign( + "control-plane-replay-scenario", + seed_start, + 1, + &invariant, + |_| scenario.clone(), + ); + let ok = campaign.all_passed(); + + Ok(SimControlPlaneOutput { + ok, + invariant, + scenario_template, + campaign, + }) +} + +fn read_sim_control_plane_scenario( + path: &Path, +) -> Result { + let mut bytes = Vec::new(); + if path == Path::new("-") { + std::io::stdin() + .read_to_end(&mut bytes) + .context("failed to read control-plane simulation scenario from stdin")?; + } else { + bytes = fs::read(path).with_context(|| { + format!( + "failed to read control-plane simulation scenario {}", + path.display() + ) + })?; + } + serde_json::from_slice::(&bytes).with_context( + || { + format!( + "failed to parse control-plane simulation scenario JSON from {}", + path.display() + ) + }, + ) +} + +fn validate_sim_control_plane_scenario( + scenario: &ec_core::sim::ControlPlanePropagationScenario, +) -> Result<()> { + if scenario.nodes.len() < 2 { + return Err(anyhow!( + "control-plane simulation scenario requires at least two nodes" + )); + } + if !scenario.nodes.contains(&scenario.origin_node) { + return Err(anyhow!( + "control-plane simulation origin node {} is not in node list", + scenario.origin_node + )); + } + if scenario.topic.trim().is_empty() || scenario.announcement_id.trim().is_empty() { + return Err(anyhow!( + "control-plane simulation topic and announcement id cannot be empty" + )); + } + if scenario.transient_drop_per_million > 1_000_000 { + return Err(anyhow!( + "control-plane simulation transient_drop_per_million must be <= 1000000" + )); + } + for partition in &scenario.partitions { + if partition.source_node.trim().is_empty() { + return Err(anyhow!( + "control-plane partition source node cannot be empty" + )); + } + if !scenario.nodes.contains(&partition.source_node) { + return Err(anyhow!( + "control-plane partition references unknown node {}", + partition.source_node + )); + } + if partition.end_ms <= 
partition.start_ms { + return Err(anyhow!( + "control-plane partition end_ms must be greater than start_ms for {}", + partition.source_node + )); + } + } + for outage in &scenario.node_outages { + if outage.source_node.trim().is_empty() { + return Err(anyhow!( + "control-plane node outage source node cannot be empty" + )); + } + if !scenario.nodes.contains(&outage.source_node) { + return Err(anyhow!( + "control-plane node outage references unknown node {}", + outage.source_node + )); + } + if outage.end_ms <= outage.start_ms { + return Err(anyhow!( + "control-plane node outage end_ms must be greater than start_ms for {}", + outage.source_node + )); + } + } + Ok(()) +} + +fn sim_control_plane_command(args: SimControlPlaneArgs) -> Result<()> { + let output = sim_control_plane_output(&args)?; + if args.pretty { + println!("{}", serde_json::to_string_pretty(&output)?); + } else { + println!("{}", serde_json::to_string(&output)?); + } + if !output.ok && !args.allow_failure { + let replay_hint = output + .campaign + .first_failure + .as_ref() + .map(|failure| failure.replay_hint.as_str()) + .unwrap_or("no replay hint"); + return Err(anyhow!( + "control-plane simulation invariants failed: {replay_hint}" + )); + } + Ok(()) +} + +fn sim_control_plane_nodes(args: &SimControlPlaneArgs) -> Result> { + let nodes = if args.nodes.is_empty() { + vec![ + "nuc-a".to_string(), + "nuc-b".to_string(), + "tower".to_string(), + "forge".to_string(), + "relay-lax".to_string(), + "relay-nyc".to_string(), + "relay-hel".to_string(), + ] + } else { + args.nodes + .iter() + .map(|node| node.trim()) + .filter(|node| !node.is_empty()) + .map(ToString::to_string) + .collect::>() + }; + if nodes.len() < 2 { + return Err(anyhow!( + "sim-control-plane requires at least two --node values" + )); + } + let mut unique = BTreeSet::new(); + for node in &nodes { + if !unique.insert(node.clone()) { + return Err(anyhow!("sim-control-plane node {} is duplicated", node)); + } + } + if 
!nodes.contains(&args.origin_node) { + return Err(anyhow!( + "sim-control-plane origin node {} is not in node list", + args.origin_node + )); + } + Ok(nodes) +} + +fn parse_sim_node_outage(value: &str) -> Result { + let parts = value.split(':').collect::>(); + if parts.len() != 4 { + return Err(anyhow!( + "expected node outage NODE:START_MS:END_MS:RECOVERY_DELAY_MS, got {value:?}" + )); + } + let source_node = parts[0].trim(); + if source_node.is_empty() { + return Err(anyhow!("node outage source node cannot be empty")); + } + let start_ms = parse_sim_u64("node outage start_ms", parts[1])?; + let end_ms = parse_sim_u64("node outage end_ms", parts[2])?; + let recovery_delay_ms = parse_sim_u64("node outage recovery_delay_ms", parts[3])?; + if end_ms <= start_ms { + return Err(anyhow!( + "node outage end_ms must be greater than start_ms for {source_node}" + )); + } + Ok(ec_core::sim::SimulationOutage::new( + source_node, + start_ms, + end_ms, + recovery_delay_ms, + )) +} + +#[derive(Debug, serde::Serialize)] +struct SimSystemOutput { + ok: bool, + invariant: ec_core::sim::SystemDuplicatePublisherInvariantConfig, + fault_coverage: SimSystemFaultCoverageReport, + runtime: SimSystemRuntimeReport, + scenario_template: ec_core::sim::SystemDuplicatePublisherScenario, + campaign: ec_core::sim::SystemDuplicatePublisherCampaignReport, +} + +#[derive(Debug, serde::Serialize)] +struct SimSystemFaultCoverageReport { + required: bool, + ok: bool, + min_seed_coverage: u64, + failures: Vec, +} + +#[derive(Debug, serde::Serialize)] +struct SimSystemRuntimeReport { + wall_elapsed_ms: f64, + iterations_per_second: f64, + simulated_system_seconds_per_wall_second: f64, + trace_events_per_second: f64, +} + +fn sim_system_output(args: &SimSystemArgs) -> Result { + if let Some(path) = &args.scenario_json { + return sim_system_replay_output(args, path); + } + if args.iterations == 0 { + return Err(anyhow!("sim-system requires --iterations > 0")); + } + if args.sequence_count == 0 { + return 
Err(anyhow!("sim-system requires --sequence-count > 0")); + } + + let nodes = sim_system_nodes(args)?; + let publishers = sim_system_publishers(args, &nodes)?; + let publisher_source_material = + sim_publisher_source_material(&args.publisher_source_material, &publishers)?; + let invariant = ec_core::sim::SystemDuplicatePublisherInvariantConfig { + require_control_complete: true, + require_media_duplicate_complete: true, + require_media_timing: true, + max_system_complete_ms: if args.max_system_complete_ms == 0 { + None + } else { + Some(args.max_system_complete_ms) + }, + }; + + let build_scenario = |seed| { + sim_system_default_scenario(args, &nodes, &publishers, &publisher_source_material, seed) + }; + let seed_start = ec_core::sim::SimulationSeed::new(args.seed); + let scenario_template = build_scenario(seed_start); + let runtime_start = Instant::now(); + let campaign = ec_core::sim::run_system_duplicate_publisher_campaign( + "system-duplicate-publisher-fault-campaign", + seed_start, + args.iterations, + &invariant, + build_scenario, + ); + let runtime = sim_system_runtime_report(runtime_start, &campaign); + let fault_coverage = sim_system_fault_coverage_report(args, &campaign); + let ok = campaign.all_passed() && fault_coverage.ok; + + Ok(SimSystemOutput { + ok, + invariant, + fault_coverage, + runtime, + scenario_template, + campaign, + }) +} + +fn sim_system_replay_output(args: &SimSystemArgs, path: &Path) -> Result { + let scenario = read_sim_system_scenario(path)?; + validate_sim_system_scenario(&scenario)?; + + let invariant = ec_core::sim::SystemDuplicatePublisherInvariantConfig { + require_control_complete: true, + require_media_duplicate_complete: true, + require_media_timing: true, + max_system_complete_ms: if args.max_system_complete_ms == 0 { + None + } else { + Some(args.max_system_complete_ms) + }, + }; + let seed_start = scenario.seed; + let scenario_template = scenario.clone(); + let runtime_start = Instant::now(); + let campaign = 
ec_core::sim::run_system_duplicate_publisher_campaign( + "system-duplicate-publisher-replay-scenario", + seed_start, + 1, + &invariant, + |_| scenario.clone(), + ); + let runtime = sim_system_runtime_report(runtime_start, &campaign); + let fault_coverage = sim_system_fault_coverage_report(args, &campaign); + let ok = campaign.all_passed() && fault_coverage.ok; + + Ok(SimSystemOutput { + ok, + invariant, + fault_coverage, + runtime, + scenario_template, + campaign, + }) +} + +fn sim_system_fault_coverage_report( + args: &SimSystemArgs, + campaign: &ec_core::sim::SystemDuplicatePublisherCampaignReport, +) -> SimSystemFaultCoverageReport { + let required = + args.require_fault_coverage || args.fault_profile == SimSystemFaultProfile::FoundationDb; + let min_seed_coverage = sim_system_min_fault_seed_coverage(args, campaign); + let failures = if required { + sim_system_fault_coverage_failures(campaign, min_seed_coverage) + } else { + Vec::new() + }; + SimSystemFaultCoverageReport { + required, + ok: failures.is_empty(), + min_seed_coverage, + failures, + } +} + +fn sim_system_min_fault_seed_coverage( + args: &SimSystemArgs, + campaign: &ec_core::sim::SystemDuplicatePublisherCampaignReport, +) -> u64 { + if args.min_fault_seed_coverage > 0 { + return args.min_fault_seed_coverage; + } + if args.fault_profile == SimSystemFaultProfile::FoundationDb { + return (campaign.iterations / 32).max(2).min(campaign.iterations); + } + 1 +} + +fn sim_system_fault_coverage_failures( + campaign: &ec_core::sim::SystemDuplicatePublisherCampaignReport, + min_seed_coverage: u64, +) -> Vec { + let mut failures = campaign.fault_coverage_failures(); + let thresholds = [ + ( + "control_transient_drops", + campaign.seeds_with_control_transient_drops, + ), + ( + "control_partition_delays", + campaign.seeds_with_control_partition_delays, + ), + ( + "control_node_outage_delays", + campaign.seeds_with_control_node_outage_delays, + ), + ( + "control_duplicate_messages", + 
campaign.seeds_with_control_duplicate_messages, + ), + ( + "media_transient_drops", + campaign.seeds_with_media_transient_drops, + ), + ( + "media_partition_delays", + campaign.seeds_with_media_partition_delays, + ), + ( + "media_publisher_outages", + campaign.seeds_with_media_publisher_outages, + ), + ( + "media_backfill", + campaign.seeds_with_media_backfill_observations, + ), + ]; + for (name, observed) in thresholds { + if observed < min_seed_coverage { + failures.push(format!("{name}_seed_coverage_below_min")); + } + } + if campaign.total_trace_events == 0 { + failures.push("trace_events_unobserved".to_string()); + } + if campaign.slowest_system_runs.is_empty() { + failures.push("slowest_system_runs_unobserved".to_string()); + } + failures +} + +fn sim_system_runtime_report( + started_at: Instant, + campaign: &ec_core::sim::SystemDuplicatePublisherCampaignReport, +) -> SimSystemRuntimeReport { + let elapsed = started_at.elapsed(); + let wall_elapsed_ms = elapsed.as_secs_f64() * 1000.0; + let wall_elapsed_seconds = elapsed.as_secs_f64().max(0.000_001); + SimSystemRuntimeReport { + wall_elapsed_ms, + iterations_per_second: campaign.iterations as f64 / wall_elapsed_seconds, + simulated_system_seconds_per_wall_second: campaign.total_system_complete_ms_observed as f64 + / 1000.0 + / wall_elapsed_seconds, + trace_events_per_second: campaign.total_trace_events as f64 / wall_elapsed_seconds, + } +} + +fn read_sim_system_scenario(path: &Path) -> Result { + let mut bytes = Vec::new(); + if path == Path::new("-") { + std::io::stdin() + .read_to_end(&mut bytes) + .context("failed to read system simulation scenario from stdin")?; + } else { + bytes = fs::read(path).with_context(|| { + format!( + "failed to read system simulation scenario {}", + path.display() + ) + })?; + } + serde_json::from_slice::(&bytes).with_context( + || { + format!( + "failed to parse system simulation scenario JSON from {}", + path.display() + ) + }, + ) +} + +fn validate_sim_system_scenario( + 
scenario: &ec_core::sim::SystemDuplicatePublisherScenario, +) -> Result<()> { + validate_sim_control_plane_scenario(&scenario.control)?; + validate_sim_duplicate_publishers_scenario(&scenario.media)?; + for publisher in &scenario.media.publisher_nodes { + if !scenario.control.nodes.contains(publisher) { + return Err(anyhow!( + "system simulation publisher {} is not in control node list", + publisher + )); + } + } + Ok(()) +} + +fn sim_system_command(args: SimSystemArgs) -> Result<()> { + let output = sim_system_output(&args)?; + if !output.ok { + if let Some(path) = &args.failure_artifact { + write_sim_system_failure_artifact(path, &args, &output)?; + eprintln!( + "wrote system simulation failure artifact to {}", + path.display() + ); + } + } + if args.pretty { + println!("{}", serde_json::to_string_pretty(&output)?); + } else { + println!("{}", serde_json::to_string(&output)?); + } + if !output.ok && !args.allow_failure { + let replay_hint = output + .campaign + .first_failure + .as_ref() + .map(|failure| failure.replay_hint.as_str()) + .unwrap_or("no replay hint"); + if output.fault_coverage.ok { + return Err(anyhow!( + "system simulation invariants failed: {replay_hint}" + )); + } + return Err(anyhow!( + "system simulation fault coverage failed: {:?}", + output.fault_coverage.failures + )); + } + Ok(()) +} + +#[derive(Debug, serde::Serialize)] +struct SimSystemFailureCampaignSummary { + name: String, + seed_start: ec_core::sim::SimulationSeed, + iterations: u64, + passed: u64, + failed: u64, +} + +#[derive(Debug, serde::Serialize)] +struct SimSystemFailureArtifact<'a> { + artifact_type: &'static str, + replay_scenario_pointer: &'static str, + replay_command: String, + first_failure_seed: ec_core::sim::SimulationSeed, + first_failure_replay_hint: &'a str, + campaign: SimSystemFailureCampaignSummary, + replay_scenario: &'a ec_core::sim::SystemDuplicatePublisherScenario, + invariant: &'a ec_core::sim::SystemDuplicatePublisherInvariantReport, + report: &'a 
ec_core::sim::SystemDuplicatePublisherSimulationReport, +} + +#[derive(Debug, serde::Serialize)] +struct SimSystemWeakCoverageArtifact<'a> { + artifact_type: &'static str, + rerun_command: String, + campaign: SimSystemFailureCampaignSummary, + fault_coverage: &'a SimSystemFaultCoverageReport, + invariant: &'a ec_core::sim::SystemDuplicatePublisherInvariantConfig, + scenario_template: &'a ec_core::sim::SystemDuplicatePublisherScenario, +} + +fn write_sim_system_failure_artifact( + path: &Path, + args: &SimSystemArgs, + output: &SimSystemOutput, +) -> Result<()> { + if let Some(parent) = path + .parent() + .filter(|parent| !parent.as_os_str().is_empty()) + { + fs::create_dir_all(parent).with_context(|| { + format!( + "failed to create system simulation failure artifact directory {}", + parent.display() + ) + })?; + } + let campaign = SimSystemFailureCampaignSummary { + name: output.campaign.name.clone(), + seed_start: output.campaign.seed_start, + iterations: output.campaign.iterations, + passed: output.campaign.passed, + failed: output.campaign.failed, + }; + let bytes = if let Some(failure) = output.campaign.first_failure.as_ref() { + let mut replay_command = format!( + "jq '.replay_scenario' {} | ec-node sim-system --scenario-json - --allow-failure --pretty", + path.display() + ); + if args.max_system_complete_ms != 3_500 { + replay_command.push_str(&format!( + " --max-system-complete-ms {}", + args.max_system_complete_ms + )); + } + let artifact = SimSystemFailureArtifact { + artifact_type: "every.channel.sim.system_duplicate_publishers.failure.v1", + replay_scenario_pointer: "/replay_scenario", + replay_command, + first_failure_seed: failure.seed, + first_failure_replay_hint: &failure.replay_hint, + campaign, + replay_scenario: &failure.scenario, + invariant: &failure.invariant, + report: &failure.report, + }; + serde_json::to_vec_pretty(&artifact) + .context("failed to serialize system simulation failure artifact")? 
+ } else if !output.fault_coverage.ok { + let artifact = SimSystemWeakCoverageArtifact { + artifact_type: "every.channel.sim.system_duplicate_publishers.weak_coverage.v1", + rerun_command: sim_system_campaign_rerun_command(args), + campaign, + fault_coverage: &output.fault_coverage, + invariant: &output.invariant, + scenario_template: &output.scenario_template, + }; + serde_json::to_vec_pretty(&artifact) + .context("failed to serialize system simulation weak coverage artifact")? + } else { + return Err(anyhow!( + "system simulation had no failure artifact to write" + )); + }; + fs::write(path, bytes).with_context(|| { + format!( + "failed to write system simulation failure artifact {}", + path.display() + ) + })?; + Ok(()) +} + +fn sim_system_campaign_rerun_command(args: &SimSystemArgs) -> String { + let mut command = format!( + "ec-node sim-system --seed {} --iterations {} --max-system-complete-ms {} --require-fault-coverage --pretty", + args.seed, args.iterations, args.max_system_complete_ms + ); + if args.fault_profile == SimSystemFaultProfile::FoundationDb { + command.push_str(" --fault-profile foundationdb"); + } + if args.min_fault_seed_coverage != 0 { + command.push_str(&format!( + " --min-fault-seed-coverage {}", + args.min_fault_seed_coverage + )); + } + if args.origin_node != "forge" { + command.push_str(&format!(" --origin-node {}", shell_arg(&args.origin_node))); + } + if args.topic != "ec.control.broadcast.la-kcop" { + command.push_str(&format!(" --topic {}", shell_arg(&args.topic))); + } + if args.announcement_id != "la-kcop@42" { + command.push_str(&format!( + " --announcement-id {}", + shell_arg(&args.announcement_id) + )); + } + if args.sequence_clock == SimPublisherSequenceClock::LocalActivation { + command.push_str(" --sequence-clock local-activation"); + } + if args.stream_id != "la-kcop" { + command.push_str(&format!(" --stream-id {}", shell_arg(&args.stream_id))); + } + if args.rendition != WT_LADDER_PRIMARY_RENDITION { + 
command.push_str(&format!(" --rendition {}", shell_arg(&args.rendition))); + } + if args.track != WT_PUBLISH_PRIMARY_VIDEO_TRACK { + command.push_str(&format!(" --track {}", shell_arg(&args.track))); + } + if args.profile != "x264-hd3-v1" { + command.push_str(&format!(" --profile {}", shell_arg(&args.profile))); + } + if args.sequence_count != 48 { + command.push_str(&format!(" --sequence-count {}", args.sequence_count)); + } + if args.segment_step_ms != 40 { + command.push_str(&format!(" --segment-step-ms {}", args.segment_step_ms)); + } + if args.publisher_activation_delay_ms != 25 { + command.push_str(&format!( + " --publisher-activation-delay-ms {}", + args.publisher_activation_delay_ms + )); + } + if args.publisher_backfill_delay_ms != 180 { + command.push_str(&format!( + " --publisher-backfill-delay-ms {}", + args.publisher_backfill_delay_ms + )); + } + for node in &args.nodes { + command.push_str(&format!(" --node {}", shell_arg(node))); + } + for publisher in &args.publishers { + command.push_str(&format!(" --publisher {}", shell_arg(publisher))); + } + for source_material in &args.publisher_source_material { + command.push_str(&format!( + " --publisher-source-material {}", + shell_arg(source_material) + )); + } + command +} + +fn shell_arg(value: &str) -> String { + if value + .chars() + .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '_' | '-' | '.' 
| '/' | ':' | '@')) + { + value.to_string() + } else { + format!("'{}'", value.replace('\'', "'\\''")) + } +} + +fn sim_system_nodes(args: &SimSystemArgs) -> Result> { + let nodes = if args.nodes.is_empty() { + vec![ + "forge".to_string(), + "nuc-a".to_string(), + "nuc-b".to_string(), + "tower".to_string(), + "relay-lax".to_string(), + "relay-nyc".to_string(), + "relay-hel".to_string(), + ] + } else { + args.nodes + .iter() + .map(|node| node.trim()) + .filter(|node| !node.is_empty()) + .map(ToString::to_string) + .collect::>() + }; + if nodes.len() < 2 { + return Err(anyhow!("sim-system requires at least two --node values")); + } + let mut unique = BTreeSet::new(); + for node in &nodes { + if !unique.insert(node.clone()) { + return Err(anyhow!("sim-system node {} is duplicated", node)); + } + } + if !nodes.contains(&args.origin_node) { + return Err(anyhow!( + "sim-system origin node {} is not in node list", + args.origin_node + )); + } + Ok(nodes) +} + +fn sim_system_publishers(args: &SimSystemArgs, nodes: &[String]) -> Result> { + let publishers = if args.publishers.is_empty() { + vec!["nuc-a".to_string(), "nuc-b".to_string()] + } else { + args.publishers + .iter() + .map(|node| node.trim()) + .filter(|node| !node.is_empty()) + .map(ToString::to_string) + .collect::>() + }; + if publishers.len() < 2 { + return Err(anyhow!( + "sim-system requires at least two --publisher values" + )); + } + let mut unique = BTreeSet::new(); + for publisher in &publishers { + if !unique.insert(publisher.clone()) { + return Err(anyhow!("sim-system publisher {} is duplicated", publisher)); + } + if !nodes.contains(publisher) { + return Err(anyhow!( + "sim-system publisher {} is not in node list", + publisher + )); + } + } + Ok(publishers) +} + +fn sim_system_default_scenario( + args: &SimSystemArgs, + nodes: &[String], + publishers: &[String], + publisher_source_material: &BTreeMap, + seed: ec_core::sim::SimulationSeed, +) -> ec_core::sim::SystemDuplicatePublisherScenario { + if 
args.fault_profile == SimSystemFaultProfile::FoundationDb { + return sim_system_foundationdb_scenario( + args, + nodes, + publishers, + publisher_source_material, + seed, + ); + } + + let mut control = ec_core::sim::ControlPlanePropagationScenario::new( + seed, + nodes.to_vec(), + &args.origin_node, + &args.topic, + &args.announcement_id, + ); + control.fanout = 3; + control.gossip_interval_ms = 35; + control.max_gossip_rounds = 12; + control.base_network_delay_ms = 6; + control.max_jitter_ms = 45; + control.transient_drop_per_million = 120_000; + control.partitions = sim_system_default_control_partitions(nodes, publishers); + control.node_outages = if nodes.iter().any(|node| node == "relay-nyc") { + vec![ec_core::sim::SimulationOutage::new( + "relay-nyc", + 105, + 205, + 45, + )] + } else { + Vec::new() + }; + + let mut media = ec_core::sim::DuplicatePublisherScenario::new( + ec_core::sim::SimulationSeed::new(seed.0 ^ 0x6d65_6469_6121), + publishers.to_vec(), + &args.stream_id, + &args.rendition, + &args.track, + &args.profile, + 0, + args.sequence_count, + ); + media.segment_step_ms = args.segment_step_ms; + media.base_network_delay_ms = 5; + media.max_jitter_ms = 75; + media.transient_drop_per_million = 275_000; + media.backfill_after_ms = 600; + media.partitions = publishers + .first() + .map(|publisher| ec_core::sim::SimulationPartition::new(publisher, 940, 1_260, 90)) + .into_iter() + .collect(); + media.publisher_outages = publishers + .get(1) + .map(|publisher| ec_core::sim::SimulationOutage::new(publisher, 1_360, 1_520, 220)) + .into_iter() + .collect(); + media.publisher_source_material = publisher_source_material.clone(); + + let mut scenario = ec_core::sim::SystemDuplicatePublisherScenario::new(seed, control, media); + scenario.publisher_activation_delay_ms = args.publisher_activation_delay_ms; + scenario.publisher_backfill_delay_ms = args.publisher_backfill_delay_ms; + scenario.sequence_clock = args.sequence_clock.into(); + scenario +} + +fn 
sim_system_foundationdb_scenario( + args: &SimSystemArgs, + nodes: &[String], + publishers: &[String], + publisher_source_material: &BTreeMap, + seed: ec_core::sim::SimulationSeed, +) -> ec_core::sim::SystemDuplicatePublisherScenario { + let config = ec_core::sim::FoundationStyleSystemScenarioConfig { + nodes: nodes.to_vec(), + publisher_nodes: publishers.to_vec(), + origin_node: args.origin_node.clone(), + topic: args.topic.clone(), + announcement_id: args.announcement_id.clone(), + stream_id: args.stream_id.clone(), + rendition_id: args.rendition.clone(), + track_name: args.track.clone(), + profile_id: args.profile.clone(), + sequence_clock: args.sequence_clock.into(), + }; + let mut scenario = + ec_core::sim::foundation_style_system_duplicate_publisher_scenario(seed, &config); + scenario.media.publisher_source_material = publisher_source_material.clone(); + scenario.publisher_activation_delay_ms = args.publisher_activation_delay_ms; + scenario.publisher_backfill_delay_ms = args.publisher_backfill_delay_ms; + scenario +} + +fn sim_system_default_control_partitions( + nodes: &[String], + publishers: &[String], +) -> Vec { + let mut partitions = Vec::new(); + if let Some(delayed_publisher) = publishers.get(1) { + partitions.push(ec_core::sim::SimulationPartition::new( + delayed_publisher, + 0, + 180, + 40, + )); + } + if nodes.iter().any(|node| node == "relay-hel") { + partitions.push(ec_core::sim::SimulationPartition::new( + "relay-hel", + 70, + 190, + 55, + )); + } + partitions +} + +#[derive(Debug, serde::Serialize)] +struct SimDuplicatePublishersOutput { + ok: bool, + invariant: ec_core::sim::DuplicatePublisherInvariantConfig, + scenario_template: ec_core::sim::DuplicatePublisherScenario, + campaign: ec_core::sim::DuplicatePublisherCampaignReport, +} + +#[derive(Debug, serde::Serialize)] +struct SimDuplicatePublishersFailureCampaignSummary { + name: String, + seed_start: ec_core::sim::SimulationSeed, + iterations: u64, + passed: u64, + failed: u64, +} + 
+#[derive(Debug, serde::Serialize)] +struct SimDuplicatePublishersFailureArtifact<'a> { + artifact_type: &'static str, + replay_scenario_pointer: &'static str, + replay_command: String, + first_failure_seed: ec_core::sim::SimulationSeed, + first_failure_replay_hint: &'a str, + shrunk: bool, + campaign: SimDuplicatePublishersFailureCampaignSummary, + replay_scenario: &'a ec_core::sim::DuplicatePublisherScenario, + invariant: &'a ec_core::sim::DuplicatePublisherInvariantReport, + report: &'a ec_core::sim::DuplicatePublisherSimulationReport, + shrink_steps: &'a [ec_core::sim::DuplicatePublisherShrinkStep], +} + +fn sim_duplicate_publishers_output( + args: &SimDuplicatePublishersArgs, +) -> Result { + if let Some(path) = &args.scenario_json { + return sim_duplicate_publishers_replay_output(args, path); + } + if args.iterations == 0 { + return Err(anyhow!( + "sim-duplicate-publishers requires --iterations > 0" + )); + } + if args.sequence_count == 0 { + return Err(anyhow!( + "sim-duplicate-publishers requires --sequence-count > 0" + )); + } + if args.transient_drop_per_million > 1_000_000 { + return Err(anyhow!( + "--transient-drop-per-million must be <= 1000000, got {}", + args.transient_drop_per_million + )); + } + let end_sequence = args + .start_sequence + .checked_add(args.sequence_count) + .ok_or_else(|| anyhow!("--start-sequence + --sequence-count overflows u64"))?; + let publishers = sim_duplicate_publishers_nodes(args)?; + let partitions = sim_duplicate_publishers_partitions(args, &publishers)?; + let publisher_outages = sim_duplicate_publishers_publisher_outages(args, &publishers)?; + let publisher_sequence_offsets = sim_duplicate_publishers_sequence_offsets(args, &publishers)?; + let publisher_media_time_offsets_ms = + sim_duplicate_publishers_media_time_offsets(args, &publishers)?; + let missing_media_timing_publishers = + sim_duplicate_publishers_missing_media_timing_publishers(args, &publishers)?; + let publisher_source_material = + 
sim_publisher_source_material(&args.publisher_source_material, &publishers)?; + let encoder_drifts = args + .encoder_drifts + .iter() + .map(|value| parse_sim_encoder_drift(value)) + .collect::>>()?; + + let invariant = ec_core::sim::DuplicatePublisherInvariantConfig { + require_source_count: publishers.len() as u64, + require_duplicate_complete: true, + require_media_timing: true, + max_duplicate_complete_ms: if args.max_duplicate_complete_ms == 0 { + None + } else { + Some(args.max_duplicate_complete_ms) + }, + }; + + let build_scenario = |seed| { + let mut scenario = ec_core::sim::DuplicatePublisherScenario::new( + seed, + publishers.clone(), + &args.stream_id, + &args.rendition, + &args.track, + &args.profile, + args.start_sequence, + end_sequence, + ); + scenario.segment_step_ms = args.segment_step_ms; + scenario.base_network_delay_ms = args.base_network_delay_ms; + scenario.max_jitter_ms = args.max_jitter_ms; + scenario.transient_drop_per_million = args.transient_drop_per_million; + scenario.backfill_after_ms = args.backfill_after_ms; + scenario.publisher_sequence_offsets = publisher_sequence_offsets.clone(); + scenario.publisher_media_time_offsets_ms = publisher_media_time_offsets_ms.clone(); + scenario.missing_media_timing_publishers = missing_media_timing_publishers.clone(); + scenario.publisher_source_material = publisher_source_material.clone(); + scenario.partitions = partitions.clone(); + scenario.publisher_outages = publisher_outages.clone(); + scenario.encoder_drifts = encoder_drifts.clone(); + scenario + }; + + let seed_start = ec_core::sim::SimulationSeed::new(args.seed); + let scenario_template = build_scenario(seed_start); + let campaign = ec_core::sim::run_duplicate_publisher_campaign( + "duplicate-publisher-fault-campaign", + seed_start, + args.iterations, + &invariant, + build_scenario, + ); + let ok = campaign.all_passed(); + + Ok(SimDuplicatePublishersOutput { + ok, + invariant, + scenario_template, + campaign, + }) +} + +fn 
sim_duplicate_publishers_replay_output( + args: &SimDuplicatePublishersArgs, + path: &Path, +) -> Result { + let scenario = read_sim_duplicate_publishers_scenario(path)?; + validate_sim_duplicate_publishers_scenario(&scenario)?; + + let invariant = ec_core::sim::DuplicatePublisherInvariantConfig { + require_source_count: scenario.publisher_nodes.len() as u64, + require_duplicate_complete: true, + require_media_timing: true, + max_duplicate_complete_ms: if args.max_duplicate_complete_ms == 0 { + None + } else { + Some(args.max_duplicate_complete_ms) + }, + }; + let seed_start = scenario.seed; + let scenario_template = scenario.clone(); + let campaign = ec_core::sim::run_duplicate_publisher_campaign( + "duplicate-publisher-replay-scenario", + seed_start, + 1, + &invariant, + |_| scenario.clone(), + ); + let ok = campaign.all_passed(); + + Ok(SimDuplicatePublishersOutput { + ok, + invariant, + scenario_template, + campaign, + }) +} + +fn read_sim_duplicate_publishers_scenario( + path: &Path, +) -> Result { + let mut bytes = Vec::new(); + if path == Path::new("-") { + std::io::stdin() + .read_to_end(&mut bytes) + .context("failed to read simulation scenario from stdin")?; + } else { + bytes = fs::read(path) + .with_context(|| format!("failed to read simulation scenario {}", path.display()))?; + } + serde_json::from_slice::(&bytes).with_context(|| { + format!( + "failed to parse simulation scenario JSON from {}", + path.display() + ) + }) +} + +fn validate_sim_duplicate_publishers_scenario( + scenario: &ec_core::sim::DuplicatePublisherScenario, +) -> Result<()> { + if scenario.publisher_nodes.len() < 2 { + return Err(anyhow!( + "simulation scenario requires at least two publisher nodes" + )); + } + if scenario.end_sequence <= scenario.start_sequence { + return Err(anyhow!( + "simulation scenario end_sequence must be greater than start_sequence" + )); + } + if scenario.transient_drop_per_million > 1_000_000 { + return Err(anyhow!( + "simulation scenario 
transient_drop_per_million must be <= 1000000" + )); + } + for partition in &scenario.partitions { + if partition.source_node.trim().is_empty() { + return Err(anyhow!("simulation partition source_node cannot be empty")); + } + if partition.end_ms <= partition.start_ms { + return Err(anyhow!( + "simulation partition end_ms must be greater than start_ms for {}", + partition.source_node + )); + } + } + for outage in &scenario.publisher_outages { + if outage.source_node.trim().is_empty() { + return Err(anyhow!( + "simulation publisher outage source_node cannot be empty" + )); + } + if outage.end_ms <= outage.start_ms { + return Err(anyhow!( + "simulation publisher outage end_ms must be greater than start_ms for {}", + outage.source_node + )); + } + } + for source_node in scenario.publisher_sequence_offsets.keys() { + if source_node.trim().is_empty() { + return Err(anyhow!( + "simulation publisher sequence offset source_node cannot be empty" + )); + } + if !scenario.publisher_nodes.contains(source_node) { + return Err(anyhow!( + "simulation publisher sequence offset references unknown publisher {}", + source_node + )); + } + } + for source_node in scenario.publisher_media_time_offsets_ms.keys() { + if source_node.trim().is_empty() { + return Err(anyhow!( + "simulation publisher media time offset source_node cannot be empty" + )); + } + if !scenario.publisher_nodes.contains(source_node) { + return Err(anyhow!( + "simulation publisher media time offset references unknown publisher {}", + source_node + )); + } + } + for source_node in &scenario.missing_media_timing_publishers { + if source_node.trim().is_empty() { + return Err(anyhow!( + "simulation missing media timing publisher cannot be empty" + )); + } + if !scenario.publisher_nodes.contains(source_node) { + return Err(anyhow!( + "simulation missing media timing references unknown publisher {}", + source_node + )); + } + } + for (source_node, material_id) in &scenario.publisher_source_material { + if 
source_node.trim().is_empty() || material_id.trim().is_empty() { + return Err(anyhow!( + "simulation publisher source material source_node and material_id cannot be empty" + )); + } + if !scenario.publisher_nodes.contains(source_node) { + return Err(anyhow!( + "simulation publisher source material references unknown publisher {}", + source_node + )); + } + } + for drift in &scenario.encoder_drifts { + if drift.source_node.trim().is_empty() || drift.profile_id.trim().is_empty() { + return Err(anyhow!( + "simulation encoder drift source_node and profile_id cannot be empty" + )); + } + } + Ok(()) +} + +fn sim_duplicate_publishers_command(args: SimDuplicatePublishersArgs) -> Result<()> { + let output = sim_duplicate_publishers_output(&args)?; + if !output.ok { + if let Some(path) = &args.failure_artifact { + write_sim_duplicate_publishers_failure_artifact(path, &output)?; + eprintln!( + "wrote duplicate publisher simulation failure artifact to {}", + path.display() + ); + } + } + if args.pretty { + println!("{}", serde_json::to_string_pretty(&output)?); + } else { + println!("{}", serde_json::to_string(&output)?); + } + if !output.ok && !args.allow_failure { + let replay_hint = output + .campaign + .first_failure + .as_ref() + .map(|failure| failure.replay_hint.as_str()) + .unwrap_or("no replay hint"); + return Err(anyhow!( + "duplicate publisher simulation invariants failed: {replay_hint}" + )); + } + Ok(()) +} + +fn write_sim_duplicate_publishers_failure_artifact( + path: &Path, + output: &SimDuplicatePublishersOutput, +) -> Result<()> { + let failure = output + .campaign + .first_failure + .as_ref() + .ok_or_else(|| anyhow!("duplicate publisher simulation did not fail"))?; + let (replay_scenario, invariant, report, shrink_steps, shrunk) = + if let Some(shrunk_failure) = failure.shrunk_failure.as_ref() { + ( + &shrunk_failure.scenario, + &shrunk_failure.invariant, + &shrunk_failure.report, + shrunk_failure.steps.as_slice(), + true, + ) + } else { + ( + 
&failure.scenario, + &failure.invariant, + &failure.report, + &[][..], + false, + ) + }; + let replay_command = format!( + "jq '.replay_scenario' {} | ec-node sim-duplicate-publishers --scenario-json - --allow-failure --pretty", + path.display() + ); + let artifact = SimDuplicatePublishersFailureArtifact { + artifact_type: "every.channel.sim.duplicate_publishers.failure.v1", + replay_scenario_pointer: "/replay_scenario", + replay_command, + first_failure_seed: failure.seed, + first_failure_replay_hint: &failure.replay_hint, + shrunk, + campaign: SimDuplicatePublishersFailureCampaignSummary { + name: output.campaign.name.clone(), + seed_start: output.campaign.seed_start, + iterations: output.campaign.iterations, + passed: output.campaign.passed, + failed: output.campaign.failed, + }, + replay_scenario, + invariant, + report, + shrink_steps, + }; + if let Some(parent) = path + .parent() + .filter(|parent| !parent.as_os_str().is_empty()) + { + fs::create_dir_all(parent).with_context(|| { + format!( + "failed to create simulation failure artifact directory {}", + parent.display() + ) + })?; + } + let bytes = serde_json::to_vec_pretty(&artifact) + .context("failed to serialize simulation failure artifact")?; + fs::write(path, bytes).with_context(|| { + format!( + "failed to write simulation failure artifact {}", + path.display() + ) + })?; + Ok(()) +} + +fn sim_duplicate_publishers_nodes(args: &SimDuplicatePublishersArgs) -> Result> { + let publishers = if args.publishers.is_empty() { + vec!["nuc-a".to_string(), "nuc-b".to_string()] + } else { + args.publishers + .iter() + .map(|publisher| publisher.trim()) + .filter(|publisher| !publisher.is_empty()) + .map(ToString::to_string) + .collect::>() + }; + if publishers.len() < 2 { + return Err(anyhow!( + "sim-duplicate-publishers requires at least two --publisher values" + )); + } + Ok(publishers) +} + +fn sim_duplicate_publishers_partitions( + args: &SimDuplicatePublishersArgs, + publishers: &[String], +) -> Result> { + if 
!args.partitions.is_empty() { + return args + .partitions + .iter() + .map(|value| parse_sim_partition(value)) + .collect::>>(); + } + Ok(vec![ + ec_core::sim::SimulationPartition::new(&publishers[1], 120, 520, 140), + ec_core::sim::SimulationPartition::new(&publishers[0], 940, 1_260, 90), + ]) +} + +fn parse_sim_partition(value: &str) -> Result { + let parts = value.split(':').collect::>(); + if parts.len() != 4 { + return Err(anyhow!( + "expected partition NODE:START_MS:END_MS:RELEASE_DELAY_MS, got {value:?}" + )); + } + let source_node = parts[0].trim(); + if source_node.is_empty() { + return Err(anyhow!("partition source node cannot be empty")); + } + let start_ms = parse_sim_u64("partition start_ms", parts[1])?; + let end_ms = parse_sim_u64("partition end_ms", parts[2])?; + let release_delay_ms = parse_sim_u64("partition release_delay_ms", parts[3])?; + if end_ms <= start_ms { + return Err(anyhow!( + "partition end_ms must be greater than start_ms for {source_node}" + )); + } + Ok(ec_core::sim::SimulationPartition::new( + source_node, + start_ms, + end_ms, + release_delay_ms, + )) +} + +fn sim_duplicate_publishers_publisher_outages( + args: &SimDuplicatePublishersArgs, + publishers: &[String], +) -> Result> { + if !args.publisher_outages.is_empty() { + return args + .publisher_outages + .iter() + .map(|value| parse_sim_publisher_outage(value)) + .collect::>>(); + } + Ok(vec![ec_core::sim::SimulationOutage::new( + &publishers[1], + 1_360, + 1_520, + 220, + )]) +} + +fn parse_sim_publisher_outage(value: &str) -> Result { + let parts = value.split(':').collect::>(); + if parts.len() != 4 { + return Err(anyhow!( + "expected publisher outage NODE:START_MS:END_MS:BACKFILL_DELAY_MS, got {value:?}" + )); + } + let source_node = parts[0].trim(); + if source_node.is_empty() { + return Err(anyhow!("publisher outage source node cannot be empty")); + } + let start_ms = parse_sim_u64("publisher outage start_ms", parts[1])?; + let end_ms = parse_sim_u64("publisher outage 
end_ms", parts[2])?; + let backfill_delay_ms = parse_sim_u64("publisher outage backfill_delay_ms", parts[3])?; + if end_ms <= start_ms { + return Err(anyhow!( + "publisher outage end_ms must be greater than start_ms for {source_node}" + )); + } + Ok(ec_core::sim::SimulationOutage::new( + source_node, + start_ms, + end_ms, + backfill_delay_ms, + )) +} + +fn sim_duplicate_publishers_sequence_offsets( + args: &SimDuplicatePublishersArgs, + publishers: &[String], +) -> Result> { + let mut offsets = BTreeMap::new(); + for value in &args.publisher_sequence_offsets { + let (source_node, offset) = parse_sim_publisher_sequence_offset(value)?; + if !publishers.contains(&source_node) { + return Err(anyhow!( + "publisher sequence offset references unknown publisher {source_node}" + )); + } + offsets.insert(source_node, offset); + } + Ok(offsets) +} + +fn parse_sim_publisher_sequence_offset(value: &str) -> Result<(String, u64)> { + let parts = value.split(':').collect::>(); + if parts.len() != 2 { + return Err(anyhow!( + "expected publisher sequence offset NODE:SEQUENCE_OFFSET, got {value:?}" + )); + } + let source_node = parts[0].trim(); + if source_node.is_empty() { + return Err(anyhow!("publisher sequence offset node cannot be empty")); + } + let offset = parse_sim_u64("publisher sequence offset", parts[1])?; + Ok((source_node.to_string(), offset)) +} + +fn sim_duplicate_publishers_media_time_offsets( + args: &SimDuplicatePublishersArgs, + publishers: &[String], +) -> Result> { + let mut offsets = BTreeMap::new(); + for value in &args.publisher_media_time_offsets_ms { + let (source_node, offset) = parse_sim_publisher_media_time_offset(value)?; + if !publishers.contains(&source_node) { + return Err(anyhow!( + "publisher media time offset references unknown publisher {source_node}" + )); + } + offsets.insert(source_node, offset); + } + Ok(offsets) +} + +fn parse_sim_publisher_media_time_offset(value: &str) -> Result<(String, u64)> { + let parts = value.split(':').collect::>(); 
+ if parts.len() != 2 { + return Err(anyhow!( + "expected publisher media time offset NODE:OFFSET_MS, got {value:?}" + )); + } + let source_node = parts[0].trim(); + if source_node.is_empty() { + return Err(anyhow!("publisher media time offset node cannot be empty")); + } + let offset = parse_sim_u64("publisher media time offset", parts[1])?; + Ok((source_node.to_string(), offset)) +} + +fn sim_publisher_source_material( + values: &[String], + publishers: &[String], +) -> Result> { + let mut material = BTreeMap::new(); + for value in values { + let (source_node, material_id) = parse_sim_publisher_source_material(value)?; + if !publishers.contains(&source_node) { + return Err(anyhow!( + "publisher source material references unknown publisher {source_node}" + )); + } + material.insert(source_node, material_id); + } + Ok(material) +} + +fn parse_sim_publisher_source_material(value: &str) -> Result<(String, String)> { + let parts = value.splitn(2, ':').collect::>(); + if parts.len() != 2 { + return Err(anyhow!( + "expected publisher source material NODE:MATERIAL_ID, got {value:?}" + )); + } + let source_node = parts[0].trim(); + let material_id = parts[1].trim(); + if source_node.is_empty() || material_id.is_empty() { + return Err(anyhow!( + "publisher source material node and material id cannot be empty" + )); + } + Ok((source_node.to_string(), material_id.to_string())) +} + +fn sim_duplicate_publishers_missing_media_timing_publishers( + args: &SimDuplicatePublishersArgs, + publishers: &[String], +) -> Result> { + let mut missing = BTreeSet::new(); + for value in &args.missing_media_timing_publishers { + let source_node = value.trim(); + if source_node.is_empty() { + return Err(anyhow!( + "missing media timing publisher node cannot be empty" + )); + } + if !publishers.iter().any(|publisher| publisher == source_node) { + return Err(anyhow!( + "missing media timing references unknown publisher {source_node}" + )); + } + missing.insert(source_node.to_string()); + } + 
Ok(missing) +} + +fn parse_sim_encoder_drift(value: &str) -> Result { + let parts = value.split(':').collect::>(); + if parts.len() != 3 { + return Err(anyhow!( + "expected encoder drift NODE:SEQUENCE:PROFILE_ID, got {value:?}" + )); + } + let source_node = parts[0].trim(); + let profile_id = parts[2].trim(); + if source_node.is_empty() || profile_id.is_empty() { + return Err(anyhow!("encoder drift node and profile id cannot be empty")); + } + let sequence = parse_sim_u64("encoder drift sequence", parts[1])?; + Ok(ec_core::sim::EncoderDriftFault::new( + source_node, + sequence, + profile_id, + )) +} + +fn parse_sim_u64(label: &str, value: &str) -> Result { + value + .trim() + .parse::() + .with_context(|| format!("failed to parse {label} as u64: {value:?}")) +} + +fn archive_track_aliases(track_name: &str) -> Vec<&'static str> { + match track_name { + WT_PUBLISH_PRIMARY_VIDEO_TRACK => vec![WT_PUBLISH_PRIMARY_VIDEO_TRACK, "video0.m4s"], + WT_PUBLISH_PRIMARY_AUDIO_TRACK => vec![WT_PUBLISH_PRIMARY_AUDIO_TRACK, "audio0.m4s"], + "video0.m4s" => vec!["video0.m4s", WT_PUBLISH_PRIMARY_VIDEO_TRACK], + "audio0.m4s" => vec!["audio0.m4s", WT_PUBLISH_PRIMARY_AUDIO_TRACK], + _ => Vec::new(), + } +} + +fn archive_index_path_with_aliases( + manifest_root: &Path, + broadcast_name: &str, + track_name: &str, +) -> Option { + let aliases = archive_track_aliases(track_name); + if aliases.is_empty() { + let path = archive_index_path(manifest_root, broadcast_name, track_name); + return path.exists().then_some(path); + } + aliases + .into_iter() + .map(|alias| archive_index_path(manifest_root, broadcast_name, alias)) + .find(|path| path.exists()) +} + +fn read_archive_records_with_aliases( + manifest_root: &Path, + broadcast_name: &str, + track_name: &str, +) -> Result> { + let aliases = archive_track_aliases(track_name); + if aliases.is_empty() { + return read_archive_records(manifest_root, broadcast_name, track_name); + } + for alias in aliases { + let records = 
read_archive_records(manifest_root, broadcast_name, alias)?; + if !records.is_empty() { + return Ok(records); + } + } + Ok(Vec::new()) +} + fn dedupe_by_group_sequence(mut records: Vec) -> Vec { records.sort_by_key(|record| record.group_sequence); let mut out: Vec = Vec::with_capacity(records.len()); @@ -5947,6 +16246,609 @@ fn dedupe_by_group_sequence(mut records: Vec) -> Vec Result> { + if min_records == 0 || !path.exists() { + return Ok(Vec::new()); + } + + let mut file = File::open(path) + .with_context(|| format!("failed to open archive index {}", path.display()))?; + let len = file + .metadata() + .with_context(|| format!("failed to stat archive index {}", path.display()))? + .len(); + let mut offset = len; + let mut data = Vec::new(); + let mut reached_start = false; + + while offset > 0 { + let read_len = usize::try_from(offset.min(256 * 1024)).unwrap_or(256 * 1024); + offset -= read_len as u64; + file.seek(SeekFrom::Start(offset)) + .with_context(|| format!("failed to seek archive index {}", path.display()))?; + let mut chunk = vec![0u8; read_len]; + file.read_exact(&mut chunk) + .with_context(|| format!("failed to read archive index {}", path.display()))?; + chunk.extend_from_slice(&data); + data = chunk; + reached_start = offset == 0; + + let parse_start = if reached_start { + 0 + } else if let Some(pos) = data.iter().position(|byte| *byte == b'\n') { + pos + 1 + } else { + continue; + }; + let complete = &data[parse_start..]; + let complete_lines = complete + .split(|byte| *byte == b'\n') + .filter(|line| !trim_ascii_whitespace(line).is_empty()) + .count(); + if complete_lines >= min_records { + break; + } + } + + let parse_start = if reached_start { + 0 + } else { + data.iter() + .position(|byte| *byte == b'\n') + .map(|pos| pos + 1) + .unwrap_or(data.len()) + }; + let mut records = Vec::new(); + for line in data[parse_start..].split(|byte| *byte == b'\n') { + if let Some(record) = parse_archive_record_line(path, line) { + records.push(record); + } + } 
+ if records.len() > min_records { + records = records.split_off(records.len() - min_records); + } + Ok(records) +} + +fn read_archive_tail_records_with_aliases( + manifest_root: &Path, + broadcast_name: &str, + track_name: &str, + min_records: usize, +) -> Result> { + let Some(path) = archive_index_path_with_aliases(manifest_root, broadcast_name, track_name) + else { + return Ok(Vec::new()); + }; + read_archive_tail_records_from_path(&path, min_records) +} + +fn count_archive_index_lines(path: &Path) -> Result { + let mut file = File::open(path) + .with_context(|| format!("failed to open archive index {}", path.display()))?; + let mut buf = vec![0u8; 1024 * 1024]; + let mut lines = 0usize; + let mut saw_non_newline = false; + let mut ended_with_newline = true; + loop { + let read = file + .read(&mut buf) + .with_context(|| format!("failed to read archive index {}", path.display()))?; + if read == 0 { + break; + } + for byte in &buf[..read] { + if *byte == b'\n' { + lines += 1; + ended_with_newline = true; + } else { + saw_non_newline = true; + ended_with_newline = false; + } + } + } + if saw_non_newline && !ended_with_newline { + lines += 1; + } + Ok(lines) +} + +fn archive_track_summary_from_records(records: &[ArchiveIndexRecord]) -> ArchiveTrackSummary { + ArchiveTrackSummary { + start_unix_ms: records.iter().map(|record| record.received_unix_ms).min(), + end_unix_ms: records.iter().map(|record| record.received_unix_ms).max(), + segments: records.len(), + total_bytes: Some(records.iter().map(|record| record.size_bytes as u64).sum()), + total_frames: Some(records.iter().map(|record| record.frame_count).sum()), + partial_summary: false, + } +} + +fn first_archive_record_from_path(path: &Path) -> Result> { + if !path.exists() { + return Ok(None); + } + let file = File::open(path) + .with_context(|| format!("failed to open archive index {}", path.display()))?; + let reader = BufReader::new(file); + for line in reader.split(b'\n') { + let line = + 
line.with_context(|| format!("failed to read archive index {}", path.display()))?; + if let Some(record) = parse_archive_record_line(path, &line) { + return Ok(Some(record)); + } + } + Ok(None) +} + +fn last_archive_record_from_path(path: &Path) -> Result> { + let records = read_archive_tail_records_from_path(path, 1)?; + Ok(records.into_iter().last()) +} + +fn summarize_archive_track( + manifest_root: &Path, + broadcast_name: &str, + track_name: &str, + from_ms: Option, + limit: Option, +) -> Result { + if from_ms.is_some() || limit.is_some() { + let records = + parse_archive_track(manifest_root, broadcast_name, track_name, from_ms, limit)?; + return Ok(archive_track_summary_from_records(&records)); + } + + let Some(path) = archive_index_path_with_aliases(manifest_root, broadcast_name, track_name) + else { + return Ok(ArchiveTrackSummary::default()); + }; + summarize_archive_track_from_path(&path) +} + +fn summarize_archive_track_from_path(path: &Path) -> Result { + let size = fs::metadata(&path) + .with_context(|| format!("failed to stat archive index {}", path.display()))? + .len(); + if size <= WT_ARCHIVE_TIMELINE_FULL_SCAN_MAX_BYTES { + let records = read_archive_records_from_path(&path)?; + return Ok(archive_track_summary_from_records(&records)); + } + + let modified = fs::metadata(&path) + .with_context(|| format!("failed to stat archive index {}", path.display()))? 
+ .modified() + .ok(); + let cache = ARCHIVE_TRACK_SUMMARY_CACHE.get_or_init(|| Mutex::new(HashMap::new())); + if let Ok(cache) = cache.lock() { + if let Some(entry) = cache.get(path) { + if entry.size == size && entry.modified == modified { + return Ok(entry.summary.clone()); + } + } + } + + let first = first_archive_record_from_path(&path)?; + let last = last_archive_record_from_path(&path)?; + let start_unix_ms = first + .as_ref() + .map(|record| record.received_unix_ms) + .into_iter() + .chain(last.as_ref().map(|record| record.received_unix_ms)) + .min(); + let end_unix_ms = first + .as_ref() + .map(|record| record.received_unix_ms) + .into_iter() + .chain(last.as_ref().map(|record| record.received_unix_ms)) + .max(); + let summary = ArchiveTrackSummary { + start_unix_ms, + end_unix_ms, + segments: count_archive_index_lines(&path)?, + total_bytes: None, + total_frames: None, + partial_summary: true, + }; + if let Ok(mut cache) = cache.lock() { + cache.insert( + path.to_path_buf(), + ArchiveTrackSummaryCacheEntry { + size, + modified, + summary: summary.clone(), + }, + ); + } + Ok(summary) +} + +fn strip_ladder_rendition_suffix(value: &str) -> (&str, Option<&'static str>) { + for rendition in WT_LADDER_RENDITIONS { + let suffix = format!("-{rendition}"); + if value.ends_with(&suffix) { + return (&value[..value.len() - suffix.len()], Some(*rendition)); + } + } + (value, None) +} + +fn track_rendition(track_name: &str) -> Option<&'static str> { + let normalized = track_name.trim_end_matches(".jsonl"); + if let Some((_, candidate)) = normalized.rsplit_once('/') { + for rendition in WT_LADDER_RENDITIONS { + if candidate == *rendition { + return Some(*rendition); + } + } + } + for rendition in WT_LADDER_RENDITIONS { + if normalized.ends_with(&format!("_{rendition}")) + || normalized.ends_with(&format!("-{rendition}")) + { + return Some(*rendition); + } + } + None +} + +fn archive_track_media_kind(track_name: &str) -> &'static str { + let lower = 
track_name.to_ascii_lowercase(); + let head = lower.split('/').next().unwrap_or(lower.as_str()); + if head.contains("init") { + "init" + } else if head == WT_PUBLISH_PRIMARY_AUDIO_TRACK + || head.starts_with("audio") + || lower.contains("/audio") + { + "audio" + } else if head == WT_PUBLISH_PRIMARY_VIDEO_TRACK + || head.starts_with("video") + || lower.contains("/video") + || lower.ends_with(".m4s") + { + "video" + } else if head.contains("catalog") { + "catalog" + } else if head.contains("manifest") { + "manifest" + } else { + "other" + } +} + +fn archive_index_files_for_broadcast( + manifest_root: &Path, + broadcast_name: &str, +) -> Result> { + let dir = manifest_root.join(sanitize_path_component(broadcast_name)); + if !dir.exists() { + return Ok(Vec::new()); + } + let mut files = Vec::new(); + for entry in fs::read_dir(&dir) + .with_context(|| format!("failed to read archive manifest dir {}", dir.display()))? + { + let entry = entry + .with_context(|| format!("failed to read archive manifest dir {}", dir.display()))?; + let path = entry.path(); + if path.extension().and_then(|ext| ext.to_str()) == Some("jsonl") { + files.push(path); + } + } + files.sort(); + Ok(files) +} + +fn archive_index_track_name(path: &Path) -> Result> { + if let Some(record) = first_archive_record_from_path(path)? 
{ + return Ok(Some(record.track_name)); + } + Ok(path + .file_stem() + .and_then(|value| value.to_str()) + .map(|value| value.to_string())) +} + +fn archive_ladder_broadcast_names( + manifest_root: &Path, + broadcast_name: &str, +) -> Result> { + let (base_name, requested_rendition) = strip_ladder_rendition_suffix(broadcast_name); + let mut names = Vec::new(); + let push_name = |names: &mut Vec, value: String| { + if !names.iter().any(|existing| existing == &value) { + names.push(value); + } + }; + + push_name(&mut names, base_name.to_string()); + if let Some(rendition) = requested_rendition { + push_name(&mut names, format!("{base_name}-{rendition}")); + } + + if manifest_root.exists() { + for entry in fs::read_dir(manifest_root).with_context(|| { + format!( + "failed to read archive manifest root {}", + manifest_root.display() + ) + })? { + let entry = entry.with_context(|| { + format!( + "failed to read archive manifest root {}", + manifest_root.display() + ) + })?; + if !entry.file_type().map(|kind| kind.is_dir()).unwrap_or(false) { + continue; + } + let Some(name) = entry.file_name().to_str().map(|value| value.to_string()) else { + continue; + }; + let (candidate_base, candidate_rendition) = strip_ladder_rendition_suffix(&name); + if candidate_base == base_name + && (candidate_rendition.is_some() || candidate_base == name) + { + push_name(&mut names, name); + } + } + } + + names.sort(); + Ok(names) +} + +fn retention_coverage(span_seconds: f64, retention_seconds: u64) -> f64 { + if retention_seconds == 0 { + return 0.0; + } + (span_seconds / retention_seconds as f64).clamp(0.0, 1.0) +} + +fn archive_ladder_track_status( + broadcast_name: &str, + track_name: String, + summary: ArchiveTrackSummary, + broadcast_rendition: Option<&'static str>, + now_ms: u64, + retention_seconds: u64, +) -> ArchiveLadderTrackStatus { + let rendition_id = track_rendition(&track_name) + .or(broadcast_rendition) + .unwrap_or(WT_LADDER_PRIMARY_RENDITION) + .to_string(); + let 
span_seconds = match (summary.start_unix_ms, summary.end_unix_ms) { + (Some(start), Some(end)) if end >= start => (end - start) as f64 / 1000.0, + _ => 0.0, + }; + let latest_age_ms = summary.end_unix_ms.map(|end| now_ms.saturating_sub(end)); + let media_kind = archive_track_media_kind(&track_name).to_string(); + ArchiveLadderTrackStatus { + broadcast_name: broadcast_name.to_string(), + track_name, + media_kind, + rendition_id, + start_unix_ms: summary.start_unix_ms, + end_unix_ms: summary.end_unix_ms, + latest_age_ms, + segments: summary.segments, + total_bytes: summary.total_bytes, + total_frames: summary.total_frames, + span_seconds, + retention_coverage: retention_coverage(span_seconds, retention_seconds), + partial_summary: summary.partial_summary, + } +} + +fn summarize_ladder_renditions( + tracks: &[ArchiveLadderTrackStatus], + retention_seconds: u64, +) -> Vec { + let mut by_rendition: BTreeMap> = BTreeMap::new(); + for track in tracks { + by_rendition + .entry(track.rendition_id.clone()) + .or_default() + .push(track); + } + by_rendition + .into_iter() + .map(|(rendition_id, tracks)| { + let start_unix_ms = tracks.iter().filter_map(|track| track.start_unix_ms).min(); + let end_unix_ms = tracks.iter().filter_map(|track| track.end_unix_ms).max(); + let latest_age_ms = tracks.iter().filter_map(|track| track.latest_age_ms).min(); + let segments = tracks.iter().map(|track| track.segments).sum(); + let total_bytes = tracks + .iter() + .map(|track| track.total_bytes) + .try_fold(0u64, |acc, value| value.map(|value| acc + value)); + let total_frames = tracks + .iter() + .map(|track| track.total_frames) + .try_fold(0u64, |acc, value| value.map(|value| acc + value)); + let span_seconds = match (start_unix_ms, end_unix_ms) { + (Some(start), Some(end)) if end >= start => (end - start) as f64 / 1000.0, + _ => 0.0, + }; + ArchiveLadderRenditionSummary { + rendition_id, + start_unix_ms, + end_unix_ms, + latest_age_ms, + segments, + total_bytes, + total_frames, + 
retention_coverage: retention_coverage(span_seconds, retention_seconds), + track_count: tracks.len(), + } + }) + .collect() +} + +fn archive_ladder_status( + manifest_root: &Path, + broadcast_name: &str, + now_ms: u64, + retention_seconds: u64, +) -> Result { + let (base_broadcast_name, _) = strip_ladder_rendition_suffix(broadcast_name); + let mut tracks = Vec::new(); + for candidate in archive_ladder_broadcast_names(manifest_root, broadcast_name)? { + let (_, broadcast_rendition) = strip_ladder_rendition_suffix(&candidate); + for path in archive_index_files_for_broadcast(manifest_root, &candidate)? { + let Some(track_name) = archive_index_track_name(&path)? else { + continue; + }; + let summary = summarize_archive_track_from_path(&path)?; + tracks.push(archive_ladder_track_status( + &candidate, + track_name, + summary, + broadcast_rendition, + now_ms, + retention_seconds, + )); + } + } + tracks.sort_by(|a, b| { + a.rendition_id + .cmp(&b.rendition_id) + .then_with(|| a.media_kind.cmp(&b.media_kind)) + .then_with(|| a.broadcast_name.cmp(&b.broadcast_name)) + .then_with(|| a.track_name.cmp(&b.track_name)) + }); + Ok(ArchiveLadderResponse { + broadcast_name: broadcast_name.to_string(), + base_broadcast_name: base_broadcast_name.to_string(), + generated_unix_ms: now_ms, + retention_seconds, + retention_window_start_unix_ms: (retention_seconds > 0) + .then_some(now_ms.saturating_sub(retention_seconds.saturating_mul(1000))), + renditions: summarize_ladder_renditions(&tracks, retention_seconds), + tracks, + }) +} + +fn ladder_rendition_bandwidth(rendition_id: &str) -> u64 { + match rendition_id { + "480p" => 1_400_000, + "720p" => 3_200_000, + "1080p" => 6_300_000, + _ => 3_500_000, + } +} + +fn ladder_rendition_resolution(rendition_id: &str) -> Option<(u32, u32)> { + match rendition_id { + "480p" => Some((854, 480)), + "720p" => Some((1280, 720)), + "1080p" => Some((1920, 1080)), + _ => None, + } +} + +fn archive_hls_variants_from_ladder(status: &ArchiveLadderResponse) 
-> Vec { + let has_explicit_ladder = status.tracks.iter().any(|track| { + track.media_kind == "video" + && track.segments > 0 + && (track_rendition(&track.track_name).is_some() + || strip_ladder_rendition_suffix(&track.broadcast_name) + .1 + .is_some()) + }); + if !has_explicit_ladder { + return Vec::new(); + } + + let mut selected: BTreeMap = BTreeMap::new(); + for track in &status.tracks { + if track.media_kind != "video" || track.segments == 0 { + continue; + } + let candidate = ArchiveHlsVariant { + broadcast_name: track.broadcast_name.clone(), + track_name: track.track_name.clone(), + rendition_id: track.rendition_id.clone(), + bandwidth: ladder_rendition_bandwidth(&track.rendition_id), + resolution: ladder_rendition_resolution(&track.rendition_id), + segments: track.segments, + latest_age_ms: track.latest_age_ms, + }; + let replace = selected + .get(&candidate.rendition_id) + .map(|existing| { + candidate.segments > existing.segments + || (candidate.segments == existing.segments + && candidate.latest_age_ms.unwrap_or(u64::MAX) + < existing.latest_age_ms.unwrap_or(u64::MAX)) + }) + .unwrap_or(true); + if replace { + selected.insert(candidate.rendition_id.clone(), candidate); + } + } + let mut variants = selected.into_values().collect::>(); + variants.sort_by(|a, b| { + a.bandwidth + .cmp(&b.bandwidth) + .then_with(|| a.rendition_id.cmp(&b.rendition_id)) + }); + variants +} + +fn archive_track_playlist_query(url: &Url, broadcast_name: &str, track_name: &str) -> String { + let mut parts = vec![ + format!("broadcast={}", urlencoding::encode(broadcast_name)), + format!("track={}", urlencoding::encode(track_name)), + ]; + for key in ["from_ms", "limit"] { + if let Some((_, value)) = url.query_pairs().find(|(k, _)| k == key) { + parts.push(format!("{key}={}", urlencoding::encode(&value))); + } + } + format!("?{}", parts.join("&")) +} + +fn archive_hls_master_playlist( + manifest_root: &Path, + broadcast_name: &str, + req_url: &Url, + now_ms: u64, + 
retention_seconds: u64, +) -> Result> { + let status = archive_ladder_status(manifest_root, broadcast_name, now_ms, retention_seconds)?; + let variants = archive_hls_variants_from_ladder(&status); + if variants.is_empty() { + return Ok(None); + } + + let encoded_path_broadcast = urlencoding::encode(broadcast_name); + let mut out = String::new(); + out.push_str("#EXTM3U\n"); + out.push_str("#EXT-X-VERSION:7\n"); + out.push_str("#EXT-X-INDEPENDENT-SEGMENTS\n"); + for variant in variants { + let query = + archive_track_playlist_query(req_url, &variant.broadcast_name, &variant.track_name); + let resolution = variant + .resolution + .map(|(width, height)| format!(",RESOLUTION={width}x{height}")) + .unwrap_or_default(); + out.push_str(&format!( + "#EXT-X-STREAM-INF:BANDWIDTH={},CODECS=\"avc1.640028,mp4a.40.2\"{resolution},NAME=\"{}\"\n", + variant.bandwidth, variant.rendition_id + )); + out.push_str(&format!( + "/archive/{encoded_path_broadcast}/track.m3u8{query}\n" + )); + } + Ok(Some(out)) +} + fn default_segment_duration_ms(records: &[ArchiveIndexRecord]) -> u64 { if records.len() < 2 { return 1000; @@ -5993,20 +16895,23 @@ fn latest_init_hash( broadcast_name: &str, fallback_track: &str, ) -> Result> { - let mut init_records = dedupe_by_group_sequence(read_archive_records( - manifest_root, - broadcast_name, - "init.mp4", - )?); - if let Some(record) = init_records.pop() { - return Ok(Some(record.blake3)); + let mut init_tracks = Vec::new(); + if let Some(rendition) = track_rendition(fallback_track) { + init_tracks.push(format!("{WT_PUBLISH_INIT_TRACK}/{rendition}")); } - let fallback = dedupe_by_group_sequence(read_archive_records( - manifest_root, - broadcast_name, - fallback_track, - )?); - Ok(fallback.first().map(|record| record.blake3.clone())) + init_tracks.push(WT_PUBLISH_INIT_TRACK.to_string()); + + for init_track in init_tracks { + let mut init_records = dedupe_by_group_sequence(read_archive_records( + manifest_root, + broadcast_name, + &init_track, + )?); + 
if let Some(record) = init_records.pop() { + return Ok(Some(record.blake3)); + } + } + Ok(None) } fn parse_limit(url: &Url) -> Option { @@ -6021,6 +16926,20 @@ fn parse_from_ms(url: &Url) -> Option { .and_then(|(_, v)| v.parse::().ok()) } +fn archive_playlist_query(url: &Url) -> String { + let mut parts = Vec::new(); + for key in ["from_ms", "limit"] { + if let Some((_, value)) = url.query_pairs().find(|(k, _)| k == key) { + parts.push(format!("{key}={}", urlencoding::encode(&value))); + } + } + if parts.is_empty() { + String::new() + } else { + format!("?{}", parts.join("&")) + } +} + fn parse_archive_track( manifest_root: &Path, broadcast_name: &str, @@ -6028,17 +16947,32 @@ fn parse_archive_track( from_ms: Option, limit: Option, ) -> Result> { - let mut records = dedupe_by_group_sequence(read_archive_records( - manifest_root, - broadcast_name, - track_name, - )?); + let tail_limit = from_ms.is_none().then_some(limit).flatten(); + let mut records = if let Some(max_items) = tail_limit { + if max_items > 0 { + read_archive_tail_records_with_aliases( + manifest_root, + broadcast_name, + track_name, + /* saturating_add: a very large caller-supplied limit must not overflow */ max_items.saturating_mul(4).max(max_items.saturating_add(16)), + )? + } else { + Vec::new() + } + } else { + read_archive_records_with_aliases(manifest_root, broadcast_name, track_name)?
+ }; + records = dedupe_by_group_sequence(records); if let Some(start_ms) = from_ms { records.retain(|record| record.received_unix_ms >= start_ms); } if let Some(max_items) = limit { if max_items > 0 && records.len() > max_items { - records = records.split_off(records.len() - max_items); + if from_ms.is_some() { + records.truncate(max_items); + } else { + records = records.split_off(records.len() - max_items); + } } } Ok(records) @@ -6094,10 +17028,166 @@ fn cas_path_for_hash(cas_root: &Path, hash: &str) -> Result { if !validate_blake3_hex(hash) { return Err(anyhow!("invalid hash")); } + let hash = hash.to_ascii_lowercase(); let shard = &hash[0..2]; Ok(cas_root.join(shard).join(format!("{hash}.bin"))) } +fn archive_origin_object_url(base: &Url, relative_path: &str) -> Result { + let mut root = base.clone(); + root.set_query(None); + root.set_fragment(None); + if !root.path().ends_with('/') { + let path = format!("{}/", root.path().trim_end_matches('/')); + root.set_path(&path); + } + root.join(relative_path) + .with_context(|| format!("failed to build archive origin URL for {relative_path}")) +} + +fn archive_origin_object_urls(state: &ArchiveReplayState, hash: &str) -> Result> { + if !validate_blake3_hex(hash) { + return Err(anyhow!("invalid hash")); + } + let hash = hash.to_ascii_lowercase(); + let shard = &hash[0..2]; + let mut urls = Vec::new(); + if let Some(base) = &state.archive_cas_origin_url { + urls.push(archive_origin_object_url( + base, + &format!("{shard}/{hash}.bin"), + )?); + } + if let Some(base) = &state.archive_origin_url { + urls.push(archive_origin_object_url( + base, + &format!("objects/blake3/{shard}/{hash}.bin"), + )?); + } + Ok(urls) +} + +fn mark_archive_object_access(state: &ArchiveReplayState, hash: &str) { + if !validate_blake3_hex(hash) { + return; + } + let hash = hash.to_ascii_lowercase(); + let marker = state + .cache_access_root + .join(&hash[0..2]) + .join(format!("{hash}.access")); + let now_ms = SystemTime::now() + 
.duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_millis() + .to_string(); + if let Some(parent) = marker.parent() { + if let Err(err) = fs::create_dir_all(parent) { + tracing::debug!(path = %parent.display(), err = %err, "failed to create archive cache access marker directory"); + return; + } + } + if let Err(err) = fs::write(&marker, now_ms) { + tracing::debug!(path = %marker.display(), err = %err, "failed to write archive cache access marker"); + } +} + +async fn fetch_archive_origin_object( + state: &ArchiveReplayState, + hash: &str, + dest: &Path, +) -> Result>> { + let expected_hash = hash.to_ascii_lowercase(); + let urls = archive_origin_object_urls(state, &expected_hash)?; + if urls.is_empty() { + return Ok(None); + } + + let mut last_err: Option = None; + for url in urls { + let response = match state.http_client.get(url.clone()).send().await { + Ok(response) => response, + Err(err) => { + last_err = Some(anyhow!("GET {url} failed: {err}")); + continue; + } + }; + if response.status() == reqwest::StatusCode::NOT_FOUND { + continue; + } + if !response.status().is_success() { + last_err = Some(anyhow!("GET {url} returned {}", response.status())); + continue; + } + if response + .content_length() + .is_some_and(|len| len > state.archive_origin_max_bytes as u64) + { + return Err(anyhow!("archive origin object exceeds size cap: {url}")); + } + let bytes = response + .bytes() + .await + .with_context(|| format!("failed to read archive origin body: {url}"))?; + if bytes.len() > state.archive_origin_max_bytes { + return Err(anyhow!("archive origin object exceeds size cap: {url}")); + } + let actual = blake3::hash(&bytes).to_hex().to_string(); + if actual != expected_hash { + return Err(anyhow!( + "archive origin hash mismatch for {url}: expected {expected_hash}, got {actual}" + )); + } + + if let Some(parent) = dest.parent() { + fs::create_dir_all(parent) + .with_context(|| format!("failed to create CAS shard {}", parent.display()))?; + } + let nonce = 
SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos(); + let tmp = dest.with_file_name(format!( + ".{}.tmp-{}-{nonce}", + dest.file_name() + .and_then(|name| name.to_str()) + .unwrap_or("archive-object"), + std::process::id() + )); + fs::write(&tmp, &bytes) + .with_context(|| format!("failed to write temporary CAS object {}", tmp.display()))?; + fs::rename(&tmp, dest) + .with_context(|| format!("failed to promote CAS object {}", dest.display()))?; + tracing::info!(hash = %expected_hash, origin = %url, path = %dest.display(), "cached archive object from origin"); + return Ok(Some(bytes.to_vec())); + } + + if let Some(err) = last_err { + return Err(err); + } + Ok(None) +} + +async fn read_archive_object(state: &ArchiveReplayState, hash: &str) -> Result>> { + let path = cas_path_for_hash(&state.cas_root, hash)?; + match fs::read(&path) { + Ok(bytes) => { + mark_archive_object_access(state, hash); + Ok(Some(bytes)) + } + Err(err) if err.kind() == std::io::ErrorKind::NotFound => { + let fetched = fetch_archive_origin_object(state, hash, &path).await?; + if fetched.is_some() { + mark_archive_object_access(state, hash); + } + Ok(fetched) + } + Err(err) => { + Err(err).with_context(|| format!("failed to read archive object {}", path.display())) + } + } +} + fn archive_response(status: u16, content_type: &str, body: Vec) -> ArchiveHttpResponse { ArchiveHttpResponse { status, @@ -6182,7 +17272,7 @@ fn archive_status_text(status: u16) -> &'static str { } } -fn handle_archive_http_request( +async fn handle_archive_http_request( state: &ArchiveReplayState, method: &str, target: &str, @@ -6225,20 +17315,20 @@ fn handle_archive_http_request( match parts[2] { "timeline.json" if parts.len() == 3 => { - let video = match parse_archive_track( + let video = match summarize_archive_track( &state.manifest_root, &broadcast_name, - "video0.m4s", + WT_PUBLISH_PRIMARY_VIDEO_TRACK, from_ms, limit, ) { Ok(records) => records, Err(err) => return 
archive_error(500, &format!("{err:#}")), }; - let audio = match parse_archive_track( + let audio = match summarize_archive_track( &state.manifest_root, &broadcast_name, - "audio0.m4s", + WT_PUBLISH_PRIMARY_AUDIO_TRACK, from_ms, limit, ) { @@ -6246,23 +17336,32 @@ fn handle_archive_http_request( Err(err) => return archive_error(500, &format!("{err:#}")), }; let min_ms = video - .first() - .map(|record| record.received_unix_ms) + .start_unix_ms .into_iter() - .chain(audio.first().map(|record| record.received_unix_ms)) + .chain(audio.start_unix_ms) .min(); - let max_ms = video - .last() - .map(|record| record.received_unix_ms) - .into_iter() - .chain(audio.last().map(|record| record.received_unix_ms)) - .max(); + let max_ms = video.end_unix_ms.into_iter().chain(audio.end_unix_ms).max(); let body = ArchiveTimelineResponse { broadcast_name, start_unix_ms: min_ms, end_unix_ms: max_ms, - video_segments: video.len(), - audio_segments: audio.len(), + video_segments: video.segments, + audio_segments: audio.segments, + }; + match serde_json::to_vec(&body) { + Ok(buf) => archive_response(200, "application/json; charset=utf-8", buf), + Err(err) => archive_error(500, &format!("{err:#}")), + } + } + "ladder.json" if parts.len() == 3 => { + let body = match archive_ladder_status( + &state.manifest_root, + &broadcast_name, + now_unix_ms(), + state.archive_retention_seconds, + ) { + Ok(body) => body, + Err(err) => return archive_error(500, &format!("{err:#}")), }; match serde_json::to_vec(&body) { Ok(buf) => archive_response(200, "application/json; charset=utf-8", buf), @@ -6270,10 +17369,62 @@ fn handle_archive_http_request( } } "master.m3u8" if parts.len() == 3 => { - let encoded = urlencoding::encode(&broadcast_name); - let playlist = format!( - 
"#EXTM3U\n#EXT-X-VERSION:7\n#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID=\"audio\",NAME=\"main\",DEFAULT=YES,AUTOSELECT=YES,URI=\"/archive/{encoded}/audio.m3u8\"\n#EXT-X-STREAM-INF:BANDWIDTH=3500000,CODECS=\"avc1.640028,mp4a.40.2\",AUDIO=\"audio\"\n/archive/{encoded}/video.m3u8\n" - ); + let playlist = match archive_hls_master_playlist( + &state.manifest_root, + &broadcast_name, + &req_url, + now_unix_ms(), + state.archive_retention_seconds, + ) { + Ok(Some(playlist)) => playlist, + Ok(None) => { + let encoded = urlencoding::encode(&broadcast_name); + let query = archive_playlist_query(&req_url); + format!( + "#EXTM3U\n#EXT-X-VERSION:7\n#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID=\"audio\",NAME=\"main\",DEFAULT=YES,AUTOSELECT=YES,URI=\"/archive/{encoded}/audio.m3u8{query}\"\n#EXT-X-STREAM-INF:BANDWIDTH=3500000,CODECS=\"avc1.640028,mp4a.40.2\",AUDIO=\"audio\"\n/archive/{encoded}/video.m3u8{query}\n" + ) + } + Err(err) => return archive_error(500, &format!("{err:#}")), + }; + archive_response( + 200, + "application/vnd.apple.mpegurl; charset=utf-8", + playlist.into_bytes(), + ) + } + "track.m3u8" if parts.len() == 3 => { + let track_broadcast_name = req_url + .query_pairs() + .find(|(k, _)| k == "broadcast") + .map(|(_, value)| value.to_string()) + .unwrap_or_else(|| broadcast_name.clone()); + let track_name = req_url + .query_pairs() + .find(|(k, _)| k == "track") + .map(|(_, value)| value.to_string()) + .unwrap_or_else(|| WT_PUBLISH_PRIMARY_VIDEO_TRACK.to_string()); + let playlist_limit = limit.or(Some(WT_ARCHIVE_HLS_DEFAULT_LIMIT)); + let init_hash = + match latest_init_hash(&state.manifest_root, &track_broadcast_name, &track_name) { + Ok(Some(hash)) => hash, + Ok(None) => return archive_error(404, "missing init segment"), + Err(err) => return archive_error(500, &format!("{err:#}")), + }; + let records = match parse_archive_track( + &state.manifest_root, + &track_broadcast_name, + &track_name, + from_ms, + playlist_limit, + ) { + Ok(records) => records, + Err(err) => return 
archive_error(500, &format!("{err:#}")), + }; + if records.is_empty() { + return archive_error(404, "archive track is empty"); + } + let playlist = + hls_playlist_for_track(&track_broadcast_name, &track_name, &init_hash, &records); archive_response( 200, "application/vnd.apple.mpegurl; charset=utf-8", @@ -6281,18 +17432,22 @@ fn handle_archive_http_request( ) } "video.m3u8" if parts.len() == 3 => { - let init_hash = - match latest_init_hash(&state.manifest_root, &broadcast_name, "video0.m4s") { - Ok(Some(hash)) => hash, - Ok(None) => return archive_error(404, "missing init segment"), - Err(err) => return archive_error(500, &format!("{err:#}")), - }; + let playlist_limit = limit.or(Some(WT_ARCHIVE_HLS_DEFAULT_LIMIT)); + let init_hash = match latest_init_hash( + &state.manifest_root, + &broadcast_name, + WT_PUBLISH_PRIMARY_VIDEO_TRACK, + ) { + Ok(Some(hash)) => hash, + Ok(None) => return archive_error(404, "missing init segment"), + Err(err) => return archive_error(500, &format!("{err:#}")), + }; let records = match parse_archive_track( &state.manifest_root, &broadcast_name, - "video0.m4s", + WT_PUBLISH_PRIMARY_VIDEO_TRACK, from_ms, - limit, + playlist_limit, ) { Ok(records) => records, Err(err) => return archive_error(500, &format!("{err:#}")), @@ -6300,8 +17455,12 @@ fn handle_archive_http_request( if records.is_empty() { return archive_error(404, "video archive is empty"); } - let playlist = - hls_playlist_for_track(&broadcast_name, "video0.m4s", &init_hash, &records); + let playlist = hls_playlist_for_track( + &broadcast_name, + WT_PUBLISH_PRIMARY_VIDEO_TRACK, + &init_hash, + &records, + ); archive_response( 200, "application/vnd.apple.mpegurl; charset=utf-8", @@ -6309,18 +17468,22 @@ fn handle_archive_http_request( ) } "audio.m3u8" if parts.len() == 3 => { - let init_hash = - match latest_init_hash(&state.manifest_root, &broadcast_name, "audio0.m4s") { - Ok(Some(hash)) => hash, - Ok(None) => return archive_error(404, "missing init segment"), - Err(err) => return 
archive_error(500, &format!("{err:#}")), - }; + let playlist_limit = limit.or(Some(WT_ARCHIVE_HLS_DEFAULT_LIMIT)); + let init_hash = match latest_init_hash( + &state.manifest_root, + &broadcast_name, + WT_PUBLISH_PRIMARY_AUDIO_TRACK, + ) { + Ok(Some(hash)) => hash, + Ok(None) => return archive_error(404, "missing init segment"), + Err(err) => return archive_error(500, &format!("{err:#}")), + }; let records = match parse_archive_track( &state.manifest_root, &broadcast_name, - "audio0.m4s", + WT_PUBLISH_PRIMARY_AUDIO_TRACK, from_ms, - limit, + playlist_limit, ) { Ok(records) => records, Err(err) => return archive_error(500, &format!("{err:#}")), @@ -6328,8 +17491,12 @@ fn handle_archive_http_request( if records.is_empty() { return archive_error(404, "audio archive is empty"); } - let playlist = - hls_playlist_for_track(&broadcast_name, "audio0.m4s", &init_hash, &records); + let playlist = hls_playlist_for_track( + &broadcast_name, + WT_PUBLISH_PRIMARY_AUDIO_TRACK, + &init_hash, + &records, + ); archive_response( 200, "application/vnd.apple.mpegurl; charset=utf-8", @@ -6342,42 +17509,50 @@ fn handle_archive_http_request( .find(|(k, _)| k == "hash") .map(|(_, v)| v.to_string()) .or_else(|| { - latest_init_hash(&state.manifest_root, &broadcast_name, "video0.m4s") - .ok() - .flatten() + latest_init_hash( + &state.manifest_root, + &broadcast_name, + WT_PUBLISH_PRIMARY_VIDEO_TRACK, + ) + .ok() + .flatten() }) .or_else(|| { - latest_init_hash(&state.manifest_root, &broadcast_name, "audio0.m4s") - .ok() - .flatten() + latest_init_hash( + &state.manifest_root, + &broadcast_name, + WT_PUBLISH_PRIMARY_AUDIO_TRACK, + ) + .ok() + .flatten() }); let Some(hash) = hash else { return archive_error(404, "missing init segment"); }; - let path = match cas_path_for_hash(&state.cas_root, &hash) { - Ok(path) => path, - Err(_) => return archive_error(400, "invalid hash"), - }; - match fs::read(path) { - Ok(bytes) => match decode_archive_group_bytes(&bytes) { + if let Err(_) = 
cas_path_for_hash(&state.cas_root, &hash) { + return archive_error(400, "invalid hash"); + } + match read_archive_object(state, &hash).await { + Ok(Some(bytes)) => match decode_archive_group_bytes(&bytes) { Ok(payload) => archive_response(200, "video/mp4", payload), Err(err) => archive_error(500, &format!("{err:#}")), }, - Err(_) => archive_error(404, "init segment not found"), + Ok(None) => archive_error(404, "init segment not found"), + Err(err) => archive_error(502, &format!("{err:#}")), } } "segment" if parts.len() == 4 => { let hash_part = parts[3].strip_suffix(".m4s").unwrap_or(parts[3]); - let path = match cas_path_for_hash(&state.cas_root, hash_part) { - Ok(path) => path, - Err(_) => return archive_error(400, "invalid hash"), - }; - match fs::read(path) { - Ok(bytes) => match decode_archive_group_bytes(&bytes) { + if let Err(_) = cas_path_for_hash(&state.cas_root, hash_part) { + return archive_error(400, "invalid hash"); + } + match read_archive_object(state, hash_part).await { + Ok(Some(bytes)) => match decode_archive_group_bytes(&bytes) { Ok(payload) => archive_response(200, "video/mp4", payload), Err(err) => archive_error(500, &format!("{err:#}")), }, - Err(_) => archive_error(404, "segment not found"), + Ok(None) => archive_error(404, "segment not found"), + Err(err) => archive_error(502, &format!("{err:#}")), } } _ => archive_error(404, "not found"), @@ -6422,7 +17597,7 @@ async fn handle_archive_http_connection( let mut parts = first.split_whitespace(); let method = parts.next().unwrap_or(""); let target = parts.next().unwrap_or("/"); - let response = handle_archive_http_request(&state, method, target); + let response = handle_archive_http_request(&state, method, target).await; let is_head = method == "HEAD"; let body_len = if is_head { 0 } else { response.body.len() }; let head = format!( @@ -6462,7 +17637,8 @@ struct WtPublishRelayArgs { struct WtPublishRelayState { session: moq_lite::Session, broadcast: moq_lite::BroadcastProducer, - catalog: 
hang::CatalogProducer, + catalog: moq_mux::catalog::Producer, + init_track: moq_lite::TrackProducer, control_stop: Option>, } @@ -6717,7 +17893,7 @@ async fn open_wt_publish_relay(args: &WtPublishRelayArgs) -> Result Result Result Result> { + let mut offset = 0usize; + while data.len().saturating_sub(offset) >= 8 { + let size32 = u32::from_be_bytes([ + data[offset], + data[offset + 1], + data[offset + 2], + data[offset + 3], + ]); + let box_type = &data[offset + 4..offset + 8]; + let mut header_len = 8usize; + let box_size = if size32 == 1 { + if data.len().saturating_sub(offset) < 16 { + return Ok(None); + } + header_len = 16; + u64::from_be_bytes([ + data[offset + 8], + data[offset + 9], + data[offset + 10], + data[offset + 11], + data[offset + 12], + data[offset + 13], + data[offset + 14], + data[offset + 15], + ]) + } else if size32 == 0 { + return Ok(None); + } else { + u64::from(size32) + }; + if box_size < header_len as u64 { + return Err(anyhow!("invalid fMP4 box size")); + } + if box_type == b"moof" { + return Ok(Some(offset)); + } + let box_size = usize::try_from(box_size).context("fMP4 box too large")?; + let next = offset + .checked_add(box_size) + .ok_or_else(|| anyhow!("fMP4 box offset overflow"))?; + if data.len() < next { + return Ok(None); + } + offset = next; + } + Ok(None) +} + +async fn read_fmp4_init_replay_prefix( + reader: &mut R, + max_bytes: usize, +) -> Result<(Vec, Vec)> { + let mut buffer = Vec::new(); + let mut chunk = [0u8; 16 * 1024]; + loop { + let n = reader + .read(&mut chunk) + .await + .context("failed to read fMP4 publisher output")?; + if n == 0 { + return Err(anyhow!("fMP4 stream ended before first moof")); + } + if buffer.len().saturating_add(n) > max_bytes { + return Err(anyhow!( + "fMP4 init prefix exceeded {} bytes before first moof", + max_bytes + )); + } + buffer.extend_from_slice(&chunk[..n]); + let Some(moof_offset) = top_level_moof_offset(&buffer)? 
else { + continue; + }; + if moof_offset == 0 { + return Err(anyhow!("fMP4 stream started with moof and no init segment")); + } + return Ok((buffer[..moof_offset].to_vec(), buffer)); + } +} + +fn publish_raw_moq_group( + track: &mut moq_lite::TrackProducer, + track_name: &str, + sequence: u64, + data: Vec, +) -> Result<()> { + let mut group_writer = track + .create_group(moq_lite::Group { sequence }) + .with_context(|| format!("failed to create {track_name} group {sequence}"))?; + group_writer + .write_frame(bytes::Bytes::from(data)) + .with_context(|| format!("failed to write {track_name} group {sequence}"))?; + group_writer + .finish() + .with_context(|| format!("failed to finish {track_name} group {sequence}"))?; + Ok(()) +} + +async fn prepare_fmp4_import_reader( + reader: &mut R, + init_track: &mut moq_lite::TrackProducer, +) -> Result> { + let (init, replay_prefix) = + read_fmp4_init_replay_prefix(reader, WT_PUBLISH_INIT_CAPTURE_MAX_BYTES).await?; + if !init.starts_with(b"\0\0\0") || top_level_moof_offset(&init)?.is_some() { + return Err(anyhow!("captured fMP4 init prefix is invalid")); + } + publish_raw_moq_group(init_track, WT_PUBLISH_INIT_TRACK, 0, init)?; + Ok(replay_prefix) +} + +async fn import_fmp4_with_publisher_archive( + importer: &mut moq_mux::import::Fmp4, + reader: &mut R, + publisher_archive: &mut Option, +) -> Result<()> { + let mut import_buffer = BytesMut::new(); + let mut chunk = [0u8; 64 * 1024]; + loop { + let n = reader + .read(&mut chunk) + .await + .context("failed to read fMP4 publisher output")?; + if n == 0 { + break; + } + if let Some(writer) = publisher_archive.as_mut() { + writer.observe_bytes(&chunk[..n])?; + } + import_buffer.extend_from_slice(&chunk[..n]); + importer.decode(&mut import_buffer)?; + } + if let Some(writer) = publisher_archive.as_ref() { + writer.finish()?; + } + Ok(()) +} + +fn relay_announced_url(relay_url: &str) -> Result { + let mut url = + Url::parse(relay_url).with_context(|| format!("invalid relay url: 
{relay_url}"))?; + url.set_path("/announced"); + url.set_query(None); + url.set_fragment(None); + Ok(url) +} + +fn announced_body_contains_broadcast(body: &str, broadcast_name: &str) -> bool { + body.split_whitespace().any(|item| { + item.strip_prefix("anon/") + .unwrap_or(item) + .eq(broadcast_name) + }) +} + +async fn relay_announced_watchdog( + announced_url: Url, + broadcast_name: String, + missing_timeout: Duration, + interval: Duration, + tls_disable_verify: bool, +) -> Result<()> { + let client = reqwest::Client::builder() + .timeout(interval.min(Duration::from_secs(8))) + .danger_accept_invalid_certs(tls_disable_verify) + .build() + .context("failed to build relay announcement watchdog HTTP client")?; + let mut last_seen = Instant::now(); + let mut missing_logged = false; + + loop { + let observed = match client.get(announced_url.clone()).send().await { + Ok(res) if res.status().is_success() => { + let body = res + .text() + .await + .context("failed to read relay announcement body")?; + announced_body_contains_broadcast(&body, &broadcast_name) + } + Ok(res) => { + tracing::warn!( + status = %res.status(), + url = %announced_url, + broadcast = %broadcast_name, + "relay announcement probe returned non-success" + ); + false + } + Err(err) => { + tracing::warn!( + err = %err, + url = %announced_url, + broadcast = %broadcast_name, + "relay announcement probe failed" + ); + false + } + }; + + if observed { + if missing_logged { + tracing::info!( + url = %announced_url, + broadcast = %broadcast_name, + "relay announcement restored" + ); + } + last_seen = Instant::now(); + missing_logged = false; + } else { + if !missing_logged { + tracing::warn!( + url = %announced_url, + broadcast = %broadcast_name, + timeout_ms = missing_timeout.as_millis(), + "relay is not announcing broadcast" + ); + missing_logged = true; + } + if last_seen.elapsed() >= missing_timeout { + return Err(anyhow!( + "relay did not announce broadcast {broadcast_name} at {announced_url} for {}ms", 
+ missing_timeout.as_millis() + )); + } + } + + tokio::time::sleep(interval).await; + } +} + async fn wt_archive_serve(args: WtArchiveServeArgs) -> Result<()> { let manifest_root = args .manifest_dir .unwrap_or_else(|| args.output_dir.join("manifests")); let cas_root = args.output_dir.join("objects").join("blake3"); + let cache_access_root = args + .cache_access_dir + .unwrap_or_else(|| args.output_dir.join("cache-access").join("blake3")); + let archive_origin_url = args + .archive_origin_url + .as_deref() + .map(Url::parse) + .transpose() + .context("invalid --archive-origin-url")?; + let archive_cas_origin_url = args + .archive_cas_origin_url + .as_deref() + .map(Url::parse) + .transpose() + .context("invalid --archive-cas-origin-url")?; + let http_client = reqwest::Client::builder() + .timeout(Duration::from_secs(20)) + .build() + .context("failed to build archive origin HTTP client")?; fs::create_dir_all(&manifest_root) .with_context(|| format!("failed to create manifest dir {}", manifest_root.display()))?; fs::create_dir_all(&cas_root) .with_context(|| format!("failed to create CAS dir {}", cas_root.display()))?; + fs::create_dir_all(&cache_access_root).with_context(|| { + format!( + "failed to create cache access dir {}", + cache_access_root.display() + ) + })?; let listener = TcpListener::bind(&args.listen) .await @@ -6808,12 +18245,20 @@ async fn wt_archive_serve(args: WtArchiveServeArgs) -> Result<()> { listen = %local, manifest_root = %manifest_root.display(), cas_root = %cas_root.display(), + archive_origin_url = ?archive_origin_url, + archive_cas_origin_url = ?archive_cas_origin_url, "archive replay server listening" ); let state = ArchiveReplayState { cas_root, manifest_root, + archive_origin_url, + archive_cas_origin_url, + archive_origin_max_bytes: args.archive_origin_max_bytes, + archive_retention_seconds: args.archive_retention_seconds, + cache_access_root, + http_client, }; loop { @@ -6827,7 +18272,1311 @@ async fn wt_archive_serve(args: 
WtArchiveServeArgs) -> Result<()> { } } +fn publisher_start_boundary_delay_ms(now_ms: u64, boundary_ms: u64) -> u64 { + if boundary_ms == 0 { + return 0; + } + let remainder = now_ms % boundary_ms; + if remainder == 0 { + 0 + } else { + boundary_ms - remainder + } +} + +async fn wait_for_publisher_start_boundary(boundary_ms: u64) { + let delay_ms = publisher_start_boundary_delay_ms(now_unix_ms(), boundary_ms); + if delay_ms == 0 { + return; + } + tracing::info!( + boundary_ms, + delay_ms, + "waiting for publisher media start boundary" + ); + tokio::time::sleep(Duration::from_millis(delay_ms)).await; +} + +fn wt_publish_ffmpeg_args(args: &WtPublishArgs, publish_input: &str) -> Vec { + let mut cmd_args = vec![ + OsString::from("-hide_banner"), + OsString::from("-loglevel"), + OsString::from("error"), + OsString::from("-nostats"), + OsString::from("-fflags"), + OsString::from("+nobuffer+bitexact"), + OsString::from("-flags"), + OsString::from("low_delay"), + OsString::from("-copyts"), + ]; + + if args.realtime_input { + cmd_args.push(OsString::from("-re")); + } + + if let Some(format) = args.input_format.as_deref() { + cmd_args.push(OsString::from("-f")); + cmd_args.push(OsString::from(format)); + } + + if args.publisher_wallclock_timestamps { + cmd_args.push(OsString::from("-use_wallclock_as_timestamps")); + cmd_args.push(OsString::from("1")); + } + + cmd_args.push(OsString::from("-i")); + cmd_args.push(OsString::from(publish_input)); + + if args.transcode { + cmd_args.extend([ + OsString::from("-map"), + OsString::from("0:v:0"), + OsString::from("-map"), + OsString::from("0:a:0?"), + OsString::from("-c:v"), + OsString::from("libx264"), + OsString::from("-vf"), + OsString::from(args.video_filter.as_str()), + OsString::from("-preset"), + OsString::from(args.video_preset.as_str()), + OsString::from("-tune"), + OsString::from("zerolatency"), + OsString::from("-crf"), + OsString::from(args.video_crf.to_string()), + OsString::from("-pix_fmt"), + OsString::from("yuv420p"), 
+ OsString::from("-profile:v"), + OsString::from("main"), + OsString::from("-g"), + OsString::from(args.gop_frames.to_string()), + OsString::from("-keyint_min"), + OsString::from(args.gop_frames.to_string()), + OsString::from("-force_key_frames"), + OsString::from(format!( + "expr:if(isnan(prev_forced_t),gte(t,0),gte(t,prev_forced_t+{:.3}))", + args.publisher_archive_segment_duration_ms as f64 / 1000.0 + )), + OsString::from("-sc_threshold"), + OsString::from("0"), + OsString::from("-bf"), + OsString::from("0"), + OsString::from("-x264-params"), + OsString::from("open-gop=0:scenecut=0:rc-lookahead=0:sync-lookahead=0:stitchable=1"), + OsString::from("-threads"), + OsString::from("1"), + OsString::from("-c:a"), + OsString::from("aac"), + OsString::from("-profile:a"), + OsString::from("aac_low"), + OsString::from("-b:a"), + OsString::from("160k"), + OsString::from("-ac"), + OsString::from("2"), + OsString::from("-ar"), + OsString::from("48000"), + OsString::from("-af"), + OsString::from(ec_chopper::LIVE_AUDIO_RESAMPLE_FILTER), + OsString::from("-max_muxing_queue_size"), + OsString::from("2048"), + ]); + } else { + cmd_args.extend([OsString::from("-c"), OsString::from("copy")]); + } + + cmd_args.extend([ + OsString::from("-avoid_negative_ts"), + OsString::from("disabled"), + OsString::from("-f"), + OsString::from("mp4"), + OsString::from("-movflags"), + OsString::from(args.movflags.as_str()), + OsString::from("pipe:1"), + ]); + + cmd_args +} + +#[derive(Debug, Clone)] +struct StatelessProofEncodeProfile { + transcode: bool, + video_filter: String, + gop_frames: u32, + video_preset: String, + video_crf: u8, + movflags: String, +} + +impl StatelessProofEncodeProfile { + #[cfg(test)] + fn from_wt_publish_args(args: &WtPublishArgs) -> Self { + Self { + transcode: args.transcode, + video_filter: args.video_filter.clone(), + gop_frames: args.gop_frames, + video_preset: args.video_preset.clone(), + video_crf: args.video_crf, + movflags: args.movflags.clone(), + } + } + + fn 
from_publisher_proof_segment_args(args: &PublisherProofSegmentArgs) -> Self { + Self { + transcode: args.transcode, + video_filter: args.video_filter.clone(), + gop_frames: args.gop_frames, + video_preset: args.video_preset.clone(), + video_crf: args.video_crf, + movflags: args.movflags.clone(), + } + } + + fn from_publisher_proof_windows_args(args: &PublisherProofWindowsArgs) -> Self { + Self { + transcode: args.transcode, + video_filter: args.video_filter.clone(), + gop_frames: args.gop_frames, + video_preset: args.video_preset.clone(), + video_crf: args.video_crf, + movflags: args.movflags.clone(), + } + } + + fn from_publisher_proof_archive_source_args(args: &PublisherProofArchiveSourceArgs) -> Self { + Self { + transcode: args.transcode, + video_filter: args.video_filter.clone(), + gop_frames: args.gop_frames, + video_preset: args.video_preset.clone(), + video_crf: args.video_crf, + movflags: args.movflags.clone(), + } + } +} + +fn stateless_proof_ffmpeg_args( + profile: &StatelessProofEncodeProfile, + input_ts: &Path, + output_mp4: &Path, +) -> Vec { + let mut cmd_args = vec![ + OsString::from("-hide_banner"), + OsString::from("-loglevel"), + OsString::from("error"), + OsString::from("-nostdin"), + OsString::from("-y"), + OsString::from("-fflags"), + OsString::from("+bitexact"), + OsString::from("-i"), + input_ts.as_os_str().to_os_string(), + OsString::from("-map"), + OsString::from("0:v:0"), + OsString::from("-map"), + OsString::from("0:a:0?"), + OsString::from("-sn"), + OsString::from("-dn"), + OsString::from("-map_metadata"), + OsString::from("-1"), + OsString::from("-filter_threads"), + OsString::from("1"), + OsString::from("-filter_complex_threads"), + OsString::from("1"), + OsString::from("-c:v"), + OsString::from("libx264"), + OsString::from("-vf"), + OsString::from(profile.video_filter.as_str()), + OsString::from("-preset"), + OsString::from(profile.video_preset.as_str()), + OsString::from("-tune"), + OsString::from("zerolatency"), + 
OsString::from("-crf"), + OsString::from(profile.video_crf.to_string()), + OsString::from("-pix_fmt"), + OsString::from("yuv420p"), + OsString::from("-profile:v"), + OsString::from("main"), + OsString::from("-g"), + OsString::from(profile.gop_frames.to_string()), + OsString::from("-keyint_min"), + OsString::from(profile.gop_frames.to_string()), + OsString::from("-force_key_frames"), + OsString::from("expr:gte(t,0)"), + OsString::from("-sc_threshold"), + OsString::from("0"), + OsString::from("-bf"), + OsString::from("0"), + OsString::from("-x264-params"), + OsString::from("open-gop=0:scenecut=0:rc-lookahead=0:sync-lookahead=0:stitchable=1"), + OsString::from("-threads"), + OsString::from("1"), + OsString::from("-flags:v"), + OsString::from("+bitexact"), + OsString::from("-c:a"), + OsString::from("aac"), + OsString::from("-profile:a"), + OsString::from("aac_low"), + OsString::from("-b:a"), + OsString::from("160k"), + OsString::from("-ac"), + OsString::from("2"), + OsString::from("-ar"), + OsString::from("48000"), + OsString::from("-af"), + OsString::from(ec_chopper::LIVE_AUDIO_RESAMPLE_FILTER), + OsString::from("-flags:a"), + OsString::from("+bitexact"), + OsString::from("-max_muxing_queue_size"), + OsString::from("2048"), + OsString::from("-avoid_negative_ts"), + OsString::from("make_zero"), + OsString::from("-f"), + OsString::from("mp4"), + OsString::from("-movflags"), + OsString::from(profile.movflags.as_str()), + output_mp4.as_os_str().to_os_string(), + ]; + + if !profile.transcode { + cmd_args = vec![ + OsString::from("-hide_banner"), + OsString::from("-loglevel"), + OsString::from("error"), + OsString::from("-nostdin"), + OsString::from("-y"), + OsString::from("-fflags"), + OsString::from("+bitexact"), + OsString::from("-i"), + input_ts.as_os_str().to_os_string(), + OsString::from("-c"), + OsString::from("copy"), + OsString::from("-map_metadata"), + OsString::from("-1"), + OsString::from("-avoid_negative_ts"), + OsString::from("make_zero"), + 
OsString::from("-f"), + OsString::from("mp4"), + OsString::from("-movflags"), + OsString::from(profile.movflags.as_str()), + output_mp4.as_os_str().to_os_string(), + ]; + } + + cmd_args +} + +#[cfg(test)] +fn wt_publish_stateless_proof_ffmpeg_args( + args: &WtPublishArgs, + input_ts: &Path, + output_mp4: &Path, +) -> Vec { + stateless_proof_ffmpeg_args( + &StatelessProofEncodeProfile::from_wt_publish_args(args), + input_ts, + output_mp4, + ) +} + +#[derive(Debug, serde::Serialize)] +struct PublisherProofSegmentReport { + input_ts: PathBuf, + output_mp4: PathBuf, + size_bytes: usize, + init_size_bytes: usize, + init_blake3: String, + media_fragment_count: usize, + media_fragment_blake3: Vec, +} + +fn publisher_proof_segment_command(args: PublisherProofSegmentArgs) -> Result<()> { + if let Some(parent) = args.output_mp4.parent() { + if !parent.as_os_str().is_empty() { + fs::create_dir_all(parent).with_context(|| { + format!( + "failed to create proof output directory {}", + parent.display() + ) + })?; + } + } + + let profile = StatelessProofEncodeProfile::from_publisher_proof_segment_args(&args); + let status = Command::new("ffmpeg") + .args(stateless_proof_ffmpeg_args( + &profile, + &args.input_ts, + &args.output_mp4, + )) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::inherit()) + .status() + .context("failed to run stateless proof ffmpeg")?; + if !status.success() { + return Err(anyhow!("stateless proof ffmpeg exited with {status}")); + } + + let bytes = fs::read(&args.output_mp4) + .with_context(|| format!("failed to read {}", args.output_mp4.display()))?; + let split = split_fmp4_init_and_media(&bytes)?; + let media_fragment_blake3 = split + .media + .iter() + .map(|fragment| blake3::hash(fragment).to_hex().to_string()) + .collect::>(); + let report = PublisherProofSegmentReport { + input_ts: args.input_ts, + output_mp4: args.output_mp4, + size_bytes: bytes.len(), + init_size_bytes: split.init.len(), + init_blake3: 
blake3::hash(&split.init).to_hex().to_string(), + media_fragment_count: split.media.len(), + media_fragment_blake3, + }; + + if args.pretty { + println!("{}", serde_json::to_string_pretty(&report)?); + } else { + println!("{}", serde_json::to_string(&report)?); + } + Ok(()) +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +struct PublisherProofWindowReport { + chunk_index: u64, + chunk_start_27mhz: Option, + chunk_duration_27mhz: u64, + sync_status: String, + source_ts: PathBuf, + source_ts_blake3: String, + output_mp4: PathBuf, + size_bytes: usize, + init_size_bytes: usize, + init_blake3: String, + media_fragment_count: usize, + media_fragment_blake3: Vec, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +struct PublisherProofWindowsReport { + input_ts: PathBuf, + output_dir: PathBuf, + chunk_ms: u64, + preroll_packets: usize, + window_count: usize, + windows: Vec, +} + +fn publisher_proof_windows_report( + args: &PublisherProofWindowsArgs, +) -> Result { + if args.chunk_ms == 0 { + return Err(anyhow!("--chunk-ms must be greater than 0")); + } + + let source_window_dir = args.output_dir.join("source-windows"); + let proof_dir = args.output_dir.join("proof-mp4"); + fs::create_dir_all(&source_window_dir) + .with_context(|| format!("failed to create {}", source_window_dir.display()))?; + fs::create_dir_all(&proof_dir) + .with_context(|| format!("failed to create {}", proof_dir.display()))?; + + let input = File::open(&args.input_ts) + .with_context(|| format!("failed to open {}", args.input_ts.display()))?; + let manifest = ec_chopper::chunk_ts_stream_with_preroll( + input, + &source_window_dir, + args.chunk_ms, + args.max_chunks, + args.preroll_packets, + )?; + let profile = StatelessProofEncodeProfile::from_publisher_proof_windows_args(args); + let mut chunks = manifest.chunks; + chunks.sort_by_key(|chunk| chunk.index); + let mut windows = Vec::with_capacity(chunks.len()); + + for chunk in chunks { + let output_mp4 = 
proof_dir.join(format!("proof_{:010}.mp4", chunk.index)); + let status = Command::new("ffmpeg") + .args(stateless_proof_ffmpeg_args( + &profile, + &chunk.path, + &output_mp4, + )) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::inherit()) + .status() + .with_context(|| { + format!( + "failed to run stateless proof ffmpeg for chunk {}", + chunk.index + ) + })?; + if !status.success() { + return Err(anyhow!( + "stateless proof ffmpeg exited with {status} for chunk {}", + chunk.index + )); + } + + let bytes = fs::read(&output_mp4) + .with_context(|| format!("failed to read {}", output_mp4.display()))?; + let split = split_fmp4_init_and_media(&bytes)?; + let media_fragment_blake3 = split + .media + .iter() + .map(|fragment| blake3::hash(fragment).to_hex().to_string()) + .collect::>(); + + windows.push(PublisherProofWindowReport { + chunk_index: chunk.index, + chunk_start_27mhz: chunk.timing.chunk_start_27mhz, + chunk_duration_27mhz: chunk.timing.chunk_duration_27mhz, + sync_status: chunk.timing.sync_status, + source_ts_blake3: ec_chopper::hash_file_blake3(&chunk.path)?, + source_ts: chunk.path, + output_mp4, + size_bytes: bytes.len(), + init_size_bytes: split.init.len(), + init_blake3: blake3::hash(&split.init).to_hex().to_string(), + media_fragment_count: split.media.len(), + media_fragment_blake3, + }); + } + + Ok(PublisherProofWindowsReport { + input_ts: args.input_ts.clone(), + output_dir: args.output_dir.clone(), + chunk_ms: args.chunk_ms, + preroll_packets: args.preroll_packets, + window_count: windows.len(), + windows, + }) +} + +fn publisher_proof_windows_command(args: PublisherProofWindowsArgs) -> Result<()> { + let report = publisher_proof_windows_report(&args)?; + if args.pretty { + println!("{}", serde_json::to_string_pretty(&report)?); + } else { + println!("{}", serde_json::to_string(&report)?); + } + Ok(()) +} + +#[derive(Debug, serde::Serialize, serde::Deserialize)] +struct PublisherProofDuplicatePublisherReport { + publisher: String, 
+ report: PublisherProofWindowsReport, +} + +#[derive(Debug, serde::Serialize, serde::Deserialize)] +struct PublisherProofDuplicateDivergence { + chunk_index: u64, + reasons: Vec, + missing_publishers: Vec, + source_ts_blake3: BTreeMap, + init_blake3: BTreeMap, + media_fragment_blake3: BTreeMap>, +} + +#[derive(Debug, serde::Serialize, serde::Deserialize)] +struct PublisherProofDuplicatesReport { + input_ts: PathBuf, + output_dir: PathBuf, + chunk_ms: u64, + preroll_packets: usize, + publisher_count: usize, + compared_window_count: usize, + matching_window_count: usize, + divergent_window_count: usize, + ok: bool, + reasons: Vec, + publishers: Vec, + divergences: Vec, +} + +fn publisher_proof_duplicate_publishers( + args: &PublisherProofDuplicatesArgs, +) -> Result> { + let publishers = if args.publisher.is_empty() { + vec!["publisher-a".to_string(), "publisher-b".to_string()] + } else { + args.publisher.clone() + }; + if publishers.len() < 2 { + return Err(anyhow!( + "publisher-proof-duplicates requires at least two --publisher values" + )); + } + + let mut seen = BTreeSet::new(); + for publisher in &publishers { + if publisher.trim().is_empty() { + return Err(anyhow!("publisher identity must not be empty")); + } + if publisher == "." + || publisher == ".." 
+ || publisher.contains('/') + || publisher.contains('\\') + { + return Err(anyhow!( + "publisher identity {publisher:?} must be a single safe path component" + )); + } + if !seen.insert(publisher.clone()) { + return Err(anyhow!("publisher identity {publisher:?} is duplicated")); + } + } + + Ok(publishers) +} + +fn compare_publisher_proof_duplicate_reports( + publishers: &[String], + reports: &[PublisherProofDuplicatePublisherReport], +) -> (usize, usize, Vec) { + let mut chunk_indices = BTreeSet::new(); + let mut windows_by_publisher = Vec::new(); + + for publisher_report in reports { + let mut windows = BTreeMap::new(); + for window in &publisher_report.report.windows { + chunk_indices.insert(window.chunk_index); + windows.insert(window.chunk_index, window); + } + windows_by_publisher.push((publisher_report.publisher.as_str(), windows)); + } + + let mut matching_window_count = 0usize; + let mut divergences = Vec::new(); + + for chunk_index in chunk_indices { + let mut reasons = Vec::new(); + let mut missing_publishers = Vec::new(); + let mut source_ts_blake3 = BTreeMap::new(); + let mut init_blake3 = BTreeMap::new(); + let mut media_fragment_blake3 = BTreeMap::new(); + let mut source_values = BTreeSet::new(); + let mut init_values = BTreeSet::new(); + let mut media_values = BTreeSet::new(); + + for (publisher, windows) in &windows_by_publisher { + let Some(window) = windows.get(&chunk_index) else { + missing_publishers.push((*publisher).to_string()); + continue; + }; + source_values.insert(window.source_ts_blake3.clone()); + init_values.insert(window.init_blake3.clone()); + media_values.insert(window.media_fragment_blake3.clone()); + source_ts_blake3.insert((*publisher).to_string(), window.source_ts_blake3.clone()); + init_blake3.insert((*publisher).to_string(), window.init_blake3.clone()); + media_fragment_blake3.insert( + (*publisher).to_string(), + window.media_fragment_blake3.clone(), + ); + if window.media_fragment_count == 0 { + 
reasons.push(format!("{publisher}:media_fragments_empty")); + } + } + + if !missing_publishers.is_empty() { + reasons.push("missing_window".to_string()); + } + if source_values.len() > 1 { + reasons.push("source_ts_hash_mismatch".to_string()); + } + if init_values.len() > 1 { + reasons.push("init_hash_mismatch".to_string()); + } + if media_values.len() > 1 { + reasons.push("media_fragment_hash_mismatch".to_string()); + } + + if reasons.is_empty() && source_ts_blake3.len() == publishers.len() { + matching_window_count += 1; + } else { + divergences.push(PublisherProofDuplicateDivergence { + chunk_index, + reasons, + missing_publishers, + source_ts_blake3, + init_blake3, + media_fragment_blake3, + }); + } + } + + (matching_window_count, divergences.len(), divergences) +} + +fn publisher_proof_duplicates_report( + args: &PublisherProofDuplicatesArgs, +) -> Result { + if args.chunk_ms == 0 { + return Err(anyhow!("--chunk-ms must be greater than 0")); + } + + let publishers = publisher_proof_duplicate_publishers(args)?; + let mut publisher_reports = Vec::with_capacity(publishers.len()); + for publisher in &publishers { + let report = publisher_proof_windows_report(&PublisherProofWindowsArgs { + input_ts: args.input_ts.clone(), + output_dir: args.output_dir.join(publisher), + chunk_ms: args.chunk_ms, + max_chunks: args.max_chunks, + preroll_packets: args.preroll_packets, + transcode: args.transcode, + video_filter: args.video_filter.clone(), + gop_frames: args.gop_frames, + video_preset: args.video_preset.clone(), + video_crf: args.video_crf, + movflags: args.movflags.clone(), + pretty: false, + })?; + publisher_reports.push(PublisherProofDuplicatePublisherReport { + publisher: publisher.clone(), + report, + }); + } + + let compared_window_count = publisher_reports + .iter() + .flat_map(|publisher_report| { + publisher_report + .report + .windows + .iter() + .map(|window| window.chunk_index) + }) + .collect::>() + .len(); + let (matching_window_count, 
divergent_window_count, divergences) = + compare_publisher_proof_duplicate_reports(&publishers, &publisher_reports); + + let mut reasons = Vec::new(); + if compared_window_count == 0 { + reasons.push("no_windows".to_string()); + } + if divergent_window_count > 0 { + reasons.push("divergent_windows".to_string()); + } + if matching_window_count != compared_window_count { + reasons.push("matching_window_count_incomplete".to_string()); + } + + Ok(PublisherProofDuplicatesReport { + input_ts: args.input_ts.clone(), + output_dir: args.output_dir.clone(), + chunk_ms: args.chunk_ms, + preroll_packets: args.preroll_packets, + publisher_count: publishers.len(), + compared_window_count, + matching_window_count, + divergent_window_count, + ok: reasons.is_empty(), + reasons, + publishers: publisher_reports, + divergences, + }) +} + +fn publisher_proof_duplicates_command(args: PublisherProofDuplicatesArgs) -> Result<()> { + let report = publisher_proof_duplicates_report(&args)?; + if args.pretty { + println!("{}", serde_json::to_string_pretty(&report)?); + } else { + println!("{}", serde_json::to_string(&report)?); + } + if args.require_ok && !report.ok { + return Err(anyhow!( + "publisher proof duplicate comparison failed: {}", + report.reasons.join(",") + )); + } + Ok(()) +} + +#[derive(Debug, serde::Serialize)] +struct PublisherProofCompareInputSummary { + publisher: String, + report_path: PathBuf, + input_ts: PathBuf, + output_dir: PathBuf, + chunk_ms: u64, + preroll_packets: usize, + window_count: usize, +} + +#[derive(Debug, serde::Serialize)] +struct PublisherProofCompareReport { + publisher_count: usize, + compared_window_count: usize, + matching_window_count: usize, + divergent_window_count: usize, + ok: bool, + reasons: Vec, + inputs: Vec, + publishers: Vec, + divergences: Vec, +} + +fn publisher_proof_compare_report( + args: &PublisherProofCompareArgs, +) -> Result { + if args.report.len() < 2 { + return Err(anyhow!( + "publisher-proof-compare requires at least two 
--report NAME=PATH entries" + )); + } + + let mut seen_publishers = BTreeSet::new(); + let mut publisher_reports = Vec::with_capacity(args.report.len()); + let mut inputs = Vec::with_capacity(args.report.len()); + + for item in &args.report { + let (publisher, report_path) = parse_named_proof_report_path(item)?; + validate_publisher_proof_label(&publisher)?; + if !seen_publishers.insert(publisher.clone()) { + return Err(anyhow!( + "publisher proof report {publisher:?} is duplicated" + )); + } + + let bytes = fs::read(&report_path) + .with_context(|| format!("failed to read proof report {}", report_path.display()))?; + let report = + serde_json::from_slice::(&bytes).with_context(|| { + format!( + "failed to parse proof report JSON {}", + report_path.display() + ) + })?; + inputs.push(PublisherProofCompareInputSummary { + publisher: publisher.clone(), + report_path, + input_ts: report.input_ts.clone(), + output_dir: report.output_dir.clone(), + chunk_ms: report.chunk_ms, + preroll_packets: report.preroll_packets, + window_count: report.window_count, + }); + publisher_reports.push(PublisherProofDuplicatePublisherReport { publisher, report }); + } + + let publishers = publisher_reports + .iter() + .map(|report| report.publisher.clone()) + .collect::>(); + let compared_window_count = publisher_reports + .iter() + .flat_map(|publisher_report| { + publisher_report + .report + .windows + .iter() + .map(|window| window.chunk_index) + }) + .collect::>() + .len(); + let (matching_window_count, divergent_window_count, divergences) = + compare_publisher_proof_duplicate_reports(&publishers, &publisher_reports); + + let mut reasons = Vec::new(); + if compared_window_count == 0 { + reasons.push("no_windows".to_string()); + } + if inputs + .iter() + .map(|input| input.chunk_ms) + .collect::>() + .len() + > 1 + { + reasons.push("chunk_ms_mismatch".to_string()); + } + if inputs + .iter() + .map(|input| input.preroll_packets) + .collect::>() + .len() + > 1 + { + 
reasons.push("preroll_packets_mismatch".to_string()); + } + if divergent_window_count > 0 { + reasons.push("divergent_windows".to_string()); + } + if matching_window_count != compared_window_count { + reasons.push("matching_window_count_incomplete".to_string()); + } + + Ok(PublisherProofCompareReport { + publisher_count: publishers.len(), + compared_window_count, + matching_window_count, + divergent_window_count, + ok: reasons.is_empty(), + reasons, + inputs, + publishers: publisher_reports, + divergences, + }) +} + +fn publisher_proof_compare_command(args: PublisherProofCompareArgs) -> Result<()> { + let report = publisher_proof_compare_report(&args)?; + if args.pretty { + println!("{}", serde_json::to_string_pretty(&report)?); + } else { + println!("{}", serde_json::to_string(&report)?); + } + if args.require_ok && !report.ok { + return Err(anyhow!( + "publisher proof report comparison failed: {}", + report.reasons.join(",") + )); + } + Ok(()) +} + +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)] +struct PublisherProofRemoteTarget { + publisher: String, + host: String, +} + +#[derive(Debug, serde::Serialize)] +struct PublisherProofRemoteExecutionReport { + publisher: String, + host: String, + remote_root: String, + remote_input_ts: String, + remote_output_dir: String, + local_report_path: PathBuf, + upload_elapsed_ms: u64, + proof_elapsed_ms: u64, + cleanup_elapsed_ms: Option, +} + +#[derive(Debug, serde::Serialize)] +struct PublisherProofRemoteCompareReport { + input_ts: PathBuf, + input_ts_blake3: String, + output_dir: PathBuf, + remote_count: usize, + chunk_ms: u64, + preroll_packets: usize, + remote_root: String, + ok: bool, + reasons: Vec, + remotes: Vec, + compare: PublisherProofCompareReport, +} + +fn publisher_proof_remote_compare_command(args: PublisherProofRemoteCompareArgs) -> Result<()> { + let report = publisher_proof_remote_compare_report(&args)?; + if args.pretty { + println!("{}", serde_json::to_string_pretty(&report)?); + } else { + 
println!("{}", serde_json::to_string(&report)?); + } + if args.require_ok && !report.ok { + return Err(anyhow!( + "remote publisher proof comparison failed: {}", + report.reasons.join(",") + )); + } + Ok(()) +} + +fn publisher_proof_remote_compare_report( + args: &PublisherProofRemoteCompareArgs, +) -> Result { + if args.chunk_ms == 0 { + return Err(anyhow!("--chunk-ms must be greater than 0")); + } + if args.remote.len() < 2 { + return Err(anyhow!( + "publisher-proof-remote-compare requires at least two --remote NAME=HOST targets" + )); + } + validate_remote_shell_component("remote ec-node", &args.remote_ec_node)?; + validate_remote_command_arg("video filter", &args.video_filter)?; + validate_remote_command_arg("video preset", &args.video_preset)?; + validate_remote_command_arg("movflags", &args.movflags)?; + for option in &args.ssh_option { + validate_ssh_option(option)?; + } + + let targets = parse_publisher_proof_remote_targets(&args.remote)?; + let remote_root = publisher_proof_remote_root(args)?; + validate_remote_absolute_tmp_path("remote root", &remote_root)?; + + let reports_dir = args.output_dir.join("reports"); + fs::create_dir_all(&reports_dir) + .with_context(|| format!("failed to create {}", reports_dir.display()))?; + let input_ts_blake3 = ec_chopper::hash_file_blake3(&args.input_ts)?; + let mut executions = Vec::with_capacity(targets.len()); + + for target in &targets { + let execution = run_publisher_proof_remote_target(args, target, &remote_root, &reports_dir) + .with_context(|| { + format!( + "failed to run remote publisher proof for {} on {}", + target.publisher, target.host + ) + })?; + executions.push(execution); + } + + let compare_args = PublisherProofCompareArgs { + report: executions + .iter() + .map(|execution| { + format!( + "{}={}", + execution.publisher, + execution.local_report_path.display() + ) + }) + .collect(), + pretty: false, + require_ok: false, + }; + let compare = publisher_proof_compare_report(&compare_args)?; + let 
compare_path = args.output_dir.join("compare.json"); + fs::write(&compare_path, serde_json::to_vec_pretty(&compare)?) + .with_context(|| format!("failed to write {}", compare_path.display()))?; + + let mut reasons = compare.reasons.clone(); + if !compare.ok && reasons.is_empty() { + reasons.push("remote_compare_failed".to_string()); + } + + Ok(PublisherProofRemoteCompareReport { + input_ts: args.input_ts.clone(), + input_ts_blake3, + output_dir: args.output_dir.clone(), + remote_count: targets.len(), + chunk_ms: args.chunk_ms, + preroll_packets: args.preroll_packets, + remote_root, + ok: compare.ok, + reasons, + remotes: executions, + compare, + }) +} + +fn run_publisher_proof_remote_target( + args: &PublisherProofRemoteCompareArgs, + target: &PublisherProofRemoteTarget, + remote_root: &str, + reports_dir: &Path, +) -> Result { + let remote_dir = format!("{}/{}", remote_root.trim_end_matches('/'), target.publisher); + let remote_input_ts = format!("{remote_dir}/source.ts"); + let remote_output_dir = format!("{remote_dir}/proof"); + validate_remote_absolute_tmp_path("remote directory", &remote_dir)?; + validate_remote_absolute_tmp_path("remote input", &remote_input_ts)?; + validate_remote_absolute_tmp_path("remote output", &remote_output_dir)?; + + let upload_started = Instant::now(); + run_ssh_status( + &args.ssh_option, + &target.host, + &["mkdir", "-p", &remote_dir], + )?; + run_scp_status( + &args.ssh_option, + &args.input_ts, + &target.host, + &remote_input_ts, + )?; + let upload_elapsed_ms = elapsed_ms(upload_started.elapsed()); + + let proof_started = Instant::now(); + let mut remote_command = publisher_proof_remote_command_args( + args, + &args.remote_ec_node, + &remote_input_ts, + &remote_output_dir, + ); + let output = run_ssh_output(&args.ssh_option, &target.host, &remote_command)?; + let proof_elapsed_ms = elapsed_ms(proof_started.elapsed()); + remote_command.clear(); + + let local_report_path = reports_dir.join(format!("{}.json", target.publisher)); + 
fs::write(&local_report_path, output.stdout) + .with_context(|| format!("failed to write {}", local_report_path.display()))?; + + let cleanup_elapsed_ms = if args.cleanup_remote { + let cleanup_started = Instant::now(); + run_ssh_status(&args.ssh_option, &target.host, &["rm", "-rf", &remote_dir])?; + Some(elapsed_ms(cleanup_started.elapsed())) + } else { + None + }; + + Ok(PublisherProofRemoteExecutionReport { + publisher: target.publisher.clone(), + host: target.host.clone(), + remote_root: remote_root.to_string(), + remote_input_ts, + remote_output_dir, + local_report_path, + upload_elapsed_ms, + proof_elapsed_ms, + cleanup_elapsed_ms, + }) +} + +fn publisher_proof_remote_command_args( + args: &PublisherProofRemoteCompareArgs, + remote_ec_node: &str, + remote_input_ts: &str, + remote_output_dir: &str, +) -> Vec { + let mut command = vec![ + remote_ec_node.to_string(), + "publisher-proof-windows".to_string(), + "--input-ts".to_string(), + remote_input_ts.to_string(), + "--output-dir".to_string(), + remote_output_dir.to_string(), + "--chunk-ms".to_string(), + args.chunk_ms.to_string(), + "--preroll-packets".to_string(), + args.preroll_packets.to_string(), + "--transcode".to_string(), + args.transcode.to_string(), + "--video-filter".to_string(), + args.video_filter.clone(), + "--gop-frames".to_string(), + args.gop_frames.to_string(), + "--video-preset".to_string(), + args.video_preset.clone(), + "--video-crf".to_string(), + args.video_crf.to_string(), + "--movflags".to_string(), + args.movflags.clone(), + ]; + if let Some(max_chunks) = args.max_chunks { + command.push("--max-chunks".to_string()); + command.push(max_chunks.to_string()); + } + command +} + +fn parse_publisher_proof_remote_targets( + values: &[String], +) -> Result> { + let mut seen = BTreeSet::new(); + let mut targets = Vec::with_capacity(values.len()); + for value in values { + let (publisher, host) = value + .split_once('=') + .ok_or_else(|| anyhow!("expected NAME=HOST for --remote, got 
{value:?}"))?; + let publisher = publisher.trim(); + let host = host.trim(); + validate_publisher_proof_label(publisher)?; + validate_remote_target_host(host)?; + if !seen.insert(publisher.to_string()) { + return Err(anyhow!("remote publisher {publisher:?} is duplicated")); + } + targets.push(PublisherProofRemoteTarget { + publisher: publisher.to_string(), + host: host.to_string(), + }); + } + Ok(targets) +} + +fn publisher_proof_remote_root(args: &PublisherProofRemoteCompareArgs) -> Result { + if let Some(root) = &args.remote_root { + return Ok(root.trim_end_matches('/').to_string()); + } + let now_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_millis(); + Ok(format!( + "/tmp/every-channel-publisher-proof-{}-{now_ms}", + std::process::id() + )) +} + +fn validate_remote_target_host(host: &str) -> Result<()> { + if host.is_empty() { + return Err(anyhow!("remote host must not be empty")); + } + if !host.chars().all(|ch| { + ch.is_ascii_alphanumeric() || matches!(ch, '.' | '-' | '_' | '@' | ':' | '[' | ']') + }) { + return Err(anyhow!("remote host {host:?} has unsupported characters")); + } + Ok(()) +} + +fn validate_remote_shell_component(name: &str, value: &str) -> Result<()> { + if value.trim().is_empty() { + return Err(anyhow!("{name} must not be empty")); + } + if !value.chars().all(|ch| { + ch.is_ascii_alphanumeric() || matches!(ch, '.' 
| '-' | '_' | '/' | ':' | '+' | '=') + }) { + return Err(anyhow!("{name} {value:?} has unsupported characters")); + } + Ok(()) +} + +fn validate_remote_command_arg(name: &str, value: &str) -> Result<()> { + if value.trim().is_empty() { + return Err(anyhow!("{name} must not be empty")); + } + if value.chars().any(|ch| { + ch.is_whitespace() + || matches!( + ch, + '\'' | '"' | '`' | '$' | ';' | '&' | '|' | '<' | '>' | '\\' | '(' | ')' | '{' | '}' + ) + }) { + return Err(anyhow!("{name} {value:?} has unsupported shell characters")); + } + Ok(()) +} + +fn validate_ssh_option(value: &str) -> Result<()> { + if value.trim().is_empty() { + return Err(anyhow!("ssh option must not be empty")); + } + if value.contains(char::is_whitespace) { + return Err(anyhow!("ssh option {value:?} must not contain whitespace")); + } + Ok(()) +} + +fn validate_remote_absolute_tmp_path(name: &str, value: &str) -> Result<()> { + validate_remote_shell_component(name, value)?; + if !value.starts_with("/tmp/every-channel-") { + return Err(anyhow!( + "{name} must live under /tmp/every-channel-* for remote proof safety" + )); + } + Ok(()) +} + +fn run_ssh_status(ssh_options: &[String], host: &str, remote_args: &[&str]) -> Result<()> { + let mut command = Command::new("ssh"); + push_ssh_options(&mut command, ssh_options); + command.arg(host); + for arg in remote_args { + command.arg(arg); + } + let output = command + .stdin(Stdio::null()) + .output() + .with_context(|| format!("failed to run ssh command on {host}"))?; + ensure_command_success("ssh", host, &output) +} + +fn run_ssh_output( + ssh_options: &[String], + host: &str, + remote_args: &[String], +) -> Result { + let mut command = Command::new("ssh"); + push_ssh_options(&mut command, ssh_options); + command.arg(host); + for arg in remote_args { + command.arg(arg); + } + let output = command + .stdin(Stdio::null()) + .output() + .with_context(|| format!("failed to run ssh command on {host}"))?; + ensure_command_success("ssh", host, &output)?; 
+ Ok(output) +} + +fn run_scp_status( + ssh_options: &[String], + source: &Path, + host: &str, + remote_path: &str, +) -> Result<()> { + let mut command = Command::new("scp"); + push_ssh_options(&mut command, ssh_options); + command.arg(source); + command.arg(format!("{host}:{remote_path}")); + let output = command + .stdin(Stdio::null()) + .output() + .with_context(|| format!("failed to copy proof input to {host}"))?; + ensure_command_success("scp", host, &output) +} + +fn push_ssh_options(command: &mut Command, ssh_options: &[String]) { + for option in ssh_options { + command.arg("-o").arg(option); + } +} + +fn ensure_command_success(command: &str, host: &str, output: &std::process::Output) -> Result<()> { + if output.status.success() { + return Ok(()); + } + let stderr = String::from_utf8_lossy(&output.stderr); + let stdout = String::from_utf8_lossy(&output.stdout); + Err(anyhow!( + "{command} on {host} exited with {}: stdout={} stderr={}", + output.status, + stdout.trim(), + stderr.trim() + )) +} + +fn elapsed_ms(duration: Duration) -> u64 { + duration.as_millis().try_into().unwrap_or(u64::MAX) +} + +fn parse_named_proof_report_path(value: &str) -> Result<(String, PathBuf)> { + let (name, path) = value + .split_once('=') + .ok_or_else(|| anyhow!("expected NAME=PATH for --report, got {value:?}"))?; + let name = name.trim(); + let path = path.trim(); + if name.is_empty() || path.is_empty() { + return Err(anyhow!("expected NAME=PATH for --report, got {value:?}")); + } + Ok((name.to_string(), PathBuf::from(path))) +} + +fn validate_publisher_proof_label(label: &str) -> Result<()> { + if label.trim().is_empty() { + return Err(anyhow!("publisher proof label must not be empty")); + } + if label == "." || label == ".." 
|| label.contains('/') || label.contains('\\') { + return Err(anyhow!( + "publisher proof label {label:?} must be a single safe path component" + )); + } + Ok(()) +} + async fn wt_publish(args: WtPublishArgs) -> Result<()> { + if args.publisher_archive_segment_duration_ms == 0 { + return Err(anyhow!( + "--publisher-archive-segment-duration-ms must be greater than 0" + )); + } + let publish_input = prepare_wt_publish_input(&args).await?; + let mut publisher_archive_child = + spawn_publisher_proof_archive_source_child(&args, &publish_input)?; + let mut publisher_archive = None; + let mut relay = open_wt_publish_relay(&WtPublishRelayArgs { url: args.url.clone(), name: args.name.clone(), @@ -6843,84 +19592,45 @@ async fn wt_publish(args: WtPublishArgs) -> Result<()> { .await?; // Spawn ffmpeg to generate fMP4 suitable for hang/moq-mux. + wait_for_publisher_start_boundary(args.publisher_start_boundary_ms).await; let mut cmd = TokioCommand::new("ffmpeg"); - cmd.arg("-hide_banner") - .arg("-loglevel") - .arg("error") - .arg("-nostats") - .arg("-fflags") - .arg("+nobuffer") - .arg("-flags") - .arg("low_delay") - .arg("-i") - .arg(&args.input); - - if args.transcode { - cmd.args([ - "-map", - "0:v:0", - "-map", - "0:a:0?", - "-c:v", - "libx264", - "-vf", - args.video_filter.as_str(), - "-preset", - "veryfast", - "-tune", - "zerolatency", - "-pix_fmt", - "yuv420p", - "-profile:v", - "main", - "-g", - &args.gop_frames.to_string(), - "-keyint_min", - &args.gop_frames.to_string(), - "-sc_threshold", - "0", - "-threads", - "1", - "-c:a", - "aac", - "-profile:a", - "aac_low", - "-b:a", - "160k", - "-ac", - "2", - "-ar", - "48000", - "-af", - ec_chopper::LIVE_AUDIO_RESAMPLE_FILTER, - "-max_muxing_queue_size", - "2048", - ]); - } else { - cmd.args(["-c", "copy"]); - } - - cmd.args(["-f", "mp4", "-movflags", args.movflags.as_str(), "pipe:1"]); - + cmd.args(wt_publish_ffmpeg_args(&args, &publish_input)); cmd.stdout(Stdio::piped()); cmd.stderr(Stdio::inherit()); - 
cmd.kill_on_drop(true); - tracing::info!(input=%args.input, "spawning ffmpeg"); + tracing::info!(input=%publish_input, "spawning ffmpeg"); let mut child = cmd.spawn().context("failed to spawn ffmpeg")?; let stdout = child .stdout .take() .ok_or_else(|| anyhow!("ffmpeg stdout unavailable"))?; - let config = moq_mux::import::Fmp4Config { - passthrough: args.passthrough, - }; - let mut importer = moq_mux::import::Fmp4::new(relay.broadcast, relay.catalog, config); + if !args.passthrough { + tracing::debug!( + "moq-mux 0.4 fMP4 import preserves CMAF fragments; --passthrough=false is accepted for CLI compatibility" + ); + } + let mut importer = moq_mux::import::Fmp4::new(relay.broadcast, relay.catalog); let mut stdout = stdout; - let decode_fut = importer.decode_from(&mut stdout); + let replay_prefix = prepare_fmp4_import_reader(&mut stdout, &mut relay.init_track).await?; + let replay = std::io::Cursor::new(replay_prefix); + let mut import_reader = tokio::io::AsyncReadExt::chain(replay, stdout); + let decode_fut = import_fmp4_with_publisher_archive( + &mut importer, + &mut import_reader, + &mut publisher_archive, + ); tokio::pin!(decode_fut); + let announced_watchdog_url = relay_announced_url(&args.url)?; + let announced_watchdog_fut = relay_announced_watchdog( + announced_watchdog_url, + args.name.clone(), + Duration::from_millis(args.relay_announced_watchdog_ms), + Duration::from_millis(args.relay_announced_watchdog_interval_ms.max(1000)), + args.tls_disable_verify, + ); + tokio::pin!(announced_watchdog_fut); tracing::info!("publishing fMP4 -> moq-mux -> relay"); let outcome = tokio::select! 
{ @@ -6936,6 +19646,29 @@ async fn wt_publish(args: WtPublishArgs) -> Result<()> { let _ = child.kill().await; Err(anyhow!("relay session closed")) } + res = &mut announced_watchdog_fut, if args.relay_announced_watchdog_ms > 0 => { + let _ = child.kill().await; + match res { + Ok(()) => Err(anyhow!("relay announcement watchdog exited unexpectedly")), + Err(err) => Err(err).context("relay announcement watchdog failed"), + } + } + res = async { + if let Some(child) = publisher_archive_child.as_mut() { + child + .wait() + .await + .context("failed to wait for publisher source proof archive worker") + } else { + std::future::pending::>().await + } + }, if publisher_archive_child.is_some() => { + let _ = child.kill().await; + match res { + Ok(status) => Err(anyhow!("publisher source proof archive worker exited with {status}")), + Err(err) => Err(err), + } + } _ = tokio::signal::ctrl_c() => { tracing::info!("ctrl-c; shutting down"); let _ = child.kill().await; @@ -6944,6 +19677,10 @@ async fn wt_publish(args: WtPublishArgs) -> Result<()> { } }; + if let Some(child) = publisher_archive_child.as_mut() { + let _ = child.kill().await; + } + if let Some(stop) = relay.control_stop.take() { let _ = stop.send(()); } @@ -6951,6 +19688,85 @@ async fn wt_publish(args: WtPublishArgs) -> Result<()> { outcome } +#[derive(Debug, Clone, PartialEq, Eq)] +struct NtscRsPreprocessPlan { + cli: PathBuf, + args: Vec, + output: PathBuf, +} + +fn ntsc_rs_preprocess_plan(args: &WtPublishArgs) -> Result> { + let Some(preset) = args.ntsc_rs_preset.as_ref() else { + return Ok(None); + }; + if args.input_format.is_some() { + return Err(anyhow!( + "--ntsc-rs-preset cannot be combined with --input-format; ntsc-rs writes a normal output file for ffmpeg" + )); + } + let output = args.ntsc_rs_output.as_ref().ok_or_else(|| { + anyhow!( + "--ntsc-rs-preset requires --ntsc-rs-output so the preprocessed file path is explicit" + ) + })?; + if output.as_os_str().is_empty() { + return 
Err(anyhow!("--ntsc-rs-output must not be empty")); + } + let output_lossy = output.to_string_lossy(); + if args.input == output_lossy.as_ref() { + return Err(anyhow!("--ntsc-rs-output must not overwrite --input")); + } + + Ok(Some(NtscRsPreprocessPlan { + cli: args.ntsc_rs_cli.clone(), + args: vec![ + OsString::from("-i"), + OsString::from(args.input.as_str()), + OsString::from("-o"), + output.as_os_str().to_os_string(), + OsString::from("-p"), + preset.as_os_str().to_os_string(), + OsString::from("--overwrite"), + ], + output: output.clone(), + })) +} + +async fn prepare_wt_publish_input(args: &WtPublishArgs) -> Result { + let Some(plan) = ntsc_rs_preprocess_plan(args)? else { + return Ok(args.input.clone()); + }; + + if let Some(parent) = plan.output.parent() { + if !parent.as_os_str().is_empty() { + fs::create_dir_all(parent).with_context(|| { + format!( + "failed to create ntsc-rs output directory {}", + parent.display() + ) + })?; + } + } + + tracing::info!( + cli=%plan.cli.display(), + output=%plan.output.display(), + "running ntsc-rs preprocessor" + ); + let status = TokioCommand::new(&plan.cli) + .args(&plan.args) + .stdout(Stdio::inherit()) + .stderr(Stdio::inherit()) + .status() + .await + .with_context(|| format!("failed to run ntsc-rs CLI {}", plan.cli.display()))?; + if !status.success() { + return Err(anyhow!("ntsc-rs CLI exited with {status}")); + } + + Ok(plan.output.to_string_lossy().into_owned()) +} + fn nbc_bootstrap(args: NbcBootstrapArgs) -> Result<()> { let chrome_path = resolve_nbc_chrome_path(args.chrome_path.as_deref())?; let profile_dir = resolve_nbc_profile_dir(args.profile_dir.as_deref())?; @@ -7030,6 +19846,8 @@ async fn nbc_wt_publish(args: NbcWtPublishArgs) -> Result<()> { .args([ "-sc_threshold", "0", + "-bf", + "0", "-threads", "1", "-f", @@ -7042,7 +19860,6 @@ async fn nbc_wt_publish(args: NbcWtPublishArgs) -> Result<()> { cmd.stdin(Stdio::piped()); cmd.stdout(Stdio::piped()); cmd.stderr(Stdio::inherit()); - 
cmd.kill_on_drop(true); tracing::info!( source_url = %args.source_url, @@ -7066,13 +19883,18 @@ async fn nbc_wt_publish(args: NbcWtPublishArgs) -> Result<()> { .take() .ok_or_else(|| anyhow!("ffmpeg stdout unavailable"))?; - let config = moq_mux::import::Fmp4Config { - passthrough: args.passthrough, - }; - let mut importer = moq_mux::import::Fmp4::new(relay.broadcast, relay.catalog, config); + if !args.passthrough { + tracing::debug!( + "moq-mux 0.4 fMP4 import preserves CMAF fragments; --passthrough=false is accepted for CLI compatibility" + ); + } + let mut importer = moq_mux::import::Fmp4::new(relay.broadcast, relay.catalog); let mut stdout = stdout; - let decode_fut = importer.decode_from(&mut stdout); + let replay_prefix = prepare_fmp4_import_reader(&mut stdout, &mut relay.init_track).await?; + let replay = std::io::Cursor::new(replay_prefix); + let mut import_reader = tokio::io::AsyncReadExt::chain(replay, stdout); + let decode_fut = importer.decode_from(&mut import_reader); tokio::pin!(decode_fut); tracing::info!("publishing NBC browser capture -> fMP4 -> moq-mux -> relay"); diff --git a/crates/ec-node/src/nbc.rs b/crates/ec-node/src/nbc.rs index f62419d..d8d4bd6 100644 --- a/crates/ec-node/src/nbc.rs +++ b/crates/ec-node/src/nbc.rs @@ -2,7 +2,7 @@ use anyhow::{anyhow, Context, Result}; use headless_chrome::protocol::cdp::Page; use headless_chrome::{Browser, Tab}; use serde::{Deserialize, Serialize}; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::env; use std::fs; use std::io::{BufRead, BufReader, Cursor, Read, Write}; @@ -65,11 +65,15 @@ pub struct BootstrapResult { pub page_url: String, pub interactive_auth_required: bool, pub authorized: bool, + pub video_ready: bool, + pub current_time: f64, + pub width: u64, + pub height: u64, pub screenshot_path: Option, } -#[derive(Debug)] struct WaitOutcome { + tab: Arc, state: NbcVideoState, trace: NbcTraceState, interactive_auth_required: bool, @@ -199,6 +203,14 @@ fn 
nbc_bootstrap_timeout() -> Duration { .unwrap_or_else(|| Duration::from_secs(1800)) } +fn nbc_profile_signin_gate_timeout() -> Duration { + env::var("EVERY_CHANNEL_NBC_PROFILE_SIGNIN_GATE_TIMEOUT_SECS") + .ok() + .and_then(|value| value.parse::().ok()) + .map(Duration::from_secs) + .unwrap_or_else(|| Duration::from_secs(8)) +} + fn nbc_env_flag(name: &str) -> Option { env::var(name).ok().map(|value| { let value = value.trim().to_ascii_lowercase(); @@ -403,6 +415,10 @@ pub fn bootstrap_nbc_auth( )?; Ok(BootstrapResult { + video_ready: nbc_video_state_has_decoded_frame(&outcome.state), + current_time: outcome.state.current_time, + width: outcome.state.width, + height: outcome.state.height, title: outcome.state.title, page_url: outcome.state.page_url, interactive_auth_required: outcome.interactive_auth_required, @@ -619,7 +635,7 @@ fn run_nbc_capture_loop( register_nbc_trace_handlers(&tab, trace.clone())?; tab.navigate_to(&url)?; tab.wait_until_navigated()?; - wait_for_nbc_playback( + let outcome = wait_for_nbc_playback( chrome.browser(), &tab, &url, @@ -627,31 +643,34 @@ fn run_nbc_capture_loop( AuthMode::Forbidden, None, )?; + let capture_tab = outcome.tab; let frame_interval = Duration::from_millis(1000 / nbc_capture_fps().max(1)); let quality = nbc_capture_quality(); let mut first_frame = true; loop { - kick_nbc_player(&tab).ok(); - let frame = tab + kick_nbc_player(&capture_tab).ok(); + let state = probe_nbc_video(&capture_tab).unwrap_or_default(); + if !nbc_video_state_has_decoded_frame(&state) { + return Err(anyhow!( + "NBC capture tab lost decoded video (title='{}', page_url='{}', current_time={}, ready_state={}, has_video={})", + state.title, + state.page_url, + state.current_time, + state.ready_state, + state.has_video, + )); + } + let video = capture_tab .find_element("video") - .and_then(|video| { - video.parent.capture_screenshot( - Page::CaptureScreenshotFormatOption::Jpeg, - Some(quality), - Some(video.get_box_model()?.content_viewport()), - true, - ) - 
}) - .or_else(|_| { - tab.capture_screenshot( - Page::CaptureScreenshotFormatOption::Jpeg, - Some(quality), - None, - true, - ) - })?; + .context("NBC capture tab has no video element after playback readiness")?; + let frame = video.parent.capture_screenshot( + Page::CaptureScreenshotFormatOption::Jpeg, + Some(quality), + Some(video.get_box_model()?.content_viewport()), + true, + )?; if first_frame { first_frame = false; @@ -785,6 +804,15 @@ fn nbc_url_is_provider_linked(url: &str) -> bool { (host.ends_with("nbc.com") || host.ends_with(".nbc.com")) && path.contains("provider-linked") } +fn nbc_url_is_mvpd_complete(url: &str) -> bool { + let Ok(url) = Url::parse(url) else { + return false; + }; + let host = url.host_str().unwrap_or_default().to_ascii_lowercase(); + let path = url.path().to_ascii_lowercase(); + (host.ends_with("nbc.com") || host.ends_with(".nbc.com")) && path.contains("mvpd-complete") +} + fn nbc_url_is_optional_profile_signin(url: &str) -> bool { let Ok(url) = Url::parse(url) else { return false; @@ -812,6 +840,7 @@ fn nbc_page_is_watch_surface(url: &str) -> bool { (host.ends_with("nbc.com") || host.ends_with(".nbc.com")) && !nbc_url_is_optional_profile_signin(url.as_str()) && !nbc_url_is_provider_linked(url.as_str()) + && !nbc_url_is_mvpd_complete(url.as_str()) } fn nbc_title_looks_like_verizon_popup(title: &str) -> bool { @@ -836,6 +865,14 @@ fn nbc_state_is_optional_profile_signin(state: &NbcVideoState) -> bool { || nbc_title_looks_like_optional_profile_signin(&state.title) } +fn nbc_clues_look_geo_blocked(clues: &NbcPageClues) -> bool { + let body_text = clues.body_text.to_ascii_lowercase(); + body_text.contains("not authorized to access this content from outside of the us") + || body_text.contains("not authorized to access this content from outside of the u.s.") + || body_text.contains("outside of the us and its territories") + || body_text.contains("outside of the u.s. 
and its territories") +} + fn browser_tabs(browser: &Browser) -> Vec> { browser.register_missing_tabs(); browser.get_tabs().lock().unwrap().iter().cloned().collect() @@ -877,20 +914,20 @@ fn find_primary_tab_state<'a>( .find(|candidate| candidate.tab.get_target_id() == target_id) } -fn find_playing_tab_state(tabs: &[BrowserTabState]) -> Option<&BrowserTabState> { - tabs.iter().find(|candidate| { - candidate.state.has_video - && candidate.state.width > 0 - && candidate.state.height > 0 - && !candidate.state.paused - && (candidate.state.current_time > 0.0 || candidate.state.ready_state >= 2) - }) +fn nbc_video_state_has_decoded_frame(state: &NbcVideoState) -> bool { + state.has_video + && state.width > 0 + && state.height > 0 + && !state.paused + && state.current_time > 0.0 + && state.ready_state >= 2 } fn find_provider_linked_tab_state(tabs: &[BrowserTabState]) -> Option<&BrowserTabState> { tabs.iter().find(|candidate| { nbc_title_looks_like_provider_linked(&candidate.state.title) || nbc_url_is_provider_linked(&candidate.state.page_url) + || nbc_url_is_mvpd_complete(&candidate.state.page_url) }) } @@ -1038,25 +1075,40 @@ fn advance_nbc_auth_flow(tab: &Arc) -> Result> }}; const actions = []; const url = window.location.href || ""; + let host = ""; + try {{ + host = new URL(url).hostname.toLowerCase(); + }} catch (_err) {{}} + const title = document.title || ""; + const titleText = `${{title}} ${{url}}`.toLowerCase(); + const looksLikeOptionalNbcProfile = + (host.endsWith("nbc.com") || host.endsWith(".nbc.com")) && + (url.includes("/sign-in") || + url.includes("/login") || + titleText.includes("nbc account sign in") || + titleText.includes("nbcuniversal profile") || + titleText.includes("nbc profile")); + if (looksLikeOptionalNbcProfile) {{ + return {{ pageUrl: url, title, actions }}; + }} const candidates = Array.from( document.querySelectorAll( "button,a,[role='button'],[role='option'],label,li,[data-provider-name],[data-provider-id],[data-provider]" ) ); + const 
providerCta = candidates.find((node) => {{ + const text = textOf(node); + return visible(node) && + ( + text === "link tv provider" || + text === "link provider" || + text.startsWith("link tv provider ") || + text.startsWith("link provider ") + ); + }}); + clickNode(providerCta, "click:link-provider"); if (url.includes("mvpd")) {{ - const providerCta = candidates.find((node) => {{ - const text = textOf(node); - return visible(node) && - ( - text === "link tv provider" || - text === "link provider" || - text.startsWith("link tv provider ") || - text.startsWith("link provider ") - ); - }}); - clickNode(providerCta, "click:link-provider"); - const fullListNode = candidates.find((node) => {{ const text = textOf(node); return visible(node) && (text === "full list" || text.startsWith("full list ")); @@ -1112,7 +1164,7 @@ fn advance_nbc_auth_flow(tab: &Arc) -> Result> return {{ pageUrl: url, - title: document.title || "", + title, actions, }}; }})()) @@ -1226,16 +1278,30 @@ fn advance_mvpd_login_flow(tab: &Arc) -> Result {{ + const dismissButton = profileButtons.find((node) => {{ const text = textOf(node); return visible(node) && ( - text === "link tv provider" || - text === "link provider" || - text.startsWith("link tv provider ") || - text.startsWith("link provider ") + text === "skip" || + text.startsWith("skip ") || + text === "skip for now" || + text === "maybe later" || + text === "not now" || + text === "no thanks" || + text === "close" || + text === "continue watching" || + text.startsWith("continue watching ") || + text === "continue without signing in" || + text === "continue without profile" || + text === "continue as guest" || + text === "watch live" || + text === "watch now" || + text.startsWith("watch live ") || + text.startsWith("watch now ") ); }}); - clickNode(providerLink, "click:profile-link-provider"); + if (dismissButton) {{ + clickNode(dismissButton, `click:profile-dismiss:${{textOf(dismissButton).slice(0, 120)}}`); + }} return {{ pageUrl: url, 
title, actions }}; }} if (!looksLikeProviderLogin) {{ @@ -1333,8 +1399,15 @@ fn advance_nbc_post_auth_flow(tab: &Arc) -> Result) -> Result, ) -> Result { let deadline = Instant::now() + nbc_capture_timeout(); + let auth_forbidden = matches!(&auth_mode, AuthMode::Forbidden); let mut interactive_deadline = None::; let mut interactive_auth_required = false; let mut screenshot_path = None::; @@ -1540,11 +1628,13 @@ fn wait_for_nbc_playback( let mut last_trace_state = None::; let mut last_log = Instant::now() - Duration::from_secs(10); let mut last_clue_log = Instant::now() - Duration::from_secs(30); + let mut playback_samples = HashMap::::new(); let mut resumed_after_background_login = false; let mut resumed_after_authenticated_surface = false; let mut optional_profile_signin_recoveries = 0_u8; let mut last_optional_profile_signin_retry = None::; let mut watch_surface_seen_at = None::; + let mut optional_profile_signin_seen_at = None::; let mut tracked_tabs = HashSet::new(); let mut provider_linked_completed = false; @@ -1554,13 +1644,33 @@ fn wait_for_nbc_playback( let primary_state = find_primary_tab_state(&tab_states, tab) .map(|value| value.state.clone()) .unwrap_or_else(|| probe_nbc_video(tab).unwrap_or_default()); - if let Some(playing_tab) = find_playing_tab_state(&tab_states) { - return Ok(WaitOutcome { - state: playing_tab.state.clone(), - trace: trace.lock().map(|state| state.clone()).unwrap_or_default(), - interactive_auth_required, - screenshot_path, - }); + let now = Instant::now(); + for playing_tab in tab_states + .iter() + .filter(|candidate| nbc_video_state_has_decoded_frame(&candidate.state)) + { + let target_id = playing_tab.tab.get_target_id().to_string(); + if let Some((previous_time, first_seen)) = playback_samples.get(&target_id) { + if playing_tab.state.current_time >= *previous_time + 0.25 + && first_seen.elapsed() >= Duration::from_millis(500) + { + return Ok(WaitOutcome { + tab: playing_tab.tab.clone(), + state: playing_tab.state.clone(), + 
trace: trace.lock().map(|state| state.clone()).unwrap_or_default(), + interactive_auth_required, + screenshot_path, + }); + } + } + playback_samples + .entry(target_id) + .and_modify(|(previous_time, _)| { + if playing_tab.state.current_time < *previous_time { + *previous_time = playing_tab.state.current_time; + } + }) + .or_insert((playing_tab.state.current_time, now)); } let interaction_tab = find_interaction_tab_state(&tab_states, tab) @@ -1571,6 +1681,7 @@ fn wait_for_nbc_playback( let pre_state = probe_nbc_video(&interaction_tab).unwrap_or_default(); if nbc_title_looks_like_provider_linked(&pre_state.title) || nbc_url_is_provider_linked(&pre_state.page_url) + || nbc_url_is_mvpd_complete(&pre_state.page_url) { provider_linked_completed = true; } @@ -1579,6 +1690,7 @@ fn wait_for_nbc_playback( if let Some(progress) = advance_nbc_post_auth_flow(&interaction_tab).ok().flatten() { if nbc_title_looks_like_provider_linked(&progress.title) || nbc_url_is_provider_linked(&progress.page_url) + || nbc_url_is_mvpd_complete(&progress.page_url) || progress .actions .iter() @@ -1614,6 +1726,7 @@ fn wait_for_nbc_playback( let state = probe_nbc_video(&interaction_tab).unwrap_or_default(); if nbc_title_looks_like_provider_linked(&state.title) || nbc_url_is_provider_linked(&state.page_url) + || nbc_url_is_mvpd_complete(&state.page_url) { provider_linked_completed = true; } @@ -1621,6 +1734,19 @@ fn wait_for_nbc_playback( let authorized = nbc_trace_is_authorized(&trace_state) || provider_linked_completed; let recent_media_activity = nbc_trace_has_recent_media_activity(&trace_state); + if !authorized && nbc_state_is_optional_profile_signin(&state) && !state.has_video { + let first_seen = *optional_profile_signin_seen_at.get_or_insert_with(Instant::now); + if auth_forbidden && first_seen.elapsed() >= nbc_profile_signin_gate_timeout() { + return Err(anyhow!( + "NBC account sign-in gate reached before TV-provider auth; refusing non-interactive retry loop without decoded video 
(title='{}', page_url='{}')", + state.title, + state.page_url, + )); + } + } else { + optional_profile_signin_seen_at = None; + } + if last_log.elapsed() >= Duration::from_secs(5) { last_log = Instant::now(); tracing::info!( @@ -1661,8 +1787,9 @@ fn wait_for_nbc_playback( } } - if (trace_state.background_login_complete - || nbc_url_is_background_login_complete(&state.page_url)) + let auth_completion_page = nbc_url_is_background_login_complete(&state.page_url) + || nbc_url_is_mvpd_complete(&state.page_url); + if (trace_state.background_login_complete || auth_completion_page) && !resumed_after_background_login { resumed_after_background_login = true; @@ -1673,41 +1800,49 @@ fn wait_for_nbc_playback( ); close_auxiliary_browser_tabs(browser, tab); let _ = tab.activate(); - let _ = tab.evaluate("window.location.reload()", true); + if nbc_url_is_mvpd_complete(&state.page_url) { + tab.navigate_to(source_url)?; + tab.wait_until_navigated()?; + } else { + let _ = tab.evaluate("window.location.reload()", true); + } std::thread::sleep(Duration::from_secs(2)); continue; } - if authorized - && nbc_state_is_optional_profile_signin(&state) - && !recent_media_activity - && optional_profile_signin_recoveries < 3 - && last_optional_profile_signin_retry - .map(|instant| instant.elapsed() >= Duration::from_secs(3)) - .unwrap_or(true) - { - optional_profile_signin_recoveries += 1; - last_optional_profile_signin_retry = Some(Instant::now()); - tracing::info!( - title = %state.title, - page_url = %state.page_url, - authorized, - source_url, - optional_profile_signin_recoveries, - "NBC profile sign-in surface detected after authorization; returning to the live source URL" - ); - close_auxiliary_browser_tabs(browser, tab); - let _ = tab.activate(); - tab.navigate_to(source_url)?; - tab.wait_until_navigated()?; - std::thread::sleep(Duration::from_secs(2)); - continue; + if authorized && nbc_state_is_optional_profile_signin(&state) && !state.has_video { + if 
optional_profile_signin_recoveries == 0 + && last_optional_profile_signin_retry + .map(|instant| instant.elapsed() >= Duration::from_secs(3)) + .unwrap_or(true) + { + optional_profile_signin_recoveries += 1; + last_optional_profile_signin_retry = Some(Instant::now()); + tracing::info!( + title = %state.title, + page_url = %state.page_url, + authorized, + source_url, + "NBC account sign-in gate detected after provider authorization; trying one live-url recovery" + ); + close_auxiliary_browser_tabs(browser, tab); + let _ = tab.activate(); + tab.navigate_to(source_url)?; + tab.wait_until_navigated()?; + std::thread::sleep(Duration::from_secs(2)); + continue; + } + return Err(anyhow!( + "NBC account sign-in gate reached after TV-provider auth; refusing retry loop without decoded video (title='{}', page_url='{}')", + state.title, + state.page_url, + )); } - if authorized && nbc_state_is_optional_profile_signin(&state) && recent_media_activity { + if authorized && nbc_state_is_optional_profile_signin(&state) && state.has_video { tracing::debug!( title = %state.title, page_url = %state.page_url, - "NBC optional profile sign-in is visible but media activity is already in flight; staying on the page" + "NBC optional profile sign-in is visible but a video element is already present; staying on the page" ); } @@ -1733,6 +1868,13 @@ fn wait_for_nbc_playback( body_text = %clues.body_text, "NBC watch surface clues" ); + if nbc_clues_look_geo_blocked(&clues) { + return Err(anyhow!( + "NBC geo-blocked current egress; page says this content is not authorized outside the US/territories (title='{}', page_url='{}')", + primary_state.title, + primary_state.page_url, + )); + } } } if fully_loaded_watch_surface && !primary_state.has_video { @@ -1862,6 +2004,9 @@ mod tests { assert!(nbc_url_is_provider_linked( "https://www.nbc.com/provider-linked" )); + assert!(nbc_url_is_mvpd_complete( + "https://www.nbc.com/mvpd-complete" + )); assert!(nbc_title_looks_like_provider_linked("TV Provider 
Linked")); assert!(!nbc_url_is_provider_linked( "https://www.nbc.com/live?brand=nbc-sports-philadelphia" @@ -1884,11 +2029,31 @@ mod tests { #[test] fn optional_profile_signin_is_not_treated_as_watch_surface() { assert!(!nbc_page_is_watch_surface("https://www.nbc.com/sign-in")); + assert!(!nbc_page_is_watch_surface( + "https://www.nbc.com/mvpd-complete" + )); assert!(nbc_page_is_watch_surface( "https://www.nbc.com/live?brand=nbc-sports-philadelphia" )); } + #[test] + fn geo_block_clues_fail_closed() { + let clues = NbcPageClues { + body_text: + "We're sorry. You are not authorized to access this content from outside of the US and its territories." + .to_string(), + ..NbcPageClues::default() + }; + assert!(nbc_clues_look_geo_blocked(&clues)); + + let allowed = NbcPageClues { + body_text: "NBC News NOW ON NOW until 7:00 AM".to_string(), + ..NbcPageClues::default() + }; + assert!(!nbc_clues_look_geo_blocked(&allowed)); + } + #[test] fn cssott_media_requests_mark_recent_media_activity() { let mut trace = NbcTraceState::default(); @@ -1899,4 +2064,25 @@ mod tests { assert!(trace.media_activity_seen); assert!(nbc_trace_has_recent_media_activity(&trace)); } + + #[test] + fn decoded_frame_detection_requires_advancing_video_surface() { + let mut state = NbcVideoState { + has_video: true, + width: 1920, + height: 1080, + paused: false, + ready_state: 2, + current_time: 1.0, + ..NbcVideoState::default() + }; + assert!(nbc_video_state_has_decoded_frame(&state)); + + state.current_time = 0.0; + assert!(!nbc_video_state_has_decoded_frame(&state)); + + state.current_time = 1.0; + state.width = 0; + assert!(!nbc_video_state_has_decoded_frame(&state)); + } } diff --git a/crates/ec-node/tests/determinism_cmaf_ladder.rs b/crates/ec-node/tests/determinism_cmaf_ladder.rs index 69c7669..ff706a6 100644 --- a/crates/ec-node/tests/determinism_cmaf_ladder.rs +++ b/crates/ec-node/tests/determinism_cmaf_ladder.rs @@ -1,3 +1,4 @@ +use std::collections::BTreeMap; use std::io::{BufRead, 
BufReader}; use std::path::Path; use std::process::{Command, Stdio}; @@ -46,6 +47,15 @@ fn blake3_hex(path: &Path) -> anyhow::Result { Ok(blake3::hash(&bytes).to_hex().to_string()) } +fn command_available(name: &str) -> bool { + Command::new(name) + .arg("-version") + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .status() + .is_ok() +} + fn concat_init_and_segment(init: &Path, seg: &Path, out: &Path) -> anyhow::Result<()> { let init_bytes = std::fs::read(init)?; let seg_bytes = std::fs::read(seg)?; @@ -157,11 +167,15 @@ fn write_deterministic_ts(out_path: &Path) -> anyhow::Result<()> { Ok(()) } -fn run_ladder(ec_node: &Path, input_ts: &Path, out_dir: &Path) -> anyhow::Result<()> { +fn run_ladder_with_identity( + ec_node: &Path, + input_ts: &Path, + out_dir: &Path, + stream_id: &str, + broadcast_name: &str, +) -> anyhow::Result<()> { let signing_key = "11".repeat(32); let network_secret = "22".repeat(32); - let stream_id = "every.channel/determinism/cmaf-ladder"; - let broadcast_name = "every.channel/determinism/cmaf-ladder"; let mut cmd = Command::new(ec_node); cmd.env("EVERY_CHANNEL_MANIFEST_SIGNING_KEY", &signing_key) @@ -210,6 +224,40 @@ fn run_ladder(ec_node: &Path, input_ts: &Path, out_dir: &Path) -> anyhow::Result Ok(()) } +fn run_ladder(ec_node: &Path, input_ts: &Path, out_dir: &Path) -> anyhow::Result<()> { + run_ladder_with_identity( + ec_node, + input_ts, + out_dir, + "every.channel/determinism/cmaf-ladder", + "every.channel/determinism/cmaf-ladder", + ) +} + +fn ladder_artifact_hashes(root: &Path) -> BTreeMap { + let mut hashes = BTreeMap::new(); + for variant in ["1080p", "720p", "480p"] { + let variant_dir = root.join("cmaf-ladder").join(variant); + // `moq-publish --max-chunks 3` publishes init plus segments 0..=2. + // ffmpeg can race ahead and leave an unpublished tail segment before it is killed. 
+ let init = variant_dir.join("init.mp4"); + assert!(init.exists(), "missing init for {variant}"); + hashes.insert(format!("{variant}/init.mp4"), blake3_hex(&init).unwrap()); + + for idx in 0..3 { + let name = format!("segment_{idx:06}.m4s"); + let path = variant_dir.join(&name); + assert!(path.exists(), "missing {name} for {variant}"); + hashes.insert(format!("{variant}/{name}"), blake3_hex(&path).unwrap()); + } + } + hashes +} + +fn assert_ladder_bytes_match(left: &Path, right: &Path) { + assert_eq!(ladder_artifact_hashes(left), ladder_artifact_hashes(right)); +} + #[test] #[ignore] fn deterministic_cmaf_ladder_outputs_match_across_runs() { @@ -235,36 +283,53 @@ fn deterministic_cmaf_ladder_outputs_match_across_runs() { run_ladder(&ec_node, &input_ts, &run1).expect("run ladder 1"); run_ladder(&ec_node, &input_ts, &run2).expect("run ladder 2"); - for variant in ["1080p", "720p", "480p"] { - let v1 = run1.join("cmaf-ladder").join(variant); - let v2 = run2.join("cmaf-ladder").join(variant); + assert_ladder_bytes_match(&run1, &run2); +} - let init1 = v1.join("init.mp4"); - let init2 = v2.join("init.mp4"); - assert!( - init1.exists() && init2.exists(), - "missing init for {variant}" - ); - assert_eq!( - blake3_hex(&init1).unwrap(), - blake3_hex(&init2).unwrap(), - "init differs for {variant}" - ); - - for idx in 0..3 { - let s1 = v1.join(format!("segment_{idx:06}.m4s")); - let s2 = v2.join(format!("segment_{idx:06}.m4s")); - assert!( - s1.exists() && s2.exists(), - "missing segment {idx} for {variant}" - ); - assert_eq!( - blake3_hex(&s1).unwrap(), - blake3_hex(&s2).unwrap(), - "segment {idx} differs for {variant}" - ); - } +#[test] +fn duplicate_publishers_same_input_produce_identical_cmaf_ladder_bytes() { + if !command_available("ffmpeg") { + eprintln!("skipping duplicate publisher CMAF ladder determinism test: ffmpeg unavailable"); + return; } + + let ec_node = ec_node_path(); + + let ts = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + 
.unwrap_or_default() + .as_millis(); + let tmp = std::env::temp_dir().join(format!("ec-duplicate-publisher-cmaf-ladder-{ts}")); + let _ = std::fs::create_dir_all(&tmp); + + let input_ts = tmp.join("input.ts"); + write_deterministic_ts(&input_ts).expect("write deterministic TS"); + + let publisher_a = tmp.join("publisher-a"); + let publisher_b = tmp.join("publisher-b"); + let _ = std::fs::remove_dir_all(&publisher_a); + let _ = std::fs::remove_dir_all(&publisher_b); + std::fs::create_dir_all(&publisher_a).unwrap(); + std::fs::create_dir_all(&publisher_b).unwrap(); + + run_ladder_with_identity( + &ec_node, + &input_ts, + &publisher_a, + "every.channel/determinism/duplicate/publisher-a/la-kcop", + "publisher-a-la-kcop", + ) + .expect("run duplicate publisher a"); + run_ladder_with_identity( + &ec_node, + &input_ts, + &publisher_b, + "every.channel/determinism/duplicate/publisher-b/la-kcop", + "publisher-b-la-kcop", + ) + .expect("run duplicate publisher b"); + + assert_ladder_bytes_match(&publisher_a, &publisher_b); } #[test] diff --git a/crates/ec-node/tests/e2e_remote_website_watch_existing.rs b/crates/ec-node/tests/e2e_remote_website_watch_existing.rs index 9c1a0ba..5f0cf97 100644 --- a/crates/ec-node/tests/e2e_remote_website_watch_existing.rs +++ b/crates/ec-node/tests/e2e_remote_website_watch_existing.rs @@ -1,4 +1,5 @@ use std::ffi::OsStr; +use std::process::{Command, Stdio}; use std::time::{Duration, Instant}; fn which(cmd: &str) -> Option { @@ -16,6 +17,24 @@ fn chrome_path() -> Option { .or_else(|| which("chromium")) } +fn ec_node_path() -> std::path::PathBuf { + if let Ok(value) = std::env::var("EC_NODE_BIN") { + return value.into(); + } + if let Ok(value) = std::env::var("CARGO_BIN_EXE_ec_node") { + return value.into(); + } + if let Ok(value) = std::env::var("CARGO_BIN_EXE_ec-node") { + return value.into(); + } + let exe = std::env::current_exe().expect("current_exe"); + let debug_dir = exe + .parent() + .and_then(|p| p.parent()) + .expect("expected 
target/debug/deps"); + debug_dir.join("ec-node") +} + fn wait_for_canvas_element(tab: &headless_chrome::Tab, timeout: Duration) -> anyhow::Result<()> { let deadline = Instant::now() + timeout; while Instant::now() < deadline { @@ -46,14 +65,41 @@ fn wait_for_moq_watch_element(tab: &headless_chrome::Tab, timeout: Duration) -> anyhow::bail!("timed out waiting for element"); } +fn wait_for_live_or_archive_player( + tab: &headless_chrome::Tab, + timeout: Duration, +) -> anyhow::Result<()> { + let deadline = Instant::now() + timeout; + while Instant::now() < deadline { + let js = r#"(function() { + return !!document.querySelector('moq-watch, video.archiveVideo'); +})();"#; + let v = tab.evaluate(js, false)?; + if v.value.and_then(|v| v.as_bool()).unwrap_or(false) { + return Ok(()); + } + std::thread::sleep(Duration::from_millis(200)); + } + anyhow::bail!("timed out waiting for live or archive player"); +} + fn debug_player_state(tab: &headless_chrome::Tab) -> anyhow::Result { let js = r#"(function() { let watch = document.querySelector('moq-watch'); let canvas = document.querySelector('moq-watch canvas'); + let video = document.querySelector('video.archiveVideo'); let placeholder = document.querySelector('.placeholder'); let placeholderText = placeholder ? (placeholder.innerText || '') : null; let status = document.querySelector('.source-status'); let statusText = status ? (status.innerText || '') : null; + let statusLine = document.querySelector('#statusLine'); + let statusLineText = statusLine ? (statusLine.innerText || '') : null; + let catalog = watch && watch.broadcast && watch.broadcast.catalog && watch.broadcast.catalog.peek + ? watch.broadcast.catalog.peek() + : null; + let established = watch && watch.connection && watch.connection.established && watch.connection.established.peek + ? 
watch.connection.established.peek() + : null; let sources = Array.from(document.querySelectorAll('button[data-testid="global-watch"]')).length; let hint = document.querySelector('#hint'); let hintText = hint ? (hint.innerText || '') : null; @@ -62,8 +108,27 @@ fn debug_player_state(tab: &headless_chrome::Tab) -> anyhow::Result { hasCanvas: !!canvas, canvasWidth: canvas ? canvas.width : null, canvasHeight: canvas ? canvas.height : null, + hasArchiveVideo: !!video, + videoCurrentTime: video ? video.currentTime : null, + videoDuration: video ? video.duration : null, + videoPaused: video ? video.paused : null, + videoReadyState: video ? video.readyState : null, + videoMuted: video ? video.muted : null, + videoVolume: video ? video.volume : null, + videoSrc: video ? (video.currentSrc || video.src || '') : null, muted: watch ? watch.muted : null, volume: watch ? watch.volume : null, + connectionStatus: watch?.connection?.status?.peek ? watch.connection.status.peek() : null, + connectionKind: established ? established.constructor?.name || null : null, + broadcastStatus: watch?.broadcast?.status?.peek ? watch.broadcast.status.peek() : null, + paused: watch?.backend?.paused?.peek ? watch.backend.paused.peek() : null, + audioMuted: watch?.backend?.audio?.muted?.peek ? watch.backend.audio.muted.peek() : null, + audioVolume: watch?.backend?.audio?.volume?.peek ? 
watch.backend.audio.volume.peek() : null, + catalogSeen: !!catalog, + catalogHasVideo: !!(catalog?.video?.renditions), + catalogHasAudio: !!(catalog?.audio?.renditions), + metrics: window.__ecPlaybackMetrics || null, + statusLineText, hintText, placeholderText, statusText, @@ -110,23 +175,120 @@ fn canvas_motion_sample(tab: &headless_chrome::Tab) -> anyhow::Result anyhow::Result<()> { +fn archive_video_motion_sample( + tab: &headless_chrome::Tab, +) -> anyhow::Result> { + let js = r#"(function() { + let video = document.querySelector('video.archiveVideo'); + if (!video) return null; + if (video.paused) video.play().catch(() => {}); + return JSON.stringify({ + wallTime: performance.now() / 1000, + currentTime: video.currentTime || 0, + readyState: video.readyState || 0, + paused: !!video.paused, + ended: !!video.ended, + muted: !!video.muted, + volume: video.volume || 0, + src: video.currentSrc || video.src || '' + }); +})();"#; + let v = tab.evaluate(js, false)?; + let Some(s) = v.value.and_then(|v| v.as_str().map(|s| s.to_string())) else { + return Ok(None); + }; + Ok(Some(serde_json::from_str(&s)?)) +} + +fn wait_for_canvas_or_archive_motion( + tab: &headless_chrome::Tab, + timeout: Duration, +) -> anyhow::Result { let deadline = Instant::now() + timeout; - let mut first: Option<(f64, u32)> = None; + let mut first_canvas: Option<(f64, u32)> = None; + let mut first_video_time: Option = None; while Instant::now() < deadline { if let Some(sample) = canvas_motion_sample(tab)? { - if let Some((first_time, first_hash)) = first { + if let Some((first_time, first_hash)) = first_canvas { if sample.0 > first_time + 0.5 && sample.1 != first_hash { - return Ok(()); + return Ok("moq-canvas".to_string()); } } else { - first = Some(sample); + first_canvas = Some(sample); } } + + if let Some(sample) = archive_video_motion_sample(tab)? 
{ + let current_time = sample + .get("currentTime") + .and_then(|v| v.as_f64()) + .unwrap_or_default(); + let ready_state = sample + .get("readyState") + .and_then(|v| v.as_u64()) + .unwrap_or_default(); + let ended = sample + .get("ended") + .and_then(|v| v.as_bool()) + .unwrap_or(false); + if ready_state >= 2 && !ended { + if let Some(first) = first_video_time { + if current_time > first + 0.5 { + return Ok("archive-video".to_string()); + } + } else { + first_video_time = Some(current_time); + } + } + } + std::thread::sleep(Duration::from_millis(500)); } let st = debug_player_state(tab).unwrap_or_default(); - anyhow::bail!("timed out waiting for changing canvas frames\nplayer_state={st}"); + anyhow::bail!("timed out waiting for live or archive motion\nplayer_state={st}"); +} + +fn wait_for_playback_probe_ok( + tab: &headless_chrome::Tab, + timeout: Duration, +) -> anyhow::Result { + let deadline = Instant::now() + timeout; + let mut last_metrics = String::new(); + while Instant::now() < deadline { + let js = r#"(function() { + const metrics = window.__ecPlaybackMetrics || null; + return metrics ? 
JSON.stringify(metrics) : ""; +})();"#; + let v = tab.evaluate(js, false)?; + last_metrics = v + .value + .and_then(|v| v.as_str().map(|s| s.to_string())) + .unwrap_or_default(); + if !last_metrics.is_empty() { + let metrics: serde_json::Value = serde_json::from_str(&last_metrics)?; + let ok = metrics.get("ok").and_then(|v| v.as_bool()).unwrap_or(false); + let samples = metrics + .get("samples") + .and_then(|v| v.as_u64()) + .unwrap_or_default(); + let changed = metrics + .get("changed_samples") + .and_then(|v| v.as_u64()) + .unwrap_or_default(); + let longest_static = metrics + .get("longest_same_hash_ms") + .and_then(|v| v.as_u64()) + .unwrap_or_default(); + if ok && samples >= 8 && changed >= 2 && longest_static < 5_000 { + return Ok(last_metrics); + } + } + std::thread::sleep(Duration::from_millis(250)); + } + let st = debug_player_state(tab).unwrap_or_default(); + anyhow::bail!( + "timed out waiting for playback probe ok\nplayer_state={st}\nmetrics={last_metrics}" + ); } fn wait_for_unmuted_player(tab: &headless_chrome::Tab, timeout: Duration) -> anyhow::Result<()> { @@ -134,7 +296,9 @@ fn wait_for_unmuted_player(tab: &headless_chrome::Tab, timeout: Duration) -> any while Instant::now() < deadline { let js = r#"(function() { let watch = document.querySelector('moq-watch'); - return !!watch && watch.muted === false && watch.volume > 0 && !watch.hasAttribute('muted'); + let video = document.querySelector('video.archiveVideo'); + return (!!watch && watch.muted === false && watch.volume > 0 && !watch.hasAttribute('muted')) || + (!!video && video.muted === false && video.volume > 0); })();"#; let v = tab.evaluate(js, false)?; if v.value.and_then(|v| v.as_bool()).unwrap_or(false) { @@ -146,13 +310,21 @@ fn wait_for_unmuted_player(tab: &headless_chrome::Tab, timeout: Duration) -> any anyhow::bail!("timed out waiting for unmuted player\nplayer_state={st}"); } -fn watch_url(site_url: &str, relay_url: &str, stream_id: &str) -> anyhow::Result { +fn watch_url( + site_url: 
&str, + relay_url: &str, + stream_id: &str, + verify: bool, +) -> anyhow::Result { let mut url = url::Url::parse(site_url)?; url.set_path("/watch"); url.query_pairs_mut() .clear() .append_pair("url", relay_url) .append_pair("name", stream_id); + if verify { + url.query_pairs_mut().append_pair("verify", "1"); + } Ok(url.to_string()) } @@ -190,23 +362,104 @@ fn e2e_remote_website_watch_existing_stream_id() -> anyhow::Result<()> { .unwrap(); let browser = headless_chrome::Browser::new(launch_options)?; let tab = browser.new_tab()?; - tab.navigate_to(&watch_url(&site_url, &relay_url, &stream_id)?)?; + tab.navigate_to(&watch_url(&site_url, &relay_url, &stream_id, false)?)?; tab.wait_until_navigated()?; - // Ensure the player is instantiated. - if let Err(err) = wait_for_moq_watch_element(&tab, Duration::from_secs(90)) { + // Ensure either the native MoQ player or the archive live-edge fallback is instantiated. + if let Err(err) = wait_for_live_or_archive_player(&tab, Duration::from_secs(90)) { let st = debug_player_state(&tab).unwrap_or_default(); anyhow::bail!("{err}\nplayer_state={st}"); } - if let Err(err) = wait_for_canvas_element(&tab, Duration::from_secs(90)) { - let st = debug_player_state(&tab).unwrap_or_default(); - anyhow::bail!("{err}\nplayer_state={st}"); - } - - tab.wait_for_element("moq-watch canvas")?.click()?; + tab.evaluate( + r#"(function() { + const canvas = document.querySelector('moq-watch canvas'); + if (canvas) canvas.click(); + const audioButton = document.querySelector('#audioBtn'); + if (audioButton && audioButton.getAttribute('aria-pressed') !== 'true') { + audioButton.click(); + } +})();"#, + false, + )?; wait_for_unmuted_player(&tab, Duration::from_secs(10))?; - wait_for_canvas_motion(&tab, Duration::from_secs(30))?; + let playback_path = wait_for_canvas_or_archive_motion(&tab, Duration::from_secs(60))?; + eprintln!("playback path: {playback_path}"); Ok(()) } + +#[test] +#[ignore] +fn e2e_remote_website_watch_synthetic_relay_stream() -> 
anyhow::Result<()> { + if which("ffmpeg").is_none() { + return Ok(()); // skip + } + let chrome = match chrome_path() { + Some(p) => p, + None => return Ok(()), // skip + }; + + let site_url = std::env::var("EVERY_CHANNEL_SITE_URL") + .unwrap_or_else(|_| "https://every.channel/".to_string()); + let relay_url = std::env::var("EVERY_CHANNEL_RELAY_URL") + .unwrap_or_else(|_| "https://relay.every.channel/anon".to_string()); + let tls_disable_verify = std::env::var("EVERY_CHANNEL_RELAY_TLS_DISABLE_VERIFY") + .map(|v| v != "0" && v.to_lowercase() != "false") + .unwrap_or(true); + + let ts = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_millis(); + let stream_id = format!("e2e-synthetic-{ts}"); + let ec_node = ec_node_path(); + + let mut publisher = Command::new(&ec_node); + publisher + .arg("wt-publish") + .arg("--url") + .arg(&relay_url) + .arg("--name") + .arg(&stream_id) + .arg("--realtime-input") + .arg("--input-format") + .arg("lavfi") + .arg("--input") + .arg("testsrc2=size=1280x720:rate=30") + .stdout(Stdio::null()) + .stderr(Stdio::inherit()); + if tls_disable_verify { + publisher.arg("--tls-disable-verify"); + } + let mut publisher = publisher.spawn()?; + + let test_result = (|| -> anyhow::Result<()> { + let launch_options = headless_chrome::LaunchOptionsBuilder::default() + .path(Some(chrome)) + .headless(true) + .args(vec![ + OsStr::new("--autoplay-policy=no-user-gesture-required"), + OsStr::new("--disable-application-cache"), + OsStr::new("--disable-service-worker"), + OsStr::new("--disk-cache-size=0"), + OsStr::new("--mute-audio"), + ]) + .build() + .unwrap(); + let browser = headless_chrome::Browser::new(launch_options)?; + let tab = browser.new_tab()?; + tab.navigate_to(&watch_url(&site_url, &relay_url, &stream_id, true)?)?; + tab.wait_until_navigated()?; + + wait_for_moq_watch_element(&tab, Duration::from_secs(90))?; + wait_for_canvas_element(&tab, Duration::from_secs(90))?; + let metrics = 
wait_for_playback_probe_ok(&tab, Duration::from_secs(60))?; + eprintln!("playback metrics: {metrics}"); + Ok(()) + })(); + + let _ = publisher.kill(); + let _ = publisher.wait(); + test_result +} diff --git a/evolution/proposals/ECP-0156-duplicate-publisher-deterministic-data-layer.md b/evolution/proposals/ECP-0156-duplicate-publisher-deterministic-data-layer.md new file mode 100644 index 0000000..31dc231 --- /dev/null +++ b/evolution/proposals/ECP-0156-duplicate-publisher-deterministic-data-layer.md @@ -0,0 +1,334 @@ +# ECP-0156: Duplicate Publisher Deterministic Data Layer + +Status: Draft + +## Context + +Two publisher nodes may broadcast the same logical channel at the same time. The archive and relay +layers need this for resilience, but duplicate publishers currently risk looking like conflicting +streams instead of convergent copies of the same media. + +## Decision + +Duplicate publishers are valid for a published channel. The data layer dedupes and verifies media by +content identity, not by publisher envelope identity: + +- CMAF init and media segment bytes for the same input, ladder profile, and chunk cadence must be + byte-for-byte identical. +- BLAKE3 media hashes and per-rung Merkle roots are the shared data identity. +- Publisher manifests may carry different `stream_id`, `epoch_id`, `created_unix_ms`, signatures, + locators, and manifest ids. +- The archive must treat matching media hashes from different publishers as corroborating sources. +- Archive records must carry source identity. Two copied buffers with the same `source_node` are not + duplicate-publisher proof, even when their BLAKE3 hashes match. +- Divergent hashes for the same logical channel, rendition, and media time are misses that must be + measured before the data is promoted as redundant. + +## Verification + +The proof path has two stages: + +1. 
Single-node duplicate-publisher tests produce the same ladder twice with different publisher + identities and assert byte-for-byte BLAKE3 equality for every generated init and media segment. + The `duplicate_publishers_same_input_produce_identical_cmaf_ladder_bytes` test is part of the + default Rust test path when ffmpeg is present; it is not an ignored E2E. +2. Production verification runs the same channel on two real publishers long enough to measure + duplicate media convergence, hash divergence, missing objects, and backfill behavior in Grafana. + +The goal is not just "two publishers are online." Success requires elapsed production time behind the +numbers and dashboards that show duplicate hits, misses, and archive repair. + +## Consequences + +Manifest ids cannot be used as the archive dedupe key for duplicate publishers. Operators get a +clear signal when two publishers produce identical bytes versus merely announcing the same channel. +If encoder determinism changes, the single-node test fails before production redundancy silently +degrades. + +## Alternatives considered + +- Dedupe by manifest id. This preserves envelope identity but misses the resilience property because + duplicate publishers necessarily produce different envelopes. +- Dedupe by logical channel and time only. This can hide encoder divergence and promote bad + redundancy before byte-level media equality is proven. +- Disable duplicate publishers until the scheduler is perfect. This avoids conflict handling but + weakens live resilience and leaves the archive data layer untested. + +## Rollout/teardown + +Roll forward by landing the local deterministic test, adding miss/duplicate metrics to the archive +scrape surface, then running two publishers for one logical channel in production. Roll back by +disabling duplicate scheduling for that channel; existing content-addressed archive objects remain +valid. 
+ +## Implementation notes + +The node-agent archive scrape now exposes duplicate-source and miss gauges without placing hashes in +labels. Per node, role, broadcast, rendition, and track it reports duplicate matching hash sources, +duplicate hash sequences, divergent hash sequences, and missing hash records. Grafana shows those +next to archive ladder coverage so the production duplicate-publisher run has an operator-visible +convergence and miss signal. + +`ec-node archive-convergence` is the primary proof surface for duplicate media identity. It compares +named archive manifest roots directly inside the Rust node binary, groups records by logical stream, +rendition, track, and sequence, and only returns `ok` when every expected sequence has matching +duplicate source hashes with no missing or divergent sequence. It also requires archive records to +carry at least two distinct `source_node` values, so mirrored global-origin manifests cannot pass as +independent publishers. This keeps the media-data invariant in the already-shipped Rust artifact +instead of extending the Python node-agent. Rollout gates should use +`ec-node archive-convergence --require-ok`; the command emits the JSON report either way, but +`--require-ok` exits non-zero unless duplicate convergence is actually proven. +`ec-node archive-convergence --prometheus` renders the same Rust convergence report as scrapeable +`every_channel_archive_*` gauges for duplicate source records, duplicate sequences, divergent +sequences, source-local divergence, missing hashes, missing source identity, media timing conflicts, +record source count, and pass/fail state. This gives Grafana a Rust-owned proof metric path while +the older node-agent ladder metrics remain available during migration. 
+`ec-node archive-convergence-serve` keeps that proof path live for Prometheus: it serves `/health` +and `/metrics`, recomputes convergence on each scrape, and emits `scrape_ok=0` metrics instead of +disappearing when manifests are missing or not ready. Production Grafana can therefore distinguish a +healthy metrics target from an unproven duplicate-publisher run. +The Nix `services.every-channel.ec-node.archive.convergence.proofs` option turns those Rust proof +servers into named systemd units. Each proof must name at least two `NAME=PATH` sources and gets a +dedicated listen address, so operators can add one Prometheus scrape target per duplicate channel +without resurrecting the Python node-agent as the proof oracle. +Forge enables an initial `la-kcop-publisher-origin` proof target on `127.0.0.1:7812` and Prometheus +scrapes it alongside the other local every.channel targets. Until two real publisher manifest roots +are mounted or fetched into Forge, the target intentionally uses the Forge manifest root as a +placeholder peer and must report unproven convergence rather than green duplicate-publisher proof. +Forge also exposes a static two-NUC `la-kcet-remote-publisher-origin` proof target once that channel +is the live converged duplicate sample. Dynamic Headscale file-SD remains useful for discovery, but +it can include relays and stale nodes; duplicate-publisher proof should use an explicit publisher +pair or future scheduler group labels so unrelated agents do not turn a passing channel red. +This static proof exports its own Rust convergence gauges rather than gating on broad legacy +Prometheus aggregates, because older node-agent archive metrics do not yet carry enough proof-role +labels to avoid summing stale divergence from unrelated scrape targets. + +`ec-node archive-convergence-measure` is the primary production proof harness. 
It fetches named +node-agent `/v1/archive-manifest` samples or direct manifest JSONL URLs, writes bounded temporary +manifest roots, reuses the Rust `archive-convergence` report, and optionally queries Prometheus for +the Grafana-facing duplicate/miss series. A production run only counts as complete when the report +has elapsed samples, matching duplicate media hashes, zero divergent hash sequences, and live +Prometheus series for the duplicate/miss gauges. The measurement groups records by archive record +source identity, not by the URL used to fetch a manifest, and reports source identity failures when +the sample is too weak to prove independent publisher data. The older +`scripts/measure-duplicate-publishers.py` stays compatibility-only until live operators and Forge +jobs are switched to the Rust command. +The convergence report carries bounded divergent-sequence samples with per-source hash, byte size, +receive time, source node/session, CAS path, and media timing when present, so a red proof is +immediately actionable without fetching full manifests by hand. +It also reports a non-blocking media-timing-missing count and Prometheus gauge; hash equality can +still prove duplicate bytes, but missing timing means a divergent proof cannot yet classify whether +the mismatch is a phase/windowing problem or an encoder byte problem. +Publisher service builders must pass proof cadence explicitly. Both the node-agent publisher +supervisor and Nix systemd publisher module set `--publisher-archive-segment-duration-ms` and +`--publisher-start-boundary-ms` by default, so netbooted NUCs do not depend on stale hotpatch CLI +defaults when aligning duplicate publisher proof windows. + +`ec-node archive-convergence-measure-serve` turns that production proof harness into a live +Prometheus target. 
Each `/metrics` scrape fetches one fresh sample from node-agent or direct JSONL +manifest URLs, keeps a bounded in-memory sample window, and only reports measurement `ok` after the +configured elapsed window has passed. This avoids blocking Prometheus scrapes for the measurement +duration while still preventing two immediate samples from looking like a real production run. +The service emits measurement-level gauges for fetch success, source record counts, invalid records, +elapsed seconds, Prometheus series presence, reasons, and then appends the same +`every_channel_archive_*` convergence gauges from the latest sample. The service can also read +Prometheus file-SD JSON from Forge's Headscale node-agent discovery and turn each discovered target +into a sampled node-agent manifest source. The Nix +`services.every-channel.ec-node.archive.convergence.remoteProofs` option creates these remote proof +services as systemd units from either static `NAME=URL` endpoints or dynamic file-SD inputs. Forge +now exposes `la-kcop-remote-publisher-origin` on `127.0.0.1:7813` using the live +`/var/lib/prometheus/every-channel-node-agents.json` inventory. It must stay red until that +inventory contains at least two independent publisher node-agents whose `publisher.m4s` records +converge. + +When archive-serve ports are not reachable from the proof runner, the node-agent exposes a bounded, +tailnet-authenticated `/v1/archive-manifest` sample endpoint. The harness can use that endpoint for +each named publisher, compare local manifest records directly, and still require at least two elapsed +samples before declaring success. + +Production duplicate proof also requires archive-buffer freshness on each participating publisher. +During mixed-generation rollouts, the current node-agent may supervise an older installed +`archive-hot-sync` helper. 
The agent must probe helper flag support and omit optional arguments such +as `--link-mode` when an older helper lacks them, because a silently failing archive-buffer sync can +leave one publisher with healthy live streams but stale manifests. + +The publisher buffer refresh is freshness-first: the node-managed sync must mirror full manifests +without origin object fetch before running the slower cache fill/prune pass. This lets convergence +checks, Grafana scrape surfaces, and demand fetch see current BLAKE3 indexes even when proactive CAS +object backfill is still catching up. + +`wt-archive` stamps each archive index record with `source_node` and `source_session`. The Nix +archive launcher passes the runtime hostname as `--source-node`; explicit CLI users can override it. +Older records without this identity continue to parse, but proof commands and production measurement +mark them incomplete instead of accepting them as independent publisher evidence. + +Publisher-origin proof must be captured before relay/archive mirroring can collapse source identity. +When node-agent archive buffering is enabled, supervised `wt-publish` processes pass +`--publisher-archive-output-dir`, `--publisher-archive-manifest-dir`, and +`--publisher-archive-source-node`. `wt-publish` now supervises the Rust +`publisher-proof-archive-source` worker for that archive track. The worker splits the MPEG-TS source +by source-clock windows, fresh-encodes each bounded window with the deterministic proof profile, +stores the resulting media fragments under `publisher.m4s` in the same CAS/index format, and stamps +them with node-agent source identity. The relay playback encoder remains continuous for watchability, +but it is no longer the BLAKE3 data identity for duplicate-publisher proof. 
The source identity is +explicit override first, then hostname plus a short hash of machine-id, with boot-id only as a +fallback; hostname alone is not enough because publisher images can share names like `ec-node`. +Production duplicate verification can therefore compare `publisher.m4s` from two publisher buffers +without treating copied relay-origin manifests as independent sources. + +Proof tooling defaults to `publisher.m4s`. The relay video track `0.m4s` is useful playback data, +but it is not duplicate-publisher proof: a publisher buffer may hold relay/cache records on `0.m4s` +that have no publisher source identity. Production convergence checks that sample `0.m4s` should be +treated as playback/archive-cache diagnostics, not byte-for-byte duplicate publisher evidence. + +The first live publisher-origin measurements on 2026-06-08 showed correct distinct source labels but +zero matching duplicate sequences for `la-nbc4`, `la-pbs-socal`, and `la-kcet`. The failure is +useful: independent `wt-publish` processes currently start their fragment sequence and encoder chunk +phase at local process start, so sequence `0` from two publishers is not necessarily the same +broadcast moment. Duplicate-publisher proof therefore requires a shared chunk clock or +scheduler-controlled aligned encoder phase before byte-for-byte archive convergence can pass in +production. + +Publisher-origin `publisher.m4s` records now require timed fMP4 fragments for global proof and map +those fragments onto observed wall-clock epoch buckets instead of local process counters. The Rust +writer learns track timescales from the init `moov` box, reads fragment +`moof/traf/tfhd+tfdt` decode timestamps to reject untimed proof when possible, then assigns +`group_sequence = observed_epoch_bucket * bucket_stride + fragment_slot`. 
Fragments that lack usable +timing still fall back to the previous local counter so publishing does not fail hard on malformed +metadata, but duplicate-publisher proof should use timed fragments. The `wt-publish` ffmpeg path +also preserves source timestamps and uses closed-GOP, single-threaded x264 settings with forced +keyframe cadence so independent publishers have a real chance of producing identical bytes for the +same media time window. + +A later live run on 2026-06-08 found a stricter local invariant before cross-publisher byte equality: +each publisher must produce at most one hash for a given `source_node` and `group_sequence`. +Production `publisher.m4s` samples for `la-kcop` and `la-ktla` showed multiple hashes from the same +source in the same sequence bucket because real fMP4 fragments can arrive faster than the configured +proof segment duration, and the writer rounded decode time into repeated buckets. The writer now +uses a fixed per-epoch bucket stride and increments an in-bucket fragment slot when multiple timed +fragments arrive inside the same proof duration. This keeps source-local manifests unique while +allowing independently restarted publishers to align on the same observed wall-clock bucket. +`ec-node archive-convergence` reports this separately as `source_local_divergent_sequences` so +operator tooling can distinguish a self-contradicting publisher from two publishers that simply +disagree about the same sequence. +Because bucket-strided proof sequences intentionally leave numeric gaps, archive convergence uses +the observed sparse sequence union for publisher-origin manifests. Dense contiguous sequence ranges +remain available in the simulation layer when a model explicitly expects every integer sequence. 
+ +The 2026-06-08 live `la-kcet/publisher.m4s` sample from Forge confirmed that both publishers now +emit distinct source identities (`ec-node-c3546fa5abc3` and `ec-node-72cf1c3aa196`) with no missing +source identity records on the sampled publisher-origin manifests. It also confirmed the remaining +bug: 156 shared publisher-origin sequences had zero byte-for-byte BLAKE3 matches and 156 divergent +hashes. The next production fix must align the publisher chunk clock and encoded fMP4 byte stream, +not merely improve scrape or Grafana plumbing. + +After the wall-clock bucket hotpatch, the same live proof no longer has fake sparse-range missing +IDs: `la-kcet/publisher.m4s` reported 376 observed proof sequences, zero missing source identities, +zero source-local divergent sequences, and 234 divergent shared sequences. A byte-level sample for +sequence `7287381184512` had different sizes, different BLAKE3 hashes, different `tfdt` +base-media-decode-times (`210210` versus `0`), and different `mdat` payload prefixes. Across that +sampled window there were zero common fragment hashes even when sequence IDs were ignored, proving +that the remaining failure was independent-encoder media phase and fMP4 payload determinism, not an +archive manifest identity bug. + +A later `la-kcop/publisher.m4s` sample exposed a stricter live-source bug: source-window proof +records were using unsynced MPEG-TS PCR chunk indexes as `group_sequence` when the OTA UTC clock was +unavailable, causing restart-dependent jumps such as 93M, 135M, 341M, and 390M. The source-proof +writer now uses the chunk UTC start only when the chopper reports synced timing, otherwise it falls +back to the local wall-clock window start, and rewrites fMP4 `tfdt` onto that shared window before +hashing. The live HTTP proof worker also retries transient source opens/reader failures in unbounded +live mode, so a tuner `503` or malformed TS burst is skipped/retried instead of killing the +publisher proof process. 
+ +The synced source-window clock must use the chopper's exact global chunk index, not integer UTC +seconds. A 1001 ms proof cadence makes whole-second UTC start metadata lossy: adjacent source +windows can share the same `utc_start_unix`, which caused one publisher to write several different +hashes under the same source-local `group_sequence`. Synced chunks therefore use +`ChunkTiming.chunk_index` directly; only unsynced chunks fall back to local wall-clock receipt. +The live source-window proof writer also keeps subfragment slot allocation as stream state instead +of per-chunk state. Real source windows can be emitted in more than one proof chunk for the same +media timing sequence; resetting the slot counter for every chunk reused the same +`group_sequence` and made one healthy publisher look self-divergent. The counter is bounded so the +long-running live worker does not grow state unbounded. + +`wt-publish` now has an explicit Unix-epoch start boundary, defaulting to the publisher-origin proof +cadence. After relay setup and immediately before spawning ffmpeg it waits until the next boundary, +so a newly restarted duplicate publisher starts its forced-keyframe clock on the same global cadence +as already-running publishers. +This does not by itself prove byte equality; it removes the local-process-start phase error from the +live publisher path and gives rollout measurement a deterministic knob (`--publisher-start-boundary-ms +0` disables it). The live ffmpeg argument plan is factored into a Rust unit-testable helper so +future timestamp/keyframe changes are pinned in `ec-node` instead of being inferred from node-agent +process strings or production samples. + +The first post-start-clock live sample still failed duplicate byte identity: both publishers landed +in the same wall-clock proof bucket, but one fragment carried `tfdt=390390` while the other carried +`tfdt=30030`, matching the staggered restart gap. 
Their `mdat` prefixes differed too, which means a +continuous x264 encoder keeps enough local history that a later restart cannot prove byte equality +merely by joining the same wall-clock cadence. The live profile therefore enables x264 +`stitchable=1` alongside closed GOP, no scenecut, no B-frames, no lookahead, and one thread. If that +still does not converge in production, the next fix is a deliberately stateless per-fragment encode +or a Rust-owned media clock/segmenter that resets encoder history at each proof boundary. + +The follow-up production hotpatch moved the start-boundary wait to immediately before ffmpeg spawn, +enabled `stitchable=1`, and restarted both publisher nodes in the same batch. The latest `la-kcet` +sample still reported zero matching duplicate hashes with no missing source identity and no +source-local divergence. A final sampled shared sequence differed by hundreds of milliseconds of +receive time and by media size (`439737` versus `270283` bytes for the video fragment), so the +remaining mismatch is not just MP4 timestamp metadata. Production duplicate proof now needs a +stateless fragment boundary: either encode each proof segment from the same bounded source window +with fresh encoder state, or make the Rust media pipeline own exact frame-window capture before +calling ffmpeg/x264. + +Archive manifests now carry optional fMP4 media timing for publisher-origin fragments. The +`archive-convergence` gate treats equal archive group sequence IDs with different media sequence or +decode-time metadata as `media_sequence_conflict`, even if the byte hash happens to match. This keeps +production proof aligned with the Rust simulation model: a duplicate publisher only proves the same +broadcast moment when the archive sequence and media window agree. + +The first stateless proof primitives are now in `ec-node`. 
`publisher-proof-segment` takes one +bounded MPEG-TS source-clock window, runs a fresh deterministic x264/AAC fMP4 encode, splits the +result into init bytes and media fragments, and emits BLAKE3 hashes for each. `publisher-proof-windows` +uses the Rust MPEG-TS source-clock splitter first, then fresh-encodes each bounded window and reports +per-window source TS, init, and media hashes. Proof windows carry explicit MPEG-TS decoder context +with `--preroll-packets`, defaulting to the repo-owned `WT_PUBLISH_PROOF_PREROLL_PACKETS` budget, so +mid-GOP windows do not silently depend on best-effort decoder recovery. Focused Rust tests +fresh-encode the same bounded input and the same finite source-window campaign twice and assert +byte-for-byte identical proof hashes. + +`publisher-proof-duplicates` is the single-node duplicate-publisher gate for the stateless path. It +runs `publisher-proof-windows` independently under at least two publisher identity labels, defaults +to `publisher-a` and `publisher-b`, and compares source TS, init, and media fragment BLAKE3 hashes +for every source-clock window. `--require-ok` exits non-zero unless every compared window matches, +and duplicate publisher labels are rejected so the proof cannot accidentally collapse to one source +identity. `publisher-proof-compare` is the cross-machine stateless proof gate: each publisher can run +`publisher-proof-windows` against the same bounded source TS file locally, copy the JSON report back +to the operator host, and compare the reports by named publisher. It rejects mismatched chunk cadence, +missing windows, source TS hash mismatches, init hash mismatches, media fragment hash mismatches, and +empty media windows. + +`publisher-proof-remote-compare` is the production operator harness for that cross-machine gate. 
It +copies one bounded `.ts` proof input to each named SSH target, runs `ec-node publisher-proof-windows` +on the target, stores each returned JSON report under the local output directory, writes a +`compare.json`, and returns the existing compare report with upload/proof timing. Remote labels use +the same single-component validation as publisher identities, remote proof roots are constrained to +`/tmp/every-channel-*`, and cleanup is opt-in so the generated proof files remain inspectable unless +the operator explicitly requests removal. This keeps the live proof path in Rust without making the +Python node-agent a new oracle. It proves the machine/runtime/compiler boundary without requiring +the two NUCs to share a live tuner at the exact same instant. + +`publisher-proof-archive-source` is the live archive implementation of the same proof model. It can +read local source files directly, read plain HTTP MPEG-TS bodies directly for HDHomeRun-style +sources, or fall back to an ffmpeg MPEG-TS copy reader for other inputs. Each emitted source-clock +window is encoded with fresh proof state, archived as CAS-backed `publisher.m4s` records, and mapped +to source-clock group sequences with explicit media timing metadata. A focused Rust regression now +archives the same bounded TS input as two source nodes, then runs `archive-convergence` against the +two manifest roots and requires full duplicate convergence with zero divergent or source-local +divergent sequences. + +Forge `ci-gates` now runs the `publisher_proof` and `archive_convergence` Rust filters before the +distributed simulator campaign, so single-node byte-for-byte determinism, source-window archive +proof semantics, and duplicate archive convergence are checked before production rollout evidence is +considered. 
The next production step is to deploy the updated node binary and let fresh +`publisher.m4s` source-window records age into the Grafana scrape window so live duplicate metrics +can replace the older continuous-encoder divergence. diff --git a/evolution/proposals/ECP-0157-rust-simulation-testing.md b/evolution/proposals/ECP-0157-rust-simulation-testing.md new file mode 100644 index 0000000..fc7d3ed --- /dev/null +++ b/evolution/proposals/ECP-0157-rust-simulation-testing.md @@ -0,0 +1,158 @@ +# ECP-0157: Rust Simulation Testing + +Status: Draft + +## Context + +Production is now fast enough to expose distributed bugs quickly, but it is still the wrong first +place to discover scheduler, archive, and duplicate-publisher invariants. The Python node-agent also +made this worse by putting core control behavior outside the already-built Rust node binary. + +## Decision + +Add a small deterministic simulation layer in `ec-core` and use it for distributed media invariants: + +- `ec-node` remains the runtime owner for node behavior. +- Tests model logical time, delayed delivery, backfill, duplicate publishers, and archive + convergence in Rust. +- Simulation scenarios are seed-replayable and include deterministic jitter, transient drops, + partition windows, publisher outage/restart windows, backfill retries, and encoder drift faults. +- A failing simulation must print or carry a replay hint so the exact schedule can be rerun. +- Simulation reports include deterministic execution history so a failure has an ordered event trace, + not only a final assertion. +- Simulation campaigns run many seed schedules in one fast test and preserve the first failing seed, + invariant report, and final state as the failure artifact. +- Campaign execution has a reusable seeded runner so new models can share replay/failure accounting + instead of copying bespoke loops. +- First failures are automatically shrunk where the model supports it. 
For duplicate publishers the + shrinker removes irrelevant partitions, publisher outages, timing jitter, transient drops, and + excess media sequence range while keeping the original invariant unchanged. +- Invariants are explicit checks, not implicit test prose: duplicate source count, missing + sequences, divergent hashes, missing media timing, conflicting media timing, complete duplicate + coverage, and convergence-deadline budgets. +- Media identity is checked by BLAKE3 hashes for stream, rendition, track, sequence, profile, and + source-material identity. +- Media timing is part of the proof model. Matching hashes are not considered a complete duplicate + proof unless both publishers also expose a shared logical media clock for the chunk. +- Source-material identity is separate from stream metadata. Two publishers can advertise the same + channel, sequence, timing, and encoder profile while still encoding different RF/source windows; + that must fail in simulation before production archive comparisons burn wall-clock time. +- Publisher-origin archive `group_sequence` is derived from parsed media-time identity plus stable + track id, not local receive time. Receive time is telemetry; it is not proof that two publishers + archived the same broadcast moment. +- Live publisher archive proof normalizes fMP4 `tfdt` to the Unix media slot before hashing a + fragment. The first fragment for each track anchors the process-local media clock to wall-clock + time; later fragments preserve ffmpeg's media cadence from that origin. ffmpeg still runs with + wall-clock timestamp input enabled where possible, but the Rust archive writer is the authority + for the proof clock when source MPEG-TS timestamps are process-relative. 
+- Archive `group_sequence` includes a stable subfragment slot inside each `(track_id, + media_sequence)` pair, because audio can legitimately emit multiple fragments within one media + slot and those must compare in order instead of colliding as source-local divergences. +- Duplicate-publisher scenarios model publisher content phase separately from advertised archive + sequence. A publisher that starts its local encoder at a different content phase must fail fast in + simulation, because production fragments with the same local sequence are not proof of the same + broadcast moment unless the chunk clock is shared. +- `ec-node sim-duplicate-publishers` runs the same campaign model from the compiled Rust binary and + emits JSON suitable for CI artifacts and rollout gates. +- `ec-node sim-duplicate-publishers --failure-artifact PATH` writes the first failing campaign as + a replayable JSON artifact with the shrunk scenario, invariant report, event trace, shrink steps, + and a command hint for replaying `replay_scenario` through `--scenario-json -`. +- `ec-node sim-duplicate-publishers --scenario-json PATH` replays an exact serialized + `DuplicatePublisherScenario`, so a shrunk failure from CI or production investigation can be rerun + without reconstructing command-line flags. +- `ec-node sim-duplicate-publishers` can inject timing faults directly with + `--missing-media-timing-publisher NODE` and `--publisher-media-time-offset NODE:OFFSET_MS`, so + the current production proof class can be reproduced without hand-writing scenario JSON. +- `ec-node sim-duplicate-publishers` and `ec-node sim-system` can inject source-window faults with + `--publisher-source-material NODE:MATERIAL_ID`. Any campaign with multiple source-material ids + reports source-material mismatch observations instead of leaving operators to infer that class + from divergent hashes. 
+- `ec-node archive-convergence` reads existing archive manifest JSONL and applies the same + convergence semantics to real duplicate publisher outputs. +- Control-plane simulation models logical nodes, seeded gossip fanout, delivery jitter, transient + drops, node-specific partitions, node outages, duplicate deliveries, and propagation deadlines. +- `ec-node sim-control-plane` runs the control-plane model from the compiled Rust binary and emits + replayable JSON with the first failing seed, scenario, invariant report, and ordered trace. +- Control-plane campaign reports track max propagation time, max delivery time, dropped messages, + partition-delayed messages, outage-delayed messages, and duplicate messages, so prod rollout + measurements have a fast simulation baseline. +- System simulation composes control-plane propagation with duplicate-publisher media production. + Control gossip produces per-publisher activation times; the media workload then proves that delayed + schedule propagation still converges when publishers use the global media sequence clock and fails + when they derive chunk identity from local activation time. +- `ec-node sim-system` runs that composed workload from the deployed node binary. Its default + campaign models the current publisher topology class and can switch `--sequence-clock` between + `global` and `local-activation` to reproduce the exact class of duplicate-publisher phase bug + before waiting for production samples. +- `ec-node sim-system --fault-profile foundationdb` uses a FoundationDB-style fault profile: each + seed generates a different but replayable cluster schedule with randomized control partitions, node + outages, transient gossip drops, duplicate messages, media partitions, publisher outages, and + archive backfill pressure. 
+- The FoundationDB-style profile must also have an explicit negative regression for + `local-activation` sequence clocks, so the model proves the current production failure class is + caught in Rust before any rollout waits for live fragments. +- `ec-node sim-system --failure-artifact PATH` writes the first failing composed system schedule + as replayable JSON, including the exact control/media scenario, invariant report, ordered trace, + and command hint for rerunning `--scenario-json -`. +- System campaign reports must include fault coverage counters, not just pass/fail. A fast campaign + is only useful if it proves that the simulated run actually exercised the failure modes operators + care about. +- System campaign reports also aggregate publisher phase-offset observations. A production-like + divergence caused by local activation clocks should identify itself as a phase bug in the campaign + JSON instead of requiring operators to infer that only from divergent hashes. +- System campaign reports also aggregate source-material mismatch observations. A production-like + divergence caused by independent tuner/source windows should identify itself as a source-material + bug in the campaign JSON instead of being confused with codec nondeterminism. +- System and duplicate-publisher reports aggregate missing media-timing records and media-timing + conflicts, so the live failure class where fragments arrive without a usable media clock is visible + in fast Rust simulation output. +- FoundationDB-profile `sim-system` campaigns require that coverage by default: control transient + drops, partition delays, node outage delays, duplicate messages, media transient drops, media + partition delays, publisher outages, backfill, and observed convergence timing must all appear in + the campaign report. A campaign that passes invariants but misses these classes is reported as a + weak simulation, not a green rollout gate. 
+- FoundationDB-profile coverage is breadth-gated, not only boolean-gated. By default at least + `max(2, iterations / 32)` seeds must exercise every required distributed fault class; operators + can raise that floor with `--min-fault-seed-coverage` for longer scientific campaigns. +- Campaign reports track both event totals and seed counts per fault class, plus a bounded list of + the slowest system schedules with replay hints. This makes green runs inspectable: operators can + see how broadly the randomized schedule space was exercised and which seeds define the current + latency tail. +- System campaign reports also aggregate deterministic simulated convergence time and trace event + counts. `ec-node sim-system` stamps wall-clock execution telemetry around the campaign so a run + reports iterations per second, simulated system seconds per wall second, and trace events per + second without putting wall-clock data into the replayed scenario itself. +- `sim-system --failure-artifact PATH` writes an artifact for weak coverage as well as invariant + failures, so CI can preserve evidence when a campaign was too small or too narrow to exercise the + required distributed faults. +- Forge `ci-gates` runs the Rust system simulator tests and a 1024-seed + `sim-system --fault-profile foundationdb` campaign from the compiled `ec-node` binary before web + build/deploy gates. This keeps the fast randomized check ahead of production rollout evidence. +- Simulation failures must be actionable before any matching production rollout is considered + healthy. + +## Consequences + +We get FoundationDB-style pressure in a much smaller shape: many deterministic failure schedules can +run as normal Rust tests without booting machines. The first media model covers duplicate publisher +convergence, network partitions, transient loss, publisher restart/backfill, convergence latency, +encoder drift, and publisher phase alignment, and the first runtime command applies it to archive +manifests. 
The first control model covers gossip propagation across relays and nodes under dropped, +delayed, duplicated, partitioned, and outage-delayed control messages. The shrink/replay path makes +supported failures small enough to debug before they become production event archaeology; exact +scenario JSON is the replay contract. Later models can add tuner scheduling, relay cache eviction, +and image rollout state machines. The composed system model is the first workload-level step: it +checks the boundary between control-plane speed and media determinism, which is where production +duplicate publishers are currently most fragile. + +## Alternatives considered + +- Keep writing production probes only. Rejected because probes prove what happened once, not what + should happen across many fault schedules. +- Extend the Python node-agent as the simulation oracle. Rejected because the image should get + thinner and the runtime behavior belongs in the Rust node. + +## Rollout/teardown + +Roll forward by adding simulation tests next to each new distributed invariant. Roll back by keeping +the production probes; the simulation module is library-only and has no runtime service impact. diff --git a/nix/modules/ec-node.nix b/nix/modules/ec-node.nix index d944bf8..ec5eb26 100644 --- a/nix/modules/ec-node.nix +++ b/nix/modules/ec-node.nix @@ -5,6 +5,10 @@ let ecNodePkgDefault = pkgs.callPackage ../pkgs/ec-node.nix { }; ecCliPkgDefault = pkgs.callPackage ../pkgs/ec-cli.nix { }; + nodeEntrypoint = pkgs.callPackage ../pkgs/every-channel-node.nix { + ec-node = cfg.package; + moq-relay = null; + }; # Minimal normalization for host strings. normalizeHost = host: @@ -16,7 +20,19 @@ let sanitizeUnitName = s: lib.concatStringsSep "-" (lib.filter (x: x != "") (lib.splitString "/" (lib.replaceStrings [ " " ":" "." 
] [ "-" "-" "-" ] s))); + archiveConvergenceProofUnitName = proof: + "every-channel-archive-convergence-${sanitizeUnitName proof.name}"; + archiveConvergenceMeasureProofUnitName = proof: + "every-channel-archive-convergence-measure-${sanitizeUnitName proof.name}"; + hdhrBase = if cfg.hdhomerun.host != null then normalizeHost cfg.hdhomerun.host else null; + normalizePath = path: lib.removeSuffix "/" (toString path); + sameOrUnder = parent: child: + let + p = normalizePath parent; + c = normalizePath child; + in + c == p || lib.hasPrefix "${p}/" c; mkInputUrl = broadcast: let @@ -26,6 +42,163 @@ let else null; in "${base}/auto/v${broadcast.channel}"; + otelEnvironment = + lib.optionalAttrs (cfg.observability.otelTracesEndpoint != null) { + EVERY_CHANNEL_OTEL_TRACES_ENDPOINT = cfg.observability.otelTracesEndpoint; + OTEL_SERVICE_NAME = cfg.observability.serviceName; + }; + serviceEnvironment = { + EVERY_CHANNEL_NODE_NAME = config.networking.hostName; + } // cfg.environment // otelEnvironment; + + mkArchiveConvergenceProofService = proof: + let + unit = archiveConvergenceProofUnitName proof; + sourceArgLines = lib.concatMapStrings (source: "cmd+=(--source ${lib.escapeShellArg source})\n") proof.sources; + optionalArgLine = flag: value: + lib.optionalString (value != null) "cmd+=(${flag} ${lib.escapeShellArg (toString value)})\n"; + runner = pkgs.writeShellApplication { + name = unit; + runtimeInputs = [ + cfg.package + ]; + text = '' + set -euo pipefail + + cmd=( + ${lib.escapeShellArg "${cfg.package}/bin/ec-node"} + archive-convergence-serve + --broadcast ${lib.escapeShellArg proof.broadcast} + --track ${lib.escapeShellArg proof.track} + --listen ${lib.escapeShellArg proof.listen} + --metrics-role ${lib.escapeShellArg proof.metricsRole} + ) + ${sourceArgLines} + ${optionalArgLine "--stream-id" proof.streamId} + ${optionalArgLine "--rendition" proof.rendition} + ${optionalArgLine "--start-sequence" proof.startSequence} + ${optionalArgLine "--end-sequence" 
proof.endSequence} + ${optionalArgLine "--metrics-node" proof.metricsNode} + + exec "''${cmd[@]}" + ''; + }; + in + { + name = unit; + value = { + description = "every.channel archive convergence metrics (${proof.name})"; + wantedBy = [ "multi-user.target" ]; + after = [ "network-online.target" ]; + wants = [ "network-online.target" ]; + + unitConfig = { + StartLimitIntervalSec = 0; + }; + + serviceConfig = { + Type = "simple"; + ExecStart = "${runner}/bin/${unit}"; + Restart = "always"; + RestartSec = 2; + + NoNewPrivileges = true; + PrivateTmp = true; + ProtectSystem = "strict"; + ProtectHome = true; + ProtectKernelTunables = true; + ProtectKernelModules = true; + ProtectControlGroups = true; + LockPersonality = true; + MemoryDenyWriteExecute = true; + RestrictSUIDSGID = true; + RestrictRealtime = true; + SystemCallArchitectures = "native"; + }; + + environment = serviceEnvironment; + }; + }; + mkArchiveConvergenceMeasureProofService = proof: + let + unit = archiveConvergenceMeasureProofUnitName proof; + agentArgLines = lib.concatMapStrings (source: "cmd+=(--agent-manifest ${lib.escapeShellArg source})\n") proof.agentManifests; + agentPrometheusSdArgLines = lib.concatMapStrings (source: "cmd+=(--agent-prometheus-sd ${lib.escapeShellArg source})\n") proof.agentPrometheusSdFiles; + agentPrometheusSdLabelArgLines = lib.concatMapStrings (label: "cmd+=(--agent-prometheus-sd-label ${lib.escapeShellArg label})\n") proof.agentPrometheusSdLabels; + manifestArgLines = lib.concatMapStrings (source: "cmd+=(--manifest ${lib.escapeShellArg source})\n") proof.manifests; + optionalArgLine = flag: value: + lib.optionalString (value != null) "cmd+=(${flag} ${lib.escapeShellArg (toString value)})\n"; + runner = pkgs.writeShellApplication { + name = unit; + runtimeInputs = [ + cfg.package + ]; + text = '' + set -euo pipefail + + cmd=( + ${lib.escapeShellArg "${cfg.package}/bin/ec-node"} + archive-convergence-measure-serve + --broadcast ${lib.escapeShellArg proof.broadcast} + --track 
${lib.escapeShellArg proof.track} + --listen ${lib.escapeShellArg proof.listen} + --agent-manifest-role ${lib.escapeShellArg proof.agentManifestRole} + --timeout-ms ${toString proof.timeoutMs} + --max-manifest-bytes ${toString proof.maxManifestBytes} + --max-samples ${toString proof.maxSamples} + --min-elapsed-seconds ${toString proof.minElapsedSeconds} + --metrics-role ${lib.escapeShellArg proof.metricsRole} + ) + ${agentArgLines} + ${agentPrometheusSdArgLines} + ${agentPrometheusSdLabelArgLines} + ${manifestArgLines} + ${optionalArgLine "--stream-id" proof.streamId} + ${optionalArgLine "--rendition" proof.rendition} + ${optionalArgLine "--start-sequence" proof.startSequence} + ${optionalArgLine "--end-sequence" proof.endSequence} + ${optionalArgLine "--prometheus-url" proof.prometheusUrl} + ${optionalArgLine "--metrics-node" proof.metricsNode} + + exec "''${cmd[@]}" + ''; + }; + in + { + name = unit; + value = { + description = "every.channel remote archive convergence measurement metrics (${proof.name})"; + wantedBy = [ "multi-user.target" ]; + after = [ "network-online.target" ]; + wants = [ "network-online.target" ]; + + unitConfig = { + StartLimitIntervalSec = 0; + }; + + serviceConfig = { + Type = "simple"; + ExecStart = "${runner}/bin/${unit}"; + Restart = "always"; + RestartSec = 2; + + NoNewPrivileges = true; + PrivateTmp = true; + ProtectSystem = "strict"; + ProtectHome = true; + ProtectKernelTunables = true; + ProtectKernelModules = true; + ProtectControlGroups = true; + LockPersonality = true; + MemoryDenyWriteExecute = true; + RestrictSUIDSGID = true; + RestrictRealtime = true; + SystemCallArchitectures = "native"; + }; + + environment = serviceEnvironment; + }; + }; in { @@ -52,16 +225,96 @@ in description = "MoQ relay URL for ec-node wt-publish."; }; + relayAnnouncedWatchdogMs = lib.mkOption { + type = lib.types.int; + default = 90000; + description = "Restart wt-publish when the relay /announced endpoint stops listing the broadcast for this many 
milliseconds. Set 0 to disable."; + }; + + relayAnnouncedWatchdogIntervalMs = lib.mkOption { + type = lib.types.int; + default = 10000; + description = "Polling interval for the wt-publish relay announcement watchdog."; + }; + transcode = lib.mkOption { type = lib.types.bool; default = true; description = "Whether ec-node should transcode to H.264/AAC before fragmenting."; }; + videoFilter = lib.mkOption { + type = lib.types.str; + default = "yadif=mode=send_frame:parity=auto:deint=interlaced,fps=30000/1001"; + description = "ffmpeg video filter passed to ec-node wt-publish when transcoding."; + }; + + gopFrames = lib.mkOption { + type = lib.types.int; + default = 30; + description = "H.264 GOP/keyframe interval in frames for ec-node wt-publish."; + }; + + videoPreset = lib.mkOption { + type = lib.types.str; + default = "medium"; + description = "x264 preset for ec-node wt-publish. Slower presets trade publisher CPU for lower bitrate at the same CRF."; + }; + + videoCrf = lib.mkOption { + type = lib.types.int; + default = 23; + description = "x264 CRF for ec-node wt-publish."; + }; + + publisherArchiveSegmentDurationMs = lib.mkOption { + type = lib.types.int; + default = 1001; + description = "Publisher-origin proof segment duration passed to ec-node wt-publish."; + }; + + publisherStartBoundaryMs = lib.mkOption { + type = lib.types.int; + default = 1001; + description = "Unix-epoch cadence boundary used before starting publisher ffmpeg."; + }; + + publisherArchive = { + enable = lib.mkOption { + type = lib.types.bool; + default = false; + description = "Enable publisher-origin source-window proof archive records for direct wt-publish units."; + }; + + outputDir = lib.mkOption { + type = lib.types.str; + default = "/srv/every-channel/archive-buffer"; + description = "Publisher-local CAS cache root passed to ec-node wt-publish."; + }; + + manifestDir = lib.mkOption { + type = lib.types.str; + default = "/srv/every-channel/archive-buffer/manifests"; + description = 
"Publisher-local manifest root passed to ec-node wt-publish."; + }; + + sourceNode = lib.mkOption { + type = lib.types.nullOr lib.types.str; + default = null; + description = "Source node identity stamped into publisher-origin proof records."; + }; + + track = lib.mkOption { + type = lib.types.str; + default = "publisher.m4s"; + description = "Publisher-origin proof track name."; + }; + }; + passthrough = lib.mkOption { type = lib.types.bool; - default = true; - description = "Whether to transmit CMAF fMP4 fragments directly (seedbox-compatible passthrough)."; + default = false; + description = "Compatibility flag for older moq-mux passthrough selection; moq-mux 0.4 preserves generated CMAF fragments."; }; tlsDisableVerify = lib.mkOption { @@ -76,6 +329,12 @@ in description = "Extra arguments appended to each ec-node wt-publish invocation."; }; + ntscRsCli = lib.mkOption { + type = lib.types.nullOr lib.types.str; + default = null; + description = "Optional ntsc-rs-cli binary path used by broadcasts that set `ntscRsPreset`."; + }; + environment = lib.mkOption { type = lib.types.attrsOf lib.types.str; default = { @@ -87,6 +346,21 @@ in description = "Environment variables for the publisher services."; }; + observability = { + otelTracesEndpoint = lib.mkOption { + type = lib.types.nullOr lib.types.str; + default = null; + example = "http://127.0.0.1:4320/v1/traces"; + description = "Optional OTLP/HTTP trace endpoint for ec-node Rust tracing spans."; + }; + + serviceName = lib.mkOption { + type = lib.types.str; + default = "ec-node"; + description = "OpenTelemetry service.name for ec-node processes."; + }; + }; + hdhomerun = { host = lib.mkOption { type = lib.types.nullOr lib.types.str; @@ -206,13 +480,13 @@ in outputDir = lib.mkOption { type = lib.types.str; - default = "/var/lib/every-channel/archive"; + default = "/srv/every-channel/archive"; description = "CAS object root passed to `ec-node wt-archive --output-dir`."; }; manifestDir = lib.mkOption { type = 
lib.types.str; - default = "/var/lib/every-channel/manifests"; + default = "/srv/every-channel/archive/manifests"; description = "Manifest/index root passed to `ec-node wt-archive --manifest-dir`."; }; @@ -238,9 +512,10 @@ in type = lib.types.listOf lib.types.str; default = [ "catalog.json" + "catalog" "init.mp4" - "video0.m4s" - "audio0.m4s" + "0.m4s" + "1.m4s" ]; description = "Tracks passed to each `wt-archive` worker."; }; @@ -264,6 +539,220 @@ in description = "Listen address passed to `ec-node wt-archive-serve --listen`."; }; }; + + convergence = { + proofs = lib.mkOption { + type = lib.types.listOf (lib.types.submodule { + options = { + name = lib.mkOption { + type = lib.types.str; + description = "Short unique name for this archive convergence proof service."; + }; + + sources = lib.mkOption { + type = lib.types.listOf lib.types.str; + default = [ ]; + example = [ + "nuc-a=/srv/every-channel/archive-buffer/manifests" + "nuc-b=/srv/every-channel/archive-peer/manifests" + ]; + description = "Archive source entries passed as repeated `ec-node archive-convergence-serve --source NAME=PATH` arguments."; + }; + + broadcast = lib.mkOption { + type = lib.types.str; + description = "Broadcast name passed to `archive-convergence-serve --broadcast`."; + }; + + track = lib.mkOption { + type = lib.types.str; + default = "publisher.m4s"; + description = "Archive track passed to `archive-convergence-serve --track`."; + }; + + streamId = lib.mkOption { + type = lib.types.nullOr lib.types.str; + default = null; + description = "Optional stream id filter passed to `archive-convergence-serve --stream-id`."; + }; + + rendition = lib.mkOption { + type = lib.types.nullOr lib.types.str; + default = null; + description = "Optional rendition filter passed to `archive-convergence-serve --rendition`."; + }; + + startSequence = lib.mkOption { + type = lib.types.nullOr lib.types.int; + default = null; + description = "Optional start sequence passed to `archive-convergence-serve 
--start-sequence`."; + }; + + endSequence = lib.mkOption { + type = lib.types.nullOr lib.types.int; + default = null; + description = "Optional end sequence passed to `archive-convergence-serve --end-sequence`."; + }; + + listen = lib.mkOption { + type = lib.types.str; + default = "127.0.0.1:7812"; + description = "Listen address for this proof service's `/health` and `/metrics` endpoints."; + }; + + metricsNode = lib.mkOption { + type = lib.types.nullOr lib.types.str; + default = null; + description = "Optional node label override for emitted Prometheus metrics."; + }; + + metricsRole = lib.mkOption { + type = lib.types.str; + default = "duplicate-proof"; + description = "Role label for emitted Prometheus metrics."; + }; + }; + }); + default = [ ]; + description = "Named Rust archive convergence proof services exposed as Prometheus scrape targets."; + }; + + remoteProofs = lib.mkOption { + type = lib.types.listOf (lib.types.submodule { + options = { + name = lib.mkOption { + type = lib.types.str; + description = "Short unique name for this remote archive convergence measurement service."; + }; + + agentManifests = lib.mkOption { + type = lib.types.listOf lib.types.str; + default = [ ]; + example = [ + "nuc-a=http://ec-publisher-a:7799" + "nuc-b=http://ec-publisher-b:7799" + ]; + description = "Node-agent base URLs passed as repeated `ec-node archive-convergence-measure-serve --agent-manifest NAME=URL` arguments."; + }; + + agentPrometheusSdFiles = lib.mkOption { + type = lib.types.listOf lib.types.str; + default = [ ]; + example = [ "/var/lib/prometheus/every-channel-node-agents.json" ]; + description = "Prometheus file-SD JSON files passed as repeated `ec-node archive-convergence-measure-serve --agent-prometheus-sd PATH` arguments."; + }; + + agentPrometheusSdLabels = lib.mkOption { + type = lib.types.listOf lib.types.str; + default = [ ]; + example = [ "headscale_user=node" ]; + description = "Optional label filters passed as repeated `ec-node 
archive-convergence-measure-serve --agent-prometheus-sd-label KEY=VALUE` arguments."; + }; + + agentManifestRole = lib.mkOption { + type = lib.types.str; + default = "publisher-buffer"; + description = "Role query parameter for node-agent `/v1/archive-manifest` samples."; + }; + + manifests = lib.mkOption { + type = lib.types.listOf lib.types.str; + default = [ ]; + example = [ + "nuc-a=https://publisher-a.example/manifests/la-kcop/publisher.m4s.jsonl" + "nuc-b=https://publisher-b.example/manifests/la-kcop/publisher.m4s.jsonl" + ]; + description = "Direct manifest JSONL URLs passed as repeated `ec-node archive-convergence-measure-serve --manifest NAME=URL` arguments."; + }; + + broadcast = lib.mkOption { + type = lib.types.str; + description = "Broadcast name passed to `archive-convergence-measure-serve --broadcast`."; + }; + + track = lib.mkOption { + type = lib.types.str; + default = "publisher.m4s"; + description = "Archive track passed to `archive-convergence-measure-serve --track`."; + }; + + streamId = lib.mkOption { + type = lib.types.nullOr lib.types.str; + default = null; + description = "Optional stream id filter passed to `archive-convergence-measure-serve --stream-id`."; + }; + + rendition = lib.mkOption { + type = lib.types.nullOr lib.types.str; + default = null; + description = "Optional rendition filter passed to `archive-convergence-measure-serve --rendition`."; + }; + + startSequence = lib.mkOption { + type = lib.types.nullOr lib.types.int; + default = null; + description = "Optional start sequence passed to `archive-convergence-measure-serve --start-sequence`."; + }; + + endSequence = lib.mkOption { + type = lib.types.nullOr lib.types.int; + default = null; + description = "Optional end sequence passed to `archive-convergence-measure-serve --end-sequence`."; + }; + + prometheusUrl = lib.mkOption { + type = lib.types.nullOr lib.types.str; + default = null; + description = "Optional Prometheus URL queried by the remote proof service for 
Grafana-facing duplicate/miss series."; + }; + + timeoutMs = lib.mkOption { + type = lib.types.int; + default = 10000; + description = "HTTP timeout for each sampled manifest or Prometheus query."; + }; + + maxManifestBytes = lib.mkOption { + type = lib.types.int; + default = 4 * 1024 * 1024; + description = "Maximum manifest bytes fetched per remote source sample."; + }; + + listen = lib.mkOption { + type = lib.types.str; + default = "127.0.0.1:7813"; + description = "Listen address for this remote proof service's `/health` and `/metrics` endpoints."; + }; + + maxSamples = lib.mkOption { + type = lib.types.int; + default = 16; + description = "Maximum recent scrape samples retained for elapsed duplicate convergence proof."; + }; + + minElapsedSeconds = lib.mkOption { + type = lib.types.int; + default = 30; + description = "Minimum elapsed sample window required before this remote proof reports ok."; + }; + + metricsNode = lib.mkOption { + type = lib.types.nullOr lib.types.str; + default = null; + description = "Optional node label override for emitted Prometheus metrics."; + }; + + metricsRole = lib.mkOption { + type = lib.types.str; + default = "duplicate-proof"; + description = "Role label for emitted Prometheus metrics."; + }; + }; + }); + default = [ ]; + description = "Named Rust remote archive convergence measurement services exposed as Prometheus scrape targets."; + }; + }; }; nbc = { @@ -406,6 +895,21 @@ in default = null; description = "Optional NBC watch/live URL for a browser-backed relay publish worker."; }; + ntscRsPreset = lib.mkOption { + type = lib.types.nullOr lib.types.str; + default = null; + description = "Optional ntsc-rs preset JSON rendered before wt-publish ffmpeg ingest."; + }; + ntscRsOutput = lib.mkOption { + type = lib.types.nullOr lib.types.str; + default = null; + description = "Optional ntsc-rs processed output path. 
Defaults to a private /tmp path for this service."; + }; + ntscRsCli = lib.mkOption { + type = lib.types.nullOr lib.types.str; + default = null; + description = "Optional per-broadcast ntsc-rs-cli binary path."; + }; }; }); default = [ ]; @@ -415,10 +919,6 @@ in config = lib.mkIf cfg.enable { assertions = [ - { - assertion = (cfg.broadcasts != [ ]) || cfg.archive.enable; - message = "services.every-channel.ec-node.broadcasts must be non-empty unless archive.enable=true"; - } { assertion = let @@ -431,6 +931,10 @@ in assertion = !(cfg.hdhomerun.autoDiscover && cfg.hdhomerun.host != null); message = "hdhomerun.autoDiscover only applies when hdhomerun.host is unset"; } + { + assertion = cfg.publisherArchiveSegmentDurationMs > 0 && cfg.publisherStartBoundaryMs >= 0; + message = "services.every-channel.ec-node publisher proof cadence values must be usable."; + } { assertion = builtins.all @@ -447,12 +951,80 @@ in (!hasNbcBroadcast) || cfg.nbc.enable; message = "Set services.every-channel.ec-node.nbc.enable = true before configuring `broadcasts.*.nbcUrl`"; } + { + assertion = !cfg.archive.enable || !(sameOrUnder "/var/lib/every-channel" cfg.archive.outputDir); + message = "services.every-channel.ec-node.archive.outputDir must not live under /var/lib/every-channel; put archive video CAS on /srv/every-channel, /tank/every-channel, or another data filesystem."; + } + { + assertion = + builtins.all + (proof: lib.length proof.sources >= 2) + cfg.archive.convergence.proofs; + message = "Each services.every-channel.ec-node.archive.convergence.proofs entry must set at least two `sources` values."; + } + { + assertion = + let + proofNames = + (map (proof: proof.name) cfg.archive.convergence.proofs) + ++ (map (proof: proof.name) cfg.archive.convergence.remoteProofs); + in + lib.length proofNames == lib.length (lib.unique proofNames); + message = "services.every-channel.ec-node.archive.convergence proof names must be unique."; + } + { + assertion = + let + proofListens = + (map 
(proof: proof.listen) cfg.archive.convergence.proofs) + ++ (map (proof: proof.listen) cfg.archive.convergence.remoteProofs); + in + lib.length proofListens == lib.length (lib.unique proofListens); + message = "services.every-channel.ec-node.archive.convergence proof listen addresses must be unique."; + } + { + assertion = + builtins.all + (proof: + (proof.startSequence == null || proof.startSequence >= 0) + && (proof.endSequence == null || proof.endSequence >= 0)) + cfg.archive.convergence.proofs; + message = "services.every-channel.ec-node.archive.convergence.proofs sequence bounds must be non-negative."; + } + { + assertion = + builtins.all + (proof: + lib.length proof.agentManifests + lib.length proof.manifests >= 2 + || lib.length proof.agentPrometheusSdFiles > 0) + cfg.archive.convergence.remoteProofs; + message = "Each services.every-channel.ec-node.archive.convergence.remoteProofs entry must set at least two static sources or at least one `agentPrometheusSdFiles` value."; + } + { + assertion = + builtins.all + (proof: + (proof.startSequence == null || proof.startSequence >= 0) + && (proof.endSequence == null || proof.endSequence >= 0)) + cfg.archive.convergence.remoteProofs; + message = "services.every-channel.ec-node.archive.convergence.remoteProofs sequence bounds must be non-negative."; + } + { + assertion = + builtins.all + (proof: + proof.timeoutMs > 0 + && proof.maxManifestBytes > 0 + && proof.maxSamples >= 2 + && proof.minElapsedSeconds >= 0) + cfg.archive.convergence.remoteProofs; + message = "services.every-channel.ec-node.archive.convergence.remoteProofs timeout, manifest byte limit, sample count, and elapsed window must be usable."; + } ]; systemd.tmpfiles.rules = [ "d /run/every-channel 1777 root root - -" - "d /run/every-channel/source-locks 1777 root root - -" ] ++ lib.optionals cfg.nbc.enable [ "d /var/lib/every-channel 0750 every-channel every-channel - -" @@ -462,6 +1034,12 @@ in ++ lib.optionals cfg.archive.enable [ "d 
${cfg.archive.outputDir} 0750 root root - -" "d ${cfg.archive.manifestDir} 0750 root root - -" + ] + ++ lib.optionals cfg.publisherArchive.enable [ + "d ${cfg.publisherArchive.outputDir} 0750 root root - -" + "d ${cfg.publisherArchive.outputDir}/objects 0750 root root - -" + "d ${cfg.publisherArchive.outputDir}/objects/blake3 0750 root root - -" + "d ${cfg.publisherArchive.manifestDir} 0750 root root - -" ]; users.groups.every-channel = lib.mkIf cfg.nbc.enable { }; @@ -478,6 +1056,11 @@ in let unit = "every-channel-wt-publish-${sanitizeUnitName b.name}"; isNbc = b.nbcUrl != null; + ntscRsCli = if b.ntscRsCli != null then b.ntscRsCli else cfg.ntscRsCli; + ntscRsOutput = + if b.ntscRsOutput != null + then b.ntscRsOutput + else "/tmp/every-channel-ntsc-rs/${sanitizeUnitName b.name}.mp4"; runner = pkgs.writeShellApplication { name = unit; runtimeInputs = @@ -488,8 +1071,7 @@ in pkgs.findutils pkgs.gawk pkgs.iproute2 - pkgs.util-linux - cfg.package + nodeEntrypoint ] ++ lib.optionals (isNbc && cfg.nbc.requireMullvad) [ pkgs.mullvad-vpn ] ++ lib.optionals (isNbc && cfg.nbc.isolateWithUserNetns) [ @@ -574,7 +1156,6 @@ in set +e wait "$ns_pid" status=$? - set -e kill "$slirp_pid" 2>/dev/null || true wait "$slirp_pid" 2>/dev/null || true @@ -582,36 +1163,8 @@ in return "$status" } - run_source_command() { - local status source_lock_fd - status=0 - source_lock_fd="" - - if [[ -n "''${source_lock:-}" ]]; then - exec {source_lock_fd}>"$source_lock" - if ! flock -n "$source_lock_fd"; then - echo "ec-node: source already active on this node, skipping duplicate publisher: $source_id" >&2 - exec {source_lock_fd}>&- - return 0 - fi - fi - - set +e - "$@" - status=$? 
- set -e - - if [[ -n "$source_lock_fd" ]]; then - flock -u "$source_lock_fd" 2>/dev/null || true - exec {source_lock_fd}>&- - fi - return "$status" - } - nbc_url=${lib.escapeShellArg nbcUrlStr} input="" - source_id="" - source_lock="" if [[ -z "$nbc_url" ]]; then explicit_input=${lib.escapeShellArg explicitInputStr} if [[ -n "$explicit_input" ]]; then @@ -706,29 +1259,52 @@ in host="''${hostport%%:*}" input="http://$host:5004/auto/v$ch" fi - source_id="$input" fi if [[ -n "$nbc_url" ]]; then - source_id="$nbc_url" cmd=( - ${lib.escapeShellArg "${cfg.package}/bin/ec-node"} + ${lib.escapeShellArg "${nodeEntrypoint}/bin/every-channel-node"} nbc-wt-publish --url ${lib.escapeShellArg cfg.relayUrl} --name ${lib.escapeShellArg b.name} --source-url "$nbc_url" + --gop-frames ${toString cfg.gopFrames} ) else cmd=( - ${lib.escapeShellArg "${cfg.package}/bin/ec-node"} + ${lib.escapeShellArg "${nodeEntrypoint}/bin/every-channel-node"} wt-publish --url ${lib.escapeShellArg cfg.relayUrl} --name ${lib.escapeShellArg b.name} --input "$input" + --video-filter ${lib.escapeShellArg cfg.videoFilter} + --gop-frames ${toString cfg.gopFrames} + --video-preset ${lib.escapeShellArg cfg.videoPreset} + --video-crf ${toString cfg.videoCrf} + --relay-announced-watchdog-ms ${toString cfg.relayAnnouncedWatchdogMs} + --relay-announced-watchdog-interval-ms ${toString cfg.relayAnnouncedWatchdogIntervalMs} + --publisher-archive-segment-duration-ms ${toString cfg.publisherArchiveSegmentDurationMs} + --publisher-start-boundary-ms ${toString cfg.publisherStartBoundaryMs} ) + ${lib.optionalString cfg.publisherArchive.enable '' + cmd+=(--publisher-archive-output-dir ${lib.escapeShellArg cfg.publisherArchive.outputDir}) + cmd+=(--publisher-archive-manifest-dir ${lib.escapeShellArg cfg.publisherArchive.manifestDir}) + cmd+=(--publisher-archive-track ${lib.escapeShellArg cfg.publisherArchive.track}) + ''} + ${lib.optionalString (cfg.publisherArchive.enable && cfg.publisherArchive.sourceNode != null) '' + 
cmd+=(--publisher-archive-source-node ${lib.escapeShellArg cfg.publisherArchive.sourceNode}) + ''} ${lib.optionalString (!cfg.transcode) "cmd+=(--transcode=false)"} + ${lib.optionalString (b.ntscRsPreset != null) '' + cmd+=(--ntsc-rs-preset ${lib.escapeShellArg b.ntscRsPreset}) + cmd+=(--ntsc-rs-output ${lib.escapeShellArg ntscRsOutput}) + ntsc_rs_cli=${lib.escapeShellArg (if ntscRsCli != null then ntscRsCli else "")} + if [[ -n "$ntsc_rs_cli" ]]; then + cmd+=(--ntsc-rs-cli "$ntsc_rs_cli") + fi + ''} fi - ${lib.optionalString (!cfg.passthrough) "cmd+=(--passthrough=false)"} + ${lib.optionalString cfg.passthrough "cmd+=(--passthrough=true)"} ${lib.optionalString cfg.tlsDisableVerify "cmd+=(--tls-disable-verify)"} ${lib.optionalString cfg.control.enable '' cmd+=(--control-announce) @@ -747,27 +1323,54 @@ in ''} ${extraArgsLine} - if [[ -n "$source_id" ]]; then - source_key="$(printf '%s' "$source_id" | tr -c 'A-Za-z0-9_.-' '_')" - source_lock="/run/every-channel/source-locks/$source_key.lock" - fi - # Keep the unit alive even if the relay is temporarily unreachable. # This avoids `switch-to-configuration test` failing due to a unit that exits # quickly during activation. trap 'exit 0' INT TERM + is_nbc=${lib.boolToString isNbc} + failure_count=0 while true; do ${lib.optionalString (isNbc && cfg.nbc.requireMullvad) '' if ! wait_for_mullvad; then - sleep 2 + sleep 30 continue fi ''} - ${lib.optionalString (isNbc && cfg.nbc.isolateWithUserNetns) "run_source_command run_in_user_netns || true"} + started_at="$(date +%s)" + status=0 + set +e + ${lib.optionalString (isNbc && cfg.nbc.isolateWithUserNetns) "run_in_user_netns"} ${lib.optionalString (!isNbc || !cfg.nbc.isolateWithUserNetns) '' - run_source_command "''${cmd[@]}" || true + "''${cmd[@]}" ''} - sleep 2 + status=$? 
+ set -e + elapsed=$(( $(date +%s) - started_at )) + if [[ "$status" -eq 0 ]]; then + failure_count=0 + sleep_seconds=2 + else + failure_count=$(( failure_count + 1 )) + if [[ "$is_nbc" == "true" ]]; then + sleep_seconds=300 + if [[ "$failure_count" -gt 1 ]]; then + for _ in $(seq 2 "$failure_count"); do + sleep_seconds=$(( sleep_seconds * 2 )) + if [[ "$sleep_seconds" -ge 1800 ]]; then + sleep_seconds=1800 + break + fi + done + fi + else + sleep_seconds=$(( 2 * failure_count )) + if [[ "$sleep_seconds" -gt 60 ]]; then + sleep_seconds=60 + fi + fi + echo "ec-node: command exited with status $status after ''${elapsed}s; retrying in ''${sleep_seconds}s" >&2 + fi + sleep "$sleep_seconds" done ''; }; @@ -800,11 +1403,8 @@ in ExecStart = "${runner}/bin/${unit}"; Restart = "always"; RestartSec = 2; - KillMode = "control-group"; - TimeoutStopSec = "10s"; - SendSIGKILL = true; - DynamicUser = !isNbc; + DynamicUser = !isNbc && !cfg.publisherArchive.enable; User = lib.mkIf isNbc "every-channel"; Group = lib.mkIf isNbc "every-channel"; NoNewPrivileges = true; @@ -821,12 +1421,16 @@ in SystemCallArchitectures = "native"; ReadWritePaths = lib.optionals cfg.control.enable [ "/run/every-channel" ] + ++ lib.optionals cfg.publisherArchive.enable [ + cfg.publisherArchive.outputDir + cfg.publisherArchive.manifestDir + ] ++ lib.optionals isNbc [ "/tmp" ] ++ lib.optionals isNbc [ cfg.nbc.profileDir cfg.nbc.authScreenshotDir ]; }; environment = - cfg.environment + serviceEnvironment // lib.optionalAttrs isNbc ( { DISPLAY = cfg.nbc.display; @@ -873,7 +1477,7 @@ in name = bridgeUnit; runtimeInputs = [ pkgs.coreutils - cfg.package + nodeEntrypoint ]; text = '' set -euo pipefail @@ -885,7 +1489,7 @@ in while true; do cmd=( - ${lib.escapeShellArg "${cfg.package}/bin/ec-node"} + ${lib.escapeShellArg "${nodeEntrypoint}/bin/every-channel-node"} control-bridge-web --directory-url "$directory_url" --timeout-ms ${toString cfg.control.bridgeWeb.timeoutMs} @@ -956,7 +1560,7 @@ in 
SystemCallArchitectures = "native"; }; - environment = cfg.environment; + environment = serviceEnvironment; }; }) // lib.optionalAttrs cfg.archive.enable @@ -971,7 +1575,7 @@ in pkgs.curl pkgs.gawk pkgs.jq - cfg.package + nodeEntrypoint ]; text = '' set -euo pipefail @@ -982,89 +1586,106 @@ in relay_fallback=${lib.escapeShellArg cfg.relayUrl} relay_override=${lib.escapeShellArg (if cfg.archive.relayUrlOverride == null then "" else cfg.archive.relayUrlOverride)} stream_prefix=${lib.escapeShellArg archivePrefix} + source_node="$(cat /proc/sys/kernel/hostname 2>/dev/null || echo unknown)" state_dir="/run/every-channel/archive" pids_dir="$state_dir/pids" logs_dir="$state_dir/logs" + desired_file="$state_dir/desired" mkdir -p "$pids_dir" "$logs_dir" poll_secs="$(awk 'BEGIN { printf "%.3f", ${toString cfg.archive.pollIntervalMs} / 1000.0 }')" + relay_is_owned() { + local relay="$1" + local host + host="''${relay#https://}" + host="''${host%%/*}" + case "$host" in + relay.every.channel|lax.relay.every.channel|ord.relay.every.channel|nyc.relay.every.channel) + return 0 + ;; + esac + return 1 + } + cleanup_children() { - pids=() for pid_file in "$pids_dir"/*.pid; do [[ -e "$pid_file" ]] || continue pid="$(cat "$pid_file" 2>/dev/null || true)" - if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then - pids+=("$pid") + if [[ -n "$pid" ]]; then + kill "$pid" 2>/dev/null || true fi rm -f "$pid_file" done - if [[ "''${#pids[@]}" -gt 0 ]]; then - kill -TERM "''${pids[@]}" 2>/dev/null || true - sleep 1 - for pid in "''${pids[@]}"; do - if kill -0 "$pid" 2>/dev/null; then - kill -KILL "$pid" 2>/dev/null || true - fi - done - fi } trap cleanup_children INT TERM EXIT while true; do + : > "$desired_file" entries_json="$(curl -fsS "$directory_url" || true)" - if [[ -z "$entries_json" ]]; then - sleep "$poll_secs" - continue - fi - - while IFS= read -r entry; do - name="$(printf '%s\n' "$entry" | jq -r '.broadcast_name // empty')" - relay="$(printf '%s\n' "$entry" | jq -r '(.relay_url 
// .relays[0].relay_url // empty)')" - if [[ -z "$name" ]]; then - continue - fi - if [[ -n "$stream_prefix" && "$name" != "$stream_prefix"* ]]; then - continue - fi - if [[ -n "$relay_override" ]]; then - relay="$relay_override" - elif [[ -z "$relay" ]]; then - relay="$relay_fallback" - fi - - key="$(printf '%s' "$name" | tr -c 'A-Za-z0-9_.-' '_')" - pid_file="$pids_dir/$key.pid" - if [[ -s "$pid_file" ]]; then - pid="$(cat "$pid_file" 2>/dev/null || true)" - if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then + if [[ -n "$entries_json" ]]; then + while IFS= read -r entry; do + name="$(printf '%s\n' "$entry" | jq -r '.broadcast_name // empty')" + relay="$(printf '%s\n' "$entry" | jq -r '.relay_url // empty')" + if [[ -z "$name" ]]; then + continue + fi + if [[ -n "$stream_prefix" && "$name" != "$stream_prefix"* ]]; then + continue + fi + if [[ -n "$relay_override" ]]; then + relay="$relay_override" + elif [[ -z "$relay" ]]; then + relay="$relay_fallback" + fi + if ! relay_is_owned "$relay"; then + echo "ec-node: refusing to archive non-owned relay URL for $name: $relay" >&2 continue fi - fi - cmd=( - ${lib.escapeShellArg "${cfg.package}/bin/ec-node"} - wt-archive - --url "$relay" - --name "$name" - --output-dir "$output_dir" - --manifest-dir "$manifest_dir" - ) - ${lib.optionalString cfg.archive.tlsDisableVerify "cmd+=(--tls-disable-verify)"} - ${archiveTrackLines} + key="$(printf '%s' "$name" | tr -c 'A-Za-z0-9_.-' '_')" + printf '%s\n' "$key" >> "$desired_file" + pid_file="$pids_dir/$key.pid" + if [[ -s "$pid_file" ]]; then + pid="$(cat "$pid_file" 2>/dev/null || true)" + if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then + continue + fi + fi - log_file="$logs_dir/$key.log" - ( - exec "''${cmd[@]}" - ) >>"$log_file" 2>&1 & - echo "$!" 
> "$pid_file" - done < <(printf '%s\n' "$entries_json" | jq -rc '.entries[]?') + cmd=( + ${lib.escapeShellArg "${nodeEntrypoint}/bin/every-channel-node"} + wt-archive + --url "$relay" + --name "$name" + --output-dir "$output_dir" + --manifest-dir "$manifest_dir" + --source-node "$source_node" + ) + ${lib.optionalString cfg.archive.tlsDisableVerify "cmd+=(--tls-disable-verify)"} + ${archiveTrackLines} + + log_file="$logs_dir/$key.log" + ( + exec "''${cmd[@]}" + ) >>"$log_file" 2>&1 & + echo "$!" > "$pid_file" + done < <(printf '%s\n' "$entries_json" | jq -rc '.entries[]?') + fi for pid_file in "$pids_dir"/*.pid; do [[ -e "$pid_file" ]] || continue + key="''${pid_file##*/}" + key="''${key%.pid}" pid="$(cat "$pid_file" 2>/dev/null || true)" if [[ -z "$pid" ]] || ! kill -0 "$pid" 2>/dev/null; then rm -f "$pid_file" + continue + fi + if ! grep -Fxq "$key" "$desired_file"; then + echo "ec-node: stopping archive worker for no-longer-advertised stream $key" >&2 + kill "$pid" 2>/dev/null || true + rm -f "$pid_file" fi done @@ -1089,9 +1710,6 @@ in ExecStart = "${archiveRunner}/bin/${archiveUnit}"; Restart = "always"; RestartSec = 2; - KillMode = "control-group"; - TimeoutStopSec = "10s"; - SendSIGKILL = true; NoNewPrivileges = true; PrivateTmp = true; @@ -1112,7 +1730,7 @@ in ]; }; - environment = cfg.environment; + environment = serviceEnvironment; }; }) // lib.optionalAttrs (cfg.archive.enable && cfg.archive.serve.enable) @@ -1121,11 +1739,11 @@ in archiveServeRunner = pkgs.writeShellApplication { name = archiveServeUnit; runtimeInputs = [ - cfg.package + nodeEntrypoint ]; text = '' set -euo pipefail - exec ${lib.escapeShellArg "${cfg.package}/bin/ec-node"} \ + exec ${lib.escapeShellArg "${nodeEntrypoint}/bin/every-channel-node"} \ wt-archive-serve \ --output-dir ${lib.escapeShellArg cfg.archive.outputDir} \ --manifest-dir ${lib.escapeShellArg cfg.archive.manifestDir} \ @@ -1168,9 +1786,11 @@ in ]; }; - environment = cfg.environment; + environment = serviceEnvironment; }; 
}) + // lib.listToAttrs (map mkArchiveConvergenceProofService cfg.archive.convergence.proofs) + // lib.listToAttrs (map mkArchiveConvergenceMeasureProofService cfg.archive.convergence.remoteProofs) // lib.optionalAttrs cfg.nbc.enable (let displayUnit = "every-channel-nbc-display"; @@ -1237,7 +1857,7 @@ in ReadWritePaths = [ "/tmp" "/var/lib/every-channel" ]; }; - environment = cfg.environment // { + environment = serviceEnvironment // { HOME = "/var/lib/every-channel"; }; }; @@ -1271,7 +1891,7 @@ in ReadWritePaths = [ "/tmp" "/var/lib/every-channel" ]; }; - environment = cfg.environment // { + environment = serviceEnvironment // { DISPLAY = cfg.nbc.display; HOME = "/var/lib/every-channel"; }; diff --git a/nix/pkgs/ec-node.nix b/nix/pkgs/ec-node.nix index 146b95a..e6b3509 100644 --- a/nix/pkgs/ec-node.nix +++ b/nix/pkgs/ec-node.nix @@ -7,25 +7,38 @@ }: let - # Keep the build input stable and small; avoid copying `target/`, `tmp/`, etc. into the Nix store. + root = ../../.; + # Keep the build input stable and small. NixOS, infra, docs, and script-only + # changes should not perturb the Rust source hash for config-only deploys. src = lib.cleanSourceWith { - src = ../../.; + src = root; filter = path: type: let - base = baseNameOf path; + rel = lib.removePrefix "${toString root}/" (toString path); in - # Skip typical build outputs and large scratch dirs. 
- !(base == "target" - || base == ".git" - || base == ".direnv" - || base == "tmp" - || base == "node_modules" - || base == "out" - || base == "test-results" - || base == "deploy" - || base == "intake" - || base == "cache" - || base == ".tower-minimal"); + rel == "" + || rel == "Cargo.toml" + || rel == "Cargo.lock" + || rel == "crates" + || lib.hasPrefix "crates/" rel + || rel == "third_party" + || rel == "third_party/iroh-org" + || rel == "third_party/iroh-org/iroh-gossip" + || lib.hasPrefix "third_party/iroh-org/iroh-gossip/" rel + || rel == "third_party/iroh-live" + || rel == "third_party/iroh-live/iroh-moq" + || lib.hasPrefix "third_party/iroh-live/iroh-moq/" rel + || rel == "third_party/iroh-live/web-transport-iroh" + || lib.hasPrefix "third_party/iroh-live/web-transport-iroh/" rel + || rel == "apps" + || rel == "apps/tauri" + || rel == "apps/tauri/Cargo.toml" + || rel == "apps/tauri/build.rs" + || rel == "apps/tauri/tauri.conf.json" + || rel == "apps/tauri/gen" + || lib.hasPrefix "apps/tauri/gen/" rel + || rel == "apps/tauri/src" + || lib.hasPrefix "apps/tauri/src/" rel; }; in rustPlatform.buildRustPackage { @@ -52,7 +65,7 @@ rustPlatform.buildRustPackage { doCheck = false; meta = with lib; { - description = "every.channel node runner (ingest + chunk + MoQ publish)"; + description = "every.channel node (ingest + chunk + MoQ publish)"; mainProgram = "ec-node"; platforms = platforms.unix; license = licenses.agpl3Only; diff --git a/scripts/measure-duplicate-publishers-test.py b/scripts/measure-duplicate-publishers-test.py new file mode 100644 index 0000000..f929448 --- /dev/null +++ b/scripts/measure-duplicate-publishers-test.py @@ -0,0 +1,320 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import importlib.util +import json +import sys +import unittest +from pathlib import Path + + +REPO = Path(__file__).resolve().parents[1] +SCRIPT = REPO / "scripts" / "measure-duplicate-publishers.py" + + +def load_module(): + spec = 
importlib.util.spec_from_file_location("measure_duplicate_publishers", SCRIPT) + if spec is None or spec.loader is None: + raise RuntimeError(f"unable to load {SCRIPT}") + module = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = module + spec.loader.exec_module(module) + return module + + +class MeasureDuplicatePublishersTest(unittest.TestCase): + def test_manifest_hash_stats_counts_duplicates_divergence_and_missing_hashes(self) -> None: + module = load_module() + + records = [ + {"group_sequence": 10, "received_unix_ms": 1_000, "blake3": "same", "source_node": "nuc-a"}, + {"group_sequence": 10, "received_unix_ms": 1_001, "blake3": "same", "source_node": "nuc-b"}, + {"group_sequence": 11, "received_unix_ms": 2_000, "blake3": "left", "source_node": "nuc-a"}, + {"group_sequence": 11, "received_unix_ms": 2_001, "blake3": "right", "source_node": "nuc-b"}, + {"group_sequence": 12, "received_unix_ms": 3_000}, + ] + + stats = module.manifest_hash_stats(records, invalid_lines=2) + + self.assertEqual(5, stats["record_count"]) + self.assertEqual(2, stats["invalid_lines"]) + self.assertEqual(2, stats["sequence_count"]) + self.assertEqual(2, stats["source_identity_count"]) + self.assertEqual(["nuc-a", "nuc-b"], stats["source_identities"]) + self.assertEqual(1, stats["missing_source_identity_records"]) + self.assertEqual(1, stats["duplicate_hash_source_records"]) + self.assertEqual(1, stats["duplicate_hash_sequences"]) + self.assertEqual(1, stats["hash_divergent_sequences"]) + self.assertEqual(1, stats["missing_hash_records"]) + self.assertEqual(1_000, stats["first_received_unix_ms"]) + self.assertEqual(3_000, stats["latest_received_unix_ms"]) + + def test_compare_manifest_hashes_proves_byte_for_byte_matches(self) -> None: + module = load_module() + + comparison = module.compare_manifest_hashes( + { + "publisher-a": [ + {"group_sequence": 1, "blake3": "a", "source_node": "publisher-a"}, + {"group_sequence": 2, "blake3": "b", "source_node": "publisher-a"}, + ], 
+ "publisher-b": [ + {"group_sequence": 1, "blake3": "a", "source_node": "publisher-b"}, + {"group_sequence": 2, "blake3": "b", "source_node": "publisher-b"}, + ], + } + ) + + self.assertTrue(comparison["byte_for_byte_hash_match"]) + self.assertTrue(comparison["source_identity_ok"]) + self.assertEqual(["publisher-a", "publisher-b"], comparison["source_identities"]) + self.assertEqual(2, comparison["matching_sequence_count"]) + self.assertEqual(0, comparison["divergent_sequence_count"]) + self.assertEqual(0, comparison["missing_sequence_count"]) + + def test_compare_manifest_hashes_reports_divergent_sequences(self) -> None: + module = load_module() + + comparison = module.compare_manifest_hashes( + { + "publisher-a": [ + {"group_sequence": 1, "blake3": "a", "source_node": "publisher-a"}, + {"group_sequence": 2, "blake3": "b", "source_node": "publisher-a"}, + ], + "publisher-b": [ + {"group_sequence": 1, "blake3": "a", "source_node": "publisher-b"}, + {"group_sequence": 2, "blake3": "different", "source_node": "publisher-b"}, + {"group_sequence": 3, "blake3": "extra", "source_node": "publisher-b"}, + ], + } + ) + + self.assertFalse(comparison["byte_for_byte_hash_match"]) + self.assertEqual(1, comparison["matching_sequence_count"]) + self.assertEqual(1, comparison["divergent_sequence_count"]) + self.assertEqual(1, comparison["missing_sequence_count"]) + self.assertEqual(2, comparison["divergent_examples"][0]["sequence"]) + self.assertEqual(["different"], comparison["divergent_examples"][0]["hashes"]["publisher-b"]) + + def test_compare_manifest_hashes_rejects_intra_manifest_divergence(self) -> None: + module = load_module() + + comparison = module.compare_manifest_hashes( + { + "publisher-a": [ + {"group_sequence": 1, "blake3": "same", "source_node": "publisher-a"}, + ], + "publisher-b": [ + {"group_sequence": 1, "blake3": "same", "source_node": "publisher-b"}, + {"group_sequence": 1, "blake3": "different", "source_node": "publisher-b"}, + ], + } + ) + + 
self.assertFalse(comparison["byte_for_byte_hash_match"]) + self.assertEqual(0, comparison["matching_sequence_count"]) + self.assertEqual(1, comparison["divergent_sequence_count"]) + self.assertEqual(["different", "same"], comparison["divergent_examples"][0]["hashes"]["publisher-b"]) + + def test_compare_manifest_hashes_rejects_mirrored_same_source_records(self) -> None: + module = load_module() + + comparison = module.compare_manifest_hashes( + { + "nuc-a-buffer": [ + {"group_sequence": 1, "blake3": "same", "source_node": "archive-origin"}, + ], + "nuc-b-buffer": [ + {"group_sequence": 1, "blake3": "same", "source_node": "archive-origin"}, + ], + } + ) + + self.assertFalse(comparison["byte_for_byte_hash_match"]) + self.assertFalse(comparison["source_identity_ok"]) + self.assertEqual(["archive-origin"], comparison["source_identities"]) + + def test_summary_requires_manifest_comparison_and_prometheus_series(self) -> None: + module = load_module() + + summary = module.summarize( + [ + { + "sample_unix_ms": 1_000, + "publishers": { + "a": {"health_ok": True, "metrics_ok": True, "duplicate_metrics_present": True}, + "b": {"health_ok": True, "metrics_ok": True, "duplicate_metrics_present": True}, + }, + "manifest_comparison": { + "source_identity_ok": True, + "matching_sequence_count": 2, + "divergent_sequence_count": 0, + "byte_for_byte_hash_match": True, + }, + "prometheus": [ + { + "metric": "every_channel_ladder_archive_duplicate_hash_source_records", + "ok": True, + "series_present": True, + "value": 2, + }, + { + "metric": "every_channel_ladder_archive_hash_divergent_sequences", + "ok": True, + "series_present": True, + "value": 0, + }, + ], + }, + { + "sample_unix_ms": 31_000, + "publishers": { + "a": {"health_ok": True, "metrics_ok": True, "duplicate_metrics_present": True}, + "b": {"health_ok": True, "metrics_ok": True, "duplicate_metrics_present": True}, + }, + "manifest_comparison": { + "source_identity_ok": True, + "matching_sequence_count": 4, + 
"divergent_sequence_count": 0, + "byte_for_byte_hash_match": True, + }, + "prometheus": [ + { + "metric": "every_channel_ladder_archive_duplicate_hash_source_records", + "ok": True, + "series_present": True, + "value": 4, + }, + { + "metric": "every_channel_ladder_archive_hash_divergent_sequences", + "ok": True, + "series_present": True, + "value": 0, + }, + ], + }, + ] + ) + + self.assertTrue(summary["ok"]) + self.assertEqual(30_000, summary["elapsed_ms"]) + self.assertEqual(2, summary["sample_count"]) + self.assertEqual(4, summary["latest_manifest_comparison"]["matching_sequence_count"]) + + def test_summary_rejects_single_sample_and_manifest_hash_errors(self) -> None: + module = load_module() + + summary = module.summarize( + [ + { + "sample_unix_ms": 1_000, + "publishers": { + "a": {"health_ok": True, "metrics_ok": True, "duplicate_metrics_present": True}, + "b": {"health_ok": True, "metrics_ok": True, "duplicate_metrics_present": True}, + }, + "manifests": { + "a": { + "ok": True, + "hash_divergent_sequences": 1, + "missing_hash_records": 1, + "invalid_lines": 1, + }, + }, + "manifest_comparison": { + "source_identity_ok": True, + "matching_sequence_count": 2, + "divergent_sequence_count": 0, + "byte_for_byte_hash_match": True, + }, + "prometheus": [ + { + "metric": "every_channel_ladder_archive_duplicate_hash_source_records", + "ok": True, + "series_present": True, + "value": 2, + }, + ], + }, + ] + ) + + self.assertFalse(summary["ok"]) + self.assertIn("insufficient_elapsed_samples", summary["reasons"]) + self.assertIn("manifest_hash_divergence_observed", summary["reasons"]) + self.assertIn("manifest_hash_missing_records", summary["reasons"]) + self.assertIn("manifest_invalid_lines", summary["reasons"]) + + def test_summary_rejects_missing_or_non_diverse_source_identity(self) -> None: + module = load_module() + + summary = module.summarize( + [ + { + "sample_unix_ms": 1_000, + "manifest_comparison": { + "source_identity_ok": False, + 
"matching_sequence_count": 2, + "divergent_sequence_count": 0, + "byte_for_byte_hash_match": False, + }, + }, + { + "sample_unix_ms": 31_000, + "manifest_comparison": { + "source_identity_ok": False, + "matching_sequence_count": 2, + "divergent_sequence_count": 0, + "byte_for_byte_hash_match": False, + }, + "prometheus": [ + { + "metric": "every_channel_archive_missing_source_identity_records", + "ok": True, + "series_present": True, + "value": 2, + }, + ], + }, + ] + ) + + self.assertFalse(summary["ok"]) + self.assertIn("manifest_source_identity_missing_or_not_diverse", summary["reasons"]) + self.assertIn("prometheus_source_identity_missing_nonzero", summary["reasons"]) + + def test_agent_manifest_url_builds_bounded_tailnet_endpoint(self) -> None: + module = load_module() + + url = module.agent_manifest_url( + "http://100.64.0.5:7799/", + broadcast="la-kcop", + track="0.m4s", + role="publisher-buffer", + max_bytes=4096, + ) + + self.assertEqual( + "http://100.64.0.5:7799/v1/archive-manifest?broadcast=la-kcop&track=0.m4s&max_bytes=4096&role=publisher-buffer", + url, + ) + + def test_parser_defaults_to_publisher_origin_proof_track(self) -> None: + module = load_module() + + args = module.build_parser().parse_args([]) + + self.assertEqual("publisher.m4s", args.track) + + def test_parse_manifest_jsonl_tolerates_partial_first_tail_line(self) -> None: + module = load_module() + + body = 'not-json-prefix{"group_sequence":1}\n{"group_sequence":2,"blake3":"b"}\n' + records, invalid = module.parse_manifest_jsonl(body) + + self.assertEqual(0, invalid) + self.assertEqual([2], [record["group_sequence"] for record in records]) + + +if __name__ == "__main__": + unittest.main() diff --git a/scripts/measure-duplicate-publishers.py b/scripts/measure-duplicate-publishers.py new file mode 100755 index 0000000..672c6ff --- /dev/null +++ b/scripts/measure-duplicate-publishers.py @@ -0,0 +1,581 @@ +#!/usr/bin/env python3 +"""Measure duplicate publisher media-hash convergence in 
production.""" + +from __future__ import annotations + +import argparse +import json +import sys +import time +import urllib.parse +import urllib.request +from dataclasses import dataclass +from typing import Any, Callable + + +USER_AGENT = "every-channel-measure-duplicate-publishers/1" +DUPLICATE_PROMETHEUS_METRICS = [ + "every_channel_ladder_archive_duplicate_hash_source_records", + "every_channel_ladder_archive_duplicate_hash_sequences", + "every_channel_ladder_archive_hash_divergent_sequences", + "every_channel_ladder_archive_missing_hash_records", + "every_channel_ladder_archive_missing_source_identity_records", + "every_channel_archive_duplicate_hash_source_records", + "every_channel_archive_duplicate_hash_sequences", + "every_channel_archive_hash_divergent_sequences", + "every_channel_archive_missing_hash_records", + "every_channel_archive_missing_source_identity_records", +] +SOURCE_IDENTITY_KEYS = ("source_node", "publisher_node", "source_id") + + +@dataclass +class FetchResult: + url: str + status: int + body: str + elapsed_ms: int + error: str | None = None + + @property + def ok(self) -> bool: + return self.error is None and 200 <= self.status < 300 + + +def now_ms() -> int: + return int(time.time() * 1000) + + +def fetch_text(url: str, timeout: float, max_bytes: int = 4 * 1024 * 1024) -> FetchResult: + started = now_ms() + headers = {"User-Agent": USER_AGENT} + if max_bytes > 0: + headers["Range"] = f"bytes=-{max_bytes}" + req = urllib.request.Request(url, headers=headers) + try: + with urllib.request.urlopen(req, timeout=timeout) as res: + body = res.read(max_bytes + 1 if max_bytes > 0 else -1) + if max_bytes > 0 and len(body) > max_bytes: + body = body[-max_bytes:] + return FetchResult(url, int(res.status), body.decode("utf-8", "replace"), now_ms() - started) + except Exception as err: # noqa: BLE001 - measurements preserve transport failures. 
+ return FetchResult(url, 0, "", now_ms() - started, str(err)) + + +def fetch_json(url: str, timeout: float, max_bytes: int = 1024 * 1024) -> tuple[FetchResult, Any | None]: + fetched = fetch_text(url, timeout, max_bytes=max_bytes) + if not fetched.ok: + return fetched, None + try: + return fetched, json.loads(fetched.body) + except json.JSONDecodeError as err: + fetched.error = f"invalid json: {err}" + return fetched, None + + +def parse_named_url(value: str) -> tuple[str, str]: + if "=" not in value: + raise ValueError(f"expected NAME=URL: {value}") + name, url = value.split("=", 1) + name = name.strip() + url = url.strip() + if not name or not url: + raise ValueError(f"expected NAME=URL: {value}") + return name, url + + +def manifest_url(origin: str, broadcast: str, track: str) -> str: + base = origin.rstrip("/") + "/" + return urllib.parse.urljoin(base, f"manifests/{broadcast}/{track}.jsonl") + + +def parse_manifest_jsonl(body: str) -> tuple[list[dict[str, Any]], int]: + records: list[dict[str, Any]] = [] + invalid_lines = 0 + for index, line in enumerate(body.splitlines()): + raw = line.strip() + if not raw: + continue + try: + record = json.loads(raw) + except json.JSONDecodeError: + # Tail range reads may start in the middle of a JSON line. 
+ if index == 0: + continue + invalid_lines += 1 + continue + if isinstance(record, dict): + records.append(record) + else: + invalid_lines += 1 + return records, invalid_lines + + +def int_or_none(value: Any) -> int | None: + if isinstance(value, bool): + return None + if isinstance(value, int): + return value + try: + return int(str(value)) + except (TypeError, ValueError): + return None + + +def record_source_identity(record: dict[str, Any]) -> str | None: + for key in SOURCE_IDENTITY_KEYS: + value = record.get(key) + if isinstance(value, str) and value.strip(): + return value.strip() + return None + + +def manifest_hash_stats(records: list[dict[str, Any]], invalid_lines: int = 0) -> dict[str, Any]: + hashes_by_sequence: dict[int, set[str]] = {} + source_hashes_by_sequence: dict[int, dict[str, set[str]]] = {} + missing_hash_records = 0 + missing_source_identity_records = 0 + source_identities: set[str] = set() + received_values: list[int] = [] + for record in records: + received_ms = int_or_none(record.get("received_unix_ms")) + if received_ms is not None: + received_values.append(received_ms) + sequence = int_or_none(record.get("group_sequence")) + digest = record.get("blake3") + if sequence is None: + continue + source_identity = record_source_identity(record) + if source_identity: + source_identities.add(source_identity) + else: + missing_source_identity_records += 1 + if not isinstance(digest, str) or not digest.strip(): + missing_hash_records += 1 + continue + clean_digest = digest.strip() + hashes_by_sequence.setdefault(sequence, set()).add(clean_digest) + if source_identity: + source_hashes_by_sequence.setdefault(sequence, {}).setdefault(clean_digest, set()).add(source_identity) + duplicate_hash_source_records = sum( + max(0, len(source_identities_for_hash) - 1) + for hashes in source_hashes_by_sequence.values() + for source_identities_for_hash in hashes.values() + ) + duplicate_hash_sequences = sum( + 1 + for hashes in source_hashes_by_sequence.values() 
+ if any(len(source_identities_for_hash) > 1 for source_identities_for_hash in hashes.values()) + ) + hash_divergent_sequences = sum(1 for hashes in hashes_by_sequence.values() if len(hashes) > 1) + return { + "record_count": len(records), + "invalid_lines": invalid_lines, + "sequence_count": len(hashes_by_sequence), + "source_identity_count": len(source_identities), + "source_identities": sorted(source_identities), + "missing_source_identity_records": missing_source_identity_records, + "duplicate_hash_source_records": duplicate_hash_source_records, + "duplicate_hash_sequences": duplicate_hash_sequences, + "hash_divergent_sequences": hash_divergent_sequences, + "missing_hash_records": missing_hash_records, + "first_received_unix_ms": min(received_values) if received_values else None, + "latest_received_unix_ms": max(received_values) if received_values else None, + } + + +def first_hash_by_sequence(records: list[dict[str, Any]]) -> dict[int, str]: + out: dict[int, str] = {} + for record in records: + sequence = int_or_none(record.get("group_sequence")) + digest = record.get("blake3") + if sequence is None or not isinstance(digest, str) or not digest.strip(): + continue + out.setdefault(sequence, digest.strip()) + return out + + +def hash_sets_by_sequence(records: list[dict[str, Any]]) -> dict[int, set[str]]: + out: dict[int, set[str]] = {} + for record in records: + sequence = int_or_none(record.get("group_sequence")) + digest = record.get("blake3") + if sequence is None or not isinstance(digest, str) or not digest.strip(): + continue + out.setdefault(sequence, set()).add(digest.strip()) + return out + + +def compare_manifest_hashes(named_records: dict[str, list[dict[str, Any]]]) -> dict[str, Any]: + input_manifest_count = len(named_records) + missing_source_identity_records = 0 + source_records: dict[str, list[dict[str, Any]]] = {} + for manifest_name, records in named_records.items(): + for index, record in enumerate(records): + source_identity = 
record_source_identity(record) + if source_identity is None: + missing_source_identity_records += 1 + source_identity = f"manifest:{manifest_name}" + source_records.setdefault(source_identity, []).append(record) + names = sorted(source_records) + per_name = {name: hash_sets_by_sequence(records) for name, records in source_records.items()} + all_sequences = sorted(set().union(*(set(value) for value in per_name.values()))) if per_name else [] + shared_sequences = [ + sequence + for sequence in all_sequences + if all(sequence in per_name[name] for name in names) + ] + matching = 0 + divergent = 0 + examples: list[dict[str, Any]] = [] + for sequence in shared_sequences: + values = {name: per_name[name][sequence] for name in names} + flattened = [next(iter(digests)) for digests in values.values() if len(digests) == 1] + if len(flattened) == len(names) and len(set(flattened)) == 1: + matching += 1 + else: + divergent += 1 + if len(examples) < 5: + examples.append( + { + "sequence": sequence, + "hashes": { + name: sorted(digests) + for name, digests in values.items() + }, + } + ) + source_identity_ok = missing_source_identity_records == 0 and len(names) >= 2 + return { + "publisher_count": len(names), + "publishers": names, + "input_manifest_count": input_manifest_count, + "source_identity_count": len(names), + "source_identities": names, + "missing_source_identity_records": missing_source_identity_records, + "source_identity_ok": source_identity_ok, + "sequence_count": len(all_sequences), + "shared_sequence_count": len(shared_sequences), + "matching_sequence_count": matching, + "divergent_sequence_count": divergent, + "missing_sequence_count": max(0, len(all_sequences) - len(shared_sequences)), + "divergent_examples": examples, + "byte_for_byte_hash_match": bool( + source_identity_ok and shared_sequences and divergent == 0 and matching == len(shared_sequences) + ), + } + + +def prometheus_query_url(prometheus_url: str, expr: str) -> str: + return ( + 
prometheus_url.rstrip() + .rstrip("/") + + "/api/v1/query?" + + urllib.parse.urlencode({"query": expr}) + ) + + +def prometheus_metric_sum( + prometheus_url: str, + metric: str, + *, + broadcast: str, + timeout: float, + fetcher: Callable[[str, float, int], FetchResult] = fetch_text, +) -> dict[str, Any]: + selector = f'{metric}{{broadcast="{broadcast}"}}' + expr = f"sum({selector})" + fetched = fetcher(prometheus_query_url(prometheus_url, expr), timeout, 1024 * 1024) + if not fetched.ok: + return {"metric": metric, "ok": False, "value": None, "error": fetched.error} + try: + payload = json.loads(fetched.body) + result = payload.get("data", {}).get("result", []) + if not result: + return {"metric": metric, "ok": True, "value": None, "series_present": False} + raw_value = result[0].get("value", [None, None])[1] + value = float(raw_value) + except Exception as err: # noqa: BLE001 - preserve malformed Prometheus replies. + return {"metric": metric, "ok": False, "value": None, "error": f"invalid prometheus response: {err}"} + return {"metric": metric, "ok": True, "value": value, "series_present": True} + + +def agent_manifest_url(base_url: str, *, broadcast: str, track: str, role: str, max_bytes: int) -> str: + query = { + "broadcast": broadcast, + "track": track, + "max_bytes": str(max_bytes), + } + if role: + query["role"] = role + return base_url.rstrip("/") + "/v1/archive-manifest?" 
+ urllib.parse.urlencode(query) + + +def sample_publishers( + publisher_urls: dict[str, str], + *, + timeout: float, + fetcher: Callable[[str, float, int], FetchResult] = fetch_text, +) -> dict[str, Any]: + out: dict[str, Any] = {} + for name, base_url in publisher_urls.items(): + base = base_url.rstrip("/") + health = fetcher(f"{base}/health", timeout, 1024 * 1024) + metrics = fetcher(f"{base}/metrics", timeout, 2 * 1024 * 1024) + row: dict[str, Any] = { + "agent_url": base, + "health_ok": health.ok, + "metrics_ok": metrics.ok, + "health_error": health.error, + "metrics_error": metrics.error, + "duplicate_metrics_present": False, + "node_modes": [], + "unhealthy_processes": [], + } + if health.ok: + try: + payload = json.loads(health.body) + row["node_modes"] = payload.get("node_modes") if isinstance(payload.get("node_modes"), list) else [] + row["unhealthy_processes"] = ( + payload.get("unhealthy_processes") + if isinstance(payload.get("unhealthy_processes"), list) + else [] + ) + system = payload.get("system") if isinstance(payload.get("system"), dict) else {} + row["hostname"] = system.get("hostname") or payload.get("hostname") + except json.JSONDecodeError: + row["health_error"] = "invalid health json" + if metrics.ok: + row["duplicate_metrics_present"] = any(metric in metrics.body for metric in DUPLICATE_PROMETHEUS_METRICS) + row["metrics_bytes"] = len(metrics.body.encode("utf-8")) + out[name] = row + return out + + +def sample_once(args: argparse.Namespace) -> dict[str, Any]: + manifests: dict[str, str] = dict(parse_named_url(item) for item in args.manifest) + if not manifests and args.archive_origin and args.broadcast and args.track: + manifests["archive-origin"] = manifest_url(args.archive_origin, args.broadcast, args.track) + publisher_urls: dict[str, str] = dict(parse_named_url(item) for item in args.publisher) + agent_manifest_urls: dict[str, str] = dict(parse_named_url(item) for item in args.agent_manifest) + + fetched_records: dict[str, list[dict[str, 
Any]]] = {} + manifest_stats: dict[str, Any] = {} + for name, url in manifests.items(): + fetched = fetch_text(url, args.timeout, max_bytes=args.max_manifest_bytes) + if not fetched.ok: + manifest_stats[name] = {"url": url, "ok": False, "error": fetched.error} + continue + records, invalid_lines = parse_manifest_jsonl(fetched.body) + fetched_records[name] = records + manifest_stats[name] = { + "url": url, + "ok": True, + "fetch_elapsed_ms": fetched.elapsed_ms, + **manifest_hash_stats(records, invalid_lines), + } + + if agent_manifest_urls and args.broadcast and args.track: + for name, base_url in agent_manifest_urls.items(): + url = agent_manifest_url( + base_url, + broadcast=args.broadcast, + track=args.track, + role=args.agent_manifest_role, + max_bytes=args.max_manifest_bytes, + ) + fetched, payload = fetch_json(url, args.timeout, max_bytes=args.max_manifest_bytes + 1024 * 1024) + if not fetched.ok or not isinstance(payload, dict) or payload.get("ok") is not True: + manifest_stats[name] = { + "url": url, + "ok": False, + "source": "node-agent", + "error": fetched.error or (payload.get("error") if isinstance(payload, dict) else "invalid response"), + } + continue + records = payload.get("records") if isinstance(payload.get("records"), list) else [] + records = [record for record in records if isinstance(record, dict)] + fetched_records[name] = records + invalid_lines = int_or_none(payload.get("invalid_lines")) or 0 + stats = payload.get("stats") if isinstance(payload.get("stats"), dict) else {} + manifest_stats[name] = { + "url": url, + "ok": True, + "source": "node-agent", + "fetch_elapsed_ms": fetched.elapsed_ms, + "role": payload.get("role"), + "file_bytes": int_or_none(payload.get("file_bytes")), + "partial_scan": payload.get("partial_scan") is True, + **manifest_hash_stats(records, invalid_lines), + "node_agent_stats": stats, + } + + prometheus_metrics = [] + if args.prometheus_url and args.broadcast: + for metric in DUPLICATE_PROMETHEUS_METRICS: + 
prometheus_metrics.append( + prometheus_metric_sum(args.prometheus_url, metric, broadcast=args.broadcast, timeout=args.timeout) + ) + + return { + "sample_unix_ms": now_ms(), + "broadcast": args.broadcast, + "track": args.track, + "publishers": sample_publishers(publisher_urls, timeout=args.timeout) if publisher_urls else {}, + "manifests": manifest_stats, + "manifest_comparison": compare_manifest_hashes(fetched_records) if len(fetched_records) >= 2 else None, + "prometheus": prometheus_metrics, + } + + +def summarize(samples: list[dict[str, Any]]) -> dict[str, Any]: + if not samples: + return {"ok": False, "reasons": ["no_samples"]} + reasons: list[str] = [] + elapsed_ms = max(0, int(samples[-1]["sample_unix_ms"]) - int(samples[0]["sample_unix_ms"])) + if len(samples) < 2 or elapsed_ms <= 0: + reasons.append("insufficient_elapsed_samples") + publisher_rows = [ + row + for sample in samples + for row in (sample.get("publishers") or {}).values() + if isinstance(row, dict) + ] + if publisher_rows and not all(row.get("health_ok") is True for row in publisher_rows): + reasons.append("publisher_health_missing") + if publisher_rows and not any(row.get("metrics_ok") is True for row in publisher_rows): + reasons.append("publisher_metrics_missing") + if publisher_rows and not any(row.get("duplicate_metrics_present") is True for row in publisher_rows): + reasons.append("duplicate_metrics_not_deployed_to_publishers") + comparisons = [ + sample.get("manifest_comparison") + for sample in samples + if isinstance(sample.get("manifest_comparison"), dict) + ] + latest_comparison = comparisons[-1] if comparisons else None + if latest_comparison is None: + reasons.append("manifest_comparison_missing") + elif latest_comparison.get("source_identity_ok") is not True: + reasons.append("manifest_source_identity_missing_or_not_diverse") + elif latest_comparison.get("matching_sequence_count", 0) <= 0: + reasons.append("no_matching_duplicate_sequences") + elif 
latest_comparison.get("divergent_sequence_count", 0) > 0: + reasons.append("duplicate_hash_divergence_observed") + + manifest_rows = [ + row + for sample in samples + for row in (sample.get("manifests") or {}).values() + if isinstance(row, dict) + ] + if manifest_rows and any(row.get("ok") is not True for row in manifest_rows): + reasons.append("manifest_fetch_missing") + if manifest_rows and any(int_or_none(row.get("hash_divergent_sequences")) or 0 for row in manifest_rows): + reasons.append("manifest_hash_divergence_observed") + if manifest_rows and any(int_or_none(row.get("missing_hash_records")) or 0 for row in manifest_rows): + reasons.append("manifest_hash_missing_records") + if manifest_rows and any(int_or_none(row.get("missing_source_identity_records")) or 0 for row in manifest_rows): + reasons.append("manifest_source_identity_missing") + if manifest_rows and any(int_or_none(row.get("invalid_lines")) or 0 for row in manifest_rows): + reasons.append("manifest_invalid_lines") + + prom_rows = [ + row + for sample in samples + for row in (sample.get("prometheus") or []) + if isinstance(row, dict) + ] + prom_series = [row for row in prom_rows if row.get("series_present") is True] + if prom_rows and not prom_series: + reasons.append("prometheus_duplicate_series_missing") + divergent_values = [ + float(row.get("value") or 0) + for row in prom_series + if str(row.get("metric", "")).endswith("hash_divergent_sequences") + ] + if any(value > 0 for value in divergent_values): + reasons.append("prometheus_hash_divergence_nonzero") + missing_source_values = [ + float(row.get("value") or 0) + for row in prom_series + if str(row.get("metric", "")).endswith("missing_source_identity_records") + ] + if any(value > 0 for value in missing_source_values): + reasons.append("prometheus_source_identity_missing_nonzero") + return { + "ok": not reasons, + "elapsed_ms": elapsed_ms, + "sample_count": len(samples), + "reasons": reasons, + "latest_manifest_comparison": latest_comparison, 
+ "prometheus_series_present_count": len(prom_series), + "publisher_count": len(samples[-1].get("publishers") or {}), + } + + +def measure(args: argparse.Namespace) -> dict[str, Any]: + samples: list[dict[str, Any]] = [] + started = time.monotonic() + while True: + samples.append(sample_once(args)) + if args.duration_seconds <= 0: + break + if time.monotonic() - started >= args.duration_seconds: + break + time.sleep(args.poll_interval_seconds) + report = { + "started_unix_ms": samples[0]["sample_unix_ms"] if samples else now_ms(), + "duration_seconds": args.duration_seconds, + "samples": samples, + } + report["summary"] = summarize(samples) + return report + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--publisher", action="append", default=[], help="Named node-agent URL, NAME=http://IP:7799.") + parser.add_argument("--manifest", action="append", default=[], help="Named archive JSONL URL, NAME=https://...") + parser.add_argument( + "--agent-manifest", + action="append", + default=[], + help="Named node-agent URL to sample /v1/archive-manifest from, NAME=http://IP:7799.", + ) + parser.add_argument("--agent-manifest-role", default="publisher-buffer") + parser.add_argument("--archive-origin", default="", help="Archive origin root for manifests//.jsonl.") + parser.add_argument("--prometheus-url", default="", help="Prometheus base URL for Grafana-facing metrics.") + parser.add_argument("--broadcast", default="", help="Logical broadcast name to measure.") + parser.add_argument( + "--track", + default="publisher.m4s", + help="Track name to compare. 
Defaults to publisher-origin proof fragments, not relay video.", + ) + parser.add_argument("--duration-seconds", type=float, default=0.0) + parser.add_argument("--poll-interval-seconds", type=float, default=30.0) + parser.add_argument("--timeout", type=float, default=10.0) + parser.add_argument("--max-manifest-bytes", type=int, default=4 * 1024 * 1024) + parser.add_argument("--pretty", action="store_true") + parser.add_argument("--require-ok", action="store_true", help="Exit non-zero unless summary.ok is true.") + return parser + + +def main(argv: list[str] | None = None) -> int: + parser = build_parser() + args = parser.parse_args(argv) + try: + report = measure(args) + except Exception as err: # noqa: BLE001 - command-line tool should preserve exact failure. + print(json.dumps({"ok": False, "error": str(err)}, sort_keys=True), file=sys.stderr) + return 1 + if args.pretty: + print(json.dumps(report, indent=2, sort_keys=True)) + else: + print(json.dumps(report, sort_keys=True)) + if args.require_ok and not report.get("summary", {}).get("ok"): + return 2 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main())