Add duplicate publisher determinism proof
This commit is contained in:
parent
5d0f3077d3
commit
91dad67fc2
18 changed files with 21569 additions and 595 deletions
|
|
@ -109,6 +109,35 @@ jobs:
|
|||
fi
|
||||
cargo test -p ec-core -p ec-crypto -p ec-moq -p ec-iroh -p ec-linux-iptv
|
||||
|
||||
# Runs the ec-node tests whose names match `publisher_proof` and
# `archive_convergence`, after sourcing the cargo env file when it exists.
# NOTE(review): `cargo test <filter>` also succeeds when the filter matches
# zero tests, so a renamed test would make this gate pass vacuously - confirm
# both filters still select at least one test.
- name: Duplicate publisher proof gates
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
cd .repo
|
||||
if [[ -f "$HOME/.cargo/env" ]]; then
|
||||
. "$HOME/.cargo/env"
|
||||
fi
|
||||
cargo test -p ec-node publisher_proof
|
||||
cargo test -p ec-node archive_convergence
|
||||
|
||||
# Runs the ec-node tests matching `sim_system_`, then one `sim-system` run
# with the `foundationdb` fault profile, seed 1 and 1024 iterations.
# The run's stdout is redirected to /tmp/ec-sim-system-foundationdb.json and
# the failure artifact path is /tmp/ec-sim-system-foundationdb-failure.json.
- name: Distributed simulation gates
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
cd .repo
|
||||
if [[ -f "$HOME/.cargo/env" ]]; then
|
||||
. "$HOME/.cargo/env"
|
||||
fi
|
||||
cargo test -p ec-node sim_system_
|
||||
cargo run -p ec-node -- sim-system \
|
||||
--fault-profile foundationdb \
|
||||
--seed 1 \
|
||||
--iterations 1024 \
|
||||
--max-system-complete-ms 6000 \
|
||||
--failure-artifact /tmp/ec-sim-system-foundationdb-failure.json \
|
||||
--pretty \
|
||||
> /tmp/ec-sim-system-foundationdb.json
|
||||
|
||||
- name: Build web (apps/web)
|
||||
shell: bash
|
||||
run: |
|
||||
|
|
|
|||
291
Cargo.lock
generated
291
Cargo.lock
generated
|
|
@ -1038,15 +1038,6 @@ dependencies = [
|
|||
"alloc-stdlib",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "buf-list"
|
||||
version = "1.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a6b175f9cf8fffedd4c4b18bcfef092356e952b81f596e148f18e98280994593"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bumpalo"
|
||||
version = "3.19.1"
|
||||
|
|
@ -1375,6 +1366,15 @@ dependencies = [
|
|||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "conducer"
|
||||
version = "0.3.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8d2cb64e61144d6960a830d3e6f2ba3a61d5c0ca689e87e11dc9effb96dcfff5"
|
||||
dependencies = [
|
||||
"smallvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "const-hex"
|
||||
version = "1.18.1"
|
||||
|
|
@ -2251,9 +2251,12 @@ dependencies = [
|
|||
"hex",
|
||||
"iroh",
|
||||
"just-webrtc",
|
||||
"moq-lite 0.14.0",
|
||||
"moq-lite 0.16.0",
|
||||
"moq-mux",
|
||||
"moq-native",
|
||||
"opentelemetry",
|
||||
"opentelemetry-otlp",
|
||||
"opentelemetry_sdk",
|
||||
"quinn",
|
||||
"reqwest",
|
||||
"rustls",
|
||||
|
|
@ -2261,9 +2264,10 @@ dependencies = [
|
|||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
"tokio-tungstenite",
|
||||
"tokio-tungstenite 0.24.0",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
"urlencoding",
|
||||
|
|
@ -3264,24 +3268,20 @@ checksum = "253b313319f7109de64e480ffb606f89475cd758bae82e096e00c5d95341d30e"
|
|||
|
||||
[[package]]
|
||||
name = "hang"
|
||||
version = "0.14.0"
|
||||
version = "0.16.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f548f7cdc8ec3b9eae085f7b61ff9603d6dc9f09192c5f4b0db4c02577786070"
|
||||
checksum = "59435f843c8a41ac499ce68828d16c575438e34ffa85b1ea46ba2529bb2a5b16"
|
||||
dependencies = [
|
||||
"buf-list",
|
||||
"bytes",
|
||||
"derive_more 2.1.1",
|
||||
"futures",
|
||||
"hex",
|
||||
"lazy_static",
|
||||
"moq-lite 0.14.0",
|
||||
"moq-lite 0.16.0",
|
||||
"regex",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"thiserror 2.0.18",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"url",
|
||||
]
|
||||
|
||||
|
|
@ -4494,9 +4494,9 @@ checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
|
|||
|
||||
[[package]]
|
||||
name = "m3u8-rs"
|
||||
version = "5.0.5"
|
||||
version = "6.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c1d7ba86f7ea62f17f4310c55e93244619ddc7dadfc7e565de1967e4e41e6e7"
|
||||
checksum = "f03cd3335fb5f2447755d45cda9c70f76013626a9db44374973791b0926a86c3"
|
||||
dependencies = [
|
||||
"chrono",
|
||||
"nom",
|
||||
|
|
@ -4707,14 +4707,13 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "moq-lite"
|
||||
version = "0.14.0"
|
||||
version = "0.16.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c8a4c4e66081bc21067488da13f4131540b38b1cb79fb5176ef4ddacd104786b"
|
||||
checksum = "15b02845fa5cef29b516e0ed60dc95f5904502bf001a8a2790d543fae6571a94"
|
||||
dependencies = [
|
||||
"async-channel",
|
||||
"bytes",
|
||||
"conducer",
|
||||
"futures",
|
||||
"hex",
|
||||
"num_enum",
|
||||
"rand 0.9.2",
|
||||
"serde",
|
||||
|
|
@ -4726,24 +4725,38 @@ dependencies = [
|
|||
]
|
||||
|
||||
[[package]]
|
||||
name = "moq-mux"
|
||||
version = "0.2.1"
|
||||
name = "moq-msf"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "73e2570aa39feef3aa00fa0990862dcdfb44937d3eb9c448c3a4eb1fb8ff43d3"
|
||||
checksum = "2d61b0d5ce8285c75ed59343934aae278c4c49b1dedf41f1356939b40fab4d29"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "moq-mux"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9fd5f397f0d147ca8920434a74f092e0846ce23bb1cb5411253123913a3e7576"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"buf-list",
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"conducer",
|
||||
"derive_more 2.1.1",
|
||||
"h264-parser",
|
||||
"hang",
|
||||
"m3u8-rs",
|
||||
"moq-lite 0.14.0",
|
||||
"moq-lite 0.16.0",
|
||||
"moq-msf",
|
||||
"mp4-atom",
|
||||
"num_enum",
|
||||
"reqwest",
|
||||
"scuffle-av1",
|
||||
"scuffle-h265",
|
||||
"thiserror 2.0.18",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"url",
|
||||
|
|
@ -4751,9 +4764,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "moq-native"
|
||||
version = "0.13.1"
|
||||
version = "0.14.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9848c21bf5db3f8ff5e5a7d89bf2c567f0eb526390c26d5f66f3fec99a6751a5"
|
||||
checksum = "6942bac34d380bbab511e10069bc0f9615f20109807dab01b52d45e0812dc571"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
|
|
@ -4761,8 +4774,9 @@ dependencies = [
|
|||
"hex",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"moq-lite 0.14.0",
|
||||
"moq-lite 0.16.0",
|
||||
"parking_lot",
|
||||
"qmux",
|
||||
"quinn",
|
||||
"rand 0.9.2",
|
||||
"rcgen 0.14.7",
|
||||
|
|
@ -4779,7 +4793,6 @@ dependencies = [
|
|||
"tracing-subscriber",
|
||||
"url",
|
||||
"web-transport-quinn",
|
||||
"web-transport-ws",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -5519,6 +5532,78 @@ version = "0.2.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry"
|
||||
version = "0.31.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b84bcd6ae87133e903af7ef497404dda70c60d0ea14895fc8a5e6722754fc2a0"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
"js-sys",
|
||||
"pin-project-lite",
|
||||
"thiserror 2.0.18",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-http"
|
||||
version = "0.31.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d7a6d09a73194e6b66df7c8f1b680f156d916a1a942abf2de06823dd02b7855d"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"http",
|
||||
"opentelemetry",
|
||||
"reqwest",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-otlp"
|
||||
version = "0.31.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1f69cd6acbb9af919df949cd1ec9e5e7fdc2ef15d234b6b795aaa525cc02f71f"
|
||||
dependencies = [
|
||||
"flate2",
|
||||
"http",
|
||||
"opentelemetry",
|
||||
"opentelemetry-http",
|
||||
"opentelemetry-proto",
|
||||
"opentelemetry_sdk",
|
||||
"prost",
|
||||
"reqwest",
|
||||
"thiserror 2.0.18",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-proto"
|
||||
version = "0.31.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a7175df06de5eaee9909d4805a3d07e28bb752c34cab57fa9cff549da596b30f"
|
||||
dependencies = [
|
||||
"opentelemetry",
|
||||
"opentelemetry_sdk",
|
||||
"prost",
|
||||
"tonic",
|
||||
"tonic-prost",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry_sdk"
|
||||
version = "0.31.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e14ae4f5991976fd48df6d843de219ca6d31b01daaab2dad5af2badeded372bd"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-executor",
|
||||
"futures-util",
|
||||
"opentelemetry",
|
||||
"percent-encoding",
|
||||
"rand 0.9.2",
|
||||
"thiserror 2.0.18",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "option-ext"
|
||||
version = "0.2.0"
|
||||
|
|
@ -6206,6 +6291,47 @@ dependencies = [
|
|||
"unarray",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost"
|
||||
version = "0.14.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"prost-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost-derive"
|
||||
version = "0.14.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"itertools 0.13.0",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.114",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "qmux"
|
||||
version = "0.0.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32a87859012c43a1e38dda29f2464e0ee39b0e96d0f95f870a73610bc6f2c3c2"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures",
|
||||
"rustls",
|
||||
"thiserror 2.0.18",
|
||||
"tokio",
|
||||
"tokio-rustls",
|
||||
"tokio-tungstenite 0.28.0",
|
||||
"tracing",
|
||||
"web-transport-proto 0.6.0",
|
||||
"web-transport-trait",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quick-error"
|
||||
version = "1.2.3"
|
||||
|
|
@ -8315,6 +8441,22 @@ dependencies = [
|
|||
"webpki-roots 0.26.11",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-tungstenite"
|
||||
version = "0.28.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d25a406cddcc431a75d3d9afc6a7c0f7428d4891dd973e4d54c56b46127bf857"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"log",
|
||||
"rustls",
|
||||
"rustls-native-certs",
|
||||
"rustls-pki-types",
|
||||
"tokio",
|
||||
"tokio-rustls",
|
||||
"tungstenite 0.28.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-util"
|
||||
version = "0.7.18"
|
||||
|
|
@ -8447,6 +8589,38 @@ version = "1.0.6+spec-1.1.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ab16f14aed21ee8bfd8ec22513f7287cd4a91aa92e44edfe2c17ddd004e92607"
|
||||
|
||||
[[package]]
|
||||
name = "tonic"
|
||||
version = "0.14.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ac2a5518c70fa84342385732db33fb3f44bc4cc748936eb5833d2df34d6445ef"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"http",
|
||||
"http-body",
|
||||
"http-body-util",
|
||||
"percent-encoding",
|
||||
"pin-project",
|
||||
"sync_wrapper",
|
||||
"tokio-stream",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tonic-prost"
|
||||
version = "0.14.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "50849f68853be452acf590cde0b146665b8d507b3b8af17261df47e02c209ea0"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"prost",
|
||||
"tonic",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tower"
|
||||
version = "0.5.3"
|
||||
|
|
@ -8567,6 +8741,22 @@ dependencies = [
|
|||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-opentelemetry"
|
||||
version = "0.32.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1ac28f2d093c6c477eaa76b23525478f38de514fa9aeb1285738d4b97a9552fc"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"opentelemetry",
|
||||
"smallvec",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"tracing-log",
|
||||
"tracing-subscriber",
|
||||
"web-time",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-subscriber"
|
||||
version = "0.3.22"
|
||||
|
|
@ -8656,6 +8846,8 @@ dependencies = [
|
|||
"httparse",
|
||||
"log",
|
||||
"rand 0.9.2",
|
||||
"rustls",
|
||||
"rustls-pki-types",
|
||||
"sha1",
|
||||
"thiserror 2.0.18",
|
||||
"utf-8",
|
||||
|
|
@ -9136,9 +9328,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "web-async"
|
||||
version = "0.1.1"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f6b2260b739b0e95cf9b78f22a64704af7ed9760ea12baa3745b4b97899dc89a"
|
||||
checksum = "f5414b65d9a5094649bb99987bb74db71febfdfa3677b7954a0a05c99d0424e8"
|
||||
dependencies = [
|
||||
"tokio",
|
||||
"tracing",
|
||||
|
|
@ -9198,7 +9390,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "web-transport-proto"
|
||||
version = "0.5.2"
|
||||
version = "0.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0225d295c8ac00a2e9a498aefeaf3f3c6186da12a251c938189b15b82ea22808"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"http",
|
||||
|
|
@ -9210,9 +9404,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "web-transport-quinn"
|
||||
version = "0.11.4"
|
||||
version = "0.11.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "96b195557749e84091d7b912a25e190e9606283b5121d041faf538b0b55f40d7"
|
||||
checksum = "cac11b6caf163be7f980442a26fcba15e8074a5f22e85fbb71f0f77d11cecf60"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures",
|
||||
|
|
@ -9224,34 +9418,19 @@ dependencies = [
|
|||
"tokio",
|
||||
"tracing",
|
||||
"url",
|
||||
"web-transport-proto 0.5.2",
|
||||
"web-transport-proto 0.6.0",
|
||||
"web-transport-trait",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "web-transport-trait"
|
||||
version = "0.3.3"
|
||||
version = "0.3.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "802d6aa508f2c63c9050ceabc17265bbf90ed4d6f4e4357e987583883628e79c"
|
||||
checksum = "cb67841c4a481ca3c1412ee4c9f463987401991e1ddc000903df2124f3dc85e9"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "web-transport-ws"
|
||||
version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b7b1cd89c36a28eae759329839e85f7dbca733896f048a6daaf5f8fc80f3bcba"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures",
|
||||
"thiserror 2.0.18",
|
||||
"tokio",
|
||||
"tokio-tungstenite",
|
||||
"web-transport-proto 0.5.2",
|
||||
"web-transport-trait",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "webkit2gtk"
|
||||
version = "2.0.1"
|
||||
|
|
|
|||
11
Cargo.toml
11
Cargo.toml
|
|
@ -33,12 +33,9 @@ blake3 = "1"
|
|||
clap = { version = "4", features = ["derive"] }
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
opentelemetry = { version = "0.31", features = ["trace"] }
|
||||
opentelemetry-otlp = { version = "0.31", default-features = false, features = ["http-proto", "reqwest-client", "trace", "gzip-http"] }
|
||||
opentelemetry_sdk = { version = "0.31", features = ["trace"] }
|
||||
tracing = "0.1"
|
||||
tracing-opentelemetry = "0.32"
|
||||
tracing-subscriber = "0.3"
|
||||
|
||||
[patch.crates-io]
|
||||
# Cloudflare's relay uses standard WebTransport subprotocol negotiation. The upstream
|
||||
# `web-transport-proto` crate (used by `web-transport-quinn`) currently uses legacy
|
||||
# header names (`wt-available-protocols` / `wt-protocol`), which prevents negotiating
|
||||
# `moqt-*` and causes the relay to close after MoQ SETUP.
|
||||
web-transport-proto = { path = "third_party/web-transport-proto" }
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ use ec_core::{
|
|||
};
|
||||
use ec_ts::{SectionAssembler, TimeSyncEngine, TimeSyncUpdate, TsReader};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::VecDeque;
|
||||
use std::fs;
|
||||
use std::io::{Read, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
|
@ -299,12 +300,55 @@ pub fn chunk_ts_stream<T: Read>(
|
|||
})
|
||||
}
|
||||
|
||||
/// Chunks `stream` into files under `output_dir` and returns a manifest of
/// every chunk that was emitted.
///
/// All arguments, including `preroll_packets`, are forwarded unchanged to
/// `chunk_ts_stream_live_with_preroll`; the callback passed to it only
/// collects each emitted chunk. Any error from that call is propagated.
pub fn chunk_ts_stream_with_preroll<T: Read>(
|
||||
stream: T,
|
||||
output_dir: &Path,
|
||||
chunk_duration_ms: u64,
|
||||
max_chunks: Option<usize>,
|
||||
preroll_packets: usize,
|
||||
) -> Result<TsChunkManifest> {
|
||||
let mut chunks = Vec::new();
|
||||
chunk_ts_stream_live_with_preroll(
|
||||
stream,
|
||||
output_dir,
|
||||
chunk_duration_ms,
|
||||
max_chunks,
|
||||
preroll_packets,
|
||||
// Collect chunks in emission order; this callback itself never fails.
|
chunk| {
|
||||
chunks.push(chunk);
|
||||
Ok(())
|
||||
},
|
||||
)?;
|
||||
Ok(TsChunkManifest {
|
||||
output_dir: output_dir.to_path_buf(),
|
||||
chunks,
|
||||
})
|
||||
}
|
||||
|
||||
/// Chunks `stream` and invokes `on_chunk` for each emitted chunk, without
/// any preroll.
///
/// Thin wrapper over `chunk_ts_stream_live_with_preroll` that passes `0` as
/// the preroll packet count and returns that call's result.
pub fn chunk_ts_stream_live<T: Read, F: FnMut(TsChunk) -> Result<()>>(
|
||||
stream: T,
|
||||
output_dir: &Path,
|
||||
chunk_duration_ms: u64,
|
||||
max_chunks: Option<usize>,
|
||||
mut on_chunk: F,
|
||||
) -> Result<()> {
|
||||
chunk_ts_stream_live_with_preroll(
|
||||
stream,
|
||||
output_dir,
|
||||
chunk_duration_ms,
|
||||
max_chunks,
|
||||
// Zero preroll packets.
0,
|
||||
|chunk| on_chunk(chunk),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn chunk_ts_stream_live_with_preroll<T: Read, F: FnMut(TsChunk) -> Result<()>>(
|
||||
stream: T,
|
||||
output_dir: &Path,
|
||||
chunk_duration_ms: u64,
|
||||
max_chunks: Option<usize>,
|
||||
preroll_packets: usize,
|
||||
mut on_chunk: F,
|
||||
) -> Result<()> {
|
||||
fs::create_dir_all(output_dir)
|
||||
.with_context(|| format!("failed to create {}", output_dir.display()))?;
|
||||
|
|
@ -317,6 +361,7 @@ pub fn chunk_ts_stream_live<T: Read, F: FnMut(TsChunk) -> Result<()>>(
|
|||
let mut current_file: Option<std::fs::File> = None;
|
||||
let mut current_timing: Option<ChunkTiming> = None;
|
||||
let mut emitted = 0usize;
|
||||
let mut preroll = VecDeque::<[u8; ec_ts::TS_PACKET_SIZE]>::with_capacity(preroll_packets);
|
||||
|
||||
let mut close_and_emit =
|
||||
|index: u64, timing: ChunkTiming, file: std::fs::File| -> Result<bool> {
|
||||
|
|
@ -332,6 +377,7 @@ pub fn chunk_ts_stream_live<T: Read, F: FnMut(TsChunk) -> Result<()>>(
|
|||
};
|
||||
|
||||
while let Some(packet) = reader.read_packet()? {
|
||||
let packet_bytes = *packet.as_bytes();
|
||||
let updates = engine.ingest_packet(&packet, &mut assembler);
|
||||
for update in updates {
|
||||
if update.discontinuity {
|
||||
|
|
@ -344,6 +390,7 @@ pub fn chunk_ts_stream_live<T: Read, F: FnMut(TsChunk) -> Result<()>>(
|
|||
return Ok(());
|
||||
}
|
||||
}
|
||||
preroll.clear();
|
||||
}
|
||||
|
||||
if let Some(index) = update.chunk_index {
|
||||
|
|
@ -359,8 +406,11 @@ pub fn chunk_ts_stream_live<T: Read, F: FnMut(TsChunk) -> Result<()>>(
|
|||
}
|
||||
|
||||
let path = chunk_path(output_dir, index);
|
||||
let file = std::fs::File::create(&path)
|
||||
let mut file = std::fs::File::create(&path)
|
||||
.with_context(|| format!("failed to create {}", path.display()))?;
|
||||
for bytes in &preroll {
|
||||
file.write_all(bytes)?;
|
||||
}
|
||||
current_file = Some(file);
|
||||
current_index = Some(index);
|
||||
current_timing = Some(ChunkTiming {
|
||||
|
|
@ -381,6 +431,13 @@ pub fn chunk_ts_stream_live<T: Read, F: FnMut(TsChunk) -> Result<()>>(
|
|||
if let Some(file) = current_file.as_mut() {
|
||||
file.write_all(packet.as_bytes())?;
|
||||
}
|
||||
|
||||
if preroll_packets > 0 {
|
||||
preroll.push_back(packet_bytes);
|
||||
while preroll.len() > preroll_packets {
|
||||
preroll.pop_front();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let (Some(index), Some(timing), Some(file)) = (
|
||||
|
|
@ -388,7 +445,7 @@ pub fn chunk_ts_stream_live<T: Read, F: FnMut(TsChunk) -> Result<()>>(
|
|||
current_timing.take(),
|
||||
current_file.take(),
|
||||
) {
|
||||
let _ = close_and_emit(index, timing, file);
|
||||
close_and_emit(index, timing, file)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
|
@ -929,6 +986,43 @@ mod tests {
|
|||
let _ = fs::remove_dir_all(&dir);
|
||||
}
|
||||
|
||||
// With a preroll of one packet, every chunk after the first must begin with
// the packet that immediately preceded it in the stream.
#[test]
|
||||
fn chunk_ts_stream_with_preroll_prepends_previous_packets() {
|
||||
let chunk_ms = 1000u64;
|
||||
let dir =
|
||||
std::env::temp_dir().join(format!("ec-chopper-chunks-preroll-{}", std::process::id()));
|
||||
let _ = fs::remove_dir_all(&dir);
|
||||
fs::create_dir_all(&dir).unwrap();
|
||||
|
||||
// NOTE(review): the last argument presumably is a 27 MHz PCR value, so the
// packets would be one second apart and land in separate 1000 ms chunks -
// confirm against `ts_packet_with_pcr`.
let packet0 = ts_packet_with_pcr(0x0100, 0, 0);
|
||||
let packet1 = ts_packet_with_pcr(0x0100, 1, 27_000_000);
|
||||
let packet2 = ts_packet_with_pcr(0x0100, 2, 54_000_000);
|
||||
let mut bytes = Vec::new();
|
||||
bytes.extend_from_slice(&packet0);
|
||||
bytes.extend_from_slice(&packet1);
|
||||
bytes.extend_from_slice(&packet2);
|
||||
|
||||
let manifest =
|
||||
chunk_ts_stream_with_preroll(Cursor::new(bytes), &dir, chunk_ms, None, 1).unwrap();
|
||||
let indices = manifest.chunks.iter().map(|c| c.index).collect::<Vec<_>>();
|
||||
assert_eq!(indices, vec![0, 1, 2]);
|
||||
|
||||
// The first chunk has no earlier packet, so it holds only packet0.
assert_eq!(
|
||||
fs::read(&manifest.chunks[0].path).unwrap(),
|
||||
packet0.to_vec()
|
||||
);
|
||||
// Each later chunk is the previous packet followed by its own packet.
assert_eq!(
|
||||
fs::read(&manifest.chunks[1].path).unwrap(),
|
||||
[packet0, packet1].concat()
|
||||
);
|
||||
assert_eq!(
|
||||
fs::read(&manifest.chunks[2].path).unwrap(),
|
||||
[packet1, packet2].concat()
|
||||
);
|
||||
|
||||
let _ = fs::remove_dir_all(&dir);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn hashed_manifest_merkle_root_matches_core() {
|
||||
let dir = std::env::temp_dir().join(format!("ec-chopper-merkle-{}", std::process::id()));
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
2937
crates/ec-core/src/sim.rs
Normal file
2937
crates/ec-core/src/sim.rs
Normal file
File diff suppressed because it is too large
Load diff
986
crates/ec-core/tests/simulation.rs
Normal file
986
crates/ec-core/tests/simulation.rs
Normal file
|
|
@ -0,0 +1,986 @@
|
|||
use ec_core::sim::{
|
||||
check_control_plane_propagation_invariants, check_duplicate_publisher_invariants,
|
||||
check_system_duplicate_publisher_invariants, run_control_plane_propagation_campaign,
|
||||
run_control_plane_propagation_simulation, run_duplicate_publisher_campaign,
|
||||
run_duplicate_publisher_simulation, run_seeded_simulation_campaign,
|
||||
run_system_duplicate_publisher_campaign, run_system_duplicate_publisher_simulation,
|
||||
shrink_duplicate_publisher_failure, simulated_media_hash,
|
||||
ControlPlanePropagationInvariantConfig, ControlPlanePropagationScenario,
|
||||
ControlPlaneTraceEvent, DeterministicSimulation, DuplicatePublisherInvariantConfig,
|
||||
DuplicatePublisherScenario, EncoderDriftFault, FoundationStyleSystemScenarioConfig,
|
||||
PublisherSequenceClock, SimulationOutage, SimulationPartition, SimulationSeed,
|
||||
SystemDuplicatePublisherInvariantConfig, SystemDuplicatePublisherScenario,
|
||||
};
|
||||
|
||||
// Stream, rendition, track and encoder-profile identifiers shared by the
// scenarios in this test file.
const STREAM: &str = "la-kcop";
|
||||
const RENDITION: &str = "720p";
|
||||
const TRACK: &str = "0.m4s";
|
||||
const PROFILE: &str = "x264-hd3-v1";
|
||||
|
||||
/// Schedules one observation per sequence in `start_sequence..end_sequence`
/// for publisher `node` on the shared STREAM / RENDITION / TRACK.
///
/// The observation for `start_sequence` is scheduled at `first_delivery_ms`
/// and each following sequence `step_ms` later. The hash of every
/// observation comes from `simulated_media_hash` using `profile`.
fn schedule_publisher_window(
|
||||
sim: &mut DeterministicSimulation,
|
||||
node: &str,
|
||||
start_sequence: u64,
|
||||
end_sequence: u64,
|
||||
first_delivery_ms: u64,
|
||||
step_ms: u64,
|
||||
profile: &str,
|
||||
) {
|
||||
for sequence in start_sequence..end_sequence {
|
||||
let hash = simulated_media_hash(STREAM, RENDITION, TRACK, sequence, profile);
|
||||
sim.schedule_observation(
|
||||
// Delivery time grows linearly with the offset into the window.
first_delivery_ms + (sequence - start_sequence) * step_ms,
|
||||
node,
|
||||
STREAM,
|
||||
RENDITION,
|
||||
TRACK,
|
||||
sequence,
|
||||
&hash,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Two publishers cover sequences 0..12, but nuc-b's sequences 4..12 are only
// delivered from 500 ms onward. The summary must already be ok at 250 ms
// (nothing missing, duplicates for 0..=3) and fully duplicated once idle.
#[test]
|
||||
fn duplicate_publishers_converge_after_delayed_backfill() {
|
||||
let mut sim = DeterministicSimulation::new();
|
||||
|
||||
schedule_publisher_window(&mut sim, "nuc-a", 0, 12, 0, 10, PROFILE);
|
||||
schedule_publisher_window(&mut sim, "nuc-b", 0, 4, 30, 10, PROFILE);
|
||||
schedule_publisher_window(&mut sim, "nuc-b", 4, 12, 500, 10, PROFILE);
|
||||
|
||||
// Stop before nuc-b's delayed window, which starts at 500 ms.
sim.run_until(250);
|
||||
let before_backfill = sim.convergence().summarize(STREAM, RENDITION, TRACK, 0, 12);
|
||||
assert_eq!(before_backfill.expected_sequences, 12);
|
||||
assert_eq!(before_backfill.missing_sequences, Vec::<u64>::new());
|
||||
assert_eq!(
|
||||
before_backfill.matching_duplicate_sequences,
|
||||
vec![0, 1, 2, 3]
|
||||
);
|
||||
assert!(before_backfill.ok());
|
||||
|
||||
sim.run_to_idle();
|
||||
let after_backfill = sim.convergence().summarize(STREAM, RENDITION, TRACK, 0, 12);
|
||||
let duplicate_complete_at_ms = sim
|
||||
.convergence()
|
||||
.duplicate_complete_at_ms(STREAM, RENDITION, TRACK, 0, 12);
|
||||
assert_eq!(after_backfill.missing_sequences, Vec::<u64>::new());
|
||||
assert_eq!(after_backfill.divergent_sequences, Vec::<u64>::new());
|
||||
assert_eq!(
|
||||
after_backfill.matching_duplicate_sequences,
|
||||
(0_u64..12).collect::<Vec<_>>()
|
||||
);
|
||||
// 12 sequences from each of the two publishers.
assert_eq!(after_backfill.duplicate_source_records, 24);
|
||||
// nuc-b's last delayed observation (sequence 11) is scheduled at 500 + 7 * 10 ms.
assert_eq!(duplicate_complete_at_ms, Some(570));
|
||||
assert_eq!(sim.trace().len(), 24);
|
||||
assert!(
|
||||
sim.trace()
|
||||
.windows(2)
|
||||
.all(|pair| (pair[0].at_ms, pair[0].order) <= (pair[1].at_ms, pair[1].order)),
|
||||
"trace should preserve deterministic event order"
|
||||
);
|
||||
assert!(after_backfill.ok());
|
||||
}
|
||||
|
||||
// Only two widely spaced sequence numbers are observed. `summarize` over the
// enclosing range reports missing sequences, while
// `summarize_observed_sequences` over the same range expects just those two.
#[test]
|
||||
fn media_convergence_can_summarize_sparse_observed_sequences() {
|
||||
let mut sim = DeterministicSimulation::new();
|
||||
for sequence in [7_287_381_184_512, 7_287_381_188_608] {
|
||||
let hash = simulated_media_hash(STREAM, RENDITION, TRACK, sequence, PROFILE);
|
||||
sim.schedule_observation(0, "nuc-a", STREAM, RENDITION, TRACK, sequence, &hash);
|
||||
sim.schedule_observation(1, "nuc-b", STREAM, RENDITION, TRACK, sequence, &hash);
|
||||
}
|
||||
|
||||
sim.run_to_idle();
|
||||
let dense = sim.convergence().summarize(
|
||||
STREAM,
|
||||
RENDITION,
|
||||
TRACK,
|
||||
7_287_381_184_512,
|
||||
7_287_381_188_609,
|
||||
);
|
||||
let sparse = sim.convergence().summarize_observed_sequences(
|
||||
STREAM,
|
||||
RENDITION,
|
||||
TRACK,
|
||||
7_287_381_184_512,
|
||||
7_287_381_188_609,
|
||||
);
|
||||
|
||||
assert!(!dense.missing_sequences.is_empty());
|
||||
assert_eq!(sparse.expected_sequences, 2);
|
||||
assert_eq!(sparse.missing_sequences, Vec::<u64>::new());
|
||||
assert_eq!(
|
||||
sparse.matching_duplicate_sequences,
|
||||
vec![7_287_381_184_512, 7_287_381_188_608]
|
||||
);
|
||||
assert!(sparse.ok());
|
||||
}
|
||||
|
||||
// Both publishers deliver sequences 0..8 with the shared profile, then nuc-b
// reports sequence 4 again with a hash from a different profile. That
// sequence must be flagged as divergent and the summary must not be ok.
#[test]
|
||||
fn duplicate_publisher_simulation_detects_encoder_drift() {
|
||||
let mut sim = DeterministicSimulation::new();
|
||||
|
||||
schedule_publisher_window(&mut sim, "nuc-a", 0, 8, 0, 10, PROFILE);
|
||||
schedule_publisher_window(&mut sim, "nuc-b", 0, 8, 5, 10, PROFILE);
|
||||
|
||||
// Extra observation for sequence 4 hashed with a different profile string.
let drift_hash = simulated_media_hash(STREAM, RENDITION, TRACK, 4, "x264-hd3-drift");
|
||||
sim.schedule_observation(90, "nuc-b", STREAM, RENDITION, TRACK, 4, &drift_hash);
|
||||
|
||||
sim.run_to_idle();
|
||||
let summary = sim.convergence().summarize(STREAM, RENDITION, TRACK, 0, 8);
|
||||
|
||||
assert_eq!(summary.missing_sequences, Vec::<u64>::new());
|
||||
assert_eq!(summary.divergent_sequences, vec![4]);
|
||||
assert!(!summary.ok());
|
||||
}
|
||||
|
||||
// Running the same seeded scenario twice must yield equal reports and equal
// traces.
// NOTE(review): the expected count of 48 presumably follows from the sequence
// range set up by `faulted_duplicate_scenario`, which is not visible here.
#[test]
|
||||
fn duplicate_publisher_fault_schedule_replays_from_seed() {
|
||||
let scenario = faulted_duplicate_scenario(SimulationSeed::new(0x6d6f_712d_6475_7021));
|
||||
|
||||
let first = run_duplicate_publisher_simulation(&scenario);
|
||||
let second = run_duplicate_publisher_simulation(&scenario);
|
||||
|
||||
assert_eq!(first, second);
|
||||
assert!(first.duplicate_complete(), "replay {}", first.replay_hint);
|
||||
assert_eq!(first.summary.matching_duplicate_sequences.len(), 48);
|
||||
assert_eq!(
|
||||
first.trace, second.trace,
|
||||
"replayed reports should carry the same event history"
|
||||
);
|
||||
}
|
||||
|
||||
// Runs the faulted scenario for seeds 1..=96 and requires every run to be
// duplicate-complete with nothing missing or divergent. It also requires that
// transient drops, partition delays and publisher outages each occurred in at
// least one run.
#[test]
|
||||
fn duplicate_publisher_many_seed_fault_schedules_converge() {
|
||||
let mut saw_transient_drop = false;
|
||||
let mut saw_partition_delay = false;
|
||||
let mut saw_publisher_outage = false;
|
||||
|
||||
for seed in 1..=96 {
|
||||
let scenario = faulted_duplicate_scenario(SimulationSeed::new(seed));
|
||||
let report = run_duplicate_publisher_simulation(&scenario);
|
||||
|
||||
// Record which fault kinds were exercised by any seed so far.
saw_transient_drop |= report.fault_stats.transient_dropped_observations > 0;
|
||||
saw_partition_delay |= report.fault_stats.partition_delayed_observations > 0;
|
||||
saw_publisher_outage |= report.fault_stats.publisher_outage_observations > 0;
|
||||
|
||||
assert!(
|
||||
report.duplicate_complete(),
|
||||
"duplicate publisher convergence failed for {}: {:?}",
|
||||
report.replay_hint,
|
||||
report.summary
|
||||
);
|
||||
assert_eq!(report.summary.missing_sequences, Vec::<u64>::new());
|
||||
assert_eq!(report.summary.divergent_sequences, Vec::<u64>::new());
|
||||
// NOTE(review): 96 is the expected record count per run, not the seed count;
// it presumably follows from the scenario built by `faulted_duplicate_scenario`.
assert_eq!(report.summary.duplicate_source_records, 96);
|
||||
}
|
||||
|
||||
// Guard against the suite passing without ever injecting these faults.
assert!(
|
||||
saw_transient_drop,
|
||||
"fault suite did not exercise transient drops"
|
||||
);
|
||||
assert!(
|
||||
saw_partition_delay,
|
||||
"fault suite did not exercise partitions"
|
||||
);
|
||||
assert!(
|
||||
saw_publisher_outage,
|
||||
"fault suite did not exercise publisher outages"
|
||||
);
|
||||
}
|
||||
|
||||
// Adds one encoder drift fault for nuc-b at sequence 17 to the faulted
// scenario. The run must then be incomplete, report exactly that sequence as
// divergent, and count one drift observation.
#[test]
|
||||
fn seeded_fault_scenario_detects_encoder_drift() {
|
||||
let mut scenario = faulted_duplicate_scenario(SimulationSeed::new(0x6472_6966_7421));
|
||||
scenario
|
||||
.encoder_drifts
|
||||
.push(EncoderDriftFault::new("nuc-b", 17, "x264-hd3-drift"));
|
||||
|
||||
let report = run_duplicate_publisher_simulation(&scenario);
|
||||
|
||||
assert!(!report.duplicate_complete());
|
||||
assert_eq!(report.summary.divergent_sequences, vec![17]);
|
||||
assert_eq!(report.duplicate_complete_at_ms, None);
|
||||
assert_eq!(report.fault_stats.encoder_drift_observations, 1);
|
||||
}
|
||||
|
||||
// Gives nuc-b a publisher sequence offset of 3, with network delay and jitter
// set to zero. Every sequence in 0..8 must be divergent, none may match, and
// the invariant check must list the expected failures in order.
#[test]
|
||||
fn duplicate_publisher_simulation_detects_unaligned_publisher_phase() {
|
||||
let mut scenario = DuplicatePublisherScenario::new(
|
||||
SimulationSeed::new(0x7068_6173_652d_6275),
|
||||
vec!["nuc-a".to_string(), "nuc-b".to_string()],
|
||||
STREAM,
|
||||
RENDITION,
|
||||
TRACK,
|
||||
PROFILE,
|
||||
0,
|
||||
8,
|
||||
);
|
||||
// Zero delay and jitter.
scenario.base_network_delay_ms = 0;
|
||||
scenario.max_jitter_ms = 0;
|
||||
scenario
|
||||
.publisher_sequence_offsets
|
||||
.insert("nuc-b".to_string(), 3);
|
||||
|
||||
let report = run_duplicate_publisher_simulation(&scenario);
|
||||
let invariant = check_duplicate_publisher_invariants(
|
||||
&report,
|
||||
&DuplicatePublisherInvariantConfig::duplicate_complete_with_deadline(1_000),
|
||||
);
|
||||
|
||||
assert!(!report.duplicate_complete());
|
||||
assert_eq!(report.summary.missing_sequences, Vec::<u64>::new());
|
||||
assert_eq!(
|
||||
report.summary.matching_duplicate_sequences,
|
||||
Vec::<u64>::new()
|
||||
);
|
||||
assert_eq!(
|
||||
report.summary.divergent_sequences,
|
||||
(0_u64..8).collect::<Vec<_>>()
|
||||
);
|
||||
assert_eq!(report.fault_stats.publisher_phase_offset_observations, 8);
|
||||
// The comparison is on the whole Vec, so failure order is part of the contract.
assert_eq!(
|
||||
invariant.failures,
|
||||
vec![
|
||||
"divergent_sequences".to_string(),
|
||||
"media_timing_conflict_sequences".to_string(),
|
||||
"duplicate_incomplete".to_string(),
|
||||
"duplicate_complete_deadline_unreached".to_string(),
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
// Marks nuc-b as a publisher with missing media timing over sequences 0..6.
// No sequence may be divergent, six records must be counted as missing
// timing, and the invariant check must list the expected failures in order.
#[test]
|
||||
fn duplicate_publisher_simulation_rejects_missing_media_timing() {
|
||||
let mut scenario = DuplicatePublisherScenario::new(
|
||||
SimulationSeed::new(0x7469_6d65_2d6d_6973),
|
||||
vec!["nuc-a".to_string(), "nuc-b".to_string()],
|
||||
STREAM,
|
||||
RENDITION,
|
||||
TRACK,
|
||||
PROFILE,
|
||||
0,
|
||||
6,
|
||||
);
|
||||
// Zero delay and jitter.
scenario.base_network_delay_ms = 0;
|
||||
scenario.max_jitter_ms = 0;
|
||||
scenario
|
||||
.missing_media_timing_publishers
|
||||
.insert("nuc-b".to_string());
|
||||
|
||||
let report = run_duplicate_publisher_simulation(&scenario);
|
||||
let invariant = check_duplicate_publisher_invariants(
|
||||
&report,
|
||||
&DuplicatePublisherInvariantConfig::duplicate_complete_with_deadline(1_000),
|
||||
);
|
||||
|
||||
assert_eq!(report.summary.divergent_sequences, Vec::<u64>::new());
|
||||
assert_eq!(report.summary.media_timing_missing_records, 6);
|
||||
assert_eq!(
|
||||
invariant.failures,
|
||||
vec![
|
||||
"media_timing_missing_records".to_string(),
|
||||
"duplicate_incomplete".to_string(),
|
||||
"duplicate_complete_deadline_unreached".to_string(),
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
/// Applies a 17 ms media-time offset to `nuc-b` and expects every sequence to
/// be listed as a media timing conflict while none is listed as divergent.
#[test]
fn duplicate_publisher_simulation_rejects_conflicting_media_timing() {
    let publishers: Vec<String> = ["nuc-a", "nuc-b"]
        .iter()
        .map(|name| name.to_string())
        .collect();
    let mut scenario = DuplicatePublisherScenario::new(
        SimulationSeed::new(0x7469_6d65_2d73_6b65),
        publishers,
        STREAM,
        RENDITION,
        TRACK,
        PROFILE,
        0,
        6,
    );
    // Zero delay and jitter so the media-time offset is the only injected fault.
    scenario.max_jitter_ms = 0;
    scenario.base_network_delay_ms = 0;
    scenario
        .publisher_media_time_offsets_ms
        .insert("nuc-b".to_string(), 17);

    let report = run_duplicate_publisher_simulation(&scenario);
    let deadline = DuplicatePublisherInvariantConfig::duplicate_complete_with_deadline(1_000);
    let invariant = check_duplicate_publisher_invariants(&report, &deadline);

    let no_sequences: Vec<u64> = Vec::new();
    let every_sequence: Vec<u64> = (0_u64..6).collect();
    let expected_failures: Vec<String> = [
        "media_timing_conflict_sequences",
        "duplicate_incomplete",
        "duplicate_complete_deadline_unreached",
    ]
    .iter()
    .map(|name| name.to_string())
    .collect();

    assert_eq!(report.summary.divergent_sequences, no_sequences);
    assert_eq!(report.summary.media_timing_conflict_sequences, every_sequence);
    assert_eq!(invariant.failures, expected_failures);
}
|
||||
|
||||
/// Assigns `nuc-b` its own source material label and expects every sequence
/// to diverge, with the mismatch counted in the fault statistics and surfaced
/// as an invariant failure.
#[test]
fn duplicate_publisher_simulation_rejects_independent_source_material() {
    let publishers: Vec<String> = ["nuc-a", "nuc-b"]
        .iter()
        .map(|name| name.to_string())
        .collect();
    let mut scenario = DuplicatePublisherScenario::new(
        SimulationSeed::new(0x736f_7572_6365_6d61),
        publishers,
        STREAM,
        RENDITION,
        TRACK,
        PROFILE,
        0,
        6,
    );
    // Zero delay and jitter so the source mismatch is the only injected fault.
    scenario.max_jitter_ms = 0;
    scenario.base_network_delay_ms = 0;
    scenario
        .publisher_source_material
        .insert("nuc-b".to_string(), "independent-rf-window".to_string());

    let report = run_duplicate_publisher_simulation(&scenario);
    let deadline = DuplicatePublisherInvariantConfig::duplicate_complete_with_deadline(1_000);
    let invariant = check_duplicate_publisher_invariants(&report, &deadline);

    let no_sequences: Vec<u64> = Vec::new();
    let every_sequence: Vec<u64> = (0_u64..6).collect();
    let expected_failures: Vec<String> = [
        "divergent_sequences",
        "source_material_mismatch_observations",
        "duplicate_incomplete",
        "duplicate_complete_deadline_unreached",
    ]
    .iter()
    .map(|name| name.to_string())
    .collect();

    assert_eq!(report.summary.divergent_sequences, every_sequence);
    assert_eq!(report.summary.media_timing_conflict_sequences, no_sequences);
    assert_eq!(report.fault_stats.source_material_mismatch_observations, 12);
    assert_eq!(invariant.failures, expected_failures);
}
|
||||
|
||||
/// Replaces the default faults with a single `nuc-b` publisher outage and
/// checks that the run still completes, that backfill observations equal the
/// outage observations, and that completion falls between 940 ms and 3000 ms.
#[test]
fn duplicate_publisher_outage_backfills_after_restart() {
    let mut scenario = faulted_duplicate_scenario(SimulationSeed::new(0x6f75_7461_6765));
    // Strip the other fault sources so only the outage below remains.
    scenario.transient_drop_per_million = 0;
    scenario.partitions.clear();
    scenario.publisher_outages = vec![SimulationOutage::new("nuc-b", 320, 760, 180)];

    let report = run_duplicate_publisher_simulation(&scenario);
    let stats = &report.fault_stats;

    assert!(
        report.duplicate_complete(),
        "{} {:?}",
        report.replay_hint,
        report.summary
    );
    assert!(stats.publisher_outage_observations > 0);
    assert_eq!(
        stats.backfill_observations,
        stats.publisher_outage_observations
    );

    let completed_at_ms = report.duplicate_complete_at_ms.unwrap();
    assert!(
        completed_at_ms >= 940,
        "outage restart should move convergence later than the live path"
    );
    assert!(completed_at_ms <= 3_000);
}
|
||||
|
||||
/// Runs the faulted duplicate scenario once and expects the invariant check
/// to pass with a recorded completion time no later than the 3000 ms deadline.
#[test]
fn duplicate_publisher_simulation_checks_convergence_deadline() {
    let scenario = faulted_duplicate_scenario(SimulationSeed::new(0x6465_6164_6c69_6e65));
    let report = run_duplicate_publisher_simulation(&scenario);
    let deadline = DuplicatePublisherInvariantConfig::duplicate_complete_with_deadline(3_000);
    let invariant = check_duplicate_publisher_invariants(&report, &deadline);

    assert!(
        invariant.ok,
        "{} {:?}",
        invariant.replay_hint, invariant.failures
    );

    let completed_at_ms = invariant.duplicate_complete_at_ms;
    assert!(completed_at_ms.is_some());
    assert!(
        completed_at_ms.unwrap() <= 3_000,
        "{} completed too late: {:?}",
        invariant.replay_hint,
        completed_at_ms
    );
}
|
||||
|
||||
/// Runs an eight-iteration campaign starting at seed 40 in which only seed 44
/// reports a failure, and expects that seed's replay hint to be preserved.
#[test]
fn seeded_simulation_campaign_preserves_first_failure() {
    let campaign = run_seeded_simulation_campaign(
        "generic-seeded-campaign",
        SimulationSeed::new(40),
        8,
        |seed| {
            // The hint is built for every seed and only returned for seed 44.
            let hint = seed.replay_hint();
            if seed.0 == 44 {
                Some(hint)
            } else {
                None
            }
        },
    );

    assert!(!campaign.all_passed());
    assert_eq!(campaign.passed, 7);
    assert_eq!(campaign.failed, 1);
    assert_eq!(
        campaign.first_failure.as_deref(),
        Some("EC_SIM_SEED=000000000000002c")
    );
}
|
||||
|
||||
/// Runs the same faulted control-plane scenario twice and expects identical
/// reports and traces, full propagation, and at least one scheduled-message
/// and one node-learned trace event.
#[test]
fn control_plane_propagation_replays_from_seed() {
    let scenario = faulted_control_plane_scenario(SimulationSeed::new(0x6374_726c_7265_706c));

    let first = run_control_plane_propagation_simulation(&scenario);
    let second = run_control_plane_propagation_simulation(&scenario);

    assert_eq!(first, second);
    assert!(
        first.propagation_complete(),
        "control propagation failed for {}: {:?}",
        first.replay_hint,
        first.missing_nodes
    );
    assert_eq!(first.known_count, scenario.nodes.len() as u64);
    assert_eq!(
        first.trace, second.trace,
        "replayed control-plane schedules should carry identical traces"
    );

    let saw_message_scheduled = first
        .trace
        .iter()
        .any(|entry| matches!(entry.event, ControlPlaneTraceEvent::MessageScheduled { .. }));
    assert!(saw_message_scheduled);

    let saw_node_learned = first
        .trace
        .iter()
        .any(|entry| matches!(entry.event, ControlPlaneTraceEvent::NodeLearned { .. }));
    assert!(saw_node_learned);
}
|
||||
|
||||
/// Runs 512 seeded control-plane schedules and expects all to pass, every
/// fault counter to be non-zero, and the slowest observed propagation to stay
/// within 900 ms.
#[test]
fn control_plane_campaign_runs_many_fault_schedules() {
    let invariant = ControlPlanePropagationInvariantConfig::complete_with_deadline(7, 900);
    let first_seed = SimulationSeed::new(1);
    let campaign = run_control_plane_propagation_campaign(
        "control-plane-gossip-fault-campaign",
        first_seed,
        512,
        &invariant,
        faulted_control_plane_scenario,
    );

    assert!(
        campaign.all_passed(),
        "campaign failed: {:?}",
        campaign.first_failure
    );
    assert_eq!(campaign.passed, 512);
    assert_eq!(campaign.failed, 0);

    // Each fault category must have been exercised at least once.
    assert!(campaign.total_transient_dropped_messages > 0);
    assert!(campaign.total_partition_delayed_messages > 0);
    assert!(campaign.total_node_outage_delayed_messages > 0);
    assert!(campaign.total_duplicate_messages > 0);

    assert!(campaign.max_propagation_complete_ms_observed <= 900);
}
|
||||
|
||||
/// Sets the gossip fanout to zero with every other fault removed and expects
/// only `nuc-a` to know the update, leaving six nodes missing and both
/// propagation invariants failed.
#[test]
fn control_plane_simulation_detects_dead_fanout() {
    let mut scenario = faulted_control_plane_scenario(SimulationSeed::new(0x6661_6e6f_7574));
    scenario.fanout = 0;
    // Remove the remaining faults so the zero fanout is the sole cause.
    scenario.node_outages.clear();
    scenario.partitions.clear();
    scenario.transient_drop_per_million = 0;

    let report = run_control_plane_propagation_simulation(&scenario);
    let deadline = ControlPlanePropagationInvariantConfig::complete_with_deadline(7, 900);
    let invariant = check_control_plane_propagation_invariants(&report, &deadline);

    let expected_failures: Vec<String> = [
        "propagation_incomplete",
        "propagation_deadline_unreached",
    ]
    .iter()
    .map(|name| name.to_string())
    .collect();

    assert!(!report.propagation_complete());
    assert_eq!(report.known_nodes, vec!["nuc-a".to_string()]);
    assert_eq!(report.missing_nodes.len(), 6);
    assert_eq!(invariant.failures, expected_failures);
}
|
||||
|
||||
/// Runs 512 seeded duplicate-publisher schedules and expects all to pass,
/// every fault counter to be non-zero, and the slowest observed completion to
/// stay within 3000 ms.
#[test]
fn duplicate_publisher_campaign_runs_many_seed_schedules() {
    let invariant = DuplicatePublisherInvariantConfig::duplicate_complete_with_deadline(3_000);
    let first_seed = SimulationSeed::new(1);
    let campaign = run_duplicate_publisher_campaign(
        "duplicate-publisher-fault-campaign",
        first_seed,
        512,
        &invariant,
        faulted_duplicate_scenario,
    );

    assert!(
        campaign.all_passed(),
        "campaign failed: {:?}",
        campaign.first_failure
    );
    assert_eq!(campaign.passed, 512);
    assert_eq!(campaign.failed, 0);

    // Each fault category must have been exercised at least once.
    assert!(campaign.total_transient_dropped_observations > 0);
    assert!(campaign.total_partition_delayed_observations > 0);
    assert!(campaign.total_publisher_outage_observations > 0);
    assert!(campaign.total_backfill_observations > 0);

    assert!(campaign.max_duplicate_complete_ms_observed <= 3_000);
}
|
||||
|
||||
/// Adds an encoder drift at sequence 17 to the faulted scenario and expects
/// the shrinker to return an 18-sequence scenario with partitions, outages,
/// drops, jitter and base delay all removed, still failing on sequence 17.
#[test]
fn duplicate_publisher_shrinker_minimizes_noisy_drift_failure() {
    let invariant = DuplicatePublisherInvariantConfig::duplicate_complete_with_deadline(3_000);
    let mut scenario = faulted_duplicate_scenario(SimulationSeed::new(19));
    let drift = EncoderDriftFault::new("nuc-b", 17, "x264-hd3-drift");
    scenario.encoder_drifts.push(drift);

    let shrunk = shrink_duplicate_publisher_failure(&scenario, &invariant)
        .expect("drift should fail and be shrinkable");

    assert_eq!(shrunk.seed, SimulationSeed::new(19));

    // The minimized scenario keeps only what is needed to reproduce the drift.
    let minimized = &shrunk.scenario;
    assert_eq!(minimized.expected_sequences(), 18);
    assert_eq!(minimized.partitions.len(), 0);
    assert_eq!(minimized.publisher_outages.len(), 0);
    assert_eq!(minimized.transient_drop_per_million, 0);
    assert_eq!(minimized.max_jitter_ms, 0);
    assert_eq!(minimized.base_network_delay_ms, 0);

    assert_eq!(shrunk.report.summary.divergent_sequences, vec![17]);

    let expected_failures: Vec<String> = [
        "divergent_sequences",
        "duplicate_incomplete",
        "duplicate_complete_deadline_unreached",
    ]
    .iter()
    .map(|name| name.to_string())
    .collect();
    assert_eq!(shrunk.invariant.failures, expected_failures);

    let recorded_window_step = shrunk
        .steps
        .iter()
        .any(|step| step.dimension == "sequence_count" && step.after == "18");
    assert!(
        recorded_window_step,
        "shrink steps should record the minimized failing media window"
    );
}
|
||||
|
||||
/// Runs a 32-seed campaign in which only seed 19 carries an encoder drift,
/// then checks that the recorded failure is seed 19, that rebuilding the same
/// scenario reproduces the stored report, and that a shrunk replay was kept.
#[test]
fn duplicate_publisher_campaign_keeps_first_replayable_failure() {
    let invariant = DuplicatePublisherInvariantConfig::duplicate_complete_with_deadline(3_000);
    let campaign = run_duplicate_publisher_campaign(
        "duplicate-publisher-replayable-failure",
        SimulationSeed::new(10),
        32,
        &invariant,
        |seed| {
            let mut scenario = faulted_duplicate_scenario(seed);
            if seed.0 == 19 {
                let drift = EncoderDriftFault::new("nuc-b", 17, "x264-hd3-drift");
                scenario.encoder_drifts.push(drift);
            }
            scenario
        },
    );

    let failure = campaign
        .first_failure
        .as_ref()
        .expect("campaign should preserve first failure");
    let shrunk = failure
        .shrunk_failure
        .as_ref()
        .expect("campaign should preserve a shrunk replay");

    let expected_failures: Vec<String> = [
        "divergent_sequences",
        "duplicate_incomplete",
        "duplicate_complete_deadline_unreached",
    ]
    .iter()
    .map(|name| name.to_string())
    .collect();

    assert_eq!(failure.seed, SimulationSeed::new(19));
    assert_eq!(failure.invariant.failures, expected_failures);

    // Rebuild the failing scenario from the recorded seed and rerun it.
    let mut replay = faulted_duplicate_scenario(failure.seed);
    replay
        .encoder_drifts
        .push(EncoderDriftFault::new("nuc-b", 17, "x264-hd3-drift"));
    let replayed_report = run_duplicate_publisher_simulation(&replay);

    assert_eq!(replayed_report, failure.report);
    assert_eq!(shrunk.scenario.expected_sequences(), 18);
    assert_eq!(shrunk.report.summary.divergent_sequences, vec![17]);
}
|
||||
|
||||
/// Runs the combined control-plane and media scenario with the global
/// sequence clock and expects full convergence within 3500 ms, no divergent
/// sequences, and `nuc-b` activating later than `nuc-a`.
#[test]
fn system_duplicate_publishers_converge_with_global_sequence_clock() {
    let seed = SimulationSeed::new(0x7379_7374_656d_676c);
    let scenario = system_duplicate_scenario(seed, PublisherSequenceClock::Global);

    let report = run_system_duplicate_publisher_simulation(&scenario);
    let deadline = SystemDuplicatePublisherInvariantConfig::complete_with_deadline(3_500);
    let invariant = check_system_duplicate_publisher_invariants(&report, &deadline);

    assert!(
        report.system_complete(),
        "{} control={:?} media={:?}",
        report.replay_hint,
        report.control.missing_nodes,
        report.media.summary
    );
    assert!(
        invariant.ok,
        "{} {:?}",
        invariant.replay_hint, invariant.failures
    );

    let no_sequences: Vec<u64> = Vec::new();
    assert_eq!(report.media.summary.divergent_sequences, no_sequences);
    assert_eq!(
        report.media.summary.matching_duplicate_sequences.len() as u64,
        scenario.media.expected_sequences()
    );

    // A publisher with no recorded activation time falls back to the default.
    let activation_ms = |node: &str| {
        report
            .publisher_activation_ms
            .get(node)
            .copied()
            .unwrap_or_default()
    };
    assert!(
        activation_ms("nuc-b") > activation_ms("nuc-a"),
        "faulted control plane should activate nuc-b later than nuc-a"
    );
}
|
||||
|
||||
/// Runs the combined scenario with the local-activation sequence clock and
/// expects the control plane to propagate fully while the media side diverges
/// and fails the system invariants.
#[test]
fn system_duplicate_publishers_reject_local_activation_sequence_clock() {
    let seed = SimulationSeed::new(0x7379_7374_656d_6c6f);
    let scenario = system_duplicate_scenario(seed, PublisherSequenceClock::LocalActivation);

    let report = run_system_duplicate_publisher_simulation(&scenario);
    let deadline = SystemDuplicatePublisherInvariantConfig::complete_with_deadline(3_500);
    let invariant = check_system_duplicate_publisher_invariants(&report, &deadline);

    assert!(report.control.propagation_complete());
    assert!(!report.media.duplicate_complete());
    assert!(
        !report.media.summary.divergent_sequences.is_empty(),
        "local activation clock should cause same advertised sequence to hash differently"
    );

    let expected_failures: Vec<String> = [
        "media_divergent_sequences",
        "media_timing_conflict_sequences",
        "media_duplicate_incomplete",
        "system_complete_deadline_unreached",
    ]
    .iter()
    .map(|name| name.to_string())
    .collect();
    assert_eq!(invariant.failures, expected_failures);
}
|
||||
|
||||
/// Runs 256 seeded system schedules with the global sequence clock and checks
/// the pass counts, timing bounds, trace totals, fault counters, per-seed
/// coverage counters, and the ordering of the recorded slowest runs.
#[test]
fn system_duplicate_publisher_campaign_runs_many_seed_schedules() {
    let invariant = SystemDuplicatePublisherInvariantConfig::complete_with_deadline(3_500);
    let campaign = run_system_duplicate_publisher_campaign(
        "system-duplicate-publisher-fault-campaign",
        SimulationSeed::new(1),
        256,
        &invariant,
        |seed| system_duplicate_scenario(seed, PublisherSequenceClock::Global),
    );

    assert!(
        campaign.all_passed(),
        "campaign failed: {:?}",
        campaign.first_failure
    );
    assert_eq!(campaign.passed, 256);
    assert_eq!(campaign.failed, 0);

    // Timing aggregates.
    assert!(campaign.max_control_propagation_ms_observed > 0);
    assert!(campaign.max_media_duplicate_complete_ms_observed > 0);
    assert!(campaign.max_system_complete_ms_observed <= 3_500);
    assert!(campaign.total_system_complete_ms_observed > 0);

    // Trace totals must add up across the control and media planes.
    assert!(campaign.total_control_trace_events > 0);
    assert!(campaign.total_media_trace_events > 0);
    assert_eq!(
        campaign.total_trace_events,
        campaign.total_control_trace_events + campaign.total_media_trace_events
    );

    // Fault totals and the number of seeds that observed each fault.
    assert!(campaign.total_control_transient_drops > 0);
    assert!(campaign.total_media_transient_drops > 0);
    assert!(campaign.total_media_backfill_observations > 0);
    assert!(campaign.seeds_with_system_convergence_time > 0);
    assert!(campaign.seeds_with_control_transient_drops > 0);
    assert!(campaign.seeds_with_media_transient_drops > 0);
    assert!(campaign.seeds_with_media_backfill_observations > 0);

    // The slowest-run list is bounded and ordered from slowest to fastest;
    // a run with no completion time is treated as u64::MAX.
    let slowest = &campaign.slowest_system_runs;
    assert!(!slowest.is_empty());
    assert!(slowest.len() <= 16);
    let ordered_slowest_first = slowest
        .iter()
        .zip(slowest.iter().skip(1))
        .all(|(earlier, later)| {
            earlier.system_complete_at_ms.unwrap_or(u64::MAX)
                >= later.system_complete_at_ms.unwrap_or(u64::MAX)
        });
    assert!(ordered_slowest_first);

    assert_eq!(campaign.total_media_publisher_phase_offsets, 0);
}
|
||||
|
||||
/// Runs 512 seeded schedules built from the default foundation-style scenario
/// config and expects all to pass within 6000 ms with every control and media
/// fault counter, and its per-seed coverage counter, non-zero.
#[test]
fn foundation_style_system_campaign_runs_replayable_fault_schedules() {
    let invariant = SystemDuplicatePublisherInvariantConfig::complete_with_deadline(6_000);
    let config = FoundationStyleSystemScenarioConfig::default();
    let build_scenario =
        |seed| ec_core::sim::foundation_style_system_duplicate_publisher_scenario(seed, &config);
    let campaign = run_system_duplicate_publisher_campaign(
        "foundation-style-system-campaign",
        SimulationSeed::new(1),
        512,
        &invariant,
        build_scenario,
    );

    assert!(
        campaign.all_passed(),
        "campaign failed: {:?}",
        campaign.first_failure
    );
    assert_eq!(campaign.passed, 512);
    assert_eq!(campaign.failed, 0);
    assert!(campaign.max_system_complete_ms_observed <= 6_000);
    assert!(campaign.total_system_complete_ms_observed > 0);

    // Trace totals must add up across the control and media planes.
    assert!(campaign.total_control_trace_events > 0);
    assert!(campaign.total_media_trace_events > 0);
    assert_eq!(
        campaign.total_trace_events,
        campaign.total_control_trace_events + campaign.total_media_trace_events
    );

    // Fault totals across the whole campaign.
    assert!(campaign.total_control_transient_drops > 0);
    assert!(campaign.total_control_partition_delays > 0);
    assert!(campaign.total_control_node_outage_delays > 0);
    assert!(campaign.total_control_duplicate_messages > 0);
    assert!(campaign.total_media_transient_drops > 0);
    assert!(campaign.total_media_partition_delays > 0);
    assert!(campaign.total_media_publisher_outages > 0);
    assert!(campaign.total_media_backfill_observations > 0);

    // Number of seeds that observed each timing or fault category.
    assert!(campaign.seeds_with_system_convergence_time > 0);
    assert!(campaign.seeds_with_control_propagation_time > 0);
    assert!(campaign.seeds_with_media_duplicate_convergence_time > 0);
    assert!(campaign.seeds_with_control_transient_drops > 0);
    assert!(campaign.seeds_with_control_partition_delays > 0);
    assert!(campaign.seeds_with_control_node_outage_delays > 0);
    assert!(campaign.seeds_with_control_duplicate_messages > 0);
    assert!(campaign.seeds_with_media_transient_drops > 0);
    assert!(campaign.seeds_with_media_partition_delays > 0);
    assert!(campaign.seeds_with_media_publisher_outages > 0);
    assert!(campaign.seeds_with_media_backfill_observations > 0);
    assert!(campaign.fault_coverage_ok());

    let slowest = &campaign.slowest_system_runs;
    assert!(!slowest.is_empty());
    assert!(slowest.len() <= 16);

    assert_eq!(campaign.total_media_publisher_phase_offsets, 0);
}
|
||||
|
||||
/// Switches the foundation-style config to the local-activation sequence
/// clock and expects a 32-seed campaign to record a failure with divergent
/// media sequences and publisher phase offset observations.
#[test]
fn foundation_style_system_campaign_rejects_local_activation_sequence_clock() {
    let invariant = SystemDuplicatePublisherInvariantConfig::complete_with_deadline(6_000);
    let mut config = FoundationStyleSystemScenarioConfig::default();
    config.sequence_clock = PublisherSequenceClock::LocalActivation;
    let build_scenario =
        |seed| ec_core::sim::foundation_style_system_duplicate_publisher_scenario(seed, &config);
    let campaign = run_system_duplicate_publisher_campaign(
        "foundation-style-local-activation-failure",
        SimulationSeed::new(1),
        32,
        &invariant,
        build_scenario,
    );

    let failure = campaign
        .first_failure
        .as_ref()
        .expect("local activation clock should fail under foundation-style faults");

    assert!(!campaign.all_passed());

    let reported_divergence = failure
        .invariant
        .failures
        .iter()
        .any(|name| name == "media_divergent_sequences");
    assert!(reported_divergence);

    let media = &failure.report.media;
    assert!(!media.summary.divergent_sequences.is_empty());
    assert!(media.fault_stats.publisher_phase_offset_observations > 0);

    assert!(campaign.total_media_publisher_phase_offsets > 0);
    assert!(campaign.seeds_with_media_publisher_phase_offsets > 0);
}
|
||||
|
||||
/// Runs a single-seed system campaign in which `nuc-b` publishes different
/// source material and expects the failure to be classified as a source
/// material mismatch, counted for exactly one seed.
#[test]
fn system_duplicate_publisher_campaign_classifies_source_material_mismatch() {
    let invariant = SystemDuplicatePublisherInvariantConfig::complete_with_deadline(3_500);
    let campaign = run_system_duplicate_publisher_campaign(
        "system-source-material-failure",
        SimulationSeed::new(1),
        1,
        &invariant,
        |seed| {
            let mut scenario = system_duplicate_scenario(seed, PublisherSequenceClock::Global);
            let material = &mut scenario.media.publisher_source_material;
            material.insert("nuc-b".to_string(), "independent-rf-window".to_string());
            scenario
        },
    );

    let failure = campaign
        .first_failure
        .as_ref()
        .expect("source material mismatch should fail");

    assert!(!campaign.all_passed());

    let reported_mismatch = failure
        .invariant
        .failures
        .iter()
        .any(|name| name == "media_source_material_mismatch_observations");
    assert!(reported_mismatch);

    let media = &failure.report.media;
    assert!(!media.summary.divergent_sequences.is_empty());
    assert!(media.fault_stats.source_material_mismatch_observations > 0);

    assert!(campaign.total_media_source_material_mismatches > 0);
    assert_eq!(campaign.seeds_with_media_source_material_mismatches, 1);
}
|
||||
|
||||
fn faulted_duplicate_scenario(seed: SimulationSeed) -> DuplicatePublisherScenario {
|
||||
let mut scenario = DuplicatePublisherScenario::new(
|
||||
seed,
|
||||
vec!["nuc-a".to_string(), "nuc-b".to_string()],
|
||||
STREAM,
|
||||
RENDITION,
|
||||
TRACK,
|
||||
PROFILE,
|
||||
0,
|
||||
48,
|
||||
);
|
||||
scenario.segment_step_ms = 40;
|
||||
scenario.base_network_delay_ms = 5;
|
||||
scenario.max_jitter_ms = 75;
|
||||
scenario.transient_drop_per_million = 275_000;
|
||||
scenario.backfill_after_ms = 600;
|
||||
scenario.partitions = vec![
|
||||
SimulationPartition::new("nuc-b", 120, 520, 140),
|
||||
SimulationPartition::new("nuc-a", 940, 1_260, 90),
|
||||
];
|
||||
scenario.publisher_outages = vec![SimulationOutage::new("nuc-b", 1_360, 1_520, 220)];
|
||||
scenario
|
||||
}
|
||||
|
||||
fn faulted_control_plane_scenario(seed: SimulationSeed) -> ControlPlanePropagationScenario {
|
||||
let mut scenario = ControlPlanePropagationScenario::new(
|
||||
seed,
|
||||
vec![
|
||||
"nuc-a".to_string(),
|
||||
"nuc-b".to_string(),
|
||||
"tower".to_string(),
|
||||
"forge".to_string(),
|
||||
"relay-lax".to_string(),
|
||||
"relay-nyc".to_string(),
|
||||
"relay-hel".to_string(),
|
||||
],
|
||||
"nuc-a",
|
||||
"ec.control.broadcast.la-kcop",
|
||||
"la-kcop@42",
|
||||
);
|
||||
scenario.fanout = 3;
|
||||
scenario.gossip_interval_ms = 35;
|
||||
scenario.max_gossip_rounds = 12;
|
||||
scenario.base_network_delay_ms = 6;
|
||||
scenario.max_jitter_ms = 45;
|
||||
scenario.transient_drop_per_million = 120_000;
|
||||
scenario.partitions = vec![
|
||||
SimulationPartition::new("relay-hel", 70, 190, 55),
|
||||
SimulationPartition::new("tower", 220, 310, 40),
|
||||
];
|
||||
scenario.node_outages = vec![SimulationOutage::new("relay-nyc", 105, 205, 45)];
|
||||
scenario
|
||||
}
|
||||
|
||||
fn system_duplicate_scenario(
|
||||
seed: SimulationSeed,
|
||||
sequence_clock: PublisherSequenceClock,
|
||||
) -> SystemDuplicatePublisherScenario {
|
||||
let mut control = ControlPlanePropagationScenario::new(
|
||||
seed,
|
||||
vec![
|
||||
"forge".to_string(),
|
||||
"nuc-a".to_string(),
|
||||
"nuc-b".to_string(),
|
||||
"tower".to_string(),
|
||||
"relay-lax".to_string(),
|
||||
"relay-nyc".to_string(),
|
||||
"relay-hel".to_string(),
|
||||
],
|
||||
"forge",
|
||||
"ec.control.broadcast.la-kcop",
|
||||
"la-kcop@42",
|
||||
);
|
||||
control.fanout = 3;
|
||||
control.gossip_interval_ms = 35;
|
||||
control.max_gossip_rounds = 12;
|
||||
control.base_network_delay_ms = 6;
|
||||
control.max_jitter_ms = 45;
|
||||
control.transient_drop_per_million = 120_000;
|
||||
control.partitions = vec![
|
||||
SimulationPartition::new("nuc-b", 0, 180, 40),
|
||||
SimulationPartition::new("relay-hel", 70, 190, 55),
|
||||
];
|
||||
control.node_outages = vec![SimulationOutage::new("relay-nyc", 105, 205, 45)];
|
||||
|
||||
let mut media = DuplicatePublisherScenario::new(
|
||||
SimulationSeed::new(seed.0 ^ 0x6d65_6469_6121),
|
||||
vec!["nuc-a".to_string(), "nuc-b".to_string()],
|
||||
STREAM,
|
||||
RENDITION,
|
||||
TRACK,
|
||||
PROFILE,
|
||||
0,
|
||||
48,
|
||||
);
|
||||
media.segment_step_ms = 40;
|
||||
media.base_network_delay_ms = 5;
|
||||
media.max_jitter_ms = 75;
|
||||
media.transient_drop_per_million = 275_000;
|
||||
media.backfill_after_ms = 600;
|
||||
media.partitions = vec![SimulationPartition::new("nuc-a", 940, 1_260, 90)];
|
||||
media.publisher_outages = vec![SimulationOutage::new("nuc-b", 1_360, 1_520, 220)];
|
||||
|
||||
let mut scenario = SystemDuplicatePublisherScenario::new(seed, control, media);
|
||||
scenario.publisher_activation_delay_ms = 25;
|
||||
scenario.publisher_backfill_delay_ms = 180;
|
||||
scenario.sequence_clock = sequence_clock;
|
||||
scenario
|
||||
}
|
||||
|
|
@ -29,17 +29,21 @@ rustls-native-certs = "0.8.3"
|
|||
urlencoding = "2"
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
opentelemetry.workspace = true
|
||||
opentelemetry-otlp.workspace = true
|
||||
opentelemetry_sdk.workspace = true
|
||||
tokio = { version = "1", features = ["full"] }
|
||||
tokio-tungstenite = { version = "0.24", default-features = false, features = ["connect", "rustls-tls-webpki-roots"] }
|
||||
futures-util = "0.3"
|
||||
tracing.workspace = true
|
||||
tracing-opentelemetry.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
web-transport-quinn = "0.11.4"
|
||||
web-transport-trait = "0.3.3"
|
||||
hang = "0.14.0"
|
||||
moq-mux = "0.2.1"
|
||||
moq-lite = "0.14.0"
|
||||
moq-native = { version = "0.13.1", default-features = true }
|
||||
web-transport-quinn = "0.11.9"
|
||||
web-transport-trait = "0.3.4"
|
||||
hang = "0.16.0"
|
||||
moq-mux = "0.4.0"
|
||||
moq-lite = "0.16.0"
|
||||
moq-native = { version = "0.14.0", default-features = true }
|
||||
headless_chrome = "1"
|
||||
tokio-util = "0.7"
|
||||
url = "2"
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -2,7 +2,7 @@ use anyhow::{anyhow, Context, Result};
|
|||
use headless_chrome::protocol::cdp::Page;
|
||||
use headless_chrome::{Browser, Tab};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashSet;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::env;
|
||||
use std::fs;
|
||||
use std::io::{BufRead, BufReader, Cursor, Read, Write};
|
||||
|
|
@ -65,11 +65,15 @@ pub struct BootstrapResult {
|
|||
pub page_url: String,
|
||||
pub interactive_auth_required: bool,
|
||||
pub authorized: bool,
|
||||
pub video_ready: bool,
|
||||
pub current_time: f64,
|
||||
pub width: u64,
|
||||
pub height: u64,
|
||||
pub screenshot_path: Option<PathBuf>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct WaitOutcome {
|
||||
tab: Arc<Tab>,
|
||||
state: NbcVideoState,
|
||||
trace: NbcTraceState,
|
||||
interactive_auth_required: bool,
|
||||
|
|
@ -199,6 +203,14 @@ fn nbc_bootstrap_timeout() -> Duration {
|
|||
.unwrap_or_else(|| Duration::from_secs(1800))
|
||||
}
|
||||
|
||||
/// Returns how long to wait on the NBC profile sign-in gate.
///
/// Reads `EVERY_CHANNEL_NBC_PROFILE_SIGNIN_GATE_TIMEOUT_SECS` as a whole
/// number of seconds. Surrounding whitespace is ignored, so a value such as
/// `" 30 "` or one with a trailing newline is honoured instead of silently
/// falling back. An unset or unparsable value yields the 8 second default.
fn nbc_profile_signin_gate_timeout() -> Duration {
    env::var("EVERY_CHANNEL_NBC_PROFILE_SIGNIN_GATE_TIMEOUT_SECS")
        .ok()
        // Trim before parsing: `u64::from_str` rejects surrounding whitespace.
        .and_then(|value| value.trim().parse::<u64>().ok())
        .map(Duration::from_secs)
        .unwrap_or_else(|| Duration::from_secs(8))
}
|
||||
|
||||
fn nbc_env_flag(name: &str) -> Option<bool> {
|
||||
env::var(name).ok().map(|value| {
|
||||
let value = value.trim().to_ascii_lowercase();
|
||||
|
|
@ -403,6 +415,10 @@ pub fn bootstrap_nbc_auth(
|
|||
)?;
|
||||
|
||||
Ok(BootstrapResult {
|
||||
video_ready: nbc_video_state_has_decoded_frame(&outcome.state),
|
||||
current_time: outcome.state.current_time,
|
||||
width: outcome.state.width,
|
||||
height: outcome.state.height,
|
||||
title: outcome.state.title,
|
||||
page_url: outcome.state.page_url,
|
||||
interactive_auth_required: outcome.interactive_auth_required,
|
||||
|
|
@ -619,7 +635,7 @@ fn run_nbc_capture_loop(
|
|||
register_nbc_trace_handlers(&tab, trace.clone())?;
|
||||
tab.navigate_to(&url)?;
|
||||
tab.wait_until_navigated()?;
|
||||
wait_for_nbc_playback(
|
||||
let outcome = wait_for_nbc_playback(
|
||||
chrome.browser(),
|
||||
&tab,
|
||||
&url,
|
||||
|
|
@ -627,31 +643,34 @@ fn run_nbc_capture_loop(
|
|||
AuthMode::Forbidden,
|
||||
None,
|
||||
)?;
|
||||
let capture_tab = outcome.tab;
|
||||
|
||||
let frame_interval = Duration::from_millis(1000 / nbc_capture_fps().max(1));
|
||||
let quality = nbc_capture_quality();
|
||||
let mut first_frame = true;
|
||||
|
||||
loop {
|
||||
kick_nbc_player(&tab).ok();
|
||||
let frame = tab
|
||||
kick_nbc_player(&capture_tab).ok();
|
||||
let state = probe_nbc_video(&capture_tab).unwrap_or_default();
|
||||
if !nbc_video_state_has_decoded_frame(&state) {
|
||||
return Err(anyhow!(
|
||||
"NBC capture tab lost decoded video (title='{}', page_url='{}', current_time={}, ready_state={}, has_video={})",
|
||||
state.title,
|
||||
state.page_url,
|
||||
state.current_time,
|
||||
state.ready_state,
|
||||
state.has_video,
|
||||
));
|
||||
}
|
||||
let video = capture_tab
|
||||
.find_element("video")
|
||||
.and_then(|video| {
|
||||
video.parent.capture_screenshot(
|
||||
Page::CaptureScreenshotFormatOption::Jpeg,
|
||||
Some(quality),
|
||||
Some(video.get_box_model()?.content_viewport()),
|
||||
true,
|
||||
)
|
||||
})
|
||||
.or_else(|_| {
|
||||
tab.capture_screenshot(
|
||||
Page::CaptureScreenshotFormatOption::Jpeg,
|
||||
Some(quality),
|
||||
None,
|
||||
true,
|
||||
)
|
||||
})?;
|
||||
.context("NBC capture tab has no video element after playback readiness")?;
|
||||
let frame = video.parent.capture_screenshot(
|
||||
Page::CaptureScreenshotFormatOption::Jpeg,
|
||||
Some(quality),
|
||||
Some(video.get_box_model()?.content_viewport()),
|
||||
true,
|
||||
)?;
|
||||
|
||||
if first_frame {
|
||||
first_frame = false;
|
||||
|
|
@ -785,6 +804,15 @@ fn nbc_url_is_provider_linked(url: &str) -> bool {
|
|||
(host.ends_with("nbc.com") || host.ends_with(".nbc.com")) && path.contains("provider-linked")
|
||||
}
|
||||
|
||||
fn nbc_url_is_mvpd_complete(url: &str) -> bool {
|
||||
let Ok(url) = Url::parse(url) else {
|
||||
return false;
|
||||
};
|
||||
let host = url.host_str().unwrap_or_default().to_ascii_lowercase();
|
||||
let path = url.path().to_ascii_lowercase();
|
||||
(host.ends_with("nbc.com") || host.ends_with(".nbc.com")) && path.contains("mvpd-complete")
|
||||
}
|
||||
|
||||
fn nbc_url_is_optional_profile_signin(url: &str) -> bool {
|
||||
let Ok(url) = Url::parse(url) else {
|
||||
return false;
|
||||
|
|
@ -812,6 +840,7 @@ fn nbc_page_is_watch_surface(url: &str) -> bool {
|
|||
(host.ends_with("nbc.com") || host.ends_with(".nbc.com"))
|
||||
&& !nbc_url_is_optional_profile_signin(url.as_str())
|
||||
&& !nbc_url_is_provider_linked(url.as_str())
|
||||
&& !nbc_url_is_mvpd_complete(url.as_str())
|
||||
}
|
||||
|
||||
fn nbc_title_looks_like_verizon_popup(title: &str) -> bool {
|
||||
|
|
@ -836,6 +865,14 @@ fn nbc_state_is_optional_profile_signin(state: &NbcVideoState) -> bool {
|
|||
|| nbc_title_looks_like_optional_profile_signin(&state.title)
|
||||
}
|
||||
|
||||
fn nbc_clues_look_geo_blocked(clues: &NbcPageClues) -> bool {
|
||||
let body_text = clues.body_text.to_ascii_lowercase();
|
||||
body_text.contains("not authorized to access this content from outside of the us")
|
||||
|| body_text.contains("not authorized to access this content from outside of the u.s.")
|
||||
|| body_text.contains("outside of the us and its territories")
|
||||
|| body_text.contains("outside of the u.s. and its territories")
|
||||
}
|
||||
|
||||
fn browser_tabs(browser: &Browser) -> Vec<Arc<Tab>> {
|
||||
browser.register_missing_tabs();
|
||||
browser.get_tabs().lock().unwrap().iter().cloned().collect()
|
||||
|
|
@ -877,20 +914,20 @@ fn find_primary_tab_state<'a>(
|
|||
.find(|candidate| candidate.tab.get_target_id() == target_id)
|
||||
}
|
||||
|
||||
fn find_playing_tab_state(tabs: &[BrowserTabState]) -> Option<&BrowserTabState> {
|
||||
tabs.iter().find(|candidate| {
|
||||
candidate.state.has_video
|
||||
&& candidate.state.width > 0
|
||||
&& candidate.state.height > 0
|
||||
&& !candidate.state.paused
|
||||
&& (candidate.state.current_time > 0.0 || candidate.state.ready_state >= 2)
|
||||
})
|
||||
fn nbc_video_state_has_decoded_frame(state: &NbcVideoState) -> bool {
|
||||
state.has_video
|
||||
&& state.width > 0
|
||||
&& state.height > 0
|
||||
&& !state.paused
|
||||
&& state.current_time > 0.0
|
||||
&& state.ready_state >= 2
|
||||
}
|
||||
|
||||
fn find_provider_linked_tab_state(tabs: &[BrowserTabState]) -> Option<&BrowserTabState> {
|
||||
tabs.iter().find(|candidate| {
|
||||
nbc_title_looks_like_provider_linked(&candidate.state.title)
|
||||
|| nbc_url_is_provider_linked(&candidate.state.page_url)
|
||||
|| nbc_url_is_mvpd_complete(&candidate.state.page_url)
|
||||
})
|
||||
}
|
||||
|
||||
|
|
@ -1038,25 +1075,40 @@ fn advance_nbc_auth_flow(tab: &Arc<Tab>) -> Result<Option<NbcAuthAdvanceResult>>
|
|||
}};
|
||||
const actions = [];
|
||||
const url = window.location.href || "";
|
||||
let host = "";
|
||||
try {{
|
||||
host = new URL(url).hostname.toLowerCase();
|
||||
}} catch (_err) {{}}
|
||||
const title = document.title || "";
|
||||
const titleText = `${{title}} ${{url}}`.toLowerCase();
|
||||
const looksLikeOptionalNbcProfile =
|
||||
(host.endsWith("nbc.com") || host.endsWith(".nbc.com")) &&
|
||||
(url.includes("/sign-in") ||
|
||||
url.includes("/login") ||
|
||||
titleText.includes("nbc account sign in") ||
|
||||
titleText.includes("nbcuniversal profile") ||
|
||||
titleText.includes("nbc profile"));
|
||||
if (looksLikeOptionalNbcProfile) {{
|
||||
return {{ pageUrl: url, title, actions }};
|
||||
}}
|
||||
const candidates = Array.from(
|
||||
document.querySelectorAll(
|
||||
"button,a,[role='button'],[role='option'],label,li,[data-provider-name],[data-provider-id],[data-provider]"
|
||||
)
|
||||
);
|
||||
const providerCta = candidates.find((node) => {{
|
||||
const text = textOf(node);
|
||||
return visible(node) &&
|
||||
(
|
||||
text === "link tv provider" ||
|
||||
text === "link provider" ||
|
||||
text.startsWith("link tv provider ") ||
|
||||
text.startsWith("link provider ")
|
||||
);
|
||||
}});
|
||||
clickNode(providerCta, "click:link-provider");
|
||||
|
||||
if (url.includes("mvpd")) {{
|
||||
const providerCta = candidates.find((node) => {{
|
||||
const text = textOf(node);
|
||||
return visible(node) &&
|
||||
(
|
||||
text === "link tv provider" ||
|
||||
text === "link provider" ||
|
||||
text.startsWith("link tv provider ") ||
|
||||
text.startsWith("link provider ")
|
||||
);
|
||||
}});
|
||||
clickNode(providerCta, "click:link-provider");
|
||||
|
||||
const fullListNode = candidates.find((node) => {{
|
||||
const text = textOf(node);
|
||||
return visible(node) && (text === "full list" || text.startsWith("full list "));
|
||||
|
|
@ -1112,7 +1164,7 @@ fn advance_nbc_auth_flow(tab: &Arc<Tab>) -> Result<Option<NbcAuthAdvanceResult>>
|
|||
|
||||
return {{
|
||||
pageUrl: url,
|
||||
title: document.title || "",
|
||||
title,
|
||||
actions,
|
||||
}};
|
||||
}})())
|
||||
|
|
@ -1226,16 +1278,30 @@ fn advance_mvpd_login_flow(tab: &Arc<Tab>) -> Result<Option<NbcAuthAdvanceResult
|
|||
titleText.includes("nbc profile"));
|
||||
if (looksLikeOptionalNbcProfile) {{
|
||||
const profileButtons = Array.from(document.querySelectorAll("button,a,[role='button'],input[type='submit'],input[type='button']"));
|
||||
const providerLink = profileButtons.find((node) => {{
|
||||
const dismissButton = profileButtons.find((node) => {{
|
||||
const text = textOf(node);
|
||||
return visible(node) && (
|
||||
text === "link tv provider" ||
|
||||
text === "link provider" ||
|
||||
text.startsWith("link tv provider ") ||
|
||||
text.startsWith("link provider ")
|
||||
text === "skip" ||
|
||||
text.startsWith("skip ") ||
|
||||
text === "skip for now" ||
|
||||
text === "maybe later" ||
|
||||
text === "not now" ||
|
||||
text === "no thanks" ||
|
||||
text === "close" ||
|
||||
text === "continue watching" ||
|
||||
text.startsWith("continue watching ") ||
|
||||
text === "continue without signing in" ||
|
||||
text === "continue without profile" ||
|
||||
text === "continue as guest" ||
|
||||
text === "watch live" ||
|
||||
text === "watch now" ||
|
||||
text.startsWith("watch live ") ||
|
||||
text.startsWith("watch now ")
|
||||
);
|
||||
}});
|
||||
clickNode(providerLink, "click:profile-link-provider");
|
||||
if (dismissButton) {{
|
||||
clickNode(dismissButton, `click:profile-dismiss:${{textOf(dismissButton).slice(0, 120)}}`);
|
||||
}}
|
||||
return {{ pageUrl: url, title, actions }};
|
||||
}}
|
||||
if (!looksLikeProviderLogin) {{
|
||||
|
|
@ -1333,8 +1399,15 @@ fn advance_nbc_post_auth_flow(tab: &Arc<Tab>) -> Result<Option<NbcAuthAdvanceRes
|
|||
const bodyText = normalize(document.body?.innerText || "");
|
||||
const looksLinked = title.toLowerCase().includes("tv provider linked")
|
||||
|| url.includes("provider-linked")
|
||||
|| url.includes("mvpd-complete")
|
||||
|| bodyText.includes("tv provider linked");
|
||||
if (!looksLinked) {
|
||||
const looksOptionalProfile =
|
||||
(url.includes("/sign-in") ||
|
||||
url.includes("/login") ||
|
||||
normalize(title).includes("nbc account sign in") ||
|
||||
normalize(title).includes("nbcuniversal profile") ||
|
||||
normalize(title).includes("nbc profile"));
|
||||
if (!looksLinked && !looksOptionalProfile) {
|
||||
return { pageUrl: url, title, actions };
|
||||
}
|
||||
|
||||
|
|
@ -1344,8 +1417,22 @@ fn advance_nbc_post_auth_flow(tab: &Arc<Tab>) -> Result<Option<NbcAuthAdvanceRes
|
|||
return visible(node) && (
|
||||
text === "skip" ||
|
||||
text.startsWith("skip ") ||
|
||||
text === "continue" ||
|
||||
text.startsWith("continue watching")
|
||||
text === "skip for now" ||
|
||||
(looksLinked && text === "continue") ||
|
||||
(looksLinked && text.startsWith("continue ")) ||
|
||||
text === "maybe later" ||
|
||||
text === "not now" ||
|
||||
text === "no thanks" ||
|
||||
text === "close" ||
|
||||
text === "continue watching" ||
|
||||
text.startsWith("continue watching") ||
|
||||
text === "continue without signing in" ||
|
||||
text === "continue without profile" ||
|
||||
text === "continue as guest" ||
|
||||
text === "watch live" ||
|
||||
text === "watch now" ||
|
||||
text.startsWith("watch live ") ||
|
||||
text.startsWith("watch now ")
|
||||
);
|
||||
});
|
||||
if (skipButton) {
|
||||
|
|
@ -1533,6 +1620,7 @@ fn wait_for_nbc_playback(
|
|||
screenshot_out: Option<PathBuf>,
|
||||
) -> Result<WaitOutcome> {
|
||||
let deadline = Instant::now() + nbc_capture_timeout();
|
||||
let auth_forbidden = matches!(&auth_mode, AuthMode::Forbidden);
|
||||
let mut interactive_deadline = None::<Instant>;
|
||||
let mut interactive_auth_required = false;
|
||||
let mut screenshot_path = None::<PathBuf>;
|
||||
|
|
@ -1540,11 +1628,13 @@ fn wait_for_nbc_playback(
|
|||
let mut last_trace_state = None::<NbcTraceState>;
|
||||
let mut last_log = Instant::now() - Duration::from_secs(10);
|
||||
let mut last_clue_log = Instant::now() - Duration::from_secs(30);
|
||||
let mut playback_samples = HashMap::<String, (f64, Instant)>::new();
|
||||
let mut resumed_after_background_login = false;
|
||||
let mut resumed_after_authenticated_surface = false;
|
||||
let mut optional_profile_signin_recoveries = 0_u8;
|
||||
let mut last_optional_profile_signin_retry = None::<Instant>;
|
||||
let mut watch_surface_seen_at = None::<Instant>;
|
||||
let mut optional_profile_signin_seen_at = None::<Instant>;
|
||||
let mut tracked_tabs = HashSet::new();
|
||||
let mut provider_linked_completed = false;
|
||||
|
||||
|
|
@ -1554,13 +1644,33 @@ fn wait_for_nbc_playback(
|
|||
let primary_state = find_primary_tab_state(&tab_states, tab)
|
||||
.map(|value| value.state.clone())
|
||||
.unwrap_or_else(|| probe_nbc_video(tab).unwrap_or_default());
|
||||
if let Some(playing_tab) = find_playing_tab_state(&tab_states) {
|
||||
return Ok(WaitOutcome {
|
||||
state: playing_tab.state.clone(),
|
||||
trace: trace.lock().map(|state| state.clone()).unwrap_or_default(),
|
||||
interactive_auth_required,
|
||||
screenshot_path,
|
||||
});
|
||||
let now = Instant::now();
|
||||
for playing_tab in tab_states
|
||||
.iter()
|
||||
.filter(|candidate| nbc_video_state_has_decoded_frame(&candidate.state))
|
||||
{
|
||||
let target_id = playing_tab.tab.get_target_id().to_string();
|
||||
if let Some((previous_time, first_seen)) = playback_samples.get(&target_id) {
|
||||
if playing_tab.state.current_time >= *previous_time + 0.25
|
||||
&& first_seen.elapsed() >= Duration::from_millis(500)
|
||||
{
|
||||
return Ok(WaitOutcome {
|
||||
tab: playing_tab.tab.clone(),
|
||||
state: playing_tab.state.clone(),
|
||||
trace: trace.lock().map(|state| state.clone()).unwrap_or_default(),
|
||||
interactive_auth_required,
|
||||
screenshot_path,
|
||||
});
|
||||
}
|
||||
}
|
||||
playback_samples
|
||||
.entry(target_id)
|
||||
.and_modify(|(previous_time, _)| {
|
||||
if playing_tab.state.current_time < *previous_time {
|
||||
*previous_time = playing_tab.state.current_time;
|
||||
}
|
||||
})
|
||||
.or_insert((playing_tab.state.current_time, now));
|
||||
}
|
||||
|
||||
let interaction_tab = find_interaction_tab_state(&tab_states, tab)
|
||||
|
|
@ -1571,6 +1681,7 @@ fn wait_for_nbc_playback(
|
|||
let pre_state = probe_nbc_video(&interaction_tab).unwrap_or_default();
|
||||
if nbc_title_looks_like_provider_linked(&pre_state.title)
|
||||
|| nbc_url_is_provider_linked(&pre_state.page_url)
|
||||
|| nbc_url_is_mvpd_complete(&pre_state.page_url)
|
||||
{
|
||||
provider_linked_completed = true;
|
||||
}
|
||||
|
|
@ -1579,6 +1690,7 @@ fn wait_for_nbc_playback(
|
|||
if let Some(progress) = advance_nbc_post_auth_flow(&interaction_tab).ok().flatten() {
|
||||
if nbc_title_looks_like_provider_linked(&progress.title)
|
||||
|| nbc_url_is_provider_linked(&progress.page_url)
|
||||
|| nbc_url_is_mvpd_complete(&progress.page_url)
|
||||
|| progress
|
||||
.actions
|
||||
.iter()
|
||||
|
|
@ -1614,6 +1726,7 @@ fn wait_for_nbc_playback(
|
|||
let state = probe_nbc_video(&interaction_tab).unwrap_or_default();
|
||||
if nbc_title_looks_like_provider_linked(&state.title)
|
||||
|| nbc_url_is_provider_linked(&state.page_url)
|
||||
|| nbc_url_is_mvpd_complete(&state.page_url)
|
||||
{
|
||||
provider_linked_completed = true;
|
||||
}
|
||||
|
|
@ -1621,6 +1734,19 @@ fn wait_for_nbc_playback(
|
|||
let authorized = nbc_trace_is_authorized(&trace_state) || provider_linked_completed;
|
||||
let recent_media_activity = nbc_trace_has_recent_media_activity(&trace_state);
|
||||
|
||||
if !authorized && nbc_state_is_optional_profile_signin(&state) && !state.has_video {
|
||||
let first_seen = *optional_profile_signin_seen_at.get_or_insert_with(Instant::now);
|
||||
if auth_forbidden && first_seen.elapsed() >= nbc_profile_signin_gate_timeout() {
|
||||
return Err(anyhow!(
|
||||
"NBC account sign-in gate reached before TV-provider auth; refusing non-interactive retry loop without decoded video (title='{}', page_url='{}')",
|
||||
state.title,
|
||||
state.page_url,
|
||||
));
|
||||
}
|
||||
} else {
|
||||
optional_profile_signin_seen_at = None;
|
||||
}
|
||||
|
||||
if last_log.elapsed() >= Duration::from_secs(5) {
|
||||
last_log = Instant::now();
|
||||
tracing::info!(
|
||||
|
|
@ -1661,8 +1787,9 @@ fn wait_for_nbc_playback(
|
|||
}
|
||||
}
|
||||
|
||||
if (trace_state.background_login_complete
|
||||
|| nbc_url_is_background_login_complete(&state.page_url))
|
||||
let auth_completion_page = nbc_url_is_background_login_complete(&state.page_url)
|
||||
|| nbc_url_is_mvpd_complete(&state.page_url);
|
||||
if (trace_state.background_login_complete || auth_completion_page)
|
||||
&& !resumed_after_background_login
|
||||
{
|
||||
resumed_after_background_login = true;
|
||||
|
|
@ -1673,41 +1800,49 @@ fn wait_for_nbc_playback(
|
|||
);
|
||||
close_auxiliary_browser_tabs(browser, tab);
|
||||
let _ = tab.activate();
|
||||
let _ = tab.evaluate("window.location.reload()", true);
|
||||
if nbc_url_is_mvpd_complete(&state.page_url) {
|
||||
tab.navigate_to(source_url)?;
|
||||
tab.wait_until_navigated()?;
|
||||
} else {
|
||||
let _ = tab.evaluate("window.location.reload()", true);
|
||||
}
|
||||
std::thread::sleep(Duration::from_secs(2));
|
||||
continue;
|
||||
}
|
||||
|
||||
if authorized
|
||||
&& nbc_state_is_optional_profile_signin(&state)
|
||||
&& !recent_media_activity
|
||||
&& optional_profile_signin_recoveries < 3
|
||||
&& last_optional_profile_signin_retry
|
||||
.map(|instant| instant.elapsed() >= Duration::from_secs(3))
|
||||
.unwrap_or(true)
|
||||
{
|
||||
optional_profile_signin_recoveries += 1;
|
||||
last_optional_profile_signin_retry = Some(Instant::now());
|
||||
tracing::info!(
|
||||
title = %state.title,
|
||||
page_url = %state.page_url,
|
||||
authorized,
|
||||
source_url,
|
||||
optional_profile_signin_recoveries,
|
||||
"NBC profile sign-in surface detected after authorization; returning to the live source URL"
|
||||
);
|
||||
close_auxiliary_browser_tabs(browser, tab);
|
||||
let _ = tab.activate();
|
||||
tab.navigate_to(source_url)?;
|
||||
tab.wait_until_navigated()?;
|
||||
std::thread::sleep(Duration::from_secs(2));
|
||||
continue;
|
||||
if authorized && nbc_state_is_optional_profile_signin(&state) && !state.has_video {
|
||||
if optional_profile_signin_recoveries == 0
|
||||
&& last_optional_profile_signin_retry
|
||||
.map(|instant| instant.elapsed() >= Duration::from_secs(3))
|
||||
.unwrap_or(true)
|
||||
{
|
||||
optional_profile_signin_recoveries += 1;
|
||||
last_optional_profile_signin_retry = Some(Instant::now());
|
||||
tracing::info!(
|
||||
title = %state.title,
|
||||
page_url = %state.page_url,
|
||||
authorized,
|
||||
source_url,
|
||||
"NBC account sign-in gate detected after provider authorization; trying one live-url recovery"
|
||||
);
|
||||
close_auxiliary_browser_tabs(browser, tab);
|
||||
let _ = tab.activate();
|
||||
tab.navigate_to(source_url)?;
|
||||
tab.wait_until_navigated()?;
|
||||
std::thread::sleep(Duration::from_secs(2));
|
||||
continue;
|
||||
}
|
||||
return Err(anyhow!(
|
||||
"NBC account sign-in gate reached after TV-provider auth; refusing retry loop without decoded video (title='{}', page_url='{}')",
|
||||
state.title,
|
||||
state.page_url,
|
||||
));
|
||||
}
|
||||
if authorized && nbc_state_is_optional_profile_signin(&state) && recent_media_activity {
|
||||
if authorized && nbc_state_is_optional_profile_signin(&state) && state.has_video {
|
||||
tracing::debug!(
|
||||
title = %state.title,
|
||||
page_url = %state.page_url,
|
||||
"NBC optional profile sign-in is visible but media activity is already in flight; staying on the page"
|
||||
"NBC optional profile sign-in is visible but a video element is already present; staying on the page"
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -1733,6 +1868,13 @@ fn wait_for_nbc_playback(
|
|||
body_text = %clues.body_text,
|
||||
"NBC watch surface clues"
|
||||
);
|
||||
if nbc_clues_look_geo_blocked(&clues) {
|
||||
return Err(anyhow!(
|
||||
"NBC geo-blocked current egress; page says this content is not authorized outside the US/territories (title='{}', page_url='{}')",
|
||||
primary_state.title,
|
||||
primary_state.page_url,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
if fully_loaded_watch_surface && !primary_state.has_video {
|
||||
|
|
@ -1862,6 +2004,9 @@ mod tests {
|
|||
assert!(nbc_url_is_provider_linked(
|
||||
"https://www.nbc.com/provider-linked"
|
||||
));
|
||||
assert!(nbc_url_is_mvpd_complete(
|
||||
"https://www.nbc.com/mvpd-complete"
|
||||
));
|
||||
assert!(nbc_title_looks_like_provider_linked("TV Provider Linked"));
|
||||
assert!(!nbc_url_is_provider_linked(
|
||||
"https://www.nbc.com/live?brand=nbc-sports-philadelphia"
|
||||
|
|
@ -1884,11 +2029,31 @@ mod tests {
|
|||
#[test]
fn optional_profile_signin_is_not_treated_as_watch_surface() {
    // Auth-flow pages on nbc.com must never be mistaken for the watch surface.
    for non_watch_url in [
        "https://www.nbc.com/sign-in",
        "https://www.nbc.com/mvpd-complete",
    ] {
        assert!(!nbc_page_is_watch_surface(non_watch_url));
    }

    // A live brand page is a watch surface.
    let live_url = "https://www.nbc.com/live?brand=nbc-sports-philadelphia";
    assert!(nbc_page_is_watch_surface(live_url));
}
|
||||
|
||||
#[test]
fn geo_block_clues_fail_closed() {
    // Spelling used by the page today ("US", mixed case).
    let clues = NbcPageClues {
        body_text:
            "We're sorry. You are not authorized to access this content from outside of the US and its territories."
                .to_string(),
        ..NbcPageClues::default()
    };
    assert!(nbc_clues_look_geo_blocked(&clues));

    // The predicate also carries dedicated "U.S." branches; exercise them so a
    // copy change on the page cannot silently bypass the fail-closed check.
    let dotted = NbcPageClues {
        body_text:
            "You are not authorized to access this content from outside of the U.S. and its territories."
                .to_string(),
        ..NbcPageClues::default()
    };
    assert!(nbc_clues_look_geo_blocked(&dotted));

    // Only the trailing "territories" phrase is present.
    let territories_only = NbcPageClues {
        body_text: "This stream is unavailable outside of the U.S. and its territories."
            .to_string(),
        ..NbcPageClues::default()
    };
    assert!(nbc_clues_look_geo_blocked(&territories_only));

    // Ordinary watch-surface text must not trip the detector.
    let allowed = NbcPageClues {
        body_text: "NBC News NOW ON NOW until 7:00 AM".to_string(),
        ..NbcPageClues::default()
    };
    assert!(!nbc_clues_look_geo_blocked(&allowed));

    // Empty body text (the default) is not a geo block.
    assert!(!nbc_clues_look_geo_blocked(&NbcPageClues::default()));
}
|
||||
|
||||
#[test]
|
||||
fn cssott_media_requests_mark_recent_media_activity() {
|
||||
let mut trace = NbcTraceState::default();
|
||||
|
|
@ -1899,4 +2064,25 @@ mod tests {
|
|||
assert!(trace.media_activity_seen);
|
||||
assert!(nbc_trace_has_recent_media_activity(&trace));
|
||||
}
|
||||
|
||||
#[test]
fn decoded_frame_detection_requires_advancing_video_surface() {
    // Baseline: every condition of the predicate is satisfied.
    let mut state = NbcVideoState {
        has_video: true,
        width: 1920,
        height: 1080,
        paused: false,
        ready_state: 2,
        current_time: 1.0,
        ..NbcVideoState::default()
    };
    assert!(nbc_video_state_has_decoded_frame(&state));

    // Each condition is broken in isolation and then restored, so a regression
    // in any single clause of the predicate is caught.
    state.current_time = 0.0;
    assert!(!nbc_video_state_has_decoded_frame(&state));
    state.current_time = 1.0;

    state.width = 0;
    assert!(!nbc_video_state_has_decoded_frame(&state));
    state.width = 1920;

    state.height = 0;
    assert!(!nbc_video_state_has_decoded_frame(&state));
    state.height = 1080;

    state.paused = true;
    assert!(!nbc_video_state_has_decoded_frame(&state));
    state.paused = false;

    state.ready_state = 1;
    assert!(!nbc_video_state_has_decoded_frame(&state));
    state.ready_state = 2;

    state.has_video = false;
    assert!(!nbc_video_state_has_decoded_frame(&state));
    state.has_video = true;

    // Sanity check: restoring every field brings the baseline back.
    assert!(nbc_video_state_has_decoded_frame(&state));
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
use std::collections::BTreeMap;
|
||||
use std::io::{BufRead, BufReader};
|
||||
use std::path::Path;
|
||||
use std::process::{Command, Stdio};
|
||||
|
|
@ -46,6 +47,15 @@ fn blake3_hex(path: &Path) -> anyhow::Result<String> {
|
|||
Ok(blake3::hash(&bytes).to_hex().to_string())
|
||||
}
|
||||
|
||||
/// Returns `true` when `name` can be spawned as `name -version`.
///
/// Only the ability to launch the process is checked; its exit status is
/// ignored and its stdout/stderr are discarded.
fn command_available(name: &str) -> bool {
    let mut probe = Command::new(name);
    probe
        .arg("-version")
        .stdout(Stdio::null())
        .stderr(Stdio::null());

    match probe.status() {
        Ok(_exit_status) => true,
        Err(_spawn_error) => false,
    }
}
|
||||
|
||||
fn concat_init_and_segment(init: &Path, seg: &Path, out: &Path) -> anyhow::Result<()> {
|
||||
let init_bytes = std::fs::read(init)?;
|
||||
let seg_bytes = std::fs::read(seg)?;
|
||||
|
|
@ -157,11 +167,15 @@ fn write_deterministic_ts(out_path: &Path) -> anyhow::Result<()> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
fn run_ladder(ec_node: &Path, input_ts: &Path, out_dir: &Path) -> anyhow::Result<()> {
|
||||
fn run_ladder_with_identity(
|
||||
ec_node: &Path,
|
||||
input_ts: &Path,
|
||||
out_dir: &Path,
|
||||
stream_id: &str,
|
||||
broadcast_name: &str,
|
||||
) -> anyhow::Result<()> {
|
||||
let signing_key = "11".repeat(32);
|
||||
let network_secret = "22".repeat(32);
|
||||
let stream_id = "every.channel/determinism/cmaf-ladder";
|
||||
let broadcast_name = "every.channel/determinism/cmaf-ladder";
|
||||
|
||||
let mut cmd = Command::new(ec_node);
|
||||
cmd.env("EVERY_CHANNEL_MANIFEST_SIGNING_KEY", &signing_key)
|
||||
|
|
@ -210,6 +224,40 @@ fn run_ladder(ec_node: &Path, input_ts: &Path, out_dir: &Path) -> anyhow::Result
|
|||
Ok(())
|
||||
}
|
||||
|
||||
fn run_ladder(ec_node: &Path, input_ts: &Path, out_dir: &Path) -> anyhow::Result<()> {
|
||||
run_ladder_with_identity(
|
||||
ec_node,
|
||||
input_ts,
|
||||
out_dir,
|
||||
"every.channel/determinism/cmaf-ladder",
|
||||
"every.channel/determinism/cmaf-ladder",
|
||||
)
|
||||
}
|
||||
|
||||
fn ladder_artifact_hashes(root: &Path) -> BTreeMap<String, String> {
|
||||
let mut hashes = BTreeMap::new();
|
||||
for variant in ["1080p", "720p", "480p"] {
|
||||
let variant_dir = root.join("cmaf-ladder").join(variant);
|
||||
// `moq-publish --max-chunks 3` publishes init plus segments 0..=2.
|
||||
// ffmpeg can race ahead and leave an unpublished tail segment before it is killed.
|
||||
let init = variant_dir.join("init.mp4");
|
||||
assert!(init.exists(), "missing init for {variant}");
|
||||
hashes.insert(format!("{variant}/init.mp4"), blake3_hex(&init).unwrap());
|
||||
|
||||
for idx in 0..3 {
|
||||
let name = format!("segment_{idx:06}.m4s");
|
||||
let path = variant_dir.join(&name);
|
||||
assert!(path.exists(), "missing {name} for {variant}");
|
||||
hashes.insert(format!("{variant}/{name}"), blake3_hex(&path).unwrap());
|
||||
}
|
||||
}
|
||||
hashes
|
||||
}
|
||||
|
||||
fn assert_ladder_bytes_match(left: &Path, right: &Path) {
|
||||
assert_eq!(ladder_artifact_hashes(left), ladder_artifact_hashes(right));
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[ignore]
|
||||
fn deterministic_cmaf_ladder_outputs_match_across_runs() {
|
||||
|
|
@ -235,36 +283,53 @@ fn deterministic_cmaf_ladder_outputs_match_across_runs() {
|
|||
run_ladder(&ec_node, &input_ts, &run1).expect("run ladder 1");
|
||||
run_ladder(&ec_node, &input_ts, &run2).expect("run ladder 2");
|
||||
|
||||
for variant in ["1080p", "720p", "480p"] {
|
||||
let v1 = run1.join("cmaf-ladder").join(variant);
|
||||
let v2 = run2.join("cmaf-ladder").join(variant);
|
||||
assert_ladder_bytes_match(&run1, &run2);
|
||||
}
|
||||
|
||||
let init1 = v1.join("init.mp4");
|
||||
let init2 = v2.join("init.mp4");
|
||||
assert!(
|
||||
init1.exists() && init2.exists(),
|
||||
"missing init for {variant}"
|
||||
);
|
||||
assert_eq!(
|
||||
blake3_hex(&init1).unwrap(),
|
||||
blake3_hex(&init2).unwrap(),
|
||||
"init differs for {variant}"
|
||||
);
|
||||
|
||||
for idx in 0..3 {
|
||||
let s1 = v1.join(format!("segment_{idx:06}.m4s"));
|
||||
let s2 = v2.join(format!("segment_{idx:06}.m4s"));
|
||||
assert!(
|
||||
s1.exists() && s2.exists(),
|
||||
"missing segment {idx} for {variant}"
|
||||
);
|
||||
assert_eq!(
|
||||
blake3_hex(&s1).unwrap(),
|
||||
blake3_hex(&s2).unwrap(),
|
||||
"segment {idx} differs for {variant}"
|
||||
);
|
||||
}
|
||||
#[test]
fn duplicate_publishers_same_input_produce_identical_cmaf_ladder_bytes() {
    // The ladder shells out to ffmpeg; without it the test is skipped, not failed.
    if !command_available("ffmpeg") {
        eprintln!("skipping duplicate publisher CMAF ladder determinism test: ffmpeg unavailable");
        return;
    }

    let ec_node = ec_node_path();

    // Scratch directory named after the current wall-clock milliseconds.
    let stamp_ms = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap_or_default()
        .as_millis();
    let scratch =
        std::env::temp_dir().join(format!("ec-duplicate-publisher-cmaf-ladder-{stamp_ms}"));
    let _ = std::fs::create_dir_all(&scratch);

    let input_ts = scratch.join("input.ts");
    write_deterministic_ts(&input_ts).expect("write deterministic TS");

    // Both publishers start from freshly created, empty output directories.
    let out_a = scratch.join("publisher-a");
    let out_b = scratch.join("publisher-b");
    for dir in [&out_a, &out_b] {
        let _ = std::fs::remove_dir_all(dir);
    }
    for dir in [&out_a, &out_b] {
        std::fs::create_dir_all(dir).unwrap();
    }

    // Same input, different stream ids and broadcast names per publisher.
    let publishers = [
        (
            &out_a,
            "every.channel/determinism/duplicate/publisher-a/la-kcop",
            "publisher-a-la-kcop",
            "run duplicate publisher a",
        ),
        (
            &out_b,
            "every.channel/determinism/duplicate/publisher-b/la-kcop",
            "publisher-b-la-kcop",
            "run duplicate publisher b",
        ),
    ];
    for (out_dir, stream_id, broadcast_name, failure_message) in publishers {
        run_ladder_with_identity(&ec_node, &input_ts, out_dir, stream_id, broadcast_name)
            .expect(failure_message);
    }

    assert_ladder_bytes_match(&out_a, &out_b);
}
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
use std::ffi::OsStr;
|
||||
use std::process::{Command, Stdio};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
fn which(cmd: &str) -> Option<std::path::PathBuf> {
|
||||
|
|
@ -16,6 +17,24 @@ fn chrome_path() -> Option<std::path::PathBuf> {
|
|||
.or_else(|| which("chromium"))
|
||||
}
|
||||
|
||||
/// Locates the `ec-node` binary to drive from this test.
///
/// The first environment variable that is set wins, in this order:
/// `EC_NODE_BIN`, `CARGO_BIN_EXE_ec_node`, `CARGO_BIN_EXE_ec-node`.
/// If none is set, the path is derived from the running test executable: two
/// directory levels up, joined with the platform-specific binary name.
///
/// # Panics
/// Panics if the current executable path cannot be determined or has fewer
/// than two parent directories.
fn ec_node_path() -> std::path::PathBuf {
    const OVERRIDE_VARS: [&str; 3] = [
        "EC_NODE_BIN",
        "CARGO_BIN_EXE_ec_node",
        "CARGO_BIN_EXE_ec-node",
    ];

    // `var_os` rather than `var`: a path override need not be valid UTF-8, and
    // `var` would silently ignore such a value and fall through.
    for name in OVERRIDE_VARS {
        if let Some(value) = std::env::var_os(name) {
            return value.into();
        }
    }

    let exe = std::env::current_exe().expect("current_exe");
    let debug_dir = exe
        .parent()
        .and_then(|p| p.parent())
        .expect("expected target/debug/deps");
    // EXE_SUFFIX is empty on Unix and ".exe" on Windows.
    debug_dir.join(format!("ec-node{}", std::env::consts::EXE_SUFFIX))
}
|
||||
|
||||
fn wait_for_canvas_element(tab: &headless_chrome::Tab, timeout: Duration) -> anyhow::Result<()> {
|
||||
let deadline = Instant::now() + timeout;
|
||||
while Instant::now() < deadline {
|
||||
|
|
@ -46,14 +65,41 @@ fn wait_for_moq_watch_element(tab: &headless_chrome::Tab, timeout: Duration) ->
|
|||
anyhow::bail!("timed out waiting for <moq-watch> element");
|
||||
}
|
||||
|
||||
fn wait_for_live_or_archive_player(
|
||||
tab: &headless_chrome::Tab,
|
||||
timeout: Duration,
|
||||
) -> anyhow::Result<()> {
|
||||
let deadline = Instant::now() + timeout;
|
||||
while Instant::now() < deadline {
|
||||
let js = r#"(function() {
|
||||
return !!document.querySelector('moq-watch, video.archiveVideo');
|
||||
})();"#;
|
||||
let v = tab.evaluate(js, false)?;
|
||||
if v.value.and_then(|v| v.as_bool()).unwrap_or(false) {
|
||||
return Ok(());
|
||||
}
|
||||
std::thread::sleep(Duration::from_millis(200));
|
||||
}
|
||||
anyhow::bail!("timed out waiting for live or archive player");
|
||||
}
|
||||
|
||||
fn debug_player_state(tab: &headless_chrome::Tab) -> anyhow::Result<String> {
|
||||
let js = r#"(function() {
|
||||
let watch = document.querySelector('moq-watch');
|
||||
let canvas = document.querySelector('moq-watch canvas');
|
||||
let video = document.querySelector('video.archiveVideo');
|
||||
let placeholder = document.querySelector('.placeholder');
|
||||
let placeholderText = placeholder ? (placeholder.innerText || '') : null;
|
||||
let status = document.querySelector('.source-status');
|
||||
let statusText = status ? (status.innerText || '') : null;
|
||||
let statusLine = document.querySelector('#statusLine');
|
||||
let statusLineText = statusLine ? (statusLine.innerText || '') : null;
|
||||
let catalog = watch && watch.broadcast && watch.broadcast.catalog && watch.broadcast.catalog.peek
|
||||
? watch.broadcast.catalog.peek()
|
||||
: null;
|
||||
let established = watch && watch.connection && watch.connection.established && watch.connection.established.peek
|
||||
? watch.connection.established.peek()
|
||||
: null;
|
||||
let sources = Array.from(document.querySelectorAll('button[data-testid="global-watch"]')).length;
|
||||
let hint = document.querySelector('#hint');
|
||||
let hintText = hint ? (hint.innerText || '') : null;
|
||||
|
|
@ -62,8 +108,27 @@ fn debug_player_state(tab: &headless_chrome::Tab) -> anyhow::Result<String> {
|
|||
hasCanvas: !!canvas,
|
||||
canvasWidth: canvas ? canvas.width : null,
|
||||
canvasHeight: canvas ? canvas.height : null,
|
||||
hasArchiveVideo: !!video,
|
||||
videoCurrentTime: video ? video.currentTime : null,
|
||||
videoDuration: video ? video.duration : null,
|
||||
videoPaused: video ? video.paused : null,
|
||||
videoReadyState: video ? video.readyState : null,
|
||||
videoMuted: video ? video.muted : null,
|
||||
videoVolume: video ? video.volume : null,
|
||||
videoSrc: video ? (video.currentSrc || video.src || '') : null,
|
||||
muted: watch ? watch.muted : null,
|
||||
volume: watch ? watch.volume : null,
|
||||
connectionStatus: watch?.connection?.status?.peek ? watch.connection.status.peek() : null,
|
||||
connectionKind: established ? established.constructor?.name || null : null,
|
||||
broadcastStatus: watch?.broadcast?.status?.peek ? watch.broadcast.status.peek() : null,
|
||||
paused: watch?.backend?.paused?.peek ? watch.backend.paused.peek() : null,
|
||||
audioMuted: watch?.backend?.audio?.muted?.peek ? watch.backend.audio.muted.peek() : null,
|
||||
audioVolume: watch?.backend?.audio?.volume?.peek ? watch.backend.audio.volume.peek() : null,
|
||||
catalogSeen: !!catalog,
|
||||
catalogHasVideo: !!(catalog?.video?.renditions),
|
||||
catalogHasAudio: !!(catalog?.audio?.renditions),
|
||||
metrics: window.__ecPlaybackMetrics || null,
|
||||
statusLineText,
|
||||
hintText,
|
||||
placeholderText,
|
||||
statusText,
|
||||
|
|
@ -110,23 +175,120 @@ fn canvas_motion_sample(tab: &headless_chrome::Tab) -> anyhow::Result<Option<(f6
|
|||
Ok(Some((current_time, hash)))
|
||||
}
|
||||
|
||||
fn wait_for_canvas_motion(tab: &headless_chrome::Tab, timeout: Duration) -> anyhow::Result<()> {
|
||||
fn archive_video_motion_sample(
|
||||
tab: &headless_chrome::Tab,
|
||||
) -> anyhow::Result<Option<serde_json::Value>> {
|
||||
let js = r#"(function() {
|
||||
let video = document.querySelector('video.archiveVideo');
|
||||
if (!video) return null;
|
||||
if (video.paused) video.play().catch(() => {});
|
||||
return JSON.stringify({
|
||||
wallTime: performance.now() / 1000,
|
||||
currentTime: video.currentTime || 0,
|
||||
readyState: video.readyState || 0,
|
||||
paused: !!video.paused,
|
||||
ended: !!video.ended,
|
||||
muted: !!video.muted,
|
||||
volume: video.volume || 0,
|
||||
src: video.currentSrc || video.src || ''
|
||||
});
|
||||
})();"#;
|
||||
let v = tab.evaluate(js, false)?;
|
||||
let Some(s) = v.value.and_then(|v| v.as_str().map(|s| s.to_string())) else {
|
||||
return Ok(None);
|
||||
};
|
||||
Ok(Some(serde_json::from_str(&s)?))
|
||||
}
|
||||
|
||||
fn wait_for_canvas_or_archive_motion(
|
||||
tab: &headless_chrome::Tab,
|
||||
timeout: Duration,
|
||||
) -> anyhow::Result<String> {
|
||||
let deadline = Instant::now() + timeout;
|
||||
let mut first: Option<(f64, u32)> = None;
|
||||
let mut first_canvas: Option<(f64, u32)> = None;
|
||||
let mut first_video_time: Option<f64> = None;
|
||||
while Instant::now() < deadline {
|
||||
if let Some(sample) = canvas_motion_sample(tab)? {
|
||||
if let Some((first_time, first_hash)) = first {
|
||||
if let Some((first_time, first_hash)) = first_canvas {
|
||||
if sample.0 > first_time + 0.5 && sample.1 != first_hash {
|
||||
return Ok(());
|
||||
return Ok("moq-canvas".to_string());
|
||||
}
|
||||
} else {
|
||||
first = Some(sample);
|
||||
first_canvas = Some(sample);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(sample) = archive_video_motion_sample(tab)? {
|
||||
let current_time = sample
|
||||
.get("currentTime")
|
||||
.and_then(|v| v.as_f64())
|
||||
.unwrap_or_default();
|
||||
let ready_state = sample
|
||||
.get("readyState")
|
||||
.and_then(|v| v.as_u64())
|
||||
.unwrap_or_default();
|
||||
let ended = sample
|
||||
.get("ended")
|
||||
.and_then(|v| v.as_bool())
|
||||
.unwrap_or(false);
|
||||
if ready_state >= 2 && !ended {
|
||||
if let Some(first) = first_video_time {
|
||||
if current_time > first + 0.5 {
|
||||
return Ok("archive-video".to_string());
|
||||
}
|
||||
} else {
|
||||
first_video_time = Some(current_time);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::thread::sleep(Duration::from_millis(500));
|
||||
}
|
||||
let st = debug_player_state(tab).unwrap_or_default();
|
||||
anyhow::bail!("timed out waiting for changing canvas frames\nplayer_state={st}");
|
||||
anyhow::bail!("timed out waiting for live or archive motion\nplayer_state={st}");
|
||||
}
|
||||
|
||||
fn wait_for_playback_probe_ok(
|
||||
tab: &headless_chrome::Tab,
|
||||
timeout: Duration,
|
||||
) -> anyhow::Result<String> {
|
||||
let deadline = Instant::now() + timeout;
|
||||
let mut last_metrics = String::new();
|
||||
while Instant::now() < deadline {
|
||||
let js = r#"(function() {
|
||||
const metrics = window.__ecPlaybackMetrics || null;
|
||||
return metrics ? JSON.stringify(metrics) : "";
|
||||
})();"#;
|
||||
let v = tab.evaluate(js, false)?;
|
||||
last_metrics = v
|
||||
.value
|
||||
.and_then(|v| v.as_str().map(|s| s.to_string()))
|
||||
.unwrap_or_default();
|
||||
if !last_metrics.is_empty() {
|
||||
let metrics: serde_json::Value = serde_json::from_str(&last_metrics)?;
|
||||
let ok = metrics.get("ok").and_then(|v| v.as_bool()).unwrap_or(false);
|
||||
let samples = metrics
|
||||
.get("samples")
|
||||
.and_then(|v| v.as_u64())
|
||||
.unwrap_or_default();
|
||||
let changed = metrics
|
||||
.get("changed_samples")
|
||||
.and_then(|v| v.as_u64())
|
||||
.unwrap_or_default();
|
||||
let longest_static = metrics
|
||||
.get("longest_same_hash_ms")
|
||||
.and_then(|v| v.as_u64())
|
||||
.unwrap_or_default();
|
||||
if ok && samples >= 8 && changed >= 2 && longest_static < 5_000 {
|
||||
return Ok(last_metrics);
|
||||
}
|
||||
}
|
||||
std::thread::sleep(Duration::from_millis(250));
|
||||
}
|
||||
let st = debug_player_state(tab).unwrap_or_default();
|
||||
anyhow::bail!(
|
||||
"timed out waiting for playback probe ok\nplayer_state={st}\nmetrics={last_metrics}"
|
||||
);
|
||||
}
|
||||
|
||||
fn wait_for_unmuted_player(tab: &headless_chrome::Tab, timeout: Duration) -> anyhow::Result<()> {
|
||||
|
|
@ -134,7 +296,9 @@ fn wait_for_unmuted_player(tab: &headless_chrome::Tab, timeout: Duration) -> any
|
|||
while Instant::now() < deadline {
|
||||
let js = r#"(function() {
|
||||
let watch = document.querySelector('moq-watch');
|
||||
return !!watch && watch.muted === false && watch.volume > 0 && !watch.hasAttribute('muted');
|
||||
let video = document.querySelector('video.archiveVideo');
|
||||
return (!!watch && watch.muted === false && watch.volume > 0 && !watch.hasAttribute('muted')) ||
|
||||
(!!video && video.muted === false && video.volume > 0);
|
||||
})();"#;
|
||||
let v = tab.evaluate(js, false)?;
|
||||
if v.value.and_then(|v| v.as_bool()).unwrap_or(false) {
|
||||
|
|
@ -146,13 +310,21 @@ fn wait_for_unmuted_player(tab: &headless_chrome::Tab, timeout: Duration) -> any
|
|||
anyhow::bail!("timed out waiting for unmuted player\nplayer_state={st}");
|
||||
}
|
||||
|
||||
fn watch_url(site_url: &str, relay_url: &str, stream_id: &str) -> anyhow::Result<String> {
|
||||
fn watch_url(
|
||||
site_url: &str,
|
||||
relay_url: &str,
|
||||
stream_id: &str,
|
||||
verify: bool,
|
||||
) -> anyhow::Result<String> {
|
||||
let mut url = url::Url::parse(site_url)?;
|
||||
url.set_path("/watch");
|
||||
url.query_pairs_mut()
|
||||
.clear()
|
||||
.append_pair("url", relay_url)
|
||||
.append_pair("name", stream_id);
|
||||
if verify {
|
||||
url.query_pairs_mut().append_pair("verify", "1");
|
||||
}
|
||||
Ok(url.to_string())
|
||||
}
|
||||
|
||||
|
|
@ -190,23 +362,104 @@ fn e2e_remote_website_watch_existing_stream_id() -> anyhow::Result<()> {
|
|||
.unwrap();
|
||||
let browser = headless_chrome::Browser::new(launch_options)?;
|
||||
let tab = browser.new_tab()?;
|
||||
tab.navigate_to(&watch_url(&site_url, &relay_url, &stream_id)?)?;
|
||||
tab.navigate_to(&watch_url(&site_url, &relay_url, &stream_id, false)?)?;
|
||||
tab.wait_until_navigated()?;
|
||||
|
||||
// Ensure the player is instantiated.
|
||||
if let Err(err) = wait_for_moq_watch_element(&tab, Duration::from_secs(90)) {
|
||||
// Ensure either the native MoQ player or the archive live-edge fallback is instantiated.
|
||||
if let Err(err) = wait_for_live_or_archive_player(&tab, Duration::from_secs(90)) {
|
||||
let st = debug_player_state(&tab).unwrap_or_default();
|
||||
anyhow::bail!("{err}\nplayer_state={st}");
|
||||
}
|
||||
|
||||
if let Err(err) = wait_for_canvas_element(&tab, Duration::from_secs(90)) {
|
||||
let st = debug_player_state(&tab).unwrap_or_default();
|
||||
anyhow::bail!("{err}\nplayer_state={st}");
|
||||
}
|
||||
|
||||
tab.wait_for_element("moq-watch canvas")?.click()?;
|
||||
tab.evaluate(
|
||||
r#"(function() {
|
||||
const canvas = document.querySelector('moq-watch canvas');
|
||||
if (canvas) canvas.click();
|
||||
const audioButton = document.querySelector('#audioBtn');
|
||||
if (audioButton && audioButton.getAttribute('aria-pressed') !== 'true') {
|
||||
audioButton.click();
|
||||
}
|
||||
})();"#,
|
||||
false,
|
||||
)?;
|
||||
wait_for_unmuted_player(&tab, Duration::from_secs(10))?;
|
||||
wait_for_canvas_motion(&tab, Duration::from_secs(30))?;
|
||||
let playback_path = wait_for_canvas_or_archive_motion(&tab, Duration::from_secs(60))?;
|
||||
eprintln!("playback path: {playback_path}");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[ignore]
|
||||
fn e2e_remote_website_watch_synthetic_relay_stream() -> anyhow::Result<()> {
|
||||
if which("ffmpeg").is_none() {
|
||||
return Ok(()); // skip
|
||||
}
|
||||
let chrome = match chrome_path() {
|
||||
Some(p) => p,
|
||||
None => return Ok(()), // skip
|
||||
};
|
||||
|
||||
let site_url = std::env::var("EVERY_CHANNEL_SITE_URL")
|
||||
.unwrap_or_else(|_| "https://every.channel/".to_string());
|
||||
let relay_url = std::env::var("EVERY_CHANNEL_RELAY_URL")
|
||||
.unwrap_or_else(|_| "https://relay.every.channel/anon".to_string());
|
||||
let tls_disable_verify = std::env::var("EVERY_CHANNEL_RELAY_TLS_DISABLE_VERIFY")
|
||||
.map(|v| v != "0" && v.to_lowercase() != "false")
|
||||
.unwrap_or(true);
|
||||
|
||||
let ts = std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_millis();
|
||||
let stream_id = format!("e2e-synthetic-{ts}");
|
||||
let ec_node = ec_node_path();
|
||||
|
||||
let mut publisher = Command::new(&ec_node);
|
||||
publisher
|
||||
.arg("wt-publish")
|
||||
.arg("--url")
|
||||
.arg(&relay_url)
|
||||
.arg("--name")
|
||||
.arg(&stream_id)
|
||||
.arg("--realtime-input")
|
||||
.arg("--input-format")
|
||||
.arg("lavfi")
|
||||
.arg("--input")
|
||||
.arg("testsrc2=size=1280x720:rate=30")
|
||||
.stdout(Stdio::null())
|
||||
.stderr(Stdio::inherit());
|
||||
if tls_disable_verify {
|
||||
publisher.arg("--tls-disable-verify");
|
||||
}
|
||||
let mut publisher = publisher.spawn()?;
|
||||
|
||||
let test_result = (|| -> anyhow::Result<()> {
|
||||
let launch_options = headless_chrome::LaunchOptionsBuilder::default()
|
||||
.path(Some(chrome))
|
||||
.headless(true)
|
||||
.args(vec![
|
||||
OsStr::new("--autoplay-policy=no-user-gesture-required"),
|
||||
OsStr::new("--disable-application-cache"),
|
||||
OsStr::new("--disable-service-worker"),
|
||||
OsStr::new("--disk-cache-size=0"),
|
||||
OsStr::new("--mute-audio"),
|
||||
])
|
||||
.build()
|
||||
.unwrap();
|
||||
let browser = headless_chrome::Browser::new(launch_options)?;
|
||||
let tab = browser.new_tab()?;
|
||||
tab.navigate_to(&watch_url(&site_url, &relay_url, &stream_id, true)?)?;
|
||||
tab.wait_until_navigated()?;
|
||||
|
||||
wait_for_moq_watch_element(&tab, Duration::from_secs(90))?;
|
||||
wait_for_canvas_element(&tab, Duration::from_secs(90))?;
|
||||
let metrics = wait_for_playback_probe_ok(&tab, Duration::from_secs(60))?;
|
||||
eprintln!("playback metrics: {metrics}");
|
||||
Ok(())
|
||||
})();
|
||||
|
||||
let _ = publisher.kill();
|
||||
let _ = publisher.wait();
|
||||
test_result
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,334 @@
|
|||
# ECP-0156: Duplicate Publisher Deterministic Data Layer
|
||||
|
||||
Status: Draft
|
||||
|
||||
## Context
|
||||
|
||||
Two publisher nodes may broadcast the same logical channel at the same time. The archive and relay
|
||||
layers need this for resilience, but duplicate publishers currently risk looking like conflicting
|
||||
streams instead of convergent copies of the same media.
|
||||
|
||||
## Decision
|
||||
|
||||
Duplicate publishers are valid for a published channel. The data layer dedupes and verifies media by
|
||||
content identity, not by publisher envelope identity:
|
||||
|
||||
- CMAF init and media segment bytes for the same input, ladder profile, and chunk cadence must be
|
||||
byte-for-byte identical.
|
||||
- BLAKE3 media hashes and per-rung Merkle roots are the shared data identity.
|
||||
- Publisher manifests may carry different `stream_id`, `epoch_id`, `created_unix_ms`, signatures,
|
||||
locators, and manifest ids.
|
||||
- The archive must treat matching media hashes from different publishers as corroborating sources.
|
||||
- Archive records must carry source identity. Two copied buffers with the same `source_node` are not
|
||||
duplicate-publisher proof, even when their BLAKE3 hashes match.
|
||||
- Divergent hashes for the same logical channel, rendition, and media time are misses that must be
|
||||
measured before the data is promoted as redundant.
|
||||
|
||||
## Verification
|
||||
|
||||
The proof path has two stages:
|
||||
|
||||
1. Single-node duplicate-publisher tests produce the same ladder twice with different publisher
|
||||
identities and assert byte-for-byte BLAKE3 equality for every generated init and media segment.
|
||||
The `duplicate_publishers_same_input_produce_identical_cmaf_ladder_bytes` test is part of the
|
||||
default Rust test path when ffmpeg is present; it is not an ignored E2E.
|
||||
2. Production verification runs the same channel on two real publishers long enough to measure
|
||||
duplicate media convergence, hash divergence, missing objects, and backfill behavior in Grafana.
|
||||
|
||||
The goal is not just "two publishers are online." Success requires elapsed production time behind the
|
||||
numbers and dashboards that show duplicate hits, misses, and archive repair.
|
||||
|
||||
## Consequences
|
||||
|
||||
Manifest ids cannot be used as the archive dedupe key for duplicate publishers. Operators get a
|
||||
clear signal when two publishers produce identical bytes versus merely announcing the same channel.
|
||||
If encoder determinism changes, the single-node test fails before production redundancy silently
|
||||
degrades.
|
||||
|
||||
## Alternatives considered
|
||||
|
||||
- Dedupe by manifest id. This preserves envelope identity but misses the resilience property because
|
||||
duplicate publishers necessarily produce different envelopes.
|
||||
- Dedupe by logical channel and time only. This can hide encoder divergence and promote bad
|
||||
redundancy before byte-level media equality is proven.
|
||||
- Disable duplicate publishers until the scheduler is perfect. This avoids conflict handling but
|
||||
weakens live resilience and leaves the archive data layer untested.
|
||||
|
||||
## Rollout/teardown
|
||||
|
||||
Roll forward by landing the local deterministic test, adding miss/duplicate metrics to the archive
|
||||
scrape surface, then running two publishers for one logical channel in production. Roll back by
|
||||
disabling duplicate scheduling for that channel; existing content-addressed archive objects remain
|
||||
valid.
|
||||
|
||||
## Implementation notes
|
||||
|
||||
The node-agent archive scrape now exposes duplicate-source and miss gauges without placing hashes in
|
||||
labels. Per node, role, broadcast, rendition, and track it reports duplicate matching hash sources,
|
||||
duplicate hash sequences, divergent hash sequences, and missing hash records. Grafana shows those
|
||||
next to archive ladder coverage so the production duplicate-publisher run has an operator-visible
|
||||
convergence and miss signal.
|
||||
|
||||
`ec-node archive-convergence` is the primary proof surface for duplicate media identity. It compares
|
||||
named archive manifest roots directly inside the Rust node binary, groups records by logical stream,
|
||||
rendition, track, and sequence, and only returns `ok` when every expected sequence has matching
|
||||
duplicate source hashes with no missing or divergent sequence. It also requires archive records to
|
||||
carry at least two distinct `source_node` values, so mirrored global-origin manifests cannot pass as
|
||||
independent publishers. This keeps the media-data invariant in the already-shipped Rust artifact
|
||||
instead of extending the Python node-agent. Rollout gates should use
|
||||
`ec-node archive-convergence --require-ok`; the command emits the JSON report either way, but
|
||||
`--require-ok` exits non-zero unless duplicate convergence is actually proven.
|
||||
`ec-node archive-convergence --prometheus` renders the same Rust convergence report as scrapeable
|
||||
`every_channel_archive_*` gauges for duplicate source records, duplicate sequences, divergent
|
||||
sequences, source-local divergence, missing hashes, missing source identity, media timing conflicts,
|
||||
record source count, and pass/fail state. This gives Grafana a Rust-owned proof metric path while
|
||||
the older node-agent ladder metrics remain available during migration.
|
||||
`ec-node archive-convergence-serve` keeps that proof path live for Prometheus: it serves `/health`
|
||||
and `/metrics`, recomputes convergence on each scrape, and emits `scrape_ok=0` metrics instead of
|
||||
disappearing when manifests are missing or not ready. Production Grafana can therefore distinguish a
|
||||
healthy metrics target from an unproven duplicate-publisher run.
|
||||
The Nix `services.every-channel.ec-node.archive.convergence.proofs` option turns those Rust proof
|
||||
servers into named systemd units. Each proof must name at least two `NAME=PATH` sources and gets a
|
||||
dedicated listen address, so operators can add one Prometheus scrape target per duplicate channel
|
||||
without resurrecting the Python node-agent as the proof oracle.
|
||||
Forge enables an initial `la-kcop-publisher-origin` proof target on `127.0.0.1:7812` and Prometheus
|
||||
scrapes it alongside the other local every.channel targets. Until two real publisher manifest roots
|
||||
are mounted or fetched into Forge, the target intentionally uses the Forge manifest root as a
|
||||
placeholder peer and must report unproven convergence rather than green duplicate-publisher proof.
|
||||
Forge also exposes a static two-NUC `la-kcet-remote-publisher-origin` proof target once that channel
|
||||
is the live converged duplicate sample. Dynamic Headscale file-SD remains useful for discovery, but
|
||||
it can include relays and stale nodes; duplicate-publisher proof should use an explicit publisher
|
||||
pair or future scheduler group labels so unrelated agents do not turn a passing channel red.
|
||||
This static proof exports its own Rust convergence gauges rather than gating on broad legacy
|
||||
Prometheus aggregates, because older node-agent archive metrics do not yet carry enough proof-role
|
||||
labels to avoid summing stale divergence from unrelated scrape targets.
|
||||
|
||||
`ec-node archive-convergence-measure` is the primary production proof harness. It fetches named
|
||||
node-agent `/v1/archive-manifest` samples or direct manifest JSONL URLs, writes bounded temporary
|
||||
manifest roots, reuses the Rust `archive-convergence` report, and optionally queries Prometheus for
|
||||
the Grafana-facing duplicate/miss series. A production run only counts as complete when the report
|
||||
has elapsed samples, matching duplicate media hashes, zero divergent hash sequences, and live
|
||||
Prometheus series for the duplicate/miss gauges. The measurement groups records by archive record
|
||||
source identity, not by the URL used to fetch a manifest, and reports source identity failures when
|
||||
the sample is too weak to prove independent publisher data. The older
|
||||
`scripts/measure-duplicate-publishers.py` stays compatibility-only until live operators and Forge
|
||||
jobs are switched to the Rust command.
|
||||
The convergence report carries bounded divergent-sequence samples with per-source hash, byte size,
|
||||
receive time, source node/session, CAS path, and media timing when present, so a red proof is
|
||||
immediately actionable without fetching full manifests by hand.
|
||||
It also reports a non-blocking media-timing-missing count and Prometheus gauge; hash equality can
|
||||
still prove duplicate bytes, but missing timing means a divergent proof cannot yet classify whether
|
||||
the mismatch is a phase/windowing problem or an encoder byte problem.
|
||||
Publisher service builders must pass proof cadence explicitly. Both the node-agent publisher
|
||||
supervisor and Nix systemd publisher module set `--publisher-archive-segment-duration-ms` and
|
||||
`--publisher-start-boundary-ms` by default, so netbooted NUCs do not depend on stale hotpatch CLI
|
||||
defaults when aligning duplicate publisher proof windows.
|
||||
|
||||
`ec-node archive-convergence-measure-serve` turns that production proof harness into a live
|
||||
Prometheus target. Each `/metrics` scrape fetches one fresh sample from node-agent or direct JSONL
|
||||
manifest URLs, keeps a bounded in-memory sample window, and only reports measurement `ok` after the
|
||||
configured elapsed window has passed. This avoids blocking Prometheus scrapes for the measurement
|
||||
duration while still preventing two immediate samples from looking like a real production run.
|
||||
The service emits measurement-level gauges for fetch success, source record counts, invalid records,
|
||||
elapsed seconds, Prometheus series presence, and reasons, then appends the same
|
||||
`every_channel_archive_*` convergence gauges from the latest sample. The service can also read
|
||||
Prometheus file-SD JSON from Forge's Headscale node-agent discovery and turn each discovered target
|
||||
into a sampled node-agent manifest source. The Nix
|
||||
`services.every-channel.ec-node.archive.convergence.remoteProofs` option creates these remote proof
|
||||
services as systemd units from either static `NAME=URL` endpoints or dynamic file-SD inputs. Forge
|
||||
now exposes `la-kcop-remote-publisher-origin` on `127.0.0.1:7813` using the live
|
||||
`/var/lib/prometheus/every-channel-node-agents.json` inventory. It must stay red until that
|
||||
inventory contains at least two independent publisher node-agents whose `publisher.m4s` records
|
||||
converge.
|
||||
|
||||
When archive-serve ports are not reachable from the proof runner, the node-agent exposes a bounded,
|
||||
tailnet-authenticated `/v1/archive-manifest` sample endpoint. The harness can use that endpoint for
|
||||
each named publisher, compare local manifest records directly, and still require at least two elapsed
|
||||
samples before declaring success.
|
||||
|
||||
Production duplicate proof also requires archive-buffer freshness on each participating publisher.
|
||||
During mixed-generation rollouts, the current node-agent may supervise an older installed
|
||||
`archive-hot-sync` helper. The agent must probe helper flag support and omit optional arguments such
|
||||
as `--link-mode` when an older helper lacks them, because a silently failing archive-buffer sync can
|
||||
leave one publisher with healthy live streams but stale manifests.
|
||||
|
||||
The publisher buffer refresh is freshness-first: the node-managed sync must mirror full manifests
|
||||
without origin object fetch before running the slower cache fill/prune pass. This lets convergence
|
||||
checks, Grafana scrape surfaces, and demand fetch see current BLAKE3 indexes even when proactive CAS
|
||||
object backfill is still catching up.
|
||||
|
||||
`wt-archive` stamps each archive index record with `source_node` and `source_session`. The Nix
|
||||
archive launcher passes the runtime hostname as `--source-node`; explicit CLI users can override it.
|
||||
Older records without this identity continue to parse, but proof commands and production measurement
|
||||
mark them incomplete instead of accepting them as independent publisher evidence.
|
||||
|
||||
Publisher-origin proof must be captured before relay/archive mirroring can collapse source identity.
|
||||
When node-agent archive buffering is enabled, supervised `wt-publish` processes pass
|
||||
`--publisher-archive-output-dir`, `--publisher-archive-manifest-dir`, and
|
||||
`--publisher-archive-source-node`. `wt-publish` now supervises the Rust
|
||||
`publisher-proof-archive-source` worker for that archive track. The worker splits the MPEG-TS source
|
||||
by source-clock windows, fresh-encodes each bounded window with the deterministic proof profile,
|
||||
stores the resulting media fragments under `publisher.m4s` in the same CAS/index format, and stamps
|
||||
them with node-agent source identity. The relay playback encoder remains continuous for watchability,
|
||||
but it is no longer the BLAKE3 data identity for duplicate-publisher proof. The source identity is
|
||||
explicit override first, then hostname plus a short hash of machine-id, with boot-id only as a
|
||||
fallback; hostname alone is not enough because publisher images can share names like `ec-node`.
|
||||
Production duplicate verification can therefore compare `publisher.m4s` from two publisher buffers
|
||||
without treating copied relay-origin manifests as independent sources.
|
||||
|
||||
Proof tooling defaults to `publisher.m4s`. The relay video track `0.m4s` is useful playback data,
|
||||
but it is not duplicate-publisher proof: a publisher buffer may hold relay/cache records on `0.m4s`
|
||||
that have no publisher source identity. Production convergence checks that sample `0.m4s` should be
|
||||
treated as playback/archive-cache diagnostics, not byte-for-byte duplicate publisher evidence.
|
||||
|
||||
The first live publisher-origin measurements on 2026-06-08 showed correct distinct source labels but
|
||||
zero matching duplicate sequences for `la-nbc4`, `la-pbs-socal`, and `la-kcet`. The failure is
|
||||
useful: independent `wt-publish` processes currently start their fragment sequence and encoder chunk
|
||||
phase at local process start, so sequence `0` from two publishers is not necessarily the same
|
||||
broadcast moment. Duplicate-publisher proof therefore requires a shared chunk clock or
|
||||
scheduler-controlled aligned encoder phase before byte-for-byte archive convergence can pass in
|
||||
production.
|
||||
|
||||
Publisher-origin `publisher.m4s` records now require timed fMP4 fragments for global proof and map
|
||||
those fragments onto observed wall-clock epoch buckets instead of local process counters. The Rust
|
||||
writer learns track timescales from the init `moov` box, reads fragment
|
||||
`moof/traf/tfhd+tfdt` decode timestamps to reject untimed proof when possible, then assigns
|
||||
`group_sequence = observed_epoch_bucket * bucket_stride + fragment_slot`. Fragments that lack usable
|
||||
timing still fall back to the previous local counter so publishing does not fail hard on malformed
|
||||
metadata, but duplicate-publisher proof should use timed fragments. The `wt-publish` ffmpeg path
|
||||
also preserves source timestamps and uses closed-GOP, single-threaded x264 settings with forced
|
||||
keyframe cadence so independent publishers have a real chance of producing identical bytes for the
|
||||
same media time window.
|
||||
|
||||
A later live run on 2026-06-08 found a stricter local invariant that must hold before cross-publisher byte equality:
|
||||
each publisher must produce at most one hash for a given `source_node` and `group_sequence`.
|
||||
Production `publisher.m4s` samples for `la-kcop` and `la-ktla` showed multiple hashes from the same
|
||||
source in the same sequence bucket because real fMP4 fragments can arrive faster than the configured
|
||||
proof segment duration, and the writer rounded decode time into repeated buckets. The writer now
|
||||
uses a fixed per-epoch bucket stride and increments an in-bucket fragment slot when multiple timed
|
||||
fragments arrive inside the same proof duration. This keeps source-local manifests unique while
|
||||
allowing independently restarted publishers to align on the same observed wall-clock bucket.
|
||||
`ec-node archive-convergence` reports this separately as `source_local_divergent_sequences` so
|
||||
operator tooling can distinguish a self-contradicting publisher from two publishers that simply
|
||||
disagree about the same sequence.
|
||||
Because bucket-strided proof sequences intentionally leave numeric gaps, archive convergence uses
|
||||
the observed sparse sequence union for publisher-origin manifests. Dense contiguous sequence ranges
|
||||
remain available in the simulation layer when a model explicitly expects every integer sequence.
|
||||
|
||||
The 2026-06-08 live `la-kcet/publisher.m4s` sample from Forge confirmed that both publishers now
|
||||
emit distinct source identities (`ec-node-c3546fa5abc3` and `ec-node-72cf1c3aa196`) with no missing
|
||||
source identity records on the sampled publisher-origin manifests. It also confirmed the remaining
|
||||
bug: 156 shared publisher-origin sequences had zero byte-for-byte BLAKE3 matches and 156 divergent
|
||||
hashes. The next production fix must align the publisher chunk clock and encoded fMP4 byte stream,
|
||||
not merely improve scrape or Grafana plumbing.
|
||||
|
||||
After the wall-clock bucket hotpatch, the same live proof no longer has fake sparse-range missing
|
||||
IDs: `la-kcet/publisher.m4s` reported 376 observed proof sequences, zero missing source identities,
|
||||
zero source-local divergent sequences, and 234 divergent shared sequences. A byte-level sample for
|
||||
sequence `7287381184512` had different sizes, different BLAKE3 hashes, different `tfdt`
|
||||
base-media-decode-times (`210210` versus `0`), and different `mdat` payload prefixes. Across that
|
||||
sampled window there were zero common fragment hashes even when sequence IDs were ignored, proving
|
||||
that the remaining failure was independent-encoder media phase and fMP4 payload determinism, not an
|
||||
archive manifest identity bug.
|
||||
|
||||
A later `la-kcop/publisher.m4s` sample exposed a stricter live-source bug: source-window proof
|
||||
records were using unsynced MPEG-TS PCR chunk indexes as `group_sequence` when the OTA UTC clock was
|
||||
unavailable, causing restart-dependent jumps such as 93M, 135M, 341M, and 390M. The source-proof
|
||||
writer now uses the chunk UTC start only when the chopper reports synced timing, otherwise it falls
|
||||
back to the local wall-clock window start, and rewrites fMP4 `tfdt` onto that shared window before
|
||||
hashing. The live HTTP proof worker also retries transient source-open and reader failures in unbounded
|
||||
live mode, so a tuner `503` or malformed TS burst is skipped/retried instead of killing the
|
||||
publisher proof process.
|
||||
|
||||
The synced source-window clock must use the chopper's exact global chunk index, not integer UTC
|
||||
seconds. A 1001 ms proof cadence makes whole-second UTC start metadata lossy: adjacent source
|
||||
windows can share the same `utc_start_unix`, which caused one publisher to write several different
|
||||
hashes under the same source-local `group_sequence`. Synced chunks therefore use
|
||||
`ChunkTiming.chunk_index` directly; only unsynced chunks fall back to local wall-clock receipt.
|
||||
The live source-window proof writer also keeps subfragment slot allocation as stream state instead
|
||||
of per-chunk state. Real source windows can be emitted in more than one proof chunk for the same
|
||||
media timing sequence; resetting the slot counter for every chunk reused the same
|
||||
`group_sequence` and made one healthy publisher look self-divergent. The counter is bounded so the
|
||||
long-running live worker does not grow state without bound.
|
||||
|
||||
`wt-publish` now has an explicit Unix-epoch start boundary, defaulting to the publisher-origin proof
|
||||
cadence. After relay setup and immediately before spawning ffmpeg it waits until the next boundary,
|
||||
so a newly restarted duplicate publisher starts its forced-keyframe clock on the same global cadence
|
||||
as already-running publishers.
|
||||
This does not by itself prove byte equality; it removes the local-process-start phase error from the
|
||||
live publisher path and gives rollout measurement a deterministic knob (`--publisher-start-boundary-ms
|
||||
0` disables it). The live ffmpeg argument plan is factored into a Rust unit-testable helper so
|
||||
future timestamp/keyframe changes are pinned in `ec-node` instead of being inferred from node-agent
|
||||
process strings or production samples.
|
||||
|
||||
The first post-start-clock live sample still failed duplicate byte identity: both publishers landed
|
||||
in the same wall-clock proof bucket, but one fragment carried `tfdt=390390` while the other carried
|
||||
`tfdt=30030`, matching the staggered restart gap. Their `mdat` prefixes differed too, which means a
|
||||
continuous x264 encoder keeps enough local history that a later restart cannot prove byte equality
|
||||
merely by joining the same wall-clock cadence. The live profile therefore enables x264
|
||||
`stitchable=1` alongside closed GOP, no scenecut, no B-frames, no lookahead, and one thread. If that
|
||||
still does not converge in production, the next fix is a deliberately stateless per-fragment encode
|
||||
or a Rust-owned media clock/segmenter that resets encoder history at each proof boundary.
|
||||
|
||||
The follow-up production hotpatch moved the start-boundary wait to immediately before ffmpeg spawn,
|
||||
enabled `stitchable=1`, and restarted both publisher nodes in the same batch. The latest `la-kcet`
|
||||
sample still reported zero matching duplicate hashes with no missing source identity and no
|
||||
source-local divergence. A final sampled shared sequence differed by hundreds of milliseconds of
|
||||
receive time and by media size (`439737` versus `270283` bytes for the video fragment), so the
|
||||
remaining mismatch is not just MP4 timestamp metadata. Production duplicate proof now needs a
|
||||
stateless fragment boundary: either encode each proof segment from the same bounded source window
|
||||
with fresh encoder state, or make the Rust media pipeline own exact frame-window capture before
|
||||
calling ffmpeg/x264.
|
||||
|
||||
Archive manifests now carry optional fMP4 media timing for publisher-origin fragments. The
|
||||
`archive-convergence` gate treats equal archive group sequence IDs with different media sequence or
|
||||
decode-time metadata as `media_sequence_conflict`, even if the byte hash happens to match. This keeps
|
||||
production proof aligned with the Rust simulation model: a duplicate publisher only proves the same
|
||||
broadcast moment when the archive sequence and media window agree.
|
||||
|
||||
The first stateless proof primitives are now in `ec-node`. `publisher-proof-segment` takes one
|
||||
bounded MPEG-TS source-clock window, runs a fresh deterministic x264/AAC fMP4 encode, splits the
|
||||
result into init bytes and media fragments, and emits BLAKE3 hashes for each. `publisher-proof-windows`
|
||||
uses the Rust MPEG-TS source-clock splitter first, then fresh-encodes each bounded window and reports
|
||||
per-window source TS, init, and media hashes. Proof windows carry explicit MPEG-TS decoder context
|
||||
with `--preroll-packets`, defaulting to the repo-owned `WT_PUBLISH_PROOF_PREROLL_PACKETS` budget, so
|
||||
mid-GOP windows do not silently depend on best-effort decoder recovery. Focused Rust tests
|
||||
fresh-encode the same bounded input and the same finite source-window campaign twice and assert
|
||||
byte-for-byte identical proof hashes.
|
||||
|
||||
`publisher-proof-duplicates` is the single-node duplicate-publisher gate for the stateless path. It
|
||||
runs `publisher-proof-windows` independently under at least two publisher identity labels, defaults
|
||||
to `publisher-a` and `publisher-b`, and compares source TS, init, and media fragment BLAKE3 hashes
|
||||
for every source-clock window. `--require-ok` exits non-zero unless every compared window matches,
|
||||
and repeated publisher labels are rejected so the proof cannot accidentally collapse to one source
|
||||
identity. `publisher-proof-compare` is the cross-machine stateless proof gate: each publisher can run
|
||||
`publisher-proof-windows` against the same bounded source TS file locally, copy the JSON report back
|
||||
to the operator host, and compare the reports by named publisher. It rejects mismatched chunk cadence,
|
||||
missing windows, source TS hash mismatches, init hash mismatches, media fragment hash mismatches, and
|
||||
empty media windows.
|
||||
|
||||
`publisher-proof-remote-compare` is the production operator harness for that cross-machine gate. It
|
||||
copies one bounded `.ts` proof input to each named SSH target, runs `ec-node publisher-proof-windows`
|
||||
on the target, stores each returned JSON report under the local output directory, writes a
|
||||
`compare.json`, and returns the existing compare report with upload/proof timing. Remote labels use
|
||||
the same single-component validation as publisher identities, remote proof roots are constrained to
|
||||
`/tmp/every-channel-*`, and cleanup is opt-in so the generated proof files remain inspectable unless
|
||||
the operator explicitly requests removal. This keeps the live proof path in Rust without making the
|
||||
Python node-agent a new oracle. It proves the machine/runtime/compiler boundary without requiring
|
||||
the two NUCs to share a live tuner at the exact same instant.
|
||||
|
||||
`publisher-proof-archive-source` is the live archive implementation of the same proof model. It can
|
||||
read local source files directly, read plain HTTP MPEG-TS bodies directly for HDHomeRun-style
|
||||
sources, or fall back to an ffmpeg MPEG-TS copy reader for other inputs. Each emitted source-clock
|
||||
window is encoded with fresh proof state, archived as CAS-backed `publisher.m4s` records, and mapped
|
||||
to source-clock group sequences with explicit media timing metadata. A focused Rust regression now
|
||||
archives the same bounded TS input as two source nodes, then runs `archive-convergence` against the
|
||||
two manifest roots and requires full duplicate convergence with zero divergent or source-local
|
||||
divergent sequences.
|
||||
|
||||
Forge `ci-gates` now runs the `publisher_proof` and `archive_convergence` Rust filters before the
|
||||
distributed simulator campaign, so single-node byte-for-byte determinism, source-window archive
|
||||
proof semantics, and duplicate archive convergence are checked before production rollout evidence is
|
||||
considered. The next production step is to deploy the updated node binary and let fresh
|
||||
`publisher.m4s` source-window records age into the Grafana scrape window so live duplicate metrics
|
||||
can replace the older continuous-encoder divergence.
|
||||
158
evolution/proposals/ECP-0157-rust-simulation-testing.md
Normal file
158
evolution/proposals/ECP-0157-rust-simulation-testing.md
Normal file
|
|
@ -0,0 +1,158 @@
|
|||
# ECP-0157: Rust Simulation Testing
|
||||
|
||||
Status: Draft
|
||||
|
||||
## Context
|
||||
|
||||
Production is now fast enough to expose distributed bugs quickly, but it is still the wrong first
|
||||
place to discover scheduler, archive, and duplicate-publisher invariants. The Python node-agent also
|
||||
made this worse by putting core control behavior outside the already-built Rust node binary.
|
||||
|
||||
## Decision
|
||||
|
||||
Add a small deterministic simulation layer in `ec-core` and use it for distributed media invariants:
|
||||
|
||||
- `ec-node` remains the runtime owner for node behavior.
|
||||
- Tests model logical time, delayed delivery, backfill, duplicate publishers, and archive
|
||||
convergence in Rust.
|
||||
- Simulation scenarios are seed-replayable and include deterministic jitter, transient drops,
|
||||
partition windows, publisher outage/restart windows, backfill retries, and encoder drift faults.
|
||||
- A failing simulation must print or carry a replay hint so the exact schedule can be rerun.
|
||||
- Simulation reports include deterministic execution history so a failure has an ordered event trace,
|
||||
not only a final assertion.
|
||||
- Simulation campaigns run many seed schedules in one fast test and preserve the first failing seed,
|
||||
invariant report, and final state as the failure artifact.
|
||||
- Campaign execution has a reusable seeded runner so new models can share replay/failure accounting
|
||||
instead of copying bespoke loops.
|
||||
- First failures are automatically shrunk where the model supports it. For duplicate publishers the
|
||||
shrinker removes irrelevant partitions, publisher outages, timing jitter, transient drops, and
|
||||
excess media sequence range while keeping the original invariant unchanged.
|
||||
- Invariants are explicit checks, not implicit test prose: duplicate source count, missing
|
||||
sequences, divergent hashes, missing media timing, conflicting media timing, complete duplicate
|
||||
coverage, and convergence-deadline budgets.
|
||||
- Media identity is checked by BLAKE3 hashes for stream, rendition, track, sequence, profile, and
|
||||
source-material identity.
|
||||
- Media timing is part of the proof model. Matching hashes are not considered a complete duplicate
|
||||
proof unless both publishers also expose a shared logical media clock for the chunk.
|
||||
- Source-material identity is separate from stream metadata. Two publishers can advertise the same
|
||||
channel, sequence, timing, and encoder profile while still encoding different RF/source windows;
|
||||
that must fail in simulation before production archive comparisons burn wall-clock time.
|
||||
- Publisher-origin archive `group_sequence` is derived from parsed media-time identity plus stable
|
||||
track id, not local receive time. Receive time is telemetry; it is not proof that two publishers
|
||||
archived the same broadcast moment.
|
||||
- Live publisher archive proof normalizes fMP4 `tfdt` to the Unix media slot before hashing a
|
||||
fragment. The first fragment for each track anchors the process-local media clock to wall-clock
|
||||
time; later fragments preserve ffmpeg's media cadence from that origin. ffmpeg still runs with
|
||||
wall-clock timestamp input enabled where possible, but the Rust archive writer is the authority
|
||||
for the proof clock when source MPEG-TS timestamps are process-relative.
|
||||
- Archive `group_sequence` includes a stable subfragment slot inside each `(track_id,
|
||||
media_sequence)` pair, because audio can legitimately emit multiple fragments within one media
|
||||
slot and those must compare in order instead of colliding as source-local divergences.
|
||||
- Duplicate-publisher scenarios model publisher content phase separately from advertised archive
|
||||
sequence. A publisher that starts its local encoder at a different content phase must fail fast in
|
||||
simulation, because production fragments with the same local sequence are not proof of the same
|
||||
broadcast moment unless the chunk clock is shared.
|
||||
- `ec-node sim-duplicate-publishers` runs the same campaign model from the compiled Rust binary and
|
||||
emits JSON suitable for CI artifacts and rollout gates.
|
||||
- `ec-node sim-duplicate-publishers --failure-artifact <path>` writes the first failing campaign as
|
||||
a replayable JSON artifact with the shrunk scenario, invariant report, event trace, shrink steps,
|
||||
and a command hint for replaying `replay_scenario` through `--scenario-json -`.
|
||||
- `ec-node sim-duplicate-publishers --scenario-json <path-or->` replays an exact serialized
|
||||
`DuplicatePublisherScenario`, so a shrunk failure from CI or production investigation can be rerun
|
||||
without reconstructing command-line flags.
|
||||
- `ec-node sim-duplicate-publishers` can inject timing faults directly with
|
||||
`--missing-media-timing-publisher NODE` and `--publisher-media-time-offset NODE:OFFSET_MS`, so
|
||||
the current production proof class can be reproduced without hand-writing scenario JSON.
|
||||
- `ec-node sim-duplicate-publishers` and `ec-node sim-system` can inject source-window faults with
|
||||
`--publisher-source-material NODE:MATERIAL_ID`. Any campaign with multiple source-material ids
|
||||
reports source-material mismatch observations instead of leaving operators to infer that class
|
||||
from divergent hashes.
|
||||
- `ec-node archive-convergence` reads existing archive manifest JSONL and applies the same
|
||||
convergence semantics to real duplicate publisher outputs.
|
||||
- Control-plane simulation models logical nodes, seeded gossip fanout, delivery jitter, transient
|
||||
drops, node-specific partitions, node outages, duplicate deliveries, and propagation deadlines.
|
||||
- `ec-node sim-control-plane` runs the control-plane model from the compiled Rust binary and emits
|
||||
replayable JSON with the first failing seed, scenario, invariant report, and ordered trace.
|
||||
- Control-plane campaign reports track max propagation time, max delivery time, dropped messages,
|
||||
partition-delayed messages, outage-delayed messages, and duplicate messages, so prod rollout
|
||||
measurements have a fast simulation baseline.
|
||||
- System simulation composes control-plane propagation with duplicate-publisher media production.
|
||||
Control gossip produces per-publisher activation times; the media workload then proves that delayed
|
||||
schedule propagation still converges when publishers use the global media sequence clock and fails
|
||||
when they derive chunk identity from local activation time.
|
||||
- `ec-node sim-system` runs that composed workload from the deployed node binary. Its default
|
||||
campaign models the current publisher topology class and can switch `--sequence-clock` between
|
||||
`global` and `local-activation` to reproduce the exact class of duplicate-publisher phase bug
|
||||
before waiting for production samples.
|
||||
- `ec-node sim-system --fault-profile foundationdb` uses a FoundationDB-style fault profile: each
|
||||
seed generates a different but replayable cluster schedule with randomized control partitions, node
|
||||
outages, transient gossip drops, duplicate messages, media partitions, publisher outages, and
|
||||
archive backfill pressure.
|
||||
- The FoundationDB-style profile must also have an explicit negative regression for
|
||||
`local-activation` sequence clocks, so the model proves the current production failure class is
|
||||
caught in Rust before any rollout waits for live fragments.
|
||||
- `ec-node sim-system --failure-artifact <path>` writes the first failing composed system schedule
|
||||
as replayable JSON, including the exact control/media scenario, invariant report, ordered trace,
|
||||
and command hint for rerunning `--scenario-json -`.
|
||||
- System campaign reports must include fault coverage counters, not just pass/fail. A fast campaign
|
||||
is only useful if it proves that the simulated run actually exercised the failure modes operators
|
||||
care about.
|
||||
- System campaign reports also aggregate publisher phase-offset observations. A production-like
|
||||
divergence caused by local activation clocks should identify itself as a phase bug in the campaign
|
||||
JSON instead of requiring operators to infer that only from divergent hashes.
|
||||
- System campaign reports also aggregate source-material mismatch observations. A production-like
|
||||
divergence caused by independent tuner/source windows should identify itself as a source-material
|
||||
bug in the campaign JSON instead of being confused with codec nondeterminism.
|
||||
- System and duplicate-publisher reports aggregate missing media-timing records and media-timing
|
||||
conflicts, so the live failure class where fragments arrive without a usable media clock is visible
|
||||
in fast Rust simulation output.
|
||||
- FoundationDB-profile `sim-system` campaigns require that coverage by default: control transient
|
||||
drops, partition delays, node outage delays, duplicate messages, media transient drops, media
|
||||
partition delays, publisher outages, backfill, and observed convergence timing must all appear in
|
||||
the campaign report. A campaign that passes invariants but misses these classes is reported as a
|
||||
weak simulation, not a green rollout gate.
|
||||
- FoundationDB-profile coverage is breadth-gated, not only boolean-gated. By default at least
|
||||
`max(2, iterations / 32)` seeds must exercise every required distributed fault class; operators
|
||||
can raise that floor with `--min-fault-seed-coverage` for longer scientific campaigns.
|
||||
- Campaign reports track both event totals and seed counts per fault class, plus a bounded list of
|
||||
the slowest system schedules with replay hints. This makes green runs inspectable: operators can
|
||||
see how broadly the randomized schedule space was exercised and which seeds define the current
|
||||
latency tail.
|
||||
- System campaign reports also aggregate deterministic simulated convergence time and trace event
|
||||
counts. `ec-node sim-system` stamps wall-clock execution telemetry around the campaign so a run
|
||||
reports iterations per second, simulated system seconds per wall second, and trace events per
|
||||
second without putting wall-clock data into the replayed scenario itself.
|
||||
- `sim-system --failure-artifact <path>` writes an artifact for weak coverage as well as invariant
|
||||
failures, so CI can preserve evidence when a campaign was too small or too narrow to exercise the
|
||||
required distributed faults.
|
||||
- Forge `ci-gates` runs the Rust system simulator tests and a 1024-seed
|
||||
`sim-system --fault-profile foundationdb` campaign from the compiled `ec-node` binary before web
|
||||
build/deploy gates. This keeps the fast randomized check ahead of production rollout evidence.
|
||||
- Simulation failures must be actionable before any matching production rollout is considered
|
||||
healthy.
|
||||
|
||||
## Consequences
|
||||
|
||||
We get FoundationDB-style pressure in a much smaller shape: many deterministic failure schedules can
|
||||
run as normal Rust tests without booting machines. The first media model covers duplicate publisher
|
||||
convergence, network partitions, transient loss, publisher restart/backfill, convergence latency,
|
||||
encoder drift, and publisher phase alignment, and the first runtime command applies it to archive
|
||||
manifests. The first control model covers gossip propagation across relays and nodes under dropped,
|
||||
delayed, duplicated, partitioned, and outage-delayed control messages. The shrink/replay path makes
|
||||
supported failures small enough to debug before they become production event archaeology; exact
|
||||
scenario JSON is the replay contract. Later models can add tuner scheduling, relay cache eviction,
|
||||
and image rollout state machines. The composed system model is the first workload-level step: it
|
||||
checks the boundary between control-plane speed and media determinism, which is where production
|
||||
duplicate publishers are currently most fragile.
|
||||
|
||||
## Alternatives considered
|
||||
|
||||
- Keep writing production probes only. Rejected because probes prove what happened once, not what
|
||||
should happen across many fault schedules.
|
||||
- Extend the Python node-agent as the simulation oracle. Rejected because the image should get
|
||||
thinner and the runtime behavior belongs in the Rust node.
|
||||
|
||||
## Rollout/teardown
|
||||
|
||||
Roll forward by adding simulation tests next to each new distributed invariant. Roll back by keeping
|
||||
the production probes; the simulation module is library-only and has no runtime service impact.
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -7,25 +7,38 @@
|
|||
}:
|
||||
|
||||
let
|
||||
# Keep the build input stable and small; avoid copying `target/`, `tmp/`, etc. into the Nix store.
|
||||
root = ../../.;
|
||||
# Keep the build input stable and small. NixOS, infra, docs, and script-only
|
||||
# changes should not perturb the Rust source hash for config-only deploys.
|
||||
src = lib.cleanSourceWith {
|
||||
src = ../../.;
|
||||
src = root;
|
||||
filter = path: type:
|
||||
let
|
||||
base = baseNameOf path;
|
||||
rel = lib.removePrefix "${toString root}/" (toString path);
|
||||
in
|
||||
# Skip typical build outputs and large scratch dirs.
|
||||
!(base == "target"
|
||||
|| base == ".git"
|
||||
|| base == ".direnv"
|
||||
|| base == "tmp"
|
||||
|| base == "node_modules"
|
||||
|| base == "out"
|
||||
|| base == "test-results"
|
||||
|| base == "deploy"
|
||||
|| base == "intake"
|
||||
|| base == "cache"
|
||||
|| base == ".tower-minimal");
|
||||
rel == ""
|
||||
|| rel == "Cargo.toml"
|
||||
|| rel == "Cargo.lock"
|
||||
|| rel == "crates"
|
||||
|| lib.hasPrefix "crates/" rel
|
||||
|| rel == "third_party"
|
||||
|| rel == "third_party/iroh-org"
|
||||
|| rel == "third_party/iroh-org/iroh-gossip"
|
||||
|| lib.hasPrefix "third_party/iroh-org/iroh-gossip/" rel
|
||||
|| rel == "third_party/iroh-live"
|
||||
|| rel == "third_party/iroh-live/iroh-moq"
|
||||
|| lib.hasPrefix "third_party/iroh-live/iroh-moq/" rel
|
||||
|| rel == "third_party/iroh-live/web-transport-iroh"
|
||||
|| lib.hasPrefix "third_party/iroh-live/web-transport-iroh/" rel
|
||||
|| rel == "apps"
|
||||
|| rel == "apps/tauri"
|
||||
|| rel == "apps/tauri/Cargo.toml"
|
||||
|| rel == "apps/tauri/build.rs"
|
||||
|| rel == "apps/tauri/tauri.conf.json"
|
||||
|| rel == "apps/tauri/gen"
|
||||
|| lib.hasPrefix "apps/tauri/gen/" rel
|
||||
|| rel == "apps/tauri/src"
|
||||
|| lib.hasPrefix "apps/tauri/src/" rel;
|
||||
};
|
||||
in
|
||||
rustPlatform.buildRustPackage {
|
||||
|
|
@ -52,7 +65,7 @@ rustPlatform.buildRustPackage {
|
|||
doCheck = false;
|
||||
|
||||
meta = with lib; {
|
||||
description = "every.channel node runner (ingest + chunk + MoQ publish)";
|
||||
description = "every.channel node (ingest + chunk + MoQ publish)";
|
||||
mainProgram = "ec-node";
|
||||
platforms = platforms.unix;
|
||||
license = licenses.agpl3Only;
|
||||
|
|
|
|||
320
scripts/measure-duplicate-publishers-test.py
Normal file
320
scripts/measure-duplicate-publishers-test.py
Normal file
|
|
@ -0,0 +1,320 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib.util
|
||||
import json
|
||||
import sys
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Two levels up from this file's resolved path; with this file at
# scripts/measure-duplicate-publishers-test.py that is the repository root.
REPO = Path(__file__).resolve().parents[1]
|
||||
# Path of the script under test. Its hyphenated file name is not a valid
# module name, so it is loaded by path rather than with a plain `import`.
SCRIPT = REPO / "scripts" / "measure-duplicate-publishers.py"
|
||||
|
||||
|
||||
def load_module():
    """Load the script under test from ``SCRIPT`` and return it as a module.

    The script's file name contains hyphens, so it cannot be imported with a
    regular ``import`` statement. It is instead loaded by file path through
    ``importlib`` and registered in ``sys.modules`` under the name
    ``measure_duplicate_publishers``.

    Returns:
        The freshly executed module object.

    Raises:
        RuntimeError: if no import spec or loader can be built for ``SCRIPT``.
        Exception: whatever the script itself raises while executing; in that
            case the partially initialised module is removed from
            ``sys.modules`` before the exception propagates.
    """
    spec = importlib.util.spec_from_file_location("measure_duplicate_publishers", SCRIPT)
    if spec is None or spec.loader is None:
        raise RuntimeError(f"unable to load {SCRIPT}")
    module = importlib.util.module_from_spec(spec)
    # Register before executing, as in the importlib "import a source file
    # directly" recipe: code that looks its own module up through sys.modules
    # while it is being executed needs the entry to exist already.
    sys.modules[spec.name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind for later lookups.
        sys.modules.pop(spec.name, None)
        raise
    return module
|
||||
|
||||
|
||||
class MeasureDuplicatePublishersTest(unittest.TestCase):
    """Unit tests for the helpers exposed by ``measure-duplicate-publishers.py``.

    The script is loaded once per test class through ``load_module()`` and
    shared by every test via ``self.module``.
    """

    @classmethod
    def setUpClass(cls) -> None:
        """Execute the script under test once and keep the module for all tests."""
        # NOTE(review): sharing one module instance assumes the helpers under
        # test keep no mutable module-level state between calls - confirm.
        cls.module = load_module()

    def test_manifest_hash_stats_counts_duplicates_divergence_and_missing_hashes(self) -> None:
        """Stats cover a matching sequence, a divergent one, and a bare record."""
        records = [
            {"group_sequence": 10, "received_unix_ms": 1_000, "blake3": "same", "source_node": "nuc-a"},
            {"group_sequence": 10, "received_unix_ms": 1_001, "blake3": "same", "source_node": "nuc-b"},
            {"group_sequence": 11, "received_unix_ms": 2_000, "blake3": "left", "source_node": "nuc-a"},
            {"group_sequence": 11, "received_unix_ms": 2_001, "blake3": "right", "source_node": "nuc-b"},
            # No hash and no source identity on this record.
            {"group_sequence": 12, "received_unix_ms": 3_000},
        ]

        stats = self.module.manifest_hash_stats(records, invalid_lines=2)

        self.assertEqual(5, stats["record_count"])
        self.assertEqual(2, stats["invalid_lines"])
        self.assertEqual(2, stats["sequence_count"])
        self.assertEqual(2, stats["source_identity_count"])
        self.assertEqual(["nuc-a", "nuc-b"], stats["source_identities"])
        self.assertEqual(1, stats["missing_source_identity_records"])
        self.assertEqual(1, stats["duplicate_hash_source_records"])
        self.assertEqual(1, stats["duplicate_hash_sequences"])
        self.assertEqual(1, stats["hash_divergent_sequences"])
        self.assertEqual(1, stats["missing_hash_records"])
        self.assertEqual(1_000, stats["first_received_unix_ms"])
        self.assertEqual(3_000, stats["latest_received_unix_ms"])

    def test_compare_manifest_hashes_proves_byte_for_byte_matches(self) -> None:
        """Two publishers with identical hashes per sequence compare as a full match."""
        comparison = self.module.compare_manifest_hashes(
            {
                "publisher-a": [
                    {"group_sequence": 1, "blake3": "a", "source_node": "publisher-a"},
                    {"group_sequence": 2, "blake3": "b", "source_node": "publisher-a"},
                ],
                "publisher-b": [
                    {"group_sequence": 1, "blake3": "a", "source_node": "publisher-b"},
                    {"group_sequence": 2, "blake3": "b", "source_node": "publisher-b"},
                ],
            }
        )

        self.assertTrue(comparison["byte_for_byte_hash_match"])
        self.assertTrue(comparison["source_identity_ok"])
        self.assertEqual(["publisher-a", "publisher-b"], comparison["source_identities"])
        self.assertEqual(2, comparison["matching_sequence_count"])
        self.assertEqual(0, comparison["divergent_sequence_count"])
        self.assertEqual(0, comparison["missing_sequence_count"])

    def test_compare_manifest_hashes_reports_divergent_sequences(self) -> None:
        """A differing hash and an extra sequence are reported as divergent/missing."""
        comparison = self.module.compare_manifest_hashes(
            {
                "publisher-a": [
                    {"group_sequence": 1, "blake3": "a", "source_node": "publisher-a"},
                    {"group_sequence": 2, "blake3": "b", "source_node": "publisher-a"},
                ],
                "publisher-b": [
                    {"group_sequence": 1, "blake3": "a", "source_node": "publisher-b"},
                    {"group_sequence": 2, "blake3": "different", "source_node": "publisher-b"},
                    {"group_sequence": 3, "blake3": "extra", "source_node": "publisher-b"},
                ],
            }
        )

        self.assertFalse(comparison["byte_for_byte_hash_match"])
        self.assertEqual(1, comparison["matching_sequence_count"])
        self.assertEqual(1, comparison["divergent_sequence_count"])
        self.assertEqual(1, comparison["missing_sequence_count"])
        self.assertEqual(2, comparison["divergent_examples"][0]["sequence"])
        self.assertEqual(["different"], comparison["divergent_examples"][0]["hashes"]["publisher-b"])

    def test_compare_manifest_hashes_rejects_intra_manifest_divergence(self) -> None:
        """Two hashes for one sequence inside a single manifest count as divergence."""
        comparison = self.module.compare_manifest_hashes(
            {
                "publisher-a": [
                    {"group_sequence": 1, "blake3": "same", "source_node": "publisher-a"},
                ],
                "publisher-b": [
                    {"group_sequence": 1, "blake3": "same", "source_node": "publisher-b"},
                    {"group_sequence": 1, "blake3": "different", "source_node": "publisher-b"},
                ],
            }
        )

        self.assertFalse(comparison["byte_for_byte_hash_match"])
        self.assertEqual(0, comparison["matching_sequence_count"])
        self.assertEqual(1, comparison["divergent_sequence_count"])
        self.assertEqual(["different", "same"], comparison["divergent_examples"][0]["hashes"]["publisher-b"])

    def test_compare_manifest_hashes_rejects_mirrored_same_source_records(self) -> None:
        """Matching hashes that all come from one source identity are not a proof."""
        comparison = self.module.compare_manifest_hashes(
            {
                "nuc-a-buffer": [
                    {"group_sequence": 1, "blake3": "same", "source_node": "archive-origin"},
                ],
                "nuc-b-buffer": [
                    {"group_sequence": 1, "blake3": "same", "source_node": "archive-origin"},
                ],
            }
        )

        self.assertFalse(comparison["byte_for_byte_hash_match"])
        self.assertFalse(comparison["source_identity_ok"])
        self.assertEqual(["archive-origin"], comparison["source_identities"])

    def test_summary_requires_manifest_comparison_and_prometheus_series(self) -> None:
        """Two healthy samples 30 s apart with clean metrics summarise as ok."""
        summary = self.module.summarize(
            [
                {
                    "sample_unix_ms": 1_000,
                    "publishers": {
                        "a": {"health_ok": True, "metrics_ok": True, "duplicate_metrics_present": True},
                        "b": {"health_ok": True, "metrics_ok": True, "duplicate_metrics_present": True},
                    },
                    "manifest_comparison": {
                        "source_identity_ok": True,
                        "matching_sequence_count": 2,
                        "divergent_sequence_count": 0,
                        "byte_for_byte_hash_match": True,
                    },
                    "prometheus": [
                        {
                            "metric": "every_channel_ladder_archive_duplicate_hash_source_records",
                            "ok": True,
                            "series_present": True,
                            "value": 2,
                        },
                        {
                            "metric": "every_channel_ladder_archive_hash_divergent_sequences",
                            "ok": True,
                            "series_present": True,
                            "value": 0,
                        },
                    ],
                },
                {
                    "sample_unix_ms": 31_000,
                    "publishers": {
                        "a": {"health_ok": True, "metrics_ok": True, "duplicate_metrics_present": True},
                        "b": {"health_ok": True, "metrics_ok": True, "duplicate_metrics_present": True},
                    },
                    "manifest_comparison": {
                        "source_identity_ok": True,
                        "matching_sequence_count": 4,
                        "divergent_sequence_count": 0,
                        "byte_for_byte_hash_match": True,
                    },
                    "prometheus": [
                        {
                            "metric": "every_channel_ladder_archive_duplicate_hash_source_records",
                            "ok": True,
                            "series_present": True,
                            "value": 4,
                        },
                        {
                            "metric": "every_channel_ladder_archive_hash_divergent_sequences",
                            "ok": True,
                            "series_present": True,
                            "value": 0,
                        },
                    ],
                },
            ]
        )

        self.assertTrue(summary["ok"])
        self.assertEqual(30_000, summary["elapsed_ms"])
        self.assertEqual(2, summary["sample_count"])
        self.assertEqual(4, summary["latest_manifest_comparison"]["matching_sequence_count"])

    def test_summary_rejects_single_sample_and_manifest_hash_errors(self) -> None:
        """One sample with per-manifest hash problems yields the expected reasons."""
        summary = self.module.summarize(
            [
                {
                    "sample_unix_ms": 1_000,
                    "publishers": {
                        "a": {"health_ok": True, "metrics_ok": True, "duplicate_metrics_present": True},
                        "b": {"health_ok": True, "metrics_ok": True, "duplicate_metrics_present": True},
                    },
                    "manifests": {
                        "a": {
                            "ok": True,
                            "hash_divergent_sequences": 1,
                            "missing_hash_records": 1,
                            "invalid_lines": 1,
                        },
                    },
                    "manifest_comparison": {
                        "source_identity_ok": True,
                        "matching_sequence_count": 2,
                        "divergent_sequence_count": 0,
                        "byte_for_byte_hash_match": True,
                    },
                    "prometheus": [
                        {
                            "metric": "every_channel_ladder_archive_duplicate_hash_source_records",
                            "ok": True,
                            "series_present": True,
                            "value": 2,
                        },
                    ],
                },
            ]
        )

        self.assertFalse(summary["ok"])
        self.assertIn("insufficient_elapsed_samples", summary["reasons"])
        self.assertIn("manifest_hash_divergence_observed", summary["reasons"])
        self.assertIn("manifest_hash_missing_records", summary["reasons"])
        self.assertIn("manifest_invalid_lines", summary["reasons"])

    def test_summary_rejects_missing_or_non_diverse_source_identity(self) -> None:
        """Failed source-identity checks and a nonzero missing-identity metric fail."""
        summary = self.module.summarize(
            [
                {
                    "sample_unix_ms": 1_000,
                    "manifest_comparison": {
                        "source_identity_ok": False,
                        "matching_sequence_count": 2,
                        "divergent_sequence_count": 0,
                        "byte_for_byte_hash_match": False,
                    },
                },
                {
                    "sample_unix_ms": 31_000,
                    "manifest_comparison": {
                        "source_identity_ok": False,
                        "matching_sequence_count": 2,
                        "divergent_sequence_count": 0,
                        "byte_for_byte_hash_match": False,
                    },
                    "prometheus": [
                        {
                            "metric": "every_channel_archive_missing_source_identity_records",
                            "ok": True,
                            "series_present": True,
                            "value": 2,
                        },
                    ],
                },
            ]
        )

        self.assertFalse(summary["ok"])
        self.assertIn("manifest_source_identity_missing_or_not_diverse", summary["reasons"])
        self.assertIn("prometheus_source_identity_missing_nonzero", summary["reasons"])

    def test_agent_manifest_url_builds_bounded_tailnet_endpoint(self) -> None:
        """The agent URL carries broadcast, track, max_bytes and role query params."""
        url = self.module.agent_manifest_url(
            "http://100.64.0.5:7799/",
            broadcast="la-kcop",
            track="0.m4s",
            role="publisher-buffer",
            max_bytes=4096,
        )

        self.assertEqual(
            "http://100.64.0.5:7799/v1/archive-manifest?broadcast=la-kcop&track=0.m4s&max_bytes=4096&role=publisher-buffer",
            url,
        )

    def test_parser_defaults_to_publisher_origin_proof_track(self) -> None:
        """With no CLI arguments the parser selects the ``publisher.m4s`` track."""
        args = self.module.build_parser().parse_args([])

        self.assertEqual("publisher.m4s", args.track)

    def test_parse_manifest_jsonl_tolerates_partial_first_tail_line(self) -> None:
        """A truncated first line is dropped without being counted as invalid."""
        body = 'not-json-prefix{"group_sequence":1}\n{"group_sequence":2,"blake3":"b"}\n'
        records, invalid = self.module.parse_manifest_jsonl(body)

        self.assertEqual(0, invalid)
        self.assertEqual([2], [record["group_sequence"] for record in records])
|
||||
|
||||
|
||||
# Run the unittest runner when this file is executed directly as a script.
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
581
scripts/measure-duplicate-publishers.py
Executable file
581
scripts/measure-duplicate-publishers.py
Executable file
|
|
@ -0,0 +1,581 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Measure duplicate publisher media-hash convergence in production."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Callable
|
||||
|
||||
|
||||
# User-Agent header value sent with every HTTP request issued by fetch_text.
USER_AGENT = "every-channel-measure-duplicate-publishers/1"
# Metric names that are summed per broadcast through Prometheus and searched
# for (as plain substrings) in each publisher's /metrics body. Every metric is
# listed under both the "ladder_archive" and the "archive" prefix.
DUPLICATE_PROMETHEUS_METRICS = [
    "every_channel_ladder_archive_duplicate_hash_source_records",
    "every_channel_ladder_archive_duplicate_hash_sequences",
    "every_channel_ladder_archive_hash_divergent_sequences",
    "every_channel_ladder_archive_missing_hash_records",
    "every_channel_ladder_archive_missing_source_identity_records",
    "every_channel_archive_duplicate_hash_source_records",
    "every_channel_archive_duplicate_hash_sequences",
    "every_channel_archive_hash_divergent_sequences",
    "every_channel_archive_missing_hash_records",
    "every_channel_archive_missing_source_identity_records",
]
# Record keys consulted, in this order, for a non-empty source identity string.
SOURCE_IDENTITY_KEYS = ("source_node", "publisher_node", "source_id")
|
||||
|
||||
|
||||
@dataclass
class FetchResult:
    """Outcome of one HTTP fetch, with transport failures captured as data."""

    url: str  # URL that was requested.
    status: int  # HTTP status code; 0 is used when no response status was captured.
    body: str  # Response body as text; empty when the fetch failed.
    elapsed_ms: int  # Milliseconds between the start and the end of the fetch.
    error: str | None = None  # Failure description, or None when nothing went wrong.

    @property
    def ok(self) -> bool:
        """Return True when no error was recorded and the status is in 200-299."""
        return self.error is None and 200 <= self.status < 300
|
||||
|
||||
|
||||
def now_ms() -> int:
    """Return the current Unix time in whole milliseconds (truncated)."""
    seconds = time.time()
    return int(seconds * 1000)
|
||||
|
||||
|
||||
def fetch_text(url: str, timeout: float, max_bytes: int = 4 * 1024 * 1024) -> FetchResult:
    """Fetch ``url`` and return its body as text; failures are returned, not raised.

    Args:
        url: Address to request.
        timeout: Timeout in seconds handed to ``urllib.request.urlopen``.
        max_bytes: When positive, a suffix ``Range`` header requests only the
            last ``max_bytes`` bytes and at most ``max_bytes + 1`` bytes are
            read from the response. A value of zero or less sends no ``Range``
            header and reads the whole body.

    Returns:
        A ``FetchResult``. On success it carries the response status and the
        body decoded as UTF-8 with replacement of undecodable bytes. When the
        server answers with an HTTP error status, the result keeps that status
        code alongside the error text. Any other failure yields status 0.
    """
    # Local import so the block does not depend on the file's import header.
    from urllib.error import HTTPError

    started = now_ms()
    headers = {"User-Agent": USER_AGENT}
    if max_bytes > 0:
        headers["Range"] = f"bytes=-{max_bytes}"
    req = urllib.request.Request(url, headers=headers)
    try:
        with urllib.request.urlopen(req, timeout=timeout) as res:
            # Read one byte more than allowed so an oversized reply is detectable.
            body = res.read(max_bytes + 1 if max_bytes > 0 else -1)
            if max_bytes > 0 and len(body) > max_bytes:
                # Keep the last max_bytes of what was read.
                # NOTE(review): if the server ignored the Range header this is a
                # slice of the head of the file, not its tail — confirm intent.
                body = body[-max_bytes:]
            return FetchResult(url, int(res.status), body.decode("utf-8", "replace"), now_ms() - started)
    except HTTPError as err:
        # urlopen raises for non-2xx replies; keep the real status code instead
        # of collapsing it to 0 so reports can tell 404 from 503.
        message = str(err)
        status = int(err.code)
        err.close()
        return FetchResult(url, status, "", now_ms() - started, message)
    except Exception as err:  # noqa: BLE001 - measurements preserve transport failures.
        return FetchResult(url, 0, "", now_ms() - started, str(err))
|
||||
|
||||
|
||||
def fetch_json(url: str, timeout: float, max_bytes: int = 1024 * 1024) -> tuple[FetchResult, Any | None]:
    """Fetch ``url`` through ``fetch_text`` and decode the body as JSON.

    Returns:
        ``(result, value)``. ``value`` is the decoded JSON on success and
        ``None`` when the fetch failed or the body was not valid JSON; in the
        latter case ``result.error`` is set to an ``invalid json`` message.
    """
    result = fetch_text(url, timeout, max_bytes=max_bytes)
    if result.ok:
        try:
            decoded = json.loads(result.body)
        except json.JSONDecodeError as err:
            # Record the decode failure on the result so ``ok`` turns False.
            result.error = f"invalid json: {err}"
        else:
            return result, decoded
    return result, None
|
||||
|
||||
|
||||
def parse_named_url(value: str) -> tuple[str, str]:
    """Split a ``NAME=URL`` argument at its first ``=``.

    Returns:
        The whitespace-stripped ``(name, url)`` pair.

    Raises:
        ValueError: If there is no ``=`` or either side is empty after stripping.
    """
    raw_name, separator, raw_url = value.partition("=")
    name, url = raw_name.strip(), raw_url.strip()
    if not (separator and name and url):
        raise ValueError(f"expected NAME=URL: {value}")
    return name, url
|
||||
|
||||
|
||||
def manifest_url(origin: str, broadcast: str, track: str) -> str:
    """Return the ``manifests/<broadcast>/<track>.jsonl`` URL under ``origin``."""
    # Force exactly one trailing slash so urljoin keeps the last path segment.
    root = f"{origin.rstrip('/')}/"
    relative = "manifests/" + broadcast + "/" + track + ".jsonl"
    return urllib.parse.urljoin(root, relative)
|
||||
|
||||
|
||||
def parse_manifest_jsonl(body: str) -> tuple[list[dict[str, Any]], int]:
    """Parse a JSON-lines manifest body into its object records.

    Blank lines are ignored. A line that parses to something other than a JSON
    object, or that fails to parse, counts as invalid — except that a parse
    failure on the very first line is skipped silently.

    Returns:
        ``(records, invalid_line_count)``.
    """
    parsed: list[dict[str, Any]] = []
    rejected = 0
    for position, text in enumerate(body.splitlines()):
        candidate = text.strip()
        if candidate == "":
            continue
        try:
            decoded = json.loads(candidate)
        except json.JSONDecodeError:
            # Tail range reads may start in the middle of a JSON line, so a
            # broken first line is expected and not held against the manifest.
            if position != 0:
                rejected += 1
            continue
        if not isinstance(decoded, dict):
            rejected += 1
            continue
        parsed.append(decoded)
    return parsed, rejected
|
||||
|
||||
|
||||
def int_or_none(value: Any) -> int | None:
|
||||
if isinstance(value, bool):
|
||||
return None
|
||||
if isinstance(value, int):
|
||||
return value
|
||||
try:
|
||||
return int(str(value))
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
|
||||
|
||||
def record_source_identity(record: dict[str, Any]) -> str | None:
    """Return the first non-blank string found under ``SOURCE_IDENTITY_KEYS``.

    Keys are tried in the order the constant lists them; the returned value is
    stripped. ``None`` is returned when no key holds a non-blank string.
    """
    candidates = (record.get(key) for key in SOURCE_IDENTITY_KEYS)
    for candidate in candidates:
        if not isinstance(candidate, str):
            continue
        trimmed = candidate.strip()
        if trimmed:
            return trimmed
    return None
|
||||
|
||||
|
||||
def manifest_hash_stats(records: list[dict[str, Any]], invalid_lines: int = 0) -> dict[str, Any]:
    """Summarise hash and source-identity coverage for one manifest's records.

    Records without an integer ``group_sequence`` contribute only their
    ``received_unix_ms`` timestamp. ``invalid_lines`` is passed through to the
    result unchanged.
    """
    digests_per_sequence: dict[int, set[str]] = {}
    sources_per_digest: dict[int, dict[str, set[str]]] = {}
    seen_sources: set[str] = set()
    timestamps: list[int] = []
    records_without_hash = 0
    records_without_source = 0

    for entry in records:
        received = int_or_none(entry.get("received_unix_ms"))
        if received is not None:
            timestamps.append(received)

        sequence = int_or_none(entry.get("group_sequence"))
        if sequence is None:
            continue

        # Source identity is tallied before the hash check, so a record can be
        # counted as missing both its identity and its hash.
        source = record_source_identity(entry)
        if source:
            seen_sources.add(source)
        else:
            records_without_source += 1

        digest = entry.get("blake3")
        if not (isinstance(digest, str) and digest.strip()):
            records_without_hash += 1
            continue
        digest = digest.strip()

        digests_per_sequence.setdefault(sequence, set()).add(digest)
        if source:
            sources_per_digest.setdefault(sequence, {}).setdefault(digest, set()).add(source)

    # For every (sequence, digest) pair, each source beyond the first is a
    # duplicate record; a sequence counts once if any digest has 2+ sources.
    extra_source_records = 0
    sequences_with_shared_hash = 0
    for by_digest in sources_per_digest.values():
        source_counts = [len(sources) for sources in by_digest.values()]
        extra_source_records += sum(max(0, count - 1) for count in source_counts)
        if any(count > 1 for count in source_counts):
            sequences_with_shared_hash += 1

    divergent_sequences = len([digests for digests in digests_per_sequence.values() if len(digests) > 1])

    earliest = min(timestamps) if timestamps else None
    latest = max(timestamps) if timestamps else None
    return {
        "record_count": len(records),
        "invalid_lines": invalid_lines,
        "sequence_count": len(digests_per_sequence),
        "source_identity_count": len(seen_sources),
        "source_identities": sorted(seen_sources),
        "missing_source_identity_records": records_without_source,
        "duplicate_hash_source_records": extra_source_records,
        "duplicate_hash_sequences": sequences_with_shared_hash,
        "hash_divergent_sequences": divergent_sequences,
        "missing_hash_records": records_without_hash,
        "first_received_unix_ms": earliest,
        "latest_received_unix_ms": latest,
    }
|
||||
|
||||
|
||||
def first_hash_by_sequence(records: list[dict[str, Any]]) -> dict[int, str]:
    """Map each group sequence to the first non-blank ``blake3`` seen for it."""
    firsts: dict[int, str] = {}
    for entry in records:
        sequence = int_or_none(entry.get("group_sequence"))
        digest = entry.get("blake3")
        usable = sequence is not None and isinstance(digest, str) and bool(digest.strip())
        if usable and sequence not in firsts:
            # Later records for the same sequence never replace the first one.
            firsts[sequence] = digest.strip()
    return firsts
|
||||
|
||||
|
||||
def hash_sets_by_sequence(records: list[dict[str, Any]]) -> dict[int, set[str]]:
    """Collect the set of distinct stripped ``blake3`` values per group sequence.

    Records lacking an integer sequence or a non-blank string digest are ignored.
    """
    grouped: dict[int, set[str]] = {}
    for entry in records:
        sequence = int_or_none(entry.get("group_sequence"))
        digest = entry.get("blake3")
        if sequence is None:
            continue
        if not isinstance(digest, str):
            continue
        cleaned = digest.strip()
        if cleaned:
            grouped.setdefault(sequence, set()).add(cleaned)
    return grouped
|
||||
|
||||
|
||||
def compare_manifest_hashes(named_records: dict[str, list[dict[str, Any]]]) -> dict[str, Any]:
    """Regroup records by source identity and compare digests per sequence.

    Records from every input manifest are pooled and regrouped by the identity
    ``record_source_identity`` returns. A record with no identity is counted as
    missing and filed under the synthetic identity ``manifest:<manifest name>``.

    Args:
        named_records: Parsed manifest records keyed by manifest name.

    Returns:
        A dict of counts and details. A sequence is "shared" when every
        identity has at least one digest for it. A shared sequence "matches"
        only when every identity has exactly one digest and all of them are
        equal; otherwise it is divergent (up to five examples are included).
        ``byte_for_byte_hash_match`` additionally requires at least two
        identities, no missing identities and at least one shared sequence.
    """
    input_manifest_count = len(named_records)
    missing_source_identity_records = 0
    source_records: dict[str, list[dict[str, Any]]] = {}
    for manifest_name, records in named_records.items():
        for record in records:
            source_identity = record_source_identity(record)
            if source_identity is None:
                missing_source_identity_records += 1
                # Fall back to the manifest name so the record still takes part.
                source_identity = f"manifest:{manifest_name}"
            source_records.setdefault(source_identity, []).append(record)

    names = sorted(source_records)
    per_name = {name: hash_sets_by_sequence(records) for name, records in source_records.items()}
    # Union of the sequence keys across all identities; empty when there are none.
    all_sequences = sorted(set().union(*per_name.values()))
    shared_sequences = [
        sequence
        for sequence in all_sequences
        if all(sequence in per_name[name] for name in names)
    ]

    matching = 0
    divergent = 0
    examples: list[dict[str, Any]] = []
    for sequence in shared_sequences:
        values = {name: per_name[name][sequence] for name in names}
        # Only identities with a single digest can take part in a match.
        flattened = [next(iter(digests)) for digests in values.values() if len(digests) == 1]
        if len(flattened) == len(names) and len(set(flattened)) == 1:
            matching += 1
            continue
        divergent += 1
        if len(examples) < 5:
            examples.append(
                {
                    "sequence": sequence,
                    "hashes": {name: sorted(digests) for name, digests in values.items()},
                }
            )

    source_identity_ok = missing_source_identity_records == 0 and len(names) >= 2
    return {
        "publisher_count": len(names),
        "publishers": names,
        "input_manifest_count": input_manifest_count,
        "source_identity_count": len(names),
        "source_identities": names,
        "missing_source_identity_records": missing_source_identity_records,
        "source_identity_ok": source_identity_ok,
        "sequence_count": len(all_sequences),
        "shared_sequence_count": len(shared_sequences),
        "matching_sequence_count": matching,
        "divergent_sequence_count": divergent,
        "missing_sequence_count": max(0, len(all_sequences) - len(shared_sequences)),
        "divergent_examples": examples,
        "byte_for_byte_hash_match": bool(
            source_identity_ok and shared_sequences and divergent == 0 and matching == len(shared_sequences)
        ),
    }
|
||||
|
||||
|
||||
def prometheus_query_url(prometheus_url: str, expr: str) -> str:
    """Build the ``/api/v1/query`` URL for ``expr`` under ``prometheus_url``.

    Trailing whitespace and trailing slashes are removed from the base URL and
    the expression is URL-encoded as the ``query`` parameter.
    """
    base = prometheus_url.rstrip().rstrip("/")
    encoded = urllib.parse.urlencode({"query": expr})
    return f"{base}/api/v1/query?{encoded}"
|
||||
|
||||
|
||||
def prometheus_metric_sum(
    prometheus_url: str,
    metric: str,
    *,
    broadcast: str,
    timeout: float,
    fetcher: Callable[[str, float, int], FetchResult] = fetch_text,
) -> dict[str, Any]:
    """Query Prometheus for ``sum(metric{broadcast="..."})`` and report the value.

    Args:
        prometheus_url: Prometheus base URL.
        metric: Metric name to sum.
        broadcast: Value matched against the ``broadcast`` label.
        timeout: Timeout in seconds passed to ``fetcher``.
        fetcher: Callable taking ``(url, timeout, max_bytes)`` and returning a
            ``FetchResult``; defaults to ``fetch_text``.

    Returns:
        A dict with ``metric``, ``ok`` and ``value``. ``series_present`` is
        included when the reply could be read: ``False`` with ``value=None``
        for an empty result, ``True`` with a float otherwise. ``error`` is
        included when the fetch failed or the reply was malformed.
    """
    # Escape the label value so a backslash, quote or newline in the broadcast
    # name cannot terminate the PromQL string literal or change the selector.
    escaped_broadcast = (
        broadcast.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
    )
    selector = f'{metric}{{broadcast="{escaped_broadcast}"}}'
    expr = f"sum({selector})"
    fetched = fetcher(prometheus_query_url(prometheus_url, expr), timeout, 1024 * 1024)
    if not fetched.ok:
        return {"metric": metric, "ok": False, "value": None, "error": fetched.error}
    try:
        payload = json.loads(fetched.body)
        result = payload.get("data", {}).get("result", [])
        if not result:
            # The query succeeded but no series matched the selector.
            return {"metric": metric, "ok": True, "value": None, "series_present": False}
        # The sample value is read from index 1 of the first result's "value" pair.
        raw_value = result[0].get("value", [None, None])[1]
        value = float(raw_value)
    except Exception as err:  # noqa: BLE001 - preserve malformed Prometheus replies.
        return {"metric": metric, "ok": False, "value": None, "error": f"invalid prometheus response: {err}"}
    return {"metric": metric, "ok": True, "value": value, "series_present": True}
|
||||
|
||||
|
||||
def agent_manifest_url(base_url: str, *, broadcast: str, track: str, role: str, max_bytes: int) -> str:
    """Build a node-agent ``/v1/archive-manifest`` URL with an encoded query.

    The query carries ``broadcast``, ``track`` and ``max_bytes`` in that order;
    ``role`` is appended last and only when it is non-empty.
    """
    params = [
        ("broadcast", broadcast),
        ("track", track),
        ("max_bytes", str(max_bytes)),
    ]
    if role:
        params.append(("role", role))
    endpoint = base_url.rstrip("/") + "/v1/archive-manifest"
    return endpoint + "?" + urllib.parse.urlencode(params)
|
||||
|
||||
|
||||
def sample_publishers(
    publisher_urls: dict[str, str],
    *,
    timeout: float,
    fetcher: Callable[[str, float, int], FetchResult] = fetch_text,
) -> dict[str, Any]:
    """Probe ``/health`` and ``/metrics`` on every publisher node agent.

    Args:
        publisher_urls: Agent base URLs keyed by publisher name.
        timeout: Timeout in seconds passed to ``fetcher``.
        fetcher: Callable taking ``(url, timeout, max_bytes)`` and returning a
            ``FetchResult``; defaults to ``fetch_text``.

    Returns:
        One row per publisher name with fetch status and errors, the
        ``node_modes`` and ``unhealthy_processes`` lists and ``hostname`` read
        from the health JSON, and whether any name from
        ``DUPLICATE_PROMETHEUS_METRICS`` occurs in the metrics body.
    """
    out: dict[str, Any] = {}
    for name, base_url in publisher_urls.items():
        base = base_url.rstrip("/")
        health = fetcher(f"{base}/health", timeout, 1024 * 1024)
        metrics = fetcher(f"{base}/metrics", timeout, 2 * 1024 * 1024)
        row: dict[str, Any] = {
            "agent_url": base,
            "health_ok": health.ok,
            "metrics_ok": metrics.ok,
            "health_error": health.error,
            "metrics_error": metrics.error,
            "duplicate_metrics_present": False,
            "node_modes": [],
            "unhealthy_processes": [],
        }
        if health.ok:
            try:
                payload = json.loads(health.body)
            except json.JSONDecodeError:
                payload = None
            if isinstance(payload, dict):
                node_modes = payload.get("node_modes")
                unhealthy = payload.get("unhealthy_processes")
                system = payload.get("system")
                # Anything that is not of the expected container type is
                # replaced by an empty one.
                row["node_modes"] = node_modes if isinstance(node_modes, list) else []
                row["unhealthy_processes"] = unhealthy if isinstance(unhealthy, list) else []
                if not isinstance(system, dict):
                    system = {}
                row["hostname"] = system.get("hostname") or payload.get("hostname")
            else:
                # Covers undecodable bodies and valid JSON that is not an
                # object (list, string, number), which would otherwise raise
                # AttributeError on .get and abort the whole measurement.
                row["health_error"] = "invalid health json"
        if metrics.ok:
            # Plain substring search over the metrics text.
            row["duplicate_metrics_present"] = any(metric in metrics.body for metric in DUPLICATE_PROMETHEUS_METRICS)
            row["metrics_bytes"] = len(metrics.body.encode("utf-8"))
        out[name] = row
    return out
|
||||
|
||||
|
||||
def sample_once(args: argparse.Namespace) -> dict[str, Any]:
    """Collect one sample: manifests, publisher probes and Prometheus sums.

    Args:
        args: Parsed command-line namespace; the attributes read here are
            ``manifest``, ``archive_origin``, ``broadcast``, ``track``,
            ``publisher``, ``agent_manifest``, ``agent_manifest_role``,
            ``timeout``, ``max_manifest_bytes`` and ``prometheus_url``.

    Returns:
        A dict with the sample timestamp, the broadcast and track, per-publisher
        probe rows, per-manifest statistics, a cross-manifest hash comparison
        (``None`` unless at least two manifests were fetched successfully) and
        the list of Prometheus metric results.

    Raises:
        ValueError: Propagated from ``parse_named_url`` for a malformed
            ``NAME=URL`` argument.
    """
    manifests: dict[str, str] = dict(parse_named_url(item) for item in args.manifest)
    # With no explicit --manifest, fall back to a single archive-origin manifest.
    if not manifests and args.archive_origin and args.broadcast and args.track:
        manifests["archive-origin"] = manifest_url(args.archive_origin, args.broadcast, args.track)
    publisher_urls: dict[str, str] = dict(parse_named_url(item) for item in args.publisher)
    agent_manifest_urls: dict[str, str] = dict(parse_named_url(item) for item in args.agent_manifest)

    # fetched_records only holds manifests that were fetched successfully;
    # manifest_stats has a row for every manifest, failed or not.
    fetched_records: dict[str, list[dict[str, Any]]] = {}
    manifest_stats: dict[str, Any] = {}
    for name, url in manifests.items():
        fetched = fetch_text(url, args.timeout, max_bytes=args.max_manifest_bytes)
        if not fetched.ok:
            manifest_stats[name] = {"url": url, "ok": False, "error": fetched.error}
            continue
        records, invalid_lines = parse_manifest_jsonl(fetched.body)
        fetched_records[name] = records
        manifest_stats[name] = {
            "url": url,
            "ok": True,
            "fetch_elapsed_ms": fetched.elapsed_ms,
            **manifest_hash_stats(records, invalid_lines),
        }

    # Node-agent manifests share the same two dicts, so an agent entry whose
    # name equals an earlier manifest name replaces that earlier entry.
    if agent_manifest_urls and args.broadcast and args.track:
        for name, base_url in agent_manifest_urls.items():
            url = agent_manifest_url(
                base_url,
                broadcast=args.broadcast,
                track=args.track,
                role=args.agent_manifest_role,
                max_bytes=args.max_manifest_bytes,
            )
            # The JSON reply is allowed 1 MiB more than the manifest byte budget.
            fetched, payload = fetch_json(url, args.timeout, max_bytes=args.max_manifest_bytes + 1024 * 1024)
            if not fetched.ok or not isinstance(payload, dict) or payload.get("ok") is not True:
                manifest_stats[name] = {
                    "url": url,
                    "ok": False,
                    "source": "node-agent",
                    "error": fetched.error or (payload.get("error") if isinstance(payload, dict) else "invalid response"),
                }
                continue
            records = payload.get("records") if isinstance(payload.get("records"), list) else []
            # Drop any entries that are not JSON objects.
            records = [record for record in records if isinstance(record, dict)]
            fetched_records[name] = records
            invalid_lines = int_or_none(payload.get("invalid_lines")) or 0
            stats = payload.get("stats") if isinstance(payload.get("stats"), dict) else {}
            manifest_stats[name] = {
                "url": url,
                "ok": True,
                "source": "node-agent",
                "fetch_elapsed_ms": fetched.elapsed_ms,
                "role": payload.get("role"),
                "file_bytes": int_or_none(payload.get("file_bytes")),
                "partial_scan": payload.get("partial_scan") is True,
                # Statistics are recomputed locally from the returned records;
                # the agent's own figures are kept separately below.
                **manifest_hash_stats(records, invalid_lines),
                "node_agent_stats": stats,
            }

    prometheus_metrics = []
    if args.prometheus_url and args.broadcast:
        for metric in DUPLICATE_PROMETHEUS_METRICS:
            prometheus_metrics.append(
                prometheus_metric_sum(args.prometheus_url, metric, broadcast=args.broadcast, timeout=args.timeout)
            )

    # The timestamp is taken after all manifest and Prometheus fetches; the
    # publisher probes run while this dict is being built.
    return {
        "sample_unix_ms": now_ms(),
        "broadcast": args.broadcast,
        "track": args.track,
        "publishers": sample_publishers(publisher_urls, timeout=args.timeout) if publisher_urls else {},
        "manifests": manifest_stats,
        "manifest_comparison": compare_manifest_hashes(fetched_records) if len(fetched_records) >= 2 else None,
        "prometheus": prometheus_metrics,
    }
|
||||
|
||||
|
||||
def summarize(samples: list[dict[str, Any]]) -> dict[str, Any]:
    """Reduce the collected samples to a pass/fail verdict with reasons.

    Args:
        samples: Sample dicts in collection order, each shaped like the
            return value of ``sample_once``.

    Returns:
        A dict whose ``ok`` is True only when no failure reason was recorded.
        It also carries the elapsed time between the first and last sample,
        the sample count, the ordered ``reasons`` list, the most recent
        manifest comparison, the number of Prometheus rows with a series
        present and the publisher count of the last sample. With no samples
        only ``ok`` and ``reasons`` are returned.
    """
    if not samples:
        return {"ok": False, "reasons": ["no_samples"]}
    reasons: list[str] = []
    elapsed_ms = max(0, int(samples[-1]["sample_unix_ms"]) - int(samples[0]["sample_unix_ms"]))
    # A single sample, or samples with no time between them, cannot pass.
    if len(samples) < 2 or elapsed_ms <= 0:
        reasons.append("insufficient_elapsed_samples")
    # Publisher rows are flattened across every sample.
    publisher_rows = [
        row
        for sample in samples
        for row in (sample.get("publishers") or {}).values()
        if isinstance(row, dict)
    ]
    # Health must hold for every row, whereas the two metrics checks below pass
    # as soon as one row satisfies them.
    # NOTE(review): confirm the all/any asymmetry is intended.
    if publisher_rows and not all(row.get("health_ok") is True for row in publisher_rows):
        reasons.append("publisher_health_missing")
    if publisher_rows and not any(row.get("metrics_ok") is True for row in publisher_rows):
        reasons.append("publisher_metrics_missing")
    if publisher_rows and not any(row.get("duplicate_metrics_present") is True for row in publisher_rows):
        reasons.append("duplicate_metrics_not_deployed_to_publishers")
    comparisons = [
        sample.get("manifest_comparison")
        for sample in samples
        if isinstance(sample.get("manifest_comparison"), dict)
    ]
    # Only the most recent comparison is judged, and the elif chain reports at
    # most one comparison-related reason.
    latest_comparison = comparisons[-1] if comparisons else None
    if latest_comparison is None:
        reasons.append("manifest_comparison_missing")
    elif latest_comparison.get("source_identity_ok") is not True:
        reasons.append("manifest_source_identity_missing_or_not_diverse")
    elif latest_comparison.get("matching_sequence_count", 0) <= 0:
        reasons.append("no_matching_duplicate_sequences")
    elif latest_comparison.get("divergent_sequence_count", 0) > 0:
        reasons.append("duplicate_hash_divergence_observed")

    # Per-manifest rows are flattened across every sample, so a problem in any
    # sample is reported.
    manifest_rows = [
        row
        for sample in samples
        for row in (sample.get("manifests") or {}).values()
        if isinstance(row, dict)
    ]
    if manifest_rows and any(row.get("ok") is not True for row in manifest_rows):
        reasons.append("manifest_fetch_missing")
    if manifest_rows and any(int_or_none(row.get("hash_divergent_sequences")) or 0 for row in manifest_rows):
        reasons.append("manifest_hash_divergence_observed")
    if manifest_rows and any(int_or_none(row.get("missing_hash_records")) or 0 for row in manifest_rows):
        reasons.append("manifest_hash_missing_records")
    if manifest_rows and any(int_or_none(row.get("missing_source_identity_records")) or 0 for row in manifest_rows):
        reasons.append("manifest_source_identity_missing")
    if manifest_rows and any(int_or_none(row.get("invalid_lines")) or 0 for row in manifest_rows):
        reasons.append("manifest_invalid_lines")

    prom_rows = [
        row
        for sample in samples
        for row in (sample.get("prometheus") or [])
        if isinstance(row, dict)
    ]
    # Only rows whose query returned a series are considered below.
    prom_series = [row for row in prom_rows if row.get("series_present") is True]
    if prom_rows and not prom_series:
        reasons.append("prometheus_duplicate_series_missing")
    # Metrics are selected by name suffix, so both name prefixes are covered.
    divergent_values = [
        float(row.get("value") or 0)
        for row in prom_series
        if str(row.get("metric", "")).endswith("hash_divergent_sequences")
    ]
    if any(value > 0 for value in divergent_values):
        reasons.append("prometheus_hash_divergence_nonzero")
    missing_source_values = [
        float(row.get("value") or 0)
        for row in prom_series
        if str(row.get("metric", "")).endswith("missing_source_identity_records")
    ]
    if any(value > 0 for value in missing_source_values):
        reasons.append("prometheus_source_identity_missing_nonzero")
    return {
        "ok": not reasons,
        "elapsed_ms": elapsed_ms,
        "sample_count": len(samples),
        "reasons": reasons,
        "latest_manifest_comparison": latest_comparison,
        "prometheus_series_present_count": len(prom_series),
        "publisher_count": len(samples[-1].get("publishers") or {}),
    }
|
||||
|
||||
|
||||
def measure(args: argparse.Namespace) -> dict[str, Any]:
    """Sample repeatedly for the requested duration and build the report.

    At least one sample is always taken. With a non-positive
    ``args.duration_seconds`` exactly one sample is taken; otherwise sampling
    repeats every ``args.poll_interval_seconds`` until the duration has elapsed.

    Returns:
        A dict with ``started_unix_ms``, ``duration_seconds``, ``samples`` and
        the ``summary`` produced by ``summarize``.
    """
    samples: list[dict[str, Any]] = []
    started = time.monotonic()
    polling = True
    while polling:
        samples.append(sample_once(args))
        single_shot = args.duration_seconds <= 0
        if single_shot or time.monotonic() - started >= args.duration_seconds:
            polling = False
        else:
            time.sleep(args.poll_interval_seconds)

    first_sample_ms = samples[0]["sample_unix_ms"] if samples else now_ms()
    report: dict[str, Any] = {
        "started_unix_ms": first_sample_ms,
        "duration_seconds": args.duration_seconds,
        "samples": samples,
    }
    report["summary"] = summarize(samples)
    return report
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the measurement tool."""
    # Declared as a table, in the order the options are registered. The table
    # is rebuilt on every call so the list defaults are never shared.
    option_table: list[tuple[str, dict[str, Any]]] = [
        ("--publisher", {"action": "append", "default": [], "help": "Named node-agent URL, NAME=http://IP:7799."}),
        ("--manifest", {"action": "append", "default": [], "help": "Named archive JSONL URL, NAME=https://..."}),
        (
            "--agent-manifest",
            {
                "action": "append",
                "default": [],
                "help": "Named node-agent URL to sample /v1/archive-manifest from, NAME=http://IP:7799.",
            },
        ),
        ("--agent-manifest-role", {"default": "publisher-buffer"}),
        ("--archive-origin", {"default": "", "help": "Archive origin root for manifests/<broadcast>/<track>.jsonl."}),
        ("--prometheus-url", {"default": "", "help": "Prometheus base URL for Grafana-facing metrics."}),
        ("--broadcast", {"default": "", "help": "Logical broadcast name to measure."}),
        (
            "--track",
            {
                "default": "publisher.m4s",
                "help": "Track name to compare. Defaults to publisher-origin proof fragments, not relay video.",
            },
        ),
        ("--duration-seconds", {"type": float, "default": 0.0}),
        ("--poll-interval-seconds", {"type": float, "default": 30.0}),
        ("--timeout", {"type": float, "default": 10.0}),
        ("--max-manifest-bytes", {"type": int, "default": 4 * 1024 * 1024}),
        ("--pretty", {"action": "store_true"}),
        ("--require-ok", {"action": "store_true", "help": "Exit non-zero unless summary.ok is true."}),
    ]
    parser = argparse.ArgumentParser(description=__doc__)
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Run the measurement, print the JSON report and return the exit code.

    Returns:
        ``0`` on success, ``1`` when the measurement raised (the error is
        printed as JSON on stderr), and ``2`` when ``--require-ok`` was given
        and the summary is not ok.
    """
    args = build_parser().parse_args(argv)
    try:
        report = measure(args)
    except Exception as err:  # noqa: BLE001 - command-line tool should preserve exact failure.
        failure = json.dumps({"ok": False, "error": str(err)}, sort_keys=True)
        print(failure, file=sys.stderr)
        return 1

    # indent=None is json.dumps' default, which gives the compact form.
    indent = 2 if args.pretty else None
    print(json.dumps(report, indent=indent, sort_keys=True))

    if args.require_ok and not report.get("summary", {}).get("ok"):
        return 2
    return 0
|
||||
|
||||
|
||||
# Script entry point: the process exits with the status code main() returns.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Loading…
Add table
Add a link
Reference in a new issue