Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
260 changes: 260 additions & 0 deletions docs/adr/ADR-0003-full-node-detection-and-eviction.md

Large diffs are not rendered by default.

106 changes: 48 additions & 58 deletions src/replication/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,34 @@ pub const NEIGHBOR_SYNC_SCOPE: usize = 20;
/// round.
pub const NEIGHBOR_SYNC_PEER_COUNT: usize = 4;

/// Best-effort delivery retries for a fresh-replication push, per peer.
///
/// ADR-0003: on a transport/send failure the offer is retried up to this many
/// times so a transient hiccup does not silently drop it. This is delivery
/// assurance only — possession is judged separately by the delayed possession
/// check, which still penalises a close peer that lacks the chunk even if the
/// push never reached it.
pub const FRESH_REPLICATION_DELIVERY_MAX_RETRIES: u32 = 2;

const POSSESSION_CHECK_DELAY_MIN_SECS: u64 = 5 * 60;
const POSSESSION_CHECK_DELAY_MAX_SECS: u64 = 15 * 60;

/// Lower bound of the delay before a fresh-replication possession check runs
/// (ADR-0003).
///
/// The delay lets replication settle so an honest peer still mid-store is not
/// judged prematurely, and makes the check unpredictable to the peer.
pub const POSSESSION_CHECK_DELAY_MIN: Duration =
Duration::from_secs(POSSESSION_CHECK_DELAY_MIN_SECS);

/// Upper bound of the possession-check delay (ADR-0003).
pub const POSSESSION_CHECK_DELAY_MAX: Duration =
Duration::from_secs(POSSESSION_CHECK_DELAY_MAX_SECS);

// The possession probe reuses the `AuditChallenge` wire and the bandwidth-
// calibrated `audit_response_timeout(1)` deadline, so it needs no bespoke
// per-probe timeout or retry constants.

/// Width used when deciding whether this node may locally store or retain a
/// chunk.
#[must_use]
Expand Down Expand Up @@ -108,8 +136,9 @@ pub const MAX_CONCURRENT_REPLICATION_SENDS: usize = 3;
/// their disk reads don't stall replication. This caps how many run at once
/// across the engine, restoring backpressure: a peer flooding audit challenges
/// cannot fan out unbounded `get_raw` reads or multi-MiB byte serves. When the
/// cap is hit, the challenge is dropped — the auditor graces a non-response as a
/// timeout, so honest auditors are unaffected and only a flooder is throttled.
/// cap is hit, the challenge is dropped and the caller's audit-specific timeout
/// policy applies. The cap must therefore stay high enough for honest audit
/// traffic while still throttling flooders.
/// Sized to cover a handful of concurrent honest auditors (the per-peer
/// gossip-audit cooldown is 30 min, so genuine concurrent audits are few) while
/// bounding the byte round's worst-case resident bytes
Expand All @@ -120,9 +149,10 @@ pub const MAX_CONCURRENT_AUDIT_RESPONSES: usize = 16;
///
/// The global [`MAX_CONCURRENT_AUDIT_RESPONSES`] ceiling alone is not
/// flood-fair: one peer spamming challenges could occupy every slot and starve
/// honest auditors (whose dropped challenges convert to timeouts → strikes on
/// the honest peers). This per-peer cap guarantees no source holds more than
/// its share, so a flood self-throttles. Audits are cooldown-gated (one
/// honest auditors (whose dropped challenges convert to audit failures or
/// timeout verdicts on the challenged peers). This per-peer cap guarantees no
/// source holds more than its share, so a flood self-throttles. Audits are
/// cooldown-gated (one
/// gossip-triggered audit per peer per 30 min), so 2 in-flight per peer
/// comfortably covers the legitimate round-1 + round-2 overlap.
pub const MAX_AUDIT_RESPONSES_PER_PEER: u32 = 2;
Expand Down Expand Up @@ -277,23 +307,6 @@ const _: () = assert!(
"wire cap must fit at least one max-size chunk per byte-challenge response"
);

/// Rollout gate for timeout-driven eviction.
///
/// When `false`, a peer that crosses the consecutive-timeout strike threshold
/// is logged but NOT reported to the trust engine (no eviction). This PR is a
/// breaking wire change (old nodes cannot decode the new `StorageCommitment`
/// gossip), so a not-yet-upgraded peer times out on every new audit and looks
/// exactly like a non-storing peer; penalising timeouts during the mixed-version
/// window would make upgraded nodes evict every old node — a death spiral.
///
/// Confirmed storage-integrity failures (`DigestMismatch`/`KeyAbsent`/
/// `Rejected`/`MalformedResponse`) are NEVER gated by this — those only come
/// from a peer that actually answered with bad data, never an old node. Flip to
/// `true` in a small follow-up release once the fleet has upgraded. This is a
/// real `const` (not commented-out code) so both gate sites compile and stay in
/// sync, and the flip is one line.
pub const TIMEOUT_EVICTION_ENABLED: bool = false;

/// Verification request timeout (per-batch).
const VERIFICATION_REQUEST_TIMEOUT_SECS: u64 = 15;
/// Verification request timeout (per-batch).
Expand All @@ -313,28 +326,6 @@ pub const PENDING_VERIFY_MAX_AGE: Duration = Duration::from_secs(PENDING_VERIFY_
/// Trust event weight for confirmed audit failures.
pub const AUDIT_FAILURE_TRUST_WEIGHT: f64 = 5.0;

/// Consecutive audit *timeouts* a peer may accumulate before a timeout is
/// reported as an `ApplicationFailure` trust event.
///
/// The audit response timeout is an economic deterrent calibrated for
/// residential bandwidth, not a hard cryptographic bound: a single slow
/// response is routine for an honest node under transient load (GC pause,
/// disk flush, a burst of concurrent requests). Penalizing on the first
/// timeout false-positives those nodes.
///
/// Requiring `N` *consecutive* timeouts before penalizing removes that
/// false-positive while preserving the deterrent against a peer that does not
/// actually store the data and must fetch it at audit time: such a peer is
/// slow on *every* audit and accumulates a fresh strike each tick until it
/// crosses the threshold, whereas an honest node answers normally between rare
/// slow ticks and any success resets its strike counter to zero (see
/// `handle_audit_result`). The discriminator is *persistence* of slowness
/// versus *transience*. This deliberately does not widen the per-challenge
/// window. Applies ONLY to `AuditFailureReason::Timeout`; confirmed
/// storage-integrity failures (`DigestMismatch` / `KeyAbsent` / `Rejected` /
/// `MalformedResponse`) remain instantly punishable.
pub const AUDIT_TIMEOUT_STRIKE_THRESHOLD: u32 = 3;

/// Probability of launching a subtree audit when a peer's *changed* commitment
/// is ingested via gossip (ADR-0002). Keeps audits occasional surprise exams.
pub const AUDIT_ON_GOSSIP_PROBABILITY: f64 = 0.2;
Expand Down Expand Up @@ -428,6 +419,13 @@ pub struct ReplicationConfig {
/// Seconds to wait for `DhtNetworkEvent::BootstrapComplete` before
/// proceeding with bootstrap sync (covers bootstrap nodes with no peers).
pub bootstrap_complete_timeout_secs: u64,
/// Lower bound of the delay before a fresh-replication possession check
/// runs (ADR-0003). Defaults to [`POSSESSION_CHECK_DELAY_MIN`]; tests
/// shorten it so the scheduled check fires quickly.
pub possession_check_delay_min: Duration,
/// Upper bound of the possession-check delay window (ADR-0003). Defaults
/// to [`POSSESSION_CHECK_DELAY_MAX`].
pub possession_check_delay_max: Duration,
}

impl Default for ReplicationConfig {
Expand All @@ -454,6 +452,8 @@ impl Default for ReplicationConfig {
verification_request_timeout: VERIFICATION_REQUEST_TIMEOUT,
fetch_request_timeout: FETCH_REQUEST_TIMEOUT,
bootstrap_complete_timeout_secs: BOOTSTRAP_COMPLETE_TIMEOUT_SECS,
possession_check_delay_min: POSSESSION_CHECK_DELAY_MIN,
possession_check_delay_max: POSSESSION_CHECK_DELAY_MAX,
}
}
}
Expand Down Expand Up @@ -592,12 +592,10 @@ impl ReplicationConfig {
/// A relay attacker on a residential link (~5-12 MB/s) who must
/// fetch the same `k × 4 MiB` over the network sees ~10-100× higher
/// latency than disk for the data alone, plus per-chunk round-trips,
/// and misses the budget — recording a timeout strike (per
/// `handle_audit_timeout` → `handle_audit_failure`). After
/// [`AUDIT_TIMEOUT_STRIKE_THRESHOLD`] consecutive timeouts this would
/// fire an `application_failure` trust event — but note that report is
/// currently suppressed for the breaking rollout (grep
/// TIMEOUT-EVICTION-DISABLED); the strike accounting still runs.
/// and misses the budget. In the periodic responsible-chunk
/// `AuditChallenge`, prune-confirmation, and ADR-0003 possession-check paths
/// that timeout is an immediate audit failure. The heavier subtree audit
/// still graces timeouts separately.
///
/// This is an economic deterrent for the §7 relay limit calibrated
/// for residential bandwidth, NOT a hard bound: a relay on a
Expand Down Expand Up @@ -739,14 +737,6 @@ mod tests {
assert!((AUDIT_FAILURE_TRUST_WEIGHT - 5.0).abs() <= f64::EPSILON);
}

#[test]
fn audit_timeout_strike_threshold_is_three() {
// Smallest threshold that tolerates back-to-back transient slowness
// while still penalizing a persistently-slow non-storing peer within a
// few audit ticks.
assert_eq!(AUDIT_TIMEOUT_STRIKE_THRESHOLD, 3);
}

#[test]
fn replication_protocol_id_is_v2() {
// The v12 storage-bound audit changes replication SEMANTICS. The
Expand Down
59 changes: 47 additions & 12 deletions src/replication/fresh.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ use saorsa_core::P2PNode;
use tokio::sync::Semaphore;

use crate::ant_protocol::XorName;
use crate::replication::config::{ReplicationConfig, REPLICATION_PROTOCOL_ID};
use crate::replication::config::{
ReplicationConfig, FRESH_REPLICATION_DELIVERY_MAX_RETRIES, REPLICATION_PROTOCOL_ID,
};
use crate::replication::paid_list::PaidList;
use crate::replication::protocol::{
FreshReplicationOffer, PaidNotify, ReplicationMessage, ReplicationMessageBody,
Expand All @@ -36,9 +38,10 @@ pub struct FreshWriteEvent {

/// Execute fresh replication for a newly accepted record.
///
/// Sends fresh offers to close group members and `PaidNotify` to
/// `PaidCloseGroup`. Both are fire-and-forget (no ack tracking or retry per
/// Section 6.1, rule 8).
/// Sends fresh offers to close group members (with bounded delivery retries,
/// ADR-0003) and `PaidNotify` to `PaidCloseGroup`. Returns the close-group
/// peers responsible for the key (excluding self) so the caller can schedule
/// the delayed possession check; `PaidNotify` remains fire-and-forget.
///
/// The `send_semaphore` limits how many outbound chunk transfers can be
/// in-flight concurrently across the entire replication engine, preventing
Expand All @@ -51,7 +54,7 @@ pub async fn replicate_fresh(
paid_list: &Arc<PaidList>,
config: &ReplicationConfig,
send_semaphore: &Arc<Semaphore>,
) {
) -> Vec<PeerId> {
let self_id = *p2p_node.peer_id();

// Rule 6: Node that validates PoP adds K to PaidForList(self).
Expand Down Expand Up @@ -88,11 +91,15 @@ pub async fn replicate_fresh(
"Failed to encode FreshReplicationOffer for {}",
hex::encode(key),
);
return;
return Vec::new();
};
// Share one encoded copy across the per-peer send tasks so a retry only
// re-materialises the buffer for the (consuming) send call, keeping the
// common single-attempt path at one clone per peer.
let encoded = Arc::new(encoded);
for peer in &target_peers {
let p2p = Arc::clone(p2p_node);
let data = encoded.clone();
let data = Arc::clone(&encoded);
let peer_id = *peer;
let sem = Arc::clone(send_semaphore);
tokio::spawn(async move {
Expand All @@ -103,11 +110,37 @@ pub async fn replicate_fresh(
"Replication send permit acquired for peer {peer_id} ({} available)",
sem.available_permits()
);
if let Err(e) = p2p
.send_message(&peer_id, REPLICATION_PROTOCOL_ID, data, &[])
.await
{
debug!("Failed to send fresh offer to {peer_id}: {e}");
// ADR-0003: best-effort delivery. Retry the push up to
// FRESH_REPLICATION_DELIVERY_MAX_RETRIES times on a transport
// failure so a transient hiccup doesn't silently drop the offer.
// Possession is judged separately by the delayed possession check.
let mut attempt = 0u32;
loop {
match p2p
.send_message(
&peer_id,
REPLICATION_PROTOCOL_ID,
data.as_ref().clone(),
&[],
)
.await
{
Ok(()) => break,
Err(e) => {
if attempt >= FRESH_REPLICATION_DELIVERY_MAX_RETRIES {
debug!(
"Failed to send fresh offer to {peer_id} after {} attempts: {e}",
attempt + 1
);
break;
}
attempt += 1;
debug!(
"Retrying fresh offer to {peer_id} (attempt {}): {e}",
attempt + 1
);
}
}
}
});
}
Expand All @@ -122,6 +155,8 @@ pub async fn replicate_fresh(
hex::encode(key),
target_peers.len()
);

target_peers
}

/// Send `PaidNotify(K)` to every peer in `PaidCloseGroup(K)` (fire-and-forget).
Expand Down
Loading
Loading