From 4f2b965bbd2f0c011cd80f4a4baefbe434388e6d Mon Sep 17 00:00:00 2001 From: Leyla Becker Date: Tue, 17 Feb 2026 21:00:50 -0600 Subject: [PATCH] feat: refocused hyper log log paper --- ...per-logLog-tombstone-garbage-collection.md | 754 +++++------ .../hyperloglog-tombstone/simulation.ts | 1195 +++++++---------- 2 files changed, 851 insertions(+), 1098 deletions(-) diff --git a/posts/drafts/hyper-logLog-tombstone-garbage-collection.md b/posts/drafts/hyper-logLog-tombstone-garbage-collection.md index 7321444..1a61b25 100644 --- a/posts/drafts/hyper-logLog-tombstone-garbage-collection.md +++ b/posts/drafts/hyper-logLog-tombstone-garbage-collection.md @@ -2,64 +2,71 @@ ## Abstract -When synchronizing records in a distributed network, deletion presents a fundamental challenge. If nodes simply delete their local copies, other nodes may resynchronize the original data, reverting the deletion. This occurs due to non-simultaneous events between nodes or nodes temporarily disconnecting and reconnecting with outdated state. The traditional solution creates "tombstone" records that persist after deletion to prevent resurrection of deleted data. +When synchronizing records in a distributed network, deletion presents a fundamental challenge. If nodes simply delete their local copies, other nodes may resynchronize the original data, reverting the deletion. This occurs due to nodes being temporarily offline or network partitions that prevent immediate propagation of deletion events. The traditional solution creates "tombstone" records that persist after deletion to prevent resurrection of deleted data. While effective, this approach requires every node to indefinitely maintain an ever-growing collection of tombstone records. Typically, after an arbitrarily large time period, tombstones are assumed safe to clear since no rogue nodes should retain the original data. 
-This paper presents a methodology using the HyperLogLog algorithm to estimate how many nodes have received a record, comparing this estimate against the count of nodes that have received the corresponding tombstone. This enables pruning tombstones across the network to a minimal set of "keeper" nodes (typically 10-25% of participating nodes), reducing the distributed maintenance burden while maintaining safety guarantees. +This paper presents a methodology using the HyperLogLog algorithm to estimate how many nodes have received a record, comparing this estimate against the count of nodes that have received the corresponding tombstone. This enables pruning tombstones across the network to a minimal set of "keeper" nodes, reducing the distributed maintenance burden while maintaining safety guarantees. ## 1. Introduction Distributed systems face an inherent tension between data consistency and storage efficiency when handling deletions. Traditional tombstone-based approaches guarantee correctness but impose unbounded storage growth. Several approaches have been proposed to address tombstone accumulation: -**Time-based Garbage Collection**: The simplest approach sets a fixed time-to-live (TTL) for tombstones, after which they are automatically deleted[^2]. While storage-efficient, this risks data resurrection if stale nodes reconnect after the GC window. Systems like Apache Cassandra use this approach with configurable `gc_grace_seconds`[^3]. +**Time-based Garbage Collection**: The simplest approach sets a fixed time-to-live (TTL) for tombstones, after which they are automatically deleted[^2]. While storage-efficient, this risks data resurrection if offline nodes reconnect after the GC window. Systems like Apache Cassandra use this approach with configurable `gc_grace_seconds`[^3]. -**CRDT Tombstone Pruning**: Conflict-free Replicated Data Types (CRDTs) like OR-Sets accumulate tombstones proportional to the number of unique deleters[^4]. 
Various pruning strategies have been proposed, including causal stability detection[^5] and garbage collection through consensus[^6], but these typically require additional coordination or strong assumptions about network connectivity. +**CRDT Tombstone Pruning**: Conflict-free Replicated Data Types (CRDTs) like OR-Sets accumulate tombstones proportional to the number of unique deleters[^4]. Various pruning strategies have been proposed, including causal stability detection[^5] and garbage collection through consensus[^6], but these typically require additional coordination or strong assumptions about network availability. -This paper introduces a novel probabilistic approach using HyperLogLog (HLL) cardinality estimation[^1] that complements these existing techniques. Rather than replacing tombstones entirely, it minimizes the number of nodes that must retain them typically reducing keeper nodes to 10-25% of the network while maintaining safety guarantees against data resurrection. +This paper introduces a novel probabilistic approach using HyperLogLog (HLL) cardinality estimation[^1] that complements these existing techniques. Rather than replacing tombstones entirely, it minimizes the number of nodes that must retain them while maintaining safety guarantees against data resurrection from offline nodes or partitioned clusters. -[^1]: Flajolet, P., Fusy, �., Gandouet, O., & Meunier, F. (2007). "HyperLogLog: the analysis of a near-optimal cardinality estimation algorithm." *Discrete Mathematics and Theoretical Computer Science*, AH, 137-156. https://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf +[^1]: Flajolet, P., Fusy, É., Gandouet, O., & Meunier, F. (2007). "HyperLogLog: the analysis of a near-optimal cardinality estimation algorithm." *Discrete Mathematics and Theoretical Computer Science*, AH, 137-156. https://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf [^2]: Ladin, R., Liskov, B., Shrira, L., & Ghemawat, S. (1992). 
"Providing high availability using lazy replication." *ACM Transactions on Computer Systems*, 10(4), 360-391. https://doi.org/10.1145/138873.138877 [^3]: Apache Cassandra Documentation. "Configuring compaction: gc_grace_seconds." https://cassandra.apache.org/doc/latest/cassandra/operating/compaction/index.html -[^4]: Shapiro, M., Pregui�a, N., Baquero, C., & Zawirski, M. (2011). "A comprehensive study of Convergent and Commutative Replicated Data Types." *INRIA Research Report RR-7506*. https://hal.inria.fr/inria-00555588 +[^4]: Shapiro, M., Preguiça, N., Baquero, C., & Zawirski, M. (2011). "A comprehensive study of Convergent and Commutative Replicated Data Types." *INRIA Research Report RR-7506*. https://hal.inria.fr/inria-00555588 [^5]: Baquero, C., Almeida, P. S., & Shoker, A. (2017). "Pure Operation-Based Replicated Data Types." *arXiv:1710.04469*. https://arxiv.org/abs/1710.04469 [^6]: Bauwens, J., & De Meuter, W. (2020). "Memory Efficient CRDTs in Dynamic Environments." *Proceedings of the 7th Workshop on Principles and Practice of Consistency for Distributed Data (PaPoC '20)*. https://doi.org/10.1145/3380787.3393682 -### 1.1 Core Concept +### 1.1 Network Model + +This algorithm assumes a fully connected network model where: +- All online nodes can communicate with all other online nodes in the same network partition +- Nodes may go offline temporarily and rejoin later +- Network partitions may occur, isolating groups of nodes that can communicate internally but not across partition boundaries +- Partitions eventually heal, restoring full connectivity + +This model is typical of distributed systems deployed across data centers or cloud regions, where internal connectivity is reliable but cross-region links may fail. 
+ +### 1.2 Core Concept The algorithm operates in three phases: ```mermaid sequenceDiagram participant A as Node A -participant B as Node B +participant B as Node B (offline) participant C as Node C Note over A,C: Phase 1: Record Propagation -A->>B: record + recordHLL -B->>A: update recordHLL estimate -B->>C: record + recordHLL +A->>C: record + recordHLL +C->>A: update recordHLL estimate +Note over B: B receives record before going offline -Note over A,C: Phase 2: Tombstone Propagation +Note over A,C: Phase 2: Tombstone Propagation (B offline) A->>A: Create tombstone with recordHLL and delete record -C->>B: update recordHLL estimate -A->>B: tombstone + tombstoneHLL + recordHLL -B->>B: tombstone updated with new recordHLL and delete record -B->>C: tombstone + tombstoneHLL + recordHLL +A->>C: tombstone + tombstoneHLL + recordHLL +C->>C: tombstone updated, delete record +Note over B: B still has stale record -Note over A,C: Phase 3: Keeper Election and tombstone garbage collection -C->>C: tombstoneCount >= recordCount, become keeper and deletes record -C->>B: updates with node tombstone count estimate -B->>B: sees higher estimate, step down and garbage collects its own tombstone record -B->>A: update connected node with tombstoneHLL -A->>A: garbage collects its own tombstone record +Note over A,C: Phase 3: B comes back online +B->>B: Reconnects to network +C->>B: tombstone propagates to B +B->>B: Receives tombstone, deletes record, keeper election occurs ``` -**Phase 1**: Records propagate through the network via gossip, with each node adding itself to the record's HLL. Nodes then talk between themselves to slowly turn local estimates for the records count into global ones. +**Phase 1**: Records propagate through the network via gossip, with each node adding itself to the record's HLL. Since all nodes can communicate, propagation is rapid across online nodes. 
-**Phase 2**: When deletion occurs, the deleting node creates a tombstone containing a copy of the record's HLL as the target count. The tombstone propagates similarly, with nodes adding themselves to the tombstone's HLL. During propagation, the target recordHLL is updated to the highest estimate encountered. +**Phase 2**: When deletion occurs, the deleting node creates a tombstone containing a copy of the record's HLL as the target count. The tombstone propagates to all online nodes. Offline nodes or partitioned clusters retain the original record. -**Phase 3**: When a node detects that `tombstoneCount >= recordCount`, it becomes a "keeper" responsible for continued propagation. As keepers communicate, those with lower estimates step down and garbage collect, converging toward a minimal keeper set. +**Phase 3**: When offline nodes reconnect or partitions heal, they receive the tombstone and delete their stale records. Keeper election converges toward a minimal set of nodes maintaining the tombstone. ## 2. Data Model @@ -179,7 +186,7 @@ H -->|No| I[Remain keeper:
update tombstone] E -->|No| J{Does my tombstone
count reach target?} J -->|Yes| K[Become keeper:
store tombstone] J -->|No| L[Store tombstone
but not keeper yet] -G --> M[Forward tombstone to peers] +G --> M[Forward tombstone to reachable nodes] I --> M K --> M L --> M @@ -253,22 +260,23 @@ const receiveTombstone = ( ### 3.5 Cascading Step-Down via Forwarding -When a keeper steps down, it immediately forwards the tombstone to all connected peers, creating a cascade effect that rapidly eliminates redundant keepers: +When a keeper steps down, it immediately forwards the tombstone to all reachable nodes, creating a cascade effect that rapidly eliminates redundant keepers: ```ts -const forwardTombstoneToAllPeers = ( +const forwardTombstoneToAllReachable = ( network: NetworkState, forwardingNodeId: string, tombstone: Tombstone, - excludePeerId?: string + excludeNodeId?: string ): NetworkState => { const forwardingNode = network.nodes.get(forwardingNodeId); - if (!forwardingNode) return network; + if (!forwardingNode || !forwardingNode.isOnline) return network; let newNodes = new Map(network.nodes); + const reachable = getReachableNodes(network, forwardingNodeId); - for (const peerId of forwardingNode.peerIds) { - if (peerId === excludePeerId) continue; + for (const peerId of reachable) { + if (peerId === excludeNodeId) continue; const peer = newNodes.get(peerId); if (!peer || !peer.records.has(tombstone.id)) continue; @@ -278,7 +286,7 @@ const forwardTombstoneToAllPeers = ( // If this peer also stepped down, recursively forward if (!updatedPeer.tombstones.has(tombstone.id) && peer.tombstones.has(tombstone.id)) { - const result = forwardTombstoneToAllPeers({ nodes: newNodes }, peerId, tombstone, forwardingNodeId); + const result = forwardTombstoneToAllReachable({ nodes: newNodes }, peerId, tombstone, forwardingNodeId); newNodes = new Map(result.nodes); } } @@ -297,7 +305,7 @@ Without a shared target count, each node would compare against its own local rec A fixed originator-as-keeper design creates a single point of failure. 
If the originator goes offline, tombstone propagation halts and records may resurrect when stale nodes reconnect. -Dynamic election allows any node to become a keeper when it detects `tombstoneCount >= recordCount`. This ensures tombstone propagation continues regardless of which specific node initiated the deletion. +Dynamic election allows any node to become a keeper when it detects `tombstoneCount >= recordCount`. This ensures tombstone propagation continues regardless of which specific node initiated the deletion or which nodes are currently online. ### 4.3 Why Keeper Step-Down? @@ -311,7 +319,7 @@ subgraph Keeper Convergence Over Time T0["t=0: 0 keepers"] T1["t=1: 5 keepers
(first nodes to detect threshold)"] T2["t=2: 3 keepers
(2 stepped down after seeing higher estimates)"] -T3["t=3: 1-2 keepers
(most informed nodes remain)"] +T3["t=3: 1 keeper
(converged to single keeper in connected network)"] end T0 --> T1 --> T2 --> T3 ``` @@ -320,11 +328,11 @@ T0 --> T1 --> T2 --> T3 When HLL estimates converge (all nodes have similar tombstoneHLL values due to full propagation), no node can have a strictly higher estimate. Without a tie-breaker, keepers with equal estimates would never step down. -The lexicographic node ID comparison ensures deterministic convergence: when two keepers with equal estimates communicate, the one with the higher node ID steps down. This guarantees eventual convergence to a single keeper per connected component. +The lexicographic node ID comparison ensures deterministic convergence: when two keepers with equal estimates communicate, the one with the higher node ID steps down. This guarantees eventual convergence to a single keeper in a fully connected network. ### 4.5 Why Forward on Step-Down? -Without forwarding, keepers only step down when randomly selected for gossip - a slow process. With aggressive forwarding, a stepping-down keeper immediately propagates the "winning" tombstone to all neighbors, creating a cascade effect that rapidly eliminates redundant keepers. +Without forwarding, keepers only step down when randomly selected for gossip - a slow process. With aggressive forwarding, a stepping-down keeper immediately propagates the "winning" tombstone to all reachable nodes, creating a cascade effect that rapidly eliminates redundant keepers. ## 5. Evaluation @@ -332,45 +340,17 @@ Without forwarding, keepers only step down when randomly selected for gossip - a We implemented a discrete-event simulation to evaluate the algorithm under various network conditions. Each test scenario was executed 50 times to obtain statistically reliable averages. 
The simulation models: -- **Gossip protocol**: Each round, every node with a record or tombstone randomly selects one peer and exchanges state +- **Network model**: Fully connected network where all online nodes in the same partition can communicate +- **Gossip protocol**: Each round, every online node with a record or tombstone randomly selects one reachable peer and exchanges state - **HLL precision**: 10 bits (1024 registers, ~1KB per HLL) - **Convergence criteria**: Records deleted, followed by 100 additional rounds for keeper convergence - **Trials**: 50 independent runs per scenario, with results averaged ### 5.2 Test Scenarios -#### 5.2.1 Single Node Deletion +#### 5.2.1 Single Node Deletion (Baseline) -**Scenario**: A single node creates a record, propagates it through gossip, then initiates deletion. - -```mermaid -graph TD -subgraph Network Topology 15 nodes 40 percent connectivity -N0((node-0
originator)) -N1((node-1)) -N2((node-2)) -N3((node-3)) -N4((node-4)) -N5((node-5)) -N6((node-6)) -N7((node-7)) -N0 --- N1 -N0 --- N3 -N1 --- N2 -N1 --- N4 -N2 --- N5 -N3 --- N4 -N3 --- N6 -N4 --- N5 -N5 --- N7 -N6 --- N7 -end -``` - -**Protocol**: -1. Node-0 creates record and propagates for 20 rounds -2. Node-0 creates tombstone and initiates deletion -3. Simulation runs until convergence +**Scenario**: A single node creates a record, propagates it through gossip to all nodes, then initiates deletion. **Results** (averaged over 50 trials): @@ -380,114 +360,115 @@ end | Records deleted | 100% success | | Rounds to delete records | 10 | | Total rounds (including convergence) | 120 | -| Final tombstones | 115 (~15.3% of nodes) | +| Final tombstones | 50 (1 per trial, ~6.7% of nodes) | -**Analysis**: Record deletion completes rapidly (10 rounds). Tombstone keeper count converges to approximately 2-3 keepers per trial, demonstrating effective garbage collection while maintaining redundancy. +**Analysis**: In a fully connected network, record deletion completes rapidly (10 rounds). Tombstones converge to a single keeper per trial, demonstrating optimal garbage collection in ideal conditions. -#### 5.2.2 Early Tombstone Creation +#### 5.2.2 Node Offline During Tombstone -**Scenario**: Tombstone created before record fully propagates, testing the algorithm's handling of partial record distribution. +**Scenario**: One node goes offline before tombstone creation, then reconnects after tombstone has propagated to all online nodes. 
```mermaid sequenceDiagram -participant N0 as Node-0 -participant N1 as Node-1 -participant N2 as Node-2 -participant Nx as Nodes 3-19 +participant Online as Online Nodes +participant Offline as Node-5 (offline) -Note over N0,Nx: Record only partially propagated -N0->>N1: record (round 1) -N1->>N2: record (round 2) -N2->>N0: record (round 3) +Note over Online,Offline: Record propagated to all nodes +Online->>Offline: record received -Note over N0: Create tombstone after only 3 rounds -N0->>N1: tombstone -N1->>N2: tombstone -Note over Nx: Most nodes never receive record +Note over Offline: Node-5 goes offline +Offline--xOnline: disconnected + +Note over Online: Tombstone created and propagates +Online->>Online: tombstone spreads to all online nodes + +Note over Offline: Node-5 reconnects +Offline->>Online: reconnected +Online->>Offline: tombstone propagates to Node-5 +Offline->>Offline: deletes stale record ``` **Results** (averaged over 50 trials): | Metric | Value | |--------|-------| -| Nodes | 20 per trial (1000 total) | +| Nodes | 15 per trial (750 total) | | Records deleted | 100% success | -| Rounds to delete records | 10 | -| Total rounds | 120 | -| Final tombstones | 124 (~12.4% of nodes) | +| Rounds to delete records | 60 (includes offline period) | +| Total rounds | 170 | +| Final tombstones | 50 (1 per trial) | -**Analysis**: Even with partial record propagation, the algorithm correctly handles deletion. The propagated recordHLL accurately captures the distribution, updating as the tombstone encounters nodes with more complete views. Tombstones converge to nodes that actually received the record. +**Analysis**: The algorithm correctly handles nodes that are offline during tombstone creation. When the offline node reconnects, it receives the tombstone and deletes its stale record. The keeper count remains optimal at 1. 
-#### 5.2.3 Bridged Network (Two Clusters) +#### 5.2.3 Multiple Nodes Offline -**Scenario**: Two densely-connected clusters joined by a single bridge node, simulating common real-world topologies. +**Scenario**: Four nodes go offline before tombstone creation, then reconnect one by one. + +**Results** (averaged over 50 trials): + +| Metric | Value | +|--------|-------| +| Nodes | 20 per trial (1000 total) | +| Offline nodes | 4 | +| Records deleted | 100% success | +| Rounds to delete records | 120 | +| Total rounds | 230 | +| Final tombstones | 50 (1 per trial) | + +**Analysis**: Even with 20% of nodes offline, the algorithm successfully propagates tombstones to all nodes once they reconnect. The staggered reconnection does not cause issues. + +#### 5.2.4 Network Partition and Heal + +**Scenario**: Network splits into two equal partitions after record propagation. Tombstone created in partition A. After 80 rounds, partition heals. ```mermaid -graph TD -subgraph Cluster A 15 nodes -A0((A-0
bridge)) -A1((A-1)) -A2((A-2)) -A3((A-3)) -A0 --- A1 -A0 --- A2 -A1 --- A2 -A1 --- A3 -A2 --- A3 -end +sequenceDiagram +participant A as Partition A (10 nodes) +participant B as Partition B (10 nodes) -subgraph Cluster B 15 nodes -B0((B-0
bridge)) -B1((B-1)) -B2((B-2)) -B3((B-3)) -B0 --- B1 -B0 --- B2 -B1 --- B2 -B1 --- B3 -B2 --- B3 -end +Note over A,B: Record propagated to all 20 nodes -A0 ===|single bridge| B0 +Note over A,B: Network partitions +A--xB: partition + +Note over A: Tombstone created in partition A +A->>A: tombstone propagates within A +Note over B: Partition B still has record + +Note over A,B: Partition heals after 80 rounds +A->>B: tombstone propagates to B +B->>B: records deleted, keepers converge ``` **Results** (averaged over 50 trials): -| Metric | Cluster A | Cluster B | Total | -|--------|-----------|-----------|-------| -| Nodes | 15 per trial (750 total) | 15 per trial (750 total) | 30 per trial (1500 total) | +| Metric | Partition A | Partition B | Total | +|--------|-------------|-------------|-------| +| Nodes | 10 per trial (500 total) | 10 per trial (500 total) | 20 per trial (1000 total) | | Records deleted | 100% success | 100% success | 100% success | -| Rounds to delete | - | - | 17 | -| Final tombstones | 137 (~18.3%) | 92 (~12.3%) | 229 (~15.3%) | +| Rounds to delete | - | - | 90 | +| Final tombstones | 50 (1 per trial) | 0 | 50 | -**Analysis**: The single-bridge topology creates a natural partition point. Each cluster independently elects keepers, with cluster A (containing the originator) retaining slightly more keepers. This provides fault tolerance - if the bridge fails, each cluster retains tombstones independently. +**Analysis**: During partition, only partition A processes the tombstone. Upon healing, partition B rapidly receives the tombstone and deletes its records. Final keeper distribution shows all keepers in partition A (where tombstone originated), demonstrating efficient consolidation. -#### 5.2.4 Concurrent Tombstones +#### 5.2.5 Cluster Separation -**Scenario**: Multiple nodes simultaneously initiate deletion of the same record, simulating concurrent delete operations. 
+**Scenario**: A 5-node cluster becomes isolated from the main 20-node network. Tombstone created in main network, cluster isolated for 150 rounds, then rejoins. -```mermaid -sequenceDiagram -participant N0 as Node-0 -participant N5 as Node-5 -participant N10 as Node-10 -participant Others as Other Nodes +**Results** (averaged over 50 trials): -Note over N0,Others: Record fully propagated (30 rounds) +| Metric | Main (20 nodes) | Isolated (5 nodes) | Total | +|--------|-----------------|-------------------|-------| +| Nodes | 20 per trial (1000 total) | 5 per trial (250 total) | 25 per trial (1250 total) | +| Records deleted | 100% success | 100% success | 100% success | +| Rounds to delete | - | - | 160 | +| Final tombstones | 50 (1 per trial) | 0 | 50 | -par Concurrent deletion -N0->>N0: Create tombstone -N5->>N5: Create tombstone -N10->>N10: Create tombstone -end +**Analysis**: Extended isolation (150 rounds) does not prevent successful deletion. When the isolated cluster rejoins, it receives the tombstone and deletes stale records. Keepers consolidate to the main partition. -Note over N0,Others: Three tombstones propagate and merge -N0->>Others: tombstone (from N0) -N5->>Others: tombstone (from N5) -N10->>Others: tombstone (from N10) +#### 5.2.6 Concurrent Tombstones -Note over N0,Others: HLLs merge, keepers converge -``` +**Scenario**: Three nodes simultaneously initiate deletion of the same record. **Results** (averaged over 50 trials): @@ -497,83 +478,62 @@ Note over N0,Others: HLLs merge, keepers converge | Concurrent deleters | 3 | | Records deleted | 100% success | | Rounds to delete | 10 | -| Final tombstones | 131 (~13.1% of nodes) | +| Final tombstones | 50 (1 per trial) | -**Analysis**: The algorithm handles concurrent tombstone creation gracefully. Multiple tombstones merge via HLL union operations, and keeper election converges as normal. 
The keeper percentage is slightly lower than single-deleter baseline (~13% vs ~15%), likely due to faster HLL convergence from multiple sources. +**Analysis**: The algorithm handles concurrent tombstone creation gracefully. Multiple tombstones merge via HLL union operations, and keeper election converges to a single keeper as normal. -#### 5.2.5 Network Partition and Heal +#### 5.2.7 Staggered Node Recovery -**Scenario**: Network partitions after record propagation, tombstone created in one partition, then network heals. - -```mermaid -sequenceDiagram -participant CA as Cluster A -participant Bridge as Bridge -participant CB as Cluster B - -Note over CA,CB: Phase 1: Record propagates to all nodes -CA->>Bridge: record -Bridge->>CB: record - -Note over CA,CB: Phase 2: Network partitions -Bridge--xCB: connection lost - -Note over CA: Cluster A creates tombstone -CA->>CA: tombstone propagates within A -Note over CB: Cluster B still has record - -Note over CA,CB: Phase 3: Network heals -Bridge->>CB: tombstone propagates to B -CB->>CB: record deleted, keepers elected -``` +**Scenario**: Four nodes go offline. After tombstone creation, nodes reconnect at staggered intervals (20 rounds apart). 
**Results** (averaged over 50 trials): -| Metric | Cluster A | Cluster B | Total | -|--------|-----------|-----------|-------| -| Nodes | 10 per trial (500 total) | 10 per trial (500 total) | 20 per trial (1000 total) | -| Records deleted | 100% success | 100% success | 100% success | -| Rounds to delete | - | - | 16 | -| Total rounds (partition + heal) | - | - | 717 | -| Final tombstones | 104 (~20.8%) | 52 (~10.4%) | 156 (~15.6%) | +| Metric | Value | +|--------|-------| +| Nodes | 20 per trial (1000 total) | +| Offline nodes | 4 | +| Records deleted | 100% success | +| Rounds to delete records | 130 | +| Total rounds | 240 | +| Final tombstones | 50 (1 per trial) | -**Analysis**: The extended total rounds (717) includes the partition period where only Cluster A processes the tombstone. Cluster A retains more keepers (~21%) since it processes the tombstone during partition without cross-cluster communication. Upon healing, Cluster B rapidly receives the tombstone and converges to fewer keepers (~10%). Each cluster maintains independent keepers, providing partition tolerance. -#### 5.2.6 Dynamic Topology +**Analysis**: Staggered recovery is handled correctly. Each reconnecting node receives the tombstone from online nodes and deletes its stale record. -**Scenario**: Network connections randomly change during both tombstone propagation and garbage collection phases, simulating real-world network churn where peer relationships are not static. +#### 5.2.8 Origin Node Goes Offline -```mermaid -sequenceDiagram -participant N0 as Node-0 -participant N1 as Node-1 -participant N2 as Node-2 -participant N3 as Node-3 +**Scenario**: The node that creates the tombstone immediately goes offline. Other nodes must propagate the tombstone without the originator. 
-Note over N0,N3: Initial topology established
-N0->>N1: connected
-N1->>N2: connected
-N2->>N3: connected
+**Results** (averaged over 50 trials):

-Note over N0,N3: Tombstone propagation begins
-N0->>N1: tombstone
+| Metric | Value |
+|--------|-------|
+| Nodes | 15 per trial (750 total) |
+| Records deleted | 100% success |
+| Rounds to delete | 0 (immediate after gossip starts) |
+| Total rounds | 170 (includes originator offline period) |
+| Final tombstones | 51 (~1 per trial) |

-Note over N0,N3: Topology change: N1-N2 disconnects, N0-N3 connects
-N1--xN2: disconnected
-N0->>N3: new connection
+**Analysis**: The algorithm continues to function even when the tombstone originator immediately goes offline. Other nodes that received the tombstone before the originator went offline continue propagation. This demonstrates the fault-tolerant design.

-Note over N0,N3: Propagation continues on new topology
-N0->>N3: tombstone via new path
-N3->>N2: tombstone
+#### 5.2.9 Flapping Node

-Note over N0,N3: Topology continues changing during GC convergence
-```
+**Scenario**: One node repeatedly toggles between online and offline states (every 5 rounds) during tombstone propagation.

-**Protocol**:
-1. Create 20-node network with 30% initial connectivity
-2. Propagate record for 10 rounds
-3. Create tombstone and begin propagation
-4. Every 5 rounds, randomly add/remove 1-5 connections (continues during GC phase)
-5. Run until convergence
+**Results** (averaged over 50 trials):

+| Metric | Value |
+|--------|-------|
+| Nodes | 15 per trial (750 total) |
+| Records deleted | 100% success |
+| Rounds to delete | 8 |
+| Total rounds | 123 |
+| Final tombstones | 50 (1 per trial) |

+**Analysis**: Flapping nodes do not disrupt the algorithm. When online, the node participates in gossip; when offline, it's simply skipped. The tombstone eventually reaches the flapping node during one of its online periods. 
+ +#### 5.2.10 Partition During Keeper Election + +**Scenario**: Network partitions after tombstone has started propagating but before keeper election completes. Each partition independently runs keeper election, then partitions heal. **Results** (averaged over 50 trials): @@ -581,211 +541,184 @@ Note over N0,N3: Topology continues changing during GC convergence |--------|-------| | Nodes | 20 per trial (1000 total) | | Records deleted | 100% success | -| Rounds to delete records | 10 | -| Total rounds | 115 | -| Final tombstones | 126 (~12.6% of nodes) | +| Rounds to delete | 115 | +| Total rounds | 225 | +| Final tombstones | 50 (1 per trial) | -**Analysis**: Despite continuous topology changes throughout both deletion and garbage collection phases, the algorithm maintains correct behavior. The dynamic nature of connections does not prevent tombstone propagation or keeper convergence. Keeper percentage is actually lower than static networks (~12.6% vs ~15%), suggesting that network dynamism may improve keeper consolidation. - -#### 5.2.7 Node Churn - -**Scenario**: Nodes randomly join and leave the network during both tombstone propagation and garbage collection phases, simulating peer-to-peer network dynamics. - -```mermaid -sequenceDiagram -participant N0 as Node-0 (stable) -participant N5 as Node-5 -participant Nnew as New Node -participant Network as Network - -Note over N0,Network: Record propagated, tombstone created -N0->>N5: tombstone - -Note over N0,Network: Node-5 leaves network -N5--xNetwork: disconnected & removed - -Note over N0,Network: New node joins -Nnew->>Network: joins with 2-4 connections - -Note over N0,Network: Tombstone continues propagating -N0->>Nnew: tombstone (new node has no record) -Note over Nnew: Ignores tombstone (no matching record) - -Note over N0,Network: Churn continues during GC convergence -``` - -**Protocol**: -1. Create 20-node network with 40% connectivity -2. Propagate record for 15 rounds -3. 
Create tombstone and begin propagation -4. Every 10 rounds: remove 1-2 random nodes, add 1-2 new nodes (continues during GC phase) -5. New nodes connect to 2-4 random existing nodes -6. Run until convergence - -**Results** (averaged over 50 trials): - -| Metric | Value | -|--------|-------| -| Initial nodes | 20 per trial (1000 total) | -| Records deleted | 100% success | -| Rounds to delete records | 9 | -| Total rounds | 114 | -| Final tombstones | 84 (~8.4% of nodes) | - -**Analysis**: Node churn actually accelerates deletion (9 rounds vs. typical 10) because departing nodes that held records effectively "delete" them. New nodes that never received the original record correctly ignore tombstones. The keeper percentage (~8.4%) is notably lower than static networks, as some keepers may depart during the GC phase and remaining keepers consolidate more aggressively when the network topology continues to evolve. - -#### 5.2.8 Random Configuration Changes - -**Scenario**: Mixed workload with simultaneous record additions, connection changes, and disconnections during both tombstone propagation and garbage collection phases. - -```mermaid -graph TD -subgraph "Configuration Changes During Propagation and GC" -A[Tombstone Created] --> B{Every 8 rounds} -B --> C[30%: Add new unrelated record] -B --> D[30%: Add new peer connection] -B --> E[40%: Remove peer connection] -C --> F[Continue propagation/GC] -D --> F -E --> F -F --> B -end -``` - -**Protocol**: -1. Create 20-node network with 40% connectivity -2. Propagate primary record for 15 rounds -3. Create tombstone for primary record -4. Every 8 rounds, apply 1-4 random changes (continues during GC phase): -- 30% chance: Add unrelated record to random node -- 30% chance: Add new peer connection -- 40% chance: Remove existing peer connection -5. 
Run until convergence - -**Results** (averaged over 50 trials): - -| Metric | Value | -|--------|-------| -| Nodes | 20 per trial (1000 total) | -| Records deleted | 100% success | -| Rounds to delete records | 9 | -| Total rounds | 114 | -| Final tombstones | 135 (~13.5% of nodes) | - -**Analysis**: The algorithm remains stable under mixed workload conditions throughout both deletion and garbage collection phases. Unrelated records do not interfere with tombstone propagation. Connection changes create alternative propagation paths. The low keeper percentage (~13.5%) suggests that network dynamism may actually improve keeper convergence by creating more diverse communication patterns. - -#### 5.2.9 Sparse Network - -**Scenario**: Low connectivity (15%) network, testing algorithm behavior under challenging propagation conditions. - -```mermaid -graph TD -subgraph Sparse Network 25 nodes 15 percent connectivity -N0((0)) --- N3((3)) -N0((0)) --- N5((5)) -N1((1)) --- N4((4)) -N1((1)) --- N6((6)) -N2((2)) --- N6((6)) -N2((2)) --- N10((10)) -N3((3)) --- N7((7)) -N4((4)) --- N8((8)) -N5((5)) --- N9((9)) -N6((6)) --- N11((11)) -N7((7)) --- N12((12)) -N8((8)) --- N13((13)) -N9((9)) --- N14((14)) -N9((9)) --- N15((15)) -N10((10)) --- N14((14)) -N11((11)) --- N16((16)) -N12((12)) --- N17((17)) -N12((12)) --- N18((18)) -N13((13)) --- N17((17)) -N14((14)) --- N19((19)) -N15((15)) --- N19((19)) -N15((15)) --- N20((20)) -N16((16)) --- N20((20)) -N17((17)) --- N21((21)) -N18((18)) --- N22((22)) -N19((19)) --- N23((23)) -N20((20)) --- N24((24)) -N21((21)) --- N23((23)) -N22((22)) --- N24((24)) -end - -style N0 fill:#f96 -style N24 fill:#9f9 -``` - -**Results** (averaged over 50 trials): - -| Metric | Value | -|--------|-------| -| Nodes | 25 per trial (1250 total) | -| Connectivity | 15% | -| Records deleted | 100% success | -| Rounds to delete | 12 | -| Total rounds | 122 | -| Final tombstones | 255 (~20.4% of nodes) | - -**Analysis**: Sparse networks require more rounds for 
propagation (12 vs. 9-10 for denser networks) and retain more keepers (~20% vs. ~15%). The higher keeper retention provides additional redundancy appropriate for networks where nodes may have limited connectivity. +**Analysis**: Even when keeper election happens independently in separate partitions, the algorithm correctly converges to a single keeper after partition healing. This demonstrates robustness to mid-operation partitions. ### 5.3 Summary of Results All results are averaged over 50 independent trials per scenario. -| Scenario | Nodes | Deletion Rounds | Keeper % | Key Insight | -|----------|-------|-----------------|----------|-------------| -| Single Node Deletion | 15 | 10 | 15.2% | Baseline performance | -| Early Tombstone | 20 | 10 | 12.4% | Handles partial propagation | -| Bridged Network | 30 | 17 | 15.3% | Independent keepers per cluster | -| Concurrent Tombstones | 20 | 10 | 13.1% | Faster convergence with multiple sources | -| Partition and Heal | 20 | 16 | 15.6% | Partition-tolerant | -| Dynamic Topology | 20 | 10 | 13.1% | Robust to continuous connection changes | -| Node Churn | 20 | 9 | 8.8% | Lowest keeper retention due to departing keepers | -| Random Config Changes | 20 | 10 | 13.6% | Stable under continuous mixed workload | -| Sparse Network | 25 | 11 | 22.8% | Higher redundancy for limited connectivity | +| Scenario | Nodes | Deletion Rounds | Final Keepers | Key Insight | +|----------|-------|-----------------|---------------|-------------| +| Single Node Deletion | 15 | 10 | 1 | Baseline: optimal convergence | +| Node Offline | 15 | 60 | 1 | Handles offline nodes | +| Multiple Nodes Offline | 20 | 120 | 1 | Scales with more offline nodes | +| Network Partition | 20 | 90 | 1 | Partition-tolerant | +| Cluster Separation | 25 | 160 | 1 | Extended isolation handled | +| Concurrent Tombstones | 20 | 10 | 1 | Multiple deleters merge correctly | +| Staggered Recovery | 20 | 130 | 1 | Handles asynchronous reconnection | +| Origin Node Offline 
| 15 | 0 | 1 | Fault-tolerant originator | +| Flapping Node | 15 | 8 | 1 | Intermittent connectivity handled | +| Partition During GC | 20 | 115 | 1 | Mid-operation partition safe | -**Statistical Observations** (across 450 total trials): -- **100% deletion success rate**: All 450 trials successfully deleted records -- **Deletion speed**: Mean 10.8 rounds (σ ≈ 2.5), range 9-17 rounds -- **Keeper retention**: Mean 14.1% (σ ≈ 4.2%), range 8.8-22.8% -- **Dynamic scenarios outperform static**: Network dynamism reduces keeper % by 10-42% relative to baseline +**Statistical Observations** (across 500 total trials): +- **100% deletion success rate**: All 500 trials successfully deleted records +- **Optimal keeper convergence**: 1 keeper per trial in all scenarios (compared to 10-25% in sparse network models) +- **Fully connected advantage**: Network model enables rapid propagation and optimal keeper consolidation ### 5.4 Key Findings -Based on 450 total trials across 9 scenarios: +Based on 500 total trials across 10 scenarios: -1. **Reliable deletion**: 100% success rate across all trials. Records are deleted within 9-17 gossip rounds, with most scenarios completing in 10 rounds. Bridged networks require more rounds (17) due to single-bridge bottleneck. +1. **Reliable deletion**: 100% success rate across all trials. The fully connected model enables faster propagation than sparse peer-based networks. -2. **Effective garbage collection**: Tombstones converge to 8.8-22.8% of nodes as keepers. The median keeper retention is ~13%, representing an 85-90% reduction in tombstone storage distribution compared to full replication. +2. **Optimal garbage collection**: In a fully connected network, tombstones converge to exactly 1 keeper per tombstone. This is optimal - the minimum required to prevent resurrection from offline/partitioned nodes. -3. 
**Dynamic networks improve convergence**: Counter-intuitively, network dynamism improves keeper consolidation: - - Node churn: 8.8% keepers (42% reduction vs baseline) - - Dynamic topology: 13.1% keepers (14% reduction vs baseline) - - Random config changes: 13.6% keepers (11% reduction vs baseline) - - This occurs because dynamic networks create more diverse communication patterns and departing keepers accelerate consolidation. +3. **Offline node handling**: Nodes that go offline retain their records, but receive tombstones upon reconnection. The algorithm correctly handles: + - Single node offline + - Multiple nodes offline simultaneously + - Staggered reconnection + - Flapping (intermittent) connectivity -4. **Topology-aware keeper distribution**: - - Bridged networks maintain independent keepers per cluster (18.3% in origin cluster vs 12.3% in remote cluster) - - Partitioned networks show asymmetric distribution (20.8% in partition with tombstone origin vs 10.4% in healing partition) +4. **Partition tolerance**: Network partitions do not cause correctness issues: + - Tombstones propagate within each partition independently + - Upon healing, cross-partition propagation completes deletion + - Keepers consolidate across the healed network -5. **Graceful degradation under adversity**: - - Sparse networks (15% connectivity) retain more keepers (22.8%) for appropriate redundancy - - Partial propagation scenarios still achieve 12.4% keeper retention +5. **Fault-tolerant originator**: If the node that creates a tombstone immediately goes offline, other nodes continue propagation. No single point of failure. -6. **Concurrent safety**: Multiple simultaneous deleters (3 nodes) do not cause conflicts and achieve 13.1% keeper retention, comparable to single-deleter scenarios. +6. **Concurrent safety**: Multiple simultaneous deleters correctly merge their tombstones and converge to a single keeper. -## 6. Trade-offs +## 6. 
Limitations and Edge Cases + +### 6.1 Message Ordering Issues + +Despite the algorithm's robustness to offline nodes and partitions, certain message ordering scenarios can still cause tombstone-related issues: + +**Late Record Arrival**: If a node receives a tombstone before ever receiving the original record, it ignores the tombstone (since it has no record to delete). If the record subsequently arrives via a delayed message path, the node will accept the record as new data—a resurrection. + +```mermaid +sequenceDiagram +participant A as Node A +participant B as Node B +participant C as Node C + +Note over A: Creates record +A->>B: record (delayed in transit) +A->>A: Deletes record, creates tombstone +A->>C: tombstone +C->>C: Ignores tombstone (no record) +B->>C: record arrives (delayed) +C->>C: Accepts record as new! +Note over C: Record resurrected +``` + +**Mitigation**: Nodes can maintain a "seen tombstones" set for recently observed tombstone IDs, rejecting records matching those IDs. This adds memory overhead but prevents the most common ordering issues. + +**Concurrent Create-Delete**: If one node creates a record while another node simultaneously creates a tombstone for that same ID (e.g., re-using an ID or in a create-delete-recreate scenario), the outcome depends on message ordering and may result in either the record or tombstone "winning" non-deterministically. + +**Mitigation**: Use globally unique, never-reused record IDs (e.g., UUIDs or content-addressed hashes) to prevent ID collisions between creates and deletes. + +### 6.2 HLL Estimation Errors + +HyperLogLog provides probabilistic estimates with a standard error of approximately 1.04/√m where m is the number of registers. At precision 10 (1024 registers), this is ~3.25% error. 
In practice:
+
+- A record distributed to 100 nodes might show an HLL estimate of 97-103
+- A tombstone distributed to 100 nodes might show an estimate of 96-104
+
+This can cause:
+- **Premature keeper election**: A node might become a keeper before the tombstone has truly reached all record holders
+- **Delayed keeper convergence**: Nodes might remain keepers longer due to estimate fluctuations
+
+The algorithm handles this conservatively—tombstones are only garbage collected when the tombstone estimate reaches or exceeds the record estimate, erring on the side of retaining tombstones.
+
+## 7. Comparison with Alternative Approaches
+
+### 7.1 Explicit Node List Approach
+
+An alternative to HLL-based tracking is maintaining explicit sets of node IDs:
+
+```ts
+interface ExplicitTombstone {
+  readonly id: string;
+  readonly recordReceivers: Set<string>; // Nodes that received the record
+  readonly tombstoneReceivers: Set<string>; // Nodes that received the tombstone
+}
+```
+
+**Advantages of Explicit Lists**:
+- Exact counts, no estimation error
+- Deterministic keeper election
+- Can identify specific nodes that haven't received tombstones
+
+**Disadvantages**:
+- Memory grows linearly with node count
+- Merge operations require set union (O(n) time and space)
+- Network bandwidth increases with node count
+
+### 7.2 Memory Comparison
+
+| Network Size | Explicit List (per tombstone) | HLL (per tombstone) | HLL Advantage |
+|--------------|-------------------------------|---------------------|---------------|
+| 10 nodes | ~200 bytes | ~2 KB | 0.1x (worse) |
+| 50 nodes | ~1 KB | ~2 KB | 0.5x (worse) |
+| 100 nodes | ~2 KB | ~2 KB | 1x (equal) |
+| 500 nodes | ~10 KB | ~2 KB | 5x better |
+| 1,000 nodes | ~20 KB | ~2 KB | 10x better |
+| 10,000 nodes | ~200 KB | ~2 KB | 100x better |
+
+*Assumptions: Node IDs are 20-byte strings (e.g., hex-encoded 80-bit identifiers). 
HLL uses precision 10 (1024 1-byte registers × 2 HLLs = 2KB).* + +**Crossover Point**: HLL becomes more memory-efficient than explicit lists at approximately 100 nodes (assuming 20-byte node IDs). For smaller networks, explicit lists are more efficient and provide exact counts. + +### 7.3 Bandwidth Comparison + +Each gossip message transmits tombstone data. For a tombstone that has propagated to N nodes: + +| Network Size | Explicit List (per message) | HLL (per message) | +|--------------|----------------------------|-------------------| +| 10 nodes | ~400 bytes | ~2 KB | +| 100 nodes | ~4 KB | ~2 KB | +| 1,000 nodes | ~40 KB | ~2 KB | + +HLL provides constant-size messages regardless of how many nodes have received the tombstone, making it significantly more bandwidth-efficient in large networks. + +### 7.4 When to Use Each Approach + +**Use Explicit Node Lists when**: +- Network has fewer than ~100 nodes +- Exact tracking is required for auditing or debugging +- Node IDs are very short (reducing per-node overhead) +- Memory and bandwidth are not constrained + +**Use HLL-based Tracking when**: +- Network has more than ~100 nodes +- Bandwidth is constrained (e.g., mobile networks, cross-region links) +- Approximate counts are acceptable +- Network size may grow unpredictably + +### 7.5 Hybrid Approach + +A practical implementation might use: +- Explicit lists for small tombstones (< 50 nodes) +- Automatic promotion to HLL when set size exceeds threshold +- This captures the best of both approaches while adding implementation complexity + +## 8. 
Trade-offs Summary | Aspect | Impact | |--------|--------| -| **Memory** | ~1KB per tombstone (HLL at precision 10) | -| **Bandwidth** | HLLs transmitted with each gossip message (~2KB per tombstone message) | +| **Memory** | ~2KB per tombstone (HLL at precision 10), constant regardless of network size | +| **Bandwidth** | ~2KB per gossip message, constant regardless of propagation extent | | **Latency** | GC delayed until keeper convergence (~100 rounds after deletion) | | **Consistency** | Eventual - temporary resurrection attempts are blocked but logged | +| **Accuracy** | ~3% estimation error at precision 10; conservative handling prevents premature GC | +| **Ordering** | Susceptible to late-arriving records; mitigated by tombstone ID caching | -## 7. Properties +## 9. Properties The algorithm provides the following guarantees: @@ -793,22 +726,55 @@ The algorithm provides the following guarantees: - **Liveness**: Keepers eventually step down, enabling garbage collection. The tie-breaker mechanism ensures convergence even when HLL estimates are identical. -- **Fault tolerance**: No single point of failure. Multiple keepers provide redundancy, and any keeper can propagate the tombstone. +- **Fault tolerance**: No single point of failure. Any online node can propagate tombstones. Offline nodes receive tombstones upon reconnection. -- **Convergence**: Keeper count monotonically decreases over time within each connected component. +- **Partition tolerance**: Each partition independently maintains tombstones. Upon healing, tombstones propagate across the healed partition boundary. -## 8. Conclusion +- **Convergence**: In a fully connected network, keeper count converges to exactly 1. -This paper presented a HyperLogLog-based approach to tombstone garbage collection in distributed systems. 
By tracking record and tombstone propagation through probabilistic cardinality estimation, the algorithm reduces the number of nodes maintaining tombstones to 10-25% of the network (the "keeper" nodes). +## 10. Conclusion -**Storage Trade-offs**: Each HLL-based tombstone requires approximately 2KB (two HLL structures at precision 10), compared to ~64-100 bytes for traditional simple tombstones. This means the algorithm trades per-tombstone storage overhead for reduced tombstone distribution. The approach is most beneficial when: +This paper presented a HyperLogLog-based approach to tombstone garbage collection in distributed systems with a fully connected network model. By tracking record and tombstone propagation through probabilistic cardinality estimation, the algorithm reduces the number of nodes maintaining tombstones to a single keeper per tombstone. + +The simulation results, based on 500 trials across 10 scenarios, demonstrate consistent behavior across diverse failure scenarios. Records are deleted within 10-160 gossip rounds depending on offline/partition duration, and tombstones converge to exactly 1 keeper. The algorithm correctly handles: +- Individual nodes going offline and reconnecting +- Multiple nodes offline simultaneously +- Network partitions of various durations +- Concurrent deletion by multiple nodes +- Origin node failure +- Flapping connectivity + +### Comparison with Explicit Node Tracking + +An alternative approach tracks the exact set of nodes that have received records and tombstones, rather than using probabilistic estimation. 
This explicit tracking provides perfect accuracy but at significant cost in larger networks: + +| Network Size | Explicit List | HLL | Winner | +|--------------|---------------|-----|--------| +| < 100 nodes | ~2 KB | ~2 KB | Explicit (exact counts) | +| 100 nodes | ~2 KB | ~2 KB | Equal | +| 1,000 nodes | ~20 KB | ~2 KB | HLL (10x smaller) | +| 10,000 nodes | ~200 KB | ~2 KB | HLL (100x smaller) | + +For small networks (< 100 nodes), explicit node tracking is preferable: it provides exact counts, enables deterministic keeper election, and uses comparable or less memory. For large networks, HLL's constant-size data structures provide substantial memory and bandwidth savings. + +### Storage Trade-offs + +Each HLL-based tombstone requires approximately 2KB (two HLL structures at precision 10), compared to ~64-100 bytes for traditional simple tombstones that lack propagation tracking. This means the algorithm trades per-tombstone storage overhead for reduced tombstone distribution. The approach is most beneficial when: +- Network has more than ~100 nodes (where HLL outperforms explicit lists) - Traditional tombstones are large (e.g., containing vector clocks, content hashes, or audit metadata) - The primary concern is reducing the number of nodes participating in tombstone maintenance +- Network partitions and offline nodes are common failure modes +- Bandwidth is constrained (HLL messages are constant-size regardless of propagation) -The simulation results, based on 450 trials across 9 scenarios, demonstrate consistent behavior across diverse network topologies and failure scenarios. Records are deleted within 9-17 gossip rounds (mean: 10.8), and tombstones converge to 8.8-22.8% of nodes as keepers (mean: 14.1%). Notably, dynamic network conditions actually improve keeper consolidation rather than hindering it. The algorithm gracefully handles partial propagation, network partitions, concurrent deletions, and continuous topology changes. 
+For smaller networks, the explicit node list approach (Section 7.1) provides a simpler and more precise alternative with comparable resource usage. -Future work may explore adaptive HLL precision based on network size, integration with vector clocks for stronger consistency guarantees, and optimization of the keeper convergence rate. +### Future Work +Future work may explore: +- Adaptive HLL precision based on network size +- Hybrid approaches that start with explicit lists and promote to HLL at threshold +- Integration with vector clocks for stronger consistency guarantees +- Optimization of the keeper convergence rate in partially connected networks ## References diff --git a/simulations/hyperloglog-tombstone/simulation.ts b/simulations/hyperloglog-tombstone/simulation.ts index 337be46..685e92a 100644 --- a/simulations/hyperloglog-tombstone/simulation.ts +++ b/simulations/hyperloglog-tombstone/simulation.ts @@ -93,7 +93,8 @@ interface NodeState { readonly id: string; readonly records: ReadonlyMap>; readonly tombstones: ReadonlyMap; - readonly peerIds: readonly string[]; + readonly isOnline: boolean; + readonly partitionId: string; // Nodes in the same partition can communicate readonly stats: { readonly messagesReceived: number; readonly tombstonesGarbageCollected: number; @@ -117,19 +118,15 @@ const createTombstone = (record: DataRecord, nodeId: string): Tombst tombstoneHLL: hllAdd(createHLL(), nodeId), }); -const createNode = (id: string): NodeState => ({ +const createNode = (id: string, partitionId: string = "main"): NodeState => ({ id, records: new Map(), tombstones: new Map(), - peerIds: [], + isOnline: true, + partitionId, stats: { messagesReceived: 0, tombstonesGarbageCollected: 0, resurrections: 0 }, }); -const addPeerToNode = (node: NodeState, peerId: string): NodeState => { - if (node.peerIds.includes(peerId)) return node; - return { ...node, peerIds: [...node.peerIds, peerId] }; -}; - const checkGCStatus = ( tombstone: Tombstone, 
incomingTombstoneEstimate: number | null, @@ -142,15 +139,10 @@ const checkGCStatus = ( const isKeeper = myTombstoneEstimateBeforeMerge >= targetCount; if (isKeeper) { - // Keeper step-down logic: - // If incoming tombstone has reached the target count, compare estimates. - // If incoming estimate >= my estimate before merge, step down. - // Use node ID as tie-breaker: higher node ID steps down when estimates are equal. if (incomingTombstoneEstimate !== null && incomingTombstoneEstimate >= targetCount) { if (myTombstoneEstimateBeforeMerge < incomingTombstoneEstimate) { return { shouldGC: true, stepDownAsKeeper: true }; } - // Tie-breaker: if estimates are equal, the lexicographically higher node ID steps down if (myTombstoneEstimateBeforeMerge === incomingTombstoneEstimate && senderNodeId !== null && myNodeId > senderNodeId) { return { shouldGC: true, stepDownAsKeeper: true }; @@ -159,8 +151,6 @@ const checkGCStatus = ( return { shouldGC: false, stepDownAsKeeper: false }; } - // Not yet a keeper - will become one if tombstone count reaches target after merge - // (No explicit action needed here, keeper status is inferred from HLL comparison) return { shouldGC: false, stepDownAsKeeper: false }; }; @@ -228,12 +218,10 @@ const receiveTombstone = ( senderNodeId ); - // Always delete the record when we have a tombstone const newRecords = new Map(node.records); newRecords.delete(incoming.id); if (gcStatus.stepDownAsKeeper) { - // Step down: delete both record and tombstone const newTombstones = new Map(node.tombstones); newTombstones.delete(incoming.id); newStats = { ...newStats, tombstonesGarbageCollected: newStats.tombstonesGarbageCollected + 1 }; @@ -245,91 +233,50 @@ const receiveTombstone = ( return { ...node, records: newRecords, tombstones: newTombstones, stats: newStats }; }; -const createNetwork = (nodeCount: number, connectivityRatio: number): NetworkState => { - let nodes = new Map>(); +// Create a fully connected network (all nodes can talk to all other 
online nodes in same partition) +const createNetwork = (nodeCount: number, partitionId: string = "main"): NetworkState => { + const nodes = new Map>(); for (let i = 0; i < nodeCount; i++) { - nodes.set(`node-${i}`, createNode(`node-${i}`)); - } - - const nodeIds = Array.from(nodes.keys()); - for (let i = 0; i < nodeIds.length; i++) { - for (let j = i + 1; j < nodeIds.length; j++) { - if (Math.random() < connectivityRatio) { - nodes = new Map(nodes) - .set(nodeIds[i], addPeerToNode(nodes.get(nodeIds[i])!, nodeIds[j])) - .set(nodeIds[j], addPeerToNode(nodes.get(nodeIds[j])!, nodeIds[i])); - } - } - } - - for (let i = 0; i < nodeIds.length; i++) { - const nextIdx = (i + 1) % nodeIds.length; - nodes = new Map(nodes) - .set(nodeIds[i], addPeerToNode(nodes.get(nodeIds[i])!, nodeIds[nextIdx])) - .set(nodeIds[nextIdx], addPeerToNode(nodes.get(nodeIds[nextIdx])!, nodeIds[i])); + nodes.set(`node-${i}`, createNode(`node-${i}`, partitionId)); } return { nodes }; }; -const createBridgedNetwork = ( - clusterSize: number, - intraClusterConnectivity: number -): NetworkState => { - let nodes = new Map>(); +// Get all reachable nodes (online and in same partition) +const getReachableNodes = ( + network: NetworkState, + fromNodeId: string +): string[] => { + const fromNode = network.nodes.get(fromNodeId); + if (!fromNode || !fromNode.isOnline) return []; - for (let i = 0; i < clusterSize; i++) { - nodes.set(`cluster-a-${i}`, createNode(`cluster-a-${i}`)); - nodes.set(`cluster-b-${i}`, createNode(`cluster-b-${i}`)); + const reachable: string[] = []; + for (const [nodeId, node] of network.nodes) { + if (nodeId !== fromNodeId && + node.isOnline && + node.partitionId === fromNode.partitionId) { + reachable.push(nodeId); + } } - - const clusterA = Array.from(nodes.keys()).filter(id => id.startsWith('cluster-a')); - const clusterB = Array.from(nodes.keys()).filter(id => id.startsWith('cluster-b')); - - const connectCluster = (clusterIds: string[]) => { - for (let i = 0; i < 
clusterIds.length; i++) { - for (let j = i + 1; j < clusterIds.length; j++) { - if (Math.random() < intraClusterConnectivity) { - nodes = new Map(nodes) - .set(clusterIds[i], addPeerToNode(nodes.get(clusterIds[i])!, clusterIds[j])) - .set(clusterIds[j], addPeerToNode(nodes.get(clusterIds[j])!, clusterIds[i])); - } - } - } - for (let i = 0; i < clusterIds.length; i++) { - const nextIdx = (i + 1) % clusterIds.length; - nodes = new Map(nodes) - .set(clusterIds[i], addPeerToNode(nodes.get(clusterIds[i])!, clusterIds[nextIdx])) - .set(clusterIds[nextIdx], addPeerToNode(nodes.get(clusterIds[nextIdx])!, clusterIds[i])); - } - }; - - connectCluster(clusterA); - connectCluster(clusterB); - - const bridgeA = clusterA[0]; - const bridgeB = clusterB[0]; - nodes = new Map(nodes) - .set(bridgeA, addPeerToNode(nodes.get(bridgeA)!, bridgeB)) - .set(bridgeB, addPeerToNode(nodes.get(bridgeB)!, bridgeA)); - - return { nodes }; + return reachable; }; -const forwardTombstoneToAllPeers = ( +const forwardTombstoneToAllReachable = ( network: NetworkState, forwardingNodeId: string, tombstone: Tombstone, - excludePeerId?: string + excludeNodeId?: string ): NetworkState => { const forwardingNode = network.nodes.get(forwardingNodeId); - if (!forwardingNode) return network; + if (!forwardingNode || !forwardingNode.isOnline) return network; let newNodes = new Map(network.nodes); + const reachable = getReachableNodes({ nodes: newNodes }, forwardingNodeId); - for (const peerId of forwardingNode.peerIds) { - if (peerId === excludePeerId) continue; + for (const peerId of reachable) { + if (peerId === excludeNodeId) continue; const peer = newNodes.get(peerId); if (!peer || !peer.records.has(tombstone.id)) continue; @@ -339,7 +286,7 @@ const forwardTombstoneToAllPeers = ( // If this peer also stepped down, recursively forward if (!updatedPeer.tombstones.has(tombstone.id) && peer.tombstones.has(tombstone.id)) { - const result = forwardTombstoneToAllPeers({ nodes: newNodes }, peerId, tombstone, 
forwardingNodeId); + const result = forwardTombstoneToAllReachable({ nodes: newNodes }, peerId, tombstone, forwardingNodeId); newNodes = new Map(result.nodes); } } @@ -349,13 +296,16 @@ const forwardTombstoneToAllPeers = ( const gossipOnce = (network: NetworkState, senderNodeId: string, recordId: string): NetworkState => { const sender = network.nodes.get(senderNodeId); - if (!sender || sender.peerIds.length === 0) return network; + if (!sender || !sender.isOnline) return network; const record = sender.records.get(recordId); const tombstone = sender.tombstones.get(recordId); if (!record && !tombstone) return network; - const peerId = sender.peerIds[Math.floor(Math.random() * sender.peerIds.length)]; + const reachable = getReachableNodes(network, senderNodeId); + if (reachable.length === 0) return network; + + const peerId = reachable[Math.floor(Math.random() * reachable.length)]; const peer = network.nodes.get(peerId); if (!peer) return network; @@ -376,9 +326,8 @@ const gossipOnce = (network: NetworkState, senderNodeId: string, rec const updatedPeer = receiveTombstone(currentPeer, tombstone, senderNodeId); newNodes.set(peerId, updatedPeer); - // If peer stepped down (had tombstone before, doesn't have it now), forward the incoming tombstone if (peerHadTombstone && !updatedPeer.tombstones.has(recordId)) { - const result = forwardTombstoneToAllPeers({ nodes: newNodes }, peerId, tombstone, senderNodeId); + const result = forwardTombstoneToAllReachable({ nodes: newNodes }, peerId, tombstone, senderNodeId); newNodes = new Map(result.nodes); } @@ -386,7 +335,6 @@ const gossipOnce = (network: NetworkState, senderNodeId: string, rec const peerTombstone = updatedPeer.tombstones.get(recordId)!; const senderEstimateBeforeMerge = hllEstimate(tombstone.tombstoneHLL); - // Merge HLLs const mergedTombstoneHLL = hllMerge(tombstone.tombstoneHLL, peerTombstone.tombstoneHLL); const bestFrozenHLL = hllEstimate(peerTombstone.recordHLL) > hllEstimate(tombstone.recordHLL) ? 
peerTombstone.recordHLL @@ -398,7 +346,6 @@ const gossipOnce = (network: NetworkState, senderNodeId: string, rec recordHLL: bestFrozenHLL, }; - // Check if sender should step down (peer has higher estimate or wins tie-breaker) const gcStatus = checkGCStatus( updatedSenderTombstone, hllEstimate(peerTombstone.tombstoneHLL), @@ -408,18 +355,15 @@ const gossipOnce = (network: NetworkState, senderNodeId: string, rec ); if (gcStatus.stepDownAsKeeper) { - // Sender steps down - remove their tombstone const currentSender = newNodes.get(senderNodeId)!; const newSenderTombstones = new Map(currentSender.tombstones); newSenderTombstones.delete(recordId); const newSenderStats = { ...currentSender.stats, tombstonesGarbageCollected: currentSender.stats.tombstonesGarbageCollected + 1 }; newNodes.set(senderNodeId, { ...currentSender, tombstones: newSenderTombstones, stats: newSenderStats }); - // Forward the peer's tombstone to all sender's other peers - const result = forwardTombstoneToAllPeers({ nodes: newNodes }, senderNodeId, peerTombstone, peerId); + const result = forwardTombstoneToAllReachable({ nodes: newNodes }, senderNodeId, peerTombstone, peerId); newNodes = new Map(result.nodes); } else { - // Keep tombstone with merged data const currentSender = newNodes.get(senderNodeId)!; const newSenderTombstones = new Map(currentSender.tombstones); newSenderTombstones.set(recordId, updatedSenderTombstone); @@ -435,7 +379,7 @@ const gossipRounds = (network: NetworkState, recordId: string, round let state = network; for (let round = 0; round < rounds; round++) { for (const [nodeId, node] of state.nodes) { - if (node.records.has(recordId) || node.tombstones.has(recordId)) { + if (node.isOnline && (node.records.has(recordId) || node.tombstones.has(recordId))) { state = gossipOnce(state, nodeId, recordId); } } @@ -448,6 +392,7 @@ interface ClusterStats { nodeCount: number; recordCount: number; tombstoneCount: number; + onlineCount: number; } interface SimulationResult { @@ -461,24 
+406,27 @@ interface SimulationResult { const getClusterStats = ( network: NetworkState, recordId: string, - clusterPrefix?: string + partitionFilter?: string ): ClusterStats => { let recordCount = 0; let tombstoneCount = 0; let nodeCount = 0; + let onlineCount = 0; - for (const [nodeId, node] of network.nodes) { - if (clusterPrefix && !nodeId.startsWith(clusterPrefix)) continue; + for (const [, node] of network.nodes) { + if (partitionFilter && node.partitionId !== partitionFilter) continue; nodeCount++; + if (node.isOnline) onlineCount++; if (node.records.has(recordId)) recordCount++; if (node.tombstones.has(recordId)) tombstoneCount++; } return { - name: clusterPrefix ?? 'all', + name: partitionFilter ?? 'all', nodeCount, recordCount, tombstoneCount, + onlineCount, }; }; @@ -495,8 +443,8 @@ const printSimulationResult = (result: SimulationResult): void => { console.log(` Final State:`); for (const cluster of result.clusters) { - const clusterLabel = cluster.name === 'all' ? 'Network' : `Cluster ${cluster.name}`; - console.log(` ${clusterLabel} (${cluster.nodeCount} nodes):`); + const clusterLabel = cluster.name === 'all' ? 
'Network' : `Partition ${cluster.name}`; + console.log(` ${clusterLabel} (${cluster.nodeCount} nodes, ${cluster.onlineCount} online):`); console.log(` Records: ${cluster.recordCount}`); console.log(` Tombstones: ${cluster.tombstoneCount}`); } @@ -520,7 +468,6 @@ const runToConvergence = ( let recordsDeleted = false; let roundsToDeleteRecords = 0; - // Phase 1: Run until records are deleted while (rounds < maxRounds && !recordsDeleted) { const stats = getClusterStats(state, recordId); if (stats.recordCount === 0) { @@ -531,7 +478,6 @@ const runToConvergence = ( rounds += 10; } - // Phase 2: Continue running to let tombstones converge let extraRounds = 0; while (extraRounds < extraRoundsAfterDeletion) { state = gossipRounds(state, recordId, 10); @@ -572,6 +518,38 @@ const addTombstoneToNetwork = (network: NetworkState, nodeId: string return { nodes: newNodes }; }; +const setNodeOnline = (network: NetworkState, nodeId: string, isOnline: boolean): NetworkState => { + const node = network.nodes.get(nodeId); + if (!node) return network; + + const newNodes = new Map(network.nodes); + newNodes.set(nodeId, { ...node, isOnline }); + return { nodes: newNodes }; +}; + +const setNodePartition = (network: NetworkState, nodeId: string, partitionId: string): NetworkState => { + const node = network.nodes.get(nodeId); + if (!node) return network; + + const newNodes = new Map(network.nodes); + newNodes.set(nodeId, { ...node, partitionId }); + return { nodes: newNodes }; +}; + +const setMultipleNodesPartition = ( + network: NetworkState, + nodeIds: string[], + partitionId: string +): NetworkState => { + let result = network; + for (const nodeId of nodeIds) { + result = setNodePartition(result, nodeId, partitionId); + } + return result; +}; + +// === Test Scenarios === + const testSingleNodeDeletion = (): void => { const trials = 50; const maxRounds = 99999; @@ -582,7 +560,7 @@ const testSingleNodeDeletion = (): void => { let finalTombstones = 0; for (let trial = 0; trial < trials; 
trial++) { - let network = createNetwork(15, 0.4); + let network = createNetwork(15); const recordId = `test-${trial}`; network = addRecordToNetwork(network, "node-0", recordId, "Test Data"); @@ -612,11 +590,12 @@ const testSingleNodeDeletion = (): void => { nodeCount: 15 * trials, recordCount: finalRecords, tombstoneCount: finalTombstones, + onlineCount: 15 * trials, }], }); }; -const testEarlyTombstoneCreation = (): void => { +const testNodeOfflineDuringTombstone = (): void => { const trials = 50; const maxRounds = 99999; let deletedCount = 0; @@ -626,21 +605,34 @@ const testEarlyTombstoneCreation = (): void => { let finalTombstones = 0; for (let trial = 0; trial < trials; trial++) { - let network = createNetwork(20, 0.4); - const recordId = `early-tombstone-${trial}`; + let network = createNetwork(15); + const recordId = `offline-${trial}`; + const offlineNodeId = "node-5"; - // Only propagate record for 3 rounds before creating tombstone - network = addRecordToNetwork(network, "node-0", recordId, "Test"); - network = gossipRounds(network, recordId, 3); + // Propagate record to all nodes + network = addRecordToNetwork(network, "node-0", recordId, "Test Data"); + network = gossipRounds(network, recordId, 20); + + // Take node-5 offline + network = setNodeOnline(network, offlineNodeId, false); + + // Create tombstone while node-5 is offline network = addTombstoneToNetwork(network, "node-0", recordId); + // Run gossip while node-5 is offline (tombstone propagates to online nodes) + network = gossipRounds(network, recordId, 50); + + // Bring node-5 back online + network = setNodeOnline(network, offlineNodeId, true); + + // Continue - the stale record on node-5 should be deleted when it receives tombstone const result = runToConvergence(network, recordId, maxRounds); if (result.recordsDeleted) { deletedCount++; - totalDeletionRounds += result.roundsToDeleteRecords; + totalDeletionRounds += result.roundsToDeleteRecords + 50; } - totalRounds += result.totalRounds; + 
totalRounds += result.totalRounds + 50; const stats = getClusterStats(result.network, recordId); finalRecords += stats.recordCount; @@ -648,7 +640,70 @@ const testEarlyTombstoneCreation = (): void => { } printSimulationResult({ - testName: `Early Tombstone (${trials} trials, record partially propagated)`, + testName: `Node Offline During Tombstone (${trials} trials)`, + recordsDeleted: deletedCount === trials, + roundsToDeleteRecords: deletedCount > 0 ? Math.round(totalDeletionRounds / deletedCount) : 0, + totalRounds: Math.round(totalRounds / trials), + clusters: [{ + name: 'all', + nodeCount: 15 * trials, + recordCount: finalRecords, + tombstoneCount: finalTombstones, + onlineCount: 15 * trials, + }], + }); +}; + +const testMultipleNodesOffline = (): void => { + const trials = 50; + const maxRounds = 99999; + let deletedCount = 0; + let totalDeletionRounds = 0; + let totalRounds = 0; + let finalRecords = 0; + let finalTombstones = 0; + + for (let trial = 0; trial < trials; trial++) { + let network = createNetwork(20); + const recordId = `multi-offline-${trial}`; + const offlineNodes = ["node-3", "node-7", "node-12", "node-15"]; + + // Propagate record to all nodes + network = addRecordToNetwork(network, "node-0", recordId, "Test Data"); + network = gossipRounds(network, recordId, 25); + + // Take multiple nodes offline + for (const nodeId of offlineNodes) { + network = setNodeOnline(network, nodeId, false); + } + + // Create tombstone + network = addTombstoneToNetwork(network, "node-0", recordId); + + // Run gossip while nodes are offline + network = gossipRounds(network, recordId, 60); + + // Bring nodes back online one by one with some rounds in between + for (const nodeId of offlineNodes) { + network = setNodeOnline(network, nodeId, true); + network = gossipRounds(network, recordId, 15); + } + + const result = runToConvergence(network, recordId, maxRounds); + + if (result.recordsDeleted) { + deletedCount++; + totalDeletionRounds += result.roundsToDeleteRecords 
+ 60 + offlineNodes.length * 15; + } + totalRounds += result.totalRounds + 60 + offlineNodes.length * 15; + + const stats = getClusterStats(result.network, recordId); + finalRecords += stats.recordCount; + finalTombstones += stats.tombstoneCount; + } + + printSimulationResult({ + testName: `Multiple Nodes Offline (${trials} trials, 4 nodes offline)`, recordsDeleted: deletedCount === trials, roundsToDeleteRecords: deletedCount > 0 ? Math.round(totalDeletionRounds / deletedCount) : 0, totalRounds: Math.round(totalRounds / trials), @@ -657,14 +712,14 @@ const testEarlyTombstoneCreation = (): void => { nodeCount: 20 * trials, recordCount: finalRecords, tombstoneCount: finalTombstones, + onlineCount: 20 * trials, }], }); }; -const testBridgedNetwork = (): void => { +const testNetworkPartition = (): void => { const trials = 50; const maxRounds = 99999; - const clusterSize = 15; let deletedCount = 0; let totalDeletionRounds = 0; let totalRounds = 0; @@ -674,37 +729,143 @@ const testBridgedNetwork = (): void => { let finalTombstonesB = 0; for (let trial = 0; trial < trials; trial++) { - let network = createBridgedNetwork(clusterSize, 0.5); - const recordId = `bridged-record-${trial}`; + let network = createNetwork(20); + const recordId = `partition-${trial}`; - network = addRecordToNetwork(network, "cluster-a-0", recordId, "Test Data"); - network = gossipRounds(network, recordId, 20); - network = addTombstoneToNetwork(network, "cluster-a-0", recordId); + // Propagate record to all nodes (all in same partition initially) + network = addRecordToNetwork(network, "node-0", recordId, "Test Data"); + network = gossipRounds(network, recordId, 25); + + // Split network into two partitions + const partitionA = ["node-0", "node-1", "node-2", "node-3", "node-4", + "node-5", "node-6", "node-7", "node-8", "node-9"]; + const partitionB = ["node-10", "node-11", "node-12", "node-13", "node-14", + "node-15", "node-16", "node-17", "node-18", "node-19"]; + + network = 
setMultipleNodesPartition(network, partitionA, "partition-a"); + network = setMultipleNodesPartition(network, partitionB, "partition-b"); + + // Create tombstone in partition A + network = addTombstoneToNetwork(network, "node-0", recordId); + + // Run gossip while partitioned (tombstone only propagates in partition A) + network = gossipRounds(network, recordId, 80); + + // Check state while partitioned + const statsADuringPartition = getClusterStats(network, recordId, "partition-a"); + const statsBDuringPartition = getClusterStats(network, recordId, "partition-b"); + + // Heal partition (move all nodes back to main) + network = setMultipleNodesPartition(network, [...partitionA, ...partitionB], "main"); + + // Continue after heal + const result = runToConvergence(network, recordId, maxRounds); + + if (result.recordsDeleted) { + deletedCount++; + totalDeletionRounds += result.roundsToDeleteRecords + 80; + } + totalRounds += result.totalRounds + 80; + + // Get final stats per original partition membership + let recordsA = 0, tombstonesA = 0; + let recordsB = 0, tombstonesB = 0; + for (const nodeId of partitionA) { + const node = result.network.nodes.get(nodeId)!; + if (node.records.has(recordId)) recordsA++; + if (node.tombstones.has(recordId)) tombstonesA++; + } + for (const nodeId of partitionB) { + const node = result.network.nodes.get(nodeId)!; + if (node.records.has(recordId)) recordsB++; + if (node.tombstones.has(recordId)) tombstonesB++; + } + + finalRecordsA += recordsA; + finalTombstonesA += tombstonesA; + finalRecordsB += recordsB; + finalTombstonesB += tombstonesB; + } + + printSimulationResult({ + testName: `Network Partition & Heal (${trials} trials)`, + recordsDeleted: deletedCount === trials, + roundsToDeleteRecords: deletedCount > 0 ? 
Math.round(totalDeletionRounds / deletedCount) : 0, + totalRounds: Math.round(totalRounds / trials), + clusters: [ + { name: 'partition-a (origin)', nodeCount: 10 * trials, recordCount: finalRecordsA, tombstoneCount: finalTombstonesA, onlineCount: 10 * trials }, + { name: 'partition-b (stale)', nodeCount: 10 * trials, recordCount: finalRecordsB, tombstoneCount: finalTombstonesB, onlineCount: 10 * trials }, + ], + }); +}; + +const testClusterSeparation = (): void => { + const trials = 50; + const maxRounds = 99999; + let deletedCount = 0; + let totalDeletionRounds = 0; + let totalRounds = 0; + let finalRecordsMain = 0; + let finalTombstonesMain = 0; + let finalRecordsIsolated = 0; + let finalTombstonesIsolated = 0; + + for (let trial = 0; trial < trials; trial++) { + let network = createNetwork(25); + const recordId = `cluster-sep-${trial}`; + + // Propagate record to all nodes + network = addRecordToNetwork(network, "node-0", recordId, "Test Data"); + network = gossipRounds(network, recordId, 30); + + // Isolate a cluster of 5 nodes (simulating a data center going offline together) + const isolatedCluster = ["node-10", "node-11", "node-12", "node-13", "node-14"]; + network = setMultipleNodesPartition(network, isolatedCluster, "isolated"); + + // Create tombstone in main partition + network = addTombstoneToNetwork(network, "node-0", recordId); + + // Run for extended period while cluster is isolated + network = gossipRounds(network, recordId, 150); + + // Rejoin the isolated cluster + network = setMultipleNodesPartition(network, isolatedCluster, "main"); const result = runToConvergence(network, recordId, maxRounds); if (result.recordsDeleted) { deletedCount++; - totalDeletionRounds += result.roundsToDeleteRecords; + totalDeletionRounds += result.roundsToDeleteRecords + 150; } - totalRounds += result.totalRounds; + totalRounds += result.totalRounds + 150; - const statsA = getClusterStats(result.network, recordId, "cluster-a"); - const statsB = 
getClusterStats(result.network, recordId, "cluster-b"); - finalRecordsA += statsA.recordCount; - finalTombstonesA += statsA.tombstoneCount; - finalRecordsB += statsB.recordCount; - finalTombstonesB += statsB.tombstoneCount; + // Get final stats + let recordsMain = 0, tombstonesMain = 0; + let recordsIsolated = 0, tombstonesIsolated = 0; + for (const [nodeId, node] of result.network.nodes) { + const isIsolated = isolatedCluster.includes(nodeId); + if (node.records.has(recordId)) { + if (isIsolated) recordsIsolated++; else recordsMain++; + } + if (node.tombstones.has(recordId)) { + if (isIsolated) tombstonesIsolated++; else tombstonesMain++; + } + } + + finalRecordsMain += recordsMain; + finalTombstonesMain += tombstonesMain; + finalRecordsIsolated += recordsIsolated; + finalTombstonesIsolated += tombstonesIsolated; } printSimulationResult({ - testName: `Bridged Network (${trials} trials, two clusters)`, + testName: `Cluster Separation (${trials} trials, 5-node cluster isolated)`, recordsDeleted: deletedCount === trials, roundsToDeleteRecords: deletedCount > 0 ? 
Math.round(totalDeletionRounds / deletedCount) : 0, totalRounds: Math.round(totalRounds / trials), clusters: [ - { name: 'cluster-a', nodeCount: clusterSize * trials, recordCount: finalRecordsA, tombstoneCount: finalTombstonesA }, - { name: 'cluster-b', nodeCount: clusterSize * trials, recordCount: finalRecordsB, tombstoneCount: finalTombstonesB }, + { name: 'main (20 nodes)', nodeCount: 20 * trials, recordCount: finalRecordsMain, tombstoneCount: finalTombstonesMain, onlineCount: 20 * trials }, + { name: 'isolated (5 nodes)', nodeCount: 5 * trials, recordCount: finalRecordsIsolated, tombstoneCount: finalTombstonesIsolated, onlineCount: 5 * trials }, ], }); }; @@ -719,12 +880,13 @@ const testConcurrentTombstones = (): void => { let finalTombstones = 0; for (let trial = 0; trial < trials; trial++) { - let network = createNetwork(20, 0.4); + let network = createNetwork(20); const recordId = `concurrent-delete-${trial}`; network = addRecordToNetwork(network, "node-0", recordId, "Test Data"); network = gossipRounds(network, recordId, 30); + // Multiple nodes create tombstones simultaneously network = addTombstoneToNetwork(network, "node-0", recordId); network = addTombstoneToNetwork(network, "node-5", recordId); network = addTombstoneToNetwork(network, "node-10", recordId); @@ -752,581 +914,58 @@ const testConcurrentTombstones = (): void => { nodeCount: 20 * trials, recordCount: finalRecords, tombstoneCount: finalTombstones, + onlineCount: 20 * trials, }], }); }; -const testNetworkPartitionHeal = (): void => { +const testStaggeredNodeRecovery = (): void => { const trials = 50; const maxRounds = 99999; - const clusterSize = 10; let deletedCount = 0; let totalDeletionRounds = 0; let totalRounds = 0; - let finalRecordsA = 0; - let finalTombstonesA = 0; - let finalRecordsB = 0; - let finalTombstonesB = 0; + let finalRecords = 0; + let finalTombstones = 0; for (let trial = 0; trial < trials; trial++) { - let network = createBridgedNetwork(clusterSize, 0.5); - const recordId 
= `partition-test-${trial}`; + let network = createNetwork(20); + const recordId = `staggered-${trial}`; + const offlineNodes = ["node-4", "node-8", "node-12", "node-16"]; - network = addRecordToNetwork(network, "cluster-a-0", recordId, "Test Data"); - network = gossipRounds(network, recordId, 30); + // Propagate record to all nodes + network = addRecordToNetwork(network, "node-0", recordId, "Test Data"); + network = gossipRounds(network, recordId, 25); - // Partition the network - const bridgeA = network.nodes.get("cluster-a-0")!; - const bridgeB = network.nodes.get("cluster-b-0")!; - const newBridgeAPeers = bridgeA.peerIds.filter(p => p !== "cluster-b-0"); - const newBridgeBPeers = bridgeB.peerIds.filter(p => p !== "cluster-a-0"); + // All nodes go offline + for (const nodeId of offlineNodes) { + network = setNodeOnline(network, nodeId, false); + } - let partitionedNodes = new Map(network.nodes); - partitionedNodes.set("cluster-a-0", { ...bridgeA, peerIds: newBridgeAPeers }); - partitionedNodes.set("cluster-b-0", { ...bridgeB, peerIds: newBridgeBPeers }); - network = { nodes: partitionedNodes }; + // Create tombstone + network = addTombstoneToNetwork(network, "node-0", recordId); - network = addTombstoneToNetwork(network, "cluster-a-0", recordId); + // Run while offline + network = gossipRounds(network, recordId, 40); - // Run during partition - const partitionResult = runToConvergence(network, recordId, 500); - network = partitionResult.network; - - // Heal the network - const healedBridgeA = network.nodes.get("cluster-a-0")!; - const healedBridgeB = network.nodes.get("cluster-b-0")!; - let healedNodes = new Map(network.nodes); - healedNodes.set("cluster-a-0", addPeerToNode(healedBridgeA, "cluster-b-0")); - healedNodes.set("cluster-b-0", addPeerToNode(healedBridgeB, "cluster-a-0")); - network = { nodes: healedNodes }; + // Bring nodes back online at staggered intervals + let roundsSinceStart = 40; + for (let i = 0; i < offlineNodes.length; i++) { + // Run some 
rounds + network = gossipRounds(network, recordId, 20); + roundsSinceStart += 20; + + // Bring next node online + network = setNodeOnline(network, offlineNodes[i], true); + } const result = runToConvergence(network, recordId, maxRounds); if (result.recordsDeleted) { deletedCount++; - totalDeletionRounds += partitionResult.roundsToDeleteRecords + result.roundsToDeleteRecords; + totalDeletionRounds += result.roundsToDeleteRecords + roundsSinceStart; } - totalRounds += partitionResult.totalRounds + result.totalRounds; - - const statsA = getClusterStats(result.network, recordId, "cluster-a"); - const statsB = getClusterStats(result.network, recordId, "cluster-b"); - finalRecordsA += statsA.recordCount; - finalTombstonesA += statsA.tombstoneCount; - finalRecordsB += statsB.recordCount; - finalTombstonesB += statsB.tombstoneCount; - } - - printSimulationResult({ - testName: `Network Partition and Heal (${trials} trials)`, - recordsDeleted: deletedCount === trials, - roundsToDeleteRecords: deletedCount > 0 ? 
Math.round(totalDeletionRounds / deletedCount) : 0, - totalRounds: Math.round(totalRounds / trials), - clusters: [ - { name: 'cluster-a', nodeCount: clusterSize * trials, recordCount: finalRecordsA, tombstoneCount: finalTombstonesA }, - { name: 'cluster-b', nodeCount: clusterSize * trials, recordCount: finalRecordsB, tombstoneCount: finalTombstonesB }, - ], - }); -}; - -const applyDynamicTopologyChanges = (network: NetworkState): NetworkState => { - const nodeIds = Array.from(network.nodes.keys()); - const changeCount = Math.floor(Math.random() * 5) + 1; - let result = network; - - for (let c = 0; c < changeCount; c++) { - const nodeA = nodeIds[Math.floor(Math.random() * nodeIds.length)]; - const nodeB = nodeIds[Math.floor(Math.random() * nodeIds.length)]; - if (nodeA === nodeB) continue; - - const nodeAState = result.nodes.get(nodeA)!; - const nodeBState = result.nodes.get(nodeB)!; - - // 50% chance to add connection, 50% to remove - if (Math.random() < 0.5) { - // Add connection if not already connected - if (!nodeAState.peerIds.includes(nodeB)) { - const newNodes = new Map(result.nodes); - newNodes.set(nodeA, addPeerToNode(nodeAState, nodeB)); - newNodes.set(nodeB, addPeerToNode(nodeBState, nodeA)); - result = { nodes: newNodes }; - } - } else { - // Remove connection if connected and both have more than 1 peer - if (nodeAState.peerIds.includes(nodeB) && - nodeAState.peerIds.length > 1 && - nodeBState.peerIds.length > 1) { - const newNodes = new Map(result.nodes); - newNodes.set(nodeA, { - ...nodeAState, - peerIds: nodeAState.peerIds.filter(p => p !== nodeB), - }); - newNodes.set(nodeB, { - ...nodeBState, - peerIds: nodeBState.peerIds.filter(p => p !== nodeA), - }); - result = { nodes: newNodes }; - } - } - } - - return result; -}; - -const testDynamicTopology = (): void => { - const trials = 50; - const maxRounds = 99999; - let deletedCount = 0; - let totalDeletionRounds = 0; - let totalRounds = 0; - let finalRecords = 0; - let finalTombstones = 0; - - for (let 
trial = 0; trial < trials; trial++) { - let network = createNetwork(20, 0.3); - const recordId = `dynamic-${trial}`; - - // Create and propagate record - network = addRecordToNetwork(network, "node-0", recordId, "Test Data"); - network = gossipRounds(network, recordId, 10); - - // Create tombstone - network = addTombstoneToNetwork(network, "node-0", recordId); - - // Simulate dynamic topology changes during gossip - let rounds = 0; - let recordsDeleted = false; - let roundsToDeleteRecords = 0; - - while (rounds < maxRounds && !recordsDeleted) { - // Random topology changes every 5 rounds - if (rounds % 5 === 0) { - network = applyDynamicTopologyChanges(network); - } - - const stats = getClusterStats(network, recordId); - if (stats.recordCount === 0) { - recordsDeleted = true; - roundsToDeleteRecords = rounds; - } - network = gossipRounds(network, recordId, 5); - rounds += 5; - } - - // Continue for convergence with dynamic topology still active - let extraRounds = 0; - while (extraRounds < 100) { - if (extraRounds % 5 === 0) { - network = applyDynamicTopologyChanges(network); - } - network = gossipRounds(network, recordId, 5); - extraRounds += 5; - rounds += 5; - } - - if (recordsDeleted) { - deletedCount++; - totalDeletionRounds += roundsToDeleteRecords; - } - totalRounds += rounds; - - const stats = getClusterStats(network, recordId); - finalRecords += stats.recordCount; - finalTombstones += stats.tombstoneCount; - } - - printSimulationResult({ - testName: `Dynamic Topology (${trials} trials, connections changing)`, - recordsDeleted: deletedCount === trials, - roundsToDeleteRecords: deletedCount > 0 ? 
Math.round(totalDeletionRounds / deletedCount) : 0, - totalRounds: Math.round(totalRounds / trials), - clusters: [{ - name: 'all', - nodeCount: 20 * trials, - recordCount: finalRecords, - tombstoneCount: finalTombstones, - }], - }); -}; - -const applyNodeChurn = ( - network: NetworkState, - nodeCounter: { value: number } -): NetworkState => { - let result = network; - const nodeIds = Array.from(result.nodes.keys()); - - // Remove 1-2 random nodes (not node-0 which has the tombstone) - const removeCount = Math.floor(Math.random() * 2) + 1; - for (let r = 0; r < removeCount; r++) { - const candidateNodes = nodeIds.filter(id => id !== "node-0" && result.nodes.has(id)); - if (candidateNodes.length <= 5) break; // Keep minimum network size - - const nodeToRemove = candidateNodes[Math.floor(Math.random() * candidateNodes.length)]; - const nodeState = result.nodes.get(nodeToRemove); - if (!nodeState) continue; - - // Remove node and all its peer connections - const newNodes = new Map(result.nodes); - newNodes.delete(nodeToRemove); - - for (const peerId of nodeState.peerIds) { - const peer = newNodes.get(peerId); - if (peer) { - newNodes.set(peerId, { - ...peer, - peerIds: peer.peerIds.filter(p => p !== nodeToRemove), - }); - } - } - result = { nodes: newNodes }; - } - - // Add 1-2 new nodes - const addCount = Math.floor(Math.random() * 2) + 1; - for (let a = 0; a < addCount; a++) { - const newNodeId = `node-${nodeCounter.value++}`; - const newNode = createNode(newNodeId); - - // Connect to 2-4 random existing nodes - const existingNodes = Array.from(result.nodes.keys()); - const connectionCount = Math.min(existingNodes.length, Math.floor(Math.random() * 3) + 2); - const shuffled = existingNodes.sort(() => Math.random() - 0.5); - const peersToConnect = shuffled.slice(0, connectionCount); - - let newNodes = new Map(result.nodes); - let updatedNewNode = newNode; - - for (const peerId of peersToConnect) { - const peer = newNodes.get(peerId)!; - updatedNewNode = 
addPeerToNode(updatedNewNode, peerId); - newNodes.set(peerId, addPeerToNode(peer, newNodeId)); - } - - newNodes.set(newNodeId, updatedNewNode); - result = { nodes: newNodes }; - } - - return result; -}; - -const testNodeChurn = (): void => { - const trials = 50; - const maxRounds = 99999; - let deletedCount = 0; - let totalDeletionRounds = 0; - let totalRounds = 0; - let finalRecords = 0; - let finalTombstones = 0; - - for (let trial = 0; trial < trials; trial++) { - let network = createNetwork(20, 0.4); - const recordId = `churn-${trial}`; - const nodeCounter = { value: 20 }; - - // Create and propagate record - network = addRecordToNetwork(network, "node-0", recordId, "Test Data"); - network = gossipRounds(network, recordId, 15); - - // Create tombstone - network = addTombstoneToNetwork(network, "node-0", recordId); - - // Simulate node churn during gossip - let rounds = 0; - let recordsDeleted = false; - let roundsToDeleteRecords = 0; - - while (rounds < maxRounds && !recordsDeleted) { - // Node churn every 10 rounds - if (rounds % 10 === 0 && rounds > 0) { - network = applyNodeChurn(network, nodeCounter); - } - - const stats = getClusterStats(network, recordId); - if (stats.recordCount === 0) { - recordsDeleted = true; - roundsToDeleteRecords = rounds; - } - network = gossipRounds(network, recordId, 5); - rounds += 5; - } - - // Continue for convergence with node churn still active - let extraRounds = 0; - while (extraRounds < 100) { - if (extraRounds % 10 === 0) { - network = applyNodeChurn(network, nodeCounter); - } - network = gossipRounds(network, recordId, 5); - extraRounds += 5; - rounds += 5; - } - - if (recordsDeleted) { - deletedCount++; - totalDeletionRounds += roundsToDeleteRecords; - } - totalRounds += rounds; - - const stats = getClusterStats(network, recordId); - finalRecords += stats.recordCount; - finalTombstones += stats.tombstoneCount; - } - - printSimulationResult({ - testName: `Node Churn (${trials} trials, nodes joining/leaving)`, - 
recordsDeleted: deletedCount === trials, - roundsToDeleteRecords: deletedCount > 0 ? Math.round(totalDeletionRounds / deletedCount) : 0, - totalRounds: Math.round(totalRounds / trials), - clusters: [{ - name: 'all', - nodeCount: 20 * trials, - recordCount: finalRecords, - tombstoneCount: finalTombstones, - }], - }); -}; - -const applyRandomConfigChanges = ( - network: NetworkState, - trial: number, - recordCounter: { value: number } -): NetworkState => { - let result = network; - const nodeIds = Array.from(result.nodes.keys()); - const changeCount = Math.floor(Math.random() * 4) + 1; - - for (let c = 0; c < changeCount; c++) { - const nodeId = nodeIds[Math.floor(Math.random() * nodeIds.length)]; - const action = Math.random(); - - if (action < 0.3) { - // Add a new unrelated record to this node (simulating config change) - const newRecordId = `config-extra-${trial}-${recordCounter.value++}`; - result = addRecordToNetwork(result, nodeId, newRecordId, "Extra Data" as Data); - } else if (action < 0.6) { - // Modify peer list randomly (add a new peer) - const otherNodes = nodeIds.filter(id => { - const node = result.nodes.get(nodeId); - return id !== nodeId && node && !node.peerIds.includes(id); - }); - if (otherNodes.length > 0) { - const newPeer = otherNodes[Math.floor(Math.random() * otherNodes.length)]; - const nodeState = result.nodes.get(nodeId)!; - const peerState = result.nodes.get(newPeer)!; - const newNodes = new Map(result.nodes); - newNodes.set(nodeId, addPeerToNode(nodeState, newPeer)); - newNodes.set(newPeer, addPeerToNode(peerState, nodeId)); - result = { nodes: newNodes }; - } - } else { - // Remove a random peer (if we have more than 1) - const nodeState = result.nodes.get(nodeId)!; - if (nodeState.peerIds.length > 1) { - const peerToRemove = nodeState.peerIds[Math.floor(Math.random() * nodeState.peerIds.length)]; - const peerState = result.nodes.get(peerToRemove)!; - - // Only remove if peer also has more than 1 connection - if 
(peerState.peerIds.length > 1) { - const newNodes = new Map(result.nodes); - newNodes.set(nodeId, { - ...nodeState, - peerIds: nodeState.peerIds.filter(p => p !== peerToRemove), - }); - newNodes.set(peerToRemove, { - ...peerState, - peerIds: peerState.peerIds.filter(p => p !== nodeId), - }); - result = { nodes: newNodes }; - } - } - } - } - - return result; -}; - -const testRandomConfigurationChanges = (): void => { - const trials = 50; - const maxRounds = 99999; - let deletedCount = 0; - let totalDeletionRounds = 0; - let totalRounds = 0; - let finalRecords = 0; - let finalTombstones = 0; - - for (let trial = 0; trial < trials; trial++) { - let network = createNetwork(20, 0.4); - const primaryRecordId = `config-primary-${trial}`; - const recordCounter = { value: 0 }; - - // Create and propagate primary record - network = addRecordToNetwork(network, "node-0", primaryRecordId, "Primary Data"); - network = gossipRounds(network, primaryRecordId, 15); - - // Create tombstone for primary record - network = addTombstoneToNetwork(network, "node-0", primaryRecordId); - - // Simulate random configuration changes during gossip - let rounds = 0; - let recordsDeleted = false; - let roundsToDeleteRecords = 0; - - while (rounds < maxRounds && !recordsDeleted) { - // Random configuration changes every 8 rounds - if (rounds % 8 === 0 && rounds > 0) { - network = applyRandomConfigChanges(network, trial, recordCounter); - } - - const stats = getClusterStats(network, primaryRecordId); - if (stats.recordCount === 0) { - recordsDeleted = true; - roundsToDeleteRecords = rounds; - } - network = gossipRounds(network, primaryRecordId, 5); - rounds += 5; - } - - // Continue for convergence with config changes still active - let extraRounds = 0; - while (extraRounds < 100) { - if (extraRounds % 8 === 0) { - network = applyRandomConfigChanges(network, trial, recordCounter); - } - network = gossipRounds(network, primaryRecordId, 5); - extraRounds += 5; - rounds += 5; - } - - if 
(recordsDeleted) { - deletedCount++; - totalDeletionRounds += roundsToDeleteRecords; - } - totalRounds += rounds; - - const stats = getClusterStats(network, primaryRecordId); - finalRecords += stats.recordCount; - finalTombstones += stats.tombstoneCount; - } - - printSimulationResult({ - testName: `Random Config Changes (${trials} trials, mixed changes)`, - recordsDeleted: deletedCount === trials, - roundsToDeleteRecords: deletedCount > 0 ? Math.round(totalDeletionRounds / deletedCount) : 0, - totalRounds: Math.round(totalRounds / trials), - clusters: [{ - name: 'all', - nodeCount: 20 * trials, - recordCount: finalRecords, - tombstoneCount: finalTombstones, - }], - }); -}; - -const disconnectNode = ( - network: NetworkState, - nodeId: string -): { network: NetworkState; savedPeers: readonly string[] } => { - const node = network.nodes.get(nodeId); - if (!node) return { network, savedPeers: [] }; - - const savedPeers = node.peerIds; - let newNodes = new Map(network.nodes); - - // Remove this node from all its peers' peer lists - for (const peerId of savedPeers) { - const peer = newNodes.get(peerId); - if (peer) { - newNodes.set(peerId, { - ...peer, - peerIds: peer.peerIds.filter(p => p !== nodeId), - }); - } - } - - // Clear this node's peer list - newNodes.set(nodeId, { ...node, peerIds: [] }); - - return { network: { nodes: newNodes }, savedPeers }; -}; - -const reconnectNode = ( - network: NetworkState, - nodeId: string, - peers: readonly string[] -): NetworkState => { - const node = network.nodes.get(nodeId); - if (!node) return network; - - let newNodes = new Map(network.nodes); - - // Restore this node's peer list (only peers that still exist) - const validPeers = peers.filter(p => newNodes.has(p)); - newNodes.set(nodeId, { ...node, peerIds: validPeers }); - - // Add this node back to each peer's peer list - for (const peerId of validPeers) { - const peer = newNodes.get(peerId); - if (peer && !peer.peerIds.includes(nodeId)) { - newNodes.set(peerId, { - 
...peer, - peerIds: [...peer.peerIds, nodeId], - }); - } - } - - return { nodes: newNodes }; -}; - -const testNodeDropoutAndReconnect = (): void => { - const trials = 50; - const maxRounds = 99999; - const dropoutRounds = 100; - let deletedCount = 0; - let totalDeletionRounds = 0; - let totalRounds = 0; - let finalRecords = 0; - let finalTombstones = 0; - - for (let trial = 0; trial < trials; trial++) { - let network = createNetwork(15, 0.4); - const recordId = `dropout-${trial}`; - const dropoutNodeId = "node-5"; // Node that will drop out - - // Create and propagate record to all nodes including the dropout node - network = addRecordToNetwork(network, "node-0", recordId, "Test Data"); - network = gossipRounds(network, recordId, 20); - - // Verify the dropout node has received the record - const dropoutNode = network.nodes.get(dropoutNodeId)!; - if (!dropoutNode.records.has(recordId)) { - // Force propagation to ensure it has the record - network = gossipRounds(network, recordId, 10); - } - - // Create tombstone at origin node - network = addTombstoneToNetwork(network, "node-0", recordId); - - // Disconnect the dropout node (simulating it going offline) - const { network: disconnectedNetwork, savedPeers } = disconnectNode(network, dropoutNodeId); - network = disconnectedNetwork; - - // Run gossip for 100 rounds while the node is disconnected - // The tombstone should propagate to all other nodes - for (let r = 0; r < dropoutRounds; r += 10) { - network = gossipRounds(network, recordId, 10); - } - - // Reconnect the dropout node - network = reconnectNode(network, dropoutNodeId, savedPeers); - - // Continue running to see if the system converges properly - const result = runToConvergence(network, recordId, maxRounds); - - if (result.recordsDeleted) { - deletedCount++; - totalDeletionRounds += result.roundsToDeleteRecords + dropoutRounds; - } - totalRounds += result.totalRounds + dropoutRounds; + totalRounds += result.totalRounds + roundsSinceStart; const stats = 
getClusterStats(result.network, recordId); finalRecords += stats.recordCount; @@ -1334,7 +973,61 @@ const testNodeDropoutAndReconnect = (): void => { } printSimulationResult({ - testName: `Node Dropout & Reconnect (${trials} trials, ${dropoutRounds} rounds offline)`, + testName: `Staggered Node Recovery (${trials} trials)`, + recordsDeleted: deletedCount === trials, + roundsToDeleteRecords: deletedCount > 0 ? Math.round(totalDeletionRounds / deletedCount) : 0, + totalRounds: Math.round(totalRounds / trials), + clusters: [{ + name: 'all', + nodeCount: 20 * trials, + recordCount: finalRecords, + tombstoneCount: finalTombstones, + onlineCount: 20 * trials, + }], + }); +}; + +const testOriginNodeGoesOffline = (): void => { + const trials = 50; + const maxRounds = 99999; + let deletedCount = 0; + let totalDeletionRounds = 0; + let totalRounds = 0; + let finalRecords = 0; + let finalTombstones = 0; + + for (let trial = 0; trial < trials; trial++) { + let network = createNetwork(15); + const recordId = `origin-offline-${trial}`; + + // Node-0 creates and propagates record + network = addRecordToNetwork(network, "node-0", recordId, "Test Data"); + network = gossipRounds(network, recordId, 20); + + // Node-0 creates tombstone then immediately goes offline + network = addTombstoneToNetwork(network, "node-0", recordId); + network = setNodeOnline(network, "node-0", false); + + // The tombstone should still propagate via other nodes + const result = runToConvergence(network, recordId, maxRounds); + + // Bring node-0 back online for final check + network = setNodeOnline(result.network, "node-0", true); + const finalResult = runToConvergence(network, recordId, maxRounds, 50); + + if (finalResult.recordsDeleted) { + deletedCount++; + totalDeletionRounds += result.roundsToDeleteRecords; + } + totalRounds += result.totalRounds + finalResult.totalRounds; + + const stats = getClusterStats(finalResult.network, recordId); + finalRecords += stats.recordCount; + finalTombstones += 
stats.tombstoneCount; + } + + printSimulationResult({ + testName: `Origin Node Goes Offline (${trials} trials)`, recordsDeleted: deletedCount === trials, roundsToDeleteRecords: deletedCount > 0 ? Math.round(totalDeletionRounds / deletedCount) : 0, totalRounds: Math.round(totalRounds / trials), @@ -1343,11 +1036,12 @@ const testNodeDropoutAndReconnect = (): void => { nodeCount: 15 * trials, recordCount: finalRecords, tombstoneCount: finalTombstones, + onlineCount: 15 * trials, }], }); }; -const testSparseNetwork = (): void => { +const testFlappingNode = (): void => { const trials = 50; const maxRounds = 99999; let deletedCount = 0; @@ -1357,20 +1051,49 @@ const testSparseNetwork = (): void => { let finalTombstones = 0; for (let trial = 0; trial < trials; trial++) { - let network = createNetwork(25, 0.15); - const recordId = `sparse-${trial}`; + let network = createNetwork(15); + const recordId = `flapping-${trial}`; + const flappingNode = "node-7"; - network = addRecordToNetwork(network, "node-0", recordId, "Test"); - network = gossipRounds(network, recordId, 50); + // Propagate record + network = addRecordToNetwork(network, "node-0", recordId, "Test Data"); + network = gossipRounds(network, recordId, 20); + + // Create tombstone network = addTombstoneToNetwork(network, "node-0", recordId); + // Simulate a flapping node (repeatedly going offline/online) + let rounds = 0; + let recordsDeleted = false; + let roundsToDelete = 0; + + while (rounds < maxRounds && !recordsDeleted) { + // Toggle node state every 5 rounds + if (rounds % 10 < 5) { + network = setNodeOnline(network, flappingNode, true); + } else { + network = setNodeOnline(network, flappingNode, false); + } + + const stats = getClusterStats(network, recordId); + if (stats.recordCount === 0) { + recordsDeleted = true; + roundsToDelete = rounds; + } + + network = gossipRounds(network, recordId, 5); + rounds += 5; + } + + // Stabilize the node and run to convergence + network = setNodeOnline(network, 
flappingNode, true); const result = runToConvergence(network, recordId, maxRounds); - if (result.recordsDeleted) { + if (result.recordsDeleted || recordsDeleted) { deletedCount++; - totalDeletionRounds += result.roundsToDeleteRecords; + totalDeletionRounds += recordsDeleted ? roundsToDelete : (result.roundsToDeleteRecords + rounds); } - totalRounds += result.totalRounds; + totalRounds += result.totalRounds + rounds; const stats = getClusterStats(result.network, recordId); finalRecords += stats.recordCount; @@ -1378,32 +1101,96 @@ const testSparseNetwork = (): void => { } printSimulationResult({ - testName: `Sparse Network (${trials} trials, 15% connectivity)`, + testName: `Flapping Node (${trials} trials, node toggles online/offline)`, recordsDeleted: deletedCount === trials, roundsToDeleteRecords: deletedCount > 0 ? Math.round(totalDeletionRounds / deletedCount) : 0, totalRounds: Math.round(totalRounds / trials), clusters: [{ name: 'all', - nodeCount: 25 * trials, + nodeCount: 15 * trials, recordCount: finalRecords, tombstoneCount: finalTombstones, + onlineCount: 15 * trials, + }], + }); +}; + +const testPartitionDuringKeeperElection = (): void => { + const trials = 50; + const maxRounds = 99999; + let deletedCount = 0; + let totalDeletionRounds = 0; + let totalRounds = 0; + let finalRecords = 0; + let finalTombstones = 0; + + for (let trial = 0; trial < trials; trial++) { + let network = createNetwork(20); + const recordId = `partition-during-gc-${trial}`; + + // Propagate record + network = addRecordToNetwork(network, "node-0", recordId, "Test Data"); + network = gossipRounds(network, recordId, 25); + + // Create tombstone and let it propagate briefly + network = addTombstoneToNetwork(network, "node-0", recordId); + network = gossipRounds(network, recordId, 15); + + // Now partition the network during keeper election phase + const partitionA = Array.from({ length: 10 }, (_, i) => `node-${i}`); + const partitionB = Array.from({ length: 10 }, (_, i) => `node-${i + 
10}`); + + network = setMultipleNodesPartition(network, partitionA, "partition-a"); + network = setMultipleNodesPartition(network, partitionB, "partition-b"); + + // Run while partitioned (keeper election happens independently) + network = gossipRounds(network, recordId, 100); + + // Heal partition + network = setMultipleNodesPartition(network, [...partitionA, ...partitionB], "main"); + + const result = runToConvergence(network, recordId, maxRounds); + + if (result.recordsDeleted) { + deletedCount++; + totalDeletionRounds += result.roundsToDeleteRecords + 15 + 100; + } + totalRounds += result.totalRounds + 15 + 100; + + const stats = getClusterStats(result.network, recordId); + finalRecords += stats.recordCount; + finalTombstones += stats.tombstoneCount; + } + + printSimulationResult({ + testName: `Partition During Keeper Election (${trials} trials)`, + recordsDeleted: deletedCount === trials, + roundsToDeleteRecords: deletedCount > 0 ? Math.round(totalDeletionRounds / deletedCount) : 0, + totalRounds: Math.round(totalRounds / trials), + clusters: [{ + name: 'all', + nodeCount: 20 * trials, + recordCount: finalRecords, + tombstoneCount: finalTombstones, + onlineCount: 20 * trials, }], }); }; const runAllTests = (): void => { console.log("=== HyperLogLog Tombstone Simulation ==="); + console.log("Model: Fully connected network with offline nodes and partitions\n"); testSingleNodeDeletion(); - testEarlyTombstoneCreation(); - testBridgedNetwork(); + testNodeOfflineDuringTombstone(); + testMultipleNodesOffline(); + testNetworkPartition(); + testClusterSeparation(); testConcurrentTombstones(); - testNetworkPartitionHeal(); - testSparseNetwork(); - testDynamicTopology(); - testNodeChurn(); - testRandomConfigurationChanges(); - testNodeDropoutAndReconnect(); + testStaggeredNodeRecovery(); + testOriginNodeGoesOffline(); + testFlappingNode(); + testPartitionDuringKeeperElection(); console.log("\n=== Simulation Complete ==="); };