From b9dedc9950a820b49a707b0e0c862224558145d0 Mon Sep 17 00:00:00 2001 From: Jayaprabhakar Kadarkarai Date: Tue, 23 Apr 2024 09:26:40 -0700 Subject: [PATCH 1/4] FizzBee spec for the Venice's LeaderFollower protocol This is a literal transilation of TLA+ syntax to FizzBee syntax. --- specs/fizzbee/.gitignore | 2 + .../LeaderFollower/VeniceLeaderFollower.fizz | 136 ++++++++++++++++++ specs/fizzbee/LeaderFollower/fizz.yaml | 13 ++ 3 files changed, 151 insertions(+) create mode 100644 specs/fizzbee/.gitignore create mode 100644 specs/fizzbee/LeaderFollower/VeniceLeaderFollower.fizz create mode 100644 specs/fizzbee/LeaderFollower/fizz.yaml diff --git a/specs/fizzbee/.gitignore b/specs/fizzbee/.gitignore new file mode 100644 index 0000000000..334994886f --- /dev/null +++ b/specs/fizzbee/.gitignore @@ -0,0 +1,2 @@ +*.json +**/out/ diff --git a/specs/fizzbee/LeaderFollower/VeniceLeaderFollower.fizz b/specs/fizzbee/LeaderFollower/VeniceLeaderFollower.fizz new file mode 100644 index 0000000000..59d5dbbc64 --- /dev/null +++ b/specs/fizzbee/LeaderFollower/VeniceLeaderFollower.fizz @@ -0,0 +1,136 @@ +# The FizzBee specification for the Leader-Follower replication protocol +# used in Venice data storage platform. +# The TLA+ equivalent of this specification can be found at: +# https://github.com/linkedin/venice/tree/main/specs/TLA%2B/LeaderFollower +# Summary of the protocol: +# - The protocol consists of N nodes, each of which can be in one of two states: LEADER or FOLLOWER. +# - Any node can assume the role of a leader if there is no leader at this moment (The leader election is +# not modelled in this spec. +# - Clients publish messages to the real-time topic, which is consumed by the leader. +# - The leader appends the consumed messages to the version topic. +# - The followers consume messages from the version topic. +# +# The protocol is correct if the persisted records of all nodes are eventually consistent. + +# Note: The comments in this spec are written as a tutorial for the FizzBee language. +# In a real example, the comments should describe why you do what you do, not what you do. + +# Assertions are specified with assertion keyword. The body of the assertion is similar +# to a python function returning a boolean. +always eventually assertion ReplicasConsistent: + for i in NODE_IDS: + for j in range(i + 1, len(nodes)): + # Check if the persisted records of the current pair of nodes are equal + if nodes[i]['persistedRecords'] != nodes[j]['persistedRecords']: + return False + + return True + +# Init is a special action called when the model starts. It is used to initialize the state of the model. +action Init: + # Currently, FizzBee does not support parameters. So, we have to define these as variables with uppercase. + KEYS = ('key1', 'key2') + VALUES = ('value1', 'value2') + LEADER = 'LEADER' + FOLLOWER = 'FOLLOWER' + N_NODES = 3 + NODE_IDS = range(N_NODES) + + # Initialize the state of each node + real_time_topic = [] + version_topic = [] + nodes = [ + { + 'state': 'FOLLOWER', + 'rtOffset': 0, + 'vtOffset': 0, + 'persistedRecords': {} + } + for i in NODE_IDS + ] + + +# The action keyword is used to define an action. The atomic keyword implies +# there are no yield points within the statements. +atomic action PromoteLeader: + # A FOLLOWER ca be promoted to be a LEADER if and only if there is no leader in the system. + + if [node for node in nodes if node['state'] == 'LEADER']: + return + # `any` is a special keyword that creates non-determinism or alternate timelines. + # This is similar in structure to `for`, but in each timeline, only one of the + # non-deterministic choices is executed. + any id in NODE_IDS: + nodes[id]['state'] = 'LEADER' + + +atomic action DemoteLeader: + any id in NODE_IDS: + if nodes[id]['state'] == 'LEADER': + nodes[id]['state'] = 'FOLLOWER' + + +atomic action ClientProducesToVenice: + any kv in [(k, v) for k in KEYS for v in VALUES]: + real_time_topic.append(kv) + + +# The fair keyword implies this action is weakly fair. That is, the action is always enabled, +# it will eventually happen. +atomic fair action LeaderConsume: + any id in NODE_IDS: + if nodes[id]['state'] == 'LEADER': + if nodes[id]['vtOffset'] >= len(version_topic): + real_time_consume(id) + else: + version_topic_consume(id) + + +# The strong modifier to the fair keyword implies this action is strongly fair. That is, the action +# gets enabled repeatedly (that is it may get disabled in between), it will eventually happen. +atomic fair action FollowerConsume: + any id in NODE_IDS: + if nodes[id]['state'] == 'FOLLOWER' and nodes[id]['vtOffset'] < len(version_topic): + version_topic_consume(id) + + +atomic func real_time_consume(nodeId): + # A leader consuming unprocessed msgs from the real-time topic. + + # Check if the node's offset is within the bounds of the real-time topic + if nodes[nodeId]['rtOffset'] >= len(real_time_topic): + return + + # Update the node's state and persisted records + offset = nodes[nodeId]['rtOffset'] + key, value = real_time_topic[offset][0], real_time_topic[offset][1] + + nodes[nodeId]['rtOffset'] += 1 + nodes[nodeId]['persistedRecords'][key] = value + + # Append the consumed event to the version topic + version_topic.append((key, value, offset)) + + +atomic func version_topic_consume(nodeId): + # A node consuming events from the version topic. + + # Update the node's state and persisted records + offset = nodes[nodeId]['vtOffset'] + key, value = version_topic[offset][0], version_topic[offset][1] + nodes[nodeId]['vtOffset'] += 1 + nodes[nodeId]['persistedRecords'][key] = value + + +atomic action LeaderStaysAsLeader: + # This action is a hack to prevent deadlock error in the model checker. + # The model checker will identify deadlock error if there is no action to execute. + # Sometimes to speed up model checking and limit the infinite state space, + # we can limit the number times certain actions can execute. + # For example, if we limit the number of times a leader can be demoted, then after + # consuming all the messages, and the leader is demoted the specified number of times, + # and promoted again, the model checker will not have anything to execute. + # This action is a hack to prevent that deadlock error. + # If there is already a LEADER, no changes to the system is a valid behavior. + if [node for node in nodes if node['state'] == 'LEADER']: + pass diff --git a/specs/fizzbee/LeaderFollower/fizz.yaml b/specs/fizzbee/LeaderFollower/fizz.yaml new file mode 100644 index 0000000000..1533b10362 --- /dev/null +++ b/specs/fizzbee/LeaderFollower/fizz.yaml @@ -0,0 +1,13 @@ +liveness: strict +deadlock_detection: true +options: + max_actions: 200 +actionOptions: + ClientProducesToVenice: + # Limit the number of times ClientProducesToVenice can be executed. + maxActions: 3 + DemoteLeader: + # Limit the number of times DemoteLeader can be executed. + # Note: The PromoteLeader action is not limited, it must execute at least 1 more than the number of times + # a leader can be demoted, to ensure there is eventually be at least 1 leader. + maxActions: 1 From c8b49f4e081a03c0d549f35f65cf27b2b53f5806 Mon Sep 17 00:00:00 2001 From: Jayaprabhakar Kadarkarai Date: Tue, 23 Apr 2024 11:39:41 -0700 Subject: [PATCH 2/4] README --- specs/fizzbee/README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 specs/fizzbee/README.md diff --git a/specs/fizzbee/README.md b/specs/fizzbee/README.md new file mode 100644 index 0000000000..d4f7209921 --- /dev/null +++ b/specs/fizzbee/README.md @@ -0,0 +1,10 @@ +This directory contains the formal specs for the FizzBee model checker. + +More about the tool and the instructions: [https://github.com/fizzbee-io/fizzbee?tab=readme-ov-file#run-a-model-checker](Run the model checker) + +Once installed from source and set the PATH, you can run with + +``` +fizz specs/fizzbee/LeaderFollower/VeniceLeaderFollower.fizz +``` + From c721c5ee3de1b6e7c31663f9e5e89a1c35a7690b Mon Sep 17 00:00:00 2001 From: Jayaprabhakar Kadarkarai Date: Sat, 18 May 2024 13:36:50 -0700 Subject: [PATCH 3/4] Some review comments to match the spec to the implementation. 1. Maintain a bool variable newly_promoted to indicate a new leader who has not caught up on previous version topics. 2. Continue to update vtOffset and rtOffset for each node irrespective of whether the node is a leader or follower. 3. Make the leader and follower consumption serial instead of atomic - The leader will first post to version_topic, then, update the local datastore, then update the offsets - Similarly, the follower will first update its local datastore and then update the offsets. The system can crash at either of these points, and the design is verified to work even if the system crashes in between --- .../LeaderFollower/VeniceLeaderFollower.fizz | 83 +++++++++++++------ specs/fizzbee/LeaderFollower/fizz.yaml | 5 +- 2 files changed, 60 insertions(+), 28 deletions(-) diff --git a/specs/fizzbee/LeaderFollower/VeniceLeaderFollower.fizz b/specs/fizzbee/LeaderFollower/VeniceLeaderFollower.fizz index 59d5dbbc64..3fd949d23c 100644 --- a/specs/fizzbee/LeaderFollower/VeniceLeaderFollower.fizz +++ b/specs/fizzbee/LeaderFollower/VeniceLeaderFollower.fizz @@ -26,15 +26,18 @@ always eventually assertion ReplicasConsistent: return True + +# Currently, FizzBee does not support parameters. So, we have to define these as variables with uppercase. +KEYS = ('key1', 'key2') +VALUES = ('value1', 'value2') +LEADER = 'LEADER' +FOLLOWER = 'FOLLOWER' +N_NODES = 3 +NODE_IDS = range(N_NODES) + + # Init is a special action called when the model starts. It is used to initialize the state of the model. action Init: - # Currently, FizzBee does not support parameters. So, we have to define these as variables with uppercase. - KEYS = ('key1', 'key2') - VALUES = ('value1', 'value2') - LEADER = 'LEADER' - FOLLOWER = 'FOLLOWER' - N_NODES = 3 - NODE_IDS = range(N_NODES) # Initialize the state of each node real_time_topic = [] @@ -42,8 +45,12 @@ action Init: nodes = [ { 'state': 'FOLLOWER', + # If true, the node became a leader, but not yet caught up with the version topic. + 'newly_promoted': False, + 'rtOffset': 0, 'vtOffset': 0, + 'persistedRecords': {} } for i in NODE_IDS @@ -62,12 +69,14 @@ atomic action PromoteLeader: # non-deterministic choices is executed. any id in NODE_IDS: nodes[id]['state'] = 'LEADER' + nodes[id]['newly_promoted'] = True atomic action DemoteLeader: any id in NODE_IDS: if nodes[id]['state'] == 'LEADER': nodes[id]['state'] = 'FOLLOWER' + nodes[id]['newly_promoted'] = False atomic action ClientProducesToVenice: @@ -77,18 +86,27 @@ atomic action ClientProducesToVenice: # The fair keyword implies this action is weakly fair. That is, the action is always enabled, # it will eventually happen. -atomic fair action LeaderConsume: +atomic fair action NewLeaderCatchup: any id in NODE_IDS: - if nodes[id]['state'] == 'LEADER': - if nodes[id]['vtOffset'] >= len(version_topic): - real_time_consume(id) - else: + if nodes[id]['state'] == 'LEADER' and nodes[id]['newly_promoted']: + if nodes[id]['vtOffset'] < len(version_topic): version_topic_consume(id) + else: + nodes[id]['newly_promoted'] = False + + +# The fair keyword implies this action is weakly fair. That is, the action is always enabled, +# it will eventually happen. +atomic fair action EstablishedLeaderConsume: + any id in NODE_IDS: + if nodes[id]['state'] == 'LEADER' and not nodes[id]['newly_promoted']: + real_time_consume(id) + # The strong modifier to the fair keyword implies this action is strongly fair. That is, the action # gets enabled repeatedly (that is it may get disabled in between), it will eventually happen. -atomic fair action FollowerConsume: +atomic fair action FollowerConsume: any id in NODE_IDS: if nodes[id]['state'] == 'FOLLOWER' and nodes[id]['vtOffset'] < len(version_topic): version_topic_consume(id) @@ -101,25 +119,38 @@ atomic func real_time_consume(nodeId): if nodes[nodeId]['rtOffset'] >= len(real_time_topic): return - # Update the node's state and persisted records - offset = nodes[nodeId]['rtOffset'] - key, value = real_time_topic[offset][0], real_time_topic[offset][1] + offset, key, value = -1, '', '' - nodes[nodeId]['rtOffset'] += 1 - nodes[nodeId]['persistedRecords'][key] = value + atomic: + offset = nodes[nodeId]['rtOffset'] + key, value = real_time_topic[offset][0], real_time_topic[offset][1] - # Append the consumed event to the version topic - version_topic.append((key, value, offset)) + # Append the consumed event to the version topic + if (key, value, offset) not in version_topic: + version_topic.append((key, value, offset)) + else: + pass + + # Update the node's state and persisted records + nodes[nodeId]['persistedRecords'][key] = value + atomic: + nodes[nodeId]['rtOffset'] += 1 + nodes[nodeId]['vtOffset'] += 1 -atomic func version_topic_consume(nodeId): +func version_topic_consume(nodeId): # A node consuming events from the version topic. - # Update the node's state and persisted records - offset = nodes[nodeId]['vtOffset'] - key, value = version_topic[offset][0], version_topic[offset][1] - nodes[nodeId]['vtOffset'] += 1 - nodes[nodeId]['persistedRecords'][key] = value + atomic: + # Update the node's state and persisted records + offset = nodes[nodeId]['vtOffset'] + key, value, rt_offset = version_topic[offset][0], version_topic[offset][1], version_topic[offset][2] + nodes[nodeId]['persistedRecords'][key] = value + + atomic: + # Update the node's state and persisted records + nodes[nodeId]['rtOffset'] += 1 + nodes[nodeId]['vtOffset'] += 1 atomic action LeaderStaysAsLeader: diff --git a/specs/fizzbee/LeaderFollower/fizz.yaml b/specs/fizzbee/LeaderFollower/fizz.yaml index 1533b10362..7805c5d8c3 100644 --- a/specs/fizzbee/LeaderFollower/fizz.yaml +++ b/specs/fizzbee/LeaderFollower/fizz.yaml @@ -2,12 +2,13 @@ liveness: strict deadlock_detection: true options: max_actions: 200 + max_concurrent_actions: 1 actionOptions: ClientProducesToVenice: # Limit the number of times ClientProducesToVenice can be executed. - maxActions: 3 + max_actions: 3 DemoteLeader: # Limit the number of times DemoteLeader can be executed. # Note: The PromoteLeader action is not limited, it must execute at least 1 more than the number of times # a leader can be demoted, to ensure there is eventually be at least 1 leader. - maxActions: 1 + max_actions: 1 From 3ff4b756d5743133993017d3ff7aebdbe75e5ebd Mon Sep 17 00:00:00 2001 From: jayaprabhakar Date: Wed, 5 Jun 2024 01:49:48 -0700 Subject: [PATCH 4/4] Make real_time_consume also non-atomic --- .../LeaderFollower/VeniceLeaderFollower.fizz | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/specs/fizzbee/LeaderFollower/VeniceLeaderFollower.fizz b/specs/fizzbee/LeaderFollower/VeniceLeaderFollower.fizz index 3fd949d23c..ea12324989 100644 --- a/specs/fizzbee/LeaderFollower/VeniceLeaderFollower.fizz +++ b/specs/fizzbee/LeaderFollower/VeniceLeaderFollower.fizz @@ -80,8 +80,15 @@ atomic action DemoteLeader: atomic action ClientProducesToVenice: - any kv in [(k, v) for k in KEYS for v in VALUES]: - real_time_topic.append(kv) + if len(real_time_topic) == 0: + # This reduces the state space, because if there are no messages in the real-time topic, + # first message being (k1,v1) is equivalent to first message being one of [(k1,v2),(k2,v1),(k2,v2)]. + # This reduces the state space by 75% + # Doesn't affect the correctness of the model. + real_time_topic.append((KEYS[0], VALUES[0])) + else: + any kv in [(k, v) for k in KEYS for v in VALUES]: + real_time_topic.append(kv) # The fair keyword implies this action is weakly fair. That is, the action is always enabled, @@ -112,7 +119,7 @@ atomic fair action FollowerConsume: version_topic_consume(id) -atomic func real_time_consume(nodeId): +func real_time_consume(nodeId): # A leader consuming unprocessed msgs from the real-time topic. # Check if the node's offset is within the bounds of the real-time topic