Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved parser Performance #3

Merged
merged 1 commit into from
Oct 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 36 additions & 118 deletions redis_clone/redis_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from enum import Enum

PROTOCOL_SEPARATOR = "\r\n"
PROTOCOL_SEPARATOR = b'\r\n'


class Protocol_2_Data_Types(Enum):
Expand All @@ -11,12 +11,11 @@ class Protocol_2_Data_Types(Enum):
For Bulk Strings, the first byte of the reply is "$"
For Arrays, the first byte of the reply is "*"
"""

SIMPLE_STRING = "+"
ERROR = "-"
INTEGER = ":"
BULK_STRING = "$"
ARRAY = "*"
SIMPLE_STRING = b"+"
ERROR = b"-"
INTEGER = b":"
BULK_STRING = b"$"
ARRAY = b"*"


class Parser:
Expand All @@ -43,44 +42,22 @@ def _parse_v2_client_request(self, data):
"""
if not data:
return None

# Check if first byte is an array specifier else raise exception
if data[0] != Protocol_2_Data_Types.ARRAY.value:
if data[0:1] != Protocol_2_Data_Types.ARRAY.value:
raise Exception("Invalid protocol data")

# Split data according to separator of protocol
# We'll split only once because we need to get number of elements in array
command_items = data.split(PROTOCOL_SEPARATOR, 1)

# Get number of elements in array
# First item will be * rest should be number of elements
num_elements = int(command_items[0][1:])

# Get command name
# Syntax of command is <command-name> <arg1> <arg2> ... <argn>
# So command name is first element after array specifier
# But we have both command name and arguments in the same array
# But we also know that command will be like $<length>\r\n<command-name>\r\n<text>...
# We need first 2 elements after array specifier as full string for parsing command name as we'll use data parser
command_name = self.parse_data(
"\r\n".join(command_items[1].split(PROTOCOL_SEPARATOR)[:2])
)

# Get command arguments
# Syntax of command is <command-name> <arg1> <arg2> ... <argn>
# So command arguments are elements after command name
# But here we have both command name and arguments in the same array
# Since args are also bulk strings, we need is full string for parsing command args as we'll use data parser
# After command name we have $<length>\r\n<arg1>\r\n$<length>\r\n<arg2>\r\n...
# For data parser, we need to 2 items each, length and data
command_args = []
unparsed_args = [
"\r\n".join(command_items[1].split(PROTOCOL_SEPARATOR)[i : i + 2])
for i in range(2, (num_elements * 2) - 1, 2)
]

for arg in unparsed_args:
command_args.append(self.parse_data(arg))
num_elements = int(data[1:data.index(PROTOCOL_SEPARATOR)])

# Getting the commands and arguments using slicing
remaining_data = data.split(PROTOCOL_SEPARATOR, num_elements * 2 + 1)[1:]
command_name = self._parse_bulk_string(remaining_data[0] + PROTOCOL_SEPARATOR + remaining_data[1])

# Parsing the arguments
command_args = [self._parse_bulk_string(remaining_data[i] + PROTOCOL_SEPARATOR + remaining_data[i+1]) for i in range(2, num_elements * 2, 2)]

return command_name, command_args

def parse_data(self, data):
Expand All @@ -90,57 +67,34 @@ def parse_data(self, data):
Data format differs based on the type of data but general syntax is
<type>[data-specific-fields\r\n]<data>\r\n
"""
if self.protocol_version != 2:
raise Exception("Protocol version not supported")

# Get first byte of data to determine type
data_type = data[0]

# Using simple if else ladder because data types are mutually exclusive
if data_type == Protocol_2_Data_Types.SIMPLE_STRING.value:
return self._parse_simple_string(data)
elif data_type == Protocol_2_Data_Types.ERROR.value:
return self._parse_error(data)
elif data_type == Protocol_2_Data_Types.INTEGER.value:
return self._parse_integer(data)
elif data_type == Protocol_2_Data_Types.BULK_STRING.value:
return self._parse_bulk_string(data)
elif data_type == Protocol_2_Data_Types.ARRAY.value:
return self._parse_array(data)
else:
raise Exception("Invalid protocol data")
# Using dictionary mapping for performance
parsing_funcs = {
Protocol_2_Data_Types.SIMPLE_STRING: self._parse_simple_string,
Protocol_2_Data_Types.ERROR: self._parse_error,
Protocol_2_Data_Types.INTEGER: self._parse_integer,
Protocol_2_Data_Types.BULK_STRING: self._parse_bulk_string,
Protocol_2_Data_Types.ARRAY: self._parse_array,
}

return parsing_funcs[data_type](data)

def _parse_simple_string(self, data):
"""
Simple Strings are used to transmit non binary safe strings with minimal overhead.
They are encoded in the following way:
+<data>\r\n
"""
# Split data according to separator of protocol
data_items = data.split(PROTOCOL_SEPARATOR)

# Get data
# Syntax of simple string is +<data>
# So data is second element after simple string specifier
data = data_items[0][1:]

return data
return data[1:-2].decode('utf-8')

def _parse_error(self, data):
"""
Errors are used in order to signal client errors.
They are encoded in the following way:
-<data>\r\n
"""
# Split data according to separator of protocol
data_items = data.split(PROTOCOL_SEPARATOR)

# Get data
# Syntax of error is -<data>
# So data is second element after error specifier
data = data_items[0][1:]

return data
return data[1:-2].decode('utf-8')

def _parse_integer(self, data):
"""
Expand All @@ -149,15 +103,7 @@ def _parse_integer(self, data):
:[<+|->]<value>\r\n
An optional plus (+) or minus (-) as the sign.
"""
# Split data according to separator of protocol
data_items = data.split(PROTOCOL_SEPARATOR)

# Get data
# Syntax of integer is :[<+|->]<value>
# So data is second element after integer specifier
data = data_items[0][1:]

return int(data)
return int(data[1:-2].decode('utf-8'))

def _parse_bulk_string(self, data):
"""
Expand All @@ -166,25 +112,9 @@ def _parse_bulk_string(self, data):
$<length>\r\n<data>\r\n
Where length is the number of bytes in data
"""

# Split data according to separator of protocol
data_items = data.split(PROTOCOL_SEPARATOR)

# Get length
# Syntax of bulk string is $<length>
# So length is second element after bulk string specifier
length = int(data_items[0][1:])

# Get data
# Syntax of bulk string is $<length>\r\n<data>\r\n
# So data is third element after bulk string specifier
data = data_items[1]

# Check if length of data is same as length specified
if len(data) != length:
raise Exception("Invalid protocol data")

return data
length = int(data[1:data.index(PROTOCOL_SEPARATOR)])
# Get data from index after separator till length of data
return data[data.index(PROTOCOL_SEPARATOR) + 2:data.index(PROTOCOL_SEPARATOR) + 2 + length].decode('utf-8')

def _parse_array(self, data):
"""
Expand All @@ -193,19 +123,7 @@ def _parse_array(self, data):
*<number-of-elements>\r\n<element-1>...<element-n>
Where each element has its own type specifier
"""
# Split data according to separator of protocol
data_items = data.split(PROTOCOL_SEPARATOR)

# Get number of elements in array
# First item will be * rest should be number of elements
num_elements = int(data_items[0][1:])

# Get elements
# Syntax of array is *<number-of-elements>\r\n<element-1>...<element-n>
# So elements are from second element after array specifier to end
# We need to parse each element
elements = []
for element in data_items[1:]:
elements.append(self.parse_data(element))

return elements
num_elements = int(data[1:data.index(PROTOCOL_SEPARATOR)])
remaining_data = data.split(PROTOCOL_SEPARATOR, num_elements * 2 + 1)[1:]
# Need to parse each element in the array
return [self.parse_data(remaining_data[i] + PROTOCOL_SEPARATOR + remaining_data[i+1]) for i in range(0, num_elements * 2, 2)]
17 changes: 8 additions & 9 deletions redis_clone/response_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,8 @@ def _build_protocol_2_error(self, data):
"""
# Syntax of error is -<data>
# So data is second element after error specifier
data = f"-{data}{PROTOCOL_SEPARATOR}"

return data.encode("utf-8")
data = b"-" + data.encode("utf-8") + PROTOCOL_SEPARATOR
return data

def _build_protocol_2_simple_string(self, data):
"""
Expand All @@ -58,9 +57,8 @@ def _build_protocol_2_simple_string(self, data):
"""
# Syntax of simple string is +<data>
# So data is second element after simple string specifier
data = f"+{data}{PROTOCOL_SEPARATOR}"

return data.encode("utf-8")
data = b"+" + data.encode("utf-8") + PROTOCOL_SEPARATOR
return data

def _build_protocol_2_bulk_string(self, data):
"""
Expand All @@ -73,10 +71,11 @@ def _build_protocol_2_bulk_string(self, data):

# If data is None then return nil value
if data is None:
return f"${-1}{PROTOCOL_SEPARATOR}".encode("utf-8")
return b"$-1" + PROTOCOL_SEPARATOR
else:
# Syntax of bulk string is $<data length>
# So data is second element after bulk string specifier
data = f"${len(data)}{PROTOCOL_SEPARATOR}{data}{PROTOCOL_SEPARATOR}"
length = str(len(data)).encode("utf-8")
data = b"$" + length + PROTOCOL_SEPARATOR + data.encode("utf-8") + PROTOCOL_SEPARATOR

return data.encode("utf-8")
return data
2 changes: 0 additions & 2 deletions redis_clone/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,6 @@ async def _handle_connection(self, reader, writer):
break

logger.info(f"Received data: {data}")
# Convert bytes to string
data = data.decode("utf-8")
command_name, command_args = self.parser.parse_client_request(data)
logger.info(f"Command name: {command_name}")
logger.info(f"Command args: {command_args}")
Expand Down
6 changes: 3 additions & 3 deletions tests/test_client_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def test_initial_command_request(self):
parser = Parser(protocol_version=2)

# Test initial connection
test_str = "*1\r\n$7\r\nCOMMAND\r\n"
test_str = b"*1\r\n$7\r\nCOMMAND\r\n"
command, args = self.parser.parse_client_request(test_str)

assert command == "COMMAND"
Expand All @@ -21,7 +21,7 @@ def test_set_command_request(self):
Test SET command request
"""
# Test initial connection
test_str = "*3\r\n$3\r\nSET\r\n$5\r\nmykey\r\n$7\r\nmyvalue\r\n"
test_str = b"*3\r\n$3\r\nSET\r\n$5\r\nmykey\r\n$7\r\nmyvalue\r\n"
command, args = self.parser.parse_client_request(test_str)

assert command == "SET"
Expand All @@ -32,7 +32,7 @@ def test_get_command_request(self):
Test GET command request
"""
# Test initial connection
test_str = "*2\r\n$3\r\nGET\r\n$5\r\nmykey\r\n"
test_str = b"*2\r\n$3\r\nGET\r\n$5\r\nmykey\r\n"
command, args = self.parser.parse_client_request(test_str)

assert command == "GET"
Expand Down
1 change: 0 additions & 1 deletion tests/test_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ def client():

def test_ping(client):
response = client.ping()
print(response)
assert response == True


Expand Down
Loading