Skip to content

Commit

Permalink
Merge pull request #3 from generalpy101/improve-parser
Browse files Browse the repository at this point in the history
Improved parser Performance
  • Loading branch information
generalpy101 authored Oct 15, 2023
2 parents 880beef + abf5c62 commit 9a7a417
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 133 deletions.
154 changes: 36 additions & 118 deletions redis_clone/redis_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from enum import Enum

PROTOCOL_SEPARATOR = "\r\n"
PROTOCOL_SEPARATOR = b'\r\n'


class Protocol_2_Data_Types(Enum):
    """
    Type markers of the RESP2 wire protocol.

    Every RESP2 value starts with a single byte that identifies its type:

    For Simple Strings, the first byte of the reply is "+"
    For Errors, the first byte of the reply is "-"
    For Integers, the first byte of the reply is ":"
    For Bulk Strings, the first byte of the reply is "$"
    For Arrays, the first byte of the reply is "*"

    The values are ``bytes`` so they can be compared directly against a
    one-byte slice of the receive buffer (``data[0:1]``). Indexing a bytes
    object (``data[0]``) yields an ``int`` and would never compare equal.
    """

    # Each member is defined exactly once: reusing a member name inside an
    # Enum body raises TypeError when the class is created.
    SIMPLE_STRING = b"+"
    ERROR = b"-"
    INTEGER = b":"
    BULK_STRING = b"$"
    ARRAY = b"*"


class Parser:
Expand All @@ -43,44 +42,22 @@ def _parse_v2_client_request(self, data):
"""
if not data:
return None

# Check if first byte is an array specifier else raise exception
if data[0] != Protocol_2_Data_Types.ARRAY.value:
if data[0:1] != Protocol_2_Data_Types.ARRAY.value:
raise Exception("Invalid protocol data")

# Split data according to separator of protocol
# We'll split only once because we need to get number of elements in array
command_items = data.split(PROTOCOL_SEPARATOR, 1)

# Get number of elements in array
# First item will be * rest should be number of elements
num_elements = int(command_items[0][1:])

# Get command name
# Syntax of command is <command-name> <arg1> <arg2> ... <argn>
# So command name is first element after array specifier
# But we have both command name and arguments in the same array
# But we also know that command will be like $<length>\r\n<command-name>\r\n<text>...
# We need first 2 elements after array specifier as full string for parsing command name as we'll use data parser
command_name = self.parse_data(
"\r\n".join(command_items[1].split(PROTOCOL_SEPARATOR)[:2])
)

# Get command arguments
# Syntax of command is <command-name> <arg1> <arg2> ... <argn>
# So command arguments are elements after command name
# But here we have both command name and arguments in the same array
# Since args are also bulk strings, we need is full string for parsing command args as we'll use data parser
# After command name we have $<length>\r\n<arg1>\r\n$<length>\r\n<arg2>\r\n...
# For data parser, we need to 2 items each, length and data
command_args = []
unparsed_args = [
"\r\n".join(command_items[1].split(PROTOCOL_SEPARATOR)[i : i + 2])
for i in range(2, (num_elements * 2) - 1, 2)
]

for arg in unparsed_args:
command_args.append(self.parse_data(arg))
num_elements = int(data[1:data.index(PROTOCOL_SEPARATOR)])

# Getting the commands and arguments using slicing
remaining_data = data.split(PROTOCOL_SEPARATOR, num_elements * 2 + 1)[1:]
command_name = self._parse_bulk_string(remaining_data[0] + PROTOCOL_SEPARATOR + remaining_data[1])

# Parsing the arguments
command_args = [self._parse_bulk_string(remaining_data[i] + PROTOCOL_SEPARATOR + remaining_data[i+1]) for i in range(2, num_elements * 2, 2)]

return command_name, command_args

def parse_data(self, data):
    r"""
    Parse a single RESP2 value and return it as a Python object.

    Data format differs based on the type of data but general syntax is
    <type>[data-specific-fields\r\n]<data>\r\n

    Args:
        data: Raw ``bytes`` of one encoded value, starting with its
            one-byte type marker.

    Returns:
        Whatever the type-specific parser returns for ``data``.

    Raises:
        Exception: If the protocol version is not 2, or if ``data`` is
            empty or starts with an unknown type marker.
    """
    if self.protocol_version != 2:
        raise Exception("Protocol version not supported")

    # Slice instead of indexing: data[0] on bytes is an int, which can never
    # match the one-byte markers stored in Protocol_2_Data_Types. Looking the
    # marker up by value also rejects empty input and unknown markers.
    try:
        data_type = Protocol_2_Data_Types(data[0:1])
    except ValueError:
        raise Exception("Invalid protocol data") from None

    # Dispatch table: the data types are mutually exclusive
    parsing_funcs = {
        Protocol_2_Data_Types.SIMPLE_STRING: self._parse_simple_string,
        Protocol_2_Data_Types.ERROR: self._parse_error,
        Protocol_2_Data_Types.INTEGER: self._parse_integer,
        Protocol_2_Data_Types.BULK_STRING: self._parse_bulk_string,
        Protocol_2_Data_Types.ARRAY: self._parse_array,
    }

    return parsing_funcs[data_type](data)

def _parse_simple_string(self, data):
"""
Simple Strings are used to transmit non binary safe strings with minimal overhead.
They are encoded in the following way:
+<data>\r\n
"""
# Split data according to separator of protocol
data_items = data.split(PROTOCOL_SEPARATOR)

# Get data
# Syntax of simple string is +<data>
# So data is second element after simple string specifier
data = data_items[0][1:]

return data
return data[1:-2].decode('utf-8')

def _parse_error(self, data):
"""
Errors are used in order to signal client errors.
They are encoded in the following way:
-<data>\r\n
"""
# Split data according to separator of protocol
data_items = data.split(PROTOCOL_SEPARATOR)

# Get data
# Syntax of error is -<data>
# So data is second element after error specifier
data = data_items[0][1:]

return data
return data[1:-2].decode('utf-8')

def _parse_integer(self, data):
"""
Expand All @@ -149,15 +103,7 @@ def _parse_integer(self, data):
:[<+|->]<value>\r\n
An optional plus (+) or minus (-) as the sign.
"""
# Split data according to separator of protocol
data_items = data.split(PROTOCOL_SEPARATOR)

# Get data
# Syntax of integer is :[<+|->]<value>
# So data is second element after integer specifier
data = data_items[0][1:]

return int(data)
return int(data[1:-2].decode('utf-8'))

def _parse_bulk_string(self, data):
"""
Expand All @@ -166,25 +112,9 @@ def _parse_bulk_string(self, data):
$<length>\r\n<data>\r\n
Where length is the number of bytes in data
"""

# Split data according to separator of protocol
data_items = data.split(PROTOCOL_SEPARATOR)

# Get length
# Syntax of bulk string is $<length>
# So length is second element after bulk string specifier
length = int(data_items[0][1:])

# Get data
# Syntax of bulk string is $<length>\r\n<data>\r\n
# So data is third element after bulk string specifier
data = data_items[1]

# Check if length of data is same as length specified
if len(data) != length:
raise Exception("Invalid protocol data")

return data
length = int(data[1:data.index(PROTOCOL_SEPARATOR)])
# Get data from index after separator till length of data
return data[data.index(PROTOCOL_SEPARATOR) + 2:data.index(PROTOCOL_SEPARATOR) + 2 + length].decode('utf-8')

def _parse_array(self, data):
"""
Expand All @@ -193,19 +123,7 @@ def _parse_array(self, data):
*<number-of-elements>\r\n<element-1>...<element-n>
Where each element has its own type specifier
"""
# Split data according to separator of protocol
data_items = data.split(PROTOCOL_SEPARATOR)

# Get number of elements in array
# First item will be * rest should be number of elements
num_elements = int(data_items[0][1:])

# Get elements
# Syntax of array is *<number-of-elements>\r\n<element-1>...<element-n>
# So elements are from second element after array specifier to end
# We need to parse each element
elements = []
for element in data_items[1:]:
elements.append(self.parse_data(element))

return elements
num_elements = int(data[1:data.index(PROTOCOL_SEPARATOR)])
remaining_data = data.split(PROTOCOL_SEPARATOR, num_elements * 2 + 1)[1:]
# Need to parse each element in the array
return [self.parse_data(remaining_data[i] + PROTOCOL_SEPARATOR + remaining_data[i+1]) for i in range(0, num_elements * 2, 2)]
17 changes: 8 additions & 9 deletions redis_clone/response_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,8 @@ def _build_protocol_2_error(self, data):
"""
# Syntax of error is -<data>
# So data is second element after error specifier
data = f"-{data}{PROTOCOL_SEPARATOR}"

return data.encode("utf-8")
data = b"-" + data.encode("utf-8") + PROTOCOL_SEPARATOR
return data

def _build_protocol_2_simple_string(self, data):
"""
Expand All @@ -58,9 +57,8 @@ def _build_protocol_2_simple_string(self, data):
"""
# Syntax of simple string is +<data>
# So data is second element after simple string specifier
data = f"+{data}{PROTOCOL_SEPARATOR}"

return data.encode("utf-8")
data = b"+" + data.encode("utf-8") + PROTOCOL_SEPARATOR
return data

def _build_protocol_2_bulk_string(self, data):
"""
Expand All @@ -73,10 +71,11 @@ def _build_protocol_2_bulk_string(self, data):

# If data is None then return nil value
if data is None:
return f"${-1}{PROTOCOL_SEPARATOR}".encode("utf-8")
return b"$-1" + PROTOCOL_SEPARATOR
else:
# Syntax of bulk string is $<data length>
# So data is second element after bulk string specifier
data = f"${len(data)}{PROTOCOL_SEPARATOR}{data}{PROTOCOL_SEPARATOR}"
length = str(len(data)).encode("utf-8")
data = b"$" + length + PROTOCOL_SEPARATOR + data.encode("utf-8") + PROTOCOL_SEPARATOR

return data.encode("utf-8")
return data
2 changes: 0 additions & 2 deletions redis_clone/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,6 @@ async def _handle_connection(self, reader, writer):
break

logger.info(f"Received data: {data}")
# Convert bytes to string
data = data.decode("utf-8")
command_name, command_args = self.parser.parse_client_request(data)
logger.info(f"Command name: {command_name}")
logger.info(f"Command args: {command_args}")
Expand Down
6 changes: 3 additions & 3 deletions tests/test_client_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def test_initial_command_request(self):
parser = Parser(protocol_version=2)

# Test initial connection
test_str = "*1\r\n$7\r\nCOMMAND\r\n"
test_str = b"*1\r\n$7\r\nCOMMAND\r\n"
command, args = self.parser.parse_client_request(test_str)

assert command == "COMMAND"
Expand All @@ -21,7 +21,7 @@ def test_set_command_request(self):
Test SET command request
"""
# Test initial connection
test_str = "*3\r\n$3\r\nSET\r\n$5\r\nmykey\r\n$7\r\nmyvalue\r\n"
test_str = b"*3\r\n$3\r\nSET\r\n$5\r\nmykey\r\n$7\r\nmyvalue\r\n"
command, args = self.parser.parse_client_request(test_str)

assert command == "SET"
Expand All @@ -32,7 +32,7 @@ def test_get_command_request(self):
Test GET command request
"""
# Test initial connection
test_str = "*2\r\n$3\r\nGET\r\n$5\r\nmykey\r\n"
test_str = b"*2\r\n$3\r\nGET\r\n$5\r\nmykey\r\n"
command, args = self.parser.parse_client_request(test_str)

assert command == "GET"
Expand Down
1 change: 0 additions & 1 deletion tests/test_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ def client():

def test_ping(client):
    """PING against the running server is reported as success by the client."""
    response = client.ping()
    # Identity check instead of `== True`, which would also pass for 1.
    # NOTE(review): assumes the fixture's ping() returns a real bool, as
    # redis-py does; confirm for the client in use.
    assert response is True


Expand Down

0 comments on commit 9a7a417

Please sign in to comment.