Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-128131: Completely support random read access of uncompressed unencrypted files in ZipFile #128143

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
47 changes: 47 additions & 0 deletions Lib/test/test_zipfile/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3447,6 +3447,53 @@ def test_too_short(self):
self.assertEqual(
b"zzz", zipfile._Extra.strip(b"zzz", (self.ZIP64_EXTRA,)))

class StoredZipExtFileRandomAccessTest(unittest.TestCase):
def test_random_access(self):
from _pyio import BytesIO
class StatIO(BytesIO):
def __init__(self):
super().__init__()
self.bytes_read = 0

def read(self, size=-1):
bs = super().read(size)
self.bytes_read += len(bs)
return bs
5ec1cff marked this conversation as resolved.
Show resolved Hide resolved

def get_bytes_read(self):
return self.bytes_read
picnixz marked this conversation as resolved.
Show resolved Hide resolved

sio = StatIO()
# 100000 bytes
txt = b'0123456789'*10000
picnixz marked this conversation as resolved.
Show resolved Hide resolved

# Check seek on a file
with zipfile.ZipFile(sio, "w", compression=zipfile.ZIP_STORED) as zipf:
zipf.writestr("foo.txt", txt)

with zipfile.ZipFile(sio, "r") as zipf:
with zipf.open("foo.txt", "r") as fp:
br = sio.get_bytes_read()
picnixz marked this conversation as resolved.
Show resolved Hide resolved
fp.seek(50000, os.SEEK_CUR)
self.assertEqual(sio.get_bytes_read() - br, 0, 'seek produces redundant read!')
picnixz marked this conversation as resolved.
Show resolved Hide resolved

b = fp.read(100)
self.assertEqual(b, txt[:100])
picnixz marked this conversation as resolved.
Show resolved Hide resolved

# seek length must be greater than ZipExtFile.MIN_READ_SIZE (4096)
5ec1cff marked this conversation as resolved.
Show resolved Hide resolved
# backward seek
picnixz marked this conversation as resolved.
Show resolved Hide resolved
br = sio.get_bytes_read()
fp.seek(5000, os.SEEK_CUR)
b = fp.read(100)
self.assertEqual(b, txt[50000:50100])
picnixz marked this conversation as resolved.
Show resolved Hide resolved
self.assertLessEqual(sio.get_bytes_read() - br, 4096, 'read redundant bytes during backward seek!')

# forward seek
br = sio.get_bytes_read()
fp.seek(-40000, os.SEEK_CUR)
b = fp.read(100)
self.assertEqual(b, txt[10100:10200])
self.assertLessEqual(sio.get_bytes_read() - br, 4096, 'read redundant bytes during forward seek!')

# Run the test suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()
5 changes: 4 additions & 1 deletion Lib/zipfile/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1162,13 +1162,16 @@ def seek(self, offset, whence=os.SEEK_SET):
self._offset = buff_offset
read_offset = 0
# Fast seek uncompressed unencrypted file
elif self._compress_type == ZIP_STORED and self._decrypter is None and read_offset > 0:
elif self._compress_type == ZIP_STORED and self._decrypter is None:
# disable CRC checking after first seeking - it would be invalid
self._expected_crc = None
# seek actual file taking already buffered data into account
read_offset -= len(self._readbuffer) - self._offset
self._fileobj.seek(read_offset, os.SEEK_CUR)
self._left -= read_offset
self._compress_left -= read_offset
5ec1cff marked this conversation as resolved.
Show resolved Hide resolved
if self._eof and read_offset < 0:
self._eof = False
read_offset = 0
# flush read buffer
self._readbuffer = b''
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Support fast seeking in both directions (backward as well as forward) in uncompressed, unencrypted :class:`!zipfile.ZipExtFile` objects.
Loading