Let Barbie extraction proceed by skipping over the offending bytecode with odd datum types.
npjg · Jun 5, 2024 · eccf44b · eccf44b
1 parent b9595ab
commit eccf44b
Show file tree

Hide file tree

Showing 2 changed files with 41 additions and 2 deletions.
diff --git a/src/MediaStation/Context.py b/src/MediaStation/Context.py
@@ -377,8 +377,41 @@ def read_header_section(self, chunk, reading_stage = False):
                 return False
 
         elif (Context.SectionType.FUNCTION == section_type):
-            function = Script(chunk, in_independent_asset_chunk = True)
-            self.assets.update({function.id: function})
+            try:
+                function = Script(chunk, in_independent_asset_chunk = True)
+                self.assets.update({function.id: function})
+            except BinaryParsingError as e:
+                # TODO: This handler exists because of an odd bytecode sequence in Barbie
+                # (117.CXT), around 0x188d and 0x18d9 in "function_5ps1_GetSavedGames".
+                # Seemingly nonsensical datums of type 0x0230 appear right in
+                # the middle of otherwise normal bytecode.
+                #
+                # Here is an example of what happens, where the datum
+                # type is indicated by `^` and the value is indicated by `-`.
+                # The offending sequence is indicated by `!`.
+                #  0300 0A00 0200 0603 0001 0030 0200 0603 0001 00
+                #  ^    -    ^    - ^    -    !! !!!! !!^    -   
+                #                             30 0200 0603 0001 00
+                #                             !! !!!! !!^    -    
+                #                             30 0200 0603 0001 00
+                #                             !! !!!! !!^    -
+                #                             30 0200 0603 0001 00
+                #                             !! !!!! !!^    -
+                #                             30 0200 0603 0001 00
+                #                             !! !!!! !!^    -
+                #                             30 0200 0603 0001 00 
+                #                             !! !!!! !!^    -
+                #                             30 0200 0603 0001 00
+                #                             !! !!!! !!^    -
+                #           02 0006 0300 0100 3002 0006 0300 0100 3003 0067 0003 00DB 0003 00BA 00                          
+                #           ^    -  ^    -    !!!! !!!! ^    -    !!^    -    ^    -    ^    -
+                # It is perfectly acceptable for single-byte datums to throw
+                # off the alignment until the end of the chunk, so misalignment
+                # is not the problem. I haven't been able to figure out the actual
+                # cause, so to allow extraction to proceed we just skip the
+                # bytecode for now.
+                print(f'WARNING: Parsing error in bytecode. The entire bytecode chunk will be skipped. {e}')
+                chunk.skip()
 
         elif (Context.SectionType.END == section_type):
             # TODO: Figure out what these are.

diff --git a/src/MediaStation/Riff/Chunk.py b/src/MediaStation/Riff/Chunk.py
@@ -26,6 +26,12 @@ def __init__(self, stream, fourcc_length = 4):
             raise ZeroLengthChunkError('Encountered a zero-length chunk. This usually indicates corrupted data - maybe a CD-ROM read error.')
         self.data_start_pointer = stream.tell()
 
+    ## Skips over the rest of this chunk by reading and discarding every byte between the stream's
+    ## current position and self.end_pointer, leaving the stream at what should be the next chunk/subfile.
+    def skip(self):
+        bytes_remaining_in_chunk = self.end_pointer - self.stream.tell()  # Unread bytes left in this chunk.
+        self.stream.read(bytes_remaining_in_chunk)  # Result is discarded. NOTE(review): assumes the count is >= 0 - confirm.
+
     ## Reads the given number of bytes from the chunk, or throws an error if there is an attempt
     ## to read past the end of the chunk. Generally this is the only byte reading method that should
     ## be called directly because it includes this protection.