Commit
Bugfix and improvement in ZIP validator, some optimisations and notes
Will-Banksy committed May 6, 2024
1 parent 16c3a49 commit 8a7d131
Showing 8 changed files with 42 additions and 22 deletions.
11 changes: 11 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion libsearchlight/Cargo.toml
@@ -19,7 +19,7 @@ log = "0.4.20"
 crc32fast = "1.3.2"
 unicode-segmentation = "1.11.0"
 strum = { version = "0.26.1", features = [ "derive" ] }
-flate2 = "1.0.28" # Need for decompressing deflate-compressed ZIP file data
+flate2 = { version = "1.0.28", features = [ "zlib-ng" ] } # Need for decompressing deflate-compressed ZIP file data
 serde_json = "1.0.115"
 
 [dev-dependencies]
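The zlib-ng feature only swaps flate2's backend for the faster zlib-ng implementation; the decompression API is unchanged. As a minimal sketch of the usage the Cargo.toml comment refers to (the function name and input are hypothetical, not from this codebase):

use std::io::Read;
use flate2::read::DeflateDecoder;

/// Inflate a raw deflate stream, as stored in a ZIP local file entry
/// with compression method 8 (deflate)
fn inflate_zip_entry(compressed: &[u8]) -> std::io::Result<Vec<u8>> {
	let mut decoder = DeflateDecoder::new(compressed);
	let mut decompressed = Vec::new();
	decoder.read_to_end(&mut decompressed)?;
	Ok(decompressed)
}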
25 changes: 12 additions & 13 deletions libsearchlight/src/classifiers/jpeg_data.rs
@@ -3,23 +3,17 @@ const FF00_THRESHOLD: u32 = 0; // Larger values seem to cause problems, especial
 const FF00_CERTAINTY_THRESHOLD: u32 = 4;
 
 /// Calculate the Shannon entropy of a slice
-fn shannon_entropy(data: &[u8]) -> f32 {
+fn shannon_entropy(counts: &[u32], data_len: usize) -> f32 {
 	// Can't calculate the entropy without data so return 0. Would panic otherwise
-	if data.len() == 0 {
+	if data_len == 0 {
 		return 0.0;
 	}
 
-	// Count the values
-	let mut counts = [0u32; 256];
-	for &byte in data {
-		counts[byte as usize] += 1;
-	}
-
 	// And calculate the entropy
 	let mut entropy = 0.0;
-	for count in counts {
+	for &count in counts {
 		if count != 0 {
-			let probability = (count as f32) / (data.len() as f32);
+			let probability = (count as f32) / (data_len as f32);
 			entropy -= probability * probability.log2();
 		}
 	}
@@ -37,9 +31,8 @@ fn shannon_entropy(data: &[u8]) -> f32 {
 /// is likely JPEG scan data, and the second contains the index of the likely end of the JPEG scan data
 /// (if it is likely scan data), i.e. the first 0xff that is not followed by 0xd0..=0xd7 or 0x00
 pub fn jpeg_data(cluster: &[u8]) -> (bool, Option<usize>) {
-	// PERF: Could optimise this by both calculating the entropy and doing the analysis in one pass. Perhaps move the count
-	// calculations out of the shannon_entropy fn
-	let entropy = shannon_entropy(cluster);
+	// Initialise the counts for each byte
+	let mut counts = [0u32; 256];
 
 	let mut count_ff00 = 0;
 	// Contains the first instance of a byte sequence that is invalid in a JPEG scan or terminates a JPEG scan,
@@ -49,7 +42,11 @@ pub fn jpeg_data(cluster: &[u8]) -> (bool, Option<usize>) {
 	// RST markers have to be encountered in sequence
 	let mut rst_marker_ordering_valid = true;
 	let mut found_invalid_marker = false;
+	let mut bytes_counted = 0;
 	for i in 0..(cluster.len() - 1) {
+		counts[cluster[i] as usize] += 1;
+		bytes_counted += 1;
+
 		if cluster[i] == 0xff {
 			match cluster[i + 1] {
 				0x00 => {
@@ -86,6 +83,8 @@
 		}
 	}
 
+	let entropy = shannon_entropy(&counts, bytes_counted);
+
 	let entropy_valid = entropy > ENTROPY_THRESHOLD;
 	let contents_valid = count_ff00 >= FF00_THRESHOLD && rst_marker_ordering_valid && !found_invalid_marker;
 
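To make the new calling convention concrete, a hypothetical check (not part of the commit): a uniform byte distribution should give the maximum of 8 bits of entropy per byte.

#[test]
fn entropy_of_uniform_distribution_is_eight_bits() {
	// 256 distinct byte values, each occurring 4 times
	let counts = [4u32; 256];
	let entropy = shannon_entropy(&counts, 4 * 256);
	assert!((entropy - 8.0).abs() < 1e-4);
}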
5 changes: 3 additions & 2 deletions libsearchlight/src/searchlight.rs
@@ -274,7 +274,8 @@ impl Searchlight {
 
 		let mut file = File::create(filepath)?;
 
-		// PERF: Writing to lots of files does seem like a perfect use case for io_uring... but windows... and other platforms...
+		// PERF: Writing to lots of files does seem like a perfect use case for io_uring... but windows... and other platforms... Maybe https://crates.io/crates/nuclei ?
+		// At the very least, write_vectored should be more performant than repeated write_all calls, but does not seem to behave properly on windows, and nevertheless doesn't guarantee everything is written
 		// FIXME: write_vectored may not write everything
 		// file.write_vectored(
 		// 	&fragments.iter().map(|frag| IoSlice::new(&mmap[frag.start..frag.end])).collect::<Vec<IoSlice>>()
@@ -291,7 +292,7 @@
 
 			// BUG: If some text is written to stderr or stdout between writes of the progress, then there will be no
 			// line break between the progress report and the output text. Put a space after the progress % to
-			// make that look less bad but I'm not sure if this is fixable, in a compelling way anyway
+			// make that look less bad but I'm not sure if this is fixable, in a compelling way anyway. Well apart from externalising the progress reporting
 			if log_enabled!(Level::Info) {
 				eprint!("\rProgress: {:.2}% ", (num_carved_files as f32 / match_pairs.len() as f32) * 100.0);
 			}
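For what it's worth, the usual fix for that FIXME is to loop until every slice is consumed, which is what the unstable Write::write_all_vectored does internally. A sketch under the assumption of a sufficiently recent toolchain (IoSlice::advance_slices is a relatively new stabilisation); the helper name is mine, not std's:

use std::io::{self, IoSlice, Write};

/// Keep calling write_vectored until all slices have been written,
/// advancing past whatever each call managed to write
fn write_all_slices<W: Write>(writer: &mut W, mut slices: &mut [IoSlice<'_>]) -> io::Result<()> {
	// Drop any leading empty slices so an all-empty input terminates
	IoSlice::advance_slices(&mut slices, 0);
	while !slices.is_empty() {
		match writer.write_vectored(slices) {
			Ok(0) => return Err(io::ErrorKind::WriteZero.into()),
			Ok(n) => IoSlice::advance_slices(&mut slices, n),
			Err(e) if e.kind() == io::ErrorKind::Interrupted => {}
			Err(e) => return Err(e),
		}
	}
	Ok(())
}

On toolchains without advance_slices, a plain write_all per fragment sidesteps the problem at the cost of one syscall per fragment.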
14 changes: 9 additions & 5 deletions libsearchlight/src/utils.rs
@@ -119,12 +119,13 @@ pub fn generate_fragmentations(cluster_size: usize, fragmentation_range: Range<u
 
 	let mut res = Vec::new();
 
+	// PERF: This is still quite hot, any obvious optimisations I can do?
 	while gap_idx <= clusters.len() - gap_len {
 		// Get all the clusters that are not in the gap, and simplify
-		let mut file_clusters: Vec<Range<usize>> = clusters.iter().enumerate().filter(|(i, _)| *i < gap_idx || *i >= (gap_idx + gap_len)).map(|(_, c)| c.clone()).collect();
-		simplify_ranges(&mut file_clusters);
+		res.push(Vec::with_capacity(num_file_clusters));
 
-		res.push(file_clusters);
+		res.last_mut().unwrap().extend(clusters.iter().enumerate().filter(|(i, _)| *i < gap_idx || *i >= (gap_idx + gap_len)).map(|(_, c)| c.clone()));
+		simplify_ranges(res.last_mut().unwrap());
 
 		gap_idx += 1;
 	}
@@ … @@
 
 /// Takes a vec of assumed in-order, non-overlapping ranges, and where the end of a range is equal to the start of the next range, merges the two ranges into one
-pub fn simplify_ranges<T>(ranges: &mut Vec<Range<T>>) where T: PartialEq {
+// PERF: Changed to Vec::swap_remove with a sort after all removes are done instead of Vec::remove - Needs some testing if this actually helps performance
+pub fn simplify_ranges<T>(ranges: &mut Vec<Range<T>>) where T: PartialEq + Ord + Copy {
 	let mut i = 1;
 	while i < ranges.len() {
 		if ranges[i - 1].end == ranges[i].start {
-			ranges[i - 1].end = ranges.remove(i).end;
+			ranges[i - 1].end = ranges.swap_remove(i).end;
+			i -= 1;
 		}
 
 		i += 1;
 	}
 
+	ranges.sort_unstable_by_key(|r| r.start);
 }
 
 #[cfg(test)]
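One observation on the swap_remove variant (mine, not part of the commit): swap_remove moves the last element into the vacated slot, so the in-order assumption the doc comment relies on stops holding mid-loop, and mergeable pairs can be missed. For example, [0..2, 2..4, 4..6, 6..8, 8..10] comes out as [0..4, 4..8, 8..10] after the sort rather than [0..10]. A single-pass, order-preserving alternative that avoids both Vec::remove's shifting and the final sort, as a sketch:

use std::ops::Range;

/// Merge adjacent ranges in place by compacting towards the front,
/// then truncating; preserves order and runs in O(n)
pub fn simplify_ranges<T: PartialEq + Copy>(ranges: &mut Vec<Range<T>>) {
	let mut write = 0;
	for read in 1..ranges.len() {
		if ranges[write].end == ranges[read].start {
			// Extend the range being built instead of keeping both
			ranges[write].end = ranges[read].end;
		} else {
			write += 1;
			ranges[write] = ranges[read].start..ranges[read].end;
		}
	}
	ranges.truncate(write + 1);
}

Note this also drops the Ord bound, since there is no longer anything to re-sort.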
1 change: 1 addition & 0 deletions libsearchlight/src/validation/jpeg.rs
@@ -31,6 +31,7 @@ impl JpegValidator {
 	/// as either JPEG scan data or not
 	// TODO: We could maybe try and tackle out-of-order JPEG fragmentations using the reset marker orderings (if present)
 	// although apparently they are only present in ~12% of JPEGs (Uzun and Sencar, 2020, https://doi.org/10.1109/TIFS.2019.2953382)
+	// TODO: Ali and Mohamad (2021) manage to tackle intertwined JPEGs using the Coherence of Euclidean Distance (CED) to detect sharp changes in the image https://doi.org/10.1016/j.jksuci.2018.12.007
 	fn reconstruct_scan_data(file_data: &[u8], scan_marker_idx: usize, cluster_size: usize, config: &SearchlightConfig) -> JpegScanReconstructionInfo {
 		let fragmentation_start = utils::next_multiple_of(scan_marker_idx + 1, cluster_size) as usize;
 
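As context for the fragmentation_start line (an illustration; the helper's semantics are assumed from its name and call site): rounding the byte index just past the scan marker up to the next cluster boundary finds the first point at which the file could be fragmented.

/// Hypothetical stand-in for utils::next_multiple_of: round n up to
/// the nearest multiple of m (assumed semantics, m > 0)
fn next_multiple_of(n: usize, m: usize) -> usize {
	n.div_ceil(m) * m
}

// e.g. a scan marker at byte 5000 with 4 KiB clusters:
// next_multiple_of(5001, 4096) == 8192, the first cluster boundary
// strictly after the marker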
1 change: 1 addition & 0 deletions libsearchlight/src/validation/png.rs
@@ -336,6 +336,7 @@ impl PngValidator {
 
 impl FileValidator for PngValidator {
 	// Written using https://www.w3.org/TR/png-3/
+	// TODO: Could try reconstructing out-of-order fragmentations by finding a chunk type and testing X clusters after this chunk type and Y clusters after that chunk type (where X+Y is the number of clusters needed)
 	fn validate(&self, file_data: &[u8], file_match: &MatchPair, _all_matches: &[Match], cluster_size: usize, config: &SearchlightConfig) -> FileValidationInfo {
 		let mut chunk_idx = file_match.start_idx as usize + 8;
 
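For orientation (not from the diff): chunk_idx starts 8 bytes in, past the PNG signature, and then walks chunks whose layout the spec fixes as a 4-byte big-endian data length, 4-byte type, the data, and a 4-byte CRC. A sketch of that step, with hypothetical names:

/// Return the index of the next chunk and this chunk's type, or None
/// if the header would run off the end of the data
fn next_chunk_idx(file_data: &[u8], chunk_idx: usize) -> Option<(usize, [u8; 4])> {
	let data_len = u32::from_be_bytes(file_data.get(chunk_idx..chunk_idx + 4)?.try_into().ok()?) as usize;
	let chunk_type: [u8; 4] = file_data.get(chunk_idx + 4..chunk_idx + 8)?.try_into().ok()?;
	// The length field counts only the data, so skip length + type + data + CRC
	Some((chunk_idx + 12 + data_len, chunk_type))
}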
5 changes: 4 additions & 1 deletion libsearchlight/src/validation/zip.rs
@@ -469,13 +469,16 @@ impl FileValidator for ZipValidator {
 					cd.push(record);
 				} else {
 					warn!("ZIP: Central directory file header signature incorrect, skipping entry. This is likely a sign of corruption or fragmentation (central directory at {:#0x})", central_directory_idx);
+					i += 1;
 				}
 			}
 
 			cd
 		};
 
-		let zip_header_matches: Vec<&Match> = all_matches.iter().filter(|m| m.id == ZIP_LOCAL_FILE_HEADER_SIG_ID).collect();
+		let zip_header_matches: Vec<&Match> = all_matches.iter().filter(|m| {
+			m.id == ZIP_LOCAL_FILE_HEADER_SIG_ID && (m.start_idx as usize) < central_directory_idx
+		}).collect();
 
 		let local_file_headers = {
 			let mut lfhs = Vec::new();
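The skipped-entry branch above fires when the four bytes at the cursor are not the central directory file header signature, "PK\x01\x02"; the added i += 1 advances the cursor past such a corrupt entry (without it the scan would presumably stall on the same bad signature). A sketch of that signature check (helper name mine; the constant is the standard ZIP value, stored little-endian):

/// "PK\x01\x02" read as a little-endian u32
const CDFH_SIGNATURE: u32 = 0x02014b50;

fn is_cdfh_at(file_data: &[u8], idx: usize) -> bool {
	file_data
		.get(idx..idx + 4)
		.map(|bytes| u32::from_le_bytes(bytes.try_into().unwrap()) == CDFH_SIGNATURE)
		.unwrap_or(false)
}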
