Added a JPEG scan/entropy-coded data classifier with Shannon entropy calculation, and used that to reconstruct fragmented JPEG scan chunks. Also added a modified copy of Stoneblock.toml
Will-Banksy committed Apr 8, 2024
1 parent 4f99edf commit 2d72784
Showing 8 changed files with 231 additions and 40 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -47,7 +47,7 @@ See [Benchmarking.md](Benchmarking.md) for benchmarks & performance notes.

## Test Data

In the test_data/corpus directory are some sample files for testing the tool with. My test image generation tool [stoneblock](https://github.com/Will-Banksy/stoneblock) currently contains a config file that uses these test files to build a test image (this may be removed or changed in the future however).
In the test_data/corpus directory are some sample files for testing the tool, and there is a config file [Stoneblock.toml](Stoneblock.toml) for use with my test image generation tool [stoneblock](https://github.com/Will-Banksy/stoneblock) that uses these test files to build a test image.

3.png, 7.zip, 9.png, and g6-1.jpg are from or derived from data provided by Digital Corpora (Garfinkel *et al*, 2009), in particular the disk image "[nps-2009-canon2](https://corp.digitalcorpora.org/corpora/drives/nps-2009-canon2)".

4 changes: 2 additions & 2 deletions Searchlight.toml
@@ -3,15 +3,15 @@ max_reconstruction_search_len = 0xfffffffffffffff
[[file_type]]
extension = "jpg"
headers = [ '\xff\xd8\xff\xe0', '\xff\xd8\xff\xe1' ]
footers = [ '\xff\xd9' ]
# footers = [ '\xff\xd9' ]
max_len = 10485760
pairing = "next"
type_id = "jpeg"

[[file_type]]
extension = "png"
headers = [ '\x89\x50\x4e\x47\x0d\x0a\x1a\x0a' ]
footer = [ '\x49\x45\x4e\x44\xae\x42\x60\x82' ]
footers = [ '\x49\x45\x4e\x44\xae\x42\x60\x82' ]
max_len = 10485760
pairing = "next"
type_id = "png"
31 changes: 31 additions & 0 deletions Stoneblock.toml
@@ -0,0 +1,31 @@
block_size = 4096
corpus = "../searchlight/test_data/corpus"

[scenarios.nonfragmented]
path = "../searchlight/test_data/stoneblock_nonfragmented.dat"
files = [
{ path = "3.png", fragments = 1 },
{ path = "7.zip", fragments = 1 },
{ path = "9.png", fragments = 1 },
{ path = "files.jpg", fragments = 1 },
{ path = "g6-1.jpg", fragments = 1 }
]
filler = "R"
layout = "Z, 1.1, 2.1, 2.1, 3.1, Z, Z, 1.1, 4.1, R, 5.1"

[scenarios.png_fragmented]
path = "../searchlight/test_data/stoneblock_fragmented.dat"
files = [
{ path = "3.png", fragments = 3 },
{ path = "9.png", fragments = 2 }
]
filler = "Z"
layout = "1.1, R, 1.2, R, 2.1, 1.3, 2.2"

[scenarios.jpeg_fragmented]
path = "../searchlight/test_data/stoneblock_fragmented.dat"
files = [
{ path = "files.jpg", fragments = 3 }
]
filler = "Z"
layout = "1.1, R, 1.2, Z, 1.3, R, 1.1, 1.3"
3 changes: 3 additions & 0 deletions libsearchlight/src/classifiers.rs
@@ -0,0 +1,3 @@
mod jpeg_data;

pub use jpeg_data::jpeg_data;
90 changes: 90 additions & 0 deletions libsearchlight/src/classifiers/jpeg_data.rs
@@ -0,0 +1,90 @@
const ENTROPY_THRESHOLD: f32 = 0.6;
const FF00_THRESHOLD: u32 = 1;

/// Calculate the Shannon entropy of a slice
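/// (H = -sum(p(x) * log2(p(x))) over byte values x, where p(x) is the relative frequency of x in the slice)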
fn shannon_entropy(data: &[u8]) -> f32 {
// The entropy of an empty slice is defined here as 0
if data.is_empty() {
return 0.0;
}

// Count the values
let mut counts = [0u32; 256];
for &byte in data {
counts[byte as usize] += 1;
}

// And calculate the entropy
let mut entropy = 0.0;
for count in counts {
if count != 0 {
let probability = (count as f32) / (data.len() as f32);
entropy -= probability * probability.log2();
}
}

entropy
}

/// Attempts to classify a cluster of file data as JPEG scan data or not, by calculating the Shannon entropy
/// and comparing it to a threshold (currently 0.6), and by doing some analysis on the bytes to check
/// whether 0xff's are followed by valid bytes in a JPEG-compressed datastream, also checking that if RST
/// markers are present that they are correctly ordered. Also counts the number of 0xff00's, and compares
/// that to a threshold.
///
/// Returns a tuple (`is_jpeg_data`, `likely_end`), where the first element contains whether the cluster
/// is likely JPEG scan data, and the second contains the index of the likely end of the JPEG scan data
/// (if it is likely scan data), i.e. the first 0xff that is not followed by 0xd0..=0xd7 or 0x00
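///
/// For example, a cluster of all zeroes has zero entropy and no 0xff00 sequences, so it is rejected:
/// ```
/// assert_eq!(libsearchlight::classifiers::jpeg_data(&[0u8; 4096]), (false, None));
/// ```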
pub fn jpeg_data(cluster: &[u8]) -> (bool, Option<usize>) {
// PERF: Could optimise this by both calculating the entropy and doing the analysis in one pass. Perhaps move the count
// calculations out of the shannon_entropy fn
let entropy = shannon_entropy(cluster);

let mut count_ff00 = 0;
let mut first_ffxx = None; // Contains the first instance of a byte sequence that is invalid in a JPEG scan or terminates a JPEG scan, if one has been encountered
let mut curr_rst_marker = None;
let mut rst_marker_ordering_valid = true; // RST markers have to be encountered in sequence
for i in 0..(cluster.len() - 1) {
if cluster[i] == 0xff {
match cluster[i + 1] {
0x00 => {
if first_ffxx.is_none() { // If we've encountered an invalid sequence or terminator, don't increment ff00 counts
count_ff00 += 1;
}
}
val @ 0xd0..=0xd7 => {
if first_ffxx.is_some() { // RST markers shouldn't appear after the scan data has already terminated, so treat this ordering as invalid
rst_marker_ordering_valid = false;
}

if let Some(curr_rst) = curr_rst_marker {
if val == curr_rst + 1 {
curr_rst_marker = Some(val);
} else {
rst_marker_ordering_valid = false;
}
} else {
curr_rst_marker = Some(val);
}
}
_ => {
first_ffxx = Some(i);
}
}
}
}

let entropy_valid = entropy > ENTROPY_THRESHOLD;
let contents_valid = count_ff00 >= FF00_THRESHOLD && rst_marker_ordering_valid;

let is_likely_jpeg = entropy_valid && contents_valid;

(
is_likely_jpeg,
if is_likely_jpeg {
first_ffxx
} else {
None
}
)
}
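
For a sense of how this classifier behaves in isolation, here is a minimal, hypothetical sketch that runs it over every cluster of a disk image, assuming the fragmented test image and 4096-byte block size from the Stoneblock.toml above:

use libsearchlight::classifiers;

fn main() {
    // Assumptions for illustration: the test image built by Stoneblock.toml and its block size
    let image = std::fs::read("test_data/stoneblock_fragmented.dat").expect("failed to read image");
    let cluster_size = 4096;

    for (n, cluster) in image.chunks_exact(cluster_size).enumerate() {
        match classifiers::jpeg_data(cluster) {
            (true, Some(end)) => println!("Cluster {n}: JPEG scan data, likely end at offset {end:#x}"),
            (true, None) => println!("Cluster {n}: JPEG scan data"),
            (false, _) => {}
        }
    }
}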
1 change: 1 addition & 0 deletions libsearchlight/src/lib.rs
@@ -5,6 +5,7 @@ pub mod error;
pub mod utils;
pub mod searchlight;
pub mod validation;
pub mod classifiers;

#[cfg(not(target_pointer_width = "64"))]
compile_error!("Target architecture is not 64-bit - This software is only supported on 64-bit platforms");
138 changes: 102 additions & 36 deletions libsearchlight/src/validation/jpeg.rs
@@ -1,6 +1,8 @@
use crate::{search::pairing::MatchPair, searchlight::config::SearchlightConfig};
use log::warn;

use super::{FileValidationInfo, FileValidationType, FileValidator};
use crate::{classifiers, search::pairing::MatchPair, searchlight::config::SearchlightConfig, utils};

use super::{FileValidationInfo, FileValidationType, FileValidator, Fragment};

// const JPEG_SOI: u8 = 0xd8;
const JPEG_EOI: u8 = 0xd9;
@@ -12,65 +14,130 @@ const JPEG_SOS = 0xda;

pub struct JpegValidator;

enum JpegScanReconstructionInfo {
Success {
chunk_frags: Vec<Fragment>,
next_chunk_idx: usize
},
Failure {
failure_idx: usize
}
}

impl JpegValidator {
pub fn new() -> Self {
JpegValidator
}

/// Attempt to reconstruct JPEG scan data, assuming that all fragments are in-order, by looping through clusters and attempting to classify them
/// as either JPEG scan data or not
fn reconstruct_scan_data(file_data: &[u8], scan_marker_idx: usize, cluster_size: usize, config: &SearchlightConfig) -> JpegScanReconstructionInfo {
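// Fragmentation can only occur on cluster boundaries, so the data from the SOS marker up to the next cluster boundary is assumed contiguous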
let fragmentation_start = utils::next_multiple_of(scan_marker_idx as u64 + 1, cluster_size as u64) as usize;

let mut fragments = vec![
(scan_marker_idx as u64)..(fragmentation_start as u64)
];

let mut cluster_idx = fragmentation_start;

loop {
// Check we're in bounds of the reconstruction search length and file
let search_offset = (cluster_idx + cluster_size) - scan_marker_idx;
if search_offset > config.max_reconstruction_search_len.unwrap_or(u64::MAX) as usize || (cluster_idx + cluster_size) > file_data.len() {
return JpegScanReconstructionInfo::Failure {
failure_idx: cluster_idx
}
}

let cluster = &file_data[cluster_idx..(cluster_idx + cluster_size)];

let classification_info = classifiers::jpeg_data(cluster);

match classification_info {
(false, None) => {
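// Not JPEG scan data - record no fragment for this cluster, but keep scanning subsequent clusters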
warn!("Cluster at idx {cluster_idx:#x} was classified as NOT JPEG");
}
(true, None) => {
fragments.push((cluster_idx as u64)..((cluster_idx + cluster_size) as u64));
}
(true, Some(next_marker)) => {
warn!("Cluster at idx {cluster_idx:#x} was classified as YES JPEG and END at {:#x}", next_marker + cluster_idx);
fragments.push((cluster_idx as u64)..(next_marker + cluster_idx) as u64);
utils::simplify_ranges(&mut fragments);

return JpegScanReconstructionInfo::Success {
chunk_frags: fragments,
next_chunk_idx: next_marker + cluster_idx
}
}
_ => unreachable!()
}

cluster_idx += cluster_size;
}
}
}

impl FileValidator for JpegValidator {
// Written using https://www.w3.org/Graphics/JPEG/jfif3.pdf,
// https://www.w3.org/Graphics/JPEG/itu-t81.pdf and https://stackoverflow.com/questions/32873541/scanning-a-jpeg-file-for-markers
fn validate(&self, file_data: &[u8], file_match: &MatchPair, _cluster_size: u64, _config: &SearchlightConfig) -> FileValidationInfo {
fn validate(&self, file_data: &[u8], file_match: &MatchPair, cluster_size: u64, config: &SearchlightConfig) -> FileValidationInfo {
let start = file_match.start_idx as usize;
let end = file_match.end_idx as usize;

// Mandatory segments for a complete JPEG file
let mut seen_appn = false; // Whether an APP0 or APP1 segment has been found
let mut seen_sofn = false; // Whether a SOF0 or SOF2 segment has been found

let mut fragments = Vec::new();

let mut i = start;
'outer: loop {
loop {
// Check if we are on a marker - the current byte should be 0xff and the next byte should not be 0x00
if file_data[i] == 0xff && file_data[i + 1] != 0x00 {
// The SOI and EOI markers don't have lengths after them - I did see someone saying that the whole range 0xd0 to 0xd9 has no lengths
// (https://stackoverflow.com/questions/4585527/detect-end-of-file-for-jpg-images) but I can't find anything in any documentation to back
// that up. Then again I can't see anything in any documentation to say that segments necessarily have lengths
if (file_data[i + 1] ^ 0xd0 < 0x09) || file_data[i + 1] == 0x01 {
// Move on to the next segment
fragments.push(i as u64..(i as u64 + 2));
utils::simplify_ranges(&mut fragments);
i += 2;
continue;
} else if file_data[i + 1] == JPEG_EOI {
fragments.push(i as u64..(i as u64 + 2 + cluster_size)); // NOTE: We're carving an extra cluster here which isn't necessary for the image but often metadata is stored past EOI so this will catch (some of) that
utils::simplify_ranges(&mut fragments);

// Return that this is a complete file with length start - i
// If any of APPn and SOFn segments haven't been seen though return Format Error
break FileValidationInfo {
validation_type: if seen_appn && seen_sofn { FileValidationType::Correct } else { FileValidationType::FormatError },
fragments: vec![ (file_match.start_idx..(i as u64 + 2)) ]
// file_len: Some((i - start) as u64 + 2),
// file_offset: None
fragments
}
} else if file_data[i + 1] == JPEG_SOS {
// Unhelpfully, the SOS segment's length field only covers the scan header and not the entropy-coded data that
// follows it. However, the entropy-coded data escapes any 0xff with a following 0x00, so the next marker can be
// found by scanning for a 0xff that isn't followed by 0x00
let scan_end = if let Some(max_len) = file_match.file_type.max_len {
(start + max_len as usize).min(file_data.len() - 1)
} else {
file_data.len() - 1
};
warn!("Attempting JPEG reconstruction...");

for j in (i + 2)..scan_end {
// Need to skip 0xff00, 0xff01, 0xffd[0-8], according to this stackoverflow answer (https://stackoverflow.com/questions/4585527/detect-end-of-file-for-jpg-images)
// I haven't seen anything in the docs I've looked at to confirm this, but testing on images does seem to indicate that this is the correct approach
if file_data[j] == 0xff && file_data[j + 1] != 0x00 && file_data[j + 1] != 0x01 && (file_data[j + 1] ^ 0xd0 > 0x08) {
i = j;
continue 'outer;
}
}
// Since we have no way of knowing whether the scan data is contiguous, we treat it as potentially fragmented
let recons_info = Self::reconstruct_scan_data(file_data, i, cluster_size as usize, config);

break FileValidationInfo {
validation_type: FileValidationType::Corrupt,
..Default::default()
match recons_info {
JpegScanReconstructionInfo::Success { mut chunk_frags, next_chunk_idx } => {
warn!("JPEG reconstruction success! Next chunk idx: {next_chunk_idx:#x}");
fragments.append(&mut chunk_frags);
i = next_chunk_idx;
},
JpegScanReconstructionInfo::Failure { failure_idx } => {
fragments.push(i as u64..failure_idx as u64);

warn!("JPEG reconstruction failure");

break FileValidationInfo {
validation_type: FileValidationType::Partial,
fragments
}
}
}
} else {
if file_data[i + 1] == JPEG_APP0 || file_data[i + 1] == JPEG_APP1 {
@@ -80,26 +147,25 @@ impl FileValidator for JpegValidator {
}
// Parse the length and skip the segment
let segment_len = u16::from_be_bytes(file_data[(i + 2)..=(i + 3)].try_into().unwrap());

fragments.push(i as u64..(i as u64 + segment_len as u64 + 2));
utils::simplify_ranges(&mut fragments);

i += segment_len as usize + 2;
continue;
}
} else { // We are not on a marker, but we should be - something has gone wrong, though exactly what is difficult to determine
// If at least one of the mandatory markers has been seen, this is likely a partial file, and we can return i, which will be where we got up to in decoding
// But we'll only return i if that would take us beyond where the carver found the footer because, relying on sensible maximum file sizes, we want to carve as much data
// as possible
// If at least one of the mandatory markers has been seen, this is likely a partial file
warn!("JPEG ended up not on a marker...");
if seen_appn || seen_sofn {
break FileValidationInfo {
validation_type: FileValidationType::Partial,
fragments: if i > end {
vec![ file_match.start_idx..(i as u64) ]
} else {
Vec::new()
}
fragments
};
} else {
break FileValidationInfo {
validation_type: FileValidationType::Unrecognised,
..Default::default()
fragments
}
}
}
2 changes: 1 addition & 1 deletion searchlight/src/args.rs
@@ -5,7 +5,7 @@ use clap_verbosity_flag::InfoLevel;

#[derive(Debug, Parser)]
#[command(author, version, about, long_about = None)]
pub struct Args {
pub struct Args { // TODO: Add a "quick search" option to only look for headers at the start of clusters... but still need to find footers...
#[command(flatten)]
pub verbose: clap_verbosity_flag::Verbosity<InfoLevel>,
/// Path to the input disk image file to attempt to recover data from
