Added a JPEG scan/entropy-coded data classifier with Shannon entropy calculation, and used that to reconstruct fragmented JPEG scan chunks. Also added a modified copy of Stoneblock.toml
Will-Banksy committed Apr 8, 2024
1 parent 4f99edf commit 2d72784
Showing 8 changed files with 231 additions and 40 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -47,7 +47,7 @@ See [Benchmarking.md](Benchmarking.md) for benchmarks & performance notes.

## Test Data

In the test_data/corpus directory are some sample files for testing the tool with. My test image generation tool [stoneblock](https://github.com/Will-Banksy/stoneblock) currently contains a config file that uses these test files to build a test image (this may be removed or changed in the future however).
In the test_data/corpus directory are some sample files for testing the tool, and there is a config file [Stoneblock.toml](Stoneblock.toml) for use with my test image generation tool [stoneblock](https://github.com/Will-Banksy/stoneblock) that uses these test files to build a test image.

3.png, 7.zip, 9.png, and g6-1.jpg are from or derived from data provided by Digital Corpora (Garfinkel *et al*, 2009), in particular the disk image "[nps-2009-canon2](https://corp.digitalcorpora.org/corpora/drives/nps-2009-canon2)".

4 changes: 2 additions & 2 deletions Searchlight.toml
@@ -3,15 +3,15 @@ max_reconstruction_search_len = 0xfffffffffffffff
[[file_type]]
extension = "jpg"
headers = [ '\xff\xd8\xff\xe0', '\xff\xd8\xff\xe1' ]
footers = [ '\xff\xd9' ]
# footers = [ '\xff\xd9' ]
max_len = 10485760
pairing = "next"
type_id = "jpeg"

[[file_type]]
extension = "png"
headers = [ '\x89\x50\x4e\x47\x0d\x0a\x1a\x0a' ]
footer = [ '\x49\x45\x4e\x44\xae\x42\x60\x82' ]
footers = [ '\x49\x45\x4e\x44\xae\x42\x60\x82' ]
max_len = 10485760
pairing = "next"
type_id = "png"
31 changes: 31 additions & 0 deletions Stoneblock.toml
@@ -0,0 +1,31 @@
block_size = 4096
corpus = "../searchlight/test_data/corpus"

[scenarios.nonfragmented]
path = "../searchlight/test_data/stoneblock_nonfragmented.dat"
files = [
{ path = "3.png", fragments = 1 },
{ path = "7.zip", fragments = 1 },
{ path = "9.png", fragments = 1 },
{ path = "files.jpg", fragments = 1 },
{ path = "g6-1.jpg", fragments = 1 }
]
filler = "R"
layout = "Z, 1.1, 2.1, 2.1, 3.1, Z, Z, 1.1, 4.1, R, 5.1"

[scenarios.png_fragmented]
path = "../searchlight/test_data/stoneblock_fragmented.dat"
files = [
{ path = "3.png", fragments = 3 },
{ path = "9.png", fragments = 2 }
]
filler = "Z"
layout = "1.1, R, 1.2, R, 2.1, 1.3, 2.2"

[scenarios.jpeg_fragmented]
path = "../searchlight/test_data/stoneblock_fragmented.dat"
files = [
{ path = "files.jpg", fragments = 3 }
]
filler = "Z"
layout = "1.1, R, 1.2, Z, 1.3, R, 1.1, 1.3"
3 changes: 3 additions & 0 deletions libsearchlight/src/classifiers.rs
@@ -0,0 +1,3 @@
mod jpeg_data;

pub use jpeg_data::jpeg_data;
90 changes: 90 additions & 0 deletions libsearchlight/src/classifiers/jpeg_data.rs
@@ -0,0 +1,90 @@
const ENTROPY_THRESHOLD: f32 = 0.6;
const FF00_THRESHOLD: u32 = 1;

/// Calculate the Shannon entropy of a slice
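/// (H = -sum(p(x) * log2(p(x))) over byte values x, where p(x) is the relative frequency of x in the slice)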
fn shannon_entropy(data: &[u8]) -> f32 {
// The entropy of an empty slice is defined here as 0
if data.is_empty() {
return 0.0;
}

// Count the values
let mut counts = [0u32; 256];
for &byte in data {
counts[byte as usize] += 1;
}

// And calculate the entropy
let mut entropy = 0.0;
for count in counts {
if count != 0 {
let probability = (count as f32) / (data.len() as f32);
entropy -= probability * probability.log2();
}
}

entropy
}

/// Attempts to classify a cluster of file data as JPEG scan data or not, by calculating the Shannon entropy
/// and comparing it to a threshold (currently 0.6), and by doing some analysis on the bytes to check
/// whether 0xff's are followed by valid bytes in a JPEG-compressed datastream, also checking that if RST
/// markers are present that they are correctly ordered. Also counts the number of 0xff00's, and compares
/// that to a threshold.
///
/// Returns a tuple (`is_jpeg_data`, `likely_end`), where the first element contains whether the cluster
/// is likely JPEG scan data, and the second contains the index of the likely end of the JPEG scan data
/// (if it is likely scan data), i.e. the first 0xff that is not followed by 0xd0..=0xd7 or 0x00
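///
/// For example, a cluster of all zeroes has zero entropy and no 0xff00 sequences, so it is rejected:
/// ```
/// assert_eq!(libsearchlight::classifiers::jpeg_data(&[0u8; 4096]), (false, None));
/// ```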
pub fn jpeg_data(cluster: &[u8]) -> (bool, Option<usize>) {
// PERF: Could optimise this by both calculating the entropy and doing the analysis in one pass. Perhaps move the count
// calculations out of the shannon_entropy fn
let entropy = shannon_entropy(cluster);

let mut count_ff00 = 0;
let mut first_ffxx = None; // Contains the first instance of a byte sequence that is invalid in a JPEG scan or terminates a JPEG scan, if one has been encountered
let mut curr_rst_marker = None;
let mut rst_marker_ordering_valid = true; // RST markers have to be encountered in sequence
for i in 0..(cluster.len() - 1) {
if cluster[i] == 0xff {
match cluster[i + 1] {
0x00 => {
if first_ffxx.is_none() { // If we've encountered an invalid sequence or terminator, don't increment ff00 counts
count_ff00 += 1;
}
}
val @ 0xd0..=0xd7 => {
if first_ffxx.is_some() { // RST markers shouldn't appear after the scan data has already terminated, so treat this ordering as invalid
rst_marker_ordering_valid = false;
}

if let Some(curr_rst) = curr_rst_marker {
if val == curr_rst + 1 {
curr_rst_marker = Some(val);
} else {
rst_marker_ordering_valid = false;
}
} else {
curr_rst_marker = Some(val);
}
}
_ => {
first_ffxx = Some(i);
}
}
}
}

let entropy_valid = entropy > ENTROPY_THRESHOLD;
let contents_valid = count_ff00 >= FF00_THRESHOLD && rst_marker_ordering_valid;

let is_likely_jpeg = entropy_valid && contents_valid;

(
is_likely_jpeg,
if is_likely_jpeg {
first_ffxx
} else {
None
}
)
}
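
For a sense of how this classifier behaves in isolation, here is a minimal, hypothetical sketch that runs it over every cluster of a disk image, assuming the fragmented test image and 4096-byte block size from the Stoneblock.toml above:

use libsearchlight::classifiers;

fn main() {
    // Assumptions for illustration: the test image built by Stoneblock.toml and its block size
    let image = std::fs::read("test_data/stoneblock_fragmented.dat").expect("failed to read image");
    let cluster_size = 4096;

    for (n, cluster) in image.chunks_exact(cluster_size).enumerate() {
        match classifiers::jpeg_data(cluster) {
            (true, Some(end)) => println!("Cluster {n}: JPEG scan data, likely end at offset {end:#x}"),
            (true, None) => println!("Cluster {n}: JPEG scan data"),
            (false, _) => {}
        }
    }
}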
1 change: 1 addition & 0 deletions libsearchlight/src/lib.rs
@@ -5,6 +5,7 @@ pub mod error;
pub mod utils;
pub mod searchlight;
pub mod validation;
pub mod classifiers;

#[cfg(not(target_pointer_width = "64"))]
compile_error!("Target architecture is not 64-bit - This software is only supported on 64-bit platforms");
138 changes: 102 additions & 36 deletions libsearchlight/src/validation/jpeg.rs
@@ -1,6 +1,8 @@
use crate::{search::pairing::MatchPair, searchlight::config::SearchlightConfig};
use log::warn;

use super::{FileValidationInfo, FileValidationType, FileValidator};
use crate::{classifiers, search::pairing::MatchPair, searchlight::config::SearchlightConfig, utils};

use super::{FileValidationInfo, FileValidationType, FileValidator, Fragment};

// const JPEG_SOI: u8 = 0xd8;
const JPEG_EOI: u8 = 0xd9;
@@ -12,65 +14,130 @@ const JPEG_SOS = 0xda;

pub struct JpegValidator;

enum JpegScanReconstructionInfo {
Success {
chunk_frags: Vec<Fragment>,
next_chunk_idx: usize
},
Failure {
failure_idx: usize
}
}

impl JpegValidator {
pub fn new() -> Self {
JpegValidator
}

/// Attempt to reconstruct JPEG scan data, assuming that all fragments are in-order, by looping through clusters and attempting to classify them
/// as either JPEG scan data or not
fn reconstruct_scan_data(file_data: &[u8], scan_marker_idx: usize, cluster_size: usize, config: &SearchlightConfig) -> JpegScanReconstructionInfo {
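// Fragmentation can only occur on cluster boundaries, so the data from the SOS marker up to the next cluster boundary is assumed contiguous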
let fragmentation_start = utils::next_multiple_of(scan_marker_idx as u64 + 1, cluster_size as u64) as usize;

let mut fragments = vec![
(scan_marker_idx as u64)..(fragmentation_start as u64)
];

let mut cluster_idx = fragmentation_start;

loop {
// Check we're in bounds of the reconstruction search length and file
let search_offset = (cluster_idx + cluster_size) - scan_marker_idx;
if search_offset > config.max_reconstruction_search_len.unwrap_or(u64::MAX) as usize || (cluster_idx + cluster_size) > file_data.len() {
return JpegScanReconstructionInfo::Failure {
failure_idx: cluster_idx
}
}

let cluster = &file_data[cluster_idx..(cluster_idx + cluster_size)];

let classification_info = classifiers::jpeg_data(cluster);

match classification_info {
(false, None) => {
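// Not JPEG scan data - record no fragment for this cluster, but keep scanning subsequent clusters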
warn!("Cluster at idx {cluster_idx:#x} was classified as NOT JPEG");
}
(true, None) => {
fragments.push((cluster_idx as u64)..((cluster_idx + cluster_size) as u64));
}
(true, Some(next_marker)) => {
warn!("Cluster at idx {cluster_idx:#x} was classified as YES JPEG and END at {:#x}", next_marker + cluster_idx);
fragments.push((cluster_idx as u64)..(next_marker + cluster_idx) as u64);
utils::simplify_ranges(&mut fragments);

return JpegScanReconstructionInfo::Success {
chunk_frags: fragments,
next_chunk_idx: next_marker + cluster_idx
}
}
_ => unreachable!()
}

cluster_idx += cluster_size;
}
}
}

impl FileValidator for JpegValidator {
// Written using https://www.w3.org/Graphics/JPEG/jfif3.pdf,
// https://www.w3.org/Graphics/JPEG/itu-t81.pdf and https://stackoverflow.com/questions/32873541/scanning-a-jpeg-file-for-markers
fn validate(&self, file_data: &[u8], file_match: &MatchPair, _cluster_size: u64, _config: &SearchlightConfig) -> FileValidationInfo {
fn validate(&self, file_data: &[u8], file_match: &MatchPair, cluster_size: u64, config: &SearchlightConfig) -> FileValidationInfo {
let start = file_match.start_idx as usize;
let end = file_match.end_idx as usize;

// Mandatory segments for a complete JPEG file
let mut seen_appn = false; // Whether an APP0 or APP1 segment has been found
let mut seen_sofn = false; // Whether a SOF0 or SOF2 segment has been found

let mut fragments = Vec::new();

let mut i = start;
'outer: loop {
loop {
// Check if we are on a marker - the current byte should be 0xff and the next byte should not be 0x00
if file_data[i] == 0xff && file_data[i + 1] != 0x00 {
// The SOI and EOI markers don't have lengths after them - I did see someone saying that the whole range 0xd0 to 0xd9 has no lengths
// (https://stackoverflow.com/questions/4585527/detect-end-of-file-for-jpg-images) but I can't find anything in any documentation to back
// that up. Then again I can't see anything in any documentation to say that segments necessarily have lengths
if (file_data[i + 1] ^ 0xd0 < 0x09) || file_data[i + 1] == 0x01 {
// Move on to the next segment
fragments.push(i as u64..(i as u64 + 2));
utils::simplify_ranges(&mut fragments);
i += 2;
continue;
} else if file_data[i + 1] == JPEG_EOI {
fragments.push(i as u64..(i as u64 + 2 + cluster_size)); // NOTE: We're carving an extra cluster here which isn't necessary for the image but often metadata is stored past EOI so this will catch (some of) that
utils::simplify_ranges(&mut fragments);

// Return that this is a complete file with length start - i
// If any of APPn and SOFn segments haven't been seen though return Format Error
break FileValidationInfo {
validation_type: if seen_appn && seen_sofn { FileValidationType::Correct } else { FileValidationType::FormatError },
fragments: vec![ (file_match.start_idx..(i as u64 + 2)) ]
// file_len: Some((i - start) as u64 + 2),
// file_offset: None
fragments
}
} else if file_data[i + 1] == JPEG_SOS {
// Unhelpfully, the SOS segment's length field only covers the scan header and not the entropy-coded data that
// follows it. However, the entropy-coded data escapes any 0xff with a following 0x00, so the next marker can be
// found by scanning for a 0xff that isn't followed by 0x00
let scan_end = if let Some(max_len) = file_match.file_type.max_len {
(start + max_len as usize).min(file_data.len() - 1)
} else {
file_data.len() - 1
};
warn!("Attempting JPEG reconstruction...");

for j in (i + 2)..scan_end {
// Need to skip 0xff00, 0xff01, 0xffd[0-8], according to this stackoverflow answer (https://stackoverflow.com/questions/4585527/detect-end-of-file-for-jpg-images)
// I haven't seen anything in the docs I've looked at to confirm this, but testing on images does seem to indicate that this is the correct approach
if file_data[j] == 0xff && file_data[j + 1] != 0x00 && file_data[j + 1] != 0x01 && (file_data[j + 1] ^ 0xd0 > 0x08) {
i = j;
continue 'outer;
}
}
// Since we have no way of knowing whether the scan data is contiguous, we treat it as potentially fragmented
let recons_info = Self::reconstruct_scan_data(file_data, i, cluster_size as usize, config);

break FileValidationInfo {
validation_type: FileValidationType::Corrupt,
..Default::default()
match recons_info {
JpegScanReconstructionInfo::Success { mut chunk_frags, next_chunk_idx } => {
warn!("JPEG reconstruction success! Next chunk idx: {next_chunk_idx:#x}");
fragments.append(&mut chunk_frags);
i = next_chunk_idx;
},
JpegScanReconstructionInfo::Failure { failure_idx } => {
fragments.push(i as u64..failure_idx as u64);

warn!("JPEG reconstruction failure");

break FileValidationInfo {
validation_type: FileValidationType::Partial,
fragments
}
}
}
} else {
if file_data[i + 1] == JPEG_APP0 || file_data[i + 1] == JPEG_APP1 {
@@ -80,26 +147,25 @@ impl FileValidator for JpegValidator {
}
// Parse the length and skip the segment
let segment_len = u16::from_be_bytes(file_data[(i + 2)..=(i + 3)].try_into().unwrap());

fragments.push(i as u64..(i as u64 + segment_len as u64 + 2));
utils::simplify_ranges(&mut fragments);

i += segment_len as usize + 2;
continue;
}
} else { // We are not on a marker, but we should be - something has gone wrong, though exactly what is difficult to determine
// If at least one of the mandatory markers has been seen, this is likely a partial file, and we can return i, which will be where we got up to in decoding
// But we'll only return i if that would take us beyond where the carver found the footer because, relying on sensible maximum file sizes, we want to carve as much data
// as possible
// If at least one of the mandatory markers has been seen, this is likely a partial file
warn!("JPEG ended up not on a marker...");
if seen_appn || seen_sofn {
break FileValidationInfo {
validation_type: FileValidationType::Partial,
fragments: if i > end {
vec![ file_match.start_idx..(i as u64) ]
} else {
Vec::new()
}
fragments
};
} else {
break FileValidationInfo {
validation_type: FileValidationType::Unrecognised,
..Default::default()
fragments
}
}
}
2 changes: 1 addition & 1 deletion searchlight/src/args.rs
@@ -5,7 +5,7 @@ use clap_verbosity_flag::InfoLevel;

#[derive(Debug, Parser)]
#[command(author, version, about, long_about = None)]
pub struct Args {
pub struct Args { // TODO: Add a "quick search" option to only look for headers at the start of clusters... but still need to find footers...
#[command(flatten)]
pub verbose: clap_verbosity_flag::Verbosity<InfoLevel>,
/// Path to the input disk image file to attempt to recover data from
