Commit
Bugfix and improvement in ZIP validator, some optimisations and notes
Will-Banksy committed May 6, 2024
1 parent 16c3a49 commit 8a7d131
Showing 8 changed files with 42 additions and 22 deletions.
11 changes: 11 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion libsearchlight/Cargo.toml
@@ -19,7 +19,7 @@ log = "0.4.20"
 crc32fast = "1.3.2"
 unicode-segmentation = "1.11.0"
 strum = { version = "0.26.1", features = [ "derive" ] }
-flate2 = "1.0.28" # Need for decompressing deflate-compressed ZIP file data
+flate2 = { version = "1.0.28", features = [ "zlib-ng" ] } # Need for decompressing deflate-compressed ZIP file data
 serde_json = "1.0.115"
 
 [dev-dependencies]
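The zlib-ng feature only swaps flate2's backend for the faster zlib-ng implementation; the decompression API is unchanged. As a minimal sketch of the usage the Cargo.toml comment refers to (the function name and input are hypothetical, not from this codebase):

use std::io::Read;
use flate2::read::DeflateDecoder;

/// Inflate a raw deflate stream, as stored in a ZIP local file entry
/// with compression method 8 (deflate)
fn inflate_zip_entry(compressed: &[u8]) -> std::io::Result<Vec<u8>> {
	let mut decoder = DeflateDecoder::new(compressed);
	let mut decompressed = Vec::new();
	decoder.read_to_end(&mut decompressed)?;
	Ok(decompressed)
}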
25 changes: 12 additions & 13 deletions libsearchlight/src/classifiers/jpeg_data.rs
@@ -3,23 +3,17 @@ const FF00_THRESHOLD: u32 = 0; // Larger values seem to cause problems, especial
 const FF00_CERTAINTY_THRESHOLD: u32 = 4;
 
 /// Calculate the Shannon entropy of a slice
-fn shannon_entropy(data: &[u8]) -> f32 {
+fn shannon_entropy(counts: &[u32], data_len: usize) -> f32 {
 	// Can't calculate the entropy without data so return 0. Would panic otherwise
-	if data.len() == 0 {
+	if data_len == 0 {
 		return 0.0;
 	}
 
-	// Count the values
-	let mut counts = [0u32; 256];
-	for &byte in data {
-		counts[byte as usize] += 1;
-	}
-
 	// And calculate the entropy
 	let mut entropy = 0.0;
-	for count in counts {
+	for &count in counts {
 		if count != 0 {
-			let probability = (count as f32) / (data.len() as f32);
+			let probability = (count as f32) / (data_len as f32);
 			entropy -= probability * probability.log2();
 		}
 	}
@@ -37,9 +31,8 @@ fn shannon_entropy(data: &[u8]) -> f32 {
 /// is likely JPEG scan data, and the second contains the index of the likely end of the JPEG scan data
 /// (if it is likely scan data), i.e. the first 0xff that is not followed by 0xd0..=0xd7 or 0x00
 pub fn jpeg_data(cluster: &[u8]) -> (bool, Option<usize>) {
-	// PERF: Could optimise this by both calculating the entropy and doing the analysis in one pass. Perhaps move the count
-	// calculations out of the shannon_entropy fn
-	let entropy = shannon_entropy(cluster);
+	// Initialise the counts for each byte
+	let mut counts = [0u32; 256];
 
 	let mut count_ff00 = 0;
 	// Contains the first instance of a byte sequence that is invalid in a JPEG scan or terminates a JPEG scan,
@@ -49,7 +42,11 @@ pub fn jpeg_data(cluster: &[u8]) -> (bool, Option<usize>) {
 	// RST markers have to be encountered in sequence
 	let mut rst_marker_ordering_valid = true;
 	let mut found_invalid_marker = false;
+	let mut bytes_counted = 0;
 	for i in 0..(cluster.len() - 1) {
+		counts[cluster[i] as usize] += 1;
+		bytes_counted += 1;
+
 		if cluster[i] == 0xff {
 			match cluster[i + 1] {
 				0x00 => {
@@ -86,6 +83,8 @@
 		}
 	}
 
+	let entropy = shannon_entropy(&counts, bytes_counted);
+
 	let entropy_valid = entropy > ENTROPY_THRESHOLD;
 	let contents_valid = count_ff00 >= FF00_THRESHOLD && rst_marker_ordering_valid && !found_invalid_marker;
 
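To make the new calling convention concrete, a hypothetical check (not part of the commit): a uniform byte distribution should give the maximum of 8 bits of entropy per byte.

#[test]
fn entropy_of_uniform_distribution_is_eight_bits() {
	// 256 distinct byte values, each occurring 4 times
	let counts = [4u32; 256];
	let entropy = shannon_entropy(&counts, 4 * 256);
	assert!((entropy - 8.0).abs() < 1e-4);
}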
5 changes: 3 additions & 2 deletions libsearchlight/src/searchlight.rs
@@ -274,7 +274,8 @@ impl Searchlight {
 
 		let mut file = File::create(filepath)?;
 
-		// PERF: Writing to lots of files does seem like a perfect use case for io_uring... but windows... and other platforms...
+		// PERF: Writing to lots of files does seem like a perfect use case for io_uring... but windows... and other platforms... Maybe https://crates.io/crates/nuclei ?
+		// At the very least, write_vectored should be more performant than repeated write_all calls, but does not seem to behave properly on windows, and nevertheless doesn't guarantee everything is written
 		// FIXME: write_vectored may not write everything
 		// file.write_vectored(
 		// 	&fragments.iter().map(|frag| IoSlice::new(&mmap[frag.start..frag.end])).collect::<Vec<IoSlice>>()
@@ -291,7 +292,7 @@
 
 			// BUG: If some text is written to stderr or stdout between writes of the progress, then there will be no
 			// line break between the progress report and the output text. Put a space after the progress % to
-			// make that look less bad but I'm not sure if this is fixable, in a compelling way anyway
+			// make that look less bad but I'm not sure if this is fixable, in a compelling way anyway. Well apart from externalising the progress reporting
 			if log_enabled!(Level::Info) {
 				eprint!("\rProgress: {:.2}% ", (num_carved_files as f32 / match_pairs.len() as f32) * 100.0);
 			}
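For what it's worth, the usual fix for that FIXME is to loop until every slice is consumed, which is what the unstable Write::write_all_vectored does internally. A sketch under the assumption of a sufficiently recent toolchain (IoSlice::advance_slices is a relatively new stabilisation); the helper name is mine, not std's:

use std::io::{self, IoSlice, Write};

/// Keep calling write_vectored until all slices have been written,
/// advancing past whatever each call managed to write
fn write_all_slices<W: Write>(writer: &mut W, mut slices: &mut [IoSlice<'_>]) -> io::Result<()> {
	// Drop any leading empty slices so an all-empty input terminates
	IoSlice::advance_slices(&mut slices, 0);
	while !slices.is_empty() {
		match writer.write_vectored(slices) {
			Ok(0) => return Err(io::ErrorKind::WriteZero.into()),
			Ok(n) => IoSlice::advance_slices(&mut slices, n),
			Err(e) if e.kind() == io::ErrorKind::Interrupted => {}
			Err(e) => return Err(e),
		}
	}
	Ok(())
}

On toolchains without advance_slices, a plain write_all per fragment sidesteps the problem at the cost of one syscall per fragment.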
14 changes: 9 additions & 5 deletions libsearchlight/src/utils.rs
@@ -119,12 +119,13 @@ pub fn generate_fragmentations(cluster_size: usize, fragmentation_range: Range<u
 
 	let mut res = Vec::new();
 
+	// PERF: This is still quite hot, any obvious optimisations I can do?
 	while gap_idx <= clusters.len() - gap_len {
 		// Get all the clusters that are not in the gap, and simplify
-		let mut file_clusters: Vec<Range<usize>> = clusters.iter().enumerate().filter(|(i, _)| *i < gap_idx || *i >= (gap_idx + gap_len)).map(|(_, c)| c.clone()).collect();
-		simplify_ranges(&mut file_clusters);
+		res.push(Vec::with_capacity(num_file_clusters));
 
-		res.push(file_clusters);
+		res.last_mut().unwrap().extend(clusters.iter().enumerate().filter(|(i, _)| *i < gap_idx || *i >= (gap_idx + gap_len)).map(|(_, c)| c.clone()));
+		simplify_ranges(res.last_mut().unwrap());
 
 		gap_idx += 1;
 	}
@@ … @@
 
 /// Takes a vec of assumed in-order, non-overlapping ranges, and where the end of a range is equal to the start of the next range, merges the two ranges into one
-pub fn simplify_ranges<T>(ranges: &mut Vec<Range<T>>) where T: PartialEq {
+// PERF: Changed to Vec::swap_remove with a sort after all removes are done instead of Vec::remove - Needs some testing if this actually helps performance
+pub fn simplify_ranges<T>(ranges: &mut Vec<Range<T>>) where T: PartialEq + Ord + Copy {
 	let mut i = 1;
 	while i < ranges.len() {
 		if ranges[i - 1].end == ranges[i].start {
-			ranges[i - 1].end = ranges.remove(i).end;
+			ranges[i - 1].end = ranges.swap_remove(i).end;
+			i -= 1;
 		}
 
 		i += 1;
 	}
 
+	ranges.sort_unstable_by_key(|r| r.start);
 }
 
 #[cfg(test)]
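One observation on the swap_remove variant (mine, not part of the commit): swap_remove moves the last element into the vacated slot, so the in-order assumption the doc comment relies on stops holding mid-loop, and mergeable pairs can be missed. For example, [0..2, 2..4, 4..6, 6..8, 8..10] comes out as [0..4, 4..8, 8..10] after the sort rather than [0..10]. A single-pass, order-preserving alternative that avoids both Vec::remove's shifting and the final sort, as a sketch:

use std::ops::Range;

/// Merge adjacent ranges in place by compacting towards the front,
/// then truncating; preserves order and runs in O(n)
pub fn simplify_ranges<T: PartialEq + Copy>(ranges: &mut Vec<Range<T>>) {
	let mut write = 0;
	for read in 1..ranges.len() {
		if ranges[write].end == ranges[read].start {
			// Extend the range being built instead of keeping both
			ranges[write].end = ranges[read].end;
		} else {
			write += 1;
			ranges[write] = ranges[read].start..ranges[read].end;
		}
	}
	ranges.truncate(write + 1);
}

Note this also drops the Ord bound, since there is no longer anything to re-sort.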
1 change: 1 addition & 0 deletions libsearchlight/src/validation/jpeg.rs
@@ -31,6 +31,7 @@ impl JpegValidator {
 	/// as either JPEG scan data or not
 	// TODO: We could maybe try and tackle out-of-order JPEG fragmentations using the reset marker orderings (if present)
 	// although apparently they are only present in ~12% of JPEGs (Uzun and Sencar, 2020, https://doi.org/10.1109/TIFS.2019.2953382)
+	// TODO: Ali and Mohamad (2021) manage to tackle intertwined JPEGs using the Coherence of Euclidean Distance (CED) to detect sharp changes in the image https://doi.org/10.1016/j.jksuci.2018.12.007
 	fn reconstruct_scan_data(file_data: &[u8], scan_marker_idx: usize, cluster_size: usize, config: &SearchlightConfig) -> JpegScanReconstructionInfo {
 		let fragmentation_start = utils::next_multiple_of(scan_marker_idx + 1, cluster_size) as usize;
 
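As context for the fragmentation_start line (an illustration; the helper's semantics are assumed from its name and call site): rounding the byte index just past the scan marker up to the next cluster boundary finds the first point at which the file could be fragmented.

/// Hypothetical stand-in for utils::next_multiple_of: round n up to
/// the nearest multiple of m (assumed semantics, m > 0)
fn next_multiple_of(n: usize, m: usize) -> usize {
	n.div_ceil(m) * m
}

// e.g. a scan marker at byte 5000 with 4 KiB clusters:
// next_multiple_of(5001, 4096) == 8192, the first cluster boundary
// strictly after the marker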
1 change: 1 addition & 0 deletions libsearchlight/src/validation/png.rs
@@ -336,6 +336,7 @@ impl PngValidator {
 
 impl FileValidator for PngValidator {
 	// Written using https://www.w3.org/TR/png-3/
+	// TODO: Could try reconstructing out-of-order fragmentations by finding a chunk type and testing X clusters after this chunk type and Y clusters after that chunk type (where X+Y is the number of clusters needed)
 	fn validate(&self, file_data: &[u8], file_match: &MatchPair, _all_matches: &[Match], cluster_size: usize, config: &SearchlightConfig) -> FileValidationInfo {
 		let mut chunk_idx = file_match.start_idx as usize + 8;
 
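For orientation (not from the diff): chunk_idx starts 8 bytes in, past the PNG signature, and then walks chunks whose layout the spec fixes as a 4-byte big-endian data length, 4-byte type, the data, and a 4-byte CRC. A sketch of that step, with hypothetical names:

/// Return the index of the next chunk and this chunk's type, or None
/// if the header would run off the end of the data
fn next_chunk_idx(file_data: &[u8], chunk_idx: usize) -> Option<(usize, [u8; 4])> {
	let data_len = u32::from_be_bytes(file_data.get(chunk_idx..chunk_idx + 4)?.try_into().ok()?) as usize;
	let chunk_type: [u8; 4] = file_data.get(chunk_idx + 4..chunk_idx + 8)?.try_into().ok()?;
	// The length field counts only the data, so skip length + type + data + CRC
	Some((chunk_idx + 12 + data_len, chunk_type))
}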
5 changes: 4 additions & 1 deletion libsearchlight/src/validation/zip.rs
@@ -469,13 +469,16 @@ impl FileValidator for ZipValidator {
 					cd.push(record);
 				} else {
 					warn!("ZIP: Central directory file header signature incorrect, skipping entry. This is likely a sign of corruption or fragmentation (central directory at {:#0x})", central_directory_idx);
+					i += 1;
 				}
 			}
 
 			cd
 		};
 
-		let zip_header_matches: Vec<&Match> = all_matches.iter().filter(|m| m.id == ZIP_LOCAL_FILE_HEADER_SIG_ID).collect();
+		let zip_header_matches: Vec<&Match> = all_matches.iter().filter(|m| {
+			m.id == ZIP_LOCAL_FILE_HEADER_SIG_ID && (m.start_idx as usize) < central_directory_idx
+		}).collect();
 
 		let local_file_headers = {
 			let mut lfhs = Vec::new();
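The skipped-entry branch above fires when the four bytes at the cursor are not the central directory file header signature, "PK\x01\x02"; the added i += 1 advances the cursor past such a corrupt entry (without it the scan would presumably stall on the same bad signature). A sketch of that signature check (helper name mine; the constant is the standard ZIP value, stored little-endian):

/// "PK\x01\x02" read as a little-endian u32
const CDFH_SIGNATURE: u32 = 0x02014b50;

fn is_cdfh_at(file_data: &[u8], idx: usize) -> bool {
	file_data
		.get(idx..idx + 4)
		.map(|bytes| u32::from_le_bytes(bytes.try_into().unwrap()) == CDFH_SIGNATURE)
		.unwrap_or(false)
}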
