diff --git a/.vscode/launch.json b/.vscode/launch.json
index 3b13377..1ef3e1e 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -23,115 +23,6 @@
 			"args": [],
 			"cwd": "${workspaceFolder}"
 		},
-		{
-			"type": "lldb",
-			"request": "launch",
-			"name": "Debug example 'io_uring_bench'",
-			"cargo": {
-				"args": [
-					"build",
-					"--example=io_uring_bench",
-					"--package=libsearchlight"
-				],
-				"filter": {
-					"name": "io_uring_bench",
-					"kind": "example"
-				}
-			},
-			"args": [],
-			"cwd": "${workspaceFolder}"
-		},
-		{
-			"type": "lldb",
-			"request": "launch",
-			"name": "Debug example 'filebuf_bench'",
-			"cargo": {
-				"args": [
-					"build",
-					"--example=filebuf_bench",
-					"--package=libsearchlight"
-				],
-				"filter": {
-					"name": "filebuf_bench",
-					"kind": "example"
-				}
-			},
-			"args": [],
-			"cwd": "${workspaceFolder}"
-		},
-		{
-			"type": "lldb",
-			"request": "launch",
-			"name": "Debug example 'generate_io_bench_dat'",
-			"cargo": {
-				"args": [
-					"build",
-					"--example=generate_io_bench_dat",
-					"--package=libsearchlight"
-				],
-				"filter": {
-					"name": "generate_io_bench_dat",
-					"kind": "example"
-				}
-			},
-			"args": [],
-			"cwd": "${workspaceFolder}"
-		},
-		{
-			"type": "lldb",
-			"request": "launch",
-			"name": "Debug example 'mmap_bench'",
-			"cargo": {
-				"args": [
-					"build",
-					"--example=mmap_bench",
-					"--package=libsearchlight"
-				],
-				"filter": {
-					"name": "mmap_bench",
-					"kind": "example"
-				}
-			},
-			"args": [],
-			"cwd": "${workspaceFolder}"
-		},
-		{
-			"type": "lldb",
-			"request": "launch",
-			"name": "Debug example 'direct_bench'",
-			"cargo": {
-				"args": [
-					"build",
-					"--example=direct_bench",
-					"--package=libsearchlight"
-				],
-				"filter": {
-					"name": "direct_bench",
-					"kind": "example"
-				}
-			},
-			"args": [],
-			"cwd": "${workspaceFolder}"
-		},
-		{
-			"type": "lldb",
-			"request": "launch",
-			"name": "Debug benchmark 'io_bench'",
-			"cargo": {
-				"args": [
-					"test",
-					"--no-run",
-					"--bench=io_bench",
-					"--package=libsearchlight"
-				],
-				"filter": {
-					"name": "io_bench",
-					"kind": "bench"
-				}
-			},
-			"args": [],
-			"cwd": "${workspaceFolder}"
-		},
 		{
 			"type": "lldb",
 			"request": "launch",
@@ -151,25 +42,6 @@
 			"args": [],
 			"cwd": "${workspaceFolder}"
 		},
-		{
-			"type": "lldb",
-			"request": "launch",
-			"name": "Debug benchmark 'memcpy_bench'",
-			"cargo": {
-				"args": [
-					"test",
-					"--no-run",
-					"--bench=memcpy_bench",
-					"--package=libsearchlight"
-				],
-				"filter": {
-					"name": "memcpy_bench",
-					"kind": "bench"
-				}
-			},
-			"args": [],
-			"cwd": "${workspaceFolder}"
-		},
 		{
 			"type": "lldb",
 			"request": "launch",
@@ -178,7 +50,7 @@
 				"args": [
 					"build",
 					"--bin=searchlight",
-					"--package=searchlight",
+					"--package=searchlight"
 				],
 				"filter": {
 					"name": "searchlight",
@@ -186,9 +58,9 @@
 				}
 			},
 			"args": [
-				"-v",
+				"-vv",
 				"-i",
-				"test_data/zip_image.raw"
+				"test_data/stoneblock_fragmented.dat"
 			],
 			"cwd": "${workspaceFolder}"
 		},
diff --git a/libsearchlight/src/searchlight.rs b/libsearchlight/src/searchlight.rs
index 177af7e..32f403f 100644
--- a/libsearchlight/src/searchlight.rs
+++ b/libsearchlight/src/searchlight.rs
@@ -137,14 +137,18 @@ impl Searchlight {
 		// A None for cluster size here will indicate that the headers appear to be mostly not allocated on any usual cluster boundaries, or that
 		// has been passed in as the case
 		let cluster_size = info.cluster_size.unwrap_or_else(|| {
-			estimate_cluster_size(matches.iter().filter(|m| {
+			let est = estimate_cluster_size(matches.iter().filter(|m| {
 				if let Some((_, _, part)) = id_ftype_map.get(&m.id) {
 					*part == MatchPart::Header
 				} else {
 					assert!(false);
 					panic!() // assert!(false) is not detected as a control flow terminator/does not return ! but is more semantically correct
 				}
-			})).unwrap_or(1) // A cluster size of 1 is effectively the same as not being clustered
+			})).unwrap_or(1); // A cluster size of 1 is effectively the same as not being clustered
+
+			info!("Calculated cluster size estimate: {est}");
+
+			est
 		});
 
 		if log_enabled!(Level::Trace) {
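
A side note on the `assert!(false); panic!()` pair kept in the context above: as the comment says, `assert!(false)` is not treated as diverging, but the standard `unreachable!()` macro is (it has type `!`) and carries a message. A minimal sketch of that alternative; `part_is_header` is a hypothetical stand-in for the filter closure, not code from this repository:

	// Hypothetical stand-in for the filter closure in the diff above
	fn part_is_header(part: Option<&str>) -> bool {
		if let Some(p) = part {
			p == "header"
		} else {
			// unreachable!() has type `!`, so this arm type-checks without a trailing
			// panic!(), unlike assert!(false), which the compiler does not treat as diverging
			unreachable!("every match id should have an entry in id_ftype_map")
		}
	}

	fn main() {
		assert!(part_is_header(Some("header")));
	}
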
diff --git a/libsearchlight/src/utils.rs b/libsearchlight/src/utils.rs
index 727a354..565be1e 100644
--- a/libsearchlight/src/utils.rs
+++ b/libsearchlight/src/utils.rs
@@ -3,7 +3,7 @@ pub mod str_parse;
 pub mod fragments_index;
 pub mod subrange;
 
-use std::{collections::BTreeMap, fs::File, io::{self, Seek}, num::NonZeroUsize, ops::Range};
+use std::{collections::BTreeMap, fs::File, io::{self, Seek}, ops::Range};
 
 use crate::{search::Match, utils::subrange::IntoSubrangesExact, validation::Fragment};
 
@@ -89,47 +89,80 @@ pub fn estimate_cluster_size<'a>(headers: impl IntoIterator<Item = &'a Match>) -> Option<usize> {
 }
 
 /// Generates a list of lists of fragments, as candidates for reconstructing fragmented data in `fragmentation_range`. That is, for fragmented data in
-/// `fragmentation_range`, occupying a known `num_file_clusters` clusters, and being broken into `num_fragments` fragments, this function will generate
-/// all possible arrangements of clusters that the fragmented data can occupy, assuming that the fragmented data is in-order. `num_fragments` will usually
-/// just be a guess, in an attempt to reconstruct the low-hanging fruit, so to speak.
+/// `fragmentation_range`, occupying a known `num_file_clusters` clusters, this function will generate some possible arrangements of clusters that the
+/// fragmented data can occupy, assuming that the fragmented data is in-order. To reiterate, this function is non-exhaustive, but aims to tackle common
+/// cases, such as bifragmentation/a single gap.
+///
 /// # Panics
-/// Currently this function only supports `num_fragments` of 3 or under, and will panic if given a higher number. Will also panic if the fragmentation range
-/// is not on cluster boundaries.
-fn generate_fragmentations(cluster_size: usize, fragmentation_range: Range<usize>, num_file_clusters: NonZeroUsize, num_fragments: usize) -> Vec<Vec<Range<u64>>> {
+/// Panics if the fragmentation range is not on cluster boundaries.
+pub fn generate_fragmentations(cluster_size: usize, fragmentation_range: Range<usize>, num_file_clusters: usize) -> Vec<Vec<Range<u64>>> {
 	assert_eq!(fragmentation_range.start % cluster_size, 0);
 	assert_eq!(fragmentation_range.end % cluster_size, 0);
 
-	if num_fragments == 1 && (fragmentation_range.len() / cluster_size) != num_file_clusters.get() {
-		panic!("Error: There are no solutions for no. fragments = 1 where the fragmentation range is larger than the number of file clusters");
+	// Get the range for each cluster
+	let clusters = fragmentation_range.clone().into_subranges_exact(cluster_size);
+	assert_eq!(*clusters.remainder(), None);
+	assert_eq!(clusters.len(), fragmentation_range.len() / cluster_size);
+
+	// NOTE: While for now we're just tackling the simple bifragmented case, the problem of finding all possible in-order cases is laid out below
+	// In an ordered set of N numbers, we need to find G non-adjacent groups of contiguous elements such that the total count of elements across the G groups is equal to C
+	// 1, 2, 3, 4, 5; N = 5, G = 1, C = 3
+	// -> [1, 2, 3], [2, 3, 4], [3, 4, 5]
+	// 1, 2, 3, 4, 5; N = 5, G = 2, C = 3
+	// -> [1, 2][4], [1, 2][5], [2, 3][5], [1][3, 4], [1][4, 5], [2][4, 5]
+	//
+	// Number of solutions = G * C (N should factor in this...?)
+
+	let mut gap_idx = 0;
+	let gap_len = clusters.len() - num_file_clusters;
+
+	let mut res = Vec::new();
+
+	while gap_idx <= clusters.len() - gap_len {
+		// Get all the clusters that are not in the gap, and simplify
+		let mut file_clusters: Vec<Range<u64>> = clusters.iter().enumerate().filter(|(i, _)| *i < gap_idx || *i >= (gap_idx + gap_len)).map(|(_, c)| c.start as u64..c.end as u64).collect();
+		simplify_ranges(&mut file_clusters);
+
+		res.push(file_clusters);
+
+		gap_idx += 1;
 	}
 
-	match num_fragments {
-		1 => {
-			// Num_fragmentations = 1 is kinda a no-op
-			vec![vec![fragmentation_range.start as u64..fragmentation_range.end as u64]]
-		}
-		2..=3 => {
-			let clusters = fragmentation_range.into_subranges_exact(cluster_size);
-			assert_eq!(*clusters.remainder(), None);
-
-			todo!()
-		}
-		_ => {
-			panic!("Error: Numbers of fragments over 3 is unsupported at this time");
-		}
-	}
-
-	// TODO: Implement a sliding window generator - For 2 fragments, the sliding window is the gap, for 3, it's the third fragment
-
-	// TODO: Implement an algorithm to do as described in the doc comment. Look at https://doi.org/10.1016/j.diin.2019.04.014 for inspiration if need be
-}
-
-// TODO: Need a function to merge adjacent fragments (simplification)
+	res
+}
+
+/// Takes a vec of assumed in-order, non-overlapping ranges, and where the end of a range is equal to the start of the next range, merges
+/// the two ranges into one
+pub fn simplify_ranges<T>(ranges: &mut Vec<Range<T>>) where T: PartialEq {
+	let mut i = 1;
+	while i < ranges.len() {
+		if ranges[i - 1].end == ranges[i].start {
+			ranges[i - 1].end = ranges.remove(i).end;
+			i -= 1;
+		}
+
+		i += 1;
+	}
+}
+
+/// Combines a list of ranges of indexes and a slice of data that is referred to by those indexes to produce a list of slices of that data
+// NOTE: Is this useful?
+pub fn idxs_to_slice<'a, T>(data: &'a [T], idxs: &[Range<usize>]) -> Vec<&'a [T]> {
+	let mut res = Vec::with_capacity(idxs.len());
+
+	for range in idxs {
+		res.push(&data[range.clone()])
+	}
+
+	res
+}
 
 #[cfg(test)]
 mod test {
 	use crate::{search::Match, utils::estimate_cluster_size};
 
+	use super::{generate_fragmentations, simplify_ranges};
+
 	#[test]
 	fn test_cluster_size_estimates() {
 		macro_rules! simple_match {
@@ -153,8 +186,60 @@ mod test {
 
 		let est_cs = estimate_cluster_size(headers.iter());
 
-		// println!("est_cs: {:?}", est_cs);
-
 		assert_eq!(est_cs, Some(1024))
 	}
+
+	#[test]
+	fn test_generate_fragmentations() {
+		let cluster_size = 2;
+
+		let fragmentation_range = 10..20;
+
+		let num_file_clusters = 3;
+
+		// 10..12, 12..14, 14..16, 16..18, 18..20
+
+		let expected = vec![
+			vec![
+				14..20
+			],
+			vec![
+				10..12,
+				16..20
+			],
+			vec![
+				10..14,
+				18..20
+			],
+			vec![
+				10..16
+			]
+		];
+
+		let calc_fragmentations = generate_fragmentations(cluster_size, fragmentation_range, num_file_clusters);
+
+		assert_eq!(calc_fragmentations, expected);
+	}
+
+	#[test]
+	fn test_simplify_ranges() {
+		let mut test_data = vec![
+			0..5,
+			5..10,
+			11..15,
+			14..20,
+			20..30,
+			30..40
+		];
+
+		let expected = vec![
+			0..10,
+			11..15,
+			14..40
+		];
+
+		simplify_ranges(&mut test_data);
+
+		assert_eq!(test_data, expected);
+	}
 }
\ No newline at end of file
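
As a rough cross-check of the combinatorial problem laid out in the NOTE inside `generate_fragmentations` (G non-adjacent contiguous groups totalling C elements, drawn from N ordered elements), a brute-force enumeration reproduces the counts listed there. This is a sketch only; `count_arrangements` is hypothetical and not part of this diff:

	// Brute-force count of in-order arrangements: G non-adjacent contiguous groups
	// drawn from N ordered elements, with the groups' lengths summing to C
	fn count_arrangements(n: usize, g: usize, c: usize) -> usize {
		fn rec(n: usize, start: usize, groups_left: usize, elems_left: usize) -> usize {
			if groups_left == 0 {
				return usize::from(elems_left == 0);
			}
			let mut count = 0;
			for s in start..n {
				for len in 1..=elems_left.min(n - s) {
					// The next group must start at least one element past this
					// group's end, enforcing non-adjacency
					count += rec(n, s + len + 1, groups_left - 1, elems_left - len);
				}
			}
			count
		}
		rec(n, 0, g, c)
	}

	fn main() {
		assert_eq!(count_arrangements(5, 1, 3), 3); // [1, 2, 3], [2, 3, 4], [3, 4, 5]
		assert_eq!(count_arrangements(5, 2, 3), 6); // the six G = 2 solutions in the NOTE
	}
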
diff --git a/libsearchlight/src/validation/png.rs b/libsearchlight/src/validation/png.rs
index 14c463f..9260c8e 100644
--- a/libsearchlight/src/validation/png.rs
+++ b/libsearchlight/src/validation/png.rs
@@ -1,5 +1,3 @@
-use log::trace;
-
 use crate::{search::pairing::MatchPair, utils::{self, fragments_index::FragmentsIndex}};
 
 use super::{FileValidationInfo, FileValidationType, FileValidator, Fragment};
@@ -161,7 +159,7 @@
 		}
 
 		// Attempt to reconstruct the chunk
-		let recons_info = Self::reconstruct_chunk(file_data, chunk_idx, chunk_type, chunk_data_len as usize, cluster_size);
+		let recons_info = Self::reconstruct_chunk(file_data, chunk_idx, chunk_data_len as usize, cluster_size);
 
 		match recons_info {
 			ChunkReconstructionInfo::Failure => {
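
For reference when reading the `reconstruct_chunk` changes below: a PNG chunk is a 4-byte big-endian length, a 4-byte type, `length` data bytes, then a 4-byte CRC. The next chunk's type field therefore sits 8 bytes past the CRC offset (the CRC itself plus the next chunk's length field), which is what the `+ 4` to `+ 8` fix below accounts for. A small runnable sketch; the helper names are illustrative, not from the codebase:

	// Offsets within a PNG stream for a chunk starting at chunk_idx
	// (4-byte length, 4-byte type, data, 4-byte CRC); helpers are illustrative only
	fn crc_offset(chunk_idx: usize, chunk_data_len: usize) -> usize {
		chunk_idx + 8 + chunk_data_len // skip the length and type fields, then the data
	}

	fn next_chunk_type_offset(chunk_idx: usize, chunk_data_len: usize) -> usize {
		crc_offset(chunk_idx, chunk_data_len) + 8 // skip this CRC and the next chunk's length
	}

	fn main() {
		// IHDR starts at byte 8 (after the PNG signature) and has 13 data bytes
		assert_eq!(crc_offset(8, 13), 29);
		// The next chunk starts at 33; its type field is 4 bytes in, at 37
		assert_eq!(next_chunk_type_offset(8, 13), 37);
	}
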
@@ -210,16 +208,16 @@
 	/// Attempts to reconstruct a fragmented PNG chunk, assuming that the length, chunk type, and CRC are not fragmented and that all
 	/// fragments of the chunk are in-order (limitations) by searching forwards for a valid chunk type, decoding the CRC that should occur just before it,
 	/// and enumerating the possible cluster arrangements between the start of the chunk data and the decoded CRC for a matching calculated CRC
-	fn reconstruct_chunk(file_data: &[u8], chunk_idx: usize, chunk_type: u32, chunk_data_len: usize, cluster_size: u64) -> ChunkReconstructionInfo {
+	fn reconstruct_chunk(file_data: &[u8], chunk_idx: usize, chunk_data_len: usize, cluster_size: u64) -> ChunkReconstructionInfo {
 		let unfrag_crc_offset = chunk_idx + chunk_data_len + 8;
 
-		let mut next_chunk_type_offset = unfrag_crc_offset + 4;
+		let mut next_chunk_type_offset = unfrag_crc_offset + 8;
 
 		// Find the next valid chunk type
 		// NOTE: Currently, we're checking against a list of known valid chunk types. This can't be exhaustive though so will miss valid chunks
 		// Perhaps an alternative method that could stop text files being counted be checking that the CRC and length are not ASCII (alphabetical?)?
 		// Course, they may be in a valid file, but are unlikely to be
-		while !Self::validate_chunk_type(&file_data[next_chunk_type_offset..4]) {
+		while !Self::validate_chunk_type(&file_data[next_chunk_type_offset..(next_chunk_type_offset + 4)]) {
 			next_chunk_type_offset += cluster_size as usize;
 
 			// If we're now out of bounds (or will be upon attempting to read the chunk data len) then return with failure
@@ -228,8 +226,6 @@
 			}
 		}
 
-		trace!("Omg we got to the icky bit");
-
 		// Load the (what we assume is) the CRC
 		let stored_crc = u32::from_be_bytes(file_data[(next_chunk_type_offset - 8)..(next_chunk_type_offset - 4)].try_into().unwrap());
 
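
The reconstruction loop in the next hunk hashes the same prefix (the chunk type plus the data before the fragmentation point) for every candidate arrangement, so it builds one `crc32fast::Hasher` for that prefix and clones it per candidate. A minimal sketch of the trick in isolation, using only the `Hasher` calls that appear in the diff (requires the crc32fast crate):

	// Hash a shared prefix once, then clone the hasher for each candidate suffix
	fn main() {
		let data = b"shared-prefix|candidate";

		let mut prefix = crc32fast::Hasher::new();
		prefix.update(&data[..14]); // the bytes common to every candidate

		let mut h = prefix.clone();
		h.update(&data[14..]); // one candidate's remaining bytes

		// Matches a CRC computed over the whole buffer in one pass
		let mut whole = crc32fast::Hasher::new();
		whole.update(data);
		assert_eq!(h.finalize(), whole.finalize());
	}
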
@@ -247,12 +243,42 @@
 		assert_eq!((next_chunk_type_offset - (unfrag_crc_offset + 8)) % cluster_size as usize, 0);
 		assert_eq!((fragmentation_end - fragmentation_start) % cluster_size as usize, 0);
 
-		// TODO: Use utils::generate_fragmentations (once implemented) and calculate the CRC over each fragmentation (including data outside of the fragmentation
-		// area, like the chunk type and data in the same cluster) to find one that matches the stored CRC. If none can be found, return a failure
-
-		// utils::generate_fragmentations
-
-		todo!()
+		let fragmentations = utils::generate_fragmentations(cluster_size as usize, fragmentation_start..fragmentation_end, clusters_needed);
+
+		let mut correct_fragmentation = None;
+
+		// Initialise CRC hasher with the chunk type, and chunk data up to the fragmentation point
+		let mut hasher = crc32fast::Hasher::new();
+		hasher.update(&file_data[(chunk_idx + 4)..fragmentation_start]);
+
+		for data_frags in fragmentations {
+			// Clone the hasher and hash the fragments
+			let mut hasher = hasher.clone();
+			for range in &data_frags {
+				hasher.update(&file_data[range.start as usize..range.end as usize]);
+			}
+
+			// Finish hashing with the chunk data from the fragmentation end to the stored CRC
+			hasher.update(&file_data[fragmentation_end..(next_chunk_type_offset - 8)]);
+
+			// Then check whether the calculated CRC matches the stored one
+			let calc_crc = hasher.finalize();
+			if calc_crc == stored_crc {
+				correct_fragmentation = Some(data_frags);
+				break;
+			}
+		}
+
+		if let Some(mut data_frags) = correct_fragmentation {
+			data_frags.insert(0, chunk_idx as u64..fragmentation_start as u64);
+			data_frags.push(fragmentation_end as u64..(next_chunk_type_offset - 4) as u64);
+
+			utils::simplify_ranges(&mut data_frags);
+
+			ChunkReconstructionInfo::Success { chunk_frags: data_frags, next_chunk_idx: next_chunk_type_offset as u64 - 4 }
+		} else {
+			ChunkReconstructionInfo::Failure
+		}
 	}
 
 	/// In the PNG spec, a valid chunk type must have each byte match \[a-zA-Z\]. However, this could mean that plain text files are caught,
@@ -341,6 +367,7 @@ impl FileValidator for PngValidator {
 			let mut chunk_info = Self::validate_chunk(&mut requires_plte, &mut plte_forbidden, &file_data, chunk_idx, cluster_size);
 
 			fragments.append(&mut chunk_info.chunk_frags);
+			utils::simplify_ranges(&mut fragments);
 
 			worst_chunk_validation = worst_chunk_validation.worst_of(chunk_info.validation_type);
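
A note on the `simplify_ranges(&mut fragments)` call added in the last hunk: each validated chunk appends its fragment ranges to the running list, and consecutive unfragmented chunks produce back-to-back ranges, so merging after each append keeps the final fragment list minimal. A small usage sketch, assuming the module is exported as `libsearchlight::utils`; the literal ranges are made up for illustration:

	use libsearchlight::utils::simplify_ranges;

	fn main() {
		// Two back-to-back chunks already collected, then a bifragmented chunk
		let mut fragments: Vec<std::ops::Range<u64>> = vec![0..33, 33..58];
		fragments.append(&mut vec![58..1024, 2048..3072]);

		simplify_ranges(&mut fragments);

		// Adjacent ranges collapse; the real gap at 1024..2048 is preserved
		assert_eq!(fragments, vec![0..1024, 2048..3072]);
	}
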