From 6cd83372f1e6fc7be4fb332084e0cc64f3ff4b0a Mon Sep 17 00:00:00 2001 From: Richard Gomez Date: Wed, 5 Jun 2024 18:58:26 -0400 Subject: [PATCH] feat(engine): compare results per chunk/detector --- main.go | 83 ++++------------------- pkg/engine/ahocorasick/ahocorasickcore.go | 10 --- pkg/engine/engine.go | 80 +++++++++++++++++----- 3 files changed, 76 insertions(+), 97 deletions(-) diff --git a/main.go b/main.go index 20b6ad80c692..ad8fba30f1f0 100644 --- a/main.go +++ b/main.go @@ -11,7 +11,6 @@ import ( "runtime" "strconv" "strings" - "sync" "syscall" "github.com/alecthomas/kingpin/v2" @@ -446,26 +445,20 @@ func run(state overseer.State) { // default detectors, which can be further filtered by the // user. The filters are applied by the engine and are only // subtractive. - Detectors: append(engine.DefaultDetectors(), conf.Detectors...), - Verify: !*noVerification, - IncludeDetectors: *includeDetectors, - ExcludeDetectors: *excludeDetectors, - CustomVerifiersOnly: *customVerifiersOnly, - VerifierEndpoints: *verifiers, - Dispatcher: engine.NewPrinterDispatcher(printer), - FilterUnverified: *filterUnverified, - FilterEntropy: *filterEntropy, - VerificationOverlap: *allowVerificationOverlap, - Results: parsedResults, - PrintAvgDetectorTime: *printAvgDetectorTime, - ShouldScanEntireChunk: *scanEntireChunk, - } - - if *compareDetectionStrategies { - if err := compareScans(ctx, cmd, engConf); err != nil { - logFatal(err, "error comparing detection strategies") - } - return + Detectors: conf.Detectors, + Verify: !*noVerification, + IncludeDetectors: *includeDetectors, + ExcludeDetectors: *excludeDetectors, + CustomVerifiersOnly: *customVerifiersOnly, + VerifierEndpoints: *verifiers, + Dispatcher: engine.NewPrinterDispatcher(printer), + FilterUnverified: *filterUnverified, + FilterEntropy: *filterEntropy, + VerificationOverlap: *allowVerificationOverlap, + Results: parsedResults, + PrintAvgDetectorTime: *printAvgDetectorTime, + ShouldScanEntireChunk: *scanEntireChunk, + CompareDetectionStrategies: *compareDetectionStrategies, } topLevelSubCommand, _, _ := strings.Cut(cmd, " ") @@ -495,54 +488,6 @@ func run(state overseer.State) { } } -func compareScans(ctx context.Context, cmd string, cfg engine.Config) error { - var ( - entireMetrics metrics - maxLengthMetrics metrics - err error - ) - - var wg sync.WaitGroup - wg.Add(1) - - go func() { - defer wg.Done() - // Run scan with entire chunk span calculator. - cfg.ShouldScanEntireChunk = true - entireMetrics, err = runSingleScan(ctx, cmd, cfg) - if err != nil { - ctx.Logger().Error(err, "error running scan with entire chunk span calculator") - } - }() - - // Run scan with max-length span calculator. - maxLengthMetrics, err = runSingleScan(ctx, cmd, cfg) - if err != nil { - return fmt.Errorf("error running scan with custom span calculator: %v", err) - } - - wg.Wait() - - return compareMetrics(maxLengthMetrics.Metrics, entireMetrics.Metrics) -} - -func compareMetrics(customMetrics, entireMetrics engine.Metrics) error { - fmt.Printf("Comparison of scan results: \n") - fmt.Printf("Custom span - Chunks: %d, Bytes: %d, Verified Secrets: %d, Unverified Secrets: %d, Duration: %s\n", - customMetrics.ChunksScanned, customMetrics.BytesScanned, customMetrics.VerifiedSecretsFound, customMetrics.UnverifiedSecretsFound, customMetrics.ScanDuration.String()) - fmt.Printf("Entire chunk - Chunks: %d, Bytes: %d, Verified Secrets: %d, Unverified Secrets: %d, Duration: %s\n", - entireMetrics.ChunksScanned, entireMetrics.BytesScanned, entireMetrics.VerifiedSecretsFound, entireMetrics.UnverifiedSecretsFound, entireMetrics.ScanDuration.String()) - - // Check for differences in scan metrics. - if customMetrics.ChunksScanned != entireMetrics.ChunksScanned || - customMetrics.BytesScanned != entireMetrics.BytesScanned || - customMetrics.VerifiedSecretsFound != entireMetrics.VerifiedSecretsFound { - return fmt.Errorf("scan metrics do not match") - } - - return nil -} - type metrics struct { engine.Metrics hasFoundResults bool diff --git a/pkg/engine/ahocorasick/ahocorasickcore.go b/pkg/engine/ahocorasick/ahocorasickcore.go index 9ec2da626858..6e46450f67ec 100644 --- a/pkg/engine/ahocorasick/ahocorasickcore.go +++ b/pkg/engine/ahocorasick/ahocorasickcore.go @@ -52,16 +52,6 @@ type spanCalculationParams struct { detector detectors.Detector } -// EntireChunkSpanCalculator is a strategy that calculates the match span to use the entire chunk data. -// This is used when we want to match against the full length of the provided chunk. -type EntireChunkSpanCalculator struct{} - -// calculateSpan returns the match span as the length of the chunk data, -// effectively using the entire chunk for matching. -func (e *EntireChunkSpanCalculator) calculateSpan(params spanCalculationParams) matchSpan { - return matchSpan{startOffset: 0, endOffset: int64(len(params.chunkData))} -} - // adjustableSpanCalculator is a strategy that calculates match spans. It uses a default offset magnitude // or values provided by specific detectors to adjust the start and end indices of the span, allowing // for more granular control over the match. diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index d4af8d5a2ffe..ddf016932307 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -144,6 +144,8 @@ type Config struct { // VerificationOverlapWorkerMultiplier is used to determine the number of verification overlap workers to spawn. VerificationOverlapWorkerMultiplier int + + CompareDetectionStrategies bool } // Engine represents the core scanning engine responsible for detecting secrets in input data. @@ -173,6 +175,12 @@ type Engine struct { // By default, the engine will only scan a subset of the chunk if a detector matches the chunk. // If this flag is set to true, the engine will scan the entire chunk. scanEntireChunk bool + // If this flag is set to true, the engine will run two scans per chunk: + // 1. the entire chunk (old) + // 2. a subset of the chunk (new) + // + // Any discrepancies between methods will be logged. + compareScanStrategies bool // ahoCorasickHandler manages the Aho-Corasick trie and related keyword lookups. ahoCorasickCore *ahocorasick.Core @@ -232,6 +240,7 @@ func NewEngine(ctx context.Context, cfg *Config) (*Engine, error) { detectorWorkerMultiplier: cfg.DetectorWorkerMultiplier, notificationWorkerMultiplier: cfg.NotificationWorkerMultiplier, verificationOverlapWorkerMultiplier: cfg.VerificationOverlapWorkerMultiplier, + compareScanStrategies: cfg.CompareDetectionStrategies, } if engine.sourceManager == nil { return nil, fmt.Errorf("source manager is required") @@ -505,14 +514,8 @@ func (e *Engine) initialize(ctx context.Context) error { e.dedupeCache = cache ctx.Logger().V(4).Info("engine initialized") - // Configure the EntireChunkSpanCalculator if the engine is set to scan the entire chunk. - var ahoCOptions []ahocorasick.CoreOption - if e.scanEntireChunk { - ahoCOptions = append(ahoCOptions, ahocorasick.WithSpanCalculator(new(ahocorasick.EntireChunkSpanCalculator))) - } - ctx.Logger().V(4).Info("setting up aho-corasick core") - e.ahoCorasickCore = ahocorasick.NewAhoCorasickCore(e.detectors, ahoCOptions...) + e.ahoCorasickCore = ahocorasick.NewAhoCorasickCore(e.detectors) ctx.Logger().V(4).Info("set up aho-corasick core") return nil @@ -1021,12 +1024,26 @@ func (e *Engine) verificationOverlapWorker(ctx context.Context) { func (e *Engine) detectorWorker(ctx context.Context) { for data := range e.detectableChunksChan { start := time.Now() - e.detectChunk(ctx, data) - chunksDetectedLatency.Observe(float64(time.Since(start).Milliseconds())) + + if !e.compareScanStrategies { + // Typical use case: scan the chunk. + e.detectChunk(ctx, data, e.scanEntireChunk) + chunksDetectedLatency.Observe(float64(time.Since(start).Milliseconds())) + } else { + // --compare-detection-strategies is enabled, scan with both methods and compare results. + customSpanResultCount := e.detectChunk(ctx, data, false) + entireChunkResultCount := e.detectChunk(ctx, data, true) + chunksDetectedLatency.Observe(float64(time.Since(start).Milliseconds())) + + if customSpanResultCount != entireChunkResultCount { + err := fmt.Errorf("mismatch between custom span and entire chunk: %d vs %d", customSpanResultCount, entireChunkResultCount) + ctx.Logger().Error(err, "Scan results do not match", "detector", data.detector.Type().String()) + } + } } } -func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) { +func (e *Engine) detectChunk(ctx context.Context, data detectableChunk, scanEntireChunk bool) int { var start time.Time if e.printAvgDetectorTime { start = time.Now() @@ -1043,8 +1060,14 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) { // The matches field of the DetectorMatch struct contains the // relevant portions of the chunk data that were matched. // This avoids the need for additional regex processing on the entire chunk data. - matches := data.detector.Matches() - for _, matchBytes := range matches { + var matchedBytes [][]byte + if scanEntireChunk { + matchedBytes = [][]byte{data.chunk.Data} + } else { + matchedBytes = data.detector.Matches() + } + resultCount := 0 + for _, matchBytes := range matchedBytes { matchCount++ detectBytesPerMatch.Observe(float64(len(matchBytes))) @@ -1072,13 +1095,24 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) { if e.printAvgDetectorTime && len(results) > 0 { elapsed := time.Since(start) detectorName := results[0].DetectorType.String() + avgTimeI, ok := e.metrics.detectorAvgTime.Load(detectorName) + if !ok { + ctx.Logger().Error( + errors.New("failed to load metric"), + "Unable to track detector time", + "detector", detectorName) + goto HandleResults + } + var avgTime []time.Duration - if ok { - avgTime, ok = avgTimeI.([]time.Duration) - if !ok { - return - } + avgTime, ok = avgTimeI.([]time.Duration) + if !ok { + ctx.Logger().Error( + errors.New("failed to cast metric as []time.Duration"), + "Unable to track detector time", + "detector", detectorName) + goto HandleResults } avgTime = append(avgTime, elapsed) e.metrics.detectorAvgTime.Store(detectorName, avgTime) @@ -1093,6 +1127,10 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) { results = e.filterResults(ctx, data.detector, results) } + HandleResults: + results = e.filterResults(ctx, data.detector, results) + + resultCount += len(results) for _, res := range results { e.processResult(ctx, data, res, isFalsePositive) } @@ -1100,7 +1138,13 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) { matchesPerChunk.Observe(float64(matchCount)) - data.wgDoneFn() + // If `e.compareScanStrategies` is enabled, two scans will be run. + // Don't decrement the WaitGroup until both have been completed. + if (!e.compareScanStrategies) || (e.compareScanStrategies && !scanEntireChunk) { + data.wgDoneFn() + } + + return resultCount } func (e *Engine) filterResults(