Skip to content

Commit

Permalink
feat(engine): compare results per chunk/detector
Browse files Browse the repository at this point in the history
  • Loading branch information
rgmz committed Nov 8, 2024
1 parent 781157a commit 6cd8337
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 97 deletions.
83 changes: 14 additions & 69 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ import (
"runtime"
"strconv"
"strings"
"sync"
"syscall"

"github.com/alecthomas/kingpin/v2"
Expand Down Expand Up @@ -446,26 +445,20 @@ func run(state overseer.State) {
// default detectors, which can be further filtered by the
// user. The filters are applied by the engine and are only
// subtractive.
Detectors: append(engine.DefaultDetectors(), conf.Detectors...),
Verify: !*noVerification,
IncludeDetectors: *includeDetectors,
ExcludeDetectors: *excludeDetectors,
CustomVerifiersOnly: *customVerifiersOnly,
VerifierEndpoints: *verifiers,
Dispatcher: engine.NewPrinterDispatcher(printer),
FilterUnverified: *filterUnverified,
FilterEntropy: *filterEntropy,
VerificationOverlap: *allowVerificationOverlap,
Results: parsedResults,
PrintAvgDetectorTime: *printAvgDetectorTime,
ShouldScanEntireChunk: *scanEntireChunk,
}

if *compareDetectionStrategies {
if err := compareScans(ctx, cmd, engConf); err != nil {
logFatal(err, "error comparing detection strategies")
}
return
Detectors: conf.Detectors,
Verify: !*noVerification,
IncludeDetectors: *includeDetectors,
ExcludeDetectors: *excludeDetectors,
CustomVerifiersOnly: *customVerifiersOnly,
VerifierEndpoints: *verifiers,
Dispatcher: engine.NewPrinterDispatcher(printer),
FilterUnverified: *filterUnverified,
FilterEntropy: *filterEntropy,
VerificationOverlap: *allowVerificationOverlap,
Results: parsedResults,
PrintAvgDetectorTime: *printAvgDetectorTime,
ShouldScanEntireChunk: *scanEntireChunk,
CompareDetectionStrategies: *compareDetectionStrategies,
}

topLevelSubCommand, _, _ := strings.Cut(cmd, " ")
Expand Down Expand Up @@ -495,54 +488,6 @@ func run(state overseer.State) {
}
}

func compareScans(ctx context.Context, cmd string, cfg engine.Config) error {
var (
entireMetrics metrics
maxLengthMetrics metrics
err error
)

var wg sync.WaitGroup
wg.Add(1)

go func() {
defer wg.Done()
// Run scan with entire chunk span calculator.
cfg.ShouldScanEntireChunk = true
entireMetrics, err = runSingleScan(ctx, cmd, cfg)
if err != nil {
ctx.Logger().Error(err, "error running scan with entire chunk span calculator")
}
}()

// Run scan with max-length span calculator.
maxLengthMetrics, err = runSingleScan(ctx, cmd, cfg)
if err != nil {
return fmt.Errorf("error running scan with custom span calculator: %v", err)
}

wg.Wait()

return compareMetrics(maxLengthMetrics.Metrics, entireMetrics.Metrics)
}

func compareMetrics(customMetrics, entireMetrics engine.Metrics) error {
fmt.Printf("Comparison of scan results: \n")
fmt.Printf("Custom span - Chunks: %d, Bytes: %d, Verified Secrets: %d, Unverified Secrets: %d, Duration: %s\n",
customMetrics.ChunksScanned, customMetrics.BytesScanned, customMetrics.VerifiedSecretsFound, customMetrics.UnverifiedSecretsFound, customMetrics.ScanDuration.String())
fmt.Printf("Entire chunk - Chunks: %d, Bytes: %d, Verified Secrets: %d, Unverified Secrets: %d, Duration: %s\n",
entireMetrics.ChunksScanned, entireMetrics.BytesScanned, entireMetrics.VerifiedSecretsFound, entireMetrics.UnverifiedSecretsFound, entireMetrics.ScanDuration.String())

// Check for differences in scan metrics.
if customMetrics.ChunksScanned != entireMetrics.ChunksScanned ||
customMetrics.BytesScanned != entireMetrics.BytesScanned ||
customMetrics.VerifiedSecretsFound != entireMetrics.VerifiedSecretsFound {
return fmt.Errorf("scan metrics do not match")
}

return nil
}

type metrics struct {
engine.Metrics
hasFoundResults bool
Expand Down
10 changes: 0 additions & 10 deletions pkg/engine/ahocorasick/ahocorasickcore.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,16 +52,6 @@ type spanCalculationParams struct {
detector detectors.Detector
}

// EntireChunkSpanCalculator is a strategy that calculates the match span to use the entire chunk data.
// This is used when we want to match against the full length of the provided chunk.
type EntireChunkSpanCalculator struct{}

// calculateSpan returns the match span as the length of the chunk data,
// effectively using the entire chunk for matching.
func (e *EntireChunkSpanCalculator) calculateSpan(params spanCalculationParams) matchSpan {
return matchSpan{startOffset: 0, endOffset: int64(len(params.chunkData))}
}

// adjustableSpanCalculator is a strategy that calculates match spans. It uses a default offset magnitude
// or values provided by specific detectors to adjust the start and end indices of the span, allowing
// for more granular control over the match.
Expand Down
80 changes: 62 additions & 18 deletions pkg/engine/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,8 @@ type Config struct {

// VerificationOverlapWorkerMultiplier is used to determine the number of verification overlap workers to spawn.
VerificationOverlapWorkerMultiplier int

CompareDetectionStrategies bool
}

// Engine represents the core scanning engine responsible for detecting secrets in input data.
Expand Down Expand Up @@ -173,6 +175,12 @@ type Engine struct {
// By default, the engine will only scan a subset of the chunk if a detector matches the chunk.
// If this flag is set to true, the engine will scan the entire chunk.
scanEntireChunk bool
// If this flag is set to true, the engine will run two scans per chunk:
// 1. the entire chunk (old)
// 2. a subset of the chunk (new)
//
// Any discrepancies between methods will be logged.
compareScanStrategies bool

// ahoCorasickHandler manages the Aho-Corasick trie and related keyword lookups.
ahoCorasickCore *ahocorasick.Core
Expand Down Expand Up @@ -232,6 +240,7 @@ func NewEngine(ctx context.Context, cfg *Config) (*Engine, error) {
detectorWorkerMultiplier: cfg.DetectorWorkerMultiplier,
notificationWorkerMultiplier: cfg.NotificationWorkerMultiplier,
verificationOverlapWorkerMultiplier: cfg.VerificationOverlapWorkerMultiplier,
compareScanStrategies: cfg.CompareDetectionStrategies,
}
if engine.sourceManager == nil {
return nil, fmt.Errorf("source manager is required")
Expand Down Expand Up @@ -505,14 +514,8 @@ func (e *Engine) initialize(ctx context.Context) error {
e.dedupeCache = cache
ctx.Logger().V(4).Info("engine initialized")

// Configure the EntireChunkSpanCalculator if the engine is set to scan the entire chunk.
var ahoCOptions []ahocorasick.CoreOption
if e.scanEntireChunk {
ahoCOptions = append(ahoCOptions, ahocorasick.WithSpanCalculator(new(ahocorasick.EntireChunkSpanCalculator)))
}

ctx.Logger().V(4).Info("setting up aho-corasick core")
e.ahoCorasickCore = ahocorasick.NewAhoCorasickCore(e.detectors, ahoCOptions...)
e.ahoCorasickCore = ahocorasick.NewAhoCorasickCore(e.detectors)
ctx.Logger().V(4).Info("set up aho-corasick core")

return nil
Expand Down Expand Up @@ -1021,12 +1024,26 @@ func (e *Engine) verificationOverlapWorker(ctx context.Context) {
func (e *Engine) detectorWorker(ctx context.Context) {
for data := range e.detectableChunksChan {
start := time.Now()
e.detectChunk(ctx, data)
chunksDetectedLatency.Observe(float64(time.Since(start).Milliseconds()))

if !e.compareScanStrategies {
// Typical use case: scan the chunk.
e.detectChunk(ctx, data, e.scanEntireChunk)
chunksDetectedLatency.Observe(float64(time.Since(start).Milliseconds()))
} else {
// --compare-detection-strategies is enabled, scan with both methods and compare results.
customSpanResultCount := e.detectChunk(ctx, data, false)
entireChunkResultCount := e.detectChunk(ctx, data, true)
chunksDetectedLatency.Observe(float64(time.Since(start).Milliseconds()))

if customSpanResultCount != entireChunkResultCount {
err := fmt.Errorf("mismatch between custom span and entire chunk: %d vs %d", customSpanResultCount, entireChunkResultCount)
ctx.Logger().Error(err, "Scan results do not match", "detector", data.detector.Type().String())
}
}
}
}

func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
func (e *Engine) detectChunk(ctx context.Context, data detectableChunk, scanEntireChunk bool) int {
var start time.Time
if e.printAvgDetectorTime {
start = time.Now()
Expand All @@ -1043,8 +1060,14 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
// The matches field of the DetectorMatch struct contains the
// relevant portions of the chunk data that were matched.
// This avoids the need for additional regex processing on the entire chunk data.
matches := data.detector.Matches()
for _, matchBytes := range matches {
var matchedBytes [][]byte
if scanEntireChunk {
matchedBytes = [][]byte{data.chunk.Data}
} else {
matchedBytes = data.detector.Matches()
}
resultCount := 0
for _, matchBytes := range matchedBytes {
matchCount++
detectBytesPerMatch.Observe(float64(len(matchBytes)))

Expand Down Expand Up @@ -1072,13 +1095,24 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
if e.printAvgDetectorTime && len(results) > 0 {
elapsed := time.Since(start)
detectorName := results[0].DetectorType.String()

avgTimeI, ok := e.metrics.detectorAvgTime.Load(detectorName)
if !ok {
ctx.Logger().Error(
errors.New("failed to load metric"),
"Unable to track detector time",
"detector", detectorName)
goto HandleResults
}

var avgTime []time.Duration
if ok {
avgTime, ok = avgTimeI.([]time.Duration)
if !ok {
return
}
avgTime, ok = avgTimeI.([]time.Duration)
if !ok {
ctx.Logger().Error(
errors.New("failed to cast metric as []time.Duration"),
"Unable to track detector time",
"detector", detectorName)
goto HandleResults
}
avgTime = append(avgTime, elapsed)
e.metrics.detectorAvgTime.Store(detectorName, avgTime)
Expand All @@ -1093,14 +1127,24 @@ func (e *Engine) detectChunk(ctx context.Context, data detectableChunk) {
results = e.filterResults(ctx, data.detector, results)
}

HandleResults:
results = e.filterResults(ctx, data.detector, results)

resultCount += len(results)
for _, res := range results {
e.processResult(ctx, data, res, isFalsePositive)
}
}

matchesPerChunk.Observe(float64(matchCount))

data.wgDoneFn()
// If `e.compareScanStrategies` is enabled, two scans will be run.
// Don't decrement the WaitGroup until both have been completed.
if (!e.compareScanStrategies) || (e.compareScanStrategies && !scanEntireChunk) {
data.wgDoneFn()
}

return resultCount
}

func (e *Engine) filterResults(
Expand Down

0 comments on commit 6cd8337

Please sign in to comment.