From dbc1464c636ea81f400248cacff4771d2281cded Mon Sep 17 00:00:00 2001 From: Cody Rose Date: Thu, 29 Aug 2024 16:10:11 -0400 Subject: [PATCH] Download files when reverifying (#3252) The previous implementation of targeted file scanning pulled patches out of commit data, which didn't work for binary files (because GitHub doesn't return patches for them). This PR changes the system to always just download the requested file and scan it, which means we get binary file support. --- pkg/sources/github/github.go | 40 +++++++------- pkg/sources/github/github_integration_test.go | 18 +++++++ pkg/sources/github/repo.go | 54 ------------------- 3 files changed, 38 insertions(+), 74 deletions(-) diff --git a/pkg/sources/github/github.go b/pkg/sources/github/github.go index a6eba956e7c7..e201bb5bcc6b 100644 --- a/pkg/sources/github/github.go +++ b/pkg/sources/github/github.go @@ -16,6 +16,7 @@ import ( "github.com/go-logr/logr" "github.com/gobwas/glob" "github.com/google/go-github/v63/github" + "github.com/trufflesecurity/trufflehog/v3/pkg/handlers" "golang.org/x/exp/rand" "golang.org/x/sync/errgroup" "google.golang.org/protobuf/proto" @@ -1392,35 +1393,34 @@ func (s *Source) scanTarget(ctx context.Context, target sources.ChunkingTarget, return fmt.Errorf("invalid GitHub URL") } - qry := commitQuery{ - repo: segments[2], - owner: segments[1], - sha: meta.GetCommit(), - filename: meta.GetFile(), + readCloser, resp, err := s.connector.APIClient().Repositories.DownloadContents( + ctx, + segments[1], + segments[2], + meta.GetFile(), + &github.RepositoryContentGetOptions{Ref: meta.GetCommit()}) + // As of this writing, if the returned readCloser is not nil, it's just the Body of the returned github.Response, so + // there's no need to independently close it. + if resp != nil && resp.Body != nil { + defer resp.Body.Close() } - res, err := s.getDiffForFileInCommit(ctx, qry) if err != nil { - return err + return fmt.Errorf("could not download file for scan: %w", err) + } + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("unexpected HTTP response status when trying to download file for scan: %v", resp.Status) } - chunk := &sources.Chunk{ + + reporter := sources.ChanReporter{Ch: chunksChan} + chunkSkel := sources.Chunk{ SourceType: s.Type(), SourceName: s.name, SourceID: s.SourceID(), JobID: s.JobID(), SecretID: target.SecretID, - Data: []byte(stripLeadingPlusMinus(res)), SourceMetadata: &source_metadatapb.MetaData{ Data: &source_metadatapb.MetaData_Github{Github: meta}, }, - Verify: s.verify, - } - - return common.CancellableWrite(ctx, chunksChan, chunk) -} - -// stripLeadingPlusMinus removes leading + and - characters from lines in a diff string. These characters exist in the -// diffs returned when performing a targeted scan and need to be removed so that detectors are operating on the correct -// text. -func stripLeadingPlusMinus(diff string) string { - return strings.ReplaceAll(strings.ReplaceAll(diff, "\n+", "\n"), "\n-", "\n") + Verify: s.verify} + return handlers.HandleFile(ctx, readCloser, &chunkSkel, reporter) } diff --git a/pkg/sources/github/github_integration_test.go b/pkg/sources/github/github_integration_test.go index cfc6fe01703b..9fff037c321b 100644 --- a/pkg/sources/github/github_integration_test.go +++ b/pkg/sources/github/github_integration_test.go @@ -758,6 +758,24 @@ func TestSource_Chunks_TargetedScan(t *testing.T) { }, wantChunks: 1, }, + { + name: "targeted scan, binary file", + init: init{ + name: "test source", + connection: &sourcespb.GitHub{Credential: &sourcespb.GitHub_Token{Token: githubToken}}, + queryCriteria: &source_metadatapb.MetaData{ + Data: &source_metadatapb.MetaData_Github{ + Github: &source_metadatapb.Github{ + Repository: "https://github.com/truffle-sandbox/test-secrets.git", + Link: "https://github.com/truffle-sandbox/test-secrets/blob/70bef8590f87257c0992eecc7db529827a12b801/null_text_w_ptp.ipynb", + Commit: "70bef8590f87257c0992eecc7db529827a12b801", + File: "null_text_w_ptp.ipynb", + }, + }, + }, + }, + wantChunks: 607, + }, { name: "no file in commit", init: init{ diff --git a/pkg/sources/github/repo.go b/pkg/sources/github/repo.go index 67e7728816c7..79be8653ffcb 100644 --- a/pkg/sources/github/repo.go +++ b/pkg/sources/github/repo.go @@ -281,60 +281,6 @@ func (s *Source) wikiIsReachable(ctx context.Context, repoURL string) bool { return wikiURL == res.Request.URL.String() } -// commitQuery represents the details required to fetch a commit. -type commitQuery struct { - repo string - owner string - sha string - filename string -} - -// getDiffForFileInCommit retrieves the diff for a specified file in a commit. -// If the file or its diff is not found, it returns an error. -func (s *Source) getDiffForFileInCommit(ctx context.Context, query commitQuery) (string, error) { - var ( - commit *github.RepositoryCommit - err error - ) - for { - commit, _, err = s.connector.APIClient().Repositories.GetCommit(ctx, query.owner, query.repo, query.sha, nil) - if s.handleRateLimit(err) { - continue - } - if err != nil { - return "", fmt.Errorf("error fetching commit %s: %w", query.sha, err) - } - break - } - - if len(commit.Files) == 0 { - return "", fmt.Errorf("commit %s does not contain any files", query.sha) - } - - res := new(strings.Builder) - // Only return the diff if the file is in the commit. - for _, file := range commit.Files { - if *file.Filename != query.filename { - continue - } - - if file.Patch == nil { - return "", fmt.Errorf("commit %s file %s does not have a diff", query.sha, query.filename) - } - - if _, err := res.WriteString(*file.Patch); err != nil { - return "", fmt.Errorf("buffer write error for commit %s file %s: %w", query.sha, query.filename, err) - } - res.WriteString("\n") - } - - if res.Len() == 0 { - return "", fmt.Errorf("commit %s does not contain patch for file %s", query.sha, query.filename) - } - - return res.String(), nil -} - func (s *Source) normalizeRepo(repo string) (string, error) { // If there's a '/', assume it's a URL and try to normalize it. if strings.ContainsRune(repo, '/') {