Skip to content

Commit

Permalink
Download files when reverifying (#3252)
Browse files Browse the repository at this point in the history
The previous implementation of targeted file scanning pulled patches out of commit data, which didn't work for binary files (because GitHub doesn't return patches for them). This PR changes the system to always just download the requested file and scan it, which means we get binary file support.
  • Loading branch information
rosecodym authored Aug 29, 2024
1 parent 247b56a commit dbc1464
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 74 deletions.
40 changes: 20 additions & 20 deletions pkg/sources/github/github.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/go-logr/logr"
"github.com/gobwas/glob"
"github.com/google/go-github/v63/github"
"github.com/trufflesecurity/trufflehog/v3/pkg/handlers"
"golang.org/x/exp/rand"
"golang.org/x/sync/errgroup"
"google.golang.org/protobuf/proto"
Expand Down Expand Up @@ -1392,35 +1393,34 @@ func (s *Source) scanTarget(ctx context.Context, target sources.ChunkingTarget,
return fmt.Errorf("invalid GitHub URL")
}

qry := commitQuery{
repo: segments[2],
owner: segments[1],
sha: meta.GetCommit(),
filename: meta.GetFile(),
readCloser, resp, err := s.connector.APIClient().Repositories.DownloadContents(
ctx,
segments[1],
segments[2],
meta.GetFile(),
&github.RepositoryContentGetOptions{Ref: meta.GetCommit()})
// As of this writing, if the returned readCloser is not nil, it's just the Body of the returned github.Response, so
// there's no need to independently close it.
if resp != nil && resp.Body != nil {
defer resp.Body.Close()
}
res, err := s.getDiffForFileInCommit(ctx, qry)
if err != nil {
return err
return fmt.Errorf("could not download file for scan: %w", err)
}
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("unexpected HTTP response status when trying to download file for scan: %v", resp.Status)
}
chunk := &sources.Chunk{

reporter := sources.ChanReporter{Ch: chunksChan}
chunkSkel := sources.Chunk{
SourceType: s.Type(),
SourceName: s.name,
SourceID: s.SourceID(),
JobID: s.JobID(),
SecretID: target.SecretID,
Data: []byte(stripLeadingPlusMinus(res)),
SourceMetadata: &source_metadatapb.MetaData{
Data: &source_metadatapb.MetaData_Github{Github: meta},
},
Verify: s.verify,
}

return common.CancellableWrite(ctx, chunksChan, chunk)
}

// stripLeadingPlusMinus removes leading + and - characters from lines in a diff string. These characters exist in the
// diffs returned when performing a targeted scan and need to be removed so that detectors are operating on the correct
// text.
func stripLeadingPlusMinus(diff string) string {
return strings.ReplaceAll(strings.ReplaceAll(diff, "\n+", "\n"), "\n-", "\n")
Verify: s.verify}
return handlers.HandleFile(ctx, readCloser, &chunkSkel, reporter)
}
18 changes: 18 additions & 0 deletions pkg/sources/github/github_integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -758,6 +758,24 @@ func TestSource_Chunks_TargetedScan(t *testing.T) {
},
wantChunks: 1,
},
{
name: "targeted scan, binary file",
init: init{
name: "test source",
connection: &sourcespb.GitHub{Credential: &sourcespb.GitHub_Token{Token: githubToken}},
queryCriteria: &source_metadatapb.MetaData{
Data: &source_metadatapb.MetaData_Github{
Github: &source_metadatapb.Github{
Repository: "https://github.com/truffle-sandbox/test-secrets.git",
Link: "https://github.com/truffle-sandbox/test-secrets/blob/70bef8590f87257c0992eecc7db529827a12b801/null_text_w_ptp.ipynb",
Commit: "70bef8590f87257c0992eecc7db529827a12b801",
File: "null_text_w_ptp.ipynb",
},
},
},
},
wantChunks: 607,
},
{
name: "no file in commit",
init: init{
Expand Down
54 changes: 0 additions & 54 deletions pkg/sources/github/repo.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,60 +281,6 @@ func (s *Source) wikiIsReachable(ctx context.Context, repoURL string) bool {
return wikiURL == res.Request.URL.String()
}

// commitQuery represents the details required to fetch a commit.
type commitQuery struct {
repo string
owner string
sha string
filename string
}

// getDiffForFileInCommit retrieves the diff for a specified file in a commit.
// If the file or its diff is not found, it returns an error.
func (s *Source) getDiffForFileInCommit(ctx context.Context, query commitQuery) (string, error) {
var (
commit *github.RepositoryCommit
err error
)
for {
commit, _, err = s.connector.APIClient().Repositories.GetCommit(ctx, query.owner, query.repo, query.sha, nil)
if s.handleRateLimit(err) {
continue
}
if err != nil {
return "", fmt.Errorf("error fetching commit %s: %w", query.sha, err)
}
break
}

if len(commit.Files) == 0 {
return "", fmt.Errorf("commit %s does not contain any files", query.sha)
}

res := new(strings.Builder)
// Only return the diff if the file is in the commit.
for _, file := range commit.Files {
if *file.Filename != query.filename {
continue
}

if file.Patch == nil {
return "", fmt.Errorf("commit %s file %s does not have a diff", query.sha, query.filename)
}

if _, err := res.WriteString(*file.Patch); err != nil {
return "", fmt.Errorf("buffer write error for commit %s file %s: %w", query.sha, query.filename, err)
}
res.WriteString("\n")
}

if res.Len() == 0 {
return "", fmt.Errorf("commit %s does not contain patch for file %s", query.sha, query.filename)
}

return res.String(), nil
}

func (s *Source) normalizeRepo(repo string) (string, error) {
// If there's a '/', assume it's a URL and try to normalize it.
if strings.ContainsRune(repo, '/') {
Expand Down

0 comments on commit dbc1464

Please sign in to comment.