Skip to content

Commit

Permalink
Use single map for collecting files across branches (#839)
Browse files Browse the repository at this point in the history
When looking at large profiles for `inuse_space` on dot com, I noticed the
filename maps in `prepareNormalBuild` taking a bunch of memory. This PR avoids
allocating a separate map per branch, instead having `RepoWalker` collect all
the entries in a single instance variable.
  • Loading branch information
jtibshirani authored Oct 1, 2024
1 parent 6755eca commit 6501360
Show file tree
Hide file tree
Showing 5 changed files with 110 additions and 109 deletions.
14 changes: 8 additions & 6 deletions cmd/zoekt-repo-index/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ import (
"github.com/sourcegraph/zoekt"
"github.com/sourcegraph/zoekt/build"
"github.com/sourcegraph/zoekt/gitindex"
"github.com/sourcegraph/zoekt/ignore"
"go.uber.org/automaxprocs/maxprocs"

git "github.com/go-git/go-git/v5"
Expand Down Expand Up @@ -180,7 +181,7 @@ func main() {
}
}

perBranch := map[string]map[fileKey]gitindex.BlobRepo{}
perBranch := map[string]map[fileKey]gitindex.BlobLocation{}
opts.SubRepositories = map[string]*zoekt.Repository{}

// branch => repo => version
Expand Down Expand Up @@ -325,8 +326,8 @@ func getManifest(repo *git.Repository, branch, path string) (*manifest.Manifest,
func iterateManifest(mf *manifest.Manifest,
baseURL url.URL, revPrefix string,
cache *gitindex.RepoCache,
) (map[fileKey]gitindex.BlobRepo, map[string]plumbing.Hash, error) {
allFiles := map[fileKey]gitindex.BlobRepo{}
) (map[fileKey]gitindex.BlobLocation, map[string]plumbing.Hash, error) {
allFiles := map[fileKey]gitindex.BlobLocation{}
allVersions := map[string]plumbing.Hash{}
for _, p := range mf.Project {
rev := mf.ProjectRevision(&p)
Expand Down Expand Up @@ -359,20 +360,21 @@ func iterateManifest(mf *manifest.Manifest,
return nil, nil, err
}

files, versions, err := gitindex.TreeToFiles(topRepo, tree, projURL.String(), cache)
rw := gitindex.NewRepoWalker(topRepo, projURL.String(), cache)
subVersions, err := rw.CollectFiles(tree, rev, &ignore.Matcher{})
if err != nil {
return nil, nil, err
}

for key, repo := range files {
for key, repo := range rw.Files {
allFiles[fileKey{
SubRepoPath: filepath.Join(p.GetPath(), key.SubRepoPath),
Path: key.Path,
ID: key.ID,
}] = repo
}

for path, version := range versions {
for path, version := range subVersions {
allVersions[filepath.Join(p.GetPath(), path)] = version
}
}
Expand Down
63 changes: 23 additions & 40 deletions gitindex/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,7 @@ func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) {
}

// branch => (path, sha1) => repo.
var repos map[fileKey]BlobIndexInfo
var repos map[fileKey]BlobLocation

// Branch => Repo => SHA1
var branchVersions map[string]map[string]plumbing.Hash
Expand All @@ -452,7 +452,7 @@ func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) {
}
}

reposByPath := map[string]BlobIndexInfo{}
reposByPath := map[string]BlobLocation{}
for key, info := range repos {
reposByPath[key.SubRepoPath] = info
}
Expand All @@ -461,9 +461,9 @@ func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) {
for path, info := range reposByPath {
tpl := opts.BuildOptions.RepositoryDescription
if path != "" {
tpl = zoekt.Repository{URL: info.Repo.URL.String()}
if err := SetTemplatesFromOrigin(&tpl, info.Repo.URL); err != nil {
log.Printf("setTemplatesFromOrigin(%s, %s): %s", path, info.Repo.URL, err)
tpl = zoekt.Repository{URL: info.URL.String()}
if err := SetTemplatesFromOrigin(&tpl, info.URL); err != nil {
log.Printf("setTemplatesFromOrigin(%s, %s): %s", path, info.URL, err)
}
}
opts.BuildOptions.SubRepositories[path] = &tpl
Expand Down Expand Up @@ -592,11 +592,11 @@ func newIgnoreMatcher(tree *object.Tree) (*ignore.Matcher, error) {

// prepareDeltaBuildFunc is a function that calculates the necessary metadata for preparing
// a build.Builder instance for generating a delta build.
type prepareDeltaBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobIndexInfo, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error)
type prepareDeltaBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error)

// prepareNormalBuildFunc is a function that calculates the necessary metadata for preparing
// a build.Builder instance for generating a normal build.
type prepareNormalBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobIndexInfo, branchVersions map[string]map[string]plumbing.Hash, err error)
type prepareNormalBuildFunc func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error)

type gitIndexConfig struct {
// prepareDeltaBuild, if not nil, is the function that is used to calculate the metadata that will be used to
Expand All @@ -612,7 +612,7 @@ type gitIndexConfig struct {
prepareNormalBuild prepareNormalBuildFunc
}

func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobIndexInfo, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) {
func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) {
if options.Submodules {
return nil, nil, nil, fmt.Errorf("delta builds currently don't support submodule indexing")
}
Expand Down Expand Up @@ -670,7 +670,7 @@ func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[f
}

// branch => (path, sha1) => repo.
repos = map[fileKey]BlobIndexInfo{}
repos = map[fileKey]BlobLocation{}

// branch name -> git worktree at most current commit
branchToCurrentTree := make(map[string]*object.Tree, len(options.Branches))
Expand All @@ -696,12 +696,7 @@ func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[f
}

// TODO: Support repository submodules for delta builds
// For this prototype, we are ignoring repository submodules, which means that we can use the same
// blob location for all files
hackSharedBlobLocation := BlobRepo{
GitRepo: repository,
URL: u,
}

// loop over all branches, calculate the diff between our
// last indexed commit and the current commit, and add files mentioned in the diff
for _, branch := range existingRepository.Branches {
Expand Down Expand Up @@ -742,8 +737,9 @@ func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[f
existing.Branches = append(existing.Branches, branch.Name)
repos[file] = existing
} else {
repos[file] = BlobIndexInfo{
Repo: hackSharedBlobLocation,
repos[file] = BlobLocation{
GitRepo: repository,
URL: u,
Branches: []string{branch.Name},
}
}
Expand Down Expand Up @@ -780,8 +776,9 @@ func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[f
existing.Branches = append(existing.Branches, b)
repos[file] = existing
} else {
repos[file] = BlobIndexInfo{
Repo: hackSharedBlobLocation,
repos[file] = BlobLocation{
GitRepo: repository,
URL: u,
Branches: []string{b},
}
}
Expand All @@ -806,15 +803,12 @@ func prepareDeltaBuild(options Options, repository *git.Repository) (repos map[f
return repos, nil, changedOrDeletedPaths, nil
}

func prepareNormalBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobIndexInfo, branchVersions map[string]map[string]plumbing.Hash, err error) {
func prepareNormalBuild(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) {
var repoCache *RepoCache
if options.Submodules {
repoCache = NewRepoCache(options.RepoCacheDir)
}

// branch => (path, sha1) => metadata.
repos = map[fileKey]BlobIndexInfo{}

// Branch => Repo => SHA1
branchVersions = map[string]map[string]plumbing.Hash{}

Expand All @@ -823,6 +817,7 @@ func prepareNormalBuild(options Options, repository *git.Repository) (repos map[
return nil, nil, fmt.Errorf("expandBranches: %w", err)
}

rw := NewRepoWalker(repository, options.BuildOptions.RepositoryDescription.URL, repoCache)
for _, b := range branches {
commit, err := getCommit(repository, options.BranchPrefix, b)
if err != nil {
Expand All @@ -843,35 +838,23 @@ func prepareNormalBuild(options Options, repository *git.Repository) (repos map[
return nil, nil, fmt.Errorf("newIgnoreMatcher: %w", err)
}

files, subVersions, err := TreeToFiles(repository, tree, options.BuildOptions.RepositoryDescription.URL, repoCache)
subVersions, err := rw.CollectFiles(tree, b, ig)
if err != nil {
return nil, nil, fmt.Errorf("TreeToFiles: %w", err)
}
for k, v := range files {
if ig.Match(k.Path) {
continue
}

if existing, ok := repos[k]; ok {
existing.Branches = append(existing.Branches, b)
repos[k] = existing
} else {
repos[k] = BlobIndexInfo{Repo: v, Branches: []string{b}}
}
return nil, nil, fmt.Errorf("CollectFiles: %w", err)
}

branchVersions[b] = subVersions
}

return repos, branchVersions, nil
return rw.Files, branchVersions, nil
}

func createDocument(key fileKey,
repos map[fileKey]BlobIndexInfo,
repos map[fileKey]BlobLocation,
ranks repoPathRanks,
opts build.Options,
) (zoekt.Document, error) {
repo := repos[key].Repo
repo := repos[key]
blob, err := repo.GitRepo.BlobObject(key.ID)
branches := repos[key].Branches

Expand Down
4 changes: 2 additions & 2 deletions gitindex/index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -608,13 +608,13 @@ func TestIndexDeltaBasic(t *testing.T) {
// setup: prepare spy versions of prepare delta / normal build so that we can observe
// whether they were called appropriately
deltaBuildCalled := false
prepareDeltaSpy := func(options Options, repository *git.Repository) (repos map[fileKey]BlobIndexInfo, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) {
prepareDeltaSpy := func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, changedOrDeletedPaths []string, err error) {
deltaBuildCalled = true
return prepareDeltaBuild(options, repository)
}

normalBuildCalled := false
prepareNormalSpy := func(options Options, repository *git.Repository) (repos map[fileKey]BlobIndexInfo, branchVersions map[string]map[string]plumbing.Hash, err error) {
prepareNormalSpy := func(options Options, repository *git.Repository) (repos map[fileKey]BlobLocation, branchVersions map[string]map[string]plumbing.Hash, err error) {
normalBuildCalled = true
return prepareNormalBuild(options, repository)
}
Expand Down
Loading

0 comments on commit 6501360

Please sign in to comment.