Skip to content

Commit

Permalink
working
Browse files Browse the repository at this point in the history
  • Loading branch information
Ryang20718 committed Jan 4, 2024
1 parent 7677214 commit 177e5bd
Show file tree
Hide file tree
Showing 5 changed files with 218 additions and 67 deletions.
13 changes: 0 additions & 13 deletions .trunk/trunk.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,23 +29,10 @@ lint:
batch: false
disable_upstream: true
read_output_from: stderr
- name: go_vendor
files: [go]
runtime: go
commands:
- output: regex
parse_regex: ((?P<path>.*):(?P<line>-?\d+):(?P<message>.*))
target: .
success_codes: [0, 1]
run: go mod tidy
batch: false
disable_upstream: true
read_output_from: stderr
enabled:
- git-diff-check
- markdownlint@0.38.0
- prettier@3.1.1
- trufflehog@3.63.7
- go_vendor@SYSTEM
- gen_golang_build@SYSTEM
- golangci-lint@1.53.3
202 changes: 160 additions & 42 deletions bazel_disk_cache_cleaner/bazel_disk_cache_cleaner.go
Original file line number Diff line number Diff line change
@@ -1,86 +1,204 @@
package main

import (
"bufio"
"fmt"
"log"
"os"
"path/filepath"
"reflect"
"strings"
"syscall"
"time"

"github.com/spf13/cobra"
"go.uber.org/zap"
)

/*
Bazel is a hungry process that lacks any sort of cache bounding mechanism.
The ticket tracking that work, https://github.com/bazelbuild/bazel/issues/5139, has been open since 2018.
Since we don't know when that will be implemented, this script is intended to work around that.
It leverages access time to keep the bazel cache bounded by deleting all files whose atime is older than the specified number of days.
*/

var (
BazelCacheDir string
KeepFilesAccessedDays int
BazelCacheDir string
KeepFilesAccessedDays int
ExternalRepoTargetList string
Verbose bool
)

// CleanBazelInput bundles the settings consulted while selecting cache entries to delete.
// The three maps are used as sets: only the presence of a key is checked, the values are ignored.
type CleanBazelInput struct {
// Root of the bazel cache; the root path itself is never selected for removal.
BazelCacheDir string
// Age threshold in days: entries whose access time is older than this are removal candidates.
KeepFilesAccessedDays int
// Names (matched against an entry's base name) of external repo targets that must be kept.
ActiveExternalRepoTargets map[string]string
// File names that must never be removed.
BlackListFiles map[string]string
// Base names of paths that must never be removed.
BlackListDirectories map[string]string
}

// NewRootCmd returns the top-level cobra command for the cache cleaner.
// Only the short description is set; flags and subcommands are attached by the caller.
func NewRootCmd() *cobra.Command {
	root := new(cobra.Command)
	root.Short = "Script to clean bazel cache"
	return root
}

/*
Bazel is a hungry process that lacks any sort of cache bounding mechanism.
Generate a mapping to what targets still are "active" in the bazel cache under external repos
The ticket tracking that progress https://github.com/bazelbuild/bazel/issues/5139 has been opened since 2018.
Since we don't know when that will be implemented, this script is intended to workaround that.
It leverages access time to keep the bazel cache bounded by deleting all files greater than the atime specified.
Each target in the external repo is stored as <target>, accompanied by @<target>.marker.
Both must be removed together so that Bazel re-fetches a repo that was removed incorrectly.
*/
// generateExternalRepoTargets reads the file at externalRepoTargetList, which is expected to
// hold one external repo target name per line, and returns the set of names to treat as active.
//
// For every target both "<target>" and "@<target>.marker" are recorded. The map is used as a
// set: every value is the empty string and only key presence is meaningful.
//
// Surrounding whitespace is trimmed from each line and blank lines are skipped, so stray
// padding in the list cannot make an active target look inactive or register an empty name.
//
// A non-nil error (wrapping the underlying cause) is returned when the file cannot be opened
// or read; the returned map is nil in that case.
func generateExternalRepoTargets(externalRepoTargetList string) (map[string]string, error) {
	file, err := os.Open(externalRepoTargetList)
	if err != nil {
		// *os.PathError already carries the path, so it is not repeated here.
		return nil, fmt.Errorf("opening external repo target list: %w", err)
	}
	defer file.Close()

	activeExternalRepoTargets := make(map[string]string)
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		targetName := strings.TrimSpace(scanner.Text())
		if targetName == "" {
			continue
		}
		activeExternalRepoTargets[targetName] = ""
		activeExternalRepoTargets["@"+targetName+".marker"] = ""
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("reading external repo target list %q: %w", externalRepoTargetList, err)
	}
	return activeExternalRepoTargets, nil
}

// createLogger builds the sugared zap logger used by the cleaner.
//
// When verbose is true zap's development configuration is used, otherwise the production
// configuration. Only the selected logger is constructed.
//
// If zap fails to build the logger, the failure is reported on stderr and a no-op logger is
// returned so callers never receive a nil logger.
//
// No Sync is performed here: a deferred Sync inside a constructor runs before anything has been
// logged and flushes nothing. Callers that need buffered entries flushed should call Sync on
// the returned logger before exiting.
func createLogger(verbose bool) *zap.SugaredLogger {
	build := zap.NewProduction
	if verbose {
		build = zap.NewDevelopment
	}

	logger, err := build()
	if err != nil {
		fmt.Fprintf(os.Stderr, "creating logger: %v\n", err)
		return zap.NewNop().Sugar()
	}
	return logger.Sugar()
}

// isBlacklisted reports whether the entry must never be removed: either the entry's own name
// is listed in input.BlackListFiles, or the last element of path is listed in
// input.BlackListDirectories.
func isBlacklisted(path string, f os.FileInfo, input CleanBazelInput) bool {
	if _, listed := input.BlackListFiles[f.Name()]; listed {
		return true
	}
	_, listed := input.BlackListDirectories[filepath.Base(path)]
	return listed
}

// isBazelDirectory reports whether path belongs to a location that must be left alone because
// removing it would break bazel and force a full cache purge. That is the case when the parent
// directory contains "embedded_tools" or "install" anywhere in its path, or when path is a
// direct child of input.BazelCacheDir other than "cache".
func isBazelDirectory(path string, input CleanBazelInput) bool {
	parent := filepath.Dir(path)
	switch {
	case strings.Contains(parent, "embedded_tools"):
		return true
	case strings.Contains(parent, "install"):
		return true
	default:
		return parent == input.BazelCacheDir && filepath.Base(path) != "cache"
	}
}

// findFilesToClean walks input.BazelCacheDir and returns the paths that should be deleted:
// entries whose last access time is more than input.KeepFilesAccessedDays days in the past and
// that are not protected.
//
// A stale entry is protected (and therefore not returned) when:
//   - it is the cache root itself;
//   - its name is in input.ActiveExternalRepoTargets (a stale directory with such a name is
//     skipped entirely, including its children);
//   - isBlacklisted or isBazelDirectory reports true for it.
//
// Entries whose access time cannot be determined are skipped rather than treated as stale, so
// an unreadable stat never causes a deletion.
//
// Any error reported by the walk aborts it and is returned wrapped; the path list is nil then.
func findFilesToClean(input CleanBazelInput, log *zap.SugaredLogger) ([]string, error) {
	filesToRemove := []string{}
	// Entries last accessed before this instant are considered stale.
	cutoff := time.Now().Add(-time.Duration(input.KeepFilesAccessedDays) * 24 * time.Hour)

	err := filepath.Walk(input.BazelCacheDir, func(path string, f os.FileInfo, err error) error {
		// Check the walk error first: f may be nil when err is set.
		if err != nil {
			return fmt.Errorf("failing to walk path %s err: %w", input.BazelCacheDir, err)
		}
		if path == input.BazelCacheDir {
			return nil
		}

		accessTime, ok := fileAccessTime(f)
		if !ok {
			log.Debugf("skipping path %s: access time unavailable", path)
			return nil
		}
		if !accessTime.Before(cutoff) {
			return nil
		}

		_, activeExternalTarget := input.ActiveExternalRepoTargets[f.Name()]
		if activeExternalTarget && f.IsDir() {
			log.Debugf("skipping path %s active Target", path)
			return filepath.SkipDir
		}

		if !activeExternalTarget && !isBlacklisted(path, f, input) && !isBazelDirectory(path, input) {
			log.Debugf("adding file to remove %s", path)
			filesToRemove = append(filesToRemove, path)
		} else {
			log.Debugf("skipping path %s", path)
		}
		return nil
	})
	if err != nil {
		return nil, err
	}
	return filesToRemove, nil
}

// fileAccessTime extracts the last access time from info's underlying stat structure.
// The timestamp field is looked up by name via reflection because it is called Atim on Linux
// and Atimespec on macOS. The second result is false when the access time is unavailable.
func fileAccessTime(info os.FileInfo) (time.Time, bool) {
	stat, ok := info.Sys().(*syscall.Stat_t)
	if !ok || stat == nil {
		return time.Time{}, false
	}
	// Dereference the pointer: FieldByName panics on anything but a struct value.
	statValue := reflect.ValueOf(stat).Elem()
	for _, fieldName := range []string{"Atim", "Atimespec"} {
		timespec := statValue.FieldByName(fieldName)
		if !timespec.IsValid() || timespec.Kind() != reflect.Struct {
			continue
		}
		sec, nsec := timespec.FieldByName("Sec"), timespec.FieldByName("Nsec")
		// Sec and Nsec are signed integers, so they must be read with Int, not Uint.
		if sec.CanInt() && nsec.CanInt() {
			return time.Unix(sec.Int(), nsec.Int()), true
		}
	}
	return time.Time{}, false
}

func main() {
rootCmd := NewRootCmd()
rootCmd.PersistentFlags().StringVar(&BazelCacheDir, "bazel-cache-dir", "", "path to bazel cache directory to clear")
rootCmd.PersistentFlags().IntVar(&KeepFilesAccessedDays, "keep-files-access-days", 0, "purge files with access time greater than")
rootCmd.PersistentFlags().StringVar(&ExternalRepoTargetList, "external-repo-target-list", "", "path to file containing list of external repo targets to keep")
rootCmd.PersistentFlags().BoolVar(&Verbose, "verbose", false, "set verbosity for understanding what this script is doing")

rootCmd.AddCommand(&cobra.Command{
Use: "clean",
Short: "clean bazel cache",
RunE: func(cmd *cobra.Command, args []string) error {
log.Printf("Starting Clean up Process of Bazel Directory %s. This may take a couple of minutes", BazelCacheDir)
var fileList []string
err := filepath.Walk(BazelCacheDir, func(path string, info os.FileInfo, err error) error {
// Get the syscall.Stat_t structure
stat := info.Sys().(*syscall.Stat_t)
var accessTime time.Time
atimField := reflect.ValueOf(stat).FieldByName("Atim")
atimespecField := reflect.ValueOf(stat).FieldByName("Atimespec")
if atimField.IsValid() { // Valid for linux
secField := atimField.FieldByName("Sec")
nsecField := atimField.FieldByName("Nsec")
if secField.IsValid() && nsecField.IsValid() {
accessTime = time.Unix(int64(secField.Uint()), int64(nsecField.Uint()))
}
}else if atimespecField.IsValid() { // Macos
secField := atimespecField.FieldByName("Sec")
nsecField := atimespecField.FieldByName("Nsec")
if secField.IsValid() && nsecField.IsValid() {
accessTime = time.Unix(int64(secField.Uint()), int64(nsecField.Uint()))
}
}
timeKeepFilesAccessedDays := time.Duration(KeepFilesAccessedDays) * 24 * time.Hour
if accessTime.Add(timeKeepFilesAccessedDays).Before(time.Now()) && path != BazelCacheDir && !info.IsDir() {
// Check if Access time is greater than desired days to keep
// don't delete cache dir, otherwise you'll have to purge bazel cache completely
fileList = append(fileList, path)
}
if err != nil {
return fmt.Errorf("failing to walk path %s err: %v", BazelCacheDir, err)
}
return nil
})
log := createLogger(Verbose)
log.Infof("Starting Clean up Process of Bazel Directory %s. This may take a couple of minutes", BazelCacheDir)

activeExternalRepoTargets, err := generateExternalRepoTargets(ExternalRepoTargetList)
if err != nil {
return fmt.Errorf("failing to find external repo targets %v", err)
}

blackListDirectories := map[string]string{
"install": "", // install base for bazel
"embedded_tools": "", // bazel dev tools
"external": "", // we can't purge this external repo. need to selectively purge
}

blackListFiles := map[string]string{
"lock": "", // bazel lock
}

cleanBazelInput := CleanBazelInput{
BazelCacheDir: BazelCacheDir,
KeepFilesAccessedDays: KeepFilesAccessedDays,
ActiveExternalRepoTargets: activeExternalRepoTargets,
BlackListFiles: blackListFiles,
BlackListDirectories: blackListDirectories,
}

filesToRemove, err := findFilesToClean(cleanBazelInput, log)
if err != nil {
return err
return fmt.Errorf("failing to find files to clean: %v", err)
}
for _, file := range fileList {
for _, file := range filesToRemove {
err := os.RemoveAll(file)
if err != nil {
return fmt.Errorf("failing to remove file err: %v", err)
}
}
log.Println("Finished Cleaning Bazel Cache up!")
log.Info("Finished Cleaning Bazel Cache up!")
return nil
},
})
Expand Down
51 changes: 40 additions & 11 deletions bazel_disk_cache_cleaner/bazel_disk_cache_cleaner.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,57 @@

set -euo pipefail

# Fetch all external repos still in use
# Reason for doing this is bazel doesn't update atime on external repos; Bazel keeps these repos in memory.
# However, data artifacts may remain in this directory over time and thus fill the cache
# We query for these repos and only clear from this directory for any repos that aren't found within the bazel query
USERNAME=$(/usr/bin/logname)
if [[ -z "${BAZEL_CACHE_DIR+x}" ]]; then
BAZEL_CACHE_DIR="$(readlink -f /home/$USERNAME/.cache/bazel/_bazel_$USERNAME)"

mkdir -p "/tmp/${USERNAME}"
external_repo_list="/tmp/${USERNAME}/bazel_external_repos.txt"

if [[ -z ${BAZEL_CACHE_DIR+x} ]]; then
BAZEL_CACHE_DIR="$(readlink -f /home/${USERNAME}/.cache/bazel/_bazel_${USERNAME})"
fi

if [[ -z "${KEEP_FILES_ACCESSED_DAYS+x}" ]]; then
if [[ -z ${KEEP_FILES_ACCESSED_DAYS+x} ]]; then
KEEP_FILES_ACCESSED_DAYS=5
fi

before_storage_used=$(du -sh ${BAZEL_CACHE_DIR} &)
bazel_cache_binary=bazel-out/k8-opt/bin/tools/bazel_disk_cache_cleaner/bazel_disk_cache_cleaner_/bazel_disk_cache_cleaner
if [[ ! -f "$bazel_cache_binary" ]]; then
SET_VERBOSE=""
if [[ -n ${VERBOSE+x} ]]; then
SET_VERBOSE="--verbose"
fi

# raw data is //external:<target>. strip away //external: for easier processing from Go's side
tools/bazel query //external:* | sed 's|//external:||' >"${external_repo_list}"

# This script runs concurrently with other bazel processes. Thus, we must hold the lock
echo "Clean Bazel Cache script is now holding the bazel lock, preventing other bazel processes from running"
echo "This script is cleaning out unused files from the bazel cache so you don't need to expunge!"
bazel_locks_path="${BAZEL_CACHE_DIR}"
for file in "${bazel_locks_path}"/*/lock; do
sudo sed -i "s/pid=[0-9]*/pid=$$/g" "$file"
done

before_storage_used="$(du -sh ${BAZEL_CACHE_DIR} &)"
clean_bazel_cache_binary=bazel-out/k8-opt/bin/tools/clean_bazel_cache/clean_bazel_cache_/clean_bazel_cache
if [[ ! -f $clean_bazel_cache_binary ]]; then
# Build binary if this tool doesn't exist
tools/bazel build //tools/bazel_disk_cache_cleaner
tools/bazel build //tools/clean_bazel_cache
fi

# bazel cache requires sudo to remove
sudo ${bazel_cache_binary} clean \
--bazel-cache-dir ${BAZEL_CACHE_DIR} \
--keep-files-access-days ${KEEP_FILES_ACCESSED_DAYS}
sudo ${clean_bazel_cache_binary} clean \
"${SET_VERBOSE}" \
--bazel-cache-dir "${BAZEL_CACHE_DIR}" \
--keep-files-access-days "${KEEP_FILES_ACCESSED_DAYS}" \
--external-repo-target-list "${external_repo_list}"

# Statistics for storage cleared
after_storage_used=$(du -sh ${BAZEL_CACHE_DIR} &)
after_storage_used="$(du -sh ${BAZEL_CACHE_DIR} &)"
echo "Bazel Cache Disk Usage before cleaning: ${before_storage_used}"
echo "Bazel Cache Disk Usage after cleaning: ${after_storage_used}"

# Reset Bazel Memory state now that we've cleaned the cache
tools/bazel shutdown
6 changes: 5 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@ module github.com/Ryang20718/bazel-disk-cache-cleaner

go 1.21

require github.com/spf13/cobra v1.8.0
require (
github.com/spf13/cobra v1.8.0
go.uber.org/zap v1.26.0
)

require (
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect
go.uber.org/multierr v1.10.0 // indirect
)
13 changes: 13 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,10 +1,23 @@
github.com/cpuguy83/go-md2man/v2 v2.0.3/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/spf13/cobra v1.8.0 h1:7aJaZx1B85qltLMc546zn58BxxfZdR/W22ej9CFoEf0=
github.com/spf13/cobra v1.8.0/go.mod h1:WXLWApfZ71AjXPya3WOlMsY9yMs7YeiHhFVlvLyhcho=
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
go.uber.org/goleak v1.2.0 h1:xqgm/S+aQvhWFTtR0XK3Jvg7z8kGV8P4X14IzwN3Eqk=
go.uber.org/goleak v1.2.0/go.mod h1:XJYK+MuIchqpmGmUSAzotztawfKvYLUIgg7guXrwVUo=
go.uber.org/multierr v1.10.0 h1:S0h4aNzvfcFsC3dRF1jLoaov7oRaKqRGC/pUEJ2yvPQ=
go.uber.org/multierr v1.10.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
go.uber.org/zap v1.26.0 h1:sI7k6L95XOKS281NhVKOFCUNIvv9e0w4BF8N3u+tCRo=
go.uber.org/zap v1.26.0/go.mod h1:dtElttAiwGvoJ/vj4IwHBS/gXsEu/pZ50mUIRWuG0so=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

0 comments on commit 177e5bd

Please sign in to comment.