[Exporter] Allow to match resource names by regular expression (#4177)
## Changes

In addition to the existing `-match` option, this PR allows matching of
resource names by regular expression during the listing operation. The new
options are:

- `-matchRegex` - checks whether a name matches the given regex. This could
be useful, for example, for exporting notebooks that belong only to specific
users.
- `-excludeRegex` - skips processing of any object whose name matches the
given regex. For example, it could be used to exclude `databricks_automl`
directories. This parameter takes priority over `-match` and `-matchRegex`.
- `-filterDirectoriesDuringWorkspaceWalking` - controls whether the match
logic is also applied to directory names while walking the workspace tree.
*Note: use this with care, as it is applied to all entries; if you want to
filter only specific users, you also need to match `/Users` itself, so the
regex becomes `^(/Users|/Users/[a-c].*)$`*
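
The interaction with `/Users` can be checked with plain `grep -E` (a minimal sketch; the paths are hypothetical examples, not taken from the PR). Because the directory filter is applied to every directory during the walk, the regex must match the parent `/Users` directory as well, or the walk never descends into it:

```shell
# Regex from the note above: match /Users itself plus users a-c.
re='^(/Users|/Users/[a-c].*)$'

for p in /Users /Users/alice@example.com /Users/zoe@example.com; do
  if printf '%s\n' "$p" | grep -Eq "$re"; then
    echo "walk $p"
  else
    echo "skip $p"
  fi
done
# walk /Users
# walk /Users/alice@example.com
# skip /Users/zoe@example.com
```

Dropping the `/Users` alternative (`^/Users/[a-c].*$`) would cause the top-level `/Users` directory itself to be skipped, so none of the user directories would be visited.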

## Tests

- [x] `make test` run locally
- [x] relevant change in `docs/` folder
- [ ] covered with integration tests in `internal/acceptance`
- [ ] relevant acceptance tests are passing
- [ ] using Go SDK

---------

Co-authored-by: Miles Yucht <miles@databricks.com>
alexott and mgyucht authored Nov 1, 2024
1 parent da1f7e4 commit 0fbfbf4
Showing 7 changed files with 230 additions and 36 deletions.
3 changes: 3 additions & 0 deletions docs/guides/experimental-exporter.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ All arguments are optional, and they tune what code is being generated.
* `-listing` - Comma-separated list of services to be listed and further passed on for importing. For each service specified, the exporter performs a listing of available resources using the `List` function and emits them for importing together with their dependencies. The `-services` parameter could be used to control which transitive dependencies will be also imported.
* `-services` - Comma-separated list of services to import. By default, all services are imported.
* `-match` - Match resource names during listing operation. This filter applies to all resources that are getting listed, so if you want to import all dependencies of just one cluster, specify `-match=autoscaling -listing=compute`. By default, it is empty, which matches everything.
* `-matchRegex` - Match resource names against a given regex during the listing operation. Applicable to all resources selected for listing.
* `-excludeRegex` - Exclude resource names matching a given regex. Applied during the listing operation and takes priority over `-match` and `-matchRegex`. Applicable to all resources selected for listing. Can be used, for example, to exclude `databricks_automl` notebooks.
* `-filterDirectoriesDuringWorkspaceWalking` - Apply the match logic to directory names when walking the workspace tree. *Note: use this with care, as it is applied to all entries; if you want to filter only specific users, you also need to specify a condition for `/Users` itself, so the regex becomes `^(/Users|/Users/[a-c].*)$`*.
* `-mounts` - List DBFS mount points, an extremely slow operation that would not trigger unless explicitly specified.
* `-generateProviderDeclaration` - the flag that toggles the generation of `databricks.tf` file with the declaration of the Databricks Terraform provider that is necessary for Terraform versions since Terraform 0.13 (disabled by default).
* `-prefix` - optional prefix that will be added to the name of all exported resources - that's useful for exporting resources from multiple workspaces for merging into a single one.
Expand Down
8 changes: 8 additions & 0 deletions exporter/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,8 @@ func Run(args ...string) error {
flags.BoolVar(&ic.mounts, "mounts", false, "List DBFS mount points.")
flags.BoolVar(&ic.generateDeclaration, "generateProviderDeclaration", true,
"Generate Databricks provider declaration.")
flags.BoolVar(&ic.filterDirectoriesDuringWorkspaceWalking, "filterDirectoriesDuringWorkspaceWalking", false,
"Apply filtering to directory names during workspace walking")
flags.StringVar(&ic.notebooksFormat, "notebooksFormat", "SOURCE",
"Format to export notebooks: SOURCE, DBC, JUPYTER. Default: SOURCE")
services, listing := ic.allServicesAndListing()
Expand All @@ -145,6 +147,12 @@ func Run(args ...string) error {
flags.StringVar(&ic.match, "match", "", "Match resource names during listing operation. "+
"This filter applies to all resources that are getting listed, so if you want to import "+
"all dependencies of just one cluster, specify -listing=compute")
flags.StringVar(&ic.matchRegexStr, "matchRegex", "", "Match resource names during listing operation against a regex. "+
"This filter applies to all resources that are getting listed, so if you want to import "+
"all dependencies of just one cluster, specify -listing=compute")
flags.StringVar(&ic.excludeRegexStr, "excludeRegex", "", "Exclude resource names matching regex during listing operation. "+
"This filter applies to all resources that are getting listed, so if you want to import "+
"all dependencies of just one cluster, specify -listing=compute")
prefix := ""
flags.StringVar(&prefix, "prefix", "", "Prefix that will be added to the name of all exported resources")
newArgs := args
Expand Down
67 changes: 45 additions & 22 deletions exporter/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,28 +78,33 @@ type importContext struct {
Scope importedResources

// command-line resources (immutable, or set by the single thread)
includeUserDomains bool
importAllUsers bool
exportDeletedUsersAssets bool
incremental bool
mounts bool
noFormat bool
nativeImportSupported bool
services map[string]struct{}
listing map[string]struct{}
match string
lastActiveDays int64
lastActiveMs int64
generateDeclaration bool
exportSecrets bool
meAdmin bool
meUserName string
prefix string
accountLevel bool
shImports map[string]bool
notebooksFormat string
updatedSinceStr string
updatedSinceMs int64
includeUserDomains bool
importAllUsers bool
exportDeletedUsersAssets bool
incremental bool
mounts bool
noFormat bool
nativeImportSupported bool
services map[string]struct{}
listing map[string]struct{}
match string
matchRegexStr string
matchRegex *regexp.Regexp
excludeRegexStr string
excludeRegex *regexp.Regexp
filterDirectoriesDuringWorkspaceWalking bool
lastActiveDays int64
lastActiveMs int64
generateDeclaration bool
exportSecrets bool
meAdmin bool
meUserName string
prefix string
accountLevel bool
shImports map[string]bool
notebooksFormat string
updatedSinceStr string
updatedSinceMs int64

waitGroup *sync.WaitGroup

Expand Down Expand Up @@ -297,6 +302,24 @@ func (ic *importContext) Run() error {
return fmt.Errorf("no services to import")
}

if ic.matchRegexStr != "" {
log.Printf("[DEBUG] Using regex '%s' to filter resources", ic.matchRegexStr)
re, err := regexp.Compile(ic.matchRegexStr)
if err != nil {
log.Printf("[ERROR] can't compile regex '%s': %v", ic.matchRegexStr, err)
return err
}
ic.matchRegex = re
}
if ic.excludeRegexStr != "" {
log.Printf("[DEBUG] Using regex '%s' to filter resources", ic.excludeRegexStr)
re, err := regexp.Compile(ic.excludeRegexStr)
if err != nil {
log.Printf("[ERROR] can't compile regex '%s': %v", ic.excludeRegexStr, err)
return err
}
ic.excludeRegex = re
}
if ic.incremental {
if ic.updatedSinceStr == "" {
ic.updatedSinceStr = getLastRunString(statsFileName)
Expand Down
151 changes: 148 additions & 3 deletions exporter/exporter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2349,7 +2349,7 @@ func TestImportingGlobalSqlConfig(t *testing.T) {
})
}

func TestImportingNotebooksWorkspaceFiles(t *testing.T) {
func TestImportingNotebooksWorkspaceFilesWithFilter(t *testing.T) {
fileStatus := workspace.ObjectStatus{
ObjectID: 123,
ObjectType: workspace.File,
Expand All @@ -2371,7 +2371,135 @@ func TestImportingNotebooksWorkspaceFiles(t *testing.T) {
Method: "GET",
Resource: "/api/2.0/workspace/list?path=%2F",
Response: workspace.ObjectList{
Objects: []workspace.ObjectStatus{notebookStatus, fileStatus},
Objects: []workspace.ObjectStatus{notebookStatus, fileStatus,
{
ObjectID: 4567,
ObjectType: workspace.Notebook,
Path: "/UnmatchedNotebook",
Language: "PYTHON",
},
{
ObjectID: 1234,
ObjectType: workspace.File,
Path: "/UnmatchedFile",
},
{
ObjectID: 456,
ObjectType: workspace.Directory,
Path: "/databricks_automl",
},
{
ObjectID: 456,
ObjectType: workspace.Directory,
Path: "/.bundle",
},
},
},
ReuseRequest: true,
},
{
Method: "GET",
Resource: "/api/2.0/workspace/list?path=%2Fdatabricks_automl",
Response: workspace.ObjectList{},
},
{
Method: "GET",
Resource: "/api/2.0/workspace/get-status?path=%2FNotebook",
Response: notebookStatus,
ReuseRequest: true,
},
{
Method: "GET",
Resource: "/api/2.0/workspace/get-status?path=%2FFile",
Response: fileStatus,
ReuseRequest: true,
},
{
Method: "GET",
Resource: "/api/2.0/workspace/export?format=AUTO&path=%2FFile",
Response: workspace.ExportPath{
Content: "dGVzdA==",
},
ReuseRequest: true,
},
{
Method: "GET",
Resource: "/api/2.0/workspace/export?format=SOURCE&path=%2FNotebook",
Response: workspace.ExportPath{
Content: "dGVzdA==",
},
ReuseRequest: true,
},
},
func(ctx context.Context, client *common.DatabricksClient) {
tmpDir := fmt.Sprintf("/tmp/tf-%s", qa.RandomName())
defer os.RemoveAll(tmpDir)

ic := newImportContext(client)
ic.Directory = tmpDir
ic.enableListing("notebooks,wsfiles")
ic.excludeRegexStr = "databricks_automl"
ic.matchRegexStr = "^/[FN].*$"

err := ic.Run()
assert.NoError(t, err)
// check generated code for notebooks
content, err := os.ReadFile(tmpDir + "/notebooks.tf")
assert.NoError(t, err)
contentStr := string(content)
assert.True(t, strings.Contains(contentStr, `resource "databricks_notebook" "notebook_456"`))
assert.True(t, strings.Contains(contentStr, `path = "/Notebook"`))
assert.False(t, strings.Contains(contentStr, `/UnmatchedNotebook`))
// check generated code for workspace files
content, err = os.ReadFile(tmpDir + "/wsfiles.tf")
assert.NoError(t, err)
contentStr = string(content)
assert.True(t, strings.Contains(contentStr, `resource "databricks_workspace_file" "file_123"`))
assert.True(t, strings.Contains(contentStr, `path = "/File"`))
assert.False(t, strings.Contains(contentStr, `/UnmatchedFile`))
})
}

func TestImportingNotebooksWorkspaceFilesWithFilterDuringWalking(t *testing.T) {
fileStatus := workspace.ObjectStatus{
ObjectID: 123,
ObjectType: workspace.File,
Path: "/File",
}
notebookStatus := workspace.ObjectStatus{
ObjectID: 456,
ObjectType: workspace.Notebook,
Path: "/Notebook",
Language: "PYTHON",
}
qa.HTTPFixturesApply(t,
[]qa.HTTPFixture{
meAdminFixture,
noCurrentMetastoreAttached,
emptyRepos,
emptyIpAccessLIst,
{
Method: "GET",
Resource: "/api/2.0/workspace/list?path=%2F",
Response: workspace.ObjectList{
Objects: []workspace.ObjectStatus{notebookStatus, fileStatus,
{
ObjectID: 4567,
ObjectType: workspace.Notebook,
Path: "/UnmatchedNotebook",
Language: "PYTHON",
},
{
ObjectID: 1234,
ObjectType: workspace.File,
Path: "/UnmatchedFile",
},
{
ObjectID: 456,
ObjectType: workspace.Directory,
Path: "/databricks_automl",
},
},
},
ReuseRequest: true,
},
Expand Down Expand Up @@ -2410,10 +2538,27 @@ func TestImportingNotebooksWorkspaceFiles(t *testing.T) {

ic := newImportContext(client)
ic.Directory = tmpDir
ic.enableListing("notebooks")
ic.enableListing("notebooks,wsfiles")
ic.excludeRegexStr = "databricks_automl"
ic.matchRegexStr = "^/[FN].*$"
ic.filterDirectoriesDuringWorkspaceWalking = true

err := ic.Run()
assert.NoError(t, err)
// check generated code for notebooks
content, err := os.ReadFile(tmpDir + "/notebooks.tf")
assert.NoError(t, err)
contentStr := string(content)
assert.True(t, strings.Contains(contentStr, `resource "databricks_notebook" "notebook_456"`))
assert.True(t, strings.Contains(contentStr, `path = "/Notebook"`))
assert.False(t, strings.Contains(contentStr, `/UnmatchedNotebook`))
// check generated code for workspace files
content, err = os.ReadFile(tmpDir + "/wsfiles.tf")
assert.NoError(t, err)
contentStr = string(content)
assert.True(t, strings.Contains(contentStr, `resource "databricks_workspace_file" "file_123"`))
assert.True(t, strings.Contains(contentStr, `path = "/File"`))
assert.False(t, strings.Contains(contentStr, `/UnmatchedFile`))
})
}

Expand Down
8 changes: 7 additions & 1 deletion exporter/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,15 @@ func (ic *importContext) isServiceInListing(service string) bool {
}

func (ic *importContext) MatchesName(n string) bool {
if ic.match == "" {
if ic.match == "" && ic.matchRegex == nil && ic.excludeRegex == nil {
return true
}
if ic.excludeRegex != nil && ic.excludeRegex.MatchString(n) {
return false
}
if ic.matchRegex != nil {
return ic.matchRegex.MatchString(n)
}
return strings.Contains(strings.ToLower(n), strings.ToLower(ic.match))
}

Expand Down
12 changes: 6 additions & 6 deletions exporter/util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -316,16 +316,16 @@ func TestGetEnvAsInt(t *testing.T) {
}

func TestExcludeAuxiliaryDirectories(t *testing.T) {
assert.True(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "", ObjectType: workspace.Directory}))
assert.True(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{ObjectType: workspace.File}))
assert.True(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "/Users/user@domain.com/abc",
assert.False(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "", ObjectType: workspace.Directory}))
assert.False(t, isAuxiliaryDirectory(workspace.ObjectStatus{ObjectType: workspace.File}))
assert.False(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "/Users/user@domain.com/abc",
ObjectType: workspace.Directory}))
// should be ignored
assert.False(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "/Users/user@domain.com/.ide",
assert.True(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "/Users/user@domain.com/.ide",
ObjectType: workspace.Directory}))
assert.False(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "/Shared/.bundle",
assert.True(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "/Shared/.bundle",
ObjectType: workspace.Directory}))
assert.False(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "/Users/user@domain.com/abc/__pycache__",
assert.True(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "/Users/user@domain.com/abc/__pycache__",
ObjectType: workspace.Directory}))
}

Expand Down
17 changes: 13 additions & 4 deletions exporter/util_workspace.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,17 +93,18 @@ func (ic *importContext) getAllDirectories() []workspace.ObjectStatus {
var directoriesToIgnore = []string{".ide", ".bundle", "__pycache__"}

// TODO: add ignoring directories of deleted users? This could potentially decrease the number of processed objects...
func excludeAuxiliaryDirectories(v workspace.ObjectStatus) bool {
func isAuxiliaryDirectory(v workspace.ObjectStatus) bool {
if v.ObjectType != workspace.Directory {
return true
return false
}
// TODO: rewrite to use suffix check, etc., instead of split and slice contains?
parts := strings.Split(v.Path, "/")
result := len(parts) > 1 && slices.Contains[[]string, string](directoriesToIgnore, parts[len(parts)-1])
log.Printf("[DEBUG] directory %s: %v", v.Path, result)
if result {
log.Printf("[DEBUG] Ignoring directory %s", v.Path)
}
return !result
return result
}

func (ic *importContext) getAllWorkspaceObjects(visitor func([]workspace.ObjectStatus)) []workspace.ObjectStatus {
Expand All @@ -113,7 +114,15 @@ func (ic *importContext) getAllWorkspaceObjects(visitor func([]workspace.ObjectS
t1 := time.Now()
log.Print("[INFO] Starting to list all workspace objects")
notebooksAPI := workspace.NewNotebooksAPI(ic.Context, ic.Client)
ic.allWorkspaceObjects, _ = ListParallel(notebooksAPI, "/", excludeAuxiliaryDirectories, visitor)
shouldIncludeDirectory := func(v workspace.ObjectStatus) bool {
decision := !isAuxiliaryDirectory(v)
if decision && ic.filterDirectoriesDuringWorkspaceWalking {
decision = ic.MatchesName(v.Path)
}
// log.Printf("[DEBUG] decision of shouldIncludeDirectory for %s: %v", v.Path, decision)
return decision
}
ic.allWorkspaceObjects, _ = ListParallel(notebooksAPI, "/", shouldIncludeDirectory, visitor)
log.Printf("[INFO] Finished listing of all workspace objects. %d objects in total. %v seconds",
len(ic.allWorkspaceObjects), time.Since(t1).Seconds())
}
Expand Down
