From dec6a4cd2fcc0dba71e83abe639c90f4efc8da1e Mon Sep 17 00:00:00 2001 From: Felipe Date: Fri, 16 Jun 2023 14:37:11 +1000 Subject: [PATCH] .ocrignore file support --- main.go | 19 ++++++++ vision-api.go | 69 ++++++++++++++++++++++++++++ vision-api_test.go | 111 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 199 insertions(+) create mode 100644 vision-api_test.go diff --git a/main.go b/main.go index 2d73e66..ee60f27 100644 --- a/main.go +++ b/main.go @@ -24,6 +24,8 @@ import ( "github.com/rs/zerolog" ) +const ignoreFileName = ".ocrignore" + var logger zerolog.Logger var applicationText = "%s 0.1.0%s" var copyrightText = "Copyright 2022-2023, Matthew Winter\n" @@ -35,6 +37,9 @@ submitting all image files for optical character recognition (OCR) via the Google Cloud Vision API, Outputting the OCR response to a single newline delimited JSON File. +If a file named .ocrignore is found in the current dir it's used a a list of +GLOBs to exclude, akin to .gitignore. + Use --help for more details. @@ -79,12 +84,26 @@ func main() { zerolog.SetGlobalLevel(zerolog.InfoLevel) } + // Checking for .ocrignore file + hasIgnoreFile := false + ignoreFile, err_ := filepath.Glob("./" + ignoreFileName) + if err_ != nil { + logger.Error().Err(err_).Msg("Failed to check for .ocrignore file.") + + } else { + // in the odd case that filepath.Glob returns several files + if len(ignoreFile) == 1 { + hasIgnoreFile = true + } + } + // Output Header logger.Info().Msgf(applicationText, filepath.Base(os.Args[0]), "") logger.Info().Msg("Arguments") logger.Info().Str("Input Path", *inputPath).Msg(indent) logger.Info().Str("Output File", *outputFile).Msg(indent) logger.Info().Bool("Output Full Details", *outputFull).Msg(indent) + logger.Info().Bool("Using ignore file", hasIgnoreFile).Msg(indent) logger.Info().Msg("Begin") // Walk the provided input path and populate a list of images in preparation for OCR diff --git a/vision-api.go b/vision-api.go index 346bb1c..7fc3084 100644 --- a/vision-api.go +++ b/vision-api.go @@ -20,8 +20,11 @@ import ( "encoding/json" "fmt" "image" + "io" "os" + "path" "path/filepath" + "regexp" "strings" i32 "github.com/adam-lavrik/go-imath/i32" @@ -61,8 +64,16 @@ func (files *ImageFiles) PopulateImages(inputPath string) error { return fmt.Errorf("Glob Failed: %w", err) } + // Get lits of GLOBs to ignore + ignoreThis := GetIgnoreList(ignoreFileName) + // Load all matching files returned from the Glob for _, filename := range matches { + + if IsIgnorableFile(filename, ignoreThis) { + continue + } + fileInfo, err := os.Stat(filename) if err != nil { return fmt.Errorf("Failed to get file info: %w", err) @@ -340,3 +351,61 @@ func (info *ImageInfo) GetCompactJSON() ([]byte, error) { func (info *ImageInfo) GetFullJSON() ([]byte, error) { return json.Marshal(info) } + +// Simply opens a file so handleIgnoreFile can do the work and make testing easier. +func GetIgnoreList(ignoreFileName string) []string { + fileReader, err := os.Open(path.Join("./", ignoreFileName)) + if err != nil { + // ignore if error is about the ignore file not existing + if os.IsNotExist(err) { + return []string{} + } + + logger.Error().Err(err).Msg("couldn't Open ignore file.") + return []string{} + } + + defer fileReader.Close() + + return handleIgnoreFile(fileReader) +} + +func handleIgnoreFile(file io.Reader) []string { + fileContents, err := io.ReadAll(file) + if err != nil { + logger.Error().Err(err).Msg("couldn't Read ignore file.") + return []string{} + } + lineBreakRegExp := regexp.MustCompile(`\r?\n`) + globs := lineBreakRegExp.Split(string(fileContents), -1) + + cleanGlobs := make([]string, 0) + + for _, g := range globs { + s := strings.TrimSpace(g) + if len(s) > 0 { + cleanGlobs = append(cleanGlobs, s) + } + } + + return cleanGlobs +} + +func IsIgnorableFile(fileName string, ignoreList []string) bool { + + for _, glob := range ignoreList { + + matches, err := filepath.Match(glob, fileName) + if err != nil { + errMsg := fmt.Sprintf("Ecountered a malformed GLOB while checking if a file should be ignored. Offending glob: %s", glob) + logger.Error().Err(err).Msg(errMsg) + continue + } + + if matches { + return true + } + } + + return false +} diff --git a/vision-api_test.go b/vision-api_test.go new file mode 100644 index 0000000..12d7d81 --- /dev/null +++ b/vision-api_test.go @@ -0,0 +1,111 @@ +// Copyright 2021-2023, Matthew Winter +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "bytes" + "io" + "reflect" + "testing" +) + +func Test_handleIgnoreFile(t *testing.T) { + + someContent := bytes.NewBufferString("*.jpg\n*.bmp\n./examples/**") + noContent := bytes.NewBufferString("") + malformed := bytes.NewBufferString(" *.jpg\n*.bmp \n ./examples/**") + + tests := []struct { + name string + file io.Reader + want []string + }{ + { + name: "some values", + file: someContent, + want: []string{"*.jpg", "*.bmp", "./examples/**"}, + }, + { + name: "one", + file: noContent, + want: []string{}, + }, + { + name: "whitespaced", + file: malformed, + want: []string{"*.jpg", "*.bmp", "./examples/**"}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := handleIgnoreFile(tt.file); !reflect.DeepEqual(got, tt.want) { + t.Errorf("handleIgnoreFile() = %v, want %v", got, tt.want) + } + }) + } +} + +func Test_isIgnorableFile(t *testing.T) { + + type args struct { + fileName string + ignoreList []string + } + tests := []struct { + name string + args args + want bool + }{ + { + name: "ok matches", + args: args{ + fileName: "aloha.jpg", + ignoreList: []string{"*.jpg"}, + }, + want: true, + }, + { + name: "not matches", + args: args{ + fileName: "aloha.jpg", + ignoreList: []string{"*.bmp"}, + }, + want: false, + }, + { + name: "not matches because in dir", + args: args{ + fileName: "./examples/aloha.jpg", + ignoreList: []string{"*.bmp"}, + }, + want: false, + }, + { + name: "ok matches in dir", + args: args{ + fileName: "./examples/aloha.jpg", + ignoreList: []string{"./examples/*.jpg"}, + }, + want: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := IsIgnorableFile(tt.args.fileName, tt.args.ignoreList); got != tt.want { + t.Errorf("isIgnorableFile() = %v, want %v", got, tt.want) + } + }) + } +}