Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

.ocrignore file support #1

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ import (
"github.com/rs/zerolog"
)

const ignoreFileName = ".ocrignore"

var logger zerolog.Logger
var applicationText = "%s 0.1.0%s"
var copyrightText = "Copyright 2022-2023, Matthew Winter\n"
Expand All @@ -35,6 +37,9 @@ submitting all image files for optical character recognition (OCR) via the
Google Cloud Vision API, Outputting the OCR response to a single newline
delimited JSON File.

If a file named .ocrignore is found in the current dir it's used as a list of
GLOBs to exclude, akin to .gitignore.

Use --help for more details.


Expand Down Expand Up @@ -79,12 +84,26 @@ func main() {
zerolog.SetGlobalLevel(zerolog.InfoLevel)
}

// Checking for .ocrignore file
hasIgnoreFile := false
ignoreFile, err_ := filepath.Glob("./" + ignoreFileName)
if err_ != nil {
logger.Error().Err(err_).Msg("Failed to check for .ocrignore file.")

} else {
// in the odd case that filepath.Glob returns several files
if len(ignoreFile) == 1 {
hasIgnoreFile = true
}
}

// Output Header
logger.Info().Msgf(applicationText, filepath.Base(os.Args[0]), "")
logger.Info().Msg("Arguments")
logger.Info().Str("Input Path", *inputPath).Msg(indent)
logger.Info().Str("Output File", *outputFile).Msg(indent)
logger.Info().Bool("Output Full Details", *outputFull).Msg(indent)
logger.Info().Bool("Using ignore file", hasIgnoreFile).Msg(indent)
logger.Info().Msg("Begin")

// Walk the provided input path and populate a list of images in preparation for OCR
Expand Down
69 changes: 69 additions & 0 deletions vision-api.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,11 @@ import (
"encoding/json"
"fmt"
"image"
"io"
"os"
"path"
"path/filepath"
"regexp"
"strings"

i32 "github.com/adam-lavrik/go-imath/i32"
Expand Down Expand Up @@ -61,8 +64,16 @@ func (files *ImageFiles) PopulateImages(inputPath string) error {
return fmt.Errorf("Glob Failed: %w", err)
}

// Get list of GLOBs to ignore
ignoreThis := GetIgnoreList(ignoreFileName)

// Load all matching files returned from the Glob
for _, filename := range matches {

if IsIgnorableFile(filename, ignoreThis) {
continue
}

fileInfo, err := os.Stat(filename)
if err != nil {
return fmt.Errorf("Failed to get file info: %w", err)
Expand Down Expand Up @@ -340,3 +351,61 @@ func (info *ImageInfo) GetCompactJSON() ([]byte, error) {
func (info *ImageInfo) GetFullJSON() ([]byte, error) {
return json.Marshal(info)
}

// Simply opens a file so handleIgnoreFile can do the work and make testing easier.
func GetIgnoreList(ignoreFileName string) []string {
fileReader, err := os.Open(path.Join("./", ignoreFileName))
if err != nil {
// ignore if error is about the ignore file not existing
if os.IsNotExist(err) {
return []string{}
}

logger.Error().Err(err).Msg("couldn't Open ignore file.")
return []string{}
}

defer fileReader.Close()

return handleIgnoreFile(fileReader)
}

// lineBreakRegExp matches a Unix (\n) or Windows (\r\n) line break. It is
// compiled once at package initialisation rather than on every call to
// handleIgnoreFile.
var lineBreakRegExp = regexp.MustCompile(`\r?\n`)

// handleIgnoreFile reads all of file and returns the patterns it contains, one
// per line. Leading and trailing whitespace is trimmed from every line and
// lines that are empty after trimming are dropped.
//
// The returned slice is never nil: an empty input, or a read error (which is
// logged), yields an empty slice.
func handleIgnoreFile(file io.Reader) []string {
	fileContents, err := io.ReadAll(file)
	if err != nil {
		logger.Error().Err(err).Msg("couldn't Read ignore file.")
		return []string{}
	}

	lines := lineBreakRegExp.Split(string(fileContents), -1)

	// At most one pattern per line, so len(lines) is an upper bound. Using
	// make keeps the result non-nil even when no pattern survives.
	cleanGlobs := make([]string, 0, len(lines))
	for _, line := range lines {
		if s := strings.TrimSpace(line); s != "" {
			cleanGlobs = append(cleanGlobs, s)
		}
	}

	return cleanGlobs
}

// IsIgnorableFile reports whether fileName matches at least one of the glob
// patterns in ignoreList.
//
// Patterns use the filepath.Match syntax and have to match the whole of
// fileName, not just a part of it. In that syntax "*" matches any sequence of
// non-separator characters, so a pattern such as "*.jpg" will not match a path
// that contains the operating system's path separator.
//
// A malformed pattern is logged and skipped, so a single bad entry does not
// prevent the remaining patterns from being checked.
func IsIgnorableFile(fileName string, ignoreList []string) bool {
	for _, glob := range ignoreList {
		matched, err := filepath.Match(glob, fileName)
		if err != nil {
			errMsg := fmt.Sprintf("Encountered a malformed GLOB while checking if a file should be ignored. Offending glob: %s", glob)
			logger.Error().Err(err).Msg(errMsg)
			continue
		}

		if matched {
			return true
		}
	}

	return false
}
111 changes: 111 additions & 0 deletions vision-api_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
// Copyright 2021-2023, Matthew Winter
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
"bytes"
"io"
"reflect"
"testing"
)

// Test_handleIgnoreFile checks that handleIgnoreFile returns one pattern per
// line, trims surrounding whitespace, drops blank lines and accepts both Unix
// and Windows line endings.
func Test_handleIgnoreFile(t *testing.T) {
	tests := []struct {
		name string
		file io.Reader
		want []string
	}{
		{
			name: "some values",
			file: bytes.NewBufferString("*.jpg\n*.bmp\n./examples/**"),
			want: []string{"*.jpg", "*.bmp", "./examples/**"},
		},
		{
			// Empty input must give an empty, non-nil slice; reflect.DeepEqual
			// distinguishes a nil slice from an empty one.
			name: "empty",
			file: bytes.NewBufferString(""),
			want: []string{},
		},
		{
			name: "whitespaced",
			file: bytes.NewBufferString("   *.jpg\n*.bmp   \n  ./examples/**"),
			want: []string{"*.jpg", "*.bmp", "./examples/**"},
		},
		{
			name: "windows line endings",
			file: bytes.NewBufferString("*.jpg\r\n*.bmp\r\n./examples/**\r\n"),
			want: []string{"*.jpg", "*.bmp", "./examples/**"},
		},
		{
			name: "blank lines",
			file: bytes.NewBufferString("\n*.jpg\n\n   \n*.bmp\n"),
			want: []string{"*.jpg", "*.bmp"},
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got := handleIgnoreFile(tt.file)
			if !reflect.DeepEqual(got, tt.want) {
				t.Errorf("handleIgnoreFile() = %q, want %q", got, tt.want)
			}
		})
	}
}

// Test_isIgnorableFile checks IsIgnorableFile against patterns that do and do
// not match, for bare file names as well as for names with a directory part.
func Test_isIgnorableFile(t *testing.T) {
	type args struct {
		fileName   string
		ignoreList []string
	}
	tests := []struct {
		name string
		args args
		want bool
	}{
		{
			name: "ok matches",
			args: args{
				fileName:   "aloha.jpg",
				ignoreList: []string{"*.jpg"},
			},
			want: true,
		},
		{
			name: "not matches",
			args: args{
				fileName:   "aloha.jpg",
				ignoreList: []string{"*.bmp"},
			},
			want: false,
		},
		{
			// The extension differs, so this is a mismatch regardless of the
			// directory part of the file name.
			name: "not matches other extension in dir",
			args: args{
				fileName:   "./examples/aloha.jpg",
				ignoreList: []string{"*.bmp"},
			},
			want: false,
		},
		{
			name: "ok matches in dir",
			args: args{
				fileName:   "./examples/aloha.jpg",
				ignoreList: []string{"./examples/*.jpg"},
			},
			want: true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got := IsIgnorableFile(tt.args.fileName, tt.args.ignoreList)
			if got != tt.want {
				t.Errorf("IsIgnorableFile() = %v, want %v", got, tt.want)
			}
		})
	}
}