Skip to content

Commit

Permalink
Add links.validate.config flag
Browse files Browse the repository at this point in the history
Signed-off-by: Saswata Mukherjee <saswataminsta@yahoo.com>
  • Loading branch information
saswatamcode committed May 27, 2021
1 parent 2cfb2e9 commit 88790de
Show file tree
Hide file tree
Showing 6 changed files with 226 additions and 92 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,10 @@ Flags:
--links.validate.without-address-regex=^$
If specified, all links will be validated,
except those matching the given target address.
--links.validate.without-github-links=""
If specified, all links will be validated,
except the GitHub links for PRs and issues of
the given repo.
--links.validate.config=""
Path to YAML file for skipping link check, with
spec defined in
github.com/bwplotka/mdox/pkg/mdformatter/linktransformer.Config
Args:
<files> Markdown file(s) to process.
Expand Down
4 changes: 2 additions & 2 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ This directive runs executable with arguments and put its stderr and stdout outp
// TODO(bwplotka): Add cache in file?
linksValidateEnabled := cmd.Flag("links.validate", "If true, all links will be validated").Short('l').Bool()
linksValidateExceptDomains := cmd.Flag("links.validate.without-address-regex", "If specified, all links will be validated, except those matching the given target address.").Default(`^$`).Regexp()
linksSkipGitHub := cmd.Flag("links.validate.without-github-links", "If specified, all links will be validated, except the GitHub links for PRs and issues of the given repo.").Default("").String()
linksValidateConfig := cmd.Flag("links.validate.config", "Path to YAML file for skipping link check, with spec defined in github.com/bwplotka/mdox/pkg/linktranformer.Config").Default("").String()

cmd.Run(func(ctx context.Context, logger log.Logger) (err error) {
var opts []mdformatter.Option
Expand All @@ -152,7 +152,7 @@ This directive runs executable with arguments and put its stderr and stdout outp

var linkTr []mdformatter.LinkTransformer
if *linksValidateEnabled {
v, err := linktransformer.NewValidator(logger, *linksValidateExceptDomains, *linksSkipGitHub, anchorDir)
v, err := linktransformer.NewValidator(logger, *linksValidateExceptDomains, *linksValidateConfig, anchorDir)
if err != nil {
return err
}
Expand Down
64 changes: 64 additions & 0 deletions pkg/mdformatter/linktransformer/config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
// Copyright (c) Bartłomiej Płotka @bwplotka
// Licensed under the Apache License 2.0.

package linktransformer

import (
"bytes"
"io/ioutil"
"path/filepath"
"regexp"

"github.com/pkg/errors"
"gopkg.in/yaml.v3"
)

// Config is the root of the link validation configuration that ParseConfig
// decodes from YAML.
type Config struct {
// Version of the configuration. NOTE(review): not read anywhere in this file.
Version int

// Validate groups the link validation settings.
Validate struct {
// Validators is the list of validator entries; ParseConfig returns an
// error when it is empty.
Validators []Validator `yaml:"validators"`
}
}

// Validator is a single validator entry of Config.
type Validator struct {
// _regex is the compiled form of Regex; ParseConfig fills it in after decoding.
_regex *regexp.Regexp
// _maxnum is not assigned in this file.
// NOTE(review): presumably the highest known issue/PR number for `github`
// validators - confirm against the code that sets it.
_maxnum int
// Regex is the pattern source that ParseConfig compiles into _regex.
// Regex for type github is reponame matcher, like `bwplotka\/mdox`.
Regex string `yaml:"regex"`
// By default type is `roundtrip`. Could be `github`.
Type string `yaml:"type"`
}

// parseConfigFile loads the validation config stored at configFile.
//
// An empty path yields a zero-value Config and no error. Otherwise the path is
// made absolute, the file is read in full and its content is passed to
// ParseConfig, whose result is returned as-is.
func parseConfigFile(configFile string) (Config, error) {
	if len(configFile) == 0 {
		return Config{}, nil
	}

	absPath, absErr := filepath.Abs(configFile)
	if absErr != nil {
		return Config{}, errors.Wrap(absErr, "abs")
	}

	content, readErr := ioutil.ReadFile(absPath)
	if readErr != nil {
		return Config{}, errors.Wrap(readErr, "read config file")
	}

	return ParseConfig(content)
}

// ParseConfig decodes the YAML document in c into a Config and compiles the
// regex of every validator.
//
// Decoding is strict: fields that are not part of Config are rejected. An
// error is returned when the YAML cannot be decoded, when no validator is
// defined, or when a validator regex does not compile. On error the returned
// Config is the zero value.
func ParseConfig(c []byte) (Config, error) {
	cfg := Config{}
	dec := yaml.NewDecoder(bytes.NewReader(c))
	dec.KnownFields(true)
	if err := dec.Decode(&cfg); err != nil {
		return Config{}, errors.Wrapf(err, "parsing YAML content %q", string(c))
	}

	if len(cfg.Validate.Validators) == 0 {
		return Config{}, errors.New("No validator provided")
	}

	for i := range cfg.Validate.Validators {
		v := &cfg.Validate.Validators[i]
		// The pattern comes from a user-supplied file, so a bad regex must be
		// reported as an error rather than panic via regexp.MustCompile.
		re, err := regexp.Compile(v.Regex)
		if err != nil {
			return Config{}, errors.Wrapf(err, "compiling regex %q of validator %d", v.Regex, i)
		}
		v._regex = re
	}
	return cfg, nil
}
106 changes: 22 additions & 84 deletions pkg/mdformatter/linktransformer/link.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,11 @@ package linktransformer
import (
"bufio"
"bytes"
"encoding/json"
"fmt"
"io"
"math"
"net/http"
"os"
"path/filepath"
"regexp"
"sort"
"strconv"
"strings"
"sync"

Expand All @@ -38,7 +33,6 @@ var (

const (
originalURLKey = "originalURLKey"
gitHubAPIURL = "https://api.github.com/repos/%v/%v?sort=created&direction=desc&per_page=1"
)

type chain struct {
Expand Down Expand Up @@ -116,11 +110,10 @@ func (l *localizer) TransformDestination(ctx mdformatter.SourceContext, destinat
func (l *localizer) Close(mdformatter.SourceContext) error { return nil }

type validator struct {
logger log.Logger
anchorDir string
except *regexp.Regexp
skipGitHub *regexp.Regexp
gitHubNum int
logger log.Logger
anchorDir string
except *regexp.Regexp
validateConfig Config

localLinks localLinksCache
rMu sync.RWMutex
Expand All @@ -143,21 +136,20 @@ type futureResult struct {

// NewValidator returns mdformatter.LinkTransformer that crawls all links.
// TODO(bwplotka): Add optimization and debug modes - this is the main source of latency and pain.
func NewValidator(logger log.Logger, except *regexp.Regexp, repo string, anchorDir string) (mdformatter.LinkTransformer, error) {
skipGitHub, gitHubNum, err := getGitHubRegex(repo)
func NewValidator(logger log.Logger, except *regexp.Regexp, linksValidateConfig string, anchorDir string) (mdformatter.LinkTransformer, error) {
config, err := parseConfigFile(linksValidateConfig)
if err != nil {
return nil, err
}
v := &validator{
logger: logger,
anchorDir: anchorDir,
except: except,
skipGitHub: skipGitHub,
gitHubNum: gitHubNum,
localLinks: map[string]*[]string{},
remoteLinks: map[string]error{},
c: colly.NewCollector(colly.Async()),
destFutures: map[futureKey]*futureResult{},
logger: logger,
anchorDir: anchorDir,
except: except,
validateConfig: config,
localLinks: map[string]*[]string{},
remoteLinks: map[string]error{},
c: colly.NewCollector(colly.Async()),
destFutures: map[futureKey]*futureResult{},
}
// Set very soft limits.
// E.g github has 50-5000 https://docs.github.com/en/free-pro-team@latest/rest/reference/rate-limit limit depending
Expand All @@ -183,62 +175,16 @@ func NewValidator(logger log.Logger, except *regexp.Regexp, repo string, anchorD
defer v.rMu.Unlock()
v.remoteLinks[response.Ctx.Get(originalURLKey)] = errors.Wrapf(err, "%q not accessible; status code %v", response.Request.URL.String(), response.StatusCode)
})
return v, nil
}

type GitHubResponse struct {
Number int `json:"number"`
}

func getGitHubRegex(reponame string) (*regexp.Regexp, int, error) {
if reponame != "" {
var pullNum []GitHubResponse
var issueNum []GitHubResponse
max := 0
// Check latest pull request number.
respPull, err := http.Get(fmt.Sprintf(gitHubAPIURL, reponame, "pulls"))
if err != nil {
return nil, math.MaxInt64, err
}
if respPull.StatusCode != 200 {
return nil, math.MaxInt64, errors.New("pulls API request failed. status code: " + strconv.Itoa(respPull.StatusCode))
}
defer respPull.Body.Close()
if err := json.NewDecoder(respPull.Body).Decode(&pullNum); err != nil {
return nil, math.MaxInt64, err
}
if len(pullNum) > 0 {
max = pullNum[0].Number
}

// Check latest issue number and return whichever is greater.
respIssue, err := http.Get(fmt.Sprintf(gitHubAPIURL, reponame, "issues"))
if err != nil {
return nil, math.MaxInt64, err
}
if respIssue.StatusCode != 200 {
return nil, math.MaxInt64, errors.New("issues API request failed. status code: " + strconv.Itoa(respIssue.StatusCode))
}
defer respIssue.Body.Close()
if err := json.NewDecoder(respIssue.Body).Decode(&issueNum); err != nil {
return nil, math.MaxInt64, err
}
if len(issueNum) > 0 && issueNum[0].Number > max {
max = issueNum[0].Number
}

// Place forward slash between org and repo to escape slash character.
idx := strings.Index(reponame, "/")
reponame = reponame[:idx] + `\` + reponame[idx:]
return regexp.MustCompile(`(^http[s]?:\/\/)(www\.)?(github\.com\/)(` + reponame + `)(\/pull\/|\/issues\/)`), max, nil
err = CheckGitHub(v.validateConfig)
if err != nil {
return nil, err
}

return regexp.MustCompile(`^$`), math.MaxInt64, nil
return v, nil
}

// MustNewValidator returns mdformatter.LinkTransformer that crawls all links.
func MustNewValidator(logger log.Logger, except *regexp.Regexp, reponame string, anchorDir string) mdformatter.LinkTransformer {
v, err := NewValidator(logger, except, reponame, anchorDir)
func MustNewValidator(logger log.Logger, except *regexp.Regexp, linksValidateConfig string, anchorDir string) mdformatter.LinkTransformer {
v, err := NewValidator(logger, except, linksValidateConfig, anchorDir)
if err != nil {
panic(err)
}
Expand Down Expand Up @@ -299,16 +245,8 @@ func (v *validator) visit(filepath string, dest string) {
if v.except.MatchString(dest) {
return
}
if v.skipGitHub.MatchString(dest) {
// Find rightmost index of match i.e, where regex match ends.
// This will be where issue/PR number starts. Split incase of section link and convert to int.
idx := v.skipGitHub.FindStringIndex(dest)
stringNum := strings.Split(dest[idx[1]:], "#")
num, err := strconv.Atoi(stringNum[0])
// If number in link does not exceed then link is valid. Otherwise will be checked by v.c.Visit.
if v.gitHubNum >= num && err == nil {
return
}
if CheckValidators(dest, v.validateConfig) {
return
}

matches := remoteLinkPrefixRe.FindAllStringIndex(dest, 1)
Expand Down
24 changes: 22 additions & 2 deletions pkg/mdformatter/linktransformer/link_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -244,16 +244,36 @@ func TestValidator_TransformDestination(t *testing.T) {
testutil.Ok(t, err)
})

t.Run("check github links, skipped", func(t *testing.T) {
t.Run("check links with validate config", func(t *testing.T) {
testFile := filepath.Join(tmpDir, "repo", "docs", "test", "links.md")
mdoxFile := filepath.Join(tmpDir, "mdox.yaml")

testutil.Ok(t, ioutil.WriteFile(testFile, []byte("https://fakelink1.com/ http://fakelink2.com/ https://www.fakelink3.com/\n"), os.ModePerm))
testutil.Ok(t, ioutil.WriteFile(mdoxFile, []byte("version: 1\n\nvalidate:\n validators:\n - regex: '(^http[s]?:\\/\\/)(www\\.)?(fakelink[0-9]\\.com\\/)'\n type: 'roundtrip'\n"), os.ModePerm))

diff, err := mdformatter.IsFormatted(context.TODO(), logger, []string{testFile})
testutil.Ok(t, err)
testutil.Equals(t, 0, len(diff), diff.String())

_, err = mdformatter.IsFormatted(context.TODO(), logger, []string{testFile}, mdformatter.WithLinkTransformer(
MustNewValidator(logger, regexp.MustCompile(`^$`), mdoxFile, anchorDir),
))
testutil.Ok(t, err)
})

t.Run("check github links with validate config", func(t *testing.T) {
testFile := filepath.Join(tmpDir, "repo", "docs", "test", "github-link.md")
mdoxFile := filepath.Join(tmpDir, "mdox.yaml")

testutil.Ok(t, ioutil.WriteFile(testFile, []byte("https://github.com/bwplotka/mdox/issues/23 https://github.com/bwplotka/mdox/pull/32 https://github.com/bwplotka/mdox/pull/27#pullrequestreview-659598194\n"), os.ModePerm))
testutil.Ok(t, ioutil.WriteFile(mdoxFile, []byte("version: 1\n\nvalidate:\n validators:\n - regex: 'bwplotka\\/mdox'\n type: 'github'\n"), os.ModePerm))

diff, err := mdformatter.IsFormatted(context.TODO(), logger, []string{testFile})
testutil.Ok(t, err)
testutil.Equals(t, 0, len(diff), diff.String())

_, err = mdformatter.IsFormatted(context.TODO(), logger, []string{testFile}, mdformatter.WithLinkTransformer(
MustNewValidator(logger, regexp.MustCompile(`^$`), "bwplotka/mdox", anchorDir),
MustNewValidator(logger, regexp.MustCompile(`^$`), mdoxFile, anchorDir),
))
testutil.Ok(t, err)
})
Expand Down
Loading

0 comments on commit 88790de

Please sign in to comment.