Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create decoder for HTML entities #2563

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion hack/snifftest/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ func main() {
for chunk := range chunksChan {
for name, scanner := range selectedScanners {
for _, dec := range allDecoders {
decoded := dec.FromChunk(&sources.Chunk{Data: chunk.Data})
decoded := dec.FromChunk(ctx, &sources.Chunk{Data: chunk.Data})
if decoded != nil {
foundKeyword := false
for _, kw := range scanner.Keywords() {
Expand Down
3 changes: 2 additions & 1 deletion pkg/decoders/base64.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"encoding/base64"
"unicode"

"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)
Expand All @@ -31,7 +32,7 @@ func (d *Base64) Type() detectorspb.DecoderType {
return detectorspb.DecoderType_BASE64
}

func (d *Base64) FromChunk(chunk *sources.Chunk) *DecodableChunk {
func (d *Base64) FromChunk(_ context.Context, chunk *sources.Chunk) *DecodableChunk {
decodableChunk := &DecodableChunk{Chunk: chunk, DecoderType: d.Type()}
encodedSubstrings := getSubstringsOfCharacterSet(chunk.Data, 20, b64CharsetMapping, b64EndChars)
decodedSubstrings := make(map[string][]byte)
Expand Down
9 changes: 5 additions & 4 deletions pkg/decoders/base64_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (

"github.com/kylelemons/godebug/pretty"

"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)
Expand Down Expand Up @@ -134,7 +135,7 @@ func TestBase64_FromChunk(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
d := &Base64{}
got := d.FromChunk(tt.chunk)
got := d.FromChunk(context.Background(), tt.chunk)
if tt.want != nil {
if got == nil {
t.Fatal("got nil, did not want nil")
Expand All @@ -156,7 +157,7 @@ func BenchmarkFromChunkSmall(b *testing.B) {
data := detectors.MustGetBenchmarkData()["small"]

for n := 0; n < b.N; n++ {
d.FromChunk(&sources.Chunk{Data: data})
d.FromChunk(context.Background(), &sources.Chunk{Data: data})
}
}

Expand All @@ -165,7 +166,7 @@ func BenchmarkFromChunkMedium(b *testing.B) {
data := detectors.MustGetBenchmarkData()["medium"]

for n := 0; n < b.N; n++ {
d.FromChunk(&sources.Chunk{Data: data})
d.FromChunk(context.Background(), &sources.Chunk{Data: data})
}
}

Expand All @@ -174,6 +175,6 @@ func BenchmarkFromChunkLarge(b *testing.B) {
data := detectors.MustGetBenchmarkData()["big"]

for n := 0; n < b.N; n++ {
d.FromChunk(&sources.Chunk{Data: data})
d.FromChunk(context.Background(), &sources.Chunk{Data: data})
}
}
7 changes: 5 additions & 2 deletions pkg/decoders/decoders.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package decoders

import (
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)
Expand All @@ -12,6 +13,7 @@ func DefaultDecoders() []Decoder {
&Base64{},
&UTF16{},
&EscapedUnicode{},
&HtmlEntity{},
}
}

Expand All @@ -23,21 +25,22 @@ type DecodableChunk struct {
}

type Decoder interface {
FromChunk(chunk *sources.Chunk) *DecodableChunk
FromChunk(ctx context.Context, chunk *sources.Chunk) *DecodableChunk
Type() detectorspb.DecoderType
}

// Fuzz is an entrypoint for go-fuzz, which is an AFL-style fuzzing tool.
// This one attempts to uncover any panics during decoding.
func Fuzz(data []byte) int {
decoded := false
ctx := context.Background()
for i, decoder := range DefaultDecoders() {
// Skip the first decoder (plain), because it will always decode and give
// priority to the input (return 1).
if i == 0 {
continue
}
chunk := decoder.FromChunk(&sources.Chunk{Data: data})
chunk := decoder.FromChunk(ctx, &sources.Chunk{Data: data})
if chunk != nil {
decoded = true
}
Expand Down
5 changes: 3 additions & 2 deletions pkg/decoders/escaped_unicode.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"strconv"
"unicode/utf8"

"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)
Expand All @@ -18,7 +19,7 @@ var _ Decoder = (*EscapedUnicode)(nil)
// https://dencode.com/en/string/unicode-escape
var (
// Standard Unicode notation.
//https://unicode.org/standard/principles.html
// https://unicode.org/standard/principles.html
codePointPat = regexp.MustCompile(`\bU\+([a-fA-F0-9]{4}).?`)

// Common escape sequence used in programming languages.
Expand All @@ -29,7 +30,7 @@ func (d *EscapedUnicode) Type() detectorspb.DecoderType {
return detectorspb.DecoderType_ESCAPED_UNICODE
}

func (d *EscapedUnicode) FromChunk(chunk *sources.Chunk) *DecodableChunk {
func (d *EscapedUnicode) FromChunk(_ context.Context, chunk *sources.Chunk) *DecodableChunk {
if chunk == nil || len(chunk.Data) == 0 {
return nil
}
Expand Down
3 changes: 2 additions & 1 deletion pkg/decoders/escaped_unicode_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (

"github.com/kylelemons/godebug/pretty"

"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)

Expand Down Expand Up @@ -68,7 +69,7 @@ func TestUnicodeEscape_FromChunk(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
d := &EscapedUnicode{}
got := d.FromChunk(tt.chunk)
got := d.FromChunk(context.Background(), tt.chunk)
if tt.want != nil {
if got == nil {
t.Fatal("got nil, did not want nil")
Expand Down
219 changes: 219 additions & 0 deletions pkg/decoders/html_entity.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
package decoders

import (
"bytes"
"errors"
"regexp"
"strconv"
"strings"
"sync"

ahocorasick "github.com/BobuSumisu/aho-corasick"
"github.com/go-logr/logr"
"golang.org/x/exp/maps"

"github.com/trufflesecurity/trufflehog/v3/pkg/context"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)

// HtmlEntity is a Decoder that rewrites HTML character references found in a
// chunk: decimal (`&#65;`), hexadecimal (`&#x41;`), and a fixed table of named
// entities (`&amp;`). The type has no fields.
// Reference table: https://www.ee.ucl.ac.uk/~mflanaga/java/HTMLandASCIItableC1.html
type HtmlEntity struct{}

var (
// Compile-time assertion that *HtmlEntity implements Decoder.
_ Decoder = (*HtmlEntity)(nil)

// once wraps the construction of htmlTrie performed in init.
once sync.Once
// htmlTrie is the Aho-Corasick matcher built in init and consulted by
// FromChunk before any regular expression runs.
htmlTrie *ahocorasick.Trie
)

func init() {
// Use Aho-Corasick to pre-filter potential matches.
once.Do(func() {
keywords := map[string]struct{}{
`&#`: {}, // decimal
`&#x`: {}, // hex
}
for entity := range namedEntityMap {
keywords[strings.ToLower(entity)] = struct{}{}
}
htmlTrie = ahocorasick.NewTrieBuilder().AddStrings(maps.Keys(keywords)).Build()
})
}

// Type identifies this decoder as the HTML entity decoder.
func (d *HtmlEntity) Type() detectorspb.DecoderType {
	decoderType := detectorspb.DecoderType_HTML
	return decoderType
}

// FromChunk decodes HTML entities in chunk.Data and returns the result in a
// new DecodableChunk whose metadata fields are copied from chunk.
//
// It returns nil when chunk is nil or empty, when the keyword pre-filter finds
// no entity marker, when none of the entity patterns match, or when decoding
// left the data unchanged (for example, only out-of-range references such as
// `&#999;` were present), so callers never receive a duplicate of the input.
//
// Named entities are decoded first, then decimal, then hexadecimal references;
// each pass operates on the output of the previous one.
func (d *HtmlEntity) FromChunk(ctx context.Context, chunk *sources.Chunk) *DecodableChunk {
	if chunk == nil || len(chunk.Data) == 0 {
		return nil
	}
	// Cheap keyword scan so the regular expressions only run on likely candidates.
	if htmlTrie.MatchFirst(chunk.Data) == nil {
		return nil
	}

	logger := ctx.Logger().WithName("decoders.html")
	// Decode a private copy of the data (kept from the original implementation,
	// which notes the copy is necessary to avoid data races).
	chunkData := bytes.Clone(chunk.Data)
	matched := false

	if namedEntityPat.Match(chunkData) {
		matched = true
		chunkData = decodeNamedEntities(logger, chunkData)
	}
	if decimalEntityPat.Match(chunkData) {
		matched = true
		chunkData = decodeHtmlDecimal(logger, chunkData)
	}
	if hexEntityPat.Match(chunkData) {
		matched = true
		chunkData = decodeHtmlHex(logger, chunkData)
	}

	// A pattern can match without anything being decodable; in that case the
	// output equals the input and there is nothing new to report.
	if !matched || bytes.Equal(chunkData, chunk.Data) {
		return nil
	}

	return &DecodableChunk{
		DecoderType: d.Type(),
		Chunk: &sources.Chunk{
			Data:           chunkData,
			SourceName:     chunk.SourceName,
			SourceID:       chunk.SourceID,
			JobID:          chunk.JobID,
			SecretID:       chunk.SecretID,
			SourceMetadata: chunk.SourceMetadata,
			SourceType:     chunk.SourceType,
			Verify:         chunk.Verify,
		},
	}
}

// decimalEntityPat matches a decimal character reference of one to three
// digits and captures the digits.
// `A` = `&#65;`
var decimalEntityPat = regexp.MustCompile(`&#(\d{1,3});`)

// decodeHtmlDecimal replaces every decimal character reference matched by
// decimalEntityPat (for example `&#65;`) with the single byte it denotes.
//
// References whose value does not fit in a byte (256-999) are logged through
// logger and left in the output verbatim. The input slice is not modified; a
// newly allocated slice is returned.
func decodeHtmlDecimal(logger logr.Logger, input []byte) []byte {
	decoded := make([]byte, 0, len(input))
	lastIndex := 0

	for _, match := range decimalEntityPat.FindAllSubmatchIndex(input, -1) {
		// match[0]:match[1] spans the whole entity, match[2]:match[3] the digits.
		digits := input[match[2]:match[3]]

		// Validate before copying anything. On failure lastIndex is left
		// untouched, so the raw entity text is emitted exactly once as part of
		// the next literal run instead of being duplicated.
		num, err := strconv.Atoi(string(digits))
		if err != nil {
			continue
		}
		if num < 0 || num > 255 {
			logger.Error(errors.New("invalid decimal byte"), "Unable to decode HTML entity", "match", digits, "byte", num)
			continue
		}

		// Copy the literal text preceding the entity, then the decoded byte.
		decoded = append(decoded, input[lastIndex:match[0]]...)
		decoded = append(decoded, byte(num))
		lastIndex = match[1]
	}

	// Append whatever follows the last decoded entity.
	decoded = append(decoded, input[lastIndex:]...)

	return decoded
}

// hexEntityPat matches a hexadecimal character reference of one or two hex
// digits (case-insensitive) and captures the digits.
// `A` = `&#x41;`
var hexEntityPat = regexp.MustCompile(`(?i)&#x([a-f0-9]{1,2});`)

// decodeHtmlHex replaces every hexadecimal character reference matched by
// hexEntityPat (for example `&#x41;`) with the single byte it denotes.
//
// The input slice is not modified; a newly allocated slice is returned. A
// reference that cannot be parsed or does not fit in a byte is left in the
// output verbatim.
func decodeHtmlHex(logger logr.Logger, input []byte) []byte {
	decoded := make([]byte, 0, len(input))
	lastIndex := 0

	for _, match := range hexEntityPat.FindAllSubmatchIndex(input, -1) {
		// match[0]:match[1] spans the whole entity, match[2]:match[3] the hex digits.
		hexDigits := input[match[2]:match[3]]

		// Validate before copying anything. On failure lastIndex is left
		// untouched, so the raw entity text is emitted exactly once as part of
		// the next literal run instead of being duplicated.
		num, err := strconv.ParseInt(string(hexDigits), 16, 32)
		if err != nil {
			continue
		}
		if num < 0 || num > 255 {
			logger.Error(errors.New("invalid hex byte"), "Unable to decode HTML entity", "match", hexDigits, "byte", num)
			continue
		}

		// Copy the literal text preceding the entity, then the decoded byte.
		decoded = append(decoded, input[lastIndex:match[0]]...)
		decoded = append(decoded, byte(num))
		lastIndex = match[1]
	}

	// Append whatever follows the last decoded entity.
	decoded = append(decoded, input[lastIndex:]...)

	return decoded
}

var (
	// namedEntityMap maps lower-cased named character references to the bytes
	// they decode to. Lookups must lower-case the matched text first.
	// https://www.compart.com/en/unicode/html
	namedEntityMap = map[string][]byte{
		"&tab;":              []byte("\t"), // horizontal tab, written as an escape so it survives reformatting
		"&newline;":          []byte("\n"),
		"&excl;":             []byte("!"),
		"&quot;":             []byte(`"`),
		"&num;":              []byte("#"),
		"&dollar;":           []byte("$"),
		"&percnt;":           []byte("%"),
		"&amp;":              []byte("&"),
		"&apos;":             []byte("'"),
		"&lpar;":             []byte("("),
		"&rpar;":             []byte(")"),
		"&ast;":              []byte("*"),
		"&plus;":             []byte("+"),
		"&comma;":            []byte(","),
		"&period;":           []byte("."),
		"&sol;":              []byte("/"),
		"&colon;":            []byte(":"),
		"&semi;":             []byte(";"),
		"&lt;":               []byte("<"),
		"&equals;":           []byte("="),
		"&gt;":               []byte(">"),
		"&quest;":            []byte("?"),
		"&commat;":           []byte("@"),
		"&lsqb;":             []byte("["),
		"&bsol;":             []byte("\\"),
		"&rsqb;":             []byte("]"),
		"&hat;":              []byte("^"),
		"&underbar;":         []byte("_"),
		"&diacriticalgrave;": []byte("`"),
		"&lcub;":             []byte("{"),
		"&verticalline;":     []byte("|"),
		"&rcub;":             []byte("}"),
		"&nonbreakingspace;": []byte(" "), // NOTE(review): decodes to an ASCII space, not U+00A0 — confirm this is intended
	}

	// namedEntityPat is a case-insensitive alternation of every key in
	// namedEntityMap. The keys contain no regular-expression metacharacters,
	// so they are joined without quoting.
	namedEntityPat = regexp.MustCompile(
		"(?i)(" + strings.Join(maps.Keys(namedEntityMap), "|") + ")")
)

// decodeNamedEntities returns a copy of input in which every match of
// namedEntityPat is replaced by its entry in namedEntityMap. The lookup is
// done on the lower-cased match; a match with no table entry is kept as-is.
// The logger argument is unused.
func decodeNamedEntities(_ logr.Logger, input []byte) []byte {
	substitute := func(entity []byte) []byte {
		key := strings.ToLower(string(entity))
		decoded, known := namedEntityMap[key]
		if !known {
			return entity
		}
		return decoded
	}
	return namedEntityPat.ReplaceAllFunc(input, substitute)
}
Loading