Skip to content

Commit

Permalink
feat(decoders): HTML entities
Browse files Browse the repository at this point in the history
  • Loading branch information
rgmz committed Dec 15, 2024
1 parent e932ea9 commit 3676e9b
Show file tree
Hide file tree
Showing 5 changed files with 1,357 additions and 1,039 deletions.
1 change: 1 addition & 0 deletions pkg/decoders/decoders.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ func DefaultDecoders() []Decoder {
&Base64{},
&UTF16{},
&EscapedUnicode{},
&HtmlEntity{},
}
}

Expand Down
208 changes: 208 additions & 0 deletions pkg/decoders/html_entity.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
package decoders

import (
"bytes"
"regexp"
"strconv"
"strings"
"sync"

ahocorasick "github.com/BobuSumisu/aho-corasick"
"golang.org/x/exp/maps"

"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)

// HtmlEntity decodes characters that are encoded as decimal (`&#65;`),
// hexadecimal (`&#x41;`), or named (`&quot;`) HTML entities.
// The type has no fields.
// https://www.ee.ucl.ac.uk/~mflanaga/java/HTMLandASCIItableC1.html
type HtmlEntity struct{}

var (
	// Compile-time assertion that *HtmlEntity implements Decoder.
	_ Decoder = (*HtmlEntity)(nil)

	// once wraps the construction of htmlTrie performed in init.
	once sync.Once
	// htmlTrie is the Aho-Corasick trie of entity keywords that FromChunk
	// consults before running any regular expression.
	htmlTrie *ahocorasick.Trie
)

func init() {
// Use Aho-Corasick to pre-filter potential matches.
once.Do(func() {
keywords := map[string]struct{}{
`&#`: {}, // decimal
`&#x`: {}, // hex
}
for entity := range namedEntityMap {
keywords[strings.ToLower(entity)] = struct{}{}
}
htmlTrie = ahocorasick.NewTrieBuilder().AddStrings(maps.Keys(keywords)).Build()
})
}

// Type returns the decoder type for HtmlEntity, detectorspb.DecoderType_HTML.
func (d *HtmlEntity) Type() detectorspb.DecoderType {
	return detectorspb.DecoderType_HTML
}

// FromChunk decodes the HTML entities found in chunk.Data and returns the result
// wrapped in a new DecodableChunk.
//
// It returns nil when chunk is nil or has no data, when the htmlTrie pre-filter
// finds no entity keyword, or when none of the entity patterns match.
//
// The input chunk is not modified: decoding is done on a copy of chunk.Data and
// the remaining fields are copied into the returned chunk.
func (d *HtmlEntity) FromChunk(chunk *sources.Chunk) *DecodableChunk {
	if chunk == nil || len(chunk.Data) == 0 {
		return nil
	}
	// Cheap pre-filter: skip the regular expressions when no keyword is present.
	if htmlTrie.MatchFirst(chunk.Data) == nil {
		return nil
	}

	var (
		// Necessary to avoid data races: all decoding works on this private copy,
		// never on chunk.Data itself.
		chunkData = bytes.Clone(chunk.Data)
		matched   = false
	)
	// Each pass stores its output back into chunkData so that the next pass (and
	// the returned chunk) sees the decoded bytes. Assigning to chunk.Data here
	// would mutate the caller's chunk and return the undecoded copy.
	if namedEntityPat.Match(chunkData) {
		matched = true
		chunkData = decodeNamedEntities(chunkData)
	}
	if decimalEntityPat.Match(chunkData) {
		matched = true
		chunkData = decodeHtmlDecimal(chunkData)
	}
	if hexEntityPat.Match(chunkData) {
		matched = true
		chunkData = decodeHtmlHex(chunkData)
	}

	if !matched {
		return nil
	}

	return &DecodableChunk{
		DecoderType: d.Type(),
		Chunk: &sources.Chunk{
			Data:           chunkData,
			SourceName:     chunk.SourceName,
			SourceID:       chunk.SourceID,
			JobID:          chunk.JobID,
			SecretID:       chunk.SecretID,
			SourceMetadata: chunk.SourceMetadata,
			SourceType:     chunk.SourceType,
			Verify:         chunk.Verify,
		},
	}
}

// decimalEntityPat matches a decimal character reference of one to three digits
// and captures the digits, e.g. `&#65;` = `A`.
var decimalEntityPat = regexp.MustCompile(`&#(\d{1,3});`)

// decodeHtmlDecimal returns a copy of input in which every decimal entity whose
// value fits in a single byte (0-255) is replaced by that byte.
//
// The pattern also admits the values 256-999. Those cannot be represented as one
// byte, so they are left in the output exactly as written rather than being
// truncated to an unrelated byte.
func decodeHtmlDecimal(input []byte) []byte {
	decoded := make([]byte, 0, len(input))
	lastIndex := 0

	for _, match := range decimalEntityPat.FindAllSubmatchIndex(input, -1) {
		startIndex, endIndex := match[0], match[1]
		digits := input[match[2]:match[3]]

		// bitSize 8 makes ParseUint reject anything above 255.
		num, err := strconv.ParseUint(string(digits), 10, 8)
		if err != nil {
			// Nothing has been appended and lastIndex is unchanged, so the
			// entity is copied verbatim along with the following segment.
			continue
		}

		// Copy the part of the input until the start of the entity
		decoded = append(decoded, input[lastIndex:startIndex]...)
		// Append the decoded byte
		decoded = append(decoded, byte(num))

		lastIndex = endIndex
	}

	// Append the remaining part of the input
	return append(decoded, input[lastIndex:]...)
}

// hexEntityPat matches, case-insensitively, a hexadecimal character reference of
// one or two hex digits and captures the digits, e.g. `&#x41;` = `A`.
var hexEntityPat = regexp.MustCompile(`(?i)&#x([a-f0-9]{1,2});`)

// decodeHtmlHex returns a copy of input in which every match of hexEntityPat is
// replaced by the single byte whose value the hex digits spell out.
func decodeHtmlHex(input []byte) []byte {
	var (
		out  = make([]byte, 0, len(input))
		prev int
	)

	for _, loc := range hexEntityPat.FindAllSubmatchIndex(input, -1) {
		// loc holds [entityStart, entityEnd, digitsStart, digitsEnd].
		out = append(out, input[prev:loc[0]]...)

		value, err := strconv.ParseInt(string(input[loc[2]:loc[3]]), 16, 32)
		if err == nil {
			out = append(out, byte(value))
			prev = loc[1]
		}
	}

	// Whatever follows the last entity is carried over unchanged.
	return append(out, input[prev:]...)
}

var (
	// namedEntityMap maps the lower-cased name of each supported named entity to
	// the bytes it decodes to. Keys must be lower case because lookups lower-case
	// the matched text first.
	// https://www.compart.com/en/unicode/html
	namedEntityMap = map[string][]byte{
		"&tab;":              []byte("\t"),
		"&newline;":          []byte("\n"),
		"&excl;":             []byte("!"),
		"&quot;":             []byte(`"`),
		"&num;":              []byte("#"),
		"&dollar;":           []byte("$"),
		"&percnt;":           []byte("%"),
		"&amp;":              []byte("&"),
		"&apos;":             []byte("'"),
		"&lpar;":             []byte("("),
		"&rpar;":             []byte(")"),
		"&ast;":              []byte("*"),
		"&plus;":             []byte("+"),
		"&comma;":            []byte(","),
		"&period;":           []byte("."),
		"&sol;":              []byte("/"),
		"&colon;":            []byte(":"),
		"&semi;":             []byte(";"),
		"&lt;":               []byte("<"),
		"&equals;":           []byte("="),
		"&gt;":               []byte(">"),
		"&quest;":            []byte("?"),
		"&commat;":           []byte("@"),
		"&lsqb;":             []byte("["),
		"&bsol;":             []byte("\\"),
		"&rsqb;":             []byte("]"),
		"&hat;":              []byte("^"),
		"&underbar;":         []byte("_"),
		"&diacriticalgrave;": []byte("`"),
		"&lcub;":             []byte("{"),
		"&verticalline;":     []byte("|"),
		"&rcub;":             []byte("}"),
		"&nonbreakingspace;": []byte(" "),
	}
	// namedEntityPat is a case-insensitive alternation of every key in
	// namedEntityMap. Keys are quoted so that a key can never be interpreted as
	// regular-expression syntax.
	namedEntityPat = func() *regexp.Regexp {
		alternatives := make([]string, 0, len(namedEntityMap))
		for entity := range namedEntityMap {
			alternatives = append(alternatives, regexp.QuoteMeta(entity))
		}
		return regexp.MustCompile("(?i)(" + strings.Join(alternatives, "|") + ")")
	}()
)

func decodeNamedEntities(input []byte) []byte {
return namedEntityPat.ReplaceAllFunc(input, func(match []byte) []byte {
m := strings.ToLower(string(match))
if replacement, ok := namedEntityMap[m]; ok {
return replacement
}
return match
})
}
105 changes: 105 additions & 0 deletions pkg/decoders/html_entity_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
package decoders

import (
"testing"

"github.com/kylelemons/godebug/pretty"

"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)

// TestHtmlEntity_FromChunk checks that FromChunk decodes decimal, hexadecimal,
// and named entities (fully encoded and mixed with plain text), and that it
// returns nil for input containing no entities.
func TestHtmlEntity_FromChunk(t *testing.T) {
	tests := []struct {
		name  string
		chunk *sources.Chunk
		want  *sources.Chunk
	}{
		// &#01;
		{
			name: "[decimal] all encoded",
			chunk: &sources.Chunk{
				Data: []byte("&#116;&#111;&#107;&#101;&#110;&#58;&#32;&#34;&#103;&#104;&#112;&#95;&#73;&#119;&#100;&#77;&#120;&#57;&#87;&#70;&#87;&#82;&#82;&#102;&#77;&#104;&#84;&#89;&#105;&#97;&#86;&#106;&#90;&#55;&#56;&#74;&#102;&#117;&#97;&#109;&#118;&#110;&#48;&#89;&#87;&#82;&#77;&#48;&#34;"),
			},
			want: &sources.Chunk{
				Data: []byte("token: \"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\""),
			},
		},
		{
			name: "[decimal] mixed content",
			chunk: &sources.Chunk{
				Data: []byte(`token: "&#103;&#104;&#112;_IwdMx9WFWRRfMhTYiaVjZ78Jfuamvn0YWRM0"`),
			},
			want: &sources.Chunk{
				Data: []byte(`token: "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"`),
			},
		},
		// &#x1;
		{
			name: "[hex] all encoded",
			chunk: &sources.Chunk{
				Data: []byte("&#x74;&#x6f;&#x6b;&#x65;&#x6e;&#x3a;&#x20;&#x22;&#x67;&#x68;&#x70;&#x5f;&#x49;&#x77;&#x64;&#x4d;&#x78;&#x39;&#x57;&#x46;&#x57;&#x52;&#x52;&#x66;&#x4d;&#x68;&#x54;&#x59;&#x69;&#x61;&#x56;&#x6a;&#x5a;&#x37;&#x38;&#x4a;&#x66;&#x75;&#x61;&#x6d;&#x76;&#x6e;&#x30;&#x59;&#x57;&#x52;&#x4d;&#x30;&#x22;"),
			},
			want: &sources.Chunk{
				Data: []byte(`token: "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"`),
			},
		},
		{
			// Previously this name was attached to a named-entity input, so hex
			// entities mixed with plain text were never exercised.
			name: "[hex] mixed content",
			chunk: &sources.Chunk{
				Data: []byte(`token: "&#x67;&#x68;&#x70;_IwdMx9WFWRRfMhTYiaVjZ78Jfuamvn0YWRM0"`),
			},
			want: &sources.Chunk{
				Data: []byte(`token: "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"`),
			},
		},
		// &quot;
		{
			name: "[named] all encoded",
			chunk: &sources.Chunk{
				Data: []byte("&Tab;&NewLine;&excl;&quot;&num;&dollar;&percnt;&amp;&apos;&lpar;&rpar;&ast;&plus;&comma;&period;&sol;&colon;&semi;&lt;&equals;&gt;&quest;&commat;&lsqb;&bsol;&rsqb;&Hat;&UnderBar;&DiacriticalGrave;&lcub;&VerticalLine;&rcub;&NonBreakingSpace;"),
			},
			want: &sources.Chunk{
				Data: []byte("\t\n!\"#$%&'()*+,./:;<=>?@[\\]^_`{|} "),
			},
		},
		{
			name: "[named] mixed content",
			chunk: &sources.Chunk{
				Data: []byte("\t&NewLine;!&quot;#&dollar;%&amp;'&lpar;)&ast;+&comma;.&sol;:&semi;<&equals;>&quest;@&lsqb;\\&rsqb;^&UnderBar;`&lcub;|&rcub;&NonBreakingSpace;"),
			},
			want: &sources.Chunk{
				Data: []byte("\t\n!\"#$%&'()*+,./:;<=>?@[\\]^_`{|} "),
			},
		},
		{
			name: "[named] mixed case in a token",
			chunk: &sources.Chunk{
				Data: []byte(`token&colon; "ghp&UnderBar;IwdMx9WFWRRfMhTYiaVjZ78Jfuamvn0YWRM0"`),
			},
			want: &sources.Chunk{
				Data: []byte(`token: "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"`),
			},
		},

		// nothing
		{
			name: "no escaped",
			chunk: &sources.Chunk{
				Data: []byte(`-//npm.fontawesome.com/:_authToken=12345678-2323-1111-1111-12345670B312
+//npm.fontawesome.com/:_authToken=REMOVED_TOKEN`),
			},
			want: nil,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			d := &HtmlEntity{}
			got := d.FromChunk(tt.chunk)

			if tt.want == nil {
				if got != nil {
					t.Error("Expected nil chunk")
				}
				return
			}

			if got == nil {
				t.Fatal("got nil, did not want nil")
			}
			if diff := pretty.Compare(string(tt.want.Data), string(got.Data)); diff != "" {
				t.Errorf("HtmlEntity.FromChunk() %s diff: (-want +got)\n%s", tt.name, diff)
			}
		})
	}
}
Loading

0 comments on commit 3676e9b

Please sign in to comment.