-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
1,357 additions
and
1,039 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,7 @@ func DefaultDecoders() []Decoder { | |
&Base64{}, | ||
&UTF16{}, | ||
&EscapedUnicode{}, | ||
&HtmlEntity{}, | ||
} | ||
} | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,208 @@ | ||
package decoders | ||
|
||
import ( | ||
"bytes" | ||
"regexp" | ||
"strconv" | ||
"strings" | ||
"sync" | ||
|
||
ahocorasick "github.com/BobuSumisu/aho-corasick" | ||
"golang.org/x/exp/maps" | ||
|
||
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb" | ||
"github.com/trufflesecurity/trufflehog/v3/pkg/sources" | ||
) | ||
|
||
// HtmlEntity decodes characters that are encoded as decimal, hexadecimal, or named entities. | ||
// https://www.ee.ucl.ac.uk/~mflanaga/java/HTMLandASCIItableC1.html | ||
type HtmlEntity struct{} | ||
|
||
var ( | ||
_ Decoder = (*HtmlEntity)(nil) | ||
|
||
once sync.Once | ||
htmlTrie *ahocorasick.Trie | ||
) | ||
|
||
func init() { | ||
// Use Aho-Corasick to pre-filter potential matches. | ||
once.Do(func() { | ||
keywords := map[string]struct{}{ | ||
`&#`: {}, // decimal | ||
`&#x`: {}, // hex | ||
} | ||
for entity := range namedEntityMap { | ||
keywords[strings.ToLower(entity)] = struct{}{} | ||
} | ||
htmlTrie = ahocorasick.NewTrieBuilder().AddStrings(maps.Keys(keywords)).Build() | ||
}) | ||
} | ||
|
||
func (d *HtmlEntity) Type() detectorspb.DecoderType { | ||
return detectorspb.DecoderType_HTML | ||
} | ||
|
||
func (d *HtmlEntity) FromChunk(chunk *sources.Chunk) *DecodableChunk { | ||
if chunk == nil || len(chunk.Data) == 0 { | ||
return nil | ||
} else if m := htmlTrie.MatchFirst(chunk.Data); m == nil { | ||
return nil | ||
} | ||
|
||
var ( | ||
// Necessary to avoid data races. | ||
chunkData = bytes.Clone(chunk.Data) | ||
matched = false | ||
) | ||
if namedEntityPat.Match(chunkData) { | ||
matched = true | ||
chunk.Data = decodeNamedEntities(chunkData) | ||
} | ||
if decimalEntityPat.Match(chunkData) { | ||
matched = true | ||
chunk.Data = decodeHtmlDecimal(chunkData) | ||
} | ||
if hexEntityPat.Match(chunkData) { | ||
matched = true | ||
chunk.Data = decodeHtmlHex(chunkData) | ||
} | ||
|
||
if matched { | ||
return &DecodableChunk{ | ||
DecoderType: d.Type(), | ||
Chunk: &sources.Chunk{ | ||
Data: chunkData, | ||
SourceName: chunk.SourceName, | ||
SourceID: chunk.SourceID, | ||
JobID: chunk.JobID, | ||
SecretID: chunk.SecretID, | ||
SourceMetadata: chunk.SourceMetadata, | ||
SourceType: chunk.SourceType, | ||
Verify: chunk.Verify, | ||
}, | ||
} | ||
} else { | ||
return nil | ||
} | ||
} | ||
|
||
// decimalEntityPat matches a decimal numeric character reference of one to
// three digits and captures the digits, e.g. `&#65;` = `A`.
var decimalEntityPat = regexp.MustCompile(`&#(\d{1,3});`)

// decodeHtmlDecimal returns a copy of input in which every match of
// decimalEntityPat is replaced by the UTF-8 encoding of the referenced code
// point. All other bytes are copied through unchanged, and input itself is
// not modified.
func decodeHtmlDecimal(input []byte) []byte {
	decoded := make([]byte, 0, len(input))
	lastIndex := 0

	for _, match := range decimalEntityPat.FindAllSubmatchIndex(input, -1) {
		// match[0]:match[1] is the whole entity, match[2]:match[3] its digits.
		num, err := strconv.Atoi(string(input[match[2]:match[3]]))
		if err != nil {
			// Leave the entity as-is: lastIndex is not advanced, so it is
			// copied verbatim together with the next literal span.
			continue
		}

		// Copy the literal text preceding the entity.
		decoded = append(decoded, input[lastIndex:match[0]]...)

		// The pattern allows values up to 999, which do not fit in one byte;
		// encode the code point as UTF-8 instead of truncating it.
		decoded = append(decoded, string(rune(num))...)

		lastIndex = match[1]
	}

	// Append whatever follows the last decoded entity.
	return append(decoded, input[lastIndex:]...)
}
|
||
// hexEntityPat matches, case-insensitively, a hexadecimal numeric character
// reference of one or two hex digits and captures the digits,
// e.g. `&#x41;` = `A`.
var hexEntityPat = regexp.MustCompile(`(?i)&#x([a-f0-9]{1,2});`)

// decodeHtmlHex returns a copy of input in which every match of hexEntityPat
// is replaced by the UTF-8 encoding of the referenced code point. All other
// bytes are copied through unchanged, and input itself is not modified.
func decodeHtmlHex(input []byte) []byte {
	decoded := make([]byte, 0, len(input))
	lastIndex := 0

	for _, match := range hexEntityPat.FindAllSubmatchIndex(input, -1) {
		// match[0]:match[1] is the whole entity, match[2]:match[3] its digits.
		num, err := strconv.ParseInt(string(input[match[2]:match[3]]), 16, 32)
		if err != nil {
			// Leave the entity as-is: lastIndex is not advanced, so it is
			// copied verbatim together with the next literal span.
			continue
		}

		// Copy the literal text preceding the entity.
		decoded = append(decoded, input[lastIndex:match[0]]...)

		// Values 0x80-0xFF are code points, not raw bytes; emit them as UTF-8
		// so the result stays valid text.
		decoded = append(decoded, string(rune(num))...)

		lastIndex = match[1]
	}

	// Append whatever follows the last decoded entity.
	return append(decoded, input[lastIndex:]...)
}
|
||
var (
	// namedEntityMap maps a lowercased named character reference to the bytes
	// it decodes to. Keys must stay lowercase because decodeNamedEntities
	// lowercases each match before looking it up.
	// https://www.compart.com/en/unicode/html
	//
	// NOTE(review): `&nonbreakingspace;` decodes to a plain ASCII space here
	// rather than U+00A0 — confirm that this is intended.
	namedEntityMap = map[string][]byte{
		"&tab;":              []byte("\t"),
		"&newline;":          []byte("\n"),
		"&excl;":             []byte("!"),
		"&quot;":             []byte(`"`),
		"&num;":              []byte("#"),
		"&dollar;":           []byte("$"),
		"&percnt;":           []byte("%"),
		"&amp;":              []byte("&"),
		"&apos;":             []byte("'"),
		"&lpar;":             []byte("("),
		"&rpar;":             []byte(")"),
		"&ast;":              []byte("*"),
		"&plus;":             []byte("+"),
		"&comma;":            []byte(","),
		"&period;":           []byte("."),
		"&sol;":              []byte("/"),
		"&colon;":            []byte(":"),
		"&semi;":             []byte(";"),
		"&lt;":               []byte("<"),
		"&equals;":           []byte("="),
		"&gt;":               []byte(">"),
		"&quest;":            []byte("?"),
		"&commat;":           []byte("@"),
		"&lsqb;":             []byte("["),
		"&bsol;":             []byte("\\"),
		"&rsqb;":             []byte("]"),
		"&hat;":              []byte("^"),
		"&underbar;":         []byte("_"),
		"&diacriticalgrave;": []byte("`"),
		"&lcub;":             []byte("{"),
		"&verticalline;":     []byte("|"),
		"&rcub;":             []byte("}"),
		"&nonbreakingspace;": []byte(" "),
	}

	// namedEntityPat is a case-insensitive alternation of every key in
	// namedEntityMap. Keys are quoted so that a key containing a regexp
	// metacharacter can never change the meaning of the pattern.
	namedEntityPat = func() *regexp.Regexp {
		alternatives := make([]string, 0, len(namedEntityMap))
		for entity := range namedEntityMap {
			alternatives = append(alternatives, regexp.QuoteMeta(entity))
		}
		return regexp.MustCompile("(?i)(" + strings.Join(alternatives, "|") + ")")
	}()
)
|
||
func decodeNamedEntities(input []byte) []byte { | ||
return namedEntityPat.ReplaceAllFunc(input, func(match []byte) []byte { | ||
m := strings.ToLower(string(match)) | ||
if replacement, ok := namedEntityMap[m]; ok { | ||
return replacement | ||
} | ||
return match | ||
}) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
package decoders | ||
|
||
import ( | ||
"testing" | ||
|
||
"github.com/kylelemons/godebug/pretty" | ||
|
||
"github.com/trufflesecurity/trufflehog/v3/pkg/sources" | ||
) | ||
|
||
func TestHtmlEntity_FromChunk(t *testing.T) { | ||
tests := []struct { | ||
name string | ||
chunk *sources.Chunk | ||
want *sources.Chunk | ||
wantErr bool | ||
}{ | ||
//  | ||
{ | ||
name: "[decimal] all encoded", | ||
chunk: &sources.Chunk{ | ||
Data: []byte("token: "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""), | ||
}, | ||
want: &sources.Chunk{ | ||
Data: []byte("token: \"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\""), | ||
}, | ||
}, | ||
{ | ||
name: "[decimal] mixed content", | ||
chunk: &sources.Chunk{ | ||
Data: []byte(`token: "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"`), | ||
}, | ||
want: &sources.Chunk{ | ||
Data: []byte(`token: "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"`), | ||
}, | ||
}, | ||
//  | ||
{ | ||
name: "[hex] all encoded", | ||
chunk: &sources.Chunk{ | ||
Data: []byte("token: "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""), | ||
}, | ||
want: &sources.Chunk{ | ||
Data: []byte(`token: "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"`), | ||
}, | ||
}, | ||
{ | ||
name: "[hex] mixed content", | ||
chunk: &sources.Chunk{ | ||
Data: []byte(`token: "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"`), | ||
}, | ||
want: &sources.Chunk{ | ||
Data: []byte(`token: "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"`), | ||
}, | ||
}, | ||
// " | ||
{ | ||
name: "[named] all encoded", | ||
chunk: &sources.Chunk{ | ||
Data: []byte("	
!"#$%&'()*+,./:;<=>?@[\]^_`{|} "), | ||
}, | ||
want: &sources.Chunk{ | ||
Data: []byte("\t\n!\"#$%&'()*+,./:;<=>?@[\\]^_`{|} "), | ||
}, | ||
}, | ||
{ | ||
name: "[named] mixed content", | ||
chunk: &sources.Chunk{ | ||
Data: []byte("\t
!"#$%&'()*+,./:;<=>?@[\\]^_`{|} "), | ||
}, | ||
want: &sources.Chunk{ | ||
Data: []byte("\t\n!\"#$%&'()*+,./:;<=>?@[\\]^_`{|} "), | ||
}, | ||
}, | ||
|
||
// nothing | ||
{ | ||
name: "no escaped", | ||
chunk: &sources.Chunk{ | ||
Data: []byte(`-//npm.fontawesome.com/:_authToken=12345678-2323-1111-1111-12345670B312 | ||
+//npm.fontawesome.com/:_authToken=REMOVED_TOKEN`), | ||
}, | ||
want: nil, | ||
}, | ||
} | ||
|
||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
d := &HtmlEntity{} | ||
got := d.FromChunk(tt.chunk) | ||
if tt.want != nil { | ||
if got == nil { | ||
t.Fatal("got nil, did not want nil") | ||
} | ||
if diff := pretty.Compare(string(tt.want.Data), string(got.Data)); diff != "" { | ||
t.Errorf("HtmlEntity.FromChunk() %s diff: (-want +got)\n%s", tt.name, diff) | ||
} | ||
} else { | ||
if got != nil { | ||
t.Error("Expected nil chunk") | ||
} | ||
} | ||
}) | ||
} | ||
} |
Oops, something went wrong.