feat(scrapingbee): tweak detections (#3820)
rgmz authored Jan 7, 2025
1 parent 827a201 · commit 6a4856c
Showing 2 changed files with 167 additions and 53 deletions.
88 changes: 56 additions & 32 deletions pkg/detectors/scrapingbee/scrapingbee.go
@@ -2,74 +2,98 @@ package scrapingbee
 
 import (
 	"context"
-	regexp "github.com/wasilibs/go-re2"
+	"fmt"
+	"io"
 	"net/http"
 	"strings"
 
+	regexp "github.com/wasilibs/go-re2"
+
 	"github.com/trufflesecurity/trufflehog/v3/pkg/common"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
 	"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
 )
 
-type Scanner struct{}
+type Scanner struct {
+	client *http.Client
+}
 
 // Ensure the Scanner satisfies the interface at compile time.
 var _ detectors.Detector = (*Scanner)(nil)
 
-var (
-	client = common.SaneHttpClient()
+func (s Scanner) Type() detectorspb.DetectorType {
+	return detectorspb.DetectorType_ScrapingBee
+}
 
-	// Make sure that your group is surrounded in boundary characters such as below to reduce false positives.
-	keyPat = regexp.MustCompile(detectors.PrefixRegex([]string{"scrapingbee"}) + `\b([A-Z0-9]{80})\b`)
-)
+func (s Scanner) Description() string {
+	return "ScrapingBee is a web scraping service that handles headless browsers and proxies for you. ScrapingBee API keys can be used to access and control web scraping tasks."
+}
 
 // Keywords are used for efficiently pre-filtering chunks.
 // Use identifiers in the secret preferably, or the provider name.
 func (s Scanner) Keywords() []string {
-	return []string{"scrapingbee"}
+	return []string{"scrapingbee", "scraping bee", "scraping-bee", "scraping_bee"}
 }
 
+var (
+	keyPat = regexp.MustCompile(detectors.PrefixRegex([]string{"scraping[ _-]?bee"}) + `\b([A-Z0-9]{80})\b`)
+)
+
 // FromData will find and optionally verify ScrapingBee secrets in a given set of bytes.
 func (s Scanner) FromData(ctx context.Context, verify bool, data []byte) (results []detectors.Result, err error) {
 	dataStr := string(data)
 
-	matches := keyPat.FindAllStringSubmatch(dataStr, -1)
-
-	for _, match := range matches {
-		if len(match) != 2 {
+	uniqueMatches := make(map[string]struct{})
+	for _, match := range keyPat.FindAllStringSubmatch(dataStr, -1) {
+		m := match[1]
+		if detectors.StringShannonEntropy(m) < 3.5 {
 			continue
 		}
-		resMatch := strings.TrimSpace(match[1])
+		uniqueMatches[m] = struct{}{}
+	}
 
-		s1 := detectors.Result{
+	for key := range uniqueMatches {
+		r := detectors.Result{
 			DetectorType: detectorspb.DetectorType_ScrapingBee,
-			Raw:          []byte(resMatch),
+			Raw:          []byte(key),
 		}
 
 		if verify {
-			req, err := http.NewRequestWithContext(ctx, "GET", "https://app.scrapingbee.com/api/v1/?api_key="+resMatch+"&url=https://httpbin.org/anything?json&render_js=false", nil)
-			if err != nil {
-				continue
-			}
-			res, err := client.Do(req)
-			if err == nil {
-				defer res.Body.Close()
-				if res.StatusCode >= 200 && res.StatusCode < 300 {
-					s1.Verified = true
-				}
+			if s.client == nil {
+				s.client = common.SaneHttpClient()
 			}
+
+			isVerified, verificationErr := verifyMatch(ctx, s.client, key)
+			r.Verified = isVerified
+			r.SetVerificationError(verificationErr, key)
 		}
 
-		results = append(results, s1)
+		results = append(results, r)
 	}
 
 	return results, nil
 }
 
-func (s Scanner) Type() detectorspb.DetectorType {
-	return detectorspb.DetectorType_ScrapingBee
-}
+func verifyMatch(ctx context.Context, client *http.Client, key string) (bool, error) {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, "https://app.scrapingbee.com/api/v1/?api_key="+key+"&url=https://httpbin.org/anything?json&render_js=false", nil)
+	if err != nil {
+		return false, err
+	}
 
-func (s Scanner) Description() string {
-	return "ScrapingBee is a web scraping service that handles headless browsers and proxies for you. ScrapingBee API keys can be used to access and control web scraping tasks."
+	res, err := client.Do(req)
+	if err != nil {
+		return false, err
+	}
+	defer func() {
+		_, _ = io.Copy(io.Discard, res.Body)
+		_ = res.Body.Close()
+	}()
+
+	switch res.StatusCode {
+	case http.StatusOK:
+		return true, nil
+	case http.StatusUnauthorized:
+		return false, nil
+	default:
+		return false, fmt.Errorf("unexpected status code: %d", res.StatusCode)
+	}
 }
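The main behavioral change above is the entropy gate: candidates that match `[A-Z0-9]{80}` but are too uniform (padding, separators, repeated filler) are now discarded before any verification request is made. Below is a minimal, self-contained sketch of that kind of character-level Shannon entropy check; the helper name `detectors.StringShannonEntropy` and the 3.5-bit cutoff come from the diff, but the implementation here is illustrative, not TruffleHog's actual code.

package main

import (
	"fmt"
	"math"
)

// shannonEntropy returns the Shannon entropy of s in bits per character,
// computed from the observed character frequencies.
func shannonEntropy(s string) float64 {
	if s == "" {
		return 0
	}
	freq := make(map[rune]float64)
	for _, r := range s {
		freq[r]++
	}
	n := float64(len([]rune(s)))
	var entropy float64
	for _, count := range freq {
		p := count / n
		entropy -= p * math.Log2(p)
	}
	return entropy
}

func main() {
	// A real-looking 80-char key over [A-Z0-9] (taken from the test file below)
	// approaches log2(36) ≈ 5.17 bits per character.
	key := "HOLTCTPVL3V35NK795FY6INWQKTTTSGL6W26JKHLC3CGUNJ1DV35SWPUAZ26BUX2EEZZ7T2SJBOL8WZB"
	// An 80-char string that matches the same regex but is obviously filler.
	filler := "AAAAAAAAAABBBBBBBBBBAAAAAAAAAABBBBBBBBBBAAAAAAAAAABBBBBBBBBBAAAAAAAAAABBBBBBBBBB"

	fmt.Printf("key:    %.2f bits (>= 3.5, kept)\n", shannonEntropy(key))
	fmt.Printf("filler: %.2f bits (< 3.5, skipped)\n", shannonEntropy(filler))
}

A truly random key over 36 symbols sits near 5.2 bits per character while repetitive filler lands near 1 bit, so a 3.5-bit threshold separates the two populations with room to spare.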
132 changes: 111 additions & 21 deletions pkg/detectors/scrapingbee/scrapingbee_test.go
@@ -2,7 +2,6 @@ package scrapingbee
 
 import (
 	"context"
-	"fmt"
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
@@ -11,42 +10,133 @@ import (
 	"github.com/trufflesecurity/trufflehog/v3/pkg/engine/ahocorasick"
 )
 
-var (
-	validPattern   = "HOLTCTPVL3V35NK795FY6INWQKTTTSGL6W26JKHLC3CGUNJ1DV35SWPUAZ26BUX2EEZZ7T2SJBOL8WZB"
-	invalidPattern = "HOLT?TPVL3V35NK795FY6INWQKTTTSGL6W26JKHLC3CGUNJ1DV35SWPUAZ26BUX2EEZZ7T2SJBOL8WZB"
-	keyword        = "scrapingbee"
-)
-
 func TestScrapingBee_Pattern(t *testing.T) {
-	d := Scanner{}
-	ahoCorasickCore := ahocorasick.NewAhoCorasickCore([]detectors.Detector{d})
 	tests := []struct {
 		name  string
 		input string
 		want  []string
 	}{
+		// True positives
+		{
+			name: `valid_query_param`,
+			input: `	#CHANGE API KEY TO CURRENT API KEY ON SCRAPINGBEE BELOW:
+	uri = URI("https://app.scrapingbee.com/api/v1/?api_key=VNC7VJ04BQLZWL821KJ4ZLG17ON45K4Y56P59QZMDNZBWRFAS0LIK47I3KFH6AMLUXPHIUIFBDOMIOUE&url=#{url}&stealth_proxy=True&country_code=sg&wait_browser=networkidle2&json_response=True&block_resources=False&block_ads=True&js_scenario=" + CGI.escape(js_scenario))`,
+			want: []string{`VNC7VJ04BQLZWL821KJ4ZLG17ON45K4Y56P59QZMDNZBWRFAS0LIK47I3KFH6AMLUXPHIUIFBDOMIOUE`},
+		},
+		{
+			name: `valid_function_comment`,
+			input: `func connectToScrapingBee() {
+	// API KEY = M977YHXCMPJJ569DSB0B8KSKL9NRU2O2327MIDT55785T8LS9TJGDW4GFMCMOZNRVN3GPSXF0Y6DGC32`,
+			want: []string{`M977YHXCMPJJ569DSB0B8KSKL9NRU2O2327MIDT55785T8LS9TJGDW4GFMCMOZNRVN3GPSXF0Y6DGC32`},
+		},
 		{
-			name:  "valid pattern - with keyword scrapingbee",
-			input: fmt.Sprintf("%s token = '%s'", keyword, validPattern),
-			want:  []string{validPattern},
+			name: `valid_csharp`,
+			input: ` class test{
+	string BASE_URL = @"https://app.scrapingbee.com/api/v1/";
+	string API_KEY = "2OZ3HPYEUP9LVCN9TSMBEP5OU0C65AXL7MDO76VPYQNVAJW8NU0QUQQPEV7C51XQDLZUUYKZ5TAW2L85";
+	public static string Get(string url)`,
+			want: []string{`2OZ3HPYEUP9LVCN9TSMBEP5OU0C65AXL7MDO76VPYQNVAJW8NU0QUQQPEV7C51XQDLZUUYKZ5TAW2L85`},
+		},
+		{
+			name: `valid_js1`,
+			input: `	const options = {
+		uri: "https://app.scrapingbee.com/api/v1?",
+		api_key: "34TOQQ77QJALLR07ISPYL4B5EYHW3YLU5GM97GQOCA32BVW3S0S6RTVFCZGTHZ1Q5MHH1Z9GZ0B640LI",
+	};`,
+			want: []string{`34TOQQ77QJALLR07ISPYL4B5EYHW3YLU5GM97GQOCA32BVW3S0S6RTVFCZGTHZ1Q5MHH1Z9GZ0B640LI`},
 		},
 		{
-			name:  "valid pattern - ignore duplicate",
-			input: fmt.Sprintf("%s token = '%s' | '%s'", keyword, validPattern, validPattern),
-			want:  []string{validPattern},
+			name: `valid_js2`,
+			input: `	useEffect(() => {
+		setLoading(true)
+		base.get('https://app.scrapingbee.com/api/v1', {
+			params:{'api_key':'BYZCNNS0SOZCPC4EXD5SXSH0PWAXPWFMZ4SXVEQNEDMKSGBP57K31PJ44V46344XCYN7IARKQWLS0V3X',
+				'url': 'https://www.flipkart.com/search?q=${searchItem}',
+				'block_resources': 'false',
+			}
+		}).then((response) => {`,
+			want: []string{`BYZCNNS0SOZCPC4EXD5SXSH0PWAXPWFMZ4SXVEQNEDMKSGBP57K31PJ44V46344XCYN7IARKQWLS0V3X`},
 		},
 		{
-			name:  "valid pattern - key out of prefix range",
-			input: fmt.Sprintf("%s keyword is not close to the real key in the data\n = '%s'", keyword, validPattern),
-			want:  []string{},
+			name: `valid_js3`,
+			input: `const scrapingBeeApiKey =
+	"P5IS953T7OYL5KJG8J3SVPAV5VUJ49L2OXB7HIQDVL8SSG7O9A3J6DQ6CTK65KEAM7L7MQJIEW20ZOCP"; // Replace 'YOUR_SCRAPING_BEE_API_KEY' with your actual API key`,
+			want: []string{`P5IS953T7OYL5KJG8J3SVPAV5VUJ49L2OXB7HIQDVL8SSG7O9A3J6DQ6CTK65KEAM7L7MQJIEW20ZOCP`},
 		},
 		{
-			name:  "invalid pattern",
-			input: fmt.Sprintf("%s = '%s'", keyword, invalidPattern),
-			want:  []string{},
+			name: `valid_php`,
+			input: `// Set base url & API key
+$BASE_URL = "https://app.scrapingbee.com/api/v1/?";
+$API_KEY = "R4EEK5MWM2GXNK1TZUU9Z0EBA29ZUW7PW12MHI4T1BHSR7GM1G37C5BL2NHLPWC0J6VOQWP5IZJ15QV8";
+`,
+			want: []string{`R4EEK5MWM2GXNK1TZUU9Z0EBA29ZUW7PW12MHI4T1BHSR7GM1G37C5BL2NHLPWC0J6VOQWP5IZJ15QV8`},
+		},
+		{
+			name:  `valid_python_sdk`,
+			input: `client = ScrapingBeeClient(api_key='MZ13G1AVV8C5MEYVOIMIGJEPUH0PBSJPYTCO6IUWRZS3BXNOLA4TUP27ZGQ97LS8NRBCO66WF3ZUKSFX')`,
+			want:  []string{`MZ13G1AVV8C5MEYVOIMIGJEPUH0PBSJPYTCO6IUWRZS3BXNOLA4TUP27ZGQ97LS8NRBCO66WF3ZUKSFX`},
+		},
+		{
+			name: `valid_python_sdk_newline`,
+			input: `def main():
+    client = ScrapingBeeClient(
+        api_key='E1PJA1D78TBTM320Z8O9XS2MTWHTCL1NSJXGFKIZO6TJB4XIM94OSR6KQNU415QB97MYJEP6T3O0IWR3')`,
+
+			want: []string{`E1PJA1D78TBTM320Z8O9XS2MTWHTCL1NSJXGFKIZO6TJB4XIM94OSR6KQNU415QB97MYJEP6T3O0IWR3`},
+		},
+		{
+			name: `valid_python_notebook`,
+			input: `  "source": [
+    "Every time you call any function there is an HTTPS request to Google's servers. To prevent your servers IP address being locked by Google we should use a service that handles proxy rotation for us. In this case we are using **ScrapingBee API**.\n",
+    "\n",
+    "ScrapingBee API key:\n",
+    "\n",
+    "   QEUXIXLN8OULIISPZ1FXZUCWF7M42ZOUXRV7491R6RYQTFCSV8A4Y1B2YFPCD0HL2X62KPGTHFODSW6G\n",
+    "\n",
+    "NOTE: This API key is available till 08 March 2021 and expires after 200 requests \n",
+    "NOTE: **this Python package still works out of the box**."
+  ]`,
+
+			want: []string{`QEUXIXLN8OULIISPZ1FXZUCWF7M42ZOUXRV7491R6RYQTFCSV8A4Y1B2YFPCD0HL2X62KPGTHFODSW6G`},
+		},
+		{
+			name: `valid_python_nonapiurl`,
+			input: `##########################################################################################################
+# We use the best scraper service API, Scraping Bee.
+api_key = "CXUWSH6Y2BRB8F07MB7YXWPYWV2TQ4K51G4N6SGEU1YDADAVDW35ZT7WNISZ8YMCQ810OP9KG22ZI2P2"`,
+			want: []string{`CXUWSH6Y2BRB8F07MB7YXWPYWV2TQ4K51G4N6SGEU1YDADAVDW35ZT7WNISZ8YMCQ810OP9KG22ZI2P2`},
+		},
+		{
+			name: `valid_underscore`,
+			input: `	gn = GoogleNews()
+	# it's a fake API key, do not try to use it
+	gn.top_news(scraping_bee = 'I5SYNPRFZI41WHVQWWUT0GNXFMO104343E7CXFIISR01E2V8ETSMXMJFK1XNKM7FDEEPUPRM0FYAHFF5')`,
+
+			want: []string{`I5SYNPRFZI41WHVQWWUT0GNXFMO104343E7CXFIISR01E2V8ETSMXMJFK1XNKM7FDEEPUPRM0FYAHFF5`},
+		},
+		// TODO: support this
+		// {
+		//	name: `valid_js_suffix`,
+		//	input: `	do {
+		//		// const apiKey = 'TQ9CDAZSORUPU1NMZXZEM11VY7K3NC3HJPBNYP2V4CZZXUY9SWEULNDHOZ77XGWO9FA9A12XWFVWUBZJ';
+		//		// const client = new scrapingbee.ScrapingBeeClient(apiKey);
+		//	`,
+		//
+		//	want: []string{`TQ9CDAZSORUPU1NMZXZEM11VY7K3NC3HJPBNYP2V4CZZXUY9SWEULNDHOZ77XGWO9FA9A12XWFVWUBZJ`},
+		// },
+
+		// False positives
+		{
+			name:  `invalid - lowercase`,
+			input: `const scrapingbeeKey = 'tq9cdazsorupu1nmzxzem11vy7k3nc3hjpbnyp2v4czzxuy9sweulndhoz77xgwo9fa9a12xwfvwubzj'`,
+		},
 	}
 
+	d := Scanner{}
+	ahoCorasickCore := ahocorasick.NewAhoCorasickCore([]detectors.Detector{d})
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			matchedDetectors := ahoCorasickCore.FindDetectorMatches([]byte(test.input))
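Several of the new fixtures (`valid_underscore`, `valid_python_nonapiurl`) only pass because the prefix regex was loosened from the literal `scrapingbee` to `scraping[ _-]?bee`. Below is a rough standalone demonstration of that pattern's behavior; the `{0,40}` proximity window is an assumed approximation of what `detectors.PrefixRegex` produces, not its exact output.

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Assumed approximation of detectors.PrefixRegex([]string{"scraping[ _-]?bee"})
	// followed by the key group: a case-insensitive keyword, at most ~40 arbitrary
	// characters, then an 80-character uppercase-alphanumeric key on word boundaries.
	keyPat := regexp.MustCompile(`(?i:scraping[ _-]?bee)(?:.|[\n\r]){0,40}?\b([A-Z0-9]{80})\b`)

	inputs := []string{
		// Matches: underscore variant of the keyword (the valid_underscore fixture).
		`gn.top_news(scraping_bee = 'I5SYNPRFZI41WHVQWWUT0GNXFMO104343E7CXFIISR01E2V8ETSMXMJFK1XNKM7FDEEPUPRM0FYAHFF5')`,
		// Matches: plain keyword within range of the key.
		`scrapingbee token = 'HOLTCTPVL3V35NK795FY6INWQKTTTSGL6W26JKHLC3CGUNJ1DV35SWPUAZ26BUX2EEZZ7T2SJBOL8WZB'`,
		// No match: lowercase key (the "invalid - lowercase" fixture), since only
		// the keyword group is case-insensitive.
		`const scrapingbeeKey = 'tq9cdazsorupu1nmzxzem11vy7k3nc3hjpbnyp2v4czzxuy9sweulndhoz77xgwo9fa9a12xwfvwubzj'`,
	}
	for _, in := range inputs {
		if m := keyPat.FindStringSubmatch(in); m != nil {
			fmt.Println("match:", m[1])
		} else {
			fmt.Println("no match")
		}
	}
}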

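For completeness, here is a hedged sketch of driving the reworked detector directly. The `Scanner`, `FromData`, and `Result` shapes match the diff above; the sample input is contrived, and with verify=true a zero-value Scanner would lazily fall back to common.SaneHttpClient() per the diff.

package main

import (
	"context"
	"fmt"

	"github.com/trufflesecurity/trufflehog/v3/pkg/detectors/scrapingbee"
)

func main() {
	data := []byte(`scraping_bee key: HOLTCTPVL3V35NK795FY6INWQKTTTSGL6W26JKHLC3CGUNJ1DV35SWPUAZ26BUX2EEZZ7T2SJBOL8WZB`)

	// verify=false skips the live request to app.scrapingbee.com.
	s := scrapingbee.Scanner{}
	results, err := s.FromData(context.Background(), false, data)
	if err != nil {
		panic(err)
	}
	for _, r := range results {
		fmt.Printf("found: %s (verified: %v)\n", r.Raw, r.Verified)
	}
}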