Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update ScrapingBee detector #3820

Merged
merged 1 commit into from
Jan 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 56 additions & 32 deletions pkg/detectors/scrapingbee/scrapingbee.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,74 +2,98 @@ package scrapingbee

import (
"context"
regexp "github.com/wasilibs/go-re2"
"fmt"
"io"
"net/http"
"strings"

regexp "github.com/wasilibs/go-re2"

"github.com/trufflesecurity/trufflehog/v3/pkg/common"
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
)

type Scanner struct{}
type Scanner struct {
client *http.Client
}

// Ensure the Scanner satisfies the interface at compile time.
var _ detectors.Detector = (*Scanner)(nil)

var (
client = common.SaneHttpClient()
func (s Scanner) Type() detectorspb.DetectorType {
return detectorspb.DetectorType_ScrapingBee
}

// Make sure that your group is surrounded in boundary characters such as below to reduce false positives.
keyPat = regexp.MustCompile(detectors.PrefixRegex([]string{"scrapingbee"}) + `\b([A-Z0-9]{80})\b`)
)
func (s Scanner) Description() string {
return "ScrapingBee is a web scraping service that handles headless browsers and proxies for you. ScrapingBee API keys can be used to access and control web scraping tasks."
}

// Keywords are used for efficiently pre-filtering chunks.
// Use identifiers in the secret preferably, or the provider name.
func (s Scanner) Keywords() []string {
	return []string{"scrapingbee", "scraping bee", "scraping-bee", "scraping_bee"}
}

// keyPat matches an 80-character uppercase-alphanumeric key that appears near
// a "scraping bee"-style keyword (optional space/underscore/hyphen separator),
// surrounded by word boundaries to reduce false positives.
var keyPat = regexp.MustCompile(detectors.PrefixRegex([]string{"scraping[ _-]?bee"}) + `\b([A-Z0-9]{80})\b`)

// FromData will find and optionally verify ScrapingBee secrets in a given set of bytes.
func (s Scanner) FromData(ctx context.Context, verify bool, data []byte) (results []detectors.Result, err error) {
dataStr := string(data)

matches := keyPat.FindAllStringSubmatch(dataStr, -1)

for _, match := range matches {
if len(match) != 2 {
uniqueMatches := make(map[string]struct{})
for _, match := range keyPat.FindAllStringSubmatch(dataStr, -1) {
m := match[1]
if detectors.StringShannonEntropy(m) < 3.5 {
continue
}
resMatch := strings.TrimSpace(match[1])
uniqueMatches[m] = struct{}{}
}

s1 := detectors.Result{
for key := range uniqueMatches {
r := detectors.Result{
DetectorType: detectorspb.DetectorType_ScrapingBee,
Raw: []byte(resMatch),
Raw: []byte(key),
}

if verify {
req, err := http.NewRequestWithContext(ctx, "GET", "https://app.scrapingbee.com/api/v1/?api_key="+resMatch+"&url=https://httpbin.org/anything?json&render_js=false", nil)
if err != nil {
continue
}
res, err := client.Do(req)
if err == nil {
defer res.Body.Close()
if res.StatusCode >= 200 && res.StatusCode < 300 {
s1.Verified = true
}
if s.client == nil {
s.client = common.SaneHttpClient()
}

isVerified, verificationErr := verifyMatch(ctx, s.client, key)
r.Verified = isVerified
r.SetVerificationError(verificationErr, key)
}

results = append(results, s1)
results = append(results, r)
}

return results, nil
}

func (s Scanner) Type() detectorspb.DetectorType {
return detectorspb.DetectorType_ScrapingBee
}
func verifyMatch(ctx context.Context, client *http.Client, key string) (bool, error) {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, "https://app.scrapingbee.com/api/v1/?api_key="+key+"&url=https://httpbin.org/anything?json&render_js=false", nil)
if err != nil {
return false, err
}

func (s Scanner) Description() string {
return "ScrapingBee is a web scraping service that handles headless browsers and proxies for you. ScrapingBee API keys can be used to access and control web scraping tasks."
res, err := client.Do(req)
if err != nil {
return false, err
}
defer func() {
_, _ = io.Copy(io.Discard, res.Body)
_ = res.Body.Close()
}()

switch res.StatusCode {
case http.StatusOK:
return true, nil
case http.StatusUnauthorized:
return false, nil
default:
return false, fmt.Errorf("unexpected status code: %d", res.StatusCode)
}
}
132 changes: 111 additions & 21 deletions pkg/detectors/scrapingbee/scrapingbee_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package scrapingbee

import (
"context"
"fmt"
"testing"

"github.com/google/go-cmp/cmp"
Expand All @@ -11,42 +10,133 @@ import (
"github.com/trufflesecurity/trufflehog/v3/pkg/engine/ahocorasick"
)

var (
	// validPattern is an 80-character uppercase alphanumeric string intended
	// to match keyPat when preceded by the detector keyword.
	validPattern = "HOLTCTPVL3V35NK795FY6INWQKTTTSGL6W26JKHLC3CGUNJ1DV35SWPUAZ26BUX2EEZZ7T2SJBOL8WZB"
	// invalidPattern contains a '?' character, so it should NOT match keyPat.
	invalidPattern = "HOLT?TPVL3V35NK795FY6INWQKTTTSGL6W26JKHLC3CGUNJ1DV35SWPUAZ26BUX2EEZZ7T2SJBOL8WZB"
	// keyword is placed near the candidate secret to satisfy the prefix regex.
	keyword = "scrapingbee"
)

func TestScrapingBee_Pattern(t *testing.T) {
d := Scanner{}
ahoCorasickCore := ahocorasick.NewAhoCorasickCore([]detectors.Detector{d})
tests := []struct {
name string
input string
want []string
}{
// True positives
{
name: `valid_query_param`,
input: ` #CHANGE API KEY TO CURRENT API KEY ON SCRAPINGBEE BELOW:
uri = URI("https://app.scrapingbee.com/api/v1/?api_key=VNC7VJ04BQLZWL821KJ4ZLG17ON45K4Y56P59QZMDNZBWRFAS0LIK47I3KFH6AMLUXPHIUIFBDOMIOUE&url=#{url}&stealth_proxy=True&country_code=sg&wait_browser=networkidle2&json_response=True&block_resources=False&block_ads=True&js_scenario=" + CGI.escape(js_scenario))`,
want: []string{`VNC7VJ04BQLZWL821KJ4ZLG17ON45K4Y56P59QZMDNZBWRFAS0LIK47I3KFH6AMLUXPHIUIFBDOMIOUE`},
},
{
name: `valid_function_comment`,
input: `func connectToScrapingBee() {
// API KEY = M977YHXCMPJJ569DSB0B8KSKL9NRU2O2327MIDT55785T8LS9TJGDW4GFMCMOZNRVN3GPSXF0Y6DGC32`,
want: []string{`M977YHXCMPJJ569DSB0B8KSKL9NRU2O2327MIDT55785T8LS9TJGDW4GFMCMOZNRVN3GPSXF0Y6DGC32`},
},
{
name: "valid pattern - with keyword scrapingbee",
input: fmt.Sprintf("%s token = '%s'", keyword, validPattern),
want: []string{validPattern},
name: `valid_csharp`,
input: ` class test{

string BASE_URL = @"https://app.scrapingbee.com/api/v1/";
string API_KEY = "2OZ3HPYEUP9LVCN9TSMBEP5OU0C65AXL7MDO76VPYQNVAJW8NU0QUQQPEV7C51XQDLZUUYKZ5TAW2L85";

public static string Get(string url)`,
want: []string{`2OZ3HPYEUP9LVCN9TSMBEP5OU0C65AXL7MDO76VPYQNVAJW8NU0QUQQPEV7C51XQDLZUUYKZ5TAW2L85`},
},
{
name: `valid_js1`,
input: ` const options = {
uri: "https://app.scrapingbee.com/api/v1?",
api_key: "34TOQQ77QJALLR07ISPYL4B5EYHW3YLU5GM97GQOCA32BVW3S0S6RTVFCZGTHZ1Q5MHH1Z9GZ0B640LI",
};`,
want: []string{`34TOQQ77QJALLR07ISPYL4B5EYHW3YLU5GM97GQOCA32BVW3S0S6RTVFCZGTHZ1Q5MHH1Z9GZ0B640LI`},
},
{
name: "valid pattern - ignore duplicate",
input: fmt.Sprintf("%s token = '%s' | '%s'", keyword, validPattern, validPattern),
want: []string{validPattern},
name: `valid_js2`,
input: ` useEffect(() => {
setLoading(true)
base.get('https://app.scrapingbee.com/api/v1', {
params:{'api_key':'BYZCNNS0SOZCPC4EXD5SXSH0PWAXPWFMZ4SXVEQNEDMKSGBP57K31PJ44V46344XCYN7IARKQWLS0V3X',
'url': 'https://www.flipkart.com/search?q=${searchItem}',
'block_resources': 'false',
}
}).then((response) => {`,
want: []string{`BYZCNNS0SOZCPC4EXD5SXSH0PWAXPWFMZ4SXVEQNEDMKSGBP57K31PJ44V46344XCYN7IARKQWLS0V3X`},
},
{
name: "valid pattern - key out of prefix range",
input: fmt.Sprintf("%s keyword is not close to the real key in the data\n = '%s'", keyword, validPattern),
want: []string{},
name: `valid_js3`,
input: `const scrapingBeeApiKey =
"P5IS953T7OYL5KJG8J3SVPAV5VUJ49L2OXB7HIQDVL8SSG7O9A3J6DQ6CTK65KEAM7L7MQJIEW20ZOCP"; // Replace 'YOUR_SCRAPING_BEE_API_KEY' with your actual API key`,
want: []string{`P5IS953T7OYL5KJG8J3SVPAV5VUJ49L2OXB7HIQDVL8SSG7O9A3J6DQ6CTK65KEAM7L7MQJIEW20ZOCP`},
},
{
name: "invalid pattern",
input: fmt.Sprintf("%s = '%s'", keyword, invalidPattern),
want: []string{},
name: `valid_php`,
input: `// Set base url & API key
$BASE_URL = "https://app.scrapingbee.com/api/v1/?";
$API_KEY = "R4EEK5MWM2GXNK1TZUU9Z0EBA29ZUW7PW12MHI4T1BHSR7GM1G37C5BL2NHLPWC0J6VOQWP5IZJ15QV8";
`,
want: []string{`R4EEK5MWM2GXNK1TZUU9Z0EBA29ZUW7PW12MHI4T1BHSR7GM1G37C5BL2NHLPWC0J6VOQWP5IZJ15QV8`},
},
{
name: `valid_python_sdk`,
input: `client = ScrapingBeeClient(api_key='MZ13G1AVV8C5MEYVOIMIGJEPUH0PBSJPYTCO6IUWRZS3BXNOLA4TUP27ZGQ97LS8NRBCO66WF3ZUKSFX')`,
want: []string{`MZ13G1AVV8C5MEYVOIMIGJEPUH0PBSJPYTCO6IUWRZS3BXNOLA4TUP27ZGQ97LS8NRBCO66WF3ZUKSFX`},
},
{
name: `valid_python_sdk_newline`,
input: `def main():
client = ScrapingBeeClient(
api_key='E1PJA1D78TBTM320Z8O9XS2MTWHTCL1NSJXGFKIZO6TJB4XIM94OSR6KQNU415QB97MYJEP6T3O0IWR3')`,

want: []string{`E1PJA1D78TBTM320Z8O9XS2MTWHTCL1NSJXGFKIZO6TJB4XIM94OSR6KQNU415QB97MYJEP6T3O0IWR3`},
},
{
name: `valid_python_notebook`,
input: ` "source": [
"Every time you call any function there is an HTTPS request to Google's servers. To prevent your servers IP address being locked by Google we should use a service that handles proxy rotation for us. In this case we are using **ScrapingBee API**.\n",
"\n",
"ScrapingBee API key:\n",
"\n",
" QEUXIXLN8OULIISPZ1FXZUCWF7M42ZOUXRV7491R6RYQTFCSV8A4Y1B2YFPCD0HL2X62KPGTHFODSW6G\n",
"\n",
"NOTE: This API key is available till 08 March 2021 and expires after 200 requests \n",
"NOTE: **this Python package still works out of the box**."
]`,

want: []string{`QEUXIXLN8OULIISPZ1FXZUCWF7M42ZOUXRV7491R6RYQTFCSV8A4Y1B2YFPCD0HL2X62KPGTHFODSW6G`},
},
{
name: `valid_python_nonapiurl`,
input: `##########################################################################################################
# We use the best scraper service API, Scraping Bee.
api_key = "CXUWSH6Y2BRB8F07MB7YXWPYWV2TQ4K51G4N6SGEU1YDADAVDW35ZT7WNISZ8YMCQ810OP9KG22ZI2P2"`,
want: []string{`CXUWSH6Y2BRB8F07MB7YXWPYWV2TQ4K51G4N6SGEU1YDADAVDW35ZT7WNISZ8YMCQ810OP9KG22ZI2P2`},
},
{
name: `valid_underscore`,
input: ` gn = GoogleNews()

# it's a fake API key, do not try to use it
gn.top_news(scraping_bee = 'I5SYNPRFZI41WHVQWWUT0GNXFMO104343E7CXFIISR01E2V8ETSMXMJFK1XNKM7FDEEPUPRM0FYAHFF5')`,

want: []string{`I5SYNPRFZI41WHVQWWUT0GNXFMO104343E7CXFIISR01E2V8ETSMXMJFK1XNKM7FDEEPUPRM0FYAHFF5`},
},
// TODO: support this
// {
// name: `valid_js_suffix`,
// input: ` do {
// // const apiKey = 'TQ9CDAZSORUPU1NMZXZEM11VY7K3NC3HJPBNYP2V4CZZXUY9SWEULNDHOZ77XGWO9FA9A12XWFVWUBZJ';
// // const client = new scrapingbee.ScrapingBeeClient(apiKey);
// `,
//
// want: []string{ `TQ9CDAZSORUPU1NMZXZEM11VY7K3NC3HJPBNYP2V4CZZXUY9SWEULNDHOZ77XGWO9FA9A12XWFVWUBZJ`},
// },

// False positives
{
name: `invalid - lowercase`,
input: `const scrapingbeeKey = 'tq9cdazsorupu1nmzxzem11vy7k3nc3hjpbnyp2v4czzxuy9sweulndhoz77xgwo9fa9a12xwfvwubzj'`,
},
}

d := Scanner{}
ahoCorasickCore := ahocorasick.NewAhoCorasickCore([]detectors.Detector{d})
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
matchedDetectors := ahoCorasickCore.FindDetectorMatches([]byte(test.input))
Expand Down
Loading