diff --git a/pkg/detectors/scrapingbee/scrapingbee.go b/pkg/detectors/scrapingbee/scrapingbee.go index f5375555e4cf..7ce96f6b5c52 100644 --- a/pkg/detectors/scrapingbee/scrapingbee.go +++ b/pkg/detectors/scrapingbee/scrapingbee.go @@ -2,74 +2,98 @@ package scrapingbee import ( "context" - regexp "github.com/wasilibs/go-re2" + "fmt" + "io" "net/http" - "strings" + + regexp "github.com/wasilibs/go-re2" "github.com/trufflesecurity/trufflehog/v3/pkg/common" "github.com/trufflesecurity/trufflehog/v3/pkg/detectors" "github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb" ) -type Scanner struct{} +type Scanner struct { + client *http.Client +} // Ensure the Scanner satisfies the interface at compile time. var _ detectors.Detector = (*Scanner)(nil) -var ( - client = common.SaneHttpClient() +func (s Scanner) Type() detectorspb.DetectorType { + return detectorspb.DetectorType_ScrapingBee +} - // Make sure that your group is surrounded in boundary characters such as below to reduce false positives. - keyPat = regexp.MustCompile(detectors.PrefixRegex([]string{"scrapingbee"}) + `\b([A-Z0-9]{80})\b`) -) +func (s Scanner) Description() string { + return "ScrapingBee is a web scraping service that handles headless browsers and proxies for you. ScrapingBee API keys can be used to access and control web scraping tasks." +} // Keywords are used for efficiently pre-filtering chunks. // Use identifiers in the secret preferably, or the provider name. func (s Scanner) Keywords() []string { - return []string{"scrapingbee"} + return []string{"scrapingbee", "scraping bee", "scraping-bee", "scraping_bee"} } +var ( + keyPat = regexp.MustCompile(detectors.PrefixRegex([]string{"scraping[ _-]?bee"}) + `\b([A-Z0-9]{80})\b`) +) + // FromData will find and optionally verify ScrapingBee secrets in a given set of bytes. func (s Scanner) FromData(ctx context.Context, verify bool, data []byte) (results []detectors.Result, err error) { dataStr := string(data) - matches := keyPat.FindAllStringSubmatch(dataStr, -1) - - for _, match := range matches { - if len(match) != 2 { + uniqueMatches := make(map[string]struct{}) + for _, match := range keyPat.FindAllStringSubmatch(dataStr, -1) { + m := match[1] + if detectors.StringShannonEntropy(m) < 3.5 { continue } - resMatch := strings.TrimSpace(match[1]) + uniqueMatches[m] = struct{}{} + } - s1 := detectors.Result{ + for key := range uniqueMatches { + r := detectors.Result{ DetectorType: detectorspb.DetectorType_ScrapingBee, - Raw: []byte(resMatch), + Raw: []byte(key), } if verify { - req, err := http.NewRequestWithContext(ctx, "GET", "https://app.scrapingbee.com/api/v1/?api_key="+resMatch+"&url=https://httpbin.org/anything?json&render_js=false", nil) - if err != nil { - continue - } - res, err := client.Do(req) - if err == nil { - defer res.Body.Close() - if res.StatusCode >= 200 && res.StatusCode < 300 { - s1.Verified = true - } + if s.client == nil { + s.client = common.SaneHttpClient() } + + isVerified, verificationErr := verifyMatch(ctx, s.client, key) + r.Verified = isVerified + r.SetVerificationError(verificationErr, key) } - results = append(results, s1) + results = append(results, r) } return results, nil } -func (s Scanner) Type() detectorspb.DetectorType { - return detectorspb.DetectorType_ScrapingBee -} +func verifyMatch(ctx context.Context, client *http.Client, key string) (bool, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, "https://app.scrapingbee.com/api/v1/?api_key="+key+"&url=https://httpbin.org/anything?json&render_js=false", nil) + if err != nil { + return false, err + } -func (s Scanner) Description() string { - return "ScrapingBee is a web scraping service that handles headless browsers and proxies for you. ScrapingBee API keys can be used to access and control web scraping tasks." + res, err := client.Do(req) + if err != nil { + return false, err + } + defer func() { + _, _ = io.Copy(io.Discard, res.Body) + _ = res.Body.Close() + }() + + switch res.StatusCode { + case http.StatusOK: + return true, nil + case http.StatusUnauthorized: + return false, nil + default: + return false, fmt.Errorf("unexpected status code: %d", res.StatusCode) + } } diff --git a/pkg/detectors/scrapingbee/scrapingbee_test.go b/pkg/detectors/scrapingbee/scrapingbee_test.go index 24061f0a463d..209552d55b7e 100644 --- a/pkg/detectors/scrapingbee/scrapingbee_test.go +++ b/pkg/detectors/scrapingbee/scrapingbee_test.go @@ -2,7 +2,6 @@ package scrapingbee import ( "context" - "fmt" "testing" "github.com/google/go-cmp/cmp" @@ -11,42 +10,133 @@ import ( "github.com/trufflesecurity/trufflehog/v3/pkg/engine/ahocorasick" ) -var ( - validPattern = "HOLTCTPVL3V35NK795FY6INWQKTTTSGL6W26JKHLC3CGUNJ1DV35SWPUAZ26BUX2EEZZ7T2SJBOL8WZB" - invalidPattern = "HOLT?TPVL3V35NK795FY6INWQKTTTSGL6W26JKHLC3CGUNJ1DV35SWPUAZ26BUX2EEZZ7T2SJBOL8WZB" - keyword = "scrapingbee" -) - func TestScrapingBee_Pattern(t *testing.T) { - d := Scanner{} - ahoCorasickCore := ahocorasick.NewAhoCorasickCore([]detectors.Detector{d}) tests := []struct { name string input string want []string }{ + // True positives + { + name: `valid_query_param`, + input: ` #CHANGE API KEY TO CURRENT API KEY ON SCRAPINGBEE BELOW: + uri = URI("https://app.scrapingbee.com/api/v1/?api_key=VNC7VJ04BQLZWL821KJ4ZLG17ON45K4Y56P59QZMDNZBWRFAS0LIK47I3KFH6AMLUXPHIUIFBDOMIOUE&url=#{url}&stealth_proxy=True&country_code=sg&wait_browser=networkidle2&json_response=True&block_resources=False&block_ads=True&js_scenario=" + CGI.escape(js_scenario))`, + want: []string{`VNC7VJ04BQLZWL821KJ4ZLG17ON45K4Y56P59QZMDNZBWRFAS0LIK47I3KFH6AMLUXPHIUIFBDOMIOUE`}, + }, + { + name: `valid_function_comment`, + input: `func connectToScrapingBee() { + // API KEY = M977YHXCMPJJ569DSB0B8KSKL9NRU2O2327MIDT55785T8LS9TJGDW4GFMCMOZNRVN3GPSXF0Y6DGC32`, + want: []string{`M977YHXCMPJJ569DSB0B8KSKL9NRU2O2327MIDT55785T8LS9TJGDW4GFMCMOZNRVN3GPSXF0Y6DGC32`}, + }, { - name: "valid pattern - with keyword scrapingbee", - input: fmt.Sprintf("%s token = '%s'", keyword, validPattern), - want: []string{validPattern}, + name: `valid_csharp`, + input: ` class test{ + + string BASE_URL = @"https://app.scrapingbee.com/api/v1/"; + string API_KEY = "2OZ3HPYEUP9LVCN9TSMBEP5OU0C65AXL7MDO76VPYQNVAJW8NU0QUQQPEV7C51XQDLZUUYKZ5TAW2L85"; + + public static string Get(string url)`, + want: []string{`2OZ3HPYEUP9LVCN9TSMBEP5OU0C65AXL7MDO76VPYQNVAJW8NU0QUQQPEV7C51XQDLZUUYKZ5TAW2L85`}, + }, + { + name: `valid_js1`, + input: ` const options = { + uri: "https://app.scrapingbee.com/api/v1?", + api_key: "34TOQQ77QJALLR07ISPYL4B5EYHW3YLU5GM97GQOCA32BVW3S0S6RTVFCZGTHZ1Q5MHH1Z9GZ0B640LI", + };`, + want: []string{`34TOQQ77QJALLR07ISPYL4B5EYHW3YLU5GM97GQOCA32BVW3S0S6RTVFCZGTHZ1Q5MHH1Z9GZ0B640LI`}, }, { - name: "valid pattern - ignore duplicate", - input: fmt.Sprintf("%s token = '%s' | '%s'", keyword, validPattern, validPattern), - want: []string{validPattern}, + name: `valid_js2`, + input: ` useEffect(() => { + setLoading(true) + base.get('https://app.scrapingbee.com/api/v1', { +params:{'api_key':'BYZCNNS0SOZCPC4EXD5SXSH0PWAXPWFMZ4SXVEQNEDMKSGBP57K31PJ44V46344XCYN7IARKQWLS0V3X', + 'url': 'https://www.flipkart.com/search?q=${searchItem}', + 'block_resources': 'false', + } + }).then((response) => {`, + want: []string{`BYZCNNS0SOZCPC4EXD5SXSH0PWAXPWFMZ4SXVEQNEDMKSGBP57K31PJ44V46344XCYN7IARKQWLS0V3X`}, }, { - name: "valid pattern - key out of prefix range", - input: fmt.Sprintf("%s keyword is not close to the real key in the data\n = '%s'", keyword, validPattern), - want: []string{}, + name: `valid_js3`, + input: `const scrapingBeeApiKey = + "P5IS953T7OYL5KJG8J3SVPAV5VUJ49L2OXB7HIQDVL8SSG7O9A3J6DQ6CTK65KEAM7L7MQJIEW20ZOCP"; // Replace 'YOUR_SCRAPING_BEE_API_KEY' with your actual API key`, + want: []string{`P5IS953T7OYL5KJG8J3SVPAV5VUJ49L2OXB7HIQDVL8SSG7O9A3J6DQ6CTK65KEAM7L7MQJIEW20ZOCP`}, }, { - name: "invalid pattern", - input: fmt.Sprintf("%s = '%s'", keyword, invalidPattern), - want: []string{}, + name: `valid_php`, + input: `// Set base url & API key +$BASE_URL = "https://app.scrapingbee.com/api/v1/?"; +$API_KEY = "R4EEK5MWM2GXNK1TZUU9Z0EBA29ZUW7PW12MHI4T1BHSR7GM1G37C5BL2NHLPWC0J6VOQWP5IZJ15QV8"; +`, + want: []string{`R4EEK5MWM2GXNK1TZUU9Z0EBA29ZUW7PW12MHI4T1BHSR7GM1G37C5BL2NHLPWC0J6VOQWP5IZJ15QV8`}, + }, + { + name: `valid_python_sdk`, + input: `client = ScrapingBeeClient(api_key='MZ13G1AVV8C5MEYVOIMIGJEPUH0PBSJPYTCO6IUWRZS3BXNOLA4TUP27ZGQ97LS8NRBCO66WF3ZUKSFX')`, + want: []string{`MZ13G1AVV8C5MEYVOIMIGJEPUH0PBSJPYTCO6IUWRZS3BXNOLA4TUP27ZGQ97LS8NRBCO66WF3ZUKSFX`}, + }, + { + name: `valid_python_sdk_newline`, + input: `def main(): + client = ScrapingBeeClient( + api_key='E1PJA1D78TBTM320Z8O9XS2MTWHTCL1NSJXGFKIZO6TJB4XIM94OSR6KQNU415QB97MYJEP6T3O0IWR3')`, + + want: []string{`E1PJA1D78TBTM320Z8O9XS2MTWHTCL1NSJXGFKIZO6TJB4XIM94OSR6KQNU415QB97MYJEP6T3O0IWR3`}, + }, + { + name: `valid_python_notebook`, + input: ` "source": [ + "Every time you call any function there is an HTTPS request to Google's servers. To prevent your servers IP address being locked by Google we should use a service that handles proxy rotation for us. In this case we are using **ScrapingBee API**.\n", + "\n", + "ScrapingBee API key:\n", + "\n", + " QEUXIXLN8OULIISPZ1FXZUCWF7M42ZOUXRV7491R6RYQTFCSV8A4Y1B2YFPCD0HL2X62KPGTHFODSW6G\n", + "\n", + "NOTE: This API key is available till 08 March 2021 and expires after 200 requests \n", + "NOTE: **this Python package still works out of the box**." + ]`, + + want: []string{`QEUXIXLN8OULIISPZ1FXZUCWF7M42ZOUXRV7491R6RYQTFCSV8A4Y1B2YFPCD0HL2X62KPGTHFODSW6G`}, + }, + { + name: `valid_python_nonapiurl`, + input: `########################################################################################################## +# We use the best scraper service API, Scraping Bee. +api_key = "CXUWSH6Y2BRB8F07MB7YXWPYWV2TQ4K51G4N6SGEU1YDADAVDW35ZT7WNISZ8YMCQ810OP9KG22ZI2P2"`, + want: []string{`CXUWSH6Y2BRB8F07MB7YXWPYWV2TQ4K51G4N6SGEU1YDADAVDW35ZT7WNISZ8YMCQ810OP9KG22ZI2P2`}, + }, + { + name: `valid_underscore`, + input: ` gn = GoogleNews() + + # it's a fake API key, do not try to use it + gn.top_news(scraping_bee = 'I5SYNPRFZI41WHVQWWUT0GNXFMO104343E7CXFIISR01E2V8ETSMXMJFK1XNKM7FDEEPUPRM0FYAHFF5')`, + + want: []string{`I5SYNPRFZI41WHVQWWUT0GNXFMO104343E7CXFIISR01E2V8ETSMXMJFK1XNKM7FDEEPUPRM0FYAHFF5`}, + }, + // TODO: support this + // { + // name: `valid_js_suffix`, + // input: ` do { + // // const apiKey = 'TQ9CDAZSORUPU1NMZXZEM11VY7K3NC3HJPBNYP2V4CZZXUY9SWEULNDHOZ77XGWO9FA9A12XWFVWUBZJ'; + // // const client = new scrapingbee.ScrapingBeeClient(apiKey); + // `, + // + // want: []string{ `TQ9CDAZSORUPU1NMZXZEM11VY7K3NC3HJPBNYP2V4CZZXUY9SWEULNDHOZ77XGWO9FA9A12XWFVWUBZJ`}, + // }, + + // False positives + { + name: `invalid - lowercase`, + input: `const scrapingbeeKey = 'tq9cdazsorupu1nmzxzem11vy7k3nc3hjpbnyp2v4czzxuy9sweulndhoz77xgwo9fa9a12xwfvwubzj'`, }, } + d := Scanner{} + ahoCorasickCore := ahocorasick.NewAhoCorasickCore([]detectors.Detector{d}) for _, test := range tests { t.Run(test.name, func(t *testing.T) { matchedDetectors := ahoCorasickCore.FindDetectorMatches([]byte(test.input))