Merge pull request #247 from ddosify/develop

implement html extraction feature
getanteon · Nov 24, 2023 · ae9bcd5 · ae9bcd5
2 parents daeabc1 + 30023a2
commit ae9bcd5
Show file tree

Hide file tree

Showing 11 changed files with 273 additions and 13 deletions.
diff --git a/config/json.go b/config/json.go
@@ -70,6 +70,7 @@ type RegexCaptureConf struct {
 type capturePath struct {
 	JsonPath   *string           `json:"json_path"`
 	XPath      *string           `json:"xpath"`
+	XpathHtml  *string           `json:"xpath_html"`
 	RegExp     *RegexCaptureConf `json:"regexp"`
 	From       string            `json:"from"` // body,header,cookie
 	CookieName *string           `json:"cookie_name"`
@@ -375,6 +376,7 @@ func stepToScenarioStep(s step) (types.ScenarioStep, error) {
 		capConf := types.EnvCaptureConf{
 			JsonPath:   path.JsonPath,
 			Xpath:      path.XPath,
+			XpathHtml:  path.XpathHtml,
 			Name:       name,
 			From:       types.SourceType(path.From),
 			Key:        path.HeaderKey,

diff --git a/core/scenario/scripting/assertion/assert_test.go b/core/scenario/scripting/assertion/assert_test.go
@@ -488,7 +488,7 @@ func TestAssert(t *testing.T) {
 			expected: true,
 		},
 		{
-			input: `equals(xml_path("//item/title"),"ABC")`,
+			input: `equals(xpath("//item/title"),"ABC")`,
 			envs: &evaluator.AssertEnv{
 				Body: `<?xml version="1.0" encoding="UTF-8" ?>
 		<rss version="2.0">
@@ -502,6 +502,19 @@ func TestAssert(t *testing.T) {
 
 			expected: true,
 		},
+		{
+			input: `equals(html_path("//body/h1"),"ABC")`,
+			envs: &evaluator.AssertEnv{
+				Body: `<!DOCTYPE html>
+				<html>
+				<body>
+				<h1>ABC</h1>
+				</body>
+				</html>`,
+			},
+
+			expected: true,
+		},
 		{
 			input: "equals(cookies.test.value, \"value\")",
 			envs: &evaluator.AssertEnv{
@@ -790,7 +803,12 @@ func TestAssert(t *testing.T) {
 			expectedError: "ArgumentError",
 		},
 		{
-			input:         "xml_path(23)", // arg must be string
+			input:         "xpath(23)", // arg must be string
+			expected:      false,
+			expectedError: "ArgumentError",
+		},
+		{
+			input:         "html_path(23)", // arg must be string
 			expected:      false,
 			expectedError: "ArgumentError",
 		},

diff --git a/core/scenario/scripting/assertion/evaluator/evaluator.go b/core/scenario/scripting/assertion/evaluator/evaluator.go
@@ -152,6 +152,15 @@ func Eval(node ast.Node, env *AssertEnv, receivedMap map[string]interface{}) (in
 						}
 					}
 					return xmlExtract(env.Body, xpath)
+				case HTMLPATH:
+					html, ok := args[0].(string)
+					if !ok {
+						return false, ArgumentError{
+							msg:        "htmlpath must be a string",
+							wrappedErr: nil,
+						}
+					}
+					return htmlExtract(env.Body, html)
 				case REGEXP:
 					regexp, ok := args[1].(string)
 					if !ok {

diff --git a/core/scenario/scripting/assertion/evaluator/function.go b/core/scenario/scripting/assertion/evaluator/function.go
@@ -133,6 +133,11 @@ var xmlExtract = func(source interface{}, xPath string) (interface{}, error) {
 	return val, err
 }
 
+// htmlExtract evaluates xPath against an HTML source by delegating to
+// extraction.ExtractFromHtml and hands back its result and error untouched.
+// It is declared as a variable holding a function rather than as a plain func.
+var htmlExtract = func(source interface{}, xPath string) (interface{}, error) {
+	return extraction.ExtractFromHtml(source, xPath)
+}
+
 var regexExtract = func(source interface{}, xPath string, matchNo int64) (interface{}, error) {
 	val, err := extraction.ExtractWithRegex(source, types.RegexCaptureConf{
 		Exp: &xPath,
@@ -194,6 +199,7 @@ var assertionFuncMap = map[string]struct{}{
 	IN:           {},
 	JSONPATH:     {},
 	XMLPATH:      {},
+	HTMLPATH:     {},
 	REGEXP:       {},
 	EXISTS:       {},
 	CONTAINS:     {},
@@ -216,7 +222,8 @@ const (
 	EQUALS       = "equals"
 	IN           = "in"
 	JSONPATH     = "json_path"
-	XMLPATH      = "xml_path"
+	XMLPATH      = "xpath"
+	HTMLPATH     = "html_path"
 	REGEXP       = "regexp"
 	EXISTS       = "exists"
 	CONTAINS     = "contains"

diff --git a/core/scenario/scripting/extraction/base.go b/core/scenario/scripting/extraction/base.go
@@ -49,6 +49,8 @@ func Extract(source interface{}, ce types.EnvCaptureConf) (val interface{}, err
 			val, err = ExtractWithRegex(source, *ce.RegExp)
 		} else if ce.Xpath != nil {
 			val, err = ExtractFromXml(source, *ce.Xpath)
+		} else if ce.XpathHtml != nil {
+			val, err = ExtractFromHtml(source, *ce.XpathHtml)
 		}
 	case types.Cookie:
 		cookies := source.(map[string]*http.Cookie)
@@ -111,6 +113,18 @@ func ExtractFromXml(source interface{}, xPath string) (interface{}, error) {
 	}
 }
 
+// ExtractFromHtml evaluates the XPath expression xPath against an HTML source
+// and returns whatever the matching htmlExtractor method returns.
+//
+// source must be either a []byte or a string; any other dynamic type yields an
+// empty string together with an "unsupported type" error.
+func ExtractFromHtml(source interface{}, xPath string) (interface{}, error) {
+	var extractor htmlExtractor
+
+	// Raw bytes, e.g. a response body.
+	if raw, isBytes := source.([]byte); isBytes {
+		return extractor.extractFromByteSlice(raw, xPath)
+	}
+
+	// Already-decoded text.
+	if text, isString := source.(string); isString {
+		return extractor.extractFromString(text, xPath)
+	}
+
+	return "", fmt.Errorf("Unsupported type for extraction source")
+}
+
 type ExtractionError struct { // UnWrappable
 	msg        string
 	wrappedErr error

diff --git a/core/scenario/scripting/extraction/html.go b/core/scenario/scripting/extraction/html.go
@@ -0,0 +1,43 @@
+package extraction
+
+import (
+	"bytes"
+	"fmt"
+
+	"github.com/antchfx/htmlquery"
+)
+
+type htmlExtractor struct {
+}
+
+// extractFromByteSlice parses source as HTML and returns the Data of the first
+// child of the first node matched by the XPath expression xPath. For an element
+// such as <h1>ABC</h1> that is the text "ABC".
+//
+// A matched node without children (for example an empty <h1></h1>) yields an
+// empty string. An error is returned when the document cannot be parsed, when
+// xPath cannot be evaluated, or when no node matches.
+func (xe htmlExtractor) extractFromByteSlice(source []byte, xPath string) (interface{}, error) {
+	rootNode, err := htmlquery.Parse(bytes.NewBuffer(source))
+	if err != nil {
+		return nil, err
+	}
+
+	// Query returns only the first matched node.
+	foundNode, err := htmlquery.Query(rootNode, xPath)
+	if err != nil {
+		// Surface the real cause instead of reporting it as a missing match.
+		return nil, fmt.Errorf("invalid xPath_html %s: %v", xPath, err)
+	}
+	if foundNode == nil {
+		return nil, fmt.Errorf("no match for the xPath_html: %s", xPath)
+	}
+
+	// A childless node has a nil FirstChild; dereferencing it would panic.
+	if foundNode.FirstChild == nil {
+		return "", nil
+	}
+
+	return foundNode.FirstChild.Data, nil
+}
+
+// extractFromString parses source as HTML and returns the Data of the first
+// child of the first node matched by the XPath expression xPath. For an element
+// such as <h1>ABC</h1> that is the text "ABC".
+//
+// A matched node without children (for example an empty <h1></h1>) yields an
+// empty string. An error is returned when the document cannot be parsed, when
+// xPath cannot be evaluated, or when no node matches.
+func (xe htmlExtractor) extractFromString(source string, xPath string) (interface{}, error) {
+	rootNode, err := htmlquery.Parse(bytes.NewBufferString(source))
+	if err != nil {
+		return nil, err
+	}
+
+	// Query returns only the first matched node.
+	foundNode, err := htmlquery.Query(rootNode, xPath)
+	if err != nil {
+		// Surface the real cause instead of reporting it as a missing match.
+		return nil, fmt.Errorf("invalid xpath_html: %v", err)
+	}
+	if foundNode == nil {
+		return nil, fmt.Errorf("no match for this xpath_html")
+	}
+
+	// A childless node has a nil FirstChild; dereferencing it would panic.
+	if foundNode.FirstChild == nil {
+		return "", nil
+	}
+
+	return foundNode.FirstChild.Data, nil
+}
diff --git a/core/scenario/scripting/extraction/html_test.go b/core/scenario/scripting/extraction/html_test.go
@@ -0,0 +1,120 @@
+package extraction
+
+import (
+	"fmt"
+	"strings"
+	"testing"
+)
+
+// TestHtmlExtraction checks that the text of a single <h1> element is extracted
+// from a byte-slice HTML document with the XPath "//body/h1".
+func TestHtmlExtraction(t *testing.T) {
+	expected := "Html Title"
+	htmlSource := fmt.Sprintf(`<!DOCTYPE html>
+	<html>
+	<body>
+	<h1>%s</h1>
+	<p>My first paragraph.</p>
+	</body>
+	</html>`, expected)
+
+	xe := htmlExtractor{}
+	xpath := "//body/h1"
+	val, err := xe.extractFromByteSlice([]byte(htmlSource), xpath)
+
+	// Stop here on error: val is nil in that case and the type assertion
+	// below would panic instead of reporting a test failure.
+	if err != nil {
+		t.Fatalf("TestHtmlExtraction %v", err)
+	}
+
+	got, ok := val.(string)
+	if !ok {
+		t.Fatalf("TestHtmlExtraction expected a string, got: %T", val)
+	}
+
+	if !strings.EqualFold(got, expected) {
+		t.Errorf("TestHtmlExtraction expected: %s, got: %s", expected, got)
+	}
+}
+
+// TestHtmlExtractionSeveralNode checks that when several <h1> elements match
+// the XPath "//h1", only the first one is extracted.
+func TestHtmlExtractionSeveralNode(t *testing.T) {
+	// should extract only the first one
+	expected := "Html Title"
+	htmlSource := fmt.Sprintf(`<!DOCTYPE html>
+	<html>
+	<body>
+	<h1>%s</h1>
+	<h1>another node</h1>
+	<p>My first paragraph.</p>
+	</body>
+	</html>`, expected)
+
+	xe := htmlExtractor{}
+	xpath := "//h1"
+	val, err := xe.extractFromByteSlice([]byte(htmlSource), xpath)
+
+	// Stop here on error: val is nil in that case and the type assertion
+	// below would panic instead of reporting a test failure.
+	if err != nil {
+		t.Fatalf("TestHtmlExtractionSeveralNode %v", err)
+	}
+
+	got, ok := val.(string)
+	if !ok {
+		t.Fatalf("TestHtmlExtractionSeveralNode expected a string, got: %T", val)
+	}
+
+	if !strings.EqualFold(got, expected) {
+		t.Errorf("TestHtmlExtractionSeveralNode expected: %s, got: %s", expected, got)
+	}
+}
+
+// TestHtmlExtraction_PathNotFound checks that querying for an element that is
+// absent from the document ("//h2") produces an error.
+func TestHtmlExtraction_PathNotFound(t *testing.T) {
+	const title = "XML Title"
+	htmlSource := fmt.Sprintf(`<!DOCTYPE html>
+	<html>
+	<body>
+	<h1>%s</h1>
+	<h1>another node</h1>
+	<p>My first paragraph.</p>
+	</body>
+	</html>`, title)
+
+	var extractor htmlExtractor
+	_, err := extractor.extractFromByteSlice([]byte(htmlSource), "//h2")
+	if err != nil {
+		// An error is the expected outcome.
+		return
+	}
+
+	t.Errorf("TestHtmlExtraction_PathNotFound, should be err, got :%v", err)
+}
+
+// TestInvalidHtml checks that extracting "//input" from a source that is not
+// an HTML document results in an error.
+//
+// NOTE(review): the HTML parser is presumably lenient with malformed input, so
+// the error here likely comes from the XPath finding no match rather than from
+// a parse failure — confirm.
+func TestInvalidHtml(t *testing.T) {
+	htmlSource := `invalid html source`
+
+	xe := htmlExtractor{}
+	xpath := "//input"
+	_, err := xe.extractFromByteSlice([]byte(htmlSource), xpath)
+
+	if err == nil {
+		t.Errorf("TestInvalidHtml, should be err, got :%v", err)
+	}
+}
+
+// TestHtmlComplexExtraction checks that the <h1> text is still extracted when
+// the document body also contains a <script> block with JavaScript and
+// special characters.
+func TestHtmlComplexExtraction(t *testing.T) {
+	expected := "Html Title"
+	htmlSource := fmt.Sprintf(`<!DOCTYPE html>
+	<html>
+	<body>
+	<script>
+		if (typeof resourceLoadedSuccessfully === "function") {
+			resourceLoadedSuccessfully();
+		}
+		$(() => {
+			typeof cssVars === "function" && cssVars({onlyLegacy: true});
+		})
+		var trackGeoLocation = false;
+		alert('#@=$*€');
+		</script>
+	<h1>%s</h1>
+	<p>My first paragraph.</p>
+	</body>
+	</html>`, expected)
+
+	xe := htmlExtractor{}
+	xpath := "//body/h1"
+	val, err := xe.extractFromByteSlice([]byte(htmlSource), xpath)
+
+	// Stop here on error: val is nil in that case and the type assertion
+	// below would panic instead of reporting a test failure.
+	if err != nil {
+		t.Fatalf("TestHtmlComplexExtraction %v", err)
+	}
+
+	got, ok := val.(string)
+	if !ok {
+		t.Fatalf("TestHtmlComplexExtraction expected a string, got: %T", val)
+	}
+
+	if !strings.EqualFold(got, expected) {
+		t.Errorf("TestHtmlComplexExtraction expected: %s, got: %s", expected, got)
+	}
+}
diff --git a/core/types/scenario.go b/core/types/scenario.go
@@ -86,7 +86,7 @@ func (s *Scenario) validate() error {
 
 	// add global envs
 	for key := range s.Envs {
-		if !envVarNameRegexp.Match([]byte(key)) { // not a valid env definition
+		if !envVarNameRegexp.MatchString(key) { // not a valid env definition
 			return fmt.Errorf("env key is not valid: %s", key)
 		}
 		definedEnvs[key] = struct{}{} // exist
@@ -98,7 +98,7 @@ func (s *Scenario) validate() error {
 			return fmt.Errorf("csv key can not have dot in it: %s", key)
 		}
 		for _, s := range splitted {
-			if !envVarNameRegexp.Match([]byte(s)) { // not a valid env definition
+			if !envVarNameRegexp.MatchString(s) { // not a valid env definition
 				return fmt.Errorf("csv key is not valid: %s", key)
 			}
 		}
@@ -112,7 +112,7 @@ func (s *Scenario) validate() error {
 
 		// enrich Envs map with captured envs from each step
 		for _, ce := range st.EnvsToCapture {
-			if !envVarNameRegexp.Match([]byte(ce.Name)) { // not a valid env definition
+			if !envVarNameRegexp.MatchString(ce.Name) { // not a valid env definition
 				return fmt.Errorf("captured env key is not valid: %s", ce.Name)
 			}
 			definedEnvs[ce.Name] = struct{}{}
@@ -251,6 +251,7 @@ type RegexCaptureConf struct {
 type EnvCaptureConf struct {
 	JsonPath   *string           `json:"json_path"`
 	Xpath      *string           `json:"xpath"`
+	XpathHtml  *string           `json:"xpath_html"`
 	RegExp     *RegexCaptureConf `json:"regexp"`
 	Name       string            `json:"as"`
 	From       SourceType        `json:"from"`
@@ -339,9 +340,9 @@ func validateCaptureConf(conf EnvCaptureConf) error {
 		}
 	}
 
-	if conf.From == Body && conf.JsonPath == nil && conf.RegExp == nil && conf.Xpath == nil {
+	if conf.From == Body && conf.JsonPath == nil && conf.RegExp == nil && conf.Xpath == nil && conf.XpathHtml == nil {
 		return CaptureConfigError{
-			msg: fmt.Sprintf("%s, one of json_path, regexp, xpath key must be specified when extracting from body", conf.Name),
+			msg: fmt.Sprintf("%s, one of json_path, regexp, xpath or xpath_html key must be specified when extracting from body", conf.Name),
 		}
 	}