Utilize new SimpleRobotRulesParser API entry point, fix #1086
Signed-off-by: Julien Nioche <julien@digitalpebble.com>
jnioche committed Dec 6, 2023
1 parent 5f83770 commit 5740f42
Showing 9 changed files with 127 additions and 26 deletions.
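Editor's note: the core of this change is moving from a single comma-separated agent string to the crawler-commons entry point that accepts a collection of agent names, as used in RobotRulesParser.parseRules() further down. A minimal, self-contained sketch of that entry point, assuming the crawler-commons version this commit relies on; the agent name, robots.txt content and URLs are illustrative only:

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.LinkedHashSet;

public class RobotsApiSketch {
    public static void main(String[] args) {
        SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
        // Agent names are passed as a collection; StormCrawler lowercases them
        // and keeps http.agent.name first (see RobotRulesParser.setConf below)
        Collection<String> robotNames = new LinkedHashSet<>();
        robotNames.add("mycrawler");
        byte[] robotsTxt =
                ("User-agent: mycrawler\nDisallow: /private/\n").getBytes(StandardCharsets.UTF_8);
        BaseRobotRules rules =
                parser.parseContent("https://example.com/robots.txt", robotsTxt, "text/plain", robotNames);
        System.out.println(rules.isAllowed("https://example.com/private/page")); // false
        System.out.println(rules.isAllowed("https://example.com/public/page"));  // true
    }
}
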
11 changes: 11 additions & 0 deletions archetype/src/main/resources/archetype-resources/crawler-conf.yaml
@@ -37,10 +37,21 @@ config:
- isSitemap
- isFeed

# Agent name info - given here as an example. Do not be an anonymous coward, use your real information!
# The full user agent value sent as part of the HTTP requests
# is built from the elements below. Only the agent.name is mandatory,
# it is also used to parse the robots.txt directives.

# The agent name must be compliant with RFC 9309 (section 2.2.1)
# i.e. it MUST contain only uppercase and lowercase letters ("a-z" and "A-Z"), underscores ("_"), and hyphens ("-")
http.agent.name: "\${http-agent-name}"
# version of your crawler
http.agent.version: "\${http-agent-version}"
# description of what it does
http.agent.description: "\${http-agent-description}"
# URL webmasters can go to to learn about it
http.agent.url: "\${http-agent-url}"
# Finally, an email so that they can get in touch with you
http.agent.email: "\${http-agent-email}"

http.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol"
@@ -126,6 +126,12 @@ public BaseRobotRules getRobotRules(String url) {
@Override
public void cleanup() {}

/**
* Build the user agent from the configuration. Used by the protocol implementation to build the
* requests.
*
* @return full user agent
*/
public static String getAgentString(Config conf) {
String agent = ConfUtils.getString(conf, "http.agent");
if (agent != null && !agent.isEmpty()) {
@@ -23,7 +23,11 @@
import crawlercommons.robots.SimpleRobotRulesParser;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import javax.security.auth.login.Configuration;
import org.apache.storm.Config;
import org.slf4j.Logger;
@@ -78,7 +82,14 @@ public abstract class RobotRulesParser {
ROBOT_PARSER.setMaxCrawlDelay(Long.MAX_VALUE);
}

protected String agentNames;
protected final Collection<String> agentNames = new LinkedHashSet<>();

/**
* Pattern to match valid user-agent product tokens as defined in <a
* href="https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1">RFC 9309, section 2.2.1</a>
*/
protected static final Pattern USER_AGENT_PRODUCT_TOKEN_MATCHER =
Pattern.compile("[a-zA-Z_-]+");

public RobotRulesParser() {}

@@ -91,22 +102,39 @@ public void setConf(Config conf) {
throw new RuntimeException("Agent name not configured!");
}

String configuredAgentNames = ConfUtils.getString(conf, "http.robots.agents", "");
StringTokenizer tok = new StringTokenizer(configuredAgentNames, ",");
agentName = agentName.toLowerCase();
checkAgentValue(agentName);

ArrayList<String> agents = new ArrayList<>();
while (tok.hasMoreTokens()) {
agents.add(tok.nextToken().trim());

List<String> configuredAgentNames = ConfUtils.loadListFromConf("http.robots.agents", conf);
// backward compatibility
// if it has a single entry - parse it
if (configuredAgentNames.size() == 1) {
StringTokenizer tok = new StringTokenizer(configuredAgentNames.get(0), ",");
while (tok.hasMoreTokens()) {
String agent = tok.nextToken().trim().toLowerCase();
checkAgentValue(agent);
agents.add(agent);
}
} else {
for (String ag : configuredAgentNames) {
String agent = ag.trim().toLowerCase();
checkAgentValue(agent);
agents.add(agent);
}
}

/*
* If there are no agents for robots-parsing, use the default agent-string. If both are
* present, our agent-string should be the first one we advertise to robots-parsing.
* If there are no agents for robots-parsing, use the default agent-string. If
* both are present, our agent-string should be the first one we advertise to
* robots-parsing.
*/
if (agents.isEmpty()) {
LOG.info(
"No agents listed in 'http.robots.agents' property! Using http.agent.name [{}]",
agentName);
this.agentNames = agentName;
this.agentNames.add(agentName.toLowerCase());
} else {
int index = 0;
if ((agents.get(0)).equalsIgnoreCase(agentName)) {
@@ -117,13 +145,10 @@ public void setConf(Config conf) {
agentName);
}

StringBuilder combinedAgentsString = new StringBuilder(agentName);
// append all the agents from the http.robots.agents property
for (; index < agents.size(); index++) {
combinedAgentsString.append(", ").append(agents.get(index));
agentNames.add(agents.get(index));
}

this.agentNames = combinedAgentsString.toString();
}

String spec =
@@ -138,17 +163,34 @@ public void setConf(Config conf) {
}

/**
* Parses the robots content using the {@link SimpleRobotRulesParser} from crawler commons
* Check that the agent is valid as defined in <a
* href="https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1">RFC / 9309, section
* 2.2.1</a>
*
* @param agentName
*/
protected static void checkAgentValue(String agentName) {
if (!USER_AGENT_PRODUCT_TOKEN_MATCHER.matcher(agentName).matches()) {
String message =
"Invalid agent name: "
+ agentName
+ ". It MUST contain only uppercase and lowercase letters (\"a-z\" and \"A-Z\"), underscores (\"_\"), and hyphens (\"-\")";
throw new RuntimeException(message);
}
}
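
Editor's note: a quick, self-contained illustration of the product-token check above, using the same pattern; it also shows why the test fixtures further down in this commit change from "this.is.only.a.test" and "this is only a test" to "this_is_only_a_test". The extra agent name "My-Crawler" is purely illustrative:

import java.util.regex.Pattern;

public class AgentNameCheckSketch {
    // Same expression as USER_AGENT_PRODUCT_TOKEN_MATCHER above
    private static final Pattern PRODUCT_TOKEN = Pattern.compile("[a-zA-Z_-]+");

    public static void main(String[] args) {
        String[] names = {
            "this_is_only_a_test", "My-Crawler", "this.is.only.a.test", "this is only a test"
        };
        for (String name : names) {
            System.out.println(name + " -> " + PRODUCT_TOKEN.matcher(name).matches());
        }
        // this_is_only_a_test -> true
        // My-Crawler -> true
        // this.is.only.a.test -> false (dots are not allowed)
        // this is only a test -> false (spaces are not allowed)
    }
}
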

/**
* Parses the robots content using the {@link SimpleRobotRulesParser} from crawler-commons
*
* @param url A string containing url
* @param url A string representation of a URL
* @param content Contents of the robots file in a byte array
* @param contentType The content type of the fetched robots file
* @param robotName A string containing value of
* @param robotNames Collection of robot names
* @return BaseRobotRules object
*/
public BaseRobotRules parseRules(
String url, byte[] content, String contentType, String robotName) {
return ROBOT_PARSER.parseContent(url, content, contentType, robotName);
String url, byte[] content, String contentType, Collection<String> robotNames) {
return ROBOT_PARSER.parseContent(url, content, contentType, robotNames);
}

public BaseRobotRules getRobotRulesSet(Protocol protocol, String url) {
28 changes: 24 additions & 4 deletions core/src/main/resources/crawler-default.yaml
@@ -67,15 +67,35 @@ config:
metadata.track.path: true
metadata.track.depth: true

# agent name info - given here as an example. Do not be an anonymous coward, use your real information!
# the full user agent value sent as part of the HTTP requests
# is built from the elements below
# Agent name info - given here as an example. Do not be an anonymous coward, use your real information!
# The full user agent value sent as part of the HTTP requests
# is built from the elements below. Only the agent.name is mandatory,
# it is also used to parse the robots.txt directives.

# The agent name must be compliant with RFC 9309 (section 2.2.1)
# i.e. it MUST contain only uppercase and lowercase letters ("a-z" and "A-Z"), underscores ("_"), and hyphens ("-")
# http.agent.name: "AnonymousCoward"
# version of your crawler
# http.agent.version: "1.0"
# http.agent.description: "built with StormCrawler ${version}"
# description of what it does
# http.agent.description: "built with StormCrawler"
# URL webmasters can go to to learn about it
# http.agent.url: "http://someorganization.com/"
# Finally, an email so that they can get in touch with you
# http.agent.email: "someone@someorganization.com"

# user-agent name(s), used to select rules from the
# robots.txt file by matching the names against the user-agent
# lines in the robots.txt file. Optional, if empty, the value
# of http.agent.name is used. Otherwise, it must be listed first.
# the tokens must be compliant with RFC 9309 (section 2.2.1).
# http.robots.agents: agent names as a comma-separated string; a YAML list is also accepted

# (advanced) Specify the user agent to send with the HTTP requests
# note that this is not used for parsing the robots.txt and
# therefore you need to have set _http.agent.name_.
# http.agent: "Verbatim user agent"

http.accept.language: "en-us,en-gb,en;q=0.7,*;q=0.3"
http.accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
http.content.limit: -1
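Editor's note: http.robots.agents above can be given either as a single comma-separated string (the historical form) or as a list. A simplified, self-contained distillation of how setConf() normalises both forms into one ordered, lowercased set with http.agent.name kept first; the class and agent names below are illustrative, and the real code additionally validates each token with checkAgentValue():

import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

public class RobotsAgentsConfigSketch {
    // Simplified sketch of the normalisation done in RobotRulesParser.setConf()
    static Set<String> normalise(String agentName, List<String> configured) {
        Set<String> names = new LinkedHashSet<>();
        names.add(agentName.toLowerCase());
        if (configured.size() == 1) {
            // backward compatibility: a single entry may be a comma-separated string
            for (String token : configured.get(0).split(",")) {
                names.add(token.trim().toLowerCase());
            }
        } else {
            for (String token : configured) {
                names.add(token.trim().toLowerCase());
            }
        }
        return names;
    }

    public static void main(String[] args) {
        // Both forms end up as the same ordered set: [mycrawler, otherbot]
        System.out.println(normalise("MyCrawler", List.of("MyCrawler, OtherBot")));
        System.out.println(normalise("MyCrawler", List.of("MyCrawler", "OtherBot")));
    }
}
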
@@ -51,7 +51,7 @@ public void testDodgyURL() throws IOException {
TestOutputCollector output = new TestOutputCollector();

Map config = new HashMap();
config.put("http.agent.name", "this is only a test");
config.put("http.agent.name", "this_is_only_a_test");

bolt.prepare(config, TestUtil.getMockedTopologyContext(), new OutputCollector(output));

@@ -82,7 +82,7 @@ public void test304() {
TestOutputCollector output = new TestOutputCollector();

Map config = new HashMap();
config.put("http.agent.name", "this is only a test");
config.put("http.agent.name", "this_is_only_a_test");

bolt.prepare(config, TestUtil.getMockedTopologyContext(), new OutputCollector(output));

@@ -51,15 +51,15 @@ public class HttpRobotRulesParserTest {

@Before
public void setUp() throws Exception {
conf.put("http.agent.name", "this.is.only.a.test");
conf.put("http.agent.name", "this_is_only_a_test");
ProtocolFactory protocolFactory = ProtocolFactory.getInstance(conf);
protocol = protocolFactory.getProtocol("http")[0];
protocolFactory.cleanup();

String newLine = System.getProperty("line.separator");
body =
new StringBuilder()
.append("User-agent: this.is.only.a.test")
.append("User-agent: this_is_only_a_test")
.append(newLine)
.append("Disallow: /restricted/")
.toString();
@@ -80,7 +80,7 @@ public RemoteDriverProtocol getProtocol() {
capabilities.put("goog:chromeOptions", m);

Config conf = new Config();
conf.put("http.agent.name", "this.is.only.a.test");
conf.put("http.agent.name", "this_is_only_a_test");
conf.put("selenium.addresses", chrome.getSeleniumAddress().toExternalForm());

Map<String, Object> timeouts = new HashMap<>();
@@ -37,10 +37,21 @@ config:
- isSitemap
- isFeed

# Agent name info - given here as an example. Do not be an anonymous coward, use your real information!
# The full user agent value sent as part of the HTTP requests
# is built from the elements below. Only the agent.name is mandatory,
# it is also used to parse the robots.txt directives.

# The agent name must be compliant with RFC 9309 (section 2.2.1)
# i.e. it MUST contain only uppercase and lowercase letters ("a-z" and "A-Z"), underscores ("_"), and hyphens ("-")
http.agent.name: "\${http-agent-name}"
# version of your crawler
http.agent.version: "\${http-agent-version}"
# description of what it does
http.agent.description: "\${http-agent-description}"
# URL webmasters can go to to learn about it
http.agent.url: "\${http-agent-url}"
# Finally, an email so that they can get in touch with you
http.agent.email: "\${http-agent-email}"

http.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol"
@@ -37,10 +37,21 @@ config:
- isSitemap
- isFeed

# Agent name info - given here as an example. Do not be an anonymous coward, use your real information!
# The full user agent value sent as part of the HTTP requests
# is built from the elements below. Only the agent.name is mandatory,
# it is also used to parse the robots.txt directives.

# The agent name must be compliant with RFC 9309 (section 2.2.1)
# i.e. it MUST contain only uppercase and lowercase letters ("a-z" and "A-Z"), underscores ("_"), and hyphens ("-")
http.agent.name: "\${http-agent-name}"
# version of your crawler
http.agent.version: "\${http-agent-version}"
# description of what it does
http.agent.description: "\${http-agent-description}"
# URL webmasters can go to to learn about it
http.agent.url: "\${http-agent-url}"
# Finally, an email so that they can get in touch with you
http.agent.email: "\${http-agent-email}"

http.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol"
