From 5740f42e6469970e346f3cd834f7e9878cc44cd8 Mon Sep 17 00:00:00 2001
From: Julien Nioche
Date: Wed, 6 Dec 2023 14:37:27 +0000
Subject: [PATCH] Utilize new SimpleRobotRulesParser API entry point, fix #1086

Signed-off-by: Julien Nioche
---
 .../archetype-resources/crawler-conf.yaml    | 11 +++
 .../protocol/AbstractHttpProtocol.java       |  6 ++
 .../protocol/RobotRulesParser.java           | 76 ++++++++++++++-----
 core/src/main/resources/crawler-default.yaml | 28 ++++++-
 .../bolt/AbstractFetcherBoltTest.java        |  4 +-
 .../protocol/HttpRobotRulesParserTest.java   |  4 +-
 .../protocol/selenium/ProtocolTest.java      |  2 +-
 .../archetype-resources/crawler-conf.yaml    | 11 +++
 .../archetype-resources/crawler-conf.yaml    | 11 +++
 9 files changed, 127 insertions(+), 26 deletions(-)

diff --git a/archetype/src/main/resources/archetype-resources/crawler-conf.yaml b/archetype/src/main/resources/archetype-resources/crawler-conf.yaml
index b2ced4764..9a349b3ea 100644
--- a/archetype/src/main/resources/archetype-resources/crawler-conf.yaml
+++ b/archetype/src/main/resources/archetype-resources/crawler-conf.yaml
@@ -37,10 +37,21 @@ config:
   - isSitemap
   - isFeed
 
+  # Agent name info - given here as an example. Do not be an anonymous coward, use your real information!
+  # The full user agent value sent as part of the HTTP requests
+  # is built from the elements below. Only the agent.name is mandatory,
+  # it is also used to parse the robots.txt directives.
+
+  # The agent name must be compliant with RFC 9309 (section 2.2.1)
+  # i.e. it MUST contain only uppercase and lowercase letters ("a-z" and "A-Z"), underscores ("_"), and hyphens ("-")
   http.agent.name: "\${http-agent-name}"
+  # version of your crawler
   http.agent.version: "\${http-agent-version}"
+  # description of what it does
   http.agent.description: "\${http-agent-description}"
+  # URL webmasters can visit to learn about it
   http.agent.url: "\${http-agent-url}"
+  # Finally, an email so that they can get in touch with you
   http.agent.email: "\${http-agent-email}"
 
   http.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol"
diff --git a/core/src/main/java/com/digitalpebble/stormcrawler/protocol/AbstractHttpProtocol.java b/core/src/main/java/com/digitalpebble/stormcrawler/protocol/AbstractHttpProtocol.java
index 3451a72a1..0686698da 100644
--- a/core/src/main/java/com/digitalpebble/stormcrawler/protocol/AbstractHttpProtocol.java
+++ b/core/src/main/java/com/digitalpebble/stormcrawler/protocol/AbstractHttpProtocol.java
@@ -126,6 +126,12 @@ public BaseRobotRules getRobotRules(String url) {
     @Override
     public void cleanup() {}
 
+    /**
+     * Build the user agent from the configuration. Used by the protocol implementation to build
+     * the requests.
+     *
+     * @return full user agent
+     */
     public static String getAgentString(Config conf) {
         String agent = ConfUtils.getString(conf, "http.agent");
         if (agent != null && !agent.isEmpty()) {
diff --git a/core/src/main/java/com/digitalpebble/stormcrawler/protocol/RobotRulesParser.java b/core/src/main/java/com/digitalpebble/stormcrawler/protocol/RobotRulesParser.java
index a81e4a278..7164e0e5a 100644
--- a/core/src/main/java/com/digitalpebble/stormcrawler/protocol/RobotRulesParser.java
+++ b/core/src/main/java/com/digitalpebble/stormcrawler/protocol/RobotRulesParser.java
@@ -23,7 +23,11 @@
 import crawlercommons.robots.SimpleRobotRulesParser;
 import java.net.URL;
 import java.util.ArrayList;
+import java.util.Collection;
+import java.util.LinkedHashSet;
+import java.util.List;
 import java.util.StringTokenizer;
+import java.util.regex.Pattern;
 import javax.security.auth.login.Configuration;
 import org.apache.storm.Config;
 import org.slf4j.Logger;
@@ -78,7 +82,14 @@ public abstract class RobotRulesParser {
         ROBOT_PARSER.setMaxCrawlDelay(Long.MAX_VALUE);
     }
 
-    protected String agentNames;
+    protected final Collection<String> agentNames = new LinkedHashSet<>();
+
+    /**
+     * Pattern to match valid user-agent product tokens as defined in RFC 9309, section 2.2.1
+     */
+    protected static final Pattern USER_AGENT_PRODUCT_TOKEN_MATCHER =
+            Pattern.compile("[a-zA-Z_-]+");
 
     public RobotRulesParser() {}
 
@@ -91,22 +102,39 @@ public void setConf(Config conf) {
             throw new RuntimeException("Agent name not configured!");
         }
 
-        String configuredAgentNames = ConfUtils.getString(conf, "http.robots.agents", "");
-        StringTokenizer tok = new StringTokenizer(configuredAgentNames, ",");
+        agentName = agentName.toLowerCase();
+        checkAgentValue(agentName);
+
         ArrayList<String> agents = new ArrayList<>();
-        while (tok.hasMoreTokens()) {
-            agents.add(tok.nextToken().trim());
+
+        List<String> configuredAgentNames = ConfUtils.loadListFromConf("http.robots.agents", conf);
+        // backward compatibility
+        // if it has a single entry - parse it
+        if (configuredAgentNames.size() == 1) {
+            StringTokenizer tok = new StringTokenizer(configuredAgentNames.get(0), ",");
+            while (tok.hasMoreTokens()) {
+                String agent = tok.nextToken().trim().toLowerCase();
+                checkAgentValue(agent);
+                agents.add(agent);
+            }
+        } else {
+            for (String ag : configuredAgentNames) {
+                String agent = ag.trim().toLowerCase();
+                checkAgentValue(agent);
+                agents.add(agent);
+            }
         }
 
         /*
-         * If there are no agents for robots-parsing, use the default agent-string. If both are
-         * present, our agent-string should be the first one we advertise to robots-parsing.
+         * If there are no agents for robots-parsing, use the default agent-string. If
+         * both are present, our agent-string should be the first one we advertise to
+         * robots-parsing.
          */
         if (agents.isEmpty()) {
             LOG.info(
                     "No agents listed in 'http.robots.agents' property! Using http.agent.name [{}]",
                     agentName);
-            this.agentNames = agentName;
+            this.agentNames.add(agentName.toLowerCase());
         } else {
             int index = 0;
             if ((agents.get(0)).equalsIgnoreCase(agentName)) {
@@ -117,13 +145,10 @@ public void setConf(Config conf) {
                         agentName);
             }
 
-            StringBuilder combinedAgentsString = new StringBuilder(agentName);
             // append all the agents from the http.robots.agents property
             for (; index < agents.size(); index++) {
-                combinedAgentsString.append(", ").append(agents.get(index));
+                agentNames.add(agents.get(index));
             }
-
-            this.agentNames = combinedAgentsString.toString();
         }
 
         String spec =
@@ -138,17 +163,34 @@
     }
 
     /**
-     * Parses the robots content using the {@link SimpleRobotRulesParser} from crawler commons
+     * Check that the agent is valid as defined in RFC 9309, section 2.2.1
+     *
+     * @param agentName the agent name to check
+     */
+    protected static void checkAgentValue(String agentName) {
+        if (!USER_AGENT_PRODUCT_TOKEN_MATCHER.matcher(agentName).matches()) {
+            String message =
+                    "Invalid agent name: "
+                            + agentName
+                            + ". It MUST contain only uppercase and lowercase letters (\"a-z\" and \"A-Z\"), underscores (\"_\"), and hyphens (\"-\")";
+            throw new RuntimeException(message);
+        }
+    }
+
+    /**
+     * Parses the robots content using the {@link SimpleRobotRulesParser} from crawler-commons
      *
-     * @param url A string containing url
+     * @param url A string representation of a URL
      * @param content Contents of the robots file in a byte array
      * @param contentType The
-     * @param robotName A string containing value of
+     * @param robotNames Collection of robot names
      * @return BaseRobotRules object
      */
     public BaseRobotRules parseRules(
-            String url, byte[] content, String contentType, String robotName) {
-        return ROBOT_PARSER.parseContent(url, content, contentType, robotName);
+            String url, byte[] content, String contentType, Collection<String> robotNames) {
+        return ROBOT_PARSER.parseContent(url, content, contentType, robotNames);
     }
 
     public BaseRobotRules getRobotRulesSet(Protocol protocol, String url) {
diff --git a/core/src/main/resources/crawler-default.yaml b/core/src/main/resources/crawler-default.yaml
index 92ef7cf5b..324c734d2 100644
--- a/core/src/main/resources/crawler-default.yaml
+++ b/core/src/main/resources/crawler-default.yaml
@@ -67,15 +67,35 @@ config:
   metadata.track.path: true
   metadata.track.depth: true
 
-  # agent name info - given here as an example. Do not be an anonynmous coward, use your real information!
-  # the full user agent value sent as part of the HTTP requests
-  # is built from the elements below
+  # Agent name info - given here as an example. Do not be an anonymous coward, use your real information!
+  # The full user agent value sent as part of the HTTP requests
+  # is built from the elements below. Only the agent.name is mandatory,
+  # it is also used to parse the robots.txt directives.
+
+  # The agent name must be compliant with RFC 9309 (section 2.2.1)
+  # i.e. it MUST contain only uppercase and lowercase letters ("a-z" and "A-Z"), underscores ("_"), and hyphens ("-")
   # http.agent.name: "AnonymousCoward"
+  # version of your crawler
   # http.agent.version: "1.0"
-  # http.agent.description: "built with StormCrawler ${version}"
+  # description of what it does
+  # http.agent.description: "built with StormCrawler"
+  # URL webmasters can visit to learn about it
   # http.agent.url: "http://someorganization.com/"
+  # Finally, an email so that they can get in touch with you
   # http.agent.email: "someone@someorganization.com"
 
+  # user-agent name(s), used to select rules from the
+  # robots.txt file by matching the names against the user-agent
+  # lines in the robots.txt file. Optional; if empty, the value
+  # of http.agent.name is used. Otherwise, it must be listed first.
+  # The tokens must be compliant with RFC 9309 (section 2.2.1).
+  # http.robots.agents: agents as a comma-separated string, but can also take a list
+
+  # (advanced) Specify the user agent to send with the HTTP requests;
+  # note that this is not used for parsing the robots.txt and
+  # therefore you need to have set _http.agent.name_.
+  # http.agent: "Verbatim user agent"
+
   http.accept.language: "en-us,en-gb,en;q=0.7,*;q=0.3"
   http.accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
   http.content.limit: -1
diff --git a/core/src/test/java/com/digitalpebble/stormcrawler/bolt/AbstractFetcherBoltTest.java b/core/src/test/java/com/digitalpebble/stormcrawler/bolt/AbstractFetcherBoltTest.java
index 83b7e4c01..b712c42ed 100644
--- a/core/src/test/java/com/digitalpebble/stormcrawler/bolt/AbstractFetcherBoltTest.java
+++ b/core/src/test/java/com/digitalpebble/stormcrawler/bolt/AbstractFetcherBoltTest.java
@@ -51,7 +51,7 @@ public void testDodgyURL() throws IOException {
         TestOutputCollector output = new TestOutputCollector();
 
         Map config = new HashMap();
-        config.put("http.agent.name", "this is only a test");
+        config.put("http.agent.name", "this_is_only_a_test");
 
         bolt.prepare(config, TestUtil.getMockedTopologyContext(), new OutputCollector(output));
 
@@ -82,7 +82,7 @@ public void test304() {
         TestOutputCollector output = new TestOutputCollector();
 
         Map config = new HashMap();
-        config.put("http.agent.name", "this is only a test");
+        config.put("http.agent.name", "this_is_only_a_test");
 
         bolt.prepare(config, TestUtil.getMockedTopologyContext(), new OutputCollector(output));
 
diff --git a/core/src/test/java/com/digitalpebble/stormcrawler/protocol/HttpRobotRulesParserTest.java b/core/src/test/java/com/digitalpebble/stormcrawler/protocol/HttpRobotRulesParserTest.java
index 0f56c06a7..ec837e7a0 100644
--- a/core/src/test/java/com/digitalpebble/stormcrawler/protocol/HttpRobotRulesParserTest.java
+++ b/core/src/test/java/com/digitalpebble/stormcrawler/protocol/HttpRobotRulesParserTest.java
@@ -51,7 +51,7 @@ public class HttpRobotRulesParserTest {
 
     @Before
     public void setUp() throws Exception {
-        conf.put("http.agent.name", "this.is.only.a.test");
+        conf.put("http.agent.name", "this_is_only_a_test");
         ProtocolFactory protocolFactory = ProtocolFactory.getInstance(conf);
         protocol = protocolFactory.getProtocol("http")[0];
         protocolFactory.cleanup();
@@ -59,7 +59,7 @@
         String newLine = System.getProperty("line.separator");
         body =
                 new StringBuilder()
-                        .append("User-agent: this.is.only.a.test")
+                        .append("User-agent: this_is_only_a_test")
                         .append(newLine)
                         .append("Disallow: /restricted/")
                         .toString();
diff --git a/core/src/test/java/com/digitalpebble/stormcrawler/protocol/selenium/ProtocolTest.java b/core/src/test/java/com/digitalpebble/stormcrawler/protocol/selenium/ProtocolTest.java
index 854721db9..d8e14e7b6 100644
--- a/core/src/test/java/com/digitalpebble/stormcrawler/protocol/selenium/ProtocolTest.java
+++ b/core/src/test/java/com/digitalpebble/stormcrawler/protocol/selenium/ProtocolTest.java
@@ -80,7 +80,7 @@ public RemoteDriverProtocol getProtocol() {
         capabilities.put("goog:chromeOptions", m);
 
         Config conf = new Config();
-        conf.put("http.agent.name", "this.is.only.a.test");
+        conf.put("http.agent.name", "this_is_only_a_test");
         conf.put("selenium.addresses", chrome.getSeleniumAddress().toExternalForm());
 
         Map timeouts = new HashMap<>();
diff --git a/external/elasticsearch/archetype/src/main/resources/archetype-resources/crawler-conf.yaml b/external/elasticsearch/archetype/src/main/resources/archetype-resources/crawler-conf.yaml
index 504aecb87..9db2ad466 100644
--- a/external/elasticsearch/archetype/src/main/resources/archetype-resources/crawler-conf.yaml
+++ b/external/elasticsearch/archetype/src/main/resources/archetype-resources/crawler-conf.yaml
@@ -37,10 +37,21 @@ config:
   - isSitemap
   - isFeed
 
+  # Agent name info - given here as an example. Do not be an anonymous coward, use your real information!
+  # The full user agent value sent as part of the HTTP requests
+  # is built from the elements below. Only the agent.name is mandatory,
+  # it is also used to parse the robots.txt directives.
+
+  # The agent name must be compliant with RFC 9309 (section 2.2.1)
+  # i.e. it MUST contain only uppercase and lowercase letters ("a-z" and "A-Z"), underscores ("_"), and hyphens ("-")
   http.agent.name: "\${http-agent-name}"
+  # version of your crawler
   http.agent.version: "\${http-agent-version}"
+  # description of what it does
   http.agent.description: "\${http-agent-description}"
+  # URL webmasters can visit to learn about it
   http.agent.url: "\${http-agent-url}"
+  # Finally, an email so that they can get in touch with you
   http.agent.email: "\${http-agent-email}"
 
   http.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol"
diff --git a/external/opensearch/archetype/src/main/resources/archetype-resources/crawler-conf.yaml b/external/opensearch/archetype/src/main/resources/archetype-resources/crawler-conf.yaml
index 504aecb87..9db2ad466 100644
--- a/external/opensearch/archetype/src/main/resources/archetype-resources/crawler-conf.yaml
+++ b/external/opensearch/archetype/src/main/resources/archetype-resources/crawler-conf.yaml
@@ -37,10 +37,21 @@ config:
   - isSitemap
   - isFeed
 
+  # Agent name info - given here as an example. Do not be an anonymous coward, use your real information!
+  # The full user agent value sent as part of the HTTP requests
+  # is built from the elements below. Only the agent.name is mandatory,
+  # it is also used to parse the robots.txt directives.
+
+  # The agent name must be compliant with RFC 9309 (section 2.2.1)
+  # i.e. it MUST contain only uppercase and lowercase letters ("a-z" and "A-Z"), underscores ("_"), and hyphens ("-")
   http.agent.name: "\${http-agent-name}"
+  # version of your crawler
   http.agent.version: "\${http-agent-version}"
+  # description of what it does
   http.agent.description: "\${http-agent-description}"
+  # URL webmasters can visit to learn about it
   http.agent.url: "\${http-agent-url}"
+  # Finally, an email so that they can get in touch with you
   http.agent.email: "\${http-agent-email}"
 
   http.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol"
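
For illustration, a minimal crawler-conf.yaml sketch of the agent settings described by the comments added in this patch; the agent names used ("my-crawler", "my-crawler-dev") are hypothetical placeholders, not values taken from the patch:

config:
  # Mandatory; must match [a-zA-Z_-]+ (RFC 9309, section 2.2.1),
  # otherwise RobotRulesParser.setConf() rejects it with a RuntimeException.
  http.agent.name: "my-crawler"

  # Optional extra tokens matched against the User-agent lines in robots.txt.
  # Either form is accepted: a comma-separated string ...
  # http.robots.agents: "my-crawler,my-crawler-dev"
  # ... or a YAML list, with the value of http.agent.name listed first.
  http.robots.agents:
    - "my-crawler"
    - "my-crawler-dev"

With either form, the names are now handed to the crawler-commons SimpleRobotRulesParser as a collection of product tokens rather than as a single combined string.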