Utilize new SimpleRobotRulesParser API entry point, fix #1086
Signed-off-by: Julien Nioche <julien@digitalpebble.com>
jnioche committed Dec 6, 2023
1 parent 5f83770 commit 5740f42
Showing 9 changed files with 127 additions and 26 deletions.
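Editor's note: the core of this change is moving from a single comma-separated agent string to the crawler-commons entry point that accepts a collection of agent names, as used in RobotRulesParser.parseRules() further down. A minimal, self-contained sketch of that entry point, assuming the crawler-commons version this commit relies on; the agent name, robots.txt content and URLs are illustrative only:

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.LinkedHashSet;

public class RobotsApiSketch {
    public static void main(String[] args) {
        SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
        // Agent names are passed as a collection; StormCrawler lowercases them
        // and keeps http.agent.name first (see RobotRulesParser.setConf below)
        Collection<String> robotNames = new LinkedHashSet<>();
        robotNames.add("mycrawler");
        byte[] robotsTxt =
                ("User-agent: mycrawler\nDisallow: /private/\n").getBytes(StandardCharsets.UTF_8);
        BaseRobotRules rules =
                parser.parseContent("https://example.com/robots.txt", robotsTxt, "text/plain", robotNames);
        System.out.println(rules.isAllowed("https://example.com/private/page")); // false
        System.out.println(rules.isAllowed("https://example.com/public/page"));  // true
    }
}
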
11 changes: 11 additions & 0 deletions archetype/src/main/resources/archetype-resources/crawler-conf.yaml
@@ -37,10 +37,21 @@ config:
- isSitemap
- isFeed

# Agent name info - given here as an example. Do not be an anonymous coward, use your real information!
# The full user agent value sent as part of the HTTP requests
# is built from the elements below. Only the agent.name is mandatory,
# it is also used to parse the robots.txt directives.

# The agent name must be compliant with RFC 9309 (section 2.2.1)
# i.e. it MUST contain only uppercase and lowercase letters ("a-z" and "A-Z"), underscores ("_"), and hyphens ("-")
http.agent.name: "\${http-agent-name}"
# version of your crawler
http.agent.version: "\${http-agent-version}"
# description of what it does
http.agent.description: "\${http-agent-description}"
# URL webmasters can go to to learn about it
http.agent.url: "\${http-agent-url}"
# Finally, an email so that they can get in touch with you
http.agent.email: "\${http-agent-email}"

http.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol"
@@ -126,6 +126,12 @@ public BaseRobotRules getRobotRules(String url) {
@Override
public void cleanup() {}

/**
* Build the user agent from the configuration. Used by the protocol implementation to build the
* requests.
*
* @return full user agent
*/
public static String getAgentString(Config conf) {
String agent = ConfUtils.getString(conf, "http.agent");
if (agent != null && !agent.isEmpty()) {
@@ -23,7 +23,11 @@
import crawlercommons.robots.SimpleRobotRulesParser;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import javax.security.auth.login.Configuration;
import org.apache.storm.Config;
import org.slf4j.Logger;
@@ -78,7 +82,14 @@ public abstract class RobotRulesParser {
ROBOT_PARSER.setMaxCrawlDelay(Long.MAX_VALUE);
}

protected String agentNames;
protected final Collection<String> agentNames = new LinkedHashSet<>();

/**
* Pattern to match valid user-agent product tokens as defined in <a
* href="https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1">RFC 9309, section 2.2.1</a>
*/
protected static final Pattern USER_AGENT_PRODUCT_TOKEN_MATCHER =
Pattern.compile("[a-zA-Z_-]+");

public RobotRulesParser() {}

@@ -91,22 +102,39 @@ public void setConf(Config conf) {
throw new RuntimeException("Agent name not configured!");
}

String configuredAgentNames = ConfUtils.getString(conf, "http.robots.agents", "");
StringTokenizer tok = new StringTokenizer(configuredAgentNames, ",");
agentName = agentName.toLowerCase();
checkAgentValue(agentName);

ArrayList<String> agents = new ArrayList<>();
while (tok.hasMoreTokens()) {
agents.add(tok.nextToken().trim());

List<String> configuredAgentNames = ConfUtils.loadListFromConf("http.robots.agents", conf);
// backward compatibility
// if it has a single entry - parse it
if (configuredAgentNames.size() == 1) {
StringTokenizer tok = new StringTokenizer(configuredAgentNames.get(0), ",");
while (tok.hasMoreTokens()) {
String agent = tok.nextToken().trim().toLowerCase();
checkAgentValue(agent);
agents.add(agent);
}
} else {
for (String ag : configuredAgentNames) {
String agent = ag.trim().toLowerCase();
checkAgentValue(agent);
agents.add(agent);
}
}

/*
* If there are no agents for robots-parsing, use the default agent-string. If both are
* present, our agent-string should be the first one we advertise to robots-parsing.
* If there are no agents for robots-parsing, use the default agent-string. If
* both are present, our agent-string should be the first one we advertise to
* robots-parsing.
*/
if (agents.isEmpty()) {
LOG.info(
"No agents listed in 'http.robots.agents' property! Using http.agent.name [{}]",
agentName);
this.agentNames = agentName;
this.agentNames.add(agentName.toLowerCase());
} else {
int index = 0;
if ((agents.get(0)).equalsIgnoreCase(agentName)) {
@@ -117,13 +145,10 @@ public void setConf(Config conf) {
agentName);
}

StringBuilder combinedAgentsString = new StringBuilder(agentName);
// append all the agents from the http.robots.agents property
for (; index < agents.size(); index++) {
combinedAgentsString.append(", ").append(agents.get(index));
agentNames.add(agents.get(index));
}

this.agentNames = combinedAgentsString.toString();
}

String spec =
@@ -138,17 +163,34 @@ public void setConf(Config conf) {
}

/**
* Parses the robots content using the {@link SimpleRobotRulesParser} from crawler commons
* Check that the agent is valid as defined in <a
* href="https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1">RFC / 9309, section
* 2.2.1</a>
*
* @param agentName
*/
protected static void checkAgentValue(String agentName) {
if (!USER_AGENT_PRODUCT_TOKEN_MATCHER.matcher(agentName).matches()) {
String message =
"Invalid agent name: "
+ agentName
+ ". It MUST contain only uppercase and lowercase letters (\"a-z\" and \"A-Z\"), underscores (\"_\"), and hyphens (\"-\")";
throw new RuntimeException(message);
}
}
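
Editor's note: a quick, self-contained illustration of the product-token check above, using the same pattern; it also shows why the test fixtures further down in this commit change from "this.is.only.a.test" and "this is only a test" to "this_is_only_a_test". The extra agent name "My-Crawler" is purely illustrative:

import java.util.regex.Pattern;

public class AgentNameCheckSketch {
    // Same expression as USER_AGENT_PRODUCT_TOKEN_MATCHER above
    private static final Pattern PRODUCT_TOKEN = Pattern.compile("[a-zA-Z_-]+");

    public static void main(String[] args) {
        String[] names = {
            "this_is_only_a_test", "My-Crawler", "this.is.only.a.test", "this is only a test"
        };
        for (String name : names) {
            System.out.println(name + " -> " + PRODUCT_TOKEN.matcher(name).matches());
        }
        // this_is_only_a_test -> true
        // My-Crawler -> true
        // this.is.only.a.test -> false (dots are not allowed)
        // this is only a test -> false (spaces are not allowed)
    }
}
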

/**
* Parses the robots content using the {@link SimpleRobotRulesParser} from crawler-commons
*
* @param url A string containing url
* @param url A string representation of a URL
* @param content Contents of the robots file in a byte array
* @param contentType The content type of the fetched robots file
* @param robotName A string containing value of
* @param robotNames Collection of robot names
* @return BaseRobotRules object
*/
public BaseRobotRules parseRules(
String url, byte[] content, String contentType, String robotName) {
return ROBOT_PARSER.parseContent(url, content, contentType, robotName);
String url, byte[] content, String contentType, Collection<String> robotNames) {
return ROBOT_PARSER.parseContent(url, content, contentType, robotNames);
}

public BaseRobotRules getRobotRulesSet(Protocol protocol, String url) {
28 changes: 24 additions & 4 deletions core/src/main/resources/crawler-default.yaml
@@ -67,15 +67,35 @@ config:
metadata.track.path: true
metadata.track.depth: true

# agent name info - given here as an example. Do not be an anonymous coward, use your real information!
# the full user agent value sent as part of the HTTP requests
# is built from the elements below
# Agent name info - given here as an example. Do not be an anonymous coward, use your real information!
# The full user agent value sent as part of the HTTP requests
# is built from the elements below. Only the agent.name is mandatory,
# it is also used to parse the robots.txt directives.

# The agent name must be compliant with RFC 9309 (section 2.2.1)
# i.e. it MUST contain only uppercase and lowercase letters ("a-z" and "A-Z"), underscores ("_"), and hyphens ("-")
# http.agent.name: "AnonymousCoward"
# version of your crawler
# http.agent.version: "1.0"
# http.agent.description: "built with StormCrawler ${version}"
# description of what it does
# http.agent.description: "built with StormCrawler"
# URL webmasters can go to to learn about it
# http.agent.url: "http://someorganization.com/"
# Finally, an email so that they can get in touch with you
# http.agent.email: "someone@someorganization.com"

# user-agent name(s), used to select rules from the
# robots.txt file by matching the names against the user-agent
# lines in the robots.txt file. Optional, if empty, the value
# of http.agent.name is used. Otherwise, it must be listed first.
# the tokens must be compliant with RFC 9309 (section 2.2.1).
# http.robots.agents: agent names as a comma-separated string; a YAML list is also accepted

# (advanced) Specify the user agent to send with the HTTP requests
# note that this is not used for parsing the robots.txt and
# therefore you need to have set _http.agent.name_.
# http.agent: "Verbatim user agent"

http.accept.language: "en-us,en-gb,en;q=0.7,*;q=0.3"
http.accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
http.content.limit: -1
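Editor's note: http.robots.agents above can be given either as a single comma-separated string (the historical form) or as a list. A simplified, self-contained distillation of how setConf() normalises both forms into one ordered, lowercased set with http.agent.name kept first; the class and agent names below are illustrative, and the real code additionally validates each token with checkAgentValue():

import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

public class RobotsAgentsConfigSketch {
    // Simplified sketch of the normalisation done in RobotRulesParser.setConf()
    static Set<String> normalise(String agentName, List<String> configured) {
        Set<String> names = new LinkedHashSet<>();
        names.add(agentName.toLowerCase());
        if (configured.size() == 1) {
            // backward compatibility: a single entry may be a comma-separated string
            for (String token : configured.get(0).split(",")) {
                names.add(token.trim().toLowerCase());
            }
        } else {
            for (String token : configured) {
                names.add(token.trim().toLowerCase());
            }
        }
        return names;
    }

    public static void main(String[] args) {
        // Both forms end up as the same ordered set: [mycrawler, otherbot]
        System.out.println(normalise("MyCrawler", List.of("MyCrawler, OtherBot")));
        System.out.println(normalise("MyCrawler", List.of("MyCrawler", "OtherBot")));
    }
}
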
@@ -51,7 +51,7 @@ public void testDodgyURL() throws IOException {
TestOutputCollector output = new TestOutputCollector();

Map config = new HashMap();
config.put("http.agent.name", "this is only a test");
config.put("http.agent.name", "this_is_only_a_test");

bolt.prepare(config, TestUtil.getMockedTopologyContext(), new OutputCollector(output));

@@ -82,7 +82,7 @@ public void test304() {
TestOutputCollector output = new TestOutputCollector();

Map config = new HashMap();
config.put("http.agent.name", "this is only a test");
config.put("http.agent.name", "this_is_only_a_test");

bolt.prepare(config, TestUtil.getMockedTopologyContext(), new OutputCollector(output));

@@ -51,15 +51,15 @@ public class HttpRobotRulesParserTest {

@Before
public void setUp() throws Exception {
conf.put("http.agent.name", "this.is.only.a.test");
conf.put("http.agent.name", "this_is_only_a_test");
ProtocolFactory protocolFactory = ProtocolFactory.getInstance(conf);
protocol = protocolFactory.getProtocol("http")[0];
protocolFactory.cleanup();

String newLine = System.getProperty("line.separator");
body =
new StringBuilder()
.append("User-agent: this.is.only.a.test")
.append("User-agent: this_is_only_a_test")
.append(newLine)
.append("Disallow: /restricted/")
.toString();
@@ -80,7 +80,7 @@ public RemoteDriverProtocol getProtocol() {
capabilities.put("goog:chromeOptions", m);

Config conf = new Config();
conf.put("http.agent.name", "this.is.only.a.test");
conf.put("http.agent.name", "this_is_only_a_test");
conf.put("selenium.addresses", chrome.getSeleniumAddress().toExternalForm());

Map<String, Object> timeouts = new HashMap<>();
@@ -37,10 +37,21 @@ config:
- isSitemap
- isFeed

# Agent name info - given here as an example. Do not be an anonymous coward, use your real information!
# The full user agent value sent as part of the HTTP requests
# is built from the elements below. Only the agent.name is mandatory,
# it is also used to parse the robots.txt directives.

# The agent name must be compliant with RFC 9309 (section 2.2.1)
# i.e. it MUST contain only uppercase and lowercase letters ("a-z" and "A-Z"), underscores ("_"), and hyphens ("-")
http.agent.name: "\${http-agent-name}"
# version of your crawler
http.agent.version: "\${http-agent-version}"
# description of what it does
http.agent.description: "\${http-agent-description}"
# URL webmasters can go to to learn about it
http.agent.url: "\${http-agent-url}"
# Finally, an email so that they can get in touch with you
http.agent.email: "\${http-agent-email}"

http.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol"
@@ -37,10 +37,21 @@ config:
- isSitemap
- isFeed

# Agent name info - given here as an example. Do not be an anonymous coward, use your real information!
# The full user agent value sent as part of the HTTP requests
# is built from the elements below. Only the agent.name is mandatory,
# it is also used to parse the robots.txt directives.

# The agent name must be compliant with RFC 9309 (section 2.2.1)
# i.e. it MUST contain only uppercase and lowercase letters ("a-z" and "A-Z"), underscores ("_"), and hyphens ("-")
http.agent.name: "\${http-agent-name}"
# version of your crawler
http.agent.version: "\${http-agent-version}"
# description of what it does
http.agent.description: "\${http-agent-description}"
# URL webmasters can go to to learn about it
http.agent.url: "\${http-agent-url}"
# Finally, an email so that they can get in touch with you
http.agent.email: "\${http-agent-email}"

http.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol"
