From e9d0edeeb79f6cb25234a166290914524d2566e8 Mon Sep 17 00:00:00 2001 From: Julien Nioche Date: Sun, 29 Oct 2023 10:13:23 +0000 Subject: [PATCH] Applied formatting with new version of the plugin Signed-off-by: Julien Nioche --- .../stormcrawler/filtering/basic/BasicURLNormalizer.java | 1 + .../stormcrawler/persistence/AbstractQueryingSpout.java | 1 + .../stormcrawler/protocol/ProtocolResponse.java | 6 ++++++ .../digitalpebble/stormcrawler/warc/WARCRecordFormat.java | 2 ++ 4 files changed, 10 insertions(+) diff --git a/core/src/main/java/com/digitalpebble/stormcrawler/filtering/basic/BasicURLNormalizer.java b/core/src/main/java/com/digitalpebble/stormcrawler/filtering/basic/BasicURLNormalizer.java index b383d7e9b..fccec6c6f 100644 --- a/core/src/main/java/com/digitalpebble/stormcrawler/filtering/basic/BasicURLNormalizer.java +++ b/core/src/main/java/com/digitalpebble/stormcrawler/filtering/basic/BasicURLNormalizer.java @@ -44,6 +44,7 @@ public class BasicURLNormalizer extends URLFilter { private static final Logger LOG = LoggerFactory.getLogger(BasicURLNormalizer.class); + /** Nutch 1098 - finds URL encoded parts of the URL */ private static final Pattern unescapeRulePattern = Pattern.compile("%([0-9A-Fa-f]{2})"); diff --git a/core/src/main/java/com/digitalpebble/stormcrawler/persistence/AbstractQueryingSpout.java b/core/src/main/java/com/digitalpebble/stormcrawler/persistence/AbstractQueryingSpout.java index 7e2140d23..bb06a09a2 100644 --- a/core/src/main/java/com/digitalpebble/stormcrawler/persistence/AbstractQueryingSpout.java +++ b/core/src/main/java/com/digitalpebble/stormcrawler/persistence/AbstractQueryingSpout.java @@ -52,6 +52,7 @@ public abstract class AbstractQueryingSpout extends BaseRichSpout { * 30 secs. */ protected static final String StatusTTLPurgatory = "spout.ttl.purgatory"; + /** * Min time to allow between 2 successive queries to the backend. Value in msecs, default 2000. */ diff --git a/core/src/main/java/com/digitalpebble/stormcrawler/protocol/ProtocolResponse.java b/core/src/main/java/com/digitalpebble/stormcrawler/protocol/ProtocolResponse.java index 8b661bd20..48b0c0ef3 100644 --- a/core/src/main/java/com/digitalpebble/stormcrawler/protocol/ProtocolResponse.java +++ b/core/src/main/java/com/digitalpebble/stormcrawler/protocol/ProtocolResponse.java @@ -23,25 +23,31 @@ public class ProtocolResponse { * implementation and if http.store.headers is true). */ public static final String REQUEST_HEADERS_KEY = "_request.headers_"; + /** Key which holds the verbatim HTTP response headers in metadata. */ public static final String RESPONSE_HEADERS_KEY = "_response.headers_"; + /** * Key which holds the IP address of the server the request was sent to (response received from) * in metadata. */ public static final String RESPONSE_IP_KEY = "_response.ip_"; + /** Key which holds the request time (begin of request) in metadata. */ public static final String REQUEST_TIME_KEY = "_request.time_"; + /** * Key which holds the protocol version(s) used for this request (for layered protocols this * field may hold multiple comma-separated values) */ public static final String PROTOCOL_VERSIONS_KEY = "_protocol_versions_"; + /** * Metadata key which holds a boolean value in metadata whether the response content is trimmed * or not. */ public static final String TRIMMED_RESPONSE_KEY = "http.trimmed"; + /** * Metadata key which holds the reason why content has been trimmed, see {@link * TrimmedContentReason}. diff --git a/external/warc/src/main/java/com/digitalpebble/stormcrawler/warc/WARCRecordFormat.java b/external/warc/src/main/java/com/digitalpebble/stormcrawler/warc/WARCRecordFormat.java index 66d3bb61b..ad4e10d4d 100644 --- a/external/warc/src/main/java/com/digitalpebble/stormcrawler/warc/WARCRecordFormat.java +++ b/external/warc/src/main/java/com/digitalpebble/stormcrawler/warc/WARCRecordFormat.java @@ -50,8 +50,10 @@ public class WARCRecordFormat implements RecordFormat { // http://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warc-record-types /** WARC record type to hold a HTTP request */ protected static final String WARC_TYPE_REQUEST = "request"; + /** WARC record type to hold a HTTP response */ protected static final String WARC_TYPE_RESPONSE = "response"; + /** * WARC record type to hold any other resource, including a HTTP response with no HTTP headers * available