diff --git a/conf/config.xml b/conf/config.xml
index fefdfe16..bb175f1b 100644
--- a/conf/config.xml
+++ b/conf/config.xml
@@ -147,7 +147,7 @@
   de.pangaea.metadataportal.harvester.OAIHarvester
-  http://ws.pangaea.de/oai/
+  https://ws.pangaea.de/oai/provider
   dif
   CARBOOCEAN
   false
diff --git a/src/de/pangaea/metadataportal/config/Config.java b/src/de/pangaea/metadataportal/config/Config.java
index 544a6b5e..9297e419 100644
--- a/src/de/pangaea/metadataportal/config/Config.java
+++ b/src/de/pangaea/metadataportal/config/Config.java
@@ -20,7 +20,6 @@
 import java.io.InputStream;
 import java.io.Reader;
 import java.lang.reflect.InvocationTargetException;
-import java.net.CookieHandler;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.nio.charset.StandardCharsets;
@@ -59,7 +58,6 @@
 import de.pangaea.metadataportal.utils.ExtendedDigester;
 import de.pangaea.metadataportal.utils.HostAndPort;
 import de.pangaea.metadataportal.utils.PublicForDigesterUse;
-import de.pangaea.metadataportal.utils.SimpleCookieHandler;
 import de.pangaea.metadataportal.utils.StaticFactories;
 
 /**
@@ -80,13 +78,6 @@ public Config(Path file) throws Exception {
 
     log.info(Package.getFullPackageDescription());
 
-    final CookieHandler defCookieH = CookieHandler.getDefault();
-    if (defCookieH != null && defCookieH != SimpleCookieHandler.INSTANCE) {
-      log.warn("There is a CookieHandler already registered with the JVM, panFMP's customized HTTP cookie handling will be not available during harvesting.");
-    } else {
-      CookieHandler.setDefault(SimpleCookieHandler.INSTANCE);
-    }
-
     try {
       dig = new ExtendedDigester();
       dig.setNamespaceAware(true);
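
The JVM-global CookieHandler registration removed above moves into each harvester: cookie state now lives in a per-instance CookieManager handed to a java.net.http.HttpClient (see OAIHarvesterBase and WebCrawlingHarvester below). A minimal sketch of that pattern; the 30-second timeout is a placeholder for the configured "timeoutAfterSeconds" value:

    import java.net.CookieManager;
    import java.net.CookiePolicy;
    import java.net.http.HttpClient;
    import java.time.Duration;

    // each harvester instance gets its own cookie store; nothing is registered JVM-wide
    HttpClient client = HttpClient.newBuilder()
        .followRedirects(HttpClient.Redirect.NORMAL)
        .connectTimeout(Duration.ofSeconds(30)) // placeholder; real code uses "timeoutAfterSeconds"
        .cookieHandler(new CookieManager(null, CookiePolicy.ACCEPT_ORIGINAL_SERVER))
        .build();

Because the cookie store is owned by the client instance, the thread-local enable/disable bookkeeping of SimpleCookieHandler becomes unnecessary.
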
diff --git a/src/de/pangaea/metadataportal/harvester/OAIHarvesterBase.java b/src/de/pangaea/metadataportal/harvester/OAIHarvesterBase.java
index 206d2a42..a821e33d 100644
--- a/src/de/pangaea/metadataportal/harvester/OAIHarvesterBase.java
+++ b/src/de/pangaea/metadataportal/harvester/OAIHarvesterBase.java
@@ -18,19 +18,27 @@
 import java.io.IOException;
 import java.io.InputStream;
+import java.net.CookieManager;
+import java.net.CookiePolicy;
 import java.net.HttpURLConnection;
-import java.net.MalformedURLException;
-import java.net.URL;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.http.HttpClient;
+import java.net.http.HttpClient.Redirect;
+import java.net.http.HttpRequest;
+import java.net.http.HttpResponse;
+import java.net.http.HttpResponse.BodyHandlers;
 import java.nio.charset.StandardCharsets;
+import java.time.Duration;
 import java.time.Instant;
+import java.time.ZoneOffset;
+import java.time.format.DateTimeFormatter;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Locale;
 import java.util.Set;
 import java.util.concurrent.atomic.AtomicReference;
 import java.util.function.Supplier;
-import java.util.zip.GZIPInputStream;
-import java.util.zip.InflaterInputStream;
 
 import org.apache.commons.digester.AbstractObjectCreationFactory;
 import org.apache.commons.digester.ObjectCreationFactory;
@@ -43,8 +51,8 @@
 import de.pangaea.metadataportal.processor.MetadataDocument;
 import de.pangaea.metadataportal.utils.BooleanParser;
 import de.pangaea.metadataportal.utils.ExtendedDigester;
+import de.pangaea.metadataportal.utils.HttpClientUtils;
 import de.pangaea.metadataportal.utils.HugeStringHashBuilder;
-import de.pangaea.metadataportal.utils.SimpleCookieHandler;
 
 /**
  * Abstract base class for OAI harvesting support in panFMP. Use one of the
@@ -100,7 +108,7 @@ public abstract class OAIHarvesterBase extends Harvester {
   protected final int retryTime;
   
   /** the timeout from configuration */
-  protected final int timeout;
+  protected final Duration timeout;
   
   /** the authorizationHeader from configuration */
   protected final String authorizationHeader;
@@ -114,6 +122,9 @@ public abstract class OAIHarvesterBase extends Harvester {
   /** Contains all valid identifiers, if not {@code null}. Will be initialized by subclasses. */
   private HugeStringHashBuilder validIdentifiersBuilder = null;
   
+  /** HttpClient to use, configured with correct connect timeout. */
+  protected final HttpClient httpClient;
+  
   /**
    * The harvester should filter incoming documents according to its set
    * metadata. Should be disabled for OAI-PMH protocol with only one set.
@@ -135,7 +146,7 @@ public OAIHarvesterBase(HarvesterConfig iconfig) {
     retryCount = Integer.parseInt(iconfig.properties.getProperty("retryCount", Integer.toString(DEFAULT_RETRY_COUNT)));
     retryTime = Integer.parseInt(iconfig.properties.getProperty("retryAfterSeconds", Integer.toString(DEFAULT_RETRY_TIME)));
-    timeout = Integer.parseInt(iconfig.properties.getProperty("timeoutAfterSeconds", Integer.toString(DEFAULT_TIMEOUT)));
+    timeout = Duration.ofSeconds(Integer.parseInt(iconfig.properties.getProperty("timeoutAfterSeconds", Integer.toString(DEFAULT_TIMEOUT))));
     authorizationHeader = iconfig.properties.getProperty("authorizationHeader");
     metadataPrefix = iconfig.properties.getProperty("metadataPrefix");
     if (metadataPrefix == null) {
@@ -144,12 +155,17 @@ public OAIHarvesterBase(HarvesterConfig iconfig) {
     identifierPrefix = iconfig.properties.getProperty("identifierPrefix", "");
     ignoreDatestamps = BooleanParser.parseBoolean(iconfig.properties.getProperty("ignoreDatestamps", "false"));
     deleteMissingDocuments = BooleanParser.parseBoolean(iconfig.properties.getProperty("deleteMissingDocuments", "true"));
+    
+    // cookies are kept per harvester instance; nothing is registered JVM-wide anymore
+    httpClient = HttpClient.newBuilder()
+        .followRedirects(Redirect.NORMAL)
+        .connectTimeout(timeout)
+        .cookieHandler(new CookieManager(null, CookiePolicy.ACCEPT_ORIGINAL_SERVER))
+        .build();
   }
   
   @Override
   public void open(ElasticsearchConnection es, String targetIndex) throws Exception {
     super.open(es, targetIndex);
-    SimpleCookieHandler.INSTANCE.enable();
     recreateDigester();
   }
@@ -215,7 +231,7 @@ public Object createObject(org.xml.sax.Attributes attributes) {
    */
   protected boolean doParse(Supplier<ExtendedDigester> digSupplier, String url,
       AtomicReference<Instant> checkModifiedDate) throws Exception {
-    URL u = new URL(url);
+    final URI u = new URI(url);
     for (int retry = 0; retry <= retryCount; retry++) {
       try {
         final ExtendedDigester dig = digSupplier.get();
@@ -234,16 +250,11 @@ protected boolean doParse(Supplier<ExtendedDigester> digSupplier, String url,
         // throw the real Exception not the digester one
         if (saxe.getException() != null) throw saxe.getException();
         else throw saxe;
-      } catch (IOException ioe) {
-        int after = retryTime;
-        if (ioe instanceof RetryAfterIOException) {
-          if (retry >= retryCount) throw (IOException) ioe.getCause();
-          log.warn("OAI server returned '503 Service Unavailable' with a 'Retry-After' value being set.");
-          after = ((RetryAfterIOException) ioe).getRetryAfter();
-        } else {
-          if (retry >= retryCount) throw ioe;
-          log.error("OAI server access failed with exception: ", ioe);
-        }
+      } catch (RetryAfterIOException ioe) {
+        if (retry >= retryCount) {
+          // the wrapped cause may be null (message-only constructor); rethrow the exception itself then
+          final Throwable cause = ioe.getCause();
+          throw (cause instanceof IOException) ? (IOException) cause : ioe;
+        }
+        log.warn(ioe.getMessage());
+        final int after = ioe.getRetryAfter();
         log.info("Retrying after " + after + " seconds ("
             + (retryCount - retry) + " retries left)...");
         try {
@@ -271,13 +282,15 @@ protected EntityResolver getEntityResolver(final EntityResolver parent) {
       public InputSource resolveEntity(String publicId, String systemId)
           throws IOException, SAXException {
         try {
-          URL url = new URL(systemId);
-          String proto = url.getProtocol().toLowerCase(Locale.ROOT);
+          URI uri = new URI(systemId);
+          String proto = uri.getScheme().toLowerCase(Locale.ROOT);
           if ("http".equals(proto) || "https".equals(proto)) return getInputSource(
-              url, null);
+              uri, null);
           else return (parent == null) ? null : parent.resolveEntity(publicId,
               systemId);
-        } catch (MalformedURLException malu) {
+        } catch (InterruptedException e) {
+          throw new IOException(e);
+        } catch (URISyntaxException e) {
           return (parent == null) ? null : parent.resolveEntity(publicId,
               systemId);
         }
@@ -300,92 +313,90 @@ else return (parent == null) ? null : parent.resolveEntity(publicId,
    *          object with the new modification date. Supply null
    *          for no checking of last modification, a last modification date is
    *          then not returned back (as there is no reference).
+   * @throws InterruptedException
    * @see #getEntityResolver
    */
-  protected InputSource getInputSource(URL url,
-      AtomicReference<Instant> checkModifiedDate) throws IOException {
-    String proto = url.getProtocol().toLowerCase(Locale.ROOT);
+  protected InputSource getInputSource(URI url,
+      AtomicReference<Instant> checkModifiedDate) throws IOException, InterruptedException {
+    String proto = url.getScheme().toLowerCase(Locale.ROOT);
     if (!("http".equals(proto) || "https".equals(proto))) throw new IllegalArgumentException(
         "OAI only allows HTTP(S) as network protocol!");
-    HttpURLConnection conn = (HttpURLConnection) url.openConnection();
-    conn.setConnectTimeout(timeout * 1000);
-    conn.setReadTimeout(timeout * 1000);
-    conn.setRequestProperty("User-Agent", USER_AGENT);
+    final var reqBuilder = HttpRequest.newBuilder(url).GET()
+        .timeout(timeout)
+        .setHeader("User-Agent", USER_AGENT)
+        .setHeader("Accept-Charset", StandardCharsets.UTF_8.name() + ", *;q=0.1")
+        .setHeader("Accept", "text/xml, application/xml, *;q=0.1");
+    HttpClientUtils.sendCompressionHeaders(reqBuilder);
     if (authorizationHeader != null) {
-      conn.setRequestProperty("Authorization", authorizationHeader);
+      reqBuilder.header("Authorization", authorizationHeader);
     }
-    
-    conn.setRequestProperty("Accept-Encoding",
-        "gzip, deflate, identity;q=0.3, *;q=0");
-    conn.setRequestProperty("Accept-Charset", StandardCharsets.UTF_8.name() + ", *;q=0.1");
-    conn.setRequestProperty("Accept", "text/xml, application/xml, *;q=0.1");
-    
     if (checkModifiedDate != null && checkModifiedDate.get() != null) {
-      conn.setIfModifiedSince(checkModifiedDate.get().toEpochMilli());
+      reqBuilder.setHeader("If-Modified-Since", DateTimeFormatter.RFC_1123_DATE_TIME.format(checkModifiedDate.get().atOffset(ZoneOffset.UTC)));
     }
-    
-    conn.setUseCaches(false);
-    conn.setInstanceFollowRedirects(true);
     
     log.debug("Opening connection...");
-    InputStream in = null;
+    final HttpResponse<InputStream> resp;
     try {
-      conn.connect();
-      in = conn.getInputStream();
+      resp = httpClient.send(reqBuilder.build(), BodyHandlers.ofInputStream());
     } catch (IOException ioe) {
-      int after, code;
-      try {
-        after = conn.getHeaderFieldInt("Retry-After", -1);
-        code = conn.getResponseCode();
-      } catch (IOException ioe2) {
-        after = -1;
-        code = -1;
-      }
-      if (code == HttpURLConnection.HTTP_UNAVAILABLE && after > 0) throw new RetryAfterIOException(
-          after, ioe);
-      throw ioe;
+      throw new RetryAfterIOException(retryTime, ioe);
     }
-    
-    if (checkModifiedDate != null) {
-      if (conn.getResponseCode() == HttpURLConnection.HTTP_NOT_MODIFIED) {
-        log.debug("File not modified since " + checkModifiedDate.get());
-        if (in != null) in.close();
-        return null;
+    boolean success = false;
+    try {
+      final int statusCode = resp.statusCode();
+      switch (statusCode) {
+        case HttpURLConnection.HTTP_UNAVAILABLE:
+          var retryAfter = resp.headers().firstValue("Retry-After").map(Integer::parseInt);
+          if (retryAfter.isPresent()) {
+            throw new RetryAfterIOException(retryAfter.get(),
+                "OAI server returned '503 Service Unavailable', repeating after " + retryAfter.get() + "s.");
+          }
+          throw new RetryAfterIOException(retryTime,
+              "OAI server returned '503 Service Unavailable' without 'Retry-After' header, repeating after " + retryTime + "s.");
+        case HttpURLConnection.HTTP_NOT_MODIFIED:
+          if (checkModifiedDate != null) {
+            log.debug("File not modified since " + checkModifiedDate.get());
+            return null;
+          }
+          throw new IOException("OAI service returned '304 Not Modified', although no 'If-Modified-Since' header was sent");
+        case HttpURLConnection.HTTP_OK:
+          break;
+        default:
+          if (statusCode >= 500) {
+            throw new RetryAfterIOException(retryTime, "OAI server returned error code, repeating after " + retryTime + "s: " + statusCode);
+          }
+          throw new IOException("OAI service returned invalid status code: " + statusCode);
       }
-      long d = conn.getLastModified();
-      checkModifiedDate.set((d == 0L) ? null : Instant.ofEpochMilli(d));
-    }
-    
-    String encoding = conn.getContentEncoding();
-    if (encoding == null) encoding = "identity";
-    encoding = encoding.toLowerCase(Locale.ROOT);
-    log.debug("HTTP server uses " + encoding + " content encoding.");
-    if ("gzip".equals(encoding)) in = new GZIPInputStream(in);
-    else if ("deflate".equals(encoding)) in = new InflaterInputStream(in);
-    else if (!"identity".equals(encoding)) throw new IOException(
-        "Server uses an invalid content encoding: " + encoding);
-    
-    // get charset from content-type to fill into InputSource to prevent
-    // SAXParser from guessing it
-    // if charset is superseded by declaration, it is changed later by
-    // parser
-    String contentType = conn.getContentType();
-    String charset = null;
-    if (contentType != null) {
-      contentType = contentType.toLowerCase(Locale.ROOT);
-      int charsetStart = contentType.indexOf("charset=");
-      if (charsetStart >= 0) {
-        int charsetEnd = contentType.indexOf(";", charsetStart);
-        if (charsetEnd == -1) charsetEnd = contentType.length();
-        charsetStart += "charset=".length();
-        charset = contentType.substring(charsetStart, charsetEnd).trim();
+      
+      if (checkModifiedDate != null) {
+        var d = resp.headers().firstValue("Last-Modified").map(DateTimeFormatter.RFC_1123_DATE_TIME::parse).map(Instant::from).orElse(null);
+        checkModifiedDate.set(d);
       }
+      
+      // get charset from content-type to fill into InputSource to prevent
+      // SAXParser from guessing it
+      // if charset is superseded by declaration, it is changed later by
+      // parser
+      final String charset = resp.headers().firstValue("Content-Type").map(contentType -> {
+        contentType = contentType.toLowerCase(Locale.ROOT);
+        int charsetStart = contentType.indexOf("charset=");
+        if (charsetStart >= 0) {
+          int charsetEnd = contentType.indexOf(";", charsetStart);
+          if (charsetEnd == -1) charsetEnd = contentType.length();
+          charsetStart += "charset=".length();
+          return contentType.substring(charsetStart, charsetEnd).trim();
+        }
+        return null;
+      }).orElse(null);
+      log.debug("Charset from Content-Type: '" + charset + "'");
+      
+      final InputSource src = new InputSource(HttpClientUtils.getDecompressingInputStream(resp));
+      src.setSystemId(url.toString());
+      src.setEncoding(charset);
+      success = true;
+      return src;
+    } finally {
+      if (!success) resp.body().close();
     }
-    log.debug("Charset from Content-Type: '" + charset + "'");
-    
-    InputSource src = new InputSource(in);
-    src.setSystemId(url.toString());
-    src.setEncoding(charset);
-    return src;
   }
   
   /** Resets the internal variables. */
@@ -420,7 +431,6 @@ public void close(boolean cleanShutdown) throws Exception {
       setValidIdentifiers(validIdentifiersBuilder.build());
     }
     reset();
-    SimpleCookieHandler.INSTANCE.disable();
     super.close(cleanShutdown);
   }
diff --git a/src/de/pangaea/metadataportal/harvester/RetryAfterIOException.java b/src/de/pangaea/metadataportal/harvester/RetryAfterIOException.java
index ae22af0d..1fa5d6e8 100644
--- a/src/de/pangaea/metadataportal/harvester/RetryAfterIOException.java
+++ b/src/de/pangaea/metadataportal/harvester/RetryAfterIOException.java
@@ -26,8 +26,16 @@
 public class RetryAfterIOException extends java.io.IOException {
   
   public RetryAfterIOException(int retryAfter, java.io.IOException ioe) {
-    super();
+    this(retryAfter, "HTTP request delayed by " + retryAfter + " seconds: " + ioe.getMessage(), ioe);
+  }
+  
+  public RetryAfterIOException(int retryAfter, String message, java.io.IOException ioe) {
+    this(retryAfter, message);
     initCause(ioe);
+  }
+  
+  public RetryAfterIOException(int retryAfter, String message) {
+    super(message);
     this.retryAfter = retryAfter;
   }
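
The widened constructor set makes RetryAfterIOException carry a human-readable message plus the delay that drives the harvesters' back-off loops; the wrapped cause is now optional. A sketch of the contract as doParse()/processURL() use it; fetchOnce() is a hypothetical stand-in for a single HTTP attempt:

    // retry loop sketch: only RetryAfterIOException is retried, anything else aborts
    java.io.InputStream fetchWithRetry(int retryCount) throws Exception {
      for (int retry = 0; retry <= retryCount; retry++) {
        try {
          return fetchOnce(); // hypothetical helper that may throw RetryAfterIOException
        } catch (RetryAfterIOException e) {
          if (retry >= retryCount) throw e; // attempts exhausted, give up
          Thread.sleep(1000L * e.getRetryAfter()); // delay from 'Retry-After' or configuration
        }
      }
      throw new java.io.IOException("unreachable");
    }
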
diff --git a/src/de/pangaea/metadataportal/harvester/WebCrawlingHarvester.java b/src/de/pangaea/metadataportal/harvester/WebCrawlingHarvester.java
index a2715fe3..8e060cd7 100644
--- a/src/de/pangaea/metadataportal/harvester/WebCrawlingHarvester.java
+++ b/src/de/pangaea/metadataportal/harvester/WebCrawlingHarvester.java
@@ -16,14 +16,21 @@
 package de.pangaea.metadataportal.harvester;
 
-import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
+import java.net.CookieManager;
+import java.net.CookiePolicy;
 import java.net.HttpURLConnection;
-import java.net.MalformedURLException;
-import java.net.URL;
+import java.net.URI;
+import java.net.http.HttpClient;
+import java.net.http.HttpClient.Redirect;
+import java.net.http.HttpRequest;
+import java.net.http.HttpResponse;
+import java.net.http.HttpResponse.BodyHandlers;
 import java.nio.charset.StandardCharsets;
+import java.time.Duration;
 import java.time.Instant;
+import java.time.format.DateTimeFormatter;
 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.Locale;
@@ -32,8 +39,8 @@
 import java.util.TreeSet;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
-import java.util.zip.GZIPInputStream;
-import java.util.zip.InflaterInputStream;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
 
 import javax.xml.transform.sax.SAXSource;
 
@@ -44,8 +51,7 @@
 import org.xml.sax.helpers.DefaultHandler;
 
 import de.pangaea.metadataportal.config.HarvesterConfig;
-import de.pangaea.metadataportal.processor.ElasticsearchConnection;
-import de.pangaea.metadataportal.utils.SimpleCookieHandler;
+import de.pangaea.metadataportal.utils.HttpClientUtils;
 import de.pangaea.metadataportal.utils.StaticFactories;
 
 /**
@@ -96,15 +102,23 @@ public class WebCrawlingHarvester extends SingleFileEntitiesHarvester {
   public static final Set<String> HTML_CONTENT_TYPES = new HashSet<>(
       Arrays.asList("text/html", "application/xhtml+xml"));
   
+  public static final String USER_AGENT = new StringBuilder("Java/")
+      .append(Runtime.version()).append(" (")
+      .append(de.pangaea.metadataportal.Package.getProductName())
+      .append('/').append(de.pangaea.metadataportal.Package.getVersion())
+      .append("; WebCrawlingHarvester)").toString();
+  
   // Class members
   private String baseURL;
   private final Pattern filenameFilter, excludeUrlPattern;
   private final Set<String> contentTypes = new HashSet<>();
   private final int retryCount;
   private final int retryTime;
-  private final int timeout;
+  private final Duration timeout;
+  private final String authorizationHeader;
   private final long pauseBetweenRequests;
-  
+  private final HttpClient httpClient;
+  
   private Set<String> harvested = new HashSet<>();
   private SortedSet<String> needsHarvest = new TreeSet<>();
   
@@ -116,8 +130,8 @@ public WebCrawlingHarvester(HarvesterConfig iconfig) throws Exception {
     String s = iconfig.properties.getProperty("baseUrl");
     if (s == null) throw new IllegalArgumentException(
         "Missing base URL to start harvesting (property \"baseUrl\")");
-    URL u = new URL(s);
-    String proto = u.getProtocol().toLowerCase(Locale.ROOT);
+    URI u = new URI(s);
+    String proto = u.getScheme().toLowerCase(Locale.ROOT);
     if (!("http".equals(proto) || "https".equals(proto))) throw new IllegalArgumentException(
         "WebCrawlingHarvester only allows HTTP(S) as network protocol!");
     baseURL = u.toString();
@@ -130,7 +144,8 @@ public WebCrawlingHarvester(HarvesterConfig iconfig) throws Exception {
     retryCount = Integer.parseInt(iconfig.properties.getProperty("retryCount", Integer.toString(DEFAULT_RETRY_COUNT)));
     retryTime = Integer.parseInt(iconfig.properties.getProperty("retryAfterSeconds", Integer.toString(DEFAULT_RETRY_TIME)));
-    timeout = Integer.parseInt(iconfig.properties.getProperty("timeoutAfterSeconds", Integer.toString(DEFAULT_TIMEOUT)));
+    timeout = Duration.ofSeconds(Integer.parseInt(iconfig.properties.getProperty("timeoutAfterSeconds", Integer.toString(DEFAULT_TIMEOUT))));
+    authorizationHeader = iconfig.properties.getProperty("authorizationHeader");
     pauseBetweenRequests = Long.parseLong(iconfig.properties.getProperty("pauseBetweenRequests", "0"));
     
     s = iconfig.properties.getProperty("filenameFilter");
@@ -139,6 +154,12 @@ public WebCrawlingHarvester(HarvesterConfig iconfig) throws Exception {
     s = iconfig.properties.getProperty("excludeUrlPattern");
     excludeUrlPattern = (s == null) ? null : Pattern.compile(s);
     
+    httpClient = HttpClient.newBuilder()
+        .followRedirects(Redirect.NORMAL)
+        .connectTimeout(timeout)
+        .cookieHandler(new CookieManager(null, CookiePolicy.ACCEPT_ORIGINAL_SERVER))
+        .build();
+    
     // initialize and test for HTML SAX Parser
     try {
       htmlReaderClass = Class.forName(HTML_SAX_PARSER_CLASS).asSubclass(XMLReader.class);
@@ -147,18 +168,6 @@ public WebCrawlingHarvester(HarvesterConfig iconfig) throws Exception {
     }
   }
   
-  @Override
-  public void open(ElasticsearchConnection es, String targetIndex) throws Exception {
-    super.open(es, targetIndex);
-    SimpleCookieHandler.INSTANCE.enable();
-  }
-  
-  @Override
-  public void close(boolean cleanShutdown) throws Exception {
-    SimpleCookieHandler.INSTANCE.disable();
-    super.close(cleanShutdown);
-  }
-  
   @Override
   public void harvest() throws Exception {
     // process this URL directly and save possible redirect as new base
@@ -166,12 +175,12 @@ public void harvest() throws Exception {
     baseURL = ""; // disable base checking for the entry point to follow a
                   // initial redirect for sure
     harvested.add(urlStr);
-    URL newbaseURL = processURL(new URL(urlStr));
+    URI newbaseURL = processURL(new URI(urlStr));
     // get an URL that points to the current directory
     // from now on this is used as baseURL
     baseURL = ("".equals(newbaseURL.getPath())) ? newbaseURL.toString()
-        : new URL(newbaseURL, "./").toString();
+        : newbaseURL.resolve("./").toString();
     log.debug("URL directory which harvesting may not escape: " + baseURL);
     
     // remove invalid URLs from queued list (because until now we had no baseURL
@@ -188,7 +197,7 @@ public void harvest() throws Exception {
       urlStr = needsHarvest.first();
       needsHarvest.remove(urlStr);
       harvested.add(urlStr);
-      processURL(new URL(urlStr));
+      processURL(new URI(urlStr));
     }
   }
   
@@ -197,7 +206,8 @@ protected void enumerateValidHarvesterPropertyNames(Set<String> props) {
     super.enumerateValidHarvesterPropertyNames(props);
     props.addAll(Arrays.asList("baseUrl", "retryCount",
         "retryAfterSeconds", "timeoutAfterSeconds", "filenameFilter",
-        "contentTypes", "excludeUrlPattern", "pauseBetweenRequests"
+        "contentTypes", "excludeUrlPattern", "pauseBetweenRequests",
+        "authorizationHeader"
     ));
   }
   
@@ -218,75 +228,7 @@ void queueURL(String url) {
     needsHarvest.add(url);
   }
   
-  private InputStream sendHTTPRequest(HttpURLConnection conn, String method)
-      throws IOException {
-    try {
-      conn.setConnectTimeout(timeout * 1000);
-      conn.setReadTimeout(timeout * 1000);
-      conn.setRequestMethod(method);
-      
-      StringBuilder ua = new StringBuilder("Java/")
-          .append(Runtime.version()).append(" (")
-          .append(de.pangaea.metadataportal.Package.getProductName())
-          .append('/').append(de.pangaea.metadataportal.Package.getVersion())
-          .append("; WebCrawlingHarvester)");
-      conn.setRequestProperty("User-Agent", ua.toString());
-      
-      conn.setRequestProperty("Accept-Encoding",
-          "gzip, deflate, identity;q=0.3, *;q=0");
-      conn.setRequestProperty("Accept-Charset", StandardCharsets.UTF_8.name() + ", *;q=0.5");
-      
-      StringBuilder ac = new StringBuilder();
-      for (String c : contentTypes)
-        ac.append(c).append(", ");
-      for (String c : HTML_CONTENT_TYPES)
-        ac.append(c).append(", ");
-      ac.append("*;q=0.1");
-      conn.setRequestProperty("Accept", ac.toString());
-      
-      conn.setUseCaches(false);
-      conn.setInstanceFollowRedirects(true);
-      
-      log.debug("Opening connection...");
-      InputStream in = null;
-      try {
-        conn.connect();
-        in = conn.getInputStream();
-      } catch (IOException ioe) {
-        int after, code;
-        try {
-          after = conn.getHeaderFieldInt("Retry-After", -1);
-          code = conn.getResponseCode();
-        } catch (IOException ioe2) {
-          after = -1;
-          code = -1;
-        }
-        if (code == HttpURLConnection.HTTP_UNAVAILABLE && after > 0) throw new RetryAfterIOException(
-            after, ioe);
-        throw ioe;
-      }
-      
-      // cast stream if encoding different from identity
-      if (!"HEAD".equals(method)) {
-        String encoding = conn.getContentEncoding();
-        if (encoding == null) encoding = "identity";
-        encoding = encoding.toLowerCase(Locale.ROOT);
-        
-        log.debug("HTTP server uses " + encoding + " content encoding.");
-        if ("gzip".equals(encoding)) in = new GZIPInputStream(in);
-        else if ("deflate".equals(encoding)) in = new InflaterInputStream(in);
-        else if (!"identity".equals(encoding)) throw new IOException(
-            "Server uses an invalid content encoding: " + encoding);
-      }
-      
-      return in;
-    } catch (FileNotFoundException fnfe) {
-      log.warn("Cannot find URL '" + conn.getURL() + "'.");
-      return null;
-    }
-  }
-  
-  private void analyzeHTML(final URL baseURL, final InputSource source)
+  private void analyzeHTML(final URI baseURL, final InputSource source)
       throws Exception {
     XMLReader r = htmlReaderClass.getConstructor().newInstance();
     r.setFeature("http://xml.org/sax/features/namespaces", true);
@@ -298,7 +240,7 @@ private void analyzeHTML(final URI baseURL, final InputSource source)
     DefaultHandler handler = new DefaultHandler() {
-      private URL base = baseURL; // make it unfinal ;-)
+      private URI base = baseURL; // make it unfinal ;-)
       private int inBODY = 0;
       private int inFRAMESET = 0;
       private int inHEAD = 0;
@@ -317,8 +259,8 @@ public void startElement(String namespaceURI, String localName,
         if ("BASE".equals(localName)) {
           String newBase = atts.getValue("href");
           if (newBase != null) try {
-            base = new URL(base, newBase);
-          } catch (MalformedURLException mue) {
+            base = base.resolve(newBase);
+          } catch (IllegalArgumentException mue) {
             // special exception to stop processing
             log.debug("Found invalid BASE-URL: " + url);
             throw new SAXException("#panFMP#HTML_INVALID_BASE");
@@ -340,8 +282,8 @@ public void startElement(String namespaceURI, String localName,
         }
         // append a possible url to queue
         if (url != null) try {
-          queueURL(new URL(base, url).toString());
-        } catch (MalformedURLException mue) {
+          queueURL(base.resolve(url).toString());
+        } catch (IllegalArgumentException mue) {
           // there may be javascript:-URLs in the document or something other
           // we will not throw errors!
           log.debug("Found invalid URL: " + url);
@@ -373,7 +315,7 @@ public void endElement(String namespaceURI, String localName, String qName)
     }
   }
   
-  private boolean acceptFile(URL url) {
+  private boolean acceptFile(URI url) {
     if (filenameFilter == null) return true;
     String name = url.getPath();
     int p = name.lastIndexOf('/');
@@ -383,113 +325,135 @@
   }
   
   @SuppressWarnings("resource")
-  private URL processURL(URL url) throws Exception {
+  private URI processURL(URI uri) throws Exception {
     for (int retry = 0; retry <= retryCount; retry++) {
-      log.info("Requesting props of '" + url + "'...");
+      log.info("Requesting props of '" + uri + "'...");
+      var proto = uri.getScheme().toLowerCase(Locale.ROOT);
+      if (!("http".equals(proto) || "https".equals(proto))) throw new IllegalArgumentException(
+          "WebCrawlingHarvester only allows HTTP(S) as network protocol!");
+      final var reqBuilder = HttpRequest.newBuilder(uri).GET()
+          .timeout(timeout)
+          .setHeader("User-Agent", USER_AGENT)
+          .setHeader("Accept-Charset", StandardCharsets.UTF_8.name() + ", *;q=0.5")
+          .setHeader("Accept", Stream.of(contentTypes, HTML_CONTENT_TYPES, Set.of("*;q=0.1"))
+              .flatMap(Set::stream).distinct().collect(Collectors.joining(", ")));
+      HttpClientUtils.sendCompressionHeaders(reqBuilder);
+      if (authorizationHeader != null) {
+        reqBuilder.header("Authorization", authorizationHeader);
+      }
+      
+      log.debug("Opening connection...");
       try {
-        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
-        InputStream in = sendHTTPRequest(conn, "HEAD");
-        if (in == null) return url;
-        in.close(); // it is empty
-        
-        // check connection properties
-        String contentType = conn.getContentType();
-        String charset = null;
-        if (contentType != null) {
-          contentType = contentType.toLowerCase(Locale.ROOT);
-          int charsetStart = contentType.indexOf("charset=");
-          if (charsetStart >= 0) {
-            int charsetEnd = contentType.indexOf(";", charsetStart);
-            if (charsetEnd == -1) charsetEnd = contentType.length();
-            charsetStart += "charset=".length();
-            charset = contentType.substring(charsetStart, charsetEnd).trim();
-          }
-          int contentEnd = contentType.indexOf(';');
-          if (contentEnd >= 0) contentType = contentType.substring(0,
-              contentEnd);
-          contentType = contentType.trim();
-        }
-        log.debug("Charset from Content-Type: '" + charset
-            + "'; Type from Content-Type: '" + contentType + "'");
-        if (contentType == null) {
-          log.warn("Connection to URL '" + url
-              + "' did not return a content-type, skipping.");
-          return url;
+        final HttpResponse<InputStream> resp;
+        try {
+          resp = httpClient.send(reqBuilder.build(), BodyHandlers.ofInputStream());
+        } catch (IOException ioe) {
+          throw new RetryAfterIOException(retryTime, ioe);
         }
-        
-        // if we got a redirect the new URL is now needed
-        URL newurl = conn.getURL();
-        if (!url.toString().equals(newurl.toString())) {
-          log.debug("Got redirect to: " + newurl);
-          url = newurl;
-          // check if it is below base
-          if (!url.toString().startsWith(baseURL)) return url;
-          // was it already harvested?
-          if (harvested.contains(url.toString())) return url;
-          // clean this new url from lists
-          needsHarvest.remove(url.toString());
-          harvested.add(url.toString());
+        final int statusCode = resp.statusCode();
+        switch (statusCode) {
+          case HttpURLConnection.HTTP_UNAVAILABLE:
+            var retryAfter = resp.headers().firstValue("Retry-After").map(Integer::parseInt);
+            if (retryAfter.isPresent()) {
+              throw new RetryAfterIOException(retryAfter.get(),
+                  "Webserver returned '503 Service Unavailable', repeating after " + retryAfter.get() + "s.");
+            }
+            throw new RetryAfterIOException(retryTime,
+                "Webserver returned '503 Service Unavailable' without 'Retry-After' header, repeating after " + retryTime + "s.");
+          case HttpURLConnection.HTTP_OK:
+            break;
+          case HttpURLConnection.HTTP_NOT_FOUND:
+          case HttpURLConnection.HTTP_GONE:
+            log.warn("Cannot find URL '" + resp.uri() + "'.");
+            return uri;
+          default:
+            if (statusCode >= 500) {
+              throw new RetryAfterIOException(retryTime, "Webserver returned error code, repeating after " + retryTime + "s: " + statusCode);
+            }
+            throw new IOException("Webserver returned invalid status code: " + statusCode);
         }
         
-        if (HTML_CONTENT_TYPES.contains(contentType)) {
-          log.info("Analyzing HTML links in '" + url + "'...");
+        try (final InputStream in = HttpClientUtils.getDecompressingInputStream(resp)) {
+          // check connection properties
+          String contentType = resp.headers().firstValue("Content-Type").orElse(null);
+          String charset = null;
+          if (contentType != null) {
+            contentType = contentType.toLowerCase(Locale.ROOT);
+            int charsetStart = contentType.indexOf("charset=");
+            if (charsetStart >= 0) {
+              int charsetEnd = contentType.indexOf(";", charsetStart);
+              if (charsetEnd == -1) charsetEnd = contentType.length();
+              charsetStart += "charset=".length();
+              charset = contentType.substring(charsetStart, charsetEnd).trim();
+            }
+            int contentEnd = contentType.indexOf(';');
+            if (contentEnd >= 0) contentType = contentType.substring(0,
+                contentEnd);
+            contentType = contentType.trim();
+          }
+          log.debug("Charset from Content-Type: '" + charset
+              + "'; Type from Content-Type: '" + contentType + "'");
+          if (contentType == null) {
+            log.warn("Connection to URL '" + uri
+                + "' did not return a content-type, skipping.");
+            return uri;
+          }
+          
+          // if we got a redirect the new URL is now needed
+          URI newurl = resp.uri();
+          if (!uri.toString().equals(newurl.toString())) {
+            log.debug("Got redirect to: " + newurl);
+            uri = newurl;
+            // check if it is below base
+            if (!uri.toString().startsWith(baseURL)) return uri;
+            // was it already harvested?
+            if (harvested.contains(uri.toString())) return uri;
+            // clean this new url from lists
+            needsHarvest.remove(uri.toString());
+            harvested.add(uri.toString());
+          }
           
-          // reopen for GET
-          conn = (HttpURLConnection) url.openConnection();
-          in = sendHTTPRequest(conn, "GET");
-          if (in != null) try {
-            InputSource src = new InputSource(in);
-            src.setSystemId(url.toString());
-            src.setEncoding(charset);
-            analyzeHTML(url, src);
-          } finally {
-            in.close();
-          }
-        } else if (contentTypes.contains(contentType)) {
-          if (acceptFile(url)) {
-            long lastModified = conn.getLastModified();
-            if (isDocumentOutdated(lastModified == 0L ? null : Instant.ofEpochMilli(lastModified))) {
-              log.info("Harvesting '" + url + "'...");
-              
-              // reopen for GET and parse as XML
-              conn = (HttpURLConnection) url.openConnection();
-              in = sendHTTPRequest(conn, "GET");
-              if (in != null) try {
-                InputSource src = new InputSource(in);
-                src.setSystemId(url.toString());
-                src.setEncoding(charset);
-                SAXSource saxsrc = new SAXSource(StaticFactories.saxFactory
-                    .newSAXParser().getXMLReader(), src);
-                addDocument(url.toString(), lastModified, saxsrc);
-              } finally {
-                in.close();
-              }
-            } else {
-              // add this empty doc here, to update datestamps for next
-              // harvesting
-              addDocument(url.toString(), lastModified, null);
-            }
-          }
+          if (HTML_CONTENT_TYPES.contains(contentType)) {
+            log.info("Analyzing HTML links in '" + uri + "'...");
+            
+            final InputSource src = new InputSource(in);
+            src.setSystemId(uri.toString());
+            src.setEncoding(charset);
+            analyzeHTML(uri, src);
+          } else if (contentTypes.contains(contentType)) {
+            if (acceptFile(uri)) {
+              var lastModified = resp.headers().firstValue("Last-Modified").map(DateTimeFormatter.RFC_1123_DATE_TIME::parse).map(Instant::from).orElse(null);
+              if (isDocumentOutdated(lastModified)) {
+                log.info("Harvesting '" + uri + "'...");
+                
+                final InputSource src = new InputSource(in);
+                src.setSystemId(uri.toString());
+                src.setEncoding(charset);
+                final SAXSource saxsrc = new SAXSource(StaticFactories.saxFactory
+                    .newSAXParser().getXMLReader(), src);
+                addDocument(uri.toString(), lastModified, saxsrc);
+              } else {
+                // add this empty doc here, to update datestamps for next
+                // harvesting
+                addDocument(uri.toString(), lastModified, null);
+              }
+            }
+          }
+          return uri;
         }
-        return url;
-      } catch (IOException ioe) {
-        int after = retryTime;
-        if (ioe instanceof RetryAfterIOException) {
-          if (retry >= retryCount) throw (IOException) ioe.getCause();
-          log.warn("HTTP server returned '503 Service Unavailable' with a 'Retry-After' value being set.");
-          after = ((RetryAfterIOException) ioe).getRetryAfter();
-        } else {
-          if (retry >= retryCount) throw ioe;
-          log.error("HTTP server access failed with exception: ", ioe);
-        }
+      } catch (RetryAfterIOException ioe) {
+        if (retry >= retryCount) {
+          // the wrapped cause may be null (message-only constructor); rethrow the exception itself then
+          final Throwable cause = ioe.getCause();
+          throw (cause instanceof IOException) ? (IOException) cause : ioe;
+        }
+        log.warn(ioe.getMessage());
+        final int after = ioe.getRetryAfter();
         log.info("Retrying after " + after + " seconds ("
             + (retryCount - retry) + " retries left)...");
         try {
          Thread.sleep(1000L * after);
         } catch (InterruptedException ie) {}
+        log.debug("Recreating digester instances to recover from incomplete parsers...");
       }
     }
     throw new IOException("Unable to properly connect HTTP server.");
   }
-}
\ No newline at end of file
+}
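
Both harvesters now read and write HTTP date headers through java.time instead of URLConnection's epoch-millisecond accessors. A standalone sketch of the two conversions; the header value is an example:

    import java.time.Instant;
    import java.time.ZoneOffset;
    import java.time.format.DateTimeFormatter;

    String header = "Wed, 21 Oct 2015 07:28:00 GMT"; // example Last-Modified value
    // parsing, as used for Last-Modified:
    Instant lastModified = Instant.from(DateTimeFormatter.RFC_1123_DATE_TIME.parse(header));
    // formatting, as used for If-Modified-Since (RFC 1123 requires an offset, hence UTC):
    String ifModifiedSince = DateTimeFormatter.RFC_1123_DATE_TIME.format(lastModified.atOffset(ZoneOffset.UTC));
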
diff --git a/src/de/pangaea/metadataportal/harvester/ZipFileHarvester.java b/src/de/pangaea/metadataportal/harvester/ZipFileHarvester.java
index ae08e374..99f2c730 100644
--- a/src/de/pangaea/metadataportal/harvester/ZipFileHarvester.java
+++ b/src/de/pangaea/metadataportal/harvester/ZipFileHarvester.java
@@ -79,6 +79,13 @@ public class ZipFileHarvester extends SingleFileEntitiesHarvester {
   public static final int DEFAULT_RETRY_COUNT = 5;
   public static final int DEFAULT_TIMEOUT = 180; // seconds
   
+  public static final String USER_AGENT = new StringBuilder("Java/")
+      .append(Runtime.version()).append(" (")
+      .append(de.pangaea.metadataportal.Package.getProductName())
+      .append('/')
+      .append(de.pangaea.metadataportal.Package.getVersion())
+      .append("; ZipFileHarvester)").toString();
+  
   /** the retryCount from configuration */
   protected final int retryCount;
   /** the retryTime from configuration */
@@ -168,13 +175,7 @@ private InputStream openStream() throws IOException {
     conn.setReadTimeout(timeout * 1000);
     
     if (conn instanceof HttpURLConnection) {
-      StringBuilder ua = new StringBuilder("Java/")
-          .append(Runtime.version()).append(" (")
-          .append(de.pangaea.metadataportal.Package.getProductName())
-          .append('/')
-          .append(de.pangaea.metadataportal.Package.getVersion())
-          .append("; ZipFileHarvester)");
-      conn.setRequestProperty("User-Agent", ua.toString());
+      conn.setRequestProperty("User-Agent", USER_AGENT);
       
       conn.setRequestProperty("Accept-Encoding", "identity, *;q=0");
       conn.setRequestProperty("Accept", "application/zip, *;q=0.1");
diff --git a/src/de/pangaea/metadataportal/utils/HttpClientUtils.java b/src/de/pangaea/metadataportal/utils/HttpClientUtils.java
new file mode 100644
index 00000000..ebbe4c63
--- /dev/null
+++ b/src/de/pangaea/metadataportal/utils/HttpClientUtils.java
@@ -0,0 +1,56 @@
+/*
+ * Copyright panFMP Developers Team c/o Uwe Schindler
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package de.pangaea.metadataportal.utils;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.http.HttpRequest;
+import java.net.http.HttpResponse;
+import java.util.Locale;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.InflaterInputStream;
+
+/**
+ * Some utility methods for decompressing {@link HttpResponse}
+ * 
+ * @author Uwe Schindler
+ */
+public final class HttpClientUtils {
+  private static final org.apache.commons.logging.Log log = org.apache.commons.logging.LogFactory
+      .getLog(HttpClientUtils.class);
+  
+  private HttpClientUtils() {}
+  
+  /** Returns an InputStream which decodes with header "Content-Encoding" */
+  public static InputStream getDecompressingInputStream(final HttpResponse<InputStream> resp) throws IOException {
+    final String encoding = resp.headers().firstValue("Content-Encoding").orElse("identity").toLowerCase(Locale.ROOT).trim();
+    log.debug("HTTP server uses " + encoding + " content encoding.");
+    switch (encoding) {
+      case "gzip": return new GZIPInputStream(resp.body());
+      case "deflate": return new InflaterInputStream(resp.body());
+      case "identity": return resp.body();
+    }
+    throw new IOException("Server uses an invalid content encoding: " + encoding);
+  }
+  
+  /** Sends "Accept-Encoding" header to ask server to compress result.
+   * The response can later be parsed with {@link #getDecompressingInputStream(HttpResponse)} */
+  public static void sendCompressionHeaders(final HttpRequest.Builder builder) {
+    builder.setHeader("Accept-Encoding", "gzip, deflate, identity;q=0.3, *;q=0");
+  }
+  
+}
\ No newline at end of file
diff --git a/src/de/pangaea/metadataportal/utils/SimpleCookieHandler.java b/src/de/pangaea/metadataportal/utils/SimpleCookieHandler.java
deleted file mode 100644
index 913efbf4..00000000
--- a/src/de/pangaea/metadataportal/utils/SimpleCookieHandler.java
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright panFMP Developers Team c/o Uwe Schindler
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package de.pangaea.metadataportal.utils;
-
-import java.io.IOException;
-import java.net.CookieHandler;
-import java.net.CookieManager;
-import java.net.CookiePolicy;
-import java.net.URI;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-
-import de.pangaea.metadataportal.harvester.Harvester;
-
-/**
- * A CookieHandler that can be enabled and used per thread.
- * @author Uwe Schindler
- */
-public final class SimpleCookieHandler extends CookieHandler {
-  
-  static final org.apache.commons.logging.Log log = org.apache.commons.logging.LogFactory.getLog(SimpleCookieHandler.class);
-  
-  /**
-   * Singleton instance of this class. Should be set with
-   * {@link CookieHandler#setDefault} as default.
-   */
-  public static final SimpleCookieHandler INSTANCE = new SimpleCookieHandler();
-  
-  private SimpleCookieHandler() {}
-  
-  private final ThreadLocal<CookieManager> manager = new ThreadLocal<CookieManager>() {
-    @Override
-    protected CookieManager initialValue() {
-      return new CookieManager(null, CookiePolicy.ACCEPT_ORIGINAL_SERVER);
-    }
-  };
-  
-  private final ThreadLocal<Boolean> enabled = new ThreadLocal<Boolean>() {
-    @Override
-    protected Boolean initialValue() {
-      return Boolean.FALSE;
-    }
-  };
-  
-  /**
-   * Resets all recorded cookies for the current thread. This method is called
-   * from {@link Harvester#open} to have an empty cookie list.
-   */
-  public void enable() {
-    manager.remove();
-    enabled.set(true);
-  }
-  
-  /**
-   * Cleans up the cookie list and disables the handler.
-   */
-  public void disable() {
-    enabled.set(false);
-    manager.remove();
-  }
-  
-  @Override
-  public void put(URI uri, Map<String,List<String>> responseHeaders) throws IOException {
-    if (enabled.get().booleanValue()) {
-      manager.get().put(uri, responseHeaders);
-    }
-  }
-  
-  @Override
-  public Map<String,List<String>> get(URI uri, Map<String,List<String>> requestHeaders) throws IOException {
-    if (enabled.get().booleanValue()) {
-      return manager.get().get(uri, requestHeaders);
-    } else {
-      return Collections.emptyMap();
-    }
-  }
-  
-}
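
Putting the pieces together, a fetch through the new helper class looks like this minimal sketch; the demo class name, URL, and timeout values are hypothetical, and error/retry handling is omitted:

    import java.io.InputStream;
    import java.net.URI;
    import java.net.http.HttpClient;
    import java.net.http.HttpRequest;
    import java.net.http.HttpResponse;
    import java.net.http.HttpResponse.BodyHandlers;
    import java.time.Duration;

    import de.pangaea.metadataportal.utils.HttpClientUtils;

    public class FetchExample { // hypothetical demo class
      public static void main(String[] args) throws Exception {
        HttpClient client = HttpClient.newBuilder()
            .connectTimeout(Duration.ofSeconds(30)) // hypothetical timeout
            .build();
        HttpRequest.Builder b = HttpRequest.newBuilder(URI.create("https://example.org/oai")) // hypothetical URL
            .GET().timeout(Duration.ofSeconds(30));
        HttpClientUtils.sendCompressionHeaders(b); // asks for gzip/deflate/identity
        HttpResponse<InputStream> resp = client.send(b.build(), BodyHandlers.ofInputStream());
        try (InputStream in = HttpClientUtils.getDecompressingInputStream(resp)) {
          System.out.println(in.readAllBytes().length + " bytes after decompression");
        }
      }
    }
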