# crawler-conf.yaml

# Custom configuration for StormCrawler
# This is used to override the default values from crawler-default.yaml and provide additional ones
# for your custom components.
# Use this file with the parameter -conf when launching your extension of ConfigurableTopology.
# This file does not contain all the key values but only the most frequently used ones. See crawler-default.yaml for an extensive list.

config: 
  # Storm topology-level settings
  topology.workers: 1
  # per the key name this timeout is expressed in seconds (300 s = 5 minutes)
  topology.message.timeout.secs: 300
  topology.max.spout.pending: 5000
  topology.debug: false

  # NOTE(review): presumably the number of threads used by the fetcher —
  # confirm against StormCrawler's default configuration
  fetcher.threads.number: 50
  
  # override the JVM parameters for the workers:
  # -Xmx10g sets the maximum heap size to 10 GB,
  # -Djava.net.preferIPv4Stack=true makes the JVM prefer the IPv4 stack
  topology.worker.childopts: "-Xmx10g -Djava.net.preferIPv4Stack=true"

  # mandatory when using Flux
  # classes registered with Kryo: the crawler's Metadata and Status types
  topology.kryo.register:
    - com.digitalpebble.stormcrawler.Metadata
    - com.digitalpebble.stormcrawler.persistence.Status

  # Lists the metadata to transfer to outlinks
  # Used by Fetcher and SiteMapParser for redirections,
  # discovered links, passing cookies to child pages, etc.
  # These are also persisted for the parent document (see below).
  # Allows wildcards, eg. "follow.*" transfers all metadata starting with "follow.".
  # metadata.transfer:
  # - customMetadataName

  # Metadata keys persisted to storage.
  # Unlike metadata.transfer, these are not passed on to the outlinks.
  # Wildcards are allowed as well, eg. "follow.*".
  metadata.persist:
    - _redirTo
    - error.cause
    - error.source
    - isSitemap
    - isFeed

  # Agent name info - given here as an example. Do not be an anonymous coward, use your real information!
  # The full user agent value sent as part of the HTTP requests
  # is built from the elements below. Only the agent.name is mandatory,
  # it is also used to parse the robots.txt directives. 

  # The agent name must be compliant with RFC 9309 (section 2.2.1) 
  # i.e. it MUST contain only uppercase and lowercase letters ("a-z" and "A-Z"), underscores ("_"), and hyphens ("-")
  http.agent.name: "digitalpebble-crawler"
  # version of your crawler
  http.agent.version: "1"
  # short description of what the crawler does
  http.agent.description: "none"
  # URL webmasters can visit to learn about the crawler
  http.agent.url: "http://digitalpebble.com"
  # finally, a contact email so that webmasters can get in touch with you
  http.agent.email: "contact@digitalpebble.com"

  # Protocol implementation classes for http and https;
  # both are set to the same OkHttp-based HttpProtocol class.
  http.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol"
  https.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol"

  # The maximum number of bytes for returned HTTP response bodies.
  # A positive value trims the fetched page to that size (e.g. 65536 for 64KB).
  # Set to -1, as done here, to disable the limit.
  http.content.limit: -1

  # FetcherBolt queue dump => uncomment to activate
  # if a file exists on the worker machine with the corresponding port number
  # the FetcherBolt will log the content of its internal queues to the logs
  # fetcherbolt.queue.debug.filepath: "/tmp/fetcher-dump-{port}"

  # configuration files for the parse filters, the URL filters and the JSoup filters
  parsefilters.config.file: "parsefilters.json"
  urlfilters.config.file: "urlfilters.json"
  jsoup.filters.config.file: "jsoupfilters.json"

  # default revisit interval (value in minutes), e.g. 1440 to revisit a page daily
  # set to -1, as done here, to never refetch a page
  fetchInterval.default: -1

  # revisit a page with a fetch error after 2 hours (value in minutes)
  # set it to -1 to never refetch a page
  fetchInterval.fetch.error: 120

  # never revisit a page with an error (or set a value in minutes)
  fetchInterval.error: -1

  # switch to true when no text needs to be extracted by JSoup
  textextractor.no.text: false

  # JSoupParserBolt text extraction: patterns to include
  textextractor.include.pattern:
    - DIV[id="maincontent"]
    - DIV[itemprop="articleBody"]
    - ARTICLE

  # JSoupParserBolt text extraction: tags to exclude
  textextractor.exclude.tags:
    - STYLE
    - SCRIPT

  # required for parsing with Tika
  jsoup.treat.non.html.as.error: false

  # document types that may be parsed with Tika (mimetype patterns)
  parser.mimetype.whitelist:
    - application/.+word.*
    - application/.+excel.*
    - application/.+powerpoint.*
    - application/.*pdf.*

  # configuration file for the Tika parser
  parse.tika.config.file: "tika-config.xml"

  # custom fetch interval to be used when a document has the key/value in its metadata
  # and has been fetched successfully (value in minutes)
  # fetchInterval.FETCH_ERROR.isFeed=true: 30
  # fetchInterval.isFeed=true: 10

  # configuration for the classes extending AbstractIndexerBolt
  # indexer.md.filter: "someKey=aValue"
  # field names used for the document URL and its text content
  indexer.url.fieldname: "url"
  indexer.text.fieldname: "content"
  # NOTE(review): presumably the metadata key holding the canonical URL —
  # confirm against StormCrawler's default configuration
  indexer.canonical.name: "canonical"
  # Mapping of metadata keys to fields for indexing.
  #
  # Each entry is either "key" or "key=alias". Without an alias the key
  # itself is used as the field name: below, _domain_ and _format_ are
  # indexed under their own names, whereas _parse.title_ is indexed as _title_.
  # The index of the value to store from the values array can be given with
  # the _key[index]_ format, e.g. _parse.title[0]_ would try to get the first
  # value for the metadata _parse.title_ (which is the default anyway).
  # A glob (*) matches all the keys sharing a prefix, e.g. _parse.*_ would
  # index all the keys with _parse_ as a prefix; in that case neither an
  # alias with = nor an index can be specified.
  indexer.md.mapping:
    - parse.title=title
    - parse.keywords=keywords
    - parse.description=description
    - domain
    - format

  # Metrics consumers registered with the topology:
  # a single LoggingMetricsConsumer with a parallelism hint of 1
  topology.metrics.consumer.register:
    - class: "org.apache.storm.metric.LoggingMetricsConsumer"
      parallelism.hint: 1