-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawler-conf.yaml
143 lines (118 loc) · 5.38 KB
/
crawler-conf.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# Custom configuration for StormCrawler.
# This file overrides the default values from crawler-default.yaml and provides
# additional ones for your custom components.
# Pass this file with the -conf parameter when launching your extension of ConfigurableTopology.
# It does not list every key, only the most frequently used ones; see crawler-default.yaml for an extensive list.
config:
  topology.workers: 1
  topology.message.timeout.secs: 300
  topology.max.spout.pending: 5000
  topology.debug: false

  fetcher.threads.number: 50

  # override the JVM parameters for the workers
  topology.worker.childopts: "-Xmx10g -Djava.net.preferIPv4Stack=true"

  # mandatory when using Flux
  topology.kryo.register:
    - com.digitalpebble.stormcrawler.Metadata
    - com.digitalpebble.stormcrawler.persistence.Status

  # Lists the metadata to transfer to outlinks.
  # Used by Fetcher and SiteMapParser for redirections,
  # discovered links, passing cookies to child pages, etc.
  # These are also persisted for the parent document (see below).
  # Allows wildcards, e.g. "follow.*" transfers all metadata starting with "follow.".
  # metadata.transfer:
  #   - customMetadataName

  # Lists the metadata to persist to storage.
  # These are not transferred to the outlinks. Also allows wildcards, e.g. "follow.*".
  metadata.persist:
    - _redirTo
    - error.cause
    - error.source
    - isSitemap
    - isFeed

  # Agent name info - given here as an example. Do not be an anonymous coward, use your real information!
  # The full user agent value sent as part of the HTTP requests
  # is built from the elements below. Only the agent.name is mandatory;
  # it is also used to parse the robots.txt directives.

  # The agent name must be compliant with RFC 9309 (section 2.2.1),
  # i.e. it MUST contain only uppercase and lowercase letters ("a-z" and "A-Z"), underscores ("_"), and hyphens ("-")
  http.agent.name: "digitalpebble-crawler"
  # version of your crawler
  http.agent.version: "1"
  # description of what it does
  http.agent.description: "none"
  # URL webmasters can visit to learn about it
  http.agent.url: "http://digitalpebble.com"
  # Finally, an email so that they can get in touch with you
  http.agent.email: "contact@digitalpebble.com"

  http.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol"
  https.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol"

  # The maximum number of bytes for returned HTTP response bodies.
  # Set to -1 to disable the limit, as is done here; with a positive value
  # (e.g. 65536) the fetched page is trimmed to that many bytes.
  http.content.limit: -1

  # FetcherBolt queue dump => uncomment to activate.
  # If a file exists on the worker machine with the corresponding port number,
  # the FetcherBolt will log the content of its internal queues to the logs.
  # fetcherbolt.queue.debug.filepath: "/tmp/fetcher-dump-{port}"

  parsefilters.config.file: "parsefilters.json"
  urlfilters.config.file: "urlfilters.json"
  jsoup.filters.config.file: "jsoupfilters.json"

  # interval before revisiting a page (value in minutes), e.g. 1440 for daily;
  # -1, as set here, means a page is never refetched
  fetchInterval.default: -1

  # revisit a page with a fetch error after 2 hours (value in minutes)
  # set it to -1 to never refetch a page
  fetchInterval.fetch.error: 120

  # never revisit a page with an error (or set a value in minutes)
  fetchInterval.error: -1

  # set to true if you don't need any text to be extracted by JSoup
  textextractor.no.text: false

  # text extraction for JSoupParserBolt
  # (selectors are quoted because they contain YAML indicator characters)
  textextractor.include.pattern:
    - 'DIV[id="maincontent"]'
    - 'DIV[itemprop="articleBody"]'
    - ARTICLE

  textextractor.exclude.tags:
    - STYLE
    - SCRIPT

  # needed for parsing with Tika
  jsoup.treat.non.html.as.error: false

  # restricts the document types to be parsed with Tika
  parser.mimetype.whitelist:
    - application/.+word.*
    - application/.+excel.*
    - application/.+powerpoint.*
    - application/.*pdf.*

  # Tika parser configuration file
  parse.tika.config.file: "tika-config.xml"

  # custom fetch interval to be used when a document has the key/value in its metadata
  # and has been fetched successfully (value in minutes)
  # fetchInterval.FETCH_ERROR.isFeed=true: 30
  # fetchInterval.isFeed=true: 10

  # configuration for the classes extending AbstractIndexerBolt
  # indexer.md.filter: "someKey=aValue"
  indexer.url.fieldname: "url"
  indexer.text.fieldname: "content"
  indexer.canonical.name: "canonical"

  # How to convert metadata key values into fields for indexing
  #
  # If no alias is specified with =alias, the key value is used:
  # for instance below, _domain_ and _format_ will be used
  # as field names, whereas _title_ will be used for _parse.title_.
  # You can specify the index of the value to store from the values array
  # by using the _key[index]_ format, e.g. _parse.title[0]_ would try to
  # get the first value for the metadata _parse.title_ (which is the default anyway).
  # Finally, you can use a glob (*) to match all the keys, e.g. _parse.*_ would
  # index all the keys with _parse_ as a prefix. Note that in that case, you can't
  # specify an alias with =, nor can you specify an index.
  indexer.md.mapping:
    - parse.title=title
    - parse.keywords=keywords
    - parse.description=description
    - domain
    - format

  # Metrics consumers:
  topology.metrics.consumer.register:
    - class: "org.apache.storm.metric.LoggingMetricsConsumer"
      parallelism.hint: 1