# config.yaml.example
{%- set THREADS = env['THREADS'] %}
{%- set CREDENTIALS = env['CREDENTIALS'] %}
{%- set INTROSPECT_THREADS = env['INTROSPECT_THREADS'] %}
{%- set BQ_LOAD_THREADS = env['BQ_LOAD_THREADS'] -%}
{%- set EXTRACT_LOG_FILE = env['EXTRACT_LOG_FILE'] -%}
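# The Jinja2 'set' statements above pull their values from an 'env' mapping
# supplied at render time (assumption: populated from environment variables).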
spark:
  threads: {{THREADS}}
  format: "json"
  compression: "gzip"
  normalize_schema: "true"
  timestamp_format: "yyyy-MM-dd HH:mm:ss"
  log_level: INFO
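  # Everything under 'properties' below is presumably passed straight through as
  # Spark configuration (key/value pairs applied to the SparkSession).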
  properties:
    # spark.scheduler.mode: "fair"
    spark.ui.showConsoleProgress: "false"
    spark.default.parallelism: "{{THREADS}}"
    spark.eventLog.enabled: "false"
    spark.eventLog.dir: "spark_log"
    spark.task.maxFailures: "20"
    spark.hadoop.fs.gs.http.max.retry: "20"
    # 2023/05/24 - added cache size to hopefully fix "410 Gone" errors
    # https://github.com/GoogleCloudDataproc/hadoop-connectors/pull/264
    spark.hadoop.fs.gs.outputstream.upload.cache.size: "67108864"
    spark.driver.extraClassPath: "spark-hadoop-cloud_2.12-3.3.2.jar:mssql-jdbc-12.2.0.jre11.jar:gcs-connector-hadoop3-2.2.12-shaded.jar:spark-3.1-bigquery-0.28.0-preview.jar"
    spark.sql.session.timeZone: "America/Los_Angeles"
    spark.sql.jsonGenerator.ignoreNullFields: "false"
    spark.hadoop.google.cloud.auth.service.account.enable: "true"
    spark.hadoop.google.cloud.auth.service.account.json.keyfile: "{{CREDENTIALS}}"
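    # CREDENTIALS is expected to point to a GCP service-account JSON key file (used here by the GCS connector).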
    # NOTE: the manifest-committer settings below require Hadoop 3.3.5+ jars installed
    spark.hadoop.mapreduce.outputcommitter.factory.scheme.gcs: "org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterFactory"
    spark.hadoop.mapreduce.outputcommitter.factory.scheme.gs: "org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterFactory"
    spark.hadoop.mapreduce.manifest.committer.delete.target.files: "true"
    spark.sql.parquet.output.committer.class: "org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter"
    spark.sql.sources.commitProtocolClass: "org.apache.spark.internal.io.cloud.PathOutputCommitProtocol"
    spark.executor.memory: "8g"
    spark.driver.memory: "16g"
    spark.executor.defaultJavaOptions: "-XX:+UseG1GC"
    spark.driver.defaultJavaOptions: "-XX:+UseG1GC"
    spark.storage.memoryFraction: "0"
    spark.sql.debug.maxToStringFields: "500"
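# Connection placeholders below (USERNAME, PASSWORD, DBHOSTNAME/HOSTNAME, PORT, DATABASE)
# are assumed to be replaced with real SQL Server connection details before use.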
sqlalchemy:
  url: "mssql+pyodbc://USERNAME:PASSWORD@DBHOSTNAME:PORT/DATABASE?driver=ODBC+Driver+18+for+SQL+Server&TrustServerCertificate=yes&Encrypt=yes"
  connect_args:
    connect_timeout: 60
  isolation_level: "READ UNCOMMITTED"
jdbc:
  url: "jdbc:sqlserver://HOSTNAME:PORT;databaseName=DATABASE;sendStringParametersAsUnicode=false;encrypt=true;trustServerCertificate=true;"
  properties:
    user: USERNAME
    password: "PASSWORD"
    driver: "com.microsoft.sqlserver.jdbc.SQLServerDriver"
    fetchsize: "2000"
    sessionInitStatement: "SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED"
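# fetchsize and sessionInitStatement above are standard Spark JDBC reader options;
# this JDBC connection is presumably what Spark uses to read from SQL Server.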
# ~50 MB (50,000,000 bytes) is our target .json.gz size
# Make sure this is less than spark.hadoop.fs.gs.outputstream.upload.cache.size above
target_partition_size_bytes: 50000000
# First-pass introspection will use this value to determine partition size
default_rows_per_partition: 1000000
introspection_expire_s: 1209600 # 2 weeks
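# Local TinyDB state files (assumption: introspection cache and run-date tracking)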
tinydb_database_file: tinydb_dumpy.json
tinydb_date: tinydb_date.json
last_successful_run: "1-DEC-2023"
# Controls parallelism of SQL introspection workers (one SQL connection per worker)
# note: introspect_workers + spark.threads = total parallel SQL queries
introspect_workers: {{INTROSPECT_THREADS}}
# No more than this number of Spark jobs will be in 'running' state.
extract_workers: 64
# Controls parallelism of BigQuery load operations
load_workers: {{BQ_LOAD_THREADS}}
log_file: {{EXTRACT_LOG_FILE}}