Skip to content

Commit

Permalink
Use population-based weighted sampling for global builds
Browse files Browse the repository at this point in the history
Extend the population-based weighted sampling approach from regional builds
to global builds. This has the added benefit of simplifying the configuration
by removing the need for region/country-specific max_sequences values.
  • Loading branch information
victorlin committed Sep 30, 2024
1 parent d6de734 commit ae764bb
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 789 deletions.
281 changes: 18 additions & 263 deletions nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -287,297 +287,52 @@ subsampling:
# Custom subsampling logic for the global build with early/recent samples split at the 1M date cutoff
# 5125 total (expect ~3400)
# 4:1 ratio of recent to early
# all eight regions equal except Oceania at 20%
nextstrain_global_1m:
africa_early:
early:
group_by: "country year month"
max_sequences: 150
max_date: "--max-date 1M"
exclude: "--exclude-where 'region!=Africa'"
asia_early:
group_by: "country year month"
max_sequences: 200
max_date: "--max-date 1M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
china_early:
group_by: "division year month"
max_sequences: 175
max_date: "--max-date 1M"
exclude: "--exclude-where 'country!=China'"
europe_early:
group_by: "country year month"
max_sequences: 125
max_date: "--max-date 1M"
exclude: "--exclude-where 'region!=Europe'"
india_early:
group_by: "division year month"
max_sequences: 175
max_date: "--max-date 1M"
exclude: "--exclude-where 'country!=India'"
north_america_early:
group_by: "division year month"
max_sequences: 100
max_date: "--max-date 1M"
exclude: "--exclude-where 'region!=North America'"
south_america_early:
group_by: "country year month"
max_sequences: 90
max_date: "--max-date 1M"
exclude: "--exclude-where 'region!=South America'"
oceania_early:
group_by: "division year month"
max_sequences: 15
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 1025
max_date: "--max-date 1M"
exclude: "--exclude-where 'region!=Oceania'"
africa_recent:
group_by: "country week"
max_sequences: 600
min_date: "--min-date 1M"
exclude: "--exclude-where 'region!=Africa'"
asia_recent:
group_by: "country week"
max_sequences: 800
min_date: "--min-date 1M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
china_recent:
group_by: "division week"
max_sequences: 700
min_date: "--min-date 1M"
exclude: "--exclude-where 'country!=China'"
europe_recent:
recent:
group_by: "country week"
max_sequences: 500
min_date: "--min-date 1M"
exclude: "--exclude-where 'region!=Europe'"
india_recent:
group_by: "division week"
max_sequences: 700
max_sequences: 4100
min_date: "--min-date 1M"
exclude: "--exclude-where 'country!=India'"
north_america_recent:
group_by: "division week"
max_sequences: 400
min_date: "--min-date 1M"
exclude: "--exclude-where 'region!=North America'"
south_america_recent:
group_by: "country week"
max_sequences: 360
min_date: "--min-date 1M"
exclude: "--exclude-where 'region!=South America'"
oceania_recent:
group_by: "division week"
max_sequences: 60
min_date: "--min-date 1M"
exclude: "--exclude-where 'region!=Oceania'"

# Custom subsampling logic for the global build with early/recent samples split at the 2M date cutoff
# 5125 total (expect ~3400)
# 4:1 ratio of recent to early
# all eight regions equal except Oceania at 20%
nextstrain_global_2m:
africa_early:
group_by: "country year month"
max_sequences: 150
max_date: "--max-date 2M"
exclude: "--exclude-where 'region!=Africa'"
asia_early:
group_by: "country year month"
max_sequences: 200
max_date: "--max-date 2M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
china_early:
group_by: "division year month"
max_sequences: 175
max_date: "--max-date 2M"
exclude: "--exclude-where 'country!=China'"
europe_early:
early:
group_by: "country year month"
max_sequences: 125
max_date: "--max-date 2M"
exclude: "--exclude-where 'region!=Europe'"
india_early:
group_by: "division year month"
max_sequences: 175
max_date: "--max-date 2M"
exclude: "--exclude-where 'country!=India'"
north_america_early:
group_by: "division year month"
max_sequences: 100
max_date: "--max-date 2M"
exclude: "--exclude-where 'region!=North America'"
south_america_early:
group_by: "country year month"
max_sequences: 90
max_date: "--max-date 2M"
exclude: "--exclude-where 'region!=South America'"
oceania_early:
group_by: "division year month"
max_sequences: 15
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 1025
max_date: "--max-date 2M"
exclude: "--exclude-where 'region!=Oceania'"
africa_recent:
recent:
group_by: "country week"
max_sequences: 600
max_sequences: 4100
min_date: "--min-date 2M"
exclude: "--exclude-where 'region!=Africa'"
asia_recent:
group_by: "country week"
max_sequences: 800
min_date: "--min-date 2M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
china_recent:
group_by: "division week"
max_sequences: 700
min_date: "--min-date 2M"
exclude: "--exclude-where 'country!=China'"
europe_recent:
group_by: "country week"
max_sequences: 500
min_date: "--min-date 2M"
exclude: "--exclude-where 'region!=Europe'"
india_recent:
group_by: "division week"
max_sequences: 700
min_date: "--min-date 2M"
exclude: "--exclude-where 'country!=India'"
north_america_recent:
group_by: "division week"
max_sequences: 400
min_date: "--min-date 2M"
exclude: "--exclude-where 'region!=North America'"
south_america_recent:
group_by: "country week"
max_sequences: 360
min_date: "--min-date 2M"
exclude: "--exclude-where 'region!=South America'"
oceania_recent:
group_by: "division week"
max_sequences: 60
min_date: "--min-date 2M"
exclude: "--exclude-where 'region!=Oceania'"

# Custom subsampling logic for the global build with early/recent samples split at the 6M date cutoff
# 5125 total (expect ~3400)
# 4:1 ratio of recent to early
# all eight regions equal except Oceania at 20%
nextstrain_global_6m:
africa_early:
group_by: "country year month"
max_sequences: 150
max_date: "--max-date 6M"
exclude: "--exclude-where 'region!=Africa'"
asia_early:
group_by: "country year month"
max_sequences: 200
max_date: "--max-date 6M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
china_early:
group_by: "division year month"
max_sequences: 175
max_date: "--max-date 6M"
exclude: "--exclude-where 'country!=China'"
europe_early:
group_by: "country year month"
max_sequences: 125
max_date: "--max-date 6M"
exclude: "--exclude-where 'region!=Europe'"
india_early:
group_by: "division year month"
max_sequences: 175
max_date: "--max-date 6M"
exclude: "--exclude-where 'country!=India'"
north_america_early:
group_by: "division year month"
max_sequences: 100
max_date: "--max-date 6M"
exclude: "--exclude-where 'region!=North America'"
south_america_early:
early:
group_by: "country year month"
max_sequences: 90
max_date: "--max-date 6M"
exclude: "--exclude-where 'region!=South America'"
oceania_early:
group_by: "division year month"
max_sequences: 15
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 1025
max_date: "--max-date 6M"
exclude: "--exclude-where 'region!=Oceania'"
africa_recent:
group_by: "country year month"
max_sequences: 600
min_date: "--min-date 6M"
exclude: "--exclude-where 'region!=Africa'"
asia_recent:
group_by: "country year month"
max_sequences: 800
min_date: "--min-date 6M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
china_recent:
group_by: "division year month"
max_sequences: 700
min_date: "--min-date 6M"
exclude: "--exclude-where 'country!=China'"
europe_recent:
group_by: "country year month"
max_sequences: 500
min_date: "--min-date 6M"
exclude: "--exclude-where 'region!=Europe'"
india_recent:
group_by: "division year month"
max_sequences: 700
min_date: "--min-date 6M"
exclude: "--exclude-where 'country!=India'"
north_america_recent:
group_by: "division year month"
max_sequences: 400
min_date: "--min-date 6M"
exclude: "--exclude-where 'region!=North America'"
south_america_recent:
recent:
group_by: "country year month"
max_sequences: 360
max_sequences: 4100
min_date: "--min-date 6M"
exclude: "--exclude-where 'region!=South America'"
oceania_recent:
group_by: "division year month"
max_sequences: 60
min_date: "--min-date 6M"
exclude: "--exclude-where 'region!=Oceania'"

# Custom subsampling logic for the global build over all time (single sample, no date cutoff)
# 4320 total (expect ~3200)
# all eight regions equal except Oceania at 20%
nextstrain_global_all_time:
africa:
group_by: "country year month"
max_sequences: 750
exclude: "--exclude-where 'region!=Africa'"
asia:
all:
group_by: "country year month"
max_sequences: 1000
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
china:
group_by: "division year month"
max_sequences: 875
exclude: "--exclude-where 'country!=China'"
europe:
group_by: "country year month"
max_sequences: 625
exclude: "--exclude-where 'region!=Europe'"
india:
group_by: "division year month"
max_sequences: 875
exclude: "--exclude-where 'country!=India'"
north_america:
group_by: "division year month"
max_sequences: 500
exclude: "--exclude-where 'region!=North America'"
south_america:
group_by: "country year month"
max_sequences: 450
exclude: "--exclude-where 'region!=South America'"
oceania:
group_by: "division year month"
max_sequences: 75
exclude: "--exclude-where 'region!=Oceania'"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 4320

# Root to clade 21L
refine:
Expand Down
Loading

0 comments on commit ae764bb

Please sign in to comment.