Skip to content

Commit

Permalink
Consistent bot useragent filtering (#81)
Browse files Browse the repository at this point in the history
* Use macro to define bots. Consistent filtering

* Rm redundant bots

* Spiders too, oh my
  • Loading branch information
jtcohen6 authored Mar 20, 2020
1 parent 3f43112 commit 4acbdc9
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 36 deletions.
3 changes: 2 additions & 1 deletion macros/adapters/bigquery/pageviews/snowplow_page_views.sql
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,8 @@ page_views as (
where event = 'page_view'
and (br_family != 'Robot/Spider' or br_family is null)
and (
not regexp_contains(LOWER(useragent), '^.*(bot|crawl|slurp|spider|archiv|spinn|sniff|seo|audit|survey|pingdom|worm|capture|(browser|screen)shots|analyz|index|thumb|check|facebook|phantomjs|yandexbot|twitterbot|a_archiver|facebookexternalhit|bingbot|bingpreview|googlebot|baiduspider|360(spider|user-agent)|semalt).*$')
{% set bad_agents_psv = bot_any()|join('|') %}
not regexp_contains(LOWER(useragent), '^.*({{bad_agents_psv}}).*$')
or useragent is null
)
and domain_userid is not null
Expand Down
42 changes: 7 additions & 35 deletions macros/adapters/default/page_views/snowplow_page_views.sql
Original file line number Diff line number Diff line change
Expand Up @@ -296,41 +296,13 @@ prep as (
{% endif %}

where (a.br_family != 'Robot/Spider' or a.br_family is null)
and not (
(useragent like '%bot%'
or useragent like '%crawl%'
or useragent like '%slurp%'
or useragent like '%spider%'
or useragent like '%archiv%'
or useragent like '%spinn%'
or useragent like '%sniff%'
or useragent like '%seo%'
or useragent like '%audit%'
or useragent like '%survey%'
or useragent like '%pingdom%'
or useragent like '%worm%'
or useragent like '%capture%'
or useragent like '%browsershots%'
or useragent like '%screenshots%'
or useragent like '%analyz%'
or useragent like '%index%'
or useragent like '%thumb%'
or useragent like '%check%'
or useragent like '%facebook%'
or useragent like '%PingdomBot%'
or useragent like '%PhantomJS%'
or useragent like '%YorexBot%'
or useragent like '%Twitterbot%'
or useragent like '%a_archiver%'
or useragent like '%facebookexternalhit%'
or useragent like '%Bingbot%'
or useragent like '%BingPreview%'
or useragent like '%Googlebot%'
or useragent like '%Baiduspider%'
or useragent like '%360Spider%'
or useragent like '%360User-agent%'
or useragent like '%semalt%')
or a.useragent is null)
and (
not ({% for bad_agent in bot_any() %}
lower(useragent) like '%{{bad_agent}}%'
{{- 'or' if not loop.last -}}
{% endfor %})
or a.useragent is null
)
and coalesce(a.br_type, 'unknown') not in ('Bot/Crawler', 'Robot')
and a.domain_userid is not null
and a.domain_sessionidx > 0
Expand Down
34 changes: 34 additions & 0 deletions macros/bots.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{% macro bot_any() %}
    {#-
        Returns a list of useragent substrings used to identify automated
        traffic (bots, crawlers, spiders, monitoring and screenshot tools).

        Takes no arguments. The result is a plain list of strings, so a
        caller can iterate over it or join it into a single pattern.

        Every entry is lowercase, so presumably callers compare against a
        lowercased useragent -- verify at each call site.
    -#}
    {%- set bot_useragent_fragments = [
        'bot', 'crawl', 'slurp', 'spider',
        'archiv', 'spinn', 'sniff', 'seo',
        'audit', 'survey', 'pingdom', 'worm',
        'capture', 'browsershots', 'screenshots', 'analyz',
        'index', 'thumb', 'check', 'facebook',
        'phantomjs', 'a_archiver', 'facebookexternalhit', 'bingpreview',
        '360user-agent', 'semalt'
    ] -%}

    {{ return(bot_useragent_fragments) }}
{% endmacro %}

0 comments on commit 4acbdc9

Please sign in to comment.