diff --git a/macros/adapters/bigquery/pageviews/snowplow_page_views.sql b/macros/adapters/bigquery/pageviews/snowplow_page_views.sql
index 7c37ba0..4aeebe2 100644
--- a/macros/adapters/bigquery/pageviews/snowplow_page_views.sql
+++ b/macros/adapters/bigquery/pageviews/snowplow_page_views.sql
@@ -177,7 +177,9 @@ page_views as (
     where event = 'page_view'
       and (br_family != 'Robot/Spider' or br_family is null)
       and (
-        not regexp_contains(LOWER(useragent), '^.*(bot|crawl|slurp|spider|archiv|spinn|sniff|seo|audit|survey|pingdom|worm|capture|(browser|screen)shots|analyz|index|thumb|check|facebook|phantomjs|yandexbot|twitterbot|a_archiver|facebookexternalhit|bingbot|bingpreview|googlebot|baiduspider|360(spider|user-agent)|semalt).*$')
+        {# pipe-join the shared bot substrings from bot_any() into one regex alternation #}
+        {% set bad_agents_psv = bot_any()|join('|') %}
+        not regexp_contains(LOWER(useragent), '^.*({{bad_agents_psv}}).*$')
         or useragent is null
       )
       and domain_userid is not null
diff --git a/macros/adapters/default/page_views/snowplow_page_views.sql b/macros/adapters/default/page_views/snowplow_page_views.sql
index 886ef1b..d99eac8 100644
--- a/macros/adapters/default/page_views/snowplow_page_views.sql
+++ b/macros/adapters/default/page_views/snowplow_page_views.sql
@@ -296,41 +296,14 @@ prep as (
     {% endif %}
 
     where (a.br_family != 'Robot/Spider' or a.br_family is null)
-        and not (
-            (useragent like '%bot%'
-            or useragent like '%crawl%'
-            or useragent like '%slurp%'
-            or useragent like '%spider%'
-            or useragent like '%archiv%'
-            or useragent like '%spinn%'
-            or useragent like '%sniff%'
-            or useragent like '%seo%'
-            or useragent like '%audit%'
-            or useragent like '%survey%'
-            or useragent like '%pingdom%'
-            or useragent like '%worm%'
-            or useragent like '%capture%'
-            or useragent like '%browsershots%'
-            or useragent like '%screenshots%'
-            or useragent like '%analyz%'
-            or useragent like '%index%'
-            or useragent like '%thumb%'
-            or useragent like '%check%'
-            or useragent like '%facebook%'
-            or useragent like '%PingdomBot%'
-            or useragent like '%PhantomJS%'
-            or useragent like '%YorexBot%'
-            or useragent like '%Twitterbot%'
-            or useragent like '%a_archiver%'
-            or useragent like '%facebookexternalhit%'
-            or useragent like '%Bingbot%'
-            or useragent like '%BingPreview%'
-            or useragent like '%Googlebot%'
-            or useragent like '%Baiduspider%'
-            or useragent like '%360Spider%'
-            or useragent like '%360User-agent%'
-            or useragent like '%semalt%')
-            or a.useragent is null)
+        and (
+            {# one case-insensitive LIKE test per bot_any() entry, OR-ed together; 'or' keeps its surrounding whitespace #}
+            not ({% for bad_agent in bot_any() %}
+                lower(a.useragent) like '%{{ bad_agent }}%'
+                {{ 'or' if not loop.last else '' }}
+            {% endfor %})
+            or a.useragent is null
+        )
         and coalesce(a.br_type, 'unknown') not in ('Bot/Crawler', 'Robot')
         and a.domain_userid is not null
         and a.domain_sessionidx > 0
diff --git a/macros/bots.sql b/macros/bots.sql
new file mode 100644
index 0000000..a62c5bc
--- /dev/null
+++ b/macros/bots.sql
@@ -0,0 +1,43 @@
+{#
+    Returns the list of lowercase substrings that mark a useragent as a bot.
+
+    Both page view adapters lowercase the useragent before matching: BigQuery
+    joins the entries with '|' into one regexp_contains alternation, and the
+    default adapter renders one LIKE test per entry. Entries are interpolated
+    verbatim, so keep them lowercase and free of single quotes and regex
+    metacharacters; note that '_' acts as a single-character wildcard in LIKE.
+#}
+{% macro bot_any() %}
+
+    {% set all_my_bots = [
+        'bot',
+        'crawl',
+        'slurp',
+        'spider',
+        'archiv',
+        'spinn',
+        'sniff',
+        'seo',
+        'audit',
+        'survey',
+        'pingdom',
+        'worm',
+        'capture',
+        'browsershots',
+        'screenshots',
+        'analyz',
+        'index',
+        'thumb',
+        'check',
+        'facebook',
+        'phantomjs',
+        'a_archiver',
+        'facebookexternalhit',
+        'bingpreview',
+        '360user-agent',
+        'semalt'
+    ] %}
+
+    {% do return(all_my_bots) %}
+
+{% endmacro %}