From 93d303f8f07f9d07e6a409af86a7370daf23ae74 Mon Sep 17 00:00:00 2001 From: Tristan Handy Date: Tue, 15 Aug 2017 19:02:23 -0400 Subject: [PATCH] added optional fivetran layer --- dbt_project.yml | 4 +- models/base/snowplow_base_events.sql | 47 ++++- models/fivetran/sp_base_events_fivetran.sql | 190 ++++++++++++++++++++ 3 files changed, 239 insertions(+), 2 deletions(-) create mode 100644 models/fivetran/sp_base_events_fivetran.sql diff --git a/dbt_project.yml b/dbt_project.yml index 332c375..083a184 100644 --- a/dbt_project.yml +++ b/dbt_project.yml @@ -1,7 +1,7 @@ #settings specifically for this models directory #config other dbt settings within ~/.dbt/profiles.yml name: 'snowplow' -version: '0.0.0.1' +version: '0.0.1' source-paths: ["models"] target-path: "target" @@ -14,12 +14,14 @@ macro-paths: ["macros"] models: snowplow: vars: + 'snowplow:use_fivetran_interface': false #'snowplow:events': TABLE OR {{ REF() }} #'snowplow:context:web_page': TABLE OR {{ REF() }} #'snowplow:context:performance_timing': TABLE OR {{ REF() }} or FALSE #'snowplow:context:useragent': TABLE OR {{ REF() }} or FALSE 'snowplow:timezone': 'America/New_York' 'snowplow:page_ping_frequency': 30 + 'snowplow:app_ids': [] base: materialized: ephemeral optional: diff --git a/models/base/snowplow_base_events.sql b/models/base/snowplow_base_events.sql index 685a632..177d806 100644 --- a/models/base/snowplow_base_events.sql +++ b/models/base/snowplow_base_events.sql @@ -1,2 +1,47 @@ +with source as ( -select * from {{ var('snowplow:events') }} + select * from + + {% if var('snowplow:use_fivetran_interface') %} + + {{ref('sp_base_events_fivetran')}} + + {% else %} + + {{ var('snowplow:events') }} + + {% endif %} + +), + +filtered as ( + + select * + from source + + + where true + + {% if var('snowplow:app_ids')|length > 0 %} + + and app_id in ( + {% for app_id in var('snowplow:app_ids') %} + '{{app_id}}' + {% if not loop.last %} + , + {% endif %} + {% endfor %} + ) + + {% endif %} + + --these fields should never be null; there's a quirk where small numbers of + --events have made it through without these fields; ignore these events + --so as not to throw off downstream models. + and domain_sessionid is not null + and domain_sessionidx is not null + and domain_userid is not null + +) + +select * from filtered diff --git a/models/fivetran/sp_base_events_fivetran.sql b/models/fivetran/sp_base_events_fivetran.sql new file mode 100644 index 0000000..4daf176 --- /dev/null +++ b/models/fivetran/sp_base_events_fivetran.sql @@ -0,0 +1,190 @@ +{% set br_viewwidth = " + case + when br_viewheight like '%x%' + then split_part(br_viewheight, 'x', 1)::integer + else null + end +"%} + +{% set br_viewheight = " + case + when br_viewheight like '%x%' + then split_part(br_viewheight, 'x', 2)::integer + else null + end +"%} + + +with events as ( + + select * from {{ var('snowplow:events') }} + +), + +fixed as ( + + select *, + -- Fivetran doesn't exactly adhere to the Snowplow spec. + -- Just make these fields null so we can use the open source snowplow dbt package + collector_tstamp as derived_tstamp, + dvce_tstamp::timestamp as dvce_created_tstamp, + case when event = 'pv' then 'page_view' + when event = 'pp' then 'page_ping' + else event end as fixed_event, + null::timestamp as etl_tstamp, + null as dvce_screenwidth, + null as mkt_clickid, + null as mkt_network + + from events + + where + -- this breaks the snowplow package + (br_viewheight is null or br_viewheight != 'cliqz.com/tracking') + +), renamed as ( + + select + "br_name", + "br_family", + "br_version", + "br_type", + "br_renderengine", + "collector_tstamp", + "dvce_type", + "dvce_ismobile", + "os_name", + "os_family", + "os_manufacturer", + "event_vendor", + "event_format", + "event_version", + "page_urlscheme", + "page_urlhost", + "page_urlport", + "page_urlpath", + "page_urlquery", + "page_urlfragment", + "refr_urlscheme", + "refr_urlhost", + "refr_urlport", + "refr_urlpath", + "refr_urlfragment", + "refr_medium", + "refr_source", + "refr_term", + nullif("mkt_source", '') as "mkt_source", + nullif("mkt_medium", '') as "mkt_medium", + nullif("mkt_campaign", '') as "mkt_campaign", + nullif("mkt_term", '') as "mkt_term", + nullif("mkt_content", '') as "mkt_content", + "sequence_number", + "fixed_event" as "event", -- * + "fixed_event" as "event_name", -- * + "user_ipaddress", + "app_id", + "platform", + "txn_id", + "user_id", + "domain_userid", + "network_userid", + "user_fingerprint", + "domain_sessionidx", + "domain_sessionid", + "dvce_sent_tstamp", + "name_tracker", + "v_tracker", + "v_collector", + "br_lang", + "br_features_pdf", + "br_features_flash", + "br_features_java", + "br_features_director", + "br_features_quicktime", + "br_features_realplayer", + "br_features_windowsmedia", + "br_features_gears", + "br_features_silverlight", + "br_cookies", + "dvce_screenheight", + "br_colordepth", + "os_timezone", + "page_url", + "doc_charset", + "doc_height", + "doc_width", + "event_id", + "se_category", + "se_action", + replace(replace(se_label, '%40', '@'), '%2B', '+') as se_label, + "se_property", + "se_value", + "tr_orderid", + "tr_affiliation", + "tr_total", + "tr_tax", + "tr_shipping", + "tr_city", + "tr_state", + "tr_country", + "ti_orderid", + "ti_sku", + "ti_name", + "ti_category", + "ti_price", + "ti_quantity", + "pp_xoffset_min", + "pp_xoffset_max", + "pp_yoffset_min", + "pp_yoffset_max", + "tr_currency", + "ti_currency", + "refr_urlquery", + "page_referrer", + "page_title", + "useragent", + + -- Missing from Fivetran -- injected above + "dvce_created_tstamp", + "etl_tstamp", + "geo_country", + "geo_region", + "geo_city", + "geo_zipcode", + "geo_latitude", + "geo_longitude", + "geo_timezone", + "dvce_screenwidth", + "mkt_clickid", + "mkt_network", + "derived_tstamp", + "geo_region_name", + + null as "ip_isp", + null as "ip_organization", + null as "ip_domain", + null as "ip_netspeed", + + null as "browser", + null as "browser_name", + null as "browser_major_version", + null as "browser_minor_version", + null as "browser_build_version", + null as "browser_engine", + null as "browser_language", + + case + when ({{br_viewwidth}}) > 100000 then null + else {{br_viewwidth}} + end as br_viewwidth, + + case + when ({{br_viewheight}}) > 100000 then null + else {{br_viewheight}} + end as br_viewheight + + from fixed + +) + +select * from renamed