Skip to content

Commit

Permalink
fix resids being off when there is no constant term
Browse files Browse the repository at this point in the history
  • Loading branch information
dwreeves committed Apr 3, 2023
1 parent 953240a commit 162ca0a
Show file tree
Hide file tree
Showing 15 changed files with 120 additions and 70 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -224,9 +224,9 @@ All formats have their own format options, which can be passed into the `format_
- **coefficient_column_name** (default = `'coefficient'`): Column name storing model coefficients.
- **strip_quotes** (default = `True`): If true, strip outer quotes from column names if provided; if false, always use string literals.

These options are only available when `method='chol'`:
These options are only available when both `format='long'` and `method='chol'`:

- **calculate_standard_error** (default = `'calculate_standard_error'`): If true, provide the standard error in the output.
- **calculate_standard_error** (default = `True if not alpha else False`): If true, provide the standard error in the output.
- **standard_error_column_name** (default = `'standard_error'`): Column name storing the standard error for the parameter.
- **t_statistic_column_name** (default = `'t_statistic'`): Column name storing the t-statistic for the parameter.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ with

expected as (

select 'x1' as variable_name, 30.500076644845674 as coefficient
select 'x1' as variable_name, 30.500076644845674 as coefficient, 0.8396121329329627 as standard_error, 36.326388636502585 as t_statistic

)

Expand All @@ -11,5 +11,10 @@ from {{ ref('collinear_matrix_1var_without_const') }} as base
full outer join expected
on base.variable_name = expected.variable_name
where
round(base.coefficient, 7) - round(expected.coefficient, 7)
round(base.coefficient, 7) != round(expected.coefficient, 7)
or round(base.standard_error, 7) != round(expected.standard_error, 7)
or round(base.t_statistic, 7) != round(expected.t_statistic, 7)
or base.coefficient is null
or base.standard_error is null
or base.t_statistic is null
or expected.coefficient is null
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ with

expected as (

select 'x1' as variable_name, 63.18154691334764 as coefficient
select 'x1' as variable_name, 63.18154691334764 as coefficient, 0.4056389914380657 as standard_error, 155.75807120848344 as t_statistic
union all
select 'x2' as variable_name, 55.39820150046505 as coefficient
select 'x2' as variable_name, 55.39820150046505 as coefficient, 0.2738669097295638 as standard_error, 202.2814715190283 as t_statistic

)

Expand All @@ -13,5 +13,10 @@ from {{ ref('collinear_matrix_2var_without_const') }} as base
full outer join expected
on base.variable_name = expected.variable_name
where
round(base.coefficient, 7) - round(expected.coefficient, 7)
round(base.coefficient, 7) != round(expected.coefficient, 7)
or round(base.standard_error, 7) != round(expected.standard_error, 7)
or round(base.t_statistic, 7) != round(expected.t_statistic, 7)
or base.coefficient is null
or base.standard_error is null
or base.t_statistic is null
or expected.coefficient is null
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ with

expected as (

select 'x1' as variable_name, 20.090207982897063 as coefficient
select 'x1' as variable_name, 20.090207982897063 as coefficient, 0.5196176972417176 as standard_error, 38.6634406209445 as t_statistic
union all
select 'x2' as variable_name, -16.533211090826203 as coefficient
select 'x2' as variable_name, -16.533211090826203 as coefficient, 0.7481701784700665 as standard_error, -22.098195793682894 as t_statistic
union all
select 'x3' as variable_name, 35.00389104686492 as coefficient
select 'x3' as variable_name, 35.00389104686492 as coefficient, 0.351617515124373 as standard_error, 99.55104493154575 as t_statistic

)

Expand All @@ -15,5 +15,10 @@ from {{ ref('collinear_matrix_3var_without_const') }} as base
full outer join expected
on base.variable_name = expected.variable_name
where
round(base.coefficient, 7) - round(expected.coefficient, 7)
round(base.coefficient, 7) != round(expected.coefficient, 7)
or round(base.standard_error, 7) != round(expected.standard_error, 7)
or round(base.t_statistic, 7) != round(expected.t_statistic, 7)
or base.coefficient is null
or base.standard_error is null
or base.t_statistic is null
or expected.coefficient is null
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@ with

expected as (

select 'x1' as variable_name, 20.587532776354163 as coefficient
select 'x1' as variable_name, 20.587532776354163 as coefficient, 0.5176259827853541 as standard_error, 39.772989496339235 as t_statistic
union all
select 'x2' as variable_name, -20.41001520357013 as coefficient
select 'x2' as variable_name, -20.41001520357013 as coefficient, 0.8103907603637923 as standard_error, -25.185399688426696 as t_statistic
union all
select 'x3' as variable_name, 35.084935774341524 as coefficient
select 'x3' as variable_name, 35.084935774341524 as coefficient, 0.34920588221192245 as standard_error, 100.4706322588505 as t_statistic
union all
select 'x4' as variable_name, 1.8960558858899716 as coefficient
select 'x4' as variable_name, 1.8960558858899716 as coefficient, 0.1583538085466205 as standard_error, 11.973541421529871 as t_statistic

)

Expand All @@ -17,5 +17,10 @@ from {{ ref('collinear_matrix_4var_without_const') }} as base
full outer join expected
on base.variable_name = expected.variable_name
where
round(base.coefficient, 7) - round(expected.coefficient, 7)
round(base.coefficient, 7) != round(expected.coefficient, 7)
or round(base.standard_error, 7) != round(expected.standard_error, 7)
or round(base.t_statistic, 7) != round(expected.t_statistic, 7)
or base.coefficient is null
or base.standard_error is null
or base.t_statistic is null
or expected.coefficient is null
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@ with

expected as (

select 'x1' as variable_name, 11.392300499659957 as coefficient
select 'x1' as variable_name, 11.392300499659957 as coefficient, 0.5240533254061608 as standard_error, 21.73881921430515 as t_statistic
union all
select 'x2' as variable_name, 2.333060182571783 as coefficient
select 'x2' as variable_name, 2.333060182571783 as coefficient, 0.9201150492406911 as standard_error, 2.5356178931070636 as t_statistic
union all
select 'x3' as variable_name, 21.895814737788875 as coefficient
select 'x3' as variable_name, 21.895814737788875 as coefficient, 0.44810399169425286 as standard_error, 48.8632441210849 as t_statistic
union all
select 'x4' as variable_name, 3.4480236159406785 as coefficient
select 'x4' as variable_name, 3.4480236159406785 as coefficient, 0.1504072830205524 as standard_error, 22.92457882820424 as t_statistic
union all
select 'x5' as variable_name, 15.766951731565559 as coefficient
select 'x5' as variable_name, 15.766951731565559 as coefficient, 0.37297028350495787 as standard_error, 42.274015997727524 as t_statistic

)

Expand All @@ -19,5 +19,10 @@ from {{ ref('collinear_matrix_5var_without_const') }} as base
full outer join expected
on base.variable_name = expected.variable_name
where
round(base.coefficient, 7) - round(expected.coefficient, 7)
round(base.coefficient, 7) != round(expected.coefficient, 7)
or round(base.standard_error, 7) != round(expected.standard_error, 7)
or round(base.t_statistic, 7) != round(expected.t_statistic, 7)
or base.coefficient is null
or base.standard_error is null
or base.t_statistic is null
or expected.coefficient is null
21 changes: 13 additions & 8 deletions integration_tests/tests/test_collinear_matrix_regression_chol.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,29 @@ with

expected as (

select 'const' as variable_name, 19.757104885315176 as coefficient
select 'const' as variable_name, 19.757104885315176 as coefficient, 2.992803142237603 as standard_error, 6.601538406078909 as t_statistic
union all
select 'x1' as variable_name, 9.90708767581426 as coefficient
select 'x1' as variable_name, 9.90708767581426 as coefficient, 0.5692826957191374 as standard_error, 17.402755696445837 as t_statistic
union all
select 'x2' as variable_name, 6.187473206056227 as coefficient
select 'x2' as variable_name, 6.187473206056227 as coefficient, 1.0880807259333622 as standard_error, 5.686593888287631 as t_statistic
union all
select 'x3' as variable_name, 19.66874583168642 as coefficient
select 'x3' as variable_name, 19.66874583168642 as coefficient, 0.5601379212447676 as standard_error, 35.11411223146169 as t_statistic
union all
select 'x4' as variable_name, 3.7192417102253468 as coefficient
select 'x4' as variable_name, 3.7192417102253468 as coefficient, 0.15560940177101745 as standard_error, 23.901137514160553 as t_statistic
union all
select 'x5' as variable_name, 13.444273483323244 as coefficient
select 'x5' as variable_name, 13.444273483323244 as coefficient, 0.5121595119107619 as standard_error, 26.250168493728488 as t_statistic

)

select base.variable_name
select base.*
from {{ ref('collinear_matrix_regression_chol') }} as base
full outer join expected
on base.variable_name = expected.variable_name
where
round(base.coefficient, 7) - round(expected.coefficient, 7)
round(base.coefficient, 7) != round(expected.coefficient, 7)
or round(base.standard_error, 7) != round(expected.standard_error, 7)
or round(base.t_statistic, 7) != round(expected.t_statistic, 7)
or base.coefficient is null
or base.standard_error is null
or base.t_statistic is null
or expected.coefficient is null
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,29 @@ with

expected as (

select 'const' as variable_name, 19.757104885315176 as coefficient
select 'const' as variable_name, 19.757104885315176 as coefficient, 2.992803142237603 as standard_error, 6.601538406078909 as t_statistic
union all
select 'x1' as variable_name, 9.90708767581426 as coefficient
select 'x1' as variable_name, 9.90708767581426 as coefficient, 0.5692826957191374 as standard_error, 17.402755696445837 as t_statistic
union all
select 'x2' as variable_name, 6.187473206056227 as coefficient
select 'x2' as variable_name, 6.187473206056227 as coefficient, 1.0880807259333622 as standard_error, 5.686593888287631 as t_statistic
union all
select 'x3' as variable_name, 19.66874583168642 as coefficient
select 'x3' as variable_name, 19.66874583168642 as coefficient, 0.5601379212447676 as standard_error, 35.11411223146169 as t_statistic
union all
select 'x4' as variable_name, 3.7192417102253468 as coefficient
select 'x4' as variable_name, 3.7192417102253468 as coefficient, 0.15560940177101745 as standard_error, 23.901137514160553 as t_statistic
union all
select 'x5' as variable_name, 13.444273483323244 as coefficient
select 'x5' as variable_name, 13.444273483323244 as coefficient, 0.5121595119107619 as standard_error, 26.250168493728488 as t_statistic

)

select base.variable_name
select base.*
from {{ ref('collinear_matrix_regression_chol_unoptimized') }} as base
full outer join expected
on base.variable_name = expected.variable_name
where
round(base.coefficient, 7) - round(expected.coefficient, 7)
round(base.coefficient, 7) != round(expected.coefficient, 7)
or round(base.standard_error, 7) != round(expected.standard_error, 7)
or round(base.t_statistic, 7) != round(expected.t_statistic, 7)
or base.coefficient is null
or base.standard_error is null
or base.t_statistic is null
or expected.coefficient is null
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,6 @@ from {{ ref('collinear_matrix_regression_fwl') }} as base
full outer join expected
on base.variable_name = expected.variable_name
where
round(base.coefficient, 7) - round(expected.coefficient, 7)
round(base.coefficient, 7) != round(expected.coefficient, 7)
or base.coefficient is null
or expected.coefficient is null
23 changes: 14 additions & 9 deletions integration_tests/tests/test_groups_matrix_regression_chol.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,21 @@ with

expected as (

select 'a' as gb_var, 'const' as variable_name, -0.06563066041472207 as coefficient
select 'a' as gb_var, 'const' as variable_name, -0.06563066041472207 as coefficient, 0.053945103940799474 as standard_error, -1.2166194078844779 as t_statistic
union all
select 'a' as gb_var, 'x1' as variable_name, 0.9905419281557593 as coefficient
select 'a' as gb_var, 'x1' as variable_name, 0.9905419281557593 as coefficient, 0.015209571618398615 as standard_error, 65.12622136954383 as t_statistic
union all
select 'a' as gb_var, 'x2' as variable_name, 4.948221700496285 as coefficient
select 'a' as gb_var, 'x2' as variable_name, 4.948221700496285 as coefficient, 0.02906881854690807 as standard_error, 170.2243829590593 as t_statistic
union all
select 'a' as gb_var, 'x3' as variable_name, 0.031234030051974747 as coefficient
select 'a' as gb_var, 'x3' as variable_name, 0.031234030051974747 as coefficient, 0.014337008978330493 as standard_error, 2.178559705108859 as t_statistic
union all
select 'b' as gb_var, 'const' as variable_name, 2.0117130483709955 as coefficient
select 'b' as gb_var, 'const' as variable_name, 2.0117130483709955 as coefficient, 0.035587045398501334 as standard_error, 56.529364150464545 as t_statistic
union all
select 'b' as gb_var, 'x1' as variable_name, 2.996331112245573 as coefficient
select 'b' as gb_var, 'x1' as variable_name, 2.996331112245573 as coefficient, 0.006731681784764358 as standard_error, 445.1088462064698 as t_statistic
union all
select 'b' as gb_var, 'x2' as variable_name, 9.019683491736044 as coefficient
select 'b' as gb_var, 'x2' as variable_name, 9.019683491736044 as coefficient, 0.008744674914389008 as standard_error, 1031.4486907791759 as t_statistic
union all
select 'b' as gb_var, 'x3' as variable_name, 0.016151316166848173 as coefficient
select 'b' as gb_var, 'x3' as variable_name, 0.016151316166848173 as coefficient, 0.0072206704541224525 as standard_error, 2.2368166875178472 as t_statistic

)

Expand All @@ -27,5 +27,10 @@ on
base.gb_var = expected.gb_var
and base.variable_name = expected.variable_name
where
round(base.coefficient, 7) - round(expected.coefficient, 7)
round(base.coefficient, 7) != round(expected.coefficient, 7)
or round(base.standard_error, 7) != round(expected.standard_error, 7)
or round(base.t_statistic, 7) != round(expected.t_statistic, 7)
or base.coefficient is null
or base.standard_error is null
or base.t_statistic is null
or expected.coefficient is null
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,21 @@ with

expected as (

select 'a' as gb_var, 'const' as variable_name, -0.06563066041472207 as coefficient
select 'a' as gb_var, 'const' as variable_name, -0.06563066041472207 as coefficient, 0.053945103940799474 as standard_error, -1.2166194078844779 as t_statistic
union all
select 'a' as gb_var, 'x1' as variable_name, 0.9905419281557593 as coefficient
select 'a' as gb_var, 'x1' as variable_name, 0.9905419281557593 as coefficient, 0.015209571618398615 as standard_error, 65.12622136954383 as t_statistic
union all
select 'a' as gb_var, 'x2' as variable_name, 4.948221700496285 as coefficient
select 'a' as gb_var, 'x2' as variable_name, 4.948221700496285 as coefficient, 0.02906881854690807 as standard_error, 170.2243829590593 as t_statistic
union all
select 'a' as gb_var, 'x3' as variable_name, 0.031234030051974747 as coefficient
select 'a' as gb_var, 'x3' as variable_name, 0.031234030051974747 as coefficient, 0.014337008978330493 as standard_error, 2.178559705108859 as t_statistic
union all
select 'b' as gb_var, 'const' as variable_name, 2.0117130483709955 as coefficient
select 'b' as gb_var, 'const' as variable_name, 2.0117130483709955 as coefficient, 0.035587045398501334 as standard_error, 56.529364150464545 as t_statistic
union all
select 'b' as gb_var, 'x1' as variable_name, 2.996331112245573 as coefficient
select 'b' as gb_var, 'x1' as variable_name, 2.996331112245573 as coefficient, 0.006731681784764358 as standard_error, 445.1088462064698 as t_statistic
union all
select 'b' as gb_var, 'x2' as variable_name, 9.019683491736044 as coefficient
select 'b' as gb_var, 'x2' as variable_name, 9.019683491736044 as coefficient, 0.008744674914389008 as standard_error, 1031.4486907791759 as t_statistic
union all
select 'b' as gb_var, 'x3' as variable_name, 0.016151316166848173 as coefficient
select 'b' as gb_var, 'x3' as variable_name, 0.016151316166848173 as coefficient, 0.0072206704541224525 as standard_error, 2.2368166875178472 as t_statistic

)

Expand All @@ -27,5 +27,10 @@ on
base.gb_var = expected.gb_var
and base.variable_name = expected.variable_name
where
round(base.coefficient, 7) - round(expected.coefficient, 7)
round(base.coefficient, 7) != round(expected.coefficient, 7)
or round(base.standard_error, 7) != round(expected.standard_error, 7)
or round(base.t_statistic, 7) != round(expected.t_statistic, 7)
or base.coefficient is null
or base.standard_error is null
or base.t_statistic is null
or expected.coefficient is null
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,6 @@ on
base.gb_var = expected.gb_var
and base.variable_name = expected.variable_name
where
round(base.coefficient, 7) - round(expected.coefficient, 7)
round(base.coefficient, 7) != round(expected.coefficient, 7)
or base.coefficient is null
or expected.coefficient is null
13 changes: 9 additions & 4 deletions macros/linear_regression/ols_impl_chol/_ols_impl_chol.sql
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,12 @@
{%- endif %}
{%- set subquery_optimization = method_options.get('subquery_optimization', True) %}
{%- set safe_sqrt = method_options.get('safe', True) %}
{%- set calculate_standard_error = format_options.get('calculate_standard_error', True) and format == 'long' %}
{%- set calculate_standard_error = format_options.get('calculate_standard_error', (not alpha)) and format == 'long' %}
{%- if alpha and calculate_standard_error %}
{% do log(
'Warning: Standard errors are NOT designed to take into account ridge regression regularization.'
) %}
{%- endif %}
{%- if add_constant %}
{% set xmin = 0 %}
{%- else %}
Expand Down Expand Up @@ -264,11 +269,11 @@ _dbt_linreg_final_coefs as (
_dbt_linreg_resid as (
select
{{ dbt_linreg._gb_cols(group_by, trailing_comma=True, prefix='b') | indent(4) }}
var_pop(y
avg(pow(y
{%- for x in xcols %}
- x{{ x }} * x{{ x }}_coef
{%- endfor %}
) as resid_var,
, 2)) as resid_square_mean,
count(*) as n
from
_dbt_linreg_base as b
Expand All @@ -282,7 +287,7 @@ _dbt_linreg_stderrs as (
select
{{ dbt_linreg._gb_cols(group_by, trailing_comma=True, prefix='b') | indent(4) }}
{%- for x in xcols %}
sqrt(inv_x{{ x }}x{{ x }} * resid_var * n / (n - {{ xcols | length }})) as x{{ x }}_stderr
sqrt(inv_x{{ x }}x{{ x }} * resid_square_mean * n / (n - {{ xcols | length }})) as x{{ x }}_stderr
{%- if not loop.last -%}
,
{%- endif %}
Expand Down
3 changes: 0 additions & 3 deletions run
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

set -eo pipefail



function setup {
poetry install
poetry run pre-commit install
Expand All @@ -17,7 +15,6 @@ function testloc {
poetry run dbt run --project-dir ./integration_tests --select tag:perftest
}


function test {
# rm -f integration_tests/dbt.duckdb
export DBT_PROFILES_DIR=./integration_tests/profiles
Expand Down
11 changes: 6 additions & 5 deletions scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,13 +248,14 @@ def _run_model(cond=None):
).fit_regularized(L1_wt=0, alpha=alpha_arr)
else:
model = sm.OLS(y, x_mat).fit()
res_df = pd.DataFrame(index=x_mat.columns)
res_df["coef"] = model.params
res_df["stderr"] = model.bse
res_df["tstat"] = res_df["coef"] / res_df["stderr"]
click.echo(
tabulate(
pd.DataFrame(
{"coefficient": model.params},
index=x_mat.columns
),
headers=["coefficient", "value"],
res_df,
headers=["column name", "coef", "stderr", "tstat"],
disable_numparse=True,
tablefmt="psql",
)
Expand Down

0 comments on commit 162ca0a

Please sign in to comment.