Skip to content

Commit

Permalink
fix: make bootstrap more robust (and slower 😢) (#1351)
Browse files Browse the repository at this point in the history
This diff ensures that the boostrap is more robust in the scenario
described by ooni/probe#2544.

Because we're close to a release, I did not want to write a heavy
refactor of the boostrap, and specifically of the iplookup part, but I
documented what is needed at ooni/probe#2551.

This diff has been tested in a problematic network under two distinct
conditions:

1. cold bootstrap (i.e., w/o prior knowledge), where the correct fix is
to give the system resolver intermediate priority (i.e., 0.5);

2. hot bootstrap with very f***** up `sessionresolver.state`, where the
correct fix is to use a larger coarse grained timeout, such that
_eventually_ we try the system resolver.

The fix that prevents marking a resolver when the external context has been
canceled is required in both of these scenarios.

As you can see, I removed a "reliability fix", which actually was more
of an optimization. This removal means that the probe bootstrap is now
slower than it could be, hence my choice of documenting in
ooni/probe#2551 what I'd do had I had more
time to work on this topic.

BTW, I had to create an overall 45 seconds timeout for IP lookups
because we have 7 DNS over HTTPS resolvers plus the system resolver.
This means that, in the worst case where the system resolver has the
least priority, we expect 7*4 = 28 seconds of timeout before eventually
using the system resolver. The rest of the 45s timeout accounts for
operations happening after the DNS lookup has completed.
  • Loading branch information
bassosimone authored Oct 9, 2023
1 parent 9405dfd commit 6b6271a
Show file tree
Hide file tree
Showing 8 changed files with 91 additions and 15 deletions.
14 changes: 11 additions & 3 deletions internal/enginelocate/iplookup.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,21 @@ func makeSlice() []method {
return ret
}

func contextForIPLookupWithTimeout(ctx context.Context) (context.Context, context.CancelFunc) {
// TODO(https://github.com/ooni/probe/issues/2551): we must enforce a timeout this
// large to ensure we give all resolvers a chance to run. We set this value as part of
// an hotfix. The above mentioned issue explains how to improve the situation and
// avoid the need of setting such large timeouts here.
const timeout = 45 * time.Second
return context.WithTimeout(ctx, timeout)
}

func (c ipLookupClient) doWithCustomFunc(
ctx context.Context, fn lookupFunc,
) (string, error) {
// Reliability fix: let these mechanisms timeout earlier.
const timeout = 7 * time.Second
ctx, cancel := context.WithTimeout(ctx, timeout)
ctx, cancel := contextForIPLookupWithTimeout(ctx)
defer cancel()

// Implementation note: we MUST use an HTTP client that we're
// sure IS NOT using any proxy. To this end, we construct a
// client ourself that we know is not proxied.
Expand Down
17 changes: 17 additions & 0 deletions internal/enginelocate/iplookup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"errors"
"net"
"testing"
"time"

"github.com/apex/log"
"github.com/ooni/probe-cli/v3/internal/model"
Expand Down Expand Up @@ -55,3 +56,19 @@ func TestIPLookupInvalidIP(t *testing.T) {
t.Fatal("expected the default IP here")
}
}

func TestContextForIPLookupWithTimeout(t *testing.T) {
now := time.Now()
ctx, cancel := contextForIPLookupWithTimeout(context.Background())
defer cancel()
deadline, okay := ctx.Deadline()
if !okay {
t.Fatal("the context does not have a deadline")
}
delta := deadline.Sub(now)
// Note: super conservative check. Assume it may take up to five seconds
// for the code to create a context, which is totally unrealistic.
if delta < 40*time.Second {
t.Fatal("the deadline is too short")
}
}
2 changes: 2 additions & 0 deletions internal/enginelocate/stun.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ func stunNewClient(conn net.Conn) (stunClient, error) {

func stunIPLookup(ctx context.Context, config stunConfig) (string, error) {
config.Logger.Debugf("STUNIPLookup: start using %s", config.Endpoint)

ip, err := func() (string, error) {
dialer := config.Dialer
if dialer == nil {
Expand Down Expand Up @@ -74,6 +75,7 @@ func stunIPLookup(ctx context.Context, config stunConfig) (string, error) {
return model.DefaultProbeIP, ctx.Err()
}
}()

if err != nil {
config.Logger.Debugf("STUNIPLookup: failure using %s: %+v", config.Endpoint, err)
return model.DefaultProbeIP, err
Expand Down
4 changes: 2 additions & 2 deletions internal/engineresolver/doc.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
// have any preferential ordering. The initial resolutions may be slower
// if there are many issues with resolvers.
//
// The system resolver is given the lowest priority at the beginning
// but it will of course be the most popular resolver if anything else
// The system resolver is given intermediate priority at the beginning (i.e.,
// 0.5) but it will of course be the most popular resolver if anything else
// is failing us. (We will still occasionally probe for other working
// resolvers and increase their score on success.)
//
Expand Down
10 changes: 10 additions & 0 deletions internal/engineresolver/resolver.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,16 @@ func (r *Resolver) LookupHost(ctx context.Context, hostname string) ([]string, e
r.logger().Infof("sessionresolver: skipping with proxy: %+v", e)
continue // we cannot proxy this URL so ignore it
}

// Hotfix: if the context has been canceled from the outside avoid
// doing a dnslookup, which would mark the resolver as not WAI.
//
// See https://github.com/ooni/probe/issues/2544
if err := ctx.Err(); err != nil {
me.Add(newErrWrapper(err, e.URL))
continue
}

addrs, err := r.lookupHost(ctx, e, hostname)
if err == nil {
return addrs, nil
Expand Down
23 changes: 19 additions & 4 deletions internal/engineresolver/resolver_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ func TestAddressWorks(t *testing.T) {
}
}

func TestTypicalUsageWithFailure(t *testing.T) {
func TestResolverLookupHostUsingACanceledContext(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
cancel() // fail immediately
reso := &Resolver{KVStore: &kvstore.Memory{}}
Expand All @@ -47,14 +47,15 @@ func TestTypicalUsageWithFailure(t *testing.T) {
t.Fatal("cannot convert error")
}
for _, child := range me.Children {

// net.DNSError does not include the underlying error
// but just a string representing the error. This
// means that we need to go down hunting what's the
// real error that occurred and use more verbose code.
{
var ew *errWrapper
if !errors.As(child, &ew) {
t.Fatal("not an instance of errwrapper")
t.Fatalf("not an instance of errwrapper: '%v' [%T]", child, child)
}
var de *net.DNSError
if errors.As(ew, &de) {
Expand All @@ -64,6 +65,7 @@ func TestTypicalUsageWithFailure(t *testing.T) {
continue
}
}

// otherwise just unwrap and check whether it's
// a real context.Canceled error.
if !errors.Is(child, context.Canceled) {
Expand All @@ -73,9 +75,22 @@ func TestTypicalUsageWithFailure(t *testing.T) {
if addrs != nil {
t.Fatal("expected nil here")
}
if len(reso.res) < 1 {
t.Fatal("expected to see some resolvers here")

// Since https://github.com/ooni/probe-cli/pull/1351 we avoid
// constructing any resolver if we start running with a canceled context
// because of https://github.com/ooni/probe/issues/2544.
//
// In other words, as long as we see zero here we can be confident
// that we're not creating a resolver if the context has been canceled,
// which should imply we're not changing its score.
//
// Heavier refactoring of this package should probably more aggressively
// ensure that we're not changing the score, but for now this test
// is sufficient given that we are committing an hotfix.
if len(reso.res) != 0 {
t.Fatal("expected to see no resolvers here")
}

reso.CloseIdleConnections()
if len(reso.res) != 0 {
t.Fatal("expected to see no resolvers after CloseIdleConnections")
Expand Down
20 changes: 14 additions & 6 deletions internal/engineresolver/resolvermaker.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,21 +43,29 @@ var allmakers = []*resolvermaker{{
}}

// allbyurl contains all the resolvermakers by URL
var allbyurl map[string]*resolvermaker
var allbyurl = resolverMakeInitialState()

// init fills allbyname and gives a nonzero initial score
// resolverMakeInitialState initializes the initial
// state by giving a nonzero initial score
// to all resolvers except for the system resolver. We set
// the system resolver score to zero, so that it's less
// the system resolver score to be 0.5, so that it's less
// likely than other resolvers in this list.
func init() {
allbyurl = make(map[string]*resolvermaker)
//
// We used to set this value to 0, but this proved to
// create issues when it was the only available resolver,
// see https://github.com/ooni/probe/issues/2544.
func resolverMakeInitialState() map[string]*resolvermaker {
output := make(map[string]*resolvermaker)
rng := rand.New(rand.NewSource(time.Now().UnixNano()))
for _, e := range allmakers {
allbyurl[e.url] = e
output[e.url] = e
if e.url != systemResolverURL {
e.score = rng.Float64()
} else {
e.score = 0.5
}
}
return output
}

// logger returns the configured logger or a default
Expand Down
16 changes: 16 additions & 0 deletions internal/engineresolver/resolvermaker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,3 +121,19 @@ func TestGetResolverHTTP3(t *testing.T) {
t.Fatal("expected true")
}
}

func TestResolverMakeInitialState(t *testing.T) {
t.Run("the system resolver has a default score of 0.5", func(t *testing.T) {
state := resolverMakeInitialState()
var okay bool
for URL, entry := range state {
t.Logf("entry: %v %v", URL, *entry)
if URL == systemResolverURL && entry.score == 0.5 {
okay = true
}
}
if !okay {
t.Fatal("expected to see the system resolver with 0.5 as its score")
}
})
}

0 comments on commit 6b6271a

Please sign in to comment.