Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NPM] Use IMDSv2 for vpc_id (network_id) lookups #29027

Merged
merged 18 commits into from
Sep 6, 2024
8 changes: 5 additions & 3 deletions cmd/process-agent/subcommands/check/check.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,10 +182,12 @@ func RunCheckCmd(deps Dependencies) error {
names = append(names, ch.Name())

_, processModuleEnabled := deps.Syscfg.SysProbeObject().EnabledModules[sysconfig.ProcessModule]
_, networkTracerModuleEnabled := deps.Syscfg.SysProbeObject().EnabledModules[sysconfig.NetworkTracerModule]
cfg := &checks.SysProbeConfig{
MaxConnsPerMessage: deps.Syscfg.SysProbeObject().MaxConnsPerMessage,
SystemProbeAddress: deps.Syscfg.SysProbeObject().SocketAddress,
ProcessModuleEnabled: processModuleEnabled,
MaxConnsPerMessage: deps.Syscfg.SysProbeObject().MaxConnsPerMessage,
SystemProbeAddress: deps.Syscfg.SysProbeObject().SocketAddress,
ProcessModuleEnabled: processModuleEnabled,
NetworkTracerModuleEnabled: networkTracerModuleEnabled,
}

if !matchingCheck(deps.CliParams.checkName, ch) {
Expand Down
11 changes: 11 additions & 0 deletions cmd/system-probe/modules/network_tracer.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"os"
"runtime"
Expand Down Expand Up @@ -108,6 +109,16 @@ func (nt *networkTracer) Register(httpMux *module.Router) error {
logRequests(id, count, len(cs.Conns), start)
}))

// /network_id returns the network ID (vpc_id) resolved by the tracer as a bare
// plain-text body. A lookup failure is reported to the client as a 500 with an
// empty body; the underlying error is only logged on the system-probe side.
httpMux.HandleFunc("/network_id", utils.WithConcurrencyLimit(utils.DefaultMaxConcurrentRequests, func(w http.ResponseWriter, req *http.Request) {
	id, err := nt.tracer.GetNetworkID(req.Context())
	if err != nil {
		log.Errorf("unable to retrieve network_id: %s", err)
		w.WriteHeader(http.StatusInternalServerError)
		return
	}
	// The status line has already been committed once the body write starts,
	// so a failed write cannot be reported to the client; record it instead of
	// silently dropping the error.
	if _, err := io.WriteString(w, id); err != nil {
		log.Warnf("unable to write network_id response: %s", err)
	}
}))

httpMux.HandleFunc("/register", utils.WithConcurrencyLimit(utils.DefaultMaxConcurrentRequests, func(w http.ResponseWriter, req *http.Request) {
id := getClientID(req)
err := nt.tracer.RegisterClient(id)
Expand Down
15 changes: 15 additions & 0 deletions pkg/network/tracer/tracer.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ import (
"github.com/DataDog/datadog-agent/pkg/process/util"
timeresolver "github.com/DataDog/datadog-agent/pkg/security/resolvers/time"
"github.com/DataDog/datadog-agent/pkg/telemetry"
"github.com/DataDog/datadog-agent/pkg/util/ec2"
"github.com/DataDog/datadog-agent/pkg/util/kernel"
"github.com/DataDog/datadog-agent/pkg/util/log"
)
Expand Down Expand Up @@ -850,3 +851,17 @@ func newUSMMonitor(c *config.Config, tracer connection.Tracer) *usm.Monitor {

return monitor
}

// GetNetworkID retrieves the vpc_id (network_id) from IMDS.
//
// The EC2 lookup is executed through kernel.WithRootNS using the procfs root
// reported by kernel.ProcFSRoot. The result is not cached by this method:
// every call performs a fresh lookup. On failure an empty ID is returned
// together with the error from the lookup.
func (t *Tracer) GetNetworkID(ctx context.Context) (string, error) {
	// The parameter is named ctx (not context) so that it does not shadow the
	// imported context package inside the function body.
	var id string
	err := kernel.WithRootNS(kernel.ProcFSRoot(), func() error {
		var err error
		id, err = ec2.GetNetworkID(ctx)
		return err
	})
	if err != nil {
		return "", err
	}
	return id, nil
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should cache the id here to avoid another call to the metadata endpoint.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe either way this will only get called once, since this is hit via the Init() method of the PA check which happens only at startup

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

although if multiple checks are calling this it will indeed be called multiple times, so we can save 2 RPCs by caching this

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

upon further reflection I think I would rather not cache this, because:

  • we are maintaining the same call pattern as before
  • we are only calling max 3 times on startup (if process/container/network checks are all enabled)
  • anecdotally, I have sometimes seen the IMDSv2 token fetch fail when testing. I am going to follow up with ASC about this, but I am wary of leaving this value unpopulated if the first request fails

}
5 changes: 5 additions & 0 deletions pkg/network/tracer/tracer_unsupported.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ func (t *Tracer) GetActiveConnections(_ string) (*network.Connections, error) {
return nil, ebpf.ErrNotImplemented
}

// GetNetworkID is not implemented on this OS for Tracer. It ignores the
// supplied context and always returns an empty ID together with
// ebpf.ErrNotImplemented.
func (t *Tracer) GetNetworkID(_ context.Context) (string, error) {
return "", ebpf.ErrNotImplemented
}

// RegisterClient registers the client
func (t *Tracer) RegisterClient(_ string) error {
return ebpf.ErrNotImplemented
Expand Down
5 changes: 5 additions & 0 deletions pkg/network/tracer/tracer_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,11 @@ func (t *Tracer) DebugDumpProcessCache(_ context.Context) (interface{}, error) {
return nil, ebpf.ErrNotImplemented
}

// GetNetworkID is not implemented on this OS for Tracer. It ignores the
// supplied context and always returns an empty ID together with
// ebpf.ErrNotImplemented.
func (t *Tracer) GetNetworkID(_ context.Context) (string, error) {
return "", ebpf.ErrNotImplemented
akarpz marked this conversation as resolved.
Show resolved Hide resolved
}

func newUSMMonitor(c *config.Config, dh driver.Handle) usm.Monitor {
if !c.EnableHTTPMonitoring && !c.EnableNativeTLSMonitoring {
return nil
Expand Down
2 changes: 2 additions & 0 deletions pkg/process/checks/checks.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ type SysProbeConfig struct {
SystemProbeAddress string
// System probe process module on/off configuration
ProcessModuleEnabled bool
// System probe network_tracer module on/off configuration
NetworkTracerModuleEnabled bool
}

// Check is an interface for Agent checks that collect data. Each check returns
Expand Down
17 changes: 13 additions & 4 deletions pkg/process/checks/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
package checks

import (
"context"
"fmt"
"math"
"sync"
Expand All @@ -16,9 +15,9 @@ import (

workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def"
ddconfig "github.com/DataDog/datadog-agent/pkg/config"
"github.com/DataDog/datadog-agent/pkg/process/net"
"github.com/DataDog/datadog-agent/pkg/process/statsd"
proccontainers "github.com/DataDog/datadog-agent/pkg/process/util/containers"
"github.com/DataDog/datadog-agent/pkg/util/cloudproviders"
"github.com/DataDog/datadog-agent/pkg/util/flavor"
"github.com/DataDog/datadog-agent/pkg/util/log"
)
Expand Down Expand Up @@ -53,11 +52,21 @@ type ContainerCheck struct {
}

// Init initializes a ContainerCheck instance.
func (c *ContainerCheck) Init(_ *SysProbeConfig, info *HostInfo, _ bool) error {
func (c *ContainerCheck) Init(syscfg *SysProbeConfig, info *HostInfo, _ bool) error {
c.containerProvider = proccontainers.GetSharedContainerProvider(c.wmeta)
c.hostInfo = info

networkID, err := cloudproviders.GetNetworkID(context.TODO())
var tu *net.RemoteSysProbeUtil
var err error
if syscfg.NetworkTracerModuleEnabled {
// Calling the remote tracer will cause it to initialize and check connectivity
tu, err = net.GetRemoteSystemProbeUtil(syscfg.SystemProbeAddress)
if err != nil {
log.Warnf("could not initiate connection with system probe: %s", err)
}
}

networkID, err := retryGetNetworkID(tu)
if err != nil {
log.Infof("no network ID detected: %s", err)
}
Expand Down
16 changes: 15 additions & 1 deletion pkg/process/checks/net.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ func (c *ConnectionsCheck) Init(syscfg *SysProbeConfig, hostInfo *HostInfo, _ bo
}
}

networkID, err := cloudproviders.GetNetworkID(context.TODO())
networkID, err := retryGetNetworkID(tu)
if err != nil {
log.Infof("no network ID detected: %s", err)
}
Expand Down Expand Up @@ -503,3 +503,17 @@ func convertAndEnrichWithServiceCtx(tags []string, tagOffsets []uint32, serviceC

return tagsStr
}

// retryGetNetworkID fetches network_id from the current netNS first. When that
// lookup fails and a system-probe client was supplied, it retries through
// system-probe, where the root netNS is used. With a nil sysProbeUtil the
// result of the first lookup is returned unchanged.
func retryGetNetworkID(sysProbeUtil *net.RemoteSysProbeUtil) (string, error) {
	id, localErr := cloudproviders.GetNetworkID(context.TODO())
	if localErr == nil || sysProbeUtil == nil {
		// Either the local lookup worked, or there is no fallback to try.
		return id, localErr
	}

	log.Infof("no network ID detected. retrying via system-probe: %s", localErr)
	id, remoteErr := sysProbeUtil.GetNetworkID()
	if remoteErr != nil {
		log.Infof("failed to get network ID from system-probe: %s", remoteErr)
		return "", remoteErr
	}
	return id, nil
}
14 changes: 11 additions & 3 deletions pkg/process/checks/process.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
package checks

import (
"context"
"errors"
"fmt"
"math"
Expand All @@ -28,7 +27,6 @@ import (
"github.com/DataDog/datadog-agent/pkg/process/statsd"
"github.com/DataDog/datadog-agent/pkg/process/util"
proccontainers "github.com/DataDog/datadog-agent/pkg/process/util/containers"
"github.com/DataDog/datadog-agent/pkg/util/cloudproviders"
"github.com/DataDog/datadog-agent/pkg/util/flavor"
"github.com/DataDog/datadog-agent/pkg/util/log"
"github.com/DataDog/datadog-agent/pkg/util/subscriptions"
Expand Down Expand Up @@ -137,7 +135,17 @@ func (p *ProcessCheck) Init(syscfg *SysProbeConfig, info *HostInfo, oneShot bool

p.notInitializedLogLimit = log.NewLogLimit(1, time.Minute*10)

networkID, err := cloudproviders.GetNetworkID(context.TODO())
var tu *net.RemoteSysProbeUtil
var err error
if syscfg.NetworkTracerModuleEnabled {
// Calling the remote tracer will cause it to initialize and check connectivity
tu, err = net.GetRemoteSystemProbeUtil(syscfg.SystemProbeAddress)
if err != nil {
log.Warnf("could not initiate connection with system probe: %s", err)
}
}

networkID, err := retryGetNetworkID(tu)
if err != nil {
log.Infof("no network ID detected: %s", err)
}
Expand Down
31 changes: 29 additions & 2 deletions pkg/process/net/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ type Conn interface {

const (
contentTypeProtobuf = "application/protobuf"
contentTypeJSON = "application/json"
)

var (
Expand Down Expand Up @@ -166,14 +167,40 @@ func (r *RemoteSysProbeUtil) GetConnections(clientID string) (*model.Connections
return conns, nil
}

// GetNetworkID fetches the network_id (vpc_id) from system-probe.
//
// It issues a GET to networkIDURL asking for a plain-text response and returns
// the response body verbatim as the ID. Any status other than 200 OK, as well
// as request or read failures, is returned as an error with an empty ID.
func (r *RemoteSysProbeUtil) GetNetworkID() (string, error) {
	request, err := http.NewRequest(http.MethodGet, networkIDURL, nil)
	if err != nil {
		return "", fmt.Errorf("failed to create request: %w", err)
	}
	request.Header.Set("Accept", "text/plain")

	response, err := r.httpClient.Do(request)
	if err != nil {
		return "", fmt.Errorf("failed to execute request: %w", err)
	}
	defer response.Body.Close()

	if code := response.StatusCode; code != http.StatusOK {
		return "", fmt.Errorf("network_id request failed: url: %s, status code: %d", networkIDURL, code)
	}

	payload, err := io.ReadAll(response.Body)
	if err != nil {
		return "", fmt.Errorf("failed to read response body: %w", err)
	}

	return string(payload), nil
akarpz marked this conversation as resolved.
Show resolved Hide resolved
}

// GetPing returns the results of a ping to a host
func (r *RemoteSysProbeUtil) GetPing(clientID string, host string, count int, interval time.Duration, timeout time.Duration) ([]byte, error) {
req, err := http.NewRequest("GET", fmt.Sprintf("%s/%s?client_id=%s&count=%d&interval=%d&timeout=%d", pingURL, host, clientID, count, interval, timeout), nil)
if err != nil {
return nil, err
}

req.Header.Set("Accept", "application/json")
req.Header.Set("Accept", contentTypeJSON)
resp, err := r.httpClient.Do(req)
if err != nil {
return nil, err
Expand Down Expand Up @@ -208,7 +235,7 @@ func (r *RemoteSysProbeUtil) GetTraceroute(clientID string, host string, port ui
return nil, err
}

req.Header.Set("Accept", "application/json")
req.Header.Set("Accept", contentTypeJSON)
resp, err := r.tracerouteClient.Do(req)
if err != nil {
return nil, err
Expand Down
1 change: 1 addition & 0 deletions pkg/process/net/common_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ const (
pingURL = "http://unix/" + string(sysconfig.PingModule) + "/ping/"
tracerouteURL = "http://unix/" + string(sysconfig.TracerouteModule) + "/traceroute/"
connectionsURL = "http://unix/" + string(sysconfig.NetworkTracerModule) + "/connections"
networkIDURL = "http://unix/" + string(sysconfig.NetworkTracerModule) + "/network_id"
procStatsURL = "http://unix/" + string(sysconfig.ProcessModule) + "/stats"
registerURL = "http://unix/" + string(sysconfig.NetworkTracerModule) + "/register"
statsURL = "http://unix/debug/stats"
Expand Down
5 changes: 5 additions & 0 deletions pkg/process/net/common_unsupported.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ func (r *RemoteSysProbeUtil) GetConnections(_ string) (*model.Connections, error
return nil, ErrNotImplemented
}

// GetNetworkID is not supported; it always returns an empty ID together with
// ErrNotImplemented.
func (r *RemoteSysProbeUtil) GetNetworkID() (string, error) {
return "", ErrNotImplemented
}

// GetStats is not supported
func (r *RemoteSysProbeUtil) GetStats() (map[string]interface{}, error) {
return nil, ErrNotImplemented
Expand Down
1 change: 1 addition & 0 deletions pkg/process/net/common_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (

const (
connectionsURL = "http://localhost:3333/" + string(sysconfig.NetworkTracerModule) + "/connections"
// networkIDURL uses the same localhost:3333 base as the other system-probe
// URLs in this Windows const block, rather than the "unix" host.
networkIDURL = "http://localhost:3333/" + string(sysconfig.NetworkTracerModule) + "/network_id"
registerURL = "http://localhost:3333/" + string(sysconfig.NetworkTracerModule) + "/register"
languageDetectionURL = "http://localhost:3333/" + string(sysconfig.LanguageDetectionModule) + "/detect"
statsURL = "http://localhost:3333/debug/stats"
Expand Down
28 changes: 28 additions & 0 deletions pkg/process/net/mocks/sys_probe_util.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pkg/process/net/shared.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ type SysProbeUtil interface {
GetStats() (map[string]interface{}, error)
GetProcStats(pids []int32) (*model.ProcStatsWithPermByPID, error)
Register(clientID string) error
GetNetworkID() (string, error)
}
2 changes: 1 addition & 1 deletion pkg/util/cloudproviders/network.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ func GetNetworkID(ctx context.Context) (string, error) {
return cache.Get[string](
networkIDCacheKey,
func() (string, error) {
// the the id from configuration
// use the network ID from configuration, if one is set
if networkID := config.Datadog().GetString("network.id"); networkID != "" {
log.Debugf("GetNetworkID: using configured network ID: %s", networkID)
return networkID, nil
Expand Down
Loading
Loading