Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

(WIP) storage: add storage lag metrics #1168

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ require (
golang.org/x/net v0.0.0-20220225172249-27dd8689420f
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c
golang.org/x/sys v0.0.0-20220317061510-51cd9980dadf
golang.org/x/time v0.0.0-20220224211638-0e9765cccd65
google.golang.org/grpc v1.44.0
)

Expand Down Expand Up @@ -158,7 +159,6 @@ require (
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519 // indirect
golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8 // indirect
golang.org/x/text v0.3.7 // indirect
golang.org/x/time v0.0.0-20220224211638-0e9765cccd65 // indirect
golang.org/x/tools v0.1.10 // indirect
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
google.golang.org/api v0.69.0 // indirect
Expand Down
10 changes: 10 additions & 0 deletions pump/storage/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,15 @@ var (
Help: "How long the catch up step takes to run.",
Buckets: prometheus.ExponentialBuckets(0.001, 2, 22),
})

commitTsLagHistogram = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: "binlog",
Subsystem: "pump_storage",
Name: "commit_ts_lag_time",
Help: "Bucketed histogram of the lag of currently handled maximum commit-ts",
Buckets: prometheus.ExponentialBuckets(0.00005, 2, 18),
}, []string{"type"})
)

// InitMetircs register the metrics to registry
Expand All @@ -123,4 +132,5 @@ func InitMetircs(registry *prometheus.Registry) {
registry.MustRegister(storageSizeGauge)
registry.MustRegister(slowChaserCount)
registry.MustRegister(slowChaserCatchUpTimeHistogram)
registry.MustRegister(commitTsLagHistogram)
}
52 changes: 48 additions & 4 deletions pump/storage/sorter.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ package storage
import (
"container/list"
"fmt"
"github.com/prometheus/client_golang/prometheus"
"github.com/tikv/client-go/v2/oracle"
"go.uber.org/zap"
"golang.org/x/time/rate"
"math/rand"
"sync"
"sync/atomic"
Expand Down Expand Up @@ -105,6 +109,9 @@ type sorter struct {
// save the startTS of txn which we need to wait for the C binlog
waitStartTS map[int64]struct{}

commitTsLagMetrics prometheus.Observer
maxPushItemCommitTs int64 // atomic. Used for metrics only

lock sync.Mutex
cond *sync.Cond
items *list.List
Expand All @@ -114,14 +121,16 @@ type sorter struct {

func newSorter(fn func(item sortItem)) *sorter {
sorter := &sorter{
maxTSItemCB: fn,
items: list.New(),
waitStartTS: make(map[int64]struct{}),
maxTSItemCB: fn,
items: list.New(),
waitStartTS: make(map[int64]struct{}),
commitTsLagMetrics: commitTsLagHistogram.With(map[string]string{"type": "sorter"}),
}

sorter.cond = sync.NewCond(&sorter.lock)
sorter.wg.Add(1)
sorter.wg.Add(2)
go sorter.run()
go sorter.updateMetrics()

return sorter
}
Expand All @@ -148,6 +157,12 @@ func (s *sorter) pushTSItem(item sortItem) {
if item.tp == pb.BinlogType_Prewrite {
s.waitStartTS[item.start] = struct{}{}
} else {
if item.commit > atomic.LoadInt64(&s.maxPushItemCommitTs) {
// No need to CAS because we are inside a lock, and
// this is the only place where s.maxPushItemCommitTs
// is written to.
atomic.StoreInt64(&s.maxPushItemCommitTs, item.commit)
}
delete(s.waitStartTS, item.start)
s.cond.Signal()
}
Expand Down Expand Up @@ -225,6 +240,35 @@ func (s *sorter) run() {
}
}

func (s *sorter) updateMetrics() {
defer s.wg.Done()

tick := time.NewTicker(1 * time.Second)
defer tick.Stop()

rl := rate.NewLimiter(rate.Every(time.Second*10), 1)

for range tick.C {
if s.isClosed() {
return
}

curMaxPushItemCommitTs := atomic.LoadInt64(&s.maxPushItemCommitTs)
maxCommitTsPhysicalMs := oracle.ExtractPhysical(uint64(curMaxPushItemCommitTs))
millisecond := time.Now().UnixMilli() - maxCommitTsPhysicalMs

// Update metrics
s.commitTsLagMetrics.Observe(float64(millisecond) / 1000.0)

if millisecond > 10000 /* 10s */ && rl.Allow() {
// Prints a log if the lag is more than 10 seconds, and
// if the rate limiter allows.
duration := time.Duration(millisecond) * time.Millisecond
log.Info("sorter commit ts lag", zap.Duration("duration", duration))
}
}
}

func (s *sorter) isClosed() bool {
return atomic.LoadInt32(&s.closed) == 1
}
Expand Down
29 changes: 29 additions & 0 deletions pump/storage/storage.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package storage
import (
"context"
"encoding/binary"
"golang.org/x/time/rate"
"math"
"os"
"path"
Expand Down Expand Up @@ -119,6 +120,8 @@ type Append struct {

options *Options

maxWriteToKVCommitTs int64 // atomic. Used for metrics only.

close chan struct{}
wg sync.WaitGroup
}
Expand Down Expand Up @@ -362,6 +365,14 @@ func (a *Append) updateStatus() {
logStatsTicker := time.NewTicker(time.Second * 10)
defer logStatsTicker.Stop()

updateMetricsTicker := time.NewTicker(time.Second * 1)
defer updateMetricsTicker.Stop()

kvCommitTsLagMetrics := commitTsLagHistogram.With(map[string]string{
"type": "kv",
})
logRL := rate.NewLimiter(rate.Every(time.Second*10), 1)

for {
select {
case <-a.close:
Expand Down Expand Up @@ -389,6 +400,20 @@ func (a *Append) updateStatus() {
log.Warn("in WritePaused stat")
}
}
case <-updateMetricsTicker.C:
curKVCommitTs := atomic.LoadInt64(&a.maxWriteToKVCommitTs)
maxCommitTsPhysicalMs := oracle.ExtractPhysical(uint64(curKVCommitTs))
millisecond := time.Now().UnixMilli() - maxCommitTsPhysicalMs

// Update metrics
kvCommitTsLagMetrics.Observe(float64(millisecond) / 1000.0)

if millisecond > 10000 /* 10s */ && logRL.Allow() {
// Prints a log if the lag is more than 10 seconds, and
// if the rate limiter allows.
duration := time.Duration(millisecond) * time.Millisecond
log.Info("kv commit ts lag", zap.Duration("duration", duration))
}
}
}
}
Expand Down Expand Up @@ -888,6 +913,10 @@ func (a *Append) writeToKV(reqs chan *request) chan *request {
return
}
for _, req := range bufReqs {
if req.commitTS > atomic.LoadInt64(&a.maxWriteToKVCommitTs) {
atomic.StoreInt64(&a.maxWriteToKVCommitTs, req.commitTS)
}

done <- req
}
}
Expand Down