Skip to content

Commit

Permalink
fix: Deadlock issue caused in listResources (argoproj/argo-cd#18902)
Browse files Browse the repository at this point in the history
It is possible for Argo to reach a state where the maximum amount
of go-routines have incremented the semaphore and are waiting on
the lock in loadInitialState WHILE another go-routine is holding
the lock waiting on the semaphore. This change allows the
go-routine holding the lock to bypass the semaphore and complete
its execution. The downside of this approach is that more could
be running than intended, but that is preferable to the cache
deadlocking.

Signed-off-by: James Root <jroot@indeed.com>
  • Loading branch information
James Root committed Jul 5, 2024
1 parent fbecbb8 commit 546930d
Showing 1 changed file with 10 additions and 6 deletions.
16 changes: 10 additions & 6 deletions pkg/cache/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -527,11 +527,14 @@ func runSynced(lock sync.Locker, action func() error) error {
}

// listResources creates list pager and enforces number of concurrent list requests
func (c *clusterCache) listResources(ctx context.Context, resClient dynamic.ResourceInterface, callback func(*pager.ListPager) error) (string, error) {
if err := c.listSemaphore.Acquire(ctx, 1); err != nil {
return "", err
func (c *clusterCache) listResources(ctx context.Context, resClient dynamic.ResourceInterface, skipSemaphore bool, callback func(*pager.ListPager) error) (string, error) {
if !skipSemaphore {
if err := c.listSemaphore.Acquire(ctx, 1); err != nil {
return "", err
}
defer c.listSemaphore.Release(1)
}
defer c.listSemaphore.Release(1)

var retryCount int64 = 0
resourceVersion := ""
listPager := pager.New(func(ctx context.Context, opts metav1.ListOptions) (runtime.Object, error) {
Expand Down Expand Up @@ -568,7 +571,8 @@ func (c *clusterCache) listResources(ctx context.Context, resClient dynamic.Reso
}

func (c *clusterCache) loadInitialState(ctx context.Context, api kube.APIResourceInfo, resClient dynamic.ResourceInterface, ns string, lock bool) (string, error) {
return c.listResources(ctx, resClient, func(listPager *pager.ListPager) error {
// If we are intending to take the lock in our callback, we need to skip taking the semaphore in listResources to avoid deadlock.
return c.listResources(ctx, resClient, !lock, func(listPager *pager.ListPager) error {
var items []*Resource
err := listPager.EachListItem(ctx, metav1.ListOptions{}, func(obj runtime.Object) error {
if un, ok := obj.(*unstructured.Unstructured); !ok {
Expand Down Expand Up @@ -859,7 +863,7 @@ func (c *clusterCache) sync() error {
lock.Unlock()

return c.processApi(client, api, func(resClient dynamic.ResourceInterface, ns string) error {
resourceVersion, err := c.listResources(ctx, resClient, func(listPager *pager.ListPager) error {
resourceVersion, err := c.listResources(ctx, resClient, false, func(listPager *pager.ListPager) error {
return listPager.EachListItem(context.Background(), metav1.ListOptions{}, func(obj runtime.Object) error {
if un, ok := obj.(*unstructured.Unstructured); !ok {
return fmt.Errorf("object %s/%s has an unexpected type", un.GroupVersionKind().String(), un.GetName())
Expand Down

0 comments on commit 546930d

Please sign in to comment.