From 546930de3f8abd18d7089d6e06d1f5f528d457fb Mon Sep 17 00:00:00 2001 From: James Root Date: Wed, 3 Jul 2024 20:33:03 +0000 Subject: [PATCH 1/2] fix: Deadlock issue caused in listResources (argoproj/argo-cd#18902) It is possible for Argo to reach a state where the maximum amount of go-routines have incremented the semaphore and are waiting on the lock in loadInitialState WHILE another go-routine is holding the lock waiting on the semaphore. This change allows the go-routine holding the lock to bypass the semaphore and complete its execution. The downside of this approach is that more could be running than intended, but that is preferable to the cache deadlocking. Signed-off-by: James Root --- pkg/cache/cluster.go | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pkg/cache/cluster.go b/pkg/cache/cluster.go index 31e16e711..e072f9a69 100644 --- a/pkg/cache/cluster.go +++ b/pkg/cache/cluster.go @@ -527,11 +527,14 @@ func runSynced(lock sync.Locker, action func() error) error { } // listResources creates list pager and enforces number of concurrent list requests -func (c *clusterCache) listResources(ctx context.Context, resClient dynamic.ResourceInterface, callback func(*pager.ListPager) error) (string, error) { - if err := c.listSemaphore.Acquire(ctx, 1); err != nil { - return "", err +func (c *clusterCache) listResources(ctx context.Context, resClient dynamic.ResourceInterface, skipSemaphore bool, callback func(*pager.ListPager) error) (string, error) { + if !skipSemaphore { + if err := c.listSemaphore.Acquire(ctx, 1); err != nil { + return "", err + } + defer c.listSemaphore.Release(1) } - defer c.listSemaphore.Release(1) + var retryCount int64 = 0 resourceVersion := "" listPager := pager.New(func(ctx context.Context, opts metav1.ListOptions) (runtime.Object, error) { @@ -568,7 +571,8 @@ func (c *clusterCache) listResources(ctx context.Context, resClient dynamic.Reso } func (c *clusterCache) loadInitialState(ctx context.Context, api kube.APIResourceInfo, resClient dynamic.ResourceInterface, ns string, lock bool) (string, error) { - return c.listResources(ctx, resClient, func(listPager *pager.ListPager) error { + // If we are intending to take the lock in our callback, we need to skip taking the semaphore in listResources to avoid deadlock. + return c.listResources(ctx, resClient, !lock, func(listPager *pager.ListPager) error { var items []*Resource err := listPager.EachListItem(ctx, metav1.ListOptions{}, func(obj runtime.Object) error { if un, ok := obj.(*unstructured.Unstructured); !ok { @@ -859,7 +863,7 @@ func (c *clusterCache) sync() error { lock.Unlock() return c.processApi(client, api, func(resClient dynamic.ResourceInterface, ns string) error { - resourceVersion, err := c.listResources(ctx, resClient, func(listPager *pager.ListPager) error { + resourceVersion, err := c.listResources(ctx, resClient, false, func(listPager *pager.ListPager) error { return listPager.EachListItem(context.Background(), metav1.ListOptions{}, func(obj runtime.Object) error { if un, ok := obj.(*unstructured.Unstructured); !ok { return fmt.Errorf("object %s/%s has an unexpected type", un.GroupVersionKind().String(), un.GetName()) From 78b1c5001bb7a64f6cebbccf3f36a125526e2453 Mon Sep 17 00:00:00 2001 From: James Root Date: Mon, 8 Jul 2024 15:11:22 +0000 Subject: [PATCH 2/2] fix: inverting boolean logic to avoid confusing negations (argocdproj/argo-cd#18902) Signed-off-by: James Root --- pkg/cache/cluster.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/cache/cluster.go b/pkg/cache/cluster.go index e072f9a69..92286c24d 100644 --- a/pkg/cache/cluster.go +++ b/pkg/cache/cluster.go @@ -527,8 +527,8 @@ func runSynced(lock sync.Locker, action func() error) error { } // listResources creates list pager and enforces number of concurrent list requests -func (c *clusterCache) listResources(ctx context.Context, resClient dynamic.ResourceInterface, skipSemaphore bool, callback func(*pager.ListPager) error) (string, error) { - if !skipSemaphore { +func (c *clusterCache) listResources(ctx context.Context, resClient dynamic.ResourceInterface, takeSemaphore bool, callback func(*pager.ListPager) error) (string, error) { + if takeSemaphore { if err := c.listSemaphore.Acquire(ctx, 1); err != nil { return "", err } @@ -572,7 +572,7 @@ func (c *clusterCache) listResources(ctx context.Context, resClient dynamic.Reso func (c *clusterCache) loadInitialState(ctx context.Context, api kube.APIResourceInfo, resClient dynamic.ResourceInterface, ns string, lock bool) (string, error) { // If we are intending to take the lock in our callback, we need to skip taking the semaphore in listResources to avoid deadlock. - return c.listResources(ctx, resClient, !lock, func(listPager *pager.ListPager) error { + return c.listResources(ctx, resClient, lock, func(listPager *pager.ListPager) error { var items []*Resource err := listPager.EachListItem(ctx, metav1.ListOptions{}, func(obj runtime.Object) error { if un, ok := obj.(*unstructured.Unstructured); !ok { @@ -863,7 +863,7 @@ func (c *clusterCache) sync() error { lock.Unlock() return c.processApi(client, api, func(resClient dynamic.ResourceInterface, ns string) error { - resourceVersion, err := c.listResources(ctx, resClient, false, func(listPager *pager.ListPager) error { + resourceVersion, err := c.listResources(ctx, resClient, true, func(listPager *pager.ListPager) error { return listPager.EachListItem(context.Background(), metav1.ListOptions{}, func(obj runtime.Object) error { if un, ok := obj.(*unstructured.Unstructured); !ok { return fmt.Errorf("object %s/%s has an unexpected type", un.GroupVersionKind().String(), un.GetName())