Skip to content
221 changes: 207 additions & 14 deletions manager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package manager

import (
"context"
"crypto"
"crypto/x509"
"encoding/pem"
"errors"
Expand All @@ -40,6 +41,7 @@ import (
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/uuid"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/tools/cache"
"k8s.io/utils/clock"

internalapi "github.com/cert-manager/csi-lib/internal/api"
Expand Down Expand Up @@ -157,14 +159,73 @@ func NewManager(opts Options) (*Manager, error) {
return nil, fmt.Errorf("building node name label selector: %w", err)
}

// construct the requestToPrivateKeyMap so we can use event handlers below to manage it
var requestToPrivateKeyLock sync.Mutex
requestToPrivateKeyMap := make(map[types.UID]crypto.PrivateKey)
// Create an informer factory
informerFactory := cminformers.NewSharedInformerFactoryWithOptions(opts.Client, 0, cminformers.WithTweakListOptions(func(opts *metav1.ListOptions) {
opts.LabelSelector = labels.NewSelector().Add(*nodeNameReq).String()
}))
// Fetch the lister before calling Start() to ensure this informer is
// registered with the factory
lister := informerFactory.Certmanager().V1().CertificateRequests().Lister()
informerFactory.Certmanager().V1().CertificateRequests().Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
DeleteFunc: func(obj interface{}) {
requestToPrivateKeyLock.Lock()
defer requestToPrivateKeyLock.Unlock()
key, ok := obj.(string)
if !ok {
return
}
namespace, name, err := cache.SplitMetaNamespaceKey(key)
if err != nil {
return
}
req, err := lister.CertificateRequests(namespace).Get(name)
if err != nil {
// we no longer have a copy of this request, so we can't work out its UID.
// instead the associated pending request entry for this request will be cleaned up by the periodic 'janitor' task.
return
}

if _, ok := requestToPrivateKeyMap[req.UID]; ok {
delete(requestToPrivateKeyMap, req.UID)
}
},
})

// create a stop channel that manages all sub goroutines
stopCh := make(chan struct{})
// begin a background routine which periodically checks to ensure all members of the pending request map actually
// have corresponding CertificateRequest objects in the apiserver.
// This avoids leaking memory if we don't observe a request being deleted, or we observe it after the lister has purged
// the request data from its cache.
// this routine must be careful to not delete entries from this map that have JUST been added to the map, but haven't
// been observed by the lister yet (else it may purge data we want to keep, causing a whole new request cycle).
// for now, to avoid this case, we only run the routine every 5 minutes. It would be better if we recorded the time we
// added the entry to the map instead, and only purged items from the map that are older than N duration (TBD).
Comment on lines +199 to +206
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just curious when will the informer lost the delete event from api server ? I thought the informerFactory will guarantee to resync in a period of time to ensure it captures all the events for eventual consistency ?

As mentioned in your comment, I'm not sure how to prevent a newly added entry from being deleted while the lister is not yet in sync. If the request happens right at the 5-minute edge, it will be deleted immediately because the lister does not have it in its cache yet.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep with a resync period, we will definitely see the fact that the CR has been deleted. However, it's not guaranteed that the lister will still have a copy of the object stored - without the object, we can't convert the namespace/name to a known UID to look up in the requestToPrivateKeyMap.

Hence, if we don't have access to the UID once we have observed the delete, we won't be able to de-register/remove it from our own internal map.

janitorLogger := opts.Log.WithName("pending_request_janitor")
go wait.Until(func() {
requestToPrivateKeyLock.Lock()
defer requestToPrivateKeyLock.Unlock()
reqs, err := lister.List(labels.Everything())
if err != nil {
janitorLogger.Error(err, "failed listing existing requests")
return
}

existsMap := make(map[types.UID]struct{})
for _, req := range reqs {
existsMap[req.UID] = struct{}{}
}

for uid := range requestToPrivateKeyMap {
if _, ok := existsMap[uid]; !ok {
// purge the item from the map as it does not exist in the apiserver
delete(requestToPrivateKeyMap, uid)
}
}
}, time.Minute*5, stopCh)
// Begin watching the API
informerFactory.Start(stopCh)
informerFactory.WaitForCacheSync(stopCh)
Expand All @@ -173,9 +234,13 @@ func NewManager(opts Options) (*Manager, error) {
client: opts.Client,
clientForMetadata: opts.ClientForMetadata,
lister: lister,
metadataReader: opts.MetadataReader,
clock: opts.Clock,
log: *opts.Log,
// we pass a pointer to the mutex as the janitor routine above also uses this lock,
// so we want to avoid making a copy of it
requestToPrivateKeyLock: &requestToPrivateKeyLock,
requestToPrivateKeyMap: requestToPrivateKeyMap,
metadataReader: opts.MetadataReader,
clock: opts.Clock,
log: *opts.Log,

generatePrivateKey: opts.GeneratePrivateKey,
generateRequest: opts.GenerateRequest,
Expand Down Expand Up @@ -260,6 +325,10 @@ type Manager struct {
// lister is used as a read-only cache of CertificateRequest resources
lister cmlisters.CertificateRequestLister

// A map that associates a CertificateRequest's UID with its private key.
requestToPrivateKeyLock *sync.Mutex
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider using sync.RWMutex to improve the performance a little bit on read ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't expect to have many concurrent readers so it seemed like a very negligible performance gain (and keeps things a little simpler to use a regular mutex for future readers)

requestToPrivateKeyMap map[types.UID]crypto.PrivateKey

// used to read metadata from the store
metadataReader storage.MetadataReader

Expand Down Expand Up @@ -298,10 +367,23 @@ type Manager struct {
// requestNameGenerator generates a new random name for a certificaterequest object
// Defaults to uuid.NewUUID() from k8s.io/apimachinery/pkg/util/uuid.
requestNameGenerator func() string

// doNotUse_CallOnEachIssue is a field used SOLELY for testing, and cannot be configured by external package consumers.
// It is used to perform some action (e.g. counting) each time issue() is called.
// It will be removed as soon as we have actual metrics support in csi-lib, which will allow us to measure
// things like the number of times issue() is called.
// No thread safety is added around this field, and it MUST NOT be used for any implementation logic.
// It should not be used full-stop :).
doNotUse_CallOnEachIssue func()
}

// issue will step through the entire issuance flow for a volume.
func (m *Manager) issue(ctx context.Context, volumeID string) error {
// TODO: remove this code and replace with actual metrics support
if m.doNotUse_CallOnEachIssue != nil {
m.doNotUse_CallOnEachIssue()
}

log := m.log.WithValues("volume_id", volumeID)
log.Info("Processing issuance")

Expand All @@ -315,10 +397,32 @@ func (m *Manager) issue(ctx context.Context, volumeID string) error {
}
log.V(2).Info("Read metadata", "metadata", meta)

// check if there is already a pending request in-flight for this volume.
// if there is, and we still have a copy of its private key in memory, we can resume the existing request and
// avoid creating additional CertificateRequest objects.
existingReq, err := m.findPendingRequest(meta)
if err != nil {
return fmt.Errorf("failed when checking if an existing request exists: %w", err)
}
// if there is an existing in-flight request, attempt to 'resume' it (i.e. re-check to see if it is ready)
if existingReq != nil {
// we can only resume a request if we still have a reference to its private key, so look that up in our pending
// requests map
if key, ok := m.readPendingRequestPrivateKey(existingReq.UID); ok {
log.V(4).Info("Re-using existing certificaterequest")
return m.handleRequest(ctx, volumeID, meta, key, existingReq)
}

// if we don't have a copy of the associated private key, delete the currently in-flight request
log.V(2).Info("Found existing request that we don't have corresponding private key for - restarting request process")
if err := m.client.CertmanagerV1().CertificateRequests(existingReq.Namespace).Delete(ctx, existingReq.Name, metav1.DeleteOptions{}); err != nil {
return fmt.Errorf("failed to delete existing in-flight request: %w", err)
}
}

if ready, reason := m.readyToRequest(meta); !ready {
return fmt.Errorf("driver is not ready to request a certificate for this volume: %v", reason)
}

key, err := m.generatePrivateKey(meta)
if err != nil {
return fmt.Errorf("generating private key: %w", err)
Expand All @@ -343,11 +447,78 @@ func (m *Manager) issue(ctx context.Context, volumeID string) error {
}
log.Info("Created new CertificateRequest resource")

// Poll every 1s for the CertificateRequest to be ready
// persist the reference to the private key in memory so we can resume this request in future if it doesn't complete
// the first time.
m.writePendingRequestPrivateKey(req.UID, key)
return m.handleRequest(ctx, volumeID, meta, key, req)
}

// readPendingRequestPrivateKey looks up the in-memory private key recorded
// for the CertificateRequest with the given UID. The boolean return reports
// whether an entry was found. Safe for concurrent use.
func (m *Manager) readPendingRequestPrivateKey(uid types.UID) (crypto.PrivateKey, bool) {
	m.requestToPrivateKeyLock.Lock()
	defer m.requestToPrivateKeyLock.Unlock()
	pk, found := m.requestToPrivateKeyMap[uid]
	return pk, found
}

// writePendingRequestPrivateKey records the private key associated with the
// CertificateRequest identified by uid so that the in-flight request can be
// resumed later. Safe for concurrent use.
func (m *Manager) writePendingRequestPrivateKey(uid types.UID, key crypto.PrivateKey) {
	m.requestToPrivateKeyLock.Lock()
	defer m.requestToPrivateKeyLock.Unlock()

	m.requestToPrivateKeyMap[uid] = key
}

// deletePendingRequestPrivateKey removes the private key entry (if any) for
// the CertificateRequest identified by uid. Deleting a missing key is a
// no-op. Safe for concurrent use.
func (m *Manager) deletePendingRequestPrivateKey(uid types.UID) {
	m.requestToPrivateKeyLock.Lock()
	defer m.requestToPrivateKeyLock.Unlock()

	delete(m.requestToPrivateKeyMap, uid)
}

func (m *Manager) findPendingRequest(meta metadata.Metadata) (*cmapi.CertificateRequest, error) {
reqs, err := m.listAllRequestsForVolume(meta.VolumeID)
if err != nil {
return nil, err
}

if len(reqs) == 0 {
return nil, nil
}

// only consider the newest request - we will never resume an older request
req := reqs[0]
if !certificateRequestCanBeResumed(req) {
return nil, nil
}

// TODO: check if this request is still actually valid for the input metadata
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've not done this just yet, as I don't think it's quite as important as we may think (as volumeAttributes on a pod are not mutable).

The only case where this could be problematic is if a drivers implementation of generateRequest is non-deterministic/can change between calls. To properly handle the wide-range of weird setups users may have, we may actually need to push this comparison function to the driver implementers interface...

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider check privateKey against CSR's public key here ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given the internal cache is in memory, UIDs are guaranteed to be unique, and the CertificateRequest resource is immutable, I don't think it's actually essential for us to implement the privatekey<>public key check...

I think I am also going to leave this TODO for now, as it isn't really something we handle at all at the moment (and does raise questions around timing, e.g. what if the driver is random, when can we ever really stop?). I'd like us to expand on our expectations around how drivers implement generateRequest before over-complicating this code path :)

return req, nil
}

// certificateRequestCanBeResumed reports whether req is still progressing and
// may therefore be re-used, rather than having reached a terminal failed
// state. A request with no Ready condition is treated as still pending.
func certificateRequestCanBeResumed(req *cmapi.CertificateRequest) bool {
	for _, cond := range req.Status.Conditions {
		if cond.Type != cmapi.CertificateRequestConditionReady {
			continue
		}
		switch cond.Reason {
		case cmapi.CertificateRequestReasonPending, cmapi.CertificateRequestReasonIssued, "":
			// an explicit Pending, Issued or empty reason is considered resumable
			return true
		}
		// any other reason is a terminal state - the request has failed
		return false
	}
	// no Ready condition yet means the request has not been processed
	return true
}

func (m *Manager) handleRequest(ctx context.Context, volumeID string, meta metadata.Metadata, key crypto.PrivateKey, req *cmapi.CertificateRequest) error {
log := m.log.WithValues("volume_id", volumeID)

// Poll every 200ms for the CertificateRequest to be ready
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any reason to choose 200ms now? This looks like a typical round tripper time for remote data center query.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had reduced it here as this whole block only ever reads from a local in-memory cache anyway, and it reduced test flakes (as there were a few awkward timing issues where we had timeouts of 2s, but 1s sleeps in between each 'loop' here)

lastFailureReason := ""
if err := wait.PollUntilWithContext(ctx, time.Second, func(ctx context.Context) (done bool, err error) {
if err := wait.PollUntilWithContext(ctx, time.Millisecond*200, func(ctx context.Context) (done bool, err error) {
log.V(4).Info("Reading CertificateRequest from lister cache")
updatedReq, err := m.lister.CertificateRequests(req.Namespace).Get(req.Name)
if apierrors.IsNotFound(err) {
log.V(4).Info("Failed to read CertificateRequest from lister cache", "error", err)
// A NotFound error implies something deleted the resource - fail
// early to allow a retry to occur at a later time if needed.
return false, err
Expand All @@ -371,6 +542,7 @@ func (m *Manager) issue(ctx context.Context, volumeID string) error {

isApproved := apiutil.CertificateRequestIsApproved(updatedReq)
if !isApproved {
log.V(4).Info("CertificateRequest is not explicitly approved - continuing to check if the request has been issued anyway")
lastFailureReason = fmt.Sprintf("request %q has not yet been approved by approval plugin", updatedReq.Name)
// we don't stop execution here, as some versions of cert-manager (and some external issuer plugins)
// may not be aware/utilise approval.
Expand All @@ -380,6 +552,7 @@ func (m *Manager) issue(ctx context.Context, volumeID string) error {

readyCondition := apiutil.GetCertificateRequestCondition(updatedReq, cmapi.CertificateRequestConditionReady)
if readyCondition == nil {
log.V(4).Info("Ready condition not found - will recheck...")
// only overwrite the approval failure message if the request is actually approved
// otherwise we may hide more useful information from the user by accident.
if isApproved {
Expand All @@ -390,10 +563,12 @@ func (m *Manager) issue(ctx context.Context, volumeID string) error {

switch readyCondition.Reason {
case cmapi.CertificateRequestReasonIssued:
log.V(4).Info("CertificateRequest has been issued!")
break
case cmapi.CertificateRequestReasonFailed:
return false, fmt.Errorf("request %q has failed: %s", updatedReq.Name, readyCondition.Message)
Comment on lines 568 to 569
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we consider call m.deletePendingRequestPrivateKey(req.UID) when the CertificateRequest is in Failed condition ? It likely to fail again with the same private key. Create a new CR might help resolving the problem if it is due to key reuse issue.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We'll automatically use a new private key the next time anyway - if a request is failed, it is terminal so will never be returned again by findPendingRequest (i.e. it won't be re-used). This will in turn trigger a new CR to be created and new private key generated (or at least, a new call to generatePrivateKey).

The item will be deleted from the map once the CR is deleted, although yep perhaps a future optimisation could be to delete terminal failed items from the map a bit early just to save on memory.. but it shouldn't have any functional difference :)

case cmapi.CertificateRequestReasonPending:
log.V(4).Info("CertificateRequest is still pending...")
if isApproved {
lastFailureReason = fmt.Sprintf("request %q is pending: %v", updatedReq.Name, readyCondition.Message)
}
Expand Down Expand Up @@ -425,37 +600,55 @@ func (m *Manager) issue(ctx context.Context, volumeID string) error {
return fmt.Errorf("calculating next issuance time: %w", err)
}
meta.NextIssuanceTime = &renewalPoint
log.V(4).Info("Persisting next issuance time to metadata store", "next_issuance_time", renewalPoint)

if err := m.writeKeypair(meta, key, req.Status.Certificate, req.Status.CA); err != nil {
return fmt.Errorf("writing keypair: %w", err)
}
log.V(2).Info("Wrote new keypair to storage")

// We must explicitly delete the private key from the pending requests map so that the existing Completed
// request will not be re-used upon renewal.
// Without this, the renewal would pick up the existing issued certificate and re-issue, rather than requesting
// a new certificate.
m.deletePendingRequestPrivateKey(req.UID)
log.V(4).Info("Removed pending request private key from internal cache")

return nil
}

func (m *Manager) cleanupStaleRequests(ctx context.Context, log logr.Logger, volumeID string) error {
// returns a list of all pending certificaterequest objects for the given volumeID.
// the returned slice will be ordered with the most recent request FIRST.
func (m *Manager) listAllRequestsForVolume(volumeID string) ([]*cmapi.CertificateRequest, error) {
sel, err := m.labelSelectorForVolume(volumeID)
if err != nil {
return fmt.Errorf("internal error building label selector - this is a bug, please open an issue: %w", err)
return nil, fmt.Errorf("internal error building label selector - this is a bug, please open an issue: %w", err)
}

reqs, err := m.lister.List(sel)
if err != nil {
return fmt.Errorf("listing certificaterequests: %w", err)
}

if len(reqs) < m.maxRequestsPerVolume {
return nil
return nil, fmt.Errorf("listing certificaterequests: %w", err)
}

// sort newest first to oldest last
sort.Slice(reqs, func(i, j int) bool {
return reqs[i].CreationTimestamp.After(reqs[j].CreationTimestamp.Time)
})

return reqs, nil
}

func (m *Manager) cleanupStaleRequests(ctx context.Context, log logr.Logger, volumeID string) error {
reqs, err := m.listAllRequestsForVolume(volumeID)
if err != nil {
return err
}
if len(reqs) <= m.maxRequestsPerVolume {
return nil
}

// start at the end of the slice and work back to maxRequestsPerVolume
for i := len(reqs) - 1; i >= m.maxRequestsPerVolume-1; i-- {
for i := len(reqs) - 1; i > m.maxRequestsPerVolume-1; i-- {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why has this been changed? Is it because we can now recover the private key between syncs?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah exactly - this function was previously doing some ✨ weird ✨ counting logic, which is now fixed (and you can see the behaviour change by taking a look at how the unit tests have changed too)

toDelete := reqs[i]
if err := m.client.CertmanagerV1().CertificateRequests(toDelete.Namespace).Delete(ctx, toDelete.Name, metav1.DeleteOptions{}); err != nil {
if apierrors.IsNotFound(err) {
Expand Down
Loading