Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ go 1.19

require (
github.com/onsi/gomega v1.27.10
github.com/openshift-online/ocm-sdk-go v0.1.368
github.com/openshift/api v0.0.0-20230213134911-7ba313770556
github.com/openshift/client-go v0.0.0-20221019143426-16aed247da5c
github.com/project-codeflare/instascale v0.0.9
Expand Down Expand Up @@ -69,7 +70,6 @@ require (
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/openshift-online/ocm-sdk-go v0.1.327 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/prometheus/client_golang v1.14.0 // indirect
github.com/prometheus/client_model v0.3.0 // indirect
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -377,8 +377,8 @@ github.com/onsi/gomega v1.17.0/go.mod h1:HnhC7FXeEQY45zxNK3PPoIUhzk/80Xly9PcubAl
github.com/onsi/gomega v1.19.0/go.mod h1:LY+I3pBVzYsTBU1AnDwOSxaYi9WoWiqgwooUqq9yPro=
github.com/onsi/gomega v1.27.10 h1:naR28SdDFlqrG6kScpT8VWpu1xWY5nJRCF3XaYyBjhI=
github.com/onsi/gomega v1.27.10/go.mod h1:RsS8tutOdbdgzbPtzzATp12yT7kM5I5aElG3evPbQ0M=
github.com/openshift-online/ocm-sdk-go v0.1.327 h1:WR822bGdQoMuZ2+dFdhZz3fpD2NlJhGr+F3FJPXvqFU=
github.com/openshift-online/ocm-sdk-go v0.1.327/go.mod h1:KYOw8kAKAHyPrJcQoVR82CneQ4ofC02Na4cXXaTq4Nw=
github.com/openshift-online/ocm-sdk-go v0.1.368 h1:qP+gkChV8WDwwpkUw1xUyjTXKdvrwyd70Gff2GMUSeU=
github.com/openshift-online/ocm-sdk-go v0.1.368/go.mod h1:KYOw8kAKAHyPrJcQoVR82CneQ4ofC02Na4cXXaTq4Nw=
github.com/openshift/api v0.0.0-20230213134911-7ba313770556 h1:7W2fOhJicyEff24VaF7ASNzPtYvr+iSCVft4SIBAzaE=
github.com/openshift/api v0.0.0-20230213134911-7ba313770556/go.mod h1:aQ6LDasvHMvHZXqLHnX2GRmnfTWCF/iIwz8EMTTIE9A=
github.com/openshift/client-go v0.0.0-20221019143426-16aed247da5c h1:CV76yFOTXmq9VciBR3Bve5ZWzSxdft7gaMVB3kS0rwg=
Expand Down
174 changes: 174 additions & 0 deletions test/e2e/instascale.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
package e2e

import (
. "github.com/onsi/gomega"
ocmsdk "github.com/openshift-online/ocm-sdk-go"
. "github.com/project-codeflare/codeflare-operator/test/support"
mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func TestConfig(test Test, namespace string) (*corev1.ConfigMap, error) {
// Test configuration
configMap := &corev1.ConfigMap{
TypeMeta: metav1.TypeMeta{
APIVersion: corev1.SchemeGroupVersion.String(),
Kind: "ConfigMap",
},
ObjectMeta: metav1.ObjectMeta{
Name: "mnist-mcad",
Namespace: namespace,
},
BinaryData: map[string][]byte{
// pip requirements
"requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"),
// MNIST training script
"mnist.py": ReadFile(test, "mnist.py"),
},
Immutable: Ptr(true),
}

config, err := test.Client().Core().CoreV1().ConfigMaps(namespace).Create(test.Ctx(), configMap, metav1.CreateOptions{})
test.Expect(err).NotTo(HaveOccurred())
test.T().Logf("Created ConfigMap %s/%s successfully", config.Namespace, config.Name)

return configMap, err
}

func CreateConnection(test Test) (*ocmsdk.Connection, error) {
instascaleOCMSecret, err := test.Client().Core().CoreV1().Secrets("default").Get(test.Ctx(), "instascale-ocm-secret", metav1.GetOptions{})
if err != nil {
test.T().Errorf("unable to retrieve instascale-ocm-secret - Error : %v", err)
}
test.Expect(err).NotTo(HaveOccurred())
ocmToken := string(instascaleOCMSecret.Data["token"])
test.T().Logf("Retrieved Secret %s successfully", instascaleOCMSecret.Name)

connection, err := CreateOCMConnection(ocmToken)
if err != nil {
test.T().Errorf("Unable to create ocm connection - Error : %v", err)
}
return connection, err
}

func JobAppwrapperSetup(test Test, namespace *corev1.Namespace, config *corev1.ConfigMap) (*batchv1.Job, *mcadv1beta1.AppWrapper, error) {
// Batch Job
job := &batchv1.Job{
TypeMeta: metav1.TypeMeta{
APIVersion: batchv1.SchemeGroupVersion.String(),
Kind: "Job",
},
ObjectMeta: metav1.ObjectMeta{
Name: "mnist",
Namespace: namespace.Name,
},
Spec: batchv1.JobSpec{
Completions: Ptr(int32(1)),
Parallelism: Ptr(int32(1)),
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
Containers: []corev1.Container{
{
Name: "job",
Image: GetPyTorchImage(),
Env: []corev1.EnvVar{
corev1.EnvVar{Name: "PYTHONUSERBASE", Value: "/test2"},
},
Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"},
Args: []string{"$PYTHONUSERBASE"},
VolumeMounts: []corev1.VolumeMount{
{
Name: "test",
MountPath: "/test",
},
{
Name: "test2",
MountPath: "/test2",
},
},
WorkingDir: "/test2",
},
},
Volumes: []corev1.Volume{
{
Name: "test",
VolumeSource: corev1.VolumeSource{
ConfigMap: &corev1.ConfigMapVolumeSource{
LocalObjectReference: corev1.LocalObjectReference{
Name: config.Name,
},
},
},
},
{
Name: "test2",
VolumeSource: corev1.VolumeSource{
EmptyDir: &corev1.EmptyDirVolumeSource{},
},
},
},
RestartPolicy: corev1.RestartPolicyNever,
},
},
},
}

// create an appwrapper
aw := &mcadv1beta1.AppWrapper{
ObjectMeta: metav1.ObjectMeta{
Name: "test-instascale",
Namespace: namespace.Name,
Labels: map[string]string{
"orderedinstance": "m5.xlarge_g4dn.xlarge",
},
},
Spec: mcadv1beta1.AppWrapperSpec{
AggrResources: mcadv1beta1.AppWrapperResourceList{
GenericItems: []mcadv1beta1.AppWrapperGenericResource{
{
CustomPodResources: []mcadv1beta1.CustomPodResourceTemplate{
{
Replicas: 1,
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("250m"),
corev1.ResourceMemory: resource.MustParse("512Mi"),
"nvidia.com/gpu": resource.MustParse("1"),
},
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("500m"),
corev1.ResourceMemory: resource.MustParse("1G"),
"nvidia.com/gpu": resource.MustParse("1"),
},
},
{
Replicas: 1,
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("250m"),
corev1.ResourceMemory: resource.MustParse("512Mi"),
},
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("500m"),
corev1.ResourceMemory: resource.MustParse("1G"),
},
},
},
GenericTemplate: Raw(test, job),
CompletionStatus: "Complete",
},
},
},
},
}

_, err := test.Client().MCAD().WorkloadV1beta1().AppWrappers(namespace.Name).Create(test.Ctx(), aw, metav1.CreateOptions{})
test.Expect(err).NotTo(HaveOccurred())
test.T().Logf("AppWrapper created successfully %s/%s", aw.Namespace, aw.Name)

test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutShort).
Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive)))

return job, aw, err
}
73 changes: 73 additions & 0 deletions test/e2e/instascale_machinepool_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
package e2e

import (
"testing"
"time"

. "github.com/onsi/gomega"
mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
. "github.com/project-codeflare/codeflare-operator/test/support"
)

func TestInstascaleMachinePool(t *testing.T) {

test := With(t)
test.T().Parallel()

namespace := test.NewTestNamespace()

// Test configuration
config, err := TestConfig(test, namespace.Name)
test.Expect(err).To(BeNil())

//create OCM connection
connection, err := CreateConnection(test)
test.Expect(err).To(BeNil())

defer connection.Close()

// check existing cluster machine pool resources
// look for machine pool with aw name - expect not to find it
foundMachinePool, err := CheckMachinePools(connection, TestName)
test.Expect(err).NotTo(HaveOccurred())
test.Expect(foundMachinePool).To(BeFalse())

// Setup batch job and AppWrapper
job, aw, err := JobAppwrapperSetup(test, namespace, config)
test.Expect(err).To(BeNil())

// time.Sleep is used twice throughout the test, each for 30 seconds. Can look into using sync package waitGroup instead if that makes more sense
// wait for required resources to scale up before checking them again
time.Sleep(TestTimeoutMedium)

// look for machine pool with aw name - expect to find it
foundMachinePool, err = CheckMachinePools(connection, TestName)
test.Expect(err).NotTo(HaveOccurred())
test.Expect(foundMachinePool).To(BeTrue())

// Assert that the job has completed
test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name)
test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong).Should(
Or(
WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue)),
WithTransform(ConditionStatus(batchv1.JobFailed), Equal(corev1.ConditionTrue)),
))

// Assert the job has completed successfully
test.Expect(GetJob(test, job.Namespace, job.Name)).
To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue)))

test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutShort).
Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateCompleted)))

// allow time for the resources to scale down before checking them again
time.Sleep(TestTimeoutMedium)

// look for machine pool with aw name - expect not to find it
foundMachinePool, err = CheckMachinePools(connection, TestName)
test.Expect(err).NotTo(HaveOccurred())
test.Expect(foundMachinePool).To(BeFalse())

}
115 changes: 115 additions & 0 deletions test/support/clusterpools.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
package support

import (
"context"
"fmt"
"os"
"strings"

ocmsdk "github.com/openshift-online/ocm-sdk-go"
cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
mapiclientset "github.com/openshift/client-go/machine/clientset/versioned"
"github.com/openshift/client-go/machine/listers/machine/v1beta1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
)

var (
ClusterID string = os.Getenv("CLUSTERID")
machineClient mapiclientset.Interface
msLister v1beta1.MachineSetLister
TestName string = "test-instascale"
)

const (
namespaceToList = "openshift-machine-api"
)

func CreateOCMConnection(secret string) (*ocmsdk.Connection, error) {
logger, err := ocmsdk.NewGoLoggerBuilder().
Debug(false).
Build()
if err != nil {
fmt.Fprintf(os.Stderr, "Can't build logger: %v\n", err)
os.Exit(1)
}
connection, err := ocmsdk.NewConnectionBuilder().
Logger(logger).
Tokens(string(secret)).
Build()
if err != nil || connection == nil {
fmt.Fprintf(os.Stderr, "Can't build connection: %v\n", err)
os.Exit(1)
}

return connection, nil
}

func MachinePoolsExist(connection *ocmsdk.Connection) (bool, error) {
machinePools := connection.ClustersMgmt().V1().Clusters().Cluster(ClusterID).MachinePools()
return machinePools != nil, nil
}

func NodePoolsExist(connection *ocmsdk.Connection) (bool, error) {
nodePools := connection.ClustersMgmt().V1().Clusters().Cluster(ClusterID).NodePools()
return nodePools != nil, nil
}

func CheckMachinePools(connection *ocmsdk.Connection, awName string) (foundMachinePool bool, err error) {
machinePoolsConnection := connection.ClustersMgmt().V1().Clusters().Cluster(ClusterID).MachinePools().List()
machinePoolsListResponse, err := machinePoolsConnection.Send()
if err != nil {
return false, fmt.Errorf("unable to send request, error: %v", err)
}
machinePoolsList := machinePoolsListResponse.Items()
machinePoolsList.Range(func(index int, item *cmv1.MachinePool) bool {
instanceName, _ := item.GetID()
if strings.Contains(instanceName, awName) {
foundMachinePool = true
}
return true
})

return foundMachinePool, err
}

func CheckNodePools(connection *ocmsdk.Connection, awName string) (foundNodePool bool, err error) {
nodePoolsConnection := connection.ClustersMgmt().V1().Clusters().Cluster(ClusterID).NodePools().List()
nodePoolsListResponse, err := nodePoolsConnection.SendContext(context.Background())
if err != nil {
return false, fmt.Errorf("unable to send request, error: %v", err)
}
nodePoolsList := nodePoolsListResponse.Items()
nodePoolsList.Range(func(index int, item *cmv1.NodePool) bool {
instanceName, _ := item.GetID()
if strings.Contains(instanceName, awName) {
foundNodePool = true
}
return true
})

return foundNodePool, err
}

func MachineSetsCount() (numMachineSets int, err error) {
machineSets, err := machineClient.MachineV1beta1().MachineSets(namespaceToList).List(context.Background(), metav1.ListOptions{})
if err != nil {
return 0, fmt.Errorf("error while listing machine sets, error: %v", err)
}
machineSetsSize := machineSets.ListMeta.Size()

return machineSetsSize, nil
}

func CheckMachineSets(awName string) (foundMachineSet bool, err error) {
machineSets, err := msLister.MachineSets("").List(labels.Everything())
if err != nil {
return false, fmt.Errorf("error listing machine sets, error: %v", err)
}
for _, machineSet := range machineSets {
if strings.Contains(machineSet.Name, awName) {
foundMachineSet = true
}
}
return foundMachineSet, err
}
1 change: 1 addition & 0 deletions test/support/support.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ var (
ApplyOptions = metav1.ApplyOptions{FieldManager: "codeflare-test", Force: true}

TestTimeoutShort = 1 * time.Minute
TestTimeoutThirtySeconds = 30 * time.Second
TestTimeoutMedium = 2 * time.Minute
TestTimeoutLong = 5 * time.Minute
)
Expand Down