@@ -5,6 +5,8 @@
 
 #include <linux/sizes.h>
 #include <linux/vfio_pci_core.h>
+#include <linux/delay.h>
+#include <linux/jiffies.h>
 
 /*
  * The device memory usable to the workloads running in the VM is cached
@@ -17,12 +19,21 @@
 #define RESMEM_REGION_INDEX	VFIO_PCI_BAR2_REGION_INDEX
 #define USEMEM_REGION_INDEX	VFIO_PCI_BAR4_REGION_INDEX
 
-/* Memory size expected as non cached and reserved by the VM driver */
-#define RESMEM_SIZE SZ_1G
-
 /* A hardwired and constant ABI value between the GPU FW and VFIO driver. */
 #define MEMBLK_SIZE SZ_512M
 
+#define DVSEC_BITMAP_OFFSET 0xA
+#define MIG_SUPPORTED_WITH_CACHED_RESMEM BIT(0)
+
+#define GPU_CAP_DVSEC_REGISTER 3
+
+#define C2C_LINK_BAR0_OFFSET 0x1498
+#define HBM_TRAINING_BAR0_OFFSET 0x200BC
+#define STATUS_READY 0xFF
+
+#define POLL_QUANTUM_MS 1000
+#define POLL_TIMEOUT_MS (30 * 1000)
+
 /*
  * The state of the two device memory region - resmem and usemem - is
  * saved as struct mem_region.
@@ -46,6 +57,7 @@ struct nvgrace_gpu_pci_core_device {
 	struct mem_region resmem;
 	/* Lock to control device memory kernel mapping */
 	struct mutex remap_lock;
+	bool has_mig_hw_bug;
 };
 
 static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
@@ -66,7 +78,7 @@ nvgrace_gpu_memregion(int index,
 	if (index == USEMEM_REGION_INDEX)
 		return &nvdev->usemem;
 
-	if (index == RESMEM_REGION_INDEX)
+	if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX)
 		return &nvdev->resmem;
 
 	return NULL;
@@ -751,40 +763,67 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
 			      u64 memphys, u64 memlength)
 {
 	int ret = 0;
+	u64 resmem_size = 0;
 
 	/*
-	 * The VM GPU device driver needs a non-cacheable region to support
-	 * the MIG feature. Since the device memory is mapped as NORMAL cached,
-	 * carve out a region from the end with a different NORMAL_NC
-	 * property (called as reserved memory and represented as resmem). This
-	 * region then is exposed as a 64b BAR (region 2 and 3) to the VM, while
-	 * exposing the rest (termed as usable memory and represented using usemem)
-	 * as cacheable 64b BAR (region 4 and 5).
+	 * On Grace Hopper systems, the VM GPU device driver needs a non-cacheable
+	 * region to support the MIG feature owing to a hardware bug. Since the
+	 * device memory is mapped as NORMAL cached, carve out a region from the
+	 * end with a different NORMAL_NC property (called reserved memory and
+	 * represented as resmem). This region is then exposed as a 64b BAR
+	 * (region 2 and 3) to the VM, while the rest (termed usable memory and
+	 * represented using usemem) is exposed as a cacheable 64b BAR (region 4 and 5).
 	 *
 	 *     devmem (memlength)
 	 * |-------------------------------------------------|
 	 * |                                                  |
 	 * usemem.memphys                           resmem.memphys
+	 *
+	 * This hardware bug is fixed on the Grace Blackwell platforms and the
+	 * presence of the bug can be determined through nvdev->has_mig_hw_bug.
+	 * Thus on systems with the hardware fix, there is no need to partition
+	 * the GPU device memory and the entire memory is usable and mapped as
+	 * NORMAL cached (i.e. resmem size is 0).
 	 */
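+	/*
+	 * With the hardware fix, resmem_size stays 0 and the resmem region
+	 * is not exposed to the VM at all.
+	 */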
+	if (nvdev->has_mig_hw_bug)
+		resmem_size = SZ_1G;
+
 	nvdev->usemem.memphys = memphys;
 
 	/*
 	 * The device memory exposed to the VM is added to the kernel by the
-	 * VM driver module in chunks of memory block size. Only the usable
-	 * memory (usemem) is added to the kernel for usage by the VM
-	 * workloads. Make the usable memory size memblock aligned.
+	 * VM driver module in chunks of memory block size. Note that only the
+	 * usable memory (usemem) is added to the kernel for usage by the VM
+	 * workloads.
 	 */
-	if (check_sub_overflow(memlength, RESMEM_SIZE,
+	if (check_sub_overflow(memlength, resmem_size,
 			       &nvdev->usemem.memlength)) {
 		ret = -EOVERFLOW;
 		goto done;
 	}
 
 	/*
-	 * The USEMEM part of the device memory has to be MEMBLK_SIZE
-	 * aligned. This is a hardwired ABI value between the GPU FW and
-	 * VFIO driver. The VM device driver is also aware of it and make
-	 * use of the value for its calculation to determine USEMEM size.
+	 * The usemem region is exposed as a 64b BAR composed of region 4 and 5.
+	 * Calculate and save the BAR size for the region.
+	 */
+	nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
+
+	/*
+	 * If the hardware has the fix for MIG, there is no requirement
+	 * for splitting the device memory to create RESMEM. The entire
+	 * device memory is usable and will be USEMEM. Return here in
+	 * that case.
+	 */
+	if (!nvdev->has_mig_hw_bug)
+		goto done;
+
+	/*
+	 * When the device memory is split to work around the MIG bug on
+	 * Grace Hopper, the USEMEM part of the device memory has to be
+	 * MEMBLK_SIZE aligned. This is a hardwired ABI value between the
+	 * GPU FW and VFIO driver. The VM device driver is also aware of it
+	 * and makes use of the value for its calculation to determine USEMEM
+	 * size. Note that the device memory may not be 512M aligned.
 	 */
 	nvdev->usemem.memlength = round_down(nvdev->usemem.memlength,
 					     MEMBLK_SIZE);
@@ -803,15 +842,93 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
 	}
 
 	/*
-	 * The memory regions are exposed as BARs. Calculate and save
-	 * the BAR size for them.
+	 * The resmem region is exposed as a 64b BAR composed of region 2 and 3
+	 * for Grace Hopper. Calculate and save the BAR size for the region.
 	 */
-	nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
 	nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength);
 done:
 	return ret;
 }
 
+static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev)
+{
+	int pcie_dvsec;
+	u16 dvsec_ctrl16;
+
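+	/*
+	 * The hardware fix is advertised through a bit in the NVIDIA DVSEC
+	 * capability. If the capability or the bit is absent, assume the
+	 * MIG bug is present (as on Grace Hopper).
+	 */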
+	pcie_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_NVIDIA,
+					       GPU_CAP_DVSEC_REGISTER);
+
+	if (pcie_dvsec) {
+		pci_read_config_word(pdev,
+				     pcie_dvsec + DVSEC_BITMAP_OFFSET,
+				     &dvsec_ctrl16);
+
+		if (dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM)
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * To reduce the system boot time, the HBM training has been
+ * moved out of UEFI on the Grace Blackwell systems.
+ *
+ * The onus of checking whether the HBM training has completed
+ * therefore falls on this module. The HBM training status can
+ * be determined from a BAR0 register.
+ *
+ * Similarly, another BAR0 register exposes the status of the
+ * CPU-GPU chip-to-chip (C2C) cache coherent interconnect.
+ *
+ * Poll these registers for up to 30s. If the HBM training is
+ * not complete or the C2C link is not ready, fail the probe.
+ *
+ * While the wait is not required on Grace Hopper systems, it
+ * is a cheap check to ensure the device is in the expected
+ * state.
+ *
+ * Ensure that the BAR0 region is enabled before accessing the
+ * registers.
+ */
+static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev)
+{
+	unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
+	void __iomem *io;
+	int ret = -ETIME;
+
+	ret = pci_enable_device(pdev);
+	if (ret)
+		return ret;
+
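+	/* Only BAR0 (bit 0 of the region mask) holds the status registers */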
+	ret = pci_request_selected_regions(pdev, 1 << 0, KBUILD_MODNAME);
+	if (ret)
+		goto request_region_exit;
+
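+	/* Map the whole of BAR0; a length of 0 requests the full BAR */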
+	io = pci_iomap(pdev, 0, 0);
+	if (!io) {
+		ret = -ENOMEM;
+		goto iomap_exit;
+	}
+
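+	/* Poll until both registers read STATUS_READY or we time out */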
+	do {
+		if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
+		    (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) {
+			ret = 0;
+			goto reg_check_exit;
+		}
+		msleep(POLL_QUANTUM_MS);
+	} while (!time_after(jiffies, timeout));
+	ret = -ETIME;
+reg_check_exit:
+	pci_iounmap(pdev, io);
+iomap_exit:
+	pci_release_selected_regions(pdev, 1 << 0);
+request_region_exit:
+	pci_disable_device(pdev);
+	return ret;
+}
+
 static int nvgrace_gpu_probe(struct pci_dev *pdev,
 			     const struct pci_device_id *id)
 {
@@ -820,6 +937,10 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
 	u64 memphys, memlength;
 	int ret;
 
+	ret = nvgrace_gpu_wait_device_ready(pdev);
+	if (ret)
+		return ret;
+
 	ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
 	if (!ret)
 		ops = &nvgrace_gpu_pci_ops;
@@ -832,6 +953,8 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
 	dev_set_drvdata(&pdev->dev, &nvdev->core_device);
 
 	if (ops == &nvgrace_gpu_pci_ops) {
+		nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev);
+
 		/*
 		 * Device memory properties are identified in the host ACPI
 		 * table. Set the nvgrace_gpu_pci_core_device structure.
@@ -868,6 +991,8 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = {
 	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) },
 	/* GH200 SKU */
 	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) },
+	/* GB200 SKU */
+	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) },
 	{}
 };
 