From a7979272ec4c80ee5117c337678394fb929fc3fe Mon Sep 17 00:00:00 2001
From: Colin Xu
Date: Mon, 16 Dec 2019 08:55:32 +0800
Subject: [PATCH] Make HAXM run on systems with more than 64 host CPUs

Although this patch touches many files, it does only one thing: make HAXM run on systems with more than 64 logical CPUs.

Previously, HAX_MAX_CPUS was defined as 64, and HAXM stored the CPU online bitmap in a 64-bit variable. On a system with more than 64 logical CPUs, an IPI call is actually executed on all CPUs, although internally HAXM only maintains a 64-bit bitmap. Per-CPU routines therefore ran on CPUs the bitmap could not represent, while the 64-iteration loop kept checking the same pairs of VMXON/VMXOFF results for VMX operations, which led to errors. Simply enlarging the 64-bit bitmap might resolve the issue, but would be neither efficient nor clean.

The previous implementation had a second issue: it invoked KeQueryActiveProcessors() to get the total logical CPU count and KeGetCurrentProcessorNumber() to get the current logical CPU ID. Neither API is designed for Windows systems with more than one processor group: both only return values for group 0, which cannot reveal the actual logical CPU information. KeQueryActiveProcessorCountEx() and KeGetCurrentProcessorNumberEx() should be used instead.

This patch defines the CPU bitmap two-dimensionally, in units of groups; each group holds a bitmap of up to 64 CPUs, the same as the old implementation. It also introduces an array storing each CPU's group/bit position so that indexing is fast. In addition, it unifies the CPU init routines, moving common implementations such as cpu_info_init(), smp_cfunction() and struct smp_call_parameter into shared headers/sources instead of OS-specific ones. Since these are very fundamental functions, several files are modified.

Change summary:
- Define the two-dimensional structure hax_cpumap_t to store CPU bitmap info: the total group count, the total logical CPU count, the bitmap within each group, and the group/bit position of each CPU ID.
- For Windows/Linux/Darwin/BSD, initialize the host CPUs through a similar routine, cpu_info_init(), implemented in an OS-specific way.
- On Windows, use KeQueryActiveProcessorCountEx(), KeGetCurrentProcessorNumberEx() and KeQueryActiveGroupCount() to get the correct logical CPU count and group information, and fill hax_cpumap_t accordingly.
- On Linux/Darwin/BSD, use OS-specific routines to get the total logical CPU count and fill the groups consecutively. This differs from Windows: the logical processor group is a Windows concept, and a Windows CPU bitmap is not guaranteed to fit consecutively into a 64-bit bitmap, since 64 logical CPUs could span two groups.
- Implement the new cpu_is_online() on top of the new hax_cpumap_t.
- Implement the new cpu2cpumap() on top of the new hax_cpumap_t.
- Unify the OS-specific smp_cfunction() implementations into one; since the function is executed by IPI on each CPU, it checks that the CPU is online with the new cpu_is_online().
- Switch all functions that refer to the CPU bitmap to the new implementation.
- For every per-CPU IPI function, log the current CPU ID.

After this patch, HAXM's design no longer blocks running on systems with a large number of CPUs and is easy to expand should the data type range become the limit: the upper bound is now 65536*64, other resource limitations aside.
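To make the arithmetic concrete, here is a standalone sketch (illustrative only, not code from this patch; HAX_MAX_CPU_PER_GROUP mirrors the macro added to include/hax.h below) of how a linear CPU id resolves to its (group, bit) position with plain division and modulo, the per-group mask being a single 64-bit word:

    /* Standalone demo of the (group, bit) decomposition used by the patch. */
    #include <stdint.h>
    #include <stdio.h>

    #define HAX_MAX_CPU_PER_GROUP 64   /* bits per hax_cpumask_t */

    int main(void)
    {
        uint32_t cpu_id = 100;                             /* e.g. CPU #100 */
        uint16_t group = cpu_id / HAX_MAX_CPU_PER_GROUP;   /* -> group 1 */
        uint16_t bit   = cpu_id % HAX_MAX_CPU_PER_GROUP;   /* -> bit 36 */
        uint64_t mask  = (uint64_t)1 << bit;               /* mask in group */
        printf("cpu %u -> group %u, bit %u, mask 0x%016llx\n",
               cpu_id, group, bit, (unsigned long long)mask);
        return 0;
    }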
Signed-off-by: Colin Xu --- core/cpu.c | 37 +++++-- core/ept.c | 15 +-- core/hax.c | 135 +++++++++++++--------- core/include/cpu.h | 7 +- core/include/vcpu.h | 4 +- core/vcpu.c | 29 +++-- include/darwin/hax_mac.h | 17 --- include/darwin/hax_types_mac.h | 7 +- include/hax.h | 154 +++++++++++++++++++++----- include/linux/hax_linux.h | 7 -- include/linux/hax_types_linux.h | 8 +- include/netbsd/hax_netbsd.h | 7 -- include/netbsd/hax_types_netbsd.h | 8 +- include/windows/hax_types_windows.h | 17 +-- include/windows/hax_windows.h | 7 -- platforms/darwin/com_intel_hax.c | 42 +------ platforms/darwin/hax_wrapper.cpp | 110 +++++++++++++----- platforms/linux/hax_entry.c | 16 +-- platforms/linux/hax_wrapper.c | 98 ++++++++++++---- platforms/netbsd/hax_entry.c | 24 ++-- platforms/netbsd/hax_entry_vcpu.c | 3 +- platforms/netbsd/hax_wrapper.c | 108 ++++++++++++++---- platforms/windows/hax_entry.c | 17 +-- platforms/windows/hax_wrapper.c | 166 +++++++++++++++++++--------- 24 files changed, 672 insertions(+), 371 deletions(-) diff --git a/core/cpu.c b/core/cpu.c index 050cc169..5f82fd7c 100644 --- a/core/cpu.c +++ b/core/cpu.c @@ -87,6 +87,8 @@ void cpu_init_vmx(void *arg) cpu_data = current_cpu_data(); + hax_log(HAX_LOGD, "[#%d] cpu_init_vmx\n", cpu_data->cpu_id); + cpu_data->cpu_features |= HAX_CPUF_VALID; if (!cpu_has_feature(X86_FEATURE_VMX)) return; @@ -158,6 +160,7 @@ void cpu_init_vmx(void *arg) void cpu_exit_vmx(void *arg) { + hax_log(HAX_LOGD, "[#%d] cpu_exit_vmx\n", current_cpu_data()->cpu_id); } /* @@ -170,6 +173,8 @@ void cpu_pmu_init(void *arg) struct cpu_pmu_info *pmu_info = ¤t_cpu_data()->pmu_info; cpuid_args_t cpuid_args; + hax_log(HAX_LOGD, "[#%d] cpu_pmu_init\n", current_cpu_data()->cpu_id); + memset(pmu_info, 0, sizeof(struct cpu_pmu_info)); // Call CPUID with EAX = 0 @@ -405,8 +410,8 @@ int cpu_vmx_execute(struct vcpu_t *vcpu, struct hax_tunnel *htun) } exit_reason.raw = vmread(vcpu, VM_EXIT_INFO_REASON); - hax_log(HAX_LOGD, "....exit_reason.raw %x, cpu %d %d\n", - exit_reason.raw, vcpu->cpu_id, hax_cpuid()); + hax_log(HAX_LOGD, "....exit_reason.raw %x, vcpu %d, cpu %d\n", + exit_reason.raw, vcpu->cpu_id, hax_cpu_id()); /* XXX Currently we take active save/restore for MSR and FPU, the main * reason is, we have no schedule hook to get notified of preemption @@ -559,7 +564,7 @@ uint32_t load_vmcs(struct vcpu_t *vcpu, preempt_flag *flags) vcpu->is_vmcs_loaded = 1; cpu_data->current_vcpu = vcpu; vcpu->prev_cpu_id = vcpu->cpu_id; - vcpu->cpu_id = hax_cpuid(); + vcpu->cpu_id = hax_cpu_id(); } cpu_data->other_vmcs = curr_vmcs; @@ -669,23 +674,27 @@ vmx_result_t cpu_vmxroot_leave(void) struct per_cpu_data *cpu_data = current_cpu_data(); vmx_result_t result = VMX_SUCCEED; + hax_log(HAX_LOGD, "[#%d] cpu_vmxroot_leave\n", cpu_data->cpu_id); + if (cpu_data->vmm_flag & VMXON_HAX) { result = asm_vmxoff(); if (result == VMX_SUCCEED) { cpu_data->vmm_flag &= ~VMXON_HAX; restore_host_cr4_vmxe(cpu_data); } else { - hax_log(HAX_LOGE, "VMXOFF Failed..........\n"); + hax_log(HAX_LOGE, "[#%d] VMXOFF Failed..........\n", + cpu_data->cpu_id); } } else { log_vmxoff_no = 1; #ifdef HAX_PLATFORM_DARWIN - hax_log(HAX_LOGD, "Skipping VMXOFF because another VMM (VirtualBox or " - "macOS Hypervisor Framework) is running\n"); + hax_log(HAX_LOGD, "[#%d] Skipping VMXOFF because another VMM " + "(VirtualBox or macOS Hypervisor Framework) is running\n", + cpu_data->cpu_id); #else // It should not go here in Win64/win32 result = VMX_FAIL_VALID; - hax_log(HAX_LOGE, "NO VMXOFF.......\n"); + hax_log(HAX_LOGE, "[#%d] NO 
VMXOFF.......\n", cpu_data->cpu_id); #endif } cpu_data->vmxoff_res = result; @@ -700,11 +709,14 @@ vmx_result_t cpu_vmxroot_enter(void) hax_paddr_t vmxon_addr; vmx_result_t result = VMX_SUCCEED; + hax_log(HAX_LOGD, "[#%d] cpu_vmxroot_enter\n", cpu_data->cpu_id); + cpu_data->host_cr4_vmxe = (get_cr4() & CR4_VMXE); if (cpu_data->host_cr4_vmxe) { if (debug_vmcs_count % 100000 == 0) { - hax_log(HAX_LOGD, "host VT has enabled!\n"); - hax_log(HAX_LOGD, "Cr4 value = 0x%lx\n", get_cr4()); + hax_log(HAX_LOGD, "[#%d] host VT has enabled!\n", cpu_data->cpu_id); + hax_log(HAX_LOGD, "[#%d] Cr4 value = 0x%lx\n", + cpu_data->cpu_id, get_cr4()); log_host_cr4_vmxe = 1; log_host_cr4 = get_cr4(); } @@ -766,9 +778,10 @@ vmx_result_t cpu_vmxroot_enter(void) #endif if (fatal) { - hax_log(HAX_LOGE, "VMXON failed for region 0x%llx (result=0x%x, " "vmxe=%x)\n", hax_page_pa(cpu_data->vmxon_page), (uint32_t)result, (uint32_t)cpu_data->host_cr4_vmxe); + hax_log(HAX_LOGE, "[#%d] VMXON failed for region 0x%llx " "(result=0x%x, vmxe=%x)\n", cpu_data->cpu_id, hax_page_pa(cpu_data->vmxon_page), (uint32_t)result, (uint32_t)cpu_data->host_cr4_vmxe); restore_host_cr4_vmxe(cpu_data); if (result == VMX_FAIL_INVALID) { log_vmxon_err_type1 = 1; diff --git a/core/ept.c b/core/ept.c index cc6d3276..c6037ff3 100644 --- a/core/ept.c +++ b/core/ept.c @@ -335,6 +335,8 @@ static void invept_smpfunc(struct invept_bundle *bundle) cpu_data = current_cpu_data(); cpu_data->invept_res = VMX_SUCCEED; + hax_log(HAX_LOGD, "[#%d] invept_smpfunc\n", cpu_data->cpu_id); + cpu_vmxroot_enter(); if (cpu_data->vmxon_res == VMX_SUCCEED) { @@ -348,8 +350,7 @@ void invept(hax_vm_t *hax_vm, uint type) uint64_t eptp_value = vm_get_eptp(hax_vm); struct invept_desc desc = { eptp_value, 0 }; struct invept_bundle bundle; - int cpu_id; - uint32_t res; + uint32_t cpu_id, res; if (!ept_has_cap(ept_cap_invept)) { hax_log(HAX_LOGW, "INVEPT was not called due to missing host support" @@ -384,10 +385,10 @@ void invept(hax_vm_t *hax_vm, uint type) * especially on macOS; instead, invept_smpfunc() writes VMX instruction * results in hax_cpu_data[], which are checked below. 
*/ - for (cpu_id = 0; cpu_id < max_cpus; cpu_id++) { + for (cpu_id = 0; cpu_id < cpu_online_map.cpu_num; cpu_id++) { struct per_cpu_data *cpu_data; - if (!cpu_is_online(cpu_id)) { + if (!cpu_is_online(&cpu_online_map, cpu_id)) { continue; } cpu_data = hax_cpu_data[cpu_id]; @@ -399,17 +400,17 @@ void invept(hax_vm_t *hax_vm, uint type) res = (uint32_t)cpu_data->vmxon_res; if (res != VMX_SUCCEED) { - hax_log(HAX_LOGE, "[Processor #%d] INVEPT was not called, because " + hax_log(HAX_LOGE, "[#%d] INVEPT was not called, because " "VMXON failed (err=0x%x)\n", cpu_id, res); } else { res = (uint32_t)cpu_data->invept_res; if (res != VMX_SUCCEED) { - hax_log(HAX_LOGE, "[Processor #%d] INVEPT failed (err=0x%x)\n", + hax_log(HAX_LOGE, "[#%d] INVEPT failed (err=0x%x)\n", cpu_id, res); } res = (uint32_t)cpu_data->vmxoff_res; if (res != VMX_SUCCEED) { - hax_log(HAX_LOGE, "[Processor #%d] INVEPT was called, but " + hax_log(HAX_LOGE, "[#%d] INVEPT was called, but " "VMXOFF failed (err=0x%x)\n", cpu_id, res); } } diff --git a/core/hax.c b/core/hax.c index cf9a7400..995aaf51 100644 --- a/core/hax.c +++ b/core/hax.c @@ -55,6 +55,7 @@ struct hax_page *io_bitmap_page_a; struct hax_page *io_bitmap_page_b; struct hax_page *msr_bitmap_page; +hax_cpumap_t cpu_online_map; struct per_cpu_data **hax_cpu_data; struct hax_t *hax; @@ -71,25 +72,25 @@ static void hax_disable_vmx(void) static void free_cpu_vmxon_region(void) { - int cpu; + uint32_t cpu_id; - for (cpu = 0; cpu < max_cpus; cpu++) { - if (!cpu_is_online(cpu) || !hax_cpu_data[cpu]) + for (cpu_id = 0; cpu_id < cpu_online_map.cpu_num; cpu_id++) { + if (!cpu_is_online(&cpu_online_map, cpu_id) || !hax_cpu_data[cpu_id]) continue; - if (hax_cpu_data[cpu]->vmxon_page) { - hax_free_pages(hax_cpu_data[cpu]->vmxon_page); - hax_cpu_data[cpu]->vmxon_page = NULL; + if (hax_cpu_data[cpu_id]->vmxon_page) { + hax_free_pages(hax_cpu_data[cpu_id]->vmxon_page); + hax_cpu_data[cpu_id]->vmxon_page = NULL; } } } static int alloc_cpu_vmxon_region(void) { - int cpu; + uint32_t cpu_id; struct hax_page *page; - for (cpu = 0; cpu < max_cpus; cpu++) { - if (!cpu_is_online(cpu) || !hax_cpu_data[cpu]) + for (cpu_id = 0; cpu_id < cpu_online_map.cpu_num; cpu_id++) { + if (!cpu_is_online(&cpu_online_map, cpu_id) || !hax_cpu_data[cpu_id]) continue; page = hax_alloc_page(0, 1); if (!page) { @@ -97,32 +98,32 @@ static int alloc_cpu_vmxon_region(void) return -ENOMEM; } hax_clear_page(page); - hax_cpu_data[cpu]->vmxon_page = page; + hax_cpu_data[cpu_id]->vmxon_page = page; } return 0; } void free_cpu_template_vmcs(void) { - int cpu; + uint32_t cpu_id; - for (cpu = 0; cpu < max_cpus; cpu++) { - if (!cpu_is_online(cpu) || !hax_cpu_data[cpu]) + for (cpu_id = 0; cpu_id < cpu_online_map.cpu_num; cpu_id++) { + if (!cpu_is_online(&cpu_online_map, cpu_id) || !hax_cpu_data[cpu_id]) continue; - if (hax_cpu_data[cpu]->vmcs_page) { - hax_free_pages(hax_cpu_data[cpu]->vmcs_page); - hax_cpu_data[cpu]->vmcs_page = NULL; + if (hax_cpu_data[cpu_id]->vmcs_page) { + hax_free_pages(hax_cpu_data[cpu_id]->vmcs_page); + hax_cpu_data[cpu_id]->vmcs_page = NULL; } } } static int alloc_cpu_template_vmcs(void) { - int cpu; + uint32_t cpu_id; struct hax_page *page = NULL; - for (cpu = 0; cpu < max_cpus; cpu++) { - if (!cpu_is_online(cpu) || !hax_cpu_data[cpu]) + for (cpu_id = 0; cpu_id < cpu_online_map.cpu_num; cpu_id++) { + if (!cpu_is_online(&cpu_online_map, cpu_id) || !hax_cpu_data[cpu_id]) continue; page = (struct hax_page *)hax_alloc_page(0, 1); if (!page) { @@ -130,7 +131,7 @@ static int alloc_cpu_template_vmcs(void) 
return -ENOMEM; } hax_clear_page(page); - hax_cpu_data[cpu]->vmcs_page = page; + hax_cpu_data[cpu_id]->vmcs_page = page; } return 0; } @@ -140,6 +141,29 @@ int hax_em64t_enabled(void) return hax->em64t_enable_flag; } +#ifdef HAX_PLATFORM_DARWIN +hax_smp_func_ret_t smp_cfunction(void *param) +#endif +#ifdef HAX_PLATFORM_LINUX +hax_smp_func_ret_t smp_cfunction(void *param) +#endif +#ifdef HAX_PLATFORM_NETBSD +hax_smp_func_ret_t smp_cfunction(void *param, void *a2 __unused) +#endif +#ifdef HAX_PLATFORM_WINDOWS +hax_smp_func_ret_t smp_cfunction(void *param) +#endif +{ + struct smp_call_parameter *p = (struct smp_call_parameter *)param; + void (*action)(void *parap) = p->func; + hax_cpumap_t *hax_cpus = p->cpus; + + if (cpu_is_online(hax_cpus, hax_cpu_id())) + action(p->param); + + return (hax_smp_func_ret_t)NULL; +} + /* * This vcpu_data should not be accessed by anyone else at this step. * Return 0 if can continue, <0 for error. @@ -147,14 +171,15 @@ int hax_em64t_enabled(void) static int hax_vmx_enable_check(void) { int vts = 0, nxs = 0, vte = 0, nxe = 0, em64s = 0, em64e = 0, finished = 0; - int cpu, tnum = 0, error = 0; + int tnum = 0, error = 0; + uint32_t cpu_id; - for (cpu = 0; cpu < max_cpus; cpu++) { + for (cpu_id = 0; cpu_id < cpu_online_map.cpu_num; cpu_id++) { struct per_cpu_data *cpu_data; - if (!cpu_is_online(cpu)) + if (!cpu_is_online(&cpu_online_map, cpu_id)) continue; - cpu_data = hax_cpu_data[cpu]; + cpu_data = hax_cpu_data[cpu_id]; // This should not happen ! if (!cpu_data) continue; @@ -412,8 +437,7 @@ static void set_msr_access(uint32_t start, uint32_t count, bool read, bool write */ static void hax_pmu_init(void) { - int cpu_id; - int ref_cpu_id = -1; + uint32_t cpu_id, ref_cpu_id = (uint32_t)(~0ULL); // Execute cpu_pmu_init() on each logical processor of the host CPU hax_smp_call_function(&cpu_online_map, cpu_pmu_init, NULL); @@ -421,11 +445,11 @@ static void hax_pmu_init(void) // Find the common APM version supported by all host logical processors // TODO: Theoretically we should do the same for other APM parameters // (number of counters, etc.) 
as well - for (cpu_id = 0; cpu_id < max_cpus; cpu_id++) { + for (cpu_id = 0; cpu_id < cpu_online_map.cpu_num; cpu_id++) { struct per_cpu_data *cpu_data; uint apm_version; - if (!cpu_is_online(cpu_id)) { + if (!cpu_is_online(&cpu_online_map, cpu_id)) { continue; } cpu_data = hax_cpu_data[cpu_id]; @@ -512,7 +536,7 @@ static void hax_pmu_init(void) int hax_module_init(void) { - int ret = 0, cpu = 0; + uint32_t cpu_id; hax = (struct hax_t *)hax_vmalloc(sizeof(struct hax_t), HAX_MEM_NONPAGE); if (!hax) @@ -523,29 +547,28 @@ int hax_module_init(void) if (!hax->hax_lock) goto out_0; - hax_cpu_data = hax_vmalloc(max_cpus * sizeof(void *), 0); + hax_cpu_data = hax_vmalloc(cpu_online_map.cpu_num * sizeof(void *), 0); if (!hax_cpu_data) goto out_1; - memset(hax_cpu_data, 0, max_cpus * sizeof(void *)); + memset(hax_cpu_data, 0, cpu_online_map.cpu_num * sizeof(void *)); - for (cpu = 0; cpu < max_cpus; cpu++) { - if (!cpu_is_online(cpu)) + for (cpu_id = 0; cpu_id < cpu_online_map.cpu_num; cpu_id++) { + if (!cpu_is_online(&cpu_online_map, cpu_id)) continue; - hax_cpu_data[cpu] = hax_vmalloc(sizeof(struct per_cpu_data), 0); - if (!hax_cpu_data[cpu]) + hax_cpu_data[cpu_id] = hax_vmalloc(sizeof(struct per_cpu_data), 0); + if (!hax_cpu_data[cpu_id]) goto out_2; - memset(hax_cpu_data[cpu], 0, sizeof(struct per_cpu_data)); + memset(hax_cpu_data[cpu_id], 0, sizeof(struct per_cpu_data)); - hax_cpu_data[cpu]->hstate.hfxpage = + hax_cpu_data[cpu_id]->hstate.hfxpage = (struct hax_page *)hax_alloc_page(0, 1); - if (!hax_cpu_data[cpu]->hstate.hfxpage) + if (!hax_cpu_data[cpu_id]->hstate.hfxpage) goto out_2; - hax_clear_page(hax_cpu_data[cpu]->hstate.hfxpage); - hax_cpu_data[cpu]->cpu_id = cpu; + hax_clear_page(hax_cpu_data[cpu_id]->hstate.hfxpage); + hax_cpu_data[cpu_id]->cpu_id = cpu_id; } cpu_init_feature_cache(); - ret = hax_vmx_init(); - if (ret < 0) + if (hax_vmx_init() < 0) goto out_2; hax_pmu_init(); @@ -557,15 +580,15 @@ int hax_module_init(void) return 0; out_2: - for (cpu = 0; cpu < max_cpus; cpu++) { - if (hax_cpu_data[cpu]) { - if (hax_cpu_data[cpu]->hstate.hfxpage) { - hax_free_pages(hax_cpu_data[cpu]->hstate.hfxpage); + for (cpu_id = 0; cpu_id < cpu_online_map.cpu_num; cpu_id++) { + if (hax_cpu_data[cpu_id]) { + if (hax_cpu_data[cpu_id]->hstate.hfxpage) { + hax_free_pages(hax_cpu_data[cpu_id]->hstate.hfxpage); } - hax_vfree(hax_cpu_data[cpu], sizeof(struct per_cpu_data)); + hax_vfree(hax_cpu_data[cpu_id], sizeof(struct per_cpu_data)); } } - hax_vfree(hax_cpu_data, max_cpus * sizeof(void *)); + hax_vfree(hax_cpu_data, cpu_online_map.cpu_num * sizeof(void *)); out_1: hax_mutex_free(hax->hax_lock); out_0: @@ -575,7 +598,8 @@ int hax_module_init(void) int hax_module_exit(void) { - int i, ret; + int ret; + uint32_t cpu_id; if (!hax_list_empty(&hax->hax_vmlist)) { hax_log(HAX_LOGE, "Still VM not be destroyed?\n"); @@ -587,15 +611,16 @@ int hax_module_exit(void) return ret; hax_vmx_exit(); - for (i = 0; i < max_cpus; i++) { - if (!hax_cpu_data[i]) + for (cpu_id = 0; cpu_id < cpu_online_map.cpu_num; cpu_id++) { + if (!hax_cpu_data[cpu_id]) continue; - if (hax_cpu_data[i]->hstate.hfxpage) { - hax_free_pages(hax_cpu_data[i]->hstate.hfxpage); + if (hax_cpu_data[cpu_id]->hstate.hfxpage) { + hax_free_pages(hax_cpu_data[cpu_id]->hstate.hfxpage); } - hax_vfree(hax_cpu_data[i], sizeof(struct per_cpu_data)); + hax_vfree(hax_cpu_data[cpu_id], sizeof(struct per_cpu_data)); } - hax_vfree(hax_cpu_data, max_cpus * sizeof(void *)); + hax_vfree(hax_cpu_data, cpu_online_map.cpu_num * sizeof(void *)); + cpu_info_exit(); 
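+ // cpu_info_exit() must stay after the hax_vfree() calls above: their sizes are computed from cpu_online_map, which cpu_info_exit() releases and resets.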
hax_mutex_free(hax->hax_lock); hax_vfree(hax, sizeof(struct hax_t)); hax_log(HAX_LOGW, "-------- HAXM v%s End --------\n", diff --git a/core/include/cpu.h b/core/include/cpu.h index 988bb3c4..1db58557 100644 --- a/core/include/cpu.h +++ b/core/include/cpu.h @@ -44,8 +44,6 @@ struct vcpu_t; struct vcpu_state_t; -typedef uint32_t hax_cpuid_t; // CPU identifier - #define NR_HMSR 6 struct hstate { @@ -103,7 +101,7 @@ struct per_cpu_data { struct hax_page *vmcs_page; struct vcpu_t *current_vcpu; hax_paddr_t other_vmcs; - hax_cpuid_t cpu_id; + uint32_t cpu_id; uint16_t vmm_flag; uint16_t nested; mword host_cr4_vmxe; @@ -157,8 +155,7 @@ struct per_cpu_data { extern struct per_cpu_data ** hax_cpu_data; static struct per_cpu_data * current_cpu_data(void) { - uint32_t cpu_id = hax_cpuid(); - return hax_cpu_data[cpu_id]; + return hax_cpu_data[hax_cpu_id()]; } static struct per_cpu_data * get_cpu_data(uint32_t cpu_id) diff --git a/core/include/vcpu.h b/core/include/vcpu.h index c81349b7..dc103dcd 100644 --- a/core/include/vcpu.h +++ b/core/include/vcpu.h @@ -146,9 +146,9 @@ struct mmio_fetch_cache { struct vcpu_t { uint16_t vcpu_id; - uint16_t cpu_id; + uint32_t cpu_id; // Sometimes current thread might be migrated to other core. - uint16_t prev_cpu_id; + uint32_t prev_cpu_id; /* * VPID: Virtual Processor Identifier * VPIDs provide a way for software to identify to the processor diff --git a/core/vcpu.c b/core/vcpu.c index d0a145bc..c648a27d 100644 --- a/core/vcpu.c +++ b/core/vcpu.c @@ -127,7 +127,7 @@ static uint32_t get_seg_present(uint32_t seg) { mword ldtr_base; struct seg_desc_t *seg_desc; - struct hstate *hstate = &get_cpu_data(hax_cpuid())->hstate; + struct hstate *hstate = &get_cpu_data(hax_cpu_id())->hstate; ldtr_base = get_kernel_ldtr_base(); seg_desc = (struct seg_desc_t *)ldtr_base + (seg >> 3); @@ -460,8 +460,8 @@ struct vcpu_t *vcpu_create(struct vm_t *vm, void *vm_host, int vcpu_id) if (hax_vcpu_create_host(vcpu, vm_host, vm->vm_id, vcpu_id)) goto fail_7; - vcpu->prev_cpu_id = -1; - vcpu->cpu_id = hax_cpuid(); + vcpu->prev_cpu_id = (uint32_t)(~0ULL); + vcpu->cpu_id = hax_cpu_id(); vcpu->vcpu_id = vcpu_id; vcpu->is_running = 0; vcpu->vm = vm; @@ -1516,7 +1516,7 @@ static void fill_common_vmcs(struct vcpu_t *vcpu) static void vcpu_prepare(struct vcpu_t *vcpu) { hax_log(HAX_LOGD, "vcpu_prepare current %x, CPU %x\n", vcpu->vcpu_id, - hax_cpuid()); + hax_cpu_id()); hax_mutex_lock(vcpu->tmutex); fill_common_vmcs(vcpu); hax_mutex_unlock(vcpu->tmutex); @@ -4320,8 +4320,17 @@ int vcpu_interrupt(struct vcpu_t *vcpu, uint8_t vector) } // Simply to cause vmexit to vcpu, if any vcpu is running on this physical CPU -static void _vcpu_take_off(void *unused) +static void _vcpu_take_off(void *param) { + hax_cpu_pos_t *target = (hax_cpu_pos_t *)param; + + hax_log(HAX_LOGD, "[#%d] _vcpu_take_off\n", current_cpu_data()->cpu_id); + if (target) + hax_log(HAX_LOGD, "_vcpu_take_off on cpu (group-%d bit-%d)\n", + target->group, target->bit); + else + hax_log(HAX_LOGD, "_vcpu_take_off on all cpus\n"); + return; } @@ -4342,16 +4351,16 @@ int vcpu_pause(struct vcpu_t *vcpu) int vcpu_takeoff(struct vcpu_t *vcpu) { - int cpu_id; - hax_cpumap_t targets; + uint32_t cpu_id; + hax_cpu_pos_t target = {0}; // Don't change the sequence unless you are sure if (vcpu->is_running) { cpu_id = vcpu->cpu_id; - hax_assert(cpu_id != hax_cpuid()); - targets = cpu2cpumap(cpu_id); + hax_assert(cpu_id != hax_cpu_id()); + cpu2cpumap(cpu_id, &target); // If not considering Windows XP, definitely we don't need this - 
hax_smp_call_function(&targets, _vcpu_take_off, NULL); + hax_smp_call_function(&cpu_online_map, _vcpu_take_off, &target); } return 0; diff --git a/include/darwin/hax_mac.h b/include/darwin/hax_mac.h index 45a0ba7f..c2356a5d 100644 --- a/include/darwin/hax_mac.h +++ b/include/darwin/hax_mac.h @@ -171,21 +171,4 @@ static inline errno_t memcpy_s(void *dest, size_t destsz, const void *src, #define hax_assert(condition) assert(condition) -static inline bool cpu_is_online(int cpu) -{ - if (cpu < 0 || cpu >= max_cpus) - return 0; - return !!(((uint64_t)1 << cpu) & cpu_online_map); -} - -#ifdef __cplusplus -extern "C" { -#endif - -extern int cpu_number(void); - -#ifdef __cplusplus -} -#endif - #endif // HAX_DARWIN_HAX_MAC_H_ diff --git a/include/darwin/hax_types_mac.h b/include/darwin/hax_types_mac.h index 4f122f24..6af2505c 100644 --- a/include/darwin/hax_types_mac.h +++ b/include/darwin/hax_types_mac.h @@ -132,12 +132,9 @@ typedef struct hax_kmap_phys { typedef ulong mword; typedef mword preempt_flag; -typedef uint64_t hax_cpumap_t; +typedef uint64_t hax_cpumask_t; +typedef void hax_smp_func_ret_t; typedef uint64_t HAX_VADDR_T; -static inline hax_cpumap_t cpu2cpumap(int cpu) -{ - return (0x1UL << cpu); -} #endif // CONFIG_KERNEL_HAX #endif // HAX_DARWIN_HAX_TYPES_MAC_H_ diff --git a/include/hax.h b/include/hax.h index 2492b641..174c8163 100644 --- a/include/hax.h +++ b/include/hax.h @@ -228,20 +228,144 @@ static inline unsigned char *hax_page_va(struct hax_page *page) return (unsigned char *)page->kva; } -#define HAX_MAX_CPUS (sizeof(uint64_t) * 8) +/* Utilities */ +#define HAX_NOLOG 0xff +#define HAX_LOGPANIC 5 +#define HAX_LOGE 4 +#define HAX_LOGW 3 +#define HAX_LOGI 2 +#define HAX_LOGD 1 +#define HAX_LOG_DEFAULT 3 + +#ifdef HAX_PLATFORM_DARWIN +#include "darwin/hax_mac.h" +#endif +#ifdef HAX_PLATFORM_LINUX +#include "linux/hax_linux.h" +#endif +#ifdef HAX_PLATFORM_NETBSD +#include "netbsd/hax_netbsd.h" +#endif +#ifdef HAX_PLATFORM_WINDOWS +#include "windows/hax_windows.h" +#endif + +#define HAX_MAX_CPU_PER_GROUP (sizeof(hax_cpumask_t) * 8) +#define HAX_MAX_CPU_GROUP ((uint16_t)(~0ULL)) +#define HAX_MAX_CPUS (HAX_MAX_CPU_PER_GROUP * HAX_MAX_CPU_GROUP) + +typedef struct hax_cpu_group_t { + hax_cpumask_t map; + uint32_t num; + uint16_t id; +} hax_cpu_group_t; + +typedef struct hax_cpu_pos_t { + uint16_t group; + uint16_t bit; +} hax_cpu_pos_t; + +typedef struct hax_cpumap_t { + hax_cpu_group_t *cpu_map; + hax_cpu_pos_t *cpu_pos; + uint16_t group_num; + uint32_t cpu_num; +} hax_cpumap_t; + +typedef struct smp_call_parameter { + void (*func)(void *); + void *param; + hax_cpumap_t *cpus; +} smp_call_parameter; -/* Host SMP */ extern hax_cpumap_t cpu_online_map; -extern int max_cpus; + +static inline void cpu2cpumap(uint32_t cpu_id, hax_cpu_pos_t *target) +{ + if (!target) + return; + + if (cpu_id >= cpu_online_map.cpu_num) { + target->group = (uint16_t)(~0ULL); + target->bit = (uint16_t)(~0ULL); + } else { + target->group = cpu_online_map.cpu_pos[cpu_id].group; + target->bit = cpu_online_map.cpu_pos[cpu_id].bit; + } +} + +static inline bool cpu_is_online(hax_cpumap_t *cpu_map, uint32_t cpu_id) +{ + hax_cpumask_t map; + uint16_t group, bit; + + if (cpu_id >= cpu_map->cpu_num) { + hax_log(HAX_LOGE, "Invalid cpu-%d\n", cpu_id); + return 0; + } + + group = cpu_map->cpu_pos[cpu_id].group; + if (group != cpu_map->cpu_map[group].id) { + hax_log(HAX_LOGE, "Group id doesn't match record\n", group); + return 0; + } + + bit = cpu_map->cpu_pos[cpu_id].bit; + map = cpu_map->cpu_map[group].map; + return 
!!(((hax_cpumask_t)1 << bit) & map); +} + +extern uint32_t hax_cpu_id(void); +static inline void get_online_map(void *param) +{ + hax_cpumap_t *omap = (hax_cpumap_t *)param; + hax_cpu_group_t *cpu_map; + hax_cpu_pos_t * cpu_pos; + uint32_t cpu_id, group, bit; + + cpu_id = hax_cpu_id(); + group = cpu_id / HAX_MAX_CPU_PER_GROUP; + bit = cpu_id % HAX_MAX_CPU_PER_GROUP; + + cpu_map = &(omap->cpu_map[group]); + cpu_pos = &(omap->cpu_pos[cpu_id]); + + hax_test_and_set_bit(bit, &cpu_map->map); + cpu_map->id = group; + cpu_pos->group = group; + cpu_pos->bit = bit; +} + +static void cpu_info_exit(void) +{ + if (cpu_online_map.cpu_map) + hax_vfree(cpu_online_map.cpu_map, cpu_online_map.group_num * sizeof(*cpu_online_map.cpu_map)); + if (cpu_online_map.cpu_pos) + hax_vfree(cpu_online_map.cpu_pos, cpu_online_map.cpu_num * sizeof(*cpu_online_map.cpu_pos)); + memset(&cpu_online_map, 0, sizeof(cpu_online_map)); +} #ifdef __cplusplus extern "C" { #endif +int cpu_info_init(void); +#ifdef HAX_PLATFORM_DARWIN +hax_smp_func_ret_t smp_cfunction(void *param); +#endif +#ifdef HAX_PLATFORM_LINUX +hax_smp_func_ret_t smp_cfunction(void *param); +#endif +#ifdef HAX_PLATFORM_NETBSD +hax_smp_func_ret_t smp_cfunction(void *param, void *a2 __unused); +#endif +#ifdef HAX_PLATFORM_WINDOWS +hax_smp_func_ret_t smp_cfunction(void *param); +#endif + int hax_smp_call_function(hax_cpumap_t *cpus, void(*scfunc)(void *param), void *param); -uint32_t hax_cpuid(void); int proc_event_pending(struct vcpu_t *vcpu); void hax_disable_preemption(preempt_flag *eflags); @@ -256,26 +380,4 @@ int hax_em64t_enabled(void); } #endif -/* Utilities */ -#define HAX_NOLOG 0xff -#define HAX_LOGPANIC 5 -#define HAX_LOGE 4 -#define HAX_LOGW 3 -#define HAX_LOGI 2 -#define HAX_LOGD 1 -#define HAX_LOG_DEFAULT 3 - -#ifdef HAX_PLATFORM_DARWIN -#include "darwin/hax_mac.h" -#endif -#ifdef HAX_PLATFORM_LINUX -#include "linux/hax_linux.h" -#endif -#ifdef HAX_PLATFORM_NETBSD -#include "netbsd/hax_netbsd.h" -#endif -#ifdef HAX_PLATFORM_WINDOWS -#include "windows/hax_windows.h" -#endif - #endif // HAX_H_ diff --git a/include/linux/hax_linux.h b/include/linux/hax_linux.h index 7dde8649..3027451b 100644 --- a/include/linux/hax_linux.h +++ b/include/linux/hax_linux.h @@ -98,13 +98,6 @@ static inline int memcpy_s(void *dest, size_t destsz, const void *src, bool hax_cmpxchg32(uint32_t old_val, uint32_t new_val, volatile uint32_t *addr); bool hax_cmpxchg64(uint64_t old_val, uint64_t new_val, volatile uint64_t *addr); -static inline bool cpu_is_online(int cpu) -{ - if (cpu < 0 || cpu >= max_cpus) - return 0; - return !!(((mword)1 << cpu) & cpu_online_map); -} - int hax_notify_host_event(enum hax_notify_event event, uint32_t *param, uint32_t size); diff --git a/include/linux/hax_types_linux.h b/include/linux/hax_types_linux.h index c744a20b..da90b6d4 100644 --- a/include/linux/hax_types_linux.h +++ b/include/linux/hax_types_linux.h @@ -91,12 +91,8 @@ typedef struct hax_kmap_phys { typedef struct hax_spinlock hax_spinlock; -typedef int hax_cpumap_t; - -static inline hax_cpumap_t cpu2cpumap(int cpu) -{ - return (0x1 << cpu); -} +typedef uint64_t hax_cpumask_t; +typedef void hax_smp_func_ret_t; /* Remove this later */ #define is_leaf(x) 1 diff --git a/include/netbsd/hax_netbsd.h b/include/netbsd/hax_netbsd.h index 07d8f0b8..9fcaa19c 100644 --- a/include/netbsd/hax_netbsd.h +++ b/include/netbsd/hax_netbsd.h @@ -97,13 +97,6 @@ static inline int memcpy_s(void *dest, size_t destsz, const void *src, bool hax_cmpxchg32(uint32_t old_val, uint32_t new_val, volatile uint32_t *addr); 
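Editor's illustration, not part of the patch: with the per-OS cpu_is_online() helpers removed below, callers go through the shared map API declared in include/hax.h above. A minimal hypothetical caller, assuming cpu_info_init() has already populated cpu_online_map:

    /* Hypothetical helper built on the new shared API from include/hax.h. */
    void dump_cpu(uint32_t cpu_id)
    {
        hax_cpu_pos_t pos;

        cpu2cpumap(cpu_id, &pos);    /* linear id -> (group, bit) position */
        hax_log(HAX_LOGI, "cpu %u -> group %u bit %u, online %d\n",
                cpu_id, pos.group, pos.bit,
                cpu_is_online(&cpu_online_map, cpu_id));
    }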
bool hax_cmpxchg64(uint64_t old_val, uint64_t new_val, volatile uint64_t *addr); -static inline bool cpu_is_online(int cpu) -{ - if (cpu < 0 || cpu >= max_cpus) - return 0; - return !!(((mword)1 << cpu) & cpu_online_map); -} - int hax_notify_host_event(enum hax_notify_event event, uint32_t *param, uint32_t size); diff --git a/include/netbsd/hax_types_netbsd.h b/include/netbsd/hax_types_netbsd.h index 599fd54c..8b807001 100644 --- a/include/netbsd/hax_types_netbsd.h +++ b/include/netbsd/hax_types_netbsd.h @@ -91,12 +91,8 @@ typedef struct hax_kmap_phys { typedef struct hax_spinlock hax_spinlock; -typedef int hax_cpumap_t; - -static inline hax_cpumap_t cpu2cpumap(int cpu) -{ - return (0x1 << cpu); -} +typedef uint64_t hax_cpumask_t; +typedef void hax_smp_func_ret_t; /* Remove this later */ #define is_leaf(x) 1 diff --git a/include/windows/hax_types_windows.h b/include/windows/hax_types_windows.h index 2c2ce73f..2304f12e 100644 --- a/include/windows/hax_types_windows.h +++ b/include/windows/hax_types_windows.h @@ -45,14 +45,6 @@ typedef unsigned char bool; #define is_leaf(x) 1 #endif -typedef KAFFINITY hax_cpumap_t; -inline hax_cpumap_t cpu2cpumap(int cpu) -{ - return ((KAFFINITY)0x1 << cpu); -} - -typedef KIRQL preempt_flag; - // Signed Types typedef signed char int8_t; typedef signed short int16_t; @@ -68,6 +60,15 @@ typedef unsigned int uint; typedef unsigned long ulong; typedef unsigned long ulong_t; +// KAFFINITY is 32 bits on a 32-bit version of Windows and is 64 bits +// on a 64-bit version of Windows. We always use 64-bit to store CPU mask +// in haxm so define it as 64-bit here. +//typedef KAFFINITY hax_cpumask_t; +typedef uint64_t hax_cpumask_t; +typedef ULONG_PTR hax_smp_func_ret_t; + +typedef KIRQL preempt_flag; + #include "../hax_list.h" struct hax_page { void *kva; diff --git a/include/windows/hax_windows.h b/include/windows/hax_windows.h index c43ef8b4..4da10682 100644 --- a/include/windows/hax_windows.h +++ b/include/windows/hax_windows.h @@ -184,13 +184,6 @@ static bool hax_cmpxchg64(uint64_t old_val, uint64_t new_val, volatile uint64_t return FALSE; } -static inline bool cpu_is_online(int cpu) -{ - if (cpu < 0 || cpu >= max_cpus) - return 0; - return !!(((mword)1 << cpu) & cpu_online_map); -} - int hax_notify_host_event(enum hax_notify_event event, uint32_t *param, uint32_t size); diff --git a/platforms/darwin/com_intel_hax.c b/platforms/darwin/com_intel_hax.c index 98795e3b..0bcd814f 100644 --- a/platforms/darwin/com_intel_hax.c +++ b/platforms/darwin/com_intel_hax.c @@ -99,38 +99,6 @@ static int lock_prim_init(void) return -1; } -hax_cpumap_t cpu_online_map; -int max_cpus; - -void get_online_map(void *param) -{ - uint64_t *omap; - - //printf("%x\n", cpu_number()); - omap = param; - if (!omap) { - hax_log(HAX_LOGE, "NULL pointer in get online map\n"); - return; - } - - hax_test_and_set_bit(cpu_number(), omap); - printf("%llx\n ", *omap); - return; -} - -/* This is provided in unsupported kext */ -extern unsigned int real_ncpus; -static void init_cpu_info(void) -{ - uint64_t possible_map, omap = 0; - - possible_map = ~0ULL; - hax_smp_call_function(&possible_map, get_online_map, &omap); - printf("possible map %llx cpu_online_map %llx\n", possible_map, omap); - cpu_online_map = omap; - max_cpus = real_ncpus; -} - static int com_intel_hax_init(void) { int ret; @@ -139,14 +107,9 @@ static int com_intel_hax_init(void) if (ret < 0) return ret; - init_cpu_info(); - - if (max_cpus > HAX_MAX_CPUS) { - hax_log(HAX_LOGE, "Too many cpus in system!, max_cpus:%d\n", - real_ncpus); - ret = 
-E2BIG; + ret = cpu_info_init(); + if (ret < 0) goto fail0; - } ret = hax_malloc_init(); if (ret < 0) @@ -154,6 +117,7 @@ static int com_intel_hax_init(void) return 0; fail0: + cpu_info_exit(); lock_prim_exit(); return ret; } diff --git a/platforms/darwin/hax_wrapper.cpp b/platforms/darwin/hax_wrapper.cpp index 30e249cf..7f9eece5 100644 --- a/platforms/darwin/hax_wrapper.cpp +++ b/platforms/darwin/hax_wrapper.cpp @@ -78,30 +78,95 @@ extern "C" void hax_panic(const char *fmt, ...) va_end(args); } -struct smp_call_parameter { - void (*func)(void *); - void *param; - hax_cpumap_t *cpus; -}; - -extern "C" void mp_rendezvous_no_intrs(void (*action_func)(void *), void *arg); +extern int cpu_number(void); +inline uint32_t hax_cpu_id(void) +{ + return (uint32_t)cpu_number(); } -void smp_cfunction(void *param) +/* This is provided in unsupported kext */ +extern unsigned int real_ncpus; +int cpu_info_init(void) { - int cpu_id; - void (*action)(void *parap); - hax_cpumap_t *hax_cpus; - struct smp_call_parameter *p; - - p = (struct smp_call_parameter *)param; - cpu_id = cpu_number(); - action = p->func; - hax_cpus = p->cpus; - //printf("cpus:%llx, current_cpu:%x\n", *cpus, cpu_id); - if (*hax_cpus & (0x1 << cpu_id)) - action(p->param); + uint32_t size_group, size_pos, cpu_id, group, bit; + hax_cpumap_t omap = {0}; + + memset(&cpu_online_map, 0, sizeof(cpu_online_map)); + + cpu_online_map.cpu_num = real_ncpus; + group = HAX_MAX_CPU_PER_GROUP; + cpu_online_map.group_num = (cpu_online_map.cpu_num + group - 1) / group; + size_group = cpu_online_map.group_num * sizeof(*cpu_online_map.cpu_map); + size_pos = cpu_online_map.cpu_num * sizeof(*cpu_online_map.cpu_pos); + + if (cpu_online_map.group_num > HAX_MAX_CPU_GROUP || + cpu_online_map.cpu_num > HAX_MAX_CPUS) { + hax_log(HAX_LOGE, "Too many cpus %d-%d in system\n", + cpu_online_map.cpu_num, cpu_online_map.group_num); + return -E2BIG; + } + + cpu_online_map.cpu_map = (hax_cpu_group_t *)hax_vmalloc(size_group, 0); + omap.cpu_map = (hax_cpu_group_t *)hax_vmalloc(size_group, 0); + if (!cpu_online_map.cpu_map || !omap.cpu_map) { + hax_log(HAX_LOGE, "Couldn't allocate cpu_map for cpu_online_map\n"); + goto fail_nomem; + } + + cpu_online_map.cpu_pos = (hax_cpu_pos_t *)hax_vmalloc(size_pos, 0); + omap.cpu_pos = (hax_cpu_pos_t *)hax_vmalloc(size_pos, 0); + if (!cpu_online_map.cpu_pos || !omap.cpu_pos) { + hax_log(HAX_LOGE, "Couldn't allocate cpu_pos for cpu_online_map\n"); + goto fail_nomem; + } + + // omap is filled for get_online_map() to init all host cpu info. + // Since smp_cfunction() will check if host cpu is online in cpu_online_map, + // but the first call to smp_cfunction() is to init cpu_online_map itself. + // Make smp_cfunction() always check group 0 bit 0 for get_online_map(), + // so get_online_map() assumes all online and init the real cpu_online_map. 
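+ // (Editor's note) With every omap.cpu_pos entry set to (group 0, bit 0) and omap.cpu_map[0].map set to all ones, cpu_is_online(&omap, hax_cpu_id()) is true on every CPU during this bootstrap broadcast, so get_online_map() runs everywhere.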
+ omap.group_num = cpu_online_map.group_num; + omap.cpu_num = cpu_online_map.cpu_num; + for (cpu_id = 0; cpu_id < omap.cpu_num; cpu_id++) { + omap.cpu_pos[cpu_id].group = 0; + omap.cpu_pos[cpu_id].bit = 0; + } + for (group = 0; group < omap.group_num; group++) { + omap.cpu_map[group].id = 0; + omap.cpu_map[group].map = ~0ULL; + } + hax_smp_call_function(&omap, get_online_map, &cpu_online_map); + + for (group = 0; group < cpu_online_map.group_num; group++) { + cpu_online_map.cpu_map[group].num = 0; + for (bit = 0; bit < HAX_MAX_CPU_PER_GROUP; bit++) { + if (cpu_online_map.cpu_map[group].map & ((hax_cpumask_t)1 << bit)) + ++cpu_online_map.cpu_map[group].num; + } + } + + hax_vfree(omap.cpu_map, size_group); + hax_vfree(omap.cpu_pos, size_pos); + + hax_log(HAX_LOGI, "Host cpu init %d logical cpu(s) into %d group(s)\n", + cpu_online_map.cpu_num, cpu_online_map.group_num); + + return 0; + +fail_nomem: + if (cpu_online_map.cpu_map) + hax_vfree(cpu_online_map.cpu_map, size_group); + if (cpu_online_map.cpu_pos) + hax_vfree(cpu_online_map.cpu_pos, size_pos); + if (omap.cpu_map) + hax_vfree(omap.cpu_map, size_group); + if (omap.cpu_pos) + hax_vfree(omap.cpu_pos, size_pos); + return -ENOMEM; } +extern "C" void mp_rendezvous_no_intrs(void (*action_func)(void *), void *arg); + extern "C" int hax_smp_call_function(hax_cpumap_t *cpus, void (*scfunc)(void *), void *param) { @@ -113,11 +178,6 @@ extern "C" int hax_smp_call_function(hax_cpumap_t *cpus, void (*scfunc)(void *), return 0; } -extern "C" uint32_t hax_cpuid() -{ - return cpu_number(); -} - extern "C" void hax_disable_preemption(preempt_flag *eflags) { mword flags; diff --git a/platforms/linux/hax_entry.c b/platforms/linux/hax_entry.c index a2c96c0a..9a66de18 100644 --- a/platforms/linux/hax_entry.c +++ b/platforms/linux/hax_entry.c @@ -114,24 +114,24 @@ static long hax_dev_ioctl(struct file *filp, unsigned int cmd, static int __init hax_driver_init(void) { - int i, err; - - // Initialization - max_cpus = num_present_cpus(); - cpu_online_map = 0; - for (i = 0; i < max_cpus; i++) { - if (cpu_online(i)) - cpu_online_map |= (1ULL << i); + int err; + + err = cpu_info_init(); + if (err) { + hax_log(HAX_LOGE, "Failed to initialize CPU info\n"); + return err; } if (hax_module_init() < 0) { hax_log(HAX_LOGE, "Failed to initialize HAXM module\n"); + cpu_info_exit(); return -EAGAIN; } err = misc_register(&hax_dev); if (err) { hax_log(HAX_LOGE, "Failed to register HAXM device\n"); + cpu_info_exit(); hax_module_exit(); return err; } diff --git a/platforms/linux/hax_wrapper.c b/platforms/linux/hax_wrapper.c index 1e79d35d..ed392da5 100644 --- a/platforms/linux/hax_wrapper.c +++ b/platforms/linux/hax_wrapper.c @@ -40,9 +40,6 @@ #include -int max_cpus; -hax_cpumap_t cpu_online_map; - static const char* kLogLevel[] = { KERN_ERR, KERN_DEBUG, // HAX_LOGD @@ -84,27 +81,88 @@ void hax_panic(const char *fmt, ...) 
va_end(args); } -uint32_t hax_cpuid(void) +inline uint32_t hax_cpu_id(void) { - return smp_processor_id(); + return (uint32_t)smp_processor_id(); } -typedef struct smp_call_parameter { - void (*func)(void *); - void *param; - hax_cpumap_t *cpus; -} smp_call_parameter; - -static void smp_cfunction(void *p) +int cpu_info_init(void) { - struct smp_call_parameter *info = p; - hax_cpumap_t *cpus; - uint32_t cpuid; - - cpus = info->cpus; - cpuid = hax_cpuid(); - if (*cpus & (0x1 << cpuid)) - info->func(info->param); + uint32_t size_group, size_pos, cpu_id, group, bit; + hax_cpumap_t omap = {0}; + + memset(&cpu_online_map, 0, sizeof(cpu_online_map)); + + cpu_online_map.cpu_num = num_online_cpus(); + group = HAX_MAX_CPU_PER_GROUP; + cpu_online_map.group_num = (cpu_online_map.cpu_num + group - 1) / group; + size_group = cpu_online_map.group_num * sizeof(*cpu_online_map.cpu_map); + size_pos = cpu_online_map.cpu_num * sizeof(*cpu_online_map.cpu_pos); + + if (cpu_online_map.group_num > HAX_MAX_CPU_GROUP || + cpu_online_map.cpu_num > HAX_MAX_CPUS) { + hax_log(HAX_LOGE, "Too many cpus %d-%d in system\n", + cpu_online_map.cpu_num, cpu_online_map.group_num); + return -E2BIG; + } + + cpu_online_map.cpu_map = (hax_cpu_group_t *)hax_vmalloc(size_group, 0); + omap.cpu_map = (hax_cpu_group_t *)hax_vmalloc(size_group, 0); + if (!cpu_online_map.cpu_map || !omap.cpu_map) { + hax_log(HAX_LOGE, "Couldn't allocate cpu_map for cpu_online_map\n"); + goto fail_nomem; + } + + cpu_online_map.cpu_pos = (hax_cpu_pos_t *)hax_vmalloc(size_pos, 0); + omap.cpu_pos = (hax_cpu_pos_t *)hax_vmalloc(size_pos, 0); + if (!cpu_online_map.cpu_pos || !omap.cpu_pos) { + hax_log(HAX_LOGE, "Couldn't allocate cpu_pos for cpu_online_map\n"); + goto fail_nomem; + } + + // omap is filled for get_online_map() to init all host cpu info. + // Since smp_cfunction() will check if host cpu is online in cpu_online_map, + // but the first call to smp_cfunction() is to init cpu_online_map itself. + // Make smp_cfunction() always check group 0 bit 0 for get_online_map(), + // so get_online_map() assumes all online and init the real cpu_online_map. 
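+ // (Editor's note) Unlike Windows group-relative numbering, smp_processor_id() is assumed here to return consecutive ids starting at 0, so get_online_map() fills the groups consecutively (see commit message).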
+ omap.group_num = cpu_online_map.group_num; + omap.cpu_num = cpu_online_map.cpu_num; + for (cpu_id = 0; cpu_id < omap.cpu_num; cpu_id++) { + omap.cpu_pos[cpu_id].group = 0; + omap.cpu_pos[cpu_id].bit = 0; + } + for (group = 0; group < omap.group_num; group++) { + omap.cpu_map[group].id = 0; + omap.cpu_map[group].map = ~0ULL; + } + hax_smp_call_function(&omap, get_online_map, &cpu_online_map); + + for (group = 0; group < cpu_online_map.group_num; group++) { + cpu_online_map.cpu_map[group].num = 0; + for (bit = 0; bit < HAX_MAX_CPU_PER_GROUP; bit++) { + if (cpu_online_map.cpu_map[group].map & ((hax_cpumask_t)1 << bit)) + ++cpu_online_map.cpu_map[group].num; + } + } + + hax_vfree(omap.cpu_map, size_group); + hax_vfree(omap.cpu_pos, size_pos); + + hax_log(HAX_LOGI, "Host cpu init %d logical cpu(s) into %d group(s)\n", + cpu_online_map.cpu_num, cpu_online_map.group_num); + + return 0; + +fail_nomem: + if (cpu_online_map.cpu_map) + hax_vfree(cpu_online_map.cpu_map, size_group); + if (cpu_online_map.cpu_pos) + hax_vfree(cpu_online_map.cpu_pos, size_pos); + if (omap.cpu_map) + hax_vfree(omap.cpu_map, size_group); + if (omap.cpu_pos) + hax_vfree(omap.cpu_pos, size_pos); + return -ENOMEM; } int hax_smp_call_function(hax_cpumap_t *cpus, void (*scfunc)(void *), diff --git a/platforms/netbsd/hax_entry.c b/platforms/netbsd/hax_entry.c index b4a72afd..846ddf4c 100644 --- a/platforms/netbsd/hax_entry.c +++ b/platforms/netbsd/hax_entry.c @@ -120,7 +120,8 @@ static void hax_vcpu_attach(device_t parent, device_t self, void *aux) { struct hax_vcpu_softc *sc; - int unit, vm_id, cpu_id; + int unit, vm_id; + uint32_t vcpu_id; sc = device_private(self); if (sc == NULL) { @@ -130,13 +131,13 @@ hax_vcpu_attach(device_t parent, device_t self, void *aux) unit = device_unit(self); vm_id = unit2vmmid(unit); - cpu_id = unit2vcpuid(unit); + vcpu_id = unit2vcpuid(unit); sc->sc_dev = self; sc->vcpu = NULL; snprintf(self->dv_xname, sizeof self->dv_xname, "hax_vm%02d/vcpu%02d", - vm_id, cpu_id); + vm_id, vcpu_id); if (!pmf_device_register(self, NULL, NULL)) aprint_error_dev(self, "couldn't establish power handler\n"); @@ -216,23 +217,16 @@ MODULE(MODULE_CLASS_MISC, haxm, NULL); static int haxm_modcmd(modcmd_t cmd, void *arg __unused) { - struct cpu_info *ci; - CPU_INFO_ITERATOR cii; int err; size_t i; switch (cmd) { case MODULE_CMD_INIT: { // Initialization - max_cpus = 0; - - ci = NULL; - - for (CPU_INFO_FOREACH(cii, ci)) { - ++max_cpus; - if (!ISSET(ci->ci_schedstate.spc_flags, SPCF_OFFLINE)) { - cpu_online_map |= __BIT(cpu_index(ci)); - } + err = cpu_info_init(); + if (err) { + hax_log(HAX_LOGE, "Unable to init cpu info\n"); + goto init_err0; } // Register hax_vm @@ -327,6 +321,8 @@ haxm_modcmd(modcmd_t cmd, void *arg __unused) init_err2: config_cfdriver_detach(&hax_vm_cd); init_err1: + cpu_info_exit(); +init_err0: return ENXIO; } case MODULE_CMD_FINI: { diff --git a/platforms/netbsd/hax_entry_vcpu.c b/platforms/netbsd/hax_entry_vcpu.c index 02094072..b37b9e39 100644 --- a/platforms/netbsd/hax_entry_vcpu.c +++ b/platforms/netbsd/hax_entry_vcpu.c @@ -67,7 +67,8 @@ int hax_vcpu_open(dev_t self, int flag __unused, int mode __unused, struct vcpu_t *cvcpu; struct hax_vcpu_netbsd_t *vcpu; int ret; - int unit, vm_id, vcpu_id; + int unit, vm_id; + uint32_t vcpu_id; sc = device_lookup_private(&hax_vcpu_cd, minor(self)); if (sc == NULL) { diff --git a/platforms/netbsd/hax_wrapper.c b/platforms/netbsd/hax_wrapper.c index d62535b8..bcd31d3b 100644 --- a/platforms/netbsd/hax_wrapper.c +++ b/platforms/netbsd/hax_wrapper.c @@ -44,9 
+44,6 @@ #include "../../core/include/hax_core_interface.h" #include "../../core/include/ia32.h" -int max_cpus; -hax_cpumap_t cpu_online_map; - static const char* kLogPrefix[] = { "haxm: ", "haxm_debug: ", @@ -76,27 +73,96 @@ void hax_panic(const char *fmt, ...) va_end(args); } -uint32_t hax_cpuid(void) +inline uint32_t hax_cpu_id(void) { - return cpu_index(curcpu()); + return (uint32_t)cpu_number(); } -typedef struct smp_call_parameter { - void (*func)(void *); - void *param; - hax_cpumap_t *cpus; -} smp_call_parameter; - -static void smp_cfunction(void *a1, void *a2 __unused) +int cpu_info_init(void) { - struct smp_call_parameter *info = a1; - hax_cpumap_t *cpus; - uint32_t cpuid; - - cpus = info->cpus; - cpuid = hax_cpuid(); - if (*cpus & (0x1 << cpuid)) - info->func(info->param); + struct cpu_info *ci = NULL; + CPU_INFO_ITERATOR cii; + uint32_t size_group, size_pos, cpu_id, group, bit; + hax_cpumap_t omap = {0}; + + memset(&cpu_online_map, 0, sizeof(cpu_online_map)); + + cpu_online_map.cpu_num = 0; + for (CPU_INFO_FOREACH(cii, ci)) { + if (!ISSET(ci->ci_schedstate.spc_flags, SPCF_OFFLINE)) { + ++cpu_online_map.cpu_num; + } + } + + group = HAX_MAX_CPU_PER_GROUP; + cpu_online_map.group_num = (cpu_online_map.cpu_num + group - 1) / group; + size_group = cpu_online_map.group_num * sizeof(*cpu_online_map.cpu_map); + size_pos = cpu_online_map.cpu_num * sizeof(*cpu_online_map.cpu_pos); + + if (cpu_online_map.group_num > HAX_MAX_CPU_GROUP || + cpu_online_map.cpu_num > HAX_MAX_CPUS) { + hax_log(HAX_LOGE, "Too many cpus %d-%d in system\n", + cpu_online_map.cpu_num, cpu_online_map.group_num); + return -E2BIG; + } + + cpu_online_map.cpu_map = (hax_cpu_group_t *)hax_vmalloc(size_group, 0); + omap.cpu_map = (hax_cpu_group_t *)hax_vmalloc(size_group, 0); + if (!cpu_online_map.cpu_map || !omap.cpu_map) { + hax_log(HAX_LOGE, "Couldn't allocate cpu_map for cpu_online_map\n"); + goto fail_nomem; + } + + cpu_online_map.cpu_pos = (hax_cpu_pos_t *)hax_vmalloc(size_pos, 0); + omap.cpu_pos = (hax_cpu_pos_t *)hax_vmalloc(size_pos, 0); + if (!cpu_online_map.cpu_pos || !omap.cpu_pos) { + hax_log(HAX_LOGE, "Couldn't allocate cpu_pos for cpu_online_map\n"); + goto fail_nomem; + } + + // omap is filled for get_online_map() to init all host cpu info. + // Since smp_cfunction() will check if host cpu is online in cpu_online_map, + // but the first call to smp_cfunction() is to init cpu_online_map itself. + // Make smp_cfunction() always check group 0 bit 0 for get_online_map(), + // so get_online_map() assumes all online and init the real cpu_online_map. 
+ omap.group_num = cpu_online_map.group_num; + omap.cpu_num = cpu_online_map.cpu_num; + for (cpu_id = 0; cpu_id < omap.cpu_num; cpu_id++) { + omap.cpu_pos[cpu_id].group = 0; + omap.cpu_pos[cpu_id].bit = 0; + } + for (group = 0; group < omap.group_num; group++) { + omap.cpu_map[group].id = 0; + omap.cpu_map[group].map = ~0ULL; + } + hax_smp_call_function(&omap, get_online_map, &cpu_online_map); + + for (group = 0; group < cpu_online_map.group_num; group++) { + cpu_online_map.cpu_map[group].num = 0; + for (bit = 0; bit < HAX_MAX_CPU_PER_GROUP; bit++) { + if (cpu_online_map.cpu_map[group].map & ((hax_cpumask_t)1 << bit)) + ++cpu_online_map.cpu_map[group].num; + } + } + + hax_vfree(omap.cpu_map, size_group); + hax_vfree(omap.cpu_pos, size_pos); + + hax_log(HAX_LOGI, "Host cpu init %d logical cpu(s) into %d group(s)\n", + cpu_online_map.cpu_num, cpu_online_map.group_num); + + return 0; + +fail_nomem: + if (cpu_online_map.cpu_map) + hax_vfree(cpu_online_map.cpu_map, size_group); + if (cpu_online_map.cpu_pos) + hax_vfree(cpu_online_map.cpu_pos, size_pos); + if (omap.cpu_map) + hax_vfree(omap.cpu_map, size_group); + if (omap.cpu_pos) + hax_vfree(omap.cpu_pos, size_pos); + return -ENOMEM; } int hax_smp_call_function(hax_cpumap_t *cpus, void (*scfunc)(void *), @@ -108,7 +174,7 @@ int hax_smp_call_function(hax_cpumap_t *cpus, void (*scfunc)(void *), info.func = scfunc; info.param = param; info.cpus = cpus; - xc = xc_broadcast(XC_HIGHPRI, smp_cfunction, &info, NULL); + xc = xc_broadcast(XC_HIGHPRI, (xcfunc_t)smp_cfunction, &info, NULL); xc_wait(xc); return 0; } diff --git a/platforms/windows/hax_entry.c b/platforms/windows/hax_entry.c index f56335e8..74a0b440 100644 --- a/platforms/windows/hax_entry.c +++ b/platforms/windows/hax_entry.c @@ -54,24 +54,25 @@ DRIVER_UNLOAD HaxUnloadDriver; static int hax_host_init(void) { - int i, ret; - cpu_online_map = KeQueryActiveProcessors(); - - for (i = 0; i < (sizeof(ULONG_PTR) * 8); i++) - if (cpu_online_map & ((mword)0x1 << i)) - max_cpus = i; + int ret; - /* we get the max_cpus from real_cpus in darwin, so add 1 here */ - max_cpus++; + ret = cpu_info_init(); + if (ret < 0) { + hax_log(HAX_LOGE, "CPU info init failed\n"); + return ret; + } ret = smpc_dpc_init(); if (ret < 0) { + hax_log(HAX_LOGE, "SMPC DPC init failed\n"); + cpu_info_exit(); return ret; } if (hax_module_init() < 0) { hax_log(HAX_LOGE, "Hax module init failed\n"); smpc_dpc_exit(); + cpu_info_exit(); return -1; } diff --git a/platforms/windows/hax_wrapper.c b/platforms/windows/hax_wrapper.c index 4176a063..135c1180 100644 --- a/platforms/windows/hax_wrapper.c +++ b/platforms/windows/hax_wrapper.c @@ -31,21 +31,71 @@ #include "hax_win.h" #include "../../core/include/ia32.h" -int max_cpus; -hax_cpumap_t cpu_online_map; - -uint32_t hax_cpuid() +inline uint32_t hax_cpu_id(void) { - return KeGetCurrentProcessorNumber(); + PROCESSOR_NUMBER ProcNumber = {0}; + return (uint32_t)KeGetCurrentProcessorNumberEx(&ProcNumber); } -struct smp_call_parameter +int cpu_info_init(void) { - void (*func)(void *); - void *param; - /* Not used in DPC model*/ - hax_cpumap_t *cpus; -}; + uint32_t size_group, size_pos, count, group, bit; + + memset(&cpu_online_map, 0, sizeof(cpu_online_map)); + + cpu_online_map.group_num = KeQueryActiveGroupCount(); + cpu_online_map.cpu_num = KeQueryActiveProcessorCountEx(ALL_PROCESSOR_GROUPS); + if (cpu_online_map.group_num > HAX_MAX_CPU_GROUP || + cpu_online_map.cpu_num > HAX_MAX_CPUS) { + hax_log(HAX_LOGE, "Too many cpus %d-%d in system\n", + cpu_online_map.cpu_num, 
cpu_online_map.group_num); + return -1; + } + + size_group = cpu_online_map.group_num * sizeof(*cpu_online_map.cpu_map); + cpu_online_map.cpu_map = hax_vmalloc(size_group, 0); + if (!cpu_online_map.cpu_map) { + hax_log(HAX_LOGE, "Couldn't allocate cpu_map for cpu_online_map\n"); + return -1; + } + + size_pos = cpu_online_map.cpu_num * sizeof(*cpu_online_map.cpu_pos); + cpu_online_map.cpu_pos = hax_vmalloc(size_pos, 0); + if (!cpu_online_map.cpu_pos) { + hax_log(HAX_LOGE, "Couldn't allocate cpu_pos for cpu_online_map\n"); + hax_vfree(cpu_online_map.cpu_map, size_group); + return -1; + } + + count = 0; + for (group = 0; group < cpu_online_map.group_num; group++) { + cpu_online_map.cpu_map[group].map = (hax_cpumask_t)KeQueryGroupAffinity( + group); + cpu_online_map.cpu_map[group].id = group; + cpu_online_map.cpu_map[group].num = 0; + for (bit = 0; bit < HAX_MAX_CPU_PER_GROUP; bit++) { + if (cpu_online_map.cpu_map[group].map & ((hax_cpumask_t)1 << bit)) { + ++cpu_online_map.cpu_map[group].num; + cpu_online_map.cpu_pos[count].group = group; + cpu_online_map.cpu_pos[count].bit = bit; + ++count; + } + } + } + + if (count != cpu_online_map.cpu_num) { + hax_log(HAX_LOGE, "Active logical processor count(%d)-affinity(%d) " + "doesn't match\n", cpu_online_map.cpu_num, count); + hax_vfree(cpu_online_map.cpu_map, size_group); + hax_vfree(cpu_online_map.cpu_pos, size_pos); + return -1; + } + + hax_log(HAX_LOGI, "Host cpu init %d logical cpu(s) into %d group(s)\n", + cpu_online_map.cpu_num, cpu_online_map.group_num); + + return 0; +} #ifdef SMPC_DPCS KDEFERRED_ROUTINE smp_cfunction_dpc; @@ -59,15 +109,20 @@ void smp_cfunction_dpc( __in_opt PVOID SystemArgument1, __in_opt PVOID SystemArgument2) { + struct smp_call_parameter *p = (struct smp_call_parameter *)SystemArgument2; + void (*action)(void *param) = p->func; hax_cpumap_t *done; - void (*action)(void *parap); - struct smp_call_parameter *p; + uint32_t self, group, bit; - p = (struct smp_call_parameter *)SystemArgument2; - done = (hax_cpumap_t*)SystemArgument1; - action = p->func; action(p->param); - hax_test_and_set_bit(hax_cpuid(), (uint64_t*)done); + + // We only use hax_cpumap_t.hax_cpu_pos_t to mark done or not + done = (hax_cpumap_t*)SystemArgument1; + self = hax_cpu_id(); + group = self / HAX_MAX_CPU_PER_GROUP; + bit = self % HAX_MAX_CPU_PER_GROUP; + done->cpu_pos[self].group = group; + done->cpu_pos[self].bit = bit; } /* IPI function is not exported to in XP, we use DPC to trigger the smp @@ -80,43 +135,54 @@ void smp_cfunction_dpc( */ int hax_smp_call_function(hax_cpumap_t *cpus, void (*scfunc)(void *), void * param) { - int i, self; + uint32_t cpu_id, self, group, bit, size_pos; BOOLEAN result; struct _KDPC *cur_dpc; - hax_cpumap_t done; + hax_cpumap_t done = {0}; struct smp_call_parameter *sp; KIRQL old_irql; LARGE_INTEGER delay; NTSTATUS event_result; + int err = 0; - self = hax_cpuid(); + self = hax_cpu_id(); + group = self / HAX_MAX_CPU_PER_GROUP; + bit = self % HAX_MAX_CPU_PER_GROUP; - done = 0; + size_pos = cpu_online_map.cpu_num * sizeof(*cpu_online_map.cpu_pos); + done.cpu_pos = hax_vmalloc(size_pos, 0); + if (!done.cpu_pos) { + hax_log(HAX_LOGE, "Couldn't allocate done to check SMP DPC done\n"); + return -1; + } + memset(done.cpu_pos, 0xFF, size_pos); event_result = KeWaitForSingleObject(&dpc_event, Executive, KernelMode, FALSE, NULL); if (event_result!= STATUS_SUCCESS) { hax_log(HAX_LOGE, "Failed to get the smp_call event object\n"); + hax_vfree(done.cpu_pos, size_pos); return -1; } - if (((mword)1 << self) & *cpus) { + if 
(cpu_is_online(cpus, self)) { KeRaiseIrql(DISPATCH_LEVEL, &old_irql); (scfunc)(param); - done |= ((mword)1 << self); + done.cpu_pos[self].group = group; + done.cpu_pos[self].bit = bit; KeLowerIrql(old_irql); } - for (i = 0; i < max_cpus; i++) { - if (!cpu_is_online(i) || (i == self)) + for (cpu_id = 0; cpu_id < cpu_online_map.cpu_num; cpu_id++) { + if (!cpu_is_online(&cpu_online_map, cpu_id) || (cpu_id == self)) continue; - sp = smp_cp + i; + sp = smp_cp + cpu_id; sp->func = scfunc; sp->param = param; - cur_dpc = smpc_dpcs + i; + cur_dpc = smpc_dpcs + cpu_id; result = KeInsertQueueDpc(cur_dpc, &done, sp); if (result != TRUE) - hax_log(HAX_LOGE, "Failed to insert queue on CPU %x\n", i); + hax_log(HAX_LOGE, "Failed to insert queue on CPU %x\n", cpu_id); } /* Delay 100 ms */ @@ -124,32 +190,36 @@ int hax_smp_call_function(hax_cpumap_t *cpus, void (*scfunc)(void *), void * par if (KeDelayExecutionThread( KernelMode, TRUE, &delay ) != STATUS_SUCCESS) hax_log(HAX_LOGE, "Delay execution is not success\n"); - if (done != *cpus) + if (memcmp(done.cpu_pos, cpu_online_map.cpu_pos, size_pos)) { + err = -1; hax_log(HAX_LOGE, "sm call function is not called in all required CPUs\n"); + } KeSetEvent(&dpc_event, 0, FALSE); - return (done != *cpus) ? -1 :0; + hax_vfree(done.cpu_pos, size_pos); + + return err; } int smpc_dpc_init(void) { struct _KDPC *cur_dpc; - int i; + uint32_t cpu_id; - smpc_dpcs = hax_vmalloc(sizeof(KDPC) * max_cpus, 0); + smpc_dpcs = hax_vmalloc(sizeof(KDPC) * cpu_online_map.cpu_num, 0); if (!smpc_dpcs) return -ENOMEM; - smp_cp = hax_vmalloc(sizeof(struct smp_call_parameter) * max_cpus, 0); + smp_cp = hax_vmalloc(sizeof(struct smp_call_parameter) * cpu_online_map.cpu_num, 0); if (!smp_cp) { - hax_vfree(smpc_dpcs, sizeof(KDPC) * max_cpus); + hax_vfree(smpc_dpcs, sizeof(KDPC) * cpu_online_map.cpu_num); return -ENOMEM; } cur_dpc = smpc_dpcs; - for (i = 0; i < max_cpus; i++) { + for (cpu_id = 0; cpu_id < cpu_online_map.cpu_num; cpu_id++) { KeInitializeDpc(cur_dpc, smp_cfunction_dpc, NULL); - KeSetTargetProcessorDpc(cur_dpc, i); + KeSetTargetProcessorDpc(cur_dpc, cpu_id); /* Set the DPC as high important, so that we loop too long */ KeSetImportanceDpc(cur_dpc, HighImportance); cur_dpc++; @@ -160,34 +230,20 @@ smpc_dpc_init(void) int smpc_dpc_exit(void) { - hax_vfree(smpc_dpcs, sizeof(KDPC) * max_cpus); - hax_vfree(smp_cp, sizeof(KDPC) * max_cpus); + hax_vfree(smpc_dpcs, sizeof(KDPC) * cpu_online_map.cpu_num); + hax_vfree(smp_cp, sizeof(struct smp_call_parameter) * cpu_online_map.cpu_num); return 0; } #else -/* This is the only function that in DIRQL */ -static ULONG_PTR smp_cfunction(ULONG_PTR param) -{ - int cpu_id; - void (*action)(void *parap) ; - hax_cpumap_t *hax_cpus; - struct smp_call_parameter *p; - - p = (struct smp_call_parameter *)param; - cpu_id = hax_cpuid(); - action = p->func; - hax_cpus = p->cpus; - if (*hax_cpus & ((mword)1 << cpu_id)) - action(p->param); - return (ULONG_PTR)NULL; -} +// A driver calls KeIpiGenericCall to interrupt every processor and raises +// the IRQL to IPI_LEVEL, which is greater than DIRQL for every device. int hax_smp_call_function(hax_cpumap_t *cpus, void (*scfunc)(void *), void * param) { struct smp_call_parameter sp; sp.func = scfunc; sp.param = param; sp.cpus = cpus; - KeIpiGenericCall(smp_cfunction, (ULONG_PTR)&sp); + KeIpiGenericCall((PKIPI_BROADCAST_WORKER)smp_cfunction, (ULONG_PTR)&sp); return 0; } #endif
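As a usage note (editor's sketch, not part of the patch; say_hello() and greet_all_cpus() are hypothetical): with either the DPC path or the KeIpiGenericCall path, callers now pass the target map explicitly, and the unified smp_cfunction() runs the callback only where cpu_is_online() agrees:

    /* Hypothetical broadcast mirroring the invept()/hax_pmu_init() call
     * sites in this patch. */
    static void say_hello(void *param)
    {
        (void)param;    /* runs at raised IRQL on each online CPU */
        hax_log(HAX_LOGI, "[#%d] hello\n", hax_cpu_id());
    }

    void greet_all_cpus(void)
    {
        hax_smp_call_function(&cpu_online_map, say_hello, NULL);
    }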