diff --git a/core/cpu.c b/core/cpu.c
index 050cc169..5f82fd7c 100644
--- a/core/cpu.c
+++ b/core/cpu.c
@@ -87,6 +87,8 @@ void cpu_init_vmx(void *arg)
 
     cpu_data = current_cpu_data();
 
+    hax_log(HAX_LOGD, "[#%d] cpu_init_vmx\n", cpu_data->cpu_id);
+
     cpu_data->cpu_features |= HAX_CPUF_VALID;
     if (!cpu_has_feature(X86_FEATURE_VMX))
         return;
@@ -158,6 +160,7 @@ void cpu_init_vmx(void *arg)
 
 void cpu_exit_vmx(void *arg)
 {
+    hax_log(HAX_LOGD, "[#%d] cpu_exit_vmx\n", current_cpu_data()->cpu_id);
 }
 
 /*
@@ -170,6 +173,8 @@ void cpu_pmu_init(void *arg)
     struct cpu_pmu_info *pmu_info = &current_cpu_data()->pmu_info;
     cpuid_args_t cpuid_args;
 
+    hax_log(HAX_LOGD, "[#%d] cpu_pmu_init\n", current_cpu_data()->cpu_id);
+
     memset(pmu_info, 0, sizeof(struct cpu_pmu_info));
 
     // Call CPUID with EAX = 0
@@ -405,8 +410,8 @@ int cpu_vmx_execute(struct vcpu_t *vcpu, struct hax_tunnel *htun)
         }
 
         exit_reason.raw = vmread(vcpu, VM_EXIT_INFO_REASON);
-        hax_log(HAX_LOGD, "....exit_reason.raw %x, cpu %d %d\n",
-                exit_reason.raw, vcpu->cpu_id, hax_cpuid());
+        hax_log(HAX_LOGD, "....exit_reason.raw %x, vcpu %d, cpu %d\n",
+                exit_reason.raw, vcpu->cpu_id, hax_cpu_id());
 
         /* XXX Currently we take active save/restore for MSR and FPU, the main
          * reason is, we have no schedule hook to get notified of preemption
@@ -559,7 +564,7 @@ uint32_t load_vmcs(struct vcpu_t *vcpu, preempt_flag *flags)
         vcpu->is_vmcs_loaded = 1;
         cpu_data->current_vcpu = vcpu;
         vcpu->prev_cpu_id = vcpu->cpu_id;
-        vcpu->cpu_id = hax_cpuid();
+        vcpu->cpu_id = hax_cpu_id();
     }
 
     cpu_data->other_vmcs = curr_vmcs;
@@ -669,23 +674,27 @@ vmx_result_t cpu_vmxroot_leave(void)
     struct per_cpu_data *cpu_data = current_cpu_data();
     vmx_result_t result = VMX_SUCCEED;
 
+    hax_log(HAX_LOGD, "[#%d] cpu_vmxroot_leave\n", cpu_data->cpu_id);
+
     if (cpu_data->vmm_flag & VMXON_HAX) {
         result = asm_vmxoff();
         if (result == VMX_SUCCEED) {
             cpu_data->vmm_flag &= ~VMXON_HAX;
             restore_host_cr4_vmxe(cpu_data);
         } else {
-            hax_log(HAX_LOGE, "VMXOFF Failed..........\n");
+            hax_log(HAX_LOGE, "[#%d] VMXOFF Failed..........\n",
+                    cpu_data->cpu_id);
         }
     } else {
         log_vmxoff_no = 1;
 #ifdef HAX_PLATFORM_DARWIN
-        hax_log(HAX_LOGD, "Skipping VMXOFF because another VMM (VirtualBox or "
-                "macOS Hypervisor Framework) is running\n");
+        hax_log(HAX_LOGD, "[#%d] Skipping VMXOFF because another VMM "
+                "(VirtualBox or macOS Hypervisor Framework) is running\n",
+                cpu_data->cpu_id);
 #else
         // It should not go here in Win64/win32
         result = VMX_FAIL_VALID;
-        hax_log(HAX_LOGE, "NO VMXOFF.......\n");
+        hax_log(HAX_LOGE, "[#%d] NO VMXOFF.......\n", cpu_data->cpu_id);
 #endif
     }
     cpu_data->vmxoff_res = result;
@@ -700,11 +709,14 @@ vmx_result_t cpu_vmxroot_enter(void)
     hax_paddr_t vmxon_addr;
     vmx_result_t result = VMX_SUCCEED;
 
+    hax_log(HAX_LOGD, "[#%d] cpu_vmxroot_enter\n", cpu_data->cpu_id);
+
     cpu_data->host_cr4_vmxe = (get_cr4() & CR4_VMXE);
     if (cpu_data->host_cr4_vmxe) {
         if (debug_vmcs_count % 100000 == 0) {
-            hax_log(HAX_LOGD, "host VT has enabled!\n");
-            hax_log(HAX_LOGD, "Cr4 value = 0x%lx\n", get_cr4());
+            hax_log(HAX_LOGD, "[#%d] host VT is enabled!\n", cpu_data->cpu_id);
+            hax_log(HAX_LOGD, "[#%d] Cr4 value = 0x%lx\n",
+                    cpu_data->cpu_id, get_cr4());
             log_host_cr4_vmxe = 1;
             log_host_cr4 = get_cr4();
         }
@@ -766,9 +778,10 @@ vmx_result_t cpu_vmxroot_enter(void)
 #endif
 
     if (fatal) {
-        hax_log(HAX_LOGE, "VMXON failed for region 0x%llx (result=0x%x, "
-                "vmxe=%x)\n", hax_page_pa(cpu_data->vmxon_page),
-                (uint32_t)result, (uint32_t)cpu_data->host_cr4_vmxe);
+        hax_log(HAX_LOGE, "[#%d] VMXON failed for region 0x%llx "
+                "(result=0x%x, vmxe=%x)\n",
cpu_data->cpu_id, + hax_page_pa(cpu_data->vmxon_page), (uint32_t)result, + (uint32_t)cpu_data->host_cr4_vmxe); restore_host_cr4_vmxe(cpu_data); if (result == VMX_FAIL_INVALID) { log_vmxon_err_type1 = 1; diff --git a/core/ept.c b/core/ept.c index cc6d3276..c6037ff3 100644 --- a/core/ept.c +++ b/core/ept.c @@ -335,6 +335,8 @@ static void invept_smpfunc(struct invept_bundle *bundle) cpu_data = current_cpu_data(); cpu_data->invept_res = VMX_SUCCEED; + hax_log(HAX_LOGD, "[#%d] invept_smpfunc\n", cpu_data->cpu_id); + cpu_vmxroot_enter(); if (cpu_data->vmxon_res == VMX_SUCCEED) { @@ -348,8 +350,7 @@ void invept(hax_vm_t *hax_vm, uint type) uint64_t eptp_value = vm_get_eptp(hax_vm); struct invept_desc desc = { eptp_value, 0 }; struct invept_bundle bundle; - int cpu_id; - uint32_t res; + uint32_t cpu_id, res; if (!ept_has_cap(ept_cap_invept)) { hax_log(HAX_LOGW, "INVEPT was not called due to missing host support" @@ -384,10 +385,10 @@ void invept(hax_vm_t *hax_vm, uint type) * especially on macOS; instead, invept_smpfunc() writes VMX instruction * results in hax_cpu_data[], which are checked below. */ - for (cpu_id = 0; cpu_id < max_cpus; cpu_id++) { + for (cpu_id = 0; cpu_id < cpu_online_map.cpu_num; cpu_id++) { struct per_cpu_data *cpu_data; - if (!cpu_is_online(cpu_id)) { + if (!cpu_is_online(&cpu_online_map, cpu_id)) { continue; } cpu_data = hax_cpu_data[cpu_id]; @@ -399,17 +400,17 @@ void invept(hax_vm_t *hax_vm, uint type) res = (uint32_t)cpu_data->vmxon_res; if (res != VMX_SUCCEED) { - hax_log(HAX_LOGE, "[Processor #%d] INVEPT was not called, because " + hax_log(HAX_LOGE, "[#%d] INVEPT was not called, because " "VMXON failed (err=0x%x)\n", cpu_id, res); } else { res = (uint32_t)cpu_data->invept_res; if (res != VMX_SUCCEED) { - hax_log(HAX_LOGE, "[Processor #%d] INVEPT failed (err=0x%x)\n", + hax_log(HAX_LOGE, "[#%d] INVEPT failed (err=0x%x)\n", cpu_id, res); } res = (uint32_t)cpu_data->vmxoff_res; if (res != VMX_SUCCEED) { - hax_log(HAX_LOGE, "[Processor #%d] INVEPT was called, but " + hax_log(HAX_LOGE, "[#%d] INVEPT was called, but " "VMXOFF failed (err=0x%x)\n", cpu_id, res); } } diff --git a/core/hax.c b/core/hax.c index cf9a7400..995aaf51 100644 --- a/core/hax.c +++ b/core/hax.c @@ -55,6 +55,7 @@ struct hax_page *io_bitmap_page_a; struct hax_page *io_bitmap_page_b; struct hax_page *msr_bitmap_page; +hax_cpumap_t cpu_online_map; struct per_cpu_data **hax_cpu_data; struct hax_t *hax; @@ -71,25 +72,25 @@ static void hax_disable_vmx(void) static void free_cpu_vmxon_region(void) { - int cpu; + uint32_t cpu_id; - for (cpu = 0; cpu < max_cpus; cpu++) { - if (!cpu_is_online(cpu) || !hax_cpu_data[cpu]) + for (cpu_id = 0; cpu_id < cpu_online_map.cpu_num; cpu_id++) { + if (!cpu_is_online(&cpu_online_map, cpu_id) || !hax_cpu_data[cpu_id]) continue; - if (hax_cpu_data[cpu]->vmxon_page) { - hax_free_pages(hax_cpu_data[cpu]->vmxon_page); - hax_cpu_data[cpu]->vmxon_page = NULL; + if (hax_cpu_data[cpu_id]->vmxon_page) { + hax_free_pages(hax_cpu_data[cpu_id]->vmxon_page); + hax_cpu_data[cpu_id]->vmxon_page = NULL; } } } static int alloc_cpu_vmxon_region(void) { - int cpu; + uint32_t cpu_id; struct hax_page *page; - for (cpu = 0; cpu < max_cpus; cpu++) { - if (!cpu_is_online(cpu) || !hax_cpu_data[cpu]) + for (cpu_id = 0; cpu_id < cpu_online_map.cpu_num; cpu_id++) { + if (!cpu_is_online(&cpu_online_map, cpu_id) || !hax_cpu_data[cpu_id]) continue; page = hax_alloc_page(0, 1); if (!page) { @@ -97,32 +98,32 @@ static int alloc_cpu_vmxon_region(void) return -ENOMEM; } hax_clear_page(page); - 
hax_cpu_data[cpu]->vmxon_page = page; + hax_cpu_data[cpu_id]->vmxon_page = page; } return 0; } void free_cpu_template_vmcs(void) { - int cpu; + uint32_t cpu_id; - for (cpu = 0; cpu < max_cpus; cpu++) { - if (!cpu_is_online(cpu) || !hax_cpu_data[cpu]) + for (cpu_id = 0; cpu_id < cpu_online_map.cpu_num; cpu_id++) { + if (!cpu_is_online(&cpu_online_map, cpu_id) || !hax_cpu_data[cpu_id]) continue; - if (hax_cpu_data[cpu]->vmcs_page) { - hax_free_pages(hax_cpu_data[cpu]->vmcs_page); - hax_cpu_data[cpu]->vmcs_page = NULL; + if (hax_cpu_data[cpu_id]->vmcs_page) { + hax_free_pages(hax_cpu_data[cpu_id]->vmcs_page); + hax_cpu_data[cpu_id]->vmcs_page = NULL; } } } static int alloc_cpu_template_vmcs(void) { - int cpu; + uint32_t cpu_id; struct hax_page *page = NULL; - for (cpu = 0; cpu < max_cpus; cpu++) { - if (!cpu_is_online(cpu) || !hax_cpu_data[cpu]) + for (cpu_id = 0; cpu_id < cpu_online_map.cpu_num; cpu_id++) { + if (!cpu_is_online(&cpu_online_map, cpu_id) || !hax_cpu_data[cpu_id]) continue; page = (struct hax_page *)hax_alloc_page(0, 1); if (!page) { @@ -130,7 +131,7 @@ static int alloc_cpu_template_vmcs(void) return -ENOMEM; } hax_clear_page(page); - hax_cpu_data[cpu]->vmcs_page = page; + hax_cpu_data[cpu_id]->vmcs_page = page; } return 0; } @@ -140,6 +141,29 @@ int hax_em64t_enabled(void) return hax->em64t_enable_flag; } +#ifdef HAX_PLATFORM_DARWIN +hax_smp_func_ret_t smp_cfunction(void *param) +#endif +#ifdef HAX_PLATFORM_LINUX +hax_smp_func_ret_t smp_cfunction(void *param) +#endif +#ifdef HAX_PLATFORM_NETBSD +hax_smp_func_ret_t smp_cfunction(void *param, void *a2 __unused) +#endif +#ifdef HAX_PLATFORM_WINDOWS +hax_smp_func_ret_t smp_cfunction(void *param) +#endif +{ + struct smp_call_parameter *p = (struct smp_call_parameter *)param; + void (*action)(void *parap) = p->func; + hax_cpumap_t *hax_cpus = p->cpus; + + if (cpu_is_online(hax_cpus, hax_cpu_id())) + action(p->param); + + return (hax_smp_func_ret_t)NULL; +} + /* * This vcpu_data should not be accessed by anyone else at this step. * Return 0 if can continue, <0 for error. @@ -147,14 +171,15 @@ int hax_em64t_enabled(void) static int hax_vmx_enable_check(void) { int vts = 0, nxs = 0, vte = 0, nxe = 0, em64s = 0, em64e = 0, finished = 0; - int cpu, tnum = 0, error = 0; + int tnum = 0, error = 0; + uint32_t cpu_id; - for (cpu = 0; cpu < max_cpus; cpu++) { + for (cpu_id = 0; cpu_id < cpu_online_map.cpu_num; cpu_id++) { struct per_cpu_data *cpu_data; - if (!cpu_is_online(cpu)) + if (!cpu_is_online(&cpu_online_map, cpu_id)) continue; - cpu_data = hax_cpu_data[cpu]; + cpu_data = hax_cpu_data[cpu_id]; // This should not happen ! if (!cpu_data) continue; @@ -412,8 +437,7 @@ static void set_msr_access(uint32_t start, uint32_t count, bool read, bool write */ static void hax_pmu_init(void) { - int cpu_id; - int ref_cpu_id = -1; + uint32_t cpu_id, ref_cpu_id = (uint32_t)(~0ULL); // Execute cpu_pmu_init() on each logical processor of the host CPU hax_smp_call_function(&cpu_online_map, cpu_pmu_init, NULL); @@ -421,11 +445,11 @@ static void hax_pmu_init(void) // Find the common APM version supported by all host logical processors // TODO: Theoretically we should do the same for other APM parameters // (number of counters, etc.) 
as well - for (cpu_id = 0; cpu_id < max_cpus; cpu_id++) { + for (cpu_id = 0; cpu_id < cpu_online_map.cpu_num; cpu_id++) { struct per_cpu_data *cpu_data; uint apm_version; - if (!cpu_is_online(cpu_id)) { + if (!cpu_is_online(&cpu_online_map, cpu_id)) { continue; } cpu_data = hax_cpu_data[cpu_id]; @@ -512,7 +536,7 @@ static void hax_pmu_init(void) int hax_module_init(void) { - int ret = 0, cpu = 0; + uint32_t cpu_id; hax = (struct hax_t *)hax_vmalloc(sizeof(struct hax_t), HAX_MEM_NONPAGE); if (!hax) @@ -523,29 +547,28 @@ int hax_module_init(void) if (!hax->hax_lock) goto out_0; - hax_cpu_data = hax_vmalloc(max_cpus * sizeof(void *), 0); + hax_cpu_data = hax_vmalloc(cpu_online_map.cpu_num * sizeof(void *), 0); if (!hax_cpu_data) goto out_1; - memset(hax_cpu_data, 0, max_cpus * sizeof(void *)); + memset(hax_cpu_data, 0, cpu_online_map.cpu_num * sizeof(void *)); - for (cpu = 0; cpu < max_cpus; cpu++) { - if (!cpu_is_online(cpu)) + for (cpu_id = 0; cpu_id < cpu_online_map.cpu_num; cpu_id++) { + if (!cpu_is_online(&cpu_online_map, cpu_id)) continue; - hax_cpu_data[cpu] = hax_vmalloc(sizeof(struct per_cpu_data), 0); - if (!hax_cpu_data[cpu]) + hax_cpu_data[cpu_id] = hax_vmalloc(sizeof(struct per_cpu_data), 0); + if (!hax_cpu_data[cpu_id]) goto out_2; - memset(hax_cpu_data[cpu], 0, sizeof(struct per_cpu_data)); + memset(hax_cpu_data[cpu_id], 0, sizeof(struct per_cpu_data)); - hax_cpu_data[cpu]->hstate.hfxpage = + hax_cpu_data[cpu_id]->hstate.hfxpage = (struct hax_page *)hax_alloc_page(0, 1); - if (!hax_cpu_data[cpu]->hstate.hfxpage) + if (!hax_cpu_data[cpu_id]->hstate.hfxpage) goto out_2; - hax_clear_page(hax_cpu_data[cpu]->hstate.hfxpage); - hax_cpu_data[cpu]->cpu_id = cpu; + hax_clear_page(hax_cpu_data[cpu_id]->hstate.hfxpage); + hax_cpu_data[cpu_id]->cpu_id = cpu_id; } cpu_init_feature_cache(); - ret = hax_vmx_init(); - if (ret < 0) + if (hax_vmx_init() < 0) goto out_2; hax_pmu_init(); @@ -557,15 +580,15 @@ int hax_module_init(void) return 0; out_2: - for (cpu = 0; cpu < max_cpus; cpu++) { - if (hax_cpu_data[cpu]) { - if (hax_cpu_data[cpu]->hstate.hfxpage) { - hax_free_pages(hax_cpu_data[cpu]->hstate.hfxpage); + for (cpu_id = 0; cpu_id < cpu_online_map.cpu_num; cpu_id++) { + if (hax_cpu_data[cpu_id]) { + if (hax_cpu_data[cpu_id]->hstate.hfxpage) { + hax_free_pages(hax_cpu_data[cpu_id]->hstate.hfxpage); } - hax_vfree(hax_cpu_data[cpu], sizeof(struct per_cpu_data)); + hax_vfree(hax_cpu_data[cpu_id], sizeof(struct per_cpu_data)); } } - hax_vfree(hax_cpu_data, max_cpus * sizeof(void *)); + hax_vfree(hax_cpu_data, cpu_online_map.cpu_num * sizeof(void *)); out_1: hax_mutex_free(hax->hax_lock); out_0: @@ -575,7 +598,8 @@ int hax_module_init(void) int hax_module_exit(void) { - int i, ret; + int ret; + uint32_t cpu_id; if (!hax_list_empty(&hax->hax_vmlist)) { hax_log(HAX_LOGE, "Still VM not be destroyed?\n"); @@ -587,15 +611,16 @@ int hax_module_exit(void) return ret; hax_vmx_exit(); - for (i = 0; i < max_cpus; i++) { - if (!hax_cpu_data[i]) + for (cpu_id = 0; cpu_id < cpu_online_map.cpu_num; cpu_id++) { + if (!hax_cpu_data[cpu_id]) continue; - if (hax_cpu_data[i]->hstate.hfxpage) { - hax_free_pages(hax_cpu_data[i]->hstate.hfxpage); + if (hax_cpu_data[cpu_id]->hstate.hfxpage) { + hax_free_pages(hax_cpu_data[cpu_id]->hstate.hfxpage); } - hax_vfree(hax_cpu_data[i], sizeof(struct per_cpu_data)); + hax_vfree(hax_cpu_data[cpu_id], sizeof(struct per_cpu_data)); } - hax_vfree(hax_cpu_data, max_cpus * sizeof(void *)); + hax_vfree(hax_cpu_data, cpu_online_map.cpu_num * sizeof(void *)); + cpu_info_exit(); 
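+    /* NOTE: cpu_info_exit() has to stay after the per-CPU teardown above,
+     * which still reads cpu_online_map.cpu_num for its loop bound and for
+     * the hax_vfree() sizes. */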
hax_mutex_free(hax->hax_lock);
     hax_vfree(hax, sizeof(struct hax_t));
 
     hax_log(HAX_LOGW, "-------- HAXM v%s End --------\n",
diff --git a/core/include/cpu.h b/core/include/cpu.h
index 988bb3c4..1db58557 100644
--- a/core/include/cpu.h
+++ b/core/include/cpu.h
@@ -44,8 +44,6 @@
 struct vcpu_t;
 struct vcpu_state_t;
 
-typedef uint32_t hax_cpuid_t;  // CPU identifier
-
 #define NR_HMSR 6
 
 struct hstate {
@@ -103,7 +101,7 @@ struct per_cpu_data {
     struct hax_page *vmcs_page;
     struct vcpu_t *current_vcpu;
     hax_paddr_t other_vmcs;
-    hax_cpuid_t cpu_id;
+    uint32_t cpu_id;
     uint16_t vmm_flag;
     uint16_t nested;
     mword host_cr4_vmxe;
@@ -157,8 +155,7 @@ struct per_cpu_data {
 extern struct per_cpu_data ** hax_cpu_data;
 static struct per_cpu_data * current_cpu_data(void)
 {
-    uint32_t cpu_id = hax_cpuid();
-    return hax_cpu_data[cpu_id];
+    return hax_cpu_data[hax_cpu_id()];
 }
 
 static struct per_cpu_data * get_cpu_data(uint32_t cpu_id)
diff --git a/core/include/vcpu.h b/core/include/vcpu.h
index c81349b7..dc103dcd 100644
--- a/core/include/vcpu.h
+++ b/core/include/vcpu.h
@@ -146,9 +146,9 @@ struct mmio_fetch_cache {
 
 struct vcpu_t {
     uint16_t vcpu_id;
-    uint16_t cpu_id;
+    uint32_t cpu_id;
     // Sometimes current thread might be migrated to other core.
-    uint16_t prev_cpu_id;
+    uint32_t prev_cpu_id;
     /*
      * VPID: Virtual Processor Identifier
      * VPIDs provide a way for software to identify to the processor
diff --git a/core/vcpu.c b/core/vcpu.c
index d0a145bc..c648a27d 100644
--- a/core/vcpu.c
+++ b/core/vcpu.c
@@ -127,7 +127,7 @@ static uint32_t get_seg_present(uint32_t seg)
 {
     mword ldtr_base;
     struct seg_desc_t *seg_desc;
-    struct hstate *hstate = &get_cpu_data(hax_cpuid())->hstate;
+    struct hstate *hstate = &get_cpu_data(hax_cpu_id())->hstate;
 
     ldtr_base = get_kernel_ldtr_base();
     seg_desc = (struct seg_desc_t *)ldtr_base + (seg >> 3);
@@ -460,8 +460,8 @@ struct vcpu_t *vcpu_create(struct vm_t *vm, void *vm_host, int vcpu_id)
     if (hax_vcpu_create_host(vcpu, vm_host, vm->vm_id, vcpu_id))
         goto fail_7;
 
-    vcpu->prev_cpu_id = -1;
-    vcpu->cpu_id = hax_cpuid();
+    vcpu->prev_cpu_id = (uint32_t)(~0ULL);
+    vcpu->cpu_id = hax_cpu_id();
     vcpu->vcpu_id = vcpu_id;
     vcpu->is_running = 0;
     vcpu->vm = vm;
@@ -1516,7 +1516,7 @@ static void fill_common_vmcs(struct vcpu_t *vcpu)
 static void vcpu_prepare(struct vcpu_t *vcpu)
 {
     hax_log(HAX_LOGD, "vcpu_prepare current %x, CPU %x\n", vcpu->vcpu_id,
-            hax_cpuid());
+            hax_cpu_id());
     hax_mutex_lock(vcpu->tmutex);
     fill_common_vmcs(vcpu);
     hax_mutex_unlock(vcpu->tmutex);
@@ -4320,8 +4320,17 @@ int vcpu_interrupt(struct vcpu_t *vcpu, uint8_t vector)
 }
 
 // Simply to cause vmexit to vcpu, if any vcpu is running on this physical CPU
-static void _vcpu_take_off(void *unused)
+static void _vcpu_take_off(void *param)
 {
+    hax_cpu_pos_t *target = (hax_cpu_pos_t *)param;
+
+    hax_log(HAX_LOGD, "[#%d] _vcpu_take_off\n", current_cpu_data()->cpu_id);
+    if (target)
+        hax_log(HAX_LOGD, "_vcpu_take_off on cpu (group-%d bit-%d)\n",
+                target->group, target->bit);
+    else
+        hax_log(HAX_LOGD, "_vcpu_take_off on all cpus\n");
+
     return;
 }
 
@@ -4342,16 +4351,16 @@ int vcpu_pause(struct vcpu_t *vcpu)
 
 int vcpu_takeoff(struct vcpu_t *vcpu)
 {
-    int cpu_id;
-    hax_cpumap_t targets;
+    uint32_t cpu_id;
+    hax_cpu_pos_t target = {0};
 
     // Don't change the sequence unless you are sure
     if (vcpu->is_running) {
         cpu_id = vcpu->cpu_id;
-        hax_assert(cpu_id != hax_cpuid());
-        targets = cpu2cpumap(cpu_id);
+        hax_assert(cpu_id != hax_cpu_id());
+        cpu2cpumap(cpu_id, &target);
         // If not considering Windows XP, definitely we don't need this
-        hax_smp_call_function(&targets, _vcpu_take_off, NULL);
+        hax_smp_call_function(&cpu_online_map, _vcpu_take_off, &target);
     }
 
     return 0;
diff --git a/include/darwin/hax_mac.h b/include/darwin/hax_mac.h
index 45a0ba7f..c2356a5d 100644
--- a/include/darwin/hax_mac.h
+++ b/include/darwin/hax_mac.h
@@ -171,21 +171,4 @@ static inline errno_t memcpy_s(void *dest, size_t destsz, const void *src,
 
 #define hax_assert(condition) assert(condition)
 
-static inline bool cpu_is_online(int cpu)
-{
-    if (cpu < 0 || cpu >= max_cpus)
-        return 0;
-    return !!(((uint64_t)1 << cpu) & cpu_online_map);
-}
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-extern int cpu_number(void);
-
-#ifdef __cplusplus
-}
-#endif
-
 #endif  // HAX_DARWIN_HAX_MAC_H_
diff --git a/include/darwin/hax_types_mac.h b/include/darwin/hax_types_mac.h
index 4f122f24..6af2505c 100644
--- a/include/darwin/hax_types_mac.h
+++ b/include/darwin/hax_types_mac.h
@@ -132,12 +132,9 @@ typedef struct hax_kmap_phys {
 typedef ulong mword;
 typedef mword preempt_flag;
-typedef uint64_t hax_cpumap_t;
+typedef uint64_t hax_cpumask_t;
+typedef void hax_smp_func_ret_t;
 typedef uint64_t HAX_VADDR_T;
 
-static inline hax_cpumap_t cpu2cpumap(int cpu)
-{
-    return (0x1UL << cpu);
-}
-
 #endif  // CONFIG_KERNEL_HAX
 
 #endif  // HAX_DARWIN_HAX_TYPES_MAC_H_
diff --git a/include/hax.h b/include/hax.h
index 2492b641..174c8163 100644
--- a/include/hax.h
+++ b/include/hax.h
@@ -228,20 +228,144 @@ static inline unsigned char *hax_page_va(struct hax_page *page)
     return (unsigned char *)page->kva;
 }
 
-#define HAX_MAX_CPUS (sizeof(uint64_t) * 8)
+/* Utilities */
+#define HAX_NOLOG 0xff
+#define HAX_LOGPANIC 5
+#define HAX_LOGE 4
+#define HAX_LOGW 3
+#define HAX_LOGI 2
+#define HAX_LOGD 1
+#define HAX_LOG_DEFAULT 3
+
+#ifdef HAX_PLATFORM_DARWIN
+#include "darwin/hax_mac.h"
+#endif
+#ifdef HAX_PLATFORM_LINUX
+#include "linux/hax_linux.h"
+#endif
+#ifdef HAX_PLATFORM_NETBSD
+#include "netbsd/hax_netbsd.h"
+#endif
+#ifdef HAX_PLATFORM_WINDOWS
+#include "windows/hax_windows.h"
+#endif
+
+#define HAX_MAX_CPU_PER_GROUP (sizeof(hax_cpumask_t) * 8)
+#define HAX_MAX_CPU_GROUP ((uint16_t)(~0ULL))
+#define HAX_MAX_CPUS (HAX_MAX_CPU_PER_GROUP * HAX_MAX_CPU_GROUP)
+
+typedef struct hax_cpu_group_t {
+    hax_cpumask_t map;
+    uint32_t num;
+    uint16_t id;
+} hax_cpu_group_t;
+
+typedef struct hax_cpu_pos_t {
+    uint16_t group;
+    uint16_t bit;
+} hax_cpu_pos_t;
+
+typedef struct hax_cpumap_t {
+    hax_cpu_group_t *cpu_map;
+    hax_cpu_pos_t *cpu_pos;
+    uint16_t group_num;
+    uint32_t cpu_num;
+} hax_cpumap_t;
+
+typedef struct smp_call_parameter {
+    void (*func)(void *);
+    void *param;
+    hax_cpumap_t *cpus;
+} smp_call_parameter;
 
-/* Host SMP */
 extern hax_cpumap_t cpu_online_map;
-extern int max_cpus;
+
+static inline void cpu2cpumap(uint32_t cpu_id, hax_cpu_pos_t *target)
+{
+    if (!target)
+        return;
+
+    if (cpu_id >= cpu_online_map.cpu_num) {
+        target->group = (uint16_t)(~0ULL);
+        target->bit = (uint16_t)(~0ULL);
+    } else {
+        target->group = cpu_online_map.cpu_pos[cpu_id].group;
+        target->bit = cpu_online_map.cpu_pos[cpu_id].bit;
+    }
+}
+
+static inline bool cpu_is_online(hax_cpumap_t *cpu_map, uint32_t cpu_id)
+{
+    hax_cpumask_t map;
+    uint16_t group, bit;
+
+    if (cpu_id >= cpu_map->cpu_num) {
+        hax_log(HAX_LOGE, "Invalid cpu-%d\n", cpu_id);
+        return 0;
+    }
+
+    group = cpu_map->cpu_pos[cpu_id].group;
+    if (group != cpu_map->cpu_map[group].id) {
+        hax_log(HAX_LOGE, "Group id %d doesn't match record\n", group);
+        return 0;
+    }
+
+    bit = cpu_map->cpu_pos[cpu_id].bit;
+    map = cpu_map->cpu_map[group].map;
+    return !!(((hax_cpumask_t)1 << bit) & map);
+}
+
+extern uint32_t hax_cpu_id(void);
+
+static inline void get_online_map(void *param)
+{
+    hax_cpumap_t *omap = (hax_cpumap_t *)param;
+    hax_cpu_group_t *cpu_map;
+    hax_cpu_pos_t *cpu_pos;
+    uint32_t cpu_id, group, bit;
+
+    cpu_id = hax_cpu_id();
+    group = cpu_id / HAX_MAX_CPU_PER_GROUP;
+    bit = cpu_id % HAX_MAX_CPU_PER_GROUP;
+
+    cpu_map = &(omap->cpu_map[group]);
+    cpu_pos = &(omap->cpu_pos[cpu_id]);
+
+    hax_test_and_set_bit(bit, &cpu_map->map);
+    cpu_map->id = group;
+    cpu_pos->group = group;
+    cpu_pos->bit = bit;
+}
+
+static inline void cpu_info_exit(void)
+{
+    if (cpu_online_map.cpu_map)
+        hax_vfree(cpu_online_map.cpu_map,
+                  cpu_online_map.group_num * sizeof(*cpu_online_map.cpu_map));
+    if (cpu_online_map.cpu_pos)
+        hax_vfree(cpu_online_map.cpu_pos,
+                  cpu_online_map.cpu_num * sizeof(*cpu_online_map.cpu_pos));
+    memset(&cpu_online_map, 0, sizeof(cpu_online_map));
+}
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+int cpu_info_init(void);
+#ifdef HAX_PLATFORM_DARWIN
+hax_smp_func_ret_t smp_cfunction(void *param);
+#endif
+#ifdef HAX_PLATFORM_LINUX
+hax_smp_func_ret_t smp_cfunction(void *param);
+#endif
+#ifdef HAX_PLATFORM_NETBSD
+hax_smp_func_ret_t smp_cfunction(void *param, void *a2 __unused);
+#endif
+#ifdef HAX_PLATFORM_WINDOWS
+hax_smp_func_ret_t smp_cfunction(void *param);
+#endif
+
 int hax_smp_call_function(hax_cpumap_t *cpus, void(*scfunc)(void *param),
                           void *param);
-uint32_t hax_cpuid(void);
 int proc_event_pending(struct vcpu_t *vcpu);
 
 void hax_disable_preemption(preempt_flag *eflags);
@@ -256,26 +380,4 @@ int hax_em64t_enabled(void);
 }
 #endif
 
-/* Utilities */
-#define HAX_NOLOG 0xff
-#define HAX_LOGPANIC 5
-#define HAX_LOGE 4
-#define HAX_LOGW 3
-#define HAX_LOGI 2
-#define HAX_LOGD 1
-#define HAX_LOG_DEFAULT 3
-
-#ifdef HAX_PLATFORM_DARWIN
-#include "darwin/hax_mac.h"
-#endif
-#ifdef HAX_PLATFORM_LINUX
-#include "linux/hax_linux.h"
-#endif
-#ifdef HAX_PLATFORM_NETBSD
-#include "netbsd/hax_netbsd.h"
-#endif
-#ifdef HAX_PLATFORM_WINDOWS
-#include "windows/hax_windows.h"
-#endif
-
 #endif  // HAX_H_
diff --git a/include/linux/hax_linux.h b/include/linux/hax_linux.h
index 7dde8649..3027451b 100644
--- a/include/linux/hax_linux.h
+++ b/include/linux/hax_linux.h
@@ -98,13 +98,6 @@ static inline int memcpy_s(void *dest, size_t destsz, const void *src,
 bool hax_cmpxchg32(uint32_t old_val, uint32_t new_val, volatile uint32_t *addr);
 bool hax_cmpxchg64(uint64_t old_val, uint64_t new_val, volatile uint64_t *addr);
 
-static inline bool cpu_is_online(int cpu)
-{
-    if (cpu < 0 || cpu >= max_cpus)
-        return 0;
-    return !!(((mword)1 << cpu) & cpu_online_map);
-}
-
 int hax_notify_host_event(enum hax_notify_event event, uint32_t *param,
                           uint32_t size);
diff --git a/include/linux/hax_types_linux.h b/include/linux/hax_types_linux.h
index c744a20b..da90b6d4 100644
--- a/include/linux/hax_types_linux.h
+++ b/include/linux/hax_types_linux.h
@@ -91,12 +91,8 @@ typedef struct hax_kmap_phys {
 
 typedef struct hax_spinlock hax_spinlock;
 
-typedef int hax_cpumap_t;
-
-static inline hax_cpumap_t cpu2cpumap(int cpu)
-{
-    return (0x1 << cpu);
-}
+typedef uint64_t hax_cpumask_t;
+typedef void hax_smp_func_ret_t;
 
 /* Remove this later */
 #define is_leaf(x) 1
diff --git a/include/netbsd/hax_netbsd.h b/include/netbsd/hax_netbsd.h
index 07d8f0b8..9fcaa19c 100644
--- a/include/netbsd/hax_netbsd.h
+++ b/include/netbsd/hax_netbsd.h
@@ -97,13 +97,6 @@ static inline int memcpy_s(void *dest, size_t destsz, const void *src,
 bool hax_cmpxchg32(uint32_t old_val, uint32_t new_val, volatile uint32_t *addr);
bool hax_cmpxchg64(uint64_t old_val, uint64_t new_val, volatile uint64_t *addr); -static inline bool cpu_is_online(int cpu) -{ - if (cpu < 0 || cpu >= max_cpus) - return 0; - return !!(((mword)1 << cpu) & cpu_online_map); -} - int hax_notify_host_event(enum hax_notify_event event, uint32_t *param, uint32_t size); diff --git a/include/netbsd/hax_types_netbsd.h b/include/netbsd/hax_types_netbsd.h index 599fd54c..8b807001 100644 --- a/include/netbsd/hax_types_netbsd.h +++ b/include/netbsd/hax_types_netbsd.h @@ -91,12 +91,8 @@ typedef struct hax_kmap_phys { typedef struct hax_spinlock hax_spinlock; -typedef int hax_cpumap_t; - -static inline hax_cpumap_t cpu2cpumap(int cpu) -{ - return (0x1 << cpu); -} +typedef uint64_t hax_cpumask_t; +typedef void hax_smp_func_ret_t; /* Remove this later */ #define is_leaf(x) 1 diff --git a/include/windows/hax_types_windows.h b/include/windows/hax_types_windows.h index 2c2ce73f..2304f12e 100644 --- a/include/windows/hax_types_windows.h +++ b/include/windows/hax_types_windows.h @@ -45,14 +45,6 @@ typedef unsigned char bool; #define is_leaf(x) 1 #endif -typedef KAFFINITY hax_cpumap_t; -inline hax_cpumap_t cpu2cpumap(int cpu) -{ - return ((KAFFINITY)0x1 << cpu); -} - -typedef KIRQL preempt_flag; - // Signed Types typedef signed char int8_t; typedef signed short int16_t; @@ -68,6 +60,15 @@ typedef unsigned int uint; typedef unsigned long ulong; typedef unsigned long ulong_t; +// KAFFINITY is 32 bits on a 32-bit version of Windows and is 64 bits +// on a 64-bit version of Windows. We always use 64-bit to store CPU mask +// in haxm so define it as 64-bit here. +//typedef KAFFINITY hax_cpumask_t; +typedef uint64_t hax_cpumask_t; +typedef ULONG_PTR hax_smp_func_ret_t; + +typedef KIRQL preempt_flag; + #include "../hax_list.h" struct hax_page { void *kva; diff --git a/include/windows/hax_windows.h b/include/windows/hax_windows.h index c43ef8b4..4da10682 100644 --- a/include/windows/hax_windows.h +++ b/include/windows/hax_windows.h @@ -184,13 +184,6 @@ static bool hax_cmpxchg64(uint64_t old_val, uint64_t new_val, volatile uint64_t return FALSE; } -static inline bool cpu_is_online(int cpu) -{ - if (cpu < 0 || cpu >= max_cpus) - return 0; - return !!(((mword)1 << cpu) & cpu_online_map); -} - int hax_notify_host_event(enum hax_notify_event event, uint32_t *param, uint32_t size); diff --git a/platforms/darwin/com_intel_hax.c b/platforms/darwin/com_intel_hax.c index 98795e3b..0bcd814f 100644 --- a/platforms/darwin/com_intel_hax.c +++ b/platforms/darwin/com_intel_hax.c @@ -99,38 +99,6 @@ static int lock_prim_init(void) return -1; } -hax_cpumap_t cpu_online_map; -int max_cpus; - -void get_online_map(void *param) -{ - uint64_t *omap; - - //printf("%x\n", cpu_number()); - omap = param; - if (!omap) { - hax_log(HAX_LOGE, "NULL pointer in get online map\n"); - return; - } - - hax_test_and_set_bit(cpu_number(), omap); - printf("%llx\n ", *omap); - return; -} - -/* This is provided in unsupported kext */ -extern unsigned int real_ncpus; -static void init_cpu_info(void) -{ - uint64_t possible_map, omap = 0; - - possible_map = ~0ULL; - hax_smp_call_function(&possible_map, get_online_map, &omap); - printf("possible map %llx cpu_online_map %llx\n", possible_map, omap); - cpu_online_map = omap; - max_cpus = real_ncpus; -} - static int com_intel_hax_init(void) { int ret; @@ -139,14 +107,9 @@ static int com_intel_hax_init(void) if (ret < 0) return ret; - init_cpu_info(); - - if (max_cpus > HAX_MAX_CPUS) { - hax_log(HAX_LOGE, "Too many cpus in system!, max_cpus:%d\n", - real_ncpus); - ret = 
-E2BIG;
+    ret = cpu_info_init();
+    if (ret < 0)
         goto fail0;
-    }
 
     ret = hax_malloc_init();
     if (ret < 0)
@@ -154,6 +117,7 @@ static int com_intel_hax_init(void)
     return 0;
 
 fail0:
+    cpu_info_exit();
     lock_prim_exit();
     return ret;
 }
diff --git a/platforms/darwin/hax_wrapper.cpp b/platforms/darwin/hax_wrapper.cpp
index 30e249cf..7f9eece5 100644
--- a/platforms/darwin/hax_wrapper.cpp
+++ b/platforms/darwin/hax_wrapper.cpp
@@ -78,30 +78,95 @@ extern "C" void hax_panic(const char *fmt, ...)
     va_end(args);
 }
 
-struct smp_call_parameter {
-    void (*func)(void *);
-    void *param;
-    hax_cpumap_t *cpus;
-};
-
-extern "C" void mp_rendezvous_no_intrs(void (*action_func)(void *), void *arg);
+extern int cpu_number(void);
+inline uint32_t hax_cpu_id(void)
+{
+    return (uint32_t)cpu_number();
+}
 
-void smp_cfunction(void *param)
+/* This is provided in unsupported kext */
+extern unsigned int real_ncpus;
+int cpu_info_init(void)
 {
-    int cpu_id;
-    void (*action)(void *parap);
-    hax_cpumap_t *hax_cpus;
-    struct smp_call_parameter *p;
-
-    p = (struct smp_call_parameter *)param;
-    cpu_id = cpu_number();
-    action = p->func;
-    hax_cpus = p->cpus;
-    //printf("cpus:%llx, current_cpu:%x\n", *cpus, cpu_id);
-    if (*hax_cpus & (0x1 << cpu_id))
-        action(p->param);
+    uint32_t size_group, size_pos, cpu_id, group, bit;
+    hax_cpumap_t omap = {0};
+
+    memset(&cpu_online_map, 0, sizeof(cpu_online_map));
+
+    cpu_online_map.cpu_num = real_ncpus;
+    group = HAX_MAX_CPU_PER_GROUP;
+    cpu_online_map.group_num = (cpu_online_map.cpu_num + group - 1) / group;
+    size_group = cpu_online_map.group_num * sizeof(*cpu_online_map.cpu_map);
+    size_pos = cpu_online_map.cpu_num * sizeof(*cpu_online_map.cpu_pos);
+
+    if (cpu_online_map.group_num > HAX_MAX_CPU_GROUP ||
+        cpu_online_map.cpu_num > HAX_MAX_CPUS) {
+        hax_log(HAX_LOGE, "Too many cpus %d-%d in system\n",
+                cpu_online_map.cpu_num, cpu_online_map.group_num);
+        return -E2BIG;
+    }
+
+    cpu_online_map.cpu_map = (hax_cpu_group_t *)hax_vmalloc(size_group, 0);
+    omap.cpu_map = (hax_cpu_group_t *)hax_vmalloc(size_group, 0);
+    if (!cpu_online_map.cpu_map || !omap.cpu_map) {
+        hax_log(HAX_LOGE, "Couldn't allocate cpu_map for cpu_online_map\n");
+        goto fail_nomem;
+    }
+
+    cpu_online_map.cpu_pos = (hax_cpu_pos_t *)hax_vmalloc(size_pos, 0);
+    omap.cpu_pos = (hax_cpu_pos_t *)hax_vmalloc(size_pos, 0);
+    if (!cpu_online_map.cpu_pos || !omap.cpu_pos) {
+        hax_log(HAX_LOGE, "Couldn't allocate cpu_pos for cpu_online_map\n");
+        goto fail_nomem;
+    }
+
+    // omap is a bootstrap map consumed by get_online_map() below:
+    // smp_cfunction() checks whether the current cpu is online in the map it
+    // is given, but this very cross-call is what initializes the real
+    // cpu_online_map. So point every entry of omap at group 0, bit 0, and
+    // mark that bit online; every cpu then passes the check, runs
+    // get_online_map(), and records itself in the real cpu_online_map.
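+    // e.g. on a hypothetical 96-cpu host: cpu_num = 96, group_num = 2.
+    // During this bootstrap call, omap.cpu_pos[70] = {0, 0} and
+    // omap.cpu_map[0].map = ~0ULL, so cpu_is_online(&omap, 70) is true;
+    // cpu 70 then runs get_online_map() and stores its real position,
+    // {group 1, bit 6}, into cpu_online_map.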
+ omap.group_num = cpu_online_map.group_num; + omap.cpu_num = cpu_online_map.cpu_num; + for (cpu_id = 0; cpu_id < omap.cpu_num; cpu_id++) { + omap.cpu_pos[cpu_id].group = 0; + omap.cpu_pos[cpu_id].bit = 0; + } + for (group = 0; group < omap.group_num; group++) { + omap.cpu_map[group].id = 0; + omap.cpu_map[group].map = ~0ULL; + } + hax_smp_call_function(&omap, get_online_map, &cpu_online_map); + + for (group = 0; group < cpu_online_map.group_num; group++) { + cpu_online_map.cpu_map[group].num = 0; + for (bit = 0; bit < HAX_MAX_CPU_PER_GROUP; bit++) { + if (cpu_online_map.cpu_map[group].map & ((hax_cpumask_t)1 << bit)) + ++cpu_online_map.cpu_map[group].num; + } + } + + hax_vfree(omap.cpu_map, size_group); + hax_vfree(omap.cpu_pos, size_pos); + + hax_log(HAX_LOGI, "Host cpu init %d logical cpu(s) into %d group(s)\n", + cpu_online_map.cpu_num, cpu_online_map.group_num); + + return 0; + +fail_nomem: + if (cpu_online_map.cpu_map) + hax_vfree(cpu_online_map.cpu_map, size_group); + if (cpu_online_map.cpu_pos) + hax_vfree(cpu_online_map.cpu_pos, size_pos); + if (omap.cpu_map) + hax_vfree(omap.cpu_map, size_group); + if (omap.cpu_pos) + hax_vfree(omap.cpu_pos, size_pos); + return -ENOMEM; } +extern "C" void mp_rendezvous_no_intrs(void (*action_func)(void *), void *arg); + extern "C" int hax_smp_call_function(hax_cpumap_t *cpus, void (*scfunc)(void *), void *param) { @@ -113,11 +178,6 @@ extern "C" int hax_smp_call_function(hax_cpumap_t *cpus, void (*scfunc)(void *), return 0; } -extern "C" uint32_t hax_cpuid() -{ - return cpu_number(); -} - extern "C" void hax_disable_preemption(preempt_flag *eflags) { mword flags; diff --git a/platforms/linux/hax_entry.c b/platforms/linux/hax_entry.c index a2c96c0a..9a66de18 100644 --- a/platforms/linux/hax_entry.c +++ b/platforms/linux/hax_entry.c @@ -114,24 +114,24 @@ static long hax_dev_ioctl(struct file *filp, unsigned int cmd, static int __init hax_driver_init(void) { - int i, err; - - // Initialization - max_cpus = num_present_cpus(); - cpu_online_map = 0; - for (i = 0; i < max_cpus; i++) { - if (cpu_online(i)) - cpu_online_map |= (1ULL << i); + int err; + + err = cpu_info_init(); + if (err) { + hax_log(HAX_LOGE, "Failed to initialize CPU info\n"); + return err; } if (hax_module_init() < 0) { hax_log(HAX_LOGE, "Failed to initialize HAXM module\n"); + cpu_info_exit(); return -EAGAIN; } err = misc_register(&hax_dev); if (err) { hax_log(HAX_LOGE, "Failed to register HAXM device\n"); + cpu_info_exit(); hax_module_exit(); return err; } diff --git a/platforms/linux/hax_wrapper.c b/platforms/linux/hax_wrapper.c index 1e79d35d..ed392da5 100644 --- a/platforms/linux/hax_wrapper.c +++ b/platforms/linux/hax_wrapper.c @@ -40,9 +40,6 @@ #include -int max_cpus; -hax_cpumap_t cpu_online_map; - static const char* kLogLevel[] = { KERN_ERR, KERN_DEBUG, // HAX_LOGD @@ -84,27 +81,88 @@ void hax_panic(const char *fmt, ...) 
va_end(args);
 }
 
-uint32_t hax_cpuid(void)
+inline uint32_t hax_cpu_id(void)
 {
-    return smp_processor_id();
+    return (uint32_t)smp_processor_id();
 }
 
-typedef struct smp_call_parameter {
-    void (*func)(void *);
-    void *param;
-    hax_cpumap_t *cpus;
-} smp_call_parameter;
-
-static void smp_cfunction(void *p)
+int cpu_info_init(void)
 {
-    struct smp_call_parameter *info = p;
-    hax_cpumap_t *cpus;
-    uint32_t cpuid;
-
-    cpus = info->cpus;
-    cpuid = hax_cpuid();
-    if (*cpus & (0x1 << cpuid))
-        info->func(info->param);
+    uint32_t size_group, size_pos, cpu_id, group, bit;
+    hax_cpumap_t omap = {0};
+
+    memset(&cpu_online_map, 0, sizeof(cpu_online_map));
+
+    cpu_online_map.cpu_num = num_online_cpus();
+    group = HAX_MAX_CPU_PER_GROUP;
+    cpu_online_map.group_num = (cpu_online_map.cpu_num + group - 1) / group;
+    size_group = cpu_online_map.group_num * sizeof(*cpu_online_map.cpu_map);
+    size_pos = cpu_online_map.cpu_num * sizeof(*cpu_online_map.cpu_pos);
+
+    if (cpu_online_map.group_num > HAX_MAX_CPU_GROUP ||
+        cpu_online_map.cpu_num > HAX_MAX_CPUS) {
+        hax_log(HAX_LOGE, "Too many cpus %d-%d in system\n",
+                cpu_online_map.cpu_num, cpu_online_map.group_num);
+        return -E2BIG;
+    }
+
+    cpu_online_map.cpu_map = (hax_cpu_group_t *)hax_vmalloc(size_group, 0);
+    omap.cpu_map = (hax_cpu_group_t *)hax_vmalloc(size_group, 0);
+    if (!cpu_online_map.cpu_map || !omap.cpu_map) {
+        hax_log(HAX_LOGE, "Couldn't allocate cpu_map for cpu_online_map\n");
+        goto fail_nomem;
+    }
+
+    cpu_online_map.cpu_pos = (hax_cpu_pos_t *)hax_vmalloc(size_pos, 0);
+    omap.cpu_pos = (hax_cpu_pos_t *)hax_vmalloc(size_pos, 0);
+    if (!cpu_online_map.cpu_pos || !omap.cpu_pos) {
+        hax_log(HAX_LOGE, "Couldn't allocate cpu_pos for cpu_online_map\n");
+        goto fail_nomem;
+    }
+
+    // omap is a bootstrap map consumed by get_online_map() below:
+    // smp_cfunction() checks whether the current cpu is online in the map it
+    // is given, but this very cross-call is what initializes the real
+    // cpu_online_map. So point every entry of omap at group 0, bit 0, and
+    // mark that bit online; every cpu then passes the check, runs
+    // get_online_map(), and records itself in the real cpu_online_map.
+ omap.group_num = cpu_online_map.group_num; + omap.cpu_num = cpu_online_map.cpu_num; + for (cpu_id = 0; cpu_id < omap.cpu_num; cpu_id++) { + omap.cpu_pos[cpu_id].group = 0; + omap.cpu_pos[cpu_id].bit = 0; + } + for (group = 0; group < omap.group_num; group++) { + omap.cpu_map[group].id = 0; + omap.cpu_map[group].map = ~0ULL; + } + hax_smp_call_function(&omap, get_online_map, &cpu_online_map); + + for (group = 0; group < cpu_online_map.group_num; group++) { + cpu_online_map.cpu_map[group].num = 0; + for (bit = 0; bit < HAX_MAX_CPU_PER_GROUP; bit++) { + if (cpu_online_map.cpu_map[group].map & ((hax_cpumask_t)1 << bit)) + ++cpu_online_map.cpu_map[group].num; + } + } + + hax_vfree(omap.cpu_map, size_group); + hax_vfree(omap.cpu_pos, size_pos); + + hax_log(HAX_LOGI, "Host cpu init %d logical cpu(s) into %d group(s)\n", + cpu_online_map.cpu_num, cpu_online_map.group_num); + + return 0; + +fail_nomem: + if (cpu_online_map.cpu_map) + hax_vfree(cpu_online_map.cpu_map, size_group); + if (cpu_online_map.cpu_pos) + hax_vfree(cpu_online_map.cpu_pos, size_pos); + if (omap.cpu_map) + hax_vfree(omap.cpu_map, size_group); + if (omap.cpu_pos) + hax_vfree(omap.cpu_pos, size_pos); + return -ENOMEM; } int hax_smp_call_function(hax_cpumap_t *cpus, void (*scfunc)(void *), diff --git a/platforms/netbsd/hax_entry.c b/platforms/netbsd/hax_entry.c index b4a72afd..846ddf4c 100644 --- a/platforms/netbsd/hax_entry.c +++ b/platforms/netbsd/hax_entry.c @@ -120,7 +120,8 @@ static void hax_vcpu_attach(device_t parent, device_t self, void *aux) { struct hax_vcpu_softc *sc; - int unit, vm_id, cpu_id; + int unit, vm_id; + uint32_t vcpu_id; sc = device_private(self); if (sc == NULL) { @@ -130,13 +131,13 @@ hax_vcpu_attach(device_t parent, device_t self, void *aux) unit = device_unit(self); vm_id = unit2vmmid(unit); - cpu_id = unit2vcpuid(unit); + vcpu_id = unit2vcpuid(unit); sc->sc_dev = self; sc->vcpu = NULL; snprintf(self->dv_xname, sizeof self->dv_xname, "hax_vm%02d/vcpu%02d", - vm_id, cpu_id); + vm_id, vcpu_id); if (!pmf_device_register(self, NULL, NULL)) aprint_error_dev(self, "couldn't establish power handler\n"); @@ -216,23 +217,16 @@ MODULE(MODULE_CLASS_MISC, haxm, NULL); static int haxm_modcmd(modcmd_t cmd, void *arg __unused) { - struct cpu_info *ci; - CPU_INFO_ITERATOR cii; int err; size_t i; switch (cmd) { case MODULE_CMD_INIT: { // Initialization - max_cpus = 0; - - ci = NULL; - - for (CPU_INFO_FOREACH(cii, ci)) { - ++max_cpus; - if (!ISSET(ci->ci_schedstate.spc_flags, SPCF_OFFLINE)) { - cpu_online_map |= __BIT(cpu_index(ci)); - } + err = cpu_info_init(); + if (err) { + hax_log(HAX_LOGE, "Unable to init cpu info\n"); + goto init_err0; } // Register hax_vm @@ -327,6 +321,8 @@ haxm_modcmd(modcmd_t cmd, void *arg __unused) init_err2: config_cfdriver_detach(&hax_vm_cd); init_err1: + cpu_info_exit(); +init_err0: return ENXIO; } case MODULE_CMD_FINI: { diff --git a/platforms/netbsd/hax_entry_vcpu.c b/platforms/netbsd/hax_entry_vcpu.c index 02094072..b37b9e39 100644 --- a/platforms/netbsd/hax_entry_vcpu.c +++ b/platforms/netbsd/hax_entry_vcpu.c @@ -67,7 +67,8 @@ int hax_vcpu_open(dev_t self, int flag __unused, int mode __unused, struct vcpu_t *cvcpu; struct hax_vcpu_netbsd_t *vcpu; int ret; - int unit, vm_id, vcpu_id; + int unit, vm_id; + uint32_t vcpu_id; sc = device_lookup_private(&hax_vcpu_cd, minor(self)); if (sc == NULL) { diff --git a/platforms/netbsd/hax_wrapper.c b/platforms/netbsd/hax_wrapper.c index d62535b8..bcd31d3b 100644 --- a/platforms/netbsd/hax_wrapper.c +++ b/platforms/netbsd/hax_wrapper.c @@ -44,9 
+44,6 @@
 #include "../../core/include/hax_core_interface.h"
 #include "../../core/include/ia32.h"
 
-int max_cpus;
-hax_cpumap_t cpu_online_map;
-
 static const char* kLogPrefix[] = {
     "haxm: ",
     "haxm_debug: ",
@@ -76,27 +73,96 @@ void hax_panic(const char *fmt, ...)
     va_end(args);
 }
 
-uint32_t hax_cpuid(void)
+inline uint32_t hax_cpu_id(void)
 {
-    return cpu_index(curcpu());
+    return (uint32_t)cpu_number();
 }
 
-typedef struct smp_call_parameter {
-    void (*func)(void *);
-    void *param;
-    hax_cpumap_t *cpus;
-} smp_call_parameter;
-
-static void smp_cfunction(void *a1, void *a2 __unused)
+int cpu_info_init(void)
 {
-    struct smp_call_parameter *info = a1;
-    hax_cpumap_t *cpus;
-    uint32_t cpuid;
-
-    cpus = info->cpus;
-    cpuid = hax_cpuid();
-    if (*cpus & (0x1 << cpuid))
-        info->func(info->param);
+    struct cpu_info *ci = NULL;
+    CPU_INFO_ITERATOR cii;
+    uint32_t size_group, size_pos, cpu_id, group, bit;
+    hax_cpumap_t omap = {0};
+
+    memset(&cpu_online_map, 0, sizeof(cpu_online_map));
+
+    cpu_online_map.cpu_num = 0;
+    for (CPU_INFO_FOREACH(cii, ci)) {
+        if (!ISSET(ci->ci_schedstate.spc_flags, SPCF_OFFLINE)) {
+            ++cpu_online_map.cpu_num;
+        }
+    }
+
+    group = HAX_MAX_CPU_PER_GROUP;
+    cpu_online_map.group_num = (cpu_online_map.cpu_num + group - 1) / group;
+    size_group = cpu_online_map.group_num * sizeof(*cpu_online_map.cpu_map);
+    size_pos = cpu_online_map.cpu_num * sizeof(*cpu_online_map.cpu_pos);
+
+    if (cpu_online_map.group_num > HAX_MAX_CPU_GROUP ||
+        cpu_online_map.cpu_num > HAX_MAX_CPUS) {
+        hax_log(HAX_LOGE, "Too many cpus %d-%d in system\n",
+                cpu_online_map.cpu_num, cpu_online_map.group_num);
+        return -E2BIG;
+    }
+
+    cpu_online_map.cpu_map = (hax_cpu_group_t *)hax_vmalloc(size_group, 0);
+    omap.cpu_map = (hax_cpu_group_t *)hax_vmalloc(size_group, 0);
+    if (!cpu_online_map.cpu_map || !omap.cpu_map) {
+        hax_log(HAX_LOGE, "Couldn't allocate cpu_map for cpu_online_map\n");
+        goto fail_nomem;
+    }
+
+    cpu_online_map.cpu_pos = (hax_cpu_pos_t *)hax_vmalloc(size_pos, 0);
+    omap.cpu_pos = (hax_cpu_pos_t *)hax_vmalloc(size_pos, 0);
+    if (!cpu_online_map.cpu_pos || !omap.cpu_pos) {
+        hax_log(HAX_LOGE, "Couldn't allocate cpu_pos for cpu_online_map\n");
+        goto fail_nomem;
+    }
+
+    // omap is a bootstrap map consumed by get_online_map() below:
+    // smp_cfunction() checks whether the current cpu is online in the map it
+    // is given, but this very cross-call is what initializes the real
+    // cpu_online_map. So point every entry of omap at group 0, bit 0, and
+    // mark that bit online; every cpu then passes the check, runs
+    // get_online_map(), and records itself in the real cpu_online_map.
+ omap.group_num = cpu_online_map.group_num; + omap.cpu_num = cpu_online_map.cpu_num; + for (cpu_id = 0; cpu_id < omap.cpu_num; cpu_id++) { + omap.cpu_pos[cpu_id].group = 0; + omap.cpu_pos[cpu_id].bit = 0; + } + for (group = 0; group < omap.group_num; group++) { + omap.cpu_map[group].id = 0; + omap.cpu_map[group].map = ~0ULL; + } + hax_smp_call_function(&omap, get_online_map, &cpu_online_map); + + for (group = 0; group < cpu_online_map.group_num; group++) { + cpu_online_map.cpu_map[group].num = 0; + for (bit = 0; bit < HAX_MAX_CPU_PER_GROUP; bit++) { + if (cpu_online_map.cpu_map[group].map & ((hax_cpumask_t)1 << bit)) + ++cpu_online_map.cpu_map[group].num; + } + } + + hax_vfree(omap.cpu_map, size_group); + hax_vfree(omap.cpu_pos, size_pos); + + hax_log(HAX_LOGI, "Host cpu init %d logical cpu(s) into %d group(s)\n", + cpu_online_map.cpu_num, cpu_online_map.group_num); + + return 0; + +fail_nomem: + if (cpu_online_map.cpu_map) + hax_vfree(cpu_online_map.cpu_map, size_group); + if (cpu_online_map.cpu_pos) + hax_vfree(cpu_online_map.cpu_pos, size_pos); + if (omap.cpu_map) + hax_vfree(omap.cpu_map, size_group); + if (omap.cpu_pos) + hax_vfree(omap.cpu_pos, size_pos); + return -ENOMEM; } int hax_smp_call_function(hax_cpumap_t *cpus, void (*scfunc)(void *), @@ -108,7 +174,7 @@ int hax_smp_call_function(hax_cpumap_t *cpus, void (*scfunc)(void *), info.func = scfunc; info.param = param; info.cpus = cpus; - xc = xc_broadcast(XC_HIGHPRI, smp_cfunction, &info, NULL); + xc = xc_broadcast(XC_HIGHPRI, (xcfunc_t)smp_cfunction, &info, NULL); xc_wait(xc); return 0; } diff --git a/platforms/windows/hax_entry.c b/platforms/windows/hax_entry.c index f56335e8..74a0b440 100644 --- a/platforms/windows/hax_entry.c +++ b/platforms/windows/hax_entry.c @@ -54,24 +54,25 @@ DRIVER_UNLOAD HaxUnloadDriver; static int hax_host_init(void) { - int i, ret; - cpu_online_map = KeQueryActiveProcessors(); - - for (i = 0; i < (sizeof(ULONG_PTR) * 8); i++) - if (cpu_online_map & ((mword)0x1 << i)) - max_cpus = i; + int ret; - /* we get the max_cpus from real_cpus in darwin, so add 1 here */ - max_cpus++; + ret = cpu_info_init(); + if (ret < 0) { + hax_log(HAX_LOGE, "CPU info init failed\n"); + return ret; + } ret = smpc_dpc_init(); if (ret < 0) { + hax_log(HAX_LOGE, "SMPC DPC init failed\n"); + cpu_info_exit(); return ret; } if (hax_module_init() < 0) { hax_log(HAX_LOGE, "Hax module init failed\n"); smpc_dpc_exit(); + cpu_info_exit(); return -1; } diff --git a/platforms/windows/hax_wrapper.c b/platforms/windows/hax_wrapper.c index 4176a063..135c1180 100644 --- a/platforms/windows/hax_wrapper.c +++ b/platforms/windows/hax_wrapper.c @@ -31,21 +31,71 @@ #include "hax_win.h" #include "../../core/include/ia32.h" -int max_cpus; -hax_cpumap_t cpu_online_map; - -uint32_t hax_cpuid() +inline uint32_t hax_cpu_id(void) { - return KeGetCurrentProcessorNumber(); + PROCESSOR_NUMBER ProcNumber = {0}; + return (uint32_t)KeGetCurrentProcessorNumberEx(&ProcNumber); } -struct smp_call_parameter +int cpu_info_init(void) { - void (*func)(void *); - void *param; - /* Not used in DPC model*/ - hax_cpumap_t *cpus; -}; + uint32_t size_group, size_pos, count, group, bit; + + memset(&cpu_online_map, 0, sizeof(cpu_online_map)); + + cpu_online_map.group_num = KeQueryActiveGroupCount(); + cpu_online_map.cpu_num = KeQueryActiveProcessorCountEx(ALL_PROCESSOR_GROUPS); + if (cpu_online_map.group_num > HAX_MAX_CPU_GROUP || + cpu_online_map.cpu_num > HAX_MAX_CPUS) { + hax_log(HAX_LOGE, "Too many cpus %d-%d in system\n", + cpu_online_map.cpu_num, 
cpu_online_map.group_num); + return -1; + } + + size_group = cpu_online_map.group_num * sizeof(*cpu_online_map.cpu_map); + cpu_online_map.cpu_map = hax_vmalloc(size_group, 0); + if (!cpu_online_map.cpu_map) { + hax_log(HAX_LOGE, "Couldn't allocate cpu_map for cpu_online_map\n"); + return -1; + } + + size_pos = cpu_online_map.cpu_num * sizeof(*cpu_online_map.cpu_pos); + cpu_online_map.cpu_pos = hax_vmalloc(size_pos, 0); + if (!cpu_online_map.cpu_pos) { + hax_log(HAX_LOGE, "Couldn't allocate cpu_pos for cpu_online_map\n"); + hax_vfree(cpu_online_map.cpu_map, size_group); + return -1; + } + + count = 0; + for (group = 0; group < cpu_online_map.group_num; group++) { + cpu_online_map.cpu_map[group].map = (hax_cpumask_t)KeQueryGroupAffinity( + group); + cpu_online_map.cpu_map[group].id = group; + cpu_online_map.cpu_map[group].num = 0; + for (bit = 0; bit < HAX_MAX_CPU_PER_GROUP; bit++) { + if (cpu_online_map.cpu_map[group].map & ((hax_cpumask_t)1 << bit)) { + ++cpu_online_map.cpu_map[group].num; + cpu_online_map.cpu_pos[count].group = group; + cpu_online_map.cpu_pos[count].bit = bit; + ++count; + } + } + } + + if (count != cpu_online_map.cpu_num) { + hax_log(HAX_LOGE, "Active logical processor count(%d)-affinity(%d) " + "doesn't match\n", cpu_online_map.cpu_num, count); + hax_vfree(cpu_online_map.cpu_map, size_group); + hax_vfree(cpu_online_map.cpu_pos, size_pos); + return -1; + } + + hax_log(HAX_LOGI, "Host cpu init %d logical cpu(s) into %d group(s)\n", + cpu_online_map.cpu_num, cpu_online_map.group_num); + + return 0; +} #ifdef SMPC_DPCS KDEFERRED_ROUTINE smp_cfunction_dpc; @@ -59,15 +109,20 @@ void smp_cfunction_dpc( __in_opt PVOID SystemArgument1, __in_opt PVOID SystemArgument2) { + struct smp_call_parameter *p = (struct smp_call_parameter *)SystemArgument2; + void (*action)(void *param) = p->func; hax_cpumap_t *done; - void (*action)(void *parap); - struct smp_call_parameter *p; + uint32_t self, group, bit; - p = (struct smp_call_parameter *)SystemArgument2; - done = (hax_cpumap_t*)SystemArgument1; - action = p->func; action(p->param); - hax_test_and_set_bit(hax_cpuid(), (uint64_t*)done); + + // We only use hax_cpumap_t.hax_cpu_pos_t to mark done or not + done = (hax_cpumap_t*)SystemArgument1; + self = hax_cpu_id(); + group = self / HAX_MAX_CPU_PER_GROUP; + bit = self % HAX_MAX_CPU_PER_GROUP; + done->cpu_pos[self].group = group; + done->cpu_pos[self].bit = bit; } /* IPI function is not exported to in XP, we use DPC to trigger the smp @@ -80,43 +135,54 @@ void smp_cfunction_dpc( */ int hax_smp_call_function(hax_cpumap_t *cpus, void (*scfunc)(void *), void * param) { - int i, self; + uint32_t cpu_id, self, group, bit, size_pos; BOOLEAN result; struct _KDPC *cur_dpc; - hax_cpumap_t done; + hax_cpumap_t done = {0}; struct smp_call_parameter *sp; KIRQL old_irql; LARGE_INTEGER delay; NTSTATUS event_result; + int err = 0; - self = hax_cpuid(); + self = hax_cpu_id(); + group = self / HAX_MAX_CPU_PER_GROUP; + bit = self % HAX_MAX_CPU_PER_GROUP; - done = 0; + size_pos = cpu_online_map.cpu_num * sizeof(*cpu_online_map.cpu_pos); + done.cpu_pos = hax_vmalloc(size_pos, 0); + if (!done.cpu_pos) { + hax_log(HAX_LOGE, "Couldn't allocate done to check SMP DPC done\n"); + return -1; + } + memset(done.cpu_pos, 0xFF, size_pos); event_result = KeWaitForSingleObject(&dpc_event, Executive, KernelMode, FALSE, NULL); if (event_result!= STATUS_SUCCESS) { hax_log(HAX_LOGE, "Failed to get the smp_call event object\n"); + hax_vfree(done.cpu_pos, size_pos); return -1; } - if (((mword)1 << self) & *cpus) { + if 
(cpu_is_online(cpus, self)) {
         KeRaiseIrql(DISPATCH_LEVEL, &old_irql);
         (scfunc)(param);
-        done |= ((mword)1 << self);
+        done.cpu_pos[self].group = group;
+        done.cpu_pos[self].bit = bit;
         KeLowerIrql(old_irql);
     }
 
-    for (i = 0; i < max_cpus; i++) {
-        if (!cpu_is_online(i) || (i == self))
+    for (cpu_id = 0; cpu_id < cpu_online_map.cpu_num; cpu_id++) {
+        if (!cpu_is_online(&cpu_online_map, cpu_id) || (cpu_id == self))
             continue;
-        sp = smp_cp + i;
+        sp = smp_cp + cpu_id;
         sp->func = scfunc;
         sp->param = param;
-        cur_dpc = smpc_dpcs + i;
+        cur_dpc = smpc_dpcs + cpu_id;
         result = KeInsertQueueDpc(cur_dpc, &done, sp);
         if (result != TRUE)
-            hax_log(HAX_LOGE, "Failed to insert queue on CPU %x\n", i);
+            hax_log(HAX_LOGE, "Failed to insert queue on CPU %x\n", cpu_id);
     }
 
     /* Delay 100 ms */
@@ -124,32 +190,36 @@ int hax_smp_call_function(hax_cpumap_t *cpus, void (*scfunc)(void *), void * par
     if (KeDelayExecutionThread( KernelMode, TRUE, &delay ) != STATUS_SUCCESS)
         hax_log(HAX_LOGE, "Delay execution is not success\n");
 
-    if (done != *cpus)
+    if (memcmp(done.cpu_pos, cpu_online_map.cpu_pos, size_pos)) {
+        err = -1;
         hax_log(HAX_LOGE, "sm call function is not called in all required CPUs\n");
+    }
     KeSetEvent(&dpc_event, 0, FALSE);
 
-    return (done != *cpus) ? -1 :0;
+    hax_vfree(done.cpu_pos, size_pos);
+
+    return err;
 }
 
 int smpc_dpc_init(void)
 {
     struct _KDPC *cur_dpc;
-    int i;
+    uint32_t cpu_id;
 
-    smpc_dpcs = hax_vmalloc(sizeof(KDPC) * max_cpus, 0);
+    smpc_dpcs = hax_vmalloc(sizeof(KDPC) * cpu_online_map.cpu_num, 0);
     if (!smpc_dpcs)
         return -ENOMEM;
-    smp_cp = hax_vmalloc(sizeof(struct smp_call_parameter) * max_cpus, 0);
+    smp_cp = hax_vmalloc(sizeof(struct smp_call_parameter) *
+                         cpu_online_map.cpu_num, 0);
     if (!smp_cp) {
-        hax_vfree(smpc_dpcs, sizeof(KDPC) * max_cpus);
+        hax_vfree(smpc_dpcs, sizeof(KDPC) * cpu_online_map.cpu_num);
        return -ENOMEM;
     }
 
     cur_dpc = smpc_dpcs;
-    for (i = 0; i < max_cpus; i++) {
+    for (cpu_id = 0; cpu_id < cpu_online_map.cpu_num; cpu_id++) {
         KeInitializeDpc(cur_dpc, smp_cfunction_dpc, NULL);
-        KeSetTargetProcessorDpc(cur_dpc, i);
+        KeSetTargetProcessorDpc(cur_dpc, cpu_id);
         /* Set the DPC as high important, so that we loop too long */
         KeSetImportanceDpc(cur_dpc, HighImportance);
         cur_dpc++;
@@ -160,34 +230,20 @@
 int smpc_dpc_exit(void)
 {
-    hax_vfree(smpc_dpcs, sizeof(KDPC) * max_cpus);
-    hax_vfree(smp_cp, sizeof(KDPC) * max_cpus);
+    hax_vfree(smpc_dpcs, sizeof(KDPC) * cpu_online_map.cpu_num);
+    hax_vfree(smp_cp, sizeof(struct smp_call_parameter) *
+              cpu_online_map.cpu_num);
     return 0;
 }
 #else
-/* This is the only function that in DIRQL */
-static ULONG_PTR smp_cfunction(ULONG_PTR param)
-{
-    int cpu_id;
-    void (*action)(void *parap) ;
-    hax_cpumap_t *hax_cpus;
-    struct smp_call_parameter *p;
-
-    p = (struct smp_call_parameter *)param;
-    cpu_id = hax_cpuid();
-    action = p->func;
-    hax_cpus = p->cpus;
-    if (*hax_cpus & ((mword)1 << cpu_id))
-        action(p->param);
-    return (ULONG_PTR)NULL;
-}
+// A driver calls KeIpiGenericCall to interrupt every processor and raises
+// the IRQL to IPI_LEVEL, which is greater than DIRQL for every device.
 int hax_smp_call_function(hax_cpumap_t *cpus, void (*scfunc)(void *), void * param)
 {
     struct smp_call_parameter sp;
     sp.func = scfunc;
     sp.param = param;
     sp.cpus = cpus;
-    KeIpiGenericCall(smp_cfunction, (ULONG_PTR)&sp);
+    KeIpiGenericCall((PKIPI_BROADCAST_WORKER)smp_cfunction, (ULONG_PTR)&sp);
     return 0;
 }
 #endif
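
The group/position bookkeeping introduced in include/hax.h is easiest to see in isolation. The following standalone userspace sketch mirrors the patch's hax_cpumap_t layout (one 64-bit bitmap per processor group, plus a cpu_id to (group, bit) lookup table); names are borrowed from the patch for clarity, calloc() stands in for hax_vmalloc(), and the CPU count is hard-coded, so treat it as an illustration of the scheme rather than the kernel code itself.

/* cpumap_demo.c - standalone sketch of the group/position CPU map above.
 * Mirrors hax_cpumap_t from include/hax.h; calloc() replaces hax_vmalloc()
 * and the host CPU count is hard-coded. Build: cc -std=c99 cpumap_demo.c */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef uint64_t cpumask_t;
#define CPUS_PER_GROUP (sizeof(cpumask_t) * 8)

typedef struct { cpumask_t map; uint32_t num; uint16_t id; } cpu_group_t;
typedef struct { uint16_t group; uint16_t bit; } cpu_pos_t;
typedef struct {
    cpu_group_t *cpu_map;   /* one online-bitmap per processor group */
    cpu_pos_t *cpu_pos;     /* cpu_id -> (group, bit), O(1) lookup */
    uint16_t group_num;
    uint32_t cpu_num;
} cpumap_t;

/* Fill the map the way cpu_info_init() does: distribute cpu_num logical
 * processors over 64-bit groups and record each CPU's (group, bit) pair. */
static int cpumap_init(cpumap_t *m, uint32_t cpu_num)
{
    m->cpu_num = cpu_num;
    m->group_num = (uint16_t)((cpu_num + CPUS_PER_GROUP - 1) / CPUS_PER_GROUP);
    m->cpu_map = calloc(m->group_num, sizeof(*m->cpu_map));
    m->cpu_pos = calloc(m->cpu_num, sizeof(*m->cpu_pos));
    if (!m->cpu_map || !m->cpu_pos)
        return -1;
    for (uint32_t id = 0; id < cpu_num; id++) {
        uint16_t group = (uint16_t)(id / CPUS_PER_GROUP);
        uint16_t bit = (uint16_t)(id % CPUS_PER_GROUP);
        m->cpu_map[group].id = group;
        m->cpu_map[group].map |= (cpumask_t)1 << bit;
        m->cpu_map[group].num++;
        m->cpu_pos[id].group = group;
        m->cpu_pos[id].bit = bit;
    }
    return 0;
}

/* Same test as cpu_is_online(): translate cpu_id through cpu_pos, then
 * probe the per-group bitmap. */
static bool cpu_online(const cpumap_t *m, uint32_t id)
{
    if (id >= m->cpu_num)
        return false;
    cpu_pos_t p = m->cpu_pos[id];
    return (m->cpu_map[p.group].map >> p.bit) & 1;
}

int main(void)
{
    cpumap_t m;
    if (cpumap_init(&m, 96) != 0)   /* e.g. a 96-way host -> 2 groups */
        return 1;
    printf("cpu 70 -> group %u bit %u, online=%d\n",
           (unsigned)m.cpu_pos[70].group, (unsigned)m.cpu_pos[70].bit,
           (int)cpu_online(&m, 70));
    printf("%u group(s): group 0 holds %u cpus, group 1 holds %u\n",
           (unsigned)m.group_num, (unsigned)m.cpu_map[0].num,
           (unsigned)m.cpu_map[1].num);
    free(m.cpu_map);
    free(m.cpu_pos);
    return 0;
}

With 96 CPUs the map splits into two groups, and CPU 70 lands in group 1, bit 6, which is exactly the pair cpu2cpumap() would report for it.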
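The DPC path's completion check works because done.cpu_pos starts out as 0xFF filler bytes that can never equal a real (group, bit) pair, so a single memcmp() against cpu_online_map.cpu_pos reveals whether every CPU checked in. A compressed userspace sketch of just that mechanism, with a hypothetical straggler CPU:

/* done_demo.c - sketch of the completion check in the DPC-based
 * hax_smp_call_function() above: the done array starts as 0xFF filler,
 * every CPU that runs overwrites its own slot with its real (group, bit),
 * and one memcmp() against the expected table detects CPUs that never ran. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct { uint16_t group; uint16_t bit; } cpu_pos_t;

int main(void)
{
    enum { CPU_NUM = 4 };
    cpu_pos_t expected[CPU_NUM], done[CPU_NUM];

    for (uint32_t id = 0; id < CPU_NUM; id++) {
        expected[id].group = (uint16_t)(id / 64);
        expected[id].bit = (uint16_t)(id % 64);
    }

    memset(done, 0xFF, sizeof(done));          /* nothing has run yet */
    for (uint32_t id = 0; id < CPU_NUM; id++)  /* pretend CPU 2 never ran */
        if (id != 2)
            done[id] = expected[id];

    /* memcmp() == 0 means every CPU checked in; non-zero is the error
     * path - note the "if (memcmp(...))" polarity used above. */
    printf("all cpus done? %s\n",
           memcmp(done, expected, sizeof(done)) == 0 ? "yes" : "no");
    return 0;
}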