
Commit 07d2490

agraf authored and akpm00 committed
kexec: enable CMA based contiguous allocation
When booting a new kernel with kexec_file, the kernel picks a target location that the new kernel should live at, then allocates random pages and checks whether any of those pages happens to coincide with the target address range; if so, it uses them for that range.

For every page allocated this way, it then creates a page list that the relocation code - code that executes while all CPUs are off and we are just about to jump into the new kernel - copies to their final memory location. We cannot put the pages there beforehand, because chances are pretty good that at least some page in the target range is already in use by the currently running Linux environment. Copying happens from a single CPU at RAM rate, which takes around 4-50 ms per 100 MiB.

All of this is inefficient and error-prone.

To successfully kexec, we need to quiesce all devices of the outgoing kernel so they don't scribble over the new kernel's memory. We have seen cases where that does not happen properly (*cough* GIC *cough*) and hence the new kernel was corrupted. This started a month-long journey to root cause failing kexecs and to eventually find the memory corruption, because the new kernel was corrupted severely enough that it could not even emit output to tell us about the fact that it was corrupted.

By allocating memory for the next kernel from a memory range that is guaranteed scribbling-free, we can boot the next kernel up to a point where it is at least able to detect corruption, and maybe even stop it before it becomes severe. This increases the chance of successful kexecs.

Since kexec was introduced, Linux has gained the CMA framework, which can perform physically contiguous memory allocations while keeping that memory available for movable allocations when it is not needed for contiguous ones. The default CMA allocator is used for DMA allocations.

This patch adds logic to the kexec file loader to attempt to place the target payload at a location allocated from CMA. If successful, it uses that memory range directly instead of creating copy instructions during the hot phase. To ensure that there is a safety net in case anything goes wrong with the CMA allocation, it also adds a flag (KEXEC_FILE_NO_CMA) for user space to force-disable CMA allocations.

Using CMA allocations has two advantages:

1) Faster: there is no more need to copy in the hot phase, which saves 4-50 ms per 100 MiB.

2) More robust: even if by accident some page is still in use for DMA, the new kernel image will be safe from that access because it resides in a memory region that is considered allocated in the old kernel, and the new kernel gets a chance to reinitialize the offending component.

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Alexander Graf <[email protected]>
Acked-by: Baoquan He <[email protected]>
Reviewed-by: Pasha Tatashin <[email protected]>
Cc: Zhongkun He <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
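For illustration, here is a minimal user-space sketch of the opt-out path. It is not part of this commit; the file paths, command line, and error handling are hypothetical, and the KEXEC_FILE_NO_CMA constant requires UAPI headers that already contain this patch. The kernel-side mechanics appear in the kernel/kexec_file.c hunks below.

/* Hypothetical example: load a new kernel via kexec_file_load(2) while
 * forcing the legacy page-list path with KEXEC_FILE_NO_CMA. Omitting
 * the flag lets the kernel try CMA-backed placement first. */
#include <fcntl.h>
#include <linux/kexec.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int kernel_fd = open("/boot/vmlinuz", O_RDONLY);    /* hypothetical path */
	int initrd_fd = open("/boot/initrd.img", O_RDONLY); /* hypothetical path */
	const char *cmdline = "console=ttyS0";              /* hypothetical cmdline */

	if (kernel_fd < 0 || initrd_fd < 0) {
		perror("open");
		return 1;
	}

	/* cmdline_len must include the terminating NUL byte. */
	if (syscall(SYS_kexec_file_load, kernel_fd, initrd_fd,
		    strlen(cmdline) + 1, cmdline, KEXEC_FILE_NO_CMA) < 0) {
		perror("kexec_file_load");
		return 1;
	}
	return 0;
}

After a successful load, executing the image (e.g. with kexec -e) proceeds as usual; the flag only affects where the incoming kernel is staged.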
1 parent ed4f142 commit 07d2490

7 files changed: 156 additions, 11 deletions

arch/riscv/kernel/kexec_elf.c

Lines changed: 1 addition & 0 deletions
@@ -95,6 +95,7 @@ static int elf_find_pbase(struct kimage *image, unsigned long kernel_len,
 	kbuf.buf_align = PMD_SIZE;
 	kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
 	kbuf.memsz = ALIGN(kernel_len, PAGE_SIZE);
+	kbuf.cma = NULL;
 	kbuf.top_down = false;
 	ret = arch_kexec_locate_mem_hole(&kbuf);
 	if (!ret) {

include/linux/kexec.h

Lines changed: 10 additions & 0 deletions
@@ -79,6 +79,12 @@ extern note_buf_t __percpu *crash_notes;
 
 typedef unsigned long kimage_entry_t;
 
+/*
+ * This is a copy of the UAPI struct kexec_segment and must be identical
+ * to it because it gets copied straight from user space into kernel
+ * memory. Do not modify this structure unless you change the way segments
+ * get ingested from user space.
+ */
 struct kexec_segment {
 	/*
 	 * This pointer can point to user memory if kexec_load() system
@@ -172,6 +178,7 @@ int kexec_image_post_load_cleanup_default(struct kimage *image);
  * @buf_align:	Minimum alignment needed.
  * @buf_min:	The buffer can't be placed below this address.
  * @buf_max:	The buffer can't be placed above this address.
+ * @cma:	CMA page if the buffer is backed by CMA.
  * @top_down:	Allocate from top of memory.
  * @random:	Place the buffer at a random position.
  */
@@ -184,6 +191,7 @@ struct kexec_buf {
 	unsigned long buf_align;
 	unsigned long buf_min;
 	unsigned long buf_max;
+	struct page *cma;
 	bool top_down;
 #ifdef CONFIG_CRASH_DUMP
 	bool random;
@@ -340,6 +348,7 @@ struct kimage {
 
 	unsigned long nr_segments;
 	struct kexec_segment segment[KEXEC_SEGMENT_MAX];
+	struct page *segment_cma[KEXEC_SEGMENT_MAX];
 
 	struct list_head control_pages;
 	struct list_head dest_pages;
@@ -361,6 +370,7 @@ struct kimage {
 	 */
 	unsigned int hotplug_support:1;
 #endif
+	unsigned int no_cma:1;
 
 #ifdef ARCH_HAS_KIMAGE_ARCH
 	struct kimage_arch arch;

include/uapi/linux/kexec.h

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@
 #define KEXEC_FILE_ON_CRASH	0x00000002
 #define KEXEC_FILE_NO_INITRAMFS	0x00000004
 #define KEXEC_FILE_DEBUG	0x00000008
+#define KEXEC_FILE_NO_CMA	0x00000010
 
 /* These values match the ELF architecture values.
  * Unless there is a good reason that should continue to be the case.

kernel/kexec.c

Lines changed: 1 addition & 1 deletion
@@ -152,7 +152,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
 		goto out;
 
 	for (i = 0; i < nr_segments; i++) {
-		ret = kimage_load_segment(image, &image->segment[i]);
+		ret = kimage_load_segment(image, i);
 		if (ret)
 			goto out;
 	}

kernel/kexec_core.c

Lines changed: 92 additions & 8 deletions
@@ -40,6 +40,7 @@
 #include <linux/hugetlb.h>
 #include <linux/objtool.h>
 #include <linux/kmsg_dump.h>
+#include <linux/dma-map-ops.h>
 
 #include <asm/page.h>
 #include <asm/sections.h>
@@ -553,6 +554,24 @@ static void kimage_free_entry(kimage_entry_t entry)
 	kimage_free_pages(page);
 }
 
+static void kimage_free_cma(struct kimage *image)
+{
+	unsigned long i;
+
+	for (i = 0; i < image->nr_segments; i++) {
+		struct page *cma = image->segment_cma[i];
+		u32 nr_pages = image->segment[i].memsz >> PAGE_SHIFT;
+
+		if (!cma)
+			continue;
+
+		arch_kexec_pre_free_pages(page_address(cma), nr_pages);
+		dma_release_from_contiguous(NULL, cma, nr_pages);
+		image->segment_cma[i] = NULL;
+	}
+
+}
+
 void kimage_free(struct kimage *image)
 {
 	kimage_entry_t *ptr, entry;
@@ -591,6 +610,9 @@ void kimage_free(struct kimage *image)
 	/* Free the kexec control pages... */
 	kimage_free_page_list(&image->control_pages);
 
+	/* Free CMA allocations */
+	kimage_free_cma(image);
+
 	/*
	 * Free up any temporary buffers allocated. This might hit if
	 * error occurred much later after buffer allocation.
@@ -716,9 +738,69 @@ static struct page *kimage_alloc_page(struct kimage *image,
 	return page;
 }
 
-static int kimage_load_normal_segment(struct kimage *image,
-				      struct kexec_segment *segment)
+static int kimage_load_cma_segment(struct kimage *image, int idx)
+{
+	struct kexec_segment *segment = &image->segment[idx];
+	struct page *cma = image->segment_cma[idx];
+	char *ptr = page_address(cma);
+	unsigned long maddr;
+	size_t ubytes, mbytes;
+	int result = 0;
+	unsigned char __user *buf = NULL;
+	unsigned char *kbuf = NULL;
+
+	if (image->file_mode)
+		kbuf = segment->kbuf;
+	else
+		buf = segment->buf;
+	ubytes = segment->bufsz;
+	mbytes = segment->memsz;
+	maddr = segment->mem;
+
+	/* Then copy from source buffer to the CMA one */
+	while (mbytes) {
+		size_t uchunk, mchunk;
+
+		ptr += maddr & ~PAGE_MASK;
+		mchunk = min_t(size_t, mbytes,
+			       PAGE_SIZE - (maddr & ~PAGE_MASK));
+		uchunk = min(ubytes, mchunk);
+
+		if (uchunk) {
+			/* For file based kexec, source pages are in kernel memory */
+			if (image->file_mode)
+				memcpy(ptr, kbuf, uchunk);
+			else
+				result = copy_from_user(ptr, buf, uchunk);
+			ubytes -= uchunk;
+			if (image->file_mode)
+				kbuf += uchunk;
+			else
+				buf += uchunk;
+		}
+
+		if (result) {
+			result = -EFAULT;
+			goto out;
+		}
+
+		ptr += mchunk;
+		maddr += mchunk;
+		mbytes -= mchunk;
+
+		cond_resched();
+	}
+
+	/* Clear any remainder */
+	memset(ptr, 0, mbytes);
+
+out:
+	return result;
+}
+
+static int kimage_load_normal_segment(struct kimage *image, int idx)
 {
+	struct kexec_segment *segment = &image->segment[idx];
 	unsigned long maddr;
 	size_t ubytes, mbytes;
 	int result;
@@ -733,6 +815,9 @@ static int kimage_load_normal_segment(struct kimage *image,
 	mbytes = segment->memsz;
 	maddr = segment->mem;
 
+	if (image->segment_cma[idx])
+		return kimage_load_cma_segment(image, idx);
+
 	result = kimage_set_destination(image, maddr);
 	if (result < 0)
 		goto out;
@@ -787,13 +872,13 @@ static int kimage_load_normal_segment(struct kimage *image,
 }
 
 #ifdef CONFIG_CRASH_DUMP
-static int kimage_load_crash_segment(struct kimage *image,
-				     struct kexec_segment *segment)
+static int kimage_load_crash_segment(struct kimage *image, int idx)
 {
 	/* For crash dumps kernels we simply copy the data from
	 * user space to it's destination.
	 * We do things a page at a time for the sake of kmap.
	 */
+	struct kexec_segment *segment = &image->segment[idx];
 	unsigned long maddr;
 	size_t ubytes, mbytes;
 	int result;
@@ -858,18 +943,17 @@ static int kimage_load_crash_segment(struct kimage *image,
 }
 #endif
 
-int kimage_load_segment(struct kimage *image,
-			struct kexec_segment *segment)
+int kimage_load_segment(struct kimage *image, int idx)
 {
 	int result = -ENOMEM;
 
 	switch (image->type) {
 	case KEXEC_TYPE_DEFAULT:
-		result = kimage_load_normal_segment(image, segment);
+		result = kimage_load_normal_segment(image, idx);
 		break;
 #ifdef CONFIG_CRASH_DUMP
 	case KEXEC_TYPE_CRASH:
-		result = kimage_load_crash_segment(image, segment);
+		result = kimage_load_crash_segment(image, idx);
 		break;
 #endif
 	}

kernel/kexec_file.c

Lines changed: 50 additions & 1 deletion
@@ -26,6 +26,7 @@
 #include <linux/kernel_read_file.h>
 #include <linux/syscalls.h>
 #include <linux/vmalloc.h>
+#include <linux/dma-map-ops.h>
 #include "kexec_internal.h"
 
 #ifdef CONFIG_KEXEC_SIG
@@ -253,6 +254,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
 		ret = 0;
 	}
 
+	image->no_cma = !!(flags & KEXEC_FILE_NO_CMA);
+
 	if (cmdline_len) {
 		image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len);
 		if (IS_ERR(image->cmdline_buf)) {
@@ -434,7 +437,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
 			 i, ksegment->buf, ksegment->bufsz, ksegment->mem,
 			 ksegment->memsz);
 
-		ret = kimage_load_segment(image, &image->segment[i]);
+		ret = kimage_load_segment(image, i);
 		if (ret)
 			goto out;
 	}
@@ -663,6 +666,43 @@ static int kexec_walk_resources(struct kexec_buf *kbuf,
 		return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
 }
 
+static int kexec_alloc_contig(struct kexec_buf *kbuf)
+{
+	size_t nr_pages = kbuf->memsz >> PAGE_SHIFT;
+	unsigned long mem;
+	struct page *p;
+
+	/* User space disabled CMA allocations, bail out. */
+	if (kbuf->image->no_cma)
+		return -EPERM;
+
+	/* Skip CMA logic for crash kernel */
+	if (kbuf->image->type == KEXEC_TYPE_CRASH)
+		return -EPERM;
+
+	p = dma_alloc_from_contiguous(NULL, nr_pages, get_order(kbuf->buf_align), true);
+	if (!p)
+		return -ENOMEM;
+
+	pr_debug("allocated %zu DMA pages at 0x%lx", nr_pages, page_to_boot_pfn(p));
+
+	mem = page_to_boot_pfn(p) << PAGE_SHIFT;
+
+	if (kimage_is_destination_range(kbuf->image, mem, mem + kbuf->memsz)) {
+		/* Our region is already in use by a statically defined one. Bail out. */
+		pr_debug("CMA overlaps existing mem: 0x%lx+0x%lx\n", mem, kbuf->memsz);
+		dma_release_from_contiguous(NULL, p, nr_pages);
+		return -EBUSY;
+	}
+
+	kbuf->mem = page_to_boot_pfn(p) << PAGE_SHIFT;
+	kbuf->cma = p;
+
+	arch_kexec_post_alloc_pages(page_address(p), (int)nr_pages, 0);
+
+	return 0;
+}
+
 /**
  * kexec_locate_mem_hole - find free memory for the purgatory or the next kernel
  * @kbuf:	Parameters for the memory search.
@@ -687,6 +727,13 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
 	if (ret <= 0)
 		return ret;
 
+	/*
+	 * Try to find a free physically contiguous block of memory first. With that, we
+	 * can avoid any copying at kexec time.
+	 */
+	if (!kexec_alloc_contig(kbuf))
+		return 0;
+
 	if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
 		ret = kexec_walk_resources(kbuf, locate_mem_hole_callback);
 	else
@@ -732,6 +779,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
 	/* Ensure minimum alignment needed for segments. */
 	kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE);
 	kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE);
+	kbuf->cma = NULL;
 
 	/* Walk the RAM ranges and allocate a suitable range for the buffer */
 	ret = arch_kexec_locate_mem_hole(kbuf);
@@ -744,6 +792,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
 	ksegment->bufsz = kbuf->bufsz;
 	ksegment->mem = kbuf->mem;
 	ksegment->memsz = kbuf->memsz;
+	kbuf->image->segment_cma[kbuf->image->nr_segments] = kbuf->cma;
 	kbuf->image->nr_segments++;
 	return 0;
 }

kernel/kexec_internal.h

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ struct kimage *do_kimage_alloc_init(void);
 int sanity_check_segment_list(struct kimage *image);
 void kimage_free_page_list(struct list_head *list);
 void kimage_free(struct kimage *image);
-int kimage_load_segment(struct kimage *image, struct kexec_segment *segment);
+int kimage_load_segment(struct kimage *image, int idx);
 void kimage_terminate(struct kimage *image);
 int kimage_is_destination_range(struct kimage *image,
 		unsigned long start, unsigned long end);
