diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 204b2e72966790..351e853fd28dde 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -63,11 +63,46 @@ USER_CPY(9996f, 0, cpyfert [\dst]!, [\src]!, \count!) .endm + .macro ldpc1 reg1, reg2, ptr, val + user_ldp 9997f, \reg1, \reg2, \ptr, \val, #16 + .endm + + .macro stpc1 reg1, reg2, ptr, val + stp \reg1, \reg2, [\ptr], \val + .endm + end .req x5 -srcin .req x15 +dstin .req x6 +req_reg_pcuabi srcin, c15, x15 SYM_FUNC_START(COPY_FUNC_NAME) add end, x0, x2 + mov dstin, x0 +#ifdef CONFIG_CHERI_PURECAP_UABI +.arch morello+c64 + bx #4 + /* + * Having switched to C64, argumentless RET is equivalent to RET CLR. + * Because we have been called from A64, only LR is set. We therefore + * set CLR to a valid capability, derived from PCC (as if we had been + * called from C64). Conveniently this will also automatically switch + * us back to A64 when returning (as the LSB of LR should be unset). + */ + cvtp clr, lr + /* + * Accessing memory via X registers in C64 requires using + * alternate-base loads and stores; unfortunately most loads and stores + * used in copy_template.S do not have an alternate-base counterpart. + * The most straightforward solution is to access memory via C + * registers only. We therefore need to create a valid capability for + * the kernel buffer too, which is done by deriving it from DDC. Since + * X-based accesses are validated against DDC, this is functionally + * equivalent. 
+ */ + cvtd c0, x0 + mov srcin, c1 +#else mov srcin, x1 +#endif #include "copy_template.S" mov x0, #0 // Nothing to copy ret @@ -75,13 +110,13 @@ SYM_FUNC_START(COPY_FUNC_NAME) // Exception fixups 9996: b.cs 9997f // Registers are in Option A format - add dst, dst, count -9997: cmp dst, dstin + add dstx, dstx, count +9997: cmp dstx, dstin b.ne 9998f // Before being absolutely sure we couldn't copy anything, try harder USER(9998f, ldtrb tmp1w, [srcin]) strb tmp1w, [dst], #1 -9998: sub x0, end, dst // bytes not copied +9998: sub x0, end, dstx // bytes not copied ret SYM_FUNC_END(COPY_FUNC_NAME) EXPORT_SYMBOL(COPY_FUNC_NAME) diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S index 7f2f5a0e2fb9f0..459983994cfb66 100644 --- a/arch/arm64/lib/copy_template.S +++ b/arch/arm64/lib/copy_template.S @@ -18,17 +18,16 @@ * x0 - dest * x1 - src * x2 - n - * Returns: - * x0 - dest */ -dstin .req x0 -src .req x1 +req_reg_pcuabi dst, c0, x0 +dstx .req x0 +req_reg_pcuabi src, c1, x1 +srcx .req x1 count .req x2 tmp1 .req x3 tmp1w .req w3 tmp2 .req x4 tmp2w .req w4 -dst .req x6 A_l .req x7 A_h .req x8 @@ -39,7 +38,15 @@ C_h .req x12 D_l .req x13 D_h .req x14 - mov dst, dstin +#ifdef COPY_CAPTAGS +tmp1c .req c3 +tmp2c .req c4 + +Ac_l .req c7 +Ac_h .req c8 +Bc_l .req c9 +Bc_h .req c10 +#endif #ifdef CONFIG_AS_HAS_MOPS alternative_if_not ARM64_HAS_MOPS @@ -54,7 +61,7 @@ alternative_else_nop_endif /*When memory length is less than 16, the accessed are not aligned.*/ b.lo .Ltiny15 - neg tmp2, src + neg tmp2, srcx ands tmp2, tmp2, #15/* Bytes to reach alignment. 
*/ b.eq .LSrcAligned sub count, count, tmp2 @@ -81,6 +88,11 @@ alternative_else_nop_endif str1 tmp1, dst, #8 .LSrcAligned: +#ifdef COPY_CAPTAGS + /* src now 16-byte aligned, copy capability tags if dst also aligned */ + tst dstx, #15 + b.eq .LSrcAligned_cpycaps +#endif cmp count, #64 b.ge .Lcpy_over64 /* @@ -188,4 +200,68 @@ alternative_else_nop_endif tst count, #0x3f b.ne .Ltail63 +#ifdef COPY_CAPTAGS + b .Lexitfunc + + /* + * The .L*_cpycaps instruction sequences below are copies of the + * sequences above. + * The only functional difference is that they use capability + * loads/stores, such that capability tags are copied from the source to + * the destination. For that reason, they require both src and dst to be + * 16-byte aligned. + * Because C registers are twice as big as X registers, we only need + * half the load/store instructions to transfer the same amount of data. + */ +.LSrcAligned_cpycaps: + cmp count, #64 + b.ge .Lcpy_over64_cpycaps + +.Ltail63_cpycaps: + ands tmp1, count, #0x30 + b.eq .Ltiny15 + cmp tmp1w, #0x20 + b.eq 1f + b.lt 2f + ldr1 tmp2c, src, #16 + str1 tmp2c, dst, #16 +1: + ldr1 tmp2c, src, #16 + str1 tmp2c, dst, #16 +2: + ldr1 tmp2c, src, #16 + str1 tmp2c, dst, #16 + b .Ltiny15 + +.Lcpy_over64_cpycaps: + subs count, count, #128 + b.ge .Lcpy_body_large_cpycaps + + ldpc1 Ac_l, Ac_h, src, #32 + stpc1 Ac_l, Ac_h, dst, #32 + ldpc1 Bc_l, Bc_h, src, #32 + stpc1 Bc_l, Bc_h, dst, #32 + + tst count, #0x3f + b.ne .Ltail63_cpycaps + b .Lexitfunc + + .p2align L1_CACHE_SHIFT +.Lcpy_body_large_cpycaps: + ldpc1 Ac_l, Ac_h, src, #32 + ldpc1 Bc_l, Bc_h, src, #32 +1: + /* attempt to keep 64-byte blocks of loads and stores interlaced */ + stpc1 Ac_l, Ac_h, dst, #32 + ldpc1 Ac_l, Ac_h, src, #32 + stpc1 Bc_l, Bc_h, dst, #32 + ldpc1 Bc_l, Bc_h, src, #32 + subs count, count, #64 + b.ge 1b + stpc1 Ac_l, Ac_h, dst, #32 + stpc1 Bc_l, Bc_h, dst, #32 + + tst count, #0x3f + b.ne .Ltail63_cpycaps +#endif /* COPY_CAPTAGS */ .Lexitfunc: diff --git 
a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index f4ac5fec80fe49..32e3a6b8fe94db 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -62,11 +62,30 @@ USER_CPY(9996f, 1, cpyfewt [\dst]!, [\src]!, \count!) .endm + .macro ldpc1 reg1, reg2, ptr, val + ldp \reg1, \reg2, [\ptr], \val + .endm + + .macro stpc1 reg1, reg2, ptr, val + user_stp 9997f, \reg1, \reg2, \ptr, \val, #16 + .endm + end .req x5 -srcin .req x15 +dstin .req x6 +req_reg_pcuabi srcin, c15, x15 SYM_FUNC_START(COPY_FUNC_NAME) add end, x0, x2 + mov dstin, x0 +#ifdef CONFIG_CHERI_PURECAP_UABI +.arch morello+c64 + bx #4 + /* See comments in copy_from_user.S */ + cvtp clr, lr + cvtd c1, x1 + mov srcin, c1 +#else mov srcin, x1 +#endif #include "copy_template.S" mov x0, #0 ret @@ -74,14 +93,14 @@ SYM_FUNC_START(COPY_FUNC_NAME) // Exception fixups 9996: b.cs 9997f // Registers are in Option A format - add dst, dst, count -9997: cmp dst, dstin + add dstx, dstx, count +9997: cmp dstx, dstin b.ne 9998f // Before being absolutely sure we couldn't copy anything, try harder ldrb tmp1w, [srcin] USER(9998f, sttrb tmp1w, [dst]) add dst, dst, #1 -9998: sub x0, end, dst // bytes not copied +9998: sub x0, end, dstx // bytes not copied ret SYM_FUNC_END(COPY_FUNC_NAME) EXPORT_SYMBOL(COPY_FUNC_NAME) diff --git a/tools/testing/selftests/arm64/morello/freestanding_init_globals.c b/tools/testing/selftests/arm64/morello/freestanding_init_globals.c index 12eb0af97887b6..2fa2aa643fd60d 100644 --- a/tools/testing/selftests/arm64/morello/freestanding_init_globals.c +++ b/tools/testing/selftests/arm64/morello/freestanding_init_globals.c @@ -38,6 +38,9 @@ #ifndef R_MORELLO_RELATIVE #define R_MORELLO_RELATIVE 59395 #endif +#ifndef R_MORELLO_FUNC_RELATIVE +#define R_MORELLO_FUNC_RELATIVE 59400 +#endif struct cap_reloc { size_t capability_location; @@ -181,7 +184,8 @@ void __morello_process_dynamic_relocs(void *auxv) for (reloc = rela_dyn_start; reloc < rela_dyn_end; ++reloc) { uintptr_t 
*reloc_addr, value; - if (reloc->r_info != R_MORELLO_RELATIVE) + if (reloc->r_info != R_MORELLO_RELATIVE && + reloc->r_info != R_MORELLO_FUNC_RELATIVE) continue; reloc_addr = (uintptr_t *)cheri_address_set(cap_rw, reloc->r_offset); value = morello_relative(0, cap_rx, cap_rw, reloc, reloc_addr);