kernel_ldr: use unoptimized memcpy before mmu bringup (closes #1102)

Before the MMU is up, all reads/writes must be aligned; the optimized
memcpy implementation does not guarantee all reads/writes it performs
are aligned.

This commit splits the libc implementation so that kernel and
kernel_ldr are configured separately; only kernel now uses the
optimized implementation. This is safe, as the MMU is brought up
before kernel begins executing.
Author: Michael Scire
Date:   2020-08-02 14:30:06 -07:00
Parent: a82914d58e
Commit: 29358dc593

10 changed files with 723 additions and 0 deletions
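
For reference, the alignment constraint exists because, with the MMU off, AArch64 data accesses are treated as Device memory and must be naturally aligned, while the optimized routines added below freely issue unaligned ldr/ldp at arbitrary offsets (e.g. loading from srcend - 16). A minimal C sketch (not the code in this commit) of the kind of copy that is always safe in that environment:

#include <stddef.h>
#include <stdint.h>

/* Illustration only: a byte-at-a-time copy never performs an access wider
 * than one byte, so every access is trivially aligned even with the MMU off.
 * The generic newlib memcpy included in this commit only widens to word
 * accesses when both pointers are already word-aligned. */
static void *copy_bytes(void *dst, const void *src, size_t n) {
    uint8_t *d = (uint8_t *)dst;
    const uint8_t *s = (const uint8_t *)src;
    while (n--) {
        *d++ = *s++;
    }
    return dst;
}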


@@ -0,0 +1,31 @@
/*
* Macros for asm code.
*
* Copyright (c) 2019, Arm Limited.
* SPDX-License-Identifier: MIT
*/
#ifndef _ASMDEFS_H
#define _ASMDEFS_H
#define ENTRY_ALIGN(name, alignment) \
.global name; \
.type name,%function; \
.align alignment; \
name: \
.cfi_startproc;
#define ENTRY(name) ENTRY_ALIGN(name, 6)
#define ENTRY_ALIAS(name) \
.global name; \
.type name,%function; \
name:
#define END(name) \
.cfi_endproc; \
.size name, .-name;
#define L(l) .L ## l
#endif


@@ -0,0 +1,133 @@
/* memcmp - compare memory
*
* Copyright (c) 2013, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
* ARMv8-a, AArch64, unaligned accesses.
*/
#include "asmdefs.h"
/* Parameters and result. */
#define src1 x0
#define src2 x1
#define limit x2
#define result w0
/* Internal variables. */
#define data1 x3
#define data1w w3
#define data1h x4
#define data2 x5
#define data2w w5
#define data2h x6
#define tmp1 x7
#define tmp2 x8
ENTRY (memcmp)
subs limit, limit, 8
b.lo L(less8)
ldr data1, [src1], 8
ldr data2, [src2], 8
cmp data1, data2
b.ne L(return)
subs limit, limit, 8
b.gt L(more16)
ldr data1, [src1, limit]
ldr data2, [src2, limit]
b L(return)
L(more16):
ldr data1, [src1], 8
ldr data2, [src2], 8
cmp data1, data2
bne L(return)
/* Jump directly to comparing the last 16 bytes for 32 byte (or less)
strings. */
subs limit, limit, 16
b.ls L(last_bytes)
/* We overlap loads between 0-32 bytes at either side of SRC1 when we
try to align, so limit it only to strings larger than 128 bytes. */
cmp limit, 96
b.ls L(loop16)
/* Align src1 and adjust src2 with bytes not yet done. */
and tmp1, src1, 15
add limit, limit, tmp1
sub src1, src1, tmp1
sub src2, src2, tmp1
/* Loop performing 16 bytes per iteration using aligned src1.
Limit is pre-decremented by 16 and must be larger than zero.
Exit if <= 16 bytes left to do or if the data is not equal. */
.p2align 4
L(loop16):
ldp data1, data1h, [src1], 16
ldp data2, data2h, [src2], 16
subs limit, limit, 16
ccmp data1, data2, 0, hi
ccmp data1h, data2h, 0, eq
b.eq L(loop16)
cmp data1, data2
bne L(return)
mov data1, data1h
mov data2, data2h
cmp data1, data2
bne L(return)
/* Compare last 1-16 bytes using unaligned access. */
L(last_bytes):
add src1, src1, limit
add src2, src2, limit
ldp data1, data1h, [src1]
ldp data2, data2h, [src2]
cmp data1, data2
bne L(return)
mov data1, data1h
mov data2, data2h
cmp data1, data2
/* Compare data bytes and set return value to 0, -1 or 1. */
L(return):
#ifndef __AARCH64EB__
rev data1, data1
rev data2, data2
#endif
cmp data1, data2
L(ret_eq):
cset result, ne
cneg result, result, lo
ret
.p2align 4
/* Compare up to 8 bytes. Limit is [-8..-1]. */
L(less8):
adds limit, limit, 4
b.lo L(less4)
ldr data1w, [src1], 4
ldr data2w, [src2], 4
cmp data1w, data2w
b.ne L(return)
sub limit, limit, 4
L(less4):
adds limit, limit, 4
beq L(ret_eq)
L(byte_loop):
ldrb data1w, [src1], 1
ldrb data2w, [src2], 1
subs limit, limit, 1
ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
b.eq L(byte_loop)
sub result, data1w, data2w
ret
END (memcmp)
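
For reference, the subtle part of the file above is the L(return) sequence: on a little-endian configuration the 64-bit chunks were loaded little-endian, so rev byte-reverses them to make the lowest-addressed byte most significant before the unsigned compare. A C model of that step (sketch only; cmp_chunk is a hypothetical name, not part of the source):

#include <stdint.h>

/* Model of memcmp.S's L(return): byte-swap both chunks so the
 * lowest-addressed byte becomes most significant, then an unsigned compare
 * yields the bytewise (memcmp) ordering as -1, 0 or 1. */
static int cmp_chunk(uint64_t data1, uint64_t data2) {
    data1 = __builtin_bswap64(data1);   /* rev data1 */
    data2 = __builtin_bswap64(data2);   /* rev data2 */
    if (data1 == data2)
        return 0;                       /* cset result, ne */
    return (data1 < data2) ? -1 : 1;    /* cneg result, result, lo */
}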


@@ -0,0 +1,239 @@
/*
* memcpy - copy memory area
*
* Copyright (c) 2012-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
* ARMv8-a, AArch64, unaligned accesses.
*
*/
#include "asmdefs.h"
#define dstin x0
#define src x1
#define count x2
#define dst x3
#define srcend x4
#define dstend x5
#define A_l x6
#define A_lw w6
#define A_h x7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_l x10
#define C_lw w10
#define C_h x11
#define D_l x12
#define D_h x13
#define E_l x14
#define E_h x15
#define F_l x16
#define F_h x17
#define G_l count
#define G_h dst
#define H_l src
#define H_h srcend
#define tmp1 x14
/* This implementation handles overlaps and supports both memcpy and memmove
from a single entry point. It uses unaligned accesses and branchless
sequences to keep the code small, simple and improve performance.
Copies are split into 3 main cases: small copies of up to 32 bytes, medium
copies of up to 128 bytes, and large copies. The overhead of the overlap
check is negligible since it is only required for large copies.
Large copies use a software pipelined loop processing 64 bytes per iteration.
The destination pointer is 16-byte aligned to minimize unaligned accesses.
The loop tail is handled by always copying 64 bytes from the end.
*/
ENTRY (memcpy)
ENTRY_ALIAS (memmove)
add srcend, src, count
add dstend, dstin, count
cmp count, 128
b.hi L(copy_long)
cmp count, 32
b.hi L(copy32_128)
/* Small copies: 0..32 bytes. */
cmp count, 16
b.lo L(copy16)
ldp A_l, A_h, [src]
ldp D_l, D_h, [srcend, -16]
stp A_l, A_h, [dstin]
stp D_l, D_h, [dstend, -16]
ret
/* Copy 8-15 bytes. */
L(copy16):
tbz count, 3, L(copy8)
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
.p2align 3
/* Copy 4-7 bytes. */
L(copy8):
tbz count, 2, L(copy4)
ldr A_lw, [src]
ldr B_lw, [srcend, -4]
str A_lw, [dstin]
str B_lw, [dstend, -4]
ret
/* Copy 0..3 bytes using a branchless sequence. */
L(copy4):
cbz count, L(copy0)
lsr tmp1, count, 1
ldrb A_lw, [src]
ldrb C_lw, [srcend, -1]
ldrb B_lw, [src, tmp1]
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
strb C_lw, [dstend, -1]
L(copy0):
ret
.p2align 4
/* Medium copies: 33..128 bytes. */
L(copy32_128):
ldp A_l, A_h, [src]
ldp B_l, B_h, [src, 16]
ldp C_l, C_h, [srcend, -32]
ldp D_l, D_h, [srcend, -16]
cmp count, 64
b.hi L(copy128)
stp A_l, A_h, [dstin]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstend, -32]
stp D_l, D_h, [dstend, -16]
ret
.p2align 4
/* Copy 65..128 bytes. */
L(copy128):
ldp E_l, E_h, [src, 32]
ldp F_l, F_h, [src, 48]
cmp count, 96
b.ls L(copy96)
ldp G_l, G_h, [srcend, -64]
ldp H_l, H_h, [srcend, -48]
stp G_l, G_h, [dstend, -64]
stp H_l, H_h, [dstend, -48]
L(copy96):
stp A_l, A_h, [dstin]
stp B_l, B_h, [dstin, 16]
stp E_l, E_h, [dstin, 32]
stp F_l, F_h, [dstin, 48]
stp C_l, C_h, [dstend, -32]
stp D_l, D_h, [dstend, -16]
ret
.p2align 4
/* Copy more than 128 bytes. */
L(copy_long):
/* Use backwards copy if there is an overlap. */
sub tmp1, dstin, src
cbz tmp1, L(copy0)
cmp tmp1, count
b.lo L(copy_long_backwards)
/* Copy 16 bytes and then align dst to 16-byte alignment. */
ldp D_l, D_h, [src]
and tmp1, dstin, 15
bic dst, dstin, 15
sub src, src, tmp1
add count, count, tmp1 /* Count is now 16 too large. */
ldp A_l, A_h, [src, 16]
stp D_l, D_h, [dstin]
ldp B_l, B_h, [src, 32]
ldp C_l, C_h, [src, 48]
ldp D_l, D_h, [src, 64]!
subs count, count, 128 + 16 /* Test and readjust count. */
b.ls L(copy64_from_end)
L(loop64):
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [src, 16]
stp B_l, B_h, [dst, 32]
ldp B_l, B_h, [src, 32]
stp C_l, C_h, [dst, 48]
ldp C_l, C_h, [src, 48]
stp D_l, D_h, [dst, 64]!
ldp D_l, D_h, [src, 64]!
subs count, count, 64
b.hi L(loop64)
/* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
ldp E_l, E_h, [srcend, -64]
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [srcend, -48]
stp B_l, B_h, [dst, 32]
ldp B_l, B_h, [srcend, -32]
stp C_l, C_h, [dst, 48]
ldp C_l, C_h, [srcend, -16]
stp D_l, D_h, [dst, 64]
stp E_l, E_h, [dstend, -64]
stp A_l, A_h, [dstend, -48]
stp B_l, B_h, [dstend, -32]
stp C_l, C_h, [dstend, -16]
ret
.p2align 4
/* Large backwards copy for overlapping copies.
Copy 16 bytes and then align dst to 16-byte alignment. */
L(copy_long_backwards):
ldp D_l, D_h, [srcend, -16]
and tmp1, dstend, 15
sub srcend, srcend, tmp1
sub count, count, tmp1
ldp A_l, A_h, [srcend, -16]
stp D_l, D_h, [dstend, -16]
ldp B_l, B_h, [srcend, -32]
ldp C_l, C_h, [srcend, -48]
ldp D_l, D_h, [srcend, -64]!
sub dstend, dstend, tmp1
subs count, count, 128
b.ls L(copy64_from_start)
L(loop64_backwards):
stp A_l, A_h, [dstend, -16]
ldp A_l, A_h, [srcend, -16]
stp B_l, B_h, [dstend, -32]
ldp B_l, B_h, [srcend, -32]
stp C_l, C_h, [dstend, -48]
ldp C_l, C_h, [srcend, -48]
stp D_l, D_h, [dstend, -64]!
ldp D_l, D_h, [srcend, -64]!
subs count, count, 64
b.hi L(loop64_backwards)
/* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
ldp G_l, G_h, [src, 48]
stp A_l, A_h, [dstend, -16]
ldp A_l, A_h, [src, 32]
stp B_l, B_h, [dstend, -32]
ldp B_l, B_h, [src, 16]
stp C_l, C_h, [dstend, -48]
ldp C_l, C_h, [src]
stp D_l, D_h, [dstend, -64]
stp G_l, G_h, [dstin, 48]
stp A_l, A_h, [dstin, 32]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstin]
ret
END (memcpy)
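
For reference, the "copy from both ends" idea described in the header comment is what lets the small and medium cases run without a length-dependent loop. A C model of the 16..32-byte path (sketch only; copy16_32 is a hypothetical name):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Two 16-byte chunks, one anchored at src and one at src + count - 16,
 * overlap as needed and together cover every count in [16, 32]; both loads
 * happen before both stores, mirroring the ldp/ldp/stp/stp order above. */
static void copy16_32(uint8_t *dst, const uint8_t *src, size_t count) {
    uint8_t head[16], tail[16];
    memcpy(head, src, 16);                /* ldp A_l, A_h, [src]         */
    memcpy(tail, src + count - 16, 16);   /* ldp D_l, D_h, [srcend, -16] */
    memcpy(dst, head, 16);                /* stp A_l, A_h, [dstin]       */
    memcpy(dst + count - 16, tail, 16);   /* stp D_l, D_h, [dstend, -16] */
}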


@@ -0,0 +1,172 @@
/*
* memset - fill memory with a constant byte
*
* Copyright (c) 2012-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*
*/
#include "asmdefs.h"
#define DC_ZVA_THRESHOLD 512
#define dstin x0
#define val x1
#define valw w1
#define count x2
#define dst x3
#define dstend x4
#define zva_val x5
ENTRY (memset)
bfi valw, valw, 8, 8
bfi valw, valw, 16, 16
bfi val, val, 32, 32
add dstend, dstin, count
cmp count, 96
b.hi L(set_long)
cmp count, 16
b.hs L(set_medium)
/* Set 0..15 bytes. */
tbz count, 3, 1f
str val, [dstin]
str val, [dstend, -8]
ret
1: tbz count, 2, 2f
str valw, [dstin]
str valw, [dstend, -4]
ret
2: cbz count, 3f
strb valw, [dstin]
tbz count, 1, 3f
strh valw, [dstend, -2]
3: ret
/* Set 16..96 bytes. */
.p2align 4
L(set_medium):
stp val, val, [dstin]
tbnz count, 6, L(set96)
stp val, val, [dstend, -16]
tbz count, 5, 1f
stp val, val, [dstin, 16]
stp val, val, [dstend, -32]
1: ret
.p2align 4
/* Set 64..96 bytes. Write 64 bytes from the start and
32 bytes from the end. */
L(set96):
stp val, val, [dstin, 16]
stp val, val, [dstin, 32]
stp val, val, [dstin, 48]
stp val, val, [dstend, -32]
stp val, val, [dstend, -16]
ret
.p2align 4
L(set_long):
stp val, val, [dstin]
#if DC_ZVA_THRESHOLD
cmp count, DC_ZVA_THRESHOLD
ccmp val, 0, 0, cs
bic dst, dstin, 15
b.eq L(zva_64)
#else
bic dst, dstin, 15
#endif
/* Small-size or non-zero memset does not use DC ZVA. */
sub count, dstend, dst
/*
* Adjust count and bias for loop. By subtracting an extra 1 from count,
* it is easy to use the tbz instruction to check whether the loop tail
* count is less than 33 bytes, so as to bypass 2 unnecessary stps.
*/
sub count, count, 64+16+1
#if DC_ZVA_THRESHOLD
/* Align the loop on a 16-byte boundary; this might be friendly to the i-cache. */
nop
#endif
1: stp val, val, [dst, 16]
stp val, val, [dst, 32]
stp val, val, [dst, 48]
stp val, val, [dst, 64]!
subs count, count, 64
b.hs 1b
tbz count, 5, 1f /* Remaining count is less than 33 bytes? */
stp val, val, [dst, 16]
stp val, val, [dst, 32]
1: stp val, val, [dstend, -32]
stp val, val, [dstend, -16]
ret
#if DC_ZVA_THRESHOLD
.p2align 4
L(zva_64):
stp val, val, [dst, 16]
stp val, val, [dst, 32]
stp val, val, [dst, 48]
bic dst, dst, 63
/*
* Previous memory writes might cross a cache line boundary and leave a
* cache line partially dirty. Zeroing that kind of cache line using
* DC ZVA incurs extra cost, because it requires loading the untouched
* part of the line from memory before zeroing.
*
* So, write the first 64-byte aligned block using stp to force a
* fully dirty cache line.
*/
stp val, val, [dst, 64]
stp val, val, [dst, 80]
stp val, val, [dst, 96]
stp val, val, [dst, 112]
sub count, dstend, dst
/*
* Adjust count and bias for loop. By subtracting an extra 1 from count,
* it is easy to use the tbz instruction to check whether the loop tail
* count is less than 33 bytes, so as to bypass 2 unnecessary stps.
*/
sub count, count, 128+64+64+1
add dst, dst, 128
nop
/* DC ZVA sets 64 bytes each time. */
1: dc zva, dst
add dst, dst, 64
subs count, count, 64
b.hs 1b
/*
* Write the last 64-byte aligned block using stp to force a fully
* dirty cache line.
*/
stp val, val, [dst, 0]
stp val, val, [dst, 16]
stp val, val, [dst, 32]
stp val, val, [dst, 48]
tbz count, 5, 1f /* Remaining count is less than 33 bytes? */
stp val, val, [dst, 64]
stp val, val, [dst, 80]
1: stp val, val, [dstend, -32]
stp val, val, [dstend, -16]
ret
#endif
END (memset)
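
For reference, the three bfi instructions at the top of memset replicate the fill byte across all eight bytes of val so that each stp stores sixteen copies of it. A C model of the broadcast (sketch only; broadcast_byte is a hypothetical name):

#include <stdint.h>

/* Replicate the low byte of c into every byte of a 64-bit value,
 * equivalent to (c & 0xff) * 0x0101010101010101. */
static uint64_t broadcast_byte(int c) {
    uint64_t v = (uint8_t)c;
    v |= v << 8;     /* bfi valw, valw, 8, 8   */
    v |= v << 16;    /* bfi valw, valw, 16, 16 */
    v |= v << 32;    /* bfi val, val, 32, 32   */
    return v;
}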


@@ -0,0 +1,24 @@
/*
* Copyright (c) 2018-2020 Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
/* Definitions for libc genericity. */
#define MESOSPHERE_LIBC_MEMCPY_GENERIC 0
#define MESOSPHERE_LIBC_MEMCMP_GENERIC 0
#define MESOSPHERE_LIBC_MEMMOVE_GENERIC 0
#define MESOSPHERE_LIBC_MEMSET_GENERIC 0
#define MESOSPHERE_LIBC_STRNCPY_GENERIC 1
#define MESOSPHERE_LIBC_STRNCMP_GENERIC 1
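
For reference, these flags feed the #if guards in the shared libc source below: 0 compiles out the generic newlib C routine so the optimized assembly is linked instead, while 1 keeps the generic C fallback, which only widens to word accesses when both pointers are already aligned. The kernel_ldr counterpart is not shown in this excerpt; presumably it enables the generic routines throughout. A hypothetical sketch of such a configuration (an assumption, not the actual file):

/* Hypothetical kernel_ldr configuration (assumption -- the real file is not
 * shown in this excerpt): use only the generic, alignment-safe C routines. */
#define MESOSPHERE_LIBC_MEMCPY_GENERIC  1
#define MESOSPHERE_LIBC_MEMCMP_GENERIC  1
#define MESOSPHERE_LIBC_MEMMOVE_GENERIC 1
#define MESOSPHERE_LIBC_MEMSET_GENERIC  1
#define MESOSPHERE_LIBC_STRNCPY_GENERIC 1
#define MESOSPHERE_LIBC_STRNCMP_GENERIC 1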


@@ -0,0 +1,26 @@
/*
* Copyright (c) 2018-2020 Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#if defined(ATMOSPHERE_ARCH_ARM64)
#include "kern_libc_config.arch.arm64.h"
#else
#error "Unknown architecture for libc"
#endif


@@ -0,0 +1,673 @@
/*
* Copyright (c) 2018-2020 Atmosphère-NX
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <string.h>
#include <stddef.h>
#include <limits.h>
#include "kern_libc_config.h"
/* Note: copied from newlib */
#ifdef __cplusplus
extern "C" {
#endif
/*
FUNCTION
<<memmove>>---move possibly overlapping memory
INDEX
memmove
SYNOPSIS
#include <string.h>
void *memmove(void *<[dst]>, const void *<[src]>, size_t <[length]>);
DESCRIPTION
This function moves <[length]> characters from the block of
memory starting at <<*<[src]>>> to the memory starting at
<<*<[dst]>>>. <<memmove>> reproduces the characters correctly
at <<*<[dst]>>> even if the two areas overlap.
RETURNS
The function returns <[dst]> as passed.
PORTABILITY
<<memmove>> is ANSI C.
<<memmove>> requires no supporting OS subroutines.
QUICKREF
memmove ansi pure
*/
/* Nonzero if either X or Y is not aligned on a "long" boundary. */
#define UNALIGNED(X, Y) \
(((long)X & (sizeof (long) - 1)) | ((long)Y & (sizeof (long) - 1)))
/* How many bytes are copied each iteration of the 4X unrolled loop. */
#define BIGBLOCKSIZE (sizeof (long) << 2)
/* How many bytes are copied each iteration of the word copy loop. */
#define LITTLEBLOCKSIZE (sizeof (long))
/* Threshold for punting to the byte copier. */
#undef TOO_SMALL
#define TOO_SMALL(LEN) ((LEN) < BIGBLOCKSIZE)
#if MESOSPHERE_LIBC_MEMMOVE_GENERIC
/*SUPPRESS 20*/
void *
//__inhibit_loop_to_libcall
__attribute__((weak))
memmove (void *dst_void,
const void *src_void,
size_t length)
{
#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
char *dst = dst_void;
const char *src = src_void;
if (src < dst && dst < src + length)
{
/* Have to copy backwards */
src += length;
dst += length;
while (length--)
{
*--dst = *--src;
}
}
else
{
while (length--)
{
*dst++ = *src++;
}
}
return dst_void;
#else
char *dst = dst_void;
const char *src = src_void;
long *aligned_dst;
const long *aligned_src;
if (src < dst && dst < src + length)
{
/* Destructive overlap...have to copy backwards */
src += length;
dst += length;
while (length--)
{
*--dst = *--src;
}
}
else
{
/* Use optimizing algorithm for a non-destructive copy to closely
match memcpy. If the size is small or either SRC or DST is unaligned,
then punt into the byte copy loop. This should be rare. */
if (!TOO_SMALL(length) && !UNALIGNED (src, dst))
{
aligned_dst = (long*)dst;
aligned_src = (long*)src;
/* Copy 4X long words at a time if possible. */
while (length >= BIGBLOCKSIZE)
{
*aligned_dst++ = *aligned_src++;
*aligned_dst++ = *aligned_src++;
*aligned_dst++ = *aligned_src++;
*aligned_dst++ = *aligned_src++;
length -= BIGBLOCKSIZE;
}
/* Copy one long word at a time if possible. */
while (length >= LITTLEBLOCKSIZE)
{
*aligned_dst++ = *aligned_src++;
length -= LITTLEBLOCKSIZE;
}
/* Pick up any residual with a byte copier. */
dst = (char*)aligned_dst;
src = (char*)aligned_src;
}
while (length--)
{
*dst++ = *src++;
}
}
return dst_void;
#endif /* not PREFER_SIZE_OVER_SPEED */
}
#endif /* MESOSPHERE_LIBC_MEMMOVE_GENERIC */
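/* Note (illustrative, not in the upstream newlib source): unlike memcpy,
 * memmove is well-defined for overlapping regions, e.g.
 *     char buf[8] = "abcdef";
 *     memmove(buf + 2, buf, 4);    -> buf now holds "ababcd"
 * The backwards-copy branch above is what makes this case work. */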
/*
FUNCTION
<<memcpy>>---copy memory regions
SYNOPSIS
#include <string.h>
void* memcpy(void *restrict <[out]>, const void *restrict <[in]>,
size_t <[n]>);
DESCRIPTION
This function copies <[n]> bytes from the memory region
pointed to by <[in]> to the memory region pointed to by
<[out]>.
If the regions overlap, the behavior is undefined.
RETURNS
<<memcpy>> returns a pointer to the first byte of the <[out]>
region.
PORTABILITY
<<memcpy>> is ANSI C.
<<memcpy>> requires no supporting OS subroutines.
QUICKREF
memcpy ansi pure
*/
#if MESOSPHERE_LIBC_MEMCPY_GENERIC
void *
__attribute__((weak))
memcpy (void * dst0,
const void * __restrict src0,
size_t len0)
{
#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
char *dst = (char *) dst0;
char *src = (char *) src0;
void *save = dst0;
while (len0--)
{
*dst++ = *src++;
}
return save;
#else
char *dst = dst0;
const char *src = src0;
long *aligned_dst;
const long *aligned_src;
/* If the size is small, or either SRC or DST is unaligned,
then punt into the byte copy loop. This should be rare. */
if (!TOO_SMALL(len0) && !UNALIGNED (src, dst))
{
aligned_dst = (long*)dst;
aligned_src = (long*)src;
/* Copy 4X long words at a time if possible. */
while (len0 >= BIGBLOCKSIZE)
{
*aligned_dst++ = *aligned_src++;
*aligned_dst++ = *aligned_src++;
*aligned_dst++ = *aligned_src++;
*aligned_dst++ = *aligned_src++;
len0 -= BIGBLOCKSIZE;
}
/* Copy one long word at a time if possible. */
while (len0 >= LITTLEBLOCKSIZE)
{
*aligned_dst++ = *aligned_src++;
len0 -= LITTLEBLOCKSIZE;
}
/* Pick up any residual with a byte copier. */
dst = (char*)aligned_dst;
src = (char*)aligned_src;
}
while (len0--)
*dst++ = *src++;
return dst0;
#endif /* not PREFER_SIZE_OVER_SPEED */
}
#endif /* MESOSPHERE_LIBC_MEMCPY_GENERIC */
/*
FUNCTION
<<memset>>---set an area of memory
INDEX
memset
SYNOPSIS
#include <string.h>
void *memset(void *<[dst]>, int <[c]>, size_t <[length]>);
DESCRIPTION
This function converts the argument <[c]> into an unsigned
char and fills the first <[length]> characters of the array
pointed to by <[dst]> to the value.
RETURNS
<<memset>> returns the value of <[dst]>.
PORTABILITY
<<memset>> is ANSI C.
<<memset>> requires no supporting OS subroutines.
QUICKREF
memset ansi pure
*/
#include <string.h>
#undef LBLOCKSIZE
#undef UNALIGNED
#undef TOO_SMALL
#define LBLOCKSIZE (sizeof(long))
#define UNALIGNED(X) ((long)X & (LBLOCKSIZE - 1))
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
#if MESOSPHERE_LIBC_MEMSET_GENERIC
void *
__attribute__((weak))
memset (void *m,
int c,
size_t n)
{
char *s = (char *) m;
#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
unsigned int i;
unsigned long buffer;
unsigned long *aligned_addr;
unsigned int d = c & 0xff; /* To avoid sign extension, copy C to an
unsigned variable. */
while (UNALIGNED (s))
{
if (n--)
*s++ = (char) c;
else
return m;
}
if (!TOO_SMALL (n))
{
/* If we get this far, we know that n is large and s is word-aligned. */
aligned_addr = (unsigned long *) s;
/* Store D into each char sized location in BUFFER so that
we can set large blocks quickly. */
buffer = (d << 8) | d;
buffer |= (buffer << 16);
for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
buffer = (buffer << i) | buffer;
/* Unroll the loop. */
while (n >= LBLOCKSIZE*4)
{
*aligned_addr++ = buffer;
*aligned_addr++ = buffer;
*aligned_addr++ = buffer;
*aligned_addr++ = buffer;
n -= 4*LBLOCKSIZE;
}
while (n >= LBLOCKSIZE)
{
*aligned_addr++ = buffer;
n -= LBLOCKSIZE;
}
/* Pick up the remainder with a bytewise loop. */
s = (char*)aligned_addr;
}
#endif /* not PREFER_SIZE_OVER_SPEED */
while (n--)
*s++ = (char) c;
return m;
}
#endif /* MESOSPHERE_LIBC_MEMSET_GENERIC */
/*
FUNCTION
<<memcmp>>---compare two memory areas
INDEX
memcmp
SYNOPSIS
#include <string.h>
int memcmp(const void *<[s1]>, const void *<[s2]>, size_t <[n]>);
DESCRIPTION
This function compares not more than <[n]> characters of the
object pointed to by <[s1]> with the object pointed to by <[s2]>.
RETURNS
The function returns an integer greater than, equal to or
less than zero according to whether the object pointed to by
<[s1]> is greater than, equal to or less than the object
pointed to by <[s2]>.
PORTABILITY
<<memcmp>> is ANSI C.
<<memcmp>> requires no supporting OS subroutines.
QUICKREF
memcmp ansi pure
*/
#undef LBLOCKSIZE
#undef UNALIGNED
#undef TOO_SMALL
/* Nonzero if either X or Y is not aligned on a "long" boundary. */
#define UNALIGNED(X, Y) \
(((long)X & (sizeof (long) - 1)) | ((long)Y & (sizeof (long) - 1)))
/* How many bytes are copied each iteration of the word copy loop. */
#define LBLOCKSIZE (sizeof (long))
/* Threshold for punting to the byte copier. */
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
#if MESOSPHERE_LIBC_MEMCMP_GENERIC
int
__attribute__((weak))
memcmp (const void *m1,
const void *m2,
size_t n)
{
#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
unsigned char *s1 = (unsigned char *) m1;
unsigned char *s2 = (unsigned char *) m2;
while (n--)
{
if (*s1 != *s2)
{
return *s1 - *s2;
}
s1++;
s2++;
}
return 0;
#else
unsigned char *s1 = (unsigned char *) m1;
unsigned char *s2 = (unsigned char *) m2;
unsigned long *a1;
unsigned long *a2;
/* If the size is too small, or either pointer is unaligned,
then we punt to the byte compare loop. Hopefully this will
not turn up in inner loops. */
if (!TOO_SMALL(n) && !UNALIGNED(s1,s2))
{
/* Otherwise, load and compare the blocks of memory one
word at a time. */
a1 = (unsigned long*) s1;
a2 = (unsigned long*) s2;
while (n >= LBLOCKSIZE)
{
if (*a1 != *a2)
break;
a1++;
a2++;
n -= LBLOCKSIZE;
}
/* check n mod LBLOCKSIZE remaining characters */
s1 = (unsigned char*)a1;
s2 = (unsigned char*)a2;
}
while (n--)
{
if (*s1 != *s2)
return *s1 - *s2;
s1++;
s2++;
}
return 0;
#endif /* not PREFER_SIZE_OVER_SPEED */
}
#endif /* MESOSPHERE_LIBC_MEMCMP_GENERIC */
/*
FUNCTION
<<strncpy>>---counted copy string
INDEX
strncpy
SYNOPSIS
#include <string.h>
char *strncpy(char *restrict <[dst]>, const char *restrict <[src]>,
size_t <[length]>);
DESCRIPTION
<<strncpy>> copies not more than <[length]> characters from the
the string pointed to by <[src]> (including the terminating
null character) to the array pointed to by <[dst]>. If the
string pointed to by <[src]> is shorter than <[length]>
characters, null characters are appended to the destination
array until a total of <[length]> characters have been
written.
RETURNS
This function returns the initial value of <[dst]>.
PORTABILITY
<<strncpy>> is ANSI C.
<<strncpy>> requires no supporting OS subroutines.
QUICKREF
strncpy ansi pure
*/
#include <string.h>
#include <limits.h>
/*SUPPRESS 560*/
/*SUPPRESS 530*/
/* Nonzero if either X or Y is not aligned on a "long" boundary. */
#define UNALIGNED(X, Y) \
(((long)X & (sizeof (long) - 1)) | ((long)Y & (sizeof (long) - 1)))
#if LONG_MAX == 2147483647L
#define DETECTNULL(X) (((X) - 0x01010101) & ~(X) & 0x80808080)
#else
#if LONG_MAX == 9223372036854775807L
/* Nonzero if X (a long int) contains a NULL byte. */
#define DETECTNULL(X) (((X) - 0x0101010101010101) & ~(X) & 0x8080808080808080)
#else
#error long int is not a 32bit or 64bit type.
#endif
#endif
#ifndef DETECTNULL
#error long int is not a 32bit or 64bit type.
#endif
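/* Note (illustrative, not in the upstream newlib source): DETECTNULL sets the
 * high bit of every byte position that held zero in X, with no false
 * positives. 32-bit worked example with X = 0x61620063 (one zero byte):
 *     X - 0x01010101          = 0x6060FF62
 *     (X - 0x01010101) & ~X   = 0x0000FF00
 *     ...        & 0x80808080 = 0x00008000   (nonzero -> NUL byte present)
 * For X = 0x61626364 (no zero byte) the same expression evaluates to 0. */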
#undef TOO_SMALL
#define TOO_SMALL(LEN) ((LEN) < sizeof (long))
#if MESOSPHERE_LIBC_STRNCPY_GENERIC
char *
strncpy (char *__restrict dst0,
const char *__restrict src0,
size_t count)
{
#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
char *dscan;
const char *sscan;
dscan = dst0;
sscan = src0;
while (count > 0)
{
--count;
if ((*dscan++ = *sscan++) == '\0')
break;
}
while (count-- > 0)
*dscan++ = '\0';
return dst0;
#else
char *dst = dst0;
const char *src = src0;
long *aligned_dst;
const long *aligned_src;
/* If SRC and DEST are aligned and count is large enough, then copy words. */
if (!UNALIGNED (src, dst) && !TOO_SMALL (count))
{
aligned_dst = (long*)dst;
aligned_src = (long*)src;
/* SRC and DEST are both "long int" aligned, try to do "long int"
sized copies. */
while (count >= sizeof (long int) && !DETECTNULL(*aligned_src))
{
count -= sizeof (long int);
*aligned_dst++ = *aligned_src++;
}
dst = (char*)aligned_dst;
src = (char*)aligned_src;
}
while (count > 0)
{
--count;
if ((*dst++ = *src++) == '\0')
break;
}
while (count-- > 0)
*dst++ = '\0';
return dst0;
#endif /* not PREFER_SIZE_OVER_SPEED */
}
#endif /* MESOSPHERE_LIBC_STRNCPY_GENERIC */
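/* Note (illustrative, not in the upstream newlib source): strncpy pads with
 * NULs but does not guarantee NUL-termination when src has count or more
 * characters:
 *     char dst[4];
 *     strncpy(dst, "ab", 4);      -> { 'a', 'b', '\0', '\0' }
 *     strncpy(dst, "abcdef", 4);  -> { 'a', 'b', 'c', 'd' }   (no terminator) */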
/*
FUNCTION
<<strncmp>>---character string compare
INDEX
strncmp
SYNOPSIS
#include <string.h>
int strncmp(const char *<[a]>, const char * <[b]>, size_t <[length]>);
DESCRIPTION
<<strncmp>> compares up to <[length]> characters
from the string at <[a]> to the string at <[b]>.
RETURNS
If <<*<[a]>>> sorts lexicographically after <<*<[b]>>>,
<<strncmp>> returns a number greater than zero. If the two
strings are equivalent, <<strncmp>> returns zero. If <<*<[a]>>>
sorts lexicographically before <<*<[b]>>>, <<strncmp>> returns a
number less than zero.
PORTABILITY
<<strncmp>> is ANSI C.
<<strncmp>> requires no supporting OS subroutines.
QUICKREF
strncmp ansi pure
*/
#include <string.h>
#include <limits.h>
/* Nonzero if either X or Y is not aligned on a "long" boundary. */
#define UNALIGNED(X, Y) \
(((long)X & (sizeof (long) - 1)) | ((long)Y & (sizeof (long) - 1)))
/* DETECTNULL returns nonzero if (long)X contains a NULL byte. */
#if LONG_MAX == 2147483647L
#define DETECTNULL(X) (((X) - 0x01010101) & ~(X) & 0x80808080)
#else
#if LONG_MAX == 9223372036854775807L
#define DETECTNULL(X) (((X) - 0x0101010101010101) & ~(X) & 0x8080808080808080)
#else
#error long int is not a 32bit or 64bit type.
#endif
#endif
#ifndef DETECTNULL
#error long int is not a 32bit or 64bit type.
#endif
#if MESOSPHERE_LIBC_STRNCMP_GENERIC
int
strncmp (const char *s1,
const char *s2,
size_t n)
{
#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
if (n == 0)
return 0;
while (n-- != 0 && *s1 == *s2)
{
if (n == 0 || *s1 == '\0')
break;
s1++;
s2++;
}
return (*(unsigned char *) s1) - (*(unsigned char *) s2);
#else
unsigned long *a1;
unsigned long *a2;
if (n == 0)
return 0;
/* If s1 or s2 are unaligned, then compare bytes. */
if (!UNALIGNED (s1, s2))
{
/* If s1 and s2 are word-aligned, compare them a word at a time. */
a1 = (unsigned long*)s1;
a2 = (unsigned long*)s2;
while (n >= sizeof (long) && *a1 == *a2)
{
n -= sizeof (long);
/* If we've run out of bytes or hit a null, return zero
since we already know *a1 == *a2. */
if (n == 0 || DETECTNULL (*a1))
return 0;
a1++;
a2++;
}
/* A difference was detected in the last few bytes of s1, so search bytewise. */
s1 = (char*)a1;
s2 = (char*)a2;
}
while (n-- > 0 && *s1 == *s2)
{
/* If we've run out of bytes or hit a null, return zero
since we already know *s1 == *s2. */
if (n == 0 || *s1 == '\0')
return 0;
s1++;
s2++;
}
return (*(unsigned char *) s1) - (*(unsigned char *) s2);
#endif /* not PREFER_SIZE_OVER_SPEED */
}
#endif /* MESOSPHERE_LIBC_STRNCMP_GENERIC */
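/* Note (illustrative, not in the upstream newlib source): only the first n
 * characters participate in the comparison:
 *     strncmp("abcdef", "abcxyz", 3) == 0
 *     strncmp("abcdef", "abcxyz", 4) <  0    ('d' < 'x') */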
#ifdef __cplusplus
} /* extern "C" */
#endif