/************************************************************************************
 * libs/libc/machine/arm/armv7-a/arch_memcpy.S
 * ARMv7-A optimized memcpy.
 *
 * Adapted for use with ARMv7-A and NuttX by:
 *
 *   Copyright (C) 2017 Gregory Nutt. All rights reserved.
 *   Author: Gregory Nutt <gnutt@nuttx.org>
 *
 * Based on the ARMv7-M version contributed by Mike Smith.  Apparently in the public
 * domain and is re-released here under the modified BSD license:
 *
 * Obtained via a posting on the Stellaris forum:
 *  http://e2e.ti.com/support/microcontrollers/\
 *       stellaris_arm_cortex-m3_microcontroller/f/473/t/44360.aspx
 *
 * Posted by rocksoft on Jul 24, 2008 10:19 AM
 *
 *   Hi,
 *
 *   I recently finished a "memcpy" replacement and thought it might be useful for
 *   others...
 *
 *   I've put some instructions and the code here:
 *
 *   http://www.rock-software.net/downloads/memcpy/
 *
 *   Hope it works for you as well as it did for me.
 *
 *   Liam.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name NuttX nor the names of its contributors may be
 *    used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 ************************************************************************************/

/************************************************************************************
 * Public Symbols
 ************************************************************************************/

	.global	memcpy
	.syntax	unified
	.file	"arch_memcpy.S"

/************************************************************************************
 * .text
 ************************************************************************************/

	.text

/************************************************************************************
 * Private Constant Data
 ************************************************************************************/

/* We have 16 possible alignment combinations of src and dst, this jump table
 * directs the copy operation
 *
 * Bits:  Src=00, Dst=00 - Long to Long copy
 * Bits:  Src=00, Dst=01 - Long to Byte before half word
 * Bits:  Src=00, Dst=10 - Long to Half word
 * Bits:  Src=00, Dst=11 - Long to Byte before long word
 * Bits:  Src=01, Dst=00 - Byte before half word to long
 * Bits:  Src=01, Dst=01 - Byte before half word to byte before half word -
 *                         Same alignment
 * Bits:  Src=01, Dst=10 - Byte before half word to half word
 * Bits:  Src=01, Dst=11 - Byte before half word to byte before long word
 * Bits:  Src=10, Dst=00 - Half word to long word
 * Bits:  Src=10, Dst=01 - Half word to byte before half word
 * Bits:  Src=10, Dst=10 - Half word to half word - Same Alignment
 * Bits:  Src=10, Dst=11 - Half word to byte before long word
 * Bits:  Src=11, Dst=00 - Byte before long word to long word
 * Bits:  Src=11, Dst=01 - Byte before long word to byte before half word
 * Bits:  Src=11, Dst=10 - Byte before long word to half word
 * Bits:  Src=11, Dst=11 - Byte before long word to Byte before long word -
 *                         Same alignment
 */

/* Alignment dispatch table used by _do_memcpy.
 *
 * The index is ((src & 3) << 2) | (dst & 3).  Each entry is the distance from
 * MEM_DataCopyJump to the handler, expressed in 4-byte units; the dispatcher
 * shifts it left by two and adds it to the address of MEM_DataCopyJump.
 * Because an entry is a single byte, every handler must lie within 255 words
 * of MEM_DataCopyJump.
 */

MEM_DataCopyTable:
	.byte	(MEM_DataCopy0  - MEM_DataCopyJump) >> 2	/* Src=00, Dst=00 */
	.byte	(MEM_DataCopy1  - MEM_DataCopyJump) >> 2	/* Src=00, Dst=01 */
	.byte	(MEM_DataCopy2  - MEM_DataCopyJump) >> 2	/* Src=00, Dst=10 */
	.byte	(MEM_DataCopy3  - MEM_DataCopyJump) >> 2	/* Src=00, Dst=11 */
	.byte	(MEM_DataCopy4  - MEM_DataCopyJump) >> 2	/* Src=01, Dst=00 */
	.byte	(MEM_DataCopy5  - MEM_DataCopyJump) >> 2	/* Src=01, Dst=01 */
	.byte	(MEM_DataCopy6  - MEM_DataCopyJump) >> 2	/* Src=01, Dst=10 */
	.byte	(MEM_DataCopy7  - MEM_DataCopyJump) >> 2	/* Src=01, Dst=11 */
	.byte	(MEM_DataCopy8  - MEM_DataCopyJump) >> 2	/* Src=10, Dst=00 */
	.byte	(MEM_DataCopy9  - MEM_DataCopyJump) >> 2	/* Src=10, Dst=01 */
	.byte	(MEM_DataCopy10 - MEM_DataCopyJump) >> 2	/* Src=10, Dst=10 */
	.byte	(MEM_DataCopy11 - MEM_DataCopyJump) >> 2	/* Src=10, Dst=11 */
	.byte	(MEM_DataCopy12 - MEM_DataCopyJump) >> 2	/* Src=11, Dst=00 */
	.byte	(MEM_DataCopy13 - MEM_DataCopyJump) >> 2	/* Src=11, Dst=01 */
	.byte	(MEM_DataCopy14 - MEM_DataCopyJump) >> 2	/* Src=11, Dst=10 */
	.byte	(MEM_DataCopy15 - MEM_DataCopyJump) >> 2	/* Src=11, Dst=11 */

	.align 2

/* Dispatch table for the final 0-9 whole words of an aligned copy.
 *
 * The index is the remaining byte count divided by four.  Each entry is the
 * distance from MEM_LongCopyJump to the code that copies that many words, in
 * 4-byte units.  Index 0 (no whole word left) goes directly to
 * MEM_LongCopyEnd.
 */

MEM_LongCopyTable:
	.byte	(MEM_LongCopyEnd   - MEM_LongCopyJump) >> 2	/* 0 bytes left */
	.byte	(MEM_LongCopyJump0 - MEM_LongCopyJump) >> 2	/* 4 bytes left */
	.byte	(MEM_LongCopyJump1 - MEM_LongCopyJump) >> 2	/* 8 bytes left */
	.byte	(MEM_LongCopyJump2 - MEM_LongCopyJump) >> 2	/* 12 bytes left */
	.byte	(MEM_LongCopyJump3 - MEM_LongCopyJump) >> 2	/* 16 bytes left */
	.byte	(MEM_LongCopyJump4 - MEM_LongCopyJump) >> 2	/* 20 bytes left */
	.byte	(MEM_LongCopyJump5 - MEM_LongCopyJump) >> 2	/* 24 bytes left */
	.byte	(MEM_LongCopyJump6 - MEM_LongCopyJump) >> 2	/* 28 bytes left */
	.byte	(MEM_LongCopyJump7 - MEM_LongCopyJump) >> 2	/* 32 bytes left */
	.byte	(MEM_LongCopyJump8 - MEM_LongCopyJump) >> 2	/* 36 bytes left */

/************************************************************************************
 * Public Functions
 ************************************************************************************/
/************************************************************************************
 * Name: memcpy
 *
 * Description:
 *   Optimized "general" copy routine
 *
 * Input Parameters:
 *   r0 = destination, r1 = source, r2 = length
 *
 * Returned Value:
 *   r0 = destination; r1-r3 are clobbered
 *
 ************************************************************************************/

	.align 4

/* Public entry point.  _do_memcpy advances r0 while it stores, so the original
 * destination pointer is kept on the stack and handed back in r0 as the return
 * value.  The symbol is typed as a function to go with the .size directive at
 * the end of the file.
 */

	.type	memcpy, %function

memcpy:
	push	{r0, r14}				/* Save dst (return value) and return address */
	bl		_do_memcpy				/* Perform the copy; r0-r3 are modified */
	pop		{r0, pc}				/* r0 = original dst, then return to caller */

	.align 4

_do_memcpy:
	/* Worker for memcpy: r0 = destination, r1 = source, r2 = byte count.
	 * r0-r3 are modified.  The return address and r4 are pushed here and
	 * popped again by the common exit code at MEM_DataCopyBytes.
	 */

	push    {r14}
	push    {r4}

	/* Copies of fewer than 4 bytes go straight to the byte tail.  This lets
	 * every alignment handler copy up to 3 leading bytes without first
	 * checking the count.
	 *
	 * NOTE(review): blt is a signed comparison, so a count with bit 31 set
	 * also takes this branch and only 3 bytes are copied - confirm that such
	 * lengths can never be passed in.
	 */

	cmp		r2, #4
	blt		MEM_DataCopyBytes

	/* Build the 4-bit table index: bits 1:0 = dst & 3, bits 3:2 = src & 3.
	 * r14 is free to use as scratch because it was saved above.
	 */

	and		r14, r0, #3		 		/* Get destination alignment bits */
	bfi		r14, r1, #2, #2	 		/* Get source alignment bits */

	/* Look up the handler's word offset and jump to MEM_DataCopyJump plus
	 * that offset.
	 */

	ldr		r3, =MEM_DataCopyTable	/* Jump table base address */
	ldrb	r4, [r3, r14]			/* DWord offset for this alignment combination */
	ldr		r3, =MEM_DataCopyJump	/* Base of branch table anchor */
	add		r3, r3, r4, lsl #2		/* Absolute address of logic */
	bx		r3

	/* data copy branch table anchor */

	.align 4
MEM_DataCopyJump:

/* The three handlers below serve the cases where source and destination share
 * the same non-zero alignment.  Each one copies a single byte and falls through
 * to the next, so entering at MEM_DataCopy5 copies 3 bytes, at MEM_DataCopy10
 * 2 bytes and at MEM_DataCopy15 1 byte.  Both pointers are word aligned when
 * execution reaches MEM_DataCopy0.
 */

/* Bits:  Src=01, Dst=01 - Byte before half word to byte before half word - Same alignment
 * 3 bytes to read for long word aligning
 */

MEM_DataCopy5:
	ldrb	r3, [r1], #0x01
	strb	r3, [r0], #0x01
	sub		r2, r2, #0x01

/* Bits:  Src=10, Dst=10 - Half word to half word - Same Alignment
 * 2 bytes to read for long word aligning
 */

MEM_DataCopy10:
	ldrb	r3, [r1], #0x01
	strb	r3, [r0], #0x01
	sub		r2, r2, #0x01

/* Bits:  Src=11, Dst=11 - Byte before long word to Byte before long word - Same alignment
 * 1 byte to read for long word aligning
 */

MEM_DataCopy15:
	ldrb	r3, [r1], #0x01
	strb	r3, [r0], #0x01
	sub		r2, r2, #0x01

/* Bits:  Src=00, Dst=00 - Long to Long copy */

MEM_DataCopy0:
	/* Source and destination are both word aligned here and r2 holds the
	 * number of bytes still to copy.  r5-r12 are needed by the multi-register
	 * transfers below and are restored at MEM_LongCopyEnd; r4 was already
	 * saved on entry to _do_memcpy.
	 */

	push	{r5-r12}

	/* Fewer than 40 bytes left: skip the bulk loop */

	cmp		r2, #0x28
	blt		MEM_DataCopy0_2

	/* Bulk copy loop: 10 words (40 bytes) per iteration */

MEM_DataCopy0_1:
	ldmia	r1!, {r3-r12}
	stmia	r0!, {r3-r12}
	sub		r2, r2, #0x28
	cmp		r2, #0x28
	bge		MEM_DataCopy0_1

	/* Copy the remaining whole words.  r2 is below 40 here, so r2 >> 2 is in
	 * the range 0-9 and selects one of the ten MEM_LongCopyTable entries.
	 * r11 and r14 are scratch: both have been saved on the stack.
	 */

MEM_DataCopy0_2:
	ldr		r14, =MEM_LongCopyTable	/* Jump table base address */
	lsr		r11, r2, #2				/* Convert byte count to word count */
	ldrb	r3, [r14, r11]			/* Word offset from branch table anchor */
	ldr		r11, =MEM_LongCopyJump	/* Address of branch table anchor */
	add		r11, r11, r3, lsl #2	/* Absolute address into branch table */
	bx		r11						/* Go there */

	/* longword copy branch table anchor */

/* Targets of the MEM_LongCopyTable dispatch.  Each entry copies a fixed number
 * of whole words with one load-multiple / store-multiple pair (a single
 * ldr/str for one word) and then joins MEM_LongCopyEnd.  r2 is not updated
 * here; MEM_LongCopyEnd masks it down to the 0-3 trailing bytes.
 */

MEM_LongCopyJump:

MEM_LongCopyJump0:
	ldr		r3, [r1], #0x04		/* 4 bytes remain: 1 word */
	str		r3, [r0], #0x04
	b		MEM_LongCopyEnd

MEM_LongCopyJump1:
	ldmia	r1!, {r3-r4}		/* 8 bytes remain: 2 words */
	stmia	r0!, {r3-r4}
	b		MEM_LongCopyEnd

MEM_LongCopyJump2:
	ldmia	r1!, {r3-r5}		/* 12 bytes remain: 3 words */
	stmia	r0!, {r3-r5}
	b		MEM_LongCopyEnd

MEM_LongCopyJump3:
	ldmia	r1!, {r3-r6}		/* 16 bytes remain: 4 words */
	stmia	r0!, {r3-r6}
	b		MEM_LongCopyEnd

MEM_LongCopyJump4:
	ldmia	r1!, {r3-r7}		/* 20 bytes remain: 5 words */
	stmia	r0!, {r3-r7}
	b		MEM_LongCopyEnd

MEM_LongCopyJump5:
	ldmia	r1!, {r3-r8}		/* 24 bytes remain: 6 words */
	stmia	r0!, {r3-r8}
	b		MEM_LongCopyEnd

MEM_LongCopyJump6:
	ldmia	r1!, {r3-r9}		/* 28 bytes remain: 7 words */
	stmia	r0!, {r3-r9}
	b		MEM_LongCopyEnd

MEM_LongCopyJump7:
	ldmia	r1!, {r3-r10}		/* 32 bytes remain: 8 words */
	stmia	r0!, {r3-r10}
	b		MEM_LongCopyEnd

MEM_LongCopyJump8:
	ldmia	r1!, {r3-r11}		/* 36 bytes remain: 9 words */
	stmia	r0!, {r3-r11}		/* Falls through to MEM_LongCopyEnd */

MEM_LongCopyEnd:
	pop		{r5-r12}			/* Undo the push done at MEM_DataCopy0 */
	and		r2, r2, #0x03		/* All the longs have been copied */

	/* Deal with up to 3 remaining bytes */

MEM_DataCopyBytes:
	/* Common exit path: copy the last 0-3 bytes one at a time and return.
	 *
	 * On arrival the stack holds only the r4 and r14 values pushed at the top
	 * of _do_memcpy.  r4 is restored first; each "pop {pc}" then returns to
	 * the caller of _do_memcpy.  At most three bytes are copied here
	 * regardless of the value in r2.
	 */

	pop		{r4}
	cmp		r2, #0x00			/* Nothing left to copy? */
	it		eq
	popeq	{pc}

	ldrb	r3, [r1], #0x01		/* First trailing byte */
	strb	r3, [r0], #0x01
	subs	r2, r2, #0x01		/* Sets Z when that was the last byte */
	it		eq
	popeq	{pc}

	ldrb	r3, [r1], #0x01		/* Second trailing byte */
	strb	r3, [r0], #0x01
	subs	r2, r2, #0x01
	it		eq
	popeq	{pc}

	ldrb	r3, [r1], #0x01		/* Third and final trailing byte */
	strb	r3, [r0], #0x01
	pop		{pc}

 .align 4

/* Bits:  Src=01, Dst=11 - Byte before half word to byte before long word
 * 3 bytes to read for long word aligning the source
 */

/* The three handlers below each copy one byte and fall through to the next.
 * They word-align the source (3, 2 or 1 byte respectively), which leaves the
 * destination half-word aligned for MEM_DataCopy2.
 */

MEM_DataCopy7:
	ldrb	r3, [r1], #0x01
	strb	r3, [r0], #0x01
	sub		r2, r2, #0x01

/* Bits:  Src=10, Dst=00 - Half word to long word
 * 2 bytes to read for long word aligning the source
 */

MEM_DataCopy8:
	ldrb	r3, [r1], #0x01
	strb	r3, [r0], #0x01
	sub		r2, r2, #0x01

/* Bits:  Src=11, Dst=01 - Byte before long word to byte before half word
 * 1 byte to read for long word aligning the source
 */

MEM_DataCopy13:
	ldrb	r3, [r1], #0x01
	strb	r3, [r0], #0x01
	sub		r2, r2, #0x01

/* Bits:  Src=00, Dst=10 - Long to Half word */

MEM_DataCopy2:
	/* Source is word aligned, destination is half-word aligned (address
	 * ends in binary 10).  r2 holds the number of bytes still to copy.
	 * Fewer than 40 bytes: skip the bulk loop.
	 */

	cmp		r2, #0x28
	blt		MEM_DataCopy2_1

	/* Save regs.  r4 was already saved on entry to _do_memcpy. */

	push	{r5-r12}

	/* Bulk copy loop: 40 bytes per iteration.
	 *
	 * Ten words are loaded.  The low half of the first word is stored on its
	 * own, which brings the destination to a word boundary.  The remaining
	 * data is then shifted down by 16 bits across the register chain so that
	 * nine whole words can be stored with one stmia, and the high half of the
	 * last word is stored as a final half word.  Storing the low half at the
	 * lower address matches a little-endian memory layout.
	 */

MEM_DataCopy2_2:
	ldmia	r1!, {r3-r12}

	strh	r3, [r0], #0x02

	lsr		r3, r3, #0x10
	bfi		r3, r4, #0x10, #0x10
	lsr		r4, r4, #0x10
	bfi		r4, r5, #0x10, #0x10
	lsr		r5, r5, #0x10
	bfi		r5, r6, #0x10, #0x10
	lsr		r6, r6, #0x10
	bfi		r6, r7, #0x10, #0x10
	lsr		r7, r7, #0x10
	bfi		r7, r8, #0x10, #0x10
	lsr		r8, r8, #0x10
	bfi		r8, r9, #0x10, #0x10
	lsr		r9, r9, #0x10
	bfi		r9, r10, #0x10, #0x10
	lsr		r10, r10, #0x10
	bfi		r10, r11, #0x10, #0x10
	lsr		r11, r11, #0x10
	bfi		r11, r12, #0x10, #0x10
	stmia	r0!, {r3-r11}
	lsr		r12, r12, #0x10
	strh	r12, [r0], #0x02

	sub		r2, r2, #0x28
	cmp		r2, #0x28
	bge		MEM_DataCopy2_2
	pop		{r5-r12}

	/* Word loop: read one long and write it as 2 x half words until fewer
	 * than 4 bytes remain, then finish in the byte tail.  r2 is already below
	 * 40 when this loop is entered and only decreases, so the loop branches
	 * back to its own head instead of repeating the bulk threshold test at
	 * MEM_DataCopy2.
	 */

MEM_DataCopy2_1:
	cmp		r2, #4
	blt		MEM_DataCopyBytes
	ldr		r3, [r1], #0x04
	strh	r3, [r0], #0x02
	lsr		r3, r3, #0x10
	strh	r3, [r0], #0x02
	sub		r2, r2, #0x04
	b		MEM_DataCopy2_1

/* Bits:  Src=01, Dst=00 - Byte before half word to long
 * Bits:  Src=01, Dst=10 - Byte before half word to half word
 * 3 bytes to read for long word aligning the source
 */

/* The handlers below each copy one byte and fall through to the next.  They
 * word-align the source (3, 2 or 1 byte respectively); in every one of these
 * cases the destination ends up on an odd address, which MEM_DataCopy1 /
 * MEM_DataCopy3 handle.
 */

MEM_DataCopy4:
MEM_DataCopy6:
	/* Read B and write B */

	ldrb	r3, [r1], #0x01
	strb	r3, [r0], #0x01
	sub		r2, r2, #0x01

/* Bits:  Src=10, Dst=01 - Half word to byte before half word
 * Bits:  Src=10, Dst=11 - Half word to byte before long word
 * 2 bytes to read for long word aligning the source
 */

MEM_DataCopy9:
MEM_DataCopy11:
	ldrb	r3, [r1], #0x01
	strb	r3, [r0], #0x01
	sub		r2, r2, #0x01

/* Bits:  Src=11, Dst=00 - Byte before long word to long word
 * Bits:  Src=11, Dst=10 - Byte before long word to half word
 * 1 byte to read for long word aligning the source
 */

MEM_DataCopy12:
MEM_DataCopy14:
	/* Read B and write B */

	ldrb	r3, [r1], #0x01
	strb	r3, [r0], #0x01
	sub		r2, r2, #0x01

/* Bits:  Src=00, Dst=01 - Long to Byte before half word
 * Bits:  Src=00, Dst=11 - Long to Byte before long word
 */

/* Source is word aligned and the destination is on an odd address.  Each
 * iteration loads one word and stores it as byte, half word, byte so that the
 * half-word store lands on an even address.  The loop ends in the byte tail
 * once fewer than 4 bytes remain.
 */

MEM_DataCopy1: /* Read longs, write B->H->B */
MEM_DataCopy3:
	cmp		r2, #4
	blt		MEM_DataCopyBytes
	ldr		r3, [r1], #0x04
	strb	r3, [r0], #0x01			/* Bits 7:0 of the word */
	lsr		r3, r3, #0x08
	strh	r3, [r0], #0x02			/* Bits 23:8 of the word */
	lsr		r3, r3, #0x10
	strb	r3, [r0], #0x01			/* Bits 31:24 of the word */
	sub		r2, r2, #0x04
	b		MEM_DataCopy3

	.size	memcpy, .-memcpy
	.end
