[arm] Add optimised TCP/IP checksumming for 64-bit ARM

Signed-off-by: Michael Brown <mcb30@ipxe.org>
2016-05-10 17:13:05 +01:00 · 2016-05-10 17:13:05 +01:00 · 47931a4de5
parent 95716ece91
commit 47931a4de5
3 changed files with 190 additions and 0 deletions
--- a/src/arch/arm32/include/bits/tcpip.h
+++ b/src/arch/arm32/include/bits/tcpip.h
--- a/src/arch/arm64/core/arm64_tcpip.c
+++ b/src/arch/arm64/core/arm64_tcpip.c
@ -0,0 +1,175 @@
+/*
+ * Copyright (C) 2016 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * You can also choose to distribute this program under the terms of
+ * the Unmodified Binary Distribution Licence (as given in the file
+ * COPYING.UBDL), provided that you have satisfied its requirements.
+ */
+
+FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
+
+/** @file
+ *
+ * TCP/IP checksum
+ *
+ */
+
+#include <strings.h>
+#include <ipxe/tcpip.h>
+
+/** Alignment used by main checksumming loop
+ *
+ * Each step of the main loop in tcpip_continue_chksum() loads and
+ * sums 16 bytes (one "ldp" of two 64-bit registers), so this value
+ * must be kept in step with the assembly code.
+ */
+#define TCPIP_CHKSUM_ALIGN 16
+
+/** Number of steps in each iteration of the unrolled main checksumming loop
+ *
+ * The assembly code in tcpip_continue_chksum() contains this many
+ * copies of the load-and-sum step, so this value must be kept in
+ * step with the assembly code.
+ */
+#define TCPIP_CHKSUM_UNROLL 4
+
+/**
+ * Calculate continued TCP/IP checksum
+ *
+ * The running sum is inverted, the data buffer is added to it using
+ * add-with-carry, the 64-bit result is folded back down to 16 bits
+ * (wrapping the carries around), and the result is inverted again.
+ *
+ * The buffer is split at a "midpoint" address into a short
+ * pre-alignment part (summed in steps of 1, 2, 4 and 8 bytes), a
+ * main part (summed 16 bytes at a time by an unrolled loop), and a
+ * short post-alignment part (summed in steps of 8, 4, 2 and 1
+ * bytes).
+ *
+ * @v sum		Checksum of already-summed data, in network byte order
+ * @v data		Data buffer
+ * @v len		Length of data buffer
+ * @ret sum		Updated checksum, in network byte order
+ */
+uint16_t tcpip_continue_chksum ( uint16_t sum, const void *data,
+				 size_t len ) {
+	intptr_t start;
+	intptr_t end;
+	intptr_t mid;
+	unsigned int pre;
+	unsigned int post;
+	unsigned long first;
+	uint64_t acc;
+	uint64_t discard_low;
+	uint64_t discard_high;
+
+	/* Avoid potentially undefined shift operation */
+	if ( len == 0 )
+		return sum;
+
+	/* Find maximally-aligned midpoint.  For short blocks of data,
+	 * this may be aligned to fewer than 16 bytes.
+	 *
+	 * The mask clears every bit of the end address below the
+	 * most significant bit in which the start and end addresses
+	 * differ, giving the most-aligned address that lies after
+	 * the start and no later than the end.
+	 */
+	start = ( ( intptr_t ) data );
+	end = ( start + len );
+	mid = ( end &
+		~( ( ~( 1UL << 63 ) ) >> ( 64 - flsl ( start ^ end ) ) ) );
+
+	/* Calculate pre- and post-alignment lengths */
+	pre = ( ( mid - start ) & ( TCPIP_CHKSUM_ALIGN - 1 ) );
+	post = ( ( end - mid ) & ( TCPIP_CHKSUM_ALIGN - 1 ) );
+
+	/* Calculate number of steps in first iteration of unrolled loop.
+	 *
+	 * This is held in a 64-bit variable since the assembly code
+	 * uses it as a full 64-bit register when computing the
+	 * address at which to enter the loop.
+	 */
+	first = ( ( ( len - pre - post ) / TCPIP_CHKSUM_ALIGN ) &
+		  ( TCPIP_CHKSUM_UNROLL - 1 ) );
+
+	/* Use a 64-bit accumulator.  The assembly code performs
+	 * 64-bit additions on this operand, so it must not depend
+	 * upon the unused upper bits of a register holding a 16-bit
+	 * value.
+	 */
+	acc = sum;
+
+	/* Calculate checksum
+	 *
+	 * Operands:
+	 *   %0 - accumulator (running sum)
+	 *   %1 - data pointer (advanced by each load)
+	 *   %2 - scratch register
+	 *   %3 - scratch register
+	 *   %4 - pre-alignment length
+	 *   %5 - number of steps in first iteration of main loop
+	 *   %6 - address at which the main loop stops
+	 *   %7 - post-alignment length
+	 *
+	 * The carry flag is carried from each "adcs" to the next;
+	 * none of the intervening instructions modify the flags.
+	 */
+	__asm__ ( /* Invert sum */
+		  "eor %w0, %w0, #0xffff\n\t"
+		  /* Clear carry flag */
+		  "cmn xzr, xzr\n\t"
+		  /* Byteswap and sum pre-alignment byte, if applicable */
+		  "tbz %w4, #0, 1f\n\t"
+		  "ldrb %w2, [%1], #1\n\t"
+		  "rev16 %w0, %w0\n\t"
+		  "rev16 %w2, %w2\n\t"
+		  "adcs %0, %0, %2\n\t"
+		  "\n1:\n\t"
+		  /* Sum pre-alignment halfword, if applicable */
+		  "tbz %w4, #1, 1f\n\t"
+		  "ldrh %w2, [%1], #2\n\t"
+		  "adcs %0, %0, %2\n\t"
+		  "\n1:\n\t"
+		  /* Sum pre-alignment word, if applicable */
+		  "tbz %w4, #2, 1f\n\t"
+		  "ldr %w2, [%1], #4\n\t"
+		  "adcs %0, %0, %2\n\t"
+		  "\n1:\n\t"
+		  /* Sum pre-alignment doubleword, if applicable */
+		  "tbz %w4, #3, 1f\n\t"
+		  "ldr %2, [%1], #8\n\t"
+		  "adcs %0, %0, %2\n\t"
+		  "\n1:\n\t"
+		  /* Jump into unrolled (x4) main loop.  Each step is
+		   * three instructions (12 bytes), so the entry point
+		   * is ( 8 * first + 4 * first ) bytes before the end
+		   * of the loop body.
+		   */
+		  "adr %2, 2f\n\t"
+		  "sub %2, %2, %5, lsl #3\n\t"
+		  "sub %2, %2, %5, lsl #2\n\t"
+		  "br %2\n\t"
+		  "\n1:\n\t"
+		  "ldp %2, %3, [%1], #16\n\t"
+		  "adcs %0, %0, %2\n\t"
+		  "adcs %0, %0, %3\n\t"
+		  "ldp %2, %3, [%1], #16\n\t"
+		  "adcs %0, %0, %2\n\t"
+		  "adcs %0, %0, %3\n\t"
+		  "ldp %2, %3, [%1], #16\n\t"
+		  "adcs %0, %0, %2\n\t"
+		  "adcs %0, %0, %3\n\t"
+		  "ldp %2, %3, [%1], #16\n\t"
+		  "adcs %0, %0, %2\n\t"
+		  "adcs %0, %0, %3\n\t"
+		  "\n2:\n\t"
+		  /* Loop until data pointer reaches end of main part */
+		  "sub %2, %1, %6\n\t"
+		  "cbnz %2, 1b\n\t"
+		  /* Sum post-alignment doubleword, if applicable */
+		  "tbz %w7, #3, 1f\n\t"
+		  "ldr %2, [%1], #8\n\t"
+		  "adcs %0, %0, %2\n\t"
+		  "\n1:\n\t"
+		  /* Sum post-alignment word, if applicable */
+		  "tbz %w7, #2, 1f\n\t"
+		  "ldr %w2, [%1], #4\n\t"
+		  "adcs %0, %0, %2\n\t"
+		  "\n1:\n\t"
+		  /* Sum post-alignment halfword, if applicable */
+		  "tbz %w7, #1, 1f\n\t"
+		  "ldrh %w2, [%1], #2\n\t"
+		  "adcs %0, %0, %2\n\t"
+		  "\n1:\n\t"
+		  /* Sum post-alignment byte, if applicable */
+		  "tbz %w7, #0, 1f\n\t"
+		  "ldrb %w2, [%1], #1\n\t"
+		  "adcs %0, %0, %2\n\t"
+		  "\n1:\n\t"
+		  /* Fold down to a uint32_t plus carry flag */
+		  "lsr %2, %0, #32\n\t"
+		  "adcs %w0, %w0, %w2\n\t"
+		  /* Fold down to a uint16_t plus carry in bit 16 */
+		  "ubfm %2, %0, #0, #15\n\t"
+		  "ubfm %3, %0, #16, #31\n\t"
+		  "adc %w0, %w2, %w3\n\t"
+		  /* Fold down to a uint16_t */
+		  "tbz %w0, #16, 1f\n\t"
+		  "mov %w2, #0xffff\n\t"
+		  "sub %w0, %w0, %w2\n\t"
+		  "tbz %w0, #16, 1f\n\t"
+		  "sub %w0, %w0, %w2\n\t"
+		  "\n1:\n\t"
+		  /* Byteswap back, if applicable */
+		  "tbz %w4, #0, 1f\n\t"
+		  "rev16 %w0, %w0\n\t"
+		  "\n1:\n\t"
+		  /* Invert sum */
+		  "eor %w0, %w0, #0xffff\n\t"
+		  : "+r" ( acc ), "+r" ( data ), "=&r" ( discard_low ),
+		    "=&r" ( discard_high )
+		  : "r" ( pre ), "r" ( first ), "r" ( end - post ),
+		    "r" ( post )
+		  /* The "memory" clobber tells the compiler that the
+		   * data buffer is read via the pointer operand, so
+		   * that earlier stores to the buffer are completed
+		   * before this code runs.
+		   */
+		  : "cc", "memory" );
+
+	return ( ( uint16_t ) acc );
+}
--- a/src/arch/arm64/include/bits/tcpip.h
+++ b/src/arch/arm64/include/bits/tcpip.h
@ -0,0 +1,15 @@
+#ifndef _BITS_TCPIP_H
+#define _BITS_TCPIP_H
+
+/** @file
+ *
+ * Transport-network layer interface
+ *
+ */
+
+FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
+
+/**
+ * Calculate continued TCP/IP checksum
+ *
+ * @v sum		Checksum of already-summed data, in network byte order
+ * @v data		Data buffer
+ * @v len		Length of data buffer
+ * @ret sum		Updated checksum, in network byte order
+ */
+extern uint16_t tcpip_continue_chksum ( uint16_t sum, const void *data,
+					size_t len );
+
+#endif /* _BITS_TCPIP_H */