From ec22e08db1fa61f5276b2ed1efbde28a1a45101e Mon Sep 17 00:00:00 2001 From: Michael Brown Date: Tue, 26 Jun 2012 17:19:18 +0100 Subject: [PATCH] [tcpip] Add faster algorithm for calculating the TCP/IP checksum The generic TCP/IP checksum implementation requires approximately 10 CPU clocks per byte (as measured using the TSC). Improve this to approximately 0.5 CPU clocks per byte by using "lodsl ; adcl" in an unrolled loop. Signed-off-by: Michael Brown --- src/arch/x86/core/x86_tcpip.c | 169 ++++++++++++++++++++++++++++++ src/arch/x86/include/bits/tcpip.h | 5 + 2 files changed, 174 insertions(+) create mode 100644 src/arch/x86/core/x86_tcpip.c diff --git a/src/arch/x86/core/x86_tcpip.c b/src/arch/x86/core/x86_tcpip.c new file mode 100644 index 000000000..b4e7c3b83 --- /dev/null +++ b/src/arch/x86/core/x86_tcpip.c @@ -0,0 +1,169 @@ +/* + * Copyright (C) 2012 Michael Brown . + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ +FILE_LICENCE ( GPL2_OR_LATER ); + +/** @file + * + * TCP/IP checksum + * + */ + +#include +#include + +extern char x86_tcpip_loop_end[]; + +/** + * Calculate continued TCP/IP checksum + * + * @v partial Checksum of already-summed data, in network byte order + * @v data Data buffer + * @v len Length of data buffer, in bytes + * @ret cksum Updated checksum, in network byte order + */ +uint16_t x86_tcpip_continue_chksum ( uint16_t partial, + const void *data, size_t len ) { + unsigned long sum = ( ( ~partial ) & 0xffff ); + unsigned long initial_word_count; + unsigned long loop_count; + unsigned long loop_partial_count; + unsigned long final_word_count; + unsigned long final_byte; + unsigned long discard_S; + unsigned long discard_c; + unsigned long discard_a; + unsigned long discard_r1; + unsigned long discard_r2; + + /* Calculate number of initial 16-bit words required to bring + * the main loop into alignment. (We don't care about the + * speed for data aligned to less than 16 bits, since this + * situation won't occur in practice.) + */ + if ( len >= sizeof ( sum ) ) { + initial_word_count = ( ( -( ( intptr_t ) data ) & + ( sizeof ( sum ) - 1 ) ) >> 1 ); + } else { + initial_word_count = 0; + } + len -= ( initial_word_count * 2 ); + + /* Calculate number of iterations of the main loop. This loop + * processes native machine words (32-bit or 64-bit), and is + * unrolled 16 times. We calculate an overall iteration + * count, and a starting point for the first iteration. + */ + loop_count = ( len / ( sizeof ( sum ) * 16 ) ); + loop_partial_count = + ( ( len % ( sizeof ( sum ) * 16 ) ) / sizeof ( sum ) ); + + /* Calculate number of 16-bit words remaining after the main + * loop completes. + */ + final_word_count = ( ( len % sizeof ( sum ) ) / 2 ); + + /* Calculate whether or not a final byte remains at the end */ + final_byte = ( len & 1 ); + + /* Calculate the checksum. The carry flag is threaded through every adc from the clc to the final "Consume CF" step; each count loaded into the "c" register is biased by one because every "loop" decrements before testing. */ + __asm__ ( /* Calculate position at which to jump into the + * unrolled loop. 
+ */ + "imul $( -x86_tcpip_loop_step_size ), %4\n\t" + "add %5, %4\n\t" + + /* Clear carry flag before starting checksumming */ + "clc\n\t" + + /* Checksum initial words */ + "jmp 2f\n\t" + "\n1:\n\t" + "lodsw\n\t" + "adcw %w2, %w0\n\t" + "\n2:\n\t" + "loop 1b\n\t" + + /* Main "lods;adc" loop, unrolled x16 */ + "mov %12, %3\n\t" + "jmp *%4\n\t" + "\nx86_tcpip_loop_start:\n\t" + "lods%z2\n\tadc %2, %0\n\t" + "lods%z2\n\tadc %2, %0\n\t" + "lods%z2\n\tadc %2, %0\n\t" + "lods%z2\n\tadc %2, %0\n\t" + "lods%z2\n\tadc %2, %0\n\t" + "lods%z2\n\tadc %2, %0\n\t" + "lods%z2\n\tadc %2, %0\n\t" + "lods%z2\n\tadc %2, %0\n\t" + "lods%z2\n\tadc %2, %0\n\t" + "lods%z2\n\tadc %2, %0\n\t" + "lods%z2\n\tadc %2, %0\n\t" + "lods%z2\n\tadc %2, %0\n\t" + "lods%z2\n\tadc %2, %0\n\t" + "lods%z2\n\tadc %2, %0\n\t" + "lods%z2\n\tadc %2, %0\n\t" + "lods%z2\n\tadc %2, %0\n\t" + "\nx86_tcpip_loop_end:\n\t" + "loop x86_tcpip_loop_start\n\t" + ".equ x86_tcpip_loop_step_size, " /* Shift rather than divide by 16: gas treats '/' as a comment character on some x86 targets unless --divide is given */ + " ( ( x86_tcpip_loop_end - x86_tcpip_loop_start ) >> 4 )\n\t" + + /* Checksum remaining whole words */ + "mov %13, %3\n\t" + "jmp 2f\n\t" + "\n1:\n\t" + "lodsw\n\t" + "adcw %w2, %w0\n\t" + "\n2:\n\t" + "loop 1b\n\t" + + /* Checksum final byte if applicable */ + "mov %14, %3\n\t" + "loop 1f\n\t" + "adcb (%1), %b0\n\t" + "adcb $0, %h0\n\t" + "\n1:\n\t" + + /* Fold down to a uint16_t */ + "push %0\n\t" + "popw %w0\n\t" + "popw %w2\n\t" + "adcw %w2, %w0\n\t" +#if ULONG_MAX > 0xffffffffUL /* 64-bit only */ + "popw %w2\n\t" + "adcw %w2, %w0\n\t" + "popw %w2\n\t" + "adcw %w2, %w0\n\t" +#endif /* 64-bit only */ + + /* Consume CF */ + "adcw $0, %w0\n\t" + "adcw $0, %w0\n\t" + + : "=&Q" ( sum ), "=&S" ( discard_S ), "=&a" ( discard_a ), + "=&c" ( discard_c ), "=&r" ( discard_r1 ), + "=&r" ( discard_r2 ) + : "0" ( sum ), "1" ( data ), "2" ( 0 ), + "3" ( initial_word_count + 1 ), "4" ( loop_partial_count ), + "5" ( x86_tcpip_loop_end ), "g" ( loop_count + 1 ), + "g" ( final_word_count + 1 ), "g" ( final_byte ) /* lods and adcb read the data buffer through operand 1, which the operand list alone does not tell the compiler */ : "memory" ); + + return ( ~sum & 
0xffff ); +} diff --git a/src/arch/x86/include/bits/tcpip.h b/src/arch/x86/include/bits/tcpip.h index 9ae8d9205..a4b335eb1 100644 --- a/src/arch/x86/include/bits/tcpip.h +++ b/src/arch/x86/include/bits/tcpip.h @@ -9,4 +9,9 @@ FILE_LICENCE ( GPL2_OR_LATER ); +extern uint16_t x86_tcpip_continue_chksum ( uint16_t partial, + const void *data, size_t len ); + +#define tcpip_continue_chksum x86_tcpip_continue_chksum + #endif /* _BITS_TCPIP_H */