From 21767fad2ba94675bcf49c0796b4b2a65702c974 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 11 Oct 2007 11:15:52 +0200 Subject: x86_64: prepare shared lib/csum-partial.c Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86_64/lib/Makefile | 4 +- arch/x86_64/lib/csum-partial.c | 150 -------------------------------------- arch/x86_64/lib/csum-partial_64.c | 150 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 152 insertions(+), 152 deletions(-) delete mode 100644 arch/x86_64/lib/csum-partial.c create mode 100644 arch/x86_64/lib/csum-partial_64.c (limited to 'arch/x86_64') diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile index 373615a594f..6b510f97cab 100644 --- a/arch/x86_64/lib/Makefile +++ b/arch/x86_64/lib/Makefile @@ -2,12 +2,12 @@ # Makefile for x86_64-specific library files. # -CFLAGS_csum-partial.o := -funroll-loops +CFLAGS_csum-partial_64.o := -funroll-loops obj-y := io.o iomap_copy_64.o obj-$(CONFIG_SMP) += msr-on-cpu.o -lib-y := csum-partial.o csum-copy_64.o csum-wrappers_64.o delay_64.o \ +lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \ usercopy_64.o getuser_64.o putuser_64.o \ thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o lib-y += memcpy_64.o memmove_64.o memset.o copy_user.o rwlock_64.o copy_user_nocache_64.o diff --git a/arch/x86_64/lib/csum-partial.c b/arch/x86_64/lib/csum-partial.c deleted file mode 100644 index bc503f50690..00000000000 --- a/arch/x86_64/lib/csum-partial.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - * arch/x86_64/lib/csum-partial.c - * - * This file contains network checksum routines that are better done - * in an architecture-specific manner due to speed. - */ - -#include <linux/compiler.h> -#include <linux/module.h> -#include <asm/checksum.h> - -static inline unsigned short from32to16(unsigned a) -{ - unsigned short b = a >> 16; - asm("addw %w2,%w0\n\t" - "adcw $0,%w0\n" - : "=r" (b) - : "0" (b), "r" (a)); - return b; -} - -/* - * Do a 64-bit checksum on an arbitrary memory area. 
- * Returns a 32bit checksum. - * - * This isn't as time critical as it used to be because many NICs - * do hardware checksumming these days. - * - * Things tried and found to not make it faster: - * Manual Prefetching - * Unrolling to an 128 bytes inner loop. - * Using interleaving with more registers to break the carry chains. - */ -static unsigned do_csum(const unsigned char *buff, unsigned len) -{ - unsigned odd, count; - unsigned long result = 0; - - if (unlikely(len == 0)) - return result; - odd = 1 & (unsigned long) buff; - if (unlikely(odd)) { - result = *buff << 8; - len--; - buff++; - } - count = len >> 1; /* nr of 16-bit words.. */ - if (count) { - if (2 & (unsigned long) buff) { - result += *(unsigned short *)buff; - count--; - len -= 2; - buff += 2; - } - count >>= 1; /* nr of 32-bit words.. */ - if (count) { - unsigned long zero; - unsigned count64; - if (4 & (unsigned long) buff) { - result += *(unsigned int *) buff; - count--; - len -= 4; - buff += 4; - } - count >>= 1; /* nr of 64-bit words.. 
*/ - - /* main loop using 64byte blocks */ - zero = 0; - count64 = count >> 3; - while (count64) { - asm("addq 0*8(%[src]),%[res]\n\t" - "adcq 1*8(%[src]),%[res]\n\t" - "adcq 2*8(%[src]),%[res]\n\t" - "adcq 3*8(%[src]),%[res]\n\t" - "adcq 4*8(%[src]),%[res]\n\t" - "adcq 5*8(%[src]),%[res]\n\t" - "adcq 6*8(%[src]),%[res]\n\t" - "adcq 7*8(%[src]),%[res]\n\t" - "adcq %[zero],%[res]" - : [res] "=r" (result) - : [src] "r" (buff), [zero] "r" (zero), - "[res]" (result)); - buff += 64; - count64--; - } - - /* last upto 7 8byte blocks */ - count %= 8; - while (count) { - asm("addq %1,%0\n\t" - "adcq %2,%0\n" - : "=r" (result) - : "m" (*(unsigned long *)buff), - "r" (zero), "0" (result)); - --count; - buff += 8; - } - result = add32_with_carry(result>>32, - result&0xffffffff); - - if (len & 4) { - result += *(unsigned int *) buff; - buff += 4; - } - } - if (len & 2) { - result += *(unsigned short *) buff; - buff += 2; - } - } - if (len & 1) - result += *buff; - result = add32_with_carry(result>>32, result & 0xffffffff); - if (unlikely(odd)) { - result = from32to16(result); - result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); - } - return result; -} - -/* - * computes the checksum of a memory block at buff, length len, - * and adds in "sum" (32-bit) - * - * returns a 32-bit number suitable for feeding into itself - * or csum_tcpudp_magic - * - * this function must be called with even lengths, except - * for the last fragment, which may be odd - * - * it's best to have buff aligned on a 64-bit boundary - */ -__wsum csum_partial(const void *buff, int len, __wsum sum) -{ - return (__force __wsum)add32_with_carry(do_csum(buff, len), - (__force u32)sum); -} - -EXPORT_SYMBOL(csum_partial); - -/* - * this routine is used for miscellaneous IP-like checksums, mainly - * in icmp.c - */ -__sum16 ip_compute_csum(const void *buff, int len) -{ - return csum_fold(csum_partial(buff,len,0)); -} -EXPORT_SYMBOL(ip_compute_csum); - diff --git a/arch/x86_64/lib/csum-partial_64.c 
b/arch/x86_64/lib/csum-partial_64.c new file mode 100644 index 00000000000..bc503f50690 --- /dev/null +++ b/arch/x86_64/lib/csum-partial_64.c @@ -0,0 +1,150 @@ +/* + * arch/x86_64/lib/csum-partial.c + * + * This file contains network checksum routines that are better done + * in an architecture-specific manner due to speed. + */ + +#include <linux/compiler.h> +#include <linux/module.h> +#include <asm/checksum.h> + +static inline unsigned short from32to16(unsigned a) +{ + unsigned short b = a >> 16; + asm("addw %w2,%w0\n\t" + "adcw $0,%w0\n" + : "=r" (b) + : "0" (b), "r" (a)); + return b; +} + +/* + * Do a 64-bit checksum on an arbitrary memory area. + * Returns a 32bit checksum. + * + * This isn't as time critical as it used to be because many NICs + * do hardware checksumming these days. + * + * Things tried and found to not make it faster: + * Manual Prefetching + * Unrolling to an 128 bytes inner loop. + * Using interleaving with more registers to break the carry chains. + */ +static unsigned do_csum(const unsigned char *buff, unsigned len) +{ + unsigned odd, count; + unsigned long result = 0; + + if (unlikely(len == 0)) + return result; + odd = 1 & (unsigned long) buff; + if (unlikely(odd)) { + result = *buff << 8; + len--; + buff++; + } + count = len >> 1; /* nr of 16-bit words.. */ + if (count) { + if (2 & (unsigned long) buff) { + result += *(unsigned short *)buff; + count--; + len -= 2; + buff += 2; + } + count >>= 1; /* nr of 32-bit words.. */ + if (count) { + unsigned long zero; + unsigned count64; + if (4 & (unsigned long) buff) { + result += *(unsigned int *) buff; + count--; + len -= 4; + buff += 4; + } + count >>= 1; /* nr of 64-bit words.. 
*/ + + /* main loop using 64byte blocks */ + zero = 0; + count64 = count >> 3; + while (count64) { + asm("addq 0*8(%[src]),%[res]\n\t" + "adcq 1*8(%[src]),%[res]\n\t" + "adcq 2*8(%[src]),%[res]\n\t" + "adcq 3*8(%[src]),%[res]\n\t" + "adcq 4*8(%[src]),%[res]\n\t" + "adcq 5*8(%[src]),%[res]\n\t" + "adcq 6*8(%[src]),%[res]\n\t" + "adcq 7*8(%[src]),%[res]\n\t" + "adcq %[zero],%[res]" + : [res] "=r" (result) + : [src] "r" (buff), [zero] "r" (zero), + "[res]" (result)); + buff += 64; + count64--; + } + + /* last upto 7 8byte blocks */ + count %= 8; + while (count) { + asm("addq %1,%0\n\t" + "adcq %2,%0\n" + : "=r" (result) + : "m" (*(unsigned long *)buff), + "r" (zero), "0" (result)); + --count; + buff += 8; + } + result = add32_with_carry(result>>32, + result&0xffffffff); + + if (len & 4) { + result += *(unsigned int *) buff; + buff += 4; + } + } + if (len & 2) { + result += *(unsigned short *) buff; + buff += 2; + } + } + if (len & 1) + result += *buff; + result = add32_with_carry(result>>32, result & 0xffffffff); + if (unlikely(odd)) { + result = from32to16(result); + result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); + } + return result; +} + +/* + * computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit) + * + * returns a 32-bit number suitable for feeding into itself + * or csum_tcpudp_magic + * + * this function must be called with even lengths, except + * for the last fragment, which may be odd + * + * it's best to have buff aligned on a 64-bit boundary + */ +__wsum csum_partial(const void *buff, int len, __wsum sum) +{ + return (__force __wsum)add32_with_carry(do_csum(buff, len), + (__force u32)sum); +} + +EXPORT_SYMBOL(csum_partial); + +/* + * this routine is used for miscellaneous IP-like checksums, mainly + * in icmp.c + */ +__sum16 ip_compute_csum(const void *buff, int len) +{ + return csum_fold(csum_partial(buff,len,0)); +} +EXPORT_SYMBOL(ip_compute_csum); + -- cgit