crc32intelc.h - metacpan.org


            
              1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
              // Copyright 2016 Ferry Toth, Exalon Delft BV, The Netherlands
/*
  This software is provided 'as-is', without any express or implied
  warranty.  In no event will the author be held liable for any damages
  arising from the use of this software.
  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:
  1. The origin of this software must not be misrepresented; you must not
    claim that you wrote the original software. If you use this software
    in a product, an acknowledgment in the product documentation would be
    appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
    misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.
  Ferry Toth
  ftoth@exalondelft.nl
*/
/* Use the hardware CRC instruction on Intel SSE 4.2 processors.  This computes a
  CRC-32C, *not* the CRC-32 used by Ethernet and zip, gzip, etc. Where it is efficient,
  3 crc32q instructions are issued together, which a single core can execute in parallel.
  This compensates for the latency of a single crc32q instruction. Combining the
  3 partial CRC-32C values is done using the pclmulqdq instruction, which has overhead of
  its own and makes this code path efficient only for buffer sizes above 216 bytes.
  All code requiring a crc32q instruction is wrapped in a macro, for which alternative
  code is generated on 32-bit platforms.

  This code is a port of Intel's crc_iscsi_v_pcl.asm assembly code (which is part of
  this project and, in a modified form, of the Linux kernel) and reaches the same
  throughput on 64-bit platforms. The main advantage of this port is that it was
  relatively easy to port to 32-bit platforms (like Intel Edison, which currently has
  only 32-bit support). Being written in C, it is of course easier to maintain and
  possibly to optimize further. */
/* Version history:
  1.0  07 May 2016  Ferry Toth - First version
*/
/* CRC_NATIVE: unsigned word type chosen by the pointer model of the target.
 * LP64 builds get uint64_t, every other build gets uint32_t.
 * <stdint.h> must already be included wherever the macro is expanded. */
#if defined(__LP64__)
#  define CRC_NATIVE uint64_t
#else
#  define CRC_NATIVE uint32_t
#endif
/*
 * CRCtriplet(crc, buf, offset) - feed one 8-byte step into each of three
 * separate CRC accumulators.
 *
 * `crc` and `buf` are identifier PREFIXES, not expressions: the macro pastes
 * 0, 1 and 2 onto them, so CRCtriplet(crc, next, i) reads through next0..next2
 * and updates crc0..crc2. All six names must be visible where the macro is
 * expanded.
 *
 * `offset` selects the step relative to each bufN:
 *   - 64-bit build: *(bufN + offset) is passed to the 64-bit crc32 builtin, so
 *     offset is in units of bufN's element type (assumes bufN point to 64-bit
 *     words -- confirm against the caller).
 *   - other builds: bufN is viewed as 32-bit words and words 2*offset and
 *     2*offset + 1 are fed to the 32-bit crc32 builtin, first the three low
 *     words, then the three following ones.
 *
 * `offset` is parenthesized in the expansion so that expression arguments
 * (e.g. `i - 1`, `flag ? -1 : 0`) are scaled and added as a whole.
 * The source pointers are only read, hence the const-qualified casts.
 *
 * The expansion is a run of complete statements, each already ending in ';'.
 */
#ifndef __LP64__
#define CRCtriplet(crc, buf, offset) \
    crc ## 0 = __builtin_ia32_crc32si(crc ## 0, *((const uint32_t *) buf ## 0 + 2 * (offset))); \
    crc ## 1 = __builtin_ia32_crc32si(crc ## 1, *((const uint32_t *) buf ## 1 + 2 * (offset))); \
    crc ## 2 = __builtin_ia32_crc32si(crc ## 2, *((const uint32_t *) buf ## 2 + 2 * (offset))); \
    crc ## 0 = __builtin_ia32_crc32si(crc ## 0, *((const uint32_t *) buf ## 0 + 1 + 2 * (offset))); \
    crc ## 1 = __builtin_ia32_crc32si(crc ## 1, *((const uint32_t *) buf ## 1 + 1 + 2 * (offset))); \
    crc ## 2 = __builtin_ia32_crc32si(crc ## 2, *((const uint32_t *) buf ## 2 + 1 + 2 * (offset)));
#else
#define CRCtriplet(crc, buf, offset) \
    crc ## 0 = __builtin_ia32_crc32di(crc ## 0, *(buf ## 0 + (offset))); \
    crc ## 1 = __builtin_ia32_crc32di(crc ## 1, *(buf ## 1 + (offset))); \
    crc ## 2 = __builtin_ia32_crc32di(crc ## 2, *(buf ## 2 + (offset)));
#endif
/*
 * CRCduplet(crc, buf, offset) - feed one 8-byte step into each of two
 * separate CRC accumulators.
 *
 * `crc` and `buf` are identifier PREFIXES: 0 and 1 are pasted onto them, so
 * CRCduplet(crc, next, i) reads through next0/next1 and updates crc0/crc1.
 * Names with suffix 2 are not touched.
 *
 * `offset` selects the step relative to each bufN:
 *   - 64-bit build: *(bufN + offset) goes to the 64-bit crc32 builtin
 *     (assumes bufN point to 64-bit words -- confirm against the caller).
 *   - other builds: bufN is viewed as 32-bit words and words 2*offset and
 *     2*offset + 1 go to the 32-bit crc32 builtin.
 *
 * `offset` is parenthesized in the expansion so that expression arguments are
 * scaled and added as a whole; the source pointers are only read, hence the
 * const-qualified casts.
 *
 * The expansion is a run of complete statements, each already ending in ';'.
 */
#ifndef __LP64__
#define CRCduplet(crc, buf, offset) \
    crc ## 0 = __builtin_ia32_crc32si(crc ## 0, *((const uint32_t *) buf ## 0 + 2 * (offset))); \
    crc ## 1 = __builtin_ia32_crc32si(crc ## 1, *((const uint32_t *) buf ## 1 + 2 * (offset))); \
    crc ## 0 = __builtin_ia32_crc32si(crc ## 0, *((const uint32_t *) buf ## 0 + 1 + 2 * (offset))); \
    crc ## 1 = __builtin_ia32_crc32si(crc ## 1, *((const uint32_t *) buf ## 1 + 1 + 2 * (offset)));
#else
#define CRCduplet(crc, buf, offset) \
    crc ## 0 = __builtin_ia32_crc32di(crc ## 0, *(buf ## 0 + (offset))); \
    crc ## 1 = __builtin_ia32_crc32di(crc ## 1, *(buf ## 1 + (offset)));
#endif
/*
 * CRCsinglet(crc, buf, offset) - feed the 8 bytes at buf + offset into the
 * single accumulator `crc` (an lvalue, updated in place).
 *
 * `offset` is added to `buf` with ordinary pointer arithmetic BEFORE the cast,
 * so it is in units of buf's element type. The 32-bit branch steps to the
 * second half with `+ sizeof(uint32_t)`, which presumes buf is a byte pointer
 * -- confirm against the caller.
 *   - 64-bit build: one 64-bit crc32 step on the 8 bytes.
 *   - other builds: two 32-bit crc32 steps, lower address first.
 *
 * `buf` and `offset` are parenthesized in the expansion so that expression
 * arguments keep their meaning inside the address computation; the data is
 * only read, hence the const-qualified casts.
 *
 * The expansion is one or two complete statements, each already ending in ';'.
 */
#ifndef __LP64__
#define CRCsinglet(crc, buf, offset) \
    crc = __builtin_ia32_crc32si(crc, *(const uint32_t *)((buf) + (offset))); \
    crc = __builtin_ia32_crc32si(crc, *(const uint32_t *)((buf) + (offset) + sizeof(uint32_t)));
#else
#define CRCsinglet(crc, buf, offset) \
    crc = __builtin_ia32_crc32di(crc, *(const uint64_t *)((buf) + (offset)));
#endif
/*
 * CombineCRC() merges the partial CRCs crc0, crc1 and crc2 into a single value,
 * which is left in crc0.
 *
 * The macro takes no arguments. It expands to statements that use these names
 * from the scope where it is expanded: crc0, crc1, crc2, block_size, next2 and
 * crc32cIntelC_K (none of them is defined in this header).
 *
 * Steps, common to both variants:
 *   1. Load the 128 bits at (crc32cIntelC_K + block_size - 1) into xmm0
 *      (presumably a table of per-block-size constant pairs -- TODO confirm
 *      against the table definition).
 *   2. Carry-less multiply crc0 by the low quadword of xmm0 (pclmullqlqdq) and
 *      crc1 by the high quadword of xmm0 (pclmullqhqdq); xor the two products.
 *   3. xor the low 64 bits of that result with the 8 bytes located immediately
 *      before next2, i.e. *((uint64_t*) next2 - 1) (presumably the last data
 *      word of the third stream -- confirm against the caller).
 *   4. Run those 64 bits through the crc32 instruction with crc2 as the running
 *      value and store the outcome in crc0.
 *
 * I (Ferry Toth) could not find a way to implement the pclmulqdq part in C, so
 * the 64-bit code below is from Intel. As that code runs only on 64-bit
 * platforms (due to the movq instructions), a 32-bit variant is provided that
 * does the same using movd. The 32-bit version keeps intermediate results in
 * the xmm registers longer in order to do the second xor there, then moves the
 * two 32-bit halves out one at a time for the final crc32l steps.
 */
#ifndef __LP64__
/*
 * 32-bit variant: steps 1-4 are all done inside the asm statement.
 * Operands: %0/%1 = crc0 (output, tied input), %2 = crc1, %3 = constant address,
 * %4 = (uint64_t*) next2 - 1, %5 = crc2.
 * movdqu is used for both memory loads, so neither address needs alignment.
 * The low dword (movd) and dword 1 (pextrd $1) of xmm1 are fed to crc32l in turn.
 *
 * NOTE(review): %5 (crc2) is declared as an input operand but is written by
 * crc32l; GCC's extended-asm rules say input-only operands must not be
 * modified. Verify, and consider making it a read-write operand.
 * NOTE(review): "movdqu (%4)" loads 16 bytes starting 8 bytes before next2,
 * although only the low 8 bytes are used afterwards; confirm the 8 bytes
 * starting at next2 are always readable.
 */
#define CombineCRC()\
asm volatile (\
"movdqu (%3), %%xmm0\n\t"\
"movd %0, %%xmm1\n\t"\
"pclmullqlqdq %%xmm0, %%xmm1\n\t"\
"movd %2, %%xmm2\n\t"\
"pclmullqhqdq %%xmm0, %%xmm2\n\t"\
"pxor %%xmm2, %%xmm1\n\t"\
"movdqu (%4), %%xmm2\n\t"\
"pxor %%xmm2, %%xmm1\n\t"\
"movd %%xmm1, %0\n\t"\
"crc32l %0, %5\n\t"\
"pextrd $1, %%xmm1, %1\n\t"\
"crc32l %1, %5\n\t"\
"movl %5, %0"\
: "=r" ( crc0 )\
: "0" ( crc0 ), "r" ( crc1 ), "r" ( crc32cIntelC_K + block_size - 1 ), "r" ( ( uint64_t* ) next2 - 1 ), "r" ( crc2 )\
: "%xmm0", "%xmm1", "%xmm2"\
);
#else
/*
 * 64-bit variant: the asm statement performs steps 1-2 and returns the xor of
 * the two products in crc0; steps 3-4 follow as plain C statements.
 * Operands: %0/%1 = crc0 (output, tied input), %2 = crc1, %3 = constant address.
 * movdqa is an aligned load, so the constant address must be 16-byte aligned.
 * Unlike the 32-bit variant, the C variable crc2 is also updated here (it ends
 * up equal to crc0).
 */
#define CombineCRC()\
asm volatile (\
"movdqa (%3), %%xmm0\n\t"\
"movq %0, %%xmm1\n\t"\
"pclmullqlqdq %%xmm0, %%xmm1\n\t"\
"movq %2, %%xmm2\n\t"\
"pclmullqhqdq %%xmm0, %%xmm2\n\t"\
"pxor %%xmm2, %%xmm1\n\t"\
"movq %%xmm1, %0"\
: "=r" ( crc0 ) \
: "0" ( crc0 ), "r" ( crc1 ), "r" ( crc32cIntelC_K + block_size - 1 ) \
: "%xmm0", "%xmm1", "%xmm2"\
); \
crc0 = crc0 ^ * ( ( uint64_t* ) next2 - 1 );\
crc2 = __builtin_ia32_crc32di ( crc2, crc0 );\
crc0 = crc2;
#endif
// kate: indent-mode cstyle; indent-width 4; replace-tabs on;
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)