;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright (c) 2012, Intel Corporation 
; 
; All rights reserved. 
; 
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are
; met: 
; 
; * Redistributions of source code must retain the above copyright
;   notice, this list of conditions and the following disclaimer.  
; 
; * Redistributions in binary form must reproduce the above copyright
;   notice, this list of conditions and the following disclaimer in the
;   documentation and/or other materials provided with the
;   distribution. 
; 
; * Neither the name of the Intel Corporation nor the names of its
;   contributors may be used to endorse or promote products derived from
;   this software without specific prior written permission. 
; 
; 
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;       Function API:
;       UINT16 crc16_T10DIF_128x(       
;               UINT16 init_crc,                //initial CRC value, 16 bits
;               const unsigned char *buf,       //buffer pointer to calculate CRC on
;               UINT64 len                      //buffer length in bytes (64-bit data)
;               );
;       
;       This code only processes whole 128-byte blocks of the input buffer.
;       If, for example, len is set to 257, the code will compute the CRC of the first 257 - (257 mod 128) = 256 bytes.
;       If len is less than 128, no data is processed and init_crc is returned unchanged.
;       Code works only on 64-bit platforms.
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;               Wajdi Feghali
;       
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://download.intel.com/design/intarch/papers/323102.pdf
;
;
;       sample yasm command lines (choose the object format for your platform):
;               yasm -f elf64 -X gnu -g dwarf2 crc16_T10DIF_128x.asm
;               yasm -f x64 -D WIN_ABI crc16_T10DIF_128x.asm
;
[bits 64]

section .text

; Argument-register aliases for the selected calling convention.
;   arg1 / arg2 / arg3 : registers carrying the 1st / 2nd / 3rd integer argument
;   arg1_low32         : 32-bit view of arg1
; Add "WIN_ABI" to the preprocessor definitions when building for Windows;
; without it the rdi/rsi/rdx assignment is used.
%ifndef WIN_ABI
        %xdefine        arg1            rdi
        %xdefine        arg2            rsi
        %xdefine        arg3            rdx

        %xdefine        arg1_low32      edi
%else
        %xdefine        arg1            rcx
        %xdefine        arg2            rdx
        %xdefine        arg3            r8

        %xdefine        arg1_low32      ecx
%endif


;-----------------------------------------------------------------------
; UINT16 crc16_T10DIF_128x(UINT16 init_crc, const unsigned char *buf, UINT64 len)
;
; Folds the first (len - len mod 128) bytes of buf into init_crc with
; PCLMULQDQ and returns the resulting 16-bit CRC. Internally the CRC is
; handled as a 32-bit value (init_crc << 16) and scaled back at the end.
;
; In:     arg1_low32 = init_crc, arg2 = buf, arg3 = len (unsigned)
; Out:    eax = CRC in the low 16 bits;
;         init_crc is returned unchanged when len < 128
; Clobb:  arg1, arg2, arg3, rax, flags, xmm0-xmm13
;         (xmm6-xmm13 are saved and restored when WIN_ABI is defined)
; Stack:  16*10+8 bytes when WIN_ABI is defined, none otherwise
; Needs:  pshufb, pextrd and pclmulqdq support on the CPU
;-----------------------------------------------------------------------

%ifdef WIN_ABI
        ; Frame used only to preserve the callee-saved xmm registers.
        ; On entry rsp is 8 mod 16 (return address pushed), so subtracting
        ; 16*10+8 leaves rsp 16-byte aligned for the movdqa saves below.
        ; NOTE(review): no unwind information is emitted for this frame.
        %define CRC16_128X_FRAME_SIZE   (16*10+8)
        %define CRC16_128X_XMM_SAVE     (16*2)
%endif

; Fold two accumulators forward by 128 bytes and absorb the two matching
; 16-byte input chunks.
;   %1     = index of the first 16-byte chunk at [arg2] (second is %1+1)
;   %2, %3 = accumulators receiving chunk %1 and chunk %1+1
; Expects xmm10 = rk3 (low qword) : rk4 (high qword), xmm11 = SHUF_MASK.
; Clobbers xmm8, xmm9, xmm12, xmm13.
%macro CRC16_128X_FOLD_PAIR 3
        movdqu  xmm9, [arg2+16*(%1)]
        movdqu  xmm12, [arg2+16*(%1+1)]
        pshufb  xmm9, xmm11                     ; reverse byte order of the new data
        pshufb  xmm12, xmm11
        movdqa  xmm8, %2
        movdqa  xmm13, %3
        pclmulqdq       %2, xmm10, 0x00         ; low qword  * rk3
        pclmulqdq       xmm8, xmm10, 0x11       ; high qword * rk4
        pclmulqdq       %3, xmm10, 0x00
        pclmulqdq       xmm13, xmm10, 0x11
        pxor    %2, xmm9
        xorps   %2, xmm8
        pxor    %3, xmm12
        xorps   %3, xmm13
%endmacro

; Fold one accumulator into xmm7 using the 128-bit constant pair at label %2.
;   %1 = accumulator to fold (destroyed), %2 = label of the constant pair
; Clobbers xmm8, xmm10.
%macro CRC16_128X_FOLD_INTO_XMM7 2
        movdqa  xmm10, [%2 wrt rip]
        movdqa  xmm8, %1
        pclmulqdq       %1, xmm10, 0x11         ; high qword * high constant
        pclmulqdq       xmm8, xmm10, 0x00       ; low qword  * low constant
        pxor    xmm7, xmm8
        pxor    xmm7, %1
%endmacro

align   16
global  crc16_T10DIF_128x
crc16_T10DIF_128x:
        ; clear the low 7 bits of the length to get length - (length mod 128);
        ; if nothing is left there is no data to process
        and     arg3, ~127
        jz      .zero_bytes

        ; scale the 16-bit initial_crc value to 32 bits
        ; (done after the zero-length check so that path returns it unscaled)
        shl     arg1_low32, 16

%ifdef WIN_ABI
        ; save the xmm registers that must be preserved for the caller
        sub     rsp, CRC16_128X_FRAME_SIZE
        movdqa  [rsp+CRC16_128X_XMM_SAVE+16*0], xmm6
        movdqa  [rsp+CRC16_128X_XMM_SAVE+16*1], xmm7
        movdqa  [rsp+CRC16_128X_XMM_SAVE+16*2], xmm8
        movdqa  [rsp+CRC16_128X_XMM_SAVE+16*3], xmm9
        movdqa  [rsp+CRC16_128X_XMM_SAVE+16*4], xmm10
        movdqa  [rsp+CRC16_128X_XMM_SAVE+16*5], xmm11
        movdqa  [rsp+CRC16_128X_XMM_SAVE+16*6], xmm12
        movdqa  [rsp+CRC16_128X_XMM_SAVE+16*7], xmm13
%endif

        ; place the scaled initial crc in the top 32 bits of xmm10 so it
        ; lines up with the first (byte-reversed) data bytes
        movd    xmm10, arg1_low32
        pslldq  xmm10, 12

        movdqa  xmm11, [SHUF_MASK wrt rip]      ; xmm11 = byte-reversal mask

        ; load the initial 128B of data into the eight accumulators
        movdqu  xmm0, [arg2+16*0]
        movdqu  xmm1, [arg2+16*1]
        movdqu  xmm2, [arg2+16*2]
        movdqu  xmm3, [arg2+16*3]
        movdqu  xmm4, [arg2+16*4]
        movdqu  xmm5, [arg2+16*5]
        movdqu  xmm6, [arg2+16*6]
        movdqu  xmm7, [arg2+16*7]

        pshufb  xmm0, xmm11
        pxor    xmm0, xmm10                     ; XOR in the initial_crc value
        pshufb  xmm1, xmm11
        pshufb  xmm2, xmm11
        pshufb  xmm3, xmm11
        pshufb  xmm4, xmm11
        pshufb  xmm5, xmm11
        pshufb  xmm6, xmm11
        pshufb  xmm7, xmm11

        movdqa  xmm10, [rk3 wrt rip]            ; xmm10 has rk3 and rk4

        ; arg3 = remaining length - 256 (128 already loaded, plus a bias of
        ; 128 so the loop can test the borrow of its own subtraction).
        ; The length is unsigned, so use the unsigned (carry) conditions.
        sub     arg3, 256
        jb      .fold_8_to_1                    ; only one 128B block in total

        ; fold 128B at a time: the 8 xmm accumulators are folded in parallel
.fold_128B_loop:
        add     arg2, 128                       ; buf += 128

        CRC16_128X_FOLD_PAIR    0, xmm0, xmm1
        CRC16_128X_FOLD_PAIR    2, xmm2, xmm3
        CRC16_128X_FOLD_PAIR    4, xmm4, xmm5
        CRC16_128X_FOLD_PAIR    6, xmm6, xmm7

        ; continue while another full 128B block remains (no borrow)
        sub     arg3, 128
        jae     .fold_128B_loop

.fold_8_to_1:
        ; the 128B of folded data is in xmm0..xmm7; fold xmm0..xmm6 into
        ; xmm7, each with its own constant pair
        CRC16_128X_FOLD_INTO_XMM7       xmm0, rk9
        CRC16_128X_FOLD_INTO_XMM7       xmm1, rk11
        CRC16_128X_FOLD_INTO_XMM7       xmm2, rk13
        CRC16_128X_FOLD_INTO_XMM7       xmm3, rk15
        CRC16_128X_FOLD_INTO_XMM7       xmm4, rk17
        CRC16_128X_FOLD_INTO_XMM7       xmm5, rk19
        CRC16_128X_FOLD_INTO_XMM7       xmm6, rk1

        ; compute crc of the remaining 128-bit value in xmm7
        movdqa  xmm10, [rk5 wrt rip]            ; rk5 and rk6 in xmm10
        movdqa  xmm0, xmm7

        ; 64b fold: high qword of xmm7 * rk5, XORed with the low qword
        ; shifted up by 8 bytes
        pclmulqdq       xmm7, xmm10, 0x1
        pslldq  xmm0, 8
        pxor    xmm7, xmm0

        ; 32b fold: top 32 bits * rk6, XORed with the low 96 bits
        movdqa  xmm0, xmm7
        pand    xmm0, [mask wrt rip]            ; keep the low 96 bits
        psrldq  xmm7, 12                        ; isolate the top 32 bits
        pclmulqdq       xmm7, xmm10, 0x10
        pxor    xmm7, xmm0

        ; barrett reduction
        movdqa  xmm10, [rk7 wrt rip]            ; rk7 and rk8 in xmm10
        movdqa  xmm0, xmm7
        pclmulqdq       xmm7, xmm10, 0x01
        pslldq  xmm7, 4
        pclmulqdq       xmm7, xmm10, 0x11
        pslldq  xmm7, 4
        pxor    xmm7, xmm0
        pextrd  eax, xmm7, 1                    ; 32-bit result is in dword 1

        ; scale the result back to 16 bits
        shr     eax, 16

%ifdef WIN_ABI
        ; restore the caller's xmm registers and release the frame
        movdqa  xmm6, [rsp+CRC16_128X_XMM_SAVE+16*0]
        movdqa  xmm7, [rsp+CRC16_128X_XMM_SAVE+16*1]
        movdqa  xmm8, [rsp+CRC16_128X_XMM_SAVE+16*2]
        movdqa  xmm9, [rsp+CRC16_128X_XMM_SAVE+16*3]
        movdqa  xmm10, [rsp+CRC16_128X_XMM_SAVE+16*4]
        movdqa  xmm11, [rsp+CRC16_128X_XMM_SAVE+16*5]
        movdqa  xmm12, [rsp+CRC16_128X_XMM_SAVE+16*6]
        movdqa  xmm13, [rsp+CRC16_128X_XMM_SAVE+16*7]
        add     rsp, CRC16_128X_FRAME_SIZE
%endif
        ret

.zero_bytes:
        ; fewer than 128 bytes: return the initial crc untouched
        mov     eax, arg1_low32
        ret


section .data

; Precomputed folding / reduction constants for crc16_T10DIF_128x.
; They are derived from the polynomial 0x8bb70000 (0x8bb7 scaled to 32 bits);
; Q = 0x18BB70000.
; The table starts on a 16-byte boundary and every entry is 8 bytes, so each
; odd/even pair (rk1:rk2, rk3:rk4, ...) can be fetched with one aligned
; 128-bit load: the odd-numbered constant lands in the low qword and the
; even-numbered one in the high qword.
; NOTE(review): these tables are only read by the code in this file; a
; read-only section may be a better home - confirm per object format.
align 16
rk1:    dq 0x2d56000000000000
rk2:    dq 0x06df000000000000
rk3:    dq 0x9d9d000000000000
rk4:    dq 0x7cf5000000000000
rk5:    dq 0x2d56000000000000
rk6:    dq 0x1368000000000000
rk7:    dq 0x00000001f65a57f8
rk8:    dq 0x000000018bb70000
rk9:    dq 0xceae000000000000
rk10:   dq 0xbfd6000000000000
rk11:   dq 0x1e16000000000000
rk12:   dq 0x713c000000000000
rk13:   dq 0xf7f9000000000000
rk14:   dq 0x80a6000000000000
rk15:   dq 0x044c000000000000
rk16:   dq 0xe658000000000000
rk17:   dq 0xad18000000000000
rk18:   dq 0xa497000000000000
rk19:   dq 0x6ee3000000000000
rk20:   dq 0xe7b5000000000000

; 128-bit constants, written as (low qword, high qword).
align 16
; keeps the low 96 bits of a 128-bit value and clears the top 32 bits
mask:
        dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

; pshufb control that reverses the order of the 16 bytes in a register
SHUF_MASK:
        dq 0x08090A0B0C0D0E0F, 0x0001020304050607