author     Greg Hudson <ghudson@mit.edu>   2013-04-24 13:04:19 -0400
committer  Greg Hudson <ghudson@mit.edu>   2013-05-24 14:17:06 -0400
commit     7809ae6c7d9d737e1a7becc0851148c73c095c4b (patch)
tree       3334eb50c8146dd29252a0ae268832d37ce3175c
parent     94d2162dcf1eec509ac376955615ab38309035be (diff)
Add Intel AESNI assembly files
Add assembly files from the Intel AESNI Sample Library, version 1.2, which implement AES encryption using AES-NI instructions. Trailing whitespace was removed.
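
The assembly exports C-callable entry points (iEncExpandKey128/192/256, iDecExpandKey128/192/256, and iEnc*/iDec* routines for ECB, CBC, and CTR mode). The commit adds only the .s files, so the sketch below is not part of the patch: the prototypes and the parameter-block layout are assumptions inferred from the register and offset usage in iaesx64.s (the block-mode routines take a single pointer whose fields, on LP64, sit at offsets 0, 8, 16, 24, and 32: input, output, expanded key, IV, and block count). Treat it as orientation only, not as the project's actual header.

    /*
     * Hypothetical C-side view of a few entry points in iaesx64.s; not part
     * of this commit.  The struct layout and prototypes are inferred from
     * the offsets the assembly reads and may not match the real header.
     */
    #include <stdint.h>

    typedef struct {
        const uint8_t *in;        /* offset 0:  input blocks               */
        uint8_t       *out;       /* offset 8:  output blocks              */
        const uint8_t *key_sched; /* offset 16: expanded round keys        */
        uint8_t       *iv;        /* offset 24: IV, rewritten for chaining */
        uint32_t       nblocks;   /* offset 32: number of 16-byte blocks   */
    } aes_block_args;             /* illustrative name only */

    /* Assumed prototypes for the exported symbols used below. */
    void iEncExpandKey128(const uint8_t *key, uint8_t *key_sched);
    void iDecExpandKey128(const uint8_t *key, uint8_t *key_sched);
    void iEnc128_CBC(aes_block_args *args);
    void iDec128_CBC(aes_block_args *args);

    /* AES-128 CBC encryption of nblocks whole blocks (no padding). */
    void cbc128_encrypt(const uint8_t key[16], uint8_t iv[16],
                        const uint8_t *pt, uint8_t *ct, uint32_t nblocks)
    {
        uint8_t sched[11 * 16];       /* AES-128 uses 11 round keys */
        aes_block_args a;

        iEncExpandKey128(key, sched); /* build the encryption schedule */
        a.in = pt;
        a.out = ct;
        a.key_sched = sched;
        a.iv = iv;                    /* updated with the last ciphertext block */
        a.nblocks = nblocks;
        iEnc128_CBC(&a);
    }

    /* Decryption keeps a separate schedule: iDecExpandKey128 calls the
     * encryption expansion and then runs round keys 1-9 through aesimc
     * (the "inversekey" macro in the assembly). */
    void cbc128_decrypt(const uint8_t key[16], uint8_t iv[16],
                        const uint8_t *ct, uint8_t *pt, uint32_t nblocks)
    {
        uint8_t sched[11 * 16];
        aes_block_args a;

        iDecExpandKey128(key, sched);
        a.in = ct;
        a.out = pt;
        a.key_sched = sched;
        a.iv = iv;                    /* updated with the last input block */
        a.nblocks = nblocks;
        iDec128_CBC(&a);
    }

The 32-bit file (iaesx86.s) exports the same routines with a leading underscore (e.g. _iEncExpandKey128) and reads the corresponding 4-byte fields at offsets 0, 4, 8, 12, and 16.
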
-rw-r--r--  NOTICE                                 |   38
-rw-r--r--  doc/notice.rst                         |   38
-rw-r--r--  src/lib/crypto/builtin/aes/iaesx64.s   | 2081
-rw-r--r--  src/lib/crypto/builtin/aes/iaesx86.s   | 2199
4 files changed, 4356 insertions(+), 0 deletions(-)
diff --git a/NOTICE b/NOTICE
index ac08bba04c..08712a3668 100644
--- a/NOTICE
+++ b/NOTICE
@@ -1173,3 +1173,41 @@ The bundled libev source code is subject to the following license:
the GPL in this and the other files of this package. If you do not
delete the provisions above, a recipient may use your version of
this file under either the BSD or the GPL.
+
+======================================================================
+
+Files copied from the Intel AESNI Sample Library are subject to the
+following license:
+
+ Copyright (C) 2010, Intel Corporation
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials
+ provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products
+ derived from this software without specific prior written
+ permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/doc/notice.rst b/doc/notice.rst
index 0215841736..33ba998cf8 100644
--- a/doc/notice.rst
+++ b/doc/notice.rst
@@ -1120,3 +1120,41 @@ The bundled libev source code is subject to the following license:
required by the GPL in this and the other files of this package. If you do
not delete the provisions above, a recipient may use your version of this
file under either the BSD or the GPL.
+
+-------------------
+
+Files copied from the Intel AESNI Sample Library are subject to the
+following license:
+
+ Copyright |copy| 2010, Intel Corporation
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above
+ copyright notice, this list of conditions and the following
+ disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials
+ provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products
+ derived from this software without specific prior written
+ permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
+ BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ SUCH DAMAGE.
diff --git a/src/lib/crypto/builtin/aes/iaesx64.s b/src/lib/crypto/builtin/aes/iaesx64.s
new file mode 100644
index 0000000000..1012e36569
--- /dev/null
+++ b/src/lib/crypto/builtin/aes/iaesx64.s
@@ -0,0 +1,2081 @@
+[bits 64]
+[CPU intelnop]
+
+; Copyright (c) 2010, Intel Corporation
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; * Redistributions of source code must retain the above copyright notice,
+; this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+; * Neither the name of Intel Corporation nor the names of its contributors
+; may be used to endorse or promote products derived from this software
+; without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+; IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+; INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+; BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+; OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+; ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%macro linux_setup 0
+%ifdef __linux__
+ mov rcx, rdi
+ mov rdx, rsi
+%endif
+%endmacro
+
+%macro inversekey 1
+ movdqu xmm1,%1
+ aesimc xmm0,xmm1
+ movdqu %1,xmm0
+%endmacro
+
+%macro aesdeclast1 1
+ aesdeclast xmm0,%1
+%endmacro
+
+%macro aesenclast1 1
+ aesenclast xmm0,%1
+%endmacro
+
+%macro aesdec1 1
+ aesdec xmm0,%1
+%endmacro
+
+%macro aesenc1 1
+ aesenc xmm0,%1
+%endmacro
+
+
+%macro aesdeclast1_u 1
+ movdqu xmm4,%1
+ aesdeclast xmm0,xmm4
+%endmacro
+
+%macro aesenclast1_u 1
+ movdqu xmm4,%1
+ aesenclast xmm0,xmm4
+%endmacro
+
+%macro aesdec1_u 1
+ movdqu xmm4,%1
+ aesdec xmm0,xmm4
+%endmacro
+
+%macro aesenc1_u 1
+ movdqu xmm4,%1
+ aesenc xmm0,xmm4
+%endmacro
+
+%macro aesdec4 1
+ movdqa xmm4,%1
+
+ aesdec xmm0,xmm4
+ aesdec xmm1,xmm4
+ aesdec xmm2,xmm4
+ aesdec xmm3,xmm4
+
+%endmacro
+
+%macro aesdeclast4 1
+ movdqa xmm4,%1
+
+ aesdeclast xmm0,xmm4
+ aesdeclast xmm1,xmm4
+ aesdeclast xmm2,xmm4
+ aesdeclast xmm3,xmm4
+
+%endmacro
+
+
+%macro aesenc4 1
+ movdqa xmm4,%1
+
+ aesenc xmm0,xmm4
+ aesenc xmm1,xmm4
+ aesenc xmm2,xmm4
+ aesenc xmm3,xmm4
+
+%endmacro
+
+%macro aesenclast4 1
+ movdqa xmm4,%1
+
+ aesenclast xmm0,xmm4
+ aesenclast xmm1,xmm4
+ aesenclast xmm2,xmm4
+ aesenclast xmm3,xmm4
+
+%endmacro
+
+
+%macro load_and_inc4 1
+ movdqa xmm4,%1
+ movdqa xmm0,xmm5
+ pshufb xmm0, xmm6 ; byte swap counter back
+ movdqa xmm1,xmm5
+ paddd xmm1,[counter_add_one wrt rip]
+ pshufb xmm1, xmm6 ; byte swap counter back
+ movdqa xmm2,xmm5
+ paddd xmm2,[counter_add_two wrt rip]
+ pshufb xmm2, xmm6 ; byte swap counter back
+ movdqa xmm3,xmm5
+ paddd xmm3,[counter_add_three wrt rip]
+ pshufb xmm3, xmm6 ; byte swap counter back
+ pxor xmm0,xmm4
+ paddd xmm5,[counter_add_four wrt rip]
+ pxor xmm1,xmm4
+ pxor xmm2,xmm4
+ pxor xmm3,xmm4
+%endmacro
+
+%macro xor_with_input4 1
+ movdqu xmm4,[%1]
+ pxor xmm0,xmm4
+ movdqu xmm4,[%1+16]
+ pxor xmm1,xmm4
+ movdqu xmm4,[%1+32]
+ pxor xmm2,xmm4
+ movdqu xmm4,[%1+48]
+ pxor xmm3,xmm4
+%endmacro
+
+
+
+%macro load_and_xor4 2
+ movdqa xmm4,%2
+ movdqu xmm0,[%1 + 0*16]
+ pxor xmm0,xmm4
+ movdqu xmm1,[%1 + 1*16]
+ pxor xmm1,xmm4
+ movdqu xmm2,[%1 + 2*16]
+ pxor xmm2,xmm4
+ movdqu xmm3,[%1 + 3*16]
+ pxor xmm3,xmm4
+%endmacro
+
+%macro store4 1
+ movdqu [%1 + 0*16],xmm0
+ movdqu [%1 + 1*16],xmm1
+ movdqu [%1 + 2*16],xmm2
+ movdqu [%1 + 3*16],xmm3
+%endmacro
+
+%macro copy_round_keys 3
+ movdqu xmm4,[%2 + ((%3)*16)]
+ movdqa [%1 + ((%3)*16)],xmm4
+%endmacro
+
+
+%macro key_expansion_1_192 1
+ ;; Assumes the xmm3 includes all zeros at this point.
+ pshufd xmm2, xmm2, 11111111b
+ shufps xmm3, xmm1, 00010000b
+ pxor xmm1, xmm3
+ shufps xmm3, xmm1, 10001100b
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu [rdx+%1], xmm1
+%endmacro
+
+; Calculate w10 and w11 using calculated w9 and known w4-w5
+%macro key_expansion_2_192 1
+ movdqa xmm5, xmm4
+ pslldq xmm5, 4
+ shufps xmm6, xmm1, 11110000b
+ pxor xmm6, xmm5
+ pxor xmm4, xmm6
+ pshufd xmm7, xmm4, 00001110b
+ movdqu [rdx+%1], xmm7
+%endmacro
+
+
+section .data
+align 16
+shuffle_mask:
+DD 0FFFFFFFFh
+DD 03020100h
+DD 07060504h
+DD 0B0A0908h
+
+byte_swap_16:
+DDQ 0x000102030405060708090A0B0C0D0E0F
+
+align 16
+counter_add_one:
+DD 1
+DD 0
+DD 0
+DD 0
+
+counter_add_two:
+DD 2
+DD 0
+DD 0
+DD 0
+
+counter_add_three:
+DD 3
+DD 0
+DD 0
+DD 0
+
+counter_add_four:
+DD 4
+DD 0
+DD 0
+DD 0
+
+
+
+section .text
+
+align 16
+key_expansion256:
+
+ pshufd xmm2, xmm2, 011111111b
+
+ movdqa xmm4, xmm1
+ pshufb xmm4, xmm5
+ pxor xmm1, xmm4
+ pshufb xmm4, xmm5
+ pxor xmm1, xmm4
+ pshufb xmm4, xmm5
+ pxor xmm1, xmm4
+ pxor xmm1, xmm2
+
+ movdqu [rdx], xmm1
+ add rdx, 0x10
+
+ aeskeygenassist xmm4, xmm1, 0
+ pshufd xmm2, xmm4, 010101010b
+
+ movdqa xmm4, xmm3
+ pshufb xmm4, xmm5
+ pxor xmm3, xmm4
+ pshufb xmm4, xmm5
+ pxor xmm3, xmm4
+ pshufb xmm4, xmm5
+ pxor xmm3, xmm4
+ pxor xmm3, xmm2
+
+ movdqu [rdx], xmm3
+ add rdx, 0x10
+
+ ret
+
+
+
+align 16
+key_expansion128:
+ pshufd xmm2, xmm2, 0xFF;
+ movdqa xmm3, xmm1
+ pshufb xmm3, xmm5
+ pxor xmm1, xmm3
+ pshufb xmm3, xmm5
+ pxor xmm1, xmm3
+ pshufb xmm3, xmm5
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+
+ ; storing the result in the key schedule array
+ movdqu [rdx], xmm1
+ add rdx, 0x10
+ ret
+
+
+
+
+
+
+align 16
+global iEncExpandKey128
+iEncExpandKey128:
+
+ linux_setup
+
+ movdqu xmm1, [rcx] ; loading the key
+
+ movdqu [rdx], xmm1
+
+ movdqa xmm5, [shuffle_mask wrt rip]
+
+ add rdx,16
+
+ aeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1
+ call key_expansion128
+ aeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2
+ call key_expansion128
+ aeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3
+ call key_expansion128
+ aeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4
+ call key_expansion128
+ aeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5
+ call key_expansion128
+ aeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6
+ call key_expansion128
+ aeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7
+ call key_expansion128
+ aeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8
+ call key_expansion128
+ aeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9
+ call key_expansion128
+ aeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10
+ call key_expansion128
+
+ ret
+
+
+
+align 16
+global iEncExpandKey192
+iEncExpandKey192:
+
+ linux_setup
+ sub rsp,64+8
+ movdqa [rsp],xmm6
+ movdqa [rsp+16],xmm7
+
+
+ movq xmm7, [rcx+16] ; loading the AES key
+	movq [rdx+16], xmm7          ; Storing key in memory where all key expansion will be stored
+ pshufd xmm4, xmm7, 01001111b
+ movdqu xmm1, [rcx] ; loading the AES key
+	movdqu [rdx], xmm1           ; Storing key in memory where all key expansion will be stored
+
+ pxor xmm3, xmm3 ; Set xmm3 to be all zeros. Required for the key_expansion.
+	pxor xmm6, xmm6              ; Set xmm6 to be all zeros. Required for the key_expansion.
+
+ aeskeygenassist xmm2, xmm4, 0x1 ; Complete round key 1 and generate round key 2
+ key_expansion_1_192 24
+ key_expansion_2_192 40
+
+ aeskeygenassist xmm2, xmm4, 0x2 ; Generate round key 3 and part of round key 4
+ key_expansion_1_192 48
+ key_expansion_2_192 64
+
+ aeskeygenassist xmm2, xmm4, 0x4 ; Complete round key 4 and generate round key 5
+ key_expansion_1_192 72
+ key_expansion_2_192 88
+
+ aeskeygenassist xmm2, xmm4, 0x8 ; Generate round key 6 and part of round key 7
+ key_expansion_1_192 96
+ key_expansion_2_192 112
+
+ aeskeygenassist xmm2, xmm4, 0x10 ; Complete round key 7 and generate round key 8
+ key_expansion_1_192 120
+ key_expansion_2_192 136
+
+ aeskeygenassist xmm2, xmm4, 0x20 ; Generate round key 9 and part of round key 10
+ key_expansion_1_192 144
+ key_expansion_2_192 160
+
+ aeskeygenassist xmm2, xmm4, 0x40 ; Complete round key 10 and generate round key 11
+ key_expansion_1_192 168
+ key_expansion_2_192 184
+
+ aeskeygenassist xmm2, xmm4, 0x80 ; Generate round key 12
+ key_expansion_1_192 192
+
+
+ movdqa xmm6,[rsp]
+ movdqa xmm7,[rsp+16]
+ add rsp,64+8
+
+ ret
+
+
+
+
+align 16
+global iDecExpandKey128
+iDecExpandKey128:
+
+ linux_setup
+ push rcx
+ push rdx
+ sub rsp,16+8
+
+ call iEncExpandKey128
+
+ add rsp,16+8
+ pop rdx
+ pop rcx
+
+ inversekey [rdx + 1*16]
+ inversekey [rdx + 2*16]
+ inversekey [rdx + 3*16]
+ inversekey [rdx + 4*16]
+ inversekey [rdx + 5*16]
+ inversekey [rdx + 6*16]
+ inversekey [rdx + 7*16]
+ inversekey [rdx + 8*16]
+ inversekey [rdx + 9*16]
+
+ ret
+
+
+align 16
+global iDecExpandKey192
+iDecExpandKey192:
+
+ linux_setup
+ push rcx
+ push rdx
+ sub rsp,16+8
+
+ call iEncExpandKey192
+
+ add rsp,16+8
+ pop rdx
+ pop rcx
+
+
+ inversekey [rdx + 1*16]
+ inversekey [rdx + 2*16]
+ inversekey [rdx + 3*16]
+ inversekey [rdx + 4*16]
+ inversekey [rdx + 5*16]
+ inversekey [rdx + 6*16]
+ inversekey [rdx + 7*16]
+ inversekey [rdx + 8*16]
+ inversekey [rdx + 9*16]
+ inversekey [rdx + 10*16]
+ inversekey [rdx + 11*16]
+
+ ret
+
+
+
+align 16
+global iDecExpandKey256
+iDecExpandKey256:
+
+ linux_setup
+ push rcx
+ push rdx
+ sub rsp,16+8
+
+ call iEncExpandKey256
+
+ add rsp,16+8
+ pop rdx
+ pop rcx
+
+ inversekey [rdx + 1*16]
+ inversekey [rdx + 2*16]
+ inversekey [rdx + 3*16]
+ inversekey [rdx + 4*16]
+ inversekey [rdx + 5*16]
+ inversekey [rdx + 6*16]
+ inversekey [rdx + 7*16]
+ inversekey [rdx + 8*16]
+ inversekey [rdx + 9*16]
+ inversekey [rdx + 10*16]
+ inversekey [rdx + 11*16]
+ inversekey [rdx + 12*16]
+ inversekey [rdx + 13*16]
+
+ ret
+
+
+
+
+align 16
+global iEncExpandKey256
+iEncExpandKey256:
+
+ linux_setup
+
+ movdqu xmm1, [rcx] ; loading the key
+ movdqu xmm3, [rcx+16]
+ movdqu [rdx], xmm1 ; Storing key in memory where all key schedule will be stored
+ movdqu [rdx+16], xmm3
+
+ add rdx,32
+
+ movdqa xmm5, [shuffle_mask wrt rip] ; this mask is used by key_expansion
+
+ aeskeygenassist xmm2, xmm3, 0x1 ;
+ call key_expansion256
+ aeskeygenassist xmm2, xmm3, 0x2 ;
+ call key_expansion256
+ aeskeygenassist xmm2, xmm3, 0x4 ;
+ call key_expansion256
+ aeskeygenassist xmm2, xmm3, 0x8 ;
+ call key_expansion256
+ aeskeygenassist xmm2, xmm3, 0x10 ;
+ call key_expansion256
+ aeskeygenassist xmm2, xmm3, 0x20 ;
+ call key_expansion256
+ aeskeygenassist xmm2, xmm3, 0x40 ;
+; call key_expansion256
+
+ pshufd xmm2, xmm2, 011111111b
+
+ movdqa xmm4, xmm1
+ pshufb xmm4, xmm5
+ pxor xmm1, xmm4
+ pshufb xmm4, xmm5
+ pxor xmm1, xmm4
+ pshufb xmm4, xmm5
+ pxor xmm1, xmm4
+ pxor xmm1, xmm2
+
+ movdqu [rdx], xmm1
+
+
+ ret
+
+
+
+
+
+
+align 16
+global iDec128
+iDec128:
+
+ linux_setup
+ sub rsp,16*16+8
+
+
+ mov eax,[rcx+32] ; numblocks
+ mov rdx,[rcx]
+ mov r8,[rcx+8]
+ mov rcx,[rcx+16]
+
+ sub r8,rdx
+
+ test eax,eax
+ jz end_dec128
+
+ cmp eax,4
+ jl lp128decsingle
+
+ test rcx,0xf
+ jz lp128decfour
+
+ copy_round_keys rsp,rcx,0
+ copy_round_keys rsp,rcx,1
+ copy_round_keys rsp,rcx,2
+ copy_round_keys rsp,rcx,3
+ copy_round_keys rsp,rcx,4
+ copy_round_keys rsp,rcx,5
+ copy_round_keys rsp,rcx,6
+ copy_round_keys rsp,rcx,7
+ copy_round_keys rsp,rcx,8
+ copy_round_keys rsp,rcx,9
+ copy_round_keys rsp,rcx,10
+ mov rcx,rsp
+
+
+
+align 16
+lp128decfour:
+
+ test eax,eax
+ jz end_dec128
+
+ cmp eax,4
+ jl lp128decsingle
+
+ load_and_xor4 rdx, [rcx+10*16]
+ add rdx,16*4
+ aesdec4 [rcx+9*16]
+ aesdec4 [rcx+8*16]
+ aesdec4 [rcx+7*16]
+ aesdec4 [rcx+6*16]
+ aesdec4 [rcx+5*16]
+ aesdec4 [rcx+4*16]
+ aesdec4 [rcx+3*16]
+ aesdec4 [rcx+2*16]
+ aesdec4 [rcx+1*16]
+ aesdeclast4 [rcx+0*16]
+
+ sub eax,4
+ store4 r8+rdx-(16*4)
+ jmp lp128decfour
+
+
+ align 16
+lp128decsingle:
+
+ movdqu xmm0, [rdx]
+ movdqu xmm4,[rcx+10*16]
+ pxor xmm0, xmm4
+ aesdec1_u [rcx+9*16]
+ aesdec1_u [rcx+8*16]
+ aesdec1_u [rcx+7*16]
+ aesdec1_u [rcx+6*16]
+ aesdec1_u [rcx+5*16]
+ aesdec1_u [rcx+4*16]
+ aesdec1_u [rcx+3*16]
+ aesdec1_u [rcx+2*16]
+ aesdec1_u [rcx+1*16]
+ aesdeclast1_u [rcx+0*16]
+
+ add rdx, 16
+ movdqu [r8 + rdx - 16], xmm0
+ dec eax
+ jnz lp128decsingle
+
+end_dec128:
+
+ add rsp,16*16+8
+ ret
+
+
+align 16
+global iDec128_CBC
+iDec128_CBC:
+
+ linux_setup
+ sub rsp,16*16+8
+
+ mov r9,rcx
+ mov rax,[rcx+24]
+ movdqu xmm5,[rax]
+
+ mov eax,[rcx+32] ; numblocks
+ mov rdx,[rcx]
+ mov r8,[rcx+8]
+ mov rcx,[rcx+16]
+
+
+ sub r8,rdx
+
+
+ test eax,eax
+ jz end_dec128_CBC
+
+ cmp eax,4
+ jl lp128decsingle_CBC
+
+ test rcx,0xf
+ jz lp128decfour_CBC
+
+ copy_round_keys rsp,rcx,0
+ copy_round_keys rsp,rcx,1
+ copy_round_keys rsp,rcx,2
+ copy_round_keys rsp,rcx,3
+ copy_round_keys rsp,rcx,4
+ copy_round_keys rsp,rcx,5
+ copy_round_keys rsp,rcx,6
+ copy_round_keys rsp,rcx,7
+ copy_round_keys rsp,rcx,8
+ copy_round_keys rsp,rcx,9
+ copy_round_keys rsp,rcx,10
+ mov rcx,rsp
+
+
+align 16
+lp128decfour_CBC:
+
+ test eax,eax
+ jz end_dec128_CBC
+
+ cmp eax,4
+ jl lp128decsingle_CBC
+
+ load_and_xor4 rdx, [rcx+10*16]
+ add rdx,16*4
+ aesdec4 [rcx+9*16]
+ aesdec4 [rcx+8*16]
+ aesdec4 [rcx+7*16]
+ aesdec4 [rcx+6*16]
+ aesdec4 [rcx+5*16]
+ aesdec4 [rcx+4*16]
+ aesdec4 [rcx+3*16]
+ aesdec4 [rcx+2*16]
+ aesdec4 [rcx+1*16]
+ aesdeclast4 [rcx+0*16]
+
+ pxor xmm0,xmm5
+ movdqu xmm4,[rdx - 16*4 + 0*16]
+ pxor xmm1,xmm4
+ movdqu xmm4,[rdx - 16*4 + 1*16]
+ pxor xmm2,xmm4
+ movdqu xmm4,[rdx - 16*4 + 2*16]
+ pxor xmm3,xmm4
+ movdqu xmm5,[rdx - 16*4 + 3*16]
+
+ sub eax,4
+ store4 r8+rdx-(16*4)
+ jmp lp128decfour_CBC
+
+
+ align 16
+lp128decsingle_CBC:
+
+ movdqu xmm0, [rdx]
+ movdqa xmm1,xmm0
+ movdqu xmm4,[rcx+10*16]
+ pxor xmm0, xmm4
+ aesdec1_u [rcx+9*16]
+ aesdec1_u [rcx+8*16]
+ aesdec1_u [rcx+7*16]
+ aesdec1_u [rcx+6*16]
+ aesdec1_u [rcx+5*16]
+ aesdec1_u [rcx+4*16]
+ aesdec1_u [rcx+3*16]
+ aesdec1_u [rcx+2*16]
+ aesdec1_u [rcx+1*16]
+ aesdeclast1_u [rcx+0*16]
+
+ pxor xmm0,xmm5
+ movdqa xmm5,xmm1
+ add rdx, 16
+ movdqu [r8 + rdx - 16], xmm0
+ dec eax
+ jnz lp128decsingle_CBC
+
+end_dec128_CBC:
+
+ mov r9,[r9+24]
+ movdqu [r9],xmm5
+ add rsp,16*16+8
+ ret
+
+
+align 16
+global iDec192_CBC
+iDec192_CBC:
+
+ linux_setup
+ sub rsp,16*16+8
+
+ mov r9,rcx
+ mov rax,[rcx+24]
+ movdqu xmm5,[rax]
+
+ mov eax,[rcx+32] ; numblocks
+ mov rdx,[rcx]
+ mov r8,[rcx+8]
+ mov rcx,[rcx+16]
+
+
+ sub r8,rdx
+
+ test eax,eax
+ jz end_dec192_CBC
+
+ cmp eax,4
+ jl lp192decsingle_CBC
+
+ test rcx,0xf
+ jz lp192decfour_CBC
+
+ copy_round_keys rsp,rcx,0
+ copy_round_keys rsp,rcx,1
+ copy_round_keys rsp,rcx,2
+ copy_round_keys rsp,rcx,3
+ copy_round_keys rsp,rcx,4
+ copy_round_keys rsp,rcx,5
+ copy_round_keys rsp,rcx,6
+ copy_round_keys rsp,rcx,7
+ copy_round_keys rsp,rcx,8
+ copy_round_keys rsp,rcx,9
+ copy_round_keys rsp,rcx,10
+ copy_round_keys rsp,rcx,11
+ copy_round_keys rsp,rcx,12
+ mov rcx,rsp
+
+
+align 16
+lp192decfour_CBC:
+
+ test eax,eax
+ jz end_dec192_CBC
+
+ cmp eax,4
+ jl lp192decsingle_CBC
+
+ load_and_xor4 rdx, [rcx+12*16]
+ add rdx,16*4
+ aesdec4 [rcx+11*16]
+ aesdec4 [rcx+10*16]
+ aesdec4 [rcx+9*16]
+ aesdec4 [rcx+8*16]
+ aesdec4 [rcx+7*16]
+ aesdec4 [rcx+6*16]
+ aesdec4 [rcx+5*16]
+ aesdec4 [rcx+4*16]
+ aesdec4 [rcx+3*16]
+ aesdec4 [rcx+2*16]
+ aesdec4 [rcx+1*16]
+ aesdeclast4 [rcx+0*16]
+
+ pxor xmm0,xmm5
+ movdqu xmm4,[rdx - 16*4 + 0*16]
+ pxor xmm1,xmm4
+ movdqu xmm4,[rdx - 16*4 + 1*16]
+ pxor xmm2,xmm4
+ movdqu xmm4,[rdx - 16*4 + 2*16]
+ pxor xmm3,xmm4
+ movdqu xmm5,[rdx - 16*4 + 3*16]
+
+ sub eax,4
+ store4 r8+rdx-(16*4)
+ jmp lp192decfour_CBC
+
+
+ align 16
+lp192decsingle_CBC:
+
+ movdqu xmm0, [rdx]
+ movdqu xmm4,[rcx+12*16]
+ movdqa xmm1,xmm0
+ pxor xmm0, xmm4
+ aesdec1_u [rcx+11*16]
+ aesdec1_u [rcx+10*16]
+ aesdec1_u [rcx+9*16]
+ aesdec1_u [rcx+8*16]
+ aesdec1_u [rcx+7*16]
+ aesdec1_u [rcx+6*16]
+ aesdec1_u [rcx+5*16]
+ aesdec1_u [rcx+4*16]
+ aesdec1_u [rcx+3*16]
+ aesdec1_u [rcx+2*16]
+ aesdec1_u [rcx+1*16]
+ aesdeclast1_u [rcx+0*16]
+
+ pxor xmm0,xmm5
+ movdqa xmm5,xmm1
+ add rdx, 16
+ movdqu [r8 + rdx - 16], xmm0
+ dec eax
+ jnz lp192decsingle_CBC
+
+end_dec192_CBC:
+
+ mov r9,[r9+24]
+ movdqu [r9],xmm5
+ add rsp,16*16+8
+ ret
+
+
+
+
+align 16
+global iDec256_CBC
+iDec256_CBC:
+
+ linux_setup
+ sub rsp,16*16+8
+
+ mov r9,rcx
+ mov rax,[rcx+24]
+ movdqu xmm5,[rax]
+
+ mov eax,[rcx+32] ; numblocks
+ mov rdx,[rcx]
+ mov r8,[rcx+8]
+ mov rcx,[rcx+16]
+
+
+ sub r8,rdx
+
+ test eax,eax
+ jz end_dec256_CBC
+
+ cmp eax,4
+ jl lp256decsingle_CBC
+
+ test rcx,0xf
+ jz lp256decfour_CBC
+
+ copy_round_keys rsp,rcx,0
+ copy_round_keys rsp,rcx,1
+ copy_round_keys rsp,rcx,2
+ copy_round_keys rsp,rcx,3
+ copy_round_keys rsp,rcx,4
+ copy_round_keys rsp,rcx,5
+ copy_round_keys rsp,rcx,6
+ copy_round_keys rsp,rcx,7
+ copy_round_keys rsp,rcx,8
+ copy_round_keys rsp,rcx,9
+ copy_round_keys rsp,rcx,10
+ copy_round_keys rsp,rcx,11
+ copy_round_keys rsp,rcx,12
+ copy_round_keys rsp,rcx,13
+ copy_round_keys rsp,rcx,14
+ mov rcx,rsp
+
+align 16
+lp256decfour_CBC:
+
+ test eax,eax
+ jz end_dec256_CBC
+
+ cmp eax,4
+ jl lp256decsingle_CBC
+
+ load_and_xor4 rdx, [rcx+14*16]
+ add rdx,16*4
+ aesdec4 [rcx+13*16]
+ aesdec4 [rcx+12*16]
+ aesdec4 [rcx+11*16]
+ aesdec4 [rcx+10*16]
+ aesdec4 [rcx+9*16]
+ aesdec4 [rcx+8*16]
+ aesdec4 [rcx+7*16]
+ aesdec4 [rcx+6*16]
+ aesdec4 [rcx+5*16]
+ aesdec4 [rcx+4*16]
+ aesdec4 [rcx+3*16]
+ aesdec4 [rcx+2*16]
+ aesdec4 [rcx+1*16]
+ aesdeclast4 [rcx+0*16]
+
+ pxor xmm0,xmm5
+ movdqu xmm4,[rdx - 16*4 + 0*16]
+ pxor xmm1,xmm4
+ movdqu xmm4,[rdx - 16*4 + 1*16]
+ pxor xmm2,xmm4
+ movdqu xmm4,[rdx - 16*4 + 2*16]
+ pxor xmm3,xmm4
+ movdqu xmm5,[rdx - 16*4 + 3*16]
+
+ sub eax,4
+ store4 r8+rdx-(16*4)
+ jmp lp256decfour_CBC
+
+
+ align 16
+lp256decsingle_CBC:
+
+ movdqu xmm0, [rdx]
+ movdqu xmm4,[rcx+14*16]
+ movdqa xmm1,xmm0
+ pxor xmm0, xmm4
+ aesdec1_u [rcx+13*16]
+ aesdec1_u [rcx+12*16]
+ aesdec1_u [rcx+11*16]
+ aesdec1_u [rcx+10*16]
+ aesdec1_u [rcx+9*16]
+ aesdec1_u [rcx+8*16]
+ aesdec1_u [rcx+7*16]
+ aesdec1_u [rcx+6*16]
+ aesdec1_u [rcx+5*16]
+ aesdec1_u [rcx+4*16]
+ aesdec1_u [rcx+3*16]
+ aesdec1_u [rcx+2*16]
+ aesdec1_u [rcx+1*16]
+ aesdeclast1_u [rcx+0*16]
+
+ pxor xmm0,xmm5
+ movdqa xmm5,xmm1
+ add rdx, 16
+ movdqu [r8 + rdx - 16], xmm0
+ dec eax
+ jnz lp256decsingle_CBC
+
+end_dec256_CBC:
+
+ mov r9,[r9+24]
+ movdqu [r9],xmm5
+ add rsp,16*16+8
+ ret
+
+
+
+
+
+align 16
+global iDec192
+iDec192:
+
+ linux_setup
+ sub rsp,16*16+8
+
+ mov eax,[rcx+32] ; numblocks
+ mov rdx,[rcx]
+ mov r8,[rcx+8]
+ mov rcx,[rcx+16]
+
+ sub r8,rdx
+
+ test eax,eax
+ jz end_dec192
+
+ cmp eax,4
+ jl lp192decsingle
+
+ test rcx,0xf
+ jz lp192decfour
+
+ copy_round_keys rsp,rcx,0
+ copy_round_keys rsp,rcx,1
+ copy_round_keys rsp,rcx,2
+ copy_round_keys rsp,rcx,3
+ copy_round_keys rsp,rcx,4
+ copy_round_keys rsp,rcx,5
+ copy_round_keys rsp,rcx,6
+ copy_round_keys rsp,rcx,7
+ copy_round_keys rsp,rcx,8
+ copy_round_keys rsp,rcx,9
+ copy_round_keys rsp,rcx,10
+ copy_round_keys rsp,rcx,11
+ copy_round_keys rsp,rcx,12
+ mov rcx,rsp
+
+align 16
+lp192decfour:
+
+ test eax,eax
+ jz end_dec192
+
+ cmp eax,4
+ jl lp192decsingle
+
+ load_and_xor4 rdx, [rcx+12*16]
+ add rdx,16*4
+ aesdec4 [rcx+11*16]
+ aesdec4 [rcx+10*16]
+ aesdec4 [rcx+9*16]
+ aesdec4 [rcx+8*16]
+ aesdec4 [rcx+7*16]
+ aesdec4 [rcx+6*16]
+ aesdec4 [rcx+5*16]
+ aesdec4 [rcx+4*16]
+ aesdec4 [rcx+3*16]
+ aesdec4 [rcx+2*16]
+ aesdec4 [rcx+1*16]
+ aesdeclast4 [rcx+0*16]
+
+ sub eax,4
+ store4 r8+rdx-(16*4)
+ jmp lp192decfour
+
+
+ align 16
+lp192decsingle:
+
+ movdqu xmm0, [rdx]
+ movdqu xmm4,[rcx+12*16]
+ pxor xmm0, xmm4
+ aesdec1_u [rcx+11*16]
+ aesdec1_u [rcx+10*16]
+ aesdec1_u [rcx+9*16]
+ aesdec1_u [rcx+8*16]
+ aesdec1_u [rcx+7*16]
+ aesdec1_u [rcx+6*16]
+ aesdec1_u [rcx+5*16]
+ aesdec1_u [rcx+4*16]
+ aesdec1_u [rcx+3*16]
+ aesdec1_u [rcx+2*16]
+ aesdec1_u [rcx+1*16]
+ aesdeclast1_u [rcx+0*16]
+
+ add rdx, 16
+ movdqu [r8 + rdx - 16], xmm0
+ dec eax
+ jnz lp192decsingle
+
+end_dec192:
+
+ add rsp,16*16+8
+ ret
+
+
+
+
+align 16
+global iDec256
+iDec256:
+
+ linux_setup
+ sub rsp,16*16+8
+
+ mov eax,[rcx+32] ; numblocks
+ mov rdx,[rcx]
+ mov r8,[rcx+8]
+ mov rcx,[rcx+16]
+
+ sub r8,rdx
+
+
+ test eax,eax
+ jz end_dec256
+
+ cmp eax,4
+ jl lp256dec
+
+ test rcx,0xf
+ jz lp256dec4
+
+ copy_round_keys rsp,rcx,0
+ copy_round_keys rsp,rcx,1
+ copy_round_keys rsp,rcx,2
+ copy_round_keys rsp,rcx,3
+ copy_round_keys rsp,rcx,4
+ copy_round_keys rsp,rcx,5
+ copy_round_keys rsp,rcx,6
+ copy_round_keys rsp,rcx,7
+ copy_round_keys rsp,rcx,8
+ copy_round_keys rsp,rcx,9
+ copy_round_keys rsp,rcx,10
+ copy_round_keys rsp,rcx,11
+ copy_round_keys rsp,rcx,12
+ copy_round_keys rsp,rcx,13
+ copy_round_keys rsp,rcx,14
+ mov rcx,rsp
+
+
+ align 16
+lp256dec4:
+ test eax,eax
+ jz end_dec256
+
+ cmp eax,4
+ jl lp256dec
+
+ load_and_xor4 rdx,[rcx+14*16]
+ add rdx, 4*16
+ aesdec4 [rcx+13*16]
+ aesdec4 [rcx+12*16]
+ aesdec4 [rcx+11*16]
+ aesdec4 [rcx+10*16]
+ aesdec4 [rcx+9*16]
+ aesdec4 [rcx+8*16]
+ aesdec4 [rcx+7*16]
+ aesdec4 [rcx+6*16]
+ aesdec4 [rcx+5*16]
+ aesdec4 [rcx+4*16]
+ aesdec4 [rcx+3*16]
+ aesdec4 [rcx+2*16]
+ aesdec4 [rcx+1*16]
+ aesdeclast4 [rcx+0*16]
+
+ store4 r8+rdx-16*4
+ sub eax,4
+ jmp lp256dec4
+
+ align 16
+lp256dec:
+
+ movdqu xmm0, [rdx]
+ movdqu xmm4,[rcx+14*16]
+ add rdx, 16
+ pxor xmm0, xmm4 ; Round 0 (only xor)
+ aesdec1_u [rcx+13*16]
+ aesdec1_u [rcx+12*16]
+ aesdec1_u [rcx+11*16]
+ aesdec1_u [rcx+10*16]
+ aesdec1_u [rcx+9*16]
+ aesdec1_u [rcx+8*16]
+ aesdec1_u [rcx+7*16]
+ aesdec1_u [rcx+6*16]
+ aesdec1_u [rcx+5*16]
+ aesdec1_u [rcx+4*16]
+ aesdec1_u [rcx+3*16]
+ aesdec1_u [rcx+2*16]
+ aesdec1_u [rcx+1*16]
+ aesdeclast1_u [rcx+0*16]
+
+	; Store output decrypted data into PLAINTEXT array
+ movdqu [r8+rdx-16], xmm0
+ dec eax
+ jnz lp256dec
+
+end_dec256:
+
+ add rsp,16*16+8
+ ret
+
+
+
+
+
+
+align 16
+global iEnc128
+iEnc128:
+
+ linux_setup
+ sub rsp,16*16+8
+
+ mov eax,[rcx+32] ; numblocks
+ mov rdx,[rcx]
+ mov r8,[rcx+8]
+ mov rcx,[rcx+16]
+
+ sub r8,rdx
+
+
+ test eax,eax
+ jz end_enc128
+
+ cmp eax,4
+ jl lp128encsingle
+
+ test rcx,0xf
+ jz lpenc128four
+
+ copy_round_keys rsp,rcx,0
+ copy_round_keys rsp,rcx,1
+ copy_round_keys rsp,rcx,2
+ copy_round_keys rsp,rcx,3
+ copy_round_keys rsp,rcx,4
+ copy_round_keys rsp,rcx,5
+ copy_round_keys rsp,rcx,6
+ copy_round_keys rsp,rcx,7
+ copy_round_keys rsp,rcx,8
+ copy_round_keys rsp,rcx,9
+ copy_round_keys rsp,rcx,10
+ mov rcx,rsp
+
+
+ align 16
+
+lpenc128four:
+
+ test eax,eax
+ jz end_enc128
+
+ cmp eax,4
+ jl lp128encsingle
+
+ load_and_xor4 rdx,[rcx+0*16]
+ add rdx,4*16
+ aesenc4 [rcx+1*16]
+ aesenc4 [rcx+2*16]
+ aesenc4 [rcx+3*16]
+ aesenc4 [rcx+4*16]
+ aesenc4 [rcx+5*16]
+ aesenc4 [rcx+6*16]
+ aesenc4 [rcx+7*16]
+ aesenc4 [rcx+8*16]
+ aesenc4 [rcx+9*16]
+ aesenclast4 [rcx+10*16]
+
+ store4 r8+rdx-16*4
+ sub eax,4
+ jmp lpenc128four
+
+ align 16
+lp128encsingle:
+
+ movdqu xmm0, [rdx]
+ movdqu xmm4,[rcx+0*16]
+ add rdx, 16
+ pxor xmm0, xmm4
+ aesenc1_u [rcx+1*16]
+ aesenc1_u [rcx+2*16]
+ aesenc1_u [rcx+3*16]
+ aesenc1_u [rcx+4*16]
+ aesenc1_u [rcx+5*16]
+ aesenc1_u [rcx+6*16]
+ aesenc1_u [rcx+7*16]
+ aesenc1_u [rcx+8*16]
+ aesenc1_u [rcx+9*16]
+ aesenclast1_u [rcx+10*16]
+
+ ; Store output encrypted data into CIPHERTEXT array
+ movdqu [r8+rdx-16], xmm0
+ dec eax
+ jnz lp128encsingle
+
+end_enc128:
+
+ add rsp,16*16+8
+ ret
+
+
+align 16
+global iEnc128_CTR
+iEnc128_CTR:
+
+ linux_setup
+
+ mov r9,rcx
+ mov rax,[rcx+24]
+ movdqu xmm5,[rax]
+
+
+ sub rsp,16*16+8+16
+
+ movdqa [rsp+16*16], xmm6
+ movdqa xmm6, [byte_swap_16 wrt rip]
+ pshufb xmm5, xmm6 ; byte swap counter
+
+ mov eax,[rcx+32] ; numblocks
+ mov rdx,[rcx]
+ mov r8,[rcx+8]
+ mov rcx,[rcx+16]
+
+ sub r8,rdx
+
+ test eax,eax
+ jz end_encctr128
+
+ cmp eax,4
+ jl lp128encctrsingle
+
+ test rcx,0xf
+ jz lpencctr128four
+
+ copy_round_keys rsp,rcx,0
+ copy_round_keys rsp,rcx,1
+ copy_round_keys rsp,rcx,2
+ copy_round_keys rsp,rcx,3
+ copy_round_keys rsp,rcx,4
+ copy_round_keys rsp,rcx,5
+ copy_round_keys rsp,rcx,6
+ copy_round_keys rsp,rcx,7
+ copy_round_keys rsp,rcx,8
+ copy_round_keys rsp,rcx,9
+ copy_round_keys rsp,rcx,10
+ mov rcx,rsp
+
+
+ align 16
+
+lpencctr128four:
+
+ test eax,eax
+ jz end_encctr128
+
+ cmp eax,4
+ jl lp128encctrsingle
+
+ load_and_inc4 [rcx+0*16]
+ add rdx,4*16
+ aesenc4 [rcx+1*16]
+ aesenc4 [rcx+2*16]
+ aesenc4 [rcx+3*16]
+ aesenc4 [rcx+4*16]
+ aesenc4 [rcx+5*16]
+ aesenc4 [rcx+6*16]
+ aesenc4 [rcx+7*16]
+ aesenc4 [rcx+8*16]
+ aesenc4 [rcx+9*16]
+ aesenclast4 [rcx+10*16]
+ xor_with_input4 rdx-(4*16)
+
+ store4 r8+rdx-16*4
+ sub eax,4
+ jmp lpencctr128four
+
+ align 16
+lp128encctrsingle:
+
+ movdqa xmm0,xmm5
+ pshufb xmm0, xmm6 ; byte swap counter back
+ paddd xmm5,[counter_add_one wrt rip]
+ add rdx, 16
+ movdqu xmm4,[rcx+0*16]
+ pxor xmm0, xmm4
+ aesenc1_u [rcx+1*16]
+ aesenc1_u [rcx+2*16]
+ aesenc1_u [rcx+3*16]
+ aesenc1_u [rcx+4*16]
+ aesenc1_u [rcx+5*16]
+ aesenc1_u [rcx+6*16]
+ aesenc1_u [rcx+7*16]
+ aesenc1_u [rcx+8*16]
+ aesenc1_u [rcx+9*16]
+ aesenclast1_u [rcx+10*16]
+ movdqu xmm4, [rdx-16]
+ pxor xmm0,xmm4
+
+ ; Store output encrypted data into CIPHERTEXT array
+ movdqu [r8+rdx-16], xmm0
+ dec eax
+ jnz lp128encctrsingle
+
+end_encctr128:
+
+ mov r9,[r9+24]
+
+ pshufb xmm5, xmm6 ; byte swap counter
+ movdqu [r9],xmm5
+ movdqa xmm6, [rsp+16*16]
+ add rsp,16*16+8+16
+ ret
+
+
+
+align 16
+global iEnc192_CTR
+iEnc192_CTR:
+
+ linux_setup
+
+ mov r9,rcx
+ mov rax,[rcx+24]
+ movdqu xmm5,[rax]
+
+
+ sub rsp,16*16+8+16
+
+ movdqa [rsp+16*16], xmm6
+ movdqa xmm6, [byte_swap_16 wrt rip]
+ pshufb xmm5, xmm6 ; byte swap counter
+
+ mov eax,[rcx+32] ; numblocks
+ mov rdx,[rcx]
+ mov r8,[rcx+8]
+ mov rcx,[rcx+16]
+
+ sub r8,rdx
+
+
+ test eax,eax
+ jz end_encctr192
+
+ cmp eax,4
+ jl lp192encctrsingle
+
+ test rcx,0xf
+ jz lpencctr192four
+
+ copy_round_keys rsp,rcx,0
+ copy_round_keys rsp,rcx,1
+ copy_round_keys rsp,rcx,2
+ copy_round_keys rsp,rcx,3
+ copy_round_keys rsp,rcx,4
+ copy_round_keys rsp,rcx,5
+ copy_round_keys rsp,rcx,6
+ copy_round_keys rsp,rcx,7
+ copy_round_keys rsp,rcx,8
+ copy_round_keys rsp,rcx,9
+ copy_round_keys rsp,rcx,10
+ copy_round_keys rsp,rcx,11
+ copy_round_keys rsp,rcx,12
+ mov rcx,rsp
+
+
+ align 16
+
+lpencctr192four:
+
+ test eax,eax
+ jz end_encctr192
+
+ cmp eax,4
+ jl lp192encctrsingle
+
+ load_and_inc4 [rcx+0*16]
+ add rdx,4*16
+ aesenc4 [rcx+1*16]
+ aesenc4 [rcx+2*16]
+ aesenc4 [rcx+3*16]
+ aesenc4 [rcx+4*16]
+ aesenc4 [rcx+5*16]
+ aesenc4 [rcx+6*16]
+ aesenc4 [rcx+7*16]
+ aesenc4 [rcx+8*16]
+ aesenc4 [rcx+9*16]
+ aesenc4 [rcx+10*16]
+ aesenc4 [rcx+11*16]
+ aesenclast4 [rcx+12*16]
+ xor_with_input4 rdx-(4*16)
+
+ store4 r8+rdx-16*4
+ sub eax,4
+ jmp lpencctr192four
+
+ align 16
+lp192encctrsingle:
+
+ movdqa xmm0,xmm5
+ pshufb xmm0, xmm6 ; byte swap counter back
+ movdqu xmm4,[rcx+0*16]
+ paddd xmm5,[counter_add_one wrt rip]
+ add rdx, 16
+ pxor xmm0, xmm4
+ aesenc1_u [rcx+1*16]
+ aesenc1_u [rcx+2*16]
+ aesenc1_u [rcx+3*16]
+ aesenc1_u [rcx+4*16]
+ aesenc1_u [rcx+5*16]
+ aesenc1_u [rcx+6*16]
+ aesenc1_u [rcx+7*16]
+ aesenc1_u [rcx+8*16]
+ aesenc1_u [rcx+9*16]
+ aesenc1_u [rcx+10*16]
+ aesenc1_u [rcx+11*16]
+ aesenclast1_u [rcx+12*16]
+ movdqu xmm4, [rdx-16]
+ pxor xmm0,xmm4
+
+ ; Store output encrypted data into CIPHERTEXT array
+ movdqu [r8+rdx-16], xmm0
+ dec eax
+ jnz lp192encctrsingle
+
+end_encctr192:
+
+ mov r9,[r9+24]
+ pshufb xmm5, xmm6 ; byte swap counter
+ movdqu [r9],xmm5
+ movdqa xmm6, [rsp+16*16]
+ add rsp,16*16+8+16
+ ret
+
+
+align 16
+global iEnc256_CTR
+iEnc256_CTR:
+
+ linux_setup
+
+ mov r9,rcx
+ mov rax,[rcx+24]
+ movdqu xmm5,[rax]
+
+
+ sub rsp,16*16+8+16
+
+ movdqa [rsp+16*16], xmm6
+ movdqa xmm6, [byte_swap_16 wrt rip]
+ pshufb xmm5, xmm6 ; byte swap counter
+
+ mov eax,[rcx+32] ; numblocks
+ mov rdx,[rcx]
+ mov r8,[rcx+8]
+ mov rcx,[rcx+16]
+
+ sub r8,rdx
+
+
+ test eax,eax
+ jz end_encctr256
+
+ cmp eax,4
+ jl lp256encctrsingle
+
+ test rcx,0xf
+ jz lpencctr256four
+
+ copy_round_keys rsp,rcx,0
+ copy_round_keys rsp,rcx,1
+ copy_round_keys rsp,rcx,2
+ copy_round_keys rsp,rcx,3
+ copy_round_keys rsp,rcx,4
+ copy_round_keys rsp,rcx,5
+ copy_round_keys rsp,rcx,6
+ copy_round_keys rsp,rcx,7
+ copy_round_keys rsp,rcx,8
+ copy_round_keys rsp,rcx,9
+ copy_round_keys rsp,rcx,10
+ copy_round_keys rsp,rcx,11
+ copy_round_keys rsp,rcx,12
+ copy_round_keys rsp,rcx,13
+ copy_round_keys rsp,rcx,14
+ mov rcx,rsp
+
+
+ align 16
+
+lpencctr256four:
+
+ test eax,eax
+ jz end_encctr256
+
+ cmp eax,4
+ jl lp256encctrsingle
+
+ load_and_inc4 [rcx+0*16]
+ add rdx,4*16
+ aesenc4 [rcx+1*16]
+ aesenc4 [rcx+2*16]
+ aesenc4 [rcx+3*16]
+ aesenc4 [rcx+4*16]
+ aesenc4 [rcx+5*16]
+ aesenc4 [rcx+6*16]
+ aesenc4 [rcx+7*16]
+ aesenc4 [rcx+8*16]
+ aesenc4 [rcx+9*16]
+ aesenc4 [rcx+10*16]
+ aesenc4 [rcx+11*16]
+ aesenc4 [rcx+12*16]
+ aesenc4 [rcx+13*16]
+ aesenclast4 [rcx+14*16]
+ xor_with_input4 rdx-(4*16)
+
+ store4 r8+rdx-16*4
+ sub eax,4
+ jmp lpencctr256four
+
+ align 16
+lp256encctrsingle:
+
+ movdqa xmm0,xmm5
+ pshufb xmm0, xmm6 ; byte swap counter back
+ movdqu xmm4,[rcx+0*16]
+ paddd xmm5,[counter_add_one wrt rip]
+ add rdx, 16
+ pxor xmm0, xmm4
+ aesenc1_u [rcx+1*16]
+ aesenc1_u [rcx+2*16]
+ aesenc1_u [rcx+3*16]
+ aesenc1_u [rcx+4*16]
+ aesenc1_u [rcx+5*16]
+ aesenc1_u [rcx+6*16]
+ aesenc1_u [rcx+7*16]
+ aesenc1_u [rcx+8*16]
+ aesenc1_u [rcx+9*16]
+ aesenc1_u [rcx+10*16]
+ aesenc1_u [rcx+11*16]
+ aesenc1_u [rcx+12*16]
+ aesenc1_u [rcx+13*16]
+ aesenclast1_u [rcx+14*16]
+ movdqu xmm4, [rdx-16]
+ pxor xmm0,xmm4
+
+ ; Store output encrypted data into CIPHERTEXT array
+ movdqu [r8+rdx-16], xmm0
+ dec eax
+ jnz lp256encctrsingle
+
+end_encctr256:
+
+ mov r9,[r9+24]
+ pshufb xmm5, xmm6 ; byte swap counter
+ movdqu [r9],xmm5
+ movdqa xmm6, [rsp+16*16]
+ add rsp,16*16+8+16
+ ret
+
+
+
+
+
+
+
+align 16
+global iEnc128_CBC
+iEnc128_CBC:
+
+ linux_setup
+ sub rsp,16*16+8
+
+ mov r9,rcx
+ mov rax,[rcx+24]
+ movdqu xmm1,[rax]
+
+ mov eax,[rcx+32] ; numblocks
+ mov rdx,[rcx]
+ mov r8,[rcx+8]
+ mov rcx,[rcx+16]
+
+ sub r8,rdx
+
+
+ test rcx,0xf
+ jz lp128encsingle_CBC
+
+ copy_round_keys rsp,rcx,0
+ copy_round_keys rsp,rcx,1
+ copy_round_keys rsp,rcx,2
+ copy_round_keys rsp,rcx,3
+ copy_round_keys rsp,rcx,4
+ copy_round_keys rsp,rcx,5
+ copy_round_keys rsp,rcx,6
+ copy_round_keys rsp,rcx,7
+ copy_round_keys rsp,rcx,8
+ copy_round_keys rsp,rcx,9
+ copy_round_keys rsp,rcx,10
+ mov rcx,rsp
+
+
+ align 16
+
+lp128encsingle_CBC:
+
+ movdqu xmm0, [rdx]
+ movdqu xmm4,[rcx+0*16]
+ add rdx, 16
+ pxor xmm0, xmm1
+ pxor xmm0, xmm4
+ aesenc1 [rcx+1*16]
+ aesenc1 [rcx+2*16]
+ aesenc1 [rcx+3*16]
+ aesenc1 [rcx+4*16]
+ aesenc1 [rcx+5*16]
+ aesenc1 [rcx+6*16]
+ aesenc1 [rcx+7*16]
+ aesenc1 [rcx+8*16]
+ aesenc1 [rcx+9*16]
+ aesenclast1 [rcx+10*16]
+ movdqa xmm1,xmm0
+
+ ; Store output encrypted data into CIPHERTEXT array
+ movdqu [r8+rdx-16], xmm0
+ dec eax
+ jnz lp128encsingle_CBC
+
+ mov r9,[r9+24]
+ movdqu [r9],xmm1
+ add rsp,16*16+8
+ ret
+
+
+align 16
+global iEnc192_CBC
+iEnc192_CBC:
+
+ linux_setup
+ sub rsp,16*16+8
+ mov r9,rcx
+ mov rax,[rcx+24]
+ movdqu xmm1,[rax]
+
+ mov eax,[rcx+32] ; numblocks
+ mov rdx,[rcx]
+ mov r8,[rcx+8]
+ mov rcx,[rcx+16]
+
+ sub r8,rdx
+
+ test rcx,0xf
+ jz lp192encsingle_CBC
+
+ copy_round_keys rsp,rcx,0
+ copy_round_keys rsp,rcx,1
+ copy_round_keys rsp,rcx,2
+ copy_round_keys rsp,rcx,3
+ copy_round_keys rsp,rcx,4
+ copy_round_keys rsp,rcx,5
+ copy_round_keys rsp,rcx,6
+ copy_round_keys rsp,rcx,7
+ copy_round_keys rsp,rcx,8
+ copy_round_keys rsp,rcx,9
+ copy_round_keys rsp,rcx,10
+ copy_round_keys rsp,rcx,11
+ copy_round_keys rsp,rcx,12
+ mov rcx,rsp
+
+
+
+ align 16
+
+lp192encsingle_CBC:
+
+ movdqu xmm0, [rdx]
+ movdqu xmm4, [rcx+0*16]
+ add rdx, 16
+ pxor xmm0, xmm1
+ pxor xmm0, xmm4
+ aesenc1 [rcx+1*16]
+ aesenc1 [rcx+2*16]
+ aesenc1 [rcx+3*16]
+ aesenc1 [rcx+4*16]
+ aesenc1 [rcx+5*16]
+ aesenc1 [rcx+6*16]
+ aesenc1 [rcx+7*16]
+ aesenc1 [rcx+8*16]
+ aesenc1 [rcx+9*16]
+ aesenc1 [rcx+10*16]
+ aesenc1 [rcx+11*16]
+ aesenclast1 [rcx+12*16]
+ movdqa xmm1,xmm0
+
+ ; Store output encrypted data into CIPHERTEXT array
+ movdqu [r8+rdx-16], xmm0
+ dec eax
+ jnz lp192encsingle_CBC
+
+ mov r9,[r9+24]
+ movdqu [r9],xmm1
+
+ add rsp,16*16+8
+ ret
+
+
+align 16
+global iEnc256_CBC
+iEnc256_CBC:
+
+ linux_setup
+ sub rsp,16*16+8
+
+ mov r9,rcx
+ mov rax,[rcx+24]
+ movdqu xmm1,[rax]
+
+ mov eax,[rcx+32] ; numblocks
+ mov rdx,[rcx]
+ mov r8,[rcx+8]
+ mov rcx,[rcx+16]
+
+ sub r8,rdx
+
+ test rcx,0xf
+ jz lp256encsingle_CBC
+
+ copy_round_keys rsp,rcx,0
+ copy_round_keys rsp,rcx,1
+ copy_round_keys rsp,rcx,2
+ copy_round_keys rsp,rcx,3
+ copy_round_keys rsp,rcx,4
+ copy_round_keys rsp,rcx,5
+ copy_round_keys rsp,rcx,6
+ copy_round_keys rsp,rcx,7
+ copy_round_keys rsp,rcx,8
+ copy_round_keys rsp,rcx,9
+ copy_round_keys rsp,rcx,10
+ copy_round_keys rsp,rcx,11
+ copy_round_keys rsp,rcx,12
+ copy_round_keys rsp,rcx,13
+ copy_round_keys rsp,rcx,14
+ mov rcx,rsp
+
+ align 16
+
+lp256encsingle_CBC:
+
+ movdqu xmm0, [rdx]
+ movdqu xmm4, [rcx+0*16]
+ add rdx, 16
+ pxor xmm0, xmm1
+ pxor xmm0, xmm4
+ aesenc1 [rcx+1*16]
+ aesenc1 [rcx+2*16]
+ aesenc1 [rcx+3*16]
+ aesenc1 [rcx+4*16]
+ aesenc1 [rcx+5*16]
+ aesenc1 [rcx+6*16]
+ aesenc1 [rcx+7*16]
+ aesenc1 [rcx+8*16]
+ aesenc1 [rcx+9*16]
+ aesenc1 [rcx+10*16]
+ aesenc1 [rcx+11*16]
+ aesenc1 [rcx+12*16]
+ aesenc1 [rcx+13*16]
+ aesenclast1 [rcx+14*16]
+ movdqa xmm1,xmm0
+
+ ; Store output encrypted data into CIPHERTEXT array
+ movdqu [r8+rdx-16], xmm0
+ dec eax
+ jnz lp256encsingle_CBC
+
+ mov r9,[r9+24]
+ movdqu [r9],xmm1
+ add rsp,16*16+8
+ ret
+
+
+
+
+align 16
+global iEnc192
+iEnc192:
+
+ linux_setup
+ sub rsp,16*16+8
+
+ mov eax,[rcx+32] ; numblocks
+ mov rdx,[rcx]
+ mov r8,[rcx+8]
+ mov rcx,[rcx+16]
+
+ sub r8,rdx
+
+ test eax,eax
+ jz end_enc192
+
+ cmp eax,4
+ jl lp192encsingle
+
+ test rcx,0xf
+ jz lpenc192four
+
+ copy_round_keys rsp,rcx,0
+ copy_round_keys rsp,rcx,1
+ copy_round_keys rsp,rcx,2
+ copy_round_keys rsp,rcx,3
+ copy_round_keys rsp,rcx,4
+ copy_round_keys rsp,rcx,5
+ copy_round_keys rsp,rcx,6
+ copy_round_keys rsp,rcx,7
+ copy_round_keys rsp,rcx,8
+ copy_round_keys rsp,rcx,9
+ copy_round_keys rsp,rcx,10
+ copy_round_keys rsp,rcx,11
+ copy_round_keys rsp,rcx,12
+ mov rcx,rsp
+
+
+ align 16
+
+lpenc192four:
+
+ test eax,eax
+ jz end_enc192
+
+ cmp eax,4
+ jl lp192encsingle
+
+ load_and_xor4 rdx,[rcx+0*16]
+ add rdx,4*16
+ aesenc4 [rcx+1*16]
+ aesenc4 [rcx+2*16]
+ aesenc4 [rcx+3*16]
+ aesenc4 [rcx+4*16]
+ aesenc4 [rcx+5*16]
+ aesenc4 [rcx+6*16]
+ aesenc4 [rcx+7*16]
+ aesenc4 [rcx+8*16]
+ aesenc4 [rcx+9*16]
+ aesenc4 [rcx+10*16]
+ aesenc4 [rcx+11*16]
+ aesenclast4 [rcx+12*16]
+
+ store4 r8+rdx-16*4
+ sub eax,4
+ jmp lpenc192four
+
+ align 16
+lp192encsingle:
+
+ movdqu xmm0, [rdx]
+ movdqu xmm4, [rcx+0*16]
+ add rdx, 16
+ pxor xmm0, xmm4
+ aesenc1_u [rcx+1*16]
+ aesenc1_u [rcx+2*16]
+ aesenc1_u [rcx+3*16]
+ aesenc1_u [rcx+4*16]
+ aesenc1_u [rcx+5*16]
+ aesenc1_u [rcx+6*16]
+ aesenc1_u [rcx+7*16]
+ aesenc1_u [rcx+8*16]
+ aesenc1_u [rcx+9*16]
+ aesenc1_u [rcx+10*16]
+ aesenc1_u [rcx+11*16]
+ aesenclast1_u [rcx+12*16]
+
+ ; Store output encrypted data into CIPHERTEXT array
+ movdqu [r8+rdx-16], xmm0
+ dec eax
+ jnz lp192encsingle
+
+end_enc192:
+
+ add rsp,16*16+8
+ ret
+
+
+
+
+
+
+align 16
+global iEnc256
+iEnc256:
+
+ linux_setup
+ sub rsp,16*16+8
+
+ mov eax,[rcx+32] ; numblocks
+ mov rdx,[rcx]
+ mov r8,[rcx+8]
+ mov rcx,[rcx+16]
+
+ sub r8,rdx
+
+
+ test eax,eax
+ jz end_enc256
+
+ cmp eax,4
+ jl lp256enc
+
+ test rcx,0xf
+ jz lp256enc4
+
+ copy_round_keys rsp,rcx,0
+ copy_round_keys rsp,rcx,1
+ copy_round_keys rsp,rcx,2
+ copy_round_keys rsp,rcx,3
+ copy_round_keys rsp,rcx,4
+ copy_round_keys rsp,rcx,5
+ copy_round_keys rsp,rcx,6
+ copy_round_keys rsp,rcx,7
+ copy_round_keys rsp,rcx,8
+ copy_round_keys rsp,rcx,9
+ copy_round_keys rsp,rcx,10
+ copy_round_keys rsp,rcx,11
+ copy_round_keys rsp,rcx,12
+ copy_round_keys rsp,rcx,13
+ copy_round_keys rsp,rcx,14
+ mov rcx,rsp
+
+
+ align 16
+
+lp256enc4:
+ test eax,eax
+ jz end_enc256
+
+ cmp eax,4
+ jl lp256enc
+
+
+ load_and_xor4 rdx,[rcx+0*16]
+ add rdx, 16*4
+ aesenc4 [rcx+1*16]
+ aesenc4 [rcx+2*16]
+ aesenc4 [rcx+3*16]
+ aesenc4 [rcx+4*16]
+ aesenc4 [rcx+5*16]
+ aesenc4 [rcx+6*16]
+ aesenc4 [rcx+7*16]
+ aesenc4 [rcx+8*16]
+ aesenc4 [rcx+9*16]
+ aesenc4 [rcx+10*16]
+ aesenc4 [rcx+11*16]
+ aesenc4 [rcx+12*16]
+ aesenc4 [rcx+13*16]
+ aesenclast4 [rcx+14*16]
+
+ store4 r8+rdx-16*4
+ sub eax,4
+ jmp lp256enc4
+
+ align 16
+lp256enc:
+
+ movdqu xmm0, [rdx]
+ movdqu xmm4, [rcx+0*16]
+ add rdx, 16
+ pxor xmm0, xmm4
+ aesenc1_u [rcx+1*16]
+ aesenc1_u [rcx+2*16]
+ aesenc1_u [rcx+3*16]
+ aesenc1_u [rcx+4*16]
+ aesenc1_u [rcx+5*16]
+ aesenc1_u [rcx+6*16]
+ aesenc1_u [rcx+7*16]
+ aesenc1_u [rcx+8*16]
+ aesenc1_u [rcx+9*16]
+ aesenc1_u [rcx+10*16]
+ aesenc1_u [rcx+11*16]
+ aesenc1_u [rcx+12*16]
+ aesenc1_u [rcx+13*16]
+ aesenclast1_u [rcx+14*16]
+
+ ; Store output encrypted data into CIPHERTEXT array
+ movdqu [r8+rdx-16], xmm0
+ dec eax
+ jnz lp256enc
+
+end_enc256:
+
+ add rsp,16*16+8
+ ret
diff --git a/src/lib/crypto/builtin/aes/iaesx86.s b/src/lib/crypto/builtin/aes/iaesx86.s
new file mode 100644
index 0000000000..c65921b23f
--- /dev/null
+++ b/src/lib/crypto/builtin/aes/iaesx86.s
@@ -0,0 +1,2199 @@
+[bits 32]
+[CPU intelnop]
+
+; Copyright (c) 2010, Intel Corporation
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; * Redistributions of source code must retain the above copyright notice,
+; this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+; * Neither the name of Intel Corporation nor the names of its contributors
+; may be used to endorse or promote products derived from this software
+; without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+; IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+; INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+; BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+; OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+; ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+%macro inversekey 1
+ movdqu xmm1,%1
+ aesimc xmm0,xmm1
+ movdqu %1,xmm0
+%endmacro
+
+
+%macro aesdec4 1
+ movdqa xmm4,%1
+
+ aesdec xmm0,xmm4
+ aesdec xmm1,xmm4
+ aesdec xmm2,xmm4
+ aesdec xmm3,xmm4
+
+%endmacro
+
+
+%macro aesdeclast4 1
+ movdqa xmm4,%1
+
+ aesdeclast xmm0,xmm4
+ aesdeclast xmm1,xmm4
+ aesdeclast xmm2,xmm4
+ aesdeclast xmm3,xmm4
+
+%endmacro
+
+
+%macro aesenc4 1
+ movdqa xmm4,%1
+
+ aesenc xmm0,xmm4
+ aesenc xmm1,xmm4
+ aesenc xmm2,xmm4
+ aesenc xmm3,xmm4
+
+%endmacro
+
+%macro aesenclast4 1
+ movdqa xmm4,%1
+
+ aesenclast xmm0,xmm4
+ aesenclast xmm1,xmm4
+ aesenclast xmm2,xmm4
+ aesenclast xmm3,xmm4
+
+%endmacro
+
+
+%macro aesdeclast1 1
+ aesdeclast xmm0,%1
+%endmacro
+
+%macro aesenclast1 1
+ aesenclast xmm0,%1
+%endmacro
+
+%macro aesdec1 1
+ aesdec xmm0,%1
+%endmacro
+
+;abab
+%macro aesenc1 1
+ aesenc xmm0,%1
+%endmacro
+
+
+%macro aesdeclast1_u 1
+ movdqu xmm4,%1
+ aesdeclast xmm0,xmm4
+%endmacro
+
+%macro aesenclast1_u 1
+ movdqu xmm4,%1
+ aesenclast xmm0,xmm4
+%endmacro
+
+%macro aesdec1_u 1
+ movdqu xmm4,%1
+ aesdec xmm0,xmm4
+%endmacro
+
+%macro aesenc1_u 1
+ movdqu xmm4,%1
+ aesenc xmm0,xmm4
+%endmacro
+
+
+%macro load_and_xor4 2
+ movdqa xmm4,%2
+ movdqu xmm0,[%1 + 0*16]
+ pxor xmm0,xmm4
+ movdqu xmm1,[%1 + 1*16]
+ pxor xmm1,xmm4
+ movdqu xmm2,[%1 + 2*16]
+ pxor xmm2,xmm4
+ movdqu xmm3,[%1 + 3*16]
+ pxor xmm3,xmm4
+%endmacro
+
+
+%macro load_and_inc4 1
+ movdqa xmm4,%1
+ movdqa xmm0,xmm5
+ pshufb xmm0, xmm6 ; byte swap counter back
+ movdqa xmm1,xmm5
+ paddd xmm1,[counter_add_one]
+ pshufb xmm1, xmm6 ; byte swap counter back
+ movdqa xmm2,xmm5
+ paddd xmm2,[counter_add_two]
+ pshufb xmm2, xmm6 ; byte swap counter back
+ movdqa xmm3,xmm5
+ paddd xmm3,[counter_add_three]
+ pshufb xmm3, xmm6 ; byte swap counter back
+ pxor xmm0,xmm4
+ paddd xmm5,[counter_add_four]
+ pxor xmm1,xmm4
+ pxor xmm2,xmm4
+ pxor xmm3,xmm4
+%endmacro
+
+%macro xor_with_input4 1
+ movdqu xmm4,[%1]
+ pxor xmm0,xmm4
+ movdqu xmm4,[%1+16]
+ pxor xmm1,xmm4
+ movdqu xmm4,[%1+32]
+ pxor xmm2,xmm4
+ movdqu xmm4,[%1+48]
+ pxor xmm3,xmm4
+%endmacro
+
+%macro store4 1
+ movdqu [%1 + 0*16],xmm0
+ movdqu [%1 + 1*16],xmm1
+ movdqu [%1 + 2*16],xmm2
+ movdqu [%1 + 3*16],xmm3
+%endmacro
+
+
+%macro copy_round_keys 3
+ movdqu xmm4,[%2 + ((%3)*16)]
+ movdqa [%1 + ((%3)*16)],xmm4
+%endmacro
+
+;abab
+%macro copy_round_keyx 3
+ movdqu xmm4,[%2 + ((%3)*16)]
+ movdqa %1,xmm4
+%endmacro
+
+
+
+%macro key_expansion_1_192 1
+ ;; Assumes the xmm3 includes all zeros at this point.
+ pshufd xmm2, xmm2, 11111111b
+ shufps xmm3, xmm1, 00010000b
+ pxor xmm1, xmm3
+ shufps xmm3, xmm1, 10001100b
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu [edx+%1], xmm1
+%endmacro
+
+; Calculate w10 and w11 using calculated w9 and known w4-w5
+%macro key_expansion_2_192 1
+ movdqa xmm5, xmm4
+ pslldq xmm5, 4
+ shufps xmm6, xmm1, 11110000b
+ pxor xmm6, xmm5
+ pxor xmm4, xmm6
+ pshufd xmm7, xmm4, 00001110b
+ movdqu [edx+%1], xmm7
+%endmacro
+
+
+
+
+
+section .data
+align 16
+shuffle_mask:
+DD 0FFFFFFFFh
+DD 03020100h
+DD 07060504h
+DD 0B0A0908h
+
+byte_swap_16:
+DDQ 0x000102030405060708090A0B0C0D0E0F
+
+align 16
+counter_add_one:
+DD 1
+DD 0
+DD 0
+DD 0
+
+counter_add_two:
+DD 2
+DD 0
+DD 0
+DD 0
+
+counter_add_three:
+DD 3
+DD 0
+DD 0
+DD 0
+
+counter_add_four:
+DD 4
+DD 0
+DD 0
+DD 0
+
+
+section .text
+
+
+
+align 16
+key_expansion256:
+
+ pshufd xmm2, xmm2, 011111111b
+
+ movdqu xmm4, xmm1
+ pshufb xmm4, xmm5
+ pxor xmm1, xmm4
+ pshufb xmm4, xmm5
+ pxor xmm1, xmm4
+ pshufb xmm4, xmm5
+ pxor xmm1, xmm4
+ pxor xmm1, xmm2
+
+ movdqu [edx], xmm1
+ add edx, 0x10
+
+ aeskeygenassist xmm4, xmm1, 0
+ pshufd xmm2, xmm4, 010101010b
+
+ movdqu xmm4, xmm3
+ pshufb xmm4, xmm5
+ pxor xmm3, xmm4
+ pshufb xmm4, xmm5
+ pxor xmm3, xmm4
+ pshufb xmm4, xmm5
+ pxor xmm3, xmm4
+ pxor xmm3, xmm2
+
+ movdqu [edx], xmm3
+ add edx, 0x10
+
+ ret
+
+
+
+align 16
+key_expansion128:
+ pshufd xmm2, xmm2, 0xFF;
+ movdqu xmm3, xmm1
+ pshufb xmm3, xmm5
+ pxor xmm1, xmm3
+ pshufb xmm3, xmm5
+ pxor xmm1, xmm3
+ pshufb xmm3, xmm5
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+
+ ; storing the result in the key schedule array
+ movdqu [edx], xmm1
+ add edx, 0x10
+ ret
+
+
+
+align 16
+global _iEncExpandKey128
+_iEncExpandKey128:
+
+ mov ecx,[esp-4+8] ;input
+ mov edx,[esp-4+12] ;ctx
+
+ movdqu xmm1, [ecx] ; loading the key
+
+ movdqu [edx], xmm1
+
+ movdqa xmm5, [shuffle_mask]
+
+ add edx,16
+
+ aeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1
+ call key_expansion128
+ aeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2
+ call key_expansion128
+ aeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3
+ call key_expansion128
+ aeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4
+ call key_expansion128
+ aeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5
+ call key_expansion128
+ aeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6
+ call key_expansion128
+ aeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7
+ call key_expansion128
+ aeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8
+ call key_expansion128
+ aeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9
+ call key_expansion128
+ aeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10
+ call key_expansion128
+
+ ret
+
+
+align 16
+global _iEncExpandKey192
+_iEncExpandKey192:
+
+ mov ecx,[esp-4+8] ;input
+ mov edx,[esp-4+12] ;ctx
+
+ movq xmm7, [ecx+16] ; loading the AES key
+	movq [edx+16], xmm7          ; Storing key in memory where all key expansion will be stored
+ pshufd xmm4, xmm7, 01001111b
+ movdqu xmm1, [ecx] ; loading the AES key
+	movdqu [edx], xmm1           ; Storing key in memory where all key expansion will be stored
+
+ pxor xmm3, xmm3 ; Set xmm3 to be all zeros. Required for the key_expansion.
+	pxor xmm6, xmm6              ; Set xmm6 to be all zeros. Required for the key_expansion.
+
+ aeskeygenassist xmm2, xmm4, 0x1 ; Complete round key 1 and generate round key 2
+ key_expansion_1_192 24
+ key_expansion_2_192 40
+
+ aeskeygenassist xmm2, xmm4, 0x2 ; Generate round key 3 and part of round key 4
+ key_expansion_1_192 48
+ key_expansion_2_192 64
+
+ aeskeygenassist xmm2, xmm4, 0x4 ; Complete round key 4 and generate round key 5
+ key_expansion_1_192 72
+ key_expansion_2_192 88
+
+ aeskeygenassist xmm2, xmm4, 0x8 ; Generate round key 6 and part of round key 7
+ key_expansion_1_192 96
+ key_expansion_2_192 112
+
+ aeskeygenassist xmm2, xmm4, 0x10 ; Complete round key 7 and generate round key 8
+ key_expansion_1_192 120
+ key_expansion_2_192 136
+
+ aeskeygenassist xmm2, xmm4, 0x20 ; Generate round key 9 and part of round key 10
+ key_expansion_1_192 144
+ key_expansion_2_192 160
+
+ aeskeygenassist xmm2, xmm4, 0x40 ; Complete round key 10 and generate round key 11
+ key_expansion_1_192 168
+ key_expansion_2_192 184
+
+ aeskeygenassist xmm2, xmm4, 0x80 ; Generate round key 12
+ key_expansion_1_192 192
+
+ ret
+
+
+
+
+
+
+align 16
+global _iDecExpandKey128
+_iDecExpandKey128:
+ push DWORD [esp+8]
+ push DWORD [esp+8]
+
+ call _iEncExpandKey128
+ add esp,8
+
+ mov edx,[esp-4+12] ;ctx
+
+ inversekey [edx + 1*16]
+ inversekey [edx + 2*16]
+ inversekey [edx + 3*16]
+ inversekey [edx + 4*16]
+ inversekey [edx + 5*16]
+ inversekey [edx + 6*16]
+ inversekey [edx + 7*16]
+ inversekey [edx + 8*16]
+ inversekey [edx + 9*16]
+
+ ret
+
+
+
+
+align 16
+global _iDecExpandKey192
+_iDecExpandKey192:
+ push DWORD [esp+8]
+ push DWORD [esp+8]
+
+ call _iEncExpandKey192
+ add esp,8
+
+ mov edx,[esp-4+12] ;ctx
+
+ inversekey [edx + 1*16]
+ inversekey [edx + 2*16]
+ inversekey [edx + 3*16]
+ inversekey [edx + 4*16]
+ inversekey [edx + 5*16]
+ inversekey [edx + 6*16]
+ inversekey [edx + 7*16]
+ inversekey [edx + 8*16]
+ inversekey [edx + 9*16]
+ inversekey [edx + 10*16]
+ inversekey [edx + 11*16]
+
+ ret
+
+
+
+
+align 16
+global _iDecExpandKey256
+_iDecExpandKey256:
+ push DWORD [esp+8]
+ push DWORD [esp+8]
+
+ call _iEncExpandKey256
+ add esp, 8
+
+ mov edx, [esp-4+12] ;expanded key
+
+ inversekey [edx + 1*16]
+ inversekey [edx + 2*16]
+ inversekey [edx + 3*16]
+ inversekey [edx + 4*16]
+ inversekey [edx + 5*16]
+ inversekey [edx + 6*16]
+ inversekey [edx + 7*16]
+ inversekey [edx + 8*16]
+ inversekey [edx + 9*16]
+ inversekey [edx + 10*16]
+ inversekey [edx + 11*16]
+ inversekey [edx + 12*16]
+ inversekey [edx + 13*16]
+
+ ret
+
+
+
+
+align 16
+global _iEncExpandKey256
+_iEncExpandKey256:
+ mov ecx, [esp-4+8] ;input
+ mov edx, [esp-4+12] ;expanded key
+
+
+ movdqu xmm1, [ecx] ; loading the key
+ movdqu xmm3, [ecx+16]
+ movdqu [edx], xmm1 ; Storing key in memory where all key schedule will be stored
+ movdqu [edx+16], xmm3
+
+ add edx,32
+
+ movdqa xmm5, [shuffle_mask] ; this mask is used by key_expansion
+
+ aeskeygenassist xmm2, xmm3, 0x1 ;
+ call key_expansion256
+ aeskeygenassist xmm2, xmm3, 0x2 ;
+ call key_expansion256
+ aeskeygenassist xmm2, xmm3, 0x4 ;
+ call key_expansion256
+ aeskeygenassist xmm2, xmm3, 0x8 ;
+ call key_expansion256
+ aeskeygenassist xmm2, xmm3, 0x10 ;
+ call key_expansion256
+ aeskeygenassist xmm2, xmm3, 0x20 ;
+ call key_expansion256
+ aeskeygenassist xmm2, xmm3, 0x40 ;
+; call key_expansion256
+
+ pshufd xmm2, xmm2, 011111111b
+
+ movdqu xmm4, xmm1
+ pshufb xmm4, xmm5
+ pxor xmm1, xmm4
+ pshufb xmm4, xmm5
+ pxor xmm1, xmm4
+ pshufb xmm4, xmm5
+ pxor xmm1, xmm4
+ pxor xmm1, xmm2
+
+ movdqu [edx], xmm1
+
+
+ ret
+
+
+
+
+
+
+align 16
+global _iDec128
+_iDec128:
+ mov ecx,[esp-4+8]
+
+ push esi
+ push edi
+ push ebp
+ mov ebp,esp
+
+ sub esp,16*16
+ and esp,0xfffffff0
+
+ mov eax,[ecx+16] ; numblocks
+ mov esi,[ecx]
+ mov edi,[ecx+4]
+ mov ecx,[ecx+8]
+
+ sub edi,esi
+
+ test eax,eax
+ jz end_dec128
+
+ cmp eax,4
+ jl lp128decsingle
+
+ test ecx,0xf
+ jz lp128decfour
+
+ copy_round_keys esp,ecx,0
+ copy_round_keys esp,ecx,1
+ copy_round_keys esp,ecx,2
+ copy_round_keys esp,ecx,3
+ copy_round_keys esp,ecx,4
+ copy_round_keys esp,ecx,5
+ copy_round_keys esp,ecx,6
+ copy_round_keys esp,ecx,7
+ copy_round_keys esp,ecx,8
+ copy_round_keys esp,ecx,9
+ copy_round_keys esp,ecx,10
+ mov ecx,esp
+
+
+align 16
+lp128decfour:
+
+ test eax,eax
+ jz end_dec128
+
+ cmp eax,4
+ jl lp128decsingle
+
+ load_and_xor4 esi, [ecx+10*16]
+ add esi,16*4
+ aesdec4 [ecx+9*16]
+ aesdec4 [ecx+8*16]
+ aesdec4 [ecx+7*16]
+ aesdec4 [ecx+6*16]
+ aesdec4 [ecx+5*16]
+ aesdec4 [ecx+4*16]
+ aesdec4 [ecx+3*16]
+ aesdec4 [ecx+2*16]
+ aesdec4 [ecx+1*16]
+ aesdeclast4 [ecx+0*16]
+
+ sub eax,4
+ store4 esi+edi-(16*4)
+ jmp lp128decfour
+
+
+ align 16
+lp128decsingle:
+
+ movdqu xmm0, [esi]
+ movdqu xmm4,[ecx+10*16]
+ pxor xmm0, xmm4
+ aesdec1_u [ecx+9*16]
+ aesdec1_u [ecx+8*16]
+ aesdec1_u [ecx+7*16]
+ aesdec1_u [ecx+6*16]
+ aesdec1_u [ecx+5*16]
+ aesdec1_u [ecx+4*16]
+ aesdec1_u [ecx+3*16]
+ aesdec1_u [ecx+2*16]
+ aesdec1_u [ecx+1*16]
+ aesdeclast1_u [ecx+0*16]
+
+ add esi, 16
+ movdqu [edi+esi - 16], xmm0
+ dec eax
+ jnz lp128decsingle
+
+end_dec128:
+
+ mov esp,ebp
+ pop ebp
+ pop edi
+ pop esi
+
+ ret
+
+
+
+align 16
+global _iDec128_CBC
+_iDec128_CBC:
+ mov ecx,[esp-4+8]
+
+ push esi
+ push edi
+ push ebp
+ mov ebp,esp
+ sub esp,16*16
+ and esp,0xfffffff0
+
+ mov eax,[ecx+12]
+ movdqu xmm5,[eax] ;iv
+
+ mov eax,[ecx+16] ; numblocks
+ mov esi,[ecx]
+ mov edi,[ecx+4]
+ mov ecx,[ecx+8]
+
+ sub edi,esi
+
+ test eax,eax
+ jz end_dec128_CBC
+
+ cmp eax,4
+ jl lp128decsingle_CBC
+
+ test ecx,0xf
+ jz lp128decfour_CBC
+
+ copy_round_keys esp,ecx,0
+ copy_round_keys esp,ecx,1
+ copy_round_keys esp,ecx,2
+ copy_round_keys esp,ecx,3
+ copy_round_keys esp,ecx,4
+ copy_round_keys esp,ecx,5
+ copy_round_keys esp,ecx,6
+ copy_round_keys esp,ecx,7
+ copy_round_keys esp,ecx,8
+ copy_round_keys esp,ecx,9
+ copy_round_keys esp,ecx,10
+ mov ecx,esp
+
+
+align 16
+lp128decfour_CBC:
+
+ test eax,eax
+ jz end_dec128_CBC
+
+ cmp eax,4
+ jl lp128decsingle_CBC
+
+ load_and_xor4 esi, [ecx+10*16]
+ add esi,16*4
+ aesdec4 [ecx+9*16]
+ aesdec4 [ecx+8*16]
+ aesdec4 [ecx+7*16]
+ aesdec4 [ecx+6*16]
+ aesdec4 [ecx+5*16]
+ aesdec4 [ecx+4*16]
+ aesdec4 [ecx+3*16]
+ aesdec4 [ecx+2*16]
+ aesdec4 [ecx+1*16]
+ aesdeclast4 [ecx+0*16]
+
+ pxor xmm0,xmm5
+ movdqu xmm4,[esi- 16*4 + 0*16]
+ pxor xmm1,xmm4
+ movdqu xmm4,[esi- 16*4 + 1*16]
+ pxor xmm2,xmm4
+ movdqu xmm4,[esi- 16*4 + 2*16]
+ pxor xmm3,xmm4
+ movdqu xmm5,[esi- 16*4 + 3*16]
+
+ sub eax,4
+ store4 esi+edi-(16*4)
+ jmp lp128decfour_CBC
+
+
+ align 16
+lp128decsingle_CBC:
+
+ movdqu xmm0, [esi]
+ movdqa xmm1,xmm0
+ movdqu xmm4,[ecx+10*16]
+ pxor xmm0, xmm4
+ aesdec1_u [ecx+9*16]
+ aesdec1_u [ecx+8*16]
+ aesdec1_u [ecx+7*16]
+ aesdec1_u [ecx+6*16]
+ aesdec1_u [ecx+5*16]
+ aesdec1_u [ecx+4*16]
+ aesdec1_u [ecx+3*16]
+ aesdec1_u [ecx+2*16]
+ aesdec1_u [ecx+1*16]
+ aesdeclast1_u [ecx+0*16]
+
+ pxor xmm0,xmm5
+ movdqa xmm5,xmm1
+
+ add esi, 16
+ movdqu [edi+esi - 16], xmm0
+ dec eax
+ jnz lp128decsingle_CBC
+
+end_dec128_CBC:
+
+ mov esp,ebp
+ pop ebp
+ pop edi
+ pop esi
+
+ mov ecx,[esp-4+8] ; first arg
+ mov ecx,[ecx+12]
+ movdqu [ecx],xmm5 ; store last iv for chaining
+
+ ret
+
+
+
+
+
+
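+; AES-192 ECB decryption; same structure as _iDec128 but with 12 rounds
+; (13 round keys at [8]).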
+align 16
+global _iDec192
+_iDec192:
+ mov ecx,[esp-4+8]
+
+ push esi
+ push edi
+ push ebp
+ mov ebp,esp
+
+ sub esp,16*16
+ and esp,0xfffffff0
+
+ mov eax,[ecx+16] ; numblocks
+ mov esi,[ecx]
+ mov edi,[ecx+4]
+ mov ecx,[ecx+8]
+
+ sub edi,esi
+
+ test eax,eax
+ jz end_dec192
+
+ cmp eax,4
+ jl lp192decsingle
+
+ test ecx,0xf
+ jz lp192decfour
+
+ copy_round_keys esp,ecx,0
+ copy_round_keys esp,ecx,1
+ copy_round_keys esp,ecx,2
+ copy_round_keys esp,ecx,3
+ copy_round_keys esp,ecx,4
+ copy_round_keys esp,ecx,5
+ copy_round_keys esp,ecx,6
+ copy_round_keys esp,ecx,7
+ copy_round_keys esp,ecx,8
+ copy_round_keys esp,ecx,9
+ copy_round_keys esp,ecx,10
+ copy_round_keys esp,ecx,11
+ copy_round_keys esp,ecx,12
+ mov ecx,esp
+
+
+align 16
+lp192decfour:
+
+ test eax,eax
+ jz end_dec192
+
+ cmp eax,4
+ jl lp192decsingle
+
+ load_and_xor4 esi, [ecx+12*16]
+ add esi,16*4
+ aesdec4 [ecx+11*16]
+ aesdec4 [ecx+10*16]
+ aesdec4 [ecx+9*16]
+ aesdec4 [ecx+8*16]
+ aesdec4 [ecx+7*16]
+ aesdec4 [ecx+6*16]
+ aesdec4 [ecx+5*16]
+ aesdec4 [ecx+4*16]
+ aesdec4 [ecx+3*16]
+ aesdec4 [ecx+2*16]
+ aesdec4 [ecx+1*16]
+ aesdeclast4 [ecx+0*16]
+
+ sub eax,4
+ store4 esi+edi-(16*4)
+ jmp lp192decfour
+
+
+ align 16
+lp192decsingle:
+
+ movdqu xmm0, [esi]
+ movdqu xmm4,[ecx+12*16]
+ pxor xmm0, xmm4
+ aesdec1_u [ecx+11*16]
+ aesdec1_u [ecx+10*16]
+ aesdec1_u [ecx+9*16]
+ aesdec1_u [ecx+8*16]
+ aesdec1_u [ecx+7*16]
+ aesdec1_u [ecx+6*16]
+ aesdec1_u [ecx+5*16]
+ aesdec1_u [ecx+4*16]
+ aesdec1_u [ecx+3*16]
+ aesdec1_u [ecx+2*16]
+ aesdec1_u [ecx+1*16]
+ aesdeclast1_u [ecx+0*16]
+
+ add esi, 16
+ movdqu [edi+esi - 16], xmm0
+ dec eax
+ jnz lp192decsingle
+
+end_dec192:
+
+
+ mov esp,ebp
+ pop ebp
+ pop edi
+ pop esi
+
+ ret
+
+
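+; AES-192 CBC decryption; as _iDec128_CBC but with 12 rounds.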
+align 16
+global _iDec192_CBC
+_iDec192_CBC:
+ mov ecx,[esp-4+8]
+
+ push esi
+ push edi
+ push ebp
+ mov ebp,esp
+
+ sub esp,16*16
+ and esp,0xfffffff0
+
+ mov eax,[ecx+12]
+ movdqu xmm5,[eax] ;iv
+
+ mov eax,[ecx+16] ; numblocks
+ mov esi,[ecx]
+ mov edi,[ecx+4]
+ mov ecx,[ecx+8]
+
+ sub edi,esi
+
+ test eax,eax
+ jz end_dec192_CBC
+
+ cmp eax,4
+ jl lp192decsingle_CBC
+
+ test ecx,0xf
+ jz lp192decfour_CBC
+
+ copy_round_keys esp,ecx,0
+ copy_round_keys esp,ecx,1
+ copy_round_keys esp,ecx,2
+ copy_round_keys esp,ecx,3
+ copy_round_keys esp,ecx,4
+ copy_round_keys esp,ecx,5
+ copy_round_keys esp,ecx,6
+ copy_round_keys esp,ecx,7
+ copy_round_keys esp,ecx,8
+ copy_round_keys esp,ecx,9
+ copy_round_keys esp,ecx,10
+ copy_round_keys esp,ecx,11
+ copy_round_keys esp,ecx,12
+ mov ecx,esp
+
+align 16
+lp192decfour_CBC:
+
+ test eax,eax
+ jz end_dec192_CBC
+
+ cmp eax,4
+ jl lp192decsingle_CBC
+
+ load_and_xor4 esi, [ecx+12*16]
+ add esi,16*4
+ aesdec4 [ecx+11*16]
+ aesdec4 [ecx+10*16]
+ aesdec4 [ecx+9*16]
+ aesdec4 [ecx+8*16]
+ aesdec4 [ecx+7*16]
+ aesdec4 [ecx+6*16]
+ aesdec4 [ecx+5*16]
+ aesdec4 [ecx+4*16]
+ aesdec4 [ecx+3*16]
+ aesdec4 [ecx+2*16]
+ aesdec4 [ecx+1*16]
+ aesdeclast4 [ecx+0*16]
+
+ pxor xmm0,xmm5
+ movdqu xmm4,[esi- 16*4 + 0*16]
+ pxor xmm1,xmm4
+ movdqu xmm4,[esi- 16*4 + 1*16]
+ pxor xmm2,xmm4
+ movdqu xmm4,[esi- 16*4 + 2*16]
+ pxor xmm3,xmm4
+ movdqu xmm5,[esi- 16*4 + 3*16]
+
+ sub eax,4
+ store4 esi+edi-(16*4)
+ jmp lp192decfour_CBC
+
+
+ align 16
+lp192decsingle_CBC:
+
+ movdqu xmm0, [esi]
+ movdqu xmm4,[ecx+12*16]
+ movdqa xmm1,xmm0
+ pxor xmm0, xmm4
+ aesdec1_u [ecx+11*16]
+ aesdec1_u [ecx+10*16]
+ aesdec1_u [ecx+9*16]
+ aesdec1_u [ecx+8*16]
+ aesdec1_u [ecx+7*16]
+ aesdec1_u [ecx+6*16]
+ aesdec1_u [ecx+5*16]
+ aesdec1_u [ecx+4*16]
+ aesdec1_u [ecx+3*16]
+ aesdec1_u [ecx+2*16]
+ aesdec1_u [ecx+1*16]
+ aesdeclast1_u [ecx+0*16]
+
+ pxor xmm0,xmm5
+ movdqa xmm5,xmm1
+
+ add esi, 16
+ movdqu [edi+esi - 16], xmm0
+ dec eax
+ jnz lp192decsingle_CBC
+
+end_dec192_CBC:
+
+
+ mov esp,ebp
+ pop ebp
+ pop edi
+ pop esi
+
+ mov ecx,[esp-4+8]
+ mov ecx,[ecx+12]
+ movdqu [ecx],xmm5 ; store last iv for chaining
+
+ ret
+
+
+
+
+
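+; AES-256 ECB decryption; as _iDec128 but with 14 rounds (15 round keys).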
+align 16
+global _iDec256
+_iDec256:
+ mov ecx, [esp-4+8]
+
+ push esi
+ push edi
+ push ebp
+ mov ebp,esp
+
+ sub esp,16*16
+ and esp,0xfffffff0
+
+ mov eax,[ecx+16] ; numblocks
+ mov esi,[ecx]
+ mov edi,[ecx+4]
+ mov ecx,[ecx+8]
+
+ sub edi,esi
+
+
+ test eax,eax
+ jz end_dec256
+
+ cmp eax,4
+ jl lp256dec
+
+ test ecx,0xf
+ jz lp256dec4
+
+ copy_round_keys esp,ecx,0
+ copy_round_keys esp,ecx,1
+ copy_round_keys esp,ecx,2
+ copy_round_keys esp,ecx,3
+ copy_round_keys esp,ecx,4
+ copy_round_keys esp,ecx,5
+ copy_round_keys esp,ecx,6
+ copy_round_keys esp,ecx,7
+ copy_round_keys esp,ecx,8
+ copy_round_keys esp,ecx,9
+ copy_round_keys esp,ecx,10
+ copy_round_keys esp,ecx,11
+ copy_round_keys esp,ecx,12
+ copy_round_keys esp,ecx,13
+ copy_round_keys esp,ecx,14
+ mov ecx,esp
+
+ align 16
+lp256dec4:
+ test eax,eax
+ jz end_dec256
+
+ cmp eax,4
+ jl lp256dec
+
+ load_and_xor4 esi,[ecx+14*16]
+ add esi, 4*16
+ aesdec4 [ecx+13*16]
+ aesdec4 [ecx+12*16]
+ aesdec4 [ecx+11*16]
+ aesdec4 [ecx+10*16]
+ aesdec4 [ecx+9*16]
+ aesdec4 [ecx+8*16]
+ aesdec4 [ecx+7*16]
+ aesdec4 [ecx+6*16]
+ aesdec4 [ecx+5*16]
+ aesdec4 [ecx+4*16]
+ aesdec4 [ecx+3*16]
+ aesdec4 [ecx+2*16]
+ aesdec4 [ecx+1*16]
+ aesdeclast4 [ecx+0*16]
+
+ store4 esi+edi-16*4
+ sub eax,4
+ jmp lp256dec4
+
+ align 16
+lp256dec:
+
+ movdqu xmm0, [esi]
+ movdqu xmm4,[ecx+14*16]
+ add esi, 16
+ pxor xmm0, xmm4 ; Round 0 (only xor)
+ aesdec1_u [ecx+13*16]
+ aesdec1_u [ecx+12*16]
+ aesdec1_u [ecx+11*16]
+ aesdec1_u [ecx+10*16]
+ aesdec1_u [ecx+9*16]
+ aesdec1_u [ecx+8*16]
+ aesdec1_u [ecx+7*16]
+ aesdec1_u [ecx+6*16]
+ aesdec1_u [ecx+5*16]
+ aesdec1_u [ecx+4*16]
+ aesdec1_u [ecx+3*16]
+ aesdec1_u [ecx+2*16]
+ aesdec1_u [ecx+1*16]
+ aesdeclast1_u [ecx+0*16]
+
+	; Store output decrypted data into PLAINTEXT array
+ movdqu [esi+edi-16], xmm0
+ dec eax
+ jnz lp256dec
+
+end_dec256:
+
+
+ mov esp,ebp
+ pop ebp
+ pop edi
+ pop esi
+
+ ret
+
+
+
+
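+; AES-256 CBC decryption; as _iDec128_CBC but with 14 rounds.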
+align 16
+global _iDec256_CBC
+_iDec256_CBC:
+ mov ecx,[esp-4+8]
+
+ push esi
+ push edi
+ push ebp
+ mov ebp,esp
+
+ sub esp,16*16
+ and esp,0xfffffff0
+
+ mov eax,[ecx+12]
+ movdqu xmm5,[eax] ;iv
+
+ mov eax,[ecx+16] ; numblocks
+ mov esi,[ecx]
+ mov edi,[ecx+4]
+ mov ecx,[ecx+8]
+
+ sub edi,esi
+
+ test eax,eax
+ jz end_dec256_CBC
+
+ cmp eax,4
+ jl lp256decsingle_CBC
+
+ test ecx,0xf
+ jz lp256decfour_CBC
+
+ copy_round_keys esp,ecx,0
+ copy_round_keys esp,ecx,1
+ copy_round_keys esp,ecx,2
+ copy_round_keys esp,ecx,3
+ copy_round_keys esp,ecx,4
+ copy_round_keys esp,ecx,5
+ copy_round_keys esp,ecx,6
+ copy_round_keys esp,ecx,7
+ copy_round_keys esp,ecx,8
+ copy_round_keys esp,ecx,9
+ copy_round_keys esp,ecx,10
+ copy_round_keys esp,ecx,11
+ copy_round_keys esp,ecx,12
+ copy_round_keys esp,ecx,13
+ copy_round_keys esp,ecx,14
+ mov ecx,esp
+
+align 16
+lp256decfour_CBC:
+
+ test eax,eax
+ jz end_dec256_CBC
+
+ cmp eax,4
+ jl lp256decsingle_CBC
+
+ load_and_xor4 esi, [ecx+14*16]
+ add esi,16*4
+ aesdec4 [ecx+13*16]
+ aesdec4 [ecx+12*16]
+ aesdec4 [ecx+11*16]
+ aesdec4 [ecx+10*16]
+ aesdec4 [ecx+9*16]
+ aesdec4 [ecx+8*16]
+ aesdec4 [ecx+7*16]
+ aesdec4 [ecx+6*16]
+ aesdec4 [ecx+5*16]
+ aesdec4 [ecx+4*16]
+ aesdec4 [ecx+3*16]
+ aesdec4 [ecx+2*16]
+ aesdec4 [ecx+1*16]
+ aesdeclast4 [ecx+0*16]
+
+ pxor xmm0,xmm5
+ movdqu xmm4,[esi- 16*4 + 0*16]
+ pxor xmm1,xmm4
+ movdqu xmm4,[esi- 16*4 + 1*16]
+ pxor xmm2,xmm4
+ movdqu xmm4,[esi- 16*4 + 2*16]
+ pxor xmm3,xmm4
+ movdqu xmm5,[esi- 16*4 + 3*16]
+
+ sub eax,4
+ store4 esi+edi-(16*4)
+ jmp lp256decfour_CBC
+
+
+ align 16
+lp256decsingle_CBC:
+
+ movdqu xmm0, [esi]
+ movdqa xmm1,xmm0
+ movdqu xmm4, [ecx+14*16]
+ pxor xmm0, xmm4
+ aesdec1_u [ecx+13*16]
+ aesdec1_u [ecx+12*16]
+ aesdec1_u [ecx+11*16]
+ aesdec1_u [ecx+10*16]
+ aesdec1_u [ecx+9*16]
+ aesdec1_u [ecx+8*16]
+ aesdec1_u [ecx+7*16]
+ aesdec1_u [ecx+6*16]
+ aesdec1_u [ecx+5*16]
+ aesdec1_u [ecx+4*16]
+ aesdec1_u [ecx+3*16]
+ aesdec1_u [ecx+2*16]
+ aesdec1_u [ecx+1*16]
+ aesdeclast1_u [ecx+0*16]
+
+ pxor xmm0,xmm5
+ movdqa xmm5,xmm1
+
+ add esi, 16
+ movdqu [edi+esi - 16], xmm0
+ dec eax
+ jnz lp256decsingle_CBC
+
+end_dec256_CBC:
+
+
+ mov esp,ebp
+ pop ebp
+ pop edi
+ pop esi
+
+ mov ecx,[esp-4+8] ; first arg
+ mov ecx,[ecx+12]
+ movdqu [ecx],xmm5 ; store last iv for chaining
+
+ ret
+
+
+
+
+
+
+
+
+
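+; AES-128 ECB encryption. Same parameter block as the decryption routines;
+; round keys are applied in forward order (0 through 10), four blocks at a
+; time when possible, with a single-block tail loop.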
+align 16
+global _iEnc128
+_iEnc128:
+ mov ecx,[esp-4+8]
+
+ push esi
+ push edi
+ push ebp
+ mov ebp,esp
+
+ sub esp,16*16
+ and esp,0xfffffff0
+
+ mov eax,[ecx+16] ; numblocks
+ mov esi,[ecx]
+ mov edi,[ecx+4]
+ mov ecx,[ecx+8]
+
+ sub edi,esi
+
+ test eax,eax
+ jz end_enc128
+
+ cmp eax,4
+ jl lp128encsingle
+
+ test ecx,0xf
+ jz lpenc128four
+
+ copy_round_keys esp,ecx,0
+ copy_round_keys esp,ecx,1
+ copy_round_keys esp,ecx,2
+ copy_round_keys esp,ecx,3
+ copy_round_keys esp,ecx,4
+ copy_round_keys esp,ecx,5
+ copy_round_keys esp,ecx,6
+ copy_round_keys esp,ecx,7
+ copy_round_keys esp,ecx,8
+ copy_round_keys esp,ecx,9
+ copy_round_keys esp,ecx,10
+ mov ecx,esp
+
+
+ align 16
+
+lpenc128four:
+
+ test eax,eax
+ jz end_enc128
+
+ cmp eax,4
+ jl lp128encsingle
+
+ load_and_xor4 esi,[ecx+0*16]
+ add esi,4*16
+ aesenc4 [ecx+1*16]
+ aesenc4 [ecx+2*16]
+ aesenc4 [ecx+3*16]
+ aesenc4 [ecx+4*16]
+ aesenc4 [ecx+5*16]
+ aesenc4 [ecx+6*16]
+ aesenc4 [ecx+7*16]
+ aesenc4 [ecx+8*16]
+ aesenc4 [ecx+9*16]
+ aesenclast4 [ecx+10*16]
+
+ store4 esi+edi-16*4
+ sub eax,4
+ jmp lpenc128four
+
+ align 16
+lp128encsingle:
+
+ movdqu xmm0, [esi]
+ add esi, 16
+ movdqu xmm4,[ecx+0*16]
+ pxor xmm0, xmm4
+ aesenc1_u [ecx+1*16]
+ aesenc1_u [ecx+2*16]
+ aesenc1_u [ecx+3*16]
+ aesenc1_u [ecx+4*16]
+ aesenc1_u [ecx+5*16]
+ aesenc1_u [ecx+6*16]
+ aesenc1_u [ecx+7*16]
+ aesenc1_u [ecx+8*16]
+ aesenc1_u [ecx+9*16]
+ aesenclast1_u [ecx+10*16]
+ ; Store output encrypted data into CIPHERTEXT array
+ movdqu [esi+edi-16], xmm0
+ dec eax
+ jnz lp128encsingle
+
+end_enc128:
+
+
+ mov esp,ebp
+ pop ebp
+ pop edi
+ pop esi
+
+ ret
+
+
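+; AES-128 CTR mode. [12] points to the counter block; it is byte-swapped
+; (byte_swap_16 mask) so it can be incremented with paddd, swapped back to
+; its original byte order before being encrypted into the keystream, and
+; the keystream is XORed with the input. The final counter value is
+; byte-swapped back and written through [12] for chaining.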
+align 16
+global _iEnc128_CTR
+_iEnc128_CTR:
+ mov ecx,[esp-4+8]
+
+ push esi
+ push edi
+ push ebp
+ mov ebp,esp
+
+ sub esp,16*16
+ and esp,0xfffffff0
+
+ mov eax,[ecx+12]
+ movdqu xmm5,[eax] ;initial counter
+ movdqa xmm6, [byte_swap_16]
+ pshufb xmm5, xmm6 ; byte swap counter
+ mov eax,[ecx+16] ; numblocks
+ mov esi,[ecx]
+ mov edi,[ecx+4]
+ mov ecx,[ecx+8]
+
+ sub edi,esi
+
+ test eax,eax
+ jz end_encctr128
+
+ cmp eax,4
+ jl lp128encctrsingle
+
+ test ecx,0xf
+ jz lpencctr128four
+
+ copy_round_keys esp,ecx,0
+ copy_round_keys esp,ecx,1
+ copy_round_keys esp,ecx,2
+ copy_round_keys esp,ecx,3
+ copy_round_keys esp,ecx,4
+ copy_round_keys esp,ecx,5
+ copy_round_keys esp,ecx,6
+ copy_round_keys esp,ecx,7
+ copy_round_keys esp,ecx,8
+ copy_round_keys esp,ecx,9
+ copy_round_keys esp,ecx,10
+ mov ecx,esp
+
+
+ align 16
+
+lpencctr128four:
+
+ test eax,eax
+ jz end_encctr128
+
+ cmp eax,4
+ jl lp128encctrsingle
+
+ load_and_inc4 [ecx+0*16]
+ add esi,4*16
+ aesenc4 [ecx+1*16]
+ aesenc4 [ecx+2*16]
+ aesenc4 [ecx+3*16]
+ aesenc4 [ecx+4*16]
+ aesenc4 [ecx+5*16]
+ aesenc4 [ecx+6*16]
+ aesenc4 [ecx+7*16]
+ aesenc4 [ecx+8*16]
+ aesenc4 [ecx+9*16]
+ aesenclast4 [ecx+10*16]
+ xor_with_input4 esi-(4*16)
+
+ store4 esi+edi-16*4
+ sub eax,4
+ jmp lpencctr128four
+
+ align 16
+lp128encctrsingle:
+
+ movdqa xmm0,xmm5
+ pshufb xmm0, xmm6 ; byte swap counter back
+ paddd xmm5,[counter_add_one]
+ add esi, 16
+ movdqu xmm4,[ecx+0*16]
+ pxor xmm0, xmm4
+ aesenc1_u [ecx+1*16]
+ aesenc1_u [ecx+2*16]
+ aesenc1_u [ecx+3*16]
+ aesenc1_u [ecx+4*16]
+ aesenc1_u [ecx+5*16]
+ aesenc1_u [ecx+6*16]
+ aesenc1_u [ecx+7*16]
+ aesenc1_u [ecx+8*16]
+ aesenc1_u [ecx+9*16]
+ aesenclast1_u [ecx+10*16]
+ movdqu xmm4, [esi-16]
+ pxor xmm0,xmm4
+ ; Store output encrypted data into CIPHERTEXT array
+ movdqu [esi+edi-16], xmm0
+ dec eax
+ jnz lp128encctrsingle
+
+end_encctr128:
+ pshufb xmm5, xmm6 ; byte swap counter
+ mov esp,ebp
+ pop ebp
+ pop edi
+ pop esi
+
+ mov ecx,[esp-4+8] ; first arg
+ mov ecx,[ecx+12]
+ movdqu [ecx],xmm5 ; store last counter for chaining
+
+ ret
+
+
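+; AES-192 CTR mode; as _iEnc128_CTR but with 12 rounds.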
+align 16
+global _iEnc192_CTR
+_iEnc192_CTR:
+ mov ecx,[esp-4+8]
+
+ push esi
+ push edi
+ push ebp
+ mov ebp,esp
+
+ sub esp,16*16
+ and esp,0xfffffff0
+
+ mov eax,[ecx+12]
+ movdqu xmm5,[eax] ;initial counter
+ movdqa xmm6, [byte_swap_16]
+ pshufb xmm5, xmm6 ; byte swap counter
+ mov eax,[ecx+16] ; numblocks
+ mov esi,[ecx]
+ mov edi,[ecx+4]
+ mov ecx,[ecx+8]
+
+ sub edi,esi
+
+ test eax,eax
+ jz end_encctr192
+
+ cmp eax,4
+ jl lp192encctrsingle
+
+ test ecx,0xf
+ jz lpencctr192four
+
+ copy_round_keys esp,ecx,0
+ copy_round_keys esp,ecx,1
+ copy_round_keys esp,ecx,2
+ copy_round_keys esp,ecx,3
+ copy_round_keys esp,ecx,4
+ copy_round_keys esp,ecx,5
+ copy_round_keys esp,ecx,6
+ copy_round_keys esp,ecx,7
+ copy_round_keys esp,ecx,8
+ copy_round_keys esp,ecx,9
+ copy_round_keys esp,ecx,10
+ copy_round_keys esp,ecx,11
+ copy_round_keys esp,ecx,12
+ mov ecx,esp
+
+
+ align 16
+
+lpencctr192four:
+
+ test eax,eax
+ jz end_encctr192
+
+ cmp eax,4
+ jl lp192encctrsingle
+
+ load_and_inc4 [ecx+0*16]
+ add esi,4*16
+ aesenc4 [ecx+1*16]
+ aesenc4 [ecx+2*16]
+ aesenc4 [ecx+3*16]
+ aesenc4 [ecx+4*16]
+ aesenc4 [ecx+5*16]
+ aesenc4 [ecx+6*16]
+ aesenc4 [ecx+7*16]
+ aesenc4 [ecx+8*16]
+ aesenc4 [ecx+9*16]
+ aesenc4 [ecx+10*16]
+ aesenc4 [ecx+11*16]
+ aesenclast4 [ecx+12*16]
+ xor_with_input4 esi-(4*16)
+
+ store4 esi+edi-16*4
+ sub eax,4
+ jmp lpencctr192four
+
+ align 16
+lp192encctrsingle:
+
+ movdqa xmm0,xmm5
+ pshufb xmm0, xmm6 ; byte swap counter back
+ paddd xmm5,[counter_add_one]
+ add esi, 16
+ movdqu xmm4,[ecx+0*16]
+ pxor xmm0, xmm4
+ aesenc1_u [ecx+1*16]
+ aesenc1_u [ecx+2*16]
+ aesenc1_u [ecx+3*16]
+ aesenc1_u [ecx+4*16]
+ aesenc1_u [ecx+5*16]
+ aesenc1_u [ecx+6*16]
+ aesenc1_u [ecx+7*16]
+ aesenc1_u [ecx+8*16]
+ aesenc1_u [ecx+9*16]
+ aesenc1_u [ecx+10*16]
+ aesenc1_u [ecx+11*16]
+ aesenclast1_u [ecx+12*16]
+ movdqu xmm4, [esi-16]
+ pxor xmm0,xmm4
+ ; Store output encrypted data into CIPHERTEXT array
+ movdqu [esi+edi-16], xmm0
+ dec eax
+ jnz lp192encctrsingle
+
+end_encctr192:
+
+ pshufb xmm5, xmm6 ; byte swap counter
+ mov esp,ebp
+ pop ebp
+ pop edi
+ pop esi
+
+ mov ecx,[esp-4+8] ; first arg
+ mov ecx,[ecx+12]
+ movdqu [ecx],xmm5 ; store last counter for chaining
+
+ ret
+
+
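+; AES-256 CTR mode; as _iEnc128_CTR but with 14 rounds.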
+align 16
+global _iEnc256_CTR
+_iEnc256_CTR:
+ mov ecx,[esp-4+8]
+
+ push esi
+ push edi
+ push ebp
+ mov ebp,esp
+
+ sub esp,16*16
+ and esp,0xfffffff0
+
+ mov eax,[ecx+12]
+ movdqu xmm5,[eax] ;initial counter
+ movdqa xmm6, [byte_swap_16]
+ pshufb xmm5, xmm6 ; byte swap counter
+
+ mov eax,[ecx+16] ; numblocks
+ mov esi,[ecx]
+ mov edi,[ecx+4]
+ mov ecx,[ecx+8]
+
+ sub edi,esi
+
+ test eax,eax
+ jz end_encctr256
+
+ cmp eax,4
+ jl lp256encctrsingle
+
+ test ecx,0xf
+ jz lpencctr256four
+
+ copy_round_keys esp,ecx,0
+ copy_round_keys esp,ecx,1
+ copy_round_keys esp,ecx,2
+ copy_round_keys esp,ecx,3
+ copy_round_keys esp,ecx,4
+ copy_round_keys esp,ecx,5
+ copy_round_keys esp,ecx,6
+ copy_round_keys esp,ecx,7
+ copy_round_keys esp,ecx,8
+ copy_round_keys esp,ecx,9
+ copy_round_keys esp,ecx,10
+ copy_round_keys esp,ecx,11
+ copy_round_keys esp,ecx,12
+ copy_round_keys esp,ecx,13
+ copy_round_keys esp,ecx,14
+ mov ecx,esp
+
+
+ align 16
+
+lpencctr256four:
+
+ test eax,eax
+ jz end_encctr256
+
+ cmp eax,4
+ jl lp256encctrsingle
+
+ load_and_inc4 [ecx+0*16]
+ add esi,4*16
+ aesenc4 [ecx+1*16]
+ aesenc4 [ecx+2*16]
+ aesenc4 [ecx+3*16]
+ aesenc4 [ecx+4*16]
+ aesenc4 [ecx+5*16]
+ aesenc4 [ecx+6*16]
+ aesenc4 [ecx+7*16]
+ aesenc4 [ecx+8*16]
+ aesenc4 [ecx+9*16]
+ aesenc4 [ecx+10*16]
+ aesenc4 [ecx+11*16]
+ aesenc4 [ecx+12*16]
+ aesenc4 [ecx+13*16]
+ aesenclast4 [ecx+14*16]
+ xor_with_input4 esi-(4*16)
+
+ store4 esi+edi-16*4
+ sub eax,4
+ jmp lpencctr256four
+
+ align 16
+
+lp256encctrsingle:
+
+ movdqa xmm0,xmm5
+ pshufb xmm0, xmm6 ; byte swap counter back
+ paddd xmm5,[counter_add_one]
+ add esi, 16
+ movdqu xmm4,[ecx+0*16]
+ pxor xmm0, xmm4
+ aesenc1_u [ecx+1*16]
+ aesenc1_u [ecx+2*16]
+ aesenc1_u [ecx+3*16]
+ aesenc1_u [ecx+4*16]
+ aesenc1_u [ecx+5*16]
+ aesenc1_u [ecx+6*16]
+ aesenc1_u [ecx+7*16]
+ aesenc1_u [ecx+8*16]
+ aesenc1_u [ecx+9*16]
+ aesenc1_u [ecx+10*16]
+ aesenc1_u [ecx+11*16]
+ aesenc1_u [ecx+12*16]
+ aesenc1_u [ecx+13*16]
+ aesenclast1_u [ecx+14*16]
+ movdqu xmm4, [esi-16]
+ pxor xmm0,xmm4
+ ; Store output encrypted data into CIPHERTEXT array
+ movdqu [esi+edi-16], xmm0
+ dec eax
+ jnz lp256encctrsingle
+
+end_encctr256:
+
+ pshufb xmm5, xmm6 ; byte swap counter
+ mov esp,ebp
+ pop ebp
+ pop edi
+ pop esi
+
+ mov ecx,[esp-4+8] ; first arg
+ mov ecx,[ecx+12]
+ movdqu [ecx],xmm5 ; store last counter for chaining
+
+ ret
+
+
+
+
+
+
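+; AES-128 CBC encryption. CBC encryption is inherently serial, so there is
+; no four-block path: each plaintext block is XORed with the previous
+; ciphertext block (the IV from [12] for the first block) before being
+; encrypted, and the last ciphertext block is written back through [12].
+; Note: the block count is assumed to be at least one (no zero check
+; before the loop).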
+align 16
+global _iEnc128_CBC
+_iEnc128_CBC:
+ mov ecx,[esp-4+8]
+
+ push esi
+ push edi
+ push ebp
+ mov ebp,esp
+
+ sub esp,16*16
+ and esp,0xfffffff0
+
+ mov eax,[ecx+12]
+ movdqu xmm1,[eax] ;iv
+
+ mov eax,[ecx+16] ; numblocks
+ mov esi,[ecx]
+ mov edi,[ecx+4]
+ mov ecx,[ecx+8]
+ sub edi,esi
+
+ test ecx,0xf
+ jz lp128encsingle_CBC
+
+ copy_round_keys esp,ecx,0
+ copy_round_keys esp,ecx,1
+ copy_round_keys esp,ecx,2
+ copy_round_keys esp,ecx,3
+ copy_round_keys esp,ecx,4
+ copy_round_keys esp,ecx,5
+ copy_round_keys esp,ecx,6
+ copy_round_keys esp,ecx,7
+ copy_round_keys esp,ecx,8
+ copy_round_keys esp,ecx,9
+ copy_round_keys esp,ecx,10
+ mov ecx,esp
+
+ align 16
+
+lp128encsingle_CBC:
+
+ movdqu xmm0, [esi]
+ add esi, 16
+ pxor xmm0, xmm1
+ movdqu xmm4,[ecx+0*16]
+ pxor xmm0, xmm4
+ aesenc1 [ecx+1*16]
+ aesenc1 [ecx+2*16]
+ aesenc1 [ecx+3*16]
+ aesenc1 [ecx+4*16]
+ aesenc1 [ecx+5*16]
+ aesenc1 [ecx+6*16]
+ aesenc1 [ecx+7*16]
+ aesenc1 [ecx+8*16]
+ aesenc1 [ecx+9*16]
+ aesenclast1 [ecx+10*16]
+ ; Store output encrypted data into CIPHERTEXT array
+ movdqu [esi+edi-16], xmm0
+ movdqa xmm1,xmm0
+ dec eax
+ jnz lp128encsingle_CBC
+
+
+ mov esp,ebp
+ pop ebp
+ pop edi
+ pop esi
+ mov ecx,[esp-4+8] ; first arg
+ mov ecx,[ecx+12]
+ movdqu [ecx],xmm1 ; store last iv for chaining
+
+ ret
+
+
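+; AES-192 CBC encryption; as _iEnc128_CBC but with 12 rounds.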
+align 16
+global _iEnc192_CBC
+_iEnc192_CBC:
+ mov ecx,[esp-4+8] ; first arg
+
+ push esi
+ push edi
+ push ebp
+ mov ebp,esp
+
+ sub esp,16*16
+ and esp,0xfffffff0
+
+ mov eax,[ecx+12]
+ movdqu xmm1,[eax] ;iv
+
+ mov eax,[ecx+16] ; numblocks
+ mov esi,[ecx]
+ mov edi,[ecx+4]
+ mov ecx,[ecx+8]
+ sub edi,esi
+
+ test ecx,0xf
+ jz lp192encsingle_CBC
+
+ copy_round_keys esp,ecx,0
+ copy_round_keys esp,ecx,1
+ copy_round_keys esp,ecx,2
+ copy_round_keys esp,ecx,3
+ copy_round_keys esp,ecx,4
+ copy_round_keys esp,ecx,5
+ copy_round_keys esp,ecx,6
+ copy_round_keys esp,ecx,7
+ copy_round_keys esp,ecx,8
+ copy_round_keys esp,ecx,9
+ copy_round_keys esp,ecx,10
+ copy_round_keys esp,ecx,11
+ copy_round_keys esp,ecx,12
+ mov ecx,esp
+
+ align 16
+
+lp192encsingle_CBC:
+
+ movdqu xmm0, [esi]
+ add esi, 16
+ pxor xmm0, xmm1
+ movdqu xmm4,[ecx+0*16]
+ pxor xmm0, xmm4
+ aesenc1 [ecx+1*16]
+ aesenc1 [ecx+2*16]
+ aesenc1 [ecx+3*16]
+ aesenc1 [ecx+4*16]
+ aesenc1 [ecx+5*16]
+ aesenc1 [ecx+6*16]
+ aesenc1 [ecx+7*16]
+ aesenc1 [ecx+8*16]
+ aesenc1 [ecx+9*16]
+ aesenc1 [ecx+10*16]
+ aesenc1 [ecx+11*16]
+ aesenclast1 [ecx+12*16]
+ ; Store output encrypted data into CIPHERTEXT array
+ movdqu [esi+edi-16], xmm0
+ movdqa xmm1,xmm0
+ dec eax
+ jnz lp192encsingle_CBC
+
+
+ mov esp,ebp
+ pop ebp
+ pop edi
+ pop esi
+ mov ecx,[esp-4+8] ; first arg
+ mov ecx,[ecx+12]
+ movdqu [ecx],xmm1 ; store last iv for chaining
+
+ ret
+
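+; AES-256 CBC encryption; as _iEnc128_CBC but with 14 rounds.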
+align 16
+global _iEnc256_CBC
+_iEnc256_CBC:
+ mov ecx,[esp-4+8] ; first arg
+
+ push esi
+ push edi
+ push ebp
+ mov ebp,esp
+
+ sub esp,16*16
+ and esp,0xfffffff0
+
+ mov eax,[ecx+12]
+ movdqu xmm1,[eax] ;iv
+
+ mov eax,[ecx+16] ; numblocks
+ mov esi,[ecx]
+ mov edi,[ecx+4]
+ mov ecx,[ecx+8]
+ sub edi,esi
+
+ test ecx,0xf
+ jz lp256encsingle_CBC
+
+ copy_round_keys esp,ecx,0
+ copy_round_keys esp,ecx,1
+ copy_round_keys esp,ecx,2
+ copy_round_keys esp,ecx,3
+ copy_round_keys esp,ecx,4
+ copy_round_keys esp,ecx,5
+ copy_round_keys esp,ecx,6
+ copy_round_keys esp,ecx,7
+ copy_round_keys esp,ecx,8
+ copy_round_keys esp,ecx,9
+ copy_round_keys esp,ecx,10
+ copy_round_keys esp,ecx,11
+ copy_round_keys esp,ecx,12
+ copy_round_keys esp,ecx,13
+ copy_round_keys esp,ecx,14
+ mov ecx,esp
+
+ align 16
+
+lp256encsingle_CBC:
+
+ movdqu xmm0, [esi]
+ add esi, 16
+ pxor xmm0, xmm1
+ movdqu xmm4,[ecx+0*16]
+ pxor xmm0, xmm4
+ aesenc1 [ecx+1*16]
+ aesenc1 [ecx+2*16]
+ aesenc1 [ecx+3*16]
+ aesenc1 [ecx+4*16]
+ aesenc1 [ecx+5*16]
+ aesenc1 [ecx+6*16]
+ aesenc1 [ecx+7*16]
+ aesenc1 [ecx+8*16]
+ aesenc1 [ecx+9*16]
+ aesenc1 [ecx+10*16]
+ aesenc1 [ecx+11*16]
+ aesenc1 [ecx+12*16]
+ aesenc1 [ecx+13*16]
+ aesenclast1 [ecx+14*16]
+ ; Store output encrypted data into CIPHERTEXT array
+ movdqu [esi+edi-16], xmm0
+ movdqa xmm1,xmm0
+ dec eax
+ jnz lp256encsingle_CBC
+
+
+ mov esp,ebp
+ pop ebp
+ pop edi
+ pop esi
+ mov ecx,[esp-4+8]
+ mov ecx,[ecx+12]
+ movdqu [ecx],xmm1 ; store last iv for chaining
+
+ ret
+
+
+
+
+
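+; AES-192 ECB encryption; as _iEnc128 but with 12 rounds.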
+align 16
+global _iEnc192
+_iEnc192:
+ mov ecx,[esp-4+8]
+
+ push esi
+ push edi
+ push ebp
+ mov ebp,esp
+
+ sub esp,16*16
+ and esp,0xfffffff0
+
+ mov eax,[ecx+16] ; numblocks
+ mov esi,[ecx]
+ mov edi,[ecx+4]
+ mov ecx,[ecx+8]
+
+ sub edi,esi
+
+ test eax,eax
+ jz end_enc192
+
+ cmp eax,4
+ jl lp192encsingle
+
+ test ecx,0xf
+ jz lpenc192four
+
+ copy_round_keys esp,ecx,0
+ copy_round_keys esp,ecx,1
+ copy_round_keys esp,ecx,2
+ copy_round_keys esp,ecx,3
+ copy_round_keys esp,ecx,4
+ copy_round_keys esp,ecx,5
+ copy_round_keys esp,ecx,6
+ copy_round_keys esp,ecx,7
+ copy_round_keys esp,ecx,8
+ copy_round_keys esp,ecx,9
+ copy_round_keys esp,ecx,10
+ copy_round_keys esp,ecx,11
+ copy_round_keys esp,ecx,12
+ mov ecx,esp
+
+ align 16
+
+lpenc192four:
+
+ test eax,eax
+ jz end_enc192
+
+ cmp eax,4
+ jl lp192encsingle
+
+ load_and_xor4 esi,[ecx+0*16]
+ add esi,4*16
+ aesenc4 [ecx+1*16]
+ aesenc4 [ecx+2*16]
+ aesenc4 [ecx+3*16]
+ aesenc4 [ecx+4*16]
+ aesenc4 [ecx+5*16]
+ aesenc4 [ecx+6*16]
+ aesenc4 [ecx+7*16]
+ aesenc4 [ecx+8*16]
+ aesenc4 [ecx+9*16]
+ aesenc4 [ecx+10*16]
+ aesenc4 [ecx+11*16]
+ aesenclast4 [ecx+12*16]
+
+ store4 esi+edi-16*4
+ sub eax,4
+ jmp lpenc192four
+
+ align 16
+lp192encsingle:
+
+ movdqu xmm0, [esi]
+ add esi, 16
+ movdqu xmm4,[ecx+0*16]
+ pxor xmm0, xmm4
+ aesenc1_u [ecx+1*16]
+ aesenc1_u [ecx+2*16]
+ aesenc1_u [ecx+3*16]
+ aesenc1_u [ecx+4*16]
+ aesenc1_u [ecx+5*16]
+ aesenc1_u [ecx+6*16]
+ aesenc1_u [ecx+7*16]
+ aesenc1_u [ecx+8*16]
+ aesenc1_u [ecx+9*16]
+ aesenc1_u [ecx+10*16]
+ aesenc1_u [ecx+11*16]
+ aesenclast1_u [ecx+12*16]
+ ; Store output encrypted data into CIPHERTEXT array
+ movdqu [esi+edi-16], xmm0
+ dec eax
+ jnz lp192encsingle
+
+end_enc192:
+
+
+ mov esp,ebp
+ pop ebp
+ pop edi
+ pop esi
+
+ ret
+
+
+
+
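+; AES-256 ECB encryption; as _iEnc128 but with 14 rounds.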
+align 16
+global _iEnc256
+_iEnc256:
+ mov ecx,[esp-4+8]
+
+ push esi
+ push edi
+ push ebp
+ mov ebp,esp
+
+ sub esp,16*16
+ and esp,0xfffffff0
+
+ mov eax,[ecx+16] ; numblocks
+ mov esi,[ecx]
+ mov edi,[ecx+4]
+ mov ecx,[ecx+8]
+
+ sub edi,esi
+
+ test eax,eax
+ jz end_enc256
+
+ cmp eax,4
+ jl lp256enc
+
+ test ecx,0xf
+ jz lp256enc4
+
+ copy_round_keys esp,ecx,0
+ copy_round_keys esp,ecx,1
+ copy_round_keys esp,ecx,2
+ copy_round_keys esp,ecx,3
+ copy_round_keys esp,ecx,4
+ copy_round_keys esp,ecx,5
+ copy_round_keys esp,ecx,6
+ copy_round_keys esp,ecx,7
+ copy_round_keys esp,ecx,8
+ copy_round_keys esp,ecx,9
+ copy_round_keys esp,ecx,10
+ copy_round_keys esp,ecx,11
+ copy_round_keys esp,ecx,12
+ copy_round_keys esp,ecx,13
+ copy_round_keys esp,ecx,14
+ mov ecx,esp
+
+
+
+ align 16
+
+lp256enc4:
+ test eax,eax
+ jz end_enc256
+
+ cmp eax,4
+ jl lp256enc
+
+
+ load_and_xor4 esi,[ecx+0*16]
+ add esi, 16*4
+ aesenc4 [ecx+1*16]
+ aesenc4 [ecx+2*16]
+ aesenc4 [ecx+3*16]
+ aesenc4 [ecx+4*16]
+ aesenc4 [ecx+5*16]
+ aesenc4 [ecx+6*16]
+ aesenc4 [ecx+7*16]
+ aesenc4 [ecx+8*16]
+ aesenc4 [ecx+9*16]
+ aesenc4 [ecx+10*16]
+ aesenc4 [ecx+11*16]
+ aesenc4 [ecx+12*16]
+ aesenc4 [ecx+13*16]
+ aesenclast4 [ecx+14*16]
+
+ store4 esi+edi-16*4
+ sub eax,4
+ jmp lp256enc4
+
+ align 16
+lp256enc:
+
+ movdqu xmm0, [esi]
+ add esi, 16
+ movdqu xmm4,[ecx+0*16]
+ pxor xmm0, xmm4
+ aesenc1_u [ecx+1*16]
+ aesenc1_u [ecx+2*16]
+ aesenc1_u [ecx+3*16]
+ aesenc1_u [ecx+4*16]
+ aesenc1_u [ecx+5*16]
+ aesenc1_u [ecx+6*16]
+ aesenc1_u [ecx+7*16]
+ aesenc1_u [ecx+8*16]
+ aesenc1_u [ecx+9*16]
+ aesenc1_u [ecx+10*16]
+ aesenc1_u [ecx+11*16]
+ aesenc1_u [ecx+12*16]
+ aesenc1_u [ecx+13*16]
+ aesenclast1_u [ecx+14*16]
+
+ ; Store output encrypted data into CIPHERTEXT array
+ movdqu [esi+edi-16], xmm0
+ dec eax
+ jnz lp256enc
+
+end_enc256:
+
+
+ mov esp,ebp
+ pop ebp
+ pop edi
+ pop esi
+
+ ret