From 1429346822e56485acc4f4af4d1458e9df4115b9 Mon Sep 17 00:00:00 2001 From: Egor Tensin Date: Wed, 10 Jun 2015 05:13:30 +0300 Subject: src/aes*.asm -> src/asm/, src/aes*.c -> src/c/ --- CMakeLists.txt | 4 +- src/aes128.asm | 202 ------------------------------------- src/aes128.c | 99 ------------------- src/aes192.asm | 248 ---------------------------------------------- src/aes192.c | 134 ------------------------- src/aes256.asm | 286 ----------------------------------------------------- src/aes256.c | 162 ------------------------------ src/asm/aes128.asm | 202 +++++++++++++++++++++++++++++++++++++ src/asm/aes192.asm | 248 ++++++++++++++++++++++++++++++++++++++++++++++ src/asm/aes256.asm | 286 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/c/aes128.c | 99 +++++++++++++++++++ src/c/aes192.c | 134 +++++++++++++++++++++++++ src/c/aes256.c | 162 ++++++++++++++++++++++++++++++ 13 files changed, 1133 insertions(+), 1133 deletions(-) delete mode 100644 src/aes128.asm delete mode 100644 src/aes128.c delete mode 100644 src/aes192.asm delete mode 100644 src/aes192.c delete mode 100644 src/aes256.asm delete mode 100644 src/aes256.c create mode 100644 src/asm/aes128.asm create mode 100644 src/asm/aes192.asm create mode 100644 src/asm/aes256.asm create mode 100644 src/c/aes128.c create mode 100644 src/c/aes192.c create mode 100644 src/c/aes256.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 08d18c1..029dd51 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,9 +1,9 @@ project(libaesni C CXX ASM_MASM) file(GLOB_RECURSE ${PROJECT_NAME}_headers "include/*.h") if(AESNI_USE_ASM) - file(GLOB ${PROJECT_NAME}_sources src/aes128.asm src/aes192.asm src/aes256.asm src/buffer.c src/common.c) + file(GLOB ${PROJECT_NAME}_sources "src/asm/*.asm" "src/*.c") else() - file(GLOB ${PROJECT_NAME}_sources src/aes128.c src/aes192.c src/aes256.c src/buffer.c src/common.c) + file(GLOB ${PROJECT_NAME}_sources "src/c/*.c" "src/*.c") endif() add_library(${PROJECT_NAME} 
${${PROJECT_NAME}_headers} ${${PROJECT_NAME}_sources}) diff --git a/src/aes128.asm b/src/aes128.asm deleted file mode 100644 index b49de0e..0000000 --- a/src/aes128.asm +++ /dev/null @@ -1,202 +0,0 @@ -; Copyright 2015 Egor Tensin -; This file is licensed under the terms of the MIT License. -; See LICENSE.txt for details. - -.586 -.xmm -.model flat - -.code - -@raw_aes128_encrypt_block@20 proc - pxor xmm0, [ecx] - aesenc xmm0, [ecx + 10h] - aesenc xmm0, [ecx + 20h] - aesenc xmm0, [ecx + 30h] - aesenc xmm0, [ecx + 40h] - aesenc xmm0, [ecx + 50h] - aesenc xmm0, [ecx + 60h] - aesenc xmm0, [ecx + 70h] - aesenc xmm0, [ecx + 80h] - aesenc xmm0, [ecx + 90h] - aesenclast xmm0, [ecx + 0A0h] - ret -@raw_aes128_encrypt_block@20 endp - -@raw_aes128_decrypt_block@20 proc - pxor xmm0, [ecx] - aesdec xmm0, [ecx + 10h] - aesdec xmm0, [ecx + 20h] - aesdec xmm0, [ecx + 30h] - aesdec xmm0, [ecx + 40h] - aesdec xmm0, [ecx + 50h] - aesdec xmm0, [ecx + 60h] - aesdec xmm0, [ecx + 70h] - aesdec xmm0, [ecx + 80h] - aesdec xmm0, [ecx + 90h] - aesdeclast xmm0, [ecx + 0A0h] - ret -@raw_aes128_decrypt_block@20 endp - -@raw_aes128_expand_key_schedule@20 proc - ; A "word" (in terms of the FIPS 197 standard) is a 32-bit block. - ; Words are denoted by `w[N]`. - ; - ; A key schedule is composed of 10 "regular" keys and a dumb key for - ; the "whitening" step. - ; - ; A key schedule is thus composed of 44 "words". 
- ; The FIPS standard includes an algorithm to calculate these words via - ; a simple loop: - ; - ; i = 4 - ; while i < 44: - ; temp = w[i - 1] - ; if i % 4 == 0: - ; temp = SubWord(RotWord(temp))^Rcon - ; w[i] = w[i - 4]^temp - ; i = i + 1 - ; - ; The loop above may be unrolled like this: - ; - ; w[4] = SubWord(RotWord(w[3]))^Rcon^w[0] - ; w[5] = w[4]^w[1] - ; = SubWord(RotWord(w[3]))^Rcon^w[1]^w[0] - ; w[6] = w[5]^w[2] - ; = SubWord(RotWord(w[3]))^Rcon^w[2]^w[1]^w[0] - ; w[7] = w[6]^w[3] - ; = SubWord(RotWord(w[3]))^Rcon^w[3]^w[2]^w[1]^w[0] - ; w[8] = SubWord(RotWord(w[7]))^Rcon^w[4] - ; w[9] = w[8]^w[5] - ; = SubWord(RotWord(w[7]))^Rcon^w[5]^w[4] - ; w[10] = w[9]^w[6] - ; = SubWord(RotWord(w[7]))^Rcon^w[6]^w[5]^w[4] - ; w[11] = w[10]^w[7] - ; = SubWord(RotWord(w[7]))^Rcon^w[7]^w[6]^w[5]^w[4] - ; - ; ... and so on. - ; - ; The Intel AES-NI instruction set facilitates calculating SubWord - ; and RotWord using `aeskeygenassist`, which is used in this routine. - ; - ; Preconditions: - ; * xmm0[127:96] == w[3], - ; * xmm0[95:64] == w[2], - ; * xmm0[63:32] == w[1], - ; * xmm0[31:0] == w[0]. 
- - movdqa [ecx], xmm0 ; sets w[0], w[1], w[2], w[3] - add ecx, 10h ; ecx = &w[4] - - aeskeygenassist xmm7, xmm0, 01h ; xmm7[127:96] = RotWord(SubWord(w[3]))^Rcon - call aes128_keygen_assist ; sets w[4], w[5], w[6], w[7] - aeskeygenassist xmm7, xmm0, 02h ; xmm7[127:96] = RotWord(SubWord(w[7]))^Rcon - call aes128_keygen_assist ; sets w[8], w[9], w[10], w[11] - aeskeygenassist xmm7, xmm0, 04h ; xmm7[127:96] = RotWord(SubWord(w[11]))^Rcon - call aes128_keygen_assist ; sets w[12], w[13], w[14], w[15] - aeskeygenassist xmm7, xmm0, 08h ; xmm7[127:96] = RotWord(SubWord(w[15]))^Rcon - call aes128_keygen_assist ; sets w[16], w[17], w[18], w[19] - aeskeygenassist xmm7, xmm0, 10h ; xmm7[127:96] = RotWord(SubWord(w[19]))^Rcon - call aes128_keygen_assist ; sets w[20], w[21], w[22], w[23] - aeskeygenassist xmm7, xmm0, 20h ; xmm7[127:96] = RotWord(SubWord(w[23]))^Rcon - call aes128_keygen_assist ; sets w[24], w[25], w[26], w[27] - aeskeygenassist xmm7, xmm0, 40h ; xmm7[127:96] = RotWord(SubWord(w[27]))^Rcon - call aes128_keygen_assist ; sets w[28], w[29], w[30], w[31] - aeskeygenassist xmm7, xmm0, 80h ; xmm7[127:96] = RotWord(SubWord(w[31]))^Rcon - call aes128_keygen_assist ; sets w[32], w[33], w[34], w[35] - aeskeygenassist xmm7, xmm0, 1Bh ; xmm7[127:96] = RotWord(SubWord(w[35]))^Rcon - call aes128_keygen_assist ; sets w[36], w[37], w[38], w[39] - aeskeygenassist xmm7, xmm0, 36h ; xmm7[127:96] = RotWord(SubWord(w[39]))^Rcon - call aes128_keygen_assist ; sets w[40], w[41], w[42], w[43] - - ret - -aes128_keygen_assist: - ; Preconditions: - ; * xmm0[127:96] == w[i+3], - ; * xmm0[95:64] == w[i+2], - ; * xmm0[63:32] == w[i+1], - ; * xmm0[31:0] == w[i], - ; * xmm7[127:96] == RotWord(SubWord(w[i+3]))^Rcon, - ; * ecx == &w[i+4]. 
- ; - ; Postconditions: - ; * xmm0[127:96] == w[i+7] == RotWord(SubWord(w[i+3]))^Rcon^w[i+3]^w[i+2]^w[i+1]^w[i], - ; * xmm0[95:64] == w[i+6] == RotWord(SubWord(w[i+3]))^Rcon^w[i+2]^w[i+1]^w[i], - ; * xmm0[63:32] == w[i+5] == RotWord(SubWord(w[i+3]))^Rcon^w[i+1]^w[i], - ; * xmm0[31:0] == w[i+4] == RotWord(SubWord(w[i+3]))^Rcon^w[i], - ; * ecx == &w[i+8], - ; * the value in xmm6 is also modified. - - ; Calculate - ; w[i+3]^w[i+2]^w[i+1]^w[i], - ; w[i+2]^w[i+1]^w[i], - ; w[i+1]^w[i] and - ; w[i]. - movdqa xmm6, xmm0 ; xmm6 = xmm0 - pslldq xmm6, 4 ; xmm6 <<= 32 - pxor xmm0, xmm6 ; xmm0 ^= xmm6 - pslldq xmm6, 4 ; xmm6 <<= 32 - pxor xmm0, xmm6 ; xmm0 ^= xmm6 - pslldq xmm6, 4 ; xmm6 <<= 32 - pxor xmm0, xmm6 ; xmm0 ^= xmm6 - ; xmm0[127:96] == w[i+3]^w[i+2]^w[i+1]^w[i] - ; xmm0[95:64] == w[i+2]^w[i+1]^w[i] - ; xmm0[63:32] == w[i+1]^w[i] - ; xmm0[31:0] == w[i] - - ; Calculate - ; w[i+7] == RotWord(SubWord(w[i+3]))^Rcon^w[i+3]^w[i+2]^w[i+1]^w[i], - ; w[i+6] == RotWord(SubWord(w[i+3]))^Rcon^w[i+2]^w[i+1]^w[i], - ; w[i+5] == RotWord(SubWord(w[i+3]))^Rcon^w[i+1]^w[i] and - ; w[i+4] == RotWord(SubWord(w[i+3]))^Rcon^w[i]. - pshufd xmm6, xmm7, 0FFh ; xmm6[127:96] = xmm6[95:64] = xmm6[63:32] = xmm6[31:0] = xmm7[127:96] - pxor xmm0, xmm6 ; xmm0 ^= xmm6 - ; xmm0[127:96] == w[i+7] == RotWord(SubWord(w[i+3]))^Rcon^w[i+3]^w[i+2]^w[i+1]^w[i] - ; xmm0[95:64] == w[i+6] == RotWord(SubWord(w[i+3]))^Rcon^w[i+2]^w[i+1]^w[i] - ; xmm0[63:32] == w[i+5] == RotWord(SubWord(w[i+3]))^Rcon^w[i+1]^w[i] - ; xmm0[31:0] == w[i+4] == RotWord(SubWord(w[i+3]))^Rcon^w[i] - - ; Set w[i+4], w[i+5], w[i+6] and w[i+7]. 
- movdqa [ecx], xmm0 ; w[i+7] = RotWord(SubWord(w[i+3]))^Rcon^w[i+3]^w[i+2]^w[i+1]^w[i] - ; w[i+6] = RotWord(SubWord(w[i+3]))^Rcon^w[i+2]^w[i+1]^w[i] - ; w[i+5] = RotWord(SubWord(w[i+3]))^Rcon^w[i+1]^w[i] - ; w[i+4] = RotWord(SubWord(w[i+3]))^Rcon^w[i] - add ecx, 10h ; ecx = &w[i+8] - - ret -@raw_aes128_expand_key_schedule@20 endp - -@raw_aes128_invert_key_schedule@8 proc - movdqa xmm7, [ecx] - movdqa xmm6, [ecx + 0A0h] - movdqa [edx], xmm6 - movdqa [edx + 0A0h], xmm7 - - aesimc xmm7, [ecx + 10h] - aesimc xmm6, [ecx + 90h] - movdqa [edx + 10h], xmm6 - movdqa [edx + 90h], xmm7 - - aesimc xmm7, [ecx + 20h] - aesimc xmm6, [ecx + 80h] - movdqa [edx + 20h], xmm6 - movdqa [edx + 80h], xmm7 - - aesimc xmm7, [ecx + 30h] - aesimc xmm6, [ecx + 70h] - movdqa [edx + 30h], xmm6 - movdqa [edx + 70h], xmm7 - - aesimc xmm7, [ecx + 40h] - aesimc xmm6, [ecx + 60h] - movdqa [edx + 40h], xmm6 - movdqa [edx + 60h], xmm7 - - aesimc xmm7, [ecx + 50h] - movdqa [edx + 50h], xmm7 - - ret -@raw_aes128_invert_key_schedule@8 endp - -end diff --git a/src/aes128.c b/src/aes128.c deleted file mode 100644 index d4c609d..0000000 --- a/src/aes128.c +++ /dev/null @@ -1,99 +0,0 @@ -/** - * \file - * \author Egor Tensin - * \date 2015 - * \copyright This file is licensed under the terms of the MIT License. - * See LICENSE.txt for details. 
- */ - -#include - -#include -#include - -AesBlock128 __fastcall raw_aes128_encrypt_block( - AesBlock128 plain, - Aes128KeySchedule* key_schedule) -{ - plain = _mm_xor_si128(plain, key_schedule->keys[0]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[1]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[2]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[3]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[4]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[5]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[6]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[7]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[8]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[9]); - return _mm_aesenclast_si128(plain, key_schedule->keys[10]); -} - -AesBlock128 __fastcall raw_aes128_decrypt_block( - AesBlock128 cipher, - Aes128KeySchedule* inverted_schedule) -{ - cipher = _mm_xor_si128(cipher, inverted_schedule->keys[0]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[1]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[2]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[3]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[4]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[5]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[6]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[7]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[8]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[9]); - return _mm_aesdeclast_si128(cipher, inverted_schedule->keys[10]); -} - -static AesBlock128 __fastcall aes128_keygen_assist( - AesBlock128 prev, - AesBlock128 hwgen) -{ - AesBlock128 tmp = prev; - - tmp = _mm_slli_si128(tmp, 4); - prev = _mm_xor_si128(prev, tmp); - tmp = _mm_slli_si128(tmp, 4); - prev = _mm_xor_si128(prev, tmp); - tmp = _mm_slli_si128(tmp, 4); - prev = _mm_xor_si128(prev, tmp); - - hwgen = _mm_shuffle_epi32(hwgen, 0xff); 
- prev = _mm_xor_si128(prev, hwgen); - - return prev; -} - -void __fastcall raw_aes128_expand_key_schedule( - AesBlock128 key, - Aes128KeySchedule* key_schedule) -{ - AesBlock128 prev = key_schedule->keys[0] = key; - prev = key_schedule->keys[1] = aes128_keygen_assist(prev, _mm_aeskeygenassist_si128(prev, 0x01)); - prev = key_schedule->keys[2] = aes128_keygen_assist(prev, _mm_aeskeygenassist_si128(prev, 0x02)); - prev = key_schedule->keys[3] = aes128_keygen_assist(prev, _mm_aeskeygenassist_si128(prev, 0x04)); - prev = key_schedule->keys[4] = aes128_keygen_assist(prev, _mm_aeskeygenassist_si128(prev, 0x08)); - prev = key_schedule->keys[5] = aes128_keygen_assist(prev, _mm_aeskeygenassist_si128(prev, 0x10)); - prev = key_schedule->keys[6] = aes128_keygen_assist(prev, _mm_aeskeygenassist_si128(prev, 0x20)); - prev = key_schedule->keys[7] = aes128_keygen_assist(prev, _mm_aeskeygenassist_si128(prev, 0x40)); - prev = key_schedule->keys[8] = aes128_keygen_assist(prev, _mm_aeskeygenassist_si128(prev, 0x80)); - prev = key_schedule->keys[9] = aes128_keygen_assist(prev, _mm_aeskeygenassist_si128(prev, 0x1b)); - prev = key_schedule->keys[10] = aes128_keygen_assist(prev, _mm_aeskeygenassist_si128(prev, 0x36)); -} - -void __fastcall raw_aes128_invert_key_schedule( - Aes128KeySchedule* key_schedule, - Aes128KeySchedule* inverted_schedule) -{ - inverted_schedule->keys[0] = key_schedule->keys[10]; - inverted_schedule->keys[1] = _mm_aesimc_si128(key_schedule->keys[9]); - inverted_schedule->keys[2] = _mm_aesimc_si128(key_schedule->keys[8]); - inverted_schedule->keys[3] = _mm_aesimc_si128(key_schedule->keys[7]); - inverted_schedule->keys[4] = _mm_aesimc_si128(key_schedule->keys[6]); - inverted_schedule->keys[5] = _mm_aesimc_si128(key_schedule->keys[5]); - inverted_schedule->keys[6] = _mm_aesimc_si128(key_schedule->keys[4]); - inverted_schedule->keys[7] = _mm_aesimc_si128(key_schedule->keys[3]); - inverted_schedule->keys[8] = _mm_aesimc_si128(key_schedule->keys[2]); - 
inverted_schedule->keys[9] = _mm_aesimc_si128(key_schedule->keys[1]); - inverted_schedule->keys[10] = key_schedule->keys[0]; -} diff --git a/src/aes192.asm b/src/aes192.asm deleted file mode 100644 index 5cc8ded..0000000 --- a/src/aes192.asm +++ /dev/null @@ -1,248 +0,0 @@ -; Copyright 2015 Egor Tensin -; This file is licensed under the terms of the MIT License. -; See LICENSE.txt for details. - -.586 -.xmm -.model flat - -.code - -@raw_aes192_encrypt_block@20 proc - pxor xmm0, [ecx] - aesenc xmm0, [ecx + 10h] - aesenc xmm0, [ecx + 20h] - aesenc xmm0, [ecx + 30h] - aesenc xmm0, [ecx + 40h] - aesenc xmm0, [ecx + 50h] - aesenc xmm0, [ecx + 60h] - aesenc xmm0, [ecx + 70h] - aesenc xmm0, [ecx + 80h] - aesenc xmm0, [ecx + 90h] - aesenc xmm0, [ecx + 0A0h] - aesenc xmm0, [ecx + 0B0h] - aesenclast xmm0, [ecx + 0C0h] - ret -@raw_aes192_encrypt_block@20 endp - -@raw_aes192_decrypt_block@20 proc - pxor xmm0, [ecx] - aesdec xmm0, [ecx + 10h] - aesdec xmm0, [ecx + 20h] - aesdec xmm0, [ecx + 30h] - aesdec xmm0, [ecx + 40h] - aesdec xmm0, [ecx + 50h] - aesdec xmm0, [ecx + 60h] - aesdec xmm0, [ecx + 70h] - aesdec xmm0, [ecx + 80h] - aesdec xmm0, [ecx + 90h] - aesdec xmm0, [ecx + 0A0h] - aesdec xmm0, [ecx + 0B0h] - aesdeclast xmm0, [ecx + 0C0h] - ret -@raw_aes192_decrypt_block@20 endp - -@raw_aes192_expand_key_schedule@36 proc - ; A "word" (in terms of the FIPS 197 standard) is a 32-bit block. - ; Words are denoted by `w[N]`. - ; - ; A key schedule is composed of 12 "regular" keys and a dumb key for - ; the "whitening" step. - ; - ; A key schedule is thus composed of 52 "words". 
- ; The FIPS standard includes an algorithm to calculate these words via - ; a simple loop: - ; - ; i = 6 - ; while i < 52: - ; temp = w[i - 1] - ; if i % 6 == 0: - ; temp = SubWord(RotWord(temp))^Rcon - ; w[i] = w[i - 6]^temp - ; i = i + 1 - ; - ; The loop above may be unrolled like this: - ; - ; w[6] = SubWord(RotWord(w[5]))^Rcon^w[0] - ; w[7] = w[6]^w[1] - ; = SubWord(RotWord(w[5]))^Rcon^w[0]^w[1] - ; w[8] = w[7]^w[2] - ; = SubWord(RotWord(w[5]))^Rcon^w[0]^w[1]^w[2] - ; w[9] = w[8]^w[3] - ; = SubWord(RotWord(w[5]))^Rcon^w[0]^w[1]^w[2]^w[3] - ; w[10] = w[9]^w[4] - ; = SubWord(RotWord(w[5]))^Rcon^w[0]^w[1]^w[2]^w[3]^w[4] - ; w[11] = w[10]^w[5] - ; = SubWord(RotWord(w[5]))^Rcon^w[0]^w[1]^w[2]^w[3]^w[4]^w[5] - ; w[12] = SubWord(RotWord(w[11]))^Rcon^w[6] - ; w[13] = w[12]^w[7] - ; = SubWord(RotWord(w[11]))^Rcon^w[6]^w[7] - ; w[14] = w[13]^w[8] - ; = SubWord(RotWord(w[11]))^Rcon^w[6]^w[7]^w[8] - ; w[15] = w[14]^w[9] - ; = SubWord(RotWord(w[11]))^Rcon^w[6]^w[7]^w[8]^w[9] - ; w[16] = w[15]^w[10] - ; = SubWord(RotWord(w[11]))^Rcon^w[6]^w[7]^w[8]^w[9]^w[10] - ; w[17] = w[16]^w[11] - ; = SubWord(RotWord(w[11]))^Rcon^w[6]^w[7]^w[8]^w[9]^w[10]^w[11] - ; - ; ... and so on. - ; - ; The Intel AES-NI instruction set facilitates calculating SubWord - ; and RotWord using `aeskeygenassist`, which is used in this routine. - ; - ; Preconditions: - ; * xmm1[63:32] == w[5], - ; * xmm1[31:0] == w[4], - ; * xmm0[127:96] == w[3], - ; * xmm0[95:64] == w[2], - ; * xmm0[63:32] == w[1], - ; * xmm0[31:0] == w[0]. 
- - movdqa [ecx], xmm0 ; sets w[0], w[1], w[2], w[3] - movq qword ptr [ecx + 10h], xmm1 ; sets w[4], w[5] - - aeskeygenassist xmm7, xmm1, 1 ; xmm7[63:32] = RotWord(SubWord(w[5]))^Rcon, - call aes192_keygen_assist - movdqu [ecx + 18h], xmm0 - movq qword ptr [ecx + 28h], xmm1 - aeskeygenassist xmm7, xmm1, 2 ; xmm7[63:32] = RotWord(SubWord(w[11]))^Rcon - call aes192_keygen_assist - movdqa [ecx + 30h], xmm0 - movq qword ptr [ecx + 40h], xmm1 - aeskeygenassist xmm7, xmm1, 4 ; xmm7[63:32] = RotWord(SubWord(w[17]))^Rcon - call aes192_keygen_assist - movdqu [ecx + 48h], xmm0 - movq qword ptr [ecx + 58h], xmm1 - aeskeygenassist xmm7, xmm1, 8 ; xmm7[63:32] = RotWord(SubWord(w[23]))^Rcon - call aes192_keygen_assist - movdqa [ecx + 60h], xmm0 - movq qword ptr [ecx + 70h], xmm1 - aeskeygenassist xmm7, xmm1, 10h ; xmm7[63:32] = RotWord(SubWord(w[29]))^Rcon - call aes192_keygen_assist - movdqu [ecx + 78h], xmm0 - movq qword ptr [ecx + 88h], xmm1 - aeskeygenassist xmm7, xmm1, 20h ; xmm7[63:32] = RotWord(SubWord(w[35]))^Rcon - call aes192_keygen_assist - movdqa [ecx + 90h], xmm0 - movq qword ptr [ecx + 0a0h], xmm1 - aeskeygenassist xmm7, xmm1, 40h ; xmm7[63:32] = RotWord(SubWord(w[41]))^Rcon - call aes192_keygen_assist - movdqu [ecx + 0a8h], xmm0 - movq qword ptr [ecx + 0b8h], xmm1 - aeskeygenassist xmm7, xmm1, 80h ; xmm7[63:32] = RotWord(SubWord(w[47]))^Rcon - call aes192_keygen_assist - movdqa [ecx + 0c0h], xmm0 - - ret - -aes192_keygen_assist: - ; Preconditions: - ; * xmm1[127:96] == 0, - ; * xmm1[95:64] == 0, - ; * xmm1[63:32] == w[i+5], - ; * xmm1[31:0] == w[i+4], - ; * xmm0[127:96] == w[i+3], - ; * xmm0[95:64] == w[i+2], - ; * xmm0[63:32] == w[i+1], - ; * xmm0[31:0] == w[i], - ; * xmm7[63:32] == RotWord(SubWord(w[i+5]))^Rcon. 
- ; - ; Postconditions: - ; * xmm1[127:96] == 0, - ; * xmm1[95:64] == 0, - ; * xmm1[63:32] == w[i+11] == RotWord(SubWord(w[i+5]))^Rcon^w[i+5]^w[i+4]^w[i+3]^w[i+2]^w[i+1]^w[i], - ; * xmm1[31:0] == w[i+10] == RotWord(SubWord(w[i+5]))^Rcon^w[i+4]^w[i+3]^w[i+2]^w[i+1]^w[i], - ; * xmm0[127:96] == w[i+9] == RotWord(SubWord(w[i+5]))^Rcon^w[i+3]^w[i+2]^w[i+1]^w[i], - ; * xmm0[95:64] == w[i+8] == RotWord(SubWord(w[i+5]))^Rcon^w[i+2]^w[i+1]^w[i], - ; * xmm0[63:32] == w[i+7] == RotWord(SubWord(w[i+5]))^Rcon^w[i+1]^w[i], - ; * xmm0[31:0] == w[i+6] == RotWord(SubWord(w[i+5]))^Rcon^w[i], - ; * the value in xmm6 is also modified. - - ; Calculate - ; w[i+3]^w[i+2]^w[i+1]^w[i], - ; w[i+2]^w[i+1]^w[i], - ; w[i+1]^w[i] and - ; w[i]. - movdqa xmm6, xmm0 ; xmm6 = xmm0 - pslldq xmm6, 4 ; xmm6 <<= 32 - pxor xmm0, xmm6 ; xmm0 ^= xmm6 - pslldq xmm6, 4 ; xmm6 <<= 32 - pxor xmm0, xmm6 ; xmm0 ^= xmm6 - pslldq xmm6, 4 ; xmm6 <<= 32 - pxor xmm0, xmm6 ; xmm0 ^= xmm6 - ; xmm0[127:96] == w[i+3]^w[i+2]^w[i+1]^w[i] - ; xmm0[95:64] == w[i+2]^w[i+1]^w[i] - ; xmm0[63:32] == w[i+1]^w[i] - ; xmm0[31:0] == w[i] - - ; Calculate - ; w[i+9] == RotWord(SubWord(w[i+5]))^Rcon^w[i+3]^w[i+2]^w[i+1]^w[i], - ; w[i+8] == RotWord(SubWord(w[i+5]))^Rcon^w[i+2]^w[i+1]^w[i], - ; w[i+7] == RotWord(SubWord(w[i+5]))^Rcon^w[i+1]^w[i] and - ; w[i+6] == RotWord(SubWord(w[i+5]))^Rcon^w[i]. - pshufd xmm6, xmm7, 55h ; xmm6[127:96] = xmm6[95:64] = xmm6[63:32] = xmm6[31:0] = xmm7[63:32] - pxor xmm0, xmm6 ; xmm0 ^= xmm6 - ; xmm0[127:96] == w[i+9] == RotWord(SubWord(w[i+5]))^Rcon^w[i+3]^w[i+2]^w[i+1]^w[i] - ; xmm0[95:64] == w[i+8] == RotWord(SubWord(w[i+5]))^Rcon^w[i+2]^w[i+1]^w[i] - ; xmm0[63:32] == w[i+7] == RotWord(SubWord(w[i+5]))^Rcon^w[i+1]^w[i] - ; xmm0[31:0] == w[i+6] == RotWord(SubWord(w[i+5]))^Rcon^w[i] - - ; Calculate - ; w[i+5]^w[i+4], - ; w[i+4]. 
- pshufd xmm6, xmm1, 0F3h ; xmm6 = xmm1[31:0] << 32 - pxor xmm1, xmm6 ; xmm1 ^= xmm6 - ; xmm1[63:32] == w[i+5]^w[i+4] - ; xmm1[31:0] == w[i+4] - - ; Calculate - ; w[i+11] == RotWord(SubWord(w[i+5]))^Rcon^w[i+5]^w[i+4]^w[i+3]^w[i+2]^w[i+1]^w[i], - ; w[i+10] == RotWord(SubWord(w[i+5]))^Rcon^w[i+4]^w[i+3]^w[i+2]^w[i+1]^w[i]. - pshufd xmm6, xmm0, 0FFh ; xmm6[127:96] = xmm6[95:64] = xmm6[63:32] = xmm6[31:0] = xmm0[127:96] - psrldq xmm6, 8 ; xmm6 >>= 64 - pxor xmm1, xmm6 ; xmm1 ^= xmm6 - ; xmm1[63:32] == w[i+11] == RotWord(SubWord(w[i+5]))^Rcon^w[i+5]^w[i+4]^w[i+3]^w[i+2]^w[i+1]^w[i] - ; xmm1[31:0] == w[i+10] == RotWord(SubWord(w[i+5]))^Rcon^w[i+4]^w[i+3]^w[i+2]^w[i+1]^w[i] - - ret -@raw_aes192_expand_key_schedule@36 endp - -@raw_aes192_invert_key_schedule@8 proc - movdqa xmm7, [ecx] - movdqa xmm6, [ecx + 0C0h] - movdqa [edx], xmm6 - movdqa [edx + 0C0h], xmm7 - - aesimc xmm7, [ecx + 10h] - aesimc xmm6, [ecx + 0B0h] - movdqa [edx + 10h], xmm6 - movdqa [edx + 0B0h], xmm7 - - aesimc xmm7, [ecx + 20h] - aesimc xmm6, [ecx + 0A0h] - movdqa [edx + 20h], xmm6 - movdqa [edx + 0A0h], xmm7 - - aesimc xmm7, [ecx + 30h] - aesimc xmm6, [ecx + 90h] - movdqa [edx + 30h], xmm6 - movdqa [edx + 90h], xmm7 - - aesimc xmm7, [ecx + 40h] - aesimc xmm6, [ecx + 80h] - movdqa [edx + 40h], xmm6 - movdqa [edx + 80h], xmm7 - - aesimc xmm7, [ecx + 50h] - aesimc xmm6, [ecx + 70h] - movdqa [edx + 50h], xmm6 - movdqa [edx + 70h], xmm7 - - aesimc xmm7, [ecx + 60h] - movdqa [edx + 60h], xmm7 - - ret -@raw_aes192_invert_key_schedule@8 endp - -end diff --git a/src/aes192.c b/src/aes192.c deleted file mode 100644 index fec8f06..0000000 --- a/src/aes192.c +++ /dev/null @@ -1,134 +0,0 @@ -/** - * \file - * \author Egor Tensin - * \date 2015 - * \copyright This file is licensed under the terms of the MIT License. - * See LICENSE.txt for details. 
- */ - -#include - -#include -#include - -AesBlock128 __fastcall raw_aes192_encrypt_block( - AesBlock128 plain, - Aes192KeySchedule* key_schedule) -{ - plain = _mm_xor_si128(plain, key_schedule->keys[0]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[1]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[2]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[3]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[4]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[5]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[6]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[7]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[8]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[9]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[10]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[11]); - return _mm_aesenclast_si128(plain, key_schedule->keys[12]); -} - -AesBlock128 __fastcall raw_aes192_decrypt_block( - AesBlock128 cipher, - Aes192KeySchedule* inverted_schedule) -{ - cipher = _mm_xor_si128(cipher, inverted_schedule->keys[0]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[1]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[2]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[3]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[4]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[5]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[6]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[7]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[8]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[9]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[10]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[11]); - return _mm_aesdeclast_si128(cipher, inverted_schedule->keys[12]); -} - -static void __fastcall aes192_keygen_assist( - AesBlock128* prev_lo, - AesBlock128* prev_hi, - AesBlock128 hwgen) -{ - 
AesBlock128 tmp = *prev_lo; - - tmp = _mm_slli_si128(tmp, 4); - *prev_lo = _mm_xor_si128(*prev_lo, tmp); - tmp = _mm_slli_si128(tmp, 4); - *prev_lo = _mm_xor_si128(*prev_lo, tmp); - tmp = _mm_slli_si128(tmp, 4); - *prev_lo = _mm_xor_si128(*prev_lo, tmp); - - hwgen = _mm_shuffle_epi32(hwgen, 0x55); - *prev_lo = _mm_xor_si128(*prev_lo, hwgen); - - tmp = _mm_shuffle_epi32(*prev_hi, 0xf3); - *prev_hi = _mm_xor_si128(*prev_hi, tmp); - - tmp = _mm_shuffle_epi32(*prev_lo, 0xff); - tmp = _mm_srli_si128(tmp, 8); - *prev_hi = _mm_xor_si128(*prev_hi, tmp); -} - -void __fastcall raw_aes192_expand_key_schedule( - AesBlock128 key_lo, - AesBlock128 key_hi, - Aes192KeySchedule* key_schedule) -{ - key_schedule->keys[0] = key_lo; - key_schedule->keys[1] = key_hi; - - aes192_keygen_assist(&key_lo, &key_hi, _mm_aeskeygenassist_si128(key_hi, 0x01)); - key_schedule->keys[1] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(key_schedule->keys[1]), _mm_castsi128_pd(key_lo), 0)); - key_schedule->keys[2] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(key_lo), _mm_castsi128_pd(key_hi), 1)); - - aes192_keygen_assist(&key_lo, &key_hi, _mm_aeskeygenassist_si128(key_hi, 0x02)); - key_schedule->keys[3] = key_lo; - key_schedule->keys[4] = key_hi; - - aes192_keygen_assist(&key_lo, &key_hi, _mm_aeskeygenassist_si128(key_hi, 0x04)); - key_schedule->keys[4] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(key_schedule->keys[4]), _mm_castsi128_pd(key_lo), 0)); - key_schedule->keys[5] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(key_lo), _mm_castsi128_pd(key_hi), 1)); - - aes192_keygen_assist(&key_lo, &key_hi, _mm_aeskeygenassist_si128(key_hi, 0x08)); - key_schedule->keys[6] = key_lo; - key_schedule->keys[7] = key_hi; - - aes192_keygen_assist(&key_lo, &key_hi, _mm_aeskeygenassist_si128(key_hi, 0x10)); - key_schedule->keys[7] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(key_schedule->keys[7]), _mm_castsi128_pd(key_lo), 0)); - key_schedule->keys[8] = 
_mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(key_lo), _mm_castsi128_pd(key_hi), 1)); - - aes192_keygen_assist(&key_lo, &key_hi, _mm_aeskeygenassist_si128(key_hi, 0x20)); - key_schedule->keys[9] = key_lo; - key_schedule->keys[10] = key_hi; - - aes192_keygen_assist(&key_lo, &key_hi, _mm_aeskeygenassist_si128(key_hi, 0x40)); - key_schedule->keys[10] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(key_schedule->keys[10]), _mm_castsi128_pd(key_lo), 0)); - key_schedule->keys[11] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(key_lo), _mm_castsi128_pd(key_hi), 1)); - - aes192_keygen_assist(&key_lo, &key_hi, _mm_aeskeygenassist_si128(key_hi, 0x80)); - key_schedule->keys[12] = key_lo; -} - -void __fastcall raw_aes192_invert_key_schedule( - Aes192KeySchedule* key_schedule, - Aes192KeySchedule* inverted_schedule) -{ - inverted_schedule->keys[0] = key_schedule->keys[12]; - inverted_schedule->keys[1] = _mm_aesimc_si128(key_schedule->keys[11]); - inverted_schedule->keys[2] = _mm_aesimc_si128(key_schedule->keys[10]); - inverted_schedule->keys[3] = _mm_aesimc_si128(key_schedule->keys[9]); - inverted_schedule->keys[4] = _mm_aesimc_si128(key_schedule->keys[8]); - inverted_schedule->keys[5] = _mm_aesimc_si128(key_schedule->keys[7]); - inverted_schedule->keys[6] = _mm_aesimc_si128(key_schedule->keys[6]); - inverted_schedule->keys[7] = _mm_aesimc_si128(key_schedule->keys[5]); - inverted_schedule->keys[8] = _mm_aesimc_si128(key_schedule->keys[4]); - inverted_schedule->keys[9] = _mm_aesimc_si128(key_schedule->keys[3]); - inverted_schedule->keys[10] = _mm_aesimc_si128(key_schedule->keys[2]); - inverted_schedule->keys[11] = _mm_aesimc_si128(key_schedule->keys[1]); - inverted_schedule->keys[12] = key_schedule->keys[0]; -} diff --git a/src/aes256.asm b/src/aes256.asm deleted file mode 100644 index 413e67b..0000000 --- a/src/aes256.asm +++ /dev/null @@ -1,286 +0,0 @@ -; Copyright 2015 Egor Tensin -; This file is licensed under the terms of the MIT License. 
-; See LICENSE.txt for details. - -.586 -.xmm -.model flat - -.code - -@raw_aes256_encrypt_block@20 proc - pxor xmm0, [ecx] - aesenc xmm0, [ecx + 10h] - aesenc xmm0, [ecx + 20h] - aesenc xmm0, [ecx + 30h] - aesenc xmm0, [ecx + 40h] - aesenc xmm0, [ecx + 50h] - aesenc xmm0, [ecx + 60h] - aesenc xmm0, [ecx + 70h] - aesenc xmm0, [ecx + 80h] - aesenc xmm0, [ecx + 90h] - aesenc xmm0, [ecx + 0A0h] - aesenc xmm0, [ecx + 0B0h] - aesenc xmm0, [ecx + 0C0h] - aesenc xmm0, [ecx + 0D0h] - aesenclast xmm0, [ecx + 0E0h] - ret -@raw_aes256_encrypt_block@20 endp - -@raw_aes256_decrypt_block@20 proc - pxor xmm0, [ecx] - aesdec xmm0, [ecx + 10h] - aesdec xmm0, [ecx + 20h] - aesdec xmm0, [ecx + 30h] - aesdec xmm0, [ecx + 40h] - aesdec xmm0, [ecx + 50h] - aesdec xmm0, [ecx + 60h] - aesdec xmm0, [ecx + 70h] - aesdec xmm0, [ecx + 80h] - aesdec xmm0, [ecx + 90h] - aesdec xmm0, [ecx + 0A0h] - aesdec xmm0, [ecx + 0B0h] - aesdec xmm0, [ecx + 0C0h] - aesdec xmm0, [ecx + 0D0h] - aesdeclast xmm0, [ecx + 0E0h] - ret -@raw_aes256_decrypt_block@20 endp - -@raw_aes256_expand_key_schedule@36 proc - ; A "word" (in terms of the FIPS 197 standard) is a 32-bit block. - ; Words are denoted by `w[N]`. - ; - ; A key schedule is composed of 14 "regular" keys and a dumb key for - ; the "whitening" step. - ; - ; A key schedule is thus composed of 60 "words". 
- ; The FIPS standard includes an algorithm to calculate these words via - ; a simple loop: - ; - ; i = 8 - ; while i < 60: - ; temp = w[i - 1] - ; if i % 8 == 0: - ; temp = SubWord(RotWord(temp))^Rcon - ; elif i % 8 == 4: - ; temp = SubWord(temp) - ; w[i] = w[i - 8]^temp - ; i = i + 1 - ; - ; The loop above may be unrolled like this: - ; - ; w[8] = SubWord(RotWord(w[7]))^Rcon^w[0] - ; w[9] = w[8]^w[1] - ; = SubWord(RotWord(w[7]))^Rcon^w[1]^w[0] - ; w[10] = w[9]^w[2] - ; = SubWord(RotWord(w[7]))^Rcon^w[2]^w[1]^w[0] - ; w[11] = w[10]^w[3] - ; = SubWord(RotWord(w[7]))^Rcon^w[3]^w[2]^w[1]^w[0] - ; w[12] = SubWord(w[11])^w[4] - ; w[13] = w[12]^w[5] - ; = SubWord(w[11])^w[5]^w[4] - ; w[14] = w[13]^w[6] - ; = SubWord(w[11])^w[6]^w[5]^w[4] - ; w[15] = w[14]^w[7] - ; = SubWord(w[11])^w[7]^w[6]^w[5]^w[4] - ; w[16] = SubWord(RotWord(w[15]))^Rcon^w[8] - ; w[17] = w[16]^w[9] - ; = SubWord(RotWord(w[15]))^Rcon^w[9]^w[8] - ; w[18] = w[17]^w[10] - ; = SubWord(RotWord(w[15]))^Rcon^w[10]^w[9]^w[8] - ; w[19] = w[18]^w[11] - ; = SubWord(RotWord(w[15]))^Rcon^w[11]^w[10]^w[9]^w[8] - ; w[20] = SubWord(w[19])^w[12] - ; w[21] = w[20]^w[13] - ; = SubWord(w[19])^w[13]^w[12] - ; w[22] = w[21]^w[14] - ; = SubWord(w[19])^w[14]^w[13]^w[12] - ; w[23] = w[22]^w[15] - ; = SubWord(w[19])^w[15]^w[14]^w[13]^w[12] - ; - ; ... and so on. - ; - ; The Intel AES-NI instruction set facilitates calculating SubWord - ; and RotWord using `aeskeygenassist`, which is used in this routine. - ; - ; Preconditions: - ; * xmm1[127:96] == w[7], - ; * xmm1[95:64] == w[6], - ; * xmm1[63:32] == w[5], - ; * xmm1[31:0] == w[4], - ; * xmm0[127:96] == w[3], - ; * xmm0[95:64] == w[2], - ; * xmm0[63:32] == w[1], - ; * xmm0[31:0] == w[0]. 
- - movdqa [ecx], xmm0 ; sets w[0], w[1], w[2], w[3] - movdqa [ecx + 10h], xmm1 ; sets w[4], w[5], w[6], w[7] - lea ecx, [ecx + 20h] ; ecx = &w[8] - - aeskeygenassist xmm7, xmm1, 1h ; xmm7[127:96] = RotWord(SubWord(w[7]))^Rcon - pshufd xmm7, xmm7, 0FFh ; xmm7[95:64] = xmm7[63:32] = xmm7[31:0] = xmm7[127:96] - call aes256_keygen_assist ; sets w[8], w[9], w[10], w[11] - - aeskeygenassist xmm7, xmm1, 0 ; xmm7[95:64] = SubWord(w[11]) - pshufd xmm7, xmm7, 0AAh ; xmm7[127:96] = xmm7[63:32] = xmm7[31:0] = xmm7[95:64] - call aes256_keygen_assist ; sets w[12], w[13], w[14], w[15] - - aeskeygenassist xmm7, xmm1, 2h ; xmm7[127:96] = RotWord(SubWord(w[15]))^Rcon - pshufd xmm7, xmm7, 0FFh ; xmm7[95:64] = xmm7[63:32] = xmm7[31:0] = xmm7[127:96] - call aes256_keygen_assist ; sets w[16], w[17], w[18], w[19] - - aeskeygenassist xmm7, xmm1, 0 ; xmm7[95:64] = SubWord(w[19]) - pshufd xmm7, xmm7, 0AAh ; xmm7[127:96] = xmm7[63:32] = xmm7[31:0] = xmm7[95:64] - call aes256_keygen_assist ; sets w[20], w[21], w[22], w[23] - - aeskeygenassist xmm7, xmm1, 4h ; xmm7[127:96] = RotWord(SubWord(w[23]))^Rcon - pshufd xmm7, xmm7, 0FFh ; xmm7[95:64] = xmm7[63:32] = xmm7[31:0] = xmm7[127:96] - call aes256_keygen_assist ; sets w[24], w[25], w[26], w[27] - - aeskeygenassist xmm7, xmm1, 0 ; xmm7[95:64] = SubWord(w[27]) - pshufd xmm7, xmm7, 0AAh ; xmm7[127:96] = xmm7[63:32] = xmm7[31:0] = xmm7[95:64] - call aes256_keygen_assist ; sets w[28], w[29], w[30], w[31] - - aeskeygenassist xmm7, xmm1, 8h ; xmm7[127:96] = RotWord(SubWord(w[31]))^Rcon - pshufd xmm7, xmm7, 0FFh ; xmm7[95:64] = xmm7[63:32] = xmm7[31:0] = xmm7[127:96] - call aes256_keygen_assist ; sets w[32], w[33], w[34], w[35] - - aeskeygenassist xmm7, xmm1, 0 ; xmm7[95:64] = SubWord(w[35]) - pshufd xmm7, xmm7, 0AAh ; xmm7[127:96] = xmm7[63:32] = xmm7[31:0] = xmm7[95:64] - call aes256_keygen_assist ; sets w[36], w[37], w[38], w[39] - - aeskeygenassist xmm7, xmm1, 10h ; xmm7[127:96] = RotWord(SubWord(w[39]))^Rcon - pshufd xmm7, xmm7, 0FFh ; 
xmm7[95:64] = xmm7[63:32] = xmm7[31:0] = xmm7[127:96] - call aes256_keygen_assist ; sets w[40], w[41], w[42], w[43] - - aeskeygenassist xmm7, xmm1, 0 ; xmm7[95:64] = SubWord(w[43]) - pshufd xmm7, xmm7, 0AAh ; xmm7[127:96] = xmm7[63:32] = xmm7[31:0] = xmm7[95:64] - call aes256_keygen_assist ; sets w[44], w[45], w[46], w[47] - - aeskeygenassist xmm7, xmm1, 20h ; xmm7[127:96] = RotWord(SubWord(w[47]))^Rcon - pshufd xmm7, xmm7, 0FFh ; xmm7[95:64] = xmm7[63:32] = xmm7[31:0] = xmm7[127:96] - call aes256_keygen_assist ; sets w[48], w[49], w[50], w[51] - - aeskeygenassist xmm7, xmm1, 0 ; xmm7[95:64] = SubWord(w[51]) - pshufd xmm7, xmm7, 0AAh ; xmm7[127:96] = xmm7[63:32] = xmm7[31:0] = xmm7[95:64] - call aes256_keygen_assist ; sets w[52], w[53], w[54], w[55] - - aeskeygenassist xmm7, xmm1, 40h ; xmm7[127:96] = RotWord(SubWord(w[55]))^Rcon - pshufd xmm7, xmm7, 0FFh ; xmm7[95:64] = xmm7[63:32] = xmm7[31:0] = xmm7[127:96] - call aes256_keygen_assist ; sets w[56], w[57], w[58], w[59] - - ret - -aes256_keygen_assist: - ; Preconditions: - ; * xmm1[127:96] == w[i+7], - ; * xmm1[95:64] == w[i+6], - ; * xmm1[63:32] == w[i+5], - ; * xmm1[31:0] == w[i+4], - ; * xmm0[127:96] == w[i+3], - ; * xmm0[95:64] == w[i+2], - ; * xmm0[63:32] == w[i+1], - ; * xmm0[31:0] == w[i], - ; * xmm7[127:96] == xmm7[95:64] == xmm7[63:32] == xmm7[31:0] == HWGEN, - ; where HWGEN is either RotWord(SubWord(w[i+7]))^Rcon or SubWord(w[i+7]), - ; depending on the number of the round being processed, - ; * ecx == &w[i+8]. - ; - ; Postconditions: - ; * xmm1[127:96] == w[i+11] == HWGEN^w[i+3]^w[i+2]^w[i+1]^w[i], - ; * xmm1[95:64] == w[i+10] == HWGEN^w[i+2]^w[i+1]^w[i], - ; * xmm1[63:32] == w[i+9] == HWGEN^w[i+1]^w[i], - ; * xmm1[31:0] == w[i+8] == HWGEN^w[i], - ; * xmm0[127:96] == w[i+7], - ; * xmm0[95:64] == w[i+6], - ; * xmm0[63:32] == w[i+5], - ; * xmm0[31:0] == w[i+4], - ; * ecx == &w[i+12], - ; * the value in xmm6 is also modified. 
- - ; Calculate - ; w[i+3]^w[i+2]^w[i+1]^w[i], - ; w[i+2]^w[i+1]^w[i], - ; w[i+1]^w[i] and - ; w[i]. - movdqa xmm6, xmm0 ; xmm6 = xmm0 - pslldq xmm6, 4 ; xmm6 <<= 32 - pxor xmm0, xmm6 ; xmm0 ^= xmm6 - pslldq xmm6, 4 ; xmm6 <<= 32 - pxor xmm0, xmm6 ; xmm0 ^= xmm6 - pslldq xmm6, 4 ; xmm6 <<= 32 - pxor xmm0, xmm6 ; xmm0 ^= xmm6 - ; xmm0[127:96] == w[i+3]^w[i+2]^w[i+1]^w[i] - ; xmm0[95:64] == w[i+2]^w[i+1]^w[i] - ; xmm0[63:32] == w[i+1]^w[i] - ; xmm0[31:0] == w[i] - - ; Calculate - ; HWGEN^w[i+3]^w[i+2]^w[i+1]^w[i], - ; HWGEN^w[i+2]^w[i+1]^w[i], - ; HWGEN^w[i+1]^w[i] and - ; HWGEN^w[i]. - pxor xmm0, xmm7 ; xmm0 ^= xmm7 - ; xmm0[127:96] == w[i+11] == HWGEN^w[i+3]^w[i+2]^w[i+1]^w[i] - ; xmm0[95:64] == w[i+10] == HWGEN^w[i+2]^w[i+1]^w[i] - ; xmm0[63:32] == w[i+9] == HWGEN^w[i+1]^w[i] - ; xmm0[31:0] == w[i+8] == HWGEN^w[i] - - ; Set w[i+8], w[i+9], w[i+10] and w[i+11]. - movdqa [ecx], xmm0 ; w[i+8] = HWGEN^w[i] - ; w[i+9] = HWGEN^w[i+1]^w[i] - ; w[i+10] = HWGEN^w[i+2]^w[i+1]^w[i] - ; w[i+11] = HWGEN^w[i+3]^w[i+2]^w[i+1]^w[i] - add ecx, 10h ; ecx = &w[i+12] - - ; Swap the values in xmm0 and xmm1. 
- pxor xmm0, xmm1 - pxor xmm1, xmm0 - pxor xmm0, xmm1 - - ret -@raw_aes256_expand_key_schedule@36 endp - -@raw_aes256_invert_key_schedule@8 proc - movdqa xmm7, [ecx] - movdqa xmm6, [ecx + 0E0h] - movdqa [edx], xmm6 - movdqa [edx + 0E0h], xmm7 - - aesimc xmm7, [ecx + 10h] - aesimc xmm6, [ecx + 0D0h] - movdqa [edx + 10h], xmm6 - movdqa [edx + 0D0h], xmm7 - - aesimc xmm7, [ecx + 20h] - aesimc xmm6, [ecx + 0C0h] - movdqa [edx + 20h], xmm6 - movdqa [edx + 0C0h], xmm7 - - aesimc xmm7, [ecx + 30h] - aesimc xmm6, [ecx + 0B0h] - movdqa [edx + 30h], xmm6 - movdqa [edx + 0B0h], xmm7 - - aesimc xmm7, [ecx + 40h] - aesimc xmm6, [ecx + 0A0h] - movdqa [edx + 40h], xmm6 - movdqa [edx + 0A0h], xmm7 - - aesimc xmm7, [ecx + 50h] - aesimc xmm6, [ecx + 90h] - movdqa [edx + 50h], xmm6 - movdqa [edx + 90h], xmm7 - - aesimc xmm7, [ecx + 60h] - aesimc xmm6, [ecx + 80h] - movdqa [edx + 60h], xmm6 - movdqa [edx + 80h], xmm7 - - aesimc xmm7, [ecx + 70h] - movdqa [edx + 70h], xmm7 - - ret -@raw_aes256_invert_key_schedule@8 endp - -end diff --git a/src/aes256.c b/src/aes256.c deleted file mode 100644 index be4f783..0000000 --- a/src/aes256.c +++ /dev/null @@ -1,162 +0,0 @@ -/** - * \file - * \author Egor Tensin - * \date 2015 - * \copyright This file is licensed under the terms of the MIT License. - * See LICENSE.txt for details. 
- */ - -#include - -#include -#include - -AesBlock128 __fastcall raw_aes256_encrypt_block( - AesBlock128 plain, - Aes256KeySchedule* key_schedule) -{ - plain = _mm_xor_si128(plain, key_schedule->keys[0]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[1]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[2]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[3]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[4]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[5]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[6]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[7]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[8]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[9]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[10]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[11]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[12]); - plain = _mm_aesenc_si128(plain, key_schedule->keys[13]); - return _mm_aesenclast_si128(plain, key_schedule->keys[14]); -} - -AesBlock128 __fastcall raw_aes256_decrypt_block( - AesBlock128 cipher, - Aes256KeySchedule* inverted_schedule) -{ - cipher = _mm_xor_si128(cipher, inverted_schedule->keys[0]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[1]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[2]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[3]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[4]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[5]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[6]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[7]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[8]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[9]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[10]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[11]); - cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[12]); - cipher 
= _mm_aesdec_si128(cipher, inverted_schedule->keys[13]); - return _mm_aesdeclast_si128(cipher, inverted_schedule->keys[14]); -} - -static AesBlock128 __fastcall aes256_keygen_assist( - AesBlock128* prev_lo, - AesBlock128* prev_hi, - AesBlock128 hwgen) -{ - AesBlock128 tmp = *prev_lo; - - tmp = _mm_slli_si128(tmp, 4); - *prev_lo = _mm_xor_si128(*prev_lo, tmp); - tmp = _mm_slli_si128(tmp, 4); - *prev_lo = _mm_xor_si128(*prev_lo, tmp); - tmp = _mm_slli_si128(tmp, 4); - *prev_lo = _mm_xor_si128(*prev_lo, tmp); - - *prev_lo = _mm_xor_si128(*prev_lo, hwgen); - - *prev_hi = _mm_xor_si128(*prev_hi, *prev_lo); - *prev_lo = _mm_xor_si128(*prev_lo, *prev_hi); - *prev_hi = _mm_xor_si128(*prev_hi, *prev_lo); - - return *prev_hi; -} - -void __fastcall raw_aes256_expand_key_schedule( - AesBlock128 key_lo, - AesBlock128 key_hi, - Aes256KeySchedule* key_schedule) -{ - AesBlock128 prev_lo, prev_hi; - AesBlock128 hwgen; - - prev_lo = key_schedule->keys[0] = key_lo; - prev_hi = key_schedule->keys[1] = key_hi; - - hwgen = _mm_aeskeygenassist_si128(prev_hi, 0x01); - hwgen = _mm_shuffle_epi32(hwgen, 0xff); - key_schedule->keys[2] = aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); - - hwgen = _mm_aeskeygenassist_si128(prev_hi, 0); - hwgen = _mm_shuffle_epi32(hwgen, 0xaa); - key_schedule->keys[3] = aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); - - hwgen = _mm_aeskeygenassist_si128(prev_hi, 0x02); - hwgen = _mm_shuffle_epi32(hwgen, 0xff); - key_schedule->keys[4] = aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); - - hwgen = _mm_aeskeygenassist_si128(prev_hi, 0); - hwgen = _mm_shuffle_epi32(hwgen, 0xaa); - key_schedule->keys[5] = aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); - - hwgen = _mm_aeskeygenassist_si128(prev_hi, 0x04); - hwgen = _mm_shuffle_epi32(hwgen, 0xff); - key_schedule->keys[6] = aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); - - hwgen = _mm_aeskeygenassist_si128(prev_hi, 0); - hwgen = _mm_shuffle_epi32(hwgen, 0xaa); - key_schedule->keys[7] = 
aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); - - hwgen = _mm_aeskeygenassist_si128(prev_hi, 0x08); - hwgen = _mm_shuffle_epi32(hwgen, 0xff); - key_schedule->keys[8] = aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); - - hwgen = _mm_aeskeygenassist_si128(prev_hi, 0); - hwgen = _mm_shuffle_epi32(hwgen, 0xaa); - key_schedule->keys[9] = aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); - - hwgen = _mm_aeskeygenassist_si128(prev_hi, 0x10); - hwgen = _mm_shuffle_epi32(hwgen, 0xff); - key_schedule->keys[10] = aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); - - hwgen = _mm_aeskeygenassist_si128(prev_hi, 0); - hwgen = _mm_shuffle_epi32(hwgen, 0xaa); - key_schedule->keys[11] = aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); - - hwgen = _mm_aeskeygenassist_si128(prev_hi, 0x20); - hwgen = _mm_shuffle_epi32(hwgen, 0xff); - key_schedule->keys[12] = aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); - - hwgen = _mm_aeskeygenassist_si128(prev_hi, 0); - hwgen = _mm_shuffle_epi32(hwgen, 0xaa); - key_schedule->keys[13] = aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); - - hwgen = _mm_aeskeygenassist_si128(prev_hi, 0x40); - hwgen = _mm_shuffle_epi32(hwgen, 0xff); - key_schedule->keys[14] = aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); -} - -void __fastcall raw_aes256_invert_key_schedule( - Aes256KeySchedule* key_schedule, - Aes256KeySchedule* inverted_schedule) -{ - inverted_schedule->keys[0] = key_schedule->keys[14]; - inverted_schedule->keys[1] = _mm_aesimc_si128(key_schedule->keys[13]); - inverted_schedule->keys[2] = _mm_aesimc_si128(key_schedule->keys[12]); - inverted_schedule->keys[3] = _mm_aesimc_si128(key_schedule->keys[11]); - inverted_schedule->keys[4] = _mm_aesimc_si128(key_schedule->keys[10]); - inverted_schedule->keys[5] = _mm_aesimc_si128(key_schedule->keys[9]); - inverted_schedule->keys[6] = _mm_aesimc_si128(key_schedule->keys[8]); - inverted_schedule->keys[7] = _mm_aesimc_si128(key_schedule->keys[7]); - inverted_schedule->keys[8] = 
_mm_aesimc_si128(key_schedule->keys[6]); - inverted_schedule->keys[9] = _mm_aesimc_si128(key_schedule->keys[5]); - inverted_schedule->keys[10] = _mm_aesimc_si128(key_schedule->keys[4]); - inverted_schedule->keys[11] = _mm_aesimc_si128(key_schedule->keys[3]); - inverted_schedule->keys[12] = _mm_aesimc_si128(key_schedule->keys[2]); - inverted_schedule->keys[13] = _mm_aesimc_si128(key_schedule->keys[1]); - inverted_schedule->keys[14] = key_schedule->keys[0]; -} diff --git a/src/asm/aes128.asm b/src/asm/aes128.asm new file mode 100644 index 0000000..b49de0e --- /dev/null +++ b/src/asm/aes128.asm @@ -0,0 +1,202 @@ +; Copyright 2015 Egor Tensin +; This file is licensed under the terms of the MIT License. +; See LICENSE.txt for details. + +.586 +.xmm +.model flat + +.code + +@raw_aes128_encrypt_block@20 proc + pxor xmm0, [ecx] + aesenc xmm0, [ecx + 10h] + aesenc xmm0, [ecx + 20h] + aesenc xmm0, [ecx + 30h] + aesenc xmm0, [ecx + 40h] + aesenc xmm0, [ecx + 50h] + aesenc xmm0, [ecx + 60h] + aesenc xmm0, [ecx + 70h] + aesenc xmm0, [ecx + 80h] + aesenc xmm0, [ecx + 90h] + aesenclast xmm0, [ecx + 0A0h] + ret +@raw_aes128_encrypt_block@20 endp + +@raw_aes128_decrypt_block@20 proc + pxor xmm0, [ecx] + aesdec xmm0, [ecx + 10h] + aesdec xmm0, [ecx + 20h] + aesdec xmm0, [ecx + 30h] + aesdec xmm0, [ecx + 40h] + aesdec xmm0, [ecx + 50h] + aesdec xmm0, [ecx + 60h] + aesdec xmm0, [ecx + 70h] + aesdec xmm0, [ecx + 80h] + aesdec xmm0, [ecx + 90h] + aesdeclast xmm0, [ecx + 0A0h] + ret +@raw_aes128_decrypt_block@20 endp + +@raw_aes128_expand_key_schedule@20 proc + ; A "word" (in terms of the FIPS 197 standard) is a 32-bit block. + ; Words are denoted by `w[N]`. + ; + ; A key schedule is composed of 10 "regular" keys and a dumb key for + ; the "whitening" step. + ; + ; A key schedule is thus composed of 44 "words".
+ ; The FIPS standard includes an algorithm to calculate these words via + ; a simple loop: + ; + ; i = 4 + ; while i < 44: + ; temp = w[i - 1] + ; if i % 4 == 0: + ; temp = SubWord(RotWord(temp))^Rcon + ; w[i] = w[i - 4]^temp + ; i = i + 1 + ; + ; The loop above may be unrolled like this: + ; + ; w[4] = SubWord(RotWord(w[3]))^Rcon^w[0] + ; w[5] = w[4]^w[1] + ; = SubWord(RotWord(w[3]))^Rcon^w[1]^w[0] + ; w[6] = w[5]^w[2] + ; = SubWord(RotWord(w[3]))^Rcon^w[2]^w[1]^w[0] + ; w[7] = w[6]^w[3] + ; = SubWord(RotWord(w[3]))^Rcon^w[3]^w[2]^w[1]^w[0] + ; w[8] = SubWord(RotWord(w[7]))^Rcon^w[4] + ; w[9] = w[8]^w[5] + ; = SubWord(RotWord(w[7]))^Rcon^w[5]^w[4] + ; w[10] = w[9]^w[6] + ; = SubWord(RotWord(w[7]))^Rcon^w[6]^w[5]^w[4] + ; w[11] = w[10]^w[7] + ; = SubWord(RotWord(w[7]))^Rcon^w[7]^w[6]^w[5]^w[4] + ; + ; ... and so on. + ; + ; The Intel AES-NI instruction set facilitates calculating SubWord + ; and RotWord using `aeskeygenassist`, which is used in this routine. + ; + ; Preconditions: + ; * xmm0[127:96] == w[3], + ; * xmm0[95:64] == w[2], + ; * xmm0[63:32] == w[1], + ; * xmm0[31:0] == w[0]. 
+ + movdqa [ecx], xmm0 ; sets w[0], w[1], w[2], w[3] + add ecx, 10h ; ecx = &w[4] + + aeskeygenassist xmm7, xmm0, 01h ; xmm7[127:96] = RotWord(SubWord(w[3]))^Rcon + call aes128_keygen_assist ; sets w[4], w[5], w[6], w[7] + aeskeygenassist xmm7, xmm0, 02h ; xmm7[127:96] = RotWord(SubWord(w[7]))^Rcon + call aes128_keygen_assist ; sets w[8], w[9], w[10], w[11] + aeskeygenassist xmm7, xmm0, 04h ; xmm7[127:96] = RotWord(SubWord(w[11]))^Rcon + call aes128_keygen_assist ; sets w[12], w[13], w[14], w[15] + aeskeygenassist xmm7, xmm0, 08h ; xmm7[127:96] = RotWord(SubWord(w[15]))^Rcon + call aes128_keygen_assist ; sets w[16], w[17], w[18], w[19] + aeskeygenassist xmm7, xmm0, 10h ; xmm7[127:96] = RotWord(SubWord(w[19]))^Rcon + call aes128_keygen_assist ; sets w[20], w[21], w[22], w[23] + aeskeygenassist xmm7, xmm0, 20h ; xmm7[127:96] = RotWord(SubWord(w[23]))^Rcon + call aes128_keygen_assist ; sets w[24], w[25], w[26], w[27] + aeskeygenassist xmm7, xmm0, 40h ; xmm7[127:96] = RotWord(SubWord(w[27]))^Rcon + call aes128_keygen_assist ; sets w[28], w[29], w[30], w[31] + aeskeygenassist xmm7, xmm0, 80h ; xmm7[127:96] = RotWord(SubWord(w[31]))^Rcon + call aes128_keygen_assist ; sets w[32], w[33], w[34], w[35] + aeskeygenassist xmm7, xmm0, 1Bh ; xmm7[127:96] = RotWord(SubWord(w[35]))^Rcon + call aes128_keygen_assist ; sets w[36], w[37], w[38], w[39] + aeskeygenassist xmm7, xmm0, 36h ; xmm7[127:96] = RotWord(SubWord(w[39]))^Rcon + call aes128_keygen_assist ; sets w[40], w[41], w[42], w[43] + + ret + +aes128_keygen_assist: + ; Preconditions: + ; * xmm0[127:96] == w[i+3], + ; * xmm0[95:64] == w[i+2], + ; * xmm0[63:32] == w[i+1], + ; * xmm0[31:0] == w[i], + ; * xmm7[127:96] == RotWord(SubWord(w[i+3]))^Rcon, + ; * ecx == &w[i+4]. 
+ ; + ; Postconditions: + ; * xmm0[127:96] == w[i+7] == RotWord(SubWord(w[i+3]))^Rcon^w[i+3]^w[i+2]^w[i+1]^w[i], + ; * xmm0[95:64] == w[i+6] == RotWord(SubWord(w[i+3]))^Rcon^w[i+2]^w[i+1]^w[i], + ; * xmm0[63:32] == w[i+5] == RotWord(SubWord(w[i+3]))^Rcon^w[i+1]^w[i], + ; * xmm0[31:0] == w[i+4] == RotWord(SubWord(w[i+3]))^Rcon^w[i], + ; * ecx == &w[i+8], + ; * the value in xmm6 is also modified. + + ; Calculate + ; w[i+3]^w[i+2]^w[i+1]^w[i], + ; w[i+2]^w[i+1]^w[i], + ; w[i+1]^w[i] and + ; w[i]. + movdqa xmm6, xmm0 ; xmm6 = xmm0 + pslldq xmm6, 4 ; xmm6 <<= 32 + pxor xmm0, xmm6 ; xmm0 ^= xmm6 + pslldq xmm6, 4 ; xmm6 <<= 32 + pxor xmm0, xmm6 ; xmm0 ^= xmm6 + pslldq xmm6, 4 ; xmm6 <<= 32 + pxor xmm0, xmm6 ; xmm0 ^= xmm6 + ; xmm0[127:96] == w[i+3]^w[i+2]^w[i+1]^w[i] + ; xmm0[95:64] == w[i+2]^w[i+1]^w[i] + ; xmm0[63:32] == w[i+1]^w[i] + ; xmm0[31:0] == w[i] + + ; Calculate + ; w[i+7] == RotWord(SubWord(w[i+3]))^Rcon^w[i+3]^w[i+2]^w[i+1]^w[i], + ; w[i+6] == RotWord(SubWord(w[i+3]))^Rcon^w[i+2]^w[i+1]^w[i], + ; w[i+5] == RotWord(SubWord(w[i+3]))^Rcon^w[i+1]^w[i] and + ; w[i+4] == RotWord(SubWord(w[i+3]))^Rcon^w[i]. + pshufd xmm6, xmm7, 0FFh ; xmm6[127:96] = xmm6[95:64] = xmm6[63:32] = xmm6[31:0] = xmm7[127:96] + pxor xmm0, xmm6 ; xmm0 ^= xmm6 + ; xmm0[127:96] == w[i+7] == RotWord(SubWord(w[i+3]))^Rcon^w[i+3]^w[i+2]^w[i+1]^w[i] + ; xmm0[95:64] == w[i+6] == RotWord(SubWord(w[i+3]))^Rcon^w[i+2]^w[i+1]^w[i] + ; xmm0[63:32] == w[i+5] == RotWord(SubWord(w[i+3]))^Rcon^w[i+1]^w[i] + ; xmm0[31:0] == w[i+4] == RotWord(SubWord(w[i+3]))^Rcon^w[i] + + ; Set w[i+4], w[i+5], w[i+6] and w[i+7]. 
+ movdqa [ecx], xmm0 ; w[i+7] = RotWord(SubWord(w[i+3]))^Rcon^w[i+3]^w[i+2]^w[i+1]^w[i] + ; w[i+6] = RotWord(SubWord(w[i+3]))^Rcon^w[i+2]^w[i+1]^w[i] + ; w[i+5] = RotWord(SubWord(w[i+3]))^Rcon^w[i+1]^w[i] + ; w[i+4] = RotWord(SubWord(w[i+3]))^Rcon^w[i] + add ecx, 10h ; ecx = &w[i+8] + + ret +@raw_aes128_expand_key_schedule@20 endp + +@raw_aes128_invert_key_schedule@8 proc + movdqa xmm7, [ecx] + movdqa xmm6, [ecx + 0A0h] + movdqa [edx], xmm6 + movdqa [edx + 0A0h], xmm7 + + aesimc xmm7, [ecx + 10h] + aesimc xmm6, [ecx + 90h] + movdqa [edx + 10h], xmm6 + movdqa [edx + 90h], xmm7 + + aesimc xmm7, [ecx + 20h] + aesimc xmm6, [ecx + 80h] + movdqa [edx + 20h], xmm6 + movdqa [edx + 80h], xmm7 + + aesimc xmm7, [ecx + 30h] + aesimc xmm6, [ecx + 70h] + movdqa [edx + 30h], xmm6 + movdqa [edx + 70h], xmm7 + + aesimc xmm7, [ecx + 40h] + aesimc xmm6, [ecx + 60h] + movdqa [edx + 40h], xmm6 + movdqa [edx + 60h], xmm7 + + aesimc xmm7, [ecx + 50h] + movdqa [edx + 50h], xmm7 + + ret +@raw_aes128_invert_key_schedule@8 endp + +end diff --git a/src/asm/aes192.asm b/src/asm/aes192.asm new file mode 100644 index 0000000..5cc8ded --- /dev/null +++ b/src/asm/aes192.asm @@ -0,0 +1,248 @@ +; Copyright 2015 Egor Tensin +; This file is licensed under the terms of the MIT License. +; See LICENSE.txt for details. 
+ +.586 +.xmm +.model flat + +.code + +@raw_aes192_encrypt_block@20 proc + pxor xmm0, [ecx] + aesenc xmm0, [ecx + 10h] + aesenc xmm0, [ecx + 20h] + aesenc xmm0, [ecx + 30h] + aesenc xmm0, [ecx + 40h] + aesenc xmm0, [ecx + 50h] + aesenc xmm0, [ecx + 60h] + aesenc xmm0, [ecx + 70h] + aesenc xmm0, [ecx + 80h] + aesenc xmm0, [ecx + 90h] + aesenc xmm0, [ecx + 0A0h] + aesenc xmm0, [ecx + 0B0h] + aesenclast xmm0, [ecx + 0C0h] + ret +@raw_aes192_encrypt_block@20 endp + +@raw_aes192_decrypt_block@20 proc + pxor xmm0, [ecx] + aesdec xmm0, [ecx + 10h] + aesdec xmm0, [ecx + 20h] + aesdec xmm0, [ecx + 30h] + aesdec xmm0, [ecx + 40h] + aesdec xmm0, [ecx + 50h] + aesdec xmm0, [ecx + 60h] + aesdec xmm0, [ecx + 70h] + aesdec xmm0, [ecx + 80h] + aesdec xmm0, [ecx + 90h] + aesdec xmm0, [ecx + 0A0h] + aesdec xmm0, [ecx + 0B0h] + aesdeclast xmm0, [ecx + 0C0h] + ret +@raw_aes192_decrypt_block@20 endp + +@raw_aes192_expand_key_schedule@36 proc + ; A "word" (in terms of the FIPS 197 standard) is a 32-bit block. + ; Words are denoted by `w[N]`. + ; + ; A key schedule is composed of 12 "regular" keys and a dumb key for + ; the "whitening" step. + ; + ; A key schedule is thus composed of 52 "words".
+ ; The FIPS standard includes an algorithm to calculate these words via + ; a simple loop: + ; + ; i = 6 + ; while i < 52: + ; temp = w[i - 1] + ; if i % 6 == 0: + ; temp = SubWord(RotWord(temp))^Rcon + ; w[i] = w[i - 6]^temp + ; i = i + 1 + ; + ; The loop above may be unrolled like this: + ; + ; w[6] = SubWord(RotWord(w[5]))^Rcon^w[0] + ; w[7] = w[6]^w[1] + ; = SubWord(RotWord(w[5]))^Rcon^w[0]^w[1] + ; w[8] = w[7]^w[2] + ; = SubWord(RotWord(w[5]))^Rcon^w[0]^w[1]^w[2] + ; w[9] = w[8]^w[3] + ; = SubWord(RotWord(w[5]))^Rcon^w[0]^w[1]^w[2]^w[3] + ; w[10] = w[9]^w[4] + ; = SubWord(RotWord(w[5]))^Rcon^w[0]^w[1]^w[2]^w[3]^w[4] + ; w[11] = w[10]^w[5] + ; = SubWord(RotWord(w[5]))^Rcon^w[0]^w[1]^w[2]^w[3]^w[4]^w[5] + ; w[12] = SubWord(RotWord(w[11]))^Rcon^w[6] + ; w[13] = w[12]^w[7] + ; = SubWord(RotWord(w[11]))^Rcon^w[6]^w[7] + ; w[14] = w[13]^w[8] + ; = SubWord(RotWord(w[11]))^Rcon^w[6]^w[7]^w[8] + ; w[15] = w[14]^w[9] + ; = SubWord(RotWord(w[11]))^Rcon^w[6]^w[7]^w[8]^w[9] + ; w[16] = w[15]^w[10] + ; = SubWord(RotWord(w[11]))^Rcon^w[6]^w[7]^w[8]^w[9]^w[10] + ; w[17] = w[16]^w[11] + ; = SubWord(RotWord(w[11]))^Rcon^w[6]^w[7]^w[8]^w[9]^w[10]^w[11] + ; + ; ... and so on. + ; + ; The Intel AES-NI instruction set facilitates calculating SubWord + ; and RotWord using `aeskeygenassist`, which is used in this routine. + ; + ; Preconditions: + ; * xmm1[63:32] == w[5], + ; * xmm1[31:0] == w[4], + ; * xmm0[127:96] == w[3], + ; * xmm0[95:64] == w[2], + ; * xmm0[63:32] == w[1], + ; * xmm0[31:0] == w[0].
+ + movdqa [ecx], xmm0 ; sets w[0], w[1], w[2], w[3] + movq qword ptr [ecx + 10h], xmm1 ; sets w[4], w[5] + + aeskeygenassist xmm7, xmm1, 1 ; xmm7[63:32] = RotWord(SubWord(w[5]))^Rcon + call aes192_keygen_assist + movdqu [ecx + 18h], xmm0 + movq qword ptr [ecx + 28h], xmm1 + aeskeygenassist xmm7, xmm1, 2 ; xmm7[63:32] = RotWord(SubWord(w[11]))^Rcon + call aes192_keygen_assist + movdqa [ecx + 30h], xmm0 + movq qword ptr [ecx + 40h], xmm1 + aeskeygenassist xmm7, xmm1, 4 ; xmm7[63:32] = RotWord(SubWord(w[17]))^Rcon + call aes192_keygen_assist + movdqu [ecx + 48h], xmm0 + movq qword ptr [ecx + 58h], xmm1 + aeskeygenassist xmm7, xmm1, 8 ; xmm7[63:32] = RotWord(SubWord(w[23]))^Rcon + call aes192_keygen_assist + movdqa [ecx + 60h], xmm0 + movq qword ptr [ecx + 70h], xmm1 + aeskeygenassist xmm7, xmm1, 10h ; xmm7[63:32] = RotWord(SubWord(w[29]))^Rcon + call aes192_keygen_assist + movdqu [ecx + 78h], xmm0 + movq qword ptr [ecx + 88h], xmm1 + aeskeygenassist xmm7, xmm1, 20h ; xmm7[63:32] = RotWord(SubWord(w[35]))^Rcon + call aes192_keygen_assist + movdqa [ecx + 90h], xmm0 + movq qword ptr [ecx + 0a0h], xmm1 + aeskeygenassist xmm7, xmm1, 40h ; xmm7[63:32] = RotWord(SubWord(w[41]))^Rcon + call aes192_keygen_assist + movdqu [ecx + 0a8h], xmm0 + movq qword ptr [ecx + 0b8h], xmm1 + aeskeygenassist xmm7, xmm1, 80h ; xmm7[63:32] = RotWord(SubWord(w[47]))^Rcon + call aes192_keygen_assist + movdqa [ecx + 0c0h], xmm0 + + ret + +aes192_keygen_assist: + ; Preconditions: + ; * xmm1[127:96] == 0, + ; * xmm1[95:64] == 0, + ; * xmm1[63:32] == w[i+5], + ; * xmm1[31:0] == w[i+4], + ; * xmm0[127:96] == w[i+3], + ; * xmm0[95:64] == w[i+2], + ; * xmm0[63:32] == w[i+1], + ; * xmm0[31:0] == w[i], + ; * xmm7[63:32] == RotWord(SubWord(w[i+5]))^Rcon.
+ ; + ; Postconditions: + ; * xmm1[127:96] == 0, + ; * xmm1[95:64] == 0, + ; * xmm1[63:32] == w[i+11] == RotWord(SubWord(w[i+5]))^Rcon^w[i+5]^w[i+4]^w[i+3]^w[i+2]^w[i+1]^w[i], + ; * xmm1[31:0] == w[i+10] == RotWord(SubWord(w[i+5]))^Rcon^w[i+4]^w[i+3]^w[i+2]^w[i+1]^w[i], + ; * xmm0[127:96] == w[i+9] == RotWord(SubWord(w[i+5]))^Rcon^w[i+3]^w[i+2]^w[i+1]^w[i], + ; * xmm0[95:64] == w[i+8] == RotWord(SubWord(w[i+5]))^Rcon^w[i+2]^w[i+1]^w[i], + ; * xmm0[63:32] == w[i+7] == RotWord(SubWord(w[i+5]))^Rcon^w[i+1]^w[i], + ; * xmm0[31:0] == w[i+6] == RotWord(SubWord(w[i+5]))^Rcon^w[i], + ; * the value in xmm6 is also modified. + + ; Calculate + ; w[i+3]^w[i+2]^w[i+1]^w[i], + ; w[i+2]^w[i+1]^w[i], + ; w[i+1]^w[i] and + ; w[i]. + movdqa xmm6, xmm0 ; xmm6 = xmm0 + pslldq xmm6, 4 ; xmm6 <<= 32 + pxor xmm0, xmm6 ; xmm0 ^= xmm6 + pslldq xmm6, 4 ; xmm6 <<= 32 + pxor xmm0, xmm6 ; xmm0 ^= xmm6 + pslldq xmm6, 4 ; xmm6 <<= 32 + pxor xmm0, xmm6 ; xmm0 ^= xmm6 + ; xmm0[127:96] == w[i+3]^w[i+2]^w[i+1]^w[i] + ; xmm0[95:64] == w[i+2]^w[i+1]^w[i] + ; xmm0[63:32] == w[i+1]^w[i] + ; xmm0[31:0] == w[i] + + ; Calculate + ; w[i+9] == RotWord(SubWord(w[i+5]))^Rcon^w[i+3]^w[i+2]^w[i+1]^w[i], + ; w[i+8] == RotWord(SubWord(w[i+5]))^Rcon^w[i+2]^w[i+1]^w[i], + ; w[i+7] == RotWord(SubWord(w[i+5]))^Rcon^w[i+1]^w[i] and + ; w[i+6] == RotWord(SubWord(w[i+5]))^Rcon^w[i]. + pshufd xmm6, xmm7, 55h ; xmm6[127:96] = xmm6[95:64] = xmm6[63:32] = xmm6[31:0] = xmm7[63:32] + pxor xmm0, xmm6 ; xmm0 ^= xmm6 + ; xmm0[127:96] == w[i+9] == RotWord(SubWord(w[i+5]))^Rcon^w[i+3]^w[i+2]^w[i+1]^w[i] + ; xmm0[95:64] == w[i+8] == RotWord(SubWord(w[i+5]))^Rcon^w[i+2]^w[i+1]^w[i] + ; xmm0[63:32] == w[i+7] == RotWord(SubWord(w[i+5]))^Rcon^w[i+1]^w[i] + ; xmm0[31:0] == w[i+6] == RotWord(SubWord(w[i+5]))^Rcon^w[i] + + ; Calculate + ; w[i+5]^w[i+4], + ; w[i+4]. 
+ pshufd xmm6, xmm1, 0F3h ; xmm6 = xmm1[31:0] << 32 + pxor xmm1, xmm6 ; xmm1 ^= xmm6 + ; xmm1[63:32] == w[i+5]^w[i+4] + ; xmm1[31:0] == w[i+4] + + ; Calculate + ; w[i+11] == RotWord(SubWord(w[i+5]))^Rcon^w[i+5]^w[i+4]^w[i+3]^w[i+2]^w[i+1]^w[i], + ; w[i+10] == RotWord(SubWord(w[i+5]))^Rcon^w[i+4]^w[i+3]^w[i+2]^w[i+1]^w[i]. + pshufd xmm6, xmm0, 0FFh ; xmm6[127:96] = xmm6[95:64] = xmm6[63:32] = xmm6[31:0] = xmm0[127:96] + psrldq xmm6, 8 ; xmm6 >>= 64 + pxor xmm1, xmm6 ; xmm1 ^= xmm6 + ; xmm1[63:32] == w[i+11] == RotWord(SubWord(w[i+5]))^Rcon^w[i+5]^w[i+4]^w[i+3]^w[i+2]^w[i+1]^w[i] + ; xmm1[31:0] == w[i+10] == RotWord(SubWord(w[i+5]))^Rcon^w[i+4]^w[i+3]^w[i+2]^w[i+1]^w[i] + + ret +@raw_aes192_expand_key_schedule@36 endp + +@raw_aes192_invert_key_schedule@8 proc + movdqa xmm7, [ecx] + movdqa xmm6, [ecx + 0C0h] + movdqa [edx], xmm6 + movdqa [edx + 0C0h], xmm7 + + aesimc xmm7, [ecx + 10h] + aesimc xmm6, [ecx + 0B0h] + movdqa [edx + 10h], xmm6 + movdqa [edx + 0B0h], xmm7 + + aesimc xmm7, [ecx + 20h] + aesimc xmm6, [ecx + 0A0h] + movdqa [edx + 20h], xmm6 + movdqa [edx + 0A0h], xmm7 + + aesimc xmm7, [ecx + 30h] + aesimc xmm6, [ecx + 90h] + movdqa [edx + 30h], xmm6 + movdqa [edx + 90h], xmm7 + + aesimc xmm7, [ecx + 40h] + aesimc xmm6, [ecx + 80h] + movdqa [edx + 40h], xmm6 + movdqa [edx + 80h], xmm7 + + aesimc xmm7, [ecx + 50h] + aesimc xmm6, [ecx + 70h] + movdqa [edx + 50h], xmm6 + movdqa [edx + 70h], xmm7 + + aesimc xmm7, [ecx + 60h] + movdqa [edx + 60h], xmm7 + + ret +@raw_aes192_invert_key_schedule@8 endp + +end diff --git a/src/asm/aes256.asm b/src/asm/aes256.asm new file mode 100644 index 0000000..413e67b --- /dev/null +++ b/src/asm/aes256.asm @@ -0,0 +1,286 @@ +; Copyright 2015 Egor Tensin +; This file is licensed under the terms of the MIT License. +; See LICENSE.txt for details.
+ +.586 +.xmm +.model flat + +.code + +@raw_aes256_encrypt_block@20 proc + pxor xmm0, [ecx] + aesenc xmm0, [ecx + 10h] + aesenc xmm0, [ecx + 20h] + aesenc xmm0, [ecx + 30h] + aesenc xmm0, [ecx + 40h] + aesenc xmm0, [ecx + 50h] + aesenc xmm0, [ecx + 60h] + aesenc xmm0, [ecx + 70h] + aesenc xmm0, [ecx + 80h] + aesenc xmm0, [ecx + 90h] + aesenc xmm0, [ecx + 0A0h] + aesenc xmm0, [ecx + 0B0h] + aesenc xmm0, [ecx + 0C0h] + aesenc xmm0, [ecx + 0D0h] + aesenclast xmm0, [ecx + 0E0h] + ret +@raw_aes256_encrypt_block@20 endp + +@raw_aes256_decrypt_block@20 proc + pxor xmm0, [ecx] + aesdec xmm0, [ecx + 10h] + aesdec xmm0, [ecx + 20h] + aesdec xmm0, [ecx + 30h] + aesdec xmm0, [ecx + 40h] + aesdec xmm0, [ecx + 50h] + aesdec xmm0, [ecx + 60h] + aesdec xmm0, [ecx + 70h] + aesdec xmm0, [ecx + 80h] + aesdec xmm0, [ecx + 90h] + aesdec xmm0, [ecx + 0A0h] + aesdec xmm0, [ecx + 0B0h] + aesdec xmm0, [ecx + 0C0h] + aesdec xmm0, [ecx + 0D0h] + aesdeclast xmm0, [ecx + 0E0h] + ret +@raw_aes256_decrypt_block@20 endp + +@raw_aes256_expand_key_schedule@36 proc + ; A "word" (in terms of the FIPS 197 standard) is a 32-bit block. + ; Words are denoted by `w[N]`. + ; + ; A key schedule is composed of 14 "regular" keys and a dumb key for + ; the "whitening" step. + ; + ; A key schedule is thus composed of 60 "words".
+ ; The FIPS standard includes an algorithm to calculate these words via + ; a simple loop: + ; + ; i = 8 + ; while i < 60: + ; temp = w[i - 1] + ; if i % 8 == 0: + ; temp = SubWord(RotWord(temp))^Rcon + ; elif i % 8 == 4: + ; temp = SubWord(temp) + ; w[i] = w[i - 8]^temp + ; i = i + 1 + ; + ; The loop above may be unrolled like this: + ; + ; w[8] = SubWord(RotWord(w[7]))^Rcon^w[0] + ; w[9] = w[8]^w[1] + ; = SubWord(RotWord(w[7]))^Rcon^w[1]^w[0] + ; w[10] = w[9]^w[2] + ; = SubWord(RotWord(w[7]))^Rcon^w[2]^w[1]^w[0] + ; w[11] = w[10]^w[3] + ; = SubWord(RotWord(w[7]))^Rcon^w[3]^w[2]^w[1]^w[0] + ; w[12] = SubWord(w[11])^w[4] + ; w[13] = w[12]^w[5] + ; = SubWord(w[11])^w[5]^w[4] + ; w[14] = w[13]^w[6] + ; = SubWord(w[11])^w[6]^w[5]^w[4] + ; w[15] = w[14]^w[7] + ; = SubWord(w[11])^w[7]^w[6]^w[5]^w[4] + ; w[16] = SubWord(RotWord(w[15]))^Rcon^w[8] + ; w[17] = w[16]^w[9] + ; = SubWord(RotWord(w[15]))^Rcon^w[9]^w[8] + ; w[18] = w[17]^w[10] + ; = SubWord(RotWord(w[15]))^Rcon^w[10]^w[9]^w[8] + ; w[19] = w[18]^w[11] + ; = SubWord(RotWord(w[15]))^Rcon^w[11]^w[10]^w[9]^w[8] + ; w[20] = SubWord(w[19])^w[12] + ; w[21] = w[20]^w[13] + ; = SubWord(w[19])^w[13]^w[12] + ; w[22] = w[21]^w[14] + ; = SubWord(w[19])^w[14]^w[13]^w[12] + ; w[23] = w[22]^w[15] + ; = SubWord(w[19])^w[15]^w[14]^w[13]^w[12] + ; + ; ... and so on. + ; + ; The Intel AES-NI instruction set facilitates calculating SubWord + ; and RotWord using `aeskeygenassist`, which is used in this routine. + ; + ; Preconditions: + ; * xmm1[127:96] == w[7], + ; * xmm1[95:64] == w[6], + ; * xmm1[63:32] == w[5], + ; * xmm1[31:0] == w[4], + ; * xmm0[127:96] == w[3], + ; * xmm0[95:64] == w[2], + ; * xmm0[63:32] == w[1], + ; * xmm0[31:0] == w[0]. 
+ + movdqa [ecx], xmm0 ; sets w[0], w[1], w[2], w[3] + movdqa [ecx + 10h], xmm1 ; sets w[4], w[5], w[6], w[7] + lea ecx, [ecx + 20h] ; ecx = &w[8] + + aeskeygenassist xmm7, xmm1, 1h ; xmm7[127:96] = RotWord(SubWord(w[7]))^Rcon + pshufd xmm7, xmm7, 0FFh ; xmm7[95:64] = xmm7[63:32] = xmm7[31:0] = xmm7[127:96] + call aes256_keygen_assist ; sets w[8], w[9], w[10], w[11] + + aeskeygenassist xmm7, xmm1, 0 ; xmm7[95:64] = SubWord(w[11]) + pshufd xmm7, xmm7, 0AAh ; xmm7[127:96] = xmm7[63:32] = xmm7[31:0] = xmm7[95:64] + call aes256_keygen_assist ; sets w[12], w[13], w[14], w[15] + + aeskeygenassist xmm7, xmm1, 2h ; xmm7[127:96] = RotWord(SubWord(w[15]))^Rcon + pshufd xmm7, xmm7, 0FFh ; xmm7[95:64] = xmm7[63:32] = xmm7[31:0] = xmm7[127:96] + call aes256_keygen_assist ; sets w[16], w[17], w[18], w[19] + + aeskeygenassist xmm7, xmm1, 0 ; xmm7[95:64] = SubWord(w[19]) + pshufd xmm7, xmm7, 0AAh ; xmm7[127:96] = xmm7[63:32] = xmm7[31:0] = xmm7[95:64] + call aes256_keygen_assist ; sets w[20], w[21], w[22], w[23] + + aeskeygenassist xmm7, xmm1, 4h ; xmm7[127:96] = RotWord(SubWord(w[23]))^Rcon + pshufd xmm7, xmm7, 0FFh ; xmm7[95:64] = xmm7[63:32] = xmm7[31:0] = xmm7[127:96] + call aes256_keygen_assist ; sets w[24], w[25], w[26], w[27] + + aeskeygenassist xmm7, xmm1, 0 ; xmm7[95:64] = SubWord(w[27]) + pshufd xmm7, xmm7, 0AAh ; xmm7[127:96] = xmm7[63:32] = xmm7[31:0] = xmm7[95:64] + call aes256_keygen_assist ; sets w[28], w[29], w[30], w[31] + + aeskeygenassist xmm7, xmm1, 8h ; xmm7[127:96] = RotWord(SubWord(w[31]))^Rcon + pshufd xmm7, xmm7, 0FFh ; xmm7[95:64] = xmm7[63:32] = xmm7[31:0] = xmm7[127:96] + call aes256_keygen_assist ; sets w[32], w[33], w[34], w[35] + + aeskeygenassist xmm7, xmm1, 0 ; xmm7[95:64] = SubWord(w[35]) + pshufd xmm7, xmm7, 0AAh ; xmm7[127:96] = xmm7[63:32] = xmm7[31:0] = xmm7[95:64] + call aes256_keygen_assist ; sets w[36], w[37], w[38], w[39] + + aeskeygenassist xmm7, xmm1, 10h ; xmm7[127:96] = RotWord(SubWord(w[39]))^Rcon + pshufd xmm7, xmm7, 0FFh ; 
xmm7[95:64] = xmm7[63:32] = xmm7[31:0] = xmm7[127:96] + call aes256_keygen_assist ; sets w[40], w[41], w[42], w[43] + + aeskeygenassist xmm7, xmm1, 0 ; xmm7[95:64] = SubWord(w[43]) + pshufd xmm7, xmm7, 0AAh ; xmm7[127:96] = xmm7[63:32] = xmm7[31:0] = xmm7[95:64] + call aes256_keygen_assist ; sets w[44], w[45], w[46], w[47] + + aeskeygenassist xmm7, xmm1, 20h ; xmm7[127:96] = RotWord(SubWord(w[47]))^Rcon + pshufd xmm7, xmm7, 0FFh ; xmm7[95:64] = xmm7[63:32] = xmm7[31:0] = xmm7[127:96] + call aes256_keygen_assist ; sets w[48], w[49], w[50], w[51] + + aeskeygenassist xmm7, xmm1, 0 ; xmm7[95:64] = SubWord(w[51]) + pshufd xmm7, xmm7, 0AAh ; xmm7[127:96] = xmm7[63:32] = xmm7[31:0] = xmm7[95:64] + call aes256_keygen_assist ; sets w[52], w[53], w[54], w[55] + + aeskeygenassist xmm7, xmm1, 40h ; xmm7[127:96] = RotWord(SubWord(w[55]))^Rcon + pshufd xmm7, xmm7, 0FFh ; xmm7[95:64] = xmm7[63:32] = xmm7[31:0] = xmm7[127:96] + call aes256_keygen_assist ; sets w[56], w[57], w[58], w[59] + + ret + +aes256_keygen_assist: + ; Preconditions: + ; * xmm1[127:96] == w[i+7], + ; * xmm1[95:64] == w[i+6], + ; * xmm1[63:32] == w[i+5], + ; * xmm1[31:0] == w[i+4], + ; * xmm0[127:96] == w[i+3], + ; * xmm0[95:64] == w[i+2], + ; * xmm0[63:32] == w[i+1], + ; * xmm0[31:0] == w[i], + ; * xmm7[127:96] == xmm7[95:64] == xmm7[63:32] == xmm7[31:0] == HWGEN, + ; where HWGEN is either RotWord(SubWord(w[i+7]))^Rcon or SubWord(w[i+7]), + ; depending on the number of the round being processed, + ; * ecx == &w[i+8]. + ; + ; Postconditions: + ; * xmm1[127:96] == w[i+11] == HWGEN^w[i+3]^w[i+2]^w[i+1]^w[i], + ; * xmm1[95:64] == w[i+10] == HWGEN^w[i+2]^w[i+1]^w[i], + ; * xmm1[63:32] == w[i+9] == HWGEN^w[i+1]^w[i], + ; * xmm1[31:0] == w[i+8] == HWGEN^w[i], + ; * xmm0[127:96] == w[i+7], + ; * xmm0[95:64] == w[i+6], + ; * xmm0[63:32] == w[i+5], + ; * xmm0[31:0] == w[i+4], + ; * ecx == &w[i+12], + ; * the value in xmm6 is also modified. 
+ + ; Calculate + ; w[i+3]^w[i+2]^w[i+1]^w[i], + ; w[i+2]^w[i+1]^w[i], + ; w[i+1]^w[i] and + ; w[i]. + movdqa xmm6, xmm0 ; xmm6 = xmm0 + pslldq xmm6, 4 ; xmm6 <<= 32 + pxor xmm0, xmm6 ; xmm0 ^= xmm6 + pslldq xmm6, 4 ; xmm6 <<= 32 + pxor xmm0, xmm6 ; xmm0 ^= xmm6 + pslldq xmm6, 4 ; xmm6 <<= 32 + pxor xmm0, xmm6 ; xmm0 ^= xmm6 + ; xmm0[127:96] == w[i+3]^w[i+2]^w[i+1]^w[i] + ; xmm0[95:64] == w[i+2]^w[i+1]^w[i] + ; xmm0[63:32] == w[i+1]^w[i] + ; xmm0[31:0] == w[i] + + ; Calculate + ; HWGEN^w[i+3]^w[i+2]^w[i+1]^w[i], + ; HWGEN^w[i+2]^w[i+1]^w[i], + ; HWGEN^w[i+1]^w[i] and + ; HWGEN^w[i]. + pxor xmm0, xmm7 ; xmm0 ^= xmm7 + ; xmm0[127:96] == w[i+11] == HWGEN^w[i+3]^w[i+2]^w[i+1]^w[i] + ; xmm0[95:64] == w[i+10] == HWGEN^w[i+2]^w[i+1]^w[i] + ; xmm0[63:32] == w[i+9] == HWGEN^w[i+1]^w[i] + ; xmm0[31:0] == w[i+8] == HWGEN^w[i] + + ; Set w[i+8], w[i+9], w[i+10] and w[i+11]. + movdqa [ecx], xmm0 ; w[i+8] = HWGEN^w[i] + ; w[i+9] = HWGEN^w[i+1]^w[i] + ; w[i+10] = HWGEN^w[i+2]^w[i+1]^w[i] + ; w[i+11] = HWGEN^w[i+3]^w[i+2]^w[i+1]^w[i] + add ecx, 10h ; ecx = &w[i+12] + + ; Swap the values in xmm0 and xmm1. 
+    pxor xmm0, xmm1
+    pxor xmm1, xmm0
+    pxor xmm0, xmm1
+
+    ret
+@raw_aes256_expand_key_schedule@36 endp
+; Inverts an AES-256 key schedule: the order of the 15 round keys is reversed and keys 1-13 go through aesimc.
+@raw_aes256_invert_key_schedule@8 proc ; reads the schedule at [ecx], writes the result at [edx]; overwrites xmm6 and xmm7
+    movdqa xmm7, [ecx]              ; xmm7 = round key 0 (movdqa: the address must be 16-byte aligned)
+    movdqa xmm6, [ecx + 0E0h]       ; xmm6 = round key 14
+    movdqa [edx], xmm6              ; inverted key 0 = key 14, copied as is
+    movdqa [edx + 0E0h], xmm7       ; inverted key 14 = key 0, copied as is
+    ; Every pair below is loaded in full before either slot is stored, so ecx == edx (in-place inversion) works too.
+    aesimc xmm7, [ecx + 10h]        ; xmm7 = InvMixColumns(key 1)
+    aesimc xmm6, [ecx + 0D0h]       ; xmm6 = InvMixColumns(key 13)
+    movdqa [edx + 10h], xmm6        ; inverted key 1
+    movdqa [edx + 0D0h], xmm7       ; inverted key 13
+
+    aesimc xmm7, [ecx + 20h]        ; xmm7 = InvMixColumns(key 2)
+    aesimc xmm6, [ecx + 0C0h]       ; xmm6 = InvMixColumns(key 12)
+    movdqa [edx + 20h], xmm6        ; inverted key 2
+    movdqa [edx + 0C0h], xmm7       ; inverted key 12
+
+    aesimc xmm7, [ecx + 30h]        ; xmm7 = InvMixColumns(key 3)
+    aesimc xmm6, [ecx + 0B0h]       ; xmm6 = InvMixColumns(key 11)
+    movdqa [edx + 30h], xmm6        ; inverted key 3
+    movdqa [edx + 0B0h], xmm7       ; inverted key 11
+
+    aesimc xmm7, [ecx + 40h]        ; xmm7 = InvMixColumns(key 4)
+    aesimc xmm6, [ecx + 0A0h]       ; xmm6 = InvMixColumns(key 10)
+    movdqa [edx + 40h], xmm6        ; inverted key 4
+    movdqa [edx + 0A0h], xmm7       ; inverted key 10
+
+    aesimc xmm7, [ecx + 50h]        ; xmm7 = InvMixColumns(key 5)
+    aesimc xmm6, [ecx + 90h]        ; xmm6 = InvMixColumns(key 9)
+    movdqa [edx + 50h], xmm6        ; inverted key 5
+    movdqa [edx + 90h], xmm7        ; inverted key 9
+
+    aesimc xmm7, [ecx + 60h]        ; xmm7 = InvMixColumns(key 6)
+    aesimc xmm6, [ecx + 80h]        ; xmm6 = InvMixColumns(key 8)
+    movdqa [edx + 60h], xmm6        ; inverted key 6
+    movdqa [edx + 80h], xmm7        ; inverted key 8
+
+    aesimc xmm7, [ecx + 70h]        ; the middle key (7) keeps its position
+    movdqa [edx + 70h], xmm7        ; inverted key 7 = InvMixColumns(key 7)
+
+    ret
+@raw_aes256_invert_key_schedule@8 endp
+
+end
diff --git a/src/c/aes128.c b/src/c/aes128.c
new file mode 100644
index 0000000..d4c609d
--- /dev/null
+++ b/src/c/aes128.c
@@ -0,0 +1,99 @@
+/**
+ * \file
+ * \author Egor Tensin
+ * \date 2015
+ * \copyright This file is licensed under the terms of the MIT License.
+ *            See LICENSE.txt for details.
+ */ + +#include + +#include +#include + +AesBlock128 __fastcall raw_aes128_encrypt_block( + AesBlock128 plain, + Aes128KeySchedule* key_schedule) +{ + plain = _mm_xor_si128(plain, key_schedule->keys[0]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[1]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[2]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[3]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[4]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[5]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[6]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[7]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[8]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[9]); + return _mm_aesenclast_si128(plain, key_schedule->keys[10]); +} + +AesBlock128 __fastcall raw_aes128_decrypt_block( + AesBlock128 cipher, + Aes128KeySchedule* inverted_schedule) +{ + cipher = _mm_xor_si128(cipher, inverted_schedule->keys[0]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[1]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[2]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[3]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[4]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[5]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[6]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[7]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[8]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[9]); + return _mm_aesdeclast_si128(cipher, inverted_schedule->keys[10]); +} + +static AesBlock128 __fastcall aes128_keygen_assist( + AesBlock128 prev, + AesBlock128 hwgen) +{ + AesBlock128 tmp = prev; + + tmp = _mm_slli_si128(tmp, 4); + prev = _mm_xor_si128(prev, tmp); + tmp = _mm_slli_si128(tmp, 4); + prev = _mm_xor_si128(prev, tmp); + tmp = _mm_slli_si128(tmp, 4); + prev = _mm_xor_si128(prev, tmp); + + hwgen = _mm_shuffle_epi32(hwgen, 0xff); 
+ prev = _mm_xor_si128(prev, hwgen); + + return prev; +} + +void __fastcall raw_aes128_expand_key_schedule( + AesBlock128 key, + Aes128KeySchedule* key_schedule) +{ + AesBlock128 prev = key_schedule->keys[0] = key; + prev = key_schedule->keys[1] = aes128_keygen_assist(prev, _mm_aeskeygenassist_si128(prev, 0x01)); + prev = key_schedule->keys[2] = aes128_keygen_assist(prev, _mm_aeskeygenassist_si128(prev, 0x02)); + prev = key_schedule->keys[3] = aes128_keygen_assist(prev, _mm_aeskeygenassist_si128(prev, 0x04)); + prev = key_schedule->keys[4] = aes128_keygen_assist(prev, _mm_aeskeygenassist_si128(prev, 0x08)); + prev = key_schedule->keys[5] = aes128_keygen_assist(prev, _mm_aeskeygenassist_si128(prev, 0x10)); + prev = key_schedule->keys[6] = aes128_keygen_assist(prev, _mm_aeskeygenassist_si128(prev, 0x20)); + prev = key_schedule->keys[7] = aes128_keygen_assist(prev, _mm_aeskeygenassist_si128(prev, 0x40)); + prev = key_schedule->keys[8] = aes128_keygen_assist(prev, _mm_aeskeygenassist_si128(prev, 0x80)); + prev = key_schedule->keys[9] = aes128_keygen_assist(prev, _mm_aeskeygenassist_si128(prev, 0x1b)); + prev = key_schedule->keys[10] = aes128_keygen_assist(prev, _mm_aeskeygenassist_si128(prev, 0x36)); +} + +void __fastcall raw_aes128_invert_key_schedule( + Aes128KeySchedule* key_schedule, + Aes128KeySchedule* inverted_schedule) +{ + inverted_schedule->keys[0] = key_schedule->keys[10]; + inverted_schedule->keys[1] = _mm_aesimc_si128(key_schedule->keys[9]); + inverted_schedule->keys[2] = _mm_aesimc_si128(key_schedule->keys[8]); + inverted_schedule->keys[3] = _mm_aesimc_si128(key_schedule->keys[7]); + inverted_schedule->keys[4] = _mm_aesimc_si128(key_schedule->keys[6]); + inverted_schedule->keys[5] = _mm_aesimc_si128(key_schedule->keys[5]); + inverted_schedule->keys[6] = _mm_aesimc_si128(key_schedule->keys[4]); + inverted_schedule->keys[7] = _mm_aesimc_si128(key_schedule->keys[3]); + inverted_schedule->keys[8] = _mm_aesimc_si128(key_schedule->keys[2]); + 
inverted_schedule->keys[9] = _mm_aesimc_si128(key_schedule->keys[1]); + inverted_schedule->keys[10] = key_schedule->keys[0]; +} diff --git a/src/c/aes192.c b/src/c/aes192.c new file mode 100644 index 0000000..fec8f06 --- /dev/null +++ b/src/c/aes192.c @@ -0,0 +1,134 @@ +/** + * \file + * \author Egor Tensin + * \date 2015 + * \copyright This file is licensed under the terms of the MIT License. + * See LICENSE.txt for details. + */ + +#include + +#include +#include + +AesBlock128 __fastcall raw_aes192_encrypt_block( + AesBlock128 plain, + Aes192KeySchedule* key_schedule) +{ + plain = _mm_xor_si128(plain, key_schedule->keys[0]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[1]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[2]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[3]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[4]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[5]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[6]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[7]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[8]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[9]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[10]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[11]); + return _mm_aesenclast_si128(plain, key_schedule->keys[12]); +} + +AesBlock128 __fastcall raw_aes192_decrypt_block( + AesBlock128 cipher, + Aes192KeySchedule* inverted_schedule) +{ + cipher = _mm_xor_si128(cipher, inverted_schedule->keys[0]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[1]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[2]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[3]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[4]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[5]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[6]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[7]); + cipher = 
_mm_aesdec_si128(cipher, inverted_schedule->keys[8]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[9]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[10]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[11]); + return _mm_aesdeclast_si128(cipher, inverted_schedule->keys[12]); +} + +static void __fastcall aes192_keygen_assist( + AesBlock128* prev_lo, + AesBlock128* prev_hi, + AesBlock128 hwgen) +{ + AesBlock128 tmp = *prev_lo; + + tmp = _mm_slli_si128(tmp, 4); + *prev_lo = _mm_xor_si128(*prev_lo, tmp); + tmp = _mm_slli_si128(tmp, 4); + *prev_lo = _mm_xor_si128(*prev_lo, tmp); + tmp = _mm_slli_si128(tmp, 4); + *prev_lo = _mm_xor_si128(*prev_lo, tmp); + + hwgen = _mm_shuffle_epi32(hwgen, 0x55); + *prev_lo = _mm_xor_si128(*prev_lo, hwgen); + + tmp = _mm_shuffle_epi32(*prev_hi, 0xf3); + *prev_hi = _mm_xor_si128(*prev_hi, tmp); + + tmp = _mm_shuffle_epi32(*prev_lo, 0xff); + tmp = _mm_srli_si128(tmp, 8); + *prev_hi = _mm_xor_si128(*prev_hi, tmp); +} + +void __fastcall raw_aes192_expand_key_schedule( + AesBlock128 key_lo, + AesBlock128 key_hi, + Aes192KeySchedule* key_schedule) +{ + key_schedule->keys[0] = key_lo; + key_schedule->keys[1] = key_hi; + + aes192_keygen_assist(&key_lo, &key_hi, _mm_aeskeygenassist_si128(key_hi, 0x01)); + key_schedule->keys[1] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(key_schedule->keys[1]), _mm_castsi128_pd(key_lo), 0)); + key_schedule->keys[2] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(key_lo), _mm_castsi128_pd(key_hi), 1)); + + aes192_keygen_assist(&key_lo, &key_hi, _mm_aeskeygenassist_si128(key_hi, 0x02)); + key_schedule->keys[3] = key_lo; + key_schedule->keys[4] = key_hi; + + aes192_keygen_assist(&key_lo, &key_hi, _mm_aeskeygenassist_si128(key_hi, 0x04)); + key_schedule->keys[4] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(key_schedule->keys[4]), _mm_castsi128_pd(key_lo), 0)); + key_schedule->keys[5] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(key_lo), 
_mm_castsi128_pd(key_hi), 1)); + + aes192_keygen_assist(&key_lo, &key_hi, _mm_aeskeygenassist_si128(key_hi, 0x08)); + key_schedule->keys[6] = key_lo; + key_schedule->keys[7] = key_hi; + + aes192_keygen_assist(&key_lo, &key_hi, _mm_aeskeygenassist_si128(key_hi, 0x10)); + key_schedule->keys[7] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(key_schedule->keys[7]), _mm_castsi128_pd(key_lo), 0)); + key_schedule->keys[8] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(key_lo), _mm_castsi128_pd(key_hi), 1)); + + aes192_keygen_assist(&key_lo, &key_hi, _mm_aeskeygenassist_si128(key_hi, 0x20)); + key_schedule->keys[9] = key_lo; + key_schedule->keys[10] = key_hi; + + aes192_keygen_assist(&key_lo, &key_hi, _mm_aeskeygenassist_si128(key_hi, 0x40)); + key_schedule->keys[10] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(key_schedule->keys[10]), _mm_castsi128_pd(key_lo), 0)); + key_schedule->keys[11] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(key_lo), _mm_castsi128_pd(key_hi), 1)); + + aes192_keygen_assist(&key_lo, &key_hi, _mm_aeskeygenassist_si128(key_hi, 0x80)); + key_schedule->keys[12] = key_lo; +} + +void __fastcall raw_aes192_invert_key_schedule( + Aes192KeySchedule* key_schedule, + Aes192KeySchedule* inverted_schedule) +{ + inverted_schedule->keys[0] = key_schedule->keys[12]; + inverted_schedule->keys[1] = _mm_aesimc_si128(key_schedule->keys[11]); + inverted_schedule->keys[2] = _mm_aesimc_si128(key_schedule->keys[10]); + inverted_schedule->keys[3] = _mm_aesimc_si128(key_schedule->keys[9]); + inverted_schedule->keys[4] = _mm_aesimc_si128(key_schedule->keys[8]); + inverted_schedule->keys[5] = _mm_aesimc_si128(key_schedule->keys[7]); + inverted_schedule->keys[6] = _mm_aesimc_si128(key_schedule->keys[6]); + inverted_schedule->keys[7] = _mm_aesimc_si128(key_schedule->keys[5]); + inverted_schedule->keys[8] = _mm_aesimc_si128(key_schedule->keys[4]); + inverted_schedule->keys[9] = _mm_aesimc_si128(key_schedule->keys[3]); + inverted_schedule->keys[10] = 
_mm_aesimc_si128(key_schedule->keys[2]); + inverted_schedule->keys[11] = _mm_aesimc_si128(key_schedule->keys[1]); + inverted_schedule->keys[12] = key_schedule->keys[0]; +} diff --git a/src/c/aes256.c b/src/c/aes256.c new file mode 100644 index 0000000..be4f783 --- /dev/null +++ b/src/c/aes256.c @@ -0,0 +1,162 @@ +/** + * \file + * \author Egor Tensin + * \date 2015 + * \copyright This file is licensed under the terms of the MIT License. + * See LICENSE.txt for details. + */ + +#include + +#include +#include + +AesBlock128 __fastcall raw_aes256_encrypt_block( + AesBlock128 plain, + Aes256KeySchedule* key_schedule) +{ + plain = _mm_xor_si128(plain, key_schedule->keys[0]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[1]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[2]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[3]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[4]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[5]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[6]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[7]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[8]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[9]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[10]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[11]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[12]); + plain = _mm_aesenc_si128(plain, key_schedule->keys[13]); + return _mm_aesenclast_si128(plain, key_schedule->keys[14]); +} + +AesBlock128 __fastcall raw_aes256_decrypt_block( + AesBlock128 cipher, + Aes256KeySchedule* inverted_schedule) +{ + cipher = _mm_xor_si128(cipher, inverted_schedule->keys[0]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[1]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[2]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[3]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[4]); + cipher = _mm_aesdec_si128(cipher, 
inverted_schedule->keys[5]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[6]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[7]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[8]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[9]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[10]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[11]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[12]); + cipher = _mm_aesdec_si128(cipher, inverted_schedule->keys[13]); + return _mm_aesdeclast_si128(cipher, inverted_schedule->keys[14]); +} + +static AesBlock128 __fastcall aes256_keygen_assist( + AesBlock128* prev_lo, + AesBlock128* prev_hi, + AesBlock128 hwgen) +{ + AesBlock128 tmp = *prev_lo; + + tmp = _mm_slli_si128(tmp, 4); + *prev_lo = _mm_xor_si128(*prev_lo, tmp); + tmp = _mm_slli_si128(tmp, 4); + *prev_lo = _mm_xor_si128(*prev_lo, tmp); + tmp = _mm_slli_si128(tmp, 4); + *prev_lo = _mm_xor_si128(*prev_lo, tmp); + + *prev_lo = _mm_xor_si128(*prev_lo, hwgen); + + *prev_hi = _mm_xor_si128(*prev_hi, *prev_lo); + *prev_lo = _mm_xor_si128(*prev_lo, *prev_hi); + *prev_hi = _mm_xor_si128(*prev_hi, *prev_lo); + + return *prev_hi; +} + +void __fastcall raw_aes256_expand_key_schedule( + AesBlock128 key_lo, + AesBlock128 key_hi, + Aes256KeySchedule* key_schedule) +{ + AesBlock128 prev_lo, prev_hi; + AesBlock128 hwgen; + + prev_lo = key_schedule->keys[0] = key_lo; + prev_hi = key_schedule->keys[1] = key_hi; + + hwgen = _mm_aeskeygenassist_si128(prev_hi, 0x01); + hwgen = _mm_shuffle_epi32(hwgen, 0xff); + key_schedule->keys[2] = aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); + + hwgen = _mm_aeskeygenassist_si128(prev_hi, 0); + hwgen = _mm_shuffle_epi32(hwgen, 0xaa); + key_schedule->keys[3] = aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); + + hwgen = _mm_aeskeygenassist_si128(prev_hi, 0x02); + hwgen = _mm_shuffle_epi32(hwgen, 0xff); + key_schedule->keys[4] = aes256_keygen_assist(&prev_lo, 
&prev_hi, hwgen); + + hwgen = _mm_aeskeygenassist_si128(prev_hi, 0); + hwgen = _mm_shuffle_epi32(hwgen, 0xaa); + key_schedule->keys[5] = aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); + + hwgen = _mm_aeskeygenassist_si128(prev_hi, 0x04); + hwgen = _mm_shuffle_epi32(hwgen, 0xff); + key_schedule->keys[6] = aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); + + hwgen = _mm_aeskeygenassist_si128(prev_hi, 0); + hwgen = _mm_shuffle_epi32(hwgen, 0xaa); + key_schedule->keys[7] = aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); + + hwgen = _mm_aeskeygenassist_si128(prev_hi, 0x08); + hwgen = _mm_shuffle_epi32(hwgen, 0xff); + key_schedule->keys[8] = aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); + + hwgen = _mm_aeskeygenassist_si128(prev_hi, 0); + hwgen = _mm_shuffle_epi32(hwgen, 0xaa); + key_schedule->keys[9] = aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); + + hwgen = _mm_aeskeygenassist_si128(prev_hi, 0x10); + hwgen = _mm_shuffle_epi32(hwgen, 0xff); + key_schedule->keys[10] = aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); + + hwgen = _mm_aeskeygenassist_si128(prev_hi, 0); + hwgen = _mm_shuffle_epi32(hwgen, 0xaa); + key_schedule->keys[11] = aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); + + hwgen = _mm_aeskeygenassist_si128(prev_hi, 0x20); + hwgen = _mm_shuffle_epi32(hwgen, 0xff); + key_schedule->keys[12] = aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); + + hwgen = _mm_aeskeygenassist_si128(prev_hi, 0); + hwgen = _mm_shuffle_epi32(hwgen, 0xaa); + key_schedule->keys[13] = aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); + + hwgen = _mm_aeskeygenassist_si128(prev_hi, 0x40); + hwgen = _mm_shuffle_epi32(hwgen, 0xff); + key_schedule->keys[14] = aes256_keygen_assist(&prev_lo, &prev_hi, hwgen); +} + +void __fastcall raw_aes256_invert_key_schedule( + Aes256KeySchedule* key_schedule, + Aes256KeySchedule* inverted_schedule) +{ + inverted_schedule->keys[0] = key_schedule->keys[14]; + inverted_schedule->keys[1] = _mm_aesimc_si128(key_schedule->keys[13]); + 
inverted_schedule->keys[2] = _mm_aesimc_si128(key_schedule->keys[12]); + inverted_schedule->keys[3] = _mm_aesimc_si128(key_schedule->keys[11]); + inverted_schedule->keys[4] = _mm_aesimc_si128(key_schedule->keys[10]); + inverted_schedule->keys[5] = _mm_aesimc_si128(key_schedule->keys[9]); + inverted_schedule->keys[6] = _mm_aesimc_si128(key_schedule->keys[8]); + inverted_schedule->keys[7] = _mm_aesimc_si128(key_schedule->keys[7]); + inverted_schedule->keys[8] = _mm_aesimc_si128(key_schedule->keys[6]); + inverted_schedule->keys[9] = _mm_aesimc_si128(key_schedule->keys[5]); + inverted_schedule->keys[10] = _mm_aesimc_si128(key_schedule->keys[4]); + inverted_schedule->keys[11] = _mm_aesimc_si128(key_schedule->keys[3]); + inverted_schedule->keys[12] = _mm_aesimc_si128(key_schedule->keys[2]); + inverted_schedule->keys[13] = _mm_aesimc_si128(key_schedule->keys[1]); + inverted_schedule->keys[14] = key_schedule->keys[0]; +} -- cgit v1.2.3