diff -Nru openssl-3.0.10/debian/changelog openssl-3.0.10/debian/changelog --- openssl-3.0.10/debian/changelog 2023-08-02 06:59:28.000000000 +0000 +++ openssl-3.0.10/debian/changelog 2023-08-08 15:51:58.000000000 +0000 @@ -1,3 +1,9 @@ +openssl (3.0.10-1ubuntu2) mantic; urgency=medium + + * d/p/intel/*: cherry-pick AVX512 patches for recent Intel CPUs (LP: #2030784) + + -- Simon Chopin Tue, 08 Aug 2023 17:51:58 +0200 + openssl (3.0.10-1ubuntu1) mantic; urgency=low * Merge from Debian unstable. Remaining changes: diff -Nru openssl-3.0.10/debian/patches/intel/0001-Dual-1536-2048-bit-exponentiation-optimization-for-I.patch openssl-3.0.10/debian/patches/intel/0001-Dual-1536-2048-bit-exponentiation-optimization-for-I.patch --- openssl-3.0.10/debian/patches/intel/0001-Dual-1536-2048-bit-exponentiation-optimization-for-I.patch 1970-01-01 00:00:00.000000000 +0000 +++ openssl-3.0.10/debian/patches/intel/0001-Dual-1536-2048-bit-exponentiation-optimization-for-I.patch 2023-08-08 15:51:24.000000000 +0000 @@ -0,0 +1,3152 @@ +From 8a75952ba829784f5cb499c4883a5729c226e38c Mon Sep 17 00:00:00 2001 +From: Andrey Matyukov +Date: Tue, 8 Dec 2020 22:53:39 +0300 +Subject: [PATCH 1/2] Dual 1536/2048-bit exponentiation optimization for Intel + IceLake CPU + +It uses AVX512_IFMA + AVX512_VL (with 256-bit wide registers) ISA to +keep lower power license. + +Reviewed-by: Matt Caswell +Reviewed-by: Paul Dale +(Merged from https://github.com/openssl/openssl/pull/14908) + +Backported by Simon Chopin + +Bug-Ubuntu: https://bugs.launchpad.net/ubuntu/+source/openssl/+bug/2030784 +Origin: https://github.com/openssl/openssl/pull/14908 +Applied-Upstream: 3.1.0 + +--- + .../asm/{rsaz-avx512.pl => rsaz-2k-avx512.pl} | 310 +++--- + crypto/bn/asm/rsaz-3k-avx512.pl | 874 ++++++++++++++++ + crypto/bn/asm/rsaz-4k-avx512.pl | 930 ++++++++++++++++++ + crypto/bn/bn_exp.c | 24 +- + crypto/bn/build.info | 6 +- + crypto/bn/rsaz_exp_x2.c | 405 +++++--- + test/exptest.c | 9 +- + 7 files changed, 2226 insertions(+), 332 deletions(-) + rename crypto/bn/asm/{rsaz-avx512.pl => rsaz-2k-avx512.pl} (71%) + create mode 100644 crypto/bn/asm/rsaz-3k-avx512.pl + create mode 100644 crypto/bn/asm/rsaz-4k-avx512.pl + +diff --git a/crypto/bn/asm/rsaz-avx512.pl b/crypto/bn/asm/rsaz-2k-avx512.pl +similarity index 71% +rename from crypto/bn/asm/rsaz-avx512.pl +rename to crypto/bn/asm/rsaz-2k-avx512.pl +index 8d1d19f6c7..80bc4a51b2 100644 +--- a/crypto/bn/asm/rsaz-avx512.pl ++++ b/crypto/bn/asm/rsaz-2k-avx512.pl +@@ -7,7 +7,8 @@ + # https://www.openssl.org/source/license.html + # + # +-# Originally written by Ilya Albrekht, Sergey Kirillov and Andrey Matyukov ++# Originally written by Sergey Kirillov and Andrey Matyukov. ++# Special thanks to Ilya Albrekht for his valuable hints. + # Intel Corporation + # + # December 2020 +@@ -86,26 +87,29 @@ ___ + ############################################################################### + # Almost Montgomery Multiplication (AMM) for 20-digit number in radix 2^52. + # +-# AMM is defined as presented in the paper +-# "Efficient Software Implementations of Modular Exponentiation" by Shay Gueron. ++# AMM is defined as presented in the paper [1]. + # + # The input and output are presented in 2^52 radix domain, i.e. + # |res|, |a|, |b|, |m| are arrays of 20 64-bit qwords with 12 high bits zeroed. + # |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64 +-# (note, the implementation counts only 52 bits from it). 
+ # +-# NB: the AMM implementation does not perform "conditional" subtraction step as +-# specified in the original algorithm as according to the paper "Enhanced Montgomery +-# Multiplication" by Shay Gueron (see Lemma 1), the result will be always < 2*2^1024 +-# and can be used as a direct input to the next AMM iteration. +-# This post-condition is true, provided the correct parameter |s| is choosen, i.e. +-# s >= n + 2 * k, which matches our case: 1040 > 1024 + 2 * 1. ++# NB: the AMM implementation does not perform "conditional" subtraction step ++# specified in the original algorithm as according to the Lemma 1 from the paper ++# [2], the result will be always < 2*m and can be used as a direct input to ++# the next AMM iteration. This post-condition is true, provided the correct ++# parameter |s| (notion of the Lemma 1 from [2]) is choosen, i.e. s >= n + 2 * k, ++# which matches our case: 1040 > 1024 + 2 * 1. + # +-# void ossl_rsaz_amm52x20_x1_256(BN_ULONG *res, +-# const BN_ULONG *a, +-# const BN_ULONG *b, +-# const BN_ULONG *m, +-# BN_ULONG k0); ++# [1] Gueron, S. Efficient software implementations of modular exponentiation. ++# DOI: 10.1007/s13389-012-0031-5 ++# [2] Gueron, S. Enhanced Montgomery Multiplication. ++# DOI: 10.1007/3-540-36400-5_5 ++# ++# void ossl_rsaz_amm52x20_x1_ifma256(BN_ULONG *res, ++# const BN_ULONG *a, ++# const BN_ULONG *b, ++# const BN_ULONG *m, ++# BN_ULONG k0); + ############################################################################### + { + # input parameters ("%rdi","%rsi","%rdx","%rcx","%r8") +@@ -121,16 +125,13 @@ my $b_ptr = "%r11"; + my $iter = "%ebx"; + + my $zero = "%ymm0"; +-my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0) = ("%ymm1", map("%ymm$_",(16..19))); +-my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1) = ("%ymm2", map("%ymm$_",(20..23))); +-my $Bi = "%ymm3"; +-my $Yi = "%ymm4"; ++my $Bi = "%ymm1"; ++my $Yi = "%ymm2"; ++my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0) = ("%ymm3",map("%ymm$_",(16..19))); ++my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1) = ("%ymm4",map("%ymm$_",(20..23))); + + # Registers mapping for normalization. +-# We can reuse Bi, Yi registers here. +-my $TMP = $Bi; +-my $mask52x4 = $Yi; +-my ($T0,$T0h,$T1,$T1h,$T2) = map("%ymm$_", (24..28)); ++my ($T0,$T0h,$T1,$T1h,$T2) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (25..26))); + + sub amm52x20_x1() { + # _data_offset - offset in the |a| or |m| arrays pointing to the beginning +@@ -199,16 +200,16 @@ $code.=<<___; + ___ + } + +-# Normalization routine: handles carry bits in R0..R2 QWs and +-# gets R0..R2 back to normalized 2^52 representation. ++# Normalization routine: handles carry bits and gets bignum qwords to normalized ++# 2^52 representation. 
+ # + # Uses %r8-14,%e[bcd]x + sub amm52x20_x1_norm { + my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2) = @_; + $code.=<<___; + # Put accumulator to low qword in R0 +- vpbroadcastq $_acc, $TMP +- vpblendd \$3, $TMP, $_R0, $_R0 ++ vpbroadcastq $_acc, $T0 ++ vpblendd \$3, $T0, $_R0, $_R0 + + # Extract "carries" (12 high bits) from each QW of R0..R2 + # Save them to LSB of QWs in T0..T2 +@@ -223,14 +224,14 @@ $code.=<<___; + valignq \$3, $T1, $T1h, $T1h + valignq \$3, $T0h, $T1, $T1 + valignq \$3, $T0, $T0h, $T0h +- valignq \$3, $zero, $T0, $T0 ++ valignq \$3, .Lzeros(%rip), $T0, $T0 + + # Drop "carries" from R0..R2 QWs +- vpandq $mask52x4, $_R0, $_R0 +- vpandq $mask52x4, $_R0h, $_R0h +- vpandq $mask52x4, $_R1, $_R1 +- vpandq $mask52x4, $_R1h, $_R1h +- vpandq $mask52x4, $_R2, $_R2 ++ vpandq .Lmask52x4(%rip), $_R0, $_R0 ++ vpandq .Lmask52x4(%rip), $_R0h, $_R0h ++ vpandq .Lmask52x4(%rip), $_R1, $_R1 ++ vpandq .Lmask52x4(%rip), $_R1h, $_R1h ++ vpandq .Lmask52x4(%rip), $_R2, $_R2 + + # Sum R0..R2 with corresponding adjusted carries + vpaddq $T0, $_R0, $_R0 +@@ -241,11 +242,11 @@ $code.=<<___; + + # Now handle carry bits from this addition + # Get mask of QWs which 52-bit parts overflow... +- vpcmpuq \$1, $_R0, $mask52x4, %k1 # OP=lt +- vpcmpuq \$1, $_R0h, $mask52x4, %k2 +- vpcmpuq \$1, $_R1, $mask52x4, %k3 +- vpcmpuq \$1, $_R1h, $mask52x4, %k4 +- vpcmpuq \$1, $_R2, $mask52x4, %k5 ++ vpcmpuq \$6, .Lmask52x4(%rip), $_R0, %k1 # OP=nle (i.e. gt) ++ vpcmpuq \$6, .Lmask52x4(%rip), $_R0h, %k2 ++ vpcmpuq \$6, .Lmask52x4(%rip), $_R1, %k3 ++ vpcmpuq \$6, .Lmask52x4(%rip), $_R1h, %k4 ++ vpcmpuq \$6, .Lmask52x4(%rip), $_R2, %k5 + kmovb %k1, %r14d # k1 + kmovb %k2, %r13d # k1h + kmovb %k3, %r12d # k2 +@@ -253,11 +254,11 @@ $code.=<<___; + kmovb %k5, %r10d # k3 + + # ...or saturated +- vpcmpuq \$0, $_R0, $mask52x4, %k1 # OP=eq +- vpcmpuq \$0, $_R0h, $mask52x4, %k2 +- vpcmpuq \$0, $_R1, $mask52x4, %k3 +- vpcmpuq \$0, $_R1h, $mask52x4, %k4 +- vpcmpuq \$0, $_R2, $mask52x4, %k5 ++ vpcmpuq \$0, .Lmask52x4(%rip), $_R0, %k1 # OP=eq ++ vpcmpuq \$0, .Lmask52x4(%rip), $_R0h, %k2 ++ vpcmpuq \$0, .Lmask52x4(%rip), $_R1, %k3 ++ vpcmpuq \$0, .Lmask52x4(%rip), $_R1h, %k4 ++ vpcmpuq \$0, .Lmask52x4(%rip), $_R2, %k5 + kmovb %k1, %r9d # k4 + kmovb %k2, %r8d # k4h + kmovb %k3, %ebx # k5 +@@ -297,27 +298,27 @@ $code.=<<___; + kmovb %r10d, %k5 + + # Add carries according to the obtained mask +- vpsubq $mask52x4, $_R0, ${_R0}{%k1} +- vpsubq $mask52x4, $_R0h, ${_R0h}{%k2} +- vpsubq $mask52x4, $_R1, ${_R1}{%k3} +- vpsubq $mask52x4, $_R1h, ${_R1h}{%k4} +- vpsubq $mask52x4, $_R2, ${_R2}{%k5} +- +- vpandq $mask52x4, $_R0, $_R0 +- vpandq $mask52x4, $_R0h, $_R0h +- vpandq $mask52x4, $_R1, $_R1 +- vpandq $mask52x4, $_R1h, $_R1h +- vpandq $mask52x4, $_R2, $_R2 ++ vpsubq .Lmask52x4(%rip), $_R0, ${_R0}{%k1} ++ vpsubq .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2} ++ vpsubq .Lmask52x4(%rip), $_R1, ${_R1}{%k3} ++ vpsubq .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4} ++ vpsubq .Lmask52x4(%rip), $_R2, ${_R2}{%k5} ++ ++ vpandq .Lmask52x4(%rip), $_R0, $_R0 ++ vpandq .Lmask52x4(%rip), $_R0h, $_R0h ++ vpandq .Lmask52x4(%rip), $_R1, $_R1 ++ vpandq .Lmask52x4(%rip), $_R1h, $_R1h ++ vpandq .Lmask52x4(%rip), $_R2, $_R2 + ___ + } + + $code.=<<___; + .text + +-.globl ossl_rsaz_amm52x20_x1_256 +-.type ossl_rsaz_amm52x20_x1_256,\@function,5 ++.globl ossl_rsaz_amm52x20_x1_ifma256 ++.type ossl_rsaz_amm52x20_x1_ifma256,\@function,5 + .align 32 +-ossl_rsaz_amm52x20_x1_256: ++ossl_rsaz_amm52x20_x1_ifma256: + .cfi_startproc + endbranch + push %rbx +@@ -332,7 +333,7 @@ 
ossl_rsaz_amm52x20_x1_256: + .cfi_push %r14 + push %r15 + .cfi_push %r15 +-.Lrsaz_amm52x20_x1_256_body: ++.Lossl_rsaz_amm52x20_x1_ifma256_body: + + # Zeroing accumulators + vpxord $zero, $zero, $zero +@@ -360,17 +361,15 @@ $code.=<<___; + lea `4*8`($b_ptr), $b_ptr + dec $iter + jne .Lloop5 +- +- vmovdqa64 .Lmask52x4(%rip), $mask52x4 + ___ + &amm52x20_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0); + $code.=<<___; + +- vmovdqu64 $R0_0, ($res) +- vmovdqu64 $R0_0h, 32($res) +- vmovdqu64 $R1_0, 64($res) +- vmovdqu64 $R1_0h, 96($res) +- vmovdqu64 $R2_0, 128($res) ++ vmovdqu64 $R0_0, `0*32`($res) ++ vmovdqu64 $R0_0h, `1*32`($res) ++ vmovdqu64 $R1_0, `2*32`($res) ++ vmovdqu64 $R1_0h, `3*32`($res) ++ vmovdqu64 $R2_0, `4*32`($res) + + vzeroupper + mov 0(%rsp),%r15 +@@ -387,10 +386,10 @@ $code.=<<___; + .cfi_restore %rbx + lea 48(%rsp),%rsp + .cfi_adjust_cfa_offset -48 +-.Lrsaz_amm52x20_x1_256_epilogue: ++.Lossl_rsaz_amm52x20_x1_ifma256_epilogue: + ret + .cfi_endproc +-.size ossl_rsaz_amm52x20_x1_256, .-ossl_rsaz_amm52x20_x1_256 ++.size ossl_rsaz_amm52x20_x1_ifma256, .-ossl_rsaz_amm52x20_x1_ifma256 + ___ + + $code.=<<___; +@@ -406,25 +405,25 @@ ___ + ############################################################################### + # Dual Almost Montgomery Multiplication for 20-digit number in radix 2^52 + # +-# See description of ossl_rsaz_amm52x20_x1_256() above for details about Almost ++# See description of ossl_rsaz_amm52x20_x1_ifma256() above for details about Almost + # Montgomery Multiplication algorithm and function input parameters description. + # + # This function does two AMMs for two independent inputs, hence dual. + # +-# void ossl_rsaz_amm52x20_x2_256(BN_ULONG out[2][20], +-# const BN_ULONG a[2][20], +-# const BN_ULONG b[2][20], +-# const BN_ULONG m[2][20], +-# const BN_ULONG k0[2]); ++# void ossl_rsaz_amm52x20_x2_ifma256(BN_ULONG out[2][20], ++# const BN_ULONG a[2][20], ++# const BN_ULONG b[2][20], ++# const BN_ULONG m[2][20], ++# const BN_ULONG k0[2]); + ############################################################################### + + $code.=<<___; + .text + +-.globl ossl_rsaz_amm52x20_x2_256 +-.type ossl_rsaz_amm52x20_x2_256,\@function,5 ++.globl ossl_rsaz_amm52x20_x2_ifma256 ++.type ossl_rsaz_amm52x20_x2_ifma256,\@function,5 + .align 32 +-ossl_rsaz_amm52x20_x2_256: ++ossl_rsaz_amm52x20_x2_ifma256: + .cfi_startproc + endbranch + push %rbx +@@ -439,7 +438,7 @@ ossl_rsaz_amm52x20_x2_256: + .cfi_push %r14 + push %r15 + .cfi_push %r15 +-.Lrsaz_amm52x20_x2_256_body: ++.Lossl_rsaz_amm52x20_x2_ifma256_body: + + # Zeroing accumulators + vpxord $zero, $zero, $zero +@@ -472,24 +471,22 @@ $code.=<<___; + lea 8($b_ptr), $b_ptr + dec $iter + jne .Lloop20 +- +- vmovdqa64 .Lmask52x4(%rip), $mask52x4 + ___ + &amm52x20_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0); + &amm52x20_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1); + $code.=<<___; + +- vmovdqu64 $R0_0, ($res) +- vmovdqu64 $R0_0h, 32($res) +- vmovdqu64 $R1_0, 64($res) +- vmovdqu64 $R1_0h, 96($res) +- vmovdqu64 $R2_0, 128($res) ++ vmovdqu64 $R0_0, `0*32`($res) ++ vmovdqu64 $R0_0h, `1*32`($res) ++ vmovdqu64 $R1_0, `2*32`($res) ++ vmovdqu64 $R1_0h, `3*32`($res) ++ vmovdqu64 $R2_0, `4*32`($res) + +- vmovdqu64 $R0_1, 160($res) +- vmovdqu64 $R0_1h, 192($res) +- vmovdqu64 $R1_1, 224($res) +- vmovdqu64 $R1_1h, 256($res) +- vmovdqu64 $R2_1, 288($res) ++ vmovdqu64 $R0_1, `5*32`($res) ++ vmovdqu64 $R0_1h, `6*32`($res) ++ vmovdqu64 $R1_1, `7*32`($res) ++ vmovdqu64 $R1_1h, `8*32`($res) ++ vmovdqu64 $R2_1, `9*32`($res) + + vzeroupper + mov 
0(%rsp),%r15 +@@ -506,10 +503,10 @@ $code.=<<___; + .cfi_restore %rbx + lea 48(%rsp),%rsp + .cfi_adjust_cfa_offset -48 +-.Lrsaz_amm52x20_x2_256_epilogue: ++.Lossl_rsaz_amm52x20_x2_ifma256_epilogue: + ret + .cfi_endproc +-.size ossl_rsaz_amm52x20_x2_256, .-ossl_rsaz_amm52x20_x2_256 ++.size ossl_rsaz_amm52x20_x2_ifma256, .-ossl_rsaz_amm52x20_x2_ifma256 + ___ + } + +@@ -517,77 +514,76 @@ ___ + # Constant time extraction from the precomputed table of powers base^i, where + # i = 0..2^EXP_WIN_SIZE-1 + # +-# The input |red_table| contains precomputations for two independent base values, +-# so the |tbl_idx| indicates for which base shall we extract the value. +-# |red_table_idx| is a power index. ++# The input |red_table| contains precomputations for two independent base values. ++# |red_table_idx1| and |red_table_idx2| are corresponding power indexes. + # +-# Extracted value (output) is 20 digit number in 2^52 radix. ++# Extracted value (output) is 2 20 digit numbers in 2^52 radix. + # + # void ossl_extract_multiplier_2x20_win5(BN_ULONG *red_Y, + # const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][20], +-# int red_table_idx, +-# int tbl_idx); # 0 or 1 ++# int red_table_idx1, int red_table_idx2); + # + # EXP_WIN_SIZE = 5 + ############################################################################### + { + # input parameters +-my ($out,$red_tbl,$red_tbl_idx,$tbl_idx) = @_6_args_universal_ABI; ++my ($out,$red_tbl,$red_tbl_idx1,$red_tbl_idx2)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order ++ ("%rdi","%rsi","%rdx","%rcx"); # Unix order + +-my ($t0,$t1,$t2,$t3,$t4) = map("%ymm$_", (0..4)); +-my $t4xmm = $t4; +-$t4xmm =~ s/%y/%x/; +-my ($tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = map("%ymm$_", (16..20)); +-my ($cur_idx,$idx,$ones) = map("%ymm$_", (21..23)); ++my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_", (0..5)); ++my ($t6,$t7,$t8,$t9) = map("%ymm$_", (16..19)); ++my ($tmp,$cur_idx,$idx1,$idx2,$ones) = map("%ymm$_", (20..24)); ++ ++my @t = ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9); ++my $t0xmm = $t0; ++$t0xmm =~ s/%y/%x/; + + $code.=<<___; + .text + + .align 32 + .globl ossl_extract_multiplier_2x20_win5 +-.type ossl_extract_multiplier_2x20_win5,\@function,4 ++.type ossl_extract_multiplier_2x20_win5,\@abi-omnipotent + ossl_extract_multiplier_2x20_win5: + .cfi_startproc + endbranch +- leaq ($tbl_idx,$tbl_idx,4), %rax +- salq \$5, %rax +- addq %rax, $red_tbl +- + vmovdqa64 .Lones(%rip), $ones # broadcast ones +- vpbroadcastq $red_tbl_idx, $idx ++ vpbroadcastq $red_tbl_idx1, $idx1 ++ vpbroadcastq $red_tbl_idx2, $idx2 + leaq `(1<<5)*2*20*8`($red_tbl), %rax # holds end of the tbl + +- vpxor $t4xmm, $t4xmm, $t4xmm +- vmovdqa64 $t4, $t3 # zeroing t0..4, cur_idx +- vmovdqa64 $t4, $t2 +- vmovdqa64 $t4, $t1 +- vmovdqa64 $t4, $t0 +- vmovdqa64 $t4, $cur_idx ++ # zeroing t0..n, cur_idx ++ vpxor $t0xmm, $t0xmm, $t0xmm ++ vmovdqa64 $t0, $cur_idx ++___ ++foreach (1..9) { ++ $code.="vmovdqa64 $t0, $t[$_] \n"; ++} ++$code.=<<___; + + .align 32 + .Lloop: +- vpcmpq \$0, $cur_idx, $idx, %k1 # mask of (idx == cur_idx) +- addq \$320, $red_tbl # 320 = 2 * 20 digits * 8 bytes +- vpaddq $ones, $cur_idx, $cur_idx # increment cur_idx +- vmovdqu64 -320($red_tbl), $tmp0 # load data from red_tbl +- vmovdqu64 -288($red_tbl), $tmp1 +- vmovdqu64 -256($red_tbl), $tmp2 +- vmovdqu64 -224($red_tbl), $tmp3 +- vmovdqu64 -192($red_tbl), $tmp4 +- vpblendmq $tmp0, $t0, ${t0}{%k1} # extract data when mask is not zero +- vpblendmq $tmp1, $t1, ${t1}{%k1} +- vpblendmq $tmp2, $t2, ${t2}{%k1} +- vpblendmq $tmp3, $t3, ${t3}{%k1} +- vpblendmq $tmp4, $t4, 
${t4}{%k1} ++ vpcmpq \$0, $cur_idx, $idx1, %k1 # mask of (idx1 == cur_idx) ++ vpcmpq \$0, $cur_idx, $idx2, %k2 # mask of (idx2 == cur_idx) ++___ ++foreach (0..9) { ++ my $mask = $_<5?"%k1":"%k2"; ++$code.=<<___; ++ vmovdqu64 `${_}*32`($red_tbl), $tmp # load data from red_tbl ++ vpblendmq $tmp, $t[$_], ${t[$_]}{$mask} # extract data when mask is not zero ++___ ++} ++$code.=<<___; ++ vpaddq $ones, $cur_idx, $cur_idx # increment cur_idx ++ addq \$`2*20*8`, $red_tbl + cmpq $red_tbl, %rax + jne .Lloop +- +- vmovdqu64 $t0, ($out) # store t0..4 +- vmovdqu64 $t1, 32($out) +- vmovdqu64 $t2, 64($out) +- vmovdqu64 $t3, 96($out) +- vmovdqu64 $t4, 128($out) +- ++___ ++# store t0..n ++foreach (0..9) { ++ $code.="vmovdqu64 $t[$_], `${_}*32`($out) \n"; ++} ++$code.=<<___; + ret + .cfi_endproc + .size ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5 +@@ -597,6 +593,8 @@ $code.=<<___; + .align 32 + .Lones: + .quad 1,1,1,1 ++.Lzeros: ++ .quad 0,0,0,0 + ___ + } + +@@ -606,7 +604,7 @@ $frame="%rdx"; + $context="%r8"; + $disp="%r9"; + +-$code.=<<___ ++$code.=<<___; + .extern __imp_RtlVirtualUnwind + .type rsaz_def_handler,\@abi-omnipotent + .align 16 +@@ -697,32 +695,24 @@ rsaz_def_handler: + + .section .pdata + .align 4 +- .rva .LSEH_begin_ossl_rsaz_amm52x20_x1_256 +- .rva .LSEH_end_ossl_rsaz_amm52x20_x1_256 +- .rva .LSEH_info_ossl_rsaz_amm52x20_x1_256 +- +- .rva .LSEH_begin_ossl_rsaz_amm52x20_x2_256 +- .rva .LSEH_end_ossl_rsaz_amm52x20_x2_256 +- .rva .LSEH_info_ossl_rsaz_amm52x20_x2_256 ++ .rva .LSEH_begin_ossl_rsaz_amm52x20_x1_ifma256 ++ .rva .LSEH_end_ossl_rsaz_amm52x20_x1_ifma256 ++ .rva .LSEH_info_ossl_rsaz_amm52x20_x1_ifma256 + +- .rva .LSEH_begin_ossl_extract_multiplier_2x20_win5 +- .rva .LSEH_end_ossl_extract_multiplier_2x20_win5 +- .rva .LSEH_info_ossl_extract_multiplier_2x20_win5 ++ .rva .LSEH_begin_ossl_rsaz_amm52x20_x2_ifma256 ++ .rva .LSEH_end_ossl_rsaz_amm52x20_x2_ifma256 ++ .rva .LSEH_info_ossl_rsaz_amm52x20_x2_ifma256 + + .section .xdata + .align 8 +-.LSEH_info_ossl_rsaz_amm52x20_x1_256: +- .byte 9,0,0,0 +- .rva rsaz_def_handler +- .rva .Lrsaz_amm52x20_x1_256_body,.Lrsaz_amm52x20_x1_256_epilogue +-.LSEH_info_ossl_rsaz_amm52x20_x2_256: ++.LSEH_info_ossl_rsaz_amm52x20_x1_ifma256: + .byte 9,0,0,0 + .rva rsaz_def_handler +- .rva .Lrsaz_amm52x20_x2_256_body,.Lrsaz_amm52x20_x2_256_epilogue +-.LSEH_info_ossl_extract_multiplier_2x20_win5: ++ .rva .Lossl_rsaz_amm52x20_x1_ifma256_body,.Lossl_rsaz_amm52x20_x1_ifma256_epilogue ++.LSEH_info_ossl_rsaz_amm52x20_x2_ifma256: + .byte 9,0,0,0 + .rva rsaz_def_handler +- .rva .LSEH_begin_ossl_extract_multiplier_2x20_win5,.LSEH_begin_ossl_extract_multiplier_2x20_win5 ++ .rva .Lossl_rsaz_amm52x20_x2_ifma256_body,.Lossl_rsaz_amm52x20_x2_ifma256_epilogue + ___ + } + }}} else {{{ # fallback for old assembler +@@ -736,16 +726,16 @@ ossl_rsaz_avx512ifma_eligible: + ret + .size ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible + +-.globl ossl_rsaz_amm52x20_x1_256 +-.globl ossl_rsaz_amm52x20_x2_256 ++.globl ossl_rsaz_amm52x20_x1_ifma256 ++.globl ossl_rsaz_amm52x20_x2_ifma256 + .globl ossl_extract_multiplier_2x20_win5 +-.type ossl_rsaz_amm52x20_x1_256,\@abi-omnipotent +-ossl_rsaz_amm52x20_x1_256: +-ossl_rsaz_amm52x20_x2_256: ++.type ossl_rsaz_amm52x20_x1_ifma256,\@abi-omnipotent ++ossl_rsaz_amm52x20_x1_ifma256: ++ossl_rsaz_amm52x20_x2_ifma256: + ossl_extract_multiplier_2x20_win5: + .byte 0x0f,0x0b # ud2 + ret +-.size ossl_rsaz_amm52x20_x1_256, .-ossl_rsaz_amm52x20_x1_256 ++.size ossl_rsaz_amm52x20_x1_ifma256, .-ossl_rsaz_amm52x20_x1_ifma256 
+ ___ + }}} + +diff --git a/crypto/bn/asm/rsaz-3k-avx512.pl b/crypto/bn/asm/rsaz-3k-avx512.pl +new file mode 100644 +index 0000000000..e294afd294 +--- /dev/null ++++ b/crypto/bn/asm/rsaz-3k-avx512.pl +@@ -0,0 +1,874 @@ ++# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. ++# Copyright (c) 2021, Intel Corporation. All Rights Reserved. ++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++# ++# ++# Originally written by Sergey Kirillov and Andrey Matyukov ++# Intel Corporation ++# ++# March 2021 ++# ++# Initial release. ++# ++# Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues. ++# ++# IceLake-Client @ 1.3GHz ++# |---------+-----------------------+---------------+-------------| ++# | | OpenSSL 3.0.0-alpha15 | this | Unit | ++# |---------+-----------------------+---------------+-------------| ++# | rsa3072 | 6 397 637 | 2 866 593 | cycles/sign | ++# | | 203.2 | 453.5 / +123% | sign/s | ++# |---------+-----------------------+---------------+-------------| ++# ++ ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; ++ ++$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); ++$avx512ifma=0; ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or ++die "can't locate x86_64-xlate.pl"; ++ ++if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` ++ =~ /GNU assembler version ([2-9]\.[0-9]+)/) { ++ $avx512ifma = ($1>=2.26); ++} ++ ++if (!$avx512 && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && ++ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { ++ $avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12); ++} ++ ++if (!$avx512 && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { ++ $avx512ifma = ($2>=7.0); ++} ++ ++open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" ++ or die "can't call $xlate: $!"; ++*STDOUT=*OUT; ++ ++if ($avx512ifma>0) {{{ ++@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); ++ ++############################################################################### ++# Almost Montgomery Multiplication (AMM) for 30-digit number in radix 2^52. ++# ++# AMM is defined as presented in the paper [1]. ++# ++# The input and output are presented in 2^52 radix domain, i.e. ++# |res|, |a|, |b|, |m| are arrays of 32 64-bit qwords with 12 high bits zeroed ++# ++# NOTE: the function uses zero-padded data - 2 high QWs is a padding. ++# ++# |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64 ++# ++# NB: the AMM implementation does not perform "conditional" subtraction step ++# specified in the original algorithm as according to the Lemma 1 from the paper ++# [2], the result will be always < 2*m and can be used as a direct input to ++# the next AMM iteration. This post-condition is true, provided the correct ++# parameter |s| (notion of the Lemma 1 from [2]) is choosen, i.e. s >= n + 2 * k, ++# which matches our case: 1560 > 1536 + 2 * 1. ++# ++# [1] Gueron, S. 
Efficient software implementations of modular exponentiation. ++# DOI: 10.1007/s13389-012-0031-5 ++# [2] Gueron, S. Enhanced Montgomery Multiplication. ++# DOI: 10.1007/3-540-36400-5_5 ++# ++# void ossl_rsaz_amm52x30_x1_ifma256(BN_ULONG *res, ++# const BN_ULONG *a, ++# const BN_ULONG *b, ++# const BN_ULONG *m, ++# BN_ULONG k0); ++############################################################################### ++{ ++# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8") ++my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI; ++ ++my $mask52 = "%rax"; ++my $acc0_0 = "%r9"; ++my $acc0_0_low = "%r9d"; ++my $acc0_1 = "%r15"; ++my $acc0_1_low = "%r15d"; ++my $b_ptr = "%r11"; ++ ++my $iter = "%ebx"; ++ ++my $zero = "%ymm0"; ++my $Bi = "%ymm1"; ++my $Yi = "%ymm2"; ++my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h) = map("%ymm$_",(3..10)); ++my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h) = map("%ymm$_",(11..18)); ++ ++# Registers mapping for normalization ++my ($T0,$T0h,$T1,$T1h,$T2,$T2h,$T3,$T3h) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (19..23))); ++ ++sub amm52x30_x1() { ++# _data_offset - offset in the |a| or |m| arrays pointing to the beginning ++# of data for corresponding AMM operation; ++# _b_offset - offset in the |b| array pointing to the next qword digit; ++my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h,$_k0) = @_; ++my $_R0_xmm = $_R0; ++$_R0_xmm =~ s/%y/%x/; ++$code.=<<___; ++ movq $_b_offset($b_ptr), %r13 # b[i] ++ ++ vpbroadcastq %r13, $Bi # broadcast b[i] ++ movq $_data_offset($a), %rdx ++ mulx %r13, %r13, %r12 # a[0]*b[i] = (t0,t2) ++ addq %r13, $_acc # acc += t0 ++ movq %r12, %r10 ++ adcq \$0, %r10 # t2 += CF ++ ++ movq $_k0, %r13 ++ imulq $_acc, %r13 # acc * k0 ++ andq $mask52, %r13 # yi = (acc * k0) & mask52 ++ ++ vpbroadcastq %r13, $Yi # broadcast y[i] ++ movq $_data_offset($m), %rdx ++ mulx %r13, %r13, %r12 # yi * m[0] = (t0,t1) ++ addq %r13, $_acc # acc += t0 ++ adcq %r12, %r10 # t2 += (t1 + CF) ++ ++ shrq \$52, $_acc ++ salq \$12, %r10 ++ or %r10, $_acc # acc = ((acc >> 52) | (t2 << 12)) ++ ++ vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0 ++ vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h ++ vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1 ++ vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h ++ vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2 ++ vpmadd52luq `$_data_offset+64*2+32`($a), $Bi, $_R2h ++ vpmadd52luq `$_data_offset+64*3`($a), $Bi, $_R3 ++ vpmadd52luq `$_data_offset+64*3+32`($a), $Bi, $_R3h ++ ++ vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0 ++ vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h ++ vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1 ++ vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h ++ vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2 ++ vpmadd52luq `$_data_offset+64*2+32`($m), $Yi, $_R2h ++ vpmadd52luq `$_data_offset+64*3`($m), $Yi, $_R3 ++ vpmadd52luq `$_data_offset+64*3+32`($m), $Yi, $_R3h ++ ++ # Shift accumulators right by 1 qword, zero extending the highest one ++ valignq \$1, $_R0, $_R0h, $_R0 ++ valignq \$1, $_R0h, $_R1, $_R0h ++ valignq \$1, $_R1, $_R1h, $_R1 ++ valignq \$1, $_R1h, $_R2, $_R1h ++ valignq \$1, $_R2, $_R2h, $_R2 ++ valignq \$1, $_R2h, $_R3, $_R2h ++ valignq \$1, $_R3, $_R3h, $_R3 ++ valignq \$1, $_R3h, $zero, $_R3h ++ ++ vmovq $_R0_xmm, %r13 ++ addq %r13, $_acc # acc += R0[0] ++ ++ vpmadd52huq `$_data_offset+64*0`($a), $Bi, $_R0 ++ vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h ++ vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1 ++ vpmadd52huq `$_data_offset+64*1+32`($a), 
$Bi, $_R1h ++ vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2 ++ vpmadd52huq `$_data_offset+64*2+32`($a), $Bi, $_R2h ++ vpmadd52huq `$_data_offset+64*3`($a), $Bi, $_R3 ++ vpmadd52huq `$_data_offset+64*3+32`($a), $Bi, $_R3h ++ ++ vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0 ++ vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h ++ vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1 ++ vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h ++ vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2 ++ vpmadd52huq `$_data_offset+64*2+32`($m), $Yi, $_R2h ++ vpmadd52huq `$_data_offset+64*3`($m), $Yi, $_R3 ++ vpmadd52huq `$_data_offset+64*3+32`($m), $Yi, $_R3h ++___ ++} ++ ++# Normalization routine: handles carry bits and gets bignum qwords to normalized ++# 2^52 representation. ++# ++# Uses %r8-14,%e[abcd]x ++sub amm52x30_x1_norm { ++my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h) = @_; ++$code.=<<___; ++ # Put accumulator to low qword in R0 ++ vpbroadcastq $_acc, $T0 ++ vpblendd \$3, $T0, $_R0, $_R0 ++ ++ # Extract "carries" (12 high bits) from each QW of the bignum ++ # Save them to LSB of QWs in T0..Tn ++ vpsrlq \$52, $_R0, $T0 ++ vpsrlq \$52, $_R0h, $T0h ++ vpsrlq \$52, $_R1, $T1 ++ vpsrlq \$52, $_R1h, $T1h ++ vpsrlq \$52, $_R2, $T2 ++ vpsrlq \$52, $_R2h, $T2h ++ vpsrlq \$52, $_R3, $T3 ++ vpsrlq \$52, $_R3h, $T3h ++ ++ # "Shift left" T0..Tn by 1 QW ++ valignq \$3, $T3, $T3h, $T3h ++ valignq \$3, $T2h, $T3, $T3 ++ valignq \$3, $T2, $T2h, $T2h ++ valignq \$3, $T1h, $T2, $T2 ++ valignq \$3, $T1, $T1h, $T1h ++ valignq \$3, $T0h, $T1, $T1 ++ valignq \$3, $T0, $T0h, $T0h ++ valignq \$3, .Lzeros(%rip), $T0, $T0 ++ ++ # Drop "carries" from R0..Rn QWs ++ vpandq .Lmask52x4(%rip), $_R0, $_R0 ++ vpandq .Lmask52x4(%rip), $_R0h, $_R0h ++ vpandq .Lmask52x4(%rip), $_R1, $_R1 ++ vpandq .Lmask52x4(%rip), $_R1h, $_R1h ++ vpandq .Lmask52x4(%rip), $_R2, $_R2 ++ vpandq .Lmask52x4(%rip), $_R2h, $_R2h ++ vpandq .Lmask52x4(%rip), $_R3, $_R3 ++ vpandq .Lmask52x4(%rip), $_R3h, $_R3h ++ ++ # Sum R0..Rn with corresponding adjusted carries ++ vpaddq $T0, $_R0, $_R0 ++ vpaddq $T0h, $_R0h, $_R0h ++ vpaddq $T1, $_R1, $_R1 ++ vpaddq $T1h, $_R1h, $_R1h ++ vpaddq $T2, $_R2, $_R2 ++ vpaddq $T2h, $_R2h, $_R2h ++ vpaddq $T3, $_R3, $_R3 ++ vpaddq $T3h, $_R3h, $_R3h ++ ++ # Now handle carry bits from this addition ++ # Get mask of QWs whose 52-bit parts overflow ++ vpcmpuq \$6,.Lmask52x4(%rip),${_R0},%k1 # OP=nle (i.e. 
gt) ++ vpcmpuq \$6,.Lmask52x4(%rip),${_R0h},%k2 ++ kmovb %k1,%r14d ++ kmovb %k2,%r13d ++ shl \$4,%r13b ++ or %r13b,%r14b ++ ++ vpcmpuq \$6,.Lmask52x4(%rip),${_R1},%k1 ++ vpcmpuq \$6,.Lmask52x4(%rip),${_R1h},%k2 ++ kmovb %k1,%r13d ++ kmovb %k2,%r12d ++ shl \$4,%r12b ++ or %r12b,%r13b ++ ++ vpcmpuq \$6,.Lmask52x4(%rip),${_R2},%k1 ++ vpcmpuq \$6,.Lmask52x4(%rip),${_R2h},%k2 ++ kmovb %k1,%r12d ++ kmovb %k2,%r11d ++ shl \$4,%r11b ++ or %r11b,%r12b ++ ++ vpcmpuq \$6,.Lmask52x4(%rip),${_R3},%k1 ++ vpcmpuq \$6,.Lmask52x4(%rip),${_R3h},%k2 ++ kmovb %k1,%r11d ++ kmovb %k2,%r10d ++ shl \$4,%r10b ++ or %r10b,%r11b ++ ++ addb %r14b,%r14b ++ adcb %r13b,%r13b ++ adcb %r12b,%r12b ++ adcb %r11b,%r11b ++ ++ # Get mask of QWs whose 52-bit parts saturated ++ vpcmpuq \$0,.Lmask52x4(%rip),${_R0},%k1 # OP=eq ++ vpcmpuq \$0,.Lmask52x4(%rip),${_R0h},%k2 ++ kmovb %k1,%r9d ++ kmovb %k2,%r8d ++ shl \$4,%r8b ++ or %r8b,%r9b ++ ++ vpcmpuq \$0,.Lmask52x4(%rip),${_R1},%k1 ++ vpcmpuq \$0,.Lmask52x4(%rip),${_R1h},%k2 ++ kmovb %k1,%r8d ++ kmovb %k2,%edx ++ shl \$4,%dl ++ or %dl,%r8b ++ ++ vpcmpuq \$0,.Lmask52x4(%rip),${_R2},%k1 ++ vpcmpuq \$0,.Lmask52x4(%rip),${_R2h},%k2 ++ kmovb %k1,%edx ++ kmovb %k2,%ecx ++ shl \$4,%cl ++ or %cl,%dl ++ ++ vpcmpuq \$0,.Lmask52x4(%rip),${_R3},%k1 ++ vpcmpuq \$0,.Lmask52x4(%rip),${_R3h},%k2 ++ kmovb %k1,%ecx ++ kmovb %k2,%ebx ++ shl \$4,%bl ++ or %bl,%cl ++ ++ addb %r9b,%r14b ++ adcb %r8b,%r13b ++ adcb %dl,%r12b ++ adcb %cl,%r11b ++ ++ xor %r9b,%r14b ++ xor %r8b,%r13b ++ xor %dl,%r12b ++ xor %cl,%r11b ++ ++ kmovb %r14d,%k1 ++ shr \$4,%r14b ++ kmovb %r14d,%k2 ++ kmovb %r13d,%k3 ++ shr \$4,%r13b ++ kmovb %r13d,%k4 ++ kmovb %r12d,%k5 ++ shr \$4,%r12b ++ kmovb %r12d,%k6 ++ kmovb %r11d,%k7 ++ ++ vpsubq .Lmask52x4(%rip), $_R0, ${_R0}{%k1} ++ vpsubq .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2} ++ vpsubq .Lmask52x4(%rip), $_R1, ${_R1}{%k3} ++ vpsubq .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4} ++ vpsubq .Lmask52x4(%rip), $_R2, ${_R2}{%k5} ++ vpsubq .Lmask52x4(%rip), $_R2h, ${_R2h}{%k6} ++ vpsubq .Lmask52x4(%rip), $_R3, ${_R3}{%k7} ++ ++ vpandq .Lmask52x4(%rip), $_R0, $_R0 ++ vpandq .Lmask52x4(%rip), $_R0h, $_R0h ++ vpandq .Lmask52x4(%rip), $_R1, $_R1 ++ vpandq .Lmask52x4(%rip), $_R1h, $_R1h ++ vpandq .Lmask52x4(%rip), $_R2, $_R2 ++ vpandq .Lmask52x4(%rip), $_R2h, $_R2h ++ vpandq .Lmask52x4(%rip), $_R3, $_R3 ++ ++ shr \$4,%r11b ++ kmovb %r11d,%k1 ++ ++ vpsubq .Lmask52x4(%rip), $_R3h, ${_R3h}{%k1} ++ ++ vpandq .Lmask52x4(%rip), $_R3h, $_R3h ++___ ++} ++ ++$code.=<<___; ++.text ++ ++.globl ossl_rsaz_amm52x30_x1_ifma256 ++.type ossl_rsaz_amm52x30_x1_ifma256,\@function,5 ++.align 32 ++ossl_rsaz_amm52x30_x1_ifma256: ++.cfi_startproc ++ endbranch ++ push %rbx ++.cfi_push %rbx ++ push %rbp ++.cfi_push %rbp ++ push %r12 ++.cfi_push %r12 ++ push %r13 ++.cfi_push %r13 ++ push %r14 ++.cfi_push %r14 ++ push %r15 ++.cfi_push %r15 ++___ ++$code.=<<___ if ($win64); ++ lea -168(%rsp),%rsp # 16*10 + (8 bytes to get correct 16-byte SIMD alignment) ++ vmovdqa64 %xmm6, `0*16`(%rsp) # save non-volatile registers ++ vmovdqa64 %xmm7, `1*16`(%rsp) ++ vmovdqa64 %xmm8, `2*16`(%rsp) ++ vmovdqa64 %xmm9, `3*16`(%rsp) ++ vmovdqa64 %xmm10,`4*16`(%rsp) ++ vmovdqa64 %xmm11,`5*16`(%rsp) ++ vmovdqa64 %xmm12,`6*16`(%rsp) ++ vmovdqa64 %xmm13,`7*16`(%rsp) ++ vmovdqa64 %xmm14,`8*16`(%rsp) ++ vmovdqa64 %xmm15,`9*16`(%rsp) ++.Lossl_rsaz_amm52x30_x1_ifma256_body: ++___ ++$code.=<<___; ++ # Zeroing accumulators ++ vpxord $zero, $zero, $zero ++ vmovdqa64 $zero, $R0_0 ++ vmovdqa64 $zero, $R0_0h ++ vmovdqa64 $zero, $R1_0 ++ vmovdqa64 $zero, $R1_0h ++ 
vmovdqa64 $zero, $R2_0 ++ vmovdqa64 $zero, $R2_0h ++ vmovdqa64 $zero, $R3_0 ++ vmovdqa64 $zero, $R3_0h ++ ++ xorl $acc0_0_low, $acc0_0_low ++ ++ movq $b, $b_ptr # backup address of b ++ movq \$0xfffffffffffff, $mask52 # 52-bit mask ++ ++ # Loop over 30 digits unrolled by 4 ++ mov \$7, $iter ++ ++.align 32 ++.Lloop7: ++___ ++ foreach my $idx (0..3) { ++ &amm52x30_x1(0,8*$idx,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0); ++ } ++$code.=<<___; ++ lea `4*8`($b_ptr), $b_ptr ++ dec $iter ++ jne .Lloop7 ++___ ++ &amm52x30_x1(0,8*0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0); ++ &amm52x30_x1(0,8*1,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0); ++ ++ &amm52x30_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h); ++$code.=<<___; ++ ++ vmovdqu64 $R0_0, `0*32`($res) ++ vmovdqu64 $R0_0h, `1*32`($res) ++ vmovdqu64 $R1_0, `2*32`($res) ++ vmovdqu64 $R1_0h, `3*32`($res) ++ vmovdqu64 $R2_0, `4*32`($res) ++ vmovdqu64 $R2_0h, `5*32`($res) ++ vmovdqu64 $R3_0, `6*32`($res) ++ vmovdqu64 $R3_0h, `7*32`($res) ++ ++ vzeroupper ++ lea (%rsp),%rax ++.cfi_def_cfa_register %rax ++___ ++$code.=<<___ if ($win64); ++ vmovdqa64 `0*16`(%rax),%xmm6 ++ vmovdqa64 `1*16`(%rax),%xmm7 ++ vmovdqa64 `2*16`(%rax),%xmm8 ++ vmovdqa64 `3*16`(%rax),%xmm9 ++ vmovdqa64 `4*16`(%rax),%xmm10 ++ vmovdqa64 `5*16`(%rax),%xmm11 ++ vmovdqa64 `6*16`(%rax),%xmm12 ++ vmovdqa64 `7*16`(%rax),%xmm13 ++ vmovdqa64 `8*16`(%rax),%xmm14 ++ vmovdqa64 `9*16`(%rax),%xmm15 ++ lea 168(%rsp),%rax ++___ ++$code.=<<___; ++ mov 0(%rax),%r15 ++.cfi_restore %r15 ++ mov 8(%rax),%r14 ++.cfi_restore %r14 ++ mov 16(%rax),%r13 ++.cfi_restore %r13 ++ mov 24(%rax),%r12 ++.cfi_restore %r12 ++ mov 32(%rax),%rbp ++.cfi_restore %rbp ++ mov 40(%rax),%rbx ++.cfi_restore %rbx ++ lea 48(%rax),%rsp # restore rsp ++.cfi_def_cfa %rsp,8 ++.Lossl_rsaz_amm52x30_x1_ifma256_epilogue: ++ ret ++.cfi_endproc ++.size ossl_rsaz_amm52x30_x1_ifma256, .-ossl_rsaz_amm52x30_x1_ifma256 ++___ ++ ++$code.=<<___; ++.data ++.align 32 ++.Lmask52x4: ++ .quad 0xfffffffffffff ++ .quad 0xfffffffffffff ++ .quad 0xfffffffffffff ++ .quad 0xfffffffffffff ++___ ++ ++############################################################################### ++# Dual Almost Montgomery Multiplication for 30-digit number in radix 2^52 ++# ++# See description of ossl_rsaz_amm52x30_x1_ifma256() above for details about Almost ++# Montgomery Multiplication algorithm and function input parameters description. ++# ++# This function does two AMMs for two independent inputs, hence dual. ++# ++# NOTE: the function uses zero-padded data - 2 high QWs is a padding. 
++# ++# void ossl_rsaz_amm52x30_x2_ifma256(BN_ULONG out[2][32], ++# const BN_ULONG a[2][32], ++# const BN_ULONG b[2][32], ++# const BN_ULONG m[2][32], ++# const BN_ULONG k0[2]); ++############################################################################### ++ ++$code.=<<___; ++.text ++ ++.globl ossl_rsaz_amm52x30_x2_ifma256 ++.type ossl_rsaz_amm52x30_x2_ifma256,\@function,5 ++.align 32 ++ossl_rsaz_amm52x30_x2_ifma256: ++.cfi_startproc ++ endbranch ++ push %rbx ++.cfi_push %rbx ++ push %rbp ++.cfi_push %rbp ++ push %r12 ++.cfi_push %r12 ++ push %r13 ++.cfi_push %r13 ++ push %r14 ++.cfi_push %r14 ++ push %r15 ++.cfi_push %r15 ++___ ++$code.=<<___ if ($win64); ++ lea -168(%rsp),%rsp ++ vmovdqa64 %xmm6, `0*16`(%rsp) # save non-volatile registers ++ vmovdqa64 %xmm7, `1*16`(%rsp) ++ vmovdqa64 %xmm8, `2*16`(%rsp) ++ vmovdqa64 %xmm9, `3*16`(%rsp) ++ vmovdqa64 %xmm10,`4*16`(%rsp) ++ vmovdqa64 %xmm11,`5*16`(%rsp) ++ vmovdqa64 %xmm12,`6*16`(%rsp) ++ vmovdqa64 %xmm13,`7*16`(%rsp) ++ vmovdqa64 %xmm14,`8*16`(%rsp) ++ vmovdqa64 %xmm15,`9*16`(%rsp) ++.Lossl_rsaz_amm52x30_x2_ifma256_body: ++___ ++$code.=<<___; ++ # Zeroing accumulators ++ vpxord $zero, $zero, $zero ++ vmovdqa64 $zero, $R0_0 ++ vmovdqa64 $zero, $R0_0h ++ vmovdqa64 $zero, $R1_0 ++ vmovdqa64 $zero, $R1_0h ++ vmovdqa64 $zero, $R2_0 ++ vmovdqa64 $zero, $R2_0h ++ vmovdqa64 $zero, $R3_0 ++ vmovdqa64 $zero, $R3_0h ++ ++ vmovdqa64 $zero, $R0_1 ++ vmovdqa64 $zero, $R0_1h ++ vmovdqa64 $zero, $R1_1 ++ vmovdqa64 $zero, $R1_1h ++ vmovdqa64 $zero, $R2_1 ++ vmovdqa64 $zero, $R2_1h ++ vmovdqa64 $zero, $R3_1 ++ vmovdqa64 $zero, $R3_1h ++ ++ ++ xorl $acc0_0_low, $acc0_0_low ++ xorl $acc0_1_low, $acc0_1_low ++ ++ movq $b, $b_ptr # backup address of b ++ movq \$0xfffffffffffff, $mask52 # 52-bit mask ++ ++ mov \$30, $iter ++ ++.align 32 ++.Lloop30: ++___ ++ &amm52x30_x1( 0, 0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,"($k0)"); ++ # 32*8 = offset of the next dimension in two-dimension array ++ &amm52x30_x1(32*8,32*8,$acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,"8($k0)"); ++$code.=<<___; ++ lea 8($b_ptr), $b_ptr ++ dec $iter ++ jne .Lloop30 ++___ ++ &amm52x30_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h); ++ &amm52x30_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h); ++$code.=<<___; ++ ++ vmovdqu64 $R0_0, `0*32`($res) ++ vmovdqu64 $R0_0h, `1*32`($res) ++ vmovdqu64 $R1_0, `2*32`($res) ++ vmovdqu64 $R1_0h, `3*32`($res) ++ vmovdqu64 $R2_0, `4*32`($res) ++ vmovdqu64 $R2_0h, `5*32`($res) ++ vmovdqu64 $R3_0, `6*32`($res) ++ vmovdqu64 $R3_0h, `7*32`($res) ++ ++ vmovdqu64 $R0_1, `8*32`($res) ++ vmovdqu64 $R0_1h, `9*32`($res) ++ vmovdqu64 $R1_1, `10*32`($res) ++ vmovdqu64 $R1_1h, `11*32`($res) ++ vmovdqu64 $R2_1, `12*32`($res) ++ vmovdqu64 $R2_1h, `13*32`($res) ++ vmovdqu64 $R3_1, `14*32`($res) ++ vmovdqu64 $R3_1h, `15*32`($res) ++ ++ vzeroupper ++ lea (%rsp),%rax ++.cfi_def_cfa_register %rax ++___ ++$code.=<<___ if ($win64); ++ vmovdqa64 `0*16`(%rax),%xmm6 ++ vmovdqa64 `1*16`(%rax),%xmm7 ++ vmovdqa64 `2*16`(%rax),%xmm8 ++ vmovdqa64 `3*16`(%rax),%xmm9 ++ vmovdqa64 `4*16`(%rax),%xmm10 ++ vmovdqa64 `5*16`(%rax),%xmm11 ++ vmovdqa64 `6*16`(%rax),%xmm12 ++ vmovdqa64 `7*16`(%rax),%xmm13 ++ vmovdqa64 `8*16`(%rax),%xmm14 ++ vmovdqa64 `9*16`(%rax),%xmm15 ++ lea 168(%rsp),%rax ++___ ++$code.=<<___; ++ mov 0(%rax),%r15 ++.cfi_restore %r15 ++ mov 8(%rax),%r14 ++.cfi_restore %r14 ++ mov 16(%rax),%r13 ++.cfi_restore %r13 ++ mov 24(%rax),%r12 ++.cfi_restore %r12 ++ mov 32(%rax),%rbp ++.cfi_restore %rbp ++ mov 
40(%rax),%rbx ++.cfi_restore %rbx ++ lea 48(%rax),%rsp ++.cfi_def_cfa %rsp,8 ++.Lossl_rsaz_amm52x30_x2_ifma256_epilogue: ++ ret ++.cfi_endproc ++.size ossl_rsaz_amm52x30_x2_ifma256, .-ossl_rsaz_amm52x30_x2_ifma256 ++___ ++} ++ ++############################################################################### ++# Constant time extraction from the precomputed table of powers base^i, where ++# i = 0..2^EXP_WIN_SIZE-1 ++# ++# The input |red_table| contains precomputations for two independent base values. ++# |red_table_idx1| and |red_table_idx2| are corresponding power indexes. ++# ++# Extracted value (output) is 2 (30 + 2) digits numbers in 2^52 radix. ++# (2 high QW is zero padding) ++# ++# void ossl_extract_multiplier_2x30_win5(BN_ULONG *red_Y, ++# const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][32], ++# int red_table_idx1, int red_table_idx2); ++# ++# EXP_WIN_SIZE = 5 ++############################################################################### ++{ ++# input parameters ++my ($out,$red_tbl,$red_tbl_idx1,$red_tbl_idx2)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order ++ ("%rdi","%rsi","%rdx","%rcx"); # Unix order ++ ++my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_", (0..5)); ++my ($t6,$t7,$t8,$t9,$t10,$t11,$t12,$t13,$t14,$t15) = map("%ymm$_", (16..25)); ++my ($tmp,$cur_idx,$idx1,$idx2,$ones) = map("%ymm$_", (26..30)); ++ ++my @t = ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11,$t12,$t13,$t14,$t15); ++my $t0xmm = $t0; ++$t0xmm =~ s/%y/%x/; ++ ++$code.=<<___; ++.text ++ ++.align 32 ++.globl ossl_extract_multiplier_2x30_win5 ++.type ossl_extract_multiplier_2x30_win5,\@abi-omnipotent ++ossl_extract_multiplier_2x30_win5: ++.cfi_startproc ++ endbranch ++ vmovdqa64 .Lones(%rip), $ones # broadcast ones ++ vpbroadcastq $red_tbl_idx1, $idx1 ++ vpbroadcastq $red_tbl_idx2, $idx2 ++ leaq `(1<<5)*2*32*8`($red_tbl), %rax # holds end of the tbl ++ ++ # zeroing t0..n, cur_idx ++ vpxor $t0xmm, $t0xmm, $t0xmm ++ vmovdqa64 $t0, $cur_idx ++___ ++foreach (1..15) { ++ $code.="vmovdqa64 $t0, $t[$_] \n"; ++} ++$code.=<<___; ++ ++.align 32 ++.Lloop: ++ vpcmpq \$0, $cur_idx, $idx1, %k1 # mask of (idx1 == cur_idx) ++ vpcmpq \$0, $cur_idx, $idx2, %k2 # mask of (idx2 == cur_idx) ++___ ++foreach (0..15) { ++ my $mask = $_<8?"%k1":"%k2"; ++$code.=<<___; ++ vmovdqu64 `${_}*32`($red_tbl), $tmp # load data from red_tbl ++ vpblendmq $tmp, $t[$_], ${t[$_]}{$mask} # extract data when mask is not zero ++___ ++} ++$code.=<<___; ++ vpaddq $ones, $cur_idx, $cur_idx # increment cur_idx ++ addq \$`2*32*8`, $red_tbl ++ cmpq $red_tbl, %rax ++ jne .Lloop ++___ ++# store t0..n ++foreach (0..15) { ++ $code.="vmovdqu64 $t[$_], `${_}*32`($out) \n"; ++} ++$code.=<<___; ++ ++ ret ++.cfi_endproc ++.size ossl_extract_multiplier_2x30_win5, .-ossl_extract_multiplier_2x30_win5 ++___ ++$code.=<<___; ++.data ++.align 32 ++.Lones: ++ .quad 1,1,1,1 ++.Lzeros: ++ .quad 0,0,0,0 ++___ ++} ++ ++if ($win64) { ++$rec="%rcx"; ++$frame="%rdx"; ++$context="%r8"; ++$disp="%r9"; ++ ++$code.=<<___; ++.extern __imp_RtlVirtualUnwind ++.type rsaz_avx_handler,\@abi-omnipotent ++.align 16 ++rsaz_avx_handler: ++ push %rsi ++ push %rdi ++ push %rbx ++ push %rbp ++ push %r12 ++ push %r13 ++ push %r14 ++ push %r15 ++ pushfq ++ sub \$64,%rsp ++ ++ mov 120($context),%rax # pull context->Rax ++ mov 248($context),%rbx # pull context->Rip ++ ++ mov 8($disp),%rsi # disp->ImageBase ++ mov 56($disp),%r11 # disp->HandlerData ++ ++ mov 0(%r11),%r10d # HandlerData[0] ++ lea (%rsi,%r10),%r10 # prologue label ++ cmp %r10,%rbx # context->Rip<.Lprologue ++ jb .Lcommon_seh_tail ++ 
++ mov 4(%r11),%r10d # HandlerData[1] ++ lea (%rsi,%r10),%r10 # epilogue label ++ cmp %r10,%rbx # context->Rip>=.Lepilogue ++ jae .Lcommon_seh_tail ++ ++ mov 152($context),%rax # pull context->Rsp ++ ++ lea (%rax),%rsi # %xmm save area ++ lea 512($context),%rdi # & context.Xmm6 ++ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) ++ .long 0xa548f3fc # cld; rep movsq ++ ++ lea `48+168`(%rax),%rax ++ ++ mov -8(%rax),%rbx ++ mov -16(%rax),%rbp ++ mov -24(%rax),%r12 ++ mov -32(%rax),%r13 ++ mov -40(%rax),%r14 ++ mov -48(%rax),%r15 ++ mov %rbx,144($context) # restore context->Rbx ++ mov %rbp,160($context) # restore context->Rbp ++ mov %r12,216($context) # restore context->R12 ++ mov %r13,224($context) # restore context->R13 ++ mov %r14,232($context) # restore context->R14 ++ mov %r15,240($context) # restore context->R14 ++ ++.Lcommon_seh_tail: ++ mov 8(%rax),%rdi ++ mov 16(%rax),%rsi ++ mov %rax,152($context) # restore context->Rsp ++ mov %rsi,168($context) # restore context->Rsi ++ mov %rdi,176($context) # restore context->Rdi ++ ++ mov 40($disp),%rdi # disp->ContextRecord ++ mov $context,%rsi # context ++ mov \$154,%ecx # sizeof(CONTEXT) ++ .long 0xa548f3fc # cld; rep movsq ++ ++ mov $disp,%rsi ++ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER ++ mov 8(%rsi),%rdx # arg2, disp->ImageBase ++ mov 0(%rsi),%r8 # arg3, disp->ControlPc ++ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry ++ mov 40(%rsi),%r10 # disp->ContextRecord ++ lea 56(%rsi),%r11 # &disp->HandlerData ++ lea 24(%rsi),%r12 # &disp->EstablisherFrame ++ mov %r10,32(%rsp) # arg5 ++ mov %r11,40(%rsp) # arg6 ++ mov %r12,48(%rsp) # arg7 ++ mov %rcx,56(%rsp) # arg8, (NULL) ++ call *__imp_RtlVirtualUnwind(%rip) ++ ++ mov \$1,%eax # ExceptionContinueSearch ++ add \$64,%rsp ++ popfq ++ pop %r15 ++ pop %r14 ++ pop %r13 ++ pop %r12 ++ pop %rbp ++ pop %rbx ++ pop %rdi ++ pop %rsi ++ ret ++.size rsaz_avx_handler,.-rsaz_avx_handler ++ ++.section .pdata ++.align 4 ++ .rva .LSEH_begin_ossl_rsaz_amm52x30_x1_ifma256 ++ .rva .LSEH_end_ossl_rsaz_amm52x30_x1_ifma256 ++ .rva .LSEH_info_ossl_rsaz_amm52x30_x1_ifma256 ++ ++ .rva .LSEH_begin_ossl_rsaz_amm52x30_x2_ifma256 ++ .rva .LSEH_end_ossl_rsaz_amm52x30_x2_ifma256 ++ .rva .LSEH_info_ossl_rsaz_amm52x30_x2_ifma256 ++ ++.section .xdata ++.align 8 ++.LSEH_info_ossl_rsaz_amm52x30_x1_ifma256: ++ .byte 9,0,0,0 ++ .rva rsaz_avx_handler ++ .rva .Lossl_rsaz_amm52x30_x1_ifma256_body,.Lossl_rsaz_amm52x30_x1_ifma256_epilogue ++.LSEH_info_ossl_rsaz_amm52x30_x2_ifma256: ++ .byte 9,0,0,0 ++ .rva rsaz_avx_handler ++ .rva .Lossl_rsaz_amm52x30_x2_ifma256_body,.Lossl_rsaz_amm52x30_x2_ifma256_epilogue ++___ ++} ++}}} else {{{ # fallback for old assembler ++$code.=<<___; ++.text ++ ++.globl ossl_rsaz_amm52x30_x1_ifma256 ++.globl ossl_rsaz_amm52x30_x2_ifma256 ++.globl ossl_extract_multiplier_2x30_win5 ++.type ossl_rsaz_amm52x30_x1_ifma256,\@abi-omnipotent ++ossl_rsaz_amm52x30_x1_ifma256: ++ossl_rsaz_amm52x30_x2_ifma256: ++ossl_extract_multiplier_2x30_win5: ++ .byte 0x0f,0x0b # ud2 ++ ret ++.size ossl_rsaz_amm52x30_x1_ifma256, .-ossl_rsaz_amm52x30_x1_ifma256 ++___ ++}}} ++ ++$code =~ s/\`([^\`]*)\`/eval $1/gem; ++print $code; ++close STDOUT or die "error closing STDOUT: $!"; +diff --git a/crypto/bn/asm/rsaz-4k-avx512.pl b/crypto/bn/asm/rsaz-4k-avx512.pl +new file mode 100644 +index 0000000000..fb5bf10198 +--- /dev/null ++++ b/crypto/bn/asm/rsaz-4k-avx512.pl +@@ -0,0 +1,930 @@ ++# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. ++# Copyright (c) 2021, Intel Corporation. All Rights Reserved. 
++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++# ++# ++# Originally written by Sergey Kirillov and Andrey Matyukov ++# Intel Corporation ++# ++# March 2021 ++# ++# Initial release. ++# ++# Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues. ++# ++# IceLake-Client @ 1.3GHz ++# |---------+-----------------------+---------------+-------------| ++# | | OpenSSL 3.0.0-alpha15 | this | Unit | ++# |---------+-----------------------+---------------+-------------| ++# | rsa4096 | 14 301 4300 | 5 813 953 | cycles/sign | ++# | | 90.9 | 223.6 / +146% | sign/s | ++# |---------+-----------------------+---------------+-------------| ++# ++ ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; ++ ++$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); ++$avx512ifma=0; ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or ++die "can't locate x86_64-xlate.pl"; ++ ++if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` ++ =~ /GNU assembler version ([2-9]\.[0-9]+)/) { ++ $avx512ifma = ($1>=2.26); ++} ++ ++if (!$avx512 && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && ++ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { ++ $avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12); ++} ++ ++if (!$avx512 && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { ++ $avx512ifma = ($2>=7.0); ++} ++ ++open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" ++ or die "can't call $xlate: $!"; ++*STDOUT=*OUT; ++ ++if ($avx512ifma>0) {{{ ++@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); ++ ++############################################################################### ++# Almost Montgomery Multiplication (AMM) for 40-digit number in radix 2^52. ++# ++# AMM is defined as presented in the paper [1]. ++# ++# The input and output are presented in 2^52 radix domain, i.e. ++# |res|, |a|, |b|, |m| are arrays of 40 64-bit qwords with 12 high bits zeroed. ++# |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64 ++# ++# NB: the AMM implementation does not perform "conditional" subtraction step ++# specified in the original algorithm as according to the Lemma 1 from the paper ++# [2], the result will be always < 2*m and can be used as a direct input to ++# the next AMM iteration. This post-condition is true, provided the correct ++# parameter |s| (notion of the Lemma 1 from [2]) is choosen, i.e. s >= n + 2 * k, ++# which matches our case: 2080 > 2048 + 2 * 1. ++# ++# [1] Gueron, S. Efficient software implementations of modular exponentiation. ++# DOI: 10.1007/s13389-012-0031-5 ++# [2] Gueron, S. Enhanced Montgomery Multiplication. 
++# DOI: 10.1007/3-540-36400-5_5 ++# ++# void ossl_rsaz_amm52x40_x1_ifma256(BN_ULONG *res, ++# const BN_ULONG *a, ++# const BN_ULONG *b, ++# const BN_ULONG *m, ++# BN_ULONG k0); ++############################################################################### ++{ ++# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8") ++my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI; ++ ++my $mask52 = "%rax"; ++my $acc0_0 = "%r9"; ++my $acc0_0_low = "%r9d"; ++my $acc0_1 = "%r15"; ++my $acc0_1_low = "%r15d"; ++my $b_ptr = "%r11"; ++ ++my $iter = "%ebx"; ++ ++my $zero = "%ymm0"; ++my $Bi = "%ymm1"; ++my $Yi = "%ymm2"; ++my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h) = map("%ymm$_",(3..12)); ++my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,$R4_1,$R4_1h) = map("%ymm$_",(13..22)); ++ ++# Registers mapping for normalization ++my ($T0,$T0h,$T1,$T1h,$T2,$T2h,$T3,$T3h,$T4,$T4h) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (23..29))); ++ ++sub amm52x40_x1() { ++# _data_offset - offset in the |a| or |m| arrays pointing to the beginning ++# of data for corresponding AMM operation; ++# _b_offset - offset in the |b| array pointing to the next qword digit; ++my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h,$_R4,$_R4h,$_k0) = @_; ++my $_R0_xmm = $_R0; ++$_R0_xmm =~ s/%y/%x/; ++$code.=<<___; ++ movq $_b_offset($b_ptr), %r13 # b[i] ++ ++ vpbroadcastq %r13, $Bi # broadcast b[i] ++ movq $_data_offset($a), %rdx ++ mulx %r13, %r13, %r12 # a[0]*b[i] = (t0,t2) ++ addq %r13, $_acc # acc += t0 ++ movq %r12, %r10 ++ adcq \$0, %r10 # t2 += CF ++ ++ movq $_k0, %r13 ++ imulq $_acc, %r13 # acc * k0 ++ andq $mask52, %r13 # yi = (acc * k0) & mask52 ++ ++ vpbroadcastq %r13, $Yi # broadcast y[i] ++ movq $_data_offset($m), %rdx ++ mulx %r13, %r13, %r12 # yi * m[0] = (t0,t1) ++ addq %r13, $_acc # acc += t0 ++ adcq %r12, %r10 # t2 += (t1 + CF) ++ ++ shrq \$52, $_acc ++ salq \$12, %r10 ++ or %r10, $_acc # acc = ((acc >> 52) | (t2 << 12)) ++ ++ vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0 ++ vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h ++ vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1 ++ vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h ++ vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2 ++ vpmadd52luq `$_data_offset+64*2+32`($a), $Bi, $_R2h ++ vpmadd52luq `$_data_offset+64*3`($a), $Bi, $_R3 ++ vpmadd52luq `$_data_offset+64*3+32`($a), $Bi, $_R3h ++ vpmadd52luq `$_data_offset+64*4`($a), $Bi, $_R4 ++ vpmadd52luq `$_data_offset+64*4+32`($a), $Bi, $_R4h ++ ++ vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0 ++ vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h ++ vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1 ++ vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h ++ vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2 ++ vpmadd52luq `$_data_offset+64*2+32`($m), $Yi, $_R2h ++ vpmadd52luq `$_data_offset+64*3`($m), $Yi, $_R3 ++ vpmadd52luq `$_data_offset+64*3+32`($m), $Yi, $_R3h ++ vpmadd52luq `$_data_offset+64*4`($m), $Yi, $_R4 ++ vpmadd52luq `$_data_offset+64*4+32`($m), $Yi, $_R4h ++ ++ # Shift accumulators right by 1 qword, zero extending the highest one ++ valignq \$1, $_R0, $_R0h, $_R0 ++ valignq \$1, $_R0h, $_R1, $_R0h ++ valignq \$1, $_R1, $_R1h, $_R1 ++ valignq \$1, $_R1h, $_R2, $_R1h ++ valignq \$1, $_R2, $_R2h, $_R2 ++ valignq \$1, $_R2h, $_R3, $_R2h ++ valignq \$1, $_R3, $_R3h, $_R3 ++ valignq \$1, $_R3h, $_R4, $_R3h ++ valignq \$1, $_R4, $_R4h, $_R4 ++ valignq \$1, $_R4h, $zero, $_R4h ++ ++ vmovq $_R0_xmm, %r13 ++ addq %r13, $_acc # acc += R0[0] ++ ++ vpmadd52huq 
`$_data_offset+64*0`($a), $Bi, $_R0 ++ vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h ++ vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1 ++ vpmadd52huq `$_data_offset+64*1+32`($a), $Bi, $_R1h ++ vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2 ++ vpmadd52huq `$_data_offset+64*2+32`($a), $Bi, $_R2h ++ vpmadd52huq `$_data_offset+64*3`($a), $Bi, $_R3 ++ vpmadd52huq `$_data_offset+64*3+32`($a), $Bi, $_R3h ++ vpmadd52huq `$_data_offset+64*4`($a), $Bi, $_R4 ++ vpmadd52huq `$_data_offset+64*4+32`($a), $Bi, $_R4h ++ ++ vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0 ++ vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h ++ vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1 ++ vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h ++ vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2 ++ vpmadd52huq `$_data_offset+64*2+32`($m), $Yi, $_R2h ++ vpmadd52huq `$_data_offset+64*3`($m), $Yi, $_R3 ++ vpmadd52huq `$_data_offset+64*3+32`($m), $Yi, $_R3h ++ vpmadd52huq `$_data_offset+64*4`($m), $Yi, $_R4 ++ vpmadd52huq `$_data_offset+64*4+32`($m), $Yi, $_R4h ++___ ++} ++ ++# Normalization routine: handles carry bits and gets bignum qwords to normalized ++# 2^52 representation. ++# ++# Uses %r8-14,%e[abcd]x ++sub amm52x40_x1_norm { ++my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h,$_R4,$_R4h) = @_; ++$code.=<<___; ++ # Put accumulator to low qword in R0 ++ vpbroadcastq $_acc, $T0 ++ vpblendd \$3, $T0, $_R0, $_R0 ++ ++ # Extract "carries" (12 high bits) from each QW of the bignum ++ # Save them to LSB of QWs in T0..Tn ++ vpsrlq \$52, $_R0, $T0 ++ vpsrlq \$52, $_R0h, $T0h ++ vpsrlq \$52, $_R1, $T1 ++ vpsrlq \$52, $_R1h, $T1h ++ vpsrlq \$52, $_R2, $T2 ++ vpsrlq \$52, $_R2h, $T2h ++ vpsrlq \$52, $_R3, $T3 ++ vpsrlq \$52, $_R3h, $T3h ++ vpsrlq \$52, $_R4, $T4 ++ vpsrlq \$52, $_R4h, $T4h ++ ++ # "Shift left" T0..Tn by 1 QW ++ valignq \$3, $T4, $T4h, $T4h ++ valignq \$3, $T3h, $T4, $T4 ++ valignq \$3, $T3, $T3h, $T3h ++ valignq \$3, $T2h, $T3, $T3 ++ valignq \$3, $T2, $T2h, $T2h ++ valignq \$3, $T1h, $T2, $T2 ++ valignq \$3, $T1, $T1h, $T1h ++ valignq \$3, $T0h, $T1, $T1 ++ valignq \$3, $T0, $T0h, $T0h ++ valignq \$3, .Lzeros(%rip), $T0, $T0 ++ ++ # Drop "carries" from R0..Rn QWs ++ vpandq .Lmask52x4(%rip), $_R0, $_R0 ++ vpandq .Lmask52x4(%rip), $_R0h, $_R0h ++ vpandq .Lmask52x4(%rip), $_R1, $_R1 ++ vpandq .Lmask52x4(%rip), $_R1h, $_R1h ++ vpandq .Lmask52x4(%rip), $_R2, $_R2 ++ vpandq .Lmask52x4(%rip), $_R2h, $_R2h ++ vpandq .Lmask52x4(%rip), $_R3, $_R3 ++ vpandq .Lmask52x4(%rip), $_R3h, $_R3h ++ vpandq .Lmask52x4(%rip), $_R4, $_R4 ++ vpandq .Lmask52x4(%rip), $_R4h, $_R4h ++ ++ # Sum R0..Rn with corresponding adjusted carries ++ vpaddq $T0, $_R0, $_R0 ++ vpaddq $T0h, $_R0h, $_R0h ++ vpaddq $T1, $_R1, $_R1 ++ vpaddq $T1h, $_R1h, $_R1h ++ vpaddq $T2, $_R2, $_R2 ++ vpaddq $T2h, $_R2h, $_R2h ++ vpaddq $T3, $_R3, $_R3 ++ vpaddq $T3h, $_R3h, $_R3h ++ vpaddq $T4, $_R4, $_R4 ++ vpaddq $T4h, $_R4h, $_R4h ++ ++ # Now handle carry bits from this addition ++ # Get mask of QWs whose 52-bit parts overflow ++ vpcmpuq \$6,.Lmask52x4(%rip),${_R0},%k1 # OP=nle (i.e. 
gt) ++ vpcmpuq \$6,.Lmask52x4(%rip),${_R0h},%k2 ++ kmovb %k1,%r14d ++ kmovb %k2,%r13d ++ shl \$4,%r13b ++ or %r13b,%r14b ++ ++ vpcmpuq \$6,.Lmask52x4(%rip),${_R1},%k1 ++ vpcmpuq \$6,.Lmask52x4(%rip),${_R1h},%k2 ++ kmovb %k1,%r13d ++ kmovb %k2,%r12d ++ shl \$4,%r12b ++ or %r12b,%r13b ++ ++ vpcmpuq \$6,.Lmask52x4(%rip),${_R2},%k1 ++ vpcmpuq \$6,.Lmask52x4(%rip),${_R2h},%k2 ++ kmovb %k1,%r12d ++ kmovb %k2,%r11d ++ shl \$4,%r11b ++ or %r11b,%r12b ++ ++ vpcmpuq \$6,.Lmask52x4(%rip),${_R3},%k1 ++ vpcmpuq \$6,.Lmask52x4(%rip),${_R3h},%k2 ++ kmovb %k1,%r11d ++ kmovb %k2,%r10d ++ shl \$4,%r10b ++ or %r10b,%r11b ++ ++ vpcmpuq \$6,.Lmask52x4(%rip),${_R4},%k1 ++ vpcmpuq \$6,.Lmask52x4(%rip),${_R4h},%k2 ++ kmovb %k1,%r10d ++ kmovb %k2,%r9d ++ shl \$4,%r9b ++ or %r9b,%r10b ++ ++ addb %r14b,%r14b ++ adcb %r13b,%r13b ++ adcb %r12b,%r12b ++ adcb %r11b,%r11b ++ adcb %r10b,%r10b ++ ++ # Get mask of QWs whose 52-bit parts saturated ++ vpcmpuq \$0,.Lmask52x4(%rip),${_R0},%k1 # OP=eq ++ vpcmpuq \$0,.Lmask52x4(%rip),${_R0h},%k2 ++ kmovb %k1,%r9d ++ kmovb %k2,%r8d ++ shl \$4,%r8b ++ or %r8b,%r9b ++ ++ vpcmpuq \$0,.Lmask52x4(%rip),${_R1},%k1 ++ vpcmpuq \$0,.Lmask52x4(%rip),${_R1h},%k2 ++ kmovb %k1,%r8d ++ kmovb %k2,%edx ++ shl \$4,%dl ++ or %dl,%r8b ++ ++ vpcmpuq \$0,.Lmask52x4(%rip),${_R2},%k1 ++ vpcmpuq \$0,.Lmask52x4(%rip),${_R2h},%k2 ++ kmovb %k1,%edx ++ kmovb %k2,%ecx ++ shl \$4,%cl ++ or %cl,%dl ++ ++ vpcmpuq \$0,.Lmask52x4(%rip),${_R3},%k1 ++ vpcmpuq \$0,.Lmask52x4(%rip),${_R3h},%k2 ++ kmovb %k1,%ecx ++ kmovb %k2,%ebx ++ shl \$4,%bl ++ or %bl,%cl ++ ++ vpcmpuq \$0,.Lmask52x4(%rip),${_R4},%k1 ++ vpcmpuq \$0,.Lmask52x4(%rip),${_R4h},%k2 ++ kmovb %k1,%ebx ++ kmovb %k2,%eax ++ shl \$4,%al ++ or %al,%bl ++ ++ addb %r9b,%r14b ++ adcb %r8b,%r13b ++ adcb %dl,%r12b ++ adcb %cl,%r11b ++ adcb %bl,%r10b ++ ++ xor %r9b,%r14b ++ xor %r8b,%r13b ++ xor %dl,%r12b ++ xor %cl,%r11b ++ xor %bl,%r10b ++ ++ kmovb %r14d,%k1 ++ shr \$4,%r14b ++ kmovb %r14d,%k2 ++ kmovb %r13d,%k3 ++ shr \$4,%r13b ++ kmovb %r13d,%k4 ++ kmovb %r12d,%k5 ++ shr \$4,%r12b ++ kmovb %r12d,%k6 ++ kmovb %r11d,%k7 ++ ++ vpsubq .Lmask52x4(%rip), $_R0, ${_R0}{%k1} ++ vpsubq .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2} ++ vpsubq .Lmask52x4(%rip), $_R1, ${_R1}{%k3} ++ vpsubq .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4} ++ vpsubq .Lmask52x4(%rip), $_R2, ${_R2}{%k5} ++ vpsubq .Lmask52x4(%rip), $_R2h, ${_R2h}{%k6} ++ vpsubq .Lmask52x4(%rip), $_R3, ${_R3}{%k7} ++ ++ vpandq .Lmask52x4(%rip), $_R0, $_R0 ++ vpandq .Lmask52x4(%rip), $_R0h, $_R0h ++ vpandq .Lmask52x4(%rip), $_R1, $_R1 ++ vpandq .Lmask52x4(%rip), $_R1h, $_R1h ++ vpandq .Lmask52x4(%rip), $_R2, $_R2 ++ vpandq .Lmask52x4(%rip), $_R2h, $_R2h ++ vpandq .Lmask52x4(%rip), $_R3, $_R3 ++ ++ shr \$4,%r11b ++ kmovb %r11d,%k1 ++ kmovb %r10d,%k2 ++ shr \$4,%r10b ++ kmovb %r10d,%k3 ++ ++ vpsubq .Lmask52x4(%rip), $_R3h, ${_R3h}{%k1} ++ vpsubq .Lmask52x4(%rip), $_R4, ${_R4}{%k2} ++ vpsubq .Lmask52x4(%rip), $_R4h, ${_R4h}{%k3} ++ ++ vpandq .Lmask52x4(%rip), $_R3h, $_R3h ++ vpandq .Lmask52x4(%rip), $_R4, $_R4 ++ vpandq .Lmask52x4(%rip), $_R4h, $_R4h ++___ ++} ++ ++$code.=<<___; ++.text ++ ++.globl ossl_rsaz_amm52x40_x1_ifma256 ++.type ossl_rsaz_amm52x40_x1_ifma256,\@function,5 ++.align 32 ++ossl_rsaz_amm52x40_x1_ifma256: ++.cfi_startproc ++ endbranch ++ push %rbx ++.cfi_push %rbx ++ push %rbp ++.cfi_push %rbp ++ push %r12 ++.cfi_push %r12 ++ push %r13 ++.cfi_push %r13 ++ push %r14 ++.cfi_push %r14 ++ push %r15 ++.cfi_push %r15 ++___ ++$code.=<<___ if ($win64); ++ lea -168(%rsp),%rsp # 16*10 + (8 bytes to get correct 16-byte SIMD 
alignment) ++ vmovdqa64 %xmm6, `0*16`(%rsp) # save non-volatile registers ++ vmovdqa64 %xmm7, `1*16`(%rsp) ++ vmovdqa64 %xmm8, `2*16`(%rsp) ++ vmovdqa64 %xmm9, `3*16`(%rsp) ++ vmovdqa64 %xmm10,`4*16`(%rsp) ++ vmovdqa64 %xmm11,`5*16`(%rsp) ++ vmovdqa64 %xmm12,`6*16`(%rsp) ++ vmovdqa64 %xmm13,`7*16`(%rsp) ++ vmovdqa64 %xmm14,`8*16`(%rsp) ++ vmovdqa64 %xmm15,`9*16`(%rsp) ++.Lossl_rsaz_amm52x40_x1_ifma256_body: ++___ ++$code.=<<___; ++ # Zeroing accumulators ++ vpxord $zero, $zero, $zero ++ vmovdqa64 $zero, $R0_0 ++ vmovdqa64 $zero, $R0_0h ++ vmovdqa64 $zero, $R1_0 ++ vmovdqa64 $zero, $R1_0h ++ vmovdqa64 $zero, $R2_0 ++ vmovdqa64 $zero, $R2_0h ++ vmovdqa64 $zero, $R3_0 ++ vmovdqa64 $zero, $R3_0h ++ vmovdqa64 $zero, $R4_0 ++ vmovdqa64 $zero, $R4_0h ++ ++ xorl $acc0_0_low, $acc0_0_low ++ ++ movq $b, $b_ptr # backup address of b ++ movq \$0xfffffffffffff, $mask52 # 52-bit mask ++ ++ # Loop over 40 digits unrolled by 4 ++ mov \$10, $iter ++ ++.align 32 ++.Lloop10: ++___ ++ foreach my $idx (0..3) { ++ &amm52x40_x1(0,8*$idx,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h,$k0); ++ } ++$code.=<<___; ++ lea `4*8`($b_ptr), $b_ptr ++ dec $iter ++ jne .Lloop10 ++___ ++ &amm52x40_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h); ++$code.=<<___; ++ ++ vmovdqu64 $R0_0, `0*32`($res) ++ vmovdqu64 $R0_0h, `1*32`($res) ++ vmovdqu64 $R1_0, `2*32`($res) ++ vmovdqu64 $R1_0h, `3*32`($res) ++ vmovdqu64 $R2_0, `4*32`($res) ++ vmovdqu64 $R2_0h, `5*32`($res) ++ vmovdqu64 $R3_0, `6*32`($res) ++ vmovdqu64 $R3_0h, `7*32`($res) ++ vmovdqu64 $R4_0, `8*32`($res) ++ vmovdqu64 $R4_0h, `9*32`($res) ++ ++ vzeroupper ++ lea (%rsp),%rax ++.cfi_def_cfa_register %rax ++___ ++$code.=<<___ if ($win64); ++ vmovdqa64 `0*16`(%rax),%xmm6 ++ vmovdqa64 `1*16`(%rax),%xmm7 ++ vmovdqa64 `2*16`(%rax),%xmm8 ++ vmovdqa64 `3*16`(%rax),%xmm9 ++ vmovdqa64 `4*16`(%rax),%xmm10 ++ vmovdqa64 `5*16`(%rax),%xmm11 ++ vmovdqa64 `6*16`(%rax),%xmm12 ++ vmovdqa64 `7*16`(%rax),%xmm13 ++ vmovdqa64 `8*16`(%rax),%xmm14 ++ vmovdqa64 `9*16`(%rax),%xmm15 ++ lea 168(%rsp),%rax ++___ ++$code.=<<___; ++ mov 0(%rax),%r15 ++.cfi_restore %r15 ++ mov 8(%rax),%r14 ++.cfi_restore %r14 ++ mov 16(%rax),%r13 ++.cfi_restore %r13 ++ mov 24(%rax),%r12 ++.cfi_restore %r12 ++ mov 32(%rax),%rbp ++.cfi_restore %rbp ++ mov 40(%rax),%rbx ++.cfi_restore %rbx ++ lea 48(%rax),%rsp # restore rsp ++.cfi_def_cfa %rsp,8 ++.Lossl_rsaz_amm52x40_x1_ifma256_epilogue: ++ ++ ret ++.cfi_endproc ++.size ossl_rsaz_amm52x40_x1_ifma256, .-ossl_rsaz_amm52x40_x1_ifma256 ++___ ++ ++$code.=<<___; ++.data ++.align 32 ++.Lmask52x4: ++ .quad 0xfffffffffffff ++ .quad 0xfffffffffffff ++ .quad 0xfffffffffffff ++ .quad 0xfffffffffffff ++___ ++ ++############################################################################### ++# Dual Almost Montgomery Multiplication for 40-digit number in radix 2^52 ++# ++# See description of ossl_rsaz_amm52x40_x1_ifma256() above for details about Almost ++# Montgomery Multiplication algorithm and function input parameters description. ++# ++# This function does two AMMs for two independent inputs, hence dual. 
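For orientation, the 40-digit radix-2^52 form used by these routines respreads a 2048-bit operand (32 little-endian 64-bit words) across 40 qwords that each carry 52 significant bits; the patch's to_words52()/from_words52() helpers in rsaz_exp_x2.c perform this conversion byte-wise. A minimal stand-alone C sketch of the same respreading, assuming the GCC/Clang unsigned __int128 extension (illustrative only, not the patch's code):

#include <stdint.h>
#include <stdio.h>

#define DIGIT_BITS 52
#define DIGIT_MASK ((1ULL << DIGIT_BITS) - 1)

/*
 * Respread little-endian 64-bit words into 52-bit digits (low 52 bits of
 * each output qword used, top 12 bits zero), e.g. 32 words -> 40 digits
 * for 2048-bit operands, matching the 52x40 naming in this file.
 */
static void to_radix52(uint64_t *out, int out_digits,
                       const uint64_t *in, int in_words)
{
    unsigned __int128 acc = 0;   /* bit buffer (GCC/Clang extension) */
    int bits = 0, o = 0;

    for (int i = 0; i < in_words && o < out_digits; i++) {
        acc |= (unsigned __int128)in[i] << bits;
        bits += 64;
        while (bits >= DIGIT_BITS && o < out_digits) {
            out[o++] = (uint64_t)acc & DIGIT_MASK;
            acc >>= DIGIT_BITS;
            bits -= DIGIT_BITS;
        }
    }
    while (o < out_digits) {                 /* flush the remaining bits */
        out[o++] = (uint64_t)acc & DIGIT_MASK;
        acc >>= DIGIT_BITS;
    }
}

int main(void)
{
    uint64_t in[32] = { 0 }, out[40] = { 0 };

    in[0] = ~0ULL;                           /* low 64 bits all ones */
    to_radix52(out, 40, in, 32);
    printf("%llx %llx\n",                    /* expect fffffffffffff and fff */
           (unsigned long long)out[0], (unsigned long long)out[1]);
    return 0;
}

For a 2048-bit operand the 40 digits provide 2080 bits, so the redundant form carries 32 bits of slack, which is the headroom the AMM routines rely on.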
++# ++# void ossl_rsaz_amm52x40_x2_ifma256(BN_ULONG out[2][40], ++# const BN_ULONG a[2][40], ++# const BN_ULONG b[2][40], ++# const BN_ULONG m[2][40], ++# const BN_ULONG k0[2]); ++############################################################################### ++ ++$code.=<<___; ++.text ++ ++.globl ossl_rsaz_amm52x40_x2_ifma256 ++.type ossl_rsaz_amm52x40_x2_ifma256,\@function,5 ++.align 32 ++ossl_rsaz_amm52x40_x2_ifma256: ++.cfi_startproc ++ endbranch ++ push %rbx ++.cfi_push %rbx ++ push %rbp ++.cfi_push %rbp ++ push %r12 ++.cfi_push %r12 ++ push %r13 ++.cfi_push %r13 ++ push %r14 ++.cfi_push %r14 ++ push %r15 ++.cfi_push %r15 ++___ ++$code.=<<___ if ($win64); ++ lea -168(%rsp),%rsp ++ vmovdqa64 %xmm6, `0*16`(%rsp) # save non-volatile registers ++ vmovdqa64 %xmm7, `1*16`(%rsp) ++ vmovdqa64 %xmm8, `2*16`(%rsp) ++ vmovdqa64 %xmm9, `3*16`(%rsp) ++ vmovdqa64 %xmm10,`4*16`(%rsp) ++ vmovdqa64 %xmm11,`5*16`(%rsp) ++ vmovdqa64 %xmm12,`6*16`(%rsp) ++ vmovdqa64 %xmm13,`7*16`(%rsp) ++ vmovdqa64 %xmm14,`8*16`(%rsp) ++ vmovdqa64 %xmm15,`9*16`(%rsp) ++.Lossl_rsaz_amm52x40_x2_ifma256_body: ++___ ++$code.=<<___; ++ # Zeroing accumulators ++ vpxord $zero, $zero, $zero ++ vmovdqa64 $zero, $R0_0 ++ vmovdqa64 $zero, $R0_0h ++ vmovdqa64 $zero, $R1_0 ++ vmovdqa64 $zero, $R1_0h ++ vmovdqa64 $zero, $R2_0 ++ vmovdqa64 $zero, $R2_0h ++ vmovdqa64 $zero, $R3_0 ++ vmovdqa64 $zero, $R3_0h ++ vmovdqa64 $zero, $R4_0 ++ vmovdqa64 $zero, $R4_0h ++ ++ vmovdqa64 $zero, $R0_1 ++ vmovdqa64 $zero, $R0_1h ++ vmovdqa64 $zero, $R1_1 ++ vmovdqa64 $zero, $R1_1h ++ vmovdqa64 $zero, $R2_1 ++ vmovdqa64 $zero, $R2_1h ++ vmovdqa64 $zero, $R3_1 ++ vmovdqa64 $zero, $R3_1h ++ vmovdqa64 $zero, $R4_1 ++ vmovdqa64 $zero, $R4_1h ++ ++ ++ xorl $acc0_0_low, $acc0_0_low ++ xorl $acc0_1_low, $acc0_1_low ++ ++ movq $b, $b_ptr # backup address of b ++ movq \$0xfffffffffffff, $mask52 # 52-bit mask ++ ++ mov \$40, $iter ++ ++.align 32 ++.Lloop40: ++___ ++ &amm52x40_x1( 0, 0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h,"($k0)"); ++ # 40*8 = offset of the next dimension in two-dimension array ++ &amm52x40_x1(40*8,40*8,$acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,$R4_1,$R4_1h,"8($k0)"); ++$code.=<<___; ++ lea 8($b_ptr), $b_ptr ++ dec $iter ++ jne .Lloop40 ++___ ++ &amm52x40_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h); ++ &amm52x40_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,$R4_1,$R4_1h); ++$code.=<<___; ++ ++ vmovdqu64 $R0_0, `0*32`($res) ++ vmovdqu64 $R0_0h, `1*32`($res) ++ vmovdqu64 $R1_0, `2*32`($res) ++ vmovdqu64 $R1_0h, `3*32`($res) ++ vmovdqu64 $R2_0, `4*32`($res) ++ vmovdqu64 $R2_0h, `5*32`($res) ++ vmovdqu64 $R3_0, `6*32`($res) ++ vmovdqu64 $R3_0h, `7*32`($res) ++ vmovdqu64 $R4_0, `8*32`($res) ++ vmovdqu64 $R4_0h, `9*32`($res) ++ ++ vmovdqu64 $R0_1, `10*32`($res) ++ vmovdqu64 $R0_1h, `11*32`($res) ++ vmovdqu64 $R1_1, `12*32`($res) ++ vmovdqu64 $R1_1h, `13*32`($res) ++ vmovdqu64 $R2_1, `14*32`($res) ++ vmovdqu64 $R2_1h, `15*32`($res) ++ vmovdqu64 $R3_1, `16*32`($res) ++ vmovdqu64 $R3_1h, `17*32`($res) ++ vmovdqu64 $R4_1, `18*32`($res) ++ vmovdqu64 $R4_1h, `19*32`($res) ++ ++ vzeroupper ++ lea (%rsp),%rax ++.cfi_def_cfa_register %rax ++___ ++$code.=<<___ if ($win64); ++ vmovdqa64 `0*16`(%rax),%xmm6 ++ vmovdqa64 `1*16`(%rax),%xmm7 ++ vmovdqa64 `2*16`(%rax),%xmm8 ++ vmovdqa64 `3*16`(%rax),%xmm9 ++ vmovdqa64 `4*16`(%rax),%xmm10 ++ vmovdqa64 `5*16`(%rax),%xmm11 ++ vmovdqa64 `6*16`(%rax),%xmm12 ++ vmovdqa64 `7*16`(%rax),%xmm13 ++ vmovdqa64 
`8*16`(%rax),%xmm14 ++ vmovdqa64 `9*16`(%rax),%xmm15 ++ lea 168(%rsp),%rax ++___ ++$code.=<<___; ++ mov 0(%rax),%r15 ++.cfi_restore %r15 ++ mov 8(%rax),%r14 ++.cfi_restore %r14 ++ mov 16(%rax),%r13 ++.cfi_restore %r13 ++ mov 24(%rax),%r12 ++.cfi_restore %r12 ++ mov 32(%rax),%rbp ++.cfi_restore %rbp ++ mov 40(%rax),%rbx ++.cfi_restore %rbx ++ lea 48(%rax),%rsp ++.cfi_def_cfa %rsp,8 ++.Lossl_rsaz_amm52x40_x2_ifma256_epilogue: ++ ret ++.cfi_endproc ++.size ossl_rsaz_amm52x40_x2_ifma256, .-ossl_rsaz_amm52x40_x2_ifma256 ++___ ++} ++ ++############################################################################### ++# Constant time extraction from the precomputed table of powers base^i, where ++# i = 0..2^EXP_WIN_SIZE-1 ++# ++# The input |red_table| contains precomputations for two independent base values. ++# |red_table_idx1| and |red_table_idx2| are corresponding power indexes. ++# ++# Extracted value (output) is 2 40 digits numbers in 2^52 radix. ++# ++# void ossl_extract_multiplier_2x40_win5(BN_ULONG *red_Y, ++# const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][40], ++# int red_table_idx1, int red_table_idx2); ++# ++# EXP_WIN_SIZE = 5 ++############################################################################### ++{ ++# input parameters ++my ($out,$red_tbl,$red_tbl_idx1,$red_tbl_idx2)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order ++ ("%rdi","%rsi","%rdx","%rcx"); # Unix order ++ ++my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_", (0..5)); ++my ($t6,$t7,$t8,$t9) = map("%ymm$_", (16..19)); ++my ($tmp,$cur_idx,$idx1,$idx2,$ones) = map("%ymm$_", (20..24)); ++ ++my @t = ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9); ++my $t0xmm = $t0; ++$t0xmm =~ s/%y/%x/; ++ ++sub get_table_value_consttime() { ++my ($_idx,$_offset) = @_; ++$code.=<<___; ++ vpxorq $cur_idx, $cur_idx, $cur_idx ++.align 32 ++.Lloop_$_offset: ++ vpcmpq \$0, $cur_idx, $_idx, %k1 # mask of (idx == cur_idx) ++___ ++foreach (0..9) { ++$code.=<<___; ++ vmovdqu64 `$_offset+${_}*32`($red_tbl), $tmp # load data from red_tbl ++ vpblendmq $tmp, $t[$_], ${t[$_]}{%k1} # extract data when mask is not zero ++___ ++} ++$code.=<<___; ++ vpaddq $ones, $cur_idx, $cur_idx # increment cur_idx ++ addq \$`2*40*8`, $red_tbl ++ cmpq $red_tbl, %rax ++ jne .Lloop_$_offset ++___ ++} ++ ++$code.=<<___; ++.text ++ ++.align 32 ++.globl ossl_extract_multiplier_2x40_win5 ++.type ossl_extract_multiplier_2x40_win5,\@abi-omnipotent ++ossl_extract_multiplier_2x40_win5: ++.cfi_startproc ++ endbranch ++ vmovdqa64 .Lones(%rip), $ones # broadcast ones ++ vpbroadcastq $red_tbl_idx1, $idx1 ++ vpbroadcastq $red_tbl_idx2, $idx2 ++ leaq `(1<<5)*2*40*8`($red_tbl), %rax # holds end of the tbl ++ ++ # backup red_tbl address ++ movq $red_tbl, %r10 ++ ++ # zeroing t0..n, cur_idx ++ vpxor $t0xmm, $t0xmm, $t0xmm ++___ ++foreach (1..9) { ++ $code.="vmovdqa64 $t0, $t[$_] \n"; ++} ++ ++&get_table_value_consttime($idx1, 0); ++foreach (0..9) { ++ $code.="vmovdqu64 $t[$_], `(0+$_)*32`($out) \n"; ++} ++$code.="movq %r10, $red_tbl \n"; ++&get_table_value_consttime($idx2, 40*8); ++foreach (0..9) { ++ $code.="vmovdqu64 $t[$_], `(10+$_)*32`($out) \n"; ++} ++$code.=<<___; ++ ++ ret ++.cfi_endproc ++.size ossl_extract_multiplier_2x40_win5, .-ossl_extract_multiplier_2x40_win5 ++___ ++$code.=<<___; ++.data ++.align 32 ++.Lones: ++ .quad 1,1,1,1 ++.Lzeros: ++ .quad 0,0,0,0 ++___ ++} ++ ++if ($win64) { ++$rec="%rcx"; ++$frame="%rdx"; ++$context="%r8"; ++$disp="%r9"; ++ ++$code.=<<___; ++.extern __imp_RtlVirtualUnwind ++.type rsaz_avx_handler,\@abi-omnipotent ++.align 16 ++rsaz_avx_handler: ++ push 
%rsi ++ push %rdi ++ push %rbx ++ push %rbp ++ push %r12 ++ push %r13 ++ push %r14 ++ push %r15 ++ pushfq ++ sub \$64,%rsp ++ ++ mov 120($context),%rax # pull context->Rax ++ mov 248($context),%rbx # pull context->Rip ++ ++ mov 8($disp),%rsi # disp->ImageBase ++ mov 56($disp),%r11 # disp->HandlerData ++ ++ mov 0(%r11),%r10d # HandlerData[0] ++ lea (%rsi,%r10),%r10 # prologue label ++ cmp %r10,%rbx # context->Rip<.Lprologue ++ jb .Lcommon_seh_tail ++ ++ mov 4(%r11),%r10d # HandlerData[1] ++ lea (%rsi,%r10),%r10 # epilogue label ++ cmp %r10,%rbx # context->Rip>=.Lepilogue ++ jae .Lcommon_seh_tail ++ ++ mov 152($context),%rax # pull context->Rsp ++ ++ lea (%rax),%rsi # %xmm save area ++ lea 512($context),%rdi # & context.Xmm6 ++ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) ++ .long 0xa548f3fc # cld; rep movsq ++ ++ lea `48+168`(%rax),%rax ++ ++ mov -8(%rax),%rbx ++ mov -16(%rax),%rbp ++ mov -24(%rax),%r12 ++ mov -32(%rax),%r13 ++ mov -40(%rax),%r14 ++ mov -48(%rax),%r15 ++ mov %rbx,144($context) # restore context->Rbx ++ mov %rbp,160($context) # restore context->Rbp ++ mov %r12,216($context) # restore context->R12 ++ mov %r13,224($context) # restore context->R13 ++ mov %r14,232($context) # restore context->R14 ++ mov %r15,240($context) # restore context->R14 ++ ++.Lcommon_seh_tail: ++ mov 8(%rax),%rdi ++ mov 16(%rax),%rsi ++ mov %rax,152($context) # restore context->Rsp ++ mov %rsi,168($context) # restore context->Rsi ++ mov %rdi,176($context) # restore context->Rdi ++ ++ mov 40($disp),%rdi # disp->ContextRecord ++ mov $context,%rsi # context ++ mov \$154,%ecx # sizeof(CONTEXT) ++ .long 0xa548f3fc # cld; rep movsq ++ ++ mov $disp,%rsi ++ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER ++ mov 8(%rsi),%rdx # arg2, disp->ImageBase ++ mov 0(%rsi),%r8 # arg3, disp->ControlPc ++ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry ++ mov 40(%rsi),%r10 # disp->ContextRecord ++ lea 56(%rsi),%r11 # &disp->HandlerData ++ lea 24(%rsi),%r12 # &disp->EstablisherFrame ++ mov %r10,32(%rsp) # arg5 ++ mov %r11,40(%rsp) # arg6 ++ mov %r12,48(%rsp) # arg7 ++ mov %rcx,56(%rsp) # arg8, (NULL) ++ call *__imp_RtlVirtualUnwind(%rip) ++ ++ mov \$1,%eax # ExceptionContinueSearch ++ add \$64,%rsp ++ popfq ++ pop %r15 ++ pop %r14 ++ pop %r13 ++ pop %r12 ++ pop %rbp ++ pop %rbx ++ pop %rdi ++ pop %rsi ++ ret ++.size rsaz_avx_handler,.-rsaz_avx_handler ++ ++.section .pdata ++.align 4 ++ .rva .LSEH_begin_ossl_rsaz_amm52x40_x1_ifma256 ++ .rva .LSEH_end_ossl_rsaz_amm52x40_x1_ifma256 ++ .rva .LSEH_info_ossl_rsaz_amm52x40_x1_ifma256 ++ ++ .rva .LSEH_begin_ossl_rsaz_amm52x40_x2_ifma256 ++ .rva .LSEH_end_ossl_rsaz_amm52x40_x2_ifma256 ++ .rva .LSEH_info_ossl_rsaz_amm52x40_x2_ifma256 ++ ++.section .xdata ++.align 8 ++.LSEH_info_ossl_rsaz_amm52x40_x1_ifma256: ++ .byte 9,0,0,0 ++ .rva rsaz_avx_handler ++ .rva .Lossl_rsaz_amm52x40_x1_ifma256_body,.Lossl_rsaz_amm52x40_x1_ifma256_epilogue ++.LSEH_info_ossl_rsaz_amm52x40_x2_ifma256: ++ .byte 9,0,0,0 ++ .rva rsaz_avx_handler ++ .rva .Lossl_rsaz_amm52x40_x2_ifma256_body,.Lossl_rsaz_amm52x40_x2_ifma256_epilogue ++___ ++} ++}}} else {{{ # fallback for old assembler ++$code.=<<___; ++.text ++ ++.globl ossl_rsaz_amm52x40_x1_ifma256 ++.globl ossl_rsaz_amm52x40_x2_ifma256 ++.globl ossl_extract_multiplier_2x40_win5 ++.type ossl_rsaz_amm52x40_x1_ifma256,\@abi-omnipotent ++ossl_rsaz_amm52x40_x1_ifma256: ++ossl_rsaz_amm52x40_x2_ifma256: ++ossl_extract_multiplier_2x40_win5: ++ .byte 0x0f,0x0b # ud2 ++ ret ++.size ossl_rsaz_amm52x40_x1_ifma256, .-ossl_rsaz_amm52x40_x1_ifma256 ++___ ++}}} ++ ++$code =~ 
s/\`([^\`]*)\`/eval $1/gem; ++print $code; ++close STDOUT or die "error closing STDOUT: $!"; +diff --git a/crypto/bn/bn_exp.c b/crypto/bn/bn_exp.c +index 4e169ae1f9..4d02dcda53 100644 +--- a/crypto/bn/bn_exp.c ++++ b/crypto/bn/bn_exp.c +@@ -1439,12 +1439,20 @@ int BN_mod_exp_mont_consttime_x2(BIGNUM *rr1, const BIGNUM *a1, const BIGNUM *p1 + BN_MONT_CTX *mont2 = NULL; + + if (ossl_rsaz_avx512ifma_eligible() && +- ((a1->top == 16) && (p1->top == 16) && (BN_num_bits(m1) == 1024) && +- (a2->top == 16) && (p2->top == 16) && (BN_num_bits(m2) == 1024))) { +- +- if (bn_wexpand(rr1, 16) == NULL) ++ (((a1->top == 16) && (p1->top == 16) && (BN_num_bits(m1) == 1024) && ++ (a2->top == 16) && (p2->top == 16) && (BN_num_bits(m2) == 1024)) || ++ ((a1->top == 24) && (p1->top == 24) && (BN_num_bits(m1) == 1536) && ++ (a2->top == 24) && (p2->top == 24) && (BN_num_bits(m2) == 1536)) || ++ ((a1->top == 32) && (p1->top == 32) && (BN_num_bits(m1) == 2048) && ++ (a2->top == 32) && (p2->top == 32) && (BN_num_bits(m2) == 2048)))) { ++ ++ int topn = a1->top; ++ /* Modulus bits of |m1| and |m2| are equal */ ++ int mod_bits = BN_num_bits(m1); ++ ++ if (bn_wexpand(rr1, topn) == NULL) + goto err; +- if (bn_wexpand(rr2, 16) == NULL) ++ if (bn_wexpand(rr2, topn) == NULL) + goto err; + + /* Ensure that montgomery contexts are initialized */ +@@ -1469,14 +1477,14 @@ int BN_mod_exp_mont_consttime_x2(BIGNUM *rr1, const BIGNUM *a1, const BIGNUM *p1 + mont1->RR.d, mont1->n0[0], + rr2->d, a2->d, p2->d, m2->d, + mont2->RR.d, mont2->n0[0], +- 1024 /* factor bit size */); ++ mod_bits); + +- rr1->top = 16; ++ rr1->top = topn; + rr1->neg = 0; + bn_correct_top(rr1); + bn_check_top(rr1); + +- rr2->top = 16; ++ rr2->top = topn; + rr2->neg = 0; + bn_correct_top(rr2); + bn_check_top(rr2); +diff --git a/crypto/bn/build.info b/crypto/bn/build.info +index c4ba51b265..47cbe1bed8 100644 +--- a/crypto/bn/build.info ++++ b/crypto/bn/build.info +@@ -24,7 +24,7 @@ IF[{- !$disabled{asm} -}] + + $BNASM_x86_64=\ + x86_64-mont.s x86_64-mont5.s x86_64-gf2m.s rsaz_exp.c rsaz-x86_64.s \ +- rsaz-avx2.s rsaz_exp_x2.c rsaz-avx512.s ++ rsaz-avx2.s rsaz_exp_x2.c rsaz-2k-avx512.s rsaz-3k-avx512.s rsaz-4k-avx512.s + IF[{- $config{target} !~ /^VC/ -}] + $BNASM_x86_64=asm/x86_64-gcc.c $BNASM_x86_64 + ELSE +@@ -155,7 +155,9 @@ GENERATE[x86_64-mont5.s]=asm/x86_64-mont5.pl + GENERATE[x86_64-gf2m.s]=asm/x86_64-gf2m.pl + GENERATE[rsaz-x86_64.s]=asm/rsaz-x86_64.pl + GENERATE[rsaz-avx2.s]=asm/rsaz-avx2.pl +-GENERATE[rsaz-avx512.s]=asm/rsaz-avx512.pl ++GENERATE[rsaz-2k-avx512.s]=asm/rsaz-2k-avx512.pl ++GENERATE[rsaz-3k-avx512.s]=asm/rsaz-3k-avx512.pl ++GENERATE[rsaz-4k-avx512.s]=asm/rsaz-4k-avx512.pl + + GENERATE[bn-ia64.s]=asm/ia64.S + GENERATE[ia64-mont.s]=asm/ia64-mont.pl +diff --git a/crypto/bn/rsaz_exp_x2.c b/crypto/bn/rsaz_exp_x2.c +index b19050dfee..8490dfe992 100644 +--- a/crypto/bn/rsaz_exp_x2.c ++++ b/crypto/bn/rsaz_exp_x2.c +@@ -1,6 +1,6 @@ + /* + * Copyright 2020-2023 The OpenSSL Project Authors. All Rights Reserved. +- * Copyright (c) 2020, Intel Corporation. All Rights Reserved. ++ * Copyright (c) 2020-2021, Intel Corporation. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy +@@ -8,7 +8,8 @@ + * https://www.openssl.org/source/license.html + * + * +- * Originally written by Ilya Albrekht, Sergey Kirillov and Andrey Matyukov ++ * Originally written by Sergey Kirillov and Andrey Matyukov. 
++ * Special thanks to Ilya Albrekht for his valuable hints. + * Intel Corporation + * + */ +@@ -42,8 +43,12 @@ NON_EMPTY_TRANSLATION_UNIT + # define BITS2WORD8_SIZE(x) (((x) + 7) >> 3) + # define BITS2WORD64_SIZE(x) (((x) + 63) >> 6) + +-static ossl_inline uint64_t get_digit52(const uint8_t *in, int in_len); +-static ossl_inline void put_digit52(uint8_t *out, int out_len, uint64_t digit); ++/* Number of registers required to hold |digits_num| amount of qword digits */ ++# define NUMBER_OF_REGISTERS(digits_num, register_size) \ ++ (((digits_num) * 64 + (register_size) - 1) / (register_size)) ++ ++static ossl_inline uint64_t get_digit(const uint8_t *in, int in_len); ++static ossl_inline void put_digit(uint8_t *out, int out_len, uint64_t digit); + static void to_words52(BN_ULONG *out, int out_len, const BN_ULONG *in, + int in_bitsize); + static void from_words52(BN_ULONG *bn_out, int out_bitsize, const BN_ULONG *in); +@@ -55,37 +60,52 @@ static ossl_inline int number_of_digits(int bitsize, int digit_size) + return (bitsize + digit_size - 1) / digit_size; + } + +-typedef void (*AMM52)(BN_ULONG *res, const BN_ULONG *base, +- const BN_ULONG *exp, const BN_ULONG *m, BN_ULONG k0); +-typedef void (*EXP52_x2)(BN_ULONG *res, const BN_ULONG *base, +- const BN_ULONG *exp[2], const BN_ULONG *m, +- const BN_ULONG *rr, const BN_ULONG k0[2]); +- + /* + * For details of the methods declared below please refer to + * crypto/bn/asm/rsaz-avx512.pl + * +- * Naming notes: ++ * Naming conventions: + * amm = Almost Montgomery Multiplication + * ams = Almost Montgomery Squaring +- * 52x20 - data represented as array of 20 digits in 52-bit radix ++ * 52xZZ - data represented as array of ZZ digits in 52-bit radix + * _x1_/_x2_ - 1 or 2 independent inputs/outputs +- * _256 suffix - uses 256-bit (AVX512VL) registers ++ * _ifma256 - uses 256-bit wide IFMA ISA (AVX512_IFMA256) + */ + +-/*AMM = Almost Montgomery Multiplication. 
*/ +-void ossl_rsaz_amm52x20_x1_256(BN_ULONG *res, const BN_ULONG *base, +- const BN_ULONG *exp, const BN_ULONG *m, +- BN_ULONG k0); +-static void RSAZ_exp52x20_x2_256(BN_ULONG *res, const BN_ULONG *base, +- const BN_ULONG *exp[2], const BN_ULONG *m, +- const BN_ULONG *rr, const BN_ULONG k0[2]); +-void ossl_rsaz_amm52x20_x2_256(BN_ULONG *out, const BN_ULONG *a, +- const BN_ULONG *b, const BN_ULONG *m, +- const BN_ULONG k0[2]); ++void ossl_rsaz_amm52x20_x1_ifma256(BN_ULONG *res, const BN_ULONG *a, ++ const BN_ULONG *b, const BN_ULONG *m, ++ BN_ULONG k0); ++void ossl_rsaz_amm52x20_x2_ifma256(BN_ULONG *out, const BN_ULONG *a, ++ const BN_ULONG *b, const BN_ULONG *m, ++ const BN_ULONG k0[2]); + void ossl_extract_multiplier_2x20_win5(BN_ULONG *red_Y, + const BN_ULONG *red_table, +- int red_table_idx, int tbl_idx); ++ int red_table_idx1, int red_table_idx2); ++ ++void ossl_rsaz_amm52x30_x1_ifma256(BN_ULONG *res, const BN_ULONG *a, ++ const BN_ULONG *b, const BN_ULONG *m, ++ BN_ULONG k0); ++void ossl_rsaz_amm52x30_x2_ifma256(BN_ULONG *out, const BN_ULONG *a, ++ const BN_ULONG *b, const BN_ULONG *m, ++ const BN_ULONG k0[2]); ++void ossl_extract_multiplier_2x30_win5(BN_ULONG *red_Y, ++ const BN_ULONG *red_table, ++ int red_table_idx1, int red_table_idx2); ++ ++void ossl_rsaz_amm52x40_x1_ifma256(BN_ULONG *res, const BN_ULONG *a, ++ const BN_ULONG *b, const BN_ULONG *m, ++ BN_ULONG k0); ++void ossl_rsaz_amm52x40_x2_ifma256(BN_ULONG *out, const BN_ULONG *a, ++ const BN_ULONG *b, const BN_ULONG *m, ++ const BN_ULONG k0[2]); ++void ossl_extract_multiplier_2x40_win5(BN_ULONG *red_Y, ++ const BN_ULONG *red_table, ++ int red_table_idx1, int red_table_idx2); ++ ++static int RSAZ_mod_exp_x2_ifma256(BN_ULONG *res, const BN_ULONG *base, ++ const BN_ULONG *exp[2], const BN_ULONG *m, ++ const BN_ULONG *rr, const BN_ULONG k0[2], ++ int modulus_bitsize); + + /* + * Dual Montgomery modular exponentiation using prime moduli of the +@@ -98,7 +118,10 @@ void ossl_extract_multiplier_2x20_win5(BN_ULONG *red_Y, + * + * Each moduli shall be |factor_size| bit size. + * +- * NOTE: currently only 2x1024 case is supported. ++ * Supported cases: ++ * - 2x1024 ++ * - 2x1536 ++ * - 2x2048 + * + * [out] res|i| - result of modular exponentiation: array of qword values + * in regular (2^64) radix. 
Size of array shall be enough +@@ -127,6 +150,8 @@ int ossl_rsaz_mod_exp_avx512_x2(BN_ULONG *res1, + BN_ULONG k0_2, + int factor_size) + { ++ typedef void (*AMM)(BN_ULONG *res, const BN_ULONG *a, ++ const BN_ULONG *b, const BN_ULONG *m, BN_ULONG k0); + int ret = 0; + + /* +@@ -135,52 +160,60 @@ int ossl_rsaz_mod_exp_avx512_x2(BN_ULONG *res1, + */ + int exp_digits = number_of_digits(factor_size + 2, DIGIT_SIZE); + int coeff_pow = 4 * (DIGIT_SIZE * exp_digits - factor_size); ++ ++ /* Number of YMM registers required to store exponent's digits */ ++ int ymm_regs_num = NUMBER_OF_REGISTERS(exp_digits, 256 /* ymm bit size */); ++ /* Capacity of the register set (in qwords) to store exponent */ ++ int regs_capacity = ymm_regs_num * 4; ++ + BN_ULONG *base1_red, *m1_red, *rr1_red; + BN_ULONG *base2_red, *m2_red, *rr2_red; + BN_ULONG *coeff_red; + BN_ULONG *storage = NULL; + BN_ULONG *storage_aligned = NULL; +- BN_ULONG storage_len_bytes = 7 * exp_digits * sizeof(BN_ULONG); +- +- /* AMM = Almost Montgomery Multiplication */ +- AMM52 amm = NULL; +- /* Dual (2-exps in parallel) exponentiation */ +- EXP52_x2 exp_x2 = NULL; ++ int storage_len_bytes = 7 * regs_capacity * sizeof(BN_ULONG) ++ + 64 /* alignment */; + + const BN_ULONG *exp[2] = {0}; + BN_ULONG k0[2] = {0}; ++ /* AMM = Almost Montgomery Multiplication */ ++ AMM amm = NULL; + +- /* Only 1024-bit factor size is supported now */ + switch (factor_size) { + case 1024: +- amm = ossl_rsaz_amm52x20_x1_256; +- exp_x2 = RSAZ_exp52x20_x2_256; ++ amm = ossl_rsaz_amm52x20_x1_ifma256; ++ break; ++ case 1536: ++ amm = ossl_rsaz_amm52x30_x1_ifma256; ++ break; ++ case 2048: ++ amm = ossl_rsaz_amm52x40_x1_ifma256; + break; + default: + goto err; + } + +- storage = (BN_ULONG *)OPENSSL_malloc(storage_len_bytes + 64); ++ storage = (BN_ULONG *)OPENSSL_malloc(storage_len_bytes); + if (storage == NULL) + goto err; + storage_aligned = (BN_ULONG *)ALIGN_OF(storage, 64); + + /* Memory layout for red(undant) representations */ + base1_red = storage_aligned; +- base2_red = storage_aligned + 1 * exp_digits; +- m1_red = storage_aligned + 2 * exp_digits; +- m2_red = storage_aligned + 3 * exp_digits; +- rr1_red = storage_aligned + 4 * exp_digits; +- rr2_red = storage_aligned + 5 * exp_digits; +- coeff_red = storage_aligned + 6 * exp_digits; ++ base2_red = storage_aligned + 1 * regs_capacity; ++ m1_red = storage_aligned + 2 * regs_capacity; ++ m2_red = storage_aligned + 3 * regs_capacity; ++ rr1_red = storage_aligned + 4 * regs_capacity; ++ rr2_red = storage_aligned + 5 * regs_capacity; ++ coeff_red = storage_aligned + 6 * regs_capacity; + + /* Convert base_i, m_i, rr_i, from regular to 52-bit radix */ +- to_words52(base1_red, exp_digits, base1, factor_size); +- to_words52(base2_red, exp_digits, base2, factor_size); +- to_words52(m1_red, exp_digits, m1, factor_size); +- to_words52(m2_red, exp_digits, m2, factor_size); +- to_words52(rr1_red, exp_digits, rr1, factor_size); +- to_words52(rr2_red, exp_digits, rr2, factor_size); ++ to_words52(base1_red, regs_capacity, base1, factor_size); ++ to_words52(base2_red, regs_capacity, base2, factor_size); ++ to_words52(m1_red, regs_capacity, m1, factor_size); ++ to_words52(m2_red, regs_capacity, m2, factor_size); ++ to_words52(rr1_red, regs_capacity, rr1, factor_size); ++ to_words52(rr2_red, regs_capacity, rr2, factor_size); + + /* + * Compute target domain Montgomery converters RR' for each modulus +@@ -193,10 +226,10 @@ int ossl_rsaz_mod_exp_avx512_x2(BN_ULONG *res1, + * where + * k = 4 * (52 * digits52 - modlen) + * R = 2^(64 * 
ceil(modlen/64)) mod m +- * RR = R^2 mod M ++ * RR = R^2 mod m + * R' = 2^(52 * ceil(modlen/52)) mod m + * +- * modlen = 1024: k = 64, RR = 2^2048 mod m, RR' = 2^2080 mod m ++ * EX/ modlen = 1024: k = 64, RR = 2^2048 mod m, RR' = 2^2080 mod m + */ + memset(coeff_red, 0, exp_digits * sizeof(BN_ULONG)); + /* (1) in reduced domain representation */ +@@ -214,7 +247,11 @@ int ossl_rsaz_mod_exp_avx512_x2(BN_ULONG *res1, + k0[0] = k0_1; + k0[1] = k0_2; + +- exp_x2(rr1_red, base1_red, exp, m1_red, rr1_red, k0); ++ /* Dual (2-exps in parallel) exponentiation */ ++ ret = RSAZ_mod_exp_x2_ifma256(rr1_red, base1_red, exp, m1_red, rr1_red, ++ k0, factor_size); ++ if (!ret) ++ goto err; + + /* Convert rr_i back to regular radix */ + from_words52(res1, factor_size, rr1_red); +@@ -226,7 +263,6 @@ int ossl_rsaz_mod_exp_avx512_x2(BN_ULONG *res1, + bn_reduce_once_in_place(res1, /*carry=*/0, m1, storage, factor_size); + bn_reduce_once_in_place(res2, /*carry=*/0, m2, storage, factor_size); + +- ret = 1; + err: + if (storage != NULL) { + OPENSSL_cleanse(storage, storage_len_bytes); +@@ -236,91 +272,149 @@ err: + } + + /* +- * Dual 1024-bit w-ary modular exponentiation using prime moduli of the same +- * bit size using Almost Montgomery Multiplication, optimized with AVX512_IFMA +- * ISA. ++ * Dual {1024,1536,2048}-bit w-ary modular exponentiation using prime moduli of ++ * the same bit size using Almost Montgomery Multiplication, optimized with ++ * AVX512_IFMA256 ISA. + * + * The parameter w (window size) = 5. + * +- * [out] res - result of modular exponentiation: 2x20 qword ++ * [out] res - result of modular exponentiation: 2x{20,30,40} qword + * values in 2^52 radix. +- * [in] base - base (2x20 qword values in 2^52 radix) +- * [in] exp - array of 2 pointers to 16 qword values in 2^64 radix. ++ * [in] base - base (2x{20,30,40} qword values in 2^52 radix) ++ * [in] exp - array of 2 pointers to {16,24,32} qword values in 2^64 radix. + * Exponent is not converted to redundant representation. +- * [in] m - moduli (2x20 qword values in 2^52 radix) +- * [in] rr - Montgomery parameter for 2 moduli: RR = 2^2080 mod m. +- * (2x20 qword values in 2^52 radix) ++ * [in] m - moduli (2x{20,30,40} qword values in 2^52 radix) ++ * [in] rr - Montgomery parameter for 2 moduli: ++ * RR(1024) = 2^2080 mod m. ++ * RR(1536) = 2^3120 mod m. ++ * RR(2048) = 2^4160 mod m. ++ * (2x{20,30,40} qword values in 2^52 radix) + * [in] k0 - Montgomery parameter for 2 moduli: k0 = -1/m mod 2^64 + * + * \return (void). 
+ */ +-static void RSAZ_exp52x20_x2_256(BN_ULONG *out, /* [2][20] */ +- const BN_ULONG *base, /* [2][20] */ +- const BN_ULONG *exp[2], /* 2x16 */ +- const BN_ULONG *m, /* [2][20] */ +- const BN_ULONG *rr, /* [2][20] */ +- const BN_ULONG k0[2]) ++int RSAZ_mod_exp_x2_ifma256(BN_ULONG *out, ++ const BN_ULONG *base, ++ const BN_ULONG *exp[2], ++ const BN_ULONG *m, ++ const BN_ULONG *rr, ++ const BN_ULONG k0[2], ++ int modulus_bitsize) + { +-# define BITSIZE_MODULUS (1024) +-# define EXP_WIN_SIZE (5) +-# define EXP_WIN_MASK ((1U << EXP_WIN_SIZE) - 1) +-/* +- * Number of digits (64-bit words) in redundant representation to handle +- * modulus bits +- */ +-# define RED_DIGITS (20) +-# define EXP_DIGITS (16) +-# define DAMM ossl_rsaz_amm52x20_x2_256 ++ typedef void (*DAMM)(BN_ULONG *res, const BN_ULONG *a, ++ const BN_ULONG *b, const BN_ULONG *m, ++ const BN_ULONG k0[2]); ++ typedef void (*DEXTRACT)(BN_ULONG *res, const BN_ULONG *red_table, ++ int red_table_idx, int tbl_idx); ++ ++ int ret = 0; ++ int idx; ++ ++ /* Exponent window size */ ++ int exp_win_size = 5; ++ int exp_win_mask = (1U << exp_win_size) - 1; ++ ++ /* ++ * Number of digits (64-bit words) in redundant representation to handle ++ * modulus bits ++ */ ++ int red_digits = 0; ++ int exp_digits = 0; ++ ++ BN_ULONG *storage = NULL; ++ BN_ULONG *storage_aligned = NULL; ++ int storage_len_bytes = 0; ++ ++ /* Red(undant) result Y and multiplier X */ ++ BN_ULONG *red_Y = NULL; /* [2][red_digits] */ ++ BN_ULONG *red_X = NULL; /* [2][red_digits] */ ++ /* Pre-computed table of base powers */ ++ BN_ULONG *red_table = NULL; /* [1U << exp_win_size][2][red_digits] */ ++ /* Expanded exponent */ ++ BN_ULONG *expz = NULL; /* [2][exp_digits + 1] */ ++ ++ /* Dual AMM */ ++ DAMM damm = NULL; ++ /* Extractor from red_table */ ++ DEXTRACT extract = NULL; ++ + /* + * Squaring is done using multiplication now. That can be a subject of + * optimization in future. 
+ */ +-# define DAMS(r,a,m,k0) \ +- ossl_rsaz_amm52x20_x2_256((r),(a),(a),(m),(k0)) +- +- /* Allocate stack for red(undant) result Y and multiplier X */ +- ALIGN64 BN_ULONG red_Y[2][RED_DIGITS]; +- ALIGN64 BN_ULONG red_X[2][RED_DIGITS]; ++# define DAMS(r,a,m,k0) damm((r),(a),(a),(m),(k0)) + +- /* Allocate expanded exponent */ +- ALIGN64 BN_ULONG expz[2][EXP_DIGITS + 1]; ++ switch (modulus_bitsize) { ++ case 1024: ++ red_digits = 20; ++ exp_digits = 16; ++ damm = ossl_rsaz_amm52x20_x2_ifma256; ++ extract = ossl_extract_multiplier_2x20_win5; ++ break; ++ case 1536: ++ /* Extended with 2 digits padding to avoid mask ops in high YMM register */ ++ red_digits = 30 + 2; ++ exp_digits = 24; ++ damm = ossl_rsaz_amm52x30_x2_ifma256; ++ extract = ossl_extract_multiplier_2x30_win5; ++ break; ++ case 2048: ++ red_digits = 40; ++ exp_digits = 32; ++ damm = ossl_rsaz_amm52x40_x2_ifma256; ++ extract = ossl_extract_multiplier_2x40_win5; ++ break; ++ default: ++ goto err; ++ } + +- /* Pre-computed table of base powers */ +- ALIGN64 BN_ULONG red_table[1U << EXP_WIN_SIZE][2][RED_DIGITS]; ++ storage_len_bytes = (2 * red_digits /* red_Y */ ++ + 2 * red_digits /* red_X */ ++ + 2 * red_digits * (1U << exp_win_size) /* red_table */ ++ + 2 * (exp_digits + 1)) /* expz */ ++ * sizeof(BN_ULONG) ++ + 64; /* alignment */ + +- int idx; ++ storage = (BN_ULONG *)OPENSSL_zalloc(storage_len_bytes); ++ if (storage == NULL) ++ goto err; ++ storage_aligned = (BN_ULONG *)ALIGN_OF(storage, 64); + +- memset(red_Y, 0, sizeof(red_Y)); +- memset(red_table, 0, sizeof(red_table)); +- memset(red_X, 0, sizeof(red_X)); ++ red_Y = storage_aligned; ++ red_X = red_Y + 2 * red_digits; ++ red_table = red_X + 2 * red_digits; ++ expz = red_table + 2 * red_digits * (1U << exp_win_size); + + /* + * Compute table of powers base^i, i = 0, ..., (2^EXP_WIN_SIZE) - 1 + * table[0] = mont(x^0) = mont(1) + * table[1] = mont(x^1) = mont(x) + */ +- red_X[0][0] = 1; +- red_X[1][0] = 1; +- DAMM(red_table[0][0], (const BN_ULONG*)red_X, rr, m, k0); +- DAMM(red_table[1][0], base, rr, m, k0); +- +- for (idx = 1; idx < (int)((1U << EXP_WIN_SIZE) / 2); idx++) { +- DAMS(red_table[2 * idx + 0][0], red_table[1 * idx][0], m, k0); +- DAMM(red_table[2 * idx + 1][0], red_table[2 * idx][0], red_table[1][0], m, k0); ++ red_X[0 * red_digits] = 1; ++ red_X[1 * red_digits] = 1; ++ damm(&red_table[0 * 2 * red_digits], (const BN_ULONG*)red_X, rr, m, k0); ++ damm(&red_table[1 * 2 * red_digits], base, rr, m, k0); ++ ++ for (idx = 1; idx < (int)((1U << exp_win_size) / 2); idx++) { ++ DAMS(&red_table[(2 * idx + 0) * 2 * red_digits], ++ &red_table[(1 * idx) * 2 * red_digits], m, k0); ++ damm(&red_table[(2 * idx + 1) * 2 * red_digits], ++ &red_table[(2 * idx) * 2 * red_digits], ++ &red_table[1 * 2 * red_digits], m, k0); + } + + /* Copy and expand exponents */ +- memcpy(expz[0], exp[0], EXP_DIGITS * sizeof(BN_ULONG)); +- expz[0][EXP_DIGITS] = 0; +- memcpy(expz[1], exp[1], EXP_DIGITS * sizeof(BN_ULONG)); +- expz[1][EXP_DIGITS] = 0; ++ memcpy(&expz[0 * (exp_digits + 1)], exp[0], exp_digits * sizeof(BN_ULONG)); ++ expz[1 * (exp_digits + 1) - 1] = 0; ++ memcpy(&expz[1 * (exp_digits + 1)], exp[1], exp_digits * sizeof(BN_ULONG)); ++ expz[2 * (exp_digits + 1) - 1] = 0; + + /* Exponentiation */ + { +- const int rem = BITSIZE_MODULUS % EXP_WIN_SIZE; +- BN_ULONG table_idx_mask = EXP_WIN_MASK; ++ const int rem = modulus_bitsize % exp_win_size; ++ BN_ULONG table_idx_mask = exp_win_mask; + +- int exp_bit_no = BITSIZE_MODULUS - rem; ++ int exp_bit_no = modulus_bitsize - rem; + int exp_chunk_no = 
exp_bit_no / 64; + int exp_chunk_shift = exp_bit_no % 64; + +@@ -337,8 +431,8 @@ static void RSAZ_exp52x20_x2_256(BN_ULONG *out, /* [2][20] */ + OPENSSL_assert(rem != 0); + + /* Process 1-st exp window - just init result */ +- red_table_idx_0 = expz[0][exp_chunk_no]; +- red_table_idx_1 = expz[1][exp_chunk_no]; ++ red_table_idx_0 = expz[exp_chunk_no + 0 * (exp_digits + 1)]; ++ red_table_idx_1 = expz[exp_chunk_no + 1 * (exp_digits + 1)]; + /* + * The function operates with fixed moduli sizes divisible by 64, + * thus table index here is always in supported range [0, EXP_WIN_SIZE). +@@ -346,13 +440,10 @@ static void RSAZ_exp52x20_x2_256(BN_ULONG *out, /* [2][20] */ + red_table_idx_0 >>= exp_chunk_shift; + red_table_idx_1 >>= exp_chunk_shift; + +- ossl_extract_multiplier_2x20_win5(red_Y[0], (const BN_ULONG*)red_table, +- (int)red_table_idx_0, 0); +- ossl_extract_multiplier_2x20_win5(red_Y[1], (const BN_ULONG*)red_table, +- (int)red_table_idx_1, 1); ++ extract(&red_Y[0 * red_digits], (const BN_ULONG*)red_table, (int)red_table_idx_0, (int)red_table_idx_1); + + /* Process other exp windows */ +- for (exp_bit_no -= EXP_WIN_SIZE; exp_bit_no >= 0; exp_bit_no -= EXP_WIN_SIZE) { ++ for (exp_bit_no -= exp_win_size; exp_bit_no >= 0; exp_bit_no -= exp_win_size) { + /* Extract pre-computed multiplier from the table */ + { + BN_ULONG T; +@@ -360,43 +451,37 @@ static void RSAZ_exp52x20_x2_256(BN_ULONG *out, /* [2][20] */ + exp_chunk_no = exp_bit_no / 64; + exp_chunk_shift = exp_bit_no % 64; + { +- red_table_idx_0 = expz[0][exp_chunk_no]; +- T = expz[0][exp_chunk_no + 1]; ++ red_table_idx_0 = expz[exp_chunk_no + 0 * (exp_digits + 1)]; ++ T = expz[exp_chunk_no + 1 + 0 * (exp_digits + 1)]; + + red_table_idx_0 >>= exp_chunk_shift; + /* + * Get additional bits from then next quadword + * when 64-bit boundaries are crossed. + */ +- if (exp_chunk_shift > 64 - EXP_WIN_SIZE) { ++ if (exp_chunk_shift > 64 - exp_win_size) { + T <<= (64 - exp_chunk_shift); + red_table_idx_0 ^= T; + } + red_table_idx_0 &= table_idx_mask; +- +- ossl_extract_multiplier_2x20_win5(red_X[0], +- (const BN_ULONG*)red_table, +- (int)red_table_idx_0, 0); + } + { +- red_table_idx_1 = expz[1][exp_chunk_no]; +- T = expz[1][exp_chunk_no + 1]; ++ red_table_idx_1 = expz[exp_chunk_no + 1 * (exp_digits + 1)]; ++ T = expz[exp_chunk_no + 1 + 1 * (exp_digits + 1)]; + + red_table_idx_1 >>= exp_chunk_shift; + /* + * Get additional bits from then next quadword + * when 64-bit boundaries are crossed. 
+ */ +- if (exp_chunk_shift > 64 - EXP_WIN_SIZE) { ++ if (exp_chunk_shift > 64 - exp_win_size) { + T <<= (64 - exp_chunk_shift); + red_table_idx_1 ^= T; + } + red_table_idx_1 &= table_idx_mask; +- +- ossl_extract_multiplier_2x20_win5(red_X[1], +- (const BN_ULONG*)red_table, +- (int)red_table_idx_1, 1); + } ++ ++ extract(&red_X[0 * red_digits], (const BN_ULONG*)red_table, (int)red_table_idx_0, (int)red_table_idx_1); + } + + /* Series of squaring */ +@@ -406,43 +491,46 @@ static void RSAZ_exp52x20_x2_256(BN_ULONG *out, /* [2][20] */ + DAMS((BN_ULONG*)red_Y, (const BN_ULONG*)red_Y, m, k0); + DAMS((BN_ULONG*)red_Y, (const BN_ULONG*)red_Y, m, k0); + +- DAMM((BN_ULONG*)red_Y, (const BN_ULONG*)red_Y, (const BN_ULONG*)red_X, m, k0); ++ damm((BN_ULONG*)red_Y, (const BN_ULONG*)red_Y, (const BN_ULONG*)red_X, m, k0); + } + } + + /* + * + * NB: After the last AMM of exponentiation in Montgomery domain, the result +- * may be 1025-bit, but the conversion out of Montgomery domain performs an +- * AMM(x,1) which guarantees that the final result is less than |m|, so no +- * conditional subtraction is needed here. See "Efficient Software +- * Implementations of Modular Exponentiation" (by Shay Gueron) paper for details. ++ * may be (modulus_bitsize + 1), but the conversion out of Montgomery domain ++ * performs an AMM(x,1) which guarantees that the final result is less than ++ * |m|, so no conditional subtraction is needed here. See [1] for details. ++ * ++ * [1] Gueron, S. Efficient software implementations of modular exponentiation. ++ * DOI: 10.1007/s13389-012-0031-5 + */ + + /* Convert result back in regular 2^52 domain */ +- memset(red_X, 0, sizeof(red_X)); +- red_X[0][0] = 1; +- red_X[1][0] = 1; +- DAMM(out, (const BN_ULONG*)red_Y, (const BN_ULONG*)red_X, m, k0); +- +- /* Clear exponents */ +- OPENSSL_cleanse(expz, sizeof(expz)); +- OPENSSL_cleanse(red_Y, sizeof(red_Y)); +- +-# undef DAMS +-# undef DAMM +-# undef EXP_DIGITS +-# undef RED_DIGITS +-# undef EXP_WIN_MASK +-# undef EXP_WIN_SIZE +-# undef BITSIZE_MODULUS ++ memset(red_X, 0, 2 * red_digits * sizeof(BN_ULONG)); ++ red_X[0 * red_digits] = 1; ++ red_X[1 * red_digits] = 1; ++ damm(out, (const BN_ULONG*)red_Y, (const BN_ULONG*)red_X, m, k0); ++ ++ ret = 1; ++ ++err: ++ if (storage != NULL) { ++ /* Clear whole storage */ ++ OPENSSL_cleanse(storage, storage_len_bytes); ++ OPENSSL_free(storage); ++ } ++ ++#undef DAMS ++ return ret; + } + +-static ossl_inline uint64_t get_digit52(const uint8_t *in, int in_len) ++static ossl_inline uint64_t get_digit(const uint8_t *in, int in_len) + { + uint64_t digit = 0; + + assert(in != NULL); ++ assert(in_len <= 8); + + for (; in_len > 0; in_len--) { + digit <<= 8; +@@ -480,17 +568,17 @@ static void to_words52(BN_ULONG *out, int out_len, + } + + if (in_bitsize > DIGIT_SIZE) { +- uint64_t digit = get_digit52(in_str, 7); ++ uint64_t digit = get_digit(in_str, 7); + + out[0] = digit & DIGIT_MASK; + in_str += 6; + in_bitsize -= DIGIT_SIZE; +- digit = get_digit52(in_str, BITS2WORD8_SIZE(in_bitsize)); ++ digit = get_digit(in_str, BITS2WORD8_SIZE(in_bitsize)); + out[1] = digit >> 4; + out += 2; + out_len -= 2; + } else if (in_bitsize > 0) { +- out[0] = get_digit52(in_str, BITS2WORD8_SIZE(in_bitsize)); ++ out[0] = get_digit(in_str, BITS2WORD8_SIZE(in_bitsize)); + out++; + out_len--; + } +@@ -502,12 +590,13 @@ static void to_words52(BN_ULONG *out, int out_len, + } + } + +-static ossl_inline void put_digit52(uint8_t *pStr, int strLen, uint64_t digit) ++static ossl_inline void put_digit(uint8_t *out, int out_len, uint64_t 
digit) + { +- assert(pStr != NULL); ++ assert(out != NULL); ++ assert(out_len <= 8); + +- for (; strLen > 0; strLen--) { +- *pStr++ = (uint8_t)(digit & 0xFF); ++ for (; out_len > 0; out_len--) { ++ *out++ = (uint8_t)(digit & 0xFF); + digit >>= 8; + } + } +@@ -543,13 +632,13 @@ static void from_words52(BN_ULONG *out, int out_bitsize, const BN_ULONG *in) + } + + if (out_bitsize > DIGIT_SIZE) { +- put_digit52(out_str, 7, in[0]); ++ put_digit(out_str, 7, in[0]); + out_str += 6; + out_bitsize -= DIGIT_SIZE; +- put_digit52(out_str, BITS2WORD8_SIZE(out_bitsize), ++ put_digit(out_str, BITS2WORD8_SIZE(out_bitsize), + (in[1] << 4 | in[0] >> 48)); + } else if (out_bitsize) { +- put_digit52(out_str, BITS2WORD8_SIZE(out_bitsize), in[0]); ++ put_digit(out_str, BITS2WORD8_SIZE(out_bitsize), in[0]); + } + } + } +diff --git a/test/exptest.c b/test/exptest.c +index 59285b17a3..143dfa9958 100644 +--- a/test/exptest.c ++++ b/test/exptest.c +@@ -252,11 +252,12 @@ static int test_mod_exp_x2(int idx) + BIGNUM *m2 = NULL; + int factor_size = 0; + +- /* +- * Currently only 1024-bit factor size is supported. +- */ + if (idx <= 100) + factor_size = 1024; ++ else if (idx <= 200) ++ factor_size = 1536; ++ else if (idx <= 300) ++ factor_size = 2048; + + if (!TEST_ptr(ctx = BN_CTX_new())) + goto err; +@@ -332,6 +333,6 @@ int setup_tests(void) + { + ADD_TEST(test_mod_exp_zero); + ADD_ALL_TESTS(test_mod_exp, 200); +- ADD_ALL_TESTS(test_mod_exp_x2, 100); ++ ADD_ALL_TESTS(test_mod_exp_x2, 300); + return 1; + } + +base-commit: 245cb0291e0db99d9ccf3692fa76f440b2b054c2 +-- +2.39.2 + diff -Nru openssl-3.0.10/debian/patches/intel/0002-AES-GCM-enabled-with-AVX512-vAES-and-vPCLMULQDQ.patch openssl-3.0.10/debian/patches/intel/0002-AES-GCM-enabled-with-AVX512-vAES-and-vPCLMULQDQ.patch --- openssl-3.0.10/debian/patches/intel/0002-AES-GCM-enabled-with-AVX512-vAES-and-vPCLMULQDQ.patch 1970-01-01 00:00:00.000000000 +0000 +++ openssl-3.0.10/debian/patches/intel/0002-AES-GCM-enabled-with-AVX512-vAES-and-vPCLMULQDQ.patch 2023-08-08 15:51:48.000000000 +0000 @@ -0,0 +1,5311 @@ +From 949108dd73de321fb93c8d81b846a2a1d015a9fd Mon Sep 17 00:00:00 2001 +From: Andrey Matyukov +Date: Wed, 9 Jun 2021 14:38:40 -0700 +Subject: [PATCH 2/2] AES-GCM enabled with AVX512 vAES and vPCLMULQDQ. + +Vectorized 'stitched' encrypt + ghash implementation of AES-GCM enabled +with AVX512 vAES and vPCLMULQDQ instructions (available starting Intel's +IceLake micro-architecture). 
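The new code is only taken when ossl_vaes_vpclmulqdq_capable() reports the combined AVX512F/DQ/BW/VL + VAES + VPCLMULQDQ feature bits from OPENSSL_ia32cap_P. A rough stand-alone C approximation of that test, using the GCC/Clang <cpuid.h> helper (illustrative only; unlike OPENSSL_ia32cap_P it does not fold in the XCR0/OS check for AVX-512 register state):

#include <cpuid.h>
#include <stdio.h>

/*
 * CPUID leaf 7 / subleaf 0 must report AVX512F, AVX512DQ, AVX512BW and
 * AVX512VL (EBX bits 16, 17, 30, 31) plus VAES and VPCLMULQDQ
 * (ECX bits 9, 10) for the vectorized AES-GCM path to be usable.
 */
static int vaes_vpclmulqdq_usable(void)
{
    unsigned int eax, ebx, ecx, edx;

    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
        return 0;

    const unsigned int ebx_need =
        (1u << 16) | (1u << 17) | (1u << 30) | (1u << 31);
    const unsigned int ecx_need = (1u << 9) | (1u << 10);

    return (ebx & ebx_need) == ebx_need && (ecx & ecx_need) == ecx_need;
}

int main(void)
{
    printf("VAES+VPCLMULQDQ AES-GCM path usable: %s\n",
           vaes_vpclmulqdq_usable() ? "yes" : "no");
    return 0;
}

On CPUs that predate Ice Lake this reports "no" and the existing AES-NI/AVX code paths continue to be used, so the patch adds a runtime-dispatched fast path rather than replacing anything.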
+ +The performance details for representative IceLake Server and Client +platforms are shown below + +Performance data: +OpenSSL Speed KBs/Sec +Intel(R) Xeon(R) Platinum 8380 CPU @ 2.30GHz (1Core/1Thread) +Payload in Bytes 16 64 256 1024 8192 16384 +AES-128-GCM + Baseline 478708.27 1118296.96 2428092.52 3518199.4 4172355.99 4235762.07 + Patched 534613.95 2009345.55 3775588.15 5059517.64 8476794.88 8941541.79 + Speedup 1.12 1.80 1.55 1.44 2.03 2.11 + +AES-256-GCM + Baseline 399237.27 961699.9 2136377.65 2979889.15 3554823.37 3617757.5 + Patched 475948.13 1720128.51 3462407.12 4696832.2 7532013.16 7924953.91 + Speedup 1.19 1.79 1.62 1.58 2.12 2.19 +Intel(R) Core(TM) i7-1065G7 CPU @ 1.30GHz (1Core/1Thread) +Payload in Bytes 16 64 256 1024 8192 16384 +AES-128-GCM + Baseline 259128.54 570756.43 1362554.16 1990654.57 2359128.88 2401671.58 + Patched 292139.47 1079320.95 2001974.63 2829007.46 4510318.59 4705314.41 + Speedup 1.13 1.89 1.47 1.42 1.91 1.96 +AES-256-GCM + Baseline 236000.34 550506.76 1234638.08 1716734.57 2011255.6 2028099.99 + Patched 247256.32 919731.34 1773270.43 2553239.55 3953115.14 4111227.29 + Speedup 1.05 1.67 1.44 1.49 1.97 2.03 + +Reviewed-by: TJ O'Dwyer, Marcel Cornu, Pablo de Lara +Reviewed-by: Paul Dale +Reviewed-by: Tomas Mraz +(Merged from https://github.com/openssl/openssl/pull/17239) + +Backported by Simon Chopin + +Bug-Ubuntu: https://bugs.launchpad.net/ubuntu/+source/openssl/+bug/2030784 +Origin: https://github.com/openssl/openssl/pull/17239 +Applied-Upstream: 3.1.0 +--- + crypto/modes/asm/aes-gcm-avx512.pl | 4975 +++++++++++++++++ + crypto/modes/build.info | 3 +- + include/crypto/modes.h | 4 +- + .../ciphers/cipher_aes_gcm_hw_aesni.inc | 13 +- + .../ciphers/cipher_aes_gcm_hw_vaes_avx512.inc | 205 + + 5 files changed, 5195 insertions(+), 5 deletions(-) + create mode 100644 crypto/modes/asm/aes-gcm-avx512.pl + create mode 100644 providers/implementations/ciphers/cipher_aes_gcm_hw_vaes_avx512.inc + +diff --git a/crypto/modes/asm/aes-gcm-avx512.pl b/crypto/modes/asm/aes-gcm-avx512.pl +new file mode 100644 +index 0000000000..1c7ee8769a +--- /dev/null ++++ b/crypto/modes/asm/aes-gcm-avx512.pl +@@ -0,0 +1,4975 @@ ++# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. ++# Copyright (c) 2021, Intel Corporation. All Rights Reserved. ++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++# ++# ++# This implementation is based on the AES-GCM code (AVX512VAES + VPCLMULQDQ) ++# from Intel(R) Multi-Buffer Crypto for IPsec Library v1.1 ++# (https://github.com/intel/intel-ipsec-mb). ++# Original author is Tomasz Kantecki . ++# ++# References: ++# [1] Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on ++# Intel Architecture Processors. August, 2010. ++# [2] Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on ++# Intel Architecture Processors. October, 2012. ++# [3] Shay Gueron et. al. Intel Carry-Less Multiplication Instruction and its ++# Usage for Computing the GCM Mode. May, 2010. ++# ++# ++# December 2021 ++# ++# Initial release. ++# ++# GCM128_CONTEXT structure has storage for 16 hkeys only, but this ++# implementation can use up to 48. To avoid extending the context size, ++# precompute and store in the context first 16 hkeys only, and compute the rest ++# on demand keeping them in the local frame. 
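The hkeys mentioned above are consecutive powers H^i of the GHASH key: with up to 48 of them available, up to 48 ciphertext blocks can be folded into the hash as independent carry-less multiplications followed by a single reduction, instead of the strictly serial Horner recurrence. The regrouping relies only on distributivity, so a toy GF(2^8) model (AES polynomial 0x11b) is enough to demonstrate it; the real code does the same thing in GF(2^128) with the GHASH bit conventions. A hedged, self-contained sketch (not the patch's code):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Multiply in GF(2^8) modulo x^8 + x^4 + x^3 + x + 1 (0x11b). */
static uint8_t gf8_mul(uint8_t a, uint8_t b)
{
    uint8_t p = 0;

    while (b) {
        if (b & 1)
            p ^= a;
        a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1b : 0x00));
        b >>= 1;
    }
    return p;
}

/* Serial Horner form: Y <- (Y ^ X[i]) * H for each block, one at a time. */
static uint8_t ghash_serial(uint8_t Y, const uint8_t *X, size_t n, uint8_t H)
{
    for (size_t i = 0; i < n; i++)
        Y = gf8_mul(Y ^ X[i], H);
    return Y;
}

/*
 * Aggregated form: (Y ^ X[0])*H^n ^ X[1]*H^(n-1) ^ ... ^ X[n-1]*H,
 * with Hpow[j] = H^(j+1) precomputed; all multiplications are independent.
 */
static uint8_t ghash_aggregated(uint8_t Y, const uint8_t *X, size_t n,
                                const uint8_t *Hpow)
{
    uint8_t acc = 0;

    for (size_t i = 0; i < n; i++) {
        uint8_t t = (i == 0) ? (uint8_t)(Y ^ X[0]) : X[i];
        acc ^= gf8_mul(t, Hpow[n - 1 - i]);
    }
    return acc;
}

int main(void)
{
    uint8_t H = 0x57, Y = 0xc3;
    uint8_t X[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    uint8_t Hpow[8];

    Hpow[0] = H;                              /* Hpow[j] = H^(j+1) */
    for (int j = 1; j < 8; j++)
        Hpow[j] = gf8_mul(Hpow[j - 1], H);

    printf("serial=%02x aggregated=%02x\n",   /* both values are equal */
           ghash_serial(Y, X, 8, H), ghash_aggregated(Y, X, 8, Hpow));
    return 0;
}

Both routines compute the same value; the aggregated form is the one the vectorized loop exploits, since its per-block multiplications no longer depend on one another.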
++# ++#====================================================================== ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; ++ ++$win64 = 0; ++$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); ++ ++$avx512vaes = 0; ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; ++$dir = $1; ++($xlate = "${dir}x86_64-xlate.pl" and -f $xlate) ++ or ($xlate = "${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) ++ or die "can't locate x86_64-xlate.pl"; ++ ++if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { ++ $avx512vaes = ($1 >= 2.30); ++} ++ ++if (!$avx512vaes ++ && $win64 ++ && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) ++ && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) ++{ ++ $avx512vaes = ($1 == 2.13 && $2 >= 3) + ($1 >= 2.14); ++} ++ ++if (!$avx512vaes && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { ++ $avx512vaes = ($2 >= 7.0); ++} ++ ++open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"" ++ or die "can't call $xlate: $!"; ++*STDOUT = *OUT; ++ ++#====================================================================== ++if ($avx512vaes>0) { #<<< ++ ++$code .= <<___; ++.extern OPENSSL_ia32cap_P ++.globl ossl_vaes_vpclmulqdq_capable ++.type ossl_vaes_vpclmulqdq_capable,\@abi-omnipotent ++.align 32 ++ossl_vaes_vpclmulqdq_capable: ++ mov OPENSSL_ia32cap_P+8(%rip), %rcx ++ # avx512vpclmulqdq + avx512vaes + avx512vl + avx512bw + avx512dq + avx512f ++ mov \$`1<<42|1<<41|1<<31|1<<30|1<<17|1<<16`,%rdx ++ xor %eax,%eax ++ and %rdx,%rcx ++ cmp %rdx,%rcx ++ cmove %rcx,%rax ++ ret ++.size ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable ++___ ++ ++# ; Mapping key length -> AES rounds count ++my %aes_rounds = ( ++ 128 => 9, ++ 192 => 11, ++ 256 => 13); ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ;;; Code generation control switches ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ ++# ; ABI-aware zeroing of volatile registers in EPILOG(). ++# ; Disabled due to performance reasons. ++my $CLEAR_SCRATCH_REGISTERS = 0; ++ ++# ; Zero HKeys storage from the stack if they are stored there ++my $CLEAR_HKEYS_STORAGE_ON_EXIT = 1; ++ ++# ; Enable / disable check of function arguments for null pointer ++# ; Currently disabled, as this check is handled outside. ++my $CHECK_FUNCTION_ARGUMENTS = 0; ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ;;; Global constants ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ ++# AES block size in bytes ++my $AES_BLOCK_SIZE = 16; ++ ++# Storage capacity in elements ++my $HKEYS_STORAGE_CAPACITY = 48; ++my $LOCAL_STORAGE_CAPACITY = 48; ++my $HKEYS_CONTEXT_CAPACITY = 16; ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ;;; Stack frame definition ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ ++# (1) -> +64(Win)/+48(Lin)-byte space for pushed GPRs ++# (2) -> +8-byte space for 16-byte alignment of XMM storage ++# (3) -> Frame pointer (%RBP) ++# (4) -> +160-byte XMM storage (Windows only, zero on Linux) ++# (5) -> +48-byte space for 64-byte alignment of %RSP from p.8 ++# (6) -> +768-byte LOCAL storage (optional, can be omitted in some functions) ++# (7) -> +768-byte HKEYS storage ++# (8) -> Stack pointer (%RSP) aligned on 64-byte boundary ++ ++my $GP_STORAGE = $win64 ? 
8 * 8 : 8 * 6; # ; space for saved non-volatile GP registers (pushed on stack) ++my $XMM_STORAGE = $win64 ? (10 * 16) : 0; # ; space for saved XMM registers ++my $HKEYS_STORAGE = ($HKEYS_STORAGE_CAPACITY * $AES_BLOCK_SIZE); # ; space for HKeys^i, i=1..48 ++my $LOCAL_STORAGE = ($LOCAL_STORAGE_CAPACITY * $AES_BLOCK_SIZE); # ; space for up to 48 AES blocks ++ ++my $STACK_HKEYS_OFFSET = 0; ++my $STACK_LOCAL_OFFSET = ($STACK_HKEYS_OFFSET + $HKEYS_STORAGE); ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ;;; Function arguments abstraction ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++my ($arg1, $arg2, $arg3, $arg4, $arg5, $arg6, $arg7, $arg8, $arg9, $arg10, $arg11); ++ ++# ; This implementation follows the convention: for non-leaf functions (they ++# ; must call PROLOG) %rbp is used as a frame pointer, and has fixed offset from ++# ; the function entry: $GP_STORAGE + [8 bytes alignment (Windows only)]. This ++# ; helps to facilitate SEH handlers writing. ++# ++# ; Leaf functions here do not use more than 4 input arguments. ++if ($win64) { ++ $arg1 = "%rcx"; ++ $arg2 = "%rdx"; ++ $arg3 = "%r8"; ++ $arg4 = "%r9"; ++ $arg5 = "`$GP_STORAGE + 8 + 8*5`(%rbp)"; # +8 - alignment bytes ++ $arg6 = "`$GP_STORAGE + 8 + 8*6`(%rbp)"; ++ $arg7 = "`$GP_STORAGE + 8 + 8*7`(%rbp)"; ++ $arg8 = "`$GP_STORAGE + 8 + 8*8`(%rbp)"; ++ $arg9 = "`$GP_STORAGE + 8 + 8*9`(%rbp)"; ++ $arg10 = "`$GP_STORAGE + 8 + 8*10`(%rbp)"; ++ $arg11 = "`$GP_STORAGE + 8 + 8*11`(%rbp)"; ++} else { ++ $arg1 = "%rdi"; ++ $arg2 = "%rsi"; ++ $arg3 = "%rdx"; ++ $arg4 = "%rcx"; ++ $arg5 = "%r8"; ++ $arg6 = "%r9"; ++ $arg7 = "`$GP_STORAGE + 8*1`(%rbp)"; ++ $arg8 = "`$GP_STORAGE + 8*2`(%rbp)"; ++ $arg9 = "`$GP_STORAGE + 8*3`(%rbp)"; ++ $arg10 = "`$GP_STORAGE + 8*4`(%rbp)"; ++ $arg11 = "`$GP_STORAGE + 8*5`(%rbp)"; ++} ++ ++# ; Offsets in gcm128_context structure (see include/crypto/modes.h) ++my $CTX_OFFSET_CurCount = (16 * 0); # ; (Yi) Current counter for generation of encryption key ++my $CTX_OFFSET_PEncBlock = (16 * 1); # ; (repurposed EKi field) Partial block buffer ++my $CTX_OFFSET_EK0 = (16 * 2); # ; (EK0) Encrypted Y0 counter (see gcm spec notation) ++my $CTX_OFFSET_AadLen = (16 * 3); # ; (len.u[0]) Length of Hash which has been input ++my $CTX_OFFSET_InLen = ((16 * 3) + 8); # ; (len.u[1]) Length of input data which will be encrypted or decrypted ++my $CTX_OFFSET_AadHash = (16 * 4); # ; (Xi) Current hash ++my $CTX_OFFSET_HTable = (16 * 6); # ; (Htable) Precomputed table (allows 16 values) ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ;;; Helper functions ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ ++# ; Generates "random" local labels ++sub random_string() { ++ my @chars = ('a' .. 'z', 'A' .. 'Z', '0' .. '9', '_'); ++ my $length = 15; ++ my $str; ++ map { $str .= $chars[rand(33)] } 1 .. 
$length; ++ return $str; ++} ++ ++sub BYTE { ++ my ($reg) = @_; ++ if ($reg =~ /%r[abcd]x/i) { ++ $reg =~ s/%r([abcd])x/%${1}l/i; ++ } elsif ($reg =~ /%r[sdb][ip]/i) { ++ $reg =~ s/%r([sdb][ip])/%${1}l/i; ++ } elsif ($reg =~ /%r[0-9]{1,2}/i) { ++ $reg =~ s/%(r[0-9]{1,2})/%${1}b/i; ++ } else { ++ die "BYTE: unknown register: $reg\n"; ++ } ++ return $reg; ++} ++ ++sub WORD { ++ my ($reg) = @_; ++ if ($reg =~ /%r[abcdsdb][xip]/i) { ++ $reg =~ s/%r([abcdsdb])([xip])/%${1}${2}/i; ++ } elsif ($reg =~ /%r[0-9]{1,2}/) { ++ $reg =~ s/%(r[0-9]{1,2})/%${1}w/i; ++ } else { ++ die "WORD: unknown register: $reg\n"; ++ } ++ return $reg; ++} ++ ++sub DWORD { ++ my ($reg) = @_; ++ if ($reg =~ /%r[abcdsdb][xip]/i) { ++ $reg =~ s/%r([abcdsdb])([xip])/%e${1}${2}/i; ++ } elsif ($reg =~ /%r[0-9]{1,2}/i) { ++ $reg =~ s/%(r[0-9]{1,2})/%${1}d/i; ++ } else { ++ die "DWORD: unknown register: $reg\n"; ++ } ++ return $reg; ++} ++ ++sub XWORD { ++ my ($reg) = @_; ++ if ($reg =~ /%[xyz]mm/i) { ++ $reg =~ s/%[xyz]mm/%xmm/i; ++ } else { ++ die "XWORD: unknown register: $reg\n"; ++ } ++ return $reg; ++} ++ ++sub YWORD { ++ my ($reg) = @_; ++ if ($reg =~ /%[xyz]mm/i) { ++ $reg =~ s/%[xyz]mm/%ymm/i; ++ } else { ++ die "YWORD: unknown register: $reg\n"; ++ } ++ return $reg; ++} ++ ++sub ZWORD { ++ my ($reg) = @_; ++ if ($reg =~ /%[xyz]mm/i) { ++ $reg =~ s/%[xyz]mm/%zmm/i; ++ } else { ++ die "ZWORD: unknown register: $reg\n"; ++ } ++ return $reg; ++} ++ ++# ; Helper function to construct effective address based on two kinds of ++# ; offsets: numerical or located in the register ++sub EffectiveAddress { ++ my ($base, $offset, $displacement) = @_; ++ $displacement = 0 if (!$displacement); ++ ++ if ($offset =~ /^\d+\z/) { # numerical offset ++ return "`$offset + $displacement`($base)"; ++ } else { # offset resides in register ++ return "$displacement($base,$offset,1)"; ++ } ++} ++ ++# ; Provides memory location of corresponding HashKey power ++sub HashKeyByIdx { ++ my ($idx, $base) = @_; ++ my $base_str = ($base eq "%rsp") ? "frame" : "context"; ++ ++ my $offset = &HashKeyOffsetByIdx($idx, $base_str); ++ return "$offset($base)"; ++} ++ ++# ; Provides offset (in bytes) of corresponding HashKey power from the highest key in the storage ++sub HashKeyOffsetByIdx { ++ my ($idx, $base) = @_; ++ die "HashKeyOffsetByIdx: base should be either 'frame' or 'context'; base = $base" ++ if (($base ne "frame") && ($base ne "context")); ++ ++ my $offset_base; ++ my $offset_idx; ++ if ($base eq "frame") { # frame storage ++ die "HashKeyOffsetByIdx: idx out of bounds (1..48)! idx = $idx\n" if ($idx > $HKEYS_STORAGE_CAPACITY || $idx < 1); ++ $offset_base = $STACK_HKEYS_OFFSET; ++ $offset_idx = ($AES_BLOCK_SIZE * ($HKEYS_STORAGE_CAPACITY - $idx)); ++ } else { # context storage ++ die "HashKeyOffsetByIdx: idx out of bounds (1..16)! idx = $idx\n" if ($idx > $HKEYS_CONTEXT_CAPACITY || $idx < 1); ++ $offset_base = $CTX_OFFSET_HTable; ++ $offset_idx = ($AES_BLOCK_SIZE * ($HKEYS_CONTEXT_CAPACITY - $idx)); ++ } ++ return $offset_base + $offset_idx; ++} ++ ++# ; Creates local frame and does back up of non-volatile registers. ++# ; Holds stack unwinding directives. ++sub PROLOG { ++ my ($need_hkeys_stack_storage, $need_aes_stack_storage, $func_name) = @_; ++ ++ my $DYNAMIC_STACK_ALLOC_SIZE = 0; ++ my $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE = $win64 ? 
48 : 52; ++ ++ if ($need_hkeys_stack_storage) { ++ $DYNAMIC_STACK_ALLOC_SIZE += $HKEYS_STORAGE; ++ } ++ ++ if ($need_aes_stack_storage) { ++ if (!$need_hkeys_stack_storage) { ++ die "PROLOG: unsupported case - aes storage without hkeys one"; ++ } ++ $DYNAMIC_STACK_ALLOC_SIZE += $LOCAL_STORAGE; ++ } ++ ++ $code .= <<___; ++ push %rbx ++.cfi_push %rbx ++.L${func_name}_seh_push_rbx: ++ push %rbp ++.cfi_push %rbp ++.L${func_name}_seh_push_rbp: ++ push %r12 ++.cfi_push %r12 ++.L${func_name}_seh_push_r12: ++ push %r13 ++.cfi_push %r13 ++.L${func_name}_seh_push_r13: ++ push %r14 ++.cfi_push %r14 ++.L${func_name}_seh_push_r14: ++ push %r15 ++.cfi_push %r15 ++.L${func_name}_seh_push_r15: ++___ ++ ++ if ($win64) { ++ $code .= <<___; ++ push %rdi ++.L${func_name}_seh_push_rdi: ++ push %rsi ++.L${func_name}_seh_push_rsi: ++ ++ sub \$`$XMM_STORAGE+8`,%rsp # +8 alignment ++.L${func_name}_seh_allocstack_xmm: ++___ ++ } ++ $code .= <<___; ++ # ; %rbp contains stack pointer right after GP regs pushed at stack + [8 ++ # ; bytes of alignment (Windows only)]. It serves as a frame pointer in SEH ++ # ; handlers. The requirement for a frame pointer is that its offset from ++ # ; RSP shall be multiple of 16, and not exceed 240 bytes. The frame pointer ++ # ; itself seems to be reasonable to use here, because later we do 64-byte stack ++ # ; alignment which gives us non-determinate offsets and complicates writing ++ # ; SEH handlers. ++ # ++ # ; It also serves as an anchor for retrieving stack arguments on both Linux ++ # ; and Windows. ++ lea `$XMM_STORAGE`(%rsp),%rbp ++.cfi_def_cfa_register %rbp ++.L${func_name}_seh_setfp: ++___ ++ if ($win64) { ++ ++ # ; xmm6:xmm15 need to be preserved on Windows ++ foreach my $reg_idx (6 .. 15) { ++ my $xmm_reg_offset = ($reg_idx - 6) * 16; ++ $code .= <<___; ++ vmovdqu %xmm${reg_idx},$xmm_reg_offset(%rsp) ++.L${func_name}_seh_save_xmm${reg_idx}: ++___ ++ } ++ } ++ ++ $code .= <<___; ++# Prolog ends here. Next stack allocation is treated as "dynamic". ++.L${func_name}_seh_prolog_end: ++___ ++ ++ if ($DYNAMIC_STACK_ALLOC_SIZE) { ++ $code .= <<___; ++ sub \$`$DYNAMIC_STACK_ALLOC_SIZE + $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE`,%rsp ++ and \$(-64),%rsp ++___ ++ } ++} ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ;;; Restore register content for the caller. ++# ;;; And cleanup stack. ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++sub EPILOG { ++ my ($hkeys_storage_on_stack, $payload_len) = @_; ++ ++ my $rndsuffix = &random_string(); ++ ++ if ($hkeys_storage_on_stack && $CLEAR_HKEYS_STORAGE_ON_EXIT) { ++ ++ # ; There is no need in hkeys cleanup if payload len was small, i.e. 
no hkeys ++ # ; were stored in the local frame storage ++ $code .= <<___; ++ cmpq \$`16*16`,$payload_len ++ jbe .Lskip_hkeys_cleanup_${rndsuffix} ++ vpxor %xmm0,%xmm0,%xmm0 ++___ ++ for (my $i = 0; $i < int($HKEYS_STORAGE / 64); $i++) { ++ $code .= "vmovdqa64 %zmm0,`$STACK_HKEYS_OFFSET + 64*$i`(%rsp)\n"; ++ } ++ $code .= ".Lskip_hkeys_cleanup_${rndsuffix}:\n"; ++ } ++ ++ if ($CLEAR_SCRATCH_REGISTERS) { ++ &clear_scratch_gps_asm(); ++ &clear_scratch_zmms_asm(); ++ } else { ++ $code .= "vzeroupper\n"; ++ } ++ ++ if ($win64) { ++ ++ # ; restore xmm15:xmm6 ++ for (my $reg_idx = 15; $reg_idx >= 6; $reg_idx--) { ++ my $xmm_reg_offset = -$XMM_STORAGE + ($reg_idx - 6) * 16; ++ $code .= <<___; ++ vmovdqu $xmm_reg_offset(%rbp),%xmm${reg_idx}, ++___ ++ } ++ } ++ ++ if ($win64) { ++ ++ # Forming valid epilog for SEH with use of frame pointer. ++ # https://docs.microsoft.com/en-us/cpp/build/prolog-and-epilog?view=msvc-160#epilog-code ++ $code .= "lea 8(%rbp),%rsp\n"; ++ } else { ++ $code .= "lea (%rbp),%rsp\n"; ++ $code .= ".cfi_def_cfa_register %rsp\n"; ++ } ++ ++ if ($win64) { ++ $code .= <<___; ++ pop %rsi ++.cfi_pop %rsi ++ pop %rdi ++.cfi_pop %rdi ++___ ++ } ++ $code .= <<___; ++ pop %r15 ++.cfi_pop %r15 ++ pop %r14 ++.cfi_pop %r14 ++ pop %r13 ++.cfi_pop %r13 ++ pop %r12 ++.cfi_pop %r12 ++ pop %rbp ++.cfi_pop %rbp ++ pop %rbx ++.cfi_pop %rbx ++___ ++} ++ ++# ; Clears all scratch ZMM registers ++# ; ++# ; It should be called before restoring the XMM registers ++# ; for Windows (XMM6-XMM15). ++# ; ++sub clear_scratch_zmms_asm { ++ ++ # ; On Linux, all ZMM registers are scratch registers ++ if (!$win64) { ++ $code .= "vzeroall\n"; ++ } else { ++ foreach my $i (0 .. 5) { ++ $code .= "vpxorq %xmm${i},%xmm${i},%xmm${i}\n"; ++ } ++ } ++ foreach my $i (16 .. 31) { ++ $code .= "vpxorq %xmm${i},%xmm${i},%xmm${i}\n"; ++ } ++} ++ ++# Clears all scratch GP registers ++sub clear_scratch_gps_asm { ++ foreach my $reg ("%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11") { ++ $code .= "xor $reg,$reg\n"; ++ } ++ if (!$win64) { ++ foreach my $reg ("%rsi", "%rdi") { ++ $code .= "xor $reg,$reg\n"; ++ } ++ } ++} ++ ++sub precompute_hkeys_on_stack { ++ my $GCM128_CTX = $_[0]; ++ my $HKEYS_READY = $_[1]; ++ my $ZTMP0 = $_[2]; ++ my $ZTMP1 = $_[3]; ++ my $ZTMP2 = $_[4]; ++ my $ZTMP3 = $_[5]; ++ my $ZTMP4 = $_[6]; ++ my $ZTMP5 = $_[7]; ++ my $ZTMP6 = $_[8]; ++ my $HKEYS_RANGE = $_[9]; # ; "first16", "mid16", "all", "first32", "last32" ++ ++ die "precompute_hkeys_on_stack: Unexpected value of HKEYS_RANGE: $HKEYS_RANGE" ++ if ($HKEYS_RANGE ne "first16" ++ && $HKEYS_RANGE ne "mid16" ++ && $HKEYS_RANGE ne "all" ++ && $HKEYS_RANGE ne "first32" ++ && $HKEYS_RANGE ne "last32"); ++ ++ my $rndsuffix = &random_string(); ++ ++ $code .= <<___; ++ test $HKEYS_READY,$HKEYS_READY ++ jnz .L_skip_hkeys_precomputation_${rndsuffix} ++___ ++ ++ if ($HKEYS_RANGE eq "first16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "all") { ++ ++ # ; Fill the stack with the first 16 hkeys from the context ++ $code .= <<___; ++ # ; Move 16 hkeys from the context to stack ++ vmovdqu64 @{[HashKeyByIdx(4,$GCM128_CTX)]},$ZTMP0 ++ vmovdqu64 $ZTMP0,@{[HashKeyByIdx(4,"%rsp")]} ++ ++ vmovdqu64 @{[HashKeyByIdx(8,$GCM128_CTX)]},$ZTMP1 ++ vmovdqu64 $ZTMP1,@{[HashKeyByIdx(8,"%rsp")]} ++ ++ # ; broadcast HashKey^8 ++ vshufi64x2 \$0x00,$ZTMP1,$ZTMP1,$ZTMP1 ++ ++ vmovdqu64 @{[HashKeyByIdx(12,$GCM128_CTX)]},$ZTMP2 ++ vmovdqu64 $ZTMP2,@{[HashKeyByIdx(12,"%rsp")]} ++ ++ vmovdqu64 @{[HashKeyByIdx(16,$GCM128_CTX)]},$ZTMP3 ++ vmovdqu64 $ZTMP3,@{[HashKeyByIdx(16,"%rsp")]} 
++___ ++ } ++ ++ if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "last32") { ++ $code .= <<___; ++ vmovdqu64 @{[HashKeyByIdx(8,"%rsp")]},$ZTMP1 ++ ++ # ; broadcast HashKey^8 ++ vshufi64x2 \$0x00,$ZTMP1,$ZTMP1,$ZTMP1 ++ ++ vmovdqu64 @{[HashKeyByIdx(12,"%rsp")]},$ZTMP2 ++ vmovdqu64 @{[HashKeyByIdx(16,"%rsp")]},$ZTMP3 ++___ ++ ++ } ++ ++ if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") { ++ ++ # ; Precompute hkeys^i, i=17..32 ++ my $i = 20; ++ foreach (1 .. int((32 - 16) / 8)) { ++ ++ # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n) ++ &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6); ++ $code .= "vmovdqu64 $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n"; ++ $i += 4; ++ ++ # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n) ++ &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6); ++ $code .= "vmovdqu64 $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n"; ++ $i += 4; ++ } ++ } ++ ++ if ($HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") { ++ ++ # ; Precompute hkeys^i, i=33..48 (HKEYS_STORAGE_CAPACITY = 48) ++ my $i = 36; ++ foreach (1 .. int((48 - 32) / 8)) { ++ ++ # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n) ++ &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6); ++ $code .= "vmovdqu64 $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n"; ++ $i += 4; ++ ++ # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n) ++ &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6); ++ $code .= "vmovdqu64 $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n"; ++ $i += 4; ++ } ++ } ++ ++ $code .= ".L_skip_hkeys_precomputation_${rndsuffix}:\n"; ++} ++ ++# ;; ============================================================================= ++# ;; Generic macro to produce code that executes $OPCODE instruction ++# ;; on selected number of AES blocks (16 bytes long ) between 0 and 16. ++# ;; All three operands of the instruction come from registers. ++# ;; Note: if 3 blocks are left at the end instruction is produced to operate all ++# ;; 4 blocks (full width of ZMM) ++sub ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 { ++ my $NUM_BLOCKS = $_[0]; # [in] numerical value, number of AES blocks (0 to 16) ++ my $OPCODE = $_[1]; # [in] instruction name ++ my @DST; ++ $DST[0] = $_[2]; # [out] destination ZMM register ++ $DST[1] = $_[3]; # [out] destination ZMM register ++ $DST[2] = $_[4]; # [out] destination ZMM register ++ $DST[3] = $_[5]; # [out] destination ZMM register ++ my @SRC1; ++ $SRC1[0] = $_[6]; # [in] source 1 ZMM register ++ $SRC1[1] = $_[7]; # [in] source 1 ZMM register ++ $SRC1[2] = $_[8]; # [in] source 1 ZMM register ++ $SRC1[3] = $_[9]; # [in] source 1 ZMM register ++ my @SRC2; ++ $SRC2[0] = $_[10]; # [in] source 2 ZMM register ++ $SRC2[1] = $_[11]; # [in] source 2 ZMM register ++ $SRC2[2] = $_[12]; # [in] source 2 ZMM register ++ $SRC2[3] = $_[13]; # [in] source 2 ZMM register ++ ++ die "ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n" ++ if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0); ++ ++ my $reg_idx = 0; ++ my $blocks_left = $NUM_BLOCKS; ++ ++ foreach (1 .. 
($NUM_BLOCKS / 4)) { ++ $code .= "$OPCODE $SRC2[$reg_idx],$SRC1[$reg_idx],$DST[$reg_idx]\n"; ++ $reg_idx++; ++ $blocks_left -= 4; ++ } ++ ++ my $DSTREG = $DST[$reg_idx]; ++ my $SRC1REG = $SRC1[$reg_idx]; ++ my $SRC2REG = $SRC2[$reg_idx]; ++ ++ if ($blocks_left == 1) { ++ $code .= "$OPCODE @{[XWORD($SRC2REG)]},@{[XWORD($SRC1REG)]},@{[XWORD($DSTREG)]}\n"; ++ } elsif ($blocks_left == 2) { ++ $code .= "$OPCODE @{[YWORD($SRC2REG)]},@{[YWORD($SRC1REG)]},@{[YWORD($DSTREG)]}\n"; ++ } elsif ($blocks_left == 3) { ++ $code .= "$OPCODE $SRC2REG,$SRC1REG,$DSTREG\n"; ++ } ++} ++ ++# ;; ============================================================================= ++# ;; Loads specified number of AES blocks into ZMM registers using mask register ++# ;; for the last loaded register (xmm, ymm or zmm). ++# ;; Loads take place at 1 byte granularity. ++sub ZMM_LOAD_MASKED_BLOCKS_0_16 { ++ my $NUM_BLOCKS = $_[0]; # [in] numerical value, number of AES blocks (0 to 16) ++ my $INP = $_[1]; # [in] input data pointer to read from ++ my $DATA_OFFSET = $_[2]; # [in] offset to the output pointer (GP or numerical) ++ my @DST; ++ $DST[0] = $_[3]; # [out] ZMM register with loaded data ++ $DST[1] = $_[4]; # [out] ZMM register with loaded data ++ $DST[2] = $_[5]; # [out] ZMM register with loaded data ++ $DST[3] = $_[6]; # [out] ZMM register with loaded data ++ my $MASK = $_[7]; # [in] mask register ++ ++ die "ZMM_LOAD_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n" ++ if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0); ++ ++ my $src_offset = 0; ++ my $dst_idx = 0; ++ my $blocks_left = $NUM_BLOCKS; ++ ++ if ($NUM_BLOCKS > 0) { ++ foreach (1 .. (int(($NUM_BLOCKS + 3) / 4) - 1)) { ++ $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DST[$dst_idx]\n"; ++ $src_offset += 64; ++ $dst_idx++; ++ $blocks_left -= 4; ++ } ++ } ++ ++ my $DSTREG = $DST[$dst_idx]; ++ ++ if ($blocks_left == 1) { ++ $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[XWORD($DSTREG)]}\{$MASK\}{z}\n"; ++ } elsif ($blocks_left == 2) { ++ $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[YWORD($DSTREG)]}\{$MASK\}{z}\n"; ++ } elsif (($blocks_left == 3 || $blocks_left == 4)) { ++ $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DSTREG\{$MASK\}{z}\n"; ++ } ++} ++ ++# ;; ============================================================================= ++# ;; Stores specified number of AES blocks from ZMM registers with mask register ++# ;; for the last loaded register (xmm, ymm or zmm). ++# ;; Stores take place at 1 byte granularity. ++sub ZMM_STORE_MASKED_BLOCKS_0_16 { ++ my $NUM_BLOCKS = $_[0]; # [in] numerical value, number of AES blocks (0 to 16) ++ my $OUTP = $_[1]; # [in] output data pointer to write to ++ my $DATA_OFFSET = $_[2]; # [in] offset to the output pointer (GP or numerical) ++ my @SRC; ++ $SRC[0] = $_[3]; # [in] ZMM register with data to store ++ $SRC[1] = $_[4]; # [in] ZMM register with data to store ++ $SRC[2] = $_[5]; # [in] ZMM register with data to store ++ $SRC[3] = $_[6]; # [in] ZMM register with data to store ++ my $MASK = $_[7]; # [in] mask register ++ ++ die "ZMM_STORE_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n" ++ if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0); ++ ++ my $dst_offset = 0; ++ my $src_idx = 0; ++ my $blocks_left = $NUM_BLOCKS; ++ ++ if ($NUM_BLOCKS > 0) { ++ foreach (1 .. 
(int(($NUM_BLOCKS + 3) / 4) - 1)) { ++ $code .= "vmovdqu8 $SRC[$src_idx],`$dst_offset`($OUTP,$DATA_OFFSET,1)\n"; ++ $dst_offset += 64; ++ $src_idx++; ++ $blocks_left -= 4; ++ } ++ } ++ ++ my $SRCREG = $SRC[$src_idx]; ++ ++ if ($blocks_left == 1) { ++ $code .= "vmovdqu8 @{[XWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n"; ++ } elsif ($blocks_left == 2) { ++ $code .= "vmovdqu8 @{[YWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n"; ++ } elsif ($blocks_left == 3 || $blocks_left == 4) { ++ $code .= "vmovdqu8 $SRCREG,`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n"; ++ } ++} ++ ++# ;;; =========================================================================== ++# ;;; Handles AES encryption rounds ++# ;;; It handles special cases: the last and first rounds ++# ;;; Optionally, it performs XOR with data after the last AES round. ++# ;;; Uses NROUNDS parameter to check what needs to be done for the current round. ++# ;;; If 3 blocks are trailing then operation on whole ZMM is performed (4 blocks). ++sub ZMM_AESENC_ROUND_BLOCKS_0_16 { ++ my $L0B0_3 = $_[0]; # [in/out] zmm; blocks 0 to 3 ++ my $L0B4_7 = $_[1]; # [in/out] zmm; blocks 4 to 7 ++ my $L0B8_11 = $_[2]; # [in/out] zmm; blocks 8 to 11 ++ my $L0B12_15 = $_[3]; # [in/out] zmm; blocks 12 to 15 ++ my $KEY = $_[4]; # [in] zmm containing round key ++ my $ROUND = $_[5]; # [in] round number ++ my $D0_3 = $_[6]; # [in] zmm or no_data; plain/cipher text blocks 0-3 ++ my $D4_7 = $_[7]; # [in] zmm or no_data; plain/cipher text blocks 4-7 ++ my $D8_11 = $_[8]; # [in] zmm or no_data; plain/cipher text blocks 8-11 ++ my $D12_15 = $_[9]; # [in] zmm or no_data; plain/cipher text blocks 12-15 ++ my $NUMBL = $_[10]; # [in] number of blocks; numerical value ++ my $NROUNDS = $_[11]; # [in] number of rounds; numerical value ++ ++ # ;;; === first AES round ++ if ($ROUND < 1) { ++ ++ # ;; round 0 ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUMBL, "vpxorq", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3, ++ $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY); ++ } ++ ++ # ;;; === middle AES rounds ++ if ($ROUND >= 1 && $ROUND <= $NROUNDS) { ++ ++ # ;; rounds 1 to 9/11/13 ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUMBL, "vaesenc", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3, ++ $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY); ++ } ++ ++ # ;;; === last AES round ++ if ($ROUND > $NROUNDS) { ++ ++ # ;; the last round - mix enclast with text xor's ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUMBL, "vaesenclast", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3, ++ $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY); ++ ++ # ;;; === XOR with data ++ if ( ($D0_3 ne "no_data") ++ && ($D4_7 ne "no_data") ++ && ($D8_11 ne "no_data") ++ && ($D12_15 ne "no_data")) ++ { ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUMBL, "vpxorq", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3, ++ $L0B4_7, $L0B8_11, $L0B12_15, $D0_3, $D4_7, $D8_11, $D12_15); ++ } ++ } ++} ++ ++# ;;; Horizontal XOR - 4 x 128bits xored together ++sub VHPXORI4x128 { ++ my $REG = $_[0]; # [in/out] ZMM with 4x128bits to xor; 128bit output ++ my $TMP = $_[1]; # [clobbered] ZMM temporary register ++ $code .= <<___; ++ vextracti64x4 \$1,$REG,@{[YWORD($TMP)]} ++ vpxorq @{[YWORD($TMP)]},@{[YWORD($REG)]},@{[YWORD($REG)]} ++ vextracti32x4 \$1,@{[YWORD($REG)]},@{[XWORD($TMP)]} ++ vpxorq @{[XWORD($TMP)]},@{[XWORD($REG)]},@{[XWORD($REG)]} ++___ ++} ++ ++# ;;; AVX512 reduction macro ++sub VCLMUL_REDUCE { ++ my $OUT = $_[0]; # [out] zmm/ymm/xmm: result (must not be $TMP1 
or $HI128) ++ my $POLY = $_[1]; # [in] zmm/ymm/xmm: polynomial ++ my $HI128 = $_[2]; # [in] zmm/ymm/xmm: high 128b of hash to reduce ++ my $LO128 = $_[3]; # [in] zmm/ymm/xmm: low 128b of hash to reduce ++ my $TMP0 = $_[4]; # [in] zmm/ymm/xmm: temporary register ++ my $TMP1 = $_[5]; # [in] zmm/ymm/xmm: temporary register ++ ++ $code .= <<___; ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; first phase of the reduction ++ vpclmulqdq \$0x01,$LO128,$POLY,$TMP0 ++ vpslldq \$8,$TMP0,$TMP0 # ; shift-L 2 DWs ++ vpxorq $TMP0,$LO128,$TMP0 # ; first phase of the reduction complete ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; second phase of the reduction ++ vpclmulqdq \$0x00,$TMP0,$POLY,$TMP1 ++ vpsrldq \$4,$TMP1,$TMP1 # ; shift-R only 1-DW to obtain 2-DWs shift-R ++ vpclmulqdq \$0x10,$TMP0,$POLY,$OUT ++ vpslldq \$4,$OUT,$OUT # ; shift-L 1-DW to obtain result with no shifts ++ vpternlogq \$0x96,$HI128,$TMP1,$OUT # ; OUT/GHASH = OUT xor TMP1 xor HI128 ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++___ ++} ++ ++# ;; =========================================================================== ++# ;; schoolbook multiply of 16 blocks (16 x 16 bytes) ++# ;; - it is assumed that data read from $INPTR is already shuffled and ++# ;; $INPTR address is 64 byte aligned ++# ;; - there is an option to pass ready blocks through ZMM registers too. ++# ;; 4 extra parameters need to be passed in such case and 21st ($ZTMP9) argument can be empty ++sub GHASH_16 { ++ my $TYPE = $_[0]; # [in] ghash type: start (xor hash), mid, end (same as mid; no reduction), ++ # end_reduce (end with reduction), start_reduce ++ my $GH = $_[1]; # [in/out] ZMM ghash sum: high 128-bits ++ my $GM = $_[2]; # [in/out] ZMM ghash sum: middle 128-bits ++ my $GL = $_[3]; # [in/out] ZMM ghash sum: low 128-bits ++ my $INPTR = $_[4]; # [in] data input pointer ++ my $INOFF = $_[5]; # [in] data input offset ++ my $INDIS = $_[6]; # [in] data input displacement ++ my $HKPTR = $_[7]; # [in] hash key pointer ++ my $HKOFF = $_[8]; # [in] hash key offset (can be either numerical offset, or register containing offset) ++ my $HKDIS = $_[9]; # [in] hash key displacement ++ my $HASH = $_[10]; # [in/out] ZMM hash value in/out ++ my $ZTMP0 = $_[11]; # [clobbered] temporary ZMM ++ my $ZTMP1 = $_[12]; # [clobbered] temporary ZMM ++ my $ZTMP2 = $_[13]; # [clobbered] temporary ZMM ++ my $ZTMP3 = $_[14]; # [clobbered] temporary ZMM ++ my $ZTMP4 = $_[15]; # [clobbered] temporary ZMM ++ my $ZTMP5 = $_[16]; # [clobbered] temporary ZMM ++ my $ZTMP6 = $_[17]; # [clobbered] temporary ZMM ++ my $ZTMP7 = $_[18]; # [clobbered] temporary ZMM ++ my $ZTMP8 = $_[19]; # [clobbered] temporary ZMM ++ my $ZTMP9 = $_[20]; # [clobbered] temporary ZMM, can be empty if 4 extra parameters below are provided ++ my $DAT0 = $_[21]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused) ++ my $DAT1 = $_[22]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused) ++ my $DAT2 = $_[23]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused) ++ my $DAT3 = $_[24]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused) ++ ++ my $start_ghash = 0; ++ my $do_reduction = 0; ++ if ($TYPE eq "start") { ++ $start_ghash = 1; ++ } ++ ++ if ($TYPE eq "start_reduce") { ++ $start_ghash = 1; ++ $do_reduction = 1; ++ } ++ ++ if ($TYPE eq "end_reduce") { ++ $do_reduction = 1; ++ } ++ ++ # ;; ghash blocks 0-3 ++ if (scalar(@_) == 21) { ++ $code .= 
"vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+0*64))]},$ZTMP9\n"; ++ } else { ++ $ZTMP9 = $DAT0; ++ } ++ ++ if ($start_ghash != 0) { ++ $code .= "vpxorq $HASH,$ZTMP9,$ZTMP9\n"; ++ } ++ $code .= <<___; ++ vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+0*64))]},$ZTMP8 ++ vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP0 # ; T0H = a1*b1 ++ vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP1 # ; T0L = a0*b0 ++ vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP2 # ; T0M1 = a1*b0 ++ vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP3 # ; T0M2 = a0*b1 ++___ ++ ++ # ;; ghash blocks 4-7 ++ if (scalar(@_) == 21) { ++ $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+1*64))]},$ZTMP9\n"; ++ } else { ++ $ZTMP9 = $DAT1; ++ } ++ $code .= <<___; ++ vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+1*64))]},$ZTMP8 ++ vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP4 # ; T1H = a1*b1 ++ vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP5 # ; T1L = a0*b0 ++ vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP6 # ; T1M1 = a1*b0 ++ vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP7 # ; T1M2 = a0*b1 ++___ ++ ++ # ;; update sums ++ if ($start_ghash != 0) { ++ $code .= <<___; ++ vpxorq $ZTMP6,$ZTMP2,$GM # ; GM = T0M1 + T1M1 ++ vpxorq $ZTMP4,$ZTMP0,$GH # ; GH = T0H + T1H ++ vpxorq $ZTMP5,$ZTMP1,$GL # ; GL = T0L + T1L ++ vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM # ; GM = T0M2 + T1M1 ++___ ++ } else { # ;; mid, end, end_reduce ++ $code .= <<___; ++ vpternlogq \$0x96,$ZTMP6,$ZTMP2,$GM # ; GM += T0M1 + T1M1 ++ vpternlogq \$0x96,$ZTMP4,$ZTMP0,$GH # ; GH += T0H + T1H ++ vpternlogq \$0x96,$ZTMP5,$ZTMP1,$GL # ; GL += T0L + T1L ++ vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM # ; GM += T0M2 + T1M1 ++___ ++ } ++ ++ # ;; ghash blocks 8-11 ++ if (scalar(@_) == 21) { ++ $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+2*64))]},$ZTMP9\n"; ++ } else { ++ $ZTMP9 = $DAT2; ++ } ++ $code .= <<___; ++ vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+2*64))]},$ZTMP8 ++ vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP0 # ; T0H = a1*b1 ++ vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP1 # ; T0L = a0*b0 ++ vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP2 # ; T0M1 = a1*b0 ++ vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP3 # ; T0M2 = a0*b1 ++___ ++ ++ # ;; ghash blocks 12-15 ++ if (scalar(@_) == 21) { ++ $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+3*64))]},$ZTMP9\n"; ++ } else { ++ $ZTMP9 = $DAT3; ++ } ++ $code .= <<___; ++ vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+3*64))]},$ZTMP8 ++ vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP4 # ; T1H = a1*b1 ++ vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP5 # ; T1L = a0*b0 ++ vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP6 # ; T1M1 = a1*b0 ++ vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP7 # ; T1M2 = a0*b1 ++ # ;; update sums ++ vpternlogq \$0x96,$ZTMP6,$ZTMP2,$GM # ; GM += T0M1 + T1M1 ++ vpternlogq \$0x96,$ZTMP4,$ZTMP0,$GH # ; GH += T0H + T1H ++ vpternlogq \$0x96,$ZTMP5,$ZTMP1,$GL # ; GL += T0L + T1L ++ vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM # ; GM += T0M2 + T1M1 ++___ ++ if ($do_reduction != 0) { ++ $code .= <<___; ++ # ;; integrate GM into GH and GL ++ vpsrldq \$8,$GM,$ZTMP0 ++ vpslldq \$8,$GM,$ZTMP1 ++ vpxorq $ZTMP0,$GH,$GH ++ vpxorq $ZTMP1,$GL,$GL ++___ ++ ++ # ;; add GH and GL 128-bit words horizontally ++ &VHPXORI4x128($GH, $ZTMP0); ++ &VHPXORI4x128($GL, $ZTMP1); ++ ++ # ;; reduction ++ $code .= "vmovdqa64 POLY2(%rip),@{[XWORD($ZTMP2)]}\n"; ++ &VCLMUL_REDUCE(&XWORD($HASH), &XWORD($ZTMP2), &XWORD($GH), &XWORD($GL), &XWORD($ZTMP0), &XWORD($ZTMP1)); ++ } ++} ++ ++# ;; =========================================================================== ++# ;; GHASH 1 to 16 blocks of cipher text ++# ;; - performs reduction at the 
end ++# ;; - it doesn't load the data and it assumed it is already loaded and shuffled ++sub GHASH_1_TO_16 { ++ my $GCM128_CTX = $_[0]; # [in] pointer to expanded keys ++ my $GHASH = $_[1]; # [out] ghash output ++ my $T0H = $_[2]; # [clobbered] temporary ZMM ++ my $T0L = $_[3]; # [clobbered] temporary ZMM ++ my $T0M1 = $_[4]; # [clobbered] temporary ZMM ++ my $T0M2 = $_[5]; # [clobbered] temporary ZMM ++ my $T1H = $_[6]; # [clobbered] temporary ZMM ++ my $T1L = $_[7]; # [clobbered] temporary ZMM ++ my $T1M1 = $_[8]; # [clobbered] temporary ZMM ++ my $T1M2 = $_[9]; # [clobbered] temporary ZMM ++ my $HK = $_[10]; # [clobbered] temporary ZMM ++ my $AAD_HASH_IN = $_[11]; # [in] input hash value ++ my @CIPHER_IN; ++ $CIPHER_IN[0] = $_[12]; # [in] ZMM with cipher text blocks 0-3 ++ $CIPHER_IN[1] = $_[13]; # [in] ZMM with cipher text blocks 4-7 ++ $CIPHER_IN[2] = $_[14]; # [in] ZMM with cipher text blocks 8-11 ++ $CIPHER_IN[3] = $_[15]; # [in] ZMM with cipher text blocks 12-15 ++ my $NUM_BLOCKS = $_[16]; # [in] numerical value, number of blocks ++ my $GH = $_[17]; # [in] ZMM with hi product part ++ my $GM = $_[18]; # [in] ZMM with mid product part ++ my $GL = $_[19]; # [in] ZMM with lo product part ++ ++ die "GHASH_1_TO_16: num_blocks is out of bounds = $NUM_BLOCKS\n" if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0); ++ ++ if (scalar(@_) == 17) { ++ $code .= "vpxorq $AAD_HASH_IN,$CIPHER_IN[0],$CIPHER_IN[0]\n"; ++ } ++ ++ if ($NUM_BLOCKS == 16) { ++ $code .= <<___; ++ vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK ++ vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T0H # ; H = a1*b1 ++ vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T0L # ; L = a0*b0 ++ vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T0M1 # ; M1 = a1*b0 ++ vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T0M2 # ; M2 = a0*b1 ++ vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK ++ vpclmulqdq \$0x11,$HK,$CIPHER_IN[1],$T1H # ; H = a1*b1 ++ vpclmulqdq \$0x00,$HK,$CIPHER_IN[1],$T1L # ; L = a0*b0 ++ vpclmulqdq \$0x01,$HK,$CIPHER_IN[1],$T1M1 # ; M1 = a1*b0 ++ vpclmulqdq \$0x10,$HK,$CIPHER_IN[1],$T1M2 # ; M2 = a0*b1 ++ vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-2*4, $GCM128_CTX)]},$HK ++ vpclmulqdq \$0x11,$HK,$CIPHER_IN[2],$CIPHER_IN[0] # ; H = a1*b1 ++ vpclmulqdq \$0x00,$HK,$CIPHER_IN[2],$CIPHER_IN[1] # ; L = a0*b0 ++ vpternlogq \$0x96,$T1H,$CIPHER_IN[0],$T0H ++ vpternlogq \$0x96,$T1L,$CIPHER_IN[1],$T0L ++ vpclmulqdq \$0x01,$HK,$CIPHER_IN[2],$CIPHER_IN[0] # ; M1 = a1*b0 ++ vpclmulqdq \$0x10,$HK,$CIPHER_IN[2],$CIPHER_IN[1] # ; M2 = a0*b1 ++ vpternlogq \$0x96,$T1M1,$CIPHER_IN[0],$T0M1 ++ vpternlogq \$0x96,$T1M2,$CIPHER_IN[1],$T0M2 ++ vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-3*4, $GCM128_CTX)]},$HK ++ vpclmulqdq \$0x11,$HK,$CIPHER_IN[3],$T1H # ; H = a1*b1 ++ vpclmulqdq \$0x00,$HK,$CIPHER_IN[3],$T1L # ; L = a0*b0 ++ vpclmulqdq \$0x01,$HK,$CIPHER_IN[3],$T1M1 # ; M1 = a1*b0 ++ vpclmulqdq \$0x10,$HK,$CIPHER_IN[3],$T1M2 # ; M2 = a0*b1 ++ vpxorq $T1H,$T0H,$T1H ++ vpxorq $T1L,$T0L,$T1L ++ vpxorq $T1M1,$T0M1,$T1M1 ++ vpxorq $T1M2,$T0M2,$T1M2 ++___ ++ } elsif ($NUM_BLOCKS >= 12) { ++ $code .= <<___; ++ vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK ++ vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T0H # ; H = a1*b1 ++ vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T0L # ; L = a0*b0 ++ vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T0M1 # ; M1 = a1*b0 ++ vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T0M2 # ; M2 = a0*b1 ++ vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK ++ vpclmulqdq \$0x11,$HK,$CIPHER_IN[1],$T1H # ; H = a1*b1 ++ vpclmulqdq \$0x00,$HK,$CIPHER_IN[1],$T1L # ; L = a0*b0 ++ vpclmulqdq 
\$0x01,$HK,$CIPHER_IN[1],$T1M1 # ; M1 = a1*b0 ++ vpclmulqdq \$0x10,$HK,$CIPHER_IN[1],$T1M2 # ; M2 = a0*b1 ++ vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-2*4, $GCM128_CTX)]},$HK ++ vpclmulqdq \$0x11,$HK,$CIPHER_IN[2],$CIPHER_IN[0] # ; H = a1*b1 ++ vpclmulqdq \$0x00,$HK,$CIPHER_IN[2],$CIPHER_IN[1] # ; L = a0*b0 ++ vpternlogq \$0x96,$T0H,$CIPHER_IN[0],$T1H ++ vpternlogq \$0x96,$T0L,$CIPHER_IN[1],$T1L ++ vpclmulqdq \$0x01,$HK,$CIPHER_IN[2],$CIPHER_IN[0] # ; M1 = a1*b0 ++ vpclmulqdq \$0x10,$HK,$CIPHER_IN[2],$CIPHER_IN[1] # ; M2 = a0*b1 ++ vpternlogq \$0x96,$T0M1,$CIPHER_IN[0],$T1M1 ++ vpternlogq \$0x96,$T0M2,$CIPHER_IN[1],$T1M2 ++___ ++ } elsif ($NUM_BLOCKS >= 8) { ++ $code .= <<___; ++ vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK ++ vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T0H # ; H = a1*b1 ++ vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T0L # ; L = a0*b0 ++ vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T0M1 # ; M1 = a1*b0 ++ vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T0M2 # ; M2 = a0*b1 ++ vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK ++ vpclmulqdq \$0x11,$HK,$CIPHER_IN[1],$T1H # ; H = a1*b1 ++ vpclmulqdq \$0x00,$HK,$CIPHER_IN[1],$T1L # ; L = a0*b0 ++ vpclmulqdq \$0x01,$HK,$CIPHER_IN[1],$T1M1 # ; M1 = a1*b0 ++ vpclmulqdq \$0x10,$HK,$CIPHER_IN[1],$T1M2 # ; M2 = a0*b1 ++ vpxorq $T1H,$T0H,$T1H ++ vpxorq $T1L,$T0L,$T1L ++ vpxorq $T1M1,$T0M1,$T1M1 ++ vpxorq $T1M2,$T0M2,$T1M2 ++___ ++ } elsif ($NUM_BLOCKS >= 4) { ++ $code .= <<___; ++ vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK ++ vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T1H # ; H = a1*b1 ++ vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T1L # ; L = a0*b0 ++ vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T1M1 # ; M1 = a1*b0 ++ vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T1M2 # ; M2 = a0*b1 ++___ ++ } ++ ++ # ;; T1H/L/M1/M2 - hold current product sums (provided $NUM_BLOCKS >= 4) ++ my $blocks_left = ($NUM_BLOCKS % 4); ++ if ($blocks_left > 0) { ++ ++ # ;; ===================================================== ++ # ;; There are 1, 2 or 3 blocks left to process. ++ # ;; It may also be that they are the only blocks to process. 
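++ # ;; Example (from the index arithmetic below): for $NUM_BLOCKS = 7 the tail is
++ # ;; 3 blocks ($blocks_left = 3) taken from $CIPHER_IN[1] (Perl truncates the
++ # ;; fractional array index 7/4 to 1); for $NUM_BLOCKS = 3 there is no preceding
++ # ;; full 4-block group, so only the T0H/L/M1/M2 sums are produced.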
++ ++ # ;; Set hash key and register index position for the remaining 1 to 3 blocks ++ my $reg_idx = ($NUM_BLOCKS / 4); ++ my $REG_IN = $CIPHER_IN[$reg_idx]; ++ ++ if ($blocks_left == 1) { ++ $code .= <<___; ++ vmovdqu64 @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[XWORD($HK)]} ++ vpclmulqdq \$0x01,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0M1)]} # ; M1 = a1*b0 ++ vpclmulqdq \$0x10,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0M2)]} # ; M2 = a0*b1 ++ vpclmulqdq \$0x11,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0H)]} # ; H = a1*b1 ++ vpclmulqdq \$0x00,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0L)]} # ; L = a0*b0 ++___ ++ } elsif ($blocks_left == 2) { ++ $code .= <<___; ++ vmovdqu64 @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[YWORD($HK)]} ++ vpclmulqdq \$0x01,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0M1)]} # ; M1 = a1*b0 ++ vpclmulqdq \$0x10,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0M2)]} # ; M2 = a0*b1 ++ vpclmulqdq \$0x11,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0H)]} # ; H = a1*b1 ++ vpclmulqdq \$0x00,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0L)]} # ; L = a0*b0 ++___ ++ } else { # ; blocks_left == 3 ++ $code .= <<___; ++ vmovdqu64 @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[YWORD($HK)]} ++ vinserti64x2 \$2,@{[HashKeyByIdx($blocks_left-2, $GCM128_CTX)]},$HK,$HK ++ vpclmulqdq \$0x01,$HK,$REG_IN,$T0M1 # ; M1 = a1*b0 ++ vpclmulqdq \$0x10,$HK,$REG_IN,$T0M2 # ; M2 = a0*b1 ++ vpclmulqdq \$0x11,$HK,$REG_IN,$T0H # ; H = a1*b1 ++ vpclmulqdq \$0x00,$HK,$REG_IN,$T0L # ; L = a0*b0 ++___ ++ } ++ ++ if (scalar(@_) == 20) { ++ ++ # ;; *** GH/GM/GL passed as arguments ++ if ($NUM_BLOCKS >= 4) { ++ $code .= <<___; ++ # ;; add ghash product sums from the first 4, 8 or 12 blocks ++ vpxorq $T1M1,$T0M1,$T0M1 ++ vpternlogq \$0x96,$T1M2,$GM,$T0M2 ++ vpternlogq \$0x96,$T1H,$GH,$T0H ++ vpternlogq \$0x96,$T1L,$GL,$T0L ++___ ++ } else { ++ $code .= <<___; ++ vpxorq $GM,$T0M1,$T0M1 ++ vpxorq $GH,$T0H,$T0H ++ vpxorq $GL,$T0L,$T0L ++___ ++ } ++ } else { ++ ++ # ;; *** GH/GM/GL NOT passed as arguments ++ if ($NUM_BLOCKS >= 4) { ++ $code .= <<___; ++ # ;; add ghash product sums from the first 4, 8 or 12 blocks ++ vpxorq $T1M1,$T0M1,$T0M1 ++ vpxorq $T1M2,$T0M2,$T0M2 ++ vpxorq $T1H,$T0H,$T0H ++ vpxorq $T1L,$T0L,$T0L ++___ ++ } ++ } ++ $code .= <<___; ++ # ;; integrate TM into TH and TL ++ vpxorq $T0M2,$T0M1,$T0M1 ++ vpsrldq \$8,$T0M1,$T1M1 ++ vpslldq \$8,$T0M1,$T1M2 ++ vpxorq $T1M1,$T0H,$T0H ++ vpxorq $T1M2,$T0L,$T0L ++___ ++ } else { ++ ++ # ;; ===================================================== ++ # ;; number of blocks is 4, 8, 12 or 16 ++ # ;; T1H/L/M1/M2 include product sums not T0H/L/M1/M2 ++ if (scalar(@_) == 20) { ++ $code .= <<___; ++ # ;; *** GH/GM/GL passed as arguments ++ vpxorq $GM,$T1M1,$T1M1 ++ vpxorq $GH,$T1H,$T1H ++ vpxorq $GL,$T1L,$T1L ++___ ++ } ++ $code .= <<___; ++ # ;; integrate TM into TH and TL ++ vpxorq $T1M2,$T1M1,$T1M1 ++ vpsrldq \$8,$T1M1,$T0M1 ++ vpslldq \$8,$T1M1,$T0M2 ++ vpxorq $T0M1,$T1H,$T0H ++ vpxorq $T0M2,$T1L,$T0L ++___ ++ } ++ ++ # ;; add TH and TL 128-bit words horizontally ++ &VHPXORI4x128($T0H, $T1M1); ++ &VHPXORI4x128($T0L, $T1M2); ++ ++ # ;; reduction ++ $code .= "vmovdqa64 POLY2(%rip),@{[XWORD($HK)]}\n"; ++ &VCLMUL_REDUCE( ++ @{[XWORD($GHASH)]}, ++ @{[XWORD($HK)]}, ++ @{[XWORD($T0H)]}, ++ @{[XWORD($T0L)]}, ++ @{[XWORD($T0M1)]}, ++ @{[XWORD($T0M2)]}); ++} ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ;; GHASH_MUL MACRO to implement: Data*HashKey mod (x^128 + x^127 + x^126 
+x^121 + 1)
++# ;; Input: A and B (128-bits each, bit-reflected)
++# ;; Output: C = A*B*x mod poly, (i.e. >>1)
++# ;; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
++# ;; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
++# ;;
++# ;; Refer to [3] for more details.
++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
++sub GHASH_MUL {
++ my $GH = $_[0]; #; [in/out] xmm/ymm/zmm with multiply operand(s) (128-bits)
++ my $HK = $_[1]; #; [in] xmm/ymm/zmm with hash key value(s) (128-bits)
++ my $T1 = $_[2]; #; [clobbered] xmm/ymm/zmm
++ my $T2 = $_[3]; #; [clobbered] xmm/ymm/zmm
++ my $T3 = $_[4]; #; [clobbered] xmm/ymm/zmm
++
++ $code .= <<___;
++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
++ vpclmulqdq \$0x11,$HK,$GH,$T1 # ; $T1 = a1*b1
++ vpclmulqdq \$0x00,$HK,$GH,$T2 # ; $T2 = a0*b0
++ vpclmulqdq \$0x01,$HK,$GH,$T3 # ; $T3 = a1*b0
++ vpclmulqdq \$0x10,$HK,$GH,$GH # ; $GH = a0*b1
++ vpxorq $T3,$GH,$GH
++
++ vpsrldq \$8,$GH,$T3 # ; shift-R $GH 2 DWs
++ vpslldq \$8,$GH,$GH # ; shift-L $GH 2 DWs
++ vpxorq $T3,$T1,$T1
++ vpxorq $T2,$GH,$GH
++
++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
++ # ;first phase of the reduction
++ vmovdqu64 POLY2(%rip),$T3
++
++ vpclmulqdq \$0x01,$GH,$T3,$T2
++ vpslldq \$8,$T2,$T2 # ; shift-L $T2 2 DWs
++ vpxorq $T2,$GH,$GH # ; first phase of the reduction complete
++
++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
++ # ;second phase of the reduction
++ vpclmulqdq \$0x00,$GH,$T3,$T2
++ vpsrldq \$4,$T2,$T2 # ; shift-R only 1-DW to obtain 2-DWs shift-R
++ vpclmulqdq \$0x10,$GH,$T3,$GH
++ vpslldq \$4,$GH,$GH # ; Shift-L 1-DW to obtain result with no shifts
++ # ; second phase of the reduction complete, the result is in $GH
++ vpternlogq \$0x96,$T2,$T1,$GH # ; GH = GH xor T1 xor T2
++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
++___
++}
++
++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
++# ;;; PRECOMPUTE computes HashKey_i
++sub PRECOMPUTE {
++ my $GCM128_CTX = $_[0]; #; [in/out] context pointer, hkeys content updated
++ my $HK = $_[1]; #; [in] xmm, hash key
++ my $T1 = $_[2]; #; [clobbered] xmm
++ my $T2 = $_[3]; #; [clobbered] xmm
++ my $T3 = $_[4]; #; [clobbered] xmm
++ my $T4 = $_[5]; #; [clobbered] xmm
++ my $T5 = $_[6]; #; [clobbered] xmm
++ my $T6 = $_[7]; #; [clobbered] xmm
++
++ my $ZT1 = &ZWORD($T1);
++ my $ZT2 = &ZWORD($T2);
++ my $ZT3 = &ZWORD($T3);
++ my $ZT4 = &ZWORD($T4);
++ my $ZT5 = &ZWORD($T5);
++ my $ZT6 = &ZWORD($T6);
++
++ my $YT1 = &YWORD($T1);
++ my $YT2 = &YWORD($T2);
++ my $YT3 = &YWORD($T3);
++ my $YT4 = &YWORD($T4);
++ my $YT5 = &YWORD($T5);
++ my $YT6 = &YWORD($T6);
++
++ $code .= <<___;
++ vshufi32x4 \$0x00,@{[YWORD($HK)]},@{[YWORD($HK)]},$YT5
++ vmovdqa $YT5,$YT4
++___
++
++ # ;; calculate HashKey^2<<1 mod poly
++ &GHASH_MUL($YT4, $YT5, $YT1, $YT2, $YT3);
++
++ $code .= <<___;
++ vmovdqu64 $T4,@{[HashKeyByIdx(2,$GCM128_CTX)]}
++ vinserti64x2 \$1,$HK,$YT4,$YT5
++ vmovdqa64 $YT5,$YT6 # ;; YT6 = HashKey | HashKey^2
++___
++
++ # ;; use 2x128-bit computation
++ # ;; calculate HashKey^4<<1 mod poly, HashKey^3<<1 mod poly
++ &GHASH_MUL($YT5, $YT4, $YT1, $YT2, $YT3); # ;; YT5 = HashKey^3 | HashKey^4
++
++ $code .= <<___;
++ vmovdqu64 $YT5,@{[HashKeyByIdx(4,$GCM128_CTX)]}
++
++ vinserti64x4 \$1,$YT6,$ZT5,$ZT5 # ;; ZT5 = YT6 | YT5
++
++ # ;; switch to 4x128-bit computations now
++ vshufi64x2
\$0x00,$ZT5,$ZT5,$ZT4 # ;; broadcast HashKey^4 across all ZT4 ++ vmovdqa64 $ZT5,$ZT6 # ;; save HashKey^4 to HashKey^1 in ZT6 ++___ ++ ++ # ;; calculate HashKey^5<<1 mod poly, HashKey^6<<1 mod poly, ... HashKey^8<<1 mod poly ++ &GHASH_MUL($ZT5, $ZT4, $ZT1, $ZT2, $ZT3); ++ $code .= <<___; ++ vmovdqu64 $ZT5,@{[HashKeyByIdx(8,$GCM128_CTX)]} # ;; HashKey^8 to HashKey^5 in ZT5 now ++ vshufi64x2 \$0x00,$ZT5,$ZT5,$ZT4 # ;; broadcast HashKey^8 across all ZT4 ++___ ++ ++ # ;; calculate HashKey^9<<1 mod poly, HashKey^10<<1 mod poly, ... HashKey^16<<1 mod poly ++ # ;; use HashKey^8 as multiplier against ZT6 and ZT5 - this allows deeper ooo execution ++ ++ # ;; compute HashKey^(12), HashKey^(11), ... HashKey^(9) ++ &GHASH_MUL($ZT6, $ZT4, $ZT1, $ZT2, $ZT3); ++ $code .= "vmovdqu64 $ZT6,@{[HashKeyByIdx(12,$GCM128_CTX)]}\n"; ++ ++ # ;; compute HashKey^(16), HashKey^(15), ... HashKey^(13) ++ &GHASH_MUL($ZT5, $ZT4, $ZT1, $ZT2, $ZT3); ++ $code .= "vmovdqu64 $ZT5,@{[HashKeyByIdx(16,$GCM128_CTX)]}\n"; ++ ++ # ; Hkeys 17..48 will be precomputed somewhere else as context can hold only 16 hkeys ++} ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ;; READ_SMALL_DATA_INPUT ++# ;; Packs xmm register with data when data input is less or equal to 16 bytes ++# ;; Returns 0 if data has length 0 ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++sub READ_SMALL_DATA_INPUT { ++ my $OUTPUT = $_[0]; # [out] xmm register ++ my $INPUT = $_[1]; # [in] buffer pointer to read from ++ my $LENGTH = $_[2]; # [in] number of bytes to read ++ my $TMP1 = $_[3]; # [clobbered] ++ my $TMP2 = $_[4]; # [clobbered] ++ my $MASK = $_[5]; # [out] k1 to k7 register to store the partial block mask ++ ++ $code .= <<___; ++ mov \$16,@{[DWORD($TMP2)]} ++ lea byte_len_to_mask_table(%rip),$TMP1 ++ cmp $TMP2,$LENGTH ++ cmovc $LENGTH,$TMP2 ++___ ++ if ($win64) { ++ $code .= <<___; ++ add $TMP2,$TMP1 ++ add $TMP2,$TMP1 ++ kmovw ($TMP1),$MASK ++___ ++ } else { ++ $code .= "kmovw ($TMP1,$TMP2,2),$MASK\n"; ++ } ++ $code .= "vmovdqu8 ($INPUT),${OUTPUT}{$MASK}{z}\n"; ++} ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. ++# Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). ++# Output: The hash of the data (AAD_HASH). 
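++# Sizing note (follows from the code below): an AAD of at least 48*16 bytes is
++# hashed in 48-block GHASH_16 passes over hash key powers precomputed on the
++# stack; 32*16- and 16*16-byte chunks use shorter GHASH_16 passes; anything
++# below 16*16 bytes is handled by GHASH_1_TO_16 on 1 to 16 (possibly partial)
++# blocks.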
++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++sub CALC_AAD_HASH { ++ my $A_IN = $_[0]; # [in] AAD text pointer ++ my $A_LEN = $_[1]; # [in] AAD length ++ my $AAD_HASH = $_[2]; # [in/out] xmm ghash value ++ my $GCM128_CTX = $_[3]; # [in] pointer to context ++ my $ZT0 = $_[4]; # [clobbered] ZMM register ++ my $ZT1 = $_[5]; # [clobbered] ZMM register ++ my $ZT2 = $_[6]; # [clobbered] ZMM register ++ my $ZT3 = $_[7]; # [clobbered] ZMM register ++ my $ZT4 = $_[8]; # [clobbered] ZMM register ++ my $ZT5 = $_[9]; # [clobbered] ZMM register ++ my $ZT6 = $_[10]; # [clobbered] ZMM register ++ my $ZT7 = $_[11]; # [clobbered] ZMM register ++ my $ZT8 = $_[12]; # [clobbered] ZMM register ++ my $ZT9 = $_[13]; # [clobbered] ZMM register ++ my $ZT10 = $_[14]; # [clobbered] ZMM register ++ my $ZT11 = $_[15]; # [clobbered] ZMM register ++ my $ZT12 = $_[16]; # [clobbered] ZMM register ++ my $ZT13 = $_[17]; # [clobbered] ZMM register ++ my $ZT14 = $_[18]; # [clobbered] ZMM register ++ my $ZT15 = $_[19]; # [clobbered] ZMM register ++ my $ZT16 = $_[20]; # [clobbered] ZMM register ++ my $T1 = $_[21]; # [clobbered] GP register ++ my $T2 = $_[22]; # [clobbered] GP register ++ my $T3 = $_[23]; # [clobbered] GP register ++ my $MASKREG = $_[24]; # [clobbered] mask register ++ ++ my $HKEYS_READY = "%rbx"; ++ ++ my $SHFMSK = $ZT13; ++ ++ my $rndsuffix = &random_string(); ++ ++ $code .= <<___; ++ mov $A_IN,$T1 # ; T1 = AAD ++ mov $A_LEN,$T2 # ; T2 = aadLen ++ or $T2,$T2 ++ jz .L_CALC_AAD_done_${rndsuffix} ++ ++ xor $HKEYS_READY,$HKEYS_READY ++ vmovdqa64 SHUF_MASK(%rip),$SHFMSK ++ ++.L_get_AAD_loop48x16_${rndsuffix}: ++ cmp \$`(48*16)`,$T2 ++ jl .L_exit_AAD_loop48x16_${rndsuffix} ++___ ++ ++ $code .= <<___; ++ vmovdqu64 `64*0`($T1),$ZT1 # ; Blocks 0-3 ++ vmovdqu64 `64*1`($T1),$ZT2 # ; Blocks 4-7 ++ vmovdqu64 `64*2`($T1),$ZT3 # ; Blocks 8-11 ++ vmovdqu64 `64*3`($T1),$ZT4 # ; Blocks 12-15 ++ vpshufb $SHFMSK,$ZT1,$ZT1 ++ vpshufb $SHFMSK,$ZT2,$ZT2 ++ vpshufb $SHFMSK,$ZT3,$ZT3 ++ vpshufb $SHFMSK,$ZT4,$ZT4 ++___ ++ ++ &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZT0, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT14, "all"); ++ $code .= "mov \$1,$HKEYS_READY\n"; ++ ++ &GHASH_16( ++ "start", $ZT5, $ZT6, $ZT7, ++ "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp", ++ &HashKeyOffsetByIdx(48, "frame"), 0, "@{[ZWORD($AAD_HASH)]}", $ZT0, ++ $ZT8, $ZT9, $ZT10, $ZT11, ++ $ZT12, $ZT14, $ZT15, $ZT16, ++ "NO_ZMM", $ZT1, $ZT2, $ZT3, ++ $ZT4); ++ ++ $code .= <<___; ++ vmovdqu64 `16*16 + 64*0`($T1),$ZT1 # ; Blocks 16-19 ++ vmovdqu64 `16*16 + 64*1`($T1),$ZT2 # ; Blocks 20-23 ++ vmovdqu64 `16*16 + 64*2`($T1),$ZT3 # ; Blocks 24-27 ++ vmovdqu64 `16*16 + 64*3`($T1),$ZT4 # ; Blocks 28-31 ++ vpshufb $SHFMSK,$ZT1,$ZT1 ++ vpshufb $SHFMSK,$ZT2,$ZT2 ++ vpshufb $SHFMSK,$ZT3,$ZT3 ++ vpshufb $SHFMSK,$ZT4,$ZT4 ++___ ++ ++ &GHASH_16( ++ "mid", $ZT5, $ZT6, $ZT7, ++ "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp", ++ &HashKeyOffsetByIdx(32, "frame"), 0, "NO_HASH_IN_OUT", $ZT0, ++ $ZT8, $ZT9, $ZT10, $ZT11, ++ $ZT12, $ZT14, $ZT15, $ZT16, ++ "NO_ZMM", $ZT1, $ZT2, $ZT3, ++ $ZT4); ++ ++ $code .= <<___; ++ vmovdqu64 `32*16 + 64*0`($T1),$ZT1 # ; Blocks 32-35 ++ vmovdqu64 `32*16 + 64*1`($T1),$ZT2 # ; Blocks 36-39 ++ vmovdqu64 `32*16 + 64*2`($T1),$ZT3 # ; Blocks 40-43 ++ vmovdqu64 `32*16 + 64*3`($T1),$ZT4 # ; Blocks 44-47 ++ vpshufb $SHFMSK,$ZT1,$ZT1 ++ vpshufb $SHFMSK,$ZT2,$ZT2 ++ vpshufb $SHFMSK,$ZT3,$ZT3 ++ vpshufb $SHFMSK,$ZT4,$ZT4 ++___ ++ ++ &GHASH_16( ++ "end_reduce", $ZT5, $ZT6, $ZT7, ++ "NO_INPUT_PTR", 
"NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp", ++ &HashKeyOffsetByIdx(16, "frame"), 0, &ZWORD($AAD_HASH), $ZT0, ++ $ZT8, $ZT9, $ZT10, $ZT11, ++ $ZT12, $ZT14, $ZT15, $ZT16, ++ "NO_ZMM", $ZT1, $ZT2, $ZT3, ++ $ZT4); ++ ++ $code .= <<___; ++ sub \$`(48*16)`,$T2 ++ je .L_CALC_AAD_done_${rndsuffix} ++ ++ add \$`(48*16)`,$T1 ++ jmp .L_get_AAD_loop48x16_${rndsuffix} ++ ++.L_exit_AAD_loop48x16_${rndsuffix}: ++ # ; Less than 48x16 bytes remaining ++ cmp \$`(32*16)`,$T2 ++ jl .L_less_than_32x16_${rndsuffix} ++___ ++ ++ $code .= <<___; ++ # ; Get next 16 blocks ++ vmovdqu64 `64*0`($T1),$ZT1 ++ vmovdqu64 `64*1`($T1),$ZT2 ++ vmovdqu64 `64*2`($T1),$ZT3 ++ vmovdqu64 `64*3`($T1),$ZT4 ++ vpshufb $SHFMSK,$ZT1,$ZT1 ++ vpshufb $SHFMSK,$ZT2,$ZT2 ++ vpshufb $SHFMSK,$ZT3,$ZT3 ++ vpshufb $SHFMSK,$ZT4,$ZT4 ++___ ++ ++ &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZT0, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT14, "first32"); ++ $code .= "mov \$1,$HKEYS_READY\n"; ++ ++ &GHASH_16( ++ "start", $ZT5, $ZT6, $ZT7, ++ "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp", ++ &HashKeyOffsetByIdx(32, "frame"), 0, &ZWORD($AAD_HASH), $ZT0, ++ $ZT8, $ZT9, $ZT10, $ZT11, ++ $ZT12, $ZT14, $ZT15, $ZT16, ++ "NO_ZMM", $ZT1, $ZT2, $ZT3, ++ $ZT4); ++ ++ $code .= <<___; ++ vmovdqu64 `16*16 + 64*0`($T1),$ZT1 ++ vmovdqu64 `16*16 + 64*1`($T1),$ZT2 ++ vmovdqu64 `16*16 + 64*2`($T1),$ZT3 ++ vmovdqu64 `16*16 + 64*3`($T1),$ZT4 ++ vpshufb $SHFMSK,$ZT1,$ZT1 ++ vpshufb $SHFMSK,$ZT2,$ZT2 ++ vpshufb $SHFMSK,$ZT3,$ZT3 ++ vpshufb $SHFMSK,$ZT4,$ZT4 ++___ ++ ++ &GHASH_16( ++ "end_reduce", $ZT5, $ZT6, $ZT7, ++ "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp", ++ &HashKeyOffsetByIdx(16, "frame"), 0, &ZWORD($AAD_HASH), $ZT0, ++ $ZT8, $ZT9, $ZT10, $ZT11, ++ $ZT12, $ZT14, $ZT15, $ZT16, ++ "NO_ZMM", $ZT1, $ZT2, $ZT3, ++ $ZT4); ++ ++ $code .= <<___; ++ sub \$`(32*16)`,$T2 ++ je .L_CALC_AAD_done_${rndsuffix} ++ ++ add \$`(32*16)`,$T1 ++ jmp .L_less_than_16x16_${rndsuffix} ++ ++.L_less_than_32x16_${rndsuffix}: ++ cmp \$`(16*16)`,$T2 ++ jl .L_less_than_16x16_${rndsuffix} ++ # ; Get next 16 blocks ++ vmovdqu64 `64*0`($T1),$ZT1 ++ vmovdqu64 `64*1`($T1),$ZT2 ++ vmovdqu64 `64*2`($T1),$ZT3 ++ vmovdqu64 `64*3`($T1),$ZT4 ++ vpshufb $SHFMSK,$ZT1,$ZT1 ++ vpshufb $SHFMSK,$ZT2,$ZT2 ++ vpshufb $SHFMSK,$ZT3,$ZT3 ++ vpshufb $SHFMSK,$ZT4,$ZT4 ++___ ++ ++ # ; This code path does not use more than 16 hkeys, so they can be taken from the context ++ # ; (not from the stack storage) ++ &GHASH_16( ++ "start_reduce", $ZT5, $ZT6, $ZT7, ++ "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", $GCM128_CTX, ++ &HashKeyOffsetByIdx(16, "context"), 0, &ZWORD($AAD_HASH), $ZT0, ++ $ZT8, $ZT9, $ZT10, $ZT11, ++ $ZT12, $ZT14, $ZT15, $ZT16, ++ "NO_ZMM", $ZT1, $ZT2, $ZT3, ++ $ZT4); ++ ++ $code .= <<___; ++ sub \$`(16*16)`,$T2 ++ je .L_CALC_AAD_done_${rndsuffix} ++ ++ add \$`(16*16)`,$T1 ++ # ; Less than 16x16 bytes remaining ++.L_less_than_16x16_${rndsuffix}: ++ # ;; prep mask source address ++ lea byte64_len_to_mask_table(%rip),$T3 ++ lea ($T3,$T2,8),$T3 ++ ++ # ;; calculate number of blocks to ghash (including partial bytes) ++ add \$15,@{[DWORD($T2)]} ++ shr \$4,@{[DWORD($T2)]} ++ cmp \$2,@{[DWORD($T2)]} ++ jb .L_AAD_blocks_1_${rndsuffix} ++ je .L_AAD_blocks_2_${rndsuffix} ++ cmp \$4,@{[DWORD($T2)]} ++ jb .L_AAD_blocks_3_${rndsuffix} ++ je .L_AAD_blocks_4_${rndsuffix} ++ cmp \$6,@{[DWORD($T2)]} ++ jb .L_AAD_blocks_5_${rndsuffix} ++ je .L_AAD_blocks_6_${rndsuffix} ++ cmp \$8,@{[DWORD($T2)]} ++ jb .L_AAD_blocks_7_${rndsuffix} ++ je .L_AAD_blocks_8_${rndsuffix} ++ cmp \$10,@{[DWORD($T2)]} ++ jb 
.L_AAD_blocks_9_${rndsuffix} ++ je .L_AAD_blocks_10_${rndsuffix} ++ cmp \$12,@{[DWORD($T2)]} ++ jb .L_AAD_blocks_11_${rndsuffix} ++ je .L_AAD_blocks_12_${rndsuffix} ++ cmp \$14,@{[DWORD($T2)]} ++ jb .L_AAD_blocks_13_${rndsuffix} ++ je .L_AAD_blocks_14_${rndsuffix} ++ cmp \$15,@{[DWORD($T2)]} ++ je .L_AAD_blocks_15_${rndsuffix} ++___ ++ ++ # ;; fall through for 16 blocks ++ ++ # ;; The flow of each of these cases is identical: ++ # ;; - load blocks plain text ++ # ;; - shuffle loaded blocks ++ # ;; - xor in current hash value into block 0 ++ # ;; - perform up multiplications with ghash keys ++ # ;; - jump to reduction code ++ ++ for (my $aad_blocks = 16; $aad_blocks > 0; $aad_blocks--) { ++ $code .= ".L_AAD_blocks_${aad_blocks}_${rndsuffix}:\n"; ++ if ($aad_blocks > 12) { ++ $code .= "sub \$`12*16*8`, $T3\n"; ++ } elsif ($aad_blocks > 8) { ++ $code .= "sub \$`8*16*8`, $T3\n"; ++ } elsif ($aad_blocks > 4) { ++ $code .= "sub \$`4*16*8`, $T3\n"; ++ } ++ $code .= "kmovq ($T3),$MASKREG\n"; ++ ++ &ZMM_LOAD_MASKED_BLOCKS_0_16($aad_blocks, $T1, 0, $ZT1, $ZT2, $ZT3, $ZT4, $MASKREG); ++ ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16($aad_blocks, "vpshufb", $ZT1, $ZT2, $ZT3, $ZT4, ++ $ZT1, $ZT2, $ZT3, $ZT4, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK); ++ ++ &GHASH_1_TO_16($GCM128_CTX, &ZWORD($AAD_HASH), ++ $ZT0, $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, &ZWORD($AAD_HASH), $ZT1, $ZT2, $ZT3, $ZT4, $aad_blocks); ++ ++ if ($aad_blocks > 1) { ++ ++ # ;; fall through to CALC_AAD_done in 1 block case ++ $code .= "jmp .L_CALC_AAD_done_${rndsuffix}\n"; ++ } ++ ++ } ++ $code .= ".L_CALC_AAD_done_${rndsuffix}:\n"; ++ ++ # ;; result in AAD_HASH ++} ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ;; PARTIAL_BLOCK ++# ;; Handles encryption/decryption and the tag partial blocks between ++# ;; update calls. ++# ;; Requires the input data be at least 1 byte long. 
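++# ;; Example (per the logic below): if a previous call left 10 bytes in the
++# ;; partial block ($PBLOCK_LEN = 10), up to 6 bytes of new input complete the
++# ;; 16-byte block, which is then ghashed and $PBLOCK_LEN is reset to 0; with
++# ;; fewer new bytes the block stays partial and $PBLOCK_LEN grows instead.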
++# ;; Output: ++# ;; A cipher/plain of the first partial block (CIPH_PLAIN_OUT), ++# ;; AAD_HASH and updated GCM128_CTX ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++sub PARTIAL_BLOCK { ++ my $GCM128_CTX = $_[0]; # [in] key pointer ++ my $PBLOCK_LEN = $_[1]; # [in] partial block length ++ my $CIPH_PLAIN_OUT = $_[2]; # [in] output buffer ++ my $PLAIN_CIPH_IN = $_[3]; # [in] input buffer ++ my $PLAIN_CIPH_LEN = $_[4]; # [in] buffer length ++ my $DATA_OFFSET = $_[5]; # [out] data offset (gets set) ++ my $AAD_HASH = $_[6]; # [out] updated GHASH value ++ my $ENC_DEC = $_[7]; # [in] cipher direction ++ my $GPTMP0 = $_[8]; # [clobbered] GP temporary register ++ my $GPTMP1 = $_[9]; # [clobbered] GP temporary register ++ my $GPTMP2 = $_[10]; # [clobbered] GP temporary register ++ my $ZTMP0 = $_[11]; # [clobbered] ZMM temporary register ++ my $ZTMP1 = $_[12]; # [clobbered] ZMM temporary register ++ my $ZTMP2 = $_[13]; # [clobbered] ZMM temporary register ++ my $ZTMP3 = $_[14]; # [clobbered] ZMM temporary register ++ my $ZTMP4 = $_[15]; # [clobbered] ZMM temporary register ++ my $ZTMP5 = $_[16]; # [clobbered] ZMM temporary register ++ my $ZTMP6 = $_[17]; # [clobbered] ZMM temporary register ++ my $ZTMP7 = $_[18]; # [clobbered] ZMM temporary register ++ my $MASKREG = $_[19]; # [clobbered] mask temporary register ++ ++ my $XTMP0 = &XWORD($ZTMP0); ++ my $XTMP1 = &XWORD($ZTMP1); ++ my $XTMP2 = &XWORD($ZTMP2); ++ my $XTMP3 = &XWORD($ZTMP3); ++ my $XTMP4 = &XWORD($ZTMP4); ++ my $XTMP5 = &XWORD($ZTMP5); ++ my $XTMP6 = &XWORD($ZTMP6); ++ my $XTMP7 = &XWORD($ZTMP7); ++ ++ my $LENGTH = $DATA_OFFSET; ++ my $IA0 = $GPTMP1; ++ my $IA1 = $GPTMP2; ++ my $IA2 = $GPTMP0; ++ ++ my $rndsuffix = &random_string(); ++ ++ $code .= <<___; ++ # ;; if no partial block present then LENGTH/DATA_OFFSET will be set to zero ++ mov ($PBLOCK_LEN),$LENGTH ++ or $LENGTH,$LENGTH ++ je .L_partial_block_done_${rndsuffix} # ;Leave Macro if no partial blocks ++___ ++ ++ &READ_SMALL_DATA_INPUT($XTMP0, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN, $IA0, $IA2, $MASKREG); ++ ++ $code .= <<___; ++ # ;; XTMP1 = my_ctx_data.partial_block_enc_key ++ vmovdqu64 $CTX_OFFSET_PEncBlock($GCM128_CTX),$XTMP1 ++ vmovdqu64 @{[HashKeyByIdx(1,$GCM128_CTX)]},$XTMP2 ++ ++ # ;; adjust the shuffle mask pointer to be able to shift right $LENGTH bytes ++ # ;; (16 - $LENGTH) is the number of bytes in plaintext mod 16) ++ lea SHIFT_MASK(%rip),$IA0 ++ add $LENGTH,$IA0 ++ vmovdqu64 ($IA0),$XTMP3 # ; shift right shuffle mask ++ vpshufb $XTMP3,$XTMP1,$XTMP1 ++___ ++ ++ if ($ENC_DEC eq "DEC") { ++ $code .= <<___; ++ # ;; keep copy of cipher text in $XTMP4 ++ vmovdqa64 $XTMP0,$XTMP4 ++___ ++ } ++ $code .= <<___; ++ vpxorq $XTMP0,$XTMP1,$XTMP1 # ; Ciphertext XOR E(K, Yn) ++ # ;; Set $IA1 to be the amount of data left in CIPH_PLAIN_IN after filling the block ++ # ;; Determine if partial block is not being filled and shift mask accordingly ++___ ++ if ($win64) { ++ $code .= <<___; ++ mov $PLAIN_CIPH_LEN,$IA1 ++ add $LENGTH,$IA1 ++___ ++ } else { ++ $code .= "lea ($PLAIN_CIPH_LEN, $LENGTH, 1),$IA1\n"; ++ } ++ $code .= <<___; ++ sub \$16,$IA1 ++ jge .L_no_extra_mask_${rndsuffix} ++ sub $IA1,$IA0 ++.L_no_extra_mask_${rndsuffix}: ++ # ;; get the appropriate mask to mask out bottom $LENGTH bytes of $XTMP1 ++ # ;; - mask out bottom $LENGTH bytes of $XTMP1 ++ # ;; sizeof(SHIFT_MASK) == 16 bytes ++ vmovdqu64 16($IA0),$XTMP0 ++ vpand $XTMP0,$XTMP1,$XTMP1 ++___ ++ ++ if ($ENC_DEC eq "DEC") { ++ $code .= <<___; ++ vpand $XTMP0,$XTMP4,$XTMP4 ++ vpshufb 
SHUF_MASK(%rip),$XTMP4,$XTMP4 ++ vpshufb $XTMP3,$XTMP4,$XTMP4 ++ vpxorq $XTMP4,$AAD_HASH,$AAD_HASH ++___ ++ } else { ++ $code .= <<___; ++ vpshufb SHUF_MASK(%rip),$XTMP1,$XTMP1 ++ vpshufb $XTMP3,$XTMP1,$XTMP1 ++ vpxorq $XTMP1,$AAD_HASH,$AAD_HASH ++___ ++ } ++ $code .= <<___; ++ cmp \$0,$IA1 ++ jl .L_partial_incomplete_${rndsuffix} ++___ ++ ++ # ;; GHASH computation for the last <16 Byte block ++ &GHASH_MUL($AAD_HASH, $XTMP2, $XTMP5, $XTMP6, $XTMP7); ++ ++ $code .= <<___; ++ movq \$0, ($PBLOCK_LEN) ++ # ;; Set $LENGTH to be the number of bytes to write out ++ mov $LENGTH,$IA0 ++ mov \$16,$LENGTH ++ sub $IA0,$LENGTH ++ jmp .L_enc_dec_done_${rndsuffix} ++ ++.L_partial_incomplete_${rndsuffix}: ++___ ++ if ($win64) { ++ $code .= <<___; ++ mov $PLAIN_CIPH_LEN,$IA0 ++ add $IA0,($PBLOCK_LEN) ++___ ++ } else { ++ $code .= "add $PLAIN_CIPH_LEN,($PBLOCK_LEN)\n"; ++ } ++ $code .= <<___; ++ mov $PLAIN_CIPH_LEN,$LENGTH ++ ++.L_enc_dec_done_${rndsuffix}: ++ # ;; output encrypted Bytes ++ ++ lea byte_len_to_mask_table(%rip),$IA0 ++ kmovw ($IA0,$LENGTH,2),$MASKREG ++ vmovdqu64 $AAD_HASH,$CTX_OFFSET_AadHash($GCM128_CTX) ++___ ++ ++ if ($ENC_DEC eq "ENC") { ++ $code .= <<___; ++ # ;; shuffle XTMP1 back to output as ciphertext ++ vpshufb SHUF_MASK(%rip),$XTMP1,$XTMP1 ++ vpshufb $XTMP3,$XTMP1,$XTMP1 ++___ ++ } ++ $code .= <<___; ++ mov $CIPH_PLAIN_OUT,$IA0 ++ vmovdqu8 $XTMP1,($IA0){$MASKREG} ++.L_partial_block_done_${rndsuffix}: ++___ ++} ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ;; Ciphers 1 to 16 blocks and prepares them for later GHASH compute operation ++sub INITIAL_BLOCKS_PARTIAL_CIPHER { ++ my $AES_KEYS = $_[0]; # [in] key pointer ++ my $GCM128_CTX = $_[1]; # [in] context pointer ++ my $CIPH_PLAIN_OUT = $_[2]; # [in] text output pointer ++ my $PLAIN_CIPH_IN = $_[3]; # [in] text input pointer ++ my $LENGTH = $_[4]; # [in/clobbered] length in bytes ++ my $DATA_OFFSET = $_[5]; # [in/out] current data offset (updated) ++ my $NUM_BLOCKS = $_[6]; # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0) ++ my $CTR = $_[7]; # [in/out] current counter value ++ my $ENC_DEC = $_[8]; # [in] cipher direction (ENC/DEC) ++ my $DAT0 = $_[9]; # [out] ZMM with cipher text shuffled for GHASH ++ my $DAT1 = $_[10]; # [out] ZMM with cipher text shuffled for GHASH ++ my $DAT2 = $_[11]; # [out] ZMM with cipher text shuffled for GHASH ++ my $DAT3 = $_[12]; # [out] ZMM with cipher text shuffled for GHASH ++ my $LAST_CIPHER_BLK = $_[13]; # [out] XMM to put ciphered counter block partially xor'ed with text ++ my $LAST_GHASH_BLK = $_[14]; # [out] XMM to put last cipher text block shuffled for GHASH ++ my $CTR0 = $_[15]; # [clobbered] ZMM temporary ++ my $CTR1 = $_[16]; # [clobbered] ZMM temporary ++ my $CTR2 = $_[17]; # [clobbered] ZMM temporary ++ my $CTR3 = $_[18]; # [clobbered] ZMM temporary ++ my $ZT1 = $_[19]; # [clobbered] ZMM temporary ++ my $IA0 = $_[20]; # [clobbered] GP temporary ++ my $IA1 = $_[21]; # [clobbered] GP temporary ++ my $MASKREG = $_[22]; # [clobbered] mask register ++ my $SHUFMASK = $_[23]; # [out] ZMM loaded with BE/LE shuffle mask ++ ++ if ($NUM_BLOCKS == 1) { ++ $code .= "vmovdqa64 SHUF_MASK(%rip),@{[XWORD($SHUFMASK)]}\n"; ++ } elsif ($NUM_BLOCKS == 2) { ++ $code .= "vmovdqa64 SHUF_MASK(%rip),@{[YWORD($SHUFMASK)]}\n"; ++ } else { ++ $code .= "vmovdqa64 SHUF_MASK(%rip),$SHUFMASK\n"; ++ } ++ ++ # ;; prepare AES counter blocks ++ if ($NUM_BLOCKS == 1) { ++ $code .= "vpaddd ONE(%rip),$CTR,@{[XWORD($CTR0)]}\n"; ++ } elsif ($NUM_BLOCKS == 2) { ++ $code .= <<___; ++ 
vshufi64x2 \$0,@{[YWORD($CTR)]},@{[YWORD($CTR)]},@{[YWORD($CTR0)]} ++ vpaddd ddq_add_1234(%rip),@{[YWORD($CTR0)]},@{[YWORD($CTR0)]} ++___ ++ } else { ++ $code .= <<___; ++ vshufi64x2 \$0,@{[ZWORD($CTR)]},@{[ZWORD($CTR)]},@{[ZWORD($CTR)]} ++ vpaddd ddq_add_1234(%rip),@{[ZWORD($CTR)]},$CTR0 ++___ ++ if ($NUM_BLOCKS > 4) { ++ $code .= "vpaddd ddq_add_5678(%rip),@{[ZWORD($CTR)]},$CTR1\n"; ++ } ++ if ($NUM_BLOCKS > 8) { ++ $code .= "vpaddd ddq_add_8888(%rip),$CTR0,$CTR2\n"; ++ } ++ if ($NUM_BLOCKS > 12) { ++ $code .= "vpaddd ddq_add_8888(%rip),$CTR1,$CTR3\n"; ++ } ++ } ++ ++ # ;; get load/store mask ++ $code .= <<___; ++ lea byte64_len_to_mask_table(%rip),$IA0 ++ mov $LENGTH,$IA1 ++___ ++ if ($NUM_BLOCKS > 12) { ++ $code .= "sub \$`3*64`,$IA1\n"; ++ } elsif ($NUM_BLOCKS > 8) { ++ $code .= "sub \$`2*64`,$IA1\n"; ++ } elsif ($NUM_BLOCKS > 4) { ++ $code .= "sub \$`1*64`,$IA1\n"; ++ } ++ $code .= "kmovq ($IA0,$IA1,8),$MASKREG\n"; ++ ++ # ;; extract new counter value ++ # ;; shuffle the counters for AES rounds ++ if ($NUM_BLOCKS <= 4) { ++ $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$CTR0,$CTR\n"; ++ } elsif ($NUM_BLOCKS <= 8) { ++ $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$CTR1,$CTR\n"; ++ } elsif ($NUM_BLOCKS <= 12) { ++ $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$CTR2,$CTR\n"; ++ } else { ++ $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$CTR3,$CTR\n"; ++ } ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUM_BLOCKS, "vpshufb", $CTR0, $CTR1, $CTR2, $CTR3, $CTR0, ++ $CTR1, $CTR2, $CTR3, $SHUFMASK, $SHUFMASK, $SHUFMASK, $SHUFMASK); ++ ++ # ;; load plain/cipher text ++ &ZMM_LOAD_MASKED_BLOCKS_0_16($NUM_BLOCKS, $PLAIN_CIPH_IN, $DATA_OFFSET, $DAT0, $DAT1, $DAT2, $DAT3, $MASKREG); ++ ++ # ;; AES rounds and XOR with plain/cipher text ++ foreach my $j (0 .. 
($NROUNDS + 1)) { ++ $code .= "vbroadcastf64x2 `($j * 16)`($AES_KEYS),$ZT1\n"; ++ &ZMM_AESENC_ROUND_BLOCKS_0_16($CTR0, $CTR1, $CTR2, $CTR3, $ZT1, $j, ++ $DAT0, $DAT1, $DAT2, $DAT3, $NUM_BLOCKS, $NROUNDS); ++ } ++ ++ # ;; retrieve the last cipher counter block (partially XOR'ed with text) ++ # ;; - this is needed for partial block cases ++ if ($NUM_BLOCKS <= 4) { ++ $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$CTR0,$LAST_CIPHER_BLK\n"; ++ } elsif ($NUM_BLOCKS <= 8) { ++ $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$CTR1,$LAST_CIPHER_BLK\n"; ++ } elsif ($NUM_BLOCKS <= 12) { ++ $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$CTR2,$LAST_CIPHER_BLK\n"; ++ } else { ++ $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$CTR3,$LAST_CIPHER_BLK\n"; ++ } ++ ++ # ;; write cipher/plain text back to output and ++ $code .= "mov $CIPH_PLAIN_OUT,$IA0\n"; ++ &ZMM_STORE_MASKED_BLOCKS_0_16($NUM_BLOCKS, $IA0, $DATA_OFFSET, $CTR0, $CTR1, $CTR2, $CTR3, $MASKREG); ++ ++ # ;; zero bytes outside the mask before hashing ++ if ($NUM_BLOCKS <= 4) { ++ $code .= "vmovdqu8 $CTR0,${CTR0}{$MASKREG}{z}\n"; ++ } elsif ($NUM_BLOCKS <= 8) { ++ $code .= "vmovdqu8 $CTR1,${CTR1}{$MASKREG}{z}\n"; ++ } elsif ($NUM_BLOCKS <= 12) { ++ $code .= "vmovdqu8 $CTR2,${CTR2}{$MASKREG}{z}\n"; ++ } else { ++ $code .= "vmovdqu8 $CTR3,${CTR3}{$MASKREG}{z}\n"; ++ } ++ ++ # ;; Shuffle the cipher text blocks for hashing part ++ # ;; ZT5 and ZT6 are expected outputs with blocks for hashing ++ if ($ENC_DEC eq "DEC") { ++ ++ # ;; Decrypt case ++ # ;; - cipher blocks are in ZT5 & ZT6 ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUM_BLOCKS, "vpshufb", $DAT0, $DAT1, $DAT2, $DAT3, $DAT0, ++ $DAT1, $DAT2, $DAT3, $SHUFMASK, $SHUFMASK, $SHUFMASK, $SHUFMASK); ++ } else { ++ ++ # ;; Encrypt case ++ # ;; - cipher blocks are in CTR0-CTR3 ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUM_BLOCKS, "vpshufb", $DAT0, $DAT1, $DAT2, $DAT3, $CTR0, ++ $CTR1, $CTR2, $CTR3, $SHUFMASK, $SHUFMASK, $SHUFMASK, $SHUFMASK); ++ } ++ ++ # ;; Extract the last block for partials and multi_call cases ++ if ($NUM_BLOCKS <= 4) { ++ $code .= "vextracti32x4 \$`($NUM_BLOCKS-1)`,$DAT0,$LAST_GHASH_BLK\n"; ++ } elsif ($NUM_BLOCKS <= 8) { ++ $code .= "vextracti32x4 \$`($NUM_BLOCKS-5)`,$DAT1,$LAST_GHASH_BLK\n"; ++ } elsif ($NUM_BLOCKS <= 12) { ++ $code .= "vextracti32x4 \$`($NUM_BLOCKS-9)`,$DAT2,$LAST_GHASH_BLK\n"; ++ } else { ++ $code .= "vextracti32x4 \$`($NUM_BLOCKS-13)`,$DAT3,$LAST_GHASH_BLK\n"; ++ } ++ ++} ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ;; Computes GHASH on 1 to 16 blocks ++sub INITIAL_BLOCKS_PARTIAL_GHASH { ++ my $AES_KEYS = $_[0]; # [in] key pointer ++ my $GCM128_CTX = $_[1]; # [in] context pointer ++ my $LENGTH = $_[2]; # [in/clobbered] length in bytes ++ my $NUM_BLOCKS = $_[3]; # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0) ++ my $HASH_IN_OUT = $_[4]; # [in/out] XMM ghash in/out value ++ my $ENC_DEC = $_[5]; # [in] cipher direction (ENC/DEC) ++ my $DAT0 = $_[6]; # [in] ZMM with cipher text shuffled for GHASH ++ my $DAT1 = $_[7]; # [in] ZMM with cipher text shuffled for GHASH ++ my $DAT2 = $_[8]; # [in] ZMM with cipher text shuffled for GHASH ++ my $DAT3 = $_[9]; # [in] ZMM with cipher text shuffled for GHASH ++ my $LAST_CIPHER_BLK = $_[10]; # [in] XMM with ciphered counter block partially xor'ed with text ++ my $LAST_GHASH_BLK = $_[11]; # [in] XMM with last cipher text block shuffled for GHASH ++ my $ZT0 = $_[12]; # [clobbered] ZMM temporary ++ my $ZT1 = $_[13]; # [clobbered] ZMM temporary ++ my $ZT2 = $_[14]; 
# [clobbered] ZMM temporary ++ my $ZT3 = $_[15]; # [clobbered] ZMM temporary ++ my $ZT4 = $_[16]; # [clobbered] ZMM temporary ++ my $ZT5 = $_[17]; # [clobbered] ZMM temporary ++ my $ZT6 = $_[18]; # [clobbered] ZMM temporary ++ my $ZT7 = $_[19]; # [clobbered] ZMM temporary ++ my $ZT8 = $_[20]; # [clobbered] ZMM temporary ++ my $PBLOCK_LEN = $_[21]; # [in] partial block length ++ my $GH = $_[22]; # [in] ZMM with hi product part ++ my $GM = $_[23]; # [in] ZMM with mid prodcut part ++ my $GL = $_[24]; # [in] ZMM with lo product part ++ ++ my $rndsuffix = &random_string(); ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;;; - Hash all but the last partial block of data ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ ++ # ;; update data offset ++ if ($NUM_BLOCKS > 1) { ++ ++ # ;; The final block of data may be <16B ++ $code .= "sub \$16 * ($NUM_BLOCKS - 1),$LENGTH\n"; ++ } ++ ++ if ($NUM_BLOCKS < 16) { ++ $code .= <<___; ++ # ;; NOTE: the 'jl' is always taken for num_initial_blocks = 16. ++ # ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 256. ++ cmp \$16,$LENGTH ++ jl .L_small_initial_partial_block_${rndsuffix} ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;;; Handle a full length final block - encrypt and hash all blocks ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ ++ sub \$16,$LENGTH ++ movq \$0,($PBLOCK_LEN) ++___ ++ ++ # ;; Hash all of the data ++ if (scalar(@_) == 22) { ++ ++ # ;; start GHASH compute ++ &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, ++ $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $NUM_BLOCKS); ++ } elsif (scalar(@_) == 25) { ++ ++ # ;; continue GHASH compute ++ &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, ++ $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $NUM_BLOCKS, $GH, $GM, $GL); ++ } ++ $code .= "jmp .L_small_initial_compute_done_${rndsuffix}\n"; ++ } ++ ++ $code .= <<___; ++.L_small_initial_partial_block_${rndsuffix}: ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;;; Handle ghash for a <16B final block ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ ++ # ;; As it's an init / update / finalize series we need to leave the ++ # ;; last block if it's less than a full block of data. ++ ++ mov $LENGTH,($PBLOCK_LEN) ++ vmovdqu64 $LAST_CIPHER_BLK,$CTX_OFFSET_PEncBlock($GCM128_CTX) ++___ ++ ++ my $k = ($NUM_BLOCKS - 1); ++ my $last_block_to_hash = 1; ++ if (($NUM_BLOCKS > $last_block_to_hash)) { ++ ++ # ;; ZT12-ZT20 - temporary registers ++ if (scalar(@_) == 22) { ++ ++ # ;; start GHASH compute ++ &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, ++ $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $k); ++ } elsif (scalar(@_) == 25) { ++ ++ # ;; continue GHASH compute ++ &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, ++ $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $k, $GH, $GM, $GL); ++ } ++ ++ # ;; just fall through no jmp needed ++ } else { ++ ++ if (scalar(@_) == 25) { ++ $code .= <<___; ++ # ;; Reduction is required in this case. ++ # ;; Integrate GM into GH and GL. 
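++      # ;; (the mid product part is split: its upper qword is folded into
++      # ;; the hi sum and its lower qword into the lo sum, leaving two
++      # ;; accumulators for the horizontal XOR and the reduction below)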
++ vpsrldq \$8,$GM,$ZT0 ++ vpslldq \$8,$GM,$ZT1 ++ vpxorq $ZT0,$GH,$GH ++ vpxorq $ZT1,$GL,$GL ++___ ++ ++ # ;; Add GH and GL 128-bit words horizontally ++ &VHPXORI4x128($GH, $ZT0); ++ &VHPXORI4x128($GL, $ZT1); ++ ++ # ;; 256-bit to 128-bit reduction ++ $code .= "vmovdqa64 POLY2(%rip),@{[XWORD($ZT0)]}\n"; ++ &VCLMUL_REDUCE(&XWORD($HASH_IN_OUT), &XWORD($ZT0), &XWORD($GH), &XWORD($GL), &XWORD($ZT1), &XWORD($ZT2)); ++ } ++ $code .= <<___; ++ # ;; Record that a reduction is not needed - ++ # ;; In this case no hashes are computed because there ++ # ;; is only one initial block and it is < 16B in length. ++ # ;; We only need to check if a reduction is needed if ++ # ;; initial_blocks == 1 and init/update/final is being used. ++ # ;; In this case we may just have a partial block, and that ++ # ;; gets hashed in finalize. ++ ++ # ;; The hash should end up in HASH_IN_OUT. ++ # ;; The only way we should get here is if there is ++ # ;; a partial block of data, so xor that into the hash. ++ vpxorq $LAST_GHASH_BLK,$HASH_IN_OUT,$HASH_IN_OUT ++ # ;; The result is in $HASH_IN_OUT ++ jmp .L_after_reduction_${rndsuffix} ++___ ++ } ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;;; After GHASH reduction ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ ++ $code .= ".L_small_initial_compute_done_${rndsuffix}:\n"; ++ ++ # ;; If using init/update/finalize, we need to xor any partial block data ++ # ;; into the hash. ++ if ($NUM_BLOCKS > 1) { ++ ++ # ;; NOTE: for $NUM_BLOCKS = 0 the xor never takes place ++ if ($NUM_BLOCKS != 16) { ++ $code .= <<___; ++ # ;; NOTE: for $NUM_BLOCKS = 16, $LENGTH, stored in [PBlockLen] is never zero ++ or $LENGTH,$LENGTH ++ je .L_after_reduction_${rndsuffix} ++___ ++ } ++ $code .= "vpxorq $LAST_GHASH_BLK,$HASH_IN_OUT,$HASH_IN_OUT\n"; ++ } ++ ++ $code .= ".L_after_reduction_${rndsuffix}:\n"; ++ ++ # ;; Final hash is now in HASH_IN_OUT ++} ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ;; INITIAL_BLOCKS_PARTIAL macro with support for a partial final block. ++# ;; It may look similar to INITIAL_BLOCKS but its usage is different: ++# ;; - first encrypts/decrypts required number of blocks and then ++# ;; ghashes these blocks ++# ;; - Small packets or left over data chunks (<256 bytes) ++# ;; - Remaining data chunks below 256 bytes (multi buffer code) ++# ;; ++# ;; num_initial_blocks is expected to include the partial final block ++# ;; in the count. 
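++# ;; For example, a 70 byte payload is passed in as 5 blocks: four full
++# ;; 16 byte blocks plus one 6 byte partial final block.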
++sub INITIAL_BLOCKS_PARTIAL { ++ my $AES_KEYS = $_[0]; # [in] key pointer ++ my $GCM128_CTX = $_[1]; # [in] context pointer ++ my $CIPH_PLAIN_OUT = $_[2]; # [in] text output pointer ++ my $PLAIN_CIPH_IN = $_[3]; # [in] text input pointer ++ my $LENGTH = $_[4]; # [in/clobbered] length in bytes ++ my $DATA_OFFSET = $_[5]; # [in/out] current data offset (updated) ++ my $NUM_BLOCKS = $_[6]; # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0) ++ my $CTR = $_[7]; # [in/out] current counter value ++ my $HASH_IN_OUT = $_[8]; # [in/out] XMM ghash in/out value ++ my $ENC_DEC = $_[9]; # [in] cipher direction (ENC/DEC) ++ my $CTR0 = $_[10]; # [clobbered] ZMM temporary ++ my $CTR1 = $_[11]; # [clobbered] ZMM temporary ++ my $CTR2 = $_[12]; # [clobbered] ZMM temporary ++ my $CTR3 = $_[13]; # [clobbered] ZMM temporary ++ my $DAT0 = $_[14]; # [clobbered] ZMM temporary ++ my $DAT1 = $_[15]; # [clobbered] ZMM temporary ++ my $DAT2 = $_[16]; # [clobbered] ZMM temporary ++ my $DAT3 = $_[17]; # [clobbered] ZMM temporary ++ my $LAST_CIPHER_BLK = $_[18]; # [clobbered] ZMM temporary ++ my $LAST_GHASH_BLK = $_[19]; # [clobbered] ZMM temporary ++ my $ZT0 = $_[20]; # [clobbered] ZMM temporary ++ my $ZT1 = $_[21]; # [clobbered] ZMM temporary ++ my $ZT2 = $_[22]; # [clobbered] ZMM temporary ++ my $ZT3 = $_[23]; # [clobbered] ZMM temporary ++ my $ZT4 = $_[24]; # [clobbered] ZMM temporary ++ my $IA0 = $_[25]; # [clobbered] GP temporary ++ my $IA1 = $_[26]; # [clobbered] GP temporary ++ my $MASKREG = $_[27]; # [clobbered] mask register ++ my $SHUFMASK = $_[28]; # [clobbered] ZMM for BE/LE shuffle mask ++ my $PBLOCK_LEN = $_[29]; # [in] partial block length ++ ++ &INITIAL_BLOCKS_PARTIAL_CIPHER( ++ $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, ++ $LENGTH, $DATA_OFFSET, $NUM_BLOCKS, $CTR, ++ $ENC_DEC, $DAT0, $DAT1, $DAT2, ++ $DAT3, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK), $CTR0, ++ $CTR1, $CTR2, $CTR3, $ZT0, ++ $IA0, $IA1, $MASKREG, $SHUFMASK); ++ ++ &INITIAL_BLOCKS_PARTIAL_GHASH($AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS, $HASH_IN_OUT, $ENC_DEC, $DAT0, ++ $DAT1, $DAT2, $DAT3, &XWORD($LAST_CIPHER_BLK), ++ &XWORD($LAST_GHASH_BLK), $CTR0, $CTR1, $CTR2, $CTR3, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, $PBLOCK_LEN); ++} ++ ++# ;; =========================================================================== ++# ;; Stitched GHASH of 16 blocks (with reduction) with encryption of N blocks ++# ;; followed with GHASH of the N blocks. 
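++# ;;
++# ;; Blocks are packed four per ZMM register. The helper below is an
++# ;; illustrative sketch only (it is not referenced by the generator); it
++# ;; shows the arithmetic behind the vextracti32x4 register/lane selections
++# ;; used for the last of the N blocks throughout this routine.
++sub last_block_position_sketch {
++  my ($num_blocks) = @_;                  # assumed to be 1..16
++  my $reg  = int(($num_blocks - 1) / 4);  # 0..3: first to fourth block register
++  my $lane = ($num_blocks - 1) % 4;       # 128-bit lane index for vextracti32x4
++  return ($reg, $lane);
++}
++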
++sub GHASH_16_ENCRYPT_N_GHASH_N { ++ my $AES_KEYS = $_[0]; # [in] key pointer ++ my $GCM128_CTX = $_[1]; # [in] context pointer ++ my $CIPH_PLAIN_OUT = $_[2]; # [in] pointer to output buffer ++ my $PLAIN_CIPH_IN = $_[3]; # [in] pointer to input buffer ++ my $DATA_OFFSET = $_[4]; # [in] data offset ++ my $LENGTH = $_[5]; # [in] data length ++ my $CTR_BE = $_[6]; # [in/out] ZMM counter blocks (last 4) in big-endian ++ my $CTR_CHECK = $_[7]; # [in/out] GP with 8-bit counter for overflow check ++ my $HASHKEY_OFFSET = $_[8]; # [in] numerical offset for the highest hash key ++ # (can be in form of register or numerical value) ++ my $GHASHIN_BLK_OFFSET = $_[9]; # [in] numerical offset for GHASH blocks in ++ my $SHFMSK = $_[10]; # [in] ZMM with byte swap mask for pshufb ++ my $B00_03 = $_[11]; # [clobbered] temporary ZMM ++ my $B04_07 = $_[12]; # [clobbered] temporary ZMM ++ my $B08_11 = $_[13]; # [clobbered] temporary ZMM ++ my $B12_15 = $_[14]; # [clobbered] temporary ZMM ++ my $GH1H_UNUSED = $_[15]; # [clobbered] temporary ZMM ++ my $GH1L = $_[16]; # [clobbered] temporary ZMM ++ my $GH1M = $_[17]; # [clobbered] temporary ZMM ++ my $GH1T = $_[18]; # [clobbered] temporary ZMM ++ my $GH2H = $_[19]; # [clobbered] temporary ZMM ++ my $GH2L = $_[20]; # [clobbered] temporary ZMM ++ my $GH2M = $_[21]; # [clobbered] temporary ZMM ++ my $GH2T = $_[22]; # [clobbered] temporary ZMM ++ my $GH3H = $_[23]; # [clobbered] temporary ZMM ++ my $GH3L = $_[24]; # [clobbered] temporary ZMM ++ my $GH3M = $_[25]; # [clobbered] temporary ZMM ++ my $GH3T = $_[26]; # [clobbered] temporary ZMM ++ my $AESKEY1 = $_[27]; # [clobbered] temporary ZMM ++ my $AESKEY2 = $_[28]; # [clobbered] temporary ZMM ++ my $GHKEY1 = $_[29]; # [clobbered] temporary ZMM ++ my $GHKEY2 = $_[30]; # [clobbered] temporary ZMM ++ my $GHDAT1 = $_[31]; # [clobbered] temporary ZMM ++ my $GHDAT2 = $_[32]; # [clobbered] temporary ZMM ++ my $ZT01 = $_[33]; # [clobbered] temporary ZMM ++ my $ADDBE_4x4 = $_[34]; # [in] ZMM with 4x128bits 4 in big-endian ++ my $ADDBE_1234 = $_[35]; # [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian ++ my $GHASH_TYPE = $_[36]; # [in] "start", "start_reduce", "mid", "end_reduce" ++ my $TO_REDUCE_L = $_[37]; # [in] ZMM for low 4x128-bit GHASH sum ++ my $TO_REDUCE_H = $_[38]; # [in] ZMM for hi 4x128-bit GHASH sum ++ my $TO_REDUCE_M = $_[39]; # [in] ZMM for medium 4x128-bit GHASH sum ++ my $ENC_DEC = $_[40]; # [in] cipher direction ++ my $HASH_IN_OUT = $_[41]; # [in/out] XMM ghash in/out value ++ my $IA0 = $_[42]; # [clobbered] GP temporary ++ my $IA1 = $_[43]; # [clobbered] GP temporary ++ my $MASKREG = $_[44]; # [clobbered] mask register ++ my $NUM_BLOCKS = $_[45]; # [in] numerical value with number of blocks to be encrypted/ghashed (1 to 16) ++ my $PBLOCK_LEN = $_[46]; # [in] partial block length ++ ++ die "GHASH_16_ENCRYPT_N_GHASH_N: num_blocks is out of bounds = $NUM_BLOCKS\n" ++ if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0); ++ ++ my $rndsuffix = &random_string(); ++ ++ my $GH1H = $HASH_IN_OUT; ++ ++ # ; this is to avoid additional move in do_reduction case ++ ++ my $LAST_GHASH_BLK = $GH1L; ++ my $LAST_CIPHER_BLK = $GH1T; ++ ++ my $RED_POLY = $GH2T; ++ my $RED_P1 = $GH2L; ++ my $RED_T1 = $GH2H; ++ my $RED_T2 = $GH2M; ++ ++ my $DATA1 = $GH3H; ++ my $DATA2 = $GH3L; ++ my $DATA3 = $GH3M; ++ my $DATA4 = $GH3T; ++ ++ # ;; do reduction after the 16 blocks ? ++ my $do_reduction = 0; ++ ++ # ;; is 16 block chunk a start? 
++ my $is_start = 0; ++ ++ if ($GHASH_TYPE eq "start_reduce") { ++ $is_start = 1; ++ $do_reduction = 1; ++ } ++ ++ if ($GHASH_TYPE eq "start") { ++ $is_start = 1; ++ } ++ ++ if ($GHASH_TYPE eq "end_reduce") { ++ $do_reduction = 1; ++ } ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; - get load/store mask ++ # ;; - load plain/cipher text ++ # ;; get load/store mask ++ $code .= <<___; ++ lea byte64_len_to_mask_table(%rip),$IA0 ++ mov $LENGTH,$IA1 ++___ ++ if ($NUM_BLOCKS > 12) { ++ $code .= "sub \$`3*64`,$IA1\n"; ++ } elsif ($NUM_BLOCKS > 8) { ++ $code .= "sub \$`2*64`,$IA1\n"; ++ } elsif ($NUM_BLOCKS > 4) { ++ $code .= "sub \$`1*64`,$IA1\n"; ++ } ++ $code .= "kmovq ($IA0,$IA1,8),$MASKREG\n"; ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; prepare counter blocks ++ ++ $code .= <<___; ++ cmp \$`(256 - $NUM_BLOCKS)`,@{[DWORD($CTR_CHECK)]} ++ jae .L_16_blocks_overflow_${rndsuffix} ++___ ++ ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUM_BLOCKS, "vpaddd", $B00_03, $B04_07, $B08_11, $B12_15, $CTR_BE, ++ $B00_03, $B04_07, $B08_11, $ADDBE_1234, $ADDBE_4x4, $ADDBE_4x4, $ADDBE_4x4); ++ $code .= <<___; ++ jmp .L_16_blocks_ok_${rndsuffix} ++ ++.L_16_blocks_overflow_${rndsuffix}: ++ vpshufb $SHFMSK,$CTR_BE,$CTR_BE ++ vpaddd ddq_add_1234(%rip),$CTR_BE,$B00_03 ++___ ++ if ($NUM_BLOCKS > 4) { ++ $code .= <<___; ++ vmovdqa64 ddq_add_4444(%rip),$B12_15 ++ vpaddd $B12_15,$B00_03,$B04_07 ++___ ++ } ++ if ($NUM_BLOCKS > 8) { ++ $code .= "vpaddd $B12_15,$B04_07,$B08_11\n"; ++ } ++ if ($NUM_BLOCKS > 12) { ++ $code .= "vpaddd $B12_15,$B08_11,$B12_15\n"; ++ } ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUM_BLOCKS, "vpshufb", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, ++ $B04_07, $B08_11, $B12_15, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK); ++ $code .= <<___; ++.L_16_blocks_ok_${rndsuffix}: ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; - pre-load constants ++ # ;; - add current hash into the 1st block ++ vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$AESKEY1 ++___ ++ if ($is_start != 0) { ++ $code .= "vpxorq `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$HASH_IN_OUT,$GHDAT1\n"; ++ } else { ++ $code .= "vmovdqa64 `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHDAT1\n"; ++ } ++ ++ $code .= "vmovdqu64 @{[EffectiveAddress(\"%rsp\",$HASHKEY_OFFSET,0*64)]},$GHKEY1\n"; ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; save counter for the next round ++ # ;; increment counter overflow check register ++ if ($NUM_BLOCKS <= 4) { ++ $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$B00_03,@{[XWORD($CTR_BE)]}\n"; ++ } elsif ($NUM_BLOCKS <= 8) { ++ $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$B04_07,@{[XWORD($CTR_BE)]}\n"; ++ } elsif ($NUM_BLOCKS <= 12) { ++ $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$B08_11,@{[XWORD($CTR_BE)]}\n"; ++ } else { ++ $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$B12_15,@{[XWORD($CTR_BE)]}\n"; ++ } ++ $code .= "vshufi64x2 \$0b00000000,$CTR_BE,$CTR_BE,$CTR_BE\n"; ++ ++ $code .= <<___; ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; pre-load constants ++ vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$AESKEY2 ++ vmovdqu64 @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,1*64)]},$GHKEY2 ++ vmovdqa64 `$GHASHIN_BLK_OFFSET + (1*64)`(%rsp),$GHDAT2 ++___ ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; stitch AES rounds with GHASH ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; AES round 0 - ARK ++ ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUM_BLOCKS, "vpxorq", $B00_03, $B04_07, $B08_11, 
$B12_15, $B00_03, ++ $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1); ++ $code .= "vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$AESKEY1\n"; ++ ++ $code .= <<___; ++ # ;;================================================== ++ # ;; GHASH 4 blocks (15 to 12) ++ vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH1H # ; a1*b1 ++ vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH1L # ; a0*b0 ++ vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH1M # ; a1*b0 ++ vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH1T # ; a0*b1 ++ vmovdqu64 @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,2*64)]},$GHKEY1 ++ vmovdqa64 `$GHASHIN_BLK_OFFSET + (2*64)`(%rsp),$GHDAT1 ++___ ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; AES round 1 ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, ++ $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2); ++ $code .= "vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$AESKEY2\n"; ++ ++ $code .= <<___; ++ # ;; ================================================= ++ # ;; GHASH 4 blocks (11 to 8) ++ vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1 ++ vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0 ++ vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1 ++ vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0 ++ vmovdqu64 @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,3*64)]},$GHKEY2 ++ vmovdqa64 `$GHASHIN_BLK_OFFSET + (3*64)`(%rsp),$GHDAT2 ++___ ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; AES round 2 ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, ++ $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1); ++ $code .= "vbroadcastf64x2 `(16 * 4)`($AES_KEYS),$AESKEY1\n"; ++ ++ $code .= <<___; ++ # ;; ================================================= ++ # ;; GHASH 4 blocks (7 to 4) ++ vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH3M # ; a0*b1 ++ vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH3T # ; a1*b0 ++ vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH3H # ; a1*b1 ++ vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH3L # ; a0*b0 ++___ ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; AES rounds 3 ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, ++ $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2); ++ $code .= "vbroadcastf64x2 `(16 * 5)`($AES_KEYS),$AESKEY2\n"; ++ ++ $code .= <<___; ++ # ;; ================================================= ++ # ;; Gather (XOR) GHASH for 12 blocks ++ vpternlogq \$0x96,$GH3H,$GH2H,$GH1H ++ vpternlogq \$0x96,$GH3L,$GH2L,$GH1L ++ vpternlogq \$0x96,$GH3T,$GH2T,$GH1T ++ vpternlogq \$0x96,$GH3M,$GH2M,$GH1M ++___ ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; AES rounds 4 ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, ++ $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1); ++ $code .= "vbroadcastf64x2 `(16 * 6)`($AES_KEYS),$AESKEY1\n"; ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; load plain/cipher text ++ &ZMM_LOAD_MASKED_BLOCKS_0_16($NUM_BLOCKS, $PLAIN_CIPH_IN, $DATA_OFFSET, $DATA1, $DATA2, $DATA3, $DATA4, $MASKREG); ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; AES rounds 5 ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, ++ $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2); ++ $code .= "vbroadcastf64x2 `(16 * 
7)`($AES_KEYS),$AESKEY2\n"; ++ ++ $code .= <<___; ++ # ;; ================================================= ++ # ;; GHASH 4 blocks (3 to 0) ++ vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1 ++ vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0 ++ vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1 ++ vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0 ++___ ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; AES round 6 ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, ++ $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1); ++ $code .= "vbroadcastf64x2 `(16 * 8)`($AES_KEYS),$AESKEY1\n"; ++ ++ # ;; ================================================= ++ # ;; gather GHASH in GH1L (low), GH1H (high), GH1M (mid) ++ # ;; - add GH2[MTLH] to GH1[MTLH] ++ $code .= "vpternlogq \$0x96,$GH2T,$GH1T,$GH1M\n"; ++ if ($do_reduction != 0) { ++ ++ if ($is_start != 0) { ++ $code .= "vpxorq $GH2M,$GH1M,$GH1M\n"; ++ } else { ++ $code .= <<___; ++ vpternlogq \$0x96,$GH2H,$TO_REDUCE_H,$GH1H ++ vpternlogq \$0x96,$GH2L,$TO_REDUCE_L,$GH1L ++ vpternlogq \$0x96,$GH2M,$TO_REDUCE_M,$GH1M ++___ ++ } ++ ++ } else { ++ ++ # ;; Update H/M/L hash sums if not carrying reduction ++ if ($is_start != 0) { ++ $code .= <<___; ++ vpxorq $GH2H,$GH1H,$TO_REDUCE_H ++ vpxorq $GH2L,$GH1L,$TO_REDUCE_L ++ vpxorq $GH2M,$GH1M,$TO_REDUCE_M ++___ ++ } else { ++ $code .= <<___; ++ vpternlogq \$0x96,$GH2H,$GH1H,$TO_REDUCE_H ++ vpternlogq \$0x96,$GH2L,$GH1L,$TO_REDUCE_L ++ vpternlogq \$0x96,$GH2M,$GH1M,$TO_REDUCE_M ++___ ++ } ++ ++ } ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; AES round 7 ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, ++ $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2); ++ $code .= "vbroadcastf64x2 `(16 * 9)`($AES_KEYS),$AESKEY2\n"; ++ ++ # ;; ================================================= ++ # ;; prepare mid sum for adding to high & low ++ # ;; load polynomial constant for reduction ++ if ($do_reduction != 0) { ++ $code .= <<___; ++ vpsrldq \$8,$GH1M,$GH2M ++ vpslldq \$8,$GH1M,$GH1M ++ ++ vmovdqa64 POLY2(%rip),@{[XWORD($RED_POLY)]} ++___ ++ } ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; AES round 8 ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, ++ $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1); ++ $code .= "vbroadcastf64x2 `(16 * 10)`($AES_KEYS),$AESKEY1\n"; ++ ++ # ;; ================================================= ++ # ;; Add mid product to high and low ++ if ($do_reduction != 0) { ++ if ($is_start != 0) { ++ $code .= <<___; ++ vpternlogq \$0x96,$GH2M,$GH2H,$GH1H # ; TH = TH1 + TH2 + TM>>64 ++ vpternlogq \$0x96,$GH1M,$GH2L,$GH1L # ; TL = TL1 + TL2 + TM<<64 ++___ ++ } else { ++ $code .= <<___; ++ vpxorq $GH2M,$GH1H,$GH1H # ; TH = TH1 + TM>>64 ++ vpxorq $GH1M,$GH1L,$GH1L # ; TL = TL1 + TM<<64 ++___ ++ } ++ } ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; AES round 9 ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, ++ $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2); ++ ++ # ;; ================================================= ++ # ;; horizontal xor of low and high 4x128 ++ if ($do_reduction != 0) { ++ &VHPXORI4x128($GH1H, $GH2H); ++ &VHPXORI4x128($GH1L, $GH2L); ++ } ++ ++ if (($NROUNDS >= 11)) { 
++ $code .= "vbroadcastf64x2 `(16 * 11)`($AES_KEYS),$AESKEY2\n"; ++ } ++ ++ # ;; ================================================= ++ # ;; first phase of reduction ++ if ($do_reduction != 0) { ++ $code .= <<___; ++ vpclmulqdq \$0x01,@{[XWORD($GH1L)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_P1)]} ++ vpslldq \$8,@{[XWORD($RED_P1)]},@{[XWORD($RED_P1)]} # ; shift-L 2 DWs ++ vpxorq @{[XWORD($RED_P1)]},@{[XWORD($GH1L)]},@{[XWORD($RED_P1)]} # ; first phase of the reduct ++___ ++ } ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; AES rounds up to 11 (AES192) or 13 (AES256) ++ # ;; AES128 is done ++ if (($NROUNDS >= 11)) { ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, ++ $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1); ++ $code .= "vbroadcastf64x2 `(16 * 12)`($AES_KEYS),$AESKEY1\n"; ++ ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, ++ $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2); ++ if (($NROUNDS == 13)) { ++ $code .= "vbroadcastf64x2 `(16 * 13)`($AES_KEYS),$AESKEY2\n"; ++ ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, ++ $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1); ++ $code .= "vbroadcastf64x2 `(16 * 14)`($AES_KEYS),$AESKEY1\n"; ++ ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, ++ $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2); ++ } ++ } ++ ++ # ;; ================================================= ++ # ;; second phase of the reduction ++ if ($do_reduction != 0) { ++ $code .= <<___; ++ vpclmulqdq \$0x00,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T1)]} ++ vpsrldq \$4,@{[XWORD($RED_T1)]},@{[XWORD($RED_T1)]} # ; shift-R 1-DW to obtain 2-DWs shift-R ++ vpclmulqdq \$0x10,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T2)]} ++ vpslldq \$4,@{[XWORD($RED_T2)]},@{[XWORD($RED_T2)]} # ; shift-L 1-DW for result without shifts ++ # ;; GH1H = GH1H + RED_T1 + RED_T2 ++ vpternlogq \$0x96,@{[XWORD($RED_T1)]},@{[XWORD($RED_T2)]},@{[XWORD($GH1H)]} ++___ ++ } ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; the last AES round ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUM_BLOCKS, "vaesenclast", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, ++ $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1); ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; XOR against plain/cipher text ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUM_BLOCKS, "vpxorq", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, ++ $B04_07, $B08_11, $B12_15, $DATA1, $DATA2, $DATA3, $DATA4); ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; retrieve the last cipher counter block (partially XOR'ed with text) ++ # ;; - this is needed for partial block cases ++ if ($NUM_BLOCKS <= 4) { ++ $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$B00_03,@{[XWORD($LAST_CIPHER_BLK)]}\n"; ++ } elsif ($NUM_BLOCKS <= 8) { ++ $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$B04_07,@{[XWORD($LAST_CIPHER_BLK)]}\n"; ++ } elsif ($NUM_BLOCKS <= 12) { ++ $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$B08_11,@{[XWORD($LAST_CIPHER_BLK)]}\n"; ++ } else { ++ $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$B12_15,@{[XWORD($LAST_CIPHER_BLK)]}\n"; ++ } ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; 
store cipher/plain text ++ $code .= "mov $CIPH_PLAIN_OUT,$IA0\n"; ++ &ZMM_STORE_MASKED_BLOCKS_0_16($NUM_BLOCKS, $IA0, $DATA_OFFSET, $B00_03, $B04_07, $B08_11, $B12_15, $MASKREG); ++ ++ # ;; ================================================= ++ # ;; shuffle cipher text blocks for GHASH computation ++ if ($ENC_DEC eq "ENC") { ++ ++ # ;; zero bytes outside the mask before hashing ++ if ($NUM_BLOCKS <= 4) { ++ $code .= "vmovdqu8 $B00_03,${B00_03}{$MASKREG}{z}\n"; ++ } elsif ($NUM_BLOCKS <= 8) { ++ $code .= "vmovdqu8 $B04_07,${B04_07}{$MASKREG}{z}\n"; ++ } elsif ($NUM_BLOCKS <= 12) { ++ $code .= "vmovdqu8 $B08_11,${B08_11}{$MASKREG}{z}\n"; ++ } else { ++ $code .= "vmovdqu8 $B12_15,${B12_15}{$MASKREG}{z}\n"; ++ } ++ ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUM_BLOCKS, "vpshufb", $DATA1, $DATA2, $DATA3, $DATA4, $B00_03, ++ $B04_07, $B08_11, $B12_15, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK); ++ } else { ++ ++ # ;; zero bytes outside the mask before hashing ++ if ($NUM_BLOCKS <= 4) { ++ $code .= "vmovdqu8 $DATA1,${DATA1}{$MASKREG}{z}\n"; ++ } elsif ($NUM_BLOCKS <= 8) { ++ $code .= "vmovdqu8 $DATA2,${DATA2}{$MASKREG}{z}\n"; ++ } elsif ($NUM_BLOCKS <= 12) { ++ $code .= "vmovdqu8 $DATA3,${DATA3}{$MASKREG}{z}\n"; ++ } else { ++ $code .= "vmovdqu8 $DATA4,${DATA4}{$MASKREG}{z}\n"; ++ } ++ ++ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( ++ $NUM_BLOCKS, "vpshufb", $DATA1, $DATA2, $DATA3, $DATA4, $DATA1, ++ $DATA2, $DATA3, $DATA4, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK); ++ } ++ ++ # ;; ================================================= ++ # ;; Extract the last block for partial / multi_call cases ++ if ($NUM_BLOCKS <= 4) { ++ $code .= "vextracti32x4 \$`($NUM_BLOCKS-1)`,$DATA1,@{[XWORD($LAST_GHASH_BLK)]}\n"; ++ } elsif ($NUM_BLOCKS <= 8) { ++ $code .= "vextracti32x4 \$`($NUM_BLOCKS-5)`,$DATA2,@{[XWORD($LAST_GHASH_BLK)]}\n"; ++ } elsif ($NUM_BLOCKS <= 12) { ++ $code .= "vextracti32x4 \$`($NUM_BLOCKS-9)`,$DATA3,@{[XWORD($LAST_GHASH_BLK)]}\n"; ++ } else { ++ $code .= "vextracti32x4 \$`($NUM_BLOCKS-13)`,$DATA4,@{[XWORD($LAST_GHASH_BLK)]}\n"; ++ } ++ ++ if ($do_reduction != 0) { ++ ++ # ;; GH1H holds reduced hash value ++ # ;; - normally do "vmovdqa64 &XWORD($GH1H), &XWORD($HASH_IN_OUT)" ++ # ;; - register rename trick obsoletes the above move ++ } ++ ++ # ;; ================================================= ++ # ;; GHASH last N blocks ++ # ;; - current hash value in HASH_IN_OUT or ++ # ;; product parts in TO_REDUCE_H/M/L ++ # ;; - DATA1-DATA4 include blocks for GHASH ++ ++ if ($do_reduction == 0) { ++ &INITIAL_BLOCKS_PARTIAL_GHASH( ++ $AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS, ++ &XWORD($HASH_IN_OUT), $ENC_DEC, $DATA1, $DATA2, ++ $DATA3, $DATA4, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK), ++ $B00_03, $B04_07, $B08_11, $B12_15, ++ $GHDAT1, $GHDAT2, $AESKEY1, $AESKEY2, ++ $GHKEY1, $PBLOCK_LEN, $TO_REDUCE_H, $TO_REDUCE_M, ++ $TO_REDUCE_L); ++ } else { ++ &INITIAL_BLOCKS_PARTIAL_GHASH( ++ $AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS, ++ &XWORD($HASH_IN_OUT), $ENC_DEC, $DATA1, $DATA2, ++ $DATA3, $DATA4, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK), ++ $B00_03, $B04_07, $B08_11, $B12_15, ++ $GHDAT1, $GHDAT2, $AESKEY1, $AESKEY2, ++ $GHKEY1, $PBLOCK_LEN); ++ } ++} ++ ++# ;; =========================================================================== ++# ;; =========================================================================== ++# ;; Stitched GHASH of 16 blocks (with reduction) with encryption of N blocks ++# ;; followed with GHASH of the N blocks. 
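++# ;;
++# ;; The dispatch below selects one of 16 code paths from the number of
++# ;; remaining blocks. The helper below is an illustrative sketch only (it is
++# ;; not referenced by the generator) of the length-to-block-count arithmetic
++# ;; done by the 'add 15 / shr 4' sequence at the start of the routine.
++sub remaining_blocks_sketch {
++  my ($length) = @_;             # remaining bytes, at most 16 blocks worth
++  return ($length + 15) >> 4;    # 0 means: only GHASH the buffered blocks and reduce
++}
++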
++sub GCM_ENC_DEC_LAST { ++ my $AES_KEYS = $_[0]; # [in] key pointer ++ my $GCM128_CTX = $_[1]; # [in] context pointer ++ my $CIPH_PLAIN_OUT = $_[2]; # [in] pointer to output buffer ++ my $PLAIN_CIPH_IN = $_[3]; # [in] pointer to input buffer ++ my $DATA_OFFSET = $_[4]; # [in] data offset ++ my $LENGTH = $_[5]; # [in/clobbered] data length ++ my $CTR_BE = $_[6]; # [in/out] ZMM counter blocks (last 4) in big-endian ++ my $CTR_CHECK = $_[7]; # [in/out] GP with 8-bit counter for overflow check ++ my $HASHKEY_OFFSET = $_[8]; # [in] numerical offset for the highest hash key ++ # (can be register or numerical offset) ++ my $GHASHIN_BLK_OFFSET = $_[9]; # [in] numerical offset for GHASH blocks in ++ my $SHFMSK = $_[10]; # [in] ZMM with byte swap mask for pshufb ++ my $ZT00 = $_[11]; # [clobbered] temporary ZMM ++ my $ZT01 = $_[12]; # [clobbered] temporary ZMM ++ my $ZT02 = $_[13]; # [clobbered] temporary ZMM ++ my $ZT03 = $_[14]; # [clobbered] temporary ZMM ++ my $ZT04 = $_[15]; # [clobbered] temporary ZMM ++ my $ZT05 = $_[16]; # [clobbered] temporary ZMM ++ my $ZT06 = $_[17]; # [clobbered] temporary ZMM ++ my $ZT07 = $_[18]; # [clobbered] temporary ZMM ++ my $ZT08 = $_[19]; # [clobbered] temporary ZMM ++ my $ZT09 = $_[20]; # [clobbered] temporary ZMM ++ my $ZT10 = $_[21]; # [clobbered] temporary ZMM ++ my $ZT11 = $_[22]; # [clobbered] temporary ZMM ++ my $ZT12 = $_[23]; # [clobbered] temporary ZMM ++ my $ZT13 = $_[24]; # [clobbered] temporary ZMM ++ my $ZT14 = $_[25]; # [clobbered] temporary ZMM ++ my $ZT15 = $_[26]; # [clobbered] temporary ZMM ++ my $ZT16 = $_[27]; # [clobbered] temporary ZMM ++ my $ZT17 = $_[28]; # [clobbered] temporary ZMM ++ my $ZT18 = $_[29]; # [clobbered] temporary ZMM ++ my $ZT19 = $_[30]; # [clobbered] temporary ZMM ++ my $ZT20 = $_[31]; # [clobbered] temporary ZMM ++ my $ZT21 = $_[32]; # [clobbered] temporary ZMM ++ my $ZT22 = $_[33]; # [clobbered] temporary ZMM ++ my $ADDBE_4x4 = $_[34]; # [in] ZMM with 4x128bits 4 in big-endian ++ my $ADDBE_1234 = $_[35]; # [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian ++ my $GHASH_TYPE = $_[36]; # [in] "start", "start_reduce", "mid", "end_reduce" ++ my $TO_REDUCE_L = $_[37]; # [in] ZMM for low 4x128-bit GHASH sum ++ my $TO_REDUCE_H = $_[38]; # [in] ZMM for hi 4x128-bit GHASH sum ++ my $TO_REDUCE_M = $_[39]; # [in] ZMM for medium 4x128-bit GHASH sum ++ my $ENC_DEC = $_[40]; # [in] cipher direction ++ my $HASH_IN_OUT = $_[41]; # [in/out] XMM ghash in/out value ++ my $IA0 = $_[42]; # [clobbered] GP temporary ++ my $IA1 = $_[43]; # [clobbered] GP temporary ++ my $MASKREG = $_[44]; # [clobbered] mask register ++ my $PBLOCK_LEN = $_[45]; # [in] partial block length ++ ++ my $rndsuffix = &random_string(); ++ ++ $code .= <<___; ++ mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]} ++ add \$15,@{[DWORD($IA0)]} ++ shr \$4,@{[DWORD($IA0)]} ++ je .L_last_num_blocks_is_0_${rndsuffix} ++ ++ cmp \$8,@{[DWORD($IA0)]} ++ je .L_last_num_blocks_is_8_${rndsuffix} ++ jb .L_last_num_blocks_is_7_1_${rndsuffix} ++ ++ ++ cmp \$12,@{[DWORD($IA0)]} ++ je .L_last_num_blocks_is_12_${rndsuffix} ++ jb .L_last_num_blocks_is_11_9_${rndsuffix} ++ ++ # ;; 16, 15, 14 or 13 ++ cmp \$15,@{[DWORD($IA0)]} ++ je .L_last_num_blocks_is_15_${rndsuffix} ++ ja .L_last_num_blocks_is_16_${rndsuffix} ++ cmp \$14,@{[DWORD($IA0)]} ++ je .L_last_num_blocks_is_14_${rndsuffix} ++ jmp .L_last_num_blocks_is_13_${rndsuffix} ++ ++.L_last_num_blocks_is_11_9_${rndsuffix}: ++ # ;; 11, 10 or 9 ++ cmp \$10,@{[DWORD($IA0)]} ++ je .L_last_num_blocks_is_10_${rndsuffix} ++ ja 
.L_last_num_blocks_is_11_${rndsuffix} ++ jmp .L_last_num_blocks_is_9_${rndsuffix} ++ ++.L_last_num_blocks_is_7_1_${rndsuffix}: ++ cmp \$4,@{[DWORD($IA0)]} ++ je .L_last_num_blocks_is_4_${rndsuffix} ++ jb .L_last_num_blocks_is_3_1_${rndsuffix} ++ # ;; 7, 6 or 5 ++ cmp \$6,@{[DWORD($IA0)]} ++ ja .L_last_num_blocks_is_7_${rndsuffix} ++ je .L_last_num_blocks_is_6_${rndsuffix} ++ jmp .L_last_num_blocks_is_5_${rndsuffix} ++ ++.L_last_num_blocks_is_3_1_${rndsuffix}: ++ # ;; 3, 2 or 1 ++ cmp \$2,@{[DWORD($IA0)]} ++ ja .L_last_num_blocks_is_3_${rndsuffix} ++ je .L_last_num_blocks_is_2_${rndsuffix} ++___ ++ ++ # ;; fall through for `jmp .L_last_num_blocks_is_1` ++ ++ # ;; Use rep to generate different block size variants ++ # ;; - one block size has to be the first one ++ for my $num_blocks (1 .. 16) { ++ $code .= ".L_last_num_blocks_is_${num_blocks}_${rndsuffix}:\n"; ++ &GHASH_16_ENCRYPT_N_GHASH_N( ++ $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, ++ $LENGTH, $CTR_BE, $CTR_CHECK, $HASHKEY_OFFSET, $GHASHIN_BLK_OFFSET, ++ $SHFMSK, $ZT00, $ZT01, $ZT02, $ZT03, ++ $ZT04, $ZT05, $ZT06, $ZT07, $ZT08, ++ $ZT09, $ZT10, $ZT11, $ZT12, $ZT13, ++ $ZT14, $ZT15, $ZT16, $ZT17, $ZT18, ++ $ZT19, $ZT20, $ZT21, $ZT22, $ADDBE_4x4, ++ $ADDBE_1234, $GHASH_TYPE, $TO_REDUCE_L, $TO_REDUCE_H, $TO_REDUCE_M, ++ $ENC_DEC, $HASH_IN_OUT, $IA0, $IA1, $MASKREG, ++ $num_blocks, $PBLOCK_LEN); ++ ++ $code .= "jmp .L_last_blocks_done_${rndsuffix}\n"; ++ } ++ ++ $code .= ".L_last_num_blocks_is_0_${rndsuffix}:\n"; ++ ++ # ;; if there is 0 blocks to cipher then there are only 16 blocks for ghash and reduction ++ # ;; - convert mid into end_reduce ++ # ;; - convert start into start_reduce ++ if ($GHASH_TYPE eq "mid") { ++ $GHASH_TYPE = "end_reduce"; ++ } ++ if ($GHASH_TYPE eq "start") { ++ $GHASH_TYPE = "start_reduce"; ++ } ++ ++ &GHASH_16($GHASH_TYPE, $TO_REDUCE_H, $TO_REDUCE_M, $TO_REDUCE_L, "%rsp", ++ $GHASHIN_BLK_OFFSET, 0, "%rsp", $HASHKEY_OFFSET, 0, $HASH_IN_OUT, $ZT00, $ZT01, ++ $ZT02, $ZT03, $ZT04, $ZT05, $ZT06, $ZT07, $ZT08, $ZT09); ++ ++ $code .= ".L_last_blocks_done_${rndsuffix}:\n"; ++} ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ;; Main GCM macro stitching cipher with GHASH ++# ;; - operates on single stream ++# ;; - encrypts 16 blocks at a time ++# ;; - ghash the 16 previously encrypted ciphertext blocks ++# ;; - no partial block or multi_call handling here ++sub GHASH_16_ENCRYPT_16_PARALLEL { ++ my $AES_KEYS = $_[0]; # [in] key pointer ++ my $CIPH_PLAIN_OUT = $_[1]; # [in] pointer to output buffer ++ my $PLAIN_CIPH_IN = $_[2]; # [in] pointer to input buffer ++ my $DATA_OFFSET = $_[3]; # [in] data offset ++ my $CTR_BE = $_[4]; # [in/out] ZMM counter blocks (last 4) in big-endian ++ my $CTR_CHECK = $_[5]; # [in/out] GP with 8-bit counter for overflow check ++ my $HASHKEY_OFFSET = $_[6]; # [in] numerical offset for the highest hash key (hash key index value) ++ my $AESOUT_BLK_OFFSET = $_[7]; # [in] numerical offset for AES-CTR out ++ my $GHASHIN_BLK_OFFSET = $_[8]; # [in] numerical offset for GHASH blocks in ++ my $SHFMSK = $_[9]; # [in] ZMM with byte swap mask for pshufb ++ my $ZT1 = $_[10]; # [clobbered] temporary ZMM (cipher) ++ my $ZT2 = $_[11]; # [clobbered] temporary ZMM (cipher) ++ my $ZT3 = $_[12]; # [clobbered] temporary ZMM (cipher) ++ my $ZT4 = $_[13]; # [clobbered] temporary ZMM (cipher) ++ my $ZT5 = $_[14]; # [clobbered/out] temporary ZMM or GHASH OUT (final_reduction) ++ my $ZT6 = $_[15]; # [clobbered] temporary ZMM (cipher) ++ my $ZT7 = $_[16]; 
# [clobbered] temporary ZMM (cipher) ++ my $ZT8 = $_[17]; # [clobbered] temporary ZMM (cipher) ++ my $ZT9 = $_[18]; # [clobbered] temporary ZMM (cipher) ++ my $ZT10 = $_[19]; # [clobbered] temporary ZMM (ghash) ++ my $ZT11 = $_[20]; # [clobbered] temporary ZMM (ghash) ++ my $ZT12 = $_[21]; # [clobbered] temporary ZMM (ghash) ++ my $ZT13 = $_[22]; # [clobbered] temporary ZMM (ghash) ++ my $ZT14 = $_[23]; # [clobbered] temporary ZMM (ghash) ++ my $ZT15 = $_[24]; # [clobbered] temporary ZMM (ghash) ++ my $ZT16 = $_[25]; # [clobbered] temporary ZMM (ghash) ++ my $ZT17 = $_[26]; # [clobbered] temporary ZMM (ghash) ++ my $ZT18 = $_[27]; # [clobbered] temporary ZMM (ghash) ++ my $ZT19 = $_[28]; # [clobbered] temporary ZMM ++ my $ZT20 = $_[29]; # [clobbered] temporary ZMM ++ my $ZT21 = $_[30]; # [clobbered] temporary ZMM ++ my $ZT22 = $_[31]; # [clobbered] temporary ZMM ++ my $ZT23 = $_[32]; # [clobbered] temporary ZMM ++ my $ADDBE_4x4 = $_[33]; # [in] ZMM with 4x128bits 4 in big-endian ++ my $ADDBE_1234 = $_[34]; # [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian ++ my $TO_REDUCE_L = $_[35]; # [in/out] ZMM for low 4x128-bit GHASH sum ++ my $TO_REDUCE_H = $_[36]; # [in/out] ZMM for hi 4x128-bit GHASH sum ++ my $TO_REDUCE_M = $_[37]; # [in/out] ZMM for medium 4x128-bit GHASH sum ++ my $DO_REDUCTION = $_[38]; # [in] "no_reduction", "final_reduction", "first_time" ++ my $ENC_DEC = $_[39]; # [in] cipher direction ++ my $DATA_DISPL = $_[40]; # [in] fixed numerical data displacement/offset ++ my $GHASH_IN = $_[41]; # [in] current GHASH value or "no_ghash_in" ++ my $IA0 = $_[42]; # [clobbered] temporary GPR ++ ++ my $B00_03 = $ZT1; ++ my $B04_07 = $ZT2; ++ my $B08_11 = $ZT3; ++ my $B12_15 = $ZT4; ++ ++ my $GH1H = $ZT5; ++ ++ # ; @note: do not change this mapping ++ my $GH1L = $ZT6; ++ my $GH1M = $ZT7; ++ my $GH1T = $ZT8; ++ ++ my $GH2H = $ZT9; ++ my $GH2L = $ZT10; ++ my $GH2M = $ZT11; ++ my $GH2T = $ZT12; ++ ++ my $RED_POLY = $GH2T; ++ my $RED_P1 = $GH2L; ++ my $RED_T1 = $GH2H; ++ my $RED_T2 = $GH2M; ++ ++ my $GH3H = $ZT13; ++ my $GH3L = $ZT14; ++ my $GH3M = $ZT15; ++ my $GH3T = $ZT16; ++ ++ my $DATA1 = $ZT13; ++ my $DATA2 = $ZT14; ++ my $DATA3 = $ZT15; ++ my $DATA4 = $ZT16; ++ ++ my $AESKEY1 = $ZT17; ++ my $AESKEY2 = $ZT18; ++ ++ my $GHKEY1 = $ZT19; ++ my $GHKEY2 = $ZT20; ++ my $GHDAT1 = $ZT21; ++ my $GHDAT2 = $ZT22; ++ ++ my $rndsuffix = &random_string(); ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; prepare counter blocks ++ ++ $code .= <<___; ++ cmpb \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]} ++ jae .L_16_blocks_overflow_${rndsuffix} ++ vpaddd $ADDBE_1234,$CTR_BE,$B00_03 ++ vpaddd $ADDBE_4x4,$B00_03,$B04_07 ++ vpaddd $ADDBE_4x4,$B04_07,$B08_11 ++ vpaddd $ADDBE_4x4,$B08_11,$B12_15 ++ jmp .L_16_blocks_ok_${rndsuffix} ++.L_16_blocks_overflow_${rndsuffix}: ++ vpshufb $SHFMSK,$CTR_BE,$CTR_BE ++ vmovdqa64 ddq_add_4444(%rip),$B12_15 ++ vpaddd ddq_add_1234(%rip),$CTR_BE,$B00_03 ++ vpaddd $B12_15,$B00_03,$B04_07 ++ vpaddd $B12_15,$B04_07,$B08_11 ++ vpaddd $B12_15,$B08_11,$B12_15 ++ vpshufb $SHFMSK,$B00_03,$B00_03 ++ vpshufb $SHFMSK,$B04_07,$B04_07 ++ vpshufb $SHFMSK,$B08_11,$B08_11 ++ vpshufb $SHFMSK,$B12_15,$B12_15 ++.L_16_blocks_ok_${rndsuffix}: ++___ ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; pre-load constants ++ $code .= "vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$AESKEY1\n"; ++ if ($GHASH_IN ne "no_ghash_in") { ++ $code .= "vpxorq `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHASH_IN,$GHDAT1\n"; ++ } else { ++ $code .= "vmovdqa64 `$GHASHIN_BLK_OFFSET + 
(0*64)`(%rsp),$GHDAT1\n"; ++ } ++ ++ $code .= <<___; ++ vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (0*4)),"%rsp")]},$GHKEY1 ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; save counter for the next round ++ # ;; increment counter overflow check register ++ vshufi64x2 \$0b11111111,$B12_15,$B12_15,$CTR_BE ++ addb \$16,@{[BYTE($CTR_CHECK)]} ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; pre-load constants ++ vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$AESKEY2 ++ vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (1*4)),"%rsp")]},$GHKEY2 ++ vmovdqa64 `$GHASHIN_BLK_OFFSET + (1*64)`(%rsp),$GHDAT2 ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; stitch AES rounds with GHASH ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; AES round 0 - ARK ++ ++ vpxorq $AESKEY1,$B00_03,$B00_03 ++ vpxorq $AESKEY1,$B04_07,$B04_07 ++ vpxorq $AESKEY1,$B08_11,$B08_11 ++ vpxorq $AESKEY1,$B12_15,$B12_15 ++ vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$AESKEY1 ++ ++ # ;;================================================== ++ # ;; GHASH 4 blocks (15 to 12) ++ vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH1H # ; a1*b1 ++ vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH1L # ; a0*b0 ++ vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH1M # ; a1*b0 ++ vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH1T # ; a0*b1 ++ vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (2*4)),"%rsp")]},$GHKEY1 ++ vmovdqa64 `$GHASHIN_BLK_OFFSET + (2*64)`(%rsp),$GHDAT1 ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; AES round 1 ++ vaesenc $AESKEY2,$B00_03,$B00_03 ++ vaesenc $AESKEY2,$B04_07,$B04_07 ++ vaesenc $AESKEY2,$B08_11,$B08_11 ++ vaesenc $AESKEY2,$B12_15,$B12_15 ++ vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$AESKEY2 ++ ++ # ;; ================================================= ++ # ;; GHASH 4 blocks (11 to 8) ++ vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1 ++ vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0 ++ vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1 ++ vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0 ++ vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (3*4)),"%rsp")]},$GHKEY2 ++ vmovdqa64 `$GHASHIN_BLK_OFFSET + (3*64)`(%rsp),$GHDAT2 ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; AES round 2 ++ vaesenc $AESKEY1,$B00_03,$B00_03 ++ vaesenc $AESKEY1,$B04_07,$B04_07 ++ vaesenc $AESKEY1,$B08_11,$B08_11 ++ vaesenc $AESKEY1,$B12_15,$B12_15 ++ vbroadcastf64x2 `(16 * 4)`($AES_KEYS),$AESKEY1 ++ ++ # ;; ================================================= ++ # ;; GHASH 4 blocks (7 to 4) ++ vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH3M # ; a0*b1 ++ vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH3T # ; a1*b0 ++ vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH3H # ; a1*b1 ++ vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH3L # ; a0*b0 ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; AES rounds 3 ++ vaesenc $AESKEY2,$B00_03,$B00_03 ++ vaesenc $AESKEY2,$B04_07,$B04_07 ++ vaesenc $AESKEY2,$B08_11,$B08_11 ++ vaesenc $AESKEY2,$B12_15,$B12_15 ++ vbroadcastf64x2 `(16 * 5)`($AES_KEYS),$AESKEY2 ++ ++ # ;; ================================================= ++ # ;; Gather (XOR) GHASH for 12 blocks ++ vpternlogq \$0x96,$GH3H,$GH2H,$GH1H ++ vpternlogq \$0x96,$GH3L,$GH2L,$GH1L ++ vpternlogq \$0x96,$GH3T,$GH2T,$GH1T ++ vpternlogq \$0x96,$GH3M,$GH2M,$GH1M ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; AES rounds 4 ++ vaesenc $AESKEY1,$B00_03,$B00_03 ++ vaesenc $AESKEY1,$B04_07,$B04_07 ++ vaesenc $AESKEY1,$B08_11,$B08_11 ++ vaesenc $AESKEY1,$B12_15,$B12_15 ++ vbroadcastf64x2 `(16 * 6)`($AES_KEYS),$AESKEY1 ++ ++ # 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; load plain/cipher text (recycle GH3xx registers) ++ vmovdqu8 `$DATA_DISPL + (0 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA1 ++ vmovdqu8 `$DATA_DISPL + (1 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA2 ++ vmovdqu8 `$DATA_DISPL + (2 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA3 ++ vmovdqu8 `$DATA_DISPL + (3 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA4 ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; AES rounds 5 ++ vaesenc $AESKEY2,$B00_03,$B00_03 ++ vaesenc $AESKEY2,$B04_07,$B04_07 ++ vaesenc $AESKEY2,$B08_11,$B08_11 ++ vaesenc $AESKEY2,$B12_15,$B12_15 ++ vbroadcastf64x2 `(16 * 7)`($AES_KEYS),$AESKEY2 ++ ++ # ;; ================================================= ++ # ;; GHASH 4 blocks (3 to 0) ++ vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1 ++ vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0 ++ vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1 ++ vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0 ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; AES round 6 ++ vaesenc $AESKEY1,$B00_03,$B00_03 ++ vaesenc $AESKEY1,$B04_07,$B04_07 ++ vaesenc $AESKEY1,$B08_11,$B08_11 ++ vaesenc $AESKEY1,$B12_15,$B12_15 ++ vbroadcastf64x2 `(16 * 8)`($AES_KEYS),$AESKEY1 ++___ ++ ++ # ;; ================================================= ++ # ;; gather GHASH in GH1L (low) and GH1H (high) ++ if ($DO_REDUCTION eq "first_time") { ++ $code .= <<___; ++ vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM ++ vpxorq $GH2M,$GH1M,$TO_REDUCE_M # ; TM ++ vpxorq $GH2H,$GH1H,$TO_REDUCE_H # ; TH ++ vpxorq $GH2L,$GH1L,$TO_REDUCE_L # ; TL ++___ ++ } ++ if ($DO_REDUCTION eq "no_reduction") { ++ $code .= <<___; ++ vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM ++ vpternlogq \$0x96,$GH2M,$GH1M,$TO_REDUCE_M # ; TM ++ vpternlogq \$0x96,$GH2H,$GH1H,$TO_REDUCE_H # ; TH ++ vpternlogq \$0x96,$GH2L,$GH1L,$TO_REDUCE_L # ; TL ++___ ++ } ++ if ($DO_REDUCTION eq "final_reduction") { ++ $code .= <<___; ++ # ;; phase 1: add mid products together ++ # ;; also load polynomial constant for reduction ++ vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM ++ vpternlogq \$0x96,$GH2M,$TO_REDUCE_M,$GH1M ++ ++ vpsrldq \$8,$GH1M,$GH2M ++ vpslldq \$8,$GH1M,$GH1M ++ ++ vmovdqa64 POLY2(%rip),@{[XWORD($RED_POLY)]} ++___ ++ } ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; AES round 7 ++ $code .= <<___; ++ vaesenc $AESKEY2,$B00_03,$B00_03 ++ vaesenc $AESKEY2,$B04_07,$B04_07 ++ vaesenc $AESKEY2,$B08_11,$B08_11 ++ vaesenc $AESKEY2,$B12_15,$B12_15 ++ vbroadcastf64x2 `(16 * 9)`($AES_KEYS),$AESKEY2 ++___ ++ ++ # ;; ================================================= ++ # ;; Add mid product to high and low ++ if ($DO_REDUCTION eq "final_reduction") { ++ $code .= <<___; ++ vpternlogq \$0x96,$GH2M,$GH2H,$GH1H # ; TH = TH1 + TH2 + TM>>64 ++ vpxorq $TO_REDUCE_H,$GH1H,$GH1H ++ vpternlogq \$0x96,$GH1M,$GH2L,$GH1L # ; TL = TL1 + TL2 + TM<<64 ++ vpxorq $TO_REDUCE_L,$GH1L,$GH1L ++___ ++ } ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; AES round 8 ++ $code .= <<___; ++ vaesenc $AESKEY1,$B00_03,$B00_03 ++ vaesenc $AESKEY1,$B04_07,$B04_07 ++ vaesenc $AESKEY1,$B08_11,$B08_11 ++ vaesenc $AESKEY1,$B12_15,$B12_15 ++ vbroadcastf64x2 `(16 * 10)`($AES_KEYS),$AESKEY1 ++___ ++ ++ # ;; ================================================= ++ # ;; horizontal xor of low and high 4x128 ++ if ($DO_REDUCTION eq "final_reduction") { ++ &VHPXORI4x128($GH1H, $GH2H); ++ &VHPXORI4x128($GH1L, $GH2L); ++ } ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; AES round 9 ++ $code .= 
<<___; ++ vaesenc $AESKEY2,$B00_03,$B00_03 ++ vaesenc $AESKEY2,$B04_07,$B04_07 ++ vaesenc $AESKEY2,$B08_11,$B08_11 ++ vaesenc $AESKEY2,$B12_15,$B12_15 ++___ ++ if (($NROUNDS >= 11)) { ++ $code .= "vbroadcastf64x2 `(16 * 11)`($AES_KEYS),$AESKEY2\n"; ++ } ++ ++ # ;; ================================================= ++ # ;; first phase of reduction ++ if ($DO_REDUCTION eq "final_reduction") { ++ $code .= <<___; ++ vpclmulqdq \$0x01,@{[XWORD($GH1L)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_P1)]} ++ vpslldq \$8,@{[XWORD($RED_P1)]},@{[XWORD($RED_P1)]} # ; shift-L 2 DWs ++ vpxorq @{[XWORD($RED_P1)]},@{[XWORD($GH1L)]},@{[XWORD($RED_P1)]} # ; first phase of the reduct ++___ ++ } ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; AES rounds up to 11 (AES192) or 13 (AES256) ++ # ;; AES128 is done ++ if (($NROUNDS >= 11)) { ++ $code .= <<___; ++ vaesenc $AESKEY1,$B00_03,$B00_03 ++ vaesenc $AESKEY1,$B04_07,$B04_07 ++ vaesenc $AESKEY1,$B08_11,$B08_11 ++ vaesenc $AESKEY1,$B12_15,$B12_15 ++ vbroadcastf64x2 `(16 * 12)`($AES_KEYS),$AESKEY1 ++ ++ vaesenc $AESKEY2,$B00_03,$B00_03 ++ vaesenc $AESKEY2,$B04_07,$B04_07 ++ vaesenc $AESKEY2,$B08_11,$B08_11 ++ vaesenc $AESKEY2,$B12_15,$B12_15 ++___ ++ if (($NROUNDS == 13)) { ++ $code .= <<___; ++ vbroadcastf64x2 `(16 * 13)`($AES_KEYS),$AESKEY2 ++ ++ vaesenc $AESKEY1,$B00_03,$B00_03 ++ vaesenc $AESKEY1,$B04_07,$B04_07 ++ vaesenc $AESKEY1,$B08_11,$B08_11 ++ vaesenc $AESKEY1,$B12_15,$B12_15 ++ vbroadcastf64x2 `(16 * 14)`($AES_KEYS),$AESKEY1 ++ ++ vaesenc $AESKEY2,$B00_03,$B00_03 ++ vaesenc $AESKEY2,$B04_07,$B04_07 ++ vaesenc $AESKEY2,$B08_11,$B08_11 ++ vaesenc $AESKEY2,$B12_15,$B12_15 ++___ ++ } ++ } ++ ++ # ;; ================================================= ++ # ;; second phase of the reduction ++ if ($DO_REDUCTION eq "final_reduction") { ++ $code .= <<___; ++ vpclmulqdq \$0x00,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T1)]} ++ vpsrldq \$4,@{[XWORD($RED_T1)]},@{[XWORD($RED_T1)]} # ; shift-R 1-DW to obtain 2-DWs shift-R ++ vpclmulqdq \$0x10,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T2)]} ++ vpslldq \$4,@{[XWORD($RED_T2)]},@{[XWORD($RED_T2)]} # ; shift-L 1-DW for result without shifts ++ # ;; GH1H = GH1H x RED_T1 x RED_T2 ++ vpternlogq \$0x96,@{[XWORD($RED_T1)]},@{[XWORD($RED_T2)]},@{[XWORD($GH1H)]} ++___ ++ } ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; the last AES round ++ $code .= <<___; ++ vaesenclast $AESKEY1,$B00_03,$B00_03 ++ vaesenclast $AESKEY1,$B04_07,$B04_07 ++ vaesenclast $AESKEY1,$B08_11,$B08_11 ++ vaesenclast $AESKEY1,$B12_15,$B12_15 ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; XOR against plain/cipher text ++ vpxorq $DATA1,$B00_03,$B00_03 ++ vpxorq $DATA2,$B04_07,$B04_07 ++ vpxorq $DATA3,$B08_11,$B08_11 ++ vpxorq $DATA4,$B12_15,$B12_15 ++ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; store cipher/plain text ++ mov $CIPH_PLAIN_OUT,$IA0 ++ vmovdqu8 $B00_03,`$DATA_DISPL + (0 * 64)`($IA0,$DATA_OFFSET,1) ++ vmovdqu8 $B04_07,`$DATA_DISPL + (1 * 64)`($IA0,$DATA_OFFSET,1) ++ vmovdqu8 $B08_11,`$DATA_DISPL + (2 * 64)`($IA0,$DATA_OFFSET,1) ++ vmovdqu8 $B12_15,`$DATA_DISPL + (3 * 64)`($IA0,$DATA_OFFSET,1) ++___ ++ ++ # ;; ================================================= ++ # ;; shuffle cipher text blocks for GHASH computation ++ if ($ENC_DEC eq "ENC") { ++ $code .= <<___; ++ vpshufb $SHFMSK,$B00_03,$B00_03 ++ vpshufb $SHFMSK,$B04_07,$B04_07 ++ vpshufb $SHFMSK,$B08_11,$B08_11 ++ vpshufb $SHFMSK,$B12_15,$B12_15 ++___ ++ } else { ++ $code .= <<___; ++ 
vpshufb $SHFMSK,$DATA1,$B00_03 ++ vpshufb $SHFMSK,$DATA2,$B04_07 ++ vpshufb $SHFMSK,$DATA3,$B08_11 ++ vpshufb $SHFMSK,$DATA4,$B12_15 ++___ ++ } ++ ++ # ;; ================================================= ++ # ;; store shuffled cipher text for ghashing ++ $code .= <<___; ++ vmovdqa64 $B00_03,`$AESOUT_BLK_OFFSET + (0*64)`(%rsp) ++ vmovdqa64 $B04_07,`$AESOUT_BLK_OFFSET + (1*64)`(%rsp) ++ vmovdqa64 $B08_11,`$AESOUT_BLK_OFFSET + (2*64)`(%rsp) ++ vmovdqa64 $B12_15,`$AESOUT_BLK_OFFSET + (3*64)`(%rsp) ++___ ++} ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ;;; Encryption of a single block ++sub ENCRYPT_SINGLE_BLOCK { ++ my $AES_KEY = $_[0]; # ; [in] ++ my $XMM0 = $_[1]; # ; [in/out] ++ my $GPR1 = $_[2]; # ; [clobbered] ++ ++ my $rndsuffix = &random_string(); ++ ++ $code .= <<___; ++ # ; load number of rounds from AES_KEY structure (offset in bytes is ++ # ; size of the |rd_key| buffer) ++ mov `4*15*4`($AES_KEY),@{[DWORD($GPR1)]} ++ cmp \$9,@{[DWORD($GPR1)]} ++ je .Laes_128_${rndsuffix} ++ cmp \$11,@{[DWORD($GPR1)]} ++ je .Laes_192_${rndsuffix} ++ cmp \$13,@{[DWORD($GPR1)]} ++ je .Laes_256_${rndsuffix} ++ jmp .Lexit_aes_${rndsuffix} ++___ ++ for my $keylen (sort keys %aes_rounds) { ++ my $nr = $aes_rounds{$keylen}; ++ $code .= <<___; ++.align 32 ++.Laes_${keylen}_${rndsuffix}: ++___ ++ $code .= "vpxorq `16*0`($AES_KEY),$XMM0, $XMM0\n\n"; ++ for (my $i = 1; $i <= $nr; $i++) { ++ $code .= "vaesenc `16*$i`($AES_KEY),$XMM0,$XMM0\n\n"; ++ } ++ $code .= <<___; ++ vaesenclast `16*($nr+1)`($AES_KEY),$XMM0,$XMM0 ++ jmp .Lexit_aes_${rndsuffix} ++___ ++ } ++ $code .= ".Lexit_aes_${rndsuffix}:\n\n"; ++} ++ ++sub CALC_J0 { ++ my $GCM128_CTX = $_[0]; #; [in] Pointer to GCM context ++ my $IV = $_[1]; #; [in] Pointer to IV ++ my $IV_LEN = $_[2]; #; [in] IV length ++ my $J0 = $_[3]; #; [out] XMM reg to contain J0 ++ my $ZT0 = $_[4]; #; [clobbered] ZMM register ++ my $ZT1 = $_[5]; #; [clobbered] ZMM register ++ my $ZT2 = $_[6]; #; [clobbered] ZMM register ++ my $ZT3 = $_[7]; #; [clobbered] ZMM register ++ my $ZT4 = $_[8]; #; [clobbered] ZMM register ++ my $ZT5 = $_[9]; #; [clobbered] ZMM register ++ my $ZT6 = $_[10]; #; [clobbered] ZMM register ++ my $ZT7 = $_[11]; #; [clobbered] ZMM register ++ my $ZT8 = $_[12]; #; [clobbered] ZMM register ++ my $ZT9 = $_[13]; #; [clobbered] ZMM register ++ my $ZT10 = $_[14]; #; [clobbered] ZMM register ++ my $ZT11 = $_[15]; #; [clobbered] ZMM register ++ my $ZT12 = $_[16]; #; [clobbered] ZMM register ++ my $ZT13 = $_[17]; #; [clobbered] ZMM register ++ my $ZT14 = $_[18]; #; [clobbered] ZMM register ++ my $ZT15 = $_[19]; #; [clobbered] ZMM register ++ my $ZT16 = $_[20]; #; [clobbered] ZMM register ++ my $T1 = $_[21]; #; [clobbered] GP register ++ my $T2 = $_[22]; #; [clobbered] GP register ++ my $T3 = $_[23]; #; [clobbered] GP register ++ my $MASKREG = $_[24]; #; [clobbered] mask register ++ ++ # ;; J0 = GHASH(IV || 0s+64 || len(IV)64) ++ # ;; s = 16 * RoundUp(len(IV)/16) - len(IV) */ ++ ++ # ;; Calculate GHASH of (IV || 0s) ++ $code .= "vpxor $J0,$J0,$J0\n"; ++ &CALC_AAD_HASH($IV, $IV_LEN, $J0, $GCM128_CTX, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, ++ $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT13, $ZT14, $ZT15, $ZT16, $T1, $T2, $T3, $MASKREG); ++ ++ # ;; Calculate GHASH of last 16-byte block (0 || len(IV)64) ++ $code .= <<___; ++ mov $IV_LEN,$T1 ++ shl \$3,$T1 # ; IV length in bits ++ vmovq $T1,@{[XWORD($ZT2)]} ++ ++ # ;; Might need shuffle of ZT2 ++ vpxorq $J0,@{[XWORD($ZT2)]},$J0 ++ ++ vmovdqu64 
@{[HashKeyByIdx(1,$GCM128_CTX)]},@{[XWORD($ZT0)]} ++___ ++ &GHASH_MUL($J0, @{[XWORD($ZT0)]}, @{[XWORD($ZT1)]}, @{[XWORD($ZT2)]}, @{[XWORD($ZT3)]}); ++ ++ $code .= "vpshufb SHUF_MASK(%rip),$J0,$J0 # ; perform a 16Byte swap\n"; ++} ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ;;; GCM_INIT_IV performs an initialization of gcm128_ctx struct to prepare for ++# ;;; encoding/decoding. ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++sub GCM_INIT_IV { ++ my $AES_KEYS = $_[0]; # [in] AES key schedule ++ my $GCM128_CTX = $_[1]; # [in/out] GCM context ++ my $IV = $_[2]; # [in] IV pointer ++ my $IV_LEN = $_[3]; # [in] IV length ++ my $GPR1 = $_[4]; # [clobbered] GP register ++ my $GPR2 = $_[5]; # [clobbered] GP register ++ my $GPR3 = $_[6]; # [clobbered] GP register ++ my $MASKREG = $_[7]; # [clobbered] mask register ++ my $CUR_COUNT = $_[8]; # [out] XMM with current counter ++ my $ZT0 = $_[9]; # [clobbered] ZMM register ++ my $ZT1 = $_[10]; # [clobbered] ZMM register ++ my $ZT2 = $_[11]; # [clobbered] ZMM register ++ my $ZT3 = $_[12]; # [clobbered] ZMM register ++ my $ZT4 = $_[13]; # [clobbered] ZMM register ++ my $ZT5 = $_[14]; # [clobbered] ZMM register ++ my $ZT6 = $_[15]; # [clobbered] ZMM register ++ my $ZT7 = $_[16]; # [clobbered] ZMM register ++ my $ZT8 = $_[17]; # [clobbered] ZMM register ++ my $ZT9 = $_[18]; # [clobbered] ZMM register ++ my $ZT10 = $_[19]; # [clobbered] ZMM register ++ my $ZT11 = $_[20]; # [clobbered] ZMM register ++ my $ZT12 = $_[21]; # [clobbered] ZMM register ++ my $ZT13 = $_[22]; # [clobbered] ZMM register ++ my $ZT14 = $_[23]; # [clobbered] ZMM register ++ my $ZT15 = $_[24]; # [clobbered] ZMM register ++ my $ZT16 = $_[25]; # [clobbered] ZMM register ++ ++ my $ZT0x = $ZT0; ++ $ZT0x =~ s/zmm/xmm/; ++ ++ $code .= <<___; ++ cmp \$12,$IV_LEN ++ je iv_len_12_init_IV ++___ ++ ++ # ;; IV is different than 12 bytes ++ &CALC_J0($GCM128_CTX, $IV, $IV_LEN, $CUR_COUNT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, $ZT5, $ZT6, $ZT7, ++ $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT13, $ZT14, $ZT15, $ZT16, $GPR1, $GPR2, $GPR3, $MASKREG); ++ $code .= <<___; ++ jmp skip_iv_len_12_init_IV ++iv_len_12_init_IV: # ;; IV is 12 bytes ++ # ;; read 12 IV bytes and pad with 0x00000001 ++ vmovdqu8 ONEf(%rip),$CUR_COUNT ++ mov $IV,$GPR2 ++ mov \$0x0000000000000fff,@{[DWORD($GPR1)]} ++ kmovq $GPR1,$MASKREG ++ vmovdqu8 ($GPR2),${CUR_COUNT}{$MASKREG} # ; ctr = IV | 0x1 ++skip_iv_len_12_init_IV: ++ vmovdqu $CUR_COUNT,$ZT0x ++___ ++ &ENCRYPT_SINGLE_BLOCK($AES_KEYS, "$ZT0x", "$GPR1"); # ; E(K, Y0) ++ $code .= <<___; ++ vmovdqu $ZT0x,`$CTX_OFFSET_EK0`($GCM128_CTX) # ; save EK0 for finalization stage ++ ++ # ;; store IV as counter in LE format ++ vpshufb SHUF_MASK(%rip),$CUR_COUNT,$CUR_COUNT ++ vmovdqu $CUR_COUNT,`$CTX_OFFSET_CurCount`($GCM128_CTX) # ; save current counter Yi ++___ ++} ++ ++sub GCM_UPDATE_AAD { ++ my $GCM128_CTX = $_[0]; # [in] GCM context pointer ++ my $A_IN = $_[1]; # [in] AAD pointer ++ my $A_LEN = $_[2]; # [in] AAD length in bytes ++ my $GPR1 = $_[3]; # [clobbered] GP register ++ my $GPR2 = $_[4]; # [clobbered] GP register ++ my $GPR3 = $_[5]; # [clobbered] GP register ++ my $MASKREG = $_[6]; # [clobbered] mask register ++ my $AAD_HASH = $_[7]; # [out] XMM for AAD_HASH value ++ my $ZT0 = $_[8]; # [clobbered] ZMM register ++ my $ZT1 = $_[9]; # [clobbered] ZMM register ++ my $ZT2 = $_[10]; # [clobbered] ZMM register ++ my $ZT3 = $_[11]; # [clobbered] ZMM register ++ my $ZT4 = $_[12]; # [clobbered] ZMM register 
++ my $ZT5 = $_[13]; # [clobbered] ZMM register ++ my $ZT6 = $_[14]; # [clobbered] ZMM register ++ my $ZT7 = $_[15]; # [clobbered] ZMM register ++ my $ZT8 = $_[16]; # [clobbered] ZMM register ++ my $ZT9 = $_[17]; # [clobbered] ZMM register ++ my $ZT10 = $_[18]; # [clobbered] ZMM register ++ my $ZT11 = $_[19]; # [clobbered] ZMM register ++ my $ZT12 = $_[20]; # [clobbered] ZMM register ++ my $ZT13 = $_[21]; # [clobbered] ZMM register ++ my $ZT14 = $_[22]; # [clobbered] ZMM register ++ my $ZT15 = $_[23]; # [clobbered] ZMM register ++ my $ZT16 = $_[24]; # [clobbered] ZMM register ++ ++ # ; load current hash ++ $code .= "vmovdqu64 $CTX_OFFSET_AadHash($GCM128_CTX),$AAD_HASH\n"; ++ ++ &CALC_AAD_HASH($A_IN, $A_LEN, $AAD_HASH, $GCM128_CTX, $ZT0, $ZT1, $ZT2, ++ $ZT3, $ZT4, $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT13, ++ $ZT14, $ZT15, $ZT16, $GPR1, $GPR2, $GPR3, $MASKREG); ++ ++ # ; load current hash ++ $code .= "vmovdqu64 $AAD_HASH,$CTX_OFFSET_AadHash($GCM128_CTX)\n"; ++} ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ;;; Cipher and ghash of payloads shorter than 256 bytes ++# ;;; - number of blocks in the message comes as argument ++# ;;; - depending on the number of blocks an optimized variant of ++# ;;; INITIAL_BLOCKS_PARTIAL is invoked ++sub GCM_ENC_DEC_SMALL { ++ my $AES_KEYS = $_[0]; # [in] key pointer ++ my $GCM128_CTX = $_[1]; # [in] context pointer ++ my $CIPH_PLAIN_OUT = $_[2]; # [in] output buffer ++ my $PLAIN_CIPH_IN = $_[3]; # [in] input buffer ++ my $PLAIN_CIPH_LEN = $_[4]; # [in] buffer length ++ my $ENC_DEC = $_[5]; # [in] cipher direction ++ my $DATA_OFFSET = $_[6]; # [in] data offset ++ my $LENGTH = $_[7]; # [in] data length ++ my $NUM_BLOCKS = $_[8]; # [in] number of blocks to process 1 to 16 ++ my $CTR = $_[9]; # [in/out] XMM counter block ++ my $HASH_IN_OUT = $_[10]; # [in/out] XMM GHASH value ++ my $ZTMP0 = $_[11]; # [clobbered] ZMM register ++ my $ZTMP1 = $_[12]; # [clobbered] ZMM register ++ my $ZTMP2 = $_[13]; # [clobbered] ZMM register ++ my $ZTMP3 = $_[14]; # [clobbered] ZMM register ++ my $ZTMP4 = $_[15]; # [clobbered] ZMM register ++ my $ZTMP5 = $_[16]; # [clobbered] ZMM register ++ my $ZTMP6 = $_[17]; # [clobbered] ZMM register ++ my $ZTMP7 = $_[18]; # [clobbered] ZMM register ++ my $ZTMP8 = $_[19]; # [clobbered] ZMM register ++ my $ZTMP9 = $_[20]; # [clobbered] ZMM register ++ my $ZTMP10 = $_[21]; # [clobbered] ZMM register ++ my $ZTMP11 = $_[22]; # [clobbered] ZMM register ++ my $ZTMP12 = $_[23]; # [clobbered] ZMM register ++ my $ZTMP13 = $_[24]; # [clobbered] ZMM register ++ my $ZTMP14 = $_[25]; # [clobbered] ZMM register ++ my $IA0 = $_[26]; # [clobbered] GP register ++ my $IA1 = $_[27]; # [clobbered] GP register ++ my $MASKREG = $_[28]; # [clobbered] mask register ++ my $SHUFMASK = $_[29]; # [in] ZMM with BE/LE shuffle mask ++ my $PBLOCK_LEN = $_[30]; # [in] partial block length ++ ++ my $rndsuffix = &random_string(); ++ ++ $code .= <<___; ++ cmp \$8,$NUM_BLOCKS ++ je .L_small_initial_num_blocks_is_8_${rndsuffix} ++ jl .L_small_initial_num_blocks_is_7_1_${rndsuffix} ++ ++ ++ cmp \$12,$NUM_BLOCKS ++ je .L_small_initial_num_blocks_is_12_${rndsuffix} ++ jl .L_small_initial_num_blocks_is_11_9_${rndsuffix} ++ ++ # ;; 16, 15, 14 or 13 ++ cmp \$16,$NUM_BLOCKS ++ je .L_small_initial_num_blocks_is_16_${rndsuffix} ++ cmp \$15,$NUM_BLOCKS ++ je .L_small_initial_num_blocks_is_15_${rndsuffix} ++ cmp \$14,$NUM_BLOCKS ++ je .L_small_initial_num_blocks_is_14_${rndsuffix} ++ jmp .L_small_initial_num_blocks_is_13_${rndsuffix} ++ 
++.L_small_initial_num_blocks_is_11_9_${rndsuffix}: ++ # ;; 11, 10 or 9 ++ cmp \$11,$NUM_BLOCKS ++ je .L_small_initial_num_blocks_is_11_${rndsuffix} ++ cmp \$10,$NUM_BLOCKS ++ je .L_small_initial_num_blocks_is_10_${rndsuffix} ++ jmp .L_small_initial_num_blocks_is_9_${rndsuffix} ++ ++.L_small_initial_num_blocks_is_7_1_${rndsuffix}: ++ cmp \$4,$NUM_BLOCKS ++ je .L_small_initial_num_blocks_is_4_${rndsuffix} ++ jl .L_small_initial_num_blocks_is_3_1_${rndsuffix} ++ # ;; 7, 6 or 5 ++ cmp \$7,$NUM_BLOCKS ++ je .L_small_initial_num_blocks_is_7_${rndsuffix} ++ cmp \$6,$NUM_BLOCKS ++ je .L_small_initial_num_blocks_is_6_${rndsuffix} ++ jmp .L_small_initial_num_blocks_is_5_${rndsuffix} ++ ++.L_small_initial_num_blocks_is_3_1_${rndsuffix}: ++ # ;; 3, 2 or 1 ++ cmp \$3,$NUM_BLOCKS ++ je .L_small_initial_num_blocks_is_3_${rndsuffix} ++ cmp \$2,$NUM_BLOCKS ++ je .L_small_initial_num_blocks_is_2_${rndsuffix} ++ ++ # ;; for $NUM_BLOCKS == 1, just fall through and no 'jmp' needed ++ ++ # ;; Generation of different block size variants ++ # ;; - one block size has to be the first one ++___ ++ ++ for (my $num_blocks = 1; $num_blocks <= 16; $num_blocks++) { ++ $code .= ".L_small_initial_num_blocks_is_${num_blocks}_${rndsuffix}:\n"; ++ &INITIAL_BLOCKS_PARTIAL( ++ $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $LENGTH, $DATA_OFFSET, ++ $num_blocks, $CTR, $HASH_IN_OUT, $ENC_DEC, $ZTMP0, $ZTMP1, ++ $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, ++ $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13, ++ $ZTMP14, $IA0, $IA1, $MASKREG, $SHUFMASK, $PBLOCK_LEN); ++ ++ if ($num_blocks != 16) { ++ $code .= "jmp .L_small_initial_blocks_encrypted_${rndsuffix}\n"; ++ } ++ } ++ ++ $code .= ".L_small_initial_blocks_encrypted_${rndsuffix}:\n"; ++} ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ; GCM_ENC_DEC Encrypts/Decrypts given data. Assumes that the passed gcm128_context ++# ; struct has been initialized by GCM_INIT_IV ++# ; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA. ++# ; Clobbers rax, r10-r15, and zmm0-zmm31, k1 ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++sub GCM_ENC_DEC { ++ my $AES_KEYS = $_[0]; # [in] AES Key schedule ++ my $GCM128_CTX = $_[1]; # [in] context pointer ++ my $PBLOCK_LEN = $_[2]; # [in] length of partial block at the moment of previous update ++ my $PLAIN_CIPH_IN = $_[3]; # [in] input buffer pointer ++ my $PLAIN_CIPH_LEN = $_[4]; # [in] buffer length ++ my $CIPH_PLAIN_OUT = $_[5]; # [in] output buffer pointer ++ my $ENC_DEC = $_[6]; # [in] cipher direction ++ ++ my $IA0 = "%r10"; ++ my $IA1 = "%r12"; ++ my $IA2 = "%r13"; ++ my $IA3 = "%r15"; ++ my $IA4 = "%r11"; ++ my $IA5 = "%rax"; ++ my $IA6 = "%rbx"; ++ my $IA7 = "%r14"; ++ ++ my $LENGTH = $win64 ? 
$IA2 : $PLAIN_CIPH_LEN; ++ ++ my $CTR_CHECK = $IA3; ++ my $DATA_OFFSET = $IA4; ++ my $HASHK_PTR = $IA6; ++ ++ my $HKEYS_READY = $IA7; ++ ++ my $CTR_BLOCKz = "%zmm2"; ++ my $CTR_BLOCKx = "%xmm2"; ++ ++ # ; hardcoded in GCM_INIT ++ ++ my $AAD_HASHz = "%zmm14"; ++ my $AAD_HASHx = "%xmm14"; ++ ++ # ; hardcoded in GCM_COMPLETE ++ ++ my $ZTMP0 = "%zmm0"; ++ my $ZTMP1 = "%zmm3"; ++ my $ZTMP2 = "%zmm4"; ++ my $ZTMP3 = "%zmm5"; ++ my $ZTMP4 = "%zmm6"; ++ my $ZTMP5 = "%zmm7"; ++ my $ZTMP6 = "%zmm10"; ++ my $ZTMP7 = "%zmm11"; ++ my $ZTMP8 = "%zmm12"; ++ my $ZTMP9 = "%zmm13"; ++ my $ZTMP10 = "%zmm15"; ++ my $ZTMP11 = "%zmm16"; ++ my $ZTMP12 = "%zmm17"; ++ ++ my $ZTMP13 = "%zmm19"; ++ my $ZTMP14 = "%zmm20"; ++ my $ZTMP15 = "%zmm21"; ++ my $ZTMP16 = "%zmm30"; ++ my $ZTMP17 = "%zmm31"; ++ my $ZTMP18 = "%zmm1"; ++ my $ZTMP19 = "%zmm18"; ++ my $ZTMP20 = "%zmm8"; ++ my $ZTMP21 = "%zmm22"; ++ my $ZTMP22 = "%zmm23"; ++ ++ my $GH = "%zmm24"; ++ my $GL = "%zmm25"; ++ my $GM = "%zmm26"; ++ my $SHUF_MASK = "%zmm29"; ++ ++ # ; Unused in the small packet path ++ my $ADDBE_4x4 = "%zmm27"; ++ my $ADDBE_1234 = "%zmm28"; ++ ++ my $MASKREG = "%k1"; ++ ++ my $rndsuffix = &random_string(); ++ ++ # ;; reduction every 48 blocks, depth 32 blocks ++ # ;; @note 48 blocks is the maximum capacity of the stack frame ++ my $big_loop_nblocks = 48; ++ my $big_loop_depth = 32; ++ ++ # ;;; Macro flow depending on packet size ++ # ;;; - LENGTH <= 16 blocks ++ # ;;; - cipher followed by hashing (reduction) ++ # ;;; - 16 blocks < LENGTH < 32 blocks ++ # ;;; - cipher 16 blocks ++ # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction) ++ # ;;; - 32 blocks < LENGTH < 48 blocks ++ # ;;; - cipher 2 x 16 blocks ++ # ;;; - hash 16 blocks ++ # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction) ++ # ;;; - LENGTH >= 48 blocks ++ # ;;; - cipher 2 x 16 blocks ++ # ;;; - while (data_to_cipher >= 48 blocks): ++ # ;;; - cipher 16 blocks & hash 16 blocks ++ # ;;; - cipher 16 blocks & hash 16 blocks ++ # ;;; - cipher 16 blocks & hash 16 blocks (reduction) ++ # ;;; - if (data_to_cipher >= 32 blocks): ++ # ;;; - cipher 16 blocks & hash 16 blocks ++ # ;;; - cipher 16 blocks & hash 16 blocks ++ # ;;; - hash 16 blocks (reduction) ++ # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction) ++ # ;;; - elif (data_to_cipher >= 16 blocks): ++ # ;;; - cipher 16 blocks & hash 16 blocks ++ # ;;; - hash 16 blocks ++ # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction) ++ # ;;; - else: ++ # ;;; - hash 16 blocks ++ # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction) ++ ++ if ($win64) { ++ $code .= "cmpq \$0,$PLAIN_CIPH_LEN\n"; ++ } else { ++ $code .= "or $PLAIN_CIPH_LEN,$PLAIN_CIPH_LEN\n"; ++ } ++ $code .= "je .L_enc_dec_done_${rndsuffix}\n"; ++ ++ # Length value from context $CTX_OFFSET_InLen`($GCM128_CTX) is updated in ++ # 'providers/implementations/ciphers/cipher_aes_gcm_hw_vaes_avx512.inc' ++ ++ $code .= "xor $HKEYS_READY, $HKEYS_READY\n"; ++ $code .= "vmovdqu64 `$CTX_OFFSET_AadHash`($GCM128_CTX),$AAD_HASHx\n"; ++ ++ # ;; Used for the update flow - if there was a previous partial ++ # ;; block fill the remaining bytes here. 
++ &PARTIAL_BLOCK( ++ $GCM128_CTX, $PBLOCK_LEN, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN, ++ $DATA_OFFSET, $AAD_HASHx, $ENC_DEC, $IA0, $IA1, ++ $IA2, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, ++ $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $MASKREG); ++ ++ $code .= "vmovdqu64 `$CTX_OFFSET_CurCount`($GCM128_CTX),$CTR_BLOCKx\n"; ++ ++ # ;; Save the amount of data left to process in $LENGTH ++ # ;; NOTE: PLAIN_CIPH_LEN is a register on linux; ++ if ($win64) { ++ $code .= "mov $PLAIN_CIPH_LEN,$LENGTH\n"; ++ } ++ ++ # ;; There may be no more data if it was consumed in the partial block. ++ $code .= <<___; ++ sub $DATA_OFFSET,$LENGTH ++ je .L_enc_dec_done_${rndsuffix} ++___ ++ ++ $code .= <<___; ++ cmp \$`(16 * 16)`,$LENGTH ++ jbe .L_message_below_equal_16_blocks_${rndsuffix} ++ ++ vmovdqa64 SHUF_MASK(%rip),$SHUF_MASK ++ vmovdqa64 ddq_addbe_4444(%rip),$ADDBE_4x4 ++ vmovdqa64 ddq_addbe_1234(%rip),$ADDBE_1234 ++ ++ # ;; start the pipeline ++ # ;; - 32 blocks aes-ctr ++ # ;; - 16 blocks ghash + aes-ctr ++ ++ # ;; set up CTR_CHECK ++ vmovd $CTR_BLOCKx,@{[DWORD($CTR_CHECK)]} ++ and \$255,@{[DWORD($CTR_CHECK)]} ++ # ;; in LE format after init, convert to BE ++ vshufi64x2 \$0,$CTR_BLOCKz,$CTR_BLOCKz,$CTR_BLOCKz ++ vpshufb $SHUF_MASK,$CTR_BLOCKz,$CTR_BLOCKz ++___ ++ ++ # ;; ==== AES-CTR - first 16 blocks ++ my $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16)); ++ my $data_in_out_offset = 0; ++ &INITIAL_BLOCKS_16( ++ $PLAIN_CIPH_IN, $CIPH_PLAIN_OUT, $AES_KEYS, $DATA_OFFSET, "no_ghash", $CTR_BLOCKz, ++ $CTR_CHECK, $ADDBE_4x4, $ADDBE_1234, $ZTMP0, $ZTMP1, $ZTMP2, ++ $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, ++ $SHUF_MASK, $ENC_DEC, $aesout_offset, $data_in_out_offset, $IA0); ++ ++ &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, ++ "first16"); ++ ++ $code .= <<___; ++ cmp \$`(32 * 16)`,$LENGTH ++ jb .L_message_below_32_blocks_${rndsuffix} ++___ ++ ++ # ;; ==== AES-CTR - next 16 blocks ++ $aesout_offset = ($STACK_LOCAL_OFFSET + (16 * 16)); ++ $data_in_out_offset = (16 * 16); ++ &INITIAL_BLOCKS_16( ++ $PLAIN_CIPH_IN, $CIPH_PLAIN_OUT, $AES_KEYS, $DATA_OFFSET, "no_ghash", $CTR_BLOCKz, ++ $CTR_CHECK, $ADDBE_4x4, $ADDBE_1234, $ZTMP0, $ZTMP1, $ZTMP2, ++ $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, ++ $SHUF_MASK, $ENC_DEC, $aesout_offset, $data_in_out_offset, $IA0); ++ ++ &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, ++ "last32"); ++ $code .= "mov \$1,$HKEYS_READY\n"; ++ ++ $code .= <<___; ++ add \$`(32 * 16)`,$DATA_OFFSET ++ sub \$`(32 * 16)`,$LENGTH ++ ++ cmp \$`($big_loop_nblocks * 16)`,$LENGTH ++ jb .L_no_more_big_nblocks_${rndsuffix} ++___ ++ ++ # ;; ==== ++ # ;; ==== AES-CTR + GHASH - 48 blocks loop ++ # ;; ==== ++ $code .= ".L_encrypt_big_nblocks_${rndsuffix}:\n"; ++ ++ # ;; ==== AES-CTR + GHASH - 16 blocks, start ++ $aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16)); ++ $data_in_out_offset = (0 * 16); ++ my $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16)); ++ &GHASH_16_ENCRYPT_16_PARALLEL( ++ $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK, ++ 48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1, ++ $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, ++ $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13, ++ $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19, ++ $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL, ++ $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz, ++ $IA0); ++ ++ # ;; ==== AES-CTR + GHASH - 16 blocks, no 
reduction ++ $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16)); ++ $data_in_out_offset = (16 * 16); ++ $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16)); ++ &GHASH_16_ENCRYPT_16_PARALLEL( ++ $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK, ++ 32, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1, ++ $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, ++ $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13, ++ $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19, ++ $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL, ++ $GH, $GM, "no_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in", ++ $IA0); ++ ++ # ;; ==== AES-CTR + GHASH - 16 blocks, reduction ++ $aesout_offset = ($STACK_LOCAL_OFFSET + (16 * 16)); ++ $data_in_out_offset = (32 * 16); ++ $ghashin_offset = ($STACK_LOCAL_OFFSET + (32 * 16)); ++ &GHASH_16_ENCRYPT_16_PARALLEL( ++ $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK, ++ 16, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1, ++ $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, ++ $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13, ++ $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19, ++ $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL, ++ $GH, $GM, "final_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in", ++ $IA0); ++ ++ # ;; === xor cipher block 0 with GHASH (ZT4) ++ $code .= <<___; ++ vmovdqa64 $ZTMP4,$AAD_HASHz ++ ++ add \$`($big_loop_nblocks * 16)`,$DATA_OFFSET ++ sub \$`($big_loop_nblocks * 16)`,$LENGTH ++ cmp \$`($big_loop_nblocks * 16)`,$LENGTH ++ jae .L_encrypt_big_nblocks_${rndsuffix} ++ ++.L_no_more_big_nblocks_${rndsuffix}: ++ ++ cmp \$`(32 * 16)`,$LENGTH ++ jae .L_encrypt_32_blocks_${rndsuffix} ++ ++ cmp \$`(16 * 16)`,$LENGTH ++ jae .L_encrypt_16_blocks_${rndsuffix} ++___ ++ ++ # ;; ===================================================== ++ # ;; ===================================================== ++ # ;; ==== GHASH 1 x 16 blocks ++ # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks ++ # ;; ==== then GHASH N blocks ++ $code .= ".L_encrypt_0_blocks_ghash_32_${rndsuffix}:\n"; ++ ++ # ;; calculate offset to the right hash key ++ $code .= <<___; ++mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]} ++and \$~15,@{[DWORD($IA0)]} ++mov \$`@{[HashKeyOffsetByIdx(32,"frame")]}`,@{[DWORD($HASHK_PTR)]} ++sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]} ++___ ++ ++ # ;; ==== GHASH 32 blocks and follow with reduction ++ &GHASH_16("start", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (0 * 16), ++ "%rsp", $HASHK_PTR, 0, $AAD_HASHz, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9); ++ ++ # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the reminder ++ $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16)); ++ $code .= "add \$`(16 * 16)`,@{[DWORD($HASHK_PTR)]}\n"; ++ &GCM_ENC_DEC_LAST( ++ $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH, ++ $CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0, ++ $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, ++ $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, ++ $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, ++ $ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, ++ "mid", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz, ++ $IA0, $IA5, $MASKREG, $PBLOCK_LEN); ++ ++ $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n"; ++ $code .= "jmp .L_ghash_done_${rndsuffix}\n"; ++ ++ # ;; ===================================================== ++ # ;; 
===================================================== ++ # ;; ==== GHASH & encrypt 1 x 16 blocks ++ # ;; ==== GHASH & encrypt 1 x 16 blocks ++ # ;; ==== GHASH 1 x 16 blocks (reduction) ++ # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks ++ # ;; ==== then GHASH N blocks ++ $code .= ".L_encrypt_32_blocks_${rndsuffix}:\n"; ++ ++ # ;; ==== AES-CTR + GHASH - 16 blocks, start ++ $aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16)); ++ $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16)); ++ $data_in_out_offset = (0 * 16); ++ &GHASH_16_ENCRYPT_16_PARALLEL( ++ $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK, ++ 48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1, ++ $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, ++ $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13, ++ $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19, ++ $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL, ++ $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz, ++ $IA0); ++ ++ # ;; ==== AES-CTR + GHASH - 16 blocks, no reduction ++ $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16)); ++ $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16)); ++ $data_in_out_offset = (16 * 16); ++ &GHASH_16_ENCRYPT_16_PARALLEL( ++ $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK, ++ 32, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1, ++ $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, ++ $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13, ++ $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19, ++ $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL, ++ $GH, $GM, "no_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in", ++ $IA0); ++ ++ # ;; ==== GHASH 16 blocks with reduction ++ &GHASH_16( ++ "end_reduce", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (32 * 16), ++ "%rsp", &HashKeyOffsetByIdx(16, "frame"), ++ 0, $AAD_HASHz, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9); ++ ++ # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the reminder ++ $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16)); ++ $code .= <<___; ++ sub \$`(32 * 16)`,$LENGTH ++ add \$`(32 * 16)`,$DATA_OFFSET ++___ ++ ++ # ;; calculate offset to the right hash key ++ $code .= "mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}\n"; ++ $code .= <<___; ++ and \$~15,@{[DWORD($IA0)]} ++ mov \$`@{[HashKeyOffsetByIdx(16,"frame")]}`,@{[DWORD($HASHK_PTR)]} ++ sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]} ++___ ++ &GCM_ENC_DEC_LAST( ++ $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH, ++ $CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0, ++ $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, ++ $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, ++ $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, ++ $ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, ++ "start", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz, ++ $IA0, $IA5, $MASKREG, $PBLOCK_LEN); ++ ++ $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n"; ++ $code .= "jmp .L_ghash_done_${rndsuffix}\n"; ++ ++ # ;; ===================================================== ++ # ;; ===================================================== ++ # ;; ==== GHASH & encrypt 16 blocks (done before) ++ # ;; ==== GHASH 1 x 16 blocks ++ # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks ++ # ;; ==== then GHASH N blocks ++ $code .= ".L_encrypt_16_blocks_${rndsuffix}:\n"; ++ ++ # ;; ==== AES-CTR + GHASH - 16 blocks, start ++ 
$aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16)); ++ $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16)); ++ $data_in_out_offset = (0 * 16); ++ &GHASH_16_ENCRYPT_16_PARALLEL( ++ $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK, ++ 48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1, ++ $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, ++ $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13, ++ $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19, ++ $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL, ++ $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz, ++ $IA0); ++ ++ # ;; ==== GHASH 1 x 16 blocks ++ &GHASH_16( ++ "mid", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (16 * 16), ++ "%rsp", &HashKeyOffsetByIdx(32, "frame"), ++ 0, "no_hash_input", $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9); ++ ++ # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the reminder ++ $ghashin_offset = ($STACK_LOCAL_OFFSET + (32 * 16)); ++ $code .= <<___; ++ sub \$`(16 * 16)`,$LENGTH ++ add \$`(16 * 16)`,$DATA_OFFSET ++___ ++ &GCM_ENC_DEC_LAST( ++ $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, ++ $DATA_OFFSET, $LENGTH, $CTR_BLOCKz, $CTR_CHECK, ++ &HashKeyOffsetByIdx(16, "frame"), $ghashin_offset, $SHUF_MASK, $ZTMP0, ++ $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, ++ $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, ++ $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, ++ $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, ++ $ZTMP17, $ZTMP18, $ZTMP19, $ZTMP20, ++ $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, ++ "end_reduce", $GL, $GH, $GM, ++ $ENC_DEC, $AAD_HASHz, $IA0, $IA5, ++ $MASKREG, $PBLOCK_LEN); ++ ++ $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n"; ++ $code .= <<___; ++ jmp .L_ghash_done_${rndsuffix} ++ ++.L_message_below_32_blocks_${rndsuffix}: ++ # ;; 32 > number of blocks > 16 ++ ++ sub \$`(16 * 16)`,$LENGTH ++ add \$`(16 * 16)`,$DATA_OFFSET ++___ ++ $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16)); ++ ++ # ;; calculate offset to the right hash key ++ $code .= "mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}\n"; ++ ++ &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, ++ "mid16"); ++ $code .= "mov \$1,$HKEYS_READY\n"; ++ ++ $code .= <<___; ++and \$~15,@{[DWORD($IA0)]} ++mov \$`@{[HashKeyOffsetByIdx(16,"frame")]}`,@{[DWORD($HASHK_PTR)]} ++sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]} ++___ ++ ++ &GCM_ENC_DEC_LAST( ++ $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH, ++ $CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0, ++ $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, ++ $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, ++ $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, ++ $ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, ++ "start", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz, ++ $IA0, $IA5, $MASKREG, $PBLOCK_LEN); ++ ++ $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n"; ++ $code .= <<___; ++ jmp .L_ghash_done_${rndsuffix} ++ ++.L_message_below_equal_16_blocks_${rndsuffix}: ++ # ;; Determine how many blocks to process ++ # ;; - process one additional block if there is a partial block ++ mov @{[DWORD($LENGTH)]},@{[DWORD($IA1)]} ++ add \$15,@{[DWORD($IA1)]} ++ shr \$4, @{[DWORD($IA1)]} # ; $IA1 can be in the range from 0 to 16 ++___ ++ &GCM_ENC_DEC_SMALL( ++ $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN, $ENC_DEC, ++ $DATA_OFFSET, $LENGTH, $IA1, $CTR_BLOCKx, $AAD_HASHx, 
$ZTMP0, ++ $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, ++ $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, ++ $ZTMP13, $ZTMP14, $IA0, $IA3, $MASKREG, $SHUF_MASK, ++ $PBLOCK_LEN); ++ ++ # ;; fall through to exit ++ ++ $code .= ".L_ghash_done_${rndsuffix}:\n"; ++ ++ # ;; save the last counter block ++ $code .= "vmovdqu64 $CTR_BLOCKx,`$CTX_OFFSET_CurCount`($GCM128_CTX)\n"; ++ $code .= <<___; ++ vmovdqu64 $AAD_HASHx,`$CTX_OFFSET_AadHash`($GCM128_CTX) ++.L_enc_dec_done_${rndsuffix}: ++___ ++} ++ ++# ;;; =========================================================================== ++# ;;; Encrypt/decrypt the initial 16 blocks ++sub INITIAL_BLOCKS_16 { ++ my $IN = $_[0]; # [in] input buffer ++ my $OUT = $_[1]; # [in] output buffer ++ my $AES_KEYS = $_[2]; # [in] pointer to expanded keys ++ my $DATA_OFFSET = $_[3]; # [in] data offset ++ my $GHASH = $_[4]; # [in] ZMM with AAD (low 128 bits) ++ my $CTR = $_[5]; # [in] ZMM with CTR BE blocks 4x128 bits ++ my $CTR_CHECK = $_[6]; # [in/out] GPR with counter overflow check ++ my $ADDBE_4x4 = $_[7]; # [in] ZMM 4x128bits with value 4 (big endian) ++ my $ADDBE_1234 = $_[8]; # [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian) ++ my $T0 = $_[9]; # [clobered] temporary ZMM register ++ my $T1 = $_[10]; # [clobered] temporary ZMM register ++ my $T2 = $_[11]; # [clobered] temporary ZMM register ++ my $T3 = $_[12]; # [clobered] temporary ZMM register ++ my $T4 = $_[13]; # [clobered] temporary ZMM register ++ my $T5 = $_[14]; # [clobered] temporary ZMM register ++ my $T6 = $_[15]; # [clobered] temporary ZMM register ++ my $T7 = $_[16]; # [clobered] temporary ZMM register ++ my $T8 = $_[17]; # [clobered] temporary ZMM register ++ my $SHUF_MASK = $_[18]; # [in] ZMM with BE/LE shuffle mask ++ my $ENC_DEC = $_[19]; # [in] ENC (encrypt) or DEC (decrypt) selector ++ my $BLK_OFFSET = $_[20]; # [in] stack frame offset to ciphered blocks ++ my $DATA_DISPL = $_[21]; # [in] fixed numerical data displacement/offset ++ my $IA0 = $_[22]; # [clobered] temporary GP register ++ ++ my $B00_03 = $T5; ++ my $B04_07 = $T6; ++ my $B08_11 = $T7; ++ my $B12_15 = $T8; ++ ++ my $rndsuffix = &random_string(); ++ ++ my $stack_offset = $BLK_OFFSET; ++ $code .= <<___; ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;; prepare counter blocks ++ ++ cmpb \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]} ++ jae .L_next_16_overflow_${rndsuffix} ++ vpaddd $ADDBE_1234,$CTR,$B00_03 ++ vpaddd $ADDBE_4x4,$B00_03,$B04_07 ++ vpaddd $ADDBE_4x4,$B04_07,$B08_11 ++ vpaddd $ADDBE_4x4,$B08_11,$B12_15 ++ jmp .L_next_16_ok_${rndsuffix} ++.L_next_16_overflow_${rndsuffix}: ++ vpshufb $SHUF_MASK,$CTR,$CTR ++ vmovdqa64 ddq_add_4444(%rip),$B12_15 ++ vpaddd ddq_add_1234(%rip),$CTR,$B00_03 ++ vpaddd $B12_15,$B00_03,$B04_07 ++ vpaddd $B12_15,$B04_07,$B08_11 ++ vpaddd $B12_15,$B08_11,$B12_15 ++ vpshufb $SHUF_MASK,$B00_03,$B00_03 ++ vpshufb $SHUF_MASK,$B04_07,$B04_07 ++ vpshufb $SHUF_MASK,$B08_11,$B08_11 ++ vpshufb $SHUF_MASK,$B12_15,$B12_15 ++.L_next_16_ok_${rndsuffix}: ++ vshufi64x2 \$0b11111111,$B12_15,$B12_15,$CTR ++ addb \$16,@{[BYTE($CTR_CHECK)]} ++ # ;; === load 16 blocks of data ++ vmovdqu8 `$DATA_DISPL + (64*0)`($IN,$DATA_OFFSET,1),$T0 ++ vmovdqu8 `$DATA_DISPL + (64*1)`($IN,$DATA_OFFSET,1),$T1 ++ vmovdqu8 `$DATA_DISPL + (64*2)`($IN,$DATA_OFFSET,1),$T2 ++ vmovdqu8 `$DATA_DISPL + (64*3)`($IN,$DATA_OFFSET,1),$T3 ++ ++ # ;; move to AES encryption rounds ++ vbroadcastf64x2 `(16*0)`($AES_KEYS),$T4 ++ vpxorq $T4,$B00_03,$B00_03 ++ vpxorq $T4,$B04_07,$B04_07 ++ vpxorq $T4,$B08_11,$B08_11 ++ vpxorq 
$T4,$B12_15,$B12_15 ++___ ++ foreach (1 .. ($NROUNDS)) { ++ $code .= <<___; ++ vbroadcastf64x2 `(16*$_)`($AES_KEYS),$T4 ++ vaesenc $T4,$B00_03,$B00_03 ++ vaesenc $T4,$B04_07,$B04_07 ++ vaesenc $T4,$B08_11,$B08_11 ++ vaesenc $T4,$B12_15,$B12_15 ++___ ++ } ++ $code .= <<___; ++ vbroadcastf64x2 `(16*($NROUNDS+1))`($AES_KEYS),$T4 ++ vaesenclast $T4,$B00_03,$B00_03 ++ vaesenclast $T4,$B04_07,$B04_07 ++ vaesenclast $T4,$B08_11,$B08_11 ++ vaesenclast $T4,$B12_15,$B12_15 ++ ++ # ;; xor against text ++ vpxorq $T0,$B00_03,$B00_03 ++ vpxorq $T1,$B04_07,$B04_07 ++ vpxorq $T2,$B08_11,$B08_11 ++ vpxorq $T3,$B12_15,$B12_15 ++ ++ # ;; store ++ mov $OUT, $IA0 ++ vmovdqu8 $B00_03,`$DATA_DISPL + (64*0)`($IA0,$DATA_OFFSET,1) ++ vmovdqu8 $B04_07,`$DATA_DISPL + (64*1)`($IA0,$DATA_OFFSET,1) ++ vmovdqu8 $B08_11,`$DATA_DISPL + (64*2)`($IA0,$DATA_OFFSET,1) ++ vmovdqu8 $B12_15,`$DATA_DISPL + (64*3)`($IA0,$DATA_OFFSET,1) ++___ ++ if ($ENC_DEC eq "DEC") { ++ $code .= <<___; ++ # ;; decryption - cipher text needs to go to GHASH phase ++ vpshufb $SHUF_MASK,$T0,$B00_03 ++ vpshufb $SHUF_MASK,$T1,$B04_07 ++ vpshufb $SHUF_MASK,$T2,$B08_11 ++ vpshufb $SHUF_MASK,$T3,$B12_15 ++___ ++ } else { ++ $code .= <<___; ++ # ;; encryption ++ vpshufb $SHUF_MASK,$B00_03,$B00_03 ++ vpshufb $SHUF_MASK,$B04_07,$B04_07 ++ vpshufb $SHUF_MASK,$B08_11,$B08_11 ++ vpshufb $SHUF_MASK,$B12_15,$B12_15 ++___ ++ } ++ ++ if ($GHASH ne "no_ghash") { ++ $code .= <<___; ++ # ;; === xor cipher block 0 with GHASH for the next GHASH round ++ vpxorq $GHASH,$B00_03,$B00_03 ++___ ++ } ++ $code .= <<___; ++ vmovdqa64 $B00_03,`$stack_offset + (0 * 64)`(%rsp) ++ vmovdqa64 $B04_07,`$stack_offset + (1 * 64)`(%rsp) ++ vmovdqa64 $B08_11,`$stack_offset + (2 * 64)`(%rsp) ++ vmovdqa64 $B12_15,`$stack_offset + (3 * 64)`(%rsp) ++___ ++} ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ; GCM_COMPLETE Finishes ghash calculation ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++sub GCM_COMPLETE { ++ my $GCM128_CTX = $_[0]; ++ my $PBLOCK_LEN = $_[1]; ++ ++ my $rndsuffix = &random_string(); ++ ++ $code .= <<___; ++ vmovdqu @{[HashKeyByIdx(1,$GCM128_CTX)]},%xmm2 ++ vmovdqu $CTX_OFFSET_EK0($GCM128_CTX),%xmm3 # ; xmm3 = E(K,Y0) ++___ ++ ++ $code .= <<___; ++ vmovdqu `$CTX_OFFSET_AadHash`($GCM128_CTX),%xmm4 ++ ++ # ;; Process the final partial block. ++ cmp \$0,$PBLOCK_LEN ++ je .L_partial_done_${rndsuffix} ++___ ++ ++ # ;GHASH computation for the last <16 Byte block ++ &GHASH_MUL("%xmm4", "%xmm2", "%xmm0", "%xmm16", "%xmm17"); ++ ++ $code .= <<___; ++.L_partial_done_${rndsuffix}: ++ vmovq `$CTX_OFFSET_InLen`($GCM128_CTX), %xmm5 ++ vpinsrq \$1, `$CTX_OFFSET_AadLen`($GCM128_CTX), %xmm5, %xmm5 # ; xmm5 = len(A)||len(C) ++ vpsllq \$3, %xmm5, %xmm5 # ; convert bytes into bits ++ ++ vpxor %xmm5,%xmm4,%xmm4 ++___ ++ ++ &GHASH_MUL("%xmm4", "%xmm2", "%xmm0", "%xmm16", "%xmm17"); ++ ++ $code .= <<___; ++ vpshufb SHUF_MASK(%rip),%xmm4,%xmm4 # ; perform a 16Byte swap ++ vpxor %xmm4,%xmm3,%xmm3 ++ ++.L_return_T_${rndsuffix}: ++ vmovdqu %xmm3,`$CTX_OFFSET_AadHash`($GCM128_CTX) ++___ ++} ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ;;; Functions definitions ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ ++$code .= ".text\n"; ++{ ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ # ;void ossl_aes_gcm_init_avx512 / ++ # ; (const void *aes_keys, ++ # ; void *gcm128ctx) ++ # ; ++ # ; Precomputes hashkey table for GHASH optimization. 
++ # ; Leaf function (does not allocate stack space, does not use non-volatile registers). ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ $code .= <<___; ++.globl ossl_aes_gcm_init_avx512 ++.type ossl_aes_gcm_init_avx512,\@abi-omnipotent ++.align 32 ++ossl_aes_gcm_init_avx512: ++.cfi_startproc ++ endbranch ++___ ++ if ($CHECK_FUNCTION_ARGUMENTS) { ++ $code .= <<___; ++ # ;; Check aes_keys != NULL ++ test $arg1,$arg1 ++ jz .Labort_init ++ ++ # ;; Check gcm128ctx != NULL ++ test $arg2,$arg2 ++ jz .Labort_init ++___ ++ } ++ $code .= "vpxorq %xmm16,%xmm16,%xmm16\n"; ++ &ENCRYPT_SINGLE_BLOCK("$arg1", "%xmm16", "%rax"); # ; xmm16 = HashKey ++ $code .= <<___; ++ vpshufb SHUF_MASK(%rip),%xmm16,%xmm16 ++ # ;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey ;;; ++ vmovdqa64 %xmm16,%xmm2 ++ vpsllq \$1,%xmm16,%xmm16 ++ vpsrlq \$63,%xmm2,%xmm2 ++ vmovdqa %xmm2,%xmm1 ++ vpslldq \$8,%xmm2,%xmm2 ++ vpsrldq \$8,%xmm1,%xmm1 ++ vporq %xmm2,%xmm16,%xmm16 ++ # ;reduction ++ vpshufd \$0b00100100,%xmm1,%xmm2 ++ vpcmpeqd TWOONE(%rip),%xmm2,%xmm2 ++ vpand POLY(%rip),%xmm2,%xmm2 ++ vpxorq %xmm2,%xmm16,%xmm16 # ; xmm16 holds the HashKey<<1 mod poly ++ # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ vmovdqu64 %xmm16,@{[HashKeyByIdx(1,$arg2)]} # ; store HashKey<<1 mod poly ++___ ++ &PRECOMPUTE("$arg2", "%xmm16", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5"); ++ if ($CLEAR_SCRATCH_REGISTERS) { ++ &clear_scratch_gps_asm(); ++ &clear_scratch_zmms_asm(); ++ } else { ++ $code .= "vzeroupper\n"; ++ } ++ $code .= <<___; ++.Labort_init: ++ret ++.cfi_endproc ++.size ossl_aes_gcm_init_avx512, .-ossl_aes_gcm_init_avx512 ++___ ++} ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ;void ossl_aes_gcm_setiv_avx512 ++# ; (const void *aes_keys, ++# ; void *gcm128ctx, ++# ; const unsigned char *iv, ++# ; size_t ivlen) ++# ; ++# ; Computes E(K,Y0) for finalization, updates current counter Yi in gcm128_context structure. 
++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++$code .= <<___; ++.globl ossl_aes_gcm_setiv_avx512 ++.type ossl_aes_gcm_setiv_avx512,\@abi-omnipotent ++.align 32 ++ossl_aes_gcm_setiv_avx512: ++.cfi_startproc ++.Lsetiv_seh_begin: ++ endbranch ++___ ++if ($CHECK_FUNCTION_ARGUMENTS) { ++ $code .= <<___; ++ # ;; Check aes_keys != NULL ++ test $arg1,$arg1 ++ jz .Labort_setiv ++ ++ # ;; Check gcm128ctx != NULL ++ test $arg2,$arg2 ++ jz .Labort_setiv ++ ++ # ;; Check iv != NULL ++ test $arg3,$arg3 ++ jz .Labort_setiv ++ ++ # ;; Check ivlen != 0 ++ test $arg4,$arg4 ++ jz .Labort_setiv ++___ ++} ++ ++# ; NOTE: code before PROLOG() must not modify any registers ++&PROLOG( ++ 1, # allocate stack space for hkeys ++ 0, # do not allocate stack space for AES blocks ++ "setiv"); ++&GCM_INIT_IV( ++ "$arg1", "$arg2", "$arg3", "$arg4", "%r10", "%r11", "%r12", "%k1", "%xmm2", "%zmm1", ++ "%zmm11", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm12", ++ "%zmm13", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19"); ++&EPILOG( ++ 1, # hkeys were allocated ++ $arg4); ++$code .= <<___; ++.Labort_setiv: ++ret ++.Lsetiv_seh_end: ++.cfi_endproc ++.size ossl_aes_gcm_setiv_avx512, .-ossl_aes_gcm_setiv_avx512 ++___ ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ;void ossl_aes_gcm_update_aad_avx512 ++# ; (unsigned char *gcm128ctx, ++# ; const unsigned char *aad, ++# ; size_t aadlen) ++# ; ++# ; Updates AAD hash in gcm128_context structure. ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++$code .= <<___; ++.globl ossl_aes_gcm_update_aad_avx512 ++.type ossl_aes_gcm_update_aad_avx512,\@abi-omnipotent ++.align 32 ++ossl_aes_gcm_update_aad_avx512: ++.cfi_startproc ++.Lghash_seh_begin: ++ endbranch ++___ ++if ($CHECK_FUNCTION_ARGUMENTS) { ++ $code .= <<___; ++ # ;; Check gcm128ctx != NULL ++ test $arg1,$arg1 ++ jz .Lexit_update_aad ++ ++ # ;; Check aad != NULL ++ test $arg2,$arg2 ++ jz .Lexit_update_aad ++ ++ # ;; Check aadlen != 0 ++ test $arg3,$arg3 ++ jz .Lexit_update_aad ++___ ++} ++ ++# ; NOTE: code before PROLOG() must not modify any registers ++&PROLOG( ++ 1, # allocate stack space for hkeys, ++ 0, # do not allocate stack space for AES blocks ++ "ghash"); ++&GCM_UPDATE_AAD( ++ "$arg1", "$arg2", "$arg3", "%r10", "%r11", "%r12", "%k1", "%xmm14", "%zmm1", "%zmm11", ++ "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm12", "%zmm13", ++ "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19"); ++&EPILOG( ++ 1, # hkeys were allocated ++ $arg3); ++$code .= <<___; ++.Lexit_update_aad: ++ret ++.Lghash_seh_end: ++.cfi_endproc ++.size ossl_aes_gcm_update_aad_avx512, .-ossl_aes_gcm_update_aad_avx512 ++___ ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ;void ossl_aes_gcm_encrypt_avx512 ++# ; (const void* aes_keys, ++# ; void *gcm128ctx, ++# ; unsigned int *pblocklen, ++# ; const unsigned char *in, ++# ; size_t len, ++# ; unsigned char *out); ++# ; ++# ; Performs encryption of data |in| of len |len|, and stores the output in |out|. ++# ; Stores encrypted partial block (if any) in gcm128ctx and its length in |pblocklen|. 
++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++$code .= <<___; ++.globl ossl_aes_gcm_encrypt_avx512 ++.type ossl_aes_gcm_encrypt_avx512,\@abi-omnipotent ++.align 32 ++ossl_aes_gcm_encrypt_avx512: ++.cfi_startproc ++.Lencrypt_seh_begin: ++ endbranch ++___ ++ ++# ; NOTE: code before PROLOG() must not modify any registers ++&PROLOG( ++ 1, # allocate stack space for hkeys ++ 1, # allocate stack space for AES blocks ++ "encrypt"); ++if ($CHECK_FUNCTION_ARGUMENTS) { ++ $code .= <<___; ++ # ;; Check aes_keys != NULL ++ test $arg1,$arg1 ++ jz .Lexit_gcm_encrypt ++ ++ # ;; Check gcm128ctx != NULL ++ test $arg2,$arg2 ++ jz .Lexit_gcm_encrypt ++ ++ # ;; Check pblocklen != NULL ++ test $arg3,$arg3 ++ jz .Lexit_gcm_encrypt ++ ++ # ;; Check in != NULL ++ test $arg4,$arg4 ++ jz .Lexit_gcm_encrypt ++ ++ # ;; Check if len != 0 ++ cmp \$0,$arg5 ++ jz .Lexit_gcm_encrypt ++ ++ # ;; Check out != NULL ++ cmp \$0,$arg6 ++ jz .Lexit_gcm_encrypt ++___ ++} ++$code .= <<___; ++ # ; load number of rounds from AES_KEY structure (offset in bytes is ++ # ; size of the |rd_key| buffer) ++ mov `4*15*4`($arg1),%eax ++ cmp \$9,%eax ++ je .Laes_gcm_encrypt_128_avx512 ++ cmp \$11,%eax ++ je .Laes_gcm_encrypt_192_avx512 ++ cmp \$13,%eax ++ je .Laes_gcm_encrypt_256_avx512 ++ xor %eax,%eax ++ jmp .Lexit_gcm_encrypt ++___ ++for my $keylen (sort keys %aes_rounds) { ++ $NROUNDS = $aes_rounds{$keylen}; ++ $code .= <<___; ++.align 32 ++.Laes_gcm_encrypt_${keylen}_avx512: ++___ ++ &GCM_ENC_DEC("$arg1", "$arg2", "$arg3", "$arg4", "$arg5", "$arg6", "ENC"); ++ $code .= "jmp .Lexit_gcm_encrypt\n"; ++} ++$code .= ".Lexit_gcm_encrypt:\n"; ++&EPILOG(1, $arg5); ++$code .= <<___; ++ret ++.Lencrypt_seh_end: ++.cfi_endproc ++.size ossl_aes_gcm_encrypt_avx512, .-ossl_aes_gcm_encrypt_avx512 ++___ ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ;void ossl_aes_gcm_decrypt_avx512 ++# ; (const void* keys, ++# ; void *gcm128ctx, ++# ; unsigned int *pblocklen, ++# ; const unsigned char *in, ++# ; size_t len, ++# ; unsigned char *out); ++# ; ++# ; Performs decryption of data |in| of len |len|, and stores the output in |out|. ++# ; Stores decrypted partial block (if any) in gcm128ctx and its length in |pblocklen|. 
++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++$code .= <<___; ++.globl ossl_aes_gcm_decrypt_avx512 ++.type ossl_aes_gcm_decrypt_avx512,\@abi-omnipotent ++.align 32 ++ossl_aes_gcm_decrypt_avx512: ++.cfi_startproc ++.Ldecrypt_seh_begin: ++ endbranch ++___ ++ ++# ; NOTE: code before PROLOG() must not modify any registers ++&PROLOG( ++ 1, # allocate stack space for hkeys ++ 1, # allocate stack space for AES blocks ++ "decrypt"); ++if ($CHECK_FUNCTION_ARGUMENTS) { ++ $code .= <<___; ++ # ;; Check keys != NULL ++ test $arg1,$arg1 ++ jz .Lexit_gcm_decrypt ++ ++ # ;; Check gcm128ctx != NULL ++ test $arg2,$arg2 ++ jz .Lexit_gcm_decrypt ++ ++ # ;; Check pblocklen != NULL ++ test $arg3,$arg3 ++ jz .Lexit_gcm_decrypt ++ ++ # ;; Check in != NULL ++ test $arg4,$arg4 ++ jz .Lexit_gcm_decrypt ++ ++ # ;; Check if len != 0 ++ cmp \$0,$arg5 ++ jz .Lexit_gcm_decrypt ++ ++ # ;; Check out != NULL ++ cmp \$0,$arg6 ++ jz .Lexit_gcm_decrypt ++___ ++} ++$code .= <<___; ++ # ; load number of rounds from AES_KEY structure (offset in bytes is ++ # ; size of the |rd_key| buffer) ++ mov `4*15*4`($arg1),%eax ++ cmp \$9,%eax ++ je .Laes_gcm_decrypt_128_avx512 ++ cmp \$11,%eax ++ je .Laes_gcm_decrypt_192_avx512 ++ cmp \$13,%eax ++ je .Laes_gcm_decrypt_256_avx512 ++ xor %eax,%eax ++ jmp .Lexit_gcm_decrypt ++___ ++for my $keylen (sort keys %aes_rounds) { ++ $NROUNDS = $aes_rounds{$keylen}; ++ $code .= <<___; ++.align 32 ++.Laes_gcm_decrypt_${keylen}_avx512: ++___ ++ &GCM_ENC_DEC("$arg1", "$arg2", "$arg3", "$arg4", "$arg5", "$arg6", "DEC"); ++ $code .= "jmp .Lexit_gcm_decrypt\n"; ++} ++$code .= ".Lexit_gcm_decrypt:\n"; ++&EPILOG(1, $arg5); ++$code .= <<___; ++ret ++.Ldecrypt_seh_end: ++.cfi_endproc ++.size ossl_aes_gcm_decrypt_avx512, .-ossl_aes_gcm_decrypt_avx512 ++___ ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ;void ossl_aes_gcm_finalize_vaes_avx512 ++# ; (void *gcm128ctx, ++# ; unsigned int pblocklen); ++# ; ++# ; Finalizes encryption / decryption ++# ; Leaf function (does not allocate stack space, does not use non-volatile registers). ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++$code .= <<___; ++.globl ossl_aes_gcm_finalize_avx512 ++.type ossl_aes_gcm_finalize_avx512,\@abi-omnipotent ++.align 32 ++ossl_aes_gcm_finalize_avx512: ++.cfi_startproc ++ endbranch ++___ ++if ($CHECK_FUNCTION_ARGUMENTS) { ++ $code .= <<___; ++ # ;; Check gcm128ctx != NULL ++ test $arg1,$arg1 ++ jz .Labort_finalize ++___ ++} ++ ++&GCM_COMPLETE("$arg1", "$arg2"); ++ ++$code .= <<___; ++.Labort_finalize: ++ret ++.cfi_endproc ++.size ossl_aes_gcm_finalize_avx512, .-ossl_aes_gcm_finalize_avx512 ++___ ++ ++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++# ;void ossl_gcm_gmult_avx512(u64 Xi[2], ++# ; const void* gcm128ctx) ++# ; ++# ; Leaf function (does not allocate stack space, does not use non-volatile registers). 
++# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++$code .= <<___; ++.globl ossl_gcm_gmult_avx512 ++.hidden ossl_gcm_gmult_avx512 ++.type ossl_gcm_gmult_avx512,\@abi-omnipotent ++.align 32 ++ossl_gcm_gmult_avx512: ++.cfi_startproc ++ endbranch ++___ ++if ($CHECK_FUNCTION_ARGUMENTS) { ++ $code .= <<___; ++ # ;; Check Xi != NULL ++ test $arg1,$arg1 ++ jz .Labort_gmult ++ ++ # ;; Check gcm128ctx != NULL ++ test $arg2,$arg2 ++ jz .Labort_gmult ++___ ++} ++$code .= "vmovdqu64 ($arg1),%xmm1\n"; ++$code .= "vmovdqu64 @{[HashKeyByIdx(1,$arg2)]},%xmm2\n"; ++ ++&GHASH_MUL("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5"); ++ ++$code .= "vmovdqu64 %xmm1,($arg1)\n"; ++if ($CLEAR_SCRATCH_REGISTERS) { ++ &clear_scratch_gps_asm(); ++ &clear_scratch_zmms_asm(); ++} else { ++ $code .= "vzeroupper\n"; ++} ++$code .= <<___; ++.Labort_gmult: ++ret ++.cfi_endproc ++.size ossl_gcm_gmult_avx512, .-ossl_gcm_gmult_avx512 ++___ ++ ++if ($win64) { ++ ++ # Add unwind metadata for SEH. ++ ++ # See https://docs.microsoft.com/en-us/cpp/build/exception-handling-x64?view=msvc-160 ++ my $UWOP_PUSH_NONVOL = 0; ++ my $UWOP_ALLOC_LARGE = 1; ++ my $UWOP_SET_FPREG = 3; ++ my $UWOP_SAVE_XMM128 = 8; ++ my %UWOP_REG_NUMBER = ( ++ rax => 0, ++ rcx => 1, ++ rdx => 2, ++ rbx => 3, ++ rsp => 4, ++ rbp => 5, ++ rsi => 6, ++ rdi => 7, ++ map(("r$_" => $_), (8 .. 15))); ++ ++ $code .= <<___; ++.section .pdata ++.align 4 ++ .rva .Lsetiv_seh_begin ++ .rva .Lsetiv_seh_end ++ .rva .Lsetiv_seh_info ++ ++ .rva .Lghash_seh_begin ++ .rva .Lghash_seh_end ++ .rva .Lghash_seh_info ++ ++ .rva .Lencrypt_seh_begin ++ .rva .Lencrypt_seh_end ++ .rva .Lencrypt_seh_info ++ ++ .rva .Ldecrypt_seh_begin ++ .rva .Ldecrypt_seh_end ++ .rva .Ldecrypt_seh_info ++ ++.section .xdata ++___ ++ ++ foreach my $func_name ("setiv", "ghash", "encrypt", "decrypt") { ++ $code .= <<___; ++.align 8 ++.L${func_name}_seh_info: ++ .byte 1 # version 1, no flags ++ .byte \$L\$${func_name}_seh_prolog_end-\$L\$${func_name}_seh_begin ++ .byte 31 # num_slots = 1*8 + 2 + 1 + 2*10 ++ # FR = rbp; Offset from RSP = $XMM_STORAGE scaled on 16 ++ .byte @{[$UWOP_REG_NUMBER{rbp} | (($XMM_STORAGE / 16 ) << 4)]} ++___ ++ ++ # Metadata for %xmm15-%xmm6 ++ # Occupy 2 slots each ++ for (my $reg_idx = 15; $reg_idx >= 6; $reg_idx--) { ++ ++ # Scaled-by-16 stack offset ++ my $xmm_reg_offset = ($reg_idx - 6); ++ $code .= <<___; ++ .byte \$L\$${func_name}_seh_save_xmm${reg_idx}-\$L\$${func_name}_seh_begin ++ .byte @{[$UWOP_SAVE_XMM128 | (${reg_idx} << 4)]} ++ .value $xmm_reg_offset ++___ ++ } ++ ++ $code .= <<___; ++ # Frame pointer (occupy 1 slot) ++ .byte \$L\$${func_name}_seh_setfp-\$L\$${func_name}_seh_begin ++ .byte $UWOP_SET_FPREG ++ ++ # Occupy 2 slots, as stack allocation < 512K, but > 128 bytes ++ .byte \$L\$${func_name}_seh_allocstack_xmm-\$L\$${func_name}_seh_begin ++ .byte $UWOP_ALLOC_LARGE ++ .value `($XMM_STORAGE + 8) / 8` ++___ ++ ++ # Metadata for GPR regs ++ # Occupy 1 slot each ++ foreach my $reg ("rsi", "rdi", "r15", "r14", "r13", "r12", "rbp", "rbx") { ++ $code .= <<___; ++ .byte \$L\$${func_name}_seh_push_${reg}-\$L\$${func_name}_seh_begin ++ .byte @{[$UWOP_PUSH_NONVOL | ($UWOP_REG_NUMBER{$reg} << 4)]} ++___ ++ } ++ } ++} ++ ++$code .= <<___; ++.data ++.align 16 ++POLY: .quad 0x0000000000000001, 0xC200000000000000 ++ ++.align 64 ++POLY2: ++ .quad 0x00000001C2000000, 0xC200000000000000 ++ .quad 0x00000001C2000000, 0xC200000000000000 ++ .quad 0x00000001C2000000, 0xC200000000000000 ++ .quad 0x00000001C2000000, 0xC200000000000000 ++ 
++.align 16 ++TWOONE: .quad 0x0000000000000001, 0x0000000100000000 ++ ++# ;;; Order of these constants should not change. ++# ;;; More specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F ++.align 64 ++SHUF_MASK: ++ .quad 0x08090A0B0C0D0E0F, 0x0001020304050607 ++ .quad 0x08090A0B0C0D0E0F, 0x0001020304050607 ++ .quad 0x08090A0B0C0D0E0F, 0x0001020304050607 ++ .quad 0x08090A0B0C0D0E0F, 0x0001020304050607 ++ ++.align 16 ++SHIFT_MASK: ++ .quad 0x0706050403020100, 0x0f0e0d0c0b0a0908 ++ ++ALL_F: ++ .quad 0xffffffffffffffff, 0xffffffffffffffff ++ ++ZERO: ++ .quad 0x0000000000000000, 0x0000000000000000 ++ ++.align 16 ++ONE: ++ .quad 0x0000000000000001, 0x0000000000000000 ++ ++.align 16 ++ONEf: ++ .quad 0x0000000000000000, 0x0100000000000000 ++ ++.align 64 ++ddq_add_1234: ++ .quad 0x0000000000000001, 0x0000000000000000 ++ .quad 0x0000000000000002, 0x0000000000000000 ++ .quad 0x0000000000000003, 0x0000000000000000 ++ .quad 0x0000000000000004, 0x0000000000000000 ++ ++.align 64 ++ddq_add_5678: ++ .quad 0x0000000000000005, 0x0000000000000000 ++ .quad 0x0000000000000006, 0x0000000000000000 ++ .quad 0x0000000000000007, 0x0000000000000000 ++ .quad 0x0000000000000008, 0x0000000000000000 ++ ++.align 64 ++ddq_add_4444: ++ .quad 0x0000000000000004, 0x0000000000000000 ++ .quad 0x0000000000000004, 0x0000000000000000 ++ .quad 0x0000000000000004, 0x0000000000000000 ++ .quad 0x0000000000000004, 0x0000000000000000 ++ ++.align 64 ++ddq_add_8888: ++ .quad 0x0000000000000008, 0x0000000000000000 ++ .quad 0x0000000000000008, 0x0000000000000000 ++ .quad 0x0000000000000008, 0x0000000000000000 ++ .quad 0x0000000000000008, 0x0000000000000000 ++ ++.align 64 ++ddq_addbe_1234: ++ .quad 0x0000000000000000, 0x0100000000000000 ++ .quad 0x0000000000000000, 0x0200000000000000 ++ .quad 0x0000000000000000, 0x0300000000000000 ++ .quad 0x0000000000000000, 0x0400000000000000 ++ ++.align 64 ++ddq_addbe_4444: ++ .quad 0x0000000000000000, 0x0400000000000000 ++ .quad 0x0000000000000000, 0x0400000000000000 ++ .quad 0x0000000000000000, 0x0400000000000000 ++ .quad 0x0000000000000000, 0x0400000000000000 ++ ++.align 64 ++byte_len_to_mask_table: ++ .value 0x0000, 0x0001, 0x0003, 0x0007 ++ .value 0x000f, 0x001f, 0x003f, 0x007f ++ .value 0x00ff, 0x01ff, 0x03ff, 0x07ff ++ .value 0x0fff, 0x1fff, 0x3fff, 0x7fff ++ .value 0xffff ++ ++.align 64 ++byte64_len_to_mask_table: ++ .quad 0x0000000000000000, 0x0000000000000001 ++ .quad 0x0000000000000003, 0x0000000000000007 ++ .quad 0x000000000000000f, 0x000000000000001f ++ .quad 0x000000000000003f, 0x000000000000007f ++ .quad 0x00000000000000ff, 0x00000000000001ff ++ .quad 0x00000000000003ff, 0x00000000000007ff ++ .quad 0x0000000000000fff, 0x0000000000001fff ++ .quad 0x0000000000003fff, 0x0000000000007fff ++ .quad 0x000000000000ffff, 0x000000000001ffff ++ .quad 0x000000000003ffff, 0x000000000007ffff ++ .quad 0x00000000000fffff, 0x00000000001fffff ++ .quad 0x00000000003fffff, 0x00000000007fffff ++ .quad 0x0000000000ffffff, 0x0000000001ffffff ++ .quad 0x0000000003ffffff, 0x0000000007ffffff ++ .quad 0x000000000fffffff, 0x000000001fffffff ++ .quad 0x000000003fffffff, 0x000000007fffffff ++ .quad 0x00000000ffffffff, 0x00000001ffffffff ++ .quad 0x00000003ffffffff, 0x00000007ffffffff ++ .quad 0x0000000fffffffff, 0x0000001fffffffff ++ .quad 0x0000003fffffffff, 0x0000007fffffffff ++ .quad 0x000000ffffffffff, 0x000001ffffffffff ++ .quad 0x000003ffffffffff, 0x000007ffffffffff ++ .quad 0x00000fffffffffff, 0x00001fffffffffff ++ .quad 0x00003fffffffffff, 0x00007fffffffffff ++ .quad 0x0000ffffffffffff, 
0x0001ffffffffffff ++ .quad 0x0003ffffffffffff, 0x0007ffffffffffff ++ .quad 0x000fffffffffffff, 0x001fffffffffffff ++ .quad 0x003fffffffffffff, 0x007fffffffffffff ++ .quad 0x00ffffffffffffff, 0x01ffffffffffffff ++ .quad 0x03ffffffffffffff, 0x07ffffffffffffff ++ .quad 0x0fffffffffffffff, 0x1fffffffffffffff ++ .quad 0x3fffffffffffffff, 0x7fffffffffffffff ++ .quad 0xffffffffffffffff ++___ ++ ++} else { ++# Fallback for old assembler ++$code .= <<___; ++.text ++.globl ossl_vaes_vpclmulqdq_capable ++.type ossl_vaes_vpclmulqdq_capable,\@abi-omnipotent ++ossl_vaes_vpclmulqdq_capable: ++ xor %eax,%eax ++ ret ++.size ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable ++ ++.globl ossl_aes_gcm_init_avx512 ++.globl ossl_aes_gcm_setiv_avx512 ++.globl ossl_aes_gcm_update_aad_avx512 ++.globl ossl_aes_gcm_encrypt_avx512 ++.globl ossl_aes_gcm_decrypt_avx512 ++.globl ossl_aes_gcm_finalize_avx512 ++.globl ossl_gcm_gmult_avx512 ++ ++.type ossl_aes_gcm_init_avx512,\@abi-omnipotent ++ossl_aes_gcm_init_avx512: ++ossl_aes_gcm_setiv_avx512: ++ossl_aes_gcm_update_aad_avx512: ++ossl_aes_gcm_encrypt_avx512: ++ossl_aes_gcm_decrypt_avx512: ++ossl_aes_gcm_finalize_avx512: ++ossl_gcm_gmult_avx512: ++ .byte 0x0f,0x0b # ud2 ++ ret ++.size ossl_aes_gcm_init_avx512, .-ossl_aes_gcm_init_avx512 ++___ ++} ++ ++$code =~ s/\`([^\`]*)\`/eval $1/gem; ++print $code; ++close STDOUT or die "error closing STDOUT: $!"; +diff --git a/crypto/modes/build.info b/crypto/modes/build.info +index f3558fa1a4..9362048b6c 100644 +--- a/crypto/modes/build.info ++++ b/crypto/modes/build.info +@@ -4,7 +4,7 @@ $MODESASM= + IF[{- !$disabled{asm} -}] + $MODESASM_x86=ghash-x86.S + $MODESDEF_x86=GHASH_ASM +- $MODESASM_x86_64=ghash-x86_64.s aesni-gcm-x86_64.s ++ $MODESASM_x86_64=ghash-x86_64.s aesni-gcm-x86_64.s aes-gcm-avx512.s + $MODESDEF_x86_64=GHASH_ASM + + # ghash-ia64.s doesn't work on VMS +@@ -66,6 +66,7 @@ GENERATE[ghash-ia64.s]=asm/ghash-ia64.pl + GENERATE[ghash-x86.S]=asm/ghash-x86.pl + GENERATE[ghash-x86_64.s]=asm/ghash-x86_64.pl + GENERATE[aesni-gcm-x86_64.s]=asm/aesni-gcm-x86_64.pl ++GENERATE[aes-gcm-avx512.s]=asm/aes-gcm-avx512.pl + GENERATE[ghash-sparcv9.S]=asm/ghash-sparcv9.pl + INCLUDE[ghash-sparcv9.o]=.. + GENERATE[ghash-alpha.S]=asm/ghash-alpha.pl +diff --git a/include/crypto/modes.h b/include/crypto/modes.h +index 19f9d85959..7e3060084f 100644 +--- a/include/crypto/modes.h ++++ b/include/crypto/modes.h +@@ -118,8 +118,8 @@ struct gcm128_context { + size_t t[16 / sizeof(size_t)]; + } Yi, EKi, EK0, len, Xi, H; + /* +- * Relative position of Xi, H and pre-computed Htable is used in some +- * assembler modules, i.e. don't change the order! ++ * Relative position of Yi, EKi, EK0, len, Xi, H and pre-computed Htable is ++ * used in some assembler modules, i.e. don't change the order! + */ + #if TABLE_BITS==8 + u128 Htable[256]; +diff --git a/providers/implementations/ciphers/cipher_aes_gcm_hw_aesni.inc b/providers/implementations/ciphers/cipher_aes_gcm_hw_aesni.inc +index e6aa0479dd..4d2b74af88 100644 +--- a/providers/implementations/ciphers/cipher_aes_gcm_hw_aesni.inc ++++ b/providers/implementations/ciphers/cipher_aes_gcm_hw_aesni.inc +@@ -31,8 +31,17 @@ static const PROV_GCM_HW aesni_gcm = { + ossl_gcm_one_shot + }; + ++#include "cipher_aes_gcm_hw_vaes_avx512.inc" ++ + const PROV_GCM_HW *ossl_prov_aes_hw_gcm(size_t keybits) + { +- return AESNI_CAPABLE ? 
&aesni_gcm : &aes_gcm; ++#ifdef VAES_GCM_ENABLED ++ if (ossl_vaes_vpclmulqdq_capable()) ++ return &vaes_gcm; ++ else ++#endif ++ if (AESNI_CAPABLE) ++ return &aesni_gcm; ++ else ++ return &aes_gcm; + } +- +diff --git a/providers/implementations/ciphers/cipher_aes_gcm_hw_vaes_avx512.inc b/providers/implementations/ciphers/cipher_aes_gcm_hw_vaes_avx512.inc +new file mode 100644 +index 0000000000..8f279d0c7f +--- /dev/null ++++ b/providers/implementations/ciphers/cipher_aes_gcm_hw_vaes_avx512.inc +@@ -0,0 +1,205 @@ ++/* ++ * Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. ++ * Copyright (c) 2021, Intel Corporation. All Rights Reserved. ++ * ++ * Licensed under the Apache License 2.0 (the "License"). You may not use ++ * this file except in compliance with the License. You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ */ ++ ++/*- ++ * AVX512 VAES + VPCLMULDQD support for AES GCM. ++ * This file is included by cipher_aes_gcm_hw_aesni.inc ++ */ ++ ++#undef VAES_GCM_ENABLED ++#if (defined(__x86_64) || defined(__x86_64__) || \ ++ defined(_M_AMD64) || defined(_M_X64)) ++# define VAES_GCM_ENABLED ++ ++/* Returns non-zero when AVX512F + VAES + VPCLMULDQD combination is available */ ++int ossl_vaes_vpclmulqdq_capable(void); ++ ++# define OSSL_AES_GCM_UPDATE(direction) \ ++ void ossl_aes_gcm_ ## direction ## _avx512(const void *ks, \ ++ void *gcm128ctx, \ ++ unsigned int *pblocklen, \ ++ const unsigned char *in, \ ++ size_t len, \ ++ unsigned char *out); ++ ++OSSL_AES_GCM_UPDATE(encrypt) ++OSSL_AES_GCM_UPDATE(decrypt) ++ ++void ossl_aes_gcm_init_avx512(const void *ks, void *gcm128ctx); ++void ossl_aes_gcm_setiv_avx512(const void *ks, void *gcm128ctx, ++ const unsigned char *iv, size_t ivlen); ++void ossl_aes_gcm_update_aad_avx512(void *gcm128ctx, const unsigned char *aad, ++ size_t aadlen); ++void ossl_aes_gcm_finalize_avx512(void *gcm128ctx, unsigned int pblocklen); ++ ++void ossl_gcm_gmult_avx512(u64 Xi[2], const void *gcm128ctx); ++ ++static int vaes_gcm_setkey(PROV_GCM_CTX *ctx, const unsigned char *key, ++ size_t keylen) ++{ ++ GCM128_CONTEXT *gcmctx = &ctx->gcm; ++ PROV_AES_GCM_CTX *actx = (PROV_AES_GCM_CTX *)ctx; ++ AES_KEY *ks = &actx->ks.ks; ++ ++ ctx->ks = ks; ++ aesni_set_encrypt_key(key, keylen * 8, ks); ++ memset(gcmctx, 0, sizeof(*gcmctx)); ++ gcmctx->key = ks; ++ ctx->key_set = 1; ++ ++ ossl_aes_gcm_init_avx512(ks, gcmctx); ++ ++ return 1; ++} ++ ++static int vaes_gcm_setiv(PROV_GCM_CTX *ctx, const unsigned char *iv, ++ size_t ivlen) ++{ ++ GCM128_CONTEXT *gcmctx = &ctx->gcm; ++ ++ gcmctx->Yi.u[0] = 0; /* Current counter */ ++ gcmctx->Yi.u[1] = 0; ++ gcmctx->Xi.u[0] = 0; /* AAD hash */ ++ gcmctx->Xi.u[1] = 0; ++ gcmctx->len.u[0] = 0; /* AAD length */ ++ gcmctx->len.u[1] = 0; /* Message length */ ++ gcmctx->ares = 0; ++ gcmctx->mres = 0; ++ ++ /* IV is limited by 2^64 bits, thus 2^61 bytes */ ++ if (ivlen > (U64(1) << 61)) ++ return 0; ++ ++ ossl_aes_gcm_setiv_avx512(ctx->ks, gcmctx, iv, ivlen); ++ ++ return 1; ++} ++ ++static int vaes_gcm_aadupdate(PROV_GCM_CTX *ctx, ++ const unsigned char *aad, ++ size_t aad_len) ++{ ++ GCM128_CONTEXT *gcmctx = &ctx->gcm; ++ u64 alen = gcmctx->len.u[0]; ++ unsigned int ares; ++ size_t i, lenBlks; ++ ++ /* Bad sequence: call of AAD update after message processing */ ++ if (gcmctx->len.u[1] > 0) ++ return 0; ++ ++ alen += aad_len; ++ /* AAD is limited by 2^64 bits, thus 2^61 bytes */ ++ if ((alen > (U64(1) << 61)) || (alen < aad_len)) ++ return 0; ++ ++ 
gcmctx->len.u[0] = alen; ++ ++ ares = gcmctx->ares; ++ /* Partial AAD block left from previous AAD update calls */ ++ if (ares > 0) { ++ /* ++ * Fill partial block buffer till full block ++ * (note, the hash is stored reflected) ++ */ ++ while (ares > 0 && aad_len > 0) { ++ gcmctx->Xi.c[15 - ares] ^= *(aad++); ++ --aad_len; ++ ares = (ares + 1) % AES_BLOCK_SIZE; ++ } ++ /* Full block gathered */ ++ if (ares == 0) { ++ ossl_gcm_gmult_avx512(gcmctx->Xi.u, gcmctx); ++ } else { /* no more AAD */ ++ gcmctx->ares = ares; ++ return 1; ++ } ++ } ++ ++ /* Bulk AAD processing */ ++ lenBlks = aad_len & ((size_t)(-AES_BLOCK_SIZE)); ++ if (lenBlks > 0) { ++ ossl_aes_gcm_update_aad_avx512(gcmctx, aad, lenBlks); ++ aad += lenBlks; ++ aad_len -= lenBlks; ++ } ++ ++ /* Add remaining AAD to the hash (note, the hash is stored reflected) */ ++ if (aad_len > 0) { ++ ares = aad_len; ++ for (i = 0; i < aad_len; i++) ++ gcmctx->Xi.c[15 - i] ^= aad[i]; ++ } ++ ++ gcmctx->ares = ares; ++ ++ return 1; ++} ++ ++static int vaes_gcm_cipherupdate(PROV_GCM_CTX *ctx, const unsigned char *in, ++ size_t len, unsigned char *out) ++{ ++ GCM128_CONTEXT *gcmctx = &ctx->gcm; ++ u64 mlen = gcmctx->len.u[1]; ++ ++ mlen += len; ++ if (mlen > ((U64(1) << 36) - 32) || (mlen < len)) ++ return 0; ++ ++ gcmctx->len.u[1] = mlen; ++ ++ /* Finalize GHASH(AAD) if AAD partial blocks left unprocessed */ ++ if (gcmctx->ares > 0) { ++ ossl_gcm_gmult_avx512(gcmctx->Xi.u, gcmctx); ++ gcmctx->ares = 0; ++ } ++ ++ if (ctx->enc) ++ ossl_aes_gcm_encrypt_avx512(ctx->ks, gcmctx, &gcmctx->mres, in, len, out); ++ else ++ ossl_aes_gcm_decrypt_avx512(ctx->ks, gcmctx, &gcmctx->mres, in, len, out); ++ ++ return 1; ++} ++ ++static int vaes_gcm_cipherfinal(PROV_GCM_CTX *ctx, unsigned char *tag) ++{ ++ GCM128_CONTEXT *gcmctx = &ctx->gcm; ++ unsigned int *res = &gcmctx->mres; ++ ++ /* Finalize AAD processing */ ++ if (gcmctx->ares > 0) ++ res = &gcmctx->ares; ++ ++ ossl_aes_gcm_finalize_avx512(gcmctx, *res); ++ ++ if (ctx->enc) { ++ ctx->taglen = GCM_TAG_MAX_SIZE; ++ memcpy(tag, gcmctx->Xi.c, ++ ctx->taglen <= sizeof(gcmctx->Xi.c) ? ctx->taglen : ++ sizeof(gcmctx->Xi.c)); ++ *res = 0; ++ } else { ++ return !CRYPTO_memcmp(gcmctx->Xi.c, tag, ctx->taglen); ++ } ++ ++ return 1; ++} ++ ++static const PROV_GCM_HW vaes_gcm = { ++ vaes_gcm_setkey, ++ vaes_gcm_setiv, ++ vaes_gcm_aadupdate, ++ vaes_gcm_cipherupdate, ++ vaes_gcm_cipherfinal, ++ ossl_gcm_one_shot ++}; ++ ++#endif +-- +2.39.2 + diff -Nru openssl-3.0.10/debian/patches/series openssl-3.0.10/debian/patches/series --- openssl-3.0.10/debian/patches/series 2023-08-02 03:16:27.000000000 +0000 +++ openssl-3.0.10/debian/patches/series 2023-08-08 14:02:26.000000000 +0000 @@ -12,3 +12,6 @@ tests-use-seclevel-1.patch tls1.2-min-seclevel2.patch skip_tls1.1_seclevel3_tests.patch + +intel/0001-Dual-1536-2048-bit-exponentiation-optimization-for-I.patch +intel/0002-AES-GCM-enabled-with-AVX512-vAES-and-vPCLMULQDQ.patch
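
For the 96-bit IV fast path in GCM_INIT_IV above (the 0xfff load mask combined with the ONEf constant), the resulting counter block is simply IV || 0x00000001, as in the GCM specification. A small stand-alone C sketch of that layout, for illustration only and not code taken from the patch:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* J0 for a 12-byte (96-bit) IV: the masked vmovdqu8 in GCM_INIT_IV
     * copies the 12 IV bytes over the low lanes of ONEf, leaving the block
     * IV[0..11] || 00 00 00 01 (a big-endian 32-bit counter value of 1). */
    static void gcm_j0_96bit(uint8_t j0[16], const uint8_t iv[12])
    {
        memcpy(j0, iv, 12);
        j0[12] = j0[13] = j0[14] = 0;
        j0[15] = 1;
    }

    int main(void)
    {
        uint8_t iv[12] = { 0 }, j0[16];

        gcm_j0_96bit(j0, iv);
        for (int i = 0; i < 16; i++)
            printf("%02x%s", j0[i], i == 15 ? "\n" : " ");
        return 0;
    }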
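
The "add $15 / shr $4" pair on the .L_message_below_equal_16_blocks path above computes the block count handed to GCM_ENC_DEC_SMALL, i.e. a ceiling division by the 16-byte block size so that a trailing partial block counts as one more block. A trivial stand-alone check of that identity, for illustration:

    #include <stddef.h>
    #include <stdio.h>

    /* ceil(len / 16), matching the "add $15 ; shr $4" sequence used before
     * GCM_ENC_DEC_SMALL: a trailing partial block adds one block. */
    static size_t gcm_num_blocks(size_t len)
    {
        return (len + 15) >> 4;
    }

    int main(void)
    {
        const size_t samples[] = { 0, 1, 15, 16, 17, 255, 256 };

        for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
            printf("len %3zu -> %zu block(s)\n", samples[i],
                   gcm_num_blocks(samples[i]));
        return 0;
    }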
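
The counter preparation in INITIAL_BLOCKS_16 keeps the counter block in big-endian byte order and advances it with lane-wise dword adds of the ddq_addbe_* constants, which cannot propagate a carry out of the low counter byte; the "cmpb $(256 - 16)" guard therefore switches to the byte-swap / add / swap-back path whenever that byte could wrap within the next 16 blocks. A small scalar C model of the reference behaviour both paths must reproduce, for illustration only and not code from the patch:

    #include <stdint.h>
    #include <stdio.h>

    /* Reference inc32 from the GCM spec: only the low 32 bits of the
     * 16-byte big-endian counter block are incremented (mod 2^32). */
    static void gcm_inc32(uint8_t ctr[16])
    {
        for (int i = 15; i >= 12; i--)
            if (++ctr[i] != 0)      /* stop as soon as there is no carry */
                break;
    }

    /* Model of the fast-path guard: adding 1..n directly to the last byte
     * of the big-endian block is only safe while that byte cannot wrap. */
    static int fast_path_ok(const uint8_t ctr[16], unsigned n)
    {
        return ctr[15] < 256 - n;   /* n == 16 in the assembly above */
    }

    int main(void)
    {
        uint8_t ctr[16] = { 0 };

        ctr[15] = 0xee;             /* close to a low-byte wrap */
        for (int i = 0; i < 3; i++) {
            printf("low byte 0x%02x, 16-block fast path ok: %d\n",
                   ctr[15], fast_path_ok(ctr, 16));
            for (int j = 0; j < 16; j++)
                gcm_inc32(ctr);
        }
        return 0;
    }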
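
The byte_len_to_mask_table and byte64_len_to_mask_table constants near the end of the .data section map a residual byte count n to a load/store mask with the n lowest bits set, for use with kmov* and the masked vmovdqu8 forms on partial blocks. The relationship is simply (1 << n) - 1, as this stand-alone check illustrates:

    #include <stdint.h>
    #include <stdio.h>

    /* byte_len_to_mask_table[n] holds a 16-bit mask with the n lowest bits
     * set, n = 0..16; the 64-bit table follows the same pattern for 0..64. */
    static uint16_t byte_len_to_mask(unsigned n)
    {
        return (uint16_t)((1u << n) - 1u);
    }

    int main(void)
    {
        /* First entries of the table in the patch: 0x0000, 0x0001, 0x0003, ... */
        for (unsigned n = 0; n <= 16; n++)
            printf("len %2u -> mask 0x%04x\n", n, (unsigned)byte_len_to_mask(n));
        return 0;
    }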
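
vaes_gcm_aadupdate in cipher_aes_gcm_hw_vaes_avx512.inc keeps the GHASH accumulator byte-reflected, so AAD bytes are folded in at Xi.c[15 - i] and each completed 16-byte block is reduced with a single ossl_gcm_gmult_avx512 call, while whole blocks in the middle go through the bulk ossl_aes_gcm_update_aad_avx512 path. A stripped-down model of just the byte-placement rule; the struct and the gmult stand-in below are hypothetical simplifications, not the patch's types or functions:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    #define AES_BLOCK_SIZE 16

    struct ghash_model {
        uint8_t Xi[AES_BLOCK_SIZE];   /* accumulator, stored byte-reflected */
        unsigned int ares;            /* buffered AAD bytes of current block */
    };

    static void gmult_model(struct ghash_model *g)
    {
        (void)g;    /* hypothetical stand-in for ossl_gcm_gmult_avx512() */
    }

    static void aad_update_model(struct ghash_model *g,
                                 const uint8_t *aad, size_t len)
    {
        while (len > 0) {
            g->Xi[15 - g->ares] ^= *aad++;   /* byte i of a block -> Xi[15 - i] */
            len--;
            g->ares = (g->ares + 1) % AES_BLOCK_SIZE;
            if (g->ares == 0)                /* full block gathered */
                gmult_model(g);
        }
    }

    int main(void)
    {
        struct ghash_model g = { { 0 }, 0 };
        const uint8_t aad[4] = { 0xde, 0xad, 0xbe, 0xef };

        aad_update_model(&g, aad, sizeof(aad));
        printf("buffered %u byte(s); Xi[15..12] = %02x %02x %02x %02x\n",
               g.ares, g.Xi[15], g.Xi[14], g.Xi[13], g.Xi[12]);
        return 0;
    }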