@@ -737,59 +737,6 @@ sub _ghash_update {
 	vmovdqu		($GHASH_ACC_PTR), $GHASH_ACC_XMM
 	vpshufb		$BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
 
-	# Optimize for AADLEN < VL by checking for AADLEN < VL before AADLEN < 4*VL.
-	cmp		\$$VL, $AADLEN
-	jb		.Laad_blockbyblock$local_label_suffix
-
-	# AADLEN >= VL, so we'll operate on full vectors.  Broadcast bswap_mask and
-	# gfpoly to all 128-bit lanes.
-	vshufi64x2	\$0, $BSWAP_MASK, $BSWAP_MASK, $BSWAP_MASK
-	vshufi64x2	\$0, $GFPOLY, $GFPOLY, $GFPOLY
-
-	# Load the lowest set of key powers.
-	vmovdqu8	$OFFSETOFEND_H_POWERS-1*$VL($H_POWERS), $H_POW1
-
-	cmp		\$4*$VL-1, $AADLEN
-	jbe		.Laad_loop_1x$local_label_suffix
-
-	# AADLEN >= 4*VL.  Load the higher key powers.
-	vmovdqu8	$OFFSETOFEND_H_POWERS-4*$VL($H_POWERS), $H_POW4
-	vmovdqu8	$OFFSETOFEND_H_POWERS-3*$VL($H_POWERS), $H_POW3
-	vmovdqu8	$OFFSETOFEND_H_POWERS-2*$VL($H_POWERS), $H_POW2
-
-	# Update GHASH with 4*VL bytes of AAD at a time.
-.Laad_loop_4x$local_label_suffix:
-	vmovdqu8	0*$VL($AAD), $GHASHDATA0
-	vmovdqu8	1*$VL($AAD), $GHASHDATA1
-	vmovdqu8	2*$VL($AAD), $GHASHDATA2
-	vmovdqu8	3*$VL($AAD), $GHASHDATA3
-	@{[ _ghash_4x ]}
-	sub		\$-4*$VL, $AAD	# shorter than 'add 4*VL' when VL=32
-	add		\$-4*$VL, $AADLEN
-	cmp		\$4*$VL-1, $AADLEN
-	ja		.Laad_loop_4x$local_label_suffix
-
-	# Update GHASH with VL bytes of AAD at a time.
-	cmp		\$$VL, $AADLEN
-	jb		.Laad_large_done$local_label_suffix
-.Laad_loop_1x$local_label_suffix:
-	vmovdqu8	($AAD), $GHASHDATA0
-	vpshufb		$BSWAP_MASK, $GHASHDATA0, $GHASHDATA0
-	vpxord		$GHASHDATA0, $GHASH_ACC, $GHASH_ACC
-	@{[ _ghash_mul	$H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY,
-			$GHASHDATA0, $GHASHDATA1, $GHASHDATA2 ]}
-	@{[ _horizontal_xor $GHASH_ACC, $GHASH_ACC_XMM, $GHASH_ACC_XMM,
-			    $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
-	add		\$$VL, $AAD
-	sub		\$$VL, $AADLEN
-	cmp		\$$VL, $AADLEN
-	jae		.Laad_loop_1x$local_label_suffix
-
-.Laad_large_done$local_label_suffix:
-	# Issue the vzeroupper that is needed after using ymm or zmm registers.
-	# Do it here instead of at the end, to minimize overhead for small AADLEN.
-	vzeroupper
-
 	# GHASH the remaining data 16 bytes at a time, using xmm registers only.
 .Laad_blockbyblock$local_label_suffix:
 	test		$AADLEN, $AADLEN
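A note on the removed 4x loop's `sub \$-4*$VL, $AAD`: with VL=32 the stride is 4*VL = 128 bytes, and while +128 does not fit in a sign-extended 8-bit immediate, -128 does, so subtracting the negated constant encodes three bytes shorter than the equivalent `add` (imm8 vs. imm32). A minimal Perl sketch of that range check; `fits_imm8` is a hypothetical helper, not part of this file:

    use strict;
    use warnings;

    # x86-64 ALU instructions can sign-extend an 8-bit immediate (-128..127);
    # anything outside that range forces the 4-byte imm32 encoding.
    sub fits_imm8 { my ($imm) = @_; return $imm >= -128 && $imm <= 127; }

    my $stride = 4 * 32;    # 4*VL with VL=32
    printf("add %4d: fits imm8? %s\n", $stride,  fits_imm8($stride)  ? "yes" : "no");
    printf("sub %4d: fits imm8? %s\n", -$stride, fits_imm8(-$stride) ? "yes" : "no");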
@@ -801,9 +748,6 @@ sub _ghash_update {
 	vpxor		$GHASHDATA0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
 	@{[ _ghash_mul	$H_POW1_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM,
 			$GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
-	add		\$16, $AAD
-	sub		\$16, $AADLEN
-	jnz		.Laad_loop_blockbyblock$local_label_suffix
 
 .Laad_done$local_label_suffix:
 	# Store the updated GHASH accumulator back to memory.
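What survives after this hunk is the plain GHASH recurrence: for each 16-byte block, acc = (acc XOR block) * H in GF(2^128). As a reference model for what each `_ghash_mul` call computes, here is a bit-at-a-time sketch in plain Perl. It uses the non-reflected polynomial convention for readability, whereas the assembly works on byte-swapped, bit-reflected operands via VPCLMULQDQ; `gf128_mul` and `ghash_block` are illustrative names, not functions from this file:

    use strict;
    use warnings;
    use Math::BigInt;

    my $ONE  = Math::BigInt->bone();
    my $MASK = Math::BigInt->bone()->blsft(128)->bsub(1);    # 2^128 - 1
    my $POLY = Math::BigInt->new(0x87);                      # x^7 + x^2 + x + 1

    # Shift-and-xor multiplication in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1.
    # Illustrative only: the assembly uses carryless multiplies plus a folded
    # reduction instead of this bit loop.
    sub gf128_mul {
        my ($x, $y) = @_;
        my $r = Math::BigInt->bzero();
        for my $i (reverse 0 .. 127) {
            $r->blsft(1);                                    # r *= x
            $r->bxor($POLY)->band($MASK)                     # fold bit 128 back in
              if $r->copy()->brsft(128)->is_one();
            $r->bxor($x)                                     # add x if bit i of y set
              if $y->copy()->brsft($i)->band($ONE)->is_one();
        }
        return $r;
    }

    # One GHASH step: absorb a 16-byte block into the accumulator.
    sub ghash_block {
        my ($acc, $h, $block) = @_;
        return gf128_mul($acc->copy()->bxor($block), $h);
    }

The deleted wide loops were an unrolling of this same recurrence: with precomputed key powers H^2, H^3, ... loaded from $H_POWERS, several independent multiplications can proceed in parallel before one horizontal fold, which is algebraically identical to repeated `ghash_block` calls.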
@@ -1303,31 +1247,6 @@ sub _aes_gcm_update {
 	return $code;
 }
 
-# void gcm_gmult_vpclmulqdq_avx10(uint8_t Xi[16], const u128 Htable[16]);
-$code .= _begin_func "gcm_gmult_vpclmulqdq_avx10", 1;
-{
-	my ( $GHASH_ACC_PTR, $H_POWERS ) = @argregs[ 0 .. 1 ];
-	my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
-	  map( "%xmm$_", ( 0 .. 6 ) );
-
-	$code .= <<___;
-	@{[ _save_xmmregs (6) ]}
-	.seh_endprologue
-
-	vmovdqu		($GHASH_ACC_PTR), $GHASH_ACC
-	vmovdqu		.Lbswap_mask(%rip), $BSWAP_MASK
-	vmovdqu		$OFFSETOFEND_H_POWERS-16($H_POWERS), $H_POW1
-	vmovdqu		.Lgfpoly(%rip), $GFPOLY
-	vpshufb		$BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
-
-	@{[ _ghash_mul	$H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $T0, $T1, $T2 ]}
-
-	vpshufb		$BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
-	vmovdqu		$GHASH_ACC, ($GHASH_ACC_PTR)
-___
-}
-$code .= _end_func;
-
 # Disabled until significant deployment of AVX10/256 is seen. The separate
 # *_vaes_avx2 implementation provides the only 256-bit support for now.
 #
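For reference, the `gcm_gmult_vpclmulqdq_avx10` function deleted above performed a single in-place multiplication, Xi = Xi * H, with the two `vpshufb` instructions converting between the big-endian byte order of Xi in memory and the representation used for the multiply. In terms of the sketch above (reusing its hypothetical `gf128_mul`, and again glossing over bit reflection), it amounts to:

    # One-shot gmult in the reference model: load the 16-byte Xi, multiply by
    # H once, store it back. Hypothetical helper, not the removed assembly.
    sub gmult {
        my ($xi_bytes, $h) = @_;               # 16-byte string, Math::BigInt H
        my $xi = Math::BigInt->from_hex(unpack 'H32', $xi_bytes);
        $xi = gf128_mul($xi, $h);
        my $hex = substr($xi->as_hex(), 2);    # drop the leading "0x"
        return pack 'H32', substr(('0' x 32) . $hex, -32);
    }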
@@ -1353,7 +1272,7 @@ sub _aes_gcm_update {
 $code .= _aes_gcm_init;
 $code .= _end_func;
 
-$code .= _begin_func "gcm_ghash_vpclmulqdq_avx10_512", 1;
+$code .= _begin_func "gcm_ghash_vpclmulqdq_avx10_512_1", 1;
 $code .= _ghash_update;
 $code .= _end_func;
 