@@ -588,18 +588,24 @@ sub _ghash_4x {
     return $code;
 }
 
-# void gcm_gmult_vpclmulqdq_avx512(uint8_t Xi[16], const u128 Htable[16]);
-$code .= _begin_func "gcm_gmult_vpclmulqdq_avx512", 1;
+# void gcm_ghash_vpclmulqdq_avx512_16(uint8_t Xi[16], const u128 Htable[16],
+#                                     const uint8_t aad[16], size_t aad_len_16);
+$code .= _begin_func "gcm_ghash_vpclmulqdq_avx512_16", 1;
 {
-    my ( $GHASH_ACC_PTR, $HTABLE ) = @argregs[ 0 .. 1 ];
+    my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AAD_LEN_16 ) = @argregs[ 0 .. 3 ];
     my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
       map( "%xmm$_", ( 0 .. 6 ) );
 
     $code .= <<___;
     @{[ _save_xmmregs (6) ]}
     .seh_endprologue
 
+    # Load the GHASH accumulator.
     vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC
+
+    # XOR the AAD into the accumulator.
+    vpxor ($AAD), $GHASH_ACC, $GHASH_ACC
+
     vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK
     vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1
     vmovdqu .Lgfpoly(%rip), $GFPOLY
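
For reference, the update that both this entry point and the bulk routine removed in the next hunk compute is Xi = (Xi xor block) * H in GF(2^128), once per 16-byte block, where H is the hash key whose powers are precomputed in Htable. The following plain-Perl sketch models that math in the NIST SP 800-38D bit-reflected convention, treating each 16-byte block as a big-endian Math::BigInt. It is only an illustrative model, not the perlasm above: the helper names gf128_mul and ghash_update are invented here, and the vpshufb byte swaps in the assembly are an x86 lane-ordering detail that the integer view does not need. The new gcm_ghash_vpclmulqdq_avx512_16 corresponds to a single iteration of the loop, with the lone AAD block as the data.

    use strict;
    use warnings;
    use Math::BigInt;

    # Multiply two GF(2^128) elements in GCM's bit-reflected convention
    # (NIST SP 800-38D, Algorithm 1). Inputs are Math::BigInt values holding
    # a 16-byte block read as a big-endian integer; neither input is modified.
    sub gf128_mul {
        my ( $x, $y ) = @_;
        my $r = Math::BigInt->from_hex( "e1" . "00" x 15 );  # x^128 + x^7 + x^2 + x + 1
        my $z = Math::BigInt->new(0);
        my $v = $x->copy;
        for my $i ( 0 .. 127 ) {
            # Add V into the result if bit i of y (counting from the MSB) is set.
            $z->bxor($v) if $y->copy->brsft( 127 - $i )->band(1)->is_one;
            # Multiply V by x, reducing by the GCM polynomial on overflow.
            my $lsb = $v->copy->band(1)->is_one;
            $v->brsft(1);
            $v->bxor($r) if $lsb;
        }
        return $z;
    }

    # Absorb $data (whose length must be a multiple of 16) into the GHASH
    # accumulator: for each 16-byte block B, Xi = (Xi xor B) * H.
    sub ghash_update {
        my ( $h, $xi, $data ) = @_;    # $h, $xi: 32-character hex strings
        ( $h, $xi ) = map { Math::BigInt->from_hex($_) } ( $h, $xi );
        for ( my $off = 0 ; $off < length($data) ; $off += 16 ) {
            my $block =
              Math::BigInt->from_hex( unpack( "H32", substr( $data, $off, 16 ) ) );
            $xi = gf128_mul( $xi->bxor($block), $h );
        }
        ( my $hex = $xi->as_hex ) =~ s/^0x//;
        return ( "0" x ( 32 - length $hex ) ) . $hex;
    }

    # Illustrative call only; the values are arbitrary placeholders, not test
    # vectors. In GCM, H would be AES_K(0^128) and Xi the running tag state.
    print ghash_update( "aa" x 16, "00" x 16, "\x01" x 32 ), "\n";
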
@@ -615,127 +621,6 @@ sub _ghash_4x {
 }
 $code .= _end_func;
 
-# void gcm_ghash_vpclmulqdq_avx512(uint8_t Xi[16], const u128 Htable[16],
-#                                  const uint8_t *in, size_t len);
-#
-# Using the key |Htable|, update the GHASH accumulator |Xi| with the data given
-# by |in| and |len|. |len| must be a multiple of 16.
-#
-# This function handles large amounts of AAD efficiently, while also keeping the
-# overhead low for small amounts of AAD which is the common case. TLS uses less
-# than one block of AAD, but (uncommonly) other use cases may use much more.
-$code .= _begin_func "gcm_ghash_vpclmulqdq_avx512", 1;
-{
-    # Function arguments
-    my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ];
-
-    # Additional local variables
-    my ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( "%zmm0", "%xmm0" );
-    my ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( "%zmm1", "%xmm1" );
-    my ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( "%zmm2", "%xmm2" );
-    my ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( "%zmm3", "%xmm3" );
-    my @GHASHDATA = ( $GHASHDATA0, $GHASHDATA1, $GHASHDATA2, $GHASHDATA3 );
-    my @GHASHDATA_XMM =
-      ( $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM, $GHASHDATA3_XMM );
-    my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%zmm4", "%xmm4" );
-    my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%zmm5", "%xmm5" );
-    my ( $H_POW4, $H_POW3, $H_POW2 ) = ( "%zmm6", "%zmm7", "%zmm8" );
-    my ( $H_POW1, $H_POW1_XMM ) = ( "%zmm9", "%xmm9" );
-    my ( $GFPOLY, $GFPOLY_XMM ) = ( "%zmm10", "%xmm10" );
-    my ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) =
-      ( "%zmm11", "%zmm12", "%zmm13" );
-
-    $code .= <<___;
-    @{[ _save_xmmregs (6 .. 13) ]}
-    .seh_endprologue
-
-    # Load the bswap_mask and gfpoly constants. Since AADLEN is usually small,
-    # usually only 128-bit vectors will be used. So as an optimization, don't
-    # broadcast these constants to all 128-bit lanes quite yet.
-    vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK_XMM
-    vmovdqu .Lgfpoly(%rip), $GFPOLY_XMM
-
-    # Load the GHASH accumulator.
-    vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM
-    vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
-
-    # Optimize for AADLEN < 64 by checking for AADLEN < 64 before AADLEN < 256.
-    cmp \$64, $AADLEN
-    jb .Laad_blockbyblock
-
-    # AADLEN >= 64, so we'll operate on full vectors. Broadcast bswap_mask and
-    # gfpoly to all 128-bit lanes.
-    vshufi64x2 \$0, $BSWAP_MASK, $BSWAP_MASK, $BSWAP_MASK
-    vshufi64x2 \$0, $GFPOLY, $GFPOLY, $GFPOLY
-
-    # Load the lowest set of key powers.
-    vmovdqu8 $OFFSETOFEND_H_POWERS-1*64($HTABLE), $H_POW1
-
-    cmp \$256, $AADLEN
-    jb .Laad_loop_1x
-
-    # AADLEN >= 256. Load the higher key powers.
-    vmovdqu8 $OFFSETOFEND_H_POWERS-4*64($HTABLE), $H_POW4
-    vmovdqu8 $OFFSETOFEND_H_POWERS-3*64($HTABLE), $H_POW3
-    vmovdqu8 $OFFSETOFEND_H_POWERS-2*64($HTABLE), $H_POW2
-
-    # Update GHASH with 256 bytes of AAD at a time.
-.Laad_loop_4x:
-    vmovdqu8 0*64($AAD), $GHASHDATA0
-    vmovdqu8 1*64($AAD), $GHASHDATA1
-    vmovdqu8 2*64($AAD), $GHASHDATA2
-    vmovdqu8 3*64($AAD), $GHASHDATA3
-    @{[ _ghash_4x $BSWAP_MASK, @GHASHDATA, @GHASHDATA_XMM, $H_POW4, $H_POW3,
-                  $H_POW2, $H_POW1, $GFPOLY, $GHASHTMP0, $GHASHTMP1,
-                  $GHASHTMP2, $GHASH_ACC, $GHASH_ACC_XMM ]}
-    add \$256, $AAD
-    sub \$256, $AADLEN
-    cmp \$256, $AADLEN
-    jae .Laad_loop_4x
-
-    # Update GHASH with 64 bytes of AAD at a time.
-    cmp \$64, $AADLEN
-    jb .Laad_large_done
-.Laad_loop_1x:
-    vmovdqu8 ($AAD), $GHASHDATA0
-    vpshufb $BSWAP_MASK, $GHASHDATA0, $GHASHDATA0
-    vpxord $GHASHDATA0, $GHASH_ACC, $GHASH_ACC
-    @{[ _ghash_mul $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY,
-                   $GHASHDATA0, $GHASHDATA1, $GHASHDATA2 ]}
-    @{[ _horizontal_xor $GHASH_ACC, $GHASH_ACC_XMM, $GHASH_ACC_XMM,
-                        $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
-    add \$64, $AAD
-    sub \$64, $AADLEN
-    cmp \$64, $AADLEN
-    jae .Laad_loop_1x
-
-.Laad_large_done:
-
-    # GHASH the remaining data 16 bytes at a time, using xmm registers only.
-.Laad_blockbyblock:
-    test $AADLEN, $AADLEN
-    jz .Laad_done
-    vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1_XMM
-.Laad_loop_blockbyblock:
-    vmovdqu ($AAD), $GHASHDATA0_XMM
-    vpshufb $BSWAP_MASK_XMM, $GHASHDATA0_XMM, $GHASHDATA0_XMM
-    vpxor $GHASHDATA0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
-    @{[ _ghash_mul $H_POW1_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM,
-                   $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
-    add \$16, $AAD
-    sub \$16, $AADLEN
-    jnz .Laad_loop_blockbyblock
-
-.Laad_done:
-    # Store the updated GHASH accumulator back to memory.
-    vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
-    vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
-
-    vzeroupper    # This is needed after using ymm or zmm registers.
-___
-}
-$code .= _end_func;
-
 # Do one non-last round of AES encryption on the counter blocks in aesdata[0-3]
 # using the round key that has been broadcast to all 128-bit lanes of round_key.
 sub _vaesenc_4x {
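
The bulk routine deleted above gets its throughput from aggregated reduction: with several powers of H precomputed in Htable, the .Laad_loop_4x path folds 256 bytes of AAD per iteration and the .Laad_loop_1x path 64 bytes, rather than multiplying by H once per block. The identity behind this, written out for four blocks, is that folding B0..B3 sequentially equals (Acc xor B0)*H^4 xor B1*H^3 xor B2*H^2 xor B3*H, so the per-block dependency chain turns into independent multiplies plus shared reduction work. The snippet below checks that identity; it reuses gf128_mul from the sketch after the first hunk, and all of its values are arbitrary placeholders.

    use Math::BigInt;

    # Assumes gf128_mul() from the sketch after the first hunk is in scope.
    # All values below are arbitrary placeholders.
    my $h   = Math::BigInt->from_hex( "ab" x 16 );
    my $acc = Math::BigInt->from_hex( "01" x 16 );
    my @blk = map { Math::BigInt->new($_) } ( 2, 3, 5, 7 );

    # Folding four blocks one at a time...
    my $seq = $acc->copy;
    $seq = gf128_mul( $seq->bxor($_), $h ) for @blk;

    # ...equals one aggregated pass against precomputed powers of H, which is
    # what lets the vectorized loop multiply blocks independently and share
    # the reduction work across a whole stride.
    my $h2  = gf128_mul( $h,  $h );
    my $h3  = gf128_mul( $h2, $h );
    my $h4  = gf128_mul( $h3, $h );
    my $agg = gf128_mul( $acc->copy->bxor( $blk[0] ), $h4 );
    $agg->bxor( gf128_mul( $blk[1], $h3 ) );
    $agg->bxor( gf128_mul( $blk[2], $h2 ) );
    $agg->bxor( gf128_mul( $blk[3], $h ) );
    print $seq->bcmp($agg) == 0 ? "identity holds\n" : "mismatch\n";
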
@@ -1292,11 +1177,6 @@ sub filter_and_print {
         my $postspace = $+{postspace};
         if (exists $asmMap{$trimmed}) {
             $line = ${prespace} . $asmMap{$trimmed} . ${postspace};
-        } else {
-            if ($trimmed =~ /(vpclmulqdq|vaes).*%[yz]mm/) {
-                die("found instruction not supported under old binutils, please update asmMap with the results of running\n" .
-                    'find target -name "*aes-gcm-avx512*.o" -exec python3 crypto/fipsmodule/aes/asm/make-avx-map-for-old-binutils.py \{\} \; | LC_ALL=C sort | uniq');
-            }
         }
     }
     print $line, "\n";
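
The branch deleted above made the translation die whenever a vpclmulqdq or vaes instruction on ymm/zmm registers was missing from %asmMap, the table that maps an instruction's exact text to a form old binutils can assemble and that the removed message says to regenerate with make-avx-map-for-old-binutils.py. For context, the surrounding filter_and_print logic amounts to the substitution sketched below; the regex and the single map entry are illustrative stand-ins, not the script's actual ones.

    use strict;
    use warnings;

    # Illustrative stand-in only: real entries come from running
    # make-avx-map-for-old-binutils.py over the assembled object files.
    my %asmMap =
      ( 'vpxorq %zmm0,%zmm1,%zmm2' => '.byte 0x00  # placeholder encoding' );

    while ( my $line = <STDIN> ) {
        chomp $line;
        if ( $line =~ /^(?<prespace>\s*)(?<trimmed>\S.*?)(?<postspace>\s*)$/
            && exists $asmMap{ $+{trimmed} } )
        {
            # Keep the surrounding whitespace, swap in the mapped text.
            $line = $+{prespace} . $asmMap{ $+{trimmed} } . $+{postspace};
        }
        print $line, "\n";
    }
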