grub-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH v5 1/5] Import libgcrypt 1.11.0


From: Vladimir Serbinenko
Subject: [PATCH v5 1/5] Import libgcrypt 1.11.0
Date: Sun, 8 Sep 2024 18:43:11 +0300

We currently use an old version of libgcrypt which
results in us having fewer ciphers and missing out on
many other improvements.

Signed-off-by: Vladimir Serbinenko <phcoder@gmail.com>
---
 grub-core/lib/libgcrypt/AUTHORS               |   274 +
 grub-core/lib/libgcrypt/COPYING               |   339 +
 grub-core/lib/libgcrypt/COPYING.LIB           |   502 +
 grub-core/lib/libgcrypt/LICENSES              |   319 +
 grub-core/lib/libgcrypt/README                |   278 +
 grub-core/lib/libgcrypt/README.GIT            |    49 +
 grub-core/lib/libgcrypt/THANKS                |   168 +
 grub-core/lib/libgcrypt/VERSION               |     1 +
 grub-core/lib/libgcrypt/cipher/ChangeLog      |  3990 ---
 grub-core/lib/libgcrypt/cipher/ChangeLog-2011 |    40 +-
 grub-core/lib/libgcrypt/cipher/Makefile.am    |   330 +-
 grub-core/lib/libgcrypt/cipher/Manifest       |    73 -
 grub-core/lib/libgcrypt/cipher/ac.c           |  3301 --
 .../lib/libgcrypt/cipher/arcfour-amd64.S      |   108 +
 grub-core/lib/libgcrypt/cipher/arcfour.c      |    93 +-
 .../libgcrypt/cipher/aria-aesni-avx-amd64.S   |  1440 +
 .../libgcrypt/cipher/aria-aesni-avx2-amd64.S  |  1830 ++
 .../libgcrypt/cipher/aria-gfni-avx512-amd64.S |  1010 +
 grub-core/lib/libgcrypt/cipher/aria.c         |  1768 ++
 .../lib/libgcrypt/cipher/asm-common-aarch64.h |   132 +
 .../lib/libgcrypt/cipher/asm-common-amd64.h   |   213 +
 .../lib/libgcrypt/cipher/asm-common-i386.h    |   161 +
 .../lib/libgcrypt/cipher/asm-common-s390x.h   |    90 +
 .../lib/libgcrypt/cipher/asm-inline-s390x.h   |   205 +
 .../libgcrypt/cipher/asm-poly1305-aarch64.h   |   245 +
 .../lib/libgcrypt/cipher/asm-poly1305-amd64.h |   171 +
 .../lib/libgcrypt/cipher/asm-poly1305-s390x.h |   140 +
 grub-core/lib/libgcrypt/cipher/bithelp.h      |   111 +-
 grub-core/lib/libgcrypt/cipher/blake2.c       |  1086 +
 .../lib/libgcrypt/cipher/blake2b-amd64-avx2.S |   301 +
 .../libgcrypt/cipher/blake2b-amd64-avx512.S   |   429 +
 .../lib/libgcrypt/cipher/blake2s-amd64-avx.S  |   281 +
 .../libgcrypt/cipher/blake2s-amd64-avx512.S   |   397 +
 .../lib/libgcrypt/cipher/blowfish-amd64.S     |   601 +
 grub-core/lib/libgcrypt/cipher/blowfish-arm.S |   743 +
 grub-core/lib/libgcrypt/cipher/blowfish.c     |   802 +-
 grub-core/lib/libgcrypt/cipher/bufhelp.h      |   458 +-
 grub-core/lib/libgcrypt/cipher/bulkhelp.h     |   493 +
 .../libgcrypt/cipher/camellia-aarch64-ce.c    |    42 +
 .../lib/libgcrypt/cipher/camellia-aarch64.S   |   585 +
 .../cipher/camellia-aesni-avx-amd64.S         |  2802 ++
 .../cipher/camellia-aesni-avx2-amd64.S        |    34 +
 .../cipher/camellia-aesni-avx2-amd64.h        |  2327 ++
 grub-core/lib/libgcrypt/cipher/camellia-arm.S |   626 +
 .../cipher/camellia-gfni-avx2-amd64.S         |    34 +
 .../cipher/camellia-gfni-avx512-amd64.S       |  1634 +
 .../lib/libgcrypt/cipher/camellia-glue.c      |  1670 +-
 .../lib/libgcrypt/cipher/camellia-ppc8le.c    |    47 +
 .../lib/libgcrypt/cipher/camellia-ppc9le.c    |    47 +
 .../lib/libgcrypt/cipher/camellia-simd128.h   |  2235 ++
 .../cipher/camellia-vaes-avx2-amd64.S         |    35 +
 grub-core/lib/libgcrypt/cipher/camellia.c     |   172 +-
 grub-core/lib/libgcrypt/cipher/camellia.h     |    34 +-
 grub-core/lib/libgcrypt/cipher/cast5-amd64.S  |   663 +
 grub-core/lib/libgcrypt/cipher/cast5-arm.S    |   728 +
 grub-core/lib/libgcrypt/cipher/cast5.c        |   775 +-
 .../lib/libgcrypt/cipher/chacha20-aarch64.S   |   650 +
 .../libgcrypt/cipher/chacha20-amd64-avx2.S    |   604 +
 .../libgcrypt/cipher/chacha20-amd64-avx512.S  |   736 +
 .../libgcrypt/cipher/chacha20-amd64-ssse3.S   |  1015 +
 .../libgcrypt/cipher/chacha20-armv7-neon.S    |   393 +
 .../lib/libgcrypt/cipher/chacha20-p10le-8x.s  |   864 +
 grub-core/lib/libgcrypt/cipher/chacha20-ppc.c |   750 +
 .../lib/libgcrypt/cipher/chacha20-s390x.S     |  1566 +
 grub-core/lib/libgcrypt/cipher/chacha20.c     |  1450 +
 .../lib/libgcrypt/cipher/cipher-aeswrap.c     |   380 +
 grub-core/lib/libgcrypt/cipher/cipher-cbc.c   |   292 +
 grub-core/lib/libgcrypt/cipher/cipher-ccm.c   |   419 +
 grub-core/lib/libgcrypt/cipher/cipher-cfb.c   |   317 +
 grub-core/lib/libgcrypt/cipher/cipher-cmac.c  |   292 +
 grub-core/lib/libgcrypt/cipher/cipher-ctr.c   |   131 +
 grub-core/lib/libgcrypt/cipher/cipher-eax.c   |   293 +
 .../libgcrypt/cipher/cipher-gcm-armv7-neon.S  |   341 +
 .../cipher/cipher-gcm-armv8-aarch32-ce.S      |   588 +
 .../cipher/cipher-gcm-armv8-aarch64-ce.S      |   633 +
 .../cipher/cipher-gcm-intel-pclmul.c          |  2025 ++
 .../lib/libgcrypt/cipher/cipher-gcm-ppc.c     |   548 +
 .../lib/libgcrypt/cipher/cipher-gcm-siv.c     |   664 +
 grub-core/lib/libgcrypt/cipher/cipher-gcm.c   |  1260 +
 .../lib/libgcrypt/cipher/cipher-internal.h    |   975 +
 grub-core/lib/libgcrypt/cipher/cipher-ocb.c   |   763 +
 grub-core/lib/libgcrypt/cipher/cipher-ofb.c   |   108 +
 .../lib/libgcrypt/cipher/cipher-poly1305.c    |   383 +
 grub-core/lib/libgcrypt/cipher/cipher-siv.c   |   375 +
 grub-core/lib/libgcrypt/cipher/cipher-xts.c   |   189 +
 grub-core/lib/libgcrypt/cipher/cipher.c       |  2702 +-
 .../libgcrypt/cipher/crc-armv8-aarch64-ce.S   |   500 +
 grub-core/lib/libgcrypt/cipher/crc-armv8-ce.c |   229 +
 .../lib/libgcrypt/cipher/crc-intel-pclmul.c   |   939 +
 grub-core/lib/libgcrypt/cipher/crc-ppc.c      |   656 +
 grub-core/lib/libgcrypt/cipher/crc.c          |   192 +-
 grub-core/lib/libgcrypt/cipher/des-amd64.S    |  1116 +
 grub-core/lib/libgcrypt/cipher/des.c          |   301 +-
 grub-core/lib/libgcrypt/cipher/dsa-common.c   |   473 +
 grub-core/lib/libgcrypt/cipher/dsa.c          |  1207 +-
 grub-core/lib/libgcrypt/cipher/ecc-common.h   |   143 +
 grub-core/lib/libgcrypt/cipher/ecc-curves.c   |  1587 +
 grub-core/lib/libgcrypt/cipher/ecc-ecdh.c     |   357 +
 grub-core/lib/libgcrypt/cipher/ecc-ecdsa.c    |   305 +
 grub-core/lib/libgcrypt/cipher/ecc-eddsa.c    |  1079 +
 grub-core/lib/libgcrypt/cipher/ecc-gost.c     |   218 +
 grub-core/lib/libgcrypt/cipher/ecc-misc.c     |   469 +
 grub-core/lib/libgcrypt/cipher/ecc-sm2.c      |   569 +
 grub-core/lib/libgcrypt/cipher/ecc.c          |  3242 +-
 grub-core/lib/libgcrypt/cipher/elgamal.c      |   829 +-
 grub-core/lib/libgcrypt/cipher/gost-s-box.c   |   266 +
 grub-core/lib/libgcrypt/cipher/gost-sb.h      |  2128 ++
 grub-core/lib/libgcrypt/cipher/gost.h         |    34 +
 grub-core/lib/libgcrypt/cipher/gost28147.c    |   553 +
 grub-core/lib/libgcrypt/cipher/gostr3411-94.c |   383 +
 grub-core/lib/libgcrypt/cipher/hash-common.c  |   110 +-
 grub-core/lib/libgcrypt/cipher/hash-common.h  |    35 +-
 grub-core/lib/libgcrypt/cipher/hmac-tests.c   |   732 -
 grub-core/lib/libgcrypt/cipher/idea.c         |    24 +-
 grub-core/lib/libgcrypt/cipher/kdf-internal.h |    39 +
 grub-core/lib/libgcrypt/cipher/kdf.c          |  2239 +-
 .../libgcrypt/cipher/keccak-amd64-avx512.S    |   587 +
 .../lib/libgcrypt/cipher/keccak-armv7-neon.S  |   945 +
 grub-core/lib/libgcrypt/cipher/keccak.c       |  1904 ++
 .../lib/libgcrypt/cipher/keccak_permute_32.h  |   536 +
 .../lib/libgcrypt/cipher/keccak_permute_64.h  |   385 +
 grub-core/lib/libgcrypt/cipher/kem-ecc.c      |   332 +
 grub-core/lib/libgcrypt/cipher/kem-ecc.h      |    40 +
 grub-core/lib/libgcrypt/cipher/kem.c          |   435 +
 grub-core/lib/libgcrypt/cipher/kyber-common.c |   766 +
 grub-core/lib/libgcrypt/cipher/kyber-kdep.c   |   825 +
 grub-core/lib/libgcrypt/cipher/kyber.c        |   530 +
 grub-core/lib/libgcrypt/cipher/kyber.h        |   130 +
 grub-core/lib/libgcrypt/cipher/mac-cmac.c     |   532 +
 grub-core/lib/libgcrypt/cipher/mac-gmac.c     |   203 +
 grub-core/lib/libgcrypt/cipher/mac-hmac.c     |  1471 +
 grub-core/lib/libgcrypt/cipher/mac-internal.h |   290 +
 grub-core/lib/libgcrypt/cipher/mac-poly1305.c |   382 +
 grub-core/lib/libgcrypt/cipher/mac.c          |   834 +
 .../lib/libgcrypt/cipher/mceliece6688128f.c   |  3673 +++
 .../lib/libgcrypt/cipher/mceliece6688128f.h   |    63 +
 grub-core/lib/libgcrypt/cipher/md.c           |  1582 +-
 grub-core/lib/libgcrypt/cipher/md4.c          |   171 +-
 grub-core/lib/libgcrypt/cipher/md5.c          |   177 +-
 .../libgcrypt/cipher/poly1305-amd64-avx512.S  |  1626 +
 .../lib/libgcrypt/cipher/poly1305-internal.h  |    92 +
 .../lib/libgcrypt/cipher/poly1305-p10le.s     |   841 +
 .../lib/libgcrypt/cipher/poly1305-s390x.S     |    87 +
 grub-core/lib/libgcrypt/cipher/poly1305.c     |   846 +
 grub-core/lib/libgcrypt/cipher/primegen.c     |   563 +-
 .../lib/libgcrypt/cipher/pubkey-internal.h    |   107 +
 grub-core/lib/libgcrypt/cipher/pubkey-util.c  |  1363 +
 grub-core/lib/libgcrypt/cipher/pubkey.c       |  4329 +--
 grub-core/lib/libgcrypt/cipher/rfc2268.c      |    57 +-
 .../lib/libgcrypt/cipher/rijndael-aarch64.S   |   512 +
 .../lib/libgcrypt/cipher/rijndael-aesni.c     |  5033 +++
 .../lib/libgcrypt/cipher/rijndael-amd64.S     |   477 +
 grub-core/lib/libgcrypt/cipher/rijndael-arm.S |   581 +
 .../cipher/rijndael-armv8-aarch32-ce.S        |  2134 ++
 .../cipher/rijndael-armv8-aarch64-ce.S        |  2038 ++
 .../lib/libgcrypt/cipher/rijndael-armv8-ce.c  |   396 +
 .../lib/libgcrypt/cipher/rijndael-gcm-p10le.s |  1401 +
 .../lib/libgcrypt/cipher/rijndael-internal.h  |   216 +
 .../lib/libgcrypt/cipher/rijndael-p10le.c     |   119 +
 .../lib/libgcrypt/cipher/rijndael-padlock.c   |   109 +
 .../libgcrypt/cipher/rijndael-ppc-common.h    |   328 +
 .../libgcrypt/cipher/rijndael-ppc-functions.h |  2544 ++
 grub-core/lib/libgcrypt/cipher/rijndael-ppc.c |   230 +
 .../lib/libgcrypt/cipher/rijndael-ppc9le.c    |   119 +
 .../lib/libgcrypt/cipher/rijndael-s390x.c     |  1166 +
 .../cipher/rijndael-ssse3-amd64-asm.S         |   879 +
 .../libgcrypt/cipher/rijndael-ssse3-amd64.c   |   742 +
 .../lib/libgcrypt/cipher/rijndael-tables.h    |  1846 +-
 .../cipher/rijndael-vaes-avx2-amd64.S         |  3688 +++
 .../cipher/rijndael-vaes-avx2-i386.S          |  2804 ++
 .../lib/libgcrypt/cipher/rijndael-vaes-i386.c |   231 +
 .../lib/libgcrypt/cipher/rijndael-vaes.c      |   240 +
 grub-core/lib/libgcrypt/cipher/rijndael.c     |  2643 +-
 grub-core/lib/libgcrypt/cipher/rmd160.c       |   592 +-
 grub-core/lib/libgcrypt/cipher/rsa-common.c   |  1151 +
 grub-core/lib/libgcrypt/cipher/rsa.c          |  1736 +-
 .../lib/libgcrypt/cipher/salsa20-amd64.S      |   940 +
 .../lib/libgcrypt/cipher/salsa20-armv7-neon.S |   899 +
 grub-core/lib/libgcrypt/cipher/salsa20.c      |   600 +
 grub-core/lib/libgcrypt/cipher/scrypt.c       |   322 +
 grub-core/lib/libgcrypt/cipher/seed.c         |    32 +-
 .../lib/libgcrypt/cipher/serpent-armv7-neon.S |  1180 +
 .../lib/libgcrypt/cipher/serpent-avx2-amd64.S |  1214 +
 .../lib/libgcrypt/cipher/serpent-avx512-x86.c |   994 +
 .../lib/libgcrypt/cipher/serpent-sse2-amd64.S |  1276 +
 grub-core/lib/libgcrypt/cipher/serpent.c      |  2017 +-
 .../lib/libgcrypt/cipher/sha1-armv7-neon.S    |   526 +
 .../libgcrypt/cipher/sha1-armv8-aarch32-ce.S  |   220 +
 .../libgcrypt/cipher/sha1-armv8-aarch64-ce.S  |   204 +
 .../lib/libgcrypt/cipher/sha1-avx-amd64.S     |   433 +
 .../libgcrypt/cipher/sha1-avx-bmi2-amd64.S    |   446 +
 .../libgcrypt/cipher/sha1-avx2-bmi2-amd64.S   |   578 +
 .../lib/libgcrypt/cipher/sha1-intel-shaext.c  |   292 +
 .../lib/libgcrypt/cipher/sha1-ssse3-amd64.S   |   442 +
 grub-core/lib/libgcrypt/cipher/sha1.c         |   571 +-
 grub-core/lib/libgcrypt/cipher/sha1.h         |    47 +
 .../cipher/sha256-armv8-aarch32-ce.S          |   231 +
 .../cipher/sha256-armv8-aarch64-ce.S          |   218 +
 .../lib/libgcrypt/cipher/sha256-avx-amd64.S   |   511 +
 .../libgcrypt/cipher/sha256-avx2-bmi2-amd64.S |   533 +
 .../libgcrypt/cipher/sha256-intel-shaext.c    |   363 +
 grub-core/lib/libgcrypt/cipher/sha256-ppc.c   |   610 +
 .../lib/libgcrypt/cipher/sha256-ssse3-amd64.S |   533 +
 grub-core/lib/libgcrypt/cipher/sha256.c       |   775 +-
 grub-core/lib/libgcrypt/cipher/sha512-arm.S   |   464 +
 .../lib/libgcrypt/cipher/sha512-armv7-neon.S  |   452 +
 .../cipher/sha512-armv8-aarch64-ce.S          |   383 +
 .../lib/libgcrypt/cipher/sha512-avx-amd64.S   |   466 +
 .../libgcrypt/cipher/sha512-avx2-bmi2-amd64.S |   507 +
 .../libgcrypt/cipher/sha512-avx512-amd64.S    |   465 +
 grub-core/lib/libgcrypt/cipher/sha512-ppc.c   |   725 +
 .../lib/libgcrypt/cipher/sha512-ssse3-amd64.S |   472 +
 .../lib/libgcrypt/cipher/sha512-ssse3-i386.c  |   404 +
 grub-core/lib/libgcrypt/cipher/sha512.c       |  1310 +-
 grub-core/lib/libgcrypt/cipher/sm3-aarch64.S  |   660 +
 .../libgcrypt/cipher/sm3-armv8-aarch64-ce.S   |   221 +
 .../lib/libgcrypt/cipher/sm3-avx-bmi2-amd64.S |   555 +
 grub-core/lib/libgcrypt/cipher/sm3.c          |   565 +
 grub-core/lib/libgcrypt/cipher/sm4-aarch64.S  |   644 +
 .../libgcrypt/cipher/sm4-aesni-avx-amd64.S    |  1058 +
 .../libgcrypt/cipher/sm4-aesni-avx2-amd64.S   |   973 +
 .../libgcrypt/cipher/sm4-armv8-aarch64-ce.S   |   731 +
 .../cipher/sm4-armv9-aarch64-sve-ce.S         |   967 +
 .../libgcrypt/cipher/sm4-gfni-avx2-amd64.S    |  1260 +
 .../libgcrypt/cipher/sm4-gfni-avx512-amd64.S  |  1861 ++
 grub-core/lib/libgcrypt/cipher/sm4-ppc.c      |   342 +
 grub-core/lib/libgcrypt/cipher/sm4.c          |  2070 ++
 grub-core/lib/libgcrypt/cipher/sntrup761.c    |  1062 +
 grub-core/lib/libgcrypt/cipher/sntrup761.h    |    73 +
 grub-core/lib/libgcrypt/cipher/stribog.c      |  1362 +
 .../lib/libgcrypt/cipher/test-getrusage.c     |   105 -
 grub-core/lib/libgcrypt/cipher/tiger.c        |   311 +-
 .../lib/libgcrypt/cipher/twofish-aarch64.S    |   322 +
 .../lib/libgcrypt/cipher/twofish-amd64.S      |  1258 +
 grub-core/lib/libgcrypt/cipher/twofish-arm.S  |   363 +
 .../lib/libgcrypt/cipher/twofish-avx2-amd64.S |  1136 +
 grub-core/lib/libgcrypt/cipher/twofish.c      |   954 +-
 .../libgcrypt/cipher/whirlpool-sse2-amd64.S   |   348 +
 grub-core/lib/libgcrypt/cipher/whirlpool.c    |   334 +-
 grub-core/lib/libgcrypt/compat/Makefile.am    |    48 +
 grub-core/lib/libgcrypt/compat/clock.c        |    36 +
 grub-core/lib/libgcrypt/compat/compat.c       |    40 +
 grub-core/lib/libgcrypt/compat/getpid.c       |    29 +
 grub-core/lib/libgcrypt/compat/libcompat.h    |    37 +
 grub-core/lib/libgcrypt/config.h.in           |   873 +
 grub-core/lib/libgcrypt/configure             | 25763 ++++++++++++++++
 grub-core/lib/libgcrypt/configure.ac          |  3883 +++
 grub-core/lib/libgcrypt/mkinstalldirs         |   161 +
 grub-core/lib/libgcrypt/mpi/ChangeLog-2011    |    17 +-
 grub-core/lib/libgcrypt/mpi/Makefile.am       |    16 +-
 grub-core/lib/libgcrypt/mpi/Manifest          |    41 -
 .../mpi/{pentium4/sse2 => aarch64}/distfiles  |     1 +
 .../lib/libgcrypt/mpi/aarch64/mpi-asm-defs.h  |     4 +
 .../lib/libgcrypt/mpi/aarch64/mpih-add1.S     |    75 +
 .../lib/libgcrypt/mpi/aarch64/mpih-mul1.S     |   100 +
 .../lib/libgcrypt/mpi/aarch64/mpih-mul2.S     |   112 +
 .../lib/libgcrypt/mpi/aarch64/mpih-mul3.S     |   125 +
 .../lib/libgcrypt/mpi/aarch64/mpih-sub1.S     |    75 +
 grub-core/lib/libgcrypt/mpi/alpha/README      |     4 +-
 grub-core/lib/libgcrypt/mpi/alpha/mpih-add1.S |     4 +-
 .../lib/libgcrypt/mpi/alpha/mpih-lshift.S     |     4 +-
 grub-core/lib/libgcrypt/mpi/alpha/mpih-mul1.S |     4 +-
 grub-core/lib/libgcrypt/mpi/alpha/mpih-mul2.S |     4 +-
 grub-core/lib/libgcrypt/mpi/alpha/mpih-mul3.S |     4 +-
 .../lib/libgcrypt/mpi/alpha/mpih-rshift.S     |     4 +-
 grub-core/lib/libgcrypt/mpi/alpha/mpih-sub1.S |     4 +-
 .../lib/libgcrypt/mpi/alpha/udiv-qrnnd.S      |     4 +-
 grub-core/lib/libgcrypt/mpi/amd64/distfiles   |     1 +
 grub-core/lib/libgcrypt/mpi/amd64/func_abi.h  |    34 +
 .../lib/libgcrypt/mpi/amd64/mpi-asm-defs.h    |     2 +-
 grub-core/lib/libgcrypt/mpi/amd64/mpih-add1.S |    92 +-
 .../lib/libgcrypt/mpi/amd64/mpih-lshift.S     |    54 +-
 grub-core/lib/libgcrypt/mpi/amd64/mpih-mul1.S |    11 +-
 grub-core/lib/libgcrypt/mpi/amd64/mpih-mul2.S |    53 +-
 grub-core/lib/libgcrypt/mpi/amd64/mpih-mul3.S |    11 +-
 .../lib/libgcrypt/mpi/amd64/mpih-rshift.S     |    56 +-
 grub-core/lib/libgcrypt/mpi/amd64/mpih-sub1.S |    90 +-
 .../lib/libgcrypt/mpi/{i586 => arm}/distfiles |     6 +-
 .../lib/libgcrypt/mpi/arm/mpi-asm-defs.h      |     4 +
 grub-core/lib/libgcrypt/mpi/arm/mpih-add1.S   |    76 +
 grub-core/lib/libgcrypt/mpi/arm/mpih-mul1.S   |    80 +
 grub-core/lib/libgcrypt/mpi/arm/mpih-mul2.S   |    94 +
 grub-core/lib/libgcrypt/mpi/arm/mpih-mul3.S   |   100 +
 grub-core/lib/libgcrypt/mpi/arm/mpih-sub1.S   |    77 +
 .../lib/libgcrypt/mpi/asm-common-aarch64.h    |    26 +
 .../lib/libgcrypt/mpi/asm-common-amd64.h      |    26 +
 grub-core/lib/libgcrypt/mpi/asm-common-i386.h |    26 +
 grub-core/lib/libgcrypt/mpi/config.links      |   178 +-
 .../{cipher/rmd.h => mpi/ec-ed25519.c}        |    35 +-
 grub-core/lib/libgcrypt/mpi/ec-hw-s390x.c     |   412 +
 grub-core/lib/libgcrypt/mpi/ec-inline.h       |  1236 +
 grub-core/lib/libgcrypt/mpi/ec-internal.h     |    49 +
 grub-core/lib/libgcrypt/mpi/ec-nist.c         |   826 +
 grub-core/lib/libgcrypt/mpi/ec.c              |  2053 +-
 grub-core/lib/libgcrypt/mpi/generic/Manifest  |    29 -
 grub-core/lib/libgcrypt/mpi/generic/distfiles |     1 -
 .../lib/libgcrypt/mpi/generic/mpi-asm-defs.h  |    16 +-
 .../lib/libgcrypt/mpi/generic/mpih-add1.c     |    10 +-
 .../lib/libgcrypt/mpi/generic/mpih-lshift.c   |     6 +-
 .../lib/libgcrypt/mpi/generic/mpih-mul1.c     |     8 +-
 .../lib/libgcrypt/mpi/generic/mpih-mul2.c     |    10 +-
 .../lib/libgcrypt/mpi/generic/mpih-mul3.c     |     8 +-
 .../lib/libgcrypt/mpi/generic/mpih-rshift.c   |     4 +-
 .../lib/libgcrypt/mpi/generic/mpih-sub1.c     |     8 +-
 .../lib/libgcrypt/mpi/generic/udiv-w-sdiv.c   |     4 +-
 grub-core/lib/libgcrypt/mpi/hppa/mpih-add1.S  |     4 +-
 .../lib/libgcrypt/mpi/hppa/mpih-lshift.S      |     4 +-
 .../lib/libgcrypt/mpi/hppa/mpih-rshift.S      |     4 +-
 grub-core/lib/libgcrypt/mpi/hppa/mpih-sub1.S  |     4 +-
 grub-core/lib/libgcrypt/mpi/hppa/udiv-qrnnd.S |     4 +-
 grub-core/lib/libgcrypt/mpi/i386/Manifest     |    28 -
 grub-core/lib/libgcrypt/mpi/i386/distfiles    |     1 -
 grub-core/lib/libgcrypt/mpi/i386/mpih-add1.S  |    51 +-
 .../lib/libgcrypt/mpi/i386/mpih-lshift.S      |    16 +-
 grub-core/lib/libgcrypt/mpi/i386/mpih-mul1.S  |    16 +-
 grub-core/lib/libgcrypt/mpi/i386/mpih-mul2.S  |    16 +-
 grub-core/lib/libgcrypt/mpi/i386/mpih-mul3.S  |    16 +-
 .../lib/libgcrypt/mpi/i386/mpih-rshift.S      |    18 +-
 grub-core/lib/libgcrypt/mpi/i386/mpih-sub1.S  |    51 +-
 grub-core/lib/libgcrypt/mpi/i386/syntax.h     |    16 +-
 grub-core/lib/libgcrypt/mpi/i586/Manifest     |    27 -
 grub-core/lib/libgcrypt/mpi/i586/README       |    26 -
 grub-core/lib/libgcrypt/mpi/i586/mpih-add1.S  |   135 -
 .../lib/libgcrypt/mpi/i586/mpih-lshift.S      |   229 -
 grub-core/lib/libgcrypt/mpi/i586/mpih-mul1.S  |    89 -
 grub-core/lib/libgcrypt/mpi/i586/mpih-mul2.S  |    93 -
 grub-core/lib/libgcrypt/mpi/i586/mpih-mul3.S  |    93 -
 .../lib/libgcrypt/mpi/i586/mpih-rshift.S      |   228 -
 grub-core/lib/libgcrypt/mpi/i586/mpih-sub1.S  |   142 -
 grub-core/lib/libgcrypt/mpi/longlong.h        |   967 +-
 grub-core/lib/libgcrypt/mpi/m68k/Manifest     |    25 -
 grub-core/lib/libgcrypt/mpi/m68k/distfiles    |     1 -
 .../lib/libgcrypt/mpi/m68k/mc68020/Manifest   |    23 -
 .../lib/libgcrypt/mpi/m68k/mc68020/distfiles  |     1 -
 .../libgcrypt/mpi/m68k/mc68020/mpih-mul1.S    |     4 +-
 .../libgcrypt/mpi/m68k/mc68020/mpih-mul2.S    |     4 +-
 .../libgcrypt/mpi/m68k/mc68020/mpih-mul3.S    |     4 +-
 grub-core/lib/libgcrypt/mpi/m68k/mpih-add1.S  |     4 +-
 .../lib/libgcrypt/mpi/m68k/mpih-lshift.S      |     4 +-
 .../lib/libgcrypt/mpi/m68k/mpih-rshift.S      |     4 +-
 grub-core/lib/libgcrypt/mpi/m68k/mpih-sub1.S  |     4 +-
 grub-core/lib/libgcrypt/mpi/m68k/syntax.h     |     6 +-
 grub-core/lib/libgcrypt/mpi/mips3/Manifest    |    28 -
 grub-core/lib/libgcrypt/mpi/mips3/README      |     2 +-
 grub-core/lib/libgcrypt/mpi/mips3/distfiles   |     1 -
 grub-core/lib/libgcrypt/mpi/mips3/mpih-add1.S |     4 +-
 .../lib/libgcrypt/mpi/mips3/mpih-lshift.S     |     4 +-
 grub-core/lib/libgcrypt/mpi/mips3/mpih-mul1.S |     4 +-
 grub-core/lib/libgcrypt/mpi/mips3/mpih-mul2.S |     4 +-
 grub-core/lib/libgcrypt/mpi/mips3/mpih-mul3.S |     4 +-
 .../lib/libgcrypt/mpi/mips3/mpih-rshift.S     |     4 +-
 grub-core/lib/libgcrypt/mpi/mips3/mpih-sub1.S |     4 +-
 grub-core/lib/libgcrypt/mpi/mpi-add.c         |    64 +-
 grub-core/lib/libgcrypt/mpi/mpi-bit.c         |   257 +-
 grub-core/lib/libgcrypt/mpi/mpi-cmp.c         |    45 +-
 grub-core/lib/libgcrypt/mpi/mpi-div.c         |    23 +-
 grub-core/lib/libgcrypt/mpi/mpi-gcd.c         |    15 +-
 grub-core/lib/libgcrypt/mpi/mpi-inline.c      |     4 +-
 grub-core/lib/libgcrypt/mpi/mpi-inline.h      |     4 +-
 grub-core/lib/libgcrypt/mpi/mpi-internal.h    |    60 +-
 grub-core/lib/libgcrypt/mpi/mpi-inv.c         |   312 +-
 grub-core/lib/libgcrypt/mpi/mpi-mod.c         |    54 +-
 grub-core/lib/libgcrypt/mpi/mpi-mpow.c        |    12 +-
 grub-core/lib/libgcrypt/mpi/mpi-mul.c         |    27 +-
 grub-core/lib/libgcrypt/mpi/mpi-pow.c         |   458 +-
 grub-core/lib/libgcrypt/mpi/mpi-scan.c        |   136 +-
 grub-core/lib/libgcrypt/mpi/mpicoder.c        |   763 +-
 grub-core/lib/libgcrypt/mpi/mpih-const-time.c |   241 +
 grub-core/lib/libgcrypt/mpi/mpih-div.c        |    14 +-
 grub-core/lib/libgcrypt/mpi/mpih-mul.c        |    27 +-
 grub-core/lib/libgcrypt/mpi/mpiutil.c         |   488 +-
 grub-core/lib/libgcrypt/mpi/pa7100/Manifest   |    22 -
 grub-core/lib/libgcrypt/mpi/pa7100/distfiles  |     1 -
 .../lib/libgcrypt/mpi/pa7100/mpih-lshift.S    |     4 +-
 .../lib/libgcrypt/mpi/pa7100/mpih-rshift.S    |     4 +-
 grub-core/lib/libgcrypt/mpi/pentium4/README   |   115 -
 .../lib/libgcrypt/mpi/pentium4/distfiles      |     3 -
 .../lib/libgcrypt/mpi/pentium4/mmx/distfiles  |     2 -
 .../libgcrypt/mpi/pentium4/mmx/mpih-lshift.S  |   457 -
 .../libgcrypt/mpi/pentium4/mmx/mpih-rshift.S  |   453 -
 .../libgcrypt/mpi/pentium4/sse2/mpih-add1.S   |    91 -
 .../libgcrypt/mpi/pentium4/sse2/mpih-mul1.S   |    96 -
 .../libgcrypt/mpi/pentium4/sse2/mpih-mul2.S   |   136 -
 .../libgcrypt/mpi/pentium4/sse2/mpih-mul3.S   |   127 -
 .../libgcrypt/mpi/pentium4/sse2/mpih-sub1.S   |   112 -
 grub-core/lib/libgcrypt/mpi/power/Manifest    |    27 -
 grub-core/lib/libgcrypt/mpi/power/distfiles   |     1 -
 grub-core/lib/libgcrypt/mpi/power/mpih-add1.S |     4 +-
 .../lib/libgcrypt/mpi/power/mpih-lshift.S     |     4 +-
 grub-core/lib/libgcrypt/mpi/power/mpih-mul1.S |     4 +-
 grub-core/lib/libgcrypt/mpi/power/mpih-mul2.S |     4 +-
 grub-core/lib/libgcrypt/mpi/power/mpih-mul3.S |     4 +-
 .../lib/libgcrypt/mpi/power/mpih-rshift.S     |     4 +-
 grub-core/lib/libgcrypt/mpi/power/mpih-sub1.S |     4 +-
 .../lib/libgcrypt/mpi/powerpc32/Manifest      |    28 -
 .../lib/libgcrypt/mpi/powerpc32/distfiles     |     1 -
 .../lib/libgcrypt/mpi/powerpc32/mpih-add1.S   |     9 +-
 .../lib/libgcrypt/mpi/powerpc32/mpih-lshift.S |     8 +-
 .../lib/libgcrypt/mpi/powerpc32/mpih-mul1.S   |    10 +-
 .../lib/libgcrypt/mpi/powerpc32/mpih-mul2.S   |    10 +-
 .../lib/libgcrypt/mpi/powerpc32/mpih-mul3.S   |     9 +-
 .../lib/libgcrypt/mpi/powerpc32/mpih-rshift.S |    10 +-
 .../lib/libgcrypt/mpi/powerpc32/mpih-sub1.S   |     9 +-
 .../lib/libgcrypt/mpi/powerpc32/syntax.h      |     5 +-
 grub-core/lib/libgcrypt/mpi/sparc32/Manifest  |    24 -
 grub-core/lib/libgcrypt/mpi/sparc32/distfiles |     1 -
 .../lib/libgcrypt/mpi/sparc32/mpih-add1.S     |     4 +-
 .../lib/libgcrypt/mpi/sparc32/mpih-lshift.S   |     4 +-
 .../lib/libgcrypt/mpi/sparc32/mpih-rshift.S   |     4 +-
 grub-core/lib/libgcrypt/mpi/sparc32/udiv.S    |     4 +-
 .../lib/libgcrypt/mpi/sparc32v8/Manifest      |    23 -
 .../lib/libgcrypt/mpi/sparc32v8/distfiles     |     1 -
 .../lib/libgcrypt/mpi/sparc32v8/mpih-mul1.S   |     4 +-
 .../lib/libgcrypt/mpi/sparc32v8/mpih-mul2.S   |     4 +-
 .../lib/libgcrypt/mpi/sparc32v8/mpih-mul3.S   |     4 +-
 .../lib/libgcrypt/mpi/supersparc/Manifest     |    21 -
 .../lib/libgcrypt/mpi/supersparc/distfiles    |     1 -
 grub-core/lib/libgcrypt/mpi/supersparc/udiv.S |     4 +-
 grub-core/lib/libgcrypt/src/ChangeLog-2011    |    75 +-
 grub-core/lib/libgcrypt/src/Makefile.am       |    93 +-
 grub-core/lib/libgcrypt/src/Manifest          |    58 -
 grub-core/lib/libgcrypt/src/ath.c             |   344 -
 grub-core/lib/libgcrypt/src/ath.h             |   147 -
 grub-core/lib/libgcrypt/src/cipher-proto.h    |   248 +-
 grub-core/lib/libgcrypt/src/cipher.h          |   167 +-
 grub-core/lib/libgcrypt/src/const-time.c      |    88 +
 grub-core/lib/libgcrypt/src/const-time.h      |   167 +
 grub-core/lib/libgcrypt/src/context.c         |   154 +
 grub-core/lib/libgcrypt/src/context.h         |    33 +
 grub-core/lib/libgcrypt/src/dumpsexp.c        |    25 +-
 grub-core/lib/libgcrypt/src/ec-context.h      |   107 +
 grub-core/lib/libgcrypt/src/fips.c            |   810 +-
 grub-core/lib/libgcrypt/src/g10lib.h          |   284 +-
 grub-core/lib/libgcrypt/src/gcrypt-int.h      |   595 +
 grub-core/lib/libgcrypt/src/gcrypt-module.h   |   240 -
 grub-core/lib/libgcrypt/src/gcrypt-testapi.h  |    70 +
 grub-core/lib/libgcrypt/src/gcrypt.h.in       |  1354 +-
 grub-core/lib/libgcrypt/src/gcryptrnd.c       |   680 -
 .../lib/libgcrypt/src/gen-note-integrity.sh   |   123 +
 grub-core/lib/libgcrypt/src/getrandom.c       |   326 -
 grub-core/lib/libgcrypt/src/global.c          |   911 +-
 grub-core/lib/libgcrypt/src/hmac256.c         |    89 +-
 grub-core/lib/libgcrypt/src/hmac256.h         |     2 +-
 grub-core/lib/libgcrypt/src/hwf-arm.c         |   564 +
 grub-core/lib/libgcrypt/src/hwf-common.h      |    28 +
 grub-core/lib/libgcrypt/src/hwf-ppc.c         |   247 +
 grub-core/lib/libgcrypt/src/hwf-s390x.c       |   231 +
 grub-core/lib/libgcrypt/src/hwf-x86.c         |   512 +
 grub-core/lib/libgcrypt/src/hwfeatures.c      |   326 +-
 .../lib/libgcrypt/src/libgcrypt-config.in     |    20 +-
 grub-core/lib/libgcrypt/src/libgcrypt.def     |   167 +-
 grub-core/lib/libgcrypt/src/libgcrypt.m4      |   179 +-
 grub-core/lib/libgcrypt/src/libgcrypt.pc.in   |    18 +
 grub-core/lib/libgcrypt/src/libgcrypt.vers    |    93 +-
 grub-core/lib/libgcrypt/src/misc.c            |   448 +-
 grub-core/lib/libgcrypt/src/missing-string.c  |     4 +-
 grub-core/lib/libgcrypt/src/module.c          |   212 -
 grub-core/lib/libgcrypt/src/mpi.h             |   223 +-
 grub-core/lib/libgcrypt/src/mpicalc.c         |   627 +
 grub-core/lib/libgcrypt/src/secmem.c          |   621 +-
 grub-core/lib/libgcrypt/src/secmem.h          |    17 +-
 grub-core/lib/libgcrypt/src/sexp.c            |  1390 +-
 grub-core/lib/libgcrypt/src/stdmem.c          |   145 +-
 grub-core/lib/libgcrypt/src/stdmem.h          |    13 +-
 grub-core/lib/libgcrypt/src/types.h           |   160 +-
 grub-core/lib/libgcrypt/src/versioninfo.rc.in |     2 +-
 grub-core/lib/libgcrypt/src/visibility.c      |   996 +-
 grub-core/lib/libgcrypt/src/visibility.h      |  1234 +-
 468 files changed, 210279 insertions(+), 32949 deletions(-)
 create mode 100644 grub-core/lib/libgcrypt/AUTHORS
 create mode 100644 grub-core/lib/libgcrypt/COPYING
 create mode 100644 grub-core/lib/libgcrypt/COPYING.LIB
 create mode 100644 grub-core/lib/libgcrypt/LICENSES
 create mode 100644 grub-core/lib/libgcrypt/README
 create mode 100644 grub-core/lib/libgcrypt/README.GIT
 create mode 100644 grub-core/lib/libgcrypt/THANKS
 create mode 100644 grub-core/lib/libgcrypt/VERSION
 delete mode 100644 grub-core/lib/libgcrypt/cipher/ChangeLog
 delete mode 100644 grub-core/lib/libgcrypt/cipher/Manifest
 delete mode 100644 grub-core/lib/libgcrypt/cipher/ac.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/arcfour-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/aria-aesni-avx-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/aria-aesni-avx2-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/aria-gfni-avx512-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/aria.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/asm-common-aarch64.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/asm-common-amd64.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/asm-common-i386.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/asm-common-s390x.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/asm-inline-s390x.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/asm-poly1305-aarch64.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/asm-poly1305-amd64.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/asm-poly1305-s390x.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/blake2.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/blake2b-amd64-avx2.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/blake2b-amd64-avx512.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/blake2s-amd64-avx.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/blake2s-amd64-avx512.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/blowfish-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/blowfish-arm.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/bulkhelp.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/camellia-aarch64-ce.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/camellia-aarch64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/camellia-aesni-avx-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/camellia-aesni-avx2-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/camellia-aesni-avx2-amd64.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/camellia-arm.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/camellia-gfni-avx2-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/camellia-gfni-avx512-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/camellia-ppc8le.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/camellia-ppc9le.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/camellia-simd128.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/camellia-vaes-avx2-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/cast5-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/cast5-arm.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/chacha20-aarch64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/chacha20-amd64-avx2.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/chacha20-amd64-avx512.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/chacha20-amd64-ssse3.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/chacha20-armv7-neon.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/chacha20-p10le-8x.s
 create mode 100644 grub-core/lib/libgcrypt/cipher/chacha20-ppc.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/chacha20-s390x.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/chacha20.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/cipher-aeswrap.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/cipher-cbc.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/cipher-ccm.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/cipher-cfb.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/cipher-cmac.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/cipher-ctr.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/cipher-eax.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/cipher-gcm-armv7-neon.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/cipher-gcm-armv8-aarch32-ce.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/cipher-gcm-armv8-aarch64-ce.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/cipher-gcm-intel-pclmul.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/cipher-gcm-ppc.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/cipher-gcm-siv.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/cipher-gcm.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/cipher-internal.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/cipher-ocb.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/cipher-ofb.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/cipher-poly1305.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/cipher-siv.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/cipher-xts.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/crc-armv8-aarch64-ce.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/crc-armv8-ce.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/crc-intel-pclmul.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/crc-ppc.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/des-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/dsa-common.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/ecc-common.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/ecc-curves.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/ecc-ecdh.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/ecc-ecdsa.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/ecc-eddsa.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/ecc-gost.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/ecc-misc.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/ecc-sm2.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/gost-s-box.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/gost-sb.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/gost.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/gost28147.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/gostr3411-94.c
 delete mode 100644 grub-core/lib/libgcrypt/cipher/hmac-tests.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/kdf-internal.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/keccak-amd64-avx512.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/keccak-armv7-neon.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/keccak.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/keccak_permute_32.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/keccak_permute_64.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/kem-ecc.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/kem-ecc.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/kem.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/kyber-common.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/kyber-kdep.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/kyber.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/kyber.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/mac-cmac.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/mac-gmac.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/mac-hmac.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/mac-internal.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/mac-poly1305.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/mac.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/mceliece6688128f.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/mceliece6688128f.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/poly1305-amd64-avx512.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/poly1305-internal.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/poly1305-p10le.s
 create mode 100644 grub-core/lib/libgcrypt/cipher/poly1305-s390x.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/poly1305.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/pubkey-internal.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/pubkey-util.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/rijndael-aarch64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/rijndael-aesni.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/rijndael-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/rijndael-arm.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/rijndael-armv8-aarch32-ce.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/rijndael-armv8-aarch64-ce.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/rijndael-armv8-ce.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/rijndael-gcm-p10le.s
 create mode 100644 grub-core/lib/libgcrypt/cipher/rijndael-internal.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/rijndael-p10le.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/rijndael-padlock.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/rijndael-ppc-common.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/rijndael-ppc-functions.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/rijndael-ppc.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/rijndael-ppc9le.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/rijndael-s390x.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/rijndael-ssse3-amd64-asm.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/rijndael-ssse3-amd64.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/rijndael-vaes-avx2-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/rijndael-vaes-avx2-i386.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/rijndael-vaes-i386.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/rijndael-vaes.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/rsa-common.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/salsa20-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/salsa20-armv7-neon.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/salsa20.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/scrypt.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/serpent-armv7-neon.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/serpent-avx2-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/serpent-avx512-x86.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/serpent-sse2-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha1-armv7-neon.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha1-armv8-aarch32-ce.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha1-armv8-aarch64-ce.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha1-avx-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha1-avx-bmi2-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha1-avx2-bmi2-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha1-intel-shaext.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha1-ssse3-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha1.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha256-armv8-aarch32-ce.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha256-armv8-aarch64-ce.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha256-avx-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha256-avx2-bmi2-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha256-intel-shaext.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha256-ppc.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha256-ssse3-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha512-arm.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha512-armv7-neon.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha512-armv8-aarch64-ce.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha512-avx-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha512-avx2-bmi2-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha512-avx512-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha512-ppc.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha512-ssse3-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sha512-ssse3-i386.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/sm3-aarch64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sm3-armv8-aarch64-ce.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sm3-avx-bmi2-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sm3.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/sm4-aarch64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sm4-aesni-avx-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sm4-aesni-avx2-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sm4-armv8-aarch64-ce.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sm4-armv9-aarch64-sve-ce.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sm4-gfni-avx2-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sm4-gfni-avx512-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/sm4-ppc.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/sm4.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/sntrup761.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/sntrup761.h
 create mode 100644 grub-core/lib/libgcrypt/cipher/stribog.c
 delete mode 100644 grub-core/lib/libgcrypt/cipher/test-getrusage.c
 create mode 100644 grub-core/lib/libgcrypt/cipher/twofish-aarch64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/twofish-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/twofish-arm.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/twofish-avx2-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/cipher/whirlpool-sse2-amd64.S
 create mode 100644 grub-core/lib/libgcrypt/compat/Makefile.am
 create mode 100644 grub-core/lib/libgcrypt/compat/clock.c
 create mode 100644 grub-core/lib/libgcrypt/compat/compat.c
 create mode 100644 grub-core/lib/libgcrypt/compat/getpid.c
 create mode 100644 grub-core/lib/libgcrypt/compat/libcompat.h
 create mode 100644 grub-core/lib/libgcrypt/config.h.in
 create mode 100755 grub-core/lib/libgcrypt/configure
 create mode 100644 grub-core/lib/libgcrypt/configure.ac
 create mode 100755 grub-core/lib/libgcrypt/mkinstalldirs
 delete mode 100644 grub-core/lib/libgcrypt/mpi/Manifest
 rename grub-core/lib/libgcrypt/mpi/{pentium4/sse2 => aarch64}/distfiles (80%)
 create mode 100644 grub-core/lib/libgcrypt/mpi/aarch64/mpi-asm-defs.h
 create mode 100644 grub-core/lib/libgcrypt/mpi/aarch64/mpih-add1.S
 create mode 100644 grub-core/lib/libgcrypt/mpi/aarch64/mpih-mul1.S
 create mode 100644 grub-core/lib/libgcrypt/mpi/aarch64/mpih-mul2.S
 create mode 100644 grub-core/lib/libgcrypt/mpi/aarch64/mpih-mul3.S
 create mode 100644 grub-core/lib/libgcrypt/mpi/aarch64/mpih-sub1.S
 create mode 100644 grub-core/lib/libgcrypt/mpi/amd64/func_abi.h
 rename grub-core/lib/libgcrypt/mpi/{i586 => arm}/distfiles (57%)
 create mode 100644 grub-core/lib/libgcrypt/mpi/arm/mpi-asm-defs.h
 create mode 100644 grub-core/lib/libgcrypt/mpi/arm/mpih-add1.S
 create mode 100644 grub-core/lib/libgcrypt/mpi/arm/mpih-mul1.S
 create mode 100644 grub-core/lib/libgcrypt/mpi/arm/mpih-mul2.S
 create mode 100644 grub-core/lib/libgcrypt/mpi/arm/mpih-mul3.S
 create mode 100644 grub-core/lib/libgcrypt/mpi/arm/mpih-sub1.S
 create mode 100644 grub-core/lib/libgcrypt/mpi/asm-common-aarch64.h
 create mode 100644 grub-core/lib/libgcrypt/mpi/asm-common-amd64.h
 create mode 100644 grub-core/lib/libgcrypt/mpi/asm-common-i386.h
 rename grub-core/lib/libgcrypt/{cipher/rmd.h => mpi/ec-ed25519.c} (51%)
 create mode 100644 grub-core/lib/libgcrypt/mpi/ec-hw-s390x.c
 create mode 100644 grub-core/lib/libgcrypt/mpi/ec-inline.h
 create mode 100644 grub-core/lib/libgcrypt/mpi/ec-internal.h
 create mode 100644 grub-core/lib/libgcrypt/mpi/ec-nist.c
 delete mode 100644 grub-core/lib/libgcrypt/mpi/generic/Manifest
 delete mode 100644 grub-core/lib/libgcrypt/mpi/i386/Manifest
 delete mode 100644 grub-core/lib/libgcrypt/mpi/i586/Manifest
 delete mode 100644 grub-core/lib/libgcrypt/mpi/i586/README
 delete mode 100644 grub-core/lib/libgcrypt/mpi/i586/mpih-add1.S
 delete mode 100644 grub-core/lib/libgcrypt/mpi/i586/mpih-lshift.S
 delete mode 100644 grub-core/lib/libgcrypt/mpi/i586/mpih-mul1.S
 delete mode 100644 grub-core/lib/libgcrypt/mpi/i586/mpih-mul2.S
 delete mode 100644 grub-core/lib/libgcrypt/mpi/i586/mpih-mul3.S
 delete mode 100644 grub-core/lib/libgcrypt/mpi/i586/mpih-rshift.S
 delete mode 100644 grub-core/lib/libgcrypt/mpi/i586/mpih-sub1.S
 delete mode 100644 grub-core/lib/libgcrypt/mpi/m68k/Manifest
 delete mode 100644 grub-core/lib/libgcrypt/mpi/m68k/mc68020/Manifest
 delete mode 100644 grub-core/lib/libgcrypt/mpi/mips3/Manifest
 create mode 100644 grub-core/lib/libgcrypt/mpi/mpih-const-time.c
 delete mode 100644 grub-core/lib/libgcrypt/mpi/pa7100/Manifest
 delete mode 100644 grub-core/lib/libgcrypt/mpi/pentium4/README
 delete mode 100644 grub-core/lib/libgcrypt/mpi/pentium4/distfiles
 delete mode 100644 grub-core/lib/libgcrypt/mpi/pentium4/mmx/distfiles
 delete mode 100644 grub-core/lib/libgcrypt/mpi/pentium4/mmx/mpih-lshift.S
 delete mode 100644 grub-core/lib/libgcrypt/mpi/pentium4/mmx/mpih-rshift.S
 delete mode 100644 grub-core/lib/libgcrypt/mpi/pentium4/sse2/mpih-add1.S
 delete mode 100644 grub-core/lib/libgcrypt/mpi/pentium4/sse2/mpih-mul1.S
 delete mode 100644 grub-core/lib/libgcrypt/mpi/pentium4/sse2/mpih-mul2.S
 delete mode 100644 grub-core/lib/libgcrypt/mpi/pentium4/sse2/mpih-mul3.S
 delete mode 100644 grub-core/lib/libgcrypt/mpi/pentium4/sse2/mpih-sub1.S
 delete mode 100644 grub-core/lib/libgcrypt/mpi/power/Manifest
 delete mode 100644 grub-core/lib/libgcrypt/mpi/powerpc32/Manifest
 delete mode 100644 grub-core/lib/libgcrypt/mpi/sparc32/Manifest
 delete mode 100644 grub-core/lib/libgcrypt/mpi/sparc32v8/Manifest
 delete mode 100644 grub-core/lib/libgcrypt/mpi/supersparc/Manifest
 delete mode 100644 grub-core/lib/libgcrypt/src/Manifest
 delete mode 100644 grub-core/lib/libgcrypt/src/ath.c
 delete mode 100644 grub-core/lib/libgcrypt/src/ath.h
 create mode 100644 grub-core/lib/libgcrypt/src/const-time.c
 create mode 100644 grub-core/lib/libgcrypt/src/const-time.h
 create mode 100644 grub-core/lib/libgcrypt/src/context.c
 create mode 100644 grub-core/lib/libgcrypt/src/context.h
 create mode 100644 grub-core/lib/libgcrypt/src/ec-context.h
 create mode 100644 grub-core/lib/libgcrypt/src/gcrypt-int.h
 delete mode 100644 grub-core/lib/libgcrypt/src/gcrypt-module.h
 create mode 100644 grub-core/lib/libgcrypt/src/gcrypt-testapi.h
 delete mode 100644 grub-core/lib/libgcrypt/src/gcryptrnd.c
 create mode 100755 grub-core/lib/libgcrypt/src/gen-note-integrity.sh
 delete mode 100644 grub-core/lib/libgcrypt/src/getrandom.c
 create mode 100644 grub-core/lib/libgcrypt/src/hwf-arm.c
 create mode 100644 grub-core/lib/libgcrypt/src/hwf-common.h
 create mode 100644 grub-core/lib/libgcrypt/src/hwf-ppc.c
 create mode 100644 grub-core/lib/libgcrypt/src/hwf-s390x.c
 create mode 100644 grub-core/lib/libgcrypt/src/hwf-x86.c
 create mode 100644 grub-core/lib/libgcrypt/src/libgcrypt.pc.in
 delete mode 100644 grub-core/lib/libgcrypt/src/module.c
 create mode 100644 grub-core/lib/libgcrypt/src/mpicalc.c

diff --git a/grub-core/lib/libgcrypt/AUTHORS b/grub-core/lib/libgcrypt/AUTHORS
new file mode 100644
index 000000000..f9161600b
--- /dev/null
+++ b/grub-core/lib/libgcrypt/AUTHORS
@@ -0,0 +1,274 @@
+Library: Libgcrypt
+Homepage: https://gnupg.org/related_software/libgcrypt/
+Download: https://gnupg.org/ftp/gcrypt/libgcrypt/
+Repository: git://git.gnupg.org/libgcrypt.git
+Maintainer: Werner Koch <wk@gnupg.org>
+Bug reports: https://bugs.gnupg.org
+Security related bug reports: <security@gnupg.org>
+End-of-life: TBD
+License (library): LGPLv2.1+
+License (manual and tools): GPLv2+
+
+
+Libgcrypt is free software.  See the files COPYING.LIB and COPYING for
+copying conditions, and LICENSES for notices about a few contributions
+that require these additional notices to be distributed.  License
+copyright years may be listed using range notation, e.g., 2000-2013,
+indicating that every year in the range, inclusive, is a copyrightable
+year that would otherwise be listed individually.
+
+
+List of Copyright holders
+=========================
+
+  Copyright (C) 1989,1991-2018 Free Software Foundation, Inc.
+  Copyright (C) 1994 X Consortium
+  Copyright (C) 1996 L. Peter Deutsch
+  Copyright (C) 1997 Werner Koch
+  Copyright (C) 1998 The Internet Society
+  Copyright (C) 1996-1999 Peter Gutmann, Paul Kendall, and Chris Wedgwood
+  Copyright (C) 1996-2006 Peter Gutmann, Matt Thomlinson and Blake Coverett
+  Copyright (C) 2003 Nikos Mavroyanopoulos
+  Copyright (c) 2006 CRYPTOGAMS
+  Copyright (C) 2006-2007 NTT (Nippon Telegraph and Telephone Corporation)
+  Copyright (C) 2012-2024 g10 Code GmbH
+  Copyright (C) 2012 Simon Josefsson, Niels Möller
+  Copyright (c) 2012 Intel Corporation
+  Copyright (C) 2013 Christian Grothoff
+  Copyright (C) 2013-2024 Jussi Kivilinna
+  Copyright (C) 2013-2014 Dmitry Eremin-Solenikov
+  Copyright (C) 2014 Stephan Mueller
+  Copyright (C) 2017 Jia Zhang
+  Copyright (C) 2018 Bundesamt für Sicherheit in der Informationstechnik
+  Copyright (C) 2020 Alibaba Group.
+  Copyright (C) 2020 Tianjia Zhang
+  Copyright (C) 2023 Simon Josefsson
+
+
+Authors with a FSF copyright assignment
+=======================================
+
+LIBGCRYPT       Werner Koch    2001-06-07
+Assigns past and future changes.
+Assignment for future changes terminated on 2012-12-04.
+wk@gnupg.org
+Designed and implemented Libgcrypt.
+
+GNUPG  Matthew Skala              1998-08-10
+Disclaims changes.
+mskala@ansuz.sooke.bc.ca
+Wrote cipher/twofish.c.
+
+GNUPG  Natural Resources Canada    1998-08-11
+Disclaims changes by Matthew Skala.
+
+GNUPG  Michael Roth    Germany     1998-09-17
+Assigns changes.
+mroth@nessie.de
+Wrote cipher/des.c.
+Changes and bug fixes all over the place.
+
+GNUPG  Niklas Hernaeus         1998-09-18
+Disclaims changes.
+nh@df.lth.se
+Weak key patches.
+
+GNUPG  Rémi Guyomarch          1999-05-25
+Assigns past and future changes. (g10/compress.c, g10/encr-data.c,
+g10/free-packet.c, g10/mdfilter.c, g10/plaintext.c, util/iobuf.c)
+rguyom@mail.dotcom.fr
+
+ANY     g10 Code GmbH           2001-06-07
+Assignment for future changes terminated on 2012-12-04.
+Code marked with ChangeLog entries of g10 Code employees.
+
+LIBGCRYPT Timo Schulz           2001-08-31
+Assigns past and future changes.
+twoaday@freakmail.de
+
+LIBGCRYPT Simon Josefsson       2002-10-25
+Assigns past and future changes to FSF (cipher/{md4,crc}.c, CTR mode,
+CTS/MAC flags, self test improvements)
+simon@josefsson.org
+
+LIBGCRYPT Moritz Schulte       2003-04-17
+Assigns past and future changes.
+moritz@g10code.com
+
+GNUTLS  Nikolaos Mavrogiannopoulos  2003-11-22
+nmav@gnutls.org
+Original code for cipher/rfc2268.c.
+
+LIBGCRYPT      The Written Word        2005-04-15
+Assigns past and future changes. (new: src/libgcrypt.pc.in,
+src/Makefile.am, src/secmem.c, mpi/hppa1.1/mpih-mul3.S,
+mpi/hppa1.1/udiv-qrnnd.S, mpi/hppa1.1/mpih-mul2.S,
+mpi/hppa1.1/mpih-mul1.S, mpi/Makefile.am, tests/prime.c,
+tests/register.c, tests/ac.c, tests/basic.c, tests/tsexp.c,
+tests/keygen.c, tests/pubkey.c, configure.ac, acinclude.m4)
+
+LIBGCRYPT       Brad Hards       2006-02-09
+Assigns Past and Future Changes
+bradh@frogmouth.net
+(Added OFB mode. Changed cipher/cipher.c, test/basic.c doc/gcrypt.tex.
+ added SHA-224, changed cipher/sha256.c, added HMAC tests.)
+
+LIBGCRYPT       Hye-Shik Chang   2006-09-07
+Assigns Past and Future Changes
+perky@freebsd.org
+(SEED cipher)
+
+LIBGCRYPT       Werner Dittmann  2009-05-20
+Assigns Past and Future Changes
+werner.dittmann@t-online.de
+(mpi/amd64, tests/mpitests.c)
+
+GNUPG           David Shaw
+Assigns past and future changes.
+dshaw@jabberwocky.com
+(cipher/camellia-glue.c and related stuff)
+
+LIBGCRYPT       Andrey Jivsov    2010-12-09
+Assigns Past and Future Changes
+openpgp@brainhub.org
+(cipher/ecc.c and related files)
+
+LIBGCRYPT       Ulrich Müller    2012-02-15
+Assigns Past and Future Changes
+ulm@gentoo.org
+(Changes to cipher/idea.c and related files)
+
+LIBGCRYPT       Vladimir Serbinenko  2012-04-26
+Assigns Past and Future Changes
+phcoder@gmail.com
+(cipher/serpent.c)
+
+
+Authors with a DCO
+==================
+
+Andrei Scherer <andsch@inbox.com>
+2014-08-22:BF7CEF794F9.000003F0andsch@inbox.com:
+
+Christian Aistleitner <christian@quelltextlich.at>
+2013-02-26:20130226110144.GA12678@quelltextlich.at:
+
+Christian Grothoff <christian@grothoff.org>
+2013-03-21:514B5D8A.6040705@grothoff.org:
+
+Clemens Lang <cllang@redhat.com>
+2022-02-10:20220210133844.46581-1-cllang@redhat.com:
+
+Danny Tsen <dtsen@us.ibm.com>
+2021-12-20:OF85D11C2F.7A339D7D-ON002587B1.0042A81E-002587B1.0042B94D@ibm.com
+
+Dmitry Baryshkov <dbaryshkov@gmail.com>
+Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
+2013-07-13:20130713144407.GA27334@fangorn.rup.mentorg.com:
+
+Dmitry Kasatkin <dmitry.kasatkin@intel.com>
+2012-12-14:50CAE2DB.80302@intel.com:
+
+Falko Strenzke <falko.strenzke@mtg.de>
+2023-09-27:51677567-0b78-4665-805d-fd0cdd50f7fa@mtg.de:
+
+H.J. Lu <hjl.tools@gmail.com>
+2020-01-19:20200119135241.GA4970@gmail.com:
+
+Jia Zhang <qianyue.zj@alibaba-inc.com>
+2017-10-17:59E56E30.9060503@alibaba-inc.com:
+
+Jérémie Courrèges-Anglas <jca@wxcvbn.org>
+2016-05-26:87bn3ssqg0.fsf@ritchie.wxcvbn.org:
+
+Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+2012-11-15:20121115172331.150537dzb5i6jmy8@www.dalek.fi:
+
+Jussi Kivilinna <jussi.kivilinna@iki.fi>
+2013-05-06:5186720A.4090101@iki.fi:
+
+Markus Teich <markus dot teich at stusta dot mhn dot de>
+2014-10-08:20141008180509.GA2770@trolle:
+
+Martin Storsjö <martin@martin.st>
+2018-03-28:dc1605ce-a47d-34c5-8851-d9569f9ea5d3@martin.st:
+
+Mathias L. Baumann <mathias.baumann at sociomantic.com>
+2017-01-30:07c06d79-0828-b564-d604-fd16c7c86ebe@sociomantic.com:
+
+Milan Broz <gmazyland@gmail.com>
+2014-01-13:52D44CC6.4050707@gmail.com:
+
+Paul Wolneykien <manowar@altlinux.org>
+2019-11-19:20191119204459.312927aa@rigel.localdomain:
+
+Peter Wu <peter@lekensteyn.nl>
+2015-07-22:20150722191325.GA8113@al:
+
+Rafaël Carré <funman@videolan.org>
+2012-04-20:4F91988B.1080502@videolan.org:
+
+Sergey V. <sftp.mtuci@gmail.com>
+2013-11-07:2066221.5IYa7Yq760@darkstar:
+
+Shawn Landden <shawn@git.icu>
+2019-07-09:2794651562684255@iva4-64850291ca1c.qloud-c.yandex.net:
+
+Simit Ghane <simit.ghane@lge.com>
+2024-05-06:OF22575887.761836D9-ON48258B15.0044A21E-48258B15.0044A222@lge.com:
+
+Stephan Mueller <smueller@chronox.de>
+2014-08-22:2008899.25OeoelVVA@myon.chronox.de:
+
+Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+2020-01-08:dcda0127-2f45-93a3-0736-27259a33bffa@linux.alibaba.com:
+
+Tomáš Mráz <tm@t8m.info>
+2012-04-16:1334571250.5056.52.camel@vespa.frost.loc:
+
+Vitezslav Cizek <vcizek@suse.com>
+2015-11-05:20151105131424.GA32700@kolac.suse.cz:
+
+Werner Koch <wk@gnupg.org> (g10 Code GmbH)
+2012-12-05:87obi8u4h2.fsf@vigenere.g10code.de:
+
+
+More credits
+============
+
+Libgcrypt used to be part of GnuPG but has been taken out into its own
+package on 2000-12-21.
+
+Most of the stuff in mpi has been taken from an old GMP library
+version by Torbjorn Granlund <tege@noisy.tmg.se>.
+
+The files cipher/rndunix.c and cipher/rndw32.c are based on those
+files from Cryptlib.  Copyright Peter Gutmann, Paul Kendall, and Chris
+Wedgwood 1996-1999.
+
+The ECC code cipher/ecc.c was based on code by Sergi Blanch i Torne,
+sergi at calcurco dot org.
+
+The implementation of the Camellia cipher has been been taken from the
+original NTT provided GPL source.
+
+The CAVS testing program tests/cavs_driver.pl is not to be considered
+a part of libgcrypt proper.  We distribute it merely for convenience.
+It has a permissive license and is copyrighted by atsec information
+security corporation.  See the file for details.
+
+The file salsa20.c is based on D.J. Bernstein's public domain code and
+taken from Nettle.  Copyright 2012 Simon Josefsson and Niels Möller.
+
+The sntrup761 code is based on public domain code written by Daniel
+J. Bernstein, Chitchanok Chuengsatiansup, Tanja Lange, and Christine
+van Vredendaal.  Copyright 2023 Simon Josefsson.
+
+
+ This file is free software; as a special exception the author gives
+ unlimited permission to copy and/or distribute it, with or without
+ modifications, as long as this notice is preserved.
+
+ This file is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY, to the extent permitted by law; without even the
+ implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
diff --git a/grub-core/lib/libgcrypt/COPYING b/grub-core/lib/libgcrypt/COPYING
new file mode 100644
index 000000000..d159169d1
--- /dev/null
+++ b/grub-core/lib/libgcrypt/COPYING
@@ -0,0 +1,339 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+                            NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/grub-core/lib/libgcrypt/COPYING.LIB 
b/grub-core/lib/libgcrypt/COPYING.LIB
new file mode 100644
index 000000000..4362b4915
--- /dev/null
+++ b/grub-core/lib/libgcrypt/COPYING.LIB
@@ -0,0 +1,502 @@
+                  GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+ 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it becomes
+a de-facto standard.  To achieve this, non-free programs must be
+allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control compilation
+and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at
+    least three years, to give the same user the materials
+    specified in Subsection 6a, above, for a charge no more
+    than the cost of performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under any
+particular circumstance, the balance of the section is intended to apply,
+and the section as a whole is intended to apply in other circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License may add
+an explicit geographical distribution limitation excluding those countries,
+so that distribution is permitted only in or among countries not thus
+excluded.  In such case, this License incorporates the limitation as if
+written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+                            NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+           How to Apply These Terms to Your New Libraries
+
+  If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change.  You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms of the
+ordinary General Public License).
+
+  To apply these terms, attach the following notices to the library.  It is
+safest to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least the
+"copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the library's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the library, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the
+  library `Frob' (a library for tweaking knobs) written by James Random Hacker.
+
+  <signature of Ty Coon>, 1 April 1990
+  Ty Coon, President of Vice
+
+That's all there is to it!
diff --git a/grub-core/lib/libgcrypt/LICENSES b/grub-core/lib/libgcrypt/LICENSES
new file mode 100644
index 000000000..c2fea82dc
--- /dev/null
+++ b/grub-core/lib/libgcrypt/LICENSES
@@ -0,0 +1,319 @@
+Additional license notices for Libgcrypt.                    -*- org -*-
+
+This file contains the copying permission notices for various files in
+the Libgcrypt distribution which are not covered by the GNU Lesser
+General Public License (LGPL) or the GNU General Public License (GPL).
+
+These notices all require that a copy of the notice be included
+in the accompanying documentation and be distributed with binary
+distributions of the code, so be sure to include this file along
+with any binary distributions derived from the GNU C Library.
+
+* BSD_3Clause
+
+  For files:
+  - cipher/sha256-avx-amd64.S
+  - cipher/sha256-avx2-bmi2-amd64.S
+  - cipher/sha256-ssse3-amd64.S
+  - cipher/sha512-avx-amd64.S
+  - cipher/sha512-avx2-bmi2-amd64.S
+  - cipher/sha512-ssse3-amd64.S
+  - cipher/sha512-ssse3-i386.c
+  - cipher/sha512-avx512-amd64.S
+
+#+begin_quote
+  Copyright (c) 2012, Intel Corporation
+
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the
+    distribution.
+
+  * Neither the name of the Intel Corporation nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+
+
+  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
+  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#+end_quote
+
+  For files:
+  - cipher/poly1305-amd64-avx512.S
+
+#+begin_quote
+   Copyright (c) 2021-2022, Intel Corporation
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+
+       * Redistributions of source code must retain the above copyright notice,
+         this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of Intel Corporation nor the names of its contributors
+         may be used to endorse or promote products derived from this software
+         without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+   DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+   FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+   OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#+end_quote
+
+  For files:
+  - random/jitterentropy-base.c
+  - random/jitterentropy-gcd.c
+  - random/jitterentropy-gcd.h
+  - random/jitterentropy-health.c
+  - random/jitterentropy-health.h
+  - random/jitterentropy-noise.c
+  - random/jitterentropy-noise.h
+  - random/jitterentropy-sha3.c
+  - random/jitterentropy-sha3.h
+  - random/jitterentropy-timer.c
+  - random/jitterentropy-timer.h
+  - random/jitterentropy.h
+  - random/rndjent.c (plus common Libgcrypt copyright holders)
+
+#+begin_quote
+ Copyright (C) 2017 - 2021, Stephan Mueller <smueller@chronox.de>
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ 1. Redistributions of source code must retain the above copyright
+    notice, and the entire permission notice in its entirety,
+    including the disclaimer of warranties.
+ 2. Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+ 3. The name of the author may not be used to endorse or promote
+    products derived from this software without specific prior
+    written permission.
+
+ ALTERNATIVELY, this product may be distributed under the terms of
+ the GNU General Public License, in which case the provisions of the GPL2
+ are required INSTEAD OF the above restrictions.  (This clause is
+ necessary due to a potential bad interaction between the GPL and
+ the restrictions contained in a BSD-style copyright.)
+
+ THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
+ WHICH ARE HEREBY DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
+ DAMAGE.
+#+end_quote
+
+  For files:
+  - cipher/cipher-gcm-ppc.c
+  - cipher/keccak-amd64-avx512.S
+
+#+begin_quote
+ Copyright (c) 2006, CRYPTOGAMS by <appro@openssl.org>
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+       * Redistributions of source code must retain copyright notices,
+         this list of conditions and the following disclaimer.
+
+       * Redistributions in binary form must reproduce the above
+         copyright notice, this list of conditions and the following
+         disclaimer in the documentation and/or other materials
+         provided with the distribution.
+
+       * Neither the name of the CRYPTOGAMS nor the names of its
+         copyright holder and contributors may be used to endorse or
+         promote products derived from this software without specific
+         prior written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this
+ product may be distributed under the terms of the GNU General Public
+ License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+ those given above.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#+end_quote
+
+* X License
+
+  For files:
+  - install.sh
+
+#+begin_quote
+  Copyright (C) 1994 X Consortium
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to
+  deal in the Software without restriction, including without limitation the
+  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+  sell copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+  X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+  AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC-
+  TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+  Except as contained in this notice, the name of the X Consortium shall not
+  be used in advertising or otherwise to promote the sale, use or other deal-
+  ings in this Software without prior written authorization from the X Consor-
+  tium.
+#+end_quote
+
+* Public domain
+
+  For files:
+  - cipher/arcfour-amd64.S
+
+#+begin_quote
+ Author: Marc Bevand <bevand_m (at) epita.fr>
+ Licence: I hereby disclaim the copyright on this code and place it
+ in the public domain.
+#+end_quote
+
+* OCB license 1
+
+  For files:
+  - cipher/cipher-ocb.c
+
+#+begin_quote
+  OCB is covered by several patents but may be used freely by most
+  software.  See http://web.cs.ucdavis.edu/~rogaway/ocb/license.htm .
+  In particular license 1 is suitable for Libgcrypt: See
+  http://web.cs.ucdavis.edu/~rogaway/ocb/license1.pdf for the full
+  license document; it basically says:
+
+    License 1 — License for Open-Source Software Implementations of OCB
+                (Jan 9, 2013)
+
+    Under this license, you are authorized to make, use, and
+    distribute open-source software implementations of OCB. This
+    license terminates for you if you sue someone over their
+    open-source software implementation of OCB claiming that you have
+    a patent covering their implementation.
+
+
+
+ License for Open Source Software Implementations of OCB
+ January 9, 2013
+
+ 1 Definitions
+
+ 1.1 “Licensor” means Phillip Rogaway.
+
+ 1.2 “Licensed Patents” means any patent that claims priority to United
+ States Patent Application No. 09/918,615 entitled “Method and Apparatus
+ for Facilitating Efficient Authenticated Encryption,” and any utility,
+ divisional, provisional, continuation, continuations-in-part, reexamination,
+ reissue, or foreign counterpart patents that may issue with respect to the
+ aforesaid patent application. This includes, but is not limited to, United
+ States Patent No. 7,046,802; United States Patent No. 7,200,227; United
+ States Patent No. 7,949,129; United States Patent No. 8,321,675 ; and any
+ patent that issues out of United States Patent Application No. 13/669,114.
+
+ 1.3 “Use” means any practice of any invention claimed in the Licensed Patents.
+
+ 1.4 “Software Implementation” means any practice of any invention
+ claimed in the Licensed Patents that takes the form of software executing on
+ a user-programmable, general-purpose computer or that takes the form of a
+ computer-readable medium storing such software. Software Implementation does
+ not include, for example, application-specific integrated circuits (ASICs),
+ field-programmable gate arrays (FPGAs), embedded systems, or IP cores.
+
+ 1.5 “Open Source Software” means software whose source code is published
+ and made available for inspection and use by anyone because either (a) the
+ source code is subject to a license that permits recipients to copy, modify,
+ and distribute the source code without payment of fees or royalties, or
+ (b) the source code is in the public domain, including code released for
+ public use through a CC0 waiver. All licenses certified by the Open Source
+ Initiative at opensource.org as of January 9, 2013 and all Creative Commons
+ licenses identified on the creativecommons.org website as of January 9,
+ 2013, including the Public License Fallback of the CC0 waiver, satisfy these
+ requirements for the purposes of this license.
+
+ 1.6 “Open Source Software Implementation” means a Software
+ Implementation in which the software implicating the Licensed Patents is
+ Open Source Software. Open Source Software Implementation does not include
+ any Software Implementation in which the software implicating the Licensed
+ Patents is combined, so as to form a larger program, with software that is
+ not Open Source Software.
+
+ 2 License Grant
+
+ 2.1 License. Subject to your compliance with the terms of this license,
+ including the restriction set forth in Section 2.2, Licensor hereby
+ grants to you a perpetual, worldwide, non-exclusive, non-transferable,
+ non-sublicenseable, no-charge, royalty-free, irrevocable license to practice
+ any invention claimed in the Licensed Patents in any Open Source Software
+ Implementation.
+
+ 2.2 Restriction. If you or your affiliates institute patent litigation
+ (including, but not limited to, a cross-claim or counterclaim in a lawsuit)
+ against any entity alleging that any Use authorized by this license
+ infringes another patent, then any rights granted to you under this license
+ automatically terminate as of the date such litigation is filed.
+
+ 3 Disclaimer
+ YOUR USE OF THE LICENSED PATENTS IS AT YOUR OWN RISK AND UNLESS REQUIRED
+ BY APPLICABLE LAW, LICENSOR MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY
+ KIND CONCERNING THE LICENSED PATENTS OR ANY PRODUCT EMBODYING ANY LICENSED
+ PATENT, EXPRESS OR IMPLIED, STATUTORY OR OTHERWISE, INCLUDING, WITHOUT
+ LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, FITNESS FOR A PARTICULAR
+ PURPOSE, OR NONINFRINGEMENT. IN NO EVENT WILL LICENSOR BE LIABLE FOR ANY
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ ARISING FROM OR RELATED TO ANY USE OF THE LICENSED PATENTS, INCLUDING,
+ WITHOUT LIMITATION, DIRECT, INDIRECT, INCIDENTAL, CONSEQUENTIAL, PUNITIVE
+ OR SPECIAL DAMAGES, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF
+ SUCH DAMAGES PRIOR TO SUCH AN OCCURRENCE.
+#+end_quote
diff --git a/grub-core/lib/libgcrypt/README b/grub-core/lib/libgcrypt/README
new file mode 100644
index 000000000..7733dbdf9
--- /dev/null
+++ b/grub-core/lib/libgcrypt/README
@@ -0,0 +1,278 @@
+                   Libgcrypt - The GNU Crypto Library
+                  ------------------------------------
+                             Version 1.11
+
+       Copyright (C) 1989,1991-2018 Free Software Foundation, Inc.
+       Copyright (C) 2012-2024 g10 Code GmbH
+       Copyright (C) 2013-2024 Jussi Kivilinna
+
+    Libgcrypt is free software.  See the file AUTHORS for full copying
+    notices, and LICENSES for notices about contributions that require
+    these additional notices to be distributed.
+
+
+    Overview
+    --------
+
+    Libgcrypt is a general purpose crypto library based on the code
+    used in GnuPG.  Libgcrypt depends on the library `libgpg-error',
+    which must be installed correctly before Libgcrypt is to be built.
+    Libgcrypt is distributed under the LGPL, see the section "License"
+    below for details.
+
+
+    Build Instructions
+    ------------------
+
+    The download canonical location for libgcrypt is:
+
+      https://gnupg.org/ftp/gcrypt/libgcrypt/
+
+    To build libgcrypt you need libgpg-error:
+
+      https://gnupg.org/ftp/gcrypt/libgpg-error/
+
+    You should get the latest versions of course.
+
+    After building and installing the libgpg-error package, you may
+    continue with Libgcrypt installation as with almost all GNU
+    packages, you just have to do
+
+       ./configure
+       make
+       make check
+       make install
+
+    The "make check" is not required but a good idea to see whether
+    the library works as expected.  The check takes some while and
+    prints some benchmarking results.  Before doing "make install" you
+    probably need to become root.
+
+    To build libgcrypt for Microsoft Windows, you need to have the
+    mingw32 cross-building toolchain installed.  Instead of running a
+    plain configure you use
+
+      ./autogen.sh --build-w32
+      make
+      make install
+
+    By default this command sequence expects a libgpg-error
+    installed below $HOME/w32root and installs libgcrypt to that
+    directory too.  See the autogen.sh code for details.
+
+    The documentation is available as an Info file (gcrypt.info).  To
+    build documentation in PDF, run this:
+
+      cd doc
+      make pdf
+
+
+
+    Mailing List
+    ------------
+
+    You may want to join the developer's mailing list
+    gcrypt-devel@gnupg.org by sending mail with a subject of
+    "subscribe" to gcrypt-devel-request@gnupg.org.  An archive of this
+    list is available at https://lists.gnupg.org .
+
+
+    Configure options
+    -----------------
+    Here is a list of configure options which are sometimes useful
+    for installation.
+
+     --enable-large-data-tests
+                     With this option a "make check" will take really
+                     long due to extra checks for the hash algorithms.
+
+     --disable-asm
+                     Do not use assembler modules.  It is not possible
+                     to use this on some CPU types.
+
+     --enable-ld-version-script
+                     Libgcrypt tries to build a library where internal
+                     symbols are not exported.  This requires support
+                     from ld and is currently enabled for a few OSes.
+                     If you know that your ld supports the so called
+                     ELF version scripts, you can use this option to
+                     force its use.  OTOH, if you get error message
+                     from the linker, you probably want to use this
+                     option to disable the use of version scripts.
+                     Note, that you should never ever use an
+                     undocumented symbol or one which is prefixed with
+                     an underscore.
+
+     --enable-ciphers=list
+     --enable-pubkey-ciphers=list
+     --enable-digests=list
+                     If not otherwise specified, all algorithms
+                     included in the libgcrypt source tree are built.
+                    An exception are algorithms, which depend on
+                    features not provided by the system, like 64bit
+                    data types.  With these switches it is possible
+                     to select exactly those algorithm modules, which
+                    should be built.  The algorithms are to be
+                     separated by spaces, commas or colons.  To view
+                     the list used with the current build the program
+                     tests/version may be used.
+
+     --disable-endian-check
+                     Don't let configure test for the endianness but
+                     try to use the OS provided macros at compile
+                     time.  This is helpful to create OS X fat binaries.
+
+     --enable-random-daemon
+                     Include support for a global random daemon and
+                     build the daemon.  This is an experimental feature.
+
+     --enable-mpi-path=EXTRA_PATH
+                     Prepend EXTRA_PATH to list of CPU specific
+                     optimizations.  For example, if you want to add
+                     optimizations for an Intel Pentium 4 compatible
+                     CPU, you may use
+                        --enable-mpi-path=pentium4/sse2:pentium4/mmx
+                     Take care: The generated library may crash on
+                     non-compatible CPUs.
+
+     --enable-random=NAME
+                     Force the use of the random gathering module
+                    NAME.  Default is either to use /dev/random or
+                    the auto mode.  Possible values for NAME are:
+                      egd - Use the module which accesses the
+                            Entropy Gathering Daemon. See the webpages
+                            for more information about it.
+                     unix - Use the standard Unix module which does not
+                            have a very good performance.
+                    linux - Use the module which accesses /dev/random.
+                            This is the first choice and the default one
+                            for GNU/Linux or *BSD.
+                      auto - Compile linux, egd and unix in and
+                             automagically select at runtime.
+
+     --enable-hmac-binary-check
+                     Include support to check the binary at runtime
+                     against a HMAC checksum.  This works only in FIPS
+                     mode on systems providing the dladdr function and using
+                     the ELF binary format.
+
+     --with-fips-module-version=version
+                     Specify a string used as a module version for FIPS
+                     certification purposes.
+
+     --disable-padlock-support
+                     Disable support for the PadLock engine of VIA
+                     processors.  The default is to use PadLock if
+                     available.  Try this if you get problems with
+                     assembler code.
+
+     --disable-aesni-support
+                     Disable support for the AES-NI instructions of
+                     newer Intel CPUs.  The default is to use AES-NI
+                     if available.  Try this if you get problems with
+                     assembler code.
+
+     --disable-O-flag-munging
+                     Some code is too complex for some compilers while
+                     in higher optimization modes, thus the compiler
+                     invocation is modified to use a lower
+                     optimization level.  Usually this works very well
+                     but on some platforms these rules break the
+                     invocation.  This option may be used to disable
+                     the feature under the assumption that either good
+                     CFLAGS are given or the compiler can grok the code.
+
+
+
+
+    Build Problems
+    --------------
+
+    If you have a problem with a certain release, please first check
+    the Release-info URL given in the NEWS file.
+
+    We can't check all assembler files, so if you have problems
+    assembling them (or the program crashes) use --disable-asm with
+    ./configure.  If you opt to delete individual replacement files in
+    hopes of using the remaining ones, be aware that the configure
+    scripts may consider several subdirectories to get all available
+    assembler files; be sure to delete the correct ones.  Never delete
+    udiv-qrnnd.S in any CPU directory, because there may be no C
+    substitute (in mpi/generic).  Don't forget to delete
+    "config.cache" and run "./config.status --recheck".  We got a few
+    reports about problems using versions of gcc earlier than 2.96
+    along with a non-GNU assembler (as).  If this applies to your
+    platform, you can either upgrade gcc to a more recent version, or
+    use the GNU assembler.
+
+    Some make tools are broken - the best solution is to use GNU's
+    make.  Try gmake or grab the sources from a GNU archive and
+    install them.
+
+    Specific problems on some machines:
+
+      * AArch64 (GCC 11.1 and 11.2)
+
+       Because of the bug in GCC (fixed in 11.3), with the option
+       -O3, vectorization results in wrong code for the function
+       buf_eq_const.  Please use -O2 or -fno-tree-loop-vectorize.
+
+      * IBM RS/6000 running AIX
+
+       Due to a change in gcc (since version 2.8) the MPI stuff may
+       not build. In this case try to run configure using:
+           CFLAGS="-g -O2 -mcpu=powerpc" ./configure
+
+      * SVR4.2 (ESIX V4.2 cc)
+
+        Due to problems with the ESIX as(1), you probably want to do:
+            CFLAGS="-O -K pentium" ./configure --disable-asm
+
+      * SunOS 4.1.4
+
+         ./configure ac_cv_sys_symbol_underscore=yes
+
+      * Sparc64 CPUs
+
+        We have reports about failures in the AES module when
+        compiling using gcc (e.g. version 4.1.2) and the option -O3;
+        using -O2 solves the problem.
+
+
+    License
+    -------
+
+    The library is distributed under the terms of the GNU Lesser
+    General Public License (LGPL); see the file COPYING.LIB for the
+    actual terms.
+
+    The helper programs as well as the documentation are distributed
+    under the terms of the GNU General Public License (GPL); see the
+    file COPYING for the actual terms.
+
+    The file LICENSES has notices about contributions that require
+    that these additional notices are distributed.
+
+
+    Contact
+    -------
+
+    See the file AUTHORS.
+
+    Commercial grade support for Libgcrypt is available; for a listing
+    of offers see https://www.gnupg.org/service.html .
+
+    Since 2001 maintenance and development of Libgcrypt is done by g10
+    Code GmbH and was mostly financed by donations; since 2022 a raise
+    in revenues from support contracts allows to fully finance the
+    development without resorting to donations.  Many thanks to our
+    paid developers for their work and also a big thank you to Jussi
+    Kivilinna for all of his performance work.
+
+  This file is Free Software; as a special exception the authors gives
+  unlimited permission to copy and/or distribute it, with or without
+  modifications, as long as this notice is preserved. For conditions
+  of the whole package, please see the file COPYING.  This file is
+  distributed in the hope that it will be useful, but WITHOUT ANY
+  WARRANTY, to the extent permitted by law; without even the implied
+  warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
diff --git a/grub-core/lib/libgcrypt/README.GIT 
b/grub-core/lib/libgcrypt/README.GIT
new file mode 100644
index 000000000..ee2c6383f
--- /dev/null
+++ b/grub-core/lib/libgcrypt/README.GIT
@@ -0,0 +1,49 @@
+If you are building from GIT, run the script
+
+./autogen.sh
+
+first, to make sure that all the necessary maintainer tools
+are installed and to build the actual configuration files.  If you
+have just checked out from GIT, you should add the option "--force" to
+autogen.sh so that meta data is noticed by autom4te.cache.  Then run
+
+./configure --enable-maintainer-mode
+
+followed by the usual make.
+
+If autogen.sh complains about insufficient versions of the required
+tools, or the tools are not installed, you may use environment
+variables to override the default tool names:
+
+ AUTOMAKE_SUFFIX  is used as a suffix for all tools from the automake
+                  package.  For example
+                     AUTOMAKE_SUFFIX="-1.7" ./autogen.sh
+                  uses "automake-1.7" and "aclocal-1.7".
+ AUTOMAKE_PREFIX  is used as a prefix for all tools from the automake
+                  package and may be combined with AUTOMAKE_SUFFIX. e.g.:
+                    AUTOMAKE_PREFIX=/usr/foo/bin ./autogen.sh
+                  uses "automake" and "aclocal" in the /usr/foo/bin
+                  directory.
+ AUTOCONF_SUFFIX  is used as a suffix for all tools from the autoconf
+                  package
+ AUTOCONF_PREFIX  is used as a prefix for all tools from the autoconf
+                  package
+ GETTEXT_SUFFIX   is used as a suffix for all tools from the gettext
+                  package
+ GETTEXT_PREFIX   is used as a prefix for all tools from the gettext
+                  package
+
+It is also possible to use the variable name AUTOMAKE, AUTOCONF,
+ACLOCAL, AUTOHEADER, GETTEXT and MSGMERGE to directly specify the name
+of the programs to run.  It is however better to use the suffix and
+prefix forms as described above because that does not require
+knowledge about the actual tools used by autogen.sh.
+
+
+Please don't use autopoint, libtoolize or autoreconf unless you are
+the current maintainer and want to update the standard configuration
+files.  All those files should be in GIT and only updated manually
+if the maintainer decides that newer versions are required.  The
+maintainer should also make sure that the required version of automake
+et al. are properly indicated at the top of configure.ac and take care
+to copy the files and not merely use symlinks.
diff --git a/grub-core/lib/libgcrypt/THANKS b/grub-core/lib/libgcrypt/THANKS
new file mode 100644
index 000000000..6a44eade0
--- /dev/null
+++ b/grub-core/lib/libgcrypt/THANKS
@@ -0,0 +1,168 @@
+Libgcrypt is based on the GnuPG code.  Here is a list of people, who
+helped in GnuPG and Libgcrypt development.  Please help us to keep it
+complete and free of errors.
+
+Albert Chin                china at thewrittenword com
+Allan Clark               allanc@sco.com
+Anand Kumria              wildfire@progsoc.uts.edu.au
+Andreas Metzler            ametzler at downhill.at.eu.org
+Ariel T Glenn             ariel@columbia.edu
+Aurelien Jarno             aurel32 at debian.org
+Ben Hutchings              ben decadent org uk
+Bodo Moeller              Bodo_Moeller@public.uni-hamburg.de
+Brenno de Winter          brenno@dewinter.com
+Brian Moore               bem@cmc.net
+Brian Warner              warner@lothar.com
+Brieuc Jeunhomme          bbp@via.ecp.fr
+Bryan Fullerton           bryanf@samurai.com
+Caskey L. Dickson         caskey@technocage.com
+Cees van de Griend        cees-list@griend.xs4all.nl
+Charles Levert            charles@comm.polymtl.ca
+Christian Biere            christianbiere@gmx.de
+Christian Grothoff         christian at grothoff org
+Christian von Roques      roques@pond.sub.org
+Christopher Oliver        oliver@fritz.traverse.net
+Christian Recktenwald     chris@citecs.de
+Daiki Ueno                 ueno at unixuser org
+Dan Fandrich               dan at coneharvesters com
+Daniel Eisenbud           eisenbud@cs.swarthmore.edu
+Daniel Koening            dan@mail.isis.de
+David Ellement            ellement@sdd.hp.com
+Detlef Lannert            lannert@lannert.rz.uni-duesseldorf.de
+Dirk Lattermann           dlatt@t-online.de
+Dirk Stoecker              gcrypt@dstoecker.de
+Ed Boraas                 ecxjo@esperanto.org
+Elie De Brauwer            elie@de-brauwer.be
+Enzo Michelangeli         em@MailAndNews.com
+Ernst Molitor             ernst.molitor@uni-bonn.de
+Fabian Keil                fk at fabiankeil de
+Fabio Coatti              cova@felix.unife.it
+Felix von Leitner         leitner@amdiv.de
+Frank Heckenbach          heckenb@mi.uni-erlangen.de
+Frank Stajano             frank.stajano@cl.cam.ac.uk
+Gabriele Monti             psicus78 gmail com
+Gaël Quéri                gqueri@mail.dotcom.fr
+Gregor Riepl               seto-kun@freesurf.ch
+Gerlinde Klaes             gk@u64.de
+Greg Louis                glouis@dynamicro.on.ca
+Greg Troxel               gdt@ir.bbn.com
+Gregory Steuck            steuck@iname.com
+Geoff Keating             geoffk@ozemail.com.au
+Harald Denker             harry@hal.westfalen.de
+Hendrik Buschkamp         buschkamp@rheumanet.org
+Holger Schurig            holger@d.om.org
+Hugh Daniel               hugh@toad.com
+Ian McKellar              imckellar@harvestroad.com.au
+Ian Peters                 itp@ximian.com
+Janusz A. Urbanowicz      alex@bofh.torun.pl
+James Troup               james@nocrew.org
+Jean-loup Gailly          gzip@prep.ai.mit.edu
+Jeff Johnson               jbj@redhat.com
+Jens Bachem               bachem@rrz.uni-koeln.de
+J Horacio MG              homega@ciberia.es
+Joachim Backes            backes@rhrk.uni-kl.de
+Jordi Mallach              jordi@sindominio.net
+John A. Martin            jam@jamux.com
+Johnny Teveßen            j.tevessen@gmx.de
+Jörg Schilling            schilling@fokus.gmd.de
+Jun Kuriyama              kuriyama@sky.rim.or.jp
+Karl Fogel                kfogel@guanabana.onshore.com
+Karsten Thygesen          karthy@kom.auc.dk
+Katsuhiro Kondou          kondou@nec.co.jp
+Kazu Yamamoto             kazu@iijlab.net
+Lars Kellogg-Stedman      lars@bu.edu
+Lee Fisher                 blibbet at gmail dot com
+Marco d'Itri               md@linux.it
+Mark Adler                madler@alumni.caltech.edu
+Mark Elbrecht             snowball3@bigfoot.com
+Markus Friedl             Markus.Friedl@informatik.uni-erlangen.de
+Matthias Urlichs           smurf@smurf.noris.de
+Martin Kahlert            martin.kahlert@provi.de
+Martin Hamilton
+Martin Schulte            schulte@thp.uni-koeln.de
+Matthew Skala             mskala@ansuz.sooke.bc.ca
+Max Kellermann             max@duempel.org
+Max Valianskiy            maxcom@maxcom.ml.org
+Michael Fischer v. Mollard mfvm@gmx.de
+Michael Roth              mroth@nessie.de
+Michael Sobolev           mss@despair.transas.com
+Michele Baldessari        michele@pupazzo.org
+Modestas Vainius          geromanas@mailas.com
+Neil Dunbar                neil.dunbar at pobox.com
+Neil Spring               nspring@cs.washington.edu
+Newton Hammet              newton@hammet.net
+Nicolas Graner            Nicolas.Graner@cri.u-psud.fr
+NIIBE Yutaka              gniibe@chroot.org
+Niklas Hernaeus
+Nikolay Sturm             sturm@sec.informatik.tu-darmstadt.de
+Nikos Mavroyanopoulos      nmav@hellug.gr
+Nimrod Zimerman           zimerman@forfree.at
+N J Doye                  nic@niss.ac.uk
+Oliver Haakert            haakert@hsp.de
+Oskari Jääskeläinen       f33003a@cc.hut.fi
+Paul D. Smith             psmith@baynetworks.com
+Philippe Laliberte        arsphl@oeil.qc.ca
+Peter Gutmann             pgut001@cs.auckland.ac.nz
+QingLong                  qinglong@bolizm.ihep.su
+Rafael Ávila de Espíndola  rafael.espindola@gmail.com
+Rafaël Carré               funman@videolan.org
+Ralf Fassel                ralf@akutech.de
+Ralf Hildebrandt           Ralf.Hildebrandt@innominate.com
+Ralf Schneider             ralf@tapfere-schneiderleins.de
+Ralph Gillen              gillen@theochem.uni-duesseldorf.de
+Rami Lehti                 Rami.Lehti@finland.sun.com
+Randolph Chung             tausq@debian.org
+Randy                     mcclellr@oit.edu
+Rat                       ratinox@peorth.gweep.net
+Reinhard Wobst            R.Wobst@ifw-dresden.de
+Rémi Guyomarch            rguyom@mail.dotcom.fr
+Reuben Sumner             rasumner@wisdom.weizmann.ac.il
+Richard Outerbridge       outer@interlog.com
+Roddy Strachan            roddy@satlink.com.au
+Roland Rosenfeld          roland@spinnaker.rhein.de
+Ross Golder               rossigee@bigfoot.com
+Serge Munhoven            munhoven@mema.ucl.ac.be
+Sergi Blanch i Torné       sergi at calcurco cat
+Simon Josefsson            jas@extundo.com
+SL Baur                   steve@xemacs.org
+Stephan Austermuehle       au@hcsd.de
+Stephan Müller             smueller at atsec com
+Stephane Corthesy          stephane@sente.ch
+Stefan Karrmann           S.Karrmann@gmx.net
+Stefan Keller             dres@cs.tu-berlin.de
+Stefan Krüger              stadtkind2 at gmx de
+Steffen Ullrich           ccrlphr@xensei.com
+Steffen Zahn              zahn@berlin.snafu.de
+Steven Bakker             steven@icoe.att.com
+Susanne Schultz           schultz@hsp.de
+Sven Bjorn
+Szakats Istvan             szaki.ms@gmail.com
+Thiago Jung Bauermann     jungmann@cwb.matrix.com.br
+Thomas Roessler           roessler@guug.de
+Tom Holroyd                tomh@po.crl.go.jp
+Tom Spindler              dogcow@home.merit.edu
+Tom Zerucha               tzeruch@ceddec.com
+Tomas Fasth               tomas.fasth@twinspot.net
+Tommi Komulainen           Tommi.Komulainen@iki.fi
+Thomas Mikkelsen          tbm@image.dk
+Ulf Möller                3umoelle@informatik.uni-hamburg.de
+Umberto Salsi             salsi@icosaedro.it
+Uoti Urpala
+Urko Lusa                 ulusa@euskalnet.net
+Victor Stinner             haypo@inl.fr
+Walter Koch               koch@u32.de
+Werner Koch               wk@gnupg.org
+Wim Vandeputte            wim@kd85.com
+                          nbecker@hns.com
+
+
+ Copyright 1998, 1999, 2000, 2001, 2002, 2003,
+           2009, 2011 Free Software Foundation, Inc.
+
+ This file is free software; as a special exception the author gives
+ unlimited permission to copy and/or distribute it, with or without
+ modifications, as long as this notice is preserved.
+
+ This file is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY, to the extent permitted by law; without even the
+ implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
diff --git a/grub-core/lib/libgcrypt/VERSION b/grub-core/lib/libgcrypt/VERSION
new file mode 100644
index 000000000..1cac385c6
--- /dev/null
+++ b/grub-core/lib/libgcrypt/VERSION
@@ -0,0 +1 @@
+1.11.0
diff --git a/grub-core/lib/libgcrypt/cipher/ChangeLog 
b/grub-core/lib/libgcrypt/cipher/ChangeLog
deleted file mode 100644
index 1b3694f58..000000000
--- a/grub-core/lib/libgcrypt/cipher/ChangeLog
+++ /dev/null
@@ -1,3990 +0,0 @@
-2010-08-19  Werner Koch  <wk@g10code.com>
-
-       * cipher.c (gcry_cipher_open): Remove double release of the module.
-       Fixes bug#1263.
-
-2010-06-10  Jeff Johnson  <n3npq@mac.com>  (wk)
-
-       * ecc.c (ecc_generate_ext): Parse transient-key flag.
-       (generate_key): Add arg TRANSIENT_KEY and use it to set the random
-       level.
-
-2010-04-12  Brad Hards  <bradh@frogmouth.net>  (wk)
-
-       Spelling fixes.
-
-2010-03-26  Werner Koch  <wk@g10code.com>
-
-       * tiger.c (asn): Unfetter the old TIGER from an OID.
-       (TIGER_CONTEXT): Add field VARIANT.
-       (tiger_init): Factor code out to ...
-       (do_init): New.
-       (tiger1_init, tiger2_init): New.
-       (_gcry_digest_spec_tiger1, _gcry_digest_spec_tiger2): New.
-       * md.c (digest_table): Add TIGER1 and TIGER2 variants.
-
-2009-12-11  Werner Koch  <wk@g10code.com>
-
-       * sha256.c (Cho, Maj, Sum0, Sum1): Turn macros into inline
-       functions.
-       (transform): Partly unroll to interweave the chain variables
-
-       * sha512.c (ROTR, Ch, Maj, Sum0, Sum1): Turn macros into inline
-       functions.
-       (transform): Partly unroll to interweave the chain variables.
-       Suggested by Christian Grothoff.
-
-2009-12-10  Werner Koch  <wk@g10code.com>
-
-       * Makefile.am (o_flag_munging): New.
-       (tiger.o, tiger.lo): Use it.
-
-       * cipher.c (do_ctr_encrypt): Add arg OUTBUFLEN.  Check for
-       suitable value.  Add check for valid inputlen.  Wipe temporary
-       memory.
-       (do_ctr_decrypt): Likewise.
-       (do_cbc_encrypt, do_cbc_decrypt): Add arg OUTBUFLEN.  Check for
-       suitable value.  Move check for valid inputlen to here; change
-       returned error from INV_ARG to INV_LENGTH.
-       (do_ecb_encrypt, do_ecb_decrypt): Ditto.
-       (do_cfb_encrypt, do_cfb_decrypt): Ditto.
-       (do_ofb_encrypt, do_ofb_decrypt): Ditto.
-       (cipher_encrypt, cipher_encrypt): Adjust for above changes.
-       (gcry_cipher_encrypt, gcry_cipher_decrypt): Simplify.
-
-2009-12-09  Werner Koch  <wk@g10code.com>
-
-       * cipher.c (gcry_cipher_open): Allow for GCRY_CIPHER_MODE_AESWRAP.
-       (cipher_encrypt, cipher_decrypt): Ditto.
-       (do_aeswrap_encrypt, do_aeswrap_decrypt): New.
-       (struct gcry_cipher_handle): Add field marks.
-       (cipher_setkey, cipher_setiv): Update marks flags.
-       (cipher_reset): Reset marks.
-       (cipher_encrypt, cipher_decrypt): Add new arg OUTBUFLEN.
-       (gcry_cipher_encrypt, gcry_cipher_decrypt): Pass outbuflen to
-       cipher_encrypt.  Replace GPG_ERR_TOO_SHORT by
-       GPG_ERR_BUFFER_TOO_SHORT.
-
-2009-08-21  Werner Koch  <wk@g10code.com>
-
-       * dsa.c (dsa_generate_ext): Release retfactors array before
-       setting it to NULL.  Reported by Daiko Ueno.
-
-2009-07-02  Werner Koch  <wk@g10code.com>
-
-       * md.c (md_read): Fix incomplete check for NULL.
-       Reported by Fabian Kail.
-
-2009-03-31  Werner Koch  <wk@g10code.com>
-
-       * rsa.c (rsa_check_secret_key): Return GPG_ERR_BAD_SECKEY and not
-       GPG_ERR_PUBKEY_ALGO.
-
-2009-02-16  Werner Koch  <wk@g10code.com>
-
-       * rsa.c (generate_x931): Do not initialize TBL with automatic
-       variables.
-       * whirlpool.c, tiger.c, sha256.c, sha1.c, rmd160.c, md5.c
-       * md4.c, crc.c: Remove memory.h.  This is garbage from gnupg.
-       Reported by Dan Fandrich.
-
-2009-01-22  Werner Koch  <wk@g10code.com>
-
-       * ecc.c (compute_keygrip): Remove superfluous const.
-
-2009-01-06  Werner Koch  <wk@g10code.com>
-
-       * rmd160.c (oid_spec_rmd160): Add TeleTrust identifier.
-
-2008-12-10  Werner Koch  <wk@g10code.com>
-
-       * dsa.c (generate): Add arg DOMAIN and use it if specified.
-       (generate_fips186): Ditto.
-       (dsa_generate_ext): Parse and check the optional "domain"
-       parameter and pass them to the generate functions.
-
-       * rijndael.c (rijndael_names): Add "AES128" and "AES-128".
-       (rijndael192_names): Add "AES-192".
-       (rijndael256_names): Add "AES-256".
-
-2008-12-05  Werner Koch  <wk@g10code.com>
-
-       * dsa.c (generate): Add arg TRANSIENT_KEY and use it to detrmine
-       the RNG quality needed.
-       (dsa_generate_ext): Parse the transient-key flag und pass it to
-       generate.
-
-2008-11-28  Werner Koch  <wk@g10code.com>
-
-       * dsa.c (generate_fips186): Add arg DERIVEPARMS and use the seed
-       value if available.
-
-       * primegen.c (_gcry_generate_fips186_2_prime): Fix inner p loop.
-
-2008-11-26  Werner Koch  <wk@g10code.com>
-
-       * primegen.c (_gcry_generate_fips186_3_prime): New.
-       * dsa.c (generate_fips186): Add arg USE_FIPS186_2.
-       (dsa_generate_ext): Parse new flag use-fips183-2.
-
-2008-11-25  Werner Koch  <wk@g10code.com>
-
-       * dsa.c (generate_fips186): New.
-       (dsa_generate_ext): Use new function if derive-parms are given or
-       if in FIPS mode.
-       * primegen.c (_gcry_generate_fips186_2_prime): New.
-
-2008-11-24  Werner Koch  <wk@g10code.com>
-
-       * pubkey.c (gcry_pk_genkey): Insert code to output extrainfo.
-       (pubkey_generate): Add arg R_EXTRAINFO and pass it to the extended
-       key generation function.
-       * rsa.c (gen_x931_parm_xp, gen_x931_parm_xi): New.
-       (generate_x931): Generate params if not given.
-       (rsa_generate_ext): Parse use-x931 flag.  Return p-q-swapped
-       indicator.
-       * dsa.c (dsa_generate_ext): Put RETFACTORS into R_EXTRAINFO if
-       possible.
-
-       * pubkey.c (gcry_pk_genkey): Remove parsing of almost all
-       parameters and pass the parameter S-expression to pubkey_generate.
-       (pubkey_generate): Simplify by requitring modules to parse the
-       parameters. Remove the special cases for Elgamal and ECC.
-       (sexp_elements_extract_ecc): Add arg EXTRASPEC and use it.  Fix
-       small memory leak.
-       (sexp_to_key): Pass EXTRASPEC to sexp_elements_extract_ecc.
-       (pubkey_table) [USE_ELGAMAL]: Add real extraspec.
-       * rsa.c (rsa_generate_ext): Adjust for new calling convention.
-       * dsa.c (dsa_generate_ext): Ditto.
-       * elgamal.c (_gcry_elg_generate): Ditto. Rename to elg_generate_ext.
-       (elg_generate): New.
-       (_gcry_elg_generate_using_x): Remove after merging code with
-       elg_generate_ext.
-       (_gcry_pubkey_extraspec_elg): New.
-       (_gcry_elg_check_secret_key, _gcry_elg_encrypt, _gcry_elg_sign) 
-       (_gcry_elg_verify, _gcry_elg_get_nbits): Make static and remove
-       _gcry_ prefix.
-       * ecc.c (_gcry_ecc_generate): Rename to ecc_generate_ext and
-       adjust for new calling convention.
-       (_gcry_ecc_get_param): Rename to ecc_get_param and make static.
-       (_gcry_pubkey_extraspec_ecdsa): Add ecc_generate_ext and
-       ecc_get_param.
-       
-2008-11-20  Werner Koch  <wk@g10code.com>
-
-       * pubkey.c (pubkey_generate): Add arg DERIVEPARMS.
-       (gcry_pk_genkey): Parse derive-parms and pass it to above.
-       * rsa.c (generate_x931): New.
-       (rsa_generate_ext): Add arg DERIVEPARMS and call new function in
-       fips mode or if DERIVEPARMS is given.
-       * primegen.c (_gcry_derive_x931_prime, find_x931_prime): New.
-
-2008-11-19  Werner Koch  <wk@g10code.com>
-
-       * rsa.c (rsa_decrypt): Use gcry_create_nonce for blinding.
-       (generate): Rename to generate_std.
-
-2008-11-05  Werner Koch  <wk@g10code.com>
-
-       * md.c (md_open): Use a switch to set the Bsize.
-       (prepare_macpads): Fix long key case for SHA384 and SHA512.
-
-       * cipher.c (gcry_cipher_handle): Add field EXTRASPEC.
-       (gcry_cipher_open): Set it.
-       (gcry_cipher_ctl): Add private control code to disable weak key
-       detection and to return the current input block.
-       * des.c (_tripledes_ctx): Add field FLAGS.
-       (do_tripledes_set_extra_info): New.
-       (_gcry_cipher_extraspec_tripledes): Add new function.
-       (do_tripledes_setkey): Disable weak key detection.
-
-2008-10-24  Werner Koch  <wk@g10code.com>
-
-       * md.c (digest_table): Allow MD5 in fips mode.
-       (md_register_default): Take special action for MD5.
-       (md_enable, gcry_md_hash_buffer): Ditto.
-
-2008-09-30  Werner Koch  <wk@g10code.com>
-
-       * rijndael.c (do_setkey): Properly align "t" and "tk".
-       (prepare_decryption): Properly align "w".  Fixes bug #936.
-
-2008-09-18  Werner Koch  <wk@g10code.com>
-
-       * pubkey.c (gcry_pk_genkey): Parse domain parameter.
-       (pubkey_generate): Add new arg DOMAIN and remove special case for
-       DSA with qbits.
-       * rsa.c (rsa_generate): Add dummy args QBITS, NAME and DOMAIN and
-       rename to rsa_generate_ext.  Change caller.
-       (_gcry_rsa_generate, _gcry_rsa_check_secret_key) 
-       (_gcry_rsa_encrypt, _gcry_rsa_decrypt, _gcry_rsa_sign) 
-       (_gcry_rsa_verify, _gcry_rsa_get_nbits): Make static and remove
-       _gcry_ prefix.
-       (_gcry_pubkey_spec_rsa, _gcry_pubkey_extraspec_rsa): Adjust names.
-       * dsa.c (dsa_generate_ext): New.
-       (_gcry_dsa_generate): Replace code by a call to dsa_generate.
-       (_gcry_dsa_check_secret_key, _gcry_dsa_sign, _gcry_dsa_verify)
-       (_gcry_dsa_get_nbits): Make static and remove _gcry prefix.
-       (_gcry_dsa_generate2): Remove.
-       (_gcry_pubkey_spec_dsa): Adjust to name changes.
-       (_gcry_pubkey_extraspec_rsa): Add dsa_generate_ext.
-
-2008-09-16  Werner Koch  <wk@g10code.com>
-
-       * ecc.c (run_selftests): Add arg EXTENDED.
-
-2008-09-12  Werner Koch  <wk@g10code.com>
-
-       * rsa.c (test_keys): Do a bad case signature check.
-       * dsa.c (test_keys): Do a bad case check.
-
-       * cipher.c (_gcry_cipher_selftest): Add arg EXTENDED and pass it
-       to the called tests.
-       * md.c (_gcry_md_selftest): Ditto.
-       * pubkey.c (_gcry_pk_selftest): Ditto.
-       * rijndael.c (run_selftests): Add arg EXTENDED and pass it to the
-       called tests.
-       (selftest_fips_128): Add arg EXTENDED and run only one test
-       non-extended mode.
-       (selftest_fips_192): Add dummy arg EXTENDED.
-       (selftest_fips_256): Ditto.
-       * hmac-tests.c (_gcry_hmac_selftest): Ditto.
-       (run_selftests): Ditto.
-       (selftests_sha1): Add arg EXTENDED and run only one test
-       non-extended mode.
-       (selftests_sha224, selftests_sha256): Ditto.
-       (selftests_sha384, selftests_sha512): Ditto.
-       * sha1.c (run_selftests): Add arg EXTENDED and pass it to the
-       called test.
-       (selftests_sha1): Add arg EXTENDED and run only one test
-       non-extended mode.
-       * sha256.c (run_selftests): Add arg EXTENDED and pass it to the
-       called tests.
-       (selftests_sha224): Add arg EXTENDED and run only one test
-       non-extended mode.
-       (selftests_sha256): Ditto.
-       * sha512.c (run_selftests): Add arg EXTENDED and pass it to the
-       called tests.
-       (selftests_sha384): Add arg EXTENDED and run only one test
-       non-extended mode.
-       (selftests_sha512): Ditto.
-       * des.c (run_selftests): Add arg EXTENDED and pass it to the
-       called test.
-       (selftest_fips): Add dummy arg EXTENDED.
-       * rsa.c (run_selftests): Add dummy arg EXTENDED.
-
-       * dsa.c (run_selftests): Add dummy arg EXTENDED.
-
-       * rsa.c (extract_a_from_sexp): New.
-       (selftest_encr_1024): Check that the ciphertext does not match the
-       plaintext.
-       (test_keys): Improve tests and return an error status.
-       (generate): Return an error if test_keys fails.
-       * dsa.c (test_keys): Add comments and return an error status.
-       (generate): Return an error if test_keys failed.
-
-2008-09-11  Werner Koch  <wk@g10code.com>
-
-       * rsa.c (_gcry_rsa_decrypt): Return an error instead of calling
-       BUG in case of a practically impossible condition.
-       (sample_secret_key, sample_public_key): New.
-       (selftest_sign_1024, selftest_encr_1024): New.
-       (selftests_rsa): Implement tests.
-       * dsa.c (sample_secret_key, sample_public_key): New.
-       (selftest_sign_1024): New.
-       (selftests_dsa): Implement tests.
-
-2008-09-09  Werner Koch  <wk@g10code.com>
-
-       * hmac-tests.c (selftests_sha1): Add tests.
-       (selftests_sha224, selftests_sha384, selftests_sha512): Make up tests.
-
-       * hash-common.c, hash-common.h: New.
-       * sha1.c (selftests_sha1): Add 3 tests.
-       * sha256.c (selftests_sha256, selftests_sha224): Ditto.
-       * sha512.c (selftests_sha512, selftests_sha384): Ditto.
-
-2008-08-29  Werner Koch  <wk@g10code.com>
-
-       * pubkey.c (gcry_pk_get_keygrip): Remove the special case for RSA
-       and check whether a custom computation function has been setup.
-       * rsa.c (compute_keygrip): New.
-       (_gcry_pubkey_extraspec_rsa): Setup this function.
-       * ecc.c (compute_keygrip): New.
-       (_gcry_pubkey_extraspec_ecdsa): Setup this function.
-
-2008-08-28  Werner Koch  <wk@g10code.com>
-
-       * cipher.c (cipher_decrypt, cipher_encrypt): Return an error if
-       mode NONE is used.
-       (gcry_cipher_open): Allow mode NONE only with a debug flag set and
-       if not in FIPS mode.
-
-2008-08-26  Werner Koch  <wk@g10code.com>
-
-       * pubkey.c (pubkey_generate): Add arg KEYGEN_FLAGS.
-       (gcry_pk_genkey): Implement new parameter "transient-key" and
-       pass it as flags to pubkey_generate.
-       (pubkey_generate): Make use of an ext_generate function.
-       * rsa.c (generate): Add new arg transient_key and pass appropriate
-       args to the prime generator.
-       (_gcry_rsa_generate): Factor all code out to ...
-       (rsa_generate): .. new func with extra arg KEYGEN_FLAGS.
-       (_gcry_pubkey_extraspec_ecdsa): Setup rsa_generate.
-       * primegen.c (_gcry_generate_secret_prime) 
-       (_gcry_generate_public_prime): Add new arg RANDOM_LEVEL.
-
-2008-08-21  Werner Koch  <wk@g10code.com>
-
-       * primegen.c (_gcry_generate_secret_prime)
-       (_gcry_generate_public_prime): Use a constant macro for the random
-       level.
-       
-2008-08-19  Werner Koch  <wk@g10code.com>
-
-       * pubkey.c (sexp_elements_extract_ecc) [!USE_ECC]: Do not
-       allow "curve" parameter.
-
-2008-08-15  Werner Koch  <wk@g10code.com>
-
-       * pubkey.c (_gcry_pk_selftest): New.
-       * dsa.c (selftests_dsa, run_selftests): New.
-       * rsa.c (selftests_rsa, run_selftests): New.
-       * ecc.c (selftests_ecdsa, run_selftests): New.
-
-       * md.c (_gcry_md_selftest): New.
-       * sha1.c (run_selftests, selftests_sha1): New.
-       * sha256.c (selftests_sha224, selftests_sha256, run_selftests): New.
-       * sha512.c (selftests_sha384, selftests_sha512, run_selftests): New.
-
-       * des.c (selftest): Remove static variable form selftest.
-       (des_setkey): No on-the-fly self test in fips mode.
-       (tripledes_set3keys): Ditto.
-
-       * cipher.c (_gcry_cipher_setkey, _gcry_cipher_setiv): 
-
-       * dsa.c (generate): Bail out in fips mode if NBITS is less than 1024.
-       * rsa.c (generate): Return an error code if the requested size
-       is less than 1024 and we are in fips mode.
-       (_gcry_rsa_generate): Take care of that error code.
-
-       * ecc.c (generate_curve): In fips mode enable only NIST curves.
-
-       * cipher.c (_gcry_cipher_selftest): New.
-
-       * sha512.c (_gcry_digest_extraspec_sha384)
-       (_gcry_digest_extraspec_sha512): New.
-       * sha256.c (_gcry_digest_extraspec_sha224)
-       (_gcry_digest_extraspec_sha256): New.
-       * sha1.c (_gcry_digest_extraspec_sha1): New.
-       * ecc.c (_gcry_pubkey_extraspec_ecdsa): New.
-       * dsa.c (_gcry_pubkey_extraspec_dsa): New.
-       * rsa.c (_gcry_pubkey_extraspec_rsa): New.
-       * rijndael.c (_gcry_cipher_extraspec_aes)
-       (_gcry_cipher_extraspec_aes192, _gcry_cipher_extraspec_aes256): New.
-       * des.c (_gcry_cipher_extraspec_tripledes): New.
-
-       * cipher.c (gcry_cipher_register): Rename to _gcry_cipher_register.
-       Add arg EXTRASPEC.
-       (dummy_extra_spec): New.
-       (cipher_table_entry): Add extraspec field.
-       * md.c (_gcry_md_register): Rename to _gcry_md_register.  Add
-       arg EXTRASPEC.
-       (dummy_extra_spec): New.
-       (digest_table_entry): Add extraspec field.
-       * pubkey.c (gcry_pk_register): Rename to _gcry_pk_register.  Add
-       arg EXTRASPEC.
-       (dummy_extra_spec): New.
-       (pubkey_table_entry): Add extraspec field.
-
-       * ac.c: Let most public functions return GPG_ERR_UNSUPPORTED in
-       fips mode.
-
-       * pubkey.c (pubkey_table_entry): Add field FIPS_ALLOWED and mark
-       appropriate algorithms.
-       (dummy_generate, dummy_check_secret_key, dummy_encrypt) 
-       (dummy_decrypt, dummy_sign, dummy_verify, dummy_get_nbits): Signal
-       a fips error when used.
-       (gcry_pk_register): In fips mode do not allow to register new
-       algorithms.
-
-       * md.c (digest_table): Add field FIPS_ALLOWED and mark appropriate
-       algorithms.
-       (md_register_default): In fips mode register only fips algorithms.
-       (gcry_md_register): In fips mode do not allow to register new
-       algorithms.
-       (gcry_md_get): Signal a fips error if called.
-       (gcry_md_hash_buffer): Do not allow rmd160 when not in fips mode.
-       (md_start_debug): Disable in fips_mode.
-
-       * md.c (gcry_md_register_default): Rename to ..
-       (md_register_default): .. this.
-       (md_digest): Remove this commented function.
-       * pubkey.c (gcry_pk_register_default): Rename to ..
-       (pk_register_default): .. this.
-
-       * cipher.c (cipher_table_entry): Add field FIPS_ALLOWED.
-       (gcry_cipher_register_default): Register only fips approved
-       algorithms.
-       (gcry_cipher_register): Do not allow to register new ciphers.
-       (cipher_setiv): Signal fips error.
-       
-       * cipher (gcry_cipher_register_default): Rename to ..
-       (cipher_register_default): .. this.
-       (REGISTER_DEFAULT_CIPHERS): Adjust for that change.
-
-2008-07-05  Werner Koch  <wk@g10code.com>
-
-       * random-daemon.c, random.h, rndhw.c, rndunix.c, rand-internal.h
-       * random.c, rndegd.c, rndlinux.c, rndw32.c: Move to ../cipher.
-       * Makefile.am: Remove random stuff.
-
-2008-06-24  Szakats Istvan <szaki.ms@gmail.com>  (wk)
-
-       * ac.c (_gcry_ac_key_destroy, _gcry_ac_key_pair_generate): Release
-       some more memory.
-
-2008-04-22  Werner Koch  <wk@g10code.com>
-
-       * rijndael.c (_gcry_aes_cfb_enc, _gcry_aes_cbc_enc) 
-       (_gcry_aes_cfb_dec, _gcry_aes_cbc_dec): Use Padlock if possible.
-
-2008-04-18  Werner Koch  <wk@g10code.com>
-
-       * sha1.c (transform_aligned): Remove.  That will obviously not
-       work because we need a scratch working area and our internal API
-       does not allow to modify the buffers.
-
-       * rijndael.c: Factor tables out to ..
-       * rijndael-tables.h: .. new.
-
-       * ac.c (ac_data_extract): Make static.
-
-       * camellia.h [HAVE_CONFIG_H]: Include config.h.
-
-       * rndw32.c (registry_poll): Only print the performance data
-       problem warning once.  Suggested by Simon Josefsson.
-
-2008-03-19  Werner Koch  <wk@g10code.com>
-
-       * cipher.c (gcry_cipher_open) [USE_AES]: Init bulk encryption only
-       if requested.  Suggested by Dirk Stoecker.
-
-2008-03-18  Werner Koch  <wk@g10code.com>
-
-       * sha1.c: Include stdint.h.
-       (transform): Add arg NBLOCKS so that we can work on more than one
-       block and avoid updates of the chaining variables.  Changed all
-       callers to use 1.
-       (sha1_write): Replace loop around transform.
-       (transform_aligned) [WORDS_BIGENDIAN]: New.
-       (TRANSFORM): New macro to replace all direct calls of transform.
-
-2008-03-17  Werner Koch  <wk@g10code.com>
-
-       * rijndael.c (_gcry_aes_cfb_dec): New.
-       (do_encrypt): Factor code out to ..
-       (do_encrypt_aligned): .. New.
-       (_gcry_aes_cfb_enc, _gcry_aes_cfb_dec): Use new function.
-       (do_decrypt): Factor code out to ..
-       (do_decrypt_aligned): .. new.
-       (_gcry_aes_cbc_enc, _gcry_aes_cbc_dec): New.
-       * cipher.c (struct gcry_cipher_handle): Put field IV into new
-       union U_IV to enforce proper alignment.  Change all users.
-       (do_cfb_decrypt): Optimize.
-       (do_cbc_encrypt, do_cbc_decrypt): Optimize.
-
-2008-03-15  Werner Koch  <wk@g10code.com>
-
-       * rijndael.c (_gcry_aes_cfb_enc): New.
-       * cipher.c (struct gcry_cipher_handle): Add field ALGO and BULK.
-       (gcry_cipher_open): Set ALGO and BULK.
-       (do_cfb_encrypt): Optimize.
-
-2008-02-18  Werner Koch  <wk@g10code.com>
-
-       * rsa.c (_gcry_rsa_verify) [IS_DEVELOPMENT_VERSION]: Print
-       intermediate results.
-
-2008-01-08  Werner Koch  <wk@g10code.com>
-
-       * random.c (add_randomness): Do not just increment
-       POOL_FILLED_COUNTER but update it by the actual amount of data.
-
-2007-12-13  Werner Koch  <wk@g10code.com>
-
-       * pubkey.c (sexp_data_to_mpi): Support SHA-224.
-
-2007-12-05  Werner Koch  <wk@g10code.com>
-
-       * rijndael.c (USE_PADLOCK): Depend on ENABLE_PADLOCK_SUPPORT.
-       * rndhw.c (USE_PADLOCK): Ditto
-
-       * rsa.c (secret): Fixed condition test for using CRT.  Reported by
-       Dean Scarff.  Fixes bug#864.
-       (_gcry_rsa_check_secret_key): Return an error if the optional
-       parameters are missing.
-       * pubkey.c (sexp_elements_extract): Add arg ALGO_NAME. Changed all
-       callers to pass NULL. Add hack to allow for optional RSA
-       parameters.
-       (sexp_to_key): Pass algo name to sexp_elements_extract.
-
-2007-12-03  Werner Koch  <wk@g10code.com>
-
-       * random.c (gcry_random_add_bytes): Implement it.
-       * rand-internal.h (RANDOM_ORIGIN_EXTERNAL): New.
-
-2007-11-30  Werner Koch  <wk@g10code.com>
-
-       * rndhw.c: New.
-       * rndlinux.c (_gcry_rndlinux_gather_random): Try to read 50%
-       directly from the hwrng.
-       * random.c (do_fast_random_poll): Also run the hw rng fast poll.
-       (_gcry_random_dump_stats): Tell whether the hw rng failed.
-
-2007-11-29  Werner Koch  <wk@g10code.com>
-
-       * rijndael.c (USE_PADLOCK): Define new macro used for ia32.
-       (RIJNDAEL_context) [USE_PADLOCK]: Add fields USE_PADLOCK and
-       PADLOCK_KEY.
-       (do_setkey) [USE_PADLOCK]: Enable padlock if available for 128 bit
-       AES.
-       (do_padlock) [USE_PADLOCK]: New.
-       (rijndael_encrypt, rijndael_decrypt) [USE_PADLOCK]: Divert to
-       do_padlock.
-       * cipher.c (cipher_context_alignment_t): New.  Use it in this
-       module in place of PROPERLY_ALIGNED_TYPE.
-       (NEED_16BYTE_ALIGNED_CONTEXT): Define macro for ia32.
-       (struct gcry_cipher_handle): Add field HANDLE_OFFSET.
-       (gcry_cipher_open): Take care of increased alignment requirements.
-       (gcry_cipher_close): Ditto.
-
-2007-11-28  Werner Koch  <wk@g10code.com>
-
-       * sha256.c (asn224): Fixed wrong template.  It happened due to a
-       bug in RFC4880.  SHA-224 is not in the stable version of libgcrypt
-       so the consequences are limited to users of this devel version.
-
-2007-10-31  Werner Koch  <wk@g10code.com>
-
-       * ac.c (gcry_ac_data_new): Remove due to the visibility wrapper.
-       (gcry_ac_data_destroy, gcry_ac_data_copy, gcry_ac_data_length) 
-       (gcry_ac_data_set, gcry_ac_data_get_name, gcry_ac_data_get_index) 
-       (gcry_ac_data_to_sexp, gcry_ac_data_from_sexp) 
-       (gcry_ac_data_clear, gcry_ac_io_init, gcry_ac_open) 
-       (gcry_ac_close, gcry_ac_key_init, gcry_ac_key_pair_generate) 
-       (gcry_ac_key_pair_extract, gcry_ac_key_destroy) 
-       (gcry_ac_key_pair_destroy, gcry_ac_key_data_get) 
-       (gcry_ac_key_test, gcry_ac_key_get_nbits, gcry_ac_key_get_grip) 
-       (gcry_ac_data_encrypt, gcry_ac_data_decrypt, gcry_ac_data_sign) 
-       (gcry_ac_data_verify, gcry_ac_data_encode, gcry_ac_data_decode) 
-       (gcry_ac_mpi_to_os, gcry_ac_mpi_to_os_alloc, gcry_ac_os_to_mpi) 
-       (gcry_ac_data_encrypt_scheme, gcry_ac_data_decrypt_scheme) 
-       (gcry_ac_data_sign_scheme, gcry_ac_data_verify_scheme) 
-       (gcry_ac_io_init_va): Ditto.
-       (gcry_ac_id_to_name, gcry_ac_name_to_id): Remove as these
-       deprecated functions are now implemented by visibility.c.
-
-2007-10-26  Werner Koch  <wk@g10code.com>
-
-       * rndw32.c: Disable debug flag.
-
-2007-10-25  Werner Koch  <wk@g10code.com>
-
-       * rndw32.c: Updated from current cryptlib snapshot and modified
-       for our use.  Removed support from pre NT systems.
-       (slow_gatherer_windows95): Remove.
-       (_gcry_rndw32_gather_random): Require an NT platform.
-       (init_system_rng, read_system_rng, read_mbm_data): New.
-       (slow_gatherer_windowsNT): Rename to ...
-       (slow_gatherer): .. this.  Read system RNG and MBM.
-       (registry_poll): New with code factored out from slow_gatherer.
-
-2007-08-23  Werner Koch  <wk@g10code.com>
-
-       * random.c (pool_filled_counter): New.
-       (add_randomness): Use it.
-
-2007-08-22  Werner Koch  <wk@g10code.com>
-
-       * rndw32.c, rndunix.c: Switched to LGPL.
-
-2007-05-30  Werner Koch  <wk@g10code.com>
-
-       * camellia.h, camellia.c: Replace by new LGPL version and adjusted
-       camellia.h.
-
-2007-05-09  Marcus Brinkmann  <marcus@g10code.de>
-
-       * ac.c (_gcry_ac_io_init_va, _gcry_ac_io_write, _gcry_ac_io_read):
-       Adjust users of gcry_ac_io_t because union is not anonymous
-       anymore.
-
-2007-05-02  Werner Koch  <wk@g10code.com>
-
-       * camellia-glue.c (camellia_setkey, camellia_encrypt)
-       (camellia_decrypt): Recalculated used stack size in called
-       functions.
-       * camellia.h: Redefine external symbols.
-
-2007-05-02  David Shaw  <dshaw@jabberwocky.com>
-
-       * Makefile.am, cipher.c: Add Camellia.
-
-       * camellia-glue.c: New.  The necessary glue to interface libgcrypt
-       to the stock NTT Camellia distribution.
-
-       * camellia.h, camellia.c: The stock NTT Camellia distribution
-       (GPL).
-
-2007-04-30  David Shaw  <dshaw@jabberwocky.com>
-
-       * cipher.c: Use #if instead of #ifdef as configure defines the
-       USE_cipher defines as 0 for disabled.
-
-2007-04-30  Werner Koch  <wk@g10code.com>
-
-       * rndegd.c (_gcry_rndegd_set_socket_name): New.
-
-2007-04-30  Marcus Brinkmann  <marcus@g10code.de>
-
-       * ecc.c (ec2os): Fix relocation of short numbers.
-
-       * ecc.c (generate_key): Do not allocate D, which will be allocated
-       by GEN_K.  Remove G.  Fix test if g_x, g_y resp. q_x, q_y are
-       requested.
-       (_gcry_ecc_generate): Release unneeded members of SK.
-       * pubkey.c (sexp_to_key): Release NAME.
-
-2007-04-28  Marcus Brinkmann  <marcus@g10code.de>
-
-       * ac.c (gcry_ac_mpi): Remove member NAME_PROVIDED.
-       (ac_data_mpi_copy, _gcry_ac_data_set, _gcry_ac_data_get_name)
-       (_gcry_ac_data_get_index, ac_data_construct): Adjust handling of
-       NAME accordingly.
-
-2007-04-20  Werner Koch  <wk@g10code.com>
-
-       * ecc.c (domain_parms): Add standard brainpool curves.
-
-2007-04-18  Werner Koch  <wk@g10code.com>
-
-       * ecc.c (generate_curve): Implement alias mechanism.
-
-       * pubkey.c (sexp_elements_extract_ecc): New.
-       (sexp_to_key): Add special case for ecc.
-       (sexp_to_key, sexp_to_sig, sexp_to_enc, gcry_pk_genkey): Replace
-       name_terminated stuff by a call to _gcry_sexp_nth_string.
-       (gcry_pk_get_keygrip): Ditto.
-
-2007-04-16  Werner Koch  <wk@g10code.com>
-
-       * ecc.c (_gcry_ecc_generate): Renamed DUMMY to CURVE and use it.
-
-2007-04-13  Marcus Brinkmann  <marcus@g10code.de>
-
-       * ac.c (ac_data_construct): Cast const away to suppress compiler
-       warning.
-
-       * ecc.c (ecc_generate): Avoid compiler warning for unused argument
-       DUMMY.
-       (ecc_verify): Avoid compiler warning for unused arguments CMP and
-       OPAQUEV.
-
-2007-04-06  Werner Koch  <wk@g10code.com>
-
-       * sha1.c (oid_spec_sha1): Add another oid from X9.62.
-
-2007-03-28  Werner Koch  <wk@g10code.com>
-
-       * pubkey.c (gcry_pk_genkey): Do not issue misc-key-info if it is
-       empty.
-       (gcry_pk_genkey): New parameter "curve".
-
-       * ecc.c: Entirely rewritten with only a few traces of the old
-       code left.
-       (_gcry_ecc_generate): New.
-       (generate_key) New arg NAME.
-       (generate_curve): Ditto.  Return actual number of NBITS.
-
-2007-03-26  Werner Koch  <wk@g10code.com>
-
-       * pubkey.c (gcry_pk_genkey): Increase size of SKEY array and add a
-       runtime bounds check.
-
-2007-03-23  Werner Koch  <wk@g10code.com>
-
-       * ecc.c (ecc_ctx_init, ecc_ctx_free, ecc_mod, ecc_mulm): New.
-       (duplicate_point, sum_points, escalar_mult): Don't use a
-       copy of base->p.  Replaced all mpi_mulm by ecc_mulm so that we can
-       experiment with different algorithms.
-       (generate_key, check_secret_key, sign, verify): Initialize a
-       computation context for use by ecc_mulm.
-
-2007-03-22  Werner Koch  <wk@g10code.com>
-
-       * pubkey.c (pubkey_table): Initialize ECC.
-       * Makefile.am (EXTRA_libcipher_la_SOURCES): Add ecc.c.
-       * ecc.c: New. Heavily reformatted and changed for use in libgcrypt.
-       (point_init): New.
-       (escalar_mult): Make arg R the first arg to be similar to the mpi
-       functions.
-       (duplicate_point): Ditto
-       (sum_points): Ditto
-       (sign, verify): Remove unneeded copy operations.
-       (sum_points): Removed memory leaks and optimized some compares.
-       (verify): Simplified input check.
-
-2007-03-14  Werner Koch  <wk@g10code.com>
-
-       * random.c (MASK_LEVEL): Removed macro as it was used only at one
-       place.  Open coded it there.
-       (gcry_randomize, _gcry_update_random_seed_file)
-       (_gcry_fast_random_poll): Factor lock code out to ..
-       (lock_pool, unlock_pool): .. new.
-       (initialize): Lock the pool while allocating.
-       (read_random_source, do_fast_random_poll): Moved initialization to ...
-       (initialize): .. here.
-       (_gcry_enable_quick_random_gen): No more need for initialization.
-       (is_initialized):  Moved this global flag to ..
-       (initialize): .. here and changed all users to unconditionally call
-       initialize.
-       (add_randomness): Remove initialization here.  It simply can't
-       happen. 
-
-       * random.c (enum random_origins): Moved to ..
-       * rand-internal.h: .. here.
-       * rndunix.c (_gcry_rndunix_gather_random): Use enum in prototype
-       for ORIGIN and renamed REQUESTOR to ORIGIN.
-       * rndegd.c (_gcry_rndegd_gather_random): Ditto.
-       * rndlinux.c (_gcry_rndlinux_gather_random): Ditto.
-       * rndw32.c (_gcry_rndw32_gather_random): Ditto.
-       (_gcry_rndw32_gather_random_fast): Ditto.
-
-2007-03-13  Werner Koch  <wk@g10code.com>
-
-       * random.c (enum random_origins): New.
-       (add_randomness): Renamed arg SOURCE to ORIGIN.
-       (read_random_source): Renamed arg REQUESTOR to ORIGIN.
-       (getfnc_gather_random): Removed static variable because this
-       function is only called once and thus we don't need this
-       optimization.
-       (_gcry_quick_random_gen): Removed and replaced by..
-       (_gcry_enable_quick_random_gen): .. this.  It is only used to
-       enable it and it does not make sense to disable it later. Changed
-       the only one caller too.
-       (get_random_bytes): Removed.
-       (gcry_random_bytes, gcry_random_bytes_secure): Implement in terms
-       of gcry_randomize.
-       * random-daemon.c (_gcry_daemon_get_random_bytes): Removed.
-
-2007-02-23  Werner Koch  <wk@g10code.com>
-
-       * elgamal.c (generate): Removed unused variable TEMP.
-       (test_keys): New arg NODIE.
-       (generate_using_x, _gcry_elg_generate_using_x): New.
-       * pubkey.c (pubkey_generate): New arg XVALUE and direct call to
-       the new elgamal generate function.
-       (gcry_pk_genkey): Parse the new "xvalue" tag.
-
-2007-02-22  Werner Koch  <wk@g10code.com>
-
-       * pubkey.c (sexp_data_to_mpi): Handle dynamically allocated
-       algorithms.  Suggested by Neil Dunbar.  Fixes bug#596.
-
-       * rndw32.c (_gcry_rndw32_gather_random_fast): Make it return void.
-
-       * cipher.c (gcry_cipher_algo_name): Simplified.
-
-       * random.c: Use the daemon only if compiled with USE_RANDOM_DAEMON. 
-
-       * Makefile.am (libcipher_la_SOURCES): Build random-daemon support
-       only if requested.
-
-2007-02-21  Werner Koch  <wk@g10code.com>
-
-       * random.c (rndpool, keypool): Make unsigned.
-       (mix_pool): Change char* variables to unsigned char*.
-       (gcry_randomize): Make arg BUFFER a void*.
-       (gcry_create_nonce): Ditto.
-
-       * rmd160.c (gcry_rmd160_mixblock): Make BUFFER a void*.
-       (_gcry_rmd160_hash_buffer): Make OUTBUF and BUFFER void*.
-       * sha1.c (_gcry_sha1_hash_buffer): Ditto.
-
-       * cipher.c (gcry_cipher_encrypt, cry_cipher_decrypt): Change
-       buffer args to void*.
-       (gcry_cipher_register): Make ALGORITHM_ID a int *.
-
-       * md.c (md_start_debug): Make SUFFIX a const char*.  Use snprintf.
-       (gcry_md_debug): New.
-       (gcry_md_ctl): Changed arg BUFFER from unsigned char*.
-
-       * md.c (md_write): Make INBUF a const void*.
-       (gcry_md_write): Remove needless cast.
-       * crc.c (crc32_write): Make INBUF a const void*
-       (update_crc32, crc24rfc2440_write): Ditto.
-       * sha512.c (sha512_write, transform): Ditto.
-       * sha256.c (sha256_write, transform): Ditto.
-       * rmd160.c (rmd160_write, transform): Ditto.
-       * md5.c (md5_write, transform): Ditto.
-       * md4.c (md4_write, transform): Ditto.
-       * sha1.c (sha1_write, transform): Ditto.
-
-       * tiger.c (tiger_write, transform): Ditto.
-       * whirlpool.c (whirlpool_write, whirlpool_add, transform): Ditto.
-
-       * elgamal.c (elg_names): Change to a const*.
-       * dsa.c (dsa_names): Ditto.
-       * rsa.c (rsa_names): Ditto.
-       * pubkey.c (gcry_pk_lookup_func_name): Make ALIASES a const.
-
-2007-02-20  Werner Koch  <wk@g10code.com>
-
-       * rndlinux.c (open_device): Remove unused arg MINOR.
-
-2007-01-30  Werner Koch  <wk@g10code.com>
-
-       * sha256.c (oid_spec_sha256): Add alias from pkcs#1.
-       * sha512.c (oid_spec_sha512): Ditto.
-       (oid_spec_sha384): Ditto.
-
-2006-12-18  Werner Koch  <wk@g10code.com>
-
-       * rndlinux.c (set_cloexec_flag): New.
-       (open_device): Set close-on-exit flags.  Suggested by Max
-       Kellermann.  Fixes Debian#403613.
-
-       * Makefile.am (AM_CPPFLAGS, AM_CFLAGS): Splitted and merged
-       Moritz' changes.
-       (INCLUDES): Removed.
-
-2006-11-30  Werner Koch  <wk@g10code.com>
-
-       * serpent.c (byte_swap_32): Remove trailing semicolon.
-
-2006-11-15  Werner Koch  <wk@g10code.com>
-
-       * Makefile.am (INCLUDES): Include ../src/
-
-2006-11-03  Werner Koch  <wk@g10code.com>
-
-       * random.c [HAVE_GETTIMEOFDAY]: Included sys/time.h and not
-       sys/times.h.  Reported by Rafaël Carré.
-
-2006-11-05  Moritz Schulte  <moritz@g10code.com>
-
-       * Makefile.am (AM_CFLAGS): Added -I$(top_builddir)/src so that the
-       new gcrypt.h is used, not the one installed in the system.
-
-2006-10-25  Werner Koch  <wk@g10code.com>
-
-       * primegen.c (prime_generate_internal): Tweaked use of secure
-       memory and entropy use. Save unused primes from the pool. Allocate
-       at least a pool of 30.
-       (save_pool_prime, get_pool_prime): New.
-
-2006-10-23  Werner Koch  <wk@g10code.com>
-
-       * ac.c (_gcry_ac_data_from_sexp): Reset sexp_tmp for failsafe
-       means.  Release sexp_cur if needed.  Reported by Dirk Stoecker.
-
-       * pubkey.c (pubkeys_registered_lock): Initialized it.  It is not
-        really needed because this is a mere initialization to 0 anyway.
-        Noted by Victor Stinner.
-
-2006-10-17  Werner Koch  <wk@g10code.com>
-
-       * dsa.c (_gcry_dsa_generate2): New.
-       (generate): New arg QBITS.  Add sanity checks for reasonable qbits
-       and nbits.
-       * pubkey.c (gcry_pk_genkey): Parse an qbits element.
-       (pubkey_generate): New arg QBITS.  Pass it to the DSA generation.
-
-2006-10-05  Werner Koch  <wk@g10code.com>
-
-       * md.c (gcry_md_algo_info) <get_asnoid>: Check that the algo is
-       available.
-
-2006-10-04  David Shaw  <dshaw@jabberwocky.com>  (wk)
- 
-       * tiger.c (round): Rename to tiger_round as gcc 4 has a built-in
-       round function that this conflicts with.
- 
-2006-09-11  Werner Koch  <wk@g10code.com>
-
-       * rndw32.c (slow_gatherer_windowsNT): While adding data use the
-       size of the diskPerformance and not its address. Has been fixed in
-       GnuPG more than a year ago.  Noted by Lee Fisher.
-
-2006-08-30  Werner Koch  <wk@g10code.com>
-
-       * pubkey.c (sexp_data_to_mpi): Need to allow "ripemd160" here as
-       this is the canonical name.
-
-2006-08-29  Hye-Shik Chang <perky@FreeBSD.org>  (wk)
-
-       * seed.c: New.
-
-2006-08-03  Werner Koch  <wk@g10code.com>
-
-       * random-daemon.c (_gcry_daemon_initialize_basics): Don't
-       initialize the socket.  Remove arg SOCKETNAME.
-       (connect_to_socket): Make sure that daemon is set to -1 on error.
-       (call_daemon): Initialize the socket on the first call.
-       (_gcry_daemon_randomize, _gcry_daemon_get_random_bytes) 
-       (_gcry_daemon_create_nonce): New arg SOCKETNAME.
-       * random.c (initialize): Call new daemon initializator.
-       (get_random_bytes, gcry_randomize, gcry_create_nonce): Pass socket
-       name to daemon call and reset allow_daemon on failure.
-
-2006-07-26  Werner Koch  <wk@g10code.com>
-
-       * rmd160.c (_gcry_rmd160_mixblock): Add cast to transform call.
-
-       * blowfish.c (selftest): Cast string to unsigned char*.
-
-       * primegen.c (prime_generate_internal): Cast unsigned/char*
-       mismatch in calling m_out_of_n.
-       (is_prime): Changed COUNT to unsigned int *.
-
-       * ac.c (_gcry_ac_data_copy): Initialize DATA_MPIS.
-
-       * random.c (gcry_create_nonce): Update the pid after a fork.
-       Reported by Uoti Urpala.
-
-2006-07-04  Marcus Brinkmann  <marcus@g10code.de>
-
-       * sha512.c: Fix typo in copyright notice.
-
-2006-06-21  Werner Koch  <wk@g10code.com>
-
-       * rsa.c (_gcry_rsa_generate): Replace xcalloc by calloc.
-       * pubkey.c (gcry_pk_encrypt, gcry_pk_sign): Ditto.
-       (sexp_to_key, sexp_to_sig, sexp_to_enc, gcry_pk_encrypt) 
-       (gcry_pk_sign, gcry_pk_genkey, gcry_pk_get_keygrip): Ditto. 
-       * md.c (md_copy): Ditto.
-       
-2006-04-22  Moritz Schulte  <moritz@g10code.com>
-
-       * random-daemon.c (_gcry_daemon_initialize_basics): New argument:
-       SOCKETNAME.  Passing on to connect_to_socket() if non-NULL.
-       (connect_to_socket, writen, readn, call_daemon): New functions.
-       (_gcry_daemon_randomize, _gcry_daemon_get_random_bytes) 
-       (_gcry_daemon_create_nonce): Call call_daemon().
-       (RANDOM_DAEMON_SOCKET): New symbol.
-       (daemon_socket): New static variable.
-
-       * random.h (_gcry_daemon_initialize_basics): New parameter:
-       SOCKETNAME.
-       (_gcry_set_random_daemon_socket): New declaration.
-
-       * random.c (initialize_basics): Pass DAEMON_SOCKET_NAME to
-       _gcry_daemon_initialize_basics.
-       (_gcry_set_random_daemon_socket): New function, setting
-       DAEMON_SOCKET_NAME.
-
-2006-04-01  Moritz Schulte  <moritz@g10code.com>
-
-       * ac.c (eme_pkcs_v1_5_encode): Use KEY_SIZE directly, no need to
-       call gcry_ac_key_get_nbits.
-       (eme_pkcs_v1_5_decode): Likewise.
-       (ac_es_dencode_prepare_pkcs_v1_5): Fill options_em structure with
-       key_size.
-       (_gcry_ac_data_dump, gcry_ac_data_dump): New functions.
-       (_gcry_ac_data_to_sexp, _gcry_ac_data_from_sexp): More or less
-       rewritten; changed S-Expression format so that it matches the one
-       used in pubkey.c.
-
-2006-03-15  Werner Koch  <wk@g10code.com>
-
-       * random-daemon.c: New.
-       * random.c (_gcry_use_random_daemon): New.
-       (get_random_bytes, gcry_randomize, gcry_create_nonce): Try
-       diverting to the daemon functions.
-
-2006-03-14  Werner Koch  <wk@g10code.com>
-
-       * random.c (lock_seed_file): New.
-       (read_seed_file, _gcry_update_random_seed_file): Use it.
-
-       * random.c (gcry_create_nonce):  Detect a fork and re-seed.
-       (read_pool): Fixed the fork detection; it used to work only for
-       multi-threaded processes.
-
-2006-03-12  Brad Hards  <bradh@frogmouth.net>  (wk)
-
-       * md.c (md_open): Use new variable macpads_Bsize instead of
-       hardwiring the block size.  Changed at all places.
-
-2006-03-10  Brad Hards  <bradh@frogmouth.net>  (wk, patch 2005-04-22)
-
-       * md.c, sha256.c:  Add support for SHA-224.
-       (sha224_init): New.
-       
-2006-01-18  Brad Hards  <bradh@frogmouth.net>  (wk 2006-03-07)
-
-       * cipher.c (cipher_encrypt, cipher_decrypt, do_ofb_encrypt)
-       (do_ofb_decrypt, gcry_cipher_open): Implement Output Feedback Mode.
-
-2005-11-02  Moritz Schulte  <moritz@g10code.com>
-
-       * pubkey.c (gcry_pk_algo_name): Return "?" instead of NULL for
-       unknown algorithm IDs.
-       * cipher.c (cipher_algo_to_string): Likewise.
-
-2005-11-01  Moritz Schulte  <moritz@g10code.com>
-
-       * pubkey.c (gcry_pk_algo_info): Don't forget to break after switch
-       case.
-
-2005-09-19  Werner Koch  <wk@g10code.com>
-
-       * dsa.c (generate): Add preliminary support for 2 and 4 keys.
-       Return an error code if the key size is not supported.
-       (_gcry_dsa_generate): Return an error.
-
-2005-08-22  Werner Koch  <wk@g10code.com>
-
-       * primegen.c (check_prime): New arg RM_ROUNDS.
-       (prime_generate_internal): Call it here with 5 rounds as used
-       before.
-       (gcry_prime_check): But here with 64 rounds.
-       (is_prime): Make sure never to use less than 5 rounds.
-
-2005-04-16  Moritz Schulte  <moritz@g10code.com>
-
-       * ac.c (_gcry_ac_init): New function.
-
-2005-04-12  Moritz Schulte  <moritz@g10code.com>
-
-       * ac.c (_gcry_ac_io_write, _gcry_ac_io_read): Initialize err to
-       make the compiler happy.
-       Always use errno, now that gcry_malloc() is guaranteed to set
-       errno on failure.
-       (_gcry_ac_data_to_sexp): Don't forget to goto out after error in
-       loop.
-       (_gcry_ac_data_to_sexp): Remove unused variable: mpi_list;
-       (_gcry_ac_data_to_sexp): Always deallocate sexp_buffer.
-       (_gcry_ac_data_from_sexp): Don't forget to initialize data_set_new.
-       (_gcry_ac_data_from_sexp): Handle special case, which is
-       necessary, since gcry_sexp_nth() does not distinguish between
-       "element does not exist" and "element is the empty list".
-       (_gcry_ac_io_init_va): Use assert to make sure that mode and type
-       are correct.
-       Use gcry_error_t types where gcry_err_code_t types have been used
-       before.
-
-2005-04-11  Moritz Schulte  <moritz@g10code.com>
-
-       * ac.c (_gcry_ac_data_sign_scheme): Don't forget to initialize
-       buffer.
-
-       * whirlpool.c: New file.
-       * md.c (digest_table): Add whirlpool.
-       * Makefile.am (EXTRA_libcipher_la_SOURCES): Added: whirlpool.c.
-
-2005-03-30  Moritz Schulte  <moritz@g10code.com>
-
-       * ac.c (_gcry_ac_data_from_sexp): Use length of SEXP_CUR, not
-       length of SEXP; do not forget to set SEXP_TMP to NULL after it has
-       been released.
-
-       (struct gcry_ac_mpi): New member: name_provided.
-       (_gcry_ac_data_set): Rename variable `name_final' to `name_cp';
-       remove const qualifier; change code to not cast away const
-       qualifiers; use name_provided member as well.
-       (_gcry_ac_data_set, _gcry_ac_data_get_name): Use name_provided
-       member of named mpi structure.
-
-       (gcry_ac_name_to_id): Do not forget to initialize err.
-       (_gcry_ac_data_get_index): Do not forget to initialize mpi_return;
-       use gcry_free() instead of free(); remove unnecessary cast; rename
-       mpi_return and name_return to mpi_cp and name_cp; adjust code.
-       (ac_data_mpi_copy): Do not cast away const qualifier.
-       (ac_data_values_destroy): Likewise.
-       (ac_data_construct): Likewise.
-
-       (ac_data_mpi_copy): Initialize flags to GCRY_AC_FLAG_DEALLOC.
-       (ac_data_extract): Use GCRY_AC_FLAG_DEALLOC instead of
-       GCRY_AC_FLAG_COPY.
-
-       (_gcry_ac_io_init_va, _gcry_ac_io_init, gcry_ac_io_init)
-       (gcry_ac_io_init_va, _gcry_ac_io_write, _gcry_ac_io_read)
-       (_gcry_ac_io_read_all, _gcry_ac_io_process): New functions.
-       (gry_ac_em_dencode_t): Use gcry_ac_io_t in prototype instead of
-       memroy strings directly; adjust encode/decode functions to use io
-       objects.
-       (emsa_pkcs_v1_5_encode_data_cb): New function ...
-       (emsa_pkcs_v1_5_encode): ... use it here.
-       (ac_data_dencode): Use io objects.
-       (_gcry_ac_data_encode, _gcry_ac_data_decode, gcry_ac_data_encode)
-       (gcry_ac_data_decode): Likewise.
-       (_gcry_ac_data_encrypt_scheme, gcry_ac_data_encrypt_scheme)
-       (_gcry_ac_data_decrypt_scheme, gcry_ac_data_decrypt_scheme)
-       (_gcry_ac_data_sign_scheme, gcry_ac_data_sign_scheme)
-       (_gcry_ac_data_verify_scheme, gcry_ac_data_verify_scheme):
-       Likewise.
-
-2005-03-23  Werner Koch  <wk@g10code.com>
-
-       * rndw32.c (_gcry_rndw32_gather_random_fast): While adding data
-       use the size of the object and not the one of its address.  Bug
-       reported by Sascha Kiefer.
-
-2005-03-19  Moritz Schulte  <moritz@g10code.com>
-
-       * cipher.c (do_cbc_encrypt): Be careful to not overwrite data,
-       which is to be used later on.  This happend, in case CTS is
-       enabled and OUTBUF is equal to INBUF.
-
-2005-02-25  Werner Koch  <wk@g10code.com>
-
-       * pubkey.c (gcry_pk_get_keygrip): Allow for shadowed-private-key.
-
-2005-02-13  Moritz Schulte  <moritz@g10code.com>
-
-       * serpent.c: Updated from 1.2 branch:
-
-       s/u32_t/u32/ and s/byte_t/byte/.  Too match what we have always
-       used and are using in all other files too
-       (serpent_test): Moved prototype out of a fucntion.
-
-2005-02-07  Moritz Schulte  <moritz@g10code.com>
-
-       * ac.c: Major parts rewritten.
-       * pubkey.c (_gcry_pk_get_elements): New function.
-
-2004-12-09  Werner Koch  <wk@g10code.com>
-
-       * serpent.c (serpent_setkey): Moved prototype of serpent_test to
-       outer scope.
-
-2004-09-11  Moritz Schulte  <moritz@g10code.com>
-
-       * pubkey.c (pubkey_table): Added an alias entry for GCRY_PK_ELG_E.
-
-2004-08-23  Moritz Schulte  <moritz@g10code.com>
-
-       * ac.c: Do not include <assert.h>.
-       * rndegd.c: Likewise.
-       * sha1.c: Likewise.
-       * rndunix.c: Likewise.
-       * rndlinux.c: Likewise.
-       * rmd160.c: Likewise.
-       * md5.c: Likewise.
-       * md4.c: Likewise.
-       * cipher.c: Likewise.
-       * crc.c: Likewise.
-       * blowfish.c: Likewise.
-
-       * pubkey.c (dummy_generate, dummy_check_secret_key)
-       (dummy_encrypt, dummy_decrypt, dummy_sign, dummy_verify): Return
-       err code GPG_ERR_NOT_IMPLEMENTED instead of aborting through
-       log_bug().
-       (dummy_get_nbits): Return 0 instead of aborting though log_bug().
-
-2004-08-19  Werner Koch  <wk@g10code.de>
-
-       * pubkey.c (sexp_data_to_mpi): Changed the zero random byte
-       substituting code to actually do clever things.  Thanks to
-       Matthias Urlichs for noting the implementation problem.
-
-2004-08-09  Moritz Schulte  <moritz@g10code.com>
-
-       * pubkey.c (gcry_pk_sign): Fixed memory leak; fix provided by
-       Modestas Vainius.
-
-2004-07-16  Werner Koch  <wk@gnupg.org>
-
-       * rijndael.c (do_encrypt): Fix alignment problem.  Bugs found by
-       Matthias Urlichs.
-       (do_decrypt): Ditto.
-       (keySched, keySched2): Use 2 macros along with unions in the key
-       schedule context.
-
-2004-07-14  Moritz Schulte  <moritz@g10code.com>
-
-       * rsa.c (_gcry_rsa_decrypt): Don't forget to free "a".  Thanks to
-       Nikos Mavroyanopoulos.
-
-2004-05-09  Werner Koch  <wk@gnupg.org>
-
-       * random.c (read_pool): Mix the PID in to better protect after a
-       fork.
-
-2004-07-04  Moritz Schulte  <moritz@g10code.com>
-
-       * serpent.c: Use "u32_t" instead of "unsigned long", do not
-       declare S-Box variables as "register".  Fixes failure on
-       OpenBSD/sparc64, reported by Nikolay Sturm.
-
-2004-05-07  Werner Koch  <wk@gnupg.org>
-
-       * random.c (initialize): Factored out some code to ..
-       (initialize_basics): .. new function.
-       (_gcry_random_initialize): Just call initialize_basics unless the
-       new arg FULL is set to TRUE.
-       (_gcry_fast_random_poll): Don't do anything unless the random
-       system has been really initialized.
-
-2004-05-07  Moritz Schulte  <moritz@g10code.de>
-
-       * ac.c (gcry_ac_open): Do not dereference NULL pointer.  Reported
-       by Umberto Salsi.
-
-2004-02-20  Werner Koch  <wk@gnupg.org>
-
-       * primegen.c (check_prime): New args CB_FUNC and CB_ARG; call them
-       at different stages.  Pass these arguments through all callers.
-
-2004-02-06  Werner Koch  <wk@gnupg.org>
-
-       * des.c: Add a new OID as used by pkcs#12.
-
-       * rfc2268.c: New. Taken from libgcrypt. 
-       * cipher.c: Setup the rfc2268 algorithm.
-
-2004-01-25  Moritz Schulte  <mo@g10code.com>
-
-       * primegen.c (prime_generate_internal): Do not forget to free
-       `q_factor'; fixed by Brieuc Jeunhomme.
-       (prime_generate_internal): Do not forget to free `prime'.
-
-2004-01-14  Moritz Schulte  <mo@g10code.com>
-
-       * ac.c (gcry_ac_data_set): New argument: flags; slightly
-       rewritten.
-       (gcry_ac_data_get_name, gcry_ac_data_get_index): Likewise.
-       (gcry_ac_key_pair_generate): New argument: misc_data; modified
-       order of arguments.
-       (gcry_ac_key_test): New argument: handle.
-       (gcry_ac_key_get_nbits, gcry_ac_key_get_grip): Likewise.
-       Use GCRY_AC_FLAG_NO_BLINDING instead of
-       GCRY_AC_DATA_FLAG_NO_BLINDING.
-       (gcry_ac_mpi): New member: flags.
-       (gcry_ac_data_search, gcry_ac_data_add): Removed functions.
-
-2003-12-22  Werner Koch  <wk@gnupg.org>
-
-       * primegen.c (is_prime): Release A2.
-
-2003-12-19  Werner Koch  <wk@gnupg.org>
-
-       * md.c: Moved a couple of functions down below the data structure
-       definitions.
-       (struct gcry_md_context): New field ACTUAL_HANDLE_SIZE.
-       (md_open): Set it here.
-       (strcut gcry_md_list): New field ACTUAL_STRUCT_SIZE.
-       (md_enable): Set it here.
-       (md_close): Wipe the context memory.
-       secure memory.
-       * cipher.c (struct gcry_cipher_handle): New field ACTUAL_HANDLE_SIZE.
-       (gcry_cipher_open): Set it here.
-       (gcry_cipher_close): Use it to always wipe out the handle data.
-
-       * ac.c (gcry_ac_open): Make sure HANDLE gets initialized even when
-       the function is not successful.
-       (gcry_ac_close): Allow a NULL handle.
-       (gcry_ac_key_destroy, gcry_ac_key_pair_destroy): Ditto.
-       (gcry_ac_key_get_grip): Return INV_OBJ on error.
-
-       * primegen.c (prime_generate_internal): Fixed error code for
-       failed malloc.  Replaced the !err if chain by gotos.
-       (gcry_prime_group_generator): Remove the extra sanity check.
-
-       * md.c: Minor code and comment cleanups.
-
-2003-12-16  Werner Koch  <wk@gnupg.org>
-
-       * primegen.c (gen_prime): Doc fix.  Thanks to Newton Hammet.
-
-2003-12-11  Werner Koch  <wk@gnupg.org>
-
-       * rndunix.c (slow_poll): Don't use #warning but #error.
-
-       * rndegd.c: Changed indentation.
-       (my_make_filename): Removd the var_arg cruft becuase we
-       don't need it here.  Changed caller.  
-
-       * rndlinux.c: Changed indentation.
-       (open_device): Remove the superfluous stat call and clarify
-       comment.
-
-       * rsa.c: Changed indentation.
-       (secret): Use the standard algorithm if p, q and u are not
-       available.
-       (rsa_blind, rsa_unblind): Renamed from _gcry_rsa_blind,
-       _gcry_rsa_unblind and moved more to the top.
-
-       * md4.c: Changed indentation.  Removed unnecessary casts.
-       * md5.c, rmd160.c, sha1.c, tiger.c: Ditto.
-       * rijndael.c, twofish.c: Ditto.
-       * serpent.c: Removed unnecessary casts.
-       * sha256.c, sha512.c: Ditto.
-
-2003-12-09  Werner Koch  <wk@gnupg.org>
-
-       * dsa.c: Unified indentation style.
-       * elgamal.c: Ditto. 
-       * des.c (des_key_schedule): Code beautifications.
-       * blowfish.c: Changed indentation style.
-       * cast5.c (do_cast_setkey): Ditto.
-
-       * pubkey.c (gcry_pk_encrypt): Replaced the chain of if(!err) tests
-       by straightforward gotos. Other cleanups.
-       (gcry_pk_decrypt): Ditto.
-       (gcry_pk_sign): Ditto.
-       (gcry_pk_verify): Ditto.
-       (gcry_pk_genkey): Ditto.  Use strtoul instead of strtol.
-       (gcry_pk_ctl): Use GPG_ERR_INV_ARG to indicate bad arguments.
-
-2003-12-07  Werner Koch  <wk@gnupg.org>
-
-       * pubkey.c (gcry_pk_register_default): Undef the helper macro.
-       (gcry_pk_map_name): Allow NULL for string.
-       (sexp_to_key): Use memcpy and not strncpy.  Use gcry_free and not
-       free.
-       (sexp_to_sig): Ditto.
-       (sexp_to_enc): Ditto.  Replaced the chain of if(!err) tests by
-       straightforward gotos.
-
-2003-12-05  Werner Koch  <wk@gnupg.org>
-
-       * cipher.c: Documentation cleanups.
-       (gcry_cipher_mode_from_oid): Allow NULL for STRING.
-
-2003-12-03  Werner Koch  <wk@gnupg.org>
-
-       * elgamal.c (sign, do_encrypt, gen_k): Make sure that a small K is
-       only used for encryption.
-
-2003-11-18  Werner Koch  <wk@gnupg.org>
-
-       * random.h (rndw32_set_dll_name): Removed unused prototype.
-
-       * Makefile.am (EXTRA_DIST): Added Manifest.
-
-2003-11-11  Werner Koch  <wk@gnupg.org>
-
-       * Manifest: New.
-
-2003-11-04  Werner Koch  <wk@gnupg.org>
-
-       * md.c (gcry_md_hash_buffer): Use shortcut for SHA1
-       * sha1.c (_gcry_sha1_hash_buffer): New.
-
-       * random.c: Reformatted most functions.
-       (mix_pool): Moved the failsafe_digest from global
-       scope to here.
-       (do_fast_random_poll): Use the generic fucntions even if a fast
-       gathering function has been used.
-       (read_pool): Detect a fork and retry.
-       (gcry_randomize, get_random_bytes): Don't distinguish anymore
-       between weak and strong random.
-       (gcry_create_nonce): New.
-
-2003-10-31  Werner Koch  <wk@gnupg.org>
-
-       * rndw32.c (slow_gatherer_windowsNT): Use a plain buffer for the
-       disk performance values and not the W32 API structure.
-
-       * dsa.c (verify): s/exp/ex/ due to shadowing of a builtin.
-       * elgamal.c (verify): Ditto.
-
-       * ac.c (gcry_ac_data_get_index): s/index/idx/
-       (gcry_ac_data_copy_internal): Remove the cast in _gcry_malloc.
-       (gcry_ac_data_add): Must use gcry_realloc instead of realloc.
-       * pubkey.c (sexp_elements_extract): s/index/idx/ as tribute to the
-       forehackers.
-       (gcry_pk_encrypt): Removed shadowed definition of I. Reordered
-       arguments to malloc for clarity.
-       (gcry_pk_sign, gcry_pk_genkey): Ditto.
-       * primegen.c (prime_generate_internal): s/random/randomlevel/.
-
-2003-10-27  Moritz Schulte  <mo@g10code.com>
-
-       * pubkey.c (gcry_pk_encrypt): Don't forget to deallocate pkey.
-
-2003-10-27  Werner Koch  <wk@gnupg.org>
-
-       * random.c (gcry_random_add_bytes): Return if buflen is zero to
-       avoid gcc warning about unsed parameter.
-       (MASK_LEVEL): Simplified; does now work for signed and unsigned
-       w/o warnings.
-
-       * md.c (md_start_debug): Removed the const from SUFFIX, because
-       this function is called from the control fucntion which does not
-       require const.
-
-       Prefixed all (pubkey,digest,cipher}_spec_* globale variables with
-       _gcry_.
-
-       * ac.c (ac_key_identifiers): Made static.
-
-       * random.c (getfnc_gather_random,getfnc_fast_random_poll): Move
-       prototypes to ..
-       * rand-internal.h: .. here 
-       * random.c (getfnc_gather_random): Include rndw32 gatherer.
-       * rndunix.c, rndw32.c, rndegd.c: Include them here.
-       * rndlinux.c (_gcry_rndlinux_gather_random): Prepend the _gcry_
-       prefix.  Changed all callers.
-       * rndegd.c (_gcry_rndegd_gather_random): Likewise.
-       (_gcry_rndegd_connect_socket): Likewise.
-       * rndunix.c (_gcry_rndunix_gather_random): Likewise.
-       (waitpid): Made static.
-       * rndw32.c: Removed the old and unused winseed.dll cruft.
-       (_gcry_rndw32_gather_random_fast): Renamed from
-       gather_random_fast.
-       (_gcry_rndw32_gather_random): Renamed from gather_random.  Note,
-       that the changes 2003-04-08 somehow got lost.
-
-       * sha512.c (sha512_init, sha384_init): Made static.
-
-       * cipher.c (do_ctr_decrypt): Removed "return" from this void
-       function.
-
-2003-10-24  Moritz Schulte  <mo@g10code.com>
-
-       * serpent.c: Fix an issue on big-endian systems.
-
-       * rndw32.c: Removed IS_MODULE -cruft.
-       * rndlinux.c (rndlinux_gather_random): Likewise.
-
-2003-10-10  Werner Koch  <wk@gnupg.org>
-
-       * primegen.c (gen_prime): Bail out if NBITS is less than 16.
-       (prime_generate_internal): Initialize prime variable to suppress
-       compiler warning.  Check pbits, initialize qbits when passed as
-       zero.
-
-       * primegen.c (prime_generate_internal): New arg
-       ALL_FACTORS. Changed all callers.
-       (gcry_prime_generate): Make the factors arg optional. Request
-       all_factors.  Make sure PRIME is set to NULL even on error.
-       (gcry_prime_group_generator): New.
-       (gcry_prime_release_factors): New.
-
-2003-10-06  Werner Koch  <wk@gnupg.org>
-
-       * primegen.c (gen_prime): Assert that NBITS is never zero, it
-       would cause a segv.
-
-2003-09-28  Moritz Schulte  <mo@g10code.com>
-
-       * ac.c: Include "cipher.h".
-
-2003-09-27  Moritz Schulte  <mo@g10code.com>
-
-       * rndegd.c (do_read): Return nread instead of nbytes; thanks to
-       Michael Caerwyn.
-
-2003-09-04  Werner Koch  <wk@gnupg.org>
-
-       * pubkey.c (_gcry_pk_aliased_algo_name): New.
-       * ac.c (gcry_ac_open): Use it here.
-
-       * Makefile.am (EXTRA_libcipher_la_SOURCES): Add serpent.c
-
-2003-09-02  Moritz Schulte  <mo@g10code.com>
-
-       * primegen.c (gcry_prime_check, gcry_prime_generate): New
-       functions.
-       (prime_generate_internal): New function, based on
-       _gcry_generate_elg_prime.
-       (_gcry_generate_elg_prime): Rewritten as a wrapper for
-       prime_generate_internal.
-
-2003-08-28  Werner Koch  <wk@gnupg.org>
-
-       * pubkey.c (gcry_pk_encrypt): Don't include the flags list in the
-       return value.  This does not make sense and breaks any programs
-       parsing the output strictly (e.g. current gpgsm).
-       (gcry_pk_encrypt): If aliases for the algorithm name exists, take
-       the first one instead of the regular name to adhere to SPKI
-       conventions.
-       (gcry_pk_genkey): Ditto.
-       (gcry_pk_sign): Ditto. Removed unused KEY_ALGO_NAME.
-
-2003-08-19  Moritz Schulte  <mo@g10code.com>
-
-       * cipher.c: Add support for Serpent
-       * serpent.c: New file.
-
-2003-08-10  Moritz Schulte  <moritz@g10code.com>
-
-       * rsa.c (_gcry_rsa_blind, _gcry_rsa_unblind): Declare static.
-
-2003-08-09  Timo Schulz  <twoaday@freakmail.de>
-
-       * random.c (getfnc_gather_random): Don't check NAME_OF_DEV_RANDOM
-       two times, but also the NAME_OF_DEV_URANDOM device.
-       
-2003-08-08  Moritz Schulte  <moritz@g10code.com>
-
-       * pubkey.c (sexp_to_enc): Fixed extraction of S-Expression: do not
-       fail if no `flags' sub S-Expression is found.
-
-2003-07-27  Werner Koch  <wk@gnupg.org>
-
-       * md.c (gcry_md_lookup_func_oid): Allow for empty OID lists.
-
-2003-07-23  Moritz Schulte  <moritz@g10code.com>
-
-       * ac.c (gcry_ac_data_construct): New argument: include_flags, only
-       include `flags' S-expression, if include_flags is true.  Adjust
-       callers.  Thanks for triggering a bug caused by `flags'
-       sub-S-expression where they are not expected to Ralf Schneider.
-
-2003-07-21  Moritz Schulte  <moritz@g10code.com>
-
-       * pubkey.c (gcry_pk_lookup_func_name): Use new member name
-       `aliases' instead of `sexp_names'.
-
-       * ac.c (gcry_ac_key_data_get): New function.
-
-       * cipher.c (gcry_cipher_lookup_func_name): Fix return value.
-
-2003-07-20  Moritz Schulte  <moritz@g10code.com>
-
-       * blowfish.c: Adjusted for new gcry_cipher_spec_t structure.
-       * cast5.c: Likewise.
-       * twofish.c: Likewise.
-       * arcfour.c: Likewise.
-       * rijndael.c (rijndael_oids, rijndael192_oids, rijndael256_oids):
-       New variables, adjust for new gcry_cipher_spec_t structure.
-       * des.c (oids_tripledes): New variable, adjust for new
-       gcry_cipher_spec_t structure.
-
-       * md.c (oid_table): Removed.
-
-       * tiger.c (oid_spec_tiger): New variable.
-       (digest_spec_tiger): Adjusted for new gry_md_spec_t structure.
-
-       * sha512.c (oid_spec_sha512): New variable.
-       (digest_spec_sha512): Adjusted for new gry_md_spec_t structure.
-
-       * sha512.c (oid_spec_sha384): New variable.
-       (digest_spec_sha384): Adjusted for new gry_md_spec_t structure.
-
-       * sha256.c (oid_spec_sha256): New variable.
-       (digest_spec_sha256): Adjusted for new gry_md_spec_t structure.
-
-       * sha1.c (oid_spec_sha1): New variable.
-       (digest_spec_sha1): Adjusted for new gry_md_spec_t structure.
-
-       * rmd160.c (oid_spec_rmd160): New variable.
-       (digest_spec_rnd160): Adjusted for new gry_md_spec_t structure.
-
-       * md5.c (oid_spec_md5): New variable.
-       (digest_spec_md5): Adjusted for new gry_md_spec_t structure.
-
-       * md4.c (oid_spec_md4): New variable.
-       (digest_spec_md4): Adjusted for new gry_md_spec_t structure.
-
-       * crc.c (digest_spec_crc32, digest_spec_crc32_rfc1510,
-       digest_spec_crc32_rfc2440): Adjusted for new gry_md_spec_t
-       structure.
-
-2003-07-19  Moritz Schulte  <moritz@g10code.com>
-
-       * md.c (gcry_md_lookup_func_oid): New function.
-       (search_oid): New function, copied from cipher.c.
-       (gcry_md_map_name): Adjust for new search_oid_interface.
-
-       * cipher.c (oid_table): Removed table.
-       (gcry_cipher_lookup_func_oid): New function.
-       (search_oid): Rewritten to use the module functions.
-       (gcry_cipher_map_name): Adjust for new search_oid interface.
-       (gcry_cipher_mode_from_oid): Likewise.
-
-2003-07-18  Werner Koch  <wk@gnupg.org>
-
-       * md.c (gcry_md_hash_buffer): Convert ERR to gpg_error_t in
-       gpg_strerror.
-
-2003-07-14  Moritz Schulte  <moritz@g10code.com>
-
-       * cipher.c (gcry_cipher_lookup_func_name): Also check the cipher
-       name aliases, not just the primary name.
-       (gcry_cipher_map_name): Remove kludge for aliasing Rijndael to
-       AES.
-
-       * arcfour.c, blowfish.c, cast5.c, des.c, twofish.c: Adjust cipher
-       specification structures.
-
-       * rijndael.c (rijndael_names, rijndael192_names,
-       rijndael256_names): New variables, use them in the cipher
-       specifications.
-
-       * rmd160test.c: Removed file.
-
-       * ac.c, arcfour.c, blowfish.c, cast5.c, cipher.c, des.c, dsa.c,
-       elgamal.c, md.c, pubkey.c, random.c, rijndael.c, rsa.c, twofish.c:
-       Used gcry_err* wrappers for libgpg symbols.
-
-       * primegen.c (gen_prime): Correct the order arguments to
-       extra_check.
-
-2003-07-12  Moritz Schulte  <moritz@g10code.com>
-
-       * ac.c: Replaced all public occurences of gpg_error_t with
-       gcry_error_t.
-       * cipher.c: Likewise.
-       * md.c: Likewise.
-       * pubkey.c: Likewise.
-       * random.c: Likewise.
-
-       * cipher.c: Added support for TWOFISH128.
-
-2003-07-08  Moritz Schulte  <moritz@g10code.com>
-
-       * ac.c (gcry_ac_data_copy_internal): New function, based on
-       gcry_ac_data_copy.
-       (gcry_ac_data_copy): Made public, use gcry_ac_data_copy_internal.
-       (gcry_ac_key_init): Use gcry_ac_data_copy_internal.
-
-2003-07-07  Moritz Schulte  <moritz@g10code.com>
-
-       * ac.c (gcry_ac_data_set): Only release old MPI value if it is
-       different from the new value.  Bug reported by Simon Josefsson
-       <jas@extundo.com>.
-
-       * pubkey.c (gcry_pk_list): New function.
-       * md.c (gcry_md_list): New function.
-
-       * ac.c (gcry_ac_key_pair_generate): Fix calculation of format
-       string size.
-
-2003-07-05  Moritz Schulte  <moritz@g10code.com>
-
-       * md.c: Named struct of digest_table `digest_table_entry'.
-       (digest_table_entry): New member: algorithm; filled in.
-       (digest_table_entry): Removed unused member: flags.
-       (gcry_md_register): New argument: algorithm_id, filled in.
-       (gcry_md_register_default): Used algorithm ID from module
-       structure.
-       (gcry_md_map_name): Likewise.
-       (md_enable): Likewise.
-       (md_read): Likewise.
-       (gcry_md_info): Likewise.
-
-       * pubkey.c: Named truct for pubkey_table `pubkey_table_entry'.
-       (pubkey_table_entry): New member: algorithm; filled in.
-       (gcry_pk_register_default): Used algorithm ID from pubkey_table.
-       (gcry_pk_register): New argument: algorithm_id, filled in.
-       (gcry_pk_map_name): Used algorithm ID from module structure.
-       (gcry_pk_decrypt): Likewise.
-       (gcry_pk_encrypt): Likewise.
-       (gcry_pk_verify): Likewise.
-       (gcry_pk_sign): Likewise.
-       (gcry_pk_testkey): Likewise.
-       (gcry_pk_genkey): Likewise.
-       (gcry_pk_get_nbits): Likewise.
-       (sexp_to_key): Removed unused variable: algo.
-       (sexp_to_sig): Likewise.
-
-       * cipher.c: Named struct for cipher_table `cipher_table_entry'.
-       (cipher_table_entry): New member: algorithm; filled in.
-       (gcry_cipher_register_default): Used algorithm ID from
-       cipher_table.
-       (gcry_cipher_register): New argument: algorithm_id, filled in.
-       (gcry_cipher_map_name): Used algorithm ID from module structure.
-
-       * arcfour.c (cipher_spec_arcfour): Removed algorithm ID.
-       * blowfish.c (cipher_spec_blowfish): Likewise.
-       * cast5.c (cipher_spec_cast5): Likewise.
-       * crc.c (digest_spec_crc32): Likewise.
-       * crc.c (digest_spec_crc32_rfc1510): Likewise.
-       * crc.c (digest_spec_crc32_rfc2440): Likewise.
-       * des.c (cipher_spec_des): Likewise.
-       * des.c (cipher_spec_tripledes): Likewise.
-       * dsa.c (pubkey_spec_dsa): Likewise.
-       * elgamal.c (pubkey_spec_elg): Likewise.
-       * md4.c (digest_spec_md4): Likewise.
-       * md5.c (digest_spec_md5): Likewise.
-       * aes.c (cipher_spec_aes): Likewise.
-       * aes.c (cipher_spec_aes192): Likewise.
-       * aes.c (cipher_spec_aes256): Likewise.
-       * rsa.c (pubkey_spec_rsa): Likewise.
-       * sha1.c (digest_spec_sha1): Likewise.
-       * sha256.c (digest_spec_sha256): Likewise.
-       * sha512.c (digest_spec_sha512): Likewise.
-       * tiger.c (digest_spec_tiger): Likewise.
-       * twofish.c (cipher_spec_twofish): Likewise.
-       * twofish.c (cipher_spec_twofish128): Likewise.
-
-       * Makefile.am (EXTRA_libcipher_la_SOURCES): Fix list of source
-       files; reported by Simon Josefsson <jas@extundo.com>.
-
-       * pubkey.c: Replaced all occurences of `id' with `algorithm',
-       since `id' is a keyword in obj-c.
-       * md.c: Likewise.
-       * cipher.c: Likewise.
-
-       * crc.c, md4.c, md5.c, rmd160.c, sha1.c, sha256.c, tiger.c:
-       Replaced all occurences of gcry_digest_spec_t with gcry_md_spec_t.
-
-       * dsa.c, rsa.c, elgamal.c: Replaced all occurencens of
-       gcry_pubkey_spec_t with gcry_pk_spec_t.
-
-       * md.c: Replaced all occurences of gcry_digest_spec_t with
-       gcry_md_spec_t.
-       (gcry_digest_register_default): Renamed to ...
-       (gcry_md_register_default): ... this; adjusted callers.
-       (gcry_digest_lookup_func_name): Renamed to ...
-       (gcry_md_lookup_func_name): ... this; adjusted callers.
-       (gcry_digest_lookup_name): Renamed to ...
-       (gcry_md_lookup_name): ... this; adjusted callers.
-       (gcry_digest_register): Renamed to ...
-       (gcry_md_register): ... this.
-       (gcry_digest_unregister): Renamed to ...
-       (gcry_md_unregister): ... this.
-
-       * pubkey.c (gcry_pubkey_register): Renamed to ...
-       (gcry_pk_register): ... this.
-       (gcry_pubkey_unregister): Renamed to ...
-       (gcry_pk_unregister): ... this.
-       Replaced all occurences of gcry_pubkey_spec_t with gcry_pk_spec_t.
-       (gcry_pubkey_register_default): Renamed to ...
-       (gcry_pk_register_default): ... this; adjusted callers.
-       (gcry_pubkey_lookup_func_name): Renamed to ...
-       (gcry_pk_lookup_func_name): ... this; adjusted callers.
-       (gcry_pubkey_lookup_name): Renamed to ...
-       (gcry_pk_lookup_name): ... this; adjusted callers.
-
-       * md.c (gcry_md_hash_buffer): Fix error checking.  Thanks to Simon
-       Josefsson <jas@extunde.com>.
-
-2003-07-04  Moritz Schulte  <moritz@g10code.com>
-
-       * cipher.c (gcry_cipher_list): New function.
-
-2003-07-01  Moritz Schulte  <moritz@g10code.com>
-
-       * pubkey.c (sexp_to_sig): Accept a `flags' S-expression to be more
-       consistent with sexp_to_enc.
-
-2003-06-30  Moritz Schulte  <moritz@g10code.com>
-
-       * Makefile.am (libcipher_la_SOURCES): Added: ac.c.
-
-       * pubkey.c (_gcry_pk_module_lookup): New function.
-       (_gcry_pk_module_release): New function.
-
-2003-06-29  Moritz Schulte  <moritz@g10code.com>
-
-       * ac.c: New file.
-
-2003-06-26  Werner Koch  <wk@gnupg.org>
-
-       * md.c (gcry_md_hash_buffer): Trigger BUG correcly with new API.
-
-2003-06-19  Werner Koch  <wk@gnupg.org>
-
-       * md.c (gcry_md_is_enabled): Fixed. 
-
-2003-06-18  Werner Koch  <wk@gnupg.org>
-
-       * cipher.c (gcry_cipher_get_algo_keylen): New.
-       (gcry_cipher_get_algo_blklen): New.
-
-2003-06-18  Moritz Schulte  <moritz@g10code.com>
-
-       * arcfour.c, cipher.c, blowfish.c, md.c, cast5.c, pubkey.c, crc.c,
-       des.c, dsa.c, elgamal.c, md4.c, md5.c, random.c, rijndael.c,
-       rmd160.c, rsa.c, sha1.c, sha256.c, sha512.c, tiger.c, twofish.c:
-       Replaced older types GcryDigestSpec, GcryCipherSpec and
-       GcryPubkeySpec with newer types: gcry_digest_spec_t,
-       gcry_cipher_spec_t and gcry_pubkey_spec_t.
-
-       * md.c (gcry_digest_id_new): Removed function.
-       (gcry_digest_register): Removed code for generating a new module
-       ID.
-
-       * pubkey.c (gcry_pubkey_id_new): Removed function.
-       (gcry_pubkey_register): Removed code for generating a new module
-       ID.
-
-       * cipher.c, md.c, pubkey.c: Replace old type GcryModule with newer
-       one: gcry_module_t.
-       (gcry_cipher_id_new): Removed function.
-       (gcry_cipher_register): Removed code for generating a new module
-       ID.
-
-       * cipher.c (gcry_cipher_register): Adjust call to
-       _gcry_module_add.
-       (gcry_cipher_register_default): Likewise.
-       * pubkey.c (gcry_pubkey_register_default): Likewise.
-       (gcry_pubkey_register): Likewise.
-       * md.c (gcry_digest_register_default): Likewise.
-       (gcry_digest_register): Likewise.
-
-       * md.c (gcry_digest_lookup_func_id): Removed function.
-       (gcry_digest_lookup_id): Likewise.
-       (gcry_digest_id_new): Use _gcry_module_lookup_id instead of
-       gcry_digest_lookup_id.
-       (digest_algo_to_string): Likewise.
-       (check_digest_algo): Likewise.
-       (md_enable): Likewise.
-       (md_digest_length): Likewise.
-       (md_asn_oid): Likewise.
-
-       * pubkey.c (gcry_pubkey_lookup_id): Removed function.
-       (gcry_pubkey_lookup_func_id): Likewise.
-       (gcry_pubkey_id_new): Use _gcry_module_lookup_id instead of
-       gcry_pubkey_id_new.
-       (gcry_pk_algo_name): Likewise.
-       (disable_pubkey_algo): Likewise.
-       (check_pubkey_algo): Likewise.
-       (pubkey_get_npkey): Likewise.
-       (pubkey_get_nskey): Likewise.
-       (pubkey_get_nsig): Likewise.
-       (pubkey_get_nenc): Likewise.
-       (pubkey_generate): Likewise.
-       (pubkey_check_secret_key): Likewise.
-       (pubkey_encrypt): Likewise.
-       (pubkey_decrypt): Likewise.
-       (pubkey_sign): Likewise.
-       (pubkey_verify): Likewise.
-       (gcry_pk_algo_info): Likewise.
-
-       * cipher.c (gcry_cipher_lookup_func_id): Removed function.
-       (gcry_cipher_lookup_id): Likewise.
-       (cipher_algo_to_string): use _gcry_module_lookup_id instead of
-       gcry_cipher_lookup_id.
-       (disable_cipher_algo): Likewise.
-       (check_cipher_algo): Likewise.
-       (cipher_get_blocksize): Likewise.
-       (gcry_cipher_open): Likewise.
-       (gcry_cipher_id_new): Likewise.
-
-2003-06-17  Moritz Schulte  <moritz@g10code.com>
-
-       * Makefile.am (GCRYPT_MODULES): Set to @GCRYPT_CIPHERS@,
-       @GCRYPT_PUBKEY_CIPHERS@, @GCRYPT_DIGESTS@ and @GCRYPT_RANDOM@.
-       (libcipher_la_DEPENDENCIES): Set to $(GCRYPT_MODULES).
-       (libcipher_la_LIBADD): Likewise.
-       (AM_CFLAGS): Added: @GPG_ERROR_CFLAGS@.
-       (EXTRA_libcipher_la_SOURCES): Added all conditional sources.
-
-       * md.c (md_open): Use _gcry_fast_random_poll instead of
-       fast_random_poll.
-       * cipher.c (gcry_cipher_open): Likewise.
-
-       * random.h (fast_random_poll): Removed macro.
-
-       * blowfish.c, md4.c, md5.c, rmd160.c, sha1.c, sha256.c, sha512.c,
-       tiger.c: Use Autoconf's WORDS_BIGENDIAN instead of our own
-       BIG_ENDIAN_HOST.
-
-2003-06-16  Moritz Schulte  <moritz@g10code.com>
-
-       * random.c (getfnc_gather_random): Do not special-case
-       USE_ALL_RANDOM_MODULES, make it the default.
-
-       * dsa.c: Replace last occurences of old type names with newer
-       names (i.e. replace MPI with gcry_mpi_t).
-       * elgamal.c: Likewise.
-       * primegen.c: Likewise.
-       * pubkey.c: Likewise.
-       * rsa.c: Likewise.
-
-2003-06-14  Moritz Schulte  <moritz@g10code.com>
-
-       * des.c (des_setkey): Add selftest check.
-       (tripledes_set3keys): Likewise.
-       (do_tripledes_setkey): Remove selftest check.
-       (do_des_setkey): Likewise.
-
-2003-06-11  Moritz Schulte  <moritz@g10code.com>
-
-       * md.c (_gcry_md_init): New function.
-       * cipher.c (_gcry_cipher_init): New function.
-       * pubkey.c (_gcry_pk_init): New function.
-
-2003-06-13  Werner Koch  <wk@gnupg.org>
-
-       * md.c (gcry_md_get_algo): Reverted to old API.  This is a
-       convenience function anyway and error checking is not approriate.
-       (gcry_md_is_secure): New.
-       (gcry_md_is_enabled): New.
-
-2003-06-12  Werner Koch  <wk@gnupg.org>
-
-       * cipher.c (gcry_cipher_open): Make sure HANDLE is set to NULL on
-       error.
-
-2003-06-11  Werner Koch  <wk@gnupg.org>
-
-       * md.c (gcry_md_open): Make sure H receives either NULL or an
-       valid handle.
-       (gcry_md_copy): Swapped arguments so that it is more in lione with
-       md_open and most other API fucntions like memcpy (destination
-       comes first).  Make sure HANDLE is set to NULL on error.
-       
-       * rijndael.c (do_encrypt): Hack to force correct alignment.  It
-       seems not to be not sufficient, though.  We should rework this
-       fucntions and remove all these ugly casts.  Let the compiler
-       optimize or have an assembler implementation.
-
-2003-06-09  Moritz Schulte  <moritz@g10code.com>
-
-       * Makefile.am: Removed rules serpent, since that is not commited
-       yet.
-
-2003-06-08  Moritz Schulte  <moritz@g10code.com>
-
-       * pubkey.c (gcry_pk_encrypt): Improve calculation for size of the
-       format string.
-
-2003-06-07  Moritz Schulte  <moritz@g10code.com>
-
-       * arcfour.c, bithelp.h, blowfish.c, cast5.c, cipher.c, crc.c,
-       des.c, dsa.c, elgamal.c, md4.c, md5.c, md.c, primegen.c, pubkey.c,
-       rand-internal.h, random.c, random.h, rijndael.c, rmd160.c,
-       rmd160test.c, rmd.h, rndeged.c, rndlinux.c, rndunix.c, rndw32.c,
-       rsa.c, sha1.c, sha256.c, sha512.c, tiger.c, twofish.c: Edited all
-       preprocessor instructions to remove whitespace before the '#'.
-       This is not required by C89, but there are some compilers out
-       there that don't like it.  Replaced any occurence of the now
-       deprecated type names with the new ones.
-       
-2003-06-04  Moritz Schulte  <moritz@g10code.com>
-
-       * pubkey.c (gcry_pk_encrypt): Construct an arg_list and use
-       gcry_sexp_build_array instead of gcry_sexp_build.
-       (gcry_pk_sign): Likewise.
-       (gcry_pk_genkey): Likewise.
-
-2003-06-01  Moritz Schulte  <moritz@g10code.com>
-
-       * dsa.c (_gcry_dsa_generate): Do not check wether the algorithm ID
-       does indeed belong to DSA.
-       (_gcry_dsa_sign): Likewise.
-       (_gcry_dsa_verify): Likewise.
-       (_gcry_dsa_get_nbits): Likewise.
-
-       * elgamal.c (_gcry_elg_check_secret_key): Do not check wether the
-       algorithm ID does indeed belong to ElGamal.
-       (_gcry_elg_encrypt): Likewise.
-       (_gcry_elg_decrypt): Likewise.
-       (_gcry_elg_sign): Likewise.
-       (_gcry_elg_verify): Likewise.
-       (_gcry_elg_get_nbits): Likewise.
-       (_gcry_elg_generate): Likewise.
-
-       * rsa.c (_gcry_rsa_generate): Do not check wether the algorithm ID
-       does indeed belong to RSA.
-       (_gcry_rsa_encrypt): Likewise.
-       (_gcry_rsa_decrypt): Likewise.
-       (_gcry_rsa_sign): Likewise.
-       (_gcry_rsa_verify): Likewise.
-       (_gcry_rsa_get_nbits): Likewise.
-
-2003-05-30  Moritz Schulte  <moritz@g10code.com>
-
-       * md.c (md_get_algo): Return zero in case to algorithm is enabled.
-
-       * md.c (gcry_md_info): Adjusted for new no-errno-API.
-       (md_final): Likewise.
-       (gcry_md_get_algo): Likewise.
-       * pubkey.c (gcry_pk_get_keygrip): Likewise.
-       (gcry_pk_ctl): Likewise.
-       (gcry_pk_algo_info): Likewise.
-       * des.c (selftest): Likewise.
-
-2003-05-29  Moritz Schulte  <moritz@g10code.com>
-
-       * md.c (md_enable): Do not forget to release module on error.
-       (gcry_md_open): Adjusted for new no-errno-API.
-       (md_open): Likewise.
-       (md_copy): Likewise.
-       (gcry_md_copy): Likewise.
-       (gcry_md_setkey): Likewise.
-       (gcry_md_algo_info): Likewise.
-
-       * cipher.c (gcry_cipher_open): Adjusted for new no-errno-API and
-       also fixed a locking bug.
-       (gcry_cipher_encrypt): Adjusted for new no-errno-API.
-       (gcry_cipher_decrypt): Likewise.
-       (gcry_cipher_ctl): Likewise.
-       (gcry_cipher_info): Likewise.
-       (gcry_cipher_algo_info): Likewise.
-
-2003-05-28  Moritz Schulte  <moritz@g10code.com>
-
-       * md.c (md_enable): Adjusted for libgpg-error.
-       (gcry_md_enable): Likewise.
-       (gcry_digest_register_default): Likewise.
-       (gcry_digest_register): Likewise.
-       (check_digest_algo): Likewise.
-       (prepare_macpads): Likewise.
-       (gcry_md_setkey): Likewise.
-       (gcry_md_ctl): Likewise.
-       (gcry_md_get): Likewise.
-       (gcry_md_algo_info): Likewise.
-       (gcry_md_info): Likewise.
-       * dsa.c (_gcry_dsa_generate): Likewise.
-       (_gcry_dsa_check_secret_key): Likewise.
-       (_gcry_dsa_sign): Likewie.
-       (_gcry_dsa_verify): Likewise.
-       * twofish.c (do_twofish_setkey): Likewise.
-       (twofish_setkey): Likewise.
-       * cipher.c (gcry_cipher_register): Likewise.
-
-2003-05-25  Moritz Schulte  <moritz@g10code.com>
-
-       * rijndael.c (do_setkey): Adjusted for libgpg-error.
-       (rijndael_setkey): Likewise.
-       * random.c (gcry_random_add_bytes): Likewise.
-       * elgamal.c (_gcry_elg_generate): Likewise.
-       (_gcry_elg_check_secret_key): Likewise.
-       (_gcry_elg_encrypt): Likewise.
-       (_gcry_elg_decrypt): Likewise.
-       (_gcry_elg_sign): Likewise.
-       (_gcry_elg_verify): Likewise.
-       * rsa.c (_gcry_rsa_generate): Likewise.
-       (_gcry_rsa_check_secret_key): Likewise.
-       (_gcry_rsa_encrypt): Likewise.
-       (_gcry_rsa_decrypt): Likewise.
-       (_gcry_rsa_sign): Likewise.
-       (_gcry_rsa_verify): Likewise.
-       * pubkey.c (dummy_generate, dummy_check_secret_key, dummy_encrypt,
-       dummy_decrypt, dummy_sign, dummy_verify): Likewise.
-       (gcry_pubkey_register): Likewise.
-       (check_pubkey_algo): Likewise.
-       (pubkey_generate): Likewise.
-       (pubkey_check_secret_key): Likewise.
-       (pubkey_encrypt): Likewise.
-       (pubkey_decrypt): Likewise.
-       (pubkey_sign): Likewise.
-       (pubkey_verify): Likewise.
-       (sexp_elements_extract): Likewise.
-       (sexp_to_key): Likewise.
-       (sexp_to_sig): Likewise.
-       (sexp_to_enc): Likewise.
-       (sexp_data_to_mpi): Likewise.
-       (gcry_pk_encrypt): Likewise.
-       (gcry_pk_decrypt): Likewise.
-       (gcry_pk_sign): Likewise.
-       (gcry_pk_verify): Likewise.
-       (gcry_pk_testkey): Likewise.
-       (gcry_pk_genkey): Likewise.
-       (gcry_pk_ctl): Likewise.
-       * cipher.c (dummy_setkey): Likewise.
-       (check_cipher_algo): Likewise.
-       (gcry_cipher_open): Likewise.
-       (cipher_setkey): Likewise.
-       (gcry_cipher_ctl): Likewise.
-       (cipher_encrypt): Likewise.
-       (gcry_cipher_encrypt): Likewise.
-       (cipher_decrypt): Likewise.
-       (gcry_cipher_decrypt): Likewise.
-       (gcry_cipher_info): Likewise.
-       (gcry_cipher_algo_info): Likewise.
-       * cast5.c (cast_setkey): Likewise.
-       (do_cast_setkey): Likewise.
-       * arcfour.c (arcfour_setkey): Likewise.
-       (do_arcfour_setkey): Likewise.
-       * blowfish.c (do_bf_setkey): Likewise.
-       (bf_setkey): Likewise.
-       * des.c (do_des_setkey): Likewise.
-       (do_tripledes_setkey): Likewise.
-
-2003-05-22  Moritz Schulte  <moritz@g10code.com>
-
-       * tiger.c: Merged code ussing the U64_C macro from GnuPG.
-
-       * sha512.c: Likewise.
-
-2003-05-17  Moritz Schulte  <moritz@g10code.com>
-
-       * pubkey.c (gcry_pk_genkey): Fix type: acquire a lock, instead of
-       releasing it.
-
-2003-05-11  Moritz Schulte  <moritz@g10code.com>
-
-       * pubkey.c (gcry_pk_testkey): Call REGISTER_DEFAULT_CIPHERS.
-       (gcry_pk_ctl): Likewise.
-
-2003-04-27  Moritz Schulte  <moritz@g10code.com>
-
-       * pubkey.c (gcry_pk_genkey): Release sexp after extracted data has
-       been used.
-
-       * md.c (gcry_md_get_algo_dlen): Simplified, simply call
-       md_digest_length to do the job.
-
-       * des.c (do_des_setkey): Check for selftest failure not only
-       during initialization.
-       (do_tripledes_setkey): Include check for selftest failure.
-
-       * pubkey.c (gcry_pubkey_register_default): New macro
-       `pubkey_use_dummy', use it.
-
-       * elgamal.c (elg_names): New variable.
-       (pubkey_spec_elg): Include elg_names.
-
-       * dsa.c (dsa_names): New variable.
-       (pubkey_spec_dsa): Include dsa_names.
-
-       * rsa.c (rsa_names): New variable.
-       (pubkey_spec_rsa): Include rsa_names.
-
-       * pubkey.c (gcry_pubkey_lookup_func_name): Compare name also with
-       the names listed in `sexp_names'.
-
-2003-04-24  Moritz Schulte  <moritz@g10code.com>
-
-       * pubkey.c (sexp_to_key): New variables: module, pubkey.  Adjusted
-       to new module interface.
-       (sexp_to_key): Changend type of argument `retalgo' from `int *' to
-       `GcryModule **'.  Adjusted all callers.  Removed argument:
-       r_algotblidx.
-       (sexp_to_sig): Changend type of argument `retalgo' from `int *' to
-       `GcryModule **'.  Adjusted all callers.
-       (sexp_to_enc): Likewise.
-
-       (pubkey_get_npkey, pubkey_get_nskey, pubkey_get_nsig,
-       pubkey_get_nenc): Use strlen to find out the number.
-
-       * rsa.c: Adjust pubkey_spec_rsa to new internal interface.
-       * dsa.c: Likewise.
-       * elgamal.c: Likewise.
-
-2003-04-17  Moritz Schulte  <moritz@g10code.com>
-
-       * pubkey.c (sexp_elements_extract): New function.
-       * pubkey.c (sexp_to_key): Removed variable `idx', added `err', use
-       sexp_elements_extract.
-       (sexp_to_sig): Likewise.
-       (sexp_to_enc): Likewise.
-
-       * pubkey.c: Terminate list correctly.
-       * md.c: Include sha512/sha384 in digest_table.
-
-2003-04-16  Moritz Schulte  <moritz@g10code.com>
-
-       * Makefile.am: Include support for sha512.c.
-
-       * sha512.c: New file, merged from GnuPG, with few modifications
-       for libgcrypt.
-
-       * rand-internal.h: Removed declarations for constructor functions.
-
-       * md.c (md_copy): Call _gcry_module_use for incrementing the usage
-       counter of the digest modules.
-
-       * rsa.c: Do not include "rsa.h".
-       * dsa.c: Do not include "dsa.h".
-       * elgamal.c: Do not include "elgamal.h".
-       * des.c: Do not include "des.h".
-       * cast5.c: Do not include "cast5.h".
-       * blowfish.c: Do not include "blowfish.h".
-       * arcfour.c: Do not include "arcfour.h".
-
-       * Makefile.am (libcipher_la_DEPENDENCIES): Removed.
-       (libcipher_la_LIBADD): Removed.
-       Use Automake conditionals for conditional compilation.
-
-2003-04-13  Moritz Schulte  <moritz@g10code.com>
-
-       * cipher.c (gcry_cipher_open): Call REGISTER_DEFAULT_CIPHERS.
-
-       * md.c (gcry_md_list): New member: module.
-       (md_enable): New variable: module, changed use of module and
-       digest.
-       (md_enable): Initialize member: module.
-       (md_close): Call _gcry_module_release.
-
-       * cipher.c (gcry_cipher_open): New variable: module, changed use of
-       module and cipher.
-       (struct gcry_cipher_handle): New member: module.
-       (gcry_cipher_open): Initialize member: module.
-       (gcry_cipher_close): Call _gcry_module_release.
-
-2003-04-09  Moritz Schulte  <moritz@g10code.com>
-       
-       * cipher.c: Include "ath.h".
-       * md.c: Likewise.
-       * pubkey.c: Likewise.
-
-       * cipher.c (ciphers_registered_lock): New variable.
-       * md.c (digests_registered_lock): New variable.
-       * pubkey.c (pubkeys_registered_lock): New variable.
-
-       * rndlinux.c (gnupgext_version, func_table): Removed definitions.
-       (gnupgext_enum_func): Removed function.
-       (_gcry_rndlinux_constructor): Removed function.
-
-       * rndegd.c (gnupgext_version, func_table): Removed definitions.
-       (gnupgext_enum_func): Removed function.
-       (_gcry_rndegd_constructor): Removed function.
-
-       * rndunix.c (gnupgext_version, func_table): Removed definitions.
-       (gnupgext_enum_func): Removed function.
-       (_gcry_rndunix_constructor): Removed function.
-
-       * rndw32.c (gnupgext_version, func_table): Removed definitions.
-       (gnupgext_enum_func): Removed function.
-       (_gcry_rndw32_constructor): Removed function.
-
-       * rndegd.c (rndegd_connect_socket): Simplify code for creating the
-       egd socket address.
-       (rndegd_connect_socket): Call log_fatal use instead of
-       g10_log_fatal.
-       (egd_gather_random): Renamed to ...
-       (rndegd_gather_random): ... here.
-
-2003-04-08  Moritz Schulte  <moritz@g10code.com>
-
-       * rndlinux.c: Do not include "dynload.h".
-       * rndunix.c: Likewise.
-       * rndw32.c: Likewise.
-
-       * rndegd.c (rndegd_connect_socket): Factored out from ...
-       (egd_gather_random): here; call it.
-       (egd_socket): New variable.
-       (egd_gather_random): Initialize fd with egd_socket, do not declare
-       fd static.
-       (do_read): Merged few changes from GnuPG. FIXME - not finished?
-       Do not include "dynload.h".
-
-       * rndw32.c (gather_random): Renamed to rndw32_gather_random, do
-       not declare static.
-       (gather_random_fast): Renamed to rndw32_gather_random_fast, do not
-       declare static.
-
-       * rndunix.c (gather_random): Renamed to rndunix_gather_random, do
-       not declare static.
-       * rndegd.c (gather_random): Renamed to rndegd_gather_random, do
-       not declare static.
-       * rndlinux.c (gather_random): Renamed to rndlinux_gather_random,
-       do not declare static.
-
-2003-04-07  Moritz Schulte  <moritz@g10code.com>
-
-       * Makefile.am (libcipher_la_SOURCES): Removed construct.c.
-       (libcipher_la_SOURCES): Added sha1.c, sha256.c, rmd160.c, md4.c,
-       md5.c, tiger.c and crc.c
-       (EXTRA_PROGRAMS): Removed sha1, sha256, rmd160, md4, md5, tiger
-       and crc.  Removed definitions: EXTRA_md4_SOURCES,
-       EXTRA_md5_SOURCES, EXTRA_rmd160_SOURCES, EXTRA_sha1_SOURCES,
-       EXTRA_sha256_SOURCES, EXTRA_tiger_SOURCES and EXTRA_crc_SOURCES,
-       BUILT_SOURCES, DISTCLEANFILES.
-
-       * pubkey.c: Do not include "elgamal.h", "dsa.h" and "rsa.h".
-
-       * Makefile.am (libcipher_la_SOURCES): Removed rsa.h, elgamal.h,
-       dsa.h, des.h, cast5.h, arcfour.h and blowfish.h.
-
-       * rsa.h: Removed file.
-       * elgamal.h: Removed file.
-       * dsa.h: Removed file.
-       * des.h: Removed file.
-       * cast5.h: Removed file.
-       * arcfour.h: Removed file.
-       * blowfish.h: Removed file.
-
-       * Makefile.am (libcipher_la_SOURCES): Removed dynload.c and
-       dynload.h.
-
-       * rsa.c (pubkey_spec_rsa): New variable.
-       * dsa.c (pubkey_spec_rsa): New variable.
-       * elgamal.c (pubkey_spec_elg): New variable.
-       
-       * rsa.c (_gcry_rsa_get_info): Removed function.
-       * elgamal.c (_gcry_elg_get_info): Removed function.
-       * dsa.c (_gcry_dsa_get_info): Removed function.
-
-       * tiger.c (tiger_get_info): Removed function.
-       (gnupgext_version, func_table): Removed definitions.
-       (gnupgext_enum_func): Removed function.
-       (_gcry_tiger_constructor): Removed function.
-       
-       * sha1.c (sha1_get_info): Removed function.
-       (gnupgext_version, func_table): Removed definitions.
-       (gnupgext_enum_func): Removed function.
-       (_gcry_sha1_constructor): Removed function.
-
-       * sha256.c (sha256_get_info): Removed function.
-       (gnupgext_version, func_table): Removed definitions.
-       (gnupgext_enum_func): Removed function.
-       (_gcry_sha256_constructor): Removed function.
-
-       * rmd160.c (rmd160_get_info): Removed function.
-       (gnupgext_version, func_table): Removed definitions.
-       (gnupgext_enum_func): Removed function.
-       (_gcry_rmd160_constructor): Removed function.
-
-       * md5.c (md5_get_info): Removed function.
-       (gnupgext_version, func_table): Removed definitions.
-       (gnupgext_enum_func): Removed function.
-       (_gcry_md5_constructor): Removed function.
-
-       * md4.c (md4_get_info): Removed function.
-       (gnupgext_version, func_table): Removed definitions.
-       (gnupgext_enum_func): Removed function.
-       (_gcry_md4_constructor): Removed function.
-
-       * crc.c (crc_get_info): Removed function.
-
-       * arcfour.c (do_arcfour_setkey): Changed type of context argument
-       to `void *', added local variable for cast, adjusted callers.
-       (arcfour_setkey): Likewise.
-       (encrypt_stream): Likewise.
-       * cast5.c (cast_setkey): Likewise.
-       (encrypt_block): Likewise.
-       * rijndael.c (rijndael_setkey): Likewise.
-       (rijndael_encrypt): Likewise.
-       (rijndael_decrypt): Likewise.
-       * twofish.c (twofish_setkey): Likewise.
-       (twofish_encrypt): Likewise.
-       (twofish_decrypt): Likewise.
-       * des.c (do_des_setkey): Likewise.
-       (do_des_encrypt): Likewise.
-       (do_des_encrypt): Likewise.
-       (do_tripledes_encrypt): Likewise.
-       (do_tripledes_encrypt): Likewise.
-       * blowfish.c (bf_setkey: Likewise.
-       (encrypt_block): Likewise.
-       (decrypt_block): Likewise.
-       
-       * arcfour.c (encrypt_stream): Likewise.
-
-       * rijndael.c (gnupgext_version, func_table): Removed definitions.
-       (gnupgext_enum_func) Removed function.  
-       
-       * twofish.c (gnupgext_version, func_table): Removed definitions.
-       (gnupgext_enum_func) Removed function.  
-
-       * cast5.c (CIPHER_ALGO_CAST5): Removed.
-
-       * blowfish.c (FNCCAST_SETKEY, FNCCAST_CRYPT): Removed macros.
-       (CIPHER_ALGO_BLOWFISH): Removed symbol.
-       * cast5.c (FNCCAST_SETKEY, FNCCAST_CRYPT): Likewise.
-       * des.c (selftest_failed): Removed.
-       (initialized): New variable.
-       (do_des_setkey): Run selftest, if not yet done.
-       (FNCCAST_SETKEY, FNCCAST_CRYPT): Removed macros.
-
-       * arcfour.c (_gcry_arcfour_get_info): Removed function.
-       * blowfish.c (_gcry_blowfish_get_info): Removed function.
-       * cast5.c (_gcry_cast5_get_info): Removed function.
-       * des.c (_gcry_des_get_info): Removed function.
-       * rijndael.c (_gcry_rijndael_get_info): Removed function.
-       * twofish.c (_gcry_twofish_get_info): Removed function.
-
-       * arcfour.c (cipher_spec_arcfour): New variable.
-       * twofish.c (cipher_spec_twofish, cipher_spec_twofish128): New
-       variables.
-       * rijndael.c (cipher_spec_aes, cipher_spec_aes192,
-       cipher_spec256): New variables.
-       * des.c (cipher_spec_des, cipher_spec_tripledes): New variables.
-       * cast5.c (cipher_spec_cast5): New variable.
-       * blowfish.c (cipher_spec_blowfish): Likewise.
-       
-       * twofish.c: Do not include "dynload.h".
-       * rijndael.c: Likewise.
-       * des.c: Likewise.
-       * cast5.c: Likewise.
-       * blowfish.c: Likewise.
-       * cipher.c: Likewise.
-       * crc.c: Likewise.
-       * md4.c: Likewise.
-       * md5.c: Likewise.
-       * md.c: Likewise.
-       * pubkey.c: Likewise.
-       * rijndael.c: Likewise.
-       * sha1.c: Likewise.
-       * sha256.c: Likewise.
-
-       * arcfour.c: Include "cipher.h".
-       * twofish.c: Likewise.
-       * rijndael.c: Likewise.
-       * des.c: Likewise.
-       * cast5.c: Likewise.
-       * blowfish.c: Likewise.
-
-       * twofish.c (twofish_setkey): Declared argument `key' const.
-       (twofish_encrypt): Declared argument `inbuf' const.
-       (twofish_decrypt): Likewise.
-
-       * rijndael.c (rijndael_setkey): Declared argument `key' const.
-       (rijndael_encrypt): Declared argument `inbuf' const.
-       (rijndael_decrypt): Likewise.
-
-       * des.c (do_des_setkey): Declared argument `key' const.
-       (do_tripledes_setkey): Likewise.
-       (do_des_encrypt): Declared argument `inbuf' const.
-       (do_des_decrypt): Likewise.
-       (do_tripledes_encrypt): Likewise.
-       (do_tripledes_decrypt): Likewise.
-
-       * cast5.c (encrypt_block): Declared argument `inbuf' const.
-       (decrypt_block): Likewise.
-       (cast_setkey): Declared argument `key' const.
-
-       * blowfish.c (do_bf_setkey): Declared argument `key' const.
-       (encrypt_block): Declared argument `inbuf' const.
-       (encrypt_block): Likewise.
-
-       
-
-       * cipher.c: Remove CIPHER_ALGO_DUMMY related code.
-       Removed struct cipher_table_s.
-       Changed definition of cipher_table.
-       Removed definition of disabled_algos.
-       (ciphers_registered, default_ciphers_registered): New variables.
-       (REGISTER_DEFAULT_CIPHERS): New macro.
-       (dummy_setkey): Declared argument `key' const.
-       (dummy_encrypt_block): Declared argument `inbuf' const.
-       (dummy_encrypt_block): Likewise.
-       (dummy_encrypt_stream): Likewise.
-       (dummy_encrypt_stream): Likewise.
-       (dummy_setkey): Use `unsigned char' instead of `byte'.
-       (dummy_encrypt_block): Likewise.
-       (dummy_decrypt_block): Likewise.
-       (dummy_encrypt_stream): Likewise.
-       (dummy_decrypt_stream): Likewise.
-       (gcry_cipher_register_default): New function.
-       (gcry_cipher_lookup_func_id): New function.
-       (gcry_cipher_lookup_func_name): New function.
-       (gcry_cipher_lookup_id): New function.
-       (gcry_cipher_lookup_name): New function.
-       (gcry_cipher_id_new): New function.
-       (gcry_cipher_register): New function.
-       (gcry_cipher_unregister): New function.
-       (setup_cipher_table): Removed function.
-       (load_cipher_modules): Removed function.
-       (gcry_cipher_map_name): Adjusted to use new module management.
-       (cipher_algo_to_string): Likewise.
-       (disable_cipher_algo): Likewise.
-       (check_cipher_algo): Likewise.
-       (cipher_get_keylen): Likewise.
-       (cipher_get_blocksize): Likewise.
-       (gcry_cipher_open): Likewise.
-       (struct gcry_cipher_handle): Replaced members algo, algo_index,
-       blocksize, setkey, encrypt, decrypt, stencrypt, stdecrypt with one
-       member: cipher.
-       (gcry_cipher_open): Adjusted code for new handle structure.
-       (cipher_setkey): Likewise.
-       (cipher_setiv): Likewise.
-       (cipher_reset): Likewise.
-       (do_ecb_encrypt): Likewise.
-       (do_ecb_decrypt): Likewise.
-       (do_cbc_encrypt): Likewise.
-       (do_cbc_decrypt): Likewise.
-       (do_cfb_encrypt): Likewise.
-       (do_cfb_decrypt): Likewise.
-       (do_ctr_encrypt): Likewise.
-       (cipher_encrypt): Likewise.
-       (gcry_cipher_encrypt): Likewise.
-       (cipher_decrypt): Likewise.
-       (gcry_cipher_decrypt): Likewise.
-       (cipher_sync): Likewise.
-       (gcry_cipher_ctl): Likewise.
-
-       * pubkey.c: Removed struct pubkey_table_s.
-       Changed definition of pubkey_table.
-       Removed definition of disabled_algos.
-       (pubkeys_registered, default_pubkeys_registered): New variables.
-       (REGISTER_DEFAULT_PUBKEYS): New macro.
-       (setup_pubkey_table): Removed function.
-       (load_pubkey_modules): Removed function.
-       (gcry_pubkey_register_default): New function.
-       (gcry_pubkey_lookup_func_id): New function.
-       (gcry_pubkey_lookup_func_name): New function.
-       (gcry_pubkey_lookup_id): New function.
-       (gcry_pubkey_lookup_name): New function.
-       (gcry_pubkey_id_new): New function.
-       (gcry_pubkey_register): New function.
-       (gcry_pubkey_unregister): New function.
-       (gcry_pk_map_name): Adjusted to use new module management.
-       (gcry_pk_algo_name): Likewise.
-       (disable_pubkey_algo): Likewise.
-       (check_pubkey_algo): Likewise.
-       (pubkey_get_npkey): Likewise.
-       (pubkey_get_nskey): Likewise.
-       (pubkey_get_nsig): Likewise.
-       (pubkey_get_nenc): Likewise.
-       (pubkey_generate): Likewise.
-       (pubkey_check_secret_key): Likewise.
-       (pubkey_encrypt): Likewise.
-       (pubkey_decrypt): Likewise.
-       (pubkey_sign): Likewise.
-       (pubkey_verify): Likewise.
-       (gcry_pk_get_nbits): Likewise.
-       (gcry_pk_algo_info): Likewise.
-
-       * md.c: Removed struct md_digest_list_s.
-       (digest_list): Changed definition.
-       (digests_registered, default_digests_registered): New variables.
-       (REGISTER_DEFAULT_DIGESTS): New macro.
-       (new_list_item): Removed function.
-       (setup_md_table): Removed function.
-       (load_digest_module): Removed function.
-       (gcry_digest_register_default): New function.
-       (gcry_digest_lookup_func_id): New function.
-       (gcry_digest_lookup_func_name): New function.
-       (gcry_digest_lookup_id): New function.
-       (gcry_digest_lookup_name): New function.
-       (gcry_digest_id_new): New function.
-       (gcry_digest_register): New function.
-       (gcry_digest_unregister): New function.
-       (GcryDigestEntry): New type.
-       (struct gcry_md_context): Adjusted type of `list'.
-       (gcry_md_map_name): Adjusted to use new module management.
-       (digest_algo_to_string): Likewise.
-       (check_digest_algo): Likewise.
-       (md_enable): Likewise.
-       (md_digest_length): Likewise.
-       (md_asn_oid): Likewise.
-
-2003-04-07  Moritz Schulte  <moritz@g10code.com>
-
-       * pubkey.c: Replaced PUBKEY_ALGO_DSA with GCRY_PK_DSA,
-       PUBKEY_ALGO_RSA with GCRY_PK_RSA and PUBKEY_ALGO_ELGAMAL with
-       GCRY_PK_ELG.
-
-       * dsa.c: Replaced PUBKEY_ALGO_DSA with GCRY_PK_DSA.
-
-2003-04-01  Moritz Schulte  <moritz@g10code.com>
-
-       * des.c: Removed checks for GCRY_CIPHER_3DES and GCRY_CIPHER_DES.
-
-2003-03-31  Moritz Schulte  <moritz@g10code.com>
-
-       * tiger.c (tiger_get_info): Do not declare static.
-       * sha256.c (sha256_get_info): Likewise.
-       * sha1.c (sha1_get_info): Likewise.
-       * rmd160.c (rmd160_get_info): Likewise.
-       * md5.c (md5_get_info): Likewise.
-       * md4.c (md4_get_info): Likewise.
-       * crc.c (crc_get_info): Likewise.
-
-       * md.c (load_digest_module): Call setup_md_table during
-       initialization.
-       (new_list_item): Link new element into digest_list.
-
-       * cipher.c (do_ctr_decrypt): Made do_ctr_encrypt act as a wrapper
-       for do_ctr_encrypt, since these functions are identical.
-
-2003-03-30  Simon Josefsson  <jas@extundo.com>
-
-       * cipher.c (struct gcry_cipher_handle): Add counter field.
-       (gcry_cipher_open): Add CTR.
-       (cipher_reset): Clear counter field.
-       (do_ctr_encrypt, do_ctr_decrypt): New functions.
-       (cipher_encrypt, cipher_decrypt): Call CTR functions.
-       (gcry_cipher_ctl): Add SET_CTR to set counter.
-
-2003-03-30  Moritz Schulte  <moritz@g10code.com>
-
-       * rsa.c (_gcry_rsa_blind): New function.
-       (_gcry_rsa_unblind): New function.
-       (_gcry_rsa_decrypt): Use _gcry_rsa_blind and _gcry_rsa_decrypt.
-
-2003-03-26  Moritz Schulte  <moritz@g10code.com>
-
-       * dynload.c (_gcry_enum_gnupgext_pubkeys): Adjust `encrypt' and
-       `decrypt' function arguments.
-       (_gcry_enum_gnupgext_pubkeys): Likewise.
-       * dynload.h: Likewise.
-       
-       * pubkey.c (dummy_decrypt): Add argument: int flags.
-       (dummy_encrypt): Likewise.
-
-       * elgamal.c (_gcry_elg_encrypt): Add argument: int flags.
-       (_gcry_elg_decrypt): Likewise.
-
-       * rsa.c (_gcry_rsa_encrypt): Add argument: int flags.
-       (_gcry_rsa_decrypt): Likewise.
-
-       * pubkey.c: Add `flags' argument to members `encrypt' and
-       `decrypt' of struct `pubkey_table_s'.
-
-       * rsa.h: Add `flags' argument to function declarations.
-       * elgamal.h: Likewise.
-
-       * pubkey.c (sexp_data_to_mpi): New variable: int parsed_flags.
-       (sexp_data_to_mpi): Set `parsed_flags'.
-       (sexp_data_to_mpi): New argument: int *flags.
-       (gcry_pk_encrypt): New variable: int flags.
-       (gcry_pk_encrypt): Pass `flags' to pubkey_encrypt.
-       (pubkey_encrypt): New variable: int flags.
-       (pubkey_encrypt): Pass `flags' to pubkey encrypt function.
-       (pubkey_decrypt): Likewise.
-       (pubkey_decrypt): Pass `flags' to pubkey encrypt function.
-       (gcry_pk_encrypt): Include `flags' s-exp in return list.
-       (sexp_to_enc): New argument: int *flags.
-       (gcry_pk_decrypt): New variable: int flags.
-       (gcry_pk_decrypt): Pass `flags' to pubkey_decrypt.
-       (sexp_to_enc): New variable: int parsed_flags.
-       (sexp_to_enc): Set `parsed_flags'.
-
-2003-03-22  Simon Josefsson  <jas@extundo.com>
-
-       * cipher.c (gcry_cipher_open, do_cbc_encrypt)
-       (gcry_cipher_encrypt): Support GCRY_CIPHER_CBC_MAC.
-       (gcry_cipher_ctl): Support GCRYCTL_SET_CBC_MAC.
-
-2003-03-19  Werner Koch  <wk@gnupg.org>
-
-       * primegen.c (gen_prime): New args EXTRA_CHECK and EXTRA_CHECK_ARG
-       to allow for a user callback.  Changed all callers.
-       (_gcry_generate_secret_prime)
-       (_gcry_generate_public_prime): Ditto, pass them to gen_prime.
-       * rsa.c (check_exponent): New.
-       (generate): Use a callback to ensure that a given exponent is
-       actually generated.
-
-2003-03-12  Moritz Schulte  <moritz@g10code.com>
-
-       * primegen.c: Initialize `no_of_small_prime_numbers' statically.
-       (gen_prime): Remove calculation of `no_of_small_prime_numbers'.
-
-2003-03-03  Moritz Schulte  <moritz@g10code.com>
-
-       * md.c (gcry_md_ctl): Rewritten to use same style like the other
-       functions dispatchers.
-
-2003-03-02  Moritz Schulte  <moritz@g10code.com>
-
-       * cipher.c (struct gcry_cipher_handle): New member: algo_index.
-       (gcry_cipher_open): Allocate memory for two cipher contexts.
-       Initialize algo_index.
-       (cipher_setkey): Duplicate context into reserved memory.
-       (cipher_reset): New function, which resets the context and clear
-       the IV.
-       (gcry_cipher_ctl): Call cipher_reset.
-
-2003-02-23  Moritz Schulte  <moritz@g10code.com>
-
-       * cipher.c: Remove (bogus) `digitp' macro definition.
-       * md.c: Likewise.
-
-       * blowfish.c (burn_stack): Removed.
-       * arcfour.c (burn_stack): Likewise.
-       * cast5.c (burn_stack): Likewise.
-       * des.c (burn_stack): Likewise.
-       * md4.c (burn_stack): Likewise.
-       * md5.c (burn_stack): Likewise.
-       * random.c (burn_stack): Likewise.
-       * rijndael.c (burn_stack): Likewise.
-       * rmd160.c (burn_stack): Likewise.
-       * sha1.c (burn_stack): Likewise.
-       * sha256.c (burn_stack): Likewise.
-       * tiger.c (burn_stack): Likewise.
-       * twofish.c (burn_stack): Likewise.
-
-       * blowfish.c: Changed all occurences of burn_stack to
-       _gcry_burn_stack.
-       * arcfour.c: Likewise.
-       * cast5.c: Likewise.
-       * des.c: Likewise.
-       * md4.c: Likewise.
-       * md5.c: Likewise.
-       * random.c: Likewise.
-       * rijndael.c: Likewise.
-       * rmd160.c: Likewise.
-       * sha1.c: Likewise.
-       * sha256.c: Likewise.
-       * tiger.c: Likewise.
-       * twofish.c: Likewise.
-
-       * arcfour.c (_gcry_arcfour_get_info): Use GCRY_CIPHER_ARCFOUR
-       instead of hard-coded value `301'.
-
-2003-01-24  Werner Koch  <wk@gnupg.org>
-
-       * random.c (_gcry_register_random_progress): New.
-       (_gcry_random_progress): New.
-
-       * rndlinux.c (gather_random): Call the random progress function. 
-
-2003-01-23  Werner Koch  <wk@gnupg.org>
-
-       * rsa.c (generate): New arg USE_E to request a specific public
-       exponent.
-       (_gcry_rsa_generate): Ditto.
-       * elgamal.c (_gcry_elg_generate): Must add an dummy argument
-       instead of USE_E.
-       * dsa.c (_gcry_dsa_generate): Ditto.
-       * pubkey.c (dummy_generate): Ditto.
-       (pubkey_generate): Add USE_E arg and pass it down.
-       (gcry_pk_genkey): Detect "rsa-use-e" parameter and pass it to generate.
-
-       * pubkey.c (sexp_to_enc): New arg RET_MODERN.
-       (gcry_pk_decrypt): Make use of it to return a real S-expression.
-       Return better error codes.
-       (gcry_pk_verify): Return better error codes.
-
-2003-01-21  Werner Koch  <wk@gnupg.org>
-
-       * random.c (gcry_random_add_bytes): Add QUALITY argument, let
-       function return an error code and disable its core for now.
-
-2003-01-21  Timo Schulz  <twoaday@freakmail.de>
-
-       * random.c (gcry_random_add_bytes): New. Function to add external
-       random to the pool.
-       
-2003-01-20  Simon Josefsson  <jas@extundo.com>
-
-       * crc.c: New.
-       * Makefile.am (EXTRA_PROGRAMS, EXTRA_crc_SOURCES): Add crc.c.
-       * md.c (gcry_md_get_algo_dlen): Add values for CRC.
-
-2003-01-20  Werner Koch  <wk@gnupg.org>
-
-       * sha256.c: New.
-       * bithelp.h (ror): New.
-       * Makfile.am: Add sha256.c.
-       * md.c (oid_table): Add values for SHA256 et al.
-       (gcry_md_get_algo_dlen): Likewise
-
-2003-01-20  Werner Koch  <wk@gnupg.org>
-
-       * pubkey.c (gcry_pk_get_keygrip): Implemented keygrips for DSA
-       and ElGamal.
-
-2003-01-17  Werner Koch  <wk@gnupg.org>
-
-       * cipher.c (gcry_cipher_encrypt): Reworked so that the output will
-       never contain the plaintext even if the caller did not checked the
-       return value.
-
-       * md.c (gcry_md_get_algo): Changed error code to GCRYERR_GENERAL
-       because we don't have an invalid md algo but no algorithm enabled.
-
-       * pubkey.c (gcry_pk_genkey): Changed error code for bounds check
-       of table parameters to GCRYERR_INTERNAL.
-
-       * md.c (gcry_md_open): Partly reverted Timo's change from
-       2002-10-10 by removing the check for the algorithm.  An algorithm
-       of 0 is allowed and anyway we should not double check it or check
-       it using a different function.  Also fixed the flags check.
-
-       * pubkey.c (gcry_pk_encrypt): Make sure that R_CIPH points to NULL
-       on error.
-       (gcry_pk_decrypt): Ditto for R_PLAIN.
-       (gcry_pk_sign): Ditto for R_SIG.
-       (gcry_pk_genkey): Ditto for R_KEY.
-
-2003-01-16  Werner Koch  <wk@gnupg.org>
-
-       * md.c (gcry_md_write): Changed 2nd argument type to void*.
-       (gcry_md_hash_buffer): Changed type of boths buffers to void*.
-       (gcry_md_setkey): Changed 2nd argument type to void*.
-
-2003-01-15  Werner Koch  <wk@gnupg.org>
-
-       * pubkey.c (sexp_data_to_mpi): New.  This handles pkcs1 padding.
-       (gcry_pk_sign, gcry_pk_verify): Use it here.
-       (gcry_pk_encrypt): And here.
-       (pubkey_verify): Add debug code.
-       (sexp_to_enc): Handle flags in the input and return the pkcs1 flag
-       in a new parameter.
-       (gcry_pk_decrypt): Prepare for future pkcs1 handling.
-
-2002-12-19  Werner Koch  <wk@gnupg.org>
-
-       * random.c (_gcry_random_initialize): New.
-
-2002-12-16  Werner Koch  <wk@gnupg.org>
-
-       * cipher.c: Added a Teletrust specific OID for 3DES.
-
-2002-12-12  Werner Koch  <wk@gnupg.org>
-
-       * md.c: Added another oddball OIW OID (sha-1WithRSAEncryption).
-
-2002-11-23  Werner Koch  <wk@gnupg.org>
-
-       * md.c (load_digest_module): Enlarged checked_algos bitmap.
-       * md4.c (func_table):  Fixed entry for md4. 
-       Both by Simon Josephson.
-       (transform): Copy data to get the alignment straight. Tested only
-       on i386.
-
-2002-11-10  Simon Josefsson  <jas@extundo.com>
-
-       * cipher.c (gcry_cipher_open): Don't reject CTS flag.
-       (do_cbc_encrypt, do_cbc_decrypt, cipher_encrypt) 
-       (gcry_cipher_encrypt, cipher_decrypt)
-       (gcry_cipher_decrypt): Support CTS flag.
-       (gcry_cipher_ctl): Toggle CTS flag.
-
-2002-11-10  Werner Koch  <wk@gnupg.org>
-
-       * md4.c: New. By Simon Josefsson.
-       * Makefile.am (EXTRA_PROGRAMS): Add md4.c. 
-       * md.c (oid_table,gcry_md_get_algo_dlen): MD4 support. 
-
-2002-10-14  Werner Koch  <wk@gnupg.org>
-
-       * arcfour.c (do_encrypt_stream): Don't use increment op when
-       assigning to the same variable.
-
-2002-10-10  Timo Schulz  <ts@winpt.org>
-
-       * pubkey.c (gcry_pk_genkey): Check boundaries.
-       
-       * md.c (gcry_md_open): Check that algo is available and only
-       valid flag values are used.
-       (gcry_md_get_algo): Add error handling.
-       
-2002-09-26  Werner Koch  <wk@gnupg.org>
-
-       * md.c: Include an OID for TIGER.
-       * tiger.c (tiger_get_info): Use a regular OID.
-
-2002-09-17  Werner Koch  <wk@gnupg.org>
-
-       * random.c: Replaced mutex.h by the new ath.h.  Changed all calls.
-
-2002-09-16  Werner Koch  <wk@gnupg.org>
-
-       * arcfour.c (do_encrypt_stream): Use register modifier and modulo.
-       According to Nikos Mavroyanopoulos this increases perfromace on
-       i386 system noticable.  And I always tought gcc is clever enough.
-       * md5.c (transform): Use register modifier.
-       * rmd160.c (transform): Ditto.
-       * sha1.c (transform): Ditto.  We hope that there are 6 free registers.
-       * random.c (gcry_randomize): Rewrote to avoid malloc calls.
-
-       * rndlinux.c (gather_random): Replaced remaining fprintfs by log_*.
-       * arcfour.c (do_arcfour_setkey): Ditto.
-       * twofish.c (do_twofish_setkey): Ditto.
-       * rndegd.c (gather_random): Ditto.
-       * rijndael.c (do_setkey): Ditto.
-       * random.c (_gcry_random_dump_stats): Ditto. 
-       * primegen.c (_gcry_generate_elg_prime): Ditto.
-       * des.c (_gcry_des_get_info): Ditto.
-       * cast5.c (do_cast_setkey): Ditto.
-       * blowfish.c (do_bf_setkey): Ditto.
-
-2002-08-26  Werner Koch  <wk@gnupg.org>
-
-       * des.c (weak_keys): Fixed one entry in the table and compared
-       all entries against the literature.
-       (selftest): Checksum the weak key table.
-
-2002-08-21  Werner Koch  <wk@gnupg.org>
-
-       * pubkey.c: Enable keygrip calculation for "openpgp-rsa".
-
-2002-08-17  Werner Koch  <wk@gnupg.org>
-
-       * cipher.c (setup_cipher_table): Don't overwrite the DES entry
-       with the entry for DUMMY.
-
-2002-08-14  Werner Koch  <wk@gnupg.org>
-
-       * des.c (do_des_setkey,do_des_encrypt, do_des_decrypt): New.
-       (_gcry_des_get_info): Support plain old DES.
-       * cipher.c (setup_cipher_table): Put DES into the table.
-
-2002-07-25  Werner Koch  <wk@gnupg.org>
-
-       * rndunix.c (_gcry_rndunix_constructor): Prefixed with _gcry_.
-       Noted by Stephan Austermuehle.
-
-2002-07-08  Timo Schulz  <ts@winpt.org>
-
-       * rndw32.c: Replaced the m_ memory functions with the real 
-       gcry_ functions. Renamed all g10_ prefixed functions to log_.
-       
-2002-06-12  Werner Koch  <wk@gnupg.org>
-
-       * rsa.c (generate): Use e = 65537 for now.
-
-2002-06-11  Werner Koch  <wk@gnupg.org>
-
-       * pubkey.c (gcry_pk_get_keygrip): Allow a "protected-private-key".
-
-2002-06-05  Timo Schulz  <ts@winpt.org>
-
-       * cipher.c (gcry_cipher_encrypt, gcry_cipher_decrypt):
-       Check that the input size is a multiple of the blocksize.
-       
-2002-05-23  Werner Koch  <wk@gnupg.org>
-
-       * md.c (oid_table): Add an rsadsi OID for MD5.
-
-2002-05-21  Werner Koch  <wk@gnupg.org>
-
-       * primegen.c, elgamal.c, dsa.c (progress): Do not print anything
-       by default.  Pass an extra identifying string to the callback and
-       reserved 2 argumenst for current and total counters.  Changed the
-       register function prototype.
-
-2002-05-17  Werner Koch  <wk@gnupg.org>
-
-       * rndegd.c (rndegd_constructor): Fixed name of register function
-       and prefixed the function name with _gcry_.
-       * rndw32.c (rndw32_constructor): Ditto.
-       * tiger.c (tiger_constructor): Ditto.
-
-       * Makefile.am: Removed all dynamic loading stuff.
-       * dynload.c: Ditto. Now only used for the constructor system.
-
-2002-05-15  Werner Koch  <wk@gnupg.org>
-
-       * random.c (gcry_random_bytes,gcry_random_bytes_secure)
-       (gcry_randomize): Make sure we are initialized.
-
-2002-05-14  Werner Koch  <wk@gnupg.org>
-
-       Changed license of most files to the LGPL.
-
-2002-05-02  Werner Koch  <wk@gnupg.org>
-
-       * random.c (_gcry_fast_random_poll): Initialize the module so the
-       mutex can be used.
-
-       * primegen.c (small_prime_numbers): Moved table from smallprime.c
-       * smallprime.c: File removed.
-
-       * des.c (leftkey_swap, rightkey_swap, working_memcmp): Made static.
-
-       * cipher.c (gcry_cipher_map_name): Map "RIJNDAEL" to "AES".
-       * rijndael.c (rijndael_get_info): We do only support a 128 bit
-       blocksize so it makes sense to change the algorithm strings to
-       AES.
-
-       * tiger.c (tiger_final): Removed superfluous token pasting operators.
-       * md5.c (md5_final): Ditto.
-
-2002-04-30  Werner Koch  <wk@gnupg.org>
-
-       * cipher.c: Fixed list of copyright years.
-
-2002-03-18  Werner Koch  <wk@gnupg.org>
-
-       * random.c (initialize): Initialize the new pool lock mutex.
-       (_gcry_fast_random_poll): Add locking and moved main
-       code out to...
-       (do_fast_random_poll): new function.
-       (read_pool): Use the new function here.
-       (get_random_bytes): Add locking.
-       (_gcry_update_random_seed_file): Ditto.
-
-2002-03-11  Werner Koch  <wk@gnupg.org>
-
-       * md.c: Add rsaSignatureWithripemd160 to OID table.
-
-2002-02-20  Werner Koch  <wk@gnupg.org>
-
-       * sha1.c: Removed a left over comment note.  The code has been
-       rewritten from scratch in 1998.  Thanks to Niels Möller for
-       reporting this misleading comment.
-
-2002-02-18  Werner Koch  <wk@gnupg.org>
-
-       * rndunix.c (rndunix_constructor): Use the the new prefixed
-       function name.  Reported by Jordi Mallach.
-
-2002-02-10  Werner Koch  <wk@gnupg.org>
-
-       * random.c (mix_pool): Carry an extra failsafe_digest buffer
-       around to make the function more robust.
-
-2002-02-08  Werner Koch  <wk@gnupg.org>
-
-       * random.c (add_randomness): Xor new data into the pool and not
-       just copy it.  This avoids any choosen input attacks which are not
-       serious in our setting because an outsider won't be able to mix
-       data in and even then we keep going with a PRNG.  Thanks to Stefan
-       Keller for pointing this out.
-
-2002-01-04  Werner Koch  <wk@gnupg.org>
-
-       * pubkey.c (gcry_pk_genkey): Do not release skey - it is static.
-
-       * primegen.c (gen_prime): Of course we should use set_bit
-       and not set_highbit to set the second high bit.
-
-2001-12-18  Werner Koch  <wk@gnupg.org>
-
-       * rsa.c (generate): Loop until we find the exact modulus size.
-       Changed the exponent to 41.
-       (rsa_get_info): s/usage/r_usage/ to avoid shadow warnings.
-       * primegen.c (gen_prime): Set 2 high order bits for secret primes.
-
-       * Makefile.am (DISTCLEANFILES): Include construct.c.
-
-2001-12-17  Werner Koch  <wk@gnupg.org>
-
-       * pubkey.c (gcry_pk_get_keygrip): New - experimental.
-
-2001-12-11  Werner Koch  <wk@gnupg.org>
-
-       * cipher.c: Added OIDs for AES.
-       (gcry_cipher_mode_from_oid): New.
-       (gcry_cipher_map_name): Moved OID search code to ..
-       (search_oid): .. new function.
-
-2001-12-10  Werner Koch  <wk@gnupg.org>
-
-       * pubkey.c (gcry_pk_encrypt): Find the signature algorithm by name
-       and not by number.
-       
-       * pubkey.c (gcry_pk_encrypt,gcry_pk_decrypt,gcry_pk_sign)
-       (gcry_pk_verify,gcry_pk_testkey, gcry_pk_genkey)
-       (gcry_pk_get_nbits): Release the arrays.  Noted by Nikos
-       Mavroyanopoulos.
-
-2001-12-06  Werner Koch  <wk@gnupg.org>
-
-       * cipher.c (gcry_cipher_map_name): Look also for OIDs prefixed
-       with "oid."  or "OID.".
-
-2001-12-05  Werner Koch  <wk@gnupg.org>
-
-       * pubkey.c (algo_info_table): Fixed entry for openpgp-rsa. 
-
-2001-11-24  Werner Koch  <wk@gnupg.org>
-
-       * pubkey.c: Added the rsaEncryption OID to the tables.
-       (sexp_to_key): Add an arg to return the index of the algorithm,
-       changed all callers.
-       (gcry_pk_sign): Find the signature algorithm by name and not by
-       number.
-       (gcry_pk_get_nbits): Fixed so that we can now really pass a secret
-       key to get the result.
-       
-       * md.c (gcry_md_map_name): Look also for OIDs prefixed with "oid."
-       or "OID." so that an OID string can be used as an S-Exp token.
-
-2001-11-20  Werner Koch  <wk@gnupg.org>
-
-       * md.c (gcry_md_map_name): Lookup by OID if the the name begins
-       with a digit.
-       (oid_table): New.
-       
-2001-11-16  Werner Koch  <wk@gnupg.org>
-
-       * md.c (gcry_md_info): New operator GCRYCTL_IS_ALGO_ENABLED.
-
-2001-11-07  Werner Koch  <wk@gnupg.org>
-
-       * md.c (gcry_md_hash_buffer): Close the handle which was left open
-       for algorithms other than rmd160.
-
-2001-08-08  Werner Koch  <wk@gnupg.org>
-
-       * rndw32.c (gather_random): Use toolhelp in addition to the NT
-       gatherer for Windows2000.  Suggested by Sami Tolvanen.
-
-       * random.c (read_pool): Fixed length check, this used to be one
-       byte to strict.  Made an assert out of it because the caller has
-       already made sure that only poolsize bytes are requested.
-       Reported by Marcus Brinkmann.
-
-2001-08-03  Werner Koch  <wk@gnupg.org>
-
-       * cipher.c (cipher_encrypt, cipher_decrypt): Prepare to return
-       errors. We have to change the interface to all ciphers to make
-       this really work but we should do so to prepare for hardware
-       encryption modules.
-       (gcry_cipher_encrypt, gcry_cipher_decrypt): Return the error and
-       set lasterr. 
-       (gcry_cipher_ctl): Make sure that errors from setkey are returned.
-
-2001-08-02  Werner Koch  <wk@gnupg.org>
-
-       * rndlinux.c (gather_random): casted a size_t arg to int so that
-       the format string is correct.  Casting is okay here and avoids
-       translation changes. 
-
-       * random.c (fast_random_poll): Do not check the return code of
-       getrusage.
-
-       * rndunix.c: Add a signal.h header to avoid warnings on Solaris 7
-       and 8.
-
-       * tiger.c (print_abc,print_data): Removed.
-
-       * rijndael.c, des.c, blowfish.c, twofish.c, cast5.c, arcfour.c
-       (burn_stack): New.  Add wrappers for most functions to be able to
-       call burn_stack after the function invocation. This methods seems
-       to be the most portable way to zeroise the stack used. It does
-       only work on stack frame based machines but it is highly portable
-       and has no side effects.  Just setting the automatic variables at
-       the end of a function to zero does not work well because the
-       compiler will optimize them away - marking them as volatile would
-       be bad for performance.
-       * md5.c, sha1.c, rmd160.c, tiger.c (burn_stack): Likewise.
-       * random.c (burn_stack): New.
-       (mix_pool): Use it here to burn the stack of the mixblock function.
-
-       * primegen.c (_gcry_generate_elg_prime): Freed q at 3 places.
-       Thanks to Tommi Komulainen.
-
-       * arcfour.c (arcfour_setkey): Check the minimim keylength against
-       bytes and not bits.
-       (selftest): Must reset the key before decryption. 
-
-2001-05-31  Werner Koch  <wk@gnupg.org>
-
-       * sha1.c (sha1_init): Made static.
-
-        Changed all g10_ prefixed function names as well as some mpi_
-       function names to cope with the introduced naming changes.
-       
-       * md.c (prepare_macpads): Made key const.
-
-2001-05-28  Werner Koch  <wk@gnupg.org>
-
-       * rndegd.c (gather_random): Removed the use of tty_printf.
-
-2001-03-29  Werner Koch  <wk@gnupg.org>
-
-       * md5.c (md5_final): Fixed calculation of hashed length.  Thanks
-       to disastry@saiknes.lv for pointing out that it was horrible wrong
-       for more than 512MB of input.
-       * sha1.c (sha1_final): Ditto.
-       * rmd160.c (rmd160_final): Ditto.
-       * tiger.c (tiger_final): Ditto.
-
-       * blowfish.c (encrypt,do_encrypt): Changed name to do_encrypt to
-       avoid name clashes with an encrypt function in stdlib.h of
-       Dynix/PIX.  Thanks to Gene Carter.
-       * elgamal.c (encrypt,do_encrypt): Ditto.
-
-       * twofish.c (gnupgext_enum_func): Use only when when compiled as a
-       module.
-       * rijndael.c (gnupgext_enum_func): Ditto.
-
-       * tiger.c (tiger_get_info): Return "TIGER192" and not just
-       "TIGER".  By Edwin Woudt.
-       
-       * random.c: Always include time.h - standard requirement.  Thanks
-       to James Troup.
-
-       * rndw32.c: Fixes to the macros.
-
-2001-01-11  Werner Koch  <wk@gnupg.org>
-
-       * cipher.c (cipher_encrypt,gcry_cipher_encrypt): Use blocksize and
-       not 8.
-
-2000-12-19  Werner Koch  <wk@gnupg.org>
-
-       Major change:
-       Removed all GnuPG stuff and renamed this piece of software
-       to gcrypt. 
-
-2000-11-14  Werner Koch  <wk@gnupg.org>
-
-       * dsa.c (test_keys): Replaced mpi_alloc by gcry_mpi_new and
-       mpi_free by gcry_mpi_release.
-       * elgamal.c (test_keys,generate): Ditto, also for mpi_alloc_secure.
-       * rsa.c (test_keys,generate,rsa_verify): Ditto.
-       * primegen.c (generate_elg_prime): Ditto.
-       (gen_prime): Ditto and removed nlimbs.
-
-       * rsa.c (generate): Allocate 2 more vars in secure memory.
-
-       * Makefile.am (OMIT_DEPENDENCIES): Hack to work around dependency
-       problems.
-
-2000-10-09  Werner Koch  <wk@gnupg.org>
-
-       * arcfour.c, arcfour.h: New.
-       * cipher.c (cipher_encrypt, cipher_decrypt): Add stream mode.
-       (setup_cipher_table): Add Arcfour.
-       (gcry_cipher_open): Kludge to allow stream mode.
-
-Wed Oct  4 13:16:18 CEST 2000  Werner Koch  <wk@openit.de>
-
-        * sha1.c (transform): Use rol() macro.  Actually this is not needed
-        for a newer gcc but there are still aoter compilers.
-
-        * rsa.c (test_keys): Use new random function. 
-
-        * md.c (gcry_md_setkey): New function to overcome problems with
-        const conflics.  
-        (gcry_md_ctl): Pass set key to the new functions.
-
-        * rijndael.c: New.
-        * cipher.c: Add Rijndael support.
-
-Mon Sep 18 16:35:45 CEST 2000  Werner Koch  <wk@openit.de>
-
-        * rndlinux.c (open_device): Loose random device checking.
-        By Nils Ellmenreich.
-
-        * random.c (fast_random_poll): Check ENOSYS for getrusage.
-        * rndunix.c:  Add 2 sources for QNX. By Sam Roberts.
-
-        * pubkey.c (gcry_pk_algo_info): Add GCRYCTL_GET_ALGO_USAGE.
-
-        * rsa.c: Changed the comment about the patent.
-        (secret): Speed up by using the CRT.  For a 2k keys this
-        is about 3 times faster.
-        (stronger_key_check): New but unused code to check the secret key.
-        * Makefile.am: Included rsa.[ch].
-        * pubkey.c: Enabled RSA support.
-        (pubkey_get_npkey): Removed RSA workaround.
-
-Mon Jul 31 10:04:47 CEST 2000  Werner Koch  <wk@openit.de>
-
-  * pubkey.c: Replaced all gcry_sexp_{car,cdr}_{data,mpi} by the new
-  gcry_sexp_nth_{data,mpi} functions.
-
-Tue Jul 25 17:44:15 CEST 2000  Werner Koch  <wk@openit.de>
-
-  * pubkey.c (exp_to_key,sexp_to_sig,sexp_to_enc,gcry_pk_encrypt,
-    gcry_pk_decrypt,gcry_pk_sign,gcry_pk_genkey): Changed to work with
-    the new S-Exp interface.
-
-Mon Jul 17 16:35:47 CEST 2000  Werner Koch  <wk@>
-
-  * random.c (gather_faked): Replaced make_timestamp by time(2) again.
-
-Fri Jul 14 19:38:23 CEST 2000  Werner Koch  <wk@>
-
-  * md.c (gcry_md_ctl): Support GCRYCTL_{START,STOP}_DUMP.
-
-  * Makefile.am: Never compile mingw32 as module.
-
-  * Makefile.am: Tweaked module build and removed libtool
-
-  * Makefile.am:  Replaced -O1 by -O. Suggested by Alec Habig.
-
-  * elgamal.c (sign): Removed inactive code.
-
-  * rsa.c, rsa.h: New based on the old module version (only in CVS for now).
-  * pubkey.c (setup_pubkey_table): Added commented support for RSA.
-
-  * rndunix.c (waitpid): New. For UTS 2.1.  All by Dave Dykstra.
-  (my_popen): Do the FD_CLOEXEC only if it is available
-  (start_gatherer): Cope with missing _SC_OPEN_MAX
-
-  * rndunix.c: Add some more headers for QNX. By Sam Roberts.
-
-  * rndegd.c (gather_random): Shortcut level 0.
-  * rndunix.c (gather_random): Ditto.
-  * rndw32.c (gather_random): Ditto.
-
-  * rndw32.c: Replaced with code from Cryptlib and commented the old stuff.
-  * rndw32.c: Add some debuging code enabled by an environment variable.
-
-  * random.c (read_seed_file): Binary open for DOSish system
-  (update_random_seed_file): Ditto.
-  * random.c [MINGW32]: Include process.h for getpid.
-  * random.c (fast_random_poll): Add clock_gettime() as fallback for
-  system which support this POSIX.4 fucntion. By Sam Roberts.
-
-  * random.c (read_seed_file): Removed the S_ISLNK test becuase it
-  is already covered by !S_ISREG and is not defined in Unixware.
-  Reported by Dave Dykstra.
-  (update_random_seed_file): Silently ignore update request when pool
-  is not filled.
-
-  * random.c (read_seed_file): New.
-  (set_random_seed_file): New.
-  (read_pool): Try to read the seeding file.
-  (update_random_seed_file): New.
-
-  (read_pool): Do an initial extra seeding when level 2 quality random
-  is requested the first time. This requestes at least POOLSIZE/2 bytes
-  of entropy.  Compined with the seeding file this should make normal
-  random bytes cheaper and increase the quality of the random bytes
-  used for key generation.
-
-  * random.c (read_pool): Print a more friendly error message in
-  cases when too much random is requested in one call.
-
-  * random.c (fast_random_poll): Check whether RUSAGE_SELF is defined;
-  this is not the case for some ESIX and Unixware, although they have
-  getrusage().
-
-  * primegen.c (generate_elg_prime): All primes are now generated with
-  the lowest random quality level.  Because they are public anyway we
-  don't need stronger random and by this we do not drain the systems
-  entropy so much.
-
-  * primegen.c (register_primegen_progress): New.
-  * dsa.c (register_pk_dsa_progress): New.
-  * elgamal.c (register_pk_elg_progress): New.
-
-  * elgamal.c (wiener_map): New.
-  (gen_k): Use a much smaller k.
-  (generate): Calculate the qbits using the wiener map and
-  choose an x at a size comparable to the one choosen in gen_k
-
-  * rmd160.c (rmd160_get_info): Moved casting to the left side due to a
-  problem with UTS4.3. Suggested by Dave Dykstra.
-  * sha1.c (sha1_get_info): Ditto.
-  * tiger.c (tiger_get_info): Ditto.
-  * md5.c (md5_get_info): Ditto
-  * des.c (des_get_info): Ditto.
-  * blowfish.c (blowfish_get_info): Ditto.
-  * cast5.c (cast5_get_info): Ditto.
-  * twofish.c (twofish_get_info): Ditto.
-
-Fri Mar 24 11:25:45 CET 2000  Werner Koch  <wk@openit.de>
-
-       * md.c (md_open): Add hmac arg and allocate space for the pads.
-       (md_finalize): Add HMAC support.
-       (md_copy): Ditto.
-       (md_close): Ditto.
-       (gcry_md_reset): Ditto.
-       (gcry_md_ctl): Ditto.
-       (prepare_macpdas): New.
-
-Mon Mar 13 19:22:46 CET 2000  Werner Koch  <wk@openit.de>
-
-       * md.c (gcry_md_hash_buffer): Add support for the other algorithms.
-
-Mon Jan 31 16:37:34 CET 2000  Werner Koch  <wk@gnupg.de>
-
-       * genprime.c (generate_elg_prime): Fixed returned factors which never
-       worked for non-DSA keys.
-
-Thu Jan 27 18:00:44 CET 2000  Werner Koch  <wk@gnupg.de>
-
-       * pubkey.c (sexp_to_key): Fixed mem leaks in case of errors.
-
-Mon Jan 24 22:24:38 CET 2000  Werner Koch  <wk@gnupg.de>
-
-       * pubkey.c (gcry_pk_decrypt): Implemented.
-       (gcry_pk_encrypt): Implemented.
-       (gcry_pk_testkey): New.
-       (gcry_pk_genkey): New.
-       (pubkey_decrypt): Made static.
-       (pubkey_encrypt): Ditto.
-       (pubkey_check_secret_key): Ditto.
-       (pubkey_generate): Ditto.
-
-Mon Jan 24 13:04:28 CET 2000  Werner Koch  <wk@gnupg.de>
-
-       * pubkey.c (pubkey_nbits): Removed and replaced by ...
-       (gcry_pk_get_nbits): this new one.
-
-Wed Dec  8 21:58:32 CET 1999  Werner Koch  <wk@gnupg.de>
-
-       * dsa.c: s/mpi_powm/gcry_mpi_powm/g
-       * elgamal.c: Ditto.
-       * primegen.c: Ditto.
-
-       * : Replaced g10_opt_verbose by g10_log_verbosity().
-
-       * Makefile.am (INCLUDES): removed intl, add ../gcrypt
-
-Fri Nov 19 17:15:20 CET 1999  Werner Koch  <wk@gnupg.de>
-
-       * dynload.c (cmp_filenames): New to replaced compare_filename() in
-       module.
-       (register_cipher_extension): Removed the tilde expansion stuff.
-       * rndeg.c (my_make_filename): New.
-
-       * : Replaced header util.h by g10lib.h
-
-       * random.c (gather_faked): Replaced make_timestamp by time(2).
-       Disabled wrning printed with tty_printf.
-       * rndlinux.c (gather_random): Always use fprintf instead of tty_xxx;
-       this should be replaced by a callback function.
-
-       * primegen.c (gen_prime): Use gcry_mpi_randomize.
-       (is_prime): Ditto.
-       * elgamal.c (test_keys): Ditto.
-       * dsa.c (test_keys): Ditto.
-
-       * cipher.c (gcry_cipher_close): Die on invalid handle.
-
-Mon Nov 15 21:36:02 CET 1999  Werner Koch  <wk@gnupg.de>
-
-       * elgamal.c (gen_k): Use the new random API.
-       (generate): Ditto.
-       * dsa.c (gen_k): Ditto.
-       (generate): Ditto.
-
-Sat Nov 13 17:44:23 CET 1999  Werner Koch  <wk@gnupg.de>
-
-       * pubkey.c (disable_pubkey_algo): Made static.
-       (gcry_pk_ctl): New.
-
-       * random.c (get_random_bits): Renamed to ...
-       (get_random_bytes): ... this and made static.
-       (gcry_random_bytes): New.
-       (gcry_random_bytes_secure): New.
-       (randomize_buffer): Renamed to ...
-       (gcry_randomize): ...this.
-
-       * md.c (gcry_md_hash_buffer): New.
-
-       * pubkey.c (gcry_pk_algo_info): 4 new commands.
-       (pubkey_get_npkey): Made static.
-       (pubkey_get_nskey): Made static.
-       (pubkey_get_nsig): Made static.
-       (pubkey_get_nenc): Made static.
-
-       * pubkey.c: Removed all G10ERR_xxx.
-       * cipher.c: Changed all GCRYERR_INV_ALGO to GCRYERR_INV_CIPHER_ALGO.
-       * md.c: Changed all GCRYERR_INV_ALGO to GCRYERR_INV_MD_ALGO.
-       * cast5.c (cast_setkey): Changed errocodes to GCRYERR_xxx.
-       * blowfish.c: Ditto.
-       * des.c: Ditto.
-       * twofish.c: Ditto.
-       * dsa.c: Ditto.
-       * elgamal.c: Ditto.
-
-       * g10c.c: Removed
-
-       * cipher.c (gcry_cipher_open): Replaced alloc functions and return NULL
-       if we are out of core.
-       * dynload.c: Replaced all memory allocation functions.
-       * md.c: Ditto.
-       * primegen.c: Ditto.
-       * pubkey.c: Ditto.
-       * random.c: Ditto.
-       * rndw32.c: Ditto.
-       * elgamal.c: Ditto.
-       * dsa.c: Ditto.
-
-Tue Oct 26 14:10:21 CEST 1999  Werner Koch  <wk@gnupg.de>
-
-       * elgamal.c (sign): Hugh found strange code here. Replaced by BUG().
-
-       * cipher.c: Merged with gcrypt/symapi.c.
-
-       * pubkey.c (string_to_pubkey_algo): Renamed function to ...
-       (gcry_pk_map_name): ... this.
-       (pubkey_algo_to_string): Renamed function to ...
-       (gcry_pk_algo_name): ... this.
-       (gcry_pk_algo_info): New.
-       * pubkey.c: Merged with gcrypt/pkapi.c.
-
-       * md.c (md_reset): Clear finalized; thanks to Ulf Moeller for
-       fixing this bug.
-
-       * md.c: Merged with gcrypt/mdapi.c
-
-Wed Sep 15 14:39:59 CEST 1999  Michael Roth <mroth@nessie.de>
-
-       * des.c: Various speed improvements: One bit pre rotation
-         trick after initial permutation (Richard Outerbridge).
-         Finished test of SSLeay Tripple-DES patterns.
-
-Wed Sep 15 16:22:17 CEST 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * rndw32.c: New.
-
-Mon Sep 13 10:51:29 CEST 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * bithelp.h: New.
-       * rmd160.h, sha1.h, md5.h: Use the rol macro from bithelp.h
-
-Tue Sep  7 16:23:36 CEST 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * Makefile.am: Fixed seds for latest egcc. By Ollivier Robert.
-
-Mon Sep  6 19:59:08 CEST 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * des.c (selftest): Add some testpattern
-
-Mon Aug 30 20:38:33 CEST 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * cipher.c (do_cbc_encrypt): Fixed serious bug occuring when not using
-       in place encryption. Pointed out by Frank Stajano.
-
-Mon Jul 26 09:34:46 CEST 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * md5.c (md5_final): Fix for a SCO cpp bug.
-
-Thu Jul 15 10:15:35 CEST 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * elgamal.c (elg_check_secret_key,elg_encrypt
-       elg_decrypt,elg_sign,elg_verify): Sanity check on the args.
-       * dsa.c (dsa_check_secret_key,dsa_sign,dsa_verify): Ditto.
-
-       * pubkey.c (disable_pubkey_algo): New.
-       (check_pubkey_algo2): Look at disabled algo table.
-       * cipher.c (disable_cipher_algo): New.
-       (check_cipher_algo): Look at disabled algo table.
-
-Wed Jul  7 13:08:40 CEST 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * Makefile.am: Support for libtool.
-
-Fri Jul  2 11:45:54 CEST 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * dsa.c (gen_k): Changed algorithm to consume less random bytes
-       * elgamal.c (gen_k): Ditto.
-
-       * random.c (random_dump_stats): New.
-
-Thu Jul  1 12:47:31 CEST 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * primegen.c, elgamal.c, dsa.c (progess): New and replaced all
-       fputc with a call to this function.
-
-Sat Jun 26 12:15:59 CEST 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * rndegd.c (do_write): s/ssize_t/int/ due to SunOS 4.1 probs.
-
-       * cipher.c (do_cbc_encrypt, do_cbc_decrypt): New.
-
-       * dynload.c (HAVE_DL_SHL_LOAD): Map hpux API to dlopen (Dave Dykstra).
-       * Makefile.am (install-exec-hook): Removed.
-
-Sun May 23 14:20:22 CEST 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * cipher.c (setup_cipher_table): Enable Twofish
-
-       * random.c (fast_random_poll): Disable use of times() for mingw32.
-
-Mon May 17 21:54:43 CEST 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * dynload.c (register_internal_cipher_extension): Minor init fix.
-
-Tue May  4 15:47:53 CEST 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * primegen.c (gen_prime): Readded the Fermat test. Fixed the bug
-       that we didn't correct for step when passing the prime to the
-       Rabin-Miller test which led to bad performance (Stefan Keller).
-       (check_prime): Add a first Fermat test.
-
-Sun Apr 18 10:11:28 CEST 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * cipher.c (cipher_setiv): Add ivlen arg, changed all callers.
-
-       * random.c (randomize_buffer): alway use secure memory because
-       we can't use m_is_secure() on a statically allocated buffer.
-
-       * twofish.c: Replaced some macros by a loop to reduce text size.
-       * Makefile.am (twofish): No more need for sed editing.
-
-Fri Apr  9 12:26:25 CEST 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * cipher.c (cipher_open): Reversed the changes for AUTO_CFB.
-
-       * blowfish.c: Dropped the Blowfish 160 mode.
-       * cipher.c (cipher_open): Ditto.
-       (setup_cipher_table): Ditto.  And removed support of twofish128
-
-Wed Apr  7 20:51:39 CEST 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * random.c (get_random_bits): Can now handle requests > POOLSIZE
-
-       * cipher.c (cipher_open): Now uses standard CFB for automode if
-       the blocksize is gt 8 (according to rfc2440).
-
-       * twofish.c: Applied Matthew Skala's patches for 256 bit key.
-
-Tue Apr  6 19:58:12 CEST 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * random.c (get_random_bits): Can now handle requests > POOLSIZE
-
-       * cipher.c (cipher_open): Now uses standard CFB for automode if
-       the blocksize is gt 8 (according to rfc2440).
-
-Sat Mar 20 11:44:21 CET 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * rndlinux.c (tty_printf) [IS_MODULE]: Removed.
-
-       * rndegd.c (gather_random): Some fixes.
-
-Wed Mar 17 13:09:03 CET 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * rndegd.c (do_read): New.
-       (gather_random): Changed the implementation.
-
-Mon Mar  8 20:47:17 CET 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * dynload.c (DLSYM_NEEDS_UNDERSCORE): Renamed.
-
-Fri Feb 26 17:55:41 CET 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * md.c: Nearly a total rewrote.
-
-Wed Feb 24 11:07:27 CET 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * cipher.c (context): Fixed alignment
-       * md.c: Ditto.
-
-       * rndegd.c: New
-
-Mon Feb 22 20:04:00 CET 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * rndegd.c: New.
-
-Wed Feb 10 17:15:39 CET 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * Makefile.am: Modules are now figured out by configure
-       * construct.c: New. Generated by configure. Changed all modules
-       to work with that.
-       * sha1.h: Removed.
-       * md5.h: Removed.
-
-       * twofish.c: Changed interface to allow Twofish/256
-
-       * rndunix.c (start_gatherer): Die on SIGPIPE.
-
-Wed Jan 20 18:59:49 CET 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * rndunix.c (gather_random): Fix to avoid infinite loop.
-
-Sun Jan 17 11:04:33 CET 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * des.c (is_weak_key): Replace system memcmp due to bugs
-       in SunOS's memcmp.
-       (des_get_info): Return error on failed selftest.
-       * twofish.c (twofish_setkey): Return error on failed selftest or
-       invalid keylength.
-       * cast5.c (cast_setkey): Ditto.
-       * blowfish.c (bf_setkey): Return error on failed selftest.
-
-Tue Jan 12 11:17:18 CET 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * random.c (random_is_faked): New.
-
-       * tiger.c: Only compile if we have the u64 type
-
-Sat Jan  9 16:02:23 CET 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * rndunix.c (gather_random): check for setuid.
-
-       * Makefile.am: Add a way to staically link random modules
-
-Thu Jan  7 18:00:58 CET 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * md.c (md_stop_debug): Do a flush first.
-       (md_open): size of buffer now depends on the secure parameter
-
-Sun Jan  3 15:28:44 CET 1999  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * rndunix.c (start_gatherer): Fixed stupid ==/= bug
-
-1998-12-31  Geoff Keating  <geoffk@ozemail.com.au>
-
-       * des.c (is_weak_key): Rewrite loop end condition.
-
-Tue Dec 29 14:41:47 CET 1998  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * random.c: add unistd.h for getpid().
-       (RAND_MAX): Fallback value for Sun.
-
-Wed Dec 23 17:12:24 CET 1998  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * md.c (md_copy): Reset debug.
-
-Mon Dec 14 21:18:49 CET 1998  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * random.c (read_random_source): Changed the interface to the
-       random gathering function.
-       (gather_faked): Use new interface.
-       * dynload.c (dynload_getfnc_fast_random_poll): Ditto.
-       (dynload_getfnc_gather_random): Ditto.
-       * rndlinux.c (gather_random): Ditto.
-       * rndunix.c (gather_random): Ditto.
-
-Sat Dec 12 18:40:32 CET 1998  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * dynload.c (SYMBOL_VERSION): New to cope with system which needs
-       underscores.
-
-       * rndunix.c: Rewrote large parts
-
-Thu Dec 10 20:15:36 CET 1998  Werner Koch  <wk@isil.d.shuttle.de>
-
-       * dynload.c (load_extension): increased needed verbosity level.
-
-       * random.c (fast_random_poll): Fallback to a default fast random
-       poll function.
-       (read_random_source): Always use the faked entroy gatherer if no
-       gather module is available.
-       * rndlinux.c (fast_poll): Removed.
-       * rndunix.c (fast_poll): Removed.
-
-
-Wed Nov 25 12:33:41 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * rand-*.c: Removed.
-       * rndlinux.c : New.
-       * rndunix.c : New.
-       * random.c : Restructured the interface to the gather modules.
-       (intialize): Call constructor functions
-       (read_radnom_source): Moved to here.
-       * dynload.c (dynload_getfnc_gather_random): New.
-       (dynload_getfnc_fast_random_poll): New.
-       (register_internal_cipher_extension): New.
-       (register_cipher_extension): Support of internal modules.
-
-Sun Nov  8 17:44:36 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * rand-unix.c (read_random_source): Removed the assert.
-
-Mon Oct 19 18:34:30 1998  me,,,  (wk@tobold)
-
-       * pubkey.c: Hack to allow us to give some info about RSA keys back.
-
-Thu Oct 15 11:47:57 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * dynload.c: Support for DLD
-
-Wed Oct 14 12:13:07 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * rand-unix.c: Now uses names from configure for /dev/random.
-
-1998-10-10  SL Baur  <steve@altair.xemacs.org>
-
-       * Makefile.am: fix sed -O substitutions to catch -O6, etc.
-
-Tue Oct  6 10:06:32 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * rand-unix.c (HAVE_GETTIMEOFDAY): Fixed (was ..GETTIMEOFTIME :-)
-       * rand-dummy.c (HAVE_GETTIMEOFDAY): Ditto.
-
-Mon Sep 28 13:23:09 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * md.c (md_digest): New.
-       (md_reset): New.
-
-Wed Sep 23 12:27:02 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * tiger.c (TIGER_CONTEXT): moved "buf", so that it is 64 bit aligned.
-
-Mon Sep 21 06:22:53 1998  Werner Koch  (wk@(none))
-
-       * des.c: Some patches from Michael.
-
-Thu Sep 17 19:00:06 1998  Werner Koch  (wk@(none))
-
-       * des.c : New file from Michael Roth <mroth@nessie.de>
-
-Mon Sep 14 11:10:55 1998  Werner Koch  (wk@(none))
-
-       * blowfish.c (bf_setkey): Niklas Hernaeus patch to detect weak keys.
-
-Mon Sep 14 09:19:25 1998  Werner Koch  (wk@(none))
-
-       * dynload.c (RTLD_NOW): Now defined to 1 if it is undefined.
-
-Mon Sep  7 17:04:33 1998  Werner Koch  (wk@(none))
-
-       * Makefile.am: Fixes to allow a different build directory
-
-Thu Aug  6 17:25:38 1998  Werner Koch,mobil,,, (wk@tobold)
-
-       * random.c (get_random_byte): Removed and changed all callers
-       to use get_random_bits()
-
-Mon Jul 27 10:30:22 1998  Werner Koch  (wk@(none))
-
-       * cipher.c : Support for other blocksizes
-       (cipher_get_blocksize): New.
-       * twofish.c: New.
-       * Makefile.am: Add twofish module.
-
-Mon Jul 13 21:30:52 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * random.c (read_pool): Simple alloc if secure_alloc is not set.
-       (get_random_bits): Ditto.
-
-Thu Jul  9 13:01:14 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * dynload.c (load_extension): Function now nbails out if
-       the program is run setuid.
-
-Wed Jul  8 18:58:23 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * rmd160.c (rmd160_hash_buffer): New.
-
-Thu Jul  2 10:50:30 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * cipher.c (cipher_open): algos >=100 use standard CFB
-
-Thu Jun 25 11:18:25 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * Makefile.am: Support for extensions
-
-Thu Jun 18 12:09:38 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * random.c (mix_pool): simpler handling for level 0
-
-Mon Jun 15 14:40:48 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * tiger.c: Removed from dist, will reappear as dynload module
-
-Sat Jun 13 14:16:57 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * pubkey.c: Major changes to allow extensions. Changed the inteface
-       of all public key ciphers and added the ability to load extensions
-       on demand.
-
-       * misc.c: Removed.
-
-Wed Jun 10 07:52:08 1998  Werner Koch,mobil,,, (wk@tobold)
-
-       * dynload.c: New.
-       * cipher.c: Major changes to allow extensions.
-
-Mon Jun  8 22:43:00 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * cipher.c: Major internal chnages to support extensions.
-       * blowfish.c (blowfish_get_info): New and made all internal
-       functions static, changed heder.
-       * cast5.c (cast5_get_info): Likewise.
-
-Mon Jun  8 12:27:52 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * tiger.c (transform): Fix for big endian
-
-       * cipher.c (do_cfb_decrypt): Big endian fix.
-
-Fri May 22 07:30:39 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * md.c (md_get_oid): Add a new one for TIGER.
-
-Thu May 21 13:24:52 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * cipher.c: Add support for a dummy cipher
-
-Thu May 14 15:40:36 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * rmd160.c (transform): fixed sigbus - I should better
-       add Christian von Roques's new implemenation of rmd160_write.
-
-Fri May  8 18:07:44 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * rand-internal.h, rand-unix.c, rand-w32.c, rand_dummy.c: New
-       * random.c: Moved system specific functions to rand-****.c
-
-Fri May  8 14:01:17 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * random.c (fast_random_poll): add call to gethrtime.
-
-Tue May  5 21:28:55 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * elgamal.c (elg_generate): choosing x was not correct, could
-       yield 6 bytes which are not from the random pool, tsss, tsss..
-
-Tue May  5 14:09:06 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * primegen.c (generate_elg_prime): Add arg mode, changed all
-       callers and implemented mode 1.
-
-Mon Apr 27 14:41:58 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * cipher.c (cipher_get_keylen): New.
-
-Sun Apr 26 14:44:52 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * tiger.c, tiger.h: New.
-
-Wed Apr  8 14:57:11 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * misc.c (check_pubkey_algo2): New.
-
-Tue Apr  7 18:46:49 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * cipher.c: New
-       * misc.c (check_cipher_algo): Moved to cipher.c
-       * cast5.c: Moved many functions to cipher.c
-       * blowfish.c: Likewise.
-
-Sat Apr  4 19:52:08 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * cast5.c: Implemented and tested.
-
-Wed Apr  1 16:38:27 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * elgamal.c (elg_generate): Faster generation of x in some cases.
-
-Thu Mar 19 13:54:48 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * blowfish.c (blowfish_decode_cfb): changed XOR operation
-       (blowfish_encode_cfb): Ditto.
-
-Thu Mar 12 14:04:05 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * sha1.c (transform): Rewrote
-
-       * blowfish.c (encrypt): Unrolled for rounds == 16
-       (decrypt): Ditto.
-
-Tue Mar 10 16:32:08 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * rmd160.c (transform): Unrolled the loop.
-
-Tue Mar 10 13:05:14 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * random.c (read_pool): Add pool_balance stuff.
-       (get_random_bits): New.
-
-       * elgamal.c (elg_generate): Now uses get_random_bits to generate x.
-
-
-Tue Mar 10 11:33:51 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * md.c (md_digest_length): New.
-
-Tue Mar 10 11:27:41 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * dsa.c (dsa_verify): Works.
-
-Mon Mar  9 12:59:08 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * dsa.c, dsa.h: Removed some unused code.
-
-Wed Mar  4 10:39:22 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * md.c (md_open): Add call to fast_random_poll.
-       blowfish.c (blowfish_setkey): Ditto.
-
-Tue Mar  3 13:32:54 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * rmd160.c (rmd160_mixblock): New.
-       * random.c: Restructured to start with a new RNG implementation.
-       * random.h: New.
-
-Mon Mar  2 19:21:46 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * gost.c, gost.h: Removed because they did only contain trash.
-
-Sun Mar  1 16:42:29 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * random.c (fill_buffer): removed error message if n == -1.
-
-Fri Feb 27 16:39:34 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * md.c (md_enable): No init if called twice.
-
-Thu Feb 26 07:57:02 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * primegen.c (generate_elg_prime): Changed the progress printing.
-       (gen_prime): Ditto.
-
-Tue Feb 24 12:28:42 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * md5.c, md.5 : Replaced by a modified version of md5.c from
-       GNU textutils 1.22.
-
-Wed Feb 18 14:08:30 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * md.c, md.h : New debugging support
-
-Mon Feb 16 10:08:47 1998  Werner Koch  (wk@isil.d.shuttle.de)
-
-       * misc.c (cipher_algo_to_string): New
-       (pubkey_algo_to_string): New.
-       (digest_algo_to_string): New.
-
-
- Copyright 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006
-          2007, 2008, 2009, 2010 Free Software Foundation, Inc.
-
- This file is free software; as a special exception the author gives
- unlimited permission to copy and/or distribute it, with or without
- modifications, as long as this notice is preserved.
-
- This file is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY, to the extent permitted by law; without even the
- implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
diff --git a/grub-core/lib/libgcrypt/cipher/ChangeLog-2011 
b/grub-core/lib/libgcrypt/cipher/ChangeLog-2011
index 05516c99e..1ce6bd1e6 100644
--- a/grub-core/lib/libgcrypt/cipher/ChangeLog-2011
+++ b/grub-core/lib/libgcrypt/cipher/ChangeLog-2011
@@ -1,9 +1,37 @@
 2011-12-01  Werner Koch  <wk@g10code.com>
 
-        NB: ChangeLog files are no longer manually maintained.  Starting
-        on December 1st, 2011 we put change information only in the GIT
-        commit log, and generate a top-level ChangeLog file from logs at
-        "make dist".  See doc/HACKING for details.
+       NB: ChangeLog files are no longer manually maintained.  Starting
+       on December 1st, 2011 we put change information only in the GIT
+       commit log, and generate a top-level ChangeLog file from logs at
+       "make dist".  See doc/HACKING for details.
+
+2011-09-16  Werner Koch  <wk@g10code.com>
+
+       * primegen.c (_gcry_primegen_init): New.
+
+2011-09-15  Werner Koch  <wk@g10code.com>
+
+       * cipher-cbc.c, cipher-cfb.c, cipher-ofb.c, cipher-ctr.c: New.
+       * cipher-aeswrap.c: New.
+       * cipher-internal.h: New.
+       * cipher.c (cipher_context_alignment_t, struct gcry_cipher_handle)
+       (CTX_MAGIC_NORMAL, CTX_MAGIC_SECURE, NEED_16BYTE_ALIGNED_CONTEXT)
+       (MAX_BLOCKSIZE): Move to cipher-internal.h.
+       (do_aeswrap_encrypt, do_aeswrap_encrypt)
+       (do_cbc_encrypt, do_cbc_decrypt, do_ctr_encrypt, do_ctr_decrypt)
+       (do_ofb_encrypt, do_ofb_decrypt, do_ctr_encrypt): Move to the
+       respective new cipher-foo.c files.
+       (do_ctr_decrypt): Remove.
+
+2011-09-15  Werner Koch  <wk@g10code.com>
+
+       * pubkey.c (gcry_pk_list): Remove.
+       (gcry_pk_unregister): Remove.
+       * md.c (gcry_md_list): Remove.
+       (gcry_md_unregister): Remove.
+       * cipher.c (gcry_cipher_list): Remove.
+       (gcry_cipher_unregister): Remove.
+       * ac.c: Remove.
 
 2011-06-29  Werner Koch  <wk@g10code.com>
 
@@ -4245,3 +4273,7 @@ Mon Feb 16 10:08:47 1998  Werner Koch  
(wk@isil.d.shuttle.de)
  This file is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY, to the extent permitted by law; without even the
  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+Local Variables:
+buffer-read-only: t
+End:
diff --git a/grub-core/lib/libgcrypt/cipher/Makefile.am 
b/grub-core/lib/libgcrypt/cipher/Makefile.am
index 76cdc96ad..ea9014cc9 100644
--- a/grub-core/lib/libgcrypt/cipher/Makefile.am
+++ b/grub-core/lib/libgcrypt/cipher/Makefile.am
@@ -19,65 +19,317 @@
 
 # Process this file with automake to produce Makefile.in
 
-EXTRA_DIST = Manifest
-
 # Need to include ../src in addition to top_srcdir because gcrypt.h is
 # a built header.
-AM_CPPFLAGS = -I../src -I$(top_srcdir)/src
+AM_CPPFLAGS = -I../src -I$(top_srcdir)/src -I../mpi -I$(top_srcdir)/mpi
 AM_CFLAGS = $(GPG_ERROR_CFLAGS)
 
+AM_CCASFLAGS = $(NOEXECSTACK_FLAGS)
+
+EXTRA_DIST = gost-s-box.c kyber-common.c kyber-kdep.c
+
+CLEANFILES = gost-s-box$(EXEEXT_FOR_BUILD)
+DISTCLEANFILES = gost-sb.h
 
 noinst_LTLIBRARIES = libcipher.la
 
-GCRYPT_MODULES = @GCRYPT_CIPHERS@ @GCRYPT_PUBKEY_CIPHERS@ @GCRYPT_DIGESTS@
+GCRYPT_MODULES = @GCRYPT_CIPHERS@ @GCRYPT_PUBKEY_CIPHERS@ \
+                 @GCRYPT_DIGESTS@ @GCRYPT_KDFS@
 
 libcipher_la_DEPENDENCIES = $(GCRYPT_MODULES)
 libcipher_la_LIBADD = $(GCRYPT_MODULES)
 
 libcipher_la_SOURCES = \
-cipher.c pubkey.c ac.c md.c kdf.c \
-hmac-tests.c \
-bithelp.h  \
-primegen.c  \
-hash-common.c hash-common.h \
-rmd.h
+       cipher.c cipher-internal.h \
+       cipher-cbc.c \
+       cipher-cfb.c \
+       cipher-ofb.c \
+       cipher-ctr.c \
+       cipher-aeswrap.c \
+       cipher-ccm.c \
+       cipher-cmac.c \
+       cipher-gcm.c \
+       cipher-poly1305.c \
+       cipher-ocb.c \
+       cipher-xts.c \
+       cipher-eax.c \
+       cipher-siv.c \
+       cipher-gcm-siv.c \
+       pubkey.c pubkey-internal.h pubkey-util.c \
+       md.c \
+       mac.c mac-internal.h \
+       mac-hmac.c mac-cmac.c mac-gmac.c mac-poly1305.c \
+       poly1305.c poly1305-internal.h \
+       kem.c sntrup761.c sntrup761.h kyber.c kyber.h kem-ecc.c kem-ecc.h \
+       mceliece6688128f.c mceliece6688128f.h \
+       kdf.c kdf-internal.h \
+       bithelp.h  \
+       bufhelp.h  \
+       bulkhelp.h \
+       primegen.c \
+       hash-common.c hash-common.h \
+       dsa-common.c rsa-common.c \
+       sha1.h
 
 EXTRA_libcipher_la_SOURCES = \
-arcfour.c \
-blowfish.c \
-cast5.c \
-crc.c \
-des.c \
-dsa.c \
-elgamal.c \
-ecc.c \
-idea.c \
-md4.c \
-md5.c \
-rijndael.c rijndael-tables.h \
-rmd160.c \
-rsa.c \
-seed.c \
-serpent.c \
-sha1.c \
-sha256.c \
-sha512.c \
-tiger.c \
-whirlpool.c \
-twofish.c \
-rfc2268.c \
-camellia.c camellia.h camellia-glue.c
+       asm-common-aarch64.h \
+       asm-common-amd64.h \
+       asm-common-i386.h \
+       asm-common-s390x.h \
+       asm-inline-s390x.h \
+       asm-poly1305-aarch64.h \
+       asm-poly1305-amd64.h \
+       asm-poly1305-s390x.h \
+       aria.c aria-aesni-avx-amd64.S aria-aesni-avx2-amd64.S \
+       aria-gfni-avx512-amd64.S \
+       arcfour.c arcfour-amd64.S \
+       blowfish.c blowfish-amd64.S blowfish-arm.S \
+       cast5.c cast5-amd64.S cast5-arm.S \
+       chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \
+       chacha20-amd64-avx512.S chacha20-armv7-neon.S chacha20-aarch64.S \
+       chacha20-ppc.c chacha20-s390x.S \
+       chacha20-p10le-8x.s \
+       cipher-gcm-ppc.c cipher-gcm-intel-pclmul.c cipher-gcm-armv7-neon.S \
+       cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \
+       crc.c crc-intel-pclmul.c crc-armv8-ce.c \
+       crc-armv8-aarch64-ce.S \
+       crc-ppc.c \
+       des.c des-amd64.S \
+       dsa.c \
+       elgamal.c \
+       ecc.c ecc-curves.c ecc-misc.c ecc-common.h \
+       ecc-ecdh.c ecc-ecdsa.c ecc-eddsa.c ecc-gost.c ecc-sm2.c \
+       idea.c \
+       gost28147.c gost.h \
+       gostr3411-94.c \
+       md4.c \
+       md5.c \
+       poly1305-s390x.S poly1305-amd64-avx512.S \
+       poly1305-p10le.s \
+       rijndael.c rijndael-internal.h rijndael-tables.h   \
+       rijndael-aesni.c rijndael-padlock.c                \
+       rijndael-amd64.S rijndael-arm.S                    \
+       rijndael-ssse3-amd64.c rijndael-ssse3-amd64-asm.S  \
+       rijndael-vaes.c rijndael-vaes-avx2-amd64.S         \
+       rijndael-vaes-i386.c rijndael-vaes-avx2-i386.S     \
+       rijndael-armv8-ce.c rijndael-armv8-aarch32-ce.S    \
+       rijndael-armv8-aarch64-ce.S rijndael-aarch64.S     \
+       rijndael-ppc.c rijndael-ppc9le.c                   \
+       rijndael-p10le.c rijndael-gcm-p10le.s              \
+       rijndael-ppc-common.h rijndael-ppc-functions.h     \
+       rijndael-s390x.c                                   \
+       rmd160.c \
+       rsa.c \
+       salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \
+       scrypt.c \
+       seed.c \
+       serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S \
+       serpent-avx512-x86.c serpent-armv7-neon.S \
+       sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S \
+       sm4-gfni-avx2-amd64.S sm4-gfni-avx512-amd64.S \
+       sm4-aarch64.S sm4-armv8-aarch64-ce.S sm4-armv9-aarch64-sve-ce.S \
+       sm4-ppc.c \
+       sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
+       sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \
+       sha1-armv8-aarch64-ce.S sha1-intel-shaext.c \
+       sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S \
+       sha256-avx2-bmi2-amd64.S \
+       sha256-armv8-aarch32-ce.S sha256-armv8-aarch64-ce.S \
+       sha256-intel-shaext.c sha256-ppc.c \
+       sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S \
+       sha512-avx2-bmi2-amd64.S sha512-avx512-amd64.S \
+       sha512-armv7-neon.S sha512-armv8-aarch64-ce.S sha512-arm.S \
+       sha512-ppc.c sha512-ssse3-i386.c \
+       sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S sm3-armv8-aarch64-ce.S \
+       keccak.c keccak_permute_32.h keccak_permute_64.h \
+       keccak-armv7-neon.S keccak-amd64-avx512.S \
+       stribog.c \
+       tiger.c \
+       whirlpool.c whirlpool-sse2-amd64.S \
+       twofish.c twofish-amd64.S twofish-arm.S twofish-aarch64.S \
+       twofish-avx2-amd64.S \
+       rfc2268.c \
+       camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
+       camellia-aesni-avx2-amd64.h \
+       camellia-gfni-avx2-amd64.S camellia-gfni-avx512-amd64.S \
+       camellia-vaes-avx2-amd64.S camellia-aesni-avx2-amd64.S \
+       camellia-arm.S camellia-aarch64.S camellia-aarch64-ce.c \
+       camellia-simd128.h camellia-ppc8le.c camellia-ppc9le.c \
+       blake2.c \
+       blake2b-amd64-avx2.S blake2b-amd64-avx512.S \
+       blake2s-amd64-avx.S blake2s-amd64-avx512.S
+
+gost28147.lo: gost-sb.h
+gost-sb.h: gost-s-box$(EXEEXT_FOR_BUILD)
+       ./gost-s-box$(EXEEXT_FOR_BUILD) $@
+
+gost-s-box$(EXEEXT_FOR_BUILD): gost-s-box.c
+       $(CC_FOR_BUILD) $(CFLAGS_FOR_BUILD) $(LDFLAGS_FOR_BUILD) \
+           $(CPPFLAGS_FOR_BUILD) -o $@ $(srcdir)/gost-s-box.c
+
 
 if ENABLE_O_FLAG_MUNGING
-o_flag_munging = sed -e 's/-O\([2-9s][2-9s]*\)/-O1/' -e 's/-Ofast/-O1/g'
+o_flag_munging = sed -e 's/[[:blank:]]-O\([2-9sgz][2-9sgz]*\)/ -O1 /' -e 
's/[[:blank:]]-Ofast/ -O1 /g'
 else
 o_flag_munging = cat
 endif
 
 
 # We need to lower the optimization for this module.
-tiger.o: $(srcdir)/tiger.c
-       `echo $(COMPILE) -c $(srcdir)/tiger.c | $(o_flag_munging) `
+tiger.o: $(srcdir)/tiger.c Makefile
+       `echo $(COMPILE) -c $< | $(o_flag_munging) `
+
+tiger.lo: $(srcdir)/tiger.c Makefile
+       `echo $(LTCOMPILE) -c $< | $(o_flag_munging) `
+
+
+# We need to disable instrumentation for these modules as they use cc as
+# thin assembly front-end and do not tolerate in-between function calls
+# inserted by compiler as those functions may clobber the XMM registers.
+if ENABLE_INSTRUMENTATION_MUNGING
+instrumentation_munging = sed \
+       -e 's/-fsanitize[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g' \
+       -e 's/-fprofile[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g' \
+       -e 's/-fcoverage[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g'
+else
+instrumentation_munging = cat
+endif
+
+rijndael-aesni.o: $(srcdir)/rijndael-aesni.c Makefile
+       `echo $(COMPILE) -c $< | $(instrumentation_munging) `
+
+rijndael-aesni.lo: $(srcdir)/rijndael-aesni.c Makefile
+       `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+
+rijndael-ssse3-amd64.o: $(srcdir)/rijndael-ssse3-amd64.c Makefile
+       `echo $(COMPILE) -c $< | $(instrumentation_munging) `
+
+rijndael-ssse3-amd64.lo: $(srcdir)/rijndael-ssse3-amd64.c Makefile
+       `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+
+cipher-gcm-intel-pclmul.o: $(srcdir)/cipher-gcm-intel-pclmul.c Makefile
+       `echo $(COMPILE) -c $< | $(instrumentation_munging) `
+
+cipher-gcm-intel-pclmul.lo: $(srcdir)/cipher-gcm-intel-pclmul.c Makefile
+       `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+
+sha1-intel-shaext.o: $(srcdir)/sha1-intel-shaext.c Makefile
+       `echo $(COMPILE) -c $< | $(instrumentation_munging) `
+
+sha1-intel-shaext.lo: $(srcdir)/sha1-intel-shaext.c Makefile
+       `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+
+sha256-intel-shaext.o: $(srcdir)/sha256-intel-shaext.c Makefile
+       `echo $(COMPILE) -c $< | $(instrumentation_munging) `
+
+sha256-intel-shaext.lo: $(srcdir)/sha256-intel-shaext.c Makefile
+       `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+
+sha256-ssse3-i386.o: $(srcdir)/sha256-ssse3-i386.c Makefile
+       `echo $(COMPILE) -c $< | $(instrumentation_munging) `
+
+sha256-ssse3-i386.lo: $(srcdir)/sha256-ssse3-i386.c Makefile
+       `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+
+crc-intel-pclmul.o: $(srcdir)/crc-intel-pclmul.c Makefile
+       `echo $(COMPILE) -c $< | $(instrumentation_munging) `
+
+crc-intel-pclmul.lo: $(srcdir)/crc-intel-pclmul.c Makefile
+       `echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+
+if ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS
+ppc_vcrypto_cflags = -O2 -maltivec -mvsx -mcrypto
+else
+ppc_vcrypto_cflags =
+endif
+
+if ENABLE_AARCH64_NEON_INTRINSICS_EXTRA_CFLAGS
+aarch64_neon_cflags = -O2 -march=armv8-a+crypto
+else
+aarch64_neon_cflags =
+endif
+
+rijndael-ppc.o: $(srcdir)/rijndael-ppc.c Makefile
+       `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | 
$(instrumentation_munging) `
+
+rijndael-ppc.lo: $(srcdir)/rijndael-ppc.c Makefile
+       `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | 
$(instrumentation_munging) `
+
+rijndael-ppc9le.o: $(srcdir)/rijndael-ppc9le.c Makefile
+       `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | 
$(instrumentation_munging) `
+
+rijndael-ppc9le.lo: $(srcdir)/rijndael-ppc9le.c Makefile
+       `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | 
$(instrumentation_munging) `
+
+rijndael-p10le.o: $(srcdir)/rijndael-p10le.c Makefile
+       `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | 
$(instrumentation_munging) `
+
+rijndael-p10le.lo: $(srcdir)/rijndael-p10le.c Makefile
+       `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | 
$(instrumentation_munging) `
+
+sha256-ppc.o: $(srcdir)/sha256-ppc.c Makefile
+       `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | 
$(instrumentation_munging) `
+
+sha256-ppc.lo: $(srcdir)/sha256-ppc.c Makefile
+       `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | 
$(instrumentation_munging) `
+
+sha512-ppc.o: $(srcdir)/sha512-ppc.c Makefile
+       `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | 
$(instrumentation_munging) `
+
+sha512-ppc.lo: $(srcdir)/sha512-ppc.c Makefile
+       `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | 
$(instrumentation_munging) `
+
+chacha20-ppc.o: $(srcdir)/chacha20-ppc.c Makefile
+       `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | 
$(instrumentation_munging) `
+
+chacha20-ppc.lo: $(srcdir)/chacha20-ppc.c Makefile
+       `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | 
$(instrumentation_munging) `
+
+crc-ppc.o: $(srcdir)/crc-ppc.c Makefile
+       `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | 
$(instrumentation_munging) `
+
+crc-ppc.lo: $(srcdir)/crc-ppc.c Makefile
+       `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | 
$(instrumentation_munging) `
+
+cipher-gcm-ppc.o: $(srcdir)/cipher-gcm-ppc.c Makefile
+       `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | 
$(instrumentation_munging) `
+
+cipher-gcm-ppc.lo: $(srcdir)/cipher-gcm-ppc.c Makefile
+       `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | 
$(instrumentation_munging) `
+
+camellia-ppc8le.o: $(srcdir)/camellia-ppc8le.c Makefile
+       `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | 
$(instrumentation_munging) `
+
+camellia-ppc8le.lo: $(srcdir)/camellia-ppc8le.c Makefile
+       `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | 
$(instrumentation_munging) `
+
+camellia-ppc9le.o: $(srcdir)/camellia-ppc9le.c Makefile
+       `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | 
$(instrumentation_munging) `
+
+camellia-ppc9le.lo: $(srcdir)/camellia-ppc9le.c Makefile
+       `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | 
$(instrumentation_munging) `
+
+camellia-aarch64-ce.o: $(srcdir)/camellia-aarch64-ce.c Makefile
+       `echo $(COMPILE) $(aarch64_neon_cflags) -c $< | 
$(instrumentation_munging) `
+
+camellia-aarch64-ce.lo: $(srcdir)/camellia-aarch64-ce.c Makefile
+       `echo $(LTCOMPILE) $(aarch64_neon_cflags) -c $< | 
$(instrumentation_munging) `
+
+sm4-ppc.o: $(srcdir)/sm4-ppc.c Makefile
+       `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | 
$(instrumentation_munging) `
+
+sm4-ppc.lo: $(srcdir)/sm4-ppc.c Makefile
+       `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | 
$(instrumentation_munging) `
+
+
+if ENABLE_X86_AVX512_INTRINSICS_EXTRA_CFLAGS
+avx512f_cflags = -mavx512f
+else
+avx512f_cflags =
+endif
+
+serpent-avx512-x86.o: $(srcdir)/serpent-avx512-x86.c Makefile
+       `echo $(COMPILE) $(avx512f_cflags) -c $< | $(instrumentation_munging) `
 
-tiger.lo: $(srcdir)/tiger.c
-       `echo $(LTCOMPILE) -c $(srcdir)/tiger.c | $(o_flag_munging) `
+serpent-avx512-x86.lo: $(srcdir)/serpent-avx512-x86.c Makefile
+       `echo $(LTCOMPILE) $(avx512f_cflags) -c $< | $(instrumentation_munging) 
`
diff --git a/grub-core/lib/libgcrypt/cipher/Manifest 
b/grub-core/lib/libgcrypt/cipher/Manifest
deleted file mode 100644
index 0cd64f71f..000000000
--- a/grub-core/lib/libgcrypt/cipher/Manifest
+++ /dev/null
@@ -1,73 +0,0 @@
-# Manifest - checksums of the cipher directory
-# Copyright 2003 Free Software Foundation, Inc.
-#
-# This file is part of Libgcrypt.
-#
-# Libgcrypt is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Lesser general Public License as
-# published by the Free Software Foundation; either version 2.1 of
-# the License, or (at your option) any later version.
-#
-# Libgcrypt is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
-
-# Checksums for all source files in this directory. Format is
-# filename, blanks, base-64 part of an OpenPGP detached signature
-# without the header lines.  Blank lines and lines beginning with a
-# hash mark are ignored.  A tool to process this file is available by
-# cvs -d :pserver:anoncvs@cvs.gnupg.org:/cvs/wk co misc-scripts/manifest-tool
-#
-# The special entry "$names$" holds a signature over all sorted
-# filenames excluding itself.
-
-
-# Algorithm API
-cipher.c 
iQCVAwUAQDzrVjEAnp832S/7AQIPDgP+OVJ/YNWY5m7c09EBbPAzL/WsGoj6wrBNMmkRlMOqTHeh+OOtjuFHt1f9uhfM2Nzl7sJ5+h4ryZKLEZmQPRMTZTnAqkvGdsrJWJnigUA9QwYdV0ONqC9C63gpuG465gO9TZVOqlQu/FTxSRuTQYUulkaBNG71n8nZEOusBVwV2YA==58xH
-pubkey.c 
iQCVAwUAP9XQ3jEAnp832S/7AQJ5UgQAyHfEBvPVJ8wTRg8c7ixS2GiVmIgwIo5tvQaiQJTPWASevvYrB+2Z2qa9cATyu50ACjLzbaquGBgPzjJV3dU/qttT1gCqRuN/LCNvXFe5qnIZezejc3RAadFNTw/pOTHq0wxD1Keg66ruei9R36Nba59pEQIWIBXTfubRft2hMYk==E09t
-ac.c 
iQCVAwUAQDzsOzEAnp832S/7AQJCBQP/WI6EV/dsR4rmha6RVhvkjZo17kQ8z6pIl5J3cXOvqEkIFeD2HYu3HHrWST5l7yXlffhpDkVHkfMih4ruK76q6Fm0dxZ98pO4C/dVtgimlvvcy/wOQjpzsE0fYAe1BYdg81LJ09X33vW5x6C29lunfKROO2tPlV5i8ffeoFvmMF8==j26g
-md.c 
iQCVAwUAP+NFGjEAnp832S/7AQJs8wP/Qdk0EAKsyr3O1/pmOSN8AG4rPKbd6KDTzvoBPAN4upFwKYY4hWwvy12Q3YU9DmECrzZkRCXHR7mljVQKs6B7CRZJKjFKmOELpcJDtKvu40vTs1bOH4k9iJYZpGgRA83nkQ+ELAcphAbCA+KIpVr2K4mCJAB0FhpC2uOQ50JHAko==BeF6
-primegen.c 
iQCVAwUAQDzsoDEAnp832S/7AQKYRwP/TqAQBm1rHTnF0HYE05PqXfWlOqa6EosqVpaOcs/OIW6PaqX0xH1UlrukK7jNOjK3xC4o1qNQ1UKzz2dvQaq1bMvNNizeavxAh10SJZc0hIc/ofc83IbjLh8SZVWQ67JxjsUd3DOXmSmhPZ+Pqd7cUIiw8fDoF+I9EZqy3COu1wY==1ebT
-
-# Algorithm implementations
-arcfour.c 
iQCVAwUAP9XR/TEAnp832S/7AQJcRwP6AlvYEx++fpT4mIYo0xRDqKEQeqMQvbaRhIg2eV74JxItpHa3q5YsYIl+n1yUz5g35JRWWXSWmAZBwO5wLKsHii4kRUhgrKWnSoQZoPpl49L5+N3R58ON3S0ru5lsBiEJEze3xplf2vqwrH9v1QHVD+gU7UTlfNqrIJoOUXN+1O4==Tq+x
-blowfish.c 
iQCVAwUAP9XTETEAnp832S/7AQJaEgQAgiqqfuO+zQtscgTB0rvOzVymIKjRKjYhFuLjVuc79G4z1RCAffvIn/YM2d7kt+Z/QF7zjcTAOgETCQL1XokpX2zz9HPAMi2tlDY5zsDufTNqj0n4WBL9nM7w6XAvsiwP1B3bqCTv9SjJV4KbxJ58vw1yQE+sqW74R/QIHFvC7mU==wZnX
-cast5.c 
iQCVAwUAP9XT6DEAnp832S/7AQJ3xgP/ehLjEN3GELGudbqeo91Xd+PqitHrkuBbtRIYX7Udd/fyXLN+h8rMJVyIQX2m+mpxbBxudVU3x8/DNT8B0ZHAwK6qqJmEBLLhEYPgIuF76i9LMrP1KqUPhAwRZ2OppjIIugBQ+rP74aD4eLyd/aKQHNuXML8QGWR6KwQShohXM5I==/BRh
-crc.c 
iQCVAwUAP7ouejEAnp832S/7AQIgwQQApg5Nm63tH5DQkbN+zPzMO9Ygoj3ukxfFTyTBPYSXYKMiTjEbESegaU40uN8jnz2vprcIQWcgZfzO4+opEJMcI35aPwzEk0vKOp0S/PrBLUY2rJfnDVkX5XgJFZa2Q7LLe826UEBzTVYW924utiCCe8oOaOEWVNpg1mqdknu3M9o==kz5D
-des.c 
iQCVAwUAQCN2oDEAnp832S/7AQL/jwP6Auoq6nZCDBjpgc9tDzuIRwa9DqyuM3gX94uvgEpUwdHszb2bG43dz03kVmcYxtj1MzXbyCeCZOwox0b2SKmLgxIbrNP6yGbzVdTj6592gDYuf/ZXmc1ZNJ1DDldcPQ0n9fXUipUPwyPaNWo3mSZaNcMKSWWzdK0J6ciG6nk7SWI==9k/t
-dsa.c 
iQCVAwUAP9XZHDEAnp832S/7AQLBRgP/XrBzTEYx5ccMj1MMb6sg37liEHdIyyy49zjvt6jUqxj4RuwVEN8S6v3u4q/QyJkHAi1E0EkREgENlyHW6PKWhYbcrd0vPIAN15yjnl2yqtrCrJImexUCoqJJewK0E4JOicGbabTil8MZjk+mbhEPnjJBqOkyP1w0i31pEDgE/8M==pC8s
-elgamal.c 
iQCVAwUAP9XbYzEAnp832S/7AQLXagQA3HrvspZfbTGgmUH0IqLQTJ0exUPxJv5DET2TvoIy62trDmMN6lTAj5P+a7jQ8udcu0w+mR2vXUHcxUpNA2PxLaMwGzNSY4zRDNe9r3SFTDrFm6m4y9Ko2e8XtEA+WF6P/XLpck4Jn7vMEDmVGPwkNd22kXFFE8dBGwG6i5Hk1Mk==oBUs
-md4.c 
iQCVAwUAP9h50DEAnp832S/7AQJhHgQAzNA/B6MWFDlCtPkIVaW8RpP1Eg0ZNMsy0s7SJkopOCBlu6CwXUOKe+8ppcSxhjYKh4i4uQr/QtfipYlBjzKJGnrafoF/NugXNCOHSTGT11TvK7mCiBuUMVgvZGAlOJImk6eTTfUjRrMfaXM/SWl8bdJ4ZpzdjEyVh89r7I5JrGk==x2UD
-md5.c 
iQCVAwUAP9h7LzEAnp832S/7AQJUGQP/c0cbf6WZXCzmjufHxiE9FAQBzTsA0WtaNqdFcHl7fhmikGtknlaED8n5a7eYd/C481UQW6Wgq/oZdsvgoPWPhG3fOCy2CFP9cZVXITuMSf0ucyZTFUJNO15fnZ+nDfsUv+JPdv1aSeRinAUtfAcSKfkSyR9BCPZvkx+tgU6cphU==Zv+h
-rijndael.c 
iQCVAwUAP9h9cTEAnp832S/7AQKF1AP+P2L/tPqDJRDg+/fwbOk8Ts0MNxnvvYEm3gE73TKuLt1S+B2+jkrZcKNvM5VGPnVMJbnS0lmIK04nmedHCOftGTOwhGulZAHHIaKGystT3Jql4iPws/JMgAjE7Fyxh5WZMtB9yEljKBpJ5XNqhrMvvxcHpnyP3+YzIXNwzk34V+c==dJ5k
-rmd160.c 
iQCVAwUAP9h+bTEAnp832S/7AQK1OgP+PNKF6Nzi6X93easVlksdLqKEsArCAw2QjGWDGyxTnbiJM55qAl9JxR1mn3V+oOL7izLLwTt6EYK9evhzfcxY5N5Mni85RAcsLPsuAfQDEzjI6GUWHtQUKPbM+BaorzfhQjYFSZyvum/dZYJ/WfiwwwhqqIKyVU2ZFSqA38YGC/c==9jdA
-rsa.c 
iQCVAwUAP9iHIzEAnp832S/7AQKAYwQAuWtnMte54QHN+Hij9t4sGuypXogajOb1vQQwGgS0fKsaBZsuSP2amze4o5diIvsQTsFQ4CzjvqoCVuBDoHM3xkSD8wGDizgvtCamAxkdbF7wmzldKFn8SpJqlVwWQMP6kk1IjXHEuYb4IDWGTbVMhfEu+eOlU8+PSK4IhZqNvt4==/3hp
-serpent.c 
iQCVAwUAP9h/VzEAnp832S/7AQLyCwP/d1zbmb7l/PriZNa9/Z7mo01XFe5MnAqCfIwhl9GjeaMszcoS37jECNq5nLvrTTFIIJpm3rvBePwiCG4Wwx1I18HCxaP198pcSaR+BLOJ3Aj52EZPrxtqlDKuFr38ZOP5giyUqUYVYGVdrz4kRMNWAZQK53GeJnGhXCnhxojLEgA==ck46
-sha1.c 
iQCVAwUAP9iATTEAnp832S/7AQKcSwQAwAs/HnNqho3lU1ZUgCPNt5P2/Brm6W21+wWWGKJkSrra/c4NYVKJGDDwlsFE0b9ln1uZt7bHReFkKXK3JnrKTmNVcx/Cy64iCMRNMhaM72Mqy7wWx5yHBAmMBxzFGnNQKbmeY52zeGih5HsNLSibc2pPuOViWo2JPJ5Ci/wIwl8==/wtO
-sha256.c 
iQCVAwUAP9iAtzEAnp832S/7AQJD2QP/UqvL0hhjG1wEFbGrdkV9tba1sMDXdnnK6X7HdLuRpVAgNiQiFf8JDmntd/dZ2Q71p4Uae2ctqve4WoEijPUZPjACnpuZfx0SEQL0lQBkwxzJp7lz9ujVtwQ2cM/aYexJkXcWgGcloJNLM3JbWPGIJnuYbr/IwJ6RQF9vgj0357o==UWO1
-sha512.c 
iQCVAwUAP9iBTDEAnp832S/7AQIPBAQA28CJSUQLiW0s2x9u8/OH2eKnxPjA4sZmb50WP7920Lem66P31C3BrOqwfBot4RLhjL+zh/+Uc4s3HPwApZuj9E4BxNMlqLv+Tqk++DAbdaOeYT4jeUt+mlhQQ6mH/RDsy32rZsNsGQ2bUGxazZmfG++PL3JyhawqCy00SUDr/o0==H+0X
-tiger.c 
iQCVAwUAP9iCfjEAnp832S/7AQKufwP/fryv3MqSOYY+90325DH7X3/CtekxeooN0scGsHX0fxBakWSMecTNrj33KPddLS46gU/S89zIc2N/Bw/7EVIAXVFA3/3Ip+OrFOuIMO4Py1sCdB8o2Y+5ygv8iXLcsXIq1O0av79i9g774V3uaXa2qN9ZnXe0AEhcy8FHJ2i/wro==5XVB
-twofish.c 
iQCVAwUAP9iD6TEAnp832S/7AQKUnQP/Rq8FaYeHTG7HbZuqAs9pbPitzjDbkdZddmInWR7NmevBkKvhsJALjVooc0KGQfo2lAAmy3Xi/4QQN8VPn51DVjDIgf7x+DQh/9TFJHMccxI9asUgi4+TNnmMqLU1k3N8S2PjyZ1sjeC8B79fKPpwCzj72WkqPkzZw3l2jArr+dU==NdJT
-rfc2268.c 
iQCVAwUAQCN+3jEAnp832S/7AQLv1gQA1hJh29hAjKi4uLSGxXvJ6cyYmPdmevdKrbLnuHZWtHe4xvCgy/nTdEojEpxgLp/hL/ogasuWRC1W16Wiz9ryxf7YR0uhZWayO/bQNagpfU5MIkJTLuKqqgpwYumCSQfOugXVAqcgEzj+13eeyJaFVrzwrNa67sh84nmbjOjNjvE==0zBq
-
-# Random number related
-random.c 
iQCVAwUAP7nsITEAnp832S/7AQK4SAQAtvfUgrtGOQ2PlxGMla0qJLPHjJacMwgq0ecusiI79elPdDsFfCCk6dK1Ug2kFbNm22nCGHNcUquqbX7noi7ZVQnmPBQXzyLNZd7GmrawRZfdlRerTUDBpSnR8V8ui/5+YYp627E7kKGC0hPSgqXFql6oBMIfno0LZwFJTjIevRY==L419
-random.h 
iQCVAwUAP7ovKDEAnp832S/7AQJ3bQQAjnPebnyTC7sphAv2I7uIz+yPgw1ZfbVhLv+OiWDlO9ish+fRyyMpy+HELBOgZjJdgRegqhlZC6qyns5arM/VglYi+PzvdLO3hIqHE/YFfpIFPz8wBrcmlqrYyd3CsGqcYsfjocXNttCBLeSWmoJ09ltKQH8yzJf3oAgN6X1yuc4==eNoU
-rand-internal.h 
iQCVAwUAP7ouvDEAnp832S/7AQLYnAQAhdI7ERoJVCkV8GiV7MjaUxv1WIL7iZ+jIOvVhv4fNyhCGCGoEtTjkyput/lj7Nsh3FXEqRhypGGrCLf47x/gua5n+BwffogxVyUDqiOyyGhNTPpe3fQcNBvbPCtco8yMK4GJO5G3BqzlPyN+BMeogLymyV6Sm1mvh5LZDyAFbfQ==tZSE
-rndlinux.c 
iQCVAwUAP9iPYTEAnp832S/7AQL6/AP/ZDrbOkVuB9qJ7sKeX1MImZEsz3mi0xPovJzaBtBU7a0idcUKrWYOvQFWRlLUeq0iCT6+h2l5bniP7q7hepzlKa+VPY9VWaQthqeJm2l5LN6QQ5PyMfBq04QuBncw9BJnCGmEyTLt3RxIXBAPdxmiVxtcRIFUqCBtQvoUXGLvemw==t37k
-rndegd.c 
iQCVAwUAP9iPRDEAnp832S/7AQImBQP/WHKg+hKXcm1pQvilzML0jZpwK5PAMM4uBnnPJNIXWOYBO6I/Xg9d/tPLg8NlmmtyQCo2Eu0ybDSt+8mu+dWveAys+0LTi0MIqeP9BMzCKz8dnWH6+S8huLXwTF3m0IrqM0JLb6b71GK9SOq6sWQ22yW5vf61hXP8kH9dhIaoMZs==FaHV
-rndunix.c 
iQCVAwUAP9iQlzEAnp832S/7AQL/KgQA29GnvcD4Xb5qjDMBgW9THEE4+4lfex/6k+Fh0IT61OLJsWVLJ7bJpRntburw4uQm4Tf7CO8vaiDFDYhKKrzXeOF1fmdpcL8hA+fNp9I/MUOc4e9kN9+YJ9wikVa0SZj1OBfhzgcFLd1xOtulkr3ii52HLF9vhrxzkgVwvD10Bi8==2cML
-rndw32.c 
iQCVAwUAP9iRKDEAnp832S/7AQIuaAQA3AJr3WqnxNDsWCIdvehf8Suotthj+laX8nJsvDfFhXPKcXDpsg0wTTXSnnKgyED53+uYiMDnVRsxeWAyhKwvx1MjjlaSMMjzbH6isWTH8FaWpLgrxEkXoPeNqYf5FXpdUkcUxGX2RkQeuX/cIfiHLNE9CV0usaF2jysjBX2iERY==EEnO
-
-# Helper
-bithelp.h 
iQCVAwUAP7ouPTEAnp832S/7AQKXggQAqjcgvihIF3WclOgw1JV2rbARw4ISIDRMFqdaNCqBRx6BwEz3UGsEIlz6+iR1sS/reqN61WvtjLb+D0+tujAkGrgQJhFLG85WtG2tB5UVoI3am1fpkwiRm+bR4rv0rGk0BYk81bC7+l4KrK9o5lVp4lCsrorlUKsd48lNmBHyAXM==mDDN
-rmd.h 
iQCVAwUAP7oumjEAnp832S/7AQJiJQP/V4bJwjZaYndJzV+KRnIDbl1koHuw+ZK5heMYVu8Qk4ylqv//BGyeRa3jZCcfPHI35q6HilCs2VBm8hiBMjHSqY/VPn2ZQ0yg/lt6qEvl7YjsLmyMICvjG+ncszHoq9pRvnF3vTnM18sPIioXLk8fskuM0XOCNBs0ARBAQjY9UGI==olUN
-
-# Configuration
-Makefile.am 
iQCVAwUAQCN33TEAnp832S/7AQKFJAQAz7BDkC814q+QiuE/jnutJHR5qlgbrm3ikGbQwdRzYUscst4bCCWy3uKL/sIPGLg+JQXtF5FnsQy3s4D9BOYhp72cA9ktYK65hhi4pNm/JQ0lXkZMNfk8Go5lNzKezlWwHvkMwRXR0Fep0wPdyeaKW5BfaW2ABvgep6Bp+hHEbyg==zSyi
-$names$ 
iQCVAwUAQCN3EDEAnp832S/7AQJXLAP8DvHTpm5DkTF35EmzeKpi9ie59AZcZanD19ir/e/7+PaQxr2riuLHDGwFKTju+dcvvBsqrygXOC378GXVWzIF2OZwS4EdDcJ+pgojo9UpsqpKsJHouY4Ugx5cQialxba462kUn8hcihSBnMyc4LzbJ5WQ4puQuqy544d2x94+2ms==G4Ls
diff --git a/grub-core/lib/libgcrypt/cipher/ac.c 
b/grub-core/lib/libgcrypt/cipher/ac.c
deleted file mode 100644
index 63f6fcd11..000000000
--- a/grub-core/lib/libgcrypt/cipher/ac.c
+++ /dev/null
@@ -1,3301 +0,0 @@
-/* ac.c - Alternative interface for asymmetric cryptography.
-   Copyright (C) 2003, 2004, 2005, 2006
-                 2007, 2008  Free Software Foundation, Inc.
-
-   This file is part of Libgcrypt.
-
-   Libgcrypt is free software; you can redistribute it and/or modify
-   it under the terms of the GNU Lesser general Public License as
-   published by the Free Software Foundation; either version 2.1 of
-   the License, or (at your option) any later version.
-
-   Libgcrypt is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <config.h>
-#include <errno.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-#include <stddef.h>
-
-#include "g10lib.h"
-#include "cipher.h"
-#include "mpi.h"
-
-
-
-/* At the moment the ac interface is a wrapper around the pk
-   interface, but this might change somewhen in the future, depending
-   on how many people prefer the ac interface.  */
-
-/* Mapping of flag numbers to the according strings as it is expected
-   for S-expressions.  */
-static struct number_string
-{
-  int number;
-  const char *string;
-} ac_flags[] =
-  {
-    { GCRY_AC_FLAG_NO_BLINDING, "no-blinding" },
-  };
-
-/* The positions in this list correspond to the values contained in
-   the gcry_ac_key_type_t enumeration list.  */
-static const char *ac_key_identifiers[] =
-  {
-    "private-key",
-    "public-key"
-  };
-
-/* These specifications are needed for key-pair generation; the caller
-   is allowed to pass additional, algorithm-specific `specs' to
-   gcry_ac_key_pair_generate.  This list is used for decoding the
-   provided values according to the selected algorithm.  */
-struct gcry_ac_key_generate_spec
-{
-  int algorithm;               /* Algorithm for which this flag is
-                                  relevant.  */
-  const char *name;            /* Name of this flag.  */
-  size_t offset;               /* Offset in the cipher-specific spec
-                                  structure at which the MPI value
-                                  associated with this flag is to be
-                                  found.  */
-} ac_key_generate_specs[] =
-  {
-    { GCRY_AC_RSA, "rsa-use-e", offsetof (gcry_ac_key_spec_rsa_t, e) },
-    { 0 }
-  };
-
-/* Handle structure.  */
-struct gcry_ac_handle
-{
-  int algorithm;               /* Algorithm ID associated with this
-                                  handle.  */
-  const char *algorithm_name;  /* Name of the algorithm.  */
-  unsigned int flags;          /* Flags, not used yet.  */
-  gcry_module_t module;                /* Reference to the algorithm
-                                  module.  */
-};
-
-/* A named MPI value.  */
-typedef struct gcry_ac_mpi
-{
-  char *name;                  /* Self-maintained copy of name.  */
-  gcry_mpi_t mpi;              /* MPI value.         */
-  unsigned int flags;          /* Flags.             */
-} gcry_ac_mpi_t;
-
-/* A data set, that is simply a list of named MPI values.  */
-struct gcry_ac_data
-{
-  gcry_ac_mpi_t *data;         /* List of named values.      */
-  unsigned int data_n;         /* Number of values in DATA.  */
-};
-
-/* A single key.  */
-struct gcry_ac_key
-{
-  gcry_ac_data_t data;         /* Data in native ac structure.  */
-  gcry_ac_key_type_t type;     /* Type of the key.              */
-};
-
-/* A key pair.  */
-struct gcry_ac_key_pair
-{
-  gcry_ac_key_t public;
-  gcry_ac_key_t secret;
-};
-
-
-
-/*
- * Functions for working with data sets.
- */
-
-/* Creates a new, empty data set and store it in DATA.  */
-gcry_error_t
-_gcry_ac_data_new (gcry_ac_data_t *data)
-{
-  gcry_ac_data_t data_new;
-  gcry_error_t err;
-
-  if (fips_mode ())
-    return gpg_error (GPG_ERR_NOT_SUPPORTED);
-
-  data_new = gcry_malloc (sizeof (*data_new));
-  if (! data_new)
-    {
-      err = gcry_error_from_errno (errno);
-      goto out;
-    }
-
-  data_new->data = NULL;
-  data_new->data_n = 0;
-  *data = data_new;
-  err = 0;
-
- out:
-
-  return err;
-}
-
-/* Destroys all the entries in DATA, but not DATA itself.  */
-static void
-ac_data_values_destroy (gcry_ac_data_t data)
-{
-  unsigned int i;
-
-  for (i = 0; i < data->data_n; i++)
-    if (data->data[i].flags & GCRY_AC_FLAG_DEALLOC)
-      {
-       gcry_mpi_release (data->data[i].mpi);
-       gcry_free (data->data[i].name);
-      }
-}
-
-/* Destroys the data set DATA.  */
-void
-_gcry_ac_data_destroy (gcry_ac_data_t data)
-{
-  if (data)
-    {
-      ac_data_values_destroy (data);
-      gcry_free (data->data);
-      gcry_free (data);
-    }
-}
-
-/* This function creates a copy of the array of named MPIs DATA_MPIS,
-   which is of length DATA_MPIS_N; the copy is stored in
-   DATA_MPIS_CP.  */
-static gcry_error_t
-ac_data_mpi_copy (gcry_ac_mpi_t *data_mpis, unsigned int data_mpis_n,
-                 gcry_ac_mpi_t **data_mpis_cp)
-{
-  gcry_ac_mpi_t *data_mpis_new;
-  gcry_error_t err;
-  unsigned int i;
-  gcry_mpi_t mpi;
-  char *label;
-
-  data_mpis_new = gcry_calloc (data_mpis_n, sizeof (*data_mpis_new));
-  if (! data_mpis_new)
-    {
-      err = gcry_error_from_errno (errno);
-      goto out;
-    }
-  memset (data_mpis_new, 0, sizeof (*data_mpis_new) * data_mpis_n);
-
-  err = 0;
-  for (i = 0; i < data_mpis_n; i++)
-    {
-      /* Copy values.  */
-
-      label = gcry_strdup (data_mpis[i].name);
-      mpi = gcry_mpi_copy (data_mpis[i].mpi);
-      if (! (label && mpi))
-       {
-         err = gcry_error_from_errno (errno);
-         gcry_mpi_release (mpi);
-         gcry_free (label);
-         break;
-       }
-
-      data_mpis_new[i].flags = GCRY_AC_FLAG_DEALLOC;
-      data_mpis_new[i].name = label;
-      data_mpis_new[i].mpi = mpi;
-    }
-  if (err)
-    goto out;
-
-  *data_mpis_cp = data_mpis_new;
-  err = 0;
-
- out:
-
-  if (err)
-    if (data_mpis_new)
-      {
-       for (i = 0; i < data_mpis_n; i++)
-         {
-           gcry_mpi_release (data_mpis_new[i].mpi);
-           gcry_free (data_mpis_new[i].name);
-         }
-       gcry_free (data_mpis_new);
-      }
-
-  return err;
-}
-
-/* Create a copy of the data set DATA and store it in DATA_CP.  */
-gcry_error_t
-_gcry_ac_data_copy (gcry_ac_data_t *data_cp, gcry_ac_data_t data)
-{
-  gcry_ac_mpi_t *data_mpis = NULL;
-  gcry_ac_data_t data_new;
-  gcry_error_t err;
-
-  if (fips_mode ())
-    return gpg_error (GPG_ERR_NOT_SUPPORTED);
-
-  /* Allocate data set.  */
-  data_new = gcry_malloc (sizeof (*data_new));
-  if (! data_new)
-    {
-      err = gcry_error_from_errno (errno);
-      goto out;
-    }
-
-  err = ac_data_mpi_copy (data->data, data->data_n, &data_mpis);
-  if (err)
-    goto out;
-
-  data_new->data_n = data->data_n;
-  data_new->data = data_mpis;
-  *data_cp = data_new;
-
- out:
-
-  if (err)
-    gcry_free (data_new);
-
-  return err;
-}
-
-/* Returns the number of named MPI values inside of the data set
-   DATA.  */
-unsigned int
-_gcry_ac_data_length (gcry_ac_data_t data)
-{
-  return data->data_n;
-}
-
-
-/* Add the value MPI to DATA with the label NAME.  If FLAGS contains
-   GCRY_AC_FLAG_COPY, the data set will contain copies of NAME
-   and MPI.  If FLAGS contains GCRY_AC_FLAG_DEALLOC or
-   GCRY_AC_FLAG_COPY, the values contained in the data set will
-   be deallocated when they are to be removed from the data set.  */
-gcry_error_t
-_gcry_ac_data_set (gcry_ac_data_t data, unsigned int flags,
-                  const char *name, gcry_mpi_t mpi)
-{
-  gcry_error_t err;
-  gcry_mpi_t mpi_cp;
-  char *name_cp;
-  unsigned int i;
-
-  name_cp = NULL;
-  mpi_cp = NULL;
-
-  if (fips_mode ())
-    return gpg_error (GPG_ERR_NOT_SUPPORTED);
-
-  if (flags & ~(GCRY_AC_FLAG_DEALLOC | GCRY_AC_FLAG_COPY))
-    {
-      err = gcry_error (GPG_ERR_INV_ARG);
-      goto out;
-    }
-
-  if (flags & GCRY_AC_FLAG_COPY)
-    {
-      /* Create copies.  */
-
-      flags |= GCRY_AC_FLAG_DEALLOC;
-      name_cp = gcry_strdup (name);
-      mpi_cp = gcry_mpi_copy (mpi);
-      if (! (name_cp && mpi_cp))
-       {
-         err = gcry_error_from_errno (errno);
-         goto out;
-       }
-    }
-
-  /* Search for existing entry.  */
-  for (i = 0; i < data->data_n; i++)
-    if (! strcmp (name, data->data[i].name))
-      break;
-  if (i < data->data_n)
-    {
-      /* An entry for NAME does already exist.  */
-      if (data->data[i].flags & GCRY_AC_FLAG_DEALLOC)
-       {
-         gcry_mpi_release (data->data[i].mpi);
-         gcry_free (data->data[i].name);
-       }
-    }
-  else
-    {
-      /* Create a new entry.  */
-
-      gcry_ac_mpi_t *ac_mpis;
-
-      ac_mpis = gcry_realloc (data->data,
-                             sizeof (*data->data) * (data->data_n + 1));
-      if (! ac_mpis)
-       {
-         err = gcry_error_from_errno (errno);
-         goto out;
-       }
-
-      if (data->data != ac_mpis)
-       data->data = ac_mpis;
-      data->data_n++;
-    }
-
-  data->data[i].name = name_cp ? name_cp : ((char *) name);
-  data->data[i].mpi = mpi_cp ? mpi_cp : mpi;
-  data->data[i].flags = flags;
-  err = 0;
-
- out:
-
-  if (err)
-    {
-      gcry_mpi_release (mpi_cp);
-      gcry_free (name_cp);
-    }
-
-  return err;
-}
-
-/* Stores the value labelled with NAME found in the data set DATA in
-   MPI.  The returned MPI value will be released in case
-   gcry_ac_data_set is used to associate the label NAME with a
-   different MPI value.  */
-gcry_error_t
-_gcry_ac_data_get_name (gcry_ac_data_t data, unsigned int flags,
-                       const char *name, gcry_mpi_t *mpi)
-{
-  gcry_mpi_t mpi_return;
-  gcry_error_t err;
-  unsigned int i;
-
-  if (fips_mode ())
-    return gpg_error (GPG_ERR_NOT_SUPPORTED);
-
-  if (flags & ~(GCRY_AC_FLAG_COPY))
-    {
-      err = gcry_error (GPG_ERR_INV_ARG);
-      goto out;
-    }
-
-  for (i = 0; i < data->data_n; i++)
-    if (! strcmp (name, data->data[i].name))
-      break;
-  if (i == data->data_n)
-    {
-      err = gcry_error (GPG_ERR_NOT_FOUND);
-      goto out;
-    }
-
-  if (flags & GCRY_AC_FLAG_COPY)
-    {
-      mpi_return = gcry_mpi_copy (data->data[i].mpi);
-      if (! mpi_return)
-       {
-         err = gcry_error_from_errno (errno); /* FIXME? */
-         goto out;
-       }
-    }
-  else
-    mpi_return = data->data[i].mpi;
-
-  *mpi = mpi_return;
-  err = 0;
-
- out:
-
-  return err;
-}
-
-/* Stores in NAME and MPI the named MPI value contained in the data
-   set DATA with the index IDX.  NAME or MPI may be NULL.  The
-   returned MPI value will be released in case gcry_ac_data_set is
-   used to associate the label NAME with a different MPI value.  */
-gcry_error_t
-_gcry_ac_data_get_index (gcry_ac_data_t data, unsigned int flags,
-                        unsigned int idx,
-                        const char **name, gcry_mpi_t *mpi)
-{
-  gcry_error_t err;
-  gcry_mpi_t mpi_cp;
-  char *name_cp;
-
-  name_cp = NULL;
-  mpi_cp = NULL;
-
-  if (fips_mode ())
-    return gpg_error (GPG_ERR_NOT_SUPPORTED);
-
-  if (flags & ~(GCRY_AC_FLAG_COPY))
-    {
-      err = gcry_error (GPG_ERR_INV_ARG);
-      goto out;
-    }
-
-  if (idx >= data->data_n)
-    {
-      err = gcry_error (GPG_ERR_INV_ARG);
-      goto out;
-    }
-
-  if (flags & GCRY_AC_FLAG_COPY)
-    {
-      /* Return copies to the user.  */
-      if (name)
-       {
-         name_cp = gcry_strdup (data->data[idx].name);
-         if (! name_cp)
-           {
-             err = gcry_error_from_errno (errno);
-             goto out;
-           }
-       }
-      if (mpi)
-       {
-         mpi_cp = gcry_mpi_copy (data->data[idx].mpi);
-         if (! mpi_cp)
-           {
-             err = gcry_error_from_errno (errno);
-             goto out;
-           }
-       }
-    }
-
-  if (name)
-    *name = name_cp ? name_cp : data->data[idx].name;
-  if (mpi)
-    *mpi = mpi_cp ? mpi_cp : data->data[idx].mpi;
-  err = 0;
-
- out:
-
-  if (err)
-    {
-      gcry_mpi_release (mpi_cp);
-      gcry_free (name_cp);
-    }
-
-  return err;
-}
-
-/* Convert the data set DATA into a new S-Expression, which is to be
-   stored in SEXP, according to the identifiers contained in
-   IDENTIFIERS.  */
-gcry_error_t
-_gcry_ac_data_to_sexp (gcry_ac_data_t data, gcry_sexp_t *sexp,
-                      const char **identifiers)
-{
-  gcry_sexp_t sexp_new;
-  gcry_error_t err;
-  char *sexp_buffer;
-  size_t sexp_buffer_n;
-  size_t identifiers_n;
-  const char *label;
-  gcry_mpi_t mpi;
-  void **arg_list;
-  size_t data_n;
-  unsigned int i;
-
-  sexp_buffer_n = 1;
-  sexp_buffer = NULL;
-  arg_list = NULL;
-  err = 0;
-
-  if (fips_mode ())
-    return gpg_error (GPG_ERR_NOT_SUPPORTED);
-
-  /* Calculate size of S-expression representation.  */
-
-  i = 0;
-  if (identifiers)
-    while (identifiers[i])
-      {
-       /* For each identifier, we add "(<IDENTIFIER>)".  */
-       sexp_buffer_n += 1 + strlen (identifiers[i]) + 1;
-       i++;
-      }
-  identifiers_n = i;
-
-  if (! identifiers_n)
-    /* If there are NO identifiers, we still add surrounding braces so
-       that we have a list of named MPI value lists.  Otherwise it
-       wouldn't be too much fun to process these lists.  */
-    sexp_buffer_n += 2;
-
-  data_n = _gcry_ac_data_length (data);
-  for (i = 0; i < data_n; i++)
-    {
-      err = gcry_ac_data_get_index (data, 0, i, &label, NULL);
-      if (err)
-       break;
-      /* For each MPI we add "(<LABEL> %m)".  */
-      sexp_buffer_n += 1 + strlen (label) + 4;
-    }
-  if (err)
-    goto out;
-
-  /* Allocate buffer.  */
-
-  sexp_buffer = gcry_malloc (sexp_buffer_n);
-  if (! sexp_buffer)
-    {
-      err = gcry_error_from_errno (errno);
-      goto out;
-    }
-
-  /* Fill buffer.  */
-
-  *sexp_buffer = 0;
-  sexp_buffer_n = 0;
-
-  /* Add identifiers: (<IDENTIFIER0>(<IDENTIFIER1>...)).  */
-  if (identifiers_n)
-    {
-      /* Add nested identifier lists as usual.  */
-      for (i = 0; i < identifiers_n; i++)
-       sexp_buffer_n += sprintf (sexp_buffer + sexp_buffer_n, "(%s",
-                                 identifiers[i]);
-    }
-  else
-    {
-      /* Add special list.  */
-      sexp_buffer_n += sprintf (sexp_buffer + sexp_buffer_n, "(");
-    }
-
-  /* Add MPI list.  */
-  arg_list = gcry_calloc (data_n + 1, sizeof (*arg_list));
-  if (! arg_list)
-    {
-      err = gcry_error_from_errno (errno);
-      goto out;
-    }
-  for (i = 0; i < data_n; i++)
-    {
-      err = gcry_ac_data_get_index (data, 0, i, &label, &mpi);
-      if (err)
-       break;
-      sexp_buffer_n += sprintf (sexp_buffer + sexp_buffer_n,
-                               "(%s %%m)", label);
-      arg_list[i] = &data->data[i].mpi;
-    }
-  if (err)
-    goto out;
-
-  if (identifiers_n)
-    {
-      /* Add closing braces for identifier lists as usual.  */
-      for (i = 0; i < identifiers_n; i++)
-       sexp_buffer_n += sprintf (sexp_buffer + sexp_buffer_n, ")");
-    }
-  else
-    {
-      /* Add closing braces for special list.  */
-      sexp_buffer_n += sprintf (sexp_buffer + sexp_buffer_n, ")");
-    }
-
-  /* Construct.  */
-  err = gcry_sexp_build_array (&sexp_new, NULL, sexp_buffer, arg_list);
-  if (err)
-    goto out;
-
-  *sexp = sexp_new;
-
- out:
-
-  gcry_free (sexp_buffer);
-  gcry_free (arg_list);
-
-  return err;
-}
-
-/* Create a new data set, which is to be stored in DATA_SET, from the
-   S-Expression SEXP, according to the identifiers contained in
-   IDENTIFIERS.  */
-gcry_error_t
-_gcry_ac_data_from_sexp (gcry_ac_data_t *data_set, gcry_sexp_t sexp,
-                        const char **identifiers)
-{
-  gcry_ac_data_t data_set_new;
-  gcry_error_t err;
-  gcry_sexp_t sexp_cur;
-  gcry_sexp_t sexp_tmp;
-  gcry_mpi_t mpi;
-  char *string;
-  const char *data;
-  size_t data_n;
-  size_t sexp_n;
-  unsigned int i;
-  int skip_name;
-
-  data_set_new = NULL;
-  sexp_cur = sexp;
-  sexp_tmp = NULL;
-  string = NULL;
-  mpi = NULL;
-  err = 0;
-
-  if (fips_mode ())
-    return gpg_error (GPG_ERR_NOT_SUPPORTED);
-
-  /* Process S-expression/identifiers.  */
-
-  if (identifiers)
-    {
-      for (i = 0; identifiers[i]; i++)
-       {
-         /* Next identifier.  Extract first data item from
-            SEXP_CUR.  */
-         data = gcry_sexp_nth_data (sexp_cur, 0, &data_n);
-
-         if (! ((data_n == strlen (identifiers[i]))
-                && (! strncmp (data, identifiers[i], data_n))))
-           {
-             /* Identifier mismatch -> error.  */
-             err = gcry_error (GPG_ERR_INV_SEXP);
-             break;
-           }
-
-         /* Identifier matches.  Now we have to distinguish two
-            cases:
-
-            (i)  we are at the last identifier:
-            leave loop
-
-            (ii) we are not at the last identifier:
-            extract next element, which is supposed to be a
-            sublist.  */
-
-         if (! identifiers[i + 1])
-           /* Last identifier.  */
-           break;
-         else
-           {
-             /* Not the last identifier, extract next sublist.  */
-
-             sexp_tmp = gcry_sexp_nth (sexp_cur, 1);
-             if (! sexp_tmp)
-               {
-                 /* Missing sublist.  */
-                 err = gcry_error (GPG_ERR_INV_SEXP);
-                 break;
-               }
-
-             /* Release old SEXP_CUR, in case it is not equal to the
-                original SEXP.  */
-             if (sexp_cur != sexp)
-               gcry_sexp_release (sexp_cur);
-
-             /* Make SEXP_CUR point to the new current sublist.  */
-             sexp_cur = sexp_tmp;
-              sexp_tmp = NULL;
-           }
-       }
-      if (err)
-       goto out;
-
-      if (i)
-        {
-          /* We have at least one identifier in the list, this means
-             the the list of named MPI values is prefixed, this means
-             that we need to skip the first item (the list name), when
-             processing the MPI values.  */
-          skip_name = 1;
-        }
-      else
-        {
-          /* Since there is no identifiers list, the list of named MPI
-             values is not prefixed with a list name, therefore the
-             offset to use is zero.  */
-          skip_name = 0;
-        }
-    }
-  else
-    /* Since there is no identifiers list, the list of named MPI
-       values is not prefixed with a list name, therefore the offset
-       to use is zero.  */
-    skip_name = 0;
-
-  /* Create data set from S-expression data.  */
-
-  err = gcry_ac_data_new (&data_set_new);
-  if (err)
-    goto out;
-
-  /* Figure out amount of named MPIs in SEXP_CUR.  */
-  if (sexp_cur)
-    sexp_n = gcry_sexp_length (sexp_cur) - skip_name;
-  else
-    sexp_n = 0;
-
-  /* Extracte the named MPIs sequentially.  */
-  for (i = 0; i < sexp_n; i++)
-    {
-      /* Store next S-Expression pair, which is supposed to consist of
-        a name and an MPI value, in SEXP_TMP.  */
-
-      sexp_tmp = gcry_sexp_nth (sexp_cur, i + skip_name);
-      if (! sexp_tmp)
-       {
-         err = gcry_error (GPG_ERR_INV_SEXP);
-         break;
-       }
-
-      /* Extract name from current S-Expression pair.  */
-      data = gcry_sexp_nth_data (sexp_tmp, 0, &data_n);
-      string = gcry_malloc (data_n + 1);
-      if (! string)
-       {
-         err = gcry_error_from_errno (errno);
-         break;
-       }
-      memcpy (string, data, data_n);
-      string[data_n] = 0;
-
-      /* Extract MPI value.  */
-      mpi = gcry_sexp_nth_mpi (sexp_tmp, 1, 0);
-      if (! mpi)
-       {
-         err = gcry_error (GPG_ERR_INV_SEXP); /* FIXME? */
-         break;
-       }
-
-      /* Store named MPI in data_set_new.  */
-      err = gcry_ac_data_set (data_set_new, GCRY_AC_FLAG_DEALLOC, string, mpi);
-      if (err)
-       break;
-
-/*       gcry_free (string); */
-      string = NULL;
-/*       gcry_mpi_release (mpi); */
-      mpi = NULL;
-
-      gcry_sexp_release (sexp_tmp);
-      sexp_tmp = NULL;
-    }
-  if (err)
-    goto out;
-
-  *data_set = data_set_new;
-
- out:
-
-  if (sexp_cur != sexp)
-    gcry_sexp_release (sexp_cur);
-  gcry_sexp_release (sexp_tmp);
-  gcry_mpi_release (mpi);
-  gcry_free (string);
-
-  if (err)
-    gcry_ac_data_destroy (data_set_new);
-
-  return err;
-}
-
-
-static void
-_gcry_ac_data_dump (const char *prefix, gcry_ac_data_t data)
-{
-  unsigned char *mpi_buffer;
-  size_t mpi_buffer_n;
-  unsigned int data_n;
-  gcry_error_t err;
-  const char *name;
-  gcry_mpi_t mpi;
-  unsigned int i;
-
-  if (! data)
-    return;
-
-  if (fips_mode ())
-    return;
-
-  mpi_buffer = NULL;
-
-  data_n = _gcry_ac_data_length (data);
-  for (i = 0; i < data_n; i++)
-    {
-      err = gcry_ac_data_get_index (data, 0, i, &name, &mpi);
-      if (err)
-       {
-         log_error ("failed to dump data set");
-         break;
-       }
-
-      err = gcry_mpi_aprint (GCRYMPI_FMT_HEX, &mpi_buffer, &mpi_buffer_n, mpi);
-      if (err)
-       {
-         log_error ("failed to dump data set");
-         break;
-       }
-
-      log_printf ("%s%s%s: %s\n",
-                 prefix ? prefix : "",
-                 prefix ? ": " : ""
-                 , name, mpi_buffer);
-
-      gcry_free (mpi_buffer);
-      mpi_buffer = NULL;
-    }
-
-  gcry_free (mpi_buffer);
-}
-
-/* Dump the named MPI values contained in the data set DATA to
-   Libgcrypt's logging stream.  */
-void
-gcry_ac_data_dump (const char *prefix, gcry_ac_data_t data)
-{
-  _gcry_ac_data_dump (prefix, data);
-}
-
-/* Destroys any values contained in the data set DATA.  */
-void
-_gcry_ac_data_clear (gcry_ac_data_t data)
-{
-  ac_data_values_destroy (data);
-  gcry_free (data->data);
-  data->data = NULL;
-  data->data_n = 0;
-}
-
-
-
-/*
- * Implementation of `ac io' objects.
- */
-
-/* Initialize AC_IO according to MODE, TYPE and the variable list of
-   arguments AP.  The list of variable arguments to specify depends on
-   the given TYPE.  */
-void
-_gcry_ac_io_init_va (gcry_ac_io_t *ac_io,
-                    gcry_ac_io_mode_t mode, gcry_ac_io_type_t type, va_list ap)
-{
-  memset (ac_io, 0, sizeof (*ac_io));
-
-  if (fips_mode ())
-    return;
-
-  gcry_assert ((mode == GCRY_AC_IO_READABLE) || (mode == GCRY_AC_IO_WRITABLE));
-  gcry_assert ((type == GCRY_AC_IO_STRING) || (type == GCRY_AC_IO_STRING));
-
-  ac_io->mode = mode;
-  ac_io->type = type;
-
-  switch (mode)
-    {
-    case GCRY_AC_IO_READABLE:
-      switch (type)
-       {
-       case GCRY_AC_IO_STRING:
-         ac_io->io.readable.string.data = va_arg (ap, unsigned char *);
-         ac_io->io.readable.string.data_n = va_arg (ap, size_t);
-         break;
-
-       case GCRY_AC_IO_CALLBACK:
-         ac_io->io.readable.callback.cb = va_arg (ap, gcry_ac_data_read_cb_t);
-         ac_io->io.readable.callback.opaque = va_arg (ap, void *);
-         break;
-       }
-      break;
-    case GCRY_AC_IO_WRITABLE:
-      switch (type)
-       {
-       case GCRY_AC_IO_STRING:
-         ac_io->io.writable.string.data = va_arg (ap, unsigned char **);
-         ac_io->io.writable.string.data_n = va_arg (ap, size_t *);
-         break;
-
-       case GCRY_AC_IO_CALLBACK:
-         ac_io->io.writable.callback.cb = va_arg (ap, gcry_ac_data_write_cb_t);
-         ac_io->io.writable.callback.opaque = va_arg (ap, void *);
-         break;
-       }
-      break;
-    }
-}
-
-/* Initialize AC_IO according to MODE, TYPE and the variable list of
-   arguments.  The list of variable arguments to specify depends on
-   the given TYPE. */
-void
-_gcry_ac_io_init (gcry_ac_io_t *ac_io,
-                 gcry_ac_io_mode_t mode, gcry_ac_io_type_t type, ...)
-{
-  va_list ap;
-
-  va_start (ap, type);
-  _gcry_ac_io_init_va (ac_io, mode, type, ap);
-  va_end (ap);
-}
-
-
-/* Write to the IO object AC_IO BUFFER_N bytes from BUFFER.  Return
-   zero on success or error code.  */
-static gcry_error_t
-_gcry_ac_io_write (gcry_ac_io_t *ac_io, unsigned char *buffer, size_t buffer_n)
-{
-  gcry_error_t err;
-
-  gcry_assert (ac_io->mode == GCRY_AC_IO_WRITABLE);
-  err = 0;
-
-  switch (ac_io->type)
-    {
-    case GCRY_AC_IO_STRING:
-      {
-       unsigned char *p;
-
-       if (*ac_io->io.writable.string.data)
-         {
-           p = gcry_realloc (*ac_io->io.writable.string.data,
-                             *ac_io->io.writable.string.data_n + buffer_n);
-           if (! p)
-             err = gcry_error_from_errno (errno);
-           else
-             {
-               if (*ac_io->io.writable.string.data != p)
-                 *ac_io->io.writable.string.data = p;
-               memcpy (p + *ac_io->io.writable.string.data_n, buffer, 
buffer_n);
-               *ac_io->io.writable.string.data_n += buffer_n;
-             }
-         }
-       else
-         {
-           if (gcry_is_secure (buffer))
-             p = gcry_malloc_secure (buffer_n);
-           else
-             p = gcry_malloc (buffer_n);
-           if (! p)
-             err = gcry_error_from_errno (errno);
-           else
-             {
-               memcpy (p, buffer, buffer_n);
-               *ac_io->io.writable.string.data = p;
-               *ac_io->io.writable.string.data_n = buffer_n;
-             }
-         }
-      }
-      break;
-
-    case GCRY_AC_IO_CALLBACK:
-      err = (*ac_io->io.writable.callback.cb) 
(ac_io->io.writable.callback.opaque,
-                                              buffer, buffer_n);
-      break;
-    }
-
-  return err;
-}
-
-/* Read *BUFFER_N bytes from the IO object AC_IO into BUFFER; NREAD
-   bytes have already been read from the object; on success, store the
-   amount of bytes read in *BUFFER_N; zero bytes read means EOF.
-   Return zero on success or error code.  */
-static gcry_error_t
-_gcry_ac_io_read (gcry_ac_io_t *ac_io,
-                 unsigned int nread, unsigned char *buffer, size_t *buffer_n)
-{
-  gcry_error_t err;
-
-  gcry_assert (ac_io->mode == GCRY_AC_IO_READABLE);
-  err = 0;
-
-  switch (ac_io->type)
-    {
-    case GCRY_AC_IO_STRING:
-      {
-       size_t bytes_available;
-       size_t bytes_to_read;
-       size_t bytes_wanted;
-
-       bytes_available = ac_io->io.readable.string.data_n - nread;
-       bytes_wanted = *buffer_n;
-
-       if (bytes_wanted > bytes_available)
-         bytes_to_read = bytes_available;
-       else
-         bytes_to_read = bytes_wanted;
-
-       memcpy (buffer, ac_io->io.readable.string.data + nread, bytes_to_read);
-       *buffer_n = bytes_to_read;
-       err = 0;
-       break;
-      }
-
-    case GCRY_AC_IO_CALLBACK:
-      err = (*ac_io->io.readable.callback.cb)
-       (ac_io->io.readable.callback.opaque, buffer, buffer_n);
-      break;
-    }
-
-  return err;
-}
-
-/* Read all data available from the IO object AC_IO into newly
-   allocated memory, storing an appropriate pointer in *BUFFER and the
-   amount of bytes read in *BUFFER_N.  Return zero on success or error
-   code.  */
-static gcry_error_t
-_gcry_ac_io_read_all (gcry_ac_io_t *ac_io, unsigned char **buffer, size_t 
*buffer_n)
-{
-  unsigned char *buffer_new;
-  size_t buffer_new_n;
-  unsigned char buf[BUFSIZ];
-  size_t buf_n;
-  unsigned char *p;
-  gcry_error_t err;
-
-  buffer_new = NULL;
-  buffer_new_n = 0;
-
-  while (1)
-    {
-      buf_n = sizeof (buf);
-      err = _gcry_ac_io_read (ac_io, buffer_new_n, buf, &buf_n);
-      if (err)
-       break;
-
-      if (buf_n)
-       {
-         p = gcry_realloc (buffer_new, buffer_new_n + buf_n);
-         if (! p)
-           {
-             err = gcry_error_from_errno (errno);
-             break;
-           }
-
-         if (buffer_new != p)
-           buffer_new = p;
-
-         memcpy (buffer_new + buffer_new_n, buf, buf_n);
-         buffer_new_n += buf_n;
-       }
-      else
-       break;
-    }
-  if (err)
-    goto out;
-
-  *buffer_n = buffer_new_n;
-  *buffer = buffer_new;
-
- out:
-
-  if (err)
-    gcry_free (buffer_new);
-
-  return err;
-}
-
-/* Read data chunks from the IO object AC_IO until EOF, feeding them
-   to the callback function CB.  Return zero on success or error
-   code.  */
-static gcry_error_t
-_gcry_ac_io_process (gcry_ac_io_t *ac_io,
-                    gcry_ac_data_write_cb_t cb, void *opaque)
-{
-  unsigned char buffer[BUFSIZ];
-  unsigned int nread;
-  size_t buffer_n;
-  gcry_error_t err;
-
-  nread = 0;
-
-  while (1)
-    {
-      buffer_n = sizeof (buffer);
-      err = _gcry_ac_io_read (ac_io, nread, buffer, &buffer_n);
-      if (err)
-       break;
-      if (buffer_n)
-       {
-         err = (*cb) (opaque, buffer, buffer_n);
-         if (err)
-           break;
-         nread += buffer_n;
-       }
-      else
-       break;
-    }
-
-  return err;
-}
-
-
-
-/*
- * Functions for converting data between the native ac and the
- * S-expression structure used by the pk interface.
- */
-
-/* Extract the S-Expression DATA_SEXP into DATA under the control of
-   TYPE and NAME.  This function assumes that S-Expressions are of the
-   following structure:
-
-   (IDENTIFIER [...]
-   (ALGORITHM <list of named MPI values>)) */
-static gcry_error_t
-ac_data_extract (const char *identifier, const char *algorithm,
-                gcry_sexp_t sexp, gcry_ac_data_t *data)
-{
-  gcry_error_t err;
-  gcry_sexp_t value_sexp;
-  gcry_sexp_t data_sexp;
-  size_t data_sexp_n;
-  gcry_mpi_t value_mpi;
-  char *value_name;
-  const char *data_raw;
-  size_t data_raw_n;
-  gcry_ac_data_t data_new;
-  unsigned int i;
-
-  value_sexp = NULL;
-  data_sexp = NULL;
-  value_name = NULL;
-  value_mpi = NULL;
-  data_new = NULL;
-
-  /* Verify that the S-expression contains the correct identifier.  */
-  data_raw = gcry_sexp_nth_data (sexp, 0, &data_raw_n);
-  if ((! data_raw) || strncmp (identifier, data_raw, data_raw_n))
-    {
-      err = gcry_error (GPG_ERR_INV_SEXP);
-      goto out;
-    }
-
-  /* Extract inner S-expression.  */
-  data_sexp = gcry_sexp_find_token (sexp, algorithm, 0);
-  if (! data_sexp)
-    {
-      err = gcry_error (GPG_ERR_INV_SEXP);
-      goto out;
-    }
-
-  /* Count data elements.  */
-  data_sexp_n = gcry_sexp_length (data_sexp);
-  data_sexp_n--;
-
-  /* Allocate new data set.  */
-  err = _gcry_ac_data_new (&data_new);
-  if (err)
-    goto out;
-
-  /* Iterate through list of data elements and add them to the data
-     set.  */
-  for (i = 0; i < data_sexp_n; i++)
-    {
-      /* Get the S-expression of the named MPI, that contains the name
-        and the MPI value.  */
-      value_sexp = gcry_sexp_nth (data_sexp, i + 1);
-      if (! value_sexp)
-       {
-         err = gcry_error (GPG_ERR_INV_SEXP);
-         break;
-       }
-
-      /* Extract the name.  */
-      data_raw = gcry_sexp_nth_data (value_sexp, 0, &data_raw_n);
-      if (! data_raw)
-       {
-         err = gcry_error (GPG_ERR_INV_SEXP);
-         break;
-       }
-
-      /* Extract the MPI value.  */
-      value_mpi = gcry_sexp_nth_mpi (value_sexp, 1, GCRYMPI_FMT_USG);
-      if (! value_mpi)
-       {
-         err = gcry_error (GPG_ERR_INTERNAL); /* FIXME? */
-         break;
-       }
-
-      /* Duplicate the name.  */
-      value_name = gcry_malloc (data_raw_n + 1);
-      if (! value_name)
-       {
-         err = gcry_error_from_errno (errno);
-         break;
-       }
-      strncpy (value_name, data_raw, data_raw_n);
-      value_name[data_raw_n] = 0;
-
-      err = _gcry_ac_data_set (data_new, GCRY_AC_FLAG_DEALLOC, value_name, 
value_mpi);
-      if (err)
-       break;
-
-      gcry_sexp_release (value_sexp);
-      value_sexp = NULL;
-      value_name = NULL;
-      value_mpi = NULL;
-    }
-  if (err)
-    goto out;
-
-  /* Copy out.  */
-  *data = data_new;
-
- out:
-
-  /* Deallocate resources.  */
-  if (err)
-    {
-      _gcry_ac_data_destroy (data_new);
-      gcry_mpi_release (value_mpi);
-      gcry_free (value_name);
-      gcry_sexp_release (value_sexp);
-    }
-  gcry_sexp_release (data_sexp);
-
-  return err;
-}
-
-/* Construct an S-expression from the DATA and store it in
-   DATA_SEXP. The S-expression will be of the following structure:
-
-   (IDENTIFIER [(flags [...])]
-   (ALGORITHM <list of named MPI values>))  */
-static gcry_error_t
-ac_data_construct (const char *identifier, int include_flags,
-                  unsigned int flags, const char *algorithm,
-                  gcry_ac_data_t data, gcry_sexp_t *sexp)
-{
-  unsigned int data_length;
-  gcry_sexp_t sexp_new;
-  gcry_error_t err;
-  size_t sexp_format_n;
-  char *sexp_format;
-  void **arg_list;
-  unsigned int i;
-
-  arg_list = NULL;
-  sexp_new = NULL;
-  sexp_format = NULL;
-
-  /* We build a list of arguments to pass to
-     gcry_sexp_build_array().  */
-  data_length = _gcry_ac_data_length (data);
-  arg_list = gcry_calloc (data_length, sizeof (*arg_list) * 2);
-  if (! arg_list)
-    {
-      err = gcry_error_from_errno (errno);
-      goto out;
-    }
-
-  /* Fill list with MPIs.  */
-  for (i = 0; i < data_length; i++)
-    {
-      char **nameaddr  = &data->data[i].name;
-
-      arg_list[(i * 2) + 0] = nameaddr;
-      arg_list[(i * 2) + 1] = &data->data[i].mpi;
-    }
-
-  /* Calculate size of format string.  */
-  sexp_format_n = (3
-                  + (include_flags ? 7 : 0)
-                  + (algorithm ? (2 + strlen (algorithm)) : 0)
-                  + strlen (identifier));
-
-  for (i = 0; i < data_length; i++)
-    /* Per-element sizes.  */
-    sexp_format_n += 6;
-
-  if (include_flags)
-    /* Add flags.  */
-    for (i = 0; i < DIM (ac_flags); i++)
-      if (flags & ac_flags[i].number)
-       sexp_format_n += strlen (ac_flags[i].string) + 1;
-
-  /* Done.  */
-  sexp_format = gcry_malloc (sexp_format_n);
-  if (! sexp_format)
-    {
-      err = gcry_error_from_errno (errno);
-      goto out;
-    }
-
-  /* Construct the format string.  */
-
-  *sexp_format = 0;
-  strcat (sexp_format, "(");
-  strcat (sexp_format, identifier);
-  if (include_flags)
-    {
-      strcat (sexp_format, "(flags");
-      for (i = 0; i < DIM (ac_flags); i++)
-       if (flags & ac_flags[i].number)
-         {
-           strcat (sexp_format, " ");
-           strcat (sexp_format, ac_flags[i].string);
-         }
-      strcat (sexp_format, ")");
-    }
-  if (algorithm)
-    {
-      strcat (sexp_format, "(");
-      strcat (sexp_format, algorithm);
-    }
-  for (i = 0; i < data_length; i++)
-    strcat (sexp_format, "(%s%m)");
-  if (algorithm)
-    strcat (sexp_format, ")");
-  strcat (sexp_format, ")");
-
-  /* Create final S-expression.  */
-  err = gcry_sexp_build_array (&sexp_new, NULL, sexp_format, arg_list);
-  if (err)
-    goto out;
-
-  *sexp = sexp_new;
-
- out:
-
-  /* Deallocate resources.  */
-  gcry_free (sexp_format);
-  gcry_free (arg_list);
-  if (err)
-    gcry_sexp_release (sexp_new);
-
-  return err;
-}
-
-
-
-/*
- * Handle management.
- */
-
-/* Creates a new handle for the algorithm ALGORITHM and stores it in
-   HANDLE.  FLAGS is not used yet.  */
-gcry_error_t
-_gcry_ac_open (gcry_ac_handle_t *handle,
-              gcry_ac_id_t algorithm, unsigned int flags)
-{
-  gcry_ac_handle_t handle_new;
-  const char *algorithm_name;
-  gcry_module_t module;
-  gcry_error_t err;
-
-  *handle = NULL;
-  module = NULL;
-
-  if (fips_mode ())
-    return gpg_error (GPG_ERR_NOT_SUPPORTED);
-
-  /* Get name.  */
-  algorithm_name = _gcry_pk_aliased_algo_name (algorithm);
-  if (! algorithm_name)
-    {
-      err = gcry_error (GPG_ERR_PUBKEY_ALGO);
-      goto out;
-    }
-
-  /* Acquire reference to the pubkey module.  */
-  err = _gcry_pk_module_lookup (algorithm, &module);
-  if (err)
-    goto out;
-
-  /* Allocate.  */
-  handle_new = gcry_malloc (sizeof (*handle_new));
-  if (! handle_new)
-    {
-      err = gcry_error_from_errno (errno);
-      goto out;
-    }
-
-  /* Done.  */
-  handle_new->algorithm = algorithm;
-  handle_new->algorithm_name = algorithm_name;
-  handle_new->flags = flags;
-  handle_new->module = module;
-  *handle = handle_new;
-
- out:
-
-  /* Deallocate resources.  */
-  if (err)
-    _gcry_pk_module_release (module);
-
-  return err;
-}
-
-
-/* Destroys the handle HANDLE.  */
-void
-_gcry_ac_close (gcry_ac_handle_t handle)
-{
-  /* Release reference to pubkey module.  */
-  if (handle)
-    {
-      _gcry_pk_module_release (handle->module);
-      gcry_free (handle);
-    }
-}
-
-
-
-/*
- * Key management.
- */
-
-/* Initialize a key from a given data set.  */
-/* FIXME/Damn: the argument HANDLE is not only unnecessary, it is
-   completely WRONG here.  */
-gcry_error_t
-_gcry_ac_key_init (gcry_ac_key_t *key, gcry_ac_handle_t handle,
-                  gcry_ac_key_type_t type, gcry_ac_data_t data)
-{
-  gcry_ac_data_t data_new;
-  gcry_ac_key_t key_new;
-  gcry_error_t err;
-
-  (void)handle;
-
-  if (fips_mode ())
-    return gpg_error (GPG_ERR_NOT_SUPPORTED);
-
-  /* Allocate.  */
-  key_new = gcry_malloc (sizeof (*key_new));
-  if (! key_new)
-    {
-      err = gcry_error_from_errno (errno);
-      goto out;
-    }
-
-  /* Copy data set.  */
-  err = _gcry_ac_data_copy (&data_new, data);
-  if (err)
-    goto out;
-
-  /* Done.  */
-  key_new->data = data_new;
-  key_new->type = type;
-  *key = key_new;
-
- out:
-
-  if (err)
-    /* Deallocate resources.  */
-    gcry_free (key_new);
-
-  return err;
-}
-
-
-/* Generates a new key pair via the handle HANDLE of NBITS bits and
-   stores it in KEY_PAIR.  In case non-standard settings are wanted, a
-   pointer to a structure of type gcry_ac_key_spec_<algorithm>_t,
-   matching the selected algorithm, can be given as KEY_SPEC.
-   MISC_DATA is not used yet.  */
-gcry_error_t
-_gcry_ac_key_pair_generate (gcry_ac_handle_t handle, unsigned int nbits,
-                           void *key_spec,
-                           gcry_ac_key_pair_t *key_pair,
-                           gcry_mpi_t **misc_data)
-{
-  gcry_sexp_t genkey_sexp_request;
-  gcry_sexp_t genkey_sexp_reply;
-  gcry_ac_data_t key_data_secret;
-  gcry_ac_data_t key_data_public;
-  gcry_ac_key_pair_t key_pair_new;
-  gcry_ac_key_t key_secret;
-  gcry_ac_key_t key_public;
-  gcry_sexp_t key_sexp;
-  gcry_error_t err;
-  char *genkey_format;
-  size_t genkey_format_n;
-  void **arg_list;
-  size_t arg_list_n;
-  unsigned int i;
-  unsigned int j;
-
-  (void)misc_data;
-
-  if (fips_mode ())
-    return gpg_error (GPG_ERR_NOT_SUPPORTED);
-
-  key_data_secret = NULL;
-  key_data_public = NULL;
-  key_secret = NULL;
-  key_public = NULL;
-  genkey_format = NULL;
-  arg_list = NULL;
-  genkey_sexp_request = NULL;
-  genkey_sexp_reply = NULL;
-  key_sexp = NULL;
-
-  /* Allocate key pair.  */
-  key_pair_new = gcry_malloc (sizeof (struct gcry_ac_key_pair));
-  if (! key_pair_new)
-    {
-      err = gcry_error_from_errno (errno);
-      goto out;
-    }
-
-  /* Allocate keys.  */
-  key_secret = gcry_malloc (sizeof (*key_secret));
-  if (! key_secret)
-    {
-      err = gcry_error_from_errno (errno);
-      goto out;
-    }
-  key_public = gcry_malloc (sizeof (*key_public));
-  if (! key_public)
-    {
-      err = gcry_error_from_errno (errno);
-      goto out;
-    }
-
-  /* Calculate size of the format string, that is used for creating
-     the request S-expression.  */
-  genkey_format_n = 22;
-
-  /* Respect any relevant algorithm specific commands.  */
-  if (key_spec)
-    for (i = 0; i < DIM (ac_key_generate_specs); i++)
-      if (handle->algorithm == ac_key_generate_specs[i].algorithm)
-       genkey_format_n += 6;
-
-  /* Create format string.  */
-  genkey_format = gcry_malloc (genkey_format_n);
-  if (! genkey_format)
-    {
-      err = gcry_error_from_errno (errno);
-      goto out;
-    }
-
-  /* Fill format string.  */
-  *genkey_format = 0;
-  strcat (genkey_format, "(genkey(%s(nbits%d)");
-  if (key_spec)
-    for (i = 0; i < DIM (ac_key_generate_specs); i++)
-      if (handle->algorithm == ac_key_generate_specs[i].algorithm)
-       strcat (genkey_format, "(%s%m)");
-  strcat (genkey_format, "))");
-
-  /* Build list of argument pointers, the algorithm name and the nbits
-     are always needed.  */
-  arg_list_n = 2;
-
-  /* Now the algorithm specific arguments.  */
-  if (key_spec)
-    for (i = 0; i < DIM (ac_key_generate_specs); i++)
-      if (handle->algorithm == ac_key_generate_specs[i].algorithm)
-       arg_list_n += 2;
-
-  /* Allocate list.  */
-  arg_list = gcry_calloc (arg_list_n, sizeof (*arg_list));
-  if (! arg_list)
-    {
-      err = gcry_error_from_errno (errno);
-      goto out;
-    }
-
-  arg_list[0] = (void *) &handle->algorithm_name;
-  arg_list[1] = (void *) &nbits;
-  if (key_spec)
-    for (j = 2, i = 0; i < DIM (ac_key_generate_specs); i++)
-      if (handle->algorithm == ac_key_generate_specs[i].algorithm)
-       {
-         /* Add name of this specification flag and the
-            according member of the spec strucuture.  */
-         arg_list[j++] = (void *)(&ac_key_generate_specs[i].name);
-         arg_list[j++] = (void *)
-           (((char *) key_spec)
-            + ac_key_generate_specs[i].offset);
-         /* FIXME: above seems to suck.  */
-       }
-
-  /* Construct final request S-expression.  */
-  err = gcry_sexp_build_array (&genkey_sexp_request,
-                              NULL, genkey_format, arg_list);
-  if (err)
-    goto out;
-
-  /* Perform genkey operation.  */
-  err = gcry_pk_genkey (&genkey_sexp_reply, genkey_sexp_request);
-  if (err)
-    goto out;
-
-  key_sexp = gcry_sexp_find_token (genkey_sexp_reply, "private-key", 0);
-  if (! key_sexp)
-    {
-      err = gcry_error (GPG_ERR_INTERNAL);
-      goto out;
-    }
-  err = ac_data_extract ("private-key", handle->algorithm_name,
-                        key_sexp, &key_data_secret);
-  if (err)
-    goto out;
-
-  gcry_sexp_release (key_sexp);
-  key_sexp = gcry_sexp_find_token (genkey_sexp_reply, "public-key", 0);
-  if (! key_sexp)
-    {
-      err = gcry_error (GPG_ERR_INTERNAL);
-      goto out;
-    }
-  err = ac_data_extract ("public-key", handle->algorithm_name,
-                        key_sexp, &key_data_public);
-  if (err)
-    goto out;
-
-  /* Done.  */
-
-  key_secret->type = GCRY_AC_KEY_SECRET;
-  key_secret->data = key_data_secret;
-  key_public->type = GCRY_AC_KEY_PUBLIC;
-  key_public->data = key_data_public;
-  key_pair_new->secret = key_secret;
-  key_pair_new->public = key_public;
-  *key_pair = key_pair_new;
-
- out:
-
-  /* Deallocate resources.  */
-
-  gcry_free (genkey_format);
-  gcry_free (arg_list);
-  gcry_sexp_release (genkey_sexp_request);
-  gcry_sexp_release (genkey_sexp_reply);
-  gcry_sexp_release (key_sexp);
-  if (err)
-    {
-      _gcry_ac_data_destroy (key_data_secret);
-      _gcry_ac_data_destroy (key_data_public);
-      gcry_free (key_secret);
-      gcry_free (key_public);
-      gcry_free (key_pair_new);
-    }
-
-  return err;
-}
-
-/* Returns the key of type WHICH out of the key pair KEY_PAIR.  */
-gcry_ac_key_t
-_gcry_ac_key_pair_extract (gcry_ac_key_pair_t key_pair,
-                           gcry_ac_key_type_t which)
-{
-  gcry_ac_key_t key;
-
-  if (fips_mode ())
-    return NULL;
-
-  switch (which)
-    {
-    case GCRY_AC_KEY_SECRET:
-      key = key_pair->secret;
-      break;
-
-    case GCRY_AC_KEY_PUBLIC:
-      key = key_pair->public;
-      break;
-
-    default:
-      key = NULL;
-      break;
-    }
-
-  return key;
-}
-
-/* Destroys the key KEY.  */
-void
-_gcry_ac_key_destroy (gcry_ac_key_t key)
-{
-  unsigned int i;
-
-  if (key)
-    {
-      if (key->data)
-        {
-          for (i = 0; i < key->data->data_n; i++)
-            {
-              if (key->data->data[i].mpi)
-                gcry_mpi_release (key->data->data[i].mpi);
-              if (key->data->data[i].name)
-                gcry_free (key->data->data[i].name);
-            }
-          gcry_free (key->data->data);
-          gcry_free (key->data);
-        }
-      gcry_free (key);
-    }
-}
-
-/* Destroys the key pair KEY_PAIR.  */
-void
-_gcry_ac_key_pair_destroy (gcry_ac_key_pair_t key_pair)
-{
-  if (key_pair)
-    {
-      gcry_ac_key_destroy (key_pair->secret);
-      gcry_ac_key_destroy (key_pair->public);
-      gcry_free (key_pair);
-    }
-}
-
-/* Returns the data set contained in the key KEY.  */
-gcry_ac_data_t
-_gcry_ac_key_data_get (gcry_ac_key_t key)
-{
-  if (fips_mode ())
-    return NULL;
-  return key->data;
-}
-
-/* Verifies that the key KEY is sane via HANDLE.  */
-gcry_error_t
-_gcry_ac_key_test (gcry_ac_handle_t handle, gcry_ac_key_t key)
-{
-  gcry_sexp_t key_sexp;
-  gcry_error_t err;
-
-  if (fips_mode ())
-    return gpg_error (GPG_ERR_NOT_SUPPORTED);
-
-  key_sexp = NULL;
-  err = ac_data_construct (ac_key_identifiers[key->type], 0, 0,
-                          handle->algorithm_name, key->data, &key_sexp);
-  if (err)
-    goto out;
-
-  err = gcry_pk_testkey (key_sexp);
-
- out:
-
-  gcry_sexp_release (key_sexp);
-
-  return gcry_error (err);
-}
-
-/* Stores the number of bits of the key KEY in NBITS via HANDLE.  */
-gcry_error_t
-_gcry_ac_key_get_nbits (gcry_ac_handle_t handle,
-                       gcry_ac_key_t key, unsigned int *nbits)
-{
-  gcry_sexp_t key_sexp;
-  gcry_error_t err;
-  unsigned int n;
-
-  if (fips_mode ())
-    return gpg_error (GPG_ERR_NOT_SUPPORTED);
-
-  key_sexp = NULL;
-
-  err = ac_data_construct (ac_key_identifiers[key->type],
-                          0, 0, handle->algorithm_name, key->data, &key_sexp);
-  if (err)
-    goto out;
-
-  n = gcry_pk_get_nbits (key_sexp);
-  if (! n)
-    {
-      err = gcry_error (GPG_ERR_PUBKEY_ALGO);
-      goto out;
-    }
-
-  *nbits = n;
-
- out:
-
-  gcry_sexp_release (key_sexp);
-
-  return err;
-}
-
-/* Writes the 20 byte long key grip of the key KEY to KEY_GRIP via
-   HANDLE.  */
-gcry_error_t
-_gcry_ac_key_get_grip (gcry_ac_handle_t handle,
-                      gcry_ac_key_t key, unsigned char *key_grip)
-{
-  gcry_sexp_t key_sexp;
-  gcry_error_t err;
-  unsigned char *ret;
-
-  if (fips_mode ())
-    return gpg_error (GPG_ERR_NOT_SUPPORTED);
-
-  key_sexp = NULL;
-  err = ac_data_construct (ac_key_identifiers[key->type], 0, 0,
-                          handle->algorithm_name, key->data, &key_sexp);
-  if (err)
-    goto out;
-
-  ret = gcry_pk_get_keygrip (key_sexp, key_grip);
-  if (! ret)
-    {
-      err = gcry_error (GPG_ERR_INV_OBJ);
-      goto out;
-    }
-
-  err = 0;
-
- out:
-
-  gcry_sexp_release (key_sexp);
-
-  return err;
-}
-
-
-
-
-/*
- * Functions performing cryptographic operations.
- */
-
-/* Encrypts the plain text MPI value DATA_PLAIN with the key public
-   KEY under the control of the flags FLAGS and stores the resulting
-   data set into DATA_ENCRYPTED.  */
-gcry_error_t
-_gcry_ac_data_encrypt (gcry_ac_handle_t handle,
-                      unsigned int flags,
-                      gcry_ac_key_t key,
-                      gcry_mpi_t data_plain,
-                      gcry_ac_data_t *data_encrypted)
-{
-  gcry_ac_data_t data_encrypted_new;
-  gcry_ac_data_t data_value;
-  gcry_sexp_t sexp_request;
-  gcry_sexp_t sexp_reply;
-  gcry_sexp_t sexp_key;
-  gcry_error_t err;
-
-  if (fips_mode ())
-    return gpg_error (GPG_ERR_NOT_SUPPORTED);
-
-  data_encrypted_new = NULL;
-  sexp_request = NULL;
-  sexp_reply = NULL;
-  data_value = NULL;
-  sexp_key = NULL;
-
-  if (key->type != GCRY_AC_KEY_PUBLIC)
-    {
-      err = gcry_error (GPG_ERR_WRONG_KEY_USAGE);
-      goto out;
-    }
-
-  err = ac_data_construct (ac_key_identifiers[key->type], 0, 0,
-                          handle->algorithm_name, key->data, &sexp_key);
-  if (err)
-    goto out;
-
-  err = _gcry_ac_data_new (&data_value);
-  if (err)
-    goto out;
-
-  err = _gcry_ac_data_set (data_value, 0, "value", data_plain);
-  if (err)
-    goto out;
-
-  err = ac_data_construct ("data", 1, flags, handle->algorithm_name,
-                          data_value, &sexp_request);
-  if (err)
-    goto out;
-
-  /* FIXME: error vs. errcode? */
-
-  err = gcry_pk_encrypt (&sexp_reply, sexp_request, sexp_key);
-  if (err)
-    goto out;
-
-  /* Extract data.  */
-  err = ac_data_extract ("enc-val", handle->algorithm_name,
-                        sexp_reply, &data_encrypted_new);
-  if (err)
-    goto out;
-
-  *data_encrypted = data_encrypted_new;
-
- out:
-
-  /* Deallocate resources.  */
-
-  gcry_sexp_release (sexp_request);
-  gcry_sexp_release (sexp_reply);
-  gcry_sexp_release (sexp_key);
-  _gcry_ac_data_destroy (data_value);
-
-  return err;
-}
-
-/* Decrypts the encrypted data contained in the data set
-   DATA_ENCRYPTED with the secret key KEY under the control of the
-   flags FLAGS and stores the resulting plain text MPI value in
-   DATA_PLAIN.  */
-gcry_error_t
-_gcry_ac_data_decrypt (gcry_ac_handle_t handle,
-                      unsigned int flags,
-                      gcry_ac_key_t key,
-                      gcry_mpi_t *data_plain,
-                      gcry_ac_data_t data_encrypted)
-{
-  gcry_mpi_t data_decrypted;
-  gcry_sexp_t sexp_request;
-  gcry_sexp_t sexp_reply;
-  gcry_sexp_t sexp_value;
-  gcry_sexp_t sexp_key;
-  gcry_error_t err;
-
-  if (fips_mode ())
-    return gpg_error (GPG_ERR_NOT_SUPPORTED);
-
-  sexp_request = NULL;
-  sexp_reply = NULL;
-  sexp_value = NULL;
-  sexp_key = NULL;
-
-  if (key->type != GCRY_AC_KEY_SECRET)
-    {
-      err = gcry_error (GPG_ERR_WRONG_KEY_USAGE);
-      goto out;
-    }
-
-  err = ac_data_construct (ac_key_identifiers[key->type], 0, 0,
-                          handle->algorithm_name, key->data, &sexp_key);
-  if (err)
-    goto out;
-
-  /* Create S-expression from data.  */
-  err = ac_data_construct ("enc-val", 1, flags, handle->algorithm_name,
-                          data_encrypted, &sexp_request);
-  if (err)
-    goto out;
-
-  /* Decrypt.  */
-  err = gcry_pk_decrypt (&sexp_reply, sexp_request, sexp_key);
-  if (err)
-    goto out;
-
-  /* Extract plain text. */
-  sexp_value = gcry_sexp_find_token (sexp_reply, "value", 0);
-  if (! sexp_value)
-    {
-      /* FIXME?  */
-      err = gcry_error (GPG_ERR_GENERAL);
-      goto out;
-    }
-
-  data_decrypted = gcry_sexp_nth_mpi (sexp_value, 1, GCRYMPI_FMT_USG);
-  if (! data_decrypted)
-    {
-      err = gcry_error (GPG_ERR_GENERAL);
-      goto out;
-    }
-
-  *data_plain = data_decrypted;
-
- out:
-
-  /* Deallocate resources.  */
-  gcry_sexp_release (sexp_request);
-  gcry_sexp_release (sexp_reply);
-  gcry_sexp_release (sexp_value);
-  gcry_sexp_release (sexp_key);
-
-  return gcry_error (err);
-
-}
-
-/* Signs the data contained in DATA with the secret key KEY and stores
-   the resulting signature data set in DATA_SIGNATURE.  */
-gcry_error_t
-_gcry_ac_data_sign (gcry_ac_handle_t handle,
-                   gcry_ac_key_t key,
-                   gcry_mpi_t data,
-                   gcry_ac_data_t *data_signature)
-{
-  gcry_ac_data_t data_signed;
-  gcry_ac_data_t data_value;
-  gcry_sexp_t sexp_request;
-  gcry_sexp_t sexp_reply;
-  gcry_sexp_t sexp_key;
-  gcry_error_t err;
-
-  if (fips_mode ())
-    return gpg_error (GPG_ERR_NOT_SUPPORTED);
-
-  data_signed = NULL;
-  data_value = NULL;
-  sexp_request = NULL;
-  sexp_reply = NULL;
-  sexp_key = NULL;
-
-  if (key->type != GCRY_AC_KEY_SECRET)
-    {
-      err = gcry_error (GPG_ERR_WRONG_KEY_USAGE);
-      goto out;
-    }
-
-  err = ac_data_construct (ac_key_identifiers[key->type], 0, 0,
-                          handle->algorithm_name, key->data, &sexp_key);
-  if (err)
-    goto out;
-
-  err = _gcry_ac_data_new (&data_value);
-  if (err)
-    goto out;
-
-  err = _gcry_ac_data_set (data_value, 0, "value", data);
-  if (err)
-    goto out;
-
-  /* Create S-expression holding the data.  */
-  err = ac_data_construct ("data", 1, 0, NULL, data_value, &sexp_request);
-  if (err)
-    goto out;
-
-  /* Sign.  */
-  err = gcry_pk_sign (&sexp_reply, sexp_request, sexp_key);
-  if (err)
-    goto out;
-
-  /* Extract data.  */
-  err = ac_data_extract ("sig-val", handle->algorithm_name,
-                        sexp_reply, &data_signed);
-  if (err)
-    goto out;
-
-  /* Done.  */
-  *data_signature = data_signed;
-
- out:
-
-  gcry_sexp_release (sexp_request);
-  gcry_sexp_release (sexp_reply);
-  gcry_sexp_release (sexp_key);
-  _gcry_ac_data_destroy (data_value);
-
-  return gcry_error (err);
-}
-
-
-/* Verifies that the signature contained in the data set
-   DATA_SIGNATURE is indeed the result of signing the data contained
-   in DATA with the secret key belonging to the public key KEY.  */
-gcry_error_t
-_gcry_ac_data_verify (gcry_ac_handle_t handle,
-                     gcry_ac_key_t key,
-                     gcry_mpi_t data,
-                     gcry_ac_data_t data_signature)
-{
-  gcry_sexp_t sexp_signature;
-  gcry_ac_data_t data_value;
-  gcry_sexp_t sexp_data;
-  gcry_sexp_t sexp_key;
-  gcry_error_t err;
-
-  if (fips_mode ())
-    return gpg_error (GPG_ERR_NOT_SUPPORTED);
-
-  sexp_signature = NULL;
-  data_value = NULL;
-  sexp_data = NULL;
-  sexp_key = NULL;
-
-  err = ac_data_construct ("public-key", 0, 0,
-                          handle->algorithm_name, key->data, &sexp_key);
-  if (err)
-    goto out;
-
-  if (key->type != GCRY_AC_KEY_PUBLIC)
-    {
-      err = gcry_error (GPG_ERR_WRONG_KEY_USAGE);
-      goto out;
-    }
-
-  /* Construct S-expression holding the signature data.  */
-  err = ac_data_construct ("sig-val", 1, 0, handle->algorithm_name,
-                          data_signature, &sexp_signature);
-  if (err)
-    goto out;
-
-  err = _gcry_ac_data_new (&data_value);
-  if (err)
-    goto out;
-
-  err = _gcry_ac_data_set (data_value, 0, "value", data);
-  if (err)
-    goto out;
-
-  /* Construct S-expression holding the data.  */
-  err = ac_data_construct ("data", 1, 0, NULL, data_value, &sexp_data);
-  if (err)
-    goto out;
-
-  /* Verify signature.  */
-  err = gcry_pk_verify (sexp_signature, sexp_data, sexp_key);
-
- out:
-
-  gcry_sexp_release (sexp_signature);
-  gcry_sexp_release (sexp_data);
-  gcry_sexp_release (sexp_key);
-  _gcry_ac_data_destroy (data_value);
-
-  return gcry_error (err);
-}
-
-
-
-
-/*
- * Implementation of encoding methods (em).
- */
-
-/* Type for functions that encode or decode (hence the name) a
-   message.  */
-typedef gcry_error_t (*gcry_ac_em_dencode_t) (unsigned int flags,
-                                                void *options,
-                                                gcry_ac_io_t *ac_io_read,
-                                                gcry_ac_io_t *ac_io_write);
-
-/* Fill the buffer BUFFER which is BUFFER_N bytes long with non-zero
-   random bytes of random level LEVEL.  */
-static void
-em_randomize_nonzero (unsigned char *buffer, size_t buffer_n,
-                     gcry_random_level_t level)
-{
-  unsigned char *buffer_rand;
-  unsigned int buffer_rand_n;
-  unsigned int zeros;
-  unsigned int i;
-  unsigned int j;
-
-  for (i = 0; i < buffer_n; i++)
-    buffer[i] = 0;
-
-  do
-    {
-      /* Count zeros.  */
-      for (i = zeros = 0; i < buffer_n; i++)
-       if (! buffer[i])
-         zeros++;
-
-      if (zeros)
-       {
-         /* Get random bytes.  */
-         buffer_rand_n = zeros + (zeros / 128);
-         buffer_rand = gcry_random_bytes_secure (buffer_rand_n, level);
-
-         /* Substitute zeros with non-zero random bytes.  */
-         for (i = j = 0; zeros && (i < buffer_n) && (j < buffer_rand_n); i++)
-           if (! buffer[i])
-             {
-               while ((j < buffer_rand_n) && (! buffer_rand[j]))
-                 j++;
-               if (j < buffer_rand_n)
-                 {
-                   buffer[i] = buffer_rand[j++];
-                   zeros--;
-                 }
-               else
-                 break;
-             }
-         gcry_free (buffer_rand);
-       }
-    }
-  while (zeros);
-}
-
-/* Encode a message according to the Encoding Method for Encryption
-   `PKCS-V1_5' (EME-PKCS-V1_5).  */
-static gcry_error_t
-eme_pkcs_v1_5_encode (unsigned int flags, void *opts,
-                     gcry_ac_io_t *ac_io_read,
-                     gcry_ac_io_t *ac_io_write)
-{
-  gcry_ac_eme_pkcs_v1_5_t *options;
-  gcry_error_t err;
-  unsigned char *buffer;
-  unsigned char *ps;
-  unsigned char *m;
-  size_t m_n;
-  unsigned int ps_n;
-  unsigned int k;
-
-  (void)flags;
-
-  options = opts;
-  buffer = NULL;
-  m = NULL;
-
-  err = _gcry_ac_io_read_all (ac_io_read, &m, &m_n);
-  if (err)
-    goto out;
-
-  /* Figure out key length in bytes.  */
-  k = options->key_size / 8;
-
-  if (m_n > k - 11)
-    {
-      /* Key is too short for message.  */
-      err = gcry_error (GPG_ERR_TOO_SHORT);
-      goto out;
-    }
-
-  /* According to this encoding method, the first byte of the encoded
-     message is zero.  This byte will be lost anyway, when the encoded
-     message is to be converted into an MPI, that's why we skip
-     it.  */
-
-  /* Allocate buffer.  */
-  buffer = gcry_malloc (k - 1);
-  if (! buffer)
-    {
-      err = gcry_error_from_errno (errno);
-      goto out;
-    }
-
-  /* Generate an octet string PS of length k - mLen - 3 consisting
-     of pseudorandomly generated nonzero octets.  The length of PS
-     will be at least eight octets.  */
-  ps_n = k - m_n - 3;
-  ps = buffer + 1;
-  em_randomize_nonzero (ps, ps_n, GCRY_STRONG_RANDOM);
-
-  /* Concatenate PS, the message M, and other padding to form an
-     encoded message EM of length k octets as:
-
-     EM = 0x00 || 0x02 || PS || 0x00 || M.  */
-
-  buffer[0] = 0x02;
-  buffer[ps_n + 1] = 0x00;
-  memcpy (buffer + ps_n + 2, m, m_n);
-
-  err = _gcry_ac_io_write (ac_io_write, buffer, k - 1);
-
- out:
-
-  gcry_free (buffer);
-  gcry_free (m);
-
-  return err;
-}
-
-/* Decode a message according to the Encoding Method for Encryption
-   `PKCS-V1_5' (EME-PKCS-V1_5).  */
-static gcry_error_t
-eme_pkcs_v1_5_decode (unsigned int flags, void *opts,
-                     gcry_ac_io_t *ac_io_read,
-                     gcry_ac_io_t *ac_io_write)
-{
-  gcry_ac_eme_pkcs_v1_5_t *options;
-  unsigned char *buffer;
-  unsigned char *em;
-  size_t em_n;
-  gcry_error_t err;
-  unsigned int i;
-  unsigned int k;
-
-  (void)flags;
-
-  options = opts;
-  buffer = NULL;
-  em = NULL;
-
-  err = _gcry_ac_io_read_all (ac_io_read, &em, &em_n);
-  if (err)
-    goto out;
-
-  /* Figure out key size.  */
-  k = options->key_size / 8;
-
-  /* Search for zero byte.  */
-  for (i = 0; (i < em_n) && em[i]; i++);
-
-  /* According to this encoding method, the first byte of the encoded
-     message should be zero.  This byte is lost.  */
-
-  if (! ((em_n >= 10)
-        && (em_n == (k - 1))
-        && (em[0] == 0x02)
-        && (i < em_n)
-        && ((i - 1) >= 8)))
-    {
-      err = gcry_error (GPG_ERR_DECRYPT_FAILED);
-      goto out;
-    }
-
-  i++;
-  buffer = gcry_malloc (em_n - i);
-  if (! buffer)
-    {
-      err = gcry_error_from_errno (errno);
-      goto out;
-    }
-
-  memcpy (buffer, em + i, em_n - i);
-  err = _gcry_ac_io_write (ac_io_write, buffer, em_n - i);
-
- out:
-
-  gcry_free (buffer);
-  gcry_free (em);
-
-  return err;
-}
-
-static gcry_error_t
-emsa_pkcs_v1_5_encode_data_cb (void *opaque,
-                              unsigned char *buffer, size_t buffer_n)
-{
-  gcry_md_hd_t md_handle;
-
-  md_handle = opaque;
-  gcry_md_write (md_handle, buffer, buffer_n);
-
-  return 0;
-}
-
-
-/* Encode a message according to the Encoding Method for Signatures
-   with Appendix `PKCS-V1_5' (EMSA-PKCS-V1_5).  */
-static gcry_error_t
-emsa_pkcs_v1_5_encode (unsigned int flags, void *opts,
-                      gcry_ac_io_t *ac_io_read,
-                      gcry_ac_io_t *ac_io_write)
-{
-  gcry_ac_emsa_pkcs_v1_5_t *options;
-  gcry_error_t err;
-  gcry_md_hd_t md;
-  unsigned char *t;
-  size_t t_n;
-  unsigned char *h;
-  size_t h_n;
-  unsigned char *ps;
-  size_t ps_n;
-  unsigned char *buffer;
-  size_t buffer_n;
-  unsigned char asn[100];      /* FIXME, always enough?  */
-  size_t asn_n;
-  unsigned int i;
-
-  (void)flags;
-
-  options = opts;
-  buffer = NULL;
-  md = NULL;
-  ps = NULL;
-  t = NULL;
-
-  /* Create hashing handle and get the necessary information.  */
-  err = gcry_md_open (&md, options->md, 0);
-  if (err)
-    goto out;
-
-  asn_n = DIM (asn);
-  err = gcry_md_algo_info (options->md, GCRYCTL_GET_ASNOID, asn, &asn_n);
-  if (err)
-    goto out;
-
-  h_n = gcry_md_get_algo_dlen (options->md);
-
-  err = _gcry_ac_io_process (ac_io_read, emsa_pkcs_v1_5_encode_data_cb, md);
-  if (err)
-    goto out;
-
-  h = gcry_md_read (md, 0);
-
-  /* Encode the algorithm ID for the hash function and the hash value
-     into an ASN.1 value of type DigestInfo with the Distinguished
-     Encoding Rules (DER), where the type DigestInfo has the syntax:
-
-     DigestInfo ::== SEQUENCE {
-     digestAlgorithm AlgorithmIdentifier,
-     digest OCTET STRING
-     }
-
-     The first field identifies the hash function and the second
-     contains the hash value.  Let T be the DER encoding of the
-     DigestInfo value and let tLen be the length in octets of T.  */
-
-  t_n = asn_n + h_n;
-  t = gcry_malloc (t_n);
-  if (! t)
-    {
-      err = gcry_error_from_errno (errno);
-      goto out;
-    }
-
-  for (i = 0; i < asn_n; i++)
-    t[i] = asn[i];
-  for (i = 0; i < h_n; i++)
-    t[asn_n + i] = h[i];
-
-  /* If emLen < tLen + 11, output "intended encoded message length
-     too short" and stop.  */
-  if (options->em_n < t_n + 11)
-    {
-      err = gcry_error (GPG_ERR_TOO_SHORT);
-      goto out;
-    }
-
-  /* Generate an octet string PS consisting of emLen - tLen - 3 octets
-     with hexadecimal value 0xFF.  The length of PS will be at least 8
-     octets.  */
-  ps_n = options->em_n - t_n - 3;
-  ps = gcry_malloc (ps_n);
-  if (! ps)
-    {
-      err = gcry_error_from_errno (errno);
-      goto out;
-    }
-  for (i = 0; i < ps_n; i++)
-    ps[i] = 0xFF;
-
-  /* Concatenate PS, the DER encoding T, and other padding to form the
-     encoded message EM as:
-
-     EM = 0x00 || 0x01 || PS || 0x00 || T.  */
-
-  buffer_n = ps_n + t_n + 3;
-  buffer = gcry_malloc (buffer_n);
-  if (! buffer)
-    {
-      err = gcry_error_from_errno (errno);
-      goto out;
-    }
-
-  buffer[0] = 0x00;
-  buffer[1] = 0x01;
-  for (i = 0; i < ps_n; i++)
-    buffer[2 + i] = ps[i];
-  buffer[2 + ps_n] = 0x00;
-  for (i = 0; i < t_n; i++)
-    buffer[3 + ps_n + i] = t[i];
-
-  err = _gcry_ac_io_write (ac_io_write, buffer, buffer_n);
-
- out:
-
-  gcry_md_close (md);
-
-  gcry_free (buffer);
-  gcry_free (ps);
-  gcry_free (t);
-
-  return err;
-}
-
-/* `Actions' for data_dencode().  */
-typedef enum dencode_action
-  {
-    DATA_ENCODE,
-    DATA_DECODE,
-  }
-dencode_action_t;
-
-/* Encode or decode a message according to the the encoding method
-   METHOD; ACTION specifies whether the message that is contained in
-   BUFFER_IN and of length BUFFER_IN_N should be encoded or decoded.
-   The resulting message will be stored in a newly allocated buffer in
-   BUFFER_OUT and BUFFER_OUT_N.  */
-static gcry_error_t
-ac_data_dencode (gcry_ac_em_t method, dencode_action_t action,
-                unsigned int flags, void *options,
-                gcry_ac_io_t *ac_io_read,
-                gcry_ac_io_t *ac_io_write)
-{
-  struct
-  {
-    gcry_ac_em_t method;
-    gcry_ac_em_dencode_t encode;
-    gcry_ac_em_dencode_t decode;
-  } methods[] =
-    {
-      { GCRY_AC_EME_PKCS_V1_5,
-       eme_pkcs_v1_5_encode, eme_pkcs_v1_5_decode },
-      { GCRY_AC_EMSA_PKCS_V1_5,
-       emsa_pkcs_v1_5_encode, NULL },
-    };
-  size_t methods_n;
-  gcry_error_t err;
-  unsigned int i;
-
-  methods_n = sizeof (methods) / sizeof (*methods);
-
-  for (i = 0; i < methods_n; i++)
-    if (methods[i].method == method)
-      break;
-  if (i == methods_n)
-    {
-      err = gcry_error (GPG_ERR_NOT_FOUND);    /* FIXME? */
-      goto out;
-    }
-
-  err = 0;
-  switch (action)
-    {
-    case DATA_ENCODE:
-      if (methods[i].encode)
-       /* FIXME? */
-       err = (*methods[i].encode) (flags, options, ac_io_read, ac_io_write);
-      break;
-
-    case DATA_DECODE:
-      if (methods[i].decode)
-       /* FIXME? */
-       err = (*methods[i].decode) (flags, options, ac_io_read, ac_io_write);
-      break;
-
-    default:
-      err = gcry_error (GPG_ERR_INV_ARG);
-      break;
-    }
-
- out:
-
-  return err;
-}
-
-/* Encode a message according to the encoding method METHOD.  OPTIONS
-   must be a pointer to a method-specific structure
-   (gcry_ac_em*_t).  */
-gcry_error_t
-_gcry_ac_data_encode (gcry_ac_em_t method,
-                     unsigned int flags, void *options,
-                     gcry_ac_io_t *ac_io_read,
-                     gcry_ac_io_t *ac_io_write)
-{
-  if (fips_mode ())
-    return gpg_error (GPG_ERR_NOT_SUPPORTED);
-
-  return ac_data_dencode (method, DATA_ENCODE, flags, options,
-                         ac_io_read, ac_io_write);
-}
-
-/* Dencode a message according to the encoding method METHOD.  OPTIONS
-   must be a pointer to a method-specific structure
-   (gcry_ac_em*_t).  */
-gcry_error_t
-_gcry_ac_data_decode (gcry_ac_em_t method,
-                     unsigned int flags, void *options,
-                     gcry_ac_io_t *ac_io_read,
-                     gcry_ac_io_t *ac_io_write)
-{
-  if (fips_mode ())
-    return gpg_error (GPG_ERR_NOT_SUPPORTED);
-
-  return ac_data_dencode (method, DATA_DECODE, flags, options,
-                         ac_io_read, ac_io_write);
-}
-
-/* Convert an MPI into an octet string.  */
-void
-_gcry_ac_mpi_to_os (gcry_mpi_t mpi, unsigned char *os, size_t os_n)
-{
-  unsigned long digit;
-  gcry_mpi_t base;
-  unsigned int i;
-  unsigned int n;
-  gcry_mpi_t m;
-  gcry_mpi_t d;
-
-  if (fips_mode ())
-    return;
-
-  base = gcry_mpi_new (0);
-  gcry_mpi_set_ui (base, 256);
-
-  n = 0;
-  m = gcry_mpi_copy (mpi);
-  while (gcry_mpi_cmp_ui (m, 0))
-    {
-      n++;
-      gcry_mpi_div (m, NULL, m, base, 0);
-    }
-
-  gcry_mpi_set (m, mpi);
-  d = gcry_mpi_new (0);
-  for (i = 0; (i < n) && (i < os_n); i++)
-    {
-      gcry_mpi_mod (d, m, base);
-      _gcry_mpi_get_ui (d, &digit);
-      gcry_mpi_div (m, NULL, m, base, 0);
-      os[os_n - i - 1] = (digit & 0xFF);
-    }
-
-  for (; i < os_n; i++)
-    os[os_n - i - 1] = 0;
-
-  gcry_mpi_release (base);
-  gcry_mpi_release (d);
-  gcry_mpi_release (m);
-}
-
-/* Convert an MPI into an newly allocated octet string.  */
-gcry_error_t
-_gcry_ac_mpi_to_os_alloc (gcry_mpi_t mpi, unsigned char **os, size_t *os_n)
-{
-  unsigned char *buffer;
-  size_t buffer_n;
-  gcry_error_t err;
-  unsigned int nbits;
-
-  if (fips_mode ())
-    return gpg_error (GPG_ERR_NOT_SUPPORTED);
-
-  nbits = gcry_mpi_get_nbits (mpi);
-  buffer_n = (nbits + 7) / 8;
-  buffer = gcry_malloc (buffer_n);
-  if (! buffer)
-    {
-      err = gcry_error_from_errno (errno);
-      goto out;
-    }
-
-  _gcry_ac_mpi_to_os (mpi, buffer, buffer_n);
-  *os = buffer;
-  *os_n = buffer_n;
-  err = 0;
-
- out:
-
-  return err;
-}
-
-
-/* Convert an octet string into an MPI.  */
-void
-_gcry_ac_os_to_mpi (gcry_mpi_t mpi, unsigned char *os, size_t os_n)
-{
-  unsigned int i;
-  gcry_mpi_t xi;
-  gcry_mpi_t x;
-  gcry_mpi_t a;
-
-  if (fips_mode ())
-    return;
-
-  a = gcry_mpi_new (0);
-  gcry_mpi_set_ui (a, 1);
-  x = gcry_mpi_new (0);
-  gcry_mpi_set_ui (x, 0);
-  xi = gcry_mpi_new (0);
-
-  for (i = 0; i < os_n; i++)
-    {
-      gcry_mpi_mul_ui (xi, a, os[os_n - i - 1]);
-      gcry_mpi_add (x, x, xi);
-      gcry_mpi_mul_ui (a, a, 256);
-    }
-
-  gcry_mpi_release (xi);
-  gcry_mpi_release (a);
-
-  gcry_mpi_set (mpi, x);
-  gcry_mpi_release (x);                /* FIXME: correct? */
-}
-
-
-
-/*
- * Implementation of Encryption Schemes (ES) and Signature Schemes
- * with Appendix (SSA).
- */
-
-/* Schemes consist of two things: encoding methods and cryptographic
-   primitives.
-
-   Since encoding methods are accessible through a common API with
-   method-specific options passed as an anonymous struct, schemes have
-   to provide functions that construct this method-specific structure;
-   this is what the functions of type `gcry_ac_dencode_prepare_t' are
-   there for.  */
-
-typedef gcry_error_t (*gcry_ac_dencode_prepare_t) (gcry_ac_handle_t handle,
-                                                  gcry_ac_key_t key,
-                                                  void *opts,
-                                                  void *opts_em);
-
-/* The `dencode_prepare' function for ES-PKCS-V1_5.  */
-static gcry_error_t
-ac_es_dencode_prepare_pkcs_v1_5 (gcry_ac_handle_t handle, gcry_ac_key_t key,
-                                void *opts, void *opts_em)
-{
-  gcry_ac_eme_pkcs_v1_5_t *options_em;
-  unsigned int nbits;
-  gcry_error_t err;
-
-  (void)opts;
-
-  err = _gcry_ac_key_get_nbits (handle, key, &nbits);
-  if (err)
-    goto out;
-
-  options_em = opts_em;
-  options_em->key_size = nbits;
-
- out:
-
-  return err;
-}
-
-/* The `dencode_prepare' function for SSA-PKCS-V1_5.  */
-static gcry_error_t
-ac_ssa_dencode_prepare_pkcs_v1_5 (gcry_ac_handle_t handle, gcry_ac_key_t key,
-                                 void *opts, void *opts_em)
-{
-  gcry_ac_emsa_pkcs_v1_5_t *options_em;
-  gcry_ac_ssa_pkcs_v1_5_t *options;
-  gcry_error_t err;
-  unsigned int k;
-
-  options_em = opts_em;
-  options = opts;
-
-  err = _gcry_ac_key_get_nbits (handle, key, &k);
-  if (err)
-    goto out;
-
-  k = (k + 7) / 8;
-  options_em->md = options->md;
-  options_em->em_n = k;
-
- out:
-
-  return err;
-}
-
-/* Type holding the information about each supported
-   Encryption/Signature Scheme.  */
-typedef struct ac_scheme
-{
-  gcry_ac_scheme_t scheme;
-  gcry_ac_em_t scheme_encoding;
-  gcry_ac_dencode_prepare_t dencode_prepare;
-  size_t options_em_n;
-} ac_scheme_t;
-
-/* List of supported Schemes.  */
-static ac_scheme_t ac_schemes[] =
-  {
-    { GCRY_AC_ES_PKCS_V1_5, GCRY_AC_EME_PKCS_V1_5,
-      ac_es_dencode_prepare_pkcs_v1_5,
-      sizeof (gcry_ac_eme_pkcs_v1_5_t) },
-    { GCRY_AC_SSA_PKCS_V1_5, GCRY_AC_EMSA_PKCS_V1_5,
-      ac_ssa_dencode_prepare_pkcs_v1_5,
-      sizeof (gcry_ac_emsa_pkcs_v1_5_t) }
-  };
-
-/* Lookup a scheme by it's ID.  */
-static ac_scheme_t *
-ac_scheme_get (gcry_ac_scheme_t scheme)
-{
-  ac_scheme_t *ac_scheme;
-  unsigned int i;
-
-  for (i = 0; i < DIM (ac_schemes); i++)
-    if (scheme == ac_schemes[i].scheme)
-      break;
-  if (i == DIM (ac_schemes))
-    ac_scheme = NULL;
-  else
-    ac_scheme = ac_schemes + i;
-
-  return ac_scheme;
-}
-
-/* Prepares the encoding/decoding by creating an according option
-   structure.  */
-static gcry_error_t
-ac_dencode_prepare (gcry_ac_handle_t handle, gcry_ac_key_t key, void *opts,
-                   ac_scheme_t scheme, void **opts_em)
-{
-  gcry_error_t err;
-  void *options_em;
-
-  options_em = gcry_malloc (scheme.options_em_n);
-  if (! options_em)
-    {
-      err = gcry_error_from_errno (errno);
-      goto out;
-    }
-
-  err = (*scheme.dencode_prepare) (handle, key, opts, options_em);
-  if (err)
-    goto out;
-
-  *opts_em = options_em;
-
- out:
-
-  if (err)
-    free (options_em);
-
-  return err;
-}
-
-/* Convert a data set into a single MPI; currently, this is only
-   supported for data sets containing a single MPI.  */
-static gcry_error_t
-ac_data_set_to_mpi (gcry_ac_data_t data, gcry_mpi_t *mpi)
-{
-  gcry_error_t err;
-  gcry_mpi_t mpi_new;
-  unsigned int elems;
-
-  elems = _gcry_ac_data_length (data);
-
-  if (elems != 1)
-    {
-      /* FIXME: I guess, we should be more flexible in this respect by
-        allowing the actual encryption/signature schemes to implement
-        this conversion mechanism.  */
-      err = gcry_error (GPG_ERR_CONFLICT);
-      goto out;
-    }
-
-  err = _gcry_ac_data_get_index (data, GCRY_AC_FLAG_COPY, 0, NULL, &mpi_new);
-  if (err)
-    goto out;
-
-  *mpi = mpi_new;
-
- out:
-
-  return err;
-}
-
-/* Encrypts the plain text message contained in M, which is of size
-   M_N, with the public key KEY_PUBLIC according to the Encryption
-   Scheme SCHEME_ID.  HANDLE is used for accessing the low-level
-   cryptographic primitives.  If OPTS is not NULL, it has to be an
-   anonymous structure specific to the chosen scheme (gcry_ac_es_*_t).
-   The encrypted message will be stored in C and C_N.  */
-gcry_error_t
-_gcry_ac_data_encrypt_scheme (gcry_ac_handle_t handle,
-                             gcry_ac_scheme_t scheme_id,
-                             unsigned int flags, void *opts,
-                             gcry_ac_key_t key,
-                             gcry_ac_io_t *io_message,
-                             gcry_ac_io_t *io_cipher)
-{
-  gcry_error_t err;
-  gcry_ac_io_t io_em;
-  unsigned char *em;
-  size_t em_n;
-  gcry_mpi_t mpi_plain;
-  gcry_ac_data_t data_encrypted;
-  gcry_mpi_t mpi_encrypted;
-  unsigned char *buffer;
-  size_t buffer_n;
-  void *opts_em;
-  ac_scheme_t *scheme;
-
-  (void)flags;
-
-  if (fips_mode ())
-    return gpg_error (GPG_ERR_NOT_SUPPORTED);
-
-  data_encrypted = NULL;
-  mpi_encrypted = NULL;
-  mpi_plain = NULL;
-  opts_em = NULL;
-  buffer = NULL;
-  em = NULL;
-
-  scheme = ac_scheme_get (scheme_id);
-  if (! scheme)
-    {
-      err = gcry_error (GPG_ERR_NO_ENCRYPTION_SCHEME);
-      goto out;
-    }
-
-  if (key->type != GCRY_AC_KEY_PUBLIC)
-    {
-      err = gcry_error (GPG_ERR_WRONG_KEY_USAGE);
-      goto out;
-    }
-
-  err = ac_dencode_prepare (handle, key, opts, *scheme, &opts_em);
-  if (err)
-    goto out;
-
-  _gcry_ac_io_init (&io_em, GCRY_AC_IO_WRITABLE,
-                   GCRY_AC_IO_STRING, &em, &em_n);
-
-  err = _gcry_ac_data_encode (scheme->scheme_encoding, 0, opts_em,
-                             io_message, &io_em);
-  if (err)
-    goto out;
-
-  mpi_plain = gcry_mpi_snew (0);
-  gcry_ac_os_to_mpi (mpi_plain, em, em_n);
-
-  err = _gcry_ac_data_encrypt (handle, 0, key, mpi_plain, &data_encrypted);
-  if (err)
-    goto out;
-
-  err = ac_data_set_to_mpi (data_encrypted, &mpi_encrypted);
-  if (err)
-    goto out;
-
-  err = _gcry_ac_mpi_to_os_alloc (mpi_encrypted, &buffer, &buffer_n);
-  if (err)
-    goto out;
-
-  err = _gcry_ac_io_write (io_cipher, buffer, buffer_n);
-
- out:
-
-  gcry_ac_data_destroy (data_encrypted);
-  gcry_mpi_release (mpi_encrypted);
-  gcry_mpi_release (mpi_plain);
-  gcry_free (opts_em);
-  gcry_free (buffer);
-  gcry_free (em);
-
-  return err;
-}
-
-/* Decryptes the cipher message contained in C, which is of size C_N,
-   with the secret key KEY_SECRET according to the Encryption Scheme
-   SCHEME_ID.  Handle is used for accessing the low-level
-   cryptographic primitives.  If OPTS is not NULL, it has to be an
-   anonymous structure specific to the chosen scheme (gcry_ac_es_*_t).
-   The decrypted message will be stored in M and M_N.  */
-gcry_error_t
-_gcry_ac_data_decrypt_scheme (gcry_ac_handle_t handle,
-                             gcry_ac_scheme_t scheme_id,
-                             unsigned int flags, void *opts,
-                             gcry_ac_key_t key,
-                             gcry_ac_io_t *io_cipher,
-                             gcry_ac_io_t *io_message)
-{
-  gcry_ac_io_t io_em;
-  gcry_error_t err;
-  gcry_ac_data_t data_encrypted;
-  unsigned char *em;
-  size_t em_n;
-  gcry_mpi_t mpi_encrypted;
-  gcry_mpi_t mpi_decrypted;
-  void *opts_em;
-  ac_scheme_t *scheme;
-  char *elements_enc;
-  size_t elements_enc_n;
-  unsigned char *c;
-  size_t c_n;
-
-  (void)flags;
-
-  if (fips_mode ())
-    return gpg_error (GPG_ERR_NOT_SUPPORTED);
-
-  data_encrypted = NULL;
-  mpi_encrypted = NULL;
-  mpi_decrypted = NULL;
-  elements_enc = NULL;
-  opts_em = NULL;
-  em = NULL;
-  c = NULL;
-
-  scheme = ac_scheme_get (scheme_id);
-  if (! scheme)
-    {
-      err = gcry_error (GPG_ERR_NO_ENCRYPTION_SCHEME);
-      goto out;
-    }
-
-  if (key->type != GCRY_AC_KEY_SECRET)
-    {
-      err = gcry_error (GPG_ERR_WRONG_KEY_USAGE);
-      goto out;
-    }
-
-  err = _gcry_ac_io_read_all (io_cipher, &c, &c_n);
-  if (err)
-    goto out;
-
-  mpi_encrypted = gcry_mpi_snew (0);
-  gcry_ac_os_to_mpi (mpi_encrypted, c, c_n);
-
-  err = _gcry_pk_get_elements (handle->algorithm, &elements_enc, NULL);
-  if (err)
-    goto out;
-
-  elements_enc_n = strlen (elements_enc);
-  if (elements_enc_n != 1)
-    {
-      /* FIXME? */
-      err = gcry_error (GPG_ERR_CONFLICT);
-      goto out;
-    }
-
-  err = _gcry_ac_data_new (&data_encrypted);
-  if (err)
-    goto out;
-
-  err = _gcry_ac_data_set (data_encrypted, GCRY_AC_FLAG_COPY | 
GCRY_AC_FLAG_DEALLOC,
-                          elements_enc, mpi_encrypted);
-  if (err)
-    goto out;
-
-  err = _gcry_ac_data_decrypt (handle, 0, key, &mpi_decrypted, data_encrypted);
-  if (err)
-    goto out;
-
-  err = _gcry_ac_mpi_to_os_alloc (mpi_decrypted, &em, &em_n);
-  if (err)
-    goto out;
-
-  err = ac_dencode_prepare (handle, key, opts, *scheme, &opts_em);
-  if (err)
-    goto out;
-
-  _gcry_ac_io_init (&io_em, GCRY_AC_IO_READABLE,
-                   GCRY_AC_IO_STRING, em, em_n);
-
-  err = _gcry_ac_data_decode (scheme->scheme_encoding, 0, opts_em,
-                             &io_em, io_message);
-  if (err)
-    goto out;
-
- out:
-
-  _gcry_ac_data_destroy (data_encrypted);
-  gcry_mpi_release (mpi_encrypted);
-  gcry_mpi_release (mpi_decrypted);
-  free (elements_enc);
-  gcry_free (opts_em);
-  gcry_free (em);
-  gcry_free (c);
-
-  return err;
-}
-
-
-/* Signs the message contained in M, which is of size M_N, with the
-   secret key KEY according to the Signature Scheme SCHEME_ID.  Handle
-   is used for accessing the low-level cryptographic primitives.  If
-   OPTS is not NULL, it has to be an anonymous structure specific to
-   the chosen scheme (gcry_ac_ssa_*_t).  The signed message will be
-   stored in S and S_N.  */
-gcry_error_t
-_gcry_ac_data_sign_scheme (gcry_ac_handle_t handle,
-                          gcry_ac_scheme_t scheme_id,
-                          unsigned int flags, void *opts,
-                          gcry_ac_key_t key,
-                          gcry_ac_io_t *io_message,
-                          gcry_ac_io_t *io_signature)
-{
-  gcry_ac_io_t io_em;
-  gcry_error_t err;
-  gcry_ac_data_t data_signed;
-  unsigned char *em;
-  size_t em_n;
-  gcry_mpi_t mpi;
-  void *opts_em;
-  unsigned char *buffer;
-  size_t buffer_n;
-  gcry_mpi_t mpi_signed;
-  ac_scheme_t *scheme;
-
-  (void)flags;
-
-  if (fips_mode ())
-    return gpg_error (GPG_ERR_NOT_SUPPORTED);
-
-  data_signed = NULL;
-  mpi_signed = NULL;
-  opts_em = NULL;
-  buffer = NULL;
-  mpi = NULL;
-  em = NULL;
-
-  if (key->type != GCRY_AC_KEY_SECRET)
-    {
-      err = gcry_error (GPG_ERR_WRONG_KEY_USAGE);
-      goto out;
-    }
-
-  scheme = ac_scheme_get (scheme_id);
-  if (! scheme)
-    {
-      /* FIXME: adjust api of scheme_get in respect to err codes.  */
-      err = gcry_error (GPG_ERR_NO_SIGNATURE_SCHEME);
-      goto out;
-    }
-
-  err = ac_dencode_prepare (handle, key, opts, *scheme, &opts_em);
-  if (err)
-    goto out;
-
-  _gcry_ac_io_init (&io_em, GCRY_AC_IO_WRITABLE,
-                   GCRY_AC_IO_STRING, &em, &em_n);
-
-  err = _gcry_ac_data_encode (scheme->scheme_encoding, 0, opts_em,
-                             io_message, &io_em);
-  if (err)
-    goto out;
-
-  mpi = gcry_mpi_new (0);
-  _gcry_ac_os_to_mpi (mpi, em, em_n);
-
-  err = _gcry_ac_data_sign (handle, key, mpi, &data_signed);
-  if (err)
-    goto out;
-
-  err = ac_data_set_to_mpi (data_signed, &mpi_signed);
-  if (err)
-    goto out;
-
-  err = _gcry_ac_mpi_to_os_alloc (mpi_signed, &buffer, &buffer_n);
-  if (err)
-    goto out;
-
-  err = _gcry_ac_io_write (io_signature, buffer, buffer_n);
-
- out:
-
-  _gcry_ac_data_destroy (data_signed);
-  gcry_mpi_release (mpi_signed);
-  gcry_mpi_release (mpi);
-  gcry_free (opts_em);
-  gcry_free (buffer);
-  gcry_free (em);
-
-  return err;
-}
-
-/* Verifies that the signature contained in S, which is of length S_N,
-   is indeed the result of signing the message contained in M, which
-   is of size M_N, with the secret key belonging to the public key
-   KEY_PUBLIC.  If OPTS is not NULL, it has to be an anonymous
-   structure (gcry_ac_ssa_*_t) specific to the Signature Scheme, whose
-   ID is contained in SCHEME_ID.  */
-gcry_error_t
-_gcry_ac_data_verify_scheme (gcry_ac_handle_t handle,
-                            gcry_ac_scheme_t scheme_id,
-                            unsigned int flags, void *opts,
-                            gcry_ac_key_t key,
-                            gcry_ac_io_t *io_message,
-                            gcry_ac_io_t *io_signature)
-{
-  gcry_ac_io_t io_em;
-  gcry_error_t err;
-  gcry_ac_data_t data_signed;
-  unsigned char *em;
-  size_t em_n;
-  void *opts_em;
-  gcry_mpi_t mpi_signature;
-  gcry_mpi_t mpi_data;
-  ac_scheme_t *scheme;
-  char *elements_sig;
-  size_t elements_sig_n;
-  unsigned char *s;
-  size_t s_n;
-
-  (void)flags;
-
-  if (fips_mode ())
-    return gpg_error (GPG_ERR_NOT_SUPPORTED);
-
-  mpi_signature = NULL;
-  elements_sig = NULL;
-  data_signed = NULL;
-  mpi_data = NULL;
-  opts_em = NULL;
-  em = NULL;
-  s = NULL;
-
-  if (key->type != GCRY_AC_KEY_PUBLIC)
-    {
-      err = gcry_error (GPG_ERR_WRONG_KEY_USAGE);
-      goto out;
-    }
-
-  scheme = ac_scheme_get (scheme_id);
-  if (! scheme)
-    {
-      err = gcry_error (GPG_ERR_NO_SIGNATURE_SCHEME);
-      goto out;
-    }
-
-  err = ac_dencode_prepare (handle, key, opts, *scheme, &opts_em);
-  if (err)
-    goto out;
-
-  _gcry_ac_io_init (&io_em, GCRY_AC_IO_WRITABLE,
-                   GCRY_AC_IO_STRING, &em, &em_n);
-
-  err = _gcry_ac_data_encode (scheme->scheme_encoding, 0, opts_em,
-                             io_message, &io_em);
-  if (err)
-    goto out;
-
-  mpi_data = gcry_mpi_new (0);
-  _gcry_ac_os_to_mpi (mpi_data, em, em_n);
-
-  err = _gcry_ac_io_read_all (io_signature, &s, &s_n);
-  if (err)
-    goto out;
-
-  mpi_signature = gcry_mpi_new (0);
-  _gcry_ac_os_to_mpi (mpi_signature, s, s_n);
-
-  err = _gcry_pk_get_elements (handle->algorithm, NULL, &elements_sig);
-  if (err)
-    goto out;
-
-  elements_sig_n = strlen (elements_sig);
-  if (elements_sig_n != 1)
-    {
-      /* FIXME? */
-      err = gcry_error (GPG_ERR_CONFLICT);
-      goto out;
-    }
-
-  err = _gcry_ac_data_new (&data_signed);
-  if (err)
-    goto out;
-
-  err = _gcry_ac_data_set (data_signed, GCRY_AC_FLAG_COPY | 
GCRY_AC_FLAG_DEALLOC,
-                          elements_sig, mpi_signature);
-  if (err)
-    goto out;
-
-  gcry_mpi_release (mpi_signature);
-  mpi_signature = NULL;
-
-  err = _gcry_ac_data_verify (handle, key, mpi_data, data_signed);
-
- out:
-
-  _gcry_ac_data_destroy (data_signed);
-  gcry_mpi_release (mpi_signature);
-  gcry_mpi_release (mpi_data);
-  free (elements_sig);
-  gcry_free (opts_em);
-  gcry_free (em);
-  gcry_free (s);
-
-  return err;
-}
-
-
-/*
- * General functions.
- */
-
-gcry_err_code_t
-_gcry_ac_init (void)
-{
-  if (fips_mode ())
-    return GPG_ERR_NOT_SUPPORTED;
-
-  return 0;
-}
diff --git a/grub-core/lib/libgcrypt/cipher/arcfour-amd64.S 
b/grub-core/lib/libgcrypt/cipher/arcfour-amd64.S
new file mode 100644
index 000000000..2abd90a7e
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/arcfour-amd64.S
@@ -0,0 +1,108 @@
+/*
+** RC4 implementation optimized for AMD64.
+**
+** Author: Marc Bevand <bevand_m (at) epita.fr>
+** Licence: I hereby disclaim the copyright on this code and place it
+** in the public domain.
+**
+** The throughput achieved by this code is about 320 MBytes/sec, on
+** a 1.8 GHz AMD Opteron (rev C0) processor.
+**
+** 2013/12/20 <jussi.kivilinna@iki.fi>:
+**  - Integrated to libgcrypt
+**  - 4.18 cycles/byte on Intel i5-4570
+*/
+
+#ifdef __x86_64__
+#include <config.h>
+#if defined(USE_ARCFOUR) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || 
\
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#include "asm-common-amd64.h"
+
+.text
+.align 16
+.globl _gcry_arcfour_amd64
+ELF(.type _gcry_arcfour_amd64,@function)
+_gcry_arcfour_amd64:
+       CFI_STARTPROC()
+       ENTER_SYSV_FUNC_PARAMS_0_4
+       push    %rbp
+       CFI_PUSH(%rbp)
+       push    %rbx
+       CFI_PUSH(%rbx)
+       mov     %rdi,           %rbp    # key = ARG(key)
+       mov     %rsi,           %rbx    # rbx = ARG(len)
+       mov     %rdx,           %rsi    # in = ARG(in)
+       mov     %rcx,           %rdi    # out = ARG(out)
+       mov     (4*256)(%rbp),  %ecx    # x = key->x
+       mov     (4*256+4)(%rbp),%edx    # y = key->y
+       inc     %rcx                    # x++
+       and     $255,           %rcx    # x &= 0xff
+       lea     -8(%rbx,%rsi),  %rbx    # rbx = in+len-8
+       mov     %rbx,           %r9     # tmp = in+len-8
+       mov     (%rbp,%rcx,4),  %eax    # tx = d[x]
+       cmp     %rsi,           %rbx    # cmp in with in+len-8
+       jl      .Lend                   # jump if (in+len-8 < in)
+
+.Lstart:
+       add     $8,             %rsi            # increment in
+       add     $8,             %rdi            # increment out
+
+       # generate the next 8 bytes of the rc4 stream into %r8
+       mov     $8,             %r11            # byte counter
+1:     add     %al,            %dl             # y += tx
+       mov     (%rbp,%rdx,4),  %ebx            # ty = d[y]
+       mov     %ebx,           (%rbp,%rcx,4)   # d[x] = ty
+       add     %al,            %bl             # val = ty + tx
+       mov     %eax,           (%rbp,%rdx,4)   # d[y] = tx
+       inc     %cl                             # x++           (NEXT ROUND)
+       mov     (%rbp,%rcx,4),  %eax            # tx = d[x]     (NEXT ROUND)
+       shl     $8,             %r8
+       movb    (%rbp,%rbx,4),  %r8b            # val = d[val]
+       dec     %r11b
+       jnz 1b
+
+       # xor 8 bytes
+       bswap   %r8
+       xor     -8(%rsi),       %r8
+       cmp     %r9,            %rsi            # cmp in+len-8 with in
+       mov     %r8,            -8(%rdi)
+       jle     .Lstart                         # jump if (in <= in+len-8)
+
+.Lend:
+       add     $8,             %r9             # tmp = in+len
+
+       # handle the last bytes, one by one
+1:     cmp     %rsi,           %r9             # cmp in with in+len
+       jle     .Lfinished                      # jump if (in+len <= in)
+       add     %al,            %dl             # y += tx
+       mov     (%rbp,%rdx,4),  %ebx            # ty = d[y]
+       mov     %ebx,           (%rbp,%rcx,4)   # d[x] = ty
+       add     %al,            %bl             # val = ty + tx
+       mov     %eax,           (%rbp,%rdx,4)   # d[y] = tx
+       inc     %cl                             # x++           (NEXT ROUND)
+       mov     (%rbp,%rcx,4),  %eax            # tx = d[x]     (NEXT ROUND)
+       movb    (%rbp,%rbx,4),  %r8b            # val = d[val]
+       xor     (%rsi),         %r8b            # xor 1 byte
+       movb    %r8b,           (%rdi)
+       inc     %rsi                            # in++
+       inc     %rdi                            # out++
+       jmp 1b
+
+.Lfinished:
+       dec     %rcx                            # x--
+       movb    %cl,            (4*256)(%rbp)   # key->y = y
+       movb    %dl,            (4*256+4)(%rbp) # key->x = x
+       pop     %rbx
+       CFI_POP(%rbx)
+       pop     %rbp
+       CFI_POP(%rbp)
+       EXIT_SYSV_FUNC
+       ret_spec_stop
+       CFI_ENDPROC()
+.L__gcry_arcfour_amd64_end:
+ELF(.size _gcry_arcfour_amd64,.L__gcry_arcfour_amd64_end-_gcry_arcfour_amd64)
+
+#endif
+#endif
diff --git a/grub-core/lib/libgcrypt/cipher/arcfour.c 
b/grub-core/lib/libgcrypt/cipher/arcfour.c
index 6ef07fb20..c8d22c701 100644
--- a/grub-core/lib/libgcrypt/cipher/arcfour.c
+++ b/grub-core/lib/libgcrypt/cipher/arcfour.c
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * For a description of the algorithm, see:
  *   Bruce Schneier: Applied Cryptography. John Wiley & Sons, 1996.
@@ -30,18 +30,70 @@
 #include "types.h"
 #include "g10lib.h"
 #include "cipher.h"
+#include "cipher-internal.h"
+
+/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */
+#undef USE_AMD64_ASM
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AMD64_ASM 1
+#endif
 
 static const char *selftest(void);
 
+#ifdef USE_AMD64_ASM
+
+typedef struct {
+    u32 sbox[256];
+    u32 idx_i, idx_j;
+} ARCFOUR_context;
+
+void _gcry_arcfour_amd64(void *key, size_t len, const byte *indata,
+                        byte *outdata);
+
+static void
+encrypt_stream (void *context,
+                byte *outbuf, const byte *inbuf, size_t length)
+{
+  _gcry_arcfour_amd64 (context, length, inbuf, outbuf );
+}
+
+#else /*!USE_AMD64_ASM*/
+
 typedef struct {
-    int idx_i, idx_j;
     byte sbox[256];
+    int idx_i, idx_j;
 } ARCFOUR_context;
 
 static void
 do_encrypt_stream( ARCFOUR_context *ctx,
-                  byte *outbuf, const byte *inbuf, unsigned int length )
+                  byte *outbuf, const byte *inbuf, size_t length )
 {
+#ifndef __i386__
+  register unsigned int i = ctx->idx_i;
+  register byte j = ctx->idx_j;
+  register byte *sbox = ctx->sbox;
+  register byte t, u;
+
+  while ( length-- )
+    {
+      i++;
+      t = sbox[(byte)i];
+      j += t;
+      u = sbox[j];
+      sbox[(byte)i] = u;
+      u += t;
+      sbox[j] = t;
+      *outbuf++ = sbox[u] ^ *inbuf++;
+    }
+
+  ctx->idx_i = (byte)i;
+  ctx->idx_j = (byte)j;
+#else /*__i386__*/
+  /* Old implementation of arcfour is faster on i386 than the version above.
+   * This is because version above increases register pressure which on i386
+   * would push some of the variables to memory/stack.  Therefore keep this
+   * version for i386 to avoid regressing performance.  */
   register int i = ctx->idx_i;
   register int j = ctx->idx_j;
   register byte *sbox = ctx->sbox;
@@ -59,17 +111,20 @@ do_encrypt_stream( ARCFOUR_context *ctx,
 
   ctx->idx_i = i;
   ctx->idx_j = j;
+#endif
 }
 
 static void
 encrypt_stream (void *context,
-                byte *outbuf, const byte *inbuf, unsigned int length)
+                byte *outbuf, const byte *inbuf, size_t length)
 {
   ARCFOUR_context *ctx = (ARCFOUR_context *) context;
   do_encrypt_stream (ctx, outbuf, inbuf, length );
   _gcry_burn_stack (64);
 }
 
+#endif /*!USE_AMD64_ASM*/
+
 
 static gcry_err_code_t
 do_arcfour_setkey (void *context, const byte *key, unsigned int keylen)
@@ -96,27 +151,32 @@ do_arcfour_setkey (void *context, const byte *key, unsigned int keylen)
   ctx->idx_i = ctx->idx_j = 0;
   for (i=0; i < 256; i++ )
     ctx->sbox[i] = i;
-  for (i=0; i < 256; i++ )
-    karr[i] = key[i%keylen];
+  for (i=j=0; i < 256; i++,j++ )
+    {
+      if (j >= keylen)
+        j = 0;
+      karr[i] = key[j];
+    }
   for (i=j=0; i < 256; i++ )
     {
       int t;
-      j = (j + ctx->sbox[i] + karr[i]) % 256;
+      j = (j + ctx->sbox[i] + karr[i]) & 255;
       t = ctx->sbox[i];
       ctx->sbox[i] = ctx->sbox[j];
       ctx->sbox[j] = t;
     }
-  memset( karr, 0, 256 );
+  wipememory( karr, sizeof(karr) );
 
   return GPG_ERR_NO_ERROR;
 }
 
 static gcry_err_code_t
-arcfour_setkey ( void *context, const byte *key, unsigned int keylen )
+arcfour_setkey ( void *context, const byte *key, unsigned int keylen,
+                 cipher_bulk_ops_t *bulk_ops )
 {
   ARCFOUR_context *ctx = (ARCFOUR_context *) context;
   gcry_err_code_t rc = do_arcfour_setkey (ctx, key, keylen );
-  _gcry_burn_stack (300);
+  (void)bulk_ops;
   return rc;
 }
 
@@ -129,18 +189,18 @@ selftest(void)
 
   /* Test vector from Cryptlib labeled there: "from the
      State/Commerce Department". */
-  static byte key_1[] =
+  static const byte key_1[] =
     { 0x61, 0x8A, 0x63, 0xD2, 0xFB };
-  static byte plaintext_1[] =
+  static const byte plaintext_1[] =
     { 0xDC, 0xEE, 0x4C, 0xF9, 0x2C };
   static const byte ciphertext_1[] =
     { 0xF1, 0x38, 0x29, 0xC9, 0xDE };
 
-  arcfour_setkey( &ctx, key_1, sizeof(key_1));
+  arcfour_setkey( &ctx, key_1, sizeof(key_1), NULL);
   encrypt_stream( &ctx, scratch, plaintext_1, sizeof(plaintext_1));
   if ( memcmp (scratch, ciphertext_1, sizeof (ciphertext_1)))
     return "Arcfour encryption test 1 failed.";
-  arcfour_setkey( &ctx, key_1, sizeof(key_1));
+  arcfour_setkey( &ctx, key_1, sizeof(key_1), NULL);
   encrypt_stream(&ctx, scratch, scratch, sizeof(plaintext_1)); /* decrypt */
   if ( memcmp (scratch, plaintext_1, sizeof (plaintext_1)))
     return "Arcfour decryption test 1 failed.";
@@ -150,6 +210,7 @@ selftest(void)
 
 gcry_cipher_spec_t _gcry_cipher_spec_arcfour =
   {
+    GCRY_CIPHER_ARCFOUR, {0, 0},
     "ARCFOUR", NULL, NULL, 1, 128, sizeof (ARCFOUR_context),
     arcfour_setkey, NULL, NULL, encrypt_stream, encrypt_stream,
   };
diff --git a/grub-core/lib/libgcrypt/cipher/aria-aesni-avx-amd64.S b/grub-core/lib/libgcrypt/cipher/aria-aesni-avx-amd64.S
new file mode 100644
index 000000000..2a88c1e72
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/aria-aesni-avx-amd64.S
@@ -0,0 +1,1440 @@
+/* aria-aesni-avx-amd64.S  -  AESNI/GFNI/AVX implementation of ARIA cipher
+ *
+ * Copyright (C) 2022-2023 Taehee Yoo <ap420073@gmail.com>
+ * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+    defined(ENABLE_AVX_SUPPORT) && defined(ENABLE_AESNI_SUPPORT)
+
+#include "asm-common-amd64.h"
+
+#ifdef ENABLE_GFNI_SUPPORT
+#  define CONFIG_AS_GFNI 1
+#endif
+
+/* struct ARIA_context: */
+#define ARIA_BLOCK_SIZE  16
+#define ARIA_MAX_RD_KEYS 17
+#define ARIA_CTX_enc_key 0
+#define ARIA_CTX_dec_key (ARIA_CTX_enc_key + (ARIA_BLOCK_SIZE * ARIA_MAX_RD_KEYS))
+#define ARIA_CTX_rounds (ARIA_CTX_dec_key + (ARIA_BLOCK_SIZE * ARIA_MAX_RD_KEYS))
+
+/* register macros */
+#define CTX %rdi
+
+/* helper macros */
+#define STACK_DEPTH (2 * 8 + 16 * 16 + 15)
+
+#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)            \
+       ( (((a0) & 1) << 0) |                           \
+         (((a1) & 1) << 1) |                           \
+         (((a2) & 1) << 2) |                           \
+         (((a3) & 1) << 3) |                           \
+         (((a4) & 1) << 4) |                           \
+         (((a5) & 1) << 5) |                           \
+         (((a6) & 1) << 6) |                           \
+         (((a7) & 1) << 7) )
+
+#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)          \
+       ( ((l7) << (0 * 8)) |                           \
+         ((l6) << (1 * 8)) |                           \
+         ((l5) << (2 * 8)) |                           \
+         ((l4) << (3 * 8)) |                           \
+         ((l3) << (4 * 8)) |                           \
+         ((l2) << (5 * 8)) |                           \
+         ((l1) << (6 * 8)) |                           \
+         ((l0) << (7 * 8)) )
+
+/* asm macros */
+#define inc_le128(x, minus_one, tmp)                   \
+       vpcmpeqq minus_one, x, tmp;                     \
+       vpsubq minus_one, x, x;                         \
+       vpslldq $8, tmp, tmp;                           \
+       vpsubq tmp, x, x;
+
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)     \
+       vpand x, mask4bit, tmp0;                        \
+       vpandn x, mask4bit, x;                          \
+       vpsrld $4, x, x;                                \
+                                                       \
+       vpshufb tmp0, lo_t, tmp0;                       \
+       vpshufb x, hi_t, x;                             \
+       vpxor tmp0, x, x;
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2)          \
+       vpunpckhdq x1, x0, t2;                          \
+       vpunpckldq x1, x0, x0;                          \
+                                                       \
+       vpunpckldq x3, x2, t1;                          \
+       vpunpckhdq x3, x2, x2;                          \
+                                                       \
+       vpunpckhqdq t1, x0, x1;                         \
+       vpunpcklqdq t1, x0, x0;                         \
+                                                       \
+       vpunpckhqdq x2, t2, x3;                         \
+       vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b(a0, b0, c0, d0,               \
+                        a1, b1, c1, d1,                \
+                        a2, b2, c2, d2,                \
+                        a3, b3, c3, d3,                \
+                        st0, st1)                      \
+       vmovdqu d2, st0;                                \
+       vmovdqu d3, st1;                                \
+       transpose_4x4(a0, a1, a2, a3, d2, d3);          \
+       transpose_4x4(b0, b1, b2, b3, d2, d3);          \
+       vmovdqu st0, d2;                                \
+       vmovdqu st1, d3;                                \
+                                                       \
+       vmovdqu a0, st0;                                \
+       vmovdqu a1, st1;                                \
+       transpose_4x4(c0, c1, c2, c3, a0, a1);          \
+       transpose_4x4(d0, d1, d2, d3, a0, a1);          \
+                                                       \
+       vmovdqu .Lshufb_16x16b rRIP, a0;                \
+       vmovdqu st1, a1;                                \
+       vpshufb a0, a2, a2;                             \
+       vpshufb a0, a3, a3;                             \
+       vpshufb a0, b0, b0;                             \
+       vpshufb a0, b1, b1;                             \
+       vpshufb a0, b2, b2;                             \
+       vpshufb a0, b3, b3;                             \
+       vpshufb a0, a1, a1;                             \
+       vpshufb a0, c0, c0;                             \
+       vpshufb a0, c1, c1;                             \
+       vpshufb a0, c2, c2;                             \
+       vpshufb a0, c3, c3;                             \
+       vpshufb a0, d0, d0;                             \
+       vpshufb a0, d1, d1;                             \
+       vpshufb a0, d2, d2;                             \
+       vpshufb a0, d3, d3;                             \
+       vmovdqu d3, st1;                                \
+       vmovdqu st0, d3;                                \
+       vpshufb a0, d3, a0;                             \
+       vmovdqu d2, st0;                                \
+                                                       \
+       transpose_4x4(a0, b0, c0, d0, d2, d3);          \
+       transpose_4x4(a1, b1, c1, d1, d2, d3);          \
+       vmovdqu st0, d2;                                \
+       vmovdqu st1, d3;                                \
+                                                       \
+       vmovdqu b0, st0;                                \
+       vmovdqu b1, st1;                                \
+       transpose_4x4(a2, b2, c2, d2, b0, b1);          \
+       transpose_4x4(a3, b3, c3, d3, b0, b1);          \
+       vmovdqu st0, b0;                                \
+       vmovdqu st1, b1;                                \
+       /* does not adjust output bytes inside vectors */
+
+#define debyteslice_16x16b(a0, b0, c0, d0,             \
+                          a1, b1, c1, d1,              \
+                          a2, b2, c2, d2,              \
+                          a3, b3, c3, d3,              \
+                          st0, st1)                    \
+       vmovdqu d2, st0;                                \
+       vmovdqu d3, st1;                                \
+       transpose_4x4(a0, a1, a2, a3, d2, d3);          \
+       transpose_4x4(b0, b1, b2, b3, d2, d3);          \
+       vmovdqu st0, d2;                                \
+       vmovdqu st1, d3;                                \
+                                                       \
+       vmovdqu a0, st0;                                \
+       vmovdqu a1, st1;                                \
+       transpose_4x4(c0, c1, c2, c3, a0, a1);          \
+       transpose_4x4(d0, d1, d2, d3, a0, a1);          \
+                                                       \
+       vmovdqu .Lshufb_16x16b rRIP, a0;                \
+       vmovdqu st1, a1;                                \
+       vpshufb a0, a2, a2;                             \
+       vpshufb a0, a3, a3;                             \
+       vpshufb a0, b0, b0;                             \
+       vpshufb a0, b1, b1;                             \
+       vpshufb a0, b2, b2;                             \
+       vpshufb a0, b3, b3;                             \
+       vpshufb a0, a1, a1;                             \
+       vpshufb a0, c0, c0;                             \
+       vpshufb a0, c1, c1;                             \
+       vpshufb a0, c2, c2;                             \
+       vpshufb a0, c3, c3;                             \
+       vpshufb a0, d0, d0;                             \
+       vpshufb a0, d1, d1;                             \
+       vpshufb a0, d2, d2;                             \
+       vpshufb a0, d3, d3;                             \
+       vmovdqu d3, st1;                                \
+       vmovdqu st0, d3;                                \
+       vpshufb a0, d3, a0;                             \
+       vmovdqu d2, st0;                                \
+                                                       \
+       transpose_4x4(c0, d0, a0, b0, d2, d3);          \
+       transpose_4x4(c1, d1, a1, b1, d2, d3);          \
+       vmovdqu st0, d2;                                \
+       vmovdqu st1, d3;                                \
+                                                       \
+       vmovdqu b0, st0;                                \
+       vmovdqu b1, st1;                                \
+       transpose_4x4(c2, d2, a2, b2, b0, b1);          \
+       transpose_4x4(c3, d3, a3, b3, b0, b1);          \
+       vmovdqu st0, b0;                                \
+       vmovdqu st1, b1;                                \
+       /* does not adjust output bytes inside vectors */
+
+/* load blocks to registers */
+#define inpack16_pre(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    rio)                               \
+       vmovdqu (0 * 16)(rio), x0;                      \
+       vmovdqu (1 * 16)(rio), x1;                      \
+       vmovdqu (2 * 16)(rio), x2;                      \
+       vmovdqu (3 * 16)(rio), x3;                      \
+       vmovdqu (4 * 16)(rio), x4;                      \
+       vmovdqu (5 * 16)(rio), x5;                      \
+       vmovdqu (6 * 16)(rio), x6;                      \
+       vmovdqu (7 * 16)(rio), x7;                      \
+       vmovdqu (8 * 16)(rio), y0;                      \
+       vmovdqu (9 * 16)(rio), y1;                      \
+       vmovdqu (10 * 16)(rio), y2;                     \
+       vmovdqu (11 * 16)(rio), y3;                     \
+       vmovdqu (12 * 16)(rio), y4;                     \
+       vmovdqu (13 * 16)(rio), y5;                     \
+       vmovdqu (14 * 16)(rio), y6;                     \
+       vmovdqu (15 * 16)(rio), y7;
+
+/* byteslice blocks and store to temporary memory */
+#define inpack16_post(x0, x1, x2, x3,                  \
+                     x4, x5, x6, x7,                   \
+                     y0, y1, y2, y3,                   \
+                     y4, y5, y6, y7,                   \
+                     mem_ab, mem_cd)                   \
+       byteslice_16x16b(x0, x1, x2, x3,                \
+                        x4, x5, x6, x7,                \
+                        y0, y1, y2, y3,                \
+                        y4, y5, y6, y7,                \
+                        (mem_ab), (mem_cd));           \
+                                                       \
+       vmovdqu x0, 0 * 16(mem_ab);                     \
+       vmovdqu x1, 1 * 16(mem_ab);                     \
+       vmovdqu x2, 2 * 16(mem_ab);                     \
+       vmovdqu x3, 3 * 16(mem_ab);                     \
+       vmovdqu x4, 4 * 16(mem_ab);                     \
+       vmovdqu x5, 5 * 16(mem_ab);                     \
+       vmovdqu x6, 6 * 16(mem_ab);                     \
+       vmovdqu x7, 7 * 16(mem_ab);                     \
+       vmovdqu y0, 0 * 16(mem_cd);                     \
+       vmovdqu y1, 1 * 16(mem_cd);                     \
+       vmovdqu y2, 2 * 16(mem_cd);                     \
+       vmovdqu y3, 3 * 16(mem_cd);                     \
+       vmovdqu y4, 4 * 16(mem_cd);                     \
+       vmovdqu y5, 5 * 16(mem_cd);                     \
+       vmovdqu y6, 6 * 16(mem_cd);                     \
+       vmovdqu y7, 7 * 16(mem_cd);
+
+#define write_output(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    mem)                               \
+       vmovdqu x0, 0 * 16(mem);                        \
+       vmovdqu x1, 1 * 16(mem);                        \
+       vmovdqu x2, 2 * 16(mem);                        \
+       vmovdqu x3, 3 * 16(mem);                        \
+       vmovdqu x4, 4 * 16(mem);                        \
+       vmovdqu x5, 5 * 16(mem);                        \
+       vmovdqu x6, 6 * 16(mem);                        \
+       vmovdqu x7, 7 * 16(mem);                        \
+       vmovdqu y0, 8 * 16(mem);                        \
+       vmovdqu y1, 9 * 16(mem);                        \
+       vmovdqu y2, 10 * 16(mem);                       \
+       vmovdqu y3, 11 * 16(mem);                       \
+       vmovdqu y4, 12 * 16(mem);                       \
+       vmovdqu y5, 13 * 16(mem);                       \
+       vmovdqu y6, 14 * 16(mem);                       \
+       vmovdqu y7, 15 * 16(mem);
+
+#define vload_if_enough_nblks(blk_offs, rnblks, rio, v)        \
+       vpxor v, v, v;                                  \
+       cmp $(blk_offs), rnblks;                        \
+       jbe 1f;                                         \
+       vmovdqu (blk_offs * 16)(rio), v;                \
+       1:;
+
+#define vstore_if_enough_nblks(blk_offs, rnblks, mem, v)\
+       cmp $(blk_offs), rnblks;                        \
+       jbe 1f;                                         \
+       vmovdqu v, (blk_offs * 16)(mem);                \
+       1:;
+
+#define inpack_1_15_pre(x0, x1, x2, x3,                        \
+                       x4, x5, x6, x7,                 \
+                       y0, y1, y2, y3,                 \
+                       y4, y5, y6, y7,                 \
+                       rio, rnblks)                    \
+       vmovdqu (0 * 16)(rio), x0;                      \
+       vload_if_enough_nblks(1, rnblks, rio, x1);      \
+       vload_if_enough_nblks(2, rnblks, rio, x2);      \
+       vload_if_enough_nblks(3, rnblks, rio, x3);      \
+       vload_if_enough_nblks(4, rnblks, rio, x4);      \
+       vload_if_enough_nblks(5, rnblks, rio, x5);      \
+       vload_if_enough_nblks(6, rnblks, rio, x6);      \
+       vload_if_enough_nblks(7, rnblks, rio, x7);      \
+       vload_if_enough_nblks(8, rnblks, rio, y0);      \
+       vload_if_enough_nblks(9, rnblks, rio, y1);      \
+       vload_if_enough_nblks(10, rnblks, rio, y2);     \
+       vload_if_enough_nblks(11, rnblks, rio, y3);     \
+       vload_if_enough_nblks(12, rnblks, rio, y4);     \
+       vload_if_enough_nblks(13, rnblks, rio, y5);     \
+       vload_if_enough_nblks(14, rnblks, rio, y6);     \
+       vpxor y7, y7, y7;
+
+#define write_output_1_15(x0, x1, x2, x3,              \
+                         x4, x5, x6, x7,               \
+                         y0, y1, y2, y3,               \
+                         y4, y5, y6, y7,               \
+                         mem, rnblks)                  \
+       vmovdqu x0, (0 * 16)(mem);                      \
+       vstore_if_enough_nblks(1, rnblks, mem, x1);     \
+       vstore_if_enough_nblks(2, rnblks, mem, x2);     \
+       vstore_if_enough_nblks(3, rnblks, mem, x3);     \
+       vstore_if_enough_nblks(4, rnblks, mem, x4);     \
+       vstore_if_enough_nblks(5, rnblks, mem, x5);     \
+       vstore_if_enough_nblks(6, rnblks, mem, x6);     \
+       vstore_if_enough_nblks(7, rnblks, mem, x7);     \
+       vstore_if_enough_nblks(8, rnblks, mem, y0);     \
+       vstore_if_enough_nblks(9, rnblks, mem, y1);     \
+       vstore_if_enough_nblks(10, rnblks, mem, y2);    \
+       vstore_if_enough_nblks(11, rnblks, mem, y3);    \
+       vstore_if_enough_nblks(12, rnblks, mem, y4);    \
+       vstore_if_enough_nblks(13, rnblks, mem, y5);    \
+       vstore_if_enough_nblks(14, rnblks, mem, y6);
+
+#define aria_store_state_8way(x0, x1, x2, x3,          \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, idx)             \
+       vmovdqu x0, ((idx + 0) * 16)(mem_tmp);          \
+       vmovdqu x1, ((idx + 1) * 16)(mem_tmp);          \
+       vmovdqu x2, ((idx + 2) * 16)(mem_tmp);          \
+       vmovdqu x3, ((idx + 3) * 16)(mem_tmp);          \
+       vmovdqu x4, ((idx + 4) * 16)(mem_tmp);          \
+       vmovdqu x5, ((idx + 5) * 16)(mem_tmp);          \
+       vmovdqu x6, ((idx + 6) * 16)(mem_tmp);          \
+       vmovdqu x7, ((idx + 7) * 16)(mem_tmp);
+
+#define aria_load_state_8way(x0, x1, x2, x3,           \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, idx)              \
+       vmovdqu ((idx + 0) * 16)(mem_tmp), x0;          \
+       vmovdqu ((idx + 1) * 16)(mem_tmp), x1;          \
+       vmovdqu ((idx + 2) * 16)(mem_tmp), x2;          \
+       vmovdqu ((idx + 3) * 16)(mem_tmp), x3;          \
+       vmovdqu ((idx + 4) * 16)(mem_tmp), x4;          \
+       vmovdqu ((idx + 5) * 16)(mem_tmp), x5;          \
+       vmovdqu ((idx + 6) * 16)(mem_tmp), x6;          \
+       vmovdqu ((idx + 7) * 16)(mem_tmp), x7;
+
+#define aria_ark_8way(x0, x1, x2, x3,                  \
+                     x4, x5, x6, x7,                   \
+                     t0, t1, t2, rk,                   \
+                     idx, round)                       \
+       /* AddRoundKey */                               \
+       vmovd ((round * 16) + idx + 0)(rk), t0;         \
+       vpshufb .Lthree_x16 rRIP, t0, t2;               \
+       vpxor t2, x0, x0;                               \
+       vpshufb .Ltwo_x16 rRIP, t0, t2;                 \
+       vpxor t2, x1, x1;                               \
+       vpshufb .Lone_x16 rRIP, t0, t2;                 \
+       vpxor t2, x2, x2;                               \
+       vpshufb t1, t0, t2;                             \
+       vpxor t2, x3, x3;                               \
+       vmovd ((round * 16) + idx + 4)(rk), t0;         \
+       vpshufb .Lthree_x16 rRIP, t0, t2;               \
+       vpxor t2, x4, x4;                               \
+       vpshufb .Ltwo_x16 rRIP, t0, t2;                 \
+       vpxor t2, x5, x5;                               \
+       vpshufb .Lone_x16 rRIP, t0, t2;                 \
+       vpxor t2, x6, x6;                               \
+       vpshufb t1, t0, t2;                             \
+       vpxor t2, x7, x7;
+
+#ifdef CONFIG_AS_GFNI
+#define aria_sbox_8way_gfni(x0, x1, x2, x3,            \
+                           x4, x5, x6, x7,             \
+                           t0, t1, t2, t3,             \
+                           t4, t5, t6, t7)             \
+       vmovddup .Ltf_s2_bitmatrix rRIP, t0;            \
+       vmovddup .Ltf_inv_bitmatrix rRIP, t1;           \
+       vmovddup .Ltf_id_bitmatrix rRIP, t2;            \
+       vmovddup .Ltf_aff_bitmatrix rRIP, t3;           \
+       vmovddup .Ltf_x2_bitmatrix rRIP, t4;            \
+       vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;   \
+       vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;   \
+       vgf2p8affineqb $(tf_inv_const), t1, x2, x2;     \
+       vgf2p8affineqb $(tf_inv_const), t1, x6, x6;     \
+       vgf2p8affineinvqb $0, t2, x2, x2;               \
+       vgf2p8affineinvqb $0, t2, x6, x6;               \
+       vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;  \
+       vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;  \
+       vgf2p8affineqb $(tf_x2_const), t4, x3, x3;      \
+       vgf2p8affineqb $(tf_x2_const), t4, x7, x7;      \
+       vgf2p8affineinvqb $0, t2, x3, x3;               \
+       vgf2p8affineinvqb $0, t2, x7, x7
+#endif /* CONFIG_AS_GFNI */
+
+#define aria_sbox_8way(x0, x1, x2, x3,                 \
+                      x4, x5, x6, x7,                  \
+                      t0, t1, t2, t3,                  \
+                      t4, t5, t6, t7)                  \
+       vmovdqa .Linv_shift_row rRIP, t0;               \
+       vmovdqa .Lshift_row rRIP, t1;                   \
+       vbroadcastss .L0f0f0f0f rRIP, t6;               \
+       vmovdqa .Ltf_lo__inv_aff__and__s2 rRIP, t2;     \
+       vmovdqa .Ltf_hi__inv_aff__and__s2 rRIP, t3;     \
+       vmovdqa .Ltf_lo__x2__and__fwd_aff rRIP, t4;     \
+       vmovdqa .Ltf_hi__x2__and__fwd_aff rRIP, t5;     \
+                                                       \
+       vaesenclast t7, x0, x0;                         \
+       vaesenclast t7, x4, x4;                         \
+       vaesenclast t7, x1, x1;                         \
+       vaesenclast t7, x5, x5;                         \
+       vaesdeclast t7, x2, x2;                         \
+       vaesdeclast t7, x6, x6;                         \
+                                                       \
+       /* AES inverse shift rows */                    \
+       vpshufb t0, x0, x0;                             \
+       vpshufb t0, x4, x4;                             \
+       vpshufb t0, x1, x1;                             \
+       vpshufb t0, x5, x5;                             \
+       vpshufb t1, x3, x3;                             \
+       vpshufb t1, x7, x7;                             \
+       vpshufb t1, x2, x2;                             \
+       vpshufb t1, x6, x6;                             \
+                                                       \
+       /* affine transformation for S2 */              \
+       filter_8bit(x1, t2, t3, t6, t0);                \
+       /* affine transformation for S2 */              \
+       filter_8bit(x5, t2, t3, t6, t0);                \
+                                                       \
+       /* affine transformation for X2 */              \
+       filter_8bit(x3, t4, t5, t6, t0);                \
+       /* affine transformation for X2 */              \
+       filter_8bit(x7, t4, t5, t6, t0);                \
+       vaesdeclast t7, x3, x3;                         \
+       vaesdeclast t7, x7, x7;
+
+#define aria_diff_m(x0, x1, x2, x3,                    \
+                   t0, t1, t2, t3)                     \
+       /* T = rotr32(X, 8); */                         \
+       /* X ^= T */                                    \
+       vpxor x0, x3, t0;                               \
+       vpxor x1, x0, t1;                               \
+       vpxor x2, x1, t2;                               \
+       vpxor x3, x2, t3;                               \
+       /* X = T ^ rotr(X, 16); */                      \
+       vpxor t2, x0, x0;                               \
+       vpxor x1, t3, t3;                               \
+       vpxor t0, x2, x2;                               \
+       vpxor t1, x3, x1;                               \
+       vmovdqu t3, x3;
+
+#define aria_diff_word(x0, x1, x2, x3,                 \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7)                  \
+       /* t1 ^= t2; */                                 \
+       vpxor y0, x4, x4;                               \
+       vpxor y1, x5, x5;                               \
+       vpxor y2, x6, x6;                               \
+       vpxor y3, x7, x7;                               \
+                                                       \
+       /* t2 ^= t3; */                                 \
+       vpxor y4, y0, y0;                               \
+       vpxor y5, y1, y1;                               \
+       vpxor y6, y2, y2;                               \
+       vpxor y7, y3, y3;                               \
+                                                       \
+       /* t0 ^= t1; */                                 \
+       vpxor x4, x0, x0;                               \
+       vpxor x5, x1, x1;                               \
+       vpxor x6, x2, x2;                               \
+       vpxor x7, x3, x3;                               \
+                                                       \
+       /* t3 ^= t1; */                                 \
+       vpxor x4, y4, y4;                               \
+       vpxor x5, y5, y5;                               \
+       vpxor x6, y6, y6;                               \
+       vpxor x7, y7, y7;                               \
+                                                       \
+       /* t2 ^= t0; */                                 \
+       vpxor x0, y0, y0;                               \
+       vpxor x1, y1, y1;                               \
+       vpxor x2, y2, y2;                               \
+       vpxor x3, y3, y3;                               \
+                                                       \
+       /* t1 ^= t2; */                                 \
+       vpxor y0, x4, x4;                               \
+       vpxor y1, x5, x5;                               \
+       vpxor y2, x6, x6;                               \
+       vpxor y3, x7, x7;
+
+/*
+ * ARIA even-round function (FE) over 16 byte-sliced blocks.
+ * Applies AddRoundKey (aria_ark_8way) and the substitution layer
+ * (aria_sbox_8way) to both 8-register halves of the state -- the
+ * second half is spilled to/reloaded from mem_tmp -- then the
+ * diffusion layer (aria_diff_m, aria_diff_word) finished by the
+ * byte-permuting final store.  y0..y7 are clobbered as temporaries;
+ * y7 is zeroed up front for use inside aria_ark_8way.
+ * NOTE(review): the swapped register order (x2, x3, x0, x1, ...) fed
+ * to aria_sbox_8way selects the even-round S-box arrangement --
+ * confirm against the reference implementation in aria.c.
+ */
+#define aria_fe(x0, x1, x2, x3,                                \
+               x4, x5, x6, x7,                         \
+               y0, y1, y2, y3,                         \
+               y4, y5, y6, y7,                         \
+               mem_tmp, rk, round)                     \
+       vpxor y7, y7, y7;                               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 8, round);        \
+                                                       \
+       aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7); \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 0, round);        \
+                                                       \
+       aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7); \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 0);              \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);               \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7);                 \
+       /* aria_diff_byte()                             \
+        * T3 = ABCD -> BADC                            \
+        * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6        \
+        * T0 = ABCD -> CDAB                            \
+        * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1        \
+        * T1 = ABCD -> DCBA                            \
+        * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4        \
+        */                                             \
+       aria_diff_word(x2, x3, x0, x1,                  \
+                      x7, x6, x5, x4,                  \
+                      y0, y1, y2, y3,                  \
+                      y5, y4, y7, y6);                 \
+       aria_store_state_8way(x3, x2, x1, x0,           \
+                             x6, x7, x4, x5,           \
+                             mem_tmp, 0);
+
+/*
+ * ARIA odd-round function (FO) over 16 byte-sliced blocks.
+ * Same ARK + S-box + diffusion structure as aria_fe, but the x
+ * registers are passed to aria_sbox_8way in natural order (odd-round
+ * S-box set) and the final aria_diff_word/store use a different byte
+ * permutation (see the aria_diff_byte() comment below).
+ * y0..y7 are clobbered as temporaries.
+ */
+#define aria_fo(x0, x1, x2, x3,                                \
+               x4, x5, x6, x7,                         \
+               y0, y1, y2, y3,                         \
+               y4, y5, y6, y7,                         \
+               mem_tmp, rk, round)                     \
+       vpxor y7, y7, y7;                               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 8, round);        \
+                                                       \
+       aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7); \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 0, round);        \
+                                                       \
+       aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7); \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 0);              \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);               \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7);                 \
+       /* aria_diff_byte()                             \
+        * T1 = ABCD -> BADC                            \
+        * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6        \
+        * T2 = ABCD -> CDAB                            \
+        * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1       \
+        * T3 = ABCD -> DCBA                            \
+        * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4        \
+        */                                             \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x5, x4, x7, x6,                  \
+                      y2, y3, y0, y1,                  \
+                      y7, y6, y5, y4);                 \
+       aria_store_state_8way(x3, x2, x1, x0,           \
+                             x6, x7, x4, x5,           \
+                             mem_tmp, 0);
+
+/*
+ * ARIA final-round function (FF): the substitution layer is
+ * sandwiched between two AddRoundKey steps (key `round`, then key
+ * `last_round`) and no diffusion layer is applied.  On exit x0..x7
+ * hold the half processed from mem_tmp offset 0 and y0..y7 reload
+ * the half previously stored at offset 8.
+ */
+#define aria_ff(x0, x1, x2, x3,                                \
+               x4, x5, x6, x7,                         \
+               y0, y1, y2, y3,                         \
+               y4, y5, y6, y7,                         \
+               mem_tmp, rk, round, last_round)         \
+       vpxor y7, y7, y7;                               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 8, round);        \
+                                                       \
+       aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7); \
+                                                       \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 8, last_round);   \
+                                                       \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 0, round);        \
+                                                       \
+       aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7); \
+                                                       \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 0, last_round);   \
+                                                       \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);
+
+#ifdef CONFIG_AS_GFNI
+/*
+ * GFNI variant of aria_fe: identical round structure, but the
+ * substitution layer uses aria_sbox_8way_gfni (GF(2^8) affine
+ * instructions) instead of the AESNI-based S-box.
+ */
+#define aria_fe_gfni(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    mem_tmp, rk, round)                \
+       vpxor y7, y7, y7;                               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 8, round);        \
+                                                       \
+       aria_sbox_8way_gfni(x2, x3, x0, x1,             \
+                           x6, x7, x4, x5,             \
+                           y0, y1, y2, y3,             \
+                           y4, y5, y6, y7);            \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 0, round);        \
+                                                       \
+       aria_sbox_8way_gfni(x2, x3, x0, x1,             \
+                           x6, x7, x4, x5,             \
+                           y0, y1, y2, y3,             \
+                           y4, y5, y6, y7);            \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 0);              \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);               \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7);                 \
+       /* aria_diff_byte()                             \
+        * T3 = ABCD -> BADC                            \
+        * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6        \
+        * T0 = ABCD -> CDAB                            \
+        * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1        \
+        * T1 = ABCD -> DCBA                            \
+        * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4        \
+        */                                             \
+       aria_diff_word(x2, x3, x0, x1,                  \
+                      x7, x6, x5, x4,                  \
+                      y0, y1, y2, y3,                  \
+                      y5, y4, y7, y6);                 \
+       aria_store_state_8way(x3, x2, x1, x0,           \
+                             x6, x7, x4, x5,           \
+                             mem_tmp, 0);
+
+/*
+ * GFNI variant of aria_fo: identical round structure to aria_fo, with
+ * the substitution layer performed by aria_sbox_8way_gfni.
+ */
+#define aria_fo_gfni(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    mem_tmp, rk, round)                \
+       vpxor y7, y7, y7;                               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 8, round);        \
+                                                       \
+       aria_sbox_8way_gfni(x0, x1, x2, x3,             \
+                           x4, x5, x6, x7,             \
+                           y0, y1, y2, y3,             \
+                           y4, y5, y6, y7);            \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 0, round);        \
+                                                       \
+       aria_sbox_8way_gfni(x0, x1, x2, x3,             \
+                           x4, x5, x6, x7,             \
+                           y0, y1, y2, y3,             \
+                           y4, y5, y6, y7);            \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 0);              \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);               \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7);                 \
+       /* aria_diff_byte()                             \
+        * T1 = ABCD -> BADC                            \
+        * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6        \
+        * T2 = ABCD -> CDAB                            \
+        * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1       \
+        * T3 = ABCD -> DCBA                            \
+        * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4        \
+        */                                             \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x5, x4, x7, x6,                  \
+                      y2, y3, y0, y1,                  \
+                      y7, y6, y5, y4);                 \
+       aria_store_state_8way(x3, x2, x1, x0,           \
+                             x6, x7, x4, x5,           \
+                             mem_tmp, 0);
+
+/*
+ * GFNI variant of aria_ff (final round): S-box layer between keys
+ * `round` and `last_round`, no diffusion layer, substitution done by
+ * aria_sbox_8way_gfni.
+ */
+#define aria_ff_gfni(x0, x1, x2, x3,                   \
+               x4, x5, x6, x7,                         \
+               y0, y1, y2, y3,                         \
+               y4, y5, y6, y7,                         \
+               mem_tmp, rk, round, last_round)         \
+       vpxor y7, y7, y7;                               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 8, round);        \
+                                                       \
+       aria_sbox_8way_gfni(x2, x3, x0, x1,             \
+                           x6, x7, x4, x5,             \
+                           y0, y1, y2, y3,             \
+                           y4, y5, y6, y7);            \
+                                                       \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 8, last_round);   \
+                                                       \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 0, round);        \
+                                                       \
+       aria_sbox_8way_gfni(x2, x3, x0, x1,             \
+                           x6, x7, x4, x5,             \
+                           y0, y1, y2, y3,             \
+                           y4, y5, y6, y7);            \
+                                                       \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, y7, y2, rk, 0, last_round);   \
+                                                       \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);
+
+#endif /* CONFIG_AS_GFNI */
+
+
+SECTION_RODATA
+.align 16
+
+/* Gathers bytes 0,4,8,12 (+idx) of a 16-byte lane; building block of
+ * the 16x16 byte-matrix (byteslicing) shuffle mask below. */
+#define SHUFB_BYTES(idx) \
+       0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+
+/* vpshufb mask used by the byteslice transpose helpers -- presumably
+ * byteslice_16x16b/debyteslice_16x16b defined earlier in this file. */
+.Lshufb_16x16b:
+       .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+       .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+       .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+/* Forward AES ShiftRows permutation (inverse of the table above). */
+.Lshift_row:
+       .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
+       .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+       .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
+       .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
+
+/* AES inverse affine and S2 combined:
+ *      1 1 0 0 0 0 0 1     x0     0
+ *      0 1 0 0 1 0 0 0     x1     0
+ *      1 1 0 0 1 1 1 1     x2     0
+ *      0 1 1 0 1 0 0 1     x3     1
+ *      0 1 0 0 1 1 0 0  *  x4  +  0
+ *      0 1 0 1 1 0 0 0     x5     0
+ *      0 0 0 0 0 1 0 1     x6     0
+ *      1 1 1 0 0 1 1 1     x7     1
+ */
+.Ltf_lo__inv_aff__and__s2:
+       .octa 0x92172DA81A9FA520B2370D883ABF8500
+.Ltf_hi__inv_aff__and__s2:
+       .octa 0x2B15FFC1AF917B45E6D8320C625CB688
+
+/* X2 and AES forward affine combined:
+ *      1 0 1 1 0 0 0 1     x0     0
+ *      0 1 1 1 1 0 1 1     x1     0
+ *      0 0 0 1 1 0 1 0     x2     1
+ *      0 1 0 0 0 1 0 0     x3     0
+ *      0 0 1 1 1 0 1 1  *  x4  +  0
+ *      0 1 0 0 1 0 0 0     x5     0
+ *      1 1 0 1 0 0 1 1     x6     0
+ *      0 1 0 0 1 0 1 0     x7     0
+ */
+.Ltf_lo__x2__and__fwd_aff:
+       .octa 0xEFAE0544FCBD1657B8F95213ABEA4100
+.Ltf_hi__x2__and__fwd_aff:
+       .octa 0x3F893781E95FE1576CDA64D2BA0CB204
+
+/* All-bytes constants 3, 2 and 1. */
+.Lthree_x16:
+       .byte 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
+.Ltwo_x16:
+       .byte 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+.Lone_x16:
+       .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+
+/* vpaddb constants: add N to the last (big-endian least significant)
+ * byte of a 128-bit CTR counter; used by the .Lctr_byteadd fast path. */
+.Lbige_addb_1:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+.Lbige_addb_2:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
+.Lbige_addb_3:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
+.Lbige_addb_4:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
+.Lbige_addb_5:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
+.Lbige_addb_6:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
+.Lbige_addb_7:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
+.Lbige_addb_8:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
+.Lbige_addb_9:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
+.Lbige_addb_10:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
+.Lbige_addb_11:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
+.Lbige_addb_12:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
+.Lbige_addb_13:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
+.Lbige_addb_14:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
+.Lbige_addb_15:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
+
+#ifdef CONFIG_AS_GFNI
+/* 8x8 GF(2) bitmatrices and affine constants for the GFNI S-box path
+ * (presumably consumed by vgf2p8affineqb/vgf2p8affineinvqb -- the
+ * BV8/BM8X8 packing macros are defined earlier in this file).
+ * NOTE(review): these must encode the same AES/ARIA affine maps as the
+ * .Ltf_* lookup tables in the non-GFNI path above -- confirm. */
+.align 8
+/* AES affine: */
+#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
+.Ltf_aff_bitmatrix:
+       .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
+                   BV8(1, 1, 0, 0, 0, 1, 1, 1),
+                   BV8(1, 1, 1, 0, 0, 0, 1, 1),
+                   BV8(1, 1, 1, 1, 0, 0, 0, 1),
+                   BV8(1, 1, 1, 1, 1, 0, 0, 0),
+                   BV8(0, 1, 1, 1, 1, 1, 0, 0),
+                   BV8(0, 0, 1, 1, 1, 1, 1, 0),
+                   BV8(0, 0, 0, 1, 1, 1, 1, 1))
+
+/* AES inverse affine: */
+#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
+.Ltf_inv_bitmatrix:
+       .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
+                   BV8(1, 0, 0, 1, 0, 0, 1, 0),
+                   BV8(0, 1, 0, 0, 1, 0, 0, 1),
+                   BV8(1, 0, 1, 0, 0, 1, 0, 0),
+                   BV8(0, 1, 0, 1, 0, 0, 1, 0),
+                   BV8(0, 0, 1, 0, 1, 0, 0, 1),
+                   BV8(1, 0, 0, 1, 0, 1, 0, 0),
+                   BV8(0, 1, 0, 0, 1, 0, 1, 0))
+
+/* S2: */
+#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
+.Ltf_s2_bitmatrix:
+       .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
+                   BV8(0, 0, 1, 1, 1, 1, 1, 1),
+                   BV8(1, 1, 1, 0, 1, 1, 0, 1),
+                   BV8(1, 1, 0, 0, 0, 0, 1, 1),
+                   BV8(0, 1, 0, 0, 0, 0, 1, 1),
+                   BV8(1, 1, 0, 0, 1, 1, 1, 0),
+                   BV8(0, 1, 1, 0, 0, 0, 1, 1),
+                   BV8(1, 1, 1, 1, 0, 1, 1, 0))
+
+/* X2: */
+#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
+.Ltf_x2_bitmatrix:
+       .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
+                   BV8(0, 0, 1, 0, 0, 1, 1, 0),
+                   BV8(0, 0, 0, 0, 1, 0, 1, 0),
+                   BV8(1, 1, 1, 0, 0, 0, 1, 1),
+                   BV8(1, 1, 1, 0, 1, 1, 0, 0),
+                   BV8(0, 1, 1, 0, 1, 0, 1, 1),
+                   BV8(1, 0, 1, 1, 1, 1, 0, 1),
+                   BV8(1, 0, 0, 1, 0, 0, 1, 1))
+
+/* Identity matrix: */
+.Ltf_id_bitmatrix:
+       .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
+                   BV8(0, 1, 0, 0, 0, 0, 0, 0),
+                   BV8(0, 0, 1, 0, 0, 0, 0, 0),
+                   BV8(0, 0, 0, 1, 0, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 1, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 1, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 1, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 0, 1))
+#endif /* CONFIG_AS_GFNI */
+
+/* 4-bit mask: 0x0f in every byte once broadcast -- presumably used by
+ * the S-box code to split bytes into nibbles (confirm at use site). */
+.align 4
+.L0f0f0f0f:
+       .long 0x0f0f0f0f
+
+.text
+
+.align 16
+ELF(.type __aria_aesni_avx_crypt_16way,@function;)
+/* Core 16-way ARIA crypt on byte-sliced state using the AESNI S-box
+ * path.  Clobbers %rax, %r8, %r9, %r10 and all xmm registers; CTX is
+ * presumably a macro for %rdi (defined earlier in this file). */
+__aria_aesni_avx_crypt_16way:
+       /* input:
+       *      %r9: rk
+       *      %rsi: dst
+       *      %rdx: src
+       *      %xmm0..%xmm15: 16 byte-sliced blocks
+       */
+       CFI_STARTPROC();
+
+       /* %rax/%r8 address the two 128-byte halves of the dst scratch. */
+       movq %rsi, %rax;
+       leaq 8 * 16(%rax), %r8;
+
+       /* %r10d counts remaining rounds; the first FO and the trailing
+        * FE+FF are peeled out of the two-rounds-per-iteration loop. */
+       movl ARIA_CTX_rounds(CTX), %r10d;
+       subl $2, %r10d;
+
+       inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                     %xmm15, %rax, %r8);
+       aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
+               %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+               %rax, %r9, 0);
+       /* Advance %r9 past the consumed round key. */
+       leaq 1*16(%r9), %r9;
+
+.align 16
+.Loop_aesni:
+       /* Two rounds (FE + FO) per iteration. */
+       aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+               %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+               %xmm15, %rax, %r9, 0);
+       aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
+               %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+               %rax, %r9, 1);
+       leaq 2*16(%r9), %r9;
+       subl $2, %r10d;
+       jnz .Loop_aesni;
+
+       /* Final round with the last two round keys. */
+       aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+               %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+               %xmm15, %rax, %r9, 0, 1);
+
+       /* Undo the byteslicing so registers hold whole blocks again. */
+       debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
+                          %xmm9, %xmm13, %xmm0, %xmm5,
+                          %xmm10, %xmm14, %xmm3, %xmm6,
+                          %xmm11, %xmm15, %xmm2, %xmm7,
+                          (%rax), (%r8));
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __aria_aesni_avx_crypt_16way,.-__aria_aesni_avx_crypt_16way;)
+
+.align 16
+.globl _gcry_aria_aesni_avx_ecb_crypt_blk1_16
+ELF(.type _gcry_aria_aesni_avx_ecb_crypt_blk1_16,@function;)
+/* Public ECB entry point: crypts 1..16 blocks with the supplied round
+ * keys.  Returns STACK_DEPTH in %eax -- presumably consumed by the C
+ * caller for stack burning (confirm in aria.c). */
+_gcry_aria_aesni_avx_ecb_crypt_blk1_16:
+       /* input:
+       *      %rdi: ctx, CTX
+       *      %rsi: dst
+       *      %rdx: src
+       *      %rcx: round keys
+       *      %r8: num blocks
+       */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       /* 256-byte, 16-byte-aligned scratch for the byte-sliced state. */
+       subq $(16 * 16), %rsp;
+       andq $~15, %rsp;
+
+       movq %rcx, %r9;
+       movq %rsi, %r11;
+       movq %rsp, %rsi; /* use stack for temporary store */
+
+       cmpq $16, %r8;
+       jb .Lecb_less_than_16;
+
+       /* Full 16-block path. */
+       inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                    %xmm15, %rdx);
+
+       call __aria_aesni_avx_crypt_16way;
+
+       write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                    %xmm15, %r11);
+
+.Lecb_end:
+       movl $STACK_DEPTH, %eax;
+       leave;
+       CFI_LEAVE();
+       /* Clear key/state material from the vector registers. */
+       vzeroall;
+       ret_spec_stop;
+
+.Lecb_less_than_16:
+       /* Preserve the block count across the 16-way call. */
+       pushq %r8;
+       inpack_1_15_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+                       %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                       %xmm15, %rdx, %r8d);
+
+       call __aria_aesni_avx_crypt_16way;
+
+       /* Recover the block count for the partial-width store. */
+       popq %rax;
+       write_output_1_15(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6,
+                         %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13,
+                         %xmm14, %xmm15, %r11, %eax);
+
+       jmp .Lecb_end;
+       CFI_ENDPROC();
+ELF(.size _gcry_aria_aesni_avx_ecb_crypt_blk1_16,
+         .-_gcry_aria_aesni_avx_ecb_crypt_blk1_16;)
+
+.align 16
+ELF(.type __aria_aesni_avx_ctr_gen_keystream_16way,@function;)
+/* Generates 16 consecutive CTR counter blocks into %xmm0..%xmm15 (the
+ * slow path also spills the first 8 via the keystream buffer) and
+ * advances the big-endian 128-bit IV at (%r8) by 16. */
+__aria_aesni_avx_ctr_gen_keystream_16way:
+       /* input:
+       *      %rdi: ctx
+       *      %rsi: dst
+       *      %rdx: src
+       *      %rcx: keystream
+       *      %r8: iv (big endian, 128bit)
+       */
+       CFI_STARTPROC();
+
+       /* load IV */
+       vmovdqu (%r8), %xmm8;
+       /* Cheap byte-add path when adding 16 cannot carry out of the
+        * low IV byte (byte <= 0xf0); the equal case (== 0xf0) wraps
+        * and is handled by explicit carry propagation below. */
+       cmpb $(0x100 - 16), 15(%r8);
+       jbe .Lctr_byteadd;
+
+       /* Slow path: full 128-bit little-endian increments. */
+       /* byteswap */
+       vmovdqa .Lbswap128_mask rRIP, %xmm1;
+       vpshufb %xmm1, %xmm8, %xmm3; /* be => le */
+
+       vpcmpeqd %xmm0, %xmm0, %xmm0;
+       vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */
+
+       /* construct IVs */
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm9;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm10;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm11;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm12;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm13;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm14;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm15;
+       /* Park the first 8 counter blocks in the keystream buffer while
+        * the remaining 8 are generated into %xmm8..%xmm15. */
+       vmovdqu %xmm8, (0 * 16)(%rcx);
+       vmovdqu %xmm9, (1 * 16)(%rcx);
+       vmovdqu %xmm10, (2 * 16)(%rcx);
+       vmovdqu %xmm11, (3 * 16)(%rcx);
+       vmovdqu %xmm12, (4 * 16)(%rcx);
+       vmovdqu %xmm13, (5 * 16)(%rcx);
+       vmovdqu %xmm14, (6 * 16)(%rcx);
+       vmovdqu %xmm15, (7 * 16)(%rcx);
+
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm8;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm9;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm10;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm11;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm12;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm13;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm14;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm15;
+       inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
+       vpshufb %xmm1, %xmm3, %xmm4;
+       /* Write back the IV advanced by 16. */
+       vmovdqu %xmm4, (%r8);
+
+       /* Reload the first 8 counter blocks into %xmm0..%xmm7. */
+       vmovdqu (0 * 16)(%rcx), %xmm0;
+       vmovdqu (1 * 16)(%rcx), %xmm1;
+       vmovdqu (2 * 16)(%rcx), %xmm2;
+       vmovdqu (3 * 16)(%rcx), %xmm3;
+       vmovdqu (4 * 16)(%rcx), %xmm4;
+       vmovdqu (5 * 16)(%rcx), %xmm5;
+       vmovdqu (6 * 16)(%rcx), %xmm6;
+       vmovdqu (7 * 16)(%rcx), %xmm7;
+
+       ret_spec_stop;
+
+.align 8
+.Lctr_byteadd_full_ctr_carry:
+       /* Low byte wraps on +16: ripple the carry through bytes 14..0
+        * of the big-endian counter (addb sets CF; `loop` leaves it). */
+       addb $16, 15(%r8);
+       pushq %rcx;
+       movl $14, %ecx;
+       1:
+         adcb $0, (%r8, %rcx);
+         jnc 2f;
+         loop 1b;
+       2:
+       popq %rcx;
+       jmp .Lctr_byteadd_xmm;
+.align 8
+.Lctr_byteadd:
+       je .Lctr_byteadd_full_ctr_carry;
+       addb $16, 15(%r8);
+.Lctr_byteadd_xmm:
+       /* Build 16 counters by byte-adding 0..15 to the loaded IV. */
+       vmovdqa %xmm8, %xmm0;
+       vpaddb .Lbige_addb_1 rRIP, %xmm8, %xmm1;
+       vpaddb .Lbige_addb_2 rRIP, %xmm8, %xmm2;
+       vpaddb .Lbige_addb_3 rRIP, %xmm8, %xmm3;
+       vpaddb .Lbige_addb_4 rRIP, %xmm8, %xmm4;
+       vpaddb .Lbige_addb_5 rRIP, %xmm8, %xmm5;
+       vpaddb .Lbige_addb_6 rRIP, %xmm8, %xmm6;
+       vpaddb .Lbige_addb_7 rRIP, %xmm8, %xmm7;
+       vpaddb .Lbige_addb_8 rRIP, %xmm0, %xmm8;
+       vpaddb .Lbige_addb_9 rRIP, %xmm0, %xmm9;
+       vpaddb .Lbige_addb_10 rRIP, %xmm0, %xmm10;
+       vpaddb .Lbige_addb_11 rRIP, %xmm0, %xmm11;
+       vpaddb .Lbige_addb_12 rRIP, %xmm0, %xmm12;
+       vpaddb .Lbige_addb_13 rRIP, %xmm0, %xmm13;
+       vpaddb .Lbige_addb_14 rRIP, %xmm0, %xmm14;
+       vpaddb .Lbige_addb_15 rRIP, %xmm0, %xmm15;
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __aria_aesni_avx_ctr_gen_keystream_16way,.-__aria_aesni_avx_ctr_gen_keystream_16way;)
+
+.align 16
+.globl _gcry_aria_aesni_avx_ctr_crypt_blk16
+ELF(.type _gcry_aria_aesni_avx_ctr_crypt_blk16,@function;)
+/* Public CTR entry point for exactly 16 blocks: generate keystream
+ * counters, encrypt them with the encryption key schedule, XOR with
+ * src and store to dst; the IV at (%rcx) is advanced by 16.
+ * Returns STACK_DEPTH in %eax (presumably for caller stack burning). */
+_gcry_aria_aesni_avx_ctr_crypt_blk16:
+       /* input:
+       *      %rdi: ctx
+       *      %rsi: dst
+       *      %rdx: src
+       *      %rcx: iv (big endian, 128bit)
+       */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       /* 256-byte aligned scratch used as the keystream buffer. */
+       subq $(16 * 16), %rsp;
+       andq $~15, %rsp;
+
+       movq %rcx, %r8;  /* %r8: iv */
+       movq %rsp, %rcx; /* %rcx: keystream */
+       call __aria_aesni_avx_ctr_gen_keystream_16way;
+
+       pushq %rsi;
+       movq %rdx, %r11;
+       movq %rcx, %rsi; /* use stack for temporary store */
+       movq %rcx, %rdx;
+       /* CTR always encrypts the counters: use the encryption keys. */
+       leaq ARIA_CTX_enc_key(CTX), %r9;
+
+       call __aria_aesni_avx_crypt_16way;
+
+       popq %rsi;
+       /* XOR encrypted counters with src; the crypt routine returns
+        * blocks in permuted register order (1,0,3,2,4..15). */
+       vpxor (0 * 16)(%r11), %xmm1, %xmm1;
+       vpxor (1 * 16)(%r11), %xmm0, %xmm0;
+       vpxor (2 * 16)(%r11), %xmm3, %xmm3;
+       vpxor (3 * 16)(%r11), %xmm2, %xmm2;
+       vpxor (4 * 16)(%r11), %xmm4, %xmm4;
+       vpxor (5 * 16)(%r11), %xmm5, %xmm5;
+       vpxor (6 * 16)(%r11), %xmm6, %xmm6;
+       vpxor (7 * 16)(%r11), %xmm7, %xmm7;
+       vpxor (8 * 16)(%r11), %xmm8, %xmm8;
+       vpxor (9 * 16)(%r11), %xmm9, %xmm9;
+       vpxor (10 * 16)(%r11), %xmm10, %xmm10;
+       vpxor (11 * 16)(%r11), %xmm11, %xmm11;
+       vpxor (12 * 16)(%r11), %xmm12, %xmm12;
+       vpxor (13 * 16)(%r11), %xmm13, %xmm13;
+       vpxor (14 * 16)(%r11), %xmm14, %xmm14;
+       vpxor (15 * 16)(%r11), %xmm15, %xmm15;
+       write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                    %xmm15, %rsi);
+
+       movl $STACK_DEPTH, %eax;
+       leave;
+       CFI_LEAVE();
+       /* Clear keystream/state material from the vector registers. */
+       vzeroall;
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_aria_aesni_avx_ctr_crypt_blk16,.-_gcry_aria_aesni_avx_ctr_crypt_blk16;)
+
+#ifdef CONFIG_AS_GFNI
+.align 16
+ELF(.type __aria_gfni_avx_crypt_16way,@function;)
+__aria_gfni_avx_crypt_16way:
+       /* input:
+       *      %r9: rk
+       *      %rsi: dst
+       *      %rdx: src
+       *      %xmm0..%xmm15: 16 byte-sliced blocks
+       */
+       CFI_STARTPROC();
+
+       movq %rsi, %rax;
+       leaq 8 * 16(%rax), %r8;
+
+       movl ARIA_CTX_rounds(CTX), %r10d;
+       subl $2, %r10d;
+
+       inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
+                     %xmm4, %xmm5, %xmm6, %xmm7,
+                     %xmm8, %xmm9, %xmm10, %xmm11,
+                     %xmm12, %xmm13, %xmm14,
+                     %xmm15, %rax, %r8);
+       aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
+                    %xmm12, %xmm13, %xmm14, %xmm15,
+                    %xmm0, %xmm1, %xmm2, %xmm3,
+                    %xmm4, %xmm5, %xmm6, %xmm7,
+                    %rax, %r9, 0);
+       leaq 1*16(%r9), %r9;
+
+.align 16
+.Loop_gfni:
+       aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
+                    %xmm4, %xmm5, %xmm6, %xmm7,
+                    %xmm8, %xmm9, %xmm10, %xmm11,
+                    %xmm12, %xmm13, %xmm14,
+                    %xmm15, %rax, %r9, 0);
+       aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
+                    %xmm12, %xmm13, %xmm14, %xmm15,
+                    %xmm0, %xmm1, %xmm2, %xmm3,
+                    %xmm4, %xmm5, %xmm6, %xmm7,
+                    %rax, %r9, 1);
+       leaq 2*16(%r9), %r9;
+       subl $2, %r10d;
+       jnz .Loop_gfni;
+
+       aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                    %xmm15, %rax, %r9, 0, 1);
+
+       debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
+                          %xmm9, %xmm13, %xmm0, %xmm5,
+                          %xmm10, %xmm14, %xmm3, %xmm6,
+                          %xmm11, %xmm15, %xmm2, %xmm7,
+                          (%rax), (%r8));
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __aria_gfni_avx_crypt_16way,.-__aria_gfni_avx_crypt_16way;)
+
+.align 16
+.globl _gcry_aria_gfni_avx_ecb_crypt_blk1_16
+ELF(.type _gcry_aria_gfni_avx_ecb_crypt_blk1_16,@function;)
+_gcry_aria_gfni_avx_ecb_crypt_blk1_16:
+       /* input:
+       *      %rdi: ctx, CTX
+       *      %rsi: dst
+       *      %rdx: src
+       *      %rcx: round keys
+       *      %r8: num blocks
+       */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       subq $(16 * 16), %rsp;
+       andq $~15, %rsp;
+
+       movq %rcx, %r9;
+       movq %rsi, %r11;
+       movq %rsp, %rsi; /* use stack for temporary store */
+
+       cmpq $16, %r8;
+       jb .Lecb_less_than_16_gfni;
+
+       inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                    %xmm15, %rdx);
+
+       call __aria_gfni_avx_crypt_16way;
+
+       write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                    %xmm15, %r11);
+
+.Lecb_end_gfni:
+       movl $STACK_DEPTH, %eax;
+       leave;
+       CFI_LEAVE();
+       vzeroall;
+       ret_spec_stop;
+
+.Lecb_less_than_16_gfni:
+       pushq %r8;
+       inpack_1_15_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+                       %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                       %xmm15, %rdx, %r8d);
+
+       call __aria_gfni_avx_crypt_16way;
+
+       popq %rax;
+       write_output_1_15(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6,
+                         %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13,
+                         %xmm14, %xmm15, %r11, %eax);
+
+       jmp .Lecb_end_gfni;
+       CFI_ENDPROC();
+ELF(.size _gcry_aria_gfni_avx_ecb_crypt_blk1_16,
+         .-_gcry_aria_gfni_avx_ecb_crypt_blk1_16;)
+
+.align 16
+.globl _gcry_aria_gfni_avx_ctr_crypt_blk16
+ELF(.type _gcry_aria_gfni_avx_ctr_crypt_blk16,@function;)
+_gcry_aria_gfni_avx_ctr_crypt_blk16:
+       /* input:
+       *      %rdi: ctx
+       *      %rsi: dst
+       *      %rdx: src
+       *      %rcx: iv (big endian, 128bit)
+       */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       subq $(16 * 16), %rsp;
+       andq $~15, %rsp;
+
+       movq %rcx, %r8;  /* %r8: iv */
+       movq %rsp, %rcx; /* %rcx: keystream */
+       call __aria_aesni_avx_ctr_gen_keystream_16way
+
+       pushq %rsi;
+       movq %rdx, %r11;
+       movq %rcx, %rsi; /* use stack for temporary store */
+       movq %rcx, %rdx;
+       leaq ARIA_CTX_enc_key(CTX), %r9;
+
+       call __aria_gfni_avx_crypt_16way;
+
+       popq %rsi;
+       vpxor (0 * 16)(%r11), %xmm1, %xmm1;
+       vpxor (1 * 16)(%r11), %xmm0, %xmm0;
+       vpxor (2 * 16)(%r11), %xmm3, %xmm3;
+       vpxor (3 * 16)(%r11), %xmm2, %xmm2;
+       vpxor (4 * 16)(%r11), %xmm4, %xmm4;
+       vpxor (5 * 16)(%r11), %xmm5, %xmm5;
+       vpxor (6 * 16)(%r11), %xmm6, %xmm6;
+       vpxor (7 * 16)(%r11), %xmm7, %xmm7;
+       vpxor (8 * 16)(%r11), %xmm8, %xmm8;
+       vpxor (9 * 16)(%r11), %xmm9, %xmm9;
+       vpxor (10 * 16)(%r11), %xmm10, %xmm10;
+       vpxor (11 * 16)(%r11), %xmm11, %xmm11;
+       vpxor (12 * 16)(%r11), %xmm12, %xmm12;
+       vpxor (13 * 16)(%r11), %xmm13, %xmm13;
+       vpxor (14 * 16)(%r11), %xmm14, %xmm14;
+       vpxor (15 * 16)(%r11), %xmm15, %xmm15;
+       write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                    %xmm15, %rsi);
+
+       movl $STACK_DEPTH, %eax;
+       leave;
+       CFI_LEAVE();
+       vzeroall;
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_aria_gfni_avx_ctr_crypt_blk16,.-_gcry_aria_gfni_avx_ctr_crypt_blk16;)
+#endif /* CONFIG_AS_GFNI */
+
+#endif /* ENABLE_AVX_SUPPORT && ENABLE_AESNI_SUPPORT */
+#endif /* __x86_64 */
diff --git a/grub-core/lib/libgcrypt/cipher/aria-aesni-avx2-amd64.S b/grub-core/lib/libgcrypt/cipher/aria-aesni-avx2-amd64.S
new file mode 100644
index 000000000..d33fa54b2
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/aria-aesni-avx2-amd64.S
@@ -0,0 +1,1830 @@
+/* aria-aesni-avx2-amd64.S  -  AESNI/GFNI/AVX2 implementation of ARIA cipher
+ *
+ * Copyright (C) 2022-2023 Taehee Yoo <ap420073@gmail.com>
+ * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+    defined(ENABLE_AVX2_SUPPORT) && defined(ENABLE_AESNI_SUPPORT)
+
+#include "asm-common-amd64.h"
+
+#ifdef ENABLE_GFNI_SUPPORT
+#  define CONFIG_AS_GFNI 1
+#endif
+#ifdef HAVE_GCC_INLINE_ASM_VAES_VPCLMUL
+#  define CONFIG_AS_VAES 1
+#endif
+
+/* struct ARIA_context: */
+#define ARIA_BLOCK_SIZE  16
+#define ARIA_MAX_RD_KEYS 17
+#define ARIA_CTX_enc_key 0
+#define ARIA_CTX_dec_key (ARIA_CTX_enc_key + (ARIA_BLOCK_SIZE * ARIA_MAX_RD_KEYS))
+#define ARIA_CTX_rounds (ARIA_CTX_dec_key + (ARIA_BLOCK_SIZE * ARIA_MAX_RD_KEYS))
+
+/* register macros */
+#define CTX %rdi
+
+#define ymm0_x xmm0
+#define ymm1_x xmm1
+#define ymm2_x xmm2
+#define ymm3_x xmm3
+#define ymm4_x xmm4
+#define ymm5_x xmm5
+#define ymm6_x xmm6
+#define ymm7_x xmm7
+#define ymm8_x xmm8
+#define ymm9_x xmm9
+#define ymm10_x xmm10
+#define ymm11_x xmm11
+#define ymm12_x xmm12
+#define ymm13_x xmm13
+#define ymm14_x xmm14
+#define ymm15_x xmm15
+
+/* helper macros */
+#define STACK_DEPTH (2 * 8 + 16 * 32 + 31)
+
+#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)            \
+       ( (((a0) & 1) << 0) |                           \
+         (((a1) & 1) << 1) |                           \
+         (((a2) & 1) << 2) |                           \
+         (((a3) & 1) << 3) |                           \
+         (((a4) & 1) << 4) |                           \
+         (((a5) & 1) << 5) |                           \
+         (((a6) & 1) << 6) |                           \
+         (((a7) & 1) << 7) )
+
+#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)          \
+       ( ((l7) << (0 * 8)) |                           \
+         ((l6) << (1 * 8)) |                           \
+         ((l5) << (2 * 8)) |                           \
+         ((l4) << (3 * 8)) |                           \
+         ((l3) << (4 * 8)) |                           \
+         ((l2) << (5 * 8)) |                           \
+         ((l1) << (6 * 8)) |                           \
+         ((l0) << (7 * 8)) )
+
+/* asm macros */
+#define inc_le128(x, minus_one, tmp)                   \
+       vpcmpeqq minus_one, x, tmp;                     \
+       vpsubq minus_one, x, x;                         \
+       vpslldq $8, tmp, tmp;                           \
+       vpsubq tmp, x, x;
+
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)     \
+       vpand x, mask4bit, tmp0;                        \
+       vpandn x, mask4bit, x;                          \
+       vpsrld $4, x, x;                                \
+                                                       \
+       vpshufb tmp0, lo_t, tmp0;                       \
+       vpshufb x, hi_t, x;                             \
+       vpxor tmp0, x, x;
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2)          \
+       vpunpckhdq x1, x0, t2;                          \
+       vpunpckldq x1, x0, x0;                          \
+                                                       \
+       vpunpckldq x3, x2, t1;                          \
+       vpunpckhdq x3, x2, x2;                          \
+                                                       \
+       vpunpckhqdq t1, x0, x1;                         \
+       vpunpcklqdq t1, x0, x0;                         \
+                                                       \
+       vpunpckhqdq x2, t2, x3;                         \
+       vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b(a0, b0, c0, d0,               \
+                        a1, b1, c1, d1,                \
+                        a2, b2, c2, d2,                \
+                        a3, b3, c3, d3,                \
+                        st0, st1)                      \
+       vmovdqu d2, st0;                                \
+       vmovdqu d3, st1;                                \
+       transpose_4x4(a0, a1, a2, a3, d2, d3);          \
+       transpose_4x4(b0, b1, b2, b3, d2, d3);          \
+       vmovdqu st0, d2;                                \
+       vmovdqu st1, d3;                                \
+                                                       \
+       vmovdqu a0, st0;                                \
+       vmovdqu a1, st1;                                \
+       transpose_4x4(c0, c1, c2, c3, a0, a1);          \
+       transpose_4x4(d0, d1, d2, d3, a0, a1);          \
+                                                       \
+       vbroadcasti128 .Lshufb_16x16b rRIP, a0;         \
+       vmovdqu st1, a1;                                \
+       vpshufb a0, a2, a2;                             \
+       vpshufb a0, a3, a3;                             \
+       vpshufb a0, b0, b0;                             \
+       vpshufb a0, b1, b1;                             \
+       vpshufb a0, b2, b2;                             \
+       vpshufb a0, b3, b3;                             \
+       vpshufb a0, a1, a1;                             \
+       vpshufb a0, c0, c0;                             \
+       vpshufb a0, c1, c1;                             \
+       vpshufb a0, c2, c2;                             \
+       vpshufb a0, c3, c3;                             \
+       vpshufb a0, d0, d0;                             \
+       vpshufb a0, d1, d1;                             \
+       vpshufb a0, d2, d2;                             \
+       vpshufb a0, d3, d3;                             \
+       vmovdqu d3, st1;                                \
+       vmovdqu st0, d3;                                \
+       vpshufb a0, d3, a0;                             \
+       vmovdqu d2, st0;                                \
+                                                       \
+       transpose_4x4(a0, b0, c0, d0, d2, d3);          \
+       transpose_4x4(a1, b1, c1, d1, d2, d3);          \
+       vmovdqu st0, d2;                                \
+       vmovdqu st1, d3;                                \
+                                                       \
+       vmovdqu b0, st0;                                \
+       vmovdqu b1, st1;                                \
+       transpose_4x4(a2, b2, c2, d2, b0, b1);          \
+       transpose_4x4(a3, b3, c3, d3, b0, b1);          \
+       vmovdqu st0, b0;                                \
+       vmovdqu st1, b1;                                \
+       /* does not adjust output bytes inside vectors */
+
+#define debyteslice_16x16b(a0, b0, c0, d0,             \
+                          a1, b1, c1, d1,              \
+                          a2, b2, c2, d2,              \
+                          a3, b3, c3, d3,              \
+                          st0, st1)                    \
+       vmovdqu d2, st0;                                \
+       vmovdqu d3, st1;                                \
+       transpose_4x4(a0, a1, a2, a3, d2, d3);          \
+       transpose_4x4(b0, b1, b2, b3, d2, d3);          \
+       vmovdqu st0, d2;                                \
+       vmovdqu st1, d3;                                \
+                                                       \
+       vmovdqu a0, st0;                                \
+       vmovdqu a1, st1;                                \
+       transpose_4x4(c0, c1, c2, c3, a0, a1);          \
+       transpose_4x4(d0, d1, d2, d3, a0, a1);          \
+                                                       \
+       vbroadcasti128 .Lshufb_16x16b rRIP, a0;         \
+       vmovdqu st1, a1;                                \
+       vpshufb a0, a2, a2;                             \
+       vpshufb a0, a3, a3;                             \
+       vpshufb a0, b0, b0;                             \
+       vpshufb a0, b1, b1;                             \
+       vpshufb a0, b2, b2;                             \
+       vpshufb a0, b3, b3;                             \
+       vpshufb a0, a1, a1;                             \
+       vpshufb a0, c0, c0;                             \
+       vpshufb a0, c1, c1;                             \
+       vpshufb a0, c2, c2;                             \
+       vpshufb a0, c3, c3;                             \
+       vpshufb a0, d0, d0;                             \
+       vpshufb a0, d1, d1;                             \
+       vpshufb a0, d2, d2;                             \
+       vpshufb a0, d3, d3;                             \
+       vmovdqu d3, st1;                                \
+       vmovdqu st0, d3;                                \
+       vpshufb a0, d3, a0;                             \
+       vmovdqu d2, st0;                                \
+                                                       \
+       transpose_4x4(c0, d0, a0, b0, d2, d3);          \
+       transpose_4x4(c1, d1, a1, b1, d2, d3);          \
+       vmovdqu st0, d2;                                \
+       vmovdqu st1, d3;                                \
+                                                       \
+       vmovdqu b0, st0;                                \
+       vmovdqu b1, st1;                                \
+       transpose_4x4(c2, d2, a2, b2, b0, b1);          \
+       transpose_4x4(c3, d3, a3, b3, b0, b1);          \
+       vmovdqu st0, b0;                                \
+       vmovdqu st1, b1;                                \
+       /* does not adjust output bytes inside vectors */
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack16_pre(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    rio)                               \
+       vmovdqu (0 * 32)(rio), x0;                      \
+       vmovdqu (1 * 32)(rio), x1;                      \
+       vmovdqu (2 * 32)(rio), x2;                      \
+       vmovdqu (3 * 32)(rio), x3;                      \
+       vmovdqu (4 * 32)(rio), x4;                      \
+       vmovdqu (5 * 32)(rio), x5;                      \
+       vmovdqu (6 * 32)(rio), x6;                      \
+       vmovdqu (7 * 32)(rio), x7;                      \
+       vmovdqu (8 * 32)(rio), y0;                      \
+       vmovdqu (9 * 32)(rio), y1;                      \
+       vmovdqu (10 * 32)(rio), y2;                     \
+       vmovdqu (11 * 32)(rio), y3;                     \
+       vmovdqu (12 * 32)(rio), y4;                     \
+       vmovdqu (13 * 32)(rio), y5;                     \
+       vmovdqu (14 * 32)(rio), y6;                     \
+       vmovdqu (15 * 32)(rio), y7;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack16_post(x0, x1, x2, x3,                  \
+                     x4, x5, x6, x7,                   \
+                     y0, y1, y2, y3,                   \
+                     y4, y5, y6, y7,                   \
+                     mem_ab, mem_cd)                   \
+       byteslice_16x16b(x0, x1, x2, x3,                \
+                        x4, x5, x6, x7,                \
+                        y0, y1, y2, y3,                \
+                        y4, y5, y6, y7,                \
+                        (mem_ab), (mem_cd));           \
+                                                       \
+       vmovdqu x0, 0 * 32(mem_ab);                     \
+       vmovdqu x1, 1 * 32(mem_ab);                     \
+       vmovdqu x2, 2 * 32(mem_ab);                     \
+       vmovdqu x3, 3 * 32(mem_ab);                     \
+       vmovdqu x4, 4 * 32(mem_ab);                     \
+       vmovdqu x5, 5 * 32(mem_ab);                     \
+       vmovdqu x6, 6 * 32(mem_ab);                     \
+       vmovdqu x7, 7 * 32(mem_ab);                     \
+       vmovdqu y0, 0 * 32(mem_cd);                     \
+       vmovdqu y1, 1 * 32(mem_cd);                     \
+       vmovdqu y2, 2 * 32(mem_cd);                     \
+       vmovdqu y3, 3 * 32(mem_cd);                     \
+       vmovdqu y4, 4 * 32(mem_cd);                     \
+       vmovdqu y5, 5 * 32(mem_cd);                     \
+       vmovdqu y6, 6 * 32(mem_cd);                     \
+       vmovdqu y7, 7 * 32(mem_cd);
+
+#define write_output(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    mem)                               \
+       vmovdqu x0, 0 * 32(mem);                        \
+       vmovdqu x1, 1 * 32(mem);                        \
+       vmovdqu x2, 2 * 32(mem);                        \
+       vmovdqu x3, 3 * 32(mem);                        \
+       vmovdqu x4, 4 * 32(mem);                        \
+       vmovdqu x5, 5 * 32(mem);                        \
+       vmovdqu x6, 6 * 32(mem);                        \
+       vmovdqu x7, 7 * 32(mem);                        \
+       vmovdqu y0, 8 * 32(mem);                        \
+       vmovdqu y1, 9 * 32(mem);                        \
+       vmovdqu y2, 10 * 32(mem);                       \
+       vmovdqu y3, 11 * 32(mem);                       \
+       vmovdqu y4, 12 * 32(mem);                       \
+       vmovdqu y5, 13 * 32(mem);                       \
+       vmovdqu y6, 14 * 32(mem);                       \
+       vmovdqu y7, 15 * 32(mem);                       \
+
+#define aria_store_state_8way(x0, x1, x2, x3,          \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, idx)             \
+       vmovdqu x0, ((idx + 0) * 32)(mem_tmp);          \
+       vmovdqu x1, ((idx + 1) * 32)(mem_tmp);          \
+       vmovdqu x2, ((idx + 2) * 32)(mem_tmp);          \
+       vmovdqu x3, ((idx + 3) * 32)(mem_tmp);          \
+       vmovdqu x4, ((idx + 4) * 32)(mem_tmp);          \
+       vmovdqu x5, ((idx + 5) * 32)(mem_tmp);          \
+       vmovdqu x6, ((idx + 6) * 32)(mem_tmp);          \
+       vmovdqu x7, ((idx + 7) * 32)(mem_tmp);
+
+#define aria_load_state_8way(x0, x1, x2, x3,           \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, idx)              \
+       vmovdqu ((idx + 0) * 32)(mem_tmp), x0;          \
+       vmovdqu ((idx + 1) * 32)(mem_tmp), x1;          \
+       vmovdqu ((idx + 2) * 32)(mem_tmp), x2;          \
+       vmovdqu ((idx + 3) * 32)(mem_tmp), x3;          \
+       vmovdqu ((idx + 4) * 32)(mem_tmp), x4;          \
+       vmovdqu ((idx + 5) * 32)(mem_tmp), x5;          \
+       vmovdqu ((idx + 6) * 32)(mem_tmp), x6;          \
+       vmovdqu ((idx + 7) * 32)(mem_tmp), x7;
+
+#define aria_ark_8way(x0, x1, x2, x3,                  \
+                     x4, x5, x6, x7,                   \
+                     t0, rk, idx, round)               \
+       /* AddRoundKey */                               \
+       vpbroadcastb ((round * 16) + idx + 3)(rk), t0;  \
+       vpxor t0, x0, x0;                               \
+       vpbroadcastb ((round * 16) + idx + 2)(rk), t0;  \
+       vpxor t0, x1, x1;                               \
+       vpbroadcastb ((round * 16) + idx + 1)(rk), t0;  \
+       vpxor t0, x2, x2;                               \
+       vpbroadcastb ((round * 16) + idx + 0)(rk), t0;  \
+       vpxor t0, x3, x3;                               \
+       vpbroadcastb ((round * 16) + idx + 7)(rk), t0;  \
+       vpxor t0, x4, x4;                               \
+       vpbroadcastb ((round * 16) + idx + 6)(rk), t0;  \
+       vpxor t0, x5, x5;                               \
+       vpbroadcastb ((round * 16) + idx + 5)(rk), t0;  \
+       vpxor t0, x6, x6;                               \
+       vpbroadcastb ((round * 16) + idx + 4)(rk), t0;  \
+       vpxor t0, x7, x7;
+
+#ifdef CONFIG_AS_GFNI
+#define aria_sbox_8way_gfni(x0, x1, x2, x3,            \
+                           x4, x5, x6, x7,             \
+                           t0, t1, t2, t3,             \
+                           t4, t5, t6, t7)             \
+       vpbroadcastq .Ltf_s2_bitmatrix rRIP, t0;        \
+       vpbroadcastq .Ltf_inv_bitmatrix rRIP, t1;       \
+       vpbroadcastq .Ltf_id_bitmatrix rRIP, t2;        \
+       vpbroadcastq .Ltf_aff_bitmatrix rRIP, t3;       \
+       vpbroadcastq .Ltf_x2_bitmatrix rRIP, t4;        \
+       vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;   \
+       vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;   \
+       vgf2p8affineqb $(tf_inv_const), t1, x2, x2;     \
+       vgf2p8affineqb $(tf_inv_const), t1, x6, x6;     \
+       vgf2p8affineinvqb $0, t2, x2, x2;               \
+       vgf2p8affineinvqb $0, t2, x6, x6;               \
+       vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;  \
+       vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;  \
+       vgf2p8affineqb $(tf_x2_const), t4, x3, x3;      \
+       vgf2p8affineqb $(tf_x2_const), t4, x7, x7;      \
+       vgf2p8affineinvqb $0, t2, x3, x3;               \
+       vgf2p8affineinvqb $0, t2, x7, x7
+#endif /* CONFIG_AS_GFNI */
+
+#ifdef CONFIG_AS_VAES
+#define aria_sbox_8way_vaes(x0, x1, x2, x3,            \
+                           x4, x5, x6, x7,             \
+                           t0, t1, t2, t3,             \
+                           t4, t5, t6, t7)             \
+       vpxor t7, t7, t7;                               \
+       vpxor t6, t6, t6;                               \
+       vbroadcasti128 .Linv_shift_row rRIP, t0;        \
+       vbroadcasti128 .Lshift_row rRIP, t1;            \
+       vbroadcasti128 .Ltf_lo__inv_aff__and__s2 rRIP, t2;\
+       vbroadcasti128 .Ltf_hi__inv_aff__and__s2 rRIP, t3;\
+       vbroadcasti128 .Ltf_lo__x2__and__fwd_aff rRIP, t4;\
+       vbroadcasti128 .Ltf_hi__x2__and__fwd_aff rRIP, t5;\
+                                                       \
+       vaesenclast t7, x0, x0;                         \
+       vaesenclast t7, x4, x4;                         \
+       vaesenclast t7, x1, x1;                         \
+       vaesenclast t7, x5, x5;                         \
+       vaesdeclast t7, x2, x2;                         \
+       vaesdeclast t7, x6, x6;                         \
+                                                       \
+       vpbroadcastd .L0f0f0f0f rRIP, t6;               \
+                                                       \
+       /* AES inverse shift rows */                    \
+       vpshufb t0, x0, x0;                             \
+       vpshufb t0, x4, x4;                             \
+       vpshufb t0, x1, x1;                             \
+       vpshufb t0, x5, x5;                             \
+       vpshufb t1, x3, x3;                             \
+       vpshufb t1, x7, x7;                             \
+       vpshufb t1, x2, x2;                             \
+       vpshufb t1, x6, x6;                             \
+                                                       \
+       /* affine transformation for S2 */              \
+       filter_8bit(x1, t2, t3, t6, t0);                \
+       /* affine transformation for S2 */              \
+       filter_8bit(x5, t2, t3, t6, t0);                \
+                                                       \
+       /* affine transformation for X2 */              \
+       filter_8bit(x3, t4, t5, t6, t0);                \
+       /* affine transformation for X2 */              \
+       filter_8bit(x7, t4, t5, t6, t0);                \
+                                                       \
+       vaesdeclast t7, x3, x3;                         \
+       vaesdeclast t7, x7, x7;
+#endif /* CONFIG_AS_VAES */
+
+#define aria_sbox_8way(x0, x1, x2, x3,                 \
+                      x4, x5, x6, x7,                  \
+                      t0, t1, t2, t3,                  \
+                      t4, t5, t6, t7)                  \
+       vpxor t7, t7, t7;                               \
+       vpxor t6, t6, t6;                               \
+       vbroadcasti128 .Linv_shift_row rRIP, t0;        \
+       vbroadcasti128 .Lshift_row rRIP, t1;            \
+       vbroadcasti128 .Ltf_lo__inv_aff__and__s2 rRIP, t2;\
+       vbroadcasti128 .Ltf_hi__inv_aff__and__s2 rRIP, t3;\
+       vbroadcasti128 .Ltf_lo__x2__and__fwd_aff rRIP, t4;\
+       vbroadcasti128 .Ltf_hi__x2__and__fwd_aff rRIP, t5;\
+                                                       \
+       vextracti128 $1, x0, t6##_x;                    \
+       vaesenclast t7##_x, x0##_x, x0##_x;             \
+       vaesenclast t7##_x, t6##_x, t6##_x;             \
+       vinserti128 $1, t6##_x, x0, x0;                 \
+                                                       \
+       vextracti128 $1, x4, t6##_x;                    \
+       vaesenclast t7##_x, x4##_x, x4##_x;             \
+       vaesenclast t7##_x, t6##_x, t6##_x;             \
+       vinserti128 $1, t6##_x, x4, x4;                 \
+                                                       \
+       vextracti128 $1, x1, t6##_x;                    \
+       vaesenclast t7##_x, x1##_x, x1##_x;             \
+       vaesenclast t7##_x, t6##_x, t6##_x;             \
+       vinserti128 $1, t6##_x, x1, x1;                 \
+                                                       \
+       vextracti128 $1, x5, t6##_x;                    \
+       vaesenclast t7##_x, x5##_x, x5##_x;             \
+       vaesenclast t7##_x, t6##_x, t6##_x;             \
+       vinserti128 $1, t6##_x, x5, x5;                 \
+                                                       \
+       vextracti128 $1, x2, t6##_x;                    \
+       vaesdeclast t7##_x, x2##_x, x2##_x;             \
+       vaesdeclast t7##_x, t6##_x, t6##_x;             \
+       vinserti128 $1, t6##_x, x2, x2;                 \
+                                                       \
+       vextracti128 $1, x6, t6##_x;                    \
+       vaesdeclast t7##_x, x6##_x, x6##_x;             \
+       vaesdeclast t7##_x, t6##_x, t6##_x;             \
+       vinserti128 $1, t6##_x, x6, x6;                 \
+                                                       \
+       vpbroadcastd .L0f0f0f0f rRIP, t6;               \
+                                                       \
+       /* AES inverse shift rows */                    \
+       vpshufb t0, x0, x0;                             \
+       vpshufb t0, x4, x4;                             \
+       vpshufb t0, x1, x1;                             \
+       vpshufb t0, x5, x5;                             \
+       vpshufb t1, x3, x3;                             \
+       vpshufb t1, x7, x7;                             \
+       vpshufb t1, x2, x2;                             \
+       vpshufb t1, x6, x6;                             \
+                                                       \
+       /* affine transformation for S2 */              \
+       filter_8bit(x1, t2, t3, t6, t0);                \
+       /* affine transformation for S2 */              \
+       filter_8bit(x5, t2, t3, t6, t0);                \
+                                                       \
+       /* affine transformation for X2 */              \
+       filter_8bit(x3, t4, t5, t6, t0);                \
+       /* affine transformation for X2 */              \
+       filter_8bit(x7, t4, t5, t6, t0);                \
+                                                       \
+       vpxor t6, t6, t6;                               \
+       vextracti128 $1, x3, t6##_x;                    \
+       vaesdeclast t7##_x, x3##_x, x3##_x;             \
+       vaesdeclast t7##_x, t6##_x, t6##_x;             \
+       vinserti128 $1, t6##_x, x3, x3;                 \
+                                                       \
+       vextracti128 $1, x7, t6##_x;                    \
+       vaesdeclast t7##_x, x7##_x, x7##_x;             \
+       vaesdeclast t7##_x, t6##_x, t6##_x;             \
+       vinserti128 $1, t6##_x, x7, x7;
+
+/*
+ * aria_diff_m: ARIA 32-bit-word diffusion on byte-sliced state.
+ * x0..x3 hold the four byte slices of each state word; t0..t3 are
+ * scratch registers (clobbered).  In byte-sliced form rotr32(X, 8) and
+ * rotr32(X, 16) are just re-indexings of the slice registers, so
+ * "X ^= rotr32(X, 8); X = T ^ rotr32(X, 16)" reduces to the XOR network
+ * below: each output slice is the XOR of three of the four input slices
+ * (note x1 and x3 are produced via the t3 temporary at the end).
+ */
+#define aria_diff_m(x0, x1, x2, x3,                    \
+                   t0, t1, t2, t3)                     \
+       /* T = rotr32(X, 8); */                         \
+       /* X ^= T */                                    \
+       vpxor x0, x3, t0;                               \
+       vpxor x1, x0, t1;                               \
+       vpxor x2, x1, t2;                               \
+       vpxor x3, x2, t3;                               \
+       /* X = T ^ rotr(X, 16); */                      \
+       vpxor t2, x0, x0;                               \
+       vpxor x1, t3, t3;                               \
+       vpxor t0, x2, x2;                               \
+       vpxor t1, x3, x1;                               \
+       vmovdqu t3, x3;
+
+/*
+ * aria_diff_word: word-level diffusion across the four 4-word groups
+ * T0 = {x0..x3}, T1 = {x4..x7}, T2 = {y0..y3}, T3 = {y4..y7}.
+ * Applies the XOR chain annotated inline (t1 ^= t2; t2 ^= t3; t0 ^= t1;
+ * t3 ^= t1; t2 ^= t0; t1 ^= t2) lane-wise on all sixteen registers.
+ * Callers pass the register lists permuted to realize the per-round byte
+ * permutations (see the aria_diff_byte() comments at the call sites).
+ */
+#define aria_diff_word(x0, x1, x2, x3,                 \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7)                  \
+       /* t1 ^= t2; */                                 \
+       vpxor y0, x4, x4;                               \
+       vpxor y1, x5, x5;                               \
+       vpxor y2, x6, x6;                               \
+       vpxor y3, x7, x7;                               \
+                                                       \
+       /* t2 ^= t3; */                                 \
+       vpxor y4, y0, y0;                               \
+       vpxor y5, y1, y1;                               \
+       vpxor y6, y2, y2;                               \
+       vpxor y7, y3, y3;                               \
+                                                       \
+       /* t0 ^= t1; */                                 \
+       vpxor x4, x0, x0;                               \
+       vpxor x5, x1, x1;                               \
+       vpxor x6, x2, x2;                               \
+       vpxor x7, x3, x3;                               \
+                                                       \
+       /* t3 ^= t1; */                                 \
+       vpxor x4, y4, y4;                               \
+       vpxor x5, y5, y5;                               \
+       vpxor x6, y6, y6;                               \
+       vpxor x7, y7, y7;                               \
+                                                       \
+       /* t2 ^= t0; */                                 \
+       vpxor x0, y0, y0;                               \
+       vpxor x1, y1, y1;                               \
+       vpxor x2, y2, y2;                               \
+       vpxor x3, y3, y3;                               \
+                                                       \
+       /* t1 ^= t2; */                                 \
+       vpxor y0, x4, x4;                               \
+       vpxor y1, x5, x5;                               \
+       vpxor y2, x6, x6;                               \
+       vpxor y3, x7, x7;
+
+/*
+ * aria_fe: one even-type ARIA round over 32 blocks, processed as two
+ * 16-block halves spilled in mem_tmp at offsets 8 and 0.  Per half:
+ * AddRoundKey (aria_ark_8way), substitution layer (aria_sbox_8way —
+ * invoked with the register order rotated to select the even-round S-box
+ * arrangement), then aria_diff_m per 4-register group.  The trailing
+ * aria_diff_word pair plus the permuted final store implement the
+ * word/byte diffusion documented in the aria_diff_byte() comment below.
+ */
+#define aria_fe(x0, x1, x2, x3,                                \
+               x4, x5, x6, x7,                         \
+               y0, y1, y2, y3,                         \
+               y4, y5, y6, y7,                         \
+               mem_tmp, rk, round)                     \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 8, round);                \
+                                                       \
+       aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7); \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 0, round);                \
+                                                       \
+       aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7); \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 0);              \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);               \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7);                 \
+       /* aria_diff_byte()                             \
+        * T3 = ABCD -> BADC                            \
+        * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6        \
+        * T0 = ABCD -> CDAB                            \
+        * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1        \
+        * T1 = ABCD -> DCBA                            \
+        * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4        \
+        */                                             \
+       aria_diff_word(x2, x3, x0, x1,                  \
+                      x7, x6, x5, x4,                  \
+                      y0, y1, y2, y3,                  \
+                      y5, y4, y7, y6);                 \
+       aria_store_state_8way(x3, x2, x1, x0,           \
+                             x6, x7, x4, x5,           \
+                             mem_tmp, 0);
+
+/*
+ * aria_fo: one odd-type ARIA round over 32 blocks (two 16-block halves in
+ * mem_tmp at offsets 8 and 0).  Identical structure to aria_fe, but
+ * aria_sbox_8way is invoked with the natural register order (odd-round
+ * S-box arrangement) and the trailing diffusion uses the odd-round byte
+ * permutation spelled out in the aria_diff_byte() comment below.
+ */
+#define aria_fo(x0, x1, x2, x3,                                \
+               x4, x5, x6, x7,                         \
+               y0, y1, y2, y3,                         \
+               y4, y5, y6, y7,                         \
+               mem_tmp, rk, round)                     \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 8, round);                \
+                                                       \
+       aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7); \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 0, round);                \
+                                                       \
+       aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7); \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 0);              \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);               \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7);                 \
+       /* aria_diff_byte()                             \
+        * T1 = ABCD -> BADC                            \
+        * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6        \
+        * T2 = ABCD -> CDAB                            \
+        * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1       \
+        * T3 = ABCD -> DCBA                            \
+        * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4        \
+        */                                             \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x5, x4, x7, x6,                  \
+                      y2, y3, y0, y1,                  \
+                      y7, y6, y5, y4);                 \
+       aria_store_state_8way(x3, x2, x1, x0,           \
+                             x6, x7, x4, x5,           \
+                             mem_tmp, 0);
+
+/*
+ * aria_ff: the final ARIA round over 32 blocks (two 16-block halves in
+ * mem_tmp at offsets 8 and 0).  Per half: AddRoundKey with `round`, the
+ * even-round substitution layer, then a second AddRoundKey with
+ * `last_round` — no diffusion layer, matching the final-round structure.
+ * On exit the x registers hold the low half and the y registers the high
+ * half (reloaded from mem_tmp offset 8).
+ */
+#define aria_ff(x0, x1, x2, x3,                                \
+               x4, x5, x6, x7,                         \
+               y0, y1, y2, y3,                         \
+               y4, y5, y6, y7,                         \
+               mem_tmp, rk, round, last_round)         \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 8, round);                \
+                                                       \
+       aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7); \
+                                                       \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 8, last_round);           \
+                                                       \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 0, round);                \
+                                                       \
+       aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7); \
+                                                       \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 0, last_round);           \
+                                                       \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);
+
+#ifdef CONFIG_AS_GFNI
+/*
+ * aria_fe_gfni: GFNI variant of aria_fe (even-type round over 32 blocks).
+ * Same round structure as aria_fe, with the substitution layer done by
+ * aria_sbox_8way_gfni instead of the AES-NI-based aria_sbox_8way.
+ */
+#define aria_fe_gfni(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    mem_tmp, rk, round)                \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 8, round);                \
+                                                       \
+       aria_sbox_8way_gfni(x2, x3, x0, x1,             \
+                           x6, x7, x4, x5,             \
+                           y0, y1, y2, y3,             \
+                           y4, y5, y6, y7);            \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 0, round);                \
+                                                       \
+       aria_sbox_8way_gfni(x2, x3, x0, x1,             \
+                           x6, x7, x4, x5,             \
+                           y0, y1, y2, y3,             \
+                           y4, y5, y6, y7);            \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 0);              \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);               \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7);                 \
+       /* aria_diff_byte()                             \
+        * T3 = ABCD -> BADC                            \
+        * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6        \
+        * T0 = ABCD -> CDAB                            \
+        * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1        \
+        * T1 = ABCD -> DCBA                            \
+        * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4        \
+        */                                             \
+       aria_diff_word(x2, x3, x0, x1,                  \
+                      x7, x6, x5, x4,                  \
+                      y0, y1, y2, y3,                  \
+                      y5, y4, y7, y6);                 \
+       aria_store_state_8way(x3, x2, x1, x0,           \
+                             x6, x7, x4, x5,           \
+                             mem_tmp, 0);
+
+/*
+ * aria_fo_gfni: GFNI variant of aria_fo (odd-type round over 32 blocks).
+ * Same round structure as aria_fo, with the substitution layer done by
+ * aria_sbox_8way_gfni in natural register order.
+ */
+#define aria_fo_gfni(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    mem_tmp, rk, round)                \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 8, round);                \
+                                                       \
+       aria_sbox_8way_gfni(x0, x1, x2, x3,             \
+                           x4, x5, x6, x7,             \
+                           y0, y1, y2, y3,             \
+                           y4, y5, y6, y7);            \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 0, round);                \
+                                                       \
+       aria_sbox_8way_gfni(x0, x1, x2, x3,             \
+                           x4, x5, x6, x7,             \
+                           y0, y1, y2, y3,             \
+                           y4, y5, y6, y7);            \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 0);              \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);               \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7);                 \
+       /* aria_diff_byte()                             \
+        * T1 = ABCD -> BADC                            \
+        * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6        \
+        * T2 = ABCD -> CDAB                            \
+        * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1       \
+        * T3 = ABCD -> DCBA                            \
+        * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4        \
+        */                                             \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x5, x4, x7, x6,                  \
+                      y2, y3, y0, y1,                  \
+                      y7, y6, y5, y4);                 \
+       aria_store_state_8way(x3, x2, x1, x0,           \
+                             x6, x7, x4, x5,           \
+                             mem_tmp, 0);
+
+/*
+ * aria_ff_gfni: GFNI variant of aria_ff (final round over 32 blocks).
+ * AddRoundKey(round) -> substitution (aria_sbox_8way_gfni) ->
+ * AddRoundKey(last_round) per half; no diffusion layer in the final round.
+ */
+#define aria_ff_gfni(x0, x1, x2, x3,                   \
+               x4, x5, x6, x7,                         \
+               y0, y1, y2, y3,                         \
+               y4, y5, y6, y7,                         \
+               mem_tmp, rk, round, last_round)         \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 8, round);                \
+                                                       \
+       aria_sbox_8way_gfni(x2, x3, x0, x1,             \
+                           x6, x7, x4, x5,             \
+                           y0, y1, y2, y3,             \
+                           y4, y5, y6, y7);            \
+                                                       \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 8, last_round);           \
+                                                       \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 0, round);                \
+                                                       \
+       aria_sbox_8way_gfni(x2, x3, x0, x1,             \
+                           x6, x7, x4, x5,             \
+                           y0, y1, y2, y3,             \
+                           y4, y5, y6, y7);            \
+                                                       \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 0, last_round);           \
+                                                       \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);
+#endif /* CONFIG_AS_GFNI */
+
+#ifdef CONFIG_AS_VAES
+/*
+ * aria_fe_vaes: VAES variant of aria_fe (even-type round over 32 blocks).
+ * Same round structure as aria_fe, with the substitution layer done by
+ * aria_sbox_8way_vaes (256-bit-wide AES instructions).
+ */
+#define aria_fe_vaes(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    mem_tmp, rk, round)                \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 8, round);                \
+                                                       \
+       aria_sbox_8way_vaes(x2, x3, x0, x1, x6, x7, x4, \
+                           x5, y0, y1, y2, y3, y4, y5, \
+                           y6, y7);                    \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 0, round);                \
+                                                       \
+       aria_sbox_8way_vaes(x2, x3, x0, x1, x6, x7, x4, \
+                           x5, y0, y1, y2, y3, y4, y5, \
+                           y6, y7);                    \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 0);              \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);               \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7);                 \
+       /* aria_diff_byte()                             \
+        * T3 = ABCD -> BADC                            \
+        * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6        \
+        * T0 = ABCD -> CDAB                            \
+        * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1        \
+        * T1 = ABCD -> DCBA                            \
+        * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4        \
+        */                                             \
+       aria_diff_word(x2, x3, x0, x1,                  \
+                      x7, x6, x5, x4,                  \
+                      y0, y1, y2, y3,                  \
+                      y5, y4, y7, y6);                 \
+       aria_store_state_8way(x3, x2, x1, x0,           \
+                             x6, x7, x4, x5,           \
+                             mem_tmp, 0);
+
+/*
+ * aria_fo_vaes: VAES variant of aria_fo (odd-type round over 32 blocks).
+ * Same round structure as aria_fo, with the substitution layer done by
+ * aria_sbox_8way_vaes in natural register order.
+ */
+#define aria_fo_vaes(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    mem_tmp, rk, round)                \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 8, round);                \
+                                                       \
+       aria_sbox_8way_vaes(x0, x1, x2, x3, x4, x5, x6, \
+                           x7, y0, y1, y2, y3, y4, y5, \
+                           y6, y7);                    \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 0, round);                \
+                                                       \
+       aria_sbox_8way_vaes(x0, x1, x2, x3, x4, x5, x6, \
+                           x7, y0, y1, y2, y3, y4, y5, \
+                           y6, y7);                    \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);    \
+       aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);    \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 0);              \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);               \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7);                 \
+       /* aria_diff_byte()                             \
+        * T1 = ABCD -> BADC                            \
+        * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6        \
+        * T2 = ABCD -> CDAB                            \
+        * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1       \
+        * T3 = ABCD -> DCBA                            \
+        * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4        \
+        */                                             \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x5, x4, x7, x6,                  \
+                      y2, y3, y0, y1,                  \
+                      y7, y6, y5, y4);                 \
+       aria_store_state_8way(x3, x2, x1, x0,           \
+                             x6, x7, x4, x5,           \
+                             mem_tmp, 0);
+
+/*
+ * aria_ff_vaes: VAES variant of aria_ff (final round over 32 blocks).
+ * AddRoundKey(round) -> substitution (aria_sbox_8way_vaes) ->
+ * AddRoundKey(last_round) per half; no diffusion layer in the final round.
+ */
+#define aria_ff_vaes(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    mem_tmp, rk, round, last_round)    \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 8, round);                \
+                                                       \
+       aria_sbox_8way_vaes(x2, x3, x0, x1, x6, x7, x4, \
+                           x5, y0, y1, y2, y3, y4, y5, \
+                           y6, y7);                    \
+                                                       \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 8, last_round);           \
+                                                       \
+       aria_store_state_8way(x0, x1, x2, x3,           \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, 8);              \
+                                                       \
+       aria_load_state_8way(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, 0);               \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 0, round);                \
+                                                       \
+       aria_sbox_8way_vaes(x2, x3, x0, x1, x6, x7, x4, \
+                           x5, y0, y1, y2, y3, y4, y5, \
+                           y6, y7);                    \
+                                                       \
+       aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,   \
+                     y0, rk, 0, last_round);           \
+                                                       \
+       aria_load_state_8way(y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            mem_tmp, 8);
+#endif /* CONFIG_AS_VAES */
+
+SECTION_RODATA
+.align 32
+/* pshufb mask builder: SHUFB_BYTES(idx) selects byte `idx` of each 32-bit
+ * word (bytes idx, idx+4, idx+8, idx+12 of a 16-byte lane); the four masks
+ * together drive the 16x16-byte transpose used to byte-slice the state
+ * (per the .Lshufb_16x16b label name — confirm against the transpose
+ * macro, which is defined elsewhere in this file). */
+#define SHUFB_BYTES(idx) \
+       0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+.Lshufb_16x16b:
+       .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+       .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+
+.align 32
+/* .Lbige_addb_N_M: vpaddb constants whose only non-zero byte is the final
+ * (least-significant big-endian) byte of each 16-byte lane, holding N in
+ * the low lane and M in the high lane — presumably used to add small
+ * increments to a big-endian CTR counter without a byteswap. */
+.Lbige_addb_0_1:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+.Lbige_addb_2_3:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
+.Lbige_addb_4_5:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
+.Lbige_addb_6_7:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
+.Lbige_addb_8_9:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
+.Lbige_addb_10_11:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
+.Lbige_addb_12_13:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
+.Lbige_addb_14_15:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
+.Lbige_addb_16_16:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16
+
+.align 16
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+       .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+       .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+/* AES forward ShiftRows permutation as a pshufb mask */
+.Lshift_row:
+       .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
+       .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+       .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
+       .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
+
+/* AES inverse affine and S2 combined:
+ *      1 1 0 0 0 0 0 1     x0     0
+ *      0 1 0 0 1 0 0 0     x1     0
+ *      1 1 0 0 1 1 1 1     x2     0
+ *      0 1 1 0 1 0 0 1     x3     1
+ *      0 1 0 0 1 1 0 0  *  x4  +  0
+ *      0 1 0 1 1 0 0 0     x5     0
+ *      0 0 0 0 0 1 0 1     x6     0
+ *      1 1 1 0 0 1 1 1     x7     1
+ */
+/* The affine map above packed as 128-bit low/high-nibble lookup tables —
+ * presumably consumed by filter_8bit(); confirm against the loads of the
+ * t2/t3 constants at its call sites. */
+.Ltf_lo__inv_aff__and__s2:
+       .octa 0x92172DA81A9FA520B2370D883ABF8500
+.Ltf_hi__inv_aff__and__s2:
+       .octa 0x2B15FFC1AF917B45E6D8320C625CB688
+
+/* X2 and AES forward affine combined:
+ *      1 0 1 1 0 0 0 1     x0     0
+ *      0 1 1 1 1 0 1 1     x1     0
+ *      0 0 0 1 1 0 1 0     x2     1
+ *      0 1 0 0 0 1 0 0     x3     0
+ *      0 0 1 1 1 0 1 1  *  x4  +  0
+ *      0 1 0 0 1 0 0 0     x5     0
+ *      1 1 0 1 0 0 1 1     x6     0
+ *      0 1 0 0 1 0 1 0     x7     0
+ */
+.Ltf_lo__x2__and__fwd_aff:
+       .octa 0xEFAE0544FCBD1657B8F95213ABEA4100
+.Ltf_hi__x2__and__fwd_aff:
+       .octa 0x3F893781E95FE1576CDA64D2BA0CB204
+
+#ifdef CONFIG_AS_GFNI
+.align 8
+/* 8x8 bitmatrices (built with the BV8/BM8X8 helper macros defined
+ * elsewhere in this file) for the GFNI affine instructions — per the
+ * CONFIG_AS_GFNI guard, presumably operands of vgf2p8affineqb; the
+ * tf_*_const values are the corresponding affine constant terms. */
+/* AES affine: */
+#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
+.Ltf_aff_bitmatrix:
+       .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
+                   BV8(1, 1, 0, 0, 0, 1, 1, 1),
+                   BV8(1, 1, 1, 0, 0, 0, 1, 1),
+                   BV8(1, 1, 1, 1, 0, 0, 0, 1),
+                   BV8(1, 1, 1, 1, 1, 0, 0, 0),
+                   BV8(0, 1, 1, 1, 1, 1, 0, 0),
+                   BV8(0, 0, 1, 1, 1, 1, 1, 0),
+                   BV8(0, 0, 0, 1, 1, 1, 1, 1))
+
+/* AES inverse affine: */
+#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
+.Ltf_inv_bitmatrix:
+       .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
+                   BV8(1, 0, 0, 1, 0, 0, 1, 0),
+                   BV8(0, 1, 0, 0, 1, 0, 0, 1),
+                   BV8(1, 0, 1, 0, 0, 1, 0, 0),
+                   BV8(0, 1, 0, 1, 0, 0, 1, 0),
+                   BV8(0, 0, 1, 0, 1, 0, 0, 1),
+                   BV8(1, 0, 0, 1, 0, 1, 0, 0),
+                   BV8(0, 1, 0, 0, 1, 0, 1, 0))
+
+/* S2: */
+#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
+.Ltf_s2_bitmatrix:
+       .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
+                   BV8(0, 0, 1, 1, 1, 1, 1, 1),
+                   BV8(1, 1, 1, 0, 1, 1, 0, 1),
+                   BV8(1, 1, 0, 0, 0, 0, 1, 1),
+                   BV8(0, 1, 0, 0, 0, 0, 1, 1),
+                   BV8(1, 1, 0, 0, 1, 1, 1, 0),
+                   BV8(0, 1, 1, 0, 0, 0, 1, 1),
+                   BV8(1, 1, 1, 1, 0, 1, 1, 0))
+
+/* X2: */
+#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
+.Ltf_x2_bitmatrix:
+       .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
+                   BV8(0, 0, 1, 0, 0, 1, 1, 0),
+                   BV8(0, 0, 0, 0, 1, 0, 1, 0),
+                   BV8(1, 1, 1, 0, 0, 0, 1, 1),
+                   BV8(1, 1, 1, 0, 1, 1, 0, 0),
+                   BV8(0, 1, 1, 0, 1, 0, 1, 1),
+                   BV8(1, 0, 1, 1, 1, 1, 0, 1),
+                   BV8(1, 0, 0, 1, 0, 0, 1, 1))
+
+/* Identity matrix: */
+.Ltf_id_bitmatrix:
+       .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
+                   BV8(0, 1, 0, 0, 0, 0, 0, 0),
+                   BV8(0, 0, 1, 0, 0, 0, 0, 0),
+                   BV8(0, 0, 0, 1, 0, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 1, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 1, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 1, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 0, 1))
+
+#endif /* CONFIG_AS_GFNI */
+
+/* 4-bit mask */
+/* Broadcast into a full ymm via `vpbroadcastd .L0f0f0f0f rRIP` (see the
+ * S-box macros above) to mask low nibbles. */
+.align 4
+.L0f0f0f0f:
+       .long 0x0f0f0f0f
+
+.text
+
+.align 16
+ELF(.type __aria_aesni_avx2_crypt_32way,@function;)
+__aria_aesni_avx2_crypt_32way:
+       /* input:
+        *      %r9: rk
+        *      %rsi: dst
+        *      %rdx: src
+        *      %ymm0..%ymm15: byte-sliced blocks
+        */
+       /* Core ARIA round loop for 32 blocks (AESNI/AVX2 variant).  The
+        * dst buffer doubles as a 16*32-byte scratch area: %rax addresses
+        * its first half, %r8 its second half.  %r10d holds the number of
+        * middle rounds still to run (total rounds minus the first fo and
+        * the final ff). */
+       CFI_STARTPROC();
+
+       movq %rsi, %rax;
+       leaq 8 * 32(%rax), %r8;
+
+       movl ARIA_CTX_rounds(CTX), %r10d;
+       subl $2, %r10d;
+
+       inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                     %ymm15, %rax, %r8);
+       aria_fo(%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
+               %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+               %rax, %r9, 0);
+       leaq 1*16(%r9), %r9;
+
+.align 16
+.Loop_aesni:
+       /* Two rounds (odd fe + even fo) per iteration; %r9 advances by
+        * one 16-byte round key per round. */
+       aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+               %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+               %ymm15, %rax, %r9, 0);
+       aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
+               %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+               %rax, %r9, 1);
+       leaq 2*16(%r9), %r9;
+       subl $2, %r10d;
+       jnz .Loop_aesni;
+
+       aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+               %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+               %ymm15, %rax, %r9, 0, 1);
+
+       /* Undo byte-slicing; callers consume the result registers in the
+        * permuted order %ymm1, %ymm0, %ymm3, %ymm2, %ymm4..%ymm15. */
+       debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
+                          %ymm9, %ymm13, %ymm0, %ymm5,
+                          %ymm10, %ymm14, %ymm3, %ymm6,
+                          %ymm11, %ymm15, %ymm2, %ymm7,
+                          (%rax), (%r8));
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __aria_aesni_avx2_crypt_32way,.-__aria_aesni_avx2_crypt_32way;)
+
+.align 16
+.globl _gcry_aria_aesni_avx2_ecb_crypt_blk32
+ELF(.type _gcry_aria_aesni_avx2_ecb_crypt_blk32,@function;)
+_gcry_aria_aesni_avx2_ecb_crypt_blk32:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst
+        *      %rdx: src
+        *      %rcx: round keys
+        */
+       /* ECB processing of 32 blocks.  Direction is selected purely by
+        * the round-key array passed in %rcx (presumably enc_key or
+        * dec_key — confirm against the C caller). */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       /* Reserve a 32-byte-aligned 16*32-byte scratch area on the stack
+        * for the byte-slicing temporaries. */
+       subq $(16 * 32), %rsp;
+       andq $~31, %rsp;
+
+       movq %rcx, %r9;
+       movq %rsi, %r11;
+       movq %rsp, %rsi; /* use stack for temporary store */
+
+       inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                    %ymm15, %rdx);
+
+       call __aria_aesni_avx2_crypt_32way;
+
+       write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                    %ymm15, %r11);
+
+       /* NOTE(review): %eax returns STACK_DEPTH — presumably the number
+        * of stack bytes the C caller must burn; confirm on the C side. */
+       movl $STACK_DEPTH, %eax;
+       leave;
+       CFI_LEAVE();
+       vzeroall;
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_aria_aesni_avx2_ecb_crypt_blk32,
+         .-_gcry_aria_aesni_avx2_ecb_crypt_blk32;)
+
+.align 16
+ELF(.type __aria_aesni_avx2_ctr_gen_keystream_32way,@function;)
+__aria_aesni_avx2_ctr_gen_keystream_32way:
+       /* input:
+        *      %rdi: ctx
+        *      %rsi: dst
+        *      %rdx: src
+        *      %rcx: keystream
+        *      %r8: iv (big endian, 128bit)
+        */
+       /* Builds 32 consecutive counter blocks.  On return %ymm0..%ymm7
+        * hold counters +0..+15 and %ymm8..%ymm15 hold +16..+31 (two
+        * 128-bit blocks per register) and the IV at (%r8) has been
+        * advanced by 32. */
+       CFI_STARTPROC();
+
+       /* Fast path: if adding 32 cannot carry out of IV byte 15
+        * (byte <= 0xe0), use cheap byte-wise vector adds.  The flags
+        * are reused at .Lctr_byteadd: equality (byte == 0xe0) means the
+        * low byte wraps exactly and the carry must be propagated. */
+       cmpb $(0x100 - 32), 15(%r8);
+       jbe .Lctr_byteadd;
+
+       movq 8(%r8), %r11;
+       bswapq %r11;
+
+       vbroadcasti128 .Lbswap128_mask rRIP, %ymm6;
+       vpcmpeqd %ymm0, %ymm0, %ymm0;
+       vpsrldq $8, %ymm0, %ymm0;   /* ab: -1:0 ; cd: -1:0 */
+       vpaddq %ymm0, %ymm0, %ymm5; /* ab: -2:0 ; cd: -2:0 */
+
+       /* load IV and byteswap */
+       vmovdqu (%r8), %xmm7;
+       vpshufb %xmm6, %xmm7, %xmm7;
+       vmovdqa %xmm7, %xmm3;
+       inc_le128(%xmm7, %xmm0, %xmm4);
+       vinserti128 $1, %xmm7, %ymm3, %ymm3;
+       vpshufb %ymm6, %ymm3, %ymm8; /* +1 ; +0 */
+
+       /* check need for handling 64-bit overflow and carry */
+       cmpq $(0xffffffffffffffff - 32), %r11;
+       ja .Lhandle_ctr_carry;
+
+       /* construct IVs */
+       /* No 64-bit carry possible: increment two counters at a time by
+        * subtracting -2 in little-endian form, byteswapping each pair
+        * back to big endian. */
+       vpsubq %ymm5, %ymm3, %ymm3; /* +3 ; +2 */
+       vpshufb %ymm6, %ymm3, %ymm9;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +5 ; +4 */
+       vpshufb %ymm6, %ymm3, %ymm10;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +7 ; +6 */
+       vpshufb %ymm6, %ymm3, %ymm11;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +9 ; +8 */
+       vpshufb %ymm6, %ymm3, %ymm12;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +11 ; +10 */
+       vpshufb %ymm6, %ymm3, %ymm13;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +13 ; +12 */
+       vpshufb %ymm6, %ymm3, %ymm14;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +15 ; +14 */
+       vpshufb %ymm6, %ymm3, %ymm15;
+       /* Spill counters +0..+15 to the keystream buffer so the same
+        * registers can be reused for +16..+31. */
+       vmovdqu %ymm8, (0 * 32)(%rcx);
+       vmovdqu %ymm9, (1 * 32)(%rcx);
+       vmovdqu %ymm10, (2 * 32)(%rcx);
+       vmovdqu %ymm11, (3 * 32)(%rcx);
+       vmovdqu %ymm12, (4 * 32)(%rcx);
+       vmovdqu %ymm13, (5 * 32)(%rcx);
+       vmovdqu %ymm14, (6 * 32)(%rcx);
+       vmovdqu %ymm15, (7 * 32)(%rcx);
+
+       vpsubq %ymm5, %ymm3, %ymm3; /* +17 ; +16 */
+       vpshufb %ymm6, %ymm3, %ymm8;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +19 ; +18 */
+       vpshufb %ymm6, %ymm3, %ymm9;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +21 ; +20 */
+       vpshufb %ymm6, %ymm3, %ymm10;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +23 ; +22 */
+       vpshufb %ymm6, %ymm3, %ymm11;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +25 ; +24 */
+       vpshufb %ymm6, %ymm3, %ymm12;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +27 ; +26 */
+       vpshufb %ymm6, %ymm3, %ymm13;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +29 ; +28 */
+       vpshufb %ymm6, %ymm3, %ymm14;
+       vpsubq %ymm5, %ymm3, %ymm3; /* +31 ; +30 */
+       vpshufb %ymm6, %ymm3, %ymm15;
+       /* Write the advanced IV (+32) back, then reload +0..+15 into
+        * %ymm0..%ymm7. */
+       vpsubq %ymm5, %ymm3, %ymm3; /* +32 */
+       vpshufb %xmm6, %xmm3, %xmm3;
+       vmovdqu %xmm3, (%r8);
+       vmovdqu (0 * 32)(%rcx), %ymm0;
+       vmovdqu (1 * 32)(%rcx), %ymm1;
+       vmovdqu (2 * 32)(%rcx), %ymm2;
+       vmovdqu (3 * 32)(%rcx), %ymm3;
+       vmovdqu (4 * 32)(%rcx), %ymm4;
+       vmovdqu (5 * 32)(%rcx), %ymm5;
+       vmovdqu (6 * 32)(%rcx), %ymm6;
+       vmovdqu (7 * 32)(%rcx), %ymm7;
+       jmp .Lctr_carry_done;
+
+       .Lhandle_ctr_carry:
+       /* construct IVs */
+       /* Slow path: the low 64-bit counter may overflow within these 32
+        * blocks, so increment one counter at a time with full 128-bit
+        * carry handling. */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm9; /* +3 ; +2 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm10; /* +5 ; +4 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm11; /* +7 ; +6 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm12; /* +9 ; +8 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm13; /* +11 ; +10 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm14; /* +13 ; +12 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm15; /* +15 ; +14 */
+       vmovdqu %ymm8, (0 * 32)(%rcx);
+       vmovdqu %ymm9, (1 * 32)(%rcx);
+       vmovdqu %ymm10, (2 * 32)(%rcx);
+       vmovdqu %ymm11, (3 * 32)(%rcx);
+       vmovdqu %ymm12, (4 * 32)(%rcx);
+       vmovdqu %ymm13, (5 * 32)(%rcx);
+       vmovdqu %ymm14, (6 * 32)(%rcx);
+       vmovdqu %ymm15, (7 * 32)(%rcx);
+
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm8; /* +17 ; +16 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm9; /* +19 ; +18 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm10; /* +21 ; +20 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm11; /* +23 ; +22 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm12; /* +25 ; +24 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm13; /* +27 ; +26 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm14; /* +29 ; +28 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vpshufb %ymm6, %ymm3, %ymm15; /* +31 ; +30 */
+       inc_le128(%ymm3, %ymm0, %ymm4);
+       vextracti128 $1, %ymm3, %xmm3;
+       vpshufb %xmm6, %xmm3, %xmm3; /* +32 */
+       vmovdqu %xmm3, (%r8);
+       vmovdqu (0 * 32)(%rcx), %ymm0;
+       vmovdqu (1 * 32)(%rcx), %ymm1;
+       vmovdqu (2 * 32)(%rcx), %ymm2;
+       vmovdqu (3 * 32)(%rcx), %ymm3;
+       vmovdqu (4 * 32)(%rcx), %ymm4;
+       vmovdqu (5 * 32)(%rcx), %ymm5;
+       vmovdqu (6 * 32)(%rcx), %ymm6;
+       vmovdqu (7 * 32)(%rcx), %ymm7;
+
+.Lctr_carry_done:
+       ret_spec_stop;
+
+.align 8
+.Lctr_byteadd_full_ctr_carry:
+       /* Low byte wraps exactly: propagate the carry from the +32
+        * through IV bytes 14..1.  NOTE(review): the loop exits after
+        * offset 1, so a carry into byte 0 is dropped; this only matters
+        * after 2^120 blocks — confirm this matches upstream intent. */
+       addb $32, 15(%r8);
+       pushq %rcx;
+       movl $14, %ecx;
+       1:
+         adcb $0, (%r8, %rcx);
+         jnc 2f;
+         loop 1b;
+       2:
+       popq %rcx;
+       jmp .Lctr_byteadd_ymm;
+.align 8
+.Lctr_byteadd:
+       vbroadcasti128 (%r8), %ymm8;
+       je .Lctr_byteadd_full_ctr_carry;
+       addb $32, 15(%r8);
+.Lctr_byteadd_ymm:
+       /* Build counters +0..+31 by byte-wise adds of the .Lbige_addb_*
+        * tables (defined earlier in this file, not visible here) to the
+        * broadcast IV — no cross-byte carries can occur on this path. */
+       vpaddb .Lbige_addb_16_16 rRIP, %ymm8, %ymm15;
+       vpaddb .Lbige_addb_0_1 rRIP, %ymm8, %ymm0;
+       vpaddb .Lbige_addb_2_3 rRIP, %ymm8, %ymm1;
+       vpaddb .Lbige_addb_4_5 rRIP, %ymm8, %ymm2;
+       vpaddb .Lbige_addb_6_7 rRIP, %ymm8, %ymm3;
+       vpaddb .Lbige_addb_8_9 rRIP, %ymm8, %ymm4;
+       vpaddb .Lbige_addb_10_11 rRIP, %ymm8, %ymm5;
+       vpaddb .Lbige_addb_12_13 rRIP, %ymm8, %ymm6;
+       vpaddb .Lbige_addb_14_15 rRIP, %ymm8, %ymm7;
+       vpaddb .Lbige_addb_0_1 rRIP, %ymm15, %ymm8;
+       vpaddb .Lbige_addb_2_3 rRIP, %ymm15, %ymm9;
+       vpaddb .Lbige_addb_4_5 rRIP, %ymm15, %ymm10;
+       vpaddb .Lbige_addb_6_7 rRIP, %ymm15, %ymm11;
+       vpaddb .Lbige_addb_8_9 rRIP, %ymm15, %ymm12;
+       vpaddb .Lbige_addb_10_11 rRIP, %ymm15, %ymm13;
+       vpaddb .Lbige_addb_12_13 rRIP, %ymm15, %ymm14;
+       vpaddb .Lbige_addb_14_15 rRIP, %ymm15, %ymm15;
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __aria_aesni_avx2_ctr_gen_keystream_32way,
+         .-__aria_aesni_avx2_ctr_gen_keystream_32way;)
+
+.align 16
+.globl _gcry_aria_aesni_avx2_ctr_crypt_blk32
+ELF(.type _gcry_aria_aesni_avx2_ctr_crypt_blk32,@function;)
+_gcry_aria_aesni_avx2_ctr_crypt_blk32:
+       /* input:
+        *      %rdi: ctx
+        *      %rsi: dst
+        *      %rdx: src
+        *      %rcx: iv (big endian, 128bit)
+        */
+       /* CTR mode for 32 blocks: generate 32 counter blocks on the
+        * stack, encrypt them with the encryption round keys (CTR only
+        * ever encrypts the counter), then XOR with src into dst. */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       subq $(16 * 32), %rsp;
+       andq $~31, %rsp;
+
+       movq %rcx, %r8;  /* %r8: iv */
+       movq %rsp, %rcx; /* %rcx: keystream */
+       call __aria_aesni_avx2_ctr_gen_keystream_32way;
+
+       /* Save dst across the crypt call; feed the keystream buffer as
+        * both scratch (dst) and source of the crypt core. */
+       pushq %rsi;
+       movq %rdx, %r11;
+       movq %rcx, %rsi; /* use stack for temporary store */
+       movq %rcx, %rdx;
+       leaq ARIA_CTX_enc_key(CTX), %r9;
+
+       call __aria_aesni_avx2_crypt_32way;
+
+       popq %rsi;
+       /* XOR the encrypted counters with the source; register order
+        * matches the permuted output of the crypt core. */
+       vpxor (0 * 32)(%r11), %ymm1, %ymm1;
+       vpxor (1 * 32)(%r11), %ymm0, %ymm0;
+       vpxor (2 * 32)(%r11), %ymm3, %ymm3;
+       vpxor (3 * 32)(%r11), %ymm2, %ymm2;
+       vpxor (4 * 32)(%r11), %ymm4, %ymm4;
+       vpxor (5 * 32)(%r11), %ymm5, %ymm5;
+       vpxor (6 * 32)(%r11), %ymm6, %ymm6;
+       vpxor (7 * 32)(%r11), %ymm7, %ymm7;
+       vpxor (8 * 32)(%r11), %ymm8, %ymm8;
+       vpxor (9 * 32)(%r11), %ymm9, %ymm9;
+       vpxor (10 * 32)(%r11), %ymm10, %ymm10;
+       vpxor (11 * 32)(%r11), %ymm11, %ymm11;
+       vpxor (12 * 32)(%r11), %ymm12, %ymm12;
+       vpxor (13 * 32)(%r11), %ymm13, %ymm13;
+       vpxor (14 * 32)(%r11), %ymm14, %ymm14;
+       vpxor (15 * 32)(%r11), %ymm15, %ymm15;
+       write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                    %ymm15, %rsi);
+
+       movl $STACK_DEPTH, %eax;
+       leave;
+       CFI_LEAVE();
+       vzeroall;
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_aria_aesni_avx2_ctr_crypt_blk32,
+         .-_gcry_aria_aesni_avx2_ctr_crypt_blk32;)
+
+#ifdef CONFIG_AS_VAES
+.align 16
+ELF(.type __aria_vaes_avx2_crypt_32way,@function;)
+__aria_vaes_avx2_crypt_32way:
+       /* input:
+        *      %r9: rk
+        *      %rsi: dst
+        *      %rdx: src
+        *      %ymm0..%ymm15: byte-sliced blocks
+        */
+       /* Same structure as __aria_aesni_avx2_crypt_32way but using the
+        * VAES variants of the round macros. */
+       CFI_STARTPROC();
+
+       movq %rsi, %rax;
+       leaq 8 * 32(%rax), %r8;
+
+       movl ARIA_CTX_rounds(CTX), %r10d;
+       subl $2, %r10d;
+
+       inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                     %ymm15, %rax, %r8);
+       aria_fo_vaes(%ymm8, %ymm9, %ymm10, %ymm11,
+                    %ymm12, %ymm13, %ymm14, %ymm15,
+                    %ymm0, %ymm1, %ymm2, %ymm3,
+                    %ymm4, %ymm5, %ymm6, %ymm7,
+                    %rax, %r9, 0);
+       leaq 1*16(%r9), %r9;
+
+.align 16
+.Loop_vaes:
+       aria_fe_vaes(%ymm1, %ymm0, %ymm3, %ymm2,
+                    %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11,
+                    %ymm12, %ymm13, %ymm14, %ymm15,
+                    %rax, %r9, 0);
+       aria_fo_vaes(%ymm9, %ymm8, %ymm11, %ymm10,
+                    %ymm12, %ymm13, %ymm14, %ymm15,
+                    %ymm0, %ymm1, %ymm2, %ymm3,
+                    %ymm4, %ymm5, %ymm6, %ymm7,
+                    %rax, %r9, 1);
+       leaq 2*16(%r9), %r9;
+       subl $2, %r10d;
+       jnz .Loop_vaes;
+
+       aria_ff_vaes(%ymm1, %ymm0, %ymm3, %ymm2,
+                    %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11,
+                    %ymm12, %ymm13, %ymm14, %ymm15,
+                    %rax, %r9, 0, 1);
+
+       debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
+                          %ymm9, %ymm13, %ymm0, %ymm5,
+                          %ymm10, %ymm14, %ymm3, %ymm6,
+                          %ymm11, %ymm15, %ymm2, %ymm7,
+                          (%rax), (%r8));
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __aria_vaes_avx2_crypt_32way,.-__aria_vaes_avx2_crypt_32way;)
+
+.align 16
+.globl _gcry_aria_vaes_avx2_ecb_crypt_blk32
+ELF(.type _gcry_aria_vaes_avx2_ecb_crypt_blk32,@function;)
+_gcry_aria_vaes_avx2_ecb_crypt_blk32:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst
+        *      %rdx: src
+        *      %rcx: round keys
+        */
+       /* VAES variant of the ECB entry point; same flow as the AESNI
+        * version but calls the VAES crypt core. */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       subq $(16 * 32), %rsp;
+       andq $~31, %rsp;
+
+       movq %rcx, %r9;
+       movq %rsi, %r11;
+       movq %rsp, %rsi; /* use stack for temporary store */
+
+       inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                    %ymm15, %rdx);
+
+       call __aria_vaes_avx2_crypt_32way;
+
+       write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                    %ymm15, %r11);
+
+       movl $STACK_DEPTH, %eax;
+       leave;
+       CFI_LEAVE();
+       vzeroall;
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_aria_vaes_avx2_ecb_crypt_blk32,
+         .-_gcry_aria_vaes_avx2_ecb_crypt_blk32;)
+
+.align 16
+.globl _gcry_aria_vaes_avx2_ctr_crypt_blk32
+ELF(.type _gcry_aria_vaes_avx2_ctr_crypt_blk32,@function;)
+_gcry_aria_vaes_avx2_ctr_crypt_blk32:
+       /* input:
+        *      %rdi: ctx
+        *      %rsi: dst
+        *      %rdx: src
+        *      %rcx: iv (big endian, 128bit)
+        */
+       /* VAES variant of the CTR entry point; keystream generation is
+        * shared with the AESNI path, only the crypt core differs. */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       subq $(16 * 32), %rsp;
+       andq $~31, %rsp;
+
+       movq %rcx, %r8;  /* %r8: iv */
+       movq %rsp, %rcx; /* %rcx: keystream */
+       call __aria_aesni_avx2_ctr_gen_keystream_32way;
+
+       pushq %rsi;
+       movq %rdx, %r11;
+       movq %rcx, %rsi; /* use stack for temporary store */
+       movq %rcx, %rdx;
+       leaq ARIA_CTX_enc_key(CTX), %r9;
+
+       call __aria_vaes_avx2_crypt_32way;
+
+       popq %rsi;
+       vpxor (0 * 32)(%r11), %ymm1, %ymm1;
+       vpxor (1 * 32)(%r11), %ymm0, %ymm0;
+       vpxor (2 * 32)(%r11), %ymm3, %ymm3;
+       vpxor (3 * 32)(%r11), %ymm2, %ymm2;
+       vpxor (4 * 32)(%r11), %ymm4, %ymm4;
+       vpxor (5 * 32)(%r11), %ymm5, %ymm5;
+       vpxor (6 * 32)(%r11), %ymm6, %ymm6;
+       vpxor (7 * 32)(%r11), %ymm7, %ymm7;
+       vpxor (8 * 32)(%r11), %ymm8, %ymm8;
+       vpxor (9 * 32)(%r11), %ymm9, %ymm9;
+       vpxor (10 * 32)(%r11), %ymm10, %ymm10;
+       vpxor (11 * 32)(%r11), %ymm11, %ymm11;
+       vpxor (12 * 32)(%r11), %ymm12, %ymm12;
+       vpxor (13 * 32)(%r11), %ymm13, %ymm13;
+       vpxor (14 * 32)(%r11), %ymm14, %ymm14;
+       vpxor (15 * 32)(%r11), %ymm15, %ymm15;
+       write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                    %ymm15, %rsi);
+
+       movl $STACK_DEPTH, %eax;
+       leave;
+       CFI_LEAVE();
+       vzeroall;
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_aria_vaes_avx2_ctr_crypt_blk32,
+         .-_gcry_aria_vaes_avx2_ctr_crypt_blk32;)
+#endif /* CONFIG_AS_VAES */
+
+#ifdef CONFIG_AS_GFNI
+.align 16
+ELF(.type __aria_gfni_avx2_crypt_32way,@function;)
+__aria_gfni_avx2_crypt_32way:
+       /* input:
+        *      %r9: rk
+        *      %rsi: dst
+        *      %rdx: src
+        *      %ymm0..%ymm15: byte-sliced blocks
+        */
+       /* Same structure as __aria_aesni_avx2_crypt_32way but using the
+        * GFNI variants of the round macros. */
+       CFI_STARTPROC();
+
+       movq %rsi, %rax;
+       leaq 8 * 32(%rax), %r8;
+
+       movl ARIA_CTX_rounds(CTX), %r10d;
+       subl $2, %r10d;
+
+       inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3,
+                     %ymm4, %ymm5, %ymm6, %ymm7,
+                     %ymm8, %ymm9, %ymm10, %ymm11,
+                     %ymm12, %ymm13, %ymm14,
+                     %ymm15, %rax, %r8);
+       aria_fo_gfni(%ymm8, %ymm9, %ymm10, %ymm11,
+                    %ymm12, %ymm13, %ymm14, %ymm15,
+                    %ymm0, %ymm1, %ymm2, %ymm3,
+                    %ymm4, %ymm5, %ymm6, %ymm7,
+                    %rax, %r9, 0);
+       leaq 1*16(%r9), %r9;
+
+.align 16
+.Loop_gfni:
+       aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+                    %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11,
+                    %ymm12, %ymm13, %ymm14,
+                    %ymm15, %rax, %r9, 0);
+       aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
+                    %ymm12, %ymm13, %ymm14, %ymm15,
+                    %ymm0, %ymm1, %ymm2, %ymm3,
+                    %ymm4, %ymm5, %ymm6, %ymm7,
+                    %rax, %r9, 1);
+       leaq 2*16(%r9), %r9;
+       subl $2, %r10d;
+       jnz .Loop_gfni;
+
+       aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                    %ymm15, %rax, %r9, 0, 1);
+
+       debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
+                          %ymm9, %ymm13, %ymm0, %ymm5,
+                          %ymm10, %ymm14, %ymm3, %ymm6,
+                          %ymm11, %ymm15, %ymm2, %ymm7,
+                          (%rax), (%r8));
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __aria_gfni_avx2_crypt_32way,.-__aria_gfni_avx2_crypt_32way;)
+
+.align 16
+.globl _gcry_aria_gfni_avx2_ecb_crypt_blk32
+ELF(.type _gcry_aria_gfni_avx2_ecb_crypt_blk32,@function;)
+_gcry_aria_gfni_avx2_ecb_crypt_blk32:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst
+        *      %rdx: src
+        *      %rcx: round keys
+        */
+       /* GFNI variant of the ECB entry point; same flow as the AESNI
+        * version but calls the GFNI crypt core. */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       subq $(16 * 32), %rsp;
+       andq $~31, %rsp;
+
+       movq %rcx, %r9;
+       movq %rsi, %r11;
+       movq %rsp, %rsi; /* use stack for temporary store */
+
+       inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                    %ymm15, %rdx);
+
+       call __aria_gfni_avx2_crypt_32way;
+
+       write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                    %ymm15, %r11);
+
+       movl $STACK_DEPTH, %eax;
+       leave;
+       CFI_LEAVE();
+       vzeroall;
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_aria_gfni_avx2_ecb_crypt_blk32,
+         .-_gcry_aria_gfni_avx2_ecb_crypt_blk32;)
+
+.align 16
+.globl _gcry_aria_gfni_avx2_ctr_crypt_blk32
+ELF(.type _gcry_aria_gfni_avx2_ctr_crypt_blk32,@function;)
+_gcry_aria_gfni_avx2_ctr_crypt_blk32:
+       /* input:
+        *      %rdi: ctx
+        *      %rsi: dst
+        *      %rdx: src
+        *      %rcx: iv (big endian, 128bit)
+        */
+       /* GFNI variant of the CTR entry point; keystream generation is
+        * shared with the AESNI path, only the crypt core differs. */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       subq $(16 * 32), %rsp;
+       andq $~31, %rsp;
+
+       movq %rcx, %r8;  /* %r8: iv */
+       movq %rsp, %rcx; /* %rcx: keystream */
+       call __aria_aesni_avx2_ctr_gen_keystream_32way;
+
+       pushq %rsi;
+       movq %rdx, %r11;
+       movq %rcx, %rsi; /* use stack for temporary store */
+       movq %rcx, %rdx;
+       leaq ARIA_CTX_enc_key(CTX), %r9;
+
+       call __aria_gfni_avx2_crypt_32way;
+
+       popq %rsi;
+       vpxor (0 * 32)(%r11), %ymm1, %ymm1;
+       vpxor (1 * 32)(%r11), %ymm0, %ymm0;
+       vpxor (2 * 32)(%r11), %ymm3, %ymm3;
+       vpxor (3 * 32)(%r11), %ymm2, %ymm2;
+       vpxor (4 * 32)(%r11), %ymm4, %ymm4;
+       vpxor (5 * 32)(%r11), %ymm5, %ymm5;
+       vpxor (6 * 32)(%r11), %ymm6, %ymm6;
+       vpxor (7 * 32)(%r11), %ymm7, %ymm7;
+       vpxor (8 * 32)(%r11), %ymm8, %ymm8;
+       vpxor (9 * 32)(%r11), %ymm9, %ymm9;
+       vpxor (10 * 32)(%r11), %ymm10, %ymm10;
+       vpxor (11 * 32)(%r11), %ymm11, %ymm11;
+       vpxor (12 * 32)(%r11), %ymm12, %ymm12;
+       vpxor (13 * 32)(%r11), %ymm13, %ymm13;
+       vpxor (14 * 32)(%r11), %ymm14, %ymm14;
+       vpxor (15 * 32)(%r11), %ymm15, %ymm15;
+       write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+                    %ymm15, %rsi);
+
+       movl $STACK_DEPTH, %eax;
+       leave;
+       CFI_LEAVE();
+       vzeroall;
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_aria_gfni_avx2_ctr_crypt_blk32,
+         .-_gcry_aria_gfni_avx2_ctr_crypt_blk32;)
+#endif /* CONFIG_AS_GFNI */
+
+#endif /* ENABLE_AVX2_SUPPORT && ENABLE_AESNI_SUPPORT */
+#endif /* __x86_64 */
diff --git a/grub-core/lib/libgcrypt/cipher/aria-gfni-avx512-amd64.S b/grub-core/lib/libgcrypt/cipher/aria-gfni-avx512-amd64.S
new file mode 100644
index 000000000..0eaa2de8f
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/aria-gfni-avx512-amd64.S
@@ -0,0 +1,1010 @@
+/* aria-gfni-avx512-amd64.S  -  GFNI/AVX512 implementation of ARIA cipher
+ *
+ * Copyright (C) 2022-2023 Taehee Yoo <ap420073@gmail.com>
+ * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+    defined(ENABLE_AVX512_SUPPORT) && defined(ENABLE_GFNI_SUPPORT)
+
+#include "asm-common-amd64.h"
+
+/* struct ARIA_context: */
+#define ARIA_BLOCK_SIZE  16
+#define ARIA_MAX_RD_KEYS 17
+#define ARIA_CTX_enc_key 0
+#define ARIA_CTX_dec_key (ARIA_CTX_enc_key + (ARIA_BLOCK_SIZE * ARIA_MAX_RD_KEYS))
+#define ARIA_CTX_rounds (ARIA_CTX_dec_key + (ARIA_BLOCK_SIZE * ARIA_MAX_RD_KEYS))
+
+/* register macros */
+#define CTX %rdi
+
+/* helper macros */
+#define STACK_DEPTH (2 * 8 + 16 * 64 + 63)
+
+#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)            \
+       ( (((a0) & 1) << 0) |                           \
+         (((a1) & 1) << 1) |                           \
+         (((a2) & 1) << 2) |                           \
+         (((a3) & 1) << 3) |                           \
+         (((a4) & 1) << 4) |                           \
+         (((a5) & 1) << 5) |                           \
+         (((a6) & 1) << 6) |                           \
+         (((a7) & 1) << 7) )
+
+#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)          \
+       ( ((l7) << (0 * 8)) |                           \
+         ((l6) << (1 * 8)) |                           \
+         ((l5) << (2 * 8)) |                           \
+         ((l4) << (3 * 8)) |                           \
+         ((l3) << (4 * 8)) |                           \
+         ((l2) << (5 * 8)) |                           \
+         ((l1) << (6 * 8)) |                           \
+         ((l0) << (7 * 8)) )
+
+/* asm macros */
+#define clear_vec4(v0,v1,v2,v3) \
+       vpxord v0, v0, v0; \
+       vpxord v1, v1, v1; \
+       vpxord v2, v2, v2; \
+       vpxord v3, v3, v3
+
+#define clear_zmm16_zmm31() \
+       clear_vec4(%ymm16, %ymm20, %ymm24, %ymm28); \
+       clear_vec4(%ymm17, %ymm21, %ymm25, %ymm29); \
+       clear_vec4(%ymm18, %ymm22, %ymm26, %ymm30); \
+       clear_vec4(%ymm19, %ymm23, %ymm27, %ymm31)
+
+#define clear_regs() \
+       kxorq %k1, %k1, %k1; \
+       vzeroall; \
+       clear_zmm16_zmm31()
+
+#define add_le128(out, in, lo_counter, hi_counter1)    \
+       vpaddq lo_counter, in, out;                     \
+       vpcmpuq $1, lo_counter, out, %k1;               \
+       kaddb %k1, %k1, %k1;                            \
+       vpaddq hi_counter1, out, out{%k1};
+
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)     \
+       vpandq x, mask4bit, tmp0;                       \
+       vpandqn x, mask4bit, x;                         \
+       vpsrld $4, x, x;                                \
+                                                       \
+       vpshufb tmp0, lo_t, tmp0;                       \
+       vpshufb x, hi_t, x;                             \
+       vpxorq tmp0, x, x;
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2)          \
+       vpunpckhdq x1, x0, t2;                          \
+       vpunpckldq x1, x0, x0;                          \
+                                                       \
+       vpunpckldq x3, x2, t1;                          \
+       vpunpckhdq x3, x2, x2;                          \
+                                                       \
+       vpunpckhqdq t1, x0, x1;                         \
+       vpunpcklqdq t1, x0, x0;                         \
+                                                       \
+       vpunpckhqdq x2, t2, x3;                         \
+       vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b(a0, b0, c0, d0,               \
+                        a1, b1, c1, d1,                \
+                        a2, b2, c2, d2,                \
+                        a3, b3, c3, d3,                \
+                        st0, st1)                      \
+       vmovdqu64 d2, st0;                              \
+       vmovdqu64 d3, st1;                              \
+       transpose_4x4(a0, a1, a2, a3, d2, d3);          \
+       transpose_4x4(b0, b1, b2, b3, d2, d3);          \
+       vmovdqu64 st0, d2;                              \
+       vmovdqu64 st1, d3;                              \
+                                                       \
+       vmovdqu64 a0, st0;                              \
+       vmovdqu64 a1, st1;                              \
+       transpose_4x4(c0, c1, c2, c3, a0, a1);          \
+       transpose_4x4(d0, d1, d2, d3, a0, a1);          \
+                                                       \
+       vbroadcasti64x2 .Lshufb_16x16b rRIP, a0;        \
+       vmovdqu64 st1, a1;                              \
+       vpshufb a0, a2, a2;                             \
+       vpshufb a0, a3, a3;                             \
+       vpshufb a0, b0, b0;                             \
+       vpshufb a0, b1, b1;                             \
+       vpshufb a0, b2, b2;                             \
+       vpshufb a0, b3, b3;                             \
+       vpshufb a0, a1, a1;                             \
+       vpshufb a0, c0, c0;                             \
+       vpshufb a0, c1, c1;                             \
+       vpshufb a0, c2, c2;                             \
+       vpshufb a0, c3, c3;                             \
+       vpshufb a0, d0, d0;                             \
+       vpshufb a0, d1, d1;                             \
+       vpshufb a0, d2, d2;                             \
+       vpshufb a0, d3, d3;                             \
+       vmovdqu64 d3, st1;                              \
+       vmovdqu64 st0, d3;                              \
+       vpshufb a0, d3, a0;                             \
+       vmovdqu64 d2, st0;                              \
+                                                       \
+       transpose_4x4(a0, b0, c0, d0, d2, d3);          \
+       transpose_4x4(a1, b1, c1, d1, d2, d3);          \
+       vmovdqu64 st0, d2;                              \
+       vmovdqu64 st1, d3;                              \
+                                                       \
+       vmovdqu64 b0, st0;                              \
+       vmovdqu64 b1, st1;                              \
+       transpose_4x4(a2, b2, c2, d2, b0, b1);          \
+       transpose_4x4(a3, b3, c3, d3, b0, b1);          \
+       vmovdqu64 st0, b0;                              \
+       vmovdqu64 st1, b1;                              \
+       /* does not adjust output bytes inside vectors */
+
+#define debyteslice_16x16b(a0, b0, c0, d0,             \
+                          a1, b1, c1, d1,              \
+                          a2, b2, c2, d2,              \
+                          a3, b3, c3, d3,              \
+                          st0, st1)                    \
+       vmovdqu64 d2, st0;                              \
+       vmovdqu64 d3, st1;                              \
+       transpose_4x4(a0, a1, a2, a3, d2, d3);          \
+       transpose_4x4(b0, b1, b2, b3, d2, d3);          \
+       vmovdqu64 st0, d2;                              \
+       vmovdqu64 st1, d3;                              \
+                                                       \
+       vmovdqu64 a0, st0;                              \
+       vmovdqu64 a1, st1;                              \
+       transpose_4x4(c0, c1, c2, c3, a0, a1);          \
+       transpose_4x4(d0, d1, d2, d3, a0, a1);          \
+                                                       \
+       vbroadcasti64x2 .Lshufb_16x16b rRIP, a0;        \
+       vmovdqu64 st1, a1;                              \
+       vpshufb a0, a2, a2;                             \
+       vpshufb a0, a3, a3;                             \
+       vpshufb a0, b0, b0;                             \
+       vpshufb a0, b1, b1;                             \
+       vpshufb a0, b2, b2;                             \
+       vpshufb a0, b3, b3;                             \
+       vpshufb a0, a1, a1;                             \
+       vpshufb a0, c0, c0;                             \
+       vpshufb a0, c1, c1;                             \
+       vpshufb a0, c2, c2;                             \
+       vpshufb a0, c3, c3;                             \
+       vpshufb a0, d0, d0;                             \
+       vpshufb a0, d1, d1;                             \
+       vpshufb a0, d2, d2;                             \
+       vpshufb a0, d3, d3;                             \
+       vmovdqu64 d3, st1;                              \
+       vmovdqu64 st0, d3;                              \
+       vpshufb a0, d3, a0;                             \
+       vmovdqu64 d2, st0;                              \
+                                                       \
+       transpose_4x4(c0, d0, a0, b0, d2, d3);          \
+       transpose_4x4(c1, d1, a1, b1, d2, d3);          \
+       vmovdqu64 st0, d2;                              \
+       vmovdqu64 st1, d3;                              \
+                                                       \
+       vmovdqu64 b0, st0;                              \
+       vmovdqu64 b1, st1;                              \
+       transpose_4x4(c2, d2, a2, b2, b0, b1);          \
+       transpose_4x4(c3, d3, a3, b3, b0, b1);          \
+       vmovdqu64 st0, b0;                              \
+       vmovdqu64 st1, b1;                              \
+       /* does not adjust output bytes inside vectors */
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack16_pre(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    rio)                               \
+       vmovdqu64 (0 * 64)(rio), x0;                    \
+       vmovdqu64 (1 * 64)(rio), x1;                    \
+       vmovdqu64 (2 * 64)(rio), x2;                    \
+       vmovdqu64 (3 * 64)(rio), x3;                    \
+       vmovdqu64 (4 * 64)(rio), x4;                    \
+       vmovdqu64 (5 * 64)(rio), x5;                    \
+       vmovdqu64 (6 * 64)(rio), x6;                    \
+       vmovdqu64 (7 * 64)(rio), x7;                    \
+       vmovdqu64 (8 * 64)(rio), y0;                    \
+       vmovdqu64 (9 * 64)(rio), y1;                    \
+       vmovdqu64 (10 * 64)(rio), y2;                   \
+       vmovdqu64 (11 * 64)(rio), y3;                   \
+       vmovdqu64 (12 * 64)(rio), y4;                   \
+       vmovdqu64 (13 * 64)(rio), y5;                   \
+       vmovdqu64 (14 * 64)(rio), y6;                   \
+       vmovdqu64 (15 * 64)(rio), y7;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack16_post(x0, x1, x2, x3,                  \
+                     x4, x5, x6, x7,                   \
+                     y0, y1, y2, y3,                   \
+                     y4, y5, y6, y7,                   \
+                     mem_ab, mem_cd)                   \
+       byteslice_16x16b(x0, x1, x2, x3,                \
+                        x4, x5, x6, x7,                \
+                        y0, y1, y2, y3,                \
+                        y4, y5, y6, y7,                \
+                        (mem_ab), (mem_cd));           \
+                                                       \
+       vmovdqu64 x0, 0 * 64(mem_ab);                   \
+       vmovdqu64 x1, 1 * 64(mem_ab);                   \
+       vmovdqu64 x2, 2 * 64(mem_ab);                   \
+       vmovdqu64 x3, 3 * 64(mem_ab);                   \
+       vmovdqu64 x4, 4 * 64(mem_ab);                   \
+       vmovdqu64 x5, 5 * 64(mem_ab);                   \
+       vmovdqu64 x6, 6 * 64(mem_ab);                   \
+       vmovdqu64 x7, 7 * 64(mem_ab);                   \
+       vmovdqu64 y0, 0 * 64(mem_cd);                   \
+       vmovdqu64 y1, 1 * 64(mem_cd);                   \
+       vmovdqu64 y2, 2 * 64(mem_cd);                   \
+       vmovdqu64 y3, 3 * 64(mem_cd);                   \
+       vmovdqu64 y4, 4 * 64(mem_cd);                   \
+       vmovdqu64 y5, 5 * 64(mem_cd);                   \
+       vmovdqu64 y6, 6 * 64(mem_cd);                   \
+       vmovdqu64 y7, 7 * 64(mem_cd);
+
+#define write_output(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    mem)                               \
+       vmovdqu64 x0, 0 * 64(mem);                      \
+       vmovdqu64 x1, 1 * 64(mem);                      \
+       vmovdqu64 x2, 2 * 64(mem);                      \
+       vmovdqu64 x3, 3 * 64(mem);                      \
+       vmovdqu64 x4, 4 * 64(mem);                      \
+       vmovdqu64 x5, 5 * 64(mem);                      \
+       vmovdqu64 x6, 6 * 64(mem);                      \
+       vmovdqu64 x7, 7 * 64(mem);                      \
+       vmovdqu64 y0, 8 * 64(mem);                      \
+       vmovdqu64 y1, 9 * 64(mem);                      \
+       vmovdqu64 y2, 10 * 64(mem);                     \
+       vmovdqu64 y3, 11 * 64(mem);                     \
+       vmovdqu64 y4, 12 * 64(mem);                     \
+       vmovdqu64 y5, 13 * 64(mem);                     \
+       vmovdqu64 y6, 14 * 64(mem);                     \
+       vmovdqu64 y7, 15 * 64(mem);                     \
+
+#define aria_store_state_8way(x0, x1, x2, x3,          \
+                             x4, x5, x6, x7,           \
+                             mem_tmp, idx)             \
+       vmovdqu64 x0, ((idx + 0) * 64)(mem_tmp);        \
+       vmovdqu64 x1, ((idx + 1) * 64)(mem_tmp);        \
+       vmovdqu64 x2, ((idx + 2) * 64)(mem_tmp);        \
+       vmovdqu64 x3, ((idx + 3) * 64)(mem_tmp);        \
+       vmovdqu64 x4, ((idx + 4) * 64)(mem_tmp);        \
+       vmovdqu64 x5, ((idx + 5) * 64)(mem_tmp);        \
+       vmovdqu64 x6, ((idx + 6) * 64)(mem_tmp);        \
+       vmovdqu64 x7, ((idx + 7) * 64)(mem_tmp);
+
+#define aria_load_state_8way(x0, x1, x2, x3,           \
+                            x4, x5, x6, x7,            \
+                            mem_tmp, idx)              \
+       vmovdqu64 ((idx + 0) * 64)(mem_tmp), x0;        \
+       vmovdqu64 ((idx + 1) * 64)(mem_tmp), x1;        \
+       vmovdqu64 ((idx + 2) * 64)(mem_tmp), x2;        \
+       vmovdqu64 ((idx + 3) * 64)(mem_tmp), x3;        \
+       vmovdqu64 ((idx + 4) * 64)(mem_tmp), x4;        \
+       vmovdqu64 ((idx + 5) * 64)(mem_tmp), x5;        \
+       vmovdqu64 ((idx + 6) * 64)(mem_tmp), x6;        \
+       vmovdqu64 ((idx + 7) * 64)(mem_tmp), x7;
+
+#define aria_ark_16way(x0, x1, x2, x3,                 \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7,                  \
+                      t0, rk, round)                   \
+       /* AddRoundKey */                               \
+       vpbroadcastb ((round * 16) + 3)(rk), t0;        \
+       vpxorq t0, x0, x0;                              \
+       vpbroadcastb ((round * 16) + 2)(rk), t0;        \
+       vpxorq t0, x1, x1;                              \
+       vpbroadcastb ((round * 16) + 1)(rk), t0;        \
+       vpxorq t0, x2, x2;                              \
+       vpbroadcastb ((round * 16) + 0)(rk), t0;        \
+       vpxorq t0, x3, x3;                              \
+       vpbroadcastb ((round * 16) + 7)(rk), t0;        \
+       vpxorq t0, x4, x4;                              \
+       vpbroadcastb ((round * 16) + 6)(rk), t0;        \
+       vpxorq t0, x5, x5;                              \
+       vpbroadcastb ((round * 16) + 5)(rk), t0;        \
+       vpxorq t0, x6, x6;                              \
+       vpbroadcastb ((round * 16) + 4)(rk), t0;        \
+       vpxorq t0, x7, x7;                              \
+       vpbroadcastb ((round * 16) + 11)(rk), t0;       \
+       vpxorq t0, y0, y0;                              \
+       vpbroadcastb ((round * 16) + 10)(rk), t0;       \
+       vpxorq t0, y1, y1;                              \
+       vpbroadcastb ((round * 16) + 9)(rk), t0;        \
+       vpxorq t0, y2, y2;                              \
+       vpbroadcastb ((round * 16) + 8)(rk), t0;        \
+       vpxorq t0, y3, y3;                              \
+       vpbroadcastb ((round * 16) + 15)(rk), t0;       \
+       vpxorq t0, y4, y4;                              \
+       vpbroadcastb ((round * 16) + 14)(rk), t0;       \
+       vpxorq t0, y5, y5;                              \
+       vpbroadcastb ((round * 16) + 13)(rk), t0;       \
+       vpxorq t0, y6, y6;                              \
+       vpbroadcastb ((round * 16) + 12)(rk), t0;       \
+       vpxorq t0, y7, y7;
+
+#define aria_sbox_8way_gfni(x0, x1, x2, x3,            \
+                           x4, x5, x6, x7,             \
+                           t0, t1, t2, t3,             \
+                           t4, t5, t6, t7)             \
+       vpbroadcastq .Ltf_s2_bitmatrix rRIP, t0;        \
+       vpbroadcastq .Ltf_inv_bitmatrix rRIP, t1;       \
+       vpbroadcastq .Ltf_id_bitmatrix rRIP, t2;        \
+       vpbroadcastq .Ltf_aff_bitmatrix rRIP, t3;       \
+       vpbroadcastq .Ltf_x2_bitmatrix rRIP, t4;        \
+       vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;   \
+       vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;   \
+       vgf2p8affineqb $(tf_inv_const), t1, x2, x2;     \
+       vgf2p8affineqb $(tf_inv_const), t1, x6, x6;     \
+       vgf2p8affineinvqb $0, t2, x2, x2;               \
+       vgf2p8affineinvqb $0, t2, x6, x6;               \
+       vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;  \
+       vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;  \
+       vgf2p8affineqb $(tf_x2_const), t4, x3, x3;      \
+       vgf2p8affineqb $(tf_x2_const), t4, x7, x7;      \
+       vgf2p8affineinvqb $0, t2, x3, x3;               \
+       vgf2p8affineinvqb $0, t2, x7, x7;
+
+#define aria_sbox_16way_gfni(x0, x1, x2, x3,           \
+                            x4, x5, x6, x7,            \
+                            y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            t0, t1, t2, t3,            \
+                            t4, t5, t6, t7)            \
+       vpbroadcastq .Ltf_s2_bitmatrix rRIP, t0;        \
+       vpbroadcastq .Ltf_inv_bitmatrix rRIP, t1;       \
+       vpbroadcastq .Ltf_id_bitmatrix rRIP, t2;        \
+       vpbroadcastq .Ltf_aff_bitmatrix rRIP, t3;       \
+       vpbroadcastq .Ltf_x2_bitmatrix rRIP, t4;        \
+       vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;   \
+       vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;   \
+       vgf2p8affineqb $(tf_inv_const), t1, x2, x2;     \
+       vgf2p8affineqb $(tf_inv_const), t1, x6, x6;     \
+       vgf2p8affineinvqb $0, t2, x2, x2;               \
+       vgf2p8affineinvqb $0, t2, x6, x6;               \
+       vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;  \
+       vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;  \
+       vgf2p8affineqb $(tf_x2_const), t4, x3, x3;      \
+       vgf2p8affineqb $(tf_x2_const), t4, x7, x7;      \
+       vgf2p8affineinvqb $0, t2, x3, x3;               \
+       vgf2p8affineinvqb $0, t2, x7, x7;               \
+       vgf2p8affineinvqb $(tf_s2_const), t0, y1, y1;   \
+       vgf2p8affineinvqb $(tf_s2_const), t0, y5, y5;   \
+       vgf2p8affineqb $(tf_inv_const), t1, y2, y2;     \
+       vgf2p8affineqb $(tf_inv_const), t1, y6, y6;     \
+       vgf2p8affineinvqb $0, t2, y2, y2;               \
+       vgf2p8affineinvqb $0, t2, y6, y6;               \
+       vgf2p8affineinvqb $(tf_aff_const), t3, y0, y0;  \
+       vgf2p8affineinvqb $(tf_aff_const), t3, y4, y4;  \
+       vgf2p8affineqb $(tf_x2_const), t4, y3, y3;      \
+       vgf2p8affineqb $(tf_x2_const), t4, y7, y7;      \
+       vgf2p8affineinvqb $0, t2, y3, y3;               \
+       vgf2p8affineinvqb $0, t2, y7, y7;
+
+#define aria_diff_m(x0, x1, x2, x3,                    \
+                   t0, t1, t2, t3)                     \
+       /* T = rotr32(X, 8); */                         \
+       /* X ^= T */                                    \
+       /* X = T ^ rotr(X, 16); */                      \
+       vmovdqa64 x0, t0;                               \
+       vmovdqa64 x3, t3;                               \
+       vpternlogq $0x96, x2, x1, x0;                   \
+       vpternlogq $0x96, x2, x1, x3;                   \
+       vpternlogq $0x96, t0, t3, x2;                   \
+       vpternlogq $0x96, t0, t3, x1;
+
+#define aria_diff_word(x0, x1, x2, x3,                 \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7)                  \
+       /* t1 ^= t2; */                                 \
+       vpxorq y0, x4, x4;                              \
+       vpxorq y1, x5, x5;                              \
+       vpxorq y2, x6, x6;                              \
+       vpxorq y3, x7, x7;                              \
+                                                       \
+       /* t2 ^= t3; */                                 \
+       vpxorq y4, y0, y0;                              \
+       vpxorq y5, y1, y1;                              \
+       vpxorq y6, y2, y2;                              \
+       vpxorq y7, y3, y3;                              \
+                                                       \
+       /* t0 ^= t1; */                                 \
+       vpxorq x4, x0, x0;                              \
+       vpxorq x5, x1, x1;                              \
+       vpxorq x6, x2, x2;                              \
+       vpxorq x7, x3, x3;                              \
+                                                       \
+       /* t3 ^= t1; */                                 \
+       vpxorq x4, y4, y4;                              \
+       vpxorq x5, y5, y5;                              \
+       vpxorq x6, y6, y6;                              \
+       vpxorq x7, y7, y7;                              \
+                                                       \
+       /* t2 ^= t0; */                                 \
+       vpxorq x0, y0, y0;                              \
+       vpxorq x1, y1, y1;                              \
+       vpxorq x2, y2, y2;                              \
+       vpxorq x3, y3, y3;                              \
+                                                       \
+       /* t1 ^= t2; */                                 \
+       vpxorq y0, x4, x4;                              \
+       vpxorq y1, x5, x5;                              \
+       vpxorq y2, x6, x6;                              \
+       vpxorq y3, x7, x7;
+
+#define aria_fe_gfni(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    z0, z1, z2, z3,                    \
+                    z4, z5, z6, z7,                    \
+                    mem_tmp, rk, round)                \
+       aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7,  \
+                      z0, rk, round);                  \
+                                                       \
+       aria_sbox_16way_gfni(x2, x3, x0, x1,            \
+                            x6, x7, x4, x5,            \
+                            y2, y3, y0, y1,            \
+                            y6, y7, y4, y5,            \
+                            z0, z1, z2, z3,            \
+                            z4, z5, z6, z7);           \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3);    \
+       aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3);    \
+       aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3);    \
+       aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3);    \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7);                 \
+       /* aria_diff_byte()                             \
+        * T3 = ABCD -> BADC                            \
+        * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6        \
+        * T0 = ABCD -> CDAB                            \
+        * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1        \
+        * T1 = ABCD -> DCBA                            \
+        * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4        \
+        */                                             \
+       aria_diff_word(x2, x3, x0, x1,                  \
+                      x7, x6, x5, x4,                  \
+                      y0, y1, y2, y3,                  \
+                      y5, y4, y7, y6);                 \
+
+
+#define aria_fo_gfni(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    z0, z1, z2, z3,                    \
+                    z4, z5, z6, z7,                    \
+                    mem_tmp, rk, round)                \
+       aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7,  \
+                      y0, y1, y2, y3, y4, y5, y6, y7,  \
+                      z0, rk, round);                  \
+                                                       \
+       aria_sbox_16way_gfni(x0, x1, x2, x3,            \
+                            x4, x5, x6, x7,            \
+                            y0, y1, y2, y3,            \
+                            y4, y5, y6, y7,            \
+                            z0, z1, z2, z3,            \
+                            z4, z5, z6, z7);           \
+                                                       \
+       aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3);    \
+       aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3);    \
+       aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3);    \
+       aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3);    \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7);                 \
+       /* aria_diff_byte()                             \
+        * T1 = ABCD -> BADC                            \
+        * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6        \
+        * T2 = ABCD -> CDAB                            \
+        * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1       \
+        * T3 = ABCD -> DCBA                            \
+        * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4        \
+        */                                             \
+       aria_diff_word(x0, x1, x2, x3,                  \
+                      x5, x4, x7, x6,                  \
+                      y2, y3, y0, y1,                  \
+                      y7, y6, y5, y4);
+
+#define aria_ff_gfni(x0, x1, x2, x3,                   \
+                    x4, x5, x6, x7,                    \
+                    y0, y1, y2, y3,                    \
+                    y4, y5, y6, y7,                    \
+                    z0, z1, z2, z3,                    \
+                    z4, z5, z6, z7,                    \
+                    mem_tmp, rk, round, last_round)    \
+       aria_ark_16way(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7,                  \
+                      z0, rk, round);                  \
+       aria_sbox_16way_gfni(x2, x3, x0, x1,            \
+                            x6, x7, x4, x5,            \
+                            y2, y3, y0, y1,            \
+                            y6, y7, y4, y5,            \
+                            z0, z1, z2, z3,            \
+                            z4, z5, z6, z7);           \
+       aria_ark_16way(x0, x1, x2, x3,                  \
+                      x4, x5, x6, x7,                  \
+                      y0, y1, y2, y3,                  \
+                      y4, y5, y6, y7,                  \
+                      z0, rk, last_round);
+
+SECTION_RODATA
+.align 64
+.Lcounter0123_lo:
+       .quad 0, 0
+       .quad 1, 0
+       .quad 2, 0
+       .quad 3, 0
+
+.align 32
+#define SHUFB_BYTES(idx) \
+       0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+.Lshufb_16x16b:
+       .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+       .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+
+.align 16
+/* CTR-mode counter increment constants: low qword holds the per-lane step
+ * (+4/+8/+16); .Lcounter1111_hi supplies the high-qword +1 that add_le128
+ * uses when a low-qword carry must be propagated. */
+.Lcounter4444_lo:
+       .quad 4, 0
+.Lcounter8888_lo:
+       .quad 8, 0
+.Lcounter16161616_lo:
+       .quad 16, 0
+.Lcounter1111_hi:
+       .quad 0, 1
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+       .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
+       .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
+
+.align 8
+/* 8x8 GF(2) bit-matrices (each paired with its tf_*_const affine constant)
+ * for GFNI affine transforms implementing ARIA's S-box layers.
+ * NOTE(review): the consuming round macros are earlier in this file and not
+ * visible in this chunk. */
+/* AES affine: */
+#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
+.Ltf_aff_bitmatrix:
+       .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
+                   BV8(1, 1, 0, 0, 0, 1, 1, 1),
+                   BV8(1, 1, 1, 0, 0, 0, 1, 1),
+                   BV8(1, 1, 1, 1, 0, 0, 0, 1),
+                   BV8(1, 1, 1, 1, 1, 0, 0, 0),
+                   BV8(0, 1, 1, 1, 1, 1, 0, 0),
+                   BV8(0, 0, 1, 1, 1, 1, 1, 0),
+                   BV8(0, 0, 0, 1, 1, 1, 1, 1))
+
+/* AES inverse affine: */
+#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
+.Ltf_inv_bitmatrix:
+       .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
+                   BV8(1, 0, 0, 1, 0, 0, 1, 0),
+                   BV8(0, 1, 0, 0, 1, 0, 0, 1),
+                   BV8(1, 0, 1, 0, 0, 1, 0, 0),
+                   BV8(0, 1, 0, 1, 0, 0, 1, 0),
+                   BV8(0, 0, 1, 0, 1, 0, 0, 1),
+                   BV8(1, 0, 0, 1, 0, 1, 0, 0),
+                   BV8(0, 1, 0, 0, 1, 0, 1, 0))
+
+/* S2: */
+#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
+.Ltf_s2_bitmatrix:
+       .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
+                   BV8(0, 0, 1, 1, 1, 1, 1, 1),
+                   BV8(1, 1, 1, 0, 1, 1, 0, 1),
+                   BV8(1, 1, 0, 0, 0, 0, 1, 1),
+                   BV8(0, 1, 0, 0, 0, 0, 1, 1),
+                   BV8(1, 1, 0, 0, 1, 1, 1, 0),
+                   BV8(0, 1, 1, 0, 0, 0, 1, 1),
+                   BV8(1, 1, 1, 1, 0, 1, 1, 0))
+
+/* X2: */
+#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
+.Ltf_x2_bitmatrix:
+       .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
+                   BV8(0, 0, 1, 0, 0, 1, 1, 0),
+                   BV8(0, 0, 0, 0, 1, 0, 1, 0),
+                   BV8(1, 1, 1, 0, 0, 0, 1, 1),
+                   BV8(1, 1, 1, 0, 1, 1, 0, 0),
+                   BV8(0, 1, 1, 0, 1, 0, 1, 1),
+                   BV8(1, 0, 1, 1, 1, 1, 0, 1),
+                   BV8(1, 0, 0, 1, 0, 0, 1, 1))
+
+/* Identity matrix: */
+.Ltf_id_bitmatrix:
+       .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
+                   BV8(0, 1, 0, 0, 0, 0, 0, 0),
+                   BV8(0, 0, 1, 0, 0, 0, 0, 0),
+                   BV8(0, 0, 0, 1, 0, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 1, 0, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 1, 0, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 1, 0),
+                   BV8(0, 0, 0, 0, 0, 0, 0, 1))
+
+/* CTR byte addition constants */
+/* Each 16-byte lane adds N to the last (lowest, big-endian) byte of the IV;
+ * used by the fast no-carry CTR path below. */
+.align 64
+.Lbige_addb_0_1:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+.Lbige_addb_2_3:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
+.Lbige_addb_4_5:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
+.Lbige_addb_6_7:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
+.Lbige_addb_8_9:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
+.Lbige_addb_10_11:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
+.Lbige_addb_12_13:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
+.Lbige_addb_14_15:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
+.Lbige_addb_16:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16
+
+.text
+
+.align 16
+ELF(.type __aria_gfni_avx512_crypt_64way,@function;)
+/* Core ARIA transform for 64 blocks held byte-sliced in %zmm0..%zmm15:
+ * initial FO round, then (rounds-2)/2 iterations of FE+FO, then the final
+ * FF round, followed by de-byte-slicing into the dst buffer.
+ * Shared by the ECB and CTR entry points below. */
+__aria_gfni_avx512_crypt_64way:
+       /* input:
+        *      %r9: rk
+        *      %rsi: dst
+        *      %rdx: src
+        *      %zmm0..%zmm15: byte-sliced blocks
+        */
+       CFI_STARTPROC();
+
+       movq %rsi, %rax;
+       leaq 8 * 64(%rax), %r8; /* %rax/%r8: two 8*64-byte temp areas in dst */
+
+       movl ARIA_CTX_rounds(CTX), %r10d;
+       subl $2, %r10d; /* middle rounds; loop below does 2 rounds/iteration */
+
+       inpack16_post(%zmm0, %zmm1, %zmm2, %zmm3,
+                     %zmm4, %zmm5, %zmm6, %zmm7,
+                     %zmm8, %zmm9, %zmm10, %zmm11,
+                     %zmm12, %zmm13, %zmm14,
+                     %zmm15, %rax, %r8);
+       aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+                    %zmm4, %zmm5, %zmm6, %zmm7,
+                    %zmm8, %zmm9, %zmm10, %zmm11,
+                    %zmm12, %zmm13, %zmm14, %zmm15,
+                    %zmm24, %zmm25, %zmm26, %zmm27,
+                    %zmm28, %zmm29, %zmm30, %zmm31,
+                    %rax, %r9, 0);
+       leaq 1*16(%r9), %r9; /* advance past the consumed round key */
+
+.align 16
+.Loop_gfni:
+       aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+                    %zmm6, %zmm7, %zmm4, %zmm5,
+                    %zmm9, %zmm8, %zmm11, %zmm10,
+                    %zmm12, %zmm13, %zmm14, %zmm15,
+                    %zmm24, %zmm25, %zmm26, %zmm27,
+                    %zmm28, %zmm29, %zmm30, %zmm31,
+                    %rax, %r9, 0);
+       aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+                    %zmm4, %zmm5, %zmm6, %zmm7,
+                    %zmm8, %zmm9, %zmm10, %zmm11,
+                    %zmm12, %zmm13, %zmm14, %zmm15,
+                    %zmm24, %zmm25, %zmm26, %zmm27,
+                    %zmm28, %zmm29, %zmm30, %zmm31,
+                    %rax, %r9, 1);
+       leaq 2*16(%r9), %r9; /* two round keys consumed per iteration */
+       subl $2, %r10d;
+       jnz .Loop_gfni;
+
+       aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+                    %zmm6, %zmm7, %zmm4, %zmm5,
+                    %zmm9, %zmm8, %zmm11, %zmm10,
+                    %zmm12, %zmm13, %zmm14, %zmm15,
+                    %zmm24, %zmm25, %zmm26, %zmm27,
+                    %zmm28, %zmm29, %zmm30, %zmm31,
+                    %rax, %r9, 0, 1);
+
+       debyteslice_16x16b(%zmm9, %zmm12, %zmm3, %zmm6,
+                          %zmm8, %zmm13, %zmm2, %zmm7,
+                          %zmm11, %zmm14, %zmm1, %zmm4,
+                          %zmm10, %zmm15, %zmm0, %zmm5,
+                          (%rax), (%r8));
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __aria_gfni_avx512_crypt_64way,.-__aria_gfni_avx512_crypt_64way;)
+
+.align 16
+.globl _gcry_aria_gfni_avx512_ecb_crypt_blk64
+ELF(.type _gcry_aria_gfni_avx512_ecb_crypt_blk64,@function;)
+/* ECB-crypt 64 blocks using the supplied round keys (works for encryption
+ * or decryption depending on which schedule the caller passes).  Returns
+ * STACK_DEPTH in %eax -- presumably the stack-burn depth expected by the
+ * C caller; NOTE(review): confirm against aria.c. */
+_gcry_aria_gfni_avx512_ecb_crypt_blk64:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst
+        *      %rdx: src
+        *      %rcx: round keys
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       subq $(16 * 64), %rsp; /* 16*64-byte scratch for byte-slicing */
+       andq $~63, %rsp; /* 64-byte align the scratch area */
+
+       movq %rcx, %r9;
+       movq %rsi, %r11; /* keep real dst; %rsi becomes the temp store */
+       movq %rsp, %rsi; /* use stack for temporary store */
+
+       inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+                    %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+                    %zmm15, %rdx);
+
+       call __aria_gfni_avx512_crypt_64way;
+
+       write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
+                    %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
+                    %zmm15, %r11);
+
+       movl $STACK_DEPTH, %eax; /* report stack usage to caller */
+       leave;
+       CFI_LEAVE();
+       clear_regs(); /* wipe key-dependent register state before returning */
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_aria_gfni_avx512_ecb_crypt_blk64,
+         .-_gcry_aria_gfni_avx512_ecb_crypt_blk64;)
+
+.align 16
+ELF(.type __aria_gfni_avx512_ctr_gen_keystream_64way,@function;)
+/* Generate 64 consecutive big-endian 128-bit counter blocks into
+ * %zmm0..%zmm15 and advance the IV at (%r8) by 64.  A cheap per-byte
+ * addition path is taken when adding 64 cannot carry out of the low IV
+ * byte; otherwise full 128-bit little-endian arithmetic with carry
+ * handling is used. */
+__aria_gfni_avx512_ctr_gen_keystream_64way:
+       /* input:
+        *      %rdi: ctx
+        *      %rsi: dst
+        *      %rdx: src
+        *      %rcx: keystream
+        *      %r8: iv (big endian, 128bit)
+        */
+       CFI_STARTPROC();
+
+       cmpb $(0x100 - 64), 15(%r8); /* can +64 overflow the low IV byte? */
+       jbe .Lctr_byteadd; /* <= 0xC0: byte-add path (== case wraps; see je below) */
+
+       vbroadcasti64x2 .Lbswap128_mask rRIP, %zmm19;
+       vmovdqa64 .Lcounter0123_lo rRIP, %zmm21;
+       vbroadcasti64x2 .Lcounter4444_lo rRIP, %zmm22;
+       vbroadcasti64x2 .Lcounter8888_lo rRIP, %zmm23;
+       vbroadcasti64x2 .Lcounter16161616_lo rRIP, %zmm24;
+       vbroadcasti64x2 .Lcounter1111_hi rRIP, %zmm25;
+
+       /* load IV and byteswap */
+       movq 8(%r8), %r11; /* %r10:%r11 = IV as native little-endian */
+       movq (%r8), %r10;
+       bswapq %r11;
+       bswapq %r10;
+       vbroadcasti64x2 (%r8), %zmm20;
+       vpshufb %zmm19, %zmm20, %zmm20;
+
+       /* check need for handling 64-bit overflow and carry */
+       cmpq $(0xffffffffffffffff - 64), %r11;
+       ja .Lload_ctr_carry;
+
+       /* construct IVs */
+       vpaddq %zmm21, %zmm20, %zmm0; /* +0:+1:+2:+3 */
+       vpaddq %zmm22, %zmm0, %zmm1; /* +4:+5:+6:+7 */
+       vpaddq %zmm23, %zmm0, %zmm2; /* +8:+9:+10:+11 */
+       vpaddq %zmm23, %zmm1, %zmm3; /* +12:+13:+14:+15 */
+       vpaddq %zmm24, %zmm0, %zmm4; /* +16... */
+       vpaddq %zmm24, %zmm1, %zmm5; /* +20... */
+       vpaddq %zmm24, %zmm2, %zmm6; /* +24... */
+       vpaddq %zmm24, %zmm3, %zmm7; /* +28... */
+       vpaddq %zmm24, %zmm4, %zmm8; /* +32... */
+       vpaddq %zmm24, %zmm5, %zmm9; /* +36... */
+       vpaddq %zmm24, %zmm6, %zmm10; /* +40... */
+       vpaddq %zmm24, %zmm7, %zmm11; /* +44... */
+       vpaddq %zmm24, %zmm8, %zmm12; /* +48... */
+       vpaddq %zmm24, %zmm9, %zmm13; /* +52... */
+       vpaddq %zmm24, %zmm10, %zmm14; /* +56... */
+       vpaddq %zmm24, %zmm11, %zmm15; /* +60... */
+       jmp .Lload_ctr_done;
+
+.Lload_ctr_carry:
+       /* construct IVs */
+       add_le128(%zmm0, %zmm20, %zmm21, %zmm25); /* +0:+1:+2:+3 */
+       add_le128(%zmm1, %zmm0, %zmm22, %zmm25); /* +4:+5:+6:+7 */
+       add_le128(%zmm2, %zmm0, %zmm23, %zmm25); /* +8:+9:+10:+11 */
+       add_le128(%zmm3, %zmm1, %zmm23, %zmm25); /* +12:+13:+14:+15 */
+       add_le128(%zmm4, %zmm0, %zmm24, %zmm25); /* +16... */
+       add_le128(%zmm5, %zmm1, %zmm24, %zmm25); /* +20... */
+       add_le128(%zmm6, %zmm2, %zmm24, %zmm25); /* +24... */
+       add_le128(%zmm7, %zmm3, %zmm24, %zmm25); /* +28... */
+       add_le128(%zmm8, %zmm4, %zmm24, %zmm25); /* +32... */
+       add_le128(%zmm9, %zmm5, %zmm24, %zmm25); /* +36... */
+       add_le128(%zmm10, %zmm6, %zmm24, %zmm25); /* +40... */
+       add_le128(%zmm11, %zmm7, %zmm24, %zmm25); /* +44... */
+       add_le128(%zmm12, %zmm8, %zmm24, %zmm25); /* +48... */
+       add_le128(%zmm13, %zmm9, %zmm24, %zmm25); /* +52... */
+       add_le128(%zmm14, %zmm10, %zmm24, %zmm25); /* +56... */
+       add_le128(%zmm15, %zmm11, %zmm24, %zmm25); /* +60... */
+
+.Lload_ctr_done:
+       /* Byte-swap IVs and update counter. */
+       addq $64, %r11; /* advance stored IV by the 64 blocks generated */
+       adcq $0, %r10;
+       vpshufb %zmm19, %zmm15, %zmm15;
+       vpshufb %zmm19, %zmm14, %zmm14;
+       vpshufb %zmm19, %zmm13, %zmm13;
+       vpshufb %zmm19, %zmm12, %zmm12;
+       vpshufb %zmm19, %zmm11, %zmm11;
+       vpshufb %zmm19, %zmm10, %zmm10;
+       vpshufb %zmm19, %zmm9, %zmm9;
+       vpshufb %zmm19, %zmm8, %zmm8;
+       bswapq %r11;
+       bswapq %r10;
+       vpshufb %zmm19, %zmm7, %zmm7;
+       vpshufb %zmm19, %zmm6, %zmm6;
+       vpshufb %zmm19, %zmm5, %zmm5;
+       vpshufb %zmm19, %zmm4, %zmm4;
+       vpshufb %zmm19, %zmm3, %zmm3;
+       vpshufb %zmm19, %zmm2, %zmm2;
+       vpshufb %zmm19, %zmm1, %zmm1;
+       vpshufb %zmm19, %zmm0, %zmm0;
+       movq %r11, 8(%r8);
+       movq %r10, (%r8);
+
+       ret_spec_stop;
+
+.align 16
+.Lctr_byteadd_full_ctr_carry:
+       /* Low byte wraps exactly: update stored IV with a full 128-bit +64
+        * before building the blocks with byte-add constants. */
+       movq 8(%r8), %r11;
+       movq (%r8), %r10;
+       bswapq %r11;
+       bswapq %r10;
+       addq $64, %r11;
+       adcq $0, %r10;
+       bswapq %r11;
+       bswapq %r10;
+       movq %r11, 8(%r8);
+       movq %r10, (%r8);
+       jmp .Lctr_byteadd_zmm;
+.align 16
+.Lctr_byteadd:
+       vbroadcasti64x2 (%r8), %zmm3;
+       je .Lctr_byteadd_full_ctr_carry; /* ZF still from cmpb: byte == 0xC0 */
+       addb $64, 15(%r8); /* no carry possible: bump low IV byte only */
+.Lctr_byteadd_zmm:
+       /* Build +0..+63 by cascaded byte additions (+16 steps via %zmm16). */
+       vbroadcasti64x2 .Lbige_addb_16 rRIP, %zmm16;
+       vmovdqa64 .Lbige_addb_0_1 rRIP, %zmm17;
+       vmovdqa64 .Lbige_addb_4_5 rRIP, %zmm18;
+       vmovdqa64 .Lbige_addb_8_9 rRIP, %zmm19;
+       vmovdqa64 .Lbige_addb_12_13 rRIP, %zmm20;
+       vpaddb %zmm16, %zmm3, %zmm7;
+       vpaddb %zmm17, %zmm3, %zmm0;
+       vpaddb %zmm18, %zmm3, %zmm1;
+       vpaddb %zmm19, %zmm3, %zmm2;
+       vpaddb %zmm20, %zmm3, %zmm3;
+       vpaddb %zmm16, %zmm7, %zmm11;
+       vpaddb %zmm17, %zmm7, %zmm4;
+       vpaddb %zmm18, %zmm7, %zmm5;
+       vpaddb %zmm19, %zmm7, %zmm6;
+       vpaddb %zmm20, %zmm7, %zmm7;
+       vpaddb %zmm16, %zmm11, %zmm15;
+       vpaddb %zmm17, %zmm11, %zmm8;
+       vpaddb %zmm18, %zmm11, %zmm9;
+       vpaddb %zmm19, %zmm11, %zmm10;
+       vpaddb %zmm20, %zmm11, %zmm11;
+       vpaddb %zmm17, %zmm15, %zmm12;
+       vpaddb %zmm18, %zmm15, %zmm13;
+       vpaddb %zmm19, %zmm15, %zmm14;
+       vpaddb %zmm20, %zmm15, %zmm15;
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __aria_gfni_avx512_ctr_gen_keystream_64way,
+         .-__aria_gfni_avx512_ctr_gen_keystream_64way;)
+
<br>
+.align 16
+.globl _gcry_aria_gfni_avx512_ctr_crypt_blk64
+ELF(.type _gcry_aria_gfni_avx512_ctr_crypt_blk64,@function;)
+/* CTR-crypt 64 blocks: generate counter keystream into stack scratch,
+ * ARIA-encrypt it with the encryption key schedule, then XOR with src
+ * into dst.  Also advances the IV at %rcx by 64 blocks. */
+_gcry_aria_gfni_avx512_ctr_crypt_blk64:
+       /* input:
+        *      %rdi: ctx
+        *      %rsi: dst
+        *      %rdx: src
+        *      %rcx: iv (big endian, 128bit)
+        */
+       CFI_STARTPROC();
+       spec_stop_avx512;
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       subq $(16 * 64), %rsp; /* 16*64-byte scratch, 64-byte aligned */
+       andq $~63, %rsp;
+
+       movq %rcx, %r8;  /* %r8: iv */
+       movq %rsp, %rcx; /* %rcx: keystream */
+       call __aria_gfni_avx512_ctr_gen_keystream_64way
+
+       pushq %rsi; /* preserve dst across the crypt call */
+       movq %rdx, %r11; /* %r11: src */
+       movq %rcx, %rsi; /* encrypt the stack keystream in place */
+       movq %rcx, %rdx;
+       leaq ARIA_CTX_enc_key(CTX), %r9; /* CTR always uses the enc schedule */
+
+       call __aria_gfni_avx512_crypt_64way;
+
+       popq %rsi; /* restore dst */
+       /* XOR the encrypted counters with src; register order matches the
+        * write_output() ordering below. */
+       vpxorq (0 * 64)(%r11), %zmm3, %zmm3;
+       vpxorq (1 * 64)(%r11), %zmm2, %zmm2;
+       vpxorq (2 * 64)(%r11), %zmm1, %zmm1;
+       vpxorq (3 * 64)(%r11), %zmm0, %zmm0;
+       vpxorq (4 * 64)(%r11), %zmm6, %zmm6;
+       vpxorq (5 * 64)(%r11), %zmm7, %zmm7;
+       vpxorq (6 * 64)(%r11), %zmm4, %zmm4;
+       vpxorq (7 * 64)(%r11), %zmm5, %zmm5;
+       vpxorq (8 * 64)(%r11), %zmm9, %zmm9;
+       vpxorq (9 * 64)(%r11), %zmm8, %zmm8;
+       vpxorq (10 * 64)(%r11), %zmm11, %zmm11;
+       vpxorq (11 * 64)(%r11), %zmm10, %zmm10;
+       vpxorq (12 * 64)(%r11), %zmm12, %zmm12;
+       vpxorq (13 * 64)(%r11), %zmm13, %zmm13;
+       vpxorq (14 * 64)(%r11), %zmm14, %zmm14;
+       vpxorq (15 * 64)(%r11), %zmm15, %zmm15;
+       write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
+                    %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
+                    %zmm15, %rsi);
+
+       movl $STACK_DEPTH, %eax; /* report stack usage to caller */
+       leave;
+       CFI_LEAVE();
+       clear_regs(); /* wipe key-dependent register state */
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_aria_gfni_avx512_ctr_crypt_blk64,
+         .-_gcry_aria_gfni_avx512_ctr_crypt_blk64;)
+
+#endif /* ENABLE_AVX512_SUPPORT && ENABLE_GFNI_SUPPORT */
+#endif /* __x86_64 */
diff --git a/grub-core/lib/libgcrypt/cipher/aria.c b/grub-core/lib/libgcrypt/cipher/aria.c
new file mode 100644
index 000000000..bc2d43841
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/aria.c
@@ -0,0 +1,1768 @@
+/* aria.c  -  ARIA Cipher Algorithm
+ *
+ * Copyright (C) 2022-2023 Taehee Yoo <ap420073@gmail.com>
+ * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#include "types.h"
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "cipher-internal.h"
+#include "bulkhelp.h"
+
+/* Attribute macro to force alignment to 64 bytes.  */
+#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
+# define ATTR_ALIGNED_64  __attribute__ ((aligned (64)))
+#else
+# define ATTR_ALIGNED_64
+#endif
+
+/* Attribute macro to force inlining of function. */
+#if __GNUC__ >= 4
+#  define ALWAYS_INLINE inline __attribute__ ((always_inline))
+#else
+#  define ALWAYS_INLINE inline
+#endif
+
+/* Attribute macro to prevent inlining of function. */
+#if __GNUC__ >= 4
+#  define NO_INLINE __attribute__ ((noinline))
+#else
+#  define NO_INLINE
+#endif
+
+
+/* USE_AESNI_AVX indicates whether to compile with Intel AES-NI/AVX code. */
+#undef USE_AESNI_AVX
+#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+#  define USE_AESNI_AVX 1
+# endif
+#endif
+
+/* USE_GFNI_AVX indicates whether to compile with Intel GFNI/AVX code. */
+#undef USE_GFNI_AVX
+#if defined(USE_AESNI_AVX) && defined(ENABLE_GFNI_SUPPORT)
+# define USE_GFNI_AVX 1
+#endif
+
+/* USE_AESNI_AVX2 indicates whether to compile with Intel AES-NI/AVX2 code. */
+#undef USE_AESNI_AVX2
+#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+#  define USE_AESNI_AVX2 1
+# endif
+#endif
+
+/* USE_VAES_AVX2 indicates whether to compile with Intel VAES/AVX2 code. */
+#undef USE_VAES_AVX2
+#if defined(USE_AESNI_AVX2) && defined(HAVE_GCC_INLINE_ASM_VAES_VPCLMUL)
+# define USE_VAES_AVX2 1
+#endif
+
+/* USE_GFNI_AVX2 indicates whether to compile with Intel GFNI/AVX2 code. */
+#undef USE_GFNI_AVX2
+#if defined(USE_AESNI_AVX2) && defined(ENABLE_GFNI_SUPPORT)
+# define USE_GFNI_AVX2 1
+#endif
+
+/* USE_GFNI_AVX512 indicates whether to compile with Intel GFNI/AVX512 code. */
+#undef USE_GFNI_AVX512
+#if defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX512_SUPPORT)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+#  define USE_GFNI_AVX512 1
+# endif
+#endif
+
+/* How many parallel blocks to handle in bulk processing functions. */
+#if defined(USE_GFNI_AVX512)
+# define MAX_PARALLEL_BLKS 64
+#elif defined(USE_AESNI_AVX2)
+# define MAX_PARALLEL_BLKS 32
+#elif defined(USE_AESNI_AVX)
+# define MAX_PARALLEL_BLKS 16
+#else
+# define MAX_PARALLEL_BLKS 8
+#endif
+
+/* Assembly implementations use SystemV ABI, ABI conversion and additional
+ * stack to store XMM6-XMM15 needed on Win64. */
+#undef ASM_FUNC_ABI
+#undef ASM_EXTRA_STACK
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) || \
+    defined(USE_GFNI_AVX512)
+# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+#  define ASM_FUNC_ABI __attribute__((sysv_abi))
+#  define ASM_EXTRA_STACK (10 * 16)
+# else
+#  define ASM_FUNC_ABI
+#  define ASM_EXTRA_STACK 0
+# endif
+#endif
+
+
+static const char *aria_selftest (void);
+
+
+/* Key and block sizes are in bytes; 17 round keys = max rounds (16) + 1. */
+#define ARIA_MIN_KEY_SIZE      16
+#define ARIA_MAX_KEY_SIZE      32
+#define ARIA_BLOCK_SIZE                16
+#define ARIA_MAX_RD_KEYS       17
+#define ARIA_RD_KEY_WORDS      (ARIA_BLOCK_SIZE / sizeof(u32))
+
+
+/* Cipher handle: encryption/decryption round-key schedules plus
+ * per-context CPU-feature dispatch flags. */
+typedef struct
+{
+  u32 enc_key[ARIA_MAX_RD_KEYS][ARIA_RD_KEY_WORDS];
+  u32 dec_key[ARIA_MAX_RD_KEYS][ARIA_RD_KEY_WORDS];
+  int rounds;
+
+  unsigned int decryption_prepared:1; /* The decryption key is set up. */
+  unsigned int bulk_prefetch_ready:1; /* Look-up table prefetch ready for
+                                      * current bulk operation. */
+
+#ifdef USE_AESNI_AVX
+  /* HW-capability flags selecting the accelerated implementations.
+   * NOTE(review): presumably set during key setup from detected CPU
+   * features -- setup code not visible in this chunk. */
+  unsigned int use_aesni_avx:1;
+  unsigned int use_gfni_avx:1;
+#endif
+#ifdef USE_AESNI_AVX2
+  unsigned int use_aesni_avx2:1;
+  unsigned int use_vaes_avx2:1;
+  unsigned int use_gfni_avx2:1;
+#endif
+#ifdef USE_GFNI_AVX512
+  unsigned int use_gfni_avx512:1;
+#endif
+} ARIA_context;
+
+
+/* Key-schedule round constants: five 128-bit constants as u32 words.
+ * Entries 12..19 duplicate entries 0..7, presumably so the key-schedule
+ * index can wrap without a modulo -- NOTE(review): consuming loop is not
+ * visible in this chunk. */
+static const u32 key_rc[20] =
+  {
+    0x517cc1b7, 0x27220a94, 0xfe13abe8, 0xfa9a6ee0,
+    0x6db14acc, 0x9e21c820, 0xff28b1d5, 0xef5de2b0,
+    0xdb92371d, 0x2126e970, 0x03249775, 0x04e8c90e,
+    0x517cc1b7, 0x27220a94, 0xfe13abe8, 0xfa9a6ee0,
+    0x6db14acc, 0x9e21c820, 0xff28b1d5, 0xef5de2b0
+  };
+
+
+/* ARIA S-box lookup tables (s1, s2 and their use in the substitution
+ * layers), byte-replicated into u32 entries and packed into a single
+ * 64-byte-aligned struct.  The volatile counter_head/counter_tail fields
+ * bracket the tables -- they look like markers for whole-table cache
+ * prefetching as a timing-attack mitigation; NOTE(review): confirm against
+ * the prefetch code, which is not visible in this chunk. */
+static struct
+{
+  volatile u32 counter_head;
+  u32 cacheline_align[64 / 4 - 1];
+  u32 s1[256];
+  u32 s2[256];
+  u32 x1[256];
+  u32 x2[256];
+  volatile u32 counter_tail;
+} sboxes ATTR_ALIGNED_64 =
+  {
+    0,
+    { 0, },
+
+    { /* s1 */
+      0x00636363, 0x007c7c7c, 0x00777777, 0x007b7b7b,
+      0x00f2f2f2, 0x006b6b6b, 0x006f6f6f, 0x00c5c5c5,
+      0x00303030, 0x00010101, 0x00676767, 0x002b2b2b,
+      0x00fefefe, 0x00d7d7d7, 0x00ababab, 0x00767676,
+      0x00cacaca, 0x00828282, 0x00c9c9c9, 0x007d7d7d,
+      0x00fafafa, 0x00595959, 0x00474747, 0x00f0f0f0,
+      0x00adadad, 0x00d4d4d4, 0x00a2a2a2, 0x00afafaf,
+      0x009c9c9c, 0x00a4a4a4, 0x00727272, 0x00c0c0c0,
+      0x00b7b7b7, 0x00fdfdfd, 0x00939393, 0x00262626,
+      0x00363636, 0x003f3f3f, 0x00f7f7f7, 0x00cccccc,
+      0x00343434, 0x00a5a5a5, 0x00e5e5e5, 0x00f1f1f1,
+      0x00717171, 0x00d8d8d8, 0x00313131, 0x00151515,
+      0x00040404, 0x00c7c7c7, 0x00232323, 0x00c3c3c3,
+      0x00181818, 0x00969696, 0x00050505, 0x009a9a9a,
+      0x00070707, 0x00121212, 0x00808080, 0x00e2e2e2,
+      0x00ebebeb, 0x00272727, 0x00b2b2b2, 0x00757575,
+      0x00090909, 0x00838383, 0x002c2c2c, 0x001a1a1a,
+      0x001b1b1b, 0x006e6e6e, 0x005a5a5a, 0x00a0a0a0,
+      0x00525252, 0x003b3b3b, 0x00d6d6d6, 0x00b3b3b3,
+      0x00292929, 0x00e3e3e3, 0x002f2f2f, 0x00848484,
+      0x00535353, 0x00d1d1d1, 0x00000000, 0x00ededed,
+      0x00202020, 0x00fcfcfc, 0x00b1b1b1, 0x005b5b5b,
+      0x006a6a6a, 0x00cbcbcb, 0x00bebebe, 0x00393939,
+      0x004a4a4a, 0x004c4c4c, 0x00585858, 0x00cfcfcf,
+      0x00d0d0d0, 0x00efefef, 0x00aaaaaa, 0x00fbfbfb,
+      0x00434343, 0x004d4d4d, 0x00333333, 0x00858585,
+      0x00454545, 0x00f9f9f9, 0x00020202, 0x007f7f7f,
+      0x00505050, 0x003c3c3c, 0x009f9f9f, 0x00a8a8a8,
+      0x00515151, 0x00a3a3a3, 0x00404040, 0x008f8f8f,
+      0x00929292, 0x009d9d9d, 0x00383838, 0x00f5f5f5,
+      0x00bcbcbc, 0x00b6b6b6, 0x00dadada, 0x00212121,
+      0x00101010, 0x00ffffff, 0x00f3f3f3, 0x00d2d2d2,
+      0x00cdcdcd, 0x000c0c0c, 0x00131313, 0x00ececec,
+      0x005f5f5f, 0x00979797, 0x00444444, 0x00171717,
+      0x00c4c4c4, 0x00a7a7a7, 0x007e7e7e, 0x003d3d3d,
+      0x00646464, 0x005d5d5d, 0x00191919, 0x00737373,
+      0x00606060, 0x00818181, 0x004f4f4f, 0x00dcdcdc,
+      0x00222222, 0x002a2a2a, 0x00909090, 0x00888888,
+      0x00464646, 0x00eeeeee, 0x00b8b8b8, 0x00141414,
+      0x00dedede, 0x005e5e5e, 0x000b0b0b, 0x00dbdbdb,
+      0x00e0e0e0, 0x00323232, 0x003a3a3a, 0x000a0a0a,
+      0x00494949, 0x00060606, 0x00242424, 0x005c5c5c,
+      0x00c2c2c2, 0x00d3d3d3, 0x00acacac, 0x00626262,
+      0x00919191, 0x00959595, 0x00e4e4e4, 0x00797979,
+      0x00e7e7e7, 0x00c8c8c8, 0x00373737, 0x006d6d6d,
+      0x008d8d8d, 0x00d5d5d5, 0x004e4e4e, 0x00a9a9a9,
+      0x006c6c6c, 0x00565656, 0x00f4f4f4, 0x00eaeaea,
+      0x00656565, 0x007a7a7a, 0x00aeaeae, 0x00080808,
+      0x00bababa, 0x00787878, 0x00252525, 0x002e2e2e,
+      0x001c1c1c, 0x00a6a6a6, 0x00b4b4b4, 0x00c6c6c6,
+      0x00e8e8e8, 0x00dddddd, 0x00747474, 0x001f1f1f,
+      0x004b4b4b, 0x00bdbdbd, 0x008b8b8b, 0x008a8a8a,
+      0x00707070, 0x003e3e3e, 0x00b5b5b5, 0x00666666,
+      0x00484848, 0x00030303, 0x00f6f6f6, 0x000e0e0e,
+      0x00616161, 0x00353535, 0x00575757, 0x00b9b9b9,
+      0x00868686, 0x00c1c1c1, 0x001d1d1d, 0x009e9e9e,
+      0x00e1e1e1, 0x00f8f8f8, 0x00989898, 0x00111111,
+      0x00696969, 0x00d9d9d9, 0x008e8e8e, 0x00949494,
+      0x009b9b9b, 0x001e1e1e, 0x00878787, 0x00e9e9e9,
+      0x00cecece, 0x00555555, 0x00282828, 0x00dfdfdf,
+      0x008c8c8c, 0x00a1a1a1, 0x00898989, 0x000d0d0d,
+      0x00bfbfbf, 0x00e6e6e6, 0x00424242, 0x00686868,
+      0x00414141, 0x00999999, 0x002d2d2d, 0x000f0f0f,
+      0x00b0b0b0, 0x00545454, 0x00bbbbbb, 0x00161616
+    },
+    { /* s2 */
+      0xe200e2e2, 0x4e004e4e, 0x54005454, 0xfc00fcfc,
+      0x94009494, 0xc200c2c2, 0x4a004a4a, 0xcc00cccc,
+      0x62006262, 0x0d000d0d, 0x6a006a6a, 0x46004646,
+      0x3c003c3c, 0x4d004d4d, 0x8b008b8b, 0xd100d1d1,
+      0x5e005e5e, 0xfa00fafa, 0x64006464, 0xcb00cbcb,
+      0xb400b4b4, 0x97009797, 0xbe00bebe, 0x2b002b2b,
+      0xbc00bcbc, 0x77007777, 0x2e002e2e, 0x03000303,
+      0xd300d3d3, 0x19001919, 0x59005959, 0xc100c1c1,
+      0x1d001d1d, 0x06000606, 0x41004141, 0x6b006b6b,
+      0x55005555, 0xf000f0f0, 0x99009999, 0x69006969,
+      0xea00eaea, 0x9c009c9c, 0x18001818, 0xae00aeae,
+      0x63006363, 0xdf00dfdf, 0xe700e7e7, 0xbb00bbbb,
+      0x00000000, 0x73007373, 0x66006666, 0xfb00fbfb,
+      0x96009696, 0x4c004c4c, 0x85008585, 0xe400e4e4,
+      0x3a003a3a, 0x09000909, 0x45004545, 0xaa00aaaa,
+      0x0f000f0f, 0xee00eeee, 0x10001010, 0xeb00ebeb,
+      0x2d002d2d, 0x7f007f7f, 0xf400f4f4, 0x29002929,
+      0xac00acac, 0xcf00cfcf, 0xad00adad, 0x91009191,
+      0x8d008d8d, 0x78007878, 0xc800c8c8, 0x95009595,
+      0xf900f9f9, 0x2f002f2f, 0xce00cece, 0xcd00cdcd,
+      0x08000808, 0x7a007a7a, 0x88008888, 0x38003838,
+      0x5c005c5c, 0x83008383, 0x2a002a2a, 0x28002828,
+      0x47004747, 0xdb00dbdb, 0xb800b8b8, 0xc700c7c7,
+      0x93009393, 0xa400a4a4, 0x12001212, 0x53005353,
+      0xff00ffff, 0x87008787, 0x0e000e0e, 0x31003131,
+      0x36003636, 0x21002121, 0x58005858, 0x48004848,
+      0x01000101, 0x8e008e8e, 0x37003737, 0x74007474,
+      0x32003232, 0xca00caca, 0xe900e9e9, 0xb100b1b1,
+      0xb700b7b7, 0xab00abab, 0x0c000c0c, 0xd700d7d7,
+      0xc400c4c4, 0x56005656, 0x42004242, 0x26002626,
+      0x07000707, 0x98009898, 0x60006060, 0xd900d9d9,
+      0xb600b6b6, 0xb900b9b9, 0x11001111, 0x40004040,
+      0xec00ecec, 0x20002020, 0x8c008c8c, 0xbd00bdbd,
+      0xa000a0a0, 0xc900c9c9, 0x84008484, 0x04000404,
+      0x49004949, 0x23002323, 0xf100f1f1, 0x4f004f4f,
+      0x50005050, 0x1f001f1f, 0x13001313, 0xdc00dcdc,
+      0xd800d8d8, 0xc000c0c0, 0x9e009e9e, 0x57005757,
+      0xe300e3e3, 0xc300c3c3, 0x7b007b7b, 0x65006565,
+      0x3b003b3b, 0x02000202, 0x8f008f8f, 0x3e003e3e,
+      0xe800e8e8, 0x25002525, 0x92009292, 0xe500e5e5,
+      0x15001515, 0xdd00dddd, 0xfd00fdfd, 0x17001717,
+      0xa900a9a9, 0xbf00bfbf, 0xd400d4d4, 0x9a009a9a,
+      0x7e007e7e, 0xc500c5c5, 0x39003939, 0x67006767,
+      0xfe00fefe, 0x76007676, 0x9d009d9d, 0x43004343,
+      0xa700a7a7, 0xe100e1e1, 0xd000d0d0, 0xf500f5f5,
+      0x68006868, 0xf200f2f2, 0x1b001b1b, 0x34003434,
+      0x70007070, 0x05000505, 0xa300a3a3, 0x8a008a8a,
+      0xd500d5d5, 0x79007979, 0x86008686, 0xa800a8a8,
+      0x30003030, 0xc600c6c6, 0x51005151, 0x4b004b4b,
+      0x1e001e1e, 0xa600a6a6, 0x27002727, 0xf600f6f6,
+      0x35003535, 0xd200d2d2, 0x6e006e6e, 0x24002424,
+      0x16001616, 0x82008282, 0x5f005f5f, 0xda00dada,
+      0xe600e6e6, 0x75007575, 0xa200a2a2, 0xef00efef,
+      0x2c002c2c, 0xb200b2b2, 0x1c001c1c, 0x9f009f9f,
+      0x5d005d5d, 0x6f006f6f, 0x80008080, 0x0a000a0a,
+      0x72007272, 0x44004444, 0x9b009b9b, 0x6c006c6c,
+      0x90009090, 0x0b000b0b, 0x5b005b5b, 0x33003333,
+      0x7d007d7d, 0x5a005a5a, 0x52005252, 0xf300f3f3,
+      0x61006161, 0xa100a1a1, 0xf700f7f7, 0xb000b0b0,
+      0xd600d6d6, 0x3f003f3f, 0x7c007c7c, 0x6d006d6d,
+      0xed00eded, 0x14001414, 0xe000e0e0, 0xa500a5a5,
+      0x3d003d3d, 0x22002222, 0xb300b3b3, 0xf800f8f8,
+      0x89008989, 0xde00dede, 0x71007171, 0x1a001a1a,
+      0xaf00afaf, 0xba00baba, 0xb500b5b5, 0x81008181
+    },
+    { /* x1 */
+      0x52520052, 0x09090009, 0x6a6a006a, 0xd5d500d5,
+      0x30300030, 0x36360036, 0xa5a500a5, 0x38380038,
+      0xbfbf00bf, 0x40400040, 0xa3a300a3, 0x9e9e009e,
+      0x81810081, 0xf3f300f3, 0xd7d700d7, 0xfbfb00fb,
+      0x7c7c007c, 0xe3e300e3, 0x39390039, 0x82820082,
+      0x9b9b009b, 0x2f2f002f, 0xffff00ff, 0x87870087,
+      0x34340034, 0x8e8e008e, 0x43430043, 0x44440044,
+      0xc4c400c4, 0xdede00de, 0xe9e900e9, 0xcbcb00cb,
+      0x54540054, 0x7b7b007b, 0x94940094, 0x32320032,
+      0xa6a600a6, 0xc2c200c2, 0x23230023, 0x3d3d003d,
+      0xeeee00ee, 0x4c4c004c, 0x95950095, 0x0b0b000b,
+      0x42420042, 0xfafa00fa, 0xc3c300c3, 0x4e4e004e,
+      0x08080008, 0x2e2e002e, 0xa1a100a1, 0x66660066,
+      0x28280028, 0xd9d900d9, 0x24240024, 0xb2b200b2,
+      0x76760076, 0x5b5b005b, 0xa2a200a2, 0x49490049,
+      0x6d6d006d, 0x8b8b008b, 0xd1d100d1, 0x25250025,
+      0x72720072, 0xf8f800f8, 0xf6f600f6, 0x64640064,
+      0x86860086, 0x68680068, 0x98980098, 0x16160016,
+      0xd4d400d4, 0xa4a400a4, 0x5c5c005c, 0xcccc00cc,
+      0x5d5d005d, 0x65650065, 0xb6b600b6, 0x92920092,
+      0x6c6c006c, 0x70700070, 0x48480048, 0x50500050,
+      0xfdfd00fd, 0xeded00ed, 0xb9b900b9, 0xdada00da,
+      0x5e5e005e, 0x15150015, 0x46460046, 0x57570057,
+      0xa7a700a7, 0x8d8d008d, 0x9d9d009d, 0x84840084,
+      0x90900090, 0xd8d800d8, 0xabab00ab, 0x00000000,
+      0x8c8c008c, 0xbcbc00bc, 0xd3d300d3, 0x0a0a000a,
+      0xf7f700f7, 0xe4e400e4, 0x58580058, 0x05050005,
+      0xb8b800b8, 0xb3b300b3, 0x45450045, 0x06060006,
+      0xd0d000d0, 0x2c2c002c, 0x1e1e001e, 0x8f8f008f,
+      0xcaca00ca, 0x3f3f003f, 0x0f0f000f, 0x02020002,
+      0xc1c100c1, 0xafaf00af, 0xbdbd00bd, 0x03030003,
+      0x01010001, 0x13130013, 0x8a8a008a, 0x6b6b006b,
+      0x3a3a003a, 0x91910091, 0x11110011, 0x41410041,
+      0x4f4f004f, 0x67670067, 0xdcdc00dc, 0xeaea00ea,
+      0x97970097, 0xf2f200f2, 0xcfcf00cf, 0xcece00ce,
+      0xf0f000f0, 0xb4b400b4, 0xe6e600e6, 0x73730073,
+      0x96960096, 0xacac00ac, 0x74740074, 0x22220022,
+      0xe7e700e7, 0xadad00ad, 0x35350035, 0x85850085,
+      0xe2e200e2, 0xf9f900f9, 0x37370037, 0xe8e800e8,
+      0x1c1c001c, 0x75750075, 0xdfdf00df, 0x6e6e006e,
+      0x47470047, 0xf1f100f1, 0x1a1a001a, 0x71710071,
+      0x1d1d001d, 0x29290029, 0xc5c500c5, 0x89890089,
+      0x6f6f006f, 0xb7b700b7, 0x62620062, 0x0e0e000e,
+      0xaaaa00aa, 0x18180018, 0xbebe00be, 0x1b1b001b,
+      0xfcfc00fc, 0x56560056, 0x3e3e003e, 0x4b4b004b,
+      0xc6c600c6, 0xd2d200d2, 0x79790079, 0x20200020,
+      0x9a9a009a, 0xdbdb00db, 0xc0c000c0, 0xfefe00fe,
+      0x78780078, 0xcdcd00cd, 0x5a5a005a, 0xf4f400f4,
+      0x1f1f001f, 0xdddd00dd, 0xa8a800a8, 0x33330033,
+      0x88880088, 0x07070007, 0xc7c700c7, 0x31310031,
+      0xb1b100b1, 0x12120012, 0x10100010, 0x59590059,
+      0x27270027, 0x80800080, 0xecec00ec, 0x5f5f005f,
+      0x60600060, 0x51510051, 0x7f7f007f, 0xa9a900a9,
+      0x19190019, 0xb5b500b5, 0x4a4a004a, 0x0d0d000d,
+      0x2d2d002d, 0xe5e500e5, 0x7a7a007a, 0x9f9f009f,
+      0x93930093, 0xc9c900c9, 0x9c9c009c, 0xefef00ef,
+      0xa0a000a0, 0xe0e000e0, 0x3b3b003b, 0x4d4d004d,
+      0xaeae00ae, 0x2a2a002a, 0xf5f500f5, 0xb0b000b0,
+      0xc8c800c8, 0xebeb00eb, 0xbbbb00bb, 0x3c3c003c,
+      0x83830083, 0x53530053, 0x99990099, 0x61610061,
+      0x17170017, 0x2b2b002b, 0x04040004, 0x7e7e007e,
+      0xbaba00ba, 0x77770077, 0xd6d600d6, 0x26260026,
+      0xe1e100e1, 0x69690069, 0x14140014, 0x63630063,
+      0x55550055, 0x21210021, 0x0c0c000c, 0x7d7d007d
+    },
+    { /* x2 */
+      0x30303000, 0x68686800, 0x99999900, 0x1b1b1b00,
+      0x87878700, 0xb9b9b900, 0x21212100, 0x78787800,
+      0x50505000, 0x39393900, 0xdbdbdb00, 0xe1e1e100,
+      0x72727200, 0x09090900, 0x62626200, 0x3c3c3c00,
+      0x3e3e3e00, 0x7e7e7e00, 0x5e5e5e00, 0x8e8e8e00,
+      0xf1f1f100, 0xa0a0a000, 0xcccccc00, 0xa3a3a300,
+      0x2a2a2a00, 0x1d1d1d00, 0xfbfbfb00, 0xb6b6b600,
+      0xd6d6d600, 0x20202000, 0xc4c4c400, 0x8d8d8d00,
+      0x81818100, 0x65656500, 0xf5f5f500, 0x89898900,
+      0xcbcbcb00, 0x9d9d9d00, 0x77777700, 0xc6c6c600,
+      0x57575700, 0x43434300, 0x56565600, 0x17171700,
+      0xd4d4d400, 0x40404000, 0x1a1a1a00, 0x4d4d4d00,
+      0xc0c0c000, 0x63636300, 0x6c6c6c00, 0xe3e3e300,
+      0xb7b7b700, 0xc8c8c800, 0x64646400, 0x6a6a6a00,
+      0x53535300, 0xaaaaaa00, 0x38383800, 0x98989800,
+      0x0c0c0c00, 0xf4f4f400, 0x9b9b9b00, 0xededed00,
+      0x7f7f7f00, 0x22222200, 0x76767600, 0xafafaf00,
+      0xdddddd00, 0x3a3a3a00, 0x0b0b0b00, 0x58585800,
+      0x67676700, 0x88888800, 0x06060600, 0xc3c3c300,
+      0x35353500, 0x0d0d0d00, 0x01010100, 0x8b8b8b00,
+      0x8c8c8c00, 0xc2c2c200, 0xe6e6e600, 0x5f5f5f00,
+      0x02020200, 0x24242400, 0x75757500, 0x93939300,
+      0x66666600, 0x1e1e1e00, 0xe5e5e500, 0xe2e2e200,
+      0x54545400, 0xd8d8d800, 0x10101000, 0xcecece00,
+      0x7a7a7a00, 0xe8e8e800, 0x08080800, 0x2c2c2c00,
+      0x12121200, 0x97979700, 0x32323200, 0xababab00,
+      0xb4b4b400, 0x27272700, 0x0a0a0a00, 0x23232300,
+      0xdfdfdf00, 0xefefef00, 0xcacaca00, 0xd9d9d900,
+      0xb8b8b800, 0xfafafa00, 0xdcdcdc00, 0x31313100,
+      0x6b6b6b00, 0xd1d1d100, 0xadadad00, 0x19191900,
+      0x49494900, 0xbdbdbd00, 0x51515100, 0x96969600,
+      0xeeeeee00, 0xe4e4e400, 0xa8a8a800, 0x41414100,
+      0xdadada00, 0xffffff00, 0xcdcdcd00, 0x55555500,
+      0x86868600, 0x36363600, 0xbebebe00, 0x61616100,
+      0x52525200, 0xf8f8f800, 0xbbbbbb00, 0x0e0e0e00,
+      0x82828200, 0x48484800, 0x69696900, 0x9a9a9a00,
+      0xe0e0e000, 0x47474700, 0x9e9e9e00, 0x5c5c5c00,
+      0x04040400, 0x4b4b4b00, 0x34343400, 0x15151500,
+      0x79797900, 0x26262600, 0xa7a7a700, 0xdedede00,
+      0x29292900, 0xaeaeae00, 0x92929200, 0xd7d7d700,
+      0x84848400, 0xe9e9e900, 0xd2d2d200, 0xbababa00,
+      0x5d5d5d00, 0xf3f3f300, 0xc5c5c500, 0xb0b0b000,
+      0xbfbfbf00, 0xa4a4a400, 0x3b3b3b00, 0x71717100,
+      0x44444400, 0x46464600, 0x2b2b2b00, 0xfcfcfc00,
+      0xebebeb00, 0x6f6f6f00, 0xd5d5d500, 0xf6f6f600,
+      0x14141400, 0xfefefe00, 0x7c7c7c00, 0x70707000,
+      0x5a5a5a00, 0x7d7d7d00, 0xfdfdfd00, 0x2f2f2f00,
+      0x18181800, 0x83838300, 0x16161600, 0xa5a5a500,
+      0x91919100, 0x1f1f1f00, 0x05050500, 0x95959500,
+      0x74747400, 0xa9a9a900, 0xc1c1c100, 0x5b5b5b00,
+      0x4a4a4a00, 0x85858500, 0x6d6d6d00, 0x13131300,
+      0x07070700, 0x4f4f4f00, 0x4e4e4e00, 0x45454500,
+      0xb2b2b200, 0x0f0f0f00, 0xc9c9c900, 0x1c1c1c00,
+      0xa6a6a600, 0xbcbcbc00, 0xececec00, 0x73737300,
+      0x90909000, 0x7b7b7b00, 0xcfcfcf00, 0x59595900,
+      0x8f8f8f00, 0xa1a1a100, 0xf9f9f900, 0x2d2d2d00,
+      0xf2f2f200, 0xb1b1b100, 0x00000000, 0x94949400,
+      0x37373700, 0x9f9f9f00, 0xd0d0d000, 0x2e2e2e00,
+      0x9c9c9c00, 0x6e6e6e00, 0x28282800, 0x3f3f3f00,
+      0x80808000, 0xf0f0f000, 0x3d3d3d00, 0xd3d3d300,
+      0x25252500, 0x8a8a8a00, 0xb5b5b500, 0xe7e7e700,
+      0x42424200, 0xb3b3b300, 0xc7c7c700, 0xeaeaea00,
+      0xf7f7f700, 0x4c4c4c00, 0x11111100, 0x33333300,
+      0x03030300, 0xa2a2a200, 0xacacac00, 0x60606000
+    },
+    0
+  };
+
+#ifdef USE_AESNI_AVX
+/* Assembly entry points for the AES-NI/AVX (and optional GFNI/AVX)
+ * implementations.  Each returns the extra stack depth used by the
+ * assembly, to be added to the caller's stack burn amount.  */
+extern unsigned int
+_gcry_aria_aesni_avx_ecb_crypt_blk1_16(const void *ctx, byte *out,
+                                      const byte *in, const void *key,
+                                      u64 nblks) ASM_FUNC_ABI;
+extern unsigned int
+_gcry_aria_aesni_avx_ctr_crypt_blk16(const void *ctx, byte *out,
+                                    const byte *in, byte *iv) ASM_FUNC_ABI;
+
+#ifdef USE_GFNI_AVX
+extern unsigned int
+_gcry_aria_gfni_avx_ecb_crypt_blk1_16(const void *ctx, byte *out,
+                                     const byte *in, const void *key,
+                                     u64 nblks) ASM_FUNC_ABI;
+extern unsigned int
+_gcry_aria_gfni_avx_ctr_crypt_blk16(const void *ctx, byte *out,
+                                   const byte *in, byte *iv) ASM_FUNC_ABI;
+#endif /* USE_GFNI_AVX */
+
+/* ECB-crypt 1..16 blocks with the given round-key table, preferring the
+ * GFNI/AVX code path when the CPU supports it.  Returns stack burn
+ * depth (asm depth plus ASM_EXTRA_STACK).  */
+static inline unsigned int
+aria_avx_ecb_crypt_blk1_16(const ARIA_context *ctx, byte *out, const byte *in,
+                          const u32 key[][ARIA_RD_KEY_WORDS], size_t nblks)
+{
+  if (0) { }
+#ifdef USE_GFNI_AVX
+  else if (ctx->use_gfni_avx)
+    return _gcry_aria_gfni_avx_ecb_crypt_blk1_16(ctx, out, in, key, nblks)
+               + ASM_EXTRA_STACK;
+#endif /* USE_GFNI_AVX */
+  else
+    return _gcry_aria_aesni_avx_ecb_crypt_blk1_16(ctx, out, in, key, nblks)
+               + ASM_EXTRA_STACK;
+}
+
+/* CTR-crypt exactly 16 blocks; IV is the counter block (presumably
+ * advanced in place by the assembly -- see aria-aesni-avx-amd64.S).
+ * Returns stack burn depth.  */
+static inline unsigned int
+aria_avx_ctr_crypt_blk16(const ARIA_context *ctx, byte *out, const byte *in,
+                        byte *iv)
+{
+  if (0) { }
+#ifdef USE_GFNI_AVX
+  else if (ctx->use_gfni_avx)
+    return _gcry_aria_gfni_avx_ctr_crypt_blk16(ctx, out, in, iv)
+               + ASM_EXTRA_STACK;
+#endif /* USE_GFNI_AVX */
+  else
+    return _gcry_aria_aesni_avx_ctr_crypt_blk16(ctx, out, in, iv)
+               + ASM_EXTRA_STACK;
+}
+#endif /* USE_AESNI_AVX */
+
+#ifdef USE_AESNI_AVX2
+/* Assembly entry points for the AVX2 implementations (AES-NI, VAES and
+ * GFNI variants).  Each processes exactly 32 blocks and returns the
+ * extra stack depth used by the assembly.  */
+extern unsigned int
+_gcry_aria_aesni_avx2_ecb_crypt_blk32(const void *ctx, byte *out,
+                                     const byte *in,
+                                     const void *key) ASM_FUNC_ABI;
+extern unsigned int
+_gcry_aria_aesni_avx2_ctr_crypt_blk32(const void *ctx, byte *out,
+                                     const byte *in, byte *iv) ASM_FUNC_ABI;
+
+#ifdef USE_VAES_AVX2
+extern unsigned int
+_gcry_aria_vaes_avx2_ecb_crypt_blk32(const void *ctx, byte *out,
+                                    const byte *in,
+                                    const void *key) ASM_FUNC_ABI;
+extern unsigned int
+_gcry_aria_vaes_avx2_ctr_crypt_blk32(const void *ctx, byte *out,
+                                    const byte *in, byte *iv) ASM_FUNC_ABI;
+#endif /* USE_VAES_AVX2 */
+
+#ifdef USE_GFNI_AVX2
+extern unsigned int
+_gcry_aria_gfni_avx2_ecb_crypt_blk32(const void *ctx, byte *out,
+                                    const byte *in,
+                                    const void *key) ASM_FUNC_ABI;
+extern unsigned int
+_gcry_aria_gfni_avx2_ctr_crypt_blk32(const void *ctx, byte *out,
+                                    const byte *in, byte *iv) ASM_FUNC_ABI;
+#endif /* USE_GFNI_AVX2 */
+
+/* ECB-crypt exactly 32 blocks; dispatch order is GFNI > VAES > AES-NI,
+ * i.e. fastest supported variant first.  Returns stack burn depth.  */
+static inline unsigned int
+aria_avx2_ecb_crypt_blk32(const ARIA_context *ctx, byte *out, const byte *in,
+                         const u32 key[][ARIA_RD_KEY_WORDS])
+{
+  if (0) { }
+#ifdef USE_GFNI_AVX2
+  else if (ctx->use_gfni_avx2)
+    return _gcry_aria_gfni_avx2_ecb_crypt_blk32(ctx, out, in, key)
+               + ASM_EXTRA_STACK;
+#endif /* USE_GFNI_AVX2 */
+#ifdef USE_VAES_AVX2
+  else if (ctx->use_vaes_avx2)
+    return _gcry_aria_vaes_avx2_ecb_crypt_blk32(ctx, out, in, key)
+               + ASM_EXTRA_STACK;
+#endif /* USE_VAES_AVX2 */
+  else
+    return _gcry_aria_aesni_avx2_ecb_crypt_blk32(ctx, out, in, key)
+               + ASM_EXTRA_STACK;
+}
+
+/* CTR-crypt exactly 32 blocks; same dispatch order as the ECB variant.
+ * Returns stack burn depth.  */
+static inline unsigned int
+aria_avx2_ctr_crypt_blk32(const ARIA_context *ctx, byte *out, const byte *in,
+                         byte *iv)
+{
+  if (0) { }
+#ifdef USE_GFNI_AVX2
+  else if (ctx->use_gfni_avx2)
+    return _gcry_aria_gfni_avx2_ctr_crypt_blk32(ctx, out, in, iv)
+               + ASM_EXTRA_STACK;
+#endif /* USE_GFNI_AVX2 */
+#ifdef USE_VAES_AVX2
+  else if (ctx->use_vaes_avx2)
+    return _gcry_aria_vaes_avx2_ctr_crypt_blk32(ctx, out, in, iv)
+               + ASM_EXTRA_STACK;
+#endif /* USE_VAES_AVX2 */
+  else
+    return _gcry_aria_aesni_avx2_ctr_crypt_blk32(ctx, out, in, iv)
+               + ASM_EXTRA_STACK;
+}
+#endif /* USE_AESNI_AVX2 */
+
+#ifdef USE_GFNI_AVX512
+/* Assembly entry points for the GFNI/AVX-512 implementation; each
+ * processes exactly 64 blocks and returns the extra stack depth used
+ * by the assembly.  */
+extern unsigned int
+_gcry_aria_gfni_avx512_ecb_crypt_blk64(const void *ctx, byte *out,
+                                      const byte *in,
+                                      const void *key) ASM_FUNC_ABI;
+extern unsigned int
+_gcry_aria_gfni_avx512_ctr_crypt_blk64(const void *ctx, byte *out,
+                                      const byte *in, byte *iv) ASM_FUNC_ABI;
+
+/* ECB-crypt exactly 64 blocks.  Returns stack burn depth.  */
+static inline unsigned int
+aria_gfni_avx512_ecb_crypt_blk64(const ARIA_context *ctx, byte *out,
+                                const byte *in,
+                                const u32 key[][ARIA_RD_KEY_WORDS])
+{
+  return _gcry_aria_gfni_avx512_ecb_crypt_blk64(ctx, out, in, key)
+               + ASM_EXTRA_STACK;
+}
+
+/* CTR-crypt exactly 64 blocks.  Returns stack burn depth.  */
+static inline unsigned int
+aria_gfni_avx512_ctr_crypt_blk64(const ARIA_context *ctx, byte *out,
+                                const byte *in, byte *iv)
+{
+  return _gcry_aria_gfni_avx512_ctr_crypt_blk64(ctx, out, in, iv)
+               + ASM_EXTRA_STACK;
+}
+#endif /* USE_GFNI_AVX512 */
+
+/* Prefetching for sbox tables.  Touch one byte per 32-byte step (the
+ * 32-byte stride presumably matches half a cache line or a full one,
+ * depending on CPU) so the whole table is pulled into cache before use;
+ * helps reduce cache-timing leakage of table lookups. */
+static inline void
+prefetch_table(const volatile byte *tab, size_t len)
+{
+  size_t i;
+
+  /* Unrolled by 8 to reduce loop overhead. */
+  for (i = 0; len - i >= 8 * 32; i += 8 * 32)
+    {
+      (void)tab[i + 0 * 32];
+      (void)tab[i + 1 * 32];
+      (void)tab[i + 2 * 32];
+      (void)tab[i + 3 * 32];
+      (void)tab[i + 4 * 32];
+      (void)tab[i + 5 * 32];
+      (void)tab[i + 6 * 32];
+      (void)tab[i + 7 * 32];
+    }
+  for (; i < len; i += 32)
+    {
+      (void)tab[i];
+    }
+
+  /* Make sure the final cache line is fetched as well. */
+  (void)tab[len - 1];
+}
+
+/* Prepare the shared sbox table for constant-time-ish use: unshare its
+ * physical pages and pull it into cache. */
+static inline void
+prefetch_sboxes(void)
+{
+  /* Modify counters to trigger copy-on-write and unsharing if physical pages
+   * of look-up table are shared between processes.  Modifying counters also
+   * causes checksums for pages to change and hint same-page merging algorithm
+   * that these pages are frequently changing.  */
+  sboxes.counter_head++;
+  sboxes.counter_tail++;
+
+  /* Prefetch look-up tables to cache.  */
+  prefetch_table((const void *)&sboxes, sizeof(sboxes));
+}
+
+
+/* Rotate V right by R bits. */
+static ALWAYS_INLINE
+u32 rotr32(u32 v, u32 r)
+{
+  return ror(v, r);
+}
+
+/* Byte-swap a 32-bit word. */
+static ALWAYS_INLINE
+u32 bswap32(u32 v)
+{
+  return _gcry_bswap32(v);
+}
+
+/* Extract byte Y (0 = most significant) from big-endian word X. */
+static ALWAYS_INLINE u32
+get_u8(u32 x, u32 y)
+{
+  return (x >> ((3 - y) * 8)) & 0xFF;
+}
+
+/* Assemble a 32-bit word from four bytes, V0 most significant. */
+static ALWAYS_INLINE u32
+make_u32(byte v0, byte v1, byte v2, byte v3)
+{
+  return ((u32)v0 << 24) | ((u32)v1 << 16) | ((u32)v2 <<  8) | ((u32)v3);
+}
+
+/* Per-word mixing used when deriving decryption round keys
+ * (t0 ^ rotr(t0,8) ^ rotr(t0,16) ^ rotr(t0,24), written with two
+ * rotations). */
+static ALWAYS_INLINE u32
+aria_m(u32 t0)
+{
+  return rotr32(t0, 8) ^ rotr32(t0 ^ rotr32(t0, 8), 16);
+}
+
+/* S-Box Layer 1 + M: substitute each byte of the four state words
+ * through S1/S2/X1/X2 using tables that have the byte-diffusion matrix
+ * pre-applied, and XOR the four table results per word. */
+static ALWAYS_INLINE void
+aria_sbox_layer1_with_pre_diff(u32 *t0, u32 *t1, u32 *t2, u32 *t3)
+{
+  *t0 = sboxes.s1[get_u8(*t0, 0)] ^
+       sboxes.s2[get_u8(*t0, 1)] ^
+       sboxes.x1[get_u8(*t0, 2)] ^
+       sboxes.x2[get_u8(*t0, 3)];
+  *t1 = sboxes.s1[get_u8(*t1, 0)] ^
+       sboxes.s2[get_u8(*t1, 1)] ^
+       sboxes.x1[get_u8(*t1, 2)] ^
+       sboxes.x2[get_u8(*t1, 3)];
+  *t2 = sboxes.s1[get_u8(*t2, 0)] ^
+       sboxes.s2[get_u8(*t2, 1)] ^
+       sboxes.x1[get_u8(*t2, 2)] ^
+       sboxes.x2[get_u8(*t2, 3)];
+  *t3 = sboxes.s1[get_u8(*t3, 0)] ^
+       sboxes.s2[get_u8(*t3, 1)] ^
+       sboxes.x1[get_u8(*t3, 2)] ^
+       sboxes.x2[get_u8(*t3, 3)];
+}
+
+/* S-Box Layer 2 + M: like layer 1 but with the sbox order swapped
+ * (X1/X2 first, then S1/S2), as used in ARIA's even rounds. */
+static ALWAYS_INLINE void
+aria_sbox_layer2_with_pre_diff(u32 *t0, u32 *t1, u32 *t2, u32 *t3)
+{
+  *t0 = sboxes.x1[get_u8(*t0, 0)] ^
+       sboxes.x2[get_u8(*t0, 1)] ^
+       sboxes.s1[get_u8(*t0, 2)] ^
+       sboxes.s2[get_u8(*t0, 3)];
+  *t1 = sboxes.x1[get_u8(*t1, 0)] ^
+       sboxes.x2[get_u8(*t1, 1)] ^
+       sboxes.s1[get_u8(*t1, 2)] ^
+       sboxes.s2[get_u8(*t1, 3)];
+  *t2 = sboxes.x1[get_u8(*t2, 0)] ^
+       sboxes.x2[get_u8(*t2, 1)] ^
+       sboxes.s1[get_u8(*t2, 2)] ^
+       sboxes.s2[get_u8(*t2, 3)];
+  *t3 = sboxes.x1[get_u8(*t3, 0)] ^
+       sboxes.x2[get_u8(*t3, 1)] ^
+       sboxes.s1[get_u8(*t3, 2)] ^
+       sboxes.s2[get_u8(*t3, 3)];
+}
+
+/* Word-level diffusion: XOR-mix the four state words in place (part of
+ * ARIA's 16x16 binary diffusion matrix A).  The statement order is
+ * significant -- each XOR uses the already-updated values. */
+static ALWAYS_INLINE void
+aria_diff_word(u32 *t0, u32 *t1, u32 *t2, u32 *t3)
+{
+  *t1 ^= *t2;
+  *t2 ^= *t3;
+  *t0 ^= *t1;
+
+  *t3 ^= *t1;
+  *t2 ^= *t0;
+  *t1 ^= *t2;
+}
+
+/* Byte-level diffusion: permute bytes within three of the state words
+ * (t1: swap bytes within each 16-bit half; t2: rotate by 16; t3: full
+ * byte swap). */
+static inline void aria_diff_byte(u32 *t1, u32 *t2, u32 *t3)
+{
+  *t1 = ((*t1 << 8) & 0xff00ff00) ^ ((*t1 >> 8) & 0x00ff00ff);
+  *t2 = rotr32(*t2, 16);
+  *t3 = bswap32(*t3);
+}
+
+/* Key XOR Layer: XOR one 128-bit round key (four words) into the state. */
+static ALWAYS_INLINE void
+aria_add_round_key(u32 *rk, u32 *t0, u32 *t1, u32 *t2, u32 *t3)
+{
+  *t0 ^= rk[0];
+  *t1 ^= rk[1];
+  *t2 ^= rk[2];
+  *t3 ^= rk[3];
+}
+
+/* Odd round Substitution & Diffusion: sbox layer 1 followed by the
+ * word/byte/word diffusion sequence. */
+static ALWAYS_INLINE void
+aria_subst_diff_odd(u32 *t0, u32 *t1, u32 *t2, u32 *t3)
+{
+  aria_sbox_layer1_with_pre_diff(t0, t1, t2, t3);
+  aria_diff_word(t0, t1, t2, t3);
+  aria_diff_byte(t1, t2, t3);
+  aria_diff_word(t0, t1, t2, t3);
+}
+
+/* Even round Substitution & Diffusion: sbox layer 2, then diffusion;
+ * note the byte-diffusion operates on (t3, t0, t1) here, not (t1, t2, t3)
+ * as in the odd rounds. */
+static ALWAYS_INLINE void
+aria_subst_diff_even(u32 *t0, u32 *t1, u32 *t2, u32 *t3)
+{
+  aria_sbox_layer2_with_pre_diff(t0, t1, t2, t3);
+  aria_diff_word(t0, t1, t2, t3);
+  aria_diff_byte(t3, t0, t1);
+  aria_diff_word(t0, t1, t2, t3);
+}
+
+/* Last round: plain substitution without diffusion.  Each output byte
+ * is picked from the corresponding pre-diffused table entry (the x2
+ * table keeps its value in the top byte, hence the ">> 24"). */
+static ALWAYS_INLINE void
+aria_last_round(u32 *t0, u32 *t1, u32 *t2, u32 *t3)
+{
+  *t0 = make_u32((byte)(sboxes.x1[get_u8(*t0, 0)]),
+                (byte)(sboxes.x2[get_u8(*t0, 1)] >> 24),
+                (byte)(sboxes.s1[get_u8(*t0, 2)]),
+                (byte)(sboxes.s2[get_u8(*t0, 3)]));
+  *t1 = make_u32((byte)(sboxes.x1[get_u8(*t1, 0)]),
+                (byte)(sboxes.x2[get_u8(*t1, 1)] >> 24),
+                (byte)(sboxes.s1[get_u8(*t1, 2)]),
+                (byte)(sboxes.s2[get_u8(*t1, 3)]));
+  *t2 = make_u32((byte)(sboxes.x1[get_u8(*t2, 0)]),
+                (byte)(sboxes.x2[get_u8(*t2, 1)] >> 24),
+                (byte)(sboxes.s1[get_u8(*t2, 2)]),
+                (byte)(sboxes.s2[get_u8(*t2, 3)]));
+  *t3 = make_u32((byte)(sboxes.x1[get_u8(*t3, 0)]),
+                (byte)(sboxes.x2[get_u8(*t3, 1)] >> 24),
+                (byte)(sboxes.s1[get_u8(*t3, 2)]),
+                (byte)(sboxes.s2[get_u8(*t3, 3)]));
+}
+
+/* Q, R Macro expanded ARIA GSRK: rk = x XOR (128-bit y right-rotated by
+ * n bits).  q selects which 32-bit word of y aligns with each output
+ * word, r is the remaining bit shift.  Callers pass n in
+ * {19, 31, 67, 97, 109}, so r is never 0 and the "<< (32 - r)" is
+ * well defined. */
+static ALWAYS_INLINE void
+aria_gsrk(u32 *rk, u32 *x, u32 *y, u32 n)
+{
+  int q = 4 - (n / 32);
+  int r = n % 32;
+
+  rk[0] = (x[0]) ^
+         ((y[q % 4]) >> r) ^
+         ((y[(q + 3) % 4]) << (32 - r));
+  rk[1] = (x[1]) ^
+         ((y[(q + 1) % 4]) >> r) ^
+         ((y[q % 4]) << (32 - r));
+  rk[2] = (x[2]) ^
+         ((y[(q + 2) % 4]) >> r) ^
+         ((y[(q + 1) % 4]) << (32 - r));
+  rk[3] = (x[3]) ^
+         ((y[(q + 3) % 4]) >> r) ^
+         ((y[(q + 2) % 4]) << (32 - r));
+}
+
+
+/* Expand IN_KEY (KEY_LEN bytes: 16, 24 or 32) into the encryption round
+ * keys ctx->enc_key.  Sets ctx->rounds to 12/14/16 for 128/192/256-bit
+ * keys ((key_len + 32) / 4).  Intermediate key material is wiped before
+ * returning. */
+static NO_INLINE void
+aria_set_encrypt_key(ARIA_context *ctx, const byte *in_key, u32 key_len)
+{
+  u32 w0[4], w1[4], w2[4], w3[4];
+  u32 reg0, reg1, reg2, reg3;
+  const u32 *ck;
+  int rkidx = 0;
+
+  ctx->rounds = (key_len + 32) / 4;
+  prefetch_sboxes();
+
+  /* Select the starting round-constant set by key length (key_rc is
+   * defined elsewhere in this file). */
+  ck = &key_rc[(key_len - 16) / 2];
+
+  /* KL = first 128 bits of the key, big-endian. */
+  w0[0] = buf_get_be32(in_key + 0);
+  w0[1] = buf_get_be32(in_key + 4);
+  w0[2] = buf_get_be32(in_key + 8);
+  w0[3] = buf_get_be32(in_key + 12);
+
+  reg0 = w0[0] ^ ck[0];
+  reg1 = w0[1] ^ ck[1];
+  reg2 = w0[2] ^ ck[2];
+  reg3 = w0[3] ^ ck[3];
+
+  aria_subst_diff_odd(&reg0, &reg1, &reg2, &reg3);
+
+  /* KR = remaining key bits, zero-padded to 128 bits. */
+  if (key_len > 16)
+    {
+      w1[0] = buf_get_be32(in_key + 16);
+      w1[1] = buf_get_be32(in_key + 20);
+      if (key_len > 24)
+       {
+         w1[2] = buf_get_be32(in_key + 24);
+         w1[3] = buf_get_be32(in_key + 28);
+       }
+      else
+       {
+         w1[2] = 0;
+         w1[3] = 0;
+       }
+    }
+  else
+    {
+      w1[0] = 0;
+      w1[1] = 0;
+      w1[2] = 0;
+      w1[3] = 0;
+    }
+
+  /* W1 = KR ^ F_odd(W0 ^ CK1). */
+  w1[0] ^= reg0;
+  w1[1] ^= reg1;
+  w1[2] ^= reg2;
+  w1[3] ^= reg3;
+
+  reg0 = w1[0];
+  reg1 = w1[1];
+  reg2 = w1[2];
+  reg3 = w1[3];
+
+  reg0 ^= ck[4];
+  reg1 ^= ck[5];
+  reg2 ^= ck[6];
+  reg3 ^= ck[7];
+
+  aria_subst_diff_even(&reg0, &reg1, &reg2, &reg3);
+
+  /* W2 = W0 ^ F_even(W1 ^ CK2). */
+  reg0 ^= w0[0];
+  reg1 ^= w0[1];
+  reg2 ^= w0[2];
+  reg3 ^= w0[3];
+
+  w2[0] = reg0;
+  w2[1] = reg1;
+  w2[2] = reg2;
+  w2[3] = reg3;
+
+  reg0 ^= ck[8];
+  reg1 ^= ck[9];
+  reg2 ^= ck[10];
+  reg3 ^= ck[11];
+
+  aria_subst_diff_odd(&reg0, &reg1, &reg2, &reg3);
+
+  /* W3 = W1 ^ F_odd(W2 ^ CK3). */
+  w3[0] = reg0 ^ w1[0];
+  w3[1] = reg1 ^ w1[1];
+  w3[2] = reg2 ^ w1[2];
+  w3[3] = reg3 ^ w1[3];
+
+  /* Round keys ek1..ek17: GSRK of (Wi, Wj) with rotations 19/31/67/97/109;
+   * the number of keys emitted depends on the key length. */
+  aria_gsrk(ctx->enc_key[rkidx], w0, w1, 19);
+  rkidx++;
+  aria_gsrk(ctx->enc_key[rkidx], w1, w2, 19);
+  rkidx++;
+  aria_gsrk(ctx->enc_key[rkidx], w2, w3, 19);
+  rkidx++;
+  aria_gsrk(ctx->enc_key[rkidx], w3, w0, 19);
+
+  rkidx++;
+  aria_gsrk(ctx->enc_key[rkidx], w0, w1, 31);
+  rkidx++;
+  aria_gsrk(ctx->enc_key[rkidx], w1, w2, 31);
+  rkidx++;
+  aria_gsrk(ctx->enc_key[rkidx], w2, w3, 31);
+  rkidx++;
+  aria_gsrk(ctx->enc_key[rkidx], w3, w0, 31);
+
+  rkidx++;
+  aria_gsrk(ctx->enc_key[rkidx], w0, w1, 67);
+  rkidx++;
+  aria_gsrk(ctx->enc_key[rkidx], w1, w2, 67);
+  rkidx++;
+  aria_gsrk(ctx->enc_key[rkidx], w2, w3, 67);
+  rkidx++;
+  aria_gsrk(ctx->enc_key[rkidx], w3, w0, 67);
+
+  rkidx++;
+  aria_gsrk(ctx->enc_key[rkidx], w0, w1, 97);
+  if (key_len > 16)
+    {
+      rkidx++;
+      aria_gsrk(ctx->enc_key[rkidx], w1, w2, 97);
+      rkidx++;
+      aria_gsrk(ctx->enc_key[rkidx], w2, w3, 97);
+
+      if (key_len > 24)
+       {
+         rkidx++;
+         aria_gsrk(ctx->enc_key[rkidx], w3, w0, 97);
+
+         rkidx++;
+         aria_gsrk(ctx->enc_key[rkidx], w0, w1, 109);
+       }
+    }
+
+  /* Do not leave key material on the stack. */
+  wipememory(w0, sizeof(w0));
+  wipememory(w1, sizeof(w1));
+  wipememory(w2, sizeof(w2));
+  wipememory(w3, sizeof(w3));
+}
+
+/* Derive the decryption round keys from the encryption round keys:
+ * reverse the key order, and apply the diffusion transform (aria_m plus
+ * word/byte diffusion) to all middle keys.  Requires ctx->rounds and
+ * ctx->enc_key to be set by aria_set_encrypt_key. */
+static void
+aria_set_decrypt_key(ARIA_context *ctx)
+{
+  int i;
+
+  /* First and last decryption keys are the last and first encryption
+   * keys, untransformed. */
+  for (i = 0; i < 4; i++)
+    {
+      ctx->dec_key[0][i] = ctx->enc_key[ctx->rounds][i];
+      ctx->dec_key[ctx->rounds][i] = ctx->enc_key[0][i];
+    }
+
+  for (i = 1; i < ctx->rounds; i++)
+    {
+      ctx->dec_key[i][0] = aria_m(ctx->enc_key[ctx->rounds - i][0]);
+      ctx->dec_key[i][1] = aria_m(ctx->enc_key[ctx->rounds - i][1]);
+      ctx->dec_key[i][2] = aria_m(ctx->enc_key[ctx->rounds - i][2]);
+      ctx->dec_key[i][3] = aria_m(ctx->enc_key[ctx->rounds - i][3]);
+
+      aria_diff_word(&ctx->dec_key[i][0], &ctx->dec_key[i][1],
+                    &ctx->dec_key[i][2], &ctx->dec_key[i][3]);
+      aria_diff_byte(&ctx->dec_key[i][1],
+                    &ctx->dec_key[i][2], &ctx->dec_key[i][3]);
+      aria_diff_word(&ctx->dec_key[i][0], &ctx->dec_key[i][1],
+                    &ctx->dec_key[i][2], &ctx->dec_key[i][3]);
+    }
+}
+
+/* Crypt one 16-byte block with the given round-key table (enc_key to
+ * encrypt, dec_key to decrypt).  IN and OUT may alias.  Returns the
+ * stack burn depth for the caller. */
+static NO_INLINE unsigned int
+aria_crypt(ARIA_context *ctx, byte *out, const byte *in,
+          u32 key[][ARIA_RD_KEY_WORDS])
+{
+  u32 reg0, reg1, reg2, reg3;
+  int rounds = ctx->rounds;
+  int rkidx = 0;
+
+  reg0 = buf_get_be32(in + 0);
+  reg1 = buf_get_be32(in + 4);
+  reg2 = buf_get_be32(in + 8);
+  reg3 = buf_get_be32(in + 12);
+
+  aria_add_round_key(key[rkidx], &reg0, &reg1, &reg2, &reg3);
+  rkidx++;
+
+  /* Alternate odd/even rounds; rounds is even, so the loop always
+   * exits after an odd round with rkidx == rounds. */
+  while (1)
+    {
+      aria_subst_diff_odd(&reg0, &reg1, &reg2, &reg3);
+      aria_add_round_key(key[rkidx], &reg0, &reg1, &reg2, &reg3);
+      rkidx++;
+
+      if (rkidx >= rounds)
+       break;
+
+      aria_subst_diff_even(&reg0, &reg1, &reg2, &reg3);
+      aria_add_round_key(key[rkidx], &reg0, &reg1, &reg2, &reg3);
+      rkidx++;
+    }
+
+  aria_last_round(&reg0, &reg1, &reg2, &reg3);
+  aria_add_round_key(key[rkidx], &reg0, &reg1, &reg2, &reg3);
+
+  buf_put_be32(out + 0, reg0);
+  buf_put_be32(out + 4, reg1);
+  buf_put_be32(out + 8, reg2);
+  buf_put_be32(out + 12, reg3);
+
+  return 4 * sizeof(void *) + 4 * sizeof(u32); /* stack burn depth */
+}
+
+/* Encrypt a single block.  Returns stack burn depth. */
+unsigned int
+aria_encrypt(void *c, byte *outbuf, const byte *inbuf)
+{
+  ARIA_context *ctx = (ARIA_context *)c;
+
+  prefetch_sboxes ();
+
+  return aria_crypt (ctx, outbuf, inbuf, ctx->enc_key);
+}
+
+/* Decrypt a single block, lazily deriving the decryption round keys on
+ * first use.  Returns stack burn depth. */
+unsigned int
+aria_decrypt(void *c, byte *outbuf, const byte *inbuf)
+{
+  ARIA_context *ctx = (ARIA_context *)c;
+
+  if (!ctx->decryption_prepared)
+    {
+      aria_set_decrypt_key (ctx);
+      ctx->decryption_prepared = 1;
+    }
+
+  prefetch_sboxes ();
+
+  return aria_crypt (ctx, outbuf, inbuf, ctx->dec_key);
+}
+
+
+/* Crypt two blocks interleaved (ra* and rb* state) so independent sbox
+ * lookups can overlap in the pipeline.  Same round structure as
+ * aria_crypt.  Returns stack burn depth. */
+static unsigned int
+aria_crypt_2blks(ARIA_context *ctx, byte *out, const byte *in,
+                u32 key[][ARIA_RD_KEY_WORDS])
+{
+  u32 ra0, ra1, ra2, ra3;
+  u32 rb0, rb1, rb2, rb3;
+  int rounds = ctx->rounds;
+  int rkidx = 0;
+
+  ra0 = buf_get_be32(in + 0);
+  ra1 = buf_get_be32(in + 4);
+  ra2 = buf_get_be32(in + 8);
+  ra3 = buf_get_be32(in + 12);
+  rb0 = buf_get_be32(in + 16);
+  rb1 = buf_get_be32(in + 20);
+  rb2 = buf_get_be32(in + 24);
+  rb3 = buf_get_be32(in + 28);
+
+  while (1)
+    {
+      aria_add_round_key(key[rkidx], &ra0, &ra1, &ra2, &ra3);
+      aria_add_round_key(key[rkidx], &rb0, &rb1, &rb2, &rb3);
+      rkidx++;
+
+      aria_subst_diff_odd(&ra0, &ra1, &ra2, &ra3);
+      aria_subst_diff_odd(&rb0, &rb1, &rb2, &rb3);
+      aria_add_round_key(key[rkidx], &ra0, &ra1, &ra2, &ra3);
+      aria_add_round_key(key[rkidx], &rb0, &rb1, &rb2, &rb3);
+      rkidx++;
+
+      if (rkidx >= rounds)
+       break;
+
+      aria_subst_diff_even(&ra0, &ra1, &ra2, &ra3);
+      aria_subst_diff_even(&rb0, &rb1, &rb2, &rb3);
+    }
+
+  aria_last_round(&ra0, &ra1, &ra2, &ra3);
+  aria_last_round(&rb0, &rb1, &rb2, &rb3);
+  aria_add_round_key(key[rkidx], &ra0, &ra1, &ra2, &ra3);
+  aria_add_round_key(key[rkidx], &rb0, &rb1, &rb2, &rb3);
+
+  buf_put_be32(out + 0, ra0);
+  buf_put_be32(out + 4, ra1);
+  buf_put_be32(out + 8, ra2);
+  buf_put_be32(out + 12, ra3);
+  buf_put_be32(out + 16, rb0);
+  buf_put_be32(out + 20, rb1);
+  buf_put_be32(out + 24, rb2);
+  buf_put_be32(out + 28, rb3);
+
+  return 4 * sizeof(void *) + 8 * sizeof(u32); /* stack burn depth */
+}
+
+/* Crypt NUM_BLKS blocks with the widest available SIMD path first
+ * (AVX-512: 64 blocks, AVX2: 32, AVX: up to 16), then fall back to the
+ * generic 2-block and 1-block routines.  Returns the largest stack burn
+ * depth seen. */
+static unsigned int
+aria_crypt_blocks (ARIA_context *ctx, byte *out, const byte *in,
+                  size_t num_blks, u32 key[][ARIA_RD_KEY_WORDS])
+{
+  unsigned int burn_depth = 0;
+
+#ifdef USE_GFNI_AVX512
+  if (ctx->use_gfni_avx512)
+    {
+      unsigned int nburn = 0;
+
+      while (num_blks >= 64)
+       {
+         nburn = aria_gfni_avx512_ecb_crypt_blk64 (ctx, out, in, key);
+         in += 64 * ARIA_BLOCK_SIZE;
+         out += 64 * ARIA_BLOCK_SIZE;
+         num_blks -= 64;
+       }
+
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+      if (num_blks == 0)
+       return burn_depth;
+    }
+#endif /* USE_GFNI_AVX512 */
+
+#ifdef USE_AESNI_AVX2
+  if (ctx->use_aesni_avx2 || ctx->use_gfni_avx2)
+    {
+      unsigned int nburn = 0;
+
+      while (num_blks >= 32)
+       {
+         nburn = aria_avx2_ecb_crypt_blk32 (ctx, out, in, key);
+         in += 32 * ARIA_BLOCK_SIZE;
+         out += 32 * ARIA_BLOCK_SIZE;
+         num_blks -= 32;
+       }
+
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+      if (num_blks == 0)
+       return burn_depth;
+    }
+#endif /* USE_AESNI_AVX2 */
+
+#ifdef USE_AESNI_AVX
+  if (ctx->use_aesni_avx || ctx->use_gfni_avx)
+    {
+      unsigned int nburn = 0;
+
+      /* AVX path handles 1..16 blocks but is only used for >= 3;
+       * presumably below that the scalar code is faster -- see upstream
+       * tuning. */
+      while (num_blks >= 3)
+       {
+         size_t curr_blks = num_blks < 16 ? num_blks : 16;
+         nburn = aria_avx_ecb_crypt_blk1_16 (ctx, out, in, key, curr_blks);
+         in += curr_blks * ARIA_BLOCK_SIZE;
+         out += curr_blks * ARIA_BLOCK_SIZE;
+         num_blks -= curr_blks;
+       }
+
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+      if (num_blks == 0)
+       return burn_depth;
+    }
+#endif /* USE_AESNI_AVX */
+
+  /* Scalar fallback; prefetch sboxes once per bulk operation. */
+  if (!ctx->bulk_prefetch_ready)
+    {
+      prefetch_sboxes();
+      ctx->bulk_prefetch_ready = 1;
+    }
+
+  while (num_blks >= 2)
+    {
+      unsigned int nburn = aria_crypt_2blks (ctx, out, in, key);
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+      out += 2 * ARIA_BLOCK_SIZE;
+      in += 2 * ARIA_BLOCK_SIZE;
+      num_blks -= 2;
+    }
+
+  while (num_blks)
+    {
+      unsigned int nburn = aria_crypt (ctx, out, in, key);
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+      out += ARIA_BLOCK_SIZE;
+      in += ARIA_BLOCK_SIZE;
+      num_blks--;
+    }
+
+  if (burn_depth)
+    burn_depth += sizeof(void *) * 5;
+  return burn_depth;
+}
+
+/* bulk_crypt_fn_t adapter: encrypt NUM_BLKS blocks. */
+static unsigned int
+aria_enc_blocks (void *c, byte *out, const byte *in, size_t num_blks)
+{
+  ARIA_context *ctx = (ARIA_context *)c;
+
+  return aria_crypt_blocks (ctx, out, in, num_blks, ctx->enc_key);
+}
+
+/* bulk_crypt_fn_t adapter: decrypt NUM_BLKS blocks.  Callers must have
+ * prepared ctx->dec_key already. */
+static unsigned int
+aria_dec_blocks (void *c, byte *out, const byte *in, size_t num_blks)
+{
+  ARIA_context *ctx = (ARIA_context *)c;
+
+  return aria_crypt_blocks (ctx, out, in, num_blks, ctx->dec_key);
+}
+
+
+/* Bulk encryption of complete blocks in CTR mode.  This function is only
+   intended for the bulk encryption feature of cipher.c.  CTR is expected to be
+   of size 16. */
+static void
+_gcry_aria_ctr_enc(void *context, unsigned char *ctr,
+                  void *outbuf_arg, const void *inbuf_arg,
+                  size_t nblocks)
+{
+  ARIA_context *ctx = context;
+  byte *outbuf = outbuf_arg;
+  const byte *inbuf = inbuf_arg;
+  /* NOTE(review): nburn is size_t but burn_stack_depth is int; values
+   * are small stack depths so this is harmless, but it triggers
+   * sign-compare warnings.  */
+  int burn_stack_depth = 0;
+
+#ifdef USE_GFNI_AVX512
+  if (ctx->use_gfni_avx512)
+    {
+      size_t nburn = 0;
+
+      while (nblocks >= 64)
+       {
+         nburn = aria_gfni_avx512_ctr_crypt_blk64 (ctx, outbuf, inbuf, ctr);
+         inbuf += 64 * ARIA_BLOCK_SIZE;
+         outbuf += 64 * ARIA_BLOCK_SIZE;
+         nblocks -= 64;
+       }
+
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+    }
+#endif /* USE_GFNI_AVX512 */
+
+#ifdef USE_AESNI_AVX2
+  if (ctx->use_aesni_avx2 || ctx->use_gfni_avx2)
+    {
+      size_t nburn = 0;
+
+      while (nblocks >= 32)
+       {
+         nburn = aria_avx2_ctr_crypt_blk32 (ctx, outbuf, inbuf, ctr);
+         inbuf += 32 * ARIA_BLOCK_SIZE;
+         outbuf += 32 * ARIA_BLOCK_SIZE;
+         nblocks -= 32;
+       }
+
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+    }
+#endif /* USE_AESNI_AVX2 */
+
+#ifdef USE_AESNI_AVX
+  if (ctx->use_aesni_avx || ctx->use_gfni_avx)
+    {
+      size_t nburn = 0;
+
+      while (nblocks >= 16)
+       {
+         nburn = aria_avx_ctr_crypt_blk16 (ctx, outbuf, inbuf, ctr);
+         inbuf += 16 * ARIA_BLOCK_SIZE;
+         outbuf += 16 * ARIA_BLOCK_SIZE;
+         nblocks -= 16;
+       }
+
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+    }
+#endif /* USE_AESNI_AVX */
+
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      byte tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE];
+      unsigned int tmp_used = ARIA_BLOCK_SIZE;
+      size_t nburn = 0;
+
+      ctx->bulk_prefetch_ready = 0;
+
+      nburn = bulk_ctr_enc_128(ctx, aria_enc_blocks, outbuf, inbuf,
+                              nblocks, ctr, tmpbuf,
+                              sizeof(tmpbuf) / ARIA_BLOCK_SIZE, &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      /* Keystream material was staged in tmpbuf; wipe what was used. */
+      wipememory (tmpbuf, tmp_used);
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth);
+}
+
+/* Bulk encryption of complete blocks in CBC mode.  Inherently serial
+   (each block chains on the previous ciphertext), so no SIMD path.
+   With cbc_mac set, only the final block (the MAC) is kept in outbuf. */
+static void
+_gcry_aria_cbc_enc (void *context, unsigned char *iv,
+                   void *outbuf_arg, const void *inbuf_arg,
+                   size_t nblocks, int cbc_mac)
+{
+  ARIA_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned char *last_iv;
+  unsigned int burn_depth = 0;
+
+  prefetch_sboxes();
+
+  last_iv = iv;
+
+  for (; nblocks; nblocks--)
+    {
+      cipher_block_xor (outbuf, inbuf, last_iv, ARIA_BLOCK_SIZE);
+
+      burn_depth = aria_crypt (ctx, outbuf, outbuf, ctx->enc_key);
+
+      last_iv = outbuf;
+      inbuf += ARIA_BLOCK_SIZE;
+      if (!cbc_mac)
+       outbuf += ARIA_BLOCK_SIZE;
+    }
+
+  /* Propagate the final ciphertext block back as the next IV. */
+  if (last_iv != iv)
+    cipher_block_cpy (iv, last_iv, ARIA_BLOCK_SIZE);
+
+  if (burn_depth)
+    _gcry_burn_stack (burn_depth + 4 * sizeof(void *));
+}
+
+/* Bulk decryption of complete blocks in CBC mode.  This function is only
+   intended for the bulk encryption feature of cipher.c.  CBC decryption
+   parallelizes, so the work is delegated to the generic bulk helper
+   driving aria_dec_blocks. */
+static void
+_gcry_aria_cbc_dec(void *context, unsigned char *iv,
+                  void *outbuf_arg, const void *inbuf_arg,
+                  size_t nblocks)
+{
+  ARIA_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  /* Lazily derive the decryption round keys. */
+  if (!ctx->decryption_prepared)
+    {
+      aria_set_decrypt_key (ctx);
+      ctx->decryption_prepared = 1;
+    }
+
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE];
+      unsigned int tmp_used = ARIA_BLOCK_SIZE;
+      size_t nburn;
+
+      ctx->bulk_prefetch_ready = 0;
+
+      nburn = bulk_cbc_dec_128(ctx, aria_dec_blocks, outbuf, inbuf,
+                              nblocks, iv, tmpbuf,
+                              sizeof(tmpbuf) / ARIA_BLOCK_SIZE, &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory (tmpbuf, tmp_used);
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth);
+}
+
+/* Bulk encryption of complete blocks in CFB mode.  Inherently serial
+   (each keystream block is the encryption of the previous ciphertext),
+   so no SIMD path. */
+static void
+_gcry_aria_cfb_enc (void *context, unsigned char *iv,
+                   void *outbuf_arg, const void *inbuf_arg,
+                   size_t nblocks)
+{
+  ARIA_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned int burn_depth = 0;
+
+  prefetch_sboxes();
+
+  for (; nblocks; nblocks--)
+    {
+      /* Encrypt the IV. */
+      burn_depth = aria_crypt (ctx, iv, iv, ctx->enc_key);
+      /* XOR the input with the IV and store input into IV.  */
+      cipher_block_xor_2dst(outbuf, iv, inbuf, ARIA_BLOCK_SIZE);
+      outbuf += ARIA_BLOCK_SIZE;
+      inbuf += ARIA_BLOCK_SIZE;
+    }
+
+  if (burn_depth)
+    _gcry_burn_stack (burn_depth + 4 * sizeof(void *));
+}
+
+/* Bulk decryption of complete blocks in CFB mode.  This function is only
+   intended for the bulk encryption feature of cipher.c.  CFB decryption
+   parallelizes (keystream depends only on ciphertext), so the generic
+   bulk helper drives aria_enc_blocks. */
+static void
+_gcry_aria_cfb_dec(void *context, unsigned char *iv,
+                  void *outbuf_arg, const void *inbuf_arg,
+                  size_t nblocks)
+{
+  ARIA_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE];
+      unsigned int tmp_used = ARIA_BLOCK_SIZE;
+      size_t nburn;
+
+      ctx->bulk_prefetch_ready = 0;
+
+      nburn = bulk_cfb_dec_128(ctx, aria_enc_blocks, outbuf, inbuf,
+                              nblocks, iv, tmpbuf,
+                              sizeof(tmpbuf) / ARIA_BLOCK_SIZE, &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory (tmpbuf, tmp_used);
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth);
+}
+
+/* Bulk encryption/decryption in ECB mode.  ENCRYPT selects the
+   direction; decryption round keys are derived lazily. */
+static void
+_gcry_aria_ecb_crypt (void *context, void *outbuf_arg,
+                     const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+  ARIA_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  if (!encrypt && !ctx->decryption_prepared)
+    {
+      aria_set_decrypt_key (ctx);
+      ctx->decryption_prepared = 1;
+    }
+
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      bulk_crypt_fn_t crypt_blk1_n;
+      size_t nburn;
+
+      ctx->bulk_prefetch_ready = 0;
+      crypt_blk1_n = encrypt ? aria_enc_blocks : aria_dec_blocks;
+
+      nburn = bulk_ecb_crypt_128(ctx, crypt_blk1_n,
+                                outbuf, inbuf, nblocks, MAX_PARALLEL_BLKS);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth);
+}
+
+/* Bulk encryption/decryption of complete blocks in XTS mode.  TWEAK is
+   one block and is advanced by the bulk helper. */
+static void
+_gcry_aria_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg,
+                     const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+  ARIA_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  if (!encrypt && !ctx->decryption_prepared)
+    {
+      aria_set_decrypt_key (ctx);
+      ctx->decryption_prepared = 1;
+    }
+
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE];
+      unsigned int tmp_used = ARIA_BLOCK_SIZE;
+      bulk_crypt_fn_t crypt_blk1_n;
+      size_t nburn;
+
+      ctx->bulk_prefetch_ready = 0;
+      crypt_blk1_n = encrypt ? aria_enc_blocks : aria_dec_blocks;
+
+      nburn = bulk_xts_crypt_128(ctx, crypt_blk1_n,
+                                outbuf, inbuf, nblocks,
+                                tweak, tmpbuf,
+                                sizeof(tmpbuf) / ARIA_BLOCK_SIZE,
+                                &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory (tmpbuf, tmp_used);
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth);
+}
+
+/* Bulk encryption of complete blocks in CTR32LE mode (for GCM-SIV):
+   a 32-bit little-endian counter in CTR, handled by the generic bulk
+   helper driving aria_enc_blocks. */
+static void
+_gcry_aria_ctr32le_enc(void *context, unsigned char *ctr,
+                      void *outbuf_arg, const void *inbuf_arg,
+                      size_t nblocks)
+{
+  ARIA_context *ctx = context;
+  byte *outbuf = outbuf_arg;
+  const byte *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE];
+      unsigned int tmp_used = ARIA_BLOCK_SIZE;
+      size_t nburn;
+
+      ctx->bulk_prefetch_ready = 0;
+
+      nburn = bulk_ctr32le_enc_128 (ctx, aria_enc_blocks, outbuf, inbuf,
+                                   nblocks, ctr, tmpbuf,
+                                   sizeof(tmpbuf) / ARIA_BLOCK_SIZE,
+                                   &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory (tmpbuf, tmp_used);
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth);
+}
+
+/* Bulk encryption/decryption of complete blocks in OCB mode. */
+/* Reads and writes back c->u_mode.ocb.data_nblocks so the running OCB
+ * block counter stays in sync across calls.  Always returns 0, i.e.
+ * all NBLOCKS blocks were consumed here. */
+static size_t
+_gcry_aria_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+                     const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+  ARIA_context *ctx = (void *)&c->context.c;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  u64 blkn = c->u_mode.ocb.data_nblocks;
+  int burn_stack_depth = 0;
+
+  /* Lazily derive the decryption key schedule on the first decrypting
+   * call. */
+  if (!encrypt && !ctx->decryption_prepared)
+    {
+      aria_set_decrypt_key (ctx);
+      ctx->decryption_prepared = 1;
+    }
+
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE];
+      unsigned int tmp_used = ARIA_BLOCK_SIZE;
+      bulk_crypt_fn_t crypt_blk1_n;
+      size_t nburn;
+
+      ctx->bulk_prefetch_ready = 0;
+      crypt_blk1_n = encrypt ? aria_enc_blocks : aria_dec_blocks;
+
+      nburn = bulk_ocb_crypt_128 (c, ctx, crypt_blk1_n, outbuf, inbuf, nblocks,
+                                 &blkn, encrypt, tmpbuf,
+                                 sizeof(tmpbuf) / ARIA_BLOCK_SIZE,
+                                 &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      /* Wipe only the scratch bytes the bulk helper reported as used. */
+      wipememory (tmpbuf, tmp_used);
+    }
+
+  /* Persist the advanced OCB data-block counter. */
+  c->u_mode.ocb.data_nblocks = blkn;
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth);
+
+  return 0;
+}
+
+/* Bulk authentication of complete blocks in OCB mode. */
+/* Hashes NBLOCKS blocks of additional authenticated data from ABUF_ARG
+ * using the encryption direction (OCB auth only ever encrypts), and
+ * keeps c->u_mode.ocb.aad_nblocks in sync.  Always returns 0. */
+static size_t
+_gcry_aria_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
+{
+  ARIA_context *ctx = (void *)&c->context.c;
+  const unsigned char *abuf = abuf_arg;
+  u64 blkn = c->u_mode.ocb.aad_nblocks;
+  int burn_stack_depth = 0;
+
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      unsigned char tmpbuf[MAX_PARALLEL_BLKS * ARIA_BLOCK_SIZE];
+      unsigned int tmp_used = ARIA_BLOCK_SIZE;
+      size_t nburn;
+
+      ctx->bulk_prefetch_ready = 0;
+
+      nburn = bulk_ocb_auth_128 (c, ctx, aria_enc_blocks, abuf, nblocks,
+                                &blkn, tmpbuf,
+                                sizeof(tmpbuf) / ARIA_BLOCK_SIZE, &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      /* Wipe only the scratch bytes the bulk helper reported as used. */
+      wipememory (tmpbuf, tmp_used);
+    }
+
+  /* Persist the advanced OCB AAD-block counter. */
+  c->u_mode.ocb.aad_nblocks = blkn;
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth);
+
+  return 0;
+}
+
+
+/* Set the ARIA key: validate the key length, run the one-time
+ * self-test, detect usable SIMD extensions, register the bulk-mode
+ * handlers and compute the encryption key schedule.  The decryption
+ * schedule is derived lazily on first decrypt.  Returns 0 on success,
+ * GPG_ERR_INV_KEYLEN or GPG_ERR_SELFTEST_FAILED otherwise.
+ *
+ * NOTE(review): the original patch hunk had the use_vaes_avx2
+ * assignment line wrapped by the mailer (continuation lost its '+'
+ * prefix), which made the diff malformed; the line is restored to a
+ * single added line as in upstream libgcrypt 1.11.0. */
+static gcry_err_code_t
+aria_setkey(void *c, const byte *key, unsigned keylen,
+           cipher_bulk_ops_t *bulk_ops)
+{
+  ARIA_context *ctx = c;
+  static int initialized = 0;
+  static const char *selftest_failed = NULL;
+  unsigned int hwf = _gcry_get_hw_features ();
+
+  (void)hwf;
+
+  /* ARIA is defined only for 128-, 192- and 256-bit keys. */
+  if (keylen != 16 && keylen != 24 && keylen != 32)
+    return GPG_ERR_INV_KEYLEN;
+
+  /* Run the known-answer self-test once per process. */
+  if (!initialized)
+    {
+      initialized = 1;
+      selftest_failed = aria_selftest ();
+      if (selftest_failed)
+       log_error("%s\n", selftest_failed);
+    }
+
+  if (selftest_failed)
+    return GPG_ERR_SELFTEST_FAILED;
+
+  /* Enable each accelerated implementation only when both the build
+   * supports it and the CPU advertises the required features. */
+#ifdef USE_GFNI_AVX512
+  ctx->use_gfni_avx512 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX512);
+#endif
+#ifdef USE_AESNI_AVX2
+  ctx->use_aesni_avx2 = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX2);
+#endif
+#ifdef USE_GFNI_AVX2
+  ctx->use_gfni_avx2 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX2);
+#endif
+#ifdef USE_VAES_AVX2
+  ctx->use_vaes_avx2 = (hwf & HWF_INTEL_VAES_VPCLMUL) && (hwf & HWF_INTEL_AVX2);
+#endif
+#ifdef USE_AESNI_AVX
+  ctx->use_aesni_avx = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX);
+#endif
+#ifdef USE_GFNI_AVX
+  ctx->use_gfni_avx = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX);
+#endif
+
+  /* Setup bulk encryption routines.  */
+  memset (bulk_ops, 0, sizeof(*bulk_ops));
+  bulk_ops->cbc_enc = _gcry_aria_cbc_enc;
+  bulk_ops->cbc_dec = _gcry_aria_cbc_dec;
+  bulk_ops->cfb_enc = _gcry_aria_cfb_enc;
+  bulk_ops->cfb_dec = _gcry_aria_cfb_dec;
+  bulk_ops->ctr_enc = _gcry_aria_ctr_enc;
+  bulk_ops->ctr32le_enc = _gcry_aria_ctr32le_enc;
+  bulk_ops->ecb_crypt = _gcry_aria_ecb_crypt;
+  bulk_ops->xts_crypt = _gcry_aria_xts_crypt;
+  bulk_ops->ocb_crypt = _gcry_aria_ocb_crypt;
+  bulk_ops->ocb_auth = _gcry_aria_ocb_auth;
+
+  /* Setup context and encryption key. */
+  ctx->decryption_prepared = 0;
+  aria_set_encrypt_key (ctx, key, keylen);
+
+  _gcry_burn_stack (3 * sizeof(void *) + 5 * 4 * sizeof(u32));
+  return 0;
+}
+
+
+/* Known-answer self-test for ARIA-128.  The key/plaintext/ciphertext
+ * triple matches the ARIA-128 test vector of RFC 5794, Appendix A.
+ * Returns NULL on success or a static error string on failure. */
+static const char *
+aria_selftest (void)
+{
+  ARIA_context ctx;
+  byte scratch[16];
+
+  static const byte key[16] = {
+    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+  };
+  static const byte plaintext[16] = {
+    0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
+    0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
+  };
+  static const byte ciphertext[16] = {
+    0xd7, 0x18, 0xfb, 0xd6, 0xab, 0x64, 0x4c, 0x73,
+    0x9d, 0xa9, 0x5f, 0x3b, 0xe6, 0x45, 0x17, 0x78
+  };
+
+  memset (&ctx, 0, sizeof(ctx));
+
+  aria_set_encrypt_key (&ctx, key, 16);
+  aria_encrypt (&ctx, scratch, plaintext);
+  if (memcmp (scratch, ciphertext, sizeof (ciphertext)))
+    return "ARIA test encryption failed.";
+  /* Decrypt in place; round-trip must recover the plaintext. */
+  aria_decrypt (&ctx, scratch, scratch);
+  if (memcmp (scratch, plaintext, sizeof (plaintext)))
+    return "ARIA test decryption failed.";
+
+  return NULL;
+}
+
+
+/* ASN.1 object identifiers (KISA arc 1.2.410.200046.1.1.*) mapping
+ * each ARIA key size and mode of operation to the libgcrypt cipher
+ * mode constant.  One table per key size; each is NULL-terminated. */
+static const gcry_cipher_oid_spec_t aria128_oids[] =
+  {
+    { "1.2.410.200046.1.1.1", GCRY_CIPHER_MODE_ECB },
+    { "1.2.410.200046.1.1.2", GCRY_CIPHER_MODE_CBC },
+    { "1.2.410.200046.1.1.3", GCRY_CIPHER_MODE_CFB },
+    { "1.2.410.200046.1.1.4", GCRY_CIPHER_MODE_OFB },
+    { "1.2.410.200046.1.1.5", GCRY_CIPHER_MODE_CTR },
+    { "1.2.410.200046.1.1.34", GCRY_CIPHER_MODE_GCM },
+    { "1.2.410.200046.1.1.37", GCRY_CIPHER_MODE_CCM },
+    { NULL }
+  };
+
+static const gcry_cipher_oid_spec_t aria192_oids[] =
+  {
+    { "1.2.410.200046.1.1.6", GCRY_CIPHER_MODE_ECB },
+    { "1.2.410.200046.1.1.7", GCRY_CIPHER_MODE_CBC },
+    { "1.2.410.200046.1.1.8", GCRY_CIPHER_MODE_CFB },
+    { "1.2.410.200046.1.1.9", GCRY_CIPHER_MODE_OFB },
+    { "1.2.410.200046.1.1.10", GCRY_CIPHER_MODE_CTR },
+    { "1.2.410.200046.1.1.35", GCRY_CIPHER_MODE_GCM },
+    { "1.2.410.200046.1.1.38", GCRY_CIPHER_MODE_CCM },
+    { NULL }
+  };
+
+static const gcry_cipher_oid_spec_t aria256_oids[] =
+  {
+    { "1.2.410.200046.1.1.11", GCRY_CIPHER_MODE_ECB },
+    { "1.2.410.200046.1.1.12", GCRY_CIPHER_MODE_CBC },
+    { "1.2.410.200046.1.1.13", GCRY_CIPHER_MODE_CFB },
+    { "1.2.410.200046.1.1.14", GCRY_CIPHER_MODE_OFB },
+    { "1.2.410.200046.1.1.15", GCRY_CIPHER_MODE_CTR },
+    { "1.2.410.200046.1.1.36", GCRY_CIPHER_MODE_GCM },
+    { "1.2.410.200046.1.1.39", GCRY_CIPHER_MODE_CCM },
+    { NULL }
+  };
+
+/* Cipher specification for ARIA with a 128-bit key (16-byte block). */
+gcry_cipher_spec_t _gcry_cipher_spec_aria128 =
+  {
+    GCRY_CIPHER_ARIA128, { 0, 0 },
+    "ARIA128", NULL, aria128_oids, ARIA_BLOCK_SIZE, 128,
+    sizeof(ARIA_context), aria_setkey, aria_encrypt, aria_decrypt
+  };
+
+/* Cipher specification for ARIA with a 192-bit key (16-byte block).
+ * Spacing normalized to match the ARIA128/ARIA256 sibling specs. */
+gcry_cipher_spec_t _gcry_cipher_spec_aria192 =
+  {
+    GCRY_CIPHER_ARIA192, { 0, 0 },
+    "ARIA192", NULL, aria192_oids, ARIA_BLOCK_SIZE, 192,
+    sizeof(ARIA_context), aria_setkey, aria_encrypt, aria_decrypt
+  };
+
+/* Cipher specification for ARIA with a 256-bit key (16-byte block). */
+gcry_cipher_spec_t _gcry_cipher_spec_aria256 =
+  {
+    GCRY_CIPHER_ARIA256, { 0, 0 },
+    "ARIA256", NULL, aria256_oids, ARIA_BLOCK_SIZE, 256,
+    sizeof(ARIA_context), aria_setkey, aria_encrypt, aria_decrypt
+  };
diff --git a/grub-core/lib/libgcrypt/cipher/asm-common-aarch64.h b/grub-core/lib/libgcrypt/cipher/asm-common-aarch64.h
new file mode 100644
index 000000000..3a72d7c45
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/asm-common-aarch64.h
@@ -0,0 +1,132 @@
+/* asm-common-aarch64.h  -  Common macros for AArch64 assembly
+ *
+ * Copyright (C) 2018 Martin Storsjö <martin@martin.st>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_ASM_COMMON_AARCH64_H
+#define GCRY_ASM_COMMON_AARCH64_H
+
+#include <config.h>
+
+#ifdef HAVE_GCC_ASM_ELF_DIRECTIVES
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
+#ifdef _WIN32
+# define SECTION_RODATA .section .rdata
+#else
+# define SECTION_RODATA .section .rodata
+#endif
+
+#ifdef __APPLE__
+#define GET_DATA_POINTER(reg, name) \
+       adrp    reg, name@GOTPAGE ; \
+       add     reg, reg, name@GOTPAGEOFF ;
+#else
+#define GET_DATA_POINTER(reg, name) \
+       adrp    reg, name ; \
+       add     reg, reg, #:lo12:name ;
+#endif
+
+#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES
+/* CFI directives to emit DWARF stack unwinding information. */
+# define CFI_STARTPROC()            .cfi_startproc
+# define CFI_ENDPROC()              .cfi_endproc
+# define CFI_REMEMBER_STATE()       .cfi_remember_state
+# define CFI_RESTORE_STATE()        .cfi_restore_state
+# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off
+# define CFI_REL_OFFSET(reg,off)    .cfi_rel_offset reg, off
+# define CFI_DEF_CFA_REGISTER(reg)  .cfi_def_cfa_register reg
+# define CFI_REGISTER(ro,rn)        .cfi_register ro, rn
+# define CFI_RESTORE(reg)           .cfi_restore reg
+
+/* CFA expressions are used for pointing CFA and registers to
+ * SP relative offsets. */
+# define DW_REGNO_SP 31
+
+/* Fixed length encoding used for integers for now. */
+# define DW_SLEB128_7BIT(value) \
+       0x00|((value) & 0x7f)
+# define DW_SLEB128_28BIT(value) \
+       0x80|((value)&0x7f), \
+       0x80|(((value)>>7)&0x7f), \
+       0x80|(((value)>>14)&0x7f), \
+       0x00|(((value)>>21)&0x7f)
+
+# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth) \
+       .cfi_escape \
+         0x0f, /* DW_CFA_def_cfa_expression */ \
+           DW_SLEB128_7BIT(11), /* length */ \
+         0x8f, /* DW_OP_breg31, rsp + constant */ \
+           DW_SLEB128_28BIT(rsp_offs), \
+         0x06, /* DW_OP_deref */ \
+         0x23, /* DW_OP_plus_constu */ \
+           DW_SLEB128_28BIT((cfa_depth)+8)
+
+# define CFI_REG_ON_STACK(regno,rsp_offs) \
+       .cfi_escape \
+         0x10, /* DW_CFA_expression */ \
+           DW_SLEB128_7BIT(regno), \
+           DW_SLEB128_7BIT(5), /* length */ \
+         0x8f, /* DW_OP_breg31, rsp + constant */ \
+           DW_SLEB128_28BIT(rsp_offs)
+
+#else
+# define CFI_STARTPROC()
+# define CFI_ENDPROC()
+# define CFI_REMEMBER_STATE()
+# define CFI_RESTORE_STATE()
+# define CFI_ADJUST_CFA_OFFSET(off)
+# define CFI_REL_OFFSET(reg,off)
+# define CFI_DEF_CFA_REGISTER(reg)
+# define CFI_REGISTER(ro,rn)
+# define CFI_RESTORE(reg)
+
+# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth)
+# define CFI_REG_ON_STACK(reg,rsp_offs)
+#endif
+
+/* 'ret' instruction replacement for straight-line speculation mitigation */
+#define ret_spec_stop \
+       ret; dsb sy; isb;
+
+#define CLEAR_REG(reg) movi reg.16b, #0;
+
+#define VPUSH_ABI \
+       stp d8, d9, [sp, #-16]!; \
+       CFI_ADJUST_CFA_OFFSET(16); \
+       stp d10, d11, [sp, #-16]!; \
+       CFI_ADJUST_CFA_OFFSET(16); \
+       stp d12, d13, [sp, #-16]!; \
+       CFI_ADJUST_CFA_OFFSET(16); \
+       stp d14, d15, [sp, #-16]!; \
+       CFI_ADJUST_CFA_OFFSET(16);
+
+#define VPOP_ABI \
+       ldp d14, d15, [sp], #16; \
+       CFI_ADJUST_CFA_OFFSET(-16); \
+       ldp d12, d13, [sp], #16; \
+       CFI_ADJUST_CFA_OFFSET(-16); \
+       ldp d10, d11, [sp], #16; \
+       CFI_ADJUST_CFA_OFFSET(-16); \
+       ldp d8, d9, [sp], #16; \
+       CFI_ADJUST_CFA_OFFSET(-16);
+
+#endif /* GCRY_ASM_COMMON_AARCH64_H */
diff --git a/grub-core/lib/libgcrypt/cipher/asm-common-amd64.h b/grub-core/lib/libgcrypt/cipher/asm-common-amd64.h
new file mode 100644
index 000000000..870fef9aa
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/asm-common-amd64.h
@@ -0,0 +1,213 @@
+/* asm-common-amd64.h  -  Common macros for AMD64 assembly
+ *
+ * Copyright (C) 2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_ASM_COMMON_AMD64_H
+#define GCRY_ASM_COMMON_AMD64_H
+
+#include <config.h>
+
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define SECTION_RODATA .section .rdata
+#else
+# define SECTION_RODATA .section .rodata
+#endif
+
+#ifdef __PIC__
+#  define rRIP (%rip)
+#else
+#  define rRIP
+#endif
+
+#ifdef __PIC__
+#  define RIP %rip
+#else
+#  define RIP
+#endif
+
+#ifdef __PIC__
+#  define ADD_RIP +rip
+#else
+#  define ADD_RIP
+#endif
+
+#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) || !defined(__PIC__)
+#  define GET_EXTERN_POINTER(name, reg) movabsq $name, reg
+#else
+#  ifdef __code_model_large__
+#    define GET_EXTERN_POINTER(name, reg) \
+              pushq %r15; \
+              pushq %r14; \
+           1: leaq 1b(%rip), reg; \
+              movabsq $_GLOBAL_OFFSET_TABLE_-1b, %r14; \
+              movabsq $name@GOT, %r15; \
+              addq %r14, reg; \
+              popq %r14; \
+              movq (reg, %r15), reg; \
+              popq %r15;
+#  else
+#    define GET_EXTERN_POINTER(name, reg) movq name@GOTPCREL(%rip), reg
+#  endif
+#endif
+
+#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES
+/* CFI directives to emit DWARF stack unwinding information. */
+# define CFI_STARTPROC()            .cfi_startproc
+# define CFI_ENDPROC()              .cfi_endproc
+# define CFI_REMEMBER_STATE()       .cfi_remember_state
+# define CFI_RESTORE_STATE()        .cfi_restore_state
+# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off
+# define CFI_REL_OFFSET(reg,off)    .cfi_rel_offset reg, off
+# define CFI_DEF_CFA_REGISTER(reg)  .cfi_def_cfa_register reg
+# define CFI_REGISTER(ro,rn)        .cfi_register ro, rn
+# define CFI_RESTORE(reg)           .cfi_restore reg
+
+# define CFI_PUSH(reg) \
+       CFI_ADJUST_CFA_OFFSET(8); CFI_REL_OFFSET(reg, 0)
+# define CFI_POP(reg) \
+       CFI_ADJUST_CFA_OFFSET(-8); CFI_RESTORE(reg)
+# define CFI_POP_TMP_REG() \
+       CFI_ADJUST_CFA_OFFSET(-8);
+# define CFI_LEAVE() \
+       CFI_ADJUST_CFA_OFFSET(-8); CFI_DEF_CFA_REGISTER(%rsp)
+
+/* CFA expressions are used for pointing CFA and registers to
+ * %rsp relative offsets. */
+# define DW_REGNO_rax 0
+# define DW_REGNO_rdx 1
+# define DW_REGNO_rcx 2
+# define DW_REGNO_rbx 3
+# define DW_REGNO_rsi 4
+# define DW_REGNO_rdi 5
+# define DW_REGNO_rbp 6
+# define DW_REGNO_rsp 7
+# define DW_REGNO_r8  8
+# define DW_REGNO_r9  9
+# define DW_REGNO_r10 10
+# define DW_REGNO_r11 11
+# define DW_REGNO_r12 12
+# define DW_REGNO_r13 13
+# define DW_REGNO_r14 14
+# define DW_REGNO_r15 15
+
+# define DW_REGNO(reg) DW_REGNO_ ## reg
+
+/* Fixed length encoding used for integers for now. */
+# define DW_SLEB128_7BIT(value) \
+       0x00|((value) & 0x7f)
+# define DW_SLEB128_28BIT(value) \
+       0x80|((value)&0x7f), \
+       0x80|(((value)>>7)&0x7f), \
+       0x80|(((value)>>14)&0x7f), \
+       0x00|(((value)>>21)&0x7f)
+
+# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth) \
+       .cfi_escape \
+         0x0f, /* DW_CFA_def_cfa_expression */ \
+           DW_SLEB128_7BIT(11), /* length */ \
+         0x77, /* DW_OP_breg7, rsp + constant */ \
+           DW_SLEB128_28BIT(rsp_offs), \
+         0x06, /* DW_OP_deref */ \
+         0x23, /* DW_OP_plus_constu */ \
+           DW_SLEB128_28BIT((cfa_depth)+8)
+
+# define CFI_REG_ON_STACK(reg,rsp_offs) \
+       .cfi_escape \
+         0x10, /* DW_CFA_expression */ \
+           DW_SLEB128_7BIT(DW_REGNO(reg)), \
+           DW_SLEB128_7BIT(5), /* length */ \
+         0x77, /* DW_OP_breg7, rsp + constant */ \
+           DW_SLEB128_28BIT(rsp_offs)
+
+#else
+# define CFI_STARTPROC()
+# define CFI_ENDPROC()
+# define CFI_REMEMBER_STATE()
+# define CFI_RESTORE_STATE()
+# define CFI_ADJUST_CFA_OFFSET(off)
+# define CFI_REL_OFFSET(reg,off)
+# define CFI_DEF_CFA_REGISTER(reg)
+# define CFI_REGISTER(ro,rn)
+# define CFI_RESTORE(reg)
+
+# define CFI_PUSH(reg)
+# define CFI_POP(reg)
+# define CFI_POP_TMP_REG()
+# define CFI_LEAVE()
+
+# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth)
+# define CFI_REG_ON_STACK(reg,rsp_offs)
+#endif
+
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define ENTER_SYSV_FUNC_PARAMS_0_4 \
+       pushq %rdi; \
+       CFI_PUSH(%rdi); \
+       pushq %rsi; \
+       CFI_PUSH(%rsi); \
+       movq %rcx, %rdi; \
+       movq %rdx, %rsi; \
+       movq %r8, %rdx; \
+       movq %r9, %rcx; \
+
+# define ENTER_SYSV_FUNC_PARAMS_5 \
+       ENTER_SYSV_FUNC_PARAMS_0_4; \
+       movq 0x38(%rsp), %r8;
+
+# define ENTER_SYSV_FUNC_PARAMS_6 \
+       ENTER_SYSV_FUNC_PARAMS_5; \
+       movq 0x40(%rsp), %r9;
+
+# define EXIT_SYSV_FUNC \
+       popq %rsi; \
+       CFI_POP(%rsi); \
+       popq %rdi; \
+       CFI_POP(%rdi);
+#else
+# define ENTER_SYSV_FUNC_PARAMS_0_4
+# define ENTER_SYSV_FUNC_PARAMS_5
+# define ENTER_SYSV_FUNC_PARAMS_6
+# define EXIT_SYSV_FUNC
+#endif
+
+/* 'ret' instruction replacement for straight-line speculation mitigation. */
+#define ret_spec_stop \
+       ret; int3;
+
+/* This prevents speculative execution on old AVX512 CPUs, to prevent
+ * speculative execution to AVX512 code. The vpopcntb instruction is
+ * available on newer CPUs that do not suffer from significant frequency
+ * drop when 512-bit vectors are utilized. */
+#define spec_stop_avx512 \
+       vpxord %ymm16, %ymm16, %ymm16; \
+       vpopcntb %xmm16, %xmm16; /* Supported only by newer AVX512 CPUs. */ \
+       vpxord %ymm16, %ymm16, %ymm16;
+
+#define spec_stop_avx512_intel_syntax \
+       vpxord ymm16, ymm16, ymm16; \
+       vpopcntb xmm16, xmm16; /* Supported only by newer AVX512 CPUs. */ \
+       vpxord ymm16, ymm16, ymm16;
+
+#endif /* GCRY_ASM_COMMON_AMD64_H */
diff --git a/grub-core/lib/libgcrypt/cipher/asm-common-i386.h b/grub-core/lib/libgcrypt/cipher/asm-common-i386.h
new file mode 100644
index 000000000..d746ebc4a
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/asm-common-i386.h
@@ -0,0 +1,161 @@
+/* asm-common-i386.h  -  Common macros for i386 assembly
+ *
+ * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_ASM_COMMON_I386_H
+#define GCRY_ASM_COMMON_I386_H
+
+#include <config.h>
+
+#ifdef HAVE_COMPATIBLE_GCC_I386_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
+#ifdef HAVE_COMPATIBLE_GCC_WIN32_PLATFORM_AS
+# define SECTION_RODATA .section .rdata
+#else
+# define SECTION_RODATA .section .rodata
+#endif
+
+#ifdef HAVE_COMPATIBLE_GCC_WIN32_PLATFORM_AS
+# define SYM_NAME(name) _##name
+#else
+# define SYM_NAME(name) name
+#endif
+
+#ifdef HAVE_COMPATIBLE_GCC_WIN32_PLATFORM_AS
+# define DECL_GET_PC_THUNK(reg)
+# define GET_DATA_POINTER(name, reg) leal name, %reg
+#else
+# define DECL_GET_PC_THUNK(reg) \
+      .type __gcry_get_pc_thunk_##reg, @function; \
+      .align 16; \
+      __gcry_get_pc_thunk_##reg:; \
+       CFI_STARTPROC(); \
+       movl (%esp), %reg; \
+       ret_spec_stop; \
+       CFI_ENDPROC()
+# define GET_DATA_POINTER(name, reg) \
+       call __gcry_get_pc_thunk_##reg; \
+       addl $_GLOBAL_OFFSET_TABLE_, %reg; \
+       movl name##@GOT(%reg), %reg;
+#endif
+
+#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES
+/* CFI directives to emit DWARF stack unwinding information. */
+# define CFI_STARTPROC()            .cfi_startproc
+# define CFI_ENDPROC()              .cfi_endproc
+# define CFI_REMEMBER_STATE()       .cfi_remember_state
+# define CFI_RESTORE_STATE()        .cfi_restore_state
+# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off
+# define CFI_REL_OFFSET(reg,off)    .cfi_rel_offset reg, off
+# define CFI_DEF_CFA_REGISTER(reg)  .cfi_def_cfa_register reg
+# define CFI_REGISTER(ro,rn)        .cfi_register ro, rn
+# define CFI_RESTORE(reg)           .cfi_restore reg
+
+# define CFI_PUSH(reg) \
+       CFI_ADJUST_CFA_OFFSET(4); CFI_REL_OFFSET(reg, 0)
+# define CFI_POP(reg) \
+       CFI_ADJUST_CFA_OFFSET(-4); CFI_RESTORE(reg)
+# define CFI_POP_TMP_REG() \
+       CFI_ADJUST_CFA_OFFSET(-4);
+# define CFI_LEAVE() \
+       CFI_ADJUST_CFA_OFFSET(-4); CFI_DEF_CFA_REGISTER(%esp)
+
+/* CFA expressions are used for pointing CFA and registers to
+ * %esp relative offsets. */
+# define DW_REGNO_eax 0
+# define DW_REGNO_edx 1
+# define DW_REGNO_ecx 2
+# define DW_REGNO_ebx 3
+# define DW_REGNO_esi 4
+# define DW_REGNO_edi 5
+# define DW_REGNO_ebp 6
+# define DW_REGNO_esp 7
+
+# define DW_REGNO(reg) DW_REGNO_ ## reg
+
+/* Fixed length encoding used for integers for now. */
+# define DW_SLEB128_7BIT(value) \
+       0x00|((value) & 0x7f)
+# define DW_SLEB128_28BIT(value) \
+       0x80|((value)&0x7f), \
+       0x80|(((value)>>7)&0x7f), \
+       0x80|(((value)>>14)&0x7f), \
+       0x00|(((value)>>21)&0x7f)
+
+# define CFI_CFA_ON_STACK(esp_offs,cfa_depth) \
+       .cfi_escape \
+         0x0f, /* DW_CFA_def_cfa_expression */ \
+           DW_SLEB128_7BIT(11), /* length */ \
+         0x77, /* DW_OP_breg7, rsp + constant */ \
+           DW_SLEB128_28BIT(esp_offs), \
+         0x06, /* DW_OP_deref */ \
+         0x23, /* DW_OP_plus_constu */ \
+           DW_SLEB128_28BIT((cfa_depth)+4)
+
+# define CFI_REG_ON_STACK(reg,esp_offs) \
+       .cfi_escape \
+         0x10, /* DW_CFA_expression */ \
+           DW_SLEB128_7BIT(DW_REGNO(reg)), \
+           DW_SLEB128_7BIT(5), /* length */ \
+         0x77, /* DW_OP_breg7, rsp + constant */ \
+           DW_SLEB128_28BIT(esp_offs)
+
+#else
+# define CFI_STARTPROC()
+# define CFI_ENDPROC()
+# define CFI_REMEMBER_STATE()
+# define CFI_RESTORE_STATE()
+# define CFI_ADJUST_CFA_OFFSET(off)
+# define CFI_REL_OFFSET(reg,off)
+# define CFI_DEF_CFA_REGISTER(reg)
+# define CFI_REGISTER(ro,rn)
+# define CFI_RESTORE(reg)
+
+# define CFI_PUSH(reg)
+# define CFI_POP(reg)
+# define CFI_POP_TMP_REG()
+# define CFI_LEAVE()
+
+# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth)
+# define CFI_REG_ON_STACK(reg,rsp_offs)
+#endif
+
+/* 'ret' instruction replacement for straight-line speculation mitigation. */
+#define ret_spec_stop \
+       ret; int3;
+
+/* This prevents speculative execution on old AVX512 CPUs, to prevent
+ * speculative execution to AVX512 code. The vpopcntb instruction is
+ * available on newer CPUs that do not suffer from significant frequency
+ * drop when 512-bit vectors are utilized. */
+#define spec_stop_avx512 \
+       vpxord %ymm7, %ymm7, %ymm7; \
+       vpopcntb %xmm7, %xmm7; /* Supported only by newer AVX512 CPUs. */ \
+       vpxord %ymm7, %ymm7, %ymm7;
+
+#define spec_stop_avx512_intel_syntax \
+       vpxord ymm7, ymm7, ymm7; \
+       vpopcntb xmm7, xmm7; /* Supported only by newer AVX512 CPUs. */ \
+       vpxord ymm7, ymm7, ymm7;
+
+#endif /* GCRY_ASM_COMMON_I386_H */
diff --git a/grub-core/lib/libgcrypt/cipher/asm-common-s390x.h b/grub-core/lib/libgcrypt/cipher/asm-common-s390x.h
new file mode 100644
index 000000000..b3a996cd6
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/asm-common-s390x.h
@@ -0,0 +1,90 @@
+/* asm-common-s390x.h  -  Common macros for zSeries assembly
+ *
+ * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_ASM_COMMON_S390X_H
+#define GCRY_ASM_COMMON_S390X_H
+
+#include <config.h>
+
+#ifdef HAVE_GCC_ASM_ELF_DIRECTIVES
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
+#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES
+/* CFI directives to emit DWARF stack unwinding information. */
+# define CFI_STARTPROC()            .cfi_startproc
+# define CFI_ENDPROC()              .cfi_endproc
+# define CFI_REMEMBER_STATE()       .cfi_remember_state
+# define CFI_RESTORE_STATE()        .cfi_restore_state
+# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off
+# define CFI_REL_OFFSET(reg,off)    .cfi_rel_offset reg, off
+# define CFI_DEF_CFA_REGISTER(reg)  .cfi_def_cfa_register reg
+# define CFI_REGISTER(ro,rn)        .cfi_register ro, rn
+# define CFI_RESTORE(reg)           .cfi_restore reg
+
+/* CFA expressions are used for pointing CFA and registers to
+ * SP relative offsets. */
+# define DW_REGNO_SP 15
+
+/* Fixed length encoding used for integers for now. */
+# define DW_SLEB128_7BIT(value) \
+       0x00|((value) & 0x7f)
+# define DW_SLEB128_28BIT(value) \
+       0x80|((value)&0x7f), \
+       0x80|(((value)>>7)&0x7f), \
+       0x80|(((value)>>14)&0x7f), \
+       0x00|(((value)>>21)&0x7f)
+
+# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth) \
+       .cfi_escape \
+         0x0f, /* DW_CFA_def_cfa_expression */ \
+           DW_SLEB128_7BIT(11), /* length */ \
+         0x7f, /* DW_OP_breg15, rsp + constant */ \
+           DW_SLEB128_28BIT(rsp_offs), \
+         0x06, /* DW_OP_deref */ \
+         0x23, /* DW_OP_plus_constu */ \
+           DW_SLEB128_28BIT((cfa_depth)+160)
+
+# define CFI_REG_ON_STACK(regno,rsp_offs) \
+       .cfi_escape \
+         0x10, /* DW_CFA_expression */ \
+           DW_SLEB128_7BIT(regno), \
+           DW_SLEB128_7BIT(5), /* length */ \
+         0x7f, /* DW_OP_breg15, rsp + constant */ \
+           DW_SLEB128_28BIT(rsp_offs)
+
+#else
+# define CFI_STARTPROC()
+# define CFI_ENDPROC()
+# define CFI_REMEMBER_STATE()
+# define CFI_RESTORE_STATE()
+# define CFI_ADJUST_CFA_OFFSET(off)
+# define CFI_REL_OFFSET(reg,off)
+# define CFI_DEF_CFA_REGISTER(reg)
+# define CFI_REGISTER(ro,rn)
+# define CFI_RESTORE(reg)
+
+# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth)
+# define CFI_REG_ON_STACK(reg,rsp_offs)
+#endif
+
+#endif /* GCRY_ASM_COMMON_S390X_H */
diff --git a/grub-core/lib/libgcrypt/cipher/asm-inline-s390x.h b/grub-core/lib/libgcrypt/cipher/asm-inline-s390x.h
new file mode 100644
index 000000000..001cb965f
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/asm-inline-s390x.h
@@ -0,0 +1,205 @@
+/* asm-inline-s390x.h  -  Common macros for zSeries inline assembly
+ *
+ * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_ASM_INLINE_S390X_H
+#define GCRY_ASM_INLINE_S390X_H
+
+#include <config.h>
+
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+
+typedef unsigned int u128_t __attribute__ ((mode (TI)));
+
+enum kmxx_functions_e
+{
+  KM_FUNCTION_AES_128 = 18,
+  KM_FUNCTION_AES_192 = 19,
+  KM_FUNCTION_AES_256 = 20,
+  KM_FUNCTION_XTS_AES_128 = 50,
+  KM_FUNCTION_XTS_AES_256 = 52,
+
+  KMID_FUNCTION_SHA1 = 1,
+  KMID_FUNCTION_SHA256 = 2,
+  KMID_FUNCTION_SHA512 = 3,
+  KMID_FUNCTION_SHA3_224 = 32,
+  KMID_FUNCTION_SHA3_256 = 33,
+  KMID_FUNCTION_SHA3_384 = 34,
+  KMID_FUNCTION_SHA3_512 = 35,
+  KMID_FUNCTION_SHAKE128 = 36,
+  KMID_FUNCTION_SHAKE256 = 37,
+  KMID_FUNCTION_GHASH = 65,
+
+  PCC_FUNCTION_NIST_P256 = 64,
+  PCC_FUNCTION_NIST_P384 = 65,
+  PCC_FUNCTION_NIST_P521 = 66,
+  PCC_FUNCTION_ED25519 = 72,
+  PCC_FUNCTION_ED448 = 73,
+  PCC_FUNCTION_X25519 = 80,
+  PCC_FUNCTION_X448 = 81
+};
+
+enum kmxx_function_flags_e
+{
+  KM_ENCRYPT  = 0 << 7,
+  KM_DECRYPT  = 1 << 7,
+
+  KMF_LCFB_16 = 16 << 24,
+
+  KMA_LPC     = 1 << 8,
+  KMA_LAAD    = 1 << 9,
+  KMA_HS      = 1 << 10,
+
+  KLMD_PADDING_STATE = 1 << 8,
+};
+
+static ALWAYS_INLINE u128_t km_function_to_mask(enum kmxx_functions_e func)
+{
+  return (u128_t)1 << (127 - func);
+}
+
+static inline u128_t kimd_query(void)
+{
+  static u128_t function_codes = 0;
+  static int initialized = 0;
+  register unsigned long reg0 asm("0") = 0;
+  register void *reg1 asm("1") = &function_codes;
+  u128_t r1;
+
+  if (initialized)
+    return function_codes;
+
+  asm volatile ("0: .insn rre,0xb93e << 16, 0, %[r1]\n\t"
+               "   brc 1,0b\n\t"
+               : [r1] "=a" (r1)
+               : [reg0] "r" (reg0), [reg1] "r" (reg1)
+               : "cc", "memory");
+
+  initialized = 1;
+  return function_codes;
+}
+
+static inline u128_t klmd_query(void)
+{
+  static u128_t function_codes = 0;
+  static int initialized = 0;
+  register unsigned long reg0 asm("0") = 0;
+  register void *reg1 asm("1") = &function_codes;
+  u128_t r1;
+
+  if (initialized)
+    return function_codes;
+
+  asm volatile ("0: .insn rre,0xb93f << 16, 0, %[r1]\n\t"
+               "   brc 1,0b\n\t"
+               : [r1] "=a" (r1)
+               : [reg0] "r" (reg0), [reg1] "r" (reg1)
+               : "cc", "memory");
+
+  initialized = 1;
+  return function_codes;
+}
+
+static inline u128_t pcc_query(void)
+{
+  static u128_t function_codes = 0;
+  static int initialized = 0;
+  register unsigned long reg0 asm("0") = 0;
+  register void *reg1 asm("1") = &function_codes;
+
+  if (initialized)
+    return function_codes;
+
+  asm volatile ("0: .insn rre,0xb92c << 16, 0, 0\n\t"
+               "   brc 1,0b\n\t"
+               :
+               : [reg0] "r" (reg0), [reg1] "r" (reg1)
+               : "cc", "memory");
+
+  initialized = 1;
+  return function_codes;
+}
+
+static ALWAYS_INLINE void
+kimd_execute(unsigned int func, void *param_block, const void *src,
+            size_t src_len)
+{
+  register unsigned long reg0 asm("0") = func;
+  register byte *reg1 asm("1") = param_block;
+  u128_t r1 = ((u128_t)(uintptr_t)src << 64) | (u64)src_len;
+
+  asm volatile ("0: .insn rre,0xb93e << 16, 0, %[r1]\n\t"
+               "   brc 1,0b\n\t"
+               : [r1] "+a" (r1)
+               : [func] "r" (reg0), [param_ptr] "r" (reg1)
+               : "cc", "memory");
+}
+
+static ALWAYS_INLINE void
+klmd_execute(unsigned int func, void *param_block, const void *src,
+            size_t src_len)
+{
+  register unsigned long reg0 asm("0") = func;
+  register byte *reg1 asm("1") = param_block;
+  u128_t r1 = ((u128_t)(uintptr_t)src << 64) | (u64)src_len;
+
+  asm volatile ("0: .insn rre,0xb93f << 16, 0, %[r1]\n\t"
+               "   brc 1,0b\n\t"
+               : [func] "+r" (reg0), [r1] "+a" (r1)
+               : [param_ptr] "r" (reg1)
+               : "cc", "memory");
+}
+
+static ALWAYS_INLINE void
+klmd_shake_execute(unsigned int func, void *param_block, void *dst,
+                  size_t dst_len, const void *src, size_t src_len)
+{
+  register unsigned long reg0 asm("0") = func;
+  register byte *reg1 asm("1") = param_block;
+  u128_t r1 = ((u128_t)(uintptr_t)dst << 64) | (u64)dst_len;
+  u128_t r2 = ((u128_t)(uintptr_t)src << 64) | (u64)src_len;
+
+  asm volatile ("0: .insn rre,0xb93f << 16, %[r1], %[r2]\n\t"
+               "   brc 1,0b\n\t"
+               : [func] "+r" (reg0), [r1] "+a" (r1), [r2] "+a" (r2)
+               : [param_ptr] "r" (reg1)
+               : "cc", "memory");
+}
+
+static ALWAYS_INLINE unsigned int
+pcc_scalar_multiply(unsigned int func, void *param_block)
+{
+  register unsigned long reg0 asm("0") = func;
+  register byte *reg1 asm("1") = param_block;
+  register unsigned long error = 0;
+
+  asm volatile ("0: .insn rre,0xb92c << 16, 0, 0\n\t"
+               "   brc 1,0b\n\t"
+               "   brc 7,1f\n\t"
+               "   j 2f\n\t"
+               "1: lhi %[error], 1\n\t"
+               "2:\n\t"
+               : [func] "+r" (reg0), [error] "+r" (error)
+               : [param_ptr] "r" (reg1)
+               : "cc", "memory");
+
+  return error;
+}
+
+#endif /* GCRY_ASM_INLINE_S390X_H */
diff --git a/grub-core/lib/libgcrypt/cipher/asm-poly1305-aarch64.h b/grub-core/lib/libgcrypt/cipher/asm-poly1305-aarch64.h
new file mode 100644
index 000000000..2f05aae2a
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/asm-poly1305-aarch64.h
@@ -0,0 +1,245 @@
+/* asm-poly1305-aarch64.h  -  Poly1305 macros for ARMv8/AArch64 assembly
+ *
+ * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_ASM_POLY1305_AARCH64_H
+#define GCRY_ASM_POLY1305_AARCH64_H
+
+#include "asm-common-aarch64.h"
+
+#ifdef __AARCH64EL__
+  #define le_to_host(reg) /*_*/
+#else
+  #define le_to_host(reg) rev reg, reg;
+#endif
+
+/**********************************************************************
+  poly1305 for stitched chacha20-poly1305 Aarch64 implementations
+ **********************************************************************/
+
+#define POLY_RSTATE    x8
+#define POLY_RSRC      x9
+
+#define POLY_R_H0      x10
+#define POLY_R_H1      x11
+#define POLY_R_H2      x12
+#define POLY_R_H2d     w12
+#define POLY_R_R0      x13
+#define POLY_R_R1      x14
+#define POLY_R_R1_MUL5 x15
+#define POLY_R_X0_HI   x16
+#define POLY_R_X0_LO   x17
+#define POLY_R_X1_HI   x19
+#define POLY_R_X1_LO   x20
+#define POLY_R_ONE     x21
+#define POLY_R_ONEd    w21
+
+#define POLY_TMP0      x22
+#define POLY_TMP1      x23
+#define POLY_TMP2      x24
+#define POLY_TMP3      x25
+
+#define POLY_CHACHA_ROUND x26
+
+#define POLY_S_R0      (4 * 4 + 0 * 8)
+#define POLY_S_R1      (4 * 4 + 1 * 8)
+#define POLY_S_H0      (4 * 4 + 2 * 8 + 0 * 8)
+#define POLY_S_H1      (4 * 4 + 2 * 8 + 1 * 8)
+#define POLY_S_H2d     (4 * 4 + 2 * 8 + 2 * 8)
+
+#define POLY1305_PUSH_REGS() \
+       stp x19, x20, [sp, #-16]!; \
+       CFI_ADJUST_CFA_OFFSET(16); \
+       CFI_REG_ON_STACK(19, 0); \
+       CFI_REG_ON_STACK(20, 8); \
+       stp x21, x22, [sp, #-16]!; \
+       CFI_ADJUST_CFA_OFFSET(16); \
+       CFI_REG_ON_STACK(21, 0); \
+       CFI_REG_ON_STACK(22, 8); \
+       stp x23, x24, [sp, #-16]!; \
+       CFI_ADJUST_CFA_OFFSET(16); \
+       CFI_REG_ON_STACK(23, 0); \
+       CFI_REG_ON_STACK(24, 8); \
+       stp x25, x26, [sp, #-16]!; \
+       CFI_ADJUST_CFA_OFFSET(16); \
+       CFI_REG_ON_STACK(25, 0); \
+       CFI_REG_ON_STACK(26, 8);
+
+#define POLY1305_POP_REGS() \
+       ldp x25, x26, [sp], #16; \
+       CFI_ADJUST_CFA_OFFSET(-16); \
+       CFI_RESTORE(x25); \
+       CFI_RESTORE(x26); \
+       ldp x23, x24, [sp], #16; \
+       CFI_ADJUST_CFA_OFFSET(-16); \
+       CFI_RESTORE(x23); \
+       CFI_RESTORE(x24); \
+       ldp x21, x22, [sp], #16; \
+       CFI_ADJUST_CFA_OFFSET(-16); \
+       CFI_RESTORE(x21); \
+       CFI_RESTORE(x22); \
+       ldp x19, x20, [sp], #16; \
+       CFI_ADJUST_CFA_OFFSET(-16); \
+       CFI_RESTORE(x19); \
+       CFI_RESTORE(x20);
+
+#define POLY1305_LOAD_STATE() \
+       ldr POLY_R_R1, [POLY_RSTATE, #(POLY_S_R1)]; \
+       ldr POLY_R_H0, [POLY_RSTATE, #(POLY_S_H0)];  \
+       ldr POLY_R_H1, [POLY_RSTATE, #(POLY_S_H1)]; \
+       ldr POLY_R_H2d, [POLY_RSTATE, #(POLY_S_H2d)]; \
+       ldr POLY_R_R0, [POLY_RSTATE, #(POLY_S_R0)]; \
+       add POLY_R_R1_MUL5, POLY_R_R1, POLY_R_R1, lsr #2; \
+       mov POLY_R_ONE, #1;
+
+#define POLY1305_STORE_STATE() \
+       str POLY_R_H0, [POLY_RSTATE, #(POLY_S_H0)]; \
+       str POLY_R_H1, [POLY_RSTATE, #(POLY_S_H1)]; \
+       str POLY_R_H2d, [POLY_RSTATE, #(POLY_S_H2d)];
+
+#define POLY1305_BLOCK_PART1(src_offset) \
+       /* a = h + m */ \
+       ldr POLY_TMP0, [POLY_RSRC, #((src_offset) + 0 * 8)];
+#define POLY1305_BLOCK_PART2(src_offset) \
+       ldr POLY_TMP1, [POLY_RSRC, #((src_offset) + 1 * 8)];
+#define POLY1305_BLOCK_PART3() \
+       le_to_host(POLY_TMP0);
+#define POLY1305_BLOCK_PART4() \
+       le_to_host(POLY_TMP1);
+#define POLY1305_BLOCK_PART5() \
+       adds POLY_R_H0, POLY_R_H0, POLY_TMP0;
+#define POLY1305_BLOCK_PART6() \
+       adcs POLY_R_H1, POLY_R_H1, POLY_TMP1;
+#define POLY1305_BLOCK_PART7() \
+       adc POLY_R_H2d, POLY_R_H2d, POLY_R_ONEd;
+
+#define POLY1305_BLOCK_PART8() \
+       /* h = a * r (partial mod 2^130-5): */ \
+       mul POLY_R_X1_LO, POLY_R_H0, POLY_R_R1;   /* lo: h0 * r1 */
+#define POLY1305_BLOCK_PART9() \
+       mul POLY_TMP0, POLY_R_H1, POLY_R_R0;      /* lo: h1 * r0 */
+#define POLY1305_BLOCK_PART10() \
+       mul POLY_R_X0_LO, POLY_R_H0, POLY_R_R0;   /* lo: h0 * r0 */
+#define POLY1305_BLOCK_PART11() \
+       umulh POLY_R_X1_HI, POLY_R_H0, POLY_R_R1; /* hi: h0 * r1 */
+#define POLY1305_BLOCK_PART12() \
+       adds POLY_R_X1_LO, POLY_R_X1_LO, POLY_TMP0;
+#define POLY1305_BLOCK_PART13() \
+       umulh POLY_TMP1, POLY_R_H1, POLY_R_R0;    /* hi: h1 * r0 */
+#define POLY1305_BLOCK_PART14() \
+       mul POLY_TMP2, POLY_R_H1, POLY_R_R1_MUL5;   /* lo: h1 * r1 mod 2^130-5 
*/
+#define POLY1305_BLOCK_PART15() \
+       umulh POLY_R_X0_HI, POLY_R_H0, POLY_R_R0; /* hi: h0 * r0 */
+#define POLY1305_BLOCK_PART16() \
+       adc POLY_R_X1_HI, POLY_R_X1_HI, POLY_TMP1;
+#define POLY1305_BLOCK_PART17() \
+       umulh POLY_TMP3, POLY_R_H1, POLY_R_R1_MUL5; /* hi: h1 * r1 mod 2^130-5 
*/
+#define POLY1305_BLOCK_PART18() \
+       adds POLY_R_X0_LO, POLY_R_X0_LO, POLY_TMP2;
+#define POLY1305_BLOCK_PART19() \
+       mul POLY_R_H1, POLY_R_H2, POLY_R_R1_MUL5; /* h2 * r1 mod 2^130-5 */
+#define POLY1305_BLOCK_PART20() \
+       adc POLY_R_X0_HI, POLY_R_X0_HI, POLY_TMP3;
+#define POLY1305_BLOCK_PART21() \
+       mul POLY_R_H2, POLY_R_H2, POLY_R_R0;      /* h2 * r0 */
+#define POLY1305_BLOCK_PART22() \
+       adds POLY_R_H1, POLY_R_H1, POLY_R_X1_LO;
+#define POLY1305_BLOCK_PART23() \
+       adc POLY_R_H0, POLY_R_H2, POLY_R_X1_HI;
+
+#define POLY1305_BLOCK_PART24() \
+       /* carry propagation */ \
+       and POLY_R_H2, POLY_R_H0, #3;
+#define POLY1305_BLOCK_PART25() \
+       lsr POLY_R_H0, POLY_R_H0, #2;
+#define POLY1305_BLOCK_PART26() \
+       add POLY_R_H0, POLY_R_H0, POLY_R_H0, lsl #2;
+#define POLY1305_BLOCK_PART27() \
+       adds POLY_R_H0, POLY_R_H0, POLY_R_X0_LO;
+#define POLY1305_BLOCK_PART28() \
+       adcs POLY_R_H1, POLY_R_H1, POLY_R_X0_HI;
+#define POLY1305_BLOCK_PART29() \
+       adc POLY_R_H2d, POLY_R_H2d, wzr;
+
+//#define TESTING_POLY1305_ASM
+#ifdef TESTING_POLY1305_ASM
+/* for testing only. */
+.align 3
+.globl _gcry_poly1305_aarch64_blocks1
+ELF(.type _gcry_poly1305_aarch64_blocks1,%function;)
+_gcry_poly1305_aarch64_blocks1:
+       /* input:
+        *      x0: poly1305-state
+        *      x1: src
+        *      x2: nblks
+        */
+       CFI_STARTPROC()
+       POLY1305_PUSH_REGS();
+
+       mov POLY_RSTATE, x0;
+       mov POLY_RSRC, x1;
+
+       POLY1305_LOAD_STATE();
+
+.L_gcry_poly1305_aarch64_loop1:
+       POLY1305_BLOCK_PART1(0 * 16);
+       POLY1305_BLOCK_PART2(0 * 16);
+       add POLY_RSRC, POLY_RSRC, #16;
+       POLY1305_BLOCK_PART3();
+       POLY1305_BLOCK_PART4();
+       POLY1305_BLOCK_PART5();
+       POLY1305_BLOCK_PART6();
+       POLY1305_BLOCK_PART7();
+       POLY1305_BLOCK_PART8();
+       POLY1305_BLOCK_PART9();
+       POLY1305_BLOCK_PART10();
+       POLY1305_BLOCK_PART11();
+       POLY1305_BLOCK_PART12();
+       POLY1305_BLOCK_PART13();
+       POLY1305_BLOCK_PART14();
+       POLY1305_BLOCK_PART15();
+       POLY1305_BLOCK_PART16();
+       POLY1305_BLOCK_PART17();
+       POLY1305_BLOCK_PART18();
+       POLY1305_BLOCK_PART19();
+       POLY1305_BLOCK_PART20();
+       POLY1305_BLOCK_PART21();
+       POLY1305_BLOCK_PART22();
+       POLY1305_BLOCK_PART23();
+       POLY1305_BLOCK_PART24();
+       POLY1305_BLOCK_PART25();
+       POLY1305_BLOCK_PART26();
+       POLY1305_BLOCK_PART27();
+       POLY1305_BLOCK_PART28();
+       POLY1305_BLOCK_PART29();
+
+       subs x2, x2, #1;
+       b.ne .L_gcry_poly1305_aarch64_loop1;
+
+       POLY1305_STORE_STATE();
+
+       mov x0, #0;
+
+       POLY1305_POP_REGS();
+       ret_spec_stop;
+       CFI_ENDPROC()
+ELF(.size _gcry_poly1305_aarch64_blocks1, .-_gcry_poly1305_aarch64_blocks1;)
+#endif
+
+#endif /* GCRY_ASM_POLY1305_AARCH64_H */
diff --git a/grub-core/lib/libgcrypt/cipher/asm-poly1305-amd64.h 
b/grub-core/lib/libgcrypt/cipher/asm-poly1305-amd64.h
new file mode 100644
index 000000000..3f99ea3e1
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/asm-poly1305-amd64.h
@@ -0,0 +1,171 @@
+/* asm-poly1305-amd64.h  -  Poly1305 macros for AMD64 assembly
+ *
+ * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_ASM_POLY1305_AMD64_H
+#define GCRY_ASM_POLY1305_AMD64_H
+
+#include "asm-common-amd64.h"
+
+/**********************************************************************
+  poly1305 for stitched chacha20-poly1305 AMD64 implementations
+ **********************************************************************/
+
+#define POLY_RSTATE    %r8
+#define POLY_RSRC      %r9
+
+#define POLY_R_H0      %rbx
+#define POLY_R_H1      %rcx
+#define POLY_R_H2      %r10
+#define POLY_R_H2d     %r10d
+#define POLY_R_R0      %r11
+#define POLY_R_R1_MUL5 %r12
+#define POLY_R_X0_HI   %r13
+#define POLY_R_X0_LO   %r14
+#define POLY_R_X1_HI   %r15
+#define POLY_R_X1_LO   %rsi
+
+#define POLY_S_R0      (4 * 4 + 0 * 8)(POLY_RSTATE)
+#define POLY_S_R1      (4 * 4 + 1 * 8)(POLY_RSTATE)
+#define POLY_S_H0      (4 * 4 + 2 * 8 + 0 * 8)(POLY_RSTATE)
+#define POLY_S_H1      (4 * 4 + 2 * 8 + 1 * 8)(POLY_RSTATE)
+#define POLY_S_H2d     (4 * 4 + 2 * 8 + 2 * 8)(POLY_RSTATE)
+
+#define POLY1305_LOAD_STATE() \
+       movq POLY_S_H0, POLY_R_H0; \
+       movq POLY_S_H1, POLY_R_H1; \
+       movl POLY_S_H2d, POLY_R_H2d; \
+       movq POLY_S_R0, POLY_R_R0; \
+       movq POLY_S_R1, POLY_R_R1_MUL5; \
+       shrq $2, POLY_R_R1_MUL5; \
+       addq POLY_S_R1, POLY_R_R1_MUL5;
+
+#define POLY1305_STORE_STATE() \
+       movq POLY_R_H0, POLY_S_H0; \
+       movq POLY_R_H1, POLY_S_H1; \
+       movl POLY_R_H2d, POLY_S_H2d;
+
+/* a = h + m */
+#define POLY1305_BLOCK_PART1(src_offset) \
+       addq ((src_offset) + 0 * 8)(POLY_RSRC), POLY_R_H0; \
+       adcq ((src_offset) + 1 * 8)(POLY_RSRC), POLY_R_H1; \
+       adcl $1, POLY_R_H2d; \
+       \
+       /* h = a * r (partial mod 2^130-5): */ \
+       \
+       /* h0 * r1 */ \
+       movq POLY_R_H0, %rax; \
+       mulq POLY_S_R1; \
+       movq %rax, POLY_R_X1_LO; \
+       movq %rdx, POLY_R_X1_HI;
+
+#define POLY1305_BLOCK_PART2() \
+       \
+       /* h0 * r0 */ \
+       movq POLY_R_H0, %rax; \
+       mulq POLY_R_R0; \
+       movq %rax, POLY_R_X0_LO; \
+       movq %rdx, POLY_R_X0_HI;
+
+#define POLY1305_BLOCK_PART3() \
+       \
+       /* h1 * r0 */ \
+       movq POLY_R_H1, %rax; \
+       mulq POLY_R_R0; \
+       addq %rax, POLY_R_X1_LO; \
+       adcq %rdx, POLY_R_X1_HI; \
+       \
+       /* h1 * r1 mod 2^130-5 */ \
+       movq POLY_R_R1_MUL5, %rax; \
+       mulq POLY_R_H1;
+
+#define POLY1305_BLOCK_PART4() \
+       movq POLY_R_H2, POLY_R_H1; \
+       imulq POLY_R_R1_MUL5, POLY_R_H1; /* h2 * r1 mod 2^130-5 */ \
+       addq %rax, POLY_R_X0_LO; \
+       adcq %rdx, POLY_R_X0_HI; \
+       imulq POLY_R_R0, POLY_R_H2;      /* h2 * r0 */ \
+       addq POLY_R_X1_LO, POLY_R_H1; \
+       adcq POLY_R_X1_HI, POLY_R_H2;
+
+#define POLY1305_BLOCK_PART5() \
+       \
+       /* carry propagation */ \
+       movq POLY_R_H2, POLY_R_H0; \
+       andl $3, POLY_R_H2d; \
+       shrq $2, POLY_R_H0; \
+       leaq (POLY_R_H0, POLY_R_H0, 4), POLY_R_H0; \
+       addq POLY_R_X0_LO, POLY_R_H0; \
+       adcq POLY_R_X0_HI, POLY_R_H1; \
+       adcl $0, POLY_R_H2d;
+
+#ifdef TESTING_POLY1305_ASM
+/* for testing only, mixed C/asm poly1305.c is marginally faster (~2%). */
+.align 8
+.globl _gcry_poly1305_amd64_ssse3_blocks1
+ELF(.type _gcry_poly1305_amd64_ssse3_blocks1,@function;)
+
+_gcry_poly1305_amd64_ssse3_blocks1:
+       /* input:
+        *      %rdi: poly1305-state
+        *      %rsi: src
+        *      %rdx: nblks
+        */
+       pushq %rbp;
+       movq %rsp, %rbp;
+
+       subq $(10 * 8), %rsp;
+       movq %rbx, (1 * 8)(%rsp);
+       movq %r12, (2 * 8)(%rsp);
+       movq %r13, (3 * 8)(%rsp);
+       movq %r14, (4 * 8)(%rsp);
+       movq %r15, (5 * 8)(%rsp);
+
+       movq %rdx, (8 * 8)(%rsp); # NBLKS
+
+       movq %rdi, POLY_RSTATE;
+       movq %rsi, POLY_RSRC;
+
+       POLY1305_LOAD_STATE();
+
+.L_poly1:
+       POLY1305_BLOCK_PART1(0 * 16);
+       POLY1305_BLOCK_PART2();
+       POLY1305_BLOCK_PART3();
+       POLY1305_BLOCK_PART4();
+       POLY1305_BLOCK_PART5();
+
+       subq $1, (8 * 8)(%rsp); # NBLKS
+       leaq (16)(POLY_RSRC), POLY_RSRC;
+       jnz .L_poly1;
+
+       POLY1305_STORE_STATE();
+
+       movq (1 * 8)(%rsp), %rbx;
+       movq (2 * 8)(%rsp), %r12;
+       movq (3 * 8)(%rsp), %r13;
+       movq (4 * 8)(%rsp), %r14;
+       movq (5 * 8)(%rsp), %r15;
+
+       xorl %eax, %eax;
+       leave
+       ret;
+#endif
+
+#endif /* GCRY_ASM_POLY1305_AMD64_H */
diff --git a/grub-core/lib/libgcrypt/cipher/asm-poly1305-s390x.h 
b/grub-core/lib/libgcrypt/cipher/asm-poly1305-s390x.h
new file mode 100644
index 000000000..113ab9491
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/asm-poly1305-s390x.h
@@ -0,0 +1,140 @@
+/* asm-poly1305-s390x.h  -  Poly1305 macros for zSeries assembly
+ *
+ * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_ASM_POLY1305_S390X_H
+#define GCRY_ASM_POLY1305_S390X_H
+
+#include "asm-common-s390x.h"
+
+/**********************************************************************
+  poly1305 for stitched chacha20-poly1305
+ **********************************************************************/
+
+#define POLY_RSTATE       %r1
+#define POLY_RSRC         %r14
+
+#define POLY_R_H0_TMP_HI  %r6  // even-
+#define POLY_R_H0         %r7  //      odd pair
+#define POLY_R_H1_TMP_HI  %r8  // even-
+#define POLY_R_H1         %r9  //      odd pair
+#define POLY_R_H2         %r10
+#define POLY_R_R0         %r11
+#define POLY_R_R1         %r12
+#define POLY_R_R1_MUL5    %r13
+#define POLY_R_X0_HI      %r2  // even-
+#define POLY_R_X0_LO      %r3  //      odd pair
+#define POLY_R_X1_HI      %r4  // even-
+#define POLY_R_X1_LO      %r5  //      odd pair
+
+#define POLY_S_R0      (4 * 4 + 0 * 8)(POLY_RSTATE)
+#define POLY_S_R1      (4 * 4 + 1 * 8)(POLY_RSTATE)
+#define POLY_S_H0      (4 * 4 + 2 * 8 + 0 * 8)(POLY_RSTATE)
+#define POLY_S_H1      (4 * 4 + 2 * 8 + 1 * 8)(POLY_RSTATE)
+#define POLY_S_H2d     (4 * 4 + 2 * 8 + 2 * 8)(POLY_RSTATE)
+
+#define INC_POLY1305_SRC(a) \
+       aghi POLY_RSRC, (a);
+
+#define POLY1305_LOAD_STATE() \
+       lg POLY_R_H0, POLY_S_H0; \
+       lg POLY_R_H1, POLY_S_H1; \
+       llgf POLY_R_H2, POLY_S_H2d; \
+       rllg POLY_R_H0, POLY_R_H0, 32; \
+       rllg POLY_R_H1, POLY_R_H1, 32; \
+       lg POLY_R_R0, POLY_S_R0; \
+       lg POLY_R_R1, POLY_S_R1; \
+       rllg POLY_R_R0, POLY_R_R0, 32; \
+       rllg POLY_R_R1, POLY_R_R1, 32; \
+       srlg POLY_R_R1_MUL5, POLY_R_R1, 2; \
+       algr POLY_R_R1_MUL5, POLY_R_R1;
+
+#define POLY1305_STORE_STATE() \
+       rllg POLY_R_H0, POLY_R_H0, 32; \
+       rllg POLY_R_H1, POLY_R_H1, 32; \
+       stg POLY_R_H0, POLY_S_H0; \
+       stg POLY_R_H1, POLY_S_H1; \
+       st POLY_R_H2, POLY_S_H2d;
+
+/* a = h + m */
+#define POLY1305_BLOCK_PART1_HB(src_offset, high_pad) \
+       lrvg POLY_R_X0_HI, ((src_offset) + 1 * 8)(POLY_RSRC); \
+       lrvg POLY_R_X0_LO, ((src_offset) + 0 * 8)(POLY_RSRC); \
+       lghi POLY_R_H1_TMP_HI, (high_pad);
+
+#define POLY1305_BLOCK_PART1(src_offset) \
+       POLY1305_BLOCK_PART1_HB(src_offset, 1);
+
+#define POLY1305_BLOCK_PART2() \
+       algr POLY_R_H0, POLY_R_X0_LO; \
+       alcgr POLY_R_H1, POLY_R_X0_HI; \
+       alcgr POLY_R_H2, POLY_R_H1_TMP_HI; \
+       lgr POLY_R_X1_LO, POLY_R_H0; \
+       lgr POLY_R_X0_LO, POLY_R_H0;
+
+#define POLY1305_BLOCK_PART3() \
+       /* h = a * r (partial mod 2^130-5): */ \
+       \
+       /* h0 * r1 */ \
+       mlgr POLY_R_X1_HI, POLY_R_R1; \
+       \
+       /* h1 * r0 */ \
+       lgr POLY_R_H0, POLY_R_H1; \
+       mlgr POLY_R_H0_TMP_HI, POLY_R_R0; \
+       \
+       /* h1 * r1 mod 2^130-5 */ \
+       mlgr POLY_R_H1_TMP_HI, POLY_R_R1_MUL5;
+
+#define POLY1305_BLOCK_PART4() \
+       \
+       /* h0 * r0 */ \
+       mlgr POLY_R_X0_HI, POLY_R_R0; \
+       \
+       algr POLY_R_X1_LO, POLY_R_H0; \
+       alcgr POLY_R_X1_HI, POLY_R_H0_TMP_HI; \
+       \
+       lgr POLY_R_H0_TMP_HI, POLY_R_H2; \
+       msgr POLY_R_H0_TMP_HI, POLY_R_R1_MUL5; /* h2 * r1 mod 2^130-5 */ \
+       msgr POLY_R_H2, POLY_R_R0;             /* h2 * r0 */
+
+#define POLY1305_BLOCK_PART5() \
+       \
+       algr POLY_R_X0_LO, POLY_R_H1; \
+       alcgr POLY_R_X0_HI, POLY_R_H1_TMP_HI;
+
+#define POLY1305_BLOCK_PART6() \
+       \
+       algrk POLY_R_H1, POLY_R_H0_TMP_HI, POLY_R_X1_LO; \
+       alcgr POLY_R_H2, POLY_R_X1_HI;
+
+#define POLY1305_BLOCK_PART7() \
+       \
+       /* carry propagation */ \
+       srlg POLY_R_H0, POLY_R_H2, 2; \
+       risbgn POLY_R_X1_LO, POLY_R_H2, 0, 0x80 | 61, 0; \
+       lghi POLY_R_H1_TMP_HI, 0; \
+       agr POLY_R_H0, POLY_R_X1_LO; \
+       risbgn POLY_R_H2, POLY_R_H2, 62, 0x80 | 63, 0;
+
+#define POLY1305_BLOCK_PART8() \
+       algr POLY_R_H0, POLY_R_X0_LO; \
+       alcgr POLY_R_H1, POLY_R_X0_HI; \
+       alcgr POLY_R_H2, POLY_R_H1_TMP_HI;
+
+#endif /* GCRY_ASM_POLY1305_S390X_H */
diff --git a/grub-core/lib/libgcrypt/cipher/bithelp.h 
b/grub-core/lib/libgcrypt/cipher/bithelp.h
index 150532433..7793ce7ca 100644
--- a/grub-core/lib/libgcrypt/cipher/bithelp.h
+++ b/grub-core/lib/libgcrypt/cipher/bithelp.h
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -14,41 +14,110 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
-#ifndef G10_BITHELP_H
-#define G10_BITHELP_H
+#ifndef GCRYPT_BITHELP_H
+#define GCRYPT_BITHELP_H
+
+#include "types.h"
 
 
 /****************
  * Rotate the 32 bit unsigned integer X by N bits left/right
  */
-#if defined(__GNUC__) && defined(__i386__)
+static inline u32 rol(u32 x, int n)
+{
+       return ( (x << (n&(32-1))) | (x >> ((32-n)&(32-1))) );
+}
+
+static inline u32 ror(u32 x, int n)
+{
+       return ( (x >> (n&(32-1))) | (x << ((32-n)&(32-1))) );
+}
+
+static inline u64 rol64(u64 x, int n)
+{
+  return ( (x << (n&(64-1))) | (x >> ((64-n)&(64-1))) );
+}
+
+/* Byte swap for 32-bit and 64-bit integers.  If available, use compiler
+   provided helpers.  */
+#ifdef HAVE_BUILTIN_BSWAP32
+# define _gcry_bswap32 __builtin_bswap32
+#else
 static inline u32
-rol( u32 x, int n)
+_gcry_bswap32(u32 x)
+{
+       return ((rol(x, 8) & 0x00ff00ffL) | (ror(x, 8) & 0xff00ff00L));
+}
+#endif
+
+#ifdef HAVE_BUILTIN_BSWAP64
+# define _gcry_bswap64 __builtin_bswap64
+#else
+static inline u64
+_gcry_bswap64(u64 x)
 {
-       __asm__("roll %%cl,%0"
-               :"=r" (x)
-               :"0" (x),"c" (n));
-       return x;
+       return ((u64)_gcry_bswap32(x) << 32) | (_gcry_bswap32(x >> 32));
 }
+#endif
+
+/* Endian dependent byte swap operations.  */
+#ifdef WORDS_BIGENDIAN
+# define le_bswap32(x) _gcry_bswap32(x)
+# define be_bswap32(x) ((u32)(x))
+# define le_bswap64(x) _gcry_bswap64(x)
+# define be_bswap64(x) ((u64)(x))
 #else
-#define rol(x,n) ( ((x) << (n)) | ((x) >> (32-(n))) )
+# define le_bswap32(x) ((u32)(x))
+# define be_bswap32(x) _gcry_bswap32(x)
+# define le_bswap64(x) ((u64)(x))
+# define be_bswap64(x) _gcry_bswap64(x)
 #endif
 
-#if defined(__GNUC__) && defined(__i386__)
-static inline u32
-ror(u32 x, int n)
+
+/* Count trailing zero bits in an unsigned int.  We return an int
+   because that is what gcc's builtin does.  Returns the number of
+   bits in X if X is 0. */
+static inline int
+_gcry_ctz (unsigned int x)
 {
-       __asm__("rorl %%cl,%0"
-               :"=r" (x)
-               :"0" (x),"c" (n));
-       return x;
+#if defined (HAVE_BUILTIN_CTZ)
+  return x ? __builtin_ctz (x) : 8 * sizeof (x);
+#else
+  /* See
+   * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightModLookup
+   */
+  static const unsigned char mod37[] =
+    {
+      sizeof (unsigned int)*8,
+          0,  1, 26,  2, 23, 27,  0,  3, 16, 24, 30, 28, 11,  0, 13,
+      4,  7, 17,  0, 25, 22, 31, 15, 29, 10, 12,  6,  0, 21, 14,  9,
+      5, 20,  8, 19, 18
+    };
+  return (int)mod37[(-x & x) % 37];
+#endif
 }
+
+
+/* Count trailing zero bits in an u64.  We return an int because that
+   is what gcc's builtin does.  Returns the number of bits in X if X
+   is 0.  */
+static inline int
+_gcry_ctz64(u64 x)
+{
+#if defined (HAVE_BUILTIN_CTZL) && SIZEOF_UNSIGNED_LONG >= 8
+  return x ? __builtin_ctzl (x) : 8 * sizeof (x);
+#elif defined (HAVE_BUILTIN_CTZ) && SIZEOF_UNSIGNED_INT >= 8
+#warning hello
+  return x ? __builtin_ctz (x) : 8 * sizeof (x);
 #else
-#define ror(x,n) ( ((x) >> (n)) | ((x) << (32-(n))) )
+  if ((x & 0xffffffff))
+    return _gcry_ctz (x);
+  else
+    return 32 + _gcry_ctz (x >> 32);
 #endif
+}
 
 
-#endif /*G10_BITHELP_H*/
+#endif /*GCRYPT_BITHELP_H*/
diff --git a/grub-core/lib/libgcrypt/cipher/blake2.c 
b/grub-core/lib/libgcrypt/cipher/blake2.c
new file mode 100644
index 000000000..451e71f64
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/blake2.c
@@ -0,0 +1,1086 @@
+/* blake2.c - BLAKE2b and BLAKE2s hash functions (RFC 7693)
+ * Copyright (C) 2017  Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* The code is based on public-domain/CC0 BLAKE2 reference implementation
+ * by Samuel Neves, at https://github.com/BLAKE2/BLAKE2/tree/master/ref
+ * Copyright 2012, Samuel Neves <sneves@dei.uc.pt>
+ */
+
+#include <config.h>
+#include <string.h>
+#include "g10lib.h"
+#include "bithelp.h"
+#include "bufhelp.h"
+#include "cipher.h"
+#include "hash-common.h"
+
+/* USE_AVX indicates whether to compile with Intel AVX code. */
+#undef USE_AVX
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX) && \
+    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AVX 1
+#endif
+
+/* USE_AVX2 indicates whether to compile with Intel AVX2 code. */
+#undef USE_AVX2
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \
+    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AVX2 1
+#endif
+
+/* USE_AVX512 indicates whether to compile with Intel AVX512 code. */
+#undef USE_AVX512
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AVX512 1
+#endif
+
+/* AMD64 assembly implementations use SystemV ABI, ABI conversion and 
additional
+ * stack to store XMM6-XMM15 needed on Win64. */
+#undef ASM_FUNC_ABI
+#undef ASM_EXTRA_STACK
+#if (defined(USE_AVX) || defined(USE_AVX2) || defined(USE_AVX512)) \
+    && defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+# define ASM_EXTRA_STACK (10 * 16)
+#else
+# define ASM_FUNC_ABI
+# define ASM_EXTRA_STACK 0
+#endif
+
+#define BLAKE2B_BLOCKBYTES 128
+#define BLAKE2B_OUTBYTES 64
+#define BLAKE2B_KEYBYTES 64
+
+#define BLAKE2S_BLOCKBYTES 64
+#define BLAKE2S_OUTBYTES 32
+#define BLAKE2S_KEYBYTES 32
+
+typedef struct
+{
+  u64 h[8];
+  u64 t[2];
+  u64 f[2];
+} BLAKE2B_STATE;
+
+struct blake2b_param_s
+{
+  byte digest_length;
+  byte key_length;
+  byte fanout;
+  byte depth;
+  byte leaf_length[4];
+  byte node_offset[4];
+  byte xof_length[4];
+  byte node_depth;
+  byte inner_length;
+  byte reserved[14];
+  byte salt[16];
+  byte personal[16];
+};
+
+typedef struct BLAKE2B_CONTEXT_S
+{
+  BLAKE2B_STATE state;
+  byte buf[BLAKE2B_BLOCKBYTES];
+  size_t buflen;
+  size_t outlen;
+#ifdef USE_AVX2
+  unsigned int use_avx2:1;
+#endif
+#ifdef USE_AVX512
+  unsigned int use_avx512:1;
+#endif
+} BLAKE2B_CONTEXT;
+
+typedef struct
+{
+  u32 h[8];
+  u32 t[2];
+  u32 f[2];
+} BLAKE2S_STATE;
+
+struct blake2s_param_s
+{
+  byte digest_length;
+  byte key_length;
+  byte fanout;
+  byte depth;
+  byte leaf_length[4];
+  byte node_offset[4];
+  byte xof_length[2];
+  byte node_depth;
+  byte inner_length;
+  /* byte reserved[0]; */
+  byte salt[8];
+  byte personal[8];
+};
+
+typedef struct BLAKE2S_CONTEXT_S
+{
+  BLAKE2S_STATE state;
+  byte buf[BLAKE2S_BLOCKBYTES];
+  size_t buflen;
+  size_t outlen;
+#ifdef USE_AVX
+  unsigned int use_avx:1;
+#endif
+#ifdef USE_AVX512
+  unsigned int use_avx512:1;
+#endif
+} BLAKE2S_CONTEXT;
+
+typedef unsigned int (*blake2_transform_t)(void *S, const void *inblk,
+                                          size_t nblks);
+
+
+static const u64 blake2b_IV[8] =
+{
+  U64_C(0x6a09e667f3bcc908), U64_C(0xbb67ae8584caa73b),
+  U64_C(0x3c6ef372fe94f82b), U64_C(0xa54ff53a5f1d36f1),
+  U64_C(0x510e527fade682d1), U64_C(0x9b05688c2b3e6c1f),
+  U64_C(0x1f83d9abfb41bd6b), U64_C(0x5be0cd19137e2179)
+};
+
+static const u32 blake2s_IV[8] =
+{
+  0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+  0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
+};
+
+static byte zero_block[BLAKE2B_BLOCKBYTES] = { 0, };
+
+
+static void blake2_write(void *S, const void *inbuf, size_t inlen,
+                        byte *tmpbuf, size_t *tmpbuflen, size_t blkbytes,
+                        blake2_transform_t transform_fn)
+{
+  const byte* in = inbuf;
+  unsigned int burn = 0;
+
+  if (inlen > 0)
+    {
+      size_t left = *tmpbuflen;
+      size_t fill = blkbytes - left;
+      size_t nblks;
+
+      if (inlen > fill)
+       {
+         if (fill > 0)
+           buf_cpy (tmpbuf + left, in, fill); /* Fill buffer */
+         left = 0;
+
+         burn = transform_fn (S, tmpbuf, 1); /* Increment counter + Compress */
+
+         in += fill;
+         inlen -= fill;
+
+         nblks = inlen / blkbytes - !(inlen % blkbytes);
+         if (nblks)
+           {
+             burn = transform_fn(S, in, nblks);
+             in += blkbytes * nblks;
+             inlen -= blkbytes * nblks;
+           }
+       }
+
+      gcry_assert (inlen > 0);
+
+      buf_cpy (tmpbuf + left, in, inlen);
+      *tmpbuflen = left + inlen;
+    }
+
+  if (burn)
+    _gcry_burn_stack (burn);
+
+  return;
+}
+
+
+static inline void blake2b_set_lastblock(BLAKE2B_STATE *S)
+{
+  S->f[0] = U64_C(0xffffffffffffffff);
+}
+
+static inline int blake2b_is_lastblock(const BLAKE2B_STATE *S)
+{
+  return S->f[0] != 0;
+}
+
+static inline void blake2b_increment_counter(BLAKE2B_STATE *S, const int inc)
+{
+  S->t[0] += (u64)inc;
+  S->t[1] += (S->t[0] < (u64)inc) - (inc < 0);
+}
+
+static inline u64 rotr64(u64 x, u64 n)
+{
+  return ((x >> (n & 63)) | (x << ((64 - n) & 63)));
+}
+
+static unsigned int blake2b_transform_generic(BLAKE2B_STATE *S,
+                                              const void *inblks,
+                                              size_t nblks)
+{
+  static const byte blake2b_sigma[12][16] =
+  {
+    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
+    { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
+    {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
+    {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
+    {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
+    { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
+    { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
+    {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
+    { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0 },
+    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
+  };
+  const byte* in = inblks;
+  u64 m[16];
+  u64 v[16];
+
+  while (nblks--)
+    {
+      /* Increment counter */
+      blake2b_increment_counter (S, BLAKE2B_BLOCKBYTES);
+
+      /* Compress */
+      m[0] = buf_get_le64 (in + 0 * sizeof(m[0]));
+      m[1] = buf_get_le64 (in + 1 * sizeof(m[0]));
+      m[2] = buf_get_le64 (in + 2 * sizeof(m[0]));
+      m[3] = buf_get_le64 (in + 3 * sizeof(m[0]));
+      m[4] = buf_get_le64 (in + 4 * sizeof(m[0]));
+      m[5] = buf_get_le64 (in + 5 * sizeof(m[0]));
+      m[6] = buf_get_le64 (in + 6 * sizeof(m[0]));
+      m[7] = buf_get_le64 (in + 7 * sizeof(m[0]));
+      m[8] = buf_get_le64 (in + 8 * sizeof(m[0]));
+      m[9] = buf_get_le64 (in + 9 * sizeof(m[0]));
+      m[10] = buf_get_le64 (in + 10 * sizeof(m[0]));
+      m[11] = buf_get_le64 (in + 11 * sizeof(m[0]));
+      m[12] = buf_get_le64 (in + 12 * sizeof(m[0]));
+      m[13] = buf_get_le64 (in + 13 * sizeof(m[0]));
+      m[14] = buf_get_le64 (in + 14 * sizeof(m[0]));
+      m[15] = buf_get_le64 (in + 15 * sizeof(m[0]));
+
+      v[ 0] = S->h[0];
+      v[ 1] = S->h[1];
+      v[ 2] = S->h[2];
+      v[ 3] = S->h[3];
+      v[ 4] = S->h[4];
+      v[ 5] = S->h[5];
+      v[ 6] = S->h[6];
+      v[ 7] = S->h[7];
+      v[ 8] = blake2b_IV[0];
+      v[ 9] = blake2b_IV[1];
+      v[10] = blake2b_IV[2];
+      v[11] = blake2b_IV[3];
+      v[12] = blake2b_IV[4] ^ S->t[0];
+      v[13] = blake2b_IV[5] ^ S->t[1];
+      v[14] = blake2b_IV[6] ^ S->f[0];
+      v[15] = blake2b_IV[7] ^ S->f[1];
+
+#define G(r,i,a,b,c,d)                      \
+  do {                                      \
+    a = a + b + m[blake2b_sigma[r][2*i+0]]; \
+    d = rotr64(d ^ a, 32);                  \
+    c = c + d;                              \
+    b = rotr64(b ^ c, 24);                  \
+    a = a + b + m[blake2b_sigma[r][2*i+1]]; \
+    d = rotr64(d ^ a, 16);                  \
+    c = c + d;                              \
+    b = rotr64(b ^ c, 63);                  \
+  } while(0)
+
+#define ROUND(r)                    \
+  do {                              \
+    G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
+    G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
+    G(r,2,v[ 2],v[ 6],v[10],v[14]); \
+    G(r,3,v[ 3],v[ 7],v[11],v[15]); \
+    G(r,4,v[ 0],v[ 5],v[10],v[15]); \
+    G(r,5,v[ 1],v[ 6],v[11],v[12]); \
+    G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
+    G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
+  } while(0)
+
+      ROUND(0);
+      ROUND(1);
+      ROUND(2);
+      ROUND(3);
+      ROUND(4);
+      ROUND(5);
+      ROUND(6);
+      ROUND(7);
+      ROUND(8);
+      ROUND(9);
+      ROUND(10);
+      ROUND(11);
+
+#undef G
+#undef ROUND
+
+      S->h[0] = S->h[0] ^ v[0] ^ v[0 + 8];
+      S->h[1] = S->h[1] ^ v[1] ^ v[1 + 8];
+      S->h[2] = S->h[2] ^ v[2] ^ v[2 + 8];
+      S->h[3] = S->h[3] ^ v[3] ^ v[3 + 8];
+      S->h[4] = S->h[4] ^ v[4] ^ v[4 + 8];
+      S->h[5] = S->h[5] ^ v[5] ^ v[5 + 8];
+      S->h[6] = S->h[6] ^ v[6] ^ v[6 + 8];
+      S->h[7] = S->h[7] ^ v[7] ^ v[7 + 8];
+
+      in += BLAKE2B_BLOCKBYTES;
+    }
+
+  return sizeof(void *) * 4 + sizeof(u64) * 16 * 2;
+}
+
+#ifdef USE_AVX2
+unsigned int _gcry_blake2b_transform_amd64_avx2(BLAKE2B_STATE *S,
+                                                const void *inblks,
+                                                size_t nblks) ASM_FUNC_ABI;
+#endif
+
+#ifdef USE_AVX512
+unsigned int _gcry_blake2b_transform_amd64_avx512(BLAKE2B_STATE *S,
+                                                  const void *inblks,
+                                                  size_t nblks) ASM_FUNC_ABI;
+#endif
+
+/* Dispatch NBLKS 128-byte block compressions to the fastest available
+ * BLAKE2b implementation (AVX512, then AVX2, then generic C).  Returns
+ * the number of stack bytes the caller should burn afterwards.  */
+static unsigned int blake2b_transform(void *ctx, const void *inblks,
+                                      size_t nblks)
+{
+  BLAKE2B_CONTEXT *c = ctx;
+  unsigned int nburn;
+
+  if (0)
+    {}
+#ifdef USE_AVX512
+  else if (c->use_avx512)
+    nburn = _gcry_blake2b_transform_amd64_avx512(&c->state, inblks, nblks);
+#endif
+#ifdef USE_AVX2
+  else if (c->use_avx2)
+    nburn = _gcry_blake2b_transform_amd64_avx2(&c->state, inblks, nblks);
+#endif
+  else
+    nburn = blake2b_transform_generic(&c->state, inblks, nblks);
+
+  if (nburn)
+    nburn += ASM_EXTRA_STACK;
+
+  return nburn;
+}
+
+/* Finalize the hash: zero-pad the last partial block, set the
+ * last-block flag, run the final compression and store the
+ * little-endian digest in c->buf.  Idempotent: once the last-block
+ * flag is set a second call returns immediately.  */
+static void blake2b_final(void *ctx)
+{
+  BLAKE2B_CONTEXT *c = ctx;
+  BLAKE2B_STATE *S = &c->state;
+  unsigned int burn;
+  size_t i;
+
+  gcry_assert (sizeof(c->buf) >= c->outlen);
+  if (blake2b_is_lastblock(S))
+    return;
+
+  if (c->buflen < BLAKE2B_BLOCKBYTES)
+    memset (c->buf + c->buflen, 0, BLAKE2B_BLOCKBYTES - c->buflen); /* Padding */
+  blake2b_set_lastblock (S);
+  /* Pre-subtract so the transform's unconditional +BLAKE2B_BLOCKBYTES
+   * nets out to +buflen, the number of bytes actually in this block.  */
+  blake2b_increment_counter (S, (int)c->buflen - BLAKE2B_BLOCKBYTES);
+  burn = blake2b_transform (ctx, c->buf, 1);
+
+  /* Output full hash to buffer */
+  for (i = 0; i < 8; ++i)
+    buf_put_le64 (c->buf + sizeof(S->h[i]) * i, S->h[i]);
+
+  /* Zero out extra buffer bytes. */
+  if (c->outlen < sizeof(c->buf))
+    memset (c->buf + c->outlen, 0, sizeof(c->buf) - c->outlen);
+
+  if (burn)
+    _gcry_burn_stack (burn);
+}
+
+/* Return a pointer to the digest written into c->buf by blake2b_final.  */
+static byte *blake2b_read(void *ctx)
+{
+  BLAKE2B_CONTEXT *c = ctx;
+  return c->buf;
+}
+
+/* Absorb INLEN bytes, buffering partial blocks in c->buf and feeding
+ * full 128-byte blocks to blake2b_transform via the shared helper.  */
+static void blake2b_write(void *ctx, const void *inbuf, size_t inlen)
+{
+  BLAKE2B_CONTEXT *c = ctx;
+  BLAKE2B_STATE *S = &c->state;
+  blake2_write(S, inbuf, inlen, c->buf, &c->buflen, BLAKE2B_BLOCKBYTES,
+              blake2b_transform);
+}
+
+/* Set the chaining value h[] to IV XOR the little-endian parameter
+ * block P (digest length, key length, fanout, depth, ...).  */
+static inline void blake2b_init_param(BLAKE2B_STATE *S,
+                                     const struct blake2b_param_s *P)
+{
+  const byte *p = (const byte *)P;
+  size_t i;
+
+  /* init xors IV with input parameter block */
+
+  /* IV XOR ParamBlock */
+  for (i = 0; i < 8; ++i)
+    S->h[i] = blake2b_IV[i] ^ buf_get_le64(p + sizeof(S->h[i]) * i);
+}
+
+/* Validate digest/key lengths, build the parameter block and set up
+ * the state.  For keyed hashing, the key is absorbed padded with
+ * zeros to one full 128-byte block, per RFC 7693.  */
+static inline gcry_err_code_t blake2b_init(BLAKE2B_CONTEXT *ctx,
+                                          const byte *key, size_t keylen)
+{
+  struct blake2b_param_s P[1] = { { 0, } };
+  BLAKE2B_STATE *S = &ctx->state;
+
+  if (!ctx->outlen || ctx->outlen > BLAKE2B_OUTBYTES)
+    return GPG_ERR_INV_ARG;
+  if (sizeof(P[0]) != sizeof(u64) * 8)
+    return GPG_ERR_INTERNAL;
+  if (keylen && (!key || keylen > BLAKE2B_KEYBYTES))
+    return GPG_ERR_INV_KEYLEN;
+
+  P->digest_length = ctx->outlen;
+  P->key_length = keylen;
+  P->fanout = 1;
+  P->depth = 1;
+
+  blake2b_init_param (S, P);
+  wipememory (P, sizeof(P));  /* Param block may contain the key length.  */
+
+  if (key)
+    {
+      blake2b_write (ctx, key, keylen);
+      blake2b_write (ctx, zero_block, BLAKE2B_BLOCKBYTES - keylen);
+    }
+
+  return 0;
+}
+
+/* Common context initializer for all BLAKE2b variants: probe CPU
+ * features to pick an accelerated transform, record digest length
+ * (DBITS is in bits) and run keyed/unkeyed state setup.  */
+static gcry_err_code_t blake2b_init_ctx(void *ctx, unsigned int flags,
+                                       const byte *key, size_t keylen,
+                                       unsigned int dbits)
+{
+  BLAKE2B_CONTEXT *c = ctx;
+  unsigned int features = _gcry_get_hw_features ();
+
+  (void)features;
+  (void)flags;
+
+  memset (c, 0, sizeof (*c));
+
+#ifdef USE_AVX2
+  c->use_avx2 = !!(features & HWF_INTEL_AVX2);
+#endif
+#ifdef USE_AVX512
+  c->use_avx512 = !!(features & HWF_INTEL_AVX512);
+#endif
+
+  c->outlen = dbits / 8;
+  c->buflen = 0;
+  return blake2b_init(c, key, keylen);
+}
+
+/* Variable-length Hash Function H'.
+ * Hashes the 32-bit LE length prefix followed by IN, then for outputs
+ * longer than 64 bytes chains 64-byte BLAKE2b digests, emitting 32
+ * bytes per step and the final `remained' bytes from the last digest.
+ * NOTE(review): this matches the H' construction used by Argon2
+ * (RFC 9106) -- confirm against the caller.  */
+gcry_err_code_t
+blake2b_vl_hash (const void *in, size_t inlen, size_t outputlen, void *output)
+{
+  gcry_err_code_t ec;
+  BLAKE2B_CONTEXT ctx;
+  unsigned char buf[4];
+
+  ec = blake2b_init_ctx (&ctx, 0, NULL, 0,
+                         (outputlen < 64 ? outputlen: 64)*8);
+  if (ec)
+    return ec;
+
+  buf_put_le32 (buf, outputlen);
+  blake2b_write (&ctx, buf, 4);
+  blake2b_write (&ctx, in, inlen);
+  blake2b_final (&ctx);
+
+  if (outputlen <= 64)
+    memcpy (output, ctx.buf, outputlen);
+  else
+    {
+      int r = (outputlen-1)/32 - 1;
+      unsigned int remained = outputlen - 32*r;
+      int i;
+      unsigned char d[64];
+
+      i = 0;
+      while (1)
+        {
+          memcpy (d, ctx.buf, 64);
+          memcpy ((unsigned char *)output+i*32, d, 32);
+
+          if (++i >= r)
+            break;
+
+          ec = blake2b_init_ctx (&ctx, 0, NULL, 0, 64*8);
+          if (ec)
+            return ec;
+
+          blake2b_write (&ctx, d, 64);
+          blake2b_final (&ctx);
+        }
+
+      /* Last step uses a digest sized to the remaining byte count.  */
+      ec = blake2b_init_ctx (&ctx, 0, NULL, 0, remained*8);
+      if (ec)
+        return ec;
+
+      blake2b_write (&ctx, d, 64);
+      blake2b_final (&ctx);
+
+      memcpy ((unsigned char *)output+r*32, ctx.buf, remained);
+    }
+
+  wipememory (buf, sizeof (buf));
+  wipememory (&ctx, sizeof (ctx));
+  return 0;
+}
+
+/* Mark the final block by setting the first finalization flag.  */
+static inline void blake2s_set_lastblock(BLAKE2S_STATE *S)
+{
+  S->f[0] = 0xFFFFFFFFUL;
+}
+
+/* Nonzero once blake2s_set_lastblock() has been called.  */
+static inline int blake2s_is_lastblock(BLAKE2S_STATE *S)
+{
+  return S->f[0] != 0;
+}
+
+/* Add INC to the 64-bit message counter kept as two u32 halves;
+ * handles carry, and borrow when INC is negative (used by
+ * blake2s_final to rewind the last partial block).  */
+static inline void blake2s_increment_counter(BLAKE2S_STATE *S, const int inc)
+{
+  S->t[0] += (u32)inc;
+  S->t[1] += (S->t[0] < (u32)inc) - (inc < 0);
+}
+
+/* Generic C compression: absorb NBLKS consecutive 64-byte blocks into
+ * BLAKE2s state S (RFC 7693 F function, 10 rounds of G).
+ * NOTE(review): `burn' is never updated, so this path reports 0 stack
+ * bytes to burn, unlike blake2b_transform_generic above which returns
+ * a size estimate covering m[]/v[] -- confirm against upstream.  */
+static unsigned int blake2s_transform_generic(BLAKE2S_STATE *S,
+                                              const void *inblks,
+                                              size_t nblks)
+{
+  /* Message word permutation schedule, one row per round (RFC 7693).  */
+  static const byte blake2s_sigma[10][16] =
+  {
+    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
+    { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
+    {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
+    {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
+    {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
+    { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
+    { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
+    {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
+    { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0 },
+  };
+  unsigned int burn = 0;
+  const byte* in = inblks;
+  u32 m[16];
+  u32 v[16];
+
+  while (nblks--)
+    {
+      /* Increment counter */
+      blake2s_increment_counter (S, BLAKE2S_BLOCKBYTES);
+
+      /* Compress */
+      m[0] = buf_get_le32 (in + 0 * sizeof(m[0]));
+      m[1] = buf_get_le32 (in + 1 * sizeof(m[0]));
+      m[2] = buf_get_le32 (in + 2 * sizeof(m[0]));
+      m[3] = buf_get_le32 (in + 3 * sizeof(m[0]));
+      m[4] = buf_get_le32 (in + 4 * sizeof(m[0]));
+      m[5] = buf_get_le32 (in + 5 * sizeof(m[0]));
+      m[6] = buf_get_le32 (in + 6 * sizeof(m[0]));
+      m[7] = buf_get_le32 (in + 7 * sizeof(m[0]));
+      m[8] = buf_get_le32 (in + 8 * sizeof(m[0]));
+      m[9] = buf_get_le32 (in + 9 * sizeof(m[0]));
+      m[10] = buf_get_le32 (in + 10 * sizeof(m[0]));
+      m[11] = buf_get_le32 (in + 11 * sizeof(m[0]));
+      m[12] = buf_get_le32 (in + 12 * sizeof(m[0]));
+      m[13] = buf_get_le32 (in + 13 * sizeof(m[0]));
+      m[14] = buf_get_le32 (in + 14 * sizeof(m[0]));
+      m[15] = buf_get_le32 (in + 15 * sizeof(m[0]));
+
+      /* Working vector: chaining value, IV, counter and flags.  */
+      v[ 0] = S->h[0];
+      v[ 1] = S->h[1];
+      v[ 2] = S->h[2];
+      v[ 3] = S->h[3];
+      v[ 4] = S->h[4];
+      v[ 5] = S->h[5];
+      v[ 6] = S->h[6];
+      v[ 7] = S->h[7];
+      v[ 8] = blake2s_IV[0];
+      v[ 9] = blake2s_IV[1];
+      v[10] = blake2s_IV[2];
+      v[11] = blake2s_IV[3];
+      v[12] = S->t[0] ^ blake2s_IV[4];
+      v[13] = S->t[1] ^ blake2s_IV[5];
+      v[14] = S->f[0] ^ blake2s_IV[6];
+      v[15] = S->f[1] ^ blake2s_IV[7];
+
+#define G(r,i,a,b,c,d)                      \
+  do {                                      \
+    a = a + b + m[blake2s_sigma[r][2*i+0]]; \
+    d = ror(d ^ a, 16);                     \
+    c = c + d;                              \
+    b = ror(b ^ c, 12);                     \
+    a = a + b + m[blake2s_sigma[r][2*i+1]]; \
+    d = ror(d ^ a, 8);                      \
+    c = c + d;                              \
+    b = ror(b ^ c, 7);                      \
+  } while(0)
+
+#define ROUND(r)                    \
+  do {                              \
+    G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
+    G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
+    G(r,2,v[ 2],v[ 6],v[10],v[14]); \
+    G(r,3,v[ 3],v[ 7],v[11],v[15]); \
+    G(r,4,v[ 0],v[ 5],v[10],v[15]); \
+    G(r,5,v[ 1],v[ 6],v[11],v[12]); \
+    G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
+    G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
+  } while(0)
+
+      ROUND(0);
+      ROUND(1);
+      ROUND(2);
+      ROUND(3);
+      ROUND(4);
+      ROUND(5);
+      ROUND(6);
+      ROUND(7);
+      ROUND(8);
+      ROUND(9);
+
+#undef G
+#undef ROUND
+
+      /* Feed-forward: h ^= low half ^ high half of working vector.  */
+      S->h[0] = S->h[0] ^ v[0] ^ v[0 + 8];
+      S->h[1] = S->h[1] ^ v[1] ^ v[1 + 8];
+      S->h[2] = S->h[2] ^ v[2] ^ v[2 + 8];
+      S->h[3] = S->h[3] ^ v[3] ^ v[3 + 8];
+      S->h[4] = S->h[4] ^ v[4] ^ v[4 + 8];
+      S->h[5] = S->h[5] ^ v[5] ^ v[5 + 8];
+      S->h[6] = S->h[6] ^ v[6] ^ v[6 + 8];
+      S->h[7] = S->h[7] ^ v[7] ^ v[7 + 8];
+
+      in += BLAKE2S_BLOCKBYTES;
+    }
+
+  return burn;
+}
+
+#ifdef USE_AVX
+unsigned int _gcry_blake2s_transform_amd64_avx(BLAKE2S_STATE *S,
+                                               const void *inblks,
+                                               size_t nblks) ASM_FUNC_ABI;
+#endif
+
+#ifdef USE_AVX512
+unsigned int _gcry_blake2s_transform_amd64_avx512(BLAKE2S_STATE *S,
+                                                  const void *inblks,
+                                                  size_t nblks) ASM_FUNC_ABI;
+#endif
+
+/* Dispatch NBLKS 64-byte block compressions to the fastest available
+ * BLAKE2s implementation (AVX512, then AVX, then generic C).  Returns
+ * the number of stack bytes the caller should burn afterwards.  */
+static unsigned int blake2s_transform(void *ctx, const void *inblks,
+                                      size_t nblks)
+{
+  BLAKE2S_CONTEXT *c = ctx;
+  unsigned int nburn;
+
+  if (0)
+    { }
+#ifdef USE_AVX512
+  else if (c->use_avx512)
+    nburn = _gcry_blake2s_transform_amd64_avx512(&c->state, inblks, nblks);
+#endif
+#ifdef USE_AVX
+  else if (c->use_avx)
+    nburn = _gcry_blake2s_transform_amd64_avx(&c->state, inblks, nblks);
+#endif
+  else
+    nburn = blake2s_transform_generic(&c->state, inblks, nblks);
+
+  if (nburn)
+    nburn += ASM_EXTRA_STACK;
+
+  return nburn;
+}
+
+/* Finalize the hash: zero-pad the last partial block, set the
+ * last-block flag, run the final compression and store the
+ * little-endian digest in c->buf.  Idempotent once finalized.  */
+static void blake2s_final(void *ctx)
+{
+  BLAKE2S_CONTEXT *c = ctx;
+  BLAKE2S_STATE *S = &c->state;
+  unsigned int burn;
+  size_t i;
+
+  gcry_assert (sizeof(c->buf) >= c->outlen);
+  if (blake2s_is_lastblock(S))
+    return;
+
+  if (c->buflen < BLAKE2S_BLOCKBYTES)
+    memset (c->buf + c->buflen, 0, BLAKE2S_BLOCKBYTES - c->buflen); /* Padding */
+  blake2s_set_lastblock (S);
+  /* Pre-subtract so the transform's unconditional +BLAKE2S_BLOCKBYTES
+   * nets out to +buflen, the number of bytes actually in this block.  */
+  blake2s_increment_counter (S, (int)c->buflen - BLAKE2S_BLOCKBYTES);
+  burn = blake2s_transform (ctx, c->buf, 1);
+
+  /* Output full hash to buffer */
+  for (i = 0; i < 8; ++i)
+    buf_put_le32 (c->buf + sizeof(S->h[i]) * i, S->h[i]);
+
+  /* Zero out extra buffer bytes. */
+  if (c->outlen < sizeof(c->buf))
+    memset (c->buf + c->outlen, 0, sizeof(c->buf) - c->outlen);
+
+  if (burn)
+    _gcry_burn_stack (burn);
+}
+
+/* Return a pointer to the digest written into c->buf by blake2s_final.  */
+static byte *blake2s_read(void *ctx)
+{
+  BLAKE2S_CONTEXT *c = ctx;
+  return c->buf;
+}
+
+/* Absorb INLEN bytes, buffering partial blocks in c->buf and feeding
+ * full 64-byte blocks to blake2s_transform via the shared helper.  */
+static void blake2s_write(void *ctx, const void *inbuf, size_t inlen)
+{
+  BLAKE2S_CONTEXT *c = ctx;
+  BLAKE2S_STATE *S = &c->state;
+  blake2_write(S, inbuf, inlen, c->buf, &c->buflen, BLAKE2S_BLOCKBYTES,
+              blake2s_transform);
+}
+
+/* XOR IV and the little-endian parameter block into h[].  Uses ^= so
+ * it relies on h[] being all-zero beforehand (blake2s_init_ctx memsets
+ * the whole context before calling down here).  */
+static inline void blake2s_init_param(BLAKE2S_STATE *S,
+                                     const struct blake2s_param_s *P)
+{
+  const byte *p = (const byte *)P;
+  size_t i;
+
+  /* init2 xors IV with input parameter block */
+
+  /* IV XOR ParamBlock */
+  for (i = 0; i < 8; ++i)
+    S->h[i] ^= blake2s_IV[i] ^ buf_get_le32(&p[i * 4]);
+}
+
+/* Validate digest/key lengths, build the parameter block and set up
+ * the state.  For keyed hashing, the key is absorbed padded with
+ * zeros to one full 64-byte block, per RFC 7693.  */
+static inline gcry_err_code_t blake2s_init(BLAKE2S_CONTEXT *ctx,
+                                          const byte *key, size_t keylen)
+{
+  struct blake2s_param_s P[1] = { { 0, } };
+  BLAKE2S_STATE *S = &ctx->state;
+
+  if (!ctx->outlen || ctx->outlen > BLAKE2S_OUTBYTES)
+    return GPG_ERR_INV_ARG;
+  if (sizeof(P[0]) != sizeof(u32) * 8)
+    return GPG_ERR_INTERNAL;
+  if (keylen && (!key || keylen > BLAKE2S_KEYBYTES))
+    return GPG_ERR_INV_KEYLEN;
+
+  P->digest_length = ctx->outlen;
+  P->key_length = keylen;
+  P->fanout = 1;
+  P->depth = 1;
+
+  blake2s_init_param (S, P);
+  wipememory (P, sizeof(P));  /* Param block may contain the key length.  */
+
+  if (key)
+    {
+      blake2s_write (ctx, key, keylen);
+      blake2s_write (ctx, zero_block, BLAKE2S_BLOCKBYTES - keylen);
+    }
+
+  return 0;
+}
+
+/* Common context initializer for all BLAKE2s variants: probe CPU
+ * features to pick an accelerated transform, record digest length
+ * (DBITS is in bits) and run keyed/unkeyed state setup.  */
+static gcry_err_code_t blake2s_init_ctx(void *ctx, unsigned int flags,
+                                       const byte *key, size_t keylen,
+                                       unsigned int dbits)
+{
+  BLAKE2S_CONTEXT *c = ctx;
+  unsigned int features = _gcry_get_hw_features ();
+
+  (void)features;
+  (void)flags;
+
+  memset (c, 0, sizeof (*c));
+
+#ifdef USE_AVX
+  c->use_avx = !!(features & HWF_INTEL_AVX);
+#endif
+#ifdef USE_AVX512
+  /* Fix: this guard previously read "#ifdef USE_AVX", which breaks the
+   * build (or mis-gates AVX512) whenever USE_AVX and USE_AVX512 are
+   * not defined together -- the use_avx512 field is guarded by
+   * USE_AVX512, and the BLAKE2b counterpart uses the matching macro.  */
+  c->use_avx512 = !!(features & HWF_INTEL_AVX512);
+#endif
+
+  c->outlen = dbits / 8;
+  c->buflen = 0;
+  return blake2s_init(c, key, keylen);
+}
+
+/* Selftests from "RFC 7693, Appendix E. BLAKE2b and BLAKE2s Self-Test
+ * Module C Source". */
+
+/* Deterministic pseudo-random byte generator used by the self-tests:
+ * a Fibonacci-style u32 sequence seeded from SEED, emitting the top
+ * byte of each term.  */
+static void selftest_seq(byte *out, size_t len, u32 seed)
+{
+  size_t i;
+  u32 t, a, b;
+
+  a = 0xDEAD4BAD * seed;
+  b = 1;
+
+  for (i = 0; i < len; i++)
+    {
+      t = a + b;
+      a = b;
+      b = t;
+      out[i] = (t >> 24) & 0xFF;
+    }
+}
+
+/* RFC 7693 Appendix E self-test for BLAKE2b: hash many keyed and
+ * unkeyed digests of generated data into one 256-bit "hash of hashes"
+ * and compare it against the known answer.  */
+static gpg_err_code_t
+selftests_blake2b (int algo, int extended, selftest_report_func_t report)
+{
+  /* Expected final 256-bit digest from RFC 7693 Appendix E.  */
+  static const byte blake2b_res[32] =
+  {
+    0xC2, 0x3A, 0x78, 0x00, 0xD9, 0x81, 0x23, 0xBD,
+    0x10, 0xF5, 0x06, 0xC6, 0x1E, 0x29, 0xDA, 0x56,
+    0x03, 0xD7, 0x63, 0xB8, 0xBB, 0xAD, 0x2E, 0x73,
+    0x7F, 0x5E, 0x76, 0x5A, 0x7B, 0xCC, 0xD4, 0x75
+  };
+  static const size_t b2b_md_len[4] = { 20, 32, 48, 64 };
+  static const size_t b2b_in_len[6] = { 0, 3, 128, 129, 255, 1024 };
+  size_t i, j, outlen, inlen;
+  byte in[1024], key[64];
+  BLAKE2B_CONTEXT ctx;
+  BLAKE2B_CONTEXT ctx2;
+  const char *what;
+  const char *errtxt;
+
+  (void)extended;
+
+  what = "rfc7693 BLAKE2b selftest";
+
+  /* 256-bit hash for testing */
+  if (blake2b_init_ctx(&ctx, 0, NULL, 0, 32 * 8))
+    {
+      errtxt = "init failed";
+      goto failed;
+    }
+
+  for (i = 0; i < 4; i++)
+    {
+      outlen = b2b_md_len[i];
+      for (j = 0; j < 6; j++)
+       {
+         inlen = b2b_in_len[j];
+
+         selftest_seq(in, inlen, inlen); /* unkeyed hash */
+         blake2b_init_ctx(&ctx2, 0, NULL, 0, outlen * 8);
+         blake2b_write(&ctx2, in, inlen);
+         blake2b_final(&ctx2);
+         blake2b_write(&ctx, ctx2.buf, outlen); /* hash the hash */
+
+         selftest_seq(key, outlen, outlen); /* keyed hash */
+         blake2b_init_ctx(&ctx2, 0, key, outlen, outlen * 8);
+         blake2b_write(&ctx2, in, inlen);
+         blake2b_final(&ctx2);
+         blake2b_write(&ctx, ctx2.buf, outlen); /* hash the hash */
+       }
+    }
+
+  /* compute and compare the hash of hashes */
+  blake2b_final(&ctx);
+  for (i = 0; i < 32; i++)
+    {
+      if (ctx.buf[i] != blake2b_res[i])
+       {
+         errtxt = "digest mismatch";
+         goto failed;
+       }
+    }
+
+  return 0;
+
+failed:
+  if (report)
+    report ("digest", algo, what, errtxt);
+  return GPG_ERR_SELFTEST_FAILED;
+}
+
+/* RFC 7693 Appendix E self-test for BLAKE2s, mirroring
+ * selftests_blake2b with 32-byte keys and BLAKE2s lengths.  */
+static gpg_err_code_t
+selftests_blake2s (int algo, int extended, selftest_report_func_t report)
+{
+  /* Expected final 256-bit digest from RFC 7693 Appendix E.  */
+  static const byte blake2s_res[32] =
+  {
+    0x6A, 0x41, 0x1F, 0x08, 0xCE, 0x25, 0xAD, 0xCD,
+    0xFB, 0x02, 0xAB, 0xA6, 0x41, 0x45, 0x1C, 0xEC,
+    0x53, 0xC5, 0x98, 0xB2, 0x4F, 0x4F, 0xC7, 0x87,
+    0xFB, 0xDC, 0x88, 0x79, 0x7F, 0x4C, 0x1D, 0xFE
+  };
+  static const size_t b2s_md_len[4] = { 16, 20, 28, 32 };
+  static const size_t b2s_in_len[6] = { 0, 3, 64, 65, 255, 1024 };
+  size_t i, j, outlen, inlen;
+  byte in[1024], key[32];
+  BLAKE2S_CONTEXT ctx;
+  BLAKE2S_CONTEXT ctx2;
+  const char *what;
+  const char *errtxt;
+
+  (void)extended;
+
+  what = "rfc7693 BLAKE2s selftest";
+
+  /* 256-bit hash for testing */
+  if (blake2s_init_ctx(&ctx, 0, NULL, 0, 32 * 8))
+    {
+      errtxt = "init failed";
+      goto failed;
+    }
+
+  for (i = 0; i < 4; i++)
+    {
+      outlen = b2s_md_len[i];
+      for (j = 0; j < 6; j++)
+       {
+         inlen = b2s_in_len[j];
+
+         selftest_seq(in, inlen, inlen); /* unkeyed hash */
+         blake2s_init_ctx(&ctx2, 0, NULL, 0, outlen * 8);
+         blake2s_write(&ctx2, in, inlen);
+         blake2s_final(&ctx2);
+         blake2s_write(&ctx, ctx2.buf, outlen); /* hash the hash */
+
+         selftest_seq(key, outlen, outlen); /* keyed hash */
+         blake2s_init_ctx(&ctx2, 0, key, outlen, outlen * 8);
+         blake2s_write(&ctx2, in, inlen);
+         blake2s_final(&ctx2);
+         blake2s_write(&ctx, ctx2.buf, outlen); /* hash the hash */
+       }
+    }
+
+  /* compute and compare the hash of hashes */
+  blake2s_final(&ctx);
+  for (i = 0; i < 32; i++)
+    {
+      if (ctx.buf[i] != blake2s_res[i])
+       {
+         errtxt = "digest mismatch";
+         goto failed;
+       }
+    }
+
+  return 0;
+
+failed:
+  if (report)
+    report ("digest", algo, what, errtxt);
+  return GPG_ERR_SELFTEST_FAILED;
+}
+
+
+/* Public entry point: initialize a BLAKE2 context for ALGO with an
+ * optional KEY, mapping each algorithm id to its digest bit-length.
+ * Returns GPG_ERR_DIGEST_ALGO for non-BLAKE2 ids.  */
+gcry_err_code_t _gcry_blake2_init_with_key(void *ctx, unsigned int flags,
+                                          const unsigned char *key,
+                                          size_t keylen, int algo)
+{
+  gcry_err_code_t rc;
+  switch (algo)
+    {
+    case GCRY_MD_BLAKE2B_512:
+      rc = blake2b_init_ctx (ctx, flags, key, keylen, 512);
+      break;
+    case GCRY_MD_BLAKE2B_384:
+      rc = blake2b_init_ctx (ctx, flags, key, keylen, 384);
+      break;
+    case GCRY_MD_BLAKE2B_256:
+      rc = blake2b_init_ctx (ctx, flags, key, keylen, 256);
+      break;
+    case GCRY_MD_BLAKE2B_160:
+      rc = blake2b_init_ctx (ctx, flags, key, keylen, 160);
+      break;
+    case GCRY_MD_BLAKE2S_256:
+      rc = blake2s_init_ctx (ctx, flags, key, keylen, 256);
+      break;
+    case GCRY_MD_BLAKE2S_224:
+      rc = blake2s_init_ctx (ctx, flags, key, keylen, 224);
+      break;
+    case GCRY_MD_BLAKE2S_160:
+      rc = blake2s_init_ctx (ctx, flags, key, keylen, 160);
+      break;
+    case GCRY_MD_BLAKE2S_128:
+      rc = blake2s_init_ctx (ctx, flags, key, keylen, 128);
+      break;
+    default:
+      rc = GPG_ERR_DIGEST_ALGO;
+      break;
+    }
+
+  return rc;
+}
+
+
+/* Instantiate, for one BLAKE2 variant (bs/BS = b/B or s/S, DBITS =
+ * digest bits), the init wrapper, the one-shot hash_buffers helper,
+ * the OID table and the exported gcry_md_spec_t.
+ * NOTE(review): the OID string below carries a leading space --
+ * verify this matches libgcrypt's OID matching convention.  */
+#define DEFINE_BLAKE2_VARIANT(bs, BS, dbits, oid_branch) \
+  static void blake2##bs##_##dbits##_init(void *ctx, unsigned int flags) \
+  { \
+    int err = blake2##bs##_init_ctx (ctx, flags, NULL, 0, dbits); \
+    gcry_assert (err == 0); \
+  } \
+  static void \
+  _gcry_blake2##bs##_##dbits##_hash_buffers(void *outbuf, size_t nbytes, \
+        const gcry_buffer_t *iov, int iovcnt) \
+  { \
+    BLAKE2##BS##_CONTEXT hd; \
+    (void)nbytes; \
+    blake2##bs##_##dbits##_init (&hd, 0); \
+    for (;iovcnt > 0; iov++, iovcnt--) \
+      blake2##bs##_write (&hd, (const char*)iov[0].data + iov[0].off, \
+                          iov[0].len); \
+    blake2##bs##_final (&hd); \
+    memcpy (outbuf, blake2##bs##_read (&hd), dbits / 8); \
+  } \
+  static const byte blake2##bs##_##dbits##_asn[] = { 0x30 }; \
+  static const gcry_md_oid_spec_t oid_spec_blake2##bs##_##dbits[] = \
+    { \
+      { " 1.3.6.1.4.1.1722.12.2." oid_branch }, \
+      { NULL } \
+    }; \
+  const gcry_md_spec_t _gcry_digest_spec_blake2##bs##_##dbits = \
+    { \
+      GCRY_MD_BLAKE2##BS##_##dbits, {0, 0}, \
+      "BLAKE2" #BS "_" #dbits, blake2##bs##_##dbits##_asn, \
+      DIM (blake2##bs##_##dbits##_asn), oid_spec_blake2##bs##_##dbits, \
+      dbits / 8, blake2##bs##_##dbits##_init, blake2##bs##_write, \
+      blake2##bs##_final, blake2##bs##_read, NULL, \
+      _gcry_blake2##bs##_##dbits##_hash_buffers, \
+      sizeof (BLAKE2##BS##_CONTEXT), selftests_blake2##bs \
+    };
+
+/* All exported BLAKE2b and BLAKE2s digest-length variants.  */
+DEFINE_BLAKE2_VARIANT(b, B, 512, "1.16")
+DEFINE_BLAKE2_VARIANT(b, B, 384, "1.12")
+DEFINE_BLAKE2_VARIANT(b, B, 256, "1.8")
+DEFINE_BLAKE2_VARIANT(b, B, 160, "1.5")
+
+DEFINE_BLAKE2_VARIANT(s, S, 256, "2.8")
+DEFINE_BLAKE2_VARIANT(s, S, 224, "2.7")
+DEFINE_BLAKE2_VARIANT(s, S, 160, "2.5")
+DEFINE_BLAKE2_VARIANT(s, S, 128, "2.4")
diff --git a/grub-core/lib/libgcrypt/cipher/blake2b-amd64-avx2.S b/grub-core/lib/libgcrypt/cipher/blake2b-amd64-avx2.S
new file mode 100644
index 000000000..43c2cce18
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/blake2b-amd64-avx2.S
@@ -0,0 +1,301 @@
+/* blake2b-amd64-avx2.S  -  AVX2 implementation of BLAKE2b
+ *
+ * Copyright (C) 2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* The code is based on public-domain/CC0 BLAKE2 reference implementation
+ * by Samual Neves, at https://github.com/BLAKE2/BLAKE2/tree/master/sse
+ * Copyright 2012, Samuel Neves <sneves@dei.uc.pt>
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_AVX2) && \
+   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#include "asm-common-amd64.h"
+
+/* register macros -- SysV AMD64 argument registers for
+ * (state, inblks, nblks); RIV is defined but not referenced in the
+ * visible code of this file.  */
+#define RSTATE  %rdi
+#define RINBLKS %rsi
+#define RNBLKS  %rdx
+#define RIV     %rcx
+
+/* state structure -- byte offsets into BLAKE2B_STATE: h[8] u64,
+ * then t[2], then f[2].  */
+#define STATE_H 0
+#define STATE_T (STATE_H + 8 * 8)
+#define STATE_F (STATE_T + 2 * 8)
+
+/* vector registers: ROW1..ROW4 hold the 4x4 u64 working matrix,
+ * MA*/MB* hold pre-gathered message words for alternating rounds.  */
+#define ROW1  %ymm0
+#define ROW2  %ymm1
+#define ROW3  %ymm2
+#define ROW4  %ymm3
+#define TMP1  %ymm4
+#define TMP1x %xmm4
+#define R16   %ymm5
+#define R24   %ymm6
+
+#define MA1   %ymm8
+#define MA2   %ymm9
+#define MA3   %ymm10
+#define MA4   %ymm11
+#define MA1x  %xmm8
+#define MA2x  %xmm9
+#define MA3x  %xmm10
+#define MA4x  %xmm11
+
+#define MB1   %ymm12
+#define MB2   %ymm13
+#define MB3   %ymm14
+#define MB4   %ymm15
+#define MB1x  %xmm12
+#define MB2x  %xmm13
+#define MB3x  %xmm14
+#define MB4x  %xmm15
+
+/**********************************************************************
+  blake2b/AVX2
+ **********************************************************************/
+
+/* Gather the 16 message words for one round into m1..m4 according to
+ * the sigma permutation s0..s15 (words are u64 at RINBLKS+si*8).
+ * The wrapped final lines of each LOAD_MSG_n below were rejoined;
+ * they had been split by email line-wrapping, which breaks the
+ * #define continuations.  */
+#define GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                   s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                   s9, s10, s11, s12, s13, s14, s15) \
+        vmovq (s0)*8(RINBLKS), m1x; \
+        vmovq (s4)*8(RINBLKS), TMP1x; \
+        vpinsrq $1, (s2)*8(RINBLKS), m1x, m1x; \
+        vpinsrq $1, (s6)*8(RINBLKS), TMP1x, TMP1x; \
+        vinserti128 $1, TMP1x, m1, m1; \
+          vmovq (s1)*8(RINBLKS), m2x; \
+          vmovq (s5)*8(RINBLKS), TMP1x; \
+          vpinsrq $1, (s3)*8(RINBLKS), m2x, m2x; \
+          vpinsrq $1, (s7)*8(RINBLKS), TMP1x, TMP1x; \
+          vinserti128 $1, TMP1x, m2, m2; \
+            vmovq (s8)*8(RINBLKS), m3x; \
+            vmovq (s12)*8(RINBLKS), TMP1x; \
+            vpinsrq $1, (s10)*8(RINBLKS), m3x, m3x; \
+            vpinsrq $1, (s14)*8(RINBLKS), TMP1x, TMP1x; \
+            vinserti128 $1, TMP1x, m3, m3; \
+              vmovq (s9)*8(RINBLKS), m4x; \
+              vmovq (s13)*8(RINBLKS), TMP1x; \
+              vpinsrq $1, (s11)*8(RINBLKS), m4x, m4x; \
+              vpinsrq $1, (s15)*8(RINBLKS), TMP1x, TMP1x; \
+              vinserti128 $1, TMP1x, m4, m4;
+
+/* One LOAD_MSG_n per sigma row (RFC 7693); rounds 10/11 reuse rows
+ * 0 and 1.  */
+#define LOAD_MSG_0(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15)
+#define LOAD_MSG_1(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                   14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3)
+#define LOAD_MSG_2(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                   11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4)
+#define LOAD_MSG_3(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                    7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8)
+#define LOAD_MSG_4(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                    9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13)
+#define LOAD_MSG_5(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                    2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9)
+#define LOAD_MSG_6(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                   12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11)
+#define LOAD_MSG_7(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                   13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10)
+#define LOAD_MSG_8(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                    6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5)
+#define LOAD_MSG_9(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                   10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0)
+#define LOAD_MSG_10(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        LOAD_MSG_0(m1, m2, m3, m4, m1x, m2x, m3x, m4x)
+#define LOAD_MSG_11(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        LOAD_MSG_1(m1, m2, m3, m4, m1x, m2x, m3x, m4x)
+
+#define LOAD_MSG(r, m1, m2, m3, m4) \
+        LOAD_MSG_##r(m1, m2, m3, m4, m1##x, m2##x, m3##x, m4##x)
+
+/* 64-bit lane right-rotations: 32 via dword shuffle, 24/16 via pshufb
+ * masks (R24/R16), 63 via shift+add+xor.  */
+#define ROR_32(in, out) vpshufd $0xb1, in, out;
+
+#define ROR_24(in, out) vpshufb R24, in, out;
+
+#define ROR_16(in, out) vpshufb R16, in, out;
+
+#define ROR_63(in, out) \
+        vpsrlq $63, in, TMP1; \
+        vpaddq in, in, out; \
+        vpxor  TMP1, out, out;
+
+/* Half of the BLAKE2b G function applied to all four columns at once;
+ * G1 uses rotations 32/24, G2 uses 16/63 (RFC 7693).  */
+#define G(r1, r2, r3, r4, m, ROR_A, ROR_B) \
+        vpaddq m, r1, r1; \
+        vpaddq r2, r1, r1; \
+        vpxor r1, r4, r4; \
+        ROR_A(r4, r4); \
+        vpaddq r4, r3, r3; \
+        vpxor r3, r2, r2; \
+        ROR_B(r2, r2);
+
+#define G1(r1, r2, r3, r4, m) \
+        G(r1, r2, r3, r4, m, ROR_32, ROR_24);
+
+#define G2(r1, r2, r3, r4, m) \
+        G(r1, r2, r3, r4, m, ROR_16, ROR_63);
+
+#define MM_SHUFFLE(z,y,x,w) \
+        (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+
+/* Rotate rows 2-4 so the diagonal step can reuse the column G code.  */
+#define DIAGONALIZE(r1, r2, r3, r4) \
+        vpermq $MM_SHUFFLE(0,3,2,1), r2, r2; \
+        vpermq $MM_SHUFFLE(1,0,3,2), r3, r3; \
+        vpermq $MM_SHUFFLE(2,1,0,3), r4, r4;
+
+#define UNDIAGONALIZE(r1, r2, r3, r4) \
+        vpermq $MM_SHUFFLE(2,1,0,3), r2, r2; \
+        vpermq $MM_SHUFFLE(1,0,3,2), r3, r3; \
+        vpermq $MM_SHUFFLE(0,3,2,1), r4, r4;
+
+/* One full round: column step, diagonalize, diagonal step, restore.  */
+#define ROUND(r, m1, m2, m3, m4) \
+        G1(ROW1, ROW2, ROW3, ROW4, m1); \
+        G2(ROW1, ROW2, ROW3, ROW4, m2); \
+        DIAGONALIZE(ROW1, ROW2, ROW3, ROW4); \
+        G1(ROW1, ROW2, ROW3, ROW4, m3); \
+        G2(ROW1, ROW2, ROW3, ROW4, m4); \
+        UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4);
+
+SECTION_RODATA
+.align 32
+ELF(.type _blake2b_avx2_data,@object;)
+_blake2b_avx2_data:
+/* BLAKE2b initialization vector (RFC 7693).  */
+.Liv:
+        .quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b
+        .quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
+        .quad 0x510e527fade682d1, 0x9b05688c2b3e6c1f
+        .quad 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
+/* vpshufb masks rotating each 64-bit lane right by 16/24 bits.  */
+.Lshuf_ror16:
+        .byte 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9
+.Lshuf_ror24:
+        .byte 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10
+
+.text
+.align 64
+.globl _gcry_blake2b_transform_amd64_avx2
+ELF(.type _gcry_blake2b_transform_amd64_avx2,@function;)
+
+/* Compress num_blks 128-byte blocks into the BLAKE2b state; returns 0
+ * in %eax (no stack to burn -- vzeroall clears vector state).  */
+_gcry_blake2b_transform_amd64_avx2:
+        /* input:
+         *     %rdi: state
+         *     %rsi: blks
+         *     %rdx: num_blks
+         */
+        CFI_STARTPROC();
+
+        vzeroupper;
+
+        /* Advance the 128-bit message counter t by one block.  */
+        addq $128, (STATE_T + 0)(RSTATE);
+        adcq $0, (STATE_T + 8)(RSTATE);
+
+        vbroadcasti128 .Lshuf_ror16 rRIP, R16;
+        vbroadcasti128 .Lshuf_ror24 rRIP, R24;
+
+        vmovdqa .Liv+(0 * 8) rRIP, ROW3;
+        vmovdqa .Liv+(4 * 8) rRIP, ROW4;
+
+        vmovdqu (STATE_H + 0 * 8)(RSTATE), ROW1;
+        vmovdqu (STATE_H + 4 * 8)(RSTATE), ROW2;
+
+        vpxor (STATE_T)(RSTATE), ROW4, ROW4;
+
+        LOAD_MSG(0, MA1, MA2, MA3, MA4);
+        LOAD_MSG(1, MB1, MB2, MB3, MB4);
+
+/* 12 rounds; message gathering for round r+2 is interleaved after
+ * round r to hide load latency.  */
+.Loop:
+        ROUND(0, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(2, MA1, MA2, MA3, MA4);
+        ROUND(1, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(3, MB1, MB2, MB3, MB4);
+        ROUND(2, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(4, MA1, MA2, MA3, MA4);
+        ROUND(3, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(5, MB1, MB2, MB3, MB4);
+        ROUND(4, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(6, MA1, MA2, MA3, MA4);
+        ROUND(5, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(7, MB1, MB2, MB3, MB4);
+        ROUND(6, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(8, MA1, MA2, MA3, MA4);
+        ROUND(7, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(9, MB1, MB2, MB3, MB4);
+        ROUND(8, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(10, MA1, MA2, MA3, MA4);
+        ROUND(9, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(11, MB1, MB2, MB3, MB4);
+        sub $1, RNBLKS;
+        jz .Loop_end;
+
+        /* More blocks follow: bump pointer and counter now so the
+         * next block's message words can be gathered during rounds
+         * 10/11 of this block.  */
+        lea 128(RINBLKS), RINBLKS;
+        addq $128, (STATE_T + 0)(RSTATE);
+        adcq $0, (STATE_T + 8)(RSTATE);
+
+        ROUND(10, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(0, MA1, MA2, MA3, MA4);
+        ROUND(11, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(1, MB1, MB2, MB3, MB4);
+
+        /* Feed-forward and reload IV/counter rows for the next block.  */
+        vpxor ROW3, ROW1, ROW1;
+        vpxor ROW4, ROW2, ROW2;
+
+        vmovdqa .Liv+(0 * 8) rRIP, ROW3;
+        vmovdqa .Liv+(4 * 8) rRIP, ROW4;
+
+        vpxor (STATE_H + 0 * 8)(RSTATE), ROW1, ROW1;
+        vpxor (STATE_H + 4 * 8)(RSTATE), ROW2, ROW2;
+
+        vmovdqu ROW1, (STATE_H + 0 * 8)(RSTATE);
+        vmovdqu ROW2, (STATE_H + 4 * 8)(RSTATE);
+
+        vpxor (STATE_T)(RSTATE), ROW4, ROW4;
+
+        jmp .Loop;
+
+.Loop_end:
+        ROUND(10, MA1, MA2, MA3, MA4);
+        ROUND(11, MB1, MB2, MB3, MB4);
+
+        vpxor ROW3, ROW1, ROW1;
+        vpxor ROW4, ROW2, ROW2;
+        vpxor (STATE_H + 0 * 8)(RSTATE), ROW1, ROW1;
+        vpxor (STATE_H + 4 * 8)(RSTATE), ROW2, ROW2;
+
+        vmovdqu ROW1, (STATE_H + 0 * 8)(RSTATE);
+        vmovdqu ROW2, (STATE_H + 4 * 8)(RSTATE);
+
+        xor %eax, %eax;
+        vzeroall;
+        ret_spec_stop;
+        CFI_ENDPROC();
+ELF(.size _gcry_blake2b_transform_amd64_avx2,
+    .-_gcry_blake2b_transform_amd64_avx2;)
+
+#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/
+#endif /*__x86_64*/
diff --git a/grub-core/lib/libgcrypt/cipher/blake2b-amd64-avx512.S b/grub-core/lib/libgcrypt/cipher/blake2b-amd64-avx512.S
new file mode 100644
index 000000000..b030849eb
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/blake2b-amd64-avx512.S
@@ -0,0 +1,429 @@
+/* blake2b-amd64-avx512.S  -  AVX512 implementation of BLAKE2b
+ *
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* The code is based on public-domain/CC0 BLAKE2 reference implementation
+ * by Samuel Neves, at https://github.com/BLAKE2/BLAKE2/tree/master/sse
+ * Copyright 2012, Samuel Neves <sneves@dei.uc.pt>
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#include "asm-common-amd64.h"
+
+/* register macros */
+#define RSTATE  %rdi
+#define RINBLKS %rsi
+#define RNBLKS  %rdx
+#define RIV     %rcx
+
+/* state structure */
+#define STATE_H 0
+#define STATE_T (STATE_H + 8 * 8)
+#define STATE_F (STATE_T + 2 * 8)
+
+/* vector registers */
+#define ROW1  %ymm0
+#define ROW2  %ymm1
+#define ROW3  %ymm2
+#define ROW4  %ymm3
+#define TMP1  %ymm4
+#define TMP1x %xmm4
+#define R16   %ymm13
+
+#define MA1   %ymm5
+#define MA2   %ymm6
+#define MA3   %ymm7
+#define MA4   %ymm8
+#define MA1x  %xmm5
+#define MA2x  %xmm6
+#define MA3x  %xmm7
+#define MA4x  %xmm8
+
+#define MB1   %ymm9
+#define MB2   %ymm10
+#define MB3   %ymm11
+#define MB4   %ymm12
+#define MB1x  %xmm9
+#define MB2x  %xmm10
+#define MB3x  %xmm11
+#define MB4x  %xmm12
+
+/**********************************************************************
+  blake2b/AVX512
+ **********************************************************************/
+
+#define VPINSRQ_KMASK(kpos, qpos, mem, vreg) /* "vpinsrq" into a ymm lane via masked load: */ \
+        vmovdqu64 -((qpos) * 8) + mem, vreg {kpos} /* opmask kpos writes only lane qpos; the -qpos*8 bias makes that lane read exactly from mem */
+
+#define GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, /* load 16 message qwords into m1..m4 in sigma order s0..s15 */ \
+                   s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                   s9, s10, s11, s12, s13, s14, s15) \
+        vmovq (s0)*8(RINBLKS), m1x; \
+          vmovq (s1)*8(RINBLKS), m2x; \
+            vmovq (s8)*8(RINBLKS), m3x; \
+              vmovq (s9)*8(RINBLKS), m4x; \
+        VPINSRQ_KMASK(%k1, 1, (s2)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k1, 1, (s3)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k1, 1, (s10)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k1, 1, (s11)*8(RINBLKS), m4); \
+        VPINSRQ_KMASK(%k2, 2, (s4)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k2, 2, (s5)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k2, 2, (s12)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k2, 2, (s13)*8(RINBLKS), m4); \
+        VPINSRQ_KMASK(%k3, 3, (s6)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k3, 3, (s7)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k3, 3, (s14)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k3, 3, (s15)*8(RINBLKS), m4);
+
+#define GATHER_MSG_2(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                     s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                     s9, s10, s11, s12, s13, s14, s15) \
+        vmovdqu (s0)*8(RINBLKS), m1x; /* merged load */ \
+          vmovq (s1)*8(RINBLKS), m2x; \
+            vmovq (s8)*8(RINBLKS), m3x; \
+              vmovq (s9)*8(RINBLKS), m4x; \
+          VPINSRQ_KMASK(%k1, 1, (s3)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k1, 1, (s10)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k1, 1, (s11)*8(RINBLKS), m4); \
+        VPINSRQ_KMASK(%k2, 2, (s4)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k2, 2, (s5)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k2, 2, (s12)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k2, 2, (s13)*8(RINBLKS), m4); \
+        VPINSRQ_KMASK(%k3, 3, (s6)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k3, 3, (s7)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k3, 3, (s14)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k3, 3, (s15)*8(RINBLKS), m4);
+
+#define GATHER_MSG_3(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                     s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                     s9, s10, s11, s12, s13, s14, s15) \
+        vmovq (s0)*8(RINBLKS), m1x; \
+          vmovq (s1)*8(RINBLKS), m2x; \
+            vmovdqu64 (s8)*8(RINBLKS), m3 {%k4}{z}; /* merged load */ \
+              vmovq (s9)*8(RINBLKS), m4x; \
+        VPINSRQ_KMASK(%k1, 1, (s2)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k1, 1, (s3)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k1, 1, (s10)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k1, 1, (s11)*8(RINBLKS), m4); \
+        VPINSRQ_KMASK(%k2, 2, (s4)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k2, 2, (s5)*8(RINBLKS), m2); \
+              VPINSRQ_KMASK(%k2, 2, (s13)*8(RINBLKS), m4); \
+        VPINSRQ_KMASK(%k3, 3, (s6)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k3, 3, (s7)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k3, 3, (s14)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k3, 3, (s15)*8(RINBLKS), m4);
+
+#define GATHER_MSG_5(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                     s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                     s9, s10, s11, s12, s13, s14, s15) \
+        vmovq (s0)*8(RINBLKS), m1x; \
+          vmovq (s1)*8(RINBLKS), m2x; \
+            vmovq (s8)*8(RINBLKS), m3x; \
+              vmovq (s9)*8(RINBLKS), m4x; \
+        VPINSRQ_KMASK(%k5, 1, (s2)*8(RINBLKS), m1); /* merged load */ \
+          VPINSRQ_KMASK(%k6, 1, (s3)*8(RINBLKS), m2); /* merged load */ \
+            VPINSRQ_KMASK(%k1, 1, (s10)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k1, 1, (s11)*8(RINBLKS), m4); \
+        VPINSRQ_KMASK(%k2, 2, (s4)*8(RINBLKS), m1); \
+            VPINSRQ_KMASK(%k2, 2, (s12)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k2, 2, (s13)*8(RINBLKS), m4); \
+          VPINSRQ_KMASK(%k3, 3, (s7)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k3, 3, (s14)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k3, 3, (s15)*8(RINBLKS), m4);
+
+#define GATHER_MSG_6(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                     s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                     s9, s10, s11, s12, s13, s14, s15) \
+        vmovdqu64 (s0)*8(RINBLKS), m1 {%k4}{z}; /* merged load */; \
+          vmovq (s1)*8(RINBLKS), m2x; \
+            vmovq (s8)*8(RINBLKS), m3x; \
+              vmovq (s9)*8(RINBLKS), m4x; \
+        VPINSRQ_KMASK(%k1, 1, (s2)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k1, 1, (s3)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k5, 1, (s10)*8(RINBLKS), m3); /* merged load */ \
+              VPINSRQ_KMASK(%k1, 1, (s11)*8(RINBLKS), m4); \
+          VPINSRQ_KMASK(%k2, 2, (s5)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k2, 2, (s12)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k2, 2, (s13)*8(RINBLKS), m4); \
+        VPINSRQ_KMASK(%k3, 3, (s6)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k3, 3, (s7)*8(RINBLKS), m2); \
+              VPINSRQ_KMASK(%k3, 3, (s15)*8(RINBLKS), m4);
+
+#define GATHER_MSG_8(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                     s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                     s9, s10, s11, s12, s13, s14, s15) \
+        vmovq (s0)*8(RINBLKS), m1x; \
+          vmovq (s1)*8(RINBLKS), m2x; \
+            vmovdqu (s8)*8(RINBLKS), m3x; /* merged load */ \
+              vmovq (s9)*8(RINBLKS), m4x; \
+        VPINSRQ_KMASK(%k1, 1, (s2)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k1, 1, (s3)*8(RINBLKS), m2); \
+              VPINSRQ_KMASK(%k1, 1, (s11)*8(RINBLKS), m4); \
+        VPINSRQ_KMASK(%k2, 2, (s4)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k2, 2, (s5)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k2, 2, (s12)*8(RINBLKS), m3); \
+              vinserti64x2 $1, (s13)*8(RINBLKS), m4, m4; /* merged load */ \
+        VPINSRQ_KMASK(%k3, 3, (s6)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k3, 3, (s7)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k3, 3, (s14)*8(RINBLKS), m3);
+
+#define GATHER_MSG_9(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                     s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                     s9, s10, s11, s12, s13, s14, s15) \
+        vmovq (s0)*8(RINBLKS), m1x; \
+          vmovdqu64 (s1)*8(RINBLKS), m2 {%k7}{z}; /* merged load */; \
+            vmovq (s8)*8(RINBLKS), m3x; \
+              vmovq (s9)*8(RINBLKS), m4x; \
+        VPINSRQ_KMASK(%k1, 1, (s2)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k1, 1, (s3)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k1, 1, (s10)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k1, 1, (s11)*8(RINBLKS), m4); \
+        VPINSRQ_KMASK(%k2, 2, (s4)*8(RINBLKS), m1); \
+          VPINSRQ_KMASK(%k2, 2, (s5)*8(RINBLKS), m2); \
+            VPINSRQ_KMASK(%k2, 2, (s12)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k2, 2, (s13)*8(RINBLKS), m4); \
+        VPINSRQ_KMASK(%k3, 3, (s6)*8(RINBLKS), m1); \
+            VPINSRQ_KMASK(%k3, 3, (s14)*8(RINBLKS), m3); \
+              VPINSRQ_KMASK(%k3, 3, (s15)*8(RINBLKS), m4);
+
+#define LOAD_MSG_0(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15)
+#define LOAD_MSG_1(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                   14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3)
+#define LOAD_MSG_2(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG_2(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                     11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4)
+#define LOAD_MSG_3(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG_3(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                     7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8)
+#define LOAD_MSG_4(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                    9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13)
+#define LOAD_MSG_5(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG_5(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                     2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9)
+#define LOAD_MSG_6(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG_6(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                     12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11)
+#define LOAD_MSG_7(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                   13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10)
+#define LOAD_MSG_8(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG_8(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                     6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5)
+#define LOAD_MSG_9(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG_9(m1, m2, m3, m4, m1x, m2x, m3x, m4x, \
+                     10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0)
+#define LOAD_MSG_10(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        LOAD_MSG_0(m1, m2, m3, m4, m1x, m2x, m3x, m4x)
+#define LOAD_MSG_11(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        LOAD_MSG_1(m1, m2, m3, m4, m1x, m2x, m3x, m4x)
+
+#define LOAD_MSG(r, m1, m2, m3, m4) /* load the round-r sigma message permutation */ \
+        LOAD_MSG_##r(m1, m2, m3, m4, m1##x, m2##x, m3##x, m4##x)
+
+#define ROR_32(in, out) vpshufd $0xb1, in, out
+
+#define ROR_24(in, out) vprorq $24, in, out
+
+#define ROR_16(in, out) vpshufb R16, in, out
+
+#define ROR_63(in, out) vprorq $63, in, out
+
+#define G(r1, r2, r3, r4, m, ROR_A, ROR_B) \
+        vpaddq m, r1, r1; \
+        vpaddq r2, r1, r1; \
+        vpxor r1, r4, r4; \
+        ROR_A(r4, r4); \
+        vpaddq r4, r3, r3; \
+        vpxor r3, r2, r2; \
+        ROR_B(r2, r2)
+
+#define G1(r1, r2, r3, r4, m) \
+        G(r1, r2, r3, r4, m, ROR_32, ROR_24)
+
+#define G2(r1, r2, r3, r4, m) \
+        G(r1, r2, r3, r4, m, ROR_16, ROR_63)
+
+#define MM_SHUFFLE(z,y,x,w) \
+        (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+
+#define DIAGONALIZE(r1, r2, r3, r4) \
+        vpermq $MM_SHUFFLE(0,3,2,1), r2, r2; \
+        vpermq $MM_SHUFFLE(1,0,3,2), r3, r3; \
+        vpermq $MM_SHUFFLE(2,1,0,3), r4, r4
+
+#define UNDIAGONALIZE(r1, r2, r3, r4) \
+        vpermq $MM_SHUFFLE(2,1,0,3), r2, r2; \
+        vpermq $MM_SHUFFLE(1,0,3,2), r3, r3; \
+        vpermq $MM_SHUFFLE(0,3,2,1), r4, r4
+
+#define ROUND(r, m1, m2, m3, m4) \
+        G1(ROW1, ROW2, ROW3, ROW4, m1); \
+        G2(ROW1, ROW2, ROW3, ROW4, m2); \
+        DIAGONALIZE(ROW1, ROW2, ROW3, ROW4); \
+        G1(ROW1, ROW2, ROW3, ROW4, m3); \
+        G2(ROW1, ROW2, ROW3, ROW4, m4); \
+        UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4)
+
+SECTION_RODATA
+
+.align 32
+ELF(.type _blake2b_avx512_data,@object;)
+_blake2b_avx512_data:
+.Liv: /* BLAKE2b initialization vector h0..h7 (also xor'd in at finalization) */
+        .quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b
+        .quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
+        .quad 0x510e527fade682d1, 0x9b05688c2b3e6c1f
+        .quad 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
+.Lshuf_ror16: /* vpshufb mask: rotate each 64-bit lane right by 16 bits (ROR_16) */
+        .byte 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9
+.Lk1_mask: /* opmask patterns for VPINSRQ_KMASK: bit i enables qword lane i */
+       .byte (1 << 1)
+.Lk4_mask: /* lanes 0 and 2 */
+       .byte (1 << 0) + (1 << 2)
+.Lk5_mask: /* lanes 1 and 3 */
+       .byte (1 << 1) + (1 << 3)
+.Lk6_mask: /* lanes 1 and 2 */
+       .byte (1 << 1) + (1 << 2)
+.Lk7_mask: /* lanes 0 and 3 */
+       .byte (1 << 0) + (1 << 3)
+
+.text
+
+.align 64
+.globl _gcry_blake2b_transform_amd64_avx512
+ELF(.type _gcry_blake2b_transform_amd64_avx512,@function;)
+
+_gcry_blake2b_transform_amd64_avx512:
+        /* input:
+         *     %rdi: state
+         *     %rsi: blks
+         *     %rdx: num_blks
+         */
+        CFI_STARTPROC();
+
+        spec_stop_avx512;
+
+        kmovb .Lk1_mask rRIP, %k1; /* opmask: qword lane 1 (see .Lk*_mask) */
+        kshiftlb $1, %k1, %k2; /* lane 2 */
+        kshiftlb $2, %k1, %k3; /* lane 3 */
+        kmovb .Lk4_mask rRIP, %k4; /* lanes 0,2 */
+        kmovb .Lk5_mask rRIP, %k5; /* lanes 1,3 */
+        kmovb .Lk6_mask rRIP, %k6; /* lanes 1,2 */
+        kmovb .Lk7_mask rRIP, %k7; /* lanes 0,3 */
+
+        addq $128, (STATE_T + 0)(RSTATE); /* t += one 128-byte block */
+        adcq $0, (STATE_T + 8)(RSTATE); /* carry into high half of 128-bit counter */
+
+        vbroadcasti128 .Lshuf_ror16 rRIP, R16; /* shuffle mask used by ROR_16 */
+
+        vmovdqa .Liv+(0 * 8) rRIP, ROW3; /* rows 3-4 of the work matrix: IV */
+        vmovdqa .Liv+(4 * 8) rRIP, ROW4;
+
+        vmovdqu (STATE_H + 0 * 8)(RSTATE), ROW1; /* rows 1-2: hash state h0..h7 */
+        vmovdqu (STATE_H + 4 * 8)(RSTATE), ROW2;
+
+        vpxor (STATE_T)(RSTATE), ROW4, ROW4; /* fold counter t into row 4 */
+
+        LOAD_MSG(0, MA1, MA2, MA3, MA4); /* pre-load message for rounds 0/1 */
+        LOAD_MSG(1, MB1, MB2, MB3, MB4);
+
+.align 16
+.Loop: /* 12 rounds/block; loads for later rounds are interleaved with earlier rounds */
+        ROUND(0, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(2, MA1, MA2, MA3, MA4);
+        ROUND(1, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(3, MB1, MB2, MB3, MB4);
+        ROUND(2, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(4, MA1, MA2, MA3, MA4);
+        ROUND(3, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(5, MB1, MB2, MB3, MB4);
+        ROUND(4, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(6, MA1, MA2, MA3, MA4);
+        ROUND(5, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(7, MB1, MB2, MB3, MB4);
+        ROUND(6, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(8, MA1, MA2, MA3, MA4);
+        ROUND(7, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(9, MB1, MB2, MB3, MB4);
+        ROUND(8, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(10, MA1, MA2, MA3, MA4);
+        ROUND(9, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(11, MB1, MB2, MB3, MB4);
+        sub $1, RNBLKS;
+        jz .Loop_end;
+
+        lea 128(RINBLKS), RINBLKS; /* advance to next input block */
+        addq $128, (STATE_T + 0)(RSTATE); /* t += one 128-byte block */
+        adcq $0, (STATE_T + 8)(RSTATE);
+
+        ROUND(10, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(0, MA1, MA2, MA3, MA4);
+        ROUND(11, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(1, MB1, MB2, MB3, MB4);
+
+        vpternlogq $0x96, (STATE_H + 0 * 8)(RSTATE), ROW3, ROW1; /* imm 0x96 = three-way XOR: feed-forward h ^ row1 ^ row3 */
+        vpternlogq $0x96, (STATE_H + 4 * 8)(RSTATE), ROW4, ROW2;
+
+        vmovdqa .Liv+(0 * 8) rRIP, ROW3;
+        vmovdqa .Liv+(4 * 8) rRIP, ROW4;
+
+        vmovdqu ROW1, (STATE_H + 0 * 8)(RSTATE); /* store updated hash state */
+        vmovdqu ROW2, (STATE_H + 4 * 8)(RSTATE);
+
+        vpxor (STATE_T)(RSTATE), ROW4, ROW4;
+
+        jmp .Loop;
+
+.align 16
+.Loop_end:
+        ROUND(10, MA1, MA2, MA3, MA4);
+        ROUND(11, MB1, MB2, MB3, MB4);
+
+        vpternlogq $0x96, (STATE_H + 0 * 8)(RSTATE), ROW3, ROW1; /* three-way XOR feed-forward */
+        vpternlogq $0x96, (STATE_H + 4 * 8)(RSTATE), ROW4, ROW2;
+
+        vmovdqu ROW1, (STATE_H + 0 * 8)(RSTATE);
+        vmovdqu ROW2, (STATE_H + 4 * 8)(RSTATE);
+
+        xorl %eax, %eax; /* return 0 (NOTE(review): presumably the stack-burn-depth convention; confirm against blake2.c caller) */
+        kxord %k1, %k1, %k1; /* clear opmask regs: vzeroall clears only vector registers */
+        kxord %k2, %k2, %k2;
+        kxord %k3, %k3, %k3;
+        kxord %k4, %k4, %k4;
+        kxord %k5, %k5, %k5;
+        kxord %k6, %k6, %k6;
+        kxord %k7, %k7, %k7;
+
+        vzeroall; /* clear SIMD registers */
+        ret_spec_stop;
+        CFI_ENDPROC();
+ELF(.size _gcry_blake2b_transform_amd64_avx512,
+    .-_gcry_blake2b_transform_amd64_avx512;)
+
+#endif /*defined(HAVE_GCC_INLINE_ASM_AVX512) && (HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS || HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)*/
+#endif /*__x86_64*/
diff --git a/grub-core/lib/libgcrypt/cipher/blake2s-amd64-avx.S b/grub-core/lib/libgcrypt/cipher/blake2s-amd64-avx.S
new file mode 100644
index 000000000..44b82ab2d
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/blake2s-amd64-avx.S
@@ -0,0 +1,281 @@
+/* blake2s-amd64-avx.S  -  AVX implementation of BLAKE2s
+ *
+ * Copyright (C) 2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* The code is based on public-domain/CC0 BLAKE2 reference implementation
+ * by Samuel Neves, at https://github.com/BLAKE2/BLAKE2/tree/master/sse
+ * Copyright 2012, Samuel Neves <sneves@dei.uc.pt>
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_AVX) && \
+   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#include "asm-common-amd64.h"
+
+/* register macros */
+#define RSTATE  %rdi
+#define RINBLKS %rsi
+#define RNBLKS  %rdx
+#define RIV     %rcx
+
+/* state structure */
+#define STATE_H 0
+#define STATE_T (STATE_H + 8 * 4)
+#define STATE_F (STATE_T + 2 * 4)
+
+/* vector registers */
+#define ROW1  %xmm0
+#define ROW2  %xmm1
+#define ROW3  %xmm2
+#define ROW4  %xmm3
+#define TMP1  %xmm4
+#define TMP1x %xmm4
+#define R16   %xmm5
+#define R8    %xmm6
+
+#define MA1   %xmm8
+#define MA2   %xmm9
+#define MA3   %xmm10
+#define MA4   %xmm11
+
+#define MB1   %xmm12
+#define MB2   %xmm13
+#define MB3   %xmm14
+#define MB4   %xmm15
+
+/**********************************************************************
+  blake2s/AVX
+ **********************************************************************/
+
+#define GATHER_MSG(m1, m2, m3, m4, /* load 16 message words into m1..m4 in sigma order s0..s15 */ \
+                   s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                   s9, s10, s11, s12, s13, s14, s15) \
+        vmovd (s0)*4(RINBLKS), m1; \
+          vmovd (s1)*4(RINBLKS), m2; \
+            vmovd (s8)*4(RINBLKS), m3; \
+              vmovd (s9)*4(RINBLKS), m4; \
+        vpinsrd $1, (s2)*4(RINBLKS), m1, m1; \
+          vpinsrd $1, (s3)*4(RINBLKS), m2, m2; \
+            vpinsrd $1, (s10)*4(RINBLKS), m3, m3; \
+              vpinsrd $1, (s11)*4(RINBLKS), m4, m4; \
+        vpinsrd $2, (s4)*4(RINBLKS), m1, m1; \
+          vpinsrd $2, (s5)*4(RINBLKS), m2, m2; \
+            vpinsrd $2, (s12)*4(RINBLKS), m3, m3; \
+              vpinsrd $2, (s13)*4(RINBLKS), m4, m4; \
+        vpinsrd $3, (s6)*4(RINBLKS), m1, m1; \
+          vpinsrd $3, (s7)*4(RINBLKS), m2, m2; \
+            vpinsrd $3, (s14)*4(RINBLKS), m3, m3; \
+              vpinsrd $3, (s15)*4(RINBLKS), m4, m4;
+
+#define LOAD_MSG_0(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15)
+#define LOAD_MSG_1(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                   14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3)
+#define LOAD_MSG_2(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                   11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4)
+#define LOAD_MSG_3(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                    7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8)
+#define LOAD_MSG_4(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                    9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13)
+#define LOAD_MSG_5(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                    2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9)
+#define LOAD_MSG_6(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                   12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11)
+#define LOAD_MSG_7(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                   13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10)
+#define LOAD_MSG_8(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                    6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5)
+#define LOAD_MSG_9(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                   10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0)
+
+#define LOAD_MSG(r, m1, m2, m3, m4) LOAD_MSG_##r(m1, m2, m3, m4) /* round-r sigma message permutation */
+
+#define ROR_16(in, out) vpshufb R16, in, out;
+
+#define ROR_8(in, out)  vpshufb R8, in, out;
+
+#define ROR_12(in, out) \
+        vpsrld $12, in, TMP1; \
+        vpslld $(32 - 12), in, out; \
+        vpxor TMP1, out, out;
+
+#define ROR_7(in, out) \
+        vpsrld $7, in, TMP1; \
+        vpslld $(32 - 7), in, out; \
+        vpxor TMP1, out, out;
+
+#define G(r1, r2, r3, r4, m, ROR_A, ROR_B) \
+        vpaddd m, r1, r1; \
+        vpaddd r2, r1, r1; \
+        vpxor r1, r4, r4; \
+        ROR_A(r4, r4); \
+        vpaddd r4, r3, r3; \
+        vpxor r3, r2, r2; \
+        ROR_B(r2, r2);
+
+#define G1(r1, r2, r3, r4, m) \
+        G(r1, r2, r3, r4, m, ROR_16, ROR_12);
+
+#define G2(r1, r2, r3, r4, m) \
+        G(r1, r2, r3, r4, m, ROR_8, ROR_7);
+
+#define MM_SHUFFLE(z,y,x,w) \
+        (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+
+#define DIAGONALIZE(r1, r2, r3, r4) \
+        vpshufd $MM_SHUFFLE(0,3,2,1), r2, r2; \
+        vpshufd $MM_SHUFFLE(1,0,3,2), r3, r3; \
+        vpshufd $MM_SHUFFLE(2,1,0,3), r4, r4;
+
+#define UNDIAGONALIZE(r1, r2, r3, r4) \
+        vpshufd $MM_SHUFFLE(2,1,0,3), r2, r2; \
+        vpshufd $MM_SHUFFLE(1,0,3,2), r3, r3; \
+        vpshufd $MM_SHUFFLE(0,3,2,1), r4, r4;
+
+#define ROUND(r, m1, m2, m3, m4) \
+        G1(ROW1, ROW2, ROW3, ROW4, m1); \
+        G2(ROW1, ROW2, ROW3, ROW4, m2); \
+        DIAGONALIZE(ROW1, ROW2, ROW3, ROW4); \
+        G1(ROW1, ROW2, ROW3, ROW4, m3); \
+        G2(ROW1, ROW2, ROW3, ROW4, m4); \
+        UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4);
+
+SECTION_RODATA
+
+.align 16
+ELF(.type _blake2s_avx_data,@object;)
+_blake2s_avx_data:
+.Liv: /* BLAKE2s initialization vector h0..h7 (also xor'd in at finalization) */
+        .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
+        .long 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+.Lshuf_ror16: /* vpshufb mask: rotate each 32-bit lane right by 16 bits (ROR_16) */
+        .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+.Lshuf_ror8: /* vpshufb mask: rotate each 32-bit lane right by 8 bits (ROR_8) */
+        .byte 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12
+
+.text
+
+.align 64
+.globl _gcry_blake2s_transform_amd64_avx
+ELF(.type _gcry_blake2s_transform_amd64_avx,@function;)
+
+_gcry_blake2s_transform_amd64_avx:
+        /* input:
+         *     %rdi: state
+         *     %rsi: blks
+         *     %rdx: num_blks
+         */
+        CFI_STARTPROC();
+
+        vzeroupper;
+
+        addq $64, (STATE_T + 0)(RSTATE); /* t += one 64-byte block: a 64-bit add spans both 32-bit counter words, propagating the carry t0->t1 */
+
+        vmovdqa .Lshuf_ror16 rRIP, R16; /* shuffle masks for ROR_16/ROR_8 */
+        vmovdqa .Lshuf_ror8 rRIP, R8;
+
+        vmovdqa .Liv+(0 * 4) rRIP, ROW3; /* rows 3-4 of the work matrix: IV */
+        vmovdqa .Liv+(4 * 4) rRIP, ROW4;
+
+        vmovdqu (STATE_H + 0 * 4)(RSTATE), ROW1; /* rows 1-2: hash state h0..h7 */
+        vmovdqu (STATE_H + 4 * 4)(RSTATE), ROW2;
+
+        vpxor (STATE_T)(RSTATE), ROW4, ROW4; /* fold counter t into row 4 */
+
+        LOAD_MSG(0, MA1, MA2, MA3, MA4); /* pre-load message for rounds 0/1 */
+        LOAD_MSG(1, MB1, MB2, MB3, MB4);
+
+.Loop: /* 10 rounds/block; loads for later rounds are interleaved with earlier rounds */
+        ROUND(0, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(2, MA1, MA2, MA3, MA4);
+        ROUND(1, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(3, MB1, MB2, MB3, MB4);
+        ROUND(2, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(4, MA1, MA2, MA3, MA4);
+        ROUND(3, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(5, MB1, MB2, MB3, MB4);
+        ROUND(4, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(6, MA1, MA2, MA3, MA4);
+        ROUND(5, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(7, MB1, MB2, MB3, MB4);
+        ROUND(6, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(8, MA1, MA2, MA3, MA4);
+        ROUND(7, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(9, MB1, MB2, MB3, MB4);
+        sub $1, RNBLKS;
+        jz .Loop_end;
+
+        lea 64(RINBLKS), RINBLKS; /* advance to next input block */
+        addq $64, (STATE_T + 0)(RSTATE); /* t += one 64-byte block */
+
+        ROUND(8, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(0, MA1, MA2, MA3, MA4);
+        ROUND(9, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(1, MB1, MB2, MB3, MB4);
+
+        vpxor ROW3, ROW1, ROW1; /* feed-forward: h ^= row1^row3 / row2^row4 */
+        vpxor ROW4, ROW2, ROW2;
+
+        vmovdqa .Liv+(0 * 4) rRIP, ROW3;
+        vmovdqa .Liv+(4 * 4) rRIP, ROW4;
+
+        vpxor (STATE_H + 0 * 4)(RSTATE), ROW1, ROW1;
+        vpxor (STATE_H + 4 * 4)(RSTATE), ROW2, ROW2;
+
+        vmovdqu ROW1, (STATE_H + 0 * 4)(RSTATE); /* store updated hash state */
+        vmovdqu ROW2, (STATE_H + 4 * 4)(RSTATE);
+
+        vpxor (STATE_T)(RSTATE), ROW4, ROW4;
+
+        jmp .Loop;
+
+.Loop_end:
+        ROUND(8, MA1, MA2, MA3, MA4);
+        ROUND(9, MB1, MB2, MB3, MB4);
+
+        vpxor ROW3, ROW1, ROW1; /* feed-forward */
+        vpxor ROW4, ROW2, ROW2;
+        vpxor (STATE_H + 0 * 4)(RSTATE), ROW1, ROW1;
+        vpxor (STATE_H + 4 * 4)(RSTATE), ROW2, ROW2;
+
+        vmovdqu ROW1, (STATE_H + 0 * 4)(RSTATE);
+        vmovdqu ROW2, (STATE_H + 4 * 4)(RSTATE);
+
+        xor %eax, %eax; /* return 0 (NOTE(review): presumably the stack-burn-depth convention; confirm against blake2.c caller) */
+        vzeroall; /* clear SIMD registers */
+        ret_spec_stop;
+        CFI_ENDPROC();
+ELF(.size _gcry_blake2s_transform_amd64_avx,
+    .-_gcry_blake2s_transform_amd64_avx;)
+
+#endif /*defined(HAVE_GCC_INLINE_ASM_AVX) && (HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS || HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)*/
+#endif /*__x86_64*/
diff --git a/grub-core/lib/libgcrypt/cipher/blake2s-amd64-avx512.S b/grub-core/lib/libgcrypt/cipher/blake2s-amd64-avx512.S
new file mode 100644
index 000000000..543944bfd
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/blake2s-amd64-avx512.S
@@ -0,0 +1,397 @@
+/* blake2s-amd64-avx512.S  -  AVX512 implementation of BLAKE2s
+ *
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* The code is based on public-domain/CC0 BLAKE2 reference implementation
+ * by Samual Neves, at https://github.com/BLAKE2/BLAKE2/tree/master/sse
+ * Copyright 2012, Samuel Neves <sneves@dei.uc.pt>
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#include "asm-common-amd64.h"
+
+/* register macros */
+#define RSTATE  %rdi
+#define RINBLKS %rsi
+#define RNBLKS  %rdx
+#define RIV     %rcx
+
+/* state structure */
+#define STATE_H 0
+#define STATE_T (STATE_H + 8 * 4)
+#define STATE_F (STATE_T + 2 * 4)
+
+/* vector registers */
+#define ROW1  %xmm0
+#define ROW2  %xmm1
+#define ROW3  %xmm2
+#define ROW4  %xmm3
+#define TMP1  %xmm4
+#define TMP1x %xmm4
+
+#define MA1   %xmm5
+#define MA2   %xmm6
+#define MA3   %xmm7
+#define MA4   %xmm8
+
+#define MB1   %xmm9
+#define MB2   %xmm10
+#define MB3   %xmm11
+#define MB4   %xmm12
+
+/**********************************************************************
+  blake2s/AVX
+ **********************************************************************/
+
+#define VPINSRD_KMASK(kpos, dpos, mem, vreg) \
+        vmovdqu32 -((dpos) * 4) + mem, vreg {kpos}
+
+#define GATHER_MSG(m1, m2, m3, m4, \
+                   s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                   s9, s10, s11, s12, s13, s14, s15) \
+        vmovd (s0)*4(RINBLKS), m1; \
+          vmovd (s1)*4(RINBLKS), m2; \
+            vmovd (s8)*4(RINBLKS), m3; \
+              vmovd (s9)*4(RINBLKS), m4; \
+        vpinsrd $1, (s2)*4(RINBLKS), m1, m1; \
+          vpinsrd $1, (s3)*4(RINBLKS), m2, m2; \
+            vpinsrd $1, (s10)*4(RINBLKS), m3, m3; \
+              vpinsrd $1, (s11)*4(RINBLKS), m4, m4; \
+        vpinsrd $2, (s4)*4(RINBLKS), m1, m1; \
+          vpinsrd $2, (s5)*4(RINBLKS), m2, m2; \
+            vpinsrd $2, (s12)*4(RINBLKS), m3, m3; \
+              vpinsrd $2, (s13)*4(RINBLKS), m4, m4; \
+        vpinsrd $3, (s6)*4(RINBLKS), m1, m1; \
+          vpinsrd $3, (s7)*4(RINBLKS), m2, m2; \
+            vpinsrd $3, (s14)*4(RINBLKS), m3, m3; \
+              vpinsrd $3, (s15)*4(RINBLKS), m4, m4;
+
+#define GATHER_MSG_2(m1, m2, m3, m4, \
+                   s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                   s9, s10, s11, s12, s13, s14, s15) \
+        vmovq (s0)*4(RINBLKS), m1; /* merged load */ \
+          vmovd (s1)*4(RINBLKS), m2; \
+            vmovd (s8)*4(RINBLKS), m3; \
+              vmovd (s9)*4(RINBLKS), m4; \
+          vpinsrd $1, (s3)*4(RINBLKS), m2, m2; \
+            vpinsrd $1, (s10)*4(RINBLKS), m3, m3; \
+              vpinsrd $1, (s11)*4(RINBLKS), m4, m4; \
+        vpinsrd $2, (s4)*4(RINBLKS), m1, m1; \
+          vpinsrd $2, (s5)*4(RINBLKS), m2, m2; \
+            vpinsrd $2, (s12)*4(RINBLKS), m3, m3; \
+              vpinsrd $2, (s13)*4(RINBLKS), m4, m4; \
+        vpinsrd $3, (s6)*4(RINBLKS), m1, m1; \
+          vpinsrd $3, (s7)*4(RINBLKS), m2, m2; \
+            vpinsrd $3, (s14)*4(RINBLKS), m3, m3; \
+              vpinsrd $3, (s15)*4(RINBLKS), m4, m4;
+
+#define GATHER_MSG_3(m1, m2, m3, m4, \
+                   s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                   s9, s10, s11, s12, s13, s14, s15) \
+        vmovd (s0)*4(RINBLKS), m1; \
+          vmovd (s1)*4(RINBLKS), m2; \
+            vmovdqu32 (s8)*4(RINBLKS), m3 {%k4}{z}; /* merged load */ \
+              vmovd (s9)*4(RINBLKS), m4; \
+        vpinsrd $1, (s2)*4(RINBLKS), m1, m1; \
+          vpinsrd $1, (s3)*4(RINBLKS), m2, m2; \
+            vpinsrd $1, (s10)*4(RINBLKS), m3, m3; \
+              vpinsrd $1, (s11)*4(RINBLKS), m4, m4; \
+        vpinsrd $2, (s4)*4(RINBLKS), m1, m1; \
+          vpinsrd $2, (s5)*4(RINBLKS), m2, m2; \
+            vpinsrd $2, (s12)*4(RINBLKS), m3, m3; \
+              vpinsrd $2, (s13)*4(RINBLKS), m4, m4; \
+        vpinsrd $3, (s6)*4(RINBLKS), m1, m1; \
+          vpinsrd $3, (s7)*4(RINBLKS), m2, m2; \
+            vpinsrd $3, (s14)*4(RINBLKS), m3, m3; \
+              vpinsrd $3, (s15)*4(RINBLKS), m4, m4;
+
+#define GATHER_MSG_5(m1, m2, m3, m4, \
+                   s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                   s9, s10, s11, s12, s13, s14, s15) \
+        vmovd (s0)*4(RINBLKS), m1; \
+          vmovd (s1)*4(RINBLKS), m2; \
+            vmovd (s8)*4(RINBLKS), m3; \
+              vmovd (s9)*4(RINBLKS), m4; \
+        VPINSRD_KMASK(%k5, 1, (s2)*4(RINBLKS), m1); /* merged load */ \
+          VPINSRD_KMASK(%k6, 1, (s3)*4(RINBLKS), m2); /* merged load */ \
+            vpinsrd $1, (s10)*4(RINBLKS), m3, m3; \
+              vpinsrd $1, (s11)*4(RINBLKS), m4, m4; \
+        vpinsrd $2, (s4)*4(RINBLKS), m1, m1; \
+            vpinsrd $2, (s12)*4(RINBLKS), m3, m3; \
+              vpinsrd $2, (s13)*4(RINBLKS), m4, m4; \
+          vpinsrd $3, (s7)*4(RINBLKS), m2, m2; \
+            vpinsrd $3, (s14)*4(RINBLKS), m3, m3; \
+              vpinsrd $3, (s15)*4(RINBLKS), m4, m4;
+
+#define GATHER_MSG_6(m1, m2, m3, m4, \
+                   s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                   s9, s10, s11, s12, s13, s14, s15) \
+        vmovdqu32 (s0)*4(RINBLKS), m1 {%k4}{z}; /* merged load */; \
+          vmovd (s1)*4(RINBLKS), m2; \
+            vmovd (s8)*4(RINBLKS), m3; \
+              vmovd (s9)*4(RINBLKS), m4; \
+        vpinsrd $1, (s2)*4(RINBLKS), m1, m1; \
+          vpinsrd $1, (s3)*4(RINBLKS), m2, m2; \
+            VPINSRD_KMASK(%k5, 1, (s10)*4(RINBLKS), m3); /* merged load */ \
+              vpinsrd $1, (s11)*4(RINBLKS), m4, m4; \
+          vpinsrd $2, (s5)*4(RINBLKS), m2, m2; \
+            vpinsrd $2, (s12)*4(RINBLKS), m3, m3; \
+              vpinsrd $2, (s13)*4(RINBLKS), m4, m4; \
+        vpinsrd $3, (s6)*4(RINBLKS), m1, m1; \
+          vpinsrd $3, (s7)*4(RINBLKS), m2, m2; \
+              vpinsrd $3, (s15)*4(RINBLKS), m4, m4;
+
+#define GATHER_MSG_8(m1, m2, m3, m4, \
+                   s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                   s9, s10, s11, s12, s13, s14, s15) \
+        vmovd (s0)*4(RINBLKS), m1; \
+          vmovd (s1)*4(RINBLKS), m2; \
+            vmovq (s8)*4(RINBLKS), m3; \
+              vmovd (s9)*4(RINBLKS), m4; \
+        vpinsrd $1, (s2)*4(RINBLKS), m1, m1; \
+          vpinsrd $1, (s3)*4(RINBLKS), m2, m2; \
+              vpinsrd $1, (s11)*4(RINBLKS), m4, m4; \
+        vpinsrd $2, (s4)*4(RINBLKS), m1, m1; \
+          vpinsrd $2, (s5)*4(RINBLKS), m2, m2; \
+            vpinsrd $2, (s12)*4(RINBLKS), m3, m3; \
+              vpinsrq $1, (s13)*4(RINBLKS), m4, m4; \
+        vpinsrd $3, (s6)*4(RINBLKS), m1, m1; \
+          vpinsrd $3, (s7)*4(RINBLKS), m2, m2; \
+            vpinsrd $3, (s14)*4(RINBLKS), m3, m3;
+
+#define GATHER_MSG_9(m1, m2, m3, m4, \
+                   s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                   s9, s10, s11, s12, s13, s14, s15) \
+        vmovd (s0)*4(RINBLKS), m1; \
+          vmovdqu32 (s1)*4(RINBLKS), m2 {%k7}{z}; /* merged load */; \
+            vmovd (s8)*4(RINBLKS), m3; \
+              vmovd (s9)*4(RINBLKS), m4; \
+        vpinsrd $1, (s2)*4(RINBLKS), m1, m1; \
+          vpinsrd $1, (s3)*4(RINBLKS), m2, m2; \
+            vpinsrd $1, (s10)*4(RINBLKS), m3, m3; \
+              vpinsrd $1, (s11)*4(RINBLKS), m4, m4; \
+        vpinsrd $2, (s4)*4(RINBLKS), m1, m1; \
+          vpinsrd $2, (s5)*4(RINBLKS), m2, m2; \
+            vpinsrd $2, (s12)*4(RINBLKS), m3, m3; \
+              vpinsrd $2, (s13)*4(RINBLKS), m4, m4; \
+        vpinsrd $3, (s6)*4(RINBLKS), m1, m1; \
+            vpinsrd $3, (s14)*4(RINBLKS), m3, m3; \
+              vpinsrd $3, (s15)*4(RINBLKS), m4, m4;
+
+#define LOAD_MSG_0(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15)
+#define LOAD_MSG_1(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                   14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3)
+#define LOAD_MSG_2(m1, m2, m3, m4) \
+        GATHER_MSG_2(m1, m2, m3, m4, \
+                     11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4)
+#define LOAD_MSG_3(m1, m2, m3, m4) \
+        GATHER_MSG_3(m1, m2, m3, m4, \
+                     7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8)
+#define LOAD_MSG_4(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                    9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13)
+#define LOAD_MSG_5(m1, m2, m3, m4) \
+        GATHER_MSG_5(m1, m2, m3, m4, \
+                     2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9)
+#define LOAD_MSG_6(m1, m2, m3, m4) \
+        GATHER_MSG_6(m1, m2, m3, m4, \
+                     12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11)
+#define LOAD_MSG_7(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                   13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10)
+#define LOAD_MSG_8(m1, m2, m3, m4) \
+        GATHER_MSG_8(m1, m2, m3, m4, \
+                     6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5)
+#define LOAD_MSG_9(m1, m2, m3, m4) \
+        GATHER_MSG_9(m1, m2, m3, m4, \
+                     10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0)
+
+#define LOAD_MSG(r, m1, m2, m3, m4) LOAD_MSG_##r(m1, m2, m3, m4)
+
+#define ROR_16(in, out) vprord $16, in, out;
+
+#define ROR_8(in, out)  vprord $8, in, out;
+
+#define ROR_12(in, out) vprord $12, in, out;
+
+#define ROR_7(in, out) vprord $7, in, out;
+
+#define G(r1, r2, r3, r4, m, ROR_A, ROR_B) \
+        vpaddd m, r1, r1; \
+        vpaddd r2, r1, r1; \
+        vpxor r1, r4, r4; \
+        ROR_A(r4, r4); \
+        vpaddd r4, r3, r3; \
+        vpxor r3, r2, r2; \
+        ROR_B(r2, r2);
+
+#define G1(r1, r2, r3, r4, m) \
+        G(r1, r2, r3, r4, m, ROR_16, ROR_12);
+
+#define G2(r1, r2, r3, r4, m) \
+        G(r1, r2, r3, r4, m, ROR_8, ROR_7);
+
+#define MM_SHUFFLE(z,y,x,w) \
+        (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+
+#define DIAGONALIZE(r1, r2, r3, r4) \
+        vpshufd $MM_SHUFFLE(0,3,2,1), r2, r2; \
+        vpshufd $MM_SHUFFLE(1,0,3,2), r3, r3; \
+        vpshufd $MM_SHUFFLE(2,1,0,3), r4, r4;
+
+#define UNDIAGONALIZE(r1, r2, r3, r4) \
+        vpshufd $MM_SHUFFLE(2,1,0,3), r2, r2; \
+        vpshufd $MM_SHUFFLE(1,0,3,2), r3, r3; \
+        vpshufd $MM_SHUFFLE(0,3,2,1), r4, r4;
+
+#define ROUND(r, m1, m2, m3, m4) \
+        G1(ROW1, ROW2, ROW3, ROW4, m1); \
+        G2(ROW1, ROW2, ROW3, ROW4, m2); \
+        DIAGONALIZE(ROW1, ROW2, ROW3, ROW4); \
+        G1(ROW1, ROW2, ROW3, ROW4, m3); \
+        G2(ROW1, ROW2, ROW3, ROW4, m4); \
+        UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4);
+
+SECTION_RODATA
+
+ELF(.type _blake2s_avx512_data,@object;)
+.align 16
+_blake2s_avx512_data:
+.Liv:
+        .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
+        .long 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+.Lk4_mask:
+       .byte (1 << 0) + (1 << 2)
+.Lk5_mask:
+       .byte (1 << 1) + (1 << 3)
+.Lk6_mask:
+       .byte (1 << 1) + (1 << 2)
+.Lk7_mask:
+       .byte (1 << 0) + (1 << 3)
+
+.text
+
+.align 64
+.globl _gcry_blake2s_transform_amd64_avx512
+ELF(.type _gcry_blake2s_transform_amd64_avx512,@function;)
+
+_gcry_blake2s_transform_amd64_avx512:
+        /* input:
+         *     %rdi: state
+         *     %rsi: blks
+         *     %rdx: num_blks
+         */
+        CFI_STARTPROC();
+
+        spec_stop_avx512;
+
+        kmovb .Lk4_mask rRIP, %k4;
+        kmovb .Lk5_mask rRIP, %k5;
+        kmovb .Lk6_mask rRIP, %k6;
+        kmovb .Lk7_mask rRIP, %k7;
+
+        addq $64, (STATE_T + 0)(RSTATE);
+
+        vmovdqa .Liv+(0 * 4) rRIP, ROW3;
+        vmovdqa .Liv+(4 * 4) rRIP, ROW4;
+
+        vmovdqu (STATE_H + 0 * 4)(RSTATE), ROW1;
+        vmovdqu (STATE_H + 4 * 4)(RSTATE), ROW2;
+
+        vpxor (STATE_T)(RSTATE), ROW4, ROW4;
+
+        LOAD_MSG(0, MA1, MA2, MA3, MA4);
+        LOAD_MSG(1, MB1, MB2, MB3, MB4);
+        jmp .Loop;
+
+.align 64, 0xcc
+.Loop:
+        ROUND(0, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(2, MA1, MA2, MA3, MA4);
+        ROUND(1, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(3, MB1, MB2, MB3, MB4);
+        ROUND(2, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(4, MA1, MA2, MA3, MA4);
+        ROUND(3, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(5, MB1, MB2, MB3, MB4);
+        ROUND(4, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(6, MA1, MA2, MA3, MA4);
+        ROUND(5, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(7, MB1, MB2, MB3, MB4);
+        ROUND(6, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(8, MA1, MA2, MA3, MA4);
+        ROUND(7, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(9, MB1, MB2, MB3, MB4);
+        sub $1, RNBLKS;
+        jz .Loop_end;
+
+        lea 64(RINBLKS), RINBLKS;
+        addq $64, (STATE_T + 0)(RSTATE);
+
+        ROUND(8, MA1, MA2, MA3, MA4);
+                                      LOAD_MSG(0, MA1, MA2, MA3, MA4);
+        ROUND(9, MB1, MB2, MB3, MB4);
+                                      LOAD_MSG(1, MB1, MB2, MB3, MB4);
+
+        vpternlogq $0x96, (STATE_H + 0 * 4)(RSTATE), ROW3, ROW1;
+        vpternlogq $0x96, (STATE_H + 4 * 4)(RSTATE), ROW4, ROW2;
+
+        vmovdqa .Liv+(0 * 4) rRIP, ROW3;
+        vmovdqa .Liv+(4 * 4) rRIP, ROW4;
+
+        vmovdqu ROW1, (STATE_H + 0 * 4)(RSTATE);
+        vmovdqu ROW2, (STATE_H + 4 * 4)(RSTATE);
+
+        vpxor (STATE_T)(RSTATE), ROW4, ROW4;
+
+        jmp .Loop;
+
+.align 64, 0xcc
+.Loop_end:
+        ROUND(8, MA1, MA2, MA3, MA4);
+        ROUND(9, MB1, MB2, MB3, MB4);
+
+        vpternlogq $0x96, (STATE_H + 0 * 4)(RSTATE), ROW3, ROW1;
+        vpternlogq $0x96, (STATE_H + 4 * 4)(RSTATE), ROW4, ROW2;
+
+        vmovdqu ROW1, (STATE_H + 0 * 4)(RSTATE);
+        vmovdqu ROW2, (STATE_H + 4 * 4)(RSTATE);
+
+        xorl %eax, %eax;
+        kxord %k4, %k4, %k4;
+        kxord %k5, %k5, %k5;
+        kxord %k6, %k6, %k6;
+        kxord %k7, %k7, %k7;
+
+        vzeroall;
+        ret_spec_stop;
+        CFI_ENDPROC();
+ELF(.size _gcry_blake2s_transform_amd64_avx512,
+    .-_gcry_blake2s_transform_amd64_avx512;)
+
+#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/
+#endif /*__x86_64*/
diff --git a/grub-core/lib/libgcrypt/cipher/blowfish-amd64.S b/grub-core/lib/libgcrypt/cipher/blowfish-amd64.S
new file mode 100644
index 000000000..95d57a99b
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/blowfish-amd64.S
@@ -0,0 +1,601 @@
+/* blowfish-amd64.S  -  AMD64 assembly implementation of Blowfish cipher
+ *
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(USE_BLOWFISH) && \
+    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#include "asm-common-amd64.h"
+
+.text
+
+/* structure of BLOWFISH_context: */
+#define s0     0
+#define s1     ((s0) + 256 * 4)
+#define s2     ((s1) + 256 * 4)
+#define s3     ((s2) + 256 * 4)
+#define p      ((s3) + 256 * 4)
+
+/* register macros */
+#define CTX %rdi
+#define RIO %rsi
+
+#define RX0 %rax
+#define RX1 %rbx
+#define RX2 %rcx
+#define RX3 %rdx
+
+#define RX0d %eax
+#define RX1d %ebx
+#define RX2d %ecx
+#define RX3d %edx
+
+#define RX0bl %al
+#define RX1bl %bl
+#define RX2bl %cl
+#define RX3bl %dl
+
+#define RX0bh %ah
+#define RX1bh %bh
+#define RX2bh %ch
+#define RX3bh %dh
+
+#define RT0 %rbp
+#define RT1 %rsi
+#define RT2 %r8
+#define RT3 %r9
+
+#define RT0d %ebp
+#define RT1d %esi
+#define RT2d %r8d
+#define RT3d %r9d
+
+#define RKEY %r10
+
+/***********************************************************************
+ * 1-way blowfish
+ ***********************************************************************/
+#define F() \
+       movzbl RX0bh,           RT1d; \
+       movzbl RX0bl,           RT3d; \
+       rorq $16,               RX0; \
+       movzbl RX0bh,           RT0d; \
+       movzbl RX0bl,           RT2d; \
+       rorq $16,               RX0; \
+       movl s0(CTX,RT0,4),     RT0d; \
+       addl s1(CTX,RT2,4),     RT0d; \
+       xorl s2(CTX,RT1,4),     RT0d; \
+       addl s3(CTX,RT3,4),     RT0d; \
+       xorq RT0,               RX0;
+
+#define load_roundkey_enc(n) \
+       movq p+4*(n)(CTX),      RX3;
+
+#define add_roundkey_enc() \
+       xorq RX3,               RX0;
+
+#define round_enc(n) \
+       add_roundkey_enc(); \
+       load_roundkey_enc(n); \
+       \
+       F(); \
+       F();
+
+#define load_roundkey_dec(n) \
+       movq p+4*(n-1)(CTX),    RX3; \
+       rorq $32,               RX3;
+
+#define add_roundkey_dec() \
+       xorq RX3,               RX0;
+
+#define round_dec(n) \
+       add_roundkey_dec(); \
+       load_roundkey_dec(n); \
+       \
+       F(); \
+       F();
+
+#define read_block() \
+       movq (RIO),             RX0; \
+       rorq $32,               RX0; \
+       bswapq                  RX0;
+
+#define write_block() \
+       bswapq                  RX0; \
+       movq RX0,               (RIO);
+
+.align 16
+ELF(.type   __blowfish_enc_blk1,@function;)
+
+__blowfish_enc_blk1:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      RX0: input plaintext block
+        * output:
+        *      RX0: output plaintext block
+        */
+       CFI_STARTPROC();
+       movq %rbp, %r11;
+       CFI_REGISTER(%rbp, %r11);
+
+       load_roundkey_enc(0);
+       round_enc(2);
+       round_enc(4);
+       round_enc(6);
+       round_enc(8);
+       round_enc(10);
+       round_enc(12);
+       round_enc(14);
+       round_enc(16);
+       add_roundkey_enc();
+
+       movq %r11, %rbp;
+       CFI_RESTORE(%rbp)
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;)
+
+.align 16
+.globl  _gcry_blowfish_amd64_do_encrypt
+ELF(.type   _gcry_blowfish_amd64_do_encrypt,@function;)
+
+_gcry_blowfish_amd64_do_encrypt:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: u32 *ret_xl
+        *      %rdx: u32 *ret_xr
+        */
+       CFI_STARTPROC();
+       ENTER_SYSV_FUNC_PARAMS_0_4
+
+       movl (%rdx), RX0d;
+       shlq $32, RX0;
+       movl (%rsi), RT3d;
+       movq %rdx, %r10;
+       orq RT3, RX0;
+       movq %rsi, RX2;
+
+       call __blowfish_enc_blk1;
+
+       movl RX0d, (%r10);
+       shrq $32, RX0;
+       movl RX0d, (RX2);
+
+       EXIT_SYSV_FUNC
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_blowfish_amd64_do_encrypt,.-_gcry_blowfish_amd64_do_encrypt;)
+
+.align 16
+.globl  _gcry_blowfish_amd64_encrypt_block
+ELF(.type   _gcry_blowfish_amd64_encrypt_block,@function;)
+
+_gcry_blowfish_amd64_encrypt_block:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst
+        *      %rdx: src
+        */
+       CFI_STARTPROC();
+       ENTER_SYSV_FUNC_PARAMS_0_4
+
+       movq %rsi, %r10;
+
+       movq %rdx, RIO;
+       read_block();
+
+       call __blowfish_enc_blk1;
+
+       movq %r10, RIO;
+       write_block();
+
+       EXIT_SYSV_FUNC
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_blowfish_amd64_encrypt_block,.-_gcry_blowfish_amd64_encrypt_block;)
+
+.align 16
+.globl  _gcry_blowfish_amd64_decrypt_block
+ELF(.type   _gcry_blowfish_amd64_decrypt_block,@function;)
+
+_gcry_blowfish_amd64_decrypt_block:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst
+        *      %rdx: src
+        */
+       CFI_STARTPROC();
+       ENTER_SYSV_FUNC_PARAMS_0_4
+
+       movq %rbp, %r11;
+       CFI_REGISTER(%rbp, %r11);
+
+       movq %rsi, %r10;
+       movq %rdx, RIO;
+
+       read_block();
+
+       load_roundkey_dec(17);
+       round_dec(15);
+       round_dec(13);
+       round_dec(11);
+       round_dec(9);
+       round_dec(7);
+       round_dec(5);
+       round_dec(3);
+       round_dec(1);
+       add_roundkey_dec();
+
+       movq %r10, RIO;
+       write_block();
+
+       movq %r11, %rbp;
+       CFI_RESTORE(%rbp);
+
+       EXIT_SYSV_FUNC
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_blowfish_amd64_decrypt_block,.-_gcry_blowfish_amd64_decrypt_block;)
+
+/**********************************************************************
+  4-way blowfish, four blocks parallel
+ **********************************************************************/
+#define F4(x) \
+       movzbl x ## bh,         RT1d; \
+       movzbl x ## bl,         RT3d; \
+       rorq $16,               x; \
+       movzbl x ## bh,         RT0d; \
+       movzbl x ## bl,         RT2d; \
+       rorq $16,               x; \
+       movl s0(CTX,RT0,4),     RT0d; \
+       addl s1(CTX,RT2,4),     RT0d; \
+       xorl s2(CTX,RT1,4),     RT0d; \
+       addl s3(CTX,RT3,4),     RT0d; \
+       xorq RT0,               x;
+
+#define add_preloaded_roundkey4() \
+       xorq RKEY,              RX0; \
+       xorq RKEY,              RX1; \
+       xorq RKEY,              RX2; \
+       xorq RKEY,              RX3;
+
+#define preload_roundkey_enc(n) \
+       movq p+4*(n)(CTX),      RKEY;
+
+#define add_roundkey_enc4(n) \
+       add_preloaded_roundkey4(); \
+       preload_roundkey_enc(n + 2);
+
+#define round_enc4(n) \
+       add_roundkey_enc4(n); \
+       \
+       F4(RX0); \
+       F4(RX1); \
+       F4(RX2); \
+       F4(RX3); \
+       \
+       F4(RX0); \
+       F4(RX1); \
+       F4(RX2); \
+       F4(RX3);
+
+#define preload_roundkey_dec(n) \
+       movq p+4*((n)-1)(CTX),  RKEY; \
+       rorq $32,               RKEY;
+
+#define add_roundkey_dec4(n) \
+       add_preloaded_roundkey4(); \
+       preload_roundkey_dec(n - 2);
+
+#define round_dec4(n) \
+       add_roundkey_dec4(n); \
+       \
+       F4(RX0); \
+       F4(RX1); \
+       F4(RX2); \
+       F4(RX3); \
+       \
+       F4(RX0); \
+       F4(RX1); \
+       F4(RX2); \
+       F4(RX3);
+
+#define inbswap_block4() \
+       rorq $32,               RX0; \
+       bswapq                  RX0; \
+       rorq $32,               RX1; \
+       bswapq                  RX1; \
+       rorq $32,               RX2; \
+       bswapq                  RX2; \
+       rorq $32,               RX3; \
+       bswapq                  RX3;
+
+#define inctrswap_block4() \
+       rorq $32,               RX0; \
+       rorq $32,               RX1; \
+       rorq $32,               RX2; \
+       rorq $32,               RX3;
+
+#define outbswap_block4() \
+       bswapq                  RX0; \
+       bswapq                  RX1; \
+       bswapq                  RX2; \
+       bswapq                  RX3;
+
+.align 16
+ELF(.type   __blowfish_enc_blk4,@function;)
+
+__blowfish_enc_blk4:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      RX0,RX1,RX2,RX3: four input inbswapped plaintext blocks
+        * output:
+        *      RX0,RX1,RX2,RX3: four output ciphertext blocks
+        */
+       CFI_STARTPROC();
+       preload_roundkey_enc(0);
+
+       round_enc4(0);
+       round_enc4(2);
+       round_enc4(4);
+       round_enc4(6);
+       round_enc4(8);
+       round_enc4(10);
+       round_enc4(12);
+       round_enc4(14);
+       add_preloaded_roundkey4();
+
+       outbswap_block4();
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __blowfish_enc_blk4,.-__blowfish_enc_blk4;)
+
+.align 16
+ELF(.type   __blowfish_dec_blk4,@function;)
+
+__blowfish_dec_blk4:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      RX0,RX1,RX2,RX3: four input ciphertext blocks
+        * output:
+        *      RX0,RX1,RX2,RX3: four output plaintext blocks
+        */
+       CFI_STARTPROC();
+       preload_roundkey_dec(17);
+
+       inbswap_block4();
+
+       round_dec4(17);
+       round_dec4(15);
+       round_dec4(13);
+       round_dec4(11);
+       round_dec4(9);
+       round_dec4(7);
+       round_dec4(5);
+       round_dec4(3);
+       add_preloaded_roundkey4();
+
+       outbswap_block4();
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __blowfish_dec_blk4,.-__blowfish_dec_blk4;)
+
+.align 16
+.globl  _gcry_blowfish_amd64_ctr_enc
+ELF(.type   _gcry_blowfish_amd64_ctr_enc,@function;)
+_gcry_blowfish_amd64_ctr_enc:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (4 blocks)
+        *      %rdx: src (4 blocks)
+        *      %rcx: iv (big endian, 64bit)
+        */
+       CFI_STARTPROC();
+       ENTER_SYSV_FUNC_PARAMS_0_4
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       pushq %rbx;
+       CFI_PUSH(%rbx);
+       pushq %r12;
+       CFI_PUSH(%r12);
+       pushq %r13;
+       CFI_PUSH(%r13);
+
+       /* %r11-%r13 are not used by __blowfish_enc_blk4 */
+       movq %rcx, %r13; /*iv*/
+       movq %rdx, %r12; /*src*/
+       movq %rsi, %r11; /*dst*/
+
+       /* load IV and byteswap */
+       movq (%r13), RT0;
+       bswapq RT0;
+       movq RT0, RX0;
+
+       /* construct IVs */
+       leaq 1(RT0), RX1;
+       leaq 2(RT0), RX2;
+       leaq 3(RT0), RX3;
+       leaq 4(RT0), RT0;
+       bswapq RT0;
+
+       inctrswap_block4();
+
+       /* store new IV */
+       movq RT0, (%r13);
+
+       call __blowfish_enc_blk4;
+
+       /* XOR key-stream with plaintext */
+       xorq 0 * 8(%r12), RX0;
+       xorq 1 * 8(%r12), RX1;
+       xorq 2 * 8(%r12), RX2;
+       xorq 3 * 8(%r12), RX3;
+       movq RX0, 0 * 8(%r11);
+       movq RX1, 1 * 8(%r11);
+       movq RX2, 2 * 8(%r11);
+       movq RX3, 3 * 8(%r11);
+
+       popq %r13;
+       CFI_POP(%r13);
+       popq %r12;
+       CFI_POP(%r12);
+       popq %rbx;
+       CFI_POP(%rbx);
+       popq %rbp;
+       CFI_POP(%rbp);
+
+       EXIT_SYSV_FUNC
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_blowfish_amd64_ctr_enc,.-_gcry_blowfish_amd64_ctr_enc;)
+
+.align 16
+.globl  _gcry_blowfish_amd64_cbc_dec
+ELF(.type   _gcry_blowfish_amd64_cbc_dec,@function;)
+_gcry_blowfish_amd64_cbc_dec:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (4 blocks)
+        *      %rdx: src (4 blocks)
+        *      %rcx: iv (64bit)
+        */
+       CFI_STARTPROC();
+       ENTER_SYSV_FUNC_PARAMS_0_4
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       pushq %rbx;
+       CFI_PUSH(%rbx);
+       pushq %r12;
+       CFI_PUSH(%r12);
+       pushq %r13;
+       CFI_PUSH(%r13);
+
+       /* %r11-%r13 are not used by __blowfish_dec_blk4 */
+       movq %rsi, %r11; /*dst*/
+       movq %rdx, %r12; /*src*/
+       movq %rcx, %r13; /*iv*/
+
+       /* load input */
+       movq 0 * 8(%r12), RX0;
+       movq 1 * 8(%r12), RX1;
+       movq 2 * 8(%r12), RX2;
+       movq 3 * 8(%r12), RX3;
+
+       call __blowfish_dec_blk4;
+
+       movq 3 * 8(%r12), RT0;
+       xorq      (%r13), RX0;
+       xorq 0 * 8(%r12), RX1;
+       xorq 1 * 8(%r12), RX2;
+       xorq 2 * 8(%r12), RX3;
+       movq RT0, (%r13); /* store new IV */
+
+       movq RX0, 0 * 8(%r11);
+       movq RX1, 1 * 8(%r11);
+       movq RX2, 2 * 8(%r11);
+       movq RX3, 3 * 8(%r11);
+
+       popq %r13;
+       CFI_POP(%r13);
+       popq %r12;
+       CFI_POP(%r12);
+       popq %rbx;
+       CFI_POP(%rbx);
+       popq %rbp;
+       CFI_POP(%rbp);
+
+       EXIT_SYSV_FUNC
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_blowfish_amd64_cbc_dec,.-_gcry_blowfish_amd64_cbc_dec;)
+
+.align 16
+.globl  _gcry_blowfish_amd64_cfb_dec
+ELF(.type   _gcry_blowfish_amd64_cfb_dec,@function;)
+_gcry_blowfish_amd64_cfb_dec:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (4 blocks)
+        *      %rdx: src (4 blocks)
+        *      %rcx: iv (64bit)
+        */
+       CFI_STARTPROC();
+       ENTER_SYSV_FUNC_PARAMS_0_4
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       pushq %rbx;
+       CFI_PUSH(%rbx);
+       pushq %r12;
+       CFI_PUSH(%r12);
+       pushq %r13;
+       CFI_PUSH(%r13);
+
+       /* %r11-%r13 are not used by __blowfish_enc_blk4 */
+       movq %rcx, %r13; /*iv*/
+       movq %rdx, %r12; /*src*/
+       movq %rsi, %r11; /*dst*/
+
+       /* Load input */
+       movq (%r13), RX0;
+       movq 0 * 8(%r12), RX1;
+       movq 1 * 8(%r12), RX2;
+       movq 2 * 8(%r12), RX3;
+
+       inbswap_block4();
+
+       /* Update IV */
+       movq 3 * 8(%r12), RT0;
+       movq RT0, (%r13);
+
+       call __blowfish_enc_blk4;
+
+       xorq 0 * 8(%r12), RX0;
+       xorq 1 * 8(%r12), RX1;
+       xorq 2 * 8(%r12), RX2;
+       xorq 3 * 8(%r12), RX3;
+       movq RX0, 0 * 8(%r11);
+       movq RX1, 1 * 8(%r11);
+       movq RX2, 2 * 8(%r11);
+       movq RX3, 3 * 8(%r11);
+
+       popq %r13;
+       CFI_POP(%r13);
+       popq %r12;
+       CFI_POP(%r12);
+       popq %rbx;
+       CFI_POP(%rbx);
+       popq %rbp;
+       CFI_POP(%rbp);
+
+       EXIT_SYSV_FUNC
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_blowfish_amd64_cfb_dec,.-_gcry_blowfish_amd64_cfb_dec;)
+
+#endif /*defined(USE_BLOWFISH)*/
+#endif /*__x86_64*/
diff --git a/grub-core/lib/libgcrypt/cipher/blowfish-arm.S b/grub-core/lib/libgcrypt/cipher/blowfish-arm.S
new file mode 100644
index 000000000..a5101b5c0
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/blowfish-arm.S
@@ -0,0 +1,743 @@
+/* blowfish-arm.S  -  ARM assembly implementation of Blowfish cipher
+ *
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(__ARMEL__)
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+.text
+
+.syntax unified
+.arm
+
+/* structure of crypto context */
+#define s0     0
+#define s1     (s0 + (1 * 256) * 4)
+#define s2     (s0 + (2 * 256) * 4)
+#define s3     (s0 + (3 * 256) * 4)
+#define p      (s3 + (1 * 256) * 4)
+
+/* register macros */
+#define CTXs0 r0
+#define CTXs1 r9
+#define CTXs2 r8
+#define CTXs3 r10
+#define RMASK lr
+#define RKEYL r2
+#define RKEYR ip
+
+#define RL0 r3
+#define RR0 r4
+
+#define RL1 r9
+#define RR1 r10
+
+#define RT0 r11
+#define RT1 r7
+#define RT2 r5
+#define RT3 r6
+
+/* helper macros */
+#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
+       ldrb rout, [rsrc, #((offs) + 0)]; \
+       ldrb rtmp, [rsrc, #((offs) + 1)]; \
+       orr rout, rout, rtmp, lsl #8; \
+       ldrb rtmp, [rsrc, #((offs) + 2)]; \
+       orr rout, rout, rtmp, lsl #16; \
+       ldrb rtmp, [rsrc, #((offs) + 3)]; \
+       orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
+       mov rtmp0, rin, lsr #8; \
+       strb rin, [rdst, #((offs) + 0)]; \
+       mov rtmp1, rin, lsr #16; \
+       strb rtmp0, [rdst, #((offs) + 1)]; \
+       mov rtmp0, rin, lsr #24; \
+       strb rtmp1, [rdst, #((offs) + 2)]; \
+       strb rtmp0, [rdst, #((offs) + 3)];
+
+#define ldr_unaligned_be(rout, rsrc, offs, rtmp) \
+       ldrb rout, [rsrc, #((offs) + 3)]; \
+       ldrb rtmp, [rsrc, #((offs) + 2)]; \
+       orr rout, rout, rtmp, lsl #8; \
+       ldrb rtmp, [rsrc, #((offs) + 1)]; \
+       orr rout, rout, rtmp, lsl #16; \
+       ldrb rtmp, [rsrc, #((offs) + 0)]; \
+       orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_be(rin, rdst, offs, rtmp0, rtmp1) \
+       mov rtmp0, rin, lsr #8; \
+       strb rin, [rdst, #((offs) + 3)]; \
+       mov rtmp1, rin, lsr #16; \
+       strb rtmp0, [rdst, #((offs) + 2)]; \
+       mov rtmp0, rin, lsr #24; \
+       strb rtmp1, [rdst, #((offs) + 1)]; \
+       strb rtmp0, [rdst, #((offs) + 0)];
+
+#ifdef __ARMEL__
+       #define ldr_unaligned_host ldr_unaligned_le
+       #define str_unaligned_host str_unaligned_le
+
+       /* bswap on little-endian */
+#ifdef HAVE_ARM_ARCH_V6
+       #define host_to_be(reg, rtmp) \
+               rev reg, reg;
+       #define be_to_host(reg, rtmp) \
+               rev reg, reg;
+#else
+       #define host_to_be(reg, rtmp) \
+               eor     rtmp, reg, reg, ror #16; \
+               mov     rtmp, rtmp, lsr #8; \
+               bic     rtmp, rtmp, #65280; \
+               eor     reg, rtmp, reg, ror #8;
+       #define be_to_host(reg, rtmp) \
+               eor     rtmp, reg, reg, ror #16; \
+               mov     rtmp, rtmp, lsr #8; \
+               bic     rtmp, rtmp, #65280; \
+               eor     reg, rtmp, reg, ror #8;
+#endif
+#else
+       #define ldr_unaligned_host ldr_unaligned_be
+       #define str_unaligned_host str_unaligned_be
+
+       /* nop on big-endian */
+       #define host_to_be(reg, rtmp) /*_*/
+       #define be_to_host(reg, rtmp) /*_*/
+#endif
+
+#define host_to_host(x, y) /*_*/
+
+/***********************************************************************
+ * 1-way blowfish
+ ***********************************************************************/
+#define F(l, r) \
+       and RT0, RMASK, l, lsr#(24 - 2); \
+       and RT1, RMASK, l, lsr#(16 - 2); \
+       ldr RT0, [CTXs0, RT0]; \
+       and RT2, RMASK, l, lsr#(8 - 2); \
+       ldr RT1, [CTXs1, RT1]; \
+       and RT3, RMASK, l, lsl#2; \
+       ldr RT2, [CTXs2, RT2]; \
+       add RT0, RT1; \
+       ldr RT3, [CTXs3, RT3]; \
+       eor RT0, RT2; \
+       add RT0, RT3; \
+       eor r, RT0;
+
+#define load_roundkey_enc(n) \
+       ldr RKEYL, [CTXs2, #((p - s2) + (4 * (n) + 0))]; \
+       ldr RKEYR, [CTXs2, #((p - s2) + (4 * (n) + 4))];
+
+#define add_roundkey_enc() \
+       eor RL0, RKEYL; \
+       eor RR0, RKEYR;
+
+#define round_enc(n) \
+       add_roundkey_enc(); \
+       load_roundkey_enc(n); \
+       \
+       F(RL0, RR0); \
+       F(RR0, RL0);
+
+#define load_roundkey_dec(n) \
+       ldr RKEYL, [CTXs2, #((p - s2) + (4 * ((n) - 1) + 4))]; \
+       ldr RKEYR, [CTXs2, #((p - s2) + (4 * ((n) - 1) + 0))];
+
+#define add_roundkey_dec() \
+       eor RL0, RKEYL; \
+       eor RR0, RKEYR;
+
+#define round_dec(n) \
+       add_roundkey_dec(); \
+       load_roundkey_dec(n); \
+       \
+       F(RL0, RR0); \
+       F(RR0, RL0);
+
+#define read_block_aligned(rin, offs, l0, r0, convert, rtmp) \
+       ldr l0, [rin, #((offs) + 0)]; \
+       ldr r0, [rin, #((offs) + 4)]; \
+       convert(l0, rtmp); \
+       convert(r0, rtmp);
+
+#define write_block_aligned(rout, offs, l0, r0, convert, rtmp) \
+       convert(l0, rtmp); \
+       convert(r0, rtmp); \
+       str l0, [rout, #((offs) + 0)]; \
+       str r0, [rout, #((offs) + 4)];
+
+#ifdef __ARM_FEATURE_UNALIGNED
+       /* unaligned word reads allowed */
+       #define read_block(rin, offs, l0, r0, rtmp0) \
+               read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0)
+
+       #define write_block(rout, offs, r0, l0, rtmp0, rtmp1) \
+               write_block_aligned(rout, offs, r0, l0, be_to_host, rtmp0)
+
+       #define read_block_host(rin, offs, l0, r0, rtmp0) \
+               read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0)
+
+       #define write_block_host(rout, offs, r0, l0, rtmp0, rtmp1) \
+               write_block_aligned(rout, offs, r0, l0, host_to_host, rtmp0)
+#else
+       /* need to handle unaligned reads by byte reads */
+       #define read_block(rin, offs, l0, r0, rtmp0) \
+               tst rin, #3; \
+               beq 1f; \
+                       ldr_unaligned_be(l0, rin, (offs) + 0, rtmp0); \
+                       ldr_unaligned_be(r0, rin, (offs) + 4, rtmp0); \
+                       b 2f; \
+               1:;\
+                       read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0); \
+               2:;
+
+       #define write_block(rout, offs, l0, r0, rtmp0, rtmp1) \
+               tst rout, #3; \
+               beq 1f; \
+                       str_unaligned_be(l0, rout, (offs) + 0, rtmp0, rtmp1); \
+                       str_unaligned_be(r0, rout, (offs) + 4, rtmp0, rtmp1); \
+                       b 2f; \
+               1:;\
+                       write_block_aligned(rout, offs, l0, r0, be_to_host, rtmp0); \
+               2:;
+
+       #define read_block_host(rin, offs, l0, r0, rtmp0) \
+               tst rin, #3; \
+               beq 1f; \
+                       ldr_unaligned_host(l0, rin, (offs) + 0, rtmp0); \
+                       ldr_unaligned_host(r0, rin, (offs) + 4, rtmp0); \
+                       b 2f; \
+               1:;\
+                       read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0); \
+               2:;
+
+       #define write_block_host(rout, offs, l0, r0, rtmp0, rtmp1) \
+               tst rout, #3; \
+               beq 1f; \
+                       str_unaligned_host(l0, rout, (offs) + 0, rtmp0, rtmp1); \
+                       str_unaligned_host(r0, rout, (offs) + 4, rtmp0, rtmp1); \
+                       b 2f; \
+               1:;\
+                       write_block_aligned(rout, offs, l0, r0, host_to_host, rtmp0); \
+               2:;
+#endif
+
+.align 3
+.type  __blowfish_enc_blk1,%function;
+
+__blowfish_enc_blk1:
+       /* input:
+        *      preloaded: CTX
+        *      [RL0, RR0]: src
+        * output:
+        *      [RR0, RL0]: dst
+        */
+       push {lr};
+
+       add CTXs1, CTXs0, #(s1 - s0);
+       add CTXs2, CTXs0, #(s2 - s0);
+       mov RMASK, #(0xff << 2); /* byte mask */
+       add CTXs3, CTXs1, #(s3 - s1);
+
+       load_roundkey_enc(0);
+       round_enc(2);
+       round_enc(4);
+       round_enc(6);
+       round_enc(8);
+       round_enc(10);
+       round_enc(12);
+       round_enc(14);
+       round_enc(16);
+       add_roundkey_enc();
+
+       pop {pc};
+.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;
+
+.align 8
+.globl  _gcry_blowfish_arm_do_encrypt
+.type   _gcry_blowfish_arm_do_encrypt,%function;
+
+_gcry_blowfish_arm_do_encrypt:
+       /* input:
+        *      r0: ctx, CTX
+        *      r1: u32 *ret_xl
+        *      r2: u32 *ret_xr
+        */
+       push {r2, r4-r11, ip, lr};
+
+       ldr RL0, [r1];
+       ldr RR0, [r2];
+
+       bl __blowfish_enc_blk1;
+
+       pop {r2};
+       str RR0, [r1];
+       str RL0, [r2];
+
+       pop {r4-r11, ip, pc};
+.size _gcry_blowfish_arm_do_encrypt,.-_gcry_blowfish_arm_do_encrypt;
+
+.align 3
+.globl _gcry_blowfish_arm_encrypt_block
+.type   _gcry_blowfish_arm_encrypt_block,%function;
+
+_gcry_blowfish_arm_encrypt_block:
+       /* input:
+        *      r0: ctx, CTX
+        *      r1: dst
+        *      r2: src
+        */
+       push {r4-r11, ip, lr};
+
+       read_block(r2, 0, RL0, RR0, RT0);
+
+       bl __blowfish_enc_blk1;
+
+       write_block(r1, 0, RR0, RL0, RT0, RT1);
+
+       pop {r4-r11, ip, pc};
+.size _gcry_blowfish_arm_encrypt_block,.-_gcry_blowfish_arm_encrypt_block;
+
+.align 3
+.globl _gcry_blowfish_arm_decrypt_block
+.type   _gcry_blowfish_arm_decrypt_block,%function;
+
+_gcry_blowfish_arm_decrypt_block:
+       /* input:
+        *      r0: ctx, CTX
+        *      r1: dst
+        *      r2: src
+        */
+       push {r4-r11, ip, lr};
+
+       add CTXs1, CTXs0, #(s1 - s0);
+       add CTXs2, CTXs0, #(s2 - s0);
+       mov RMASK, #(0xff << 2); /* byte mask */
+       add CTXs3, CTXs1, #(s3 - s1);
+
+       read_block(r2, 0, RL0, RR0, RT0);
+
+       load_roundkey_dec(17);
+       round_dec(15);
+       round_dec(13);
+       round_dec(11);
+       round_dec(9);
+       round_dec(7);
+       round_dec(5);
+       round_dec(3);
+       round_dec(1);
+       add_roundkey_dec();
+
+       write_block(r1, 0, RR0, RL0, RT0, RT1);
+
+       pop {r4-r11, ip, pc};
+.size _gcry_blowfish_arm_decrypt_block,.-_gcry_blowfish_arm_decrypt_block;
+
+/***********************************************************************
+ * 2-way blowfish
+ ***********************************************************************/
+#define F2(n, l0, r0, l1, r1, set_nextk, dec) \
+       \
+       and RT0, RMASK, l0, lsr#(24 - 2); \
+       and RT1, RMASK, l0, lsr#(16 - 2); \
+       and RT2, RMASK, l0, lsr#(8 - 2); \
+       add RT1, #(s1 - s0); \
+       \
+       ldr RT0, [CTXs0, RT0]; \
+       and RT3, RMASK, l0, lsl#2; \
+       ldr RT1, [CTXs0, RT1]; \
+       add RT3, #(s3 - s2); \
+       ldr RT2, [CTXs2, RT2]; \
+       add RT0, RT1; \
+       ldr RT3, [CTXs2, RT3]; \
+       \
+       and RT1, RMASK, l1, lsr#(24 - 2); \
+       eor RT0, RT2; \
+       and RT2, RMASK, l1, lsr#(16 - 2); \
+       add RT0, RT3; \
+       add RT2, #(s1 - s0); \
+       and RT3, RMASK, l1, lsr#(8 - 2); \
+       eor r0, RT0; \
+       \
+       ldr RT1, [CTXs0, RT1]; \
+       and RT0, RMASK, l1, lsl#2; \
+       ldr RT2, [CTXs0, RT2]; \
+       add RT0, #(s3 - s2); \
+       ldr RT3, [CTXs2, RT3]; \
+       add RT1, RT2; \
+       ldr RT0, [CTXs2, RT0]; \
+       \
+       and RT2, RMASK, r0, lsr#(24 - 2); \
+       eor RT1, RT3; \
+       and RT3, RMASK, r0, lsr#(16 - 2); \
+       add RT1, RT0; \
+       add RT3, #(s1 - s0); \
+       and RT0, RMASK, r0, lsr#(8 - 2); \
+       eor r1, RT1; \
+       \
+       ldr RT2, [CTXs0, RT2]; \
+       and RT1, RMASK, r0, lsl#2; \
+       ldr RT3, [CTXs0, RT3]; \
+       add RT1, #(s3 - s2); \
+       ldr RT0, [CTXs2, RT0]; \
+       add RT2, RT3; \
+       ldr RT1, [CTXs2, RT1]; \
+       \
+       and RT3, RMASK, r1, lsr#(24 - 2); \
+       eor RT2, RT0; \
+       and RT0, RMASK, r1, lsr#(16 - 2); \
+       add RT2, RT1; \
+       add RT0, #(s1 - s0); \
+       and RT1, RMASK, r1, lsr#(8 - 2); \
+       eor l0, RT2; \
+       \
+       ldr RT3, [CTXs0, RT3]; \
+       and RT2, RMASK, r1, lsl#2; \
+       ldr RT0, [CTXs0, RT0]; \
+       add RT2, #(s3 - s2); \
+       ldr RT1, [CTXs2, RT1]; \
+       eor l1, RKEYL; \
+       ldr RT2, [CTXs2, RT2]; \
+       \
+       eor r0, RKEYR; \
+       add RT3, RT0; \
+       eor r1, RKEYR; \
+       eor RT3, RT1; \
+       eor l0, RKEYL; \
+       add RT3, RT2; \
+       set_nextk(RKEYL, (p - s2) + (4 * (n) + ((dec) * 4))); \
+       eor l1, RT3; \
+       set_nextk(RKEYR, (p - s2) + (4 * (n) + (!(dec) * 4)));
+
+#define load_n_add_roundkey_enc2(n) \
+       load_roundkey_enc(n); \
+       eor RL0, RKEYL; \
+       eor RR0, RKEYR; \
+       eor RL1, RKEYL; \
+       eor RR1, RKEYR; \
+       load_roundkey_enc((n) + 2);
+
+#define next_key(reg, offs) \
+       ldr reg, [CTXs2, #(offs)];
+
+#define dummy(x, y) /* do nothing */
+
+#define round_enc2(n, load_next_key) \
+       F2((n) + 2, RL0, RR0, RL1, RR1, load_next_key, 0);
+
+#define load_n_add_roundkey_dec2(n) \
+       load_roundkey_dec(n); \
+       eor RL0, RKEYL; \
+       eor RR0, RKEYR; \
+       eor RL1, RKEYL; \
+       eor RR1, RKEYR; \
+       load_roundkey_dec((n) - 2);
+
+#define round_dec2(n, load_next_key) \
+       F2((n) - 3, RL0, RR0, RL1, RR1, load_next_key, 1);
+
+#define read_block2_aligned(rin, l0, r0, l1, r1, convert, rtmp) \
+       ldr l0, [rin, #(0)]; \
+       ldr r0, [rin, #(4)]; \
+       convert(l0, rtmp); \
+       ldr l1, [rin, #(8)]; \
+       convert(r0, rtmp); \
+       ldr r1, [rin, #(12)]; \
+       convert(l1, rtmp); \
+       convert(r1, rtmp);
+
+#define write_block2_aligned(rout, l0, r0, l1, r1, convert, rtmp) \
+       convert(l0, rtmp); \
+       convert(r0, rtmp); \
+       convert(l1, rtmp); \
+       str l0, [rout, #(0)]; \
+       convert(r1, rtmp); \
+       str r0, [rout, #(4)]; \
+       str l1, [rout, #(8)]; \
+       str r1, [rout, #(12)];
+
+#ifdef __ARM_FEATURE_UNALIGNED
+       /* unaligned word reads allowed */
+       #define read_block2(rin, l0, r0, l1, r1, rtmp0) \
+               read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0)
+
+       #define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+               write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0)
+
+       #define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
+               read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0)
+
+       #define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+               write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0)
+#else
+       /* need to handle unaligned reads by byte reads */
+       #define read_block2(rin, l0, r0, l1, r1, rtmp0) \
+               tst rin, #3; \
+               beq 1f; \
+                       ldr_unaligned_be(l0, rin, 0, rtmp0); \
+                       ldr_unaligned_be(r0, rin, 4, rtmp0); \
+                       ldr_unaligned_be(l1, rin, 8, rtmp0); \
+                       ldr_unaligned_be(r1, rin, 12, rtmp0); \
+                       b 2f; \
+               1:;\
+                       read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0); \
+               2:;
+
+       #define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+               tst rout, #3; \
+               beq 1f; \
+                       str_unaligned_be(l0, rout, 0, rtmp0, rtmp1); \
+                       str_unaligned_be(r0, rout, 4, rtmp0, rtmp1); \
+                       str_unaligned_be(l1, rout, 8, rtmp0, rtmp1); \
+                       str_unaligned_be(r1, rout, 12, rtmp0, rtmp1); \
+                       b 2f; \
+               1:;\
+                       write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0); \
+               2:;
+
+       #define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
+               tst rin, #3; \
+               beq 1f; \
+                       ldr_unaligned_host(l0, rin, 0, rtmp0); \
+                       ldr_unaligned_host(r0, rin, 4, rtmp0); \
+                       ldr_unaligned_host(l1, rin, 8, rtmp0); \
+                       ldr_unaligned_host(r1, rin, 12, rtmp0); \
+                       b 2f; \
+               1:;\
+                       read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0); \
+               2:;
+
+       #define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+               tst rout, #3; \
+               beq 1f; \
+                       str_unaligned_host(l0, rout, 0, rtmp0, rtmp1); \
+                       str_unaligned_host(r0, rout, 4, rtmp0, rtmp1); \
+                       str_unaligned_host(l1, rout, 8, rtmp0, rtmp1); \
+                       str_unaligned_host(r1, rout, 12, rtmp0, rtmp1); \
+                       b 2f; \
+               1:;\
+                       write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0); \
+               2:;
+#endif
+
+.align 3
+.type  _gcry_blowfish_arm_enc_blk2,%function;
+
+_gcry_blowfish_arm_enc_blk2:
+       /* input:
+        *      preloaded: CTX
+        *      [RL0, RR0], [RL1, RR1]: src
+        * output:
+        *      [RR0, RL0], [RR1, RL1]: dst
+        */
+       push {RT0,lr};
+
+       add CTXs2, CTXs0, #(s2 - s0);
+       mov RMASK, #(0xff << 2); /* byte mask */
+
+       load_n_add_roundkey_enc2(0);
+       round_enc2(2, next_key);
+       round_enc2(4, next_key);
+       round_enc2(6, next_key);
+       round_enc2(8, next_key);
+       round_enc2(10, next_key);
+       round_enc2(12, next_key);
+       round_enc2(14, next_key);
+       round_enc2(16, dummy);
+
+       host_to_be(RR0, RT0);
+       host_to_be(RL0, RT0);
+       host_to_be(RR1, RT0);
+       host_to_be(RL1, RT0);
+
+       pop {RT0,pc};
+.size _gcry_blowfish_arm_enc_blk2,.-_gcry_blowfish_arm_enc_blk2;
+
+.align 3
+.globl _gcry_blowfish_arm_cfb_dec;
+.type  _gcry_blowfish_arm_cfb_dec,%function;
+
+_gcry_blowfish_arm_cfb_dec:
+       /* input:
+        *      r0: CTX
+        *      r1: dst (2 blocks)
+        *      r2: src (2 blocks)
+        *      r3: iv (64bit)
+        */
+       push {r2, r4-r11, ip, lr};
+
+       mov lr, r3;
+
+       /* Load input (iv/r3 is aligned, src/r2 might not be) */
+       ldm r3, {RL0, RR0};
+       host_to_be(RL0, RT0);
+       host_to_be(RR0, RT0);
+       read_block(r2, 0, RL1, RR1, RT0);
+
+       /* Update IV, load src[1] and save to iv[0] */
+       read_block_host(r2, 8, r5, r6, RT0);
+       stm lr, {r5, r6};
+
+       bl _gcry_blowfish_arm_enc_blk2;
+       /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */
+
+       /* r1: dst, r0: src */
+       pop {r0};
+
+       /* dst = src ^ result */
+       read_block2_host(r0, r5, r6, r7, r8, lr);
+       eor r5, r4;
+       eor r6, r3;
+       eor r7, r10;
+       eor r8, r9;
+       write_block2_host(r1, r5, r6, r7, r8, r9, r10);
+
+       pop {r4-r11, ip, pc};
+.ltorg
+.size _gcry_blowfish_arm_cfb_dec,.-_gcry_blowfish_arm_cfb_dec;
+
+.align 3
+.globl _gcry_blowfish_arm_ctr_enc;
+.type  _gcry_blowfish_arm_ctr_enc,%function;
+
+_gcry_blowfish_arm_ctr_enc:
+       /* input:
+        *      r0: CTX
+        *      r1: dst (2 blocks)
+        *      r2: src (2 blocks)
+        *      r3: iv (64bit, big-endian)
+        */
+       push {r2, r4-r11, ip, lr};
+
+       mov lr, r3;
+
+       /* Load IV (big => host endian) */
+       read_block_aligned(lr, 0, RL0, RR0, be_to_host, RT0);
+
+       /* Construct IVs */
+       adds RR1, RR0, #1; /* +1 */
+       adc RL1, RL0, #0;
+       adds r6, RR1, #1; /* +2 */
+       adc r5, RL1, #0;
+
+       /* Store new IV (host => big-endian) */
+       write_block_aligned(lr, 0, r5, r6, host_to_be, RT0);
+
+       bl _gcry_blowfish_arm_enc_blk2;
+       /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */
+
+       /* r1: dst, r0: src */
+       pop {r0};
+
+       /* XOR key-stream with plaintext */
+       read_block2_host(r0, r5, r6, r7, r8, lr);
+       eor r5, r4;
+       eor r6, r3;
+       eor r7, r10;
+       eor r8, r9;
+       write_block2_host(r1, r5, r6, r7, r8, r9, r10);
+
+       pop {r4-r11, ip, pc};
+.ltorg
+.size _gcry_blowfish_arm_ctr_enc,.-_gcry_blowfish_arm_ctr_enc;
+
+.align 3
+.type  _gcry_blowfish_arm_dec_blk2,%function;
+
+_gcry_blowfish_arm_dec_blk2:
+       /* input:
+        *      preloaded: CTX
+        *      [RL0, RR0], [RL1, RR1]: src
+        * output:
+        *      [RR0, RL0], [RR1, RL1]: dst
+        */
+       add CTXs2, CTXs0, #(s2 - s0);
+       mov RMASK, #(0xff << 2); /* byte mask */
+
+       load_n_add_roundkey_dec2(17);
+       round_dec2(15, next_key);
+       round_dec2(13, next_key);
+       round_dec2(11, next_key);
+       round_dec2(9, next_key);
+       round_dec2(7, next_key);
+       round_dec2(5, next_key);
+       round_dec2(3, next_key);
+       round_dec2(1, dummy);
+
+       host_to_be(RR0, RT0);
+       host_to_be(RL0, RT0);
+       host_to_be(RR1, RT0);
+       host_to_be(RL1, RT0);
+
+       b .Ldec_cbc_tail;
+.ltorg
+.size _gcry_blowfish_arm_dec_blk2,.-_gcry_blowfish_arm_dec_blk2;
+
+.align 3
+.globl _gcry_blowfish_arm_cbc_dec;
+.type  _gcry_blowfish_arm_cbc_dec,%function;
+
+_gcry_blowfish_arm_cbc_dec:
+       /* input:
+        *      r0: CTX
+        *      r1: dst (2 blocks)
+        *      r2: src (2 blocks)
+        *      r3: iv (64bit)
+        */
+       push {r2-r11, ip, lr};
+
+       read_block2(r2, RL0, RR0, RL1, RR1, RT0);
+
+       /* dec_blk2 is only used by cbc_dec, jump directly in/out instead
+        * of function call. */
+       b _gcry_blowfish_arm_dec_blk2;
+.Ldec_cbc_tail:
+       /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */
+
+       /* r0: src, r1: dst, r2: iv */
+       pop {r0, r2};
+
+       /* load IV+1 (src[0]) to r7:r8. Might be unaligned. */
+       read_block_host(r0, 0, r7, r8, r5);
+       /* load IV (iv[0]) to r5:r6. 'iv' is aligned. */
+       ldm r2, {r5, r6};
+
+       /* out[1] ^= IV+1 */
+       eor r10, r7;
+       eor r9, r8;
+       /* out[0] ^= IV */
+       eor r4, r5;
+       eor r3, r6;
+
+       /* load IV+2 (src[1]) to r7:r8. Might be unaligned. */
+       read_block_host(r0, 8, r7, r8, r5);
+       /* store IV+2 to iv[0] (aligned). */
+       stm r2, {r7, r8};
+
+       /* store result to dst[0-3]. Might be unaligned. */
+       write_block2_host(r1, r4, r3, r10, r9, r5, r6);
+
+       pop {r4-r11, ip, pc};
+.ltorg
+.size _gcry_blowfish_arm_cbc_dec,.-_gcry_blowfish_arm_cbc_dec;
+
+#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
+#endif /*__ARMEL__*/
diff --git a/grub-core/lib/libgcrypt/cipher/blowfish.c b/grub-core/lib/libgcrypt/cipher/blowfish.c
index b4d2b9c9a..87abd563a 100644
--- a/grub-core/lib/libgcrypt/cipher/blowfish.c
+++ b/grub-core/lib/libgcrypt/cipher/blowfish.c
@@ -4,7 +4,7 @@
  * This file is part of Libgcrypt.
  *
  * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
+ * it under the terms of the GNU Lesser General Public License as
  * published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
@@ -14,8 +14,8 @@
  * GNU Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
  *
  * For a description of the algorithm, see:
  *   Bruce Schneier: Applied Cryptography. John Wiley & Sons, 1996.
@@ -36,21 +36,41 @@
 #include "types.h"
 #include "g10lib.h"
 #include "cipher.h"
+#include "bufhelp.h"
+#include "cipher-internal.h"
 
 #define BLOWFISH_BLOCKSIZE 8
-#define BLOWFISH_ROUNDS 16
+#define BLOWFISH_KEY_MIN_BITS 8
+#define BLOWFISH_KEY_MAX_BITS 576
+
+
+/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */
+#undef USE_AMD64_ASM
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AMD64_ASM 1
+#endif
+
+/* USE_ARM_ASM indicates whether to use ARM assembly code. */
+#undef USE_ARM_ASM
+#if defined(__ARMEL__)
+# if defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS)
+#  define USE_ARM_ASM 1
+# endif
+#endif
 
 typedef struct {
     u32 s0[256];
     u32 s1[256];
     u32 s2[256];
     u32 s3[256];
-    u32 p[BLOWFISH_ROUNDS+2];
+    u32 p[16+2];
 } BLOWFISH_context;
 
-static gcry_err_code_t bf_setkey (void *c, const byte *key, unsigned keylen);
-static void encrypt_block (void *bc, byte *outbuf, const byte *inbuf);
-static void decrypt_block (void *bc, byte *outbuf, const byte *inbuf);
+static gcry_err_code_t bf_setkey (void *c, const byte *key, unsigned keylen,
+                                  cipher_bulk_ops_t *bulk_ops);
+static unsigned int encrypt_block (void *bc, byte *outbuf, const byte *inbuf);
+static unsigned int decrypt_block (void *bc, byte *outbuf, const byte *inbuf);
 
 
 /* precomputed S boxes */
@@ -234,49 +254,157 @@ static const u32 ks3[256] = {
     0x01C36AE4,0xD6EBE1F9,0x90D4F869,0xA65CDEA0,0x3F09252D,0xC208E69F,
     0xB74E6132,0xCE77E25B,0x578FDFE3,0x3AC372E6 };
 
-static const u32 ps[BLOWFISH_ROUNDS+2] = {
+static const u32 ps[16+2] = {
     0x243F6A88,0x85A308D3,0x13198A2E,0x03707344,0xA4093822,0x299F31D0,
     0x082EFA98,0xEC4E6C89,0x452821E6,0x38D01377,0xBE5466CF,0x34E90C6C,
     0xC0AC29B7,0xC97C50DD,0x3F84D5B5,0xB5470917,0x9216D5D9,0x8979FB1B };
 
 
+#ifdef USE_AMD64_ASM
+
+/* Assembly implementations of Blowfish. */
+extern void _gcry_blowfish_amd64_do_encrypt(BLOWFISH_context *c, u32 *ret_xl,
+                                           u32 *ret_xr);
+
+extern void _gcry_blowfish_amd64_encrypt_block(BLOWFISH_context *c, byte *out,
+                                              const byte *in);
+
+extern void _gcry_blowfish_amd64_decrypt_block(BLOWFISH_context *c, byte *out,
+                                              const byte *in);
+
+/* These assembly implementations process four blocks in parallel. */
+extern void _gcry_blowfish_amd64_ctr_enc(BLOWFISH_context *ctx, byte *out,
+                                        const byte *in, byte *ctr);
+
+extern void _gcry_blowfish_amd64_cbc_dec(BLOWFISH_context *ctx, byte *out,
+                                        const byte *in, byte *iv);
+
+extern void _gcry_blowfish_amd64_cfb_dec(BLOWFISH_context *ctx, byte *out,
+                                        const byte *in, byte *iv);
+
+static void
+do_encrypt ( BLOWFISH_context *bc, u32 *ret_xl, u32 *ret_xr )
+{
+  _gcry_blowfish_amd64_do_encrypt (bc, ret_xl, ret_xr);
+}
 
-#if BLOWFISH_ROUNDS != 16
-static inline u32
-function_F( BLOWFISH_context *bc, u32 x )
+static void
+do_encrypt_block (BLOWFISH_context *context, byte *outbuf, const byte *inbuf)
 {
-    u16 a, b, c, d;
-
-#ifdef WORDS_BIGENDIAN
-    a = ((byte*)&x)[0];
-    b = ((byte*)&x)[1];
-    c = ((byte*)&x)[2];
-    d = ((byte*)&x)[3];
-#else
-    a = ((byte*)&x)[3];
-    b = ((byte*)&x)[2];
-    c = ((byte*)&x)[1];
-    d = ((byte*)&x)[0];
-#endif
+  _gcry_blowfish_amd64_encrypt_block (context, outbuf, inbuf);
+}
 
-    return ((bc->s0[a] + bc->s1[b]) ^ bc->s2[c] ) + bc->s3[d];
+static void
+do_decrypt_block (BLOWFISH_context *context, byte *outbuf, const byte *inbuf)
+{
+  _gcry_blowfish_amd64_decrypt_block (context, outbuf, inbuf);
 }
-#endif
 
-#ifdef WORDS_BIGENDIAN
-#define F(x) ((( s0[((byte*)&x)[0]] + s1[((byte*)&x)[1]])       \
-                  ^ s2[((byte*)&x)[2]]) + s3[((byte*)&x)[3]] )
-#else
-#define F(x) ((( s0[((byte*)&x)[3]] + s1[((byte*)&x)[2]])       \
-                  ^ s2[((byte*)&x)[1]]) + s3[((byte*)&x)[0]] )
-#endif
-#define R(l,r,i)  do { l ^= p[i]; r ^= F(l); } while(0)
+static inline void
+blowfish_amd64_ctr_enc(BLOWFISH_context *ctx, byte *out, const byte *in,
+                       byte *ctr)
+{
+  _gcry_blowfish_amd64_ctr_enc(ctx, out, in, ctr);
+}
+
+static inline void
+blowfish_amd64_cbc_dec(BLOWFISH_context *ctx, byte *out, const byte *in,
+                       byte *iv)
+{
+  _gcry_blowfish_amd64_cbc_dec(ctx, out, in, iv);
+}
+
+static inline void
+blowfish_amd64_cfb_dec(BLOWFISH_context *ctx, byte *out, const byte *in,
+                       byte *iv)
+{
+  _gcry_blowfish_amd64_cfb_dec(ctx, out, in, iv);
+}
+
+static unsigned int
+encrypt_block (void *context , byte *outbuf, const byte *inbuf)
+{
+  BLOWFISH_context *c = (BLOWFISH_context *) context;
+  do_encrypt_block (c, outbuf, inbuf);
+  return /*burn_stack*/ (2*8);
+}
+
+static unsigned int
+decrypt_block (void *context, byte *outbuf, const byte *inbuf)
+{
+  BLOWFISH_context *c = (BLOWFISH_context *) context;
+  do_decrypt_block (c, outbuf, inbuf);
+  return /*burn_stack*/ (2*8);
+}
+
+#elif defined(USE_ARM_ASM)
+
+/* Assembly implementations of Blowfish. */
+extern void _gcry_blowfish_arm_do_encrypt(BLOWFISH_context *c, u32 *ret_xl,
+                                           u32 *ret_xr);
+
+extern void _gcry_blowfish_arm_encrypt_block(BLOWFISH_context *c, byte *out,
+                                              const byte *in);
+
+extern void _gcry_blowfish_arm_decrypt_block(BLOWFISH_context *c, byte *out,
+                                              const byte *in);
+
+/* These assembly implementations process two blocks in parallel. */
+extern void _gcry_blowfish_arm_ctr_enc(BLOWFISH_context *ctx, byte *out,
+                                        const byte *in, byte *ctr);
+
+extern void _gcry_blowfish_arm_cbc_dec(BLOWFISH_context *ctx, byte *out,
+                                        const byte *in, byte *iv);
+
+extern void _gcry_blowfish_arm_cfb_dec(BLOWFISH_context *ctx, byte *out,
+                                        const byte *in, byte *iv);
+
+static void
+do_encrypt ( BLOWFISH_context *bc, u32 *ret_xl, u32 *ret_xr )
+{
+  _gcry_blowfish_arm_do_encrypt (bc, ret_xl, ret_xr);
+}
+
+static void
+do_encrypt_block (BLOWFISH_context *context, byte *outbuf, const byte *inbuf)
+{
+  _gcry_blowfish_arm_encrypt_block (context, outbuf, inbuf);
+}
+
+static void
+do_decrypt_block (BLOWFISH_context *context, byte *outbuf, const byte *inbuf)
+{
+  _gcry_blowfish_arm_decrypt_block (context, outbuf, inbuf);
+}
+
+static unsigned int
+encrypt_block (void *context , byte *outbuf, const byte *inbuf)
+{
+  BLOWFISH_context *c = (BLOWFISH_context *) context;
+  do_encrypt_block (c, outbuf, inbuf);
+  return /*burn_stack*/ (10*4);
+}
+
+static unsigned int
+decrypt_block (void *context, byte *outbuf, const byte *inbuf)
+{
+  BLOWFISH_context *c = (BLOWFISH_context *) context;
+  do_decrypt_block (c, outbuf, inbuf);
+  return /*burn_stack*/ (10*4);
+}
+
+#else /*USE_ARM_ASM*/
+
+
+#define F(x) ((( s0[(x)>>24] + s1[((x)>>16)&0xff])      \
+                  ^ s2[((x)>>8)&0xff]) + s3[(x)&0xff] )
+#define R(l,r,i) do { l ^= p[i]; r ^= F(l); } while(0)
+#define R3(l,r,i) do { R(l##0,r##0,i);R(l##1,r##1,i);R(l##2,r##2,i);} while(0)
 
 
 static void
 do_encrypt ( BLOWFISH_context *bc, u32 *ret_xl, u32 *ret_xr )
 {
-#if BLOWFISH_ROUNDS == 16
   u32 xl, xr, *s0, *s1, *s2, *s3, *p;
 
   xl = *ret_xl;
@@ -287,16 +415,16 @@ do_encrypt ( BLOWFISH_context *bc, u32 *ret_xl, u32 
*ret_xr )
   s2 = bc->s2;
   s3 = bc->s3;
 
-  R( xl, xr,   0);
-  R( xr, xl,   1);
-  R( xl, xr,   2);
-  R( xr, xl,   3);
-  R( xl, xr,   4);
-  R( xr, xl,   5);
-  R( xl, xr,   6);
-  R( xr, xl,   7);
-  R( xl, xr,   8);
-  R( xr, xl,   9);
+  R( xl, xr,  0);
+  R( xr, xl,  1);
+  R( xl, xr,  2);
+  R( xr, xl,  3);
+  R( xl, xr,  4);
+  R( xr, xl,  5);
+  R( xl, xr,  6);
+  R( xr, xl,  7);
+  R( xl, xr,  8);
+  R( xr, xl,  9);
   R( xl, xr, 10);
   R( xr, xl, 11);
   R( xl, xr, 12);
@@ -304,45 +432,67 @@ do_encrypt ( BLOWFISH_context *bc, u32 *ret_xl, u32 
*ret_xr )
   R( xl, xr, 14);
   R( xr, xl, 15);
 
-  xl ^= p[BLOWFISH_ROUNDS];
-  xr ^= p[BLOWFISH_ROUNDS+1];
+  xl ^= p[16];
+  xr ^= p[16+1];
 
   *ret_xl = xr;
   *ret_xr = xl;
+}
 
-#else
-  u32 xl, xr, temp, *p;
-  int i;
 
-  xl = *ret_xl;
-  xr = *ret_xr;
+static void
+do_encrypt_3 ( BLOWFISH_context *bc, byte *dst, const byte *src )
+{
+  u32 xl0, xr0, xl1, xr1, xl2, xr2, *s0, *s1, *s2, *s3, *p;
+
+  xl0 = buf_get_be32(src + 0);
+  xr0 = buf_get_be32(src + 4);
+  xl1 = buf_get_be32(src + 8);
+  xr1 = buf_get_be32(src + 12);
+  xl2 = buf_get_be32(src + 16);
+  xr2 = buf_get_be32(src + 20);
   p = bc->p;
+  s0 = bc->s0;
+  s1 = bc->s1;
+  s2 = bc->s2;
+  s3 = bc->s3;
 
-  for(i=0; i < BLOWFISH_ROUNDS; i++ )
-    {
-      xl ^= p[i];
-      xr ^= function_F(bc, xl);
-      temp = xl;
-      xl = xr;
-      xr = temp;
-    }
-  temp = xl;
-  xl = xr;
-  xr = temp;
-
-  xr ^= p[BLOWFISH_ROUNDS];
-  xl ^= p[BLOWFISH_ROUNDS+1];
-
-  *ret_xl = xl;
-  *ret_xr = xr;
-#endif
+  R3( xl, xr,  0);
+  R3( xr, xl,  1);
+  R3( xl, xr,  2);
+  R3( xr, xl,  3);
+  R3( xl, xr,  4);
+  R3( xr, xl,  5);
+  R3( xl, xr,  6);
+  R3( xr, xl,  7);
+  R3( xl, xr,  8);
+  R3( xr, xl,  9);
+  R3( xl, xr, 10);
+  R3( xr, xl, 11);
+  R3( xl, xr, 12);
+  R3( xr, xl, 13);
+  R3( xl, xr, 14);
+  R3( xr, xl, 15);
+
+  xl0 ^= p[16];
+  xr0 ^= p[16+1];
+  xl1 ^= p[16];
+  xr1 ^= p[16+1];
+  xl2 ^= p[16];
+  xr2 ^= p[16+1];
+
+  buf_put_be32(dst + 0, xr0);
+  buf_put_be32(dst + 4, xl0);
+  buf_put_be32(dst + 8, xr1);
+  buf_put_be32(dst + 12, xl1);
+  buf_put_be32(dst + 16, xr2);
+  buf_put_be32(dst + 20, xl2);
 }
 
 
 static void
 decrypt ( BLOWFISH_context *bc, u32 *ret_xl, u32 *ret_xr )
 {
-#if BLOWFISH_ROUNDS == 16
   u32 xl, xr, *s0, *s1, *s2, *s3, *p;
 
   xl = *ret_xl;
@@ -361,77 +511,94 @@ decrypt ( BLOWFISH_context *bc, u32 *ret_xl, u32 *ret_xr )
   R( xr, xl, 12);
   R( xl, xr, 11);
   R( xr, xl, 10);
-  R( xl, xr,   9);
-  R( xr, xl,   8);
-  R( xl, xr,   7);
-  R( xr, xl,   6);
-  R( xl, xr,   5);
-  R( xr, xl,   4);
-  R( xl, xr,   3);
-  R( xr, xl,   2);
+  R( xl, xr,  9);
+  R( xr, xl,  8);
+  R( xl, xr,  7);
+  R( xr, xl,  6);
+  R( xl, xr,  5);
+  R( xr, xl,  4);
+  R( xl, xr,  3);
+  R( xr, xl,  2);
 
   xl ^= p[1];
   xr ^= p[0];
 
   *ret_xl = xr;
   *ret_xr = xl;
+}
 
-#else
-  u32 xl, xr, temp, *p;
-  int i;
 
-  xl = *ret_xl;
-  xr = *ret_xr;
+static void
+do_decrypt_3 ( BLOWFISH_context *bc, byte *dst, const byte *src )
+{
+  u32 xl0, xr0, xl1, xr1, xl2, xr2, *s0, *s1, *s2, *s3, *p;
+
+  xl0 = buf_get_be32(src + 0);
+  xr0 = buf_get_be32(src + 4);
+  xl1 = buf_get_be32(src + 8);
+  xr1 = buf_get_be32(src + 12);
+  xl2 = buf_get_be32(src + 16);
+  xr2 = buf_get_be32(src + 20);
   p = bc->p;
+  s0 = bc->s0;
+  s1 = bc->s1;
+  s2 = bc->s2;
+  s3 = bc->s3;
 
-  for (i=BLOWFISH_ROUNDS+1; i > 1; i-- )
-    {
-      xl ^= p[i];
-      xr ^= function_F(bc, xl);
-      temp = xl;
-      xl = xr;
-      xr = temp;
-    }
-
-  temp = xl;
-  xl = xr;
-  xr = temp;
-
-  xr ^= p[1];
-  xl ^= p[0];
-
-  *ret_xl = xl;
-  *ret_xr = xr;
-#endif
+  R3( xl, xr, 17);
+  R3( xr, xl, 16);
+  R3( xl, xr, 15);
+  R3( xr, xl, 14);
+  R3( xl, xr, 13);
+  R3( xr, xl, 12);
+  R3( xl, xr, 11);
+  R3( xr, xl, 10);
+  R3( xl, xr,  9);
+  R3( xr, xl,  8);
+  R3( xl, xr,  7);
+  R3( xr, xl,  6);
+  R3( xl, xr,  5);
+  R3( xr, xl,  4);
+  R3( xl, xr,  3);
+  R3( xr, xl,  2);
+
+  xl0 ^= p[1];
+  xr0 ^= p[0];
+  xl1 ^= p[1];
+  xr1 ^= p[0];
+  xl2 ^= p[1];
+  xr2 ^= p[0];
+
+  buf_put_be32(dst + 0, xr0);
+  buf_put_be32(dst + 4, xl0);
+  buf_put_be32(dst + 8, xr1);
+  buf_put_be32(dst + 12, xl1);
+  buf_put_be32(dst + 16, xr2);
+  buf_put_be32(dst + 20, xl2);
 }
 
 #undef F
 #undef R
+#undef R3
 
 static void
 do_encrypt_block ( BLOWFISH_context *bc, byte *outbuf, const byte *inbuf )
 {
   u32 d1, d2;
 
-  d1 = inbuf[0] << 24 | inbuf[1] << 16 | inbuf[2] << 8 | inbuf[3];
-  d2 = inbuf[4] << 24 | inbuf[5] << 16 | inbuf[6] << 8 | inbuf[7];
+  d1 = buf_get_be32(inbuf);
+  d2 = buf_get_be32(inbuf + 4);
   do_encrypt( bc, &d1, &d2 );
-  outbuf[0] = (d1 >> 24) & 0xff;
-  outbuf[1] = (d1 >> 16) & 0xff;
-  outbuf[2] = (d1 >>   8) & 0xff;
-  outbuf[3] =  d1         & 0xff;
-  outbuf[4] = (d2 >> 24) & 0xff;
-  outbuf[5] = (d2 >> 16) & 0xff;
-  outbuf[6] = (d2 >>   8) & 0xff;
-  outbuf[7] =  d2         & 0xff;
+  buf_put_be32(outbuf, d1);
+  buf_put_be32(outbuf + 4, d2);
 }
 
-static void
+static unsigned int
 encrypt_block (void *context, byte *outbuf, const byte *inbuf)
 {
   BLOWFISH_context *bc = (BLOWFISH_context *) context;
   do_encrypt_block (bc, outbuf, inbuf);
-  _gcry_burn_stack (64);
+  return /*burn_stack*/ (64);
 }
 
 
@@ -440,25 +607,251 @@ do_decrypt_block (BLOWFISH_context *bc, byte *outbuf, 
const byte *inbuf)
 {
   u32 d1, d2;
 
-  d1 = inbuf[0] << 24 | inbuf[1] << 16 | inbuf[2] << 8 | inbuf[3];
-  d2 = inbuf[4] << 24 | inbuf[5] << 16 | inbuf[6] << 8 | inbuf[7];
+  d1 = buf_get_be32(inbuf);
+  d2 = buf_get_be32(inbuf + 4);
   decrypt( bc, &d1, &d2 );
-  outbuf[0] = (d1 >> 24) & 0xff;
-  outbuf[1] = (d1 >> 16) & 0xff;
-  outbuf[2] = (d1 >>   8) & 0xff;
-  outbuf[3] =  d1         & 0xff;
-  outbuf[4] = (d2 >> 24) & 0xff;
-  outbuf[5] = (d2 >> 16) & 0xff;
-  outbuf[6] = (d2 >>   8) & 0xff;
-  outbuf[7] =  d2         & 0xff;
+  buf_put_be32(outbuf, d1);
+  buf_put_be32(outbuf + 4, d2);
 }
 
-static void
+static unsigned int
 decrypt_block (void *context, byte *outbuf, const byte *inbuf)
 {
   BLOWFISH_context *bc = (BLOWFISH_context *) context;
   do_decrypt_block (bc, outbuf, inbuf);
-  _gcry_burn_stack (64);
+  return /*burn_stack*/ (64);
+}
+
+#endif /*!USE_AMD64_ASM&&!USE_ARM_ASM*/
+
+
+/* Bulk encryption of complete blocks in CTR mode.  This function is only
+   intended for the bulk encryption feature of cipher.c.  CTR is expected to be
+   of size BLOWFISH_BLOCKSIZE. */
+static void
+_gcry_blowfish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
+                      const void *inbuf_arg, size_t nblocks)
+{
+  BLOWFISH_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned char tmpbuf[BLOWFISH_BLOCKSIZE * 3];
+  int burn_stack_depth = (64) + 4 * BLOWFISH_BLOCKSIZE;
+
+#ifdef USE_AMD64_ASM
+  {
+    if (nblocks >= 4)
+      burn_stack_depth += 5 * sizeof(void*);
+
+    /* Process data in 4 block chunks. */
+    while (nblocks >= 4)
+      {
+        blowfish_amd64_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+        nblocks -= 4;
+        outbuf += 4 * BLOWFISH_BLOCKSIZE;
+        inbuf  += 4 * BLOWFISH_BLOCKSIZE;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
+#elif defined(USE_ARM_ASM)
+  {
+    /* Process data in 2 block chunks. */
+    while (nblocks >= 2)
+      {
+        _gcry_blowfish_arm_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+        nblocks -= 2;
+        outbuf += 2 * BLOWFISH_BLOCKSIZE;
+        inbuf  += 2 * BLOWFISH_BLOCKSIZE;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
+#endif
+
+#if !defined(USE_AMD64_ASM) && !defined(USE_ARM_ASM)
+  for ( ;nblocks >= 3; nblocks -= 3)
+    {
+      /* Prepare the counter blocks. */
+      cipher_block_cpy (tmpbuf + 0, ctr, BLOWFISH_BLOCKSIZE);
+      cipher_block_cpy (tmpbuf + 8, ctr, BLOWFISH_BLOCKSIZE);
+      cipher_block_cpy (tmpbuf + 16, ctr, BLOWFISH_BLOCKSIZE);
+      cipher_block_add (tmpbuf + 8, 1, BLOWFISH_BLOCKSIZE);
+      cipher_block_add (tmpbuf + 16, 2, BLOWFISH_BLOCKSIZE);
+      cipher_block_add (ctr, 3, BLOWFISH_BLOCKSIZE);
+      /* Encrypt the counter. */
+      do_encrypt_3(ctx, tmpbuf, tmpbuf);
+      /* XOR the input with the encrypted counter and store in output.  */
+      buf_xor(outbuf, tmpbuf, inbuf, BLOWFISH_BLOCKSIZE * 3);
+      outbuf += BLOWFISH_BLOCKSIZE * 3;
+      inbuf  += BLOWFISH_BLOCKSIZE * 3;
+    }
+#endif
+
+  for ( ;nblocks; nblocks-- )
+    {
+      /* Encrypt the counter. */
+      do_encrypt_block(ctx, tmpbuf, ctr);
+      /* XOR the input with the encrypted counter and store in output.  */
+      cipher_block_xor(outbuf, tmpbuf, inbuf, BLOWFISH_BLOCKSIZE);
+      outbuf += BLOWFISH_BLOCKSIZE;
+      inbuf  += BLOWFISH_BLOCKSIZE;
+      /* Increment the counter.  */
+      cipher_block_add (ctr, 1, BLOWFISH_BLOCKSIZE);
+    }
+
+  wipememory(tmpbuf, sizeof(tmpbuf));
+  _gcry_burn_stack(burn_stack_depth);
+}
+
+
+/* Bulk decryption of complete blocks in CBC mode.  This function is only
+   intended for the bulk encryption feature of cipher.c. */
+static void
+_gcry_blowfish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
+                      const void *inbuf_arg, size_t nblocks)
+{
+  BLOWFISH_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned char savebuf[BLOWFISH_BLOCKSIZE * 3];
+  int burn_stack_depth = (64) + 4 * BLOWFISH_BLOCKSIZE;
+
+#ifdef USE_AMD64_ASM
+  {
+    if (nblocks >= 4)
+      burn_stack_depth += 5 * sizeof(void*);
+
+    /* Process data in 4 block chunks. */
+    while (nblocks >= 4)
+      {
+        blowfish_amd64_cbc_dec(ctx, outbuf, inbuf, iv);
+
+        nblocks -= 4;
+        outbuf += 4 * BLOWFISH_BLOCKSIZE;
+        inbuf  += 4 * BLOWFISH_BLOCKSIZE;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
+#elif defined(USE_ARM_ASM)
+  {
+    /* Process data in 2 block chunks. */
+    while (nblocks >= 2)
+      {
+        _gcry_blowfish_arm_cbc_dec(ctx, outbuf, inbuf, iv);
+
+        nblocks -= 2;
+        outbuf += 2 * BLOWFISH_BLOCKSIZE;
+        inbuf  += 2 * BLOWFISH_BLOCKSIZE;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
+#endif
+
+#if !defined(USE_AMD64_ASM) && !defined(USE_ARM_ASM)
+  for ( ;nblocks >= 3; nblocks -= 3)
+    {
+      /* INBUF is needed later and it may be identical to OUTBUF, so store
+         the intermediate result to SAVEBUF.  */
+      do_decrypt_3 (ctx, savebuf, inbuf);
+
+      cipher_block_xor_1 (savebuf + 0, iv, BLOWFISH_BLOCKSIZE);
+      cipher_block_xor_1 (savebuf + 8, inbuf, BLOWFISH_BLOCKSIZE * 2);
+      cipher_block_cpy (iv, inbuf + 16, BLOWFISH_BLOCKSIZE);
+      buf_cpy (outbuf, savebuf, BLOWFISH_BLOCKSIZE * 3);
+      inbuf += BLOWFISH_BLOCKSIZE * 3;
+      outbuf += BLOWFISH_BLOCKSIZE * 3;
+    }
+#endif
+
+  for ( ;nblocks; nblocks-- )
+    {
+      /* INBUF is needed later and it may be identical to OUTBUF, so store
+         the intermediate result to SAVEBUF.  */
+      do_decrypt_block (ctx, savebuf, inbuf);
+
+      cipher_block_xor_n_copy_2(outbuf, savebuf, iv, inbuf, 
BLOWFISH_BLOCKSIZE);
+      inbuf += BLOWFISH_BLOCKSIZE;
+      outbuf += BLOWFISH_BLOCKSIZE;
+    }
+
+  wipememory(savebuf, sizeof(savebuf));
+  _gcry_burn_stack(burn_stack_depth);
+}
+
+
+/* Bulk decryption of complete blocks in CFB mode.  This function is only
+   intended for the bulk encryption feature of cipher.c. */
+static void
+_gcry_blowfish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
+                      const void *inbuf_arg, size_t nblocks)
+{
+  BLOWFISH_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned char tmpbuf[BLOWFISH_BLOCKSIZE * 3];
+  int burn_stack_depth = (64) + 4 * BLOWFISH_BLOCKSIZE;
+
+#ifdef USE_AMD64_ASM
+  {
+    if (nblocks >= 4)
+      burn_stack_depth += 5 * sizeof(void*);
+
+    /* Process data in 4 block chunks. */
+    while (nblocks >= 4)
+      {
+        blowfish_amd64_cfb_dec(ctx, outbuf, inbuf, iv);
+
+        nblocks -= 4;
+        outbuf += 4 * BLOWFISH_BLOCKSIZE;
+        inbuf  += 4 * BLOWFISH_BLOCKSIZE;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
+#elif defined(USE_ARM_ASM)
+  {
+    /* Process data in 2 block chunks. */
+    while (nblocks >= 2)
+      {
+        _gcry_blowfish_arm_cfb_dec(ctx, outbuf, inbuf, iv);
+
+        nblocks -= 2;
+        outbuf += 2 * BLOWFISH_BLOCKSIZE;
+        inbuf  += 2 * BLOWFISH_BLOCKSIZE;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
+#endif
+
+#if !defined(USE_AMD64_ASM) && !defined(USE_ARM_ASM)
+  for ( ;nblocks >= 3; nblocks -= 3 )
+    {
+      cipher_block_cpy (tmpbuf + 0, iv, BLOWFISH_BLOCKSIZE);
+      cipher_block_cpy (tmpbuf + 8, inbuf + 0, BLOWFISH_BLOCKSIZE * 2);
+      cipher_block_cpy (iv, inbuf + 16, BLOWFISH_BLOCKSIZE);
+      do_encrypt_3 (ctx, tmpbuf, tmpbuf);
+      buf_xor (outbuf, inbuf, tmpbuf, BLOWFISH_BLOCKSIZE * 3);
+      outbuf += BLOWFISH_BLOCKSIZE * 3;
+      inbuf  += BLOWFISH_BLOCKSIZE * 3;
+    }
+#endif
+
+  for ( ;nblocks; nblocks-- )
+    {
+      do_encrypt_block(ctx, iv, iv);
+      cipher_block_xor_n_copy(outbuf, iv, inbuf, BLOWFISH_BLOCKSIZE);
+      outbuf += BLOWFISH_BLOCKSIZE;
+      inbuf  += BLOWFISH_BLOCKSIZE;
+    }
+
+  wipememory(tmpbuf, sizeof(tmpbuf));
+  _gcry_burn_stack(burn_stack_depth);
 }
 
 
@@ -466,14 +859,19 @@ static const char*
 selftest(void)
 {
   BLOWFISH_context c;
+  cipher_bulk_ops_t bulk_ops;
   byte plain[] = "BLOWFISH";
   byte buffer[8];
-  byte plain3[] = { 0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54, 0x32, 0x10 };
-  byte key3[] = { 0x41, 0x79, 0x6E, 0xA0, 0x52, 0x61, 0x6E, 0xE4 };
-  byte cipher3[] = { 0xE1, 0x13, 0xF4, 0x10, 0x2C, 0xFC, 0xCE, 0x43 };
+  static const byte plain3[] =
+    { 0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54, 0x32, 0x10 };
+  static const byte key3[] =
+    { 0x41, 0x79, 0x6E, 0xA0, 0x52, 0x61, 0x6E, 0xE4 };
+  static const byte cipher3[] =
+    { 0xE1, 0x13, 0xF4, 0x10, 0x2C, 0xFC, 0xCE, 0x43 };
 
   bf_setkey( (void *) &c,
-             (const unsigned char*)"abcdefghijklmnopqrstuvwxyz", 26 );
+             (const unsigned char*)"abcdefghijklmnopqrstuvwxyz", 26,
+             &bulk_ops );
   encrypt_block( (void *) &c, buffer, plain );
   if( memcmp( buffer, "\x32\x4E\xD0\xFE\xF4\x13\xA2\x03", 8 ) )
     return "Blowfish selftest failed (1).";
@@ -481,22 +879,79 @@ selftest(void)
   if( memcmp( buffer, plain, 8 ) )
     return "Blowfish selftest failed (2).";
 
-  bf_setkey( (void *) &c, key3, 8 );
+  bf_setkey( (void *) &c, key3, 8, &bulk_ops );
   encrypt_block( (void *) &c, buffer, plain3 );
   if( memcmp( buffer, cipher3, 8 ) )
     return "Blowfish selftest failed (3).";
   decrypt_block( (void *) &c, buffer, buffer );
   if( memcmp( buffer, plain3, 8 ) )
     return "Blowfish selftest failed (4).";
+
   return NULL;
 }
 
 
+struct hashset_elem {
+  u32 val;
+  short nidx;
+  char used;
+};
+
+static inline byte
+val_to_hidx(u32 val)
+{
+  /* bf sboxes are quite random already. */
+  return (val >> 24) ^ (val >> 16)  ^ (val >> 8) ^ val;
+}
+
+static inline int
+add_val(struct hashset_elem hset[256], u32 val, int *midx,
+       struct hashset_elem *mpool)
+{
+  struct hashset_elem *elem;
+  byte hidx;
+
+  hidx = val_to_hidx(val);
+  elem = &hset[hidx];
+
+  /* Check if first is in use. */
+  if (elem->used == 0)
+    {
+      elem->val = val;
+      elem->nidx = -1;
+      elem->used = 1;
+      return 0;
+    }
+
+  /* Check if first matches. */
+  if (elem->val == val)
+    return 1;
+
+  for (; elem->nidx >= 0; elem = &mpool[elem->nidx])
+    {
+      /* Check if elem matches. */
+      if (elem->val == val)
+        return 1;
+    }
+
+  elem->nidx = (*midx)++;
+  elem = &mpool[elem->nidx];
+
+  elem->val = val;
+  elem->nidx = -1;
+  elem->used = 1;
+
+  return 0;
+}
 
 static gcry_err_code_t
 do_bf_setkey (BLOWFISH_context *c, const byte *key, unsigned keylen)
 {
-  int i, j;
+  struct hashset_elem mempool[4 * 255]; /* Enough entries for the worst case. 
*/
+  struct hashset_elem hset[4][256];
+  int memidx = 0;
+  int weak = 0;
+  int i, j, ret;
   u32 data, datal, datar;
   static int initialized;
   static const char *selftest_failed;
@@ -511,7 +966,13 @@ do_bf_setkey (BLOWFISH_context *c, const byte *key, 
unsigned keylen)
   if( selftest_failed )
     return GPG_ERR_SELFTEST_FAILED;
 
-  for(i=0; i < BLOWFISH_ROUNDS+2; i++ )
+  if (keylen < BLOWFISH_KEY_MIN_BITS / 8 ||
+      keylen > BLOWFISH_KEY_MAX_BITS / 8)
+    return GPG_ERR_INV_KEYLEN;
+
+  memset(hset, 0, sizeof(hset));
+
+  for(i=0; i < 16+2; i++ )
     c->p[i] = ps[i];
   for(i=0; i < 256; i++ )
     {
@@ -521,25 +982,18 @@ do_bf_setkey (BLOWFISH_context *c, const byte *key, 
unsigned keylen)
       c->s3[i] = ks3[i];
     }
 
-  for(i=j=0; i < BLOWFISH_ROUNDS+2; i++ )
+  for(i=j=0; i < 16+2; i++ )
     {
-#ifdef WORDS_BIGENDIAN
-      ((byte*)&data)[0] = key[j];
-      ((byte*)&data)[1] = key[(j+1)%keylen];
-      ((byte*)&data)[2] = key[(j+2)%keylen];
-      ((byte*)&data)[3] = key[(j+3)%keylen];
-#else
-      ((byte*)&data)[3] = key[j];
-      ((byte*)&data)[2] = key[(j+1)%keylen];
-      ((byte*)&data)[1] = key[(j+2)%keylen];
-      ((byte*)&data)[0] = key[(j+3)%keylen];
-#endif
+      data = ((u32)key[j] << 24) |
+             ((u32)key[(j+1)%keylen] << 16) |
+             ((u32)key[(j+2)%keylen] << 8) |
+             ((u32)key[(j+3)%keylen]);
       c->p[i] ^= data;
       j = (j+4) % keylen;
     }
 
   datal = datar = 0;
-  for(i=0; i < BLOWFISH_ROUNDS+2; i += 2 )
+  for(i=0; i < 16+2; i += 2 )
     {
       do_encrypt( c, &datal, &datar );
       c->p[i]   = datal;
@@ -550,55 +1004,85 @@ do_bf_setkey (BLOWFISH_context *c, const byte *key, 
unsigned keylen)
       do_encrypt( c, &datal, &datar );
       c->s0[i]   = datal;
       c->s0[i+1] = datar;
+
+      /* Add values to hashset, detect duplicates (weak keys). */
+      ret = add_val (hset[0], datal, &memidx, mempool);
+      weak = ret ? 1 : weak;
+      ret = add_val (hset[0], datar, &memidx, mempool);
+      weak = ret ? 1 : weak;
     }
   for(i=0; i < 256; i += 2 )
     {
       do_encrypt( c, &datal, &datar );
       c->s1[i]   = datal;
       c->s1[i+1] = datar;
+
+      /* Add values to hashset, detect duplicates (weak keys). */
+      ret = add_val (hset[1], datal, &memidx, mempool);
+      weak = ret ? 1 : weak;
+      ret = add_val (hset[1], datar, &memidx, mempool);
+      weak = ret ? 1 : weak;
     }
   for(i=0; i < 256; i += 2 )
     {
       do_encrypt( c, &datal, &datar );
       c->s2[i]   = datal;
       c->s2[i+1] = datar;
+
+      /* Add values to hashset, detect duplicates (weak keys). */
+      ret = add_val (hset[2], datal, &memidx, mempool);
+      weak = ret ? 1 : weak;
+      ret = add_val (hset[2], datar, &memidx, mempool);
+      weak = ret ? 1 : weak;
     }
   for(i=0; i < 256; i += 2 )
     {
       do_encrypt( c, &datal, &datar );
       c->s3[i]   = datal;
       c->s3[i+1] = datar;
+
+      /* Add values to hashset, detect duplicates (weak keys). */
+      ret = add_val (hset[3], datal, &memidx, mempool);
+      weak = ret ? 1 : weak;
+      ret = add_val (hset[3], datar, &memidx, mempool);
+      weak = ret ? 1 : weak;
     }
 
+  /* Clear stack. */
+  wipememory(hset, sizeof(hset));
+  wipememory(mempool, sizeof(mempool[0]) * memidx);
+
+  _gcry_burn_stack (64);
 
   /* Check for weak key.  A weak key is a key in which a value in
      the P-array (here c) occurs more than once per table.  */
-  for(i=0; i < 255; i++ )
-    {
-      for( j=i+1; j < 256; j++)
-        {
-          if( (c->s0[i] == c->s0[j]) || (c->s1[i] == c->s1[j]) ||
-              (c->s2[i] == c->s2[j]) || (c->s3[i] == c->s3[j]) )
-            return GPG_ERR_WEAK_KEY;
-        }
-    }
+  if (weak)
+    return GPG_ERR_WEAK_KEY;
 
   return GPG_ERR_NO_ERROR;
 }
 
 
 static gcry_err_code_t
-bf_setkey (void *context, const byte *key, unsigned keylen)
+bf_setkey (void *context, const byte *key, unsigned keylen,
+           cipher_bulk_ops_t *bulk_ops)
 {
   BLOWFISH_context *c = (BLOWFISH_context *) context;
   gcry_err_code_t rc = do_bf_setkey (c, key, keylen);
-  _gcry_burn_stack (64);
+
+  /* Setup bulk encryption routines.  */
+  memset (bulk_ops, 0, sizeof(*bulk_ops));
+  bulk_ops->cfb_dec = _gcry_blowfish_cfb_dec;
+  bulk_ops->cbc_dec = _gcry_blowfish_cbc_dec;
+  bulk_ops->ctr_enc = _gcry_blowfish_ctr_enc;
+
   return rc;
 }
 
 
 gcry_cipher_spec_t _gcry_cipher_spec_blowfish =
   {
+    GCRY_CIPHER_BLOWFISH, {0, 0},
     "BLOWFISH", NULL, NULL, BLOWFISH_BLOCKSIZE, 128,
     sizeof (BLOWFISH_context),
     bf_setkey, encrypt_block, decrypt_block
diff --git a/grub-core/lib/libgcrypt/cipher/bufhelp.h 
b/grub-core/lib/libgcrypt/cipher/bufhelp.h
index df3559472..6dcc622a1 100644
--- a/grub-core/lib/libgcrypt/cipher/bufhelp.h
+++ b/grub-core/lib/libgcrypt/cipher/bufhelp.h
@@ -1,5 +1,5 @@
 /* bufhelp.h  -  Some buffer manipulation helpers
- * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ * Copyright (C) 2012-2017 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -20,269 +20,23 @@
 #define GCRYPT_BUFHELP_H
 
 
+#include "g10lib.h"
 #include "bithelp.h"
+#include "const-time.h"
 
 
-#undef BUFHELP_FAST_UNALIGNED_ACCESS
+#undef BUFHELP_UNALIGNED_ACCESS
 #if defined(HAVE_GCC_ATTRIBUTE_PACKED) && \
     defined(HAVE_GCC_ATTRIBUTE_ALIGNED) && \
-    (defined(__i386__) || defined(__x86_64__) || \
-     (defined(__arm__) && defined(__ARM_FEATURE_UNALIGNED)) || \
-     defined(__aarch64__))
-/* These architectures are able of unaligned memory accesses and can
-   handle those fast.
+    defined(HAVE_GCC_ATTRIBUTE_MAY_ALIAS)
+/* Compiler is supports attributes needed for automatically issuing unaligned
+   memory access instructions.
  */
-# define BUFHELP_FAST_UNALIGNED_ACCESS 1
+# define BUFHELP_UNALIGNED_ACCESS 1
 #endif
 
 
-#ifdef BUFHELP_FAST_UNALIGNED_ACCESS
-/* Define type with one-byte alignment on architectures with fast unaligned
-   memory accesses.
- */
-typedef struct bufhelp_int_s
-{
-  uintptr_t a;
-} __attribute__((packed, aligned(1))) bufhelp_int_t;
-#else
-/* Define type with default alignment for other architectures (unaligned
-   accessed handled in per byte loops).
- */
-typedef struct bufhelp_int_s
-{
-  uintptr_t a;
-} bufhelp_int_t;
-#endif
-
-
-/* Optimized function for small buffer copying */
-static inline void
-buf_cpy(void *_dst, const void *_src, size_t len)
-{
-#if __GNUC__ >= 4 && (defined(__x86_64__) || defined(__i386__))
-  /* For AMD64 and i386, memcpy is faster.  */
-  memcpy(_dst, _src, len);
-#else
-  byte *dst = _dst;
-  const byte *src = _src;
-  bufhelp_int_t *ldst;
-  const bufhelp_int_t *lsrc;
-#ifndef BUFHELP_FAST_UNALIGNED_ACCESS
-  const unsigned int longmask = sizeof(bufhelp_int_t) - 1;
-
-  /* Skip fast processing if buffers are unaligned.  */
-  if (((uintptr_t)dst | (uintptr_t)src) & longmask)
-    goto do_bytes;
-#endif
-
-  ldst = (bufhelp_int_t *)(void *)dst;
-  lsrc = (const bufhelp_int_t *)(const void *)src;
-
-  for (; len >= sizeof(bufhelp_int_t); len -= sizeof(bufhelp_int_t))
-    (ldst++)->a = (lsrc++)->a;
-
-  dst = (byte *)ldst;
-  src = (const byte *)lsrc;
-
-#ifndef BUFHELP_FAST_UNALIGNED_ACCESS
-do_bytes:
-#endif
-  /* Handle tail.  */
-  for (; len; len--)
-    *dst++ = *src++;
-#endif /*__GNUC__ >= 4 && (__x86_64__ || __i386__)*/
-}
-
-
-/* Optimized function for buffer xoring */
-static inline void
-buf_xor(void *_dst, const void *_src1, const void *_src2, size_t len)
-{
-  byte *dst = _dst;
-  const byte *src1 = _src1;
-  const byte *src2 = _src2;
-  bufhelp_int_t *ldst;
-  const bufhelp_int_t *lsrc1, *lsrc2;
-#ifndef BUFHELP_FAST_UNALIGNED_ACCESS
-  const unsigned int longmask = sizeof(bufhelp_int_t) - 1;
-
-  /* Skip fast processing if buffers are unaligned.  */
-  if (((uintptr_t)dst | (uintptr_t)src1 | (uintptr_t)src2) & longmask)
-    goto do_bytes;
-#endif
-
-  ldst = (bufhelp_int_t *)(void *)dst;
-  lsrc1 = (const bufhelp_int_t *)(const void *)src1;
-  lsrc2 = (const bufhelp_int_t *)(const void *)src2;
-
-  for (; len >= sizeof(bufhelp_int_t); len -= sizeof(bufhelp_int_t))
-    (ldst++)->a = (lsrc1++)->a ^ (lsrc2++)->a;
-
-  dst = (byte *)ldst;
-  src1 = (const byte *)lsrc1;
-  src2 = (const byte *)lsrc2;
-
-#ifndef BUFHELP_FAST_UNALIGNED_ACCESS
-do_bytes:
-#endif
-  /* Handle tail.  */
-  for (; len; len--)
-    *dst++ = *src1++ ^ *src2++;
-}
-
-
-/* Optimized function for in-place buffer xoring. */
-static inline void
-buf_xor_1(void *_dst, const void *_src, size_t len)
-{
-  byte *dst = _dst;
-  const byte *src = _src;
-  bufhelp_int_t *ldst;
-  const bufhelp_int_t *lsrc;
-#ifndef BUFHELP_FAST_UNALIGNED_ACCESS
-  const unsigned int longmask = sizeof(bufhelp_int_t) - 1;
-
-  /* Skip fast processing if buffers are unaligned.  */
-  if (((uintptr_t)dst | (uintptr_t)src) & longmask)
-    goto do_bytes;
-#endif
-
-  ldst = (bufhelp_int_t *)(void *)dst;
-  lsrc = (const bufhelp_int_t *)(const void *)src;
-
-  for (; len >= sizeof(bufhelp_int_t); len -= sizeof(bufhelp_int_t))
-    (ldst++)->a ^= (lsrc++)->a;
-
-  dst = (byte *)ldst;
-  src = (const byte *)lsrc;
-
-#ifndef BUFHELP_FAST_UNALIGNED_ACCESS
-do_bytes:
-#endif
-  /* Handle tail.  */
-  for (; len; len--)
-    *dst++ ^= *src++;
-}
-
-
-/* Optimized function for buffer xoring with two destination buffers.  Used
-   mainly by CFB mode encryption.  */
-static inline void
-buf_xor_2dst(void *_dst1, void *_dst2, const void *_src, size_t len)
-{
-  byte *dst1 = _dst1;
-  byte *dst2 = _dst2;
-  const byte *src = _src;
-  bufhelp_int_t *ldst1, *ldst2;
-  const bufhelp_int_t *lsrc;
-#ifndef BUFHELP_FAST_UNALIGNED_ACCESS
-  const unsigned int longmask = sizeof(bufhelp_int_t) - 1;
-
-  /* Skip fast processing if buffers are unaligned.  */
-  if (((uintptr_t)src | (uintptr_t)dst1 | (uintptr_t)dst2) & longmask)
-    goto do_bytes;
-#endif
-
-  ldst1 = (bufhelp_int_t *)(void *)dst1;
-  ldst2 = (bufhelp_int_t *)(void *)dst2;
-  lsrc = (const bufhelp_int_t *)(const void *)src;
-
-  for (; len >= sizeof(bufhelp_int_t); len -= sizeof(bufhelp_int_t))
-    (ldst1++)->a = ((ldst2++)->a ^= (lsrc++)->a);
-
-  dst1 = (byte *)ldst1;
-  dst2 = (byte *)ldst2;
-  src = (const byte *)lsrc;
-
-#ifndef BUFHELP_FAST_UNALIGNED_ACCESS
-do_bytes:
-#endif
-  /* Handle tail.  */
-  for (; len; len--)
-    *dst1++ = (*dst2++ ^= *src++);
-}
-
-
-/* Optimized function for combined buffer xoring and copying.  Used by mainly
-   CBC mode decryption.  */
-static inline void
-buf_xor_n_copy_2(void *_dst_xor, const void *_src_xor, void *_srcdst_cpy,
-                const void *_src_cpy, size_t len)
-{
-  byte *dst_xor = _dst_xor;
-  byte *srcdst_cpy = _srcdst_cpy;
-  const byte *src_xor = _src_xor;
-  const byte *src_cpy = _src_cpy;
-  byte temp;
-  bufhelp_int_t *ldst_xor, *lsrcdst_cpy;
-  const bufhelp_int_t *lsrc_cpy, *lsrc_xor;
-  uintptr_t ltemp;
-#ifndef BUFHELP_FAST_UNALIGNED_ACCESS
-  const unsigned int longmask = sizeof(bufhelp_int_t) - 1;
-
-  /* Skip fast processing if buffers are unaligned.  */
-  if (((uintptr_t)src_cpy | (uintptr_t)src_xor | (uintptr_t)dst_xor |
-       (uintptr_t)srcdst_cpy) & longmask)
-    goto do_bytes;
-#endif
-
-  ldst_xor = (bufhelp_int_t *)(void *)dst_xor;
-  lsrc_xor = (const bufhelp_int_t *)(void *)src_xor;
-  lsrcdst_cpy = (bufhelp_int_t *)(void *)srcdst_cpy;
-  lsrc_cpy = (const bufhelp_int_t *)(const void *)src_cpy;
-
-  for (; len >= sizeof(bufhelp_int_t); len -= sizeof(bufhelp_int_t))
-    {
-      ltemp = (lsrc_cpy++)->a;
-      (ldst_xor++)->a = (lsrcdst_cpy)->a ^ (lsrc_xor++)->a;
-      (lsrcdst_cpy++)->a = ltemp;
-    }
-
-  dst_xor = (byte *)ldst_xor;
-  src_xor = (const byte *)lsrc_xor;
-  srcdst_cpy = (byte *)lsrcdst_cpy;
-  src_cpy = (const byte *)lsrc_cpy;
-
-#ifndef BUFHELP_FAST_UNALIGNED_ACCESS
-do_bytes:
-#endif
-  /* Handle tail.  */
-  for (; len; len--)
-    {
-      temp = *src_cpy++;
-      *dst_xor++ = *srcdst_cpy ^ *src_xor++;
-      *srcdst_cpy++ = temp;
-    }
-}
-
-
-/* Optimized function for combined buffer xoring and copying.  Used by mainly
-   CFB mode decryption.  */
-static inline void
-buf_xor_n_copy(void *_dst_xor, void *_srcdst_cpy, const void *_src, size_t len)
-{
-  buf_xor_n_copy_2(_dst_xor, _src, _srcdst_cpy, _src, len);
-}
-
-
-/* Constant-time compare of two buffers.  Returns 1 if buffers are equal,
-   and 0 if buffers differ.  */
-static inline int
-buf_eq_const(const void *_a, const void *_b, size_t len)
-{
-  const byte *a = _a;
-  const byte *b = _b;
-  size_t diff, i;
-
-  /* Constant-time compare. */
-  for (i = 0, diff = 0; i < len; i++)
-    diff -= !!(a[i] - b[i]);
-
-  return !diff;
-}
-
-
-#ifndef BUFHELP_FAST_UNALIGNED_ACCESS
+#ifndef BUFHELP_UNALIGNED_ACCESS
 
 /* Functions for loading and storing unaligned u32 values of different
    endianness.  */
@@ -365,12 +119,12 @@ static inline void buf_put_le64(void *_buf, u64 val)
   out[0] = val;
 }
 
-#else /*BUFHELP_FAST_UNALIGNED_ACCESS*/
+#else /*BUFHELP_UNALIGNED_ACCESS*/
 
 typedef struct bufhelp_u32_s
 {
   u32 a;
-} __attribute__((packed, aligned(1))) bufhelp_u32_t;
+} __attribute__((packed, aligned(1), may_alias)) bufhelp_u32_t;
 
 /* Functions for loading and storing unaligned u32 values of different
    endianness.  */
@@ -400,7 +154,7 @@ static inline void buf_put_le32(void *_buf, u32 val)
 typedef struct bufhelp_u64_s
 {
   u64 a;
-} __attribute__((packed, aligned(1))) bufhelp_u64_t;
+} __attribute__((packed, aligned(1), may_alias)) bufhelp_u64_t;
 
 /* Functions for loading and storing unaligned u64 values of different
    endianness.  */
@@ -426,7 +180,193 @@ static inline void buf_put_le64(void *_buf, u64 val)
   out->a = le_bswap64(val);
 }
 
+#endif /*BUFHELP_UNALIGNED_ACCESS*/
+
+
+/* Host-endian get/put macros */
+#ifdef WORDS_BIGENDIAN
+# define buf_get_he32 buf_get_be32
+# define buf_put_he32 buf_put_be32
+# define buf_get_he64 buf_get_be64
+# define buf_put_he64 buf_put_be64
+#else
+# define buf_get_he32 buf_get_le32
+# define buf_put_he32 buf_put_le32
+# define buf_get_he64 buf_get_le64
+# define buf_put_he64 buf_put_le64
+#endif
+
+
+
+/* Optimized function for small buffer copying */
+static inline void
+buf_cpy(void *_dst, const void *_src, size_t len)
+{
+  byte *dst = _dst;
+  const byte *src = _src;
+
+#if __GNUC__ >= 4
+  if (!__builtin_constant_p (len))
+    {
+      if (UNLIKELY(len == 0))
+       return;
+      memcpy(_dst, _src, len);
+      return;
+    }
+#endif
+
+  while (len >= sizeof(u64))
+    {
+      buf_put_he64(dst, buf_get_he64(src));
+      dst += sizeof(u64);
+      src += sizeof(u64);
+      len -= sizeof(u64);
+    }
+
+  if (len >= sizeof(u32))
+    {
+      buf_put_he32(dst, buf_get_he32(src));
+      dst += sizeof(u32);
+      src += sizeof(u32);
+      len -= sizeof(u32);
+    }
+
+  /* Handle tail.  */
+  for (; len; len--)
+    *dst++ = *src++;
+}
+
+
+/* Optimized function for buffer xoring */
+static inline void
+buf_xor(void *_dst, const void *_src1, const void *_src2, size_t len)
+{
+  byte *dst = _dst;
+  const byte *src1 = _src1;
+  const byte *src2 = _src2;
+
+  while (len >= sizeof(u64))
+    {
+      buf_put_he64(dst, buf_get_he64(src1) ^ buf_get_he64(src2));
+      dst += sizeof(u64);
+      src1 += sizeof(u64);
+      src2 += sizeof(u64);
+      len -= sizeof(u64);
+    }
+
+  if (len > sizeof(u32))
+    {
+      buf_put_he32(dst, buf_get_he32(src1) ^ buf_get_he32(src2));
+      dst += sizeof(u32);
+      src1 += sizeof(u32);
+      src2 += sizeof(u32);
+      len -= sizeof(u32);
+    }
+
+  /* Handle tail.  */
+  for (; len; len--)
+    *dst++ = *src1++ ^ *src2++;
+}
+
+
+/* Optimized function for buffer xoring with two destination buffers.  Used
+   mainly by CFB mode encryption.  */
+static inline void
+buf_xor_2dst(void *_dst1, void *_dst2, const void *_src, size_t len)
+{
+  byte *dst1 = _dst1;
+  byte *dst2 = _dst2;
+  const byte *src = _src;
+
+  while (len >= sizeof(u64))
+    {
+      u64 temp = buf_get_he64(dst2) ^ buf_get_he64(src);
+      buf_put_he64(dst2, temp);
+      buf_put_he64(dst1, temp);
+      dst2 += sizeof(u64);
+      dst1 += sizeof(u64);
+      src += sizeof(u64);
+      len -= sizeof(u64);
+    }
+
+  if (len >= sizeof(u32))
+    {
+      u32 temp = buf_get_he32(dst2) ^ buf_get_he32(src);
+      buf_put_he32(dst2, temp);
+      buf_put_he32(dst1, temp);
+      dst2 += sizeof(u32);
+      dst1 += sizeof(u32);
+      src += sizeof(u32);
+      len -= sizeof(u32);
+    }
+
+  /* Handle tail.  */
+  for (; len; len--)
+    *dst1++ = (*dst2++ ^= *src++);
+}
+
+
+/* Optimized function for combined buffer xoring and copying.  Used by mainly
+   CBC mode decryption.  */
+static inline void
+buf_xor_n_copy_2(void *_dst_xor, const void *_src_xor, void *_srcdst_cpy,
+                const void *_src_cpy, size_t len)
+{
+  byte *dst_xor = _dst_xor;
+  byte *srcdst_cpy = _srcdst_cpy;
+  const byte *src_xor = _src_xor;
+  const byte *src_cpy = _src_cpy;
+
+  while (len >= sizeof(u64))
+    {
+      u64 temp = buf_get_he64(src_cpy);
+      buf_put_he64(dst_xor, buf_get_he64(srcdst_cpy) ^ buf_get_he64(src_xor));
+      buf_put_he64(srcdst_cpy, temp);
+      dst_xor += sizeof(u64);
+      srcdst_cpy += sizeof(u64);
+      src_xor += sizeof(u64);
+      src_cpy += sizeof(u64);
+      len -= sizeof(u64);
+    }
+
+  if (len >= sizeof(u32))
+    {
+      u32 temp = buf_get_he32(src_cpy);
+      buf_put_he32(dst_xor, buf_get_he32(srcdst_cpy) ^ buf_get_he32(src_xor));
+      buf_put_he32(srcdst_cpy, temp);
+      dst_xor += sizeof(u32);
+      srcdst_cpy += sizeof(u32);
+      src_xor += sizeof(u32);
+      src_cpy += sizeof(u32);
+      len -= sizeof(u32);
+    }
+
+  /* Handle tail.  */
+  for (; len; len--)
+    {
+      byte temp = *src_cpy++;
+      *dst_xor++ = *srcdst_cpy ^ *src_xor++;
+      *srcdst_cpy++ = temp;
+    }
+}
+
+
+/* Optimized function for combined buffer xoring and copying.  Used by mainly
+   CFB mode decryption.  */
+static inline void
+buf_xor_n_copy(void *_dst_xor, void *_srcdst_cpy, const void *_src, size_t len)
+{
+  buf_xor_n_copy_2(_dst_xor, _src, _srcdst_cpy, _src, len);
+}
+
+
+/* Constant-time compare of two buffers.  Returns 1 if buffers are equal,
+   and 0 if buffers differ.  */
+static inline int
+buf_eq_const(const void *a, const void *b, size_t len)
+{
+  return ct_memequal (a, b, len);
+}
 
-#endif /*BUFHELP_FAST_UNALIGNED_ACCESS*/
 
 #endif /*GCRYPT_BUFHELP_H*/
diff --git a/grub-core/lib/libgcrypt/cipher/bulkhelp.h 
b/grub-core/lib/libgcrypt/cipher/bulkhelp.h
new file mode 100644
index 000000000..833262e2a
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/bulkhelp.h
@@ -0,0 +1,493 @@
+/* bulkhelp.h  -  Some bulk processing helpers
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef GCRYPT_BULKHELP_H
+#define GCRYPT_BULKHELP_H
+
+
+#include "g10lib.h"
+#include "cipher-internal.h"
+
+
+#ifdef __x86_64__
+/* Use u64 to store pointers for x32 support (assembly function assumes
+ * 64-bit pointers). */
+typedef u64 ocb_L_uintptr_t;
+#else
+typedef uintptr_t ocb_L_uintptr_t;
+#endif
+
+typedef unsigned int (*bulk_crypt_fn_t) (void *ctx, byte *out,
+                                         const byte *in,
+                                         size_t num_blks);
+
+
+static inline ocb_L_uintptr_t *
+bulk_ocb_prepare_L_pointers_array_blk64 (gcry_cipher_hd_t c,
+                                         ocb_L_uintptr_t Ls[64], u64 blkn)
+{
+  unsigned int n = 64 - (blkn % 64);
+  unsigned int i;
+
+  for (i = 0; i < 64; i += 8)
+    {
+      Ls[(i + 0 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+      Ls[(i + 1 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+      Ls[(i + 2 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+      Ls[(i + 3 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+      Ls[(i + 4 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+      Ls[(i + 5 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+      Ls[(i + 6 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+    }
+
+  Ls[(7 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+  Ls[(15 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[4];
+  Ls[(23 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+  Ls[(31 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[5];
+  Ls[(39 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+  Ls[(47 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[4];
+  Ls[(55 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+  return &Ls[(63 + n) % 64];
+}
+
+
+static inline ocb_L_uintptr_t *
+bulk_ocb_prepare_L_pointers_array_blk32 (gcry_cipher_hd_t c,
+                                         ocb_L_uintptr_t Ls[32], u64 blkn)
+{
+  unsigned int n = 32 - (blkn % 32);
+  unsigned int i;
+
+  for (i = 0; i < 32; i += 8)
+    {
+      Ls[(i + 0 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+      Ls[(i + 1 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+      Ls[(i + 2 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+      Ls[(i + 3 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+      Ls[(i + 4 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+      Ls[(i + 5 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+      Ls[(i + 6 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+    }
+
+  Ls[(7 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+  Ls[(15 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[4];
+  Ls[(23 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+  return &Ls[(31 + n) % 32];
+}
+
+
+static inline ocb_L_uintptr_t *
+bulk_ocb_prepare_L_pointers_array_blk16 (gcry_cipher_hd_t c,
+                                         ocb_L_uintptr_t Ls[16], u64 blkn)
+{
+  unsigned int n = 16 - (blkn % 16);
+  unsigned int i;
+
+  for (i = 0; i < 16; i += 8)
+    {
+      Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+      Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+      Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+      Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+      Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+      Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+      Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+    }
+
+  Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+  return &Ls[(15 + n) % 16];
+}
+
+
+static inline ocb_L_uintptr_t *
+bulk_ocb_prepare_L_pointers_array_blk8 (gcry_cipher_hd_t c,
+                                        ocb_L_uintptr_t Ls[8], u64 blkn)
+{
+  unsigned int n = 8 - (blkn % 8);
+
+  Ls[(0 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+  Ls[(1 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+  Ls[(2 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+  Ls[(3 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+  Ls[(4 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+  Ls[(5 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+  Ls[(6 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+  Ls[(7 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+
+  return &Ls[(7 + n) % 8];
+}
+
+
+static inline unsigned int
+bulk_ctr_enc_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf,
+                  const byte *inbuf, size_t nblocks, byte *ctr,
+                  byte *tmpbuf, size_t tmpbuf_nblocks,
+                  unsigned int *num_used_tmpblocks)
+{
+  unsigned int tmp_used = 16;
+  unsigned int burn_depth = 0;
+  unsigned int nburn;
+
+  while (nblocks >= 1)
+    {
+      size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks;
+      size_t i;
+
+      if (curr_blks * 16 > tmp_used)
+        tmp_used = curr_blks * 16;
+
+      cipher_block_cpy (tmpbuf + 0 * 16, ctr, 16);
+      for (i = 1; i < curr_blks; i++)
+        {
+          cipher_block_cpy (&tmpbuf[i * 16], ctr, 16);
+          cipher_block_add (&tmpbuf[i * 16], i, 16);
+        }
+      cipher_block_add (ctr, curr_blks, 16);
+
+      nburn = crypt_fn (priv, tmpbuf, tmpbuf, curr_blks);
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+      for (i = 0; i < curr_blks; i++)
+        {
+          cipher_block_xor (outbuf, &tmpbuf[i * 16], inbuf, 16);
+          outbuf += 16;
+          inbuf += 16;
+        }
+
+      nblocks -= curr_blks;
+    }
+
+  *num_used_tmpblocks = tmp_used;
+  return burn_depth;
+}
+
+
+static inline unsigned int
+bulk_ctr32le_enc_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf,
+                      const byte *inbuf, size_t nblocks, byte *ctr,
+                      byte *tmpbuf, size_t tmpbuf_nblocks,
+                      unsigned int *num_used_tmpblocks)
+{
+  unsigned int tmp_used = 16;
+  unsigned int burn_depth = 0;
+  unsigned int nburn;
+
+  while (nblocks >= 1)
+    {
+      size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks;
+      u64 ctr_lo = buf_get_le64(ctr + 0 * 8);
+      u64 ctr_hi = buf_get_he64(ctr + 1 * 8);
+      size_t i;
+
+      if (curr_blks * 16 > tmp_used)
+        tmp_used = curr_blks * 16;
+
+      cipher_block_cpy (tmpbuf + 0 * 16, ctr, 16);
+      for (i = 1; i < curr_blks; i++)
+        {
+          u32 lo_u32 = (u32)ctr_lo + i;
+          u64 lo_u64 = ctr_lo & ~(u64)(u32)-1;
+          lo_u64 += lo_u32;
+          buf_put_le64(&tmpbuf[0 * 8 + i * 16], lo_u64);
+          buf_put_he64(&tmpbuf[1 * 8 + i * 16], ctr_hi);
+        }
+      buf_put_le32(ctr, (u32)ctr_lo + curr_blks);
+
+      nburn = crypt_fn (priv, tmpbuf, tmpbuf, curr_blks);
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+      for (i = 0; i < curr_blks; i++)
+        {
+          cipher_block_xor (outbuf, &tmpbuf[i * 16], inbuf, 16);
+          outbuf += 16;
+          inbuf += 16;
+        }
+
+      nblocks -= curr_blks;
+    }
+
+  *num_used_tmpblocks = tmp_used;
+  return burn_depth;
+}
+
+
+static inline unsigned int
+bulk_cbc_dec_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf,
+                  const byte *inbuf, size_t nblocks, byte *iv,
+                  byte *tmpbuf, size_t tmpbuf_nblocks,
+                  unsigned int *num_used_tmpblocks)
+{
+  unsigned int tmp_used = 16;
+  unsigned int burn_depth = 0;
+  unsigned int nburn;
+
+  while (nblocks >= 1)
+    {
+      size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks;
+      size_t i;
+
+      if (curr_blks * 16 > tmp_used)
+        tmp_used = curr_blks * 16;
+
+      nburn = crypt_fn (priv, tmpbuf, inbuf, curr_blks);
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+      for (i = 0; i < curr_blks; i++)
+        {
+          cipher_block_xor_n_copy_2(outbuf, &tmpbuf[i * 16], iv, inbuf, 16);
+          outbuf += 16;
+          inbuf += 16;
+        }
+
+      nblocks -= curr_blks;
+    }
+
+  *num_used_tmpblocks = tmp_used;
+  return burn_depth;
+}
+
+
+static inline unsigned int
+bulk_cfb_dec_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf,
+                  const byte *inbuf, size_t nblocks, byte *iv,
+                  byte *tmpbuf, size_t tmpbuf_nblocks,
+                  unsigned int *num_used_tmpblocks)
+{
+  unsigned int tmp_used = 16;
+  unsigned int burn_depth = 0;
+  unsigned int nburn;
+
+  while (nblocks >= 1)
+    {
+      size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks;
+      size_t i;
+
+      if (curr_blks * 16 > tmp_used)
+        tmp_used = curr_blks * 16;
+
+      cipher_block_cpy (&tmpbuf[0 * 16], iv, 16);
+      if (curr_blks > 1)
+        memcpy (&tmpbuf[1 * 16], &inbuf[(1 - 1) * 16], 16 * curr_blks - 16);
+      cipher_block_cpy (iv, &inbuf[(curr_blks - 1) * 16], 16);
+
+      nburn = crypt_fn (priv, tmpbuf, tmpbuf, curr_blks);
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+      for (i = 0; i < curr_blks; i++)
+        {
+          cipher_block_xor (outbuf, inbuf, &tmpbuf[i * 16], 16);
+          outbuf += 16;
+          inbuf += 16;
+        }
+
+      nblocks -= curr_blks;
+    }
+
+  *num_used_tmpblocks = tmp_used;
+  return burn_depth;
+}
+
+
+static inline unsigned int
+bulk_ocb_crypt_128 (gcry_cipher_hd_t c, void *priv, bulk_crypt_fn_t crypt_fn,
+                    byte *outbuf, const byte *inbuf, size_t nblocks, u64 *blkn,
+                    int encrypt, byte *tmpbuf, size_t tmpbuf_nblocks,
+                    unsigned int *num_used_tmpblocks)
+{
+  unsigned int tmp_used = 16;
+  unsigned int burn_depth = 0;
+  unsigned int nburn;
+
+  while (nblocks >= 1)
+    {
+      size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks;
+      size_t i;
+
+      if (curr_blks * 16 > tmp_used)
+        tmp_used = curr_blks * 16;
+
+      for (i = 0; i < curr_blks; i++)
+        {
+          const unsigned char *l = ocb_get_l(c, ++*blkn);
+
+          /* Checksum_i = Checksum_{i-1} xor P_i  */
+          if (encrypt)
+            cipher_block_xor_1(c->u_ctr.ctr, &inbuf[i * 16], 16);
+
+          /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+          cipher_block_xor_2dst (&tmpbuf[i * 16], c->u_iv.iv, l, 16);
+          cipher_block_xor (&outbuf[i * 16], &inbuf[i * 16],
+                            c->u_iv.iv, 16);
+        }
+
+      /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+      nburn = crypt_fn (priv, outbuf, outbuf, curr_blks);
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+      for (i = 0; i < curr_blks; i++)
+        {
+          cipher_block_xor_1 (&outbuf[i * 16], &tmpbuf[i * 16], 16);
+
+          /* Checksum_i = Checksum_{i-1} xor P_i  */
+          if (!encrypt)
+              cipher_block_xor_1(c->u_ctr.ctr, &outbuf[i * 16], 16);
+        }
+
+      outbuf += curr_blks * 16;
+      inbuf  += curr_blks * 16;
+      nblocks -= curr_blks;
+    }
+
+  *num_used_tmpblocks = tmp_used;
+  return burn_depth;
+}
+
+
+static inline unsigned int
+bulk_ocb_auth_128 (gcry_cipher_hd_t c, void *priv, bulk_crypt_fn_t crypt_fn,
+                   const byte *abuf, size_t nblocks, u64 *blkn, byte *tmpbuf,
+                   size_t tmpbuf_nblocks, unsigned int *num_used_tmpblocks)
+{
+  unsigned int tmp_used = 16;
+  unsigned int burn_depth = 0;
+  unsigned int nburn;
+
+  while (nblocks >= 1)
+    {
+      size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks;
+      size_t i;
+
+      if (curr_blks * 16 > tmp_used)
+        tmp_used = curr_blks * 16;
+
+      for (i = 0; i < curr_blks; i++)
+        {
+          const unsigned char *l = ocb_get_l(c, ++*blkn);
+
+          /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+          cipher_block_xor_2dst (&tmpbuf[i * 16],
+                                  c->u_mode.ocb.aad_offset, l, 16);
+          cipher_block_xor_1 (&tmpbuf[i * 16], &abuf[i * 16], 16);
+        }
+
+      /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+      nburn = crypt_fn (priv, tmpbuf, tmpbuf, curr_blks);
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+      for (i = 0; i < curr_blks; i++)
+        {
+          cipher_block_xor_1 (c->u_mode.ocb.aad_sum, &tmpbuf[i * 16], 16);
+        }
+
+      abuf += curr_blks * 16;
+      nblocks -= curr_blks;
+    }
+
+  *num_used_tmpblocks = tmp_used;
+  return burn_depth;
+}
+
+
+static inline unsigned int
+bulk_xts_crypt_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf,
+                    const byte *inbuf, size_t nblocks, byte *tweak,
+                    byte *tmpbuf, size_t tmpbuf_nblocks,
+                    unsigned int *num_used_tmpblocks)
+{
+  u64 tweak_lo, tweak_hi, tweak_next_lo, tweak_next_hi, tmp_lo, tmp_hi, carry;
+  unsigned int tmp_used = 16;
+  unsigned int burn_depth = 0;
+  unsigned int nburn;
+
+  tweak_next_lo = buf_get_le64 (tweak + 0);
+  tweak_next_hi = buf_get_le64 (tweak + 8);
+
+  while (nblocks >= 1)
+    {
+      size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks;
+      size_t i;
+
+      if (curr_blks * 16 > tmp_used)
+        tmp_used = curr_blks * 16;
+
+      for (i = 0; i < curr_blks; i++)
+        {
+          tweak_lo = tweak_next_lo;
+          tweak_hi = tweak_next_hi;
+
+          /* Generate next tweak. */
+          carry = -(tweak_next_hi >> 63) & 0x87;
+          tweak_next_hi = (tweak_next_hi << 1) + (tweak_next_lo >> 63);
+          tweak_next_lo = (tweak_next_lo << 1) ^ carry;
+
+          /* Xor-Encrypt/Decrypt-Xor block. */
+          tmp_lo = buf_get_le64 (inbuf + i * 16 + 0) ^ tweak_lo;
+          tmp_hi = buf_get_le64 (inbuf + i * 16 + 8) ^ tweak_hi;
+          buf_put_he64 (&tmpbuf[i * 16 + 0], tweak_lo);
+          buf_put_he64 (&tmpbuf[i * 16 + 8], tweak_hi);
+          buf_put_le64 (outbuf + i * 16 + 0, tmp_lo);
+          buf_put_le64 (outbuf + i * 16 + 8, tmp_hi);
+        }
+
+      nburn = crypt_fn (priv, outbuf, outbuf, curr_blks);
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+      for (i = 0; i < curr_blks; i++)
+        {
+          /* Xor-Encrypt/Decrypt-Xor block. */
+          tweak_lo = buf_get_he64 (&tmpbuf[i * 16 + 0]);
+          tweak_hi = buf_get_he64 (&tmpbuf[i * 16 + 8]);
+          tmp_lo = buf_get_le64 (outbuf + i * 16 + 0) ^ tweak_lo;
+          tmp_hi = buf_get_le64 (outbuf + i * 16 + 8) ^ tweak_hi;
+          buf_put_le64 (outbuf + i * 16 + 0, tmp_lo);
+          buf_put_le64 (outbuf + i * 16 + 8, tmp_hi);
+        }
+
+      inbuf += curr_blks * 16;
+      outbuf += curr_blks * 16;
+      nblocks -= curr_blks;
+    }
+
+  buf_put_le64 (tweak + 0, tweak_next_lo);
+  buf_put_le64 (tweak + 8, tweak_next_hi);
+
+  *num_used_tmpblocks = tmp_used;
+  return burn_depth;
+}
+
+static inline unsigned int
+bulk_ecb_crypt_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf,
+                   const byte *inbuf, size_t nblocks, size_t fn_max_nblocks)
+{
+  unsigned int burn_depth = 0;
+  unsigned int nburn;
+
+  while (nblocks >= 1)
+    {
+      size_t curr_blks = nblocks > fn_max_nblocks ? fn_max_nblocks : nblocks;
+      nburn = crypt_fn (priv, outbuf, inbuf, curr_blks);
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+      inbuf += curr_blks * 16;
+      outbuf += curr_blks * 16;
+      nblocks -= curr_blks;
+    }
+
+  return burn_depth;
+}
+
+#endif /*GCRYPT_BULKHELP_H*/
diff --git a/grub-core/lib/libgcrypt/cipher/camellia-aarch64-ce.c 
b/grub-core/lib/libgcrypt/cipher/camellia-aarch64-ce.c
new file mode 100644
index 000000000..76813e945
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/camellia-aarch64-ce.c
@@ -0,0 +1,42 @@
+/* camellia-aarch64-ce.c - ARMv8/CE Camellia implementation
+ * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(__AARCH64EL__) && \
+    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \
+    defined(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS) && \
+    (__GNUC__ >= 4)
+
+#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
+# define FUNC_ATTR_OPT __attribute__((optimize("-O2")))
+#else
+# define FUNC_ATTR_OPT
+#endif
+
+#define SIMD128_OPT_ATTR FUNC_ATTR_OPT
+
+#define FUNC_ENC_BLK16 _gcry_camellia_aarch64ce_encrypt_blk16
+#define FUNC_DEC_BLK16 _gcry_camellia_aarch64ce_decrypt_blk16
+#define FUNC_KEY_SETUP _gcry_camellia_aarch64ce_keygen
+
+#include "camellia-simd128.h"
+
+#endif /* __AARCH64EL__ */
diff --git a/grub-core/lib/libgcrypt/cipher/camellia-aarch64.S 
b/grub-core/lib/libgcrypt/cipher/camellia-aarch64.S
new file mode 100644
index 000000000..1d820553c
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/camellia-aarch64.S
@@ -0,0 +1,585 @@
+/* camellia-aarch64.S  -  ARMv8/AArch64 assembly implementation of Camellia
+ *                        cipher
+ *
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "asm-common-aarch64.h"
+
+#if defined(__AARCH64EL__)
+#ifdef HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS
+
+.text
+
+/* struct camellia_ctx: */
+#define key_table 0
+
+/* register macros */
+#define CTX x0
+#define RDST x1
+#define RSRC x2
+#define RKEYBITS w3
+
+#define RTAB1 x4
+#define RTAB2 x5
+#define RTAB3 x6
+#define RTAB4 x7
+#define RMASK w8
+
+#define IL w9
+#define IR w10
+
+#define xIL x9
+#define xIR x10
+
+#define XL w11
+#define XR w12
+#define YL w13
+#define YR w14
+
+#define RT0 w15
+#define RT1 w16
+#define RT2 w17
+#define RT3 w19
+
+#define xRT0 x15
+#define xRT1 x16
+#define xRT2 x17
+#define xRT3 x19
+
+#ifdef __AARCH64EL__
+  #define host_to_be(reg, rtmp) \
+         rev reg, reg;
+  #define be_to_host(reg, rtmp) \
+         rev reg, reg;
+#else
+  /* nop on big-endian */
+  #define host_to_be(reg, rtmp) /*_*/
+  #define be_to_host(reg, rtmp) /*_*/
+#endif
+
+#define ldr_input_aligned_be(rin, a, b, c, d, rtmp) \
+       ldr a, [rin, #0]; \
+       ldr b, [rin, #4]; \
+       be_to_host(a, rtmp); \
+       ldr c, [rin, #8]; \
+       be_to_host(b, rtmp); \
+       ldr d, [rin, #12]; \
+       be_to_host(c, rtmp); \
+       be_to_host(d, rtmp);
+
+#define str_output_aligned_be(rout, a, b, c, d, rtmp) \
+       be_to_host(a, rtmp); \
+       be_to_host(b, rtmp); \
+       str a, [rout, #0]; \
+       be_to_host(c, rtmp); \
+       str b, [rout, #4]; \
+       be_to_host(d, rtmp); \
+       str c, [rout, #8]; \
+       str d, [rout, #12];
+
+/* unaligned word reads/writes allowed */
+#define ldr_input_be(rin, ra, rb, rc, rd, rtmp) \
+       ldr_input_aligned_be(rin, ra, rb, rc, rd, rtmp)
+
+#define str_output_be(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
+       str_output_aligned_be(rout, ra, rb, rc, rd, rtmp0)
+
+/**********************************************************************
+  1-way camellia
+ **********************************************************************/
+#define roundsm(xl, xr, kl, kr, yl, yr) \
+       ldr RT2, [CTX, #(key_table + ((kl) * 4))]; \
+       and  IR, RMASK, xr, lsl#(4);      /*sp1110*/ \
+       ldr RT3, [CTX, #(key_table + ((kr) * 4))]; \
+       and  IL, RMASK, xl, lsr#(24 - 4); /*sp1110*/ \
+       and RT0, RMASK, xr, lsr#(16 - 4); /*sp3033*/ \
+       ldr  IR, [RTAB1,  xIR]; \
+       and RT1, RMASK, xl, lsr#(8 - 4);  /*sp3033*/ \
+       eor yl, yl, RT2; \
+       ldr  IL, [RTAB1,  xIL]; \
+       eor yr, yr, RT3; \
+       \
+       ldr RT0, [RTAB3, xRT0]; \
+       ldr RT1, [RTAB3, xRT1]; \
+       \
+       and RT2, RMASK, xr, lsr#(24 - 4); /*sp0222*/ \
+       and RT3, RMASK, xl, lsr#(16 - 4); /*sp0222*/ \
+       \
+       eor IR, IR, RT0; \
+       eor IL, IL, RT1; \
+       \
+       ldr RT2, [RTAB2, xRT2]; \
+       and RT0, RMASK, xr, lsr#(8 - 4);  /*sp4404*/ \
+       ldr RT3, [RTAB2, xRT3]; \
+       and RT1, RMASK, xl, lsl#(4);      /*sp4404*/ \
+       \
+       ldr RT0, [RTAB4, xRT0]; \
+       ldr RT1, [RTAB4, xRT1]; \
+       \
+       eor IR, IR, RT2; \
+       eor IL, IL, RT3; \
+       eor IR, IR, RT0; \
+       eor IL, IL, RT1; \
+       \
+       eor IR, IR, IL; \
+       eor yr, yr, IL, ror#8; \
+       eor yl, yl, IR; \
+       eor yr, yr, IR;
+
+#define enc_rounds(n) \
+       roundsm(XL, XR, ((n) + 2) * 2 + 0, ((n) + 2) * 2 + 1, YL, YR); \
+       roundsm(YL, YR, ((n) + 3) * 2 + 0, ((n) + 3) * 2 + 1, XL, XR); \
+       roundsm(XL, XR, ((n) + 4) * 2 + 0, ((n) + 4) * 2 + 1, YL, YR); \
+       roundsm(YL, YR, ((n) + 5) * 2 + 0, ((n) + 5) * 2 + 1, XL, XR); \
+       roundsm(XL, XR, ((n) + 6) * 2 + 0, ((n) + 6) * 2 + 1, YL, YR); \
+       roundsm(YL, YR, ((n) + 7) * 2 + 0, ((n) + 7) * 2 + 1, XL, XR);
+
+#define dec_rounds(n) \
+       roundsm(XL, XR, ((n) + 7) * 2 + 0, ((n) + 7) * 2 + 1, YL, YR); \
+       roundsm(YL, YR, ((n) + 6) * 2 + 0, ((n) + 6) * 2 + 1, XL, XR); \
+       roundsm(XL, XR, ((n) + 5) * 2 + 0, ((n) + 5) * 2 + 1, YL, YR); \
+       roundsm(YL, YR, ((n) + 4) * 2 + 0, ((n) + 4) * 2 + 1, XL, XR); \
+       roundsm(XL, XR, ((n) + 3) * 2 + 0, ((n) + 3) * 2 + 1, YL, YR); \
+       roundsm(YL, YR, ((n) + 2) * 2 + 0, ((n) + 2) * 2 + 1, XL, XR);
+
+/* perform FL and FL⁻¹ */
+#define fls(ll, lr, rl, rr, kll, klr, krl, krr) \
+       ldr RT0, [CTX, #(key_table + ((kll) * 4))]; \
+       ldr RT2, [CTX, #(key_table + ((krr) * 4))]; \
+       and RT0, RT0, ll; \
+       ldr RT3, [CTX, #(key_table + ((krl) * 4))]; \
+       orr RT2, RT2, rr; \
+       ldr RT1, [CTX, #(key_table + ((klr) * 4))]; \
+       eor rl, rl, RT2; \
+       eor lr, lr, RT0, ror#31; \
+       and RT3, RT3, rl; \
+       orr RT1, RT1, lr; \
+       eor ll, ll, RT1; \
+       eor rr, rr, RT3, ror#31;
+
+#define enc_fls(n) \
+       fls(XL, XR, YL, YR, \
+           (n) * 2 + 0, (n) * 2 + 1, \
+           (n) * 2 + 2, (n) * 2 + 3);
+
+#define dec_fls(n) \
+       fls(XL, XR, YL, YR, \
+           (n) * 2 + 2, (n) * 2 + 3, \
+           (n) * 2 + 0, (n) * 2 + 1);
+
+#define inpack(n) \
+       ldr_input_be(RSRC, XL, XR, YL, YR, RT0); \
+       ldr RT0, [CTX, #(key_table + ((n) * 8) + 0)]; \
+       ldr RT1, [CTX, #(key_table + ((n) * 8) + 4)]; \
+       eor XL, XL, RT0; \
+       eor XR, XR, RT1;
+
+#define outunpack(n) \
+       ldr RT0, [CTX, #(key_table + ((n) * 8) + 0)]; \
+       ldr RT1, [CTX, #(key_table + ((n) * 8) + 4)]; \
+       eor YL, YL, RT0; \
+       eor YR, YR, RT1; \
+       str_output_be(RDST, YL, YR, XL, XR, RT0, RT1);
+
+.globl _gcry_camellia_arm_encrypt_block
+ELF(.type   _gcry_camellia_arm_encrypt_block,@function;)
+
+.align 4
+_gcry_camellia_arm_encrypt_block:
+       CFI_STARTPROC()
+       stp x19, x30, [sp, #-16]!
+       CFI_ADJUST_CFA_OFFSET(16)
+       CFI_REG_ON_STACK(19, 0)
+       CFI_REG_ON_STACK(30, 8)
+
+       /* input:
+        *      x0: keytable
+        *      x1: dst
+        *      x2: src
+        *      w3: keybitlen
+        */
+
+       GET_DATA_POINTER(RTAB1, _gcry_camellia_arm_tables);
+       mov RMASK, #(0xff<<4); /* byte mask */
+       add RTAB2, RTAB1, #(1 * 4);
+       add RTAB3, RTAB1, #(2 * 4);
+       add RTAB4, RTAB1, #(3 * 4);
+
+       inpack(0);
+
+       enc_rounds(0);
+       enc_fls(8);
+       enc_rounds(8);
+       enc_fls(16);
+       enc_rounds(16);
+
+       cmp RKEYBITS, #(16 * 8);
+       bne .Lenc_256;
+
+       outunpack(24);
+
+       CFI_REMEMBER_STATE()
+       ldp x19, x30, [sp], #16
+       CFI_ADJUST_CFA_OFFSET(-16)
+       CFI_RESTORE(x19)
+       CFI_RESTORE(x30)
+       ret_spec_stop;
+       CFI_RESTORE_STATE()
+
+.Lenc_256:
+       enc_fls(24);
+       enc_rounds(24);
+
+       outunpack(32);
+
+       ldp x19, x30, [sp], #16
+       CFI_ADJUST_CFA_OFFSET(-16)
+       CFI_RESTORE(x19)
+       CFI_RESTORE(x30)
+       ret_spec_stop;
+       CFI_ENDPROC()
+ELF(.size _gcry_camellia_arm_encrypt_block,.-_gcry_camellia_arm_encrypt_block;)
+
+.globl _gcry_camellia_arm_decrypt_block
+ELF(.type   _gcry_camellia_arm_decrypt_block,@function;)
+
+.align 4
+_gcry_camellia_arm_decrypt_block:
+       CFI_STARTPROC()
+       stp x19, x30, [sp, #-16]!
+       CFI_ADJUST_CFA_OFFSET(16)
+       CFI_REG_ON_STACK(19, 0)
+       CFI_REG_ON_STACK(30, 8)
+
+       /* input:
+        *      x0: keytable
+        *      x1: dst
+        *      x2: src
+        *      w3: keybitlen
+        */
+
+       GET_DATA_POINTER(RTAB1, _gcry_camellia_arm_tables);
+       mov RMASK, #(0xff<<4); /* byte mask */
+       add RTAB2, RTAB1, #(1 * 4);
+       add RTAB3, RTAB1, #(2 * 4);
+       add RTAB4, RTAB1, #(3 * 4);
+
+       cmp RKEYBITS, #(16 * 8);
+       bne .Ldec_256;
+
+       inpack(24);
+
+.Ldec_128:
+       dec_rounds(16);
+       dec_fls(16);
+       dec_rounds(8);
+       dec_fls(8);
+       dec_rounds(0);
+
+       outunpack(0);
+
+       CFI_REMEMBER_STATE()
+       ldp x19, x30, [sp], #16
+       CFI_ADJUST_CFA_OFFSET(-16)
+       CFI_RESTORE(x19)
+       CFI_RESTORE(x30)
+       ret_spec_stop;
+       CFI_RESTORE_STATE()
+
+.Ldec_256:
+       inpack(32);
+       dec_rounds(24);
+       dec_fls(24);
+
+       b .Ldec_128;
+       CFI_ENDPROC()
+ELF(.size _gcry_camellia_arm_decrypt_block,.-_gcry_camellia_arm_decrypt_block;)
+
+/* Encryption/Decryption tables */
+SECTION_RODATA
+ELF(.type _gcry_camellia_arm_tables,%object;)
+.balign 32
+_gcry_camellia_arm_tables:
+.Lcamellia_sp1110:
+.long 0x70707000
+.Lcamellia_sp0222:
+            .long 0x00e0e0e0
+.Lcamellia_sp3033:
+                        .long 0x38003838
+.Lcamellia_sp4404:
+                                    .long 0x70700070
+.long 0x82828200, 0x00050505, 0x41004141, 0x2c2c002c
+.long 0x2c2c2c00, 0x00585858, 0x16001616, 0xb3b300b3
+.long 0xececec00, 0x00d9d9d9, 0x76007676, 0xc0c000c0
+.long 0xb3b3b300, 0x00676767, 0xd900d9d9, 0xe4e400e4
+.long 0x27272700, 0x004e4e4e, 0x93009393, 0x57570057
+.long 0xc0c0c000, 0x00818181, 0x60006060, 0xeaea00ea
+.long 0xe5e5e500, 0x00cbcbcb, 0xf200f2f2, 0xaeae00ae
+.long 0xe4e4e400, 0x00c9c9c9, 0x72007272, 0x23230023
+.long 0x85858500, 0x000b0b0b, 0xc200c2c2, 0x6b6b006b
+.long 0x57575700, 0x00aeaeae, 0xab00abab, 0x45450045
+.long 0x35353500, 0x006a6a6a, 0x9a009a9a, 0xa5a500a5
+.long 0xeaeaea00, 0x00d5d5d5, 0x75007575, 0xeded00ed
+.long 0x0c0c0c00, 0x00181818, 0x06000606, 0x4f4f004f
+.long 0xaeaeae00, 0x005d5d5d, 0x57005757, 0x1d1d001d
+.long 0x41414100, 0x00828282, 0xa000a0a0, 0x92920092
+.long 0x23232300, 0x00464646, 0x91009191, 0x86860086
+.long 0xefefef00, 0x00dfdfdf, 0xf700f7f7, 0xafaf00af
+.long 0x6b6b6b00, 0x00d6d6d6, 0xb500b5b5, 0x7c7c007c
+.long 0x93939300, 0x00272727, 0xc900c9c9, 0x1f1f001f
+.long 0x45454500, 0x008a8a8a, 0xa200a2a2, 0x3e3e003e
+.long 0x19191900, 0x00323232, 0x8c008c8c, 0xdcdc00dc
+.long 0xa5a5a500, 0x004b4b4b, 0xd200d2d2, 0x5e5e005e
+.long 0x21212100, 0x00424242, 0x90009090, 0x0b0b000b
+.long 0xededed00, 0x00dbdbdb, 0xf600f6f6, 0xa6a600a6
+.long 0x0e0e0e00, 0x001c1c1c, 0x07000707, 0x39390039
+.long 0x4f4f4f00, 0x009e9e9e, 0xa700a7a7, 0xd5d500d5
+.long 0x4e4e4e00, 0x009c9c9c, 0x27002727, 0x5d5d005d
+.long 0x1d1d1d00, 0x003a3a3a, 0x8e008e8e, 0xd9d900d9
+.long 0x65656500, 0x00cacaca, 0xb200b2b2, 0x5a5a005a
+.long 0x92929200, 0x00252525, 0x49004949, 0x51510051
+.long 0xbdbdbd00, 0x007b7b7b, 0xde00dede, 0x6c6c006c
+.long 0x86868600, 0x000d0d0d, 0x43004343, 0x8b8b008b
+.long 0xb8b8b800, 0x00717171, 0x5c005c5c, 0x9a9a009a
+.long 0xafafaf00, 0x005f5f5f, 0xd700d7d7, 0xfbfb00fb
+.long 0x8f8f8f00, 0x001f1f1f, 0xc700c7c7, 0xb0b000b0
+.long 0x7c7c7c00, 0x00f8f8f8, 0x3e003e3e, 0x74740074
+.long 0xebebeb00, 0x00d7d7d7, 0xf500f5f5, 0x2b2b002b
+.long 0x1f1f1f00, 0x003e3e3e, 0x8f008f8f, 0xf0f000f0
+.long 0xcecece00, 0x009d9d9d, 0x67006767, 0x84840084
+.long 0x3e3e3e00, 0x007c7c7c, 0x1f001f1f, 0xdfdf00df
+.long 0x30303000, 0x00606060, 0x18001818, 0xcbcb00cb
+.long 0xdcdcdc00, 0x00b9b9b9, 0x6e006e6e, 0x34340034
+.long 0x5f5f5f00, 0x00bebebe, 0xaf00afaf, 0x76760076
+.long 0x5e5e5e00, 0x00bcbcbc, 0x2f002f2f, 0x6d6d006d
+.long 0xc5c5c500, 0x008b8b8b, 0xe200e2e2, 0xa9a900a9
+.long 0x0b0b0b00, 0x00161616, 0x85008585, 0xd1d100d1
+.long 0x1a1a1a00, 0x00343434, 0x0d000d0d, 0x04040004
+.long 0xa6a6a600, 0x004d4d4d, 0x53005353, 0x14140014
+.long 0xe1e1e100, 0x00c3c3c3, 0xf000f0f0, 0x3a3a003a
+.long 0x39393900, 0x00727272, 0x9c009c9c, 0xdede00de
+.long 0xcacaca00, 0x00959595, 0x65006565, 0x11110011
+.long 0xd5d5d500, 0x00ababab, 0xea00eaea, 0x32320032
+.long 0x47474700, 0x008e8e8e, 0xa300a3a3, 0x9c9c009c
+.long 0x5d5d5d00, 0x00bababa, 0xae00aeae, 0x53530053
+.long 0x3d3d3d00, 0x007a7a7a, 0x9e009e9e, 0xf2f200f2
+.long 0xd9d9d900, 0x00b3b3b3, 0xec00ecec, 0xfefe00fe
+.long 0x01010100, 0x00020202, 0x80008080, 0xcfcf00cf
+.long 0x5a5a5a00, 0x00b4b4b4, 0x2d002d2d, 0xc3c300c3
+.long 0xd6d6d600, 0x00adadad, 0x6b006b6b, 0x7a7a007a
+.long 0x51515100, 0x00a2a2a2, 0xa800a8a8, 0x24240024
+.long 0x56565600, 0x00acacac, 0x2b002b2b, 0xe8e800e8
+.long 0x6c6c6c00, 0x00d8d8d8, 0x36003636, 0x60600060
+.long 0x4d4d4d00, 0x009a9a9a, 0xa600a6a6, 0x69690069
+.long 0x8b8b8b00, 0x00171717, 0xc500c5c5, 0xaaaa00aa
+.long 0x0d0d0d00, 0x001a1a1a, 0x86008686, 0xa0a000a0
+.long 0x9a9a9a00, 0x00353535, 0x4d004d4d, 0xa1a100a1
+.long 0x66666600, 0x00cccccc, 0x33003333, 0x62620062
+.long 0xfbfbfb00, 0x00f7f7f7, 0xfd00fdfd, 0x54540054
+.long 0xcccccc00, 0x00999999, 0x66006666, 0x1e1e001e
+.long 0xb0b0b000, 0x00616161, 0x58005858, 0xe0e000e0
+.long 0x2d2d2d00, 0x005a5a5a, 0x96009696, 0x64640064
+.long 0x74747400, 0x00e8e8e8, 0x3a003a3a, 0x10100010
+.long 0x12121200, 0x00242424, 0x09000909, 0x00000000
+.long 0x2b2b2b00, 0x00565656, 0x95009595, 0xa3a300a3
+.long 0x20202000, 0x00404040, 0x10001010, 0x75750075
+.long 0xf0f0f000, 0x00e1e1e1, 0x78007878, 0x8a8a008a
+.long 0xb1b1b100, 0x00636363, 0xd800d8d8, 0xe6e600e6
+.long 0x84848400, 0x00090909, 0x42004242, 0x09090009
+.long 0x99999900, 0x00333333, 0xcc00cccc, 0xdddd00dd
+.long 0xdfdfdf00, 0x00bfbfbf, 0xef00efef, 0x87870087
+.long 0x4c4c4c00, 0x00989898, 0x26002626, 0x83830083
+.long 0xcbcbcb00, 0x00979797, 0xe500e5e5, 0xcdcd00cd
+.long 0xc2c2c200, 0x00858585, 0x61006161, 0x90900090
+.long 0x34343400, 0x00686868, 0x1a001a1a, 0x73730073
+.long 0x7e7e7e00, 0x00fcfcfc, 0x3f003f3f, 0xf6f600f6
+.long 0x76767600, 0x00ececec, 0x3b003b3b, 0x9d9d009d
+.long 0x05050500, 0x000a0a0a, 0x82008282, 0xbfbf00bf
+.long 0x6d6d6d00, 0x00dadada, 0xb600b6b6, 0x52520052
+.long 0xb7b7b700, 0x006f6f6f, 0xdb00dbdb, 0xd8d800d8
+.long 0xa9a9a900, 0x00535353, 0xd400d4d4, 0xc8c800c8
+.long 0x31313100, 0x00626262, 0x98009898, 0xc6c600c6
+.long 0xd1d1d100, 0x00a3a3a3, 0xe800e8e8, 0x81810081
+.long 0x17171700, 0x002e2e2e, 0x8b008b8b, 0x6f6f006f
+.long 0x04040400, 0x00080808, 0x02000202, 0x13130013
+.long 0xd7d7d700, 0x00afafaf, 0xeb00ebeb, 0x63630063
+.long 0x14141400, 0x00282828, 0x0a000a0a, 0xe9e900e9
+.long 0x58585800, 0x00b0b0b0, 0x2c002c2c, 0xa7a700a7
+.long 0x3a3a3a00, 0x00747474, 0x1d001d1d, 0x9f9f009f
+.long 0x61616100, 0x00c2c2c2, 0xb000b0b0, 0xbcbc00bc
+.long 0xdedede00, 0x00bdbdbd, 0x6f006f6f, 0x29290029
+.long 0x1b1b1b00, 0x00363636, 0x8d008d8d, 0xf9f900f9
+.long 0x11111100, 0x00222222, 0x88008888, 0x2f2f002f
+.long 0x1c1c1c00, 0x00383838, 0x0e000e0e, 0xb4b400b4
+.long 0x32323200, 0x00646464, 0x19001919, 0x78780078
+.long 0x0f0f0f00, 0x001e1e1e, 0x87008787, 0x06060006
+.long 0x9c9c9c00, 0x00393939, 0x4e004e4e, 0xe7e700e7
+.long 0x16161600, 0x002c2c2c, 0x0b000b0b, 0x71710071
+.long 0x53535300, 0x00a6a6a6, 0xa900a9a9, 0xd4d400d4
+.long 0x18181800, 0x00303030, 0x0c000c0c, 0xabab00ab
+.long 0xf2f2f200, 0x00e5e5e5, 0x79007979, 0x88880088
+.long 0x22222200, 0x00444444, 0x11001111, 0x8d8d008d
+.long 0xfefefe00, 0x00fdfdfd, 0x7f007f7f, 0x72720072
+.long 0x44444400, 0x00888888, 0x22002222, 0xb9b900b9
+.long 0xcfcfcf00, 0x009f9f9f, 0xe700e7e7, 0xf8f800f8
+.long 0xb2b2b200, 0x00656565, 0x59005959, 0xacac00ac
+.long 0xc3c3c300, 0x00878787, 0xe100e1e1, 0x36360036
+.long 0xb5b5b500, 0x006b6b6b, 0xda00dada, 0x2a2a002a
+.long 0x7a7a7a00, 0x00f4f4f4, 0x3d003d3d, 0x3c3c003c
+.long 0x91919100, 0x00232323, 0xc800c8c8, 0xf1f100f1
+.long 0x24242400, 0x00484848, 0x12001212, 0x40400040
+.long 0x08080800, 0x00101010, 0x04000404, 0xd3d300d3
+.long 0xe8e8e800, 0x00d1d1d1, 0x74007474, 0xbbbb00bb
+.long 0xa8a8a800, 0x00515151, 0x54005454, 0x43430043
+.long 0x60606000, 0x00c0c0c0, 0x30003030, 0x15150015
+.long 0xfcfcfc00, 0x00f9f9f9, 0x7e007e7e, 0xadad00ad
+.long 0x69696900, 0x00d2d2d2, 0xb400b4b4, 0x77770077
+.long 0x50505000, 0x00a0a0a0, 0x28002828, 0x80800080
+.long 0xaaaaaa00, 0x00555555, 0x55005555, 0x82820082
+.long 0xd0d0d000, 0x00a1a1a1, 0x68006868, 0xecec00ec
+.long 0xa0a0a000, 0x00414141, 0x50005050, 0x27270027
+.long 0x7d7d7d00, 0x00fafafa, 0xbe00bebe, 0xe5e500e5
+.long 0xa1a1a100, 0x00434343, 0xd000d0d0, 0x85850085
+.long 0x89898900, 0x00131313, 0xc400c4c4, 0x35350035
+.long 0x62626200, 0x00c4c4c4, 0x31003131, 0x0c0c000c
+.long 0x97979700, 0x002f2f2f, 0xcb00cbcb, 0x41410041
+.long 0x54545400, 0x00a8a8a8, 0x2a002a2a, 0xefef00ef
+.long 0x5b5b5b00, 0x00b6b6b6, 0xad00adad, 0x93930093
+.long 0x1e1e1e00, 0x003c3c3c, 0x0f000f0f, 0x19190019
+.long 0x95959500, 0x002b2b2b, 0xca00caca, 0x21210021
+.long 0xe0e0e000, 0x00c1c1c1, 0x70007070, 0x0e0e000e
+.long 0xffffff00, 0x00ffffff, 0xff00ffff, 0x4e4e004e
+.long 0x64646400, 0x00c8c8c8, 0x32003232, 0x65650065
+.long 0xd2d2d200, 0x00a5a5a5, 0x69006969, 0xbdbd00bd
+.long 0x10101000, 0x00202020, 0x08000808, 0xb8b800b8
+.long 0xc4c4c400, 0x00898989, 0x62006262, 0x8f8f008f
+.long 0x00000000, 0x00000000, 0x00000000, 0xebeb00eb
+.long 0x48484800, 0x00909090, 0x24002424, 0xcece00ce
+.long 0xa3a3a300, 0x00474747, 0xd100d1d1, 0x30300030
+.long 0xf7f7f700, 0x00efefef, 0xfb00fbfb, 0x5f5f005f
+.long 0x75757500, 0x00eaeaea, 0xba00baba, 0xc5c500c5
+.long 0xdbdbdb00, 0x00b7b7b7, 0xed00eded, 0x1a1a001a
+.long 0x8a8a8a00, 0x00151515, 0x45004545, 0xe1e100e1
+.long 0x03030300, 0x00060606, 0x81008181, 0xcaca00ca
+.long 0xe6e6e600, 0x00cdcdcd, 0x73007373, 0x47470047
+.long 0xdadada00, 0x00b5b5b5, 0x6d006d6d, 0x3d3d003d
+.long 0x09090900, 0x00121212, 0x84008484, 0x01010001
+.long 0x3f3f3f00, 0x007e7e7e, 0x9f009f9f, 0xd6d600d6
+.long 0xdddddd00, 0x00bbbbbb, 0xee00eeee, 0x56560056
+.long 0x94949400, 0x00292929, 0x4a004a4a, 0x4d4d004d
+.long 0x87878700, 0x000f0f0f, 0xc300c3c3, 0x0d0d000d
+.long 0x5c5c5c00, 0x00b8b8b8, 0x2e002e2e, 0x66660066
+.long 0x83838300, 0x00070707, 0xc100c1c1, 0xcccc00cc
+.long 0x02020200, 0x00040404, 0x01000101, 0x2d2d002d
+.long 0xcdcdcd00, 0x009b9b9b, 0xe600e6e6, 0x12120012
+.long 0x4a4a4a00, 0x00949494, 0x25002525, 0x20200020
+.long 0x90909000, 0x00212121, 0x48004848, 0xb1b100b1
+.long 0x33333300, 0x00666666, 0x99009999, 0x99990099
+.long 0x73737300, 0x00e6e6e6, 0xb900b9b9, 0x4c4c004c
+.long 0x67676700, 0x00cecece, 0xb300b3b3, 0xc2c200c2
+.long 0xf6f6f600, 0x00ededed, 0x7b007b7b, 0x7e7e007e
+.long 0xf3f3f300, 0x00e7e7e7, 0xf900f9f9, 0x05050005
+.long 0x9d9d9d00, 0x003b3b3b, 0xce00cece, 0xb7b700b7
+.long 0x7f7f7f00, 0x00fefefe, 0xbf00bfbf, 0x31310031
+.long 0xbfbfbf00, 0x007f7f7f, 0xdf00dfdf, 0x17170017
+.long 0xe2e2e200, 0x00c5c5c5, 0x71007171, 0xd7d700d7
+.long 0x52525200, 0x00a4a4a4, 0x29002929, 0x58580058
+.long 0x9b9b9b00, 0x00373737, 0xcd00cdcd, 0x61610061
+.long 0xd8d8d800, 0x00b1b1b1, 0x6c006c6c, 0x1b1b001b
+.long 0x26262600, 0x004c4c4c, 0x13001313, 0x1c1c001c
+.long 0xc8c8c800, 0x00919191, 0x64006464, 0x0f0f000f
+.long 0x37373700, 0x006e6e6e, 0x9b009b9b, 0x16160016
+.long 0xc6c6c600, 0x008d8d8d, 0x63006363, 0x18180018
+.long 0x3b3b3b00, 0x00767676, 0x9d009d9d, 0x22220022
+.long 0x81818100, 0x00030303, 0xc000c0c0, 0x44440044
+.long 0x96969600, 0x002d2d2d, 0x4b004b4b, 0xb2b200b2
+.long 0x6f6f6f00, 0x00dedede, 0xb700b7b7, 0xb5b500b5
+.long 0x4b4b4b00, 0x00969696, 0xa500a5a5, 0x91910091
+.long 0x13131300, 0x00262626, 0x89008989, 0x08080008
+.long 0xbebebe00, 0x007d7d7d, 0x5f005f5f, 0xa8a800a8
+.long 0x63636300, 0x00c6c6c6, 0xb100b1b1, 0xfcfc00fc
+.long 0x2e2e2e00, 0x005c5c5c, 0x17001717, 0x50500050
+.long 0xe9e9e900, 0x00d3d3d3, 0xf400f4f4, 0xd0d000d0
+.long 0x79797900, 0x00f2f2f2, 0xbc00bcbc, 0x7d7d007d
+.long 0xa7a7a700, 0x004f4f4f, 0xd300d3d3, 0x89890089
+.long 0x8c8c8c00, 0x00191919, 0x46004646, 0x97970097
+.long 0x9f9f9f00, 0x003f3f3f, 0xcf00cfcf, 0x5b5b005b
+.long 0x6e6e6e00, 0x00dcdcdc, 0x37003737, 0x95950095
+.long 0xbcbcbc00, 0x00797979, 0x5e005e5e, 0xffff00ff
+.long 0x8e8e8e00, 0x001d1d1d, 0x47004747, 0xd2d200d2
+.long 0x29292900, 0x00525252, 0x94009494, 0xc4c400c4
+.long 0xf5f5f500, 0x00ebebeb, 0xfa00fafa, 0x48480048
+.long 0xf9f9f900, 0x00f3f3f3, 0xfc00fcfc, 0xf7f700f7
+.long 0xb6b6b600, 0x006d6d6d, 0x5b005b5b, 0xdbdb00db
+.long 0x2f2f2f00, 0x005e5e5e, 0x97009797, 0x03030003
+.long 0xfdfdfd00, 0x00fbfbfb, 0xfe00fefe, 0xdada00da
+.long 0xb4b4b400, 0x00696969, 0x5a005a5a, 0x3f3f003f
+.long 0x59595900, 0x00b2b2b2, 0xac00acac, 0x94940094
+.long 0x78787800, 0x00f0f0f0, 0x3c003c3c, 0x5c5c005c
+.long 0x98989800, 0x00313131, 0x4c004c4c, 0x02020002
+.long 0x06060600, 0x000c0c0c, 0x03000303, 0x4a4a004a
+.long 0x6a6a6a00, 0x00d4d4d4, 0x35003535, 0x33330033
+.long 0xe7e7e700, 0x00cfcfcf, 0xf300f3f3, 0x67670067
+.long 0x46464600, 0x008c8c8c, 0x23002323, 0xf3f300f3
+.long 0x71717100, 0x00e2e2e2, 0xb800b8b8, 0x7f7f007f
+.long 0xbababa00, 0x00757575, 0x5d005d5d, 0xe2e200e2
+.long 0xd4d4d400, 0x00a9a9a9, 0x6a006a6a, 0x9b9b009b
+.long 0x25252500, 0x004a4a4a, 0x92009292, 0x26260026
+.long 0xababab00, 0x00575757, 0xd500d5d5, 0x37370037
+.long 0x42424200, 0x00848484, 0x21002121, 0x3b3b003b
+.long 0x88888800, 0x00111111, 0x44004444, 0x96960096
+.long 0xa2a2a200, 0x00454545, 0x51005151, 0x4b4b004b
+.long 0x8d8d8d00, 0x001b1b1b, 0xc600c6c6, 0xbebe00be
+.long 0xfafafa00, 0x00f5f5f5, 0x7d007d7d, 0x2e2e002e
+.long 0x72727200, 0x00e4e4e4, 0x39003939, 0x79790079
+.long 0x07070700, 0x000e0e0e, 0x83008383, 0x8c8c008c
+.long 0xb9b9b900, 0x00737373, 0xdc00dcdc, 0x6e6e006e
+.long 0x55555500, 0x00aaaaaa, 0xaa00aaaa, 0x8e8e008e
+.long 0xf8f8f800, 0x00f1f1f1, 0x7c007c7c, 0xf5f500f5
+.long 0xeeeeee00, 0x00dddddd, 0x77007777, 0xb6b600b6
+.long 0xacacac00, 0x00595959, 0x56005656, 0xfdfd00fd
+.long 0x0a0a0a00, 0x00141414, 0x05000505, 0x59590059
+.long 0x36363600, 0x006c6c6c, 0x1b001b1b, 0x98980098
+.long 0x49494900, 0x00929292, 0xa400a4a4, 0x6a6a006a
+.long 0x2a2a2a00, 0x00545454, 0x15001515, 0x46460046
+.long 0x68686800, 0x00d0d0d0, 0x34003434, 0xbaba00ba
+.long 0x3c3c3c00, 0x00787878, 0x1e001e1e, 0x25250025
+.long 0x38383800, 0x00707070, 0x1c001c1c, 0x42420042
+.long 0xf1f1f100, 0x00e3e3e3, 0xf800f8f8, 0xa2a200a2
+.long 0xa4a4a400, 0x00494949, 0x52005252, 0xfafa00fa
+.long 0x40404000, 0x00808080, 0x20002020, 0x07070007
+.long 0x28282800, 0x00505050, 0x14001414, 0x55550055
+.long 0xd3d3d300, 0x00a7a7a7, 0xe900e9e9, 0xeeee00ee
+.long 0x7b7b7b00, 0x00f6f6f6, 0xbd00bdbd, 0x0a0a000a
+.long 0xbbbbbb00, 0x00777777, 0xdd00dddd, 0x49490049
+.long 0xc9c9c900, 0x00939393, 0xe400e4e4, 0x68680068
+.long 0x43434300, 0x00868686, 0xa100a1a1, 0x38380038
+.long 0xc1c1c100, 0x00838383, 0xe000e0e0, 0xa4a400a4
+.long 0x15151500, 0x002a2a2a, 0x8a008a8a, 0x28280028
+.long 0xe3e3e300, 0x00c7c7c7, 0xf100f1f1, 0x7b7b007b
+.long 0xadadad00, 0x005b5b5b, 0xd600d6d6, 0xc9c900c9
+.long 0xf4f4f400, 0x00e9e9e9, 0x7a007a7a, 0xc1c100c1
+.long 0x77777700, 0x00eeeeee, 0xbb00bbbb, 0xe3e300e3
+.long 0xc7c7c700, 0x008f8f8f, 0xe300e3e3, 0xf4f400f4
+.long 0x80808000, 0x00010101, 0x40004040, 0xc7c700c7
+.long 0x9e9e9e00, 0x003d3d3d, 0x4f004f4f, 0x9e9e009e
+ELF(.size _gcry_camellia_arm_tables,.-_gcry_camellia_arm_tables;)
+
+#endif /*HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS*/
+#endif /*__AARCH64EL__*/
diff --git a/grub-core/lib/libgcrypt/cipher/camellia-aesni-avx-amd64.S 
b/grub-core/lib/libgcrypt/cipher/camellia-aesni-avx-amd64.S
new file mode 100644
index 000000000..76e62ea89
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/camellia-aesni-avx-amd64.S
@@ -0,0 +1,2802 @@
+/* camellia-avx-aesni-amd64.S  -  AES-NI/AVX implementation of Camellia cipher
+ *
+ * Copyright (C) 2013-2015,2020,2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+    defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)
+
+#include "asm-common-amd64.h"
+
+#define CAMELLIA_TABLE_BYTE_LEN 272
+
+/* struct CAMELLIA_context: */
+#define key_table 0
+#define key_bitlength CAMELLIA_TABLE_BYTE_LEN
+
+/* register macros */
+#define CTX %rdi
+
+/**********************************************************************
+  helper macros
+ **********************************************************************/
+/*
+ * Apply an 8-bit byte permutation to every byte of x via two 4-bit
+ * (nibble) table lookups: lo_t is indexed by the low nibble, hi_t by
+ * the high nibble, and the two results are combined with XOR.
+ * mask4bit must hold 0x0f in every byte; tmp0 is clobbered.
+ */
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+       vpand x, mask4bit, tmp0; \
+       vpandn x, mask4bit, x; \
+       vpsrld $4, x, x; \
+       \
+       vpshufb tmp0, lo_t, tmp0; \
+       vpshufb x, hi_t, x; \
+       vpxor tmp0, x, x;
+
+/**********************************************************************
+  16-way camellia
+ **********************************************************************/
+
+/*
+ * One Camellia round on 16 byte-sliced blocks.
+ *
+ * The Camellia s-boxes are computed with AES-NI: the pre-filter
+ * tables (.Lpre_tf_*) map the input into the AES field, vaesenclast
+ * with an all-zero round key (t4 is zeroed below) performs the AES
+ * SubBytes + ShiftRows, and the post-filter tables (.Lpost_tf_*)
+ * map the result back (the s2/s3 tables also fold in those sboxes'
+ * one-bit rotations; see the rodata comments).  The P-function and
+ * the round-key/CD XOR follow.
+ *
+ * IN:
+ *   x0..x7: byte-sliced AB state
+ *   mem_cd: register pointer storing CD state
+ *   key: index for key material
+ * OUT:
+ *   x0..x7: new byte-sliced CD state
+ */
+#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
+                 t7, mem_cd, key) \
+       /* \
+        * S-function with AES subbytes \
+        */ \
+       vmovdqa .Linv_shift_row rRIP, t4; \
+       vbroadcastss .L0f0f0f0f rRIP, t7; \
+       vmovdqa .Lpre_tf_lo_s1 rRIP, t0; \
+       vmovdqa .Lpre_tf_hi_s1 rRIP, t1; \
+       \
+       /* AES inverse shift rows */ \
+       vpshufb t4, x0, x0; \
+       vpshufb t4, x7, x7; \
+       vpshufb t4, x1, x1; \
+       vpshufb t4, x4, x4; \
+       vpshufb t4, x2, x2; \
+       vpshufb t4, x5, x5; \
+       vpshufb t4, x3, x3; \
+       vpshufb t4, x6, x6; \
+       \
+       /* prefilter sboxes 1, 2 and 3 */ \
+       vmovdqa .Lpre_tf_lo_s4 rRIP, t2; \
+       vmovdqa .Lpre_tf_hi_s4 rRIP, t3; \
+       filter_8bit(x0, t0, t1, t7, t6); \
+       filter_8bit(x7, t0, t1, t7, t6); \
+       filter_8bit(x1, t0, t1, t7, t6); \
+       filter_8bit(x4, t0, t1, t7, t6); \
+       filter_8bit(x2, t0, t1, t7, t6); \
+       filter_8bit(x5, t0, t1, t7, t6); \
+       \
+       /* prefilter sbox 4 */ \
+       vpxor t4, t4, t4; \
+       filter_8bit(x3, t2, t3, t7, t6); \
+       filter_8bit(x6, t2, t3, t7, t6); \
+       \
+       /* AES subbytes + AES shift rows */ \
+       vmovdqa .Lpost_tf_lo_s1 rRIP, t0; \
+       vmovdqa .Lpost_tf_hi_s1 rRIP, t1; \
+       vaesenclast t4, x0, x0; \
+       vaesenclast t4, x7, x7; \
+       vaesenclast t4, x1, x1; \
+       vaesenclast t4, x4, x4; \
+       vaesenclast t4, x2, x2; \
+       vaesenclast t4, x5, x5; \
+       vaesenclast t4, x3, x3; \
+       vaesenclast t4, x6, x6; \
+       \
+       /* postfilter sboxes 1 and 4 */ \
+       vmovdqa .Lpost_tf_lo_s3 rRIP, t2; \
+       vmovdqa .Lpost_tf_hi_s3 rRIP, t3; \
+       filter_8bit(x0, t0, t1, t7, t6); \
+       filter_8bit(x7, t0, t1, t7, t6); \
+       filter_8bit(x3, t0, t1, t7, t6); \
+       filter_8bit(x6, t0, t1, t7, t6); \
+       \
+       /* postfilter sbox 3 */ \
+       vmovdqa .Lpost_tf_lo_s2 rRIP, t4; \
+       vmovdqa .Lpost_tf_hi_s2 rRIP, t5; \
+       filter_8bit(x2, t2, t3, t7, t6); \
+       filter_8bit(x5, t2, t3, t7, t6); \
+       \
+       vmovq key, t0; \
+       \
+       /* postfilter sbox 2 */ \
+       filter_8bit(x1, t4, t5, t7, t2); \
+       filter_8bit(x4, t4, t5, t7, t2); \
+       \
+       vpshufb .Lbyte_threes rRIP, t0, t3; \
+       vpshufb .Lbyte_twos rRIP, t0, t2; \
+       \
+       /* P-function */ \
+       vpxor x5, x0, x0; \
+       vpxor x6, x1, x1; \
+       vpxor x7, x2, x2; \
+       vpxor x4, x3, x3; \
+       \
+       vpshufb .Lbyte_ones rRIP, t0, t1; \
+       vpshufb .Lbyte_sevens rRIP, t0, t7; \
+       \
+       vpxor x2, x4, x4; \
+       vpxor x3, x5, x5; \
+       vpxor x0, x6, x6; \
+       vpxor x1, x7, x7; \
+       \
+       vpshufb .Lbyte_sixs rRIP, t0, t6; \
+       vpshufb .Lbyte_fives rRIP, t0, t5; \
+       vpxor x7, x0, x0; \
+       vpxor x4, x1, x1; \
+       vpxor x5, x2, x2; \
+       vpxor x6, x3, x3; \
+       \
+       vpshufb .Lbyte_fours rRIP, t0, t4; \
+       \
+       vpxor x3, x4, x4; \
+       vpxor x0, x5, x5; \
+       vpxor x1, x6, x6; \
+       vpxor x2, x7, x7; /* note: high and low parts swapped */ \
+       \
+       /* Add key material and result to CD (x becomes new CD) */ \
+       \
+       vpxor t3, x4, x4; \
+       vpxor t3, t3, t3; \
+       vpxor 0 * 16(mem_cd), x4, x4; \
+       \
+       vpshufb t3, t0, t0; \
+       \
+       vpxor t2, x5, x5; \
+       vpxor 1 * 16(mem_cd), x5, x5; \
+       \
+       vpxor t1, x6, x6; \
+       vpxor 2 * 16(mem_cd), x6, x6; \
+       \
+       vpxor t0, x7, x7; \
+       vpxor 3 * 16(mem_cd), x7, x7; \
+       \
+       vpxor t7, x0, x0; \
+       vpxor 4 * 16(mem_cd), x0, x0; \
+       \
+       vpxor t6, x1, x1; \
+       vpxor 5 * 16(mem_cd), x1, x1; \
+       \
+       vpxor t5, x2, x2; \
+       vpxor 6 * 16(mem_cd), x2, x2; \
+       \
+       vpxor t4, x3, x3; \
+       vpxor 7 * 16(mem_cd), x3, x3;
+
+/*
+ * Two Camellia rounds: AB -> CD (spilled to mem_cd), then CD -> AB.
+ * store_ab selects whether the new AB state is written back to
+ * mem_ab (store_ab_state) or discarded (dummy_store, last round pair).
+ *
+ * IN/OUT:
+ *  x0..x7: byte-sliced AB state preloaded
+ *  mem_ab: byte-sliced AB state in memory
+ *  mem_cd: byte-sliced CD state in memory
+ */
+#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
+       roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                 y6, y7, mem_cd, (key_table + (i) * 8)(CTX)); \
+       \
+       vmovdqu x4, 0 * 16(mem_cd); \
+       vmovdqu x5, 1 * 16(mem_cd); \
+       vmovdqu x6, 2 * 16(mem_cd); \
+       vmovdqu x7, 3 * 16(mem_cd); \
+       vmovdqu x0, 4 * 16(mem_cd); \
+       vmovdqu x1, 5 * 16(mem_cd); \
+       vmovdqu x2, 6 * 16(mem_cd); \
+       vmovdqu x3, 7 * 16(mem_cd); \
+       \
+       roundsm16(x4, x5, x6, x7, x0, x1, x2, x3, y0, y1, y2, y3, y4, y5, \
+                 y6, y7, mem_ab, (key_table + ((i) + (dir)) * 8)(CTX)); \
+       \
+       store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
+
+/* store_ab callbacks for two_roundsm16: dummy_store is passed for the
+ * final round pair, where the AB spill is not needed. */
+#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
+
+#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
+       /* Store new AB state */ \
+       vmovdqu x0, 0 * 16(mem_ab); \
+       vmovdqu x1, 1 * 16(mem_ab); \
+       vmovdqu x2, 2 * 16(mem_ab); \
+       vmovdqu x3, 3 * 16(mem_ab); \
+       vmovdqu x4, 4 * 16(mem_ab); \
+       vmovdqu x5, 5 * 16(mem_ab); \
+       vmovdqu x6, 6 * 16(mem_ab); \
+       vmovdqu x7, 7 * 16(mem_ab);
+
+/* Six encryption rounds, key indices (i)+2 .. (i)+7 ascending; the AB
+ * store is skipped after the last pair. */
+#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, i) \
+       two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
+       two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
+       two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
+
+/* Six decryption rounds walking the key schedule backwards, indices
+ * (i)+7 .. (i)+2 descending (dir = -1). */
+#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, i) \
+       two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
+       two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
+       two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
+
+/*
+ * Rotate four byte-sliced 32-bit words left by one bit.
+ *
+ * Per byte slice: vpcmpgtb against zero yields 0xff where the top bit
+ * is set, vpabsb reduces that to a 0/1 carry, vpaddb doubles the byte
+ * (shift left by 1), and vpor injects the carry taken from the
+ * neighbouring (more significant) slice; the v0 carry wraps to v3's
+ * result via t0.  zero must hold all-zero; t0..t2 are clobbered.
+ *
+ * IN:
+ *  v0..3: byte-sliced 32-bit integers
+ * OUT:
+ *  v0..3: (IN <<< 1)
+ */
+#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
+       vpcmpgtb v0, zero, t0; \
+       vpaddb v0, v0, v0; \
+       vpabsb t0, t0; \
+       \
+       vpcmpgtb v1, zero, t1; \
+       vpaddb v1, v1, v1; \
+       vpabsb t1, t1; \
+       \
+       vpcmpgtb v2, zero, t2; \
+       vpaddb v2, v2, v2; \
+       vpabsb t2, t2; \
+       \
+       vpor t0, v1, v1; \
+       \
+       vpcmpgtb v3, zero, t0; \
+       vpaddb v3, v3, v3; \
+       vpabsb t0, t0; \
+       \
+       vpor t1, v2, v2; \
+       vpor t2, v3, v3; \
+       vpor t0, v0, v0;
+
+/*
+ * Camellia FL/FL^-1 layer on 16 byte-sliced blocks, following the
+ * kll/klr/krl/krr sub-key pseudo-code in the inline comments.  Each
+ * 32-bit subkey is broadcast into four byte slices with vpshufb
+ * against the .Lbyte_* constants (tt0 must end up all-zero for the
+ * byte-0 broadcast).
+ *
+ * IN:
+ *   r: byte-sliced AB state in memory
+ *   l: byte-sliced CD state in memory
+ * OUT:
+ *   x0..x7: new byte-sliced CD state
+ *   (updated halves are also written back to the l and r memory areas)
+ */
+#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
+             tt1, tt2, tt3, kll, klr, krl, krr) \
+       /* \
+        * t0 = kll; \
+        * t0 &= ll; \
+        * lr ^= rol32(t0, 1); \
+        */ \
+       vpxor tt0, tt0, tt0; \
+       vmovd kll, t0; \
+       vpshufb tt0, t0, t3; \
+       vpshufb .Lbyte_ones rRIP, t0, t2; \
+       vpshufb .Lbyte_twos rRIP, t0, t1; \
+       vpshufb .Lbyte_threes rRIP, t0, t0; \
+       \
+       vpand l0, t0, t0; \
+       vpand l1, t1, t1; \
+       vpand l2, t2, t2; \
+       vpand l3, t3, t3; \
+       \
+       rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+       \
+       vpxor l4, t0, l4; \
+       vmovdqu l4, 4 * 16(l); \
+       vpxor l5, t1, l5; \
+       vmovdqu l5, 5 * 16(l); \
+       vpxor l6, t2, l6; \
+       vmovdqu l6, 6 * 16(l); \
+       vpxor l7, t3, l7; \
+       vmovdqu l7, 7 * 16(l); \
+       \
+       /* \
+        * t2 = krr; \
+        * t2 |= rr; \
+        * rl ^= t2; \
+        */ \
+       \
+       vmovd krr, t0; \
+       vpshufb tt0, t0, t3; \
+       vpshufb .Lbyte_ones rRIP, t0, t2; \
+       vpshufb .Lbyte_twos rRIP, t0, t1; \
+       vpshufb .Lbyte_threes rRIP, t0, t0; \
+       \
+       vpor 4 * 16(r), t0, t0; \
+       vpor 5 * 16(r), t1, t1; \
+       vpor 6 * 16(r), t2, t2; \
+       vpor 7 * 16(r), t3, t3; \
+       \
+       vpxor 0 * 16(r), t0, t0; \
+       vpxor 1 * 16(r), t1, t1; \
+       vpxor 2 * 16(r), t2, t2; \
+       vpxor 3 * 16(r), t3, t3; \
+       vmovdqu t0, 0 * 16(r); \
+       vmovdqu t1, 1 * 16(r); \
+       vmovdqu t2, 2 * 16(r); \
+       vmovdqu t3, 3 * 16(r); \
+       \
+       /* \
+        * t2 = krl; \
+        * t2 &= rl; \
+        * rr ^= rol32(t2, 1); \
+        */ \
+       vmovd krl, t0; \
+       vpshufb tt0, t0, t3; \
+       vpshufb .Lbyte_ones rRIP, t0, t2; \
+       vpshufb .Lbyte_twos rRIP, t0, t1; \
+       vpshufb .Lbyte_threes rRIP, t0, t0; \
+       \
+       vpand 0 * 16(r), t0, t0; \
+       vpand 1 * 16(r), t1, t1; \
+       vpand 2 * 16(r), t2, t2; \
+       vpand 3 * 16(r), t3, t3; \
+       \
+       rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+       \
+       vpxor 4 * 16(r), t0, t0; \
+       vpxor 5 * 16(r), t1, t1; \
+       vpxor 6 * 16(r), t2, t2; \
+       vpxor 7 * 16(r), t3, t3; \
+       vmovdqu t0, 4 * 16(r); \
+       vmovdqu t1, 5 * 16(r); \
+       vmovdqu t2, 6 * 16(r); \
+       vmovdqu t3, 7 * 16(r); \
+       \
+       /* \
+        * t0 = klr; \
+        * t0 |= lr; \
+        * ll ^= t0; \
+        */ \
+       \
+       vmovd klr, t0; \
+       vpshufb tt0, t0, t3; \
+       vpshufb .Lbyte_ones rRIP, t0, t2; \
+       vpshufb .Lbyte_twos rRIP, t0, t1; \
+       vpshufb .Lbyte_threes rRIP, t0, t0; \
+       \
+       vpor l4, t0, t0; \
+       vpor l5, t1, t1; \
+       vpor l6, t2, t2; \
+       vpor l7, t3, t3; \
+       \
+       vpxor l0, t0, l0; \
+       vmovdqu l0, 0 * 16(l); \
+       vpxor l1, t1, l1; \
+       vmovdqu l1, 1 * 16(l); \
+       vpxor l2, t2, l2; \
+       vmovdqu l2, 2 * 16(l); \
+       vpxor l3, t3, l3; \
+       vmovdqu l3, 3 * 16(l);
+
+/* Transpose a 4x4 matrix of 32-bit elements held in x0..x3 using
+ * dword/qword unpacks; t1 and t2 are clobbered. */
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+       vpunpckhdq x1, x0, t2; \
+       vpunpckldq x1, x0, x0; \
+       \
+       vpunpckldq x3, x2, t1; \
+       vpunpckhdq x3, x2, x2; \
+       \
+       vpunpckhqdq t1, x0, x1; \
+       vpunpcklqdq t1, x0, x0; \
+       \
+       vpunpckhqdq x2, t2, x3; \
+       vpunpcklqdq x2, t2, x2;
+
+/* Byte-slice sixteen 16-byte vectors: a series of 4x4 dword
+ * transposes plus a per-vector .Lshufb_16x16b byte shuffle.  st0 and
+ * st1 are two 16-byte scratch memory slots used to spill registers
+ * while the register file is fully occupied. */
+#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
+                             a3, b3, c3, d3, st0, st1) \
+       vmovdqu d2, st0; \
+       vmovdqu d3, st1; \
+       transpose_4x4(a0, a1, a2, a3, d2, d3); \
+       transpose_4x4(b0, b1, b2, b3, d2, d3); \
+       vmovdqu st0, d2; \
+       vmovdqu st1, d3; \
+       \
+       vmovdqu a0, st0; \
+       vmovdqu a1, st1; \
+       transpose_4x4(c0, c1, c2, c3, a0, a1); \
+       transpose_4x4(d0, d1, d2, d3, a0, a1); \
+       \
+       vmovdqu .Lshufb_16x16b rRIP, a0; \
+       vmovdqu st1, a1; \
+       vpshufb a0, a2, a2; \
+       vpshufb a0, a3, a3; \
+       vpshufb a0, b0, b0; \
+       vpshufb a0, b1, b1; \
+       vpshufb a0, b2, b2; \
+       vpshufb a0, b3, b3; \
+       vpshufb a0, a1, a1; \
+       vpshufb a0, c0, c0; \
+       vpshufb a0, c1, c1; \
+       vpshufb a0, c2, c2; \
+       vpshufb a0, c3, c3; \
+       vpshufb a0, d0, d0; \
+       vpshufb a0, d1, d1; \
+       vpshufb a0, d2, d2; \
+       vpshufb a0, d3, d3; \
+       vmovdqu d3, st1; \
+       vmovdqu st0, d3; \
+       vpshufb a0, d3, a0; \
+       vmovdqu d2, st0; \
+       \
+       transpose_4x4(a0, b0, c0, d0, d2, d3); \
+       transpose_4x4(a1, b1, c1, d1, d2, d3); \
+       vmovdqu st0, d2; \
+       vmovdqu st1, d3; \
+       \
+       vmovdqu b0, st0; \
+       vmovdqu b1, st1; \
+       transpose_4x4(a2, b2, c2, d2, b0, b1); \
+       transpose_4x4(a3, b3, c3, d3, b0, b1); \
+       vmovdqu st0, b0; \
+       vmovdqu st1, b1; \
+       /* does not adjust output bytes inside vectors */
+
+/* Transpose an 8x8 byte matrix spread across registers a..h using
+ * byte/word/qword unpacks followed by a .Ltranspose_8x8_shuf fix-up
+ * shuffle; t0..t4 are clobbered. */
+#define transpose_8x8b(a, b, c, d, e, f, g, h, t0, t1, t2, t3, t4) \
+       vpunpcklbw a, b, t0; \
+       vpunpckhbw a, b, b; \
+       \
+       vpunpcklbw c, d, t1; \
+       vpunpckhbw c, d, d; \
+       \
+       vpunpcklbw e, f, t2; \
+       vpunpckhbw e, f, f; \
+       \
+       vpunpcklbw g, h, t3; \
+       vpunpckhbw g, h, h; \
+       \
+       vpunpcklwd t0, t1, g; \
+       vpunpckhwd t0, t1, t0; \
+       \
+       vpunpcklwd b, d, t1; \
+       vpunpckhwd b, d, e; \
+       \
+       vpunpcklwd t2, t3, c; \
+       vpunpckhwd t2, t3, t2; \
+       \
+       vpunpcklwd f, h, t3; \
+       vpunpckhwd f, h, b; \
+       \
+       vpunpcklwd e, b, t4; \
+       vpunpckhwd e, b, b; \
+       \
+       vpunpcklwd t1, t3, e; \
+       vpunpckhwd t1, t3, f; \
+       \
+       vmovdqa .Ltranspose_8x8_shuf rRIP, t3; \
+       \
+       vpunpcklwd g, c, d; \
+       vpunpckhwd g, c, c; \
+       \
+       vpunpcklwd t0, t2, t1; \
+       vpunpckhwd t0, t2, h; \
+       \
+       vpunpckhqdq b, h, a; \
+       vpshufb t3, a, a; \
+       vpunpcklqdq b, h, b; \
+       vpshufb t3, b, b; \
+       \
+       vpunpckhqdq e, d, g; \
+       vpshufb t3, g, g; \
+       vpunpcklqdq e, d, h; \
+       vpshufb t3, h, h; \
+       \
+       vpunpckhqdq f, c, e; \
+       vpshufb t3, e, e; \
+       vpunpcklqdq f, c, f; \
+       vpshufb t3, f, f; \
+       \
+       vpunpckhqdq t4, t1, c; \
+       vpshufb t3, c, c; \
+       vpunpcklqdq t4, t1, d; \
+       vpshufb t3, d, d;
+
+/* load blocks to registers and apply pre-whitening */
+/* The 64-bit whitening key is broadcast/byte-swapped into x0 via
+ * .Lpack_bswap, then XORed into all 16 input blocks. */
+#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                    y6, y7, rio, key) \
+       vmovq key, x0; \
+       vpshufb .Lpack_bswap rRIP, x0, x0; \
+       \
+       vpxor 0 * 16(rio), x0, y7; \
+       vpxor 1 * 16(rio), x0, y6; \
+       vpxor 2 * 16(rio), x0, y5; \
+       vpxor 3 * 16(rio), x0, y4; \
+       vpxor 4 * 16(rio), x0, y3; \
+       vpxor 5 * 16(rio), x0, y2; \
+       vpxor 6 * 16(rio), x0, y1; \
+       vpxor 7 * 16(rio), x0, y0; \
+       vpxor 8 * 16(rio), x0, x7; \
+       vpxor 9 * 16(rio), x0, x6; \
+       vpxor 10 * 16(rio), x0, x5; \
+       vpxor 11 * 16(rio), x0, x4; \
+       vpxor 12 * 16(rio), x0, x3; \
+       vpxor 13 * 16(rio), x0, x2; \
+       vpxor 14 * 16(rio), x0, x1; \
+       vpxor 15 * 16(rio), x0, x0;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+/* After slicing, the x registers hold the AB halves (stored to
+ * mem_ab) and the y registers the CD halves (stored to mem_cd). */
+#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd) \
+       byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
+                             y4, y5, y6, y7, (mem_ab), (mem_cd)); \
+       \
+       vmovdqu x0, 0 * 16(mem_ab); \
+       vmovdqu x1, 1 * 16(mem_ab); \
+       vmovdqu x2, 2 * 16(mem_ab); \
+       vmovdqu x3, 3 * 16(mem_ab); \
+       vmovdqu x4, 4 * 16(mem_ab); \
+       vmovdqu x5, 5 * 16(mem_ab); \
+       vmovdqu x6, 6 * 16(mem_ab); \
+       vmovdqu x7, 7 * 16(mem_ab); \
+       vmovdqu y0, 0 * 16(mem_cd); \
+       vmovdqu y1, 1 * 16(mem_cd); \
+       vmovdqu y2, 2 * 16(mem_cd); \
+       vmovdqu y3, 3 * 16(mem_cd); \
+       vmovdqu y4, 4 * 16(mem_cd); \
+       vmovdqu y5, 5 * 16(mem_cd); \
+       vmovdqu y6, 6 * 16(mem_cd); \
+       vmovdqu y7, 7 * 16(mem_cd);
+
+/* de-byteslice, apply post-whitening and store blocks */
+/* stack_tmp0 temporarily spills x0 while x0 holds the broadcast
+ * whitening key; the spilled value is XORed back in at the end. */
+#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
+                   y5, y6, y7, key, stack_tmp0, stack_tmp1) \
+       byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
+                             y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
+       \
+       vmovdqu x0, stack_tmp0; \
+       \
+       vmovq key, x0; \
+       vpshufb .Lpack_bswap rRIP, x0, x0; \
+       \
+       vpxor x0, y7, y7; \
+       vpxor x0, y6, y6; \
+       vpxor x0, y5, y5; \
+       vpxor x0, y4, y4; \
+       vpxor x0, y3, y3; \
+       vpxor x0, y2, y2; \
+       vpxor x0, y1, y1; \
+       vpxor x0, y0, y0; \
+       vpxor x0, x7, x7; \
+       vpxor x0, x6, x6; \
+       vpxor x0, x5, x5; \
+       vpxor x0, x4, x4; \
+       vpxor x0, x3, x3; \
+       vpxor x0, x2, x2; \
+       vpxor x0, x1, x1; \
+       vpxor stack_tmp0, x0, x0;
+
+/* Store the sixteen 16-byte result blocks contiguously to rio. */
+#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                    y6, y7, rio) \
+       vmovdqu x0, 0 * 16(rio); \
+       vmovdqu x1, 1 * 16(rio); \
+       vmovdqu x2, 2 * 16(rio); \
+       vmovdqu x3, 3 * 16(rio); \
+       vmovdqu x4, 4 * 16(rio); \
+       vmovdqu x5, 5 * 16(rio); \
+       vmovdqu x6, 6 * 16(rio); \
+       vmovdqu x7, 7 * 16(rio); \
+       vmovdqu y0, 8 * 16(rio); \
+       vmovdqu y1, 9 * 16(rio); \
+       vmovdqu y2, 10 * 16(rio); \
+       vmovdqu y3, 11 * 16(rio); \
+       vmovdqu y4, 12 * 16(rio); \
+       vmovdqu y5, 13 * 16(rio); \
+       vmovdqu y6, 14 * 16(rio); \
+       vmovdqu y7, 15 * 16(rio);
+
+SECTION_RODATA
+
+ELF(.type _camellia_aesni_avx_data,@object;)
+_camellia_aesni_avx_data:
+.align 16
+
+#define SHUFB_BYTES(idx) \
+       0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+
+.Lshufb_16x16b:
+       .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
+
+.Lpack_bswap:
+       .long 0x00010203
+       .long 0x04050607
+       .long 0x80808080
+       .long 0x80808080
+
+.Lbyte_ones:
+       .quad 1 * 0x0101010101010101
+       .quad 1 * 0x0101010101010101
+.Lbyte_twos:
+       .quad 2 * 0x0101010101010101
+       .quad 2 * 0x0101010101010101
+.Lbyte_threes:
+       .quad 3 * 0x0101010101010101
+       .quad 3 * 0x0101010101010101
+.Lbyte_fours:
+       .quad 4 * 0x0101010101010101
+       .quad 4 * 0x0101010101010101
+.Lbyte_fives:
+       .quad 5 * 0x0101010101010101
+       .quad 5 * 0x0101010101010101
+.Lbyte_sixs:
+       .quad 6 * 0x0101010101010101
+       .quad 6 * 0x0101010101010101
+.Lbyte_sevens:
+       .quad 7 * 0x0101010101010101
+       .quad 7 * 0x0101010101010101
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+       .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox1, sbox2, sbox3:
+ *   swap_bitendianness(
+ *       isom_map_camellia_to_aes(
+ *           camellia_f(
+ *               swap_bitendianness(in)
+ *           )
+ *       )
+ *   )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s1:
+       .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
+       .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
+.Lpre_tf_hi_s1:
+       .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
+       .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox4:
+ *   swap_bitendianness(
+ *       isom_map_camellia_to_aes(
+ *           camellia_f(
+ *               swap_bitendianness(in <<< 1)
+ *           )
+ *       )
+ *   )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s4:
+       .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
+       .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
+.Lpre_tf_hi_s4:
+       .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
+       .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox1, sbox4:
+ *  swap_bitendianness(
+ *      camellia_h(
+ *          isom_map_aes_to_camellia(
+ *              swap_bitendianness(
+ *                  aes_inverse_affine_transform(in)
+ *              )
+ *          )
+ *      )
+ *  )
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s1:
+       .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
+       .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
+.Lpost_tf_hi_s1:
+       .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
+       .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox2:
+ *  swap_bitendianness(
+ *      camellia_h(
+ *          isom_map_aes_to_camellia(
+ *              swap_bitendianness(
+ *                  aes_inverse_affine_transform(in)
+ *              )
+ *          )
+ *      )
+ *  ) <<< 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s2:
+       .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
+       .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
+.Lpost_tf_hi_s2:
+       .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
+       .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox3:
+ *  swap_bitendianness(
+ *      camellia_h(
+ *          isom_map_aes_to_camellia(
+ *              swap_bitendianness(
+ *                  aes_inverse_affine_transform(in)
+ *              )
+ *          )
+ *      )
+ *  ) >>> 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s3:
+       .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
+       .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
+.Lpost_tf_hi_s3:
+       .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
+       .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+       .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+       .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+
+/* shuffle mask for 8x8 byte transpose */
+.Ltranspose_8x8_shuf:
+       .byte 0, 1, 4, 5, 2, 3, 6, 7, 8+0, 8+1, 8+4, 8+5, 8+2, 8+3, 8+6, 8+7
+
+/* CTR byte addition constants */
+.Lbige_addb_1:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+.Lbige_addb_2:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
+.Lbige_addb_3:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
+.Lbige_addb_4:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
+.Lbige_addb_5:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
+.Lbige_addb_6:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
+.Lbige_addb_7:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
+.Lbige_addb_8:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
+.Lbige_addb_9:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
+.Lbige_addb_10:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
+.Lbige_addb_11:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
+.Lbige_addb_12:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
+.Lbige_addb_13:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
+.Lbige_addb_14:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
+.Lbige_addb_15:
+       .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
+
+.align 4
+/* 4-bit mask */
+.L0f0f0f0f:
+       .long 0x0f0f0f0f
+
+.text
+
+.align 16
+ELF(.type   __camellia_enc_blk16,@function;)
+
+__camellia_enc_blk16:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rax: temporary storage, 256 bytes
+        *      %r8d: 24 for 16 byte key, 32 for larger
+        *      %xmm0..%xmm15: 16 plaintext blocks
+        * output:
+        *      %xmm0..%xmm15: 16 encrypted blocks, order swapped:
+        *       7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+        */
+       CFI_STARTPROC();
+
+       leaq 8 * 16(%rax), %rcx;
+
+       leaq (-8 * 8)(CTX, %r8, 8), %r8;
+
+       inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                     %xmm15, %rax, %rcx);
+
+.align 8
+.Lenc_loop:
+       enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                    %xmm15, %rax, %rcx, 0);
+
+       cmpq %r8, CTX;
+       je .Lenc_done;
+       leaq (8 * 8)(CTX), CTX;
+
+       fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+             %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+             %xmm15,
+             ((key_table) + 0)(CTX),
+             ((key_table) + 4)(CTX),
+             ((key_table) + 8)(CTX),
+             ((key_table) + 12)(CTX));
+       jmp .Lenc_loop;
+
+.align 8
+.Lenc_done:
+       /* load CD for output */
+       vmovdqu 0 * 16(%rcx), %xmm8;
+       vmovdqu 1 * 16(%rcx), %xmm9;
+       vmovdqu 2 * 16(%rcx), %xmm10;
+       vmovdqu 3 * 16(%rcx), %xmm11;
+       vmovdqu 4 * 16(%rcx), %xmm12;
+       vmovdqu 5 * 16(%rcx), %xmm13;
+       vmovdqu 6 * 16(%rcx), %xmm14;
+       vmovdqu 7 * 16(%rcx), %xmm15;
+
+       outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+                   %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                   %xmm15, ((key_table) + 8 * 8)(%r8), (%rax), 1 * 16(%rax));
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __camellia_enc_blk16,.-__camellia_enc_blk16;)
+
+.align 16
+ELF(.type   __camellia_dec_blk16,@function;)
+
+__camellia_dec_blk16:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rax: temporary storage, 256 bytes
+        *      %r8d: 24 for 16 byte key, 32 for larger
+        *      %xmm0..%xmm15: 16 encrypted blocks
+        * output:
+        *      %xmm0..%xmm15: 16 plaintext blocks, order swapped:
+        *       7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+        */
+       CFI_STARTPROC();
+
+       movq %r8, %rcx;
+       movq CTX, %r8
+       leaq (-8 * 8)(CTX, %rcx, 8), CTX;
+
+       leaq 8 * 16(%rax), %rcx;
+
+       inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+                     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                     %xmm15, %rax, %rcx);
+
+.align 8
+.Ldec_loop:
+       dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                    %xmm15, %rax, %rcx, 0);
+
+       cmpq %r8, CTX;
+       je .Ldec_done;
+
+       fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+             %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+             %xmm15,
+             ((key_table) + 8)(CTX),
+             ((key_table) + 12)(CTX),
+             ((key_table) + 0)(CTX),
+             ((key_table) + 4)(CTX));
+
+       leaq (-8 * 8)(CTX), CTX;
+       jmp .Ldec_loop;
+
+.align 8
+.Ldec_done:
+       /* load CD for output */
+       vmovdqu 0 * 16(%rcx), %xmm8;
+       vmovdqu 1 * 16(%rcx), %xmm9;
+       vmovdqu 2 * 16(%rcx), %xmm10;
+       vmovdqu 3 * 16(%rcx), %xmm11;
+       vmovdqu 4 * 16(%rcx), %xmm12;
+       vmovdqu 5 * 16(%rcx), %xmm13;
+       vmovdqu 6 * 16(%rcx), %xmm14;
+       vmovdqu 7 * 16(%rcx), %xmm15;
+
+       outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+                   %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                   %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __camellia_dec_blk16,.-__camellia_dec_blk16;)
+
+#define inc_le128(x, minus_one, tmp) \
+       vpcmpeqq minus_one, x, tmp; \
+       vpsubq minus_one, x, x; \
+       vpslldq $8, tmp, tmp; \
+       vpsubq tmp, x, x;
+
+.align 16
+.globl _gcry_camellia_aesni_avx_ctr_enc
+ELF(.type   _gcry_camellia_aesni_avx_ctr_enc,@function;)
+
+_gcry_camellia_aesni_avx_ctr_enc:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (16 blocks)
+        *      %rdx: src (16 blocks)
+        *      %rcx: iv (big endian, 128bit)
+        */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       vzeroupper;
+
+       cmpl $128, key_bitlength(CTX);
+       movl $32, %r8d;
+       movl $24, %eax;
+       cmovel %eax, %r8d; /* max */
+
+       subq $(16 * 16), %rsp;
+       andq $~31, %rsp;
+       movq %rsp, %rax;
+
+       cmpb $(0x100 - 16), 15(%rcx);
+       jbe .Lctr_byteadd;
+
+       vmovdqa .Lbswap128_mask rRIP, %xmm14;
+
+       /* load IV and byteswap */
+       vmovdqu (%rcx), %xmm15;
+       vmovdqu %xmm15, 15 * 16(%rax);
+       vpshufb %xmm14, %xmm15, %xmm0; /* be => le */
+
+       vpcmpeqd %xmm15, %xmm15, %xmm15;
+       vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */
+
+       /* construct IVs */
+       inc_le128(%xmm0, %xmm15, %xmm13);
+       vpshufb %xmm14, %xmm0, %xmm13;
+       vmovdqu %xmm13, 14 * 16(%rax);
+       inc_le128(%xmm0, %xmm15, %xmm13);
+       vpshufb %xmm14, %xmm0, %xmm13;
+       vmovdqu %xmm13, 13 * 16(%rax);
+       inc_le128(%xmm0, %xmm15, %xmm13);
+       vpshufb %xmm14, %xmm0, %xmm12;
+       inc_le128(%xmm0, %xmm15, %xmm13);
+       vpshufb %xmm14, %xmm0, %xmm11;
+       inc_le128(%xmm0, %xmm15, %xmm13);
+       vpshufb %xmm14, %xmm0, %xmm10;
+       inc_le128(%xmm0, %xmm15, %xmm13);
+       vpshufb %xmm14, %xmm0, %xmm9;
+       inc_le128(%xmm0, %xmm15, %xmm13);
+       vpshufb %xmm14, %xmm0, %xmm8;
+       inc_le128(%xmm0, %xmm15, %xmm13);
+       vpshufb %xmm14, %xmm0, %xmm7;
+       inc_le128(%xmm0, %xmm15, %xmm13);
+       vpshufb %xmm14, %xmm0, %xmm6;
+       inc_le128(%xmm0, %xmm15, %xmm13);
+       vpshufb %xmm14, %xmm0, %xmm5;
+       inc_le128(%xmm0, %xmm15, %xmm13);
+       vpshufb %xmm14, %xmm0, %xmm4;
+       inc_le128(%xmm0, %xmm15, %xmm13);
+       vpshufb %xmm14, %xmm0, %xmm3;
+       inc_le128(%xmm0, %xmm15, %xmm13);
+       vpshufb %xmm14, %xmm0, %xmm2;
+       inc_le128(%xmm0, %xmm15, %xmm13);
+       vpshufb %xmm14, %xmm0, %xmm1;
+       inc_le128(%xmm0, %xmm15, %xmm13);
+       vmovdqa %xmm0, %xmm13;
+       vpshufb %xmm14, %xmm0, %xmm0;
+       inc_le128(%xmm13, %xmm15, %xmm14);
+       vpshufb .Lbswap128_mask rRIP, %xmm13, %xmm13; /* le => be */
+       vmovdqu %xmm13, (%rcx);
+
+.align 8
+.Lload_ctr_done:
+       /* inpack16_pre: */
+       vmovq (key_table)(CTX), %xmm15;
+       vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15;
+       vpxor %xmm0, %xmm15, %xmm0;
+       vpxor %xmm1, %xmm15, %xmm1;
+       vpxor %xmm2, %xmm15, %xmm2;
+       vpxor %xmm3, %xmm15, %xmm3;
+       vpxor %xmm4, %xmm15, %xmm4;
+       vpxor %xmm5, %xmm15, %xmm5;
+       vpxor %xmm6, %xmm15, %xmm6;
+       vpxor %xmm7, %xmm15, %xmm7;
+       vpxor %xmm8, %xmm15, %xmm8;
+       vpxor %xmm9, %xmm15, %xmm9;
+       vpxor %xmm10, %xmm15, %xmm10;
+       vpxor %xmm11, %xmm15, %xmm11;
+       vpxor %xmm12, %xmm15, %xmm12;
+       vpxor 13 * 16(%rax), %xmm15, %xmm13;
+       vpxor 14 * 16(%rax), %xmm15, %xmm14;
+       vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+       call __camellia_enc_blk16;
+
+       vpxor 0 * 16(%rdx), %xmm7, %xmm7;
+       vpxor 1 * 16(%rdx), %xmm6, %xmm6;
+       vpxor 2 * 16(%rdx), %xmm5, %xmm5;
+       vpxor 3 * 16(%rdx), %xmm4, %xmm4;
+       vpxor 4 * 16(%rdx), %xmm3, %xmm3;
+       vpxor 5 * 16(%rdx), %xmm2, %xmm2;
+       vpxor 6 * 16(%rdx), %xmm1, %xmm1;
+       vpxor 7 * 16(%rdx), %xmm0, %xmm0;
+       vpxor 8 * 16(%rdx), %xmm15, %xmm15;
+       vpxor 9 * 16(%rdx), %xmm14, %xmm14;
+       vpxor 10 * 16(%rdx), %xmm13, %xmm13;
+       vpxor 11 * 16(%rdx), %xmm12, %xmm12;
+       vpxor 12 * 16(%rdx), %xmm11, %xmm11;
+       vpxor 13 * 16(%rdx), %xmm10, %xmm10;
+       vpxor 14 * 16(%rdx), %xmm9, %xmm9;
+       vpxor 15 * 16(%rdx), %xmm8, %xmm8;
+
+       write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+                    %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+                    %xmm8, %rsi);
+
+       vzeroall;
+
+       leave;
+       CFI_LEAVE();
+       ret_spec_stop;
+
+.align 8
+.Lctr_byteadd_full_ctr_carry:
+       movq 8(%rcx), %r11;
+       movq (%rcx), %r10;
+       bswapq %r11;
+       bswapq %r10;
+       addq $16, %r11;
+       adcq $0, %r10;
+       bswapq %r11;
+       bswapq %r10;
+       movq %r11, 8(%rcx);
+       movq %r10, (%rcx);
+       jmp .Lctr_byteadd_xmm;
+.align 8
+.Lctr_byteadd:
+       vmovdqu (%rcx), %xmm15;
+       je .Lctr_byteadd_full_ctr_carry;
+       addb $16, 15(%rcx);
+.Lctr_byteadd_xmm:
+       vmovdqa %xmm15, %xmm0;
+       vpaddb .Lbige_addb_1 rRIP, %xmm15, %xmm14;
+       vmovdqu %xmm15, 15 * 16(%rax);
+       vpaddb .Lbige_addb_2 rRIP, %xmm15, %xmm13;
+       vmovdqu %xmm14, 14 * 16(%rax);
+       vpaddb .Lbige_addb_3 rRIP, %xmm15, %xmm12;
+       vmovdqu %xmm13, 13 * 16(%rax);
+       vpaddb .Lbige_addb_4 rRIP, %xmm15, %xmm11;
+       vpaddb .Lbige_addb_5 rRIP, %xmm15, %xmm10;
+       vpaddb .Lbige_addb_6 rRIP, %xmm15, %xmm9;
+       vpaddb .Lbige_addb_7 rRIP, %xmm15, %xmm8;
+       vpaddb .Lbige_addb_8 rRIP, %xmm0, %xmm7;
+       vpaddb .Lbige_addb_9 rRIP, %xmm0, %xmm6;
+       vpaddb .Lbige_addb_10 rRIP, %xmm0, %xmm5;
+       vpaddb .Lbige_addb_11 rRIP, %xmm0, %xmm4;
+       vpaddb .Lbige_addb_12 rRIP, %xmm0, %xmm3;
+       vpaddb .Lbige_addb_13 rRIP, %xmm0, %xmm2;
+       vpaddb .Lbige_addb_14 rRIP, %xmm0, %xmm1;
+       vpaddb .Lbige_addb_15 rRIP, %xmm0, %xmm0;
+
+       jmp .Lload_ctr_done;
+       CFI_ENDPROC();
+ELF(.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;)
+
+.align 16
+.globl _gcry_camellia_aesni_avx_ecb_enc
+ELF(.type   _gcry_camellia_aesni_avx_ecb_enc,@function;)
+
+_gcry_camellia_aesni_avx_ecb_enc:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (16 blocks)
+        *      %rdx: src (16 blocks)
+        */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       vzeroupper;
+
+       cmpl $128, key_bitlength(CTX);
+       movl $32, %r8d;
+       movl $24, %eax;
+       cmovel %eax, %r8d; /* max */
+
+       inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                    %xmm15, %rdx, (key_table)(CTX));
+
+       subq $(16 * 16), %rsp;
+       andq $~31, %rsp;
+       movq %rsp, %rax;
+
+       call __camellia_enc_blk16;
+
+       write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+                    %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+                    %xmm8, %rsi);
+
+       vzeroall;
+
+       leave;
+       CFI_LEAVE();
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_camellia_aesni_avx_ecb_enc,.-_gcry_camellia_aesni_avx_ecb_enc;)
+
+.align 16
+.globl _gcry_camellia_aesni_avx_ecb_dec
+ELF(.type   _gcry_camellia_aesni_avx_ecb_dec,@function;)
+
+_gcry_camellia_aesni_avx_ecb_dec:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (16 blocks)
+        *      %rdx: src (16 blocks)
+        */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       vzeroupper;
+
+       cmpl $128, key_bitlength(CTX);
+       movl $32, %r8d;
+       movl $24, %eax;
+       cmovel %eax, %r8d; /* max */
+
+       inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                    %xmm15, %rdx, (key_table)(CTX, %r8, 8));
+
+       subq $(16 * 16), %rsp;
+       andq $~31, %rsp;
+       movq %rsp, %rax;
+
+       call __camellia_dec_blk16;
+
+       write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+                    %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+                    %xmm8, %rsi);
+
+       vzeroall;
+
+       leave;
+       CFI_LEAVE();
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_camellia_aesni_avx_ecb_dec,.-_gcry_camellia_aesni_avx_ecb_dec;)
+
+.align 16
+.globl _gcry_camellia_aesni_avx_cbc_dec
+ELF(.type   _gcry_camellia_aesni_avx_cbc_dec,@function;)
+
+_gcry_camellia_aesni_avx_cbc_dec:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (16 blocks)
+        *      %rdx: src (16 blocks)
+        *      %rcx: iv
+        */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       vzeroupper;
+
+       movq %rcx, %r9;
+
+       cmpl $128, key_bitlength(CTX);
+       movl $32, %r8d;
+       movl $24, %eax;
+       cmovel %eax, %r8d; /* max */
+
+       inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+                    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+                    %xmm15, %rdx, (key_table)(CTX, %r8, 8));
+
+       subq $(16 * 16), %rsp;
+       andq $~31, %rsp;
+       movq %rsp, %rax;
+
+       call __camellia_dec_blk16;
+
+       /* XOR output with IV */
+       vpxor (%r9), %xmm7, %xmm7;
+       vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
+       vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
+       vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
+       vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
+       vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
+       vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
+       vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
+       vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
+       vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
+       vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
+       vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
+       vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
+       vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
+       vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
+       vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
+       movq (15 * 16 + 0)(%rdx), %r10;
+       movq (15 * 16 + 8)(%rdx), %r11;
+
+       write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+                    %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+                    %xmm8, %rsi);
+
+       /* store new IV */
+       movq %r10, (0)(%r9);
+       movq %r11, (8)(%r9);
+
+       vzeroall;
+
+       leave;
+       CFI_LEAVE();
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;)
+
+.align 16
+.globl _gcry_camellia_aesni_avx_cfb_dec
+ELF(.type   _gcry_camellia_aesni_avx_cfb_dec,@function;)
+
+_gcry_camellia_aesni_avx_cfb_dec:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (16 blocks)
+        *      %rdx: src (16 blocks)
+        *      %rcx: iv
+        */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       vzeroupper;
+
+       cmpl $128, key_bitlength(CTX);
+       movl $32, %r8d;
+       movl $24, %eax;
+       cmovel %eax, %r8d; /* max */
+
+       subq $(16 * 16), %rsp;
+       andq $~31, %rsp;
+       movq %rsp, %rax;
+
+       /* inpack16_pre: */
+       vmovq (key_table)(CTX), %xmm0;
+       vpshufb .Lpack_bswap rRIP, %xmm0, %xmm0;
+       vpxor (%rcx), %xmm0, %xmm15;
+       vmovdqu 15 * 16(%rdx), %xmm1;
+       vmovdqu %xmm1, (%rcx); /* store new IV */
+       vpxor 0 * 16(%rdx), %xmm0, %xmm14;
+       vpxor 1 * 16(%rdx), %xmm0, %xmm13;
+       vpxor 2 * 16(%rdx), %xmm0, %xmm12;
+       vpxor 3 * 16(%rdx), %xmm0, %xmm11;
+       vpxor 4 * 16(%rdx), %xmm0, %xmm10;
+       vpxor 5 * 16(%rdx), %xmm0, %xmm9;
+       vpxor 6 * 16(%rdx), %xmm0, %xmm8;
+       vpxor 7 * 16(%rdx), %xmm0, %xmm7;
+       vpxor 8 * 16(%rdx), %xmm0, %xmm6;
+       vpxor 9 * 16(%rdx), %xmm0, %xmm5;
+       vpxor 10 * 16(%rdx), %xmm0, %xmm4;
+       vpxor 11 * 16(%rdx), %xmm0, %xmm3;
+       vpxor 12 * 16(%rdx), %xmm0, %xmm2;
+       vpxor 13 * 16(%rdx), %xmm0, %xmm1;
+       vpxor 14 * 16(%rdx), %xmm0, %xmm0;
+
+       call __camellia_enc_blk16;
+
+       vpxor 0 * 16(%rdx), %xmm7, %xmm7;
+       vpxor 1 * 16(%rdx), %xmm6, %xmm6;
+       vpxor 2 * 16(%rdx), %xmm5, %xmm5;
+       vpxor 3 * 16(%rdx), %xmm4, %xmm4;
+       vpxor 4 * 16(%rdx), %xmm3, %xmm3;
+       vpxor 5 * 16(%rdx), %xmm2, %xmm2;
+       vpxor 6 * 16(%rdx), %xmm1, %xmm1;
+       vpxor 7 * 16(%rdx), %xmm0, %xmm0;
+       vpxor 8 * 16(%rdx), %xmm15, %xmm15;
+       vpxor 9 * 16(%rdx), %xmm14, %xmm14;
+       vpxor 10 * 16(%rdx), %xmm13, %xmm13;
+       vpxor 11 * 16(%rdx), %xmm12, %xmm12;
+       vpxor 12 * 16(%rdx), %xmm11, %xmm11;
+       vpxor 13 * 16(%rdx), %xmm10, %xmm10;
+       vpxor 14 * 16(%rdx), %xmm9, %xmm9;
+       vpxor 15 * 16(%rdx), %xmm8, %xmm8;
+
+       write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+                    %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+                    %xmm8, %rsi);
+
+       vzeroall;
+
+       leave;
+       CFI_LEAVE();
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;)
+
+.align 16
+.globl _gcry_camellia_aesni_avx_ocb_enc
+ELF(.type   _gcry_camellia_aesni_avx_ocb_enc,@function;)
+
+_gcry_camellia_aesni_avx_ocb_enc:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (16 blocks)
+        *      %rdx: src (16 blocks)
+        *      %rcx: offset
+        *      %r8 : checksum
+        *      %r9 : L pointers (void *L[16])
+        */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       vzeroupper;
+
+       subq $(16 * 16 + 4 * 8), %rsp;
+       andq $~31, %rsp;
+       movq %rsp, %rax;
+
+       movq %r10, (16 * 16 + 0 * 8)(%rsp);
+       movq %r11, (16 * 16 + 1 * 8)(%rsp);
+       movq %r12, (16 * 16 + 2 * 8)(%rsp);
+       movq %r13, (16 * 16 + 3 * 8)(%rsp);
+       CFI_REG_ON_STACK(r10, 16 * 16 + 0 * 8);
+       CFI_REG_ON_STACK(r11, 16 * 16 + 1 * 8);
+       CFI_REG_ON_STACK(r12, 16 * 16 + 2 * 8);
+       CFI_REG_ON_STACK(r13, 16 * 16 + 3 * 8);
+
+       vmovdqu (%rcx), %xmm14;
+       vmovdqu (%r8), %xmm15;
+
+       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+       /* Checksum_i = Checksum_{i-1} xor P_i  */
+       /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+
+#define OCB_INPUT(n, lreg, xreg) \
+         vmovdqu (n * 16)(%rdx), xreg; \
+         vpxor (lreg), %xmm14, %xmm14; \
+         vpxor xreg, %xmm15, %xmm15; \
+         vpxor xreg, %xmm14, xreg; \
+         vmovdqu %xmm14, (n * 16)(%rsi);
+       movq (0 * 8)(%r9), %r10;
+       movq (1 * 8)(%r9), %r11;
+       movq (2 * 8)(%r9), %r12;
+       movq (3 * 8)(%r9), %r13;
+       OCB_INPUT(0, %r10, %xmm0);
+       vmovdqu %xmm0, (15 * 16)(%rax);
+       OCB_INPUT(1, %r11, %xmm0);
+       vmovdqu %xmm0, (14 * 16)(%rax);
+       OCB_INPUT(2, %r12, %xmm13);
+       OCB_INPUT(3, %r13, %xmm12);
+       movq (4 * 8)(%r9), %r10;
+       movq (5 * 8)(%r9), %r11;
+       movq (6 * 8)(%r9), %r12;
+       movq (7 * 8)(%r9), %r13;
+       OCB_INPUT(4, %r10, %xmm11);
+       OCB_INPUT(5, %r11, %xmm10);
+       OCB_INPUT(6, %r12, %xmm9);
+       OCB_INPUT(7, %r13, %xmm8);
+       movq (8 * 8)(%r9), %r10;
+       movq (9 * 8)(%r9), %r11;
+       movq (10 * 8)(%r9), %r12;
+       movq (11 * 8)(%r9), %r13;
+       OCB_INPUT(8, %r10, %xmm7);
+       OCB_INPUT(9, %r11, %xmm6);
+       OCB_INPUT(10, %r12, %xmm5);
+       OCB_INPUT(11, %r13, %xmm4);
+       movq (12 * 8)(%r9), %r10;
+       movq (13 * 8)(%r9), %r11;
+       movq (14 * 8)(%r9), %r12;
+       movq (15 * 8)(%r9), %r13;
+       OCB_INPUT(12, %r10, %xmm3);
+       OCB_INPUT(13, %r11, %xmm2);
+       OCB_INPUT(14, %r12, %xmm1);
+       OCB_INPUT(15, %r13, %xmm0);
+#undef OCB_INPUT
+
+       vmovdqu %xmm14, (%rcx);
+       vmovdqu %xmm15, (%r8);
+
+       cmpl $128, key_bitlength(CTX);
+       movl $32, %r8d;
+       movl $24, %r10d;
+       cmovel %r10d, %r8d; /* max */
+
+       /* inpack16_pre: */
+       vmovq (key_table)(CTX), %xmm15;
+       vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15;
+       vpxor %xmm0, %xmm15, %xmm0;
+       vpxor %xmm1, %xmm15, %xmm1;
+       vpxor %xmm2, %xmm15, %xmm2;
+       vpxor %xmm3, %xmm15, %xmm3;
+       vpxor %xmm4, %xmm15, %xmm4;
+       vpxor %xmm5, %xmm15, %xmm5;
+       vpxor %xmm6, %xmm15, %xmm6;
+       vpxor %xmm7, %xmm15, %xmm7;
+       vpxor %xmm8, %xmm15, %xmm8;
+       vpxor %xmm9, %xmm15, %xmm9;
+       vpxor %xmm10, %xmm15, %xmm10;
+       vpxor %xmm11, %xmm15, %xmm11;
+       vpxor %xmm12, %xmm15, %xmm12;
+       vpxor %xmm13, %xmm15, %xmm13;
+       vpxor 14 * 16(%rax), %xmm15, %xmm14;
+       vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+       call __camellia_enc_blk16;
+
+       vpxor 0 * 16(%rsi), %xmm7, %xmm7;
+       vpxor 1 * 16(%rsi), %xmm6, %xmm6;
+       vpxor 2 * 16(%rsi), %xmm5, %xmm5;
+       vpxor 3 * 16(%rsi), %xmm4, %xmm4;
+       vpxor 4 * 16(%rsi), %xmm3, %xmm3;
+       vpxor 5 * 16(%rsi), %xmm2, %xmm2;
+       vpxor 6 * 16(%rsi), %xmm1, %xmm1;
+       vpxor 7 * 16(%rsi), %xmm0, %xmm0;
+       vpxor 8 * 16(%rsi), %xmm15, %xmm15;
+       vpxor 9 * 16(%rsi), %xmm14, %xmm14;
+       vpxor 10 * 16(%rsi), %xmm13, %xmm13;
+       vpxor 11 * 16(%rsi), %xmm12, %xmm12;
+       vpxor 12 * 16(%rsi), %xmm11, %xmm11;
+       vpxor 13 * 16(%rsi), %xmm10, %xmm10;
+       vpxor 14 * 16(%rsi), %xmm9, %xmm9;
+       vpxor 15 * 16(%rsi), %xmm8, %xmm8;
+
+       write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+                    %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+                    %xmm8, %rsi);
+
+       vzeroall;
+
+       movq (16 * 16 + 0 * 8)(%rsp), %r10;
+       movq (16 * 16 + 1 * 8)(%rsp), %r11;
+       movq (16 * 16 + 2 * 8)(%rsp), %r12;
+       movq (16 * 16 + 3 * 8)(%rsp), %r13;
+       CFI_RESTORE(%r10);
+       CFI_RESTORE(%r11);
+       CFI_RESTORE(%r12);
+       CFI_RESTORE(%r13);
+
+       leave;
+       CFI_LEAVE();
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_camellia_aesni_avx_ocb_enc,.-_gcry_camellia_aesni_avx_ocb_enc;)
+
+.align 16
+.globl _gcry_camellia_aesni_avx_ocb_dec
+ELF(.type   _gcry_camellia_aesni_avx_ocb_dec,@function;)
+
+_gcry_camellia_aesni_avx_ocb_dec:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst (16 blocks)
+        *      %rdx: src (16 blocks)
+        *      %rcx: offset
+        *      %r8 : checksum
+        *      %r9 : L pointers (void *L[16])
+        */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       vzeroupper;
+
+       subq $(16 * 16 + 4 * 8), %rsp;
+       andq $~31, %rsp;
+       movq %rsp, %rax;
+
+       movq %r10, (16 * 16 + 0 * 8)(%rsp);
+       movq %r11, (16 * 16 + 1 * 8)(%rsp);
+       movq %r12, (16 * 16 + 2 * 8)(%rsp);
+       movq %r13, (16 * 16 + 3 * 8)(%rsp);
+       CFI_REG_ON_STACK(r10, 16 * 16 + 0 * 8);
+       CFI_REG_ON_STACK(r11, 16 * 16 + 1 * 8);
+       CFI_REG_ON_STACK(r12, 16 * 16 + 2 * 8);
+       CFI_REG_ON_STACK(r13, 16 * 16 + 3 * 8);
+
+       vmovdqu (%rcx), %xmm15;
+
+       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+       /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+
+#define OCB_INPUT(n, lreg, xreg) \
+         vmovdqu (n * 16)(%rdx), xreg; \
+         vpxor (lreg), %xmm15, %xmm15; \
+         vpxor xreg, %xmm15, xreg; \
+         vmovdqu %xmm15, (n * 16)(%rsi);
+       movq (0 * 8)(%r9), %r10;
+       movq (1 * 8)(%r9), %r11;
+       movq (2 * 8)(%r9), %r12;
+       movq (3 * 8)(%r9), %r13;
+       OCB_INPUT(0, %r10, %xmm0);
+       vmovdqu %xmm0, (15 * 16)(%rax);
+       OCB_INPUT(1, %r11, %xmm14);
+       OCB_INPUT(2, %r12, %xmm13);
+       OCB_INPUT(3, %r13, %xmm12);
+       movq (4 * 8)(%r9), %r10;
+       movq (5 * 8)(%r9), %r11;
+       movq (6 * 8)(%r9), %r12;
+       movq (7 * 8)(%r9), %r13;
+       OCB_INPUT(4, %r10, %xmm11);
+       OCB_INPUT(5, %r11, %xmm10);
+       OCB_INPUT(6, %r12, %xmm9);
+       OCB_INPUT(7, %r13, %xmm8);
+       movq (8 * 8)(%r9), %r10;
+       movq (9 * 8)(%r9), %r11;
+       movq (10 * 8)(%r9), %r12;
+       movq (11 * 8)(%r9), %r13;
+       OCB_INPUT(8, %r10, %xmm7);
+       OCB_INPUT(9, %r11, %xmm6);
+       OCB_INPUT(10, %r12, %xmm5);
+       OCB_INPUT(11, %r13, %xmm4);
+       movq (12 * 8)(%r9), %r10;
+       movq (13 * 8)(%r9), %r11;
+       movq (14 * 8)(%r9), %r12;
+       movq (15 * 8)(%r9), %r13;
+       OCB_INPUT(12, %r10, %xmm3);
+       OCB_INPUT(13, %r11, %xmm2);
+       OCB_INPUT(14, %r12, %xmm1);
+       OCB_INPUT(15, %r13, %xmm0);
+#undef OCB_INPUT
+
+       vmovdqu %xmm15, (%rcx);
+
+       movq %r8, %r10;
+
+       cmpl $128, key_bitlength(CTX);
+       movl $32, %r8d;
+       movl $24, %r9d;
+       cmovel %r9d, %r8d; /* max */
+
+       /* inpack16_pre: */
+       vmovq (key_table)(CTX, %r8, 8), %xmm15;
+       vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15;
+       vpxor %xmm0, %xmm15, %xmm0;
+       vpxor %xmm1, %xmm15, %xmm1;
+       vpxor %xmm2, %xmm15, %xmm2;
+       vpxor %xmm3, %xmm15, %xmm3;
+       vpxor %xmm4, %xmm15, %xmm4;
+       vpxor %xmm5, %xmm15, %xmm5;
+       vpxor %xmm6, %xmm15, %xmm6;
+       vpxor %xmm7, %xmm15, %xmm7;
+       vpxor %xmm8, %xmm15, %xmm8;
+       vpxor %xmm9, %xmm15, %xmm9;
+       vpxor %xmm10, %xmm15, %xmm10;
+       vpxor %xmm11, %xmm15, %xmm11;
+       vpxor %xmm12, %xmm15, %xmm12;
+       vpxor %xmm13, %xmm15, %xmm13;
+       vpxor %xmm14, %xmm15, %xmm14;
+       vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+       call __camellia_dec_blk16;
+
+       vpxor 0 * 16(%rsi), %xmm7, %xmm7;
+       vpxor 1 * 16(%rsi), %xmm6, %xmm6;
+       vpxor 2 * 16(%rsi), %xmm5, %xmm5;
+       vpxor 3 * 16(%rsi), %xmm4, %xmm4;
+       vpxor 4 * 16(%rsi), %xmm3, %xmm3;
+       vpxor 5 * 16(%rsi), %xmm2, %xmm2;
+       vpxor 6 * 16(%rsi), %xmm1, %xmm1;
+       vpxor 7 * 16(%rsi), %xmm0, %xmm0;
+       vmovdqu %xmm7, (7 * 16)(%rax);
+       vpxor 8 * 16(%rsi), %xmm15, %xmm15;
+       vpxor 9 * 16(%rsi), %xmm14, %xmm14;
+       vpxor 10 * 16(%rsi), %xmm13, %xmm13;
+       vpxor 11 * 16(%rsi), %xmm12, %xmm12;
+       vpxor 12 * 16(%rsi), %xmm11, %xmm11;
+       vpxor 13 * 16(%rsi), %xmm10, %xmm10;
+       vpxor 14 * 16(%rsi), %xmm9, %xmm9;
+       vpxor 15 * 16(%rsi), %xmm8, %xmm8;
+
+       /* Checksum_i = Checksum_{i-1} xor P_i  */
+
+       vpxor (%r10), %xmm7, %xmm7;
+       vpxor %xmm6, %xmm7, %xmm7;
+       vpxor %xmm5, %xmm7, %xmm7;
+       vpxor %xmm4, %xmm7, %xmm7;
+       vpxor %xmm3, %xmm7, %xmm7;
+       vpxor %xmm2, %xmm7, %xmm7;
+       vpxor %xmm1, %xmm7, %xmm7;
+       vpxor %xmm0, %xmm7, %xmm7;
+       vpxor %xmm15, %xmm7, %xmm7;
+       vpxor %xmm14, %xmm7, %xmm7;
+       vpxor %xmm13, %xmm7, %xmm7;
+       vpxor %xmm12, %xmm7, %xmm7;
+       vpxor %xmm11, %xmm7, %xmm7;
+       vpxor %xmm10, %xmm7, %xmm7;
+       vpxor %xmm9, %xmm7, %xmm7;
+       vpxor %xmm8, %xmm7, %xmm7;
+       vmovdqu %xmm7, (%r10);
+       vmovdqu (7 * 16)(%rax), %xmm7;
+
+       write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+                    %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+                    %xmm8, %rsi);
+
+       vzeroall;
+
+       movq (16 * 16 + 0 * 8)(%rsp), %r10;
+       movq (16 * 16 + 1 * 8)(%rsp), %r11;
+       movq (16 * 16 + 2 * 8)(%rsp), %r12;
+       movq (16 * 16 + 3 * 8)(%rsp), %r13;
+       CFI_RESTORE(%r10);
+       CFI_RESTORE(%r11);
+       CFI_RESTORE(%r12);
+       CFI_RESTORE(%r13);
+
+       leave;
+       CFI_LEAVE();
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_camellia_aesni_avx_ocb_dec,.-_gcry_camellia_aesni_avx_ocb_dec;)
+
+.align 16
+.globl _gcry_camellia_aesni_avx_ocb_auth
+ELF(.type   _gcry_camellia_aesni_avx_ocb_auth,@function;)
+
+_gcry_camellia_aesni_avx_ocb_auth:
+       /* OCB authentication of 16 blocks of associated data.
+        *
+        * input:
+        *      %rdi: ctx, CTX
+        *      %rsi: abuf (16 blocks)
+        *      %rdx: offset
+        *      %rcx: checksum
+        *      %r8 : L pointers (void *L[16])
+        */
+       CFI_STARTPROC();
+
+       pushq %rbp;
+       CFI_PUSH(%rbp);
+       movq %rsp, %rbp;
+       CFI_DEF_CFA_REGISTER(%rbp);
+
+       vzeroupper;
+
+       /* 16*16 bytes of block scratch plus 4*8 bytes of register spill
+        * space; keep the stack 32-byte aligned. */
+       subq $(16 * 16 + 4 * 8), %rsp;
+       andq $~31, %rsp;
+       movq %rsp, %rax;
+
+       movq %r10, (16 * 16 + 0 * 8)(%rsp);
+       movq %r11, (16 * 16 + 1 * 8)(%rsp);
+       movq %r12, (16 * 16 + 2 * 8)(%rsp);
+       movq %r13, (16 * 16 + 3 * 8)(%rsp);
+       CFI_REG_ON_STACK(r10, 16 * 16 + 0 * 8);
+       CFI_REG_ON_STACK(r11, 16 * 16 + 1 * 8);
+       CFI_REG_ON_STACK(r12, 16 * 16 + 2 * 8);
+       CFI_REG_ON_STACK(r13, 16 * 16 + 3 * 8);
+
+       /* Load current OCB offset block. */
+       vmovdqu (%rdx), %xmm15;
+
+       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+       /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+
+#define OCB_INPUT(n, lreg, xreg) \
+         vmovdqu (n * 16)(%rsi), xreg; \
+         vpxor (lreg), %xmm15, %xmm15; \
+         vpxor xreg, %xmm15, xreg;
+
+       movq (0 * 8)(%r8), %r10;
+       movq (1 * 8)(%r8), %r11;
+       movq (2 * 8)(%r8), %r12;
+       movq (3 * 8)(%r8), %r13;
+       OCB_INPUT(0, %r10, %xmm0);
+       /* Block 0 is spilled so %xmm0 can be reused as the last lane. */
+       vmovdqu %xmm0, (15 * 16)(%rax);
+       OCB_INPUT(1, %r11, %xmm14);
+       OCB_INPUT(2, %r12, %xmm13);
+       OCB_INPUT(3, %r13, %xmm12);
+       movq (4 * 8)(%r8), %r10;
+       movq (5 * 8)(%r8), %r11;
+       movq (6 * 8)(%r8), %r12;
+       movq (7 * 8)(%r8), %r13;
+       OCB_INPUT(4, %r10, %xmm11);
+       OCB_INPUT(5, %r11, %xmm10);
+       OCB_INPUT(6, %r12, %xmm9);
+       OCB_INPUT(7, %r13, %xmm8);
+       movq (8 * 8)(%r8), %r10;
+       movq (9 * 8)(%r8), %r11;
+       movq (10 * 8)(%r8), %r12;
+       movq (11 * 8)(%r8), %r13;
+       OCB_INPUT(8, %r10, %xmm7);
+       OCB_INPUT(9, %r11, %xmm6);
+       OCB_INPUT(10, %r12, %xmm5);
+       OCB_INPUT(11, %r13, %xmm4);
+       movq (12 * 8)(%r8), %r10;
+       movq (13 * 8)(%r8), %r11;
+       movq (14 * 8)(%r8), %r12;
+       movq (15 * 8)(%r8), %r13;
+       OCB_INPUT(12, %r10, %xmm3);
+       OCB_INPUT(13, %r11, %xmm2);
+       OCB_INPUT(14, %r12, %xmm1);
+       OCB_INPUT(15, %r13, %xmm0);
+#undef OCB_INPUT
+
+       /* %r8d = 24 if key is 128-bit, else 32 (NOTE(review): presumably
+        * the subkey count consumed by __camellia_enc_blk16 — confirm). */
+       cmpl $128, key_bitlength(CTX);
+       movl $32, %r8d;
+       movl $24, %r10d;
+       cmovel %r10d, %r8d; /* max */
+
+       /* Store updated offset back for the caller. */
+       vmovdqu %xmm15, (%rdx);
+
+       /* Keep the checksum pointer across the encryption call. */
+       movq %rcx, %r10;
+
+       /* inpack16_pre: */
+       vmovq (key_table)(CTX), %xmm15;
+       vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15;
+       vpxor %xmm0, %xmm15, %xmm0;
+       vpxor %xmm1, %xmm15, %xmm1;
+       vpxor %xmm2, %xmm15, %xmm2;
+       vpxor %xmm3, %xmm15, %xmm3;
+       vpxor %xmm4, %xmm15, %xmm4;
+       vpxor %xmm5, %xmm15, %xmm5;
+       vpxor %xmm6, %xmm15, %xmm6;
+       vpxor %xmm7, %xmm15, %xmm7;
+       vpxor %xmm8, %xmm15, %xmm8;
+       vpxor %xmm9, %xmm15, %xmm9;
+       vpxor %xmm10, %xmm15, %xmm10;
+       vpxor %xmm11, %xmm15, %xmm11;
+       vpxor %xmm12, %xmm15, %xmm12;
+       vpxor %xmm13, %xmm15, %xmm13;
+       vpxor %xmm14, %xmm15, %xmm14;
+       vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+       call __camellia_enc_blk16;
+
+       /* XOR-fold the 16 cipher outputs down to a single block
+        * (binary tree of vpxor). */
+       vpxor %xmm7, %xmm6, %xmm6;
+       vpxor %xmm5, %xmm4, %xmm4;
+       vpxor %xmm3, %xmm2, %xmm2;
+       vpxor %xmm1, %xmm0, %xmm0;
+       vpxor %xmm15, %xmm14, %xmm14;
+       vpxor %xmm13, %xmm12, %xmm12;
+       vpxor %xmm11, %xmm10, %xmm10;
+       vpxor %xmm9, %xmm8, %xmm8;
+
+       vpxor %xmm6, %xmm4, %xmm4;
+       vpxor %xmm2, %xmm0, %xmm0;
+       vpxor %xmm14, %xmm12, %xmm12;
+       vpxor %xmm10, %xmm8, %xmm8;
+
+       vpxor %xmm4, %xmm0, %xmm0;
+       vpxor %xmm12, %xmm8, %xmm8;
+
+       vpxor %xmm0, %xmm8, %xmm0;
+       /* Fold into the caller's checksum block. */
+       vpxor (%r10), %xmm0, %xmm0;
+       vmovdqu %xmm0, (%r10);
+
+       /* Clear all vector registers (they held key-derived data). */
+       vzeroall;
+
+       movq (16 * 16 + 0 * 8)(%rsp), %r10;
+       movq (16 * 16 + 1 * 8)(%rsp), %r11;
+       movq (16 * 16 + 2 * 8)(%rsp), %r12;
+       movq (16 * 16 + 3 * 8)(%rsp), %r13;
+       CFI_RESTORE(%r10);
+       CFI_RESTORE(%r11);
+       CFI_RESTORE(%r12);
+       CFI_RESTORE(%r13);
+
+       leave;
+       CFI_LEAVE();
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth;)
+
+/*
+ * camellia_f: one Camellia F-function evaluation, with the s-box layer
+ * computed through AES-NI (vaesenclast = AES SubBytes) sandwiched
+ * between pre-/post-filter affine transforms (filter_8bit), plus the
+ * explicit byte rotations Camellia requires for sbox2/sbox3/sbox4.
+ *
+ * IN:
+ *  ab: 64-bit AB state
+ *  cd: 64-bit CD state
+ */
+#define camellia_f(ab, x, t0, t1, t2, t3, t4, inv_shift_row, sbox4mask, \
+                  _0f0f0f0fmask, pre_s1lo_mask, pre_s1hi_mask, key) \
+       vmovq key, t0; \
+       vpxor x, x, t3; /* t3 = 0, used as round key for vaesenclast */ \
+       \
+       vpxor ab, t0, x; /* key XOR (start of F-function) */ \
+       \
+       /* \
+        * S-function with AES subbytes \
+        */ \
+       \
+       /* input rotation for sbox4 (<<< 1) */ \
+       vpand x, sbox4mask, t0; \
+       vpandn x, sbox4mask, x; \
+       vpaddw t0, t0, t1; \
+       vpsrlw $7, t0, t0; \
+       vpor t0, t1, t0; \
+       vpand sbox4mask, t0, t0; \
+       vpor t0, x, x; \
+       \
+       vmovdqa .Lpost_tf_lo_s1 rRIP, t0; \
+       vmovdqa .Lpost_tf_hi_s1 rRIP, t1; \
+       \
+       /* prefilter sboxes */ \
+       filter_8bit(x, pre_s1lo_mask, pre_s1hi_mask, _0f0f0f0fmask, t2); \
+       \
+       /* AES subbytes + AES shift rows + AES inv shift rows */ \
+       vaesenclast t3, x, x; \
+       \
+       /* postfilter sboxes */ \
+       filter_8bit(x, t0, t1, _0f0f0f0fmask, t2); \
+       \
+       /* output rotation for sbox2 (<<< 1) */ \
+       /* output rotation for sbox3 (>>> 1) */ \
+       vpshufb inv_shift_row, x, t1; \
+       vpshufb .Lsp0044440444044404mask rRIP, x, t4; \
+       vpshufb .Lsp1110111010011110mask rRIP, x, x; \
+       vpaddb t1, t1, t2; \
+       vpsrlw $7, t1, t0; \
+       vpsllw $7, t1, t3; \
+       vpor t0, t2, t0; \
+       vpsrlw $1, t1, t1; \
+       vpshufb .Lsp0222022222000222mask rRIP, t0, t0; \
+       vpor t1, t3, t1; \
+       \
+       vpxor x, t4, t4; \
+       vpshufb .Lsp3033303303303033mask rRIP, t1, t1; \
+       vpxor t4, t0, t0; \
+       vpxor t1, t0, t0; \
+       /* fold high qword into low qword (P-function output) */ \
+       vpsrldq $8, t0, x; \
+       vpxor t0, x, x;
+
+/* Rotate the 128-bit value `in` left by nrol bits (0 < nrol < 64),
+ * result in `out`; `t0` is a scratch register.  Implemented as a
+ * 64-bit halves swap (vpshufd $0x4e) combined with per-qword shifts. */
+#define vec_rol128(in, out, nrol, t0) \
+       vpshufd $0x4e, in, out; \
+       vpsllq $(nrol), in, t0; \
+       vpsrlq $(64-(nrol)), out, out; \
+       vpaddd t0, out, out;
+
+/* Rotate the 128-bit value `in` right by nror bits (0 < nror < 64),
+ * result in `out`; `t0` is a scratch register.  Mirror of vec_rol128. */
+#define vec_ror128(in, out, nror, t0) \
+       vpshufd $0x4e, in, out; \
+       vpsrlq $(nror), in, t0; \
+       vpsllq $(64-(nror)), out, out; \
+       vpaddd t0, out, out;
+
+SECTION_RODATA
+
+/* Read-only tables used only by the key-setup routines below. */
+ELF(.type _camellia_aesni_avx_keysetup_data,@object;)
+_camellia_aesni_avx_keysetup_data:
+
+.align 16
+/* vpshufb mask: AES inverse-ShiftRows byte order interleaved with 0xff
+ * bytes (an index with the high bit set makes vpshufb emit zero). */
+.Linv_shift_row_and_unpcklbw:
+       .byte 0x00, 0xff, 0x0d, 0xff, 0x0a, 0xff, 0x07, 0xff
+       .byte 0x04, 0xff, 0x01, 0xff, 0x0e, 0xff, 0x0b, 0xff
+/* Byte-shuffle masks for the camellia_f output permutation (P-function);
+ * the label digits name the s-box pattern per output byte. */
+.Lsp0044440444044404mask:
+       .long 0xffff0404, 0x0404ff04;
+       .long 0x0d0dff0d, 0x0d0dff0d;
+.Lsp1110111010011110mask:
+       .long 0x000000ff, 0x000000ff;
+       .long 0x0bffff0b, 0x0b0b0bff;
+.Lsp0222022222000222mask:
+       .long 0xff060606, 0xff060606;
+       .long 0x0c0cffff, 0xff0c0c0c;
+.Lsp3033303303303033mask:
+       .long 0x04ff0404, 0x04ff0404;
+       .long 0xff0a0aff, 0x0aff0a0a;
+/* Selects the bytes that go through sbox4's input rotation in camellia_f. */
+.Lsbox4_input_mask:
+       .byte 0x00, 0xff, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00;
+/* Camellia key-schedule constants sigma1..sigma6, stored as
+ * little-endian 32-bit halves. */
+.Lsigma1:
+       .long 0x3BCC908B, 0xA09E667F;
+.Lsigma2:
+       .long 0x4CAA73B2, 0xB67AE858;
+.Lsigma3:
+       .long 0xE94F82BE, 0xC6EF372F;
+.Lsigma4:
+       .long 0xF1D36F1C, 0x54FF53A5;
+.Lsigma5:
+       .long 0xDE682D1D, 0x10E527FA;
+.Lsigma6:
+       .long 0xB3E6C1FD, 0xB05688C2;
+
+.text
+
+.align 16
+ELF(.type  __camellia_avx_setup128,@function;)
+__camellia_avx_setup128:
+       /* Camellia-128 key schedule, vectorized with AVX/AES-NI.
+        * Derives KA from KL, expands both into the subkey table, then
+        * applies the absorb/fix-up steps of the reference key schedule.
+        *
+        * input:
+        *      %rdi: ctx, CTX; subkey storage at key_table(CTX)
+        *      %xmm0: key
+        */
+       CFI_STARTPROC();
+
+/* 64-bit subkey slot n inside key_table. */
+#define cmll_sub(n, ctx) (key_table+((n)*8))(ctx)
+#define KL128 %xmm0
+#define KA128 %xmm2
+
+       /* Byte-swap the key into big-endian word order. */
+       vpshufb .Lbswap128_mask rRIP, KL128, KL128;
+
+       /* Load the constants camellia_f expects in fixed registers. */
+       vmovdqa .Linv_shift_row_and_unpcklbw rRIP, %xmm11;
+       vmovq .Lsbox4_input_mask rRIP, %xmm12;
+       vbroadcastss .L0f0f0f0f rRIP, %xmm13;
+       vmovdqa .Lpre_tf_lo_s1 rRIP, %xmm14;
+       vmovdqa .Lpre_tf_hi_s1 rRIP, %xmm15;
+
+       /*
+        * Generate KA: four Feistel rounds over KL with sigma1..sigma4.
+        */
+       vpsrldq $8, KL128, %xmm2;
+       vmovdqa KL128, %xmm3;
+       vpslldq $8, %xmm3, %xmm3;
+       vpsrldq $8, %xmm3, %xmm3;
+
+       camellia_f(%xmm2, %xmm4, %xmm1,
+                  %xmm5, %xmm6, %xmm7, %xmm8,
+                  %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 rRIP);
+       vpxor %xmm4, %xmm3, %xmm3;
+       camellia_f(%xmm3, %xmm2, %xmm1,
+                  %xmm5, %xmm6, %xmm7, %xmm8,
+                  %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 rRIP);
+       camellia_f(%xmm2, %xmm3, %xmm1,
+                  %xmm5, %xmm6, %xmm7, %xmm8,
+                  %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 rRIP);
+       vpxor %xmm4, %xmm3, %xmm3;
+       camellia_f(%xmm3, %xmm4, %xmm1,
+                  %xmm5, %xmm6, %xmm7, %xmm8,
+                  %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 rRIP);
+
+       /* Pack the two 64-bit halves into KA128. */
+       vpslldq $8, %xmm3, %xmm3;
+       vpxor %xmm4, %xmm2, %xmm2;
+       vpsrldq $8, %xmm3, %xmm3;
+       vpslldq $8, %xmm2, KA128;
+       vpor %xmm3, KA128, KA128;
+
+        /*
+         * Generate subkeys: rotations of KL/KA per the Camellia spec.
+         */
+       vmovdqu KA128, cmll_sub(24, CTX);
+       vec_rol128(KL128, %xmm3, 15, %xmm15);
+       vec_rol128(KA128, %xmm4, 15, %xmm15);
+       vec_rol128(KA128, %xmm5, 30, %xmm15);
+       vec_rol128(KL128, %xmm6, 45, %xmm15);
+       vec_rol128(KA128, %xmm7, 45, %xmm15);
+       vec_rol128(KL128, %xmm8, 60, %xmm15);
+       vec_rol128(KA128, %xmm9, 60, %xmm15);
+       vec_ror128(KL128, %xmm10, 128-77, %xmm15);
+
+       /* absorb kw2 to other subkeys */
+       vpslldq $8, KL128, %xmm15;
+       vpsrldq $8, %xmm15, %xmm15;
+       vpxor %xmm15, KA128, KA128;
+       vpxor %xmm15, %xmm3, %xmm3;
+       vpxor %xmm15, %xmm4, %xmm4;
+
+       /* subl(1) ^= subr(1) & ~subr(9); */
+       vpandn %xmm15, %xmm5, %xmm13;
+       vpslldq $12, %xmm13, %xmm13;
+       vpsrldq $8, %xmm13, %xmm13;
+       vpxor %xmm13, %xmm15, %xmm15;
+       /* dw = subl(1) & subl(9), subr(1) ^= CAMELLIA_RL1(dw); */
+       vpand %xmm15, %xmm5, %xmm14;
+       vpslld $1, %xmm14, %xmm11;
+       vpsrld $31, %xmm14, %xmm14;
+       vpaddd %xmm11, %xmm14, %xmm14;
+       vpslldq $8, %xmm14, %xmm14;
+       vpsrldq $12, %xmm14, %xmm14;
+       vpxor %xmm14, %xmm15, %xmm15;
+
+       vpxor %xmm15, %xmm6, %xmm6;
+       vpxor %xmm15, %xmm8, %xmm8;
+       vpxor %xmm15, %xmm9, %xmm9;
+
+       /* subl(1) ^= subr(1) & ~subr(17); */
+       vpandn %xmm15, %xmm10, %xmm13;
+       vpslldq $12, %xmm13, %xmm13;
+       vpsrldq $8, %xmm13, %xmm13;
+       vpxor %xmm13, %xmm15, %xmm15;
+       /* dw = subl(1) & subl(17), subr(1) ^= CAMELLIA_RL1(dw); */
+       vpand %xmm15, %xmm10, %xmm14;
+       vpslld $1, %xmm14, %xmm11;
+       vpsrld $31, %xmm14, %xmm14;
+       vpaddd %xmm11, %xmm14, %xmm14;
+       vpslldq $8, %xmm14, %xmm14;
+       vpsrldq $12, %xmm14, %xmm14;
+       vpxor %xmm14, %xmm15, %xmm15;
+
+       /* Reverse dword order (0x1b) before storing to the table. */
+       vpshufd $0x1b, KL128, KL128;
+       vpshufd $0x1b, KA128, KA128;
+       vpshufd $0x1b, %xmm3, %xmm3;
+       vpshufd $0x1b, %xmm4, %xmm4;
+       vpshufd $0x1b, %xmm5, %xmm5;
+       vpshufd $0x1b, %xmm6, %xmm6;
+       vpshufd $0x1b, %xmm7, %xmm7;
+       vpshufd $0x1b, %xmm8, %xmm8;
+       vpshufd $0x1b, %xmm9, %xmm9;
+
+       vmovdqu KL128, cmll_sub(0, CTX);
+       vpshufd $0x1b, KL128, KL128;
+       vmovdqu KA128, cmll_sub(2, CTX);
+       vmovdqu %xmm3, cmll_sub(4, CTX);
+       vmovdqu %xmm4, cmll_sub(6, CTX);
+       vmovdqu %xmm5, cmll_sub(8, CTX);
+       vmovdqu %xmm6, cmll_sub(10, CTX);
+       vpsrldq $8, %xmm8, %xmm8;
+       vmovq %xmm7, cmll_sub(12, CTX);
+       vmovq %xmm8, cmll_sub(13, CTX);
+       vmovdqu %xmm9, cmll_sub(14, CTX);
+       vmovdqu %xmm10, cmll_sub(16, CTX);
+
+       vmovdqu cmll_sub(24, CTX), KA128;
+
+       vec_ror128(KL128, %xmm3, 128 - 94, %xmm7);
+       vec_ror128(KA128, %xmm4, 128 - 94, %xmm7);
+       vec_ror128(KL128, %xmm5, 128 - 111, %xmm7);
+       vec_ror128(KA128, %xmm6, 128 - 111, %xmm7);
+
+       vpxor %xmm15, %xmm3, %xmm3;
+       vpxor %xmm15, %xmm4, %xmm4;
+       vpxor %xmm15, %xmm5, %xmm5;
+       vpslldq $8, %xmm15, %xmm15;
+       vpxor %xmm15, %xmm6, %xmm6;
+
+       /* absorb kw4 to other subkeys */
+       vpslldq $8, %xmm6, %xmm15;
+       vpxor %xmm15, %xmm5, %xmm5;
+       vpxor %xmm15, %xmm4, %xmm4;
+       vpxor %xmm15, %xmm3, %xmm3;
+
+       /* subl(25) ^= subr(25) & ~subr(16); */
+       vpshufd $0x1b, cmll_sub(16, CTX), %xmm10;
+       vpandn %xmm15, %xmm10, %xmm13;
+       vpslldq $4, %xmm13, %xmm13;
+       vpxor %xmm13, %xmm15, %xmm15;
+       /* dw = subl(25) & subl(16), subr(25) ^= CAMELLIA_RL1(dw); */
+       vpand %xmm15, %xmm10, %xmm14;
+       vpslld $1, %xmm14, %xmm11;
+       vpsrld $31, %xmm14, %xmm14;
+       vpaddd %xmm11, %xmm14, %xmm14;
+       vpsrldq $12, %xmm14, %xmm14;
+       vpslldq $8, %xmm14, %xmm14;
+       vpxor %xmm14, %xmm15, %xmm15;
+
+       vpshufd $0x1b, %xmm3, %xmm3;
+       vpshufd $0x1b, %xmm4, %xmm4;
+       vpshufd $0x1b, %xmm5, %xmm5;
+       vpshufd $0x1b, %xmm6, %xmm6;
+
+       vmovdqu %xmm3, cmll_sub(18, CTX);
+       vmovdqu %xmm4, cmll_sub(20, CTX);
+       vmovdqu %xmm5, cmll_sub(22, CTX);
+       vmovdqu %xmm6, cmll_sub(24, CTX);
+
+       /* Re-load stored subkeys for the downward fix-up pass. */
+       vpshufd $0x1b, cmll_sub(14, CTX), %xmm3;
+       vpshufd $0x1b, cmll_sub(12, CTX), %xmm4;
+       vpshufd $0x1b, cmll_sub(10, CTX), %xmm5;
+       vpshufd $0x1b, cmll_sub(8, CTX), %xmm6;
+
+       vpxor %xmm15, %xmm3, %xmm3;
+       vpxor %xmm15, %xmm4, %xmm4;
+       vpxor %xmm15, %xmm5, %xmm5;
+
+       /* subl(25) ^= subr(25) & ~subr(8); */
+       vpandn %xmm15, %xmm6, %xmm13;
+       vpslldq $4, %xmm13, %xmm13;
+       vpxor %xmm13, %xmm15, %xmm15;
+       /* dw = subl(25) & subl(8), subr(25) ^= CAMELLIA_RL1(dw); */
+       vpand %xmm15, %xmm6, %xmm14;
+       vpslld $1, %xmm14, %xmm11;
+       vpsrld $31, %xmm14, %xmm14;
+       vpaddd %xmm11, %xmm14, %xmm14;
+       vpsrldq $12, %xmm14, %xmm14;
+       vpslldq $8, %xmm14, %xmm14;
+       vpxor %xmm14, %xmm15, %xmm15;
+
+       vpshufd $0x1b, %xmm3, %xmm3;
+       vpshufd $0x1b, %xmm4, %xmm4;
+       vpshufd $0x1b, %xmm5, %xmm5;
+
+       vmovdqu %xmm3, cmll_sub(14, CTX);
+       vmovdqu %xmm4, cmll_sub(12, CTX);
+       vmovdqu %xmm5, cmll_sub(10, CTX);
+
+       vpshufd $0x1b, cmll_sub(6, CTX), %xmm6;
+       vpshufd $0x1b, cmll_sub(4, CTX), %xmm4;
+       vpshufd $0x1b, cmll_sub(2, CTX), %xmm2;
+       vpshufd $0x1b, cmll_sub(0, CTX), %xmm0;
+
+       vpxor %xmm15, %xmm6, %xmm6;
+       vpxor %xmm15, %xmm4, %xmm4;
+       vpxor %xmm15, %xmm2, %xmm2;
+       vpxor %xmm15, %xmm0, %xmm0;
+
+       vpshufd $0x1b, %xmm6, %xmm6;
+       vpshufd $0x1b, %xmm4, %xmm4;
+       vpshufd $0x1b, %xmm2, %xmm2;
+       vpshufd $0x1b, %xmm0, %xmm0;
+
+       vpsrldq $8, %xmm2, %xmm3;
+       vpsrldq $8, %xmm4, %xmm5;
+       vpsrldq $8, %xmm6, %xmm7;
+
+        /*
+        * key XOR is end of F-function.
+        */
+       vpxor %xmm2, %xmm0, %xmm0;
+       vpxor %xmm4, %xmm2, %xmm2;
+
+       vmovq %xmm0, cmll_sub(0, CTX);
+       vmovq %xmm3, cmll_sub(2, CTX);
+       vpxor %xmm5, %xmm3, %xmm3;
+       vpxor %xmm6, %xmm4, %xmm4;
+       vpxor %xmm7, %xmm5, %xmm5;
+       vmovq %xmm2, cmll_sub(3, CTX);
+       vmovq %xmm3, cmll_sub(4, CTX);
+       vmovq %xmm4, cmll_sub(5, CTX);
+       vmovq %xmm5, cmll_sub(6, CTX);
+
+       vmovq cmll_sub(7, CTX), %xmm7;
+       vmovq cmll_sub(8, CTX), %xmm8;
+       vmovq cmll_sub(9, CTX), %xmm9;
+       vmovq cmll_sub(10, CTX), %xmm10;
+       /* tl = subl(10) ^ (subr(10) & ~subr(8)); */
+       vpandn %xmm10, %xmm8, %xmm15;
+       vpsrldq $4, %xmm15, %xmm15;
+       vpxor %xmm15, %xmm10, %xmm0;
+       /* dw = tl & subl(8), tr = subr(10) ^ CAMELLIA_RL1(dw); */
+       vpand %xmm8, %xmm0, %xmm15;
+       vpslld $1, %xmm15, %xmm14;
+       vpsrld $31, %xmm15, %xmm15;
+       vpaddd %xmm14, %xmm15, %xmm15;
+       vpslldq $12, %xmm15, %xmm15;
+       vpsrldq $8, %xmm15, %xmm15;
+       vpxor %xmm15, %xmm0, %xmm0;
+
+       vpxor %xmm0, %xmm6, %xmm6;
+       vmovq %xmm6, cmll_sub(7, CTX);
+
+       vmovq cmll_sub(11, CTX), %xmm11;
+       vmovq cmll_sub(12, CTX), %xmm12;
+       vmovq cmll_sub(13, CTX), %xmm13;
+       vmovq cmll_sub(14, CTX), %xmm14;
+       vmovq cmll_sub(15, CTX), %xmm15;
+       /* tl = subl(7) ^ (subr(7) & ~subr(9)); */
+       vpandn %xmm7, %xmm9, %xmm1;
+       vpsrldq $4, %xmm1, %xmm1;
+       vpxor %xmm1, %xmm7, %xmm0;
+       /* dw = tl & subl(9), tr = subr(7) ^ CAMELLIA_RL1(dw); */
+       vpand %xmm9, %xmm0, %xmm1;
+       vpslld $1, %xmm1, %xmm2;
+       vpsrld $31, %xmm1, %xmm1;
+       vpaddd %xmm2, %xmm1, %xmm1;
+       vpslldq $12, %xmm1, %xmm1;
+       vpsrldq $8, %xmm1, %xmm1;
+       vpxor %xmm1, %xmm0, %xmm0;
+
+       /* Slide subkeys 11..15 down one slot, XORing neighbours. */
+       vpxor %xmm11, %xmm0, %xmm0;
+       vpxor %xmm12, %xmm10, %xmm10;
+       vpxor %xmm13, %xmm11, %xmm11;
+       vpxor %xmm14, %xmm12, %xmm12;
+       vpxor %xmm15, %xmm13, %xmm13;
+       vmovq %xmm0, cmll_sub(10, CTX);
+       vmovq %xmm10, cmll_sub(11, CTX);
+       vmovq %xmm11, cmll_sub(12, CTX);
+       vmovq %xmm12, cmll_sub(13, CTX);
+       vmovq %xmm13, cmll_sub(14, CTX);
+
+       vmovq cmll_sub(16, CTX), %xmm6;
+       vmovq cmll_sub(17, CTX), %xmm7;
+       vmovq cmll_sub(18, CTX), %xmm8;
+       vmovq cmll_sub(19, CTX), %xmm9;
+       vmovq cmll_sub(20, CTX), %xmm10;
+       /* tl = subl(18) ^ (subr(18) & ~subr(16)); */
+       vpandn %xmm8, %xmm6, %xmm1;
+       vpsrldq $4, %xmm1, %xmm1;
+       vpxor %xmm1, %xmm8, %xmm0;
+       /* dw = tl & subl(16), tr = subr(18) ^ CAMELLIA_RL1(dw); */
+       vpand %xmm6, %xmm0, %xmm1;
+       vpslld $1, %xmm1, %xmm2;
+       vpsrld $31, %xmm1, %xmm1;
+       vpaddd %xmm2, %xmm1, %xmm1;
+       vpslldq $12, %xmm1, %xmm1;
+       vpsrldq $8, %xmm1, %xmm1;
+       vpxor %xmm1, %xmm0, %xmm0;
+
+       vpxor %xmm14, %xmm0, %xmm0;
+       vmovq %xmm0, cmll_sub(15, CTX);
+
+       /* tl = subl(15) ^ (subr(15) & ~subr(17)); */
+       vpandn %xmm15, %xmm7, %xmm1;
+       vpsrldq $4, %xmm1, %xmm1;
+       vpxor %xmm1, %xmm15, %xmm0;
+       /* dw = tl & subl(17), tr = subr(15) ^ CAMELLIA_RL1(dw); */
+       vpand %xmm7, %xmm0, %xmm1;
+       vpslld $1, %xmm1, %xmm2;
+       vpsrld $31, %xmm1, %xmm1;
+       vpaddd %xmm2, %xmm1, %xmm1;
+       vpslldq $12, %xmm1, %xmm1;
+       vpsrldq $8, %xmm1, %xmm1;
+       vpxor %xmm1, %xmm0, %xmm0;
+
+       vmovq cmll_sub(21, CTX), %xmm1;
+       vmovq cmll_sub(22, CTX), %xmm2;
+       vmovq cmll_sub(23, CTX), %xmm3;
+       vmovq cmll_sub(24, CTX), %xmm4;
+
+       vpxor %xmm9, %xmm0, %xmm0;
+       vpxor %xmm10, %xmm8, %xmm8;
+       vpxor %xmm1, %xmm9, %xmm9;
+       vpxor %xmm2, %xmm10, %xmm10;
+       vpxor %xmm3, %xmm1, %xmm1;
+       vpxor %xmm4, %xmm3, %xmm3;
+
+       vmovq %xmm0, cmll_sub(18, CTX);
+       vmovq %xmm8, cmll_sub(19, CTX);
+       vmovq %xmm9, cmll_sub(20, CTX);
+       vmovq %xmm10, cmll_sub(21, CTX);
+       vmovq %xmm1, cmll_sub(22, CTX);
+       vmovq %xmm2, cmll_sub(23, CTX);
+       vmovq %xmm3, cmll_sub(24, CTX);
+
+       /* kw2 and kw4 are unused now. */
+       movq $0, cmll_sub(1, CTX);
+       movq $0, cmll_sub(25, CTX);
+
+       /* Wipe all vector registers: they held key material. */
+       vzeroall;
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __camellia_avx_setup128,.-__camellia_avx_setup128;)
+
+.align 16
+ELF(.type  __camellia_avx_setup256,@function;)
+
+__camellia_avx_setup256:
+       /* input:
+        *      %rdi: ctx, CTX; subkey storage at key_table(CTX)
+        *      %xmm0 & %xmm1: key
+        */
+       CFI_STARTPROC();
+
+#define KL128 %xmm0
+#define KR128 %xmm1
+#define KA128 %xmm2
+#define KB128 %xmm3
+
+       vpshufb .Lbswap128_mask rRIP, KL128, KL128;
+       vpshufb .Lbswap128_mask rRIP, KR128, KR128;
+
+       vmovdqa .Linv_shift_row_and_unpcklbw rRIP, %xmm11;
+       vmovq .Lsbox4_input_mask rRIP, %xmm12;
+       vbroadcastss .L0f0f0f0f rRIP, %xmm13;
+       vmovdqa .Lpre_tf_lo_s1 rRIP, %xmm14;
+       vmovdqa .Lpre_tf_hi_s1 rRIP, %xmm15;
+
+       /*
+        * Generate KA
+        */
+       vpxor KL128, KR128, %xmm3;
+       vpsrldq $8, KR128, %xmm6;
+       vpsrldq $8, %xmm3, %xmm2;
+       vpslldq $8, %xmm3, %xmm3;
+       vpsrldq $8, %xmm3, %xmm3;
+
+       camellia_f(%xmm2, %xmm4, %xmm5,
+                  %xmm7, %xmm8, %xmm9, %xmm10,
+                  %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 rRIP);
+       vpxor %xmm4, %xmm3, %xmm3;
+       camellia_f(%xmm3, %xmm2, %xmm5,
+                  %xmm7, %xmm8, %xmm9, %xmm10,
+                  %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 rRIP);
+       vpxor %xmm6, %xmm2, %xmm2;
+       camellia_f(%xmm2, %xmm3, %xmm5,
+                  %xmm7, %xmm8, %xmm9, %xmm10,
+                  %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 rRIP);
+       vpxor %xmm4, %xmm3, %xmm3;
+       vpxor KR128, %xmm3, %xmm3;
+       camellia_f(%xmm3, %xmm4, %xmm5,
+                  %xmm7, %xmm8, %xmm9, %xmm10,
+                  %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 rRIP);
+
+       vpslldq $8, %xmm3, %xmm3;
+       vpxor %xmm4, %xmm2, %xmm2;
+       vpsrldq $8, %xmm3, %xmm3;
+       vpslldq $8, %xmm2, KA128;
+       vpor %xmm3, KA128, KA128;
+
+       /*
+        * Generate KB
+        */
+       vpxor KA128, KR128, %xmm3;
+       vpsrldq $8, %xmm3, %xmm4;
+       vpslldq $8, %xmm3, %xmm3;
+       vpsrldq $8, %xmm3, %xmm3;
+
+       camellia_f(%xmm4, %xmm5, %xmm6,
+                  %xmm7, %xmm8, %xmm9, %xmm10,
+                  %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma5 rRIP);
+       vpxor %xmm5, %xmm3, %xmm3;
+
+       camellia_f(%xmm3, %xmm5, %xmm6,
+                  %xmm7, %xmm8, %xmm9, %xmm10,
+                  %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma6 rRIP);
+       vpslldq $8, %xmm3, %xmm3;
+       vpxor %xmm5, %xmm4, %xmm4;
+       vpsrldq $8, %xmm3, %xmm3;
+       vpslldq $8, %xmm4, %xmm4;
+       vpor %xmm3, %xmm4, KB128;
+
+        /*
+         * Generate subkeys
+         */
+       vmovdqu KB128, cmll_sub(32, CTX);
+       vec_rol128(KR128, %xmm4, 15, %xmm15);
+       vec_rol128(KA128, %xmm5, 15, %xmm15);
+       vec_rol128(KR128, %xmm6, 30, %xmm15);
+       vec_rol128(KB128, %xmm7, 30, %xmm15);
+       vec_rol128(KL128, %xmm8, 45, %xmm15);
+       vec_rol128(KA128, %xmm9, 45, %xmm15);
+       vec_rol128(KL128, %xmm10, 60, %xmm15);
+       vec_rol128(KR128, %xmm11, 60, %xmm15);
+       vec_rol128(KB128, %xmm12, 60, %xmm15);
+
+       /* absorb kw2 to other subkeys */
+       vpslldq $8, KL128, %xmm15;
+       vpsrldq $8, %xmm15, %xmm15;
+       vpxor %xmm15, KB128, KB128;
+       vpxor %xmm15, %xmm4, %xmm4;
+       vpxor %xmm15, %xmm5, %xmm5;
+
+       /* subl(1) ^= subr(1) & ~subr(9); */
+       vpandn %xmm15, %xmm6, %xmm13;
+       vpslldq $12, %xmm13, %xmm13;
+       vpsrldq $8, %xmm13, %xmm13;
+       vpxor %xmm13, %xmm15, %xmm15;
+       /* dw = subl(1) & subl(9), subr(1) ^= CAMELLIA_RL1(dw); */
+       vpand %xmm15, %xmm6, %xmm14;
+       vpslld $1, %xmm14, %xmm13;
+       vpsrld $31, %xmm14, %xmm14;
+       vpaddd %xmm13, %xmm14, %xmm14;
+       vpslldq $8, %xmm14, %xmm14;
+       vpsrldq $12, %xmm14, %xmm14;
+       vpxor %xmm14, %xmm15, %xmm15;
+
+       vpxor %xmm15, %xmm7, %xmm7;
+       vpxor %xmm15, %xmm8, %xmm8;
+       vpxor %xmm15, %xmm9, %xmm9;
+
+       vpshufd $0x1b, KL128, KL128;
+       vpshufd $0x1b, KB128, KB128;
+       vpshufd $0x1b, %xmm4, %xmm4;
+       vpshufd $0x1b, %xmm5, %xmm5;
+       vpshufd $0x1b, %xmm6, %xmm6;
+       vpshufd $0x1b, %xmm7, %xmm7;
+       vpshufd $0x1b, %xmm8, %xmm8;
+       vpshufd $0x1b, %xmm9, %xmm9;
+
+       vmovdqu KL128, cmll_sub(0, CTX);
+       vpshufd $0x1b, KL128, KL128;
+       vmovdqu KB128, cmll_sub(2, CTX);
+       vmovdqu %xmm4, cmll_sub(4, CTX);
+       vmovdqu %xmm5, cmll_sub(6, CTX);
+       vmovdqu %xmm6, cmll_sub(8, CTX);
+       vmovdqu %xmm7, cmll_sub(10, CTX);
+       vmovdqu %xmm8, cmll_sub(12, CTX);
+       vmovdqu %xmm9, cmll_sub(14, CTX);
+
+       vmovdqu cmll_sub(32, CTX), KB128;
+
+       /* subl(1) ^= subr(1) & ~subr(17); */
+       vpandn %xmm15, %xmm10, %xmm13;
+       vpslldq $12, %xmm13, %xmm13;
+       vpsrldq $8, %xmm13, %xmm13;
+       vpxor %xmm13, %xmm15, %xmm15;
+       /* dw = subl(1) & subl(17), subr(1) ^= CAMELLIA_RL1(dw); */
+       vpand %xmm15, %xmm10, %xmm14;
+       vpslld $1, %xmm14, %xmm13;
+       vpsrld $31, %xmm14, %xmm14;
+       vpaddd %xmm13, %xmm14, %xmm14;
+       vpslldq $8, %xmm14, %xmm14;
+       vpsrldq $12, %xmm14, %xmm14;
+       vpxor %xmm14, %xmm15, %xmm15;
+
+       vpxor %xmm15, %xmm11, %xmm11;
+       vpxor %xmm15, %xmm12, %xmm12;
+
+       vec_ror128(KL128, %xmm4, 128-77, %xmm14);
+       vec_ror128(KA128, %xmm5, 128-77, %xmm14);
+       vec_ror128(KR128, %xmm6, 128-94, %xmm14);
+       vec_ror128(KA128, %xmm7, 128-94, %xmm14);
+       vec_ror128(KL128, %xmm8, 128-111, %xmm14);
+       vec_ror128(KB128, %xmm9, 128-111, %xmm14);
+
+       vpxor %xmm15, %xmm4, %xmm4;
+
+       vpshufd $0x1b, %xmm10, %xmm10;
+       vpshufd $0x1b, %xmm11, %xmm11;
+       vpshufd $0x1b, %xmm12, %xmm12;
+       vpshufd $0x1b, %xmm4, %xmm4;
+
+       vmovdqu %xmm10, cmll_sub(16, CTX);
+       vmovdqu %xmm11, cmll_sub(18, CTX);
+       vmovdqu %xmm12, cmll_sub(20, CTX);
+       vmovdqu %xmm4, cmll_sub(22, CTX);
+
+       /* subl(1) ^= subr(1) & ~subr(25); */
+       vpandn %xmm15, %xmm5, %xmm13;
+       vpslldq $12, %xmm13, %xmm13;
+       vpsrldq $8, %xmm13, %xmm13;
+       vpxor %xmm13, %xmm15, %xmm15;
+       /* dw = subl(1) & subl(25), subr(1) ^= CAMELLIA_RL1(dw); */
+       vpand %xmm15, %xmm5, %xmm14;
+       vpslld $1, %xmm14, %xmm13;
+       vpsrld $31, %xmm14, %xmm14;
+       vpaddd %xmm13, %xmm14, %xmm14;
+       vpslldq $8, %xmm14, %xmm14;
+       vpsrldq $12, %xmm14, %xmm14;
+       vpxor %xmm14, %xmm15, %xmm15;
+
+       vpxor %xmm15, %xmm6, %xmm6;
+       vpxor %xmm15, %xmm7, %xmm7;
+       vpxor %xmm15, %xmm8, %xmm8;
+       vpslldq $8, %xmm15, %xmm15;
+       vpxor %xmm15, %xmm9, %xmm9;
+
+       /* absorb kw4 to other subkeys */
+       vpslldq $8, %xmm9, %xmm15;
+       vpxor %xmm15, %xmm8, %xmm8;
+       vpxor %xmm15, %xmm7, %xmm7;
+       vpxor %xmm15, %xmm6, %xmm6;
+
+       /* subl(33) ^= subr(33) & ~subr(24); */
+       vpandn %xmm15, %xmm5, %xmm14;
+       vpslldq $4, %xmm14, %xmm14;
+       vpxor %xmm14, %xmm15, %xmm15;
+       /* dw = subl(33) & subl(24), subr(33) ^= CAMELLIA_RL1(dw); */
+       vpand %xmm15, %xmm5, %xmm14;
+       vpslld $1, %xmm14, %xmm13;
+       vpsrld $31, %xmm14, %xmm14;
+       vpaddd %xmm13, %xmm14, %xmm14;
+       vpsrldq $12, %xmm14, %xmm14;
+       vpslldq $8, %xmm14, %xmm14;
+       vpxor %xmm14, %xmm15, %xmm15;
+
+       vpshufd $0x1b, %xmm5, %xmm5;
+       vpshufd $0x1b, %xmm6, %xmm6;
+       vpshufd $0x1b, %xmm7, %xmm7;
+       vpshufd $0x1b, %xmm8, %xmm8;
+       vpshufd $0x1b, %xmm9, %xmm9;
+
+       vmovdqu %xmm5, cmll_sub(24, CTX);
+       vmovdqu %xmm6, cmll_sub(26, CTX);
+       vmovdqu %xmm7, cmll_sub(28, CTX);
+       vmovdqu %xmm8, cmll_sub(30, CTX);
+       vmovdqu %xmm9, cmll_sub(32, CTX);
+
+       vpshufd $0x1b, cmll_sub(22, CTX), %xmm0;
+       vpshufd $0x1b, cmll_sub(20, CTX), %xmm1;
+       vpshufd $0x1b, cmll_sub(18, CTX), %xmm2;
+       vpshufd $0x1b, cmll_sub(16, CTX), %xmm3;
+       vpshufd $0x1b, cmll_sub(14, CTX), %xmm4;
+       vpshufd $0x1b, cmll_sub(12, CTX), %xmm5;
+       vpshufd $0x1b, cmll_sub(10, CTX), %xmm6;
+       vpshufd $0x1b, cmll_sub(8, CTX), %xmm7;
+
+       vpxor %xmm15, %xmm0, %xmm0;
+       vpxor %xmm15, %xmm1, %xmm1;
+       vpxor %xmm15, %xmm2, %xmm2;
+
+       /* subl(33) ^= subr(33) & ~subr(24); */
+       vpandn %xmm15, %xmm3, %xmm14;
+       vpslldq $4, %xmm14, %xmm14;
+       vpxor %xmm14, %xmm15, %xmm15;
+       /* dw = subl(33) & subl(24), subr(33) ^= CAMELLIA_RL1(dw); */
+       vpand %xmm15, %xmm3, %xmm14;
+       vpslld $1, %xmm14, %xmm13;
+       vpsrld $31, %xmm14, %xmm14;
+       vpaddd %xmm13, %xmm14, %xmm14;
+       vpsrldq $12, %xmm14, %xmm14;
+       vpslldq $8, %xmm14, %xmm14;
+       vpxor %xmm14, %xmm15, %xmm15;
+
+       vpxor %xmm15, %xmm4, %xmm4;
+       vpxor %xmm15, %xmm5, %xmm5;
+       vpxor %xmm15, %xmm6, %xmm6;
+
+       vpshufd $0x1b, %xmm0, %xmm0;
+       vpshufd $0x1b, %xmm1, %xmm1;
+       vpshufd $0x1b, %xmm2, %xmm2;
+       vpshufd $0x1b, %xmm4, %xmm4;
+       vpshufd $0x1b, %xmm5, %xmm5;
+       vpshufd $0x1b, %xmm6, %xmm6;
+
+       vmovdqu %xmm0, cmll_sub(22, CTX);
+       vmovdqu %xmm1, cmll_sub(20, CTX);
+       vmovdqu %xmm2, cmll_sub(18, CTX);
+       vmovdqu %xmm4, cmll_sub(14, CTX);
+       vmovdqu %xmm5, cmll_sub(12, CTX);
+       vmovdqu %xmm6, cmll_sub(10, CTX);
+
+       vpshufd $0x1b, cmll_sub(6, CTX), %xmm6;
+       vpshufd $0x1b, cmll_sub(4, CTX), %xmm4;
+       vpshufd $0x1b, cmll_sub(2, CTX), %xmm2;
+       vpshufd $0x1b, cmll_sub(0, CTX), %xmm0;
+
+       /* subl(33) ^= subr(33) & ~subr(24); */
+       vpandn %xmm15, %xmm7, %xmm14;
+       vpslldq $4, %xmm14, %xmm14;
+       vpxor %xmm14, %xmm15, %xmm15;
+       /* dw = subl(33) & subl(24), subr(33) ^= CAMELLIA_RL1(dw); */
+       vpand %xmm15, %xmm7, %xmm14;
+       vpslld $1, %xmm14, %xmm13;
+       vpsrld $31, %xmm14, %xmm14;
+       vpaddd %xmm13, %xmm14, %xmm14;
+       vpsrldq $12, %xmm14, %xmm14;
+       vpslldq $8, %xmm14, %xmm14;
+       vpxor %xmm14, %xmm15, %xmm15;
+
+       vpxor %xmm15, %xmm6, %xmm6;
+       vpxor %xmm15, %xmm4, %xmm4;
+       vpxor %xmm15, %xmm2, %xmm2;
+       vpxor %xmm15, %xmm0, %xmm0;
+
+       vpshufd $0x1b, %xmm6, %xmm6;
+       vpshufd $0x1b, %xmm4, %xmm4;
+       vpshufd $0x1b, %xmm2, %xmm2;
+       vpshufd $0x1b, %xmm0, %xmm0;
+
+       vpsrldq $8, %xmm2, %xmm3;
+       vpsrldq $8, %xmm4, %xmm5;
+       vpsrldq $8, %xmm6, %xmm7;
+
+        /*
+        * key XOR is end of F-function.
+        */
+       vpxor %xmm2, %xmm0, %xmm0;
+       vpxor %xmm4, %xmm2, %xmm2;
+
+       vmovq %xmm0, cmll_sub(0, CTX);
+       vmovq %xmm3, cmll_sub(2, CTX);
+       vpxor %xmm5, %xmm3, %xmm3;
+       vpxor %xmm6, %xmm4, %xmm4;
+       vpxor %xmm7, %xmm5, %xmm5;
+       vmovq %xmm2, cmll_sub(3, CTX);
+       vmovq %xmm3, cmll_sub(4, CTX);
+       vmovq %xmm4, cmll_sub(5, CTX);
+       vmovq %xmm5, cmll_sub(6, CTX);
+
+       vmovq cmll_sub(7, CTX), %xmm7;
+       vmovq cmll_sub(8, CTX), %xmm8;
+       vmovq cmll_sub(9, CTX), %xmm9;
+       vmovq cmll_sub(10, CTX), %xmm10;
+       /* tl = subl(10) ^ (subr(10) & ~subr(8)); */
+       vpandn %xmm10, %xmm8, %xmm15;
+       vpsrldq $4, %xmm15, %xmm15;
+       vpxor %xmm15, %xmm10, %xmm0;
+       /* dw = tl & subl(8), tr = subr(10) ^ CAMELLIA_RL1(dw); */
+       vpand %xmm8, %xmm0, %xmm15;
+       vpslld $1, %xmm15, %xmm14;
+       vpsrld $31, %xmm15, %xmm15;
+       vpaddd %xmm14, %xmm15, %xmm15;
+       vpslldq $12, %xmm15, %xmm15;
+       vpsrldq $8, %xmm15, %xmm15;
+       vpxor %xmm15, %xmm0, %xmm0;
+
+       vpxor %xmm0, %xmm6, %xmm6;
+       vmovq %xmm6, cmll_sub(7, CTX);
+
+       vmovq cmll_sub(11, CTX), %xmm11;
+       vmovq cmll_sub(12, CTX), %xmm12;
+       vmovq cmll_sub(13, CTX), %xmm13;
+       vmovq cmll_sub(14, CTX), %xmm14;
+       vmovq cmll_sub(15, CTX), %xmm15;
+       /* tl = subl(7) ^ (subr(7) & ~subr(9)); */
+       vpandn %xmm7, %xmm9, %xmm1;
+       vpsrldq $4, %xmm1, %xmm1;
+       vpxor %xmm1, %xmm7, %xmm0;
+       /* dw = tl & subl(9), tr = subr(7) ^ CAMELLIA_RL1(dw); */
+       vpand %xmm9, %xmm0, %xmm1;
+       vpslld $1, %xmm1, %xmm2;
+       vpsrld $31, %xmm1, %xmm1;
+       vpaddd %xmm2, %xmm1, %xmm1;
+       vpslldq $12, %xmm1, %xmm1;
+       vpsrldq $8, %xmm1, %xmm1;
+       vpxor %xmm1, %xmm0, %xmm0;
+
+       vpxor %xmm11, %xmm0, %xmm0;
+       vpxor %xmm12, %xmm10, %xmm10;
+       vpxor %xmm13, %xmm11, %xmm11;
+       vpxor %xmm14, %xmm12, %xmm12;
+       vpxor %xmm15, %xmm13, %xmm13;
+       vmovq %xmm0, cmll_sub(10, CTX);
+       vmovq %xmm10, cmll_sub(11, CTX);
+       vmovq %xmm11, cmll_sub(12, CTX);
+       vmovq %xmm12, cmll_sub(13, CTX);
+       vmovq %xmm13, cmll_sub(14, CTX);
+
+       vmovq cmll_sub(16, CTX), %xmm6;
+       vmovq cmll_sub(17, CTX), %xmm7;
+       vmovq cmll_sub(18, CTX), %xmm8;
+       vmovq cmll_sub(19, CTX), %xmm9;
+       vmovq cmll_sub(20, CTX), %xmm10;
+       /* tl = subl(18) ^ (subr(18) & ~subr(16)); */
+       vpandn %xmm8, %xmm6, %xmm1;
+       vpsrldq $4, %xmm1, %xmm1;
+       vpxor %xmm1, %xmm8, %xmm0;
+       /* dw = tl & subl(16), tr = subr(18) ^ CAMELLIA_RL1(dw); */
+       vpand %xmm6, %xmm0, %xmm1;
+       vpslld $1, %xmm1, %xmm2;
+       vpsrld $31, %xmm1, %xmm1;
+       vpaddd %xmm2, %xmm1, %xmm1;
+       vpslldq $12, %xmm1, %xmm1;
+       vpsrldq $8, %xmm1, %xmm1;
+       vpxor %xmm1, %xmm0, %xmm0;
+
+       vpxor %xmm14, %xmm0, %xmm0;
+       vmovq %xmm0, cmll_sub(15, CTX);
+
+       /* tl = subl(15) ^ (subr(15) & ~subr(17)); */
+       vpandn %xmm15, %xmm7, %xmm1;
+       vpsrldq $4, %xmm1, %xmm1;
+       vpxor %xmm1, %xmm15, %xmm0;
+       /* dw = tl & subl(17), tr = subr(15) ^ CAMELLIA_RL1(dw); */
+       vpand %xmm7, %xmm0, %xmm1;
+       vpslld $1, %xmm1, %xmm2;
+       vpsrld $31, %xmm1, %xmm1;
+       vpaddd %xmm2, %xmm1, %xmm1;
+       vpslldq $12, %xmm1, %xmm1;
+       vpsrldq $8, %xmm1, %xmm1;
+       vpxor %xmm1, %xmm0, %xmm0;
+
+       vmovq cmll_sub(21, CTX), %xmm1;
+       vmovq cmll_sub(22, CTX), %xmm2;
+       vmovq cmll_sub(23, CTX), %xmm3;
+       vmovq cmll_sub(24, CTX), %xmm4;
+
+       vpxor %xmm9, %xmm0, %xmm0;
+       vpxor %xmm10, %xmm8, %xmm8;
+       vpxor %xmm1, %xmm9, %xmm9;
+       vpxor %xmm2, %xmm10, %xmm10;
+       vpxor %xmm3, %xmm1, %xmm1;
+
+       vmovq %xmm0, cmll_sub(18, CTX);
+       vmovq %xmm8, cmll_sub(19, CTX);
+       vmovq %xmm9, cmll_sub(20, CTX);
+       vmovq %xmm10, cmll_sub(21, CTX);
+       vmovq %xmm1, cmll_sub(22, CTX);
+
+       vmovq cmll_sub(25, CTX), %xmm5;
+       vmovq cmll_sub(26, CTX), %xmm6;
+       vmovq cmll_sub(27, CTX), %xmm7;
+       vmovq cmll_sub(28, CTX), %xmm8;
+       vmovq cmll_sub(29, CTX), %xmm9;
+       vmovq cmll_sub(30, CTX), %xmm10;
+       vmovq cmll_sub(31, CTX), %xmm11;
+       vmovq cmll_sub(32, CTX), %xmm12;
+
+       /* tl = subl(26) ^ (subr(26) & ~subr(24)); */
+       vpandn %xmm6, %xmm4, %xmm15;
+       vpsrldq $4, %xmm15, %xmm15;
+       vpxor %xmm15, %xmm6, %xmm0;
+       /* dw = tl & subl(24), tr = subr(26) ^ CAMELLIA_RL1(dw); */
+       vpand %xmm4, %xmm0, %xmm15;
+       vpslld $1, %xmm15, %xmm14;
+       vpsrld $31, %xmm15, %xmm15;
+       vpaddd %xmm14, %xmm15, %xmm15;
+       vpslldq $12, %xmm15, %xmm15;
+       vpsrldq $8, %xmm15, %xmm15;
+       vpxor %xmm15, %xmm0, %xmm0;
+
+       vpxor %xmm0, %xmm2, %xmm2;
+       vmovq %xmm2, cmll_sub(23, CTX);
+
+       /* tl = subl(23) ^ (subr(23) &  ~subr(25)); */
+       vpandn %xmm3, %xmm5, %xmm15;
+       vpsrldq $4, %xmm15, %xmm15;
+       vpxor %xmm15, %xmm3, %xmm0;
+       /* dw = tl & subl(25), tr = subr(23) ^ CAMELLIA_RL1(dw); */
+       vpand %xmm5, %xmm0, %xmm15;
+       vpslld $1, %xmm15, %xmm14;
+       vpsrld $31, %xmm15, %xmm15;
+       vpaddd %xmm14, %xmm15, %xmm15;
+       vpslldq $12, %xmm15, %xmm15;
+       vpsrldq $8, %xmm15, %xmm15;
+       vpxor %xmm15, %xmm0, %xmm0;
+
+       vpxor %xmm7, %xmm0, %xmm0;
+       vpxor %xmm8, %xmm6, %xmm6;
+       vpxor %xmm9, %xmm7, %xmm7;
+       vpxor %xmm10, %xmm8, %xmm8;
+       vpxor %xmm11, %xmm9, %xmm9;
+       vpxor %xmm12, %xmm11, %xmm11;
+
+       vmovq %xmm0, cmll_sub(26, CTX);
+       vmovq %xmm6, cmll_sub(27, CTX);
+       vmovq %xmm7, cmll_sub(28, CTX);
+       vmovq %xmm8, cmll_sub(29, CTX);
+       vmovq %xmm9, cmll_sub(30, CTX);
+       vmovq %xmm10, cmll_sub(31, CTX);
+       vmovq %xmm11, cmll_sub(32, CTX);
+
+       /* kw2 and kw4 are unused now. */
+       movq $0, cmll_sub(1, CTX);
+       movq $0, cmll_sub(33, CTX);
+
+       vzeroall;
+
+       ret_spec_stop;
+       CFI_ENDPROC();
+ELF(.size __camellia_avx_setup256,.-__camellia_avx_setup256;)
+
+.align 16
+.globl _gcry_camellia_aesni_avx_keygen
+ELF(.type  _gcry_camellia_aesni_avx_keygen,@function;)
+
+_gcry_camellia_aesni_avx_keygen:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: key
+        *      %rdx: keylen
+        */
+       CFI_STARTPROC();
+
+       vzeroupper;
+
+       vmovdqu (%rsi), %xmm0; /* load first 16 key bytes */
+       cmpl $24, %edx; /* key length in bytes */
+       jb __camellia_avx_setup128; /* < 24 bytes: 128-bit key schedule */
+       je .Lprepare_key192; /* == 24 bytes: 192-bit key */
+
+       vmovdqu 16(%rsi), %xmm1; /* 256-bit key: load upper 16 bytes */
+       jmp __camellia_avx_setup256; /* tail jump; setup256 returns to caller */
+
+.Lprepare_key192:
+       vpcmpeqd %xmm2, %xmm2, %xmm2; /* xmm2 = all-ones */
+       vmovq 16(%rsi), %xmm1; /* last 8 key bytes; upper half zeroed */
+
+       vpxor %xmm1, %xmm2, %xmm2; /* low qword = bitwise NOT of key tail */
+       vpslldq $8, %xmm2, %xmm2; /* move inverted qword to upper half */
+       vpor %xmm2, %xmm1, %xmm1; /* xmm1 = { key tail, ~key tail } */
+
+       jmp __camellia_avx_setup256; /* handle as expanded 256-bit key */
+       CFI_ENDPROC();
+ELF(.size _gcry_camellia_aesni_avx_keygen,.-_gcry_camellia_aesni_avx_keygen;)
+
+#endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/
+#endif /*__x86_64*/
diff --git a/grub-core/lib/libgcrypt/cipher/camellia-aesni-avx2-amd64.S 
b/grub-core/lib/libgcrypt/cipher/camellia-aesni-avx2-amd64.S
new file mode 100644
index 000000000..5102d1912
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/camellia-aesni-avx2-amd64.S
@@ -0,0 +1,34 @@
+/* camellia-aesni-avx2-amd64.S  -  AES-NI/AVX2 implementation of Camellia 
cipher
+ *
+ * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+    defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
+
+#undef CAMELLIA_VAES_BUILD /* select plain AES-NI paths in the shared body */
+#define FUNC_NAME(func) _gcry_camellia_aesni_avx2_ ## func /* symbol prefix */
+
+#include "camellia-aesni-avx2-amd64.h" /* shared AVX2 implementation body */
+
+#endif /* defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) */
+#endif /* __x86_64 */
diff --git a/grub-core/lib/libgcrypt/cipher/camellia-aesni-avx2-amd64.h 
b/grub-core/lib/libgcrypt/cipher/camellia-aesni-avx2-amd64.h
new file mode 100644
index 000000000..4c3fb4b26
--- /dev/null
+++ b/grub-core/lib/libgcrypt/cipher/camellia-aesni-avx2-amd64.h
@@ -0,0 +1,2327 @@
+/* camellia-aesni-avx2-amd64.h - AES-NI/VAES/GFNI/AVX2 implementation of 
Camellia
+ *
+ * Copyright (C) 2013-2015,2020-2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_CAMELLIA_AESNI_AVX2_AMD64_H
+#define GCRY_CAMELLIA_AESNI_AVX2_AMD64_H
+
+#include "asm-common-amd64.h"
+
+#define CAMELLIA_TABLE_BYTE_LEN 272 /* size of the subkey table in bytes */
+
+/* struct CAMELLIA_context: */
+#define key_table 0 /* byte offset of the subkey table */
+#define key_bitlength CAMELLIA_TABLE_BYTE_LEN /* key-length field follows it */
+
+/* register macros */
+#define CTX %rdi /* context pointer */
+#define RIO %r8 /* data in/out pointer */
+
+/**********************************************************************
+  helper macros
+ **********************************************************************/
+
+#ifndef CAMELLIA_GFNI_BUILD
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+       vpand x, mask4bit, tmp0; /* tmp0 = low nibble of each byte */ \
+       vpandn x, mask4bit, x; /* x = high-nibble bits */ \
+       vpsrld $4, x, x; /* shift high nibbles into low positions */ \
+       \
+       vpshufb tmp0, lo_t, tmp0; /* per-nibble table lookup in lo_t */ \
+       vpshufb x, hi_t, x; /* per-nibble table lookup in hi_t */ \
+       vpxor tmp0, x, x; /* combine: x = hi_t[hi] ^ lo_t[lo] */
+#endif
+
+#define ymm0_x xmm0 /* "_x" aliases: xmm (low 128-bit) view of each ymm */
+#define ymm1_x xmm1
+#define ymm2_x xmm2
+#define ymm3_x xmm3
+#define ymm4_x xmm4
+#define ymm5_x xmm5
+#define ymm6_x xmm6
+#define ymm7_x xmm7
+#define ymm8_x xmm8
+#define ymm9_x xmm9
+#define ymm10_x xmm10
+#define ymm11_x xmm11
+#define ymm12_x xmm12
+#define ymm13_x xmm13
+#define ymm14_x xmm14
+#define ymm15_x xmm15
+
+#ifdef CAMELLIA_VAES_BUILD
+# define IF_AESNI(...) /* AES-NI-only statements: dropped in VAES build */
+# define IF_VAES(...) __VA_ARGS__
+#else
+# define IF_AESNI(...) __VA_ARGS__
+# define IF_VAES(...) /* VAES-only statements: dropped in AES-NI build */
+#endif
+
+#ifdef CAMELLIA_GFNI_BUILD
+# define IF_GFNI(...) __VA_ARGS__
+# define IF_NOT_GFNI(...)
+#else
+# define IF_GFNI(...)
+# define IF_NOT_GFNI(...) __VA_ARGS__
+#endif
+
+/**********************************************************************
+  GFNI helper macros and constants
+ **********************************************************************/
+
+#ifdef CAMELLIA_GFNI_BUILD
+
+#define BV8(a0,a1,a2,a3,a4,a5,a6,a7) /* pack 8 bit values; a0 = bit 0 */ \
+       ( (((a0) & 1) << 0) | \
+         (((a1) & 1) << 1) | \
+         (((a2) & 1) << 2) | \
+         (((a3) & 1) << 3) | \
+         (((a4) & 1) << 4) | \
+         (((a5) & 1) << 5) | \
+         (((a6) & 1) << 6) | \
+         (((a7) & 1) << 7) )
+
+#define BM8X8(l0,l1,l2,l3,l4,l5,l6,l7) /* 8x8 bit-matrix; l0 = top byte */ \
+       ( ((l7) << (0 * 8)) | \
+         ((l6) << (1 * 8)) | \
+         ((l5) << (2 * 8)) | \
+         ((l4) << (3 * 8)) | \
+         ((l3) << (4 * 8)) | \
+         ((l2) << (5 * 8)) | \
+         ((l1) << (6 * 8)) | \
+         ((l0) << (7 * 8)) )
+
+/* Pre-filters and post-filters constants for Camellia sboxes s1, s2, s3 and 
s4.
+ *   See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48.
+ *
+ * Pre-filters are directly from above source, "θ₁"/"θ₄". Post-filters are
+ * combination of function "A" (AES SubBytes affine transformation) and
+ * "ψ₁"/"ψ₂"/"ψ₃".
+ */
+
+/* Constant from "θ₁(x)" and "θ₄(x)" functions. */
+#define pre_filter_constant_s1234 BV8(1, 0, 1, 0, 0, 0, 1, 0) /* = 0x45 */
+
+/* Constant from "ψ₁(A(x))" function: */
+#define post_filter_constant_s14  BV8(0, 1, 1, 1, 0, 1, 1, 0) /* = 0x6e */
+
+/* Constant from "ψ₂(A(x))" function: */
+#define post_filter_constant_s2   BV8(0, 0, 1, 1, 1, 0, 1, 1) /* = 0xdc */
+
+/* Constant from "ψ₃(A(x))" function: */
+#define post_filter_constant_s3   BV8(1, 1, 1, 0, 1, 1, 0, 0) /* = 0x37 */
+
+#endif /* CAMELLIA_GFNI_BUILD */
+
+/**********************************************************************
+  32-way camellia
+ **********************************************************************/
+
+#ifdef CAMELLIA_GFNI_BUILD
+
+/* roundsm32 (GFNI version)
+ * IN:
+ *   x0..x7: byte-sliced AB state
+ *   mem_cd: register pointer storing CD state
+ *   key: index for key material
+ * OUT:
+ *   x0..x7: new byte-sliced CD state
+ */
+#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, \
+                 t6, t7, mem_cd, key) \
+       /* \
+        * S-function with AES subbytes \
+        */ \
+       vpbroadcastq .Lpre_filter_bitmatrix_s123 rRIP, t5; /* s1/s2/s3 pre-filter matrix */ \
+       vpbroadcastq .Lpre_filter_bitmatrix_s4 rRIP, t2; /* s4 pre-filter matrix */ \
+       vpbroadcastq .Lpost_filter_bitmatrix_s14 rRIP, t4; \
+       vpbroadcastq .Lpost_filter_bitmatrix_s2 rRIP, t3; \
+       vpbroadcastq .Lpost_filter_bitmatrix_s3 rRIP, t6; \
+       \
+       /* prefilter sboxes */ \
+       vgf2p8affineqb $(pre_filter_constant_s1234), t5, x0, x0; \
+       vgf2p8affineqb $(pre_filter_constant_s1234), t5, x7, x7; \
+       vgf2p8affineqb $(pre_filter_constant_s1234), t2, x3, x3; \
+       vgf2p8affineqb $(pre_filter_constant_s1234), t2, x6, x6; \
+       vgf2p8affineqb $(pre_filter_constant_s1234), t5, x2, x2; \
+       vgf2p8affineqb $(pre_filter_constant_s1234), t5, x5, x5; \
+       vgf2p8affineqb $(pre_filter_constant_s1234), t5, x1, x1; \
+       vgf2p8affineqb $(pre_filter_constant_s1234), t5, x4, x4; \
+       \
+       /* sbox GF8 inverse + postfilter sboxes 1 and 4 */ \
+       vgf2p8affineinvqb $(post_filter_constant_s14), t4, x0, x0; \
+       vgf2p8affineinvqb $(post_filter_constant_s14), t4, x7, x7; \
+       vgf2p8affineinvqb $(post_filter_constant_s14), t4, x3, x3; \
+       vgf2p8affineinvqb $(post_filter_constant_s14), t4, x6, x6; \
+       \
+       /* sbox GF8 inverse + postfilter sbox 3 */ \
+       vgf2p8affineinvqb $(post_filter_constant_s3), t6, x2, x2; \
+       vgf2p8affineinvqb $(post_filter_constant_s3), t6, x5, x5; \
+       \
+       /* sbox GF8 inverse + postfilter sbox 2 */ \
+       vgf2p8affineinvqb $(post_filter_constant_s2), t3, x1, x1; \
+       vgf2p8affineinvqb $(post_filter_constant_s2), t3, x4, x4; \
+       \
+       vpbroadcastb 7+key, t7; /* broadcast subkey bytes (interleaved below) */ \
+       vpbroadcastb 6+key, t6; \
+       \
+       /* P-function */ \
+       vpxor x5, x0, x0; \
+       vpxor x6, x1, x1; \
+       vpxor x7, x2, x2; \
+       vpxor x4, x3, x3; \
+       \
+       vpbroadcastb 5+key, t5; \
+       vpbroadcastb 4+key, t4; \
+       \
+       vpxor x2, x4, x4; \
+       vpxor x3, x5, x5; \
+       vpxor x0, x6, x6; \
+       vpxor x1, x7, x7; \
+       \
+       vpbroadcastb 3+key, t3; \
+       vpbroadcastb 2+key, t2; \
+       \
+       vpxor x7, x0, x0; \
+       vpxor x4, x1, x1; \
+       vpxor x5, x2, x2; \
+       vpxor x6, x3, x3; \
+       \
+       vpbroadcastb 1+key, t1; \
+       vpbroadcastb 0+key, t0; \
+       \
+       vpxor x3, x4, x4; \
+       vpxor x0, x5, x5; \
+       vpxor x1, x6, x6; \
+       vpxor x2, x7, x7; /* note: high and low parts swapped */ \
+       \
+       /* Add key material and result to CD (x becomes new CD) */ \
+       \
+       vpxor t7, x0, x0; \
+       vpxor 4 * 32(mem_cd), x0, x0; \
+       \
+       vpxor t6, x1, x1; \
+       vpxor 5 * 32(mem_cd), x1, x1; \
+       \
+       vpxor t5, x2, x2; \
+       vpxor 6 * 32(mem_cd), x2, x2; \
+       \
+       vpxor t4, x3, x3; \
+       vpxor 7 * 32(mem_cd), x3, x3; \
+       \
+       vpxor t3, x4, x4; \
+       vpxor 0 * 32(mem_cd), x4, x4; \
+       \
+       vpxor t2, x5, x5; \
+       vpxor 1 * 32(mem_cd), x5, x5; \
+       \
+       vpxor t1, x6, x6; \
+       vpxor 2 * 32(mem_cd), x6, x6; \
+       \
+       vpxor t0, x7, x7; \
+       vpxor 3 * 32(mem_cd), x7, x7;
+
+#else /* CAMELLIA_GFNI_BUILD */
+
+/* roundsm32 (AES-NI / VAES version)
+ * IN:
+ *   x0..x7: byte-sliced AB state
+ *   mem_cd: register pointer storing CD state
+ *   key: index for key material
+ * OUT:
+ *   x0..x7: new byte-sliced CD state
+ */
+#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, \
+                 t6, t7, mem_cd, key) \
+       /* \
+        * S-function with AES subbytes \
+        */ \
+       vbroadcasti128 .Linv_shift_row rRIP, t4; /* inverse ShiftRows mask */ \
+       vpbroadcastd .L0f0f0f0f rRIP, t7; /* nibble mask for filter_8bit */ \
+       vbroadcasti128 .Lpre_tf_lo_s1 rRIP, t5; \
+       vbroadcasti128 .Lpre_tf_hi_s1 rRIP, t6; \
+       vbroadcasti128 .Lpre_tf_lo_s4 rRIP, t2; \
+       vbroadcasti128 .Lpre_tf_hi_s4 rRIP, t3; \
+       \
+       /* AES inverse shift rows */ \
+       vpshufb t4, x0, x0; \
+       vpshufb t4, x7, x7; \
+       vpshufb t4, x3, x3; \
+       vpshufb t4, x6, x6; \
+       vpshufb t4, x2, x2; \
+       vpshufb t4, x5, x5; \
+       vpshufb t4, x1, x1; \
+       vpshufb t4, x4, x4; \
+       \
+       /* prefilter sboxes 1, 2 and 3 */ \
+       /* prefilter sbox 4 */ \
+       filter_8bit(x0, t5, t6, t7, t4); \
+       filter_8bit(x7, t5, t6, t7, t4); \
+       IF_AESNI(vextracti128 $1, x0, t0##_x); \
+       IF_AESNI(vextracti128 $1, x7, t1##_x); \
+       filter_8bit(x3, t2, t3, t7, t4); \
+       filter_8bit(x6, t2, t3, t7, t4); \
+       IF_AESNI(vextracti128 $1, x3, t3##_x); \
+       IF_AESNI(vextracti128 $1, x6, t2##_x); \
+       filter_8bit(x2, t5, t6, t7, t4); \
+       filter_8bit(x5, t5, t6, t7, t4); \
+       filter_8bit(x1, t5, t6, t7, t4); \
+       filter_8bit(x4, t5, t6, t7, t4); \
+       \
+       vpxor t4, t4, t4; /* zero round key: vaesenclast applies SubBytes+ShiftRows only */ \
+       \
+       /* AES subbytes + AES shift rows */ \
+       IF_AESNI(vextracti128 $1, x2, t6##_x; \
+                vextracti128 $1, x5, t5##_x; \
+                vaesenclast t4##_x, x0##_x, x0##_x; \
+                vaesenclast t4##_x, t0##_x, t0##_x; \
+                vaesenclast t4##_x, x7##_x, x7##_x; \
+                vaesenclast t4##_x, t1##_x, t1##_x; \
+                vaesenclast t4##_x, x3##_x, x3##_x; \
+                vaesenclast t4##_x, t3##_x, t3##_x; \
+                vaesenclast t4##_x, x6##_x, x6##_x; \
+                vaesenclast t4##_x, t2##_x, t2##_x; \
+                vinserti128 $1, t0##_x, x0, x0; \
+                vinserti128 $1, t1##_x, x7, x7; \
+                vinserti128 $1, t3##_x, x3, x3; \
+                vinserti128 $1, t2##_x, x6, x6; \
+                vextracti128 $1, x1, t3##_x; \
+                vextracti128 $1, x4, t2##_x); \
+       vbroadcasti128 .Lpost_tf_lo_s1 rRIP, t0; \
+       vbroadcasti128 .Lpost_tf_hi_s1 rRIP, t1; \
+       IF_AESNI(vaesenclast t4##_x, x2##_x, x2##_x; \
+                vaesenclast t4##_x, t6##_x, t6##_x; \
+                vaesenclast t4##_x, x5##_x, x5##_x; \
+                vaesenclast t4##_x, t5##_x, t5##_x; \
+                vaesenclast t4##_x, x1##_x, x1##_x; \
+                vaesenclast t4##_x, t3##_x, t3##_x; \
+                vaesenclast t4##_x, x4##_x, x4##_x; \
+                vaesenclast t4##_x, t2##_x, t2##_x; \
+                vinserti128 $1, t6##_x, x2, x2; \
+                vinserti128 $1, t5##_x, x5, x5; \
+                vinserti128 $1, t3##_x, x1, x1; \
+                vinserti128 $1, t2##_x, x4, x4); \
+       IF_VAES(vaesenclast t4, x0, x0; \
+               vaesenclast t4, x7, x7; \
+               vaesenclast t4, x3, x3; \
+               vaesenclast t4, x6, x6; \
+               vaesenclast t4, x2, x2; \
+               vaesenclast t4, x5, x5; \
+               vaesenclast t4, x1, x1; \
+               vaesenclast t4, x4, x4); \
+       \
+       /* postfilter sboxes 1 and 4 */ \
+       vbroadcasti128 .Lpost_tf_lo_s3 rRIP, t2; \
+       vbroadcasti128 .Lpost_tf_hi_s3 rRIP, t3; \
+       filter_8bit(x0, t0, t1, t7, t4); \
+       filter_8bit(x7, t0, t1, t7, t4); \
+       filter_8bit(x3, t0, t1, t7, t6); \
+       filter_8bit(x6, t0, t1, t7, t6); \
+       \
+       /* postfilter sbox 3 */ \
+       vbroadcasti128 .Lpost_tf_lo_s2 rRIP, t4; \
+       vbroadcasti128 .Lpost_tf_hi_s2 rRIP, t5; \
+       filter_8bit(x2, t2, t3, t7, t6); \
+       filter_8bit(x5, t2, t3, t7, t6); \
+       \
+       /* postfilter sbox 2 */ \
+       filter_8bit(x1, t4, t5, t7, t2); \
+       filter_8bit(x4, t4, t5, t7, t2); \
+       \
+       vpbroadcastb 7+key, t7; /* broadcast subkey bytes (interleaved below) */ \
+       vpbroadcastb 6+key, t6; \
+       \
+       /* P-function */ \
+       vpxor x5, x0, x0; \
+       vpxor x6, x1, x1; \
+       vpxor x7, x2, x2; \
+       vpxor x4, x3, x3; \
+       \
+       vpbroadcastb 5+key, t5; \
+       vpbroadcastb 4+key, t4; \
+       \
+       vpxor x2, x4, x4; \
+       vpxor x3, x5, x5; \
+       vpxor x0, x6, x6; \
+       vpxor x1, x7, x7; \
+       \
+       vpbroadcastb 3+key, t3; \
+       vpbroadcastb 2+key, t2; \
+       \
+       vpxor x7, x0, x0; \
+       vpxor x4, x1, x1; \
+       vpxor x5, x2, x2; \
+       vpxor x6, x3, x3; \
+       \
+       vpbroadcastb 1+key, t1; \
+       vpbroadcastb 0+key, t0; \
+       \
+       vpxor x3, x4, x4; \
+       vpxor x0, x5, x5; \
+       vpxor x1, x6, x6; \
+       vpxor x2, x7, x7; /* note: high and low parts swapped */ \
+       \
+       /* Add key material and result to CD (x becomes new CD) */ \
+       \
+       vpxor t7, x0, x0; \
+       vpxor 4 * 32(mem_cd), x0, x0; \
+       \
+       vpxor t6, x1, x1; \
+       vpxor 5 * 32(mem_cd), x1, x1; \
+       \
+       vpxor t5, x2, x2; \
+       vpxor 6 * 32(mem_cd), x2, x2; \
+       \
+       vpxor t4, x3, x3; \
+       vpxor 7 * 32(mem_cd), x3, x3; \
+       \
+       vpxor t3, x4, x4; \
+       vpxor 0 * 32(mem_cd), x4, x4; \
+       \
+       vpxor t2, x5, x5; \
+       vpxor 1 * 32(mem_cd), x5, x5; \
+       \
+       vpxor t1, x6, x6; \
+       vpxor 2 * 32(mem_cd), x6, x6; \
+       \
+       vpxor t0, x7, x7; \
+       vpxor 3 * 32(mem_cd), x7, x7;
+
+#endif /* CAMELLIA_GFNI_BUILD */
+
+/*
+ * IN/OUT:
+ *  x0..x7: byte-sliced AB state preloaded
+ *  mem_ab: byte-sliced AB state in memory
+ *  mem_cd: byte-sliced CD state in memory
+ */
+#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                     y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
+       roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+                 y6, y7, mem_cd, (key_table + (i) * 8)(CTX)); \
+       \
+       vmovdqu x0, 4 * 32(mem_cd); \
+       vmovdqu x1, 5 * 32(mem_cd); \
+       vmovdqu x2, 6 * 32(mem_cd); \
+       vmovdqu x3, 7 * 32(mem_cd); \
+       vmovdqu x4, 0 * 32(mem_cd); \
+       vmovdqu x5, 1 * 32(mem_cd); \
+