1 /*
   2  * ---------------------------------------------------------------------------
   3  * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
   4  *
   5  * LICENSE TERMS
   6  *
   7  * The free distribution and use of this software is allowed (with or without
   8  * changes) provided that:
   9  *
  10  *  1. source code distributions include the above copyright notice, this
  11  *     list of conditions and the following disclaimer;
  12  *
  13  *  2. binary distributions include the above copyright notice, this list
  14  *     of conditions and the following disclaimer in their documentation;
  15  *
  16  *  3. the name of the copyright holder is not used to endorse products
  17  *     built using this software without specific written permission.
  18  *
  19  * DISCLAIMER
  20  *
  21  * This software is provided 'as is' with no explicit or implied warranties
  22  * in respect of its properties, including, but not limited to, correctness
  23  * and/or fitness for purpose.
  24  * ---------------------------------------------------------------------------
  25  * Issue 20/12/2007
  26  *
  27  * I am grateful to Dag Arne Osvik for many discussions of the techniques that
  28  * can be used to optimise AES assembler code on AMD64/EM64T architectures.
  29  * Some of the techniques used in this implementation are the result of
  30  * suggestions made by him for which I am most grateful.
  31  *
  32  * An AES implementation for AMD64 processors using the YASM assembler.  This
  33  * implementation provides only encryption and decryption, and hence requires
  34  * key scheduling support in C. It uses 8k bytes of tables but its encryption and
  35  * decryption performance is very close to that obtained using large tables.
  36  * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions,
  37  * which are as follows:
  38  *               ms windows  gnu/linux/opensolaris os
  39  *
  40  *   in_blk          rcx     rdi
  41  *   out_blk         rdx     rsi
  42  *   context (cx)     r8     rdx
  43  *
  44  *   preserved       rsi      -    + rbx, rbp, rsp, r12, r13, r14 & r15
  45  *   registers       rdi      -      on both
  46  *
  47  *   destroyed        -      rsi   + rax, rcx, rdx, r8, r9, r10 & r11
  48  *   registers        -      rdi     on both
  49  *
  50  * The convention used here is that for gnu/linux/opensolaris os.
  51  *
  52  * This code provides the standard AES block size (128 bits, 16 bytes) and the
  53  * three standard AES key sizes (128, 192 and 256 bits). It has the same call
  54  * interface as my C implementation.  It uses the Microsoft C AMD64 calling
  55  * conventions in which the three parameters are placed in  rcx, rdx and r8
  56  * respectively.  The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.
  57  *
  58  * OpenSolaris Note:
  59  * Modified to use GNU/Linux/Solaris calling conventions.
  60  * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively.
  61  *
  62  *     AES_RETURN aes_encrypt(const unsigned char in_blk[],
  63  *                   unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
  64  *
  65  *     AES_RETURN aes_decrypt(const unsigned char in_blk[],
  66  *                   unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
  67  *
  68  *     AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
  69  *                                            const aes_encrypt_ctx cx[1]);
  70  *
  71  *     AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
  72  *                                            const aes_decrypt_ctx cx[1]);
  73  *
  74  *     AES_RETURN aes_encrypt_key(const unsigned char key[],
  75  *                           unsigned int len, const aes_decrypt_ctx cx[1]);
  76  *
  77  *     AES_RETURN aes_decrypt_key(const unsigned char key[],
  78  *                           unsigned int len, const aes_decrypt_ctx cx[1]);
  79  *
  80  * where <NNN> is 128, 192 or 256.  In the last two calls the length can be in
  81  * either bits or bytes.
  82  *
  83  * Comment in/out the following lines to obtain the desired subroutines. These
  84  * selections MUST match those in the C header file aesopt.h
  85  */
  86 #define AES_REV_DKS       /* define if key decryption schedule is reversed */
  87 
  88 #define LAST_ROUND_TABLES /* define for the faster version using extra tables */
  89 
  90 /*
  91  * The encryption key schedule has the following in memory layout where N is the
  92  * number of rounds (10, 12 or 14):
  93  *
  94  * lo: | input key (round 0)  |  / each round is four 32-bit words
  95  *     | encryption round 1   |
  96  *     | encryption round 2   |
  97  *     ....
  98  *     | encryption round N-1 |
  99  * hi: | encryption round N   |
 100  *
 101  * The decryption key schedule is normally set up so that it has the same
 102  * layout as above by actually reversing the order of the encryption key
 103  * schedule in memory (this happens when AES_REV_DKS is set):
 104  *
 105  * lo: | decryption round 0   | =              | encryption round N   |
 106  *     | decryption round 1   | = INV_MIX_COL[ | encryption round N-1 | ]
 107  *     | decryption round 2   | = INV_MIX_COL[ | encryption round N-2 | ]
 108  *     ....                       ....
 109  *     | decryption round N-1 | = INV_MIX_COL[ | encryption round 1   | ]
 110  * hi: | decryption round N   | =              | input key (round 0)  |
 111  *
 112  * with rounds except the first and last modified using inv_mix_column()
 113  * But if AES_REV_DKS is NOT set the order of keys is left as it is for
 114  * encryption so that it has to be accessed in reverse when used for
 115  * decryption (although the inverse mix column modifications are done)
 116  *
 117  * lo: | decryption round 0   | =              | input key (round 0)  |
 118  *     | decryption round 1   | = INV_MIX_COL[ | encryption round 1   | ]
 119  *     | decryption round 2   | = INV_MIX_COL[ | encryption round 2   | ]
 120  *     ....                       ....
 121  *     | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
 122  * hi: | decryption round N   | =              | encryption round N   |
 123  *
 124  * This layout is faster when the assembler key scheduling provided here
 125  * is used.
 126  *
 127  * End of user defines
 128  */
 129 
 130 /*
 131  * ---------------------------------------------------------------------------
 132  * OpenSolaris OS modifications
 133  *
 134  * This source originates from Brian Gladman file aes_amd64.asm
 135  * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip
 136  * with these changes:
 137  *
 138  * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and
 139  * !__GNUC__ ifdefs.  Also removed ENCRYPTION, DECRYPTION,
 140  * AES_128, AES_192, AES_256, AES_VAR ifdefs.
 141  *
 142  * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define
 143  *
 144  * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef
 145  *
 146  * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax
 147  * (operands reversed, literals prefixed with "$", registers prefixed with "%",
 148  * and "[register+offset]", addressing changed to "offset(register)",
 149  * parenthesis in constant expressions "()" changed to square brackets "[]",
 150  * "." removed from  local (numeric) labels, and other changes.
 151  * Examples:
 152  * Intel/yasm/nasm Syntax       ATT/OpenSolaris Syntax
 153  * mov  rax,(4*20h)             mov     $[4*0x20],%rax
 154  * mov  rax,[ebx+20h]           mov     0x20(%ebx),%rax
 155  * lea  rax,[ebx+ecx]           lea     (%ebx,%ecx),%rax
 156  * sub  rax,[ebx+ecx*4-20h]     sub     -0x20(%ebx,%ecx,4),%rax
 157  *
 158  * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
 159  * /usr/include/sys/asm_linkage.h, lint(1B) guards, EXPORT DELETE START
 160  * and EXPORT DELETE END markers, and dummy C function definitions for lint.
 161  *
 162  * 6. Renamed functions and reordered parameters to match OpenSolaris:
 163  * Original Gladman interface:
 164  *      int aes_encrypt(const unsigned char *in,
 165  *              unsigned char *out, const aes_encrypt_ctx cx[1]);
 166  *      int aes_decrypt(const unsigned char *in,
 167  *              unsigned char *out, const aes_encrypt_ctx cx[1]);
 168  * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t,
 169  * and a union type, inf., containing inf.l, a uint32_t and
 170  * inf.b, a 4-element array of uint32_t.  Only b[0] in the array (aka "l") is
 171  * used and contains the key schedule length * 16 where key schedule length is
 172  * the number of rounds: 10, 12, or 14.
 173  *
 174  * OpenSolaris OS interface:
 175  *      void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
 176  *              const uint32_t pt[4], uint32_t ct[4]);
 177  *      void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
 178  *              const uint32_t ct[4], uint32_t pt[4]);
 179  *      typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4];
 180  *               uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t;
 181  * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
 182  * ct is crypto text, and MAX_AES_NR is 14.
 183  * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
 184  */
 185 
 186 #if defined(lint) || defined(__lint)
 187 
 188 #include <sys/types.h>
 189 /* ARGSUSED */
 190 void    /* lint-only stub with an empty body; compiled only when lint or __lint is defined */
 191 aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4],
 192         uint32_t ct[4]) {
 193 }
 194 /* ARGSUSED */
 195 void    /* lint-only stub with an empty body; compiled only when lint or __lint is defined */
 196 aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4],
 197         uint32_t pt[4]) {
 198 }
 199 
 200 
 201 #else
 202 
 203 #include <sys/asm_linkage.h>
 204 
 205 #define KS_LENGTH       60      /* key schedule size in 32-bit words; 4*KS_LENGTH is the byte offset read under GLADMAN_INTERFACE */
 206
 207 #define raxd            eax     /* 32-bit register names for rax, rdx, rcx, rbx, rsi and rdi */
 208 #define rdxd            edx
 209 #define rcxd            ecx
 210 #define rbxd            ebx
 211 #define rsid            esi
 212 #define rdid            edi
 213
 214 #define raxb            al      /* low-byte register names for the same six registers */
 215 #define rdxb            dl
 216 #define rcxb            cl
 217 #define rbxb            bl
 218 #define rsib            sil
 219 #define rdib            dil
 220 
 221 / finite field multiplies by {02}, {04} and {08}
 222
 223 #define f2(x) [[x<<1]^[[[x>>7]&1]*0x11b]]      /* {02}*x: x<<1, XOR 0x11b when bit 7 of x is set */
 224 #define f4(x) [[x<<2]^[[[x>>6]&1]*0x11b]^[[[x>>6]&2]*0x11b]]   /* {04}*x: x<<2, XOR 0x11b for bit 6 and 2*0x11b for bit 7 */
 225 #define f8(x) [[x<<3]^[[[x>>5]&1]*0x11b]^[[[x>>5]&2]*0x11b]^[[[x>>5]&4]*0x11b]]        /* {08}*x: x<<3, XOR 0x11b, 2*0x11b, 4*0x11b for bits 5, 6, 7 */
 226
 227 / finite field multiplies required in table generation
 228
 229 #define f3(x) [[f2(x)] ^ [x]]  /* {03}*x = {02}*x ^ x */
 230 #define f9(x) [[f8(x)] ^ [x]]  /* {09}*x = {08}*x ^ x */
 231 #define fb(x) [[f8(x)] ^ [f2(x)] ^ [x]]        /* {0b}*x = {08}*x ^ {02}*x ^ x */
 232 #define fd(x) [[f8(x)] ^ [f4(x)] ^ [x]]        /* {0d}*x = {08}*x ^ {04}*x ^ x */
 233 #define fe(x) [[f8(x)] ^ [f4(x)] ^ [f2(x)]]    /* {0e}*x = {08}*x ^ {04}*x ^ {02}*x */
 234 
 235 / macros for expanding S-box data
 236
 237 #define u8(x) [f2(x)], [x], [x], [f3(x)], [f2(x)], [x], [x], [f3(x)]   /* 8-byte forward entry: {02}x, x, x, {03}x stored twice */
 238 #define v8(x) [fe(x)], [f9(x)], [fd(x)], [fb(x)], [fe(x)], [f9(x)], [fd(x)], [x]       /* 8-byte inverse entry: {0e}x, {09}x, {0d}x, {0b}x, then {0e}x, {09}x, {0d}x and plain x in byte 7 */
 239 #define w8(x) [x], 0, 0, 0, [x], 0, 0, 0       /* 8-byte last-round entry: x, 0, 0, 0 stored twice */
 240 
 241 #define enc_vals(x)     /* emit 256 table entries: macro x applied to each AES S-box value, in index order 0x00..0xff */ \
 242    .byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \
 243    .byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \
 244    .byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \
 245    .byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \
 246    .byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \
 247    .byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \
 248    .byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \
 249    .byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \
 250    .byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \
 251    .byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \
 252    .byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \
 253    .byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \
 254    .byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \
 255    .byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \
 256    .byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \
 257    .byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \
 258    .byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \
 259    .byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \
 260    .byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \
 261    .byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \
 262    .byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \
 263    .byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \
 264    .byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \
 265    .byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \
 266    .byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \
 267    .byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \
 268    .byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \
 269    .byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \
 270    .byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \
 271    .byte x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \
 272    .byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \
 273    .byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16)
 274 
 275 #define dec_vals(x) /* emit 256 table entries: macro x applied to each AES inverse S-box value, in index order 0x00..0xff */ \
 276    .byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \
 277    .byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \
 278    .byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \
 279    .byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \
 280    .byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \
 281    .byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \
 282    .byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \
 283    .byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \
 284    .byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \
 285    .byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \
 286    .byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \
 287    .byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \
 288    .byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \
 289    .byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \
 290    .byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \
 291    .byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \
 292    .byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \
 293    .byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \
 294    .byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \
 295    .byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \
 296    .byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \
 297    .byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \
 298    .byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \
 299    .byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \
 300    .byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \
 301    .byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \
 302    .byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \
 303    .byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \
 304    .byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \
 305    .byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \
 306    .byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \
 307    .byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d)
 308 
 309 #define tptr    %rbp    /* table pointer */
 310 #define kptr    %r8     /* key schedule pointer */
 311 #define fofs    128     /* adjust offset in key schedule to keep |disp| < 128 */
 312 #define fk_ref(x, y)    -16*x+fofs+4*y(kptr)   /* forward key operand: word y of the 16-byte group 16*x bytes below kptr+fofs */
 313
 314 #ifdef  AES_REV_DKS
 315 #define rofs            128     /* key pointer bias for the reversed decryption schedule */
 316 #define ik_ref(x, y)    -16*x+rofs+4*y(kptr)   /* inverse key operand: same downward-stepping form as fk_ref */
 317
 318 #else
 319 #define rofs            -128    /* key pointer bias when the decryption schedule is not reversed */
 320 #define ik_ref(x, y)    16*x+rofs+4*y(kptr)    /* inverse key operand: steps upward by 16*x bytes from kptr+rofs */
 321 #endif  /* AES_REV_DKS */
 322
 323 #define tab_0(x)        (tptr,x,8)     /* table entry x (8 bytes per entry), byte offset 0 */
 324 #define tab_1(x)        3(tptr,x,8)    /* byte offset 3: a 32-bit read sees the entry pattern with its first byte moved to byte 1 */
 325 #define tab_2(x)        2(tptr,x,8)    /* byte offset 2: first byte of the pattern lands in byte 2 */
 326 #define tab_3(x)        1(tptr,x,8)    /* byte offset 1: first byte of the pattern lands in byte 3 */
 327 #define tab_f(x)        1(tptr,x,8)    /* byte offset 1, read as a single byte: the plain x of a u8 entry */
 328 #define tab_i(x)        7(tptr,x,8)    /* byte offset 7, read as a single byte: the plain x of a v8 entry */
 329 
 330         /* EXPORT DELETE START */
 331 #define ff_rnd(p1, p2, p3, p4, round)   /* normal forward round: p1..p4 = key words fk_ref(round,0..3) XOR table lookups on the bytes of %eax..%edx; result copied back to %eax..%edx; clobbers %esi, %edi */ \
 332         mov     fk_ref(round,0), p1; \
 333         mov     fk_ref(round,1), p2; \
 334         mov     fk_ref(round,2), p3; \
 335         mov     fk_ref(round,3), p4; \
 336  /* bytes 0..3 of %eax feed p1, p4, p3, p2 through tab_0..tab_3 */ \
 337         movzx   %al, %esi; \
 338         movzx   %ah, %edi; \
 339         shr     $16, %eax; \
 340         xor     tab_0(%rsi), p1; \
 341         xor     tab_1(%rdi), p4; \
 342         movzx   %al, %esi; \
 343         movzx   %ah, %edi; \
 344         xor     tab_2(%rsi), p3; \
 345         xor     tab_3(%rdi), p2; \
 346  /* bytes 0..3 of %ebx feed p2, p1, p4, p3 */ \
 347         movzx   %bl, %esi; \
 348         movzx   %bh, %edi; \
 349         shr     $16, %ebx; \
 350         xor     tab_0(%rsi), p2; \
 351         xor     tab_1(%rdi), p1; \
 352         movzx   %bl, %esi; \
 353         movzx   %bh, %edi; \
 354         xor     tab_2(%rsi), p4; \
 355         xor     tab_3(%rdi), p3; \
 356  /* bytes 0..3 of %ecx feed p3, p2, p1, p4 */ \
 357         movzx   %cl, %esi; \
 358         movzx   %ch, %edi; \
 359         shr     $16, %ecx; \
 360         xor     tab_0(%rsi), p3; \
 361         xor     tab_1(%rdi), p2; \
 362         movzx   %cl, %esi; \
 363         movzx   %ch, %edi; \
 364         xor     tab_2(%rsi), p1; \
 365         xor     tab_3(%rdi), p4; \
 366  /* bytes 0..3 of %edx feed p4, p3, p2, p1 */ \
 367         movzx   %dl, %esi; \
 368         movzx   %dh, %edi; \
 369         shr     $16, %edx; \
 370         xor     tab_0(%rsi), p4; \
 371         xor     tab_1(%rdi), p3; \
 372         movzx   %dl, %esi; \
 373         movzx   %dh, %edi; \
 374         xor     tab_2(%rsi), p2; \
 375         xor     tab_3(%rdi), p1; \
 376  /* the new state becomes the input of the next round */ \
 377         mov     p1, %eax; \
 378         mov     p2, %ebx; \
 379         mov     p3, %ecx; \
 380         mov     p4, %edx
 381 
 382 #ifdef  LAST_ROUND_TABLES
 383 
 384 #define fl_rnd(p1, p2, p3, p4, round)   /* last forward round, table version: same byte routing as ff_rnd but using the table 2048 bytes past tptr; result is left in p1..p4; tptr is advanced and not restored here; clobbers %esi, %edi */ \
 385         add     $2048, tptr; /* step past the 256 x 8-byte main table to the last-round table */ \
 386         mov     fk_ref(round,0), p1; \
 387         mov     fk_ref(round,1), p2; \
 388         mov     fk_ref(round,2), p3; \
 389         mov     fk_ref(round,3), p4; \
 390  /* bytes 0..3 of %eax feed p1, p4, p3, p2 */ \
 391         movzx   %al, %esi; \
 392         movzx   %ah, %edi; \
 393         shr     $16, %eax; \
 394         xor     tab_0(%rsi), p1; \
 395         xor     tab_1(%rdi), p4; \
 396         movzx   %al, %esi; \
 397         movzx   %ah, %edi; \
 398         xor     tab_2(%rsi), p3; \
 399         xor     tab_3(%rdi), p2; \
 400  /* bytes 0..3 of %ebx feed p2, p1, p4, p3 */ \
 401         movzx   %bl, %esi; \
 402         movzx   %bh, %edi; \
 403         shr     $16, %ebx; \
 404         xor     tab_0(%rsi), p2; \
 405         xor     tab_1(%rdi), p1; \
 406         movzx   %bl, %esi; \
 407         movzx   %bh, %edi; \
 408         xor     tab_2(%rsi), p4; \
 409         xor     tab_3(%rdi), p3; \
 410  /* bytes 0..3 of %ecx feed p3, p2, p1, p4 */ \
 411         movzx   %cl, %esi; \
 412         movzx   %ch, %edi; \
 413         shr     $16, %ecx; \
 414         xor     tab_0(%rsi), p3; \
 415         xor     tab_1(%rdi), p2; \
 416         movzx   %cl, %esi; \
 417         movzx   %ch, %edi; \
 418         xor     tab_2(%rsi), p1; \
 419         xor     tab_3(%rdi), p4; \
 420  /* bytes 0..3 of %edx feed p4, p3, p2, p1 */ \
 421         movzx   %dl, %esi; \
 422         movzx   %dh, %edi; \
 423         shr     $16, %edx; \
 424         xor     tab_0(%rsi), p4; \
 425         xor     tab_1(%rdi), p3; \
 426         movzx   %dl, %esi; \
 427         movzx   %dh, %edi; \
 428         xor     tab_2(%rsi), p2; \
 429         xor     tab_3(%rdi), p1
 430 
 431 #else
 432 
 433 #define fl_rnd(p1, p2, p3, p4, round)   /* last forward round, byte-lookup version: each state byte is replaced by the tab_f byte of its entry, rotated into position with rol and XORed into the key words; result is left in p1..p4; clobbers %esi, %edi */ \
 434         mov     fk_ref(round,0), p1; \
 435         mov     fk_ref(round,1), p2; \
 436         mov     fk_ref(round,2), p3; \
 437         mov     fk_ref(round,3), p4; \
 438  /* %eax bytes 0..3 go to byte 0 of p1, byte 1 of p4, byte 2 of p3, byte 3 of p2 */ \
 439         movzx   %al, %esi; \
 440         movzx   %ah, %edi; \
 441         shr     $16, %eax; \
 442         movzx   tab_f(%rsi), %esi; \
 443         movzx   tab_f(%rdi), %edi; \
 444         xor     %esi, p1; \
 445         rol     $8, %edi; \
 446         xor     %edi, p4; \
 447         movzx   %al, %esi; \
 448         movzx   %ah, %edi; \
 449         movzx   tab_f(%rsi), %esi; \
 450         movzx   tab_f(%rdi), %edi; \
 451         rol     $16, %esi; \
 452         rol     $24, %edi; \
 453         xor     %esi, p3; \
 454         xor     %edi, p2; \
 455  /* %ebx bytes 0..3 go to byte 0 of p2, byte 1 of p1, byte 2 of p4, byte 3 of p3 */ \
 456         movzx   %bl, %esi; \
 457         movzx   %bh, %edi; \
 458         shr     $16, %ebx; \
 459         movzx   tab_f(%rsi), %esi; \
 460         movzx   tab_f(%rdi), %edi; \
 461         xor     %esi, p2; \
 462         rol     $8, %edi; \
 463         xor     %edi, p1; \
 464         movzx   %bl, %esi; \
 465         movzx   %bh, %edi; \
 466         movzx   tab_f(%rsi), %esi; \
 467         movzx   tab_f(%rdi), %edi; \
 468         rol     $16, %esi; \
 469         rol     $24, %edi; \
 470         xor     %esi, p4; \
 471         xor     %edi, p3; \
 472  /* %ecx bytes 0..3 go to byte 0 of p3, byte 1 of p2, byte 2 of p1, byte 3 of p4 */ \
 473         movzx   %cl, %esi; \
 474         movzx   %ch, %edi; \
 475         movzx   tab_f(%rsi), %esi; \
 476         movzx   tab_f(%rdi), %edi; \
 477         shr     $16, %ecx; \
 478         xor     %esi, p3; \
 479         rol     $8, %edi; \
 480         xor     %edi, p2; \
 481         movzx   %cl, %esi; \
 482         movzx   %ch, %edi; \
 483         movzx   tab_f(%rsi), %esi; \
 484         movzx   tab_f(%rdi), %edi; \
 485         rol     $16, %esi; \
 486         rol     $24, %edi; \
 487         xor     %esi, p1; \
 488         xor     %edi, p4; \
 489  /* %edx bytes 0..3 go to byte 0 of p4, byte 1 of p3, byte 2 of p2, byte 3 of p1 */ \
 490         movzx   %dl, %esi; \
 491         movzx   %dh, %edi; \
 492         movzx   tab_f(%rsi), %esi; \
 493         movzx   tab_f(%rdi), %edi; \
 494         shr     $16, %edx; \
 495         xor     %esi, p4; \
 496         rol     $8, %edi; \
 497         xor     %edi, p3; \
 498         movzx   %dl, %esi; \
 499         movzx   %dh, %edi; \
 500         movzx   tab_f(%rsi), %esi; \
 501         movzx   tab_f(%rdi), %edi; \
 502         rol     $16, %esi; \
 503         rol     $24, %edi; \
 504         xor     %esi, p2; \
 505         xor     %edi, p1
 506 
 507 #endif  /* LAST_ROUND_TABLES */
 508 
 509 #define ii_rnd(p1, p2, p3, p4, round)   /* normal inverse round: p1..p4 = key words ik_ref(round,0..3) XOR table lookups on the bytes of %eax..%edx; result copied back to %eax..%edx; clobbers %esi, %edi */ \
 510         mov     ik_ref(round,0), p1; \
 511         mov     ik_ref(round,1), p2; \
 512         mov     ik_ref(round,2), p3; \
 513         mov     ik_ref(round,3), p4; \
 514  /* bytes 0..3 of %eax feed p1, p2, p3, p4 through tab_0..tab_3 */ \
 515         movzx   %al, %esi; \
 516         movzx   %ah, %edi; \
 517         shr     $16, %eax; \
 518         xor     tab_0(%rsi), p1; \
 519         xor     tab_1(%rdi), p2; \
 520         movzx   %al, %esi; \
 521         movzx   %ah, %edi; \
 522         xor     tab_2(%rsi), p3; \
 523         xor     tab_3(%rdi), p4; \
 524  /* bytes 0..3 of %ebx feed p2, p3, p4, p1 */ \
 525         movzx   %bl, %esi; \
 526         movzx   %bh, %edi; \
 527         shr     $16, %ebx; \
 528         xor     tab_0(%rsi), p2; \
 529         xor     tab_1(%rdi), p3; \
 530         movzx   %bl, %esi; \
 531         movzx   %bh, %edi; \
 532         xor     tab_2(%rsi), p4; \
 533         xor     tab_3(%rdi), p1; \
 534  /* bytes 0..3 of %ecx feed p3, p4, p1, p2 */ \
 535         movzx   %cl, %esi; \
 536         movzx   %ch, %edi; \
 537         shr     $16, %ecx; \
 538         xor     tab_0(%rsi), p3; \
 539         xor     tab_1(%rdi), p4; \
 540         movzx   %cl, %esi; \
 541         movzx   %ch, %edi; \
 542         xor     tab_2(%rsi), p1; \
 543         xor     tab_3(%rdi), p2; \
 544  /* bytes 0..3 of %edx feed p4, p1, p2, p3 */ \
 545         movzx   %dl, %esi; \
 546         movzx   %dh, %edi; \
 547         shr     $16, %edx; \
 548         xor     tab_0(%rsi), p4; \
 549         xor     tab_1(%rdi), p1; \
 550         movzx   %dl, %esi; \
 551         movzx   %dh, %edi; \
 552         xor     tab_2(%rsi), p2; \
 553         xor     tab_3(%rdi), p3; \
 554  /* the new state becomes the input of the next round */ \
 555         mov     p1, %eax; \
 556         mov     p2, %ebx; \
 557         mov     p3, %ecx; \
 558         mov     p4, %edx
 559 
 560 #ifdef  LAST_ROUND_TABLES
 561 
 562 #define il_rnd(p1, p2, p3, p4, round)   /* last inverse round, table version: same byte routing as ii_rnd but using the table 2048 bytes past tptr; result is left in p1..p4; tptr is advanced and not restored here; clobbers %esi, %edi */ \
 563         add     $2048, tptr; /* step past the 256 x 8-byte main table to the last-round table */ \
 564         mov     ik_ref(round,0), p1; \
 565         mov     ik_ref(round,1), p2; \
 566         mov     ik_ref(round,2), p3; \
 567         mov     ik_ref(round,3), p4; \
 568  /* bytes 0..3 of %eax feed p1, p2, p3, p4 */ \
 569         movzx   %al, %esi; \
 570         movzx   %ah, %edi; \
 571         shr     $16, %eax; \
 572         xor     tab_0(%rsi), p1; \
 573         xor     tab_1(%rdi), p2; \
 574         movzx   %al, %esi; \
 575         movzx   %ah, %edi; \
 576         xor     tab_2(%rsi), p3; \
 577         xor     tab_3(%rdi), p4; \
 578  /* bytes 0..3 of %ebx feed p2, p3, p4, p1 */ \
 579         movzx   %bl, %esi; \
 580         movzx   %bh, %edi; \
 581         shr     $16, %ebx; \
 582         xor     tab_0(%rsi), p2; \
 583         xor     tab_1(%rdi), p3; \
 584         movzx   %bl, %esi; \
 585         movzx   %bh, %edi; \
 586         xor     tab_2(%rsi), p4; \
 587         xor     tab_3(%rdi), p1; \
 588  /* bytes 0..3 of %ecx feed p3, p4, p1, p2 */ \
 589         movzx   %cl, %esi; \
 590         movzx   %ch, %edi; \
 591         shr     $16, %ecx; \
 592         xor     tab_0(%rsi), p3; \
 593         xor     tab_1(%rdi), p4; \
 594         movzx   %cl, %esi; \
 595         movzx   %ch, %edi; \
 596         xor     tab_2(%rsi), p1; \
 597         xor     tab_3(%rdi), p2; \
 598  /* bytes 0..3 of %edx feed p4, p1, p2, p3 */ \
 599         movzx   %dl, %esi; \
 600         movzx   %dh, %edi; \
 601         shr     $16, %edx; \
 602         xor     tab_0(%rsi), p4; \
 603         xor     tab_1(%rdi), p1; \
 604         movzx   %dl, %esi; \
 605         movzx   %dh, %edi; \
 606         xor     tab_2(%rsi), p2; \
 607         xor     tab_3(%rdi), p3
 608 
 609 #else
 610 
 611 #define il_rnd(p1, p2, p3, p4, round)   /* last inverse round, byte-lookup version: each state byte is replaced by the tab_i byte of its entry, rotated into position with rol and XORed into the key words; result is left in p1..p4; clobbers %esi, %edi */ \
 612         mov     ik_ref(round,0), p1; \
 613         mov     ik_ref(round,1), p2; \
 614         mov     ik_ref(round,2), p3; \
 615         mov     ik_ref(round,3), p4; \
 616  /* %eax bytes 0..3 go to byte 0 of p1, byte 1 of p2, byte 2 of p3, byte 3 of p4 */ \
 617         movzx   %al, %esi; \
 618         movzx   %ah, %edi; \
 619         movzx   tab_i(%rsi), %esi; \
 620         movzx   tab_i(%rdi), %edi; \
 621         shr     $16, %eax; \
 622         xor     %esi, p1; \
 623         rol     $8, %edi; \
 624         xor     %edi, p2; \
 625         movzx   %al, %esi; \
 626         movzx   %ah, %edi; \
 627         movzx   tab_i(%rsi), %esi; \
 628         movzx   tab_i(%rdi), %edi; \
 629         rol     $16, %esi; \
 630         rol     $24, %edi; \
 631         xor     %esi, p3; \
 632         xor     %edi, p4; \
 633  /* %ebx bytes 0..3 go to byte 0 of p2, byte 1 of p3, byte 2 of p4, byte 3 of p1 */ \
 634         movzx   %bl, %esi; \
 635         movzx   %bh, %edi; \
 636         movzx   tab_i(%rsi), %esi; \
 637         movzx   tab_i(%rdi), %edi; \
 638         shr     $16, %ebx; \
 639         xor     %esi, p2; \
 640         rol     $8, %edi; \
 641         xor     %edi, p3; \
 642         movzx   %bl, %esi; \
 643         movzx   %bh, %edi; \
 644         movzx   tab_i(%rsi), %esi; \
 645         movzx   tab_i(%rdi), %edi; \
 646         rol     $16, %esi; \
 647         rol     $24, %edi; \
 648         xor     %esi, p4; \
 649         xor     %edi, p1; \
 650  /* %ecx bytes 0..3 go to byte 0 of p3, byte 1 of p4, byte 2 of p1, byte 3 of p2 */ \
 651         movzx   %cl, %esi; \
 652         movzx   %ch, %edi; \
 653         movzx   tab_i(%rsi), %esi; \
 654         movzx   tab_i(%rdi), %edi; \
 655         shr     $16, %ecx; \
 656         xor     %esi, p3; \
 657         rol     $8, %edi; \
 658         xor     %edi, p4; \
 659         movzx   %cl, %esi; \
 660         movzx   %ch, %edi; \
 661         movzx   tab_i(%rsi), %esi; \
 662         movzx   tab_i(%rdi), %edi; \
 663         rol     $16, %esi; \
 664         rol     $24, %edi; \
 665         xor     %esi, p1; \
 666         xor     %edi, p2; \
 667  /* %edx bytes 0..3 go to byte 0 of p4, byte 1 of p1, byte 2 of p2, byte 3 of p3 */ \
 668         movzx   %dl, %esi; \
 669         movzx   %dh, %edi; \
 670         movzx   tab_i(%rsi), %esi; \
 671         movzx   tab_i(%rdi), %edi; \
 672         shr     $16, %edx; \
 673         xor     %esi, p4; \
 674         rol     $8, %edi; \
 675         xor     %edi, p1; \
 676         movzx   %dl, %esi; \
 677         movzx   %dh, %edi; \
 678         movzx   tab_i(%rsi), %esi; \
 679         movzx   tab_i(%rdi), %edi; \
 680         rol     $16, %esi; \
 681         rol     $24, %edi; \
 682         xor     %esi, p2; \
 683         xor     %edi, p3
 684 
 685 #endif  /* LAST_ROUND_TABLES */
 686         /* EXPORT DELETE END */
 687 
 688 /*
 689  * OpenSolaris OS:
 690  * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
 691  *      const uint32_t pt[4], uint32_t ct[4]);
 692  *
 693  * Original interface:
 694  * int aes_encrypt(const unsigned char *in,
 695  *      unsigned char *out, const aes_encrypt_ctx cx[1]);
 696  */
 697         .align  64
 698 enc_tab:        /* forward lookup tables; aes_encrypt_amd64 loads this address into tptr */
 699         enc_vals(u8)    /* main round table: 256 entries of 8 bytes (2048 bytes) */
 700 #ifdef  LAST_ROUND_TABLES
 701         / Last Round Tables:
 702         enc_vals(w8)    /* second 2048-byte table; fl_rnd reaches it with add $2048, tptr */
 703 #endif
 704 
 705 
 706         ENTRY_NP(aes_encrypt_amd64)     /* encrypt one 16-byte block using the caller's key schedule and round count */
 707         /* EXPORT DELETE START */
 708 #ifdef  GLADMAN_INTERFACE
 709         / Original interface
 710         sub     $[4*8], %rsp    / gnu/linux/opensolaris binary interface
 711         mov     %rsi, (%rsp)    / output pointer (P2)
 712         mov     %rdx, %r8       / context (P3)
 713 /* save %rbx, %rbp and %r12, which this routine overwrites and restores at label 4 */
 714         mov     %rbx, 1*8(%rsp) / P1: input pointer in rdi
 715         mov     %rbp, 2*8(%rsp) / P2: output pointer in (rsp)
 716         mov     %r12, 3*8(%rsp) / P3: context in r8
 717         movzx   4*KS_LENGTH(kptr), %esi / Get byte key length * 16
 718
 719 #else
 720         / OpenSolaris OS interface
 721         sub     $[4*8], %rsp    / Make room on stack to save registers
 722         mov     %rcx, (%rsp)    / Save output pointer (P4) on stack
 723         mov     %rdi, %r8       / context (P1)
 724         mov     %rdx, %rdi      / P3: save input pointer
 725         shl     $4, %esi        / P2: esi byte key length * 16
 726 /* after the shift %esi holds Nr * 16, the value tested against 10*16, 12*16 and 14*16 below */
 727         mov     %rbx, 1*8(%rsp) / Save registers
 728         mov     %rbp, 2*8(%rsp)
 729         mov     %r12, 3*8(%rsp)
 730         / P1: context in r8
 731         / P2: byte key length * 16 in esi
 732         / P3: input pointer in rdi
 733         / P4: output pointer in (rsp)
 734 #endif  /* GLADMAN_INTERFACE */
 735 /* tptr = forward table base; kptr is biased by -fofs so that fofs(kptr) addresses round key 0 */
 736         lea     enc_tab(%rip), tptr
 737         sub     $fofs, kptr
 738
 739         / Load input block into registers
 740         mov     (%rdi), %eax
 741         mov     1*4(%rdi), %ebx
 742         mov     2*4(%rdi), %ecx
 743         mov     3*4(%rdi), %edx
 744 /* XOR the four state words with the first four words of the key schedule (round 0 key) */
 745         xor     fofs(kptr), %eax
 746         xor     fofs+4(kptr), %ebx
 747         xor     fofs+8(kptr), %ecx
 748         xor     fofs+12(kptr), %edx
 749 /* kptr += %rsi (16 * number of rounds), so fk_ref(r, y) addresses word y of round key (rounds - r) */
 750         lea     (kptr,%rsi), kptr
 751         / Jump based on byte key length * 16:
 752         cmp     $[10*16], %esi
 753         je      3f              /* 10 rounds: enter the unrolled sequence at label 3 */
 754         cmp     $[12*16], %esi
 755         je      2f              /* 12 rounds: enter at label 2 */
 756         cmp     $[14*16], %esi
 757         je      1f              /* 14 rounds: enter at label 1 */
 758         mov     $-1, %rax       / error
 759         jmp     4f
 760 /* unrolled rounds; the three entry labels share the tail and fall through to the last round (fl_rnd) */
 761         / Perform normal forward rounds
 762 1:      ff_rnd(%r9d, %r10d, %r11d, %r12d, 13)
 763         ff_rnd(%r9d, %r10d, %r11d, %r12d, 12)
 764 2:      ff_rnd(%r9d, %r10d, %r11d, %r12d, 11)
 765         ff_rnd(%r9d, %r10d, %r11d, %r12d, 10)
 766 3:      ff_rnd(%r9d, %r10d, %r11d, %r12d,  9)
 767         ff_rnd(%r9d, %r10d, %r11d, %r12d,  8)
 768         ff_rnd(%r9d, %r10d, %r11d, %r12d,  7)
 769         ff_rnd(%r9d, %r10d, %r11d, %r12d,  6)
 770         ff_rnd(%r9d, %r10d, %r11d, %r12d,  5)
 771         ff_rnd(%r9d, %r10d, %r11d, %r12d,  4)
 772         ff_rnd(%r9d, %r10d, %r11d, %r12d,  3)
 773         ff_rnd(%r9d, %r10d, %r11d, %r12d,  2)
 774         ff_rnd(%r9d, %r10d, %r11d, %r12d,  1)
 775         fl_rnd(%r9d, %r10d, %r11d, %r12d,  0)
 776 /* fl_rnd leaves the result in %r9d..%r12d; store it through the output pointer saved at (%rsp) */
 777         / Copy results
 778         mov     (%rsp), %rbx
 779         mov     %r9d, (%rbx)
 780         mov     %r10d, 4(%rbx)
 781         mov     %r11d, 8(%rbx)
 782         mov     %r12d, 12(%rbx)
 783         xor     %rax, %rax      /* rax = 0 on the normal path; the unsupported-length path above loads -1 instead */
 784 4:      / Restore registers
 785         mov     1*8(%rsp), %rbx
 786         mov     2*8(%rsp), %rbp
 787         mov     3*8(%rsp), %r12
 788         add     $[4*8], %rsp
 789         /* EXPORT DELETE END */
 790         ret
 791
 792         SET_SIZE(aes_encrypt_amd64)
 793 
 794 /*
 795  * OpenSolaris OS:
 796  * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
 797  *      const uint32_t ct[4], uint32_t pt[4]);
 798  *
 799  * Original interface:
 800  * int aes_decrypt(const unsigned char *in,
 801  *      unsigned char *out, const aes_encrypt_ctx cx[1]);
 802  */
 803         .align  64
     /*
      * dec_tab: lookup table for the decryption routine.  aes_decrypt_amd64
      * loads its address into tptr with a RIP-relative lea.  The .align
      * directive above requests 64-byte alignment for the start of the table.
      *
      * The contents are produced by expanding dec_vals() with the element
      * macro v8, and a second time with w8 when LAST_ROUND_TABLES is defined.
      * dec_vals, v8 and w8 are defined outside this excerpt.
      * NOTE(review): presumably v8 emits the normal inverse-round entries and
      * w8 the last-round entries -- confirm against the macro definitions.
      */
 804 dec_tab:
 805         dec_vals(v8)
 806 #ifdef  LAST_ROUND_TABLES
 807         / Last Round Tables:
 808         dec_vals(w8)
 809 #endif
 810 
 811 
 812         ENTRY_NP(aes_decrypt_amd64)
             /*
              * Decrypt a single 16-byte block: four 32-bit words are loaded
              * from the input pointer and four 32-bit words are stored to the
              * output pointer.
              *
              * Incoming registers:
              *   OpenSolaris interface (default):
              *     rdi = context (P1), esi = Nr (P2),
              *     rdx = input block (P3), rcx = output block (P4)
              *   GLADMAN_INTERFACE:
              *     rdi = input block (P1), rsi = output block (P2),
              *     rdx = context (P3)
              *
              * Stack frame (4*8 bytes):
              *   0*8(%rsp) output pointer
              *   1*8(%rsp) saved rbx
              *   2*8(%rsp) saved rbp
              *   3*8(%rsp) saved r12
              *
              * On exit rax is 0 after a block has been written, or -1 when
              * esi is not one of 10*16, 12*16 or 14*16; in the error case
              * nothing is stored through the output pointer.
              *
              * kptr, tptr, rofs, KS_LENGTH, ii_rnd and il_rnd are defined
              * outside this excerpt.
              * NOTE(review): kptr is presumably an alias for r8, since both
              * prologues place the context pointer there -- confirm.
              */
 813         /* EXPORT DELETE START */
 814 #ifdef  GLADMAN_INTERFACE
 815         / Original interface
 816         sub     $[4*8], %rsp    / gnu/linux/opensolaris binary interface
 817         mov     %rsi, (%rsp)    / output pointer (P2)
 818         mov     %rdx, %r8       / context (P3)
 819 
             /*
              * The three stores below save rbx, rbp and r12; the P1-P3 notes
              * beside them describe where the arguments now live.
              */
 820         mov     %rbx, 1*8(%rsp) / P1: input pointer in rdi
 821         mov     %rbp, 2*8(%rsp) / P2: output pointer in (rsp)
 822         mov     %r12, 3*8(%rsp) / P3: context in r8
             /*
              * NOTE(review): movzx from memory carries no size suffix here,
              * so the width of the load is left to the assembler -- confirm
              * that the intended (presumably byte) load is generated.
              */
 823         movzx   4*KS_LENGTH(kptr), %esi / Get byte key length * 16
 824 
 825 #else
 826         / OpenSolaris OS interface
 827         sub     $[4*8], %rsp    / Make room on stack to save registers
 828         mov     %rcx, (%rsp)    / Save output pointer (P4) on stack
 829         mov     %rdi, %r8       / context (P1)
 830         mov     %rdx, %rdi      / P3: save input pointer
 831         shl     $4, %esi        / P2: esi byte key length * 16
 832 
 833         mov     %rbx, 1*8(%rsp) / Save registers
 834         mov     %rbp, 2*8(%rsp)
 835         mov     %r12, 3*8(%rsp)
 836         / P1: context in r8
 837         / P2: byte key length * 16 in esi
 838         / P3: input pointer in rdi
 839         / P4: output pointer in (rsp)
 840 #endif  /* GLADMAN_INTERFACE */
 841 
             /*
              * tptr = address of dec_tab.  kptr is biased down by rofs; the
              * rofs(...) displacements used further down add the bias back.
              */
 842         lea     dec_tab(%rip), tptr
 843         sub     $rofs, kptr
 844 
 845         / Load input block into registers
 846         mov     (%rdi), %eax
 847         mov     1*4(%rdi), %ebx
 848         mov     2*4(%rdi), %ecx
 849         mov     3*4(%rdi), %edx
 850 
             /*
              * The input words are now in eax/ebx/ecx/edx, so rdi is reused
              * as the address of the first round key to apply:
              *   AES_REV_DKS: rdi = kptr, then kptr is advanced by esi bytes;
              *   otherwise:   rdi = kptr + esi, and kptr is left unchanged.
              */
 851 #ifdef AES_REV_DKS
 852         mov     kptr, %rdi
 853         lea     (kptr,%rsi), kptr
 854 #else
 855         lea     (kptr,%rsi), %rdi
 856 #endif
 857 
             /* XOR the four state words with the 16 bytes of key at rofs(%rdi). */
 858         xor     rofs(%rdi), %eax
 859         xor     rofs+4(%rdi), %ebx
 860         xor     rofs+8(%rdi), %ecx
 861         xor     rofs+12(%rdi), %edx
 862 
             /*
              * Dispatch on esi:
              *   10*16 -> label 3 (nine ii_rnd, then il_rnd)
              *   12*16 -> label 2 (two additional ii_rnd first)
              *   14*16 -> label 1 (four additional ii_rnd first)
              * Any other value sets rax to -1 and jumps to label 4, skipping
              * the rounds and the output stores.
              */
 863         / Jump based on byte key length * 16:
 864         cmp     $[10*16], %esi
 865         je      3f
 866         cmp     $[12*16], %esi
 867         je      2f
 868         cmp     $[14*16], %esi
 869         je      1f
 870         mov     $-1, %rax       / error
 871         jmp     4f
 872 
             /*
              * Each entry label falls through into the rounds that follow.
              * NOTE(review): ii_rnd/il_rnd are macros defined elsewhere;
              * presumably the last argument is the round index and r9d-r12d
              * receive the new state words -- confirm.  The stores after
              * il_rnd do take the result from r9d-r12d.
              */
 873         / Perform normal inverse rounds
 874 1:      ii_rnd(%r9d, %r10d, %r11d, %r12d, 13)
 875         ii_rnd(%r9d, %r10d, %r11d, %r12d, 12)
 876 2:      ii_rnd(%r9d, %r10d, %r11d, %r12d, 11)
 877         ii_rnd(%r9d, %r10d, %r11d, %r12d, 10)
 878 3:      ii_rnd(%r9d, %r10d, %r11d, %r12d,  9)
 879         ii_rnd(%r9d, %r10d, %r11d, %r12d,  8)
 880         ii_rnd(%r9d, %r10d, %r11d, %r12d,  7)
 881         ii_rnd(%r9d, %r10d, %r11d, %r12d,  6)
 882         ii_rnd(%r9d, %r10d, %r11d, %r12d,  5)
 883         ii_rnd(%r9d, %r10d, %r11d, %r12d,  4)
 884         ii_rnd(%r9d, %r10d, %r11d, %r12d,  3)
 885         ii_rnd(%r9d, %r10d, %r11d, %r12d,  2)
 886         ii_rnd(%r9d, %r10d, %r11d, %r12d,  1)
 887         il_rnd(%r9d, %r10d, %r11d, %r12d,  0)
 888 
             /*
              * Reload the output pointer saved at 0(%rsp) into rbx, store the
              * four result words, then clear rax as the success return value.
              */
 889         / Copy results
 890         mov     (%rsp), %rbx
 891         mov     %r9d, (%rbx)
 892         mov     %r10d, 4(%rbx)
 893         mov     %r11d, 8(%rbx)
 894         mov     %r12d, 12(%rbx)
 895         xor     %rax, %rax
             /*
              * Common exit, reached by fall-through on success and by the jmp
              * on the error path: restore rbx, rbp and r12 and release the
              * 4*8-byte frame.
              */
 896 4:      / Restore registers
 897         mov     1*8(%rsp), %rbx
 898         mov     2*8(%rsp), %rbp
 899         mov     3*8(%rsp), %r12
 900         add     $[4*8], %rsp
 901         /* EXPORT DELETE END */
 902         ret
 903 
 904         SET_SIZE(aes_decrypt_amd64)
 905 #endif  /* lint || __lint */